cannaiq/backend/dist/services/crawl-scheduler.js
commit 66e07b2009 (Kelly, 2025-12-03 18:45:05 -07:00)
fix(monitor): remove non-existent worker columns from job_run_logs query

The job_run_logs table tracks scheduled job orchestration, not individual
worker jobs. Worker info (worker_id, worker_hostname) belongs on
dispensary_crawl_jobs, not job_run_logs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

"use strict";
/**
* Crawl Scheduler Service
*
* This service manages crawl scheduling using a job queue approach.
* It does NOT modify the crawler - it only TRIGGERS the existing crawler.
*
* Features:
* - Global schedule: crawl all stores every N hours
* - Daily special run: 12:01 AM local store time
* - Per-store schedule overrides
* - Job queue for tracking pending/running crawls
*/
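/*
* Usage sketch (illustrative only). The require path, the async bootstrap
* wrapper, and the store id 42 are assumptions for the example, not part of
* this module; the functions themselves are the exports defined below.
*
*   const scheduler = require('./services/crawl-scheduler');
*
*   async function bootstrap() {
*     // Pick orchestrator (the default) or the legacy job-queue path, then
*     // start the once-a-minute cron loop.
*     scheduler.setSchedulerMode('orchestrator');
*     await scheduler.startCrawlScheduler();
*
*     // Queue a high-priority crawl for one store, or shut down cleanly.
*     await scheduler.triggerManualCrawl(42);
*     scheduler.stopCrawlScheduler();
*   }
*/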
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.getGlobalSchedule = getGlobalSchedule;
exports.updateGlobalSchedule = updateGlobalSchedule;
exports.getStoreScheduleStatuses = getStoreScheduleStatuses;
exports.getStoreSchedule = getStoreSchedule;
exports.updateStoreSchedule = updateStoreSchedule;
exports.createCrawlJob = createCrawlJob;
exports.getPendingJobs = getPendingJobs;
exports.claimJob = claimJob;
exports.completeJob = completeJob;
exports.getRecentJobs = getRecentJobs;
exports.getAllRecentJobs = getAllRecentJobs;
exports.checkAndCreateScheduledJobs = checkAndCreateScheduledJobs;
exports.checkAndCreateDailySpecialJobs = checkAndCreateDailySpecialJobs;
exports.processJobs = processJobs;
exports.processOrchestrator = processOrchestrator;
exports.setSchedulerMode = setSchedulerMode;
exports.getSchedulerMode = getSchedulerMode;
exports.startCrawlScheduler = startCrawlScheduler;
exports.stopCrawlScheduler = stopCrawlScheduler;
exports.restartCrawlScheduler = restartCrawlScheduler;
exports.triggerManualCrawl = triggerManualCrawl;
exports.triggerAllStoresCrawl = triggerAllStoresCrawl;
exports.cancelJob = cancelJob;
const node_cron_1 = __importDefault(require("node-cron"));
const migrate_1 = require("../db/migrate");
const scraper_v2_1 = require("../scraper-v2");
const store_crawl_orchestrator_1 = require("./store-crawl-orchestrator");
// Worker identification
const WORKER_ID = `worker-${process.pid}-${Date.now()}`;
let schedulerCronJob = null;
let jobProcessorRunning = false;
let orchestratorProcessorRunning = false;
// Scheduler mode: 'legacy' uses job queue, 'orchestrator' uses intelligent orchestration
let schedulerMode = 'orchestrator';
// ============================================
// Schedule Management
// ============================================
/**
* Get global schedule settings
*/
async function getGlobalSchedule() {
const result = await migrate_1.pool.query(`
SELECT * FROM crawler_schedule ORDER BY id
`);
return result.rows;
}
/**
* Update global schedule setting
*/
async function updateGlobalSchedule(scheduleType, updates) {
const setClauses = [];
const values = [];
let paramIndex = 1;
if (updates.enabled !== undefined) {
setClauses.push(`enabled = $${paramIndex++}`);
values.push(updates.enabled);
}
if (updates.interval_hours !== undefined) {
setClauses.push(`interval_hours = $${paramIndex++}`);
values.push(updates.interval_hours);
}
if (updates.run_time !== undefined) {
setClauses.push(`run_time = $${paramIndex++}`);
values.push(updates.run_time);
}
values.push(scheduleType);
const result = await migrate_1.pool.query(`
UPDATE crawler_schedule
SET ${setClauses.join(', ')}
WHERE schedule_type = $${paramIndex}
RETURNING *
`, values);
return result.rows[0];
}
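/*
* Example (sketch): only the fields supplied end up in the dynamic SET list,
* so a partial update such as the one below changes enabled and interval_hours
* for the 'global_interval' row and leaves run_time untouched. The call site
* is hypothetical.
*
*   await updateGlobalSchedule('global_interval', { enabled: true, interval_hours: 6 });
*/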
/**
* Get all store schedule statuses
*/
async function getStoreScheduleStatuses() {
const result = await migrate_1.pool.query(`SELECT * FROM crawl_schedule_status ORDER BY priority DESC, store_name`);
return result.rows;
}
/**
* Get or create per-store schedule override
*/
async function getStoreSchedule(storeId) {
const result = await migrate_1.pool.query(`
SELECT * FROM store_crawl_schedule WHERE store_id = $1
`, [storeId]);
if (result.rows.length > 0) {
return result.rows[0];
}
// Return default (use global)
return {
store_id: storeId,
enabled: true,
interval_hours: null,
daily_special_enabled: true,
daily_special_time: null,
priority: 0
};
}
/**
* Update per-store schedule override
*/
async function updateStoreSchedule(storeId, updates) {
const result = await migrate_1.pool.query(`
INSERT INTO store_crawl_schedule (store_id, enabled, interval_hours, daily_special_enabled, daily_special_time, priority)
VALUES ($1, $2, $3, $4, $5, $6)
ON CONFLICT (store_id) DO UPDATE SET
enabled = COALESCE(EXCLUDED.enabled, store_crawl_schedule.enabled),
interval_hours = EXCLUDED.interval_hours,
daily_special_enabled = COALESCE(EXCLUDED.daily_special_enabled, store_crawl_schedule.daily_special_enabled),
daily_special_time = EXCLUDED.daily_special_time,
priority = COALESCE(EXCLUDED.priority, store_crawl_schedule.priority),
updated_at = NOW()
RETURNING *
`, [
storeId,
updates.enabled ?? true,
updates.interval_hours ?? null,
updates.daily_special_enabled ?? true,
updates.daily_special_time ?? null,
updates.priority ?? 0
]);
return result.rows[0];
}
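/*
* Example (sketch, hypothetical store id): give one store a tighter crawl
* interval and a higher priority than the global defaults; any field omitted
* from the updates object falls back to the ?? defaults above.
*
*   await updateStoreSchedule(7, { interval_hours: 2, priority: 10 });
*/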
// ============================================
// Job Queue Management
// ============================================
/**
* Create a new crawl job
*/
async function createCrawlJob(storeId, jobType = 'full_crawl', triggerType = 'scheduled', scheduledAt = new Date(), priority = 0) {
// Check if there's already a pending or running job for this store
const existing = await migrate_1.pool.query(`
SELECT id FROM crawl_jobs
WHERE store_id = $1 AND status IN ('pending', 'running')
LIMIT 1
`, [storeId]);
if (existing.rows.length > 0) {
console.log(`Skipping job creation for store ${storeId} - already has pending/running job`);
return existing.rows[0];
}
const result = await migrate_1.pool.query(`
INSERT INTO crawl_jobs (store_id, job_type, trigger_type, scheduled_at, priority, status)
VALUES ($1, $2, $3, $4, $5, 'pending')
RETURNING *
`, [storeId, jobType, triggerType, scheduledAt, priority]);
console.log(`Created crawl job ${result.rows[0].id} for store ${storeId} (${triggerType})`);
return result.rows[0];
}
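/*
* Example (sketch): the duplicate check above makes this call safe to repeat.
* If the store already has a pending or running job, the existing job's row
* (just its id) is returned and nothing new is inserted. The store id is
* hypothetical.
*
*   const job = await createCrawlJob(7, 'full_crawl', 'manual', new Date(), 100);
*/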
/**
* Get pending jobs ready to run
*/
async function getPendingJobs(limit = 5) {
const result = await migrate_1.pool.query(`
SELECT cj.*, s.name as store_name
FROM crawl_jobs cj
JOIN stores s ON s.id = cj.store_id
WHERE cj.status = 'pending'
AND cj.scheduled_at <= NOW()
ORDER BY cj.priority DESC, cj.scheduled_at ASC
LIMIT $1
`, [limit]);
return result.rows;
}
/**
* Claim a job for processing
*/
async function claimJob(jobId) {
const result = await migrate_1.pool.query(`
UPDATE crawl_jobs
SET status = 'running', started_at = NOW(), worker_id = $2
WHERE id = $1 AND status = 'pending'
RETURNING id
`, [jobId, WORKER_ID]);
return result.rows.length > 0;
}
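/*
* Note: the WHERE clause above only matches jobs still in 'pending', so when
* several workers race for the same job id exactly one UPDATE touches the row;
* every other caller gets an empty RETURNING set and claimJob resolves to
* false, which processJobs treats as "claimed elsewhere" and skips.
*/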
/**
* Complete a job
*/
async function completeJob(jobId, success, results) {
await migrate_1.pool.query(`
UPDATE crawl_jobs
SET
status = $2,
completed_at = NOW(),
products_found = $3,
error_message = $4
WHERE id = $1
`, [
jobId,
success ? 'completed' : 'failed',
results?.products_found ?? null,
results?.error_message ?? null
]);
}
/**
* Get recent jobs for a store
*/
async function getRecentJobs(storeId, limit = 10) {
const result = await migrate_1.pool.query(`
SELECT * FROM crawl_jobs
WHERE store_id = $1
ORDER BY created_at DESC
LIMIT $2
`, [storeId, limit]);
return result.rows;
}
/**
* Get all recent jobs
*/
async function getAllRecentJobs(limit = 50) {
const result = await migrate_1.pool.query(`
SELECT cj.*, s.name as store_name, s.slug as store_slug
FROM crawl_jobs cj
JOIN stores s ON s.id = cj.store_id
ORDER BY cj.created_at DESC
LIMIT $1
`, [limit]);
return result.rows;
}
// ============================================
// Scheduler Logic
// ============================================
/**
* Check which stores are due for a crawl and create jobs
*/
async function checkAndCreateScheduledJobs() {
console.log('Checking for stores due for crawl...');
// Get global schedule settings
const globalSchedule = await migrate_1.pool.query(`
SELECT * FROM crawler_schedule WHERE schedule_type = 'global_interval'
`);
if (globalSchedule.rows.length === 0 || !globalSchedule.rows[0].enabled) {
console.log('Global scheduler is disabled');
return 0;
}
const intervalHours = globalSchedule.rows[0].interval_hours || 4;
// Find stores due for crawl
const result = await migrate_1.pool.query(`
SELECT
s.id,
s.name,
s.timezone,
s.last_scraped_at,
COALESCE(scs.enabled, TRUE) as schedule_enabled,
COALESCE(scs.interval_hours, $1) as interval_hours,
COALESCE(scs.priority, 0) as priority
FROM stores s
LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
WHERE s.active = TRUE
AND s.scrape_enabled = TRUE
AND COALESCE(scs.enabled, TRUE) = TRUE
AND (
s.last_scraped_at IS NULL
OR s.last_scraped_at < NOW() - (COALESCE(scs.interval_hours, $1) || ' hours')::INTERVAL
)
AND NOT EXISTS (
SELECT 1 FROM crawl_jobs cj
WHERE cj.store_id = s.id AND cj.status IN ('pending', 'running')
)
ORDER BY COALESCE(scs.priority, 0) DESC, s.last_scraped_at ASC NULLS FIRST
`, [intervalHours]);
let jobsCreated = 0;
for (const store of result.rows) {
try {
await createCrawlJob(store.id, 'full_crawl', 'scheduled', new Date(), store.priority);
jobsCreated++;
console.log(`Scheduled crawl job for: ${store.name}`);
}
catch (error) {
console.error(`Failed to create job for store ${store.name}:`, error);
}
}
console.log(`Created ${jobsCreated} scheduled crawl jobs`);
return jobsCreated;
}
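/*
* Worked example (illustrative): with the 4-hour fallback interval, a store
* last scraped at 08:00 becomes due on the first tick after 12:00; a store
* with a per-store override of interval_hours = 1 is due again at 09:00; and a
* never-scraped store (last_scraped_at IS NULL) is due immediately and, within
* the same priority, sorts ahead of stores that have been scraped before.
*/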
/**
* Check for daily special runs (12:01 AM local time)
*/
async function checkAndCreateDailySpecialJobs() {
console.log('Checking for daily special runs...');
// Get daily special schedule
const dailySchedule = await migrate_1.pool.query(`
SELECT * FROM crawler_schedule WHERE schedule_type = 'daily_special'
`);
if (dailySchedule.rows.length === 0 || !dailySchedule.rows[0].enabled) {
console.log('Daily special scheduler is disabled');
return 0;
}
const targetTime = dailySchedule.rows[0].run_time || '00:01';
// Find stores where it's currently the target time in their local timezone
// and they haven't had a daily special run today
const result = await migrate_1.pool.query(`
SELECT
s.id,
s.name,
s.timezone,
COALESCE(scs.daily_special_enabled, TRUE) as daily_special_enabled,
COALESCE(scs.daily_special_time, $1::TIME) as daily_special_time,
COALESCE(scs.priority, 0) as priority
FROM stores s
LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
WHERE s.active = TRUE
AND s.scrape_enabled = TRUE
AND COALESCE(scs.daily_special_enabled, TRUE) = TRUE
-- Check if current time in store timezone matches the target time (within 2 minutes)
AND ABS(
EXTRACT(EPOCH FROM (
(NOW() AT TIME ZONE COALESCE(s.timezone, 'America/Phoenix'))::TIME
- COALESCE(scs.daily_special_time, $1::TIME)
))
) < 120 -- within 2 minutes
-- Ensure we haven't already created a daily_special job today for this store
AND NOT EXISTS (
SELECT 1 FROM crawl_jobs cj
WHERE cj.store_id = s.id
AND cj.trigger_type = 'daily_special'
AND cj.created_at > (NOW() AT TIME ZONE COALESCE(s.timezone, 'America/Phoenix'))::DATE
)
AND NOT EXISTS (
SELECT 1 FROM crawl_jobs cj
WHERE cj.store_id = s.id AND cj.status IN ('pending', 'running')
)
ORDER BY COALESCE(scs.priority, 0) DESC
`, [targetTime]);
let jobsCreated = 0;
for (const store of result.rows) {
try {
await createCrawlJob(store.id, 'full_crawl', 'daily_special', new Date(), store.priority + 10);
jobsCreated++;
console.log(`Created daily special job for: ${store.name} (${store.timezone})`);
}
catch (error) {
console.error(`Failed to create daily special job for store ${store.name}:`, error);
}
}
if (jobsCreated > 0) {
console.log(`Created ${jobsCreated} daily special crawl jobs`);
}
return jobsCreated;
}
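/*
* Worked example (illustrative): with run_time left at its '00:01' default, a
* store defaulting to America/Phoenix is picked up on the scheduler tick where
* its local clock falls inside the two-minute window around 00:01, and the
* per-day NOT EXISTS check prevents a second daily_special job for that store
* until the next local day.
*/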
/**
* Process pending jobs
*/
async function processJobs() {
if (jobProcessorRunning) {
console.log('Job processor already running, skipping...');
return;
}
jobProcessorRunning = true;
try {
const jobs = await getPendingJobs(1); // Process one at a time for safety
for (const job of jobs) {
console.log(`Processing job ${job.id} for store: ${job.store_name}`);
const claimed = await claimJob(job.id);
if (!claimed) {
console.log(`Job ${job.id} already claimed by another worker`);
continue;
}
try {
// Call the existing scraper - DO NOT MODIFY SCRAPER LOGIC
await (0, scraper_v2_1.scrapeStore)(job.store_id);
// Update store's last_scraped_at
await migrate_1.pool.query(`
UPDATE stores SET last_scraped_at = NOW() WHERE id = $1
`, [job.store_id]);
await completeJob(job.id, true, {});
console.log(`Job ${job.id} completed successfully`);
}
catch (error) {
console.error(`Job ${job.id} failed:`, error);
await completeJob(job.id, false, { error_message: error.message });
}
}
}
finally {
jobProcessorRunning = false;
}
}
/**
* Process stores using the intelligent orchestrator
* This replaces the simple job queue approach with intelligent provider detection
*/
async function processOrchestrator() {
if (orchestratorProcessorRunning) {
console.log('Orchestrator processor already running, skipping...');
return;
}
orchestratorProcessorRunning = true;
try {
// Get stores due for orchestration (respects schedule, intervals, etc.)
const storeIds = await (0, store_crawl_orchestrator_1.getStoresDueForOrchestration)(3); // Process up to 3 at a time
if (storeIds.length === 0) {
return;
}
console.log(`Orchestrator: Processing ${storeIds.length} stores due for crawl`);
// Process each store through the orchestrator
for (const storeId of storeIds) {
try {
console.log(`Orchestrator: Starting crawl for store ${storeId}`);
const result = await (0, store_crawl_orchestrator_1.runStoreCrawlOrchestrator)(storeId);
console.log(`Orchestrator: Store ${storeId} completed - ${result.summary}`);
}
catch (error) {
console.error(`Orchestrator: Store ${storeId} failed - ${error.message}`);
}
}
console.log(`Orchestrator: Finished processing ${storeIds.length} stores`);
}
finally {
orchestratorProcessorRunning = false;
}
}
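/*
* Note: failures here are isolated per store. A throw from
* runStoreCrawlOrchestrator is logged and the loop moves on to the next store,
* and the finally block always clears the running flag so the next tick is not
* blocked.
*/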
// ============================================
// Scheduler Control
// ============================================
/**
* Set scheduler mode
*/
function setSchedulerMode(mode) {
schedulerMode = mode;
console.log(`Scheduler mode set to: ${mode}`);
}
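/*
* Note: the new mode takes effect on the next scheduler tick. The cron callback
* in startCrawlScheduler reads this module-level flag on every run, so no
* restart is required after switching between 'orchestrator' and 'legacy'.
*/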
/**
* Get current scheduler mode
*/
function getSchedulerMode() {
return schedulerMode;
}
/**
* Start the scheduler (runs every minute to check for due jobs)
*/
async function startCrawlScheduler() {
stopCrawlScheduler();
console.log(`Starting crawl scheduler in ${schedulerMode} mode...`);
// Run every minute
schedulerCronJob = node_cron_1.default.schedule('* * * * *', async () => {
try {
if (schedulerMode === 'orchestrator') {
// Use intelligent orchestrator (handles detection + crawl)
await processOrchestrator();
}
else {
// Legacy mode: job queue approach
// Check for interval-based scheduled jobs
await checkAndCreateScheduledJobs();
// Check for daily special runs
await checkAndCreateDailySpecialJobs();
// Process any pending jobs
await processJobs();
}
}
catch (error) {
console.error('Scheduler tick error:', error);
}
});
console.log(`Crawl scheduler started in ${schedulerMode} mode (checking every minute)`);
}
/**
* Stop the scheduler
*/
function stopCrawlScheduler() {
if (schedulerCronJob) {
schedulerCronJob.stop();
schedulerCronJob = null;
console.log('Crawl scheduler stopped');
}
}
/**
* Restart the scheduler
*/
async function restartCrawlScheduler() {
await startCrawlScheduler();
}
// ============================================
// Manual Triggers
// ============================================
/**
* Manually trigger a crawl for a specific store (creates a job immediately)
*/
async function triggerManualCrawl(storeId) {
console.log(`Manual crawl triggered for store ID: ${storeId}`);
return await createCrawlJob(storeId, 'full_crawl', 'manual', new Date(), 100); // High priority
}
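/*
* Example (sketch): a hypothetical admin endpoint wiring this trigger up. The
* Express router, route path, and response shape are assumptions and are not
* defined in this module.
*
*   router.post('/api/admin/stores/:id/crawl', async (req, res) => {
*     const job = await triggerManualCrawl(Number(req.params.id));
*     res.json(job);
*   });
*/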
/**
* Manually trigger crawls for all stores
*/
async function triggerAllStoresCrawl() {
console.log('Manual crawl triggered for all stores');
const result = await migrate_1.pool.query(`
SELECT id, name FROM stores
WHERE active = TRUE AND scrape_enabled = TRUE
AND NOT EXISTS (
SELECT 1 FROM crawl_jobs cj
WHERE cj.store_id = stores.id AND cj.status IN ('pending', 'running')
)
`);
let jobsCreated = 0;
for (const store of result.rows) {
await createCrawlJob(store.id, 'full_crawl', 'manual', new Date(), 50);
jobsCreated++;
}
console.log(`Created ${jobsCreated} manual crawl jobs`);
return jobsCreated;
}
/**
* Cancel a pending job
*/
async function cancelJob(jobId) {
const result = await migrate_1.pool.query(`
UPDATE crawl_jobs
SET status = 'cancelled'
WHERE id = $1 AND status = 'pending'
RETURNING id
`, [jobId]);
return result.rows.length > 0;
}
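/*
* Note: only jobs still in 'pending' can be cancelled here. A job that has
* already been claimed and moved to 'running' is left untouched, so
* cancellation never interrupts an in-flight crawl.
*/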