"use strict"; /** * Crawl Scheduler Service * * This service manages crawl scheduling using a job queue approach. * It does NOT modify the crawler - it only TRIGGERS the existing crawler. * * Features: * - Global schedule: crawl all stores every N hours * - Daily special run: 12:01 AM local store time * - Per-store schedule overrides * - Job queue for tracking pending/running crawls */ var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.getGlobalSchedule = getGlobalSchedule; exports.updateGlobalSchedule = updateGlobalSchedule; exports.getStoreScheduleStatuses = getStoreScheduleStatuses; exports.getStoreSchedule = getStoreSchedule; exports.updateStoreSchedule = updateStoreSchedule; exports.createCrawlJob = createCrawlJob; exports.getPendingJobs = getPendingJobs; exports.claimJob = claimJob; exports.completeJob = completeJob; exports.getRecentJobs = getRecentJobs; exports.getAllRecentJobs = getAllRecentJobs; exports.checkAndCreateScheduledJobs = checkAndCreateScheduledJobs; exports.checkAndCreateDailySpecialJobs = checkAndCreateDailySpecialJobs; exports.processJobs = processJobs; exports.processOrchestrator = processOrchestrator; exports.setSchedulerMode = setSchedulerMode; exports.getSchedulerMode = getSchedulerMode; exports.startCrawlScheduler = startCrawlScheduler; exports.stopCrawlScheduler = stopCrawlScheduler; exports.restartCrawlScheduler = restartCrawlScheduler; exports.triggerManualCrawl = triggerManualCrawl; exports.triggerAllStoresCrawl = triggerAllStoresCrawl; exports.cancelJob = cancelJob; const node_cron_1 = __importDefault(require("node-cron")); const migrate_1 = require("../db/migrate"); const scraper_v2_1 = require("../scraper-v2"); const store_crawl_orchestrator_1 = require("./store-crawl-orchestrator"); // Worker identification const WORKER_ID = `worker-${process.pid}-${Date.now()}`; let schedulerCronJob = null; let jobProcessorRunning = false; let orchestratorProcessorRunning = false; // Scheduler mode: 'legacy' uses job queue, 'orchestrator' uses intelligent orchestration let schedulerMode = 'orchestrator'; // ============================================ // Schedule Management // ============================================ /** * Get global schedule settings */ async function getGlobalSchedule() { const result = await migrate_1.pool.query(` SELECT * FROM crawler_schedule ORDER BY id `); return result.rows; } /** * Update global schedule setting */ async function updateGlobalSchedule(scheduleType, updates) { const setClauses = []; const values = []; let paramIndex = 1; if (updates.enabled !== undefined) { setClauses.push(`enabled = $${paramIndex++}`); values.push(updates.enabled); } if (updates.interval_hours !== undefined) { setClauses.push(`interval_hours = $${paramIndex++}`); values.push(updates.interval_hours); } if (updates.run_time !== undefined) { setClauses.push(`run_time = $${paramIndex++}`); values.push(updates.run_time); } values.push(scheduleType); const result = await migrate_1.pool.query(` UPDATE crawler_schedule SET ${setClauses.join(', ')} WHERE schedule_type = $${paramIndex} RETURNING * `, values); return result.rows[0]; } /** * Get all store schedule statuses */ async function getStoreScheduleStatuses() { const result = await migrate_1.pool.query(`SELECT * FROM crawl_schedule_status ORDER BY priority DESC, store_name`); return result.rows; } /** * Get or create per-store schedule override */ async 
/**
 * Get or create per-store schedule override
 */
async function getStoreSchedule(storeId) {
    const result = await migrate_1.pool.query(`
        SELECT * FROM store_crawl_schedule WHERE store_id = $1
    `, [storeId]);
    if (result.rows.length > 0) {
        return result.rows[0];
    }
    // Return default (use global)
    return {
        store_id: storeId,
        enabled: true,
        interval_hours: null,
        daily_special_enabled: true,
        daily_special_time: null,
        priority: 0
    };
}
/**
 * Update per-store schedule override
 */
async function updateStoreSchedule(storeId, updates) {
    const result = await migrate_1.pool.query(`
        INSERT INTO store_crawl_schedule
            (store_id, enabled, interval_hours, daily_special_enabled, daily_special_time, priority)
        VALUES ($1, $2, $3, $4, $5, $6)
        ON CONFLICT (store_id) DO UPDATE SET
            enabled = COALESCE(EXCLUDED.enabled, store_crawl_schedule.enabled),
            interval_hours = EXCLUDED.interval_hours,
            daily_special_enabled = COALESCE(EXCLUDED.daily_special_enabled, store_crawl_schedule.daily_special_enabled),
            daily_special_time = EXCLUDED.daily_special_time,
            priority = COALESCE(EXCLUDED.priority, store_crawl_schedule.priority),
            updated_at = NOW()
        RETURNING *
    `, [
        storeId,
        updates.enabled ?? true,
        updates.interval_hours ?? null,
        updates.daily_special_enabled ?? true,
        updates.daily_special_time ?? null,
        updates.priority ?? 0
    ]);
    return result.rows[0];
}
// ============================================
// Job Queue Management
// ============================================
/**
 * Create a new crawl job
 */
async function createCrawlJob(storeId, jobType = 'full_crawl', triggerType = 'scheduled', scheduledAt = new Date(), priority = 0) {
    // Check if there's already a pending or running job for this store
    const existing = await migrate_1.pool.query(`
        SELECT id FROM crawl_jobs
        WHERE store_id = $1 AND status IN ('pending', 'running')
        LIMIT 1
    `, [storeId]);
    if (existing.rows.length > 0) {
        console.log(`Skipping job creation for store ${storeId} - already has pending/running job`);
        return existing.rows[0];
    }
    const result = await migrate_1.pool.query(`
        INSERT INTO crawl_jobs (store_id, job_type, trigger_type, scheduled_at, priority, status)
        VALUES ($1, $2, $3, $4, $5, 'pending')
        RETURNING *
    `, [storeId, jobType, triggerType, scheduledAt, priority]);
    console.log(`Created crawl job ${result.rows[0].id} for store ${storeId} (${triggerType})`);
    return result.rows[0];
}
/**
 * Get pending jobs ready to run
 */
async function getPendingJobs(limit = 5) {
    const result = await migrate_1.pool.query(`
        SELECT cj.*, s.name as store_name
        FROM crawl_jobs cj
        JOIN stores s ON s.id = cj.store_id
        WHERE cj.status = 'pending' AND cj.scheduled_at <= NOW()
        ORDER BY cj.priority DESC, cj.scheduled_at ASC
        LIMIT $1
    `, [limit]);
    return result.rows;
}
/**
 * Claim a job for processing
 */
async function claimJob(jobId) {
    const result = await migrate_1.pool.query(`
        UPDATE crawl_jobs
        SET status = 'running', started_at = NOW(), worker_id = $2
        WHERE id = $1 AND status = 'pending'
        RETURNING id
    `, [jobId, WORKER_ID]);
    return result.rows.length > 0;
}
/**
 * Complete a job
 */
async function completeJob(jobId, success, results) {
    await migrate_1.pool.query(`
        UPDATE crawl_jobs
        SET status = $2, completed_at = NOW(), products_found = $3, error_message = $4
        WHERE id = $1
    `, [
        jobId,
        success ? 'completed' : 'failed',
        results?.products_found ?? null,
        results?.error_message ?? null
    ]);
}
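// ------------------------------------------------------------------
// Usage sketch: claimJob's `WHERE ... AND status = 'pending'` clause
// acts as an optimistic lock, so when two workers race for the same
// job, at most one UPDATE matches and the other claim returns false.
// The jobId argument is hypothetical; nothing in this module calls
// this function.
// ------------------------------------------------------------------
async function exampleClaimRace(jobId) {
    const [first, second] = await Promise.all([claimJob(jobId), claimJob(jobId)]);
    console.log(`claims that succeeded: ${[first, second].filter(Boolean).length} (expected at most 1)`);
}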
/**
 * Get recent jobs for a store
 */
async function getRecentJobs(storeId, limit = 10) {
    const result = await migrate_1.pool.query(`
        SELECT * FROM crawl_jobs
        WHERE store_id = $1
        ORDER BY created_at DESC
        LIMIT $2
    `, [storeId, limit]);
    return result.rows;
}
/**
 * Get all recent jobs
 */
async function getAllRecentJobs(limit = 50) {
    const result = await migrate_1.pool.query(`
        SELECT cj.*, s.name as store_name, s.slug as store_slug
        FROM crawl_jobs cj
        JOIN stores s ON s.id = cj.store_id
        ORDER BY cj.created_at DESC
        LIMIT $1
    `, [limit]);
    return result.rows;
}
// ============================================
// Scheduler Logic
// ============================================
/**
 * Check which stores are due for a crawl and create jobs
 */
async function checkAndCreateScheduledJobs() {
    console.log('Checking for stores due for crawl...');
    // Get global schedule settings
    const globalSchedule = await migrate_1.pool.query(`
        SELECT * FROM crawler_schedule WHERE schedule_type = 'global_interval'
    `);
    if (globalSchedule.rows.length === 0 || !globalSchedule.rows[0].enabled) {
        console.log('Global scheduler is disabled');
        return 0;
    }
    const intervalHours = globalSchedule.rows[0].interval_hours || 4;
    // Find stores due for crawl
    const result = await migrate_1.pool.query(`
        SELECT
            s.id, s.name, s.timezone, s.last_scraped_at,
            COALESCE(scs.enabled, TRUE) as schedule_enabled,
            COALESCE(scs.interval_hours, $1) as interval_hours,
            COALESCE(scs.priority, 0) as priority
        FROM stores s
        LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
        WHERE s.active = TRUE
            AND s.scrape_enabled = TRUE
            AND COALESCE(scs.enabled, TRUE) = TRUE
            AND (
                s.last_scraped_at IS NULL
                OR s.last_scraped_at < NOW() - (COALESCE(scs.interval_hours, $1) || ' hours')::INTERVAL
            )
            AND NOT EXISTS (
                SELECT 1 FROM crawl_jobs cj
                WHERE cj.store_id = s.id AND cj.status IN ('pending', 'running')
            )
        ORDER BY COALESCE(scs.priority, 0) DESC, s.last_scraped_at ASC NULLS FIRST
    `, [intervalHours]);
    let jobsCreated = 0;
    for (const store of result.rows) {
        try {
            await createCrawlJob(store.id, 'full_crawl', 'scheduled', new Date(), store.priority);
            jobsCreated++;
            console.log(`Scheduled crawl job for: ${store.name}`);
        }
        catch (error) {
            console.error(`Failed to create job for store ${store.name}:`, error);
        }
    }
    console.log(`Created ${jobsCreated} scheduled crawl jobs`);
    return jobsCreated;
}
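// ------------------------------------------------------------------
// Usage sketch: a one-off "tick", e.g. from a maintenance script,
// running the same checks the cron loop performs in legacy mode.
// Nothing in this module calls this function.
// ------------------------------------------------------------------
async function exampleRunOneTick() {
    const intervalJobs = await checkAndCreateScheduledJobs();
    const dailyJobs = await checkAndCreateDailySpecialJobs();
    console.log(`Tick created ${intervalJobs + dailyJobs} jobs`);
    await processJobs();
}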
/**
 * Check for daily special runs (12:01 AM local time)
 */
async function checkAndCreateDailySpecialJobs() {
    console.log('Checking for daily special runs...');
    // Get daily special schedule
    const dailySchedule = await migrate_1.pool.query(`
        SELECT * FROM crawler_schedule WHERE schedule_type = 'daily_special'
    `);
    if (dailySchedule.rows.length === 0 || !dailySchedule.rows[0].enabled) {
        console.log('Daily special scheduler is disabled');
        return 0;
    }
    const targetTime = dailySchedule.rows[0].run_time || '00:01';
    // Find stores where it's currently the target time in their local timezone
    // and they haven't had a daily special run today
    const result = await migrate_1.pool.query(`
        SELECT
            s.id, s.name, s.timezone,
            COALESCE(scs.daily_special_enabled, TRUE) as daily_special_enabled,
            COALESCE(scs.daily_special_time, $1::TIME) as daily_special_time,
            COALESCE(scs.priority, 0) as priority
        FROM stores s
        LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
        WHERE s.active = TRUE
            AND s.scrape_enabled = TRUE
            AND COALESCE(scs.daily_special_enabled, TRUE) = TRUE
            -- Check if current time in store timezone matches the target time (within 2 minutes)
            AND ABS(
                EXTRACT(EPOCH FROM (
                    (NOW() AT TIME ZONE COALESCE(s.timezone, 'America/Phoenix'))::TIME
                    - COALESCE(scs.daily_special_time, $1::TIME)
                ))
            ) < 120 -- within 2 minutes
            -- Ensure we haven't already created a daily_special job today for this store
            AND NOT EXISTS (
                SELECT 1 FROM crawl_jobs cj
                WHERE cj.store_id = s.id
                    AND cj.trigger_type = 'daily_special'
                    AND cj.created_at > (NOW() AT TIME ZONE COALESCE(s.timezone, 'America/Phoenix'))::DATE
            )
            AND NOT EXISTS (
                SELECT 1 FROM crawl_jobs cj
                WHERE cj.store_id = s.id AND cj.status IN ('pending', 'running')
            )
        ORDER BY COALESCE(scs.priority, 0) DESC
    `, [targetTime]);
    let jobsCreated = 0;
    for (const store of result.rows) {
        try {
            await createCrawlJob(store.id, 'full_crawl', 'daily_special', new Date(), store.priority + 10);
            jobsCreated++;
            console.log(`Created daily special job for: ${store.name} (${store.timezone})`);
        }
        catch (error) {
            console.error(`Failed to create daily special job for store ${store.name}:`, error);
        }
    }
    if (jobsCreated > 0) {
        console.log(`Created ${jobsCreated} daily special crawl jobs`);
    }
    return jobsCreated;
}
/**
 * Process pending jobs
 */
async function processJobs() {
    if (jobProcessorRunning) {
        console.log('Job processor already running, skipping...');
        return;
    }
    jobProcessorRunning = true;
    try {
        const jobs = await getPendingJobs(1); // Process one at a time for safety
        for (const job of jobs) {
            console.log(`Processing job ${job.id} for store: ${job.store_name}`);
            const claimed = await claimJob(job.id);
            if (!claimed) {
                console.log(`Job ${job.id} already claimed by another worker`);
                continue;
            }
            try {
                // Call the existing scraper - DO NOT MODIFY SCRAPER LOGIC
                await (0, scraper_v2_1.scrapeStore)(job.store_id);
                // Update store's last_scraped_at
                await migrate_1.pool.query(`
                    UPDATE stores SET last_scraped_at = NOW() WHERE id = $1
                `, [job.store_id]);
                await completeJob(job.id, true, {});
                console.log(`Job ${job.id} completed successfully`);
            }
            catch (error) {
                console.error(`Job ${job.id} failed:`, error);
                await completeJob(job.id, false, { error_message: error.message });
            }
        }
    }
    finally {
        jobProcessorRunning = false;
    }
}
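// ------------------------------------------------------------------
// Usage sketch: pushing a single store through the legacy queue path
// end to end. Priority 100 puts the job ahead of scheduled work, and
// processJobs() claims and runs it on the next call. The storeId
// argument is hypothetical; nothing in this module calls this function.
// ------------------------------------------------------------------
async function exampleCrawlOneStoreNow(storeId) {
    await createCrawlJob(storeId, 'full_crawl', 'manual', new Date(), 100);
    await processJobs();
}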
/**
 * Process stores using the intelligent orchestrator
 * This replaces the simple job queue approach with intelligent provider detection
 */
async function processOrchestrator() {
    if (orchestratorProcessorRunning) {
        console.log('Orchestrator processor already running, skipping...');
        return;
    }
    orchestratorProcessorRunning = true;
    try {
        // Get stores due for orchestration (respects schedule, intervals, etc.)
        const storeIds = await (0, store_crawl_orchestrator_1.getStoresDueForOrchestration)(3); // Process up to 3 at a time
        if (storeIds.length === 0) {
            return;
        }
        console.log(`Orchestrator: Processing ${storeIds.length} stores due for crawl`);
        // Process each store through the orchestrator
        for (const storeId of storeIds) {
            try {
                console.log(`Orchestrator: Starting crawl for store ${storeId}`);
                const result = await (0, store_crawl_orchestrator_1.runStoreCrawlOrchestrator)(storeId);
                console.log(`Orchestrator: Store ${storeId} completed - ${result.summary}`);
            }
            catch (error) {
                console.error(`Orchestrator: Store ${storeId} failed - ${error.message}`);
            }
        }
        console.log(`Orchestrator: Finished processing ${storeIds.length} stores`);
    }
    finally {
        orchestratorProcessorRunning = false;
    }
}
// ============================================
// Scheduler Control
// ============================================
/**
 * Set scheduler mode
 */
function setSchedulerMode(mode) {
    schedulerMode = mode;
    console.log(`Scheduler mode set to: ${mode}`);
}
/**
 * Get current scheduler mode
 */
function getSchedulerMode() {
    return schedulerMode;
}
/**
 * Start the scheduler (runs every minute to check for due jobs)
 */
async function startCrawlScheduler() {
    stopCrawlScheduler();
    console.log(`Starting crawl scheduler in ${schedulerMode} mode...`);
    // Run every minute
    schedulerCronJob = node_cron_1.default.schedule('* * * * *', async () => {
        try {
            if (schedulerMode === 'orchestrator') {
                // Use intelligent orchestrator (handles detection + crawl)
                await processOrchestrator();
            }
            else {
                // Legacy mode: job queue approach
                // Check for interval-based scheduled jobs
                await checkAndCreateScheduledJobs();
                // Check for daily special runs
                await checkAndCreateDailySpecialJobs();
                // Process any pending jobs
                await processJobs();
            }
        }
        catch (error) {
            console.error('Scheduler tick error:', error);
        }
    });
    console.log(`Crawl scheduler started in ${schedulerMode} mode (checking every minute)`);
}
/**
 * Stop the scheduler
 */
function stopCrawlScheduler() {
    if (schedulerCronJob) {
        schedulerCronJob.stop();
        schedulerCronJob = null;
        console.log('Crawl scheduler stopped');
    }
}
/**
 * Restart the scheduler
 */
async function restartCrawlScheduler() {
    await startCrawlScheduler();
}
// ============================================
// Manual Triggers
// ============================================
/**
 * Manually trigger a crawl for a specific store (creates a job immediately)
 */
async function triggerManualCrawl(storeId) {
    console.log(`Manual crawl triggered for store ID: ${storeId}`);
    return await createCrawlJob(storeId, 'full_crawl', 'manual', new Date(), 100); // High priority
}
/**
 * Manually trigger crawls for all stores
 */
async function triggerAllStoresCrawl() {
    console.log('Manual crawl triggered for all stores');
    const result = await migrate_1.pool.query(`
        SELECT id, name FROM stores
        WHERE active = TRUE
            AND scrape_enabled = TRUE
            AND NOT EXISTS (
                SELECT 1 FROM crawl_jobs cj
                WHERE cj.store_id = stores.id AND cj.status IN ('pending', 'running')
            )
    `);
    let jobsCreated = 0;
    for (const store of result.rows) {
        await createCrawlJob(store.id, 'full_crawl', 'manual', new Date(), 50);
        jobsCreated++;
    }
    console.log(`Created ${jobsCreated} manual crawl jobs`);
    return jobsCreated;
}
/**
 * Cancel a pending job
 */
async function cancelJob(jobId) {
    const result = await migrate_1.pool.query(`
        UPDATE crawl_jobs
        SET status = 'cancelled'
        WHERE id = $1 AND status = 'pending'
        RETURNING id
    `, [jobId]);
    return result.rows.length > 0;
}
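// ------------------------------------------------------------------
// Usage sketch: typical wiring at application startup. The SIGTERM
// hook and the choice of 'orchestrator' mode are assumptions about
// the host application; nothing in this module calls this function.
// ------------------------------------------------------------------
async function exampleBoot() {
    setSchedulerMode('orchestrator');
    await startCrawlScheduler();
    process.once('SIGTERM', () => stopCrawlScheduler());
}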