cannaiq/backend/dist/services/crawl-scheduler.js
commit 66e07b2009 (Kelly, 2025-12-03 18:45:05 -07:00)
fix(monitor): remove non-existent worker columns from job_run_logs query

The job_run_logs table tracks scheduled job orchestration, not individual
worker jobs. Worker info (worker_id, worker_hostname) belongs on
dispensary_crawl_jobs, not job_run_logs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

"use strict";
/**
* Crawl Scheduler Service
*
* This service manages crawl scheduling using a job queue approach.
* It does NOT modify the crawler - it only TRIGGERS the existing crawler.
*
* Features:
* - Global schedule: crawl all stores every N hours
* - Daily special run: 12:01 AM local store time
* - Per-store schedule overrides
* - Job queue for tracking pending/running crawls
*/
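/*
* Usage sketch (illustrative only). The require path, the async bootstrap
* wrapper, and the store id 42 are assumptions for the example, not part of
* this module; the functions themselves are the exports defined below.
*
*   const scheduler = require('./services/crawl-scheduler');
*
*   async function bootstrap() {
*     // Pick orchestrator (the default) or the legacy job-queue path, then
*     // start the once-a-minute cron loop.
*     scheduler.setSchedulerMode('orchestrator');
*     await scheduler.startCrawlScheduler();
*
*     // Queue a high-priority crawl for one store, or shut down cleanly.
*     await scheduler.triggerManualCrawl(42);
*     scheduler.stopCrawlScheduler();
*   }
*/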
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.getGlobalSchedule = getGlobalSchedule;
exports.updateGlobalSchedule = updateGlobalSchedule;
exports.getStoreScheduleStatuses = getStoreScheduleStatuses;
exports.getStoreSchedule = getStoreSchedule;
exports.updateStoreSchedule = updateStoreSchedule;
exports.createCrawlJob = createCrawlJob;
exports.getPendingJobs = getPendingJobs;
exports.claimJob = claimJob;
exports.completeJob = completeJob;
exports.getRecentJobs = getRecentJobs;
exports.getAllRecentJobs = getAllRecentJobs;
exports.checkAndCreateScheduledJobs = checkAndCreateScheduledJobs;
exports.checkAndCreateDailySpecialJobs = checkAndCreateDailySpecialJobs;
exports.processJobs = processJobs;
exports.processOrchestrator = processOrchestrator;
exports.setSchedulerMode = setSchedulerMode;
exports.getSchedulerMode = getSchedulerMode;
exports.startCrawlScheduler = startCrawlScheduler;
exports.stopCrawlScheduler = stopCrawlScheduler;
exports.restartCrawlScheduler = restartCrawlScheduler;
exports.triggerManualCrawl = triggerManualCrawl;
exports.triggerAllStoresCrawl = triggerAllStoresCrawl;
exports.cancelJob = cancelJob;
const node_cron_1 = __importDefault(require("node-cron"));
const migrate_1 = require("../db/migrate");
const scraper_v2_1 = require("../scraper-v2");
const store_crawl_orchestrator_1 = require("./store-crawl-orchestrator");
// Worker identification
const WORKER_ID = `worker-${process.pid}-${Date.now()}`;
let schedulerCronJob = null;
let jobProcessorRunning = false;
let orchestratorProcessorRunning = false;
// Scheduler mode: 'legacy' uses job queue, 'orchestrator' uses intelligent orchestration
let schedulerMode = 'orchestrator';
// ============================================
// Schedule Management
// ============================================
/**
* Get global schedule settings
*/
async function getGlobalSchedule() {
const result = await migrate_1.pool.query(`
SELECT * FROM crawler_schedule ORDER BY id
`);
return result.rows;
}
/**
* Update global schedule setting
*/
async function updateGlobalSchedule(scheduleType, updates) {
const setClauses = [];
const values = [];
let paramIndex = 1;
if (updates.enabled !== undefined) {
setClauses.push(`enabled = $${paramIndex++}`);
values.push(updates.enabled);
}
if (updates.interval_hours !== undefined) {
setClauses.push(`interval_hours = $${paramIndex++}`);
values.push(updates.interval_hours);
}
if (updates.run_time !== undefined) {
setClauses.push(`run_time = $${paramIndex++}`);
values.push(updates.run_time);
}
values.push(scheduleType);
const result = await migrate_1.pool.query(`
UPDATE crawler_schedule
SET ${setClauses.join(', ')}
WHERE schedule_type = $${paramIndex}
RETURNING *
`, values);
return result.rows[0];
}
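/*
* Example (sketch): only the fields supplied end up in the dynamic SET list,
* so a partial update such as the one below changes enabled and interval_hours
* for the 'global_interval' row and leaves run_time untouched. The call site
* is hypothetical.
*
*   await updateGlobalSchedule('global_interval', { enabled: true, interval_hours: 6 });
*/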
/**
* Get all store schedule statuses
*/
async function getStoreScheduleStatuses() {
const result = await migrate_1.pool.query(`SELECT * FROM crawl_schedule_status ORDER BY priority DESC, store_name`);
return result.rows;
}
/**
* Get or create per-store schedule override
*/
async function getStoreSchedule(storeId) {
const result = await migrate_1.pool.query(`
SELECT * FROM store_crawl_schedule WHERE store_id = $1
`, [storeId]);
if (result.rows.length > 0) {
return result.rows[0];
}
// Return default (use global)
return {
store_id: storeId,
enabled: true,
interval_hours: null,
daily_special_enabled: true,
daily_special_time: null,
priority: 0
};
}
/**
* Update per-store schedule override
*/
async function updateStoreSchedule(storeId, updates) {
const result = await migrate_1.pool.query(`
INSERT INTO store_crawl_schedule (store_id, enabled, interval_hours, daily_special_enabled, daily_special_time, priority)
VALUES ($1, $2, $3, $4, $5, $6)
ON CONFLICT (store_id) DO UPDATE SET
enabled = COALESCE(EXCLUDED.enabled, store_crawl_schedule.enabled),
interval_hours = EXCLUDED.interval_hours,
daily_special_enabled = COALESCE(EXCLUDED.daily_special_enabled, store_crawl_schedule.daily_special_enabled),
daily_special_time = EXCLUDED.daily_special_time,
priority = COALESCE(EXCLUDED.priority, store_crawl_schedule.priority),
updated_at = NOW()
RETURNING *
`, [
storeId,
updates.enabled ?? true,
updates.interval_hours ?? null,
updates.daily_special_enabled ?? true,
updates.daily_special_time ?? null,
updates.priority ?? 0
]);
return result.rows[0];
}
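/*
* Example (sketch, hypothetical store id): give one store a tighter crawl
* interval and a higher priority than the global defaults; any field omitted
* from the updates object falls back to the ?? defaults above.
*
*   await updateStoreSchedule(7, { interval_hours: 2, priority: 10 });
*/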
// ============================================
// Job Queue Management
// ============================================
/**
* Create a new crawl job
*/
async function createCrawlJob(storeId, jobType = 'full_crawl', triggerType = 'scheduled', scheduledAt = new Date(), priority = 0) {
// Check if there's already a pending or running job for this store
const existing = await migrate_1.pool.query(`
SELECT id FROM crawl_jobs
WHERE store_id = $1 AND status IN ('pending', 'running')
LIMIT 1
`, [storeId]);
if (existing.rows.length > 0) {
console.log(`Skipping job creation for store ${storeId} - already has pending/running job`);
return existing.rows[0];
}
const result = await migrate_1.pool.query(`
INSERT INTO crawl_jobs (store_id, job_type, trigger_type, scheduled_at, priority, status)
VALUES ($1, $2, $3, $4, $5, 'pending')
RETURNING *
`, [storeId, jobType, triggerType, scheduledAt, priority]);
console.log(`Created crawl job ${result.rows[0].id} for store ${storeId} (${triggerType})`);
return result.rows[0];
}
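/*
* Example (sketch): the duplicate check above makes this call safe to repeat.
* If the store already has a pending or running job, the existing job's row
* (just its id) is returned and nothing new is inserted. The store id is
* hypothetical.
*
*   const job = await createCrawlJob(7, 'full_crawl', 'manual', new Date(), 100);
*/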
/**
* Get pending jobs ready to run
*/
async function getPendingJobs(limit = 5) {
const result = await migrate_1.pool.query(`
SELECT cj.*, s.name as store_name
FROM crawl_jobs cj
JOIN stores s ON s.id = cj.store_id
WHERE cj.status = 'pending'
AND cj.scheduled_at <= NOW()
ORDER BY cj.priority DESC, cj.scheduled_at ASC
LIMIT $1
`, [limit]);
return result.rows;
}
/**
* Claim a job for processing
*/
async function claimJob(jobId) {
const result = await migrate_1.pool.query(`
UPDATE crawl_jobs
SET status = 'running', started_at = NOW(), worker_id = $2
WHERE id = $1 AND status = 'pending'
RETURNING id
`, [jobId, WORKER_ID]);
return result.rows.length > 0;
}
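/*
* Note: the WHERE clause above only matches jobs still in 'pending', so when
* several workers race for the same job id exactly one UPDATE touches the row;
* every other caller gets an empty RETURNING set and claimJob resolves to
* false, which processJobs treats as "claimed elsewhere" and skips.
*/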
/**
* Complete a job
*/
async function completeJob(jobId, success, results) {
await migrate_1.pool.query(`
UPDATE crawl_jobs
SET
status = $2,
completed_at = NOW(),
products_found = $3,
error_message = $4
WHERE id = $1
`, [
jobId,
success ? 'completed' : 'failed',
results?.products_found ?? null,
results?.error_message ?? null
]);
}
/**
* Get recent jobs for a store
*/
async function getRecentJobs(storeId, limit = 10) {
const result = await migrate_1.pool.query(`
SELECT * FROM crawl_jobs
WHERE store_id = $1
ORDER BY created_at DESC
LIMIT $2
`, [storeId, limit]);
return result.rows;
}
/**
* Get all recent jobs
*/
async function getAllRecentJobs(limit = 50) {
const result = await migrate_1.pool.query(`
SELECT cj.*, s.name as store_name, s.slug as store_slug
FROM crawl_jobs cj
JOIN stores s ON s.id = cj.store_id
ORDER BY cj.created_at DESC
LIMIT $1
`, [limit]);
return result.rows;
}
// ============================================
// Scheduler Logic
// ============================================
/**
* Check which stores are due for a crawl and create jobs
*/
async function checkAndCreateScheduledJobs() {
console.log('Checking for stores due for crawl...');
// Get global schedule settings
const globalSchedule = await migrate_1.pool.query(`
SELECT * FROM crawler_schedule WHERE schedule_type = 'global_interval'
`);
if (globalSchedule.rows.length === 0 || !globalSchedule.rows[0].enabled) {
console.log('Global scheduler is disabled');
return 0;
}
const intervalHours = globalSchedule.rows[0].interval_hours || 4;
// Find stores due for crawl
const result = await migrate_1.pool.query(`
SELECT
s.id,
s.name,
s.timezone,
s.last_scraped_at,
COALESCE(scs.enabled, TRUE) as schedule_enabled,
COALESCE(scs.interval_hours, $1) as interval_hours,
COALESCE(scs.priority, 0) as priority
FROM stores s
LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
WHERE s.active = TRUE
AND s.scrape_enabled = TRUE
AND COALESCE(scs.enabled, TRUE) = TRUE
AND (
s.last_scraped_at IS NULL
OR s.last_scraped_at < NOW() - (COALESCE(scs.interval_hours, $1) || ' hours')::INTERVAL
)
AND NOT EXISTS (
SELECT 1 FROM crawl_jobs cj
WHERE cj.store_id = s.id AND cj.status IN ('pending', 'running')
)
ORDER BY COALESCE(scs.priority, 0) DESC, s.last_scraped_at ASC NULLS FIRST
`, [intervalHours]);
let jobsCreated = 0;
for (const store of result.rows) {
try {
await createCrawlJob(store.id, 'full_crawl', 'scheduled', new Date(), store.priority);
jobsCreated++;
console.log(`Scheduled crawl job for: ${store.name}`);
}
catch (error) {
console.error(`Failed to create job for store ${store.name}:`, error);
}
}
console.log(`Created ${jobsCreated} scheduled crawl jobs`);
return jobsCreated;
}
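/*
* Worked example (illustrative): with the 4-hour fallback interval, a store
* last scraped at 08:00 becomes due on the first tick after 12:00; a store
* with a per-store override of interval_hours = 1 is due again at 09:00; and a
* never-scraped store (last_scraped_at IS NULL) is due immediately and, within
* the same priority, sorts ahead of stores that have been scraped before.
*/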
/**
* Check for daily special runs (12:01 AM local time)
*/
async function checkAndCreateDailySpecialJobs() {
console.log('Checking for daily special runs...');
// Get daily special schedule
const dailySchedule = await migrate_1.pool.query(`
SELECT * FROM crawler_schedule WHERE schedule_type = 'daily_special'
`);
if (dailySchedule.rows.length === 0 || !dailySchedule.rows[0].enabled) {
console.log('Daily special scheduler is disabled');
return 0;
}
const targetTime = dailySchedule.rows[0].run_time || '00:01';
// Find stores where it's currently the target time in their local timezone
// and they haven't had a daily special run today
const result = await migrate_1.pool.query(`
SELECT
s.id,
s.name,
s.timezone,
COALESCE(scs.daily_special_enabled, TRUE) as daily_special_enabled,
COALESCE(scs.daily_special_time, $1::TIME) as daily_special_time,
COALESCE(scs.priority, 0) as priority
FROM stores s
LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
WHERE s.active = TRUE
AND s.scrape_enabled = TRUE
AND COALESCE(scs.daily_special_enabled, TRUE) = TRUE
-- Check if current time in store timezone matches the target time (within 2 minutes)
AND ABS(
EXTRACT(EPOCH FROM (
(NOW() AT TIME ZONE COALESCE(s.timezone, 'America/Phoenix'))::TIME
- COALESCE(scs.daily_special_time, $1::TIME)
))
) < 120 -- within 2 minutes
-- Ensure we haven't already created a daily_special job today for this store
AND NOT EXISTS (
SELECT 1 FROM crawl_jobs cj
WHERE cj.store_id = s.id
AND cj.trigger_type = 'daily_special'
AND cj.created_at > (NOW() AT TIME ZONE COALESCE(s.timezone, 'America/Phoenix'))::DATE
)
AND NOT EXISTS (
SELECT 1 FROM crawl_jobs cj
WHERE cj.store_id = s.id AND cj.status IN ('pending', 'running')
)
ORDER BY COALESCE(scs.priority, 0) DESC
`, [targetTime]);
let jobsCreated = 0;
for (const store of result.rows) {
try {
await createCrawlJob(store.id, 'full_crawl', 'daily_special', new Date(), store.priority + 10);
jobsCreated++;
console.log(`Created daily special job for: ${store.name} (${store.timezone})`);
}
catch (error) {
console.error(`Failed to create daily special job for store ${store.name}:`, error);
}
}
if (jobsCreated > 0) {
console.log(`Created ${jobsCreated} daily special crawl jobs`);
}
return jobsCreated;
}
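/*
* Worked example (illustrative): with run_time left at its '00:01' default, a
* store defaulting to America/Phoenix is picked up on the scheduler tick where
* its local clock falls inside the two-minute window around 00:01, and the
* per-day NOT EXISTS check prevents a second daily_special job for that store
* until the next local day.
*/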
/**
* Process pending jobs
*/
async function processJobs() {
if (jobProcessorRunning) {
console.log('Job processor already running, skipping...');
return;
}
jobProcessorRunning = true;
try {
const jobs = await getPendingJobs(1); // Process one at a time for safety
for (const job of jobs) {
console.log(`Processing job ${job.id} for store: ${job.store_name}`);
const claimed = await claimJob(job.id);
if (!claimed) {
console.log(`Job ${job.id} already claimed by another worker`);
continue;
}
try {
// Call the existing scraper - DO NOT MODIFY SCRAPER LOGIC
await (0, scraper_v2_1.scrapeStore)(job.store_id);
// Update store's last_scraped_at
await migrate_1.pool.query(`
UPDATE stores SET last_scraped_at = NOW() WHERE id = $1
`, [job.store_id]);
await completeJob(job.id, true, {});
console.log(`Job ${job.id} completed successfully`);
}
catch (error) {
console.error(`Job ${job.id} failed:`, error);
await completeJob(job.id, false, { error_message: error.message });
}
}
}
finally {
jobProcessorRunning = false;
}
}
/**
* Process stores using the intelligent orchestrator
* This replaces the simple job queue approach with intelligent provider detection
*/
async function processOrchestrator() {
if (orchestratorProcessorRunning) {
console.log('Orchestrator processor already running, skipping...');
return;
}
orchestratorProcessorRunning = true;
try {
// Get stores due for orchestration (respects schedule, intervals, etc.)
const storeIds = await (0, store_crawl_orchestrator_1.getStoresDueForOrchestration)(3); // Process up to 3 at a time
if (storeIds.length === 0) {
return;
}
console.log(`Orchestrator: Processing ${storeIds.length} stores due for crawl`);
// Process each store through the orchestrator
for (const storeId of storeIds) {
try {
console.log(`Orchestrator: Starting crawl for store ${storeId}`);
const result = await (0, store_crawl_orchestrator_1.runStoreCrawlOrchestrator)(storeId);
console.log(`Orchestrator: Store ${storeId} completed - ${result.summary}`);
}
catch (error) {
console.error(`Orchestrator: Store ${storeId} failed - ${error.message}`);
}
}
console.log(`Orchestrator: Finished processing ${storeIds.length} stores`);
}
finally {
orchestratorProcessorRunning = false;
}
}
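/*
* Note: failures here are isolated per store. A throw from
* runStoreCrawlOrchestrator is logged and the loop moves on to the next store,
* and the finally block always clears the running flag so the next tick is not
* blocked.
*/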
// ============================================
// Scheduler Control
// ============================================
/**
* Set scheduler mode
*/
function setSchedulerMode(mode) {
schedulerMode = mode;
console.log(`Scheduler mode set to: ${mode}`);
}
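/*
* Note: the new mode takes effect on the next scheduler tick. The cron callback
* in startCrawlScheduler reads this module-level flag on every run, so no
* restart is required after switching between 'orchestrator' and 'legacy'.
*/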
/**
* Get current scheduler mode
*/
function getSchedulerMode() {
return schedulerMode;
}
/**
* Start the scheduler (runs every minute to check for due jobs)
*/
async function startCrawlScheduler() {
stopCrawlScheduler();
console.log(`Starting crawl scheduler in ${schedulerMode} mode...`);
// Run every minute
schedulerCronJob = node_cron_1.default.schedule('* * * * *', async () => {
try {
if (schedulerMode === 'orchestrator') {
// Use intelligent orchestrator (handles detection + crawl)
await processOrchestrator();
}
else {
// Legacy mode: job queue approach
// Check for interval-based scheduled jobs
await checkAndCreateScheduledJobs();
// Check for daily special runs
await checkAndCreateDailySpecialJobs();
// Process any pending jobs
await processJobs();
}
}
catch (error) {
console.error('Scheduler tick error:', error);
}
});
console.log(`Crawl scheduler started in ${schedulerMode} mode (checking every minute)`);
}
/**
* Stop the scheduler
*/
function stopCrawlScheduler() {
if (schedulerCronJob) {
schedulerCronJob.stop();
schedulerCronJob = null;
console.log('Crawl scheduler stopped');
}
}
/**
* Restart the scheduler
*/
async function restartCrawlScheduler() {
await startCrawlScheduler();
}
// ============================================
// Manual Triggers
// ============================================
/**
* Manually trigger a crawl for a specific store (creates a job immediately)
*/
async function triggerManualCrawl(storeId) {
console.log(`Manual crawl triggered for store ID: ${storeId}`);
return await createCrawlJob(storeId, 'full_crawl', 'manual', new Date(), 100); // High priority
}
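/*
* Example (sketch): a hypothetical admin endpoint wiring this trigger up. The
* Express router, route path, and response shape are assumptions and are not
* defined in this module.
*
*   router.post('/api/admin/stores/:id/crawl', async (req, res) => {
*     const job = await triggerManualCrawl(Number(req.params.id));
*     res.json(job);
*   });
*/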
/**
* Manually trigger crawls for all stores
*/
async function triggerAllStoresCrawl() {
console.log('Manual crawl triggered for all stores');
const result = await migrate_1.pool.query(`
SELECT id, name FROM stores
WHERE active = TRUE AND scrape_enabled = TRUE
AND NOT EXISTS (
SELECT 1 FROM crawl_jobs cj
WHERE cj.store_id = stores.id AND cj.status IN ('pending', 'running')
)
`);
let jobsCreated = 0;
for (const store of result.rows) {
await createCrawlJob(store.id, 'full_crawl', 'manual', new Date(), 50);
jobsCreated++;
}
console.log(`Created ${jobsCreated} manual crawl jobs`);
return jobsCreated;
}
/**
* Cancel a pending job
*/
async function cancelJob(jobId) {
const result = await migrate_1.pool.query(`
UPDATE crawl_jobs
SET status = 'cancelled'
WHERE id = $1 AND status = 'pending'
RETURNING id
`, [jobId]);
return result.rows.length > 0;
}
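/*
* Note: only jobs still in 'pending' can be cancelled here. A job that has
* already been claimed and moved to 'running' is left untouched, so
* cancellation never interrupts an in-flight crawl.
*/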