#!/usr/bin/env -S npx tsx
/**
 * Queue Dispensaries Script
 *
 * Orchestrates the multi-provider crawler system:
 * 1. Queue dispensaries that need provider detection
 * 2. Queue Dutchie dispensaries for production crawl
 * 3. Queue sandbox dispensaries for learning crawls
 *
 * Usage:
 *   npx tsx src/scripts/queue-dispensaries.ts [--detection] [--production] [--sandbox] [--all]
 *   npx tsx src/scripts/queue-dispensaries.ts --dry-run
 *   npx tsx src/scripts/queue-dispensaries.ts --process   # Process queued jobs
 */

import { pool } from '../db/pool';
import { logger } from '../services/logger';
import {
  runDetectMenuProviderJob,
  runDutchieMenuCrawlJob,
  runSandboxCrawlJob,
  processSandboxJobs,
} from '../services/crawler-jobs';

/** Value used for `--limit` when the flag is absent or given without a value. */
const DEFAULT_LIMIT = 10;

/** Horizontal rule printed around the title and the summary. */
const BANNER = '═'.repeat(55);

/**
 * Extracts the `--limit=N` value from the argument list.
 *
 * @param argv Command line arguments (without the node/script prefix).
 * @returns The parsed non-negative integer, `DEFAULT_LIMIT` when the flag is
 *   missing or empty, or `NaN` when the value is not a plain decimal integer
 *   (the caller is expected to reject `NaN` before using it in SQL).
 */
function parseLimit(argv: readonly string[]): number {
  const prefix = '--limit=';
  const raw = argv.find(a => a.startsWith(prefix))?.slice(prefix.length);
  if (raw === undefined || raw === '') {
    return DEFAULT_LIMIT;
  }
  return /^\d+$/.test(raw) ? Number.parseInt(raw, 10) : Number.NaN;
}

/** Converts a caught value of unknown type into a printable message. */
function errorMessage(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

// Parse command line args
const args = process.argv.slice(2);
const flags = {
  detection: args.includes('--detection') || args.includes('--all'),
  production: args.includes('--production') || args.includes('--all'),
  sandbox: args.includes('--sandbox') || args.includes('--all'),
  dryRun: args.includes('--dry-run'),
  process: args.includes('--process'),
  help: args.includes('--help') || args.includes('-h'),
  limit: parseLimit(args),
};

// If no specific flags, default to all
if (!flags.detection && !flags.production && !flags.sandbox && !flags.process) {
  flags.detection = true;
  flags.production = true;
  flags.sandbox = true;
}

/** Prints the command line usage text. */
async function showHelp(): Promise<void> {
  console.log(`
Queue Dispensaries - Multi-Provider Crawler Orchestration

USAGE:
  npx tsx src/scripts/queue-dispensaries.ts [OPTIONS]

OPTIONS:
  --detection     Queue dispensaries that need provider detection
  --production    Queue Dutchie production crawls
  --sandbox       Queue sandbox/learning crawls
  --all           Queue all job types (default if no specific flag)
  --process       Process queued jobs instead of just queuing
  --dry-run       Show what would be queued without making changes
  --limit=N       Maximum dispensaries to queue per type (default: 10)
  --help, -h      Show this help message

EXAMPLES:
  # Queue all dispensaries for appropriate jobs
  npx tsx src/scripts/queue-dispensaries.ts

  # Only queue detection jobs
  npx tsx src/scripts/queue-dispensaries.ts --detection --limit=20

  # Dry run to see what would be queued
  npx tsx src/scripts/queue-dispensaries.ts --dry-run

  # Process sandbox jobs
  npx tsx src/scripts/queue-dispensaries.ts --process
`);
}

/**
 * Queues provider-detection jobs for idle dispensaries whose menu provider is
 * unknown or was detected with confidence below 70.
 *
 * In dry-run mode nothing is written; the candidates are only listed.
 *
 * @returns Number of dispensaries queued (or that would be queued in dry-run).
 */
async function queueDetectionJobs(): Promise<number> {
  console.log('\n📡 Queueing Detection Jobs...');

  // Find dispensaries that need provider detection:
  // - has a website or menu URL
  // - crawler_status is idle (not already queued/running)
  // - menu_provider is null OR menu_provider_confidence < 70
  // Dispensaries with no provider at all are taken first.
  const query = `
    SELECT id, name, website, menu_url, menu_provider, menu_provider_confidence
    FROM dispensaries
    WHERE (website IS NOT NULL OR menu_url IS NOT NULL)
      AND crawler_status = 'idle'
      AND (menu_provider IS NULL OR menu_provider_confidence < 70)
    ORDER BY
      CASE WHEN menu_provider IS NULL THEN 0 ELSE 1 END,
      menu_provider_confidence ASC
    LIMIT $1
  `;

  const result = await pool.query(query, [flags.limit]);

  if (flags.dryRun) {
    console.log(`  Would queue ${result.rows.length} dispensaries for detection:`);
    for (const row of result.rows) {
      const confidence =
        row.menu_provider_confidence == null ? 'n/a' : `${row.menu_provider_confidence}%`;
      console.log(
        `    - [${row.id}] ${row.name} (current: ${row.menu_provider || 'unknown'}, confidence: ${confidence})`
      );
    }
    return result.rows.length;
  }

  let queued = 0;
  for (const dispensary of result.rows) {
    try {
      // Status change and job creation run as ONE statement so a failed
      // insert cannot leave the dispensary marked as queued without a job.
      const inserted = await pool.query(
        `WITH marked AS (
           UPDATE dispensaries
           SET crawler_status = 'queued_detection', updated_at = NOW()
           WHERE id = $1
           RETURNING id
         )
         INSERT INTO sandbox_crawl_jobs (dispensary_id, job_type, status, priority)
         SELECT id, 'detection', 'pending', 10 FROM marked
         RETURNING dispensary_id`,
        [dispensary.id]
      );

      if (inserted.rows.length === 0) {
        console.error(`  ✗ Failed to queue [${dispensary.id}]: dispensary row not found`);
        continue;
      }

      console.log(`  ✓ Queued detection: [${dispensary.id}] ${dispensary.name}`);
      queued++;
    } catch (error: unknown) {
      console.error(`  ✗ Failed to queue [${dispensary.id}]: ${errorMessage(error)}`);
    }
  }

  return queued;
}

/**
 * Queues production crawl jobs (in `crawl_jobs`) for idle Dutchie
 * dispensaries in production mode that were never scraped or were last
 * scraped more than 4 hours ago.
 *
 * A dispensary is only marked as queued when a matching row in `stores` was
 * found and a job was actually inserted; otherwise it is skipped and left idle.
 *
 * @returns Number of dispensaries queued (or that would be queued in dry-run).
 */
async function queueProductionCrawls(): Promise<number> {
  console.log('\n🏭 Queueing Production Dutchie Crawls...');

  // Find Dutchie dispensaries ready for production crawl:
  // - menu_provider = 'dutchie'
  // - crawler_mode = 'production'
  // - crawler_status is idle
  // - last_menu_scrape is null or older than 4 hours (never-scraped first)
  const query = `
    SELECT d.id, d.name, d.last_menu_scrape, d.menu_url
    FROM dispensaries d
    WHERE d.menu_provider = 'dutchie'
      AND d.crawler_mode = 'production'
      AND d.crawler_status = 'idle'
      AND (d.last_menu_scrape IS NULL OR d.last_menu_scrape < NOW() - INTERVAL '4 hours')
    ORDER BY
      CASE WHEN d.last_menu_scrape IS NULL THEN 0 ELSE 1 END,
      d.last_menu_scrape ASC
    LIMIT $1
  `;

  const result = await pool.query(query, [flags.limit]);

  if (flags.dryRun) {
    console.log(`  Would queue ${result.rows.length} Dutchie dispensaries for production crawl:`);
    for (const row of result.rows) {
      const lastScrape = row.last_menu_scrape
        ? new Date(row.last_menu_scrape).toISOString()
        : 'never';
      console.log(`    - [${row.id}] ${row.name} (last scrape: ${lastScrape})`);
    }
    return result.rows.length;
  }

  let queued = 0;
  for (const dispensary of result.rows) {
    try {
      // Create the crawl job in the main crawl_jobs table (production queue)
      // and flip the dispensary status in the same statement. The status is
      // only updated when the INSERT produced a row, i.e. a store matched.
      // An exact menu URL match is preferred over the fuzzy name match, and
      // d.id is used in the metadata instead of a bare bind parameter so the
      // value has a known type.
      const updated = await pool.query(
        `WITH new_job AS (
           INSERT INTO crawl_jobs (store_id, job_type, trigger_type, status, priority, metadata)
           SELECT s.id, 'full_crawl', 'scheduled', 'pending', 50,
                  jsonb_build_object('dispensary_id', d.id, 'source', 'queue-dispensaries')
           FROM stores s
           JOIN dispensaries d
             ON (d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%')
           WHERE d.id = $1
           ORDER BY CASE WHEN d.menu_url = s.dutchie_url THEN 0 ELSE 1 END, s.id
           LIMIT 1
           RETURNING store_id
         )
         UPDATE dispensaries
         SET crawler_status = 'queued_crawl', updated_at = NOW()
         WHERE id = $1
           AND EXISTS (SELECT 1 FROM new_job)
         RETURNING id`,
        [dispensary.id]
      );

      if (updated.rows.length === 0) {
        console.error(
          `  ✗ Skipped [${dispensary.id}] ${dispensary.name}: no matching store, no crawl job created`
        );
        continue;
      }

      console.log(`  ✓ Queued production crawl: [${dispensary.id}] ${dispensary.name}`);
      queued++;
    } catch (error: unknown) {
      console.error(`  ✗ Failed to queue [${dispensary.id}]: ${errorMessage(error)}`);
    }
  }

  return queued;
}

/**
 * Queues `deep_crawl` sandbox jobs for sandbox-mode dispensaries that are
 * idle or flagged `error_needs_review` and have no pending/running sandbox job.
 *
 * @returns Number of dispensaries queued (or that would be queued in dry-run).
 */
async function queueSandboxCrawls(): Promise<number> {
  console.log('\n🧪 Queueing Sandbox Crawls...');

  // Find sandbox dispensaries needing crawls:
  // - crawler_mode = 'sandbox'
  // - crawler_status in (idle, error_needs_review)
  // - has a website or menu URL
  // - no pending/running sandbox job
  const query = `
    SELECT d.id, d.name, d.menu_provider, d.crawler_status, d.website
    FROM dispensaries d
    WHERE d.crawler_mode = 'sandbox'
      AND d.crawler_status IN ('idle', 'error_needs_review')
      AND (d.website IS NOT NULL OR d.menu_url IS NOT NULL)
      AND NOT EXISTS (
        SELECT 1 FROM sandbox_crawl_jobs sj
        WHERE sj.dispensary_id = d.id
          AND sj.status IN ('pending', 'running')
      )
    ORDER BY d.updated_at ASC
    LIMIT $1
  `;

  const result = await pool.query(query, [flags.limit]);

  if (flags.dryRun) {
    console.log(`  Would queue ${result.rows.length} dispensaries for sandbox crawl:`);
    for (const row of result.rows) {
      console.log(
        `    - [${row.id}] ${row.name} (provider: ${row.menu_provider || 'unknown'}, status: ${row.crawler_status})`
      );
    }
    return result.rows.length;
  }

  let queued = 0;
  for (const dispensary of result.rows) {
    try {
      // Single statement: status update and job insert succeed or fail together.
      const inserted = await pool.query(
        `WITH marked AS (
           UPDATE dispensaries
           SET crawler_status = 'queued_crawl', updated_at = NOW()
           WHERE id = $1
           RETURNING id
         )
         INSERT INTO sandbox_crawl_jobs (dispensary_id, job_type, status, priority)
         SELECT id, 'deep_crawl', 'pending', 5 FROM marked
         RETURNING dispensary_id`,
        [dispensary.id]
      );

      if (inserted.rows.length === 0) {
        console.error(`  ✗ Failed to queue [${dispensary.id}]: dispensary row not found`);
        continue;
      }

      console.log(`  ✓ Queued sandbox crawl: [${dispensary.id}] ${dispensary.name}`);
      queued++;
    } catch (error: unknown) {
      console.error(`  ✗ Failed to queue [${dispensary.id}]: ${errorMessage(error)}`);
    }
  }

  return queued;
}

/**
 * Runs up to `flags.limit` pending sandbox jobs (detection and sandbox
 * crawls), highest priority first, recording the outcome on each job row.
 *
 * In dry-run mode the pending jobs are listed and nothing is executed or
 * written.
 */
async function processJobs(): Promise<void> {
  console.log('\n⚙️ Processing Queued Jobs...\n');

  // Process sandbox jobs (detection + sandbox crawls)
  const sandboxJobs = await pool.query(
    `SELECT * FROM sandbox_crawl_jobs
     WHERE status = 'pending'
     ORDER BY priority DESC, scheduled_at ASC
     LIMIT $1`,
    [flags.limit]
  );

  console.log(`Found ${sandboxJobs.rows.length} pending sandbox jobs\n`);

  if (flags.dryRun) {
    // Honour the "no changes will be made" promise of --dry-run.
    for (const job of sandboxJobs.rows) {
      console.log(
        `  Would process job ${job.id} (${job.job_type}) for dispensary ${job.dispensary_id}`
      );
    }
    return;
  }

  for (const job of sandboxJobs.rows) {
    console.log(
      `Processing job ${job.id} (${job.job_type}) for dispensary ${job.dispensary_id}...`
    );

    try {
      // Mark as running
      await pool.query(
        `UPDATE sandbox_crawl_jobs SET status = 'running', started_at = NOW() WHERE id = $1`,
        [job.id]
      );

      const result =
        job.job_type === 'detection'
          ? await runDetectMenuProviderJob(job.dispensary_id)
          : await runSandboxCrawlJob(job.dispensary_id, job.sandbox_id);

      // Update job status
      await pool.query(
        `UPDATE sandbox_crawl_jobs
         SET status = $1, completed_at = NOW(), result_summary = $2, error_message = $3
         WHERE id = $4`,
        [
          result.success ? 'completed' : 'failed',
          JSON.stringify(result.data || {}),
          result.success ? null : result.message,
          job.id,
        ]
      );

      console.log(`  ${result.success ? '✓' : '✗'} ${result.message}\n`);
    } catch (error: unknown) {
      const message = errorMessage(error);
      try {
        await pool.query(
          `UPDATE sandbox_crawl_jobs
           SET status = 'failed', completed_at = NOW(), error_message = $1
           WHERE id = $2`,
          [message, job.id]
        );
      } catch (updateError: unknown) {
        // Recording the failure is best-effort; keep going with the next job.
        console.error(
          `  ✗ Could not record failure for job ${job.id}: ${errorMessage(updateError)}`
        );
      }
      console.log(`  ✗ Error: ${message}\n`);
    }
  }
}

/** Prints aggregate counts for `dispensaries` and `sandbox_crawl_jobs`. */
async function showStats(): Promise<void> {
  console.log('\n📊 Current Stats:');

  // Dispensary stats
  const stats = await pool.query(`
    SELECT
      COUNT(*) as total,
      COUNT(*) FILTER (WHERE menu_provider IS NULL) as no_provider,
      COUNT(*) FILTER (WHERE menu_provider = 'dutchie') as dutchie,
      COUNT(*) FILTER (WHERE menu_provider NOT IN ('dutchie', 'unknown') AND menu_provider IS NOT NULL) as other_providers,
      COUNT(*) FILTER (WHERE menu_provider = 'unknown') as unknown,
      COUNT(*) FILTER (WHERE crawler_mode = 'production') as production_mode,
      COUNT(*) FILTER (WHERE crawler_mode = 'sandbox') as sandbox_mode,
      COUNT(*) FILTER (WHERE crawler_status = 'idle') as idle,
      COUNT(*) FILTER (WHERE crawler_status LIKE 'queued%') as queued,
      COUNT(*) FILTER (WHERE crawler_status = 'running') as running,
      COUNT(*) FILTER (WHERE crawler_status = 'ok') as ok,
      COUNT(*) FILTER (WHERE crawler_status = 'error_needs_review') as needs_review
    FROM dispensaries
  `);

  const s = stats.rows[0];
  console.log(`
  Dispensaries: ${s.total}
    - No provider detected: ${s.no_provider}
    - Dutchie: ${s.dutchie}
    - Other providers: ${s.other_providers}
    - Unknown: ${s.unknown}

  Crawler Mode:
    - Production: ${s.production_mode}
    - Sandbox: ${s.sandbox_mode}

  Status:
    - Idle: ${s.idle}
    - Queued: ${s.queued}
    - Running: ${s.running}
    - OK: ${s.ok}
    - Needs Review: ${s.needs_review}
`);

  // Job stats
  const jobStats = await pool.query(`
    SELECT
      COUNT(*) FILTER (WHERE status = 'pending') as pending,
      COUNT(*) FILTER (WHERE status = 'running') as running,
      COUNT(*) FILTER (WHERE status = 'completed') as completed,
      COUNT(*) FILTER (WHERE status = 'failed') as failed
    FROM sandbox_crawl_jobs
  `);

  const j = jobStats.rows[0];
  console.log(`
  Sandbox Jobs:
    - Pending: ${j.pending}
    - Running: ${j.running}
    - Completed: ${j.completed}
    - Failed: ${j.failed}
`);
}

/**
 * Script entry point: prints stats, then either processes pending jobs
 * (`--process`) or queues the selected job types, and finally closes the pool.
 *
 * On a fatal error the exit code is set to 1 instead of exiting immediately,
 * so the `finally` block still gets to close the database pool.
 */
async function main(): Promise<void> {
  if (flags.help) {
    await showHelp();
    process.exit(0);
  }

  if (Number.isNaN(flags.limit)) {
    console.error('Invalid --limit value: expected a non-negative integer (e.g. --limit=20)');
    process.exit(1);
  }

  console.log(BANNER);
  console.log('  Multi-Provider Crawler Queue Manager');
  console.log(BANNER);

  if (flags.dryRun) {
    console.log('\n🔍 DRY RUN MODE - No changes will be made\n');
  }

  try {
    // Show current stats first
    await showStats();

    if (flags.process) {
      // Process mode - run jobs instead of queuing
      await processJobs();
    } else {
      // Queuing mode
      let totalQueued = 0;

      if (flags.detection) {
        totalQueued += await queueDetectionJobs();
      }

      if (flags.production) {
        totalQueued += await queueProductionCrawls();
      }

      if (flags.sandbox) {
        totalQueued += await queueSandboxCrawls();
      }

      console.log(`\n${BANNER}`);
      console.log(`  Total dispensaries queued: ${totalQueued}`);
      console.log(`${BANNER}\n`);
    }

    // Show updated stats
    if (!flags.dryRun) {
      await showStats();
    }
  } catch (error) {
    console.error('Fatal error:', error);
    process.exitCode = 1;
  } finally {
    await pool.end();
  }
}

main().catch((error: unknown) => {
  // Reached only if cleanup itself fails (e.g. pool.end() rejects).
  console.error('Fatal error:', error);
  process.exitCode = 1;
});