#!/usr/bin/env npx tsx
"use strict";
/**
 * Bootstrap Discovery Script
 *
 * One-time (but reusable) bootstrap command that:
 * 1. Ensures every Dispensary has a dispensary_crawl_schedule entry (4h default)
 * 2. Optionally runs RunDispensaryOrchestrator for each dispensary
 *
 * Usage:
 *   npx tsx src/scripts/bootstrap-discovery.ts                   # Create schedules only
 *   npx tsx src/scripts/bootstrap-discovery.ts --run             # Create schedules + run orchestrator
 *   npx tsx src/scripts/bootstrap-discovery.ts --run --limit=10  # Run for first 10 dispensaries
 *   npx tsx src/scripts/bootstrap-discovery.ts --dry-run         # Preview what would happen
 *   npx tsx src/scripts/bootstrap-discovery.ts --status          # Show current status only
 */
Object.defineProperty(exports, "__esModule", { value: true });
const migrate_1 = require("../db/migrate");
const dispensary_orchestrator_1 = require("../services/dispensary-orchestrator");

// Parse command line args
const args = process.argv.slice(2);

/**
 * Reads an integer option of the form `--<name>=<value>` from the CLI args.
 *
 * A missing or empty value yields `defaultValue`. A value that does not parse
 * as a base-10 integer, or that is below `min`, is reported on stderr and
 * replaced by `defaultValue` so that NaN / out-of-range numbers never reach
 * the SQL layer or the orchestrator.
 *
 * @param {string} name - Option name without the leading dashes (e.g. "limit").
 * @param {number} defaultValue - Value used when the option is absent or invalid.
 * @param {number} min - Smallest accepted value (inclusive).
 * @returns {number} The parsed integer, or `defaultValue`.
 */
function parseIntFlag(name, defaultValue, min) {
    const prefix = `--${name}=`;
    const raw = args.find((arg) => arg.startsWith(prefix))?.slice(prefix.length);
    if (raw === undefined || raw === '') {
        return defaultValue;
    }
    const value = Number.parseInt(raw, 10);
    if (Number.isNaN(value) || value < min) {
        console.warn(`Ignoring invalid --${name}=${raw} (expected an integer >= ${min}); using ${defaultValue}`);
        return defaultValue;
    }
    return value;
}

const flags = {
    run: args.includes('--run'),
    dryRun: args.includes('--dry-run'),
    status: args.includes('--status'),
    help: args.includes('--help') || args.includes('-h'),
    limit: parseIntFlag('limit', 0, 0),
    concurrency: parseIntFlag('concurrency', 3, 1),
    interval: parseIntFlag('interval', 240, 1),
    detectionOnly: args.includes('--detection-only'),
    productionOnly: args.includes('--production-only'),
    sandboxOnly: args.includes('--sandbox-only'),
};

/**
 * Prints the CLI usage text to stdout.
 *
 * @returns {Promise<void>}
 */
async function showHelp() {
    console.log(`
Bootstrap Discovery - Initialize Dispensary Crawl System

USAGE:
  npx tsx src/scripts/bootstrap-discovery.ts [OPTIONS]

OPTIONS:
  --run               After creating schedules, run the orchestrator for each dispensary
  --dry-run           Show what would happen without making changes
  --status            Show current status and exit
  --limit=N           Limit how many dispensaries to process (0 = all, default: 0)
  --concurrency=N     How many dispensaries to process in parallel (default: 3)
  --interval=M        Default interval in minutes for new schedules (default: 240 = 4 hours)
  --detection-only    Only process dispensaries that still need provider detection
                      (no provider, 'unknown' provider, or confidence below 50)
  --production-only   Only run dispensaries in production mode
  --sandbox-only      Only run dispensaries in sandbox mode
  --help, -h          Show this help message

EXAMPLES:
  # Create schedule entries for all dispensaries (no crawling)
  npx tsx src/scripts/bootstrap-discovery.ts

  # Create schedules and run orchestrator for all dispensaries
  npx tsx src/scripts/bootstrap-discovery.ts --run

  # Run orchestrator for first 10 dispensaries
  npx tsx src/scripts/bootstrap-discovery.ts --run --limit=10

  # Run with higher concurrency
  npx tsx src/scripts/bootstrap-discovery.ts --run --concurrency=5

  # Show current status
  npx tsx src/scripts/bootstrap-discovery.ts --status

WHAT IT DOES:
  1. Creates dispensary_crawl_schedule entries for all dispensaries that don't have one
  2. If --run: For each dispensary, runs the orchestrator which:
     a. Checks if provider detection is needed (null/unknown/stale/low confidence)
     b. Runs detection if needed
     c. If Dutchie + production mode: runs production crawl
     d. Otherwise: runs sandbox crawl
  3. Updates schedule status and job records
`);
}

/**
 * Prints a read-only status report built from three queries: provider
 * distribution (dispensaries), schedule coverage (dispensaries LEFT JOIN
 * dispensary_crawl_schedule) and job counts for the last 24 hours
 * (dispensary_crawl_jobs).
 *
 * @returns {Promise<void>}
 */
async function showStatus() {
    console.log('\n📊 Current Dispensary Crawl Status\n');
    console.log('═'.repeat(70));

    // Get dispensary counts by provider
    const providerStats = await migrate_1.pool.query(`
    SELECT
      COALESCE(product_provider, 'undetected') as provider,
      COUNT(*) as count,
      COUNT(*) FILTER (WHERE product_crawler_mode = 'production') as production,
      COUNT(*) FILTER (WHERE product_crawler_mode = 'sandbox') as sandbox,
      COUNT(*) FILTER (WHERE product_crawler_mode IS NULL) as no_mode
    FROM dispensaries
    GROUP BY COALESCE(product_provider, 'undetected')
    ORDER BY count DESC
  `);
    console.log('\nProvider Distribution:');
    console.log('-'.repeat(60));
    console.log('Provider'.padEnd(20) +
        'Total'.padStart(8) +
        'Production'.padStart(12) +
        'Sandbox'.padStart(10) +
        'No Mode'.padStart(10));
    console.log('-'.repeat(60));
    for (const row of providerStats.rows) {
        console.log(row.provider.padEnd(20) +
            row.count.toString().padStart(8) +
            row.production.toString().padStart(12) +
            row.sandbox.toString().padStart(10) +
            row.no_mode.toString().padStart(10));
    }

    // Get schedule stats
    const scheduleStats = await migrate_1.pool.query(`
    SELECT
      COUNT(DISTINCT d.id) as total_dispensaries,
      COUNT(DISTINCT dcs.id) as with_schedule,
      COUNT(DISTINCT d.id) - COUNT(DISTINCT dcs.id) as without_schedule,
      COUNT(*) FILTER (WHERE dcs.is_active = TRUE) as active_schedules,
      COUNT(*) FILTER (WHERE dcs.last_status = 'success') as last_success,
      COUNT(*) FILTER (WHERE dcs.last_status = 'error') as last_error,
      COUNT(*) FILTER (WHERE dcs.last_status = 'sandbox_only') as last_sandbox,
      COUNT(*) FILTER (WHERE dcs.last_status = 'detection_only') as last_detection,
      COUNT(*) FILTER (WHERE dcs.next_run_at <= NOW()) as due_now,
      AVG(dcs.interval_minutes)::INTEGER as avg_interval
    FROM dispensaries d
    LEFT JOIN dispensary_crawl_schedule dcs ON dcs.dispensary_id = d.id
  `);
    const s = scheduleStats.rows[0];
    console.log('\n\nSchedule Status:');
    console.log('-'.repeat(60));
    console.log(` Total Dispensaries: ${s.total_dispensaries}`);
    console.log(` With Schedule: ${s.with_schedule}`);
    console.log(` Without Schedule: ${s.without_schedule}`);
    console.log(` Active Schedules: ${s.active_schedules || 0}`);
    // AVG() is NULL when no schedule rows exist; show the 240-minute default then.
    console.log(` Average Interval: ${s.avg_interval || 240} minutes`);
    console.log('\n Last Run Status:');
    console.log(` - Success: ${s.last_success || 0}`);
    console.log(` - Error: ${s.last_error || 0}`);
    console.log(` - Sandbox Only: ${s.last_sandbox || 0}`);
    console.log(` - Detection Only: ${s.last_detection || 0}`);
    console.log(` - Due Now: ${s.due_now || 0}`);

    // Get recent job stats
    const jobStats = await migrate_1.pool.query(`
    SELECT
      COUNT(*) as total,
      COUNT(*) FILTER (WHERE status = 'completed') as completed,
      COUNT(*) FILTER (WHERE status = 'failed') as failed,
      COUNT(*) FILTER (WHERE status = 'running') as running,
      COUNT(*) FILTER (WHERE status = 'pending') as pending,
      COUNT(*) FILTER (WHERE detection_ran = TRUE) as with_detection,
      COUNT(*) FILTER (WHERE crawl_ran = TRUE) as with_crawl,
      COUNT(*) FILTER (WHERE crawl_type = 'production') as production_crawls,
      COUNT(*) FILTER (WHERE crawl_type = 'sandbox') as sandbox_crawls,
      SUM(products_found) as total_products_found
    FROM dispensary_crawl_jobs
    WHERE created_at > NOW() - INTERVAL '24 hours'
  `);
    const j = jobStats.rows[0];
    console.log('\n\nJobs (Last 24 Hours):');
    console.log('-'.repeat(60));
    console.log(` Total Jobs: ${j.total || 0}`);
    console.log(` Completed: ${j.completed || 0}`);
    console.log(` Failed: ${j.failed || 0}`);
    console.log(` Running: ${j.running || 0}`);
    console.log(` Pending: ${j.pending || 0}`);
    console.log(` With Detection: ${j.with_detection || 0}`);
    console.log(` With Crawl: ${j.with_crawl || 0}`);
    console.log(` - Production: ${j.production_crawls || 0}`);
    console.log(` - Sandbox: ${j.sandbox_crawls || 0}`);
    console.log(` Products Found: ${j.total_products_found || 0}`);
    console.log('\n' + '═'.repeat(70) + '\n');
}

/**
 * Ensures every dispensary has a crawl schedule entry.
 *
 * In dry-run mode nothing is written: the function only counts the
 * dispensaries that have no dispensary_crawl_schedule row. Otherwise it
 * delegates to ensureAllDispensariesHaveSchedules with the configured interval.
 *
 * @returns {Promise<{created: number, existing: number}>} In dry-run mode
 *   `created` is the number that would be created and `existing` is always 0;
 *   otherwise the result of ensureAllDispensariesHaveSchedules is returned as-is.
 */
async function createSchedules() {
    console.log('\n📅 Creating Dispensary Schedules...\n');
    if (flags.dryRun) {
        // Count how many would be created
        const result = await migrate_1.pool.query(`
      SELECT COUNT(*) as count
      FROM dispensaries d
      WHERE NOT EXISTS (
        SELECT 1 FROM dispensary_crawl_schedule dcs WHERE dcs.dispensary_id = d.id
      )
    `);
        const wouldCreate = Number.parseInt(result.rows[0].count, 10);
        console.log(` Would create ${wouldCreate} new schedule entries (${flags.interval} minute interval)`);
        return { created: wouldCreate, existing: 0 };
    }
    const result = await (0, dispensary_orchestrator_1.ensureAllDispensariesHaveSchedules)(flags.interval);
    console.log(` ✓ Created ${result.created} new schedule entries`);
    console.log(` ✓ ${result.existing} dispensaries already had schedules`);
    return result;
}

/**
 * Selects the ids of the dispensaries to hand to the orchestrator.
 *
 * Filters come from the CLI flags: --production-only takes precedence over
 * --sandbox-only when both are given, and --detection-only restricts the set
 * to rows with no provider, an 'unknown' provider, or confidence below 50.
 * Rows are ordered by schedule priority (highest first), then by last run
 * time (never-run first), then by id. --limit, when positive, caps the count.
 *
 * @returns {Promise<Array>} Dispensary ids in processing order.
 */
async function getDispensariesToProcess() {
    // Build query based on filters. Only fixed SQL fragments are concatenated
    // here; the one user-supplied value (the limit) is sent as a bound parameter.
    let whereClause = 'TRUE';
    if (flags.productionOnly) {
        whereClause += ` AND d.product_crawler_mode = 'production'`;
    }
    else if (flags.sandboxOnly) {
        whereClause += ` AND d.product_crawler_mode = 'sandbox'`;
    }
    if (flags.detectionOnly) {
        whereClause += ` AND (d.product_provider IS NULL OR d.product_provider = 'unknown' OR d.product_confidence < 50)`;
    }
    const params = [];
    let limitClause = '';
    if (flags.limit > 0) {
        params.push(flags.limit);
        limitClause = `LIMIT $${params.length}`;
    }
    const query = `
    SELECT d.id, d.name, d.product_provider, d.product_crawler_mode
    FROM dispensaries d
    LEFT JOIN dispensary_crawl_schedule dcs ON dcs.dispensary_id = d.id
    WHERE ${whereClause}
    ORDER BY
      COALESCE(dcs.priority, 0) DESC,
      dcs.last_run_at ASC NULLS FIRST,
      d.id ASC
    ${limitClause}
  `;
    const result = await migrate_1.pool.query(query, params);
    return result.rows.map((row) => row.id);
}

/**
 * Runs the batch orchestrator over the selected dispensaries and prints a
 * summary of the outcomes, followed by up to ten error details.
 *
 * In dry-run mode it only lists (up to 20 of) the dispensaries that would be
 * processed and returns without invoking the orchestrator.
 *
 * @returns {Promise<void>}
 */
async function runOrchestrator() {
    console.log('\n🚀 Running Dispensary Orchestrator...\n');
    const dispensaryIds = await getDispensariesToProcess();
    if (dispensaryIds.length === 0) {
        console.log(' No dispensaries to process.');
        return;
    }
    console.log(` Found ${dispensaryIds.length} dispensaries to process`);
    console.log(` Concurrency: ${flags.concurrency}`);

    if (flags.dryRun) {
        console.log('\n Would process these dispensaries:');
        const details = await migrate_1.pool.query(`SELECT id, name, product_provider, product_crawler_mode
       FROM dispensaries WHERE id = ANY($1) ORDER BY id`, [dispensaryIds]);
        for (const row of details.rows.slice(0, 20)) {
            console.log(` - [${row.id}] ${row.name} (${row.product_provider || 'undetected'}, ${row.product_crawler_mode || 'no mode'})`);
        }
        if (details.rows.length > 20) {
            console.log(` ... and ${details.rows.length - 20} more`);
        }
        return;
    }

    console.log('\n Starting batch processing...\n');
    const results = await (0, dispensary_orchestrator_1.runBatchDispensaryOrchestrator)(dispensaryIds, flags.concurrency);

    // Summarize results
    const countWhere = (predicate) => results.filter(predicate).length;
    const summary = {
        total: results.length,
        success: countWhere((r) => r.status === 'success'),
        sandboxOnly: countWhere((r) => r.status === 'sandbox_only'),
        detectionOnly: countWhere((r) => r.status === 'detection_only'),
        error: countWhere((r) => r.status === 'error'),
        detectionsRan: countWhere((r) => r.detectionRan),
        crawlsRan: countWhere((r) => r.crawlRan),
        productionCrawls: countWhere((r) => r.crawlType === 'production'),
        sandboxCrawls: countWhere((r) => r.crawlType === 'sandbox'),
        totalProducts: results.reduce((sum, r) => sum + (r.productsFound || 0), 0),
        // A result without durationMs must not turn the whole total into NaN.
        totalDuration: results.reduce((sum, r) => sum + (r.durationMs || 0), 0),
    };
    // Guard the average against an empty result set (would print NaN).
    const avgDurationMs = summary.total > 0 ? summary.totalDuration / summary.total : 0;

    console.log('\n' + '═'.repeat(70));
    console.log(' Orchestrator Results');
    console.log('═'.repeat(70));
    console.log(`
  Total Processed: ${summary.total}

  Status:
    - Success: ${summary.success}
    - Sandbox Only: ${summary.sandboxOnly}
    - Detection Only: ${summary.detectionOnly}
    - Error: ${summary.error}

  Operations:
    - Detections Ran: ${summary.detectionsRan}
    - Crawls Ran: ${summary.crawlsRan}
      - Production: ${summary.productionCrawls}
      - Sandbox: ${summary.sandboxCrawls}

  Results:
    - Products Found: ${summary.totalProducts}
    - Total Duration: ${(summary.totalDuration / 1000).toFixed(1)}s
    - Avg per Dispensary: ${(avgDurationMs / 1000).toFixed(1)}s
`);
    console.log('═'.repeat(70) + '\n');

    // Show errors if any
    const errors = results.filter((r) => r.status === 'error');
    if (errors.length > 0) {
        console.log('\n⚠️ Errors encountered:');
        for (const err of errors.slice(0, 10)) {
            console.log(` - [${err.dispensaryId}] ${err.dispensaryName}: ${err.error}`);
        }
        if (errors.length > 10) {
            console.log(` ... and ${errors.length - 10} more errors`);
        }
    }
}

/**
 * CLI entry point.
 *
 * Flow: --help prints usage and exits 0. Otherwise the current status is
 * shown; with --status the script stops there. Without it, schedules are
 * created (or previewed under --dry-run), the orchestrator is run when --run
 * is given, and the status is shown again unless this is a dry run.
 *
 * The database pool is closed exactly once on every path before the process
 * exits. Any failure (including a failure to close the pool) is reported on
 * stderr and results in exit code 1.
 *
 * @returns {Promise<void>}
 */
async function main() {
    if (flags.help) {
        await showHelp();
        process.exit(0);
    }
    console.log('\n' + '═'.repeat(70));
    console.log(' Dispensary Crawl Bootstrap Discovery');
    console.log('═'.repeat(70));
    if (flags.dryRun) {
        console.log('\n🔍 DRY RUN MODE - No changes will be made');
    }

    let failed = false;
    try {
        // Always show status first
        await showStatus();
        // In status-only mode we are done after the first report.
        if (!flags.status) {
            // Step 1: Create schedule entries
            await createSchedules();
            // Step 2: Optionally run orchestrator
            if (flags.run) {
                await runOrchestrator();
            }
            else {
                console.log('\n💡 Tip: Use --run to also run the orchestrator for each dispensary');
            }
            // Show final status
            if (!flags.dryRun) {
                await showStatus();
            }
        }
    }
    catch (error) {
        failed = true;
        // Tolerate non-Error throwables instead of printing "undefined".
        console.error('\n❌ Fatal error:', error?.message ?? String(error));
        if (error?.stack) {
            console.error(error.stack);
        }
    }
    finally {
        // Exiting is deferred until after this block so the pool is always
        // released; calling process.exit() inside try/catch would skip it.
        try {
            await migrate_1.pool.end();
        }
        catch (closeError) {
            failed = true;
            console.error('Failed to close database pool:', closeError?.message ?? String(closeError));
        }
    }

    if (failed) {
        process.exit(1);
    }
    if (flags.status) {
        process.exit(0);
    }
}

main().catch((error) => {
    // main() handles its own failures; this only guards against the unexpected.
    console.error('\n❌ Fatal error:', error?.message ?? String(error));
    process.exit(1);
});