The job_run_logs table tracks scheduled job orchestration, not individual worker jobs. Worker info (worker_id, worker_hostname) belongs on dispensary_crawl_jobs, not job_run_logs. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
333 lines
14 KiB
JavaScript
333 lines
14 KiB
JavaScript
#!/usr/bin/env npx tsx
|
||
"use strict";
|
||
/**
 * Bootstrap Discovery Script
 *
 * One-time (but reusable) bootstrap command that:
 * 1. Ensures every Dispensary has a dispensary_crawl_schedule entry (4h default)
 * 2. Optionally runs RunDispensaryOrchestrator for each dispensary
 *
 * Usage:
 * npx tsx src/scripts/bootstrap-discovery.ts # Create schedules only
 * npx tsx src/scripts/bootstrap-discovery.ts --run # Create schedules + run orchestrator
 * npx tsx src/scripts/bootstrap-discovery.ts --run --limit=10 # Run for first 10 dispensaries
 * npx tsx src/scripts/bootstrap-discovery.ts --dry-run # Preview what would happen
 * npx tsx src/scripts/bootstrap-discovery.ts --status # Show current status only
 */
|
||
Object.defineProperty(exports, "__esModule", { value: true });
|
||
const migrate_1 = require("../db/migrate");
|
||
const dispensary_orchestrator_1 = require("../services/dispensary-orchestrator");
|
||
/**
 * Reads an integer-valued `--name=N` option from an argument list.
 *
 * The value is parsed in base 10. The fallback is returned when the option is
 * missing, has an empty value, does not start with a number, or is below `min`,
 * so callers never receive NaN.
 *
 * @param {string[]} argv - Command line arguments (without the node/script entries).
 * @param {string} name - Option name without the leading dashes, e.g. 'limit'.
 * @param {number} fallback - Value to use when no acceptable number was supplied.
 * @param {number} [min=0] - Smallest value that is accepted as-is.
 * @returns {number} The parsed integer, or `fallback`.
 */
function parseIntFlag(argv, name, fallback, min = 0) {
    const prefix = `--${name}=`;
    const raw = argv.find(a => a.startsWith(prefix))?.split('=')[1];
    const parsed = Number.parseInt(raw || '', 10);
    return Number.isInteger(parsed) && parsed >= min ? parsed : fallback;
}
// Parse command line args
const args = process.argv.slice(2);
const flags = {
    run: args.includes('--run'),
    dryRun: args.includes('--dry-run'),
    status: args.includes('--status'),
    help: args.includes('--help') || args.includes('-h'),
    // 0 means "no limit"; negative or malformed values are treated the same way.
    limit: parseIntFlag(args, 'limit', 0),
    // Concurrency and interval must be at least 1 to be meaningful.
    concurrency: parseIntFlag(args, 'concurrency', 3, 1),
    interval: parseIntFlag(args, 'interval', 240, 1),
    detectionOnly: args.includes('--detection-only'),
    productionOnly: args.includes('--production-only'),
    sandboxOnly: args.includes('--sandbox-only'),
};
|
||
/**
 * Prints the command line help text (usage, options, examples and a summary
 * of what the script does) to stdout in a single console.log call.
 *
 * @returns {Promise<void>}
 */
async function showHelp() {
    const script = 'npx tsx src/scripts/bootstrap-discovery.ts';
    // The leading and trailing empty entries reproduce the blank line before
    // and after the help text.
    const helpLines = [
        '',
        'Bootstrap Discovery - Initialize Dispensary Crawl System',
        '',
        'USAGE:',
        `${script} [OPTIONS]`,
        '',
        'OPTIONS:',
        '--run After creating schedules, run the orchestrator for each dispensary',
        '--dry-run Show what would happen without making changes',
        '--status Show current status and exit',
        '--limit=N Limit how many dispensaries to process (0 = all, default: 0)',
        '--concurrency=N How many dispensaries to process in parallel (default: 3)',
        '--interval=M Default interval in minutes for new schedules (default: 240 = 4 hours)',
        "--detection-only Only run detection, don't crawl",
        '--production-only Only run dispensaries in production mode',
        '--sandbox-only Only run dispensaries in sandbox mode',
        '--help, -h Show this help message',
        '',
        'EXAMPLES:',
        '# Create schedule entries for all dispensaries (no crawling)',
        script,
        '',
        '# Create schedules and run orchestrator for all dispensaries',
        `${script} --run`,
        '',
        '# Run orchestrator for first 10 dispensaries',
        `${script} --run --limit=10`,
        '',
        '# Run with higher concurrency',
        `${script} --run --concurrency=5`,
        '',
        '# Show current status',
        `${script} --status`,
        '',
        'WHAT IT DOES:',
        "1. Creates dispensary_crawl_schedule entries for all dispensaries that don't have one",
        '2. If --run: For each dispensary, runs the orchestrator which:',
        'a. Checks if provider detection is needed (null/unknown/stale/low confidence)',
        'b. Runs detection if needed',
        'c. If Dutchie + production mode: runs production crawl',
        'd. Otherwise: runs sandbox crawl',
        '3. Updates schedule status and job records',
        '',
    ];
    console.log(helpLines.join('\n'));
}
|
||
/**
 * Prints a status report to stdout in three sections, each backed by one query
 * on the shared pool:
 *   1. dispensaries grouped by product_provider, with per-crawler-mode counts;
 *   2. schedule coverage and last-run status from dispensary_crawl_schedule;
 *   3. dispensary_crawl_jobs created within the last 24 hours.
 *
 * @returns {Promise<void>}
 */
async function showStatus() {
    const thinRule = '-'.repeat(60);
    const thickRule = '═'.repeat(70);

    console.log('\n📊 Current Dispensary Crawl Status\n');
    console.log(thickRule);

    // Section 1: dispensary counts by provider
    const { rows: providerRows } = await migrate_1.pool.query(`
        SELECT
            COALESCE(product_provider, 'undetected') as provider,
            COUNT(*) as count,
            COUNT(*) FILTER (WHERE product_crawler_mode = 'production') as production,
            COUNT(*) FILTER (WHERE product_crawler_mode = 'sandbox') as sandbox,
            COUNT(*) FILTER (WHERE product_crawler_mode IS NULL) as no_mode
        FROM dispensaries
        GROUP BY COALESCE(product_provider, 'undetected')
        ORDER BY count DESC
    `);
    console.log('\nProvider Distribution:');
    console.log(thinRule);
    console.log([
        'Provider'.padEnd(20),
        'Total'.padStart(8),
        'Production'.padStart(12),
        'Sandbox'.padStart(10),
        'No Mode'.padStart(10),
    ].join(''));
    console.log(thinRule);
    for (const entry of providerRows) {
        console.log([
            entry.provider.padEnd(20),
            entry.count.toString().padStart(8),
            entry.production.toString().padStart(12),
            entry.sandbox.toString().padStart(10),
            entry.no_mode.toString().padStart(10),
        ].join(''));
    }

    // Section 2: schedule stats (the LEFT JOIN keeps dispensaries without a schedule)
    const { rows: [sched] } = await migrate_1.pool.query(`
        SELECT
            COUNT(DISTINCT d.id) as total_dispensaries,
            COUNT(DISTINCT dcs.id) as with_schedule,
            COUNT(DISTINCT d.id) - COUNT(DISTINCT dcs.id) as without_schedule,
            COUNT(*) FILTER (WHERE dcs.is_active = TRUE) as active_schedules,
            COUNT(*) FILTER (WHERE dcs.last_status = 'success') as last_success,
            COUNT(*) FILTER (WHERE dcs.last_status = 'error') as last_error,
            COUNT(*) FILTER (WHERE dcs.last_status = 'sandbox_only') as last_sandbox,
            COUNT(*) FILTER (WHERE dcs.last_status = 'detection_only') as last_detection,
            COUNT(*) FILTER (WHERE dcs.next_run_at <= NOW()) as due_now,
            AVG(dcs.interval_minutes)::INTEGER as avg_interval
        FROM dispensaries d
        LEFT JOIN dispensary_crawl_schedule dcs ON dcs.dispensary_id = d.id
    `);
    const scheduleLines = [
        '\n\nSchedule Status:',
        thinRule,
        ` Total Dispensaries: ${sched.total_dispensaries}`,
        ` With Schedule: ${sched.with_schedule}`,
        ` Without Schedule: ${sched.without_schedule}`,
        ` Active Schedules: ${sched.active_schedules || 0}`,
        ` Average Interval: ${sched.avg_interval || 240} minutes`,
        '\n Last Run Status:',
        ` - Success: ${sched.last_success || 0}`,
        ` - Error: ${sched.last_error || 0}`,
        ` - Sandbox Only: ${sched.last_sandbox || 0}`,
        ` - Detection Only: ${sched.last_detection || 0}`,
        ` - Due Now: ${sched.due_now || 0}`,
    ];
    for (const line of scheduleLines) {
        console.log(line);
    }

    // Section 3: recent job stats
    const { rows: [jobs] } = await migrate_1.pool.query(`
        SELECT
            COUNT(*) as total,
            COUNT(*) FILTER (WHERE status = 'completed') as completed,
            COUNT(*) FILTER (WHERE status = 'failed') as failed,
            COUNT(*) FILTER (WHERE status = 'running') as running,
            COUNT(*) FILTER (WHERE status = 'pending') as pending,
            COUNT(*) FILTER (WHERE detection_ran = TRUE) as with_detection,
            COUNT(*) FILTER (WHERE crawl_ran = TRUE) as with_crawl,
            COUNT(*) FILTER (WHERE crawl_type = 'production') as production_crawls,
            COUNT(*) FILTER (WHERE crawl_type = 'sandbox') as sandbox_crawls,
            SUM(products_found) as total_products_found
        FROM dispensary_crawl_jobs
        WHERE created_at > NOW() - INTERVAL '24 hours'
    `);
    const jobLines = [
        '\n\nJobs (Last 24 Hours):',
        thinRule,
        ` Total Jobs: ${jobs.total || 0}`,
        ` Completed: ${jobs.completed || 0}`,
        ` Failed: ${jobs.failed || 0}`,
        ` Running: ${jobs.running || 0}`,
        ` Pending: ${jobs.pending || 0}`,
        ` With Detection: ${jobs.with_detection || 0}`,
        ` With Crawl: ${jobs.with_crawl || 0}`,
        ` - Production: ${jobs.production_crawls || 0}`,
        ` - Sandbox: ${jobs.sandbox_crawls || 0}`,
        ` Products Found: ${jobs.total_products_found || 0}`,
        `\n${thickRule}\n`,
    ];
    for (const line of jobLines) {
        console.log(line);
    }
}
|
||
/**
 * Step 1 of the bootstrap: make sure dispensaries have schedule entries.
 *
 * Normal mode delegates to ensureAllDispensariesHaveSchedules(flags.interval),
 * logs the created/existing counts it reports, and returns that result.
 * Dry-run mode only counts the dispensaries that have no row in
 * dispensary_crawl_schedule and reports that number as `created`
 * (with `existing` fixed at 0); nothing is written.
 *
 * @returns {Promise<{created: number, existing: number}>}
 */
async function createSchedules() {
    console.log('\n📅 Creating Dispensary Schedules...\n');

    if (!flags.dryRun) {
        const { ensureAllDispensariesHaveSchedules } = dispensary_orchestrator_1;
        const outcome = await ensureAllDispensariesHaveSchedules(flags.interval);
        console.log(` ✓ Created ${outcome.created} new schedule entries`);
        console.log(` ✓ ${outcome.existing} dispensaries already had schedules`);
        return outcome;
    }

    // Dry run: count how many entries would be created
    const { rows } = await migrate_1.pool.query(`
        SELECT COUNT(*) as count
        FROM dispensaries d
        WHERE NOT EXISTS (
            SELECT 1 FROM dispensary_crawl_schedule dcs WHERE dcs.dispensary_id = d.id
        )
    `);
    const wouldCreate = parseInt(rows[0].count);
    console.log(` Would create ${wouldCreate} new schedule entries (${flags.interval} minute interval)`);
    return { created: wouldCreate, existing: 0 };
}
|
||
/**
 * Selects the ids of the dispensaries the orchestrator should process.
 *
 * Filters come from the CLI flags:
 *   - productionOnly / sandboxOnly restrict by product_crawler_mode
 *     (productionOnly wins when both are set);
 *   - detectionOnly keeps rows whose provider is NULL or 'unknown', or whose
 *     product_confidence is below 50.
 * Rows are ordered by schedule priority (highest first), then by least
 * recently run (never-run first), then by id. A positive integer flags.limit
 * caps the number of rows; it is sent as a bound parameter rather than being
 * spliced into the SQL text.
 *
 * @returns {Promise<Array>} The `id` column of each selected row, in processing order.
 */
async function getDispensariesToProcess() {
    // Build the WHERE clause from fixed fragments only; no user text is interpolated.
    const conditions = ['TRUE'];
    if (flags.productionOnly) {
        conditions.push(`d.product_crawler_mode = 'production'`);
    }
    else if (flags.sandboxOnly) {
        conditions.push(`d.product_crawler_mode = 'sandbox'`);
    }
    if (flags.detectionOnly) {
        conditions.push(`(d.product_provider IS NULL OR d.product_provider = 'unknown' OR d.product_confidence < 50)`);
    }

    const params = [];
    let limitClause = '';
    if (Number.isInteger(flags.limit) && flags.limit > 0) {
        params.push(flags.limit);
        limitClause = `LIMIT $${params.length}`;
    }

    const query = `
        SELECT d.id, d.name, d.product_provider, d.product_crawler_mode
        FROM dispensaries d
        LEFT JOIN dispensary_crawl_schedule dcs ON dcs.dispensary_id = d.id
        WHERE ${conditions.join(' AND ')}
        ORDER BY
            COALESCE(dcs.priority, 0) DESC,
            dcs.last_run_at ASC NULLS FIRST,
            d.id ASC
        ${limitClause}
    `;
    const result = await migrate_1.pool.query(query, params);
    return result.rows.map(row => row.id);
}
|
||
/**
 * Step 2 of the bootstrap: run the dispensary orchestrator over the selected
 * dispensaries and print a summary.
 *
 * - Returns early when getDispensariesToProcess() yields no ids.
 * - In dry-run mode, lists up to 20 of the selected dispensaries and returns
 *   without calling the orchestrator.
 * - Otherwise calls runBatchDispensaryOrchestrator(ids, flags.concurrency),
 *   then prints status counts, operation counts, product and duration totals,
 *   and up to 10 error entries.
 *
 * Missing `durationMs` values count as 0 and the average is reported as 0 when
 * there are no results, so the summary never prints NaN.
 *
 * @returns {Promise<void>}
 */
async function runOrchestrator() {
    console.log('\n🚀 Running Dispensary Orchestrator...\n');
    const dispensaryIds = await getDispensariesToProcess();
    if (dispensaryIds.length === 0) {
        console.log(' No dispensaries to process.');
        return;
    }
    console.log(` Found ${dispensaryIds.length} dispensaries to process`);
    console.log(` Concurrency: ${flags.concurrency}`);

    if (flags.dryRun) {
        console.log('\n Would process these dispensaries:');
        const details = await migrate_1.pool.query(`SELECT id, name, product_provider, product_crawler_mode
            FROM dispensaries WHERE id = ANY($1) ORDER BY id`, [dispensaryIds]);
        for (const row of details.rows.slice(0, 20)) {
            console.log(` - [${row.id}] ${row.name} (${row.product_provider || 'undetected'}, ${row.product_crawler_mode || 'no mode'})`);
        }
        if (details.rows.length > 20) {
            console.log(` ... and ${details.rows.length - 20} more`);
        }
        return;
    }

    console.log('\n Starting batch processing...\n');
    const results = await (0, dispensary_orchestrator_1.runBatchDispensaryOrchestrator)(dispensaryIds, flags.concurrency);

    // Summarize results
    const countWhere = (predicate) => results.filter(predicate).length;
    const errors = results.filter(r => r.status === 'error');
    const summary = {
        total: results.length,
        success: countWhere(r => r.status === 'success'),
        sandboxOnly: countWhere(r => r.status === 'sandbox_only'),
        detectionOnly: countWhere(r => r.status === 'detection_only'),
        error: errors.length,
        detectionsRan: countWhere(r => r.detectionRan),
        crawlsRan: countWhere(r => r.crawlRan),
        productionCrawls: countWhere(r => r.crawlType === 'production'),
        sandboxCrawls: countWhere(r => r.crawlType === 'sandbox'),
        totalProducts: results.reduce((sum, r) => sum + (r.productsFound || 0), 0),
        // Guarded like productsFound: a result without durationMs must not poison the sum.
        totalDuration: results.reduce((sum, r) => sum + (r.durationMs || 0), 0),
    };
    // Avoid 0/0 when the orchestrator returns no results.
    const avgDurationMs = summary.total > 0 ? summary.totalDuration / summary.total : 0;

    console.log('\n' + '═'.repeat(70));
    console.log(' Orchestrator Results');
    console.log('═'.repeat(70));
    console.log([
        '',
        `Total Processed: ${summary.total}`,
        '',
        'Status:',
        `- Success: ${summary.success}`,
        `- Sandbox Only: ${summary.sandboxOnly}`,
        `- Detection Only: ${summary.detectionOnly}`,
        `- Error: ${summary.error}`,
        '',
        'Operations:',
        `- Detections Ran: ${summary.detectionsRan}`,
        `- Crawls Ran: ${summary.crawlsRan}`,
        `- Production: ${summary.productionCrawls}`,
        `- Sandbox: ${summary.sandboxCrawls}`,
        '',
        'Results:',
        `- Products Found: ${summary.totalProducts}`,
        `- Total Duration: ${(summary.totalDuration / 1000).toFixed(1)}s`,
        `- Avg per Dispensary: ${(avgDurationMs / 1000).toFixed(1)}s`,
        '',
    ].join('\n'));
    console.log('═'.repeat(70) + '\n');

    // Show errors if any (capped at 10 to keep the output readable)
    if (errors.length > 0) {
        console.log('\n⚠️ Errors encountered:');
        for (const err of errors.slice(0, 10)) {
            console.log(` - [${err.dispensaryId}] ${err.dispensaryName}: ${err.error}`);
        }
        if (errors.length > 10) {
            console.log(` ... and ${errors.length - 10} more errors`);
        }
    }
}
|
||
/**
 * Script entry point.
 *
 * Flow: --help prints usage and exits. Otherwise prints the banner, shows the
 * current status, and (unless --status) creates schedules, optionally runs the
 * orchestrator (--run), and shows the status again when not in dry-run mode.
 *
 * The database pool is closed exactly once, in `finally`, on every path.
 * The process is only terminated after that cleanup: exit code 0 for --status,
 * 1 after a fatal error. A normal run does not force an exit.
 *
 * @returns {Promise<void>}
 */
async function main() {
    if (flags.help) {
        await showHelp();
        process.exit(0);
    }
    console.log('\n' + '═'.repeat(70));
    console.log(' Dispensary Crawl Bootstrap Discovery');
    console.log('═'.repeat(70));
    if (flags.dryRun) {
        console.log('\n🔍 DRY RUN MODE - No changes will be made');
    }

    // null means "do not force an exit"; a number is passed to process.exit after cleanup.
    let exitCode = null;
    try {
        // Always show status first
        await showStatus();
        if (flags.status) {
            // Status-only mode, we're done
            exitCode = 0;
        }
        else {
            // Step 1: Create schedule entries
            await createSchedules();
            // Step 2: Optionally run orchestrator
            if (flags.run) {
                await runOrchestrator();
            }
            else {
                console.log('\n💡 Tip: Use --run to also run the orchestrator for each dispensary');
            }
            // Show final status
            if (!flags.dryRun) {
                await showStatus();
            }
        }
    }
    catch (error) {
        // Tolerate non-Error throws (strings, null) instead of failing inside the handler.
        console.error('\n❌ Fatal error:', error?.message ?? error);
        if (error?.stack) {
            console.error(error.stack);
        }
        exitCode = 1;
    }
    finally {
        try {
            await migrate_1.pool.end();
        }
        catch (closeError) {
            console.error('\n❌ Failed to close database pool:', closeError?.message ?? closeError);
            exitCode = exitCode || 1;
        }
    }
    // Exiting here, not inside try/catch, guarantees the pool was closed first.
    if (exitCode !== null) {
        process.exit(exitCode);
    }
}
|
||
// Run the script. main() handles its own errors inside its try block, but a
// rejection raised outside it (e.g. while printing help or closing the pool)
// would otherwise be an unhandled promise rejection.
main().catch((error) => {
    console.error('\n❌ Fatal error:', error);
    process.exit(1);
});
|