Files
cannaiq/backend/dist/scripts/bootstrap-discovery.js
Kelly 66e07b2009 fix(monitor): remove non-existent worker columns from job_run_logs query
The job_run_logs table tracks scheduled job orchestration, not individual
worker jobs. Worker info (worker_id, worker_hostname) belongs on
dispensary_crawl_jobs, not job_run_logs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-03 18:45:05 -07:00

333 lines
14 KiB
JavaScript
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env npx tsx
"use strict";
/**
* Bootstrap Discovery Script
*
* One-time (but reusable) bootstrap command that:
* 1. Ensures every Dispensary has a dispensary_crawl_schedule entry (4h default)
* 2. Optionally runs RunDispensaryOrchestrator for each dispensary
*
* Usage:
* npx tsx src/scripts/bootstrap-discovery.ts # Create schedules only
* npx tsx src/scripts/bootstrap-discovery.ts --run # Create schedules + run orchestrator
* npx tsx src/scripts/bootstrap-discovery.ts --run --limit=10 # Run for first 10 dispensaries
* npx tsx src/scripts/bootstrap-discovery.ts --dry-run # Preview what would happen
* npx tsx src/scripts/bootstrap-discovery.ts --status # Show current status only
*/
Object.defineProperty(exports, "__esModule", { value: true });
const migrate_1 = require("../db/migrate");
const dispensary_orchestrator_1 = require("../services/dispensary-orchestrator");
// Parse command line args
const args = process.argv.slice(2);
const flags = {
run: args.includes('--run'),
dryRun: args.includes('--dry-run'),
status: args.includes('--status'),
help: args.includes('--help') || args.includes('-h'),
limit: parseInt(args.find(a => a.startsWith('--limit='))?.split('=')[1] || '0'),
concurrency: parseInt(args.find(a => a.startsWith('--concurrency='))?.split('=')[1] || '3'),
interval: parseInt(args.find(a => a.startsWith('--interval='))?.split('=')[1] || '240'),
detectionOnly: args.includes('--detection-only'),
productionOnly: args.includes('--production-only'),
sandboxOnly: args.includes('--sandbox-only'),
};
async function showHelp() {
console.log(`
Bootstrap Discovery - Initialize Dispensary Crawl System
USAGE:
npx tsx src/scripts/bootstrap-discovery.ts [OPTIONS]
OPTIONS:
--run After creating schedules, run the orchestrator for each dispensary
--dry-run Show what would happen without making changes
--status Show current status and exit
--limit=N Limit how many dispensaries to process (0 = all, default: 0)
--concurrency=N How many dispensaries to process in parallel (default: 3)
--interval=M Default interval in minutes for new schedules (default: 240 = 4 hours)
--detection-only Only run detection, don't crawl
--production-only Only run dispensaries in production mode
--sandbox-only Only run dispensaries in sandbox mode
--help, -h Show this help message
EXAMPLES:
# Create schedule entries for all dispensaries (no crawling)
npx tsx src/scripts/bootstrap-discovery.ts
# Create schedules and run orchestrator for all dispensaries
npx tsx src/scripts/bootstrap-discovery.ts --run
# Run orchestrator for first 10 dispensaries
npx tsx src/scripts/bootstrap-discovery.ts --run --limit=10
# Run with higher concurrency
npx tsx src/scripts/bootstrap-discovery.ts --run --concurrency=5
# Show current status
npx tsx src/scripts/bootstrap-discovery.ts --status
WHAT IT DOES:
1. Creates dispensary_crawl_schedule entries for all dispensaries that don't have one
2. If --run: For each dispensary, runs the orchestrator which:
a. Checks if provider detection is needed (null/unknown/stale/low confidence)
b. Runs detection if needed
c. If Dutchie + production mode: runs production crawl
d. Otherwise: runs sandbox crawl
3. Updates schedule status and job records
`);
}
async function showStatus() {
console.log('\n📊 Current Dispensary Crawl Status\n');
console.log('═'.repeat(70));
// Get dispensary counts by provider
const providerStats = await migrate_1.pool.query(`
SELECT
COALESCE(product_provider, 'undetected') as provider,
COUNT(*) as count,
COUNT(*) FILTER (WHERE product_crawler_mode = 'production') as production,
COUNT(*) FILTER (WHERE product_crawler_mode = 'sandbox') as sandbox,
COUNT(*) FILTER (WHERE product_crawler_mode IS NULL) as no_mode
FROM dispensaries
GROUP BY COALESCE(product_provider, 'undetected')
ORDER BY count DESC
`);
console.log('\nProvider Distribution:');
console.log('-'.repeat(60));
console.log('Provider'.padEnd(20) +
'Total'.padStart(8) +
'Production'.padStart(12) +
'Sandbox'.padStart(10) +
'No Mode'.padStart(10));
console.log('-'.repeat(60));
for (const row of providerStats.rows) {
console.log(row.provider.padEnd(20) +
row.count.toString().padStart(8) +
row.production.toString().padStart(12) +
row.sandbox.toString().padStart(10) +
row.no_mode.toString().padStart(10));
}
// Get schedule stats
const scheduleStats = await migrate_1.pool.query(`
SELECT
COUNT(DISTINCT d.id) as total_dispensaries,
COUNT(DISTINCT dcs.id) as with_schedule,
COUNT(DISTINCT d.id) - COUNT(DISTINCT dcs.id) as without_schedule,
COUNT(*) FILTER (WHERE dcs.is_active = TRUE) as active_schedules,
COUNT(*) FILTER (WHERE dcs.last_status = 'success') as last_success,
COUNT(*) FILTER (WHERE dcs.last_status = 'error') as last_error,
COUNT(*) FILTER (WHERE dcs.last_status = 'sandbox_only') as last_sandbox,
COUNT(*) FILTER (WHERE dcs.last_status = 'detection_only') as last_detection,
COUNT(*) FILTER (WHERE dcs.next_run_at <= NOW()) as due_now,
AVG(dcs.interval_minutes)::INTEGER as avg_interval
FROM dispensaries d
LEFT JOIN dispensary_crawl_schedule dcs ON dcs.dispensary_id = d.id
`);
const s = scheduleStats.rows[0];
console.log('\n\nSchedule Status:');
console.log('-'.repeat(60));
console.log(` Total Dispensaries: ${s.total_dispensaries}`);
console.log(` With Schedule: ${s.with_schedule}`);
console.log(` Without Schedule: ${s.without_schedule}`);
console.log(` Active Schedules: ${s.active_schedules || 0}`);
console.log(` Average Interval: ${s.avg_interval || 240} minutes`);
console.log('\n Last Run Status:');
console.log(` - Success: ${s.last_success || 0}`);
console.log(` - Error: ${s.last_error || 0}`);
console.log(` - Sandbox Only: ${s.last_sandbox || 0}`);
console.log(` - Detection Only: ${s.last_detection || 0}`);
console.log(` - Due Now: ${s.due_now || 0}`);
// Get recent job stats
const jobStats = await migrate_1.pool.query(`
SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE status = 'completed') as completed,
COUNT(*) FILTER (WHERE status = 'failed') as failed,
COUNT(*) FILTER (WHERE status = 'running') as running,
COUNT(*) FILTER (WHERE status = 'pending') as pending,
COUNT(*) FILTER (WHERE detection_ran = TRUE) as with_detection,
COUNT(*) FILTER (WHERE crawl_ran = TRUE) as with_crawl,
COUNT(*) FILTER (WHERE crawl_type = 'production') as production_crawls,
COUNT(*) FILTER (WHERE crawl_type = 'sandbox') as sandbox_crawls,
SUM(products_found) as total_products_found
FROM dispensary_crawl_jobs
WHERE created_at > NOW() - INTERVAL '24 hours'
`);
const j = jobStats.rows[0];
console.log('\n\nJobs (Last 24 Hours):');
console.log('-'.repeat(60));
console.log(` Total Jobs: ${j.total || 0}`);
console.log(` Completed: ${j.completed || 0}`);
console.log(` Failed: ${j.failed || 0}`);
console.log(` Running: ${j.running || 0}`);
console.log(` Pending: ${j.pending || 0}`);
console.log(` With Detection: ${j.with_detection || 0}`);
console.log(` With Crawl: ${j.with_crawl || 0}`);
console.log(` - Production: ${j.production_crawls || 0}`);
console.log(` - Sandbox: ${j.sandbox_crawls || 0}`);
console.log(` Products Found: ${j.total_products_found || 0}`);
console.log('\n' + '═'.repeat(70) + '\n');
}
async function createSchedules() {
console.log('\n📅 Creating Dispensary Schedules...\n');
if (flags.dryRun) {
// Count how many would be created
const result = await migrate_1.pool.query(`
SELECT COUNT(*) as count
FROM dispensaries d
WHERE NOT EXISTS (
SELECT 1 FROM dispensary_crawl_schedule dcs WHERE dcs.dispensary_id = d.id
)
`);
const wouldCreate = parseInt(result.rows[0].count);
console.log(` Would create ${wouldCreate} new schedule entries (${flags.interval} minute interval)`);
return { created: wouldCreate, existing: 0 };
}
const result = await (0, dispensary_orchestrator_1.ensureAllDispensariesHaveSchedules)(flags.interval);
console.log(` ✓ Created ${result.created} new schedule entries`);
console.log(`${result.existing} dispensaries already had schedules`);
return result;
}
async function getDispensariesToProcess() {
// Build query based on filters
let whereClause = 'TRUE';
if (flags.productionOnly) {
whereClause += ` AND d.product_crawler_mode = 'production'`;
}
else if (flags.sandboxOnly) {
whereClause += ` AND d.product_crawler_mode = 'sandbox'`;
}
if (flags.detectionOnly) {
whereClause += ` AND (d.product_provider IS NULL OR d.product_provider = 'unknown' OR d.product_confidence < 50)`;
}
const limitClause = flags.limit > 0 ? `LIMIT ${flags.limit}` : '';
const query = `
SELECT d.id, d.name, d.product_provider, d.product_crawler_mode
FROM dispensaries d
LEFT JOIN dispensary_crawl_schedule dcs ON dcs.dispensary_id = d.id
WHERE ${whereClause}
ORDER BY
COALESCE(dcs.priority, 0) DESC,
dcs.last_run_at ASC NULLS FIRST,
d.id ASC
${limitClause}
`;
const result = await migrate_1.pool.query(query);
return result.rows.map(row => row.id);
}
async function runOrchestrator() {
console.log('\n🚀 Running Dispensary Orchestrator...\n');
const dispensaryIds = await getDispensariesToProcess();
if (dispensaryIds.length === 0) {
console.log(' No dispensaries to process.');
return;
}
console.log(` Found ${dispensaryIds.length} dispensaries to process`);
console.log(` Concurrency: ${flags.concurrency}`);
if (flags.dryRun) {
console.log('\n Would process these dispensaries:');
const details = await migrate_1.pool.query(`SELECT id, name, product_provider, product_crawler_mode
FROM dispensaries WHERE id = ANY($1) ORDER BY id`, [dispensaryIds]);
for (const row of details.rows.slice(0, 20)) {
console.log(` - [${row.id}] ${row.name} (${row.product_provider || 'undetected'}, ${row.product_crawler_mode || 'no mode'})`);
}
if (details.rows.length > 20) {
console.log(` ... and ${details.rows.length - 20} more`);
}
return;
}
console.log('\n Starting batch processing...\n');
const results = await (0, dispensary_orchestrator_1.runBatchDispensaryOrchestrator)(dispensaryIds, flags.concurrency);
// Summarize results
const summary = {
total: results.length,
success: results.filter(r => r.status === 'success').length,
sandboxOnly: results.filter(r => r.status === 'sandbox_only').length,
detectionOnly: results.filter(r => r.status === 'detection_only').length,
error: results.filter(r => r.status === 'error').length,
detectionsRan: results.filter(r => r.detectionRan).length,
crawlsRan: results.filter(r => r.crawlRan).length,
productionCrawls: results.filter(r => r.crawlType === 'production').length,
sandboxCrawls: results.filter(r => r.crawlType === 'sandbox').length,
totalProducts: results.reduce((sum, r) => sum + (r.productsFound || 0), 0),
totalDuration: results.reduce((sum, r) => sum + r.durationMs, 0),
};
console.log('\n' + '═'.repeat(70));
console.log(' Orchestrator Results');
console.log('═'.repeat(70));
console.log(`
Total Processed: ${summary.total}
Status:
- Success: ${summary.success}
- Sandbox Only: ${summary.sandboxOnly}
- Detection Only: ${summary.detectionOnly}
- Error: ${summary.error}
Operations:
- Detections Ran: ${summary.detectionsRan}
- Crawls Ran: ${summary.crawlsRan}
- Production: ${summary.productionCrawls}
- Sandbox: ${summary.sandboxCrawls}
Results:
- Products Found: ${summary.totalProducts}
- Total Duration: ${(summary.totalDuration / 1000).toFixed(1)}s
- Avg per Dispensary: ${(summary.totalDuration / summary.total / 1000).toFixed(1)}s
`);
console.log('═'.repeat(70) + '\n');
// Show errors if any
const errors = results.filter(r => r.status === 'error');
if (errors.length > 0) {
console.log('\n⚠ Errors encountered:');
for (const err of errors.slice(0, 10)) {
console.log(` - [${err.dispensaryId}] ${err.dispensaryName}: ${err.error}`);
}
if (errors.length > 10) {
console.log(` ... and ${errors.length - 10} more errors`);
}
}
}
async function main() {
if (flags.help) {
await showHelp();
process.exit(0);
}
console.log('\n' + '═'.repeat(70));
console.log(' Dispensary Crawl Bootstrap Discovery');
console.log('═'.repeat(70));
if (flags.dryRun) {
console.log('\n🔍 DRY RUN MODE - No changes will be made');
}
try {
// Always show status first
await showStatus();
if (flags.status) {
// Status-only mode, we're done
await migrate_1.pool.end();
process.exit(0);
}
// Step 1: Create schedule entries
await createSchedules();
// Step 2: Optionally run orchestrator
if (flags.run) {
await runOrchestrator();
}
else {
console.log('\n💡 Tip: Use --run to also run the orchestrator for each dispensary');
}
// Show final status
if (!flags.dryRun) {
await showStatus();
}
}
catch (error) {
console.error('\n❌ Fatal error:', error.message);
console.error(error.stack);
process.exit(1);
}
finally {
await migrate_1.pool.end();
}
}
main();