#!/usr/bin/env npx tsx
/**
 * Queue Dispensaries Script
 *
 * Orchestrates the multi-provider crawler system:
 * 1. Queue dispensaries that need provider detection
 * 2. Queue Dutchie dispensaries for production crawl
 * 3. Queue sandbox dispensaries for learning crawls
 *
 * Usage:
 *   npx tsx src/scripts/queue-dispensaries.ts [--detection] [--production] [--sandbox] [--all]
 *   npx tsx src/scripts/queue-dispensaries.ts --dry-run
 *   npx tsx src/scripts/queue-dispensaries.ts --process   # Process queued jobs
 *
 * Changelog (from the restoring commit, previously pasted raw above the shebang):
 * - Moved hydration module back from _deprecated (needed for product_refresh)
 * - Restored product_refresh handler for processing stored payloads
 * - Restored geolocation service for findadispo/findagram
 * - Stubbed system routes that depend on deprecated SyncOrchestrator
 * - Removed crawler-sandbox route (deprecated)
 * - Fixed all TypeScript compilation errors
 */
import { pool } from '../db/pool';
|
||
import { logger } from '../services/logger';
|
||
import {
|
||
runDetectMenuProviderJob,
|
||
runDutchieMenuCrawlJob,
|
||
runSandboxCrawlJob,
|
||
processSandboxJobs,
|
||
} from '../services/crawler-jobs';
|
||
|
||
// Parse command line args
|
||
const args = process.argv.slice(2);
|
||
const flags = {
|
||
detection: args.includes('--detection') || args.includes('--all'),
|
||
production: args.includes('--production') || args.includes('--all'),
|
||
sandbox: args.includes('--sandbox') || args.includes('--all'),
|
||
dryRun: args.includes('--dry-run'),
|
||
process: args.includes('--process'),
|
||
help: args.includes('--help') || args.includes('-h'),
|
||
limit: parseInt(args.find(a => a.startsWith('--limit='))?.split('=')[1] || '10'),
|
||
};
|
||
|
||
// If no specific flags, default to all
|
||
if (!flags.detection && !flags.production && !flags.sandbox && !flags.process) {
|
||
flags.detection = true;
|
||
flags.production = true;
|
||
flags.sandbox = true;
|
||
}
|
||
|
||
async function showHelp() {
|
||
console.log(`
|
||
Queue Dispensaries - Multi-Provider Crawler Orchestration
|
||
|
||
USAGE:
|
||
npx tsx src/scripts/queue-dispensaries.ts [OPTIONS]
|
||
|
||
OPTIONS:
|
||
--detection Queue dispensaries that need provider detection
|
||
--production Queue Dutchie production crawls
|
||
--sandbox Queue sandbox/learning crawls
|
||
--all Queue all job types (default if no specific flag)
|
||
--process Process queued jobs instead of just queuing
|
||
--dry-run Show what would be queued without making changes
|
||
--limit=N Maximum dispensaries to queue per type (default: 10)
|
||
--help, -h Show this help message
|
||
|
||
EXAMPLES:
|
||
# Queue all dispensaries for appropriate jobs
|
||
npx tsx src/scripts/queue-dispensaries.ts
|
||
|
||
# Only queue detection jobs
|
||
npx tsx src/scripts/queue-dispensaries.ts --detection --limit=20
|
||
|
||
# Dry run to see what would be queued
|
||
npx tsx src/scripts/queue-dispensaries.ts --dry-run
|
||
|
||
# Process sandbox jobs
|
||
npx tsx src/scripts/queue-dispensaries.ts --process
|
||
`);
|
||
}
|
||
|
||
async function queueDetectionJobs(): Promise<number> {
|
||
console.log('\n📡 Queueing Detection Jobs...');
|
||
|
||
// Find dispensaries that need provider detection:
|
||
// - menu_provider is null OR
|
||
// - menu_provider_confidence < 70 AND
|
||
// - crawler_status is idle (not already queued/running)
|
||
// - has a website URL
|
||
const query = `
|
||
SELECT id, name, website, menu_url, menu_provider, menu_provider_confidence
|
||
FROM dispensaries
|
||
WHERE (website IS NOT NULL OR menu_url IS NOT NULL)
|
||
AND crawler_status = 'idle'
|
||
AND (menu_provider IS NULL OR menu_provider_confidence < 70)
|
||
ORDER BY
|
||
CASE WHEN menu_provider IS NULL THEN 0 ELSE 1 END,
|
||
menu_provider_confidence ASC
|
||
LIMIT $1
|
||
`;
|
||
|
||
const result = await pool.query(query, [flags.limit]);
|
||
|
||
if (flags.dryRun) {
|
||
console.log(` Would queue ${result.rows.length} dispensaries for detection:`);
|
||
for (const row of result.rows) {
|
||
console.log(` - [${row.id}] ${row.name} (current: ${row.menu_provider || 'unknown'}, confidence: ${row.menu_provider_confidence}%)`);
|
||
}
|
||
return result.rows.length;
|
||
}
|
||
|
||
let queued = 0;
|
||
for (const dispensary of result.rows) {
|
||
try {
|
||
// Update status to queued
|
||
await pool.query(
|
||
`UPDATE dispensaries SET crawler_status = 'queued_detection', updated_at = NOW() WHERE id = $1`,
|
||
[dispensary.id]
|
||
);
|
||
|
||
// Create sandbox job for detection
|
||
await pool.query(
|
||
`INSERT INTO sandbox_crawl_jobs (dispensary_id, job_type, status, priority)
|
||
VALUES ($1, 'detection', 'pending', 10)`,
|
||
[dispensary.id]
|
||
);
|
||
|
||
console.log(` ✓ Queued detection: [${dispensary.id}] ${dispensary.name}`);
|
||
queued++;
|
||
} catch (error: any) {
|
||
console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
|
||
}
|
||
}
|
||
|
||
return queued;
|
||
}
|
||
|
||
async function queueProductionCrawls(): Promise<number> {
|
||
console.log('\n🏭 Queueing Production Dutchie Crawls...');
|
||
|
||
// Find Dutchie dispensaries ready for production crawl:
|
||
// - menu_provider = 'dutchie'
|
||
// - crawler_mode = 'production'
|
||
// - crawler_status is idle
|
||
// - last_menu_scrape is old or null
|
||
const query = `
|
||
SELECT d.id, d.name, d.last_menu_scrape, d.menu_url
|
||
FROM dispensaries d
|
||
WHERE d.menu_provider = 'dutchie'
|
||
AND d.crawler_mode = 'production'
|
||
AND d.crawler_status = 'idle'
|
||
AND (d.last_menu_scrape IS NULL OR d.last_menu_scrape < NOW() - INTERVAL '4 hours')
|
||
ORDER BY
|
||
CASE WHEN d.last_menu_scrape IS NULL THEN 0 ELSE 1 END,
|
||
d.last_menu_scrape ASC
|
||
LIMIT $1
|
||
`;
|
||
|
||
const result = await pool.query(query, [flags.limit]);
|
||
|
||
if (flags.dryRun) {
|
||
console.log(` Would queue ${result.rows.length} Dutchie dispensaries for production crawl:`);
|
||
for (const row of result.rows) {
|
||
const lastScrape = row.last_menu_scrape ? new Date(row.last_menu_scrape).toISOString() : 'never';
|
||
console.log(` - [${row.id}] ${row.name} (last scrape: ${lastScrape})`);
|
||
}
|
||
return result.rows.length;
|
||
}
|
||
|
||
let queued = 0;
|
||
for (const dispensary of result.rows) {
|
||
try {
|
||
// Update status to queued
|
||
await pool.query(
|
||
`UPDATE dispensaries SET crawler_status = 'queued_crawl', updated_at = NOW() WHERE id = $1`,
|
||
[dispensary.id]
|
||
);
|
||
|
||
// Create crawl job in the main crawl_jobs table (production queue)
|
||
await pool.query(
|
||
`INSERT INTO crawl_jobs (store_id, job_type, trigger_type, status, priority, metadata)
|
||
SELECT s.id, 'full_crawl', 'scheduled', 'pending', 50,
|
||
jsonb_build_object('dispensary_id', $1, 'source', 'queue-dispensaries')
|
||
FROM stores s
|
||
JOIN dispensaries d ON (d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%')
|
||
WHERE d.id = $1
|
||
LIMIT 1`,
|
||
[dispensary.id]
|
||
);
|
||
|
||
console.log(` ✓ Queued production crawl: [${dispensary.id}] ${dispensary.name}`);
|
||
queued++;
|
||
} catch (error: any) {
|
||
console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
|
||
}
|
||
}
|
||
|
||
return queued;
|
||
}
|
||
|
||
async function queueSandboxCrawls(): Promise<number> {
|
||
console.log('\n🧪 Queueing Sandbox Crawls...');
|
||
|
||
// Find sandbox dispensaries needing crawls:
|
||
// - crawler_mode = 'sandbox'
|
||
// - crawler_status in (idle, error_needs_review)
|
||
// - No recent sandbox job
|
||
const query = `
|
||
SELECT d.id, d.name, d.menu_provider, d.crawler_status, d.website
|
||
FROM dispensaries d
|
||
WHERE d.crawler_mode = 'sandbox'
|
||
AND d.crawler_status IN ('idle', 'error_needs_review')
|
||
AND (d.website IS NOT NULL OR d.menu_url IS NOT NULL)
|
||
AND NOT EXISTS (
|
||
SELECT 1 FROM sandbox_crawl_jobs sj
|
||
WHERE sj.dispensary_id = d.id
|
||
AND sj.status IN ('pending', 'running')
|
||
)
|
||
ORDER BY d.updated_at ASC
|
||
LIMIT $1
|
||
`;
|
||
|
||
const result = await pool.query(query, [flags.limit]);
|
||
|
||
if (flags.dryRun) {
|
||
console.log(` Would queue ${result.rows.length} dispensaries for sandbox crawl:`);
|
||
for (const row of result.rows) {
|
||
console.log(` - [${row.id}] ${row.name} (provider: ${row.menu_provider || 'unknown'}, status: ${row.crawler_status})`);
|
||
}
|
||
return result.rows.length;
|
||
}
|
||
|
||
let queued = 0;
|
||
for (const dispensary of result.rows) {
|
||
try {
|
||
// Update status
|
||
await pool.query(
|
||
`UPDATE dispensaries SET crawler_status = 'queued_crawl', updated_at = NOW() WHERE id = $1`,
|
||
[dispensary.id]
|
||
);
|
||
|
||
// Create sandbox job
|
||
await pool.query(
|
||
`INSERT INTO sandbox_crawl_jobs (dispensary_id, job_type, status, priority)
|
||
VALUES ($1, 'deep_crawl', 'pending', 5)`,
|
||
[dispensary.id]
|
||
);
|
||
|
||
console.log(` ✓ Queued sandbox crawl: [${dispensary.id}] ${dispensary.name}`);
|
||
queued++;
|
||
} catch (error: any) {
|
||
console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
|
||
}
|
||
}
|
||
|
||
return queued;
|
||
}
|
||
|
||
async function processJobs(): Promise<void> {
|
||
console.log('\n⚙️ Processing Queued Jobs...\n');
|
||
|
||
// Process sandbox jobs (detection + sandbox crawls)
|
||
const sandboxJobs = await pool.query(
|
||
`SELECT * FROM sandbox_crawl_jobs
|
||
WHERE status = 'pending'
|
||
ORDER BY priority DESC, scheduled_at ASC
|
||
LIMIT $1`,
|
||
[flags.limit]
|
||
);
|
||
|
||
console.log(`Found ${sandboxJobs.rows.length} pending sandbox jobs\n`);
|
||
|
||
for (const job of sandboxJobs.rows) {
|
||
console.log(`Processing job ${job.id} (${job.job_type}) for dispensary ${job.dispensary_id}...`);
|
||
|
||
try {
|
||
// Mark as running
|
||
await pool.query(
|
||
`UPDATE sandbox_crawl_jobs SET status = 'running', started_at = NOW() WHERE id = $1`,
|
||
[job.id]
|
||
);
|
||
|
||
let result;
|
||
if (job.job_type === 'detection') {
|
||
result = await runDetectMenuProviderJob(job.dispensary_id);
|
||
} else {
|
||
result = await runSandboxCrawlJob(job.dispensary_id, job.sandbox_id);
|
||
}
|
||
|
||
// Update job status
|
||
await pool.query(
|
||
`UPDATE sandbox_crawl_jobs
|
||
SET status = $1, completed_at = NOW(), result_summary = $2, error_message = $3
|
||
WHERE id = $4`,
|
||
[
|
||
result.success ? 'completed' : 'failed',
|
||
JSON.stringify(result.data || {}),
|
||
result.success ? null : result.message,
|
||
job.id,
|
||
]
|
||
);
|
||
|
||
console.log(` ${result.success ? '✓' : '✗'} ${result.message}\n`);
|
||
|
||
} catch (error: any) {
|
||
await pool.query(
|
||
`UPDATE sandbox_crawl_jobs SET status = 'failed', error_message = $1 WHERE id = $2`,
|
||
[error.message, job.id]
|
||
);
|
||
console.log(` ✗ Error: ${error.message}\n`);
|
||
}
|
||
}
|
||
}
|
||
|
||
async function showStats(): Promise<void> {
|
||
console.log('\n📊 Current Stats:');
|
||
|
||
// Dispensary stats
|
||
const stats = await pool.query(`
|
||
SELECT
|
||
COUNT(*) as total,
|
||
COUNT(*) FILTER (WHERE menu_provider IS NULL) as no_provider,
|
||
COUNT(*) FILTER (WHERE menu_provider = 'dutchie') as dutchie,
|
||
COUNT(*) FILTER (WHERE menu_provider NOT IN ('dutchie', 'unknown') AND menu_provider IS NOT NULL) as other_providers,
|
||
COUNT(*) FILTER (WHERE menu_provider = 'unknown') as unknown,
|
||
COUNT(*) FILTER (WHERE crawler_mode = 'production') as production_mode,
|
||
COUNT(*) FILTER (WHERE crawler_mode = 'sandbox') as sandbox_mode,
|
||
COUNT(*) FILTER (WHERE crawler_status = 'idle') as idle,
|
||
COUNT(*) FILTER (WHERE crawler_status LIKE 'queued%') as queued,
|
||
COUNT(*) FILTER (WHERE crawler_status = 'running') as running,
|
||
COUNT(*) FILTER (WHERE crawler_status = 'ok') as ok,
|
||
COUNT(*) FILTER (WHERE crawler_status = 'error_needs_review') as needs_review
|
||
FROM dispensaries
|
||
`);
|
||
|
||
const s = stats.rows[0];
|
||
console.log(`
|
||
Dispensaries: ${s.total}
|
||
- No provider detected: ${s.no_provider}
|
||
- Dutchie: ${s.dutchie}
|
||
- Other providers: ${s.other_providers}
|
||
- Unknown: ${s.unknown}
|
||
|
||
Crawler Mode:
|
||
- Production: ${s.production_mode}
|
||
- Sandbox: ${s.sandbox_mode}
|
||
|
||
Status:
|
||
- Idle: ${s.idle}
|
||
- Queued: ${s.queued}
|
||
- Running: ${s.running}
|
||
- OK: ${s.ok}
|
||
- Needs Review: ${s.needs_review}
|
||
`);
|
||
|
||
// Job stats
|
||
const jobStats = await pool.query(`
|
||
SELECT
|
||
COUNT(*) FILTER (WHERE status = 'pending') as pending,
|
||
COUNT(*) FILTER (WHERE status = 'running') as running,
|
||
COUNT(*) FILTER (WHERE status = 'completed') as completed,
|
||
COUNT(*) FILTER (WHERE status = 'failed') as failed
|
||
FROM sandbox_crawl_jobs
|
||
`);
|
||
|
||
const j = jobStats.rows[0];
|
||
console.log(` Sandbox Jobs:
|
||
- Pending: ${j.pending}
|
||
- Running: ${j.running}
|
||
- Completed: ${j.completed}
|
||
- Failed: ${j.failed}
|
||
`);
|
||
}
|
||
|
||
async function main() {
|
||
if (flags.help) {
|
||
await showHelp();
|
||
process.exit(0);
|
||
}
|
||
|
||
console.log('═══════════════════════════════════════════════════════');
|
||
console.log(' Multi-Provider Crawler Queue Manager');
|
||
console.log('═══════════════════════════════════════════════════════');
|
||
|
||
if (flags.dryRun) {
|
||
console.log('\n🔍 DRY RUN MODE - No changes will be made\n');
|
||
}
|
||
|
||
try {
|
||
// Show current stats first
|
||
await showStats();
|
||
|
||
if (flags.process) {
|
||
// Process mode - run jobs instead of queuing
|
||
await processJobs();
|
||
} else {
|
||
// Queuing mode
|
||
let totalQueued = 0;
|
||
|
||
if (flags.detection) {
|
||
totalQueued += await queueDetectionJobs();
|
||
}
|
||
|
||
if (flags.production) {
|
||
totalQueued += await queueProductionCrawls();
|
||
}
|
||
|
||
if (flags.sandbox) {
|
||
totalQueued += await queueSandboxCrawls();
|
||
}
|
||
|
||
console.log('\n═══════════════════════════════════════════════════════');
|
||
console.log(` Total dispensaries queued: ${totalQueued}`);
|
||
console.log('═══════════════════════════════════════════════════════\n');
|
||
}
|
||
|
||
// Show updated stats
|
||
if (!flags.dryRun) {
|
||
await showStats();
|
||
}
|
||
|
||
} catch (error) {
|
||
console.error('Fatal error:', error);
|
||
process.exit(1);
|
||
} finally {
|
||
await pool.end();
|
||
}
|
||
}
|
||
|
||
main();
|