Add crawler scheduler, orchestrator, and multi-category intelligence

- Add scheduler UI with store schedules, job queue, and global settings
- Add store crawl orchestrator for intelligent crawl workflow
- Add multi-category intelligence detection (product, specials, brands, metadata)
- Add CrawlerLogger for structured JSON logging
- Add migrations for scheduler tables and dispensary linking
- Add dispensary → scheduler navigation link
- Support production/sandbox crawler modes per provider

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-11-30 09:29:15 -07:00
parent 8b4292fbb2
commit 3861a31a3b
25 changed files with 8874 additions and 13 deletions

View File

@@ -0,0 +1,345 @@
#!/usr/bin/env npx tsx
/**
* Backfill Store-Dispensary Mapping
*
* Links existing stores (scheduler) to dispensaries (master AZDHS directory)
* by matching on exact name, normalized name, company name, slug, or fuzzy name similarity.
*
* Usage:
* npx tsx src/scripts/backfill-store-dispensary.ts # Preview matches
* npx tsx src/scripts/backfill-store-dispensary.ts --apply # Apply matches
* npx tsx src/scripts/backfill-store-dispensary.ts --verbose # Show all match details
*/
import { pool } from '../db/migrate';
import { logger } from '../services/logger';
// Everything after the node binary and the script path.
const args = process.argv.slice(2);

/** Parsed command-line switches; each entry is true when its switch is present. */
const flags = {
  apply: args.indexOf('--apply') !== -1,
  verbose: args.indexOf('--verbose') !== -1,
  help: ['--help', '-h'].some((alias) => args.indexOf(alias) !== -1),
};
/** A row of the `stores` table, limited to the columns this script selects. */
interface Store {
id: number;
name: string;
slug: string;
// null while the store has not been linked to a dispensary yet
dispensary_id: number | null;
}
/** A row of the `dispensaries` table, limited to the columns this script selects. */
interface Dispensary {
id: number;
name: string;
// Optional company name; used as an alternative name when matching
company_name: string | null;
// Only used for display in the match report
city: string;
address: string;
slug: string;
}
/** Outcome of trying to match one store against the dispensary list. */
interface MatchResult {
store: Store;
// null when no dispensary matched (matchType is then 'none')
dispensary: Dispensary | null;
// Which rule produced the match
matchType: 'exact_name' | 'normalized_name' | 'company_name' | 'slug' | 'fuzzy' | 'none';
// Match confidence from 0 to 100; 0 when nothing matched
score: number;
}
/**
 * Normalize a store/dispensary name for comparison.
 *
 * Lower-cases the name, converts typographic apostrophes to straight ones,
 * turns dashes into spaces, drops generic industry / legal-entity words,
 * strips the remaining punctuation and collapses whitespace.
 *
 * Generic words are removed only when they stand alone as whole words, so
 * "Lincoln" keeps its "inc" and "Bishop" keeps its "shop".
 *
 * @param name - Raw display name.
 * @returns The canonical form. May be an empty string when the name consists
 *   solely of generic words (e.g. "Cannabis Store").
 */
function normalizeName(name: string): string {
  return name
    .toLowerCase()
    // Must run before the punctuation strip, otherwise curly apostrophes are deleted.
    .replace(/[\u2018\u2019]/g, "'")
    // Hyphen, en dash and em dash become word separators.
    .replace(/\s*[-\u2013\u2014]\s*/g, ' ')
    // Whole-word match only; the leftover spaces are collapsed below.
    .replace(/\b(dispensary|cannabis|marijuana|weed|shop|store|llc|inc)\b/g, ' ')
    .replace(/[^\w\s']/g, '') // Remove other punctuation
    .replace(/\s+/g, ' ') // Collapse whitespace
    .trim();
}
/**
 * Levenshtein edit distance between two strings.
 *
 * Counts the minimum number of single-character substitutions, insertions
 * and deletions needed to turn one string into the other. Only the previous
 * row of the dynamic-programming table is kept.
 */
function levenshteinDistance(a: string, b: string): number {
  // Row 0: distance from the empty prefix of `b` to each prefix of `a`.
  let previousRow: number[] = Array.from({ length: a.length + 1 }, (_, col) => col);

  for (let row = 1; row <= b.length; row++) {
    const bChar = b.charAt(row - 1);
    const currentRow: number[] = [row];

    for (let col = 1; col <= a.length; col++) {
      const diagonal = previousRow[col - 1];
      currentRow[col] =
        bChar === a.charAt(col - 1)
          ? diagonal
          : Math.min(
              diagonal + 1, // substitution
              currentRow[col - 1] + 1, // insertion
              previousRow[col] + 1 // deletion
            );
    }

    previousRow = currentRow;
  }

  return previousRow[a.length];
}
/**
 * Similarity of two strings as a rounded percentage (0-100), derived from
 * their edit distance relative to the longer string. Two empty strings
 * count as identical (100).
 */
function similarityScore(a: string, b: string): number {
  const longest = Math.max(a.length, b.length);
  if (longest === 0) {
    return 100;
  }

  const edits = levenshteinDistance(a, b);
  return Math.round((1 - edits / longest) * 100);
}
/**
 * Find the best dispensary match for a store.
 *
 * Rules, strongest first:
 *   1. exact name (case-insensitive)        -> 100
 *   2. normalized name                      -> 95
 *   3. normalized name equals company name  -> 90
 *   4. slug                                 -> 85
 *   5. fuzzy name/company similarity >= 70  -> similarity score
 *
 * Every dispensary is examined before a rule 2-4 match is returned, so the
 * result no longer depends on row order: an exact-name match on a later row
 * beats a slug match on an earlier one. A rule 1-4 match always wins over a
 * fuzzy match, as before.
 *
 * Names that normalize to an empty string (only generic words such as
 * "Cannabis Store") are never matched by the normalized or fuzzy rules,
 * because two empty strings would otherwise compare as identical.
 *
 * @param store - Store to link.
 * @param dispensaries - Candidate dispensaries.
 * @returns The best match, or a result with `dispensary: null` and
 *   `matchType: 'none'` when no rule applies.
 */
function findBestMatch(store: Store, dispensaries: Dispensary[]): MatchResult {
  const storeNameLower = store.name.toLowerCase();
  const normalizedStoreName = normalizeName(store.name);
  const storeSlug = store.slug.toLowerCase();

  // Best match from rules 2-4 (rule 1 returns immediately).
  let bestRuleMatch: MatchResult | null = null;
  // Best fuzzy candidate; used only when no rule matched.
  let bestFuzzyMatch: MatchResult = {
    store,
    dispensary: null,
    matchType: 'none',
    score: 0,
  };

  for (const disp of dispensaries) {
    // 1. Exact name match (case-insensitive): nothing can beat it.
    if (storeNameLower === disp.name.toLowerCase()) {
      return { store, dispensary: disp, matchType: 'exact_name', score: 100 };
    }

    const normalizedDispName = normalizeName(disp.name);
    const normalizedCompanyName = disp.company_name ? normalizeName(disp.company_name) : '';
    const dispSlug = disp.slug.toLowerCase();

    let ruleMatch: MatchResult | null = null;
    if (normalizedStoreName !== '' && normalizedStoreName === normalizedDispName) {
      // 2. Normalized name match
      ruleMatch = { store, dispensary: disp, matchType: 'normalized_name', score: 95 };
    } else if (normalizedStoreName !== '' && normalizedStoreName === normalizedCompanyName) {
      // 3. Store name matches company name
      ruleMatch = { store, dispensary: disp, matchType: 'company_name', score: 90 };
    } else if (storeSlug !== '' && storeSlug === dispSlug) {
      // 4. Slug match
      ruleMatch = { store, dispensary: disp, matchType: 'slug', score: 85 };
    }

    if (ruleMatch !== null) {
      // Strictly greater keeps the first dispensary on ties.
      if (bestRuleMatch === null || ruleMatch.score > bestRuleMatch.score) {
        bestRuleMatch = ruleMatch;
      }
      continue;
    }

    // 5. Fuzzy matching: pointless once a rule matched, or without a usable name.
    if (bestRuleMatch !== null || normalizedStoreName === '') {
      continue;
    }
    const nameScore = normalizedDispName !== ''
      ? similarityScore(normalizedStoreName, normalizedDispName)
      : 0;
    const companyScore = normalizedCompanyName !== ''
      ? similarityScore(normalizedStoreName, normalizedCompanyName)
      : 0;
    const fuzzyScore = Math.max(nameScore, companyScore);
    if (fuzzyScore >= 70 && fuzzyScore > bestFuzzyMatch.score) {
      bestFuzzyMatch = { store, dispensary: disp, matchType: 'fuzzy', score: fuzzyScore };
    }
  }

  return bestRuleMatch !== null ? bestRuleMatch : bestFuzzyMatch;
}
/**
 * Script entry point.
 *
 * Loads unmapped stores and all dispensaries, computes the best match for
 * each store, prints a report grouped by match type and, with `--apply`,
 * writes `stores.dispensary_id` for every matched store.
 *
 * The pool is closed exactly once, in `finally`. Failures set
 * `process.exitCode` instead of calling `process.exit()`, because exiting
 * inside `catch` would terminate the process before `finally` could close
 * the pool.
 */
async function main() {
  if (flags.help) {
    console.log(`
Backfill Store-Dispensary Mapping
Links existing stores (scheduler) to dispensaries (master AZDHS directory)
by matching on name, company name, or slug similarity.
USAGE:
npx tsx src/scripts/backfill-store-dispensary.ts [OPTIONS]
OPTIONS:
--apply Apply the mappings to the database (default: preview only)
--verbose Show detailed match information for all stores
--help, -h Show this help message
EXAMPLES:
# Preview what would be matched
npx tsx src/scripts/backfill-store-dispensary.ts
# Apply the mappings
npx tsx src/scripts/backfill-store-dispensary.ts --apply
# Show verbose output
npx tsx src/scripts/backfill-store-dispensary.ts --verbose
`);
    // No database work has been done yet, so exiting directly is safe here.
    process.exit(0);
  }
  console.log('\n📦 Backfill Store-Dispensary Mapping');
  console.log('=====================================\n');
  try {
    // Fetch all stores without a dispensary_id
    const storesResult = await pool.query<Store>(`
SELECT id, name, slug, dispensary_id
FROM stores
WHERE dispensary_id IS NULL
ORDER BY name
`);
    const unmappedStores = storesResult.rows;
    // Fetch all already-mapped stores for context
    const mappedResult = await pool.query<Store>(`
SELECT id, name, slug, dispensary_id
FROM stores
WHERE dispensary_id IS NOT NULL
ORDER BY name
`);
    const mappedStores = mappedResult.rows;
    // Fetch all dispensaries
    const dispResult = await pool.query<Dispensary>(`
SELECT id, name, company_name, city, address, slug
FROM dispensaries
ORDER BY name
`);
    const dispensaries = dispResult.rows;
    console.log(`📊 Current Status:`);
    console.log(` Stores without dispensary_id: ${unmappedStores.length}`);
    console.log(` Stores already mapped: ${mappedStores.length}`);
    console.log(` Total dispensaries: ${dispensaries.length}\n`);
    if (unmappedStores.length === 0) {
      console.log('✅ All stores are already mapped to dispensaries!\n');
      // The finally block closes the pool; closing it here too would end it twice.
      return;
    }
    // Find matches for each unmapped store
    const matches: MatchResult[] = [];
    const noMatches: Store[] = [];
    for (const store of unmappedStores) {
      const match = findBestMatch(store, dispensaries);
      if (match.dispensary) {
        matches.push(match);
      } else {
        noMatches.push(store);
      }
    }
    // Sort matches by score (highest first)
    matches.sort((a, b) => b.score - a.score);
    // Display results
    console.log(`\n🔗 Matches Found: ${matches.length}`);
    console.log('----------------------------------\n');
    if (matches.length > 0) {
      // Group by match type
      const byType: Record<string, MatchResult[]> = {};
      for (const m of matches) {
        if (!byType[m.matchType]) byType[m.matchType] = [];
        byType[m.matchType].push(m);
      }
      const typeLabels: Record<string, string> = {
        exact_name: '✅ Exact Name Match',
        normalized_name: '✅ Normalized Name Match',
        company_name: '🏢 Company Name Match',
        slug: '🔗 Slug Match',
        fuzzy: '🔍 Fuzzy Match',
      };
      for (const [type, results] of Object.entries(byType)) {
        console.log(`${typeLabels[type]} (${results.length}):`);
        for (const r of results) {
          const dispInfo = r.dispensary;
          if (!dispInfo) continue;
          console.log(` • "${r.store.name}" → "${dispInfo.name}" (${dispInfo.city}) [${r.score}%]`);
        }
        console.log('');
      }
    }
    if (noMatches.length > 0) {
      console.log(`\n❌ No Match Found: ${noMatches.length}`);
      console.log('----------------------------------\n');
      for (const store of noMatches) {
        console.log(` • "${store.name}" (slug: ${store.slug})`);
      }
      console.log('');
    }
    // Apply if requested
    if (flags.apply && matches.length > 0) {
      console.log('\n🔧 Applying mappings...\n');
      let updated = 0;
      for (const match of matches) {
        if (!match.dispensary) continue;
        await pool.query(
          'UPDATE stores SET dispensary_id = $1 WHERE id = $2',
          [match.dispensary.id, match.store.id]
        );
        updated++;
        if (flags.verbose) {
          console.log(` ✓ Linked store ${match.store.id} to dispensary ${match.dispensary.id}`);
        }
      }
      console.log(`\n✅ Updated ${updated} stores with dispensary mappings\n`);
      logger.info('system', `Backfill complete: linked ${updated} stores to dispensaries`);
    } else if (matches.length > 0 && !flags.apply) {
      console.log('\n💡 Run with --apply to update the database\n');
    }
    // Summary ("Would match" is only accurate in preview mode)
    console.log('📈 Summary:');
    console.log(` ${flags.apply ? 'Matched' : 'Would match'}: ${matches.length} stores`);
    console.log(` No match: ${noMatches.length} stores`);
    console.log(` Match rate: ${Math.round((matches.length / unmappedStores.length) * 100)}%\n`);
  } catch (error) {
    console.error('Error:', error);
    process.exitCode = 1;
  } finally {
    await pool.end();
  }
}

// Anything that still escapes main() (e.g. pool.end() failing) must not exit with code 0.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});

View File

@@ -0,0 +1,424 @@
#!/usr/bin/env npx tsx
/**
* Queue Dispensaries Script
*
* Orchestrates the multi-provider crawler system:
* 1. Queue dispensaries that need provider detection
* 2. Queue Dutchie dispensaries for production crawl
* 3. Queue sandbox dispensaries for learning crawls
*
* Usage:
* npx tsx src/scripts/queue-dispensaries.ts [--detection] [--production] [--sandbox] [--all]
* npx tsx src/scripts/queue-dispensaries.ts --dry-run
* npx tsx src/scripts/queue-dispensaries.ts --process # Process queued jobs
*/
import { pool } from '../db/migrate';
import { logger } from '../services/logger';
import {
runDetectMenuProviderJob,
runDutchieMenuCrawlJob,
runSandboxCrawlJob,
processSandboxJobs,
} from '../services/crawler-jobs';
// Parse command line args
const args = process.argv.slice(2);

/**
 * Parse the value of a numeric `--name=N` option.
 *
 * Accepts only plain non-negative decimal integers. Anything else
 * ("abc", "-5", "12px", "1e3") would previously become NaN or a negative
 * number and be passed straight to SQL `LIMIT`, so it now produces a
 * warning and the fallback value instead.
 *
 * @param raw - Text after the `=` sign, or undefined when the option is absent.
 * @param fallback - Value used when the option is absent, empty or invalid.
 * @returns The parsed integer or `fallback`.
 */
function parseNonNegativeIntArg(raw: string | undefined, fallback: number): number {
  if (raw === undefined || raw === '') {
    return fallback;
  }
  const parsed = /^\d+$/.test(raw) ? Number.parseInt(raw, 10) : NaN;
  if (!Number.isSafeInteger(parsed)) {
    console.warn(`Ignoring invalid numeric option value "${raw}"; using ${fallback}`);
    return fallback;
  }
  return parsed;
}

const limitPrefix = '--limit=';
const flags = {
  detection: args.includes('--detection') || args.includes('--all'),
  production: args.includes('--production') || args.includes('--all'),
  sandbox: args.includes('--sandbox') || args.includes('--all'),
  dryRun: args.includes('--dry-run'),
  process: args.includes('--process'),
  help: args.includes('--help') || args.includes('-h'),
  limit: parseNonNegativeIntArg(
    args.find((a) => a.startsWith(limitPrefix))?.slice(limitPrefix.length),
    10
  ),
};
// If no specific flags, default to all
if (!flags.detection && !flags.production && !flags.sandbox && !flags.process) {
  flags.detection = true;
  flags.production = true;
  flags.sandbox = true;
}
/** Print the command-line usage text to stdout. */
async function showHelp() {
  const usageText = `
Queue Dispensaries - Multi-Provider Crawler Orchestration
USAGE:
npx tsx src/scripts/queue-dispensaries.ts [OPTIONS]
OPTIONS:
--detection Queue dispensaries that need provider detection
--production Queue Dutchie production crawls
--sandbox Queue sandbox/learning crawls
--all Queue all job types (default if no specific flag)
--process Process queued jobs instead of just queuing
--dry-run Show what would be queued without making changes
--limit=N Maximum dispensaries to queue per type (default: 10)
--help, -h Show this help message
EXAMPLES:
# Queue all dispensaries for appropriate jobs
npx tsx src/scripts/queue-dispensaries.ts
# Only queue detection jobs
npx tsx src/scripts/queue-dispensaries.ts --detection --limit=20
# Dry run to see what would be queued
npx tsx src/scripts/queue-dispensaries.ts --dry-run
# Process sandbox jobs
npx tsx src/scripts/queue-dispensaries.ts --process
`;
  console.log(usageText);
}
/**
 * Queue provider-detection jobs for dispensaries whose menu provider is
 * unknown or was detected with low confidence.
 *
 * In dry-run mode the candidates are only listed. Otherwise each candidate
 * is moved to `queued_detection` and receives a pending `detection` row in
 * `sandbox_crawl_jobs`. Both writes happen in ONE statement, so a failed
 * insert cannot leave a dispensary stuck in `queued_detection` without a job.
 *
 * @returns Number of dispensaries queued (or that would be queued in dry-run).
 */
async function queueDetectionJobs(): Promise<number> {
  console.log('\n📡 Queueing Detection Jobs...');
  // Candidates:
  // - have a website or menu URL
  // - crawler_status is idle (not already queued/running)
  // - menu_provider is null OR menu_provider_confidence < 70
  const query = `
    SELECT id, name, website, menu_url, menu_provider, menu_provider_confidence
    FROM dispensaries
    WHERE (website IS NOT NULL OR menu_url IS NOT NULL)
      AND crawler_status = 'idle'
      AND (menu_provider IS NULL OR menu_provider_confidence < 70)
    ORDER BY
      CASE WHEN menu_provider IS NULL THEN 0 ELSE 1 END,
      menu_provider_confidence ASC
    LIMIT $1
  `;
  const result = await pool.query(query, [flags.limit]);
  if (flags.dryRun) {
    console.log(` Would queue ${result.rows.length} dispensaries for detection:`);
    for (const row of result.rows) {
      console.log(` - [${row.id}] ${row.name} (current: ${row.menu_provider || 'unknown'}, confidence: ${row.menu_provider_confidence}%)`);
    }
    return result.rows.length;
  }
  let queued = 0;
  for (const dispensary of result.rows) {
    try {
      // Claim the dispensary and create its job atomically. The status is
      // re-checked so a row queued by someone else since the SELECT is skipped.
      const inserted = await pool.query(
        `WITH claimed AS (
           UPDATE dispensaries
           SET crawler_status = 'queued_detection', updated_at = NOW()
           WHERE id = $1 AND crawler_status = 'idle'
           RETURNING id
         )
         INSERT INTO sandbox_crawl_jobs (dispensary_id, job_type, status, priority)
         SELECT id, 'detection', 'pending', 10 FROM claimed`,
        [dispensary.id]
      );
      if ((inserted.rowCount ?? 0) === 0) {
        console.log(` - Skipped [${dispensary.id}] ${dispensary.name}: no longer idle`);
        continue;
      }
      console.log(` ✓ Queued detection: [${dispensary.id}] ${dispensary.name}`);
      queued++;
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      console.error(` ✗ Failed to queue [${dispensary.id}]: ${message}`);
    }
  }
  return queued;
}
/**
 * Queue production crawls for Dutchie dispensaries that are in production
 * mode, idle, and were last scraped more than four hours ago (or never).
 *
 * A crawl job belongs to a store, so the dispensary must resolve to a row in
 * `stores`. The dispensary is marked `queued_crawl` only when a job row was
 * really inserted; previously a dispensary with no matching store was marked
 * queued, reported as success and left without any job.
 *
 * @returns Number of dispensaries queued (or that would be queued in dry-run).
 */
async function queueProductionCrawls(): Promise<number> {
  console.log('\n🏭 Queueing Production Dutchie Crawls...');
  // Candidates:
  // - menu_provider = 'dutchie'
  // - crawler_mode = 'production'
  // - crawler_status is idle
  // - last_menu_scrape is old or null
  const query = `
    SELECT d.id, d.name, d.last_menu_scrape, d.menu_url
    FROM dispensaries d
    WHERE d.menu_provider = 'dutchie'
      AND d.crawler_mode = 'production'
      AND d.crawler_status = 'idle'
      AND (d.last_menu_scrape IS NULL OR d.last_menu_scrape < NOW() - INTERVAL '4 hours')
    ORDER BY
      CASE WHEN d.last_menu_scrape IS NULL THEN 0 ELSE 1 END,
      d.last_menu_scrape ASC
    LIMIT $1
  `;
  const result = await pool.query(query, [flags.limit]);
  if (flags.dryRun) {
    console.log(` Would queue ${result.rows.length} Dutchie dispensaries for production crawl:`);
    for (const row of result.rows) {
      const lastScrape = row.last_menu_scrape ? new Date(row.last_menu_scrape).toISOString() : 'never';
      console.log(` - [${row.id}] ${row.name} (last scrape: ${lastScrape})`);
    }
    return result.rows.length;
  }
  let queued = 0;
  for (const dispensary of result.rows) {
    try {
      // One statement: insert the job, then flip the status only if a job exists.
      // Store resolution prefers the explicit stores.dispensary_id link, then an
      // exact menu URL match, then the loose name match; ORDER BY makes the
      // choice deterministic. NOTE(review): stores.dispensary_id is the linking
      // column filled by the backfill script in this commit - confirm the
      // migration has been applied wherever this script runs.
      // `d.id` is used inside jsonb_build_object instead of a bare $1 so the
      // parameter's type never has to be inferred from a VARIADIC "any" argument.
      const claimed = await pool.query(
        `WITH new_job AS (
           INSERT INTO crawl_jobs (store_id, job_type, trigger_type, status, priority, metadata)
           SELECT s.id, 'full_crawl', 'scheduled', 'pending', 50,
                  jsonb_build_object('dispensary_id', d.id, 'source', 'queue-dispensaries')
           FROM stores s
           JOIN dispensaries d ON (
             s.dispensary_id = d.id
             OR d.menu_url = s.dutchie_plus_url
             OR d.name ILIKE '%' || s.name || '%'
           )
           WHERE d.id = $1 AND d.crawler_status = 'idle'
           ORDER BY
             CASE
               WHEN s.dispensary_id = d.id THEN 0
               WHEN d.menu_url = s.dutchie_plus_url THEN 1
               ELSE 2
             END,
             s.id
           LIMIT 1
           RETURNING store_id
         )
         UPDATE dispensaries
         SET crawler_status = 'queued_crawl', updated_at = NOW()
         WHERE id = $1 AND EXISTS (SELECT 1 FROM new_job)`,
        [dispensary.id]
      );
      if ((claimed.rowCount ?? 0) === 0) {
        console.error(` ✗ Skipped [${dispensary.id}] ${dispensary.name}: no matching store, or no longer idle`);
        continue;
      }
      console.log(` ✓ Queued production crawl: [${dispensary.id}] ${dispensary.name}`);
      queued++;
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      console.error(` ✗ Failed to queue [${dispensary.id}]: ${message}`);
    }
  }
  return queued;
}
/**
 * Queue sandbox (learning) crawls for dispensaries in sandbox mode that are
 * idle or flagged `error_needs_review` and have no pending/running sandbox job.
 *
 * The status change and the job insert are executed as ONE statement, so a
 * failed insert cannot leave a dispensary in `queued_crawl` without a job.
 *
 * @returns Number of dispensaries queued (or that would be queued in dry-run).
 */
async function queueSandboxCrawls(): Promise<number> {
  console.log('\n🧪 Queueing Sandbox Crawls...');
  // Candidates:
  // - crawler_mode = 'sandbox'
  // - crawler_status in (idle, error_needs_review)
  // - have a website or menu URL
  // - no pending or running sandbox job
  const query = `
    SELECT d.id, d.name, d.menu_provider, d.crawler_status, d.website
    FROM dispensaries d
    WHERE d.crawler_mode = 'sandbox'
      AND d.crawler_status IN ('idle', 'error_needs_review')
      AND (d.website IS NOT NULL OR d.menu_url IS NOT NULL)
      AND NOT EXISTS (
        SELECT 1 FROM sandbox_crawl_jobs sj
        WHERE sj.dispensary_id = d.id
          AND sj.status IN ('pending', 'running')
      )
    ORDER BY d.updated_at ASC
    LIMIT $1
  `;
  const result = await pool.query(query, [flags.limit]);
  if (flags.dryRun) {
    console.log(` Would queue ${result.rows.length} dispensaries for sandbox crawl:`);
    for (const row of result.rows) {
      console.log(` - [${row.id}] ${row.name} (provider: ${row.menu_provider || 'unknown'}, status: ${row.crawler_status})`);
    }
    return result.rows.length;
  }
  let queued = 0;
  for (const dispensary of result.rows) {
    try {
      // Claim and enqueue atomically; the status is re-checked so a row that
      // changed since the SELECT is skipped instead of being queued twice.
      const inserted = await pool.query(
        `WITH claimed AS (
           UPDATE dispensaries
           SET crawler_status = 'queued_crawl', updated_at = NOW()
           WHERE id = $1 AND crawler_status IN ('idle', 'error_needs_review')
           RETURNING id
         )
         INSERT INTO sandbox_crawl_jobs (dispensary_id, job_type, status, priority)
         SELECT id, 'deep_crawl', 'pending', 5 FROM claimed`,
        [dispensary.id]
      );
      if ((inserted.rowCount ?? 0) === 0) {
        console.log(` - Skipped [${dispensary.id}] ${dispensary.name}: status changed since selection`);
        continue;
      }
      console.log(` ✓ Queued sandbox crawl: [${dispensary.id}] ${dispensary.name}`);
      queued++;
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      console.error(` ✗ Failed to queue [${dispensary.id}]: ${message}`);
    }
  }
  return queued;
}
/**
 * Run pending rows of `sandbox_crawl_jobs`, highest priority first, up to
 * `flags.limit` jobs.
 *
 * `detection` jobs go to `runDetectMenuProviderJob`; every other job type
 * goes to `runSandboxCrawlJob`. The outcome (status, result summary, error
 * message, completion time) is written back to the job row.
 *
 * A job is claimed with a guarded UPDATE (`status = 'pending'`), so a job
 * picked up by another process between the SELECT and the claim is skipped
 * rather than executed twice. A failure while recording an error no longer
 * aborts the remaining jobs.
 */
async function processJobs(): Promise<void> {
  console.log('\n⚙ Processing Queued Jobs...\n');
  // Process sandbox jobs (detection + sandbox crawls)
  const sandboxJobs = await pool.query(
    `SELECT * FROM sandbox_crawl_jobs
     WHERE status = 'pending'
     ORDER BY priority DESC, scheduled_at ASC
     LIMIT $1`,
    [flags.limit]
  );
  console.log(`Found ${sandboxJobs.rows.length} pending sandbox jobs\n`);
  for (const job of sandboxJobs.rows) {
    console.log(`Processing job ${job.id} (${job.job_type}) for dispensary ${job.dispensary_id}...`);
    try {
      // Claim the job only if it is still pending.
      const claim = await pool.query(
        `UPDATE sandbox_crawl_jobs SET status = 'running', started_at = NOW()
         WHERE id = $1 AND status = 'pending'`,
        [job.id]
      );
      if ((claim.rowCount ?? 0) === 0) {
        console.log(` - Skipped: job is no longer pending\n`);
        continue;
      }
      const result = job.job_type === 'detection'
        ? await runDetectMenuProviderJob(job.dispensary_id)
        : await runSandboxCrawlJob(job.dispensary_id, job.sandbox_id);
      // Update job status
      await pool.query(
        `UPDATE sandbox_crawl_jobs
         SET status = $1, completed_at = NOW(), result_summary = $2, error_message = $3
         WHERE id = $4`,
        [
          result.success ? 'completed' : 'failed',
          JSON.stringify(result.data || {}),
          result.success ? null : result.message,
          job.id,
        ]
      );
      console.log(` ${result.success ? '✓' : '✗'} ${result.message}\n`);
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      try {
        // completed_at is set here as well, matching the normal failure path above.
        await pool.query(
          `UPDATE sandbox_crawl_jobs
           SET status = 'failed', completed_at = NOW(), error_message = $1
           WHERE id = $2`,
          [message, job.id]
        );
      } catch (updateError: unknown) {
        const updateMessage = updateError instanceof Error ? updateError.message : String(updateError);
        console.error(` ✗ Could not record failure for job ${job.id}: ${updateMessage}`);
      }
      console.log(` ✗ Error: ${message}\n`);
    }
  }
}
/**
 * Print aggregate counts to stdout: dispensaries broken down by provider,
 * crawler mode and crawler status, followed by sandbox jobs by status.
 */
async function showStats(): Promise<void> {
  console.log('\n📊 Current Stats:');

  // Dispensary stats
  const dispensarySql = `
SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE menu_provider IS NULL) as no_provider,
COUNT(*) FILTER (WHERE menu_provider = 'dutchie') as dutchie,
COUNT(*) FILTER (WHERE menu_provider NOT IN ('dutchie', 'unknown') AND menu_provider IS NOT NULL) as other_providers,
COUNT(*) FILTER (WHERE menu_provider = 'unknown') as unknown,
COUNT(*) FILTER (WHERE crawler_mode = 'production') as production_mode,
COUNT(*) FILTER (WHERE crawler_mode = 'sandbox') as sandbox_mode,
COUNT(*) FILTER (WHERE crawler_status = 'idle') as idle,
COUNT(*) FILTER (WHERE crawler_status LIKE 'queued%') as queued,
COUNT(*) FILTER (WHERE crawler_status = 'running') as running,
COUNT(*) FILTER (WHERE crawler_status = 'ok') as ok,
COUNT(*) FILTER (WHERE crawler_status = 'error_needs_review') as needs_review
FROM dispensaries
`;
  const { rows: dispensaryRows } = await pool.query(dispensarySql);
  const counts = dispensaryRows[0];
  const dispensaryReport = `
Dispensaries: ${counts.total}
- No provider detected: ${counts.no_provider}
- Dutchie: ${counts.dutchie}
- Other providers: ${counts.other_providers}
- Unknown: ${counts.unknown}
Crawler Mode:
- Production: ${counts.production_mode}
- Sandbox: ${counts.sandbox_mode}
Status:
- Idle: ${counts.idle}
- Queued: ${counts.queued}
- Running: ${counts.running}
- OK: ${counts.ok}
- Needs Review: ${counts.needs_review}
`;
  console.log(dispensaryReport);

  // Job stats
  const jobSql = `
SELECT
COUNT(*) FILTER (WHERE status = 'pending') as pending,
COUNT(*) FILTER (WHERE status = 'running') as running,
COUNT(*) FILTER (WHERE status = 'completed') as completed,
COUNT(*) FILTER (WHERE status = 'failed') as failed
FROM sandbox_crawl_jobs
`;
  const { rows: jobRows } = await pool.query(jobSql);
  const jobCounts = jobRows[0];
  const jobReport = ` Sandbox Jobs:
- Pending: ${jobCounts.pending}
- Running: ${jobCounts.running}
- Completed: ${jobCounts.completed}
- Failed: ${jobCounts.failed}
`;
  console.log(jobReport);
}
/**
 * Script entry point.
 *
 * Prints current stats, then either processes pending jobs (`--process`) or
 * queues detection / production / sandbox work according to the flags, and
 * finally prints the stats again (skipped in dry-run mode).
 *
 * `--dry-run` now also applies to `--process`: previously the banner
 * promised "No changes will be made" while jobs were executed anyway.
 *
 * The pool is always closed in `finally`. Failures set `process.exitCode`
 * rather than calling `process.exit()`, which would terminate the process
 * before `finally` could run.
 */
async function main() {
  if (flags.help) {
    await showHelp();
    process.exit(0);
  }
  console.log('═══════════════════════════════════════════════════════');
  console.log(' Multi-Provider Crawler Queue Manager');
  console.log('═══════════════════════════════════════════════════════');
  if (flags.dryRun) {
    console.log('\n🔍 DRY RUN MODE - No changes will be made\n');
  }
  try {
    // Show current stats first
    await showStats();
    if (flags.process) {
      if (flags.dryRun) {
        // Processing runs crawlers and writes job results, so it cannot be previewed.
        console.log('\nSkipping job processing: --process has no effect in dry-run mode\n');
      } else {
        // Process mode - run jobs instead of queuing
        await processJobs();
      }
    } else {
      // Queuing mode
      let totalQueued = 0;
      if (flags.detection) {
        totalQueued += await queueDetectionJobs();
      }
      if (flags.production) {
        totalQueued += await queueProductionCrawls();
      }
      if (flags.sandbox) {
        totalQueued += await queueSandboxCrawls();
      }
      console.log('\n═══════════════════════════════════════════════════════');
      console.log(` Total dispensaries queued: ${totalQueued}`);
      console.log('═══════════════════════════════════════════════════════\n');
    }
    // Show updated stats
    if (!flags.dryRun) {
      await showStats();
    }
  } catch (error) {
    console.error('Fatal error:', error);
    process.exitCode = 1;
  } finally {
    await pool.end();
  }
}

// Do not leave the promise floating: a rejection (e.g. from pool.end()) must fail the run.
main().catch((error) => {
  console.error('Fatal error:', error);
  process.exitCode = 1;
});

View File

@@ -0,0 +1,583 @@
#!/usr/bin/env npx tsx
/**
* Queue Intelligence Script
*
* Orchestrates the multi-category intelligence crawler system:
* 1. Queue dispensaries that need provider detection (all 4 categories)
* 2. Queue per-category production crawls (Dutchie products only for now)
* 3. Queue per-category sandbox crawls (all providers)
*
* Each category (product, specials, brand, metadata) is handled independently.
* A failure in one category does NOT affect other categories.
*
* Usage:
* npx tsx src/scripts/queue-intelligence.ts [--detection] [--production] [--sandbox] [--all]
* npx tsx src/scripts/queue-intelligence.ts --category=product --sandbox
* npx tsx src/scripts/queue-intelligence.ts --process --category=product
* npx tsx src/scripts/queue-intelligence.ts --dry-run
*/
import { pool } from '../db/migrate';
import { logger } from '../services/logger';
import {
detectMultiCategoryProviders,
updateAllCategoryProviders,
IntelligenceCategory,
} from '../services/intelligence-detector';
import {
runCrawlProductsJob,
runCrawlSpecialsJob,
runCrawlBrandIntelligenceJob,
runCrawlMetadataJob,
runSandboxProductsJob,
runSandboxSpecialsJob,
runSandboxBrandJob,
runSandboxMetadataJob,
runAllCategoryProductionCrawls,
runAllCategorySandboxCrawls,
processCategorySandboxJobs,
} from '../services/category-crawler-jobs';
// Parse command line args
const args = process.argv.slice(2);

// Declared before `flags` so --category can be validated against it.
const CATEGORIES: IntelligenceCategory[] = ['product', 'specials', 'brand', 'metadata'];

/**
 * Parse the value of a numeric `--name=N` option.
 *
 * Accepts only plain non-negative decimal integers; anything else
 * ("abc", "-5", "12px") produces a warning and the fallback, instead of a
 * NaN or negative number reaching a SQL `LIMIT` or id comparison.
 *
 * @param raw - Text after the `=` sign, or undefined when the option is absent.
 * @param fallback - Value used when the option is absent, empty or invalid.
 * @returns The parsed integer or `fallback`.
 */
function parseNonNegativeIntArg(raw: string | undefined, fallback: number): number {
  if (raw === undefined || raw === '') {
    return fallback;
  }
  const parsed = /^\d+$/.test(raw) ? Number.parseInt(raw, 10) : NaN;
  if (!Number.isSafeInteger(parsed)) {
    console.warn(`Ignoring invalid numeric option value "${raw}"; using ${fallback}`);
    return fallback;
  }
  return parsed;
}

/** Text after `prefix` of the first argument starting with it, or undefined. */
function optionValue(prefix: string): string | undefined {
  const match = args.find((a) => a.startsWith(prefix));
  return match === undefined ? undefined : match.slice(prefix.length);
}

/** Runtime check that a string is one of the known categories. */
function isIntelligenceCategory(value: string): value is IntelligenceCategory {
  return (CATEGORIES as readonly string[]).includes(value);
}

// The category is later interpolated into SQL column names, so it must come
// from the fixed list. An empty value is treated as "not given", as before.
const rawCategory = optionValue('--category=');
if (rawCategory !== undefined && rawCategory !== '' && !isIntelligenceCategory(rawCategory)) {
  console.error(`Invalid --category "${rawCategory}". Expected one of: ${CATEGORIES.join(', ')}`);
  process.exit(1);
}
const selectedCategory: IntelligenceCategory | undefined =
  rawCategory !== undefined && isIntelligenceCategory(rawCategory) ? rawCategory : undefined;

const flags = {
  detection: args.includes('--detection') || args.includes('--all'),
  production: args.includes('--production') || args.includes('--all'),
  sandbox: args.includes('--sandbox') || args.includes('--all'),
  dryRun: args.includes('--dry-run'),
  process: args.includes('--process'),
  help: args.includes('--help') || args.includes('-h'),
  limit: parseNonNegativeIntArg(optionValue('--limit='), 10),
  category: selectedCategory,
  // 0 means "no specific dispensary"
  dispensary: parseNonNegativeIntArg(optionValue('--dispensary='), 0),
};
// If no specific flags, default to all
if (!flags.detection && !flags.production && !flags.sandbox && !flags.process) {
  flags.detection = true;
  flags.production = true;
  flags.sandbox = true;
}
/** Print the command-line usage text to stdout. */
async function showHelp() {
  const usageText = `
Queue Intelligence - Multi-Category Crawler Orchestration
USAGE:
npx tsx src/scripts/queue-intelligence.ts [OPTIONS]
OPTIONS:
--detection Queue dispensaries that need multi-category detection
--production Queue per-category production crawls
--sandbox Queue per-category sandbox crawls
--all Queue all job types (default if no specific flag)
--process Process queued jobs instead of just queuing
--category=CATEGORY Filter to specific category (product|specials|brand|metadata)
--dispensary=ID Process only a specific dispensary
--dry-run Show what would be queued without making changes
--limit=N Maximum dispensaries to queue per type (default: 10)
--help, -h Show this help message
CATEGORIES:
product - Product/menu data (Dutchie=production, others=sandbox)
specials - Deals and specials (all sandbox for now)
brand - Brand intelligence (all sandbox for now)
metadata - Categories/taxonomy (all sandbox for now)
EXAMPLES:
# Queue all dispensaries for appropriate jobs
npx tsx src/scripts/queue-intelligence.ts
# Only queue product detection jobs
npx tsx src/scripts/queue-intelligence.ts --detection --category=product
# Process sandbox jobs for specials category
npx tsx src/scripts/queue-intelligence.ts --process --category=specials --limit=5
# Run full detection for a specific dispensary
npx tsx src/scripts/queue-intelligence.ts --process --detection --dispensary=123
# Dry run to see what would be queued
npx tsx src/scripts/queue-intelligence.ts --dry-run
`;
  console.log(usageText);
}
/**
 * Queue provider-detection jobs, one per category, for dispensaries where a
 * category has no provider yet or a confidence below 70.
 *
 * Changes compared with the previous behavior:
 * - `--category` is honored (the help text documents
 *   `--detection --category=product`); without it all categories are used.
 * - A category is skipped when it already has a pending or running detection
 *   job. `ON CONFLICT DO NOTHING` is kept, but it only deduplicates if the
 *   table has a matching unique constraint, which is not visible from here.
 * - A dispensary is counted only when at least one job was created for it.
 *
 * @returns Number of dispensaries that received at least one new job (or,
 *   in dry-run mode, that need detection for the selected categories).
 */
async function queueMultiCategoryDetection(): Promise<number> {
  console.log('\n📡 Queueing Multi-Category Detection Jobs...');
  const targetCategories: IntelligenceCategory[] = flags.category
    ? CATEGORIES.filter((c) => c === flags.category)
    : CATEGORIES;

  // Candidates: have a website or menu URL, and at least one category with a
  // missing provider or confidence < 70.
  const query = `
    SELECT id, name, website, menu_url,
           product_provider, product_confidence, product_crawler_mode,
           specials_provider, specials_confidence, specials_crawler_mode,
           brand_provider, brand_confidence, brand_crawler_mode,
           metadata_provider, metadata_confidence, metadata_crawler_mode
    FROM dispensaries
    WHERE (website IS NOT NULL OR menu_url IS NOT NULL)
      AND (
        product_provider IS NULL OR product_confidence < 70 OR
        specials_provider IS NULL OR specials_confidence < 70 OR
        brand_provider IS NULL OR brand_confidence < 70 OR
        metadata_provider IS NULL OR metadata_confidence < 70
      )
    ORDER BY
      CASE WHEN product_provider IS NULL THEN 0 ELSE 1 END,
      product_confidence ASC
    LIMIT $1
  `;
  const result = await pool.query(query, [flags.limit]);

  // Selected categories that still need detection for one result row.
  const categoriesNeedingDetection = (row: Record<string, unknown>): IntelligenceCategory[] =>
    targetCategories.filter((category) => {
      const provider = row[`${category}_provider`];
      const confidence = row[`${category}_confidence`];
      // A null confidence counts as "needs detection", as it did before.
      return !provider || confidence === null || confidence === undefined || Number(confidence) < 70;
    });

  if (flags.dryRun) {
    const candidates = result.rows
      .map((row) => ({ row, needs: categoriesNeedingDetection(row) }))
      .filter((entry) => entry.needs.length > 0);
    console.log(` Would queue ${candidates.length} dispensaries for multi-category detection:`);
    for (const { row, needs } of candidates) {
      console.log(` - [${row.id}] ${row.name} (needs: ${needs.join(', ')})`);
    }
    return candidates.length;
  }

  let queued = 0;
  for (const dispensary of result.rows) {
    try {
      let jobsCreated = 0;
      for (const category of categoriesNeedingDetection(dispensary)) {
        // Skip categories that already have an open detection job.
        const existing = await pool.query(
          `SELECT 1 FROM sandbox_crawl_jobs
           WHERE dispensary_id = $1
             AND category = $2
             AND job_type = 'detection'
             AND status IN ('pending', 'running')
           LIMIT 1`,
          [dispensary.id, category]
        );
        if (existing.rows.length > 0) {
          continue;
        }
        const inserted = await pool.query(
          `INSERT INTO sandbox_crawl_jobs (dispensary_id, category, job_type, status, priority)
           VALUES ($1, $2, 'detection', 'pending', 10)
           ON CONFLICT DO NOTHING`,
          [dispensary.id, category]
        );
        jobsCreated += inserted.rowCount ?? 0;
      }
      if (jobsCreated === 0) {
        console.log(` - Skipped [${dispensary.id}] ${dispensary.name}: nothing new to queue`);
        continue;
      }
      console.log(` ✓ Queued detection: [${dispensary.id}] ${dispensary.name}`);
      queued++;
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      console.error(` ✗ Failed to queue [${dispensary.id}]: ${message}`);
    }
  }
  return queued;
}
/**
 * Queue per-category production crawls.
 *
 * Only `product` has a production crawler (Dutchie); every other category is
 * reported and skipped. Candidates use Dutchie in production mode with
 * confidence >= 70 and were last scanned more than four hours ago (or never).
 *
 * Changes compared with the previous behavior:
 * - The category parameter is cast to text inside `jsonb_build_object`. It is
 *   used nowhere else in the statement, and PostgreSQL cannot infer the type
 *   of a parameter that appears only as a VARIADIC "any" argument.
 * - A dispensary is counted only when a job row was inserted, i.e. when it
 *   resolved to a store.
 * - A store that already has a pending crawl job is not queued again.
 * - The store is chosen deterministically, preferring the explicit
 *   `stores.dispensary_id` link over the URL and loose name matches.
 *   NOTE(review): confirm that linking column exists wherever this runs.
 *
 * @param category - Restrict to one category; all categories when omitted.
 * @returns Number of dispensaries queued (or that would be queued in dry-run).
 */
async function queueCategoryProductionCrawls(category?: IntelligenceCategory): Promise<number> {
  const categories = category ? [category] : CATEGORIES;
  let totalQueued = 0;
  for (const cat of categories) {
    console.log(`\n🏭 Queueing Production ${cat.toUpperCase()} Crawls...`);
    // For now, only products have production-ready crawlers (Dutchie only).
    // This guard also ensures only the literal 'product' reaches the
    // interpolated column names below.
    if (cat !== 'product') {
      console.log(` ⏭️ No production crawler for ${cat} yet - skipping`);
      continue;
    }
    // Find dispensaries ready for production crawl
    const query = `
      SELECT id, name, ${cat}_provider as provider, last_${cat}_scan_at as last_scan
      FROM dispensaries
      WHERE ${cat}_provider = 'dutchie'
        AND ${cat}_crawler_mode = 'production'
        AND ${cat}_confidence >= 70
        AND (last_${cat}_scan_at IS NULL OR last_${cat}_scan_at < NOW() - INTERVAL '4 hours')
      ORDER BY
        CASE WHEN last_${cat}_scan_at IS NULL THEN 0 ELSE 1 END,
        last_${cat}_scan_at ASC
      LIMIT $1
    `;
    const result = await pool.query(query, [flags.limit]);
    if (flags.dryRun) {
      console.log(` Would queue ${result.rows.length} dispensaries for ${cat} production crawl:`);
      for (const row of result.rows) {
        const lastScan = row.last_scan ? new Date(row.last_scan).toISOString() : 'never';
        console.log(` - [${row.id}] ${row.name} (provider: ${row.provider}, last: ${lastScan})`);
      }
      totalQueued += result.rows.length;
      continue;
    }
    for (const dispensary of result.rows) {
      try {
        // For products, use the existing crawl_jobs table for production
        const inserted = await pool.query(
          `INSERT INTO crawl_jobs (store_id, job_type, trigger_type, status, priority, metadata)
           SELECT s.id, 'full_crawl', 'scheduled', 'pending', 50,
                  jsonb_build_object('dispensary_id', d.id, 'category', $2::text, 'source', 'queue-intelligence')
           FROM stores s
           JOIN dispensaries d ON (
             s.dispensary_id = d.id
             OR d.menu_url = s.dutchie_plus_url
             OR d.name ILIKE '%' || s.name || '%'
           )
           WHERE d.id = $1
             AND NOT EXISTS (
               SELECT 1 FROM crawl_jobs cj
               WHERE cj.store_id = s.id AND cj.status = 'pending'
             )
           ORDER BY
             CASE
               WHEN s.dispensary_id = d.id THEN 0
               WHEN d.menu_url = s.dutchie_plus_url THEN 1
               ELSE 2
             END,
             s.id
           LIMIT 1`,
          [dispensary.id, cat]
        );
        if ((inserted.rowCount ?? 0) === 0) {
          console.log(` - Skipped [${dispensary.id}] ${dispensary.name}: no matching store, or a crawl is already pending`);
          continue;
        }
        console.log(` ✓ Queued ${cat} production: [${dispensary.id}] ${dispensary.name}`);
        totalQueued++;
      } catch (error: unknown) {
        const message = error instanceof Error ? error.message : String(error);
        console.error(` ✗ Failed to queue [${dispensary.id}]: ${message}`);
      }
    }
  }
  return totalQueued;
}
/**
 * Queue per-category sandbox crawls.
 *
 * For each category, selects dispensaries whose crawler mode for that
 * category is `sandbox`, that have a detected provider and a URL, and that
 * have no pending/running sandbox job for the category. Each one gets a
 * `crawler_sandboxes` row (created or touched) and a pending `crawl` job.
 *
 * The category name is interpolated into column names, which cannot be bound
 * as query parameters. It is therefore checked against `CATEGORIES` at
 * runtime: the TypeScript type alone does not constrain a value that came
 * from the command line.
 *
 * @param category - Restrict to one category; all categories when omitted.
 * @returns Number of sandbox crawls queued (or that would be queued in dry-run).
 */
async function queueCategorySandboxCrawls(category?: IntelligenceCategory): Promise<number> {
  const categories = category ? [category] : CATEGORIES;
  let totalQueued = 0;
  for (const cat of categories) {
    // Allow-list check before the value is used to build SQL identifiers.
    if (!CATEGORIES.includes(cat)) {
      console.error(`\n ✗ Unknown category "${String(cat)}" - skipping`);
      continue;
    }
    console.log(`\n🧪 Queueing Sandbox ${cat.toUpperCase()} Crawls...`);
    // Find dispensaries in sandbox mode for this category
    const query = `
      SELECT d.id, d.name, d.${cat}_provider as provider, d.${cat}_confidence as confidence,
             d.website, d.menu_url
      FROM dispensaries d
      WHERE d.${cat}_crawler_mode = 'sandbox'
        AND d.${cat}_provider IS NOT NULL
        AND (d.website IS NOT NULL OR d.menu_url IS NOT NULL)
        AND NOT EXISTS (
          SELECT 1 FROM sandbox_crawl_jobs sj
          WHERE sj.dispensary_id = d.id
            AND sj.category = $1
            AND sj.status IN ('pending', 'running')
        )
      ORDER BY d.${cat}_confidence DESC, d.updated_at ASC
      LIMIT $2
    `;
    const result = await pool.query(query, [cat, flags.limit]);
    if (flags.dryRun) {
      console.log(` Would queue ${result.rows.length} dispensaries for ${cat} sandbox crawl:`);
      for (const row of result.rows) {
        console.log(` - [${row.id}] ${row.name} (provider: ${row.provider}, confidence: ${row.confidence}%)`);
      }
      totalQueued += result.rows.length;
      continue;
    }
    for (const dispensary of result.rows) {
      try {
        // Create sandbox entry if needed; an existing active one is only touched.
        const sandboxResult = await pool.query(
          `INSERT INTO crawler_sandboxes (dispensary_id, category, suspected_menu_provider, mode, status)
           VALUES ($1, $2, $3, 'template_learning', 'pending')
           ON CONFLICT (dispensary_id, category) WHERE status NOT IN ('moved_to_production', 'failed')
           DO UPDATE SET updated_at = NOW()
           RETURNING id`,
          [dispensary.id, cat, dispensary.provider]
        );
        // Explicit null (rather than undefined) when no id came back.
        const sandboxId = sandboxResult.rows[0]?.id ?? null;
        // Create sandbox job
        await pool.query(
          `INSERT INTO sandbox_crawl_jobs (dispensary_id, sandbox_id, category, job_type, status, priority)
           VALUES ($1, $2, $3, 'crawl', 'pending', 5)`,
          [dispensary.id, sandboxId, cat]
        );
        console.log(` ✓ Queued ${cat} sandbox: [${dispensary.id}] ${dispensary.name} (${dispensary.provider})`);
        totalQueued++;
      } catch (error: unknown) {
        const message = error instanceof Error ? error.message : String(error);
        console.error(` ✗ Failed to queue [${dispensary.id}]: ${message}`);
      }
    }
  }
  return totalQueued;
}
/**
 * Mark a dispensary's detection jobs that are currently in `fromStatus` as failed.
 *
 * Best-effort: if the status update itself fails, the problem is logged
 * instead of thrown so the remaining jobs in the batch are still processed.
 */
async function markDetectionJobsFailed(
  dispensaryId: number,
  fromStatus: 'pending' | 'running',
  message: string
): Promise<void> {
  try {
    await pool.query(
      [
        `UPDATE sandbox_crawl_jobs SET status = 'failed', error_message = $1`,
        `WHERE dispensary_id = $2 AND job_type = 'detection' AND status = $3`,
      ].join('\n'),
      [message, dispensaryId, fromStatus]
    );
  } catch (updateError: unknown) {
    const reason = updateError instanceof Error ? updateError.message : String(updateError);
    console.error(` ✗ Could not mark detection jobs as failed: ${reason}`);
  }
}

/**
 * Process pending `detection` jobs from `sandbox_crawl_jobs`.
 *
 * Selects up to `flags.limit` distinct dispensaries with pending detection
 * jobs (optionally narrowed by `flags.category` and `flags.dispensary`). For
 * each one it runs `detectMultiCategoryProviders` against the dispensary's
 * website (falling back to its menu URL), persists the result through
 * `updateAllCategoryProviders`, and marks that dispensary's detection jobs
 * `completed` with a per-category provider/confidence summary.
 *
 * Failure handling:
 * - Dispensaries that no longer exist or have no URL get their pending jobs
 *   marked `failed`, so they are not re-selected on every run.
 * - An error during detection marks the running jobs `failed` and processing
 *   continues with the next dispensary.
 *
 * When `flags.dryRun` is set, the matching dispensaries are only listed; no
 * detection is run and no rows are modified.
 */
async function processDetectionJobs(): Promise<void> {
  console.log('\n🔍 Processing Detection Jobs...');

  // $1 is always the LIMIT; each optional filter takes the next free placeholder.
  const params: unknown[] = [flags.limit];
  let categoryFilter = '';
  if (flags.category) {
    params.push(flags.category);
    categoryFilter = `AND category = $${params.length}`;
  }
  let dispensaryFilter = '';
  if (flags.dispensary) {
    params.push(flags.dispensary);
    dispensaryFilter = `AND dispensary_id = $${params.length}`;
  }

  // Get pending detection jobs
  const jobs = await pool.query(
    [
      'SELECT DISTINCT dispensary_id',
      'FROM sandbox_crawl_jobs',
      `WHERE job_type = 'detection' AND status = 'pending'`,
      categoryFilter,
      dispensaryFilter,
      'LIMIT $1',
    ].join('\n'),
    params
  );

  for (const job of jobs.rows) {
    const dispensaryId = job.dispensary_id;

    // Honor the dry-run promise: report what would happen, change nothing.
    if (flags.dryRun) {
      console.log(` Would run detection for dispensary ${dispensaryId}`);
      continue;
    }

    console.log(`\nProcessing detection for dispensary ${dispensaryId}...`);
    try {
      // Get dispensary info
      const dispResult = await pool.query(
        'SELECT id, name, website, menu_url FROM dispensaries WHERE id = $1',
        [dispensaryId]
      );
      const dispensary = dispResult.rows[0];
      if (!dispensary) {
        console.log(` ✗ Dispensary not found`);
        // Resolve the job instead of leaving it pending forever.
        await markDetectionJobsFailed(dispensaryId, 'pending', 'Dispensary not found');
        continue;
      }

      const websiteUrl = dispensary.website || dispensary.menu_url;
      if (!websiteUrl) {
        console.log(` ✗ No website URL`);
        // Resolve the job instead of leaving it pending forever.
        await markDetectionJobsFailed(dispensaryId, 'pending', 'No website URL');
        continue;
      }

      // Mark jobs as running
      await pool.query(
        [
          `UPDATE sandbox_crawl_jobs SET status = 'running', started_at = NOW()`,
          `WHERE dispensary_id = $1 AND job_type = 'detection' AND status = 'pending'`,
        ].join('\n'),
        [dispensaryId]
      );

      // Run multi-category detection
      console.log(` Detecting providers for ${dispensary.name}...`);
      const detection = await detectMultiCategoryProviders(websiteUrl, { timeout: 45000 });

      // Update all categories
      await updateAllCategoryProviders(dispensaryId, detection);

      // Mark jobs as completed, storing provider/confidence per category
      const summary = {
        product: { provider: detection.product.provider, confidence: detection.product.confidence },
        specials: { provider: detection.specials.provider, confidence: detection.specials.confidence },
        brand: { provider: detection.brand.provider, confidence: detection.brand.confidence },
        metadata: { provider: detection.metadata.provider, confidence: detection.metadata.confidence },
      };
      await pool.query(
        [
          `UPDATE sandbox_crawl_jobs SET status = 'completed', completed_at = NOW(),`,
          'result_summary = $1',
          `WHERE dispensary_id = $2 AND job_type = 'detection' AND status = 'running'`,
        ].join('\n'),
        [JSON.stringify(summary), dispensaryId]
      );

      console.log(` ✓ Detection complete:`);
      console.log(` Product: ${detection.product.provider} (${detection.product.confidence}%) -> ${detection.product.mode}`);
      console.log(` Specials: ${detection.specials.provider} (${detection.specials.confidence}%) -> ${detection.specials.mode}`);
      console.log(` Brand: ${detection.brand.provider} (${detection.brand.confidence}%) -> ${detection.brand.mode}`);
      console.log(` Metadata: ${detection.metadata.provider} (${detection.metadata.confidence}%) -> ${detection.metadata.mode}`);
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      console.log(` ✗ Error: ${message}`);
      // Only jobs already flipped to 'running' are failed; jobs still pending
      // (error before the status change) stay eligible for a later run.
      await markDetectionJobsFailed(dispensaryId, 'running', message);
    }
  }
}
/**
 * Run crawl jobs for the selected categories.
 *
 * Iterates over `flags.category` alone when it is set, otherwise over every
 * entry in `CATEGORIES`. For each category:
 * - sandbox jobs are processed when `--sandbox` was given or `--production`
 *   was not;
 * - production crawls run only for the `product` category, against Dutchie
 *   dispensaries in production mode with confidence of at least 70
 *   (optionally restricted to `flags.dispensary`), capped at `flags.limit`.
 */
async function processCrawlJobs(): Promise<void> {
  const targetCategories = flags.category ? [flags.category] : CATEGORIES;

  for (const cat of targetCategories) {
    console.log(`\n⚙ Processing ${cat.toUpperCase()} Crawl Jobs...\n`);

    // Sandbox work for this category
    const wantsSandbox = flags.sandbox || !flags.production;
    if (wantsSandbox) {
      await processCategorySandboxJobs(cat, flags.limit);
    }

    // Production work is limited to the product category
    const wantsProduction = flags.production && cat === 'product';
    if (!wantsProduction) {
      continue;
    }

    // Look up dispensaries eligible for a production crawl
    const selectSql = [
      'SELECT d.id',
      'FROM dispensaries d',
      `WHERE d.product_provider = 'dutchie'`,
      `AND d.product_crawler_mode = 'production'`,
      'AND d.product_confidence >= 70',
      flags.dispensary ? 'AND d.id = $2' : '',
      'LIMIT $1',
    ].join('\n');
    const selectParams = flags.dispensary ? [flags.limit, flags.dispensary] : [flags.limit];
    const { rows: eligible } = await pool.query(selectSql, selectParams);

    for (const { id } of eligible) {
      console.log(`Processing production ${cat} crawl for dispensary ${id}...`);
      const outcome = await runCrawlProductsJob(id);
      const mark = outcome.success ? '✓' : '✗';
      console.log(` ${mark} ${outcome.message}`);
    }
  }
}
/**
 * Process the single dispensary identified by `flags.dispensary`.
 *
 * Does nothing when no dispensary flag is set. Otherwise loads the dispensary
 * row, prints its name and website (or menu URL), and then, depending on the
 * flags:
 * - `--detection`: runs multi-category provider detection against the
 *   website/menu URL and stores the result; if the dispensary has no URL the
 *   skip is reported instead of passing silently;
 * - `--production`: runs production crawls for all categories;
 * - `--sandbox`: runs sandbox crawls for all categories.
 */
async function processSpecificDispensary(): Promise<void> {
  const dispensaryId = flags.dispensary;
  if (!dispensaryId) return;

  console.log(`\n🎯 Processing Dispensary ${dispensaryId}...\n`);

  const dispResult = await pool.query(
    'SELECT * FROM dispensaries WHERE id = $1',
    [dispensaryId]
  );
  if (dispResult.rows.length === 0) {
    console.log('Dispensary not found');
    return;
  }

  const dispensary = dispResult.rows[0];
  // Prefer the main website; fall back to the menu URL.
  const websiteUrl = dispensary.website || dispensary.menu_url;
  console.log(`Name: ${dispensary.name}`);
  console.log(`Website: ${websiteUrl || 'none'}`);
  console.log('');

  if (flags.detection) {
    console.log('Running multi-category detection...');
    if (websiteUrl) {
      const detection = await detectMultiCategoryProviders(websiteUrl);
      await updateAllCategoryProviders(dispensaryId, detection);
      console.log('Detection results:');
      console.log(` Product: ${detection.product.provider} (${detection.product.confidence}%) -> ${detection.product.mode}`);
      console.log(` Specials: ${detection.specials.provider} (${detection.specials.confidence}%) -> ${detection.specials.mode}`);
      console.log(` Brand: ${detection.brand.provider} (${detection.brand.confidence}%) -> ${detection.brand.mode}`);
      console.log(` Metadata: ${detection.metadata.provider} (${detection.metadata.confidence}%) -> ${detection.metadata.mode}`);
    } else {
      // Without a URL there is nothing to probe; say so rather than skipping silently.
      console.log(' ✗ No website URL - skipping detection');
    }
  }

  if (flags.production) {
    console.log('\nRunning production crawls...');
    const results = await runAllCategoryProductionCrawls(dispensaryId);
    console.log(` ${results.summary}`);
  }

  if (flags.sandbox) {
    console.log('\nRunning sandbox crawls...');
    const results = await runAllCategorySandboxCrawls(dispensaryId);
    console.log(` ${results.summary}`);
  }
}
/**
 * Print a summary of multi-category intelligence state.
 *
 * For each entry in `CATEGORIES`, queries `dispensaries` for provider counts
 * (Dutchie, Treez, other, unknown, none), crawler-mode counts (production,
 * sandbox) and the average confidence, then prints them. Afterwards prints
 * `sandbox_crawl_jobs` counts per category, broken down by status.
 */
async function showStats(): Promise<void> {
  console.log('\n📊 Multi-Category Intelligence Stats:');

  // Per-category stats
  for (const cat of CATEGORIES) {
    // Column names are derived from the category, e.g. product_provider.
    const providerCol = `${cat}_provider`;
    const modeCol = `${cat}_crawler_mode`;
    const statsSql = [
      '',
      'SELECT',
      'COUNT(*) as total,',
      `COUNT(*) FILTER (WHERE ${providerCol} IS NULL) as no_provider,`,
      `COUNT(*) FILTER (WHERE ${providerCol} = 'dutchie') as dutchie,`,
      `COUNT(*) FILTER (WHERE ${providerCol} = 'treez') as treez,`,
      `COUNT(*) FILTER (WHERE ${providerCol} NOT IN ('dutchie', 'treez', 'unknown') AND ${providerCol} IS NOT NULL) as other,`,
      `COUNT(*) FILTER (WHERE ${providerCol} = 'unknown') as unknown,`,
      `COUNT(*) FILTER (WHERE ${modeCol} = 'production') as production,`,
      `COUNT(*) FILTER (WHERE ${modeCol} = 'sandbox') as sandbox,`,
      `AVG(${cat}_confidence) as avg_confidence`,
      'FROM dispensaries',
      '',
    ].join('\n');

    const { rows } = await pool.query(statsSql);
    const s = rows[0];

    const report = [
      '',
      `${cat.toUpperCase()}:`,
      `Providers: Dutchie=${s.dutchie}, Treez=${s.treez}, Other=${s.other}, Unknown=${s.unknown}, None=${s.no_provider}`,
      `Modes: Production=${s.production}, Sandbox=${s.sandbox}`,
      // avg_confidence is falsy when there is nothing to average; show 0 then.
      `Avg Confidence: ${Math.round(s.avg_confidence || 0)}%`,
    ].join('\n');
    console.log(report);
  }

  // Job stats per category
  console.log('\n Sandbox Jobs by Category:');
  const jobStatsSql = [
    '',
    'SELECT',
    'category,',
    `COUNT(*) FILTER (WHERE status = 'pending') as pending,`,
    `COUNT(*) FILTER (WHERE status = 'running') as running,`,
    `COUNT(*) FILTER (WHERE status = 'completed') as completed,`,
    `COUNT(*) FILTER (WHERE status = 'failed') as failed`,
    'FROM sandbox_crawl_jobs',
    'GROUP BY category',
    'ORDER BY category',
    '',
  ].join('\n');

  const jobStats = await pool.query(jobStatsSql);
  for (const { category, pending, running, completed, failed } of jobStats.rows) {
    console.log(` ${category}: pending=${pending}, running=${running}, completed=${completed}, failed=${failed}`);
  }
}
/**
 * CLI entry point for the multi-category intelligence queue manager.
 *
 * Flow:
 * 1. `--help` prints usage and exits with code 0.
 * 2. Prints the banner plus dry-run / category notices and the current stats.
 * 3. With `--process`: handles a single dispensary when `flags.dispensary` is
 *    set, otherwise runs detection jobs (if `--detection`) followed by crawl
 *    jobs.
 *    Without `--process`: queues detection / production / sandbox work as
 *    requested by the flags and prints the total queued.
 * 4. Prints the stats again unless in dry-run mode.
 *
 * On a fatal error the connection pool is closed first and the process then
 * exits with code 1. Calling `process.exit` directly inside the `catch` would
 * terminate before the `finally` block could close the pool.
 */
async function main(): Promise<void> {
  if (flags.help) {
    await showHelp();
    process.exit(0);
  }

  console.log('═══════════════════════════════════════════════════════');
  console.log(' Multi-Category Intelligence Queue Manager');
  console.log('═══════════════════════════════════════════════════════');

  if (flags.dryRun) {
    console.log('\n🔍 DRY RUN MODE - No changes will be made\n');
  }
  if (flags.category) {
    console.log(`\n📌 Filtering to category: ${flags.category}\n`);
  }

  let exitCode = 0;
  try {
    // Show current stats first
    await showStats();

    if (flags.dispensary && flags.process) {
      // A specific dispensary was requested: process it directly
      await processSpecificDispensary();
    } else if (flags.process) {
      // Process mode - run jobs
      if (flags.detection) {
        await processDetectionJobs();
      }
      await processCrawlJobs();
    } else {
      // Queuing mode
      let totalQueued = 0;
      if (flags.detection) {
        totalQueued += await queueMultiCategoryDetection();
      }
      if (flags.production) {
        totalQueued += await queueCategoryProductionCrawls(flags.category);
      }
      if (flags.sandbox) {
        totalQueued += await queueCategorySandboxCrawls(flags.category);
      }
      console.log('\n═══════════════════════════════════════════════════════');
      console.log(` Total queued: ${totalQueued}`);
      console.log('═══════════════════════════════════════════════════════\n');
    }

    // Show updated stats
    if (!flags.dryRun) {
      await showStats();
    }
  } catch (error) {
    console.error('Fatal error:', error);
    // Defer the exit so the finally block can still release the pool.
    exitCode = 1;
  } finally {
    await pool.end();
  }

  if (exitCode !== 0) {
    process.exit(exitCode);
  }
}

// Run the script; anything that escapes main() (e.g. a failure while closing
// the pool) is reported and turned into a non-zero exit code.
main().catch((error: unknown) => {
  console.error('Fatal error:', error);
  process.exit(1);
});