#!/usr/bin/env npx tsx
/**
 * Unified Hydration CLI
 *
 * Central entrypoint for all hydration operations:
 *
 * MODES:
 *   payload  - Process raw_payloads → canonical tables (existing behavior)
 *   backfill - Migrate dutchie_* → canonical tables (legacy backfill)
 *   sync     - Sync recent crawls to canonical tables
 *   status   - Show hydration progress
 *
 * Usage:
 *   npx tsx src/scripts/run-hydration.ts --mode=<mode> [options]
 *
 * Examples:
 *   # Payload-based hydration (default)
 *   npx tsx src/scripts/run-hydration.ts --mode=payload
 *
 *   # Full legacy backfill
 *   npx tsx src/scripts/run-hydration.ts --mode=backfill
 *
 *   # Backfill single dispensary
 *   npx tsx src/scripts/run-hydration.ts --mode=backfill --store=123
 *
 *   # Sync recent crawls
 *   npx tsx src/scripts/run-hydration.ts --mode=sync --since="2 hours"
 *
 *   # Check status
 *   npx tsx src/scripts/run-hydration.ts --mode=status
 */

import { Pool } from 'pg';
import dotenv from 'dotenv';
import {
  HydrationWorker,
  runHydrationBatch,
  processPayloadById,
  reprocessFailedPayloads,
  getPayloadStats,
} from '../hydration';
import { runLegacyBackfill } from '../hydration/legacy-backfill';
import { syncRecentCrawls } from '../hydration/incremental-sync';

dotenv.config();

// ============================================================
// SHARED CONSTANTS & HELPERS
// ============================================================

/** Every mode the CLI accepts; `CliArgs['mode']` is derived from this list. */
const MODES = ['payload', 'backfill', 'sync', 'status'] as const;

/** Batch size used when --limit is not given. */
const DEFAULT_LIMIT = 50;

/** Maximum number of individual errors echoed to the console per run. */
const MAX_ERRORS_SHOWN = 10;

/** PostgreSQL SQLSTATE for "undefined_table" (relation does not exist). */
const PG_UNDEFINED_TABLE = '42P01';

/** Raised for invalid command-line input so main() can print usage alongside the message. */
class UsageError extends Error {
  constructor(message: string) {
    super(message);
    this.name = 'UsageError';
  }
}

/** Returns a printable message for any thrown value. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/** True when `error` carries the PostgreSQL "undefined_table" SQLSTATE code. */
function isUndefinedTableError(error: unknown): boolean {
  return (
    typeof error === 'object' &&
    error !== null &&
    (error as { code?: unknown }).code === PG_UNDEFINED_TABLE
  );
}

// ============================================================
// ARGUMENT PARSING
// ============================================================

interface CliArgs {
  mode: (typeof MODES)[number];
  store?: number;
  since?: string;
  dryRun: boolean;
  verbose: boolean;
  limit: number;
  loop: boolean;
  reprocess: boolean;
  payloadId?: string;
  startFrom?: number;
}

/** Type guard: is `value` one of the supported modes? */
function isMode(value: string): value is CliArgs['mode'] {
  return (MODES as readonly string[]).includes(value);
}

/**
 * Looks up the value of a `--name=value` option.
 *
 * @param args - Raw command-line arguments.
 * @param name - Option name without the leading dashes.
 * @param allowSeparate - Also accept the two-token form `--name value`.
 * @returns The raw string value (possibly empty), or undefined when absent.
 */
function readOption(
  args: readonly string[],
  name: string,
  allowSeparate = false,
): string | undefined {
  const prefix = `--${name}=`;
  const inline = args.find((arg) => arg.startsWith(prefix));
  if (inline !== undefined) {
    // Slice instead of split('=') so values containing '=' are kept intact.
    return inline.slice(prefix.length);
  }

  if (allowSeparate) {
    const idx = args.indexOf(`--${name}`);
    const next = idx === -1 ? undefined : args[idx + 1];
    if (next) {
      return next;
    }
  }
  return undefined;
}

/**
 * Parses a base-10 integer option value, rejecting anything that is not a
 * plain run of digits or that falls below `min`.
 *
 * @throws UsageError when the value is not an acceptable integer.
 */
function parseIntegerOption(flag: string, raw: string, min: number): number {
  const trimmed = raw.trim();
  const value = Number.parseInt(trimmed, 10);
  if (!/^\d+$/.test(trimmed) || !Number.isSafeInteger(value) || value < min) {
    throw new UsageError(`Invalid value for ${flag}: "${raw}" (expected an integer >= ${min})`);
  }
  return value;
}

/**
 * Builds the CLI configuration from command-line arguments.
 *
 * @param argv - Arguments to parse; defaults to the process arguments.
 * @returns The parsed options, with defaults applied for anything omitted.
 * @throws UsageError for an unknown --mode or a non-integer numeric option.
 *   (An unknown mode must not silently fall back to the default 'payload'
 *   mode: a typo would otherwise start a live hydration run.)
 */
function parseArgs(argv: readonly string[] = process.argv.slice(2)): CliArgs {
  // Defaults
  const result: CliArgs = {
    mode: 'payload',
    dryRun: argv.includes('--dry-run'),
    verbose: argv.includes('--verbose') || argv.includes('-v'),
    limit: DEFAULT_LIMIT,
    loop: argv.includes('--loop'),
    reprocess: argv.includes('--reprocess'),
  };

  // --mode=<mode>
  const mode = readOption(argv, 'mode');
  if (mode !== undefined) {
    if (!isMode(mode)) {
      throw new UsageError(`Unknown mode: ${mode}`);
    }
    result.mode = mode;
  }

  // --store=<id>
  const store = readOption(argv, 'store');
  if (store !== undefined) {
    result.store = parseIntegerOption('--store', store, 0);
  }

  // --since=<interval>; an empty value is treated as "not given"
  const since = readOption(argv, 'since');
  if (since) {
    result.since = since;
  }

  // --limit=<n> or --limit <n>
  const limit = readOption(argv, 'limit', true);
  if (limit !== undefined) {
    result.limit = parseIntegerOption('--limit', limit, 1);
  }

  // --payload=<id> or --payload <id>; an empty value is treated as "not given"
  const payloadId = readOption(argv, 'payload', true);
  if (payloadId) {
    result.payloadId = payloadId;
  }

  // --start-from=<id>
  const startFrom = readOption(argv, 'start-from');
  if (startFrom !== undefined) {
    result.startFrom = parseIntegerOption('--start-from', startFrom, 0);
  }

  return result;
}

// ============================================================
// DATABASE CONNECTION
// ============================================================

/**
 * Resolves the PostgreSQL connection string from the environment.
 *
 * Precedence: CANNAIQ_DB_URL, then the individual CANNAIQ_DB_* parts, then
 * DATABASE_URL (local development fallback).
 *
 * @throws Error when none of the supported variables are set.
 */
function getConnectionString(): string {
  if (process.env.CANNAIQ_DB_URL) {
    return process.env.CANNAIQ_DB_URL;
  }

  const host = process.env.CANNAIQ_DB_HOST;
  const port = process.env.CANNAIQ_DB_PORT;
  const name = process.env.CANNAIQ_DB_NAME;
  const user = process.env.CANNAIQ_DB_USER;
  const pass = process.env.CANNAIQ_DB_PASS;

  if (host && port && name && user && pass) {
    // Percent-encode the credentials and database name so characters such as
    // '@', ':', '/' or '#' cannot corrupt the URL structure.
    const auth = `${encodeURIComponent(user)}:${encodeURIComponent(pass)}`;
    return `postgresql://${auth}@${host}:${port}/${encodeURIComponent(name)}`;
  }

  // Fallback to DATABASE_URL for local development
  if (process.env.DATABASE_URL) {
    return process.env.DATABASE_URL;
  }

  throw new Error('Missing database connection environment variables');
}

// ============================================================
// MODE: PAYLOAD (existing behavior)
// ============================================================

/**
 * Payload mode: prints current payload stats, then runs exactly one of
 * (in priority order) a single-payload run (--payload), a reprocess of failed
 * payloads (--reprocess), a continuous worker loop (--loop), or one batch.
 */
async function runPayloadMode(pool: Pool, args: CliArgs): Promise<void> {
  console.log('='.repeat(60));
  console.log('HYDRATION - PAYLOAD MODE');
  console.log('='.repeat(60));
  console.log(`Dry run: ${args.dryRun}`);
  console.log(`Batch size: ${args.limit}`);
  console.log('');

  // Show current stats (best effort: a failure here must not abort the run)
  try {
    const stats = await getPayloadStats(pool);
    console.log('Current payload stats:');
    console.log(`  Total: ${stats.total}`);
    console.log(`  Processed: ${stats.processed}`);
    console.log(`  Unprocessed: ${stats.unprocessed}`);
    console.log(`  Failed: ${stats.failed}`);
    console.log('');
  } catch (error: unknown) {
    if (isUndefinedTableError(error)) {
      console.log('Note: raw_payloads table not found or empty');
    } else {
      console.log(`Note: could not load payload stats: ${describeError(error)}`);
    }
    console.log('');
  }

  if (args.payloadId) {
    // Process specific payload
    console.log(`Processing payload: ${args.payloadId}`);
    const result = await processPayloadById(pool, args.payloadId, { dryRun: args.dryRun });
    console.log('Result:', JSON.stringify(result, null, 2));
    return;
  }

  if (args.reprocess) {
    // Reprocess failed payloads
    console.log('Reprocessing failed payloads...');
    const result = await reprocessFailedPayloads(pool, {
      dryRun: args.dryRun,
      batchSize: args.limit,
    });
    console.log('Result:', JSON.stringify(result, null, 2));
    return;
  }

  if (args.loop) {
    // Run continuous loop until interrupted
    const worker = new HydrationWorker(pool, { dryRun: args.dryRun, batchSize: args.limit });
    process.on('SIGINT', () => {
      console.log('\nStopping hydration loop...');
      worker.stop();
    });
    // NOTE(review): 30000 is presumably the polling interval in milliseconds;
    // confirm against HydrationWorker.runLoop.
    await worker.runLoop(30000);
    return;
  }

  // Run single batch
  const result = await runHydrationBatch(pool, { dryRun: args.dryRun, batchSize: args.limit });
  console.log('Batch result:');
  console.log(`  Payloads processed: ${result.payloadsProcessed}`);
  console.log(`  Payloads failed: ${result.payloadsFailed}`);
  console.log(`  Products upserted: ${result.totalProductsUpserted}`);
  console.log(`  Snapshots created: ${result.totalSnapshotsCreated}`);
  console.log(`  Brands created: ${result.totalBrandsCreated}`);
  console.log(`  Duration: ${result.durationMs}ms`);

  if (result.errors.length > 0) {
    console.log('\nErrors:');
    for (const err of result.errors.slice(0, MAX_ERRORS_SHOWN)) {
      console.log(`  ${err.payloadId}: ${err.error}`);
    }
    // Make the truncation visible, as sync mode does.
    if (result.errors.length > MAX_ERRORS_SHOWN) {
      console.log(`  ... and ${result.errors.length - MAX_ERRORS_SHOWN} more`);
    }
  }
}

// ============================================================
// MODE: BACKFILL (legacy dutchie_* → canonical)
// ============================================================

/**
 * Backfill mode: prints the run configuration and delegates to
 * runLegacyBackfill, optionally restricted to one store and/or resumed from a
 * product ID.
 */
async function runBackfillMode(pool: Pool, args: CliArgs): Promise<void> {
  console.log('='.repeat(60));
  console.log('HYDRATION - BACKFILL MODE');
  console.log('='.repeat(60));
  console.log(`Mode: ${args.dryRun ? 'DRY RUN' : 'LIVE'}`);
  // Compare against undefined so an explicit 0 is still reported.
  if (args.store !== undefined) {
    console.log(`Store: ${args.store}`);
  }
  if (args.startFrom !== undefined) {
    console.log(`Start from product ID: ${args.startFrom}`);
  }
  console.log('');

  await runLegacyBackfill(pool, {
    dryRun: args.dryRun,
    verbose: args.verbose,
    dispensaryId: args.store,
    startFromProductId: args.startFrom,
  });
}

// ============================================================
// MODE: SYNC (recent crawls → canonical)
// ============================================================

/**
 * Sync mode: delegates to syncRecentCrawls for the requested time window
 * (default "1 hour") and prints a summary with up to MAX_ERRORS_SHOWN errors.
 */
async function runSyncMode(pool: Pool, args: CliArgs): Promise<void> {
  // `||` on purpose: an empty --since value should also fall back to the default.
  // NOTE(review): `since` is user input passed through unchanged; confirm that
  // syncRecentCrawls binds it as a query parameter rather than interpolating it.
  const since = args.since || '1 hour';

  console.log('='.repeat(60));
  console.log('HYDRATION - SYNC MODE');
  console.log('='.repeat(60));
  console.log(`Mode: ${args.dryRun ? 'DRY RUN' : 'LIVE'}`);
  console.log(`Since: ${since}`);
  console.log(`Limit: ${args.limit}`);
  if (args.store !== undefined) {
    console.log(`Store: ${args.store}`);
  }
  console.log('');

  const result = await syncRecentCrawls(pool, {
    dryRun: args.dryRun,
    verbose: args.verbose,
    since,
    dispensaryId: args.store,
    limit: args.limit,
  });

  console.log('');
  console.log('=== Sync Results ===');
  console.log(`Crawls synced: ${result.synced}`);
  console.log(`Errors: ${result.errors.length}`);

  if (result.errors.length > 0) {
    console.log('');
    console.log('Errors:');
    for (const error of result.errors.slice(0, MAX_ERRORS_SHOWN)) {
      console.log(`  - ${error}`);
    }
    if (result.errors.length > MAX_ERRORS_SHOWN) {
      console.log(`  ... and ${result.errors.length - MAX_ERRORS_SHOWN} more`);
    }
  }
}

// ============================================================
// MODE: STATUS
// ============================================================

/**
 * Prints `name: rowCount` for each table, one query per table.
 *
 * A failure on one table is reported inline and does not stop the others.
 * Table names are interpolated into SQL, so callers must pass only
 * hard-coded identifiers, never user input.
 */
async function printTableCounts(pool: Pool, tables: readonly string[]): Promise<void> {
  for (const table of tables) {
    try {
      const { rows } = await pool.query(`SELECT COUNT(*) as cnt FROM ${table}`);
      console.log(`${table}: ${rows[0].cnt}`);
    } catch (error: unknown) {
      if (isUndefinedTableError(error)) {
        console.log(`${table}: (table not found)`);
      } else {
        // Do not mislabel connection/permission problems as a missing table.
        console.log(`${table}: (error: ${describeError(error)})`);
      }
    }
  }
}

/**
 * Status mode: read-only report covering the v_hydration_status view (when
 * present), canonical and legacy table counts, crawl runs from the last 24
 * hours, and payload stats.
 */
async function runStatusMode(pool: Pool): Promise<void> {
  console.log('='.repeat(60));
  console.log('HYDRATION STATUS');
  console.log('='.repeat(60));
  console.log('');

  // Check if v_hydration_status view exists
  const viewExists = await pool.query(`
    SELECT EXISTS (
      SELECT 1 FROM pg_views WHERE viewname = 'v_hydration_status'
    ) as exists
  `);

  if (viewExists.rows[0].exists) {
    const { rows } = await pool.query('SELECT * FROM v_hydration_status');

    console.log('Hydration Progress:');
    console.log('-'.repeat(70));
    console.log(
      'Table'.padEnd(30) + 'Source'.padEnd(12) + 'Hydrated'.padEnd(12) + 'Progress',
    );
    console.log('-'.repeat(70));

    for (const row of rows) {
      // Only a missing percentage is "N/A"; a genuine 0 is printed as 0%.
      const progress = row.hydration_pct != null ? `${row.hydration_pct}%` : 'N/A';
      console.log(
        String(row.source_table).padEnd(30) +
          String(row.source_count).padEnd(12) +
          String(row.hydrated_count).padEnd(12) +
          progress,
      );
    }
    console.log('-'.repeat(70));
  } else {
    console.log('Note: v_hydration_status view not found. Run migration 052 first.');
  }

  // Get counts from canonical tables
  console.log('\nCanonical Table Counts:');
  console.log('-'.repeat(40));
  await printTableCounts(pool, ['store_products', 'store_product_snapshots', 'crawl_runs']);

  // Get legacy table counts
  console.log('\nLegacy Table Counts:');
  console.log('-'.repeat(40));
  await printTableCounts(pool, [
    'dutchie_products',
    'dutchie_product_snapshots',
    'dispensary_crawl_jobs',
  ]);

  // Show recent sync activity
  console.log('\nRecent Crawl Runs (last 24h):');
  console.log('-'.repeat(40));
  try {
    const { rows } = await pool.query(`
      SELECT status, COUNT(*) as count
      FROM crawl_runs
      WHERE started_at > NOW() - INTERVAL '24 hours'
      GROUP BY status
      ORDER BY count DESC
    `);

    if (rows.length === 0) {
      console.log('No crawl runs in last 24 hours');
    } else {
      for (const row of rows) {
        console.log(`${row.status}: ${row.count}`);
      }
    }
  } catch (error: unknown) {
    if (isUndefinedTableError(error)) {
      console.log('(crawl_runs table not found)');
    } else {
      console.log(`(could not query crawl_runs: ${describeError(error)})`);
    }
  }

  // Payload stats
  console.log('\nPayload Hydration:');
  console.log('-'.repeat(40));
  try {
    const stats = await getPayloadStats(pool);
    console.log(`Total payloads: ${stats.total}`);
    console.log(`Processed: ${stats.processed}`);
    console.log(`Unprocessed: ${stats.unprocessed}`);
    console.log(`Failed: ${stats.failed}`);
  } catch (error: unknown) {
    if (isUndefinedTableError(error)) {
      console.log('(raw_payloads table not found)');
    } else {
      console.log(`(could not load payload stats: ${describeError(error)})`);
    }
  }
}

// ============================================================
// HELP
// ============================================================

/** Prints the usage text to stdout. */
function showHelp(): void {
  console.log(`
Unified Hydration CLI

Usage:
  npx tsx src/scripts/run-hydration.ts --mode=<mode> [options]

Modes:
  payload     Process raw_payloads → canonical tables (default)
  backfill    Migrate dutchie_* → canonical tables
  sync        Sync recent crawls to canonical tables
  status      Show hydration progress

Common Options:
  --dry-run           Print changes without modifying database
  --verbose, -v       Show detailed progress
  --store=<id>        Limit to a single dispensary
  --limit=<n>         Batch size (default: 50)

Payload Mode Options:
  --loop              Run continuous hydration loop
  --reprocess         Reprocess failed payloads
  --payload=<id>      Process a specific payload by ID

Backfill Mode Options:
  --start-from=<id>   Resume from a specific product ID

Sync Mode Options:
  --since=<interval>  Time window (default: "1 hour")
                      Examples: "30 minutes", "2 hours", "1 day"

Examples:
  # Full legacy backfill (dutchie_* → canonical)
  npx tsx src/scripts/run-hydration.ts --mode=backfill

  # Backfill single dispensary (dry run)
  npx tsx src/scripts/run-hydration.ts --mode=backfill --store=123 --dry-run

  # Sync recent crawls from last 4 hours
  npx tsx src/scripts/run-hydration.ts --mode=sync --since="4 hours"

  # Sync single dispensary
  npx tsx src/scripts/run-hydration.ts --mode=sync --store=123

  # Run payload hydration loop
  npx tsx src/scripts/run-hydration.ts --mode=payload --loop

  # Check hydration status
  npx tsx src/scripts/run-hydration.ts --mode=status
`);
}

// ============================================================
// MAIN
// ============================================================

/**
 * CLI entry point: handles --help, parses arguments, opens the connection
 * pool, dispatches to the selected mode, and always closes the pool before
 * the process exits (exit code 1 on any failure).
 */
async function main(): Promise<void> {
  const rawArgs = process.argv.slice(2);

  if (rawArgs.includes('--help') || rawArgs.includes('-h')) {
    showHelp();
    return;
  }

  let pool: Pool | undefined;
  let failed = false;

  try {
    const args = parseArgs(rawArgs);

    pool = new Pool({
      connectionString: getConnectionString(),
      max: 5,
    });

    // Verify connection
    await pool.query('SELECT 1');
    console.log('Database connection: OK\n');

    switch (args.mode) {
      case 'payload':
        await runPayloadMode(pool, args);
        break;
      case 'backfill':
        await runBackfillMode(pool, args);
        break;
      case 'sync':
        await runSyncMode(pool, args);
        break;
      case 'status':
        await runStatusMode(pool);
        break;
      default: {
        // Compile-time exhaustiveness check; parseArgs already rejects unknown modes.
        const unexpected: never = args.mode;
        throw new UsageError(`Unknown mode: ${String(unexpected)}`);
      }
    }
  } catch (error: unknown) {
    console.error('Error:', describeError(error));
    if (error instanceof UsageError) {
      showHelp();
    }
    failed = true;
  } finally {
    // Calling process.exit() inside the try/catch would skip this block and
    // leave connections open, so the exit is deferred until the pool is closed.
    if (pool) {
      await pool.end();
    }
  }

  if (failed) {
    process.exit(1);
  }
}

main().catch((error: unknown) => {
  console.error('Error:', describeError(error));
  process.exit(1);
});