- Moved hydration module back from _deprecated (needed for product_refresh) - Restored product_refresh handler for processing stored payloads - Restored geolocation service for findadispo/findagram - Stubbed system routes that depend on deprecated SyncOrchestrator - Removed crawler-sandbox route (deprecated) - Fixed all TypeScript compilation errors 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
511 lines
15 KiB
TypeScript
511 lines
15 KiB
TypeScript
#!/usr/bin/env npx tsx
|
|
/**
 * Unified Hydration CLI
 *
 * Central entrypoint for all hydration operations.
 *
 * MODES:
 *   payload  - Process raw_payloads → canonical tables (existing behavior)
 *   backfill - Migrate dutchie_* → canonical tables (legacy backfill)
 *   sync     - Sync recent crawls to canonical tables
 *   status   - Show hydration progress
 *
 * Usage:
 *   npx tsx src/scripts/run-hydration.ts --mode=<mode> [options]
 *
 * Examples:
 *   # Payload-based hydration (default)
 *   npx tsx src/scripts/run-hydration.ts --mode=payload
 *
 *   # Full legacy backfill
 *   npx tsx src/scripts/run-hydration.ts --mode=backfill
 *
 *   # Backfill single dispensary
 *   npx tsx src/scripts/run-hydration.ts --mode=backfill --store=123
 *
 *   # Sync recent crawls
 *   npx tsx src/scripts/run-hydration.ts --mode=sync --since="2 hours"
 *
 *   # Check status
 *   npx tsx src/scripts/run-hydration.ts --mode=status
 */
|
|
|
|
import { Pool } from 'pg';
|
|
import dotenv from 'dotenv';
|
|
import {
|
|
HydrationWorker,
|
|
runHydrationBatch,
|
|
processPayloadById,
|
|
reprocessFailedPayloads,
|
|
getPayloadStats,
|
|
} from '../hydration';
|
|
import { runLegacyBackfill } from '../hydration/legacy-backfill';
|
|
import { syncRecentCrawls } from '../hydration/incremental-sync';
|
|
|
|
// Load variables from a .env file (dotenv's default) into process.env before
// getConnectionString() reads the database settings.
dotenv.config();
|
|
|
|
// ============================================================
|
|
// ARGUMENT PARSING
|
|
// ============================================================
|
|
|
|
/** Options collected from the command line by `parseArgs`. */
interface CliArgs {
  /** Hydration operation to run; `payload` is the default. */
  mode: 'payload' | 'backfill' | 'sync' | 'status';

  /** Value of `--store=<id>`; forwarded as `dispensaryId` in backfill and sync modes. */
  store?: number;

  /** Value of `--since=<interval>`; sync mode uses "1 hour" when it is absent. */
  since?: string;

  /** True when `--dry-run` was passed. */
  dryRun: boolean;

  /** True when `--verbose` or `-v` was passed. */
  verbose: boolean;

  /** Value of `--limit` (default 50); batch size in payload mode, limit in sync mode. */
  limit: number;

  /** True when `--loop` was passed (payload mode only). */
  loop: boolean;

  /** True when `--reprocess` was passed (payload mode only). */
  reprocess: boolean;

  /** Value of `--payload`; when set, payload mode processes just this payload. */
  payloadId?: string;

  /** Value of `--start-from=<id>`; forwarded as `startFromProductId` in backfill mode. */
  startFrom?: number;
}
|
|
|
|
/**
 * Looks up the value of a `--name=value` option. When `allowSeparate` is set,
 * the two-token form `--name value` is accepted as a fallback.
 *
 * Everything after the first `=` is returned, so values may contain `=`.
 * Returns `undefined` when the option is not present.
 */
function readOptionValue(
  argv: readonly string[],
  name: string,
  allowSeparate = false,
): string | undefined {
  const prefix = `--${name}=`;
  const inline = argv.find(a => a.startsWith(prefix));
  if (inline !== undefined) {
    return inline.slice(prefix.length);
  }

  if (allowSeparate) {
    const idx = argv.indexOf(`--${name}`);
    const next = idx === -1 ? undefined : argv[idx + 1];
    if (next) {
      return next;
    }
  }

  return undefined;
}

/**
 * Converts an option value to an integer, rejecting anything that is not a
 * plain run of decimal digits or that is smaller than `min`.
 *
 * @throws Error naming the offending flag when the value is invalid.
 */
function parseIntegerOption(raw: string, flag: string, min: number): number {
  const text = raw.trim();
  const value = /^\d+$/.test(text) ? Number(text) : Number.NaN;
  if (!Number.isSafeInteger(value) || value < min) {
    throw new Error(`Invalid value for ${flag}: "${raw}" (expected an integer >= ${min})`);
  }
  return value;
}

/**
 * Parses `process.argv` into a `CliArgs` object.
 *
 * Boolean flags (`--dry-run`, `--verbose`/`-v`, `--loop`, `--reprocess`) are
 * detected by presence. `--limit` and `--payload` accept both `--x=value` and
 * `--x value`; the remaining options accept only the `=` form.
 *
 * @returns The parsed options; optional fields are left unset when the
 *          corresponding option was not given.
 * @throws Error when `--mode` is not one of the supported modes, or when
 *         `--store`, `--limit` or `--start-from` is not a valid integer.
 *         Failing loudly avoids running the default (database-writing) mode
 *         or an unfiltered run because of a typo.
 */
function parseArgs(): CliArgs {
  const args = process.argv.slice(2);
  const modes = ['payload', 'backfill', 'sync', 'status'] as const;

  // Defaults
  const result: CliArgs = {
    mode: 'payload',
    dryRun: args.includes('--dry-run'),
    verbose: args.includes('--verbose') || args.includes('-v'),
    limit: 50,
    loop: args.includes('--loop'),
    reprocess: args.includes('--reprocess'),
  };

  // --mode=<value>
  const rawMode = readOptionValue(args, 'mode');
  if (rawMode !== undefined) {
    const mode = modes.find(m => m === rawMode);
    if (mode === undefined) {
      throw new Error(`Unknown mode: "${rawMode}" (expected one of: ${modes.join(', ')})`);
    }
    result.mode = mode;
  }

  // --store=<id>
  const rawStore = readOptionValue(args, 'store');
  if (rawStore !== undefined) {
    result.store = parseIntegerOption(rawStore, '--store', 0);
  }

  // --since=<value>
  const rawSince = readOptionValue(args, 'since');
  if (rawSince !== undefined) {
    result.since = rawSince;
  }

  // --limit=<value> or --limit <value>
  const rawLimit = readOptionValue(args, 'limit', true);
  if (rawLimit !== undefined) {
    result.limit = parseIntegerOption(rawLimit, '--limit', 1);
  }

  // --payload=<id> or --payload <id>
  const rawPayload = readOptionValue(args, 'payload', true);
  if (rawPayload !== undefined) {
    result.payloadId = rawPayload;
  }

  // --start-from=<id>
  const rawStart = readOptionValue(args, 'start-from');
  if (rawStart !== undefined) {
    result.startFrom = parseIntegerOption(rawStart, '--start-from', 0);
  }

  return result;
}
|
|
|
|
// ============================================================
|
|
// DATABASE CONNECTION
|
|
// ============================================================
|
|
|
|
/**
 * Resolves the PostgreSQL connection string from the environment.
 *
 * Precedence:
 *   1. `CANNAIQ_DB_URL`, used verbatim.
 *   2. `CANNAIQ_DB_HOST` / `_PORT` / `_NAME` / `_USER` / `_PASS`, when all
 *      five are set, assembled into a `postgresql://` URI.
 *   3. `DATABASE_URL`, used verbatim (fallback for local development).
 *
 * @returns The connection string.
 * @throws Error when none of the above is configured.
 */
function getConnectionString(): string {
  const env = process.env;

  if (env.CANNAIQ_DB_URL) {
    return env.CANNAIQ_DB_URL;
  }

  const host = env.CANNAIQ_DB_HOST;
  const port = env.CANNAIQ_DB_PORT;
  const name = env.CANNAIQ_DB_NAME;
  const user = env.CANNAIQ_DB_USER;
  const pass = env.CANNAIQ_DB_PASS;

  if (host && port && name && user && pass) {
    // Credentials are percent-encoded so that characters such as '@', ':',
    // '/' or '#' in a password cannot be mistaken for URI delimiters.
    const credentials = `${encodeURIComponent(user)}:${encodeURIComponent(pass)}`;
    return `postgresql://${credentials}@${host}:${port}/${name}`;
  }

  // Fallback to DATABASE_URL for local development
  if (env.DATABASE_URL) {
    return env.DATABASE_URL;
  }

  throw new Error('Missing database connection environment variables');
}
|
|
|
|
// ============================================================
|
|
// MODE: PAYLOAD (existing behavior)
|
|
// ============================================================
|
|
|
|
/**
 * Payload mode: hydrates canonical tables from stored raw payloads.
 *
 * After printing the current payload statistics (best effort), exactly one of
 * the following runs, in this order of precedence:
 *   - `args.payloadId` set → process that single payload;
 *   - `args.reprocess`     → reprocess failed payloads;
 *   - `args.loop`          → run a HydrationWorker loop until SIGINT;
 *   - otherwise            → run one batch and print its summary.
 *
 * @param pool Database pool passed through to the hydration functions.
 * @param args Parsed CLI options; `limit` is used as the batch size.
 */
async function runPayloadMode(pool: Pool, args: CliArgs): Promise<void> {
  const maxErrorsShown = 10;

  console.log('='.repeat(60));
  console.log('HYDRATION - PAYLOAD MODE');
  console.log('='.repeat(60));
  console.log(`Dry run: ${args.dryRun}`);
  console.log(`Batch size: ${args.limit}`);
  console.log('');

  // Show current stats. This is informational only, so a failure here must
  // not prevent the actual hydration from running.
  try {
    const stats = await getPayloadStats(pool);
    console.log('Current payload stats:');
    console.log(`  Total: ${stats.total}`);
    console.log(`  Processed: ${stats.processed}`);
    console.log(`  Unprocessed: ${stats.unprocessed}`);
    console.log(`  Failed: ${stats.failed}`);
    console.log('');
  } catch (error: unknown) {
    console.log('Note: raw_payloads table not found or empty');
    if (args.verbose) {
      // Surface the underlying reason instead of discarding it entirely.
      console.log(`  (${error instanceof Error ? error.message : String(error)})`);
    }
    console.log('');
  }

  if (args.payloadId) {
    // Process specific payload
    console.log(`Processing payload: ${args.payloadId}`);
    const result = await processPayloadById(pool, args.payloadId, { dryRun: args.dryRun });
    console.log('Result:', JSON.stringify(result, null, 2));
    return;
  }

  if (args.reprocess) {
    // Reprocess failed payloads
    console.log('Reprocessing failed payloads...');
    const result = await reprocessFailedPayloads(pool, {
      dryRun: args.dryRun,
      batchSize: args.limit,
    });
    console.log('Result:', JSON.stringify(result, null, 2));
    return;
  }

  if (args.loop) {
    // Run continuous loop
    const worker = new HydrationWorker(pool, { dryRun: args.dryRun, batchSize: args.limit });

    process.on('SIGINT', () => {
      console.log('\nStopping hydration loop...');
      worker.stop();
    });

    await worker.runLoop(30000);
    return;
  }

  // Run single batch
  const result = await runHydrationBatch(pool, { dryRun: args.dryRun, batchSize: args.limit });
  console.log('Batch result:');
  console.log(`  Payloads processed: ${result.payloadsProcessed}`);
  console.log(`  Payloads failed: ${result.payloadsFailed}`);
  console.log(`  Products upserted: ${result.totalProductsUpserted}`);
  console.log(`  Snapshots created: ${result.totalSnapshotsCreated}`);
  console.log(`  Brands created: ${result.totalBrandsCreated}`);
  console.log(`  Duration: ${result.durationMs}ms`);

  if (result.errors.length > 0) {
    console.log('\nErrors:');
    for (const err of result.errors.slice(0, maxErrorsShown)) {
      console.log(`  ${err.payloadId}: ${err.error}`);
    }
    // Make it visible that the list was truncated, as sync mode does.
    if (result.errors.length > maxErrorsShown) {
      console.log(`  ... and ${result.errors.length - maxErrorsShown} more`);
    }
  }
}
|
|
|
|
// ============================================================
|
|
// MODE: BACKFILL (legacy dutchie_* → canonical)
|
|
// ============================================================
|
|
|
|
/**
 * Backfill mode: prints the run configuration and delegates to
 * `runLegacyBackfill`.
 *
 * @param pool Database pool passed through to the backfill.
 * @param args Parsed CLI options; `store` is forwarded as `dispensaryId` and
 *             `startFrom` as `startFromProductId`.
 */
async function runBackfillMode(pool: Pool, args: CliArgs): Promise<void> {
  console.log('='.repeat(60));
  console.log('HYDRATION - BACKFILL MODE');
  console.log('='.repeat(60));
  console.log(`Mode: ${args.dryRun ? 'DRY RUN' : 'LIVE'}`);

  // Compare against undefined rather than relying on truthiness: the values
  // are numbers, and 0 (e.g. --start-from=0) is forwarded to the backfill and
  // should therefore also appear in the log.
  if (args.store !== undefined) {
    console.log(`Store: ${args.store}`);
  }
  if (args.startFrom !== undefined) {
    console.log(`Start from product ID: ${args.startFrom}`);
  }
  console.log('');

  await runLegacyBackfill(pool, {
    dryRun: args.dryRun,
    verbose: args.verbose,
    dispensaryId: args.store,
    startFromProductId: args.startFrom,
  });
}
|
|
|
|
// ============================================================
|
|
// MODE: SYNC (recent crawls → canonical)
|
|
// ============================================================
|
|
|
|
/**
 * Sync mode: prints the run configuration, calls `syncRecentCrawls`, and
 * reports how many crawls were synced along with up to ten error messages.
 *
 * @param pool Database pool passed through to the sync.
 * @param args Parsed CLI options; `since` defaults to "1 hour" when missing
 *             or empty, and `store` is forwarded as `dispensaryId`.
 */
async function runSyncMode(pool: Pool, args: CliArgs): Promise<void> {
  const maxErrorsShown = 10;
  // `||` is intentional: an empty --since= value also falls back to the default.
  const since = args.since || '1 hour';

  console.log('='.repeat(60));
  console.log('HYDRATION - SYNC MODE');
  console.log('='.repeat(60));
  console.log(`Mode: ${args.dryRun ? 'DRY RUN' : 'LIVE'}`);
  console.log(`Since: ${since}`);
  console.log(`Limit: ${args.limit}`);
  // Numeric option: test for presence, not truthiness, so that every value
  // forwarded to the sync is also shown in the log.
  if (args.store !== undefined) {
    console.log(`Store: ${args.store}`);
  }
  console.log('');

  const result = await syncRecentCrawls(pool, {
    dryRun: args.dryRun,
    verbose: args.verbose,
    since,
    dispensaryId: args.store,
    limit: args.limit,
  });

  console.log('');
  console.log('=== Sync Results ===');
  console.log(`Crawls synced: ${result.synced}`);
  console.log(`Errors: ${result.errors.length}`);

  if (result.errors.length > 0) {
    console.log('');
    console.log('Errors:');
    for (const error of result.errors.slice(0, maxErrorsShown)) {
      console.log(`  - ${error}`);
    }
    if (result.errors.length > maxErrorsShown) {
      console.log(`  ... and ${result.errors.length - maxErrorsShown} more`);
    }
  }
}
|
|
|
|
// ============================================================
|
|
// MODE: STATUS
|
|
// ============================================================
|
|
|
|
/** True when `error` carries PostgreSQL SQLSTATE 42P01 (undefined_table). */
function isUndefinedTableError(error: unknown): boolean {
  return (
    typeof error === 'object' &&
    error !== null &&
    (error as { code?: unknown }).code === '42P01'
  );
}

/** Extracts a printable message from an arbitrary thrown value. */
function describeStatusError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Prints a heading followed by `SELECT COUNT(*)` for each table. A failing
 * query is reported inline and does not stop the remaining tables.
 */
async function printTableCounts(
  pool: Pool,
  heading: string,
  tables: readonly string[],
): Promise<void> {
  console.log(`\n${heading}`);
  console.log('-'.repeat(40));

  for (const table of tables) {
    try {
      // Table names come from hard-coded lists in runStatusMode, never from
      // user input, so interpolating them into the SQL text is safe here.
      const { rows } = await pool.query(`SELECT COUNT(*) as cnt FROM ${table}`);
      console.log(`${table}: ${rows[0].cnt}`);
    } catch (error: unknown) {
      if (isUndefinedTableError(error)) {
        console.log(`${table}: (table not found)`);
      } else {
        console.log(`${table}: (query failed: ${describeStatusError(error)})`);
      }
    }
  }
}

/**
 * Status mode: prints a read-only report consisting of
 *   - the rows of the `v_hydration_status` view, when that view exists;
 *   - row counts of the canonical and legacy tables;
 *   - crawl run counts per status for the last 24 hours;
 *   - payload statistics from `getPayloadStats`.
 *
 * Each section after the view check is best effort: a failing query is
 * reported in place and the remaining sections still run.
 *
 * @param pool Database pool used for all queries.
 */
async function runStatusMode(pool: Pool): Promise<void> {
  console.log('='.repeat(60));
  console.log('HYDRATION STATUS');
  console.log('='.repeat(60));
  console.log('');

  // Check if v_hydration_status view exists
  const viewExists = await pool.query(`
    SELECT EXISTS (
      SELECT 1 FROM pg_views WHERE viewname = 'v_hydration_status'
    ) as exists
  `);

  if (viewExists.rows[0].exists) {
    const { rows } = await pool.query('SELECT * FROM v_hydration_status');
    console.log('Hydration Progress:');
    console.log('-'.repeat(70));
    console.log(
      'Table'.padEnd(30) +
        'Source'.padEnd(12) +
        'Hydrated'.padEnd(12) +
        'Progress'
    );
    console.log('-'.repeat(70));

    for (const row of rows) {
      // Only a missing percentage is "N/A"; a numeric 0 must print as "0%".
      const progress = row.hydration_pct != null ? `${row.hydration_pct}%` : 'N/A';
      console.log(
        String(row.source_table).padEnd(30) +
          String(row.source_count).padEnd(12) +
          String(row.hydrated_count).padEnd(12) +
          progress
      );
    }
    console.log('-'.repeat(70));
  } else {
    console.log('Note: v_hydration_status view not found. Run migration 052 first.');
  }

  // Get counts from canonical tables
  await printTableCounts(pool, 'Canonical Table Counts:', [
    'store_products',
    'store_product_snapshots',
    'crawl_runs',
  ]);

  // Get legacy table counts
  await printTableCounts(pool, 'Legacy Table Counts:', [
    'dutchie_products',
    'dutchie_product_snapshots',
    'dispensary_crawl_jobs',
  ]);

  // Show recent sync activity
  console.log('\nRecent Crawl Runs (last 24h):');
  console.log('-'.repeat(40));

  try {
    const { rows } = await pool.query(`
      SELECT status, COUNT(*) as count
      FROM crawl_runs
      WHERE started_at > NOW() - INTERVAL '24 hours'
      GROUP BY status
      ORDER BY count DESC
    `);

    if (rows.length === 0) {
      console.log('No crawl runs in last 24 hours');
    } else {
      for (const row of rows) {
        console.log(`${row.status}: ${row.count}`);
      }
    }
  } catch (error: unknown) {
    if (isUndefinedTableError(error)) {
      console.log('(crawl_runs table not found)');
    } else {
      console.log(`(crawl_runs query failed: ${describeStatusError(error)})`);
    }
  }

  // Payload stats
  console.log('\nPayload Hydration:');
  console.log('-'.repeat(40));

  try {
    const stats = await getPayloadStats(pool);
    console.log(`Total payloads: ${stats.total}`);
    console.log(`Processed: ${stats.processed}`);
    console.log(`Unprocessed: ${stats.unprocessed}`);
    console.log(`Failed: ${stats.failed}`);
  } catch (error: unknown) {
    if (isUndefinedTableError(error)) {
      console.log('(raw_payloads table not found)');
    } else {
      console.log(`(payload stats unavailable: ${describeStatusError(error)})`);
    }
  }
}
|
|
|
|
// ============================================================
|
|
// HELP
|
|
// ============================================================
|
|
|
|
/**
 * Prints the CLI usage text (modes, options and examples) to stdout.
 *
 * The option columns are aligned with spaces; keep the alignment when adding
 * entries.
 */
function showHelp(): void {
  console.log(`
Unified Hydration CLI

Usage:
  npx tsx src/scripts/run-hydration.ts --mode=<mode> [options]

Modes:
  payload     Process raw_payloads → canonical tables (default)
  backfill    Migrate dutchie_* → canonical tables
  sync        Sync recent crawls to canonical tables
  status      Show hydration progress

Common Options:
  --dry-run           Print changes without modifying database
  --verbose, -v       Show detailed progress
  --store=<id>        Limit to a single dispensary
  --limit=<n>         Batch size (default: 50)
  --help, -h          Show this help

Payload Mode Options:
  --loop              Run continuous hydration loop
  --reprocess         Reprocess failed payloads
  --payload=<id>      Process a specific payload by ID

Backfill Mode Options:
  --start-from=<id>   Resume from a specific product ID

Sync Mode Options:
  --since=<interval>  Time window (default: "1 hour")
                      Examples: "30 minutes", "2 hours", "1 day"

Examples:
  # Full legacy backfill (dutchie_* → canonical)
  npx tsx src/scripts/run-hydration.ts --mode=backfill

  # Backfill single dispensary (dry run)
  npx tsx src/scripts/run-hydration.ts --mode=backfill --store=123 --dry-run

  # Sync recent crawls from last 4 hours
  npx tsx src/scripts/run-hydration.ts --mode=sync --since="4 hours"

  # Sync single dispensary
  npx tsx src/scripts/run-hydration.ts --mode=sync --store=123

  # Run payload hydration loop
  npx tsx src/scripts/run-hydration.ts --mode=payload --loop

  # Check hydration status
  npx tsx src/scripts/run-hydration.ts --mode=status
`);
}
|
|
|
|
// ============================================================
|
|
// MAIN
|
|
// ============================================================
|
|
|
|
/**
 * CLI entry point.
 *
 * Prints the help text and returns when `--help`/`-h` is present. Otherwise
 * parses the arguments, opens a connection pool (max 5 clients), verifies the
 * connection with `SELECT 1`, and dispatches to the handler for the selected
 * mode.
 *
 * Failures are reported on stderr and signalled through `process.exitCode`
 * rather than `process.exit()`, so the `finally` block always gets to close
 * the pool and pending console output is not cut off.
 */
async function main(): Promise<void> {
  const rawArgs = process.argv.slice(2);

  if (rawArgs.includes('--help') || rawArgs.includes('-h')) {
    showHelp();
    return;
  }

  let pool: Pool | undefined;

  try {
    // Argument parsing and connection-string lookup live inside the try so
    // that their errors are reported the same way as runtime failures.
    const args = parseArgs();

    pool = new Pool({
      connectionString: getConnectionString(),
      max: 5,
    });

    // Verify connection
    await pool.query('SELECT 1');
    console.log('Database connection: OK\n');

    switch (args.mode) {
      case 'payload':
        await runPayloadMode(pool, args);
        break;

      case 'backfill':
        await runBackfillMode(pool, args);
        break;

      case 'sync':
        await runSyncMode(pool, args);
        break;

      case 'status':
        await runStatusMode(pool);
        break;

      default: {
        // Exhaustiveness guard: unreachable while every mode has a case above.
        const unknownMode: never = args.mode;
        console.error(`Unknown mode: ${String(unknownMode)}`);
        showHelp();
        process.exitCode = 1;
      }
    }
  } catch (error: unknown) {
    console.error('Error:', error instanceof Error ? error.message : String(error));
    process.exitCode = 1;
  } finally {
    if (pool) {
      await pool.end();
    }
  }
}
|
|
|
|
// Start the CLI. The catch is a last-resort handler so that a rejection
// escaping main() is reported and turns into a non-zero exit status instead
// of being left as an unhandled promise rejection.
main().catch((error: unknown) => {
  console.error('Fatal:', error instanceof Error ? error.message : String(error));
  process.exitCode = 1;
});
|