fix: Restore hydration and product_refresh for store updates

- Moved hydration module back from _deprecated (needed for product_refresh)
- Restored product_refresh handler for processing stored payloads
- Restored geolocation service for findadispo/findagram
- Stubbed system routes that depend on deprecated SyncOrchestrator
- Removed crawler-sandbox route (deprecated)
- Fixed all TypeScript compilation errors

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-11 23:03:39 -07:00
parent cdab71a1ee
commit 50654be910
33 changed files with 613 additions and 563 deletions

View File

@@ -1,424 +0,0 @@
#!/usr/bin/env npx tsx
/**
* Queue Dispensaries Script
*
* Orchestrates the multi-provider crawler system:
* 1. Queue dispensaries that need provider detection
* 2. Queue Dutchie dispensaries for production crawl
* 3. Queue sandbox dispensaries for learning crawls
*
* Usage:
* npx tsx src/scripts/queue-dispensaries.ts [--detection] [--production] [--sandbox] [--all]
* npx tsx src/scripts/queue-dispensaries.ts --dry-run
* npx tsx src/scripts/queue-dispensaries.ts --process # Process queued jobs
*/
import { pool } from '../db/pool';
import { logger } from '../services/logger';
import {
runDetectMenuProviderJob,
runDutchieMenuCrawlJob,
runSandboxCrawlJob,
processSandboxJobs,
} from '../services/crawler-jobs';
// ---- Command-line argument parsing ----
const args = process.argv.slice(2);

/**
 * Parse a `--limit=N` flag out of argv.
 * Returns `fallback` when the flag is absent or not a valid base-10
 * number (the previous code produced NaN for e.g. `--limit=abc`, which
 * then leaked into SQL LIMIT parameters).
 */
function parseLimitFlag(argv: string[], fallback: number): number {
  const raw = argv.find(a => a.startsWith('--limit='))?.split('=')[1];
  if (raw === undefined) return fallback;
  const parsed = parseInt(raw, 10);
  return Number.isNaN(parsed) ? fallback : parsed;
}

const flags = {
  detection: args.includes('--detection') || args.includes('--all'),
  production: args.includes('--production') || args.includes('--all'),
  sandbox: args.includes('--sandbox') || args.includes('--all'),
  dryRun: args.includes('--dry-run'),
  process: args.includes('--process'),
  help: args.includes('--help') || args.includes('-h'),
  limit: parseLimitFlag(args, 10),
};

// If no queue-type flag was given (and we're not in --process mode),
// default to queueing every job type.
if (!flags.detection && !flags.production && !flags.sandbox && !flags.process) {
  flags.detection = true;
  flags.production = true;
  flags.sandbox = true;
}
/**
 * Print CLI usage to stdout.
 * Declared async only so main() can `await` it uniformly; it performs
 * no asynchronous work itself.
 */
async function showHelp() {
  console.log(`
Queue Dispensaries - Multi-Provider Crawler Orchestration
USAGE:
npx tsx src/scripts/queue-dispensaries.ts [OPTIONS]
OPTIONS:
--detection Queue dispensaries that need provider detection
--production Queue Dutchie production crawls
--sandbox Queue sandbox/learning crawls
--all Queue all job types (default if no specific flag)
--process Process queued jobs instead of just queuing
--dry-run Show what would be queued without making changes
--limit=N Maximum dispensaries to queue per type (default: 10)
--help, -h Show this help message
EXAMPLES:
# Queue all dispensaries for appropriate jobs
npx tsx src/scripts/queue-dispensaries.ts
# Only queue detection jobs
npx tsx src/scripts/queue-dispensaries.ts --detection --limit=20
# Dry run to see what would be queued
npx tsx src/scripts/queue-dispensaries.ts --dry-run
# Process sandbox jobs
npx tsx src/scripts/queue-dispensaries.ts --process
`);
}
/**
 * Queue provider-detection jobs for dispensaries whose menu provider is
 * unknown or was detected with low confidence.
 *
 * Selection: has a website or menu URL, crawler currently 'idle', and
 * menu_provider is NULL or confidence < 70. NULL providers sort first,
 * then ascending confidence. Each selection is flagged
 * 'queued_detection' and gets a pending 'detection' sandbox_crawl_jobs
 * row at priority 10.
 *
 * @returns dispensaries queued (in --dry-run, the would-be count).
 */
async function queueDetectionJobs(): Promise<number> {
  console.log('\n📡 Queueing Detection Jobs...');
  const query = `
SELECT id, name, website, menu_url, menu_provider, menu_provider_confidence
FROM dispensaries
WHERE (website IS NOT NULL OR menu_url IS NOT NULL)
AND crawler_status = 'idle'
AND (menu_provider IS NULL OR menu_provider_confidence < 70)
ORDER BY
CASE WHEN menu_provider IS NULL THEN 0 ELSE 1 END,
menu_provider_confidence ASC
LIMIT $1
`;
  const result = await pool.query(query, [flags.limit]);
  if (flags.dryRun) {
    console.log(` Would queue ${result.rows.length} dispensaries for detection:`);
    for (const row of result.rows) {
      console.log(` - [${row.id}] ${row.name} (current: ${row.menu_provider || 'unknown'}, confidence: ${row.menu_provider_confidence}%)`);
    }
    return result.rows.length;
  }
  let queued = 0;
  for (const dispensary of result.rows) {
    try {
      // Mark as queued first so a concurrent run won't select it again.
      await pool.query(
        `UPDATE dispensaries SET crawler_status = 'queued_detection', updated_at = NOW() WHERE id = $1`,
        [dispensary.id]
      );
      // Detection runs through the sandbox job queue at high priority.
      await pool.query(
        `INSERT INTO sandbox_crawl_jobs (dispensary_id, job_type, status, priority)
VALUES ($1, 'detection', 'pending', 10)`,
        [dispensary.id]
      );
      console.log(` ✓ Queued detection: [${dispensary.id}] ${dispensary.name}`);
      queued++;
    } catch (error: unknown) {
      // Narrow instead of `any` so a non-Error throw cannot crash the loop.
      const message = error instanceof Error ? error.message : String(error);
      console.error(` ✗ Failed to queue [${dispensary.id}]: ${message}`);
    }
  }
  return queued;
}
/**
 * Queue production menu crawls for detected Dutchie dispensaries.
 *
 * Selection: menu_provider = 'dutchie', crawler_mode = 'production',
 * crawler currently 'idle', and last_menu_scrape missing or older than
 * 4 hours. Never-scraped stores sort first, then oldest scrape.
 *
 * Each selection is marked 'queued_crawl' and a 'full_crawl' row is
 * inserted into crawl_jobs, resolving the stores row via an exact
 * dutchie_url match or a name ILIKE heuristic.
 * NOTE(review): the ILIKE name match can pair a dispensary with the
 * wrong store when names overlap — confirm against the stores table.
 *
 * @returns dispensaries queued (in --dry-run, the would-be count).
 */
async function queueProductionCrawls(): Promise<number> {
  console.log('\n🏭 Queueing Production Dutchie Crawls...');
  // Find Dutchie dispensaries ready for production crawl:
  // - menu_provider = 'dutchie'
  // - crawler_mode = 'production'
  // - crawler_status is idle
  // - last_menu_scrape is old or null
  const query = `
SELECT d.id, d.name, d.last_menu_scrape, d.menu_url
FROM dispensaries d
WHERE d.menu_provider = 'dutchie'
AND d.crawler_mode = 'production'
AND d.crawler_status = 'idle'
AND (d.last_menu_scrape IS NULL OR d.last_menu_scrape < NOW() - INTERVAL '4 hours')
ORDER BY
CASE WHEN d.last_menu_scrape IS NULL THEN 0 ELSE 1 END,
d.last_menu_scrape ASC
LIMIT $1
`;
  const result = await pool.query(query, [flags.limit]);
  if (flags.dryRun) {
    console.log(` Would queue ${result.rows.length} Dutchie dispensaries for production crawl:`);
    for (const row of result.rows) {
      const lastScrape = row.last_menu_scrape ? new Date(row.last_menu_scrape).toISOString() : 'never';
      console.log(` - [${row.id}] ${row.name} (last scrape: ${lastScrape})`);
    }
    return result.rows.length;
  }
  let queued = 0;
  for (const dispensary of result.rows) {
    try {
      // Update status to queued before inserting the job.
      await pool.query(
        `UPDATE dispensaries SET crawler_status = 'queued_crawl', updated_at = NOW() WHERE id = $1`,
        [dispensary.id]
      );
      // Create crawl job in the main crawl_jobs table (production queue);
      // LIMIT 1 guards against the name-match returning multiple stores.
      await pool.query(
        `INSERT INTO crawl_jobs (store_id, job_type, trigger_type, status, priority, metadata)
SELECT s.id, 'full_crawl', 'scheduled', 'pending', 50,
jsonb_build_object('dispensary_id', $1, 'source', 'queue-dispensaries')
FROM stores s
JOIN dispensaries d ON (d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%')
WHERE d.id = $1
LIMIT 1`,
        [dispensary.id]
      );
      console.log(` ✓ Queued production crawl: [${dispensary.id}] ${dispensary.name}`);
      queued++;
    } catch (error: any) {
      console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
    }
  }
  return queued;
}
/**
 * Queue deep sandbox (learning) crawls for sandbox-mode dispensaries.
 *
 * Candidates must be idle or flagged for review, have a URL to crawl,
 * and have no sandbox job already pending or running. Each selection is
 * marked 'queued_crawl' and gets a 'deep_crawl' job at priority 5.
 *
 * @returns dispensaries queued (in --dry-run, the would-be count).
 */
async function queueSandboxCrawls(): Promise<number> {
  console.log('\n🧪 Queueing Sandbox Crawls...');

  // Oldest-updated candidates first, skipping anything with an in-flight job.
  const candidateSql = `
SELECT d.id, d.name, d.menu_provider, d.crawler_status, d.website
FROM dispensaries d
WHERE d.crawler_mode = 'sandbox'
AND d.crawler_status IN ('idle', 'error_needs_review')
AND (d.website IS NOT NULL OR d.menu_url IS NOT NULL)
AND NOT EXISTS (
SELECT 1 FROM sandbox_crawl_jobs sj
WHERE sj.dispensary_id = d.id
AND sj.status IN ('pending', 'running')
)
ORDER BY d.updated_at ASC
LIMIT $1
`;
  const { rows: candidates } = await pool.query(candidateSql, [flags.limit]);

  if (flags.dryRun) {
    console.log(` Would queue ${candidates.length} dispensaries for sandbox crawl:`);
    candidates.forEach(row => {
      console.log(` - [${row.id}] ${row.name} (provider: ${row.menu_provider || 'unknown'}, status: ${row.crawler_status})`);
    });
    return candidates.length;
  }

  let queuedCount = 0;
  for (const row of candidates) {
    try {
      // Flag the dispensary, then enqueue the sandbox job.
      await pool.query(
        `UPDATE dispensaries SET crawler_status = 'queued_crawl', updated_at = NOW() WHERE id = $1`,
        [row.id]
      );
      await pool.query(
        `INSERT INTO sandbox_crawl_jobs (dispensary_id, job_type, status, priority)
VALUES ($1, 'deep_crawl', 'pending', 5)`,
        [row.id]
      );
      console.log(` ✓ Queued sandbox crawl: [${row.id}] ${row.name}`);
      queuedCount++;
    } catch (error: any) {
      console.error(` ✗ Failed to queue [${row.id}]: ${error.message}`);
    }
  }
  return queuedCount;
}
/**
 * Process pending sandbox_crawl_jobs, highest priority first.
 *
 * Each job is marked 'running', dispatched to the matching runner
 * ('detection' → runDetectMenuProviderJob, everything else →
 * runSandboxCrawlJob), then finalized as 'completed' or 'failed' with
 * the runner's summary/error stored on the row. A thrown error also
 * marks the job 'failed' so it is never left stuck in 'running'.
 * NOTE(review): job.sandbox_id is passed to runSandboxCrawlJob but its
 * origin isn't visible here — confirm the column exists on the table.
 */
async function processJobs(): Promise<void> {
  console.log('\n⚙ Processing Queued Jobs...\n');
  // Process sandbox jobs (detection + sandbox crawls)
  const sandboxJobs = await pool.query(
    `SELECT * FROM sandbox_crawl_jobs
WHERE status = 'pending'
ORDER BY priority DESC, scheduled_at ASC
LIMIT $1`,
    [flags.limit]
  );
  console.log(`Found ${sandboxJobs.rows.length} pending sandbox jobs\n`);
  for (const job of sandboxJobs.rows) {
    console.log(`Processing job ${job.id} (${job.job_type}) for dispensary ${job.dispensary_id}...`);
    try {
      // Mark as running before dispatch so a second processor skips it.
      await pool.query(
        `UPDATE sandbox_crawl_jobs SET status = 'running', started_at = NOW() WHERE id = $1`,
        [job.id]
      );
      let result;
      if (job.job_type === 'detection') {
        result = await runDetectMenuProviderJob(job.dispensary_id);
      } else {
        result = await runSandboxCrawlJob(job.dispensary_id, job.sandbox_id);
      }
      // Record outcome; error_message is null on success.
      await pool.query(
        `UPDATE sandbox_crawl_jobs
SET status = $1, completed_at = NOW(), result_summary = $2, error_message = $3
WHERE id = $4`,
        [
          result.success ? 'completed' : 'failed',
          JSON.stringify(result.data || {}),
          result.success ? null : result.message,
          job.id,
        ]
      );
      console.log(` ${result.success ? '✓' : '✗'} ${result.message}\n`);
    } catch (error: any) {
      // Runner threw: persist the failure before moving to the next job.
      await pool.query(
        `UPDATE sandbox_crawl_jobs SET status = 'failed', error_message = $1 WHERE id = $2`,
        [error.message, job.id]
      );
      console.log(` ✗ Error: ${error.message}\n`);
    }
  }
}
/**
 * Print dispensary and sandbox-job counters from the database.
 * Purely informational: two aggregate queries, logged to stdout.
 */
async function showStats(): Promise<void> {
  console.log('\n📊 Current Stats:');
  // Dispensary stats: provider breakdown, crawler mode, and status.
  const stats = await pool.query(`
SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE menu_provider IS NULL) as no_provider,
COUNT(*) FILTER (WHERE menu_provider = 'dutchie') as dutchie,
COUNT(*) FILTER (WHERE menu_provider NOT IN ('dutchie', 'unknown') AND menu_provider IS NOT NULL) as other_providers,
COUNT(*) FILTER (WHERE menu_provider = 'unknown') as unknown,
COUNT(*) FILTER (WHERE crawler_mode = 'production') as production_mode,
COUNT(*) FILTER (WHERE crawler_mode = 'sandbox') as sandbox_mode,
COUNT(*) FILTER (WHERE crawler_status = 'idle') as idle,
COUNT(*) FILTER (WHERE crawler_status LIKE 'queued%') as queued,
COUNT(*) FILTER (WHERE crawler_status = 'running') as running,
COUNT(*) FILTER (WHERE crawler_status = 'ok') as ok,
COUNT(*) FILTER (WHERE crawler_status = 'error_needs_review') as needs_review
FROM dispensaries
`);
  const s = stats.rows[0];
  console.log(`
Dispensaries: ${s.total}
- No provider detected: ${s.no_provider}
- Dutchie: ${s.dutchie}
- Other providers: ${s.other_providers}
- Unknown: ${s.unknown}
Crawler Mode:
- Production: ${s.production_mode}
- Sandbox: ${s.sandbox_mode}
Status:
- Idle: ${s.idle}
- Queued: ${s.queued}
- Running: ${s.running}
- OK: ${s.ok}
- Needs Review: ${s.needs_review}
`);
  // Sandbox job counts by status.
  const jobStats = await pool.query(`
SELECT
COUNT(*) FILTER (WHERE status = 'pending') as pending,
COUNT(*) FILTER (WHERE status = 'running') as running,
COUNT(*) FILTER (WHERE status = 'completed') as completed,
COUNT(*) FILTER (WHERE status = 'failed') as failed
FROM sandbox_crawl_jobs
`);
  const j = jobStats.rows[0];
  console.log(` Sandbox Jobs:
- Pending: ${j.pending}
- Running: ${j.running}
- Completed: ${j.completed}
- Failed: ${j.failed}
`);
}
// Banner used around the header and the queueing summary.
const BANNER = '═══════════════════════════════════════════════════════';

/**
 * CLI entrypoint: print stats, then either process queued jobs
 * (--process) or queue detection/production/sandbox work, and print
 * stats again. Always closes the pool; exits 1 on a fatal error.
 */
async function main() {
  if (flags.help) {
    await showHelp();
    process.exit(0);
  }
  console.log(BANNER);
  console.log(' Multi-Provider Crawler Queue Manager');
  console.log(BANNER);
  if (flags.dryRun) {
    console.log('\n🔍 DRY RUN MODE - No changes will be made\n');
  }
  try {
    // Snapshot of the current state before doing anything.
    await showStats();
    if (flags.process) {
      // Process mode: execute queued jobs rather than adding new ones.
      await processJobs();
    } else {
      // Queueing mode: enqueue each requested job type.
      let queuedTotal = 0;
      if (flags.detection) queuedTotal += await queueDetectionJobs();
      if (flags.production) queuedTotal += await queueProductionCrawls();
      if (flags.sandbox) queuedTotal += await queueSandboxCrawls();
      console.log('\n' + BANNER);
      console.log(` Total dispensaries queued: ${queuedTotal}`);
      console.log(BANNER + '\n');
    }
    // Show the updated state (skipped in dry-run: nothing changed).
    if (!flags.dryRun) {
      await showStats();
    }
  } catch (error) {
    console.error('Fatal error:', error);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();

View File

@@ -1,105 +0,0 @@
#!/usr/bin/env npx tsx
/**
* Run Backfill CLI
*
* Import historical payloads from existing data sources.
*
* Usage:
* npx tsx src/scripts/run-backfill.ts [options]
*
* Options:
* --source SOURCE Source to backfill from:
* - dutchie_products (default)
* - snapshots
* - cache_files
* - all
* --dry-run Print changes without modifying DB
* --limit N Max payloads to create (default: unlimited)
* --dispensary ID Only backfill specific dispensary
* --cache-path PATH Path to cache files (default: ./cache/payloads)
*/
import { Pool } from 'pg';
import { runBackfill, BackfillOptions } from '../hydration';
/**
 * CLI entrypoint for the backfill runner.
 *
 * Parses flags (--source, --dry-run, --limit, --dispensary,
 * --cache-path), runs the backfill against DATABASE_URL, and prints a
 * per-source summary. Exits 1 on invalid flag values or on a backfill
 * error; always closes the pool.
 */
async function main() {
  const args = process.argv.slice(2);
  const dryRun = args.includes('--dry-run');

  // --source must be one of the documented sources. Previously the raw
  // string was cast with `as`, letting typos reach runBackfill unchecked.
  const validSources: ReadonlyArray<string> = ['dutchie_products', 'snapshots', 'cache_files', 'all'];
  let source: BackfillOptions['source'] = 'dutchie_products';
  const sourceIdx = args.indexOf('--source');
  if (sourceIdx !== -1 && args[sourceIdx + 1]) {
    const candidate = args[sourceIdx + 1];
    if (!validSources.includes(candidate)) {
      console.error(`Invalid --source "${candidate}". Expected one of: ${validSources.join(', ')}`);
      process.exit(1);
    }
    source = candidate as BackfillOptions['source'];
  }

  // Numeric flags: reject NaN instead of silently passing it through.
  let limit: number | undefined;
  const limitIdx = args.indexOf('--limit');
  if (limitIdx !== -1 && args[limitIdx + 1]) {
    limit = parseInt(args[limitIdx + 1], 10);
    if (Number.isNaN(limit)) {
      console.error('Invalid --limit: expected a number');
      process.exit(1);
    }
  }

  let dispensaryId: number | undefined;
  const dispIdx = args.indexOf('--dispensary');
  if (dispIdx !== -1 && args[dispIdx + 1]) {
    dispensaryId = parseInt(args[dispIdx + 1], 10);
    if (Number.isNaN(dispensaryId)) {
      console.error('Invalid --dispensary: expected a numeric ID');
      process.exit(1);
    }
  }

  let cachePath: string | undefined;
  const cacheIdx = args.indexOf('--cache-path');
  if (cacheIdx !== -1 && args[cacheIdx + 1]) {
    cachePath = args[cacheIdx + 1];
  }

  const pool = new Pool({
    connectionString: process.env.DATABASE_URL,
  });
  try {
    console.log('='.repeat(60));
    console.log('BACKFILL RUNNER');
    console.log('='.repeat(60));
    console.log(`Source: ${source}`);
    console.log(`Dry run: ${dryRun}`);
    if (limit) console.log(`Limit: ${limit}`);
    if (dispensaryId) console.log(`Dispensary: ${dispensaryId}`);
    if (cachePath) console.log(`Cache path: ${cachePath}`);
    console.log('');

    const results = await runBackfill(pool, {
      dryRun,
      source,
      limit,
      dispensaryId,
      cachePath,
    });

    // Per-source summary, capped at the first 5 errors each.
    console.log('\nBackfill Results:');
    console.log('='.repeat(40));
    for (const result of results) {
      console.log(`\n${result.source}:`);
      console.log(` Payloads created: ${result.payloadsCreated}`);
      console.log(` Skipped: ${result.skipped}`);
      console.log(` Errors: ${result.errors.length}`);
      console.log(` Duration: ${result.durationMs}ms`);
      if (result.errors.length > 0) {
        console.log(' First 5 errors:');
        for (const err of result.errors.slice(0, 5)) {
          console.log(` - ${err}`);
        }
      }
    }
    const totalCreated = results.reduce((sum, r) => sum + r.payloadsCreated, 0);
    console.log(`\nTotal payloads created: ${totalCreated}`);
  } catch (error: unknown) {
    // Narrow instead of `any`; a non-Error throw is stringified.
    const message = error instanceof Error ? error.message : String(error);
    console.error('Backfill error:', message);
    process.exit(1);
  } finally {
    await pool.end();
  }
}
main();

View File

@@ -1,510 +0,0 @@
#!/usr/bin/env npx tsx
/**
* Unified Hydration CLI
*
* Central entrypoint for all hydration operations:
*
* MODES:
* payload - Process raw_payloads → canonical tables (existing behavior)
* backfill - Migrate dutchie_* → canonical tables (legacy backfill)
* sync - Sync recent crawls to canonical tables
* status - Show hydration progress
*
* Usage:
* npx tsx src/scripts/run-hydration.ts --mode=<mode> [options]
*
* Examples:
* # Payload-based hydration (default)
* npx tsx src/scripts/run-hydration.ts --mode=payload
*
* # Full legacy backfill
* npx tsx src/scripts/run-hydration.ts --mode=backfill
*
* # Backfill single dispensary
* npx tsx src/scripts/run-hydration.ts --mode=backfill --store=123
*
* # Sync recent crawls
* npx tsx src/scripts/run-hydration.ts --mode=sync --since="2 hours"
*
* # Check status
* npx tsx src/scripts/run-hydration.ts --mode=status
*/
import { Pool } from 'pg';
import dotenv from 'dotenv';
import {
HydrationWorker,
runHydrationBatch,
processPayloadById,
reprocessFailedPayloads,
getPayloadStats,
} from '../hydration';
import { runLegacyBackfill } from '../hydration/legacy-backfill';
import { syncRecentCrawls } from '../hydration/incremental-sync';
dotenv.config();
// ============================================================
// ARGUMENT PARSING
// ============================================================
interface CliArgs {
mode: 'payload' | 'backfill' | 'sync' | 'status';
store?: number;
since?: string;
dryRun: boolean;
verbose: boolean;
limit: number;
loop: boolean;
reprocess: boolean;
payloadId?: string;
startFrom?: number;
}
/**
 * Parse CLI flags into a CliArgs record.
 *
 * Boolean flags are membership checks; value flags accept `--flag=value`
 * (and `--flag value` for --limit/--payload). Numeric flags that fail to
 * parse are ignored and keep their defaults — previously `--limit=abc`
 * or `--store=abc` produced NaN, which then leaked into SQL parameters.
 */
function parseArgs(): CliArgs {
  const args = process.argv.slice(2);
  // Defaults
  const result: CliArgs = {
    mode: 'payload',
    dryRun: args.includes('--dry-run'),
    verbose: args.includes('--verbose') || args.includes('-v'),
    limit: 50,
    loop: args.includes('--loop'),
    reprocess: args.includes('--reprocess'),
  };

  // --mode=<value>: only accept known modes; anything else keeps 'payload'.
  const modeArg = args.find(a => a.startsWith('--mode='));
  if (modeArg) {
    const mode = modeArg.split('=')[1];
    if (['payload', 'backfill', 'sync', 'status'].includes(mode)) {
      result.mode = mode as CliArgs['mode'];
    }
  }

  // --store=<id>
  const storeArg = args.find(a => a.startsWith('--store='));
  if (storeArg) {
    const store = parseInt(storeArg.split('=')[1], 10);
    if (!Number.isNaN(store)) result.store = store;
  }

  // --since=<interval>
  const sinceArg = args.find(a => a.startsWith('--since='));
  if (sinceArg) {
    result.since = sinceArg.split('=')[1];
  }

  // --limit=<value> or --limit <value>
  const limitArg = args.find(a => a.startsWith('--limit='));
  const limitIdx = args.indexOf('--limit');
  const rawLimit = limitArg
    ? limitArg.split('=')[1]
    : limitIdx !== -1 ? args[limitIdx + 1] : undefined;
  if (rawLimit) {
    const limit = parseInt(rawLimit, 10);
    if (!Number.isNaN(limit)) result.limit = limit;
  }

  // --payload=<id> or --payload <id>
  const payloadArg = args.find(a => a.startsWith('--payload='));
  if (payloadArg) {
    result.payloadId = payloadArg.split('=')[1];
  } else {
    const payloadIdx = args.indexOf('--payload');
    if (payloadIdx !== -1 && args[payloadIdx + 1]) {
      result.payloadId = args[payloadIdx + 1];
    }
  }

  // --start-from=<productId>
  const startArg = args.find(a => a.startsWith('--start-from='));
  if (startArg) {
    const startFrom = parseInt(startArg.split('=')[1], 10);
    if (!Number.isNaN(startFrom)) result.startFrom = startFrom;
  }

  return result;
}
// ============================================================
// DATABASE CONNECTION
// ============================================================
/**
 * Resolve the Postgres connection string from the environment.
 * Precedence: CANNAIQ_DB_URL, then an assembled URL from the five
 * CANNAIQ_DB_* parts (all required), then DATABASE_URL for local dev.
 * @throws Error when none of the variables are set.
 */
function getConnectionString(): string {
  const env = process.env;
  if (env.CANNAIQ_DB_URL) {
    return env.CANNAIQ_DB_URL;
  }
  const { CANNAIQ_DB_HOST: host, CANNAIQ_DB_PORT: port, CANNAIQ_DB_NAME: name, CANNAIQ_DB_USER: user, CANNAIQ_DB_PASS: pass } = env;
  if (host && port && name && user && pass) {
    return `postgresql://${user}:${pass}@${host}:${port}/${name}`;
  }
  // Fallback to DATABASE_URL for local development
  if (env.DATABASE_URL) {
    return env.DATABASE_URL;
  }
  throw new Error('Missing database connection environment variables');
}
// ============================================================
// MODE: PAYLOAD (existing behavior)
// ============================================================
/**
 * Payload mode: process raw_payloads into the canonical tables.
 *
 * Dispatch order: a single payload (--payload=<id>), reprocessing of
 * failed payloads (--reprocess), a continuous worker loop (--loop,
 * stopped gracefully on SIGINT), or one batch of args.limit payloads
 * (the default). Prints payload stats up front when available.
 */
async function runPayloadMode(pool: Pool, args: CliArgs): Promise<void> {
  console.log('='.repeat(60));
  console.log('HYDRATION - PAYLOAD MODE');
  console.log('='.repeat(60));
  console.log(`Dry run: ${args.dryRun}`);
  console.log(`Batch size: ${args.limit}`);
  console.log('');
  // Show current stats (best-effort; a missing table is not fatal).
  try {
    const stats = await getPayloadStats(pool);
    console.log('Current payload stats:');
    console.log(` Total: ${stats.total}`);
    console.log(` Processed: ${stats.processed}`);
    console.log(` Unprocessed: ${stats.unprocessed}`);
    console.log(` Failed: ${stats.failed}`);
    console.log('');
  } catch {
    console.log('Note: raw_payloads table not found or empty');
    console.log('');
  }
  if (args.payloadId) {
    // Process specific payload
    console.log(`Processing payload: ${args.payloadId}`);
    const result = await processPayloadById(pool, args.payloadId, { dryRun: args.dryRun });
    console.log('Result:', JSON.stringify(result, null, 2));
  } else if (args.reprocess) {
    // Reprocess failed payloads
    console.log('Reprocessing failed payloads...');
    const result = await reprocessFailedPayloads(pool, { dryRun: args.dryRun, batchSize: args.limit });
    console.log('Result:', JSON.stringify(result, null, 2));
  } else if (args.loop) {
    // Continuous loop; SIGINT requests a graceful stop via worker.stop().
    const worker = new HydrationWorker(pool, { dryRun: args.dryRun, batchSize: args.limit });
    process.on('SIGINT', () => {
      console.log('\nStopping hydration loop...');
      worker.stop();
    });
    await worker.runLoop(30000); // 30s poll interval between batches
  } else {
    // Run single batch
    const result = await runHydrationBatch(pool, { dryRun: args.dryRun, batchSize: args.limit });
    console.log('Batch result:');
    console.log(` Payloads processed: ${result.payloadsProcessed}`);
    console.log(` Payloads failed: ${result.payloadsFailed}`);
    console.log(` Products upserted: ${result.totalProductsUpserted}`);
    console.log(` Snapshots created: ${result.totalSnapshotsCreated}`);
    console.log(` Brands created: ${result.totalBrandsCreated}`);
    console.log(` Duration: ${result.durationMs}ms`);
    if (result.errors.length > 0) {
      // Cap error output at the first 10.
      console.log('\nErrors:');
      for (const err of result.errors.slice(0, 10)) {
        console.log(` ${err.payloadId}: ${err.error}`);
      }
    }
  }
}
// ============================================================
// MODE: BACKFILL (legacy dutchie_* → canonical)
// ============================================================
/**
 * Backfill mode: migrate legacy dutchie_* rows into the canonical
 * tables via runLegacyBackfill, optionally scoped to one store and/or
 * resuming from a specific product ID.
 */
async function runBackfillMode(pool: Pool, args: CliArgs): Promise<void> {
  const banner = '='.repeat(60);
  console.log(banner);
  console.log('HYDRATION - BACKFILL MODE');
  console.log(banner);
  console.log(`Mode: ${args.dryRun ? 'DRY RUN' : 'LIVE'}`);
  if (args.store) console.log(`Store: ${args.store}`);
  if (args.startFrom) console.log(`Start from product ID: ${args.startFrom}`);
  console.log('');

  await runLegacyBackfill(pool, {
    dryRun: args.dryRun,
    verbose: args.verbose,
    dispensaryId: args.store,
    startFromProductId: args.startFrom,
  });
}
// ============================================================
// MODE: SYNC (recent crawls → canonical)
// ============================================================
/**
 * Sync mode: hydrate crawls completed within the last `--since` window
 * (default "1 hour") into the canonical tables, optionally limited to
 * one dispensary, then print a summary capped at 10 errors.
 */
async function runSyncMode(pool: Pool, args: CliArgs): Promise<void> {
  const since = args.since || '1 hour';
  console.log('='.repeat(60));
  console.log('HYDRATION - SYNC MODE');
  console.log('='.repeat(60));
  console.log(`Mode: ${args.dryRun ? 'DRY RUN' : 'LIVE'}`);
  console.log(`Since: ${since}`);
  console.log(`Limit: ${args.limit}`);
  if (args.store) {
    console.log(`Store: ${args.store}`);
  }
  console.log('');
  const result = await syncRecentCrawls(pool, {
    dryRun: args.dryRun,
    verbose: args.verbose,
    since,
    dispensaryId: args.store,
    limit: args.limit,
  });
  console.log('');
  console.log('=== Sync Results ===');
  console.log(`Crawls synced: ${result.synced}`);
  console.log(`Errors: ${result.errors.length}`);
  if (result.errors.length > 0) {
    console.log('');
    console.log('Errors:');
    // Show at most the first 10 errors, then a count of the rest.
    for (const error of result.errors.slice(0, 10)) {
      console.log(` - ${error}`);
    }
    if (result.errors.length > 10) {
      console.log(` ... and ${result.errors.length - 10} more`);
    }
  }
}
// ============================================================
// MODE: STATUS
// ============================================================
/**
 * Status mode: read-only report of hydration progress.
 *
 * Shows, in order: the v_hydration_status view (when migration 052 has
 * been applied), canonical vs legacy table row counts, crawl-run status
 * counts for the last 24h, and raw_payloads processing stats. Missing
 * tables/views are reported as notes rather than treated as errors.
 */
async function runStatusMode(pool: Pool): Promise<void> {
  console.log('='.repeat(60));
  console.log('HYDRATION STATUS');
  console.log('='.repeat(60));
  console.log('');
  // Check if v_hydration_status view exists
  const viewExists = await pool.query(`
SELECT EXISTS (
SELECT 1 FROM pg_views WHERE viewname = 'v_hydration_status'
) as exists
`);
  if (viewExists.rows[0].exists) {
    const { rows } = await pool.query('SELECT * FROM v_hydration_status');
    console.log('Hydration Progress:');
    console.log('-'.repeat(70));
    // Fixed-width columns for a readable table.
    console.log(
      'Table'.padEnd(30) +
      'Source'.padEnd(12) +
      'Hydrated'.padEnd(12) +
      'Progress'
    );
    console.log('-'.repeat(70));
    for (const row of rows) {
      const progress = row.hydration_pct ? `${row.hydration_pct}%` : 'N/A';
      console.log(
        row.source_table.padEnd(30) +
        String(row.source_count).padEnd(12) +
        String(row.hydrated_count).padEnd(12) +
        progress
      );
    }
    console.log('-'.repeat(70));
  } else {
    console.log('Note: v_hydration_status view not found. Run migration 052 first.');
  }
  // Get counts from canonical tables
  console.log('\nCanonical Table Counts:');
  console.log('-'.repeat(40));
  const tables = ['store_products', 'store_product_snapshots', 'crawl_runs'];
  for (const table of tables) {
    try {
      const { rows } = await pool.query(`SELECT COUNT(*) as cnt FROM ${table}`);
      console.log(`${table}: ${rows[0].cnt}`);
    } catch {
      console.log(`${table}: (table not found)`);
    }
  }
  // Get legacy table counts
  console.log('\nLegacy Table Counts:');
  console.log('-'.repeat(40));
  const legacyTables = ['dutchie_products', 'dutchie_product_snapshots', 'dispensary_crawl_jobs'];
  for (const table of legacyTables) {
    try {
      const { rows } = await pool.query(`SELECT COUNT(*) as cnt FROM ${table}`);
      console.log(`${table}: ${rows[0].cnt}`);
    } catch {
      console.log(`${table}: (table not found)`);
    }
  }
  // Show recent sync activity
  console.log('\nRecent Crawl Runs (last 24h):');
  console.log('-'.repeat(40));
  try {
    const { rows } = await pool.query(`
SELECT status, COUNT(*) as count
FROM crawl_runs
WHERE started_at > NOW() - INTERVAL '24 hours'
GROUP BY status
ORDER BY count DESC
`);
    if (rows.length === 0) {
      console.log('No crawl runs in last 24 hours');
    } else {
      for (const row of rows) {
        console.log(`${row.status}: ${row.count}`);
      }
    }
  } catch {
    console.log('(crawl_runs table not found)');
  }
  // Payload stats
  console.log('\nPayload Hydration:');
  console.log('-'.repeat(40));
  try {
    const stats = await getPayloadStats(pool);
    console.log(`Total payloads: ${stats.total}`);
    console.log(`Processed: ${stats.processed}`);
    console.log(`Unprocessed: ${stats.unprocessed}`);
    console.log(`Failed: ${stats.failed}`);
  } catch {
    console.log('(raw_payloads table not found)');
  }
}
// ============================================================
// HELP
// ============================================================
/** Print usage for all hydration modes to stdout. */
function showHelp(): void {
  console.log(`
Unified Hydration CLI
Usage:
npx tsx src/scripts/run-hydration.ts --mode=<mode> [options]
Modes:
payload Process raw_payloads → canonical tables (default)
backfill Migrate dutchie_* → canonical tables
sync Sync recent crawls to canonical tables
status Show hydration progress
Common Options:
--dry-run Print changes without modifying database
--verbose, -v Show detailed progress
--store=<id> Limit to a single dispensary
--limit=<n> Batch size (default: 50)
Payload Mode Options:
--loop Run continuous hydration loop
--reprocess Reprocess failed payloads
--payload=<id> Process a specific payload by ID
Backfill Mode Options:
--start-from=<id> Resume from a specific product ID
Sync Mode Options:
--since=<interval> Time window (default: "1 hour")
Examples: "30 minutes", "2 hours", "1 day"
Examples:
# Full legacy backfill (dutchie_* → canonical)
npx tsx src/scripts/run-hydration.ts --mode=backfill
# Backfill single dispensary (dry run)
npx tsx src/scripts/run-hydration.ts --mode=backfill --store=123 --dry-run
# Sync recent crawls from last 4 hours
npx tsx src/scripts/run-hydration.ts --mode=sync --since="4 hours"
# Sync single dispensary
npx tsx src/scripts/run-hydration.ts --mode=sync --store=123
# Run payload hydration loop
npx tsx src/scripts/run-hydration.ts --mode=payload --loop
# Check hydration status
npx tsx src/scripts/run-hydration.ts --mode=status
`);
}
// ============================================================
// MAIN
// ============================================================
/**
 * Entrypoint: parse flags, verify the DB connection, and dispatch to
 * the selected mode. Exits 1 on connection or mode errors; always
 * closes the pool.
 */
async function main(): Promise<void> {
  const rawArgs = process.argv.slice(2);
  if (rawArgs.includes('--help') || rawArgs.includes('-h')) {
    showHelp();
    process.exit(0);
  }
  const args = parseArgs();
  const pool = new Pool({
    connectionString: getConnectionString(),
    max: 5, // small pool: this CLI issues a handful of sequential queries
  });
  try {
    // Verify connection before doing any work.
    await pool.query('SELECT 1');
    console.log('Database connection: OK\n');
    switch (args.mode) {
      case 'payload':
        await runPayloadMode(pool, args);
        break;
      case 'backfill':
        await runBackfillMode(pool, args);
        break;
      case 'sync':
        await runSyncMode(pool, args);
        break;
      case 'status':
        await runStatusMode(pool);
        break;
      default:
        console.error(`Unknown mode: ${args.mode}`);
        showHelp();
        process.exit(1);
    }
  } catch (error: any) {
    console.error('Error:', error.message);
    process.exit(1);
  } finally {
    await pool.end();
  }
}
main();

View File

@@ -1,277 +0,0 @@
#!/usr/bin/env npx tsx
/**
* Test Script: Crawl a single dispensary and write to canonical tables
*
* This script:
* 1. Fetches products from Dutchie GraphQL
* 2. Normalizes via DutchieNormalizer
* 3. Writes to store_products, product_variants, snapshots via hydrateToCanonical
*
* Usage:
* npx tsx src/scripts/test-crawl-to-canonical.ts <dispensaryId>
* npx tsx src/scripts/test-crawl-to-canonical.ts 235
*/
import { Pool } from 'pg';
import dotenv from 'dotenv';
import {
executeGraphQL,
GRAPHQL_HASHES,
DUTCHIE_CONFIG,
} from '../platforms/dutchie';
import {
DutchieNormalizer,
hydrateToCanonical,
} from '../hydration';
import { initializeImageStorage } from '../utils/image-storage';
dotenv.config();
// ============================================================
// DATABASE CONNECTION
// ============================================================
/**
 * Resolve the Postgres connection string.
 * Prefers CANNAIQ_DB_URL, then DATABASE_URL, otherwise assembles one
 * from the CANNAIQ_DB_* parts with local-development defaults.
 */
function getConnectionString(): string {
  const env = process.env;
  const direct = env.CANNAIQ_DB_URL || env.DATABASE_URL;
  if (direct) {
    return direct;
  }
  // Local-dev defaults mirror the docker-compose setup.
  const host = env.CANNAIQ_DB_HOST || 'localhost';
  const port = env.CANNAIQ_DB_PORT || '54320';
  const name = env.CANNAIQ_DB_NAME || 'dutchie_menus';
  const user = env.CANNAIQ_DB_USER || 'dutchie';
  const pass = env.CANNAIQ_DB_PASS || 'dutchie_local_pass';
  return `postgresql://${user}:${pass}@${host}:${port}/${name}`;
}
const pool = new Pool({ connectionString: getConnectionString() });
// ============================================================
// FETCH PRODUCTS FROM DUTCHIE
// ============================================================
/** Aggregate result of paging through a dispensary's Dutchie menu. */
interface FetchResult {
  products: any[];       // raw product objects as returned by GraphQL
  totalPages: number;    // derived from totalCount / DUTCHIE_CONFIG.perPage
  totalProducts: number; // queryInfo.totalCount from the last response
}
/**
 * Fetch every menu page for one dispensary via Dutchie's
 * FilteredProducts GraphQL operation.
 *
 * Pages sequentially, sleeping DUTCHIE_CONFIG.pageDelayMs between pages
 * and stopping at DUTCHIE_CONFIG.maxPages. A missing payload or a page
 * error aborts pagination and returns whatever was collected so far.
 *
 * @param platformDispensaryId Dutchie-side dispensary ID for the filter
 * @param cName dispensary slug (from the menu URL) used for routing
 * @returns collected products plus page/product totals from queryInfo
 */
async function fetchAllProducts(platformDispensaryId: string, cName: string): Promise<FetchResult> {
  const allProducts: any[] = [];
  let page = 0;
  let totalPages = 1; // provisional; corrected from the first response
  let totalProducts = 0;
  console.log(`[Fetch] Starting fetch for ${platformDispensaryId} (cName: ${cName})`);
  while (page < totalPages && page < DUTCHIE_CONFIG.maxPages) {
    const variables = {
      includeEnterpriseSpecials: false,
      productsFilter: {
        dispensaryId: platformDispensaryId,
        pricingType: 'rec', // NOTE(review): rec pricing only — med menus presumably need a separate pass; confirm
        Status: 'Active', // 'Active' = in-stock products with pricing
        types: [],
        useCache: true,
        isDefaultSort: true,
        sortBy: 'popularSortIdx',
        sortDirection: 1,
        bypassOnlineThresholds: true,
        isKioskMenu: false,
        removeProductsBelowOptionThresholds: false,
      },
      page,
      perPage: DUTCHIE_CONFIG.perPage,
    };
    try {
      const result = await executeGraphQL(
        'FilteredProducts',
        variables,
        GRAPHQL_HASHES.FilteredProducts,
        { cName, maxRetries: 3 }
      );
      const data = result?.data?.filteredProducts;
      if (!data) {
        console.error(`[Fetch] No data returned for page ${page}`);
        break;
      }
      const products = data.products || [];
      totalProducts = data.queryInfo?.totalCount || 0;
      // Recompute the page count from the server-reported total each page.
      totalPages = Math.ceil(totalProducts / DUTCHIE_CONFIG.perPage);
      allProducts.push(...products);
      console.log(`[Fetch] Page ${page + 1}/${totalPages}: ${products.length} products (total so far: ${allProducts.length})`);
      page++;
      if (page < totalPages) {
        // Politeness delay between pages.
        await new Promise(r => setTimeout(r, DUTCHIE_CONFIG.pageDelayMs));
      }
    } catch (error: any) {
      console.error(`[Fetch] Error on page ${page}: ${error.message}`);
      break;
    }
  }
  return { products: allProducts, totalPages, totalProducts };
}
// ============================================================
// MAIN
// ============================================================
/**
 * One-off crawl of a single dispensary, written through to the canonical
 * tables, with progress printed at each step:
 *
 *   1. Load the dispensary row and derive its Dutchie cName from menu_url.
 *   2. Fetch all active products via the Dutchie GraphQL API.
 *   3. Validate and normalize the raw payload with DutchieNormalizer.
 *   4. Hydrate the normalized result into the canonical tables.
 *   5. Re-count canonical rows as a sanity check.
 *
 * Failure paths set process.exitCode and return rather than calling
 * process.exit(): process.exit() terminates immediately, skipping the
 * finally block (so pool.end() never ran) and potentially truncating
 * buffered stdout/stderr.
 */
async function main() {
  // Dispensary id is the single positional CLI argument; parseInt yields NaN
  // for a missing/garbage argument, which the falsiness check rejects.
  const dispensaryId = parseInt(process.argv[2], 10);
  if (!dispensaryId) {
    // Nothing has touched the pool yet, so a hard exit is safe here.
    console.error('Usage: npx tsx src/scripts/test-crawl-to-canonical.ts <dispensaryId>');
    console.error('Example: npx tsx src/scripts/test-crawl-to-canonical.ts 235');
    process.exit(1);
  }
  console.log('============================================================');
  console.log(`Test Crawl to Canonical - Dispensary ${dispensaryId}`);
  console.log('============================================================\n');

  // Image storage must be ready before hydration persists product images.
  console.log('[Init] Initializing image storage...');
  await initializeImageStorage();
  console.log(' Image storage ready\n');

  try {
    // Step 1: Load the dispensary row.
    console.log('[Step 1] Getting dispensary info...');
    const dispResult = await pool.query(`
      SELECT id, name, platform_dispensary_id, menu_url
      FROM dispensaries
      WHERE id = $1
    `, [dispensaryId]);
    if (dispResult.rows.length === 0) {
      throw new Error(`Dispensary ${dispensaryId} not found`);
    }
    const disp = dispResult.rows[0];
    console.log(` Name: ${disp.name}`);
    console.log(` Platform ID: ${disp.platform_dispensary_id}`);
    console.log(` Menu URL: ${disp.menu_url}`);
    if (!disp.platform_dispensary_id) {
      throw new Error('Dispensary does not have a platform_dispensary_id');
    }

    // The cName (menu slug) is embedded in the stored menu URL; fall back to
    // a generic slug when the URL does not match the known patterns.
    const cNameMatch = disp.menu_url?.match(/\/(?:embedded-menu|dispensary)\/([^/?]+)/);
    const cName = cNameMatch ? cNameMatch[1] : 'dispensary';
    console.log(` cName: ${cName}\n`);

    // Step 2: Fetch products from Dutchie.
    console.log('[Step 2] Fetching products from Dutchie GraphQL...');
    const fetchResult = await fetchAllProducts(disp.platform_dispensary_id, cName);
    console.log(` Total products fetched: ${fetchResult.products.length}\n`);
    if (fetchResult.products.length === 0) {
      console.log('No products fetched. Exiting.');
      // Return (not process.exit(0)) so finally closes the pool; the process
      // then exits naturally with code 0.
      return;
    }

    // Step 3: Normalize.
    console.log('[Step 3] Normalizing products...');
    const normalizer = new DutchieNormalizer();
    // validatePayload() checks the raw Dutchie shape: products + queryInfo.
    const rawPayloadForValidation = {
      products: fetchResult.products,
      queryInfo: {
        totalCount: fetchResult.totalProducts,
      },
    };
    const validation = normalizer.validatePayload(rawPayloadForValidation);
    if (!validation.valid) {
      console.error(` Validation failed: ${validation.errors?.join(', ')}`);
      // Fail the script but still let finally run pool.end().
      process.exitCode = 1;
      return;
    }
    console.log(` Validation: PASS`);

    // normalize() expects a full RawPayload envelope around the raw JSON.
    const rawPayload = {
      id: `test-${Date.now()}`,
      dispensary_id: dispensaryId,
      crawl_run_id: null,
      platform: 'dutchie',
      payload_version: 1,
      raw_json: rawPayloadForValidation,
      product_count: fetchResult.totalProducts,
      pricing_type: 'rec',
      crawl_mode: 'active',
      fetched_at: new Date(),
      processed: false,
      normalized_at: null,
      hydration_error: null,
      hydration_attempts: 0,
      created_at: new Date(),
    };
    const normResult = normalizer.normalize(rawPayload);
    console.log(` Normalized products: ${normResult.products.length}`);
    console.log(` Brands extracted: ${normResult.brands.length}`);
    console.log(` Sample product: ${normResult.products[0]?.name}\n`);

    // Step 4: Write to the canonical tables.
    console.log('[Step 4] Writing to canonical tables via hydrateToCanonical...');
    const hydrateResult = await hydrateToCanonical(
      pool,
      dispensaryId,
      normResult,
      null // no crawl_run_id for this test
    );
    console.log(` Products upserted: ${hydrateResult.productsUpserted}`);
    console.log(` Products new: ${hydrateResult.productsNew}`);
    console.log(` Snapshots created: ${hydrateResult.snapshotsCreated}`);
    console.log(` Variants upserted: ${hydrateResult.variantsUpserted}`);
    console.log(` Brands created: ${hydrateResult.brandsCreated}\n`);

    // Step 5: Sanity-check the row counts we just wrote.
    console.log('[Step 5] Verifying data in canonical tables...');
    const productCount = await pool.query(`
      SELECT COUNT(*) as count FROM store_products WHERE dispensary_id = $1
    `, [dispensaryId]);
    console.log(` store_products count: ${productCount.rows[0].count}`);
    const variantCount = await pool.query(`
      SELECT COUNT(*) as count FROM product_variants WHERE dispensary_id = $1
    `, [dispensaryId]);
    console.log(` product_variants count: ${variantCount.rows[0].count}`);
    const snapshotCount = await pool.query(`
      SELECT COUNT(*) as count FROM store_product_snapshots WHERE dispensary_id = $1
    `, [dispensaryId]);
    console.log(` store_product_snapshots count: ${snapshotCount.rows[0].count}`);
    console.log('\n============================================================');
    console.log('SUCCESS - Crawl and hydration complete!');
    console.log('============================================================');
  } catch (error: any) {
    console.error('\n============================================================');
    console.error('ERROR:', error.message);
    console.error('============================================================');
    if (error.stack) {
      console.error(error.stack);
    }
    // Defer the actual exit so finally can close the pool cleanly.
    process.exitCode = 1;
  } finally {
    await pool.end();
  }
}
// Entry point. main() catches its own pipeline errors, but anything thrown
// before its try block (e.g. initializeImageStorage at startup) would reject
// the promise — without this .catch that becomes an unhandled rejection.
main().catch((error) => {
  console.error('Fatal:', error);
  process.exit(1);
});