fix: Restore hydration and product_refresh for store updates
- Moved hydration module back from _deprecated (needed for product_refresh)
- Restored product_refresh handler for processing stored payloads
- Restored geolocation service for findadispo/findagram
- Stubbed system routes that depend on deprecated SyncOrchestrator
- Removed crawler-sandbox route (deprecated)
- Fixed all TypeScript compilation errors

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,424 +0,0 @@
|
||||
#!/usr/bin/env npx tsx
|
||||
/**
|
||||
* Queue Dispensaries Script
|
||||
*
|
||||
* Orchestrates the multi-provider crawler system:
|
||||
* 1. Queue dispensaries that need provider detection
|
||||
* 2. Queue Dutchie dispensaries for production crawl
|
||||
* 3. Queue sandbox dispensaries for learning crawls
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx src/scripts/queue-dispensaries.ts [--detection] [--production] [--sandbox] [--all]
|
||||
* npx tsx src/scripts/queue-dispensaries.ts --dry-run
|
||||
* npx tsx src/scripts/queue-dispensaries.ts --process # Process queued jobs
|
||||
*/
|
||||
|
||||
import { pool } from '../db/pool';
|
||||
import { logger } from '../services/logger';
|
||||
import {
|
||||
runDetectMenuProviderJob,
|
||||
runDutchieMenuCrawlJob,
|
||||
runSandboxCrawlJob,
|
||||
processSandboxJobs,
|
||||
} from '../services/crawler-jobs';
|
||||
|
||||
// Parse command line args
|
||||
const args = process.argv.slice(2);
|
||||
const flags = {
|
||||
detection: args.includes('--detection') || args.includes('--all'),
|
||||
production: args.includes('--production') || args.includes('--all'),
|
||||
sandbox: args.includes('--sandbox') || args.includes('--all'),
|
||||
dryRun: args.includes('--dry-run'),
|
||||
process: args.includes('--process'),
|
||||
help: args.includes('--help') || args.includes('-h'),
|
||||
limit: parseInt(args.find(a => a.startsWith('--limit='))?.split('=')[1] || '10'),
|
||||
};
|
||||
|
||||
// If no specific flags, default to all
|
||||
if (!flags.detection && !flags.production && !flags.sandbox && !flags.process) {
|
||||
flags.detection = true;
|
||||
flags.production = true;
|
||||
flags.sandbox = true;
|
||||
}
|
||||
|
||||
/**
 * Print CLI usage/help text to stdout.
 * (Declared async only so main() can `await` it uniformly; it performs no
 * asynchronous work.)
 */
async function showHelp() {
  console.log(`
Queue Dispensaries - Multi-Provider Crawler Orchestration

USAGE:
  npx tsx src/scripts/queue-dispensaries.ts [OPTIONS]

OPTIONS:
  --detection   Queue dispensaries that need provider detection
  --production  Queue Dutchie production crawls
  --sandbox     Queue sandbox/learning crawls
  --all         Queue all job types (default if no specific flag)
  --process     Process queued jobs instead of just queuing
  --dry-run     Show what would be queued without making changes
  --limit=N     Maximum dispensaries to queue per type (default: 10)
  --help, -h    Show this help message

EXAMPLES:
  # Queue all dispensaries for appropriate jobs
  npx tsx src/scripts/queue-dispensaries.ts

  # Only queue detection jobs
  npx tsx src/scripts/queue-dispensaries.ts --detection --limit=20

  # Dry run to see what would be queued
  npx tsx src/scripts/queue-dispensaries.ts --dry-run

  # Process sandbox jobs
  npx tsx src/scripts/queue-dispensaries.ts --process
`);
}
|
||||
|
||||
/**
 * Queue provider-detection jobs for dispensaries whose menu provider is
 * unknown or low-confidence (< 70%).
 *
 * For each candidate, the dispensary row is flipped to 'queued_detection' and
 * a 'detection' row is inserted into sandbox_crawl_jobs at priority 10.
 * In --dry-run mode nothing is written; candidates are only listed.
 *
 * @returns number of dispensaries queued (or would-be-queued in dry-run).
 */
async function queueDetectionJobs(): Promise<number> {
  console.log('\n📡 Queueing Detection Jobs...');

  // Find dispensaries that need provider detection:
  // - menu_provider is null OR
  // - menu_provider_confidence < 70 AND
  // - crawler_status is idle (not already queued/running)
  // - has a website URL
  const query = `
    SELECT id, name, website, menu_url, menu_provider, menu_provider_confidence
    FROM dispensaries
    WHERE (website IS NOT NULL OR menu_url IS NOT NULL)
      AND crawler_status = 'idle'
      AND (menu_provider IS NULL OR menu_provider_confidence < 70)
    ORDER BY
      CASE WHEN menu_provider IS NULL THEN 0 ELSE 1 END,
      menu_provider_confidence ASC
    LIMIT $1
  `;

  const result = await pool.query(query, [flags.limit]);

  if (flags.dryRun) {
    console.log(`  Would queue ${result.rows.length} dispensaries for detection:`);
    for (const row of result.rows) {
      console.log(`    - [${row.id}] ${row.name} (current: ${row.menu_provider || 'unknown'}, confidence: ${row.menu_provider_confidence}%)`);
    }
    return result.rows.length;
  }

  let queued = 0;
  for (const dispensary of result.rows) {
    try {
      // Update status to queued
      await pool.query(
        `UPDATE dispensaries SET crawler_status = 'queued_detection', updated_at = NOW() WHERE id = $1`,
        [dispensary.id]
      );

      // Create sandbox job for detection
      await pool.query(
        `INSERT INTO sandbox_crawl_jobs (dispensary_id, job_type, status, priority)
         VALUES ($1, 'detection', 'pending', 10)`,
        [dispensary.id]
      );

      console.log(`  ✓ Queued detection: [${dispensary.id}] ${dispensary.name}`);
      queued++;
    } catch (error: any) {
      // NOTE(review): if the INSERT fails after the UPDATE succeeded, the row
      // stays 'queued_detection' with no job — confirm whether a reset to
      // 'idle' is wanted here.
      console.error(`  ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
    }
  }

  return queued;
}
|
||||
|
||||
async function queueProductionCrawls(): Promise<number> {
|
||||
console.log('\n🏭 Queueing Production Dutchie Crawls...');
|
||||
|
||||
// Find Dutchie dispensaries ready for production crawl:
|
||||
// - menu_provider = 'dutchie'
|
||||
// - crawler_mode = 'production'
|
||||
// - crawler_status is idle
|
||||
// - last_menu_scrape is old or null
|
||||
const query = `
|
||||
SELECT d.id, d.name, d.last_menu_scrape, d.menu_url
|
||||
FROM dispensaries d
|
||||
WHERE d.menu_provider = 'dutchie'
|
||||
AND d.crawler_mode = 'production'
|
||||
AND d.crawler_status = 'idle'
|
||||
AND (d.last_menu_scrape IS NULL OR d.last_menu_scrape < NOW() - INTERVAL '4 hours')
|
||||
ORDER BY
|
||||
CASE WHEN d.last_menu_scrape IS NULL THEN 0 ELSE 1 END,
|
||||
d.last_menu_scrape ASC
|
||||
LIMIT $1
|
||||
`;
|
||||
|
||||
const result = await pool.query(query, [flags.limit]);
|
||||
|
||||
if (flags.dryRun) {
|
||||
console.log(` Would queue ${result.rows.length} Dutchie dispensaries for production crawl:`);
|
||||
for (const row of result.rows) {
|
||||
const lastScrape = row.last_menu_scrape ? new Date(row.last_menu_scrape).toISOString() : 'never';
|
||||
console.log(` - [${row.id}] ${row.name} (last scrape: ${lastScrape})`);
|
||||
}
|
||||
return result.rows.length;
|
||||
}
|
||||
|
||||
let queued = 0;
|
||||
for (const dispensary of result.rows) {
|
||||
try {
|
||||
// Update status to queued
|
||||
await pool.query(
|
||||
`UPDATE dispensaries SET crawler_status = 'queued_crawl', updated_at = NOW() WHERE id = $1`,
|
||||
[dispensary.id]
|
||||
);
|
||||
|
||||
// Create crawl job in the main crawl_jobs table (production queue)
|
||||
await pool.query(
|
||||
`INSERT INTO crawl_jobs (store_id, job_type, trigger_type, status, priority, metadata)
|
||||
SELECT s.id, 'full_crawl', 'scheduled', 'pending', 50,
|
||||
jsonb_build_object('dispensary_id', $1, 'source', 'queue-dispensaries')
|
||||
FROM stores s
|
||||
JOIN dispensaries d ON (d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%')
|
||||
WHERE d.id = $1
|
||||
LIMIT 1`,
|
||||
[dispensary.id]
|
||||
);
|
||||
|
||||
console.log(` ✓ Queued production crawl: [${dispensary.id}] ${dispensary.name}`);
|
||||
queued++;
|
||||
} catch (error: any) {
|
||||
console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
|
||||
}
|
||||
}
|
||||
|
||||
return queued;
|
||||
}
|
||||
|
||||
/**
 * Queue deep sandbox/learning crawls for dispensaries in sandbox mode.
 *
 * Candidates: crawler_mode = 'sandbox', status idle or error_needs_review,
 * with a website or menu URL, and no pending/running sandbox job already.
 * Each candidate is flipped to 'queued_crawl' and a 'deep_crawl' job is
 * inserted at priority 5 (below detection's 10).
 * In --dry-run mode nothing is written; candidates are only listed.
 *
 * @returns number of dispensaries queued (or would-be-queued in dry-run).
 */
async function queueSandboxCrawls(): Promise<number> {
  console.log('\n🧪 Queueing Sandbox Crawls...');

  // Find sandbox dispensaries needing crawls:
  // - crawler_mode = 'sandbox'
  // - crawler_status in (idle, error_needs_review)
  // - No recent sandbox job
  const query = `
    SELECT d.id, d.name, d.menu_provider, d.crawler_status, d.website
    FROM dispensaries d
    WHERE d.crawler_mode = 'sandbox'
      AND d.crawler_status IN ('idle', 'error_needs_review')
      AND (d.website IS NOT NULL OR d.menu_url IS NOT NULL)
      AND NOT EXISTS (
        SELECT 1 FROM sandbox_crawl_jobs sj
        WHERE sj.dispensary_id = d.id
          AND sj.status IN ('pending', 'running')
      )
    ORDER BY d.updated_at ASC
    LIMIT $1
  `;

  const result = await pool.query(query, [flags.limit]);

  if (flags.dryRun) {
    console.log(`  Would queue ${result.rows.length} dispensaries for sandbox crawl:`);
    for (const row of result.rows) {
      console.log(`    - [${row.id}] ${row.name} (provider: ${row.menu_provider || 'unknown'}, status: ${row.crawler_status})`);
    }
    return result.rows.length;
  }

  let queued = 0;
  for (const dispensary of result.rows) {
    try {
      // Update status
      await pool.query(
        `UPDATE dispensaries SET crawler_status = 'queued_crawl', updated_at = NOW() WHERE id = $1`,
        [dispensary.id]
      );

      // Create sandbox job
      await pool.query(
        `INSERT INTO sandbox_crawl_jobs (dispensary_id, job_type, status, priority)
         VALUES ($1, 'deep_crawl', 'pending', 5)`,
        [dispensary.id]
      );

      console.log(`  ✓ Queued sandbox crawl: [${dispensary.id}] ${dispensary.name}`);
      queued++;
    } catch (error: any) {
      // NOTE(review): same as detection queuing — a failed INSERT leaves the
      // row 'queued_crawl' with no job; confirm whether that is intended.
      console.error(`  ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
    }
  }

  return queued;
}
|
||||
|
||||
async function processJobs(): Promise<void> {
|
||||
console.log('\n⚙️ Processing Queued Jobs...\n');
|
||||
|
||||
// Process sandbox jobs (detection + sandbox crawls)
|
||||
const sandboxJobs = await pool.query(
|
||||
`SELECT * FROM sandbox_crawl_jobs
|
||||
WHERE status = 'pending'
|
||||
ORDER BY priority DESC, scheduled_at ASC
|
||||
LIMIT $1`,
|
||||
[flags.limit]
|
||||
);
|
||||
|
||||
console.log(`Found ${sandboxJobs.rows.length} pending sandbox jobs\n`);
|
||||
|
||||
for (const job of sandboxJobs.rows) {
|
||||
console.log(`Processing job ${job.id} (${job.job_type}) for dispensary ${job.dispensary_id}...`);
|
||||
|
||||
try {
|
||||
// Mark as running
|
||||
await pool.query(
|
||||
`UPDATE sandbox_crawl_jobs SET status = 'running', started_at = NOW() WHERE id = $1`,
|
||||
[job.id]
|
||||
);
|
||||
|
||||
let result;
|
||||
if (job.job_type === 'detection') {
|
||||
result = await runDetectMenuProviderJob(job.dispensary_id);
|
||||
} else {
|
||||
result = await runSandboxCrawlJob(job.dispensary_id, job.sandbox_id);
|
||||
}
|
||||
|
||||
// Update job status
|
||||
await pool.query(
|
||||
`UPDATE sandbox_crawl_jobs
|
||||
SET status = $1, completed_at = NOW(), result_summary = $2, error_message = $3
|
||||
WHERE id = $4`,
|
||||
[
|
||||
result.success ? 'completed' : 'failed',
|
||||
JSON.stringify(result.data || {}),
|
||||
result.success ? null : result.message,
|
||||
job.id,
|
||||
]
|
||||
);
|
||||
|
||||
console.log(` ${result.success ? '✓' : '✗'} ${result.message}\n`);
|
||||
|
||||
} catch (error: any) {
|
||||
await pool.query(
|
||||
`UPDATE sandbox_crawl_jobs SET status = 'failed', error_message = $1 WHERE id = $2`,
|
||||
[error.message, job.id]
|
||||
);
|
||||
console.log(` ✗ Error: ${error.message}\n`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Print a snapshot of dispensary and sandbox-job state to stdout:
 * provider breakdown, crawler mode/status counts, and sandbox job counts by
 * status. Read-only; performs two aggregate queries.
 */
async function showStats(): Promise<void> {
  console.log('\n📊 Current Stats:');

  // Dispensary stats
  const stats = await pool.query(`
    SELECT
      COUNT(*) as total,
      COUNT(*) FILTER (WHERE menu_provider IS NULL) as no_provider,
      COUNT(*) FILTER (WHERE menu_provider = 'dutchie') as dutchie,
      COUNT(*) FILTER (WHERE menu_provider NOT IN ('dutchie', 'unknown') AND menu_provider IS NOT NULL) as other_providers,
      COUNT(*) FILTER (WHERE menu_provider = 'unknown') as unknown,
      COUNT(*) FILTER (WHERE crawler_mode = 'production') as production_mode,
      COUNT(*) FILTER (WHERE crawler_mode = 'sandbox') as sandbox_mode,
      COUNT(*) FILTER (WHERE crawler_status = 'idle') as idle,
      COUNT(*) FILTER (WHERE crawler_status LIKE 'queued%') as queued,
      COUNT(*) FILTER (WHERE crawler_status = 'running') as running,
      COUNT(*) FILTER (WHERE crawler_status = 'ok') as ok,
      COUNT(*) FILTER (WHERE crawler_status = 'error_needs_review') as needs_review
    FROM dispensaries
  `);

  const s = stats.rows[0];
  console.log(`
  Dispensaries: ${s.total}
    - No provider detected: ${s.no_provider}
    - Dutchie: ${s.dutchie}
    - Other providers: ${s.other_providers}
    - Unknown: ${s.unknown}

  Crawler Mode:
    - Production: ${s.production_mode}
    - Sandbox: ${s.sandbox_mode}

  Status:
    - Idle: ${s.idle}
    - Queued: ${s.queued}
    - Running: ${s.running}
    - OK: ${s.ok}
    - Needs Review: ${s.needs_review}
`);

  // Job stats
  const jobStats = await pool.query(`
    SELECT
      COUNT(*) FILTER (WHERE status = 'pending') as pending,
      COUNT(*) FILTER (WHERE status = 'running') as running,
      COUNT(*) FILTER (WHERE status = 'completed') as completed,
      COUNT(*) FILTER (WHERE status = 'failed') as failed
    FROM sandbox_crawl_jobs
  `);

  const j = jobStats.rows[0];
  console.log(`  Sandbox Jobs:
    - Pending: ${j.pending}
    - Running: ${j.running}
    - Completed: ${j.completed}
    - Failed: ${j.failed}
`);
}
|
||||
|
||||
/**
 * Entry point: print stats, then either process queued jobs (--process) or
 * queue new jobs per the detection/production/sandbox flags. Exits non-zero
 * on any fatal error; always closes the shared pool.
 */
async function main() {
  if (flags.help) {
    await showHelp();
    process.exit(0);
  }

  console.log('═══════════════════════════════════════════════════════');
  console.log('   Multi-Provider Crawler Queue Manager');
  console.log('═══════════════════════════════════════════════════════');

  if (flags.dryRun) {
    console.log('\n🔍 DRY RUN MODE - No changes will be made\n');
  }

  try {
    // Show current stats first
    await showStats();

    if (flags.process) {
      // Process mode - run jobs instead of queuing
      await processJobs();
    } else {
      // Queuing mode
      let totalQueued = 0;

      if (flags.detection) {
        totalQueued += await queueDetectionJobs();
      }

      if (flags.production) {
        totalQueued += await queueProductionCrawls();
      }

      if (flags.sandbox) {
        totalQueued += await queueSandboxCrawls();
      }

      console.log('\n═══════════════════════════════════════════════════════');
      console.log(`   Total dispensaries queued: ${totalQueued}`);
      console.log('═══════════════════════════════════════════════════════\n');
    }

    // Show updated stats
    if (!flags.dryRun) {
      await showStats();
    }

  } catch (error) {
    console.error('Fatal error:', error);
    process.exit(1);
  } finally {
    // Close the shared pool so the process can exit cleanly.
    await pool.end();
  }
}
|
||||
|
||||
main();
|
||||
@@ -1,105 +0,0 @@
|
||||
#!/usr/bin/env npx tsx
|
||||
/**
|
||||
* Run Backfill CLI
|
||||
*
|
||||
* Import historical payloads from existing data sources.
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx src/scripts/run-backfill.ts [options]
|
||||
*
|
||||
* Options:
|
||||
* --source SOURCE Source to backfill from:
|
||||
* - dutchie_products (default)
|
||||
* - snapshots
|
||||
* - cache_files
|
||||
* - all
|
||||
* --dry-run Print changes without modifying DB
|
||||
* --limit N Max payloads to create (default: unlimited)
|
||||
* --dispensary ID Only backfill specific dispensary
|
||||
* --cache-path PATH Path to cache files (default: ./cache/payloads)
|
||||
*/
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { runBackfill, BackfillOptions } from '../hydration';
|
||||
|
||||
async function main() {
|
||||
const args = process.argv.slice(2);
|
||||
|
||||
const dryRun = args.includes('--dry-run');
|
||||
|
||||
let source: BackfillOptions['source'] = 'dutchie_products';
|
||||
const sourceIdx = args.indexOf('--source');
|
||||
if (sourceIdx !== -1 && args[sourceIdx + 1]) {
|
||||
source = args[sourceIdx + 1] as BackfillOptions['source'];
|
||||
}
|
||||
|
||||
let limit: number | undefined;
|
||||
const limitIdx = args.indexOf('--limit');
|
||||
if (limitIdx !== -1 && args[limitIdx + 1]) {
|
||||
limit = parseInt(args[limitIdx + 1], 10);
|
||||
}
|
||||
|
||||
let dispensaryId: number | undefined;
|
||||
const dispIdx = args.indexOf('--dispensary');
|
||||
if (dispIdx !== -1 && args[dispIdx + 1]) {
|
||||
dispensaryId = parseInt(args[dispIdx + 1], 10);
|
||||
}
|
||||
|
||||
let cachePath: string | undefined;
|
||||
const cacheIdx = args.indexOf('--cache-path');
|
||||
if (cacheIdx !== -1 && args[cacheIdx + 1]) {
|
||||
cachePath = args[cacheIdx + 1];
|
||||
}
|
||||
|
||||
const pool = new Pool({
|
||||
connectionString: process.env.DATABASE_URL,
|
||||
});
|
||||
|
||||
try {
|
||||
console.log('='.repeat(60));
|
||||
console.log('BACKFILL RUNNER');
|
||||
console.log('='.repeat(60));
|
||||
console.log(`Source: ${source}`);
|
||||
console.log(`Dry run: ${dryRun}`);
|
||||
if (limit) console.log(`Limit: ${limit}`);
|
||||
if (dispensaryId) console.log(`Dispensary: ${dispensaryId}`);
|
||||
if (cachePath) console.log(`Cache path: ${cachePath}`);
|
||||
console.log('');
|
||||
|
||||
const results = await runBackfill(pool, {
|
||||
dryRun,
|
||||
source,
|
||||
limit,
|
||||
dispensaryId,
|
||||
cachePath,
|
||||
});
|
||||
|
||||
console.log('\nBackfill Results:');
|
||||
console.log('='.repeat(40));
|
||||
|
||||
for (const result of results) {
|
||||
console.log(`\n${result.source}:`);
|
||||
console.log(` Payloads created: ${result.payloadsCreated}`);
|
||||
console.log(` Skipped: ${result.skipped}`);
|
||||
console.log(` Errors: ${result.errors.length}`);
|
||||
console.log(` Duration: ${result.durationMs}ms`);
|
||||
|
||||
if (result.errors.length > 0) {
|
||||
console.log(' First 5 errors:');
|
||||
for (const err of result.errors.slice(0, 5)) {
|
||||
console.log(` - ${err}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const totalCreated = results.reduce((sum, r) => sum + r.payloadsCreated, 0);
|
||||
console.log(`\nTotal payloads created: ${totalCreated}`);
|
||||
} catch (error: any) {
|
||||
console.error('Backfill error:', error.message);
|
||||
process.exit(1);
|
||||
} finally {
|
||||
await pool.end();
|
||||
}
|
||||
}
|
||||
|
||||
main();
|
||||
@@ -1,510 +0,0 @@
|
||||
#!/usr/bin/env npx tsx
|
||||
/**
|
||||
* Unified Hydration CLI
|
||||
*
|
||||
* Central entrypoint for all hydration operations:
|
||||
*
|
||||
* MODES:
|
||||
* payload - Process raw_payloads → canonical tables (existing behavior)
|
||||
* backfill - Migrate dutchie_* → canonical tables (legacy backfill)
|
||||
* sync - Sync recent crawls to canonical tables
|
||||
* status - Show hydration progress
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx src/scripts/run-hydration.ts --mode=<mode> [options]
|
||||
*
|
||||
* Examples:
|
||||
* # Payload-based hydration (default)
|
||||
* npx tsx src/scripts/run-hydration.ts --mode=payload
|
||||
*
|
||||
* # Full legacy backfill
|
||||
* npx tsx src/scripts/run-hydration.ts --mode=backfill
|
||||
*
|
||||
* # Backfill single dispensary
|
||||
* npx tsx src/scripts/run-hydration.ts --mode=backfill --store=123
|
||||
*
|
||||
* # Sync recent crawls
|
||||
* npx tsx src/scripts/run-hydration.ts --mode=sync --since="2 hours"
|
||||
*
|
||||
* # Check status
|
||||
* npx tsx src/scripts/run-hydration.ts --mode=status
|
||||
*/
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import dotenv from 'dotenv';
|
||||
import {
|
||||
HydrationWorker,
|
||||
runHydrationBatch,
|
||||
processPayloadById,
|
||||
reprocessFailedPayloads,
|
||||
getPayloadStats,
|
||||
} from '../hydration';
|
||||
import { runLegacyBackfill } from '../hydration/legacy-backfill';
|
||||
import { syncRecentCrawls } from '../hydration/incremental-sync';
|
||||
|
||||
dotenv.config();
|
||||
|
||||
// ============================================================
|
||||
// ARGUMENT PARSING
|
||||
// ============================================================
|
||||
|
||||
interface CliArgs {
|
||||
mode: 'payload' | 'backfill' | 'sync' | 'status';
|
||||
store?: number;
|
||||
since?: string;
|
||||
dryRun: boolean;
|
||||
verbose: boolean;
|
||||
limit: number;
|
||||
loop: boolean;
|
||||
reprocess: boolean;
|
||||
payloadId?: string;
|
||||
startFrom?: number;
|
||||
}
|
||||
|
||||
function parseArgs(): CliArgs {
|
||||
const args = process.argv.slice(2);
|
||||
|
||||
// Defaults
|
||||
const result: CliArgs = {
|
||||
mode: 'payload',
|
||||
dryRun: args.includes('--dry-run'),
|
||||
verbose: args.includes('--verbose') || args.includes('-v'),
|
||||
limit: 50,
|
||||
loop: args.includes('--loop'),
|
||||
reprocess: args.includes('--reprocess'),
|
||||
};
|
||||
|
||||
// Parse --mode=<value>
|
||||
const modeArg = args.find(a => a.startsWith('--mode='));
|
||||
if (modeArg) {
|
||||
const mode = modeArg.split('=')[1];
|
||||
if (['payload', 'backfill', 'sync', 'status'].includes(mode)) {
|
||||
result.mode = mode as CliArgs['mode'];
|
||||
}
|
||||
}
|
||||
|
||||
// Parse --store=<id>
|
||||
const storeArg = args.find(a => a.startsWith('--store='));
|
||||
if (storeArg) {
|
||||
result.store = parseInt(storeArg.split('=')[1], 10);
|
||||
}
|
||||
|
||||
// Parse --since=<value>
|
||||
const sinceArg = args.find(a => a.startsWith('--since='));
|
||||
if (sinceArg) {
|
||||
result.since = sinceArg.split('=')[1];
|
||||
}
|
||||
|
||||
// Parse --limit=<value> or --limit <value>
|
||||
const limitArg = args.find(a => a.startsWith('--limit='));
|
||||
if (limitArg) {
|
||||
result.limit = parseInt(limitArg.split('=')[1], 10);
|
||||
} else {
|
||||
const limitIdx = args.indexOf('--limit');
|
||||
if (limitIdx !== -1 && args[limitIdx + 1]) {
|
||||
result.limit = parseInt(args[limitIdx + 1], 10);
|
||||
}
|
||||
}
|
||||
|
||||
// Parse --payload=<id> or --payload <id>
|
||||
const payloadArg = args.find(a => a.startsWith('--payload='));
|
||||
if (payloadArg) {
|
||||
result.payloadId = payloadArg.split('=')[1];
|
||||
} else {
|
||||
const payloadIdx = args.indexOf('--payload');
|
||||
if (payloadIdx !== -1 && args[payloadIdx + 1]) {
|
||||
result.payloadId = args[payloadIdx + 1];
|
||||
}
|
||||
}
|
||||
|
||||
// Parse --start-from=<id>
|
||||
const startArg = args.find(a => a.startsWith('--start-from='));
|
||||
if (startArg) {
|
||||
result.startFrom = parseInt(startArg.split('=')[1], 10);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// DATABASE CONNECTION
|
||||
// ============================================================
|
||||
|
||||
function getConnectionString(): string {
|
||||
if (process.env.CANNAIQ_DB_URL) {
|
||||
return process.env.CANNAIQ_DB_URL;
|
||||
}
|
||||
|
||||
const host = process.env.CANNAIQ_DB_HOST;
|
||||
const port = process.env.CANNAIQ_DB_PORT;
|
||||
const name = process.env.CANNAIQ_DB_NAME;
|
||||
const user = process.env.CANNAIQ_DB_USER;
|
||||
const pass = process.env.CANNAIQ_DB_PASS;
|
||||
|
||||
if (host && port && name && user && pass) {
|
||||
return `postgresql://${user}:${pass}@${host}:${port}/${name}`;
|
||||
}
|
||||
|
||||
// Fallback to DATABASE_URL for local development
|
||||
if (process.env.DATABASE_URL) {
|
||||
return process.env.DATABASE_URL;
|
||||
}
|
||||
|
||||
throw new Error('Missing database connection environment variables');
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// MODE: PAYLOAD (existing behavior)
|
||||
// ============================================================
|
||||
|
||||
/**
 * Payload mode: drain raw_payloads into the canonical tables.
 *
 * Dispatch, in order of precedence:
 *   --payload=<id>  process one specific payload
 *   --reprocess     retry previously failed payloads
 *   --loop          run a continuous worker loop (SIGINT stops it)
 *   (default)       run a single batch of size args.limit
 *
 * Prints payload stats up front; tolerates a missing raw_payloads table.
 */
async function runPayloadMode(pool: Pool, args: CliArgs): Promise<void> {
  console.log('='.repeat(60));
  console.log('HYDRATION - PAYLOAD MODE');
  console.log('='.repeat(60));
  console.log(`Dry run: ${args.dryRun}`);
  console.log(`Batch size: ${args.limit}`);
  console.log('');

  // Show current stats
  try {
    const stats = await getPayloadStats(pool);
    console.log('Current payload stats:');
    console.log(`  Total: ${stats.total}`);
    console.log(`  Processed: ${stats.processed}`);
    console.log(`  Unprocessed: ${stats.unprocessed}`);
    console.log(`  Failed: ${stats.failed}`);
    console.log('');
  } catch {
    // Best-effort: stats are informational only.
    console.log('Note: raw_payloads table not found or empty');
    console.log('');
  }

  if (args.payloadId) {
    // Process specific payload
    console.log(`Processing payload: ${args.payloadId}`);
    const result = await processPayloadById(pool, args.payloadId, { dryRun: args.dryRun });
    console.log('Result:', JSON.stringify(result, null, 2));
  } else if (args.reprocess) {
    // Reprocess failed payloads
    console.log('Reprocessing failed payloads...');
    const result = await reprocessFailedPayloads(pool, { dryRun: args.dryRun, batchSize: args.limit });
    console.log('Result:', JSON.stringify(result, null, 2));
  } else if (args.loop) {
    // Run continuous loop; SIGINT requests a graceful stop.
    const worker = new HydrationWorker(pool, { dryRun: args.dryRun, batchSize: args.limit });

    process.on('SIGINT', () => {
      console.log('\nStopping hydration loop...');
      worker.stop();
    });

    await worker.runLoop(30000);
  } else {
    // Run single batch
    const result = await runHydrationBatch(pool, { dryRun: args.dryRun, batchSize: args.limit });
    console.log('Batch result:');
    console.log(`  Payloads processed: ${result.payloadsProcessed}`);
    console.log(`  Payloads failed: ${result.payloadsFailed}`);
    console.log(`  Products upserted: ${result.totalProductsUpserted}`);
    console.log(`  Snapshots created: ${result.totalSnapshotsCreated}`);
    console.log(`  Brands created: ${result.totalBrandsCreated}`);
    console.log(`  Duration: ${result.durationMs}ms`);

    if (result.errors.length > 0) {
      console.log('\nErrors:');
      for (const err of result.errors.slice(0, 10)) {
        console.log(`  ${err.payloadId}: ${err.error}`);
      }
    }
  }
}
|
||||
|
||||
// ============================================================
|
||||
// MODE: BACKFILL (legacy dutchie_* → canonical)
|
||||
// ============================================================
|
||||
|
||||
async function runBackfillMode(pool: Pool, args: CliArgs): Promise<void> {
|
||||
console.log('='.repeat(60));
|
||||
console.log('HYDRATION - BACKFILL MODE');
|
||||
console.log('='.repeat(60));
|
||||
console.log(`Mode: ${args.dryRun ? 'DRY RUN' : 'LIVE'}`);
|
||||
if (args.store) {
|
||||
console.log(`Store: ${args.store}`);
|
||||
}
|
||||
if (args.startFrom) {
|
||||
console.log(`Start from product ID: ${args.startFrom}`);
|
||||
}
|
||||
console.log('');
|
||||
|
||||
await runLegacyBackfill(pool, {
|
||||
dryRun: args.dryRun,
|
||||
verbose: args.verbose,
|
||||
dispensaryId: args.store,
|
||||
startFromProductId: args.startFrom,
|
||||
});
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// MODE: SYNC (recent crawls → canonical)
|
||||
// ============================================================
|
||||
|
||||
/**
 * Sync mode: push crawls from the recent time window into the canonical
 * tables via syncRecentCrawls, then print a summary (first 10 errors shown).
 *
 * @param args.since time window such as "30 minutes" (default "1 hour").
 */
async function runSyncMode(pool: Pool, args: CliArgs): Promise<void> {
  const since = args.since || '1 hour';

  console.log('='.repeat(60));
  console.log('HYDRATION - SYNC MODE');
  console.log('='.repeat(60));
  console.log(`Mode: ${args.dryRun ? 'DRY RUN' : 'LIVE'}`);
  console.log(`Since: ${since}`);
  console.log(`Limit: ${args.limit}`);
  if (args.store) {
    console.log(`Store: ${args.store}`);
  }
  console.log('');

  const result = await syncRecentCrawls(pool, {
    dryRun: args.dryRun,
    verbose: args.verbose,
    since,
    dispensaryId: args.store,
    limit: args.limit,
  });

  console.log('');
  console.log('=== Sync Results ===');
  console.log(`Crawls synced: ${result.synced}`);
  console.log(`Errors: ${result.errors.length}`);

  if (result.errors.length > 0) {
    console.log('');
    console.log('Errors:');
    // Cap output at 10 errors to keep the console readable.
    for (const error of result.errors.slice(0, 10)) {
      console.log(`  - ${error}`);
    }
    if (result.errors.length > 10) {
      console.log(`  ... and ${result.errors.length - 10} more`);
    }
  }
}
|
||||
|
||||
// ============================================================
|
||||
// MODE: STATUS
|
||||
// ============================================================
|
||||
|
||||
/**
 * Status mode: read-only report of hydration progress.
 *
 * Sections, each tolerant of missing tables/views:
 *   - v_hydration_status view (requires migration 052)
 *   - canonical table row counts
 *   - legacy table row counts
 *   - crawl_runs activity in the last 24h, grouped by status
 *   - raw_payloads processed/unprocessed/failed counts
 */
async function runStatusMode(pool: Pool): Promise<void> {
  console.log('='.repeat(60));
  console.log('HYDRATION STATUS');
  console.log('='.repeat(60));
  console.log('');

  // Check if v_hydration_status view exists
  const viewExists = await pool.query(`
    SELECT EXISTS (
      SELECT 1 FROM pg_views WHERE viewname = 'v_hydration_status'
    ) as exists
  `);

  if (viewExists.rows[0].exists) {
    const { rows } = await pool.query('SELECT * FROM v_hydration_status');
    console.log('Hydration Progress:');
    console.log('-'.repeat(70));
    console.log(
      'Table'.padEnd(30) +
      'Source'.padEnd(12) +
      'Hydrated'.padEnd(12) +
      'Progress'
    );
    console.log('-'.repeat(70));

    for (const row of rows) {
      const progress = row.hydration_pct ? `${row.hydration_pct}%` : 'N/A';
      console.log(
        row.source_table.padEnd(30) +
        String(row.source_count).padEnd(12) +
        String(row.hydrated_count).padEnd(12) +
        progress
      );
    }
    console.log('-'.repeat(70));
  } else {
    console.log('Note: v_hydration_status view not found. Run migration 052 first.');
  }

  // Get counts from canonical tables
  console.log('\nCanonical Table Counts:');
  console.log('-'.repeat(40));

  const tables = ['store_products', 'store_product_snapshots', 'crawl_runs'];
  for (const table of tables) {
    try {
      // Table names come from the fixed list above, not user input.
      const { rows } = await pool.query(`SELECT COUNT(*) as cnt FROM ${table}`);
      console.log(`${table}: ${rows[0].cnt}`);
    } catch {
      console.log(`${table}: (table not found)`);
    }
  }

  // Get legacy table counts
  console.log('\nLegacy Table Counts:');
  console.log('-'.repeat(40));

  const legacyTables = ['dutchie_products', 'dutchie_product_snapshots', 'dispensary_crawl_jobs'];
  for (const table of legacyTables) {
    try {
      const { rows } = await pool.query(`SELECT COUNT(*) as cnt FROM ${table}`);
      console.log(`${table}: ${rows[0].cnt}`);
    } catch {
      console.log(`${table}: (table not found)`);
    }
  }

  // Show recent sync activity
  console.log('\nRecent Crawl Runs (last 24h):');
  console.log('-'.repeat(40));

  try {
    const { rows } = await pool.query(`
      SELECT status, COUNT(*) as count
      FROM crawl_runs
      WHERE started_at > NOW() - INTERVAL '24 hours'
      GROUP BY status
      ORDER BY count DESC
    `);

    if (rows.length === 0) {
      console.log('No crawl runs in last 24 hours');
    } else {
      for (const row of rows) {
        console.log(`${row.status}: ${row.count}`);
      }
    }
  } catch {
    console.log('(crawl_runs table not found)');
  }

  // Payload stats
  console.log('\nPayload Hydration:');
  console.log('-'.repeat(40));

  try {
    const stats = await getPayloadStats(pool);
    console.log(`Total payloads: ${stats.total}`);
    console.log(`Processed: ${stats.processed}`);
    console.log(`Unprocessed: ${stats.unprocessed}`);
    console.log(`Failed: ${stats.failed}`);
  } catch {
    console.log('(raw_payloads table not found)');
  }
}
|
||||
|
||||
// ============================================================
|
||||
// HELP
|
||||
// ============================================================
|
||||
|
||||
/**
 * Print usage information for the unified hydration CLI to stdout.
 * Covers all four modes (payload, backfill, sync, status) and their
 * mode-specific flags; called for --help/-h and on unknown mode.
 */
function showHelp(): void {
  console.log(`
Unified Hydration CLI

Usage:
  npx tsx src/scripts/run-hydration.ts --mode=<mode> [options]

Modes:
  payload     Process raw_payloads → canonical tables (default)
  backfill    Migrate dutchie_* → canonical tables
  sync        Sync recent crawls to canonical tables
  status      Show hydration progress

Common Options:
  --dry-run          Print changes without modifying database
  --verbose, -v      Show detailed progress
  --store=<id>       Limit to a single dispensary
  --limit=<n>        Batch size (default: 50)

Payload Mode Options:
  --loop             Run continuous hydration loop
  --reprocess        Reprocess failed payloads
  --payload=<id>     Process a specific payload by ID

Backfill Mode Options:
  --start-from=<id>  Resume from a specific product ID

Sync Mode Options:
  --since=<interval> Time window (default: "1 hour")
                     Examples: "30 minutes", "2 hours", "1 day"

Examples:
  # Full legacy backfill (dutchie_* → canonical)
  npx tsx src/scripts/run-hydration.ts --mode=backfill

  # Backfill single dispensary (dry run)
  npx tsx src/scripts/run-hydration.ts --mode=backfill --store=123 --dry-run

  # Sync recent crawls from last 4 hours
  npx tsx src/scripts/run-hydration.ts --mode=sync --since="4 hours"

  # Sync single dispensary
  npx tsx src/scripts/run-hydration.ts --mode=sync --store=123

  # Run payload hydration loop
  npx tsx src/scripts/run-hydration.ts --mode=payload --loop

  # Check hydration status
  npx tsx src/scripts/run-hydration.ts --mode=status
`);
}
|
||||
|
||||
// ============================================================
|
||||
// MAIN
|
||||
// ============================================================
|
||||
|
||||
async function main(): Promise<void> {
|
||||
const rawArgs = process.argv.slice(2);
|
||||
|
||||
if (rawArgs.includes('--help') || rawArgs.includes('-h')) {
|
||||
showHelp();
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
const args = parseArgs();
|
||||
|
||||
const pool = new Pool({
|
||||
connectionString: getConnectionString(),
|
||||
max: 5,
|
||||
});
|
||||
|
||||
try {
|
||||
// Verify connection
|
||||
await pool.query('SELECT 1');
|
||||
console.log('Database connection: OK\n');
|
||||
|
||||
switch (args.mode) {
|
||||
case 'payload':
|
||||
await runPayloadMode(pool, args);
|
||||
break;
|
||||
|
||||
case 'backfill':
|
||||
await runBackfillMode(pool, args);
|
||||
break;
|
||||
|
||||
case 'sync':
|
||||
await runSyncMode(pool, args);
|
||||
break;
|
||||
|
||||
case 'status':
|
||||
await runStatusMode(pool);
|
||||
break;
|
||||
|
||||
default:
|
||||
console.error(`Unknown mode: ${args.mode}`);
|
||||
showHelp();
|
||||
process.exit(1);
|
||||
}
|
||||
} catch (error: any) {
|
||||
console.error('Error:', error.message);
|
||||
process.exit(1);
|
||||
} finally {
|
||||
await pool.end();
|
||||
}
|
||||
}
|
||||
|
||||
main();
|
||||
@@ -1,277 +0,0 @@
|
||||
#!/usr/bin/env npx tsx
|
||||
/**
|
||||
* Test Script: Crawl a single dispensary and write to canonical tables
|
||||
*
|
||||
* This script:
|
||||
* 1. Fetches products from Dutchie GraphQL
|
||||
* 2. Normalizes via DutchieNormalizer
|
||||
* 3. Writes to store_products, product_variants, snapshots via hydrateToCanonical
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx src/scripts/test-crawl-to-canonical.ts <dispensaryId>
|
||||
* npx tsx src/scripts/test-crawl-to-canonical.ts 235
|
||||
*/
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import dotenv from 'dotenv';
|
||||
import {
|
||||
executeGraphQL,
|
||||
GRAPHQL_HASHES,
|
||||
DUTCHIE_CONFIG,
|
||||
} from '../platforms/dutchie';
|
||||
import {
|
||||
DutchieNormalizer,
|
||||
hydrateToCanonical,
|
||||
} from '../hydration';
|
||||
import { initializeImageStorage } from '../utils/image-storage';
|
||||
|
||||
dotenv.config();
|
||||
|
||||
// ============================================================
|
||||
// DATABASE CONNECTION
|
||||
// ============================================================
|
||||
|
||||
function getConnectionString(): string {
|
||||
if (process.env.CANNAIQ_DB_URL) {
|
||||
return process.env.CANNAIQ_DB_URL;
|
||||
}
|
||||
if (process.env.DATABASE_URL) {
|
||||
return process.env.DATABASE_URL;
|
||||
}
|
||||
const host = process.env.CANNAIQ_DB_HOST || 'localhost';
|
||||
const port = process.env.CANNAIQ_DB_PORT || '54320';
|
||||
const name = process.env.CANNAIQ_DB_NAME || 'dutchie_menus';
|
||||
const user = process.env.CANNAIQ_DB_USER || 'dutchie';
|
||||
const pass = process.env.CANNAIQ_DB_PASS || 'dutchie_local_pass';
|
||||
return `postgresql://${user}:${pass}@${host}:${port}/${name}`;
|
||||
}
|
||||
|
||||
// Shared connection pool for this script; closed in main()'s finally block.
const pool = new Pool({ connectionString: getConnectionString() });
|
||||
|
||||
// ============================================================
|
||||
// FETCH PRODUCTS FROM DUTCHIE
|
||||
// ============================================================
|
||||
|
||||
/** Aggregate result of paging through Dutchie's FilteredProducts query. */
interface FetchResult {
  // Raw product objects exactly as returned by the GraphQL endpoint.
  products: any[];
  // Page count derived from totalCount / DUTCHIE_CONFIG.perPage.
  totalPages: number;
  // queryInfo.totalCount as reported by the API (0 if absent).
  totalProducts: number;
}
|
||||
|
||||
async function fetchAllProducts(platformDispensaryId: string, cName: string): Promise<FetchResult> {
|
||||
const allProducts: any[] = [];
|
||||
let page = 0;
|
||||
let totalPages = 1;
|
||||
let totalProducts = 0;
|
||||
|
||||
console.log(`[Fetch] Starting fetch for ${platformDispensaryId} (cName: ${cName})`);
|
||||
|
||||
while (page < totalPages && page < DUTCHIE_CONFIG.maxPages) {
|
||||
const variables = {
|
||||
includeEnterpriseSpecials: false,
|
||||
productsFilter: {
|
||||
dispensaryId: platformDispensaryId,
|
||||
pricingType: 'rec',
|
||||
Status: 'Active', // 'Active' = in-stock products with pricing
|
||||
types: [],
|
||||
useCache: true,
|
||||
isDefaultSort: true,
|
||||
sortBy: 'popularSortIdx',
|
||||
sortDirection: 1,
|
||||
bypassOnlineThresholds: true,
|
||||
isKioskMenu: false,
|
||||
removeProductsBelowOptionThresholds: false,
|
||||
},
|
||||
page,
|
||||
perPage: DUTCHIE_CONFIG.perPage,
|
||||
};
|
||||
|
||||
try {
|
||||
const result = await executeGraphQL(
|
||||
'FilteredProducts',
|
||||
variables,
|
||||
GRAPHQL_HASHES.FilteredProducts,
|
||||
{ cName, maxRetries: 3 }
|
||||
);
|
||||
|
||||
const data = result?.data?.filteredProducts;
|
||||
if (!data) {
|
||||
console.error(`[Fetch] No data returned for page ${page}`);
|
||||
break;
|
||||
}
|
||||
|
||||
const products = data.products || [];
|
||||
totalProducts = data.queryInfo?.totalCount || 0;
|
||||
totalPages = Math.ceil(totalProducts / DUTCHIE_CONFIG.perPage);
|
||||
|
||||
allProducts.push(...products);
|
||||
console.log(`[Fetch] Page ${page + 1}/${totalPages}: ${products.length} products (total so far: ${allProducts.length})`);
|
||||
|
||||
page++;
|
||||
|
||||
if (page < totalPages) {
|
||||
await new Promise(r => setTimeout(r, DUTCHIE_CONFIG.pageDelayMs));
|
||||
}
|
||||
} catch (error: any) {
|
||||
console.error(`[Fetch] Error on page ${page}: ${error.message}`);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return { products: allProducts, totalPages, totalProducts };
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// MAIN
|
||||
// ============================================================
|
||||
|
||||
/**
 * Entry point: crawl a single dispensary from Dutchie and hydrate the
 * results into the canonical tables (store_products, product_variants,
 * store_product_snapshots) via hydrateToCanonical.
 *
 * Expects a numeric dispensary ID as the first CLI argument. Exits with
 * code 1 on missing/invalid input or any failure; code 0 when the
 * dispensary yields no products. The pool is always closed on exit.
 */
async function main() {
  // parseInt yields NaN for non-numeric input; NaN (and 0) fail the guard below.
  const dispensaryId = parseInt(process.argv[2], 10);

  if (!dispensaryId) {
    console.error('Usage: npx tsx src/scripts/test-crawl-to-canonical.ts <dispensaryId>');
    console.error('Example: npx tsx src/scripts/test-crawl-to-canonical.ts 235');
    process.exit(1);
  }

  console.log('============================================================');
  console.log(`Test Crawl to Canonical - Dispensary ${dispensaryId}`);
  console.log('============================================================\n');

  // Initialize image storage (must happen before hydration persists images).
  console.log('[Init] Initializing image storage...');
  await initializeImageStorage();
  console.log(' Image storage ready\n');

  try {
    // Step 1: Get dispensary info from the local DB.
    console.log('[Step 1] Getting dispensary info...');
    const dispResult = await pool.query(`
      SELECT id, name, platform_dispensary_id, menu_url
      FROM dispensaries
      WHERE id = $1
    `, [dispensaryId]);

    if (dispResult.rows.length === 0) {
      throw new Error(`Dispensary ${dispensaryId} not found`);
    }

    const disp = dispResult.rows[0];
    console.log(` Name: ${disp.name}`);
    console.log(` Platform ID: ${disp.platform_dispensary_id}`);
    console.log(` Menu URL: ${disp.menu_url}`);

    // platform_dispensary_id is the Dutchie-side ID required by the GraphQL API.
    if (!disp.platform_dispensary_id) {
      throw new Error('Dispensary does not have a platform_dispensary_id');
    }

    // Extract cName (Dutchie menu slug) from menu_url; falls back to a generic slug.
    const cNameMatch = disp.menu_url?.match(/\/(?:embedded-menu|dispensary)\/([^/?]+)/);
    const cName = cNameMatch ? cNameMatch[1] : 'dispensary';
    console.log(` cName: ${cName}\n`);

    // Step 2: Fetch products from Dutchie.
    console.log('[Step 2] Fetching products from Dutchie GraphQL...');
    const fetchResult = await fetchAllProducts(disp.platform_dispensary_id, cName);
    console.log(` Total products fetched: ${fetchResult.products.length}\n`);

    if (fetchResult.products.length === 0) {
      console.log('No products fetched. Exiting.');
      process.exit(0);
    }

    // Step 3: Normalize the raw GraphQL products.
    console.log('[Step 3] Normalizing products...');
    const normalizer = new DutchieNormalizer();

    // Construct a RawPayload structure that the normalizer expects
    // The normalizer.normalize() expects: { raw_json, dispensary_id, ... }
    const rawPayloadForValidation = {
      products: fetchResult.products,
      queryInfo: {
        totalCount: fetchResult.totalProducts,
      },
    };

    const validation = normalizer.validatePayload(rawPayloadForValidation);
    if (!validation.valid) {
      console.error(` Validation failed: ${validation.errors?.join(', ')}`);
      process.exit(1);
    }
    console.log(` Validation: PASS`);

    // Build proper RawPayload for normalize()
    // NOTE(review): this mirrors a raw_payloads row; crawl_run_id is null
    // because this test run is not tied to a crawl_runs record.
    const rawPayload = {
      id: `test-${Date.now()}`,
      dispensary_id: dispensaryId,
      crawl_run_id: null,
      platform: 'dutchie',
      payload_version: 1,
      raw_json: rawPayloadForValidation,
      product_count: fetchResult.totalProducts,
      pricing_type: 'rec',
      crawl_mode: 'active',
      fetched_at: new Date(),
      processed: false,
      normalized_at: null,
      hydration_error: null,
      hydration_attempts: 0,
      created_at: new Date(),
    };

    const normResult = normalizer.normalize(rawPayload);
    console.log(` Normalized products: ${normResult.products.length}`);
    console.log(` Brands extracted: ${normResult.brands.length}`);
    console.log(` Sample product: ${normResult.products[0]?.name}\n`);

    // Step 4: Write to canonical tables.
    console.log('[Step 4] Writing to canonical tables via hydrateToCanonical...');
    const hydrateResult = await hydrateToCanonical(
      pool,
      dispensaryId,
      normResult,
      null // no crawl_run_id for this test
    );

    console.log(` Products upserted: ${hydrateResult.productsUpserted}`);
    console.log(` Products new: ${hydrateResult.productsNew}`);
    console.log(` Snapshots created: ${hydrateResult.snapshotsCreated}`);
    console.log(` Variants upserted: ${hydrateResult.variantsUpserted}`);
    console.log(` Brands created: ${hydrateResult.brandsCreated}\n`);

    // Step 5: Verify row counts actually landed in the canonical tables.
    console.log('[Step 5] Verifying data in canonical tables...');

    const productCount = await pool.query(`
      SELECT COUNT(*) as count FROM store_products WHERE dispensary_id = $1
    `, [dispensaryId]);
    console.log(` store_products count: ${productCount.rows[0].count}`);

    const variantCount = await pool.query(`
      SELECT COUNT(*) as count FROM product_variants WHERE dispensary_id = $1
    `, [dispensaryId]);
    console.log(` product_variants count: ${variantCount.rows[0].count}`);

    const snapshotCount = await pool.query(`
      SELECT COUNT(*) as count FROM store_product_snapshots WHERE dispensary_id = $1
    `, [dispensaryId]);
    console.log(` store_product_snapshots count: ${snapshotCount.rows[0].count}`);

    console.log('\n============================================================');
    console.log('SUCCESS - Crawl and hydration complete!');
    console.log('============================================================');

  } catch (error: any) {
    console.error('\n============================================================');
    console.error('ERROR:', error.message);
    console.error('============================================================');
    if (error.stack) {
      console.error(error.stack);
    }
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();
|
||||
Reference in New Issue
Block a user