feat: AZ dispensary harmonization with Dutchie source of truth

Major changes:
- Add harmonize-az-dispensaries.ts script to sync dispensaries with Dutchie API
- Add migration 057 for crawl_enabled and dutchie_verified fields
- Remove legacy dutchie-az module (replaced by platforms/dutchie)
- Clean up deprecated crawlers, scrapers, and orchestrator code
- Update location-discovery to not fall back to slug when ID is missing
- Add crawl-rotator service for proxy rotation
- Add types/index.ts for shared type definitions
- Add woodpecker-agent k8s manifest

Harmonization script:
- Queries ConsumerDispensaries API for all 32 AZ cities
- Matches dispensaries by platform_dispensary_id (not slug)
- Updates existing records with full Dutchie data
- Creates new records for unmatched Dutchie dispensaries
- Disables dispensaries not found in Dutchie

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-08 10:19:49 -07:00
parent 948a732dd5
commit b7cfec0770
112 changed files with 3163 additions and 34694 deletions

View File

@@ -905,12 +905,13 @@ async function backfillProducts(
let crawlRunId = crawlRunCache.get(dayKey);
if (!crawlRunId && !options.dryRun) {
crawlRunId = await getOrCreateBackfillCrawlRun(
const newCrawlRunId = await getOrCreateBackfillCrawlRun(
pool,
product.dispensary_id,
capturedAt,
options.dryRun
);
crawlRunId = newCrawlRunId ?? undefined;
if (crawlRunId) {
crawlRunCache.set(dayKey, crawlRunId);
stats.crawlRunsCreated++;

View File

@@ -212,7 +212,7 @@ EXAMPLES:
try {
// Fetch all stores without a dispensary_id
const storesResult = await pool.query<Store>(`
const storesResult = await pool.query(`
SELECT id, name, slug, dispensary_id
FROM stores
WHERE dispensary_id IS NULL
@@ -221,7 +221,7 @@ EXAMPLES:
const unmappedStores = storesResult.rows;
// Fetch all already-mapped stores for context
const mappedResult = await pool.query<Store>(`
const mappedResult = await pool.query(`
SELECT id, name, slug, dispensary_id
FROM stores
WHERE dispensary_id IS NOT NULL
@@ -230,7 +230,7 @@ EXAMPLES:
const mappedStores = mappedResult.rows;
// Fetch all dispensaries
const dispResult = await pool.query<Dispensary>(`
const dispResult = await pool.query(`
SELECT id, name, company_name, city, address, slug
FROM dispensaries
ORDER BY name

View File

@@ -1,388 +0,0 @@
#!/usr/bin/env npx tsx
/**
* Bootstrap Discovery Script
*
* One-time (but reusable) bootstrap command that:
* 1. Ensures every Dispensary has a dispensary_crawl_schedule entry (4h default)
* 2. Optionally runs RunDispensaryOrchestrator for each dispensary
*
* Usage:
* npx tsx src/scripts/bootstrap-discovery.ts # Create schedules only
* npx tsx src/scripts/bootstrap-discovery.ts --run # Create schedules + run orchestrator
* npx tsx src/scripts/bootstrap-discovery.ts --run --limit=10 # Run for first 10 dispensaries
* npx tsx src/scripts/bootstrap-discovery.ts --dry-run # Preview what would happen
* npx tsx src/scripts/bootstrap-discovery.ts --status # Show current status only
*/
import { pool } from '../db/pool';
import {
ensureAllDispensariesHaveSchedules,
runDispensaryOrchestrator,
runBatchDispensaryOrchestrator,
getDispensariesDueForOrchestration,
} from '../services/dispensary-orchestrator';
// Parse command line args
const args = process.argv.slice(2);

/**
 * Reads an integer-valued `--<name>=<value>` flag from `args`.
 *
 * Returns `fallback` when the flag is absent, has an empty value, or does not
 * parse as a base-10 integer. This keeps a typo such as `--concurrency=abc`
 * from turning into `NaN` and being handed to the orchestrator or used as the
 * schedule interval.
 */
function parseIntFlag(name: string, fallback: number): number {
  const prefix = `--${name}=`;
  const raw = args.find(a => a.startsWith(prefix))?.slice(prefix.length) ?? '';
  const parsed = Number.parseInt(raw, 10);
  return Number.isNaN(parsed) ? fallback : parsed;
}

/** Parsed CLI options; boolean flags are true when the switch is present. */
const flags = {
  run: args.includes('--run'),
  dryRun: args.includes('--dry-run'),
  status: args.includes('--status'),
  help: args.includes('--help') || args.includes('-h'),
  // 0 means "no limit" (see getDispensariesToProcess).
  limit: parseIntFlag('limit', 0),
  concurrency: parseIntFlag('concurrency', 3),
  // Minutes between runs for newly created schedules.
  interval: parseIntFlag('interval', 240),
  detectionOnly: args.includes('--detection-only'),
  productionOnly: args.includes('--production-only'),
  sandboxOnly: args.includes('--sandbox-only'),
};
/**
 * Writes the CLI usage text (options, examples and a summary of what the
 * script does) to stdout in a single console.log call.
 */
async function showHelp() {
  const helpText = `
Bootstrap Discovery - Initialize Dispensary Crawl System
USAGE:
npx tsx src/scripts/bootstrap-discovery.ts [OPTIONS]
OPTIONS:
--run After creating schedules, run the orchestrator for each dispensary
--dry-run Show what would happen without making changes
--status Show current status and exit
--limit=N Limit how many dispensaries to process (0 = all, default: 0)
--concurrency=N How many dispensaries to process in parallel (default: 3)
--interval=M Default interval in minutes for new schedules (default: 240 = 4 hours)
--detection-only Only run detection, don't crawl
--production-only Only run dispensaries in production mode
--sandbox-only Only run dispensaries in sandbox mode
--help, -h Show this help message
EXAMPLES:
# Create schedule entries for all dispensaries (no crawling)
npx tsx src/scripts/bootstrap-discovery.ts
# Create schedules and run orchestrator for all dispensaries
npx tsx src/scripts/bootstrap-discovery.ts --run
# Run orchestrator for first 10 dispensaries
npx tsx src/scripts/bootstrap-discovery.ts --run --limit=10
# Run with higher concurrency
npx tsx src/scripts/bootstrap-discovery.ts --run --concurrency=5
# Show current status
npx tsx src/scripts/bootstrap-discovery.ts --status
WHAT IT DOES:
1. Creates dispensary_crawl_schedule entries for all dispensaries that don't have one
2. If --run: For each dispensary, runs the orchestrator which:
a. Checks if provider detection is needed (null/unknown/stale/low confidence)
b. Runs detection if needed
c. If Dutchie + production mode: runs production crawl
d. Otherwise: runs sandbox crawl
3. Updates schedule status and job records
`;
  console.log(helpText);
}
/**
 * Prints a three-part status report to stdout:
 *   1. dispensary counts per product provider, split by crawler mode,
 *   2. crawl-schedule coverage and last-run status,
 *   3. crawl-job totals for the last 24 hours.
 *
 * Only SELECT statements are issued through `pool`; nothing is modified.
 */
async function showStatus() {
console.log('\n📊 Current Dispensary Crawl Status\n');
console.log('═'.repeat(70));
// Get dispensary counts by provider
// Dispensaries with a NULL product_provider are grouped under 'undetected'.
const providerStats = await pool.query(`
SELECT
COALESCE(product_provider, 'undetected') as provider,
COUNT(*) as count,
COUNT(*) FILTER (WHERE product_crawler_mode = 'production') as production,
COUNT(*) FILTER (WHERE product_crawler_mode = 'sandbox') as sandbox,
COUNT(*) FILTER (WHERE product_crawler_mode IS NULL) as no_mode
FROM dispensaries
GROUP BY COALESCE(product_provider, 'undetected')
ORDER BY count DESC
`);
console.log('\nProvider Distribution:');
console.log('-'.repeat(60));
// Fixed-width header; the pad widths below must match the row formatting.
console.log(
'Provider'.padEnd(20) +
'Total'.padStart(8) +
'Production'.padStart(12) +
'Sandbox'.padStart(10) +
'No Mode'.padStart(10)
);
console.log('-'.repeat(60));
for (const row of providerStats.rows) {
console.log(
row.provider.padEnd(20) +
row.count.toString().padStart(8) +
row.production.toString().padStart(12) +
row.sandbox.toString().padStart(10) +
row.no_mode.toString().padStart(10)
);
}
// Get schedule stats
// LEFT JOIN keeps dispensaries that have no schedule row, which is what makes
// the with_schedule / without_schedule split possible.
const scheduleStats = await pool.query(`
SELECT
COUNT(DISTINCT d.id) as total_dispensaries,
COUNT(DISTINCT dcs.id) as with_schedule,
COUNT(DISTINCT d.id) - COUNT(DISTINCT dcs.id) as without_schedule,
COUNT(*) FILTER (WHERE dcs.is_active = TRUE) as active_schedules,
COUNT(*) FILTER (WHERE dcs.last_status = 'success') as last_success,
COUNT(*) FILTER (WHERE dcs.last_status = 'error') as last_error,
COUNT(*) FILTER (WHERE dcs.last_status = 'sandbox_only') as last_sandbox,
COUNT(*) FILTER (WHERE dcs.last_status = 'detection_only') as last_detection,
COUNT(*) FILTER (WHERE dcs.next_run_at <= NOW()) as due_now,
AVG(dcs.interval_minutes)::INTEGER as avg_interval
FROM dispensaries d
LEFT JOIN dispensary_crawl_schedule dcs ON dcs.dispensary_id = d.id
`);
// Aggregate query without GROUP BY: the report reads the first (only) row.
const s = scheduleStats.rows[0];
console.log('\n\nSchedule Status:');
console.log('-'.repeat(60));
console.log(` Total Dispensaries: ${s.total_dispensaries}`);
console.log(` With Schedule: ${s.with_schedule}`);
console.log(` Without Schedule: ${s.without_schedule}`);
console.log(` Active Schedules: ${s.active_schedules || 0}`);
// Falls back to 240 when avg_interval is falsy (e.g. there are no schedule rows yet).
console.log(` Average Interval: ${s.avg_interval || 240} minutes`);
console.log('\n Last Run Status:');
console.log(` - Success: ${s.last_success || 0}`);
console.log(` - Error: ${s.last_error || 0}`);
console.log(` - Sandbox Only: ${s.last_sandbox || 0}`);
console.log(` - Detection Only: ${s.last_detection || 0}`);
console.log(` - Due Now: ${s.due_now || 0}`);
// Get recent job stats
const jobStats = await pool.query(`
SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE status = 'completed') as completed,
COUNT(*) FILTER (WHERE status = 'failed') as failed,
COUNT(*) FILTER (WHERE status = 'running') as running,
COUNT(*) FILTER (WHERE status = 'pending') as pending,
COUNT(*) FILTER (WHERE detection_ran = TRUE) as with_detection,
COUNT(*) FILTER (WHERE crawl_ran = TRUE) as with_crawl,
COUNT(*) FILTER (WHERE crawl_type = 'production') as production_crawls,
COUNT(*) FILTER (WHERE crawl_type = 'sandbox') as sandbox_crawls,
SUM(products_found) as total_products_found
FROM dispensary_crawl_jobs
WHERE created_at > NOW() - INTERVAL '24 hours'
`);
const j = jobStats.rows[0];
console.log('\n\nJobs (Last 24 Hours):');
console.log('-'.repeat(60));
// `|| 0` shows 0 for falsy values (e.g. SUM over zero rows).
console.log(` Total Jobs: ${j.total || 0}`);
console.log(` Completed: ${j.completed || 0}`);
console.log(` Failed: ${j.failed || 0}`);
console.log(` Running: ${j.running || 0}`);
console.log(` Pending: ${j.pending || 0}`);
console.log(` With Detection: ${j.with_detection || 0}`);
console.log(` With Crawl: ${j.with_crawl || 0}`);
console.log(` - Production: ${j.production_crawls || 0}`);
console.log(` - Sandbox: ${j.sandbox_crawls || 0}`);
console.log(` Products Found: ${j.total_products_found || 0}`);
console.log('\n' + '═'.repeat(70) + '\n');
}
/**
 * Makes sure every dispensary has a crawl-schedule entry.
 *
 * In a normal run the work is delegated to
 * `ensureAllDispensariesHaveSchedules(flags.interval)` and its result is
 * returned. In dry-run mode nothing is written: the dispensaries lacking a
 * schedule row are only counted, and `existing` is reported as 0.
 *
 * @returns how many schedule entries were (or would be) created, and how many
 *          dispensaries already had one.
 */
async function createSchedules(): Promise<{ created: number; existing: number }> {
  console.log('\n📅 Creating Dispensary Schedules...\n');

  if (!flags.dryRun) {
    const outcome = await ensureAllDispensariesHaveSchedules(flags.interval);
    console.log(` ✓ Created ${outcome.created} new schedule entries`);
    console.log(`${outcome.existing} dispensaries already had schedules`);
    return outcome;
  }

  // Dry run: count the dispensaries that have no schedule row yet.
  const missing = await pool.query(`
SELECT COUNT(*) as count
FROM dispensaries d
WHERE NOT EXISTS (
SELECT 1 FROM dispensary_crawl_schedule dcs WHERE dcs.dispensary_id = d.id
)
`);
  const pendingCount = parseInt(missing.rows[0].count);
  console.log(` Would create ${pendingCount} new schedule entries (${flags.interval} minute interval)`);
  return { created: pendingCount, existing: 0 };
}
/**
 * Selects the ids of the dispensaries the orchestrator should process.
 *
 * Filters come from the CLI flags: `--production-only` wins over
 * `--sandbox-only`, and `--detection-only` additionally restricts the set to
 * dispensaries whose provider is missing, 'unknown', or below confidence 50.
 * Rows are ordered by schedule priority (highest first), then by oldest /
 * never-run schedule, then by id. `--limit` caps the result when positive.
 */
async function getDispensariesToProcess(): Promise<number[]> {
  // Every condition is AND-ed together; 'TRUE' keeps the clause valid when no filter applies.
  const conditions: string[] = ['TRUE'];

  if (flags.productionOnly) {
    conditions.push(`d.product_crawler_mode = 'production'`);
  } else if (flags.sandboxOnly) {
    conditions.push(`d.product_crawler_mode = 'sandbox'`);
  }

  if (flags.detectionOnly) {
    conditions.push(`(d.product_provider IS NULL OR d.product_provider = 'unknown' OR d.product_confidence < 50)`);
  }

  const whereSql = conditions.join(' AND ');
  const limitSql = flags.limit > 0 ? `LIMIT ${flags.limit}` : '';

  const { rows } = await pool.query(`
SELECT d.id, d.name, d.product_provider, d.product_crawler_mode
FROM dispensaries d
LEFT JOIN dispensary_crawl_schedule dcs ON dcs.dispensary_id = d.id
WHERE ${whereSql}
ORDER BY
COALESCE(dcs.priority, 0) DESC,
dcs.last_run_at ASC NULLS FIRST,
d.id ASC
${limitSql}
`);

  return rows.map(record => record.id);
}
/**
 * Runs the dispensary orchestrator over the dispensaries selected by
 * getDispensariesToProcess() and prints a summary.
 *
 * In dry-run mode it only lists (up to 20 of) the dispensaries that would be
 * processed. Otherwise it hands the ids to runBatchDispensaryOrchestrator with
 * the configured concurrency, then prints aggregate counts and up to 10 errors.
 */
async function runOrchestrator() {
console.log('\n🚀 Running Dispensary Orchestrator...\n');
const dispensaryIds = await getDispensariesToProcess();
if (dispensaryIds.length === 0) {
console.log(' No dispensaries to process.');
return;
}
console.log(` Found ${dispensaryIds.length} dispensaries to process`);
console.log(` Concurrency: ${flags.concurrency}`);
if (flags.dryRun) {
console.log('\n Would process these dispensaries:');
const details = await pool.query(
`SELECT id, name, product_provider, product_crawler_mode
FROM dispensaries WHERE id = ANY($1) ORDER BY id`,
[dispensaryIds]
);
// Only the first 20 are listed; the remainder is summarised as a count.
for (const row of details.rows.slice(0, 20)) {
console.log(` - [${row.id}] ${row.name} (${row.product_provider || 'undetected'}, ${row.product_crawler_mode || 'no mode'})`);
}
if (details.rows.length > 20) {
console.log(` ... and ${details.rows.length - 20} more`);
}
return;
}
console.log('\n Starting batch processing...\n');
const results = await runBatchDispensaryOrchestrator(dispensaryIds, flags.concurrency);
// Summarize results
const summary = {
total: results.length,
success: results.filter(r => r.status === 'success').length,
sandboxOnly: results.filter(r => r.status === 'sandbox_only').length,
detectionOnly: results.filter(r => r.status === 'detection_only').length,
error: results.filter(r => r.status === 'error').length,
detectionsRan: results.filter(r => r.detectionRan).length,
crawlsRan: results.filter(r => r.crawlRan).length,
productionCrawls: results.filter(r => r.crawlType === 'production').length,
sandboxCrawls: results.filter(r => r.crawlType === 'sandbox').length,
totalProducts: results.reduce((sum, r) => sum + (r.productsFound || 0), 0),
// Sum of per-dispensary durations in milliseconds (not wall-clock time of the batch).
totalDuration: results.reduce((sum, r) => sum + r.durationMs, 0),
};
console.log('\n' + '═'.repeat(70));
console.log(' Orchestrator Results');
console.log('═'.repeat(70));
// NOTE(review): the per-dispensary average divides by summary.total; if the
// batch call ever returns an empty array this prints "NaNs" - confirm it
// always returns one result per id.
console.log(`
Total Processed: ${summary.total}
Status:
- Success: ${summary.success}
- Sandbox Only: ${summary.sandboxOnly}
- Detection Only: ${summary.detectionOnly}
- Error: ${summary.error}
Operations:
- Detections Ran: ${summary.detectionsRan}
- Crawls Ran: ${summary.crawlsRan}
- Production: ${summary.productionCrawls}
- Sandbox: ${summary.sandboxCrawls}
Results:
- Products Found: ${summary.totalProducts}
- Total Duration: ${(summary.totalDuration / 1000).toFixed(1)}s
- Avg per Dispensary: ${(summary.totalDuration / summary.total / 1000).toFixed(1)}s
`);
console.log('═'.repeat(70) + '\n');
// Show errors if any
const errors = results.filter(r => r.status === 'error');
if (errors.length > 0) {
console.log('\n⚠ Errors encountered:');
// Cap the detailed listing at 10 entries to keep the output readable.
for (const err of errors.slice(0, 10)) {
console.log(` - [${err.dispensaryId}] ${err.dispensaryName}: ${err.error}`);
}
if (errors.length > 10) {
console.log(` ... and ${errors.length - 10} more errors`);
}
}
}
/**
 * CLI entry point for the bootstrap script.
 *
 * Flow: `--help` prints usage and exits. Otherwise the current status is
 * shown; with `--status` that is all. Without it, schedule entries are
 * created, the orchestrator is optionally run (`--run`), and the status is
 * shown again unless this is a dry run.
 *
 * The pool is closed exactly once, in `finally`, on every path. The process
 * exits with code 1 only after that cleanup, because calling `process.exit`
 * inside `try`/`catch` would skip the `finally` block.
 */
async function main() {
  if (flags.help) {
    await showHelp();
    process.exit(0);
  }

  console.log('\n' + '═'.repeat(70));
  console.log(' Dispensary Crawl Bootstrap Discovery');
  console.log('═'.repeat(70));
  if (flags.dryRun) {
    console.log('\n🔍 DRY RUN MODE - No changes will be made');
  }

  let failed = false;
  try {
    // Always show status first
    await showStatus();

    // In status-only mode everything below is skipped.
    if (!flags.status) {
      // Step 1: Create schedule entries
      await createSchedules();

      // Step 2: Optionally run orchestrator
      if (flags.run) {
        await runOrchestrator();
      } else {
        console.log('\n💡 Tip: Use --run to also run the orchestrator for each dispensary');
      }

      // Show final status
      if (!flags.dryRun) {
        await showStatus();
      }
    }
  } catch (error: unknown) {
    failed = true;
    if (error instanceof Error) {
      console.error('\n❌ Fatal error:', error.message);
      console.error(error.stack);
    } else {
      console.error('\n❌ Fatal error:', error);
    }
  } finally {
    await pool.end();
  }

  // Force a non-zero exit only once the pool has been closed.
  if (failed) {
    process.exit(1);
  }
}

// A rejection can still escape main() (e.g. pool.end() failing); report it
// instead of leaving an unhandled rejection.
main().catch((error: unknown) => {
  console.error('\n❌ Fatal error:', error);
  process.exit(1);
});

View File

@@ -1,101 +0,0 @@
/**
* LOCAL-ONLY Admin Bootstrap Script
*
* Creates or resets a local admin user for development.
* This script is ONLY for local development - never use in production.
*
* Usage:
* cd backend
* npx tsx src/scripts/bootstrap-local-admin.ts
*
* Default credentials:
* Email: admin@local.test
* Password: admin123
*/
import bcrypt from 'bcrypt';
import { query, closePool } from '../dutchie-az/db/connection';
// Local admin credentials - deterministic for dev
// NOTE(review): these are well-known credentials and nothing below checks the
// environment; consider refusing to run outside local development.
const LOCAL_ADMIN_EMAIL = 'admin@local.test';
const LOCAL_ADMIN_PASSWORD = 'admin123';
const LOCAL_ADMIN_ROLE = 'admin'; // Match existing schema (admin, not superadmin)

/**
 * Creates the local admin user, or resets its password and role if a user
 * with LOCAL_ADMIN_EMAIL already exists, then prints the login credentials.
 *
 * On failure the error is reported and `process.exitCode` is set to 1; the
 * process is not terminated from inside `catch`, so the `finally` block still
 * runs and the connection pool is closed on every path.
 */
async function bootstrapLocalAdmin(): Promise<void> {
  console.log('='.repeat(60));
  console.log('LOCAL ADMIN BOOTSTRAP');
  console.log('='.repeat(60));
  console.log('');
  console.log('This script creates/resets a local admin user for development.');
  console.log('');

  try {
    // Hash the password with bcrypt (10 rounds, matching existing code)
    const passwordHash = await bcrypt.hash(LOCAL_ADMIN_PASSWORD, 10);

    // Check if user exists
    const existing = await query<{ id: number; email: string }>(
      'SELECT id, email FROM users WHERE email = $1',
      [LOCAL_ADMIN_EMAIL]
    );

    if (existing.rows.length > 0) {
      // User exists - update password and role
      console.log(`User "${LOCAL_ADMIN_EMAIL}" already exists (id=${existing.rows[0].id})`);
      console.log('Resetting password and ensuring admin role...');
      await query(
        `UPDATE users
SET password_hash = $1,
role = $2,
updated_at = NOW()
WHERE email = $3`,
        [passwordHash, LOCAL_ADMIN_ROLE, LOCAL_ADMIN_EMAIL]
      );
      console.log('User updated successfully.');
    } else {
      // User doesn't exist - create new
      console.log(`Creating new admin user: ${LOCAL_ADMIN_EMAIL}`);
      const result = await query<{ id: number }>(
        `INSERT INTO users (email, password_hash, role, created_at, updated_at)
VALUES ($1, $2, $3, NOW(), NOW())
RETURNING id`,
        [LOCAL_ADMIN_EMAIL, passwordHash, LOCAL_ADMIN_ROLE]
      );
      console.log(`User created successfully (id=${result.rows[0].id})`);
    }

    console.log('');
    console.log('='.repeat(60));
    console.log('LOCAL ADMIN READY');
    console.log('='.repeat(60));
    console.log('');
    console.log('Login credentials:');
    console.log(` Email: ${LOCAL_ADMIN_EMAIL}`);
    console.log(` Password: ${LOCAL_ADMIN_PASSWORD}`);
    console.log('');
    console.log('Admin UI: http://localhost:8080/admin');
    console.log('');
  } catch (error: unknown) {
    // Thrown values are not guaranteed to be Error instances.
    const message = error instanceof Error ? error.message : String(error);
    console.error('');
    console.error('ERROR: Failed to bootstrap local admin');
    console.error(message);
    if (message.includes('relation "users" does not exist')) {
      console.error('');
      console.error('The "users" table does not exist.');
      console.error('Run migrations first: npm run migrate');
    }
    // Record the failure without exiting here, so `finally` can close the pool.
    process.exitCode = 1;
  } finally {
    await closePool();
  }
}

// Run the bootstrap
bootstrapLocalAdmin().catch((error: unknown) => {
  // Only reachable if cleanup itself fails (e.g. closePool() rejects).
  console.error(error);
  process.exitCode = 1;
});

View File

@@ -1,66 +0,0 @@
/**
* Seed crawl: trigger dutchie crawls for all dispensaries with menu_type='dutchie'
* and a resolved platform_dispensary_id. This uses the AZ orchestrator endpoint logic.
*
* Usage (local):
* node dist/scripts/crawl-all-dutchie.js
*
* Requires:
* - DATABASE_URL/CRAWLSY_DATABASE_URL pointing to the consolidated DB
* - Dispensaries table populated with menu_type and platform_dispensary_id
*/
import { query } from '../dutchie-az/db/connection';
import { runDispensaryOrchestrator } from '../services/dispensary-orchestrator';
/**
 * Triggers the dispensary orchestrator, one dispensary at a time, for every
 * dispensary with menu_type 'dutchie' and a non-null platform_dispensary_id,
 * then prints how many runs succeeded and failed.
 *
 * A run counts as successful when the orchestrator reports 'success',
 * 'sandbox_only' or 'detection_only'; any other status, or a thrown error,
 * counts as failed.
 */
async function main() {
  const { rows } = await query<{
    id: number;
    name: string;
    slug: string;
    platform_dispensary_id: string | null;
  }>(`
SELECT id, name, slug, platform_dispensary_id
FROM dispensaries
WHERE menu_type = 'dutchie'
AND platform_dispensary_id IS NOT NULL
ORDER BY id
`);

  if (rows.length === 0) {
    console.log('No dutchie dispensaries with resolved platform_dispensary_id found.');
    process.exit(0);
  }

  console.log(`Found ${rows.length} dutchie dispensaries with resolved IDs. Triggering crawls...`);

  const acceptableStatuses: string[] = ['success', 'sandbox_only', 'detection_only'];
  const tally = { success: 0, failed: 0 };

  for (const dispensary of rows) {
    const label = `${dispensary.id} (${dispensary.name})`;
    try {
      console.log(`Crawling ${label}...`);
      const outcome = await runDispensaryOrchestrator(dispensary.id);
      if (acceptableStatuses.includes(outcome.status)) {
        tally.success += 1;
        continue;
      }
      tally.failed += 1;
      console.warn(`Crawl returned status ${outcome.status} for ${label}`);
    } catch (err: any) {
      tally.failed += 1;
      console.error(`Failed crawl for ${label}: ${err.message}`);
    }
  }

  console.log(`Completed. Success: ${tally.success}, Failed: ${tally.failed}`);
}

// Any error escaping main() is fatal: report it and exit non-zero.
main().catch((fatal) => {
  console.error('Fatal:', fatal);
  process.exit(1);
});

View File

@@ -1,50 +0,0 @@
import { runDispensaryOrchestrator } from '../services/dispensary-orchestrator';
// All 57 dutchie stores with platform_dispensary_id (as of 2024-12)
// Hard-coded snapshot of dispensary ids; it is not refreshed from the database.
const ALL_DISPENSARY_IDS = [
72, 74, 75, 76, 77, 78, 81, 82, 85, 87, 91, 92, 97, 101, 106, 108, 110, 112,
115, 120, 123, 125, 128, 131, 135, 139, 140, 143, 144, 145, 152, 153, 161,
168, 176, 177, 180, 181, 189, 195, 196, 199, 200, 201, 205, 206, 207, 213,
214, 224, 225, 227, 232, 235, 248, 252, 281
];
// Number of ids grouped under one "BATCH" heading in the log output.
const BATCH_SIZE = 5;
/**
 * Crawls every id in ALL_DISPENSARY_IDS through the dispensary orchestrator.
 *
 * Ids are grouped into batches of BATCH_SIZE for progress logging only; within
 * a batch the dispensaries are still crawled one after another.
 *
 * A run is counted as an error when the orchestrator throws or when it
 * returns a result whose status is 'error'. Previously any returned result
 * was counted as a success.
 */
async function run() {
  const totalBatches = Math.ceil(ALL_DISPENSARY_IDS.length / BATCH_SIZE);
  console.log(`Starting crawl of ${ALL_DISPENSARY_IDS.length} stores in ${totalBatches} batches of ${BATCH_SIZE}...`);

  let successCount = 0;
  let errorCount = 0;

  for (let i = 0; i < ALL_DISPENSARY_IDS.length; i += BATCH_SIZE) {
    const batch = ALL_DISPENSARY_IDS.slice(i, i + BATCH_SIZE);
    const batchNum = Math.floor(i / BATCH_SIZE) + 1;
    console.log(`\n========== BATCH ${batchNum}/${totalBatches} (IDs: ${batch.join(', ')}) ==========`);

    for (const id of batch) {
      console.log(`\n--- Crawling dispensary ${id} ---`);
      try {
        const result = await runDispensaryOrchestrator(id);
        console.log(` Status: ${result.status}`);
        console.log(` Summary: ${result.summary}`);
        if (result.productsFound) {
          console.log(` Products: ${result.productsFound} found, ${result.productsNew} new, ${result.productsUpdated} updated`);
        }
        // The orchestrator can report a failure via its status instead of throwing.
        if (result.status === 'error') {
          errorCount++;
        } else {
          successCount++;
        }
      } catch (e: unknown) {
        const message = e instanceof Error ? e.message : String(e);
        console.log(` ERROR: ${message}`);
        errorCount++;
      }
    }

    // The last batch may be short, so clamp the progress counter to the total.
    console.log(`\n--- Batch ${batchNum} complete. Progress: ${Math.min(i + BATCH_SIZE, ALL_DISPENSARY_IDS.length)}/${ALL_DISPENSARY_IDS.length} ---`);
  }

  console.log('\n========================================');
  console.log(`=== ALL CRAWLS COMPLETE ===`);
  console.log(`Success: ${successCount}, Errors: ${errorCount}`);
  console.log('========================================');
}

// Fatal errors go to stderr and mark the process as failed, so callers
// (shell scripts, CI) do not see a crashed run as successful.
run().catch((e: unknown) => {
  console.error('Fatal:', e instanceof Error ? e.message : e);
  process.exitCode = 1;
});

View File

@@ -0,0 +1,114 @@
/**
* Debug Dutchie city page to see what data is available
*/
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
// Register the stealth plugin before any browser is launched.
puppeteer.use(StealthPlugin());
/**
 * Debug helper: opens a Dutchie city page in a headless browser and prints
 * what data is available on it.
 *
 * The URL comes from the first CLI argument, with a Bellevue, WA page as the
 * default. If the page has a `script#__NEXT_DATA__` element its parsed JSON is
 * summarised (top-level keys, pageProps keys, dispensary count and a sample);
 * otherwise the script ids, a few well-known window globals and the start of
 * the page text are printed instead. The browser is always closed.
 */
async function main() {
const cityUrl = process.argv[2] || 'https://dutchie.com/us/dispensaries/wa-bellevue';
console.log(`Debugging page: ${cityUrl}`);
const browser = await puppeteer.launch({
headless: 'new',
args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
});
try {
const page = await browser.newPage();
await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
);
console.log('Navigating...');
await page.goto(cityUrl, {
waitUntil: 'networkidle2',
timeout: 60000,
});
// Fixed 5 s pause after navigation before inspecting the page.
await new Promise((r) => setTimeout(r, 5000));
// Get page title
const title = await page.title();
console.log(`\nPage title: ${title}`);
// Check for Cloudflare challenge
const isCFChallenge = await page.evaluate(() => {
return document.title.includes('Just a moment') ||
document.body.textContent?.includes('Enable JavaScript');
});
if (isCFChallenge) {
console.log('\n⚠ CLOUDFLARE CHALLENGE DETECTED - waiting longer...');
// Waits a further 10 s; the challenge is not re-checked afterwards.
await new Promise((r) => setTimeout(r, 10000));
}
// Check for __NEXT_DATA__
// Result is the parsed JSON, an { error } marker if parsing failed, or null
// when the script tag is absent.
const nextData = await page.evaluate(() => {
const script = document.querySelector('script#__NEXT_DATA__');
if (script) {
try {
return JSON.parse(script.textContent || '{}');
} catch {
return { error: 'Failed to parse __NEXT_DATA__' };
}
}
return null;
});
if (nextData) {
console.log('\n✅ __NEXT_DATA__ found!');
console.log('Keys:', Object.keys(nextData));
if (nextData.props?.pageProps) {
console.log('pageProps keys:', Object.keys(nextData.props.pageProps));
if (nextData.props.pageProps.dispensaries) {
console.log('Dispensaries count:', nextData.props.pageProps.dispensaries.length);
// Show first dispensary structure
const first = nextData.props.pageProps.dispensaries[0];
if (first) {
console.log('\nFirst dispensary keys:', Object.keys(first));
// The sample is truncated to the first 1000 characters of its JSON.
console.log('First dispensary sample:', JSON.stringify(first, null, 2).slice(0, 1000));
}
}
}
} else {
console.log('\n❌ No __NEXT_DATA__ found');
// Check what scripts are on the page
const scripts = await page.evaluate(() => {
return Array.from(document.querySelectorAll('script[id]')).map(s => ({
id: s.id,
src: (s as HTMLScriptElement).src?.slice(0, 100),
}));
});
console.log('Scripts with IDs:', scripts);
// Try to find dispensary data in window object
// For object values only the key names are reported; for anything else, its typeof.
const windowData = await page.evaluate(() => {
const w = window as any;
const keys = ['__NEXT_DATA__', '__PRELOADED_STATE__', '__INITIAL_STATE__',
'dispensaries', '__data', 'pageData', '__remixContext'];
const found: Record<string, any> = {};
for (const key of keys) {
if (w[key]) {
found[key] = typeof w[key] === 'object' ? Object.keys(w[key]) : typeof w[key];
}
}
return found;
});
console.log('Window data:', windowData);
// Get some page content
const bodyText = await page.evaluate(() => document.body.innerText.slice(0, 500));
console.log('\nPage text preview:', bodyText);
}
} finally {
// Close the browser even when navigation or inspection throws.
await browser.close();
}
}
// NOTE(review): errors are only logged here; the exit code stays 0 on failure.
main().catch(console.error);

View File

@@ -0,0 +1,256 @@
/**
* Discover and Import Store Script
*
* Discovers a store from Dutchie by city+state and imports it into the dispensaries table.
* Uses the local API endpoints - does NOT make direct GraphQL calls.
*
* Usage:
* npx tsx src/scripts/discover-and-import-store.ts --city "Adelanto" --state "CA"
* npx tsx src/scripts/discover-and-import-store.ts --city "Phoenix" --state "AZ" --dry-run
* npx tsx src/scripts/discover-and-import-store.ts --city "Los Angeles" --state "CA" --all
*/
// Base URL of the local API; override with the API_BASE environment variable.
const API_BASE = process.env.API_BASE || 'http://localhost:3010';
/**
 * Summary of one city discovery run, as returned by discoverCity() from the
 * `result` field of the discover-city endpoint's response.
 */
interface DiscoveryResult {
cityId: string;
citySlug: string;
locationsFound: number;
locationsUpserted: number;
locationsNew: number;
locationsUpdated: number;
errors: string[];
durationMs: number;
}
/** A discovered location, as returned by getDiscoveredLocations(). */
interface DiscoveryLocation {
id: number;
name: string;
city: string;
stateCode: string;
platformSlug: string;
platformLocationId: string;
platformMenuUrl: string;
status: string;
}
/** A store record as returned by the /api/stores endpoints used in this script. */
interface Store {
id: number;
name: string;
slug: string;
city: string;
state: string;
menu_url: string;
platform_dispensary_id: string;
}
/**
 * Asks the local API to discover the Dutchie stores of one city.
 *
 * The city name is trimmed, lower-cased and whitespace runs are replaced by
 * '-' to form the slug that is sent to the discover-city endpoint.
 *
 * @param city  Human-readable city name, e.g. "Los Angeles".
 * @param state State code sent as `stateCode`, e.g. "CA".
 * @returns the discovery result, or `null` when the request cannot be sent,
 *          the API answers with a non-2xx status, the body is not valid JSON,
 *          or the body does not report success with a `result` object. Each
 *          failure is logged to stderr.
 */
async function discoverCity(city: string, state: string): Promise<DiscoveryResult | null> {
  const citySlug = city.trim().toLowerCase().replace(/\s+/g, '-');
  console.log(`\n[1/3] Discovering stores in ${city}, ${state}...`);

  let response: Awaited<ReturnType<typeof fetch>>;
  try {
    response = await fetch(`${API_BASE}/api/discovery/admin/discover-city`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        citySlug,
        stateCode: state,
        countryCode: 'US'
      })
    });
  } catch (err: unknown) {
    // Network-level failure (API not running, DNS error, ...): report it as a
    // failed discovery instead of throwing past the caller's null check.
    console.error(`Discovery failed: ${err instanceof Error ? err.message : String(err)}`);
    return null;
  }

  if (!response.ok) {
    const error = await response.text();
    console.error(`Discovery failed: ${error}`);
    return null;
  }

  let data: any;
  try {
    data = await response.json();
  } catch (err: unknown) {
    console.error(`Discovery failed: ${err instanceof Error ? err.message : String(err)}`);
    return null;
  }

  // A success flag without a result object cannot be used by the caller.
  if (!data?.success || !data.result) {
    console.error(`Discovery failed: ${JSON.stringify(data)}`);
    return null;
  }

  console.log(` Found ${data.result.locationsFound} location(s)`);
  console.log(` New: ${data.result.locationsNew}, Updated: ${data.result.locationsUpdated}`);
  return data.result;
}
/**
 * Placeholder for fetching the discovered locations of a state (optionally
 * narrowed to one city). It currently only logs the request and resolves to
 * an empty list.
 */
async function getDiscoveredLocations(state: string, city?: string): Promise<DiscoveryLocation[]> {
  const scope = city ? city : 'all cities';
  console.log(`\n[2/3] Fetching discovered locations for ${scope}, ${state}...`);

  // The discovery_locations data would have to be queried via SQL because the
  // API has a bug, so for now nothing is returned and the caller is expected
  // to fall back to a direct DB query.
  // TODO: Fix the /api/discovery/locations endpoint
  const none: DiscoveryLocation[] = [];
  return none;
}
/**
 * Creates a Dutchie store through the local `/api/stores` endpoint.
 *
 * @param location Store attributes; `menuUrl` and `platformId` are sent as
 *                 `menu_url` and `platform_dispensary_id`.
 * @returns the created store, or `null` when the API rejects the request
 *          (including the "already exists" case, which is logged as
 *          informational rather than as an error).
 */
async function createStore(location: {
  name: string;
  slug: string;
  city: string;
  state: string;
  menuUrl: string;
  platformId: string;
}): Promise<Store | null> {
  console.log(`\n[3/3] Creating store: ${location.name}...`);

  const response = await fetch(`${API_BASE}/api/stores`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      name: location.name,
      slug: location.slug,
      city: location.city,
      state: location.state,
      menu_url: location.menuUrl,
      menu_type: 'dutchie',
      platform: 'dutchie',
      platform_dispensary_id: location.platformId
    })
  });

  if (!response.ok) {
    // Error bodies are not guaranteed to be JSON (e.g. a proxy's HTML error
    // page), so read the text first and parse it on a best-effort basis.
    const rawBody = await response.text();
    let parsed: unknown;
    try {
      parsed = JSON.parse(rawBody);
    } catch {
      parsed = undefined;
    }

    const reason =
      typeof parsed === 'object' && parsed !== null
        ? (parsed as { error?: unknown }).error
        : undefined;

    // Only a string reason can be searched for the "already exists" marker.
    if (typeof reason === 'string' && reason.includes('already exists')) {
      console.log(` Store already exists (slug: ${location.slug})`);
      return null;
    }

    const detail = parsed === undefined ? rawBody : JSON.stringify(parsed);
    console.error(` Failed to create store: ${detail}`);
    return null;
  }

  const store = await response.json();
  console.log(` Created store ID: ${store.id}`);
  return store;
}
/**
 * Lists the stores the local API already knows for a city/state pair.
 *
 * Both query parameters are URL-encoded, so values containing spaces or
 * reserved characters such as '&' cannot corrupt the query string.
 *
 * @returns the `stores` array from the response, or an empty array when the
 *          request is not successful or the body carries no store list.
 */
async function verifyStoreExists(city: string, state: string): Promise<Store[]> {
  const queryString = `city=${encodeURIComponent(city)}&state=${encodeURIComponent(state)}`;
  const response = await fetch(`${API_BASE}/api/stores?${queryString}`);
  if (!response.ok) {
    return [];
  }
  const data = await response.json();
  // Tolerate a null body or a missing/non-array `stores` field.
  return Array.isArray(data?.stores) ? data.stores : [];
}
/**
 * CLI entry point: discovers the stores of one city through the local API.
 *
 * Steps: parse `--city`, `--state`, `--dry-run` and `--all`; list the stores
 * already known for the city; trigger discovery; then print manual
 * psql/curl instructions for completing the import and re-list the stores.
 *
 * NOTE(review): this function never calls createStore() or
 * getDiscoveredLocations(), so it does not import anything itself; the final
 * "RESULT" listing only reflects changes made by some other means.
 */
async function main() {
const args = process.argv.slice(2);
// Parse arguments
let city = '';
let state = '';
let dryRun = false;
let importAll = false;
// `--city` and `--state` consume the following argument as their value.
for (let i = 0; i < args.length; i++) {
if (args[i] === '--city' && args[i + 1]) {
city = args[i + 1];
i++;
} else if (args[i] === '--state' && args[i + 1]) {
state = args[i + 1].toUpperCase();
i++;
} else if (args[i] === '--dry-run') {
dryRun = true;
} else if (args[i] === '--all') {
importAll = true;
}
}
// Both city and state are required; otherwise print usage and exit non-zero.
if (!city || !state) {
console.log(`
Usage: npx tsx src/scripts/discover-and-import-store.ts --city "City Name" --state "ST"
Options:
--city City name (required)
--state State code, e.g., CA, AZ (required)
--dry-run Discover only, don't import
--all Import all discovered locations (default: first one only)
Examples:
npx tsx src/scripts/discover-and-import-store.ts --city "Adelanto" --state "CA"
npx tsx src/scripts/discover-and-import-store.ts --city "Phoenix" --state "AZ" --all
`);
process.exit(1);
}
console.log('='.repeat(60));
console.log(`STORE DISCOVERY & IMPORT`);
console.log(`City: ${city}, State: ${state}`);
console.log(`Mode: ${dryRun ? 'DRY RUN' : 'IMPORT'}`);
console.log('='.repeat(60));
// Step 1: Check if stores already exist
const existingStores = await verifyStoreExists(city, state);
if (existingStores.length > 0) {
console.log(`\nFound ${existingStores.length} existing store(s) in ${city}, ${state}:`);
existingStores.forEach(s => console.log(` - ${s.name} (ID: ${s.id})`));
// Informational only: discovery below runs regardless of --all.
if (!importAll) {
console.log('\nUse --all to discover and import additional stores.');
}
}
// Step 2: Discover from Dutchie
const discovery = await discoverCity(city, state);
if (!discovery) {
console.error('\nDiscovery failed. Exiting.');
process.exit(1);
}
if (discovery.locationsFound === 0) {
console.log('\nNo stores found in this city on Dutchie.');
process.exit(0);
}
// The discovery request above has already been made at this point; dry-run
// only skips the instructions and final listing below.
if (dryRun) {
console.log('\n[DRY RUN] Would import stores. Run without --dry-run to import.');
process.exit(0);
}
// Step 3: The discovery endpoint already saved to dutchie_discovery_locations
// Now we need to query that table and create dispensary records
// Since the API has bugs, we'll provide instructions for manual import
// NOTE(review): `city` is interpolated verbatim into the printed SQL and curl
// snippets; a name containing a quote yields a command that must be fixed by hand.
console.log(`
Next steps to complete import:
1. Query the discovery location:
psql -c "SELECT id, name, platform_slug, platform_location_id, platform_menu_url
FROM dutchie_discovery_locations
WHERE name ILIKE '%${city}%'
ORDER BY id DESC LIMIT 5;"
2. Create the store via API:
curl -X POST ${API_BASE}/api/stores \\
-H "Content-Type: application/json" \\
-d '{
"name": "<NAME>",
"slug": "<PLATFORM_SLUG>",
"city": "${city}",
"state": "${state}",
"menu_url": "<PLATFORM_MENU_URL>",
"menu_type": "dutchie",
"platform": "dutchie",
"platform_dispensary_id": "<PLATFORM_LOCATION_ID>"
}'
3. Verify:
curl "${API_BASE}/api/stores?city=${encodeURIComponent(city)}&state=${state}"
`);
// Final verification
const finalStores = await verifyStoreExists(city, state);
console.log('\n' + '='.repeat(60));
console.log(`RESULT: ${finalStores.length} store(s) now in ${city}, ${state}`);
finalStores.forEach(s => console.log(` - ${s.name} (ID: ${s.id})`));
console.log('='.repeat(60));
}
// NOTE(review): unexpected errors are only logged; the exit code stays 0.
main().catch(console.error);

View File

@@ -0,0 +1,88 @@
/**
* Discover all Arizona dispensaries from Dutchie
* Uses the state/city HTML pages which contain __NEXT_DATA__ with full dispensary list
*/
import { fetchPage, extractNextData } from '../platforms/dutchie/client';
/**
 * Flat record describing one dispensary; used as the value type of the
 * `allDispensaries` map in discoverAZDispensaries().
 */
interface DutchieDispensary {
platform_dispensary_id: string;
name: string;
slug: string;
city: string;
state: string;
address: string;
zip: string;
}
/**
 * Exploratory probe of Dutchie's Arizona listing page.
 *
 * Fetches `/dispensaries/arizona`, extracts the embedded `__NEXT_DATA__`
 * payload and logs where dispensary data shows up in it (lists on `pageProps`
 * and entries in `__APOLLO_STATE__`). The raw `pageProps` object is written to
 * `/tmp/az-pageprops.json` for offline analysis. Nothing else is persisted and
 * nothing is returned.
 */
async function discoverAZDispensaries(): Promise<void> {
  console.log('Discovering Arizona dispensaries from Dutchie...\n');

  // Fetch the Arizona state page
  console.log('Fetching /dispensaries/arizona...');
  const stateResult = await fetchPage('/dispensaries/arizona');
  if (!stateResult) {
    console.error('Failed to fetch Arizona page');
    return;
  }
  console.log(`Got ${stateResult.status} response, ${stateResult.html.length} bytes`);

  const nextData = extractNextData(stateResult.html);
  if (!nextData) {
    console.error('Failed to extract __NEXT_DATA__');
    // Fall back to scraping dispensary links straight out of the HTML.
    const links: string[] = stateResult.html.match(/\/dispensary\/([a-z0-9-]+)/gi) || [];
    console.log(`Found ${links.length} dispensary links in HTML`);
    // The match above is case-insensitive, so the prefix strip must be too;
    // otherwise "/Dispensary/foo" would be kept whole instead of reduced to "foo".
    const uniqueSlugs = [...new Set(links.map((link) => link.replace(/^\/dispensary\//i, '')))];
    console.log('Unique slugs:', uniqueSlugs.slice(0, 20));
    return;
  }

  console.log('Extracted __NEXT_DATA__');
  console.log('Keys:', Object.keys(nextData));

  // The dispensary data is usually in props.pageProps
  const pageProps = nextData?.props?.pageProps;
  if (pageProps) {
    console.log('pageProps keys:', Object.keys(pageProps));

    // Try the known candidate locations in order. An empty array is truthy, so
    // a plain `a || b || c` chain would stop at the first empty list; pick the
    // first candidate that is a non-empty array instead.
    const candidates = [
      pageProps.dispensaries,
      pageProps.nearbyDispensaries,
      pageProps.filteredDispensaries,
      pageProps.allDispensaries,
    ];
    const dispensaries: unknown[] =
      candidates.find((candidate) => Array.isArray(candidate) && candidate.length > 0) ?? [];

    console.log(`Found ${dispensaries.length} dispensaries in pageProps`);
    if (dispensaries.length > 0) {
      console.log('Sample:', JSON.stringify(dispensaries[0], null, 2));
    }
  }

  // Also look for the Apollo cache embedded in pageProps
  const dehydratedState = nextData?.props?.pageProps?.__APOLLO_STATE__;
  if (dehydratedState) {
    console.log('Found Apollo state');
    const dispensaryKeys = Object.keys(dehydratedState).filter(
      (key) => key.startsWith('Dispensary:') || key.includes('dispensary')
    );
    console.log(`Found ${dispensaryKeys.length} dispensary entries`);
    if (dispensaryKeys.length > 0) {
      console.log('Sample key:', dispensaryKeys[0]);
      // JSON.stringify returns undefined for an undefined value; guard before slicing.
      const sampleJson = JSON.stringify(dehydratedState[dispensaryKeys[0]], null, 2) ?? 'undefined';
      console.log('Sample value:', sampleJson.slice(0, 500));
    }
  }

  // Output the raw pageProps for analysis
  if (pageProps) {
    const fs = await import('fs');
    fs.writeFileSync('/tmp/az-pageprops.json', JSON.stringify(pageProps, null, 2));
    console.log('\nWrote pageProps to /tmp/az-pageprops.json');
  }
}
// Entry point. Log any failure and mark the run as failed through the exit
// code; logging alone would still let the process exit with status 0.
discoverAZDispensaries().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});

View File

@@ -1,86 +0,0 @@
#!/usr/bin/env npx tsx
/**
* Dutchie City Discovery CLI Runner
*
* Discovers cities from Dutchie's /cities page and upserts to dutchie_discovery_cities.
*
* Usage:
* npm run discovery:dutchie:cities
* npx tsx src/scripts/discovery-dutchie-cities.ts
*
* Environment:
* DATABASE_URL - PostgreSQL connection string (required)
*/
import { Pool } from 'pg';
import { DutchieCityDiscovery } from '../dutchie-az/discovery/DutchieCityDiscovery';
/**
 * CLI entry point: run Dutchie city discovery and print a summary plus the
 * current table statistics.
 *
 * Requires the DATABASE_URL environment variable. The process exits with
 * status 1 when the variable is missing, when discovery reports any errors,
 * or on a fatal error; otherwise it exits with status 0.
 */
async function main() {
  console.log('='.repeat(60));
  console.log('DUTCHIE CITY DISCOVERY');
  console.log('='.repeat(60));

  // Get database URL from environment
  const connectionString = process.env.DATABASE_URL;
  if (!connectionString) {
    console.error('ERROR: DATABASE_URL environment variable is required');
    console.error('');
    console.error('Usage:');
    console.error('  DATABASE_URL="postgresql://..." npm run discovery:dutchie:cities');
    process.exit(1);
  }

  // Create pool
  const pool = new Pool({ connectionString });

  // process.exit() terminates immediately and would skip the `finally` below,
  // so the outcome is recorded here and applied only after the pool is closed.
  let exitCode = 0;

  try {
    // Test connection
    await pool.query('SELECT 1');
    console.log('[CLI] Database connection established');

    // Run discovery
    const discovery = new DutchieCityDiscovery(pool);
    const result = await discovery.run();

    // Print summary
    console.log('');
    console.log('='.repeat(60));
    console.log('DISCOVERY COMPLETE');
    console.log('='.repeat(60));
    console.log(`Cities found: ${result.citiesFound}`);
    console.log(`Cities inserted: ${result.citiesInserted}`);
    console.log(`Cities updated: ${result.citiesUpdated}`);
    console.log(`Errors: ${result.errors.length}`);
    console.log(`Duration: ${(result.durationMs / 1000).toFixed(1)}s`);

    if (result.errors.length > 0) {
      console.log('');
      console.log('Errors:');
      result.errors.forEach((e) => console.log(`  - ${e}`));
    }

    // Show stats
    console.log('');
    console.log('Current Statistics:');
    const stats = await discovery.getStats();
    console.log(`  Total cities: ${stats.total}`);
    console.log(`  Crawl enabled: ${stats.crawlEnabled}`);
    console.log(`  Never crawled: ${stats.neverCrawled}`);
    console.log('');
    console.log('By Country:');
    stats.byCountry.forEach((c) => console.log(`  ${c.countryCode}: ${c.count}`));
    console.log('');
    console.log('By State (top 10):');
    stats.byState.slice(0, 10).forEach((s) => console.log(`  ${s.stateCode} (${s.countryCode}): ${s.count}`));

    exitCode = result.errors.length > 0 ? 1 : 0;
  } catch (error: unknown) {
    const err = error instanceof Error ? error : new Error(String(error));
    console.error('FATAL ERROR:', err.message);
    console.error(err.stack);
    exitCode = 1;
  } finally {
    // A failure while closing the pool must not mask the real outcome.
    try {
      await pool.end();
    } catch (closeError: unknown) {
      console.error('[CLI] Failed to close database pool:', closeError);
    }
  }

  process.exit(exitCode);
}
// Entry point. main() handles its own errors, but anything that still escapes
// (for example a failure while closing the pool) must not become an unhandled
// promise rejection.
main().catch((err: unknown) => {
  console.error('FATAL ERROR:', err);
  process.exit(1);
});

View File

@@ -1,189 +0,0 @@
#!/usr/bin/env npx tsx
/**
* Dutchie Location Discovery CLI Runner
*
* Discovers store locations for cities and upserts to dutchie_discovery_locations.
*
* Usage:
* npm run discovery:dutchie:locations -- --all-enabled
* npm run discovery:dutchie:locations -- --city-slug=phoenix
* npm run discovery:dutchie:locations -- --all-enabled --limit=10
*
* npx tsx src/scripts/discovery-dutchie-locations.ts --all-enabled
* npx tsx src/scripts/discovery-dutchie-locations.ts --city-slug=phoenix
*
* Options:
* --city-slug=<slug> Run for a single city by its slug
* --all-enabled Run for all cities where crawl_enabled = TRUE
* --limit=<n> Limit the number of cities to process
* --delay=<ms> Delay between cities in ms (default: 2000)
*
* Environment:
* DATABASE_URL - PostgreSQL connection string (required)
*/
import { Pool } from 'pg';
import { DutchieLocationDiscovery } from '../dutchie-az/discovery/DutchieLocationDiscovery';
// Parse command line arguments
/**
 * Read the CLI flags from `process.argv`.
 *
 * Recognised flags: `--city-slug=<slug>`, `--all-enabled`, `--limit=<n>` and
 * `--delay=<ms>`. Unknown arguments are ignored. A non-numeric `--limit` or
 * `--delay` is reported with a warning and ignored, leaving the default
 * (`undefined` and 2000 respectively) instead of propagating NaN.
 *
 * @returns The parsed options.
 */
function parseArgs(): {
  citySlug: string | null;
  allEnabled: boolean;
  limit: number | undefined;
  delay: number;
} {
  const args = process.argv.slice(2);
  let citySlug: string | null = null;
  let allEnabled = false;
  let limit: number | undefined = undefined;
  let delay = 2000;

  // Everything after the FIRST '=' is the value, so values that themselves
  // contain '=' are not truncated.
  const valueOf = (arg: string): string => arg.slice(arg.indexOf('=') + 1);

  for (const arg of args) {
    if (arg.startsWith('--city-slug=')) {
      citySlug = valueOf(arg);
    } else if (arg === '--all-enabled') {
      allEnabled = true;
    } else if (arg.startsWith('--limit=')) {
      const parsed = parseInt(valueOf(arg), 10);
      if (Number.isNaN(parsed)) {
        console.warn(`WARNING: Ignoring non-numeric value in "${arg}"`);
      } else {
        limit = parsed;
      }
    } else if (arg.startsWith('--delay=')) {
      const parsed = parseInt(valueOf(arg), 10);
      if (Number.isNaN(parsed)) {
        console.warn(`WARNING: Ignoring non-numeric value in "${arg}" (using ${delay}ms)`);
      } else {
        delay = parsed;
      }
    }
  }

  return { citySlug, allEnabled, limit, delay };
}
/**
 * Print the CLI help text (usage, options, examples, environment) to stdout
 * with a single console.log call.
 */
function printUsage() {
  const command = 'npx tsx src/scripts/discovery-dutchie-locations.ts';
  const helpLines = [
    '', // the help text starts with a blank line
    'Dutchie Location Discovery CLI',
    'Usage:',
    `${command} [options]`,
    'Options:',
    '--city-slug=<slug> Run for a single city by its slug',
    '--all-enabled Run for all cities where crawl_enabled = TRUE',
    '--limit=<n> Limit the number of cities to process',
    '--delay=<ms> Delay between cities in ms (default: 2000)',
    'Examples:',
    `${command} --all-enabled`,
    `${command} --city-slug=phoenix`,
    `${command} --all-enabled --limit=5`,
    'Environment:',
    'DATABASE_URL - PostgreSQL connection string (required)',
    '', // ...and ends with a trailing newline
  ];
  console.log(helpLines.join('\n'));
}
/**
 * CLI entry point: discover Dutchie store locations for one city
 * (`--city-slug`) or for every crawl-enabled city (`--all-enabled`) and print
 * a summary.
 *
 * Requires the DATABASE_URL environment variable. The process exits with
 * status 1 on bad arguments, a missing DATABASE_URL, an unknown city slug,
 * any reported discovery error, or a fatal error; otherwise with status 0.
 */
async function main() {
  const { citySlug, allEnabled, limit, delay } = parseArgs();

  if (!citySlug && !allEnabled) {
    console.error('ERROR: Must specify either --city-slug=<slug> or --all-enabled');
    printUsage();
    process.exit(1);
  }

  console.log('='.repeat(60));
  console.log('DUTCHIE LOCATION DISCOVERY');
  console.log('='.repeat(60));
  if (citySlug) {
    console.log(`Mode: Single city (${citySlug})`);
  } else {
    console.log(`Mode: All enabled cities${limit ? ` (limit: ${limit})` : ''}`);
  }
  console.log(`Delay between cities: ${delay}ms`);
  console.log('');

  // Get database URL from environment
  const connectionString = process.env.DATABASE_URL;
  if (!connectionString) {
    console.error('ERROR: DATABASE_URL environment variable is required');
    console.error('');
    console.error('Usage:');
    console.error('  DATABASE_URL="postgresql://..." npx tsx src/scripts/discovery-dutchie-locations.ts --all-enabled');
    process.exit(1);
  }

  // Create pool
  const pool = new Pool({ connectionString });

  // process.exit() terminates immediately and would skip the `finally` below,
  // so the outcome is recorded here and applied only after the pool is closed.
  let exitCode = 0;

  try {
    // Test connection
    await pool.query('SELECT 1');
    console.log('[CLI] Database connection established');

    const discovery = new DutchieLocationDiscovery(pool);

    if (citySlug) {
      // Single city mode
      const city = await discovery.getCityBySlug(citySlug);
      if (!city) {
        console.error(`ERROR: City not found: ${citySlug}`);
        console.error('');
        console.error('Make sure you have run city discovery first:');
        console.error('  npm run discovery:dutchie:cities');
        exitCode = 1;
      } else {
        const result = await discovery.discoverForCity(city);

        console.log('');
        console.log('='.repeat(60));
        console.log('DISCOVERY COMPLETE');
        console.log('='.repeat(60));
        console.log(`City: ${city.cityName}, ${city.stateCode}`);
        console.log(`Locations found: ${result.locationsFound}`);
        console.log(`Inserted: ${result.locationsInserted}`);
        console.log(`Updated: ${result.locationsUpdated}`);
        console.log(`Skipped (protected): ${result.locationsSkipped}`);
        console.log(`Errors: ${result.errors.length}`);
        console.log(`Duration: ${(result.durationMs / 1000).toFixed(1)}s`);

        if (result.errors.length > 0) {
          console.log('');
          console.log('Errors:');
          result.errors.forEach((e) => console.log(`  - ${e}`));
        }

        exitCode = result.errors.length > 0 ? 1 : 0;
      }
    } else {
      // All enabled cities mode
      const result = await discovery.discoverAllEnabled({ limit, delayMs: delay });

      console.log('');
      console.log('='.repeat(60));
      console.log('DISCOVERY COMPLETE');
      console.log('='.repeat(60));
      console.log(`Total cities processed: ${result.totalCities}`);
      console.log(`Total locations found: ${result.totalLocationsFound}`);
      console.log(`Total inserted: ${result.totalInserted}`);
      console.log(`Total updated: ${result.totalUpdated}`);
      console.log(`Total skipped: ${result.totalSkipped}`);
      console.log(`Total errors: ${result.errors.length}`);
      console.log(`Duration: ${(result.durationMs / 1000).toFixed(1)}s`);

      // Print at most 20 errors so a bad run does not flood the terminal.
      if (result.errors.length > 0 && result.errors.length <= 20) {
        console.log('');
        console.log('Errors:');
        result.errors.forEach((e) => console.log(`  - ${e}`));
      } else if (result.errors.length > 20) {
        console.log('');
        console.log(`First 20 of ${result.errors.length} errors:`);
        result.errors.slice(0, 20).forEach((e) => console.log(`  - ${e}`));
      }

      exitCode = result.errors.length > 0 ? 1 : 0;
    }
  } catch (error: unknown) {
    const err = error instanceof Error ? error : new Error(String(error));
    console.error('FATAL ERROR:', err.message);
    console.error(err.stack);
    exitCode = 1;
  } finally {
    // A failure while closing the pool must not mask the real outcome.
    try {
      await pool.end();
    } catch (closeError: unknown) {
      console.error('[CLI] Failed to close database pool:', closeError);
    }
  }

  process.exit(exitCode);
}
// Entry point. main() handles its own errors, but anything that still escapes
// (for example a failure while closing the pool) must not become an unhandled
// promise rejection.
main().catch((err: unknown) => {
  console.error('FATAL ERROR:', err);
  process.exit(1);
});

View File

@@ -1,749 +0,0 @@
/**
* Legacy Data Import ETL Script
*
* DEPRECATED: This script assumed a two-database architecture.
*
* CURRENT ARCHITECTURE (Single Database):
* - All data lives in ONE database: cannaiq (cannaiq-postgres container)
* - Legacy tables exist INSIDE this same database with namespaced prefixes (e.g., legacy_*)
* - The only database is: cannaiq (in cannaiq-postgres container)
*
* If you need to import legacy data:
* 1. Import into namespaced tables (legacy_dispensaries, legacy_products, etc.)
* inside the main cannaiq database
* 2. Use the canonical connection from src/dutchie-az/db/connection.ts
*
* SAFETY RULES:
* - INSERT-ONLY: No UPDATE, no DELETE, no TRUNCATE
* - ON CONFLICT DO NOTHING: Skip duplicates, never overwrite
* - Batch Processing: 500-1000 rows per batch
* - Manual Invocation Only: Requires explicit user execution
*/
import { Pool, PoolClient } from 'pg';
// ============================================================
// CONFIGURATION
// ============================================================
// Page size for the LIMIT/OFFSET batch queries issued by the import functions.
const BATCH_SIZE = 500;
/** Options controlling one ETL run, as produced by parseArgs(). */
interface ETLConfig {
  /** Names of the tables to import; replaced by the comma-separated `--tables=` value. */
  tables: string[];
  /** True when `--dry-run` was passed. */
  dryRun: boolean;
}
/** Per-table counters returned by each import function and shown in the summary. */
interface ETLStats {
  /** Name of the table these counters belong to. */
  table: string;
  /** Wall-clock time spent in the import function, in milliseconds. */
  durationMs: number;
  /** Rows fetched from the source query. */
  read: number;
  /** Rows inserted (in dry-run mode: rows that would have been attempted). */
  inserted: number;
  /** Rows not inserted without raising an error. */
  skipped: number;
  /** Rows whose insert threw. */
  errors: number;
}
// Parse command line arguments
/**
 * Build the run configuration from `process.argv`.
 *
 * `--dry-run` switches dry-run mode on; `--tables=a,b` replaces the default
 * table list with the comma-separated names. Any other argument is ignored.
 */
function parseArgs(): ETLConfig {
  const tablesFlag = '--tables=';
  const parsed: ETLConfig = {
    dryRun: false,
    tables: ['dispensaries', 'products', 'dutchie_products', 'dutchie_product_snapshots'],
  };

  for (const flag of process.argv.slice(2)) {
    if (flag === '--dry-run') {
      parsed.dryRun = true;
      continue;
    }
    if (flag.startsWith(tablesFlag)) {
      // The flag is a prefix here, so dropping its length leaves just the list.
      parsed.tables = flag.slice(tablesFlag.length).split(',');
    }
  }

  return parsed;
}
// ============================================================
// DATABASE CONNECTIONS
// ============================================================
// DEPRECATED: Both pools point to the same database (cannaiq)
// Legacy tables exist inside the main database with namespaced prefixes
/**
 * Build the connection pool used as the "legacy" side of the import.
 * Settings come from the CANNAIQ_DB_* environment variables, with local
 * development fallbacks; the pool is capped at 5 clients.
 */
function createLegacyPool(): Pool {
  const env = process.env;
  const port = parseInt(env.CANNAIQ_DB_PORT || '54320');
  const settings = {
    host: env.CANNAIQ_DB_HOST || 'localhost',
    port,
    user: env.CANNAIQ_DB_USER || 'dutchie',
    password: env.CANNAIQ_DB_PASS || 'dutchie_local_pass',
    database: env.CANNAIQ_DB_NAME || 'cannaiq',
    max: 5,
  };
  return new Pool(settings);
}
/**
 * Build the connection pool used as the CannaiQ (target) side of the import.
 * Reads the same CANNAIQ_DB_* environment variables and fallbacks as
 * createLegacyPool(); the pool is capped at 5 clients.
 */
function createCannaiqPool(): Pool {
  const env = process.env;
  const port = parseInt(env.CANNAIQ_DB_PORT || '54320');
  const settings = {
    host: env.CANNAIQ_DB_HOST || 'localhost',
    port,
    user: env.CANNAIQ_DB_USER || 'dutchie',
    password: env.CANNAIQ_DB_PASS || 'dutchie_local_pass',
    database: env.CANNAIQ_DB_NAME || 'cannaiq',
    max: 5,
  };
  return new Pool(settings);
}
// ============================================================
// STAGING TABLE CREATION
// ============================================================
// DDL executed by createStagingTables(). Every statement uses IF NOT EXISTS,
// so running it again does not fail on objects that are already present.
// It creates three staging tables (dispensaries_from_legacy,
// products_from_legacy, price_history_legacy) and two lookup indexes.
const STAGING_TABLES_SQL = `
-- Staging table for legacy dispensaries
CREATE TABLE IF NOT EXISTS dispensaries_from_legacy (
id SERIAL PRIMARY KEY,
legacy_id INTEGER NOT NULL,
name VARCHAR(255) NOT NULL,
slug VARCHAR(255) NOT NULL,
city VARCHAR(100) NOT NULL,
state VARCHAR(10) NOT NULL,
postal_code VARCHAR(20),
address TEXT,
latitude DECIMAL(10,7),
longitude DECIMAL(10,7),
menu_url TEXT,
website TEXT,
legacy_metadata JSONB,
imported_at TIMESTAMPTZ DEFAULT NOW(),
UNIQUE(legacy_id)
);
-- Staging table for legacy products
CREATE TABLE IF NOT EXISTS products_from_legacy (
id SERIAL PRIMARY KEY,
legacy_product_id INTEGER NOT NULL,
legacy_dispensary_id INTEGER,
external_product_id VARCHAR(255),
name VARCHAR(500) NOT NULL,
brand_name VARCHAR(255),
type VARCHAR(100),
subcategory VARCHAR(100),
strain_type VARCHAR(50),
thc DECIMAL(10,4),
cbd DECIMAL(10,4),
price_cents INTEGER,
original_price_cents INTEGER,
stock_status VARCHAR(20),
weight VARCHAR(100),
primary_image_url TEXT,
first_seen_at TIMESTAMPTZ,
last_seen_at TIMESTAMPTZ,
legacy_raw_payload JSONB,
imported_at TIMESTAMPTZ DEFAULT NOW(),
UNIQUE(legacy_product_id)
);
-- Staging table for legacy price history
CREATE TABLE IF NOT EXISTS price_history_legacy (
id SERIAL PRIMARY KEY,
legacy_product_id INTEGER NOT NULL,
price_cents INTEGER,
recorded_at TIMESTAMPTZ,
imported_at TIMESTAMPTZ DEFAULT NOW()
);
-- Index for efficient lookups
CREATE INDEX IF NOT EXISTS idx_disp_legacy_slug ON dispensaries_from_legacy(slug, city, state);
CREATE INDEX IF NOT EXISTS idx_prod_legacy_ext_id ON products_from_legacy(external_product_id);
`;
/**
 * Create the staging tables and indexes by running STAGING_TABLES_SQL on a
 * client checked out of `cannaiqPool`.
 *
 * @param cannaiqPool Pool to take the client from.
 * @param dryRun When true, only logs what would happen and touches nothing.
 */
async function createStagingTables(cannaiqPool: Pool, dryRun: boolean): Promise<void> {
  console.log('[ETL] Creating staging tables...');

  if (!dryRun) {
    const connection = await cannaiqPool.connect();
    try {
      await connection.query(STAGING_TABLES_SQL);
      console.log('[ETL] Staging tables created successfully');
    } finally {
      // Always hand the client back, even when the DDL fails.
      connection.release();
    }
    return;
  }

  console.log('[ETL] DRY RUN: Would create staging tables');
}
// ============================================================
// ETL FUNCTIONS
// ============================================================
/**
 * Copy the legacy `dispensaries` table into the `dispensaries_from_legacy`
 * staging table, BATCH_SIZE rows per page.
 *
 * Inserts use ON CONFLICT (legacy_id) DO NOTHING, so rows that are already
 * staged are counted as skipped. A failing row is logged and counted in
 * `errors` without aborting the run. In dry-run mode nothing is written and
 * every row read is counted as inserted.
 *
 * @param legacyPool Pool the source rows are read from.
 * @param cannaiqPool Pool the staging rows are written to.
 * @param dryRun When true, read and count only.
 * @returns Counters for this table.
 */
async function importDispensaries(
  legacyPool: Pool,
  cannaiqPool: Pool,
  dryRun: boolean
): Promise<ETLStats> {
  const startTime = Date.now();
  const stats: ETLStats = {
    table: 'dispensaries',
    read: 0,
    inserted: 0,
    skipped: 0,
    errors: 0,
    durationMs: 0,
  };

  console.log('[ETL] Importing dispensaries...');

  const legacyClient = await legacyPool.connect();
  try {
    // Checked out inside the try so the legacy client is still released when
    // this second connect() rejects.
    const cannaiqClient = await cannaiqPool.connect();
    try {
      // Count total rows
      const countResult = await legacyClient.query('SELECT COUNT(*) FROM dispensaries');
      const totalRows = parseInt(countResult.rows[0].count, 10);
      console.log(`[ETL] Found ${totalRows} dispensaries in legacy database`);

      // Process in batches
      let offset = 0;
      while (offset < totalRows) {
        const batchResult = await legacyClient.query(
          `
          SELECT
            id, name, slug, city, state, zip, address,
            latitude, longitude, menu_url, website, dba_name,
            menu_provider, product_provider, provider_detection_data
          FROM dispensaries
          ORDER BY id
          LIMIT $1 OFFSET $2
          `,
          [BATCH_SIZE, offset]
        );
        stats.read += batchResult.rows.length;

        if (dryRun) {
          console.log(`[ETL] DRY RUN: Would insert batch of ${batchResult.rows.length} dispensaries`);
          stats.inserted += batchResult.rows.length;
        } else {
          for (const row of batchResult.rows) {
            try {
              // Columns without a staging counterpart are kept as JSON metadata.
              const legacyMetadata = {
                dba_name: row.dba_name,
                menu_provider: row.menu_provider,
                product_provider: row.product_provider,
                provider_detection_data: row.provider_detection_data,
              };
              const insertResult = await cannaiqClient.query(
                `
                INSERT INTO dispensaries_from_legacy
                  (legacy_id, name, slug, city, state, postal_code, address,
                   latitude, longitude, menu_url, website, legacy_metadata)
                VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12)
                ON CONFLICT (legacy_id) DO NOTHING
                RETURNING id
                `,
                [
                  row.id,
                  row.name,
                  row.slug,
                  row.city,
                  row.state,
                  row.zip,
                  row.address,
                  row.latitude,
                  row.longitude,
                  row.menu_url,
                  row.website,
                  JSON.stringify(legacyMetadata),
                ]
              );
              // rowCount may be null; treat that as "nothing inserted".
              if ((insertResult.rowCount ?? 0) > 0) {
                stats.inserted++;
              } else {
                stats.skipped++;
              }
            } catch (err: unknown) {
              stats.errors++;
              const message = err instanceof Error ? err.message : String(err);
              console.error(`[ETL] Error inserting dispensary ${row.id}:`, message);
            }
          }
        }

        offset += BATCH_SIZE;
        console.log(`[ETL] Processed ${Math.min(offset, totalRows)}/${totalRows} dispensaries`);
      }
    } finally {
      cannaiqClient.release();
    }
  } finally {
    legacyClient.release();
  }

  stats.durationMs = Date.now() - startTime;
  return stats;
}
/**
 * Copy the legacy `products` table into the `products_from_legacy` staging
 * table, BATCH_SIZE rows per page.
 *
 * Prices are converted to integer cents and `in_stock` is mapped to
 * 'in_stock' / 'out_of_stock' / 'unknown'. Inserts use
 * ON CONFLICT (legacy_product_id) DO NOTHING, so already-staged rows are
 * counted as skipped. A failing row is logged and counted in `errors`. In
 * dry-run mode nothing is written and every row read is counted as inserted.
 *
 * @param legacyPool Pool the source rows are read from.
 * @param cannaiqPool Pool the staging rows are written to.
 * @param dryRun When true, read and count only.
 * @returns Counters for this table.
 */
async function importProducts(
  legacyPool: Pool,
  cannaiqPool: Pool,
  dryRun: boolean
): Promise<ETLStats> {
  const startTime = Date.now();
  const stats: ETLStats = {
    table: 'products',
    read: 0,
    inserted: 0,
    skipped: 0,
    errors: 0,
    durationMs: 0,
  };

  // Convert a price to integer cents. Missing or unparseable values become
  // null; a genuine price of 0 stays 0 rather than being treated as missing.
  const toCents = (value: unknown): number | null => {
    if (value === null || value === undefined || value === '') {
      return null;
    }
    const cents = Math.round(parseFloat(String(value)) * 100);
    return Number.isFinite(cents) ? cents : null;
  };

  console.log('[ETL] Importing legacy products...');

  const legacyClient = await legacyPool.connect();
  try {
    // Checked out inside the try so the legacy client is still released when
    // this second connect() rejects.
    const cannaiqClient = await cannaiqPool.connect();
    try {
      const countResult = await legacyClient.query('SELECT COUNT(*) FROM products');
      const totalRows = parseInt(countResult.rows[0].count, 10);
      console.log(`[ETL] Found ${totalRows} products in legacy database`);

      let offset = 0;
      while (offset < totalRows) {
        const batchResult = await legacyClient.query(
          `
          SELECT
            id, dispensary_id, dutchie_product_id, name, brand,
            subcategory, strain_type, thc_percentage, cbd_percentage,
            price, original_price, in_stock, weight, image_url,
            first_seen_at, last_seen_at, raw_data
          FROM products
          ORDER BY id
          LIMIT $1 OFFSET $2
          `,
          [BATCH_SIZE, offset]
        );
        stats.read += batchResult.rows.length;

        if (dryRun) {
          console.log(`[ETL] DRY RUN: Would insert batch of ${batchResult.rows.length} products`);
          stats.inserted += batchResult.rows.length;
        } else {
          for (const row of batchResult.rows) {
            try {
              const stockStatus =
                row.in_stock === true ? 'in_stock' :
                row.in_stock === false ? 'out_of_stock' : 'unknown';
              const priceCents = toCents(row.price);
              const originalPriceCents = toCents(row.original_price);

              const insertResult = await cannaiqClient.query(
                `
                INSERT INTO products_from_legacy
                  (legacy_product_id, legacy_dispensary_id, external_product_id,
                   name, brand_name, subcategory, strain_type, thc, cbd,
                   price_cents, original_price_cents, stock_status, weight,
                   primary_image_url, first_seen_at, last_seen_at, legacy_raw_payload)
                VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17)
                ON CONFLICT (legacy_product_id) DO NOTHING
                RETURNING id
                `,
                [
                  row.id,
                  row.dispensary_id,
                  row.dutchie_product_id,
                  row.name,
                  row.brand,
                  row.subcategory,
                  row.strain_type,
                  row.thc_percentage,
                  row.cbd_percentage,
                  priceCents,
                  originalPriceCents,
                  stockStatus,
                  row.weight,
                  row.image_url,
                  row.first_seen_at,
                  row.last_seen_at,
                  row.raw_data ? JSON.stringify(row.raw_data) : null,
                ]
              );
              // rowCount may be null; treat that as "nothing inserted".
              if ((insertResult.rowCount ?? 0) > 0) {
                stats.inserted++;
              } else {
                stats.skipped++;
              }
            } catch (err: unknown) {
              stats.errors++;
              const message = err instanceof Error ? err.message : String(err);
              console.error(`[ETL] Error inserting product ${row.id}:`, message);
            }
          }
        }

        offset += BATCH_SIZE;
        console.log(`[ETL] Processed ${Math.min(offset, totalRows)}/${totalRows} products`);
      }
    } finally {
      cannaiqClient.release();
    }
  } finally {
    legacyClient.release();
  }

  stats.durationMs = Date.now() - startTime;
  return stats;
}
/**
 * Copy legacy `dutchie_products` rows into the canonical `dutchie_products`
 * table, BATCH_SIZE rows per page.
 *
 * A row is skipped when its `dispensary_id` has no match in `dispensaries`
 * (dispensaries must be imported first) or when the insert hits
 * ON CONFLICT (dispensary_id, external_product_id) DO NOTHING. A failing row
 * is counted in `errors`; only the first five are logged. In dry-run mode
 * nothing is written and every row read is counted as inserted.
 *
 * @param legacyPool Pool the source rows are read from.
 * @param cannaiqPool Pool the canonical rows are written to.
 * @param dryRun When true, read and count only.
 * @returns Counters for this table.
 */
async function importDutchieProducts(
  legacyPool: Pool,
  cannaiqPool: Pool,
  dryRun: boolean
): Promise<ETLStats> {
  const startTime = Date.now();
  const stats: ETLStats = {
    table: 'dutchie_products',
    read: 0,
    inserted: 0,
    skipped: 0,
    errors: 0,
    durationMs: 0,
  };

  // Single source of truth for the copied columns: the INSERT column list,
  // the $n placeholders and the value array are all derived from it, so they
  // cannot drift out of step with each other.
  const columns = [
    'dispensary_id', 'platform', 'external_product_id', 'platform_dispensary_id',
    'c_name', 'name', 'brand_name', 'brand_id', 'brand_logo_url',
    'type', 'subcategory', 'strain_type', 'provider',
    'thc', 'thc_content', 'cbd', 'cbd_content', 'cannabinoids_v2', 'effects',
    'status', 'medical_only', 'rec_only', 'featured', 'coming_soon',
    'certificate_of_analysis_enabled',
    'is_below_threshold', 'is_below_kiosk_threshold',
    'options_below_threshold', 'options_below_kiosk_threshold',
    'stock_status', 'total_quantity_available',
    'primary_image_url', 'images', 'measurements', 'weight', 'past_c_names',
    'created_at_dutchie', 'updated_at_dutchie', 'latest_raw_payload',
  ];
  const placeholders = columns.map((_, index) => `$${index + 1}`).join(', ');
  const insertSql = `
    INSERT INTO dutchie_products (${columns.join(', ')})
    VALUES (${placeholders})
    ON CONFLICT (dispensary_id, external_product_id) DO NOTHING
    RETURNING id
  `;

  // Remember which dispensary ids exist so each id is looked up only once
  // instead of once per product row.
  const dispensaryExists = new Map<unknown, boolean>();

  console.log('[ETL] Importing dutchie_products...');

  const legacyClient = await legacyPool.connect();
  try {
    // Checked out inside the try so the legacy client is still released when
    // this second connect() rejects.
    const cannaiqClient = await cannaiqPool.connect();
    try {
      const countResult = await legacyClient.query('SELECT COUNT(*) FROM dutchie_products');
      const totalRows = parseInt(countResult.rows[0].count, 10);
      console.log(`[ETL] Found ${totalRows} dutchie_products in legacy database`);

      // Note: dispensary_id must already exist in the canonical dispensaries
      // table, so dispensaries have to be imported before products.
      let offset = 0;
      while (offset < totalRows) {
        const batchResult = await legacyClient.query(
          `
          SELECT *
          FROM dutchie_products
          ORDER BY id
          LIMIT $1 OFFSET $2
          `,
          [BATCH_SIZE, offset]
        );
        stats.read += batchResult.rows.length;

        if (dryRun) {
          console.log(`[ETL] DRY RUN: Would insert batch of ${batchResult.rows.length} dutchie_products`);
          stats.inserted += batchResult.rows.length;
        } else {
          for (const row of batchResult.rows) {
            try {
              // Check if dispensary exists in canonical table
              let exists = dispensaryExists.get(row.dispensary_id);
              if (exists === undefined) {
                const dispCheck = await cannaiqClient.query(
                  'SELECT id FROM dispensaries WHERE id = $1',
                  [row.dispensary_id]
                );
                exists = dispCheck.rows.length > 0;
                dispensaryExists.set(row.dispensary_id, exists);
              }
              if (!exists) {
                stats.skipped++;
                continue; // Skip products for dispensaries not yet imported
              }

              // Values are copied column-for-column; platform falls back to 'dutchie'.
              const values = columns.map((column) =>
                column === 'platform' ? row.platform || 'dutchie' : row[column]
              );
              const insertResult = await cannaiqClient.query(insertSql, values);

              // rowCount may be null; treat that as "nothing inserted".
              if ((insertResult.rowCount ?? 0) > 0) {
                stats.inserted++;
              } else {
                stats.skipped++;
              }
            } catch (err: unknown) {
              stats.errors++;
              // Only the first few failures are logged to keep the output readable.
              if (stats.errors <= 5) {
                const message = err instanceof Error ? err.message : String(err);
                console.error(`[ETL] Error inserting dutchie_product ${row.id}:`, message);
              }
            }
          }
        }

        offset += BATCH_SIZE;
        console.log(`[ETL] Processed ${Math.min(offset, totalRows)}/${totalRows} dutchie_products`);
      }
    } finally {
      cannaiqClient.release();
    }
  } finally {
    legacyClient.release();
  }

  stats.durationMs = Date.now() - startTime;
  return stats;
}
/**
 * Copy legacy `dutchie_product_snapshots` rows into the canonical table,
 * BATCH_SIZE rows per page.
 *
 * Each snapshot is re-pointed at the canonical product whose
 * (dispensary_id, external_product_id) matches; snapshots without a match are
 * skipped. Inserts have no conflict handling. A failing row is counted in
 * `errors`; only the first five are logged. In dry-run mode nothing is
 * written and every row read is counted as inserted.
 *
 * @param legacyPool Pool the source rows are read from.
 * @param cannaiqPool Pool the canonical rows are written to.
 * @param dryRun When true, read and count only.
 * @returns Counters for this table.
 */
async function importDutchieSnapshots(
  legacyPool: Pool,
  cannaiqPool: Pool,
  dryRun: boolean
): Promise<ETLStats> {
  const startTime = Date.now();
  const stats: ETLStats = {
    table: 'dutchie_product_snapshots',
    read: 0,
    inserted: 0,
    skipped: 0,
    errors: 0,
    durationMs: 0,
  };

  // Columns copied verbatim from the legacy row. dutchie_product_id is not in
  // this list because it is replaced by the canonical product id.
  const copiedColumns = [
    'dispensary_id', 'platform_dispensary_id',
    'external_product_id', 'pricing_type', 'crawl_mode',
    'status', 'featured', 'special', 'medical_only', 'rec_only',
    'is_present_in_feed', 'stock_status',
    'rec_min_price_cents', 'rec_max_price_cents', 'rec_min_special_price_cents',
    'med_min_price_cents', 'med_max_price_cents', 'med_min_special_price_cents',
    'wholesale_min_price_cents',
    'total_quantity_available', 'total_kiosk_quantity_available',
    'manual_inventory', 'is_below_threshold', 'is_below_kiosk_threshold',
    'options', 'raw_payload', 'crawled_at',
  ];
  const insertColumns = ['dutchie_product_id', ...copiedColumns];
  const placeholders = insertColumns.map((_, index) => `$${index + 1}`).join(', ');
  const insertSql = `
    INSERT INTO dutchie_product_snapshots (${insertColumns.join(', ')})
    VALUES (${placeholders})
  `;

  console.log('[ETL] Importing dutchie_product_snapshots...');

  const legacyClient = await legacyPool.connect();
  try {
    // Checked out inside the try so the legacy client is still released when
    // this second connect() rejects.
    const cannaiqClient = await cannaiqPool.connect();
    try {
      const countResult = await legacyClient.query('SELECT COUNT(*) FROM dutchie_product_snapshots');
      const totalRows = parseInt(countResult.rows[0].count, 10);
      console.log(`[ETL] Found ${totalRows} dutchie_product_snapshots in legacy database`);

      // Build mapping from "dispensary_id:external_product_id" to canonical product id
      console.log('[ETL] Building product ID mapping...');
      const mappingResult = await cannaiqClient.query(`
        SELECT id, external_product_id, dispensary_id FROM dutchie_products
      `);
      const productByKey = new Map<string, number>();
      for (const row of mappingResult.rows) {
        productByKey.set(`${row.dispensary_id}:${row.external_product_id}`, row.id);
      }

      let offset = 0;
      while (offset < totalRows) {
        const batchResult = await legacyClient.query(
          `
          SELECT *
          FROM dutchie_product_snapshots
          ORDER BY id
          LIMIT $1 OFFSET $2
          `,
          [BATCH_SIZE, offset]
        );
        stats.read += batchResult.rows.length;

        if (dryRun) {
          console.log(`[ETL] DRY RUN: Would insert batch of ${batchResult.rows.length} snapshots`);
          stats.inserted += batchResult.rows.length;
        } else {
          for (const row of batchResult.rows) {
            try {
              // Map legacy product to canonical product ID
              const canonicalProductId = productByKey.get(
                `${row.dispensary_id}:${row.external_product_id}`
              );
              if (canonicalProductId === undefined) {
                stats.skipped++;
                continue; // Skip snapshots for products not yet imported
              }

              // Insert snapshot (no conflict handling - all snapshots are historical)
              await cannaiqClient.query(insertSql, [
                canonicalProductId,
                ...copiedColumns.map((column) => row[column]),
              ]);
              stats.inserted++;
            } catch (err: unknown) {
              stats.errors++;
              // Only the first few failures are logged to keep the output readable.
              if (stats.errors <= 5) {
                const message = err instanceof Error ? err.message : String(err);
                console.error(`[ETL] Error inserting snapshot ${row.id}:`, message);
              }
            }
          }
        }

        offset += BATCH_SIZE;
        console.log(`[ETL] Processed ${Math.min(offset, totalRows)}/${totalRows} snapshots`);
      }
    } finally {
      cannaiqClient.release();
    }
  } finally {
    legacyClient.release();
  }

  stats.durationMs = Date.now() - startTime;
  return stats;
}
// ============================================================
// MAIN
// ============================================================
/**
 * Run the legacy import: parse the CLI flags, create the staging tables, run
 * the importer for each selected table in a fixed order and print a summary.
 *
 * On a fatal error the message is logged, both pools are closed and the
 * process exits with status 1.
 */
async function main(): Promise<void> {
  console.log('='.repeat(60));
  console.log('LEGACY DATA IMPORT ETL');
  console.log('='.repeat(60));

  const config = parseArgs();
  console.log(`Mode: ${config.dryRun ? 'DRY RUN' : 'LIVE'}`);
  console.log(`Tables: ${config.tables.join(', ')}`);
  console.log('');

  // Importers in dependency order: products need dispensaries, snapshots need products.
  const importers: Array<[string, (legacy: Pool, cannaiq: Pool, dryRun: boolean) => Promise<ETLStats>]> = [
    ['dispensaries', importDispensaries],
    ['products', importProducts],
    ['dutchie_products', importDutchieProducts],
    ['dutchie_product_snapshots', importDutchieSnapshots],
  ];

  // A misspelled table name would otherwise be ignored without any feedback.
  const knownTables = importers.map(([table]) => table);
  const unknownTables = config.tables.filter((table) => !knownTables.includes(table));
  if (unknownTables.length > 0) {
    console.warn(`[ETL] Ignoring unknown table name(s): ${unknownTables.join(', ')}`);
  }

  // Create connection pools
  const legacyPool = createLegacyPool();
  const cannaiqPool = createCannaiqPool();

  // process.exit() inside the try/catch would skip the `finally` that closes
  // the pools, so failure is recorded here and acted on afterwards.
  let failed = false;

  try {
    // Test connections
    console.log('[ETL] Testing database connections...');
    await legacyPool.query('SELECT 1');
    console.log('[ETL] Legacy database connected');
    await cannaiqPool.query('SELECT 1');
    console.log('[ETL] CannaiQ database connected');
    console.log('');

    // Create staging tables
    await createStagingTables(cannaiqPool, config.dryRun);
    console.log('');

    // Run imports
    const allStats: ETLStats[] = [];
    for (const [table, runImport] of importers) {
      if (!config.tables.includes(table)) {
        continue;
      }
      allStats.push(await runImport(legacyPool, cannaiqPool, config.dryRun));
      console.log('');
    }

    // Print summary
    console.log('='.repeat(60));
    console.log('IMPORT SUMMARY');
    console.log('='.repeat(60));
    console.log('');
    console.log('| Table | Read | Inserted | Skipped | Errors | Duration |');
    console.log('|----------------------------|----------|----------|----------|----------|----------|');
    for (const s of allStats) {
      console.log(`| ${s.table.padEnd(26)} | ${String(s.read).padStart(8)} | ${String(s.inserted).padStart(8)} | ${String(s.skipped).padStart(8)} | ${String(s.errors).padStart(8)} | ${(s.durationMs / 1000).toFixed(1).padStart(7)}s |`);
    }
    console.log('');

    const totalInserted = allStats.reduce((sum, s) => sum + s.inserted, 0);
    const totalErrors = allStats.reduce((sum, s) => sum + s.errors, 0);
    console.log(`Total inserted: ${totalInserted}`);
    console.log(`Total errors: ${totalErrors}`);

    if (config.dryRun) {
      console.log('');
      console.log('DRY RUN COMPLETE - No data was written');
      console.log('Run without --dry-run to perform actual import');
    }
  } catch (error: unknown) {
    console.error('[ETL] Fatal error:', error instanceof Error ? error.message : String(error));
    failed = true;
  } finally {
    // Close the second pool even if closing the first one throws.
    try {
      await legacyPool.end();
    } finally {
      await cannaiqPool.end();
    }
  }

  if (failed) {
    process.exit(1);
  }

  console.log('');
  console.log('ETL complete');
}
// Entry point: start the ETL. Any rejection that escapes main() is logged and
// turned into a non-zero exit status.
main().then(undefined, (reason) => {
  console.error('Unhandled error:', reason);
  process.exit(1);
});

View File

@@ -0,0 +1,397 @@
/**
* Harmonize AZ Dispensaries with Dutchie Source of Truth
*
* This script:
* 1. Queries Dutchie ConsumerDispensaries API for all AZ cities
* 2. Matches our dispensaries by platform_dispensary_id
* 3. Updates existing records with full Dutchie data
* 4. Creates new records for dispensaries in Dutchie but not in our DB
* 5. Disables dispensaries not found in Dutchie
*
* Usage:
* npx tsx src/scripts/harmonize-az-dispensaries.ts
* npx tsx src/scripts/harmonize-az-dispensaries.ts --dry-run
* npx tsx src/scripts/harmonize-az-dispensaries.ts --state CA
*/
import { Pool } from 'pg';
import { executeGraphQL, GRAPHQL_HASHES } from '../platforms/dutchie/client';
// Shared connection pool for this script, created at module load.
// Every setting comes from a CANNAIQ_DB_* environment variable and falls back
// to a local default when the variable is unset or empty.
// NOTE(review): the fallback database name is 'dutchie_menus' even though the
// variables are CANNAIQ_*-prefixed — confirm this default is intended.
// NOTE(review): a fallback password is hard-coded here — confirm it is only
// ever valid for local development.
const pool = new Pool({
host: process.env.CANNAIQ_DB_HOST || 'localhost',
port: parseInt(process.env.CANNAIQ_DB_PORT || '54320'),
database: process.env.CANNAIQ_DB_NAME || 'dutchie_menus',
user: process.env.CANNAIQ_DB_USER || 'dutchie',
password: process.env.CANNAIQ_DB_PASS || 'dutchie_local_pass',
});
/** A row of the `dispensaries` table, limited to the columns getDispensaries() selects. */
interface Dispensary {
  /** Primary key; getDispensaries() orders by it. */
  id: number;
  name: string;
  slug: string;

  city: string;
  state: string;

  /** Null when the row has no platform id recorded. */
  platform_dispensary_id: string | null;

  /** getDispensaries() reads a NULL column value as true. */
  crawl_enabled: boolean;
  /** getDispensaries() reads a NULL column value as false. */
  dutchie_verified: boolean;
}
/**
 * Dispensary record taken from the `filteredDispensaries` list of the Dutchie
 * ConsumerDispensaries response, limited to the fields this script declares.
 */
interface DutchieDispensary {
  id: string; // Platform ID like "deHiuKKmBHGJKXzuj"
  cName: string; // Slug like "the-downtown-dispensary"
  name: string;
  phone: string | null;
  address: string;
  description: string | null;
  status: string;
  chain: string | null;
  timezone: string;
  location: {
    ln1: string;
    ln2: string;
    city: string;
    state: string;
    country: string;
    zipcode: string;
    geometry: {
      // This script stores index 1 as latitude and index 0 as longitude.
      coordinates: [number, number];
    };
  };
  // The hours payloads are never read by this script, so they are kept opaque
  // (`unknown`) instead of `any`; narrow them before use if that changes.
  deliveryHours: unknown;
  pickupHours: unknown;
  offerDelivery: boolean;
  offerPickup: boolean;
  offerCurbsidePickup: boolean;
  isMedical: boolean;
  isRecreational: boolean;
}
/**
 * Counters and error messages collected by one harmonizeDispensaries() run.
 */
interface HarmonizationResult {
// Existing rows matched to a Dutchie record and updated.
updated: number;
// Dutchie records that had no matching row and were passed to createDispensary().
created: number;
// Rows passed to disableDispensary().
disabled: number;
// Initialised to 0 by harmonizeDispensaries() and not incremented anywhere in this script.
skipped: number;
// One human-readable message per failed operation.
errors: string[];
}
// Cities to query for AZ (from statesWithDispensaries).
// fetchAllDutchieDispensaries() issues its Dutchie lookups once per entry in
// this list, so a city that is missing here is never queried.
const AZ_CITIES = [
'Apache Junction', 'Bisbee', 'Bullhead City', 'Casa Grande', 'Chandler',
'Cottonwood', 'El Mirage', 'Flagstaff', 'Florence', 'Gilbert', 'Glendale',
'Globe', 'Goodyear', 'Kingman', 'Lake Havasu City', 'Maricopa', 'Mesa',
'Peoria', 'Phoenix', 'Prescott', 'Prescott Valley', 'Queen Creek',
'Scottsdale', 'Show Low', 'Sierra Vista', 'Snowflake', 'Sun City',
'Surprise', 'Tempe', 'Tolleson', 'Tucson', 'Yuma'
];
/**
 * Load the dispensary rows we hold for one state, ordered by id.
 *
 * NULL `dutchie_verified` / `crawl_enabled` column values are coalesced in SQL
 * to false / true respectively, so the returned flags are always booleans.
 *
 * @param state Value compared against the `state` column.
 * @returns The matching rows (empty array when there are none).
 */
async function getDispensaries(state: string): Promise<Dispensary[]> {
  const sql = `SELECT id, name, slug, city, state, platform_dispensary_id,
COALESCE(dutchie_verified, false) as dutchie_verified,
COALESCE(crawl_enabled, true) as crawl_enabled
FROM dispensaries
WHERE state = $1
ORDER BY id`;
  const { rows } = await pool.query<Dispensary>(sql, [state]);
  return rows;
}
/**
 * Fetch the active Dutchie dispensaries for one city via the
 * ConsumerDispensaries GraphQL operation.
 *
 * Pages of 100 records are requested until a page comes back with fewer than
 * 100 entries. A page cap guards against an endless loop if the API keeps
 * returning full pages (for example because it ignores the `page` variable).
 *
 * @param city  City name sent in the dispensary filter.
 * @param state State value sent in the dispensary filter.
 * @returns Every dispensary collected across the requested pages (possibly empty).
 * @throws Error when the page cap is reached without seeing a short page;
 *         errors thrown by executeGraphQL propagate unchanged.
 */
async function fetchDutchieDispensariesByCity(
  city: string,
  state: string
): Promise<DutchieDispensary[]> {
  const perPage = 100;
  // 100 pages x 100 records is far more than a single city should ever hold.
  const maxPages = 100;
  // Passed through to executeGraphQL as its `cName` option, e.g. "lake-havasu-city-az".
  const cName = `${city.toLowerCase().replace(/\s+/g, '-')}-${state.toLowerCase()}`;
  const allDispensaries: DutchieDispensary[] = [];

  for (let page = 0; page < maxPages; page++) {
    if (page > 0) {
      // Rate limit between consecutive page requests
      await new Promise(resolve => setTimeout(resolve, 200));
    }

    const variables = {
      dispensaryFilter: {
        activeOnly: true,
        city,
        state,
      },
      page,
      perPage,
    };
    const result = await executeGraphQL(
      'ConsumerDispensaries',
      variables,
      GRAPHQL_HASHES.ConsumerDispensaries,
      { cName, maxRetries: 2, retryOn403: true }
    );

    // A missing or malformed payload is treated as an empty (final) page.
    const payload = result?.data?.filteredDispensaries;
    const dispensaries: DutchieDispensary[] = Array.isArray(payload) ? payload : [];
    allDispensaries.push(...dispensaries);

    if (dispensaries.length < perPage) {
      return allDispensaries;
    }
  }

  throw new Error(
    `ConsumerDispensaries pagination for ${city}, ${state} did not finish within ${maxPages} pages`
  );
}
/**
 * Query Dutchie city by city and collect the dispensaries for a state,
 * de-duplicated by Dutchie platform ID (the first occurrence wins).
 *
 * Only AZ has a city list. Any other state is rejected up front: returning an
 * empty map would make the caller conclude that none of our dispensaries
 * exist in Dutchie and disable all of them.
 *
 * @param state State code; must be 'AZ'.
 * @returns Map from Dutchie platform ID to the Dutchie record.
 * @throws Error when no city list is configured for `state`.
 */
async function fetchAllDutchieDispensaries(state: string): Promise<Map<string, DutchieDispensary>> {
  if (state !== 'AZ') {
    throw new Error(`No city list is configured for state "${state}"; only AZ is supported`);
  }

  const cities = AZ_CITIES;
  const dispensaryMap = new Map<string, DutchieDispensary>();
  console.log(`Fetching dispensaries from ${cities.length} cities...`);

  for (const city of cities) {
    const dispensaries = await fetchDutchieDispensariesByCity(city, state);
    console.log(` ${city}: ${dispensaries.length} dispensaries`);
    for (const d of dispensaries) {
      // Index by platform ID; the same dispensary can be returned for several cities.
      if (d.id && !dispensaryMap.has(d.id)) {
        dispensaryMap.set(d.id, d);
      }
    }
    // Rate limit between cities
    await new Promise(resolve => setTimeout(resolve, 300));
  }

  console.log(`Total unique dispensaries from Dutchie: ${dispensaryMap.size}\n`);
  return dispensaryMap;
}
/**
 * Overwrite one of our dispensary rows with the data Dutchie reports for it,
 * and mark it as Dutchie-verified and crawl-enabled.
 *
 * @param dispensaryId Primary key of the row to update.
 * @param dutchie      Dutchie record supplying the new values.
 * @param dryRun       When true, nothing is written.
 */
async function updateDispensary(
  dispensaryId: number,
  dutchie: DutchieDispensary,
  dryRun: boolean
): Promise<void> {
  if (dryRun) {
    return;
  }

  const place = dutchie.location;
  const coordinates = place?.geometry?.coordinates;

  // Positional parameters, in $1..$12 order.
  const values = [
    dispensaryId,
    dutchie.name.trim(),
    dutchie.cName,
    place?.ln1 || dutchie.address,
    place?.city || '',
    place?.zipcode || '',
    dutchie.phone,
    coordinates?.[1] || null, // latitude
    coordinates?.[0] || null, // longitude
    `https://dutchie.com/dispensary/${dutchie.cName}`,
    dutchie.offerDelivery ?? false,
    dutchie.offerPickup ?? true,
  ];

  const statement = `UPDATE dispensaries
SET name = $2,
slug = $3,
address = $4,
city = $5,
postal_code = $6,
phone = $7,
latitude = $8,
longitude = $9,
menu_url = $10,
menu_type = 'dutchie',
platform = 'dutchie',
is_delivery = $11,
is_pickup = $12,
dutchie_verified = true,
dutchie_verified_at = NOW(),
crawl_enabled = true,
updated_at = NOW()
WHERE id = $1`;

  await pool.query(statement, values);
}
/**
 * Insert a dispensary row for a Dutchie record we do not hold yet.
 *
 * The insert is an upsert keyed on `slug`: when a row with the same slug
 * already exists, its Dutchie-derived columns (including
 * platform_dispensary_id) are overwritten instead.
 *
 * @param dutchie Dutchie record to store.
 * @param state   State value written to the new row.
 * @param dryRun  When true, nothing is written and null is returned.
 * @returns The id of the inserted or updated row, or null when none was returned.
 */
async function createDispensary(
  dutchie: DutchieDispensary,
  state: string,
  dryRun: boolean
): Promise<number | null> {
  if (dryRun) {
    return null;
  }

  const place = dutchie.location;
  const coordinates = place?.geometry?.coordinates;

  // Positional parameters, in $1..$13 order.
  const values = [
    dutchie.name.trim(),
    dutchie.cName,
    place?.city || '',
    state,
    dutchie.id,
    `https://dutchie.com/dispensary/${dutchie.cName}`,
    place?.ln1 || dutchie.address,
    place?.zipcode || '',
    coordinates?.[1] || null, // latitude
    coordinates?.[0] || null, // longitude
    dutchie.offerDelivery ?? false,
    dutchie.offerPickup ?? true,
    dutchie.phone,
  ];

  const statement = `INSERT INTO dispensaries (
name, slug, city, state, platform, platform_dispensary_id,
menu_url, menu_type, address, postal_code, latitude, longitude,
is_delivery, is_pickup, phone,
dutchie_verified, dutchie_verified_at,
crawl_enabled, platform_id_source, platform_id_verified_at,
created_at, updated_at
) VALUES (
$1, $2, $3, $4, 'dutchie', $5,
$6, 'dutchie', $7, $8, $9, $10,
$11, $12, $13,
true, NOW(),
true, 'dutchie_harmonization', NOW(),
NOW(), NOW()
)
ON CONFLICT (slug) DO UPDATE SET
platform_dispensary_id = EXCLUDED.platform_dispensary_id,
name = EXCLUDED.name,
menu_url = EXCLUDED.menu_url,
address = EXCLUDED.address,
postal_code = EXCLUDED.postal_code,
latitude = EXCLUDED.latitude,
longitude = EXCLUDED.longitude,
is_delivery = EXCLUDED.is_delivery,
is_pickup = EXCLUDED.is_pickup,
phone = EXCLUDED.phone,
dutchie_verified = true,
dutchie_verified_at = NOW(),
crawl_enabled = true,
updated_at = NOW()
RETURNING id`;

  const { rows } = await pool.query<{ id: number }>(statement, values);
  const [written] = rows;
  return written?.id || null;
}
/**
 * Turn crawling off for one dispensary and record why.
 *
 * @param dispensaryId Primary key of the row to disable.
 * @param reason       Text stored in `failure_notes` (replaces any previous value).
 * @param dryRun       When true, nothing is written.
 */
async function disableDispensary(dispensaryId: number, reason: string, dryRun: boolean): Promise<void> {
  if (!dryRun) {
    const statement = `UPDATE dispensaries
SET crawl_enabled = false,
failure_notes = $2,
updated_at = NOW()
WHERE id = $1`;
    await pool.query(statement, [dispensaryId, reason]);
  }
}
/**
 * Reconcile our dispensaries for one state with what Dutchie reports.
 *
 * Step 1 walks our rows: a row whose platform_dispensary_id exists in Dutchie
 * is updated from the Dutchie record; a row with an unknown ID, or with no ID
 * at all, is disabled. Step 2 creates rows for Dutchie records that matched
 * none of ours.
 *
 * Safety rules:
 * - If Dutchie returns no dispensaries at all the run aborts before touching
 *   the database, because every existing row would otherwise be disabled.
 * - A Dutchie record counts as matched even when the update fails, so a failed
 *   update is reported as an error instead of falling through to step 2, where
 *   the slug-keyed upsert could overwrite a different row.
 * - Failures while disabling are recorded in `errors`, like update and create
 *   failures, instead of aborting the whole run.
 *
 * @param state  State to harmonize.
 * @param dryRun When true, the write helpers are invoked in dry-run mode and nothing is written.
 * @returns Counters and collected error messages for the run.
 * @throws Error when Dutchie returns no dispensaries; fetch and read errors propagate.
 */
async function harmonizeDispensaries(
  state: string,
  dryRun: boolean = false
): Promise<HarmonizationResult> {
  console.log(`\n${'='.repeat(60)}`);
  console.log(`HARMONIZING ${state} DISPENSARIES${dryRun ? ' (DRY RUN)' : ''}`);
  console.log(`${'='.repeat(60)}\n`);

  const result: HarmonizationResult = {
    updated: 0,
    created: 0,
    disabled: 0,
    skipped: 0,
    errors: [],
  };

  const describeError = (error: unknown): string =>
    error instanceof Error ? error.message : String(error);

  // Disable one row, recording (not throwing) any failure.
  const disable = async (disp: Dispensary, reason: string): Promise<void> => {
    try {
      await disableDispensary(disp.id, reason, dryRun);
      result.disabled++;
    } catch (error: unknown) {
      const message = describeError(error);
      console.error(` [ERROR] ${disp.name}: ${message}`);
      result.errors.push(`Disable ${disp.name}: ${message}`);
    }
  };

  // Fetch all dispensaries from Dutchie (source of truth)
  const dutchieMap = await fetchAllDutchieDispensaries(state);
  if (dutchieMap.size === 0) {
    throw new Error(
      `Dutchie returned no dispensaries for ${state}; aborting so existing records are not disabled`
    );
  }

  // Get our current dispensaries
  const dispensaries = await getDispensaries(state);
  console.log(`Found ${dispensaries.length} dispensaries in our DB\n`);

  // Track which Dutchie dispensaries we've matched
  const matchedDutchieIds = new Set<string>();

  // Step 1: Match our dispensaries to Dutchie by platform_dispensary_id
  console.log('[Step 1/3] Matching existing dispensaries to Dutchie...');
  for (const disp of dispensaries) {
    const platformId = disp.platform_dispensary_id;
    const dutchie = platformId ? dutchieMap.get(platformId) : undefined;

    if (platformId && dutchie) {
      // The match holds whether or not the update succeeds.
      matchedDutchieIds.add(platformId);
      try {
        await updateDispensary(disp.id, dutchie, dryRun);
        console.log(` [UPDATED] ${disp.name} -> ${dutchie.name} (${dutchie.cName})`);
        result.updated++;
      } catch (error: unknown) {
        const message = describeError(error);
        console.error(` [ERROR] ${disp.name}: ${message}`);
        result.errors.push(`Update ${disp.name}: ${message}`);
      }
    } else if (platformId) {
      // Has platform ID but not found in Dutchie - maybe closed?
      console.log(` [NOT FOUND] ${disp.name} (${platformId}) - not in Dutchie`);
      await disable(disp, 'Platform ID not found in Dutchie - may be closed');
    } else {
      // No platform ID - disable
      console.log(` [NO ID] ${disp.name} - no platform_dispensary_id`);
      await disable(disp, 'No platform_dispensary_id');
    }
  }

  // Step 2: Create new dispensaries for Dutchie records we don't have
  console.log(`\n[Step 2/3] Creating new dispensaries from Dutchie...`);
  for (const [platformId, dutchie] of dutchieMap) {
    if (matchedDutchieIds.has(platformId)) {
      continue; // Already matched
    }
    try {
      const newId = await createDispensary(dutchie, state, dryRun);
      console.log(` [CREATED] ${dutchie.name} (${dutchie.cName}) -> ID ${newId || '(dry-run)'}`);
      result.created++;
    } catch (error: unknown) {
      const message = describeError(error);
      console.error(` [ERROR] ${dutchie.name}: ${message}`);
      result.errors.push(`Create ${dutchie.name}: ${message}`);
    }
  }

  // Summary
  console.log(`\n${'='.repeat(60)}`);
  console.log('HARMONIZATION SUMMARY');
  console.log(`${'='.repeat(60)}`);
  console.log(` Updated (matched to Dutchie): ${result.updated}`);
  console.log(` Created (new from Dutchie): ${result.created}`);
  console.log(` Disabled (not in Dutchie): ${result.disabled}`);
  console.log(` Errors: ${result.errors.length}`);
  if (result.errors.length > 0) {
    console.log(`\nErrors:`);
    // Print at most 20 messages to keep the summary readable.
    result.errors.slice(0, 20).forEach(e => console.log(` - ${e}`));
    if (result.errors.length > 20) {
      console.log(` ... and ${result.errors.length - 20} more`);
    }
  }
  return result;
}
/**
 * CLI entry point.
 *
 * Flags:
 *   --state <code>  State to harmonize (upper-cased; defaults to AZ).
 *   --dry-run       Report what would change without writing.
 *
 * The process exit status is non-zero when the run throws or records errors,
 * so shells and schedulers can detect a failed harmonization.
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2);
  let state = 'AZ';
  let dryRun = false;
  for (let i = 0; i < args.length; i++) {
    if (args[i] === '--state' && args[i + 1]) {
      state = args[i + 1].toUpperCase();
      i++; // skip the value that was just consumed
    } else if (args[i] === '--dry-run') {
      dryRun = true;
    }
  }
  try {
    const result = await harmonizeDispensaries(state, dryRun);
    if (result.errors.length > 0) {
      process.exitCode = 1;
    }
  } finally {
    // Always release the pool so the process can exit.
    await pool.end();
  }
}
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

View File

@@ -1,583 +0,0 @@
#!/usr/bin/env npx tsx
/**
* Queue Intelligence Script
*
* Orchestrates the multi-category intelligence crawler system:
* 1. Queue dispensaries that need provider detection (all 4 categories)
* 2. Queue per-category production crawls (Dutchie products only for now)
* 3. Queue per-category sandbox crawls (all providers)
*
* Each category (product, specials, brand, metadata) is handled independently.
* A failure in one category does NOT affect other categories.
*
* Usage:
* npx tsx src/scripts/queue-intelligence.ts [--detection] [--production] [--sandbox] [--all]
* npx tsx src/scripts/queue-intelligence.ts --category=product --sandbox
* npx tsx src/scripts/queue-intelligence.ts --process --category=product
* npx tsx src/scripts/queue-intelligence.ts --dry-run
*/
import { pool } from '../db/pool';
import { logger } from '../services/logger';
import {
detectMultiCategoryProviders,
updateAllCategoryProviders,
IntelligenceCategory,
} from '../services/intelligence-detector';
import {
runCrawlProductsJob,
runCrawlSpecialsJob,
runCrawlBrandIntelligenceJob,
runCrawlMetadataJob,
runSandboxProductsJob,
runSandboxSpecialsJob,
runSandboxBrandJob,
runSandboxMetadataJob,
runAllCategoryProductionCrawls,
runAllCategorySandboxCrawls,
processCategorySandboxJobs,
} from '../services/category-crawler-jobs';
// Parse command line args
const args = process.argv.slice(2);

// Categories handled by this script; also the whitelist for --category, whose
// value is later interpolated into SQL column names.
const CATEGORIES: IntelligenceCategory[] = ['product', 'specials', 'brand', 'metadata'];

// Value of the first `--name=value` argument starting with `prefix`;
// a missing argument and an empty value both yield undefined.
const optionValue = (prefix: string): string | undefined =>
  args.find(a => a.startsWith(prefix))?.split('=')[1] || undefined;

// Non-negative integer option; absent, non-numeric or negative input yields `fallback`.
const intOption = (prefix: string, fallback: number): number => {
  const raw = optionValue(prefix);
  if (raw === undefined) return fallback;
  const parsed = Number.parseInt(raw, 10);
  return Number.isNaN(parsed) || parsed < 0 ? fallback : parsed;
};

// Reject unknown categories up front instead of letting them reach SQL.
const requestedCategory = optionValue('--category=');
if (requestedCategory !== undefined && !CATEGORIES.some(c => c === requestedCategory)) {
  console.error(`Invalid --category "${requestedCategory}". Expected one of: ${CATEGORIES.join(', ')}`);
  process.exit(1);
}

const flags = {
  detection: args.includes('--detection') || args.includes('--all'),
  production: args.includes('--production') || args.includes('--all'),
  sandbox: args.includes('--sandbox') || args.includes('--all'),
  dryRun: args.includes('--dry-run'),
  process: args.includes('--process'),
  help: args.includes('--help') || args.includes('-h'),
  limit: intOption('--limit=', 10),
  // Validated against CATEGORIES above, so the assertion is sound.
  category: requestedCategory as IntelligenceCategory | undefined,
  // 0 means "no specific dispensary".
  dispensary: intOption('--dispensary=', 0),
};

// If no specific flags, default to all
if (!flags.detection && !flags.production && !flags.sandbox && !flags.process) {
  flags.detection = true;
  flags.production = true;
  flags.sandbox = true;
}
/**
 * Print the command-line usage text (options, categories and examples) to stdout.
 */
async function showHelp() {
  const helpText = `
Queue Intelligence - Multi-Category Crawler Orchestration
USAGE:
npx tsx src/scripts/queue-intelligence.ts [OPTIONS]
OPTIONS:
--detection Queue dispensaries that need multi-category detection
--production Queue per-category production crawls
--sandbox Queue per-category sandbox crawls
--all Queue all job types (default if no specific flag)
--process Process queued jobs instead of just queuing
--category=CATEGORY Filter to specific category (product|specials|brand|metadata)
--dispensary=ID Process only a specific dispensary
--dry-run Show what would be queued without making changes
--limit=N Maximum dispensaries to queue per type (default: 10)
--help, -h Show this help message
CATEGORIES:
product - Product/menu data (Dutchie=production, others=sandbox)
specials - Deals and specials (all sandbox for now)
brand - Brand intelligence (all sandbox for now)
metadata - Categories/taxonomy (all sandbox for now)
EXAMPLES:
# Queue all dispensaries for appropriate jobs
npx tsx src/scripts/queue-intelligence.ts
# Only queue product detection jobs
npx tsx src/scripts/queue-intelligence.ts --detection --category=product
# Process sandbox jobs for specials category
npx tsx src/scripts/queue-intelligence.ts --process --category=specials --limit=5
# Run full detection for a specific dispensary
npx tsx src/scripts/queue-intelligence.ts --process --detection --dispensary=123
# Dry run to see what would be queued
npx tsx src/scripts/queue-intelligence.ts --dry-run
`;
  console.log(helpText);
}
/**
 * Queue 'detection' sandbox jobs for dispensaries whose provider is unknown or
 * low-confidence in at least one category.
 *
 * A category needs detection when its provider is missing or its confidence is
 * below 70. Only dispensaries with a website or menu URL are considered, and
 * at most `flags.limit` dispensaries are selected. In dry-run mode the
 * candidates are listed and nothing is inserted.
 *
 * @returns Number of dispensaries queued (or that would be queued in dry-run mode).
 */
async function queueMultiCategoryDetection(): Promise<number> {
  console.log('\n📡 Queueing Multi-Category Detection Jobs...');

  // Candidates: has a URL, and any *_provider is null or any *_confidence < 70.
  const query = `
SELECT id, name, website, menu_url,
product_provider, product_confidence, product_crawler_mode,
specials_provider, specials_confidence, specials_crawler_mode,
brand_provider, brand_confidence, brand_crawler_mode,
metadata_provider, metadata_confidence, metadata_crawler_mode
FROM dispensaries
WHERE (website IS NOT NULL OR menu_url IS NOT NULL)
AND (
product_provider IS NULL OR product_confidence < 70 OR
specials_provider IS NULL OR specials_confidence < 70 OR
brand_provider IS NULL OR brand_confidence < 70 OR
metadata_provider IS NULL OR metadata_confidence < 70
)
ORDER BY
CASE WHEN product_provider IS NULL THEN 0 ELSE 1 END,
product_confidence ASC
LIMIT $1
`;
  const { rows } = await pool.query(query, [flags.limit]);

  // Categories of one row that still need detection, in CATEGORIES order.
  const categoriesNeedingDetection = (row: Record<string, any>): IntelligenceCategory[] =>
    CATEGORIES.filter(
      category => !row[`${category}_provider`] || row[`${category}_confidence`] < 70
    );

  if (flags.dryRun) {
    console.log(` Would queue ${rows.length} dispensaries for multi-category detection:`);
    rows.forEach(row => {
      console.log(` - [${row.id}] ${row.name} (needs: ${categoriesNeedingDetection(row).join(', ')})`);
    });
    return rows.length;
  }

  let queued = 0;
  for (const dispensary of rows) {
    try {
      // One detection job per category that needs it
      for (const category of categoriesNeedingDetection(dispensary)) {
        await pool.query(
          `INSERT INTO sandbox_crawl_jobs (dispensary_id, category, job_type, status, priority)
VALUES ($1, $2, 'detection', 'pending', 10)
ON CONFLICT DO NOTHING`,
          [dispensary.id, category]
        );
      }
      console.log(` ✓ Queued detection: [${dispensary.id}] ${dispensary.name}`);
      queued += 1;
    } catch (error: any) {
      console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
    }
  }
  return queued;
}
/**
 * Queue production crawls for dispensaries that are ready for them.
 *
 * Only the 'product' category has a production crawler (Dutchie); every other
 * category is skipped before any SQL is built, so 'product' is the only value
 * ever interpolated into the column names below.
 *
 * A dispensary counts as queued only when the INSERT ... SELECT actually
 * inserted a crawl_jobs row, i.e. a matching store was found.
 *
 * @param category Restrict to one category; all categories when omitted.
 * @returns Number of crawls queued (or that would be queued in dry-run mode).
 */
async function queueCategoryProductionCrawls(category?: IntelligenceCategory): Promise<number> {
  const categories = category ? [category] : CATEGORIES;
  let totalQueued = 0;

  for (const cat of categories) {
    console.log(`\n🏭 Queueing Production ${cat.toUpperCase()} Crawls...`);

    // For now, only products have production-ready crawlers (Dutchie only)
    if (cat !== 'product') {
      console.log(` ⏭️ No production crawler for ${cat} yet - skipping`);
      continue;
    }

    // Find dispensaries ready for production crawl:
    // never scanned first, then those not scanned in the last 4 hours, oldest first.
    const query = `
SELECT id, name, ${cat}_provider as provider, last_${cat}_scan_at as last_scan
FROM dispensaries
WHERE ${cat}_provider = 'dutchie'
AND ${cat}_crawler_mode = 'production'
AND ${cat}_confidence >= 70
AND (last_${cat}_scan_at IS NULL OR last_${cat}_scan_at < NOW() - INTERVAL '4 hours')
ORDER BY
CASE WHEN last_${cat}_scan_at IS NULL THEN 0 ELSE 1 END,
last_${cat}_scan_at ASC
LIMIT $1
`;
    const result = await pool.query(query, [flags.limit]);

    if (flags.dryRun) {
      console.log(` Would queue ${result.rows.length} dispensaries for ${cat} production crawl:`);
      for (const row of result.rows) {
        const lastScan = row.last_scan ? new Date(row.last_scan).toISOString() : 'never';
        console.log(` - [${row.id}] ${row.name} (provider: ${row.provider}, last: ${lastScan})`);
      }
      totalQueued += result.rows.length;
      continue;
    }

    for (const dispensary of result.rows) {
      try {
        // For products, use the existing crawl_jobs table for production.
        // The SELECT yields no row (and nothing is inserted) when no store matches
        // the dispensary by menu URL or name.
        const inserted = await pool.query(
          `INSERT INTO crawl_jobs (store_id, job_type, trigger_type, status, priority, metadata)
SELECT s.id, 'full_crawl', 'scheduled', 'pending', 50,
jsonb_build_object('dispensary_id', $1, 'category', $2, 'source', 'queue-intelligence')
FROM stores s
JOIN dispensaries d ON (d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%')
WHERE d.id = $1
LIMIT 1`,
          [dispensary.id, cat]
        );
        if ((inserted.rowCount ?? 0) > 0) {
          console.log(` ✓ Queued ${cat} production: [${dispensary.id}] ${dispensary.name}`);
          totalQueued++;
        } else {
          console.log(` ⏭️ No matching store for [${dispensary.id}] ${dispensary.name} - nothing queued`);
        }
      } catch (error: any) {
        console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
      }
    }
  }
  return totalQueued;
}
/**
 * Queue sandbox crawls for dispensaries whose crawler mode for a category is
 * 'sandbox' and that have no pending or running sandbox job for it.
 *
 * The category name is interpolated into SQL column names, so every category
 * is checked against CATEGORIES before any query is built.
 *
 * @param category Restrict to one category; all categories when omitted.
 * @returns Number of crawls queued (or that would be queued in dry-run mode).
 * @throws Error when a category is not one of CATEGORIES.
 */
async function queueCategorySandboxCrawls(category?: IntelligenceCategory): Promise<number> {
  const categories = category ? [category] : CATEGORIES;

  // Identifiers cannot be bound as query parameters, so whitelist them instead.
  const unknownCategory = categories.find(c => !CATEGORIES.includes(c));
  if (unknownCategory !== undefined) {
    throw new Error(`Unknown category "${unknownCategory}"; expected one of: ${CATEGORIES.join(', ')}`);
  }

  let totalQueued = 0;
  for (const cat of categories) {
    console.log(`\n🧪 Queueing Sandbox ${cat.toUpperCase()} Crawls...`);

    // Find dispensaries in sandbox mode for this category
    const query = `
SELECT d.id, d.name, d.${cat}_provider as provider, d.${cat}_confidence as confidence,
d.website, d.menu_url
FROM dispensaries d
WHERE d.${cat}_crawler_mode = 'sandbox'
AND d.${cat}_provider IS NOT NULL
AND (d.website IS NOT NULL OR d.menu_url IS NOT NULL)
AND NOT EXISTS (
SELECT 1 FROM sandbox_crawl_jobs sj
WHERE sj.dispensary_id = d.id
AND sj.category = $1
AND sj.status IN ('pending', 'running')
)
ORDER BY d.${cat}_confidence DESC, d.updated_at ASC
LIMIT $2
`;
    const result = await pool.query(query, [cat, flags.limit]);

    if (flags.dryRun) {
      console.log(` Would queue ${result.rows.length} dispensaries for ${cat} sandbox crawl:`);
      for (const row of result.rows) {
        console.log(` - [${row.id}] ${row.name} (provider: ${row.provider}, confidence: ${row.confidence}%)`);
      }
      totalQueued += result.rows.length;
      continue;
    }

    for (const dispensary of result.rows) {
      try {
        // Create sandbox entry if needed (an existing active sandbox is only touched)
        const sandboxResult = await pool.query(
          `INSERT INTO crawler_sandboxes (dispensary_id, category, suspected_menu_provider, mode, status)
VALUES ($1, $2, $3, 'template_learning', 'pending')
ON CONFLICT (dispensary_id, category) WHERE status NOT IN ('moved_to_production', 'failed')
DO UPDATE SET updated_at = NOW()
RETURNING id`,
          [dispensary.id, cat, dispensary.provider]
        );
        const sandboxId = sandboxResult.rows[0]?.id;

        // Create sandbox job
        await pool.query(
          `INSERT INTO sandbox_crawl_jobs (dispensary_id, sandbox_id, category, job_type, status, priority)
VALUES ($1, $2, $3, 'crawl', 'pending', 5)`,
          [dispensary.id, sandboxId, cat]
        );
        console.log(` ✓ Queued ${cat} sandbox: [${dispensary.id}] ${dispensary.name} (${dispensary.provider})`);
        totalQueued++;
      } catch (error: any) {
        console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
      }
    }
  }
  return totalQueued;
}
/**
 * Run provider detection for dispensaries that have pending 'detection' jobs.
 *
 * Dispensaries are selected from sandbox_crawl_jobs (optionally filtered by
 * --category and --dispensary, capped by --limit). For each one, its pending
 * detection jobs are marked 'running', detection runs once for all categories,
 * and the running jobs are then marked 'completed' with a result summary, or
 * 'failed' with the error message.
 */
async function processDetectionJobs(): Promise<void> {
console.log('\n🔍 Processing Detection Jobs...');
// Get pending detection jobs
// $1 is always the limit. The optional category filter takes $2, and the
// optional dispensary filter takes $3 when a category is also given, else $2;
// the parameter array below is built in the same order.
const jobs = await pool.query(
`SELECT DISTINCT dispensary_id
FROM sandbox_crawl_jobs
WHERE job_type = 'detection' AND status = 'pending'
${flags.category ? `AND category = $2` : ''}
${flags.dispensary ? `AND dispensary_id = $${flags.category ? '3' : '2'}` : ''}
LIMIT $1`,
flags.category
? (flags.dispensary ? [flags.limit, flags.category, flags.dispensary] : [flags.limit, flags.category])
: (flags.dispensary ? [flags.limit, flags.dispensary] : [flags.limit])
);
for (const job of jobs.rows) {
console.log(`\nProcessing detection for dispensary ${job.dispensary_id}...`);
try {
// Get dispensary info
const dispResult = await pool.query(
'SELECT id, name, website, menu_url FROM dispensaries WHERE id = $1',
[job.dispensary_id]
);
const dispensary = dispResult.rows[0];
// NOTE(review): both `continue` paths below run before the jobs are marked
// 'running', so those job rows stay 'pending' and are eligible for selection
// again on the next run - confirm this is intended.
if (!dispensary) {
console.log(` ✗ Dispensary not found`);
continue;
}
const websiteUrl = dispensary.website || dispensary.menu_url;
if (!websiteUrl) {
console.log(` ✗ No website URL`);
continue;
}
// Mark jobs as running
// (no category condition: every pending detection job of this dispensary is claimed)
await pool.query(
`UPDATE sandbox_crawl_jobs SET status = 'running', started_at = NOW()
WHERE dispensary_id = $1 AND job_type = 'detection' AND status = 'pending'`,
[job.dispensary_id]
);
// Run multi-category detection
console.log(` Detecting providers for ${dispensary.name}...`);
const detection = await detectMultiCategoryProviders(websiteUrl, { timeout: 45000 });
// Update all categories
await updateAllCategoryProviders(job.dispensary_id, detection);
// Mark jobs as completed
// The same provider/confidence summary is stored on every running detection job of this dispensary.
await pool.query(
`UPDATE sandbox_crawl_jobs SET status = 'completed', completed_at = NOW(),
result_summary = $1
WHERE dispensary_id = $2 AND job_type = 'detection' AND status = 'running'`,
[JSON.stringify({
product: { provider: detection.product.provider, confidence: detection.product.confidence },
specials: { provider: detection.specials.provider, confidence: detection.specials.confidence },
brand: { provider: detection.brand.provider, confidence: detection.brand.confidence },
metadata: { provider: detection.metadata.provider, confidence: detection.metadata.confidence },
}), job.dispensary_id]
);
console.log(` ✓ Detection complete:`);
console.log(` Product: ${detection.product.provider} (${detection.product.confidence}%) -> ${detection.product.mode}`);
console.log(` Specials: ${detection.specials.provider} (${detection.specials.confidence}%) -> ${detection.specials.mode}`);
console.log(` Brand: ${detection.brand.provider} (${detection.brand.confidence}%) -> ${detection.brand.mode}`);
console.log(` Metadata: ${detection.metadata.provider} (${detection.metadata.confidence}%) -> ${detection.metadata.mode}`);
} catch (error: any) {
console.log(` ✗ Error: ${error.message}`);
// Only jobs already marked 'running' are failed; an error raised before that
// point leaves the jobs 'pending'.
await pool.query(
`UPDATE sandbox_crawl_jobs SET status = 'failed', error_message = $1
WHERE dispensary_id = $2 AND job_type = 'detection' AND status = 'running'`,
[error.message, job.dispensary_id]
);
}
}
}
/**
 * Process crawl jobs for the selected category, or for every category.
 *
 * Sandbox jobs run when --sandbox is set or --production is not. Production
 * crawls run only for the 'product' category, for Dutchie dispensaries in
 * production mode with confidence >= 70, optionally narrowed to one dispensary.
 */
async function processCrawlJobs(): Promise<void> {
  const selectedCategories = flags.category ? [flags.category] : CATEGORIES;

  for (const cat of selectedCategories) {
    console.log(`\n⚙ Processing ${cat.toUpperCase()} Crawl Jobs...\n`);

    // Sandbox jobs for this category
    const runSandbox = flags.sandbox || !flags.production;
    if (runSandbox) {
      await processCategorySandboxJobs(cat, flags.limit);
    }

    // Production jobs exist for the product category only
    if (!flags.production || cat !== 'product') {
      continue;
    }

    const queryParams = flags.dispensary ? [flags.limit, flags.dispensary] : [flags.limit];
    const { rows: productionTargets } = await pool.query(
      `SELECT d.id
FROM dispensaries d
WHERE d.product_provider = 'dutchie'
AND d.product_crawler_mode = 'production'
AND d.product_confidence >= 70
${flags.dispensary ? 'AND d.id = $2' : ''}
LIMIT $1`,
      queryParams
    );

    for (const target of productionTargets) {
      console.log(`Processing production ${cat} crawl for dispensary ${target.id}...`);
      const outcome = await runCrawlProductsJob(target.id);
      console.log(` ${outcome.success ? '✓' : '✗'} ${outcome.message}`);
    }
  }
}
/**
 * Run the requested stages (detection, production crawls, sandbox crawls)
 * directly for the dispensary given by --dispensary. Does nothing when no
 * dispensary ID was supplied, and only reports when the ID is not found.
 */
async function processSpecificDispensary(): Promise<void> {
  const dispensaryId = flags.dispensary;
  if (!dispensaryId) {
    return;
  }

  console.log(`\n🎯 Processing Dispensary ${dispensaryId}...\n`);
  const { rows } = await pool.query(
    'SELECT * FROM dispensaries WHERE id = $1',
    [dispensaryId]
  );
  if (rows.length === 0) {
    console.log('Dispensary not found');
    return;
  }

  const [dispensary] = rows;
  // The website is preferred over the menu URL as the detection target.
  const websiteUrl = dispensary.website || dispensary.menu_url;
  console.log(`Name: ${dispensary.name}`);
  console.log(`Website: ${websiteUrl || 'none'}`);
  console.log('');

  if (flags.detection) {
    console.log('Running multi-category detection...');
    if (websiteUrl) {
      const detection = await detectMultiCategoryProviders(websiteUrl);
      await updateAllCategoryProviders(dispensaryId, detection);
      const { product, specials, brand, metadata } = detection;
      console.log('Detection results:');
      console.log(` Product: ${product.provider} (${product.confidence}%) -> ${product.mode}`);
      console.log(` Specials: ${specials.provider} (${specials.confidence}%) -> ${specials.mode}`);
      console.log(` Brand: ${brand.provider} (${brand.confidence}%) -> ${brand.mode}`);
      console.log(` Metadata: ${metadata.provider} (${metadata.confidence}%) -> ${metadata.mode}`);
    }
  }

  if (flags.production) {
    console.log('\nRunning production crawls...');
    const production = await runAllCategoryProductionCrawls(dispensaryId);
    console.log(` ${production.summary}`);
  }

  if (flags.sandbox) {
    console.log('\nRunning sandbox crawls...');
    const sandbox = await runAllCategorySandboxCrawls(dispensaryId);
    console.log(` ${sandbox.summary}`);
  }
}
/**
 * Print provider, crawler-mode and confidence totals per category, followed by
 * sandbox job counts per category and status.
 */
async function showStats(): Promise<void> {
  console.log('\n📊 Multi-Category Intelligence Stats:');

  // One aggregate query per category over the dispensaries table
  for (const cat of CATEGORIES) {
    const { rows } = await pool.query(`
SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE ${cat}_provider IS NULL) as no_provider,
COUNT(*) FILTER (WHERE ${cat}_provider = 'dutchie') as dutchie,
COUNT(*) FILTER (WHERE ${cat}_provider = 'treez') as treez,
COUNT(*) FILTER (WHERE ${cat}_provider NOT IN ('dutchie', 'treez', 'unknown') AND ${cat}_provider IS NOT NULL) as other,
COUNT(*) FILTER (WHERE ${cat}_provider = 'unknown') as unknown,
COUNT(*) FILTER (WHERE ${cat}_crawler_mode = 'production') as production,
COUNT(*) FILTER (WHERE ${cat}_crawler_mode = 'sandbox') as sandbox,
AVG(${cat}_confidence) as avg_confidence
FROM dispensaries
`);
    const totals = rows[0];
    console.log(`
${cat.toUpperCase()}:
Providers: Dutchie=${totals.dutchie}, Treez=${totals.treez}, Other=${totals.other}, Unknown=${totals.unknown}, None=${totals.no_provider}
Modes: Production=${totals.production}, Sandbox=${totals.sandbox}
Avg Confidence: ${Math.round(totals.avg_confidence || 0)}%`);
  }

  // Sandbox job counts, grouped by category
  console.log('\n Sandbox Jobs by Category:');
  const { rows: jobRows } = await pool.query(`
SELECT
category,
COUNT(*) FILTER (WHERE status = 'pending') as pending,
COUNT(*) FILTER (WHERE status = 'running') as running,
COUNT(*) FILTER (WHERE status = 'completed') as completed,
COUNT(*) FILTER (WHERE status = 'failed') as failed
FROM sandbox_crawl_jobs
GROUP BY category
ORDER BY category
`);
  jobRows.forEach(job => {
    console.log(` ${job.category}: pending=${job.pending}, running=${job.running}, completed=${job.completed}, failed=${job.failed}`);
  });
}
/**
 * CLI entry point: print stats, then either process jobs (--process) or queue
 * them, and print stats again unless this is a dry run.
 *
 * --process is rejected together with --dry-run, because the processing paths
 * write to the database and would contradict the "no changes" banner.
 *
 * On a fatal error the pool is closed first and the process then exits with
 * status 1.
 */
async function main() {
  if (flags.help) {
    await showHelp();
    process.exit(0);
  }
  console.log('═══════════════════════════════════════════════════════');
  console.log(' Multi-Category Intelligence Queue Manager');
  console.log('═══════════════════════════════════════════════════════');
  if (flags.dryRun) {
    console.log('\n🔍 DRY RUN MODE - No changes will be made\n');
  }
  if (flags.category) {
    console.log(`\n📌 Filtering to category: ${flags.category}\n`);
  }

  let failed = false;
  try {
    if (flags.dryRun && flags.process) {
      throw new Error('--process cannot be combined with --dry-run: processing jobs writes to the database');
    }

    // Show current stats first
    await showStats();

    // If specific dispensary specified, process it directly
    if (flags.dispensary && flags.process) {
      await processSpecificDispensary();
    } else if (flags.process) {
      // Process mode - run jobs
      if (flags.detection) {
        await processDetectionJobs();
      }
      await processCrawlJobs();
    } else {
      // Queuing mode
      let totalQueued = 0;
      if (flags.detection) {
        totalQueued += await queueMultiCategoryDetection();
      }
      if (flags.production) {
        totalQueued += await queueCategoryProductionCrawls(flags.category);
      }
      if (flags.sandbox) {
        totalQueued += await queueCategorySandboxCrawls(flags.category);
      }
      console.log('\n═══════════════════════════════════════════════════════');
      console.log(` Total queued: ${totalQueued}`);
      console.log('═══════════════════════════════════════════════════════\n');
    }

    // Show updated stats
    if (!flags.dryRun) {
      await showStats();
    }
  } catch (error) {
    console.error('Fatal error:', error);
    failed = true;
  } finally {
    await pool.end();
  }

  // Exit only after the pool is closed; calling process.exit() inside the
  // catch block would terminate before `finally` runs.
  if (failed) {
    process.exit(1);
  }
}
main().catch((error) => {
  // Reached only when something outside the try block rejects (e.g. pool.end()).
  console.error('Fatal error:', error);
  process.exit(1);
});

View File

@@ -1,173 +0,0 @@
#!/usr/bin/env npx tsx
/**
* Dutchie Platform ID Resolver
*
* Standalone script to resolve a Dutchie dispensary slug to its platform ID.
*
* USAGE:
* npx tsx src/scripts/resolve-dutchie-id.ts <slug>
* npx tsx src/scripts/resolve-dutchie-id.ts hydroman-dispensary
* npx tsx src/scripts/resolve-dutchie-id.ts AZ-Deeply-Rooted
*
* RESOLUTION STRATEGY:
* 1. Navigate to https://dutchie.com/embedded-menu/{slug} via Puppeteer
* 2. Extract window.reactEnv.dispensaryId (preferred - fastest)
* 3. If reactEnv fails, call GraphQL GetAddressBasedDispensaryData as fallback
*
* OUTPUT:
* - dispensaryId: The MongoDB ObjectId (e.g., "6405ef617056e8014d79101b")
* - source: "reactEnv" or "graphql"
* - httpStatus: HTTP status from embedded menu page
* - error: Error message if resolution failed
*/
import { resolveDispensaryIdWithDetails, ResolveDispensaryResult } from '../dutchie-az/services/graphql-client';
/** Print each line to stdout, in order. */
function printLines(...lines: string[]): void {
  lines.forEach(line => console.log(line));
}

/** Print the CLI usage text. */
function printUsage(): void {
  console.log(`
Dutchie Platform ID Resolver
Usage:
npx tsx src/scripts/resolve-dutchie-id.ts <slug>
Examples:
npx tsx src/scripts/resolve-dutchie-id.ts hydroman-dispensary
npx tsx src/scripts/resolve-dutchie-id.ts AZ-Deeply-Rooted
npx tsx src/scripts/resolve-dutchie-id.ts mint-cannabis
Resolution Strategy:
1. Puppeteer navigates to https://dutchie.com/embedded-menu/{slug}
2. Extracts window.reactEnv.dispensaryId (preferred)
3. Falls back to GraphQL GetAddressBasedDispensaryData if needed
Output Fields:
- dispensaryId: MongoDB ObjectId (e.g., "6405ef617056e8014d79101b")
- source: "reactEnv" (from page) or "graphql" (from API)
- httpStatus: HTTP status code from page load
- error: Error message if resolution failed
`);
}

/** Report a resolved ID: details, an example request body, then a JSON summary. */
function printSuccess(slug: string, result: ResolveDispensaryResult, duration: number): void {
  const rule = '='.repeat(60);
  printLines(
    `✓ SUCCESS`,
    '',
    ` Dispensary ID: ${result.dispensaryId}`,
    ` Source: ${result.source}`,
    ` HTTP Status: ${result.httpStatus || 'N/A'}`,
    ` Duration: ${duration}ms`,
    ''
  );
  // Show how to use this ID
  printLines(
    rule,
    'USAGE',
    rule,
    '',
    'Use this ID in GraphQL FilteredProducts query:',
    '',
    ' POST https://dutchie.com/api-3/graphql',
    '',
    ' Body:'
  );
  console.log(` {
"operationName": "FilteredProducts",
"variables": {
"productsFilter": {
"dispensaryId": "${result.dispensaryId}",
"pricingType": "rec",
"Status": "Active"
},
"page": 0,
"perPage": 100
},
"extensions": {
"persistedQuery": {
"version": 1,
"sha256Hash": "ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0"
}
}
}`);
  // Output for piping/scripting
  printLines('', rule, 'JSON OUTPUT', rule);
  const summary = {
    success: true,
    slug,
    dispensaryId: result.dispensaryId,
    source: result.source,
    httpStatus: result.httpStatus,
    durationMs: duration,
  };
  console.log(JSON.stringify(summary, null, 2));
}

/** Report a resolution that returned no ID, followed by a JSON summary. */
function printFailure(slug: string, result: ResolveDispensaryResult, duration: number): void {
  printLines(
    `✗ FAILED`,
    '',
    ` Error: ${result.error || 'Unknown error'}`,
    ` HTTP Status: ${result.httpStatus || 'N/A'}`,
    ` Duration: ${duration}ms`,
    ''
  );
  const storeUnavailable = result.httpStatus === 403 || result.httpStatus === 404;
  if (storeUnavailable) {
    printLines(
      'NOTE: This store may be removed or not accessible on Dutchie.',
      ' Mark dispensary as not_crawlable in the database.'
    );
  }
  printLines('', 'JSON OUTPUT:');
  const summary = {
    success: false,
    slug,
    error: result.error,
    httpStatus: result.httpStatus,
    durationMs: duration,
  };
  console.log(JSON.stringify(summary, null, 2));
}

/**
 * CLI entry point: resolve the slug given as the first argument to a Dutchie
 * dispensary ID and print the outcome.
 *
 * Exits with status 0 after printing usage (no arguments, --help or -h), and
 * with status 1 when resolution fails or throws.
 */
async function main() {
  const args = process.argv.slice(2);
  const wantsHelp = args.length === 0 || args.includes('--help') || args.includes('-h');
  if (wantsHelp) {
    printUsage();
    process.exit(0);
  }

  const slug = args[0];
  const rule = '='.repeat(60);
  printLines(
    rule,
    'DUTCHIE PLATFORM ID RESOLVER',
    rule,
    `Slug: ${slug}`,
    `Embedded Menu URL: https://dutchie.com/embedded-menu/${slug}`,
    '',
    'Resolving...',
    ''
  );

  const startTime = Date.now();
  try {
    const result: ResolveDispensaryResult = await resolveDispensaryIdWithDetails(slug);
    const duration = Date.now() - startTime;
    printLines(rule, 'RESOLUTION RESULT', rule);
    if (result.dispensaryId) {
      printSuccess(slug, result, duration);
    } else {
      printFailure(slug, result, duration);
      process.exit(1);
    }
  } catch (error: any) {
    const duration = Date.now() - startTime;
    const errorLines = [
      rule,
      'ERROR',
      rule,
      `Message: ${error.message}`,
      `Duration: ${duration}ms`,
      '',
    ];
    errorLines.forEach(line => console.error(line));
    if (error.message.includes('net::ERR_NAME_NOT_RESOLVED')) {
      console.error('NOTE: DNS resolution failed. This typically happens when running');
      console.error(' locally due to network restrictions. Try running from the');
      console.error(' Kubernetes pod or a cloud environment.');
    }
    process.exit(1);
  }
}
main();

View File

@@ -1,151 +0,0 @@
/**
* LEGACY SCRIPT - Run Dutchie GraphQL Scrape
*
* DEPRECATED: This script creates its own database pool.
* Future implementations should use the CannaiQ API endpoints instead.
*
* This script demonstrates the full pipeline:
* 1. Puppeteer navigates to Dutchie menu
* 2. GraphQL responses are intercepted
* 3. Products are normalized to our schema
* 4. Products are upserted to database
* 5. Derived views (brands, categories, specials) are automatically updated
*
* DO NOT:
* - Add this to package.json scripts
* - Run this in automated jobs
* - Use DATABASE_URL directly
*/
import { Pool } from 'pg';
import { scrapeDutchieMenu } from '../scrapers/dutchie-graphql';
console.warn('\n⚠ LEGACY SCRIPT: This script should be replaced with CannaiQ API calls.\n');
// Connection string for the single CannaiQ database (cannaiq in the
// cannaiq-postgres container). CANNAIQ_DB_URL wins when it is set; otherwise
// the URL is assembled from the individual CANNAIQ_DB_* variables, each of
// which falls back to a local-development default.
const DATABASE_URL = process.env.CANNAIQ_DB_URL || [
  'postgresql://',
  process.env.CANNAIQ_DB_USER || 'dutchie',
  ':',
  process.env.CANNAIQ_DB_PASS || 'dutchie_local_pass',
  '@',
  process.env.CANNAIQ_DB_HOST || 'localhost',
  ':',
  process.env.CANNAIQ_DB_PORT || '54320',
  '/',
  process.env.CANNAIQ_DB_NAME || 'cannaiq',
].join('');
/**
 * Runs the Dutchie GraphQL scrape for one hard-coded store and prints a report.
 *
 * Opens a pg pool on DATABASE_URL, calls scrapeDutchieMenu() for store 1 and
 * prints the counters it returns. When the scrape reports success, it also
 * prints up to five rows each from derived_brands, current_specials and
 * derived_categories, plus the most recently updated product that has a
 * subcategory. The pool is closed on every path.
 *
 * When the scraper reports failure the closing banner says so and
 * process.exitCode is set to 1. Unexpected errors are logged and rethrown.
 */
async function main(): Promise<void> {
  const pool = new Pool({ connectionString: DATABASE_URL });
  try {
    console.log('='.repeat(80));
    console.log('DUTCHIE GRAPHQL SCRAPER - FULL PIPELINE TEST');
    console.log('='.repeat(80));
    // Replace the password segment of the URL before printing it.
    console.log(`Database: ${DATABASE_URL.replace(/:[^:@]+@/, ':***@')}`);

    // Configuration
    const storeId = 1; // Deeply Rooted
    const menuUrl = 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted';
    console.log(`\nStore ID: ${storeId}`);
    console.log(`Menu URL: ${menuUrl}`);
    console.log('\n' + '-'.repeat(80));

    // Run the scrape
    console.log('\n🚀 Starting scrape...\n');
    const result = await scrapeDutchieMenu(pool, storeId, menuUrl);

    console.log('\n' + '-'.repeat(80));
    console.log('📊 SCRAPE RESULTS:');
    console.log('-'.repeat(80));
    console.log(` Success: ${result.success}`);
    console.log(` Products Found: ${result.productsFound}`);
    console.log(` Inserted: ${result.inserted}`);
    console.log(` Updated: ${result.updated}`);
    if (result.error) {
      console.log(` Error: ${result.error}`);
    }

    // Query derived views to show the result
    if (result.success) {
      console.log('\n' + '-'.repeat(80));
      console.log('📈 DERIVED DATA (from products table):');
      console.log('-'.repeat(80));

      // Brands
      const brandsResult = await pool.query(`
SELECT brand_name, product_count, min_price, max_price
FROM derived_brands
WHERE store_id = $1
ORDER BY product_count DESC
LIMIT 5
`, [storeId]);
      console.log('\nTop 5 Brands:');
      for (const row of brandsResult.rows) {
        console.log(` - ${row.brand_name}: ${row.product_count} products ($${row.min_price} - $${row.max_price})`);
      }

      // Specials
      const specialsResult = await pool.query(`
SELECT name, brand, rec_price, rec_special_price, discount_percent
FROM current_specials
WHERE store_id = $1
LIMIT 5
`, [storeId]);
      console.log('\nTop 5 Specials:');
      if (specialsResult.rows.length === 0) {
        console.log(' (No specials found - is_on_special may not be populated yet)');
      } else {
        for (const row of specialsResult.rows) {
          // Separate regular and special price; printed back to back they
          // read as a single number (e.g. "$30$25").
          console.log(` - ${row.name} (${row.brand}): $${row.rec_price} → $${row.rec_special_price} (${row.discount_percent}% off)`);
        }
      }

      // Categories
      const categoriesResult = await pool.query(`
SELECT category_name, product_count
FROM derived_categories
WHERE store_id = $1
ORDER BY product_count DESC
LIMIT 5
`, [storeId]);
      console.log('\nTop 5 Categories:');
      if (categoriesResult.rows.length === 0) {
        console.log(' (No categories found - subcategory may not be populated yet)');
      } else {
        for (const row of categoriesResult.rows) {
          console.log(` - ${row.category_name}: ${row.product_count} products`);
        }
      }

      // Sample product
      const sampleResult = await pool.query(`
SELECT name, brand, subcategory, rec_price, rec_special_price, is_on_special, thc_percentage, status
FROM products
WHERE store_id = $1 AND subcategory IS NOT NULL
ORDER BY updated_at DESC
LIMIT 1
`, [storeId]);
      if (sampleResult.rows.length > 0) {
        const sample = sampleResult.rows[0];
        console.log('\nSample Product (with new fields):');
        console.log(` Name: ${sample.name}`);
        console.log(` Brand: ${sample.brand}`);
        console.log(` Category: ${sample.subcategory}`);
        console.log(` Price: $${sample.rec_price}`);
        console.log(` Sale Price: ${sample.rec_special_price ? `$${sample.rec_special_price}` : 'N/A'}`);
        console.log(` On Special: ${sample.is_on_special}`);
        console.log(` THC: ${sample.thc_percentage}%`);
        console.log(` Status: ${sample.status}`);
      }
    }

    console.log('\n' + '='.repeat(80));
    if (result.success) {
      console.log('✅ SCRAPE COMPLETE');
    } else {
      // Do not report a failed scrape as complete, and make the failure
      // visible to whoever invoked the script.
      console.log('❌ SCRAPE FAILED');
      process.exitCode = 1;
    }
    console.log('='.repeat(80));
  } catch (error: unknown) {
    console.error('\n❌ Error:', error instanceof Error ? error.message : error);
    throw error;
  } finally {
    await pool.end();
  }
}

main().catch((error: unknown) => {
  console.error(error);
  // Without this the process would exit with status 0 after a failure.
  process.exitCode = 1;
});

View File

@@ -1,225 +0,0 @@
/**
* Sandbox Crawl Script for Dispensary 101 (Trulieve Scottsdale)
*
* Runs a full crawl and captures trace data for observability.
* NO automatic promotion or status changes.
*/
import { Pool } from 'pg';
import { crawlDispensaryProducts } from '../dutchie-az/services/product-crawler';
import { Dispensary } from '../dutchie-az/types';
const pool = new Pool({ connectionString: process.env.DATABASE_URL });

/**
 * Sandbox crawl of dispensary 101 with a trace record for observability.
 *
 * Loads the dispensary row and its crawler profile, runs
 * crawlDispensaryProducts() in 'rec' mode with images disabled, prints the
 * result, a sample of up to ten products and field-coverage statistics, and
 * inserts one row into crawl_orchestration_traces describing the run. The
 * profile status is written unchanged as both start and end state.
 *
 * The pool is closed on every path. process.exitCode is set to 1 when the
 * dispensary is missing, the crawl reports failure, or the crawl throws.
 */
async function main(): Promise<void> {
  console.log('=== SANDBOX CRAWL: Dispensary 101 (Trulieve Scottsdale) ===\n');
  const startTime = Date.now();

  try {
    // Load dispensary from database (only columns that exist in local schema)
    const dispResult = await pool.query(`
SELECT id, name, city, state, menu_type, platform_dispensary_id, menu_url
FROM dispensaries
WHERE id = 101
`);
    const row = dispResult.rows[0];
    if (!row) {
      console.log('ERROR: Dispensary 101 not found');
      process.exitCode = 1;
      return;
    }

    // Map to Dispensary interface (snake_case -> camelCase)
    const dispensary: Dispensary = {
      id: row.id,
      platform: 'dutchie',
      name: row.name,
      // The query does not select a slug column; derive one from the name.
      slug: row.name.toLowerCase().replace(/\s+/g, '-'),
      city: row.city,
      state: row.state,
      platformDispensaryId: row.platform_dispensary_id,
      menuType: row.menu_type,
      menuUrl: row.menu_url,
      createdAt: new Date(),
      updatedAt: new Date(),
    };

    console.log('=== DISPENSARY INFO ===');
    console.log(`Name: ${dispensary.name}`);
    console.log(`Location: ${dispensary.city}, ${dispensary.state}`);
    console.log(`Menu Type: ${dispensary.menuType}`);
    console.log(`Platform ID: ${dispensary.platformDispensaryId}`);
    console.log(`Menu URL: ${dispensary.menuUrl}`);
    console.log('');

    // Get profile info
    const profileResult = await pool.query(`
SELECT id, profile_key, status, config FROM dispensary_crawler_profiles
WHERE dispensary_id = 101
`);
    const profile = profileResult.rows[0];
    console.log('=== PROFILE ===');
    if (profile) {
      console.log(`Profile Key: ${profile.profile_key}`);
      console.log(`Profile Status: ${profile.status}`);
      console.log(`Config: ${JSON.stringify(profile.config, null, 2)}`);
    } else {
      console.log('No profile found - will use defaults');
    }
    console.log('');

    // Run the crawl
    console.log('=== STARTING CRAWL ===');
    console.log('Options: useBothModes=true, downloadImages=false (sandbox)');
    console.log('');

    try {
      const result = await crawlDispensaryProducts(dispensary, 'rec', {
        useBothModes: true,
        downloadImages: false, // Skip images in sandbox mode for speed
      });

      console.log('');
      console.log('=== CRAWL RESULT ===');
      console.log(`Success: ${result.success}`);
      console.log(`Products Found: ${result.productsFound}`);
      console.log(`Products Fetched: ${result.productsFetched}`);
      console.log(`Products Upserted: ${result.productsUpserted}`);
      console.log(`Snapshots Created: ${result.snapshotsCreated}`);
      if (result.errorMessage) {
        console.log(`Error: ${result.errorMessage}`);
      }
      console.log(`Duration: ${result.durationMs}ms`);
      console.log('');
      if (!result.success) {
        process.exitCode = 1;
      }

      // Show sample products from database
      if (result.productsUpserted > 0) {
        const sampleProducts = await pool.query(`
SELECT
id, name, brand_name, type, subcategory, strain_type,
price_rec, price_rec_original, stock_status, external_product_id
FROM dutchie_products
WHERE dispensary_id = 101
ORDER BY updated_at DESC
LIMIT 10
`);
        console.log('=== SAMPLE PRODUCTS (10) ===');
        sampleProducts.rows.forEach((p: any, i: number) => {
          console.log(`${i + 1}. ${p.name}`);
          console.log(` Brand: ${p.brand_name || 'N/A'}`);
          console.log(` Type: ${p.type} / ${p.subcategory || 'N/A'}`);
          console.log(` Strain: ${p.strain_type || 'N/A'}`);
          console.log(` Price: $${p.price_rec || 'N/A'} (orig: $${p.price_rec_original || 'N/A'})`);
          console.log(` Stock: ${p.stock_status}`);
          console.log(` External ID: ${p.external_product_id}`);
          console.log('');
        });

        // Show field coverage stats
        const fieldStats = await pool.query(`
SELECT
COUNT(*) as total,
COUNT(brand_name) as with_brand,
COUNT(type) as with_type,
COUNT(strain_type) as with_strain,
COUNT(price_rec) as with_price,
COUNT(image_url) as with_image,
COUNT(description) as with_description,
COUNT(thc_content) as with_thc,
COUNT(cbd_content) as with_cbd
FROM dutchie_products
WHERE dispensary_id = 101
`);
        const stats = fieldStats.rows[0];
        // Convert explicitly instead of relying on implicit coercion in the
        // division, and avoid printing NaN% if the table is empty.
        const total = Number(stats.total);
        const coverage = (count: unknown): string => {
          const percent = total > 0 ? Math.round((Number(count) / total) * 100) : 0;
          return `${count} (${percent}%)`;
        };
        console.log('=== FIELD COVERAGE ===');
        console.log(`Total products: ${stats.total}`);
        console.log(`With brand: ${coverage(stats.with_brand)}`);
        console.log(`With type: ${coverage(stats.with_type)}`);
        console.log(`With strain_type: ${coverage(stats.with_strain)}`);
        console.log(`With price_rec: ${coverage(stats.with_price)}`);
        console.log(`With image_url: ${coverage(stats.with_image)}`);
        console.log(`With description: ${coverage(stats.with_description)}`);
        console.log(`With THC: ${coverage(stats.with_thc)}`);
        console.log(`With CBD: ${coverage(stats.with_cbd)}`);
        console.log('');
      }

      // Insert trace record for observability
      const traceData = {
        crawlResult: result,
        dispensaryInfo: {
          id: dispensary.id,
          name: dispensary.name,
          platformDispensaryId: dispensary.platformDispensaryId,
          menuUrl: dispensary.menuUrl,
        },
        profile: profile || null,
        timestamp: new Date().toISOString(),
      };
      await pool.query(`
INSERT INTO crawl_orchestration_traces
(dispensary_id, profile_id, profile_key, crawler_module, mode,
state_at_start, state_at_end, trace, success, products_found,
duration_ms, started_at, completed_at)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, NOW())
`, [
        101,
        profile?.id || null,
        profile?.profile_key || null,
        'product-crawler',
        'sandbox',
        profile?.status || 'no_profile',
        profile?.status || 'no_profile', // No status change in sandbox
        JSON.stringify(traceData),
        result.success,
        result.productsFound,
        result.durationMs,
        new Date(startTime),
      ]);
      console.log('=== TRACE RECORDED ===');
      console.log('Trace saved to crawl_orchestration_traces table');
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      const stack = error instanceof Error ? error.stack : undefined;
      console.error('=== CRAWL ERROR ===');
      console.error('Error:', message);
      console.error('Stack:', stack);
      process.exitCode = 1;

      // Record error trace. Guarded separately so that a failing INSERT does
      // not replace the crawl error reported above.
      try {
        await pool.query(`
INSERT INTO crawl_orchestration_traces
(dispensary_id, profile_id, profile_key, crawler_module, mode,
state_at_start, state_at_end, trace, success, error_message,
duration_ms, started_at, completed_at)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, NOW())
`, [
          101,
          profile?.id || null,
          profile?.profile_key || null,
          'product-crawler',
          'sandbox',
          profile?.status || 'no_profile',
          profile?.status || 'no_profile',
          JSON.stringify({ error: message, stack }),
          false,
          message,
          Date.now() - startTime,
          new Date(startTime),
        ]);
      } catch (traceError: unknown) {
        console.error(
          'Failed to record error trace:',
          traceError instanceof Error ? traceError.message : traceError
        );
      }
    }
  } finally {
    await pool.end();
  }

  console.log('=== SANDBOX CRAWL COMPLETE ===');
}

main().catch((e: unknown) => {
  console.error('Fatal error:', e instanceof Error ? e.message : e);
  process.exit(1);
});

View File

@@ -1,181 +0,0 @@
/**
* LEGACY SCRIPT - Sandbox Crawl Test
*
* DEPRECATED: This script uses direct database connections.
* Future implementations should use the CannaiQ API endpoints instead.
*
* This script runs sandbox crawl for a dispensary and captures the full trace.
* It is kept for historical reference and manual testing only.
*
* DO NOT:
* - Add this to package.json scripts
* - Run this in automated jobs
* - Use DATABASE_URL directly
*
* Usage (manual only):
* STORAGE_DRIVER=local npx tsx src/scripts/sandbox-test.ts <dispensary_id>
*
* LOCAL MODE REQUIREMENTS:
* - STORAGE_DRIVER=local
* - STORAGE_BASE_PATH=./storage
* - Local cannaiq-postgres on port 54320
* - NO MinIO, NO Kubernetes
*/
import { query, getClient, closePool } from '../dutchie-az/db/connection';
import { runDispensaryOrchestrator } from '../services/dispensary-orchestrator';
/**
 * Prints the storage-related environment and terminates the process unless it
 * describes local mode.
 *
 * Local mode means STORAGE_DRIVER is 'local' (or unset, which defaults to
 * 'local') and MINIO_ENDPOINT is not set. On a violation an error is printed
 * and the process exits with status 1.
 */
function verifyLocalMode(): void {
  const { STORAGE_DRIVER, MINIO_ENDPOINT, STORAGE_BASE_PATH } = process.env;
  const driver = STORAGE_DRIVER || 'local';

  const report = [
    '=== LOCAL MODE VERIFICATION ===',
    `STORAGE_DRIVER: ${driver}`,
    `MINIO_ENDPOINT: ${MINIO_ENDPOINT || 'NOT SET (good)'}`,
    `STORAGE_BASE_PATH: ${STORAGE_BASE_PATH || './storage'}`,
    'DB Connection: Using canonical CannaiQ pool',
  ];
  for (const line of report) {
    console.log(line);
  }

  if (driver !== 'local') {
    console.error('ERROR: STORAGE_DRIVER must be "local"');
    process.exit(1);
  }
  if (MINIO_ENDPOINT) {
    console.error('ERROR: MINIO_ENDPOINT should NOT be set in local mode');
    process.exit(1);
  }

  console.log('✅ Local mode verified\n');
}
/**
 * Fetches one dispensary together with its crawler profile columns.
 *
 * The profile is LEFT JOINed, so profile_key, profile_status and config are
 * NULL for a dispensary that has no dispensary_crawler_profiles row.
 *
 * @param dispensaryId value bound to `d.id = $1`
 * @returns the first row of the result (presumably undefined when nothing
 *   matched — assumes `query` resolves to a pg-style result with a `rows` array)
 */
async function getDispensaryInfo(dispensaryId: number) {
  const sql = [
    '',
    'SELECT d.id, d.name, d.city, d.menu_type, d.platform_dispensary_id, d.menu_url,',
    'p.profile_key, p.status as profile_status, p.config',
    'FROM dispensaries d',
    'LEFT JOIN dispensary_crawler_profiles p ON p.dispensary_id = d.id',
    'WHERE d.id = $1',
    '',
  ].join('\n');
  const { rows } = await query(sql, [dispensaryId]);
  return rows[0];
}
/**
 * Fetches the newest crawl_orchestration_traces row for a dispensary,
 * ordered by created_at descending.
 *
 * @param dispensaryId value bound to `dispensary_id = $1`
 * @returns the first row of the result (presumably undefined when the
 *   dispensary has no traces — assumes a pg-style result with a `rows` array)
 */
async function getLatestTrace(dispensaryId: number) {
  const sql = [
    '',
    'SELECT *',
    'FROM crawl_orchestration_traces',
    'WHERE dispensary_id = $1',
    'ORDER BY created_at DESC',
    'LIMIT 1',
    '',
  ].join('\n');
  const { rows } = await query(sql, [dispensaryId]);
  return rows[0];
}
/**
 * Prints the dispensary details, runs the orchestrator for it, then prints the
 * orchestrator result and the latest trace row.
 *
 * @param dispensaryId id of the dispensary to crawl
 * @returns false when the dispensary does not exist, true otherwise
 */
async function runSandboxCrawl(dispensaryId: number): Promise<boolean> {
  // Get dispensary info
  console.log(`=== DISPENSARY INFO (ID: ${dispensaryId}) ===`);
  const dispensary = await getDispensaryInfo(dispensaryId);
  if (!dispensary) {
    console.error(`Dispensary ${dispensaryId} not found`);
    return false;
  }
  console.log(`Name: ${dispensary.name}`);
  console.log(`City: ${dispensary.city}`);
  console.log(`Menu Type: ${dispensary.menu_type}`);
  console.log(`Platform Dispensary ID: ${dispensary.platform_dispensary_id || 'NULL'}`);
  console.log(`Menu URL: ${dispensary.menu_url || 'NULL'}`);
  console.log(`Profile Key: ${dispensary.profile_key || 'NONE'}`);
  console.log(`Profile Status: ${dispensary.profile_status || 'N/A'}`);
  console.log(`Profile Config: ${JSON.stringify(dispensary.config, null, 2)}`);
  console.log('');

  // Run sandbox crawl
  console.log('=== RUNNING SANDBOX CRAWL ===');
  console.log(`Starting sandbox crawl for ${dispensary.name}...`);
  const startTime = Date.now();
  const result = await runDispensaryOrchestrator(dispensaryId);
  const duration = Date.now() - startTime;

  console.log('\n=== CRAWL RESULT ===');
  console.log(`Status: ${result.status}`);
  console.log(`Summary: ${result.summary}`);
  console.log(`Run ID: ${result.runId}`);
  console.log(`Duration: ${duration}ms`);
  console.log(`Detection Ran: ${result.detectionRan}`);
  console.log(`Crawl Ran: ${result.crawlRan}`);
  console.log(`Crawl Type: ${result.crawlType || 'N/A'}`);
  console.log(`Products Found: ${result.productsFound || 0}`);
  console.log(`Products New: ${result.productsNew || 0}`);
  console.log(`Products Updated: ${result.productsUpdated || 0}`);
  if (result.error) {
    console.log(`Error: ${result.error}`);
  }

  // Get the trace
  console.log('\n=== ORCHESTRATOR TRACE ===');
  const trace = await getLatestTrace(dispensaryId);
  if (!trace) {
    console.log('No trace found for this dispensary');
    return true;
  }
  console.log(`Trace ID: ${trace.id}`);
  console.log(`Profile Key: ${trace.profile_key || 'N/A'}`);
  console.log(`Mode: ${trace.mode}`);
  console.log(`Status: ${trace.status}`);
  console.log(`Started At: ${trace.started_at}`);
  console.log(`Completed At: ${trace.completed_at || 'In Progress'}`);
  if (trace.steps && Array.isArray(trace.steps)) {
    console.log(`\nSteps (${trace.steps.length} total):`);
    trace.steps.forEach((step: any, i: number) => {
      const status = step.status === 'completed' ? '✅' : step.status === 'failed' ? '❌' : '⏳';
      console.log(` ${i + 1}. ${status} ${step.action}: ${step.description}`);
      if (step.output && Object.keys(step.output).length > 0) {
        console.log(` Output: ${JSON.stringify(step.output)}`);
      }
      if (step.error) {
        console.log(` Error: ${step.error}`);
      }
    });
  }
  if (trace.result) {
    console.log(`\nResult: ${JSON.stringify(trace.result, null, 2)}`);
  }
  if (trace.error_message) {
    console.log(`\nError Message: ${trace.error_message}`);
  }
  return true;
}

/**
 * CLI entry point: validates the dispensary id argument, verifies local mode,
 * runs the sandbox crawl and always closes the database pool afterwards.
 *
 * Exits with status 1 on a bad argument, a missing dispensary, or any error
 * thrown while crawling. The exit happens only after closePool() has run.
 */
async function main(): Promise<void> {
  console.warn('\n⚠ LEGACY SCRIPT: This script should be replaced with CannaiQ API calls.\n');

  const dispensaryId = parseInt(process.argv[2], 10);
  // NaN (missing or non-numeric argument) and 0 are both falsy.
  if (!dispensaryId) {
    console.error('Usage: npx tsx src/scripts/sandbox-test.ts <dispensary_id>');
    console.error('Example: npx tsx src/scripts/sandbox-test.ts 101');
    process.exit(1);
  }

  // Verify local mode first
  verifyLocalMode();

  let succeeded = false;
  try {
    succeeded = await runSandboxCrawl(dispensaryId);
  } catch (error: unknown) {
    if (error instanceof Error) {
      console.error('Error running sandbox test:', error.message);
      console.error(error.stack);
    } else {
      console.error('Error running sandbox test:', error);
    }
  } finally {
    await closePool();
  }

  // Exit only after the pool has been closed: calling process.exit() inside
  // the try block would terminate the process before `finally` runs.
  if (!succeeded) {
    process.exit(1);
  }
}

main().catch((error: unknown) => {
  // Reached only if closePool() itself rejects.
  console.error(error);
  process.exit(1);
});

View File

@@ -1,332 +0,0 @@
/**
* LEGACY SCRIPT - Scrape All Active Products
*
* DEPRECATED: This script creates its own database pool.
* Future implementations should use the CannaiQ API endpoints instead.
*
* Scrapes ALL active products via direct GraphQL pagination.
* This is more reliable than category navigation.
*
* DO NOT:
* - Add this to package.json scripts
* - Run this in automated jobs
* - Use DATABASE_URL directly
*/
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import { Pool } from 'pg';
import { normalizeDutchieProduct, DutchieProduct } from '../scrapers/dutchie-graphql';
puppeteer.use(StealthPlugin());
console.warn('\n⚠ LEGACY SCRIPT: This script should be replaced with CannaiQ API calls.\n');
// Connection string for the single CannaiQ database (cannaiq in the
// cannaiq-postgres container). CANNAIQ_DB_URL wins when it is set; otherwise
// the URL is assembled from the individual CANNAIQ_DB_* variables, each of
// which falls back to a local-development default.
const DATABASE_URL = process.env.CANNAIQ_DB_URL || [
  'postgresql://',
  process.env.CANNAIQ_DB_USER || 'dutchie',
  ':',
  process.env.CANNAIQ_DB_PASS || 'dutchie_local_pass',
  '@',
  process.env.CANNAIQ_DB_HOST || 'localhost',
  ':',
  process.env.CANNAIQ_DB_PORT || '54320',
  '/',
  process.env.CANNAIQ_DB_NAME || 'cannaiq',
].join('');

// sha256Hash sent in the persistedQuery extension of the FilteredProducts request.
const GRAPHQL_HASH = 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0';
// Upsert for one normalized product, keyed on (store_id, slug). `xmax = 0`
// is returned as was_inserted and used by the caller to count inserts
// separately from updates.
const UPSERT_PRODUCT_SQL = `
INSERT INTO products (
store_id, external_id, slug, name, enterprise_product_id,
brand, brand_external_id, brand_logo_url,
subcategory, strain_type, canonical_category,
price, rec_price, med_price, rec_special_price, med_special_price,
is_on_special, special_name, discount_percent, special_data,
sku, inventory_quantity, inventory_available, is_below_threshold, status,
thc_percentage, cbd_percentage, cannabinoids,
weight_mg, net_weight_value, net_weight_unit, options, raw_options,
image_url, additional_images,
is_featured, medical_only, rec_only,
source_created_at, source_updated_at,
description, raw_data,
dutchie_url, last_seen_at, updated_at
)
VALUES (
$1, $2, $3, $4, $5,
$6, $7, $8,
$9, $10, $11,
$12, $13, $14, $15, $16,
$17, $18, $19, $20,
$21, $22, $23, $24, $25,
$26, $27, $28,
$29, $30, $31, $32, $33,
$34, $35,
$36, $37, $38,
$39, $40,
$41, $42,
'', NOW(), NOW()
)
ON CONFLICT (store_id, slug) DO UPDATE SET
name = EXCLUDED.name,
enterprise_product_id = EXCLUDED.enterprise_product_id,
brand = EXCLUDED.brand,
brand_external_id = EXCLUDED.brand_external_id,
brand_logo_url = EXCLUDED.brand_logo_url,
subcategory = EXCLUDED.subcategory,
strain_type = EXCLUDED.strain_type,
canonical_category = EXCLUDED.canonical_category,
price = EXCLUDED.price,
rec_price = EXCLUDED.rec_price,
med_price = EXCLUDED.med_price,
rec_special_price = EXCLUDED.rec_special_price,
med_special_price = EXCLUDED.med_special_price,
is_on_special = EXCLUDED.is_on_special,
special_name = EXCLUDED.special_name,
discount_percent = EXCLUDED.discount_percent,
special_data = EXCLUDED.special_data,
sku = EXCLUDED.sku,
inventory_quantity = EXCLUDED.inventory_quantity,
inventory_available = EXCLUDED.inventory_available,
is_below_threshold = EXCLUDED.is_below_threshold,
status = EXCLUDED.status,
thc_percentage = EXCLUDED.thc_percentage,
cbd_percentage = EXCLUDED.cbd_percentage,
cannabinoids = EXCLUDED.cannabinoids,
weight_mg = EXCLUDED.weight_mg,
net_weight_value = EXCLUDED.net_weight_value,
net_weight_unit = EXCLUDED.net_weight_unit,
options = EXCLUDED.options,
raw_options = EXCLUDED.raw_options,
image_url = EXCLUDED.image_url,
additional_images = EXCLUDED.additional_images,
is_featured = EXCLUDED.is_featured,
medical_only = EXCLUDED.medical_only,
rec_only = EXCLUDED.rec_only,
source_created_at = EXCLUDED.source_created_at,
source_updated_at = EXCLUDED.source_updated_at,
description = EXCLUDED.description,
raw_data = EXCLUDED.raw_data,
last_seen_at = NOW(),
updated_at = NOW()
RETURNING (xmax = 0) AS was_inserted
`;

/**
 * Upserts every normalized product for a store inside one transaction.
 *
 * @param pool pool to take the transaction's client from
 * @param storeId value written to products.store_id
 * @param products output of normalizeDutchieProduct()
 * @returns how many rows were inserted and how many were updated
 * @throws rethrows any query error after rolling the transaction back
 */
async function upsertProducts(
  pool: Pool,
  storeId: number,
  products: ReturnType<typeof normalizeDutchieProduct>[]
): Promise<{ inserted: number; updated: number }> {
  const client = await pool.connect();
  let inserted = 0;
  let updated = 0;
  try {
    await client.query('BEGIN');
    for (const product of products) {
      // Parameter order must match $1..$42 in UPSERT_PRODUCT_SQL.
      const result = await client.query(UPSERT_PRODUCT_SQL, [
        storeId,
        product.external_id,
        product.slug,
        product.name,
        product.enterprise_product_id,
        product.brand,
        product.brand_external_id,
        product.brand_logo_url,
        product.subcategory,
        product.strain_type,
        product.canonical_category,
        product.price,
        product.rec_price,
        product.med_price,
        product.rec_special_price,
        product.med_special_price,
        product.is_on_special,
        product.special_name,
        product.discount_percent,
        product.special_data ? JSON.stringify(product.special_data) : null,
        product.sku,
        product.inventory_quantity,
        product.inventory_available,
        product.is_below_threshold,
        product.status,
        product.thc_percentage,
        product.cbd_percentage,
        product.cannabinoids ? JSON.stringify(product.cannabinoids) : null,
        product.weight_mg,
        product.net_weight_value,
        product.net_weight_unit,
        product.options,
        product.raw_options,
        product.image_url,
        product.additional_images,
        product.is_featured,
        product.medical_only,
        product.rec_only,
        product.source_created_at,
        product.source_updated_at,
        product.description,
        product.raw_data ? JSON.stringify(product.raw_data) : null,
      ]);
      if (result.rows[0]?.was_inserted) {
        inserted++;
      } else {
        updated++;
      }
    }
    await client.query('COMMIT');
  } catch (error) {
    await client.query('ROLLBACK');
    throw error;
  } finally {
    client.release();
  }
  return { inserted, updated };
}

/**
 * Scrapes every active 'rec' product of a Dutchie menu and upserts the
 * products into the `products` table.
 *
 * Loads the menu page in a headless browser to establish a session, reads the
 * dispensary id from window.reactEnv, pages through the FilteredProducts
 * persisted query 100 products at a time from inside the page, normalizes the
 * results, upserts them, and prints summary statistics for the store.
 *
 * The browser and the pool are both closed on every path, including a failed
 * browser launch.
 *
 * @param menuUrl URL of the Dutchie menu to open
 * @param storeId value written to products.store_id
 * @returns counts of fetched, inserted and updated products
 * @throws when the dispensary id cannot be read from the page, when a
 *   GraphQL request returns a non-OK HTTP status, or when the database fails
 */
async function scrapeAllProducts(menuUrl: string, storeId: number) {
  const pool = new Pool({ connectionString: DATABASE_URL });
  try {
    const browser = await puppeteer.launch({
      headless: 'new',
      args: ['--no-sandbox', '--disable-setuid-sandbox'],
    });
    try {
      const page = await browser.newPage();
      await page.setUserAgent(
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36'
      );

      console.log('Loading menu to establish session...');
      await page.goto(menuUrl, {
        waitUntil: 'networkidle2',
        timeout: 60000,
      });
      await new Promise((r) => setTimeout(r, 3000));

      const dispensaryId = await page.evaluate(() => (window as any).reactEnv?.dispensaryId);
      console.log('Dispensary ID:', dispensaryId);
      // Without an id the product filter would carry no dispensary and the
      // run would end with zero products while still reporting success.
      if (typeof dispensaryId !== 'string' || dispensaryId.length === 0) {
        throw new Error(`Could not read window.reactEnv.dispensaryId from ${menuUrl}`);
      }

      // Paginate through all products
      const allProducts: DutchieProduct[] = [];
      let pageNum = 0;
      const perPage = 100;
      console.log('\nFetching all products via paginated GraphQL...');
      while (true) {
        // Runs inside the browser page so the request carries the session.
        const result = await page.evaluate(
          async (dispId: string, hash: string, pageIndex: number, pageSize: number) => {
            const variables = {
              includeEnterpriseSpecials: false,
              productsFilter: {
                dispensaryId: dispId,
                pricingType: 'rec',
                Status: 'Active',
                types: [],
                useCache: false,
                isDefaultSort: true,
                sortBy: 'popularSortIdx',
                sortDirection: 1,
                bypassOnlineThresholds: true,
                isKioskMenu: false,
                removeProductsBelowOptionThresholds: false,
              },
              page: pageIndex,
              perPage: pageSize,
            };
            const qs = new URLSearchParams({
              operationName: 'FilteredProducts',
              variables: JSON.stringify(variables),
              extensions: JSON.stringify({ persistedQuery: { version: 1, sha256Hash: hash } }),
            });
            const resp = await fetch(`https://dutchie.com/graphql?${qs.toString()}`, {
              method: 'GET',
              headers: {
                'content-type': 'application/json',
                'apollographql-client-name': 'Marketplace (production)',
              },
              credentials: 'include',
            });
            // Fail loudly instead of treating an error page as "no more products".
            if (!resp.ok) {
              throw new Error(`FilteredProducts request failed with HTTP ${resp.status}`);
            }
            const json = await resp.json();
            return {
              products: json?.data?.filteredProducts?.products || [],
              totalCount: json?.data?.filteredProducts?.queryInfo?.totalCount,
            };
          },
          dispensaryId,
          GRAPHQL_HASH,
          pageNum,
          perPage
        );

        if (result.products.length === 0) {
          break;
        }
        allProducts.push(...result.products);
        console.log(
          `Page ${pageNum}: ${result.products.length} products (total so far: ${allProducts.length}/${result.totalCount})`
        );

        // Stop as soon as the reported total has been reached; this saves the
        // extra request that would only return an empty page.
        if (typeof result.totalCount === 'number' && allProducts.length >= result.totalCount) {
          break;
        }

        pageNum++;
        // Safety limit
        if (pageNum > 50) {
          console.log('Reached page limit');
          break;
        }
      }
      console.log(`\nTotal products fetched: ${allProducts.length}`);

      // Normalize and upsert
      console.log('\nNormalizing and upserting to database...');
      const normalized = allProducts.map(normalizeDutchieProduct);
      const { inserted, updated } = await upsertProducts(pool, storeId, normalized);
      console.log(`\nDatabase: ${inserted} inserted, ${updated} updated`);

      // Show summary stats
      const stats = await pool.query(
        `
SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE is_on_special) as specials,
COUNT(DISTINCT brand) as brands,
COUNT(DISTINCT subcategory) as categories
FROM products WHERE store_id = $1
`,
        [storeId]
      );
      console.log('\nStore summary:');
      console.log(` Total products: ${stats.rows[0].total}`);
      console.log(` On special: ${stats.rows[0].specials}`);
      console.log(` Unique brands: ${stats.rows[0].brands}`);
      console.log(` Categories: ${stats.rows[0].categories}`);

      return {
        success: true,
        totalProducts: allProducts.length,
        inserted,
        updated,
      };
    } finally {
      await browser.close();
    }
  } finally {
    // Separate `finally` so the pool is closed even if browser.close() throws
    // or the browser never launched.
    await pool.end();
  }
}
// Run: optional CLI arguments are <menu_url> <store_id>; both default to the
// Deeply Rooted store.
const menuUrl = process.argv[2] || 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted';
const storeId = parseInt(process.argv[3] || '1', 10);

// A non-numeric store id parses to NaN and would otherwise be passed straight
// through to the INSERT as store_id.
if (Number.isNaN(storeId)) {
  console.error(`Invalid store ID: ${process.argv[3]}`);
  console.error('Usage: <script> [menu_url] [store_id]');
  process.exit(1);
}

console.log('='.repeat(60));
console.log('DUTCHIE GRAPHQL FULL SCRAPE');
console.log('='.repeat(60));
console.log(`Menu URL: ${menuUrl}`);
console.log(`Store ID: ${storeId}`);
console.log('');

scrapeAllProducts(menuUrl, storeId)
  .then((result) => {
    console.log('\n' + '='.repeat(60));
    console.log('COMPLETE');
    console.log(JSON.stringify(result, null, 2));
  })
  .catch((error: unknown) => {
    // Rejections are not guaranteed to be Error instances.
    console.error('Error:', error instanceof Error ? error.message : error);
    process.exit(1);
  });

View File

@@ -1,156 +0,0 @@
/**
* Test script: End-to-end Dutchie GraphQL → DB → Dashboard flow
*
* This demonstrates the complete data pipeline:
* 1. Fetch one product from Dutchie GraphQL via Puppeteer
* 2. Normalize it to our schema
* 3. Show the mapping
*/
import { normalizeDutchieProduct, DutchieProduct, NormalizedProduct } from '../scrapers/dutchie-graphql';
import * as fs from 'fs';
// Load the captured sample product from schema capture
const capturedData = JSON.parse(
fs.readFileSync('/tmp/dutchie-schema-capture.json', 'utf-8')
);
const sampleProduct: DutchieProduct = capturedData.sampleProduct;
console.log('='.repeat(80));
console.log('DUTCHIE GRAPHQL → DATABASE MAPPING DEMONSTRATION');
console.log('='.repeat(80));
console.log('\n📥 RAW DUTCHIE GRAPHQL PRODUCT:');
console.log('-'.repeat(80));
// Show key fields from raw product
const keyRawFields = {
'_id': sampleProduct._id,
'Name': sampleProduct.Name,
'cName': sampleProduct.cName,
'brandName': sampleProduct.brandName,
'brand.id': sampleProduct.brand?.id,
'type': sampleProduct.type,
'subcategory': sampleProduct.subcategory,
'strainType': sampleProduct.strainType,
'Prices': sampleProduct.Prices,
'recPrices': sampleProduct.recPrices,
'recSpecialPrices': sampleProduct.recSpecialPrices,
'special': sampleProduct.special,
'specialData.saleSpecials[0].specialName': sampleProduct.specialData?.saleSpecials?.[0]?.specialName,
'specialData.saleSpecials[0].discount': sampleProduct.specialData?.saleSpecials?.[0]?.discount,
'THCContent.range[0]': sampleProduct.THCContent?.range?.[0],
'CBDContent.range[0]': sampleProduct.CBDContent?.range?.[0],
'Status': sampleProduct.Status,
'Image': sampleProduct.Image,
'POSMetaData.canonicalSKU': sampleProduct.POSMetaData?.canonicalSKU,
'POSMetaData.children[0].quantity': sampleProduct.POSMetaData?.children?.[0]?.quantity,
'POSMetaData.children[0].quantityAvailable': sampleProduct.POSMetaData?.children?.[0]?.quantityAvailable,
};
Object.entries(keyRawFields).forEach(([key, value]) => {
console.log(` ${key}: ${JSON.stringify(value)}`);
});
console.log('\n📤 NORMALIZED DATABASE ROW:');
console.log('-'.repeat(80));
// Normalize the product
const normalized: NormalizedProduct = normalizeDutchieProduct(sampleProduct);
// Show the normalized result (excluding raw_data for readability)
const { raw_data, cannabinoids, special_data, ...displayFields } = normalized;
Object.entries(displayFields).forEach(([key, value]) => {
if (value !== undefined && value !== null) {
console.log(` ${key}: ${JSON.stringify(value)}`);
}
});
console.log('\n🔗 FIELD MAPPING:');
console.log('-'.repeat(80));
/**
 * Shortens a string for the mapping table.
 * Strings longer than `maxLength` are cut and suffixed with "..."; shorter
 * strings and non-string values (including null/undefined) are returned
 * unchanged, so a missing field never shows up as the text "undefined...".
 */
const shortenForTable = (text: unknown, maxLength: number): unknown =>
  typeof text === 'string' && text.length > maxLength
    ? `${text.substring(0, maxLength)}...`
    : text;
// Each row: [GraphQL field, DB column, value on the raw sample, value after normalization].
const fieldMappings = [
  ['_id / id', 'external_id', sampleProduct._id, normalized.external_id],
  ['Name', 'name', sampleProduct.Name, normalized.name],
  ['cName', 'slug', sampleProduct.cName, normalized.slug],
  ['brandName', 'brand', sampleProduct.brandName, normalized.brand],
  ['brand.id', 'brand_external_id', sampleProduct.brand?.id, normalized.brand_external_id],
  ['subcategory', 'subcategory', sampleProduct.subcategory, normalized.subcategory],
  ['strainType', 'strain_type', sampleProduct.strainType, normalized.strain_type],
  ['recPrices[0]', 'rec_price', sampleProduct.recPrices?.[0], normalized.rec_price],
  ['recSpecialPrices[0]', 'rec_special_price', sampleProduct.recSpecialPrices?.[0], normalized.rec_special_price],
  ['special', 'is_on_special', sampleProduct.special, normalized.is_on_special],
  ['specialData...specialName', 'special_name', shortenForTable(sampleProduct.specialData?.saleSpecials?.[0]?.specialName, 40), shortenForTable(normalized.special_name, 40)],
  ['THCContent.range[0]', 'thc_percentage', sampleProduct.THCContent?.range?.[0], normalized.thc_percentage],
  ['CBDContent.range[0]', 'cbd_percentage', sampleProduct.CBDContent?.range?.[0], normalized.cbd_percentage],
  ['Status', 'status', sampleProduct.Status, normalized.status],
  ['Image', 'image_url', shortenForTable(sampleProduct.Image, 50), shortenForTable(normalized.image_url, 50)],
  ['POSMetaData.canonicalSKU', 'sku', sampleProduct.POSMetaData?.canonicalSKU, normalized.sku],
];
console.log(' GraphQL Field → DB Column | Value');
console.log(' ' + '-'.repeat(75));
// The raw sample value (third entry) is carried in each row, but only the
// normalized value is printed, so it is skipped in the destructuring.
fieldMappings.forEach(([gqlField, dbCol, , dbVal]) => {
  const gqlStr = String(gqlField).padEnd(30);
  const dbStr = String(dbCol).padEnd(20);
  console.log(` ${gqlStr}${dbStr} | ${JSON.stringify(dbVal)}`);
});
console.log('\n📊 SQL INSERT STATEMENT:');
console.log('-'.repeat(80));
/**
 * Renders a value as a SQL literal for the printed example.
 * - null/undefined and non-finite numbers become NULL
 * - numbers and booleans are printed bare
 * - everything else is single-quoted with embedded quotes doubled, so names
 *   containing apostrophes still produce a well-formed statement
 */
const toSqlLiteral = (value: unknown): string => {
  if (value === undefined || value === null) {
    return 'NULL';
  }
  if (typeof value === 'number') {
    return Number.isFinite(value) ? String(value) : 'NULL';
  }
  if (typeof value === 'boolean') {
    return String(value);
  }
  return `'${String(value).replace(/'/g, "''")}'`;
};
// Long special names are cut to 50 chars for display; a missing name stays
// missing (rendered as NULL) instead of turning into the text 'undefined...'.
const specialNamePreview =
  typeof normalized.special_name === 'string' && normalized.special_name.length > 50
    ? `${normalized.special_name.substring(0, 50)}...`
    : normalized.special_name;
// Generate example SQL (display only — this string is printed, never executed here)
const sqlExample = `
INSERT INTO products (
store_id, external_id, slug, name,
brand, brand_external_id,
subcategory, strain_type,
rec_price, rec_special_price,
is_on_special, special_name, discount_percent,
thc_percentage, cbd_percentage,
status, image_url, sku
) VALUES (
1, -- store_id (Deeply Rooted)
${toSqlLiteral(normalized.external_id)}, -- external_id
${toSqlLiteral(normalized.slug)}, -- slug
${toSqlLiteral(normalized.name)}, -- name
${toSqlLiteral(normalized.brand)}, -- brand
${toSqlLiteral(normalized.brand_external_id)}, -- brand_external_id
${toSqlLiteral(normalized.subcategory)}, -- subcategory
${toSqlLiteral(normalized.strain_type)}, -- strain_type
${toSqlLiteral(normalized.rec_price)}, -- rec_price
${toSqlLiteral(normalized.rec_special_price)}, -- rec_special_price
${toSqlLiteral(normalized.is_on_special)}, -- is_on_special
${toSqlLiteral(specialNamePreview)}, -- special_name
${toSqlLiteral(normalized.discount_percent)}, -- discount_percent
${toSqlLiteral(normalized.thc_percentage)}, -- thc_percentage
${toSqlLiteral(normalized.cbd_percentage)}, -- cbd_percentage
${toSqlLiteral(normalized.status)}, -- status
${toSqlLiteral(normalized.image_url)}, -- image_url
${toSqlLiteral(normalized.sku)} -- sku
)
ON CONFLICT (store_id, slug) DO UPDATE SET ...;
`;
console.log(sqlExample);
console.log('\n✅ SUMMARY:');
console.log('-'.repeat(80));
console.log(` Product: ${normalized.name}`);
console.log(` Brand: ${normalized.brand}`);
console.log(` Category: ${normalized.subcategory}`);
// Build the price line piece by piece: the special price and the discount are
// appended only when the normalized product has them, so a product without a
// special prints just its regular price instead of "$undefined (undefined% off)".
const summaryPriceParts = [`$${normalized.rec_price}`];
if (normalized.rec_special_price != null) {
  summaryPriceParts.push(`→ $${normalized.rec_special_price}`);
}
if (normalized.discount_percent != null) {
  summaryPriceParts.push(`(${normalized.discount_percent}% off)`);
}
console.log(` Price: ${summaryPriceParts.join(' ')}`);
console.log(` THC: ${normalized.thc_percentage}%`);
console.log(` Status: ${normalized.status}`);
console.log(` On Special: ${normalized.is_on_special}`);
console.log(` SKU: ${normalized.sku}`);
// Closing notes listing the derived views; each entry is printed as its own
// console line, in order.
const derivedViewNotes = [
  '\n🎯 DERIVED VIEWS (computed from products table):',
  '-'.repeat(80),
  ' - current_specials: Products where is_on_special = true',
  ' - derived_brands: Aggregated by brand name with counts/prices',
  ' - derived_categories: Aggregated by subcategory',
  '\nAll views are computed from the single products table - no separate tables needed!',
];
for (const note of derivedViewNotes) {
  console.log(note);
}