feat: AZ dispensary harmonization with Dutchie source of truth
Major changes:
- Add harmonize-az-dispensaries.ts script to sync dispensaries with Dutchie API
- Add migration 057 for crawl_enabled and dutchie_verified fields
- Remove legacy dutchie-az module (replaced by platforms/dutchie)
- Clean up deprecated crawlers, scrapers, and orchestrator code
- Update location-discovery to not fall back to slug when ID is missing
- Add crawl-rotator service for proxy rotation
- Add types/index.ts for shared type definitions
- Add woodpecker-agent k8s manifest

Harmonization script:
- Queries ConsumerDispensaries API for all 32 AZ cities
- Matches dispensaries by platform_dispensary_id (not slug)
- Updates existing records with full Dutchie data
- Creates new records for unmatched Dutchie dispensaries
- Disables dispensaries not found in Dutchie

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -905,12 +905,13 @@ async function backfillProducts(
|
||||
|
||||
let crawlRunId = crawlRunCache.get(dayKey);
|
||||
if (!crawlRunId && !options.dryRun) {
|
||||
crawlRunId = await getOrCreateBackfillCrawlRun(
|
||||
const newCrawlRunId = await getOrCreateBackfillCrawlRun(
|
||||
pool,
|
||||
product.dispensary_id,
|
||||
capturedAt,
|
||||
options.dryRun
|
||||
);
|
||||
crawlRunId = newCrawlRunId ?? undefined;
|
||||
if (crawlRunId) {
|
||||
crawlRunCache.set(dayKey, crawlRunId);
|
||||
stats.crawlRunsCreated++;
|
||||
|
||||
@@ -212,7 +212,7 @@ EXAMPLES:
|
||||
|
||||
try {
|
||||
// Fetch all stores without a dispensary_id
|
||||
const storesResult = await pool.query<Store>(`
|
||||
const storesResult = await pool.query(`
|
||||
SELECT id, name, slug, dispensary_id
|
||||
FROM stores
|
||||
WHERE dispensary_id IS NULL
|
||||
@@ -221,7 +221,7 @@ EXAMPLES:
|
||||
const unmappedStores = storesResult.rows;
|
||||
|
||||
// Fetch all already-mapped stores for context
|
||||
const mappedResult = await pool.query<Store>(`
|
||||
const mappedResult = await pool.query(`
|
||||
SELECT id, name, slug, dispensary_id
|
||||
FROM stores
|
||||
WHERE dispensary_id IS NOT NULL
|
||||
@@ -230,7 +230,7 @@ EXAMPLES:
|
||||
const mappedStores = mappedResult.rows;
|
||||
|
||||
// Fetch all dispensaries
|
||||
const dispResult = await pool.query<Dispensary>(`
|
||||
const dispResult = await pool.query(`
|
||||
SELECT id, name, company_name, city, address, slug
|
||||
FROM dispensaries
|
||||
ORDER BY name
|
||||
|
||||
@@ -1,388 +0,0 @@
|
||||
#!/usr/bin/env npx tsx
|
||||
/**
|
||||
* Bootstrap Discovery Script
|
||||
*
|
||||
* One-time (but reusable) bootstrap command that:
|
||||
* 1. Ensures every Dispensary has a dispensary_crawl_schedule entry (4h default)
|
||||
* 2. Optionally runs RunDispensaryOrchestrator for each dispensary
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx src/scripts/bootstrap-discovery.ts # Create schedules only
|
||||
* npx tsx src/scripts/bootstrap-discovery.ts --run # Create schedules + run orchestrator
|
||||
* npx tsx src/scripts/bootstrap-discovery.ts --run --limit=10 # Run for first 10 dispensaries
|
||||
* npx tsx src/scripts/bootstrap-discovery.ts --dry-run # Preview what would happen
|
||||
* npx tsx src/scripts/bootstrap-discovery.ts --status # Show current status only
|
||||
*/
|
||||
|
||||
import { pool } from '../db/pool';
|
||||
import {
|
||||
ensureAllDispensariesHaveSchedules,
|
||||
runDispensaryOrchestrator,
|
||||
runBatchDispensaryOrchestrator,
|
||||
getDispensariesDueForOrchestration,
|
||||
} from '../services/dispensary-orchestrator';
|
||||
|
||||
// Parse command line args
const args = process.argv.slice(2);

/**
 * Reads an integer option written as `--<name>=<N>` from `args`.
 *
 * @param name - Option name without the leading dashes (e.g. `limit`).
 * @param fallback - Value used when the option is missing, empty, or not numeric.
 * @returns The parsed base-10 integer, or `fallback`.
 */
function readIntFlag(name: string, fallback: number): number {
  const prefix = `--${name}=`;
  const raw = args.find((arg) => arg.startsWith(prefix))?.slice(prefix.length) ?? '';
  const parsed = Number.parseInt(raw, 10);
  // Without this guard a typo such as `--concurrency=abc` would pass NaN downstream.
  return Number.isNaN(parsed) ? fallback : parsed;
}

// Boolean switches are plain presence checks; numeric options carry their defaults.
const flags = {
  run: args.includes('--run'),
  dryRun: args.includes('--dry-run'),
  status: args.includes('--status'),
  help: args.includes('--help') || args.includes('-h'),
  limit: readIntFlag('limit', 0),
  concurrency: readIntFlag('concurrency', 3),
  interval: readIntFlag('interval', 240),
  detectionOnly: args.includes('--detection-only'),
  productionOnly: args.includes('--production-only'),
  sandboxOnly: args.includes('--sandbox-only'),
};
|
||||
|
||||
/**
 * Prints the usage text for the bootstrap script (options, examples and a
 * short description of what the script does) as a single console.log call.
 */
async function showHelp() {
  // The leading and trailing empty entries produce the blank line before and
  // after the text once the lines are joined.
  const helpText = [
    '',
    'Bootstrap Discovery - Initialize Dispensary Crawl System',
    '',
    'USAGE:',
    'npx tsx src/scripts/bootstrap-discovery.ts [OPTIONS]',
    '',
    'OPTIONS:',
    '--run After creating schedules, run the orchestrator for each dispensary',
    '--dry-run Show what would happen without making changes',
    '--status Show current status and exit',
    '--limit=N Limit how many dispensaries to process (0 = all, default: 0)',
    '--concurrency=N How many dispensaries to process in parallel (default: 3)',
    '--interval=M Default interval in minutes for new schedules (default: 240 = 4 hours)',
    "--detection-only Only run detection, don't crawl",
    '--production-only Only run dispensaries in production mode',
    '--sandbox-only Only run dispensaries in sandbox mode',
    '--help, -h Show this help message',
    '',
    'EXAMPLES:',
    '# Create schedule entries for all dispensaries (no crawling)',
    'npx tsx src/scripts/bootstrap-discovery.ts',
    '',
    '# Create schedules and run orchestrator for all dispensaries',
    'npx tsx src/scripts/bootstrap-discovery.ts --run',
    '',
    '# Run orchestrator for first 10 dispensaries',
    'npx tsx src/scripts/bootstrap-discovery.ts --run --limit=10',
    '',
    '# Run with higher concurrency',
    'npx tsx src/scripts/bootstrap-discovery.ts --run --concurrency=5',
    '',
    '# Show current status',
    'npx tsx src/scripts/bootstrap-discovery.ts --status',
    '',
    'WHAT IT DOES:',
    "1. Creates dispensary_crawl_schedule entries for all dispensaries that don't have one",
    '2. If --run: For each dispensary, runs the orchestrator which:',
    'a. Checks if provider detection is needed (null/unknown/stale/low confidence)',
    'b. Runs detection if needed',
    'c. If Dutchie + production mode: runs production crawl',
    'd. Otherwise: runs sandbox crawl',
    '3. Updates schedule status and job records',
    '',
  ].join('\n');

  console.log(helpText);
}
|
||||
|
||||
/**
 * Prints a read-only status report to stdout in three sections:
 * dispensaries grouped by product provider, crawl-schedule totals, and
 * crawl-job totals for the last 24 hours. Runs three SELECT queries through
 * `pool` and writes nothing to the database.
 */
async function showStatus() {
  const thinRule = '-'.repeat(60);

  console.log('\n📊 Current Dispensary Crawl Status\n');
  console.log('═'.repeat(70));

  // Section 1: dispensary counts by provider
  const { rows: providerRows } = await pool.query(`
    SELECT
      COALESCE(product_provider, 'undetected') as provider,
      COUNT(*) as count,
      COUNT(*) FILTER (WHERE product_crawler_mode = 'production') as production,
      COUNT(*) FILTER (WHERE product_crawler_mode = 'sandbox') as sandbox,
      COUNT(*) FILTER (WHERE product_crawler_mode IS NULL) as no_mode
    FROM dispensaries
    GROUP BY COALESCE(product_provider, 'undetected')
    ORDER BY count DESC
  `);

  console.log('\nProvider Distribution:');
  console.log(thinRule);
  console.log(
    [
      'Provider'.padEnd(20),
      'Total'.padStart(8),
      'Production'.padStart(12),
      'Sandbox'.padStart(10),
      'No Mode'.padStart(10),
    ].join('')
  );
  console.log(thinRule);

  providerRows.forEach((row) => {
    console.log(
      [
        row.provider.padEnd(20),
        row.count.toString().padStart(8),
        row.production.toString().padStart(12),
        row.sandbox.toString().padStart(10),
        row.no_mode.toString().padStart(10),
      ].join('')
    );
  });

  // Section 2: schedule stats (LEFT JOIN so dispensaries without a schedule still count)
  const scheduleStats = await pool.query(`
    SELECT
      COUNT(DISTINCT d.id) as total_dispensaries,
      COUNT(DISTINCT dcs.id) as with_schedule,
      COUNT(DISTINCT d.id) - COUNT(DISTINCT dcs.id) as without_schedule,
      COUNT(*) FILTER (WHERE dcs.is_active = TRUE) as active_schedules,
      COUNT(*) FILTER (WHERE dcs.last_status = 'success') as last_success,
      COUNT(*) FILTER (WHERE dcs.last_status = 'error') as last_error,
      COUNT(*) FILTER (WHERE dcs.last_status = 'sandbox_only') as last_sandbox,
      COUNT(*) FILTER (WHERE dcs.last_status = 'detection_only') as last_detection,
      COUNT(*) FILTER (WHERE dcs.next_run_at <= NOW()) as due_now,
      AVG(dcs.interval_minutes)::INTEGER as avg_interval
    FROM dispensaries d
    LEFT JOIN dispensary_crawl_schedule dcs ON dcs.dispensary_id = d.id
  `);
  const s = scheduleStats.rows[0];

  console.log('\n\nSchedule Status:');
  console.log(thinRule);
  const scheduleLines = [
    ` Total Dispensaries: ${s.total_dispensaries}`,
    ` With Schedule: ${s.with_schedule}`,
    ` Without Schedule: ${s.without_schedule}`,
    ` Active Schedules: ${s.active_schedules || 0}`,
    ` Average Interval: ${s.avg_interval || 240} minutes`,
    '\n Last Run Status:',
    ` - Success: ${s.last_success || 0}`,
    ` - Error: ${s.last_error || 0}`,
    ` - Sandbox Only: ${s.last_sandbox || 0}`,
    ` - Detection Only: ${s.last_detection || 0}`,
    ` - Due Now: ${s.due_now || 0}`,
  ];
  for (const line of scheduleLines) {
    console.log(line);
  }

  // Section 3: job stats for the last 24 hours
  const jobStats = await pool.query(`
    SELECT
      COUNT(*) as total,
      COUNT(*) FILTER (WHERE status = 'completed') as completed,
      COUNT(*) FILTER (WHERE status = 'failed') as failed,
      COUNT(*) FILTER (WHERE status = 'running') as running,
      COUNT(*) FILTER (WHERE status = 'pending') as pending,
      COUNT(*) FILTER (WHERE detection_ran = TRUE) as with_detection,
      COUNT(*) FILTER (WHERE crawl_ran = TRUE) as with_crawl,
      COUNT(*) FILTER (WHERE crawl_type = 'production') as production_crawls,
      COUNT(*) FILTER (WHERE crawl_type = 'sandbox') as sandbox_crawls,
      SUM(products_found) as total_products_found
    FROM dispensary_crawl_jobs
    WHERE created_at > NOW() - INTERVAL '24 hours'
  `);
  const j = jobStats.rows[0];

  console.log('\n\nJobs (Last 24 Hours):');
  console.log(thinRule);
  const jobLines = [
    ` Total Jobs: ${j.total || 0}`,
    ` Completed: ${j.completed || 0}`,
    ` Failed: ${j.failed || 0}`,
    ` Running: ${j.running || 0}`,
    ` Pending: ${j.pending || 0}`,
    ` With Detection: ${j.with_detection || 0}`,
    ` With Crawl: ${j.with_crawl || 0}`,
    ` - Production: ${j.production_crawls || 0}`,
    ` - Sandbox: ${j.sandbox_crawls || 0}`,
    ` Products Found: ${j.total_products_found || 0}`,
  ];
  for (const line of jobLines) {
    console.log(line);
  }

  console.log('\n' + '═'.repeat(70) + '\n');
}
|
||||
|
||||
/**
 * Ensures every dispensary has a crawl-schedule entry.
 *
 * In dry-run mode nothing is written: the function only counts dispensaries
 * without a schedule and reports that number as `created` (with `existing`
 * fixed at 0). Otherwise it delegates to `ensureAllDispensariesHaveSchedules`
 * using `flags.interval` and returns that call's result.
 *
 * @returns Counts of created and already-existing schedule entries.
 */
async function createSchedules(): Promise<{ created: number; existing: number }> {
  console.log('\n📅 Creating Dispensary Schedules...\n');

  if (!flags.dryRun) {
    const outcome = await ensureAllDispensariesHaveSchedules(flags.interval);
    console.log(` ✓ Created ${outcome.created} new schedule entries`);
    console.log(` ✓ ${outcome.existing} dispensaries already had schedules`);
    return outcome;
  }

  // Dry run: count how many entries would be created
  const { rows } = await pool.query(`
    SELECT COUNT(*) as count
    FROM dispensaries d
    WHERE NOT EXISTS (
      SELECT 1 FROM dispensary_crawl_schedule dcs WHERE dcs.dispensary_id = d.id
    )
  `);
  const wouldCreate = parseInt(rows[0].count);

  console.log(` Would create ${wouldCreate} new schedule entries (${flags.interval} minute interval)`);
  return { created: wouldCreate, existing: 0 };
}
|
||||
|
||||
/**
 * Selects the ids of the dispensaries the orchestrator should process.
 *
 * Filters come from the CLI flags: `--production-only` takes precedence over
 * `--sandbox-only`, `--detection-only` keeps only dispensaries whose provider
 * is missing, 'unknown', or below confidence 50, and `--limit` caps the row
 * count when greater than zero. Rows are ordered by schedule priority
 * (highest first), then least recently run, then id.
 *
 * @returns Dispensary ids in processing order.
 */
async function getDispensariesToProcess(): Promise<number[]> {
  // 'TRUE' keeps the WHERE clause valid when no filter flag is set.
  const conditions = ['TRUE'];

  if (flags.productionOnly) {
    conditions.push(`d.product_crawler_mode = 'production'`);
  } else if (flags.sandboxOnly) {
    conditions.push(`d.product_crawler_mode = 'sandbox'`);
  }

  if (flags.detectionOnly) {
    conditions.push(
      `(d.product_provider IS NULL OR d.product_provider = 'unknown' OR d.product_confidence < 50)`
    );
  }

  const whereClause = conditions.join(' AND ');
  const limitClause = flags.limit > 0 ? `LIMIT ${flags.limit}` : '';

  const { rows } = await pool.query(`
    SELECT d.id, d.name, d.product_provider, d.product_crawler_mode
    FROM dispensaries d
    LEFT JOIN dispensary_crawl_schedule dcs ON dcs.dispensary_id = d.id
    WHERE ${whereClause}
    ORDER BY
      COALESCE(dcs.priority, 0) DESC,
      dcs.last_run_at ASC NULLS FIRST,
      d.id ASC
    ${limitClause}
  `);

  return rows.map(({ id }) => id);
}
|
||||
|
||||
/**
 * Runs the dispensary orchestrator over the dispensaries selected by
 * `getDispensariesToProcess` and prints a summary.
 *
 * In dry-run mode it only lists (up to 20 of) the dispensaries that would be
 * processed. Otherwise it hands the ids to `runBatchDispensaryOrchestrator`
 * with `flags.concurrency`, prints aggregate counts and durations, and lists
 * up to 10 results whose status is 'error'.
 */
async function runOrchestrator() {
  console.log('\n🚀 Running Dispensary Orchestrator...\n');

  const dispensaryIds = await getDispensariesToProcess();
  if (dispensaryIds.length === 0) {
    console.log(' No dispensaries to process.');
    return;
  }

  console.log(` Found ${dispensaryIds.length} dispensaries to process`);
  console.log(` Concurrency: ${flags.concurrency}`);

  if (flags.dryRun) {
    console.log('\n Would process these dispensaries:');

    const { rows: preview } = await pool.query(
      `SELECT id, name, product_provider, product_crawler_mode
       FROM dispensaries WHERE id = ANY($1) ORDER BY id`,
      [dispensaryIds]
    );

    preview.slice(0, 20).forEach((row) => {
      console.log(` - [${row.id}] ${row.name} (${row.product_provider || 'undetected'}, ${row.product_crawler_mode || 'no mode'})`);
    });
    if (preview.length > 20) {
      console.log(` ... and ${preview.length - 20} more`);
    }
    return;
  }

  console.log('\n Starting batch processing...\n');
  const results = await runBatchDispensaryOrchestrator(dispensaryIds, flags.concurrency);

  // Small counting helper so each summary field reads as one predicate.
  const tally = (matches: (r: (typeof results)[number]) => boolean): number =>
    results.filter(matches).length;
  const failures = results.filter((r) => r.status === 'error');

  // Summarize results
  const summary = {
    total: results.length,
    success: tally((r) => r.status === 'success'),
    sandboxOnly: tally((r) => r.status === 'sandbox_only'),
    detectionOnly: tally((r) => r.status === 'detection_only'),
    error: failures.length,
    detectionsRan: tally((r) => Boolean(r.detectionRan)),
    crawlsRan: tally((r) => Boolean(r.crawlRan)),
    productionCrawls: tally((r) => r.crawlType === 'production'),
    sandboxCrawls: tally((r) => r.crawlType === 'sandbox'),
    totalProducts: results.reduce((sum, r) => sum + (r.productsFound || 0), 0),
    totalDuration: results.reduce((sum, r) => sum + r.durationMs, 0),
  };

  const heavyRule = '═'.repeat(70);
  console.log('\n' + heavyRule);
  console.log(' Orchestrator Results');
  console.log(heavyRule);
  console.log(
    [
      '',
      `Total Processed: ${summary.total}`,
      '',
      'Status:',
      `- Success: ${summary.success}`,
      `- Sandbox Only: ${summary.sandboxOnly}`,
      `- Detection Only: ${summary.detectionOnly}`,
      `- Error: ${summary.error}`,
      '',
      'Operations:',
      `- Detections Ran: ${summary.detectionsRan}`,
      `- Crawls Ran: ${summary.crawlsRan}`,
      `- Production: ${summary.productionCrawls}`,
      `- Sandbox: ${summary.sandboxCrawls}`,
      '',
      'Results:',
      `- Products Found: ${summary.totalProducts}`,
      `- Total Duration: ${(summary.totalDuration / 1000).toFixed(1)}s`,
      `- Avg per Dispensary: ${(summary.totalDuration / summary.total / 1000).toFixed(1)}s`,
      '',
    ].join('\n')
  );
  console.log(heavyRule + '\n');

  // Show errors if any
  if (failures.length > 0) {
    console.log('\n⚠️ Errors encountered:');
    failures.slice(0, 10).forEach((err) => {
      console.log(` - [${err.dispensaryId}] ${err.dispensaryName}: ${err.error}`);
    });
    if (failures.length > 10) {
      console.log(` ... and ${failures.length - 10} more errors`);
    }
  }
}
|
||||
|
||||
/**
 * Script entry point.
 *
 * Flow: `--help` prints usage and exits. Otherwise the current status is
 * shown, and unless `--status` was given, schedules are created, the
 * orchestrator is optionally run (`--run`), and the status is shown again
 * (skipped for `--dry-run`).
 *
 * The database pool is closed exactly once in `finally` on every path, and
 * only afterwards is the process exited. Calling `process.exit()` inside the
 * `try`/`catch` would skip the `finally` block and leave the pool unclosed.
 */
async function main() {
  if (flags.help) {
    await showHelp();
    process.exit(0);
  }

  console.log('\n' + '═'.repeat(70));
  console.log(' Dispensary Crawl Bootstrap Discovery');
  console.log('═'.repeat(70));

  if (flags.dryRun) {
    console.log('\n🔍 DRY RUN MODE - No changes will be made');
  }

  let exitCode = 0;

  try {
    // Always show status first
    await showStatus();

    // Status-only mode stops here; everything else continues below.
    if (!flags.status) {
      // Step 1: Create schedule entries
      await createSchedules();

      // Step 2: Optionally run orchestrator
      if (flags.run) {
        await runOrchestrator();
      } else {
        console.log('\n💡 Tip: Use --run to also run the orchestrator for each dispensary');
      }

      // Show final status
      if (!flags.dryRun) {
        await showStatus();
      }
    }
  } catch (error: any) {
    console.error('\n❌ Fatal error:', error.message);
    console.error(error.stack);
    // Defer the exit so the pool is still closed in `finally`.
    exitCode = 1;
  } finally {
    await pool.end();
  }

  // Status-only runs and failures end the process explicitly, as before,
  // but only after the pool has been closed.
  if (flags.status || exitCode !== 0) {
    process.exit(exitCode);
  }
}

main().catch((error) => {
  // Reached only if cleanup itself fails (e.g. pool.end() rejects).
  console.error('\n❌ Fatal error:', error);
  process.exit(1);
});
|
||||
@@ -1,101 +0,0 @@
|
||||
/**
|
||||
* LOCAL-ONLY Admin Bootstrap Script
|
||||
*
|
||||
* Creates or resets a local admin user for development.
|
||||
* This script is ONLY for local development - never use in production.
|
||||
*
|
||||
* Usage:
|
||||
* cd backend
|
||||
* npx tsx src/scripts/bootstrap-local-admin.ts
|
||||
*
|
||||
* Default credentials:
|
||||
* Email: admin@local.test
|
||||
* Password: admin123
|
||||
*/
|
||||
|
||||
import bcrypt from 'bcrypt';
|
||||
import { query, closePool } from '../dutchie-az/db/connection';
|
||||
|
||||
// Fixed, well-known credentials for the local development admin account.
// They are deliberately deterministic so every developer can log in the same
// way; they must never be used outside local development.
const LOCAL_ADMIN_EMAIL = 'admin@local.test';
const LOCAL_ADMIN_PASSWORD = 'admin123';
// The existing schema uses the role name 'admin' (not 'superadmin').
const LOCAL_ADMIN_ROLE = 'admin';
|
||||
|
||||
/**
 * Creates the local development admin user, or resets its password and role
 * if a user with `LOCAL_ADMIN_EMAIL` already exists.
 *
 * The password is hashed with bcrypt (10 rounds) before being stored. On
 * failure the error is reported and the process exit code is set to 1. The
 * pool is always closed via `closePool()` in `finally`, which is why the
 * failure path sets `process.exitCode` instead of calling `process.exit()`
 * (an immediate exit would skip the `finally` block).
 */
async function bootstrapLocalAdmin(): Promise<void> {
  console.log('='.repeat(60));
  console.log('LOCAL ADMIN BOOTSTRAP');
  console.log('='.repeat(60));
  console.log('');
  console.log('This script creates/resets a local admin user for development.');
  console.log('');

  try {
    // Hash the password with bcrypt (10 rounds, matching existing code)
    const passwordHash = await bcrypt.hash(LOCAL_ADMIN_PASSWORD, 10);

    // Check if user exists
    const existing = await query<{ id: number; email: string }>(
      'SELECT id, email FROM users WHERE email = $1',
      [LOCAL_ADMIN_EMAIL]
    );

    if (existing.rows.length > 0) {
      // User exists - update password and role
      console.log(`User "${LOCAL_ADMIN_EMAIL}" already exists (id=${existing.rows[0].id})`);
      console.log('Resetting password and ensuring admin role...');

      await query(
        `UPDATE users
         SET password_hash = $1,
             role = $2,
             updated_at = NOW()
         WHERE email = $3`,
        [passwordHash, LOCAL_ADMIN_ROLE, LOCAL_ADMIN_EMAIL]
      );

      console.log('User updated successfully.');
    } else {
      // User doesn't exist - create new
      console.log(`Creating new admin user: ${LOCAL_ADMIN_EMAIL}`);

      const result = await query<{ id: number }>(
        `INSERT INTO users (email, password_hash, role, created_at, updated_at)
         VALUES ($1, $2, $3, NOW(), NOW())
         RETURNING id`,
        [LOCAL_ADMIN_EMAIL, passwordHash, LOCAL_ADMIN_ROLE]
      );

      console.log(`User created successfully (id=${result.rows[0].id})`);
    }

    console.log('');
    console.log('='.repeat(60));
    console.log('LOCAL ADMIN READY');
    console.log('='.repeat(60));
    console.log('');
    console.log('Login credentials:');
    console.log(` Email: ${LOCAL_ADMIN_EMAIL}`);
    console.log(` Password: ${LOCAL_ADMIN_PASSWORD}`);
    console.log('');
    console.log('Admin UI: http://localhost:8080/admin');
    console.log('');
  } catch (error: unknown) {
    // A thrown value is not guaranteed to be an Error; normalise it first so
    // the reporting below cannot itself throw.
    const message = error instanceof Error ? error.message : String(error);

    console.error('');
    console.error('ERROR: Failed to bootstrap local admin');
    console.error(message);

    if (message.includes('relation "users" does not exist')) {
      console.error('');
      console.error('The "users" table does not exist.');
      console.error('Run migrations first: npm run migrate');
    }

    // Signal failure without exiting here, so `finally` can close the pool.
    process.exitCode = 1;
  } finally {
    await closePool();
  }
}
|
||||
|
||||
// Run the bootstrap
|
||||
bootstrapLocalAdmin();
|
||||
@@ -1,66 +0,0 @@
|
||||
/**
|
||||
* Seed crawl: trigger dutchie crawls for all dispensaries with menu_type='dutchie'
|
||||
* and a resolved platform_dispensary_id. This uses the AZ orchestrator endpoint logic.
|
||||
*
|
||||
* Usage (local):
|
||||
* node dist/scripts/crawl-all-dutchie.js
|
||||
*
|
||||
* Requires:
|
||||
* - DATABASE_URL/CRAWLSY_DATABASE_URL pointing to the consolidated DB
|
||||
* - Dispensaries table populated with menu_type and platform_dispensary_id
|
||||
*/
|
||||
|
||||
import { query } from '../dutchie-az/db/connection';
|
||||
import { runDispensaryOrchestrator } from '../services/dispensary-orchestrator';
|
||||
|
||||
/**
 * Triggers the dispensary orchestrator, one dispensary at a time, for every
 * dispensary with menu_type 'dutchie' and a non-null platform_dispensary_id.
 *
 * A run counts as successful when the orchestrator reports 'success',
 * 'sandbox_only' or 'detection_only'; any other status or a thrown error
 * counts as failed. A final tally is printed.
 */
async function main() {
  const { rows } = await query<{
    id: number;
    name: string;
    slug: string;
    platform_dispensary_id: string | null;
  }>(`
    SELECT id, name, slug, platform_dispensary_id
    FROM dispensaries
    WHERE menu_type = 'dutchie'
      AND platform_dispensary_id IS NOT NULL
    ORDER BY id
  `);

  if (rows.length === 0) {
    console.log('No dutchie dispensaries with resolved platform_dispensary_id found.');
    process.exit(0);
  }

  console.log(`Found ${rows.length} dutchie dispensaries with resolved IDs. Triggering crawls...`);

  // Statuses the orchestrator can return that are treated as a good outcome.
  const acceptable = new Set<string>(['success', 'sandbox_only', 'detection_only']);
  const tally = { success: 0, failed: 0 };

  for (const { id, name } of rows) {
    try {
      console.log(`Crawling ${id} (${name})...`);
      const { status } = await runDispensaryOrchestrator(id);

      if (acceptable.has(status)) {
        tally.success++;
        continue;
      }

      tally.failed++;
      console.warn(`Crawl returned status ${status} for ${id} (${name})`);
    } catch (err: any) {
      tally.failed++;
      console.error(`Failed crawl for ${id} (${name}): ${err.message}`);
    }
  }

  console.log(`Completed. Success: ${tally.success}, Failed: ${tally.failed}`);
}

main().catch((err) => {
  console.error('Fatal:', err);
  process.exit(1);
});
|
||||
@@ -1,50 +0,0 @@
|
||||
import { runDispensaryOrchestrator } from '../services/dispensary-orchestrator';
|
||||
|
||||
// Hard-coded ids of all 57 dutchie stores that had a platform_dispensary_id
// as of 2024-12. This is a snapshot: it does not follow later database changes.
const ALL_DISPENSARY_IDS = [
  72, 74, 75, 76, 77, 78, 81, 82, 85, 87,
  91, 92, 97, 101, 106, 108, 110, 112, 115, 120,
  123, 125, 128, 131, 135, 139, 140, 143, 144, 145,
  152, 153, 161, 168, 176, 177, 180, 181, 189, 195,
  196, 199, 200, 201, 205, 206, 207, 213, 214, 224,
  225, 227, 232, 235, 248, 252, 281,
];

// Number of dispensary ids grouped into one reporting batch.
const BATCH_SIZE = 5;
|
||||
|
||||
/**
 * Crawls every dispensary in `ALL_DISPENSARY_IDS`, sequentially, reporting
 * progress in groups of `BATCH_SIZE`.
 *
 * A crawl is counted as an error when the orchestrator throws OR when it
 * returns a result whose status is 'error'. Previously any returned result
 * was counted as a success, so the final tally could report zero errors even
 * when crawls had failed.
 */
async function run() {
  const totalBatches = Math.ceil(ALL_DISPENSARY_IDS.length / BATCH_SIZE);
  console.log(`Starting crawl of ${ALL_DISPENSARY_IDS.length} stores in ${totalBatches} batches of ${BATCH_SIZE}...`);

  let successCount = 0;
  let errorCount = 0;

  for (let i = 0; i < ALL_DISPENSARY_IDS.length; i += BATCH_SIZE) {
    const batch = ALL_DISPENSARY_IDS.slice(i, i + BATCH_SIZE);
    const batchNum = Math.floor(i / BATCH_SIZE) + 1;
    console.log(`\n========== BATCH ${batchNum}/${totalBatches} (IDs: ${batch.join(', ')}) ==========`);

    // Batches only group the output; crawls inside a batch still run one by one.
    for (const id of batch) {
      console.log(`\n--- Crawling dispensary ${id} ---`);
      try {
        const result = await runDispensaryOrchestrator(id);
        console.log(` Status: ${result.status}`);
        console.log(` Summary: ${result.summary}`);
        if (result.productsFound) {
          console.log(` Products: ${result.productsFound} found, ${result.productsNew} new, ${result.productsUpdated} updated`);
        }

        // The orchestrator reports some failures via the status field
        // rather than by throwing, so both channels must be checked.
        if (result.status === 'error') {
          errorCount++;
        } else {
          successCount++;
        }
      } catch (e: any) {
        console.log(` ERROR: ${e.message}`);
        errorCount++;
      }
    }

    console.log(`\n--- Batch ${batchNum} complete. Progress: ${Math.min(i + BATCH_SIZE, ALL_DISPENSARY_IDS.length)}/${ALL_DISPENSARY_IDS.length} ---`);
  }

  console.log('\n========================================');
  console.log(`=== ALL CRAWLS COMPLETE ===`);
  console.log(`Success: ${successCount}, Errors: ${errorCount}`);
  console.log('========================================');
}

run().catch((e) => {
  console.log('Fatal:', e.message);
  // Make an unexpected failure visible to the caller or CI via the exit status.
  process.exitCode = 1;
});
|
||||
114
backend/src/scripts/debug-dutchie-page.ts
Normal file
114
backend/src/scripts/debug-dutchie-page.ts
Normal file
@@ -0,0 +1,114 @@
|
||||
/**
|
||||
* Debug Dutchie city page to see what data is available
|
||||
*/
|
||||
|
||||
import puppeteer from 'puppeteer-extra';
|
||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||
|
||||
puppeteer.use(StealthPlugin());
|
||||
|
||||
/**
 * Opens a Dutchie city page in headless Chrome and prints what data the page
 * exposes: its title, whether a Cloudflare challenge appears to be showing,
 * and the shape of `__NEXT_DATA__` if present. When `__NEXT_DATA__` is
 * missing it prints script ids, a few well-known window globals and a text
 * preview instead.
 *
 * The URL is taken from the first CLI argument, defaulting to the
 * wa-bellevue city page. The browser is always closed in `finally`.
 */
async function main() {
  const cityUrl = process.argv[2] || 'https://dutchie.com/us/dispensaries/wa-bellevue';
  const pause = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));

  console.log(`Debugging page: ${cityUrl}`);

  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
  });

  try {
    const page = await browser.newPage();
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    );

    console.log('Navigating...');
    await page.goto(cityUrl, { waitUntil: 'networkidle2', timeout: 60000 });

    // Fixed settle time after navigation before inspecting the page.
    await pause(5000);

    // Get page title
    console.log(`\nPage title: ${await page.title()}`);

    // Check for Cloudflare challenge.
    // NOTE: callbacks passed to page.evaluate run inside the browser page, so
    // they must not reference variables from this function's scope.
    const isCFChallenge = await page.evaluate(() => {
      return document.title.includes('Just a moment') ||
        document.body.textContent?.includes('Enable JavaScript');
    });
    if (isCFChallenge) {
      console.log('\n⚠️ CLOUDFLARE CHALLENGE DETECTED - waiting longer...');
      await pause(10000);
    }

    // Check for __NEXT_DATA__
    const nextData = await page.evaluate(() => {
      const script = document.querySelector('script#__NEXT_DATA__');
      if (!script) {
        return null;
      }
      try {
        return JSON.parse(script.textContent || '{}');
      } catch {
        return { error: 'Failed to parse __NEXT_DATA__' };
      }
    });

    if (!nextData) {
      console.log('\n❌ No __NEXT_DATA__ found');

      // Check what scripts are on the page
      const scripts = await page.evaluate(() => {
        return Array.from(document.querySelectorAll('script[id]')).map(s => ({
          id: s.id,
          src: (s as HTMLScriptElement).src?.slice(0, 100),
        }));
      });
      console.log('Scripts with IDs:', scripts);

      // Try to find dispensary data in window object
      const windowData = await page.evaluate(() => {
        const w = window as any;
        const keys = ['__NEXT_DATA__', '__PRELOADED_STATE__', '__INITIAL_STATE__',
          'dispensaries', '__data', 'pageData', '__remixContext'];
        const found: Record<string, any> = {};
        for (const key of keys) {
          if (w[key]) {
            found[key] = typeof w[key] === 'object' ? Object.keys(w[key]) : typeof w[key];
          }
        }
        return found;
      });
      console.log('Window data:', windowData);

      // Get some page content
      const bodyText = await page.evaluate(() => document.body.innerText.slice(0, 500));
      console.log('\nPage text preview:', bodyText);
    } else {
      console.log('\n✅ __NEXT_DATA__ found!');
      console.log('Keys:', Object.keys(nextData));

      const pageProps = nextData.props?.pageProps;
      if (pageProps) {
        console.log('pageProps keys:', Object.keys(pageProps));

        const dispensaries = pageProps.dispensaries;
        if (dispensaries) {
          console.log('Dispensaries count:', dispensaries.length);

          // Show first dispensary structure (sample capped at 1000 characters)
          const first = dispensaries[0];
          if (first) {
            console.log('\nFirst dispensary keys:', Object.keys(first));
            console.log('First dispensary sample:', JSON.stringify(first, null, 2).slice(0, 1000));
          }
        }
      }
    }
  } finally {
    await browser.close();
  }
}

main().catch(console.error);
|
||||
256
backend/src/scripts/discover-and-import-store.ts
Normal file
256
backend/src/scripts/discover-and-import-store.ts
Normal file
@@ -0,0 +1,256 @@
|
||||
/**
|
||||
* Discover and Import Store Script
|
||||
*
|
||||
* Discovers a store from Dutchie by city+state and imports it into the dispensaries table.
|
||||
* Uses the local API endpoints - does NOT make direct GraphQL calls.
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx src/scripts/discover-and-import-store.ts --city "Adelanto" --state "CA"
|
||||
* npx tsx src/scripts/discover-and-import-store.ts --city "Phoenix" --state "AZ" --dry-run
|
||||
* npx tsx src/scripts/discover-and-import-store.ts --city "Los Angeles" --state "CA" --all
|
||||
*/
|
||||
|
||||
// Base URL of the local API; override with the API_BASE environment variable.
// `||` (not `??`) is intentional: an empty API_BASE also falls back to the default.
const { API_BASE: apiBaseOverride } = process.env;
const API_BASE = apiBaseOverride || 'http://localhost:3010';
|
||||
|
||||
/**
 * Shape of the `result` object returned by the discover-city endpoint, as
 * consumed by `discoverCity`.
 * NOTE(review): field meanings below are inferred from their names and from
 * the values `discoverCity` logs; confirm against the API implementation.
 */
interface DiscoveryResult {
  cityId: string;
  citySlug: string;
  /** Number of locations found (logged as "Found N location(s)"). */
  locationsFound: number;
  locationsUpserted: number;
  /** Logged as "New". */
  locationsNew: number;
  /** Logged as "Updated". */
  locationsUpdated: number;
  errors: string[];
  /** Presumably the duration of the discovery run in milliseconds. */
  durationMs: number;
}
|
||||
|
||||
/**
 * A discovered location record, the element type returned by
 * `getDiscoveredLocations`.
 * NOTE(review): presumably mirrors a row of the discovery_locations table
 * mentioned in `getDiscoveredLocations`; confirm against the schema.
 */
interface DiscoveryLocation {
  id: number;
  name: string;
  city: string;
  stateCode: string;
  platformSlug: string;
  platformLocationId: string;
  platformMenuUrl: string;
  status: string;
}
|
||||
|
||||
/**
 * A store record as returned by the stores API (see `createStore`).
 * Property names are snake_case where they match the request body that
 * `createStore` posts (`menu_url`, `platform_dispensary_id`).
 */
interface Store {
  id: number;
  name: string;
  slug: string;
  city: string;
  state: string;
  menu_url: string;
  platform_dispensary_id: string;
}
|
||||
|
||||
/**
 * Asks the local API to discover Dutchie stores for one city.
 *
 * The city name is lower-cased and its whitespace runs are replaced with
 * hyphens to form the slug, which is POSTed with the state code and country
 * 'US' to the discover-city admin endpoint.
 *
 * @param city - Human-readable city name, e.g. "Los Angeles".
 * @param state - State code sent as `stateCode`, e.g. "CA".
 * @returns The endpoint's `result` object, or `null` when the HTTP request
 *          fails or the response body reports `success` as falsy.
 */
async function discoverCity(city: string, state: string): Promise<DiscoveryResult | null> {
  const citySlug = city.toLowerCase().replace(/\s+/g, '-');
  const payload = { citySlug, stateCode: state, countryCode: 'US' };

  console.log(`\n[1/3] Discovering stores in ${city}, ${state}...`);

  const response = await fetch(`${API_BASE}/api/discovery/admin/discover-city`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });

  // Transport-level failure: report the raw body text.
  if (!response.ok) {
    console.error(`Discovery failed: ${await response.text()}`);
    return null;
  }

  // Application-level failure: the API answered but flagged the call as unsuccessful.
  const data = await response.json();
  if (!data.success) {
    console.error(`Discovery failed: ${JSON.stringify(data)}`);
    return null;
  }

  const { result } = data;
  console.log(` Found ${result.locationsFound} location(s)`);
  console.log(` New: ${result.locationsNew}, Updated: ${result.locationsUpdated}`);

  return result;
}
|
||||
|
||||
/**
 * Placeholder for loading the locations found by discovery.
 *
 * Only prints the step banner and resolves to an empty list; callers are
 * expected to fall back to a direct database query.
 *
 * @param state State code shown in the banner.
 * @param city  Optional city name; the banner says "all cities" when omitted.
 * @returns Always an empty array.
 */
async function getDiscoveredLocations(state: string, city?: string): Promise<DiscoveryLocation[]> {
  const scope = city || 'all cities';
  console.log(`\n[2/3] Fetching discovered locations for ${scope}, ${state}...`);

  // The discovery_locations table would have to be queried via SQL because the
  // API endpoint has a bug, so nothing is fetched here for now.
  // TODO: Fix the /api/discovery/locations endpoint
  const locations: DiscoveryLocation[] = [];
  return locations;
}
|
||||
|
||||
/**
 * Creates a Dutchie store record through POST /api/stores.
 *
 * @param location Store fields to submit; `menuUrl` and `platformId` are sent
 *                 as `menu_url` and `platform_dispensary_id`, and both
 *                 `menu_type` and `platform` are fixed to "dutchie".
 * @returns The parsed response body on success, or null when the request is
 *          rejected (including the "already exists" case, which is logged as
 *          informational rather than as an error).
 */
async function createStore(location: {
  name: string;
  slug: string;
  city: string;
  state: string;
  menuUrl: string;
  platformId: string;
}): Promise<Store | null> {
  console.log(`\n[3/3] Creating store: ${location.name}...`);

  const response = await fetch(`${API_BASE}/api/stores`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      name: location.name,
      slug: location.slug,
      city: location.city,
      state: location.state,
      menu_url: location.menuUrl,
      menu_type: 'dutchie',
      platform: 'dutchie',
      platform_dispensary_id: location.platformId,
    }),
  });

  if (!response.ok) {
    // Error responses are not guaranteed to be JSON (e.g. an HTML 500 page),
    // so read the body as text and parse it opportunistically instead of
    // letting response.json() throw.
    const rawBody = await response.text();
    let parsed: any = null;
    try {
      parsed = JSON.parse(rawBody);
    } catch {
      // Non-JSON body: fall back to the raw text below.
    }

    // Only call .includes on shapes that actually have it.
    const detail = parsed?.error;
    const alreadyExists =
      (typeof detail === 'string' || Array.isArray(detail)) && detail.includes('already exists');
    if (alreadyExists) {
      console.log(` Store already exists (slug: ${location.slug})`);
      return null;
    }

    const shown = parsed !== null ? JSON.stringify(parsed) : rawBody || `HTTP ${response.status}`;
    console.error(` Failed to create store: ${shown}`);
    return null;
  }

  const store = await response.json();
  console.log(` Created store ID: ${store.id}`);
  return store;
}
|
||||
|
||||
/**
 * Lists the stores the API already knows for a city/state pair.
 *
 * @param city  City name, URL-encoded into the `city` query parameter.
 * @param state State code, URL-encoded into the `state` query parameter.
 * @returns The `stores` array of the response; an empty array when the
 *          request fails or the payload carries no store list.
 */
async function verifyStoreExists(city: string, state: string): Promise<Store[]> {
  // Encode both values: state comes straight from the command line and must
  // not be able to inject extra query parameters.
  const query = `city=${encodeURIComponent(city)}&state=${encodeURIComponent(state)}`;
  const response = await fetch(`${API_BASE}/api/stores?${query}`);

  if (!response.ok) {
    return [];
  }

  const data = await response.json();
  return Array.isArray(data?.stores) ? data.stores : [];
}
|
||||
|
||||
/**
 * Splits the raw CLI tokens into the options this script understands.
 *
 * `--city` and `--state` take the following token as their value. A following
 * token that is itself a `--flag` is not consumed, so `--city --dry-run`
 * leaves the city empty (which triggers the usage text) instead of treating
 * "--dry-run" as a city name and silently dropping the flag.
 */
function parseDiscoverArgs(args: string[]) {
  let city = '';
  let state = '';
  let dryRun = false;
  let importAll = false;

  for (let i = 0; i < args.length; i++) {
    const flag = args[i];
    const value = args[i + 1];
    const hasValue = Boolean(value) && !value.startsWith('--');

    if (flag === '--city' && hasValue) {
      city = value;
      i++;
    } else if (flag === '--state' && hasValue) {
      state = value.toUpperCase();
      i++;
    } else if (flag === '--dry-run') {
      dryRun = true;
    } else if (flag === '--all') {
      importAll = true;
    }
  }

  return { city, state, dryRun, importAll };
}

/**
 * CLI entry point: lists existing stores for the city, runs discovery through
 * the local API, then prints the manual steps needed to finish the import and
 * a final listing of stores for the city.
 *
 * Exits with status 1 when required arguments are missing or discovery fails,
 * and with status 0 when nothing was found or when running with --dry-run.
 */
async function main() {
  const { city, state, dryRun, importAll } = parseDiscoverArgs(process.argv.slice(2));

  if (!city || !state) {
    console.log(`
Usage: npx tsx src/scripts/discover-and-import-store.ts --city "City Name" --state "ST"

Options:
--city City name (required)
--state State code, e.g., CA, AZ (required)
--dry-run Discover only, don't import
--all Import all discovered locations (default: first one only)

Examples:
npx tsx src/scripts/discover-and-import-store.ts --city "Adelanto" --state "CA"
npx tsx src/scripts/discover-and-import-store.ts --city "Phoenix" --state "AZ" --all
`);
    process.exit(1);
  }

  console.log('='.repeat(60));
  console.log(`STORE DISCOVERY & IMPORT`);
  console.log(`City: ${city}, State: ${state}`);
  console.log(`Mode: ${dryRun ? 'DRY RUN' : 'IMPORT'}`);
  console.log('='.repeat(60));

  // Step 1: Check if stores already exist
  const existingStores = await verifyStoreExists(city, state);
  if (existingStores.length > 0) {
    console.log(`\nFound ${existingStores.length} existing store(s) in ${city}, ${state}:`);
    for (const s of existingStores) {
      console.log(` - ${s.name} (ID: ${s.id})`);
    }

    if (!importAll) {
      console.log('\nUse --all to discover and import additional stores.');
    }
  }

  // Step 2: Discover from Dutchie
  const discovery = await discoverCity(city, state);

  if (!discovery) {
    console.error('\nDiscovery failed. Exiting.');
    process.exit(1);
  }

  if (discovery.locationsFound === 0) {
    console.log('\nNo stores found in this city on Dutchie.');
    process.exit(0);
  }

  if (dryRun) {
    console.log('\n[DRY RUN] Would import stores. Run without --dry-run to import.');
    process.exit(0);
  }

  // Step 3: The discovery endpoint already saved to dutchie_discovery_locations.
  // Creating the dispensary records from that table is not automated yet
  // (the API has bugs), so print instructions for a manual import instead.

  // The hints are meant to be copy-pasted, so escape the user-supplied values
  // for the context they land in: '' inside the SQL string literal, JSON
  // escaping plus '\'' inside the single-quoted curl payload.
  const sqlCity = city.replace(/'/g, "''");
  const shellJson = (value: string) => JSON.stringify(value).replace(/'/g, "'\\''");

  console.log(`
Next steps to complete import:

1. Query the discovery location:
psql -c "SELECT id, name, platform_slug, platform_location_id, platform_menu_url
FROM dutchie_discovery_locations
WHERE name ILIKE '%${sqlCity}%'
ORDER BY id DESC LIMIT 5;"

2. Create the store via API:
curl -X POST ${API_BASE}/api/stores \\
-H "Content-Type: application/json" \\
-d '{
"name": "<NAME>",
"slug": "<PLATFORM_SLUG>",
"city": ${shellJson(city)},
"state": ${shellJson(state)},
"menu_url": "<PLATFORM_MENU_URL>",
"menu_type": "dutchie",
"platform": "dutchie",
"platform_dispensary_id": "<PLATFORM_LOCATION_ID>"
}'

3. Verify:
curl "${API_BASE}/api/stores?city=${encodeURIComponent(city)}&state=${encodeURIComponent(state)}"
`);

  // Final verification
  const finalStores = await verifyStoreExists(city, state);
  console.log('\n' + '='.repeat(60));
  console.log(`RESULT: ${finalStores.length} store(s) now in ${city}, ${state}`);
  for (const s of finalStores) {
    console.log(` - ${s.name} (ID: ${s.id})`);
  }
  console.log('='.repeat(60));
}
|
||||
|
||||
// Log unexpected failures and make sure the process reports them through a
// non-zero exit status instead of exiting 0 after an error.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||
88
backend/src/scripts/discover-az-dutchie.ts
Normal file
88
backend/src/scripts/discover-az-dutchie.ts
Normal file
@@ -0,0 +1,88 @@
|
||||
/**
|
||||
* Discover all Arizona dispensaries from Dutchie
|
||||
* Uses the state/city HTML pages which contain __NEXT_DATA__ with full dispensary list
|
||||
*/
|
||||
import { fetchPage, extractNextData } from '../platforms/dutchie/client';
|
||||
|
||||
/**
 * Flat description of a dispensary as this script intends to collect it.
 */
interface DutchieDispensary {
  // Identity
  platform_dispensary_id: string;
  name: string;
  slug: string;

  // Postal address
  address: string;
  city: string;
  state: string;
  zip: string;
}
|
||||
|
||||
/**
 * Exploratory dump of the Dutchie Arizona state page.
 *
 * Fetches /dispensaries/arizona, extracts the embedded __NEXT_DATA__ payload
 * and prints what it finds: top-level keys, the first non-empty dispensary
 * list in pageProps, and any dispensary entries in the Apollo state. The raw
 * pageProps are written to /tmp/az-pageprops.json for offline analysis.
 *
 * When __NEXT_DATA__ cannot be extracted, falls back to listing the
 * /dispensary/<slug> links found in the HTML.
 */
async function discoverAZDispensaries() {
  console.log('Discovering Arizona dispensaries from Dutchie...\n');

  // Fetch the Arizona state page
  console.log('Fetching /dispensaries/arizona...');
  const stateResult = await fetchPage('/dispensaries/arizona');

  if (!stateResult) {
    console.error('Failed to fetch Arizona page');
    return;
  }

  console.log(`Got ${stateResult.status} response, ${stateResult.html.length} bytes`);

  const nextData = extractNextData(stateResult.html);
  if (!nextData) {
    console.error('Failed to extract __NEXT_DATA__');
    // Try to find dispensary links in HTML
    const links = stateResult.html.match(/\/dispensary\/([a-z0-9-]+)/gi) || [];
    console.log(`Found ${links.length} dispensary links in HTML`);
    // The match is case-insensitive, so the prefix strip has to be as well.
    const uniqueSlugs = [...new Set(links.map((l) => l.replace(/^\/dispensary\//i, '')))];
    console.log('Unique slugs:', uniqueSlugs.slice(0, 20));
    return;
  }

  console.log('Extracted __NEXT_DATA__');
  console.log('Keys:', Object.keys(nextData));

  // The dispensary data is usually in props.pageProps
  const pageProps = nextData?.props?.pageProps;
  if (pageProps) {
    console.log('pageProps keys:', Object.keys(pageProps));

    // Try various possible locations. An empty array is truthy, so a plain
    // `a || b || c` chain would stop at the first empty list; pick the first
    // candidate that is an array with entries instead.
    const candidates = [
      pageProps.dispensaries,
      pageProps.nearbyDispensaries,
      pageProps.filteredDispensaries,
      pageProps.allDispensaries,
    ];
    const dispensaries = candidates.find((list) => Array.isArray(list) && list.length > 0) ?? [];

    console.log(`Found ${dispensaries.length} dispensaries in pageProps`);

    if (dispensaries.length > 0) {
      console.log('Sample:', JSON.stringify(dispensaries[0], null, 2));
    }
  }

  // Also look for the Apollo cache embedded in pageProps
  const apolloState = pageProps?.__APOLLO_STATE__;
  if (apolloState) {
    console.log('Found Apollo state');
    const dispensaryKeys = Object.keys(apolloState).filter(
      (k) => k.startsWith('Dispensary:') || k.includes('dispensary')
    );
    console.log(`Found ${dispensaryKeys.length} dispensary entries`);
    if (dispensaryKeys.length > 0) {
      console.log('Sample key:', dispensaryKeys[0]);
      console.log('Sample value:', JSON.stringify(apolloState[dispensaryKeys[0]], null, 2).slice(0, 500));
    }
  }

  // Output the raw pageProps for analysis
  if (pageProps) {
    const fs = await import('fs');
    fs.writeFileSync('/tmp/az-pageprops.json', JSON.stringify(pageProps, null, 2));
    console.log('\nWrote pageProps to /tmp/az-pageprops.json');
  }
}
|
||||
|
||||
// Log unexpected failures and report them through a non-zero exit status.
discoverAZDispensaries().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||
@@ -1,86 +0,0 @@
|
||||
#!/usr/bin/env npx tsx
|
||||
/**
|
||||
* Dutchie City Discovery CLI Runner
|
||||
*
|
||||
* Discovers cities from Dutchie's /cities page and upserts to dutchie_discovery_cities.
|
||||
*
|
||||
* Usage:
|
||||
* npm run discovery:dutchie:cities
|
||||
* npx tsx src/scripts/discovery-dutchie-cities.ts
|
||||
*
|
||||
* Environment:
|
||||
* DATABASE_URL - PostgreSQL connection string (required)
|
||||
*/
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { DutchieCityDiscovery } from '../dutchie-az/discovery/DutchieCityDiscovery';
|
||||
|
||||
/**
 * CLI entry point for Dutchie city discovery.
 *
 * Connects to the database named by DATABASE_URL, runs DutchieCityDiscovery,
 * prints a summary plus current statistics, and exits with status 1 when the
 * run reported errors or threw, 0 otherwise.
 */
async function main() {
  console.log('='.repeat(60));
  console.log('DUTCHIE CITY DISCOVERY');
  console.log('='.repeat(60));

  // Get database URL from environment
  const connectionString = process.env.DATABASE_URL;
  if (!connectionString) {
    console.error('ERROR: DATABASE_URL environment variable is required');
    console.error('');
    console.error('Usage:');
    console.error(' DATABASE_URL="postgresql://..." npm run discovery:dutchie:cities');
    process.exit(1);
  }

  // Create pool
  const pool = new Pool({ connectionString });

  // process.exit() terminates immediately and would skip the finally block,
  // so the exit status is recorded here and applied only after the pool has
  // been closed.
  let exitCode = 0;

  try {
    // Test connection
    await pool.query('SELECT 1');
    console.log('[CLI] Database connection established');

    // Run discovery
    const discovery = new DutchieCityDiscovery(pool);
    const result = await discovery.run();

    // Print summary
    console.log('');
    console.log('='.repeat(60));
    console.log('DISCOVERY COMPLETE');
    console.log('='.repeat(60));
    console.log(`Cities found: ${result.citiesFound}`);
    console.log(`Cities inserted: ${result.citiesInserted}`);
    console.log(`Cities updated: ${result.citiesUpdated}`);
    console.log(`Errors: ${result.errors.length}`);
    console.log(`Duration: ${(result.durationMs / 1000).toFixed(1)}s`);

    if (result.errors.length > 0) {
      console.log('');
      console.log('Errors:');
      result.errors.forEach((e) => console.log(` - ${e}`));
    }

    // Show stats
    console.log('');
    console.log('Current Statistics:');
    const stats = await discovery.getStats();
    console.log(` Total cities: ${stats.total}`);
    console.log(` Crawl enabled: ${stats.crawlEnabled}`);
    console.log(` Never crawled: ${stats.neverCrawled}`);
    console.log('');
    console.log('By Country:');
    stats.byCountry.forEach((c) => console.log(` ${c.countryCode}: ${c.count}`));
    console.log('');
    console.log('By State (top 10):');
    stats.byState.slice(0, 10).forEach((s) => console.log(` ${s.stateCode} (${s.countryCode}): ${s.count}`));

    exitCode = result.errors.length > 0 ? 1 : 0;
  } catch (error: any) {
    console.error('FATAL ERROR:', error.message);
    console.error(error.stack);
    exitCode = 1;
  } finally {
    await pool.end();
  }

  process.exit(exitCode);
}
|
||||
|
||||
// Script entry point: start the city discovery CLI.
main();
|
||||
@@ -1,189 +0,0 @@
|
||||
#!/usr/bin/env npx tsx
|
||||
/**
|
||||
* Dutchie Location Discovery CLI Runner
|
||||
*
|
||||
* Discovers store locations for cities and upserts to dutchie_discovery_locations.
|
||||
*
|
||||
* Usage:
|
||||
* npm run discovery:dutchie:locations -- --all-enabled
|
||||
* npm run discovery:dutchie:locations -- --city-slug=phoenix
|
||||
* npm run discovery:dutchie:locations -- --all-enabled --limit=10
|
||||
*
|
||||
* npx tsx src/scripts/discovery-dutchie-locations.ts --all-enabled
|
||||
* npx tsx src/scripts/discovery-dutchie-locations.ts --city-slug=phoenix
|
||||
*
|
||||
* Options:
|
||||
* --city-slug=<slug> Run for a single city by its slug
|
||||
* --all-enabled Run for all cities where crawl_enabled = TRUE
|
||||
* --limit=<n> Limit the number of cities to process
|
||||
* --delay=<ms> Delay between cities in ms (default: 2000)
|
||||
*
|
||||
* Environment:
|
||||
* DATABASE_URL - PostgreSQL connection string (required)
|
||||
*/
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { DutchieLocationDiscovery } from '../dutchie-az/discovery/DutchieLocationDiscovery';
|
||||
|
||||
// Parse command line arguments
|
||||
/**
 * Reads the location-discovery options from process.argv.
 *
 * Recognized flags: --city-slug=<slug>, --all-enabled, --limit=<n>,
 * --delay=<ms>. Everything after the first "=" is taken as the value, so
 * values may themselves contain "=". A --limit or --delay that is not a
 * non-negative integer is ignored with a warning, leaving the default in
 * place instead of passing NaN on.
 *
 * @returns citySlug (null when absent), allEnabled, limit (undefined when
 *          absent or invalid) and delay in ms (default 2000).
 */
function parseArgs(): {
  citySlug: string | null;
  allEnabled: boolean;
  limit: number | undefined;
  delay: number;
} {
  let citySlug: string | null = null;
  let allEnabled = false;
  let limit: number | undefined = undefined;
  let delay = 2000;

  // Parses "<flag>=<digits>"; returns undefined (after warning) for anything
  // that is not a non-negative integer.
  const readCount = (arg: string, prefix: string): number | undefined => {
    const parsed = Number.parseInt(arg.slice(prefix.length), 10);
    if (Number.isNaN(parsed) || parsed < 0) {
      console.warn(`Ignoring invalid value in "${arg}"`);
      return undefined;
    }
    return parsed;
  };

  for (const arg of process.argv.slice(2)) {
    if (arg.startsWith('--city-slug=')) {
      citySlug = arg.slice('--city-slug='.length);
    } else if (arg === '--all-enabled') {
      allEnabled = true;
    } else if (arg.startsWith('--limit=')) {
      limit = readCount(arg, '--limit=') ?? limit;
    } else if (arg.startsWith('--delay=')) {
      delay = readCount(arg, '--delay=') ?? delay;
    }
  }

  return { citySlug, allEnabled, limit, delay };
}
|
||||
|
||||
/**
 * Prints the CLI help text (usage, options, examples, environment) to stdout
 * as a single console.log call.
 */
function printUsage() {
  const script = 'npx tsx src/scripts/discovery-dutchie-locations.ts';
  const helpLines = [
    '',
    'Dutchie Location Discovery CLI',
    '',
    'Usage:',
    `${script} [options]`,
    '',
    'Options:',
    '--city-slug=<slug> Run for a single city by its slug',
    '--all-enabled Run for all cities where crawl_enabled = TRUE',
    '--limit=<n> Limit the number of cities to process',
    '--delay=<ms> Delay between cities in ms (default: 2000)',
    '',
    'Examples:',
    `${script} --all-enabled`,
    `${script} --city-slug=phoenix`,
    `${script} --all-enabled --limit=5`,
    '',
    'Environment:',
    'DATABASE_URL - PostgreSQL connection string (required)',
    '',
  ];
  console.log(helpLines.join('\n'));
}
|
||||
|
||||
/**
 * CLI entry point for Dutchie location discovery.
 *
 * Runs discovery either for one city (--city-slug) or for all crawl-enabled
 * cities (--all-enabled), prints a summary, and exits with status 1 when the
 * arguments are invalid, the city is unknown, the run reported errors or an
 * exception was thrown; 0 otherwise.
 */
async function main() {
  const { citySlug, allEnabled, limit, delay } = parseArgs();

  if (!citySlug && !allEnabled) {
    console.error('ERROR: Must specify either --city-slug=<slug> or --all-enabled');
    printUsage();
    process.exit(1);
  }

  console.log('='.repeat(60));
  console.log('DUTCHIE LOCATION DISCOVERY');
  console.log('='.repeat(60));

  if (citySlug) {
    console.log(`Mode: Single city (${citySlug})`);
  } else {
    console.log(`Mode: All enabled cities${limit ? ` (limit: ${limit})` : ''}`);
  }
  console.log(`Delay between cities: ${delay}ms`);
  console.log('');

  // Get database URL from environment
  const connectionString = process.env.DATABASE_URL;
  if (!connectionString) {
    console.error('ERROR: DATABASE_URL environment variable is required');
    console.error('');
    console.error('Usage:');
    console.error(' DATABASE_URL="postgresql://..." npx tsx src/scripts/discovery-dutchie-locations.ts --all-enabled');
    process.exit(1);
  }

  // Create pool
  const pool = new Pool({ connectionString });

  // process.exit() terminates immediately and would skip the finally block,
  // so the exit status is recorded here and applied only after the pool has
  // been closed. No branch below may `return`, or the final exit is skipped.
  let exitCode = 0;

  try {
    // Test connection
    await pool.query('SELECT 1');
    console.log('[CLI] Database connection established');

    const discovery = new DutchieLocationDiscovery(pool);

    if (citySlug) {
      // Single city mode
      const city = await discovery.getCityBySlug(citySlug);
      if (!city) {
        console.error(`ERROR: City not found: ${citySlug}`);
        console.error('');
        console.error('Make sure you have run city discovery first:');
        console.error(' npm run discovery:dutchie:cities');
        exitCode = 1;
      } else {
        const result = await discovery.discoverForCity(city);

        console.log('');
        console.log('='.repeat(60));
        console.log('DISCOVERY COMPLETE');
        console.log('='.repeat(60));
        console.log(`City: ${city.cityName}, ${city.stateCode}`);
        console.log(`Locations found: ${result.locationsFound}`);
        console.log(`Inserted: ${result.locationsInserted}`);
        console.log(`Updated: ${result.locationsUpdated}`);
        console.log(`Skipped (protected): ${result.locationsSkipped}`);
        console.log(`Errors: ${result.errors.length}`);
        console.log(`Duration: ${(result.durationMs / 1000).toFixed(1)}s`);

        if (result.errors.length > 0) {
          console.log('');
          console.log('Errors:');
          result.errors.forEach((e) => console.log(` - ${e}`));
        }

        exitCode = result.errors.length > 0 ? 1 : 0;
      }
    } else {
      // All enabled cities mode
      const result = await discovery.discoverAllEnabled({ limit, delayMs: delay });

      console.log('');
      console.log('='.repeat(60));
      console.log('DISCOVERY COMPLETE');
      console.log('='.repeat(60));
      console.log(`Total cities processed: ${result.totalCities}`);
      console.log(`Total locations found: ${result.totalLocationsFound}`);
      console.log(`Total inserted: ${result.totalInserted}`);
      console.log(`Total updated: ${result.totalUpdated}`);
      console.log(`Total skipped: ${result.totalSkipped}`);
      console.log(`Total errors: ${result.errors.length}`);
      console.log(`Duration: ${(result.durationMs / 1000).toFixed(1)}s`);

      // Cap the error listing at 20 entries to keep the summary readable.
      if (result.errors.length > 0 && result.errors.length <= 20) {
        console.log('');
        console.log('Errors:');
        result.errors.forEach((e) => console.log(` - ${e}`));
      } else if (result.errors.length > 20) {
        console.log('');
        console.log(`First 20 of ${result.errors.length} errors:`);
        result.errors.slice(0, 20).forEach((e) => console.log(` - ${e}`));
      }

      exitCode = result.errors.length > 0 ? 1 : 0;
    }
  } catch (error: any) {
    console.error('FATAL ERROR:', error.message);
    console.error(error.stack);
    exitCode = 1;
  } finally {
    await pool.end();
  }

  process.exit(exitCode);
}
|
||||
|
||||
// Script entry point: start the location discovery CLI.
main();
|
||||
@@ -1,749 +0,0 @@
|
||||
/**
|
||||
* Legacy Data Import ETL Script
|
||||
*
|
||||
* DEPRECATED: This script assumed a two-database architecture.
|
||||
*
|
||||
* CURRENT ARCHITECTURE (Single Database):
|
||||
* - All data lives in ONE database: cannaiq (cannaiq-postgres container)
|
||||
* - Legacy tables exist INSIDE this same database with namespaced prefixes (e.g., legacy_*)
|
||||
* - The only database is: cannaiq (in cannaiq-postgres container)
|
||||
*
|
||||
* If you need to import legacy data:
|
||||
* 1. Import into namespaced tables (legacy_dispensaries, legacy_products, etc.)
|
||||
* inside the main cannaiq database
|
||||
* 2. Use the canonical connection from src/dutchie-az/db/connection.ts
|
||||
*
|
||||
* SAFETY RULES:
|
||||
* - INSERT-ONLY: No UPDATE, no DELETE, no TRUNCATE
|
||||
* - ON CONFLICT DO NOTHING: Skip duplicates, never overwrite
|
||||
* - Batch Processing: 500-1000 rows per batch
|
||||
* - Manual Invocation Only: Requires explicit user execution
|
||||
*/
|
||||
|
||||
import { Pool, PoolClient } from 'pg';
|
||||
|
||||
// ============================================================
|
||||
// CONFIGURATION
|
||||
// ============================================================
|
||||
|
||||
// Number of rows read from a legacy table per LIMIT/OFFSET page.
const BATCH_SIZE = 500;
|
||||
|
||||
/** Run options for the ETL, as produced by parseArgs(). */
interface ETLConfig {
  // Names of the tables to import (overridden by --tables=a,b,c).
  tables: string[];
  // True when --dry-run was passed.
  dryRun: boolean;
}
|
||||
|
||||
/** Per-table counters returned by each import function. */
interface ETLStats {
  // Label of the table the counters belong to.
  table: string;

  // Rows fetched from the source table.
  read: number;
  // Rows inserted (in a dry run: rows that would have been attempted).
  inserted: number;
  // Rows not inserted because the insert returned no row.
  skipped: number;
  // Rows whose insert threw.
  errors: number;

  // Wall-clock time of the import in milliseconds.
  durationMs: number;
}
|
||||
|
||||
// Parse command line arguments
|
||||
/**
 * Builds the ETL configuration from process.argv.
 *
 * Recognized flags:
 *   --dry-run         enable dry-run mode
 *   --tables=a,b,c    restrict the run to the listed tables
 *
 * Table names are trimmed and empty entries dropped, so inputs such as
 * "--tables=dispensaries, products," do not produce blank or padded names.
 *
 * @returns The configuration; without flags, dryRun is false and all four
 *          default tables are selected.
 */
function parseArgs(): ETLConfig {
  const config: ETLConfig = {
    dryRun: false,
    tables: ['dispensaries', 'products', 'dutchie_products', 'dutchie_product_snapshots'],
  };

  const tablesPrefix = '--tables=';
  for (const arg of process.argv.slice(2)) {
    if (arg === '--dry-run') {
      config.dryRun = true;
    } else if (arg.startsWith(tablesPrefix)) {
      config.tables = arg
        .slice(tablesPrefix.length)
        .split(',')
        .map((name) => name.trim())
        .filter((name) => name.length > 0);
    }
  }

  return config;
}
|
||||
|
||||
// ============================================================
|
||||
// DATABASE CONNECTIONS
|
||||
// ============================================================
|
||||
|
||||
// DEPRECATED: Both pools point to the same database (cannaiq)
|
||||
// Legacy tables exist inside the main database with namespaced prefixes
|
||||
/**
 * Creates the connection pool used to read the legacy tables.
 *
 * Settings come from the CANNAIQ_DB_* environment variables, with local
 * development defaults; the pool is capped at 5 connections.
 */
function createLegacyPool(): Pool {
  return new Pool({
    host: process.env.CANNAIQ_DB_HOST || 'localhost',
    // Explicit radix so the port is always read as a decimal number.
    port: Number.parseInt(process.env.CANNAIQ_DB_PORT || '54320', 10),
    user: process.env.CANNAIQ_DB_USER || 'dutchie',
    password: process.env.CANNAIQ_DB_PASS || 'dutchie_local_pass',
    database: process.env.CANNAIQ_DB_NAME || 'cannaiq',
    max: 5,
  });
}
|
||||
|
||||
/**
 * Creates the connection pool used to write the staging tables.
 *
 * Settings come from the CANNAIQ_DB_* environment variables, with local
 * development defaults; the pool is capped at 5 connections.
 */
function createCannaiqPool(): Pool {
  return new Pool({
    host: process.env.CANNAIQ_DB_HOST || 'localhost',
    // Explicit radix so the port is always read as a decimal number.
    port: Number.parseInt(process.env.CANNAIQ_DB_PORT || '54320', 10),
    user: process.env.CANNAIQ_DB_USER || 'dutchie',
    password: process.env.CANNAIQ_DB_PASS || 'dutchie_local_pass',
    database: process.env.CANNAIQ_DB_NAME || 'cannaiq',
    max: 5,
  });
}
|
||||
|
||||
// ============================================================
|
||||
// STAGING TABLE CREATION
|
||||
// ============================================================
|
||||
|
||||
// DDL for the staging tables and their lookup indexes. Every statement uses
// IF NOT EXISTS, and the whole script is sent to the database as one query.
const STAGING_TABLES_SQL = `
  -- Staging table for legacy dispensaries
  CREATE TABLE IF NOT EXISTS dispensaries_from_legacy (
    id SERIAL PRIMARY KEY,
    legacy_id INTEGER NOT NULL,
    name VARCHAR(255) NOT NULL,
    slug VARCHAR(255) NOT NULL,
    city VARCHAR(100) NOT NULL,
    state VARCHAR(10) NOT NULL,
    postal_code VARCHAR(20),
    address TEXT,
    latitude DECIMAL(10,7),
    longitude DECIMAL(10,7),
    menu_url TEXT,
    website TEXT,
    legacy_metadata JSONB,
    imported_at TIMESTAMPTZ DEFAULT NOW(),
    UNIQUE(legacy_id)
  );

  -- Staging table for legacy products
  CREATE TABLE IF NOT EXISTS products_from_legacy (
    id SERIAL PRIMARY KEY,
    legacy_product_id INTEGER NOT NULL,
    legacy_dispensary_id INTEGER,
    external_product_id VARCHAR(255),
    name VARCHAR(500) NOT NULL,
    brand_name VARCHAR(255),
    type VARCHAR(100),
    subcategory VARCHAR(100),
    strain_type VARCHAR(50),
    thc DECIMAL(10,4),
    cbd DECIMAL(10,4),
    price_cents INTEGER,
    original_price_cents INTEGER,
    stock_status VARCHAR(20),
    weight VARCHAR(100),
    primary_image_url TEXT,
    first_seen_at TIMESTAMPTZ,
    last_seen_at TIMESTAMPTZ,
    legacy_raw_payload JSONB,
    imported_at TIMESTAMPTZ DEFAULT NOW(),
    UNIQUE(legacy_product_id)
  );

  -- Staging table for legacy price history
  CREATE TABLE IF NOT EXISTS price_history_legacy (
    id SERIAL PRIMARY KEY,
    legacy_product_id INTEGER NOT NULL,
    price_cents INTEGER,
    recorded_at TIMESTAMPTZ,
    imported_at TIMESTAMPTZ DEFAULT NOW()
  );

  -- Index for efficient lookups
  CREATE INDEX IF NOT EXISTS idx_disp_legacy_slug ON dispensaries_from_legacy(slug, city, state);
  CREATE INDEX IF NOT EXISTS idx_prod_legacy_ext_id ON products_from_legacy(external_product_id);
`;
|
||||
|
||||
/**
 * Creates the staging tables by running STAGING_TABLES_SQL on a pooled client.
 *
 * @param cannaiqPool Pool to borrow the client from.
 * @param dryRun      When true, only logs what would happen and touches nothing.
 */
async function createStagingTables(cannaiqPool: Pool, dryRun: boolean): Promise<void> {
  console.log('[ETL] Creating staging tables...');

  if (!dryRun) {
    const client = await cannaiqPool.connect();
    try {
      await client.query(STAGING_TABLES_SQL);
      console.log('[ETL] Staging tables created successfully');
    } finally {
      // Always hand the client back, even when the DDL fails.
      client.release();
    }
    return;
  }

  console.log('[ETL] DRY RUN: Would create staging tables');
}
|
||||
|
||||
// ============================================================
|
||||
// ETL FUNCTIONS
|
||||
// ============================================================
|
||||
|
||||
/**
 * Copies the legacy `dispensaries` table into `dispensaries_from_legacy`.
 *
 * Rows are read in pages of BATCH_SIZE ordered by id and inserted one by one
 * with ON CONFLICT (legacy_id) DO NOTHING, so existing rows are counted as
 * skipped and never overwritten. A failing row is logged and counted without
 * aborting the run. In dry-run mode nothing is written and every row read is
 * counted as inserted.
 *
 * @param legacyPool  Pool the source rows are read from.
 * @param cannaiqPool Pool the staging rows are written to.
 * @param dryRun      When true, only read and report.
 * @returns Counters for the run, including its duration.
 */
async function importDispensaries(
  legacyPool: Pool,
  cannaiqPool: Pool,
  dryRun: boolean
): Promise<ETLStats> {
  const startTime = Date.now();
  const stats: ETLStats = {
    table: 'dispensaries',
    read: 0,
    inserted: 0,
    skipped: 0,
    errors: 0,
    durationMs: 0,
  };

  console.log('[ETL] Importing dispensaries...');

  // Nested try/finally: if acquiring the second client fails, the first one
  // is still released instead of leaking from its pool.
  const legacyClient = await legacyPool.connect();
  try {
    const cannaiqClient = await cannaiqPool.connect();
    try {
      // Count total rows
      const countResult = await legacyClient.query('SELECT COUNT(*) FROM dispensaries');
      const totalRows = Number.parseInt(countResult.rows[0].count, 10);
      console.log(`[ETL] Found ${totalRows} dispensaries in legacy database`);

      // Process in batches
      let offset = 0;
      while (offset < totalRows) {
        const batchResult = await legacyClient.query(`
          SELECT
            id, name, slug, city, state, zip, address,
            latitude, longitude, menu_url, website, dba_name,
            menu_provider, product_provider, provider_detection_data
          FROM dispensaries
          ORDER BY id
          LIMIT $1 OFFSET $2
        `, [BATCH_SIZE, offset]);

        stats.read += batchResult.rows.length;

        if (dryRun) {
          console.log(`[ETL] DRY RUN: Would insert batch of ${batchResult.rows.length} dispensaries`);
          stats.inserted += batchResult.rows.length;
        } else {
          for (const row of batchResult.rows) {
            try {
              // Columns without a dedicated staging column are kept as JSON.
              const legacyMetadata = {
                dba_name: row.dba_name,
                menu_provider: row.menu_provider,
                product_provider: row.product_provider,
                provider_detection_data: row.provider_detection_data,
              };

              const insertResult = await cannaiqClient.query(`
                INSERT INTO dispensaries_from_legacy
                  (legacy_id, name, slug, city, state, postal_code, address,
                   latitude, longitude, menu_url, website, legacy_metadata)
                VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12)
                ON CONFLICT (legacy_id) DO NOTHING
                RETURNING id
              `, [
                row.id,
                row.name,
                row.slug,
                row.city,
                row.state,
                row.zip,
                row.address,
                row.latitude,
                row.longitude,
                row.menu_url,
                row.website,
                JSON.stringify(legacyMetadata),
              ]);

              // No returned row means the conflict clause skipped the insert.
              if ((insertResult.rowCount ?? 0) > 0) {
                stats.inserted++;
              } else {
                stats.skipped++;
              }
            } catch (err: any) {
              stats.errors++;
              console.error(`[ETL] Error inserting dispensary ${row.id}:`, err.message);
            }
          }
        }

        offset += BATCH_SIZE;
        console.log(`[ETL] Processed ${Math.min(offset, totalRows)}/${totalRows} dispensaries`);
      }
    } finally {
      cannaiqClient.release();
    }
  } finally {
    legacyClient.release();
  }

  stats.durationMs = Date.now() - startTime;
  return stats;
}
|
||||
|
||||
/**
 * Converts a legacy price value into integer cents.
 * Returns null for missing or non-numeric values; a price of 0 stays 0.
 */
function legacyPriceToCents(value: unknown): number | null {
  if (value === null || value === undefined || value === '') {
    return null;
  }
  const amount = Number.parseFloat(String(value));
  return Number.isNaN(amount) ? null : Math.round(amount * 100);
}

/**
 * Copies the legacy `products` table into `products_from_legacy`.
 *
 * Rows are read in pages of BATCH_SIZE ordered by id and inserted one by one
 * with ON CONFLICT (legacy_product_id) DO NOTHING, so existing rows are
 * counted as skipped and never overwritten. Prices are converted to integer
 * cents and `in_stock` to a stock-status label. A failing row is logged and
 * counted without aborting the run. In dry-run mode nothing is written and
 * every row read is counted as inserted.
 *
 * @param legacyPool  Pool the source rows are read from.
 * @param cannaiqPool Pool the staging rows are written to.
 * @param dryRun      When true, only read and report.
 * @returns Counters for the run, including its duration.
 */
async function importProducts(
  legacyPool: Pool,
  cannaiqPool: Pool,
  dryRun: boolean
): Promise<ETLStats> {
  const startTime = Date.now();
  const stats: ETLStats = {
    table: 'products',
    read: 0,
    inserted: 0,
    skipped: 0,
    errors: 0,
    durationMs: 0,
  };

  console.log('[ETL] Importing legacy products...');

  // Nested try/finally: if acquiring the second client fails, the first one
  // is still released instead of leaking from its pool.
  const legacyClient = await legacyPool.connect();
  try {
    const cannaiqClient = await cannaiqPool.connect();
    try {
      const countResult = await legacyClient.query('SELECT COUNT(*) FROM products');
      const totalRows = Number.parseInt(countResult.rows[0].count, 10);
      console.log(`[ETL] Found ${totalRows} products in legacy database`);

      let offset = 0;
      while (offset < totalRows) {
        const batchResult = await legacyClient.query(`
          SELECT
            id, dispensary_id, dutchie_product_id, name, brand,
            subcategory, strain_type, thc_percentage, cbd_percentage,
            price, original_price, in_stock, weight, image_url,
            first_seen_at, last_seen_at, raw_data
          FROM products
          ORDER BY id
          LIMIT $1 OFFSET $2
        `, [BATCH_SIZE, offset]);

        stats.read += batchResult.rows.length;

        if (dryRun) {
          console.log(`[ETL] DRY RUN: Would insert batch of ${batchResult.rows.length} products`);
          stats.inserted += batchResult.rows.length;
        } else {
          for (const row of batchResult.rows) {
            try {
              // Anything other than an explicit true/false is recorded as unknown.
              const stockStatus = row.in_stock === true ? 'in_stock' :
                row.in_stock === false ? 'out_of_stock' : 'unknown';
              // Null-checks rather than truthiness, so a price of 0 is kept.
              const priceCents = legacyPriceToCents(row.price);
              const originalPriceCents = legacyPriceToCents(row.original_price);

              const insertResult = await cannaiqClient.query(`
                INSERT INTO products_from_legacy
                  (legacy_product_id, legacy_dispensary_id, external_product_id,
                   name, brand_name, subcategory, strain_type, thc, cbd,
                   price_cents, original_price_cents, stock_status, weight,
                   primary_image_url, first_seen_at, last_seen_at, legacy_raw_payload)
                VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17)
                ON CONFLICT (legacy_product_id) DO NOTHING
                RETURNING id
              `, [
                row.id,
                row.dispensary_id,
                row.dutchie_product_id,
                row.name,
                row.brand,
                row.subcategory,
                row.strain_type,
                row.thc_percentage,
                row.cbd_percentage,
                priceCents,
                originalPriceCents,
                stockStatus,
                row.weight,
                row.image_url,
                row.first_seen_at,
                row.last_seen_at,
                row.raw_data ? JSON.stringify(row.raw_data) : null,
              ]);

              // No returned row means the conflict clause skipped the insert.
              if ((insertResult.rowCount ?? 0) > 0) {
                stats.inserted++;
              } else {
                stats.skipped++;
              }
            } catch (err: any) {
              stats.errors++;
              console.error(`[ETL] Error inserting product ${row.id}:`, err.message);
            }
          }
        }

        offset += BATCH_SIZE;
        console.log(`[ETL] Processed ${Math.min(offset, totalRows)}/${totalRows} products`);
      }
    } finally {
      cannaiqClient.release();
    }
  } finally {
    legacyClient.release();
  }

  stats.durationMs = Date.now() - startTime;
  return stats;
}
|
||||
|
||||
/**
 * Copies rows from the legacy `dutchie_products` table into the CannaiQ
 * `dutchie_products` table, reading BATCH_SIZE rows per query.
 *
 * Rows whose `dispensary_id` has no matching row in the CannaiQ
 * `dispensaries` table are counted as skipped, as are rows that hit the
 * `(dispensary_id, external_product_id)` conflict target. In dry-run mode
 * nothing is written and every row read is counted as inserted.
 *
 * @param legacyPool  Pool for the legacy database (read side).
 * @param cannaiqPool Pool for the CannaiQ database (write side).
 * @param dryRun      When true, only read and report; no INSERT is issued.
 * @returns Counters and wall-clock duration for this table.
 */
async function importDutchieProducts(
  legacyPool: Pool,
  cannaiqPool: Pool,
  dryRun: boolean
): Promise<ETLStats> {
  const startTime = Date.now();
  const stats: ETLStats = {
    table: 'dutchie_products',
    read: 0,
    inserted: 0,
    skipped: 0,
    errors: 0,
    durationMs: 0,
  };

  console.log('[ETL] Importing dutchie_products...');

  const legacyClient = await legacyPool.connect();
  try {
    // Acquired inside the outer try so the legacy client is still released
    // when this second connect() rejects.
    const cannaiqClient = await cannaiqPool.connect();
    try {
      const countResult = await legacyClient.query('SELECT COUNT(*) FROM dutchie_products');
      const totalRows = parseInt(countResult.rows[0].count, 10);
      console.log(`[ETL] Found ${totalRows} dutchie_products in legacy database`);

      // Note: For dutchie_products, we need to map dispensary_id to the canonical dispensary
      // This requires the dispensaries to be imported first
      // For now, we'll insert directly since the schema is nearly identical

      // Remembers the result of the dispensary existence check so each
      // dispensary is looked up once instead of once per product row.
      // Assumes the dispensaries table is not modified while this import runs.
      const dispensaryExists = new Map<unknown, boolean>();

      let offset = 0;
      while (offset < totalRows) {
        const batchResult = await legacyClient.query(`
          SELECT *
          FROM dutchie_products
          ORDER BY id
          LIMIT $1 OFFSET $2
        `, [BATCH_SIZE, offset]);

        stats.read += batchResult.rows.length;

        if (dryRun) {
          console.log(`[ETL] DRY RUN: Would insert batch of ${batchResult.rows.length} dutchie_products`);
          stats.inserted += batchResult.rows.length;
        } else {
          // For each row, attempt insert with ON CONFLICT DO NOTHING
          for (const row of batchResult.rows) {
            try {
              // Check if dispensary exists in canonical table
              let exists = dispensaryExists.get(row.dispensary_id);
              if (exists === undefined) {
                const dispCheck = await cannaiqClient.query(`
                  SELECT id FROM dispensaries WHERE id = $1
                `, [row.dispensary_id]);
                exists = dispCheck.rows.length > 0;
                dispensaryExists.set(row.dispensary_id, exists);
              }

              if (!exists) {
                stats.skipped++;
                continue; // Skip products for dispensaries not yet imported
              }

              const insertResult = await cannaiqClient.query(`
                INSERT INTO dutchie_products
                  (dispensary_id, platform, external_product_id, platform_dispensary_id,
                   c_name, name, brand_name, brand_id, brand_logo_url,
                   type, subcategory, strain_type, provider,
                   thc, thc_content, cbd, cbd_content, cannabinoids_v2, effects,
                   status, medical_only, rec_only, featured, coming_soon,
                   certificate_of_analysis_enabled,
                   is_below_threshold, is_below_kiosk_threshold,
                   options_below_threshold, options_below_kiosk_threshold,
                   stock_status, total_quantity_available,
                   primary_image_url, images, measurements, weight, past_c_names,
                   created_at_dutchie, updated_at_dutchie, latest_raw_payload)
                VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31, $32, $33, $34, $35, $36, $37, $38, $39)
                ON CONFLICT (dispensary_id, external_product_id) DO NOTHING
                RETURNING id
              `, [
                row.dispensary_id,
                row.platform || 'dutchie',
                row.external_product_id,
                row.platform_dispensary_id,
                row.c_name,
                row.name,
                row.brand_name,
                row.brand_id,
                row.brand_logo_url,
                row.type,
                row.subcategory,
                row.strain_type,
                row.provider,
                row.thc,
                row.thc_content,
                row.cbd,
                row.cbd_content,
                row.cannabinoids_v2,
                row.effects,
                row.status,
                row.medical_only,
                row.rec_only,
                row.featured,
                row.coming_soon,
                row.certificate_of_analysis_enabled,
                row.is_below_threshold,
                row.is_below_kiosk_threshold,
                row.options_below_threshold,
                row.options_below_kiosk_threshold,
                row.stock_status,
                row.total_quantity_available,
                row.primary_image_url,
                row.images,
                row.measurements,
                row.weight,
                row.past_c_names,
                row.created_at_dutchie,
                row.updated_at_dutchie,
                row.latest_raw_payload,
              ]);

              // RETURNING yields a row only when the insert actually happened.
              if ((insertResult.rowCount ?? 0) > 0) {
                stats.inserted++;
              } else {
                stats.skipped++;
              }
            } catch (err: any) {
              stats.errors++;
              // Only the first few failures are logged to keep output readable.
              if (stats.errors <= 5) {
                console.error(`[ETL] Error inserting dutchie_product ${row.id}:`, err.message);
              }
            }
          }
        }

        offset += BATCH_SIZE;
        console.log(`[ETL] Processed ${Math.min(offset, totalRows)}/${totalRows} dutchie_products`);
      }
    } finally {
      cannaiqClient.release();
    }
  } finally {
    legacyClient.release();
  }

  stats.durationMs = Date.now() - startTime;
  return stats;
}
|
||||
|
||||
/**
 * Copies rows from the legacy `dutchie_product_snapshots` table into the
 * CannaiQ `dutchie_product_snapshots` table, reading BATCH_SIZE rows per query.
 *
 * Each snapshot is attached to the CannaiQ product whose
 * `(dispensary_id, external_product_id)` pair matches the legacy row;
 * snapshots without such a product are counted as skipped. The INSERT has no
 * conflict handling, so running the import twice inserts the rows twice.
 * In dry-run mode nothing is written and every row read is counted as inserted.
 *
 * @param legacyPool  Pool for the legacy database (read side).
 * @param cannaiqPool Pool for the CannaiQ database (write side).
 * @param dryRun      When true, only read and report; no INSERT is issued.
 * @returns Counters and wall-clock duration for this table.
 */
async function importDutchieSnapshots(
  legacyPool: Pool,
  cannaiqPool: Pool,
  dryRun: boolean
): Promise<ETLStats> {
  const startTime = Date.now();
  const stats: ETLStats = {
    table: 'dutchie_product_snapshots',
    read: 0,
    inserted: 0,
    skipped: 0,
    errors: 0,
    durationMs: 0,
  };

  console.log('[ETL] Importing dutchie_product_snapshots...');

  const legacyClient = await legacyPool.connect();
  try {
    // Acquired inside the outer try so the legacy client is still released
    // when this second connect() rejects.
    const cannaiqClient = await cannaiqPool.connect();
    try {
      const countResult = await legacyClient.query('SELECT COUNT(*) FROM dutchie_product_snapshots');
      const totalRows = parseInt(countResult.rows[0].count, 10);
      console.log(`[ETL] Found ${totalRows} dutchie_product_snapshots in legacy database`);

      // Build mapping of "dispensary_id:external_product_id" -> canonical product id
      console.log('[ETL] Building product ID mapping...');
      const mappingResult = await cannaiqClient.query(`
        SELECT id, external_product_id, dispensary_id FROM dutchie_products
      `);
      const productByKey = new Map<string, number>();
      for (const row of mappingResult.rows) {
        const key = `${row.dispensary_id}:${row.external_product_id}`;
        productByKey.set(key, row.id);
      }

      let offset = 0;
      while (offset < totalRows) {
        const batchResult = await legacyClient.query(`
          SELECT *
          FROM dutchie_product_snapshots
          ORDER BY id
          LIMIT $1 OFFSET $2
        `, [BATCH_SIZE, offset]);

        stats.read += batchResult.rows.length;

        if (dryRun) {
          console.log(`[ETL] DRY RUN: Would insert batch of ${batchResult.rows.length} snapshots`);
          stats.inserted += batchResult.rows.length;
        } else {
          for (const row of batchResult.rows) {
            try {
              // Map legacy product to canonical product ID
              const key = `${row.dispensary_id}:${row.external_product_id}`;
              const canonicalProductId = productByKey.get(key);

              // Explicit undefined check: only a missing mapping means "skip".
              if (canonicalProductId === undefined) {
                stats.skipped++;
                continue; // Skip snapshots for products not yet imported
              }

              // Insert snapshot (no conflict handling - all snapshots are historical)
              await cannaiqClient.query(`
                INSERT INTO dutchie_product_snapshots
                  (dutchie_product_id, dispensary_id, platform_dispensary_id,
                   external_product_id, pricing_type, crawl_mode,
                   status, featured, special, medical_only, rec_only,
                   is_present_in_feed, stock_status,
                   rec_min_price_cents, rec_max_price_cents, rec_min_special_price_cents,
                   med_min_price_cents, med_max_price_cents, med_min_special_price_cents,
                   wholesale_min_price_cents,
                   total_quantity_available, total_kiosk_quantity_available,
                   manual_inventory, is_below_threshold, is_below_kiosk_threshold,
                   options, raw_payload, crawled_at)
                VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28)
              `, [
                canonicalProductId,
                row.dispensary_id,
                row.platform_dispensary_id,
                row.external_product_id,
                row.pricing_type,
                row.crawl_mode,
                row.status,
                row.featured,
                row.special,
                row.medical_only,
                row.rec_only,
                row.is_present_in_feed,
                row.stock_status,
                row.rec_min_price_cents,
                row.rec_max_price_cents,
                row.rec_min_special_price_cents,
                row.med_min_price_cents,
                row.med_max_price_cents,
                row.med_min_special_price_cents,
                row.wholesale_min_price_cents,
                row.total_quantity_available,
                row.total_kiosk_quantity_available,
                row.manual_inventory,
                row.is_below_threshold,
                row.is_below_kiosk_threshold,
                row.options,
                row.raw_payload,
                row.crawled_at,
              ]);

              stats.inserted++;
            } catch (err: any) {
              stats.errors++;
              // Only the first few failures are logged to keep output readable.
              if (stats.errors <= 5) {
                console.error(`[ETL] Error inserting snapshot ${row.id}:`, err.message);
              }
            }
          }
        }

        offset += BATCH_SIZE;
        console.log(`[ETL] Processed ${Math.min(offset, totalRows)}/${totalRows} snapshots`);
      }
    } finally {
      cannaiqClient.release();
    }
  } finally {
    legacyClient.release();
  }

  stats.durationMs = Date.now() - startTime;
  return stats;
}
|
||||
|
||||
// ============================================================
|
||||
// MAIN
|
||||
// ============================================================
|
||||
|
||||
/**
 * Entry point of the legacy import ETL.
 *
 * Parses the command line, verifies both database connections, creates the
 * staging tables, runs the importer for every table selected in
 * `config.tables` (always in the fixed order below) and prints a summary.
 * A failure anywhere in that sequence is logged and terminates the process
 * with exit code 1.
 */
async function main(): Promise<void> {
  const rule = '='.repeat(60);
  console.log(rule);
  console.log('LEGACY DATA IMPORT ETL');
  console.log(rule);

  const config = parseArgs();

  console.log(`Mode: ${config.dryRun ? 'DRY RUN' : 'LIVE'}`);
  console.log(`Tables: ${config.tables.join(', ')}`);
  console.log('');

  // Create connection pools
  const legacyPool = createLegacyPool();
  const cannaiqPool = createCannaiqPool();

  // Importers in dependency order: dispensaries before products before snapshots.
  const importers = [
    ['dispensaries', importDispensaries],
    ['products', importProducts],
    ['dutchie_products', importDutchieProducts],
    ['dutchie_product_snapshots', importDutchieSnapshots],
  ] as const;

  try {
    // Test connections
    console.log('[ETL] Testing database connections...');
    await legacyPool.query('SELECT 1');
    console.log('[ETL] Legacy database connected');
    await cannaiqPool.query('SELECT 1');
    console.log('[ETL] CannaiQ database connected');
    console.log('');

    // Create staging tables
    await createStagingTables(cannaiqPool, config.dryRun);
    console.log('');

    // Run the selected imports one after another
    const allStats: ETLStats[] = [];
    for (const [tableName, runImport] of importers) {
      if (!config.tables.includes(tableName)) {
        continue;
      }
      allStats.push(await runImport(legacyPool, cannaiqPool, config.dryRun));
      console.log('');
    }

    // Print summary
    console.log(rule);
    console.log('IMPORT SUMMARY');
    console.log(rule);
    console.log('');
    console.log('| Table | Read | Inserted | Skipped | Errors | Duration |');
    console.log('|----------------------------|----------|----------|----------|----------|----------|');

    let totalInserted = 0;
    let totalErrors = 0;
    for (const entry of allStats) {
      const cells = [
        entry.table.padEnd(26),
        String(entry.read).padStart(8),
        String(entry.inserted).padStart(8),
        String(entry.skipped).padStart(8),
        String(entry.errors).padStart(8),
        `${(entry.durationMs / 1000).toFixed(1).padStart(7)}s`,
      ];
      console.log(`| ${cells.join(' | ')} |`);
      totalInserted += entry.inserted;
      totalErrors += entry.errors;
    }
    console.log('');

    console.log(`Total inserted: ${totalInserted}`);
    console.log(`Total errors: ${totalErrors}`);

    if (config.dryRun) {
      console.log('');
      console.log('DRY RUN COMPLETE - No data was written');
      console.log('Run without --dry-run to perform actual import');
    }
  } catch (error: any) {
    console.error('[ETL] Fatal error:', error.message);
    process.exit(1);
  } finally {
    await legacyPool.end();
    await cannaiqPool.end();
  }

  console.log('');
  console.log('ETL complete');
}
|
||||
|
||||
// Run the ETL; anything that escapes main() is reported and fails the process.
main().then(undefined, (failure) => {
  console.error('Unhandled error:', failure);
  process.exit(1);
});
|
||||
397
backend/src/scripts/harmonize-az-dispensaries.ts
Normal file
397
backend/src/scripts/harmonize-az-dispensaries.ts
Normal file
@@ -0,0 +1,397 @@
|
||||
/**
|
||||
* Harmonize AZ Dispensaries with Dutchie Source of Truth
|
||||
*
|
||||
* This script:
|
||||
* 1. Queries Dutchie ConsumerDispensaries API for all AZ cities
|
||||
* 2. Matches our dispensaries by platform_dispensary_id
|
||||
* 3. Updates existing records with full Dutchie data
|
||||
* 4. Creates new records for dispensaries in Dutchie but not in our DB
|
||||
* 5. Disables dispensaries not found in Dutchie
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx src/scripts/harmonize-az-dispensaries.ts
|
||||
* npx tsx src/scripts/harmonize-az-dispensaries.ts --dry-run
|
||||
* npx tsx src/scripts/harmonize-az-dispensaries.ts --state CA   (note: only AZ currently has a city list configured in AZ_CITIES)
|
||||
*/
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { executeGraphQL, GRAPHQL_HASHES } from '../platforms/dutchie/client';
|
||||
|
||||
// Connection pool for the CannaiQ database. Each setting is read from a
// CANNAIQ_DB_* environment variable and falls back to the value shown here.
const pool = new Pool({
  host: process.env.CANNAIQ_DB_HOST || 'localhost',
  // Explicit radix so the port is always parsed as a decimal number.
  port: Number.parseInt(process.env.CANNAIQ_DB_PORT || '54320', 10),
  database: process.env.CANNAIQ_DB_NAME || 'dutchie_menus',
  user: process.env.CANNAIQ_DB_USER || 'dutchie',
  password: process.env.CANNAIQ_DB_PASS || 'dutchie_local_pass',
});
|
||||
|
||||
/**
 * Subset of a `dispensaries` row as selected by getDispensaries().
 */
interface Dispensary {
  /** Primary key of the dispensaries row. */
  id: number;
  name: string;
  slug: string;
  city: string;
  state: string;
  /** Dutchie platform ID used for matching; null when never resolved. */
  platform_dispensary_id: string | null;
  /** NULL in the database is read as false by getDispensaries(). */
  dutchie_verified: boolean;
  /** NULL in the database is read as true by getDispensaries(). */
  crawl_enabled: boolean;
}
|
||||
|
||||
/**
 * Shape of one entry of `filteredDispensaries` in the ConsumerDispensaries
 * response, limited to the fields this script declares.
 */
interface DutchieDispensary {
  /** Platform ID like "deHiuKKmBHGJKXzuj" */
  id: string;
  /** Slug like "the-downtown-dispensary" */
  cName: string;
  name: string;
  phone: string | null;
  address: string;
  description: string | null;
  status: string;
  chain: string | null;
  timezone: string;
  location: {
    ln1: string;
    ln2: string;
    city: string;
    state: string;
    country: string;
    zipcode: string;
    geometry: {
      /** Read by this script as [longitude, latitude]: index 1 is stored as latitude. */
      coordinates: [number, number];
    };
  };
  deliveryHours: any;
  pickupHours: any;
  offerDelivery: boolean;
  offerPickup: boolean;
  offerCurbsidePickup: boolean;
  isMedical: boolean;
  isRecreational: boolean;
}
|
||||
|
||||
/**
 * Counters returned by harmonizeDispensaries().
 */
interface HarmonizationResult {
  /** Existing records matched by platform ID and refreshed from Dutchie. */
  updated: number;
  /** Dutchie dispensaries that had no matched record and were inserted/upserted. */
  created: number;
  /** Existing records switched to crawl_enabled = false. */
  disabled: number;
  /** Initialised to 0 by harmonizeDispensaries(). */
  skipped: number;
  /** One human-readable message per failed operation. */
  errors: string[];
}
|
||||
|
||||
// Cities to query for AZ (from statesWithDispensaries)
|
||||
// Arizona cities passed one by one to the ConsumerDispensaries query
// (from statesWithDispensaries).
const AZ_CITIES = [
  'Apache Junction',
  'Bisbee',
  'Bullhead City',
  'Casa Grande',
  'Chandler',
  'Cottonwood',
  'El Mirage',
  'Flagstaff',
  'Florence',
  'Gilbert',
  'Glendale',
  'Globe',
  'Goodyear',
  'Kingman',
  'Lake Havasu City',
  'Maricopa',
  'Mesa',
  'Peoria',
  'Phoenix',
  'Prescott',
  'Prescott Valley',
  'Queen Creek',
  'Scottsdale',
  'Show Low',
  'Sierra Vista',
  'Snowflake',
  'Sun City',
  'Surprise',
  'Tempe',
  'Tolleson',
  'Tucson',
  'Yuma',
];
|
||||
|
||||
/**
 * Loads every dispensary row for one state, ordered by id.
 *
 * NULL `dutchie_verified` is returned as false and NULL `crawl_enabled`
 * as true.
 *
 * @param state State code compared against the `state` column.
 * @returns The matching rows.
 */
async function getDispensaries(state: string): Promise<Dispensary[]> {
  const sql = `SELECT id, name, slug, city, state, platform_dispensary_id,
COALESCE(dutchie_verified, false) as dutchie_verified,
COALESCE(crawl_enabled, true) as crawl_enabled
FROM dispensaries
WHERE state = $1
ORDER BY id`;

  const { rows } = await pool.query<Dispensary>(sql, [state]);
  return rows;
}
|
||||
|
||||
/**
 * Fetches all active dispensaries Dutchie reports for one city.
 *
 * Pages through the ConsumerDispensaries query 100 entries at a time and
 * stops at the first page that comes back with fewer than 100 entries.
 * A response without `data.filteredDispensaries` is treated as an empty page.
 *
 * @param city  City name as used in the Dutchie filter.
 * @param state State code as used in the Dutchie filter.
 * @returns Every dispensary collected across the pages.
 */
async function fetchDutchieDispensariesByCity(
  city: string,
  state: string
): Promise<DutchieDispensary[]> {
  const pageSize = 100;
  const collected: DutchieDispensary[] = [];

  // "<city-with-dashes>-<state>", both lower-cased, passed as the request cName.
  const citySlug = `${city.toLowerCase().replace(/\s+/g, '-')}-${state.toLowerCase()}`;

  let pageIndex = 0;
  let morePages = true;
  while (morePages) {
    const response = await executeGraphQL(
      'ConsumerDispensaries',
      {
        dispensaryFilter: { activeOnly: true, city, state },
        page: pageIndex,
        perPage: pageSize,
      },
      GRAPHQL_HASHES.ConsumerDispensaries,
      { cName: citySlug, maxRetries: 2, retryOn403: true }
    );

    const batch = response?.data?.filteredDispensaries || [];
    collected.push(...batch);

    // A short page means there is nothing left to fetch.
    morePages = batch.length >= pageSize;
    if (morePages) {
      pageIndex++;
      // Rate limit
      await new Promise(resolve => setTimeout(resolve, 200));
    }
  }

  return collected;
}
|
||||
|
||||
/**
 * Collects the Dutchie dispensaries of every configured city of a state,
 * de-duplicated by platform ID (the first occurrence wins).
 *
 * Only 'AZ' has a city list; any other state iterates over no cities and
 * therefore yields an empty map.
 *
 * @param state State code.
 * @returns Map from Dutchie platform ID to the dispensary record.
 */
async function fetchAllDutchieDispensaries(state: string): Promise<Map<string, DutchieDispensary>> {
  const cityList = state === 'AZ' ? AZ_CITIES : [];
  const byPlatformId = new Map<string, DutchieDispensary>();

  console.log(`Fetching dispensaries from ${cityList.length} cities...`);

  for (const cityName of cityList) {
    const found = await fetchDutchieDispensariesByCity(cityName, state);
    console.log(` ${cityName}: ${found.length} dispensaries`);

    // Index by platform ID, keeping the first record seen for each ID
    for (const entry of found) {
      if (!entry.id || byPlatformId.has(entry.id)) {
        continue;
      }
      byPlatformId.set(entry.id, entry);
    }

    // Rate limit between cities
    await new Promise(resolve => setTimeout(resolve, 300));
  }

  console.log(`Total unique dispensaries from Dutchie: ${byPlatformId.size}\n`);
  return byPlatformId;
}
|
||||
|
||||
/**
 * Overwrites an existing dispensary row with the data Dutchie reports and
 * marks it as verified and crawl-enabled.
 *
 * @param dispensaryId Primary key of the row to update.
 * @param dutchie      Dutchie record that is the source of the new values.
 * @param dryRun       When true the function returns without touching the DB.
 */
async function updateDispensary(
  dispensaryId: number,
  dutchie: DutchieDispensary,
  dryRun: boolean
): Promise<void> {
  if (dryRun) return;

  const menuUrl = `https://dutchie.com/dispensary/${dutchie.cName}`;

  // Index 1 is stored as latitude and index 0 as longitude. `??` keeps a
  // legitimate 0 coordinate instead of turning it into NULL.
  const coordinates = dutchie.location?.geometry?.coordinates;
  const latitude = coordinates?.[1] ?? null;
  const longitude = coordinates?.[0] ?? null;

  await pool.query(
    `UPDATE dispensaries
     SET name = $2,
         slug = $3,
         address = $4,
         city = $5,
         postal_code = $6,
         phone = $7,
         latitude = $8,
         longitude = $9,
         menu_url = $10,
         menu_type = 'dutchie',
         platform = 'dutchie',
         is_delivery = $11,
         is_pickup = $12,
         dutchie_verified = true,
         dutchie_verified_at = NOW(),
         crawl_enabled = true,
         updated_at = NOW()
     WHERE id = $1`,
    [
      dispensaryId,
      dutchie.name.trim(),
      dutchie.cName,
      // Street line when present, otherwise the flat address string.
      dutchie.location?.ln1 || dutchie.address,
      dutchie.location?.city || '',
      dutchie.location?.zipcode || '',
      dutchie.phone,
      latitude,
      longitude,
      menuUrl,
      dutchie.offerDelivery ?? false,
      dutchie.offerPickup ?? true,
    ]
  );
}
|
||||
|
||||
/**
 * Inserts a dispensary row from a Dutchie record.
 *
 * When a row with the same slug already exists, that row is updated instead
 * (platform ID, name, menu URL, address, postal code, coordinates,
 * delivery/pickup flags, phone) and re-marked as verified and crawl-enabled;
 * its city, state and platform columns are left as they were.
 *
 * @param dutchie Dutchie record to insert.
 * @param state   State code stored on the new row.
 * @param dryRun  When true the function returns null without touching the DB.
 * @returns The id of the inserted or updated row, or null in dry-run mode or
 *          when the statement returned no row.
 */
async function createDispensary(
  dutchie: DutchieDispensary,
  state: string,
  dryRun: boolean
): Promise<number | null> {
  if (dryRun) return null;

  const menuUrl = `https://dutchie.com/dispensary/${dutchie.cName}`;

  // Index 1 is stored as latitude and index 0 as longitude. `??` keeps a
  // legitimate 0 coordinate instead of turning it into NULL.
  const coordinates = dutchie.location?.geometry?.coordinates;
  const latitude = coordinates?.[1] ?? null;
  const longitude = coordinates?.[0] ?? null;

  const result = await pool.query<{ id: number }>(
    `INSERT INTO dispensaries (
       name, slug, city, state, platform, platform_dispensary_id,
       menu_url, menu_type, address, postal_code, latitude, longitude,
       is_delivery, is_pickup, phone,
       dutchie_verified, dutchie_verified_at,
       crawl_enabled, platform_id_source, platform_id_verified_at,
       created_at, updated_at
     ) VALUES (
       $1, $2, $3, $4, 'dutchie', $5,
       $6, 'dutchie', $7, $8, $9, $10,
       $11, $12, $13,
       true, NOW(),
       true, 'dutchie_harmonization', NOW(),
       NOW(), NOW()
     )
     ON CONFLICT (slug) DO UPDATE SET
       platform_dispensary_id = EXCLUDED.platform_dispensary_id,
       name = EXCLUDED.name,
       menu_url = EXCLUDED.menu_url,
       address = EXCLUDED.address,
       postal_code = EXCLUDED.postal_code,
       latitude = EXCLUDED.latitude,
       longitude = EXCLUDED.longitude,
       is_delivery = EXCLUDED.is_delivery,
       is_pickup = EXCLUDED.is_pickup,
       phone = EXCLUDED.phone,
       dutchie_verified = true,
       dutchie_verified_at = NOW(),
       crawl_enabled = true,
       updated_at = NOW()
     RETURNING id`,
    [
      dutchie.name.trim(),
      dutchie.cName,
      dutchie.location?.city || '',
      state,
      dutchie.id,
      menuUrl,
      // Street line when present, otherwise the flat address string.
      dutchie.location?.ln1 || dutchie.address,
      dutchie.location?.zipcode || '',
      latitude,
      longitude,
      dutchie.offerDelivery ?? false,
      dutchie.offerPickup ?? true,
      dutchie.phone,
    ]
  );

  // Nullish fallback only: a returned id is passed through unchanged.
  return result.rows[0]?.id ?? null;
}
|
||||
|
||||
/**
 * Turns crawling off for one dispensary and records why.
 *
 * @param dispensaryId Primary key of the row to disable.
 * @param reason       Text written to `failure_notes`.
 * @param dryRun       When true nothing is written.
 */
async function disableDispensary(dispensaryId: number, reason: string, dryRun: boolean): Promise<void> {
  if (!dryRun) {
    const sql = `UPDATE dispensaries
SET crawl_enabled = false,
failure_notes = $2,
updated_at = NOW()
WHERE id = $1`;
    const params = [dispensaryId, reason];

    await pool.query(sql, params);
  }
}
|
||||
|
||||
/**
 * Reconciles the dispensaries of one state with what Dutchie reports.
 *
 * Step 1 walks our rows: a row whose `platform_dispensary_id` is present in
 * the Dutchie result is updated from it; every other row (unknown ID or no
 * ID at all) is disabled. Step 2 creates a row for every Dutchie dispensary
 * that was not matched in step 1.
 *
 * Safety guard: when Dutchie yields no dispensaries at all (state without a
 * configured city list, or every request failing) the run stops before
 * step 1, because continuing would disable every dispensary of the state.
 *
 * @param state  State code to harmonize.
 * @param dryRun When true, report what would happen without writing.
 * @returns Counters plus the list of error messages collected on the way.
 */
async function harmonizeDispensaries(
  state: string,
  dryRun: boolean = false
): Promise<HarmonizationResult> {
  console.log(`\n${'='.repeat(60)}`);
  console.log(`HARMONIZING ${state} DISPENSARIES${dryRun ? ' (DRY RUN)' : ''}`);
  console.log(`${'='.repeat(60)}\n`);

  const result: HarmonizationResult = {
    updated: 0,
    created: 0,
    disabled: 0,
    skipped: 0,
    errors: [],
  };

  // Fetch all dispensaries from Dutchie (source of truth)
  const dutchieMap = await fetchAllDutchieDispensaries(state);

  // An empty source of truth is far more likely a fetch problem than a state
  // with zero dispensaries; do not disable anything on that basis.
  if (dutchieMap.size === 0) {
    const message = `No dispensaries returned from Dutchie for ${state} - aborting without changes`;
    console.error(` [ERROR] ${message}`);
    result.errors.push(message);
    return result;
  }

  // Get our current dispensaries
  const dispensaries = await getDispensaries(state);
  console.log(`Found ${dispensaries.length} dispensaries in our DB\n`);

  // Track which Dutchie dispensaries we've matched
  const matchedDutchieIds = new Set<string>();

  // Disables one row and records a failure instead of aborting the whole run,
  // mirroring how update and create failures are handled.
  const disable = async (disp: Dispensary, reason: string): Promise<void> => {
    try {
      await disableDispensary(disp.id, reason, dryRun);
      result.disabled++;
    } catch (error: any) {
      console.error(` [ERROR] ${disp.name}: ${error.message}`);
      result.errors.push(`Disable ${disp.name}: ${error.message}`);
    }
  };

  // Step 1: Match our dispensaries to Dutchie by platform_dispensary_id
  console.log('[Step 1/3] Matching existing dispensaries to Dutchie...');
  for (const disp of dispensaries) {
    if (disp.platform_dispensary_id && dutchieMap.has(disp.platform_dispensary_id)) {
      // Found match - update with Dutchie data
      const dutchie = dutchieMap.get(disp.platform_dispensary_id)!;

      try {
        await updateDispensary(disp.id, dutchie, dryRun);
        console.log(` [UPDATED] ${disp.name} -> ${dutchie.name} (${dutchie.cName})`);
        result.updated++;
        matchedDutchieIds.add(disp.platform_dispensary_id);
      } catch (error: any) {
        console.error(` [ERROR] ${disp.name}: ${error.message}`);
        result.errors.push(`Update ${disp.name}: ${error.message}`);
      }
    } else if (disp.platform_dispensary_id) {
      // Has platform ID but not found in Dutchie - maybe closed?
      console.log(` [NOT FOUND] ${disp.name} (${disp.platform_dispensary_id}) - not in Dutchie`);
      await disable(disp, 'Platform ID not found in Dutchie - may be closed');
    } else {
      // No platform ID - disable
      console.log(` [NO ID] ${disp.name} - no platform_dispensary_id`);
      await disable(disp, 'No platform_dispensary_id');
    }
  }

  // Step 2: Create new dispensaries for Dutchie records we don't have
  console.log(`\n[Step 2/3] Creating new dispensaries from Dutchie...`);
  for (const [platformId, dutchie] of dutchieMap) {
    if (matchedDutchieIds.has(platformId)) {
      continue; // Already matched
    }

    try {
      const newId = await createDispensary(dutchie, state, dryRun);
      console.log(` [CREATED] ${dutchie.name} (${dutchie.cName}) -> ID ${newId || '(dry-run)'}`);
      result.created++;
    } catch (error: any) {
      console.error(` [ERROR] ${dutchie.name}: ${error.message}`);
      result.errors.push(`Create ${dutchie.name}: ${error.message}`);
    }
  }

  // Summary
  console.log(`\n${'='.repeat(60)}`);
  console.log('HARMONIZATION SUMMARY');
  console.log(`${'='.repeat(60)}`);
  console.log(` Updated (matched to Dutchie): ${result.updated}`);
  console.log(` Created (new from Dutchie): ${result.created}`);
  console.log(` Disabled (not in Dutchie): ${result.disabled}`);
  console.log(` Errors: ${result.errors.length}`);

  if (result.errors.length > 0) {
    console.log(`\nErrors:`);
    // Cap the listing at 20 entries; the remainder is only counted.
    result.errors.slice(0, 20).forEach(e => console.log(` - ${e}`));
    if (result.errors.length > 20) {
      console.log(` ... and ${result.errors.length - 20} more`);
    }
  }

  return result;
}
|
||||
|
||||
/**
 * Command-line entry point.
 *
 * Recognises `--state <CODE>` (upper-cased, default 'AZ') and `--dry-run`,
 * runs the harmonization and always closes the connection pool afterwards.
 */
async function main() {
  const argv = process.argv.slice(2);
  let state = 'AZ';
  let dryRun = false;

  let position = 0;
  while (position < argv.length) {
    const flag = argv[position];
    const value = argv[position + 1];

    if (flag === '--state' && value) {
      // The following argument is consumed as the state code.
      state = value.toUpperCase();
      position += 2;
      continue;
    }

    if (flag === '--dry-run') {
      dryRun = true;
    }
    position += 1;
  }

  try {
    await harmonizeDispensaries(state, dryRun);
  } finally {
    await pool.end();
  }
}
|
||||
|
||||
// Report the failure and make the process exit non-zero so schedulers and
// shells can tell a failed run from a successful one.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
|
||||
@@ -1,583 +0,0 @@
|
||||
#!/usr/bin/env npx tsx
|
||||
/**
|
||||
* Queue Intelligence Script
|
||||
*
|
||||
* Orchestrates the multi-category intelligence crawler system:
|
||||
* 1. Queue dispensaries that need provider detection (all 4 categories)
|
||||
* 2. Queue per-category production crawls (Dutchie products only for now)
|
||||
* 3. Queue per-category sandbox crawls (all providers)
|
||||
*
|
||||
* Each category (product, specials, brand, metadata) is handled independently.
|
||||
* A failure in one category does NOT affect other categories.
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx src/scripts/queue-intelligence.ts [--detection] [--production] [--sandbox] [--all]
|
||||
* npx tsx src/scripts/queue-intelligence.ts --category=product --sandbox
|
||||
* npx tsx src/scripts/queue-intelligence.ts --process --category=product
|
||||
* npx tsx src/scripts/queue-intelligence.ts --dry-run
|
||||
*/
|
||||
|
||||
import { pool } from '../db/pool';
|
||||
import { logger } from '../services/logger';
|
||||
import {
|
||||
detectMultiCategoryProviders,
|
||||
updateAllCategoryProviders,
|
||||
IntelligenceCategory,
|
||||
} from '../services/intelligence-detector';
|
||||
import {
|
||||
runCrawlProductsJob,
|
||||
runCrawlSpecialsJob,
|
||||
runCrawlBrandIntelligenceJob,
|
||||
runCrawlMetadataJob,
|
||||
runSandboxProductsJob,
|
||||
runSandboxSpecialsJob,
|
||||
runSandboxBrandJob,
|
||||
runSandboxMetadataJob,
|
||||
runAllCategoryProductionCrawls,
|
||||
runAllCategorySandboxCrawls,
|
||||
processCategorySandboxJobs,
|
||||
} from '../services/category-crawler-jobs';
|
||||
|
||||
// Parse command line args
|
||||
// Parse command line args
const args = process.argv.slice(2);

// Text after `--<name>=` of the first matching argument, if there is one.
const readFlagValue = (name: string): string | undefined =>
  args.find(a => a.startsWith(`--${name}=`))?.split('=')[1];

// Base-10 integer flag; missing, empty or non-numeric values use the fallback
// so NaN never reaches a query parameter.
const readIntFlag = (name: string, fallback: number): number => {
  const parsed = Number.parseInt(readFlagValue(name) || '', 10);
  return Number.isNaN(parsed) ? fallback : parsed;
};

const flags = {
  detection: args.includes('--detection') || args.includes('--all'),
  production: args.includes('--production') || args.includes('--all'),
  sandbox: args.includes('--sandbox') || args.includes('--all'),
  dryRun: args.includes('--dry-run'),
  process: args.includes('--process'),
  help: args.includes('--help') || args.includes('-h'),
  limit: readIntFlag('limit', 10),
  category: readFlagValue('category') as IntelligenceCategory | undefined,
  // 0 means "no specific dispensary".
  dispensary: readIntFlag('dispensary', 0),
};

// If no specific flags, default to all
if (!flags.detection && !flags.production && !flags.sandbox && !flags.process) {
  flags.detection = true;
  flags.production = true;
  flags.sandbox = true;
}

// Every intelligence category, in the order they are processed.
const CATEGORIES: IntelligenceCategory[] = ['product', 'specials', 'brand', 'metadata'];
|
||||
|
||||
/**
 * Prints the command-line usage text to stdout.
 */
async function showHelp() {
  const helpText = `
Queue Intelligence - Multi-Category Crawler Orchestration

USAGE:
npx tsx src/scripts/queue-intelligence.ts [OPTIONS]

OPTIONS:
--detection Queue dispensaries that need multi-category detection
--production Queue per-category production crawls
--sandbox Queue per-category sandbox crawls
--all Queue all job types (default if no specific flag)
--process Process queued jobs instead of just queuing
--category=CATEGORY Filter to specific category (product|specials|brand|metadata)
--dispensary=ID Process only a specific dispensary
--dry-run Show what would be queued without making changes
--limit=N Maximum dispensaries to queue per type (default: 10)
--help, -h Show this help message

CATEGORIES:
product - Product/menu data (Dutchie=production, others=sandbox)
specials - Deals and specials (all sandbox for now)
brand - Brand intelligence (all sandbox for now)
metadata - Categories/taxonomy (all sandbox for now)

EXAMPLES:
# Queue all dispensaries for appropriate jobs
npx tsx src/scripts/queue-intelligence.ts

# Only queue product detection jobs
npx tsx src/scripts/queue-intelligence.ts --detection --category=product

# Process sandbox jobs for specials category
npx tsx src/scripts/queue-intelligence.ts --process --category=specials --limit=5

# Run full detection for a specific dispensary
npx tsx src/scripts/queue-intelligence.ts --process --detection --dispensary=123

# Dry run to see what would be queued
npx tsx src/scripts/queue-intelligence.ts --dry-run
`;

  console.log(helpText);
}
|
||||
|
||||
/**
 * Queues provider-detection jobs for dispensaries that still need them.
 *
 * Selects up to `flags.limit` dispensaries that have a website or menu URL
 * and at least one category whose provider is unknown or whose confidence is
 * below 70. For each of them, one 'detection' row per such category is
 * inserted into `sandbox_crawl_jobs`. In dry-run mode the candidates are
 * only listed.
 *
 * @returns The number of dispensaries queued (or that would be queued).
 */
async function queueMultiCategoryDetection(): Promise<number> {
  console.log('\n📡 Queueing Multi-Category Detection Jobs...');

  // Find dispensaries that need provider detection for any category:
  // - Any *_provider is null OR
  // - Any *_confidence < 70
  // - has a website URL
  const candidateSql = `
SELECT id, name, website, menu_url,
product_provider, product_confidence, product_crawler_mode,
specials_provider, specials_confidence, specials_crawler_mode,
brand_provider, brand_confidence, brand_crawler_mode,
metadata_provider, metadata_confidence, metadata_crawler_mode
FROM dispensaries
WHERE (website IS NOT NULL OR menu_url IS NOT NULL)
AND (
product_provider IS NULL OR product_confidence < 70 OR
specials_provider IS NULL OR specials_confidence < 70 OR
brand_provider IS NULL OR brand_confidence < 70 OR
metadata_provider IS NULL OR metadata_confidence < 70
)
ORDER BY
CASE WHEN product_provider IS NULL THEN 0 ELSE 1 END,
product_confidence ASC
LIMIT $1
`;

  const { rows: candidates } = await pool.query(candidateSql, [flags.limit]);

  // True when the category has no provider yet or its confidence is below 70.
  const needsDetection = (row: any, category: IntelligenceCategory): boolean =>
    !row[`${category}_provider`] || row[`${category}_confidence`] < 70;

  if (flags.dryRun) {
    console.log(` Would queue ${candidates.length} dispensaries for multi-category detection:`);
    for (const row of candidates) {
      const pending = CATEGORIES.filter(category => needsDetection(row, category));
      console.log(` - [${row.id}] ${row.name} (needs: ${pending.join(', ')})`);
    }
    return candidates.length;
  }

  let queuedCount = 0;
  for (const dispensary of candidates) {
    try {
      // Create detection jobs for each category that needs it
      for (const category of CATEGORIES) {
        if (!needsDetection(dispensary, category)) {
          continue;
        }
        await pool.query(
          `INSERT INTO sandbox_crawl_jobs (dispensary_id, category, job_type, status, priority)
VALUES ($1, $2, 'detection', 'pending', 10)
ON CONFLICT DO NOTHING`,
          [dispensary.id, category]
        );
      }

      console.log(` ✓ Queued detection: [${dispensary.id}] ${dispensary.name}`);
      queuedCount++;
    } catch (error: any) {
      // A failure for one dispensary does not stop the others.
      console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
    }
  }

  return queuedCount;
}
|
||||
|
||||
/**
 * Queue production crawls for dispensaries that are ready for them.
 *
 * Only the `product` category is handled; every other category is logged
 * and skipped. A dispensary qualifies when its product provider is
 * `dutchie`, its crawler mode is `production`, its confidence is at least 70
 * and it has not been scanned in the last 4 hours. For each one a
 * `full_crawl` row is inserted into `crawl_jobs` for the matching store.
 *
 * @param category Optional single category to restrict to; defaults to all CATEGORIES.
 * @returns Number of crawl jobs queued (or that would be queued in dry-run).
 */
async function queueCategoryProductionCrawls(category?: IntelligenceCategory): Promise<number> {
  const categories = category ? [category] : CATEGORIES;
  let totalQueued = 0;

  for (const cat of categories) {
    console.log(`\n🏭 Queueing Production ${cat.toUpperCase()} Crawls...`);

    // For now, only products have production-ready crawlers (Dutchie only)
    if (cat !== 'product') {
      console.log(` ⏭️ No production crawler for ${cat} yet - skipping`);
      continue;
    }

    // Find dispensaries ready for production crawl, never-scanned first,
    // then oldest scan first.
    const query = `
      SELECT id, name, ${cat}_provider as provider, last_${cat}_scan_at as last_scan
      FROM dispensaries
      WHERE ${cat}_provider = 'dutchie'
        AND ${cat}_crawler_mode = 'production'
        AND ${cat}_confidence >= 70
        AND (last_${cat}_scan_at IS NULL OR last_${cat}_scan_at < NOW() - INTERVAL '4 hours')
      ORDER BY
        CASE WHEN last_${cat}_scan_at IS NULL THEN 0 ELSE 1 END,
        last_${cat}_scan_at ASC
      LIMIT $1
    `;

    const result = await pool.query(query, [flags.limit]);

    if (flags.dryRun) {
      console.log(` Would queue ${result.rows.length} dispensaries for ${cat} production crawl:`);
      for (const row of result.rows) {
        const lastScan = row.last_scan ? new Date(row.last_scan).toISOString() : 'never';
        console.log(` - [${row.id}] ${row.name} (provider: ${row.provider}, last: ${lastScan})`);
      }
      totalQueued += result.rows.length;
      continue;
    }

    for (const dispensary of result.rows) {
      try {
        // For products, use the existing crawl_jobs table for production.
        // The ORDER BY makes the LIMIT 1 deterministic and prefers a store
        // whose URL matches exactly over one matched by fuzzy name.
        const insertResult = await pool.query(
          `INSERT INTO crawl_jobs (store_id, job_type, trigger_type, status, priority, metadata)
           SELECT s.id, 'full_crawl', 'scheduled', 'pending', 50,
                  jsonb_build_object('dispensary_id', $1, 'category', $2, 'source', 'queue-intelligence')
           FROM stores s
           JOIN dispensaries d ON (d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%')
           WHERE d.id = $1
           ORDER BY (d.menu_url = s.dutchie_url) DESC NULLS LAST, s.id
           LIMIT 1`,
          [dispensary.id, cat]
        );

        // INSERT ... SELECT inserts nothing when no store matches; do not
        // report or count that as a queued job.
        if (!insertResult.rowCount) {
          console.log(` ⚠ No matching store for [${dispensary.id}] ${dispensary.name} - nothing queued`);
          continue;
        }

        console.log(` ✓ Queued ${cat} production: [${dispensary.id}] ${dispensary.name}`);
        totalQueued++;
      } catch (error: any) {
        console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
      }
    }
  }

  return totalQueued;
}
|
||||
|
||||
/**
 * Queue sandbox crawls for dispensaries running in sandbox mode.
 *
 * For each requested category, picks up to `flags.limit` dispensaries whose
 * crawler mode for that category is `sandbox`, that have a provider and a
 * URL, and that have no pending/running sandbox job for the category. Each
 * one gets a `crawler_sandboxes` row (upserted) and a pending `crawl` row in
 * `sandbox_crawl_jobs`. Dry-run only lists the candidates.
 *
 * @param category Optional single category to restrict to; defaults to all CATEGORIES.
 * @returns Number of sandbox crawls queued (or that would be queued in dry-run).
 */
async function queueCategorySandboxCrawls(category?: IntelligenceCategory): Promise<number> {
  const targets = category ? [category] : CATEGORIES;
  let queuedCount = 0;

  for (const cat of targets) {
    console.log(`\n🧪 Queueing Sandbox ${cat.toUpperCase()} Crawls...`);

    // Sandbox-mode dispensaries for this category with no job already in flight.
    const candidateSql = `
      SELECT d.id, d.name, d.${cat}_provider as provider, d.${cat}_confidence as confidence,
             d.website, d.menu_url
      FROM dispensaries d
      WHERE d.${cat}_crawler_mode = 'sandbox'
        AND d.${cat}_provider IS NOT NULL
        AND (d.website IS NOT NULL OR d.menu_url IS NOT NULL)
        AND NOT EXISTS (
          SELECT 1 FROM sandbox_crawl_jobs sj
          WHERE sj.dispensary_id = d.id
            AND sj.category = $1
            AND sj.status IN ('pending', 'running')
        )
      ORDER BY d.${cat}_confidence DESC, d.updated_at ASC
      LIMIT $2
    `;

    const { rows: candidates } = await pool.query(candidateSql, [cat, flags.limit]);

    if (flags.dryRun) {
      console.log(` Would queue ${candidates.length} dispensaries for ${cat} sandbox crawl:`);
      candidates.forEach((row: any) => {
        console.log(` - [${row.id}] ${row.name} (provider: ${row.provider}, confidence: ${row.confidence}%)`);
      });
      queuedCount += candidates.length;
      continue;
    }

    for (const candidate of candidates) {
      try {
        // Upsert the sandbox record; an existing active one just gets its timestamp bumped.
        const upserted = await pool.query(
          `INSERT INTO crawler_sandboxes (dispensary_id, category, suspected_menu_provider, mode, status)
           VALUES ($1, $2, $3, 'template_learning', 'pending')
           ON CONFLICT (dispensary_id, category) WHERE status NOT IN ('moved_to_production', 'failed')
           DO UPDATE SET updated_at = NOW()
           RETURNING id`,
          [candidate.id, cat, candidate.provider]
        );
        const [sandboxRow] = upserted.rows;
        const sandboxId = sandboxRow?.id;

        // Attach a pending crawl job to that sandbox.
        await pool.query(
          `INSERT INTO sandbox_crawl_jobs (dispensary_id, sandbox_id, category, job_type, status, priority)
           VALUES ($1, $2, $3, 'crawl', 'pending', 5)`,
          [candidate.id, sandboxId, cat]
        );

        console.log(` ✓ Queued ${cat} sandbox: [${candidate.id}] ${candidate.name} (${candidate.provider})`);
        queuedCount += 1;
      } catch (error: any) {
        console.error(` ✗ Failed to queue [${candidate.id}]: ${error.message}`);
      }
    }
  }

  return queuedCount;
}
|
||||
|
||||
/**
 * Run pending provider-detection jobs.
 *
 * Picks up to `flags.limit` distinct dispensaries that have pending
 * `detection` rows in `sandbox_crawl_jobs` (optionally filtered by
 * `flags.category` and `flags.dispensary`), runs multi-category detection
 * against each dispensary's website (or menu URL), stores the result for all
 * categories and marks the jobs completed. Jobs that can never succeed
 * (dispensary row missing, no URL) are marked failed rather than being left
 * pending, so they do not get re-selected on every run.
 */
async function processDetectionJobs(): Promise<void> {
  console.log('\n🔍 Processing Detection Jobs...');

  // Build the optional filters and their positional parameters together so
  // the placeholder numbers always match the parameter array.
  const params: any[] = [flags.limit];
  const filters: string[] = [];
  if (flags.category) {
    params.push(flags.category);
    filters.push(`AND category = $${params.length}`);
  }
  if (flags.dispensary) {
    params.push(flags.dispensary);
    filters.push(`AND dispensary_id = $${params.length}`);
  }

  // Get pending detection jobs
  const jobs = await pool.query(
    `SELECT DISTINCT dispensary_id
     FROM sandbox_crawl_jobs
     WHERE job_type = 'detection' AND status = 'pending'
     ${filters.join('\n     ')}
     LIMIT $1`,
    params
  );

  // Mark a dispensary's still-pending detection jobs as failed.
  const failPendingJobs = (dispensaryId: any, message: string) =>
    pool.query(
      `UPDATE sandbox_crawl_jobs SET status = 'failed', error_message = $1
       WHERE dispensary_id = $2 AND job_type = 'detection' AND status = 'pending'`,
      [message, dispensaryId]
    );

  for (const job of jobs.rows) {
    console.log(`\nProcessing detection for dispensary ${job.dispensary_id}...`);

    try {
      // Get dispensary info
      const dispResult = await pool.query(
        'SELECT id, name, website, menu_url FROM dispensaries WHERE id = $1',
        [job.dispensary_id]
      );
      const dispensary = dispResult.rows[0];

      if (!dispensary) {
        console.log(` ✗ Dispensary not found`);
        await failPendingJobs(job.dispensary_id, 'Dispensary not found');
        continue;
      }

      const websiteUrl = dispensary.website || dispensary.menu_url;
      if (!websiteUrl) {
        console.log(` ✗ No website URL`);
        await failPendingJobs(job.dispensary_id, 'No website URL');
        continue;
      }

      // Mark jobs as running
      await pool.query(
        `UPDATE sandbox_crawl_jobs SET status = 'running', started_at = NOW()
         WHERE dispensary_id = $1 AND job_type = 'detection' AND status = 'pending'`,
        [job.dispensary_id]
      );

      // Run multi-category detection
      console.log(` Detecting providers for ${dispensary.name}...`);
      const detection = await detectMultiCategoryProviders(websiteUrl, { timeout: 45000 });

      // Update all categories
      await updateAllCategoryProviders(job.dispensary_id, detection);

      // Mark jobs as completed
      await pool.query(
        `UPDATE sandbox_crawl_jobs SET status = 'completed', completed_at = NOW(),
           result_summary = $1
         WHERE dispensary_id = $2 AND job_type = 'detection' AND status = 'running'`,
        [JSON.stringify({
          product: { provider: detection.product.provider, confidence: detection.product.confidence },
          specials: { provider: detection.specials.provider, confidence: detection.specials.confidence },
          brand: { provider: detection.brand.provider, confidence: detection.brand.confidence },
          metadata: { provider: detection.metadata.provider, confidence: detection.metadata.confidence },
        }), job.dispensary_id]
      );

      console.log(` ✓ Detection complete:`);
      console.log(` Product: ${detection.product.provider} (${detection.product.confidence}%) -> ${detection.product.mode}`);
      console.log(` Specials: ${detection.specials.provider} (${detection.specials.confidence}%) -> ${detection.specials.mode}`);
      console.log(` Brand: ${detection.brand.provider} (${detection.brand.confidence}%) -> ${detection.brand.mode}`);
      console.log(` Metadata: ${detection.metadata.provider} (${detection.metadata.confidence}%) -> ${detection.metadata.mode}`);
    } catch (error: any) {
      console.log(` ✗ Error: ${error.message}`);
      try {
        await pool.query(
          `UPDATE sandbox_crawl_jobs SET status = 'failed', error_message = $1
           WHERE dispensary_id = $2 AND job_type = 'detection' AND status = 'running'`,
          [error.message, job.dispensary_id]
        );
      } catch (updateError: any) {
        // Recording the failure must not abort the remaining dispensaries.
        console.error(` ✗ Could not record failure for dispensary ${job.dispensary_id}: ${updateError.message}`);
      }
    }
  }
}
|
||||
|
||||
/**
 * Process crawl jobs for the selected categories.
 *
 * For each category (just `flags.category` when set, otherwise every entry
 * in CATEGORIES): sandbox jobs are processed when `--sandbox` is given or
 * `--production` is not; production crawls are run only for the `product`
 * category when `--production` is given, against up to `flags.limit`
 * Dutchie dispensaries in production mode with confidence >= 70.
 */
async function processCrawlJobs(): Promise<void> {
  const categories = flags.category ? [flags.category] : CATEGORIES;

  for (const cat of categories) {
    console.log(`\n⚙️ Processing ${cat.toUpperCase()} Crawl Jobs...\n`);

    // Process sandbox jobs for this category
    if (flags.sandbox || !flags.production) {
      await processCategorySandboxJobs(cat, flags.limit);
    }

    // Process production jobs for this category
    if (flags.production && cat === 'product') {
      // Get pending production crawls
      const prodJobs = await pool.query(
        `SELECT d.id
         FROM dispensaries d
         WHERE d.product_provider = 'dutchie'
           AND d.product_crawler_mode = 'production'
           AND d.product_confidence >= 70
           ${flags.dispensary ? 'AND d.id = $2' : ''}
         LIMIT $1`,
        flags.dispensary ? [flags.limit, flags.dispensary] : [flags.limit]
      );

      for (const job of prodJobs.rows) {
        console.log(`Processing production ${cat} crawl for dispensary ${job.id}...`);
        try {
          const result = await runCrawlProductsJob(job.id);
          console.log(` ${result.success ? '✓' : '✗'} ${result.message}`);
        } catch (error: any) {
          // Isolate failures per dispensary, as the other job loops in this
          // script do, so one crashing crawl does not cancel the rest.
          console.error(` ✗ Error: ${error.message}`);
        }
      }
    }
  }
}
|
||||
|
||||
/**
 * Directly process the single dispensary named by `flags.dispensary`.
 *
 * Prints the dispensary's name and URL, then, depending on the flags, runs
 * multi-category detection (only when a website or menu URL exists),
 * production crawls and/or sandbox crawls for it. Does nothing when no
 * dispensary id was supplied.
 */
async function processSpecificDispensary(): Promise<void> {
  const dispensaryId = flags.dispensary;
  if (!dispensaryId) return;

  console.log(`\n🎯 Processing Dispensary ${dispensaryId}...\n`);

  const { rows } = await pool.query(
    'SELECT * FROM dispensaries WHERE id = $1',
    [dispensaryId]
  );

  const [dispensary] = rows;
  if (rows.length === 0) {
    console.log('Dispensary not found');
    return;
  }

  const siteUrl = dispensary.website || dispensary.menu_url;
  console.log(`Name: ${dispensary.name}`);
  console.log(`Website: ${siteUrl || 'none'}`);
  console.log('');

  // Detection step: skipped silently when there is no URL to inspect.
  const runDetection = async (): Promise<void> => {
    console.log('Running multi-category detection...');
    if (!siteUrl) return;

    const detection = await detectMultiCategoryProviders(siteUrl);
    await updateAllCategoryProviders(dispensaryId, detection);
    console.log('Detection results:');
    console.log(` Product: ${detection.product.provider} (${detection.product.confidence}%) -> ${detection.product.mode}`);
    console.log(` Specials: ${detection.specials.provider} (${detection.specials.confidence}%) -> ${detection.specials.mode}`);
    console.log(` Brand: ${detection.brand.provider} (${detection.brand.confidence}%) -> ${detection.brand.mode}`);
    console.log(` Metadata: ${detection.metadata.provider} (${detection.metadata.confidence}%) -> ${detection.metadata.mode}`);
  };

  if (flags.detection) {
    await runDetection();
  }

  if (flags.production) {
    console.log('\nRunning production crawls...');
    const productionOutcome = await runAllCategoryProductionCrawls(dispensaryId);
    console.log(` ${productionOutcome.summary}`);
  }

  if (flags.sandbox) {
    console.log('\nRunning sandbox crawls...');
    const sandboxOutcome = await runAllCategorySandboxCrawls(dispensaryId);
    console.log(` ${sandboxOutcome.summary}`);
  }
}
|
||||
|
||||
/**
 * Print a summary of the intelligence state.
 *
 * For every category in CATEGORIES, reports provider counts, crawler-mode
 * counts and average confidence across the `dispensaries` table; then
 * reports `sandbox_crawl_jobs` counts per category broken down by status.
 */
async function showStats(): Promise<void> {
  console.log('\n📊 Multi-Category Intelligence Stats:');

  // Per-category stats
  for (const cat of CATEGORIES) {
    const categorySql = `
      SELECT
        COUNT(*) as total,
        COUNT(*) FILTER (WHERE ${cat}_provider IS NULL) as no_provider,
        COUNT(*) FILTER (WHERE ${cat}_provider = 'dutchie') as dutchie,
        COUNT(*) FILTER (WHERE ${cat}_provider = 'treez') as treez,
        COUNT(*) FILTER (WHERE ${cat}_provider NOT IN ('dutchie', 'treez', 'unknown') AND ${cat}_provider IS NOT NULL) as other,
        COUNT(*) FILTER (WHERE ${cat}_provider = 'unknown') as unknown,
        COUNT(*) FILTER (WHERE ${cat}_crawler_mode = 'production') as production,
        COUNT(*) FILTER (WHERE ${cat}_crawler_mode = 'sandbox') as sandbox,
        AVG(${cat}_confidence) as avg_confidence
      FROM dispensaries
    `;
    const { rows: [summary] } = await pool.query(categorySql);

    // AVG is null on an empty table; report 0% in that case.
    const averageConfidence = Math.round(summary.avg_confidence || 0);
    const report = [
      '',
      `${cat.toUpperCase()}:`,
      `Providers: Dutchie=${summary.dutchie}, Treez=${summary.treez}, Other=${summary.other}, Unknown=${summary.unknown}, None=${summary.no_provider}`,
      `Modes: Production=${summary.production}, Sandbox=${summary.sandbox}`,
      `Avg Confidence: ${averageConfidence}%`,
    ];
    console.log(report.join('\n'));
  }

  // Job stats per category
  console.log('\n Sandbox Jobs by Category:');
  const { rows: jobRows } = await pool.query(`
    SELECT
      category,
      COUNT(*) FILTER (WHERE status = 'pending') as pending,
      COUNT(*) FILTER (WHERE status = 'running') as running,
      COUNT(*) FILTER (WHERE status = 'completed') as completed,
      COUNT(*) FILTER (WHERE status = 'failed') as failed
    FROM sandbox_crawl_jobs
    GROUP BY category
    ORDER BY category
  `);

  jobRows.forEach((row: any) => {
    console.log(` ${row.category}: pending=${row.pending}, running=${row.running}, completed=${row.completed}, failed=${row.failed}`);
  });
}
|
||||
|
||||
/**
 * Entry point for the multi-category intelligence queue manager.
 *
 * Prints help and exits when `--help` is given. Otherwise shows current
 * stats, then either processes one dispensary directly (`--dispensary` with
 * `--process`), processes queued jobs (`--process`), or queues new
 * detection / production / sandbox work according to the flags. Stats are
 * shown again afterwards unless this is a dry run. The database pool is
 * always closed before the process ends, and a failure sets exit status 1.
 */
async function main() {
  if (flags.help) {
    await showHelp();
    process.exit(0);
  }

  console.log('═══════════════════════════════════════════════════════');
  console.log(' Multi-Category Intelligence Queue Manager');
  console.log('═══════════════════════════════════════════════════════');

  if (flags.dryRun) {
    console.log('\n🔍 DRY RUN MODE - No changes will be made\n');
  }

  if (flags.category) {
    console.log(`\n📌 Filtering to category: ${flags.category}\n`);
  }

  try {
    // Show current stats first
    await showStats();

    // NOTE(review): the two --process branches below do not check
    // flags.dryRun here; confirm whether dry-run is meant to apply to
    // processing as well as queueing.
    if (flags.dispensary && flags.process) {
      // If specific dispensary specified, process it directly
      await processSpecificDispensary();
    } else if (flags.process) {
      // Process mode - run jobs
      if (flags.detection) {
        await processDetectionJobs();
      }
      await processCrawlJobs();
    } else {
      // Queuing mode
      let totalQueued = 0;

      if (flags.detection) {
        totalQueued += await queueMultiCategoryDetection();
      }

      if (flags.production) {
        totalQueued += await queueCategoryProductionCrawls(flags.category);
      }

      if (flags.sandbox) {
        totalQueued += await queueCategorySandboxCrawls(flags.category);
      }

      console.log('\n═══════════════════════════════════════════════════════');
      console.log(` Total queued: ${totalQueued}`);
      console.log('═══════════════════════════════════════════════════════\n');
    }

    // Show updated stats
    if (!flags.dryRun) {
      await showStats();
    }
  } catch (error) {
    console.error('Fatal error:', error);
    // Set the exit status instead of calling process.exit(1): exiting here
    // would skip the finally block and leave the pool unclosed.
    process.exitCode = 1;
  } finally {
    await pool.end();
  }
}

// Anything that escapes main (e.g. a failure while closing the pool) is
// reported and reflected in the exit status rather than left unhandled.
main().catch((error) => {
  console.error('Fatal error:', error);
  process.exitCode = 1;
});
|
||||
@@ -1,173 +0,0 @@
|
||||
#!/usr/bin/env npx tsx
|
||||
/**
|
||||
* Dutchie Platform ID Resolver
|
||||
*
|
||||
* Standalone script to resolve a Dutchie dispensary slug to its platform ID.
|
||||
*
|
||||
* USAGE:
|
||||
* npx tsx src/scripts/resolve-dutchie-id.ts <slug>
|
||||
* npx tsx src/scripts/resolve-dutchie-id.ts hydroman-dispensary
|
||||
* npx tsx src/scripts/resolve-dutchie-id.ts AZ-Deeply-Rooted
|
||||
*
|
||||
* RESOLUTION STRATEGY:
|
||||
* 1. Navigate to https://dutchie.com/embedded-menu/{slug} via Puppeteer
|
||||
* 2. Extract window.reactEnv.dispensaryId (preferred - fastest)
|
||||
* 3. If reactEnv fails, call GraphQL GetAddressBasedDispensaryData as fallback
|
||||
*
|
||||
* OUTPUT:
|
||||
* - dispensaryId: The MongoDB ObjectId (e.g., "6405ef617056e8014d79101b")
|
||||
* - source: "reactEnv" or "graphql"
|
||||
* - httpStatus: HTTP status from embedded menu page
|
||||
* - error: Error message if resolution failed
|
||||
*/
|
||||
|
||||
import { resolveDispensaryIdWithDetails, ResolveDispensaryResult } from '../dutchie-az/services/graphql-client';
|
||||
|
||||
/**
 * CLI entry point: resolve a Dutchie dispensary slug to its platform ID.
 *
 * With no arguments (or --help / -h) prints usage and exits 0. Otherwise
 * resolves the first argument via resolveDispensaryIdWithDetails, prints a
 * human-readable report followed by a JSON summary, and exits 1 when the
 * slug could not be resolved or resolution threw.
 */
async function main() {
  const args = process.argv.slice(2);

  if (args.length === 0 || args.includes('--help') || args.includes('-h')) {
    console.log(`
Dutchie Platform ID Resolver

Usage:
npx tsx src/scripts/resolve-dutchie-id.ts <slug>

Examples:
npx tsx src/scripts/resolve-dutchie-id.ts hydroman-dispensary
npx tsx src/scripts/resolve-dutchie-id.ts AZ-Deeply-Rooted
npx tsx src/scripts/resolve-dutchie-id.ts mint-cannabis

Resolution Strategy:
1. Puppeteer navigates to https://dutchie.com/embedded-menu/{slug}
2. Extracts window.reactEnv.dispensaryId (preferred)
3. Falls back to GraphQL GetAddressBasedDispensaryData if needed

Output Fields:
- dispensaryId: MongoDB ObjectId (e.g., "6405ef617056e8014d79101b")
- source: "reactEnv" (from page) or "graphql" (from API)
- httpStatus: HTTP status code from page load
- error: Error message if resolution failed
`);
    process.exit(0);
  }

  const slug = args[0];

  console.log('='.repeat(60));
  console.log('DUTCHIE PLATFORM ID RESOLVER');
  console.log('='.repeat(60));
  console.log(`Slug: ${slug}`);
  console.log(`Embedded Menu URL: https://dutchie.com/embedded-menu/${slug}`);
  console.log('');
  console.log('Resolving...');
  console.log('');

  const startTime = Date.now();

  try {
    const result: ResolveDispensaryResult = await resolveDispensaryIdWithDetails(slug);
    const duration = Date.now() - startTime;

    console.log('='.repeat(60));
    console.log('RESOLUTION RESULT');
    console.log('='.repeat(60));

    if (result.dispensaryId) {
      console.log(`✓ SUCCESS`);
      console.log('');
      console.log(` Dispensary ID: ${result.dispensaryId}`);
      console.log(` Source: ${result.source}`);
      console.log(` HTTP Status: ${result.httpStatus || 'N/A'}`);
      console.log(` Duration: ${duration}ms`);
      console.log('');

      // Show how to use this ID
      console.log('='.repeat(60));
      console.log('USAGE');
      console.log('='.repeat(60));
      console.log('');
      console.log('Use this ID in GraphQL FilteredProducts query:');
      console.log('');
      console.log(' POST https://dutchie.com/api-3/graphql');
      console.log('');
      console.log(' Body:');
      // Serialize the example request rather than interpolating the ID into
      // hand-written JSON, so the printed body is always valid JSON.
      console.log(JSON.stringify({
        operationName: 'FilteredProducts',
        variables: {
          productsFilter: {
            dispensaryId: result.dispensaryId,
            pricingType: 'rec',
            Status: 'Active',
          },
          page: 0,
          perPage: 100,
        },
        extensions: {
          persistedQuery: {
            version: 1,
            sha256Hash: 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0',
          },
        },
      }, null, 2));
      console.log('');

      // Output for piping/scripting
      console.log('='.repeat(60));
      console.log('JSON OUTPUT');
      console.log('='.repeat(60));
      console.log(JSON.stringify({
        success: true,
        slug,
        dispensaryId: result.dispensaryId,
        source: result.source,
        httpStatus: result.httpStatus,
        durationMs: duration,
      }, null, 2));
    } else {
      console.log(`✗ FAILED`);
      console.log('');
      console.log(` Error: ${result.error || 'Unknown error'}`);
      console.log(` HTTP Status: ${result.httpStatus || 'N/A'}`);
      console.log(` Duration: ${duration}ms`);
      console.log('');

      if (result.httpStatus === 403 || result.httpStatus === 404) {
        console.log('NOTE: This store may be removed or not accessible on Dutchie.');
        console.log(' Mark dispensary as not_crawlable in the database.');
      }

      console.log('');
      console.log('JSON OUTPUT:');
      console.log(JSON.stringify({
        success: false,
        slug,
        error: result.error,
        httpStatus: result.httpStatus,
        durationMs: duration,
      }, null, 2));

      process.exit(1);
    }
  } catch (error: any) {
    const duration = Date.now() - startTime;
    // A thrown value is not guaranteed to be an Error; normalise it so the
    // handler itself cannot fail on a missing `message`.
    const message: string = error instanceof Error ? error.message : String(error);

    console.error('='.repeat(60));
    console.error('ERROR');
    console.error('='.repeat(60));
    console.error(`Message: ${message}`);
    console.error(`Duration: ${duration}ms`);
    console.error('');

    if (message.includes('net::ERR_NAME_NOT_RESOLVED')) {
      console.error('NOTE: DNS resolution failed. This typically happens when running');
      console.error(' locally due to network restrictions. Try running from the');
      console.error(' Kubernetes pod or a cloud environment.');
    }

    process.exit(1);
  }
}

main();
|
||||
@@ -1,151 +0,0 @@
|
||||
/**
|
||||
* LEGACY SCRIPT - Run Dutchie GraphQL Scrape
|
||||
*
|
||||
* DEPRECATED: This script creates its own database pool.
|
||||
* Future implementations should use the CannaiQ API endpoints instead.
|
||||
*
|
||||
* This script demonstrates the full pipeline:
|
||||
* 1. Puppeteer navigates to Dutchie menu
|
||||
* 2. GraphQL responses are intercepted
|
||||
* 3. Products are normalized to our schema
|
||||
* 4. Products are upserted to database
|
||||
* 5. Derived views (brands, categories, specials) are automatically updated
|
||||
*
|
||||
* DO NOT:
|
||||
* - Add this to package.json scripts
|
||||
* - Run this in automated jobs
|
||||
* - Use DATABASE_URL directly
|
||||
*/
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { scrapeDutchieMenu } from '../scrapers/dutchie-graphql';
|
||||
|
||||
console.warn('\n⚠️ LEGACY SCRIPT: This script should be replaced with CannaiQ API calls.\n');

/**
 * Build the PostgreSQL connection URI for the cannaiq database.
 *
 * CANNAIQ_DB_URL wins when set. Otherwise the URI is assembled from
 * CANNAIQ_DB_USER / _PASS / _HOST / _PORT / _NAME, each falling back to the
 * local-development default when unset or empty. User and password are
 * percent-encoded so characters such as '@', ':' or '/' cannot corrupt the URI.
 *
 * @param env Environment map to read from (defaults to process.env).
 * @returns The connection string.
 */
function buildDatabaseUrl(env = process.env) {
  if (env.CANNAIQ_DB_URL) {
    return env.CANNAIQ_DB_URL;
  }

  const user = encodeURIComponent(env.CANNAIQ_DB_USER || 'dutchie');
  const pass = encodeURIComponent(env.CANNAIQ_DB_PASS || 'dutchie_local_pass');
  const host = env.CANNAIQ_DB_HOST || 'localhost';
  const port = env.CANNAIQ_DB_PORT || '54320';
  const name = env.CANNAIQ_DB_NAME || 'cannaiq';

  return `postgresql://${user}:${pass}@${host}:${port}/${name}`;
}

// Single database connection (cannaiq in cannaiq-postgres container)
const DATABASE_URL = buildDatabaseUrl();
|
||||
|
||||
/**
 * Run the legacy Dutchie GraphQL scrape for one hard-coded store and print
 * the outcome.
 *
 * Opens its own connection pool from DATABASE_URL, scrapes the store's
 * embedded menu via scrapeDutchieMenu, prints the scrape counters and, on
 * success, a few rows from derived_brands, current_specials,
 * derived_categories and products. The pool is always closed; errors are
 * logged and rethrown to the caller.
 */
async function main() {
  const pool = new Pool({ connectionString: DATABASE_URL });

  try {
    console.log('='.repeat(80));
    console.log('DUTCHIE GRAPHQL SCRAPER - FULL PIPELINE TEST');
    console.log('='.repeat(80));
    // Mask the password portion of the URI before logging it.
    console.log(`Database: ${DATABASE_URL.replace(/:[^:@]+@/, ':***@')}`);

    // Configuration
    const storeId = 1; // Deeply Rooted
    const menuUrl = 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted';

    console.log(`\nStore ID: ${storeId}`);
    console.log(`Menu URL: ${menuUrl}`);
    console.log('\n' + '-'.repeat(80));

    // Run the scrape
    console.log('\n🚀 Starting scrape...\n');
    const result = await scrapeDutchieMenu(pool, storeId, menuUrl);

    console.log('\n' + '-'.repeat(80));
    console.log('📊 SCRAPE RESULTS:');
    console.log('-'.repeat(80));
    console.log(` Success: ${result.success}`);
    console.log(` Products Found: ${result.productsFound}`);
    console.log(` Inserted: ${result.inserted}`);
    console.log(` Updated: ${result.updated}`);
    if (result.error) {
      console.log(` Error: ${result.error}`);
    }

    // Query derived views to show the result
    if (result.success) {
      console.log('\n' + '-'.repeat(80));
      console.log('📈 DERIVED DATA (from products table):');
      console.log('-'.repeat(80));

      // Brands
      const brandsResult = await pool.query(`
        SELECT brand_name, product_count, min_price, max_price
        FROM derived_brands
        WHERE store_id = $1
        ORDER BY product_count DESC
        LIMIT 5
      `, [storeId]);

      console.log('\nTop 5 Brands:');
      brandsResult.rows.forEach(row => {
        console.log(` - ${row.brand_name}: ${row.product_count} products ($${row.min_price} - $${row.max_price})`);
      });

      // Specials
      const specialsResult = await pool.query(`
        SELECT name, brand, rec_price, rec_special_price, discount_percent
        FROM current_specials
        WHERE store_id = $1
        LIMIT 5
      `, [storeId]);

      console.log('\nTop 5 Specials:');
      if (specialsResult.rows.length === 0) {
        console.log(' (No specials found - is_on_special may not be populated yet)');
      } else {
        specialsResult.rows.forEach(row => {
          console.log(` - ${row.name} (${row.brand}): $${row.rec_price} → $${row.rec_special_price} (${row.discount_percent}% off)`);
        });
      }

      // Categories
      const categoriesResult = await pool.query(`
        SELECT category_name, product_count
        FROM derived_categories
        WHERE store_id = $1
        ORDER BY product_count DESC
        LIMIT 5
      `, [storeId]);

      console.log('\nTop 5 Categories:');
      if (categoriesResult.rows.length === 0) {
        console.log(' (No categories found - subcategory may not be populated yet)');
      } else {
        categoriesResult.rows.forEach(row => {
          console.log(` - ${row.category_name}: ${row.product_count} products`);
        });
      }

      // Sample product
      const sampleResult = await pool.query(`
        SELECT name, brand, subcategory, rec_price, rec_special_price, is_on_special, thc_percentage, status
        FROM products
        WHERE store_id = $1 AND subcategory IS NOT NULL
        ORDER BY updated_at DESC
        LIMIT 1
      `, [storeId]);

      if (sampleResult.rows.length > 0) {
        const sample = sampleResult.rows[0];
        console.log('\nSample Product (with new fields):');
        console.log(` Name: ${sample.name}`);
        console.log(` Brand: ${sample.brand}`);
        console.log(` Category: ${sample.subcategory}`);
        console.log(` Price: $${sample.rec_price}`);
        console.log(` Sale Price: ${sample.rec_special_price ? `$${sample.rec_special_price}` : 'N/A'}`);
        console.log(` On Special: ${sample.is_on_special}`);
        console.log(` THC: ${sample.thc_percentage}%`);
        console.log(` Status: ${sample.status}`);
      }
    }

    console.log('\n' + '='.repeat(80));
    console.log('✅ SCRAPE COMPLETE');
    console.log('='.repeat(80));
  } catch (error: any) {
    console.error('\n❌ Error:', error.message);
    throw error;
  } finally {
    await pool.end();
  }
}

// Log the failure and make it visible to the caller: without setting the
// exit code a failed run would still finish with status 0.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
|
||||
@@ -1,225 +0,0 @@
|
||||
/**
|
||||
* Sandbox Crawl Script for Dispensary 101 (Trulieve Scottsdale)
|
||||
*
|
||||
* Runs a full crawl and captures trace data for observability.
|
||||
* NO automatic promotion or status changes.
|
||||
*/
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { crawlDispensaryProducts } from '../dutchie-az/services/product-crawler';
|
||||
import { Dispensary } from '../dutchie-az/types';
|
||||
|
||||
// Connection pool for this script, configured from the DATABASE_URL environment variable.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
|
||||
|
||||
async function main() {
|
||||
console.log('=== SANDBOX CRAWL: Dispensary 101 (Trulieve Scottsdale) ===\n');
|
||||
const startTime = Date.now();
|
||||
|
||||
// Load dispensary from database (only columns that exist in local schema)
|
||||
const dispResult = await pool.query(`
|
||||
SELECT id, name, city, state, menu_type, platform_dispensary_id, menu_url
|
||||
FROM dispensaries
|
||||
WHERE id = 101
|
||||
`);
|
||||
|
||||
if (!dispResult.rows[0]) {
|
||||
console.log('ERROR: Dispensary 101 not found');
|
||||
await pool.end();
|
||||
return;
|
||||
}
|
||||
|
||||
const row = dispResult.rows[0];
|
||||
|
||||
// Map to Dispensary interface (snake_case -> camelCase)
|
||||
const dispensary: Dispensary = {
|
||||
id: row.id,
|
||||
platform: 'dutchie',
|
||||
name: row.name,
|
||||
slug: row.name.toLowerCase().replace(/\s+/g, '-'),
|
||||
city: row.city,
|
||||
state: row.state,
|
||||
platformDispensaryId: row.platform_dispensary_id,
|
||||
menuType: row.menu_type,
|
||||
menuUrl: row.menu_url,
|
||||
createdAt: new Date(),
|
||||
updatedAt: new Date(),
|
||||
};
|
||||
|
||||
console.log('=== DISPENSARY INFO ===');
|
||||
console.log(`Name: ${dispensary.name}`);
|
||||
console.log(`Location: ${dispensary.city}, ${dispensary.state}`);
|
||||
console.log(`Menu Type: ${dispensary.menuType}`);
|
||||
console.log(`Platform ID: ${dispensary.platformDispensaryId}`);
|
||||
console.log(`Menu URL: ${dispensary.menuUrl}`);
|
||||
console.log('');
|
||||
|
||||
// Get profile info
|
||||
const profileResult = await pool.query(`
|
||||
SELECT id, profile_key, status, config FROM dispensary_crawler_profiles
|
||||
WHERE dispensary_id = 101
|
||||
`);
|
||||
|
||||
const profile = profileResult.rows[0];
|
||||
if (profile) {
|
||||
console.log('=== PROFILE ===');
|
||||
console.log(`Profile Key: ${profile.profile_key}`);
|
||||
console.log(`Profile Status: ${profile.status}`);
|
||||
console.log(`Config: ${JSON.stringify(profile.config, null, 2)}`);
|
||||
console.log('');
|
||||
} else {
|
||||
console.log('=== PROFILE ===');
|
||||
console.log('No profile found - will use defaults');
|
||||
console.log('');
|
||||
}
|
||||
|
||||
// Run the crawl
|
||||
console.log('=== STARTING CRAWL ===');
|
||||
console.log('Options: useBothModes=true, downloadImages=false (sandbox)');
|
||||
console.log('');
|
||||
|
||||
try {
|
||||
const result = await crawlDispensaryProducts(dispensary, 'rec', {
|
||||
useBothModes: true,
|
||||
downloadImages: false, // Skip images in sandbox mode for speed
|
||||
});
|
||||
|
||||
console.log('');
|
||||
console.log('=== CRAWL RESULT ===');
|
||||
console.log(`Success: ${result.success}`);
|
||||
console.log(`Products Found: ${result.productsFound}`);
|
||||
console.log(`Products Fetched: ${result.productsFetched}`);
|
||||
console.log(`Products Upserted: ${result.productsUpserted}`);
|
||||
console.log(`Snapshots Created: ${result.snapshotsCreated}`);
|
||||
if (result.errorMessage) {
|
||||
console.log(`Error: ${result.errorMessage}`);
|
||||
}
|
||||
console.log(`Duration: ${result.durationMs}ms`);
|
||||
console.log('');
|
||||
|
||||
// Show sample products from database
|
||||
if (result.productsUpserted > 0) {
|
||||
const sampleProducts = await pool.query(`
|
||||
SELECT
|
||||
id, name, brand_name, type, subcategory, strain_type,
|
||||
price_rec, price_rec_original, stock_status, external_product_id
|
||||
FROM dutchie_products
|
||||
WHERE dispensary_id = 101
|
||||
ORDER BY updated_at DESC
|
||||
LIMIT 10
|
||||
`);
|
||||
|
||||
console.log('=== SAMPLE PRODUCTS (10) ===');
|
||||
sampleProducts.rows.forEach((p: any, i: number) => {
|
||||
console.log(`${i + 1}. ${p.name}`);
|
||||
console.log(` Brand: ${p.brand_name || 'N/A'}`);
|
||||
console.log(` Type: ${p.type} / ${p.subcategory || 'N/A'}`);
|
||||
console.log(` Strain: ${p.strain_type || 'N/A'}`);
|
||||
console.log(` Price: $${p.price_rec || 'N/A'} (orig: $${p.price_rec_original || 'N/A'})`);
|
||||
console.log(` Stock: ${p.stock_status}`);
|
||||
console.log(` External ID: ${p.external_product_id}`);
|
||||
console.log('');
|
||||
});
|
||||
|
||||
// Show field coverage stats
|
||||
const fieldStats = await pool.query(`
|
||||
SELECT
|
||||
COUNT(*) as total,
|
||||
COUNT(brand_name) as with_brand,
|
||||
COUNT(type) as with_type,
|
||||
COUNT(strain_type) as with_strain,
|
||||
COUNT(price_rec) as with_price,
|
||||
COUNT(image_url) as with_image,
|
||||
COUNT(description) as with_description,
|
||||
COUNT(thc_content) as with_thc,
|
||||
COUNT(cbd_content) as with_cbd
|
||||
FROM dutchie_products
|
||||
WHERE dispensary_id = 101
|
||||
`);
|
||||
|
||||
const stats = fieldStats.rows[0];
|
||||
console.log('=== FIELD COVERAGE ===');
|
||||
console.log(`Total products: ${stats.total}`);
|
||||
console.log(`With brand: ${stats.with_brand} (${Math.round(stats.with_brand / stats.total * 100)}%)`);
|
||||
console.log(`With type: ${stats.with_type} (${Math.round(stats.with_type / stats.total * 100)}%)`);
|
||||
console.log(`With strain_type: ${stats.with_strain} (${Math.round(stats.with_strain / stats.total * 100)}%)`);
|
||||
console.log(`With price_rec: ${stats.with_price} (${Math.round(stats.with_price / stats.total * 100)}%)`);
|
||||
console.log(`With image_url: ${stats.with_image} (${Math.round(stats.with_image / stats.total * 100)}%)`);
|
||||
console.log(`With description: ${stats.with_description} (${Math.round(stats.with_description / stats.total * 100)}%)`);
|
||||
console.log(`With THC: ${stats.with_thc} (${Math.round(stats.with_thc / stats.total * 100)}%)`);
|
||||
console.log(`With CBD: ${stats.with_cbd} (${Math.round(stats.with_cbd / stats.total * 100)}%)`);
|
||||
console.log('');
|
||||
}
|
||||
|
||||
// Insert trace record for observability
|
||||
const traceData = {
|
||||
crawlResult: result,
|
||||
dispensaryInfo: {
|
||||
id: dispensary.id,
|
||||
name: dispensary.name,
|
||||
platformDispensaryId: dispensary.platformDispensaryId,
|
||||
menuUrl: dispensary.menuUrl,
|
||||
},
|
||||
profile: profile || null,
|
||||
timestamp: new Date().toISOString(),
|
||||
};
|
||||
|
||||
await pool.query(`
|
||||
INSERT INTO crawl_orchestration_traces
|
||||
(dispensary_id, profile_id, profile_key, crawler_module, mode,
|
||||
state_at_start, state_at_end, trace, success, products_found,
|
||||
duration_ms, started_at, completed_at)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, NOW())
|
||||
`, [
|
||||
101,
|
||||
profile?.id || null,
|
||||
profile?.profile_key || null,
|
||||
'product-crawler',
|
||||
'sandbox',
|
||||
profile?.status || 'no_profile',
|
||||
profile?.status || 'no_profile', // No status change in sandbox
|
||||
JSON.stringify(traceData),
|
||||
result.success,
|
||||
result.productsFound,
|
||||
result.durationMs,
|
||||
new Date(startTime),
|
||||
]);
|
||||
|
||||
console.log('=== TRACE RECORDED ===');
|
||||
console.log('Trace saved to crawl_orchestration_traces table');
|
||||
|
||||
} catch (error: any) {
|
||||
console.error('=== CRAWL ERROR ===');
|
||||
console.error('Error:', error.message);
|
||||
console.error('Stack:', error.stack);
|
||||
|
||||
// Record error trace
|
||||
await pool.query(`
|
||||
INSERT INTO crawl_orchestration_traces
|
||||
(dispensary_id, profile_id, profile_key, crawler_module, mode,
|
||||
state_at_start, state_at_end, trace, success, error_message,
|
||||
duration_ms, started_at, completed_at)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, NOW())
|
||||
`, [
|
||||
101,
|
||||
profile?.id || null,
|
||||
profile?.profile_key || null,
|
||||
'product-crawler',
|
||||
'sandbox',
|
||||
profile?.status || 'no_profile',
|
||||
profile?.status || 'no_profile',
|
||||
JSON.stringify({ error: error.message, stack: error.stack }),
|
||||
false,
|
||||
error.message,
|
||||
Date.now() - startTime,
|
||||
new Date(startTime),
|
||||
]);
|
||||
}
|
||||
|
||||
await pool.end();
|
||||
console.log('=== SANDBOX CRAWL COMPLETE ===');
|
||||
}
|
||||
|
||||
// Start the script. Any rejection that escapes main() is reported and
// turned into a non-zero exit status.
main().catch((fatal) => {
  console.error('Fatal error:', fatal.message);
  process.exit(1);
});
|
||||
@@ -1,181 +0,0 @@
|
||||
/**
|
||||
* LEGACY SCRIPT - Sandbox Crawl Test
|
||||
*
|
||||
* DEPRECATED: This script uses direct database connections.
|
||||
* Future implementations should use the CannaiQ API endpoints instead.
|
||||
*
|
||||
* This script runs sandbox crawl for a dispensary and captures the full trace.
|
||||
* It is kept for historical reference and manual testing only.
|
||||
*
|
||||
* DO NOT:
|
||||
* - Add this to package.json scripts
|
||||
* - Run this in automated jobs
|
||||
* - Use DATABASE_URL directly
|
||||
*
|
||||
* Usage (manual only):
|
||||
* STORAGE_DRIVER=local npx tsx src/scripts/sandbox-test.ts <dispensary_id>
|
||||
*
|
||||
* LOCAL MODE REQUIREMENTS:
|
||||
* - STORAGE_DRIVER=local
|
||||
* - STORAGE_BASE_PATH=./storage
|
||||
* - Local cannaiq-postgres on port 54320
|
||||
* - NO MinIO, NO Kubernetes
|
||||
*/
|
||||
|
||||
import { query, getClient, closePool } from '../dutchie-az/db/connection';
|
||||
import { runDispensaryOrchestrator } from '../services/dispensary-orchestrator';
|
||||
|
||||
/**
 * Prints the storage-related environment settings and terminates the process
 * with status 1 unless the script is running in local mode, i.e.
 * STORAGE_DRIVER is "local" (or unset/empty) and MINIO_ENDPOINT is not set.
 */
function verifyLocalMode(): void {
  const { STORAGE_DRIVER, MINIO_ENDPOINT, STORAGE_BASE_PATH } = process.env;
  const driver = STORAGE_DRIVER || 'local';

  const report = [
    '=== LOCAL MODE VERIFICATION ===',
    `STORAGE_DRIVER: ${driver}`,
    `MINIO_ENDPOINT: ${MINIO_ENDPOINT || 'NOT SET (good)'}`,
    `STORAGE_BASE_PATH: ${STORAGE_BASE_PATH || './storage'}`,
    'DB Connection: Using canonical CannaiQ pool',
  ];
  for (const line of report) {
    console.log(line);
  }

  // Report a configuration problem on stderr and stop the process.
  const abort = (message: string): void => {
    console.error(message);
    process.exit(1);
  };

  if (driver !== 'local') {
    abort('ERROR: STORAGE_DRIVER must be "local"');
  }

  if (MINIO_ENDPOINT) {
    abort('ERROR: MINIO_ENDPOINT should NOT be set in local mode');
  }

  console.log('✅ Local mode verified\n');
}
|
||||
|
||||
/**
 * Loads one dispensary by id, left-joined with its crawler profile.
 *
 * @param dispensaryId - value bound to `$1` and matched against `dispensaries.id`
 * @returns the first row of the result, or `undefined` when nothing matches;
 *          if the join produces several rows only the first one is returned
 */
async function getDispensaryInfo(dispensaryId: number) {
  const { rows } = await query(`
    SELECT d.id, d.name, d.city, d.menu_type, d.platform_dispensary_id, d.menu_url,
           p.profile_key, p.status as profile_status, p.config
    FROM dispensaries d
    LEFT JOIN dispensary_crawler_profiles p ON p.dispensary_id = d.id
    WHERE d.id = $1
  `, [dispensaryId]);

  const firstRow = rows[0];
  return firstRow;
}
|
||||
|
||||
/**
 * Fetches the newest row (by `created_at`) from `crawl_orchestration_traces`
 * for the given dispensary.
 *
 * @param dispensaryId - value bound to `$1` and matched against `dispensary_id`
 * @returns the most recent trace row, or `undefined` when none exists
 */
async function getLatestTrace(dispensaryId: number) {
  const { rows } = await query(`
    SELECT *
    FROM crawl_orchestration_traces
    WHERE dispensary_id = $1
    ORDER BY created_at DESC
    LIMIT 1
  `, [dispensaryId]);

  const newestTrace = rows[0];
  return newestTrace;
}
|
||||
|
||||
/**
 * CLI entry point: runs a sandbox crawl for the dispensary id passed in
 * `process.argv[2]`, then prints the dispensary info, the orchestrator result
 * and the most recent orchestration trace.
 *
 * Exits with status 1 on bad usage, when the dispensary does not exist, or
 * when any step throws. Once the try block has been entered, `closePool()`
 * is always awaited before the process exits.
 */
async function main() {
  console.warn('\n⚠️ LEGACY SCRIPT: This script should be replaced with CannaiQ API calls.\n');

  const dispensaryId = Number.parseInt(process.argv[2], 10);

  // parseInt yields NaN for missing or non-numeric input; NaN and 0 are both falsy.
  if (!dispensaryId) {
    console.error('Usage: npx tsx src/scripts/sandbox-test.ts <dispensary_id>');
    console.error('Example: npx tsx src/scripts/sandbox-test.ts 101');
    process.exit(1);
  }

  // Verify local mode first
  verifyLocalMode();

  // process.exit() terminates immediately and would skip the `finally` block,
  // so failures are only flagged here; the actual exit happens after closePool().
  let failed = false;

  try {
    // Get dispensary info
    console.log(`=== DISPENSARY INFO (ID: ${dispensaryId}) ===`);
    const dispensary = await getDispensaryInfo(dispensaryId);

    if (!dispensary) {
      console.error(`Dispensary ${dispensaryId} not found`);
      failed = true;
      return;
    }

    console.log(`Name: ${dispensary.name}`);
    console.log(`City: ${dispensary.city}`);
    console.log(`Menu Type: ${dispensary.menu_type}`);
    console.log(`Platform Dispensary ID: ${dispensary.platform_dispensary_id || 'NULL'}`);
    console.log(`Menu URL: ${dispensary.menu_url || 'NULL'}`);
    console.log(`Profile Key: ${dispensary.profile_key || 'NONE'}`);
    console.log(`Profile Status: ${dispensary.profile_status || 'N/A'}`);
    console.log(`Profile Config: ${JSON.stringify(dispensary.config, null, 2)}`);
    console.log('');

    // Run sandbox crawl
    console.log('=== RUNNING SANDBOX CRAWL ===');
    console.log(`Starting sandbox crawl for ${dispensary.name}...`);
    const startTime = Date.now();

    const result = await runDispensaryOrchestrator(dispensaryId);

    const duration = Date.now() - startTime;

    console.log('\n=== CRAWL RESULT ===');
    console.log(`Status: ${result.status}`);
    console.log(`Summary: ${result.summary}`);
    console.log(`Run ID: ${result.runId}`);
    console.log(`Duration: ${duration}ms`);
    console.log(`Detection Ran: ${result.detectionRan}`);
    console.log(`Crawl Ran: ${result.crawlRan}`);
    console.log(`Crawl Type: ${result.crawlType || 'N/A'}`);
    console.log(`Products Found: ${result.productsFound || 0}`);
    console.log(`Products New: ${result.productsNew || 0}`);
    console.log(`Products Updated: ${result.productsUpdated || 0}`);

    if (result.error) {
      console.log(`Error: ${result.error}`);
    }

    // Get the trace
    console.log('\n=== ORCHESTRATOR TRACE ===');
    const trace = await getLatestTrace(dispensaryId);

    if (trace) {
      console.log(`Trace ID: ${trace.id}`);
      console.log(`Profile Key: ${trace.profile_key || 'N/A'}`);
      console.log(`Mode: ${trace.mode}`);
      console.log(`Status: ${trace.status}`);
      console.log(`Started At: ${trace.started_at}`);
      console.log(`Completed At: ${trace.completed_at || 'In Progress'}`);

      if (trace.steps && Array.isArray(trace.steps)) {
        console.log(`\nSteps (${trace.steps.length} total):`);
        trace.steps.forEach((step: any, i: number) => {
          const status = step.status === 'completed' ? '✅' : step.status === 'failed' ? '❌' : '⏳';
          console.log(` ${i + 1}. ${status} ${step.action}: ${step.description}`);
          if (step.output && Object.keys(step.output).length > 0) {
            console.log(` Output: ${JSON.stringify(step.output)}`);
          }
          if (step.error) {
            console.log(` Error: ${step.error}`);
          }
        });
      }

      if (trace.result) {
        console.log(`\nResult: ${JSON.stringify(trace.result, null, 2)}`);
      }

      if (trace.error_message) {
        console.log(`\nError Message: ${trace.error_message}`);
      }
    } else {
      console.log('No trace found for this dispensary');
    }
  } catch (error: any) {
    console.error('Error running sandbox test:', error.message);
    console.error(error.stack);
    failed = true;
  } finally {
    // Release the pool first, then report failure through the exit status.
    await closePool();
    if (failed) {
      process.exit(1);
    }
  }
}
|
||||
|
||||
// main() catches errors raised inside its own try block; this handler covers
// anything that rejects outside it (e.g. closePool() failing in `finally`),
// so the promise is never left floating.
main().catch((err) => {
  console.error('Fatal error:', err?.message ?? err);
  process.exit(1);
});
|
||||
@@ -1,332 +0,0 @@
|
||||
/**
|
||||
* LEGACY SCRIPT - Scrape All Active Products
|
||||
*
|
||||
* DEPRECATED: This script creates its own database pool.
|
||||
* Future implementations should use the CannaiQ API endpoints instead.
|
||||
*
|
||||
* Scrapes ALL active products via direct GraphQL pagination.
|
||||
* This is more reliable than category navigation.
|
||||
*
|
||||
* DO NOT:
|
||||
* - Add this to package.json scripts
|
||||
* - Run this in automated jobs
|
||||
* - Use DATABASE_URL directly
|
||||
*/
|
||||
|
||||
import puppeteer from 'puppeteer-extra';
|
||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||
import { Pool } from 'pg';
|
||||
import { normalizeDutchieProduct, DutchieProduct } from '../scrapers/dutchie-graphql';
|
||||
|
||||
// Register the stealth plugin before any browser is launched.
puppeteer.use(StealthPlugin());

console.warn('\n⚠️ LEGACY SCRIPT: This script should be replaced with CannaiQ API calls.\n');

// Single database connection (cannaiq in cannaiq-postgres container).
// CANNAIQ_DB_URL wins when set; otherwise the URL is assembled from the
// individual CANNAIQ_DB_* variables, each with a local-development fallback.
const DATABASE_URL = (() => {
  const env = process.env;
  if (env.CANNAIQ_DB_URL) {
    return env.CANNAIQ_DB_URL;
  }

  const user = env.CANNAIQ_DB_USER || 'dutchie';
  const pass = env.CANNAIQ_DB_PASS || 'dutchie_local_pass';
  const host = env.CANNAIQ_DB_HOST || 'localhost';
  const port = env.CANNAIQ_DB_PORT || '54320';
  const name = env.CANNAIQ_DB_NAME || 'cannaiq';

  return `postgresql://${user}:${pass}@${host}:${port}/${name}`;
})();

// sha256 hash sent as the persisted-query identifier for the FilteredProducts operation.
const GRAPHQL_HASH = 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0';
|
||||
|
||||
// Upsert statement for one normalized product, keyed on (store_id, slug).
// `xmax = 0` in RETURNING is used to tell a fresh insert from an update.
const UPSERT_PRODUCT_SQL = `
  INSERT INTO products (
    store_id, external_id, slug, name, enterprise_product_id,
    brand, brand_external_id, brand_logo_url,
    subcategory, strain_type, canonical_category,
    price, rec_price, med_price, rec_special_price, med_special_price,
    is_on_special, special_name, discount_percent, special_data,
    sku, inventory_quantity, inventory_available, is_below_threshold, status,
    thc_percentage, cbd_percentage, cannabinoids,
    weight_mg, net_weight_value, net_weight_unit, options, raw_options,
    image_url, additional_images,
    is_featured, medical_only, rec_only,
    source_created_at, source_updated_at,
    description, raw_data,
    dutchie_url, last_seen_at, updated_at
  )
  VALUES (
    $1, $2, $3, $4, $5,
    $6, $7, $8,
    $9, $10, $11,
    $12, $13, $14, $15, $16,
    $17, $18, $19, $20,
    $21, $22, $23, $24, $25,
    $26, $27, $28,
    $29, $30, $31, $32, $33,
    $34, $35,
    $36, $37, $38,
    $39, $40,
    $41, $42,
    '', NOW(), NOW()
  )
  ON CONFLICT (store_id, slug) DO UPDATE SET
    name = EXCLUDED.name,
    enterprise_product_id = EXCLUDED.enterprise_product_id,
    brand = EXCLUDED.brand,
    brand_external_id = EXCLUDED.brand_external_id,
    brand_logo_url = EXCLUDED.brand_logo_url,
    subcategory = EXCLUDED.subcategory,
    strain_type = EXCLUDED.strain_type,
    canonical_category = EXCLUDED.canonical_category,
    price = EXCLUDED.price,
    rec_price = EXCLUDED.rec_price,
    med_price = EXCLUDED.med_price,
    rec_special_price = EXCLUDED.rec_special_price,
    med_special_price = EXCLUDED.med_special_price,
    is_on_special = EXCLUDED.is_on_special,
    special_name = EXCLUDED.special_name,
    discount_percent = EXCLUDED.discount_percent,
    special_data = EXCLUDED.special_data,
    sku = EXCLUDED.sku,
    inventory_quantity = EXCLUDED.inventory_quantity,
    inventory_available = EXCLUDED.inventory_available,
    is_below_threshold = EXCLUDED.is_below_threshold,
    status = EXCLUDED.status,
    thc_percentage = EXCLUDED.thc_percentage,
    cbd_percentage = EXCLUDED.cbd_percentage,
    cannabinoids = EXCLUDED.cannabinoids,
    weight_mg = EXCLUDED.weight_mg,
    net_weight_value = EXCLUDED.net_weight_value,
    net_weight_unit = EXCLUDED.net_weight_unit,
    options = EXCLUDED.options,
    raw_options = EXCLUDED.raw_options,
    image_url = EXCLUDED.image_url,
    additional_images = EXCLUDED.additional_images,
    is_featured = EXCLUDED.is_featured,
    medical_only = EXCLUDED.medical_only,
    rec_only = EXCLUDED.rec_only,
    source_created_at = EXCLUDED.source_created_at,
    source_updated_at = EXCLUDED.source_updated_at,
    description = EXCLUDED.description,
    raw_data = EXCLUDED.raw_data,
    last_seen_at = NOW(),
    updated_at = NOW()
  RETURNING (xmax = 0) AS was_inserted
`;

/**
 * Scrapes every active product of one Dutchie menu and upserts the results
 * into the `products` table.
 *
 * Steps: load the menu page in a headless browser, read the dispensary id
 * from `window.reactEnv`, page through the `FilteredProducts` persisted query
 * from inside the page, normalize the products, upsert them in a single
 * transaction, then print summary counts for the store.
 *
 * @param menuUrl - menu page to open before issuing GraphQL requests
 * @param storeId - `products.store_id` value the rows are written under
 * @returns `{ success, totalProducts, inserted, updated }`
 * @throws Error when the page exposes no dispensary id, when a GraphQL request
 *         returns a non-OK HTTP status, or when any browser/database step fails
 *         (the transaction is rolled back in that case)
 */
async function scrapeAllProducts(menuUrl: string, storeId: number) {
  const pool = new Pool({ connectionString: DATABASE_URL });

  // Nested try/finally so the pool is ended even when the browser fails to
  // launch or to close.
  try {
    const browser = await puppeteer.launch({
      headless: 'new',
      args: ['--no-sandbox', '--disable-setuid-sandbox'],
    });

    try {
      const page = await browser.newPage();
      await page.setUserAgent(
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36'
      );

      console.log('Loading menu to establish session...');
      await page.goto(menuUrl, {
        waitUntil: 'networkidle2',
        timeout: 60000,
      });
      await new Promise((r) => setTimeout(r, 3000));

      const dispensaryId = await page.evaluate(() => (window as any).reactEnv?.dispensaryId);
      console.log('Dispensary ID:', dispensaryId);

      // Without an id every query would come back empty and the run would be
      // reported as a successful scrape of zero products.
      if (!dispensaryId) {
        throw new Error(`Could not read dispensaryId from window.reactEnv at ${menuUrl}`);
      }

      // Paginate through all products
      const allProducts: DutchieProduct[] = [];
      let pageNum = 0;
      const perPage = 100;

      console.log('\nFetching all products via paginated GraphQL...');

      while (true) {
        const result = await page.evaluate(
          async (dispId: string, hash: string, pageIndex: number, pageSize: number) => {
            const variables = {
              includeEnterpriseSpecials: false,
              productsFilter: {
                dispensaryId: dispId,
                pricingType: 'rec',
                Status: 'Active',
                types: [],
                useCache: false,
                isDefaultSort: true,
                sortBy: 'popularSortIdx',
                sortDirection: 1,
                bypassOnlineThresholds: true,
                isKioskMenu: false,
                removeProductsBelowOptionThresholds: false,
              },
              page: pageIndex,
              perPage: pageSize,
            };

            const qs = new URLSearchParams({
              operationName: 'FilteredProducts',
              variables: JSON.stringify(variables),
              extensions: JSON.stringify({ persistedQuery: { version: 1, sha256Hash: hash } }),
            });

            const resp = await fetch(`https://dutchie.com/graphql?${qs.toString()}`, {
              method: 'GET',
              headers: {
                'content-type': 'application/json',
                'apollographql-client-name': 'Marketplace (production)',
              },
              credentials: 'include',
            });

            // A blocked or failed request must not look like "no more products".
            if (!resp.ok) {
              throw new Error(`Dutchie GraphQL request failed with HTTP ${resp.status}`);
            }

            const json = await resp.json();
            return {
              products: json?.data?.filteredProducts?.products || [],
              totalCount: json?.data?.filteredProducts?.queryInfo?.totalCount,
            };
          },
          dispensaryId,
          GRAPHQL_HASH,
          pageNum,
          perPage
        );

        // An empty page marks the end of the result set.
        if (result.products.length === 0) {
          break;
        }

        allProducts.push(...result.products);
        console.log(
          `Page ${pageNum}: ${result.products.length} products (total so far: ${allProducts.length}/${result.totalCount})`
        );

        pageNum++;

        // Safety limit
        if (pageNum > 50) {
          console.log('Reached page limit');
          break;
        }
      }

      console.log(`\nTotal products fetched: ${allProducts.length}`);

      // Normalize and upsert
      console.log('\nNormalizing and upserting to database...');
      const normalized = allProducts.map(normalizeDutchieProduct);

      const client = await pool.connect();
      let inserted = 0;
      let updated = 0;

      try {
        await client.query('BEGIN');

        for (const product of normalized) {
          const upsert = await client.query(UPSERT_PRODUCT_SQL, [
            storeId,
            product.external_id,
            product.slug,
            product.name,
            product.enterprise_product_id,
            product.brand,
            product.brand_external_id,
            product.brand_logo_url,
            product.subcategory,
            product.strain_type,
            product.canonical_category,
            product.price,
            product.rec_price,
            product.med_price,
            product.rec_special_price,
            product.med_special_price,
            product.is_on_special,
            product.special_name,
            product.discount_percent,
            product.special_data ? JSON.stringify(product.special_data) : null,
            product.sku,
            product.inventory_quantity,
            product.inventory_available,
            product.is_below_threshold,
            product.status,
            product.thc_percentage,
            product.cbd_percentage,
            product.cannabinoids ? JSON.stringify(product.cannabinoids) : null,
            product.weight_mg,
            product.net_weight_value,
            product.net_weight_unit,
            product.options,
            product.raw_options,
            product.image_url,
            product.additional_images,
            product.is_featured,
            product.medical_only,
            product.rec_only,
            product.source_created_at,
            product.source_updated_at,
            product.description,
            product.raw_data ? JSON.stringify(product.raw_data) : null,
          ]);

          if (upsert.rows[0]?.was_inserted) {
            inserted++;
          } else {
            updated++;
          }
        }

        await client.query('COMMIT');
      } catch (error) {
        // Surface the original failure even if the rollback itself fails.
        try {
          await client.query('ROLLBACK');
        } catch (rollbackError: any) {
          console.error('Rollback failed:', rollbackError?.message);
        }
        throw error;
      } finally {
        client.release();
      }

      console.log(`\nDatabase: ${inserted} inserted, ${updated} updated`);

      // Show summary stats
      const stats = await pool.query(
        `
        SELECT
          COUNT(*) as total,
          COUNT(*) FILTER (WHERE is_on_special) as specials,
          COUNT(DISTINCT brand) as brands,
          COUNT(DISTINCT subcategory) as categories
        FROM products WHERE store_id = $1
        `,
        [storeId]
      );

      console.log('\nStore summary:');
      console.log(` Total products: ${stats.rows[0].total}`);
      console.log(` On special: ${stats.rows[0].specials}`);
      console.log(` Unique brands: ${stats.rows[0].brands}`);
      console.log(` Categories: ${stats.rows[0].categories}`);

      return {
        success: true,
        totalProducts: allProducts.length,
        inserted,
        updated,
      };
    } finally {
      await browser.close();
    }
  } finally {
    await pool.end();
  }
}
|
||||
|
||||
// Run: argv[2] is the menu URL, argv[3] the numeric store id; both fall back
// to defaults when omitted.
const menuUrl = process.argv[2] || 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted';
const storeId = Number.parseInt(process.argv[3] || '1', 10);

// Reject a non-numeric store id up front instead of failing on the first
// INSERT after the whole menu has already been scraped.
if (Number.isNaN(storeId)) {
  console.error(`Error: invalid store ID "${process.argv[3]}" (expected an integer)`);
  process.exit(1);
}

console.log('='.repeat(60));
console.log('DUTCHIE GRAPHQL FULL SCRAPE');
console.log('='.repeat(60));
console.log(`Menu URL: ${menuUrl}`);
console.log(`Store ID: ${storeId}`);
console.log('');

scrapeAllProducts(menuUrl, storeId)
  .then((result) => {
    console.log('\n' + '='.repeat(60));
    console.log('COMPLETE');
    console.log(JSON.stringify(result, null, 2));
  })
  .catch((error) => {
    console.error('Error:', error.message);
    process.exit(1);
  });
|
||||
@@ -1,156 +0,0 @@
|
||||
/**
|
||||
* Test script: End-to-end Dutchie GraphQL → DB → Dashboard flow
|
||||
*
|
||||
* This demonstrates the complete data pipeline:
|
||||
 *   1. Load one previously captured Dutchie GraphQL product from /tmp/dutchie-schema-capture.json
|
||||
* 2. Normalize it to our schema
|
||||
* 3. Show the mapping
|
||||
*/
|
||||
|
||||
import { normalizeDutchieProduct, DutchieProduct, NormalizedProduct } from '../scrapers/dutchie-graphql';
|
||||
import * as fs from 'fs';
|
||||
|
||||
/**
 * Shortens `text` to at most `maxLength` characters for display, appending
 * '...' only when something was actually cut off. Returns `undefined` for a
 * missing value so it is not rendered as the string "undefined...".
 */
function preview(text: string | null | undefined, maxLength: number): string | undefined {
  if (text == null) {
    return undefined;
  }
  return text.length > maxLength ? `${text.substring(0, maxLength)}...` : text;
}

/** Renders a value as a quoted SQL text literal for the example statement (NULL when missing). */
function sqlText(value: unknown): string {
  return value == null ? 'NULL' : `'${String(value).replace(/'/g, "''")}'`;
}

/** Renders a numeric/boolean value for the example statement (NULL when missing). */
function sqlValue(value: unknown): string {
  return value == null ? 'NULL' : String(value);
}

// Load the captured sample product from schema capture.
// An alternative capture file may be given as the first CLI argument.
const capturePath = process.argv[2] || '/tmp/dutchie-schema-capture.json';
const capturedData = JSON.parse(fs.readFileSync(capturePath, 'utf-8'));

if (!capturedData?.sampleProduct) {
  throw new Error(`No "sampleProduct" entry found in ${capturePath}`);
}

const sampleProduct: DutchieProduct = capturedData.sampleProduct;

console.log('='.repeat(80));
console.log('DUTCHIE GRAPHQL → DATABASE MAPPING DEMONSTRATION');
console.log('='.repeat(80));

console.log('\n📥 RAW DUTCHIE GRAPHQL PRODUCT:');
console.log('-'.repeat(80));

// Show key fields from raw product
const keyRawFields = {
  '_id': sampleProduct._id,
  'Name': sampleProduct.Name,
  'cName': sampleProduct.cName,
  'brandName': sampleProduct.brandName,
  'brand.id': sampleProduct.brand?.id,
  'type': sampleProduct.type,
  'subcategory': sampleProduct.subcategory,
  'strainType': sampleProduct.strainType,
  'Prices': sampleProduct.Prices,
  'recPrices': sampleProduct.recPrices,
  'recSpecialPrices': sampleProduct.recSpecialPrices,
  'special': sampleProduct.special,
  'specialData.saleSpecials[0].specialName': sampleProduct.specialData?.saleSpecials?.[0]?.specialName,
  'specialData.saleSpecials[0].discount': sampleProduct.specialData?.saleSpecials?.[0]?.discount,
  'THCContent.range[0]': sampleProduct.THCContent?.range?.[0],
  'CBDContent.range[0]': sampleProduct.CBDContent?.range?.[0],
  'Status': sampleProduct.Status,
  'Image': sampleProduct.Image,
  'POSMetaData.canonicalSKU': sampleProduct.POSMetaData?.canonicalSKU,
  'POSMetaData.children[0].quantity': sampleProduct.POSMetaData?.children?.[0]?.quantity,
  'POSMetaData.children[0].quantityAvailable': sampleProduct.POSMetaData?.children?.[0]?.quantityAvailable,
};

Object.entries(keyRawFields).forEach(([key, value]) => {
  console.log(` ${key}: ${JSON.stringify(value)}`);
});

console.log('\n📤 NORMALIZED DATABASE ROW:');
console.log('-'.repeat(80));

// Normalize the product
const normalized: NormalizedProduct = normalizeDutchieProduct(sampleProduct);

// Show the normalized result (excluding the bulky JSON fields for readability)
const { raw_data, cannabinoids, special_data, ...displayFields } = normalized;

Object.entries(displayFields).forEach(([key, value]) => {
  if (value !== undefined && value !== null) {
    console.log(` ${key}: ${JSON.stringify(value)}`);
  }
});

console.log('\n🔗 FIELD MAPPING:');
console.log('-'.repeat(80));

// Each entry: [GraphQL field, DB column, raw GraphQL value, normalized DB value]
const fieldMappings = [
  ['_id / id', 'external_id', sampleProduct._id, normalized.external_id],
  ['Name', 'name', sampleProduct.Name, normalized.name],
  ['cName', 'slug', sampleProduct.cName, normalized.slug],
  ['brandName', 'brand', sampleProduct.brandName, normalized.brand],
  ['brand.id', 'brand_external_id', sampleProduct.brand?.id, normalized.brand_external_id],
  ['subcategory', 'subcategory', sampleProduct.subcategory, normalized.subcategory],
  ['strainType', 'strain_type', sampleProduct.strainType, normalized.strain_type],
  ['recPrices[0]', 'rec_price', sampleProduct.recPrices?.[0], normalized.rec_price],
  ['recSpecialPrices[0]', 'rec_special_price', sampleProduct.recSpecialPrices?.[0], normalized.rec_special_price],
  ['special', 'is_on_special', sampleProduct.special, normalized.is_on_special],
  ['specialData...specialName', 'special_name', preview(sampleProduct.specialData?.saleSpecials?.[0]?.specialName, 40), preview(normalized.special_name, 40)],
  ['THCContent.range[0]', 'thc_percentage', sampleProduct.THCContent?.range?.[0], normalized.thc_percentage],
  ['CBDContent.range[0]', 'cbd_percentage', sampleProduct.CBDContent?.range?.[0], normalized.cbd_percentage],
  ['Status', 'status', sampleProduct.Status, normalized.status],
  ['Image', 'image_url', preview(sampleProduct.Image, 50), preview(normalized.image_url, 50)],
  ['POSMetaData.canonicalSKU', 'sku', sampleProduct.POSMetaData?.canonicalSKU, normalized.sku],
];

console.log(' GraphQL Field → DB Column | Value');
console.log(' ' + '-'.repeat(75));

// Only the normalized (DB) value is printed; the raw value stays in the table for reference.
fieldMappings.forEach(([gqlField, dbCol, , dbVal]) => {
  const gqlStr = String(gqlField).padEnd(30);
  const dbStr = String(dbCol).padEnd(20);
  console.log(` ${gqlStr} → ${dbStr} | ${JSON.stringify(dbVal)}`);
});

console.log('\n📊 SQL INSERT STATEMENT:');
console.log('-'.repeat(80));

// Generate example SQL (illustration only; it is printed, never executed)
const sqlExample = `
INSERT INTO products (
  store_id, external_id, slug, name,
  brand, brand_external_id,
  subcategory, strain_type,
  rec_price, rec_special_price,
  is_on_special, special_name, discount_percent,
  thc_percentage, cbd_percentage,
  status, image_url, sku
) VALUES (
  1, -- store_id (Deeply Rooted)
  ${sqlText(normalized.external_id)}, -- external_id
  ${sqlText(normalized.slug)}, -- slug
  ${sqlText(normalized.name)}, -- name
  ${sqlText(normalized.brand)}, -- brand
  ${sqlText(normalized.brand_external_id)}, -- brand_external_id
  ${sqlText(normalized.subcategory)}, -- subcategory
  ${sqlText(normalized.strain_type)}, -- strain_type
  ${sqlValue(normalized.rec_price)}, -- rec_price
  ${sqlValue(normalized.rec_special_price)}, -- rec_special_price
  ${sqlValue(normalized.is_on_special)}, -- is_on_special
  ${sqlText(preview(normalized.special_name, 50))}, -- special_name
  ${sqlValue(normalized.discount_percent)}, -- discount_percent
  ${sqlValue(normalized.thc_percentage)}, -- thc_percentage
  ${sqlValue(normalized.cbd_percentage)}, -- cbd_percentage
  ${sqlText(normalized.status)}, -- status
  ${sqlText(normalized.image_url)}, -- image_url
  ${sqlText(normalized.sku)} -- sku
)
ON CONFLICT (store_id, slug) DO UPDATE SET ...;
`;

console.log(sqlExample);

console.log('\n✅ SUMMARY:');
console.log('-'.repeat(80));
console.log(` Product: ${normalized.name}`);
console.log(` Brand: ${normalized.brand}`);
console.log(` Category: ${normalized.subcategory}`);
console.log(` Price: $${normalized.rec_price} → $${normalized.rec_special_price} (${normalized.discount_percent}% off)`);
console.log(` THC: ${normalized.thc_percentage}%`);
console.log(` Status: ${normalized.status}`);
console.log(` On Special: ${normalized.is_on_special}`);
console.log(` SKU: ${normalized.sku}`);

console.log('\n🎯 DERIVED VIEWS (computed from products table):');
console.log('-'.repeat(80));
console.log(' - current_specials: Products where is_on_special = true');
console.log(' - derived_brands: Aggregated by brand name with counts/prices');
console.log(' - derived_categories: Aggregated by subcategory');
console.log('\nAll views are computed from the single products table - no separate tables needed!');
|
||||
Reference in New Issue
Block a user