#!/usr/bin/env npx tsx
/**
 * Discover All States - Sequential State-by-State Dutchie Discovery
 *
 * This script discovers all Dutchie dispensaries for every US state,
 * processing one state at a time with delays between states.
 *
 * Progress is automatically saved to /tmp/discovery-progress.json
 * so the script can resume from where it left off if interrupted.
 * Dry runs never write to (or automatically clear) the progress file, so a
 * dry run cannot cause a later live run to skip states.
 *
 * Usage:
 *   DATABASE_URL="..." npx tsx src/scripts/discover-all-states.ts
 *   DATABASE_URL="..." npx tsx src/scripts/discover-all-states.ts --dry-run
 *   DATABASE_URL="..." npx tsx src/scripts/discover-all-states.ts --start-from CA
 *   DATABASE_URL="..." npx tsx src/scripts/discover-all-states.ts --resume
 *   DATABASE_URL="..." npx tsx src/scripts/discover-all-states.ts --reset   # Clear progress, start fresh
 *
 * Options:
 *   --dry-run      Don't save to database (or progress file), just show what would happen
 *   --start-from   Start from a specific state (skip earlier states)
 *   --states       Comma-separated list of specific states to run (e.g., AZ,CA,CO)
 *   --verbose      Show detailed output
 *   --resume       Auto-resume from last saved progress (default if progress file exists)
 *   --reset        Clear progress file and start fresh
 */

import { Pool } from 'pg';
import * as fs from 'fs';
import * as path from 'path';
import { discoverState } from '../discovery';

/** Location of the JSON file that records how far the sweep has progressed. */
const PROGRESS_FILE = '/tmp/discovery-progress.json';

/** Maximum number of cities passed to `discoverState` as `cityLimit` for each state. */
const CITY_LIMIT = 200;

/** Pause between two successfully processed states, to avoid rate limiting. */
const DELAY_BETWEEN_STATES_SECONDS = 5;

/** Pause after a failed state before moving on to the next one. */
const ERROR_BACKOFF_MS = 3000;

/** Shape of the data persisted in PROGRESS_FILE. */
interface ProgressData {
  /** Code of the most recently completed state, or null if none completed yet. */
  lastCompletedState: string | null;
  /** Index of `lastCompletedState` within US_STATES (-1 if none completed yet). */
  lastCompletedIndex: number;
  /** ISO timestamp of when this progress record was first created. */
  startedAt: string;
  /** ISO timestamp of the last save. */
  updatedAt: string;
  /** Codes of every state completed so far. */
  completedStates: string[];
}

/**
 * Runtime check that a parsed JSON value really has the ProgressData shape.
 * The progress file lives in /tmp and may be truncated or hand-edited, so the
 * result of JSON.parse cannot be trusted blindly.
 */
function isProgressData(value: unknown): value is ProgressData {
  if (typeof value !== 'object' || value === null) {
    return false;
  }
  const candidate = value as Record<string, unknown>;
  return (
    (candidate.lastCompletedState === null || typeof candidate.lastCompletedState === 'string') &&
    typeof candidate.lastCompletedIndex === 'number' &&
    Number.isInteger(candidate.lastCompletedIndex) &&
    candidate.lastCompletedIndex >= -1 &&
    typeof candidate.startedAt === 'string' &&
    typeof candidate.updatedAt === 'string' &&
    Array.isArray(candidate.completedStates) &&
    candidate.completedStates.every((s) => typeof s === 'string')
  );
}

/**
 * Load previously saved progress.
 *
 * @returns The saved progress, or null when the file is missing, unreadable,
 *          not valid JSON, or does not have the expected shape.
 */
function loadProgress(): ProgressData | null {
  try {
    if (fs.existsSync(PROGRESS_FILE)) {
      const data: unknown = JSON.parse(fs.readFileSync(PROGRESS_FILE, 'utf-8'));
      if (isProgressData(data)) {
        return data;
      }
      console.warn('[Progress] Ignoring progress file with unexpected contents');
    }
  } catch (e) {
    console.warn('[Progress] Could not load progress file:', e);
  }
  return null;
}

/**
 * Persist progress to PROGRESS_FILE, stamping `updatedAt` with the current time.
 * Best effort: failures are logged as warnings and never thrown.
 */
function saveProgress(progress: ProgressData): void {
  try {
    progress.updatedAt = new Date().toISOString();
    fs.writeFileSync(PROGRESS_FILE, JSON.stringify(progress, null, 2));
  } catch (e) {
    console.warn('[Progress] Could not save progress:', e);
  }
}

/**
 * Delete PROGRESS_FILE if it exists.
 * Best effort: failures are logged as warnings and never thrown.
 */
function clearProgress(): void {
  try {
    if (fs.existsSync(PROGRESS_FILE)) {
      fs.unlinkSync(PROGRESS_FILE);
      console.log('[Progress] Cleared progress file');
    }
  } catch (e) {
    console.warn('[Progress] Could not clear progress:', e);
  }
}

// US states with legal cannabis (medical or recreational)
// Ordered roughly by market size / likelihood of Dutchie presence
const US_STATES = [
  'AZ', // Arizona
  'CA', // California
  'CO', // Colorado
  'FL', // Florida
  'IL', // Illinois
  'MA', // Massachusetts
  'MI', // Michigan
  'NV', // Nevada
  'NJ', // New Jersey
  'NY', // New York
  'OH', // Ohio
  'OR', // Oregon
  'PA', // Pennsylvania
  'WA', // Washington
  'MD', // Maryland
  'MO', // Missouri
  'CT', // Connecticut
  'NM', // New Mexico
  'ME', // Maine
  'VT', // Vermont
  'MT', // Montana
  'AK', // Alaska
  'OK', // Oklahoma
  'AR', // Arkansas
  'ND', // North Dakota
  'SD', // South Dakota
  'MN', // Minnesota
  'NH', // New Hampshire
  'RI', // Rhode Island
  'DE', // Delaware
  'HI', // Hawaii
  'WV', // West Virginia
  'LA', // Louisiana
  'UT', // Utah
  'VA', // Virginia
  'DC', // District of Columbia
];

/** Per-state outcome collected for the final summary. */
interface DiscoveryResult {
  stateCode: string;
  citiesCrawled: number;
  locationsFound: number;
  locationsUpserted: number;
  durationMs: number;
  errors: string[];
}

/** Row shape of the per-state totals query (field types depend on the pg type parsers in use). */
interface LocationCountRow {
  state_code: string | null;
  count: string | number;
  discovered: string | number;
  promoted: string | number;
}

/**
 * Parse `--flag`, `--flag value` and `--flag=value` style command-line arguments.
 *
 * @returns A map from flag name (without the leading dashes) to its string
 *          value, or to `true` when the flag was given without a value.
 *          Arguments that do not start with `--` and are not consumed as a
 *          value are ignored.
 */
function parseArgs(): Record<string, string | boolean> {
  const args = process.argv.slice(2);
  const flags: Record<string, string | boolean> = {};

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    if (!arg.startsWith('--')) {
      continue;
    }

    const body = arg.slice(2);
    const eq = body.indexOf('=');
    if (eq !== -1) {
      // Split on the first '=' only so values containing '=' are kept intact.
      flags[body.slice(0, eq)] = body.slice(eq + 1);
      continue;
    }

    const next = args[i + 1];
    if (next && !next.startsWith('--')) {
      flags[body] = next;
      i++; // the following argument was consumed as this flag's value
    } else {
      flags[body] = true;
    }
  }

  return flags;
}

/**
 * Read a flag that must carry a string value.
 * Exits the process with an error when the flag was given without a value
 * (e.g. `--start-from` as the last argument), instead of failing later with
 * an opaque TypeError.
 */
function readStringFlag(flags: Record<string, string | boolean>, name: string): string | undefined {
  const value = flags[name];
  if (value === undefined || typeof value === 'string') {
    return value;
  }
  console.error(`ERROR: --${name} requires a value`);
  process.exit(1);
}

/** Resolve after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/** Print the aggregate totals and the per-state table for this run. */
function printSummary(results: DiscoveryResult[], totalDurationMs: number): void {
  const totalLocations = results.reduce((sum, r) => sum + r.locationsFound, 0);
  const totalUpserted = results.reduce((sum, r) => sum + r.locationsUpserted, 0);
  const totalCities = results.reduce((sum, r) => sum + r.citiesCrawled, 0);
  const statesWithErrors = results.filter((r) => r.errors.length > 0);

  console.log('');
  console.log('='.repeat(70));
  console.log('DISCOVERY COMPLETE - SUMMARY');
  console.log('='.repeat(70));
  console.log(`Total states processed: ${results.length}`);
  console.log(`Total cities crawled: ${totalCities}`);
  console.log(`Total locations found: ${totalLocations}`);
  console.log(`Total locations upserted: ${totalUpserted}`);
  console.log(`Total duration: ${(totalDurationMs / 1000 / 60).toFixed(1)} minutes`);
  console.log('');

  if (statesWithErrors.length > 0) {
    console.log('States with errors:');
    for (const r of statesWithErrors) {
      console.log(` ${r.stateCode}: ${r.errors.length} error(s)`);
    }
    console.log('');
  }

  // Print per-state breakdown
  console.log('Per-state results:');
  console.log('-'.repeat(70));
  console.log('State\tCities\tFound\tUpserted\tDuration\tStatus');
  console.log('-'.repeat(70));
  for (const r of results) {
    const status = r.errors.length > 0 ? 'ERRORS' : 'OK';
    const duration = (r.durationMs / 1000).toFixed(1) + 's';
    console.log(
      `${r.stateCode}\t${r.citiesCrawled}\t${r.locationsFound}\t${r.locationsUpserted}\t\t${duration}\t\t${status}`
    );
  }
}

/** Query and print the current totals from the discovery and dispensary tables. */
async function printDatabaseTotals(pool: Pool): Promise<void> {
  console.log('');
  console.log('='.repeat(70));
  console.log('DATABASE TOTALS');
  console.log('='.repeat(70));

  const { rows: locationCounts } = await pool.query(`
    SELECT
      state_code,
      COUNT(*) as count,
      COUNT(CASE WHEN status = 'discovered' THEN 1 END) as discovered,
      COUNT(CASE WHEN status = 'promoted' THEN 1 END) as promoted
    FROM dutchie_discovery_locations
    WHERE active = TRUE
    GROUP BY state_code
    ORDER BY count DESC
  `);

  console.log('State\tTotal\tDiscovered\tPromoted');
  console.log('-'.repeat(50));
  locationCounts.forEach((row: LocationCountRow) => {
    console.log(`${row.state_code || 'N/A'}\t${row.count}\t${row.discovered}\t\t${row.promoted}`);
  });

  const { rows: totalRow } = await pool.query(`
    SELECT COUNT(*) as total
    FROM dutchie_discovery_locations
    WHERE active = TRUE
  `);
  console.log('-'.repeat(50));
  console.log(`TOTAL: ${totalRow[0].total} locations in discovery table`);

  const { rows: dispRow } = await pool.query(`
    SELECT COUNT(*) as total
    FROM dispensaries
    WHERE menu_type = 'dutchie'
  `);
  console.log(`DISPENSARIES: ${dispRow[0].total} Dutchie dispensaries in main table`);
}

/**
 * Script entry point: decide which states to process (honouring --states,
 * --start-from, --reset and saved progress), run discovery for each state in
 * turn, then print a run summary and database totals.
 *
 * Exits the process with status 1 on invalid arguments or a missing
 * DATABASE_URL, and with status 0 when saved progress shows that every state
 * has already been completed.
 */
async function main(): Promise<void> {
  const flags = parseArgs();
  const dryRun = Boolean(flags['dry-run']);
  const verbose = Boolean(flags.verbose);
  const reset = Boolean(flags.reset);
  // --resume is accepted but needs no handling: auto-resume is already the
  // default whenever a progress file exists.

  let startFrom = readStringFlag(flags, 'start-from');
  const statesFlag = readStringFlag(flags, 'states');
  const specificStates = statesFlag
    ? statesFlag
        .split(',')
        .map((s) => s.trim().toUpperCase())
        .filter((s) => s.length > 0) // tolerate stray commas such as "AZ,,CA"
    : null;

  // Handle reset flag
  if (reset) {
    clearProgress();
  }

  // Determine which states to process
  let statesToProcess = specificStates || US_STATES;

  // Check for saved progress (auto-resume unless --reset, --start-from or --states specified).
  // With --reset the saved progress is ignored even if deleting the file failed.
  const savedProgress = reset ? null : loadProgress();
  if (savedProgress && !startFrom && !specificStates) {
    const nextIndex = savedProgress.lastCompletedIndex + 1;
    if (nextIndex < US_STATES.length) {
      startFrom = US_STATES[nextIndex];
      console.log(`[Progress] Resuming from saved progress`);
      console.log(
        `[Progress] Last completed: ${savedProgress.lastCompletedState} (${savedProgress.completedStates.length} states done)`
      );
      console.log(`[Progress] Started at: ${savedProgress.startedAt}`);
      console.log(`[Progress] Last update: ${savedProgress.updatedAt}`);
      console.log('');
    } else {
      console.log(`[Progress] All states already completed! Use --reset to start over.`);
      process.exit(0);
    }
  }

  if (startFrom) {
    const startIndex = statesToProcess.indexOf(startFrom.toUpperCase());
    if (startIndex === -1) {
      console.error(`ERROR: State ${startFrom} not found in list`);
      process.exit(1);
    }
    statesToProcess = statesToProcess.slice(startIndex);
    console.log(`Starting from ${startFrom}, ${statesToProcess.length} states remaining`);
  }

  // Initialize progress tracking
  const progress: ProgressData = savedProgress || {
    lastCompletedState: null,
    lastCompletedIndex: -1,
    startedAt: new Date().toISOString(),
    updatedAt: new Date().toISOString(),
    completedStates: [],
  };

  console.log('='.repeat(70));
  console.log('DUTCHIE ALL-STATES DISCOVERY');
  console.log('='.repeat(70));
  console.log(`Mode: ${dryRun ? 'DRY RUN' : 'LIVE'}`);
  console.log(`States to process: ${statesToProcess.length}`);
  console.log(`States: ${statesToProcess.join(', ')}`);
  console.log('');

  // Create database pool
  const connectionString = process.env.DATABASE_URL;
  if (!connectionString) {
    console.error('ERROR: DATABASE_URL environment variable is required');
    process.exit(1);
  }

  const pool = new Pool({ connectionString });
  const results: DiscoveryResult[] = [];
  const startTime = Date.now();

  try {
    for (let i = 0; i < statesToProcess.length; i++) {
      const stateCode = statesToProcess[i];
      const isLastState = i === statesToProcess.length - 1;

      console.log('');
      console.log('─'.repeat(70));
      console.log(`[${i + 1}/${statesToProcess.length}] Discovering ${stateCode}...`);
      console.log('─'.repeat(70));

      try {
        const result = await discoverState(pool, stateCode, {
          dryRun,
          verbose,
          cityLimit: CITY_LIMIT, // Allow up to 200 cities per state
        });

        const discoveryResult: DiscoveryResult = {
          stateCode,
          citiesCrawled: result.locations.length,
          locationsFound: result.totalLocationsFound,
          locationsUpserted: result.totalLocationsUpserted,
          durationMs: result.durationMs,
          errors: [],
        };

        // Collect errors from city results
        result.locations.forEach((loc) => {
          if (loc.errors && loc.errors.length > 0) {
            discoveryResult.errors.push(...loc.errors);
          }
        });

        results.push(discoveryResult);

        // Save progress after each successful state. Skipped for dry runs
        // (nothing was actually persisted, so a later live run must not skip
        // this state) and for codes outside US_STATES (indexOf would be -1,
        // which would corrupt the resume position).
        const stateIndex = US_STATES.indexOf(stateCode);
        const progressTracked = !dryRun && stateIndex !== -1;
        if (progressTracked) {
          progress.lastCompletedState = stateCode;
          progress.lastCompletedIndex = stateIndex;
          if (!progress.completedStates.includes(stateCode)) {
            progress.completedStates.push(stateCode);
          }
          saveProgress(progress);
        }

        console.log(`\n[${stateCode}] COMPLETE:`);
        console.log(` Cities crawled: ${discoveryResult.citiesCrawled}`);
        console.log(` Locations found: ${discoveryResult.locationsFound}`);
        console.log(` Locations upserted: ${discoveryResult.locationsUpserted}`);
        console.log(` Duration: ${(discoveryResult.durationMs / 1000).toFixed(1)}s`);
        if (progressTracked) {
          console.log(` Progress saved (${progress.completedStates.length}/${US_STATES.length} states)`);
        }

        if (discoveryResult.errors.length > 0) {
          console.log(` Errors: ${discoveryResult.errors.length}`);
        }

        // Delay between states to avoid rate limiting
        if (!isLastState) {
          const delaySeconds = DELAY_BETWEEN_STATES_SECONDS;
          console.log(`\n Waiting ${delaySeconds}s before next state...`);
          await sleep(delaySeconds * 1000);
        }
      } catch (error: unknown) {
        // Anything can be thrown in JS; only Error instances carry .message.
        const message = error instanceof Error ? error.message : String(error);
        console.error(`\n[${stateCode}] ERROR: ${message}`);
        results.push({
          stateCode,
          citiesCrawled: 0,
          locationsFound: 0,
          locationsUpserted: 0,
          durationMs: 0,
          errors: [message],
        });

        // Continue to next state even on error
        if (!isLastState) {
          await sleep(ERROR_BACKOFF_MS);
        }
      }
    }

    // Print summary
    printSummary(results, Date.now() - startTime);

    // Final count from database
    await printDatabaseTotals(pool);

    // Clear progress file on successful completion of all states.
    // Never done for dry runs, which must leave live progress untouched.
    const sweepFinished =
      results.length === US_STATES.length ||
      (savedProgress !== null && progress.completedStates.length === US_STATES.length);
    if (!dryRun && sweepFinished) {
      clearProgress();
      console.log('\n[Progress] All states completed! Progress file cleared.');
    }
  } finally {
    await pool.end();
  }
}

main().catch((error) => {
  console.error('Fatal error:', error);
  process.exit(1);
});