/**
 * Dutchie Discovery Crawler
 *
 * Main orchestrator for the Dutchie store discovery pipeline.
 *
 * AUTOMATED FLOW (as of 2025-01):
 * 1. Fetch cities dynamically from Dutchie GraphQL (getAllCitiesByState)
 * 2. For each city, discover store locations via ConsumerDispensaries query
 * 3. Upsert locations to dutchie_discovery_locations (keyed by platform_location_id)
 * 4. AUTO-VALIDATE: Check required fields (name, city, state, platform_menu_url, platform_location_id)
 * 5. AUTO-PROMOTE: Valid locations are upserted to dispensaries table with crawl_enabled=true
 * 6. All actions logged to dutchie_promotion_log for audit
 *
 * Tables involved:
 * - dutchie_discovery_cities: Known cities for each state
 * - dutchie_discovery_locations: Raw discovered store data
 * - dispensaries: Canonical store records (promoted from discovery)
 * - dutchie_promotion_log: Audit trail for validation/promotion
 *
 * Usage:
 *   npx tsx src/scripts/run-discovery.ts discover:state AZ
 *   npx tsx src/scripts/run-discovery.ts discover:state CA
 */

import { Pool } from 'pg';
import {
  FullDiscoveryResult,
  LocationDiscoveryResult,
  DiscoveryCity,
} from './types';
import {
  discoverCities,
  getCitiesToCrawl,
  getCityBySlug,
  seedKnownCities,
} from './city-discovery';
import {
  discoverLocationsForCity,
  getCitiesForState,
} from './location-discovery';
import { promoteDiscoveredLocations } from './promotion';

/** Pause inserted between consecutive city crawls, in milliseconds. */
const CITY_CRAWL_DELAY_MS = 2000;

/** Maximum number of rejected records echoed to the console after promotion. */
const MAX_REJECTIONS_LOGGED = 5;

/**
 * Extract a printable message from an arbitrary thrown value.
 * Anything can be thrown in JavaScript, so the value is narrowed before use.
 */
function toErrorMessage(error: unknown): string {
  if (error instanceof Error) {
    return error.message;
  }
  if (typeof error === 'object' && error !== null && 'message' in error) {
    return String((error as { message: unknown }).message);
  }
  return String(error);
}

// ============================================================
// FULL DISCOVERY
// ============================================================

/** Options accepted by {@link runFullDiscovery}. */
export interface DiscoveryCrawlerOptions {
  /** Forwarded to city/location discovery; also disables the promotion step. */
  dryRun?: boolean;
  /** Forwarded to city/location discovery. */
  verbose?: boolean;
  /** Restrict the crawl (and the promotion step) to a single state. */
  stateCode?: string;
  /** Country filter passed to getCitiesToCrawl. Defaults to 'US'. */
  countryCode?: string;
  /** Maximum number of cities to crawl. Defaults to 50. */
  cityLimit?: number;
  /** When true, skip discoverCities and use the cities already stored. */
  skipCityDiscovery?: boolean;
  /** Passed to getCitiesToCrawl. Defaults to true. */
  onlyStale?: boolean;
  /** Passed to getCitiesToCrawl. Defaults to 7. */
  staleDays?: number;
}

/**
 * Run the full discovery pipeline:
 * 1. Discover/refresh cities (unless `skipCityDiscovery` is set)
 * 2. Select the cities to crawl
 * 3. For each city, discover locations (a failing city is recorded and skipped)
 * 4. Promote discovered locations (only when not a dry run and at least one
 *    location was upserted)
 *
 * @param pool    pg connection pool handed to the discovery helpers
 * @param options see {@link DiscoveryCrawlerOptions}
 * @returns city result, per-city location results, totals and the duration.
 *          `durationMs` is measured before the promotion step, so promotion
 *          time is not included.
 */
export async function runFullDiscovery(
  pool: Pool,
  options: DiscoveryCrawlerOptions = {}
): Promise<FullDiscoveryResult> {
  const startTime = Date.now();
  const {
    dryRun = false,
    verbose = false,
    stateCode,
    countryCode = 'US',
    cityLimit = 50,
    skipCityDiscovery = false,
    onlyStale = true,
    staleDays = 7,
  } = options;

  console.log('='.repeat(60));
  console.log('DUTCHIE DISCOVERY CRAWLER');
  console.log('='.repeat(60));
  console.log(`Mode: ${dryRun ? 'DRY RUN' : 'LIVE'}`);
  if (stateCode) console.log(`State: ${stateCode}`);
  console.log(`Country: ${countryCode}`);
  console.log(`City limit: ${cityLimit}`);
  console.log('');

  // Step 1: Discover/refresh cities
  let cityResult = {
    citiesFound: 0,
    citiesUpserted: 0,
    citiesSkipped: 0,
    errors: [] as string[],
    durationMs: 0,
  };

  if (!skipCityDiscovery) {
    console.log('[Discovery] Step 1: Discovering cities...');
    cityResult = await discoverCities(pool, { dryRun, verbose });
  } else {
    console.log('[Discovery] Step 1: Skipping city discovery (using existing cities)');
  }

  // Step 2: Get cities to crawl
  console.log('[Discovery] Step 2: Getting cities to crawl...');
  const cities = await getCitiesToCrawl(pool, {
    stateCode,
    countryCode,
    limit: cityLimit,
    onlyStale,
    staleDays,
  });
  console.log(`[Discovery] Found ${cities.length} cities to crawl`);

  // Step 3: Discover locations for each city
  console.log('[Discovery] Step 3: Discovering locations...');
  const locationResults: LocationDiscoveryResult[] = [];
  let totalLocationsFound = 0;
  let totalLocationsUpserted = 0;

  for (const [index, city] of cities.entries()) {
    console.log(`\n[Discovery] City ${index + 1}/${cities.length}: ${city.cityName}, ${city.stateCode}`);

    try {
      const result = await discoverLocationsForCity(pool, city, { dryRun, verbose });
      locationResults.push(result);
      totalLocationsFound += result.locationsFound;
      totalLocationsUpserted += result.locationsUpserted;
    } catch (error: unknown) {
      // One failing city must not abort the run: record it and continue.
      const message = toErrorMessage(error);
      console.error(`[Discovery] Error crawling ${city.cityName}: ${message}`);
      locationResults.push({
        cityId: city.id,
        citySlug: city.citySlug,
        locationsFound: 0,
        locationsUpserted: 0,
        locationsNew: 0,
        locationsUpdated: 0,
        errors: [message],
        durationMs: 0,
      });
    }

    // Rate limiting between cities. Done after the try/catch so the pause
    // also applies when a city failed; otherwise the next request would be
    // fired immediately after an error.
    if (index < cities.length - 1) {
      await new Promise<void>((resolve) => setTimeout(resolve, CITY_CRAWL_DELAY_MS));
    }
  }

  // Measured here, i.e. before the promotion step below.
  const durationMs = Date.now() - startTime;

  // Summary
  console.log('\n' + '='.repeat(60));
  console.log('DISCOVERY COMPLETE');
  console.log('='.repeat(60));
  console.log(`Duration: ${(durationMs / 1000).toFixed(1)}s`);
  console.log('');
  console.log('Cities:');
  console.log(` Discovered: ${cityResult.citiesFound}`);
  console.log(` Upserted: ${cityResult.citiesUpserted}`);
  console.log(` Crawled: ${cities.length}`);
  console.log('');
  console.log('Locations:');
  console.log(` Found: ${totalLocationsFound}`);
  console.log(` Upserted: ${totalLocationsUpserted}`);
  console.log('');

  const totalErrors =
    cityResult.errors.length +
    locationResults.reduce((sum, r) => sum + r.errors.length, 0);
  if (totalErrors > 0) {
    console.log(`Errors: ${totalErrors}`);
  }

  // Step 4: Auto-validate and promote discovered locations
  if (!dryRun && totalLocationsUpserted > 0) {
    console.log('\n[Discovery] Step 4: Auto-promoting discovered locations...');
    // NOTE(review): the second argument is presumably a dry-run flag — confirm
    // against promoteDiscoveredLocations in ./promotion.
    const promotionResult = await promoteDiscoveredLocations(stateCode, false);
    console.log(`[Discovery] Promotion complete:`);
    console.log(` Created: ${promotionResult.created} new dispensaries`);
    console.log(` Updated: ${promotionResult.updated} existing dispensaries`);
    console.log(` Rejected: ${promotionResult.rejected} (validation failed)`);

    const rejectedCount = promotionResult.rejectedRecords.length;
    if (rejectedCount > 0) {
      console.log(` Rejection reasons:`);
      // Only the first few rejections are printed to keep the log readable.
      for (const r of promotionResult.rejectedRecords.slice(0, MAX_REJECTIONS_LOGGED)) {
        console.log(` - ${r.name}: ${r.errors.join(', ')}`);
      }
      if (rejectedCount > MAX_REJECTIONS_LOGGED) {
        console.log(` ... and ${promotionResult.rejectedRecords.length - 5} more`);
      }
    }
  }

  return {
    cities: cityResult,
    locations: locationResults,
    totalLocationsFound,
    totalLocationsUpserted,
    durationMs,
  };
}

// ============================================================
// SINGLE CITY DISCOVERY
// ============================================================

/**
 * Discover locations for a single city by slug.
 *
 * If the city is not found and a `stateCode` is given, the city is seeded
 * first (its display name is derived by title-casing the slug) and looked up
 * again.
 *
 * NOTE(review): seeding happens regardless of `dryRun`; if seedKnownCities
 * writes to the database (presumably — confirm), a dry run still persists the
 * city.
 *
 * @param pool     pg connection pool
 * @param citySlug slug of the city to crawl
 * @param options  optional state/country filter plus dryRun/verbose flags
 * @returns the location discovery result, or `null` when the city does not
 *          exist and could not be created
 */
export async function discoverCity(
  pool: Pool,
  citySlug: string,
  options: {
    stateCode?: string;
    countryCode?: string;
    dryRun?: boolean;
    verbose?: boolean;
  } = {}
): Promise<LocationDiscoveryResult | null> {
  const { stateCode, countryCode = 'US', dryRun = false, verbose = false } = options;

  // Find the city
  let city = await getCityBySlug(pool, citySlug, stateCode, countryCode);

  if (!city) {
    // Try to create it if we have enough info
    if (stateCode) {
      console.log(`[Discovery] City ${citySlug} not found, creating...`);
      await seedKnownCities(pool, [{
        // "los-angeles" -> "Los Angeles"
        name: citySlug.replace(/-/g, ' ').replace(/\b\w/g, c => c.toUpperCase()),
        slug: citySlug,
        stateCode,
        countryCode,
      }]);
      city = await getCityBySlug(pool, citySlug, stateCode, countryCode);
    }

    if (!city) {
      console.log(`[Discovery] City ${citySlug} not found and could not be created`);
      return null;
    }
  }

  return await discoverLocationsForCity(pool, city, { dryRun, verbose });
}

// ============================================================
// STATE-WIDE DISCOVERY
// ============================================================

/**
 * Seed and discover all cities for a state.
 *
 * Fetches the state's city names via getCitiesForState, seeds them, then runs
 * {@link runFullDiscovery} restricted to that state with city discovery
 * skipped and `onlyStale` disabled.
 *
 * NOTE(review): seeding happens regardless of `dryRun`; if seedKnownCities
 * writes to the database (presumably — confirm), a dry run still persists the
 * seeded cities.
 *
 * @param pool      pg connection pool
 * @param stateCode state to discover (used as given, not normalised)
 * @param options   dryRun/verbose flags and a city limit (default 100)
 * @returns the result of the full discovery run for the state
 */
export async function discoverState(
  pool: Pool,
  stateCode: string,
  options: {
    dryRun?: boolean;
    verbose?: boolean;
    cityLimit?: number;
  } = {}
): Promise<FullDiscoveryResult> {
  const { dryRun = false, verbose = false, cityLimit = 100 } = options;

  console.log(`[Discovery] Discovering state: ${stateCode}`);

  // Dynamically fetch and seed cities for this state
  console.log(`[Discovery] Fetching cities for ${stateCode} from Dutchie...`);
  const cityNames = await getCitiesForState(stateCode);

  if (cityNames.length > 0) {
    const cities = cityNames.map(name => ({
      name,
      // Lower-case, whitespace runs -> '-', then drop anything outside [a-z0-9-].
      slug: name.toLowerCase().replace(/\s+/g, '-').replace(/[^a-z0-9-]/g, ''),
      stateCode,
    }));
    const seeded = await seedKnownCities(pool, cities);
    console.log(`[Discovery] Seeded ${seeded.created} new cities, ${seeded.updated} updated for ${stateCode}`);
  } else {
    console.log(`[Discovery] No cities found for ${stateCode}`);
  }

  // Run full discovery for this state
  return await runFullDiscovery(pool, {
    dryRun,
    verbose,
    stateCode,
    countryCode: 'US',
    cityLimit,
    skipCityDiscovery: true, // Use seeded cities
    onlyStale: false, // Crawl all
  });
}

// ============================================================
// STATISTICS
// ============================================================

/** Aggregate counters returned by {@link getDiscoveryStats}. */
export interface DiscoveryStats {
  cities: {
    total: number;
    crawledLast24h: number;
    neverCrawled: number;
  };
  locations: {
    total: number;
    discovered: number;
    verified: number;
    rejected: number;
    merged: number;
    byState: Array<{ stateCode: string; count: number }>;
  };
}

/**
 * Get discovery statistics.
 *
 * City counters cover all rows of dutchie_discovery_cities; location counters
 * only consider rows of dutchie_discovery_locations with `active = TRUE`.
 * Statuses with no rows are reported as 0.
 *
 * @param pool pg connection pool used for the read-only count queries
 * @returns city and location counters, with locations also grouped by state
 *          (largest first)
 */
export async function getDiscoveryStats(pool: Pool): Promise<DiscoveryStats> {
  // COUNT(*) values are parsed from their textual form into numbers.
  const toCount = (value: string): number => parseInt(value, 10);

  const [citiesTotal, citiesRecent, citiesNever] = await Promise.all([
    pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_cities'),
    pool.query(`SELECT COUNT(*) as cnt FROM dutchie_discovery_cities WHERE last_crawled_at > NOW() - INTERVAL '24 hours'`),
    pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_cities WHERE last_crawled_at IS NULL'),
  ]);

  const [locsTotal, locsByStatus, locsByState] = await Promise.all([
    pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_locations WHERE active = TRUE'),
    pool.query(` SELECT status, COUNT(*) as cnt FROM dutchie_discovery_locations WHERE active = TRUE GROUP BY status `),
    pool.query(` SELECT state_code, COUNT(*) as cnt FROM dutchie_discovery_locations WHERE active = TRUE AND state_code IS NOT NULL GROUP BY state_code ORDER BY cnt DESC `),
  ]);

  // status -> number of active locations in that status
  const statusCounts: Record<string, number> = {};
  for (const row of locsByStatus.rows) {
    statusCounts[row.status] = toCount(row.cnt);
  }

  return {
    cities: {
      total: toCount(citiesTotal.rows[0].cnt),
      crawledLast24h: toCount(citiesRecent.rows[0].cnt),
      neverCrawled: toCount(citiesNever.rows[0].cnt),
    },
    locations: {
      total: toCount(locsTotal.rows[0].cnt),
      discovered: statusCounts.discovered || 0,
      verified: statusCounts.verified || 0,
      rejected: statusCounts.rejected || 0,
      merged: statusCounts.merged || 0,
      byState: locsByState.rows.map(row => ({
        stateCode: row.state_code,
        count: toCount(row.cnt),
      })),
    },
  };
}