Major additions: - Multi-state expansion: states table, StateSelector, NationalDashboard, StateHeatmap, CrossStateCompare - Orchestrator services: trace service, error taxonomy, retry manager, proxy rotator - Discovery system: dutchie discovery service, geo validation, city seeding scripts - Analytics infrastructure: analytics v2 routes, brand/pricing/stores intelligence pages - Local development: setup-local.sh starts all 5 services (postgres, backend, cannaiq, findadispo, findagram) - Migrations 037-056: crawler profiles, states, analytics indexes, worker metadata Frontend pages added: - Discovery, ChainsDashboard, IntelligenceBrands, IntelligencePricing, IntelligenceStores - StateHeatmap, CrossStateCompare, SyncInfoPanel Components added: - StateSelector, OrchestratorTraceModal, WorkflowStepper 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
328 lines
9.1 KiB
TypeScript
328 lines
9.1 KiB
TypeScript
/**
 * Dutchie Discovery Crawler
 *
 * Main orchestrator for the Dutchie store discovery pipeline.
 *
 * Flow:
 * 1. Discover cities from Dutchie (or use seeded cities)
 * 2. For each city, discover store locations
 * 3. Upsert all data to discovery tables
 * 4. Admin verifies locations manually
 * 5. Verified locations are promoted to canonical dispensaries
 *
 * This module does NOT create canonical dispensaries automatically.
 */
|
|
|
|
import { Pool } from 'pg';
|
|
import {
|
|
FullDiscoveryResult,
|
|
LocationDiscoveryResult,
|
|
DiscoveryCity,
|
|
} from './types';
|
|
import {
|
|
discoverCities,
|
|
getCitiesToCrawl,
|
|
getCityBySlug,
|
|
seedKnownCities,
|
|
ARIZONA_CITIES,
|
|
} from './city-discovery';
|
|
import {
|
|
discoverLocationsForCity,
|
|
} from './location-discovery';
|
|
|
|
// ============================================================
|
|
// FULL DISCOVERY
|
|
// ============================================================
|
|
|
|
/**
 * Options accepted by `runFullDiscovery`.
 *
 * Every field is optional. The defaults quoted below are the ones
 * `runFullDiscovery` applies when a field is omitted.
 */
export interface DiscoveryCrawlerOptions {
  // ---- Which cities to crawl ----

  /** State filter handed to `getCitiesToCrawl`. No default. */
  stateCode?: string;

  /** Country filter handed to `getCitiesToCrawl`. Default: `'US'`. */
  countryCode?: string;

  /** Passed to `getCitiesToCrawl` as its `limit`. Default: `50`. */
  cityLimit?: number;

  /**
   * Passed through to `getCitiesToCrawl`. Default: `true`.
   * NOTE(review): presumably restricts the crawl to cities whose last crawl
   * is older than `staleDays` — confirm against `getCitiesToCrawl`.
   */
  onlyStale?: boolean;

  /** Passed through to `getCitiesToCrawl` alongside `onlyStale`. Default: `7`. */
  staleDays?: number;

  // ---- How the run behaves ----

  /** Forwarded to the city and location discovery helpers. Default: `false`. */
  dryRun?: boolean;

  /** Forwarded to the city and location discovery helpers. Default: `false`. */
  verbose?: boolean;

  /** When `true`, the `discoverCities` step is not run. Default: `false`. */
  skipCityDiscovery?: boolean;
}
|
|
|
|
/**
 * Run the full discovery pipeline:
 * 1. Discover/refresh cities (skipped when `skipCityDiscovery` is set)
 * 2. Select the cities to crawl via `getCitiesToCrawl`
 * 3. For each selected city, discover locations
 *
 * Cities are crawled one at a time with a fixed pause between them. A failure
 * while crawling one city is logged, recorded as an entry with zero counts and
 * the error message, and does not stop the remaining cities from being
 * crawled. Progress and a closing summary are written to the console.
 *
 * @param pool - Database pool handed to every discovery helper.
 * @param options - Run configuration; omitted fields fall back to the
 *   defaults in the destructuring below.
 * @returns The city-discovery result, one location result per crawled city
 *   (including failed ones), location totals, and the overall duration in
 *   milliseconds.
 */
export async function runFullDiscovery(
  pool: Pool,
  options: DiscoveryCrawlerOptions = {}
): Promise<FullDiscoveryResult> {
  const startTime = Date.now();
  const {
    dryRun = false,
    verbose = false,
    stateCode,
    countryCode = 'US',
    cityLimit = 50,
    skipCityDiscovery = false,
    onlyStale = true,
    staleDays = 7,
  } = options;

  // Pause inserted between consecutive city crawls.
  const interCityDelayMs = 2000;

  console.log('='.repeat(60));
  console.log('DUTCHIE DISCOVERY CRAWLER');
  console.log('='.repeat(60));
  console.log(`Mode: ${dryRun ? 'DRY RUN' : 'LIVE'}`);
  if (stateCode) console.log(`State: ${stateCode}`);
  console.log(`Country: ${countryCode}`);
  console.log(`City limit: ${cityLimit}`);
  console.log('');

  // Step 1: Discover/refresh cities.
  // The zeroed placeholder is what gets reported when the step is skipped.
  let cityResult = {
    citiesFound: 0,
    citiesUpserted: 0,
    citiesSkipped: 0,
    errors: [] as string[],
    durationMs: 0,
  };

  if (!skipCityDiscovery) {
    console.log('[Discovery] Step 1: Discovering cities...');
    cityResult = await discoverCities(pool, { dryRun, verbose });
  } else {
    console.log('[Discovery] Step 1: Skipping city discovery (using existing cities)');
  }

  // Step 2: Get cities to crawl
  console.log('[Discovery] Step 2: Getting cities to crawl...');
  const cities = await getCitiesToCrawl(pool, {
    stateCode,
    countryCode,
    limit: cityLimit,
    onlyStale,
    staleDays,
  });

  console.log(`[Discovery] Found ${cities.length} cities to crawl`);

  // Step 3: Discover locations for each city
  console.log('[Discovery] Step 3: Discovering locations...');
  const locationResults: LocationDiscoveryResult[] = [];
  let totalLocationsFound = 0;
  let totalLocationsUpserted = 0;

  for (let i = 0; i < cities.length; i++) {
    const city = cities[i];
    console.log(`\n[Discovery] City ${i + 1}/${cities.length}: ${city.cityName}, ${city.stateCode}`);

    const cityStartTime = Date.now();

    try {
      const result = await discoverLocationsForCity(pool, city, { dryRun, verbose });
      locationResults.push(result);
      totalLocationsFound += result.locationsFound;
      totalLocationsUpserted += result.locationsUpserted;
    } catch (error: unknown) {
      // Anything can be thrown, so only read `.message` from real Errors.
      const message = error instanceof Error ? error.message : String(error);
      console.error(`[Discovery] Error crawling ${city.cityName}: ${message}`);
      locationResults.push({
        cityId: city.id,
        citySlug: city.citySlug,
        locationsFound: 0,
        locationsUpserted: 0,
        locationsNew: 0,
        locationsUpdated: 0,
        errors: [message],
        // Report how long the failed attempt took rather than a flat 0.
        durationMs: Date.now() - cityStartTime,
      });
    }

    // Rate limiting between cities. Kept outside the try block so the pause
    // also applies after a failed city; no pause is needed after the last one.
    if (i < cities.length - 1) {
      await new Promise<void>((resolve) => setTimeout(resolve, interCityDelayMs));
    }
  }

  const durationMs = Date.now() - startTime;

  // Summary
  console.log('\n' + '='.repeat(60));
  console.log('DISCOVERY COMPLETE');
  console.log('='.repeat(60));
  console.log(`Duration: ${(durationMs / 1000).toFixed(1)}s`);
  console.log('');
  console.log('Cities:');
  console.log(` Discovered: ${cityResult.citiesFound}`);
  console.log(` Upserted: ${cityResult.citiesUpserted}`);
  console.log(` Crawled: ${cities.length}`);
  console.log('');
  console.log('Locations:');
  console.log(` Found: ${totalLocationsFound}`);
  console.log(` Upserted: ${totalLocationsUpserted}`);
  console.log('');

  // Errors from the city step plus every per-city location result.
  const totalErrors =
    cityResult.errors.length +
    locationResults.reduce((sum, r) => sum + r.errors.length, 0);
  if (totalErrors > 0) {
    console.log(`Errors: ${totalErrors}`);
  }

  return {
    cities: cityResult,
    locations: locationResults,
    totalLocationsFound,
    totalLocationsUpserted,
    durationMs,
  };
}
|
|
|
|
// ============================================================
|
|
// SINGLE CITY DISCOVERY
|
|
// ============================================================
|
|
|
|
/**
 * Discover locations for one city, identified by its slug.
 *
 * The city is looked up with `getCityBySlug`. If nothing matches and a
 * `stateCode` was supplied, a city record is seeded from the slug (its name is
 * the slug with hyphens turned into spaces and the first letter of each word
 * upper-cased) and the lookup is repeated.
 *
 * NOTE(review): the seeding call is made whether or not `dryRun` is set —
 * confirm that is intended.
 *
 * @param pool - Database pool handed to the lookup, seeding and discovery helpers.
 * @param citySlug - Slug of the city to crawl.
 * @param options - `countryCode` defaults to `'US'`; `dryRun` and `verbose`
 *   default to `false` and are forwarded to `discoverLocationsForCity`.
 * @returns The location discovery result, or `null` when the city is neither
 *   found nor could be created.
 */
export async function discoverCity(
  pool: Pool,
  citySlug: string,
  options: {
    stateCode?: string;
    countryCode?: string;
    dryRun?: boolean;
    verbose?: boolean;
  } = {}
): Promise<LocationDiscoveryResult | null> {
  const {
    stateCode,
    countryCode = 'US',
    dryRun = false,
    verbose = false,
  } = options;

  // The same lookup is needed before and after seeding.
  const lookupCity = () => getCityBySlug(pool, citySlug, stateCode, countryCode);

  let city = await lookupCity();

  // Creating the city is only attempted when the state is known.
  if (!city && stateCode) {
    console.log(`[Discovery] City ${citySlug} not found, creating...`);
    await seedKnownCities(pool, [
      {
        name: citySlug
          .replace(/-/g, ' ')
          .replace(/\b\w/g, (firstLetter) => firstLetter.toUpperCase()),
        slug: citySlug,
        stateCode,
        countryCode,
      },
    ]);
    city = await lookupCity();
  }

  if (!city) {
    console.log(`[Discovery] City ${citySlug} not found and could not be created`);
    return null;
  }

  return await discoverLocationsForCity(pool, city, { dryRun, verbose });
}
|
|
|
|
// ============================================================
|
|
// STATE-WIDE DISCOVERY
|
|
// ============================================================
|
|
|
|
/**
 * Seed the known cities of a state, then crawl every city of that state.
 *
 * Only `'AZ'` (exact match) has a built-in seed list, `ARIZONA_CITIES`; for any
 * other value nothing is seeded and the crawl covers whatever
 * `getCitiesToCrawl` returns for that state. The crawl itself is delegated to
 * `runFullDiscovery` with city discovery skipped, the country fixed to `'US'`
 * and `onlyStale` turned off.
 *
 * NOTE(review): seeding runs whether or not `dryRun` is set — confirm that is
 * intended.
 *
 * @param pool - Database pool handed to the seeding and discovery helpers.
 * @param stateCode - State to crawl.
 * @param options - `dryRun` and `verbose` default to `false`; `cityLimit`
 *   defaults to `100`.
 * @returns The result of the delegated `runFullDiscovery` call.
 */
export async function discoverState(
  pool: Pool,
  stateCode: string,
  options: {
    dryRun?: boolean;
    verbose?: boolean;
    cityLimit?: number;
  } = {}
): Promise<FullDiscoveryResult> {
  const {
    dryRun = false,
    verbose = false,
    cityLimit = 100,
  } = options;

  console.log(`[Discovery] Discovering state: ${stateCode}`);

  // Seed known cities for this state
  const hasBuiltInSeedList = stateCode === 'AZ';
  if (hasBuiltInSeedList) {
    console.log('[Discovery] Seeding Arizona cities...');
    const { created, updated } = await seedKnownCities(pool, ARIZONA_CITIES);
    console.log(`[Discovery] Seeded ${created} new cities, ${updated} updated`);
  }

  // Run full discovery for this state
  const stateResult = await runFullDiscovery(pool, {
    stateCode,
    countryCode: 'US',
    cityLimit,
    dryRun,
    verbose,
    skipCityDiscovery: true, // Use seeded cities
    onlyStale: false, // Crawl all
  });

  return stateResult;
}
|
|
|
|
// ============================================================
|
|
// STATISTICS
|
|
// ============================================================
|
|
|
|
/**
 * Snapshot of the discovery tables, as produced by `getDiscoveryStats`.
 */
export interface DiscoveryStats {
  /** Counts over `dutchie_discovery_cities`. */
  cities: {
    /** All rows. */
    total: number;
    /** Rows whose `last_crawled_at` falls within the last 24 hours. */
    crawledLast24h: number;
    /** Rows whose `last_crawled_at` is NULL. */
    neverCrawled: number;
  };

  /** Counts over active rows of `dutchie_discovery_locations`. */
  locations: {
    /** All active rows. */
    total: number;
    /** Active rows with status `discovered`. */
    discovered: number;
    /** Active rows with status `verified`. */
    verified: number;
    /** Active rows with status `rejected`. */
    rejected: number;
    /** Active rows with status `merged`. */
    merged: number;
    /** Active rows with a non-NULL state, grouped by state, largest count first. */
    byState: { stateCode: string; count: number }[];
  };
}
|
|
|
|
/**
 * Get discovery statistics.
 *
 * Issues six read-only COUNT queries against `dutchie_discovery_cities` and
 * `dutchie_discovery_locations` (the location queries only look at rows with
 * `active = TRUE`) and folds the results into a `DiscoveryStats` object. The
 * queries do not depend on each other, so they are all started together.
 *
 * @param pool - Database pool; only its `query` method is used.
 * @returns City and location counts. A status with no rows is reported as 0.
 * @throws Whatever `pool.query` rejects with, if any of the queries fails.
 */
export async function getDiscoveryStats(pool: Pool): Promise<DiscoveryStats> {
  // Counts are parsed rather than used as-is because the driver may return
  // COUNT(*) as a string (node-postgres does so for bigint by default).
  // A missing or unparsable value is reported as 0 instead of NaN.
  const toCount = (value: unknown): number => {
    const parsed = parseInt(String(value), 10);
    return Number.isNaN(parsed) ? 0 : parsed;
  };

  const [
    citiesTotal,
    citiesRecent,
    citiesNever,
    locsTotal,
    locsByStatus,
    locsByState,
  ] = await Promise.all([
    pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_cities'),
    pool.query(
      `SELECT COUNT(*) as cnt FROM dutchie_discovery_cities WHERE last_crawled_at > NOW() - INTERVAL '24 hours'`
    ),
    pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_cities WHERE last_crawled_at IS NULL'),
    pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_locations WHERE active = TRUE'),
    pool.query(`
      SELECT status, COUNT(*) as cnt
      FROM dutchie_discovery_locations
      WHERE active = TRUE
      GROUP BY status
    `),
    pool.query(`
      SELECT state_code, COUNT(*) as cnt
      FROM dutchie_discovery_locations
      WHERE active = TRUE AND state_code IS NOT NULL
      GROUP BY state_code
      ORDER BY cnt DESC
    `),
  ]);

  // status -> number of active locations currently in that status
  const statusCounts = new Map<string, number>();
  for (const row of locsByStatus.rows) {
    statusCounts.set(row.status, toCount(row.cnt));
  }

  return {
    cities: {
      total: toCount(citiesTotal.rows[0]?.cnt),
      crawledLast24h: toCount(citiesRecent.rows[0]?.cnt),
      neverCrawled: toCount(citiesNever.rows[0]?.cnt),
    },
    locations: {
      total: toCount(locsTotal.rows[0]?.cnt),
      discovered: statusCounts.get('discovered') ?? 0,
      verified: statusCounts.get('verified') ?? 0,
      rejected: statusCounts.get('rejected') ?? 0,
      merged: statusCounts.get('merged') ?? 0,
      byState: locsByState.rows.map((row) => ({
        stateCode: row.state_code,
        count: toCount(row.cnt),
      })),
    },
  };
}
|