feat: Add v2 architecture with multi-state support and orchestrator services
Major additions:
- Multi-state expansion: states table, StateSelector, NationalDashboard, StateHeatmap, CrossStateCompare
- Orchestrator services: trace service, error taxonomy, retry manager, proxy rotator
- Discovery system: Dutchie discovery service, geo validation, city seeding scripts
- Analytics infrastructure: analytics v2 routes, brand/pricing/stores intelligence pages
- Local development: setup-local.sh starts all 5 services (postgres, backend, cannaiq, findadispo, findagram)
- Migrations 037-056: crawler profiles, states, analytics indexes, worker metadata

Frontend pages added:
- Discovery, ChainsDashboard, IntelligenceBrands, IntelligencePricing, IntelligenceStores
- StateHeatmap, CrossStateCompare, SyncInfoPanel

Components added:
- StateSelector, OrchestratorTraceModal, WorkflowStepper

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
327
backend/src/discovery/discovery-crawler.ts
Normal file
327
backend/src/discovery/discovery-crawler.ts
Normal file
@@ -0,0 +1,327 @@
|
||||
/**
|
||||
* Dutchie Discovery Crawler
|
||||
*
|
||||
* Main orchestrator for the Dutchie store discovery pipeline.
|
||||
*
|
||||
* Flow:
|
||||
* 1. Discover cities from Dutchie (or use seeded cities)
|
||||
* 2. For each city, discover store locations
|
||||
* 3. Upsert all data to discovery tables
|
||||
* 4. Admin verifies locations manually
|
||||
* 5. Verified locations are promoted to canonical dispensaries
|
||||
*
|
||||
* This module does NOT create canonical dispensaries automatically.
|
||||
*/
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import {
|
||||
FullDiscoveryResult,
|
||||
LocationDiscoveryResult,
|
||||
DiscoveryCity,
|
||||
} from './types';
|
||||
import {
|
||||
discoverCities,
|
||||
getCitiesToCrawl,
|
||||
getCityBySlug,
|
||||
seedKnownCities,
|
||||
ARIZONA_CITIES,
|
||||
} from './city-discovery';
|
||||
import {
|
||||
discoverLocationsForCity,
|
||||
} from './location-discovery';
|
||||
|
||||
// ============================================================
|
||||
// FULL DISCOVERY
|
||||
// ============================================================
|
||||
|
||||
/**
 * Options accepted by {@link runFullDiscovery}.
 *
 * Every field is optional; the defaults listed below are the ones
 * `runFullDiscovery` applies when a field is omitted.
 */
export interface DiscoveryCrawlerOptions {
  /** Forwarded to city and location discovery; logged as "DRY RUN" vs "LIVE". Default: `false`. */
  dryRun?: boolean;

  /** Forwarded to city and location discovery. Default: `false`. */
  verbose?: boolean;

  /** State filter passed to the city selection query. No default. */
  stateCode?: string;

  /** Country filter passed to the city selection query. Default: `'US'`. */
  countryCode?: string;

  /** Maximum number of cities to select for crawling. Default: `50`. */
  cityLimit?: number;

  /** When `true`, the city discovery step is skipped and existing cities are used. Default: `false`. */
  skipCityDiscovery?: boolean;

  /** Passed to the city selection query together with `staleDays`. Default: `true`. */
  onlyStale?: boolean;

  /** Passed to the city selection query together with `onlyStale`. Default: `7`. */
  staleDays?: number;
}
|
||||
|
||||
/** Rate-limiting pause, in milliseconds, inserted between consecutive city crawls. */
const INTER_CITY_DELAY_MS = 2000;

/** Turn an arbitrary thrown value into a printable message. */
function describeDiscoveryError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Run the full discovery pipeline:
 * 1. Discover/refresh cities (unless `skipCityDiscovery` is set)
 * 2. Select the cities to crawl
 * 3. For each selected city, discover locations
 *
 * A failure while crawling one city does not abort the run: the error is
 * logged and recorded as a zero-count entry in `locations`, and the loop
 * continues with the next city.
 *
 * @param pool - Database pool handed to every discovery helper.
 * @param options - See {@link DiscoveryCrawlerOptions}. Defaults: `dryRun=false`,
 *   `verbose=false`, `countryCode='US'`, `cityLimit=50`,
 *   `skipCityDiscovery=false`, `onlyStale=true`, `staleDays=7`.
 * @returns The city discovery result, one location result per crawled city,
 *   the summed found/upserted location counts, and the total run duration.
 */
export async function runFullDiscovery(
  pool: Pool,
  options: DiscoveryCrawlerOptions = {}
): Promise<FullDiscoveryResult> {
  const startTime = Date.now();
  const {
    dryRun = false,
    verbose = false,
    stateCode,
    countryCode = 'US',
    cityLimit = 50,
    skipCityDiscovery = false,
    onlyStale = true,
    staleDays = 7,
  } = options;

  console.log('='.repeat(60));
  console.log('DUTCHIE DISCOVERY CRAWLER');
  console.log('='.repeat(60));
  console.log(`Mode: ${dryRun ? 'DRY RUN' : 'LIVE'}`);
  if (stateCode) console.log(`State: ${stateCode}`);
  console.log(`Country: ${countryCode}`);
  console.log(`City limit: ${cityLimit}`);
  console.log('');

  // Step 1: Discover/refresh cities.
  // The zeroed result is what gets reported when city discovery is skipped.
  let cityResult = {
    citiesFound: 0,
    citiesUpserted: 0,
    citiesSkipped: 0,
    errors: [] as string[],
    durationMs: 0,
  };

  if (!skipCityDiscovery) {
    console.log('[Discovery] Step 1: Discovering cities...');
    cityResult = await discoverCities(pool, { dryRun, verbose });
  } else {
    console.log('[Discovery] Step 1: Skipping city discovery (using existing cities)');
  }

  // Step 2: Get cities to crawl
  console.log('[Discovery] Step 2: Getting cities to crawl...');
  const cities = await getCitiesToCrawl(pool, {
    stateCode,
    countryCode,
    limit: cityLimit,
    onlyStale,
    staleDays,
  });

  console.log(`[Discovery] Found ${cities.length} cities to crawl`);

  // Step 3: Discover locations for each city
  console.log('[Discovery] Step 3: Discovering locations...');
  const locationResults: LocationDiscoveryResult[] = [];
  let totalLocationsFound = 0;
  let totalLocationsUpserted = 0;

  for (let i = 0; i < cities.length; i++) {
    const city = cities[i];
    console.log(`\n[Discovery] City ${i + 1}/${cities.length}: ${city.cityName}, ${city.stateCode}`);

    const cityStartTime = Date.now();
    try {
      const result = await discoverLocationsForCity(pool, city, { dryRun, verbose });
      locationResults.push(result);
      totalLocationsFound += result.locationsFound;
      totalLocationsUpserted += result.locationsUpserted;
    } catch (error: unknown) {
      const message = describeDiscoveryError(error);
      console.error(`[Discovery] Error crawling ${city.cityName}: ${message}`);
      // Record the failure so callers still get one entry per crawled city.
      locationResults.push({
        cityId: city.id,
        citySlug: city.citySlug,
        locationsFound: 0,
        locationsUpserted: 0,
        locationsNew: 0,
        locationsUpdated: 0,
        errors: [message],
        durationMs: Date.now() - cityStartTime,
      });
    }

    // Rate limiting between cities. This runs after failures as well as
    // successes so an error is never followed by an immediate request.
    if (i < cities.length - 1) {
      await new Promise<void>((resolve) => setTimeout(resolve, INTER_CITY_DELAY_MS));
    }
  }

  const durationMs = Date.now() - startTime;

  // Summary
  console.log('\n' + '='.repeat(60));
  console.log('DISCOVERY COMPLETE');
  console.log('='.repeat(60));
  console.log(`Duration: ${(durationMs / 1000).toFixed(1)}s`);
  console.log('');
  console.log('Cities:');
  console.log(` Discovered: ${cityResult.citiesFound}`);
  console.log(` Upserted: ${cityResult.citiesUpserted}`);
  console.log(` Crawled: ${cities.length}`);
  console.log('');
  console.log('Locations:');
  console.log(` Found: ${totalLocationsFound}`);
  console.log(` Upserted: ${totalLocationsUpserted}`);
  console.log('');

  const totalErrors =
    cityResult.errors.length +
    locationResults.reduce((sum, r) => sum + r.errors.length, 0);
  if (totalErrors > 0) {
    console.log(`Errors: ${totalErrors}`);
  }

  return {
    cities: cityResult,
    locations: locationResults,
    totalLocationsFound,
    totalLocationsUpserted,
    durationMs,
  };
}
|
||||
|
||||
// ============================================================
|
||||
// SINGLE CITY DISCOVERY
|
||||
// ============================================================
|
||||
|
||||
/**
 * Discover locations for a single city identified by its slug.
 *
 * The city is looked up by slug, state and country. If it is missing and a
 * `stateCode` was supplied, a city record is seeded (display name derived from
 * the slug by replacing hyphens with spaces and capitalising each word) and
 * the lookup is repeated.
 *
 * @param pool - Database pool handed to the lookup, seeding and discovery helpers.
 * @param citySlug - Slug of the city to crawl.
 * @param options - Optional `stateCode`, `countryCode` (default `'US'`),
 *   `dryRun` (default `false`) and `verbose` (default `false`).
 * @returns The location discovery result, or `null` when the city does not
 *   exist and could not be created.
 */
export async function discoverCity(
  pool: Pool,
  citySlug: string,
  options: {
    stateCode?: string;
    countryCode?: string;
    dryRun?: boolean;
    verbose?: boolean;
  } = {}
): Promise<LocationDiscoveryResult | null> {
  const {
    stateCode,
    countryCode = 'US',
    dryRun = false,
    verbose = false,
  } = options;

  const lookupCity = () => getCityBySlug(pool, citySlug, stateCode, countryCode);

  let city = await lookupCity();

  // Unknown city: it can only be created when the caller told us the state.
  if (!city && stateCode) {
    console.log(`[Discovery] City ${citySlug} not found, creating...`);

    // "lake-havasu-city" -> "Lake Havasu City"
    const displayName = citySlug
      .replace(/-/g, ' ')
      .replace(/\b\w/g, (ch) => ch.toUpperCase());

    // NOTE(review): seeding runs even when dryRun is true; confirm whether
    // seedKnownCities should be skipped in dry-run mode.
    await seedKnownCities(pool, [
      { name: displayName, slug: citySlug, stateCode, countryCode },
    ]);

    city = await lookupCity();
  }

  if (!city) {
    console.log(`[Discovery] City ${citySlug} not found and could not be created`);
    return null;
  }

  const result = await discoverLocationsForCity(pool, city, { dryRun, verbose });
  return result;
}
|
||||
|
||||
// ============================================================
|
||||
// STATE-WIDE DISCOVERY
|
||||
// ============================================================
|
||||
|
||||
/**
 * Seed and discover all cities for a state.
 *
 * Known cities are seeded only for Arizona (`stateCode === 'AZ'`); for any
 * other state the run relies on whatever cities already exist. The full
 * pipeline is then run for that state with city discovery skipped and the
 * stale-only filter turned off.
 *
 * @param pool - Database pool handed to the seeding and discovery helpers.
 * @param stateCode - State to crawl; compared case-sensitively against `'AZ'`.
 * @param options - Optional `dryRun` (default `false`), `verbose`
 *   (default `false`) and `cityLimit` (default `100`).
 * @returns The result of the full discovery run for the state.
 */
export async function discoverState(
  pool: Pool,
  stateCode: string,
  options: {
    dryRun?: boolean;
    verbose?: boolean;
    cityLimit?: number;
  } = {}
): Promise<FullDiscoveryResult> {
  const {
    dryRun = false,
    verbose = false,
    cityLimit = 100,
  } = options;

  console.log(`[Discovery] Discovering state: ${stateCode}`);

  // Seed known cities for this state (only Arizona has a seed list here).
  const hasSeedList = stateCode === 'AZ';
  if (hasSeedList) {
    console.log('[Discovery] Seeding Arizona cities...');
    const seedSummary = await seedKnownCities(pool, ARIZONA_CITIES);
    console.log(`[Discovery] Seeded ${seedSummary.created} new cities, ${seedSummary.updated} updated`);
  }

  // Run full discovery for this state.
  const stateRun = await runFullDiscovery(pool, {
    stateCode,
    countryCode: 'US',
    cityLimit,
    dryRun,
    verbose,
    // Use seeded cities instead of running city discovery.
    skipCityDiscovery: true,
    // Crawl all selected cities, not only stale ones.
    onlyStale: false,
  });
  return stateRun;
}
|
||||
|
||||
// ============================================================
|
||||
// STATISTICS
|
||||
// ============================================================
|
||||
|
||||
/**
 * Aggregate counts returned by {@link getDiscoveryStats}.
 */
export interface DiscoveryStats {
  /** Counts over the `dutchie_discovery_cities` table. */
  cities: {
    /** All city rows. */
    total: number;
    /** Cities whose `last_crawled_at` falls within the past 24 hours. */
    crawledLast24h: number;
    /** Cities whose `last_crawled_at` is NULL. */
    neverCrawled: number;
  };

  /** Counts over active rows of the `dutchie_discovery_locations` table. */
  locations: {
    /** All active locations. */
    total: number;
    /** Active locations with status `discovered`. */
    discovered: number;
    /** Active locations with status `verified`. */
    verified: number;
    /** Active locations with status `rejected`. */
    rejected: number;
    /** Active locations with status `merged`. */
    merged: number;
    /** Active locations grouped by non-null state code, highest count first. */
    byState: { stateCode: string; count: number }[];
  };
}
|
||||
|
||||
/**
 * Get discovery statistics.
 *
 * Issues six independent read-only COUNT queries against
 * `dutchie_discovery_cities` and `dutchie_discovery_locations` in a single
 * concurrent batch and folds the results into a {@link DiscoveryStats} object.
 * Location counts only consider rows with `active = TRUE`.
 *
 * @param pool - Database pool used to run the queries.
 * @returns City and location counts; statuses with no rows are reported as 0.
 */
export async function getDiscoveryStats(pool: Pool): Promise<DiscoveryStats> {
  // COUNT(*) is a bigint, which node-postgres delivers as a string.
  const toCount = (value: string): number => parseInt(value, 10);

  // None of the queries depends on another, so they are all started at once
  // rather than in two sequential waves.
  const [
    citiesTotal,
    citiesRecent,
    citiesNever,
    locsTotal,
    locsByStatus,
    locsByState,
  ] = await Promise.all([
    pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_cities'),
    pool.query(`SELECT COUNT(*) as cnt FROM dutchie_discovery_cities WHERE last_crawled_at > NOW() - INTERVAL '24 hours'`),
    pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_cities WHERE last_crawled_at IS NULL'),
    pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_locations WHERE active = TRUE'),
    pool.query(`
      SELECT status, COUNT(*) as cnt
      FROM dutchie_discovery_locations
      WHERE active = TRUE
      GROUP BY status
    `),
    pool.query(`
      SELECT state_code, COUNT(*) as cnt
      FROM dutchie_discovery_locations
      WHERE active = TRUE AND state_code IS NOT NULL
      GROUP BY state_code
      ORDER BY cnt DESC
    `),
  ]);

  // status -> number of active locations with that status
  const statusCounts: Record<string, number> = {};
  for (const row of locsByStatus.rows) {
    statusCounts[row.status] = toCount(row.cnt);
  }

  return {
    cities: {
      total: toCount(citiesTotal.rows[0].cnt),
      crawledLast24h: toCount(citiesRecent.rows[0].cnt),
      neverCrawled: toCount(citiesNever.rows[0].cnt),
    },
    locations: {
      total: toCount(locsTotal.rows[0].cnt),
      // A status absent from the GROUP BY result has no rows, i.e. a count of 0.
      discovered: statusCounts.discovered ?? 0,
      verified: statusCounts.verified ?? 0,
      rejected: statusCounts.rejected ?? 0,
      merged: statusCounts.merged ?? 0,
      byState: locsByState.rows.map((row) => ({
        stateCode: row.state_code,
        count: toCount(row.cnt),
      })),
    },
  };
}
|
||||
Reference in New Issue
Block a user