Files
cannaiq/backend/src/discovery/discovery-crawler.ts
Kelly b4a2fb7d03 feat: Add v2 architecture with multi-state support and orchestrator services
Major additions:
- Multi-state expansion: states table, StateSelector, NationalDashboard, StateHeatmap, CrossStateCompare
- Orchestrator services: trace service, error taxonomy, retry manager, proxy rotator
- Discovery system: dutchie discovery service, geo validation, city seeding scripts
- Analytics infrastructure: analytics v2 routes, brand/pricing/stores intelligence pages
- Local development: setup-local.sh starts all 5 services (postgres, backend, cannaiq, findadispo, findagram)
- Migrations 037-056: crawler profiles, states, analytics indexes, worker metadata

Frontend pages added:
- Discovery, ChainsDashboard, IntelligenceBrands, IntelligencePricing, IntelligenceStores
- StateHeatmap, CrossStateCompare, SyncInfoPanel

Components added:
- StateSelector, OrchestratorTraceModal, WorkflowStepper

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-07 11:30:57 -07:00

328 lines
9.1 KiB
TypeScript

/**
* Dutchie Discovery Crawler
*
* Main orchestrator for the Dutchie store discovery pipeline.
*
* Flow:
* 1. Discover cities from Dutchie (or use seeded cities)
* 2. For each city, discover store locations
* 3. Upsert all data to discovery tables
* 4. Admin verifies locations manually
* 5. Verified locations are promoted to canonical dispensaries
*
* This module does NOT create canonical dispensaries automatically.
*/
import { Pool } from 'pg';
import {
FullDiscoveryResult,
LocationDiscoveryResult,
DiscoveryCity,
} from './types';
import {
discoverCities,
getCitiesToCrawl,
getCityBySlug,
seedKnownCities,
ARIZONA_CITIES,
} from './city-discovery';
import {
discoverLocationsForCity,
} from './location-discovery';
// ============================================================
// FULL DISCOVERY
// ============================================================
/**
 * Options accepted by `runFullDiscovery`.
 *
 * Every field is optional; the defaults quoted below are the ones
 * `runFullDiscovery` applies when a field is omitted.
 */
export interface DiscoveryCrawlerOptions {
  // --- Run mode -------------------------------------------------

  /** Logged as "DRY RUN" vs "LIVE" and forwarded to the city/location discovery helpers. Default: false. */
  dryRun?: boolean;
  /** Forwarded to the city/location discovery helpers. Default: false. */
  verbose?: boolean;
  /** When true, the city discovery step is skipped and existing cities are used. Default: false. */
  skipCityDiscovery?: boolean;

  // --- City selection (forwarded to `getCitiesToCrawl`) ---------

  /** State filter; no default (left undefined when omitted). */
  stateCode?: string;
  /** Country filter. Default: 'US'. */
  countryCode?: string;
  /** Passed as `limit` when selecting cities to crawl. Default: 50. */
  cityLimit?: number;
  /** Default: true. Presumably restricts the crawl to cities not crawled recently — confirm in `getCitiesToCrawl`. */
  onlyStale?: boolean;
  /** Default: 7. Presumably the age in days after which a city counts as stale — confirm in `getCitiesToCrawl`. */
  staleDays?: number;
}
/**
 * Run the full discovery pipeline:
 * 1. Discover/refresh cities (skipped when `skipCityDiscovery` is true)
 * 2. Select the cities to crawl via `getCitiesToCrawl`
 * 3. For each selected city, discover locations, one city at a time
 *
 * A failure while crawling a single city is logged and recorded as an error
 * entry in that city's result; the remaining cities are still crawled.
 * Failures in steps 1 and 2 are not caught here and propagate to the caller.
 *
 * @param pool - pg pool passed through to the discovery helpers.
 * @param options - Crawl options; omitted fields fall back to the defaults
 *   destructured at the top of the function.
 * @returns The city discovery result, one location result per crawled city,
 *   the summed location counts, and the total run duration in milliseconds.
 */
export async function runFullDiscovery(
  pool: Pool,
  options: DiscoveryCrawlerOptions = {}
): Promise<FullDiscoveryResult> {
  // Pause inserted between consecutive city crawls (rate limiting).
  const cityCrawlDelayMs = 2000;

  const startTime = Date.now();
  const {
    dryRun = false,
    verbose = false,
    stateCode,
    countryCode = 'US',
    cityLimit = 50,
    skipCityDiscovery = false,
    onlyStale = true,
    staleDays = 7,
  } = options;

  console.log('='.repeat(60));
  console.log('DUTCHIE DISCOVERY CRAWLER');
  console.log('='.repeat(60));
  console.log(`Mode: ${dryRun ? 'DRY RUN' : 'LIVE'}`);
  if (stateCode) console.log(`State: ${stateCode}`);
  console.log(`Country: ${countryCode}`);
  console.log(`City limit: ${cityLimit}`);
  console.log('');

  // Step 1: Discover/refresh cities.
  // The zeroed result is what gets reported when city discovery is skipped.
  let cityResult = {
    citiesFound: 0,
    citiesUpserted: 0,
    citiesSkipped: 0,
    errors: [] as string[],
    durationMs: 0,
  };
  if (!skipCityDiscovery) {
    console.log('[Discovery] Step 1: Discovering cities...');
    cityResult = await discoverCities(pool, { dryRun, verbose });
  } else {
    console.log('[Discovery] Step 1: Skipping city discovery (using existing cities)');
  }

  // Step 2: Get cities to crawl
  console.log('[Discovery] Step 2: Getting cities to crawl...');
  const cities = await getCitiesToCrawl(pool, {
    stateCode,
    countryCode,
    limit: cityLimit,
    onlyStale,
    staleDays,
  });
  console.log(`[Discovery] Found ${cities.length} cities to crawl`);

  // Step 3: Discover locations for each city
  console.log('[Discovery] Step 3: Discovering locations...');
  const locationResults: LocationDiscoveryResult[] = [];
  let totalLocationsFound = 0;
  let totalLocationsUpserted = 0;

  for (let i = 0; i < cities.length; i++) {
    const city = cities[i];
    const cityStartTime = Date.now();
    console.log(`\n[Discovery] City ${i + 1}/${cities.length}: ${city.cityName}, ${city.stateCode}`);

    try {
      const result = await discoverLocationsForCity(pool, city, { dryRun, verbose });
      locationResults.push(result);
      totalLocationsFound += result.locationsFound;
      totalLocationsUpserted += result.locationsUpserted;
    } catch (error: unknown) {
      // Anything can be thrown; only Error instances are guaranteed a message.
      const message = error instanceof Error ? error.message : String(error);
      console.error(`[Discovery] Error crawling ${city.cityName}: ${message}`);
      locationResults.push({
        cityId: city.id,
        citySlug: city.citySlug,
        locationsFound: 0,
        locationsUpserted: 0,
        locationsNew: 0,
        locationsUpdated: 0,
        errors: [message],
        // Time spent on the failed attempt, so failures show up in timing data.
        durationMs: Date.now() - cityStartTime,
      });
    }

    // Rate limiting between cities. Applied after failures as well, so an
    // error is never followed by an immediate request for the next city.
    if (i < cities.length - 1) {
      await new Promise<void>((resolve) => setTimeout(resolve, cityCrawlDelayMs));
    }
  }

  const durationMs = Date.now() - startTime;

  // Summary
  console.log('\n' + '='.repeat(60));
  console.log('DISCOVERY COMPLETE');
  console.log('='.repeat(60));
  console.log(`Duration: ${(durationMs / 1000).toFixed(1)}s`);
  console.log('');
  console.log('Cities:');
  console.log(` Discovered: ${cityResult.citiesFound}`);
  console.log(` Upserted: ${cityResult.citiesUpserted}`);
  console.log(` Crawled: ${cities.length}`);
  console.log('');
  console.log('Locations:');
  console.log(` Found: ${totalLocationsFound}`);
  console.log(` Upserted: ${totalLocationsUpserted}`);
  console.log('');

  // City-level errors plus every per-city location error.
  const totalErrors = cityResult.errors.length +
    locationResults.reduce((sum, r) => sum + r.errors.length, 0);
  if (totalErrors > 0) {
    console.log(`Errors: ${totalErrors}`);
  }

  return {
    cities: cityResult,
    locations: locationResults,
    totalLocationsFound,
    totalLocationsUpserted,
    durationMs,
  };
}
// ============================================================
// SINGLE CITY DISCOVERY
// ============================================================
/**
 * Discover locations for one city, identified by its slug.
 *
 * The city is looked up with `getCityBySlug`. If it is missing and a
 * `stateCode` was supplied, a city record is seeded (display name derived
 * from the slug) and the lookup is repeated.
 *
 * NOTE(review): the seeding step runs even when `dryRun` is true — confirm
 * that is intended.
 *
 * @param pool - pg pool passed through to the lookup/seed/discovery helpers.
 * @param citySlug - Slug of the city to crawl (e.g. "lake-havasu-city").
 * @param options - `countryCode` defaults to 'US'; `dryRun` and `verbose`
 *   default to false and are forwarded to `discoverLocationsForCity`.
 * @returns The location discovery result, or null when the city cannot be
 *   found (and could not be created).
 */
export async function discoverCity(
  pool: Pool,
  citySlug: string,
  options: {
    stateCode?: string;
    countryCode?: string;
    dryRun?: boolean;
    verbose?: boolean;
  } = {}
): Promise<LocationDiscoveryResult | null> {
  const {
    stateCode,
    countryCode = 'US',
    dryRun = false,
    verbose = false,
  } = options;

  const lookupCity = () => getCityBySlug(pool, citySlug, stateCode, countryCode);

  let target = await lookupCity();

  // Unknown city: we can only create it when the caller told us the state.
  if (!target && stateCode) {
    console.log(`[Discovery] City ${citySlug} not found, creating...`);

    // "lake-havasu-city" -> "lake havasu city" -> "Lake Havasu City"
    const spaced = citySlug.replace(/-/g, ' ');
    const displayName = spaced.replace(/\b\w/g, c => c.toUpperCase());

    await seedKnownCities(pool, [
      { name: displayName, slug: citySlug, stateCode, countryCode },
    ]);
    target = await lookupCity();
  }

  if (!target) {
    console.log(`[Discovery] City ${citySlug} not found and could not be created`);
    return null;
  }

  return await discoverLocationsForCity(pool, target, { dryRun, verbose });
}
// ============================================================
// STATE-WIDE DISCOVERY
// ============================================================
/**
 * Seed the known cities for a state (where a seed list exists) and then run
 * the full discovery pipeline restricted to that state.
 *
 * Only the exact code 'AZ' has a seed list here (`ARIZONA_CITIES`); for any
 * other value the seeding step is skipped and discovery runs against
 * whatever cities already exist.
 *
 * NOTE(review): the 'AZ' comparison is case-sensitive and seeding is not
 * gated on `dryRun` — confirm both are intended.
 *
 * @param pool - pg pool passed through to the seeding/discovery helpers.
 * @param stateCode - State to discover; forwarded to `runFullDiscovery`.
 * @param options - `dryRun` and `verbose` default to false, `cityLimit` to 100.
 * @returns The result of `runFullDiscovery` for this state.
 */
export async function discoverState(
  pool: Pool,
  stateCode: string,
  options: {
    dryRun?: boolean;
    verbose?: boolean;
    cityLimit?: number;
  } = {}
): Promise<FullDiscoveryResult> {
  const {
    dryRun = false,
    verbose = false,
    cityLimit = 100,
  } = options;

  console.log(`[Discovery] Discovering state: ${stateCode}`);

  // Seed known cities for this state
  if (stateCode === 'AZ') {
    console.log('[Discovery] Seeding Arizona cities...');
    const seedOutcome = await seedKnownCities(pool, ARIZONA_CITIES);
    const newCount = seedOutcome.created;
    const updatedCount = seedOutcome.updated;
    console.log(`[Discovery] Seeded ${newCount} new cities, ${updatedCount} updated`);
  }

  // Crawl every city already on record for the state: city discovery is
  // skipped (seeded cities are used) and the staleness filter is turned off.
  const crawlOptions = {
    dryRun,
    verbose,
    stateCode,
    countryCode: 'US',
    cityLimit,
    skipCityDiscovery: true,
    onlyStale: false,
  };

  return await runFullDiscovery(pool, crawlOptions);
}
// ============================================================
// STATISTICS
// ============================================================
/**
 * Summary counts returned by `getDiscoveryStats`.
 */
export interface DiscoveryStats {
  /** Counts over the `dutchie_discovery_cities` table. */
  cities: {
    /** All city rows. */
    total: number;
    /** Cities whose `last_crawled_at` falls within the last 24 hours. */
    crawledLast24h: number;
    /** Cities whose `last_crawled_at` is NULL. */
    neverCrawled: number;
  };
  /** Counts over active rows of the `dutchie_discovery_locations` table. */
  locations: {
    /** All active locations. */
    total: number;
    /** Active locations with status "discovered" (0 when none). */
    discovered: number;
    /** Active locations with status "verified" (0 when none). */
    verified: number;
    /** Active locations with status "rejected" (0 when none). */
    rejected: number;
    /** Active locations with status "merged" (0 when none). */
    merged: number;
    /** Active locations grouped by non-null state code, largest count first. */
    byState: { stateCode: string; count: number }[];
  };
}
/**
 * Collect summary statistics for the discovery tables.
 *
 * Runs two batches of three parallel queries: first the city counts
 * (`dutchie_discovery_cities`), then the location counts
 * (`dutchie_discovery_locations`, active rows only).
 *
 * @param pool - pg pool used for all six queries.
 * @returns City totals plus location totals broken down by status and state.
 */
export async function getDiscoveryStats(pool: Pool): Promise<DiscoveryStats> {
  // Count columns are parsed as base-10 integers before being returned.
  const toCount = (raw: string): number => parseInt(raw, 10);

  // Batch 1: city counts.
  const cityCounts = await Promise.all([
    pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_cities'),
    pool.query(`SELECT COUNT(*) as cnt FROM dutchie_discovery_cities WHERE last_crawled_at > NOW() - INTERVAL '24 hours'`),
    pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_cities WHERE last_crawled_at IS NULL'),
  ]);
  const allCities = cityCounts[0];
  const recentCities = cityCounts[1];
  const uncrawledCities = cityCounts[2];

  // Batch 2: location counts. The multi-line SQL text is kept verbatim.
  const locationCounts = await Promise.all([
    pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_locations WHERE active = TRUE'),
    pool.query(`
SELECT status, COUNT(*) as cnt
FROM dutchie_discovery_locations
WHERE active = TRUE
GROUP BY status
`),
    pool.query(`
SELECT state_code, COUNT(*) as cnt
FROM dutchie_discovery_locations
WHERE active = TRUE AND state_code IS NOT NULL
GROUP BY state_code
ORDER BY cnt DESC
`),
  ]);
  const activeLocations = locationCounts[0];
  const statusRows = locationCounts[1];
  const stateRows = locationCounts[2];

  // status -> count; statuses with no rows are simply absent.
  const countsByStatus: Record<string, number> = {};
  for (const statusRow of statusRows.rows) {
    countsByStatus[statusRow.status] = toCount(statusRow.cnt);
  }

  // Row order is preserved, so the list keeps the query's ordering.
  const byState: Array<{ stateCode: string; count: number }> = [];
  for (const stateRow of stateRows.rows) {
    byState.push({ stateCode: stateRow.state_code, count: toCount(stateRow.cnt) });
  }

  return {
    cities: {
      total: toCount(allCities.rows[0].cnt),
      crawledLast24h: toCount(recentCities.rows[0].cnt),
      neverCrawled: toCount(uncrawledCities.rows[0].cnt),
    },
    locations: {
      total: toCount(activeLocations.rows[0].cnt),
      discovered: countsByStatus.discovered || 0,
      verified: countsByStatus.verified || 0,
      rejected: countsByStatus.rejected || 0,
      merged: countsByStatus.merged || 0,
      byState,
    },
  };
}