Files
cannaiq/backend/src/discovery/discovery-crawler.ts
Kelly 2f483b3084 feat: SEO template library, discovery pipeline, and orchestrator enhancements
## SEO Template Library
- Add complete template library with 7 page types (state, city, category, brand, product, search, regeneration)
- Add Template Library tab in SEO Orchestrator with accordion-based editors
- Add template preview, validation, and variable injection engine
- Add API endpoints: /api/seo/templates, preview, validate, generate, regenerate

## Discovery Pipeline
- Add promotion.ts for discovery location validation and promotion
- Add discover-all-states.ts script for multi-state discovery
- Add promotion log migration (067)
- Enhance discovery routes and types

## Orchestrator & Admin
- Add crawl_enabled filter to stores page
- Add API permissions page
- Add job queue management
- Add price analytics routes
- Add markets and intelligence routes
- Enhance dashboard and worker monitoring

## Infrastructure
- Add migrations for worker definitions, SEO settings, field alignment
- Add canonical pipeline for scraper v2
- Update hydration and sync orchestrator
- Enhance multi-state query service

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-09 00:05:34 -07:00

365 lines
11 KiB
TypeScript

/**
* Dutchie Discovery Crawler
*
* Main orchestrator for the Dutchie store discovery pipeline.
*
* AUTOMATED FLOW (as of 2025-01):
* 1. Fetch cities dynamically from Dutchie GraphQL (getAllCitiesByState)
* 2. For each city, discover store locations via ConsumerDispensaries query
* 3. Upsert locations to dutchie_discovery_locations (keyed by platform_location_id)
* 4. AUTO-VALIDATE: Check required fields (name, city, state, platform_menu_url, platform_location_id)
* 5. AUTO-PROMOTE: Valid locations are upserted to dispensaries table with crawl_enabled=true
* 6. All actions logged to dutchie_promotion_log for audit
*
* Tables involved:
* - dutchie_discovery_cities: Known cities for each state
* - dutchie_discovery_locations: Raw discovered store data
* - dispensaries: Canonical store records (promoted from discovery)
* - dutchie_promotion_log: Audit trail for validation/promotion
*
* Usage:
* npx tsx src/scripts/run-discovery.ts discover:state AZ
* npx tsx src/scripts/run-discovery.ts discover:state CA
*/
import { Pool } from 'pg';
import {
FullDiscoveryResult,
LocationDiscoveryResult,
DiscoveryCity,
} from './types';
import {
discoverCities,
getCitiesToCrawl,
getCityBySlug,
seedKnownCities,
} from './city-discovery';
import {
discoverLocationsForCity,
getCitiesForState,
} from './location-discovery';
import { promoteDiscoveredLocations } from './promotion';
// ============================================================
// FULL DISCOVERY
// ============================================================
/**
 * Options accepted by runFullDiscovery. Every field is optional; the defaults
 * listed below are the ones runFullDiscovery applies when a field is omitted.
 */
export interface DiscoveryCrawlerOptions {
/** Forwarded to the city and location discovery helpers; when true, the auto-promotion step is skipped. Default: false. */
dryRun?: boolean;
/** Forwarded to the city and location discovery helpers. Default: false. */
verbose?: boolean;
/** Passed to getCitiesToCrawl and promoteDiscoveredLocations; presumably a state filter such as "AZ" (confirm in those helpers). No default. */
stateCode?: string;
/** Passed to getCitiesToCrawl. Default: 'US'. */
countryCode?: string;
/** Passed to getCitiesToCrawl as `limit`. Default: 50. */
cityLimit?: number;
/** When true, the discoverCities call (step 1) is skipped and existing city rows are used. Default: false. */
skipCityDiscovery?: boolean;
/** Passed to getCitiesToCrawl; presumably restricts the crawl to cities not crawled recently (confirm there). Default: true. */
onlyStale?: boolean;
/** Passed to getCitiesToCrawl; presumably the age in days after which a city counts as stale (confirm there). Default: 7. */
staleDays?: number;
}
/** Rate-limiting pause inserted between consecutive city crawls, in milliseconds. */
const CITY_CRAWL_DELAY_MS = 2000;

/**
 * Turn any thrown value into a printable message.
 * Errors and error-like objects contribute their `message`; anything else is stringified.
 */
function describeCrawlError(error: unknown): string {
  if (error instanceof Error) {
    return error.message;
  }
  if (typeof error === 'object' && error !== null && 'message' in error) {
    return String((error as { message: unknown }).message);
  }
  return String(error);
}

/**
 * Run the full discovery pipeline:
 * 1. Discover/refresh cities (skipped when `skipCityDiscovery` is set)
 * 2. Select the cities to crawl
 * 3. For each city, discover locations
 * 4. Auto-validate and promote discovered locations (only on live runs that
 *    upserted at least one location)
 *
 * A failure while crawling a single city is logged and recorded as a
 * zero-location entry (carrying the error message) in `locations`; the
 * remaining cities are still crawled.
 *
 * @param pool - Database pool handed to the city and location discovery helpers.
 * @param options - See DiscoveryCrawlerOptions; omitted fields fall back to the
 *   defaults destructured below.
 * @returns City and per-city location results plus totals. `durationMs` is
 *   measured before step 4, so it does not include promotion time.
 * @throws Whatever discoverCities, getCitiesToCrawl or
 *   promoteDiscoveredLocations throw; only per-city crawl errors are caught.
 */
export async function runFullDiscovery(
  pool: Pool,
  options: DiscoveryCrawlerOptions = {}
): Promise<FullDiscoveryResult> {
  const startTime = Date.now();
  const {
    dryRun = false,
    verbose = false,
    stateCode,
    countryCode = 'US',
    cityLimit = 50,
    skipCityDiscovery = false,
    onlyStale = true,
    staleDays = 7,
  } = options;

  console.log('='.repeat(60));
  console.log('DUTCHIE DISCOVERY CRAWLER');
  console.log('='.repeat(60));
  console.log(`Mode: ${dryRun ? 'DRY RUN' : 'LIVE'}`);
  if (stateCode) console.log(`State: ${stateCode}`);
  console.log(`Country: ${countryCode}`);
  console.log(`City limit: ${cityLimit}`);
  console.log('');

  // Step 1: Discover/refresh cities
  let cityResult = {
    citiesFound: 0,
    citiesUpserted: 0,
    citiesSkipped: 0,
    errors: [] as string[],
    durationMs: 0,
  };
  if (!skipCityDiscovery) {
    console.log('[Discovery] Step 1: Discovering cities...');
    cityResult = await discoverCities(pool, { dryRun, verbose });
  } else {
    console.log('[Discovery] Step 1: Skipping city discovery (using existing cities)');
  }

  // Step 2: Get cities to crawl
  console.log('[Discovery] Step 2: Getting cities to crawl...');
  const cities = await getCitiesToCrawl(pool, {
    stateCode,
    countryCode,
    limit: cityLimit,
    onlyStale,
    staleDays,
  });
  console.log(`[Discovery] Found ${cities.length} cities to crawl`);

  // Step 3: Discover locations for each city
  console.log('[Discovery] Step 3: Discovering locations...');
  const locationResults: LocationDiscoveryResult[] = [];
  let totalLocationsFound = 0;
  let totalLocationsUpserted = 0;

  for (let i = 0; i < cities.length; i++) {
    const city = cities[i];
    console.log(`\n[Discovery] City ${i + 1}/${cities.length}: ${city.cityName}, ${city.stateCode}`);
    const cityStartTime = Date.now();
    try {
      const result = await discoverLocationsForCity(pool, city, { dryRun, verbose });
      locationResults.push(result);
      totalLocationsFound += result.locationsFound;
      totalLocationsUpserted += result.locationsUpserted;
    } catch (error: unknown) {
      const message = describeCrawlError(error);
      console.error(`[Discovery] Error crawling ${city.cityName}: ${message}`);
      // Keep one entry per crawled city so callers can see which ones failed.
      locationResults.push({
        cityId: city.id,
        citySlug: city.citySlug,
        locationsFound: 0,
        locationsUpserted: 0,
        locationsNew: 0,
        locationsUpdated: 0,
        errors: [message],
        durationMs: Date.now() - cityStartTime,
      });
    }

    // Rate limiting between cities. Deliberately outside the try block so a
    // failed request is not followed immediately by the next one.
    if (i < cities.length - 1) {
      await new Promise<void>((resolve) => setTimeout(resolve, CITY_CRAWL_DELAY_MS));
    }
  }

  const durationMs = Date.now() - startTime;

  // Summary
  console.log('\n' + '='.repeat(60));
  console.log('DISCOVERY COMPLETE');
  console.log('='.repeat(60));
  console.log(`Duration: ${(durationMs / 1000).toFixed(1)}s`);
  console.log('');
  console.log('Cities:');
  console.log(` Discovered: ${cityResult.citiesFound}`);
  console.log(` Upserted: ${cityResult.citiesUpserted}`);
  console.log(` Crawled: ${cities.length}`);
  console.log('');
  console.log('Locations:');
  console.log(` Found: ${totalLocationsFound}`);
  console.log(` Upserted: ${totalLocationsUpserted}`);
  console.log('');

  const totalErrors =
    cityResult.errors.length +
    locationResults.reduce((sum, r) => sum + r.errors.length, 0);
  if (totalErrors > 0) {
    console.log(`Errors: ${totalErrors}`);
  }

  // Step 4: Auto-validate and promote discovered locations.
  // Skipped on dry runs and when nothing was upserted in this run.
  if (!dryRun && totalLocationsUpserted > 0) {
    console.log('\n[Discovery] Step 4: Auto-promoting discovered locations...');
    const promotionResult = await promoteDiscoveredLocations(stateCode, false);
    console.log(`[Discovery] Promotion complete:`);
    console.log(` Created: ${promotionResult.created} new dispensaries`);
    console.log(` Updated: ${promotionResult.updated} existing dispensaries`);
    console.log(` Rejected: ${promotionResult.rejected} (validation failed)`);
    if (promotionResult.rejectedRecords.length > 0) {
      console.log(` Rejection reasons:`);
      // Only the first five rejections are printed to keep the log short.
      promotionResult.rejectedRecords.slice(0, 5).forEach(r => {
        console.log(` - ${r.name}: ${r.errors.join(', ')}`);
      });
      if (promotionResult.rejectedRecords.length > 5) {
        console.log(` ... and ${promotionResult.rejectedRecords.length - 5} more`);
      }
    }
  }

  return {
    cities: cityResult,
    locations: locationResults,
    totalLocationsFound,
    totalLocationsUpserted,
    durationMs,
  };
}
// ============================================================
// SINGLE CITY DISCOVERY
// ============================================================
/**
 * Crawl the locations of one city, identified by its slug.
 *
 * The city is looked up first. When it is missing and a `stateCode` was given,
 * a city row is seeded with a display name derived from the slug (hyphens
 * become spaces, the first letter of each word is upper-cased) and the lookup
 * is repeated.
 *
 * @param pool - Database pool passed through to the lookup, seed and crawl helpers.
 * @param citySlug - Slug of the city to crawl.
 * @param options - `countryCode` defaults to 'US'; `dryRun` and `verbose`
 *   default to false and are forwarded to the location crawl.
 * @returns The crawl result, or null when the city does not exist and could
 *   not be created.
 */
export async function discoverCity(
  pool: Pool,
  citySlug: string,
  options: {
    stateCode?: string;
    countryCode?: string;
    dryRun?: boolean;
    verbose?: boolean;
  } = {}
): Promise<LocationDiscoveryResult | null> {
  const {
    stateCode,
    countryCode = 'US',
    dryRun = false,
    verbose = false,
  } = options;

  // Fast path: the city is already known.
  const existingCity = await getCityBySlug(pool, citySlug, stateCode, countryCode);
  if (existingCity) {
    return await discoverLocationsForCity(pool, existingCity, { dryRun, verbose });
  }

  // Without a state there is not enough information to create the city.
  if (stateCode) {
    console.log(`[Discovery] City ${citySlug} not found, creating...`);
    const spacedSlug = citySlug.split('-').join(' ');
    const displayName = spacedSlug.replace(/\b\w/g, (letter) => letter.toUpperCase());
    await seedKnownCities(pool, [
      { name: displayName, slug: citySlug, stateCode, countryCode },
    ]);

    const seededCity = await getCityBySlug(pool, citySlug, stateCode, countryCode);
    if (seededCity) {
      return await discoverLocationsForCity(pool, seededCity, { dryRun, verbose });
    }
  }

  console.log(`[Discovery] City ${citySlug} not found and could not be created`);
  return null;
}
// ============================================================
// STATE-WIDE DISCOVERY
// ============================================================
/**
 * Seed the cities of one state and then crawl them.
 *
 * City names are fetched via getCitiesForState, turned into seed rows (the
 * slug is the lower-cased name with whitespace runs replaced by "-" and every
 * character outside a-z, 0-9 and "-" removed) and handed to seedKnownCities.
 * Afterwards runFullDiscovery is run for the state with city discovery
 * skipped and the stale-only filter turned off.
 *
 * @param pool - Database pool passed to the seed and discovery helpers.
 * @param stateCode - State to discover.
 * @param options - `dryRun` and `verbose` default to false; `cityLimit`
 *   defaults to 100 and is forwarded to runFullDiscovery.
 * @returns The result of the runFullDiscovery call.
 */
export async function discoverState(
  pool: Pool,
  stateCode: string,
  options: {
    dryRun?: boolean;
    verbose?: boolean;
    cityLimit?: number;
  } = {}
): Promise<FullDiscoveryResult> {
  const { dryRun = false, verbose = false, cityLimit = 100 } = options;

  // Builds the seed row for one fetched city name.
  const toSeedRow = (name: string) => {
    const hyphenated = name.toLowerCase().replace(/\s+/g, '-');
    return {
      name,
      slug: hyphenated.replace(/[^a-z0-9-]/g, ''),
      stateCode,
    };
  };

  console.log(`[Discovery] Discovering state: ${stateCode}`);

  // Dynamically fetch and seed cities for this state
  console.log(`[Discovery] Fetching cities for ${stateCode} from Dutchie...`);
  const fetchedNames = await getCitiesForState(stateCode);

  if (fetchedNames.length === 0) {
    console.log(`[Discovery] No cities found for ${stateCode}`);
  } else {
    const seedRows = fetchedNames.map(toSeedRow);
    const seedOutcome = await seedKnownCities(pool, seedRows);
    console.log(`[Discovery] Seeded ${seedOutcome.created} new cities, ${seedOutcome.updated} updated for ${stateCode}`);
  }

  // The full discovery runs even when nothing was fetched, using whatever
  // city rows already exist for the state.
  const discoveryOptions = {
    dryRun,
    verbose,
    stateCode,
    countryCode: 'US',
    cityLimit,
    skipCityDiscovery: true, // Use seeded cities
    onlyStale: false, // Crawl all
  };
  return await runFullDiscovery(pool, discoveryOptions);
}
// ============================================================
// STATISTICS
// ============================================================
/**
 * Shape returned by getDiscoveryStats. All location figures only count rows
 * of dutchie_discovery_locations with active = TRUE.
 */
export interface DiscoveryStats {
/** Counts over dutchie_discovery_cities. */
cities: {
/** Total number of city rows. */
total: number;
/** City rows whose last_crawled_at falls within the past 24 hours. */
crawledLast24h: number;
/** City rows whose last_crawled_at is NULL. */
neverCrawled: number;
};
/** Counts over active rows of dutchie_discovery_locations. */
locations: {
/** Total number of active location rows. */
total: number;
/** Active rows with status "discovered" (0 when there are none). */
discovered: number;
/** Active rows with status "verified" (0 when there are none). */
verified: number;
/** Active rows with status "rejected" (0 when there are none). */
rejected: number;
/** Active rows with status "merged" (0 when there are none). */
merged: number;
/** Active rows with a non-NULL state_code, grouped by state, largest count first. */
byState: Array<{ stateCode: string; count: number }>;
};
}
/**
 * Collect aggregate counts about discovered cities and locations.
 *
 * Runs three city queries concurrently, then three location queries
 * concurrently, and converts every `cnt` column to a base-10 integer.
 * Location queries only consider rows with active = TRUE.
 *
 * @param pool - Database pool used for all six queries.
 * @returns City totals, location totals per status (0 for a status with no
 *   rows) and per-state location counts ordered by count, largest first.
 */
export async function getDiscoveryStats(pool: Pool): Promise<DiscoveryStats> {
  // Every `cnt` column goes through this parser.
  const toCount = (raw: string): number => parseInt(raw, 10);

  // SQL text is kept flush-left so the statements sent to the database are unchanged.
  const locationsByStatusSql = `
SELECT status, COUNT(*) as cnt
FROM dutchie_discovery_locations
WHERE active = TRUE
GROUP BY status
`;
  const locationsByStateSql = `
SELECT state_code, COUNT(*) as cnt
FROM dutchie_discovery_locations
WHERE active = TRUE AND state_code IS NOT NULL
GROUP BY state_code
ORDER BY cnt DESC
`;

  // Batch 1: city counts.
  const [allCities, recentlyCrawledCities, uncrawledCities] = await Promise.all([
    pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_cities'),
    pool.query(`SELECT COUNT(*) as cnt FROM dutchie_discovery_cities WHERE last_crawled_at > NOW() - INTERVAL '24 hours'`),
    pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_cities WHERE last_crawled_at IS NULL'),
  ]);

  // Batch 2: location counts.
  const [activeLocations, statusBreakdown, stateBreakdown] = await Promise.all([
    pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_locations WHERE active = TRUE'),
    pool.query(locationsByStatusSql),
    pool.query(locationsByStateSql),
  ]);

  // status -> count; statuses missing from the result fall back to 0 below.
  const perStatus: Record<string, number> = {};
  for (const row of statusBreakdown.rows) {
    perStatus[row.status] = toCount(row.cnt);
  }

  const byState: Array<{ stateCode: string; count: number }> = [];
  for (const row of stateBreakdown.rows) {
    byState.push({ stateCode: row.state_code, count: toCount(row.cnt) });
  }

  return {
    cities: {
      total: toCount(allCities.rows[0].cnt),
      crawledLast24h: toCount(recentlyCrawledCities.rows[0].cnt),
      neverCrawled: toCount(uncrawledCities.rows[0].cnt),
    },
    locations: {
      total: toCount(activeLocations.rows[0].cnt),
      discovered: perStatus.discovered || 0,
      verified: perStatus.verified || 0,
      rejected: perStatus.rejected || 0,
      merged: perStatus.merged || 0,
      byState,
    },
  };
}