Files
cannaiq/backend/src/discovery/discovery-crawler.ts
Kelly 56cc171287 feat: Stealth worker system with mandatory proxy rotation
## Worker System
- Role-agnostic workers that can handle any task type
- Pod-based architecture with StatefulSet (5-15 pods, 5 workers each)
- Custom pod names (Aethelgard, Xylos, Kryll, etc.)
- Worker registry with friendly names and resource monitoring
- Hub-and-spoke visualization on JobQueue page

## Stealth & Anti-Detection (REQUIRED)
- Proxies are MANDATORY - workers fail to start without active proxies
- CrawlRotator initializes on worker startup
- Loads proxies from `proxies` table
- Auto-rotates proxy + fingerprint on 403 errors
- 12 browser fingerprints (Chrome, Firefox, Safari, Edge)
- Locale/timezone matching for geographic consistency

## Task System
- Renamed product_resync → product_refresh
- Task chaining: store_discovery → entry_point → product_discovery
- Priority-based claiming with FOR UPDATE SKIP LOCKED
- Heartbeat and stale task recovery

## UI Updates
- JobQueue: Pod visualization, resource monitoring on hover
- WorkersDashboard: Simplified worker list
- Removed unused filters from task list

## Other
- IP2Location service for visitor analytics
- Findagram consumer features scaffolding
- Documentation updates

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-10 00:44:59 -07:00

483 lines
15 KiB
TypeScript

/**
* Dutchie Discovery Crawler
*
* Main orchestrator for the Dutchie store discovery pipeline.
*
* AUTOMATED FLOW (as of 2025-01):
* 1. Fetch cities dynamically from Dutchie GraphQL (getAllCitiesByState)
* 2. For each city, discover store locations via ConsumerDispensaries query
* 3. Upsert locations to dutchie_discovery_locations (keyed by platform_location_id)
* 4. AUTO-VALIDATE: Check required fields (name, city, state, platform_menu_url, platform_location_id)
* 5. AUTO-PROMOTE: Valid locations are upserted to dispensaries table with crawl_enabled=true
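* 6. All actions logged to dutchie_promotion_log for audit
* 7. DROPPED DETECTION: Open Dutchie dispensaries not seen by discovery in the last 24 hours are marked status='dropped' and logged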
*
* Tables involved:
* - dutchie_discovery_cities: Known cities for each state
* - dutchie_discovery_locations: Raw discovered store data
* - dispensaries: Canonical store records (promoted from discovery)
* - dutchie_promotion_log: Audit trail for validation/promotion
*
* Usage:
* npx tsx src/scripts/run-discovery.ts discover:state AZ
* npx tsx src/scripts/run-discovery.ts discover:state CA
*/
import { Pool } from 'pg';
import {
FullDiscoveryResult,
LocationDiscoveryResult,
DiscoveryCity,
} from './types';
import {
discoverCities,
getCitiesToCrawl,
getCityBySlug,
seedKnownCities,
} from './city-discovery';
import {
discoverLocationsForCity,
getCitiesForState,
} from './location-discovery';
import { promoteDiscoveredLocations } from './promotion';
// ============================================================
// FULL DISCOVERY
// ============================================================
/**
* Options accepted by runFullDiscovery. Every field is optional; the defaults
* quoted below are the ones runFullDiscovery applies when a field is omitted.
*/
export interface DiscoveryCrawlerOptions {
/** Forwarded to city and location discovery; when true, promotion and dropped-store detection are skipped. Default: false. */
dryRun?: boolean;
/** Forwarded to city and location discovery. Default: false. */
verbose?: boolean;
/** Passed to city selection, promotion and dropped-store detection. No default. */
stateCode?: string;
/** Passed to city selection. Default: 'US'. */
countryCode?: string;
/** Passed to city selection as its `limit`. Default: 50. */
cityLimit?: number;
/** When true, the city discovery step is not run and existing cities are used. Default: false. */
skipCityDiscovery?: boolean;
/** Passed to city selection; presumably restricts the crawl to cities not crawled recently — confirm in getCitiesToCrawl. Default: true. */
onlyStale?: boolean;
/** Passed to city selection; presumably the age threshold used with onlyStale — confirm in getCitiesToCrawl. Default: 7. */
staleDays?: number;
}
/**
 * Run the full discovery pipeline:
 * 1. Discover/refresh cities (skipped when `skipCityDiscovery` is set)
 * 2. Select the cities to crawl
 * 3. For each city, discover locations
 * 4. Auto-promote discovered locations (live runs that upserted at least one location)
 * 5. Detect dropped stores (live runs where at least one city crawl completed)
 *
 * A city whose crawl throws does not abort the run: it is recorded as a
 * zero-count result carrying the error message, and the loop moves on.
 *
 * @param pool - Postgres pool handed to every discovery helper.
 * @param options - Crawl options; the defaults are the ones destructured below.
 * @returns City results, per-city location results and totals. `durationMs`
 * is taken right after the city loop, so it excludes promotion and
 * dropped-store detection.
 */
export async function runFullDiscovery(
  pool: Pool,
  options: DiscoveryCrawlerOptions = {}
): Promise<FullDiscoveryResult> {
  const startTime = Date.now();
  const {
    dryRun = false,
    verbose = false,
    stateCode,
    countryCode = 'US',
    cityLimit = 50,
    skipCityDiscovery = false,
    onlyStale = true,
    staleDays = 7,
  } = options;

  // Pause inserted between consecutive city crawls (rate limiting).
  const cityCrawlDelayMs = 2000;

  console.log('='.repeat(60));
  console.log('DUTCHIE DISCOVERY CRAWLER');
  console.log('='.repeat(60));
  console.log(`Mode: ${dryRun ? 'DRY RUN' : 'LIVE'}`);
  if (stateCode) console.log(`State: ${stateCode}`);
  console.log(`Country: ${countryCode}`);
  console.log(`City limit: ${cityLimit}`);
  console.log('');

  // Step 1: Discover/refresh cities
  let cityResult = {
    citiesFound: 0,
    citiesUpserted: 0,
    citiesSkipped: 0,
    errors: [] as string[],
    durationMs: 0,
  };
  if (!skipCityDiscovery) {
    console.log('[Discovery] Step 1: Discovering cities...');
    cityResult = await discoverCities(pool, { dryRun, verbose });
  } else {
    console.log('[Discovery] Step 1: Skipping city discovery (using existing cities)');
  }

  // Step 2: Get cities to crawl
  console.log('[Discovery] Step 2: Getting cities to crawl...');
  const cities = await getCitiesToCrawl(pool, {
    stateCode,
    countryCode,
    limit: cityLimit,
    onlyStale,
    staleDays,
  });
  console.log(`[Discovery] Found ${cities.length} cities to crawl`);

  // Step 3: Discover locations for each city
  console.log('[Discovery] Step 3: Discovering locations...');
  const locationResults: LocationDiscoveryResult[] = [];
  let totalLocationsFound = 0;
  let totalLocationsUpserted = 0;
  // Number of cities whose crawl returned without throwing; gates step 5.
  let citiesCrawledOk = 0;

  for (let i = 0; i < cities.length; i++) {
    const city = cities[i];
    console.log(`\n[Discovery] City ${i + 1}/${cities.length}: ${city.cityName}, ${city.stateCode}`);

    try {
      const result = await discoverLocationsForCity(pool, city, { dryRun, verbose });
      locationResults.push(result);
      totalLocationsFound += result.locationsFound;
      totalLocationsUpserted += result.locationsUpserted;
      citiesCrawledOk++;
    } catch (error: unknown) {
      const message = discoveryErrorMessage(error);
      console.error(`[Discovery] Error crawling ${city.cityName}: ${message}`);
      locationResults.push({
        cityId: city.id,
        citySlug: city.citySlug,
        locationsFound: 0,
        locationsUpserted: 0,
        locationsNew: 0,
        locationsUpdated: 0,
        errors: [message],
        durationMs: 0,
      });
    }

    // Rate limiting between cities. Kept outside the try block so the pause
    // also applies after a failed crawl instead of firing the next request
    // immediately.
    if (i < cities.length - 1) {
      await discoveryPause(cityCrawlDelayMs);
    }
  }

  const durationMs = Date.now() - startTime;

  // Summary
  console.log('\n' + '='.repeat(60));
  console.log('DISCOVERY COMPLETE');
  console.log('='.repeat(60));
  console.log(`Duration: ${(durationMs / 1000).toFixed(1)}s`);
  console.log('');
  console.log('Cities:');
  console.log(` Discovered: ${cityResult.citiesFound}`);
  console.log(` Upserted: ${cityResult.citiesUpserted}`);
  console.log(` Crawled: ${cities.length}`);
  console.log('');
  console.log('Locations:');
  console.log(` Found: ${totalLocationsFound}`);
  console.log(` Upserted: ${totalLocationsUpserted}`);
  console.log('');
  const totalErrors = cityResult.errors.length +
    locationResults.reduce((sum, r) => sum + r.errors.length, 0);
  if (totalErrors > 0) {
    console.log(`Errors: ${totalErrors}`);
  }

  // Step 4: Auto-validate and promote discovered locations
  if (!dryRun && totalLocationsUpserted > 0) {
    console.log('\n[Discovery] Step 4: Auto-promoting discovered locations...');
    const promotionResult = await promoteDiscoveredLocations(stateCode, false);
    console.log(`[Discovery] Promotion complete:`);
    console.log(` Created: ${promotionResult.created} new dispensaries`);
    console.log(` Updated: ${promotionResult.updated} existing dispensaries`);
    console.log(` Rejected: ${promotionResult.rejected} (validation failed)`);
    if (promotionResult.rejectedRecords.length > 0) {
      console.log(` Rejection reasons:`);
      promotionResult.rejectedRecords.slice(0, 5).forEach(r => {
        console.log(` - ${r.name}: ${r.errors.join(', ')}`);
      });
      if (promotionResult.rejectedRecords.length > 5) {
        console.log(` ... and ${promotionResult.rejectedRecords.length - 5} more`);
      }
    }
  }

  // Step 5: Detect dropped stores (in DB but not in discovery results)
  if (!dryRun) {
    if (citiesCrawledOk === 0) {
      // Nothing was crawled successfully, so "not seen recently" says nothing
      // about the stores themselves; flagging now would mark stores as
      // dropped because of an outage or an empty city list.
      console.log('\n[Discovery] Step 5: Skipping dropped store detection (no city was crawled successfully)');
    } else {
      console.log('\n[Discovery] Step 5: Detecting dropped stores...');
      const droppedResult = await detectDroppedStores(pool, stateCode);
      if (droppedResult.droppedCount > 0) {
        console.log(`[Discovery] Found ${droppedResult.droppedCount} dropped stores:`);
        droppedResult.droppedStores.slice(0, 10).forEach(s => {
          console.log(` - ${s.name} (${s.city}, ${s.state}) - last seen: ${s.lastSeenAt}`);
        });
        if (droppedResult.droppedCount > 10) {
          console.log(` ... and ${droppedResult.droppedCount - 10} more`);
        }
      } else {
        console.log(`[Discovery] No dropped stores detected`);
      }
    }
  }

  return {
    cities: cityResult,
    locations: locationResults,
    totalLocationsFound,
    totalLocationsUpserted,
    durationMs,
  };
}

/** Extract a printable message from whatever a failed city crawl threw. */
function discoveryErrorMessage(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/** Resolve after `ms` milliseconds; used for rate limiting between cities. */
function discoveryPause(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
// ============================================================
// DROPPED STORE DETECTION
// ============================================================
/**
* Result of detectDroppedStores: the dispensaries that were flagged as dropped.
*/
export interface DroppedStoreResult {
/** Number of entries in droppedStores. */
droppedCount: number;
/** One entry per flagged dispensary, built from the dispensaries row. */
droppedStores: Array<{
/** dispensaries.id */
id: number;
/** dispensaries.name */
name: string;
/** dispensaries.city */
city: string;
/** dispensaries.state */
state: string;
/** dispensaries.platform_dispensary_id */
platformDispensaryId: string;
/** dispensaries.updated_at (selected under the alias last_seen_at) as read before the row is marked dropped. */
lastSeenAt: string;
}>;
}
/**
 * Detect stores that exist in dispensaries but were not found by recent
 * discovery, mark them status='dropped' for manual review, and write one
 * 'dropped' row per store to dutchie_promotion_log.
 *
 * A store is flagged when all of the following hold:
 * 1. It has a platform_dispensary_id and platform = 'dutchie'
 * 2. Its status is 'open' or NULL, and crawl_enabled is true
 * 3. No active dutchie_discovery_locations row with the same platform id has
 *    a last_seen_at within the past 24 hours
 *
 * @param pool - Postgres pool used for the lookup and for the update.
 * @param stateCode - Optional filter on dispensaries.state; when omitted (or
 * empty) stores in every state are checked.
 * @returns The flagged stores, ordered by name, and their count. `lastSeenAt`
 * is the dispensary's updated_at as read before the row was marked dropped.
 */
export async function detectDroppedStores(
  pool: Pool,
  stateCode?: string
): Promise<DroppedStoreResult> {
  type DroppedStoreRow = {
    id: number;
    name: string;
    city: string;
    state: string;
    platform_dispensary_id: string;
    last_seen_at: Date | string;
  };

  const params: string[] = [];
  let stateFilter = '';
  if (stateCode) {
    params.push(stateCode);
    stateFilter = `AND d.state = $${params.length}`;
  }

  // NOT EXISTS is used instead of NOT IN (subquery): with NOT IN, a single
  // NULL platform_location_id among the recently seen rows makes the
  // predicate NULL for every dispensary, so nothing would ever be flagged.
  const query = `
    SELECT
      d.id,
      d.name,
      d.city,
      d.state,
      d.platform_dispensary_id,
      d.updated_at AS last_seen_at
    FROM dispensaries d
    WHERE d.platform_dispensary_id IS NOT NULL
      AND d.platform = 'dutchie'
      AND (d.status = 'open' OR d.status IS NULL)
      AND d.crawl_enabled = true
      AND NOT EXISTS (
        SELECT 1
        FROM dutchie_discovery_locations l
        WHERE l.platform_location_id = d.platform_dispensary_id
          AND l.last_seen_at > NOW() - INTERVAL '24 hours'
          AND l.active = true
      )
      ${stateFilter}
    ORDER BY d.name
  `;
  const result = await pool.query(query, params);
  const droppedRows: DroppedStoreRow[] = result.rows;

  if (droppedRows.length > 0) {
    const ids = droppedRows.map((row) => row.id);
    // Mark the stores and write the audit rows in ONE statement so the status
    // change and its log entries succeed or fail together, in a single
    // round trip instead of one INSERT per store.
    await pool.query(`
      WITH marked AS (
        UPDATE dispensaries
        SET status = 'dropped', updated_at = NOW()
        WHERE id = ANY($1::int[])
        RETURNING id, state, name
      )
      INSERT INTO dutchie_promotion_log
        (dispensary_id, action, state_code, store_name, triggered_by)
      SELECT id, 'dropped', state, name, 'discovery_detection'
      FROM marked
    `, [ids]);
  }

  return {
    droppedCount: droppedRows.length,
    droppedStores: droppedRows.map((row) => ({
      id: row.id,
      name: row.name,
      city: row.city,
      state: row.state,
      platformDispensaryId: row.platform_dispensary_id,
      // The result type declares a string; normalise a Date coming back from
      // the driver to ISO-8601 and pass anything else through unchanged.
      lastSeenAt:
        row.last_seen_at instanceof Date
          ? row.last_seen_at.toISOString()
          : row.last_seen_at,
    })),
  };
}
// ============================================================
// SINGLE CITY DISCOVERY
// ============================================================
/**
 * Crawl the locations of one city, identified by its slug.
 *
 * When the city is unknown and a `stateCode` is supplied, the city is seeded
 * first (its display name is derived from the slug) and then looked up again.
 *
 * @param pool - Postgres pool handed to the lookup, seeding and crawl helpers.
 * @param citySlug - Slug of the city to crawl, e.g. `lake-havasu-city`.
 * @param options - Optional state/country used for the lookup (country
 * defaults to 'US') and the dryRun/verbose flags forwarded to the crawl.
 * @returns The crawl result, or `null` when the city is unknown and could not
 * be created.
 */
export async function discoverCity(
  pool: Pool,
  citySlug: string,
  options: {
    stateCode?: string;
    countryCode?: string;
    dryRun?: boolean;
    verbose?: boolean;
  } = {}
): Promise<LocationDiscoveryResult | null> {
  const {
    stateCode,
    countryCode = 'US',
    dryRun = false,
    verbose = false,
  } = options;

  const lookUpCity = () => getCityBySlug(pool, citySlug, stateCode, countryCode);

  let city = await lookUpCity();

  // Unknown city: it can only be created when the state is known.
  if (!city && stateCode) {
    console.log(`[Discovery] City ${citySlug} not found, creating...`);
    // "lake-havasu-city" -> "lake havasu city" -> "Lake Havasu City"
    const spacedName = citySlug.replace(/-/g, ' ');
    const displayName = spacedName.replace(/\b\w/g, (letter) => letter.toUpperCase());
    await seedKnownCities(pool, [
      { name: displayName, slug: citySlug, stateCode, countryCode },
    ]);
    city = await lookUpCity();
  }

  if (!city) {
    console.log(`[Discovery] City ${citySlug} not found and could not be created`);
    return null;
  }

  const crawlResult = await discoverLocationsForCity(pool, city, { dryRun, verbose });
  return crawlResult;
}
// ============================================================
// STATE-WIDE DISCOVERY
// ============================================================
/**
 * Seed the cities of one state and then crawl them.
 *
 * City names are fetched for the state, turned into slugs and seeded; after
 * that the full discovery pipeline runs for the state with city discovery
 * skipped and the stale-only filter disabled.
 *
 * @param pool - Postgres pool handed to seeding and to the full pipeline.
 * @param stateCode - State to discover, e.g. `AZ`.
 * @param options - dryRun/verbose flags (both default to false) and the
 * maximum number of cities to crawl (defaults to 100).
 * @returns The result of the full discovery run for the state.
 */
export async function discoverState(
  pool: Pool,
  stateCode: string,
  options: {
    dryRun?: boolean;
    verbose?: boolean;
    cityLimit?: number;
  } = {}
): Promise<FullDiscoveryResult> {
  const { dryRun = false, verbose = false, cityLimit = 100 } = options;

  console.log(`[Discovery] Discovering state: ${stateCode}`);

  // Dynamically fetch and seed cities for this state
  console.log(`[Discovery] Fetching cities for ${stateCode} from Dutchie...`);
  const cityNames = await getCitiesForState(stateCode);

  if (cityNames.length > 0) {
    // Lower-case, whitespace runs become '-', anything else outside [a-z0-9-] is dropped.
    const slugify = (cityName: string): string => {
      const hyphenated = cityName.toLowerCase().replace(/\s+/g, '-');
      return hyphenated.replace(/[^a-z0-9-]/g, '');
    };

    const seedRows: Array<{ name: string; slug: string; stateCode: string }> = [];
    for (const name of cityNames) {
      seedRows.push({ name, slug: slugify(name), stateCode });
    }

    const seedSummary = await seedKnownCities(pool, seedRows);
    console.log(`[Discovery] Seeded ${seedSummary.created} new cities, ${seedSummary.updated} updated for ${stateCode}`);
  } else {
    console.log(`[Discovery] No cities found for ${stateCode}`);
  }

  // Run full discovery for this state
  const fullRunOptions: DiscoveryCrawlerOptions = {
    dryRun,
    verbose,
    stateCode,
    countryCode: 'US',
    cityLimit,
    skipCityDiscovery: true, // the cities were seeded just above
    onlyStale: false, // crawl every city, not only stale ones
  };
  const fullRun = await runFullDiscovery(pool, fullRunOptions);
  return fullRun;
}
// ============================================================
// STATISTICS
// ============================================================
/**
* Snapshot of the discovery tables as returned by getDiscoveryStats.
*/
export interface DiscoveryStats {
/** Counts over dutchie_discovery_cities. */
cities: {
/** All rows in the table. */
total: number;
/** Rows whose last_crawled_at falls within the last 24 hours. */
crawledLast24h: number;
/** Rows whose last_crawled_at is NULL. */
neverCrawled: number;
};
/** Counts over dutchie_discovery_locations rows with active = TRUE. */
locations: {
/** All active rows. */
total: number;
/** Active rows with status 'discovered' (0 when there are none). */
discovered: number;
/** Active rows with status 'verified' (0 when there are none). */
verified: number;
/** Active rows with status 'rejected' (0 when there are none). */
rejected: number;
/** Active rows with status 'merged' (0 when there are none). */
merged: number;
/** Active rows with a non-NULL state_code, grouped by state, largest count first. */
byState: Array<{ stateCode: string; count: number }>;
};
}
/**
 * Summarise the discovery tables: city crawl coverage plus active location
 * counts overall, per status and per state.
 *
 * The three city counts are fetched in parallel first, then the three
 * location queries in parallel.
 *
 * @param pool - Postgres pool used to run the six queries.
 * @returns Parsed counts; a status with no active rows is reported as 0.
 */
export async function getDiscoveryStats(pool: Pool): Promise<DiscoveryStats> {
  // Every cnt value is read through parseInt(..., 10).
  const asCount = (row: { cnt: string }): number => parseInt(row.cnt, 10);

  const statusBreakdownSql = [
    '',
    'SELECT status, COUNT(*) as cnt',
    'FROM dutchie_discovery_locations',
    'WHERE active = TRUE',
    'GROUP BY status',
    '',
  ].join('\n');

  const stateBreakdownSql = [
    '',
    'SELECT state_code, COUNT(*) as cnt',
    'FROM dutchie_discovery_locations',
    'WHERE active = TRUE AND state_code IS NOT NULL',
    'GROUP BY state_code',
    'ORDER BY cnt DESC',
    '',
  ].join('\n');

  const cityQueries = await Promise.all([
    pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_cities'),
    pool.query("SELECT COUNT(*) as cnt FROM dutchie_discovery_cities WHERE last_crawled_at > NOW() - INTERVAL '24 hours'"),
    pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_cities WHERE last_crawled_at IS NULL'),
  ]);
  const allCities = cityQueries[0];
  const recentCities = cityQueries[1];
  const uncrawledCities = cityQueries[2];

  const locationQueries = await Promise.all([
    pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_locations WHERE active = TRUE'),
    pool.query(statusBreakdownSql),
    pool.query(stateBreakdownSql),
  ]);
  const allLocations = locationQueries[0];
  const statusBreakdown = locationQueries[1];
  const stateBreakdown = locationQueries[2];

  // status -> number of active locations carrying that status
  const perStatus: Record<string, number> = {};
  for (const row of statusBreakdown.rows) {
    perStatus[row.status] = asCount(row);
  }
  const countFor = (status: string): number => perStatus[status] || 0;

  const byState: Array<{ stateCode: string; count: number }> = [];
  for (const row of stateBreakdown.rows) {
    byState.push({ stateCode: row.state_code, count: asCount(row) });
  }

  return {
    cities: {
      total: asCount(allCities.rows[0]),
      crawledLast24h: asCount(recentCities.rows[0]),
      neverCrawled: asCount(uncrawledCities.rows[0]),
    },
    locations: {
      total: asCount(allLocations.rows[0]),
      discovered: countFor('discovered'),
      verified: countFor('verified'),
      rejected: countFor('rejected'),
      merged: countFor('merged'),
      byState,
    },
  };
}