## Worker System - Role-agnostic workers that can handle any task type - Pod-based architecture with StatefulSet (5-15 pods, 5 workers each) - Custom pod names (Aethelgard, Xylos, Kryll, etc.) - Worker registry with friendly names and resource monitoring - Hub-and-spoke visualization on JobQueue page ## Stealth & Anti-Detection (REQUIRED) - Proxies are MANDATORY - workers fail to start without active proxies - CrawlRotator initializes on worker startup - Loads proxies from `proxies` table - Auto-rotates proxy + fingerprint on 403 errors - 12 browser fingerprints (Chrome, Firefox, Safari, Edge) - Locale/timezone matching for geographic consistency ## Task System - Renamed product_resync → product_refresh - Task chaining: store_discovery → entry_point → product_discovery - Priority-based claiming with FOR UPDATE SKIP LOCKED - Heartbeat and stale task recovery ## UI Updates - JobQueue: Pod visualization, resource monitoring on hover - WorkersDashboard: Simplified worker list - Removed unused filters from task list ## Other - IP2Location service for visitor analytics - Findagram consumer features scaffolding - Documentation updates 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
483 lines
15 KiB
TypeScript
483 lines
15 KiB
TypeScript
/**
 * Dutchie Discovery Crawler
 *
 * Main orchestrator for the Dutchie store discovery pipeline.
 *
 * AUTOMATED FLOW (as of 2025-01):
 * 1. Fetch cities dynamically from Dutchie GraphQL (getAllCitiesByState)
 * 2. For each city, discover store locations via ConsumerDispensaries query
 * 3. Upsert locations to dutchie_discovery_locations (keyed by platform_location_id)
 * 4. AUTO-VALIDATE: Check required fields (name, city, state, platform_menu_url, platform_location_id)
 * 5. AUTO-PROMOTE: Valid locations are upserted to dispensaries table with crawl_enabled=true
 * 6. All actions logged to dutchie_promotion_log for audit
 *
 * Tables involved:
 * - dutchie_discovery_cities: Known cities for each state
 * - dutchie_discovery_locations: Raw discovered store data
 * - dispensaries: Canonical store records (promoted from discovery)
 * - dutchie_promotion_log: Audit trail for validation/promotion
 *
 * Usage:
 *   npx tsx src/scripts/run-discovery.ts discover:state AZ
 *   npx tsx src/scripts/run-discovery.ts discover:state CA
 */
|
|
|
|
import { Pool } from 'pg';
|
|
import {
|
|
FullDiscoveryResult,
|
|
LocationDiscoveryResult,
|
|
DiscoveryCity,
|
|
} from './types';
|
|
import {
|
|
discoverCities,
|
|
getCitiesToCrawl,
|
|
getCityBySlug,
|
|
seedKnownCities,
|
|
} from './city-discovery';
|
|
import {
|
|
discoverLocationsForCity,
|
|
getCitiesForState,
|
|
} from './location-discovery';
|
|
import { promoteDiscoveredLocations } from './promotion';
|
|
|
|
// ============================================================
// FULL DISCOVERY
// ============================================================
|
|
|
|
/**
 * Options accepted by `runFullDiscovery`.
 *
 * Every field is optional; the defaults listed are the ones `runFullDiscovery`
 * applies when a field is omitted.
 */
export interface DiscoveryCrawlerOptions {
  /**
   * Forwarded to city and location discovery; when true, the auto-promotion
   * and dropped-store detection steps are skipped. Default: `false`.
   */
  dryRun?: boolean;
  /** Forwarded to city and location discovery. Default: `false`. */
  verbose?: boolean;
  /**
   * State filter passed to city selection, promotion and dropped-store
   * detection. No default (all states).
   */
  stateCode?: string;
  /** Country filter passed to city selection. Default: `'US'`. */
  countryCode?: string;
  /** Passed to city selection as its `limit`. Default: `50`. */
  cityLimit?: number;
  /** When true, the city discovery step (step 1) is skipped. Default: `false`. */
  skipCityDiscovery?: boolean;
  /**
   * Forwarded to city selection. Default: `true`.
   * NOTE(review): presumably restricts the crawl to cities not crawled within
   * `staleDays` — confirm against `getCitiesToCrawl`.
   */
  onlyStale?: boolean;
  /** Forwarded to city selection together with `onlyStale`. Default: `7`. */
  staleDays?: number;
}
|
|
|
|
/**
 * Run the full discovery pipeline:
 * 1. Discover/refresh cities (unless `skipCityDiscovery` is set)
 * 2. Select the cities to crawl
 * 3. Discover locations for each selected city, pausing between cities
 * 4. Auto-validate and promote discovered locations (live runs only, and only
 *    when at least one location was upserted)
 * 5. Detect dropped stores (live runs only, and only when the crawl found
 *    locations and no city failed)
 *
 * A failure while crawling one city is logged and recorded as an error entry
 * in the returned `locations` array; it does not abort the run.
 *
 * @param pool - Database pool handed to the discovery helpers.
 * @param options - See `DiscoveryCrawlerOptions` for fields and defaults.
 * @returns City discovery result, per-city location results, location totals,
 *   and `durationMs` measured up to the end of step 3 (the value printed as
 *   "Duration" in the summary).
 */
export async function runFullDiscovery(
  pool: Pool,
  options: DiscoveryCrawlerOptions = {}
): Promise<FullDiscoveryResult> {
  const startTime = Date.now();
  const {
    dryRun = false,
    verbose = false,
    stateCode,
    countryCode = 'US',
    cityLimit = 50,
    skipCityDiscovery = false,
    onlyStale = true,
    staleDays = 7,
  } = options;

  // Pause between consecutive city crawls (rate limiting).
  const cityCrawlDelayMs = 2000;

  console.log('='.repeat(60));
  console.log('DUTCHIE DISCOVERY CRAWLER');
  console.log('='.repeat(60));
  console.log(`Mode: ${dryRun ? 'DRY RUN' : 'LIVE'}`);
  if (stateCode) console.log(`State: ${stateCode}`);
  console.log(`Country: ${countryCode}`);
  console.log(`City limit: ${cityLimit}`);
  console.log('');

  // Step 1: Discover/refresh cities
  let cityResult = {
    citiesFound: 0,
    citiesUpserted: 0,
    citiesSkipped: 0,
    errors: [] as string[],
    durationMs: 0,
  };

  if (!skipCityDiscovery) {
    console.log('[Discovery] Step 1: Discovering cities...');
    cityResult = await discoverCities(pool, { dryRun, verbose });
  } else {
    console.log('[Discovery] Step 1: Skipping city discovery (using existing cities)');
  }

  // Step 2: Get cities to crawl
  console.log('[Discovery] Step 2: Getting cities to crawl...');
  const cities = await getCitiesToCrawl(pool, {
    stateCode,
    countryCode,
    limit: cityLimit,
    onlyStale,
    staleDays,
  });

  console.log(`[Discovery] Found ${cities.length} cities to crawl`);

  // Step 3: Discover locations for each city
  console.log('[Discovery] Step 3: Discovering locations...');
  const locationResults: LocationDiscoveryResult[] = [];
  let totalLocationsFound = 0;
  let totalLocationsUpserted = 0;
  let failedCityCount = 0;

  for (const [index, city] of cities.entries()) {
    console.log(`\n[Discovery] City ${index + 1}/${cities.length}: ${city.cityName}, ${city.stateCode}`);

    try {
      const result = await discoverLocationsForCity(pool, city, { dryRun, verbose });
      locationResults.push(result);
      totalLocationsFound += result.locationsFound;
      totalLocationsUpserted += result.locationsUpserted;
    } catch (error: unknown) {
      // Anything can be thrown; only Error instances carry a `.message`.
      const message = error instanceof Error ? error.message : String(error);
      failedCityCount += 1;
      console.error(`[Discovery] Error crawling ${city.cityName}: ${message}`);
      locationResults.push({
        cityId: city.id,
        citySlug: city.citySlug,
        locationsFound: 0,
        locationsUpserted: 0,
        locationsNew: 0,
        locationsUpdated: 0,
        errors: [message],
        durationMs: 0,
      });
    }

    // Rate limiting between cities. Deliberately outside the try block so the
    // pause also applies after a failed city instead of retrying the remote
    // service immediately.
    if (index < cities.length - 1) {
      await new Promise((resolve) => setTimeout(resolve, cityCrawlDelayMs));
    }
  }

  const durationMs = Date.now() - startTime;

  // Summary
  console.log('\n' + '='.repeat(60));
  console.log('DISCOVERY COMPLETE');
  console.log('='.repeat(60));
  console.log(`Duration: ${(durationMs / 1000).toFixed(1)}s`);
  console.log('');
  console.log('Cities:');
  console.log(` Discovered: ${cityResult.citiesFound}`);
  console.log(` Upserted: ${cityResult.citiesUpserted}`);
  console.log(` Crawled: ${cities.length}`);
  console.log('');
  console.log('Locations:');
  console.log(` Found: ${totalLocationsFound}`);
  console.log(` Upserted: ${totalLocationsUpserted}`);
  console.log('');

  const totalErrors = cityResult.errors.length +
    locationResults.reduce((sum, r) => sum + r.errors.length, 0);
  if (totalErrors > 0) {
    console.log(`Errors: ${totalErrors}`);
  }

  // Step 4: Auto-validate and promote discovered locations
  if (!dryRun && totalLocationsUpserted > 0) {
    console.log('\n[Discovery] Step 4: Auto-promoting discovered locations...');
    const promotionResult = await promoteDiscoveredLocations(stateCode, false);
    console.log(`[Discovery] Promotion complete:`);
    console.log(` Created: ${promotionResult.created} new dispensaries`);
    console.log(` Updated: ${promotionResult.updated} existing dispensaries`);
    console.log(` Rejected: ${promotionResult.rejected} (validation failed)`);

    const rejected = promotionResult.rejectedRecords;
    if (rejected.length > 0) {
      console.log(` Rejection reasons:`);
      for (const record of rejected.slice(0, 5)) {
        console.log(` - ${record.name}: ${record.errors.join(', ')}`);
      }
      if (rejected.length > 5) {
        console.log(` ... and ${rejected.length - 5} more`);
      }
    }
  }

  // Step 5: Detect dropped stores (in DB but not in discovery results)
  if (!dryRun) {
    if (totalLocationsFound === 0 || failedCityCount > 0) {
      // Dropped-store detection treats "not seen recently" as "gone". If this
      // crawl found nothing or some cities failed, absence is not evidence of
      // a drop, so skip rather than mark healthy stores as dropped.
      // NOTE(review): a crawl truncated by cityLimit/onlyStale is also
      // incomplete; detection still relies on its own 24-hour window there.
      console.log('\n[Discovery] Step 5: Skipping dropped-store detection (no locations found or some cities failed to crawl)');
    } else {
      console.log('\n[Discovery] Step 5: Detecting dropped stores...');
      const droppedResult = await detectDroppedStores(pool, stateCode);
      if (droppedResult.droppedCount > 0) {
        console.log(`[Discovery] Found ${droppedResult.droppedCount} dropped stores:`);
        for (const store of droppedResult.droppedStores.slice(0, 10)) {
          console.log(` - ${store.name} (${store.city}, ${store.state}) - last seen: ${store.lastSeenAt}`);
        }
        if (droppedResult.droppedCount > 10) {
          console.log(` ... and ${droppedResult.droppedCount - 10} more`);
        }
      } else {
        console.log(`[Discovery] No dropped stores detected`);
      }
    }
  }

  return {
    cities: cityResult,
    locations: locationResults,
    totalLocationsFound,
    totalLocationsUpserted,
    durationMs,
  };
}
|
|
|
|
// ============================================================
// DROPPED STORE DETECTION
// ============================================================
|
|
|
|
/**
 * Outcome of `detectDroppedStores`.
 */
export interface DroppedStoreResult {
  /** Number of entries in `droppedStores`. */
  droppedCount: number;
  /** The dispensaries that were flagged as dropped, ordered by name. */
  droppedStores: Array<{
    /** `dispensaries.id`. */
    id: number;
    /** `dispensaries.name`. */
    name: string;
    /** `dispensaries.city`. */
    city: string;
    /** `dispensaries.state`. */
    state: string;
    /** `dispensaries.platform_dispensary_id`. */
    platformDispensaryId: string;
    /**
     * Taken from `dispensaries.updated_at` as it was when the store was
     * selected (the query aliases it to `last_seen_at`); it is not the
     * discovery table's `last_seen_at`.
     */
    lastSeenAt: string;
  }>;
}
|
|
|
|
/**
 * Detect stores that exist in `dispensaries` but were not found by recent
 * discovery, mark them `status = 'dropped'` for manual review, and write one
 * `dutchie_promotion_log` row per store (action `'dropped'`).
 *
 * A store is considered "dropped" when all of the following hold:
 * 1. It has a `platform_dispensary_id` and `platform = 'dutchie'`
 * 2. Its status is `'open'` (or NULL) and `crawl_enabled` is true
 * 3. No active `dutchie_discovery_locations` row with the same platform id
 *    has a `last_seen_at` within the last 24 hours
 *
 * The status update and the audit-log inserts run in one transaction, so a
 * store is never marked dropped without its audit row.
 *
 * @param pool - Database pool.
 * @param stateCode - Optional filter on `dispensaries.state`.
 * @returns The stores that were marked as dropped; `lastSeenAt` is the
 *   store's `updated_at` from before this call changed it, as an ISO-8601
 *   string when the driver returns a `Date`.
 * @throws Re-throws any database error after rolling the transaction back.
 */
export async function detectDroppedStores(
  pool: Pool,
  stateCode?: string
): Promise<DroppedStoreResult> {
  type DroppedRow = {
    id: number;
    name: string;
    city: string;
    state: string;
    platform_dispensary_id: string;
    last_seen_at: Date | string | null;
  };

  // `lastSeenAt` is declared as a string; normalise whatever the driver
  // returned (Date, string or NULL) so the runtime value matches the type.
  const toTimestampString = (value: Date | string | null | undefined): string => {
    if (value instanceof Date) return value.toISOString();
    return value == null ? '' : String(value);
  };

  const params: string[] = [];
  let stateFilter = '';
  if (stateCode) {
    params.push(stateCode);
    stateFilter = `AND d.state = $${params.length}`;
  }

  // NOT EXISTS instead of NOT IN: with NOT IN, a single NULL
  // platform_location_id in the subquery makes the predicate NULL for every
  // row, and no dropped store would ever be detected.
  const selectSql = `
    SELECT
      d.id,
      d.name,
      d.city,
      d.state,
      d.platform_dispensary_id,
      d.updated_at AS last_seen_at
    FROM dispensaries d
    WHERE d.platform_dispensary_id IS NOT NULL
      AND d.platform = 'dutchie'
      AND (d.status = 'open' OR d.status IS NULL)
      AND d.crawl_enabled = true
      AND NOT EXISTS (
        SELECT 1
        FROM dutchie_discovery_locations l
        WHERE l.platform_location_id = d.platform_dispensary_id
          AND l.last_seen_at > NOW() - INTERVAL '24 hours'
          AND l.active = true
      )
      ${stateFilter}
    ORDER BY d.name
  `;

  const selected = await pool.query(selectSql, params);
  const droppedRows: DroppedRow[] = selected.rows;

  if (droppedRows.length === 0) {
    return { droppedCount: 0, droppedStores: [] };
  }

  // Transactions need a single dedicated connection; pool.query() may use a
  // different connection for every statement.
  const client = await pool.connect();
  try {
    await client.query('BEGIN');

    // Mark these stores as 'dropped' status
    await client.query(
      `
      UPDATE dispensaries
      SET status = 'dropped', updated_at = NOW()
      WHERE id = ANY($1::int[])
      `,
      [droppedRows.map((row) => row.id)]
    );

    // Log to promotion log for audit: one statement for all stores.
    await client.query(
      `
      INSERT INTO dutchie_promotion_log
        (dispensary_id, action, state_code, store_name, triggered_by)
      SELECT t.dispensary_id, 'dropped', t.state_code, t.store_name, 'discovery_detection'
      FROM unnest($1::int[], $2::text[], $3::text[])
        AS t(dispensary_id, state_code, store_name)
      `,
      [
        droppedRows.map((row) => row.id),
        droppedRows.map((row) => row.state),
        droppedRows.map((row) => row.name),
      ]
    );

    await client.query('COMMIT');
  } catch (error: unknown) {
    // Best-effort rollback; the original failure is what the caller must see.
    await client.query('ROLLBACK').catch(() => undefined);
    throw error;
  } finally {
    client.release();
  }

  return {
    droppedCount: droppedRows.length,
    droppedStores: droppedRows.map((row) => ({
      id: row.id,
      name: row.name,
      city: row.city,
      state: row.state,
      platformDispensaryId: row.platform_dispensary_id,
      lastSeenAt: toTimestampString(row.last_seen_at),
    })),
  };
}
|
|
|
|
// ============================================================
// SINGLE CITY DISCOVERY
// ============================================================
|
|
|
|
/**
 * Discover locations for a single city by slug.
 *
 * If the city is unknown and a `stateCode` is supplied, the city is seeded
 * first, with a display name derived from the slug (hyphens become spaces and
 * each word is capitalised), and then looked up again.
 *
 * @param pool - Database pool handed to the lookup/seed/discovery helpers.
 * @param citySlug - Slug of the city to crawl.
 * @param options - `stateCode` (also required for creating a missing city),
 *   `countryCode` (default `'US'`), `dryRun` and `verbose` (both default
 *   `false`, forwarded to location discovery).
 * @returns The location discovery result, or `null` when the city is unknown
 *   and could not be created.
 */
export async function discoverCity(
  pool: Pool,
  citySlug: string,
  options: {
    stateCode?: string;
    countryCode?: string;
    dryRun?: boolean;
    verbose?: boolean;
  } = {}
): Promise<LocationDiscoveryResult | null> {
  const { stateCode, countryCode = 'US', dryRun = false, verbose = false } = options;

  // Find the city
  let city = await getCityBySlug(pool, citySlug, stateCode, countryCode);

  // Try to create it if we have enough info (a state is required to seed).
  if (!city && stateCode) {
    console.log(`[Discovery] City ${citySlug} not found, creating...`);
    const displayName = citySlug
      .replace(/-/g, ' ')
      .replace(/\b\w/g, (letter) => letter.toUpperCase());
    await seedKnownCities(pool, [
      { name: displayName, slug: citySlug, stateCode, countryCode },
    ]);
    city = await getCityBySlug(pool, citySlug, stateCode, countryCode);
  }

  if (!city) {
    console.log(`[Discovery] City ${citySlug} not found and could not be created`);
    return null;
  }

  return discoverLocationsForCity(pool, city, { dryRun, verbose });
}
|
|
|
|
// ============================================================
// STATE-WIDE DISCOVERY
// ============================================================
|
|
|
|
/**
 * Seed and discover all cities for a state.
 *
 * City names are fetched for the state, converted to slugs (lower-cased,
 * whitespace runs replaced by `-`, every character outside `a-z0-9-`
 * removed) and seeded. Names producing the same slug are seeded once (first
 * occurrence wins) and names whose slug is empty are skipped. The full
 * discovery pipeline then runs for the state with city discovery skipped and
 * `onlyStale` disabled, so every seeded city up to `cityLimit` is crawled.
 *
 * @param pool - Database pool handed to the seeding and discovery helpers.
 * @param stateCode - State to discover.
 * @param options - `dryRun` and `verbose` (default `false`) and `cityLimit`
 *   (default `100`), forwarded to `runFullDiscovery`.
 * @returns The result of `runFullDiscovery` for the state.
 */
export async function discoverState(
  pool: Pool,
  stateCode: string,
  options: {
    dryRun?: boolean;
    verbose?: boolean;
    cityLimit?: number;
  } = {}
): Promise<FullDiscoveryResult> {
  const { dryRun = false, verbose = false, cityLimit = 100 } = options;

  console.log(`[Discovery] Discovering state: ${stateCode}`);

  // Dynamically fetch and seed cities for this state
  console.log(`[Discovery] Fetching cities for ${stateCode} from Dutchie...`);
  const cityNames = await getCitiesForState(stateCode);

  // Keyed by slug so duplicate names are not handed to the seeder twice, and
  // names with no usable slug characters are dropped.
  const seedsBySlug = new Map<string, { name: string; slug: string; stateCode: string }>();
  for (const name of cityNames) {
    const slug = name.toLowerCase().replace(/\s+/g, '-').replace(/[^a-z0-9-]/g, '');
    if (slug !== '' && !seedsBySlug.has(slug)) {
      seedsBySlug.set(slug, { name, slug, stateCode });
    }
  }
  const cities = [...seedsBySlug.values()];

  if (cities.length > 0) {
    const seeded = await seedKnownCities(pool, cities);
    console.log(`[Discovery] Seeded ${seeded.created} new cities, ${seeded.updated} updated for ${stateCode}`);
  } else {
    console.log(`[Discovery] No cities found for ${stateCode}`);
  }

  // Run full discovery for this state
  return runFullDiscovery(pool, {
    dryRun,
    verbose,
    stateCode,
    countryCode: 'US',
    cityLimit,
    skipCityDiscovery: true, // Use seeded cities
    onlyStale: false, // Crawl all
  });
}
|
|
|
|
// ============================================================
// STATISTICS
// ============================================================
|
|
|
|
/**
 * Aggregate counts returned by `getDiscoveryStats`.
 */
export interface DiscoveryStats {
  /** Counts over `dutchie_discovery_cities`. */
  cities: {
    /** All rows in the table. */
    total: number;
    /** Rows whose `last_crawled_at` is within the last 24 hours. */
    crawledLast24h: number;
    /** Rows whose `last_crawled_at` is NULL. */
    neverCrawled: number;
  };
  /** Counts over active rows of `dutchie_discovery_locations`. */
  locations: {
    /** All active rows. */
    total: number;
    /** Active rows with status `'discovered'`. */
    discovered: number;
    /** Active rows with status `'verified'`. */
    verified: number;
    /** Active rows with status `'rejected'`. */
    rejected: number;
    /** Active rows with status `'merged'`. */
    merged: number;
    /** Active rows grouped by non-NULL `state_code`, largest count first. */
    byState: Array<{ stateCode: string; count: number }>;
  };
}
|
|
|
|
/**
 * Get discovery statistics.
 *
 * Issues three queries concurrently: one for all city counts, one for active
 * locations grouped by status and one for active locations grouped by state.
 * The total number of active locations is the sum of the per-status counts
 * (the grouping covers every active row, including rows with a NULL status).
 *
 * @param pool - Database pool used for the three queries.
 * @returns City and location counts; a status with no rows is reported as 0.
 */
export async function getDiscoveryStats(pool: Pool): Promise<DiscoveryStats> {
  const [cityCounts, locsByStatus, locsByState] = await Promise.all([
    // One pass over the cities table instead of three separate COUNT queries.
    pool.query(`
      SELECT
        COUNT(*) AS total,
        COUNT(*) FILTER (WHERE last_crawled_at > NOW() - INTERVAL '24 hours') AS crawled_last_24h,
        COUNT(*) FILTER (WHERE last_crawled_at IS NULL) AS never_crawled
      FROM dutchie_discovery_cities
    `),
    pool.query(`
      SELECT status, COUNT(*) AS cnt
      FROM dutchie_discovery_locations
      WHERE active = TRUE
      GROUP BY status
    `),
    pool.query(`
      SELECT state_code, COUNT(*) AS cnt
      FROM dutchie_discovery_locations
      WHERE active = TRUE AND state_code IS NOT NULL
      GROUP BY state_code
      ORDER BY cnt DESC
    `),
  ]);

  // COUNT(*) arrives as a string (bigint), hence the parseInt calls.
  const toCount = (value: unknown): number => parseInt(String(value), 10);

  const statusCounts = new Map<string, number>();
  let totalLocations = 0;
  for (const row of locsByStatus.rows) {
    const count = toCount(row.cnt);
    totalLocations += count;
    // Rows with a NULL status count toward the total only.
    if (row.status != null) {
      statusCounts.set(String(row.status), count);
    }
  }

  const cityRow = cityCounts.rows[0];

  return {
    cities: {
      total: toCount(cityRow.total),
      crawledLast24h: toCount(cityRow.crawled_last_24h),
      neverCrawled: toCount(cityRow.never_crawled),
    },
    locations: {
      total: totalLocations,
      discovered: statusCounts.get('discovered') ?? 0,
      verified: statusCounts.get('verified') ?? 0,
      rejected: statusCounts.get('rejected') ?? 0,
      merged: statusCounts.get('merged') ?? 0,
      byState: locsByState.rows.map((row: { state_code: string; cnt: unknown }) => ({
        stateCode: row.state_code,
        count: toCount(row.cnt),
      })),
    },
  };
}
|