Platform isolation:
- Rename handlers to {task}-{platform}.ts convention
- Deprecate -curl variants (now _deprecated-*)
- Platform-based routing in task-worker.ts
- Add Jane platform handlers and client
Evomi geo-targeting:
- Add dynamic proxy URL builder with state/city targeting
- Session stickiness per worker per state (30 min)
- Fallback to static proxy table when API unavailable
- Add proxy tracking columns to worker_tasks
Proxy management:
- New /proxies admin page for visibility
- Track proxy_ip, proxy_geo, proxy_source per task
- Show active sessions and task history
Validation filtering:
- Filter by validated stores (platform_dispensary_id + menu_url)
- Mark incomplete stores as deprecated
- Update all dashboard/stats queries
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
547 lines
19 KiB
TypeScript
547 lines
19 KiB
TypeScript
/**
|
|
* Store Discovery Handler - Dutchie Platform
|
|
*
|
|
* Uses Puppeteer + StealthPlugin to discover stores via browser context.
|
|
*
|
|
* Naming convention: {task}-{platform}.ts
|
|
*
|
|
* This handler:
|
|
* 1. Launches headless browser with proxy (if provided)
|
|
* 2. Establishes session by visiting Dutchie dispensaries page
|
|
* 3. Fetches cities for each state via getAllCitiesByState GraphQL
|
|
* 4. Fetches stores for each city via ConsumerDispensaries GraphQL
|
|
* 5. Upserts to dutchie_discovery_locations
|
|
* 6. Auto-promotes valid locations to dispensaries table
|
|
*
|
|
* Why browser-based:
|
|
* - Works with session-based residential proxies (Evomi)
|
|
* - Lower detection risk than curl/axios
|
|
* - Real Chrome TLS fingerprint
|
|
*/
|
|
|
|
import { TaskContext, TaskResult } from '../task-worker';
|
|
import { upsertLocation } from '../../discovery/location-discovery';
|
|
import { promoteDiscoveredLocations, TaskTrackingInfo } from '../../discovery/promotion';
|
|
import { saveDiscoveryPayload } from '../../utils/payload-storage';
|
|
|
|
// ---------------------------------------------------------------------------
// Persisted-query hashes for Dutchie's GraphQL endpoint.
// These MUST match CLAUDE.md / dutchie/client.ts.
// ---------------------------------------------------------------------------

/** sha256 hash sent as the persisted-query id for the `getAllCitiesByState` operation. */
const GET_ALL_CITIES_HASH =
  'ae547a0466ace5a48f91e55bf6699eacd87e3a42841560f0c0eabed5a0a920e6';

/** sha256 hash sent as the persisted-query id for the `ConsumerDispensaries` operation. */
const CONSUMER_DISPENSARIES_HASH =
  '0a5bfa6ca1d64ae47bcccb7c8077c87147cbc4e6982c17ceec97a2a4948b311b';
|
|
|
|
/**
 * One entry of the `statesWithDispensaries` list returned by the
 * `getAllCitiesByState` GraphQL query, after in-page cleanup.
 */
interface StateWithCities {
  /** State identifier from the API; the handler compares it case-insensitively to `states.code`. */
  name: string;

  /** Country reported by the API; the handler substitutes 'US' when it is missing. */
  country: string;

  /** City names for this state, with `null` entries already filtered out. */
  cities: string[];
}
|
|
|
|
/**
 * Normalized shape of a single Dutchie dispensary as produced by
 * `normalizeLocation` and handed to `upsertLocation`.
 *
 * Only `id`, `name` and `slug` are required; everything else is optional.
 */
interface DiscoveredLocation {
  // --- Identity -----------------------------------------------------------
  /** Platform identifier (raw `id` or `_id`); records with an empty id are skipped by the handler. */
  id: string;
  name: string;
  /** URL slug; filled from `cName` when the raw record has no `slug`. */
  slug: string;
  /** Filled from `slug` when the raw record has no `cName`. */
  cName?: string;

  // --- Flattened address ----------------------------------------------------
  address?: string;
  city?: string;
  state?: string;
  zip?: string;
  latitude?: number;
  longitude?: number;

  // --- Service flags --------------------------------------------------------
  offerPickup?: boolean;
  offerDelivery?: boolean;
  isRecreational?: boolean;
  isMedical?: boolean;

  // --- Contact / presentation -----------------------------------------------
  phone?: string;
  email?: string;
  /** Populated from the raw record's `embedBackUrl`. */
  website?: string;
  description?: string;
  logoImage?: string;
  bannerImage?: string;

  // --- Organisation / classification ----------------------------------------
  /** Populated from the raw record's `chain`. */
  chainSlug?: string;
  /** Populated from the raw record's `retailer.enterpriseId`. */
  enterpriseId?: string;
  retailType?: string;
  status?: string;
  timezone?: string;

  // --- Raw nested location object, passed through unchanged -------------------
  location?: {
    ln1?: string;
    ln2?: string;
    city?: string;
    state?: string;
    zipcode?: string;
    country?: string;
    /** `normalizeLocation` reads index 0 as longitude and index 1 as latitude. */
    geometry?: { coordinates?: [number, number] };
  };
}
|
|
|
|
/**
 * Store discovery task handler for the Dutchie platform.
 *
 * Flow:
 *  1. Launch a headless Puppeteer browser (with the stealth plugin), routed
 *     through the CrawlRotator's current proxy when one is available.
 *  2. Visit https://dutchie.com/dispensaries to establish a session and make a
 *     best-effort attempt to click through an age gate.
 *  3. Read the active state codes from the `states` table.
 *  4. Fetch all states and their cities via the `getAllCitiesByState`
 *     persisted GraphQL query, executed from inside the page.
 *  5. For each active state and each of its cities, page through the
 *     `ConsumerDispensaries` query, upsert every returned location, save the
 *     raw per-state payload, and promote discovered locations.
 *  6. Self-healing: queue `product_discovery` tasks for up to 50 crawl-enabled
 *     Dutchie dispensaries that have never produced a payload and have no
 *     pending/running product_discovery task.
 *
 * Per-city, per-location, payload-save, promotion and self-healing failures are
 * logged and do not abort the run. Any other error raised during discovery is
 * caught and reported as `{ success: false, error, newStoreIds: [] }`.
 * The browser is always closed in `finally`.
 *
 * @param ctx Task context supplying `pool`, `task`, `crawlRotator`,
 *            `updateStep` and `heartbeat`.
 * @returns On success: counts of new/upserted stores, number of states
 *          processed, ids of newly promoted stores and ids of self-healed stores.
 */
export async function handleStoreDiscoveryDutchie(ctx: TaskContext): Promise<TaskResult> {
  const { pool, task, crawlRotator, updateStep } = ctx;
  const platform = task.platform || 'dutchie';

  // Node-side helper only. It must NOT be referenced from page.evaluate
  // callbacks, which run inside the browser and cannot see this closure.
  const describeError = (err: unknown): string =>
    err instanceof Error ? err.message : String(err);

  let browser: any = null;

  try {
    updateStep('starting', 'Initializing store discovery');
    console.log(`[StoreDiscoveryHTTP] Starting discovery for platform: ${platform}`);

    // ============================================================
    // STEP 1: Setup Puppeteer with proxy
    // ============================================================
    updateStep('preflight', 'Launching browser');
    const puppeteer = require('puppeteer-extra');
    const StealthPlugin = require('puppeteer-extra-plugin-stealth');
    puppeteer.use(StealthPlugin());

    // Get proxy from CrawlRotator if available
    let proxyUrl: string | null = null;
    if (crawlRotator) {
      const currentProxy = crawlRotator.proxy.getCurrent();
      if (currentProxy) {
        proxyUrl = crawlRotator.proxy.getProxyUrl(currentProxy);
        console.log(`[StoreDiscoveryHTTP] Using proxy: ${currentProxy.host}:${currentProxy.port}`);
      }
    }

    // Parse the proxy URL once; it is needed both for the launch flag
    // (host:port only) and for credentials after the page exists.
    const parsedProxy = proxyUrl ? new URL(proxyUrl) : null;

    const browserArgs = ['--no-sandbox', '--disable-setuid-sandbox'];
    if (parsedProxy) {
      browserArgs.push(`--proxy-server=${parsedProxy.host}`);
    }

    browser = await puppeteer.launch({
      headless: 'new',
      args: browserArgs,
    });

    const page = await browser.newPage();

    // Block unnecessary resources to save bandwidth.
    // We only need HTML/JS for the session, then GraphQL JSON.
    const blockedResourceTypes = ['image', 'font', 'media', 'stylesheet'];
    await page.setRequestInterception(true);
    page.on('request', (request: any) => {
      const settled = blockedResourceTypes.includes(request.resourceType())
        ? request.abort()
        : request.continue();
      // abort()/continue() return promises that reject if the request was
      // already handled or the page went away; swallow those so they do not
      // surface as unhandled rejections.
      Promise.resolve(settled).catch(() => {});
    });

    // Setup proxy auth if the proxy URL carries credentials
    if (parsedProxy && parsedProxy.username && parsedProxy.password) {
      await page.authenticate({
        username: decodeURIComponent(parsedProxy.username),
        password: decodeURIComponent(parsedProxy.password),
      });
    }

    await ctx.heartbeat();

    // ============================================================
    // STEP 2: Establish session by visiting dispensaries page
    // ============================================================
    updateStep('navigating', 'Loading session page');
    const sessionUrl = 'https://dutchie.com/dispensaries';
    console.log(`[StoreDiscoveryHTTP] Establishing session at ${sessionUrl}...`);

    await page.goto(sessionUrl, {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });

    // Handle potential age gate (best effort).
    try {
      // Plain timer instead of page.waitForTimeout: that method is no longer
      // available in current Puppeteer releases, and because this block
      // swallows errors its absence would silently skip the click below.
      await new Promise((resolve) => setTimeout(resolve, 1500));
      await page.evaluate(() => {
        const buttons = Array.from(document.querySelectorAll('button'));
        for (const btn of buttons) {
          const text = btn.textContent?.toLowerCase() || '';
          if (text.includes('yes') || text.includes('enter') || text.includes('21')) {
            (btn as HTMLButtonElement).click();
            return true;
          }
        }
        return false;
      });
    } catch (err: unknown) {
      // Age gate might not be present; keep going but leave a trace.
      console.log(`[StoreDiscoveryHTTP] Age gate check skipped: ${describeError(err)}`);
    }

    console.log(`[StoreDiscoveryHTTP] Session established`);

    await ctx.heartbeat();

    // ============================================================
    // STEP 3: Get states to discover from database
    // ============================================================
    const statesResult = await pool.query(`
      SELECT code FROM states WHERE is_active = true ORDER BY code
    `);
    const stateCodesToDiscover: string[] = statesResult.rows.map((r: { code: string }) => r.code);

    if (stateCodesToDiscover.length === 0) {
      // Browser is closed by the finally block.
      return { success: true, storesDiscovered: 0, newStoreIds: [], message: 'No active states to discover' };
    }

    console.log(`[StoreDiscoveryHTTP] Will discover stores in ${stateCodesToDiscover.length} states`);

    // ============================================================
    // STEP 4: Fetch cities for each state via GraphQL
    // ============================================================
    updateStep('fetching', `Fetching cities for ${stateCodesToDiscover.length} states`);
    const statesWithCities = await page.evaluate(async (hash: string) => {
      // Runs in the browser: only the `hash` argument is available here.
      const logs: string[] = [];
      try {
        const extensions = {
          persistedQuery: { version: 1, sha256Hash: hash },
        };
        const qs = new URLSearchParams({
          operationName: 'getAllCitiesByState',
          variables: JSON.stringify({}),
          extensions: JSON.stringify(extensions),
        });
        const url = `https://dutchie.com/api-3/graphql?${qs.toString()}`;

        const response = await fetch(url, {
          method: 'GET',
          headers: {
            'Accept': 'application/json',
            'content-type': 'application/json',
          },
          credentials: 'include',
        });

        logs.push(`getAllCitiesByState: HTTP ${response.status}`);

        if (!response.ok) {
          return { states: [], logs };
        }

        const json = await response.json();
        const statesData = json?.data?.statesWithDispensaries || [];

        const states: StateWithCities[] = [];
        for (const state of statesData) {
          if (state && state.name) {
            const cities = Array.isArray(state.cities)
              ? state.cities.filter((c: string | null) => c !== null)
              : [];
            states.push({
              name: state.name,
              country: state.country || 'US',
              cities,
            });
          }
        }

        logs.push(`Found ${states.length} states with cities`);
        return { states, logs };
      } catch (err: any) {
        logs.push(`Error: ${err.message}`);
        return { states: [], logs };
      }
    }, GET_ALL_CITIES_HASH);

    for (const log of statesWithCities.logs as string[]) {
      console.log(`[Browser] ${log}`);
    }

    if (statesWithCities.states.length === 0) {
      // Browser is closed by the finally block. newStoreIds is included so
      // this failure result has the same shape as the catch-path failure.
      return { success: false, error: 'Failed to fetch states with cities', newStoreIds: [] };
    }

    await ctx.heartbeat();

    // ============================================================
    // STEP 5: For each active state, fetch stores for each city
    // ============================================================
    let totalDiscovered = 0;
    let totalUpserted = 0;
    const allNewStoreIds: number[] = [];

    for (const stateCode of stateCodesToDiscover) {
      const wantedState = stateCode.toUpperCase();
      const stateData = statesWithCities.states.find(
        (s: StateWithCities) => s.name.toUpperCase() === wantedState
      );

      if (!stateData || stateData.cities.length === 0) {
        console.log(`[StoreDiscoveryHTTP] No cities found for ${stateCode}, skipping`);
        continue;
      }

      console.log(`[StoreDiscoveryHTTP] Discovering ${stateData.cities.length} cities in ${stateCode}...`);

      await ctx.heartbeat();

      // Accumulate raw store data for this state
      const stateRawStores: any[] = [];
      const stateCityData: { city: string; stores: any[] }[] = [];

      // Fetch stores for each city in this state
      for (const city of stateData.cities) {
        try {
          const cityResult = await page.evaluate(async (
            cityName: string,
            stateCodeParam: string,
            hash: string
          ) => {
            // Runs in the browser: only the three arguments are available here.
            const logs: string[] = [];
            const allDispensaries: any[] = [];
            const maxPages = 5; // Max 5 pages per city
            const perPage = 200;

            try {
              for (let pageIndex = 0; pageIndex < maxPages; pageIndex++) {
                const variables = {
                  dispensaryFilter: {
                    activeOnly: true,
                    city: cityName,
                    state: stateCodeParam,
                  },
                  page: pageIndex,
                  perPage,
                };

                const extensions = {
                  persistedQuery: { version: 1, sha256Hash: hash },
                };

                const qs = new URLSearchParams({
                  operationName: 'ConsumerDispensaries',
                  variables: JSON.stringify(variables),
                  extensions: JSON.stringify(extensions),
                });
                const url = `https://dutchie.com/api-3/graphql?${qs.toString()}`;

                const response = await fetch(url, {
                  method: 'GET',
                  headers: {
                    'Accept': 'application/json',
                    'content-type': 'application/json',
                  },
                  credentials: 'include',
                });

                if (!response.ok) {
                  logs.push(`${cityName}: HTTP ${response.status}`);
                  break;
                }

                const json = await response.json();
                const dispensaries = json?.data?.filteredDispensaries || [];

                if (dispensaries.length === 0) {
                  break;
                }

                // Filter to ensure correct state
                const stateFiltered = dispensaries.filter((d: any) =>
                  d.location?.state?.toUpperCase() === stateCodeParam.toUpperCase()
                );
                allDispensaries.push(...stateFiltered);

                // A short page means there is nothing further to fetch.
                if (dispensaries.length < perPage) {
                  break;
                }

                // Small delay between pages
                await new Promise(r => setTimeout(r, 100));
              }

              logs.push(`${cityName}: ${allDispensaries.length} stores`);
            } catch (err: any) {
              logs.push(`${cityName}: Error - ${err.message}`);
            }

            return { dispensaries: allDispensaries, logs };
          }, city, stateCode, CONSUMER_DISPENSARIES_HASH);

          for (const log of cityResult.logs as string[]) {
            console.log(`[Browser] ${log}`);
          }

          // Accumulate raw store data
          stateRawStores.push(...cityResult.dispensaries);
          stateCityData.push({ city, stores: cityResult.dispensaries });

          // Upsert each discovered location
          for (const disp of cityResult.dispensaries) {
            try {
              const location = normalizeLocation(disp);
              if (!location.id) {
                continue; // Skip locations without platform ID
              }

              const result = await upsertLocation(pool, location as any, null);
              if (result) {
                totalUpserted++;
                if (result.isNew) {
                  totalDiscovered++;
                }
              }
            } catch (err: unknown) {
              console.error(`[StoreDiscoveryHTTP] Upsert error for ${disp?.name}:`, describeError(err));
            }
          }

          // Small delay between cities to avoid rate limiting
          await new Promise(r => setTimeout(r, 300));
        } catch (err: unknown) {
          console.error(`[StoreDiscoveryHTTP] Error fetching ${city}, ${stateCode}:`, describeError(err));
        }
      }

      // Heartbeat after each state
      await ctx.heartbeat();

      // ============================================================
      // STEP 5b: Save raw store payload for this state
      // ============================================================
      if (stateRawStores.length > 0) {
        try {
          const rawPayload = {
            stateCode,
            platform,
            fetchedAt: new Date().toISOString(),
            storeCount: stateRawStores.length,
            citiesProcessed: stateCityData.length,
            cities: stateCityData,
            stores: stateRawStores,
          };

          const payloadResult = await saveDiscoveryPayload(pool, stateCode, rawPayload, stateRawStores.length);
          console.log(`[StoreDiscoveryHTTP] Saved raw payload for ${stateCode}: ${stateRawStores.length} stores (${(payloadResult.sizeBytes / 1024).toFixed(1)}KB)`);
        } catch (err: unknown) {
          console.error(`[StoreDiscoveryHTTP] Failed to save payload for ${stateCode}:`, describeError(err));
        }
      }

      // Auto-promote valid locations for this state
      try {
        // Pass task tracking info for modification audit trail
        const taskTracking: TaskTrackingInfo = {
          taskId: task.id,
          taskRole: task.role,
        };
        const promotionResult = await promoteDiscoveredLocations(stateCode, false, taskTracking);
        const promoted = promotionResult.created + promotionResult.updated;
        if (promoted > 0) {
          console.log(`[StoreDiscoveryHTTP] Promoted ${promoted} locations in ${stateCode} (${promotionResult.created} new, ${promotionResult.updated} updated)`);
          // newDispensaryIds is returned but not in typed interface
          const newIds = (promotionResult as any).newDispensaryIds || [];
          allNewStoreIds.push(...newIds);
        }
      } catch (err: unknown) {
        console.error(`[StoreDiscoveryHTTP] Promotion error for ${stateCode}:`, describeError(err));
      }
    }

    // Release the browser before the database-only self-healing phase.
    await browser.close();
    browser = null;

    // ============================================================
    // SELF-HEALING: Find existing stores missing payloads
    // This catches stores that were added before chaining was implemented,
    // or stores where product_discovery previously failed.
    // ============================================================
    const healedStoreIds: number[] = [];
    try {
      const healResult = await pool.query(`
        SELECT d.id, d.name
        FROM dispensaries d
        WHERE d.platform = 'dutchie'
        AND d.crawl_enabled = true
        AND (d.stage IS NULL OR d.stage NOT IN ('deprecated', 'failing'))
        AND d.platform_dispensary_id IS NOT NULL
        AND d.last_payload_at IS NULL
        AND NOT EXISTS (
          SELECT 1 FROM task_queue t
          WHERE t.dispensary_id = d.id
          AND t.role = 'product_discovery'
          AND t.status IN ('pending', 'running')
        )
        ORDER BY d.id
        LIMIT 50
      `);

      if (healResult.rows.length > 0) {
        console.log(`[StoreDiscoveryHTTP] Self-healing: Found ${healResult.rows.length} stores missing payloads`);

        for (const store of healResult.rows) {
          const insertResult = await pool.query(`
            INSERT INTO task_queue (role, dispensary_id, priority, scheduled_for, method, platform)
            VALUES ('product_discovery', $1, 5, NOW(), 'http', 'dutchie')
            ON CONFLICT DO NOTHING
          `, [store.id]);

          // ON CONFLICT DO NOTHING can skip the insert. Only an explicit
          // rowCount of 0 is treated as "not queued"; if the driver does not
          // report a row count, the store is still counted as before.
          if (insertResult?.rowCount !== 0) {
            healedStoreIds.push(store.id);
          }
        }

        console.log(`[StoreDiscoveryHTTP] Self-healing: Queued ${healedStoreIds.length} product_discovery tasks`);
      }
    } catch (healErr: unknown) {
      console.error(`[StoreDiscoveryHTTP] Self-healing error:`, describeError(healErr));
    }

    console.log(`[StoreDiscoveryHTTP] Complete: ${totalDiscovered} new, ${totalUpserted} upserted, ${allNewStoreIds.length} promoted, ${healedStoreIds.length} healed`);

    return {
      success: true,
      storesDiscovered: totalDiscovered,
      storesUpserted: totalUpserted,
      statesProcessed: stateCodesToDiscover.length,
      newStoreIds: allNewStoreIds,
      healedStoreIds,
    };
  } catch (error: unknown) {
    const errorMessage = error instanceof Error ? error.message : 'Unknown error';
    console.error(`[StoreDiscoveryHTTP] Error:`, errorMessage);
    return {
      success: false,
      error: errorMessage,
      newStoreIds: [],
    };
  } finally {
    // Single cleanup point for every exit path, including early returns.
    if (browser) {
      await browser.close().catch(() => {});
    }
  }
}
|
|
|
|
/**
 * Normalize a raw dispensary response to our DiscoveredLocation format.
 *
 * Top-level fields on the raw record win; the nested `location` object is used
 * as a fallback for address parts. Missing string fields become `''`.
 * Coordinates are read from `location.geometry.coordinates` with index 0 as
 * longitude and index 1 as latitude, falling back to `raw.longitude` /
 * `raw.latitude`.
 *
 * @param raw Raw dispensary object from the GraphQL response. `null` or
 *            `undefined` is tolerated and yields a record with an empty `id`.
 * @returns The normalized location; `id` is `''` when no platform id is present.
 */
function normalizeLocation(raw: any): DiscoveredLocation {
  // Tolerate null/undefined entries instead of throwing on property access.
  const source = raw ?? {};
  const loc = source.location || {};
  const coords = loc.geometry?.coordinates || [];

  return {
    id: source.id || source._id || '',
    name: source.name || '',
    slug: source.slug || source.cName || '',
    cName: source.cName || source.slug || '',
    address: source.address || loc.ln1 || '',
    city: source.city || loc.city || '',
    state: source.state || loc.state || '',
    zip: source.zip || loc.zipcode || loc.zip || '',
    // `??` rather than `||`: 0 is a valid coordinate and must not be
    // discarded in favour of the fallback.
    latitude: coords[1] ?? source.latitude,
    longitude: coords[0] ?? source.longitude,
    timezone: source.timezone || '',
    offerPickup: source.offerPickup ?? source.storeSettings?.offerPickup ?? true,
    offerDelivery: source.offerDelivery ?? source.storeSettings?.offerDelivery ?? false,
    isRecreational: source.isRecreational ?? source.recDispensary ?? true,
    isMedical: source.isMedical ?? source.medicalDispensary ?? true,
    phone: source.phone || '',
    email: source.email || '',
    website: source.embedBackUrl || '',
    description: source.description || '',
    logoImage: source.logoImage || '',
    bannerImage: source.bannerImage || '',
    chainSlug: source.chain || '',
    enterpriseId: source.retailer?.enterpriseId || '',
    retailType: source.retailType || '',
    status: source.status || '',
    location: loc,
  };
}
|