Store Discovery Parallelization: - Add store_discovery_state handler for per-state parallel discovery - Add POST /api/tasks/batch/store-discovery endpoint - 8 workers can now process states in parallel (~30-45 min vs 3+ hours) Modification Tracking (Migration 090): - Add last_modified_at, last_modified_by_task, last_modified_task_id to dispensaries - Add same columns to store_products - Update all handlers to set tracking info on modifications Stale Task Recovery: - Add periodic stale cleanup every 10 minutes (worker-0 only) - Prevents orphaned tasks from blocking queue after worker crashes Task Deduplication: - createStaggeredTasks now skips if pending/active task exists for same role - Skips if same role completed within last 4 hours - API responses include skipped count 🤖 Generated with [Claude Code](https://claude.com/claude-code)
372 lines
13 KiB
TypeScript
372 lines
13 KiB
TypeScript
/**
 * Product Discovery HTTP Handler (Browser-based)
 *
 * Uses Puppeteer + StealthPlugin to fetch products via browser context.
 * Based on test-intercept.js pattern from ORGANIC_SCRAPING_GUIDE.md.
 *
 * This handler:
 * 1. Loads dispensary info
 * 2. Launches headless browser with proxy (if provided)
 * 3. Establishes session by visiting embedded menu
 * 4. Fetches ALL products via GraphQL from browser context
 * 5. Saves raw payload to filesystem (gzipped)
 * 6. Records metadata in raw_crawl_payloads table
 * 7. Queues product_refresh task to process the payload
 *
 * Why browser-based:
 * - Works with session-based residential proxies (Evomi)
 * - Lower detection risk than curl/axios
 * - Real Chrome TLS fingerprint
 */
|
|
|
|
import { TaskContext, TaskResult } from '../task-worker';
|
|
import { saveRawPayload } from '../../utils/payload-storage';
|
|
import { taskService } from '../task-service';
|
|
|
|
// Persisted-query SHA-256 hash for the FilteredProducts GraphQL operation.
// Sent as `extensions.persistedQuery.sha256Hash` on every products request.
// MUST match the value documented in CLAUDE.md.
const FILTERED_PRODUCTS_HASH = 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0';
|
|
|
|
/**
 * Resolves after `ms` milliseconds.
 *
 * Used instead of `page.waitForTimeout`, which is deprecated and absent from
 * newer Puppeteer releases; a plain timer behaves the same on every version.
 */
function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Best-effort dismissal of an age-gate modal on the currently loaded page.
 *
 * Tries a list of known selectors first, then falls back to clicking the first
 * `<button>` whose text contains "yes", "enter" or "21". Never throws: any
 * failure is logged and treated as "no age gate present".
 *
 * @param page Puppeteer page that has already navigated to the menu URL.
 */
async function dismissAgeGate(page: any): Promise<void> {
  try {
    // Give the modal a moment to render before probing for it.
    await sleep(1500);

    // NOTE(review): the `:has-text(...)` entries look like Playwright selector
    // syntax; if `page.$` rejects them the error is swallowed below and the
    // text-based fallback covers those buttons instead.
    const ageGateSelectors = [
      'button[data-testid="age-gate-submit"]',
      'button:has-text("Yes")',
      'button:has-text("I am 21")',
      'button:has-text("Enter")',
      '[class*="age-gate"] button',
      '[class*="AgeGate"] button',
      '[data-test="age-gate-button"]',
    ];

    for (const selector of ageGateSelectors) {
      try {
        const button = await page.$(selector);
        if (button) {
          await button.click();
          console.log(`[ProductDiscoveryHTTP] Age gate dismissed via: ${selector}`);
          await sleep(1000); // Wait for modal to close
          break;
        }
      } catch {
        // Selector not found or not supported, try next
      }
    }

    // Fallback: runs inside the page and clicks the first button with matching text.
    await page.evaluate(() => {
      const buttons = Array.from(document.querySelectorAll('button'));
      for (const btn of buttons) {
        const text = btn.textContent?.toLowerCase() || '';
        if (text.includes('yes') || text.includes('enter') || text.includes('21')) {
          (btn as HTMLButtonElement).click();
          return true;
        }
      }
      return false;
    });
  } catch {
    // Age gate might not be present, continue
    console.log(`[ProductDiscoveryHTTP] No age gate detected or already dismissed`);
  }
}

/**
 * Task handler: discovers all products for one dispensary through a headless
 * browser session and stores the raw result for later processing.
 *
 * Flow: load the dispensary row, launch Puppeteer (through the rotator's
 * current proxy when one is available), open the embedded menu to establish a
 * session, page through the FilteredProducts GraphQL query from inside the
 * page, save the raw payload, stamp the dispensary row, and queue a
 * `product_refresh` task that points at the saved payload.
 *
 * @param ctx Task context providing the DB pool, the task row, the optional
 *            crawl rotator, and the `updateStep` / `heartbeat` callbacks.
 * @returns A result with `success: true` plus `payloadId`, `productCount` and
 *          `sizeBytes` on success; otherwise `success: false` with an `error`
 *          message. Errors are caught and reported in the result, not thrown.
 */
export async function handleProductDiscoveryHttp(ctx: TaskContext): Promise<TaskResult> {
  const { pool, task, crawlRotator, updateStep } = ctx;
  const dispensaryId = task.dispensary_id;

  if (!dispensaryId) {
    return { success: false, error: 'No dispensary_id specified for product_discovery task' };
  }

  // Kept outside the try so the finally block can always close it.
  let browser: any = null;

  try {
    // ============================================================
    // STEP 1: Load dispensary info
    // ============================================================
    updateStep('loading', 'Loading dispensary info');
    const dispResult = await pool.query(`
      SELECT
        id, name, platform_dispensary_id, menu_url, menu_type, city, state
      FROM dispensaries
      WHERE id = $1 AND crawl_enabled = true
    `, [dispensaryId]);

    if (dispResult.rows.length === 0) {
      return { success: false, error: `Dispensary ${dispensaryId} not found or not crawl_enabled` };
    }

    const dispensary = dispResult.rows[0];
    const platformId = dispensary.platform_dispensary_id;

    if (!platformId) {
      return { success: false, error: `Dispensary ${dispensaryId} has no platform_dispensary_id` };
    }

    // Extract cName (the menu slug) from menu_url; falls back to the literal
    // 'dispensary' when the URL is missing or does not match.
    const cNameMatch = dispensary.menu_url?.match(/\/(?:embedded-menu|dispensary)\/([^/?]+)/);
    const cName = cNameMatch ? cNameMatch[1] : 'dispensary';

    console.log(`[ProductDiscoveryHTTP] Starting for ${dispensary.name} (ID: ${dispensaryId})`);
    console.log(`[ProductDiscoveryHTTP] Platform ID: ${platformId}, cName: ${cName}`);

    await ctx.heartbeat();

    // ============================================================
    // STEP 2: Setup Puppeteer with proxy
    // ============================================================
    updateStep('preflight', `Launching browser for ${dispensary.name}`);
    const puppeteer = require('puppeteer-extra');
    const StealthPlugin = require('puppeteer-extra-plugin-stealth');

    // Register the stealth plugin once per process rather than once per task.
    // NOTE(review): relies on puppeteer-extra exposing `plugins` and on the
    // stealth plugin reporting the name 'stealth'; if either assumption is
    // wrong the check is false and use() is called exactly as before.
    const stealthRegistered =
      Array.isArray(puppeteer.plugins) &&
      puppeteer.plugins.some((plugin: any) => plugin?.name === 'stealth');
    if (!stealthRegistered) {
      puppeteer.use(StealthPlugin());
    }

    // Get proxy from CrawlRotator if available
    let proxyUrl: string | null = null;
    if (crawlRotator) {
      const currentProxy = crawlRotator.proxy.getCurrent();
      if (currentProxy) {
        proxyUrl = crawlRotator.proxy.getProxyUrl(currentProxy);
        console.log(`[ProductDiscoveryHTTP] Using proxy: ${currentProxy.host}:${currentProxy.port}`);
      }
    }

    // Parse the proxy URL once; it feeds both the launch args and page auth.
    const parsedProxy = proxyUrl ? new URL(proxyUrl) : null;

    // Build browser args
    const browserArgs = ['--no-sandbox', '--disable-setuid-sandbox'];
    if (parsedProxy) {
      // Only host:port goes on the command line; credentials are supplied
      // separately through page.authenticate below.
      browserArgs.push(`--proxy-server=${parsedProxy.host}`);
    }

    browser = await puppeteer.launch({
      headless: 'new',
      args: browserArgs,
    });

    const page = await browser.newPage();

    // Setup proxy auth if needed
    if (parsedProxy && parsedProxy.username && parsedProxy.password) {
      await page.authenticate({
        username: decodeURIComponent(parsedProxy.username),
        password: decodeURIComponent(parsedProxy.password),
      });
    }

    await ctx.heartbeat();

    // ============================================================
    // STEP 3: Establish session by visiting embedded menu
    // ============================================================
    updateStep('navigating', `Loading menu page`);
    const embedUrl = `https://dutchie.com/embedded-menu/${cName}?menuType=rec`;
    console.log(`[ProductDiscoveryHTTP] Establishing session at ${embedUrl}...`);

    await page.goto(embedUrl, {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });

    // ============================================================
    // STEP 3b: Detect and dismiss age gate modal
    // ============================================================
    await dismissAgeGate(page);

    console.log(`[ProductDiscoveryHTTP] Session established, fetching products...`);

    await ctx.heartbeat();

    // ============================================================
    // STEP 4: Fetch ALL products via GraphQL from browser context
    // ============================================================
    updateStep('fetching', `Executing GraphQL query`);
    // The callback is executed inside the page, so it must be self-contained:
    // everything it needs is passed in as arguments or declared within it.
    const result = await page.evaluate(async (platformId: string, graphqlHash: string) => {
      const allProducts: any[] = [];
      const logs: string[] = [];
      let pageNum = 0;
      const perPage = 100;
      let totalCount = 0;
      const sessionId = 'browser-session-' + Date.now();

      try {
        while (pageNum < 30) { // Max 30 pages = 3000 products
          const variables = {
            includeEnterpriseSpecials: false,
            productsFilter: {
              dispensaryId: platformId,
              pricingType: 'rec',
              Status: 'Active', // CRITICAL: Must be 'Active', not null
              types: [],
              useCache: true,
              isDefaultSort: true,
              sortBy: 'popularSortIdx',
              sortDirection: 1,
              bypassOnlineThresholds: true,
              isKioskMenu: false,
              removeProductsBelowOptionThresholds: false,
            },
            page: pageNum,
            perPage: perPage,
          };

          const extensions = {
            persistedQuery: {
              version: 1,
              sha256Hash: graphqlHash,
            },
          };

          // Build GET URL like the browser does
          const qs = new URLSearchParams({
            operationName: 'FilteredProducts',
            variables: JSON.stringify(variables),
            extensions: JSON.stringify(extensions),
          });
          const url = `https://dutchie.com/api-3/graphql?${qs.toString()}`;

          const response = await fetch(url, {
            method: 'GET',
            headers: {
              'Accept': 'application/json',
              'content-type': 'application/json',
              'x-dutchie-session': sessionId,
              'apollographql-client-name': 'Marketplace (production)',
            },
            credentials: 'include',
          });

          logs.push(`Page ${pageNum}: HTTP ${response.status}`);

          if (!response.ok) {
            const text = await response.text();
            logs.push(`HTTP error: ${response.status} - ${text.slice(0, 200)}`);
            break;
          }

          const json = await response.json();

          if (json.errors) {
            logs.push(`GraphQL error: ${JSON.stringify(json.errors).slice(0, 200)}`);
            break;
          }

          const data = json?.data?.filteredProducts;
          if (!data || !data.products) {
            logs.push('No products in response');
            break;
          }

          const products = data.products;
          allProducts.push(...products);

          if (pageNum === 0) {
            totalCount = data.queryInfo?.totalCount || 0;
            logs.push(`Total reported: ${totalCount}`);
          }

          logs.push(`Got ${products.length} products (total: ${allProducts.length}/${totalCount})`);

          // Only trust the reported total when it is positive. A missing or
          // zero total used to satisfy `length >= totalCount` immediately and
          // ended pagination after the first page; in that case keep going
          // until a short page signals the end.
          const reachedReportedTotal = totalCount > 0 && allProducts.length >= totalCount;
          if (reachedReportedTotal || products.length < perPage) {
            break;
          }

          pageNum++;

          // Small delay between pages to be polite
          await new Promise(r => setTimeout(r, 200));
        }
      } catch (err: any) {
        logs.push(`Error: ${err.message}`);
      }

      return { products: allProducts, totalCount, logs };
    }, platformId, FILTERED_PRODUCTS_HASH);

    // Print logs from browser context
    result.logs.forEach((log: string) => console.log(`[Browser] ${log}`));

    console.log(`[ProductDiscoveryHTTP] Fetched ${result.products.length} products (API reported ${result.totalCount})`);

    // Make an aborted or capped pagination visible instead of passing silently.
    // NOTE(review): a partial payload is still saved and queued below, as
    // before; confirm product_refresh tolerates incomplete catalogues.
    if (result.totalCount > 0 && result.products.length < result.totalCount) {
      console.warn(`[ProductDiscoveryHTTP] Partial fetch for dispensary ${dispensaryId}: got ${result.products.length} of ${result.totalCount} reported products`);
    }

    // The browser is no longer needed; release it before the DB/filesystem work.
    await browser.close();
    browser = null;

    if (result.products.length === 0) {
      return {
        success: false,
        error: 'No products returned from GraphQL',
        productsProcessed: 0,
      };
    }

    await ctx.heartbeat();

    // ============================================================
    // STEP 5: Save raw payload to filesystem
    // ============================================================
    updateStep('saving', `Saving ${result.products.length} products`);
    const rawPayload = {
      dispensaryId,
      platformId,
      cName,
      fetchedAt: new Date().toISOString(),
      productCount: result.products.length,
      products: result.products,
    };

    const payloadResult = await saveRawPayload(
      pool,
      dispensaryId,
      rawPayload,
      null, // crawl_run_id - not using crawl_runs in new system
      result.products.length
    );

    console.log(`[ProductDiscoveryHTTP] Saved payload #${payloadResult.id} (${(payloadResult.sizeBytes / 1024).toFixed(1)}KB)`);

    // ============================================================
    // STEP 6: Update dispensary last_fetch_at and tracking
    // ============================================================
    await pool.query(`
      UPDATE dispensaries
      SET last_fetch_at = NOW(),
          last_modified_at = NOW(),
          last_modified_by_task = $2,
          last_modified_task_id = $3
      WHERE id = $1
    `, [dispensaryId, task.role, task.id]);

    // ============================================================
    // STEP 7: Queue product_refresh task to process the payload
    // ============================================================
    await taskService.createTask({
      role: 'product_refresh',
      dispensary_id: dispensaryId,
      priority: task.priority ?? 0,
      payload: { payload_id: payloadResult.id },
    });

    console.log(`[ProductDiscoveryHTTP] Queued product_refresh task for payload #${payloadResult.id}`);

    return {
      success: true,
      payloadId: payloadResult.id,
      productCount: result.products.length,
      sizeBytes: payloadResult.sizeBytes,
    };

  } catch (error: unknown) {
    const errorMessage = error instanceof Error ? error.message : 'Unknown error';
    console.error(`[ProductDiscoveryHTTP] Error for dispensary ${dispensaryId}:`, errorMessage);
    return {
      success: false,
      error: errorMessage,
    };
  } finally {
    // Still set only if we bailed out before the explicit close above.
    if (browser) {
      await browser.close().catch(() => {});
    }
  }
}
|