/**
 * Product Discovery Handler - Dutchie Platform
 *
 * Uses Puppeteer + StealthPlugin to fetch products via browser context.
 * Based on test-intercept.js pattern from ORGANIC_SCRAPING_GUIDE.md.
 *
 * Naming convention: {task}-{platform}.ts
 *
 * This handler:
 * 1. Loads dispensary info
 * 2. Launches headless browser with proxy (if provided)
 * 3. Establishes session by visiting embedded menu (and dismissing the age gate)
 * 4. Fetches ALL products via GraphQL from browser context
 * 5. Archives the raw payload to Wasabi S3 (best effort) and saves a daily
 *    baseline through saveDailyBaseline (which may decline and return null)
 * 6. Saves inventory snapshots and detects visibility events
 * 7. Updates the dispensary's last_fetch_at / last_modified_* columns
 * 8. Queues a product_refresh task when a baseline payload was saved
 * 9. Records a stage checkpoint (success or failure counters)
 *
 * Why browser-based:
 * - Works with session-based residential proxies (Evomi)
 * - Lower detection risk than curl/axios
 * - Real Chrome TLS fingerprint
 */
import { TaskContext, TaskResult } from '../task-worker';
import { saveDailyBaseline } from '../../utils/payload-storage';
import { taskService } from '../task-service';
import { saveInventorySnapshots } from '../../services/inventory-snapshots';
import { detectVisibilityEvents } from '../../services/visibility-events';
import { storePayload as storeWasabiPayload, checkConnection as checkWasabiConnection } from '../../services/wasabi-storage';

// GraphQL hash for FilteredProducts query - MUST match CLAUDE.md
const FILTERED_PRODUCTS_HASH = 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0';

// Domains to block - analytics, tracking, feature flags (not needed for GraphQL)
const BLOCKED_DOMAINS = [
  'googletagmanager.com',
  'google-analytics.com',
  'launchdarkly.com',
  'assets2.dutchie.com', // CDN assets - we only need GraphQL
  'sentry.io',
  'segment.io',
  'segment.com',
  'amplitude.com',
  'mixpanel.com',
  'hotjar.com',
  'fullstory.com',
];

// Resource types that are aborted to save proxy bandwidth
const BLOCKED_RESOURCE_TYPES = ['image', 'font', 'media', 'stylesheet'];

// Candidate selectors for the age gate confirmation button, tried in order.
// NOTE(review): the `:has-text(...)` entries look like Playwright selector syntax;
// if Puppeteer rejects them the per-selector try/catch below simply skips them
// and the text-matching fallback covers the same buttons - confirm and prune.
const AGE_GATE_SELECTORS = [
  'button[data-testid="age-gate-submit"]',
  'button:has-text("Yes")',
  'button:has-text("I am 21")',
  'button:has-text("Enter")',
  '[class*="age-gate"] button',
  '[class*="AgeGate"] button',
  '[data-test="age-gate-button"]',
];

/** Resolves after `ms` milliseconds. Used instead of `page.waitForTimeout`, which newer Puppeteer releases no longer provide. */
function delay(ms: number): Promise<void> {
  return new Promise(resolve => setTimeout(resolve, ms));
}

/**
 * Best-effort dismissal of the age gate modal on the embedded menu page.
 *
 * Tries each selector in AGE_GATE_SELECTORS; only when none of them produced a
 * click does it fall back to clicking the first button whose text contains
 * "yes", "enter" or "21". Never throws: any failure is logged and ignored,
 * because the age gate may simply not be present.
 */
async function dismissAgeGate(page: any): Promise<void> {
  try {
    // Wait a bit for age gate to appear
    await delay(1500);

    let dismissed = false;
    for (const selector of AGE_GATE_SELECTORS) {
      try {
        const button = await page.$(selector);
        if (!button) {
          continue;
        }
        await button.click();
        console.log(`[ProductDiscoveryHTTP] Age gate dismissed via: ${selector}`);
        await delay(1000); // Wait for modal to close
        dismissed = true;
        break;
      } catch {
        // Selector not found / not supported / click failed - try next
      }
    }

    // Text-based fallback. Skipped once a selector already worked so that we do
    // not click an unrelated button (e.g. one containing "enter") on the menu.
    if (!dismissed) {
      await page.evaluate(() => {
        const buttons = Array.from(document.querySelectorAll('button'));
        for (const btn of buttons) {
          const text = btn.textContent?.toLowerCase() || '';
          if (text.includes('yes') || text.includes('enter') || text.includes('21')) {
            (btn as HTMLButtonElement).click();
            return true;
          }
        }
        return false;
      });
    }
  } catch {
    // Age gate might not be present, continue
    console.log(`[ProductDiscoveryHTTP] No age gate detected or already dismissed`);
  }
}

/**
 * Runs a product_discovery task for one Dutchie dispensary.
 *
 * Launches a stealth Puppeteer browser (through the current CrawlRotator proxy
 * when one is available), opens the embedded menu to establish a session, pages
 * through the FilteredProducts persisted GraphQL query from inside the page,
 * then archives/saves the payload, records inventory snapshots and visibility
 * events, updates dispensary tracking columns and, when a baseline payload was
 * saved, queues a product_refresh task.
 *
 * @param ctx Task context; this handler reads `pool`, `task`, `crawlRotator`,
 *   `updateStep`, `heartbeat` and the optional `fingerprint` (timezone/locale).
 * @returns `{ success: true, ... }` with payload/product/snapshot details, or
 *   `{ success: false, error }`. Exceptions thrown inside the handler are caught,
 *   counted against the dispensary's `consecutive_failures`, and reported in the
 *   returned result rather than rethrown.
 */
export async function handleProductDiscoveryDutchie(ctx: TaskContext): Promise<TaskResult> {
  const { pool, task, crawlRotator, updateStep } = ctx;
  const dispensaryId = task.dispensary_id;

  if (!dispensaryId) {
    return { success: false, error: 'No dispensary_id specified for product_discovery task' };
  }

  let browser: any = null;

  try {
    // ============================================================
    // STEP 1: Load dispensary info
    // ============================================================
    updateStep('loading', 'Loading dispensary info');
    const dispResult = await pool.query(
      `
      SELECT id, name, platform_dispensary_id, menu_url, menu_type, city, state
      FROM dispensaries
      WHERE id = $1 AND crawl_enabled = true
      `,
      [dispensaryId]
    );

    if (dispResult.rows.length === 0) {
      return { success: false, error: `Dispensary ${dispensaryId} not found or not crawl_enabled` };
    }

    const dispensary = dispResult.rows[0];
    const platformId = dispensary.platform_dispensary_id;

    if (!platformId) {
      return { success: false, error: `Dispensary ${dispensaryId} has no platform_dispensary_id` };
    }

    // Extract cName from menu_url (falls back to the literal 'dispensary')
    const cNameMatch = dispensary.menu_url?.match(/\/(?:embedded-menu|dispensary)\/([^/?]+)/);
    const cName = cNameMatch ? cNameMatch[1] : 'dispensary';

    console.log(`[ProductDiscoveryHTTP] Starting for ${dispensary.name} (ID: ${dispensaryId})`);
    console.log(`[ProductDiscoveryHTTP] Platform ID: ${platformId}, cName: ${cName}`);

    await ctx.heartbeat();

    // ============================================================
    // STEP 2: Setup Puppeteer with proxy
    // ============================================================
    updateStep('preflight', `Launching browser for ${dispensary.name}`);

    const puppeteer = require('puppeteer-extra');
    const StealthPlugin = require('puppeteer-extra-plugin-stealth');
    // NOTE(review): use() runs on every task invocation; confirm puppeteer-extra
    // tolerates repeated registration of the same plugin.
    puppeteer.use(StealthPlugin());

    // Get proxy from CrawlRotator if available
    let proxyUrl: string | null = null;
    if (crawlRotator) {
      const currentProxy = crawlRotator.proxy.getCurrent();
      if (currentProxy) {
        proxyUrl = crawlRotator.proxy.getProxyUrl(currentProxy);
        console.log(`[ProductDiscoveryHTTP] Using proxy: ${currentProxy.host}:${currentProxy.port}`);
      }
    }

    // Parse once; used for both the --proxy-server flag and page authentication.
    const parsedProxy = proxyUrl ? new URL(proxyUrl) : null;

    // Build browser args
    const browserArgs = ['--no-sandbox', '--disable-setuid-sandbox'];
    if (parsedProxy) {
      browserArgs.push(`--proxy-server=${parsedProxy.host}`);
    }

    browser = await puppeteer.launch({
      headless: 'new',
      args: browserArgs,
    });

    const page = await browser.newPage();

    // Block unnecessary resources to save bandwidth
    // We only need HTML/JS for session, then GraphQL JSON
    await page.setRequestInterception(true);

    page.on('request', (request: any) => {
      const url: string = request.url();
      const resourceType: string = request.resourceType();

      const blocked =
        BLOCKED_DOMAINS.some(domain => url.includes(domain)) ||
        BLOCKED_RESOURCE_TYPES.includes(resourceType);

      // abort()/continue() return promises; attach a catch so a request that was
      // already handled or torn down cannot surface as an unhandled rejection.
      Promise.resolve(blocked ? request.abort() : request.continue()).catch(() => {});
    });

    // Setup proxy auth if needed
    if (parsedProxy && parsedProxy.username && parsedProxy.password) {
      await page.authenticate({
        username: decodeURIComponent(parsedProxy.username),
        password: decodeURIComponent(parsedProxy.password),
      });
    }

    // ============================================================
    // STEP 2b: Apply stored fingerprint (timezone, locale)
    // CRITICAL: Must match the IP's geographic location
    // ============================================================
    if (ctx.fingerprint?.timezone) {
      try {
        const client = await page.target().createCDPSession();
        await client.send('Emulation.setTimezoneOverride', {
          timezoneId: ctx.fingerprint.timezone,
        });
        console.log(`[ProductDiscoveryHTTP] Browser timezone set to: ${ctx.fingerprint.timezone}`);
      } catch (tzErr: unknown) {
        const tzMessage = tzErr instanceof Error ? tzErr.message : String(tzErr);
        console.warn(`[ProductDiscoveryHTTP] Failed to set timezone: ${tzMessage}`);
      }
    }

    // Set locale to match proxy region (en-US for US proxies)
    if (ctx.fingerprint?.locale) {
      await page.setExtraHTTPHeaders({
        'Accept-Language': `${ctx.fingerprint.locale},en;q=0.9`,
      });
      console.log(`[ProductDiscoveryHTTP] Accept-Language set to: ${ctx.fingerprint.locale}`);
    }

    await ctx.heartbeat();

    // ============================================================
    // STEP 3: Establish session by visiting embedded menu
    // ============================================================
    updateStep('navigating', `Loading menu page`);
    const embedUrl = `https://dutchie.com/embedded-menu/${cName}?menuType=rec`;
    console.log(`[ProductDiscoveryHTTP] Establishing session at ${embedUrl}...`);

    await page.goto(embedUrl, {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });

    // ============================================================
    // STEP 3b: Detect and dismiss age gate modal
    // ============================================================
    await dismissAgeGate(page);

    console.log(`[ProductDiscoveryHTTP] Session established, fetching products...`);

    await ctx.heartbeat();

    // ============================================================
    // STEP 4: Fetch ALL products via GraphQL from browser context
    // ============================================================
    updateStep('fetching', `Executing GraphQL query`);

    // The callback is serialized and executed inside the page, so it must be
    // self-contained: everything it needs is passed in as arguments.
    const result = await page.evaluate(
      async (dispensaryPlatformId: string, persistedQueryHash: string) => {
        const allProducts: any[] = [];
        const logs: string[] = [];
        let pageNum = 0;
        const perPage = 100;
        let totalCount = 0;
        const sessionId = 'browser-session-' + Date.now();

        try {
          while (pageNum < 30) {
            // Max 30 pages = 3000 products
            const variables = {
              includeEnterpriseSpecials: true, // Include BOGO/sale special names in product data
              productsFilter: {
                dispensaryId: dispensaryPlatformId,
                pricingType: 'rec',
                Status: 'All', // 'All' = Active + Inactive products for sellout tracking
                types: [],
                useCache: true,
                isDefaultSort: true,
                sortBy: 'popularSortIdx',
                sortDirection: 1,
                bypassOnlineThresholds: true,
                isKioskMenu: false,
                removeProductsBelowOptionThresholds: false,
              },
              page: pageNum,
              perPage: perPage,
            };

            const extensions = {
              persistedQuery: {
                version: 1,
                sha256Hash: persistedQueryHash,
              },
            };

            // Build GET URL like the browser does
            const qs = new URLSearchParams({
              operationName: 'FilteredProducts',
              variables: JSON.stringify(variables),
              extensions: JSON.stringify(extensions),
            });
            const url = `https://dutchie.com/api-3/graphql?${qs.toString()}`;

            const response = await fetch(url, {
              method: 'GET',
              headers: {
                'Accept': 'application/json',
                'content-type': 'application/json',
                'x-dutchie-session': sessionId,
                'apollographql-client-name': 'Marketplace (production)',
              },
              credentials: 'include',
            });

            logs.push(`Page ${pageNum}: HTTP ${response.status}`);

            if (!response.ok) {
              const text = await response.text();
              logs.push(`HTTP error: ${response.status} - ${text.slice(0, 200)}`);
              break;
            }

            const json = await response.json();

            if (json.errors) {
              logs.push(`GraphQL error: ${JSON.stringify(json.errors).slice(0, 200)}`);
              break;
            }

            const data = json?.data?.filteredProducts;
            if (!data || !data.products) {
              logs.push('No products in response');
              break;
            }

            const products = data.products;
            allProducts.push(...products);

            if (pageNum === 0) {
              totalCount = data.queryInfo?.totalCount || 0;
              logs.push(`Total reported: ${totalCount}`);
            }

            logs.push(`Got ${products.length} products (total: ${allProducts.length}/${totalCount})`);

            // Only trust the reported total when the API actually supplied one;
            // with a missing/zero total we keep paging until a short page arrives.
            const reachedReportedTotal = totalCount > 0 && allProducts.length >= totalCount;
            if (reachedReportedTotal || products.length < perPage) {
              break;
            }

            pageNum++;

            // Small delay between pages to be polite
            await new Promise(r => setTimeout(r, 200));
          }
        } catch (err: any) {
          logs.push(`Error: ${err.message}`);
        }

        return { products: allProducts, totalCount, logs };
      },
      platformId,
      FILTERED_PRODUCTS_HASH
    );

    // Print logs from browser context
    result.logs.forEach((log: string) => console.log(`[Browser] ${log}`));

    console.log(`[ProductDiscoveryHTTP] Fetched ${result.products.length} products (API reported ${result.totalCount})`);

    // Pagination stops early on HTTP/GraphQL errors and at the 30-page cap, in
    // which case the steps below receive a partial product list. Make that visible.
    if (result.totalCount > 0 && result.products.length < result.totalCount) {
      console.warn(`[ProductDiscoveryHTTP] Incomplete fetch: got ${result.products.length} of ${result.totalCount} reported products`);
    }

    // The browser is no longer needed; release it before the storage work.
    await browser.close();
    browser = null;

    if (result.products.length === 0) {
      return {
        success: false,
        error: 'No products returned from GraphQL',
        productsProcessed: 0,
      };
    }

    await ctx.heartbeat();

    // ============================================================
    // STEP 5: Archive raw payload to Wasabi S3 (long-term storage)
    // Every crawl is archived for potential reprocessing
    // ============================================================
    updateStep('saving', `Saving ${result.products.length} products`);

    const rawPayload = {
      dispensaryId,
      platformId,
      cName,
      fetchedAt: new Date().toISOString(),
      productCount: result.products.length,
      products: result.products,
    };

    // Archive to Wasabi S3 (if configured)
    let wasabiPath: string | null = null;
    try {
      const wasabiResult = await storeWasabiPayload(
        dispensaryId,
        dispensary.state || 'XX',
        'dutchie',
        rawPayload,
        {
          taskId: String(task.id),
          cName,
          productCount: String(result.products.length),
        }
      );
      wasabiPath = wasabiResult.path;
      const compressionRatio = Math.round((1 - wasabiResult.compressedBytes / wasabiResult.sizeBytes) * 100);
      console.log(`[ProductDiscoveryHTTP] Archived to Wasabi: ${wasabiPath} (${(wasabiResult.compressedBytes / 1024).toFixed(1)}KB, ${compressionRatio}% compression)`);
    } catch (wasabiErr: unknown) {
      // Wasabi archival is optional - don't fail the task if it fails
      const wasabiMessage = wasabiErr instanceof Error ? wasabiErr.message : String(wasabiErr);
      if (wasabiMessage.includes('not configured')) {
        console.log(`[ProductDiscoveryHTTP] Wasabi not configured, skipping archive`);
      } else {
        console.warn(`[ProductDiscoveryHTTP] Wasabi archive failed: ${wasabiMessage}`);
      }
    }

    // ============================================================
    // STEP 5b: Save daily baseline to PostgreSQL (if in window)
    // Daily baselines are saved once per day per store (12:01 AM - 3:00 AM)
    // Outside this window, only inventory snapshots are saved (Step 5.5)
    // ============================================================

    // saveDailyBaseline returns null if outside window or baseline already exists today
    const payloadResult = await saveDailyBaseline(
      pool,
      dispensaryId,
      rawPayload,
      null, // crawl_run_id - not using crawl_runs in new system
      result.products.length,
      'dutchie',
      task.id // task ID for traceability
    );

    if (payloadResult) {
      console.log(`[ProductDiscoveryHTTP] Saved daily baseline #${payloadResult.id} (${(payloadResult.sizeBytes / 1024).toFixed(1)}KB)`);
    } else {
      console.log(`[ProductDiscoveryHTTP] Skipped PostgreSQL baseline (outside window or already exists)`);
    }

    // ============================================================
    // STEP 5.5: Save inventory snapshots and detect visibility events
    // ============================================================
    const snapshotCount = await saveInventorySnapshots(pool, dispensaryId, result.products, 'dutchie');
    const eventCount = await detectVisibilityEvents(pool, dispensaryId, result.products, 'dutchie');
    console.log(`[ProductDiscoveryHTTP] Saved ${snapshotCount} inventory snapshots, detected ${eventCount} visibility events`);

    // ============================================================
    // STEP 6: Update dispensary last_fetch_at and tracking
    // ============================================================
    await pool.query(
      `
      UPDATE dispensaries
      SET last_fetch_at = NOW(),
          last_modified_at = NOW(),
          last_modified_by_task = $2,
          last_modified_task_id = $3
      WHERE id = $1
      `,
      [dispensaryId, task.role, task.id]
    );

    // ============================================================
    // STEP 7: Queue product_refresh task to process the payload
    // Only queue if a baseline payload was saved (need payload_id)
    // ============================================================
    if (payloadResult) {
      await taskService.createTask({
        role: 'product_refresh',
        dispensary_id: dispensaryId,
        priority: task.priority ?? 0,
        method: 'http', // Browser-only transport
        payload: { payload_id: payloadResult.id },
      });
      console.log(`[ProductDiscoveryHTTP] Queued product_refresh task for payload #${payloadResult.id}`);
    } else {
      console.log(`[ProductDiscoveryHTTP] Skipped product_refresh (no payload saved)`);
    }

    // ============================================================
    // STEP 8: Stage checkpoint - observational update
    // Discovery success → hydrating (awaiting product_refresh completion)
    // ============================================================
    await pool.query(
      `
      UPDATE dispensaries
      SET stage = CASE
            WHEN stage IN ('promoted', 'sandbox') THEN 'hydrating'
            WHEN stage = 'failing' THEN 'hydrating'
            ELSE stage
          END,
          consecutive_successes = COALESCE(consecutive_successes, 0) + 1,
          consecutive_failures = 0
      WHERE id = $1
      `,
      [dispensaryId]
    );
    console.log(`[ProductDiscoveryHTTP] Stage checkpoint: hydrating`);

    return {
      success: true,
      payloadId: payloadResult?.id ?? null,
      productCount: result.products.length,
      sizeBytes: payloadResult?.sizeBytes ?? 0,
      baselineSaved: !!payloadResult,
      wasabiPath,
      snapshotCount,
      eventCount,
    };
  } catch (error: unknown) {
    const errorMessage = error instanceof Error ? error.message : 'Unknown error';
    console.error(`[ProductDiscoveryHTTP] Error for dispensary ${dispensaryId}:`, errorMessage);

    // Stage checkpoint - track failures
    // After 3+ consecutive failures, stage transitions to 'failing'
    try {
      const failureResult = await pool.query(
        `
        UPDATE dispensaries
        SET consecutive_failures = COALESCE(consecutive_failures, 0) + 1,
            consecutive_successes = 0,
            stage = CASE
              WHEN COALESCE(consecutive_failures, 0) + 1 >= 3 THEN 'failing'
              ELSE stage
            END
        WHERE id = $1
        RETURNING consecutive_failures, stage
        `,
        [dispensaryId]
      );

      if (failureResult.rows[0]) {
        const { consecutive_failures, stage } = failureResult.rows[0];
        console.log(`[ProductDiscoveryHTTP] Failure tracked: ${consecutive_failures} consecutive failures, stage: ${stage}`);
      }
    } catch (trackError) {
      // Don't let tracking errors mask the original error
      console.error(`[ProductDiscoveryHTTP] Failed to track failure:`, trackError);
    }

    return {
      success: false,
      error: errorMessage,
    };
  } finally {
    // Covers every exit path on which the browser was launched but not yet closed.
    if (browser) {
      await browser.close().catch(() => {});
    }
  }
}