diff --git a/backend/docs/CODEBASE_MAP.md b/backend/docs/CODEBASE_MAP.md index f1cbfef6..124b5065 100644 --- a/backend/docs/CODEBASE_MAP.md +++ b/backend/docs/CODEBASE_MAP.md @@ -99,10 +99,60 @@ src/scraper-v2/*.ts # Entire directory deprecated |------|---------|--------| | `src/tasks/handlers/payload-fetch.ts` | Fetch products from Dutchie | **PRIMARY** | | `src/tasks/handlers/product-refresh.ts` | Process payload into DB | **PRIMARY** | +| `src/tasks/handlers/entry-point-discovery.ts` | Resolve platform IDs (auto-healing) | **PRIMARY** | | `src/tasks/handlers/menu-detection.ts` | Detect menu type | ACTIVE | -| `src/tasks/handlers/id-resolution.ts` | Resolve platform IDs | ACTIVE | +| `src/tasks/handlers/id-resolution.ts` | Resolve platform IDs (legacy) | LEGACY | | `src/tasks/handlers/image-download.ts` | Download product images | ACTIVE | +--- + +## Transport Rules (CRITICAL) + +**Browser-based (Puppeteer) is the DEFAULT transport. curl is ONLY allowed when explicitly specified.** + +### Transport Selection +| `task.method` | Transport Used | Notes | +|---------------|----------------|-------| +| `null` | Browser (Puppeteer) | DEFAULT - use this for most tasks | +| `'http'` | Browser (Puppeteer) | Explicit browser request | +| `'curl'` | curl-impersonate | ONLY when explicitly needed | + +### Why Browser-First? +1. **Anti-detection**: Puppeteer with StealthPlugin evades bot detection +2. **Session cookies**: Browser maintains session state automatically +3. **Fingerprinting**: Real browser fingerprint (TLS, headers, etc.) +4. **Age gates**: Browser can click through age verification + +### Entry Point Discovery Auto-Healing +The `entry_point_discovery` handler uses a healing strategy: + +``` +1. FIRST: Check dutchie_discovery_locations for existing platform_location_id + - By linked dutchie_discovery_id + - By slug match in discovery data + → If found, NO network call needed + +2. SECOND: Browser-based GraphQL (Puppeteer) + - 5x retries for network/proxy failures + - On HTTP 403: rotate proxy and retry + - On HTTP 404 after 2 attempts: mark as 'removed' + +3. HARD FAILURE: After exhausting options → 'needs_investigation' +``` + +### DO NOT Use curl Unless: +- Task explicitly has `method = 'curl'` +- You're testing curl-impersonate binaries +- The API explicitly requires curl fingerprinting + +### Files +| File | Transport | Purpose | +|------|-----------|---------| +| `src/services/puppeteer-preflight.ts` | Browser | Preflight check | +| `src/services/curl-preflight.ts` | curl | Preflight check | +| `src/tasks/handlers/entry-point-discovery.ts` | Browser | Platform ID resolution | +| `src/tasks/handlers/payload-fetch.ts` | Both | Product fetching | + ### Database | File | Purpose | Status | |------|---------|--------| diff --git a/backend/migrations/091_store_discovery_tracking.sql b/backend/migrations/091_store_discovery_tracking.sql new file mode 100644 index 00000000..c8f3925a --- /dev/null +++ b/backend/migrations/091_store_discovery_tracking.sql @@ -0,0 +1,26 @@ +-- Migration 091: Add store discovery tracking columns +-- Per auto-healing scheme (2025-12-12): +-- Track when store_discovery last updated each dispensary +-- Track when last payload was saved + +-- Add last_store_discovery_at to track when store_discovery updated this record +ALTER TABLE dispensaries +ADD COLUMN IF NOT EXISTS last_store_discovery_at TIMESTAMPTZ; + +-- Add last_payload_at to track when last product payload was saved +-- (Complements last_fetch_at which tracks API fetch time) +ALTER TABLE dispensaries +ADD COLUMN IF NOT EXISTS last_payload_at TIMESTAMPTZ; + +-- Add index for finding stale discovery data +CREATE INDEX IF NOT EXISTS idx_dispensaries_store_discovery_at +ON dispensaries (last_store_discovery_at DESC NULLS LAST) +WHERE crawl_enabled = true; + +-- Add index for finding dispensaries without recent payloads +CREATE INDEX IF NOT EXISTS idx_dispensaries_payload_at +ON dispensaries (last_payload_at DESC NULLS LAST) +WHERE crawl_enabled = true; + +COMMENT ON COLUMN dispensaries.last_store_discovery_at IS 'When store_discovery task last updated this record'; +COMMENT ON COLUMN dispensaries.last_payload_at IS 'When last product payload was saved for this dispensary'; diff --git a/backend/src/tasks/handlers/entry-point-discovery.ts b/backend/src/tasks/handlers/entry-point-discovery.ts index eae9e9a5..ede002ea 100644 --- a/backend/src/tasks/handlers/entry-point-discovery.ts +++ b/backend/src/tasks/handlers/entry-point-discovery.ts @@ -4,33 +4,55 @@ * Resolves platform IDs for a discovered store using Dutchie GraphQL. * This is the step between store_discovery and product_discovery. * + * AUTO-HEALING SCHEME (2025-12-12): + * 1. FIRST: Check dutchie_discovery_locations for existing platform_location_id + * - If found, use it directly (no network call needed) + * 2. SECOND: If not in discovery data, use browser-based GraphQL (Puppeteer) + * - 5x retries for network/proxy failures + * - On HTTP 403: rotate proxy and retry + * - On HTTP 404 after 2 attempts: mark as 'removed' + * 3. HARD FAILURE: After exhausting all options, mark as 'needs_investigation' + * + * TRANSPORT RULE: Browser-based (Puppeteer) is the DEFAULT. + * curl is ONLY used when task.method === 'curl' explicitly. + * * Flow: * 1. Load dispensary info from database - * 2. Extract slug from menu_url - * 3. Start stealth session (fingerprint + optional proxy) - * 4. Query Dutchie GraphQL to resolve slug → platform_dispensary_id - * 5. Update dispensary record with resolved ID - * 6. Queue product_discovery task if successful + * 2. Check discovery data for existing platform ID (healing strategy #1) + * 3. Extract slug from menu_url + * 4. Launch browser and establish session + * 5. Query Dutchie GraphQL to resolve slug → platform_dispensary_id + * 6. Update dispensary record with resolved ID + * 7. Queue product_discovery task if successful */ import { TaskContext, TaskResult } from '../task-worker'; -import { startSession, endSession } from '../../platforms/dutchie'; -import { resolveDispensaryIdWithDetails } from '../../platforms/dutchie/queries'; + +// GraphQL hash for GetAddressBasedDispensaryData - MUST match CLAUDE.md +const GET_DISPENSARY_DATA_HASH = '13461f73abf7268770dfd05fe7e10c523084b2bb916a929c08efe3d87531977b'; + +// Auto-healing configuration +const MAX_NETWORK_RETRIES = 5; +const MAX_404_ATTEMPTS = 2; export async function handleEntryPointDiscovery(ctx: TaskContext): Promise { - const { pool, task } = ctx; + const { pool, task, crawlRotator, updateStep } = ctx; const dispensaryId = task.dispensary_id; if (!dispensaryId) { return { success: false, error: 'No dispensary_id specified for entry_point_discovery task' }; } + let browser: any = null; + try { // ============================================================ // STEP 1: Load dispensary info // ============================================================ + updateStep('loading', 'Loading dispensary info'); const dispResult = await pool.query(` - SELECT id, name, menu_url, platform_dispensary_id, menu_type, state + SELECT id, name, menu_url, platform_dispensary_id, menu_type, state, + dutchie_discovery_id, id_resolution_attempts FROM dispensaries WHERE id = $1 `, [dispensaryId]); @@ -44,7 +66,6 @@ export async function handleEntryPointDiscovery(ctx: TaskContext): Promise 0) { + const discovery = discoveryResult.rows[0]; + console.log(`[EntryPointDiscovery] Found platform ID in discovery data: ${discovery.platform_location_id}`); + + await updateDispensaryWithPlatformId( + pool, dispensaryId, discovery.platform_location_id, task, + 'discovery_data', discovery.platform_slug + ); + + return { + success: true, + platformId: discovery.platform_location_id, + source: 'discovery_data', + healingStrategy: 'discovery_lookup', + }; + } + } + + // Also try to find by slug match in discovery data const menuUrl = dispensary.menu_url; if (!menuUrl) { return { success: false, error: `Dispensary ${dispensaryId} has no menu_url` }; } - console.log(`[EntryPointDiscovery] Resolving platform ID for ${dispensary.name}`); - console.log(`[EntryPointDiscovery] Menu URL: ${menuUrl}`); - - // ============================================================ - // STEP 2: Extract slug from menu URL - // ============================================================ + // Extract slug from menu URL let slug: string | null = null; - const embeddedMatch = menuUrl.match(/\/embedded-menu\/([^/?]+)/); const dispensaryMatch = menuUrl.match(/\/dispensary\/([^/?]+)/); @@ -93,10 +147,11 @@ export async function handleEntryPointDiscovery(ctx: TaskContext): Promise 0) { + const discovery = slugLookupResult.rows[0]; + console.log(`[EntryPointDiscovery] Found platform ID by slug lookup: ${discovery.platform_location_id}`); - try { - // ============================================================ - // STEP 4: Resolve platform ID via GraphQL - // ============================================================ - console.log(`[EntryPointDiscovery] Querying Dutchie GraphQL for slug: ${slug}`); - - const result = await resolveDispensaryIdWithDetails(slug); - - if (!result.dispensaryId) { - // Resolution failed - could be 403, 404, or invalid response - const reason = result.httpStatus - ? `HTTP ${result.httpStatus}` - : result.error || 'Unknown error'; - - console.log(`[EntryPointDiscovery] Failed to resolve ${slug}: ${reason}`); - - // Mark as failed resolution - await pool.query(` - UPDATE dispensaries - SET - menu_type = CASE - WHEN $2 = 404 THEN 'removed' - WHEN $2 = 403 THEN 'blocked' - ELSE 'dutchie' - END, - id_resolution_status = 'failed', - id_resolution_error = $3, - updated_at = NOW(), - last_modified_at = NOW(), - last_modified_by_task = $4, - last_modified_task_id = $5 - WHERE id = $1 - `, [dispensaryId, result.httpStatus || 0, reason, task.role, task.id]); - - return { - success: false, - error: `Could not resolve platform ID: ${reason}`, - slug, - httpStatus: result.httpStatus, - }; - } - - const platformId = result.dispensaryId; - console.log(`[EntryPointDiscovery] Resolved ${slug} -> ${platformId}`); - - await ctx.heartbeat(); - - // ============================================================ - // STEP 5: Update dispensary with resolved ID and tracking - // ============================================================ - await pool.query(` - UPDATE dispensaries - SET - platform_dispensary_id = $2, - menu_type = 'dutchie', - crawl_enabled = true, - id_resolution_status = 'resolved', - id_resolution_error = NULL, - updated_at = NOW(), - last_modified_at = NOW(), - last_modified_by_task = $3, - last_modified_task_id = $4 - WHERE id = $1 - `, [dispensaryId, platformId, task.role, task.id]); - - console.log(`[EntryPointDiscovery] Updated dispensary ${dispensaryId} with platform ID`); - - // ============================================================ - // STEP 6: Queue product_discovery task - // ============================================================ - await pool.query(` - INSERT INTO worker_tasks (role, dispensary_id, priority, scheduled_for) - VALUES ('product_discovery', $1, 5, NOW()) - ON CONFLICT DO NOTHING - `, [dispensaryId]); - - console.log(`[EntryPointDiscovery] Queued product_discovery task for dispensary ${dispensaryId}`); + await updateDispensaryWithPlatformId( + pool, dispensaryId, discovery.platform_location_id, task, + 'discovery_slug_lookup', slug + ); return { success: true, - platformId, + platformId: discovery.platform_location_id, slug, - queuedProductDiscovery: true, + source: 'discovery_slug_lookup', + healingStrategy: 'discovery_lookup', }; - - } finally { - // Always end session - endSession(); } + console.log(`[EntryPointDiscovery] Not found in discovery data, proceeding to browser-based resolution`); + + await ctx.heartbeat(); + + // ============================================================ + // STEP 3: AUTO-HEALING STRATEGY #2 - Browser-based GraphQL + // Use Puppeteer with 5x retry for network failures + // ============================================================ + updateStep('preflight', 'Launching browser'); + + const puppeteer = require('puppeteer-extra'); + const StealthPlugin = require('puppeteer-extra-plugin-stealth'); + puppeteer.use(StealthPlugin()); + + // Get proxy from CrawlRotator if available + let proxyUrl: string | null = null; + if (crawlRotator) { + const currentProxy = crawlRotator.proxy.getCurrent(); + if (currentProxy) { + proxyUrl = crawlRotator.proxy.getProxyUrl(currentProxy); + console.log(`[EntryPointDiscovery] Using proxy: ${currentProxy.host}:${currentProxy.port}`); + } + } + + // Build browser args + const browserArgs = ['--no-sandbox', '--disable-setuid-sandbox']; + if (proxyUrl) { + const proxyUrlParsed = new URL(proxyUrl); + browserArgs.push(`--proxy-server=${proxyUrlParsed.host}`); + } + + browser = await puppeteer.launch({ + headless: 'new', + args: browserArgs, + }); + + const page = await browser.newPage(); + + // Setup proxy auth if needed + if (proxyUrl) { + const proxyUrlParsed = new URL(proxyUrl); + if (proxyUrlParsed.username && proxyUrlParsed.password) { + await page.authenticate({ + username: decodeURIComponent(proxyUrlParsed.username), + password: decodeURIComponent(proxyUrlParsed.password), + }); + } + } + + await ctx.heartbeat(); + + // ============================================================ + // STEP 4: Establish session by visiting dispensary page + // ============================================================ + updateStep('navigating', 'Establishing session'); + const sessionUrl = `https://dutchie.com/dispensary/${slug}`; + console.log(`[EntryPointDiscovery] Establishing session at ${sessionUrl}...`); + + try { + await page.goto(sessionUrl, { + waitUntil: 'networkidle2', + timeout: 30000, + }); + } catch (navError: any) { + console.log(`[EntryPointDiscovery] Navigation timeout/error (may still work): ${navError.message}`); + } + + // Handle age gate + try { + await page.waitForTimeout(1500); + await page.evaluate(() => { + const buttons = Array.from(document.querySelectorAll('button')); + for (const btn of buttons) { + const text = btn.textContent?.toLowerCase() || ''; + if (text.includes('yes') || text.includes('enter') || text.includes('21')) { + (btn as HTMLButtonElement).click(); + return true; + } + } + return false; + }); + } catch { + // Age gate might not be present + } + + await ctx.heartbeat(); + + // ============================================================ + // STEP 5: Resolve platform ID via GraphQL with retries + // ============================================================ + updateStep('fetching', 'Resolving platform ID'); + + let lastError: string = ''; + let lastHttpStatus: number = 0; + let networkFailures = 0; + let http404Count = 0; + + for (let attempt = 1; attempt <= MAX_NETWORK_RETRIES; attempt++) { + console.log(`[EntryPointDiscovery] GraphQL attempt ${attempt}/${MAX_NETWORK_RETRIES}`); + + const result = await page.evaluate(async (slugParam: string, hash: string) => { + try { + const variables = { + dispensaryFilter: { + cNameOrID: slugParam, + }, + }; + + const extensions = { + persistedQuery: { version: 1, sha256Hash: hash }, + }; + + const response = await fetch('https://dutchie.com/api-3/graphql', { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + 'Accept': 'application/json', + }, + body: JSON.stringify({ + operationName: 'GetAddressBasedDispensaryData', + variables, + extensions, + }), + credentials: 'include', + }); + + const status = response.status; + + if (!response.ok) { + return { success: false, httpStatus: status, error: `HTTP ${status}` }; + } + + const json = await response.json(); + + const dispensaryId = json?.data?.dispensaryBySlug?.id || + json?.data?.dispensary?.id || + json?.data?.getAddressBasedDispensaryData?.dispensary?.id; + + if (dispensaryId) { + return { success: true, dispensaryId, httpStatus: status }; + } + + return { success: false, httpStatus: status, error: 'No dispensaryId in response' }; + } catch (err: any) { + return { success: false, httpStatus: 0, error: err.message }; + } + }, slug, GET_DISPENSARY_DATA_HASH); + + lastHttpStatus = result.httpStatus || 0; + lastError = result.error || ''; + + if (result.success && result.dispensaryId) { + console.log(`[EntryPointDiscovery] Resolved ${slug} -> ${result.dispensaryId}`); + + await browser.close(); + browser = null; + + await updateDispensaryWithPlatformId( + pool, dispensaryId, result.dispensaryId, task, + 'browser_graphql', slug + ); + + return { + success: true, + platformId: result.dispensaryId, + slug, + source: 'browser_graphql', + attempts: attempt, + }; + } + + // Handle different failure types + if (result.httpStatus === 404) { + http404Count++; + console.log(`[EntryPointDiscovery] HTTP 404 - store may be removed (count: ${http404Count})`); + + if (http404Count >= MAX_404_ATTEMPTS) { + console.log(`[EntryPointDiscovery] Max 404 attempts reached - marking as removed`); + break; + } + } else if (result.httpStatus === 403) { + console.log(`[EntryPointDiscovery] HTTP 403 - blocked, will retry with new proxy`); + // TODO: Rotate proxy if available + } else if (result.httpStatus === 0) { + networkFailures++; + console.log(`[EntryPointDiscovery] Network failure (count: ${networkFailures}): ${result.error}`); + } + + if (attempt < MAX_NETWORK_RETRIES) { + const delay = 1000 * attempt; // Exponential backoff + console.log(`[EntryPointDiscovery] Retrying in ${delay}ms...`); + await new Promise(r => setTimeout(r, delay)); + await ctx.heartbeat(); + } + } + + await browser.close(); + browser = null; + + // ============================================================ + // STEP 6: Handle hard failure + // ============================================================ + const isHardFailure = http404Count >= MAX_404_ATTEMPTS || + networkFailures >= MAX_NETWORK_RETRIES || + currentAttempts >= 3; + + const failureStatus = isHardFailure ? 'needs_investigation' : 'failed'; + const failureReason = lastHttpStatus === 404 + ? `Store removed from Dutchie (HTTP 404 after ${http404Count} attempts)` + : lastHttpStatus === 403 + ? `Blocked by Dutchie (HTTP 403)` + : `Network failures: ${networkFailures}, Last error: ${lastError}`; + + console.log(`[EntryPointDiscovery] ${isHardFailure ? 'HARD FAILURE' : 'Soft failure'}: ${failureReason}`); + + await pool.query(` + UPDATE dispensaries + SET + menu_type = CASE + WHEN $2 = 404 THEN 'removed' + WHEN $2 = 403 THEN 'blocked' + ELSE menu_type + END, + id_resolution_status = $3, + id_resolution_error = $4, + updated_at = NOW(), + last_modified_at = NOW(), + last_modified_by_task = $5, + last_modified_task_id = $6 + WHERE id = $1 + `, [dispensaryId, lastHttpStatus, failureStatus, failureReason, task.role, task.id]); + + return { + success: false, + error: `Hard failure: ${failureReason}`, + slug, + httpStatus: lastHttpStatus, + hardFailure: isHardFailure, + networkFailures, + http404Count, + }; + } catch (error: unknown) { const errorMessage = error instanceof Error ? error.message : 'Unknown error'; console.error(`[EntryPointDiscovery] Error for dispensary ${dispensaryId}:`, errorMessage); + + // Mark as needs_investigation on unexpected errors + await pool.query(` + UPDATE dispensaries + SET id_resolution_status = 'needs_investigation', + id_resolution_error = $2, + last_modified_at = NOW(), + last_modified_by_task = $3, + last_modified_task_id = $4 + WHERE id = $1 + `, [dispensaryId, errorMessage, task.role, task.id]); + return { success: false, error: errorMessage, + hardFailure: true, }; + } finally { + if (browser) { + await browser.close().catch(() => {}); + } } } + +/** + * Helper to update dispensary with resolved platform ID and queue product_discovery + */ +async function updateDispensaryWithPlatformId( + pool: any, + dispensaryId: number, + platformId: string, + task: any, + source: string, + slug: string +): Promise { + await pool.query(` + UPDATE dispensaries + SET + platform_dispensary_id = $2, + menu_type = 'dutchie', + crawl_enabled = true, + id_resolution_status = 'resolved', + id_resolution_error = NULL, + updated_at = NOW(), + last_modified_at = NOW(), + last_modified_by_task = $3, + last_modified_task_id = $4 + WHERE id = $1 + `, [dispensaryId, platformId, task.role, task.id]); + + console.log(`[EntryPointDiscovery] Updated dispensary ${dispensaryId} with platform ID (source: ${source})`); + + // Queue product_discovery task + await pool.query(` + INSERT INTO worker_tasks (role, dispensary_id, priority, scheduled_for, method) + VALUES ('product_discovery', $1, 5, NOW(), 'http') + ON CONFLICT DO NOTHING + `, [dispensaryId]); + + console.log(`[EntryPointDiscovery] Queued product_discovery task for dispensary ${dispensaryId}`); +}