diff --git a/backend/src/dutchie-az/services/menu-detection.ts b/backend/src/dutchie-az/services/menu-detection.ts index 703b4789..81f49b21 100644 --- a/backend/src/dutchie-az/services/menu-detection.ts +++ b/backend/src/dutchie-az/services/menu-detection.ts @@ -208,6 +208,12 @@ async function fetchPageLinks(url: string, timeout: number = 10000): Promise<{ l const html = await response.text(); + // Quick check: if the page contains reactEnv.dispensaryId, treat it as Dutchie + const reactEnvMatch = /window\.reactEnv\s*=\s*\{[^}]*"dispensaryId"\s*:\s*"([a-fA-F0-9]+)"/i.exec(html); + if (reactEnvMatch && reactEnvMatch[1]) { + return { links: [`dutchie-reactenv:${reactEnvMatch[1]}`] }; + } + // Extract all href attributes from anchor tags const linkRegex = /href=["']([^"']+)["']/gi; const links: string[] = []; @@ -305,7 +311,8 @@ export async function crawlWebsiteForMenuLinks(websiteUrl: string): Promise) + for (const link of homepageLinks) { + const reactEnvToken = /^dutchie-reactenv:(.+)$/.exec(link); + if (reactEnvToken) { + result.menuUrl = homepage; + result.provider = 'dutchie'; + result.platformDispensaryId = reactEnvToken[1]; + console.log(`[WebsiteCrawl] Found reactEnv.dispensaryId=${reactEnvToken[1]} on ${homepage}`); + return result; + } + } + + // Step 3: Check for direct provider matches in homepage links for (const link of homepageLinks) { for (const { provider, patterns } of PROVIDER_URL_PATTERNS) { if (patterns.some(p => p.test(link))) { @@ -330,7 +349,7 @@ export async function crawlWebsiteForMenuLinks(websiteUrl: string): Promise { // Must be same domain or a known provider domain try { @@ -821,6 +840,64 @@ if (isSpecialDomain && website && website.trim() !== '') { ); console.log(`[MenuDetection] ${dispensary.name}: Resolved platform ID = ${platformId}`); } else { + // cName resolution failed - try crawling website as fallback + console.log(`[MenuDetection] ${dispensary.name}: cName "${cName}" not found on Dutchie, trying website crawl fallback...`); + + if (website && website.trim() !== '') { + const fallbackCrawl = await crawlWebsiteForMenuLinks(website); + + if (fallbackCrawl.menuUrl && fallbackCrawl.provider === 'dutchie') { + // Found Dutchie menu via website crawl! + console.log(`[MenuDetection] ${dispensary.name}: Found Dutchie menu via website crawl: ${fallbackCrawl.menuUrl}`); + + // Extract from the new menu URL + const newExtraction = extractFromMenuUrl(fallbackCrawl.menuUrl); + if (newExtraction) { + let fallbackPlatformId: string | null = null; + + if (newExtraction.type === 'platformId') { + fallbackPlatformId = newExtraction.value; + } else { + // Try to resolve the new cName + fallbackPlatformId = await resolveDispensaryId(newExtraction.value); + } + + if (fallbackPlatformId) { + result.platformDispensaryId = fallbackPlatformId; + result.success = true; + result.cName = newExtraction.value; + + await query( + ` + UPDATE dispensaries SET + menu_type = 'dutchie', + menu_url = $1, + platform_dispensary_id = $2, + provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) || + jsonb_build_object( + 'detected_provider', 'dutchie'::text, + 'detection_method', 'website_crawl_fallback'::text, + 'detected_at', NOW(), + 'original_cname', $3::text, + 'fallback_cname', $4::text, + 'website_crawled', $5::text, + 'platform_id_resolved', true, + 'platform_id_resolved_at', NOW(), + 'not_crawlable', false + ), + updated_at = NOW() + WHERE id = $6 + `, + [fallbackCrawl.menuUrl, fallbackPlatformId, cName, newExtraction.value, website, dispensaryId] + ); + console.log(`[MenuDetection] ${dispensary.name}: Resolved via website crawl, platform ID = ${fallbackPlatformId}`); + return result; + } + } + } + } + + // Website crawl fallback didn't work either result.error = `cName "${cName}" could not be resolved - may not exist on Dutchie`; await query( ` @@ -835,6 +912,7 @@ if (isSpecialDomain && website && website.trim() !== '') { 'cname_extracted', $1::text, 'platform_id_resolved', false, 'resolution_error', $2::text, + 'website_crawl_attempted', true, 'not_crawlable', true ), updated_at = NOW()