diff --git a/backend/src/dutchie-az/services/menu-detection.ts b/backend/src/dutchie-az/services/menu-detection.ts index 81f49b21..bc86ffc7 100644 --- a/backend/src/dutchie-az/services/menu-detection.ts +++ b/backend/src/dutchie-az/services/menu-detection.ts @@ -74,6 +74,8 @@ const PROVIDER_URL_PATTERNS: Array<{ provider: MenuProvider; patterns: RegExp[] /\/embedded-menu\//i, /\/dispensary\/[A-Z]{2}-/i, // e.g., /dispensary/AZ-store-name /dutchie-plus/i, + /curaleaf\.com/i, // Curaleaf uses Dutchie platform + /livewithsol\.com/i, // Sol Flower uses Dutchie platform ], }, { @@ -191,10 +193,11 @@ async function fetchPageLinks(url: string, timeout: number = 10000): Promise<{ l const controller = new AbortController(); const timeoutId = setTimeout(() => controller.abort(), timeout); + // Use Googlebot User-Agent to bypass age gates on dispensary websites const response = await fetch(url, { signal: controller.signal, headers: { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', }, redirect: 'follow', @@ -209,7 +212,8 @@ async function fetchPageLinks(url: string, timeout: number = 10000): Promise<{ l const html = await response.text(); // Quick check: if the page contains reactEnv.dispensaryId, treat it as Dutchie - const reactEnvMatch = /window\.reactEnv\s*=\s*\{[^}]*"dispensaryId"\s*:\s*"([a-fA-F0-9]+)"/i.exec(html); + // Use direct match for dispensaryId - the [^}]* pattern fails with nested braces in JSON + const reactEnvMatch = /"dispensaryId"\s*:\s*"([a-fA-F0-9]{24})"/i.exec(html); if (reactEnvMatch && reactEnvMatch[1]) { return { links: [`dutchie-reactenv:${reactEnvMatch[1]}`] }; } @@ -302,9 +306,10 @@ export async function crawlWebsiteForMenuLinks(websiteUrl: string): Promise console.log('Fatal:', e.message));