From 95fc8bb4cc5493bd8b9598ee7c60038e96dfd9c3 Mon Sep 17 00:00:00 2001 From: Kelly Date: Wed, 3 Dec 2025 21:41:45 -0700 Subject: [PATCH] Improve menu detection to extract platform ID from URL and crawl proprietary domains MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add extractFromMenuUrl() to discovery.ts that extracts either cName or platformId directly from Dutchie URLs (handles /api/v2/embedded-menu/<platformId>.js pattern) - Add isObjectId() helper to identify MongoDB ObjectIds in URLs - Update menu-detection.ts to skip GraphQL resolution when URL contains platformId directly - For proprietary domains (curaleaf, sol), crawl website to find actual menu provider instead of blindly marking as not_crawlable - If website crawl finds Dutchie embedded menu, set menu_type='dutchie' and resolve platform ID - Tested successfully with consumeaz.com which discovers Dutchie embedded menu JS URL 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- backend/src/dutchie-az/services/discovery.ts | 58 ++++- .../src/dutchie-az/services/menu-detection.ts | 227 ++++++++++++------ 2 files changed, 209 insertions(+), 76 deletions(-) diff --git a/backend/src/dutchie-az/services/discovery.ts b/backend/src/dutchie-az/services/discovery.ts index a283f5cb..1c0169ce 100644 --- a/backend/src/dutchie-az/services/discovery.ts +++ b/backend/src/dutchie-az/services/discovery.ts @@ -146,24 +146,59 @@ export async function discoverDispensaries(): Promise<{ discovered: number; erro } /** - * Extract cName (slug) from a Dutchie menu_url - * Supports formats: - * - https://dutchie.com/embedded-menu/ - * - https://dutchie.com/dispensary/ + * Check if a string looks like a MongoDB ObjectId (24 hex chars) */ -export function extractCNameFromMenuUrl(menuUrl: string | null | undefined): string | null { +export function isObjectId(value: string): boolean { + return /^[a-f0-9]{24}$/i.test(value); +} + +/** + * Extract cName (slug) or 
platform_dispensary_id from a Dutchie menu_url + * + * Supports formats: + * - https://dutchie.com/embedded-menu/<cName> -> returns { type: 'cName', value: '<cName>' } + * - https://dutchie.com/dispensary/<cName> -> returns { type: 'cName', value: '<cName>' } + * - https://dutchie.com/api/v2/embedded-menu/<platformId>.js -> returns { type: 'platformId', value: '<platformId>' } + * + * For backward compatibility, extractCNameFromMenuUrl still returns just the string value (which may be a platform ID rather than a cName). + */ +export interface MenuUrlExtraction { + type: 'cName' | 'platformId'; + value: string; +} + +export function extractFromMenuUrl(menuUrl: string | null | undefined): MenuUrlExtraction | null { if (!menuUrl) return null; try { const url = new URL(menuUrl); const pathname = url.pathname; + // Match /api/v2/embedded-menu/<platformId>.js - this contains the platform_dispensary_id directly + const apiMatch = pathname.match(/^\/api\/v2\/embedded-menu\/([a-f0-9]{24})\.js$/i); + if (apiMatch) { + return { type: 'platformId', value: apiMatch[1] }; + } + // Match /embedded-menu/ or /dispensary/ const embeddedMatch = pathname.match(/^\/embedded-menu\/([^/?]+)/); - if (embeddedMatch) return embeddedMatch[1]; + if (embeddedMatch) { + const value = embeddedMatch[1]; + // Check if it's actually an ObjectId (some URLs use ID directly) + if (isObjectId(value)) { + return { type: 'platformId', value }; + } + return { type: 'cName', value }; + } const dispensaryMatch = pathname.match(/^\/dispensary\/([^/?]+)/); - if (dispensaryMatch) return dispensaryMatch[1]; + if (dispensaryMatch) { + const value = dispensaryMatch[1]; + if (isObjectId(value)) { + return { type: 'platformId', value }; + } + return { type: 'cName', value }; + } return null; } catch { @@ -171,6 +206,15 @@ export function extractCNameFromMenuUrl(menuUrl: string | null | undefined): str } } +/** + * Extract cName (slug) from a Dutchie menu_url + * Backward compatible - use extractFromMenuUrl for full info + */ +export function extractCNameFromMenuUrl(menuUrl: string | null | undefined): string | null { + const 
extraction = extractFromMenuUrl(menuUrl); + return extraction?.value || null; +} + /** * Resolve platform dispensary IDs for all dispensaries that don't have one * CRITICAL: Uses cName extracted from menu_url, NOT the slug column! diff --git a/backend/src/dutchie-az/services/menu-detection.ts b/backend/src/dutchie-az/services/menu-detection.ts index 18ff23c9..249fdf5f 100644 --- a/backend/src/dutchie-az/services/menu-detection.ts +++ b/backend/src/dutchie-az/services/menu-detection.ts @@ -12,7 +12,7 @@ */ import { query } from '../db/connection'; -import { extractCNameFromMenuUrl, mapDbRowToDispensary } from './discovery'; +import { extractCNameFromMenuUrl, extractFromMenuUrl, mapDbRowToDispensary } from './discovery'; import { resolveDispensaryId } from './graphql-client'; import { Dispensary, JobStatus } from '../types'; @@ -461,51 +461,6 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise< const previousMenuType = dispensary.menuType || null; const website = dispensary.website; - // ============================================================ - // CURALEAF CHECK: If website is Curaleaf, override any stale Dutchie menu_url - // This prevents 60s Dutchie timeouts for stores that have migrated to Curaleaf's platform - // ============================================================ - if (isCuraleafUrl(website)) { - console.log(`[MenuDetection] ${dispensary.name}: Website is Curaleaf - marking as curaleaf provider`); - - // Use the Curaleaf website URL as the menu_url (clearing stale Dutchie URL if any) - // At this point we know website is defined since isCuraleafUrl returned true - const curaleafUrl = extractCuraleafStoreUrl(website!) 
|| website; - - await query( - ` - UPDATE dispensaries SET - menu_type = 'curaleaf', - menu_url = $1, - platform_dispensary_id = NULL, - provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) || - jsonb_build_object( - 'detected_provider', 'curaleaf'::text, - 'detection_method', 'website_pattern'::text, - 'detected_at', NOW(), - 'curaleaf_store_url', $1::text, - 'stale_dutchie_url', $2::text, - 'not_crawlable', true, - 'not_crawlable_reason', 'Curaleaf proprietary menu - no Dutchie integration'::text - ), - updated_at = NOW() - WHERE id = $3 - `, - [curaleafUrl, menuUrl || null, dispensaryId] - ); - - return { - dispensaryId, - dispensaryName: dispensary.name, - previousMenuType, - detectedProvider: 'curaleaf', - cName: null, - platformDispensaryId: null, - success: true, - error: undefined, - }; - } - // If menu_url is null or empty, try to discover it by crawling the dispensary website if (!menuUrl || menuUrl.trim() === '') { console.log(`[MenuDetection] ${dispensary.name}: No menu_url - attempting website crawl`); @@ -640,44 +595,127 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise< success: false, }; - // If not dutchie, just update menu_type and return - if (detectedProvider !== 'dutchie') { - // Special handling for proprietary providers - mark as not_crawlable until we have crawlers - const PROPRIETARY_PROVIDERS = ['curaleaf', 'sol'] as const; - const isProprietaryProvider = PROPRIETARY_PROVIDERS.includes(detectedProvider as any); - const notCrawlableReason = isProprietaryProvider - ? 
`${detectedProvider} proprietary menu - no crawler available` - : null; + // For proprietary providers (curaleaf, sol), try website crawl to find an actual menu provider + // These sites may have Dutchie embedded menus even though their domain suggests proprietary + const PROPRIETARY_DOMAINS = ['curaleaf', 'sol'] as const; + const isPotentiallyProprietary = PROPRIETARY_DOMAINS.includes(detectedProvider as any); + if (isPotentiallyProprietary && website && website.trim() !== '') { + console.log(`[MenuDetection] ${dispensary.name}: Detected ${detectedProvider} domain - crawling website to find actual menu provider...`); + const crawlResult = await crawlWebsiteForMenuLinks(website); + + if (crawlResult.menuUrl && crawlResult.provider !== 'unknown') { + // Found an actual menu provider (likely Dutchie) - use that instead! + console.log(`[MenuDetection] ${dispensary.name}: Website crawl found ${crawlResult.provider} menu at ${crawlResult.menuUrl}`); + menuUrl = crawlResult.menuUrl; + + // Re-detect provider from the found URL + const actualProvider = detectProviderFromUrl(menuUrl); + + // Update with the actual discovered provider + await query( + ` + UPDATE dispensaries SET + menu_url = $1, + menu_type = $2, + provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) || + jsonb_build_object( + 'detected_provider', $2::text, + 'detection_method', 'website_crawl'::text, + 'detected_at', NOW(), + 'original_url_provider', $3::text, + 'website_crawled', $4::text, + 'website_crawl_pages', $5::jsonb, + 'not_crawlable', false + ), + updated_at = NOW() + WHERE id = $6 + `, + [ + crawlResult.menuUrl, + actualProvider, + detectedProvider, + website, + JSON.stringify(crawlResult.crawledPages), + dispensaryId + ] + ); + + // If the actual provider is dutchie, continue to platform ID resolution + if (actualProvider === 'dutchie') { + result.detectedProvider = 'dutchie'; + // Fall through to dutchie platform ID resolution below + } else { + // Found a different 
provider (treez, jane, etc.) - we're done + result.detectedProvider = actualProvider; + result.success = true; + console.log(`[MenuDetection] ${dispensary.name}: Updated menu_type to ${actualProvider} (discovered from website crawl)`); + return result; + } + } else { + // Website crawl didn't find any menu provider - mark as proprietary + const notCrawlableReason = `${detectedProvider} proprietary menu - no embedded menu provider found`; + console.log(`[MenuDetection] ${dispensary.name}: Website crawl found no menu provider - marking as ${detectedProvider}`); + + await query( + ` + UPDATE dispensaries SET + menu_type = $1, + platform_dispensary_id = NULL, + provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) || + jsonb_build_object( + 'detected_provider', $1::text, + 'detection_method', 'url_pattern_with_crawl'::text, + 'detected_at', NOW(), + 'website_crawled', $2::text, + 'website_crawl_pages', $3::jsonb, + 'not_crawlable', true, + 'not_crawlable_reason', $4::text + ), + updated_at = NOW() + WHERE id = $5 + `, + [ + detectedProvider, + website, + JSON.stringify(crawlResult.crawledPages), + notCrawlableReason, + dispensaryId + ] + ); + result.success = true; + return result; + } + } + + // If neither the URL pattern nor the website crawl above resolved to dutchie, just update menu_type (also checking result.detectedProvider lets a crawl-discovered dutchie menu fall through) + if (detectedProvider !== 'dutchie' && result.detectedProvider !== 'dutchie') { await query( ` UPDATE dispensaries SET menu_type = $1, - platform_dispensary_id = CASE WHEN $3 THEN NULL ELSE platform_dispensary_id END, provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) || jsonb_build_object( 'detected_provider', $1::text, 'detection_method', 'url_pattern'::text, 'detected_at', NOW(), - 'not_crawlable', $3, - 'not_crawlable_reason', $4::text + 'not_crawlable', false ), updated_at = NOW() WHERE id = $2 `, - [detectedProvider, dispensaryId, isProprietaryProvider, notCrawlableReason] + [detectedProvider, dispensaryId] ); result.success = true; - console.log(`[MenuDetection] ${dispensary.name}: 
Updated menu_type to ${detectedProvider}${isProprietaryProvider ? ' (not crawlable)' : ''}`); + console.log(`[MenuDetection] ${dispensary.name}: Updated menu_type to ${detectedProvider}`); return result; } - // For dutchie: extract cName and resolve platform ID - const cName = extractCNameFromMenuUrl(menuUrl); - result.cName = cName; + // For dutchie: extract cName or platformId from menu_url + const extraction = extractFromMenuUrl(menuUrl); - if (!cName) { - result.error = `Could not extract cName from menu_url: ${menuUrl}`; + if (!extraction) { + result.error = `Could not extract cName or platformId from menu_url: ${menuUrl}`; await query( ` UPDATE dispensaries SET @@ -699,6 +737,41 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise< return result; } + // If URL contains platform_dispensary_id directly (e.g., /api/v2/embedded-menu/<platformId>.js), skip GraphQL resolution + if (extraction.type === 'platformId') { + const platformId = extraction.value; + result.platformDispensaryId = platformId; + result.success = true; + + await query( + ` + UPDATE dispensaries SET + menu_type = 'dutchie', + platform_dispensary_id = $1, + provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) || + jsonb_build_object( + 'detected_provider', 'dutchie'::text, + 'detection_method', 'url_direct_platform_id'::text, + 'detected_at', NOW(), + 'platform_id_source', 'url_embedded'::text, + 'platform_id_resolved', true, + 'platform_id_resolved_at', NOW(), + 'resolution_error', NULL::text, + 'not_crawlable', false + ), + updated_at = NOW() + WHERE id = $2 + `, + [platformId, dispensaryId] + ); + console.log(`[MenuDetection] ${dispensary.name}: Platform ID extracted directly from URL = ${platformId}`); + return result; + } + + // Otherwise, we have a cName that needs GraphQL resolution + const cName = extraction.value; + result.cName = cName; + // Resolve platform_dispensary_id from cName console.log(`[MenuDetection] ${dispensary.name}: Resolving platform 
ID for cName = ${cName}`); @@ -791,14 +864,23 @@ export async function runBulkDetection(options: { onlyUnknown?: boolean; onlyMissingPlatformId?: boolean; includeWebsiteCrawl?: boolean; // Include dispensaries with website but no menu_url + includeDutchieMissingPlatformId?: boolean; // include menu_type='dutchie' with null platform_id limit?: number; } = {}): Promise { - const { state, onlyUnknown = true, onlyMissingPlatformId = false, includeWebsiteCrawl = true, limit } = options; + const { + state, + onlyUnknown = true, + onlyMissingPlatformId = false, + includeWebsiteCrawl = true, + includeDutchieMissingPlatformId = true, + limit, + } = options; console.log('[MenuDetection] Starting bulk detection...'); // Build query to find dispensaries needing detection - // Now includes: dispensaries with menu_url OR (no menu_url but has website and not already marked not_crawlable) + // Includes: dispensaries with menu_url OR (no menu_url but has website and not already marked not_crawlable) + // Optionally includes dutchie stores missing platform ID let whereClause = `WHERE ( menu_url IS NOT NULL ${includeWebsiteCrawl ? `OR ( @@ -807,6 +889,9 @@ export async function runBulkDetection(options: { AND website != '' AND (provider_detection_data IS NULL OR NOT (provider_detection_data->>'not_crawlable')::boolean) )` : ''} + ${includeDutchieMissingPlatformId ? 
`OR ( + menu_type = 'dutchie' AND platform_dispensary_id IS NULL + )` : ''} )`; const params: any[] = []; let paramIndex = 1; @@ -816,15 +901,17 @@ export async function runBulkDetection(options: { params.push(state); } - // Handle the combination of onlyUnknown and onlyMissingPlatformId - // If both are set, include: unknown menu_type OR (dutchie without platform_id) + // Handle filters for unknown and/or missing platform IDs if (onlyUnknown && onlyMissingPlatformId) { whereClause += ` AND ( (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown') OR (menu_type = 'dutchie' AND platform_dispensary_id IS NULL) )`; } else if (onlyUnknown) { - whereClause += ` AND (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown')`; + whereClause += ` AND ( + (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown') + ${includeDutchieMissingPlatformId ? `OR (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)` : ''} + )`; } else if (onlyMissingPlatformId) { whereClause += ` AND (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)`; } @@ -899,6 +986,7 @@ export async function executeMenuDetectionJob(config: Record = {}): const onlyUnknown = config.onlyUnknown !== false; // Default to true - always try to resolve platform IDs for dutchie stores const onlyMissingPlatformId = config.onlyMissingPlatformId !== false; + const includeDutchieMissingPlatformId = config.includeDutchieMissingPlatformId !== false; console.log(`[MenuDetection] Executing scheduled job for state=${state}...`); @@ -907,6 +995,7 @@ export async function executeMenuDetectionJob(config: Record = {}): state, onlyUnknown, onlyMissingPlatformId, + includeDutchieMissingPlatformId, }); const status: JobStatus =