"use strict";
/**
 * Menu Detection Service
 *
 * Detects menu provider (dutchie, treez, jane, etc.) from dispensary menu_url
 * and resolves platform_dispensary_id for dutchie stores.
 *
 * This service:
 * 1. Iterates dispensaries with unknown/missing menu_type or platform_dispensary_id
 * 2. Detects provider from menu_url patterns
 * 3. For dutchie: extracts cName and resolves platform_dispensary_id via GraphQL
 * 4. Logs results to job_run_logs
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.crawlWebsiteForMenuLinks = crawlWebsiteForMenuLinks;
exports.detectProviderFromUrl = detectProviderFromUrl;
exports.detectAndResolveDispensary = detectAndResolveDispensary;
exports.runBulkDetection = runBulkDetection;
exports.executeMenuDetectionJob = executeMenuDetectionJob;
exports.getDetectionStats = getDetectionStats;
exports.getDispensariesNeedingDetection = getDispensariesNeedingDetection;
const connection_1 = require("../db/connection");
const discovery_1 = require("./discovery");
const graphql_client_1 = require("./graphql-client");

// Explicit column list for dispensaries table (avoids SELECT * issues with schema differences)
const DISPENSARY_COLUMNS = `
    id, name, slug, city, state, zip, address, latitude, longitude,
    menu_type, menu_url, platform_dispensary_id, website,
    provider_detection_data, created_at, updated_at
`;

// SQL predicate selecting dispensaries that have no menu_url but do have a website that
// has not been marked not_crawlable. COALESCE is required: when the 'not_crawlable' key is
// absent the ->> expression is NULL, and a bare NOT (NULL) would exclude the row.
const WEBSITE_CRAWL_CANDIDATE_CLAUSE = `
        menu_url IS NULL
        AND website IS NOT NULL
        AND website != ''
        AND COALESCE((provider_detection_data->>'not_crawlable')::boolean, false) = false
`;

// ============================================================
// PROVIDER DETECTION PATTERNS
// ============================================================
const PROVIDER_URL_PATTERNS = [
    // We detect provider based on the actual menu link we find, not just the site domain.
    {
        provider: 'dutchie',
        patterns: [
            /dutchie\.com/i,
            /\/embedded-menu\//i,
            /\/dispensary\/[A-Z]{2}-/i, // e.g., /dispensary/AZ-store-name
            /dutchie-plus/i,
            /curaleaf\.com/i, // Curaleaf uses Dutchie platform
            /livewithsol\.com/i, // Sol Flower uses Dutchie platform
        ],
    },
    {
        provider: 'treez',
        patterns: [
            /treez\.io/i,
            /shop\.treez/i,
            /treez-ecommerce/i,
        ],
    },
    {
        provider: 'jane',
        patterns: [
            /jane\.co/i,
            /iheartjane\.com/i,
            /embed\.iheartjane/i,
        ],
    },
    {
        provider: 'weedmaps',
        patterns: [
            /weedmaps\.com/i,
            /menu\.weedmaps/i,
        ],
    },
    {
        provider: 'leafly',
        patterns: [
            /leafly\.com/i,
            /order\.leafly/i,
        ],
    },
    {
        provider: 'meadow',
        patterns: [
            /getmeadow\.com/i,
            /meadow\.co/i,
        ],
    },
    {
        provider: 'blaze',
        patterns: [
            /blaze\.me/i,
            /blazepos\.com/i,
        ],
    },
    {
        provider: 'flowhub',
        patterns: [
            /flowhub\.com/i,
            /flowhub\.co/i,
        ],
    },
    {
        provider: 'dispense',
        patterns: [
            /dispense\.io/i,
            /dispenseapp\.com/i,
        ],
    },
];

/**
 * Link patterns that suggest a menu or ordering page
 */
const MENU_LINK_PATTERNS = [
    /\/menu/i,
    /\/order/i,
    /\/shop/i,
    /\/products/i,
    /\/dispensary/i,
    /\/store/i,
    /curaleaf\.com/i,
    /dutchie\.com/i,
    /treez\.io/i,
    /jane\.co/i,
    /iheartjane\.com/i,
    /weedmaps\.com/i,
    /leafly\.com/i,
    /getmeadow\.com/i,
    /blaze\.me/i,
    /flowhub\.com/i,
    /dispense\.io/i,
];

// Request headers for page fetches. Uses a Googlebot User-Agent to bypass age gates on
// dispensary websites.
const CRAWLER_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
};

// Matches an embedded Dutchie reactEnv dispensaryId (24 hex chars). Matches the key directly
// because a [^}]* style pattern fails with nested braces in JSON. No /g flag, so exec() is stateless.
const REACT_ENV_DISPENSARY_ID_RE = /"dispensaryId"\s*:\s*"([a-fA-F0-9]{24})"/i;

// Prefix used by fetchPageLinks to hand a reactEnv dispensaryId back through the links array.
const REACT_ENV_TOKEN_PREFIX = 'dutchie-reactenv:';

/**
 * Return the first provider in PROVIDER_URL_PATTERNS with a pattern matching `url`.
 *
 * @param {string} url - URL (or any string) to test.
 * @returns {string|null} Provider name, or null when no pattern matches.
 */
function matchProvider(url) {
    for (const { provider, patterns } of PROVIDER_URL_PATTERNS) {
        if (patterns.some(p => p.test(url))) {
            return provider;
        }
    }
    return null;
}

/**
 * Parse a website string into an http(s) URL object.
 *
 * Scheme-less values such as "example.com" make `new URL()` throw, and values such as
 * "example.com:8080" parse with a non-http protocol; both are retried with "https://" prepended.
 *
 * @param {string} websiteUrl - Raw website value.
 * @returns {URL|null} Parsed URL, or null when the value cannot be interpreted as a URL.
 */
function normalizeWebsiteUrl(websiteUrl) {
    const raw = typeof websiteUrl === 'string' ? websiteUrl.trim() : '';
    if (raw === '') {
        return null;
    }
    try {
        const parsed = new URL(raw);
        if (parsed.protocol.startsWith('http')) {
            return parsed;
        }
    }
    catch {
        // Not parseable as-is; retry below with an explicit scheme
    }
    try {
        return new URL(`https://${raw}`);
    }
    catch {
        return null;
    }
}

/**
 * Check if a URL is a Curaleaf store URL
 *
 * @param {string} url - URL to test.
 * @returns {boolean} True when the URL contains curaleaf.com/stores/ or curaleaf.com/dispensary/.
 */
function isCuraleafUrl(url) {
    if (!url)
        return false;
    return /curaleaf\.com\/(stores|dispensary)\//i.test(url);
}

/**
 * Fetch a page and extract all links
 *
 * If the HTML contains a reactEnv dispensaryId, the result is a single pseudo-link of the
 * form `dutchie-reactenv:{id}` instead of the page's real links. Otherwise the result holds
 * every href (resolved to absolute) plus any src attribute that matches a provider pattern.
 *
 * @param {string} url - Page to fetch.
 * @param {number} [timeout=10000] - Milliseconds before the request (including body read) is aborted.
 * @returns {Promise<{links: string[], error?: string}>} Never rejects; failures are reported in `error`.
 */
async function fetchPageLinks(url, timeout = 10000) {
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), timeout);
    try {
        const response = await fetch(url, {
            signal: controller.signal,
            headers: CRAWLER_HEADERS,
            redirect: 'follow',
        });
        if (!response.ok) {
            return { links: [], error: `HTTP ${response.status}` };
        }
        const html = await response.text();
        // Quick check: if the page contains reactEnv.dispensaryId, treat it as Dutchie
        const reactEnvMatch = REACT_ENV_DISPENSARY_ID_RE.exec(html);
        if (reactEnvMatch && reactEnvMatch[1]) {
            return { links: [`${REACT_ENV_TOKEN_PREFIX}${reactEnvMatch[1]}`] };
        }
        const links = [];
        let match;
        // Extract all href attributes
        const linkRegex = /href=["']([^"']+)["']/gi;
        while ((match = linkRegex.exec(html)) !== null) {
            try {
                // Convert relative URLs to absolute
                links.push(new URL(match[1], url).href);
            }
            catch {
                // Skip invalid URLs
            }
        }
        // Also look for src attributes (common for embedded menus via iframe/script)
        const srcRegex = /src=["']([^"']+)["']/gi;
        while ((match = srcRegex.exec(html)) !== null) {
            try {
                const absoluteUrl = new URL(match[1], url).href;
                // Only add if it matches a provider pattern
                if (matchProvider(absoluteUrl)) {
                    links.push(absoluteUrl);
                }
            }
            catch {
                // Skip invalid URLs
            }
        }
        return { links: [...new Set(links)] }; // Deduplicate
    }
    catch (error) {
        if (error.name === 'AbortError') {
            return { links: [], error: 'Timeout' };
        }
        return { links: [], error: error.message };
    }
    finally {
        // Always release the timer, including when fetch() or the body read throws
        clearTimeout(timeoutId);
    }
}

/**
 * Crawl a dispensary's website to find menu provider links
 *
 * Strategy:
 * 1. Fetch the homepage and extract all links
 * 2. If the homepage embeds a Dutchie reactEnv dispensaryId, report Dutchie with that ID
 * 3. Look for links that match known provider patterns (dutchie, treez, etc.)
 * 4. If no direct match, follow up to 3 menu/order/shop links
 * 5. Check followed pages for provider patterns
 *
 * @param {string} websiteUrl - Dispensary website; a missing scheme is treated as https.
 * @returns {Promise<object>} Object with `menuUrl` (string|null), `provider` ('unknown' when
 *   nothing was found), `foundLinks`, `crawledPages`, and optionally `platformDispensaryId`
 *   and `error`.
 */
async function crawlWebsiteForMenuLinks(websiteUrl) {
    console.log(`[WebsiteCrawl] Crawling ${websiteUrl} for menu links...`);
    const result = {
        menuUrl: null,
        provider: 'unknown',
        foundLinks: [],
        crawledPages: [],
    };
    // Normalize URL
    const baseUrl = normalizeWebsiteUrl(websiteUrl);
    if (!baseUrl) {
        result.error = 'Invalid website URL';
        return result;
    }
    // Step 1: Fetch the homepage
    const homepage = baseUrl.href;
    result.crawledPages.push(homepage);
    const { links: homepageLinks, error: homepageError } = await fetchPageLinks(homepage);
    if (homepageError) {
        result.error = `Failed to fetch homepage: ${homepageError}`;
        return result;
    }
    // Copy so later pushes to foundLinks do not mutate the list being inspected
    result.foundLinks = [...homepageLinks];
    // Step 2: Check for reactEnv token from fetchPageLinks (encoded as dutchie-reactenv:).
    // fetchPageLinks already scanned the homepage HTML, so no second request is needed.
    for (const link of homepageLinks) {
        const reactEnvToken = /^dutchie-reactenv:(.+)$/.exec(link);
        if (reactEnvToken) {
            result.menuUrl = homepage;
            result.provider = 'dutchie';
            result.platformDispensaryId = reactEnvToken[1];
            console.log(`[WebsiteCrawl] Found reactEnv.dispensaryId=${reactEnvToken[1]} on ${homepage}`);
            return result;
        }
    }
    // Step 3: Check for direct provider matches in homepage links
    for (const link of homepageLinks) {
        const provider = matchProvider(link);
        if (provider) {
            console.log(`[WebsiteCrawl] Found ${provider} link on homepage: ${link}`);
            result.menuUrl = link;
            result.provider = provider;
            return result;
        }
    }
    // Step 4: Find menu/order/shop links to follow
    const menuLinks = homepageLinks.filter(link => {
        // Must be same domain or a known provider domain
        try {
            const linkUrl = new URL(link);
            const isSameDomain = linkUrl.hostname === baseUrl.hostname ||
                linkUrl.hostname.endsWith(`.${baseUrl.hostname}`);
            const isProviderDomain = matchProvider(link) !== null;
            const isMenuPath = MENU_LINK_PATTERNS.some(p => p.test(link));
            return (isSameDomain && isMenuPath) || isProviderDomain;
        }
        catch {
            return false;
        }
    });
    console.log(`[WebsiteCrawl] Found ${menuLinks.length} potential menu links to follow`);
    // Step 5: Follow menu links (limit to 3 to avoid excessive crawling)
    for (const menuLink of menuLinks.slice(0, 3)) {
        // Skip if we've already crawled this page
        if (result.crawledPages.includes(menuLink))
            continue;
        // Check if this link itself is a provider URL
        const linkProvider = matchProvider(menuLink);
        if (linkProvider) {
            console.log(`[WebsiteCrawl] Menu link is a ${linkProvider} URL: ${menuLink}`);
            result.menuUrl = menuLink;
            result.provider = linkProvider;
            return result;
        }
        result.crawledPages.push(menuLink);
        // Rate limit
        await new Promise(r => setTimeout(r, 500));
        const { links: pageLinks, error: pageError } = await fetchPageLinks(menuLink);
        if (pageError) {
            console.log(`[WebsiteCrawl] Failed to fetch ${menuLink}: ${pageError}`);
            continue;
        }
        result.foundLinks.push(...pageLinks);
        // Check for provider matches on this page
        for (const link of pageLinks) {
            const provider = matchProvider(link);
            if (provider) {
                console.log(`[WebsiteCrawl] Found ${provider} link on ${menuLink}: ${link}`);
                result.menuUrl = link;
                result.provider = provider;
                return result;
            }
        }
    }
    console.log(`[WebsiteCrawl] No menu provider found on ${websiteUrl}`);
    return result;
}

// ============================================================
// CORE DETECTION FUNCTIONS
// ============================================================
/**
 * Detect menu provider from a URL
 *
 * @param {string|null|undefined} menuUrl - Menu URL to classify.
 * @returns {string} A provider name from PROVIDER_URL_PATTERNS; 'custom' for a parseable URL
 *   with a non-localhost hostname that matches no provider; otherwise 'unknown'.
 */
function detectProviderFromUrl(menuUrl) {
    if (!menuUrl)
        return 'unknown';
    const provider = matchProvider(menuUrl);
    if (provider) {
        return provider;
    }
    // Check if it's a custom website (has a domain but doesn't match known providers)
    try {
        const url = new URL(menuUrl);
        if (url.hostname && !url.hostname.includes('localhost')) {
            return 'custom';
        }
    }
    catch {
        // Invalid URL
    }
    return 'unknown';
}

/**
 * Detect provider and resolve platform ID for a single dispensary
 *
 * Reads the dispensary row, discovers a menu URL by crawling the website when menu_url is
 * empty, classifies the provider, and for dutchie resolves platform_dispensary_id (directly
 * from the crawl or URL, or via cName lookup with a website-crawl fallback). Each outcome
 * is persisted to the dispensaries table before returning.
 *
 * @param {*} dispensaryId - Primary key of the dispensary row.
 * @returns {Promise<object>} Detection result with dispensaryId, dispensaryName,
 *   previousMenuType, detectedProvider, cName, platformDispensaryId, success and optional error.
 *   Rejects if a database query fails.
 */
async function detectAndResolveDispensary(dispensaryId) {
    console.log(`[MenuDetection] Processing dispensary ${dispensaryId}...`);
    // Get dispensary record
    const { rows } = await (0, connection_1.query)(`SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE id = $1`, [dispensaryId]);
    if (rows.length === 0) {
        return {
            dispensaryId,
            dispensaryName: 'Unknown',
            previousMenuType: null,
            detectedProvider: 'unknown',
            cName: null,
            platformDispensaryId: null,
            success: false,
            error: 'Dispensary not found',
        };
    }
    const dispensary = (0, discovery_1.mapDbRowToDispensary)(rows[0]);
    let menuUrl = dispensary.menuUrl;
    const previousMenuType = dispensary.menuType || null;
    const website = dispensary.website;
    // If menu_url is null or empty, try to discover it by crawling the dispensary website
    if (!menuUrl || menuUrl.trim() === '') {
        console.log(`[MenuDetection] ${dispensary.name}: No menu_url - attempting website crawl`);
        // Check if website is available
        if (!website || website.trim() === '') {
            console.log(`[MenuDetection] ${dispensary.name}: No website available - marking as not crawlable`);
            await (0, connection_1.query)(`
                UPDATE dispensaries SET
                    menu_type = 'unknown',
                    provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
                    jsonb_build_object(
                        'detected_provider', 'unknown'::text,
                        'detection_method', 'no_data'::text,
                        'detected_at', NOW(),
                        'resolution_error', 'No menu_url and no website available'::text,
                        'not_crawlable', true,
                        'website_crawl_attempted', false
                    ),
                    updated_at = NOW()
                WHERE id = $1
            `, [dispensaryId]);
            return {
                dispensaryId,
                dispensaryName: dispensary.name,
                previousMenuType,
                detectedProvider: 'unknown',
                cName: null,
                platformDispensaryId: null,
                success: true,
                error: 'No menu_url and no website available - marked as not crawlable',
            };
        }
        // Crawl the website to find menu provider links
        console.log(`[MenuDetection] ${dispensary.name}: Crawling website ${website} for menu links...`);
        const crawlResult = await crawlWebsiteForMenuLinks(website);
        if (crawlResult.menuUrl && crawlResult.provider !== 'unknown') {
            // SUCCESS: Found a menu URL from website crawl!
            console.log(`[MenuDetection] ${dispensary.name}: Found ${crawlResult.provider} menu at ${crawlResult.menuUrl}`);
            menuUrl = crawlResult.menuUrl;
            // The crawl can return the Dutchie platform ID directly (embedded reactEnv). In that
            // case menuUrl is the site's homepage, which URL-pattern detection below would not
            // recognise as dutchie, so persist the ID here and finish.
            if (crawlResult.provider === 'dutchie' && crawlResult.platformDispensaryId) {
                const crawledPlatformId = crawlResult.platformDispensaryId;
                await (0, connection_1.query)(`
                    UPDATE dispensaries SET
                        menu_url = $1,
                        menu_type = 'dutchie',
                        platform_dispensary_id = $2,
                        provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
                        jsonb_build_object(
                            'detected_provider', 'dutchie'::text,
                            'detection_method', 'website_crawl'::text,
                            'detected_at', NOW(),
                            'website_crawled', $3::text,
                            'website_crawl_pages', $4::jsonb,
                            'platform_id_source', 'react_env'::text,
                            'platform_id_resolved', true,
                            'platform_id_resolved_at', NOW(),
                            'resolution_error', NULL::text,
                            'not_crawlable', false
                        ),
                        updated_at = NOW()
                    WHERE id = $5
                `, [
                    crawlResult.menuUrl,
                    crawledPlatformId,
                    website,
                    JSON.stringify(crawlResult.crawledPages),
                    dispensaryId
                ]);
                console.log(`[MenuDetection] ${dispensary.name}: Platform ID found via website crawl = ${crawledPlatformId}`);
                return {
                    dispensaryId,
                    dispensaryName: dispensary.name,
                    previousMenuType,
                    detectedProvider: 'dutchie',
                    cName: null,
                    platformDispensaryId: crawledPlatformId,
                    success: true,
                };
            }
            // Update the dispensary with the discovered menu_url
            await (0, connection_1.query)(`
                UPDATE dispensaries SET
                    menu_url = $1,
                    menu_type = $2,
                    provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
                    jsonb_build_object(
                        'detected_provider', $2::text,
                        'detection_method', 'website_crawl'::text,
                        'detected_at', NOW(),
                        'website_crawled', $3::text,
                        'website_crawl_pages', $4::jsonb,
                        'not_crawlable', false
                    ),
                    updated_at = NOW()
                WHERE id = $5
            `, [
                crawlResult.menuUrl,
                crawlResult.provider,
                website,
                JSON.stringify(crawlResult.crawledPages),
                dispensaryId
            ]);
            // Continue with full detection flow using the discovered menu_url
        }
        else {
            // Website crawl failed to find a menu provider
            const errorReason = crawlResult.error || 'No menu provider links found on website';
            console.log(`[MenuDetection] ${dispensary.name}: Website crawl failed - ${errorReason}`);
            await (0, connection_1.query)(`
                UPDATE dispensaries SET
                    menu_type = 'unknown',
                    provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
                    jsonb_build_object(
                        'detected_provider', 'unknown'::text,
                        'detection_method', 'website_crawl'::text,
                        'detected_at', NOW(),
                        'website_crawled', $1::text,
                        'website_crawl_pages', $2::jsonb,
                        'resolution_error', $3::text,
                        'not_crawlable', true
                    ),
                    updated_at = NOW()
                WHERE id = $4
            `, [
                website,
                JSON.stringify(crawlResult.crawledPages),
                errorReason,
                dispensaryId
            ]);
            return {
                dispensaryId,
                dispensaryName: dispensary.name,
                previousMenuType,
                detectedProvider: 'unknown',
                cName: null,
                platformDispensaryId: null,
                success: true,
                error: `Website crawl failed: ${errorReason}`,
            };
        }
    }
    // Detect provider from URL
    const detectedProvider = detectProviderFromUrl(menuUrl);
    console.log(`[MenuDetection] ${dispensary.name}: Detected provider = ${detectedProvider} from URL: ${menuUrl}`);
    // Initialize result
    const result = {
        dispensaryId,
        dispensaryName: dispensary.name,
        previousMenuType,
        detectedProvider,
        cName: null,
        platformDispensaryId: null,
        success: false,
    };
    // If not dutchie, just update menu_type (non-dutchie providers)
    // Note: curaleaf.com and livewithsol.com are detected directly as 'dutchie' via PROVIDER_URL_PATTERNS
    if (detectedProvider !== 'dutchie') {
        await (0, connection_1.query)(`
            UPDATE dispensaries SET
                menu_type = $1,
                provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
                jsonb_build_object(
                    'detected_provider', $1::text,
                    'detection_method', 'url_pattern'::text,
                    'detected_at', NOW(),
                    'not_crawlable', false
                ),
                updated_at = NOW()
            WHERE id = $2
        `, [detectedProvider, dispensaryId]);
        result.success = true;
        console.log(`[MenuDetection] ${dispensary.name}: Updated menu_type to ${detectedProvider}`);
        return result;
    }
    // For dutchie: extract cName or platformId from menu_url
    const extraction = (0, discovery_1.extractFromMenuUrl)(menuUrl);
    if (!extraction) {
        result.error = `Could not extract cName or platformId from menu_url: ${menuUrl}`;
        await (0, connection_1.query)(`
            UPDATE dispensaries SET
                menu_type = 'dutchie',
                provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
                jsonb_build_object(
                    'detected_provider', 'dutchie'::text,
                    'detection_method', 'url_pattern'::text,
                    'detected_at', NOW(),
                    'resolution_error', $1::text,
                    'not_crawlable', true
                ),
                updated_at = NOW()
            WHERE id = $2
        `, [result.error, dispensaryId]);
        console.log(`[MenuDetection] ${dispensary.name}: ${result.error}`);
        return result;
    }
    // If URL contains platform_dispensary_id directly (e.g., /api/v2/embedded-menu/{platformId}.js),
    // skip GraphQL resolution
    if (extraction.type === 'platformId') {
        const platformId = extraction.value;
        result.platformDispensaryId = platformId;
        result.success = true;
        await (0, connection_1.query)(`
            UPDATE dispensaries SET
                menu_type = 'dutchie',
                platform_dispensary_id = $1,
                provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
                jsonb_build_object(
                    'detected_provider', 'dutchie'::text,
                    'detection_method', 'url_direct_platform_id'::text,
                    'detected_at', NOW(),
                    'platform_id_source', 'url_embedded'::text,
                    'platform_id_resolved', true,
                    'platform_id_resolved_at', NOW(),
                    'resolution_error', NULL::text,
                    'not_crawlable', false
                ),
                updated_at = NOW()
            WHERE id = $2
        `, [platformId, dispensaryId]);
        console.log(`[MenuDetection] ${dispensary.name}: Platform ID extracted directly from URL = ${platformId}`);
        return result;
    }
    // Otherwise, we have a cName that needs GraphQL resolution
    const cName = extraction.value;
    result.cName = cName;
    // Resolve platform_dispensary_id from cName
    console.log(`[MenuDetection] ${dispensary.name}: Resolving platform ID for cName = ${cName}`);
    try {
        const platformId = await (0, graphql_client_1.resolveDispensaryId)(cName);
        if (platformId) {
            result.platformDispensaryId = platformId;
            result.success = true;
            await (0, connection_1.query)(`
                UPDATE dispensaries SET
                    menu_type = 'dutchie',
                    platform_dispensary_id = $1,
                    provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
                    jsonb_build_object(
                        'detected_provider', 'dutchie'::text,
                        'detection_method', 'url_pattern'::text,
                        'detected_at', NOW(),
                        'cname_extracted', $2::text,
                        'platform_id_resolved', true,
                        'platform_id_resolved_at', NOW(),
                        'resolution_error', NULL::text,
                        'not_crawlable', false
                    ),
                    updated_at = NOW()
                WHERE id = $3
            `, [platformId, cName, dispensaryId]);
            console.log(`[MenuDetection] ${dispensary.name}: Resolved platform ID = ${platformId}`);
        }
        else {
            // cName resolution failed - try crawling website as fallback
            console.log(`[MenuDetection] ${dispensary.name}: cName "${cName}" not found on Dutchie, trying website crawl fallback...`);
            if (website && website.trim() !== '') {
                const fallbackCrawl = await crawlWebsiteForMenuLinks(website);
                if (fallbackCrawl.menuUrl && fallbackCrawl.provider === 'dutchie') {
                    // Found Dutchie menu via website crawl!
                    console.log(`[MenuDetection] ${dispensary.name}: Found Dutchie menu via website crawl: ${fallbackCrawl.menuUrl}`);
                    // Prefer a platform ID the crawl found directly (embedded reactEnv); in that
                    // case menuUrl is the homepage and carries no cName to extract.
                    let fallbackPlatformId = fallbackCrawl.platformDispensaryId || null;
                    // Only set when the new menu URL actually yields a cName (not a platform ID)
                    let fallbackCName = null;
                    if (!fallbackPlatformId) {
                        // Extract from the new menu URL
                        const newExtraction = (0, discovery_1.extractFromMenuUrl)(fallbackCrawl.menuUrl);
                        if (newExtraction) {
                            if (newExtraction.type === 'platformId') {
                                fallbackPlatformId = newExtraction.value;
                            }
                            else {
                                // Try to resolve the new cName
                                fallbackCName = newExtraction.value;
                                fallbackPlatformId = await (0, graphql_client_1.resolveDispensaryId)(fallbackCName);
                            }
                        }
                    }
                    if (fallbackPlatformId) {
                        result.platformDispensaryId = fallbackPlatformId;
                        result.success = true;
                        if (fallbackCName) {
                            result.cName = fallbackCName;
                        }
                        await (0, connection_1.query)(`
                            UPDATE dispensaries SET
                                menu_type = 'dutchie',
                                menu_url = $1,
                                platform_dispensary_id = $2,
                                provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
                                jsonb_build_object(
                                    'detected_provider', 'dutchie'::text,
                                    'detection_method', 'website_crawl_fallback'::text,
                                    'detected_at', NOW(),
                                    'original_cname', $3::text,
                                    'fallback_cname', $4::text,
                                    'website_crawled', $5::text,
                                    'platform_id_resolved', true,
                                    'platform_id_resolved_at', NOW(),
                                    'not_crawlable', false
                                ),
                                updated_at = NOW()
                            WHERE id = $6
                        `, [fallbackCrawl.menuUrl, fallbackPlatformId, cName, fallbackCName, website, dispensaryId]);
                        console.log(`[MenuDetection] ${dispensary.name}: Resolved via website crawl, platform ID = ${fallbackPlatformId}`);
                        return result;
                    }
                }
            }
            // Website crawl fallback didn't work either
            result.error = `cName "${cName}" could not be resolved - may not exist on Dutchie`;
            await (0, connection_1.query)(`
                UPDATE dispensaries SET
                    menu_type = 'dutchie',
                    platform_dispensary_id = NULL,
                    provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
                    jsonb_build_object(
                        'detected_provider', 'dutchie'::text,
                        'detection_method', 'url_pattern'::text,
                        'detected_at', NOW(),
                        'cname_extracted', $1::text,
                        'platform_id_resolved', false,
                        'resolution_error', $2::text,
                        'website_crawl_attempted', true,
                        'not_crawlable', true
                    ),
                    updated_at = NOW()
                WHERE id = $3
            `, [cName, result.error, dispensaryId]);
            console.log(`[MenuDetection] ${dispensary.name}: ${result.error}`);
        }
    }
    catch (error) {
        result.error = `Resolution failed: ${error.message}`;
        await (0, connection_1.query)(`
            UPDATE dispensaries SET
                menu_type = 'dutchie',
                provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
                jsonb_build_object(
                    'detected_provider', 'dutchie'::text,
                    'detection_method', 'url_pattern'::text,
                    'detected_at', NOW(),
                    'cname_extracted', $1::text,
                    'platform_id_resolved', false,
                    'resolution_error', $2::text,
                    'not_crawlable', true
                ),
                updated_at = NOW()
            WHERE id = $3
        `, [cName, result.error, dispensaryId]);
        console.error(`[MenuDetection] ${dispensary.name}: ${result.error}`);
    }
    return result;
}

/**
 * Run bulk detection on all dispensaries with unknown/missing menu_type or platform_dispensary_id
 * Also includes dispensaries with no menu_url but with a website (for website crawl discovery)
 *
 * Dispensaries are processed sequentially with a 1 second pause after each successful call.
 *
 * @param {object} [options]
 * @param {string} [options.state] - Restrict to this state when provided.
 * @param {boolean} [options.onlyUnknown=true] - Restrict to NULL/empty/'unknown' menu_type.
 * @param {boolean} [options.onlyMissingPlatformId=false] - Restrict to dutchie rows lacking a platform ID.
 * @param {boolean} [options.includeWebsiteCrawl=true] - Include website-only rows for crawling.
 * @param {boolean} [options.includeDutchieMissingPlatformId=true] - Include dutchie rows lacking a platform ID.
 * @param {number} [options.limit] - Maximum rows to process.
 * @returns {Promise<object>} Totals (processed/succeeded/failed/skipped), per-dispensary
 *   results, and error strings.
 */
async function runBulkDetection(options = {}) {
    const { state, onlyUnknown = true, onlyMissingPlatformId = false, includeWebsiteCrawl = true, includeDutchieMissingPlatformId = true, limit, } = options;
    console.log('[MenuDetection] Starting bulk detection...');
    // Build query to find dispensaries needing detection
    // Includes: dispensaries with menu_url OR (no menu_url but has website and not already marked not_crawlable)
    // Optionally includes dutchie stores missing platform ID
    let whereClause = `WHERE (
        menu_url IS NOT NULL
        ${includeWebsiteCrawl ? `OR (${WEBSITE_CRAWL_CANDIDATE_CLAUSE})` : ''}
        ${includeDutchieMissingPlatformId ? `OR (
            menu_type = 'dutchie' AND platform_dispensary_id IS NULL
        )` : ''}
    )`;
    const params = [];
    let paramIndex = 1;
    if (state) {
        whereClause += ` AND state = $${paramIndex++}`;
        params.push(state);
    }
    // Handle filters for unknown and/or missing platform IDs
    if (onlyUnknown && onlyMissingPlatformId) {
        whereClause += ` AND (
            (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown')
            OR (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)
        )`;
    }
    else if (onlyUnknown) {
        whereClause += ` AND (
            (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown')
            ${includeDutchieMissingPlatformId ? `OR (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)` : ''}
        )`;
    }
    else if (onlyMissingPlatformId) {
        whereClause += ` AND (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)`;
    }
    else if (includeDutchieMissingPlatformId) {
        // Always attempt to resolve dutchie stores missing platform IDs
        // NOTE(review): because this is ANDed, it restricts the run to only those stores when
        // both "only" flags are false - confirm that is the intended meaning.
        whereClause += ` AND (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)`;
    }
    let query_str = `
        SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
        ${whereClause}
        ORDER BY name
    `;
    if (limit) {
        query_str += ` LIMIT $${paramIndex}`;
        params.push(limit);
    }
    const { rows: dispensaries } = await (0, connection_1.query)(query_str, params);
    console.log(`[MenuDetection] Found ${dispensaries.length} dispensaries to process (includeWebsiteCrawl=${includeWebsiteCrawl})`);
    const result = {
        totalProcessed: 0,
        totalSucceeded: 0,
        totalFailed: 0,
        totalSkipped: 0,
        results: [],
        errors: [],
    };
    for (const row of dispensaries) {
        result.totalProcessed++;
        try {
            const detectionResult = await detectAndResolveDispensary(row.id);
            result.results.push(detectionResult);
            if (detectionResult.success) {
                result.totalSucceeded++;
            }
            else {
                result.totalFailed++;
                if (detectionResult.error) {
                    result.errors.push(`${detectionResult.dispensaryName}: ${detectionResult.error}`);
                }
            }
            // Rate limit between requests
            await new Promise(r => setTimeout(r, 1000));
        }
        catch (error) {
            // One failing dispensary must not abort the whole batch
            result.totalFailed++;
            result.errors.push(`${row.name || row.id}: ${error.message}`);
        }
    }
    console.log(`[MenuDetection] Bulk detection complete: ${result.totalSucceeded} succeeded, ${result.totalFailed} failed`);
    return result;
}

// ============================================================
// SCHEDULED JOB EXECUTOR
// ============================================================
/**
 * Execute the menu detection job (called by scheduler)
 *
 * @param {object} [config] - Optional overrides: state (default 'AZ'), onlyUnknown,
 *   onlyMissingPlatformId, includeDutchieMissingPlatformId (each true unless explicitly false).
 * @returns {Promise<object>} Job summary with status ('success' | 'partial' | 'error'), item
 *   counts, up to five joined error messages, and metadata. Never rejects.
 */
async function executeMenuDetectionJob(config = {}) {
    const state = config.state || 'AZ';
    const onlyUnknown = config.onlyUnknown !== false;
    // Default to true - always try to resolve platform IDs for dutchie stores
    const onlyMissingPlatformId = config.onlyMissingPlatformId !== false;
    const includeDutchieMissingPlatformId = config.includeDutchieMissingPlatformId !== false;
    console.log(`[MenuDetection] Executing scheduled job for state=${state}...`);
    try {
        const result = await runBulkDetection({
            state,
            onlyUnknown,
            onlyMissingPlatformId,
            includeDutchieMissingPlatformId,
        });
        const status = result.totalFailed === 0 ? 'success' :
            result.totalSucceeded === 0 ? 'error' : 'partial';
        return {
            status,
            itemsProcessed: result.totalProcessed,
            itemsSucceeded: result.totalSucceeded,
            itemsFailed: result.totalFailed,
            errorMessage: result.errors.length > 0 ? result.errors.slice(0, 5).join('; ') : undefined,
            metadata: {
                state,
                onlyUnknown,
                onlyMissingPlatformId,
                providerCounts: countByProvider(result.results),
            },
        };
    }
    catch (error) {
        return {
            status: 'error',
            itemsProcessed: 0,
            itemsSucceeded: 0,
            itemsFailed: 0,
            errorMessage: error.message,
        };
    }
}

/**
 * Count results by detected provider
 *
 * @param {Array<{detectedProvider: string}>} results - Detection results.
 * @returns {Object<string, number>} Map of provider name to number of results.
 */
function countByProvider(results) {
    const counts = {};
    for (const r of results) {
        counts[r.detectedProvider] = (counts[r.detectedProvider] || 0) + 1;
    }
    return counts;
}

// ============================================================
// UTILITY FUNCTIONS
// ============================================================
/**
 * Get detection stats for dashboard
 *
 * Both queries are limited to state = 'AZ'.
 *
 * @returns {Promise<object>} totalDispensaries, withMenuType, withPlatformId, needsDetection
 *   (all integers) and byProvider (menu_type to count).
 */
async function getDetectionStats() {
    const { rows } = await (0, connection_1.query)(`
        SELECT
            COUNT(*) as total,
            COUNT(*) FILTER (WHERE menu_type IS NOT NULL AND menu_type != '' AND menu_type != 'unknown') as with_menu_type,
            COUNT(*) FILTER (WHERE platform_dispensary_id IS NOT NULL) as with_platform_id,
            COUNT(*) FILTER (WHERE menu_url IS NOT NULL AND (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown')) as needs_detection
        FROM dispensaries
        WHERE state = 'AZ'
    `);
    const stats = rows[0] || {};
    // Get provider breakdown
    const { rows: providerRows } = await (0, connection_1.query)(`
        SELECT menu_type, COUNT(*) as count
        FROM dispensaries
        WHERE state = 'AZ' AND menu_type IS NOT NULL AND menu_type != ''
        GROUP BY menu_type
        ORDER BY count DESC
    `);
    const byProvider = {};
    for (const row of providerRows) {
        byProvider[row.menu_type] = parseInt(row.count, 10);
    }
    return {
        totalDispensaries: parseInt(stats.total || '0', 10),
        withMenuType: parseInt(stats.with_menu_type || '0', 10),
        withPlatformId: parseInt(stats.with_platform_id || '0', 10),
        needsDetection: parseInt(stats.needs_detection || '0', 10),
        byProvider,
    };
}

/**
 * Get dispensaries needing detection
 * Includes dispensaries with website but no menu_url for website crawl discovery
 *
 * @param {object} [options]
 * @param {string} [options.state='AZ'] - State to query.
 * @param {number} [options.limit=100] - Maximum rows returned.
 * @param {boolean} [options.includeWebsiteCrawl=true] - Include website-only rows not marked not_crawlable.
 * @returns {Promise<Array>} Rows mapped through mapDbRowToDispensary, ordered by name.
 */
async function getDispensariesNeedingDetection(options = {}) {
    const { state = 'AZ', limit = 100, includeWebsiteCrawl = true } = options;
    const { rows } = await (0, connection_1.query)(`
        SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
        WHERE state = $1
        AND (
            (menu_url IS NOT NULL AND (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown'
                OR (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)))
            ${includeWebsiteCrawl ? `OR (${WEBSITE_CRAWL_CANDIDATE_CLAUSE})` : ''}
        )
        ORDER BY name
        LIMIT $2
    `, [state, limit]);
    return rows.map(discovery_1.mapDbRowToDispensary);
}