/**
 * Multi-Category Intelligence Detector
 *
 * Detects providers for each intelligence category independently:
 * - Products: Which provider serves product data
 * - Specials: Which provider serves deals/specials
 * - Brand: Which provider serves brand information
 * - Metadata: Which provider serves taxonomy/category data
 */

import { pool } from '../db/pool';
import { logger } from './logger';
import puppeteer, { Browser, Page } from 'puppeteer';

// ========================================
// Types
// ========================================
// NOTE(review): the generic type arguments in this file were reconstructed
// from how each value is used — confirm against version control.

export type IntelligenceCategory = 'product' | 'specials' | 'brand' | 'metadata';

export type MenuProvider =
  | 'dutchie'
  | 'treez'
  | 'jane'
  | 'iheartjane'
  | 'weedmaps'
  | 'leafly'
  | 'meadow'
  | 'greenlight'
  | 'blaze'
  | 'flowhub'
  | 'dispense'
  | 'cova'
  | 'custom_html'
  | 'custom_json'
  | 'dutchie_json'
  | 'other'
  | 'unknown';

/** Detection outcome for a single intelligence category. */
export interface CategoryDetectionResult {
  provider: MenuProvider;
  /** Capped match score in the range 0-100. */
  confidence: number;
  mode: 'production' | 'sandbox';
  /** Evidence that was matched, keyed by `<provider|category>_<kind>_<page>`. */
  signals: Record<string, any>;
  templateName?: string;
}

/** Detection outcome for all four categories plus the raw evidence gathered. */
export interface MultiCategoryDetectionResult {
  product: CategoryDetectionResult;
  specials: CategoryDetectionResult;
  brand: CategoryDetectionResult;
  metadata: CategoryDetectionResult;
  urlsTested: string[];
  rawSignals: Record<string, any>;
}

/** Regular expressions that identify one provider, grouped by where they are looked for. */
interface ProviderPatterns {
  scripts: RegExp[];
  iframes: RegExp[];
  html: RegExp[];
  apiEndpoints: RegExp[];
  metaTags: RegExp[];
}

/** Patterns and JSON keys that indicate a page carries data for one category. */
interface CategorySignalPatterns {
  urlPatterns: RegExp[];
  htmlPatterns: RegExp[];
  jsonKeys: string[];
}

/** Everything `collectPageSignals` extracts from a loaded page. */
interface PageSignals {
  scripts: string[];
  iframes: string[];
  links: string[];
  metaTags: string[];
  bodyClasses: string;
  bodyId: string;
  htmlSnippet: string;
  jsonBlocks: string[];
}

// Production-ready providers per category
// Only these combinations can be set to production mode
const PRODUCTION_READY: Record<IntelligenceCategory, MenuProvider[]> = {
  product: ['dutchie'], // Only Dutchie products are production-ready
  specials: [], // None yet
  brand: [], // None yet
  metadata: [], // None yet
};

// Provider detection patterns.
// Only `scripts`, `iframes` and `html` are consulted by analyzeCategorySignals;
// `apiEndpoints` and `metaTags` are declared but not scored there.
const PROVIDER_PATTERNS: Record<string, ProviderPatterns> = {
  dutchie: {
    scripts: [
      /dutchie\.com/i,
      /dutchie-plus/i,
      /dutchie\.js/i,
      /__DUTCHIE__/i,
      /dutchie-embed/i,
    ],
    iframes: [
      /dutchie\.com/i,
      /dutchie-plus\.com/i,
      /embed\.dutchie/i,
    ],
    html: [
      /class="dutchie/i,
      /id="dutchie/i,
      /data-dutchie/i,
      /"menuType":\s*"dutchie"/i,
    ],
    apiEndpoints: [
      /dutchie\.com\/graphql/i,
      /plus\.dutchie\.com/i,
    ],
    metaTags: [
      /dutchie/i,
    ],
  },
  treez: {
    scripts: [
      /treez\.io/i,
      /treez-ecommerce/i,
      /treez\.js/i,
    ],
    iframes: [
      /treez\.io/i,
      /shop\.treez/i,
    ],
    html: [
      /class="treez/i,
      /data-treez/i,
      /treez-menu/i,
    ],
    apiEndpoints: [
      /api\.treez\.io/i,
      /treez\.io\/api/i,
    ],
    metaTags: [],
  },
  jane: {
    scripts: [
      /jane\.co/i,
      /iheartjane\.com/i,
      /jane-frame/i,
      /jane\.js/i,
    ],
    iframes: [
      /jane\.co/i,
      /iheartjane\.com/i,
      /embed\.iheartjane/i,
    ],
    html: [
      /class="jane/i,
      /data-jane/i,
      /jane-embed/i,
    ],
    apiEndpoints: [
      /api\.iheartjane/i,
      /jane\.co\/api/i,
    ],
    metaTags: [],
  },
  weedmaps: {
    scripts: [
      /weedmaps\.com/i,
      /wm-menu/i,
    ],
    iframes: [
      /weedmaps\.com/i,
      /menu\.weedmaps/i,
    ],
    html: [
      /data-weedmaps/i,
      /wm-menu/i,
    ],
    apiEndpoints: [
      /api-g\.weedmaps/i,
      /weedmaps\.com\/api/i,
    ],
    metaTags: [],
  },
  leafly: {
    scripts: [
      /leafly\.com/i,
      /leafly-menu/i,
    ],
    iframes: [
      /leafly\.com/i,
      /order\.leafly/i,
    ],
    html: [
      /data-leafly/i,
      /leafly-embed/i,
    ],
    apiEndpoints: [
      /api\.leafly/i,
    ],
    metaTags: [],
  },
};

// Cast once here: every key of PROVIDER_PATTERNS is a MenuProvider literal.
const PROVIDER_ENTRIES = Object.entries(PROVIDER_PATTERNS) as [MenuProvider, ProviderPatterns][];

// Category-specific detection signals
const CATEGORY_SIGNALS: Record<IntelligenceCategory, CategorySignalPatterns> = {
  product: {
    urlPatterns: [/\/menu/i, /\/products/i, /\/shop/i, /\/order/i],
    htmlPatterns: [/product-card/i, /menu-item/i, /product-list/i, /product-grid/i],
    jsonKeys: ['products', 'menuItems', 'items', 'inventory'],
  },
  specials: {
    urlPatterns: [/\/specials/i, /\/deals/i, /\/promotions/i, /\/offers/i],
    htmlPatterns: [/special/i, /deal/i, /promotion/i, /discount/i, /sale/i],
    jsonKeys: ['specials', 'deals', 'promotions', 'offers'],
  },
  brand: {
    urlPatterns: [/\/brands/i, /\/vendors/i, /\/producers/i],
    htmlPatterns: [/brand-list/i, /vendor/i, /producer/i, /manufacturer/i],
    jsonKeys: ['brands', 'vendors', 'producers', 'manufacturers'],
  },
  metadata: {
    urlPatterns: [/\/categories/i, /\/taxonomy/i],
    htmlPatterns: [/category-nav/i, /menu-categories/i, /filter-category/i],
    jsonKeys: ['categories', 'taxonomy', 'filters', 'types'],
  },
};

// Sub-paths probed after the main page.
const MENU_PATHS = ['/menu', '/shop', '/products', '/order', '/specials', '/deals', '/brands'];

// Upper bound for each sub-path navigation; the caller's `timeout` can lower it.
const SUBPAGE_TIMEOUT_MS = 15000;

// Points added per matching pattern, by signal kind.
const SCRIPT_MATCH_SCORE = 20;
const IFRAME_MATCH_SCORE = 25;
const HTML_MATCH_SCORE = 15;

// Minimum confidence for a production-ready provider to run in production mode.
const PRODUCTION_MIN_CONFIDENCE = 70;

// A provider change is only reported above this confidence.
const CHANGE_MIN_CONFIDENCE = 50;

// ========================================
// Main Detection Function
// ========================================

/**
 * Loads a website and a fixed set of common sub-paths in a headless browser,
 * collects page signals, and scores them per intelligence category.
 *
 * @param websiteUrl Site to inspect; a missing `http(s)://` scheme is filled in.
 * @param options.timeout Navigation timeout in ms for the main page (default 30000).
 *   Sub-pages use the smaller of this value and 15000.
 * @param options.headless Passed to `puppeteer.launch` when a browser is created here.
 * @param options.existingBrowser Browser to reuse. It is left open on return;
 *   only the page opened by this function is closed.
 * @returns Per-category results. On failure this does not throw: every category
 *   is `unknown` and `rawSignals` is `{ error: <message> }`.
 */
export async function detectMultiCategoryProviders(
  websiteUrl: string,
  options: {
    timeout?: number;
    headless?: boolean;
    existingBrowser?: Browser;
  } = {}
): Promise<MultiCategoryDetectionResult> {
  const { timeout = 30000, headless = true, existingBrowser } = options;

  let browser: Browser | null = null;
  let page: Page | null = null;
  const urlsTested: string[] = [];
  const rawSignals: Record<string, any> = {};

  try {
    browser =
      existingBrowser ??
      (await puppeteer.launch({
        headless,
        args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
      }));

    page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');

    // Navigate to main site
    const baseUrl = normalizeUrl(websiteUrl);
    urlsTested.push(baseUrl);
    await page.goto(baseUrl, { waitUntil: 'networkidle2', timeout });

    // Collect signals from main page
    rawSignals.mainPage = await collectPageSignals(page);

    // Try common menu URLs
    const subpageTimeout = Math.min(timeout, SUBPAGE_TIMEOUT_MS);
    for (const path of MENU_PATHS) {
      try {
        const fullUrl = new URL(path, baseUrl).toString();
        urlsTested.push(fullUrl);

        const response = await page.goto(fullUrl, {
          waitUntil: 'networkidle2',
          timeout: subpageTimeout,
        });

        // goto() resolves normally for 404/500 responses, so an error page
        // would otherwise be scored as if it were a real menu page.
        if (response && !response.ok()) {
          continue;
        }

        rawSignals[path] = await collectPageSignals(page);
      } catch {
        // Best effort: the URL is invalid, unreachable, or timed out.
      }
    }

    // Analyze signals for each category
    const result: MultiCategoryDetectionResult = {
      product: analyzeCategorySignals('product', rawSignals),
      specials: analyzeCategorySignals('specials', rawSignals),
      brand: analyzeCategorySignals('brand', rawSignals),
      metadata: analyzeCategorySignals('metadata', rawSignals),
      urlsTested,
      rawSignals,
    };

    logger.info('provider-detection', `Multi-category detection complete for ${websiteUrl}`);
    return result;
  } catch (error: unknown) {
    const message = errorMessage(error);
    logger.error('provider-detection', `Detection failed for ${websiteUrl}: ${message}`);

    // Return unknown results for all categories
    return {
      product: createUnknownResult(),
      specials: createUnknownResult(),
      brand: createUnknownResult(),
      metadata: createUnknownResult(),
      urlsTested,
      rawSignals: { error: message },
    };
  } finally {
    if (page) await page.close().catch(() => {});
    // A browser supplied by the caller stays open; only one launched here is closed.
    if (browser && !existingBrowser) await browser.close().catch(() => {});
  }
}

// ========================================
// Helper Functions
// ========================================

/** Extracts a printable message from anything that can be thrown. */
function errorMessage(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Trims the input, prepends `https://` when no `http://` or `https://` scheme
 * is present (case-insensitive), and removes one trailing slash.
 */
function normalizeUrl(url: string): string {
  const trimmed = url.trim();
  const withScheme = /^https?:\/\//i.test(trimmed) ? trimmed : `https://${trimmed}`;
  return withScheme.replace(/\/$/, '');
}

/**
 * Extracts detection evidence from the page currently loaded in `page`.
 *
 * The callback runs inside the browser via `page.evaluate`, so it must not
 * reference anything from this module's scope.
 */
async function collectPageSignals(page: Page): Promise<PageSignals> {
  return page.evaluate(() => {
    const scripts: string[] = [];
    const iframes: string[] = [];
    const links: string[] = [];
    const metaTags: string[] = [];
    const jsonBlocks: string[] = [];

    // Collect script sources
    document.querySelectorAll<HTMLScriptElement>('script[src]').forEach((el) => {
      scripts.push(el.src);
    });

    // Collect inline scripts: only those under 5000 chars, first 500 chars each
    document.querySelectorAll('script:not([src])').forEach((el) => {
      const content = el.textContent || '';
      if (content.length < 5000) {
        scripts.push(`inline:${content.slice(0, 500)}`);
      }
    });

    // Collect iframes
    document.querySelectorAll('iframe').forEach((el) => {
      iframes.push(el.src);
    });

    // Collect links
    document.querySelectorAll<HTMLAnchorElement>('a[href]').forEach((el) => {
      links.push(el.href);
    });

    // Collect meta tags as "name:content"
    document.querySelectorAll('meta').forEach((el) => {
      const content = el.getAttribute('content') || '';
      const name = el.getAttribute('name') || el.getAttribute('property') || '';
      if (content || name) {
        metaTags.push(`${name}:${content}`);
      }
    });

    // Look for JSON data (first 2000 chars of each block)
    document.querySelectorAll('script[type="application/json"]').forEach((el) => {
      jsonBlocks.push(el.textContent?.slice(0, 2000) || '');
    });

    return {
      scripts,
      iframes,
      links,
      metaTags,
      bodyClasses: document.body?.className || '',
      bodyId: document.body?.id || '',
      // Only the first 10000 chars of the document are kept for HTML matching.
      htmlSnippet: document.documentElement.outerHTML.slice(0, 10000),
      jsonBlocks,
    };
  });
}

/**
 * Adds `weight` for every (value, pattern) pair that matches and reports each
 * matching value through `onMatch`. Returns 0 when `values` is not an array.
 */
function scoreListMatches(
  values: unknown,
  patterns: readonly RegExp[],
  weight: number,
  onMatch: (value: string) => void
): number {
  if (!Array.isArray(values)) return 0;

  let score = 0;
  for (const value of values) {
    if (typeof value !== 'string') continue;
    for (const pattern of patterns) {
      if (pattern.test(value)) {
        score += weight;
        onMatch(value);
      }
    }
  }
  return score;
}

/**
 * Scores every known provider against the collected page signals and picks the
 * highest scorer for `category`.
 *
 * Provider scores are summed across all pages and do not depend on `category`;
 * the category only decides which extra evidence keys are recorded and whether
 * the result may be marked `production`.
 *
 * @param category Category being analyzed.
 * @param allSignals Page signals keyed by page label or path; entries that are
 *   not objects are ignored.
 * @returns The best provider, or `unknown` with confidence 0 when nothing matched.
 */
function analyzeCategorySignals(
  category: IntelligenceCategory,
  allSignals: Record<string, any>
): CategoryDetectionResult {
  const providerScores = new Map<MenuProvider, number>();
  const detectedSignals: Record<string, any> = {};
  const categorySignals = CATEGORY_SIGNALS[category];

  // Analyze each page's signals
  for (const [pagePath, signals] of Object.entries(allSignals)) {
    if (!signals || typeof signals !== 'object') continue;

    const htmlSnippet: string | undefined =
      typeof signals.htmlSnippet === 'string' && signals.htmlSnippet ? signals.htmlSnippet : undefined;

    // Check for provider-specific patterns
    for (const [provider, patterns] of PROVIDER_ENTRIES) {
      let score = 0;

      score += scoreListMatches(signals.scripts, patterns.scripts, SCRIPT_MATCH_SCORE, (script) => {
        detectedSignals[`${provider}_script_${pagePath}`] = script;
      });

      score += scoreListMatches(signals.iframes, patterns.iframes, IFRAME_MATCH_SCORE, (iframe) => {
        detectedSignals[`${provider}_iframe_${pagePath}`] = iframe;
      });

      if (htmlSnippet !== undefined) {
        for (const pattern of patterns.html) {
          if (pattern.test(htmlSnippet)) {
            score += HTML_MATCH_SCORE;
            detectedSignals[`${provider}_html_${pagePath}`] = true;
          }
        }
      }

      providerScores.set(provider, (providerScores.get(provider) ?? 0) + score);
    }

    // Check for category-specific signals on relevant pages
    const isRelevantPage = categorySignals.urlPatterns.some((p) => p.test(pagePath));
    if (
      isRelevantPage &&
      htmlSnippet !== undefined &&
      categorySignals.htmlPatterns.some((p) => p.test(htmlSnippet))
    ) {
      detectedSignals[`${category}_html_pattern`] = true;
    }

    // Check JSON blocks for category data
    if (Array.isArray(signals.jsonBlocks)) {
      for (const json of signals.jsonBlocks) {
        if (typeof json !== 'string') continue;
        const haystack = json.toLowerCase();
        for (const key of categorySignals.jsonKeys) {
          // Both sides are lowercased; otherwise mixed-case keys such as
          // "menuItems" could never match the lowercased block.
          if (haystack.includes(`"${key.toLowerCase()}"`)) {
            detectedSignals[`${category}_json_key_${key}`] = true;
          }
        }
      }
    }
  }

  // Determine winning provider; on a tie the provider declared first wins.
  let bestProvider: MenuProvider = 'unknown';
  let bestScore = 0;
  for (const [provider, score] of providerScores) {
    if (score > bestScore) {
      bestScore = score;
      bestProvider = provider;
    }
  }

  // Calculate confidence (0-100)
  const confidence = Math.min(100, bestScore);

  // Determine mode based on provider and confidence
  const isProductionReady = PRODUCTION_READY[category].includes(bestProvider);
  const mode: 'production' | 'sandbox' =
    isProductionReady && confidence >= PRODUCTION_MIN_CONFIDENCE ? 'production' : 'sandbox';

  // Get template name if available
  let templateName: string | undefined;
  if (bestProvider === 'dutchie' && category === 'product') {
    templateName = 'dutchie_standard';
  } else if (bestProvider === 'treez') {
    // NOTE(review): applied for every category, not only 'product' — confirm intended.
    templateName = 'treez_products_v0';
  }

  return {
    provider: bestProvider,
    confidence,
    mode,
    signals: detectedSignals,
    templateName,
  };
}

/** Result used when detection could not run: unknown provider, zero confidence, sandbox mode. */
function createUnknownResult(): CategoryDetectionResult {
  return {
    provider: 'unknown',
    confidence: 0,
    mode: 'sandbox',
    signals: {},
  };
}

// ========================================
// Lightweight Per-Category Change Detection
// ========================================

/**
 * Re-scores the page that is already loaded in `page` and reports whether the
 * detected provider differs from `expectedProvider`.
 *
 * A change is reported only when the detected confidence is above 50. Failures
 * are logged and reported as `{ changed: false }`; this function does not throw.
 *
 * @param page An already-navigated page; it is not navigated or closed here.
 * @param category Category to analyze.
 * @param expectedProvider Provider currently on record for this category.
 */
export async function detectCategoryProviderChange(
  page: Page,
  category: IntelligenceCategory,
  expectedProvider: MenuProvider
): Promise<{ changed: boolean; newProvider?: MenuProvider; confidence?: number }> {
  try {
    const signals = await collectPageSignals(page);
    const result = analyzeCategorySignals(category, { currentPage: signals });

    if (result.provider !== expectedProvider && result.confidence > CHANGE_MIN_CONFIDENCE) {
      logger.warn(
        'provider-detection',
        `Provider change detected for ${category}: ${expectedProvider} -> ${result.provider}`
      );
      return {
        changed: true,
        newProvider: result.provider,
        confidence: result.confidence,
      };
    }

    return { changed: false };
  } catch (error: unknown) {
    logger.error('provider-detection', `Change detection failed: ${errorMessage(error)}`);
    return { changed: false };
  }
}
// ========================================
// Database Operations
// ========================================

/**
 * Maps a category to the column-name prefix used in the `dispensaries` table.
 *
 * The prefix is interpolated into SQL text as an identifier, so it must only
 * ever be one of these four literals. Any unexpected runtime value falls back
 * to 'metadata'.
 */
function categoryColumnPrefix(category: IntelligenceCategory): string {
  switch (category) {
    case 'product':
    case 'specials':
    case 'brand':
      return category;
    default:
      return 'metadata';
  }
}

/**
 * Writes one category's detection result to the `dispensaries` row.
 *
 * Sets `<prefix>_provider`, `<prefix>_confidence`, `<prefix>_crawler_mode` and
 * `<prefix>_detection_data` (the JSON-serialized signals) and bumps `updated_at`.
 *
 * @param dispensaryId Value matched against `dispensaries.id`.
 * @param category Category whose columns are updated.
 * @param result Detection result to persist.
 */
export async function updateDispensaryCategoryProvider(
  dispensaryId: number,
  category: IntelligenceCategory,
  result: CategoryDetectionResult
): Promise<void> {
  const columnPrefix = categoryColumnPrefix(category);

  await pool.query(
    `UPDATE dispensaries SET
      ${columnPrefix}_provider = $1,
      ${columnPrefix}_confidence = $2,
      ${columnPrefix}_crawler_mode = $3,
      ${columnPrefix}_detection_data = $4,
      updated_at = NOW()
    WHERE id = $5`,
    [
      result.provider,
      result.confidence,
      result.mode,
      JSON.stringify(result.signals),
      dispensaryId,
    ]
  );
}

/**
 * Writes the detection results of all four categories to the `dispensaries`
 * row in a single UPDATE statement.
 *
 * @param dispensaryId Value matched against `dispensaries.id`.
 * @param result Combined detection result; `urlsTested` and `rawSignals` are not stored.
 */
export async function updateAllCategoryProviders(
  dispensaryId: number,
  result: MultiCategoryDetectionResult
): Promise<void> {
  // Order must match the $1..$16 placeholders below: product, specials, brand, metadata.
  const perCategory = [result.product, result.specials, result.brand, result.metadata];
  const values = perCategory.flatMap((detection) => [
    detection.provider,
    detection.confidence,
    detection.mode,
    JSON.stringify(detection.signals),
  ]);

  await pool.query(
    `UPDATE dispensaries SET
      product_provider = $1,
      product_confidence = $2,
      product_crawler_mode = $3,
      product_detection_data = $4,
      specials_provider = $5,
      specials_confidence = $6,
      specials_crawler_mode = $7,
      specials_detection_data = $8,
      brand_provider = $9,
      brand_confidence = $10,
      brand_crawler_mode = $11,
      brand_detection_data = $12,
      metadata_provider = $13,
      metadata_confidence = $14,
      metadata_crawler_mode = $15,
      metadata_detection_data = $16,
      updated_at = NOW()
    WHERE id = $17`,
    [...values, dispensaryId]
  );
}

/**
 * Forces one category of a dispensary into sandbox mode and merges
 * `sandbox_reason` / `sandbox_at` into that category's detection data.
 *
 * @param dispensaryId Value matched against `dispensaries.id`.
 * @param category Category whose crawler mode is changed.
 * @param reason Stored as `sandbox_reason` and written to the log.
 */
export async function moveCategoryToSandbox(
  dispensaryId: number,
  category: IntelligenceCategory,
  reason: string
): Promise<void> {
  const columnPrefix = categoryColumnPrefix(category);

  // COALESCE guards the merge: `NULL || jsonb` is NULL, which would silently
  // discard the sandbox reason for rows that have no detection data yet.
  // Assumes `<prefix>_detection_data` is a jsonb column — TODO confirm against schema.
  await pool.query(
    `UPDATE dispensaries SET
      ${columnPrefix}_crawler_mode = 'sandbox',
      ${columnPrefix}_detection_data = COALESCE(${columnPrefix}_detection_data, '{}'::jsonb) || $1::jsonb,
      updated_at = NOW()
    WHERE id = $2`,
    [
      JSON.stringify({ sandbox_reason: reason, sandbox_at: new Date().toISOString() }),
      dispensaryId,
    ]
  );

  logger.info('provider-detection', `Moved dispensary ${dispensaryId} ${category} to sandbox: ${reason}`);
}