"use strict"; /** * Multi-Category Intelligence Detector * * Detects providers for each intelligence category independently: * - Products: Which provider serves product data * - Specials: Which provider serves deals/specials * - Brand: Which provider serves brand information * - Metadata: Which provider serves taxonomy/category data */ var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.detectMultiCategoryProviders = detectMultiCategoryProviders; exports.detectCategoryProviderChange = detectCategoryProviderChange; exports.updateDispensaryCategoryProvider = updateDispensaryCategoryProvider; exports.updateAllCategoryProviders = updateAllCategoryProviders; exports.moveCategoryToSandbox = moveCategoryToSandbox; const migrate_1 = require("../db/migrate"); const logger_1 = require("./logger"); const puppeteer_1 = __importDefault(require("puppeteer")); // Production-ready providers per category // Only these combinations can be set to production mode const PRODUCTION_READY = { product: ['dutchie'], // Only Dutchie products are production-ready specials: [], // None yet brand: [], // None yet metadata: [], // None yet }; // Provider detection patterns const PROVIDER_PATTERNS = { dutchie: { scripts: [ /dutchie\.com/i, /dutchie-plus/i, /dutchie\.js/i, /__DUTCHIE__/i, /dutchie-embed/i, ], iframes: [ /dutchie\.com/i, /dutchie-plus\.com/i, /embed\.dutchie/i, ], html: [ /class="dutchie/i, /id="dutchie/i, /data-dutchie/i, /"menuType":\s*"dutchie"/i, ], apiEndpoints: [ /dutchie\.com\/graphql/i, /plus\.dutchie\.com/i, ], metaTags: [ /dutchie/i, ], }, treez: { scripts: [ /treez\.io/i, /treez-ecommerce/i, /treez\.js/i, ], iframes: [ /treez\.io/i, /shop\.treez/i, ], html: [ /class="treez/i, /data-treez/i, /treez-menu/i, ], apiEndpoints: [ /api\.treez\.io/i, /treez\.io\/api/i, ], metaTags: [], }, jane: { scripts: [ /jane\.co/i, 
/iheartjane\.com/i, /jane-frame/i, /jane\.js/i, ], iframes: [ /jane\.co/i, /iheartjane\.com/i, /embed\.iheartjane/i, ], html: [ /class="jane/i, /data-jane/i, /jane-embed/i, ], apiEndpoints: [ /api\.iheartjane/i, /jane\.co\/api/i, ], metaTags: [], }, weedmaps: { scripts: [ /weedmaps\.com/i, /wm-menu/i, ], iframes: [ /weedmaps\.com/i, /menu\.weedmaps/i, ], html: [ /data-weedmaps/i, /wm-menu/i, ], apiEndpoints: [ /api-g\.weedmaps/i, /weedmaps\.com\/api/i, ], metaTags: [], }, leafly: { scripts: [ /leafly\.com/i, /leafly-menu/i, ], iframes: [ /leafly\.com/i, /order\.leafly/i, ], html: [ /data-leafly/i, /leafly-embed/i, ], apiEndpoints: [ /api\.leafly/i, ], metaTags: [], }, }; // Category-specific detection signals const CATEGORY_SIGNALS = { product: { urlPatterns: [/\/menu/i, /\/products/i, /\/shop/i, /\/order/i], htmlPatterns: [/product-card/i, /menu-item/i, /product-list/i, /product-grid/i], jsonKeys: ['products', 'menuItems', 'items', 'inventory'], }, specials: { urlPatterns: [/\/specials/i, /\/deals/i, /\/promotions/i, /\/offers/i], htmlPatterns: [/special/i, /deal/i, /promotion/i, /discount/i, /sale/i], jsonKeys: ['specials', 'deals', 'promotions', 'offers'], }, brand: { urlPatterns: [/\/brands/i, /\/vendors/i, /\/producers/i], htmlPatterns: [/brand-list/i, /vendor/i, /producer/i, /manufacturer/i], jsonKeys: ['brands', 'vendors', 'producers', 'manufacturers'], }, metadata: { urlPatterns: [/\/categories/i, /\/taxonomy/i], htmlPatterns: [/category-nav/i, /menu-categories/i, /filter-category/i], jsonKeys: ['categories', 'taxonomy', 'filters', 'types'], }, }; // ======================================== // Main Detection Function // ======================================== async function detectMultiCategoryProviders(websiteUrl, options = {}) { const { timeout = 30000, headless = true, existingBrowser } = options; let browser = null; let page = null; const urlsTested = []; const rawSignals = {}; try { browser = existingBrowser || await puppeteer_1.default.launch({ 
headless, args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'], }); page = await browser.newPage(); await page.setViewport({ width: 1920, height: 1080 }); await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'); // Navigate to main site const baseUrl = normalizeUrl(websiteUrl); urlsTested.push(baseUrl); await page.goto(baseUrl, { waitUntil: 'networkidle2', timeout }); // Collect signals from main page const mainPageSignals = await collectPageSignals(page); rawSignals.mainPage = mainPageSignals; // Try common menu URLs const menuUrls = ['/menu', '/shop', '/products', '/order', '/specials', '/deals', '/brands']; for (const path of menuUrls) { try { const fullUrl = new URL(path, baseUrl).toString(); urlsTested.push(fullUrl); await page.goto(fullUrl, { waitUntil: 'networkidle2', timeout: 15000 }); const signals = await collectPageSignals(page); rawSignals[path] = signals; } catch { // URL doesn't exist or timed out } } // Analyze signals for each category const result = { product: analyzeCategorySignals('product', rawSignals), specials: analyzeCategorySignals('specials', rawSignals), brand: analyzeCategorySignals('brand', rawSignals), metadata: analyzeCategorySignals('metadata', rawSignals), urlsTested, rawSignals, }; logger_1.logger.info('provider-detection', `Multi-category detection complete for ${websiteUrl}`); return result; } catch (error) { logger_1.logger.error('provider-detection', `Detection failed for ${websiteUrl}: ${error.message}`); // Return unknown results for all categories return { product: createUnknownResult(), specials: createUnknownResult(), brand: createUnknownResult(), metadata: createUnknownResult(), urlsTested, rawSignals: { error: error.message }, }; } finally { if (page) await page.close().catch(() => { }); if (browser && !existingBrowser) await browser.close().catch(() => { }); } } // ======================================== // Helper Functions // 
/**
 * Ensures a site address carries an http(s) scheme and has no trailing slash.
 *
 * The scheme check is anchored and case-insensitive, so hosts that merely
 * begin with the letters "http" (e.g. `httpbin.org`) still get `https://`
 * prepended, and `HTTP://…` is left alone.
 *
 * @param {string} url - Address with or without a scheme.
 * @returns {string} Address with a scheme and at most one trailing slash removed.
 */
function normalizeUrl(url) {
    const trimmed = url.trim();
    const withScheme = /^https?:\/\//i.test(trimmed) ? trimmed : `https://${trimmed}`;
    return withScheme.replace(/\/$/, '');
}

/**
 * Extracts provider-detection signals from the page's current document.
 *
 * @param {object} page - Object exposing `evaluate(fn)` (a Puppeteer page).
 * @returns {Promise<object>} `{ scripts, iframes, links, metaTags,
 *   bodyClasses, bodyId, htmlSnippet, jsonBlocks }`.
 */
async function collectPageSignals(page) {
    // The callback runs inside the page, so it must be self-contained and
    // cannot reference anything from this module's scope.
    return page.evaluate(() => {
        const signals = {
            scripts: [],
            iframes: [],
            links: [],
            metaTags: [],
            bodyClasses: document.body?.className || '',
            bodyId: document.body?.id || '',
            // Only the first 10000 characters of markup are kept.
            htmlSnippet: document.documentElement.outerHTML.slice(0, 10000),
        };
        // Collect script sources
        document.querySelectorAll('script[src]').forEach((el) => {
            signals.scripts.push(el.src);
        });
        // Collect inline scripts: only those under 5000 characters, and only
        // their first 500 characters, tagged with an "inline:" prefix.
        document.querySelectorAll('script:not([src])').forEach((el) => {
            const content = el.textContent || '';
            if (content.length < 5000) {
                signals.scripts.push(`inline:${content.slice(0, 500)}`);
            }
        });
        // Collect iframes
        document.querySelectorAll('iframe').forEach((el) => {
            signals.iframes.push(el.src);
        });
        // Collect links
        document.querySelectorAll('a[href]').forEach((el) => {
            signals.links.push(el.href);
        });
        // Collect meta tags as "name:content" (falling back to `property`)
        document.querySelectorAll('meta').forEach((el) => {
            const content = el.getAttribute('content') || '';
            const name = el.getAttribute('name') || el.getAttribute('property') || '';
            if (content || name) {
                signals.metaTags.push(`${name}:${content}`);
            }
        });
        // Look for JSON data (first 2000 characters of each block)
        const jsonBlocks = [];
        document.querySelectorAll('script[type="application/json"]').forEach((el) => {
            jsonBlocks.push(el.textContent?.slice(0, 2000) || '');
        });
        signals.jsonBlocks = jsonBlocks;
        return signals;
    });
}

/**
 * Scores every known provider against the collected page signals and picks
 * the best match for one category.
 *
 * Scoring per page and provider: +20 per script/pattern match, +25 per
 * iframe/pattern match, +15 per HTML pattern found in the snippet. Scores
 * are summed across pages and capped at 100 to form the confidence. Ties
 * keep the provider that appears first in `PROVIDER_PATTERNS`.
 *
 * Category-specific HTML patterns and JSON keys are recorded in the returned
 * `signals` map but do not affect the score.
 *
 * @param {string} category - Key of `CATEGORY_SIGNALS` / `PRODUCTION_READY`.
 * @param {object} allSignals - Map of page label/path to the object produced
 *   by `collectPageSignals`; non-object entries are skipped.
 * @returns {{provider: string, confidence: number, mode: string,
 *   signals: object, templateName: (string|undefined)}}
 */
function analyzeCategorySignals(category, allSignals) {
    const providerScores = {};
    const detectedSignals = {};
    // Initialize scores
    for (const provider of Object.keys(PROVIDER_PATTERNS)) {
        providerScores[provider] = 0;
    }
    const categorySignals = CATEGORY_SIGNALS[category];

    // Analyze each page's signals
    for (const [pagePath, signals] of Object.entries(allSignals)) {
        if (!signals || typeof signals !== 'object') {
            continue;
        }
        // Check for provider-specific patterns
        for (const [provider, patterns] of Object.entries(PROVIDER_PATTERNS)) {
            let score = 0;
            for (const script of signals.scripts || []) {
                for (const pattern of patterns.scripts) {
                    if (pattern.test(script)) {
                        score += 20;
                        detectedSignals[`${provider}_script_${pagePath}`] = script;
                    }
                }
            }
            for (const iframe of signals.iframes || []) {
                for (const pattern of patterns.iframes) {
                    if (pattern.test(iframe)) {
                        score += 25;
                        detectedSignals[`${provider}_iframe_${pagePath}`] = iframe;
                    }
                }
            }
            if (signals.htmlSnippet) {
                for (const pattern of patterns.html) {
                    if (pattern.test(signals.htmlSnippet)) {
                        score += 15;
                        detectedSignals[`${provider}_html_${pagePath}`] = true;
                    }
                }
            }
            providerScores[provider] += score;
        }

        // Check for category-specific signals on relevant pages
        const isRelevantPage = categorySignals.urlPatterns.some((p) => p.test(pagePath));
        if (isRelevantPage && signals.htmlSnippet) {
            for (const pattern of categorySignals.htmlPatterns) {
                if (pattern.test(signals.htmlSnippet)) {
                    detectedSignals[`${category}_html_pattern`] = true;
                }
            }
        }

        // Check JSON blocks for category data. Both sides are lowercased so
        // mixed-case keys such as "menuItems" can actually match.
        for (const json of signals.jsonBlocks || []) {
            const haystack = json.toLowerCase();
            for (const key of categorySignals.jsonKeys) {
                if (haystack.includes(`"${key.toLowerCase()}"`)) {
                    detectedSignals[`${category}_json_key_${key}`] = true;
                }
            }
        }
    }

    // Determine winning provider (strict ">" keeps the first on ties)
    let bestProvider = 'unknown';
    let bestScore = 0;
    for (const [provider, score] of Object.entries(providerScores)) {
        if (score > bestScore) {
            bestScore = score;
            bestProvider = provider;
        }
    }

    // Calculate confidence (0-100)
    const confidence = Math.min(100, bestScore);

    // Production mode needs both a production-ready provider for this
    // category and a confidence of at least 70.
    const isProductionReady = PRODUCTION_READY[category].includes(bestProvider);
    const mode = isProductionReady && confidence >= 70 ? 'production' : 'sandbox';

    // Get template name if available
    let templateName;
    if (bestProvider === 'dutchie' && category === 'product') {
        templateName = 'dutchie_standard';
    }
    else if (bestProvider === 'treez') {
        // NOTE(review): assigned for every category, not only 'product',
        // despite the template's name — confirm this is intended.
        templateName = 'treez_products_v0';
    }

    return {
        provider: bestProvider,
        confidence,
        mode,
        signals: detectedSignals,
        templateName,
    };
}

/**
 * Builds the placeholder result used when detection could not run.
 *
 * @returns {{provider: string, confidence: number, mode: string, signals: object}}
 */
function createUnknownResult() {
    return {
        provider: 'unknown',
        confidence: 0,
        mode: 'sandbox',
        signals: {},
    };
}

// ========================================
// Lightweight Per-Category Change Detection
// ========================================

/**
 * Checks whether the page currently loaded in `page` points at a different
 * provider than expected for one category.
 *
 * A change is reported only when the detected provider differs from
 * `expectedProvider` and the detection confidence is above 50. Failures are
 * logged and reported as "no change".
 *
 * @param {object} page - Page to inspect (see `collectPageSignals`).
 * @param {string} category - Category to analyze.
 * @param {string} expectedProvider - Provider currently on record.
 * @returns {Promise<{changed: boolean, newProvider?: string, confidence?: number}>}
 */
async function detectCategoryProviderChange(page, category, expectedProvider) {
    try {
        const signals = await collectPageSignals(page);
        const result = analyzeCategorySignals(category, { currentPage: signals });
        const hasChanged = result.provider !== expectedProvider && result.confidence > 50;
        if (!hasChanged) {
            return { changed: false };
        }
        logger_1.logger.warn('provider-detection', `Provider change detected for ${category}: ${expectedProvider} -> ${result.provider}`);
        return {
            changed: true,
            newProvider: result.provider,
            confidence: result.confidence,
        };
    }
    catch (error) {
        const message = error?.message ?? String(error);
        logger_1.logger.error('provider-detection', `Change detection failed: ${message}`);
        return { changed: false };
    }
}

// ========================================
// Database Operations
// ========================================

/**
 * Maps a category to the column-name prefix used in `dispensaries`.
 *
 * The result is always one of four fixed strings, so it is safe to
 * interpolate into SQL. Any unrecognized category falls back to 'metadata'.
 *
 * @param {string} category
 * @returns {string} 'product', 'specials', 'brand' or 'metadata'.
 */
function categoryColumnPrefix(category) {
    switch (category) {
        case 'product':
        case 'specials':
        case 'brand':
            return category;
        default:
            return 'metadata';
    }
}

/**
 * Persists one category's detection result for a dispensary.
 *
 * @param {*} dispensaryId - Value matched against `dispensaries.id`.
 * @param {string} category - Category whose columns are written.
 * @param {{provider: string, confidence: number, mode: string, signals: object}} result
 * @returns {Promise<void>}
 */
async function updateDispensaryCategoryProvider(dispensaryId, category, result) {
    const columnPrefix = categoryColumnPrefix(category);
    await migrate_1.pool.query(`UPDATE dispensaries SET
      ${columnPrefix}_provider = $1,
      ${columnPrefix}_confidence = $2,
      ${columnPrefix}_crawler_mode = $3,
      ${columnPrefix}_detection_data = $4,
      updated_at = NOW()
    WHERE id = $5`, [
        result.provider,
        result.confidence,
        result.mode,
        JSON.stringify(result.signals),
        dispensaryId,
    ]);
}

/**
 * Persists the detection results of all four categories in one UPDATE.
 *
 * @param {*} dispensaryId - Value matched against `dispensaries.id`.
 * @param {object} result - Object with `product`, `specials`, `brand` and
 *   `metadata` entries, each shaped like an `analyzeCategorySignals` result.
 * @returns {Promise<void>}
 */
async function updateAllCategoryProviders(dispensaryId, result) {
    const assignments = [];
    const params = [];
    // Placeholders are numbered from the running parameter count so the SQL
    // text and the parameter array cannot drift apart.
    for (const category of ['product', 'specials', 'brand', 'metadata']) {
        const { provider, confidence, mode, signals } = result[category];
        const base = params.length;
        params.push(provider, confidence, mode, JSON.stringify(signals));
        assignments.push(
            `${category}_provider = $${base + 1}`,
            `${category}_confidence = $${base + 2}`,
            `${category}_crawler_mode = $${base + 3}`,
            `${category}_detection_data = $${base + 4}`,
        );
    }
    params.push(dispensaryId);
    await migrate_1.pool.query(
        `UPDATE dispensaries SET ${assignments.join(', ')}, updated_at = NOW() WHERE id = $${params.length}`,
        params,
    );
}

/**
 * Forces one category of a dispensary back to sandbox mode and records why.
 *
 * The reason and an ISO timestamp are merged into the category's
 * detection-data JSON.
 *
 * @param {*} dispensaryId - Value matched against `dispensaries.id`.
 * @param {string} category - Category to demote.
 * @param {string} reason - Explanation stored as `sandbox_reason`.
 * @returns {Promise<void>}
 */
async function moveCategoryToSandbox(dispensaryId, category, reason) {
    const columnPrefix = categoryColumnPrefix(category);
    // COALESCE: concatenating onto a NULL column would yield NULL and lose
    // the sandbox reason.
    await migrate_1.pool.query(`UPDATE dispensaries SET
      ${columnPrefix}_crawler_mode = 'sandbox',
      ${columnPrefix}_detection_data = COALESCE(${columnPrefix}_detection_data, '{}'::jsonb) || $1::jsonb,
      updated_at = NOW()
    WHERE id = $2`, [
        JSON.stringify({ sandbox_reason: reason, sandbox_at: new Date().toISOString() }),
        dispensaryId,
    ]);
    logger_1.logger.info('provider-detection', `Moved dispensary ${dispensaryId} ${category} to sandbox: ${reason}`);
}