"use strict"; /** * Category-Specific Crawler Jobs * * Handles crawl jobs for each intelligence category independently: * - CrawlProductsJob - Production product crawling (Dutchie only) * - CrawlSpecialsJob - Production specials crawling * - CrawlBrandIntelligenceJob - Production brand intelligence crawling * - CrawlMetadataJob - Production metadata crawling * - SandboxProductsJob - Sandbox product crawling (all providers) * - SandboxSpecialsJob - Sandbox specials crawling * - SandboxBrandJob - Sandbox brand crawling * - SandboxMetadataJob - Sandbox metadata crawling */ var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.runCrawlProductsJob = runCrawlProductsJob; exports.runCrawlSpecialsJob = runCrawlSpecialsJob; exports.runCrawlBrandIntelligenceJob = runCrawlBrandIntelligenceJob; exports.runCrawlMetadataJob = runCrawlMetadataJob; exports.runSandboxProductsJob = runSandboxProductsJob; exports.runSandboxSpecialsJob = runSandboxSpecialsJob; exports.runSandboxBrandJob = runSandboxBrandJob; exports.runSandboxMetadataJob = runSandboxMetadataJob; exports.processCategorySandboxJobs = processCategorySandboxJobs; exports.runAllCategoryProductionCrawls = runAllCategoryProductionCrawls; exports.runAllCategorySandboxCrawls = runAllCategorySandboxCrawls; const migrate_1 = require("../db/migrate"); const crawler_logger_1 = require("./crawler-logger"); // Note: scrapeStore from scraper-v2 is NOT used for Dutchie - we use GraphQL API directly const product_crawler_1 = require("../dutchie-az/services/product-crawler"); const puppeteer_1 = __importDefault(require("puppeteer")); const WORKER_ID = `crawler-${process.pid}-${Date.now()}`; // ======================================== // Helper Functions // ======================================== async function getDispensaryWithCategories(dispensaryId) { const result = await migrate_1.pool.query(`SELECT id, name, website, menu_url, menu_type, platform_dispensary_id, product_provider, product_confidence, product_crawler_mode, last_product_scan_at, specials_provider, specials_confidence, specials_crawler_mode, last_specials_scan_at, brand_provider, brand_confidence, brand_crawler_mode, last_brand_scan_at, metadata_provider, metadata_confidence, metadata_crawler_mode, last_metadata_scan_at, crawler_status, scraper_template FROM dispensaries WHERE id = $1`, [dispensaryId]); return result.rows[0] || null; } async function updateCategoryScanTime(dispensaryId, category) { const column = `last_${category}_scan_at`; await migrate_1.pool.query(`UPDATE dispensaries SET ${column} = NOW(), updated_at = NOW() WHERE id = $1`, [dispensaryId]); } async function getStoreIdForDispensary(dispensaryId) { // First check if dispensary has menu_url - if so, try to match with stores.dutchie_url const result = await migrate_1.pool.query(`SELECT s.id FROM stores s JOIN dispensaries d ON d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%' WHERE d.id = $1 LIMIT 1`, [dispensaryId]); if (result.rows.length > 0) { return result.rows[0].id; } // Try matching by slug const result2 = await migrate_1.pool.query(`SELECT s.id FROM stores s JOIN dispensaries d ON d.website ILIKE '%' || s.slug || '%' WHERE d.id = $1 LIMIT 1`, [dispensaryId]); return result2.rows[0]?.id || null; } async function createCategorySandboxEntry(dispensaryId, category, suspectedProvider, templateName, detectionSignals) { // Check for existing sandbox for this category const existing = await migrate_1.pool.query(`SELECT id FROM crawler_sandboxes WHERE dispensary_id = $1 AND category = $2 AND status NOT IN ('moved_to_production', 'failed')`, [dispensaryId, category]); if (existing.rows.length > 0) { await migrate_1.pool.query(`UPDATE crawler_sandboxes SET suspected_menu_provider = $2, template_name = $3, detection_signals = COALESCE($4, detection_signals), updated_at = NOW() WHERE id = $1`, [existing.rows[0].id, suspectedProvider, templateName, detectionSignals ? JSON.stringify(detectionSignals) : null]); return existing.rows[0].id; } const result = await migrate_1.pool.query(`INSERT INTO crawler_sandboxes (dispensary_id, category, suspected_menu_provider, template_name, mode, detection_signals, status) VALUES ($1, $2, $3, $4, 'template_learning', $5, 'pending') RETURNING id`, [dispensaryId, category, suspectedProvider, templateName, detectionSignals ? JSON.stringify(detectionSignals) : '{}']); return result.rows[0].id; } async function createCategorySandboxJob(dispensaryId, sandboxId, category, templateName, jobType = 'crawl', priority = 0) { const result = await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, sandbox_id, category, template_name, job_type, status, priority) VALUES ($1, $2, $3, $4, $5, 'pending', $6) RETURNING id`, [dispensaryId, sandboxId, category, templateName, jobType, priority]); return result.rows[0].id; } async function updateSandboxQuality(sandboxId, metrics) { await migrate_1.pool.query(`UPDATE crawler_sandboxes SET quality_score = $1, products_extracted = $2, fields_missing = $3, error_count = $4, analysis_json = COALESCE(analysis_json, '{}'::jsonb) || $5::jsonb, analyzed_at = NOW(), updated_at = NOW() WHERE id = $6`, [ metrics.quality_score, metrics.items_extracted, metrics.fields_missing, metrics.error_count, JSON.stringify({ sample_data: metrics.sample_data }), sandboxId, ]); } async function getCrawlerTemplate(provider, category, environment) { const result = await migrate_1.pool.query(`SELECT id, name, selector_config, navigation_config FROM crawler_templates WHERE provider = $1 AND environment = $2 AND is_active = true ORDER BY is_default_for_provider DESC, version DESC LIMIT 1`, [provider, environment]); return result.rows[0] || null; } // ======================================== // Production Crawl Jobs // ======================================== /** * CrawlProductsJob - Production product crawling * Uses Dutchie GraphQL API directly (NOT browser-based scraping) * * IMPORTANT: This function calls crawlDispensaryProducts() from dutchie-az * which uses the GraphQL API. The GraphQL response includes categories directly, * so no browser-based category discovery is needed. */ async function runCrawlProductsJob(dispensaryId) { const category = 'product'; const startTime = Date.now(); const dispensary = await getDispensaryWithCategories(dispensaryId); if (!dispensary) { return { success: false, category, message: `Dispensary ${dispensaryId} not found` }; } // Verify production eligibility - accept either: // 1. product_provider = 'dutchie' with product_crawler_mode = 'production', OR // 2. menu_type = 'dutchie' with platform_dispensary_id (known Dutchie store) const isDutchieProduction = (dispensary.product_provider === 'dutchie' && dispensary.product_crawler_mode === 'production') || (dispensary.menu_type === 'dutchie' && dispensary.platform_dispensary_id); if (!isDutchieProduction) { return { success: false, category, message: 'Not a Dutchie dispensary for products' }; } if (!dispensary.platform_dispensary_id) { return { success: false, category, message: 'Missing platform_dispensary_id for GraphQL crawl' }; } // Log job start crawler_logger_1.crawlerLogger.jobStarted({ job_id: 0, // Category jobs don't have traditional job IDs store_id: dispensaryId, // Use dispensary ID since we're not using stores table store_name: dispensary.name, job_type: 'CrawlProductsJob', trigger_type: 'category_crawl', provider: 'dutchie', }); try { // Build Dispensary object for GraphQL crawler // The crawler uses platformDispensaryId to call the Dutchie GraphQL API directly const dispensaryForCrawl = { id: dispensary.id, platform: 'dutchie', name: dispensary.name, slug: dispensary.name.toLowerCase().replace(/[^a-z0-9]+/g, '-'), city: '', state: 'AZ', menuType: dispensary.menu_type || 'dutchie', menuUrl: dispensary.menu_url || undefined, platformDispensaryId: dispensary.platform_dispensary_id || undefined, website: dispensary.website || undefined, createdAt: new Date(), updatedAt: new Date(), }; // Use GraphQL crawler directly - this calls the Dutchie API, not browser scraping const crawlResult = await (0, product_crawler_1.crawlDispensaryProducts)(dispensaryForCrawl, 'rec', // Default to recreational pricing { useBothModes: true, downloadImages: true }); // Update scan time await updateCategoryScanTime(dispensaryId, category); const durationMs = Date.now() - startTime; if (crawlResult.success) { // Log job completion with summary crawler_logger_1.crawlerLogger.jobCompleted({ job_id: 0, store_id: dispensaryId, store_name: dispensary.name, duration_ms: durationMs, products_found: crawlResult.productsFound, products_new: 0, // GraphQL crawler doesn't track new vs updated separately products_updated: crawlResult.productsUpserted, provider: 'dutchie', }); return { success: true, category, message: `GraphQL crawl completed: ${crawlResult.productsUpserted} products, ${crawlResult.snapshotsCreated} snapshots`, data: { dispensaryId, provider: 'dutchie', durationMs, productsFound: crawlResult.productsFound, productsUpserted: crawlResult.productsUpserted, snapshotsCreated: crawlResult.snapshotsCreated, modeAProducts: crawlResult.modeAProducts, modeBProducts: crawlResult.modeBProducts, }, }; } else { // Log job failure crawler_logger_1.crawlerLogger.jobFailed({ job_id: 0, store_id: dispensaryId, store_name: dispensary.name, duration_ms: durationMs, error_message: crawlResult.errorMessage || 'Unknown error', provider: 'dutchie', }); return { success: false, category, message: crawlResult.errorMessage || 'GraphQL crawl failed' }; } } catch (error) { const durationMs = Date.now() - startTime; // Log job failure crawler_logger_1.crawlerLogger.jobFailed({ job_id: 0, store_id: dispensaryId, store_name: dispensary.name, duration_ms: durationMs, error_message: error.message, provider: 'dutchie', }); return { success: false, category, message: error.message }; } } /** * CrawlSpecialsJob - Production specials crawling * Currently no production-ready providers, so always returns false */ async function runCrawlSpecialsJob(dispensaryId) { const category = 'specials'; const dispensary = await getDispensaryWithCategories(dispensaryId); if (!dispensary) { return { success: false, category, message: `Dispensary ${dispensaryId} not found` }; } // No production-ready providers for specials yet if (dispensary.specials_crawler_mode !== 'production') { return { success: false, category, message: 'Specials not in production mode' }; } // Would implement provider-specific specials crawling here // For now, no providers are production-ready return { success: false, category, message: `No production crawler for specials provider: ${dispensary.specials_provider}`, }; } /** * CrawlBrandIntelligenceJob - Production brand intelligence crawling * Currently no production-ready providers */ async function runCrawlBrandIntelligenceJob(dispensaryId) { const category = 'brand'; const dispensary = await getDispensaryWithCategories(dispensaryId); if (!dispensary) { return { success: false, category, message: `Dispensary ${dispensaryId} not found` }; } if (dispensary.brand_crawler_mode !== 'production') { return { success: false, category, message: 'Brand not in production mode' }; } return { success: false, category, message: `No production crawler for brand provider: ${dispensary.brand_provider}`, }; } /** * CrawlMetadataJob - Production metadata crawling * Currently no production-ready providers */ async function runCrawlMetadataJob(dispensaryId) { const category = 'metadata'; const dispensary = await getDispensaryWithCategories(dispensaryId); if (!dispensary) { return { success: false, category, message: `Dispensary ${dispensaryId} not found` }; } if (dispensary.metadata_crawler_mode !== 'production') { return { success: false, category, message: 'Metadata not in production mode' }; } return { success: false, category, message: `No production crawler for metadata provider: ${dispensary.metadata_provider}`, }; } // ======================================== // Sandbox Crawl Jobs // ======================================== /** * SandboxProductsJob - Sandbox product crawling * Works with any provider including Treez */ async function runSandboxProductsJob(dispensaryId, sandboxId) { const category = 'product'; const startTime = Date.now(); const dispensary = await getDispensaryWithCategories(dispensaryId); if (!dispensary) { return { success: false, category, message: `Dispensary ${dispensaryId} not found` }; } // Get or create sandbox entry let sandbox; if (sandboxId) { const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [sandboxId]); sandbox = result.rows[0]; } else { const result = await migrate_1.pool.query(`SELECT * FROM crawler_sandboxes WHERE dispensary_id = $1 AND category = $2 AND status NOT IN ('moved_to_production', 'failed') ORDER BY created_at DESC LIMIT 1`, [dispensaryId, category]); sandbox = result.rows[0]; if (!sandbox) { const newSandboxId = await createCategorySandboxEntry(dispensaryId, category, dispensary.product_provider, null); const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [newSandboxId]); sandbox = result.rows[0]; } } const websiteUrl = dispensary.menu_url || dispensary.website; if (!websiteUrl) { await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = 'No website URL' WHERE id = $1`, [sandbox.id]); return { success: false, category, message: 'No website URL available' }; } let browser = null; try { // Update status await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'analyzing', updated_at = NOW() WHERE id = $1`, [sandbox.id]); browser = await puppeteer_1.default.launch({ headless: true, args: ['--no-sandbox', '--disable-setuid-sandbox'], }); const page = await browser.newPage(); await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'); // Get provider-specific template if available const provider = dispensary.product_provider || 'unknown'; const template = await getCrawlerTemplate(provider, category, 'sandbox'); let products = []; let metrics = { quality_score: 0, items_extracted: 0, fields_missing: 0, error_count: 0, }; // Provider-specific extraction logic if (provider === 'treez' && template) { // Use Treez-specific extraction const treezResult = await extractTreezProducts(page, websiteUrl); products = treezResult.products; metrics = treezResult.metrics; } else { // Generic product extraction const genericResult = await extractGenericProducts(page, websiteUrl); products = genericResult.products; metrics = genericResult.metrics; } // Update sandbox with results metrics.sample_data = products.slice(0, 5); await updateSandboxQuality(sandbox.id, metrics); // Determine final status based on quality const status = metrics.quality_score >= 70 ? 'ready_for_review' : metrics.quality_score >= 40 ? 'needs_human_review' : 'pending'; await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = $1, urls_tested = $2, updated_at = NOW() WHERE id = $3`, [status, JSON.stringify([websiteUrl]), sandbox.id]); // Update scan time await updateCategoryScanTime(dispensaryId, category); // Log sandbox completion crawler_logger_1.crawlerLogger.sandboxEvent({ event: 'sandbox_completed', dispensary_id: dispensaryId, dispensary_name: dispensary.name, template_name: provider, category: 'product', quality_score: metrics.quality_score, products_extracted: products.length, fields_missing: metrics.fields_missing, provider: provider, }); return { success: true, category, message: `Sandbox crawl completed. ${products.length} products extracted, quality score ${metrics.quality_score}`, data: { sandboxId: sandbox.id, productsExtracted: products.length, qualityScore: metrics.quality_score, status, }, }; } catch (error) { // Log sandbox failure crawler_logger_1.crawlerLogger.sandboxEvent({ event: 'sandbox_failed', dispensary_id: dispensaryId, dispensary_name: dispensary.name, template_name: dispensary.product_provider || 'unknown', category: 'product', error_message: error.message, }); await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = $1, error_count = error_count + 1 WHERE id = $2`, [error.message, sandbox.id]); return { success: false, category, message: error.message }; } finally { if (browser) await browser.close(); } } /** * SandboxSpecialsJob - Sandbox specials crawling */ async function runSandboxSpecialsJob(dispensaryId, sandboxId) { const category = 'specials'; const dispensary = await getDispensaryWithCategories(dispensaryId); if (!dispensary) { return { success: false, category, message: `Dispensary ${dispensaryId} not found` }; } let sandbox; if (sandboxId) { const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [sandboxId]); sandbox = result.rows[0]; } else { const newSandboxId = await createCategorySandboxEntry(dispensaryId, category, dispensary.specials_provider, null); const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [newSandboxId]); sandbox = result.rows[0]; } const websiteUrl = dispensary.website; if (!websiteUrl) { return { success: false, category, message: 'No website URL available' }; } let browser = null; try { await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'analyzing', updated_at = NOW() WHERE id = $1`, [sandbox.id]); browser = await puppeteer_1.default.launch({ headless: true, args: ['--no-sandbox', '--disable-setuid-sandbox'], }); const page = await browser.newPage(); await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'); const result = await extractSpecials(page, websiteUrl); await updateSandboxQuality(sandbox.id, { ...result.metrics, sample_data: result.specials.slice(0, 5), }); const status = result.metrics.quality_score >= 70 ? 'ready_for_review' : result.metrics.quality_score >= 40 ? 'needs_human_review' : 'pending'; await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = $1, updated_at = NOW() WHERE id = $2`, [status, sandbox.id]); await updateCategoryScanTime(dispensaryId, category); return { success: true, category, message: `Sandbox specials crawl completed. ${result.specials.length} specials found.`, data: { sandboxId: sandbox.id, specialsCount: result.specials.length }, }; } catch (error) { await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = $1 WHERE id = $2`, [error.message, sandbox.id]); return { success: false, category, message: error.message }; } finally { if (browser) await browser.close(); } } /** * SandboxBrandJob - Sandbox brand intelligence crawling */ async function runSandboxBrandJob(dispensaryId, sandboxId) { const category = 'brand'; const dispensary = await getDispensaryWithCategories(dispensaryId); if (!dispensary) { return { success: false, category, message: `Dispensary ${dispensaryId} not found` }; } let sandbox; if (sandboxId) { const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [sandboxId]); sandbox = result.rows[0]; } else { const newSandboxId = await createCategorySandboxEntry(dispensaryId, category, dispensary.brand_provider, null); const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [newSandboxId]); sandbox = result.rows[0]; } const websiteUrl = dispensary.website; if (!websiteUrl) { return { success: false, category, message: 'No website URL available' }; } let browser = null; try { await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'analyzing', updated_at = NOW() WHERE id = $1`, [sandbox.id]); browser = await puppeteer_1.default.launch({ headless: true, args: ['--no-sandbox', '--disable-setuid-sandbox'], }); const page = await browser.newPage(); await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'); const result = await extractBrands(page, websiteUrl); await updateSandboxQuality(sandbox.id, { ...result.metrics, sample_data: result.brands.slice(0, 10), }); const status = result.metrics.quality_score >= 70 ? 'ready_for_review' : 'pending'; await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = $1, updated_at = NOW() WHERE id = $2`, [status, sandbox.id]); await updateCategoryScanTime(dispensaryId, category); return { success: true, category, message: `Sandbox brand crawl completed. ${result.brands.length} brands found.`, data: { sandboxId: sandbox.id, brandsCount: result.brands.length }, }; } catch (error) { await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = $1 WHERE id = $2`, [error.message, sandbox.id]); return { success: false, category, message: error.message }; } finally { if (browser) await browser.close(); } } /** * SandboxMetadataJob - Sandbox metadata crawling */ async function runSandboxMetadataJob(dispensaryId, sandboxId) { const category = 'metadata'; const dispensary = await getDispensaryWithCategories(dispensaryId); if (!dispensary) { return { success: false, category, message: `Dispensary ${dispensaryId} not found` }; } let sandbox; if (sandboxId) { const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [sandboxId]); sandbox = result.rows[0]; } else { const newSandboxId = await createCategorySandboxEntry(dispensaryId, category, dispensary.metadata_provider, null); const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [newSandboxId]); sandbox = result.rows[0]; } const websiteUrl = dispensary.website; if (!websiteUrl) { return { success: false, category, message: 'No website URL available' }; } let browser = null; try { await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'analyzing', updated_at = NOW() WHERE id = $1`, [sandbox.id]); browser = await puppeteer_1.default.launch({ headless: true, args: ['--no-sandbox', '--disable-setuid-sandbox'], }); const page = await browser.newPage(); await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'); const result = await extractMetadata(page, websiteUrl); await updateSandboxQuality(sandbox.id, { ...result.metrics, sample_data: result.categories.slice(0, 20), }); const status = result.metrics.quality_score >= 70 ? 'ready_for_review' : 'pending'; await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = $1, updated_at = NOW() WHERE id = $2`, [status, sandbox.id]); await updateCategoryScanTime(dispensaryId, category); return { success: true, category, message: `Sandbox metadata crawl completed. ${result.categories.length} categories found.`, data: { sandboxId: sandbox.id, categoriesCount: result.categories.length }, }; } catch (error) { await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = $1 WHERE id = $2`, [error.message, sandbox.id]); return { success: false, category, message: error.message }; } finally { if (browser) await browser.close(); } } // ======================================== // Extraction Functions // ======================================== /** * Extract products from Treez-powered sites */ async function extractTreezProducts(page, baseUrl) { const products = []; let errorCount = 0; let fieldsMissing = 0; try { // Navigate to menu const menuUrls = ['/menu', '/shop', '/products', '/order']; let menuUrl = baseUrl; for (const path of menuUrls) { try { const testUrl = new URL(path, baseUrl).toString(); await page.goto(testUrl, { waitUntil: 'networkidle2', timeout: 20000 }); const hasProducts = await page.evaluate(() => { const text = document.body.innerText.toLowerCase(); return text.includes('add to cart') || text.includes('thc') || text.includes('indica'); }); if (hasProducts) { menuUrl = testUrl; break; } } catch { // Try next URL } } await page.goto(menuUrl, { waitUntil: 'networkidle2', timeout: 30000 }); await new Promise(r => setTimeout(r, 3000)); // Wait for dynamic content // Look for Treez API data in network requests or page content const pageProducts = await page.evaluate(() => { const extractedProducts = []; // Try common Treez selectors const selectors = [ '.product-card', '.menu-item', '[data-product]', '.product-tile', '.menu-product', ]; for (const selector of selectors) { const elements = document.querySelectorAll(selector); if (elements.length > 3) { elements.forEach((el) => { const nameEl = el.querySelector('h2, h3, .product-name, .name, [class*="name"]'); const priceEl = el.querySelector('.price, [class*="price"]'); const thcEl = el.querySelector('[class*="thc"], [class*="potency"]'); if (nameEl) { extractedProducts.push({ name: nameEl.textContent?.trim(), price: priceEl?.textContent?.trim(), thc: thcEl?.textContent?.trim(), html: el.outerHTML.slice(0, 500), }); } }); break; } } return extractedProducts; }); products.push(...pageProducts); // Calculate quality metrics for (const product of products) { if (!product.name) fieldsMissing++; if (!product.price) fieldsMissing++; } } catch (error) { // Error tracked via errorCount - logged at job level errorCount++; } const qualityScore = products.length > 0 ? Math.min(100, Math.max(0, 100 - (fieldsMissing * 5) - (errorCount * 10))) : 0; return { products, metrics: { quality_score: qualityScore, items_extracted: products.length, fields_missing: fieldsMissing, error_count: errorCount, }, }; } /** * Extract products using generic selectors */ async function extractGenericProducts(page, baseUrl) { const products = []; let errorCount = 0; let fieldsMissing = 0; try { // Try common menu paths const menuPaths = ['/menu', '/shop', '/products', '/order']; let foundMenu = false; for (const path of menuPaths) { try { const fullUrl = new URL(path, baseUrl).toString(); await page.goto(fullUrl, { waitUntil: 'networkidle2', timeout: 20000 }); const hasProducts = await page.evaluate(() => { const text = document.body.innerText.toLowerCase(); return text.includes('add to cart') || text.includes('thc') || text.includes('gram'); }); if (hasProducts) { foundMenu = true; break; } } catch { continue; } } if (!foundMenu) { await page.goto(baseUrl, { waitUntil: 'networkidle2', timeout: 30000 }); } await new Promise(r => setTimeout(r, 2000)); // Generic product extraction const pageProducts = await page.evaluate(() => { const extractedProducts = []; const selectors = [ '.product', '.product-card', '.menu-item', '.item-card', '[data-product]', '.strain', '.listing', ]; for (const selector of selectors) { const elements = document.querySelectorAll(selector); if (elements.length > 3) { elements.forEach((el) => { const nameEl = el.querySelector('h2, h3, h4, .name, .title, [class*="name"]'); const priceEl = el.querySelector('.price, [class*="price"]'); const brandEl = el.querySelector('.brand, [class*="brand"]'); const categoryEl = el.querySelector('.category, [class*="category"], [class*="type"]'); if (nameEl?.textContent?.trim()) { extractedProducts.push({ name: nameEl.textContent.trim(), price: priceEl?.textContent?.trim(), brand: brandEl?.textContent?.trim(), category: categoryEl?.textContent?.trim(), }); } }); break; } } return extractedProducts; }); products.push(...pageProducts); // Calculate missing fields for (const product of products) { if (!product.name) fieldsMissing++; if (!product.price) fieldsMissing++; } } catch (error) { // Error tracked via errorCount - logged at job level errorCount++; } const qualityScore = products.length > 0 ? Math.min(100, Math.max(0, 80 - (fieldsMissing * 3) - (errorCount * 10))) : 0; return { products, metrics: { quality_score: qualityScore, items_extracted: products.length, fields_missing: fieldsMissing, error_count: errorCount, }, }; } /** * Extract specials/deals */ async function extractSpecials(page, baseUrl) { const specials = []; let errorCount = 0; let fieldsMissing = 0; try { const specialsPaths = ['/specials', '/deals', '/promotions', '/offers', '/sale']; for (const path of specialsPaths) { try { const fullUrl = new URL(path, baseUrl).toString(); await page.goto(fullUrl, { waitUntil: 'networkidle2', timeout: 20000 }); const pageSpecials = await page.evaluate(() => { const extracted = []; const selectors = [ '.special', '.deal', '.promotion', '.offer', '[class*="special"]', '[class*="deal"]', ]; for (const selector of selectors) { const elements = document.querySelectorAll(selector); elements.forEach((el) => { const titleEl = el.querySelector('h2, h3, h4, .title, .name'); const descEl = el.querySelector('p, .description, .details'); const discountEl = el.querySelector('.discount, .savings, [class*="percent"]'); if (titleEl?.textContent?.trim()) { extracted.push({ title: titleEl.textContent.trim(), description: descEl?.textContent?.trim(), discount: discountEl?.textContent?.trim(), }); } }); } return extracted; }); specials.push(...pageSpecials); if (specials.length > 0) break; } catch { continue; } } for (const special of specials) { if (!special.title) fieldsMissing++; if (!special.description && !special.discount) fieldsMissing++; } } catch (error) { // Error tracked via errorCount - logged at job level errorCount++; } const qualityScore = specials.length > 0 ? Math.min(100, Math.max(0, 70 - (fieldsMissing * 5) - (errorCount * 10))) : 0; return { specials, metrics: { quality_score: qualityScore, items_extracted: specials.length, fields_missing: fieldsMissing, error_count: errorCount, }, }; } /** * Extract brand information */ async function extractBrands(page, baseUrl) { const brands = []; let errorCount = 0; let fieldsMissing = 0; try { const brandPaths = ['/brands', '/vendors', '/producers', '/menu']; for (const path of brandPaths) { try { const fullUrl = new URL(path, baseUrl).toString(); await page.goto(fullUrl, { waitUntil: 'networkidle2', timeout: 20000 }); const pageBrands = await page.evaluate(() => { const extracted = []; const brandNames = new Set(); // Look for brand elements const selectors = [ '.brand', '[class*="brand"]', '.vendor', '.producer', ]; for (const selector of selectors) { document.querySelectorAll(selector).forEach((el) => { const name = el.textContent?.trim(); if (name && name.length > 1 && name.length < 100 && !brandNames.has(name)) { brandNames.add(name); extracted.push({ name }); } }); } // Also extract from filter dropdowns document.querySelectorAll('select option, [role="option"]').forEach((el) => { const name = el.textContent?.trim(); if (name && name.length > 1 && name.length < 100 && !brandNames.has(name)) { const lowerName = name.toLowerCase(); if (!['all', 'any', 'select', 'choose', '--'].some(skip => lowerName.includes(skip))) { brandNames.add(name); extracted.push({ name, source: 'filter' }); } } }); return extracted; }); brands.push(...pageBrands); if (brands.length > 5) break; } catch { continue; } } } catch (error) { // Error tracked via errorCount - logged at job level errorCount++; } const qualityScore = brands.length > 0 ? Math.min(100, Math.max(0, 60 + Math.min(30, brands.length * 2) - (errorCount * 10))) : 0; return { brands, metrics: { quality_score: qualityScore, items_extracted: brands.length, fields_missing: fieldsMissing, error_count: errorCount, }, }; } /** * Extract metadata (categories, taxonomy) */ async function extractMetadata(page, baseUrl) { const categories = []; let errorCount = 0; let fieldsMissing = 0; try { await page.goto(baseUrl, { waitUntil: 'networkidle2', timeout: 30000 }); const menuPaths = ['/menu', '/shop', '/products']; for (const path of menuPaths) { try { await page.goto(new URL(path, baseUrl).toString(), { waitUntil: 'networkidle2', timeout: 15000 }); break; } catch { continue; } } const pageCategories = await page.evaluate(() => { const extracted = []; const categoryNames = new Set(); // Navigation/tab categories const navSelectors = [ 'nav a', '.category-nav a', '.menu-categories a', '[class*="category"] a', '.tabs button', '.tab-list button', ]; for (const selector of navSelectors) { document.querySelectorAll(selector).forEach((el) => { const name = el.textContent?.trim(); if (name && name.length > 1 && name.length < 50 && !categoryNames.has(name)) { const lowerName = name.toLowerCase(); const categoryKeywords = ['flower', 'edible', 'concentrate', 'vape', 'preroll', 'tincture', 'topical', 'accessory', 'indica', 'sativa', 'hybrid']; if (categoryKeywords.some(kw => lowerName.includes(kw)) || el.closest('[class*="category"], [class*="menu"]')) { categoryNames.add(name); extracted.push({ name, type: 'navigation' }); } } }); } // Filter categories document.querySelectorAll('select, [role="listbox"]').forEach((select) => { const label = select.getAttribute('aria-label') || select.previousElementSibling?.textContent?.trim(); if (label?.toLowerCase().includes('category') || label?.toLowerCase().includes('type')) { select.querySelectorAll('option, [role="option"]').forEach((opt) => { const name = opt.textContent?.trim(); if (name && name.length > 1 && !categoryNames.has(name)) { const lowerName = name.toLowerCase(); if (!['all', 'any', 'select', 'choose'].some(skip => lowerName.includes(skip))) { categoryNames.add(name); extracted.push({ name, type: 'filter' }); } } }); } }); return extracted; }); categories.push(...pageCategories); } catch (error) { // Error tracked via errorCount - logged at job level errorCount++; } const qualityScore = categories.length > 0 ? Math.min(100, Math.max(0, 50 + Math.min(40, categories.length * 3) - (errorCount * 10))) : 0; return { categories, metrics: { quality_score: qualityScore, items_extracted: categories.length, fields_missing: fieldsMissing, error_count: errorCount, }, }; } // ======================================== // Queue Processing Functions // ======================================== /** * Process pending category-specific sandbox jobs */ async function processCategorySandboxJobs(category, limit = 5) { const jobs = await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs SET status = 'running', worker_id = $1, started_at = NOW() WHERE id IN ( SELECT id FROM sandbox_crawl_jobs WHERE status = 'pending' AND category = $2 AND scheduled_at <= NOW() ORDER BY priority DESC, scheduled_at ASC LIMIT $3 FOR UPDATE SKIP LOCKED ) RETURNING *`, [WORKER_ID, category, limit]); for (const job of jobs.rows) { try { let result; switch (category) { case 'product': result = await runSandboxProductsJob(job.dispensary_id, job.sandbox_id); break; case 'specials': result = await runSandboxSpecialsJob(job.dispensary_id, job.sandbox_id); break; case 'brand': result = await runSandboxBrandJob(job.dispensary_id, job.sandbox_id); break; case 'metadata': result = await runSandboxMetadataJob(job.dispensary_id, job.sandbox_id); break; default: result = { success: false, category, message: `Unknown category: ${category}` }; } await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs SET status = $1, completed_at = NOW(), result_summary = $2, error_message = $3 WHERE id = $4`, [ result.success ? 'completed' : 'failed', JSON.stringify(result.data || {}), result.success ? null : result.message, job.id, ]); } catch (error) { await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs SET status = 'failed', error_message = $1 WHERE id = $2`, [error.message, job.id]); } } } /** * Run all category production crawls for a dispensary * Each category runs independently - failures don't affect others */ async function runAllCategoryProductionCrawls(dispensaryId) { const results = []; // Run all categories in parallel - independent failures const [productResult, specialsResult, brandResult, metadataResult] = await Promise.allSettled([ runCrawlProductsJob(dispensaryId), runCrawlSpecialsJob(dispensaryId), runCrawlBrandIntelligenceJob(dispensaryId), runCrawlMetadataJob(dispensaryId), ]); if (productResult.status === 'fulfilled') results.push(productResult.value); else results.push({ success: false, category: 'product', message: productResult.reason?.message || 'Unknown error' }); if (specialsResult.status === 'fulfilled') results.push(specialsResult.value); else results.push({ success: false, category: 'specials', message: specialsResult.reason?.message || 'Unknown error' }); if (brandResult.status === 'fulfilled') results.push(brandResult.value); else results.push({ success: false, category: 'brand', message: brandResult.reason?.message || 'Unknown error' }); if (metadataResult.status === 'fulfilled') results.push(metadataResult.value); else results.push({ success: false, category: 'metadata', message: metadataResult.reason?.message || 'Unknown error' }); const successCount = results.filter(r => r.success).length; const summary = `${successCount}/4 categories succeeded: ${results.map(r => `${r.category}:${r.success ? 'ok' : 'fail'}`).join(', ')}`; // Individual category jobs log their own completion via crawlerLogger return { results, summary }; } /** * Run all category sandbox crawls for a dispensary */ async function runAllCategorySandboxCrawls(dispensaryId) { const results = []; const [productResult, specialsResult, brandResult, metadataResult] = await Promise.allSettled([ runSandboxProductsJob(dispensaryId), runSandboxSpecialsJob(dispensaryId), runSandboxBrandJob(dispensaryId), runSandboxMetadataJob(dispensaryId), ]); if (productResult.status === 'fulfilled') results.push(productResult.value); else results.push({ success: false, category: 'product', message: productResult.reason?.message || 'Unknown error' }); if (specialsResult.status === 'fulfilled') results.push(specialsResult.value); else results.push({ success: false, category: 'specials', message: specialsResult.reason?.message || 'Unknown error' }); if (brandResult.status === 'fulfilled') results.push(brandResult.value); else results.push({ success: false, category: 'brand', message: brandResult.reason?.message || 'Unknown error' }); if (metadataResult.status === 'fulfilled') results.push(metadataResult.value); else results.push({ success: false, category: 'metadata', message: metadataResult.reason?.message || 'Unknown error' }); const successCount = results.filter(r => r.success).length; const summary = `${successCount}/4 sandbox crawls: ${results.map(r => `${r.category}:${r.success ? 'ok' : 'fail'}`).join(', ')}`; // Individual sandbox jobs log their own completion via crawlerLogger return { results, summary }; }