"use strict"; /** * Store Crawl Orchestrator * * Orchestrates the complete crawl workflow for a store: * 1. Load store and its linked dispensary * 2. Check if provider detection is needed * 3. Run provider detection if needed * 4. Queue appropriate crawl jobs based on provider/mode * 5. Update store_crawl_schedule with meaningful status * * This replaces the simple "triggerManualCrawl" with intelligent orchestration. */ Object.defineProperty(exports, "__esModule", { value: true }); exports.runStoreCrawlOrchestrator = runStoreCrawlOrchestrator; exports.runBatchOrchestrator = runBatchOrchestrator; exports.getStoresDueForOrchestration = getStoresDueForOrchestration; const uuid_1 = require("uuid"); const migrate_1 = require("../db/migrate"); const crawler_logger_1 = require("./crawler-logger"); const intelligence_detector_1 = require("./intelligence-detector"); const category_crawler_jobs_1 = require("./category-crawler-jobs"); // DEPRECATED: scrapeStore writes to legacy products table // import { scrapeStore } from '../scraper-v2'; // Import the new dutchie-az pipeline for Dutchie crawling const product_crawler_1 = require("../dutchie-az/services/product-crawler"); const connection_1 = require("../dutchie-az/db/connection"); // ======================================== // Main Orchestrator Function // ======================================== /** * Run the complete crawl orchestration for a store * * Behavior: * 1. Load the store and its linked dispensary * 2. If no dispensary is linked, report error * 3. If product_provider is missing or stale (>7 days), run detection * 4. After detection: * - If product_provider = 'dutchie' and product_crawler_mode = 'production': Run production crawl * - Otherwise: Run sandbox crawl * 5. Update store_crawl_schedule with status/summary */ async function runStoreCrawlOrchestrator(storeId) { const startTime = Date.now(); const runId = (0, uuid_1.v4)(); let result = { status: 'pending', summary: '', runId, storeId, dispensaryId: null, detectionRan: false, crawlRan: false, durationMs: 0, }; try { // Mark schedule as running await updateScheduleStatus(storeId, 'running', 'Starting orchestrator...', runId); // 1. Load store with dispensary info const store = await getStoreWithDispensary(storeId); if (!store) { throw new Error(`Store ${storeId} not found`); } result.dispensaryId = store.dispensary_id; // 2. Check if dispensary is linked if (!store.dispensary_id) { result.status = 'error'; result.summary = 'No dispensary linked - cannot determine provider'; result.error = 'Store is not linked to a dispensary. Link it in the Dispensaries page.'; await updateScheduleStatus(storeId, 'error', result.summary, runId, result.error); result.durationMs = Date.now() - startTime; return result; } // 3. Check if provider detection is needed const needsDetection = await checkNeedsDetection(store); if (needsDetection) { // Run provider detection const websiteUrl = store.dispensary_menu_url || store.dispensary_website; if (!websiteUrl) { result.status = 'error'; result.summary = 'No website URL available for detection'; result.error = 'Dispensary has no menu_url or website configured'; await updateScheduleStatus(storeId, 'error', result.summary, runId, result.error); result.durationMs = Date.now() - startTime; return result; } await updateScheduleStatus(storeId, 'running', 'Running provider detection...', runId); const detectionResult = await (0, intelligence_detector_1.detectMultiCategoryProviders)(websiteUrl); result.detectionRan = true; result.detectionResult = detectionResult; // Save detection results to dispensary await (0, intelligence_detector_1.updateAllCategoryProviders)(store.dispensary_id, detectionResult); crawler_logger_1.crawlerLogger.providerDetected({ dispensary_id: store.dispensary_id, dispensary_name: store.dispensary_name || store.name, detected_provider: detectionResult.product.provider, confidence: detectionResult.product.confidence, detection_method: 'orchestrator_run', menu_url: websiteUrl, category: 'product', }); // Refresh store info after detection const updatedStore = await getStoreWithDispensary(storeId); if (updatedStore) { Object.assign(store, updatedStore); } } // 4. Determine crawl type and run const provider = store.product_provider; const mode = store.product_crawler_mode; if (provider === 'dutchie' && mode === 'production') { // Production Dutchie crawl - now uses the new dutchie-az GraphQL pipeline await updateScheduleStatus(storeId, 'running', 'Running Dutchie GraphQL crawl (dutchie-az)...', runId); try { // Look up the dispensary in the dutchie-az database // The dutchie-az pipeline has its own dispensaries table // We try multiple matching strategies: name, slug, or platform_dispensary_id const dispensaryResult = await (0, connection_1.query)(`SELECT * FROM dispensaries WHERE name ILIKE $1 OR slug ILIKE $2 LIMIT 1`, [store.dispensary_name, store.slug]); if (dispensaryResult.rows.length === 0) { throw new Error(`Dispensary not found in dutchie-az database. ` + `You must add this dispensary to the dutchie-az pipeline first. ` + `Store: ${store.name} (${store.dispensary_name})`); } const dutchieDispensary = dispensaryResult.rows[0]; // Run the new dutchie-az GraphQL crawler const crawlResult = await (0, product_crawler_1.crawlDispensaryProducts)(dutchieDispensary, 'rec', { useBothModes: true }); result.crawlRan = true; result.crawlType = 'production'; result.productsFound = crawlResult.productsFound ?? undefined; result.productsNew = crawlResult.productsUpserted ?? undefined; result.productsUpdated = crawlResult.snapshotsCreated ?? undefined; if (crawlResult.success) { const detectionPart = result.detectionRan ? 'Detection + ' : ''; result.summary = `${detectionPart}Dutchie GraphQL crawl (${crawlResult.productsFound || 0} items, ${crawlResult.productsUpserted || 0} upserted, ${crawlResult.snapshotsCreated || 0} snapshots)`; result.status = 'success'; // Update store's last_scraped_at await migrate_1.pool.query('UPDATE stores SET last_scraped_at = NOW() WHERE id = $1', [storeId]); crawler_logger_1.crawlerLogger.jobCompleted({ job_id: 0, // Orchestrator doesn't create traditional jobs store_id: storeId, store_name: store.name, duration_ms: crawlResult.durationMs, products_found: crawlResult.productsFound || 0, products_new: crawlResult.productsUpserted || 0, products_updated: crawlResult.snapshotsCreated || 0, provider: 'dutchie', }); } else { throw new Error(crawlResult.errorMessage || 'Crawl failed'); } } catch (crawlError) { result.status = 'error'; result.error = crawlError.message; result.summary = `Dutchie crawl failed: ${crawlError.message.slice(0, 100)}`; result.crawlRan = true; result.crawlType = 'production'; crawler_logger_1.crawlerLogger.jobFailed({ job_id: 0, store_id: storeId, store_name: store.name, duration_ms: Date.now() - startTime, error_message: crawlError.message, provider: 'dutchie', }); } } else if (provider && provider !== 'unknown') { // Sandbox crawl for non-Dutchie or sandbox mode await updateScheduleStatus(storeId, 'running', `Running ${provider} sandbox crawl...`, runId); try { const sandboxResult = await (0, category_crawler_jobs_1.runSandboxProductsJob)(store.dispensary_id); result.crawlRan = true; result.crawlType = 'sandbox'; result.productsFound = sandboxResult.data?.productsExtracted || 0; const detectionPart = result.detectionRan ? 'Detection + ' : ''; if (sandboxResult.success) { result.summary = `${detectionPart}${provider} sandbox crawl (${result.productsFound} items, quality ${sandboxResult.data?.qualityScore || 0}%)`; result.status = 'sandbox_only'; } else { result.summary = `${detectionPart}${provider} sandbox failed: ${sandboxResult.message}`; result.status = 'error'; result.error = sandboxResult.message; } } catch (sandboxError) { result.status = 'error'; result.error = sandboxError.message; result.summary = `Sandbox crawl failed: ${sandboxError.message.slice(0, 100)}`; result.crawlRan = true; result.crawlType = 'sandbox'; } } else { // No provider detected - detection only if (result.detectionRan) { result.summary = `Detection complete: provider=${store.product_provider || 'unknown'}, confidence=${store.product_confidence || 0}%`; result.status = 'detection_only'; } else { result.summary = 'No provider detected and no crawl possible'; result.status = 'error'; result.error = 'Could not determine menu provider'; } } } catch (error) { result.status = 'error'; result.error = error.message; result.summary = `Orchestrator error: ${error.message.slice(0, 100)}`; crawler_logger_1.crawlerLogger.queueFailure({ queue_type: 'orchestrator', error_message: error.message, }); } result.durationMs = Date.now() - startTime; // Update final schedule status await updateScheduleStatus(storeId, result.status, result.summary, runId, result.error); // Create a crawl_job record for tracking await createOrchestratorJobRecord(storeId, result); return result; } // ======================================== // Helper Functions // ======================================== async function getStoreWithDispensary(storeId) { const result = await migrate_1.pool.query(`SELECT s.id, s.name, s.slug, s.timezone, s.dispensary_id, d.name as dispensary_name, d.menu_url as dispensary_menu_url, d.website as dispensary_website, d.product_provider, d.product_confidence, d.product_crawler_mode, d.last_product_scan_at FROM stores s LEFT JOIN dispensaries d ON d.id = s.dispensary_id WHERE s.id = $1`, [storeId]); return result.rows[0] || null; } async function checkNeedsDetection(store) { // No dispensary = can't detect if (!store.dispensary_id) return false; // No provider = definitely needs detection if (!store.product_provider) return true; // Unknown provider = needs detection if (store.product_provider === 'unknown') return true; // Low confidence = needs re-detection if (store.product_confidence !== null && store.product_confidence < 50) return true; // Stale detection (> 7 days) = needs refresh if (store.last_product_scan_at) { const daysSince = (Date.now() - new Date(store.last_product_scan_at).getTime()) / (1000 * 60 * 60 * 24); if (daysSince > 7) return true; } return false; } async function updateScheduleStatus(storeId, status, summary, runId, error) { await migrate_1.pool.query(`INSERT INTO store_crawl_schedule (store_id, last_status, last_summary, last_run_at, last_error) VALUES ($1, $2, $3, NOW(), $4) ON CONFLICT (store_id) DO UPDATE SET last_status = $2, last_summary = $3, last_run_at = NOW(), last_error = $4, updated_at = NOW()`, [storeId, status, summary, error || null]); } async function getLatestCrawlStats(storeId) { // Get count of products for this store const result = await migrate_1.pool.query(`SELECT COUNT(*) as total, COUNT(*) FILTER (WHERE created_at > NOW() - INTERVAL '1 hour') as recent_new, COUNT(*) FILTER (WHERE updated_at > NOW() - INTERVAL '1 hour' AND created_at < NOW() - INTERVAL '1 hour') as recent_updated FROM products WHERE store_id = $1`, [storeId]); return { products_found: parseInt(result.rows[0]?.total || '0'), products_new: parseInt(result.rows[0]?.recent_new || '0'), products_updated: parseInt(result.rows[0]?.recent_updated || '0'), }; } async function createOrchestratorJobRecord(storeId, result) { await migrate_1.pool.query(`INSERT INTO crawl_jobs ( store_id, job_type, trigger_type, status, priority, scheduled_at, started_at, completed_at, products_found, products_new, products_updated, error_message, orchestrator_run_id, detection_result ) VALUES ( $1, 'orchestrator', 'manual', $2, 100, NOW(), NOW(), NOW(), $3, $4, $5, $6, $7, $8 )`, [ storeId, result.status === 'success' ? 'completed' : result.status === 'error' ? 'failed' : 'completed', result.productsFound || null, result.productsNew || null, result.productsUpdated || null, result.error || null, result.runId, result.detectionResult ? JSON.stringify({ product_provider: result.detectionResult.product.provider, product_confidence: result.detectionResult.product.confidence, product_mode: result.detectionResult.product.mode, }) : null, ]); } // ======================================== // Batch Orchestration // ======================================== /** * Run orchestrator for multiple stores */ async function runBatchOrchestrator(storeIds, concurrency = 3) { const results = []; // Process in batches for (let i = 0; i < storeIds.length; i += concurrency) { const batch = storeIds.slice(i, i + concurrency); const batchResults = await Promise.all(batch.map(storeId => runStoreCrawlOrchestrator(storeId))); results.push(...batchResults); } return results; } /** * Get stores that are due for orchestration */ async function getStoresDueForOrchestration(limit = 10) { const result = await migrate_1.pool.query(`SELECT s.id FROM stores s LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id WHERE s.active = TRUE AND s.scrape_enabled = TRUE AND COALESCE(scs.enabled, TRUE) = TRUE AND ( scs.last_run_at IS NULL OR scs.last_run_at < NOW() - (COALESCE(scs.interval_hours, 4) || ' hours')::INTERVAL ) AND (scs.last_status IS NULL OR scs.last_status NOT IN ('running', 'pending')) ORDER BY COALESCE(scs.priority, 0) DESC, scs.last_run_at ASC NULLS FIRST LIMIT $1`, [limit]); return result.rows.map(row => row.id); }