Files
cannaiq/backend/dist/services/store-crawl-orchestrator.js
Kelly 66e07b2009 fix(monitor): remove non-existent worker columns from job_run_logs query
The job_run_logs table tracks scheduled job orchestration, not individual
worker jobs. Worker info (worker_id, worker_hostname) belongs on
dispensary_crawl_jobs, not job_run_logs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-03 18:45:05 -07:00

352 lines
16 KiB
JavaScript

"use strict";
/**
* Store Crawl Orchestrator
*
* Orchestrates the complete crawl workflow for a store:
* 1. Load store and its linked dispensary
* 2. Check if provider detection is needed
* 3. Run provider detection if needed
* 4. Queue appropriate crawl jobs based on provider/mode
* 5. Update store_crawl_schedule with meaningful status
*
* This replaces the simple "triggerManualCrawl" with intelligent orchestration.
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.runStoreCrawlOrchestrator = runStoreCrawlOrchestrator;
exports.runBatchOrchestrator = runBatchOrchestrator;
exports.getStoresDueForOrchestration = getStoresDueForOrchestration;
const uuid_1 = require("uuid");
const migrate_1 = require("../db/migrate");
const crawler_logger_1 = require("./crawler-logger");
const intelligence_detector_1 = require("./intelligence-detector");
const category_crawler_jobs_1 = require("./category-crawler-jobs");
// DEPRECATED: scrapeStore writes to legacy products table
// import { scrapeStore } from '../scraper-v2';
// Import the new dutchie-az pipeline for Dutchie crawling
const product_crawler_1 = require("../dutchie-az/services/product-crawler");
const connection_1 = require("../dutchie-az/db/connection");
// ========================================
// Main Orchestrator Function
// ========================================
/**
* Run the complete crawl orchestration for a store
*
* Behavior:
* 1. Load the store and its linked dispensary
* 2. If no dispensary is linked, report error
* 3. If product_provider is missing or stale (>7 days), run detection
* 4. After detection:
* - If product_provider = 'dutchie' and product_crawler_mode = 'production': Run production crawl
* - Otherwise: Run sandbox crawl
* 5. Update store_crawl_schedule with status/summary
*/
async function runStoreCrawlOrchestrator(storeId) {
    const startTime = Date.now();
    // Unique id correlating this run across schedule updates and the final job record.
    const runId = (0, uuid_1.v4)();
    // Result skeleton; the stages below fill in status/summary/counters as they run.
    let result = {
        status: 'pending',
        summary: '',
        runId,
        storeId,
        dispensaryId: null,
        detectionRan: false,
        crawlRan: false,
        durationMs: 0,
    };
    try {
        // Mark schedule as running so observers see progress immediately
        await updateScheduleStatus(storeId, 'running', 'Starting orchestrator...', runId);
        // 1. Load store with dispensary info (LEFT JOIN: dispensary fields may be null)
        const store = await getStoreWithDispensary(storeId);
        if (!store) {
            throw new Error(`Store ${storeId} not found`);
        }
        result.dispensaryId = store.dispensary_id;
        // 2. Check if dispensary is linked — provider data lives on the dispensary row
        if (!store.dispensary_id) {
            result.status = 'error';
            result.summary = 'No dispensary linked - cannot determine provider';
            result.error = 'Store is not linked to a dispensary. Link it in the Dispensaries page.';
            await updateScheduleStatus(storeId, 'error', result.summary, runId, result.error);
            result.durationMs = Date.now() - startTime;
            return result;
        }
        // 3. Check if provider detection is needed (missing/unknown provider,
        //    confidence < 50, or last scan older than 7 days)
        const needsDetection = await checkNeedsDetection(store);
        if (needsDetection) {
            // Run provider detection; prefer the menu URL over the general website
            const websiteUrl = store.dispensary_menu_url || store.dispensary_website;
            if (!websiteUrl) {
                result.status = 'error';
                result.summary = 'No website URL available for detection';
                result.error = 'Dispensary has no menu_url or website configured';
                await updateScheduleStatus(storeId, 'error', result.summary, runId, result.error);
                result.durationMs = Date.now() - startTime;
                return result;
            }
            await updateScheduleStatus(storeId, 'running', 'Running provider detection...', runId);
            const detectionResult = await (0, intelligence_detector_1.detectMultiCategoryProviders)(websiteUrl);
            result.detectionRan = true;
            result.detectionResult = detectionResult;
            // Save detection results to dispensary
            await (0, intelligence_detector_1.updateAllCategoryProviders)(store.dispensary_id, detectionResult);
            crawler_logger_1.crawlerLogger.providerDetected({
                dispensary_id: store.dispensary_id,
                dispensary_name: store.dispensary_name || store.name,
                detected_provider: detectionResult.product.provider,
                confidence: detectionResult.product.confidence,
                detection_method: 'orchestrator_run',
                menu_url: websiteUrl,
                category: 'product',
            });
            // Refresh store info after detection so the branching below sees the
            // newly persisted provider/mode values
            const updatedStore = await getStoreWithDispensary(storeId);
            if (updatedStore) {
                Object.assign(store, updatedStore);
            }
        }
        // 4. Determine crawl type and run
        const provider = store.product_provider;
        const mode = store.product_crawler_mode;
        if (provider === 'dutchie' && mode === 'production') {
            // Production Dutchie crawl - now uses the new dutchie-az GraphQL pipeline
            await updateScheduleStatus(storeId, 'running', 'Running Dutchie GraphQL crawl (dutchie-az)...', runId);
            try {
                // Look up the dispensary in the dutchie-az database
                // The dutchie-az pipeline has its own dispensaries table
                // We try multiple matching strategies: name, slug, or platform_dispensary_id
                const dispensaryResult = await (0, connection_1.query)(`SELECT * FROM dispensaries
       WHERE name ILIKE $1
          OR slug ILIKE $2
       LIMIT 1`, [store.dispensary_name, store.slug]);
                if (dispensaryResult.rows.length === 0) {
                    throw new Error(`Dispensary not found in dutchie-az database. ` +
                        `You must add this dispensary to the dutchie-az pipeline first. ` +
                        `Store: ${store.name} (${store.dispensary_name})`);
                }
                const dutchieDispensary = dispensaryResult.rows[0];
                // Run the new dutchie-az GraphQL crawler ('rec' menu, both crawl modes)
                const crawlResult = await (0, product_crawler_1.crawlDispensaryProducts)(dutchieDispensary, 'rec', { useBothModes: true });
                result.crawlRan = true;
                result.crawlType = 'production';
                result.productsFound = crawlResult.productsFound ?? undefined;
                // NOTE(review): "new" is mapped from upserts and "updated" from snapshot
                // counts — confirm these are the intended semantics for these counters.
                result.productsNew = crawlResult.productsUpserted ?? undefined;
                result.productsUpdated = crawlResult.snapshotsCreated ?? undefined;
                if (crawlResult.success) {
                    const detectionPart = result.detectionRan ? 'Detection + ' : '';
                    result.summary = `${detectionPart}Dutchie GraphQL crawl (${crawlResult.productsFound || 0} items, ${crawlResult.productsUpserted || 0} upserted, ${crawlResult.snapshotsCreated || 0} snapshots)`;
                    result.status = 'success';
                    // Update store's last_scraped_at
                    await migrate_1.pool.query('UPDATE stores SET last_scraped_at = NOW() WHERE id = $1', [storeId]);
                    crawler_logger_1.crawlerLogger.jobCompleted({
                        job_id: 0, // Orchestrator doesn't create traditional jobs
                        store_id: storeId,
                        store_name: store.name,
                        duration_ms: crawlResult.durationMs,
                        products_found: crawlResult.productsFound || 0,
                        products_new: crawlResult.productsUpserted || 0,
                        products_updated: crawlResult.snapshotsCreated || 0,
                        provider: 'dutchie',
                    });
                }
                else {
                    throw new Error(crawlResult.errorMessage || 'Crawl failed');
                }
            }
            catch (crawlError) {
                // Crawl-level failure: record it on the result but let the
                // orchestrator finish so the schedule/job record still get written.
                result.status = 'error';
                result.error = crawlError.message;
                result.summary = `Dutchie crawl failed: ${crawlError.message.slice(0, 100)}`;
                result.crawlRan = true;
                result.crawlType = 'production';
                crawler_logger_1.crawlerLogger.jobFailed({
                    job_id: 0,
                    store_id: storeId,
                    store_name: store.name,
                    duration_ms: Date.now() - startTime,
                    error_message: crawlError.message,
                    provider: 'dutchie',
                });
            }
        }
        else if (provider && provider !== 'unknown') {
            // Sandbox crawl for non-Dutchie or sandbox mode
            await updateScheduleStatus(storeId, 'running', `Running ${provider} sandbox crawl...`, runId);
            try {
                const sandboxResult = await (0, category_crawler_jobs_1.runSandboxProductsJob)(store.dispensary_id);
                result.crawlRan = true;
                result.crawlType = 'sandbox';
                result.productsFound = sandboxResult.data?.productsExtracted || 0;
                const detectionPart = result.detectionRan ? 'Detection + ' : '';
                if (sandboxResult.success) {
                    result.summary = `${detectionPart}${provider} sandbox crawl (${result.productsFound} items, quality ${sandboxResult.data?.qualityScore || 0}%)`;
                    // 'sandbox_only' signals success but no production data was written
                    result.status = 'sandbox_only';
                }
                else {
                    result.summary = `${detectionPart}${provider} sandbox failed: ${sandboxResult.message}`;
                    result.status = 'error';
                    result.error = sandboxResult.message;
                }
            }
            catch (sandboxError) {
                result.status = 'error';
                result.error = sandboxError.message;
                result.summary = `Sandbox crawl failed: ${sandboxError.message.slice(0, 100)}`;
                result.crawlRan = true;
                result.crawlType = 'sandbox';
            }
        }
        else {
            // No provider detected - detection only
            if (result.detectionRan) {
                result.summary = `Detection complete: provider=${store.product_provider || 'unknown'}, confidence=${store.product_confidence || 0}%`;
                result.status = 'detection_only';
            }
            else {
                result.summary = 'No provider detected and no crawl possible';
                result.status = 'error';
                result.error = 'Could not determine menu provider';
            }
        }
    }
    catch (error) {
        // Orchestrator-level failure (store missing, DB error, etc.)
        result.status = 'error';
        result.error = error.message;
        result.summary = `Orchestrator error: ${error.message.slice(0, 100)}`;
        crawler_logger_1.crawlerLogger.queueFailure({
            queue_type: 'orchestrator',
            error_message: error.message,
        });
    }
    result.durationMs = Date.now() - startTime;
    // Update final schedule status
    await updateScheduleStatus(storeId, result.status, result.summary, runId, result.error);
    // Create a crawl_jobs record for tracking
    await createOrchestratorJobRecord(storeId, result);
    return result;
}
// ========================================
// Helper Functions
// ========================================
/**
 * Load a store row joined with its linked dispensary's provider fields.
 *
 * Uses a LEFT JOIN, so the dispensary_* columns are null when the store has
 * no linked dispensary.
 *
 * @param {number|string} storeId - Primary key of the store.
 * @returns {Promise<object|null>} Combined store+dispensary row, or null if not found.
 */
async function getStoreWithDispensary(storeId) {
    const res = await migrate_1.pool.query(`SELECT
       s.id, s.name, s.slug, s.timezone, s.dispensary_id,
       d.name as dispensary_name,
       d.menu_url as dispensary_menu_url,
       d.website as dispensary_website,
       d.product_provider,
       d.product_confidence,
       d.product_crawler_mode,
       d.last_product_scan_at
     FROM stores s
     LEFT JOIN dispensaries d ON d.id = s.dispensary_id
     WHERE s.id = $1`, [storeId]);
    const [row] = res.rows;
    return row ?? null;
}
/**
 * Decide whether provider detection should run for this store.
 *
 * Detection is needed when the provider is missing or 'unknown', when the
 * recorded confidence is below 50, or when the last product scan is more
 * than seven days old. Stores without a linked dispensary can never be
 * detected, so they always return false.
 *
 * @param {object} store - Row from getStoreWithDispensary().
 * @returns {Promise<boolean>} True when detection should be (re)run.
 */
async function checkNeedsDetection(store) {
    // No linked dispensary: nothing to detect against.
    if (!store.dispensary_id) {
        return false;
    }
    // Missing or explicitly unknown provider always warrants detection.
    const provider = store.product_provider;
    if (!provider || provider === 'unknown') {
        return true;
    }
    // Confidence below 50 triggers a re-scan (null means "not scored").
    const confidence = store.product_confidence;
    if (confidence !== null && confidence < 50) {
        return true;
    }
    // A scan older than seven days is considered stale.
    if (store.last_product_scan_at) {
        const ageMs = Date.now() - new Date(store.last_product_scan_at).getTime();
        const sevenDaysMs = 7 * 24 * 60 * 60 * 1000;
        if (ageMs > sevenDaysMs) {
            return true;
        }
    }
    return false;
}
/**
 * Upsert the store_crawl_schedule row with the latest run status.
 *
 * @param {number|string} storeId - Store whose schedule row to update.
 * @param {string} status - Status label (e.g. 'running', 'error', 'success').
 * @param {string} summary - Human-readable summary of the current stage.
 * @param {string} runId - Orchestrator run id; accepted for caller symmetry
 *   but not persisted by this query.
 * @param {string} [errorMessage] - Optional error text; stored as NULL when absent.
 */
async function updateScheduleStatus(storeId, status, summary, runId, errorMessage) {
    const values = [storeId, status, summary, errorMessage || null];
    await migrate_1.pool.query(`INSERT INTO store_crawl_schedule (store_id, last_status, last_summary, last_run_at, last_error)
     VALUES ($1, $2, $3, NOW(), $4)
     ON CONFLICT (store_id) DO UPDATE SET
       last_status = $2,
       last_summary = $3,
       last_run_at = NOW(),
       last_error = $4,
       updated_at = NOW()`, values);
}
/**
 * Summarize product counts for a store from the products table.
 *
 * Returns the total product count plus how many rows were created or updated
 * within the last hour (rows both created and updated in the last hour count
 * only as "new").
 *
 * Fix: the previous version called parseInt() without a radix; pg returns
 * COUNT(*) as a string, so we now parse with an explicit base 10.
 *
 * @param {number|string} storeId - Store whose products to count.
 * @returns {Promise<{products_found: number, products_new: number, products_updated: number}>}
 */
async function getLatestCrawlStats(storeId) {
    const result = await migrate_1.pool.query(`SELECT
       COUNT(*) as total,
       COUNT(*) FILTER (WHERE created_at > NOW() - INTERVAL '1 hour') as recent_new,
       COUNT(*) FILTER (WHERE updated_at > NOW() - INTERVAL '1 hour' AND created_at < NOW() - INTERVAL '1 hour') as recent_updated
     FROM products
     WHERE store_id = $1`, [storeId]);
    // An empty result set (no matching rows at all) degrades to zeros.
    const row = result.rows[0];
    return {
        products_found: Number.parseInt(row?.total ?? '0', 10),
        products_new: Number.parseInt(row?.recent_new ?? '0', 10),
        products_updated: Number.parseInt(row?.recent_updated ?? '0', 10),
    };
}
/**
 * Persist a crawl_jobs row recording the outcome of an orchestrator run.
 *
 * The orchestrator's richer statuses ('sandbox_only', 'detection_only', ...)
 * collapse to 'completed'; only 'error' becomes 'failed'.
 *
 * @param {number|string} storeId - Store the run was for.
 * @param {object} result - Orchestrator result populated by runStoreCrawlOrchestrator.
 */
async function createOrchestratorJobRecord(storeId, result) {
    // Map orchestrator status onto the crawl_jobs status vocabulary.
    const jobStatus = result.status === 'error' ? 'failed' : 'completed';
    // Serialize only the product-category portion of the detection result.
    let detectionJson = null;
    if (result.detectionResult) {
        detectionJson = JSON.stringify({
            product_provider: result.detectionResult.product.provider,
            product_confidence: result.detectionResult.product.confidence,
            product_mode: result.detectionResult.product.mode,
        });
    }
    const values = [
        storeId,
        jobStatus,
        result.productsFound || null,
        result.productsNew || null,
        result.productsUpdated || null,
        result.error || null,
        result.runId,
        detectionJson,
    ];
    await migrate_1.pool.query(`INSERT INTO crawl_jobs (
     store_id, job_type, trigger_type, status, priority,
     scheduled_at, started_at, completed_at,
     products_found, products_new, products_updated,
     error_message, orchestrator_run_id, detection_result
   ) VALUES (
     $1, 'orchestrator', 'manual', $2, 100,
     NOW(), NOW(), NOW(),
     $3, $4, $5,
     $6, $7, $8
   )`, values);
}
// ========================================
// Batch Orchestration
// ========================================
/**
 * Run the orchestrator for multiple stores, `concurrency` at a time.
 *
 * Stores are processed in fixed-size batches; each batch runs in parallel
 * and the next batch starts only after the previous one settles.
 *
 * @param {Array<number|string>} storeIds - Stores to orchestrate.
 * @param {number} [concurrency=3] - Maximum simultaneous orchestrator runs.
 * @returns {Promise<object[]>} Results in the same order as storeIds.
 */
async function runBatchOrchestrator(storeIds, concurrency = 3) {
    const allResults = [];
    let cursor = 0;
    while (cursor < storeIds.length) {
        const slice = storeIds.slice(cursor, cursor + concurrency);
        const settled = await Promise.all(slice.map((id) => runStoreCrawlOrchestrator(id)));
        allResults.push(...settled);
        cursor += concurrency;
    }
    return allResults;
}
/**
 * List store ids that are due for an orchestrator run.
 *
 * A store qualifies when it is active and scrape-enabled, its schedule row
 * is enabled (or missing), its last run is older than its interval (default
 * 4 hours) or never happened, and it is not currently running/pending.
 * Higher-priority stores come first, then the least-recently-run.
 *
 * @param {number} [limit=10] - Maximum number of store ids to return.
 * @returns {Promise<Array<number|string>>} Store ids due for orchestration.
 */
async function getStoresDueForOrchestration(limit = 10) {
    const res = await migrate_1.pool.query(`SELECT s.id
     FROM stores s
     LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
     WHERE s.active = TRUE
       AND s.scrape_enabled = TRUE
       AND COALESCE(scs.enabled, TRUE) = TRUE
       AND (
         scs.last_run_at IS NULL
         OR scs.last_run_at < NOW() - (COALESCE(scs.interval_hours, 4) || ' hours')::INTERVAL
       )
       AND (scs.last_status IS NULL OR scs.last_status NOT IN ('running', 'pending'))
     ORDER BY COALESCE(scs.priority, 0) DESC, scs.last_run_at ASC NULLS FIRST
     LIMIT $1`, [limit]);
    return res.rows.map(({ id }) => id);
}