// NOTE (from commit message): The job_run_logs table tracks scheduled job
// orchestration, not individual worker jobs. Worker info (worker_id,
// worker_hostname) belongs on dispensary_crawl_jobs, not job_run_logs.
"use strict";
/**
 * Store Crawl Orchestrator
 *
 * Orchestrates the complete crawl workflow for a store:
 * 1. Load store and its linked dispensary
 * 2. Check if provider detection is needed
 * 3. Run provider detection if needed
 * 4. Queue appropriate crawl jobs based on provider/mode
 * 5. Update store_crawl_schedule with meaningful status
 *
 * This replaces the simple "triggerManualCrawl" with intelligent orchestration.
 */
// CommonJS module wiring (compiled TypeScript output): mark as an ES-module
// interop namespace and export the three public entry points.
Object.defineProperty(exports, "__esModule", { value: true });
exports.runStoreCrawlOrchestrator = runStoreCrawlOrchestrator;
exports.runBatchOrchestrator = runBatchOrchestrator;
exports.getStoresDueForOrchestration = getStoresDueForOrchestration;
const uuid_1 = require("uuid");
const migrate_1 = require("../db/migrate");
const crawler_logger_1 = require("./crawler-logger");
const intelligence_detector_1 = require("./intelligence-detector");
const category_crawler_jobs_1 = require("./category-crawler-jobs");
// DEPRECATED: scrapeStore writes to legacy products table
// import { scrapeStore } from '../scraper-v2';
// Import the new dutchie-az pipeline for Dutchie crawling
const product_crawler_1 = require("../dutchie-az/services/product-crawler");
const connection_1 = require("../dutchie-az/db/connection");
// ========================================
// Main Orchestrator Function
// ========================================
|
/**
 * Run the complete crawl orchestration for a store.
 *
 * Behavior:
 * 1. Load the store and its linked dispensary
 * 2. If no dispensary is linked, report error
 * 3. If product_provider is missing or stale (>7 days), run detection
 * 4. After detection:
 *    - If product_provider = 'dutchie' and product_crawler_mode = 'production': Run production crawl
 *    - Otherwise: Run sandbox crawl
 * 5. Update store_crawl_schedule with status/summary
 *
 * Internal errors are caught and folded into the returned result object;
 * note however that the final updateScheduleStatus/createOrchestratorJobRecord
 * calls run OUTSIDE the try/catch, so a failure there rejects this promise.
 *
 * @param {number|string} storeId - primary key of the stores row
 * @returns {Promise<object>} result: {status, summary, runId, storeId,
 *   dispensaryId, detectionRan, crawlRan, durationMs, [crawlType],
 *   [productsFound], [productsNew], [productsUpdated], [error], [detectionResult]}
 */
async function runStoreCrawlOrchestrator(storeId) {
    const startTime = Date.now();
    // Unique id for this run; threaded into schedule updates and the
    // crawl_jobs tracking record.
    const runId = (0, uuid_1.v4)();
    let result = {
        status: 'pending',
        summary: '',
        runId,
        storeId,
        dispensaryId: null,
        detectionRan: false,
        crawlRan: false,
        durationMs: 0,
    };
    try {
        // Mark schedule as running
        await updateScheduleStatus(storeId, 'running', 'Starting orchestrator...', runId);
        // 1. Load store with dispensary info
        const store = await getStoreWithDispensary(storeId);
        if (!store) {
            throw new Error(`Store ${storeId} not found`);
        }
        result.dispensaryId = store.dispensary_id;
        // 2. Check if dispensary is linked
        if (!store.dispensary_id) {
            result.status = 'error';
            result.summary = 'No dispensary linked - cannot determine provider';
            result.error = 'Store is not linked to a dispensary. Link it in the Dispensaries page.';
            await updateScheduleStatus(storeId, 'error', result.summary, runId, result.error);
            result.durationMs = Date.now() - startTime;
            return result;
        }
        // 3. Check if provider detection is needed
        const needsDetection = await checkNeedsDetection(store);
        if (needsDetection) {
            // Run provider detection; the dedicated menu URL wins over the
            // general website when both are configured.
            const websiteUrl = store.dispensary_menu_url || store.dispensary_website;
            if (!websiteUrl) {
                result.status = 'error';
                result.summary = 'No website URL available for detection';
                result.error = 'Dispensary has no menu_url or website configured';
                await updateScheduleStatus(storeId, 'error', result.summary, runId, result.error);
                result.durationMs = Date.now() - startTime;
                return result;
            }
            await updateScheduleStatus(storeId, 'running', 'Running provider detection...', runId);
            const detectionResult = await (0, intelligence_detector_1.detectMultiCategoryProviders)(websiteUrl);
            result.detectionRan = true;
            result.detectionResult = detectionResult;
            // Save detection results to dispensary
            await (0, intelligence_detector_1.updateAllCategoryProviders)(store.dispensary_id, detectionResult);
            crawler_logger_1.crawlerLogger.providerDetected({
                dispensary_id: store.dispensary_id,
                dispensary_name: store.dispensary_name || store.name,
                detected_provider: detectionResult.product.provider,
                confidence: detectionResult.product.confidence,
                detection_method: 'orchestrator_run',
                menu_url: websiteUrl,
                category: 'product',
            });
            // Refresh store info after detection so the provider/mode branch
            // below sees the newly persisted values.
            const updatedStore = await getStoreWithDispensary(storeId);
            if (updatedStore) {
                Object.assign(store, updatedStore);
            }
        }
        // 4. Determine crawl type and run
        const provider = store.product_provider;
        const mode = store.product_crawler_mode;
        if (provider === 'dutchie' && mode === 'production') {
            // Production Dutchie crawl - now uses the new dutchie-az GraphQL pipeline
            await updateScheduleStatus(storeId, 'running', 'Running Dutchie GraphQL crawl (dutchie-az)...', runId);
            try {
                // Look up the dispensary in the dutchie-az database.
                // The dutchie-az pipeline has its own dispensaries table.
                // Matching is by dispensary name or store slug only (despite the
                // original intent, platform_dispensary_id is not queried here).
                const dispensaryResult = await (0, connection_1.query)(`SELECT * FROM dispensaries
           WHERE name ILIKE $1
              OR slug ILIKE $2
           LIMIT 1`, [store.dispensary_name, store.slug]);
                if (dispensaryResult.rows.length === 0) {
                    throw new Error(`Dispensary not found in dutchie-az database. ` +
                        `You must add this dispensary to the dutchie-az pipeline first. ` +
                        `Store: ${store.name} (${store.dispensary_name})`);
                }
                const dutchieDispensary = dispensaryResult.rows[0];
                // Run the new dutchie-az GraphQL crawler
                const crawlResult = await (0, product_crawler_1.crawlDispensaryProducts)(dutchieDispensary, 'rec', { useBothModes: true });
                result.crawlRan = true;
                result.crawlType = 'production';
                result.productsFound = crawlResult.productsFound ?? undefined;
                result.productsNew = crawlResult.productsUpserted ?? undefined;
                result.productsUpdated = crawlResult.snapshotsCreated ?? undefined;
                if (crawlResult.success) {
                    const detectionPart = result.detectionRan ? 'Detection + ' : '';
                    result.summary = `${detectionPart}Dutchie GraphQL crawl (${crawlResult.productsFound || 0} items, ${crawlResult.productsUpserted || 0} upserted, ${crawlResult.snapshotsCreated || 0} snapshots)`;
                    result.status = 'success';
                    // Update store's last_scraped_at
                    await migrate_1.pool.query('UPDATE stores SET last_scraped_at = NOW() WHERE id = $1', [storeId]);
                    crawler_logger_1.crawlerLogger.jobCompleted({
                        job_id: 0, // Orchestrator doesn't create traditional jobs
                        store_id: storeId,
                        store_name: store.name,
                        duration_ms: crawlResult.durationMs,
                        products_found: crawlResult.productsFound || 0,
                        products_new: crawlResult.productsUpserted || 0,
                        products_updated: crawlResult.snapshotsCreated || 0,
                        provider: 'dutchie',
                    });
                }
                else {
                    // Unsuccessful crawl is routed through the catch below so
                    // logging/summary handling stays in one place.
                    throw new Error(crawlResult.errorMessage || 'Crawl failed');
                }
            }
            catch (crawlError) {
                result.status = 'error';
                result.error = crawlError.message;
                result.summary = `Dutchie crawl failed: ${crawlError.message.slice(0, 100)}`;
                result.crawlRan = true;
                result.crawlType = 'production';
                crawler_logger_1.crawlerLogger.jobFailed({
                    job_id: 0,
                    store_id: storeId,
                    store_name: store.name,
                    duration_ms: Date.now() - startTime,
                    error_message: crawlError.message,
                    provider: 'dutchie',
                });
            }
        }
        else if (provider && provider !== 'unknown') {
            // Sandbox crawl for non-Dutchie providers, or Dutchie in sandbox mode
            await updateScheduleStatus(storeId, 'running', `Running ${provider} sandbox crawl...`, runId);
            try {
                const sandboxResult = await (0, category_crawler_jobs_1.runSandboxProductsJob)(store.dispensary_id);
                result.crawlRan = true;
                result.crawlType = 'sandbox';
                result.productsFound = sandboxResult.data?.productsExtracted || 0;
                const detectionPart = result.detectionRan ? 'Detection + ' : '';
                if (sandboxResult.success) {
                    result.summary = `${detectionPart}${provider} sandbox crawl (${result.productsFound} items, quality ${sandboxResult.data?.qualityScore || 0}%)`;
                    result.status = 'sandbox_only';
                }
                else {
                    result.summary = `${detectionPart}${provider} sandbox failed: ${sandboxResult.message}`;
                    result.status = 'error';
                    result.error = sandboxResult.message;
                }
            }
            catch (sandboxError) {
                result.status = 'error';
                result.error = sandboxError.message;
                result.summary = `Sandbox crawl failed: ${sandboxError.message.slice(0, 100)}`;
                result.crawlRan = true;
                result.crawlType = 'sandbox';
            }
        }
        else {
            // No provider detected - detection only
            if (result.detectionRan) {
                result.summary = `Detection complete: provider=${store.product_provider || 'unknown'}, confidence=${store.product_confidence || 0}%`;
                result.status = 'detection_only';
            }
            else {
                result.summary = 'No provider detected and no crawl possible';
                result.status = 'error';
                result.error = 'Could not determine menu provider';
            }
        }
    }
    catch (error) {
        // Unexpected failure anywhere above (store lookup, detection, ...)
        result.status = 'error';
        result.error = error.message;
        result.summary = `Orchestrator error: ${error.message.slice(0, 100)}`;
        crawler_logger_1.crawlerLogger.queueFailure({
            queue_type: 'orchestrator',
            error_message: error.message,
        });
    }
    result.durationMs = Date.now() - startTime;
    // Update final schedule status
    await updateScheduleStatus(storeId, result.status, result.summary, runId, result.error);
    // Create a crawl_job record for tracking
    await createOrchestratorJobRecord(storeId, result);
    return result;
}
|
|
// ========================================
|
|
// Helper Functions
|
|
// ========================================
|
|
/**
 * Load one store row joined with its linked dispensary's provider metadata.
 *
 * The dispensary join is a LEFT JOIN, so stores without a linked dispensary
 * still come back (with null dispensary_* columns).
 *
 * @param {number|string} storeId - stores.id to look up
 * @returns {Promise<object|null>} the joined row, or null when no store matches
 */
async function getStoreWithDispensary(storeId) {
    const { rows } = await migrate_1.pool.query(`SELECT
      s.id, s.name, s.slug, s.timezone, s.dispensary_id,
      d.name as dispensary_name,
      d.menu_url as dispensary_menu_url,
      d.website as dispensary_website,
      d.product_provider,
      d.product_confidence,
      d.product_crawler_mode,
      d.last_product_scan_at
    FROM stores s
    LEFT JOIN dispensaries d ON d.id = s.dispensary_id
    WHERE s.id = $1`, [storeId]);
    return rows.length > 0 ? rows[0] : null;
}
|
|
/**
 * Decide whether provider detection should (re-)run for a store.
 *
 * Detection is needed when the linked dispensary has no provider, an
 * 'unknown' provider, a confidence below 50, or a last product scan older
 * than 7 days. A store without a linked dispensary can never be detected.
 *
 * @param {object} store - joined row from getStoreWithDispensary
 * @returns {Promise<boolean>} true when detection should run
 */
async function checkNeedsDetection(store) {
    // Without a dispensary there is nothing to detect against.
    if (!store.dispensary_id) {
        return false;
    }
    // Missing or explicitly unknown provider always needs detection.
    const provider = store.product_provider;
    if (!provider || provider === 'unknown') {
        return true;
    }
    // A known provider detected with low confidence should be re-verified.
    if (store.product_confidence !== null && store.product_confidence < 50) {
        return true;
    }
    // Stale detection (older than 7 days) needs a refresh.
    if (store.last_product_scan_at) {
        const msPerDay = 1000 * 60 * 60 * 24;
        const ageDays = (Date.now() - new Date(store.last_product_scan_at).getTime()) / msPerDay;
        if (ageDays > 7) {
            return true;
        }
    }
    return false;
}
|
|
/**
 * Upsert the store_crawl_schedule row for a store with the latest run status.
 *
 * NOTE(review): the runId parameter is accepted but never written by this
 * statement — confirm whether store_crawl_schedule should record it.
 *
 * @param {number|string} storeId - stores.id
 * @param {string} status - e.g. 'running', 'success', 'error', 'sandbox_only'
 * @param {string} summary - short human-readable description of the run
 * @param {string} runId - orchestrator run UUID (currently unused here)
 * @param {string} [error] - error detail; persisted as NULL when absent
 */
async function updateScheduleStatus(storeId, status, summary, runId, error) {
    const params = [storeId, status, summary, error || null];
    await migrate_1.pool.query(`INSERT INTO store_crawl_schedule (store_id, last_status, last_summary, last_run_at, last_error)
     VALUES ($1, $2, $3, NOW(), $4)
     ON CONFLICT (store_id) DO UPDATE SET
       last_status = $2,
       last_summary = $3,
       last_run_at = NOW(),
       last_error = $4,
       updated_at = NOW()`, params);
}
|
|
/**
 * Summarize product counts for a store from the products table.
 *
 * "Recent" means within the last hour: rows created in that window count as
 * new; rows updated in that window but created earlier count as updated.
 *
 * NOTE(review): not referenced anywhere in this module, and it reads the
 * legacy products table flagged as deprecated in the header — candidate for
 * removal; verify against other callers first.
 *
 * @param {number|string} storeId - stores.id
 * @returns {Promise<{products_found:number, products_new:number, products_updated:number}>}
 */
async function getLatestCrawlStats(storeId) {
    const { rows } = await migrate_1.pool.query(`SELECT
      COUNT(*) as total,
      COUNT(*) FILTER (WHERE created_at > NOW() - INTERVAL '1 hour') as recent_new,
      COUNT(*) FILTER (WHERE updated_at > NOW() - INTERVAL '1 hour' AND created_at < NOW() - INTERVAL '1 hour') as recent_updated
    FROM products
    WHERE store_id = $1`, [storeId]);
    const stats = rows[0] ?? {};
    return {
        products_found: parseInt(stats.total || '0', 10),
        products_new: parseInt(stats.recent_new || '0', 10),
        products_updated: parseInt(stats.recent_updated || '0', 10),
    };
}
|
|
/**
 * Persist a crawl_jobs row recording this orchestrator run, for tracking.
 *
 * Best-effort: this is called AFTER the run's outcome is decided and the
 * schedule status is already finalized, so a failure to write the tracking
 * record is logged and swallowed rather than rethrown — previously such a
 * failure rejected the whole orchestrator promise even though the crawl
 * itself had succeeded.
 *
 * @param {number|string} storeId - stores.id the run was for
 * @param {object} result - orchestrator result (status, counts, runId, detectionResult)
 */
async function createOrchestratorJobRecord(storeId, result) {
    // Map orchestrator statuses onto the crawl_jobs vocabulary: only 'error'
    // becomes 'failed'; 'success', 'sandbox_only' and 'detection_only' are
    // all recorded as 'completed'.
    const jobStatus = result.status === 'error' ? 'failed' : 'completed';
    // Compact JSON snapshot of the product-category detection, when it ran.
    const detectionJson = result.detectionResult ? JSON.stringify({
        product_provider: result.detectionResult.product.provider,
        product_confidence: result.detectionResult.product.confidence,
        product_mode: result.detectionResult.product.mode,
    }) : null;
    try {
        await migrate_1.pool.query(`INSERT INTO crawl_jobs (
      store_id, job_type, trigger_type, status, priority,
      scheduled_at, started_at, completed_at,
      products_found, products_new, products_updated,
      error_message, orchestrator_run_id, detection_result
    ) VALUES (
      $1, 'orchestrator', 'manual', $2, 100,
      NOW(), NOW(), NOW(),
      $3, $4, $5,
      $6, $7, $8
    )`, [
            storeId,
            jobStatus,
            result.productsFound || null,
            result.productsNew || null,
            result.productsUpdated || null,
            result.error || null,
            result.runId,
            detectionJson,
        ]);
    }
    catch (insertError) {
        // Tracking is non-critical: log and continue so the caller still gets
        // the already-computed run result.
        crawler_logger_1.crawlerLogger.queueFailure({
            queue_type: 'orchestrator_job_record',
            error_message: insertError.message,
        });
    }
}
|
|
// ========================================
|
|
// Batch Orchestration
|
|
// ========================================
|
|
/**
 * Run the orchestrator for multiple stores, at most `concurrency` at a time.
 *
 * Uses Promise.allSettled so a single store's unexpected rejection no longer
 * discards the results of every other store in its batch (previously
 * Promise.all rejected the whole call). A rejected run is converted into a
 * synthetic error-result object, so the return shape stays one result per
 * store, in input order.
 *
 * @param {Array<number|string>} storeIds - stores to orchestrate
 * @param {number} [concurrency=3] - max simultaneous orchestrator runs
 * @returns {Promise<Array<object>>} orchestrator results, in input order
 */
async function runBatchOrchestrator(storeIds, concurrency = 3) {
    const results = [];
    // Process in batches of `concurrency`
    for (let i = 0; i < storeIds.length; i += concurrency) {
        const batch = storeIds.slice(i, i + concurrency);
        const settled = await Promise.allSettled(batch.map(storeId => runStoreCrawlOrchestrator(storeId)));
        settled.forEach((outcome, idx) => {
            if (outcome.status === 'fulfilled') {
                results.push(outcome.value);
            }
            else {
                // runStoreCrawlOrchestrator catches its own internal errors, so
                // a rejection here is unexpected (e.g. the final tracking insert
                // threw); surface it as an error result instead of aborting.
                const message = String(outcome.reason?.message ?? outcome.reason);
                results.push({
                    status: 'error',
                    summary: `Orchestrator rejected: ${message.slice(0, 100)}`,
                    error: message,
                    runId: '',
                    storeId: batch[idx],
                    dispensaryId: null,
                    detectionRan: false,
                    crawlRan: false,
                    durationMs: 0,
                });
            }
        });
    }
    return results;
}
|
|
/**
 * Find active, scrape-enabled stores whose crawl schedule says they are due.
 *
 * A store is due when it has never run, or its last run is older than its
 * configured interval (default 4 hours); stores currently marked 'running'
 * or 'pending' are excluded. Results are ordered by schedule priority, then
 * by oldest (or missing) last run first.
 *
 * @param {number} [limit=10] - maximum number of store ids to return
 * @returns {Promise<Array>} store ids due for orchestration
 */
async function getStoresDueForOrchestration(limit = 10) {
    const { rows } = await migrate_1.pool.query(`SELECT s.id
    FROM stores s
    LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
    WHERE s.active = TRUE
      AND s.scrape_enabled = TRUE
      AND COALESCE(scs.enabled, TRUE) = TRUE
      AND (
        scs.last_run_at IS NULL
        OR scs.last_run_at < NOW() - (COALESCE(scs.interval_hours, 4) || ' hours')::INTERVAL
      )
      AND (scs.last_status IS NULL OR scs.last_status NOT IN ('running', 'pending'))
    ORDER BY COALESCE(scs.priority, 0) DESC, scs.last_run_at ASC NULLS FIRST
    LIMIT $1`, [limit]);
    return rows.map(({ id }) => id);
}
|