fix(monitor): remove non-existent worker columns from job_run_logs query
The job_run_logs table tracks scheduled job orchestration, not individual worker jobs. Worker info (worker_id, worker_hostname) belongs on dispensary_crawl_jobs, not job_run_logs. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
383
backend/dist/services/dispensary-orchestrator.js
vendored
Normal file
383
backend/dist/services/dispensary-orchestrator.js
vendored
Normal file
@@ -0,0 +1,383 @@
|
||||
"use strict";
|
||||
/**
|
||||
* Dispensary Crawl Orchestrator
|
||||
*
|
||||
* Orchestrates the complete crawl workflow for a dispensary:
|
||||
* 1. Load dispensary data
|
||||
* 2. Check if provider detection is needed
|
||||
* 3. Run provider detection if needed
|
||||
* 4. Queue appropriate crawl jobs based on provider/mode
|
||||
* 5. Update dispensary_crawl_schedule with meaningful status
|
||||
*
|
||||
* This works DIRECTLY with dispensaries (not through stores table).
|
||||
*/
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.runDispensaryOrchestrator = runDispensaryOrchestrator;
|
||||
exports.runBatchDispensaryOrchestrator = runBatchDispensaryOrchestrator;
|
||||
exports.getDispensariesDueForOrchestration = getDispensariesDueForOrchestration;
|
||||
exports.ensureAllDispensariesHaveSchedules = ensureAllDispensariesHaveSchedules;
|
||||
exports.processDispensaryScheduler = processDispensaryScheduler;
|
||||
const uuid_1 = require("uuid");
|
||||
const migrate_1 = require("../db/migrate");
|
||||
const crawler_logger_1 = require("./crawler-logger");
|
||||
const intelligence_detector_1 = require("./intelligence-detector");
|
||||
const category_crawler_jobs_1 = require("./category-crawler-jobs");
|
||||
// ========================================
|
||||
// Main Orchestrator Function
|
||||
// ========================================
|
||||
/**
|
||||
* Run the complete crawl orchestration for a dispensary
|
||||
*
|
||||
* Behavior:
|
||||
* 1. Load the dispensary info
|
||||
* 2. If product_provider is missing or stale (>7 days), run detection
|
||||
* 3. After detection:
|
||||
* - If product_provider = 'dutchie' and product_crawler_mode = 'production': Run production crawl
|
||||
* - Otherwise: Run sandbox crawl
|
||||
* 4. Update dispensary_crawl_schedule with status/summary
|
||||
*/
|
||||
async function runDispensaryOrchestrator(dispensaryId, scheduleId) {
|
||||
const startTime = Date.now();
|
||||
const runId = (0, uuid_1.v4)();
|
||||
let result = {
|
||||
status: 'pending',
|
||||
summary: '',
|
||||
runId,
|
||||
dispensaryId,
|
||||
dispensaryName: '',
|
||||
detectionRan: false,
|
||||
crawlRan: false,
|
||||
durationMs: 0,
|
||||
};
|
||||
try {
|
||||
// Mark schedule as running
|
||||
await updateScheduleStatus(dispensaryId, 'running', 'Starting orchestrator...', null, runId);
|
||||
// 1. Load dispensary info
|
||||
const dispensary = await getDispensaryInfo(dispensaryId);
|
||||
if (!dispensary) {
|
||||
throw new Error(`Dispensary ${dispensaryId} not found`);
|
||||
}
|
||||
result.dispensaryName = dispensary.name;
|
||||
// 2. Check if provider detection is needed
|
||||
const needsDetection = await checkNeedsDetection(dispensary);
|
||||
if (needsDetection) {
|
||||
// Run provider detection
|
||||
const websiteUrl = dispensary.menu_url || dispensary.website;
|
||||
if (!websiteUrl) {
|
||||
result.status = 'error';
|
||||
result.summary = 'No website URL available for detection';
|
||||
result.error = 'Dispensary has no menu_url or website configured';
|
||||
await updateScheduleStatus(dispensaryId, 'error', result.summary, result.error, runId);
|
||||
result.durationMs = Date.now() - startTime;
|
||||
await createJobRecord(dispensaryId, scheduleId, result);
|
||||
return result;
|
||||
}
|
||||
await updateScheduleStatus(dispensaryId, 'running', 'Running provider detection...', null, runId);
|
||||
const detectionResult = await (0, intelligence_detector_1.detectMultiCategoryProviders)(websiteUrl);
|
||||
result.detectionRan = true;
|
||||
result.detectionResult = detectionResult;
|
||||
// Save detection results to dispensary
|
||||
await (0, intelligence_detector_1.updateAllCategoryProviders)(dispensaryId, detectionResult);
|
||||
crawler_logger_1.crawlerLogger.providerDetected({
|
||||
dispensary_id: dispensaryId,
|
||||
dispensary_name: dispensary.name,
|
||||
detected_provider: detectionResult.product.provider,
|
||||
confidence: detectionResult.product.confidence,
|
||||
detection_method: 'dispensary_orchestrator',
|
||||
menu_url: websiteUrl,
|
||||
category: 'product',
|
||||
});
|
||||
// Refresh dispensary info after detection
|
||||
const updatedDispensary = await getDispensaryInfo(dispensaryId);
|
||||
if (updatedDispensary) {
|
||||
Object.assign(dispensary, updatedDispensary);
|
||||
}
|
||||
}
|
||||
// 3. Determine crawl type and run
|
||||
const provider = dispensary.product_provider;
|
||||
const mode = dispensary.product_crawler_mode;
|
||||
if (provider === 'dutchie' && mode === 'production') {
|
||||
// Production Dutchie crawl
|
||||
await updateScheduleStatus(dispensaryId, 'running', 'Running Dutchie production crawl...', null, runId);
|
||||
try {
|
||||
// Run the category-specific crawl job
|
||||
const crawlResult = await (0, category_crawler_jobs_1.runCrawlProductsJob)(dispensaryId);
|
||||
result.crawlRan = true;
|
||||
result.crawlType = 'production';
|
||||
if (crawlResult.success) {
|
||||
result.productsFound = crawlResult.data?.productsFound || 0;
|
||||
const detectionPart = result.detectionRan ? 'Detection + ' : '';
|
||||
result.summary = `${detectionPart}Dutchie products crawl completed`;
|
||||
result.status = 'success';
|
||||
crawler_logger_1.crawlerLogger.jobCompleted({
|
||||
job_id: 0,
|
||||
store_id: 0,
|
||||
store_name: dispensary.name,
|
||||
duration_ms: Date.now() - startTime,
|
||||
products_found: result.productsFound || 0,
|
||||
products_new: 0,
|
||||
products_updated: 0,
|
||||
provider: 'dutchie',
|
||||
});
|
||||
}
|
||||
else {
|
||||
result.status = 'error';
|
||||
result.error = crawlResult.message;
|
||||
result.summary = `Dutchie crawl failed: ${crawlResult.message.slice(0, 100)}`;
|
||||
}
|
||||
}
|
||||
catch (crawlError) {
|
||||
result.status = 'error';
|
||||
result.error = crawlError.message;
|
||||
result.summary = `Dutchie crawl failed: ${crawlError.message.slice(0, 100)}`;
|
||||
result.crawlRan = true;
|
||||
result.crawlType = 'production';
|
||||
crawler_logger_1.crawlerLogger.jobFailed({
|
||||
job_id: 0,
|
||||
store_id: 0,
|
||||
store_name: dispensary.name,
|
||||
duration_ms: Date.now() - startTime,
|
||||
error_message: crawlError.message,
|
||||
provider: 'dutchie',
|
||||
});
|
||||
}
|
||||
}
|
||||
else if (provider && provider !== 'unknown') {
|
||||
// Sandbox crawl for non-Dutchie or sandbox mode
|
||||
await updateScheduleStatus(dispensaryId, 'running', `Running ${provider} sandbox crawl...`, null, runId);
|
||||
try {
|
||||
const sandboxResult = await (0, category_crawler_jobs_1.runSandboxProductsJob)(dispensaryId);
|
||||
result.crawlRan = true;
|
||||
result.crawlType = 'sandbox';
|
||||
result.productsFound = sandboxResult.data?.productsExtracted || 0;
|
||||
const detectionPart = result.detectionRan ? 'Detection + ' : '';
|
||||
if (sandboxResult.success) {
|
||||
result.summary = `${detectionPart}${provider} sandbox crawl (${result.productsFound} items, quality ${sandboxResult.data?.qualityScore || 0}%)`;
|
||||
result.status = 'sandbox_only';
|
||||
}
|
||||
else {
|
||||
result.summary = `${detectionPart}${provider} sandbox failed: ${sandboxResult.message}`;
|
||||
result.status = 'error';
|
||||
result.error = sandboxResult.message;
|
||||
}
|
||||
}
|
||||
catch (sandboxError) {
|
||||
result.status = 'error';
|
||||
result.error = sandboxError.message;
|
||||
result.summary = `Sandbox crawl failed: ${sandboxError.message.slice(0, 100)}`;
|
||||
result.crawlRan = true;
|
||||
result.crawlType = 'sandbox';
|
||||
}
|
||||
}
|
||||
else {
|
||||
// No provider detected - detection only
|
||||
if (result.detectionRan) {
|
||||
result.summary = `Detection complete: provider=${dispensary.product_provider || 'unknown'}, confidence=${dispensary.product_confidence || 0}%`;
|
||||
result.status = 'detection_only';
|
||||
}
|
||||
else {
|
||||
result.summary = 'No provider detected and no crawl possible';
|
||||
result.status = 'error';
|
||||
result.error = 'Could not determine menu provider';
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (error) {
|
||||
result.status = 'error';
|
||||
result.error = error.message;
|
||||
result.summary = `Orchestrator error: ${error.message.slice(0, 100)}`;
|
||||
crawler_logger_1.crawlerLogger.queueFailure({
|
||||
queue_type: 'dispensary_orchestrator',
|
||||
error_message: error.message,
|
||||
});
|
||||
}
|
||||
result.durationMs = Date.now() - startTime;
|
||||
// Update final schedule status
|
||||
await updateScheduleStatus(dispensaryId, result.status, result.summary, result.error || null, runId);
|
||||
// Create job record
|
||||
await createJobRecord(dispensaryId, scheduleId, result);
|
||||
return result;
|
||||
}
|
||||
// ========================================
|
||||
// Helper Functions
|
||||
// ========================================
|
||||
async function getDispensaryInfo(dispensaryId) {
|
||||
const result = await migrate_1.pool.query(`SELECT id, name, city, website, menu_url,
|
||||
product_provider, product_confidence, product_crawler_mode, last_product_scan_at
|
||||
FROM dispensaries
|
||||
WHERE id = $1`, [dispensaryId]);
|
||||
return result.rows[0] || null;
|
||||
}
|
||||
async function checkNeedsDetection(dispensary) {
|
||||
// No provider = definitely needs detection
|
||||
if (!dispensary.product_provider)
|
||||
return true;
|
||||
// Unknown provider = needs detection
|
||||
if (dispensary.product_provider === 'unknown')
|
||||
return true;
|
||||
// Low confidence = needs re-detection
|
||||
if (dispensary.product_confidence !== null && dispensary.product_confidence < 50)
|
||||
return true;
|
||||
// Stale detection (> 7 days) = needs refresh
|
||||
if (dispensary.last_product_scan_at) {
|
||||
const daysSince = (Date.now() - new Date(dispensary.last_product_scan_at).getTime()) / (1000 * 60 * 60 * 24);
|
||||
if (daysSince > 7)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
async function updateScheduleStatus(dispensaryId, status, summary, error, runId) {
|
||||
await migrate_1.pool.query(`INSERT INTO dispensary_crawl_schedule (dispensary_id, last_status, last_summary, last_error, last_run_at, updated_at)
|
||||
VALUES ($1, $2, $3, $4, NOW(), NOW())
|
||||
ON CONFLICT (dispensary_id) DO UPDATE SET
|
||||
last_status = $2,
|
||||
last_summary = $3,
|
||||
last_error = $4,
|
||||
last_run_at = NOW(),
|
||||
updated_at = NOW()`, [dispensaryId, status, summary, error]);
|
||||
}
|
||||
async function createJobRecord(dispensaryId, scheduleId, result) {
|
||||
await migrate_1.pool.query(`INSERT INTO dispensary_crawl_jobs (
|
||||
dispensary_id, schedule_id, job_type, trigger_type, status, priority,
|
||||
scheduled_at, started_at, completed_at, duration_ms,
|
||||
detection_ran, crawl_ran, crawl_type,
|
||||
products_found, products_new, products_updated,
|
||||
detected_provider, detected_confidence, detected_mode,
|
||||
error_message, run_id
|
||||
) VALUES (
|
||||
$1, $2, 'orchestrator', 'manual', $3, 100,
|
||||
NOW(), NOW(), NOW(), $4,
|
||||
$5, $6, $7,
|
||||
$8, $9, $10,
|
||||
$11, $12, $13,
|
||||
$14, $15
|
||||
)`, [
|
||||
dispensaryId,
|
||||
scheduleId || null,
|
||||
result.status === 'success' ? 'completed' : result.status === 'error' ? 'failed' : 'completed',
|
||||
result.durationMs,
|
||||
result.detectionRan,
|
||||
result.crawlRan,
|
||||
result.crawlType || null,
|
||||
result.productsFound || null,
|
||||
result.productsNew || null,
|
||||
result.productsUpdated || null,
|
||||
result.detectionResult?.product.provider || null,
|
||||
result.detectionResult?.product.confidence || null,
|
||||
result.detectionResult?.product.mode || null,
|
||||
result.error || null,
|
||||
result.runId,
|
||||
]);
|
||||
// Update schedule stats
|
||||
if (result.status === 'success' || result.status === 'sandbox_only' || result.status === 'detection_only') {
|
||||
await migrate_1.pool.query(`UPDATE dispensary_crawl_schedule SET
|
||||
total_runs = COALESCE(total_runs, 0) + 1,
|
||||
successful_runs = COALESCE(successful_runs, 0) + 1,
|
||||
consecutive_failures = 0,
|
||||
next_run_at = NOW() + (interval_minutes || ' minutes')::INTERVAL,
|
||||
last_duration_ms = $2
|
||||
WHERE dispensary_id = $1`, [dispensaryId, result.durationMs]);
|
||||
}
|
||||
else if (result.status === 'error') {
|
||||
await migrate_1.pool.query(`UPDATE dispensary_crawl_schedule SET
|
||||
total_runs = COALESCE(total_runs, 0) + 1,
|
||||
consecutive_failures = COALESCE(consecutive_failures, 0) + 1,
|
||||
next_run_at = NOW() + (interval_minutes || ' minutes')::INTERVAL,
|
||||
last_duration_ms = $2
|
||||
WHERE dispensary_id = $1`, [dispensaryId, result.durationMs]);
|
||||
}
|
||||
}
|
||||
// ========================================
|
||||
// Batch Processing
|
||||
// ========================================
|
||||
/**
|
||||
* Run orchestrator for multiple dispensaries
|
||||
*/
|
||||
async function runBatchDispensaryOrchestrator(dispensaryIds, concurrency = 3) {
|
||||
const results = [];
|
||||
// Process in batches
|
||||
for (let i = 0; i < dispensaryIds.length; i += concurrency) {
|
||||
const batch = dispensaryIds.slice(i, i + concurrency);
|
||||
console.log(`Processing batch ${Math.floor(i / concurrency) + 1}: dispensaries ${batch.join(', ')}`);
|
||||
const batchResults = await Promise.all(batch.map(id => runDispensaryOrchestrator(id)));
|
||||
results.push(...batchResults);
|
||||
// Small delay between batches to avoid overwhelming the system
|
||||
if (i + concurrency < dispensaryIds.length) {
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
}
|
||||
}
|
||||
return results;
|
||||
}
|
||||
/**
|
||||
* Get dispensaries that are due for orchestration
|
||||
*/
|
||||
async function getDispensariesDueForOrchestration(limit = 10) {
|
||||
const result = await migrate_1.pool.query(`SELECT d.id
|
||||
FROM dispensaries d
|
||||
LEFT JOIN dispensary_crawl_schedule dcs ON dcs.dispensary_id = d.id
|
||||
WHERE COALESCE(dcs.is_active, TRUE) = TRUE
|
||||
AND (
|
||||
dcs.next_run_at IS NULL
|
||||
OR dcs.next_run_at <= NOW()
|
||||
)
|
||||
AND (dcs.last_status IS NULL OR dcs.last_status NOT IN ('running', 'pending'))
|
||||
ORDER BY COALESCE(dcs.priority, 0) DESC, dcs.last_run_at ASC NULLS FIRST
|
||||
LIMIT $1`, [limit]);
|
||||
return result.rows.map(row => row.id);
|
||||
}
|
||||
/**
|
||||
* Ensure all dispensaries have schedule entries
|
||||
*/
|
||||
async function ensureAllDispensariesHaveSchedules(intervalMinutes = 240) {
|
||||
// Get all dispensary IDs that don't have a schedule
|
||||
const result = await migrate_1.pool.query(`INSERT INTO dispensary_crawl_schedule (dispensary_id, is_active, interval_minutes, priority)
|
||||
SELECT d.id, TRUE, $1, 0
|
||||
FROM dispensaries d
|
||||
WHERE NOT EXISTS (
|
||||
SELECT 1 FROM dispensary_crawl_schedule dcs WHERE dcs.dispensary_id = d.id
|
||||
)
|
||||
RETURNING id`, [intervalMinutes]);
|
||||
const existingCount = await migrate_1.pool.query('SELECT COUNT(*) FROM dispensary_crawl_schedule');
|
||||
return {
|
||||
created: result.rowCount || 0,
|
||||
existing: parseInt(existingCount.rows[0].count) - (result.rowCount || 0),
|
||||
};
|
||||
}
|
||||
// ========================================
|
||||
// Scheduler Integration
|
||||
// ========================================
|
||||
let dispensarySchedulerRunning = false;
|
||||
/**
|
||||
* Process dispensaries using the intelligent orchestrator
|
||||
* Called periodically by the scheduler
|
||||
*/
|
||||
async function processDispensaryScheduler() {
|
||||
if (dispensarySchedulerRunning) {
|
||||
console.log('Dispensary scheduler already running, skipping...');
|
||||
return;
|
||||
}
|
||||
dispensarySchedulerRunning = true;
|
||||
try {
|
||||
// Get dispensaries due for orchestration
|
||||
const dispensaryIds = await getDispensariesDueForOrchestration(3);
|
||||
if (dispensaryIds.length === 0) {
|
||||
return;
|
||||
}
|
||||
console.log(`Dispensary Scheduler: Processing ${dispensaryIds.length} dispensaries due for crawl`);
|
||||
// Process each dispensary through the orchestrator
|
||||
for (const dispensaryId of dispensaryIds) {
|
||||
try {
|
||||
console.log(`Dispensary Scheduler: Starting crawl for dispensary ${dispensaryId}`);
|
||||
const result = await runDispensaryOrchestrator(dispensaryId);
|
||||
console.log(`Dispensary Scheduler: Dispensary ${dispensaryId} completed - ${result.summary}`);
|
||||
}
|
||||
catch (error) {
|
||||
console.error(`Dispensary Scheduler: Dispensary ${dispensaryId} failed - ${error.message}`);
|
||||
}
|
||||
}
|
||||
console.log(`Dispensary Scheduler: Finished processing ${dispensaryIds.length} dispensaries`);
|
||||
}
|
||||
finally {
|
||||
dispensarySchedulerRunning = false;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user