Files
cannaiq/backend/dist/services/category-crawler-jobs.js
Kelly d91c55a344 feat: Add stale process monitor, users route, landing page, archive old scripts
- Add backend stale process monitoring API (/api/stale-processes)
- Add users management route
- Add frontend landing page and stale process monitor UI on /scraper-tools
- Move old development scripts to backend/archive/
- Update frontend build with new features

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-05 04:07:31 -07:00

1108 lines
47 KiB
JavaScript

"use strict";
/**
* Category-Specific Crawler Jobs
*
* Handles crawl jobs for each intelligence category independently:
* - CrawlProductsJob - Production product crawling (Dutchie only)
* - CrawlSpecialsJob - Production specials crawling
* - CrawlBrandIntelligenceJob - Production brand intelligence crawling
* - CrawlMetadataJob - Production metadata crawling
* - SandboxProductsJob - Sandbox product crawling (all providers)
* - SandboxSpecialsJob - Sandbox specials crawling
* - SandboxBrandJob - Sandbox brand crawling
* - SandboxMetadataJob - Sandbox metadata crawling
*/
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.runCrawlProductsJob = runCrawlProductsJob;
exports.runCrawlSpecialsJob = runCrawlSpecialsJob;
exports.runCrawlBrandIntelligenceJob = runCrawlBrandIntelligenceJob;
exports.runCrawlMetadataJob = runCrawlMetadataJob;
exports.runSandboxProductsJob = runSandboxProductsJob;
exports.runSandboxSpecialsJob = runSandboxSpecialsJob;
exports.runSandboxBrandJob = runSandboxBrandJob;
exports.runSandboxMetadataJob = runSandboxMetadataJob;
exports.processCategorySandboxJobs = processCategorySandboxJobs;
exports.runAllCategoryProductionCrawls = runAllCategoryProductionCrawls;
exports.runAllCategorySandboxCrawls = runAllCategorySandboxCrawls;
const migrate_1 = require("../db/migrate");
const crawler_logger_1 = require("./crawler-logger");
// Note: scrapeStore from scraper-v2 is NOT used for Dutchie - we use GraphQL API directly
const product_crawler_1 = require("../dutchie-az/services/product-crawler");
const puppeteer_1 = __importDefault(require("puppeteer"));
// Identifier for this crawler process, built from the process PID and the module load timestamp.
// NOTE(review): not referenced in the visible portion of this file - confirm it is used further down.
const WORKER_ID = `crawler-${process.pid}-${Date.now()}`;
// ========================================
// Helper Functions
// ========================================
/**
 * Fetch a single dispensary row, including the provider / confidence /
 * crawler-mode / last-scan columns of every intelligence category.
 *
 * @param dispensaryId - Value bound to `dispensaries.id`.
 * @returns {Promise<object|null>} The first row returned by the query, or null when there is none.
 */
async function getDispensaryWithCategories(dispensaryId) {
    const selectDispensary = `SELECT id, name, website, menu_url, menu_type, platform_dispensary_id,
product_provider, product_confidence, product_crawler_mode, last_product_scan_at,
specials_provider, specials_confidence, specials_crawler_mode, last_specials_scan_at,
brand_provider, brand_confidence, brand_crawler_mode, last_brand_scan_at,
metadata_provider, metadata_confidence, metadata_crawler_mode, last_metadata_scan_at,
crawler_status, scraper_template
FROM dispensaries WHERE id = $1`;
    const { rows } = await migrate_1.pool.query(selectDispensary, [dispensaryId]);
    return rows[0] || null;
}
/**
 * Set `last_<category>_scan_at` and `updated_at` to NOW() for one dispensary.
 *
 * A column name cannot be sent as a bound query parameter, so it has to be
 * placed in the SQL text. To keep arbitrary text out of the statement, the
 * category is mapped through a fixed table of known scan-time columns.
 *
 * @param dispensaryId - Value bound to `dispensaries.id`.
 * @param {string} category - One of 'product', 'specials', 'brand', 'metadata'.
 * @returns {Promise<void>}
 * @throws {Error} When `category` is not one of the known categories (no query is issued).
 */
async function updateCategoryScanTime(dispensaryId, category) {
    const scanTimeColumns = {
        product: 'last_product_scan_at',
        specials: 'last_specials_scan_at',
        brand: 'last_brand_scan_at',
        metadata: 'last_metadata_scan_at',
    };
    // Own-property check so keys such as "constructor" or "__proto__" are rejected too.
    if (typeof category !== 'string' || !Object.prototype.hasOwnProperty.call(scanTimeColumns, category)) {
        throw new Error(`Unknown crawl category: ${String(category)}`);
    }
    const column = scanTimeColumns[category];
    await migrate_1.pool.query(`UPDATE dispensaries SET ${column} = NOW(), updated_at = NOW() WHERE id = $1`, [dispensaryId]);
}
/**
 * Resolve the `stores.id` that corresponds to a dispensary.
 *
 * Two lookups are attempted in order:
 *   1. stores whose `dutchie_url` equals the dispensary `menu_url`, or whose
 *      name is contained (case-insensitively) in the dispensary name;
 *   2. stores whose slug is contained (case-insensitively) in the dispensary website.
 *
 * @param dispensaryId - Value bound to `dispensaries.id`.
 * @returns {Promise<*|null>} Id of the first matching store, or null when the slug lookup finds nothing.
 */
async function getStoreIdForDispensary(dispensaryId) {
    // Lookup 1: menu URL equality or name containment.
    const byMenuUrlOrName = await migrate_1.pool.query(`SELECT s.id FROM stores s
JOIN dispensaries d ON d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%'
WHERE d.id = $1
LIMIT 1`, [dispensaryId]);
    const [primaryMatch] = byMenuUrlOrName.rows;
    if (byMenuUrlOrName.rows.length > 0) {
        return primaryMatch.id;
    }
    // Lookup 2: slug contained in the dispensary website.
    const { rows: slugMatches } = await migrate_1.pool.query(`SELECT s.id FROM stores s
JOIN dispensaries d ON d.website ILIKE '%' || s.slug || '%'
WHERE d.id = $1
LIMIT 1`, [dispensaryId]);
    return slugMatches[0]?.id || null;
}
/**
 * Return the id of the open sandbox for (dispensary, category), creating one when needed.
 *
 * "Open" means a `crawler_sandboxes` row whose status is neither
 * 'moved_to_production' nor 'failed'. An open row is refreshed with the given
 * provider / template (its detection signals are kept when none are passed);
 * otherwise a new row is inserted in 'template_learning' mode with status 'pending'.
 *
 * @param dispensaryId - Dispensary the sandbox belongs to.
 * @param category - Intelligence category of the sandbox.
 * @param suspectedProvider - Stored in `suspected_menu_provider`.
 * @param templateName - Stored in `template_name`.
 * @param [detectionSignals] - Optional value, JSON-serialised into `detection_signals`.
 * @returns {Promise<*>} Id of the updated or inserted sandbox row.
 */
async function createCategorySandboxEntry(dispensaryId, category, suspectedProvider, templateName, detectionSignals) {
    const openSandboxes = await migrate_1.pool.query(`SELECT id FROM crawler_sandboxes
WHERE dispensary_id = $1 AND category = $2 AND status NOT IN ('moved_to_production', 'failed')`, [dispensaryId, category]);
    if (openSandboxes.rows.length === 0) {
        // Nothing open yet: insert, defaulting the signals to an empty JSON object.
        const insertValues = [
            dispensaryId,
            category,
            suspectedProvider,
            templateName,
            detectionSignals ? JSON.stringify(detectionSignals) : '{}',
        ];
        const inserted = await migrate_1.pool.query(`INSERT INTO crawler_sandboxes (dispensary_id, category, suspected_menu_provider, template_name, mode, detection_signals, status)
VALUES ($1, $2, $3, $4, 'template_learning', $5, 'pending')
RETURNING id`, insertValues);
        return inserted.rows[0].id;
    }
    // Refresh the open sandbox; a null signals value leaves the stored one untouched (COALESCE).
    const updateValues = [
        openSandboxes.rows[0].id,
        suspectedProvider,
        templateName,
        detectionSignals ? JSON.stringify(detectionSignals) : null,
    ];
    await migrate_1.pool.query(`UPDATE crawler_sandboxes
SET suspected_menu_provider = $2, template_name = $3, detection_signals = COALESCE($4, detection_signals), updated_at = NOW()
WHERE id = $1`, updateValues);
    return openSandboxes.rows[0].id;
}
/**
 * Queue a sandbox crawl job (status 'pending') for a category sandbox.
 *
 * @param dispensaryId - Dispensary the job targets.
 * @param sandboxId - Sandbox row the job belongs to.
 * @param category - Intelligence category of the job.
 * @param templateName - Stored in `template_name`.
 * @param {string} [jobType='crawl'] - Stored in `job_type`.
 * @param {number} [priority=0] - Stored in `priority`.
 * @returns {Promise<*>} Id of the inserted `sandbox_crawl_jobs` row.
 */
async function createCategorySandboxJob(dispensaryId, sandboxId, category, templateName, jobType = 'crawl', priority = 0) {
    const insertJob = `INSERT INTO sandbox_crawl_jobs (dispensary_id, sandbox_id, category, template_name, job_type, status, priority)
VALUES ($1, $2, $3, $4, $5, 'pending', $6)
RETURNING id`;
    const values = [dispensaryId, sandboxId, category, templateName, jobType, priority];
    const { rows } = await migrate_1.pool.query(insertJob, values);
    return rows[0].id;
}
/**
 * Persist the quality metrics of a sandbox run on its `crawler_sandboxes` row.
 *
 * `metrics.sample_data` is wrapped as `{ sample_data }` and merged into the
 * existing `analysis_json` (jsonb `||`); `analyzed_at` and `updated_at` are set to NOW().
 *
 * @param sandboxId - Id of the sandbox row to update.
 * @param {object} metrics - Reads `quality_score`, `items_extracted`,
 *   `fields_missing`, `error_count` and `sample_data`.
 * @returns {Promise<void>}
 */
async function updateSandboxQuality(sandboxId, metrics) {
    const updateQuality = `UPDATE crawler_sandboxes SET
quality_score = $1,
products_extracted = $2,
fields_missing = $3,
error_count = $4,
analysis_json = COALESCE(analysis_json, '{}'::jsonb) || $5::jsonb,
analyzed_at = NOW(),
updated_at = NOW()
WHERE id = $6`;
    const analysisPatch = JSON.stringify({ sample_data: metrics.sample_data });
    const values = [
        metrics.quality_score,
        metrics.items_extracted,
        metrics.fields_missing,
        metrics.error_count,
        analysisPatch,
        sandboxId,
    ];
    await migrate_1.pool.query(updateQuality, values);
}
/**
 * Pick the preferred active crawler template for a provider in an environment.
 *
 * Rows flagged `is_default_for_provider` sort first, then higher `version`.
 *
 * @param provider - Value matched against `crawler_templates.provider`.
 * @param category - Accepted for call-site symmetry; the query does not filter on it.
 * @param environment - Value matched against `crawler_templates.environment`.
 * @returns {Promise<object|null>} Row with id, name, selector_config and
 *   navigation_config, or null when no active template matches.
 */
async function getCrawlerTemplate(provider, category, environment) {
    const selectTemplate = `SELECT id, name, selector_config, navigation_config
FROM crawler_templates
WHERE provider = $1 AND environment = $2 AND is_active = true
ORDER BY is_default_for_provider DESC, version DESC
LIMIT 1`;
    const { rows } = await migrate_1.pool.query(selectTemplate, [provider, environment]);
    return rows[0] || null;
}
// ========================================
// Production Crawl Jobs
// ========================================
/**
 * CrawlProductsJob - production product crawl for one dispensary.
 *
 * The crawl is delegated to crawlDispensaryProducts() from dutchie-az, which
 * talks to the Dutchie GraphQL API directly (no browser scraping, no separate
 * category discovery step).
 *
 * Eligibility: the dispensary must either be configured with
 * product_provider 'dutchie' in 'production' crawler mode, or be a known
 * Dutchie store (menu_type 'dutchie' with a platform_dispensary_id). A
 * platform_dispensary_id is required in both cases.
 *
 * @param dispensaryId - Id of the dispensary to crawl.
 * @returns {Promise<{success: boolean, category: string, message: string, data?: object}>}
 *   Result object; crawl errors are reported through `success: false`, not thrown.
 */
async function runCrawlProductsJob(dispensaryId) {
    const category = 'product';
    const provider = 'dutchie';
    const startTime = Date.now();
    const failure = (message) => ({ success: false, category, message });
    const dispensary = await getDispensaryWithCategories(dispensaryId);
    if (!dispensary) {
        return failure(`Dispensary ${dispensaryId} not found`);
    }
    const configuredForProduction = dispensary.product_provider === 'dutchie' && dispensary.product_crawler_mode === 'production';
    const knownDutchieStore = dispensary.menu_type === 'dutchie' && dispensary.platform_dispensary_id;
    if (!configuredForProduction && !knownDutchieStore) {
        return failure('Not a Dutchie dispensary for products');
    }
    if (!dispensary.platform_dispensary_id) {
        return failure('Missing platform_dispensary_id for GraphQL crawl');
    }
    const { crawlerLogger } = crawler_logger_1;
    // Category jobs have no traditional job id, and the dispensary id stands in for the store id.
    const logContext = { job_id: 0, store_id: dispensaryId, store_name: dispensary.name };
    crawlerLogger.jobStarted({
        ...logContext,
        job_type: 'CrawlProductsJob',
        trigger_type: 'category_crawl',
        provider,
    });
    try {
        // Shape expected by the GraphQL crawler; platformDispensaryId is what it queries by.
        const crawlTarget = {
            id: dispensary.id,
            platform: 'dutchie',
            name: dispensary.name,
            slug: dispensary.name.toLowerCase().replace(/[^a-z0-9]+/g, '-'),
            city: '',
            state: 'AZ',
            menuType: dispensary.menu_type || 'dutchie',
            menuUrl: dispensary.menu_url || undefined,
            platformDispensaryId: dispensary.platform_dispensary_id || undefined,
            website: dispensary.website || undefined,
            createdAt: new Date(),
            updatedAt: new Date(),
        };
        const { crawlDispensaryProducts } = product_crawler_1;
        // 'rec' = default to recreational pricing.
        const crawlResult = await crawlDispensaryProducts(crawlTarget, 'rec', { useBothModes: true, downloadImages: true });
        // The scan time is stamped whether or not the crawl reported success.
        await updateCategoryScanTime(dispensaryId, category);
        const durationMs = Date.now() - startTime;
        if (!crawlResult.success) {
            crawlerLogger.jobFailed({
                ...logContext,
                duration_ms: durationMs,
                error_message: crawlResult.errorMessage || 'Unknown error',
                provider,
            });
            return failure(crawlResult.errorMessage || 'GraphQL crawl failed');
        }
        const { productsFound, productsUpserted, snapshotsCreated, modeAProducts, modeBProducts } = crawlResult;
        crawlerLogger.jobCompleted({
            ...logContext,
            duration_ms: durationMs,
            products_found: productsFound,
            products_new: 0, // GraphQL crawler doesn't track new vs updated separately
            products_updated: productsUpserted,
            provider,
        });
        return {
            success: true,
            category,
            message: `GraphQL crawl completed: ${productsUpserted} products, ${snapshotsCreated} snapshots`,
            data: {
                dispensaryId,
                provider,
                durationMs,
                productsFound,
                productsUpserted,
                snapshotsCreated,
                modeAProducts,
                modeBProducts,
            },
        };
    }
    catch (error) {
        crawlerLogger.jobFailed({
            ...logContext,
            duration_ms: Date.now() - startTime,
            error_message: error.message,
            provider,
        });
        return failure(error.message);
    }
}
/**
 * CrawlSpecialsJob - production specials crawling.
 *
 * No specials provider is production-ready yet, so every path resolves to a
 * result object with `success: false`; only the message differs.
 *
 * @param dispensaryId - Id of the dispensary to crawl.
 * @returns {Promise<{success: false, category: 'specials', message: string}>}
 */
async function runCrawlSpecialsJob(dispensaryId) {
    const dispensary = await getDispensaryWithCategories(dispensaryId);
    let message;
    if (!dispensary) {
        message = `Dispensary ${dispensaryId} not found`;
    }
    else if (dispensary.specials_crawler_mode !== 'production') {
        message = 'Specials not in production mode';
    }
    else {
        // Provider-specific specials crawling would be dispatched from here.
        message = `No production crawler for specials provider: ${dispensary.specials_provider}`;
    }
    return { success: false, category: 'specials', message };
}
/**
 * CrawlBrandIntelligenceJob - production brand intelligence crawling.
 *
 * No brand provider is production-ready yet, so every path resolves to a
 * result object with `success: false`; only the message differs.
 *
 * @param dispensaryId - Id of the dispensary to crawl.
 * @returns {Promise<{success: false, category: 'brand', message: string}>}
 */
async function runCrawlBrandIntelligenceJob(dispensaryId) {
    const dispensary = await getDispensaryWithCategories(dispensaryId);
    let message;
    if (!dispensary) {
        message = `Dispensary ${dispensaryId} not found`;
    }
    else if (dispensary.brand_crawler_mode !== 'production') {
        message = 'Brand not in production mode';
    }
    else {
        message = `No production crawler for brand provider: ${dispensary.brand_provider}`;
    }
    return { success: false, category: 'brand', message };
}
/**
 * CrawlMetadataJob - production metadata crawling.
 *
 * No metadata provider is production-ready yet, so every path resolves to a
 * result object with `success: false`; only the message differs.
 *
 * @param dispensaryId - Id of the dispensary to crawl.
 * @returns {Promise<{success: false, category: 'metadata', message: string}>}
 */
async function runCrawlMetadataJob(dispensaryId) {
    const dispensary = await getDispensaryWithCategories(dispensaryId);
    let message;
    if (!dispensary) {
        message = `Dispensary ${dispensaryId} not found`;
    }
    else if (dispensary.metadata_crawler_mode !== 'production') {
        message = 'Metadata not in production mode';
    }
    else {
        message = `No production crawler for metadata provider: ${dispensary.metadata_provider}`;
    }
    return { success: false, category: 'metadata', message };
}
// ========================================
// Sandbox Crawl Jobs
// ========================================
/**
 * SandboxProductsJob - Sandbox product crawling
 * Works with any provider including Treez
 *
 * Steps: load the dispensary, resolve the sandbox row (by `sandboxId`, else the
 * newest open sandbox for the category, else a freshly created one), open the
 * menu URL (falling back to the website) in headless puppeteer, extract
 * products, store the quality metrics and derive the sandbox status from the
 * quality score (>= 70 'ready_for_review', >= 40 'needs_human_review',
 * otherwise 'pending').
 *
 * @param dispensaryId - Id of the dispensary to crawl.
 * @param [sandboxId] - Id of an existing sandbox row to reuse.
 * @returns {Promise<{success: boolean, category: string, message: string, data?: object}>}
 *   Result object. A sandbox that cannot be found is reported as
 *   `success: false` instead of surfacing as a TypeError.
 */
async function runSandboxProductsJob(dispensaryId, sandboxId) {
    const category = 'product';
    const dispensary = await getDispensaryWithCategories(dispensaryId);
    if (!dispensary) {
        return { success: false, category, message: `Dispensary ${dispensaryId} not found` };
    }
    // Get or create sandbox entry
    let sandbox;
    if (sandboxId) {
        const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [sandboxId]);
        sandbox = result.rows[0];
    }
    else {
        const result = await migrate_1.pool.query(`SELECT * FROM crawler_sandboxes
WHERE dispensary_id = $1 AND category = $2 AND status NOT IN ('moved_to_production', 'failed')
ORDER BY created_at DESC LIMIT 1`, [dispensaryId, category]);
        sandbox = result.rows[0];
        if (!sandbox) {
            const newSandboxId = await createCategorySandboxEntry(dispensaryId, category, dispensary.product_provider, null);
            const created = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [newSandboxId]);
            sandbox = created.rows[0];
        }
    }
    // Everything below dereferences sandbox.id (including the catch handler),
    // so a missing row must be reported here rather than crash later.
    if (!sandbox) {
        const message = sandboxId
            ? `Sandbox ${sandboxId} not found`
            : `Sandbox entry for dispensary ${dispensaryId} could not be loaded`;
        return { success: false, category, message };
    }
    const websiteUrl = dispensary.menu_url || dispensary.website;
    if (!websiteUrl) {
        await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = 'No website URL' WHERE id = $1`, [sandbox.id]);
        return { success: false, category, message: 'No website URL available' };
    }
    let browser = null;
    try {
        // Update status
        await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'analyzing', updated_at = NOW() WHERE id = $1`, [sandbox.id]);
        browser = await puppeteer_1.default.launch({
            headless: true,
            args: ['--no-sandbox', '--disable-setuid-sandbox'],
        });
        const page = await browser.newPage();
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');
        // Get provider-specific template if available
        const provider = dispensary.product_provider || 'unknown';
        const template = await getCrawlerTemplate(provider, category, 'sandbox');
        // Treez-specific extraction is used only when a sandbox template exists
        // for it; every other case goes through the generic extractor.
        const extraction = provider === 'treez' && template
            ? await extractTreezProducts(page, websiteUrl)
            : await extractGenericProducts(page, websiteUrl);
        const products = extraction.products;
        const metrics = extraction.metrics;
        // Update sandbox with results
        metrics.sample_data = products.slice(0, 5);
        await updateSandboxQuality(sandbox.id, metrics);
        // Determine final status based on quality
        const status = metrics.quality_score >= 70 ? 'ready_for_review' :
            metrics.quality_score >= 40 ? 'needs_human_review' : 'pending';
        await migrate_1.pool.query(`UPDATE crawler_sandboxes SET
status = $1,
urls_tested = $2,
updated_at = NOW()
WHERE id = $3`, [status, JSON.stringify([websiteUrl]), sandbox.id]);
        // Update scan time
        await updateCategoryScanTime(dispensaryId, category);
        // Log sandbox completion
        crawler_logger_1.crawlerLogger.sandboxEvent({
            event: 'sandbox_completed',
            dispensary_id: dispensaryId,
            dispensary_name: dispensary.name,
            template_name: provider,
            category: 'product',
            quality_score: metrics.quality_score,
            products_extracted: products.length,
            fields_missing: metrics.fields_missing,
            provider: provider,
        });
        return {
            success: true,
            category,
            message: `Sandbox crawl completed. ${products.length} products extracted, quality score ${metrics.quality_score}`,
            data: {
                sandboxId: sandbox.id,
                productsExtracted: products.length,
                qualityScore: metrics.quality_score,
                status,
            },
        };
    }
    catch (error) {
        // Log sandbox failure
        crawler_logger_1.crawlerLogger.sandboxEvent({
            event: 'sandbox_failed',
            dispensary_id: dispensaryId,
            dispensary_name: dispensary.name,
            template_name: dispensary.product_provider || 'unknown',
            category: 'product',
            error_message: error.message,
        });
        await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = $1, error_count = error_count + 1 WHERE id = $2`, [error.message, sandbox.id]);
        return { success: false, category, message: error.message };
    }
    finally {
        // Always release the browser, also on failure.
        if (browser)
            await browser.close();
    }
}
/**
 * SandboxSpecialsJob - Sandbox specials crawling
 *
 * Resolves the sandbox row (by `sandboxId`, or via createCategorySandboxEntry),
 * opens the dispensary website in headless puppeteer, extracts specials,
 * stores the quality metrics and derives the sandbox status from the quality
 * score (>= 70 'ready_for_review', >= 40 'needs_human_review', otherwise 'pending').
 *
 * @param dispensaryId - Id of the dispensary to crawl.
 * @param [sandboxId] - Id of an existing sandbox row to reuse.
 * @returns {Promise<{success: boolean, category: string, message: string, data?: object}>}
 *   Result object. A sandbox that cannot be found is reported as
 *   `success: false` instead of surfacing as a TypeError.
 */
async function runSandboxSpecialsJob(dispensaryId, sandboxId) {
    const category = 'specials';
    const dispensary = await getDispensaryWithCategories(dispensaryId);
    if (!dispensary) {
        return { success: false, category, message: `Dispensary ${dispensaryId} not found` };
    }
    let sandbox;
    if (sandboxId) {
        const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [sandboxId]);
        sandbox = result.rows[0];
    }
    else {
        const newSandboxId = await createCategorySandboxEntry(dispensaryId, category, dispensary.specials_provider, null);
        const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [newSandboxId]);
        sandbox = result.rows[0];
    }
    // Everything below dereferences sandbox.id (including the catch handler),
    // so a missing row must be reported here rather than crash later.
    if (!sandbox) {
        const message = sandboxId
            ? `Sandbox ${sandboxId} not found`
            : `Sandbox entry for dispensary ${dispensaryId} could not be loaded`;
        return { success: false, category, message };
    }
    const websiteUrl = dispensary.website;
    if (!websiteUrl) {
        return { success: false, category, message: 'No website URL available' };
    }
    let browser = null;
    try {
        await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'analyzing', updated_at = NOW() WHERE id = $1`, [sandbox.id]);
        browser = await puppeteer_1.default.launch({
            headless: true,
            args: ['--no-sandbox', '--disable-setuid-sandbox'],
        });
        const page = await browser.newPage();
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');
        const result = await extractSpecials(page, websiteUrl);
        // Keep only a small sample of the extracted specials alongside the metrics.
        await updateSandboxQuality(sandbox.id, {
            ...result.metrics,
            sample_data: result.specials.slice(0, 5),
        });
        const status = result.metrics.quality_score >= 70 ? 'ready_for_review' :
            result.metrics.quality_score >= 40 ? 'needs_human_review' : 'pending';
        await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = $1, updated_at = NOW() WHERE id = $2`, [status, sandbox.id]);
        await updateCategoryScanTime(dispensaryId, category);
        return {
            success: true,
            category,
            message: `Sandbox specials crawl completed. ${result.specials.length} specials found.`,
            data: { sandboxId: sandbox.id, specialsCount: result.specials.length },
        };
    }
    catch (error) {
        await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = $1 WHERE id = $2`, [error.message, sandbox.id]);
        return { success: false, category, message: error.message };
    }
    finally {
        // Always release the browser, also on failure.
        if (browser)
            await browser.close();
    }
}
/**
 * SandboxBrandJob - Sandbox brand intelligence crawling
 *
 * Resolves the sandbox row (by `sandboxId`, or via createCategorySandboxEntry),
 * opens the dispensary website in headless puppeteer, extracts brands, stores
 * the quality metrics and sets the sandbox status to 'ready_for_review' when
 * the quality score is >= 70, otherwise 'pending'.
 *
 * @param dispensaryId - Id of the dispensary to crawl.
 * @param [sandboxId] - Id of an existing sandbox row to reuse.
 * @returns {Promise<{success: boolean, category: string, message: string, data?: object}>}
 *   Result object. A sandbox that cannot be found is reported as
 *   `success: false` instead of surfacing as a TypeError.
 */
async function runSandboxBrandJob(dispensaryId, sandboxId) {
    const category = 'brand';
    const dispensary = await getDispensaryWithCategories(dispensaryId);
    if (!dispensary) {
        return { success: false, category, message: `Dispensary ${dispensaryId} not found` };
    }
    let sandbox;
    if (sandboxId) {
        const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [sandboxId]);
        sandbox = result.rows[0];
    }
    else {
        const newSandboxId = await createCategorySandboxEntry(dispensaryId, category, dispensary.brand_provider, null);
        const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [newSandboxId]);
        sandbox = result.rows[0];
    }
    // Everything below dereferences sandbox.id (including the catch handler),
    // so a missing row must be reported here rather than crash later.
    if (!sandbox) {
        const message = sandboxId
            ? `Sandbox ${sandboxId} not found`
            : `Sandbox entry for dispensary ${dispensaryId} could not be loaded`;
        return { success: false, category, message };
    }
    const websiteUrl = dispensary.website;
    if (!websiteUrl) {
        return { success: false, category, message: 'No website URL available' };
    }
    let browser = null;
    try {
        await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'analyzing', updated_at = NOW() WHERE id = $1`, [sandbox.id]);
        browser = await puppeteer_1.default.launch({
            headless: true,
            args: ['--no-sandbox', '--disable-setuid-sandbox'],
        });
        const page = await browser.newPage();
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');
        const result = await extractBrands(page, websiteUrl);
        // Keep only a small sample of the extracted brands alongside the metrics.
        await updateSandboxQuality(sandbox.id, {
            ...result.metrics,
            sample_data: result.brands.slice(0, 10),
        });
        const status = result.metrics.quality_score >= 70 ? 'ready_for_review' : 'pending';
        await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = $1, updated_at = NOW() WHERE id = $2`, [status, sandbox.id]);
        await updateCategoryScanTime(dispensaryId, category);
        return {
            success: true,
            category,
            message: `Sandbox brand crawl completed. ${result.brands.length} brands found.`,
            data: { sandboxId: sandbox.id, brandsCount: result.brands.length },
        };
    }
    catch (error) {
        await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = $1 WHERE id = $2`, [error.message, sandbox.id]);
        return { success: false, category, message: error.message };
    }
    finally {
        // Always release the browser, also on failure.
        if (browser)
            await browser.close();
    }
}
/**
 * SandboxMetadataJob - Sandbox metadata crawling
 *
 * Resolves the sandbox row (by `sandboxId`, or via createCategorySandboxEntry),
 * opens the dispensary website in headless puppeteer, extracts category
 * metadata, stores the quality metrics and sets the sandbox status to
 * 'ready_for_review' when the quality score is >= 70, otherwise 'pending'.
 *
 * @param dispensaryId - Id of the dispensary to crawl.
 * @param [sandboxId] - Id of an existing sandbox row to reuse.
 * @returns {Promise<{success: boolean, category: string, message: string, data?: object}>}
 *   Result object. A sandbox that cannot be found is reported as
 *   `success: false` instead of surfacing as a TypeError.
 */
async function runSandboxMetadataJob(dispensaryId, sandboxId) {
    const category = 'metadata';
    const dispensary = await getDispensaryWithCategories(dispensaryId);
    if (!dispensary) {
        return { success: false, category, message: `Dispensary ${dispensaryId} not found` };
    }
    let sandbox;
    if (sandboxId) {
        const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [sandboxId]);
        sandbox = result.rows[0];
    }
    else {
        const newSandboxId = await createCategorySandboxEntry(dispensaryId, category, dispensary.metadata_provider, null);
        const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [newSandboxId]);
        sandbox = result.rows[0];
    }
    // Everything below dereferences sandbox.id (including the catch handler),
    // so a missing row must be reported here rather than crash later.
    if (!sandbox) {
        const message = sandboxId
            ? `Sandbox ${sandboxId} not found`
            : `Sandbox entry for dispensary ${dispensaryId} could not be loaded`;
        return { success: false, category, message };
    }
    const websiteUrl = dispensary.website;
    if (!websiteUrl) {
        return { success: false, category, message: 'No website URL available' };
    }
    let browser = null;
    try {
        await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'analyzing', updated_at = NOW() WHERE id = $1`, [sandbox.id]);
        browser = await puppeteer_1.default.launch({
            headless: true,
            args: ['--no-sandbox', '--disable-setuid-sandbox'],
        });
        const page = await browser.newPage();
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');
        const result = await extractMetadata(page, websiteUrl);
        // Keep only a small sample of the extracted categories alongside the metrics.
        await updateSandboxQuality(sandbox.id, {
            ...result.metrics,
            sample_data: result.categories.slice(0, 20),
        });
        const status = result.metrics.quality_score >= 70 ? 'ready_for_review' : 'pending';
        await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = $1, updated_at = NOW() WHERE id = $2`, [status, sandbox.id]);
        await updateCategoryScanTime(dispensaryId, category);
        return {
            success: true,
            category,
            message: `Sandbox metadata crawl completed. ${result.categories.length} categories found.`,
            data: { sandboxId: sandbox.id, categoriesCount: result.categories.length },
        };
    }
    catch (error) {
        await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = $1 WHERE id = $2`, [error.message, sandbox.id]);
        return { success: false, category, message: error.message };
    }
    finally {
        // Always release the browser, also on failure.
        if (browser)
            await browser.close();
    }
}
// ========================================
// Extraction Functions
// ========================================
/**
 * Extract products from Treez-powered sites.
 *
 * Probes a few common menu paths and settles on the first whose page text
 * looks like a menu (falling back to `baseUrl`), loads that URL, waits three
 * seconds for dynamic content, then reads product cards from the first
 * candidate selector that matches more than three elements.
 *
 * Quality score: 0 when nothing was extracted, otherwise
 * 100 - 5 per missing name/price - 10 per error, clamped to [0, 100].
 *
 * @param page - Puppeteer page used for navigation and evaluation.
 * @param {string} baseUrl - Site URL the menu paths are resolved against.
 * @returns {Promise<{products: object[], metrics: object}>} Extracted products
 *   and { quality_score, items_extracted, fields_missing, error_count }.
 */
async function extractTreezProducts(page, baseUrl) {
    const products = [];
    let errorCount = 0;
    let fieldsMissing = 0;
    try {
        // Navigate to menu
        let menuUrl = baseUrl;
        for (const candidatePath of ['/menu', '/shop', '/products', '/order']) {
            try {
                const candidateUrl = new URL(candidatePath, baseUrl).toString();
                await page.goto(candidateUrl, { waitUntil: 'networkidle2', timeout: 20000 });
                const looksLikeMenu = await page.evaluate(() => {
                    const bodyText = document.body.innerText.toLowerCase();
                    return ['add to cart', 'thc', 'indica'].some((needle) => bodyText.includes(needle));
                });
                if (!looksLikeMenu) {
                    continue;
                }
                menuUrl = candidateUrl;
                break;
            }
            catch {
                // Try next URL
            }
        }
        await page.goto(menuUrl, { waitUntil: 'networkidle2', timeout: 30000 });
        await new Promise((resolve) => { setTimeout(resolve, 3000); }); // Wait for dynamic content
        // Runs in the page context, so it must stay self-contained.
        const scraped = await page.evaluate(() => {
            const found = [];
            // Try common Treez selectors
            const candidateSelectors = [
                '.product-card',
                '.menu-item',
                '[data-product]',
                '.product-tile',
                '.menu-product',
            ];
            for (const selector of candidateSelectors) {
                const cards = document.querySelectorAll(selector);
                if (cards.length <= 3) {
                    continue;
                }
                for (const card of cards) {
                    const nameEl = card.querySelector('h2, h3, .product-name, .name, [class*="name"]');
                    if (!nameEl) {
                        continue;
                    }
                    const priceEl = card.querySelector('.price, [class*="price"]');
                    const thcEl = card.querySelector('[class*="thc"], [class*="potency"]');
                    found.push({
                        name: nameEl.textContent?.trim(),
                        price: priceEl?.textContent?.trim(),
                        thc: thcEl?.textContent?.trim(),
                        html: card.outerHTML.slice(0, 500),
                    });
                }
                // Only the first selector with enough matches is used.
                break;
            }
            return found;
        });
        products.push(...scraped);
        // Calculate quality metrics
        for (const { name, price } of products) {
            fieldsMissing += (name ? 0 : 1) + (price ? 0 : 1);
        }
    }
    catch (error) {
        // Error tracked via errorCount - logged at job level
        errorCount += 1;
    }
    let qualityScore = 0;
    if (products.length > 0) {
        const rawScore = 100 - (fieldsMissing * 5) - (errorCount * 10);
        qualityScore = Math.min(100, Math.max(0, rawScore));
    }
    return {
        products,
        metrics: {
            quality_score: qualityScore,
            items_extracted: products.length,
            fields_missing: fieldsMissing,
            error_count: errorCount,
        },
    };
}
/**
 * Extract products using generic selectors.
 *
 * Probes a few common menu paths and stays on the first whose page text looks
 * like a menu; when none does, `baseUrl` itself is loaded. After a two second
 * pause, product cards are read from the first candidate selector matching
 * more than three elements; cards without a non-empty name are skipped.
 *
 * Quality score: 0 when nothing was extracted, otherwise
 * 80 - 3 per missing name/price - 10 per error, clamped to [0, 100].
 *
 * @param page - Puppeteer page used for navigation and evaluation.
 * @param {string} baseUrl - Site URL the menu paths are resolved against.
 * @returns {Promise<{products: object[], metrics: object}>} Extracted products
 *   and { quality_score, items_extracted, fields_missing, error_count }.
 */
async function extractGenericProducts(page, baseUrl) {
    const products = [];
    let errorCount = 0;
    let fieldsMissing = 0;
    try {
        // Try common menu paths
        let foundMenu = false;
        for (const menuPath of ['/menu', '/shop', '/products', '/order']) {
            try {
                const candidateUrl = new URL(menuPath, baseUrl).toString();
                await page.goto(candidateUrl, { waitUntil: 'networkidle2', timeout: 20000 });
                const looksLikeMenu = await page.evaluate(() => {
                    const bodyText = document.body.innerText.toLowerCase();
                    return ['add to cart', 'thc', 'gram'].some((needle) => bodyText.includes(needle));
                });
                if (looksLikeMenu) {
                    foundMenu = true;
                    break;
                }
            }
            catch {
                // Path not reachable; move on to the next candidate.
            }
        }
        if (!foundMenu) {
            await page.goto(baseUrl, { waitUntil: 'networkidle2', timeout: 30000 });
        }
        await new Promise((resolve) => { setTimeout(resolve, 2000); });
        // Runs in the page context, so it must stay self-contained.
        const scraped = await page.evaluate(() => {
            const found = [];
            const candidateSelectors = [
                '.product',
                '.product-card',
                '.menu-item',
                '.item-card',
                '[data-product]',
                '.strain',
                '.listing',
            ];
            for (const selector of candidateSelectors) {
                const cards = document.querySelectorAll(selector);
                if (cards.length <= 3) {
                    continue;
                }
                for (const card of cards) {
                    const nameEl = card.querySelector('h2, h3, h4, .name, .title, [class*="name"]');
                    if (!nameEl?.textContent?.trim()) {
                        continue;
                    }
                    const priceEl = card.querySelector('.price, [class*="price"]');
                    const brandEl = card.querySelector('.brand, [class*="brand"]');
                    const categoryEl = card.querySelector('.category, [class*="category"], [class*="type"]');
                    found.push({
                        name: nameEl.textContent.trim(),
                        price: priceEl?.textContent?.trim(),
                        brand: brandEl?.textContent?.trim(),
                        category: categoryEl?.textContent?.trim(),
                    });
                }
                // Only the first selector with enough matches is used.
                break;
            }
            return found;
        });
        products.push(...scraped);
        // Calculate missing fields
        for (const { name, price } of products) {
            fieldsMissing += (name ? 0 : 1) + (price ? 0 : 1);
        }
    }
    catch (error) {
        // Error tracked via errorCount - logged at job level
        errorCount += 1;
    }
    let qualityScore = 0;
    if (products.length > 0) {
        const rawScore = 80 - (fieldsMissing * 3) - (errorCount * 10);
        qualityScore = Math.min(100, Math.max(0, rawScore));
    }
    return {
        products,
        metrics: {
            quality_score: qualityScore,
            items_extracted: products.length,
            fields_missing: fieldsMissing,
            error_count: errorCount,
        },
    };
}
/**
 * Extract specials/deals.
 *
 * Visits common specials paths in order and stops at the first page that
 * yields at least one special. A path that fails to load is skipped.
 *
 * The candidate selectors overlap (an element with class "special" matches
 * both '.special' and '[class*="special"]'), so each DOM element is processed
 * at most once; otherwise the same special would be reported several times.
 *
 * Quality score: 0 when nothing was extracted, otherwise
 * 70 - 5 per missing field - 10 per error, clamped to [0, 100]. A special
 * counts one missing field when it has neither a description nor a discount.
 *
 * @param page - Puppeteer page used for navigation and evaluation.
 * @param {string} baseUrl - Site URL the specials paths are resolved against.
 * @returns {Promise<{specials: object[], metrics: object}>} Extracted specials
 *   ({ title, description, discount }) and
 *   { quality_score, items_extracted, fields_missing, error_count }.
 */
async function extractSpecials(page, baseUrl) {
    const specials = [];
    let errorCount = 0;
    let fieldsMissing = 0;
    try {
        const specialsPaths = ['/specials', '/deals', '/promotions', '/offers', '/sale'];
        for (const path of specialsPaths) {
            try {
                const fullUrl = new URL(path, baseUrl).toString();
                await page.goto(fullUrl, { waitUntil: 'networkidle2', timeout: 20000 });
                // Runs in the page context, so it must stay self-contained.
                const pageSpecials = await page.evaluate(() => {
                    const extracted = [];
                    // Elements already handled through an earlier (overlapping) selector.
                    const seenElements = new Set();
                    const selectors = [
                        '.special',
                        '.deal',
                        '.promotion',
                        '.offer',
                        '[class*="special"]',
                        '[class*="deal"]',
                    ];
                    for (const selector of selectors) {
                        document.querySelectorAll(selector).forEach((el) => {
                            if (seenElements.has(el)) {
                                return;
                            }
                            seenElements.add(el);
                            const titleEl = el.querySelector('h2, h3, h4, .title, .name');
                            const descEl = el.querySelector('p, .description, .details');
                            const discountEl = el.querySelector('.discount, .savings, [class*="percent"]');
                            if (titleEl?.textContent?.trim()) {
                                extracted.push({
                                    title: titleEl.textContent.trim(),
                                    description: descEl?.textContent?.trim(),
                                    discount: discountEl?.textContent?.trim(),
                                });
                            }
                        });
                    }
                    return extracted;
                });
                specials.push(...pageSpecials);
                if (specials.length > 0)
                    break;
            }
            catch {
                // Path not reachable; try the next one.
                continue;
            }
        }
        for (const special of specials) {
            if (!special.title)
                fieldsMissing++;
            if (!special.description && !special.discount)
                fieldsMissing++;
        }
    }
    catch {
        // Error tracked via errorCount - logged at job level
        errorCount++;
    }
    const qualityScore = specials.length > 0
        ? Math.min(100, Math.max(0, 70 - (fieldsMissing * 5) - (errorCount * 10)))
        : 0;
    return {
        specials,
        metrics: {
            quality_score: qualityScore,
            items_extracted: specials.length,
            fields_missing: fieldsMissing,
            error_count: errorCount,
        },
    };
}
/**
 * Extract brand information.
 *
 * Visits common brand-listing paths in order, collecting brand names from
 * brand-like elements and from select / listbox options, and stops once more
 * than five brands are known. A path that fails to load is skipped.
 *
 * Names are de-duplicated within a page (inside the page callback) and across
 * pages (here), so a brand repeated on several paths is counted once and does
 * not inflate the quality score.
 *
 * Quality score: 0 when nothing was extracted, otherwise
 * 60 + 2 per brand (bonus capped at 30) - 10 per error, clamped to [0, 100].
 *
 * @param page - Puppeteer page used for navigation and evaluation.
 * @param {string} baseUrl - Site URL the brand paths are resolved against.
 * @returns {Promise<{brands: object[], metrics: object}>} Extracted brands
 *   ({ name, source? }) and
 *   { quality_score, items_extracted, fields_missing, error_count };
 *   `fields_missing` is always 0 for brands.
 */
async function extractBrands(page, baseUrl) {
    const brands = [];
    // Brand names collected so far, across all visited pages.
    const knownNames = new Set();
    let errorCount = 0;
    try {
        const brandPaths = ['/brands', '/vendors', '/producers', '/menu'];
        for (const path of brandPaths) {
            try {
                const fullUrl = new URL(path, baseUrl).toString();
                await page.goto(fullUrl, { waitUntil: 'networkidle2', timeout: 20000 });
                // Runs in the page context, so it must stay self-contained.
                const pageBrands = await page.evaluate(() => {
                    const extracted = [];
                    const brandNames = new Set();
                    // Look for brand elements
                    const selectors = [
                        '.brand',
                        '[class*="brand"]',
                        '.vendor',
                        '.producer',
                    ];
                    for (const selector of selectors) {
                        document.querySelectorAll(selector).forEach((el) => {
                            const name = el.textContent?.trim();
                            if (name && name.length > 1 && name.length < 100 && !brandNames.has(name)) {
                                brandNames.add(name);
                                extracted.push({ name });
                            }
                        });
                    }
                    // Also extract from filter dropdowns
                    document.querySelectorAll('select option, [role="option"]').forEach((el) => {
                        const name = el.textContent?.trim();
                        if (name && name.length > 1 && name.length < 100 && !brandNames.has(name)) {
                            const lowerName = name.toLowerCase();
                            // Skip placeholder entries such as "All brands" or "Select...".
                            if (!['all', 'any', 'select', 'choose', '--'].some(skip => lowerName.includes(skip))) {
                                brandNames.add(name);
                                extracted.push({ name, source: 'filter' });
                            }
                        }
                    });
                    return extracted;
                });
                for (const brand of pageBrands) {
                    if (knownNames.has(brand.name)) {
                        continue;
                    }
                    knownNames.add(brand.name);
                    brands.push(brand);
                }
                if (brands.length > 5)
                    break;
            }
            catch {
                // Path not reachable; try the next one.
                continue;
            }
        }
    }
    catch {
        // Error tracked via errorCount - logged at job level
        errorCount++;
    }
    const qualityScore = brands.length > 0
        ? Math.min(100, Math.max(0, 60 + Math.min(30, brands.length * 2) - (errorCount * 10)))
        : 0;
    return {
        brands,
        metrics: {
            quality_score: qualityScore,
            items_extracted: brands.length,
            fields_missing: 0,
            error_count: errorCount,
        },
    };
}
/**
 * Extract metadata (categories, taxonomy)
 *
 * Loads `baseUrl`, then tries `/menu`, `/shop` and `/products` in that order and
 * stays on the first one that loads. On the resulting page it gathers category
 * names from navigation links / tab buttons and from dropdowns whose label
 * mentions "category" or "type".
 *
 * @param {object} page - Browser page handle; only `goto(url, options)` and
 *   `evaluate(fn)` are called on it.
 * @param {string} baseUrl - Site root to load and resolve the menu paths against.
 * @returns {Promise<{categories: Array<{name: string, type: string}>, metrics: {quality_score: number, items_extracted: number, fields_missing: number, error_count: number}}>}
 *   Categories found plus extraction metrics. `quality_score` is 0 when nothing
 *   was found; otherwise 50 plus 3 per category (bonus capped at 40), minus 10
 *   per counted error, clamped to 0..100.
 */
async function extractMetadata(page, baseUrl) {
    const categories = [];
    const fieldsMissing = 0;
    let errorCount = 0;
    try {
        await page.goto(baseUrl, { waitUntil: 'networkidle2', timeout: 30000 });
        // Settle on the first menu-like path that loads; failures just move on.
        for (const candidate of ['/menu', '/shop', '/products']) {
            try {
                const target = new URL(candidate, baseUrl).toString();
                await page.goto(target, { waitUntil: 'networkidle2', timeout: 15000 });
                break;
            }
            catch {
                // try the next candidate path
            }
        }
        // The callback is handed to page.evaluate, so everything it needs is
        // declared inside its own body.
        const found = await page.evaluate(() => {
            const seen = new Set();
            const results = [];
            // Navigation/tab categories
            const navQueries = [
                'nav a',
                '.category-nav a',
                '.menu-categories a',
                '[class*="category"] a',
                '.tabs button',
                '.tab-list button',
            ];
            navQueries.forEach((query) => {
                for (const node of document.querySelectorAll(query)) {
                    const text = node.textContent?.trim();
                    if (!text || text.length <= 1 || text.length >= 50 || seen.has(text)) {
                        continue;
                    }
                    const lowered = text.toLowerCase();
                    const keywords = ['flower', 'edible', 'concentrate', 'vape', 'preroll', 'tincture', 'topical', 'accessory', 'indica', 'sativa', 'hybrid'];
                    // Accept on a keyword hit, or when the element sits inside a
                    // category/menu container.
                    const accepted = keywords.some((word) => lowered.includes(word))
                        || node.closest('[class*="category"], [class*="menu"]');
                    if (accepted) {
                        seen.add(text);
                        results.push({ name: text, type: 'navigation' });
                    }
                }
            });
            // Filter categories
            for (const listbox of document.querySelectorAll('select, [role="listbox"]')) {
                const label = listbox.getAttribute('aria-label') || listbox.previousElementSibling?.textContent?.trim();
                const loweredLabel = label?.toLowerCase();
                const isCategoryFilter = loweredLabel?.includes('category') || loweredLabel?.includes('type');
                if (!isCategoryFilter) {
                    continue;
                }
                for (const option of listbox.querySelectorAll('option, [role="option"]')) {
                    const text = option.textContent?.trim();
                    if (!text || text.length <= 1 || seen.has(text)) {
                        continue;
                    }
                    const lowered = text.toLowerCase();
                    // Skip placeholder entries such as "All" or "Select..."
                    const isPlaceholder = ['all', 'any', 'select', 'choose'].some((skip) => lowered.includes(skip));
                    if (isPlaceholder) {
                        continue;
                    }
                    seen.add(text);
                    results.push({ name: text, type: 'filter' });
                }
            }
            return results;
        });
        categories.push(...found);
    }
    catch (error) {
        // Error tracked via errorCount - logged at job level
        errorCount += 1;
    }
    let qualityScore = 0;
    if (categories.length > 0) {
        const volumeBonus = Math.min(40, categories.length * 3);
        const errorPenalty = errorCount * 10;
        qualityScore = Math.min(100, Math.max(0, 50 + volumeBonus - errorPenalty));
    }
    const metrics = {
        quality_score: qualityScore,
        items_extracted: categories.length,
        fields_missing: fieldsMissing,
        error_count: errorCount,
    };
    return { categories, metrics };
}
// ========================================
// Queue Processing Functions
// ========================================
/**
 * Process pending category-specific sandbox jobs
 *
 * Claims up to `limit` pending jobs of the given category whose `scheduled_at`
 * has passed (highest priority first, then oldest), marks them `running` for
 * this worker, and runs them one at a time. Each job row is then updated to
 * `completed` or `failed`, with `completed_at` set in every outcome, including
 * when the job function throws.
 *
 * @param {string} category - One of 'product', 'specials', 'brand', 'metadata';
 *   any other value causes each claimed job to be marked failed.
 * @param {number} [limit=5] - Maximum number of jobs to claim in this call.
 * @returns {Promise<void>}
 */
async function processCategorySandboxJobs(category, limit = 5) {
    // Claim and mark in one statement; SKIP LOCKED passes over rows that
    // another transaction currently holds locked.
    const jobs = await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs
SET status = 'running', worker_id = $1, started_at = NOW()
WHERE id IN (
SELECT id FROM sandbox_crawl_jobs
WHERE status = 'pending' AND category = $2 AND scheduled_at <= NOW()
ORDER BY priority DESC, scheduled_at ASC
LIMIT $3
FOR UPDATE SKIP LOCKED
)
RETURNING *`, [WORKER_ID, category, limit]);
    for (const job of jobs.rows) {
        try {
            let result;
            switch (category) {
                case 'product':
                    result = await runSandboxProductsJob(job.dispensary_id, job.sandbox_id);
                    break;
                case 'specials':
                    result = await runSandboxSpecialsJob(job.dispensary_id, job.sandbox_id);
                    break;
                case 'brand':
                    result = await runSandboxBrandJob(job.dispensary_id, job.sandbox_id);
                    break;
                case 'metadata':
                    result = await runSandboxMetadataJob(job.dispensary_id, job.sandbox_id);
                    break;
                default:
                    result = { success: false, category, message: `Unknown category: ${category}` };
            }
            await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs
SET status = $1, completed_at = NOW(), result_summary = $2, error_message = $3
WHERE id = $4`, [
                result.success ? 'completed' : 'failed',
                JSON.stringify(result.data || {}),
                result.success ? null : result.message,
                job.id,
            ]);
        }
        catch (error) {
            // A thrown value is not guaranteed to be an Error; fall back to its
            // string form so the stored message is never empty.
            const reason = error?.message || String(error);
            // Stamp completed_at here too, matching the non-throwing failure path.
            await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs SET status = 'failed', completed_at = NOW(), error_message = $1 WHERE id = $2`, [reason, job.id]);
        }
    }
}
/**
 * Run all category production crawls for a dispensary
 * Each category runs independently - failures don't affect others
 *
 * @param {*} dispensaryId - Passed unchanged to each category job.
 * @returns {Promise<{results: Array<object>, summary: string}>} One result per
 *   category in the order product, specials, brand, metadata. A rejected job is
 *   represented as `{ success: false, category, message }`. `summary` is a
 *   one-line tally of which categories succeeded.
 */
async function runAllCategoryProductionCrawls(dispensaryId) {
    // Labels line up index-for-index with the jobs started below.
    const categoryLabels = ['product', 'specials', 'brand', 'metadata'];
    // Run all categories in parallel - independent failures
    const outcomes = await Promise.allSettled([
        runCrawlProductsJob(dispensaryId),
        runCrawlSpecialsJob(dispensaryId),
        runCrawlBrandIntelligenceJob(dispensaryId),
        runCrawlMetadataJob(dispensaryId),
    ]);
    const results = outcomes.map((outcome, index) => {
        if (outcome.status === 'fulfilled') {
            return outcome.value;
        }
        return {
            success: false,
            category: categoryLabels[index],
            message: outcome.reason?.message || 'Unknown error',
        };
    });
    let successCount = 0;
    const statusParts = [];
    for (const entry of results) {
        if (entry.success) {
            successCount += 1;
        }
        statusParts.push(`${entry.category}:${entry.success ? 'ok' : 'fail'}`);
    }
    const summary = `${successCount}/4 categories succeeded: ${statusParts.join(', ')}`;
    // Individual category jobs log their own completion via crawlerLogger
    return { results, summary };
}
/**
 * Run all category sandbox crawls for a dispensary
 *
 * Starts the four sandbox jobs together and waits for all of them to settle,
 * so one rejected job does not prevent the others from reporting.
 *
 * @param {*} dispensaryId - Passed unchanged (as the only argument) to each sandbox job.
 * @returns {Promise<{results: Array<object>, summary: string}>} One result per
 *   category in the order product, specials, brand, metadata. A rejected job is
 *   represented as `{ success: false, category, message }`. `summary` is a
 *   one-line tally of which sandbox crawls succeeded.
 */
async function runAllCategorySandboxCrawls(dispensaryId) {
    // Turns a settled outcome into a result object, synthesising a failure
    // entry for the given category when the job rejected.
    const normalise = (outcome, category) => (outcome.status === 'fulfilled'
        ? outcome.value
        : { success: false, category, message: outcome.reason?.message || 'Unknown error' });
    const [productOutcome, specialsOutcome, brandOutcome, metadataOutcome] = await Promise.allSettled([
        runSandboxProductsJob(dispensaryId),
        runSandboxSpecialsJob(dispensaryId),
        runSandboxBrandJob(dispensaryId),
        runSandboxMetadataJob(dispensaryId),
    ]);
    const results = [
        normalise(productOutcome, 'product'),
        normalise(specialsOutcome, 'specials'),
        normalise(brandOutcome, 'brand'),
        normalise(metadataOutcome, 'metadata'),
    ];
    const successCount = results.reduce((count, entry) => (entry.success ? count + 1 : count), 0);
    const breakdown = results
        .map((entry) => `${entry.category}:${entry.success ? 'ok' : 'fail'}`)
        .join(', ');
    const summary = `${successCount}/4 sandbox crawls: ${breakdown}`;
    // Individual sandbox jobs log their own completion via crawlerLogger
    return { results, summary };
}