The job_run_logs table tracks scheduled job orchestration, not individual worker jobs. Worker info (worker_id, worker_hostname) belongs on dispensary_crawl_jobs, not job_run_logs. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
494 lines
17 KiB
JavaScript
494 lines
17 KiB
JavaScript
"use strict";
|
|
/**
 * Multi-Category Intelligence Detector
 *
 * Detects providers for each intelligence category independently:
 * - Products: Which provider serves product data
 * - Specials: Which provider serves deals/specials
 * - Brand: Which provider serves brand information
 * - Metadata: Which provider serves taxonomy/category data
 */
|
|
// Module interop helper (it has the shape of the helper the TypeScript
// compiler emits). An already-installed `__importDefault` on `this` is reused;
// otherwise a module carrying the `__esModule` marker is returned untouched
// and anything else is wrapped so it can be read through `.default`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
|
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
exports.detectMultiCategoryProviders = detectMultiCategoryProviders;
|
|
exports.detectCategoryProviderChange = detectCategoryProviderChange;
|
|
exports.updateDispensaryCategoryProvider = updateDispensaryCategoryProvider;
|
|
exports.updateAllCategoryProviders = updateAllCategoryProviders;
|
|
exports.moveCategoryToSandbox = moveCategoryToSandbox;
|
|
const migrate_1 = require("../db/migrate");
|
|
const logger_1 = require("./logger");
|
|
const puppeteer_1 = __importDefault(require("puppeteer"));
|
|
// Providers that are cleared for production crawling, keyed by intelligence
// category. A provider/category pair that is not listed here always stays in
// sandbox mode, whatever the detection confidence is.
const PRODUCTION_READY = {
    // Dutchie is the only provider with a production-ready product crawler.
    product: ['dutchie'],
    // Nothing is production-ready for the remaining categories yet.
    specials: [],
    brand: [],
    metadata: [],
};
|
|
// Provider detection patterns
|
|
const PROVIDER_PATTERNS = {
|
|
dutchie: {
|
|
scripts: [
|
|
/dutchie\.com/i,
|
|
/dutchie-plus/i,
|
|
/dutchie\.js/i,
|
|
/__DUTCHIE__/i,
|
|
/dutchie-embed/i,
|
|
],
|
|
iframes: [
|
|
/dutchie\.com/i,
|
|
/dutchie-plus\.com/i,
|
|
/embed\.dutchie/i,
|
|
],
|
|
html: [
|
|
/class="dutchie/i,
|
|
/id="dutchie/i,
|
|
/data-dutchie/i,
|
|
/"menuType":\s*"dutchie"/i,
|
|
],
|
|
apiEndpoints: [
|
|
/dutchie\.com\/graphql/i,
|
|
/plus\.dutchie\.com/i,
|
|
],
|
|
metaTags: [
|
|
/dutchie/i,
|
|
],
|
|
},
|
|
treez: {
|
|
scripts: [
|
|
/treez\.io/i,
|
|
/treez-ecommerce/i,
|
|
/treez\.js/i,
|
|
],
|
|
iframes: [
|
|
/treez\.io/i,
|
|
/shop\.treez/i,
|
|
],
|
|
html: [
|
|
/class="treez/i,
|
|
/data-treez/i,
|
|
/treez-menu/i,
|
|
],
|
|
apiEndpoints: [
|
|
/api\.treez\.io/i,
|
|
/treez\.io\/api/i,
|
|
],
|
|
metaTags: [],
|
|
},
|
|
jane: {
|
|
scripts: [
|
|
/jane\.co/i,
|
|
/iheartjane\.com/i,
|
|
/jane-frame/i,
|
|
/jane\.js/i,
|
|
],
|
|
iframes: [
|
|
/jane\.co/i,
|
|
/iheartjane\.com/i,
|
|
/embed\.iheartjane/i,
|
|
],
|
|
html: [
|
|
/class="jane/i,
|
|
/data-jane/i,
|
|
/jane-embed/i,
|
|
],
|
|
apiEndpoints: [
|
|
/api\.iheartjane/i,
|
|
/jane\.co\/api/i,
|
|
],
|
|
metaTags: [],
|
|
},
|
|
weedmaps: {
|
|
scripts: [
|
|
/weedmaps\.com/i,
|
|
/wm-menu/i,
|
|
],
|
|
iframes: [
|
|
/weedmaps\.com/i,
|
|
/menu\.weedmaps/i,
|
|
],
|
|
html: [
|
|
/data-weedmaps/i,
|
|
/wm-menu/i,
|
|
],
|
|
apiEndpoints: [
|
|
/api-g\.weedmaps/i,
|
|
/weedmaps\.com\/api/i,
|
|
],
|
|
metaTags: [],
|
|
},
|
|
leafly: {
|
|
scripts: [
|
|
/leafly\.com/i,
|
|
/leafly-menu/i,
|
|
],
|
|
iframes: [
|
|
/leafly\.com/i,
|
|
/order\.leafly/i,
|
|
],
|
|
html: [
|
|
/data-leafly/i,
|
|
/leafly-embed/i,
|
|
],
|
|
apiEndpoints: [
|
|
/api\.leafly/i,
|
|
],
|
|
metaTags: [],
|
|
},
|
|
};
|
|
// Hints that a page carries data for a given intelligence category:
//   urlPatterns  - page paths considered relevant to the category
//   htmlPatterns - markup fragments looked for on those relevant pages
//   jsonKeys     - property names looked for in embedded JSON blocks
const CATEGORY_SIGNALS = {
    product: {
        urlPatterns: [
            /\/menu/i,
            /\/products/i,
            /\/shop/i,
            /\/order/i,
        ],
        htmlPatterns: [
            /product-card/i,
            /menu-item/i,
            /product-list/i,
            /product-grid/i,
        ],
        jsonKeys: ['products', 'menuItems', 'items', 'inventory'],
    },
    specials: {
        urlPatterns: [
            /\/specials/i,
            /\/deals/i,
            /\/promotions/i,
            /\/offers/i,
        ],
        htmlPatterns: [
            /special/i,
            /deal/i,
            /promotion/i,
            /discount/i,
            /sale/i,
        ],
        jsonKeys: ['specials', 'deals', 'promotions', 'offers'],
    },
    brand: {
        urlPatterns: [
            /\/brands/i,
            /\/vendors/i,
            /\/producers/i,
        ],
        htmlPatterns: [
            /brand-list/i,
            /vendor/i,
            /producer/i,
            /manufacturer/i,
        ],
        jsonKeys: ['brands', 'vendors', 'producers', 'manufacturers'],
    },
    metadata: {
        urlPatterns: [
            /\/categories/i,
            /\/taxonomy/i,
        ],
        htmlPatterns: [
            /category-nav/i,
            /menu-categories/i,
            /filter-category/i,
        ],
        jsonKeys: ['categories', 'taxonomy', 'filters', 'types'],
    },
};
|
|
// ========================================
// Main Detection Function
// ========================================
|
|
/**
 * Loads a website in a browser and works out, per intelligence category,
 * which provider appears to serve it.
 *
 * The home page is visited first, followed by a fixed list of common menu
 * paths. Signals gathered from every page that loads are then analysed once
 * for each category.
 *
 * @param {string} websiteUrl - Site to inspect (passed through normalizeUrl).
 * @param {object} [options]
 * @param {number} [options.timeout=30000] - Navigation timeout in ms for the
 *   home page; the menu paths always use 15000 ms.
 * @param {boolean} [options.headless=true] - Forwarded to the browser launch.
 * @param {object} [options.existingBrowser] - Browser to reuse. When given it
 *   is not closed here; only the page opened by this call is.
 * @returns {Promise} Resolves to an object with `product`, `specials`,
 *   `brand` and `metadata` results plus `urlsTested` and `rawSignals`. If
 *   anything outside the per-path probing fails, every category holds the
 *   unknown result and `rawSignals` is `{ error: message }`.
 */
async function detectMultiCategoryProviders(websiteUrl, options = {}) {
    const { timeout = 30000, headless = true, existingBrowser } = options;
    const categories = ['product', 'specials', 'brand', 'metadata'];
    const candidatePaths = ['/menu', '/shop', '/products', '/order', '/specials', '/deals', '/brands'];
    const urlsTested = [];
    const rawSignals = {};
    let browser = null;
    let page = null;
    try {
        if (existingBrowser) {
            browser = existingBrowser;
        } else {
            browser = await puppeteer_1.default.launch({
                headless,
                args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
            });
        }
        page = await browser.newPage();
        await page.setViewport({ width: 1920, height: 1080 });
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');

        // Home page first: a failure here aborts the whole detection.
        const baseUrl = normalizeUrl(websiteUrl);
        urlsTested.push(baseUrl);
        await page.goto(baseUrl, { waitUntil: 'networkidle2', timeout });
        rawSignals.mainPage = await collectPageSignals(page);

        // Probe the usual menu locations; each one is best-effort.
        for (const path of candidatePaths) {
            try {
                const target = new URL(path, baseUrl).toString();
                urlsTested.push(target);
                await page.goto(target, { waitUntil: 'networkidle2', timeout: 15000 });
                rawSignals[path] = await collectPageSignals(page);
            } catch {
                // URL doesn't exist or timed out
            }
        }

        // Run the analysis once per category over everything collected.
        const result = {};
        for (const category of categories) {
            result[category] = analyzeCategorySignals(category, rawSignals);
        }
        result.urlsTested = urlsTested;
        result.rawSignals = rawSignals;
        logger_1.logger.info('provider-detection', `Multi-category detection complete for ${websiteUrl}`);
        return result;
    } catch (error) {
        logger_1.logger.error('provider-detection', `Detection failed for ${websiteUrl}: ${error.message}`);
        // Fall back to an unknown result for every category.
        const fallback = {};
        for (const category of categories) {
            fallback[category] = createUnknownResult();
        }
        fallback.urlsTested = urlsTested;
        fallback.rawSignals = { error: error.message };
        return fallback;
    } finally {
        const ownsBrowser = !existingBrowser;
        if (page) {
            await page.close().catch(() => { });
        }
        // A caller-supplied browser stays open for the caller to manage.
        if (browser && ownsBrowser) {
            await browser.close().catch(() => { });
        }
    }
}
|
|
// ========================================
// Helper Functions
// ========================================
|
|
/**
 * Turns a user-supplied website address into an absolute URL string.
 *
 * - Surrounding whitespace is removed.
 * - `https://` is prepended unless the value already starts with an
 *   `http://` or `https://` scheme (checked case-insensitively). The full
 *   `scheme://` prefix is required, so a host that merely begins with the
 *   letters "http" (e.g. `httpbin.org`) still gets a scheme.
 * - A protocol-relative value (`//host/path`) is promoted to `https://host/path`.
 * - A single trailing slash is dropped.
 *
 * @param {string} url - Address as entered; must be a string.
 * @returns {string} The normalized URL.
 */
function normalizeUrl(url) {
    const trimmed = url.trim();
    const hasHttpScheme = /^https?:\/\//i.test(trimmed);
    const absolute = hasHttpScheme
        ? trimmed
        : `https://${trimmed.replace(/^\/\//, '')}`;
    return absolute.replace(/\/$/, '');
}
|
|
/**
 * Gathers the raw detection signals from the page that is currently loaded.
 *
 * The callback runs inside the page via `page.evaluate`, so it only uses
 * names it defines itself plus the page's own `document`.
 *
 * @param {object} page - Object exposing `evaluate(fn)`.
 * @returns {Promise} Resolves to an object with:
 *   `scripts` (external script URLs followed by `inline:`-prefixed excerpts of
 *   inline scripts shorter than 5000 characters, cut to 500 characters),
 *   `iframes`, `links`, `metaTags` (`name:content` strings),
 *   `bodyClasses`, `bodyId`, `htmlSnippet` (first 10000 characters of the
 *   document markup) and `jsonBlocks` (first 2000 characters of each
 *   `application/json` script).
 */
async function collectPageSignals(page) {
    return page.evaluate(() => {
        const selectAll = (selector) => Array.from(document.querySelectorAll(selector));

        const externalScripts = selectAll('script[src]').map((node) => node.src);
        // Very large inline scripts are skipped entirely; the rest are excerpted.
        const inlineScripts = selectAll('script:not([src])')
            .map((node) => node.textContent || '')
            .filter((code) => code.length < 5000)
            .map((code) => `inline:${code.slice(0, 500)}`);

        const metaTags = [];
        for (const node of selectAll('meta')) {
            const content = node.getAttribute('content') || '';
            const name = node.getAttribute('name') || node.getAttribute('property') || '';
            if (content || name) {
                metaTags.push(`${name}:${content}`);
            }
        }

        return {
            scripts: [...externalScripts, ...inlineScripts],
            iframes: selectAll('iframe').map((node) => node.src),
            links: selectAll('a[href]').map((node) => node.href),
            metaTags,
            bodyClasses: document.body?.className || '',
            bodyId: document.body?.id || '',
            htmlSnippet: document.documentElement.outerHTML.slice(0, 10000),
            jsonBlocks: selectAll('script[type="application/json"]')
                .map((node) => node.textContent?.slice(0, 2000) || ''),
        };
    });
}
|
|
/**
 * Scores every known provider against the collected page signals and builds
 * the detection result for one intelligence category.
 *
 * Scoring (summed over all pages): +20 per script/pattern match, +25 per
 * iframe/pattern match, +15 per HTML pattern found in the page snippet. The
 * provider scores themselves do not depend on `category`; the category only
 * contributes extra evidence flags in `signals` and decides the crawl mode and
 * template.
 *
 * @param {string} category - One of the keys of CATEGORY_SIGNALS.
 * @param {object} allSignals - Map of page key (e.g. `mainPage`, `/menu`) to
 *   the signal object collected for that page. Entries that are not objects
 *   are ignored.
 * @returns {{provider: string, confidence: number, mode: string, signals: object, templateName: (string|undefined)}}
 *   `provider` is `'unknown'` when nothing scored above zero; `confidence` is
 *   the winning score capped at 100; `mode` is `'production'` only for a
 *   production-ready provider with confidence of at least 70.
 */
function analyzeCategorySignals(category, allSignals) {
    const providerScores = {};
    const detectedSignals = {};
    for (const provider of Object.keys(PROVIDER_PATTERNS)) {
        providerScores[provider] = 0;
    }
    const categorySignals = CATEGORY_SIGNALS[category];

    for (const [pagePath, signals] of Object.entries(allSignals)) {
        if (!signals || typeof signals !== 'object') {
            continue;
        }

        // Provider fingerprints: each hit adds to that provider's score.
        for (const [provider, patterns] of Object.entries(PROVIDER_PATTERNS)) {
            let score = 0;
            if (signals.scripts) {
                for (const script of signals.scripts) {
                    for (const pattern of patterns.scripts) {
                        if (pattern.test(script)) {
                            score += 20;
                            detectedSignals[`${provider}_script_${pagePath}`] = script;
                        }
                    }
                }
            }
            if (signals.iframes) {
                for (const iframe of signals.iframes) {
                    for (const pattern of patterns.iframes) {
                        if (pattern.test(iframe)) {
                            score += 25;
                            detectedSignals[`${provider}_iframe_${pagePath}`] = iframe;
                        }
                    }
                }
            }
            if (signals.htmlSnippet) {
                for (const pattern of patterns.html) {
                    if (pattern.test(signals.htmlSnippet)) {
                        score += 15;
                        detectedSignals[`${provider}_html_${pagePath}`] = true;
                    }
                }
            }
            providerScores[provider] += score;
        }

        // Category markup is only meaningful on pages whose path matches the category.
        const isRelevantPage = categorySignals.urlPatterns.some((p) => p.test(pagePath));
        if (isRelevantPage && signals.htmlSnippet) {
            for (const pattern of categorySignals.htmlPatterns) {
                if (pattern.test(signals.htmlSnippet)) {
                    detectedSignals[`${category}_html_pattern`] = true;
                }
            }
        }

        // Embedded JSON: look for category property names, ignoring case.
        if (signals.jsonBlocks) {
            for (const json of signals.jsonBlocks) {
                const haystack = json.toLowerCase();
                for (const key of categorySignals.jsonKeys) {
                    // Both sides must be lower-cased; otherwise mixed-case keys
                    // such as `menuItems` could never match the lowered text.
                    if (haystack.includes(`"${key.toLowerCase()}"`)) {
                        detectedSignals[`${category}_json_key_${key}`] = true;
                    }
                }
            }
        }
    }

    // Highest score wins; ties keep the provider listed first.
    let bestProvider = 'unknown';
    let bestScore = 0;
    for (const [provider, score] of Object.entries(providerScores)) {
        if (score > bestScore) {
            bestScore = score;
            bestProvider = provider;
        }
    }

    const confidence = Math.min(100, bestScore);
    const isProductionReady = PRODUCTION_READY[category].includes(bestProvider);
    const mode = isProductionReady && confidence >= 70
        ? 'production'
        : 'sandbox';

    let templateName;
    if (bestProvider === 'dutchie' && category === 'product') {
        templateName = 'dutchie_standard';
    }
    else if (bestProvider === 'treez') {
        // NOTE(review): this products template is assigned for every category,
        // not just `product` - confirm that is intended.
        templateName = 'treez_products_v0';
    }

    return {
        provider: bestProvider,
        confidence,
        mode,
        signals: detectedSignals,
        templateName,
    };
}
|
|
/**
 * Builds the placeholder result used when detection could not run: no
 * provider, zero confidence, sandbox mode and no signals.
 *
 * @returns {{provider: string, confidence: number, mode: string, signals: object}}
 *   A new object on every call.
 */
function createUnknownResult() {
    const placeholder = {};
    placeholder.provider = 'unknown';
    placeholder.confidence = 0;
    placeholder.mode = 'sandbox';
    placeholder.signals = {};
    return placeholder;
}
|
|
// ========================================
// Lightweight Per-Category Change Detection
// ========================================
|
|
/**
 * Checks whether the page that is already loaded now points at a different
 * provider than the one expected for a category.
 *
 * Only the current page is inspected. A change is reported when the detected
 * provider differs from `expectedProvider` and the confidence is above 50.
 *
 * @param {object} page - Page handed to collectPageSignals.
 * @param {string} category - Intelligence category being checked.
 * @param {string} expectedProvider - Provider currently on record.
 * @returns {Promise} Resolves to `{ changed: true, newProvider, confidence }`
 *   on a confident mismatch, otherwise `{ changed: false }`. Errors are
 *   logged and also reported as `{ changed: false }`.
 */
async function detectCategoryProviderChange(page, category, expectedProvider) {
    try {
        const pageSignals = await collectPageSignals(page);
        const detection = analyzeCategorySignals(category, { currentPage: pageSignals });
        const isConfidentMismatch = detection.provider !== expectedProvider && detection.confidence > 50;
        if (!isConfidentMismatch) {
            return { changed: false };
        }
        logger_1.logger.warn('provider-detection', `Provider change detected for ${category}: ${expectedProvider} -> ${detection.provider}`);
        return {
            changed: true,
            newProvider: detection.provider,
            confidence: detection.confidence,
        };
    } catch (error) {
        // A failed check must not be mistaken for a provider change.
        logger_1.logger.error('provider-detection', `Change detection failed: ${error.message}`);
        return { changed: false };
    }
}
|
|
// ========================================
// Database Operations
// ========================================
|
|
/**
 * Stores the detection result for a single category on a dispensary row.
 *
 * @param {number|string} dispensaryId - Value matched against `dispensaries.id`.
 * @param {string} category - `product`, `specials` or `brand`; any other value
 *   is written to the `metadata_*` columns.
 * @param {object} result - Detection result; `provider`, `confidence`, `mode`
 *   and `signals` (serialized as JSON) are persisted.
 * @returns {Promise} Resolves once the UPDATE has completed.
 */
async function updateDispensaryCategoryProvider(dispensaryId, category, result) {
    // The prefix is interpolated into the SQL text, so it is restricted to the
    // four known column families instead of using `category` blindly.
    const namedPrefixes = ['product', 'specials', 'brand'];
    const columnPrefix = namedPrefixes.includes(category) ? category : 'metadata';
    const sql = `UPDATE dispensaries SET
      ${columnPrefix}_provider = $1,
      ${columnPrefix}_confidence = $2,
      ${columnPrefix}_crawler_mode = $3,
      ${columnPrefix}_detection_data = $4,
      updated_at = NOW()
    WHERE id = $5`;
    const values = [
        result.provider,
        result.confidence,
        result.mode,
        JSON.stringify(result.signals),
        dispensaryId,
    ];
    await migrate_1.pool.query(sql, values);
}
|
|
/**
 * Stores the detection results of all four categories on a dispensary row in
 * a single UPDATE.
 *
 * @param {number|string} dispensaryId - Value matched against `dispensaries.id`.
 * @param {object} result - Object with `product`, `specials`, `brand` and
 *   `metadata` results; each contributes `provider`, `confidence`, `mode` and
 *   JSON-serialized `signals`.
 * @returns {Promise} Resolves once the UPDATE has completed.
 */
async function updateAllCategoryProviders(dispensaryId, result) {
    const sql = `UPDATE dispensaries SET
      product_provider = $1,
      product_confidence = $2,
      product_crawler_mode = $3,
      product_detection_data = $4,
      specials_provider = $5,
      specials_confidence = $6,
      specials_crawler_mode = $7,
      specials_detection_data = $8,
      brand_provider = $9,
      brand_confidence = $10,
      brand_crawler_mode = $11,
      brand_detection_data = $12,
      metadata_provider = $13,
      metadata_confidence = $14,
      metadata_crawler_mode = $15,
      metadata_detection_data = $16,
      updated_at = NOW()
    WHERE id = $17`;
    // Placeholders $1-$16 are four values per category, in this category order.
    const categoryOrder = ['product', 'specials', 'brand', 'metadata'];
    const values = categoryOrder.flatMap((category) => {
        const detection = result[category];
        return [
            detection.provider,
            detection.confidence,
            detection.mode,
            JSON.stringify(detection.signals),
        ];
    });
    values.push(dispensaryId);
    await migrate_1.pool.query(sql, values);
}
|
|
/**
 * Switches one category of a dispensary back to sandbox crawling and records
 * why and when in that category's detection data.
 *
 * @param {number|string} dispensaryId - Value matched against `dispensaries.id`.
 * @param {string} category - `product`, `specials` or `brand`; any other value
 *   targets the `metadata_*` columns.
 * @param {string} reason - Stored as `sandbox_reason` and included in the log line.
 * @returns {Promise} Resolves once the row is updated and the move is logged.
 */
async function moveCategoryToSandbox(dispensaryId, category, reason) {
    // The prefix is interpolated into the SQL text, so only the four known
    // column families are ever used.
    const columnPrefix = category === 'product' ? 'product' :
        category === 'specials' ? 'specials' :
            category === 'brand' ? 'brand' : 'metadata';
    const sandboxNote = JSON.stringify({
        sandbox_reason: reason,
        sandbox_at: new Date().toISOString(),
    });
    // COALESCE matters: `NULL || jsonb` is NULL, so without it a row that has
    // no detection data yet would silently lose the sandbox reason.
    await migrate_1.pool.query(`UPDATE dispensaries SET
      ${columnPrefix}_crawler_mode = 'sandbox',
      ${columnPrefix}_detection_data = COALESCE(${columnPrefix}_detection_data, '{}'::jsonb) || $1::jsonb,
      updated_at = NOW()
    WHERE id = $2`, [
        sandboxNote,
        dispensaryId,
    ]);
    logger_1.logger.info('provider-detection', `Moved dispensary ${dispensaryId} ${category} to sandbox: ${reason}`);
}
|