fix(monitor): remove non-existent worker columns from job_run_logs query
The job_run_logs table tracks scheduled job orchestration, not individual worker jobs. Worker info (worker_id, worker_hostname) belongs on dispensary_crawl_jobs, not job_run_logs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
612
backend/dist/services/menu-provider-detector.js
vendored
Normal file
612
backend/dist/services/menu-provider-detector.js
vendored
Normal file
@@ -0,0 +1,612 @@
|
||||
"use strict";
|
||||
/**
|
||||
* Menu Provider Detection Service
|
||||
*
|
||||
* Detects which menu platform a dispensary is using by analyzing:
|
||||
* - HTML content patterns (scripts, iframes, classes)
|
||||
* - URL patterns (embedded menu paths)
|
||||
* - API endpoint signatures
|
||||
* - Meta tags and headers
|
||||
*/
|
||||
// Interop helper in the form emitted by the TypeScript compiler for default
// imports: a module flagged with `__esModule` is returned untouched, anything
// else is wrapped so that it can be read through `.default`.
// An already-installed `this.__importDefault` is reused when present.
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.detectMenuProvider = detectMenuProvider;
|
||||
exports.quickDutchieCheck = quickDutchieCheck;
|
||||
exports.detectProviderChange = detectProviderChange;
|
||||
const puppeteer_1 = __importDefault(require("puppeteer"));
|
||||
const logger_1 = require("./logger");
|
||||
// Provider detection patterns
|
||||
const PROVIDER_PATTERNS = {
|
||||
dutchie: {
|
||||
scripts: [
|
||||
/dutchie/i,
|
||||
/dutchie-plus/i,
|
||||
/dutchie\.com/i,
|
||||
/dutchie-embed/i,
|
||||
],
|
||||
iframes: [
|
||||
/dutchie\.com/i,
|
||||
/embed\.dutchie/i,
|
||||
/iframe\.dutchie/i,
|
||||
],
|
||||
classes: [
|
||||
/dutchie-/i,
|
||||
/DutchieEmbed/i,
|
||||
],
|
||||
urls: [
|
||||
/dutchie\.com/i,
|
||||
/\.dutchie\./i,
|
||||
],
|
||||
meta: [
|
||||
/dutchie/i,
|
||||
],
|
||||
apiEndpoints: [
|
||||
/graphql.*dutchie/i,
|
||||
/api\.dutchie/i,
|
||||
],
|
||||
htmlPatterns: [
|
||||
/data-dutchie/i,
|
||||
/__DUTCHIE__/i,
|
||||
/dutchie-plus-iframe/i,
|
||||
],
|
||||
},
|
||||
treez: {
|
||||
scripts: [
|
||||
/treez/i,
|
||||
/treez\.io/i,
|
||||
/treezpay/i,
|
||||
],
|
||||
iframes: [
|
||||
/treez\.io/i,
|
||||
/menu\.treez/i,
|
||||
],
|
||||
classes: [
|
||||
/treez-/i,
|
||||
],
|
||||
urls: [
|
||||
/treez\.io/i,
|
||||
/\.treez\./i,
|
||||
],
|
||||
meta: [
|
||||
/treez/i,
|
||||
],
|
||||
apiEndpoints: [
|
||||
/api\.treez/i,
|
||||
],
|
||||
htmlPatterns: [
|
||||
/data-treez/i,
|
||||
/treez-embed/i,
|
||||
],
|
||||
},
|
||||
jane: {
|
||||
scripts: [
|
||||
/jane\.co/i,
|
||||
/iheartjane/i,
|
||||
/jane-embed/i,
|
||||
/janetechnologies/i,
|
||||
],
|
||||
iframes: [
|
||||
/jane\.co/i,
|
||||
/iheartjane\.com/i,
|
||||
/menu\.jane/i,
|
||||
],
|
||||
classes: [
|
||||
/jane-/i,
|
||||
/iheartjane/i,
|
||||
],
|
||||
urls: [
|
||||
/jane\.co/i,
|
||||
/iheartjane\.com/i,
|
||||
],
|
||||
meta: [
|
||||
/jane/i,
|
||||
/iheartjane/i,
|
||||
],
|
||||
apiEndpoints: [
|
||||
/api\.iheartjane/i,
|
||||
/api\.jane\.co/i,
|
||||
],
|
||||
htmlPatterns: [
|
||||
/data-jane/i,
|
||||
/jane-root/i,
|
||||
/jane-embed/i,
|
||||
],
|
||||
},
|
||||
weedmaps: {
|
||||
scripts: [
|
||||
/weedmaps/i,
|
||||
/wm\.com/i,
|
||||
],
|
||||
iframes: [
|
||||
/weedmaps\.com/i,
|
||||
/menu\.weedmaps/i,
|
||||
],
|
||||
classes: [
|
||||
/weedmaps-/i,
|
||||
/wm-/i,
|
||||
],
|
||||
urls: [
|
||||
/weedmaps\.com/i,
|
||||
],
|
||||
meta: [
|
||||
/weedmaps/i,
|
||||
],
|
||||
apiEndpoints: [
|
||||
/api.*weedmaps/i,
|
||||
],
|
||||
htmlPatterns: [
|
||||
/data-weedmaps/i,
|
||||
],
|
||||
},
|
||||
leafly: {
|
||||
scripts: [
|
||||
/leafly/i,
|
||||
/leafly\.com/i,
|
||||
],
|
||||
iframes: [
|
||||
/leafly\.com/i,
|
||||
/menu\.leafly/i,
|
||||
],
|
||||
classes: [
|
||||
/leafly-/i,
|
||||
],
|
||||
urls: [
|
||||
/leafly\.com/i,
|
||||
],
|
||||
meta: [
|
||||
/leafly/i,
|
||||
],
|
||||
apiEndpoints: [
|
||||
/api\.leafly/i,
|
||||
],
|
||||
htmlPatterns: [
|
||||
/data-leafly/i,
|
||||
],
|
||||
},
|
||||
meadow: {
|
||||
scripts: [
|
||||
/meadow/i,
|
||||
/getmeadow/i,
|
||||
],
|
||||
iframes: [
|
||||
/getmeadow\.com/i,
|
||||
],
|
||||
classes: [
|
||||
/meadow-/i,
|
||||
],
|
||||
urls: [
|
||||
/getmeadow\.com/i,
|
||||
],
|
||||
meta: [],
|
||||
apiEndpoints: [
|
||||
/api\.getmeadow/i,
|
||||
],
|
||||
htmlPatterns: [],
|
||||
},
|
||||
greenlight: {
|
||||
scripts: [
|
||||
/greenlight/i,
|
||||
/greenlightmenu/i,
|
||||
],
|
||||
iframes: [
|
||||
/greenlight/i,
|
||||
],
|
||||
classes: [
|
||||
/greenlight-/i,
|
||||
],
|
||||
urls: [
|
||||
/greenlight/i,
|
||||
],
|
||||
meta: [],
|
||||
apiEndpoints: [],
|
||||
htmlPatterns: [],
|
||||
},
|
||||
blaze: {
|
||||
scripts: [
|
||||
/blaze\.me/i,
|
||||
/blazepos/i,
|
||||
],
|
||||
iframes: [
|
||||
/blaze\.me/i,
|
||||
],
|
||||
classes: [
|
||||
/blaze-/i,
|
||||
],
|
||||
urls: [
|
||||
/blaze\.me/i,
|
||||
],
|
||||
meta: [],
|
||||
apiEndpoints: [
|
||||
/api\.blaze/i,
|
||||
],
|
||||
htmlPatterns: [],
|
||||
},
|
||||
flowhub: {
|
||||
scripts: [
|
||||
/flowhub/i,
|
||||
],
|
||||
iframes: [
|
||||
/flowhub\.com/i,
|
||||
],
|
||||
classes: [
|
||||
/flowhub-/i,
|
||||
],
|
||||
urls: [
|
||||
/flowhub\.com/i,
|
||||
],
|
||||
meta: [],
|
||||
apiEndpoints: [],
|
||||
htmlPatterns: [],
|
||||
},
|
||||
dispense: {
|
||||
scripts: [
|
||||
/dispenseapp/i,
|
||||
],
|
||||
iframes: [
|
||||
/dispenseapp\.com/i,
|
||||
],
|
||||
classes: [
|
||||
/dispense-/i,
|
||||
],
|
||||
urls: [
|
||||
/dispenseapp\.com/i,
|
||||
],
|
||||
meta: [],
|
||||
apiEndpoints: [],
|
||||
htmlPatterns: [],
|
||||
},
|
||||
cova: {
|
||||
scripts: [
|
||||
/covasoftware/i,
|
||||
/cova\.software/i,
|
||||
],
|
||||
iframes: [
|
||||
/cova/i,
|
||||
],
|
||||
classes: [
|
||||
/cova-/i,
|
||||
],
|
||||
urls: [
|
||||
/cova/i,
|
||||
],
|
||||
meta: [],
|
||||
apiEndpoints: [],
|
||||
htmlPatterns: [],
|
||||
},
|
||||
};
|
||||
// Common menu URL paths to check.
// detectMenuProvider appends each of these to the normalized base URL (in this
// order, after the base URL itself) when its `checkMenuPaths` option is on.
const MENU_PATHS = [
    '/menu',
    '/shop',
    '/products',
    '/order',
    '/store',
    '/dispensary-menu',
    '/online-menu',
    '/shop-all',
    '/browse',
    '/catalog',
];
|
||||
/**
 * Analyze a single, already-loaded page for menu-provider signals.
 *
 * The page HTML, external script URLs, inline script bodies, iframe URLs and
 * meta tags are read once and then matched against every provider entry in
 * PROVIDER_PATTERNS. Each match yields one signal; the confidence assigned
 * depends on where the match was found:
 *   iframe_src 95, script_src 90, html_pattern 85, meta_tag 80,
 *   inline_script 70, css_class 60.
 *
 * Any error while reading the page is logged and swallowed, and whatever
 * signals were collected so far (possibly none) are returned, so a failure is
 * indistinguishable from "no signals" for the caller.
 *
 * @param {object} page - Puppeteer-style page exposing `content()` and `$$eval()`.
 * @param {string} url - URL of the page; only used in the error log message.
 * @returns {Promise<Array<{provider: string, confidence: number, source: string, details: string}>>}
 */
async function analyzePageForProviders(page, url) {
    const signals = [];
    try {
        const html = await page.content();
        // These DOM snapshots do not depend on the provider being checked, so
        // they are taken once here rather than once per provider.
        const scriptSrcs = await page.$$eval('script[src]', els => els.map(el => el.getAttribute('src') || ''));
        const inlineScripts = await page.$$eval('script:not([src])', els => els.map(el => el.textContent || ''));
        const iframeSrcs = await page.$$eval('iframe', els => els.map(el => el.getAttribute('src') || ''));
        const metaTags = await page.$$eval('meta', els => els.map(el => `${el.getAttribute('name')} ${el.getAttribute('content')}`));
        for (const [provider, patterns] of Object.entries(PROVIDER_PATTERNS)) {
            // External script URLs
            for (const src of scriptSrcs) {
                for (const pattern of patterns.scripts) {
                    if (pattern.test(src)) {
                        signals.push({ provider, confidence: 90, source: 'script_src', details: src });
                    }
                }
            }
            // Inline script bodies (same patterns, lower confidence)
            for (const scriptContent of inlineScripts) {
                for (const pattern of patterns.scripts) {
                    if (pattern.test(scriptContent)) {
                        signals.push({ provider, confidence: 70, source: 'inline_script', details: `Pattern: ${pattern}` });
                    }
                }
            }
            // Embedded iframes
            for (const src of iframeSrcs) {
                for (const pattern of patterns.iframes) {
                    if (pattern.test(src)) {
                        signals.push({ provider, confidence: 95, source: 'iframe_src', details: src });
                    }
                }
            }
            // Provider-specific markers anywhere in the HTML
            for (const pattern of patterns.htmlPatterns) {
                if (pattern.test(html)) {
                    signals.push({ provider, confidence: 85, source: 'html_pattern', details: `Pattern: ${pattern}` });
                }
            }
            // Class-name prefixes; matched against the raw HTML, not parsed class lists
            for (const pattern of patterns.classes) {
                if (pattern.test(html)) {
                    signals.push({ provider, confidence: 60, source: 'css_class', details: `Pattern: ${pattern}` });
                }
            }
            // Meta tags ("<name> <content>")
            for (const meta of metaTags) {
                for (const pattern of patterns.meta) {
                    if (pattern.test(meta)) {
                        signals.push({ provider, confidence: 80, source: 'meta_tag', details: meta });
                    }
                }
            }
        }
        // Network-request based detection is not done here; detectMenuProvider
        // handles it through request interception.
    }
    catch (error) {
        logger_1.logger.error('provider-detection', `Error analyzing page ${url}: ${error}`);
    }
    return signals;
}
|
||||
/**
 * Aggregate signals into a final detection result.
 *
 * Scoring, per provider: the highest single signal confidence plus a bonus of
 * 3 points for every additional signal (bonus capped at 10, total capped at
 * 100). The top-scoring provider wins; on equal scores the provider whose
 * first signal appeared earliest wins.
 *
 * When the winner leads the runner-up by fewer than 20 points the result is
 * ambiguous, so the reported confidence is lowered by 20 (but not below 50).
 * The lowered value is additionally clamped to the winner's own score so that
 * this "penalty" can never report more confidence than the signals support.
 *
 * @param {Array<{provider: string, confidence: number}>} signals
 * @returns {{provider: string, confidence: number}} `{ provider: 'unknown', confidence: 0 }` when there are no signals.
 */
function aggregateSignals(signals) {
    if (signals.length === 0) {
        return { provider: 'unknown', confidence: 0 };
    }
    // Group confidences by provider, preserving first-seen order.
    const confidencesByProvider = new Map();
    for (const { provider, confidence } of signals) {
        const bucket = confidencesByProvider.get(provider);
        if (bucket) {
            bucket.push(confidence);
        }
        else {
            confidencesByProvider.set(provider, [confidence]);
        }
    }
    // Score each provider: strongest signal + bonus for corroborating signals.
    const ranked = [...confidencesByProvider]
        .map(([provider, confidences]) => {
            const strongest = Math.max(...confidences);
            const multiSignalBonus = Math.min(10, (confidences.length - 1) * 3);
            return { provider, score: Math.min(100, strongest + multiSignalBonus) };
        })
        .sort((a, b) => b.score - a.score); // stable sort keeps first-seen order on ties
    const [best, runnerUp] = ranked;
    // Clear winner: sole candidate or a lead of 20+ points.
    if (!runnerUp || best.score - runnerUp.score >= 20) {
        return { provider: best.provider, confidence: best.score };
    }
    // Multiple contenders: reduce confidence, never below 50 and never above
    // the winner's actual score.
    const reduced = Math.max(50, best.score - 20);
    return { provider: best.provider, confidence: Math.min(best.score, reduced) };
}
|
||||
/**
 * Detect the menu provider for a dispensary website.
 *
 * Launches a headless Puppeteer browser, loads the site's base URL (and,
 * optionally, every path in MENU_PATHS beneath it), collects provider signals
 * from each page via analyzePageForProviders, adds signals for intercepted
 * API / GraphQL request URLs that match a provider's `apiEndpoints`, and
 * aggregates everything with aggregateSignals.
 *
 * Failures are reported through the returned object (`error`) rather than
 * thrown; per-URL navigation failures are logged and skipped.
 *
 * @param {string} websiteUrl - Site address. `https://` is prepended when it
 *   does not already start with an http(s) scheme; one trailing slash is removed.
 * @param {object} [options]
 * @param {boolean} [options.checkMenuPaths=true] - Also probe MENU_PATHS under the base URL.
 * @param {number} [options.timeout=30000] - Per-navigation timeout handed to `page.goto`.
 * @returns {Promise<{provider: string, confidence: number, signals: Array<object>,
 *   urlsTested: string[], menuEntryPoints: string[], rawSignals: object, error?: string}>}
 */
async function detectMenuProvider(websiteUrl, options = {}) {
    const { checkMenuPaths = true, timeout = 30000 } = options;
    const result = {
        provider: 'unknown',
        confidence: 0,
        signals: [],
        urlsTested: [],
        menuEntryPoints: [],
        rawSignals: {},
    };
    let browser = null;
    try {
        // Normalize URL. A plain startsWith('http') check would treat hosts
        // such as "httpbin.org" as already having a scheme and would miss
        // upper-case schemes, so test for the scheme itself.
        let baseUrl = websiteUrl.trim();
        if (!/^https?:\/\//i.test(baseUrl)) {
            baseUrl = `https://${baseUrl}`;
        }
        baseUrl = baseUrl.replace(/\/$/, ''); // Remove trailing slash
        // Launch browser
        browser = await puppeteer_1.default.launch({
            headless: true,
            args: [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-gpu',
            ],
        });
        const page = await browser.newPage();
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
        // Track network requests for API detection
        const apiRequests = [];
        await page.setRequestInterception(true);
        page.on('request', (request) => {
            const url = request.url();
            if (url.includes('api') || url.includes('graphql')) {
                apiRequests.push(url);
            }
            // continue() returns a promise; attach a handler so a rejection
            // (e.g. the request was already handled or the page moved on)
            // does not surface as an unhandled rejection.
            request.continue().catch((continueError) => {
                logger_1.logger.warn('provider-detection', `Could not continue request ${url}: ${continueError?.message ?? continueError}`);
            });
        });
        // URLs to check: the base URL first, then the common menu paths
        const urlsToCheck = [baseUrl];
        if (checkMenuPaths) {
            for (const path of MENU_PATHS) {
                urlsToCheck.push(`${baseUrl}${path}`);
            }
        }
        // Check each URL
        for (const url of urlsToCheck) {
            try {
                result.urlsTested.push(url);
                const response = await page.goto(url, {
                    waitUntil: 'networkidle2',
                    timeout,
                });
                // page.goto resolves normally for HTTP error statuses, so the
                // "404s are fine, just skip" rule has to be applied to the
                // response; otherwise "not found" pages get analysed and can
                // be recorded as menu entry points. `response` may be null.
                if (response && response.status() === 404) {
                    continue;
                }
                // Wait a bit for dynamic content
                await new Promise(r => setTimeout(r, 2000));
                // Analyze page
                const pageSignals = await analyzePageForProviders(page, url);
                result.signals.push(...pageSignals);
                // Track if this URL has menu content (runs in the browser context,
                // so the callback must stay self-contained)
                const hasMenuContent = await page.evaluate(() => {
                    const text = document.body.innerText.toLowerCase();
                    return [
                        'add to cart',
                        'add to bag',
                        'product',
                        'indica',
                        'sativa',
                        'hybrid',
                        'thc',
                        'cbd',
                    ].some(keyword => text.includes(keyword));
                });
                if (hasMenuContent && url !== baseUrl) {
                    result.menuEntryPoints.push(url);
                }
            }
            catch (pageError) {
                // Navigation errors mentioning 404 are expected for probe paths; log the rest
                if (!pageError.message?.includes('404')) {
                    logger_1.logger.warn('provider-detection', `Could not load ${url}: ${pageError.message}`);
                }
            }
        }
        // Check API requests for provider hints
        for (const apiUrl of apiRequests) {
            for (const [provider, patterns] of Object.entries(PROVIDER_PATTERNS)) {
                for (const pattern of patterns.apiEndpoints) {
                    if (pattern.test(apiUrl)) {
                        result.signals.push({
                            provider,
                            confidence: 95,
                            source: 'api_request',
                            details: apiUrl,
                        });
                    }
                }
            }
        }
        // Record raw signals
        result.rawSignals = {
            apiRequestsFound: apiRequests.length,
            menuEntryPointsFound: result.menuEntryPoints.length,
            totalSignals: result.signals.length,
            uniqueProviders: new Set(result.signals.map(s => s.provider)).size,
        };
        // Aggregate signals into final result
        const aggregated = aggregateSignals(result.signals);
        result.provider = aggregated.provider;
        result.confidence = aggregated.confidence;
    }
    catch (error) {
        const message = error?.message ?? String(error);
        result.error = message;
        logger_1.logger.error('provider-detection', `Detection failed for ${websiteUrl}: ${message}`);
    }
    finally {
        if (browser) {
            // A failure to shut the browser down must not discard the result
            // that was already computed.
            try {
                await browser.close();
            }
            catch (closeError) {
                logger_1.logger.warn('provider-detection', `Could not close browser for ${websiteUrl}: ${closeError?.message ?? closeError}`);
            }
        }
    }
    return result;
}
|
||||
/**
 * Quick check if a site has Dutchie - used during production crawls.
 *
 * Returns true when the word "dutchie" (any case) occurs anywhere in the page
 * HTML or in any iframe `src`. This is best-effort: if the page cannot be
 * read, the failure is logged and `false` is returned.
 *
 * @param {object} page - Puppeteer-style page exposing `content()` and `$$eval()`.
 * @returns {Promise<boolean>}
 */
async function quickDutchieCheck(page) {
    // The more specific markers previously listed next to this one
    // (dutchie-plus, __DUTCHIE__, data-dutchie, embed.dutchie) all contain
    // "dutchie", so this single case-insensitive test covers every one of them.
    const dutchiePattern = /dutchie/i;
    try {
        const html = await page.content();
        if (dutchiePattern.test(html)) {
            return true;
        }
        // Check iframes
        const iframeSrcs = await page.$$eval('iframe', els => els.map(el => el.getAttribute('src') || ''));
        return iframeSrcs.some(src => dutchiePattern.test(src));
    }
    catch (error) {
        // Stay best-effort, but leave a trace instead of failing silently.
        logger_1.logger.warn('provider-detection', `Quick Dutchie check failed: ${error}`);
        return false;
    }
}
|
||||
/**
 * Check if provider has changed from expected.
 *
 * Only an expectation of 'dutchie' is evaluated; for any other
 * `expectedProvider` the answer is always `{ changed: false }`, so the page is
 * not analysed at all in that case.
 *
 * For 'dutchie', a change is reported when either
 *   - another provider is detected with confidence >= 70, or
 *   - overall confidence is below 30 and quickDutchieCheck finds no trace of
 *     Dutchie (reported as that provider, or 'other' when unknown, with a
 *     confidence of at least 30).
 *
 * Errors are logged and reported as "no change".
 *
 * NOTE(review): analyzePageForProviders returns an empty list when it fails
 * to read the page and quickDutchieCheck returns false on failure, so a page
 * that cannot be read looks like "Dutchie is gone" here - confirm callers
 * only pass pages that loaded successfully.
 *
 * @param {object} page - Puppeteer-style page (needs `url()` plus whatever the analysis helpers use).
 * @param {string} expectedProvider - Provider the caller believes the site uses.
 * @returns {Promise<{changed: boolean, newProvider?: string, confidence?: number}>}
 */
async function detectProviderChange(page, expectedProvider) {
    // No branch below can report a change for a non-Dutchie expectation, so
    // skip the (comparatively expensive) page analysis entirely.
    if (expectedProvider !== 'dutchie') {
        return { changed: false };
    }
    try {
        const signals = await analyzePageForProviders(page, page.url());
        const { provider, confidence } = aggregateSignals(signals);
        // Expected Dutchie but found something else with high confidence
        if (provider !== 'dutchie' && confidence >= 70) {
            return { changed: true, newProvider: provider, confidence };
        }
        // Found nothing / low confidence: the site might have switched
        if (confidence < 30) {
            // Only report a change if Dutchie is definitely NOT present
            const hasDutchie = await quickDutchieCheck(page);
            if (!hasDutchie) {
                return {
                    changed: true,
                    newProvider: provider !== 'unknown' ? provider : 'other',
                    confidence: Math.max(30, confidence),
                };
            }
        }
        return { changed: false };
    }
    catch (error) {
        // Stay best-effort, but leave a trace instead of failing silently.
        logger_1.logger.warn('provider-detection', `Provider change check failed: ${error}`);
        return { changed: false };
    }
}
|
||||
Reference in New Issue
Block a user