The job_run_logs table tracks scheduled job orchestration, not individual worker jobs. Worker info (worker_id, worker_hostname) belongs on dispensary_crawl_jobs, not job_run_logs. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
613 lines
18 KiB
JavaScript
613 lines
18 KiB
JavaScript
"use strict";
|
|
/**
|
|
* Menu Provider Detection Service
|
|
*
|
|
* Detects which menu platform a dispensary is using by analyzing:
|
|
* - HTML content patterns (scripts, iframes, classes)
|
|
* - URL patterns (embedded menu paths)
|
|
* - API endpoint signatures
|
|
* - Meta tags and headers
|
|
*/
|
|
// Module interop helper: reuses a `__importDefault` already present on `this`
// when there is one; otherwise returns modules flagged with `__esModule`
// unchanged and wraps anything else as `{ default: mod }`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
|
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
exports.detectMenuProvider = detectMenuProvider;
|
|
exports.quickDutchieCheck = quickDutchieCheck;
|
|
exports.detectProviderChange = detectProviderChange;
|
|
const puppeteer_1 = __importDefault(require("puppeteer"));
|
|
const logger_1 = require("./logger");
|
|
// Provider detection patterns
//
// Maps a provider key to the case-insensitive regexes used to recognise it.
// Every pattern that matches produces one signal, so overlapping patterns in
// the same list (e.g. /dutchie/i and /dutchie\.com/i) yield several signals
// for a single script/iframe.
//
// How each list is consumed in this file:
//   scripts      - tested against every <script src> value and against the
//                  text of every inline <script>
//   iframes      - tested against every <iframe> src value
//   classes      - tested against the full page HTML string (not against
//                  parsed class attributes)
//   urls         - not read by any code visible in this file
//                  NOTE(review): confirm whether another consumer exists
//   meta         - tested against "<name> <content>" of every <meta> tag
//   apiEndpoints - tested against intercepted request URLs that contain
//                  "api" or "graphql"
//   htmlPatterns - tested against the full page HTML string
const PROVIDER_PATTERNS = {
    dutchie: {
        scripts: [
            /dutchie/i,
            /dutchie-plus/i,
            /dutchie\.com/i,
            /dutchie-embed/i,
        ],
        iframes: [
            /dutchie\.com/i,
            /embed\.dutchie/i,
            /iframe\.dutchie/i,
        ],
        classes: [
            /dutchie-/i,
            /DutchieEmbed/i,
        ],
        urls: [
            /dutchie\.com/i,
            /\.dutchie\./i,
        ],
        meta: [
            /dutchie/i,
        ],
        apiEndpoints: [
            /graphql.*dutchie/i,
            /api\.dutchie/i,
        ],
        htmlPatterns: [
            /data-dutchie/i,
            /__DUTCHIE__/i,
            /dutchie-plus-iframe/i,
        ],
    },
    treez: {
        scripts: [
            /treez/i,
            /treez\.io/i,
            /treezpay/i,
        ],
        iframes: [
            /treez\.io/i,
            /menu\.treez/i,
        ],
        classes: [
            /treez-/i,
        ],
        urls: [
            /treez\.io/i,
            /\.treez\./i,
        ],
        meta: [
            /treez/i,
        ],
        apiEndpoints: [
            /api\.treez/i,
        ],
        htmlPatterns: [
            /data-treez/i,
            /treez-embed/i,
        ],
    },
    jane: {
        // NOTE(review): /jane\.co/i also matches "jane.com" and any host ending
        // in "jane.co..." because the pattern is unanchored - confirm intended.
        scripts: [
            /jane\.co/i,
            /iheartjane/i,
            /jane-embed/i,
            /janetechnologies/i,
        ],
        iframes: [
            /jane\.co/i,
            /iheartjane\.com/i,
            /menu\.jane/i,
        ],
        classes: [
            /jane-/i,
            /iheartjane/i,
        ],
        urls: [
            /jane\.co/i,
            /iheartjane\.com/i,
        ],
        // NOTE(review): /jane/i matches any meta name/content containing "jane"
        // (e.g. the phrase "mary jane"), which may produce false positives.
        meta: [
            /jane/i,
            /iheartjane/i,
        ],
        apiEndpoints: [
            /api\.iheartjane/i,
            /api\.jane\.co/i,
        ],
        htmlPatterns: [
            /data-jane/i,
            /jane-root/i,
            /jane-embed/i,
        ],
    },
    weedmaps: {
        // NOTE(review): /wm\.com/i and /wm-/i are very short, unanchored
        // patterns; /wm-/i is tested against the whole HTML string - confirm
        // these do not over-match.
        scripts: [
            /weedmaps/i,
            /wm\.com/i,
        ],
        iframes: [
            /weedmaps\.com/i,
            /menu\.weedmaps/i,
        ],
        classes: [
            /weedmaps-/i,
            /wm-/i,
        ],
        urls: [
            /weedmaps\.com/i,
        ],
        meta: [
            /weedmaps/i,
        ],
        apiEndpoints: [
            /api.*weedmaps/i,
        ],
        htmlPatterns: [
            /data-weedmaps/i,
        ],
    },
    leafly: {
        scripts: [
            /leafly/i,
            /leafly\.com/i,
        ],
        iframes: [
            /leafly\.com/i,
            /menu\.leafly/i,
        ],
        classes: [
            /leafly-/i,
        ],
        urls: [
            /leafly\.com/i,
        ],
        meta: [
            /leafly/i,
        ],
        apiEndpoints: [
            /api\.leafly/i,
        ],
        htmlPatterns: [
            /data-leafly/i,
        ],
    },
    meadow: {
        // NOTE(review): /meadow/i matches any script URL or inline script text
        // containing the word "meadow" - confirm this is specific enough.
        scripts: [
            /meadow/i,
            /getmeadow/i,
        ],
        iframes: [
            /getmeadow\.com/i,
        ],
        classes: [
            /meadow-/i,
        ],
        urls: [
            /getmeadow\.com/i,
        ],
        meta: [],
        apiEndpoints: [
            /api\.getmeadow/i,
        ],
        htmlPatterns: [],
    },
    greenlight: {
        scripts: [
            /greenlight/i,
            /greenlightmenu/i,
        ],
        iframes: [
            /greenlight/i,
        ],
        classes: [
            /greenlight-/i,
        ],
        urls: [
            /greenlight/i,
        ],
        meta: [],
        apiEndpoints: [],
        htmlPatterns: [],
    },
    blaze: {
        scripts: [
            /blaze\.me/i,
            /blazepos/i,
        ],
        iframes: [
            /blaze\.me/i,
        ],
        // NOTE(review): /blaze-/i is tested against the whole HTML string, so
        // any "blaze-" substring in page text or markup matches.
        classes: [
            /blaze-/i,
        ],
        urls: [
            /blaze\.me/i,
        ],
        meta: [],
        apiEndpoints: [
            /api\.blaze/i,
        ],
        htmlPatterns: [],
    },
    flowhub: {
        scripts: [
            /flowhub/i,
        ],
        iframes: [
            /flowhub\.com/i,
        ],
        classes: [
            /flowhub-/i,
        ],
        urls: [
            /flowhub\.com/i,
        ],
        meta: [],
        apiEndpoints: [],
        htmlPatterns: [],
    },
    dispense: {
        scripts: [
            /dispenseapp/i,
        ],
        iframes: [
            /dispenseapp\.com/i,
        ],
        classes: [
            /dispense-/i,
        ],
        urls: [
            /dispenseapp\.com/i,
        ],
        meta: [],
        apiEndpoints: [],
        htmlPatterns: [],
    },
    cova: {
        scripts: [
            /covasoftware/i,
            /cova\.software/i,
        ],
        // NOTE(review): /cova/i matches any iframe src containing "cova"
        // anywhere; an iframe match is scored at confidence 95.
        iframes: [
            /cova/i,
        ],
        classes: [
            /cova-/i,
        ],
        urls: [
            /cova/i,
        ],
        meta: [],
        apiEndpoints: [],
        htmlPatterns: [],
    },
};
|
|
// Common menu URL paths to check
//
// Each entry is appended verbatim to the normalized base URL (which has no
// trailing slash) by detectMenuProvider when its `checkMenuPaths` option is
// true, so every entry must start with "/". Pages that load and contain
// menu-like text are reported as menu entry points.
const MENU_PATHS = [
    '/menu',
    '/shop',
    '/products',
    '/order',
    '/store',
    '/dispensary-menu',
    '/online-menu',
    '/shop-all',
    '/browse',
    '/catalog',
];
|
|
/**
 * Push one signal onto `signals` for every (value, pattern) pair that matches.
 *
 * Values are iterated in the outer loop and patterns in the inner loop, so the
 * resulting signal order is value-major.
 *
 * @param {Array<object>} signals - Accumulator that receives the new signals.
 * @param {string} provider - Provider key the patterns belong to.
 * @param {string[]} values - Strings to test (URLs, script text, HTML, ...).
 * @param {RegExp[]} patterns - Patterns to test each value against.
 * @param {number} confidence - Confidence recorded on each produced signal.
 * @param {string} source - Signal source label (e.g. 'script_src').
 * @param {boolean} reportPattern - When true, `details` is `Pattern: <regex>`;
 *   otherwise `details` is the matched value itself.
 */
function collectPatternSignals(signals, provider, values, patterns, confidence, source, reportPattern) {
    for (const value of values) {
        for (const pattern of patterns) {
            if (pattern.test(value)) {
                signals.push({
                    provider: provider,
                    confidence,
                    source,
                    details: reportPattern ? `Pattern: ${pattern}` : value,
                });
            }
        }
    }
}
/**
 * Analyze a single page for provider signals.
 *
 * Reads the page HTML, external script URLs, inline script text, iframe URLs
 * and meta tags once, then tests them against every entry of
 * PROVIDER_PATTERNS. Each matching pattern yields one signal:
 *
 *   script_src 90 · inline_script 70 · iframe_src 95 ·
 *   html_pattern 85 · css_class 60 · meta_tag 80
 *
 * Intercepted network requests are not examined here.
 *
 * @param {object} page - Puppeteer page that has already been navigated.
 * @param {string} url - URL of the page; used only in the error log message.
 * @returns {Promise<Array<{provider: string, confidence: number, source: string, details: string}>>}
 *   Signals grouped by provider in PROVIDER_PATTERNS order. Never rejects: on
 *   failure the error is logged and whatever was collected so far (nothing if
 *   reading the page itself failed) is returned.
 */
async function analyzePageForProviders(page, url) {
    const signals = [];
    try {
        // The page snapshot does not depend on the provider being tested, so it
        // is gathered once up front instead of once per provider.
        const html = await page.content();
        const scriptSrcs = await page.$$eval('script[src]', els => els.map(el => el.getAttribute('src') || ''));
        const inlineScripts = await page.$$eval('script:not([src])', els => els.map(el => el.textContent || ''));
        const iframeSrcs = await page.$$eval('iframe', els => els.map(el => el.getAttribute('src') || ''));
        const metaTags = await page.$$eval('meta', els => els.map(el => `${el.getAttribute('name')} ${el.getAttribute('content')}`));
        for (const [provider, patterns] of Object.entries(PROVIDER_PATTERNS)) {
            // External script URLs
            collectPatternSignals(signals, provider, scriptSrcs, patterns.scripts, 90, 'script_src', false);
            // Inline script bodies reuse the script patterns at lower confidence
            collectPatternSignals(signals, provider, inlineScripts, patterns.scripts, 70, 'inline_script', true);
            // Iframe URLs
            collectPatternSignals(signals, provider, iframeSrcs, patterns.iframes, 95, 'iframe_src', false);
            // Raw HTML markers
            collectPatternSignals(signals, provider, [html], patterns.htmlPatterns, 85, 'html_pattern', true);
            // Class-name markers are matched against the raw HTML as well
            collectPatternSignals(signals, provider, [html], patterns.classes, 60, 'css_class', true);
            // Meta tags, as "<name> <content>"
            collectPatternSignals(signals, provider, metaTags, patterns.meta, 80, 'meta_tag', false);
        }
    }
    catch (error) {
        logger_1.logger.error('provider-detection', `Error analyzing page ${url}: ${error}`);
    }
    return signals;
}
|
|
/**
 * Aggregate signals into a final detection result.
 *
 * Scoring, per provider:
 *   score = min(100, highest signal confidence + min(10, 3 * (signals - 1)))
 *
 * The provider with the highest score wins (ties keep the provider seen first).
 * If it is the only provider, or leads the runner-up by at least 20 points, its
 * score is reported as-is. Otherwise the result is contested and the confidence
 * is reduced by 20 with a floor of 50 - but never raised above the winner's own
 * score, so a weak contested result is not reported as stronger than its
 * evidence.
 *
 * @param {Array<{provider: string, confidence: number}>} signals - Detection
 *   signals; other properties are ignored.
 * @returns {{provider: string, confidence: number}} `{ provider: 'unknown',
 *   confidence: 0 }` when there are no signals.
 */
function aggregateSignals(signals) {
    if (signals.length === 0) {
        return { provider: 'unknown', confidence: 0 };
    }
    // Track the strongest confidence and the number of signals per provider in
    // one pass (avoids spreading a potentially large array into Math.max).
    const statsByProvider = new Map();
    for (const { provider, confidence } of signals) {
        const stats = statsByProvider.get(provider);
        if (stats) {
            stats.count += 1;
            stats.max = Math.max(stats.max, confidence);
        }
        else {
            statsByProvider.set(provider, { max: confidence, count: 1 });
        }
    }
    // Strongest signal plus a small, capped bonus for corroborating signals
    const scores = [];
    for (const [provider, { max, count }] of statsByProvider) {
        const multiSignalBonus = Math.min(10, (count - 1) * 3);
        scores.push({ provider, score: Math.min(100, max + multiSignalBonus) });
    }
    // Sort by score descending
    scores.sort((a, b) => b.score - a.score);
    const [best, runnerUp] = scores;
    // If there's a clear winner (20+ point lead), use it
    if (!runnerUp || best.score - runnerUp.score >= 20) {
        return { provider: best.provider, confidence: best.score };
    }
    // Multiple contenders - reduce confidence, without ever exceeding the
    // winner's own score when that score is already below the floor.
    const reduced = Math.max(50, best.score - 20);
    return { provider: best.provider, confidence: Math.min(best.score, reduced) };
}
|
|
/**
 * Turn a user-supplied website address into a base URL: trims whitespace,
 * prepends `https://` when no http(s) scheme is present, and drops one
 * trailing slash.
 */
function normalizeWebsiteUrl(websiteUrl) {
    let baseUrl = websiteUrl.trim();
    // Test for an actual scheme; a bare `startsWith('http')` would treat hosts
    // such as "httpbin.org" as already having one.
    if (!/^https?:\/\//i.test(baseUrl)) {
        baseUrl = `https://${baseUrl}`;
    }
    return baseUrl.replace(/\/$/, ''); // Remove trailing slash
}
/**
 * Detect the menu provider for a dispensary.
 *
 * Launches a headless browser, loads the site's base URL (and, optionally,
 * each MENU_PATHS entry under it), collects provider signals from every page
 * that loads successfully plus from intercepted API/GraphQL request URLs, and
 * aggregates them into a single provider/confidence verdict.
 *
 * @param {string} websiteUrl - Site address, with or without a scheme.
 * @param {object} [options]
 * @param {boolean} [options.checkMenuPaths=true] - Also probe MENU_PATHS.
 * @param {number} [options.timeout=30000] - Per-navigation timeout in ms.
 * @returns {Promise<object>} Result with `provider`, `confidence`, `signals`,
 *   `urlsTested`, `menuEntryPoints` (probed URLs other than the base URL whose
 *   text looks like a menu), `rawSignals` (summary counts) and, when detection
 *   failed as a whole, `error`. Never rejects.
 */
async function detectMenuProvider(websiteUrl, options = {}) {
    const { checkMenuPaths = true, timeout = 30000 } = options;
    const result = {
        provider: 'unknown',
        confidence: 0,
        signals: [],
        urlsTested: [],
        menuEntryPoints: [],
        rawSignals: {},
    };
    let browser = null;
    try {
        // Normalize URL
        const baseUrl = normalizeWebsiteUrl(websiteUrl);
        // Launch browser
        browser = await puppeteer_1.default.launch({
            headless: true,
            args: [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-gpu',
            ],
        });
        const page = await browser.newPage();
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
        // Track network requests for API detection
        const apiRequests = [];
        await page.setRequestInterception(true);
        page.on('request', (request) => {
            const requestUrl = request.url();
            if (requestUrl.includes('api') || requestUrl.includes('graphql')) {
                apiRequests.push(requestUrl);
            }
            // continue() is asynchronous and can reject (e.g. the request was
            // already handled or the page navigated away); nothing useful can
            // be done about that here, so keep it from becoming an unhandled
            // rejection.
            request.continue().catch(() => { });
        });
        // URLs to check
        const urlsToCheck = [baseUrl];
        if (checkMenuPaths) {
            for (const path of MENU_PATHS) {
                urlsToCheck.push(`${baseUrl}${path}`);
            }
        }
        // Check each URL
        for (const url of urlsToCheck) {
            try {
                result.urlsTested.push(url);
                const response = await page.goto(url, {
                    waitUntil: 'networkidle2',
                    timeout,
                });
                // goto() resolves (it does not throw) for HTTP error statuses,
                // so error pages have to be skipped explicitly; otherwise a
                // 404 page would be analyzed and could be recorded as a menu
                // entry point.
                const status = response ? response.status() : null;
                if (status !== null && status >= 400) {
                    // 404s are fine, just skip
                    if (status !== 404) {
                        logger_1.logger.warn('provider-detection', `Could not load ${url}: HTTP ${status}`);
                    }
                    continue;
                }
                // Wait a bit for dynamic content
                await new Promise(r => setTimeout(r, 2000));
                // Analyze page
                const pageSignals = await analyzePageForProviders(page, url);
                result.signals.push(...pageSignals);
                // Track if this URL has menu content (runs in the browser)
                const hasMenuContent = await page.evaluate(() => {
                    // body can be missing on non-HTML documents
                    const text = (document.body?.innerText ?? '').toLowerCase();
                    const menuKeywords = [
                        'add to cart',
                        'add to bag',
                        'product',
                        'indica',
                        'sativa',
                        'hybrid',
                        'thc',
                        'cbd',
                    ];
                    return menuKeywords.some(keyword => text.includes(keyword));
                });
                if (hasMenuContent && url !== baseUrl) {
                    result.menuEntryPoints.push(url);
                }
            }
            catch (pageError) {
                // Navigation failures (timeouts, DNS errors, ...) only skip
                // this URL; stay quiet if the message mentions a 404.
                if (!pageError.message?.includes('404')) {
                    logger_1.logger.warn('provider-detection', `Could not load ${url}: ${pageError.message}`);
                }
            }
        }
        // Check API requests for provider hints
        for (const apiUrl of apiRequests) {
            for (const [provider, patterns] of Object.entries(PROVIDER_PATTERNS)) {
                for (const pattern of patterns.apiEndpoints) {
                    if (pattern.test(apiUrl)) {
                        result.signals.push({
                            provider: provider,
                            confidence: 95,
                            source: 'api_request',
                            details: apiUrl,
                        });
                    }
                }
            }
        }
        // Record raw signals
        result.rawSignals = {
            apiRequestsFound: apiRequests.length,
            menuEntryPointsFound: result.menuEntryPoints.length,
            totalSignals: result.signals.length,
            uniqueProviders: new Set(result.signals.map(s => s.provider)).size,
        };
        // Aggregate signals into final result
        const aggregated = aggregateSignals(result.signals);
        result.provider = aggregated.provider;
        result.confidence = aggregated.confidence;
    }
    catch (error) {
        result.error = error.message;
        logger_1.logger.error('provider-detection', `Detection failed for ${websiteUrl}: ${error.message}`);
    }
    finally {
        if (browser) {
            try {
                await browser.close();
            }
            catch (closeError) {
                // A failed shutdown must not discard the detection result.
                logger_1.logger.warn('provider-detection', `Failed to close browser for ${websiteUrl}: ${closeError.message}`);
            }
        }
    }
    return result;
}
|
|
/**
 * Quick check if a site has Dutchie - used during production crawls.
 *
 * Looks for the string "dutchie" (case-insensitive) in the page HTML and, if
 * it is not found there, in the `src` of every iframe.
 *
 * @param {object} page - Puppeteer page that has already been navigated.
 * @returns {Promise<boolean>} True when a Dutchie marker is found. Best
 *   effort: if the page cannot be read the failure is logged and `false` is
 *   returned, so `false` does not prove that Dutchie is absent.
 */
async function quickDutchieCheck(page) {
    try {
        const html = await page.content();
        // A single case-insensitive test covers every Dutchie-specific marker
        // (dutchie-plus, __DUTCHIE__, data-dutchie, embed.dutchie): each of
        // them contains "dutchie".
        if (/dutchie/i.test(html)) {
            return true;
        }
        // Check iframes
        const iframeSrcs = await page.$$eval('iframe', els => els.map(el => el.getAttribute('src') || ''));
        return iframeSrcs.some(src => /dutchie/i.test(src));
    }
    catch (error) {
        // Keep the best-effort contract, but leave a trace instead of hiding
        // the failure entirely.
        logger_1.logger.warn('provider-detection', `Quick Dutchie check failed: ${error}`);
        return false;
    }
}
|
|
/**
 * Check if provider has changed from expected.
 *
 * Only an expected provider of 'dutchie' is evaluated; any other value yields
 * `{ changed: false }`. For Dutchie, a change is reported when either
 *   - another provider is detected with confidence >= 70, or
 *   - detection confidence is below 30 and the page, which must be readable,
 *     contains no Dutchie marker at all.
 *
 * @param {object} page - Puppeteer page that has already been navigated.
 * @param {string} expectedProvider - Provider the page is expected to use.
 * @returns {Promise<{changed: boolean, newProvider?: string, confidence?: number}>}
 *   `newProvider` is 'other' when Dutchie is gone but no provider was
 *   identified. Never rejects: failures are logged and reported as
 *   `{ changed: false }`.
 */
async function detectProviderChange(page, expectedProvider) {
    try {
        // Every rule below applies to Dutchie expectations only.
        if (expectedProvider !== 'dutchie') {
            return { changed: false };
        }
        const signals = await analyzePageForProviders(page, page.url());
        const aggregated = aggregateSignals(signals);
        // If we expected Dutchie but found something else with high confidence
        if (aggregated.provider !== 'dutchie' && aggregated.confidence >= 70) {
            return {
                changed: true,
                newProvider: aggregated.provider,
                confidence: aggregated.confidence,
            };
        }
        // If we expected Dutchie and found nothing/low confidence, might have switched
        if (aggregated.confidence < 30) {
            // The analysis and quick-check helpers both swallow page errors
            // (returning no signals / false), which would make an unreadable
            // page look like "Dutchie is gone". Read the page here so such a
            // failure throws into the catch below and reports no change.
            const html = await page.content();
            // Check if Dutchie is definitely NOT present
            const hasDutchie = /dutchie/i.test(html) || (await quickDutchieCheck(page));
            if (!hasDutchie) {
                return {
                    changed: true,
                    newProvider: aggregated.provider !== 'unknown' ? aggregated.provider : 'other',
                    confidence: Math.max(30, aggregated.confidence),
                };
            }
        }
        return { changed: false };
    }
    catch (error) {
        logger_1.logger.warn('provider-detection', `Provider change check failed: ${error}`);
        return { changed: false };
    }
}
|