Major additions: - Multi-state expansion: states table, StateSelector, NationalDashboard, StateHeatmap, CrossStateCompare - Orchestrator services: trace service, error taxonomy, retry manager, proxy rotator - Discovery system: dutchie discovery service, geo validation, city seeding scripts - Analytics infrastructure: analytics v2 routes, brand/pricing/stores intelligence pages - Local development: setup-local.sh starts all 5 services (postgres, backend, cannaiq, findadispo, findagram) - Migrations 037-056: crawler profiles, states, analytics indexes, worker metadata Frontend pages added: - Discovery, ChainsDashboard, IntelligenceBrands, IntelligencePricing, IntelligenceStores - StateHeatmap, CrossStateCompare, SyncInfoPanel Components added: - StateSelector, OrchestratorTraceModal, WorkflowStepper 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
621 lines
17 KiB
TypeScript
621 lines
17 KiB
TypeScript
/**
 * Multi-Category Intelligence Detector
 *
 * Detects providers for each intelligence category independently:
 * - Products: Which provider serves product data
 * - Specials: Which provider serves deals/specials
 * - Brand: Which provider serves brand information
 * - Metadata: Which provider serves taxonomy/category data
 */
|
|
|
|
import { pool } from '../db/pool';
|
|
import { logger } from './logger';
|
|
import puppeteer, { Browser, Page } from 'puppeteer';
|
|
|
|
// ========================================
|
|
// Types
|
|
// ========================================
|
|
|
|
/** The kinds of dispensary data for which a provider is detected and stored separately. */
export type IntelligenceCategory = 'product' | 'specials' | 'brand' | 'metadata';
|
|
|
|
/**
 * Identifier of the menu provider a category is attributed to.
 *
 * `'unknown'` is what detection reports when no provider pattern matched.
 */
export type MenuProvider =
  | 'dutchie' | 'treez' | 'jane' | 'iheartjane'
  | 'weedmaps' | 'leafly' | 'meadow' | 'greenlight'
  | 'blaze' | 'flowhub' | 'dispense' | 'cova'
  | 'custom_html' | 'custom_json' | 'dutchie_json'
  | 'other' | 'unknown';
|
|
|
|
/** Outcome of provider detection for a single intelligence category. */
export interface CategoryDetectionResult {
  /** Best-scoring provider, or `'unknown'` when nothing matched. */
  provider: MenuProvider;
  /** Detection score, capped at 100. */
  confidence: number;
  /** Crawler mode selected for this category. */
  mode: 'production' | 'sandbox';
  /** Evidence recorded while matching, keyed by signal name. */
  signals: Record<string, any>;
  /** Template name assigned for the provider, when there is one. */
  templateName?: string;
}
|
|
|
|
/** Combined detection outcome for one website: one result per category plus the raw evidence. */
export interface MultiCategoryDetectionResult {
  /** Result for the `product` category. */
  product: CategoryDetectionResult;
  /** Result for the `specials` category. */
  specials: CategoryDetectionResult;
  /** Result for the `brand` category. */
  brand: CategoryDetectionResult;
  /** Result for the `metadata` category. */
  metadata: CategoryDetectionResult;
  /** Every URL a navigation was attempted for, including ones that failed to load. */
  urlsTested: string[];
  /** Unprocessed per-page signals, or `{ error }` when detection failed. */
  rawSignals: Record<string, any>;
}
|
|
|
|
/**
 * Providers that may be placed in production mode, listed per category.
 * Any category/provider pair missing from this table stays in sandbox mode.
 */
const PRODUCTION_READY: Record<IntelligenceCategory, MenuProvider[]> = {
  // Dutchie products are the only production-ready combination so far.
  product: ['dutchie'],
  // Nothing is production-ready yet for the remaining categories.
  specials: [],
  brand: [],
  metadata: [],
};
|
|
|
|
/** Regex fingerprints for one provider, grouped by the kind of page evidence they apply to. */
interface ProviderPatternSet {
  /** Matched against collected script entries (external `src` values and inline snippets). */
  scripts: RegExp[];
  /** Matched against iframe `src` values. */
  iframes: RegExp[];
  /** Matched against the captured HTML snippet. */
  html: RegExp[];
  /** API URL fingerprints (not consulted by the scoring code in this file). */
  apiEndpoints: RegExp[];
  /** Meta tag fingerprints (not consulted by the scoring code in this file). */
  metaTags: RegExp[];
}

/**
 * Provider detection patterns, keyed by provider name.
 * Key order matters: on a score tie the provider listed first wins.
 */
const PROVIDER_PATTERNS: Record<string, ProviderPatternSet> = {
  dutchie: {
    scripts: [/dutchie\.com/i, /dutchie-plus/i, /dutchie\.js/i, /__DUTCHIE__/i, /dutchie-embed/i],
    iframes: [/dutchie\.com/i, /dutchie-plus\.com/i, /embed\.dutchie/i],
    html: [/class="dutchie/i, /id="dutchie/i, /data-dutchie/i, /"menuType":\s*"dutchie"/i],
    apiEndpoints: [/dutchie\.com\/graphql/i, /plus\.dutchie\.com/i],
    metaTags: [/dutchie/i],
  },
  treez: {
    scripts: [/treez\.io/i, /treez-ecommerce/i, /treez\.js/i],
    iframes: [/treez\.io/i, /shop\.treez/i],
    html: [/class="treez/i, /data-treez/i, /treez-menu/i],
    apiEndpoints: [/api\.treez\.io/i, /treez\.io\/api/i],
    metaTags: [],
  },
  jane: {
    scripts: [/jane\.co/i, /iheartjane\.com/i, /jane-frame/i, /jane\.js/i],
    iframes: [/jane\.co/i, /iheartjane\.com/i, /embed\.iheartjane/i],
    html: [/class="jane/i, /data-jane/i, /jane-embed/i],
    apiEndpoints: [/api\.iheartjane/i, /jane\.co\/api/i],
    metaTags: [],
  },
  weedmaps: {
    scripts: [/weedmaps\.com/i, /wm-menu/i],
    iframes: [/weedmaps\.com/i, /menu\.weedmaps/i],
    html: [/data-weedmaps/i, /wm-menu/i],
    apiEndpoints: [/api-g\.weedmaps/i, /weedmaps\.com\/api/i],
    metaTags: [],
  },
  leafly: {
    scripts: [/leafly\.com/i, /leafly-menu/i],
    iframes: [/leafly\.com/i, /order\.leafly/i],
    html: [/data-leafly/i, /leafly-embed/i],
    apiEndpoints: [/api\.leafly/i],
    metaTags: [],
  },
};
|
|
|
|
/** Evidence that suggests a page carries data for one intelligence category. */
interface CategorySignalSpec {
  /** Patterns that mark a page path as relevant to the category. */
  urlPatterns: RegExp[];
  /** Patterns looked for in the HTML snippet of a relevant page. */
  htmlPatterns: RegExp[];
  /** Property names looked for in embedded JSON blocks. */
  jsonKeys: string[];
}

/** Category-specific detection signals, one entry per intelligence category. */
const CATEGORY_SIGNALS: Record<IntelligenceCategory, CategorySignalSpec> = {
  product: {
    urlPatterns: [/\/menu/i, /\/products/i, /\/shop/i, /\/order/i],
    htmlPatterns: [/product-card/i, /menu-item/i, /product-list/i, /product-grid/i],
    jsonKeys: ['products', 'menuItems', 'items', 'inventory'],
  },
  specials: {
    urlPatterns: [/\/specials/i, /\/deals/i, /\/promotions/i, /\/offers/i],
    htmlPatterns: [/special/i, /deal/i, /promotion/i, /discount/i, /sale/i],
    jsonKeys: ['specials', 'deals', 'promotions', 'offers'],
  },
  brand: {
    urlPatterns: [/\/brands/i, /\/vendors/i, /\/producers/i],
    htmlPatterns: [/brand-list/i, /vendor/i, /producer/i, /manufacturer/i],
    jsonKeys: ['brands', 'vendors', 'producers', 'manufacturers'],
  },
  metadata: {
    urlPatterns: [/\/categories/i, /\/taxonomy/i],
    htmlPatterns: [/category-nav/i, /menu-categories/i, /filter-category/i],
    jsonKeys: ['categories', 'taxonomy', 'filters', 'types'],
  },
};
|
|
|
|
// ========================================
|
|
// Main Detection Function
|
|
// ========================================
|
|
|
|
/**
 * Loads a website in a browser and attributes each intelligence category to a
 * menu provider.
 *
 * The landing page and a fixed list of common menu paths are visited in turn;
 * the signals collected from every page that loads are then analysed once per
 * category. Sub-pages that fail to load or time out are skipped.
 *
 * Failures of the overall run are caught rather than thrown: every category is
 * then reported as `'unknown'` and the failure text is returned under
 * `rawSignals.error`.
 *
 * @param websiteUrl - Site to probe; passed through `normalizeUrl` first.
 * @param options - Optional settings:
 *   `timeout` is the landing-page navigation timeout in ms (default 30000;
 *   sub-pages always use 15000), `headless` is forwarded to `puppeteer.launch`
 *   (default true) and `existingBrowser` is a browser to reuse instead of
 *   launching one (it is left open for the caller).
 * @returns Per-category results plus the URLs attempted and the raw signals.
 */
export async function detectMultiCategoryProviders(
  websiteUrl: string,
  options: {
    timeout?: number;
    headless?: boolean;
    existingBrowser?: Browser;
  } = {}
): Promise<MultiCategoryDetectionResult> {
  const { timeout = 30000, headless = true, existingBrowser } = options;

  let browser: Browser | null = null;
  let page: Page | null = null;
  const urlsTested: string[] = [];
  const rawSignals: Record<string, any> = {};

  try {
    browser = existingBrowser || await puppeteer.launch({
      headless,
      args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
    });

    page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');

    // Navigate to main site
    const baseUrl = normalizeUrl(websiteUrl);
    urlsTested.push(baseUrl);

    await page.goto(baseUrl, { waitUntil: 'networkidle2', timeout });

    // Collect signals from main page
    rawSignals.mainPage = await collectPageSignals(page);

    // Try common menu URLs
    const menuUrls = ['/menu', '/shop', '/products', '/order', '/specials', '/deals', '/brands'];
    for (const path of menuUrls) {
      try {
        const fullUrl = new URL(path, baseUrl).toString();
        // Recorded before navigating, so URLs that fail to load are listed too.
        urlsTested.push(fullUrl);
        await page.goto(fullUrl, { waitUntil: 'networkidle2', timeout: 15000 });
        rawSignals[path] = await collectPageSignals(page);
      } catch {
        // Best effort: navigation failed or timed out, so this path contributes nothing.
      }
    }

    // Analyze signals for each category
    const result: MultiCategoryDetectionResult = {
      product: analyzeCategorySignals('product', rawSignals),
      specials: analyzeCategorySignals('specials', rawSignals),
      brand: analyzeCategorySignals('brand', rawSignals),
      metadata: analyzeCategorySignals('metadata', rawSignals),
      urlsTested,
      rawSignals,
    };

    logger.info('provider-detection', `Multi-category detection complete for ${websiteUrl}`);
    return result;
  } catch (error: unknown) {
    // Anything can be thrown; only Error instances are guaranteed to have `.message`.
    const message = error instanceof Error ? error.message : String(error);
    logger.error('provider-detection', `Detection failed for ${websiteUrl}: ${message}`);

    // Return unknown results for all categories
    return {
      product: createUnknownResult(),
      specials: createUnknownResult(),
      brand: createUnknownResult(),
      metadata: createUnknownResult(),
      urlsTested,
      rawSignals: { error: message },
    };
  } finally {
    // Always release the page; only close the browser if it was launched here.
    if (page) await page.close().catch(() => {});
    if (browser && !existingBrowser) await browser.close().catch(() => {});
  }
}
|
|
|
|
// ========================================
|
|
// Helper Functions
|
|
// ========================================
|
|
|
|
/**
 * Normalises a website address into an absolute URL without a trailing slash.
 *
 * Surrounding whitespace is removed, `https://` is prepended unless the value
 * already starts with an `http://` or `https://` scheme (case-insensitive),
 * and a single trailing `/` is stripped.
 *
 * @param url - Website address, with or without a scheme.
 * @returns The normalised URL string.
 */
function normalizeUrl(url: string): string {
  const trimmed = url.trim();
  // Test for an actual scheme: a bare host such as "httpbin.org" merely starts with "http".
  const hasHttpScheme = /^https?:\/\//i.test(trimmed);
  const absolute = hasHttpScheme ? trimmed : 'https://' + trimmed;
  return absolute.replace(/\/$/, '');
}
|
|
|
|
/**
 * Extracts detection evidence from the page currently loaded in `page`.
 *
 * Everything inside the callback runs in the page context, so it must stay
 * self-contained and may not reference anything from this module.
 *
 * @param page - Puppeteer page to inspect.
 * @returns An object with `scripts`, `iframes`, `links`, `metaTags`,
 *   `bodyClasses`, `bodyId`, `htmlSnippet` and `jsonBlocks`.
 */
async function collectPageSignals(page: Page): Promise<Record<string, any>> {
  return page.evaluate(() => {
    const selectAll = (selector: string): Element[] =>
      Array.from(document.querySelectorAll(selector));

    const bodyClasses = document.body?.className || '';
    const bodyId = document.body?.id || '';
    // Only the first 10000 characters of the document markup are kept.
    const htmlSnippet = document.documentElement.outerHTML.slice(0, 10000);

    // External scripts are recorded by URL.
    const externalScripts = selectAll('script[src]').map(
      (node) => (node as HTMLScriptElement).src
    );

    // Inline scripts under 5000 characters are recorded as a 500-character prefix.
    const inlineScripts = selectAll('script:not([src])')
      .map((node) => node.textContent || '')
      .filter((text) => text.length < 5000)
      .map((text) => `inline:${text.slice(0, 500)}`);

    const iframes = selectAll('iframe').map((node) => (node as HTMLIFrameElement).src);
    const links = selectAll('a[href]').map((node) => (node as HTMLAnchorElement).href);

    // Meta tags become "name:content"; tags with neither part are dropped.
    const metaTags = selectAll('meta').reduce<string[]>((collected, node) => {
      const content = node.getAttribute('content') || '';
      const name = node.getAttribute('name') || node.getAttribute('property') || '';
      if (content || name) {
        collected.push(`${name}:${content}`);
      }
      return collected;
    }, []);

    // Embedded JSON blocks, truncated to 2000 characters each.
    const jsonBlocks = selectAll('script[type="application/json"]').map(
      (node) => node.textContent?.slice(0, 2000) || ''
    );

    const signals: Record<string, any> = {
      scripts: externalScripts.concat(inlineScripts),
      iframes,
      links,
      metaTags,
      bodyClasses,
      bodyId,
      htmlSnippet,
      jsonBlocks,
    };
    return signals;
  });
}
|
|
|
|
/**
 * Scores every known provider against the collected page signals and builds
 * the detection result for one category.
 *
 * Scoring per page and provider: +20 for each script entry/pattern match,
 * +25 for each iframe/pattern match and +15 for each HTML pattern found in the
 * page snippet. Scores are summed over all pages; the highest total wins
 * (first listed provider on a tie) and is capped at 100 as the confidence.
 *
 * Category-specific HTML and JSON-key matches are recorded in `signals` as
 * evidence only; they do not influence the provider score.
 *
 * @param category - Category the result is built for.
 * @param allSignals - Page signals keyed by page path; non-object entries are ignored.
 * @returns Provider, confidence, mode, evidence and (where assigned) template name.
 */
function analyzeCategorySignals(
  category: IntelligenceCategory,
  allSignals: Record<string, any>
): CategoryDetectionResult {
  const providerScores = {} as Record<MenuProvider, number>;
  const detectedSignals: Record<string, any> = {};

  // Initialize scores
  for (const provider of Object.keys(PROVIDER_PATTERNS)) {
    providerScores[provider as MenuProvider] = 0;
  }

  // The category's signal set does not depend on the page, so look it up once.
  const categorySignals = CATEGORY_SIGNALS[category];

  // Analyze each page's signals
  for (const [pagePath, signals] of Object.entries(allSignals)) {
    if (!signals || typeof signals !== 'object') continue;

    // Check for provider-specific patterns
    for (const [provider, patterns] of Object.entries(PROVIDER_PATTERNS)) {
      let score = 0;

      // Check scripts
      if (signals.scripts) {
        for (const script of signals.scripts) {
          for (const pattern of patterns.scripts) {
            if (pattern.test(script)) {
              score += 20;
              detectedSignals[`${provider}_script_${pagePath}`] = script;
            }
          }
        }
      }

      // Check iframes
      if (signals.iframes) {
        for (const iframe of signals.iframes) {
          for (const pattern of patterns.iframes) {
            if (pattern.test(iframe)) {
              score += 25;
              detectedSignals[`${provider}_iframe_${pagePath}`] = iframe;
            }
          }
        }
      }

      // Check HTML content
      if (signals.htmlSnippet) {
        for (const pattern of patterns.html) {
          if (pattern.test(signals.htmlSnippet)) {
            score += 15;
            detectedSignals[`${provider}_html_${pagePath}`] = true;
          }
        }
      }

      providerScores[provider as MenuProvider] += score;
    }

    // Check for category-specific signals on relevant pages
    const isRelevantPage = categorySignals.urlPatterns.some((p) => p.test(pagePath));

    if (isRelevantPage && signals.htmlSnippet) {
      for (const pattern of categorySignals.htmlPatterns) {
        if (pattern.test(signals.htmlSnippet)) {
          detectedSignals[`${category}_html_pattern`] = true;
        }
      }
    }

    // Check JSON blocks for category data
    if (signals.jsonBlocks) {
      for (const json of signals.jsonBlocks) {
        const haystack = json.toLowerCase();
        for (const key of categorySignals.jsonKeys) {
          // Lower-case the key as well: the haystack is lower-cased, so a
          // mixed-case key such as "menuItems" could otherwise never match.
          if (haystack.includes(`"${key.toLowerCase()}"`)) {
            detectedSignals[`${category}_json_key_${key}`] = true;
          }
        }
      }
    }
  }

  // Determine winning provider (strict ">" keeps the first provider on a tie)
  let bestProvider: MenuProvider = 'unknown';
  let bestScore = 0;

  for (const [provider, score] of Object.entries(providerScores)) {
    if (score > bestScore) {
      bestScore = score;
      bestProvider = provider as MenuProvider;
    }
  }

  // Calculate confidence (0-100)
  const confidence = Math.min(100, bestScore);

  // Production requires both a production-ready provider and confidence >= 70
  const isProductionReady = PRODUCTION_READY[category].includes(bestProvider);
  const mode: 'production' | 'sandbox' = isProductionReady && confidence >= 70
    ? 'production'
    : 'sandbox';

  // Get template name if available
  // NOTE(review): the treez template is assigned for every category, unlike
  // the dutchie one which is product-only - confirm this is intended.
  let templateName: string | undefined;
  if (bestProvider === 'dutchie' && category === 'product') {
    templateName = 'dutchie_standard';
  } else if (bestProvider === 'treez') {
    templateName = 'treez_products_v0';
  }

  return {
    provider: bestProvider,
    confidence,
    mode,
    signals: detectedSignals,
    templateName,
  };
}
|
|
|
|
/**
 * Builds the placeholder result used when detection could not run:
 * provider `'unknown'`, zero confidence, sandbox mode and no evidence.
 * A fresh object is returned on every call.
 */
function createUnknownResult(): CategoryDetectionResult {
  const fallback: CategoryDetectionResult = {
    provider: 'unknown',
    confidence: 0,
    mode: 'sandbox',
    signals: {},
  };
  return fallback;
}
|
|
|
|
// ========================================
|
|
// Lightweight Per-Category Change Detection
|
|
// ========================================
|
|
|
|
/**
 * Checks whether the page that is currently loaded still looks like it is
 * served by the expected provider for the given category.
 *
 * A change is reported only when the detected provider differs from
 * `expectedProvider` and the detection confidence is above 50. Errors are
 * logged and reported as "no change".
 *
 * @param page - Already-navigated Puppeteer page to inspect.
 * @param category - Category being checked.
 * @param expectedProvider - Provider the category is currently attributed to.
 * @returns `{ changed: false }`, or `{ changed: true, newProvider, confidence }`.
 */
export async function detectCategoryProviderChange(
  page: Page,
  category: IntelligenceCategory,
  expectedProvider: MenuProvider
): Promise<{ changed: boolean; newProvider?: MenuProvider; confidence?: number }> {
  try {
    const signals = await collectPageSignals(page);
    const result = analyzeCategorySignals(category, { currentPage: signals });

    // Same provider, or too little evidence to trust a different one.
    if (result.provider === expectedProvider || !(result.confidence > 50)) {
      return { changed: false };
    }

    logger.warn(
      'provider-detection',
      `Provider change detected for ${category}: ${expectedProvider} -> ${result.provider}`
    );
    return {
      changed: true,
      newProvider: result.provider,
      confidence: result.confidence,
    };
  } catch (error: unknown) {
    // Anything can be thrown; only Error instances are guaranteed to have `.message`.
    const message = error instanceof Error ? error.message : String(error);
    logger.error('provider-detection', `Change detection failed: ${message}`);
    return { changed: false };
  }
}
|
|
|
|
// ========================================
|
|
// Database Operations
|
|
// ========================================
|
|
|
|
/**
 * Stores one category's detection result on a dispensary row.
 *
 * Writes the `<category>_provider`, `_confidence`, `_crawler_mode` and
 * `_detection_data` columns and sets `updated_at` to the current time.
 *
 * @param dispensaryId - Primary key of the `dispensaries` row.
 * @param category - Category whose columns are written.
 * @param result - Detection result to store; `signals` is serialised as JSON.
 */
export async function updateDispensaryCategoryProvider(
  dispensaryId: number,
  category: IntelligenceCategory,
  result: CategoryDetectionResult
): Promise<void> {
  // Column names are interpolated into the SQL, so the prefix is restricted to
  // the known values; anything else falls back to 'metadata'.
  let columnPrefix: string;
  switch (category) {
    case 'product':
    case 'specials':
    case 'brand':
      columnPrefix = category;
      break;
    default:
      columnPrefix = 'metadata';
  }

  const values = [
    result.provider,
    result.confidence,
    result.mode,
    JSON.stringify(result.signals),
    dispensaryId,
  ];

  await pool.query(
    `UPDATE dispensaries SET
      ${columnPrefix}_provider = $1,
      ${columnPrefix}_confidence = $2,
      ${columnPrefix}_crawler_mode = $3,
      ${columnPrefix}_detection_data = $4,
      updated_at = NOW()
    WHERE id = $5`,
    values
  );
}
|
|
|
|
/**
 * Stores the detection results of all four categories on a dispensary row in
 * a single UPDATE and sets `updated_at` to the current time.
 *
 * @param dispensaryId - Primary key of the `dispensaries` row.
 * @param result - Multi-category detection result to store.
 */
export async function updateAllCategoryProviders(
  dispensaryId: number,
  result: MultiCategoryDetectionResult
): Promise<void> {
  // Each category contributes four values, in the column order used below.
  const columnValues = (detected: CategoryDetectionResult): Array<string | number> => [
    detected.provider,
    detected.confidence,
    detected.mode,
    JSON.stringify(detected.signals),
  ];

  const values: Array<string | number> = [
    ...columnValues(result.product),   // $1-$4
    ...columnValues(result.specials),  // $5-$8
    ...columnValues(result.brand),     // $9-$12
    ...columnValues(result.metadata),  // $13-$16
    dispensaryId,                      // $17
  ];

  await pool.query(
    `UPDATE dispensaries SET
      product_provider = $1,
      product_confidence = $2,
      product_crawler_mode = $3,
      product_detection_data = $4,
      specials_provider = $5,
      specials_confidence = $6,
      specials_crawler_mode = $7,
      specials_detection_data = $8,
      brand_provider = $9,
      brand_confidence = $10,
      brand_crawler_mode = $11,
      brand_detection_data = $12,
      metadata_provider = $13,
      metadata_confidence = $14,
      metadata_crawler_mode = $15,
      metadata_detection_data = $16,
      updated_at = NOW()
    WHERE id = $17`,
    values
  );
}
|
|
|
|
/**
 * Switches one category of a dispensary to sandbox mode and records why.
 *
 * Sets `<category>_crawler_mode` to `'sandbox'` and merges
 * `{ sandbox_reason, sandbox_at }` into `<category>_detection_data`.
 *
 * @param dispensaryId - Primary key of the `dispensaries` row.
 * @param category - Category to move to sandbox.
 * @param reason - Explanation stored as `sandbox_reason` and written to the log.
 */
export async function moveCategoryToSandbox(
  dispensaryId: number,
  category: IntelligenceCategory,
  reason: string
): Promise<void> {
  // Column names are interpolated into the SQL, so the prefix is restricted to
  // the known values; anything else falls back to 'metadata'.
  const columnPrefix = category === 'product' ? 'product' :
    category === 'specials' ? 'specials' :
    category === 'brand' ? 'brand' : 'metadata';

  await pool.query(
    // COALESCE is required: `NULL || jsonb` is NULL, which would silently drop
    // the sandbox reason for rows that have no detection data yet.
    `UPDATE dispensaries SET
      ${columnPrefix}_crawler_mode = 'sandbox',
      ${columnPrefix}_detection_data = COALESCE(${columnPrefix}_detection_data, '{}'::jsonb) || $1::jsonb,
      updated_at = NOW()
    WHERE id = $2`,
    [
      JSON.stringify({ sandbox_reason: reason, sandbox_at: new Date().toISOString() }),
      dispensaryId,
    ]
  );

  logger.info('provider-detection', `Moved dispensary ${dispensaryId} ${category} to sandbox: ${reason}`);
}
|