Files
cannaiq/backend/src/_deprecated/services/intelligence-detector.ts
Kelly a35976b9e9 chore: Clean up deprecated code and docs
- Move deprecated directories to src/_deprecated/:
  - hydration/ (old pipeline approach)
  - scraper-v2/ (old Puppeteer scraper)
  - canonical-hydration/ (merged into tasks)
  - Unused services: availability, crawler-logger, geolocation, etc
  - Unused utils: age-gate-playwright, HomepageValidator, stealthBrowser

- Archive outdated docs to docs/_archive/:
  - ANALYTICS_RUNBOOK.md
  - ANALYTICS_V2_EXAMPLES.md
  - BRAND_INTELLIGENCE_API.md
  - CRAWL_PIPELINE.md
  - TASK_WORKFLOW_2024-12-10.md
  - WORKER_TASK_ARCHITECTURE.md
  - ORGANIC_SCRAPING_GUIDE.md

- Add docs/CODEBASE_MAP.md as single source of truth
- Add warning files to deprecated/archived directories
- Slim down CLAUDE.md to essential rules only

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-11 22:17:40 -07:00

621 lines
17 KiB
TypeScript

/**
* Multi-Category Intelligence Detector
*
* Detects providers for each intelligence category independently:
* - Products: Which provider serves product data
* - Specials: Which provider serves deals/specials
* - Brand: Which provider serves brand information
* - Metadata: Which provider serves taxonomy/category data
*/
import { pool } from '../db/pool';
import { logger } from './logger';
import puppeteer, { Browser, Page } from 'puppeteer';
// ========================================
// Types
// ========================================
/**
 * The intelligence categories for which a provider is detected independently.
 * Each value is also used as the column-name prefix (`<category>_provider`, ...)
 * by the database helpers in this file.
 */
export type IntelligenceCategory = 'product' | 'specials' | 'brand' | 'metadata';
/**
 * Identifier of a menu / e-commerce provider.
 *
 * Only the providers that have an entry in PROVIDER_PATTERNS (`dutchie`, `treez`,
 * `jane`, `weedmaps`, `leafly`) can be produced by the pattern-based scoring in
 * this file; `unknown` is reported when no provider scores above zero or when
 * detection fails.
 */
export type MenuProvider =
| 'dutchie'
| 'treez'
| 'jane'
| 'iheartjane'
| 'weedmaps'
| 'leafly'
| 'meadow'
| 'greenlight'
| 'blaze'
| 'flowhub'
| 'dispense'
| 'cova'
| 'custom_html'
| 'custom_json'
| 'dutchie_json'
| 'other'
| 'unknown';
/** Detection outcome for a single intelligence category. */
export interface CategoryDetectionResult {
/** Best-scoring provider, or `unknown` when nothing matched. */
provider: MenuProvider;
/** Winning provider's accumulated score, capped at 100. */
confidence: number;
/** `production` only for production-ready provider/category pairs with confidence >= 70; otherwise `sandbox`. */
mode: 'production' | 'sandbox';
/** Evidence that was matched, keyed by descriptive strings such as `<provider>_script_<page>`. */
signals: Record<string, any>;
/** Crawler template name, set only for the provider/category combinations that have one. */
templateName?: string;
}
/** Detection outcome for all four categories of one website. */
export interface MultiCategoryDetectionResult {
product: CategoryDetectionResult;
specials: CategoryDetectionResult;
brand: CategoryDetectionResult;
metadata: CategoryDetectionResult;
/** Every URL navigation was attempted for, recorded before the attempt (so failed ones are included). */
urlsTested: string[];
/**
 * Raw per-page signals keyed by `mainPage` or the probed path (e.g. `/menu`).
 * When detection fails it holds only an `error` message instead.
 */
rawSignals: Record<string, any>;
}
// Production-ready providers per category.
// Only these provider/category combinations can be set to production mode:
// analyzeCategorySignals selects 'production' when the winning provider is
// listed here for the category AND its confidence is at least 70; every other
// result is put in 'sandbox' mode.
const PRODUCTION_READY: Record<IntelligenceCategory, MenuProvider[]> = {
product: ['dutchie'], // Only Dutchie products are production-ready
specials: [], // None yet
brand: [], // None yet
metadata: [], // None yet
};
// Provider detection patterns, keyed by provider name.
//
// Each entry groups case-insensitive regexes by the kind of page signal they
// are meant to be tested against:
//   scripts      - <script src> URLs and `inline:`-prefixed inline script excerpts
//   iframes      - <iframe> src URLs
//   html         - the captured HTML snippet of the page
//   apiEndpoints - API URLs associated with the provider
//   metaTags     - <meta> tag contents
//
// NOTE: analyzeCategorySignals in this file only reads `scripts`, `iframes`
// and `html`; `apiEndpoints` and `metaTags` are declared but not consulted by it.
// None of the regexes use the `g` flag, so repeated `.test()` calls are stateless.
const PROVIDER_PATTERNS: Record<string, {
scripts: RegExp[];
iframes: RegExp[];
html: RegExp[];
apiEndpoints: RegExp[];
metaTags: RegExp[];
}> = {
dutchie: {
scripts: [
/dutchie\.com/i,
/dutchie-plus/i,
/dutchie\.js/i,
/__DUTCHIE__/i,
/dutchie-embed/i,
],
iframes: [
/dutchie\.com/i,
/dutchie-plus\.com/i,
/embed\.dutchie/i,
],
html: [
/class="dutchie/i,
/id="dutchie/i,
/data-dutchie/i,
/"menuType":\s*"dutchie"/i,
],
apiEndpoints: [
/dutchie\.com\/graphql/i,
/plus\.dutchie\.com/i,
],
metaTags: [
/dutchie/i,
],
},
treez: {
scripts: [
/treez\.io/i,
/treez-ecommerce/i,
/treez\.js/i,
],
iframes: [
/treez\.io/i,
/shop\.treez/i,
],
html: [
/class="treez/i,
/data-treez/i,
/treez-menu/i,
],
apiEndpoints: [
/api\.treez\.io/i,
/treez\.io\/api/i,
],
metaTags: [],
},
// The `jane` entry also covers iheartjane.com URLs.
jane: {
scripts: [
/jane\.co/i,
/iheartjane\.com/i,
/jane-frame/i,
/jane\.js/i,
],
iframes: [
/jane\.co/i,
/iheartjane\.com/i,
/embed\.iheartjane/i,
],
html: [
/class="jane/i,
/data-jane/i,
/jane-embed/i,
],
apiEndpoints: [
/api\.iheartjane/i,
/jane\.co\/api/i,
],
metaTags: [],
},
weedmaps: {
scripts: [
/weedmaps\.com/i,
/wm-menu/i,
],
iframes: [
/weedmaps\.com/i,
/menu\.weedmaps/i,
],
html: [
/data-weedmaps/i,
/wm-menu/i,
],
apiEndpoints: [
/api-g\.weedmaps/i,
/weedmaps\.com\/api/i,
],
metaTags: [],
},
leafly: {
scripts: [
/leafly\.com/i,
/leafly-menu/i,
],
iframes: [
/leafly\.com/i,
/order\.leafly/i,
],
html: [
/data-leafly/i,
/leafly-embed/i,
],
apiEndpoints: [
/api\.leafly/i,
],
metaTags: [],
},
};
// Category-specific detection signals.
//
//   urlPatterns  - tested against the key a page's signals are stored under
//                  (e.g. '/menu') to decide whether the page is relevant to the category
//   htmlPatterns - tested against the HTML snippet of relevant pages only
//   jsonKeys     - looked up as quoted keys (`"key"`) inside the lower-cased text
//                  of each captured JSON block
//
// In analyzeCategorySignals these hits are recorded as evidence in the result's
// `signals`; they do not add to any provider's score.
const CATEGORY_SIGNALS: Record<IntelligenceCategory, {
urlPatterns: RegExp[];
htmlPatterns: RegExp[];
jsonKeys: string[];
}> = {
product: {
urlPatterns: [/\/menu/i, /\/products/i, /\/shop/i, /\/order/i],
htmlPatterns: [/product-card/i, /menu-item/i, /product-list/i, /product-grid/i],
jsonKeys: ['products', 'menuItems', 'items', 'inventory'],
},
specials: {
urlPatterns: [/\/specials/i, /\/deals/i, /\/promotions/i, /\/offers/i],
htmlPatterns: [/special/i, /deal/i, /promotion/i, /discount/i, /sale/i],
jsonKeys: ['specials', 'deals', 'promotions', 'offers'],
},
brand: {
urlPatterns: [/\/brands/i, /\/vendors/i, /\/producers/i],
htmlPatterns: [/brand-list/i, /vendor/i, /producer/i, /manufacturer/i],
jsonKeys: ['brands', 'vendors', 'producers', 'manufacturers'],
},
metadata: {
urlPatterns: [/\/categories/i, /\/taxonomy/i],
htmlPatterns: [/category-nav/i, /menu-categories/i, /filter-category/i],
jsonKeys: ['categories', 'taxonomy', 'filters', 'types'],
},
};
// ========================================
// Main Detection Function
// ========================================
/**
 * Loads a website in Puppeteer, collects page signals from the landing page and
 * a fixed list of common menu paths, and scores a provider for every
 * intelligence category.
 *
 * The function never rejects: any failure (launch, navigation of the main page,
 * signal collection) is logged and reported as `unknown` results for all
 * categories, with the error message in `rawSignals.error`.
 *
 * @param websiteUrl  Site to inspect; a missing scheme is added by normalizeUrl.
 * @param options.timeout  Navigation timeout (ms) for the main page. Default 30000.
 * @param options.headless  Passed to `puppeteer.launch` when a browser is created here. Default true.
 * @param options.existingBrowser  Browser to reuse. When given, only the page opened
 *   here is closed afterwards; the browser itself is left running for the caller.
 * @returns Per-category detection results plus the URLs tried and the raw signals.
 */
export async function detectMultiCategoryProviders(
  websiteUrl: string,
  options: {
    timeout?: number;
    headless?: boolean;
    existingBrowser?: Browser;
  } = {}
): Promise<MultiCategoryDetectionResult> {
  const { timeout = 30000, headless = true, existingBrowser } = options;
  let browser: Browser | null = null;
  let page: Page | null = null;
  const urlsTested: string[] = [];
  const rawSignals: Record<string, any> = {};
  try {
    browser = existingBrowser || await puppeteer.launch({
      headless,
      args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
    });
    page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');

    // Navigate to main site
    const baseUrl = normalizeUrl(websiteUrl);
    urlsTested.push(baseUrl);
    await page.goto(baseUrl, { waitUntil: 'networkidle2', timeout });

    // Collect signals from main page
    rawSignals.mainPage = await collectPageSignals(page);

    // Try common menu URLs
    const menuUrls = ['/menu', '/shop', '/products', '/order', '/specials', '/deals', '/brands'];
    for (const path of menuUrls) {
      try {
        const fullUrl = new URL(path, baseUrl).toString();
        urlsTested.push(fullUrl);
        const response = await page.goto(fullUrl, { waitUntil: 'networkidle2', timeout: 15000 });
        // page.goto resolves (it does not throw) for 4xx/5xx responses, so an
        // error page would otherwise be analysed as if it were the real
        // /menu, /specials, ... page. Skip anything that is not a 2xx response.
        if (response && !response.ok()) continue;
        rawSignals[path] = await collectPageSignals(page);
      } catch {
        // Navigation failed or timed out: this path is optional, move on.
      }
    }

    // Analyze signals for each category
    const result: MultiCategoryDetectionResult = {
      product: analyzeCategorySignals('product', rawSignals),
      specials: analyzeCategorySignals('specials', rawSignals),
      brand: analyzeCategorySignals('brand', rawSignals),
      metadata: analyzeCategorySignals('metadata', rawSignals),
      urlsTested,
      rawSignals,
    };
    logger.info('provider-detection', `Multi-category detection complete for ${websiteUrl}`);
    return result;
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    logger.error('provider-detection', `Detection failed for ${websiteUrl}: ${message}`);
    // Return unknown results for all categories
    return {
      product: createUnknownResult(),
      specials: createUnknownResult(),
      brand: createUnknownResult(),
      metadata: createUnknownResult(),
      urlsTested,
      rawSignals: { error: message },
    };
  } finally {
    // Always release the page; only close a browser that was launched here.
    if (page) await page.close().catch(() => {});
    if (browser && !existingBrowser) await browser.close().catch(() => {});
  }
}
// ========================================
// Helper Functions
// ========================================
/**
 * Normalises a website address so it can be used as a navigation target and as
 * the base for `new URL(path, base)`.
 *
 * - Surrounding whitespace is removed.
 * - `https://` is prepended unless the value already starts with an explicit
 *   `http://` or `https://` scheme (case-insensitive). Checking for the full
 *   scheme rather than the bare prefix "http" keeps hosts such as
 *   `httpbin.org` from being mistaken for absolute URLs.
 * - A single trailing slash is stripped.
 *
 * @param url  Raw website value, with or without a scheme.
 * @returns The normalised absolute URL string.
 */
function normalizeUrl(url: string): string {
  const trimmed = url.trim();
  const withScheme = /^https?:\/\//i.test(trimmed) ? trimmed : `https://${trimmed}`;
  return withScheme.replace(/\/$/, '');
}
/**
 * Extracts the raw detection signals from the document currently loaded in `page`.
 *
 * The callback runs inside the page via `page.evaluate`, so it is kept fully
 * self-contained (no references to module-level helpers).
 *
 * @param page  Puppeteer page that has already been navigated.
 * @returns An object with `scripts` (external src URLs followed by
 *   `inline:`-prefixed excerpts of inline scripts shorter than 5000 chars),
 *   `iframes`, `links`, `metaTags` (`name:content` pairs), `bodyClasses`,
 *   `bodyId`, `htmlSnippet` (first 10000 chars of the document HTML) and
 *   `jsonBlocks` (first 2000 chars of each `application/json` script).
 */
async function collectPageSignals(page: Page): Promise<Record<string, any>> {
  return page.evaluate(() => {
    // External script URLs come first, inline excerpts after them.
    const externalScripts = Array.from(
      document.querySelectorAll('script[src]'),
      (node) => (node as HTMLScriptElement).src
    );
    const inlineScripts = Array.from(
      document.querySelectorAll('script:not([src])'),
      (node) => node.textContent || ''
    )
      .filter((text) => text.length < 5000)
      .map((text) => `inline:${text.slice(0, 500)}`);

    const iframeSources = Array.from(document.querySelectorAll('iframe'), (frame) => frame.src);

    const linkTargets = Array.from(
      document.querySelectorAll('a[href]'),
      (anchor) => (anchor as HTMLAnchorElement).href
    );

    // Keep a meta tag when it has a content value or a name/property.
    const metaEntries: string[] = [];
    for (const meta of Array.from(document.querySelectorAll('meta'))) {
      const content = meta.getAttribute('content') || '';
      const name = meta.getAttribute('name') || meta.getAttribute('property') || '';
      if (!content && !name) continue;
      metaEntries.push(`${name}:${content}`);
    }

    const embeddedJson = Array.from(
      document.querySelectorAll('script[type="application/json"]'),
      (node) => node.textContent?.slice(0, 2000) || ''
    );

    const collected: Record<string, any> = {
      scripts: externalScripts.concat(inlineScripts),
      iframes: iframeSources,
      links: linkTargets,
      metaTags: metaEntries,
      bodyClasses: document.body?.className || '',
      bodyId: document.body?.id || '',
      htmlSnippet: document.documentElement.outerHTML.slice(0, 10000),
      jsonBlocks: embeddedJson,
    };
    return collected;
  });
}
/**
 * Scores every provider in PROVIDER_PATTERNS against the signals collected from
 * each page and reports the best match for one intelligence category.
 *
 * Scoring, summed over all pages:
 *   - +20 for every (script, script-pattern) pair that matches
 *   - +25 for every (iframe, iframe-pattern) pair that matches
 *   - +15 for every HTML pattern found in the page's HTML snippet
 * The confidence is the winning provider's total capped at 100. Ties keep the
 * provider listed first in PROVIDER_PATTERNS; with no match the provider is
 * `unknown` with confidence 0.
 *
 * Category-specific HTML patterns (on pages whose key matches the category's
 * URL patterns) and JSON keys are recorded in the returned `signals` as
 * evidence only; they do not influence the score.
 *
 * @param category    Category being analysed.
 * @param allSignals  Page key -> signals object (as produced by collectPageSignals).
 *                    Entries that are not objects (e.g. an error string) are skipped.
 * @returns Provider, confidence, crawler mode, matched evidence and, for the
 *          combinations that have one, a template name.
 */
function analyzeCategorySignals(
  category: IntelligenceCategory,
  allSignals: Record<string, any>
): CategoryDetectionResult {
  const providerScores: Record<string, number> = {};
  const detectedSignals: Record<string, any> = {};

  // Initialize scores
  for (const provider of Object.keys(PROVIDER_PATTERNS)) {
    providerScores[provider] = 0;
  }

  const categorySignals = CATEGORY_SIGNALS[category];

  // Analyze each page's signals
  for (const [pagePath, signals] of Object.entries(allSignals)) {
    if (!signals || typeof signals !== 'object') continue;

    // Check for provider-specific patterns
    for (const [provider, patterns] of Object.entries(PROVIDER_PATTERNS)) {
      let score = 0;

      // Check scripts
      if (signals.scripts) {
        for (const script of signals.scripts) {
          for (const pattern of patterns.scripts) {
            if (pattern.test(script)) {
              score += 20;
              detectedSignals[`${provider}_script_${pagePath}`] = script;
            }
          }
        }
      }

      // Check iframes
      if (signals.iframes) {
        for (const iframe of signals.iframes) {
          for (const pattern of patterns.iframes) {
            if (pattern.test(iframe)) {
              score += 25;
              detectedSignals[`${provider}_iframe_${pagePath}`] = iframe;
            }
          }
        }
      }

      // Check HTML content
      if (signals.htmlSnippet) {
        for (const pattern of patterns.html) {
          if (pattern.test(signals.htmlSnippet)) {
            score += 15;
            detectedSignals[`${provider}_html_${pagePath}`] = true;
          }
        }
      }

      providerScores[provider] += score;
    }

    // Check for category-specific signals on relevant pages
    const isRelevantPage = categorySignals.urlPatterns.some((p) => p.test(pagePath));
    if (isRelevantPage && signals.htmlSnippet) {
      for (const pattern of categorySignals.htmlPatterns) {
        if (pattern.test(signals.htmlSnippet)) {
          detectedSignals[`${category}_html_pattern`] = true;
        }
      }
    }

    // Check JSON blocks for category data
    if (signals.jsonBlocks) {
      for (const json of signals.jsonBlocks) {
        const haystack = json.toLowerCase();
        for (const key of categorySignals.jsonKeys) {
          // Both sides are lower-cased: comparing the lower-cased JSON against
          // a mixed-case key such as "menuItems" could never match.
          if (haystack.includes(`"${key.toLowerCase()}"`)) {
            detectedSignals[`${category}_json_key_${key}`] = true;
          }
        }
      }
    }
  }

  // Determine winning provider (strictly greater, so the first of equals wins)
  let bestProvider: MenuProvider = 'unknown';
  let bestScore = 0;
  for (const [provider, score] of Object.entries(providerScores)) {
    if (score > bestScore) {
      bestScore = score;
      bestProvider = provider as MenuProvider;
    }
  }

  // Calculate confidence (0-100)
  const confidence = Math.min(100, bestScore);

  // Determine mode based on provider and confidence
  const isProductionReady = PRODUCTION_READY[category].includes(bestProvider);
  const mode: 'production' | 'sandbox' = isProductionReady && confidence >= 70
    ? 'production'
    : 'sandbox';

  // Get template name if available
  // NOTE(review): the treez template is returned for every category, not only
  // 'product', although its name suggests products - confirm this is intended.
  let templateName: string | undefined;
  if (bestProvider === 'dutchie' && category === 'product') {
    templateName = 'dutchie_standard';
  } else if (bestProvider === 'treez') {
    templateName = 'treez_products_v0';
  }

  return {
    provider: bestProvider,
    confidence,
    mode,
    signals: detectedSignals,
    templateName,
  };
}
/**
 * Builds the placeholder result used when detection could not run:
 * provider `unknown`, zero confidence, sandbox mode and no evidence.
 * A fresh object is returned on every call.
 */
function createUnknownResult(): CategoryDetectionResult {
  const placeholder: CategoryDetectionResult = {
    provider: 'unknown',
    confidence: 0,
    mode: 'sandbox',
    signals: {},
  };
  return placeholder;
}
// ========================================
// Lightweight Per-Category Change Detection
// ========================================
/**
 * Lightweight check of an already-loaded page: re-scores the providers for one
 * category and reports whether the page now looks like a different provider.
 *
 * A change is reported only when the detected provider differs from
 * `expectedProvider` AND the detection confidence is above 50. Errors are
 * logged and reported as "no change".
 *
 * @param page              Puppeteer page to inspect (not navigated here).
 * @param category          Category to evaluate.
 * @param expectedProvider  Provider currently on record.
 * @returns `{ changed: false }`, or `{ changed: true, newProvider, confidence }`.
 */
export async function detectCategoryProviderChange(
  page: Page,
  category: IntelligenceCategory,
  expectedProvider: MenuProvider
): Promise<{ changed: boolean; newProvider?: MenuProvider; confidence?: number }> {
  try {
    const pageSignals = await collectPageSignals(page);
    const { provider: detected, confidence } = analyzeCategorySignals(category, {
      currentPage: pageSignals,
    });

    const sameProvider = detected === expectedProvider;
    if (sameProvider || !(confidence > 50)) {
      return { changed: false };
    }

    logger.warn(
      'provider-detection',
      `Provider change detected for ${category}: ${expectedProvider} -> ${detected}`
    );
    return { changed: true, newProvider: detected, confidence };
  } catch (error: any) {
    logger.error('provider-detection', `Change detection failed: ${error.message}`);
    return { changed: false };
  }
}
// ========================================
// Database Operations
// ========================================
/**
 * Persists the detection result of a single category on the `dispensaries` row.
 *
 * Writes `<category>_provider`, `<category>_confidence`, `<category>_crawler_mode`
 * and `<category>_detection_data` (the signals serialised as JSON) and bumps
 * `updated_at`.
 *
 * @param dispensaryId  Primary key of the dispensary row.
 * @param category      Category whose columns are updated.
 * @param result        Detection result to store.
 */
export async function updateDispensaryCategoryProvider(
  dispensaryId: number,
  category: IntelligenceCategory,
  result: CategoryDetectionResult
): Promise<void> {
  // The prefix is interpolated into the SQL text, so it is chosen from a fixed
  // set; anything unexpected falls back to 'metadata'.
  let prefix: string;
  switch (category) {
    case 'product':
    case 'specials':
    case 'brand':
      prefix = category;
      break;
    default:
      prefix = 'metadata';
  }

  const statement = `UPDATE dispensaries SET
${prefix}_provider = $1,
${prefix}_confidence = $2,
${prefix}_crawler_mode = $3,
${prefix}_detection_data = $4,
updated_at = NOW()
WHERE id = $5`;
  const values = [
    result.provider,
    result.confidence,
    result.mode,
    JSON.stringify(result.signals),
    dispensaryId,
  ];

  await pool.query(statement, values);
}
/**
 * Persists the detection results of all four categories on the `dispensaries`
 * row in a single UPDATE.
 *
 * Parameters are bound in the order product, specials, brand, metadata, each
 * contributing provider, confidence, mode and the JSON-serialised signals
 * ($1-$16), followed by the dispensary id ($17).
 *
 * @param dispensaryId  Primary key of the dispensary row.
 * @param result        Multi-category detection result to store.
 */
export async function updateAllCategoryProviders(
  dispensaryId: number,
  result: MultiCategoryDetectionResult
): Promise<void> {
  const statement = `UPDATE dispensaries SET
product_provider = $1,
product_confidence = $2,
product_crawler_mode = $3,
product_detection_data = $4,
specials_provider = $5,
specials_confidence = $6,
specials_crawler_mode = $7,
specials_detection_data = $8,
brand_provider = $9,
brand_confidence = $10,
brand_crawler_mode = $11,
brand_detection_data = $12,
metadata_provider = $13,
metadata_confidence = $14,
metadata_crawler_mode = $15,
metadata_detection_data = $16,
updated_at = NOW()
WHERE id = $17`;

  // Must follow the placeholder order of the statement above.
  const orderedResults = [result.product, result.specials, result.brand, result.metadata];
  const values: Array<string | number> = [];
  for (const detection of orderedResults) {
    values.push(
      detection.provider,
      detection.confidence,
      detection.mode,
      JSON.stringify(detection.signals)
    );
  }
  values.push(dispensaryId);

  await pool.query(statement, values);
}
/**
 * Forces one category of a dispensary back into sandbox crawler mode and
 * records why.
 *
 * Sets `<category>_crawler_mode` to 'sandbox' and merges
 * `{ sandbox_reason, sandbox_at }` into `<category>_detection_data`, then logs
 * the move.
 *
 * @param dispensaryId  Primary key of the dispensary row.
 * @param category      Category to move to sandbox.
 * @param reason        Human-readable reason, stored as `sandbox_reason`.
 */
export async function moveCategoryToSandbox(
  dispensaryId: number,
  category: IntelligenceCategory,
  reason: string
): Promise<void> {
  // The prefix is interpolated into the SQL text, so it is chosen from a fixed
  // set; anything unexpected falls back to 'metadata'.
  const columnPrefix = category === 'product' ? 'product' :
    category === 'specials' ? 'specials' :
    category === 'brand' ? 'brand' : 'metadata';

  // COALESCE is required: `NULL || jsonb` evaluates to NULL, so without it a
  // row that has no detection data yet would silently lose the sandbox reason.
  await pool.query(
    `UPDATE dispensaries SET
      ${columnPrefix}_crawler_mode = 'sandbox',
      ${columnPrefix}_detection_data = COALESCE(${columnPrefix}_detection_data, '{}'::jsonb) || $1::jsonb,
      updated_at = NOW()
    WHERE id = $2`,
    [
      JSON.stringify({ sandbox_reason: reason, sandbox_at: new Date().toISOString() }),
      dispensaryId,
    ]
  );
  logger.info('provider-detection', `Moved dispensary ${dispensaryId} ${category} to sandbox: ${reason}`);
}