Files
cannaiq/backend/dist/dutchie-az/services/menu-detection.js
Kelly d91c55a344 feat: Add stale process monitor, users route, landing page, archive old scripts
- Add backend stale process monitoring API (/api/stale-processes)
- Add users management route
- Add frontend landing page and stale process monitor UI on /scraper-tools
- Move old development scripts to backend/archive/
- Update frontend build with new features

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-05 04:07:31 -07:00

910 lines
35 KiB
JavaScript

"use strict";
/**
* Menu Detection Service
*
* Detects menu provider (dutchie, treez, jane, etc.) from dispensary menu_url
* and resolves platform_dispensary_id for dutchie stores.
*
* This service:
* 1. Iterates dispensaries with unknown/missing menu_type or platform_dispensary_id
* 2. Detects provider from menu_url patterns
* 3. For dutchie: extracts cName and resolves platform_dispensary_id via GraphQL
* 4. Logs results to job_run_logs
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.crawlWebsiteForMenuLinks = crawlWebsiteForMenuLinks;
exports.detectProviderFromUrl = detectProviderFromUrl;
exports.detectAndResolveDispensary = detectAndResolveDispensary;
exports.runBulkDetection = runBulkDetection;
exports.executeMenuDetectionJob = executeMenuDetectionJob;
exports.getDetectionStats = getDetectionStats;
exports.getDispensariesNeedingDetection = getDispensariesNeedingDetection;
const connection_1 = require("../db/connection");
const discovery_1 = require("./discovery");
const graphql_client_1 = require("./graphql-client");
// Explicit column list for the dispensaries table (avoids SELECT * issues with schema differences).
// This string is interpolated verbatim between SELECT and FROM by the queries in this file,
// so it must remain a plain comma-separated list of column names with no trailing comma.
const DISPENSARY_COLUMNS = `
id, name, slug, city, state, zip, address, latitude, longitude,
menu_type, menu_url, platform_dispensary_id, website,
provider_detection_data, created_at, updated_at
`;
// ============================================================
// PROVIDER DETECTION PATTERNS
// ============================================================
// Ordered provider table as [{ provider, patterns }]. Consumers walk it top to bottom
// and stop at the first provider with a matching pattern, so entry order is significant.
// We detect provider based on the actual menu link we find, not just the site domain.
const PROVIDER_URL_PATTERNS = Object.entries({
    dutchie: [
        /dutchie\.com/i,
        /\/embedded-menu\//i,
        /\/dispensary\/[A-Z]{2}-/i, // e.g., /dispensary/AZ-store-name
        /dutchie-plus/i,
        /curaleaf\.com/i, // Curaleaf uses Dutchie platform
        /livewithsol\.com/i, // Sol Flower uses Dutchie platform
    ],
    treez: [/treez\.io/i, /shop\.treez/i, /treez-ecommerce/i],
    jane: [/jane\.co/i, /iheartjane\.com/i, /embed\.iheartjane/i],
    weedmaps: [/weedmaps\.com/i, /menu\.weedmaps/i],
    leafly: [/leafly\.com/i, /order\.leafly/i],
    meadow: [/getmeadow\.com/i, /meadow\.co/i],
    blaze: [/blaze\.me/i, /blazepos\.com/i],
    flowhub: [/flowhub\.com/i, /flowhub\.co/i],
    dispense: [/dispense\.io/i, /dispenseapp\.com/i],
}).map(([provider, patterns]) => ({ provider, patterns }));
/**
 * Patterns hinting that a link leads to a menu or ordering page.
 * Two groups, in order: generic storefront path segments, then the domains
 * of known menu providers and chains.
 */
const MENU_LINK_PATTERNS = [
    // Storefront-style path segments
    /\/menu/i, /\/order/i, /\/shop/i,
    /\/products/i, /\/dispensary/i, /\/store/i,
    // Provider and chain domains
    /curaleaf\.com/i, /dutchie\.com/i, /treez\.io/i,
    /jane\.co/i, /iheartjane\.com/i, /weedmaps\.com/i,
    /leafly\.com/i, /getmeadow\.com/i, /blaze\.me/i,
    /flowhub\.com/i, /dispense\.io/i,
];
/**
 * Report whether a URL points at a Curaleaf store or dispensary page,
 * i.e. contains `curaleaf.com/stores/` or `curaleaf.com/dispensary/` (case-insensitive).
 *
 * @param {string|null|undefined} url - URL to inspect.
 * @returns {boolean} `false` for empty/missing input or when the path does not match.
 */
function isCuraleafUrl(url) {
    const curaleafStorePath = /curaleaf\.com\/(stores|dispensary)\//i;
    return Boolean(url) && curaleafStorePath.test(url);
}
/**
 * Fetch a page and extract the links it contains.
 *
 * Behavior:
 * - If the HTML contains a 24-hex-character `"dispensaryId"` value, the page is treated
 *   as an embedded Dutchie menu and the only link returned is the synthetic token
 *   `dutchie-reactenv:<id>`.
 * - Otherwise every `href="..."` attribute (on any tag) is resolved against `url` and
 *   returned, plus every `src="..."` attribute whose resolved URL matches one of
 *   PROVIDER_URL_PATTERNS (embedded menus are commonly iframes/scripts).
 *
 * This function never throws; failures are reported through the `error` field.
 *
 * @param {string} url - Absolute URL of the page to fetch.
 * @param {number} [timeout=10000] - Milliseconds allowed for the whole request,
 *   including reading the response body.
 * @returns {Promise<{links: string[], error?: string}>} De-duplicated absolute links,
 *   or an empty list with `error` set to `HTTP <status>`, `Timeout`, or the error message.
 */
async function fetchPageLinks(url, timeout = 10000) {
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), timeout);
    try {
        // Use Googlebot User-Agent to bypass age gates on dispensary websites
        const response = await fetch(url, {
            signal: controller.signal,
            headers: {
                'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            },
            redirect: 'follow',
        });
        if (!response.ok) {
            return { links: [], error: `HTTP ${response.status}` };
        }
        // The abort timer is still armed here so that a stalled body download is
        // cut off as well, not only a slow connection/headers phase.
        const html = await response.text();
        // Quick check: if the page contains reactEnv.dispensaryId, treat it as Dutchie.
        // Match dispensaryId directly - a [^}]* style pattern fails with nested braces in JSON.
        const reactEnvMatch = /"dispensaryId"\s*:\s*"([a-fA-F0-9]{24})"/i.exec(html);
        if (reactEnvMatch && reactEnvMatch[1]) {
            return { links: [`dutchie-reactenv:${reactEnvMatch[1]}`] };
        }
        const links = [];
        // Collect every href attribute, converting relative URLs to absolute.
        for (const [, href] of html.matchAll(/href=["']([^"']+)["']/gi)) {
            try {
                links.push(new URL(href, url).href);
            }
            catch {
                // Skip values that cannot be resolved to a URL
            }
        }
        // Collect src attributes too, but only those pointing at a known provider.
        for (const [, src] of html.matchAll(/src=["']([^"']+)["']/gi)) {
            try {
                const absoluteUrl = new URL(src, url).href;
                const isProviderUrl = PROVIDER_URL_PATTERNS.some(({ patterns }) => patterns.some((p) => p.test(absoluteUrl)));
                if (isProviderUrl) {
                    links.push(absoluteUrl);
                }
            }
            catch {
                // Skip values that cannot be resolved to a URL
            }
        }
        return { links: [...new Set(links)] }; // Deduplicate
    }
    catch (error) {
        if (error.name === 'AbortError') {
            return { links: [], error: 'Timeout' };
        }
        return { links: [], error: error.message };
    }
    finally {
        // Always release the timer, including when fetch() itself rejects.
        clearTimeout(timeoutId);
    }
}
/**
 * Crawl a dispensary's website to find menu provider links.
 *
 * Strategy:
 * 1. Normalize the URL (bare domains such as `example.com` get an `https://` prefix)
 *    and fetch the homepage via fetchPageLinks.
 * 2. If fetchPageLinks reports an embedded Dutchie menu (`dutchie-reactenv:<id>` token),
 *    return it together with the platform dispensary ID.
 * 3. Otherwise return the first homepage link that matches a known provider pattern.
 * 4. Otherwise follow up to 3 same-domain menu/order/shop links (one hop) and apply
 *    checks 2 and 3 to each followed page.
 *
 * @param {string} websiteUrl - The dispensary's website (with or without scheme).
 * @returns {Promise<{menuUrl: string|null, provider: string, foundLinks: string[],
 *   crawledPages: string[], platformDispensaryId?: string, error?: string}>}
 *   `provider` is 'unknown' and `menuUrl` is null when nothing was found; `error` is set
 *   when the URL is invalid or the homepage could not be fetched.
 */
async function crawlWebsiteForMenuLinks(websiteUrl) {
    console.log(`[WebsiteCrawl] Crawling ${websiteUrl} for menu links...`);
    const result = {
        menuUrl: null,
        provider: 'unknown',
        foundLinks: [],
        crawledPages: [],
    };
    // First provider (in table order) with a pattern matching the link, or null.
    const providerForLink = (link) => {
        const entry = PROVIDER_URL_PATTERNS.find(({ patterns }) => patterns.some((p) => p.test(link)));
        return entry ? entry.provider : null;
    };
    // Platform ID carried by a `dutchie-reactenv:<id>` token in the list, or null.
    const reactEnvIdIn = (links) => {
        for (const link of links) {
            const token = /^dutchie-reactenv:(.+)$/.exec(link);
            if (token) {
                return token[1];
            }
        }
        return null;
    };
    // Normalize URL. `new URL('example.com')` throws, so the scheme has to be added
    // before parsing rather than after.
    let baseUrl;
    try {
        const rawUrl = String(websiteUrl || '').trim();
        baseUrl = new URL(/^https?:\/\//i.test(rawUrl) ? rawUrl : `https://${rawUrl}`);
    }
    catch {
        result.error = 'Invalid website URL';
        return result;
    }
    // Step 1: Fetch the homepage
    const homepage = baseUrl.href;
    result.crawledPages.push(homepage);
    const { links: homepageLinks, error: homepageError } = await fetchPageLinks(homepage);
    if (homepageError) {
        result.error = `Failed to fetch homepage: ${homepageError}`;
        return result;
    }
    result.foundLinks = [...homepageLinks];
    // Step 2: Embedded Dutchie menu on the homepage. fetchPageLinks already scans the
    // HTML for reactEnv.dispensaryId and encodes a hit as dutchie-reactenv:<id>, so no
    // second request for the same page is needed.
    const homepageReactEnvId = reactEnvIdIn(homepageLinks);
    if (homepageReactEnvId) {
        result.menuUrl = homepage;
        result.provider = 'dutchie';
        result.platformDispensaryId = homepageReactEnvId;
        console.log(`[WebsiteCrawl] Found reactEnv.dispensaryId=${homepageReactEnvId} on ${homepage}`);
        return result;
    }
    // Step 3: Check for direct provider matches in homepage links
    for (const link of homepageLinks) {
        const provider = providerForLink(link);
        if (provider) {
            console.log(`[WebsiteCrawl] Found ${provider} link on homepage: ${link}`);
            result.menuUrl = link;
            result.provider = provider;
            return result;
        }
    }
    // Step 4: Find same-domain menu/order/shop links to follow. Provider-domain links
    // cannot appear here: any such link would already have been returned by Step 3.
    const menuLinks = homepageLinks.filter((link) => {
        try {
            const { hostname } = new URL(link);
            const isSameDomain = hostname === baseUrl.hostname ||
                hostname.endsWith(`.${baseUrl.hostname}`);
            return isSameDomain && MENU_LINK_PATTERNS.some((p) => p.test(link));
        }
        catch {
            return false;
        }
    });
    console.log(`[WebsiteCrawl] Found ${menuLinks.length} potential menu links to follow`);
    // Follow menu links (limit to 3 to avoid excessive crawling)
    for (const menuLink of menuLinks.slice(0, 3)) {
        // Skip if we've already crawled this page
        if (result.crawledPages.includes(menuLink)) {
            continue;
        }
        result.crawledPages.push(menuLink);
        // Rate limit
        await new Promise((r) => setTimeout(r, 500));
        const { links: pageLinks, error: pageError } = await fetchPageLinks(menuLink);
        if (pageError) {
            console.log(`[WebsiteCrawl] Failed to fetch ${menuLink}: ${pageError}`);
            continue;
        }
        result.foundLinks.push(...pageLinks);
        // Embedded Dutchie menu on the followed page (the token matches no URL pattern,
        // so it has to be checked explicitly).
        const pageReactEnvId = reactEnvIdIn(pageLinks);
        if (pageReactEnvId) {
            result.menuUrl = menuLink;
            result.provider = 'dutchie';
            result.platformDispensaryId = pageReactEnvId;
            console.log(`[WebsiteCrawl] Found reactEnv.dispensaryId=${pageReactEnvId} on ${menuLink}`);
            return result;
        }
        // Check for provider matches on this page
        for (const link of pageLinks) {
            const provider = providerForLink(link);
            if (provider) {
                console.log(`[WebsiteCrawl] Found ${provider} link on ${menuLink}: ${link}`);
                result.menuUrl = link;
                result.provider = provider;
                return result;
            }
        }
    }
    console.log(`[WebsiteCrawl] No menu provider found on ${websiteUrl}`);
    return result;
}
// ============================================================
// CORE DETECTION FUNCTIONS
// ============================================================
/**
 * Classify a menu URL by provider.
 *
 * @param {string|null|undefined} menuUrl - URL to classify.
 * @returns {string} The first provider in PROVIDER_URL_PATTERNS with a matching pattern;
 *   otherwise 'custom' when the URL parses and its hostname does not contain 'localhost';
 *   otherwise 'unknown' (also for empty input).
 */
function detectProviderFromUrl(menuUrl) {
    if (!menuUrl) {
        return 'unknown';
    }
    const knownEntry = PROVIDER_URL_PATTERNS.find(({ patterns }) => patterns.some((re) => re.test(menuUrl)));
    if (knownEntry) {
        return knownEntry.provider;
    }
    // No known provider: a real (non-localhost) host means a custom website.
    let hostname = '';
    try {
        hostname = new URL(menuUrl).hostname;
    }
    catch {
        // Not a parseable URL - falls through to 'unknown'
    }
    return hostname && !hostname.includes('localhost') ? 'custom' : 'unknown';
}
/**
 * Detect the menu provider for a single dispensary and, for Dutchie stores, resolve
 * its platform_dispensary_id. Every outcome is written back to the `dispensaries` row
 * (menu_type, optionally menu_url / platform_dispensary_id, and a merged
 * provider_detection_data JSON blob).
 *
 * Flow:
 * 1. Load the dispensary. If it has no menu_url, crawl its website for one; if it has
 *    no website either, or the crawl finds nothing, mark it not crawlable and return.
 *    When the crawl read a Dutchie platform ID directly from the page, store it and return.
 * 2. Classify the menu URL. Non-Dutchie providers only get menu_type updated.
 * 3. For Dutchie: take the platform ID straight from the URL when present, otherwise
 *    resolve the cName via GraphQL, falling back to a website crawl when that fails.
 *
 * @param {number|string} dispensaryId - Primary key of the dispensaries row.
 * @returns {Promise<{dispensaryId: *, dispensaryName: string, previousMenuType: string|null,
 *   detectedProvider: string, cName: string|null, platformDispensaryId: string|null,
 *   success: boolean, error?: string}>} Note that `success` is true for "marked as not
 *   crawlable" outcomes even though `error` is populated in those cases.
 * @throws Propagates database errors from the query helper (GraphQL resolution errors
 *   are caught and recorded instead).
 */
async function detectAndResolveDispensary(dispensaryId) {
    console.log(`[MenuDetection] Processing dispensary ${dispensaryId}...`);
    // Get dispensary record
    const { rows } = await (0, connection_1.query)(`SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE id = $1`, [dispensaryId]);
    if (rows.length === 0) {
        return {
            dispensaryId,
            dispensaryName: 'Unknown',
            previousMenuType: null,
            detectedProvider: 'unknown',
            cName: null,
            platformDispensaryId: null,
            success: false,
            error: 'Dispensary not found',
        };
    }
    const dispensary = (0, discovery_1.mapDbRowToDispensary)(rows[0]);
    let menuUrl = dispensary.menuUrl;
    const previousMenuType = dispensary.menuType || null;
    const website = dispensary.website;
    // If menu_url is null or empty, try to discover it by crawling the dispensary website
    if (!menuUrl || menuUrl.trim() === '') {
        console.log(`[MenuDetection] ${dispensary.name}: No menu_url - attempting website crawl`);
        // Check if website is available
        if (!website || website.trim() === '') {
            console.log(`[MenuDetection] ${dispensary.name}: No website available - marking as not crawlable`);
            await (0, connection_1.query)(`
                UPDATE dispensaries SET
                    menu_type = 'unknown',
                    provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
                        jsonb_build_object(
                            'detected_provider', 'unknown'::text,
                            'detection_method', 'no_data'::text,
                            'detected_at', NOW(),
                            'resolution_error', 'No menu_url and no website available'::text,
                            'not_crawlable', true,
                            'website_crawl_attempted', false
                        ),
                    updated_at = NOW()
                WHERE id = $1
            `, [dispensaryId]);
            return {
                dispensaryId,
                dispensaryName: dispensary.name,
                previousMenuType,
                detectedProvider: 'unknown',
                cName: null,
                platformDispensaryId: null,
                success: true,
                error: 'No menu_url and no website available - marked as not crawlable',
            };
        }
        // Crawl the website to find menu provider links
        console.log(`[MenuDetection] ${dispensary.name}: Crawling website ${website} for menu links...`);
        const crawlResult = await crawlWebsiteForMenuLinks(website);
        if (crawlResult.menuUrl && crawlResult.provider !== 'unknown') {
            // SUCCESS: Found a menu URL from website crawl!
            console.log(`[MenuDetection] ${dispensary.name}: Found ${crawlResult.provider} menu at ${crawlResult.menuUrl}`);
            menuUrl = crawlResult.menuUrl;
            if (crawlResult.provider === 'dutchie' && crawlResult.platformDispensaryId) {
                // The crawl read the platform ID directly from the page (embedded menu).
                // In that case menuUrl is the dispensary's own page, which need not match
                // any Dutchie URL pattern - running it through URL-based detection below
                // would reclassify the store and discard the ID. Persist it here instead.
                const platformId = crawlResult.platformDispensaryId;
                await (0, connection_1.query)(`
                    UPDATE dispensaries SET
                        menu_url = $1,
                        menu_type = 'dutchie',
                        platform_dispensary_id = $2,
                        provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
                            jsonb_build_object(
                                'detected_provider', 'dutchie'::text,
                                'detection_method', 'website_crawl'::text,
                                'detected_at', NOW(),
                                'website_crawled', $3::text,
                                'website_crawl_pages', $4::jsonb,
                                'platform_id_source', 'website_reactenv'::text,
                                'platform_id_resolved', true,
                                'platform_id_resolved_at', NOW(),
                                'resolution_error', NULL::text,
                                'not_crawlable', false
                            ),
                        updated_at = NOW()
                    WHERE id = $5
                `, [
                    crawlResult.menuUrl,
                    platformId,
                    website,
                    JSON.stringify(crawlResult.crawledPages),
                    dispensaryId
                ]);
                console.log(`[MenuDetection] ${dispensary.name}: Platform ID found on website = ${platformId}`);
                return {
                    dispensaryId,
                    dispensaryName: dispensary.name,
                    previousMenuType,
                    detectedProvider: 'dutchie',
                    cName: null,
                    platformDispensaryId: platformId,
                    success: true,
                };
            }
            // Update the dispensary with the discovered menu_url
            await (0, connection_1.query)(`
                UPDATE dispensaries SET
                    menu_url = $1,
                    menu_type = $2,
                    provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
                        jsonb_build_object(
                            'detected_provider', $2::text,
                            'detection_method', 'website_crawl'::text,
                            'detected_at', NOW(),
                            'website_crawled', $3::text,
                            'website_crawl_pages', $4::jsonb,
                            'not_crawlable', false
                        ),
                    updated_at = NOW()
                WHERE id = $5
            `, [
                crawlResult.menuUrl,
                crawlResult.provider,
                website,
                JSON.stringify(crawlResult.crawledPages),
                dispensaryId
            ]);
            // Continue with full detection flow using the discovered menu_url
        }
        else {
            // Website crawl failed to find a menu provider
            const errorReason = crawlResult.error || 'No menu provider links found on website';
            console.log(`[MenuDetection] ${dispensary.name}: Website crawl failed - ${errorReason}`);
            await (0, connection_1.query)(`
                UPDATE dispensaries SET
                    menu_type = 'unknown',
                    provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
                        jsonb_build_object(
                            'detected_provider', 'unknown'::text,
                            'detection_method', 'website_crawl'::text,
                            'detected_at', NOW(),
                            'website_crawled', $1::text,
                            'website_crawl_pages', $2::jsonb,
                            'resolution_error', $3::text,
                            'not_crawlable', true
                        ),
                    updated_at = NOW()
                WHERE id = $4
            `, [
                website,
                JSON.stringify(crawlResult.crawledPages),
                errorReason,
                dispensaryId
            ]);
            return {
                dispensaryId,
                dispensaryName: dispensary.name,
                previousMenuType,
                detectedProvider: 'unknown',
                cName: null,
                platformDispensaryId: null,
                success: true,
                error: `Website crawl failed: ${errorReason}`,
            };
        }
    }
    // Detect provider from URL
    const detectedProvider = detectProviderFromUrl(menuUrl);
    console.log(`[MenuDetection] ${dispensary.name}: Detected provider = ${detectedProvider} from URL: ${menuUrl}`);
    // Initialize result
    const result = {
        dispensaryId,
        dispensaryName: dispensary.name,
        previousMenuType,
        detectedProvider,
        cName: null,
        platformDispensaryId: null,
        success: false,
    };
    // If not dutchie, just update menu_type (non-dutchie providers)
    // Note: curaleaf.com and livewithsol.com are detected directly as 'dutchie' via PROVIDER_URL_PATTERNS
    if (detectedProvider !== 'dutchie') {
        await (0, connection_1.query)(`
            UPDATE dispensaries SET
                menu_type = $1,
                provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
                    jsonb_build_object(
                        'detected_provider', $1::text,
                        'detection_method', 'url_pattern'::text,
                        'detected_at', NOW(),
                        'not_crawlable', false
                    ),
                updated_at = NOW()
            WHERE id = $2
        `, [detectedProvider, dispensaryId]);
        result.success = true;
        console.log(`[MenuDetection] ${dispensary.name}: Updated menu_type to ${detectedProvider}`);
        return result;
    }
    // For dutchie: extract cName or platformId from menu_url
    const extraction = (0, discovery_1.extractFromMenuUrl)(menuUrl);
    if (!extraction) {
        result.error = `Could not extract cName or platformId from menu_url: ${menuUrl}`;
        await (0, connection_1.query)(`
            UPDATE dispensaries SET
                menu_type = 'dutchie',
                provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
                    jsonb_build_object(
                        'detected_provider', 'dutchie'::text,
                        'detection_method', 'url_pattern'::text,
                        'detected_at', NOW(),
                        'resolution_error', $1::text,
                        'not_crawlable', true
                    ),
                updated_at = NOW()
            WHERE id = $2
        `, [result.error, dispensaryId]);
        console.log(`[MenuDetection] ${dispensary.name}: ${result.error}`);
        return result;
    }
    // If URL contains platform_dispensary_id directly (e.g., /api/v2/embedded-menu/<id>.js), skip GraphQL resolution
    if (extraction.type === 'platformId') {
        const platformId = extraction.value;
        result.platformDispensaryId = platformId;
        result.success = true;
        await (0, connection_1.query)(`
            UPDATE dispensaries SET
                menu_type = 'dutchie',
                platform_dispensary_id = $1,
                provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
                    jsonb_build_object(
                        'detected_provider', 'dutchie'::text,
                        'detection_method', 'url_direct_platform_id'::text,
                        'detected_at', NOW(),
                        'platform_id_source', 'url_embedded'::text,
                        'platform_id_resolved', true,
                        'platform_id_resolved_at', NOW(),
                        'resolution_error', NULL::text,
                        'not_crawlable', false
                    ),
                updated_at = NOW()
            WHERE id = $2
        `, [platformId, dispensaryId]);
        console.log(`[MenuDetection] ${dispensary.name}: Platform ID extracted directly from URL = ${platformId}`);
        return result;
    }
    // Otherwise, we have a cName that needs GraphQL resolution
    const cName = extraction.value;
    result.cName = cName;
    // Resolve platform_dispensary_id from cName
    console.log(`[MenuDetection] ${dispensary.name}: Resolving platform ID for cName = ${cName}`);
    try {
        const platformId = await (0, graphql_client_1.resolveDispensaryId)(cName);
        if (platformId) {
            result.platformDispensaryId = platformId;
            result.success = true;
            await (0, connection_1.query)(`
                UPDATE dispensaries SET
                    menu_type = 'dutchie',
                    platform_dispensary_id = $1,
                    provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
                        jsonb_build_object(
                            'detected_provider', 'dutchie'::text,
                            'detection_method', 'url_pattern'::text,
                            'detected_at', NOW(),
                            'cname_extracted', $2::text,
                            'platform_id_resolved', true,
                            'platform_id_resolved_at', NOW(),
                            'resolution_error', NULL::text,
                            'not_crawlable', false
                        ),
                    updated_at = NOW()
                WHERE id = $3
            `, [platformId, cName, dispensaryId]);
            console.log(`[MenuDetection] ${dispensary.name}: Resolved platform ID = ${platformId}`);
        }
        else {
            // cName resolution failed - try crawling website as fallback
            console.log(`[MenuDetection] ${dispensary.name}: cName "${cName}" not found on Dutchie, trying website crawl fallback...`);
            if (website && website.trim() !== '') {
                const fallbackCrawl = await crawlWebsiteForMenuLinks(website);
                if (fallbackCrawl.menuUrl && fallbackCrawl.provider === 'dutchie') {
                    // Found Dutchie menu via website crawl!
                    console.log(`[MenuDetection] ${dispensary.name}: Found Dutchie menu via website crawl: ${fallbackCrawl.menuUrl}`);
                    // Prefer a platform ID the crawl read directly from the page; only
                    // when there is none, try to derive one from the discovered menu URL.
                    let fallbackPlatformId = fallbackCrawl.platformDispensaryId || null;
                    let fallbackCName = null;
                    if (!fallbackPlatformId) {
                        const newExtraction = (0, discovery_1.extractFromMenuUrl)(fallbackCrawl.menuUrl);
                        if (newExtraction) {
                            fallbackCName = newExtraction.value;
                            fallbackPlatformId = newExtraction.type === 'platformId'
                                ? newExtraction.value
                                // Try to resolve the new cName
                                : await (0, graphql_client_1.resolveDispensaryId)(newExtraction.value);
                        }
                    }
                    if (fallbackPlatformId) {
                        result.platformDispensaryId = fallbackPlatformId;
                        result.success = true;
                        if (fallbackCName) {
                            result.cName = fallbackCName;
                        }
                        await (0, connection_1.query)(`
                            UPDATE dispensaries SET
                                menu_type = 'dutchie',
                                menu_url = $1,
                                platform_dispensary_id = $2,
                                provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
                                    jsonb_build_object(
                                        'detected_provider', 'dutchie'::text,
                                        'detection_method', 'website_crawl_fallback'::text,
                                        'detected_at', NOW(),
                                        'original_cname', $3::text,
                                        'fallback_cname', $4::text,
                                        'website_crawled', $5::text,
                                        'platform_id_resolved', true,
                                        'platform_id_resolved_at', NOW(),
                                        'not_crawlable', false
                                    ),
                                updated_at = NOW()
                            WHERE id = $6
                        `, [fallbackCrawl.menuUrl, fallbackPlatformId, cName, fallbackCName, website, dispensaryId]);
                        console.log(`[MenuDetection] ${dispensary.name}: Resolved via website crawl, platform ID = ${fallbackPlatformId}`);
                        return result;
                    }
                }
            }
            // Website crawl fallback didn't work either
            result.error = `cName "${cName}" could not be resolved - may not exist on Dutchie`;
            await (0, connection_1.query)(`
                UPDATE dispensaries SET
                    menu_type = 'dutchie',
                    platform_dispensary_id = NULL,
                    provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
                        jsonb_build_object(
                            'detected_provider', 'dutchie'::text,
                            'detection_method', 'url_pattern'::text,
                            'detected_at', NOW(),
                            'cname_extracted', $1::text,
                            'platform_id_resolved', false,
                            'resolution_error', $2::text,
                            'website_crawl_attempted', true,
                            'not_crawlable', true
                        ),
                    updated_at = NOW()
                WHERE id = $3
            `, [cName, result.error, dispensaryId]);
            console.log(`[MenuDetection] ${dispensary.name}: ${result.error}`);
        }
    }
    catch (error) {
        // NOTE(review): any exception (including transient network errors) marks the
        // store not_crawlable - confirm this is intended.
        result.error = `Resolution failed: ${error.message}`;
        await (0, connection_1.query)(`
            UPDATE dispensaries SET
                menu_type = 'dutchie',
                provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
                    jsonb_build_object(
                        'detected_provider', 'dutchie'::text,
                        'detection_method', 'url_pattern'::text,
                        'detected_at', NOW(),
                        'cname_extracted', $1::text,
                        'platform_id_resolved', false,
                        'resolution_error', $2::text,
                        'not_crawlable', true
                    ),
                updated_at = NOW()
            WHERE id = $3
        `, [cName, result.error, dispensaryId]);
        console.error(`[MenuDetection] ${dispensary.name}: ${result.error}`);
    }
    return result;
}
/**
 * Run detection over every dispensary that needs it and aggregate the outcomes.
 *
 * Candidate rows are those with a menu_url, plus (optionally) rows with no menu_url
 * but a website that has not been marked not_crawlable, plus (optionally) Dutchie rows
 * missing a platform ID. The `only*` flags then narrow that set. Rows are processed
 * sequentially, ordered by name, with a 1 second pause after each successful call.
 *
 * @param {object} [options]
 * @param {string} [options.state] - Restrict to this state; all states when omitted.
 * @param {boolean} [options.onlyUnknown=true] - Only rows whose menu_type is NULL, '' or 'unknown'.
 * @param {boolean} [options.onlyMissingPlatformId=false] - Only Dutchie rows without a platform ID
 *   (combined with onlyUnknown as an OR).
 * @param {boolean} [options.includeWebsiteCrawl=true] - Include website-only rows.
 * @param {boolean} [options.includeDutchieMissingPlatformId=true] - Include Dutchie rows missing a platform ID.
 * @param {number} [options.limit] - Maximum number of rows to process.
 * @returns {Promise<{totalProcessed: number, totalSucceeded: number, totalFailed: number,
 *   totalSkipped: number, results: object[], errors: string[]}>}
 * @throws Propagates errors from the candidate-selection query; per-dispensary errors
 *   are caught and counted as failures.
 */
async function runBulkDetection(options = {}) {
    const { state, onlyUnknown = true, onlyMissingPlatformId = false, includeWebsiteCrawl = true, includeDutchieMissingPlatformId = true, limit, } = options;
    console.log('[MenuDetection] Starting bulk detection...');
    // Build query to find dispensaries needing detection.
    const candidateClauses = ['menu_url IS NOT NULL'];
    if (includeWebsiteCrawl) {
        // IS NOT TRUE (rather than NOT ...) keeps rows whose detection data is NULL
        // or simply has no 'not_crawlable' key: in both cases the cast yields NULL,
        // and NOT NULL would evaluate to NULL and silently drop the row.
        candidateClauses.push(`(
            menu_url IS NULL
            AND website IS NOT NULL
            AND website != ''
            AND (provider_detection_data->>'not_crawlable')::boolean IS NOT TRUE
        )`);
    }
    if (includeDutchieMissingPlatformId) {
        candidateClauses.push(`(
            menu_type = 'dutchie' AND platform_dispensary_id IS NULL
        )`);
    }
    let whereClause = `WHERE (
        ${candidateClauses.join('\n        OR ')}
    )`;
    const params = [];
    let paramIndex = 1;
    if (state) {
        whereClause += ` AND state = $${paramIndex++}`;
        params.push(state);
    }
    // Handle filters for unknown and/or missing platform IDs
    if (onlyUnknown && onlyMissingPlatformId) {
        whereClause += ` AND (
            (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown')
            OR (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)
        )`;
    }
    else if (onlyUnknown) {
        whereClause += ` AND (
            (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown')
            ${includeDutchieMissingPlatformId ? `OR (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)` : ''}
        )`;
    }
    else if (onlyMissingPlatformId) {
        whereClause += ` AND (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)`;
    }
    else if (includeDutchieMissingPlatformId) {
        // Always attempt to resolve dutchie stores missing platform IDs
        whereClause += ` AND (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)`;
    }
    let query_str = `
        SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
        ${whereClause}
        ORDER BY name
    `;
    if (limit) {
        query_str += ` LIMIT $${paramIndex}`;
        params.push(limit);
    }
    const { rows: dispensaries } = await (0, connection_1.query)(query_str, params);
    console.log(`[MenuDetection] Found ${dispensaries.length} dispensaries to process (includeWebsiteCrawl=${includeWebsiteCrawl})`);
    const result = {
        totalProcessed: 0,
        totalSucceeded: 0,
        totalFailed: 0,
        totalSkipped: 0,
        results: [],
        errors: [],
    };
    for (const row of dispensaries) {
        result.totalProcessed++;
        try {
            const detectionResult = await detectAndResolveDispensary(row.id);
            result.results.push(detectionResult);
            if (detectionResult.success) {
                result.totalSucceeded++;
            }
            else {
                result.totalFailed++;
                if (detectionResult.error) {
                    result.errors.push(`${detectionResult.dispensaryName}: ${detectionResult.error}`);
                }
            }
            // Rate limit between requests
            await new Promise((r) => setTimeout(r, 1000));
        }
        catch (error) {
            result.totalFailed++;
            result.errors.push(`${row.name || row.id}: ${error.message}`);
        }
    }
    console.log(`[MenuDetection] Bulk detection complete: ${result.totalSucceeded} succeeded, ${result.totalFailed} failed`);
    return result;
}
// ============================================================
// SCHEDULED JOB EXECUTOR
// ============================================================
/**
 * Scheduler entry point: run bulk detection for one state and summarize the run.
 *
 * @param {object} [config]
 * @param {string} [config.state='AZ'] - State to process.
 * @param {boolean} [config.onlyUnknown] - Treated as true unless explicitly `false`.
 * @param {boolean} [config.onlyMissingPlatformId] - Treated as true unless explicitly `false`.
 * @param {boolean} [config.includeDutchieMissingPlatformId] - Treated as true unless explicitly `false`.
 * @returns {Promise<object>} `{ status, itemsProcessed, itemsSucceeded, itemsFailed,
 *   errorMessage, metadata }` where status is 'success' (no failures), 'error'
 *   (failures and no successes, or the run itself threw) or 'partial'. `errorMessage`
 *   joins at most the first five per-dispensary errors.
 */
async function executeMenuDetectionJob(config = {}) {
    // A flag stays enabled unless the caller turned it off with a literal false.
    const enabledUnlessFalse = (flag) => flag !== false;
    const state = config.state || 'AZ';
    const onlyUnknown = enabledUnlessFalse(config.onlyUnknown);
    // Default to true - always try to resolve platform IDs for dutchie stores
    const onlyMissingPlatformId = enabledUnlessFalse(config.onlyMissingPlatformId);
    const includeDutchieMissingPlatformId = enabledUnlessFalse(config.includeDutchieMissingPlatformId);
    console.log(`[MenuDetection] Executing scheduled job for state=${state}...`);
    try {
        const bulk = await runBulkDetection({
            state,
            onlyUnknown,
            onlyMissingPlatformId,
            includeDutchieMissingPlatformId,
        });
        let status = 'partial';
        if (bulk.totalFailed === 0) {
            status = 'success';
        }
        else if (bulk.totalSucceeded === 0) {
            status = 'error';
        }
        const hasErrors = bulk.errors.length > 0;
        return {
            status,
            itemsProcessed: bulk.totalProcessed,
            itemsSucceeded: bulk.totalSucceeded,
            itemsFailed: bulk.totalFailed,
            errorMessage: hasErrors ? bulk.errors.slice(0, 5).join('; ') : undefined,
            metadata: {
                state,
                onlyUnknown,
                onlyMissingPlatformId,
                providerCounts: countByProvider(bulk.results),
            },
        };
    }
    catch (error) {
        return {
            status: 'error',
            itemsProcessed: 0,
            itemsSucceeded: 0,
            itemsFailed: 0,
            errorMessage: error.message,
        };
    }
}
/**
 * Tally detection results by their `detectedProvider` value.
 *
 * @param {Array<{detectedProvider: string}>} results - Per-dispensary detection results.
 * @returns {Object<string, number>} Map of provider name to number of results.
 */
function countByProvider(results) {
    return results.reduce((tally, { detectedProvider }) => {
        tally[detectedProvider] = (tally[detectedProvider] || 0) + 1;
        return tally;
    }, {});
}
// ============================================================
// UTILITY FUNCTIONS
// ============================================================
/**
 * Get detection stats for the dashboard.
 *
 * @param {object} [options]
 * @param {string} [options.state='AZ'] - State whose dispensaries are counted. Defaults
 *   to 'AZ', which is what this function reported before the option existed.
 * @returns {Promise<{totalDispensaries: number, withMenuType: number, withPlatformId: number,
 *   needsDetection: number, byProvider: Object<string, number>}>} `needsDetection` counts
 *   rows that have a menu_url but a NULL/empty/'unknown' menu_type; `byProvider` maps each
 *   non-empty menu_type to its row count.
 * @throws Propagates errors from the query helper.
 */
async function getDetectionStats(options = {}) {
    const state = (options && options.state) || 'AZ';
    const { rows } = await (0, connection_1.query)(`
        SELECT
            COUNT(*) as total,
            COUNT(*) FILTER (WHERE menu_type IS NOT NULL AND menu_type != '' AND menu_type != 'unknown') as with_menu_type,
            COUNT(*) FILTER (WHERE platform_dispensary_id IS NOT NULL) as with_platform_id,
            COUNT(*) FILTER (WHERE menu_url IS NOT NULL AND (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown')) as needs_detection
        FROM dispensaries
        WHERE state = $1
    `, [state]);
    const stats = rows[0] || {};
    // Get provider breakdown
    const { rows: providerRows } = await (0, connection_1.query)(`
        SELECT menu_type, COUNT(*) as count
        FROM dispensaries
        WHERE state = $1 AND menu_type IS NOT NULL AND menu_type != ''
        GROUP BY menu_type
        ORDER BY count DESC
    `, [state]);
    const byProvider = {};
    for (const row of providerRows) {
        byProvider[row.menu_type] = parseInt(row.count, 10);
    }
    // Counts are parsed from strings here; a missing column falls back to '0'.
    return {
        totalDispensaries: parseInt(stats.total || '0', 10),
        withMenuType: parseInt(stats.with_menu_type || '0', 10),
        withPlatformId: parseInt(stats.with_platform_id || '0', 10),
        needsDetection: parseInt(stats.needs_detection || '0', 10),
        byProvider,
    };
}
/**
 * List dispensaries that still need provider detection, ordered by name.
 *
 * A row qualifies when it has a menu_url and either an unknown menu_type
 * (NULL, '' or 'unknown') or is a Dutchie store without a platform ID. With
 * `includeWebsiteCrawl`, rows with no menu_url but a website that has not been
 * marked not_crawlable qualify too (for website crawl discovery).
 *
 * @param {object} [options]
 * @param {string} [options.state='AZ'] - State to list.
 * @param {number} [options.limit=100] - Maximum number of rows.
 * @param {boolean} [options.includeWebsiteCrawl=true] - Include website-only rows.
 * @returns {Promise<object[]>} Rows mapped through mapDbRowToDispensary.
 * @throws Propagates errors from the query helper.
 */
async function getDispensariesNeedingDetection(options = {}) {
    const { state = 'AZ', limit = 100, includeWebsiteCrawl = true } = options;
    // IS NOT TRUE (rather than NOT ...) keeps rows whose detection data is NULL or
    // has no 'not_crawlable' key: the cast yields NULL in both cases, and NOT NULL
    // would evaluate to NULL and silently drop the row.
    const websiteCrawlClause = includeWebsiteCrawl ? `OR (
            menu_url IS NULL
            AND website IS NOT NULL
            AND website != ''
            AND (provider_detection_data->>'not_crawlable')::boolean IS NOT TRUE
        )` : '';
    const { rows } = await (0, connection_1.query)(`
        SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
        WHERE state = $1
        AND (
            (menu_url IS NOT NULL AND (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown'
                OR (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)))
            ${websiteCrawlClause}
        )
        ORDER BY name
        LIMIT $2
    `, [state, limit]);
    return rows.map(discovery_1.mapDbRowToDispensary);
}