Files
cannaiq/backend/src/dutchie-az/services/menu-detection.ts

1141 lines
35 KiB
TypeScript

/**
* Menu Detection Service
*
* Detects menu provider (dutchie, treez, jane, etc.) from dispensary menu_url
* and resolves platform_dispensary_id for dutchie stores.
*
* This service:
* 1. Iterates dispensaries with unknown/missing menu_type or platform_dispensary_id
* 2. Detects provider from menu_url patterns
* 3. For dutchie: extracts cName and resolves platform_dispensary_id via GraphQL
* 4. Logs results to job_run_logs
*/
import { query } from '../db/connection';
import { extractCNameFromMenuUrl, extractFromMenuUrl, mapDbRowToDispensary } from './discovery';
import { resolveDispensaryId } from './graphql-client';
import { Dispensary, JobStatus } from '../types';
// Explicit column list for the dispensaries table (avoids SELECT * issues with schema differences).
// This fragment is interpolated verbatim into the SELECT statements in this file,
// so it must stay a comma-separated list of plain column names.
const DISPENSARY_COLUMNS = `
id, name, slug, city, state, zip, address, latitude, longitude,
menu_type, menu_url, platform_dispensary_id, website,
provider_detection_data, created_at, updated_at
`;
// ============================================================
// TYPES
// ============================================================
/**
 * Identifier of the menu platform a dispensary uses.
 *
 * `custom` is reported for a URL that has a real hostname but matches no known
 * provider pattern; `unknown` is reported when nothing could be determined.
 */
export type MenuProvider =
  // Known third-party menu platforms
  | 'dutchie' | 'treez' | 'jane' | 'iheartjane'
  | 'weedmaps' | 'leafly' | 'meadow' | 'blaze'
  | 'flowhub' | 'dispense'
  // Fallback classifications
  | 'custom' | 'unknown';
/**
 * Outcome of running provider detection for one dispensary.
 */
export interface DetectionResult {
  /** Primary key of the dispensary row that was processed. */
  dispensaryId: number;
  /** Dispensary name ('Unknown' when the row was not found). */
  dispensaryName: string;
  /** menu_type stored on the row before this run, or null if none. */
  previousMenuType: string | null;
  /** Provider determined by this run. */
  detectedProvider: MenuProvider;
  /** cName extracted from a dutchie menu URL, when one was extracted. */
  cName: string | null;
  /** Resolved dutchie platform dispensary ID, when available. */
  platformDispensaryId: string | null;
  /** Whether the run completed for this dispensary (may be true even if `error` is set). */
  success: boolean;
  /** Human-readable reason for a failure or for a "not crawlable" outcome. */
  error?: string;
}
/**
 * Aggregate outcome of a bulk detection run.
 */
export interface BulkDetectionResult {
  /** Number of dispensaries the run attempted. */
  totalProcessed: number;
  /** Number of dispensaries whose detection result reported success. */
  totalSucceeded: number;
  /** Number of dispensaries whose detection failed or threw. */
  totalFailed: number;
  /** Number of dispensaries skipped. */
  totalSkipped: number;
  /** Per-dispensary results, in processing order. */
  results: DetectionResult[];
  /** Collected error messages, each prefixed with the dispensary name or id. */
  errors: string[];
}
// ============================================================
// PROVIDER DETECTION PATTERNS
// ============================================================
/**
 * URL patterns used to recognise each menu provider.
 *
 * We detect provider based on the actual menu link we find, not just the site domain.
 * Entries are evaluated in array order and the first provider with a matching
 * pattern wins, so the order of this table is significant.
 */
const PROVIDER_URL_PATTERNS: Array<{ provider: MenuProvider; patterns: RegExp[] }> = [
  {
    provider: 'dutchie',
    // The third pattern matches paths such as /dispensary/AZ-store-name
    patterns: [/dutchie\.com/i, /\/embedded-menu\//i, /\/dispensary\/[A-Z]{2}-/i, /dutchie-plus/i],
  },
  { provider: 'treez', patterns: [/treez\.io/i, /shop\.treez/i, /treez-ecommerce/i] },
  { provider: 'jane', patterns: [/jane\.co/i, /iheartjane\.com/i, /embed\.iheartjane/i] },
  { provider: 'weedmaps', patterns: [/weedmaps\.com/i, /menu\.weedmaps/i] },
  { provider: 'leafly', patterns: [/leafly\.com/i, /order\.leafly/i] },
  { provider: 'meadow', patterns: [/getmeadow\.com/i, /meadow\.co/i] },
  { provider: 'blaze', patterns: [/blaze\.me/i, /blazepos\.com/i] },
  { provider: 'flowhub', patterns: [/flowhub\.com/i, /flowhub\.co/i] },
  { provider: 'dispense', patterns: [/dispense\.io/i, /dispenseapp\.com/i] },
];
// ============================================================
// WEBSITE CRAWL FUNCTIONS
// ============================================================
/**
 * What a website crawl discovered while looking for menu links.
 */
export interface WebsiteCrawlResult {
  /** URL of the menu that was found, or null when none was found. */
  menuUrl: string | null;
  /** Provider the menu belongs to ('unknown' when nothing was found). */
  provider: MenuProvider;
  /** Links collected from the pages that were fetched. */
  foundLinks: string[];
  /** URLs of the pages the crawl visited, in visiting order. */
  crawledPages: string[];
  /** Dutchie dispensary ID, when it could be read directly from the homepage HTML. */
  platformDispensaryId?: string | null;
  /** Reason the crawl could not run or complete, if any. */
  error?: string;
}
/**
 * Patterns marking a link as a likely menu or ordering page.
 * A link qualifies when any one of these patterns matches it.
 */
const MENU_LINK_PATTERNS = [
  // Generic path segments that usually lead to a menu
  /\/menu/i, /\/order/i, /\/shop/i, /\/products/i, /\/dispensary/i, /\/store/i,
  // Domains of chains / menu platforms
  /curaleaf\.com/i, /dutchie\.com/i, /treez\.io/i, /jane\.co/i, /iheartjane\.com/i,
  /weedmaps\.com/i, /leafly\.com/i, /getmeadow\.com/i, /blaze\.me/i, /flowhub\.com/i,
  /dispense\.io/i,
];
/**
 * Tell whether a URL points at a Curaleaf store page, i.e. contains
 * `curaleaf.com/stores/` or `curaleaf.com/dispensary/` (case-insensitive).
 *
 * @param url URL to inspect; null, undefined and '' are treated as "not Curaleaf".
 * @returns true when the URL matches the Curaleaf store pattern.
 */
function isCuraleafUrl(url: string | null | undefined): boolean {
  const curaleafStorePath = /curaleaf\.com\/(stores|dispensary)\//i;
  return url ? curaleafStorePath.test(url) : false;
}
/**
 * Fetch a page and extract candidate links from its HTML.
 *
 * Collected links are:
 * - every `href="..."` attribute value, resolved against `url`;
 * - every `src="..."` attribute value (e.g. iframes with embedded menus), but only
 *   when the resolved URL matches one of PROVIDER_URL_PATTERNS.
 *
 * The timeout covers both the request and the reading of the response body, and
 * the timer is always released, whether the request succeeds or fails.
 *
 * @param url Absolute URL of the page to fetch; also the base for relative links.
 * @param timeout Milliseconds before the request is aborted (default 10000).
 * @returns De-duplicated absolute links in first-seen order. On failure `links` is
 *          empty and `error` is `HTTP <status>`, `Timeout`, or the error message.
 */
async function fetchPageLinks(url: string, timeout: number = 10000): Promise<{ links: string[]; error?: string }> {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeout);
  try {
    const response = await fetch(url, {
      signal: controller.signal,
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      },
      redirect: 'follow',
    });
    if (!response.ok) {
      return { links: [], error: `HTTP ${response.status}` };
    }
    // Still guarded by the abort timer: a stalled body must not hang the crawl.
    const html = await response.text();

    // Resolve a possibly-relative reference; invalid references are skipped.
    const toAbsolute = (ref: string): string | null => {
      try {
        return new URL(ref, url).href;
      } catch {
        return null;
      }
    };

    // A Set keeps first-seen order while removing duplicates.
    const links = new Set<string>();
    let match: RegExpExecArray | null;

    const hrefRegex = /href=["']([^"']+)["']/gi;
    while ((match = hrefRegex.exec(html)) !== null) {
      const absoluteUrl = toAbsolute(match[1]);
      if (absoluteUrl) links.add(absoluteUrl);
    }

    // src attributes are noisy (scripts, images), so keep only provider URLs.
    const srcRegex = /src=["']([^"']+)["']/gi;
    while ((match = srcRegex.exec(html)) !== null) {
      const absoluteUrl = toAbsolute(match[1]);
      if (!absoluteUrl) continue;
      const isProviderUrl = PROVIDER_URL_PATTERNS.some(({ patterns }) =>
        patterns.some(p => p.test(absoluteUrl))
      );
      if (isProviderUrl) links.add(absoluteUrl);
    }

    return { links: [...links] };
  } catch (error: unknown) {
    // Abort errors are identified by name; they are not guaranteed to be plain Error instances.
    const errorName = (error as { name?: unknown } | null | undefined)?.name;
    if (errorName === 'AbortError') {
      return { links: [], error: 'Timeout' };
    }
    return { links: [], error: error instanceof Error ? error.message : String(error) };
  } finally {
    clearTimeout(timeoutId);
  }
}
/**
 * Turn a website value into a URL, adding `https://` when it has no http(s) scheme.
 * Returns null when the value cannot be parsed into a URL with a hostname.
 */
function parseWebsiteUrl(websiteUrl: string): URL | null {
  const trimmed = websiteUrl.trim();
  if (trimmed === '') return null;
  // `new URL('example.com')` throws, so the scheme must be added BEFORE parsing.
  const candidate = /^https?:\/\//i.test(trimmed) ? trimmed : `https://${trimmed}`;
  try {
    const parsed = new URL(candidate);
    return parsed.hostname ? parsed : null;
  } catch {
    return null;
  }
}

/**
 * Crawl a dispensary's website to find menu provider links.
 *
 * Strategy:
 * 1. Fetch the homepage and extract all links.
 * 2. Look in the homepage HTML for `window.reactEnv = { ... "dispensaryId": "<hex>" }`
 *    (embedded Dutchie menu). If present, report dutchie with that platform ID.
 * 3. Look for homepage links that match a known provider pattern.
 * 4. Otherwise pick same-domain links that look like menu/order/shop pages and
 *    follow up to 3 of them, checking each followed page for provider links.
 *
 * @param websiteUrl Website of the dispensary; a missing http(s) scheme defaults to https.
 * @returns Crawl result. `provider` stays 'unknown' and `menuUrl` null when nothing
 *          is found; `error` is set when the URL is invalid or the homepage cannot
 *          be fetched.
 */
export async function crawlWebsiteForMenuLinks(websiteUrl: string): Promise<WebsiteCrawlResult> {
  console.log(`[WebsiteCrawl] Crawling ${websiteUrl} for menu links...`);
  const result: WebsiteCrawlResult = {
    menuUrl: null,
    provider: 'unknown',
    foundLinks: [],
    crawledPages: [],
  };

  // Normalize URL
  const baseUrl = parseWebsiteUrl(websiteUrl);
  if (!baseUrl) {
    result.error = 'Invalid website URL';
    return result;
  }

  // First provider (in table order) with a pattern matching the link, or null.
  const matchProvider = (link: string): MenuProvider | null => {
    for (const { provider, patterns } of PROVIDER_URL_PATTERNS) {
      if (patterns.some(p => p.test(link))) return provider;
    }
    return null;
  };

  // Step 1: Fetch the homepage
  const homepage = baseUrl.href;
  result.crawledPages.push(homepage);
  const { links: homepageLinks, error: homepageError } = await fetchPageLinks(homepage);
  if (homepageError) {
    result.error = `Failed to fetch homepage: ${homepageError}`;
    return result;
  }
  result.foundLinks = [...homepageLinks];

  // Step 2: Try to extract reactEnv.dispensaryId (embedded Dutchie menu) from homepage HTML.
  // Best-effort: a failure here is logged and the crawl continues with link matching.
  const reactEnvController = new AbortController();
  const reactEnvTimer = setTimeout(() => reactEnvController.abort(), 10000);
  try {
    const resp = await fetch(homepage, {
      signal: reactEnvController.signal,
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      },
      redirect: 'follow',
    });
    if (resp.ok) {
      const html = await resp.text();
      const reactEnvMatch = /window\.reactEnv\s*=\s*\{[^}]*"dispensaryId"\s*:\s*"([a-fA-F0-9]+)"/i.exec(html);
      if (reactEnvMatch && reactEnvMatch[1]) {
        result.provider = 'dutchie';
        result.menuUrl = homepage;
        result.platformDispensaryId = reactEnvMatch[1];
        console.log(`[WebsiteCrawl] Found reactEnv.dispensaryId=${reactEnvMatch[1]} on homepage ${homepage}`);
        return result;
      }
    }
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    console.log(`[WebsiteCrawl] reactEnv check failed for ${homepage}: ${reason}`);
  } finally {
    clearTimeout(reactEnvTimer);
  }

  // Step 3: Check for direct provider matches in homepage links
  for (const link of homepageLinks) {
    const provider = matchProvider(link);
    if (provider) {
      console.log(`[WebsiteCrawl] Found ${provider} link on homepage: ${link}`);
      result.menuUrl = link;
      result.provider = provider;
      return result;
    }
  }

  // Step 4: Find same-domain menu/order/shop links to follow.
  // No homepage link matches a provider pattern at this point (step 3 would have
  // returned), so only same-domain candidates remain.
  const menuLinks = homepageLinks.filter(link => {
    try {
      const { hostname } = new URL(link);
      const isSameDomain = hostname === baseUrl.hostname || hostname.endsWith(`.${baseUrl.hostname}`);
      return isSameDomain && MENU_LINK_PATTERNS.some(p => p.test(link));
    } catch {
      return false;
    }
  });
  console.log(`[WebsiteCrawl] Found ${menuLinks.length} potential menu links to follow`);

  // Step 5: Follow menu links (limit to 3 to avoid excessive crawling)
  for (const menuLink of menuLinks.slice(0, 3)) {
    // Skip if we've already crawled this page
    if (result.crawledPages.includes(menuLink)) continue;
    result.crawledPages.push(menuLink);

    // Rate limit
    await new Promise(r => setTimeout(r, 500));

    const { links: pageLinks, error: pageError } = await fetchPageLinks(menuLink);
    if (pageError) {
      console.log(`[WebsiteCrawl] Failed to fetch ${menuLink}: ${pageError}`);
      continue;
    }
    result.foundLinks.push(...pageLinks);

    // Check for provider matches on this page
    for (const link of pageLinks) {
      const provider = matchProvider(link);
      if (provider) {
        console.log(`[WebsiteCrawl] Found ${provider} link on ${menuLink}: ${link}`);
        result.menuUrl = link;
        result.provider = provider;
        return result;
      }
    }
  }

  console.log(`[WebsiteCrawl] No menu provider found on ${websiteUrl}`);
  return result;
}
// ============================================================
// CORE DETECTION FUNCTIONS
// ============================================================
/**
 * Classify a menu URL by provider.
 *
 * @param menuUrl URL to classify; null, undefined and '' yield 'unknown'.
 * @returns The first provider in PROVIDER_URL_PATTERNS with a matching pattern;
 *          otherwise 'custom' when the value parses as a URL whose hostname is
 *          non-empty and does not contain "localhost"; otherwise 'unknown'.
 */
export function detectProviderFromUrl(menuUrl: string | null | undefined): MenuProvider {
  if (!menuUrl) return 'unknown';

  const knownProvider = PROVIDER_URL_PATTERNS.find(({ patterns }) =>
    patterns.some(candidate => candidate.test(menuUrl))
  );
  if (knownProvider) return knownProvider.provider;

  // No known provider: a real (non-localhost) hostname means a custom website.
  let hostname: string;
  try {
    hostname = new URL(menuUrl).hostname;
  } catch {
    // Invalid URL
    return 'unknown';
  }
  return hostname && !hostname.includes('localhost') ? 'custom' : 'unknown';
}
/**
 * Store a Dutchie platform ID that a website crawl captured directly
 * (reactEnv.dispensaryId on the homepage) and record that it was resolved.
 */
async function saveCrawledDutchiePlatformId(dispensaryId: number, platformDispensaryId: string): Promise<void> {
  await query(
    `
UPDATE dispensaries SET
platform_dispensary_id = $1,
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'platform_id_resolved', true,
'platform_id_resolved_at', NOW(),
'detected_provider', 'dutchie'::text,
'detection_method', 'website_crawl'::text
),
updated_at = NOW()
WHERE id = $2
`,
    [platformDispensaryId, dispensaryId]
  );
}

/**
 * Detect the menu provider for a single dispensary and, for dutchie, resolve its
 * platform dispensary ID. Every outcome is written back to the dispensaries row
 * (menu_type, menu_url, platform_dispensary_id, provider_detection_data).
 *
 * Flow:
 * 1. Load the row; return a failed result if it does not exist.
 * 2. If menu_url is empty, crawl the website for a menu link. With no website, or
 *    when the crawl finds nothing, the row is marked not crawlable and the function
 *    returns (success=true with an explanatory `error`). If the crawl captured a
 *    Dutchie platform ID directly, it is saved and the function returns.
 * 3. Detect the provider from the menu URL.
 * 4. Non-dutchie providers: store menu_type and return.
 * 5. Dutchie: extract a platform ID or cName from the URL; a cName is resolved to a
 *    platform ID through resolveDispensaryId.
 *
 * @param dispensaryId Primary key of the dispensary to process.
 * @returns The detection result. `success` is false when the row is missing or when
 *          a dutchie platform ID could not be extracted or resolved.
 */
export async function detectAndResolveDispensary(dispensaryId: number): Promise<DetectionResult> {
  console.log(`[MenuDetection] Processing dispensary ${dispensaryId}...`);

  // Get dispensary record
  const { rows } = await query<any>(
    `SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE id = $1`,
    [dispensaryId]
  );
  if (rows.length === 0) {
    return {
      dispensaryId,
      dispensaryName: 'Unknown',
      previousMenuType: null,
      detectedProvider: 'unknown',
      cName: null,
      platformDispensaryId: null,
      success: false,
      error: 'Dispensary not found',
    };
  }

  const dispensary = mapDbRowToDispensary(rows[0]);
  let menuUrl = dispensary.menuUrl;
  const previousMenuType = dispensary.menuType || null;
  const website = dispensary.website;

  // If menu_url is null or empty, try to discover it by crawling the dispensary website
  if (!menuUrl || menuUrl.trim() === '') {
    console.log(`[MenuDetection] ${dispensary.name}: No menu_url - attempting website crawl`);

    // Check if website is available
    if (!website || website.trim() === '') {
      console.log(`[MenuDetection] ${dispensary.name}: No website available - marking as not crawlable`);
      await query(
        `
UPDATE dispensaries SET
menu_type = 'unknown',
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', 'unknown'::text,
'detection_method', 'no_data'::text,
'detected_at', NOW(),
'resolution_error', 'No menu_url and no website available'::text,
'not_crawlable', true,
'website_crawl_attempted', false
),
updated_at = NOW()
WHERE id = $1
`,
        [dispensaryId]
      );
      return {
        dispensaryId,
        dispensaryName: dispensary.name,
        previousMenuType,
        detectedProvider: 'unknown',
        cName: null,
        platformDispensaryId: null,
        success: true,
        error: 'No menu_url and no website available - marked as not crawlable',
      };
    }

    // Crawl the website to find menu provider links
    console.log(`[MenuDetection] ${dispensary.name}: Crawling website ${website} for menu links...`);
    const crawlResult = await crawlWebsiteForMenuLinks(website);

    if (crawlResult.menuUrl && crawlResult.provider !== 'unknown') {
      // SUCCESS: Found a menu URL from website crawl!
      console.log(`[MenuDetection] ${dispensary.name}: Found ${crawlResult.provider} menu at ${crawlResult.menuUrl}`);
      menuUrl = crawlResult.menuUrl;

      // Update the dispensary with the discovered menu_url
      await query(
        `
UPDATE dispensaries SET
menu_url = $1,
menu_type = $2,
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', $2::text,
'detection_method', 'website_crawl'::text,
'detected_at', NOW(),
'website_crawled', $3::text,
'website_crawl_pages', $4::jsonb,
'not_crawlable', false
),
updated_at = NOW()
WHERE id = $5
`,
        [
          crawlResult.menuUrl,
          crawlResult.provider,
          website,
          JSON.stringify(crawlResult.crawledPages),
          dispensaryId
        ]
      );

      // When the crawl read the Dutchie ID straight from the homepage (reactEnv),
      // menu_url is the dispensary's own homepage. Re-detecting the provider from
      // that URL would classify it as 'custom' and overwrite the dutchie menu_type,
      // losing the captured ID - so persist the ID and finish here.
      if (crawlResult.provider === 'dutchie' && crawlResult.platformDispensaryId) {
        await saveCrawledDutchiePlatformId(dispensaryId, crawlResult.platformDispensaryId);
        console.log(`[MenuDetection] ${dispensary.name}: Resolved platform ID from reactEnv = ${crawlResult.platformDispensaryId}`);
        return {
          dispensaryId,
          dispensaryName: dispensary.name,
          previousMenuType,
          detectedProvider: 'dutchie',
          cName: null,
          platformDispensaryId: crawlResult.platformDispensaryId,
          success: true,
        };
      }
      // Continue with full detection flow using the discovered menu_url
    } else {
      // Website crawl failed to find a menu provider
      const errorReason = crawlResult.error || 'No menu provider links found on website';
      console.log(`[MenuDetection] ${dispensary.name}: Website crawl failed - ${errorReason}`);
      await query(
        `
UPDATE dispensaries SET
menu_type = 'unknown',
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', 'unknown'::text,
'detection_method', 'website_crawl'::text,
'detected_at', NOW(),
'website_crawled', $1::text,
'website_crawl_pages', $2::jsonb,
'resolution_error', $3::text,
'not_crawlable', true
),
updated_at = NOW()
WHERE id = $4
`,
        [
          website,
          JSON.stringify(crawlResult.crawledPages),
          errorReason,
          dispensaryId
        ]
      );
      return {
        dispensaryId,
        dispensaryName: dispensary.name,
        previousMenuType,
        detectedProvider: 'unknown',
        cName: null,
        platformDispensaryId: null,
        success: true,
        error: `Website crawl failed: ${errorReason}`,
      };
    }
  }

  // Detect provider from URL
  const detectedProvider = detectProviderFromUrl(menuUrl);
  console.log(`[MenuDetection] ${dispensary.name}: Detected provider = ${detectedProvider} from URL: ${menuUrl}`);

  // Initialize result
  const result: DetectionResult = {
    dispensaryId,
    dispensaryName: dispensary.name,
    previousMenuType,
    detectedProvider,
    cName: null,
    platformDispensaryId: null,
    success: false,
  };

  // For domains like curaleaf/sol, crawl the website to find the actual menu provider (often Dutchie)
  // NOTE(review): detectProviderFromUrl only returns providers listed in
  // PROVIDER_URL_PATTERNS, 'custom' or 'unknown', so neither 'curaleaf' nor 'sol' can
  // occur here and this branch looks unreachable as written. The trigger condition is
  // left untouched on purpose - confirm the intended rule before enabling it, because
  // the branch can reset menu_type and platform_dispensary_id.
  const SPECIAL_DOMAINS = ['curaleaf', 'sol'] as const;
  const isSpecialDomain = SPECIAL_DOMAINS.includes(detectedProvider as any);
  if (isSpecialDomain && website && website.trim() !== '') {
    console.log(`[MenuDetection] ${dispensary.name}: Detected ${detectedProvider} domain - crawling website to find actual menu provider (often Dutchie)...`);
    const crawlResult = await crawlWebsiteForMenuLinks(website);

    if (crawlResult.menuUrl && crawlResult.provider !== 'unknown') {
      // Found an actual menu provider (likely Dutchie) - use that instead!
      console.log(`[MenuDetection] ${dispensary.name}: Website crawl found ${crawlResult.provider} menu at ${crawlResult.menuUrl}`);
      menuUrl = crawlResult.menuUrl;

      // Re-detect provider from the found URL
      const actualProvider = detectProviderFromUrl(menuUrl);

      // Update with the actual discovered provider
      await query(
        `
UPDATE dispensaries SET
menu_url = $1,
menu_type = $2,
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', $2::text,
'detection_method', 'website_crawl'::text,
'detected_at', NOW(),
'original_url_provider', $3::text,
'website_crawled', $4::text,
'website_crawl_pages', $5::jsonb,
'not_crawlable', false
),
updated_at = NOW()
WHERE id = $6
`,
        [
          crawlResult.menuUrl,
          actualProvider,
          detectedProvider,
          website,
          JSON.stringify(crawlResult.crawledPages),
          dispensaryId
        ]
      );

      // If the actual provider is dutchie, continue to platform ID resolution
      if (actualProvider === 'dutchie') {
        result.detectedProvider = 'dutchie';
        // If platformDispensaryId was captured (e.g., reactEnv on homepage), save it now and return
        if (crawlResult.platformDispensaryId) {
          await saveCrawledDutchiePlatformId(dispensaryId, crawlResult.platformDispensaryId);
          result.platformDispensaryId = crawlResult.platformDispensaryId;
          result.success = true;
          console.log(`[MenuDetection] ${dispensary.name}: Resolved platform ID from reactEnv = ${crawlResult.platformDispensaryId}`);
          return result;
        }
        // Fall through to dutchie platform ID resolution below if no platform ID captured
      } else {
        // Found a different provider (treez, jane, etc.) - we're done
        result.detectedProvider = actualProvider;
        result.success = true;
        console.log(`[MenuDetection] ${dispensary.name}: Updated menu_type to ${actualProvider} (discovered from website crawl)`);
        return result;
      }
    } else {
      // Website crawl didn't find any menu provider - mark unknown with reason
      const notCrawlableReason = `No embedded menu provider found`;
      console.log(`[MenuDetection] ${dispensary.name}: Website crawl found no menu provider - marking as unknown`);
      await query(
        `
UPDATE dispensaries SET
menu_type = 'unknown',
platform_dispensary_id = NULL,
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', 'unknown'::text,
'detection_method', 'url_pattern_with_crawl'::text,
'detected_at', NOW(),
'website_crawled', $1::text,
'website_crawl_pages', $2::jsonb,
'not_crawlable', true,
'not_crawlable_reason', $3::text
),
updated_at = NOW()
WHERE id = $4
`,
        [
          website,
          JSON.stringify(crawlResult.crawledPages),
          notCrawlableReason,
          dispensaryId
        ]
      );
      result.success = true;
      return result;
    }
  }

  // If not dutchie, just update menu_type (non-dutchie providers).
  // result.detectedProvider is used (not the URL-only detection) so that a provider
  // re-detected by the crawl branch above is honoured by the fall-through.
  if (result.detectedProvider !== 'dutchie') {
    await query(
      `
UPDATE dispensaries SET
menu_type = $1,
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', $1::text,
'detection_method', 'url_pattern'::text,
'detected_at', NOW(),
'not_crawlable', false
),
updated_at = NOW()
WHERE id = $2
`,
      [result.detectedProvider, dispensaryId]
    );
    result.success = true;
    console.log(`[MenuDetection] ${dispensary.name}: Updated menu_type to ${result.detectedProvider}`);
    return result;
  }

  // For dutchie: extract cName or platformId from menu_url
  const extraction = extractFromMenuUrl(menuUrl);
  if (!extraction) {
    result.error = `Could not extract cName or platformId from menu_url: ${menuUrl}`;
    await query(
      `
UPDATE dispensaries SET
menu_type = 'dutchie',
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', 'dutchie'::text,
'detection_method', 'url_pattern'::text,
'detected_at', NOW(),
'resolution_error', $1::text,
'not_crawlable', true
),
updated_at = NOW()
WHERE id = $2
`,
      [result.error, dispensaryId]
    );
    console.log(`[MenuDetection] ${dispensary.name}: ${result.error}`);
    return result;
  }

  // If URL contains platform_dispensary_id directly (e.g., /api/v2/embedded-menu/<id>.js), skip GraphQL resolution
  if (extraction.type === 'platformId') {
    const platformId = extraction.value;
    result.platformDispensaryId = platformId;
    result.success = true;
    await query(
      `
UPDATE dispensaries SET
menu_type = 'dutchie',
platform_dispensary_id = $1,
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', 'dutchie'::text,
'detection_method', 'url_direct_platform_id'::text,
'detected_at', NOW(),
'platform_id_source', 'url_embedded'::text,
'platform_id_resolved', true,
'platform_id_resolved_at', NOW(),
'resolution_error', NULL::text,
'not_crawlable', false
),
updated_at = NOW()
WHERE id = $2
`,
      [platformId, dispensaryId]
    );
    console.log(`[MenuDetection] ${dispensary.name}: Platform ID extracted directly from URL = ${platformId}`);
    return result;
  }

  // Otherwise, we have a cName that needs GraphQL resolution
  const cName = extraction.value;
  result.cName = cName;

  // Resolve platform_dispensary_id from cName
  console.log(`[MenuDetection] ${dispensary.name}: Resolving platform ID for cName = ${cName}`);
  try {
    const platformId = await resolveDispensaryId(cName);
    if (platformId) {
      result.platformDispensaryId = platformId;
      result.success = true;
      await query(
        `
UPDATE dispensaries SET
menu_type = 'dutchie',
platform_dispensary_id = $1,
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', 'dutchie'::text,
'detection_method', 'url_pattern'::text,
'detected_at', NOW(),
'cname_extracted', $2::text,
'platform_id_resolved', true,
'platform_id_resolved_at', NOW(),
'resolution_error', NULL::text,
'not_crawlable', false
),
updated_at = NOW()
WHERE id = $3
`,
        [platformId, cName, dispensaryId]
      );
      console.log(`[MenuDetection] ${dispensary.name}: Resolved platform ID = ${platformId}`);
    } else {
      result.error = `cName "${cName}" could not be resolved - may not exist on Dutchie`;
      await query(
        `
UPDATE dispensaries SET
menu_type = 'dutchie',
platform_dispensary_id = NULL,
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', 'dutchie'::text,
'detection_method', 'url_pattern'::text,
'detected_at', NOW(),
'cname_extracted', $1::text,
'platform_id_resolved', false,
'resolution_error', $2::text,
'not_crawlable', true
),
updated_at = NOW()
WHERE id = $3
`,
        [cName, result.error, dispensaryId]
      );
      console.log(`[MenuDetection] ${dispensary.name}: ${result.error}`);
    }
  } catch (error: unknown) {
    // Resolution (or the write that follows it) threw: record the failure on the row
    // but keep any previously stored platform_dispensary_id.
    const reason = error instanceof Error ? error.message : String(error);
    result.error = `Resolution failed: ${reason}`;
    await query(
      `
UPDATE dispensaries SET
menu_type = 'dutchie',
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', 'dutchie'::text,
'detection_method', 'url_pattern'::text,
'detected_at', NOW(),
'cname_extracted', $1::text,
'platform_id_resolved', false,
'resolution_error', $2::text,
'not_crawlable', true
),
updated_at = NOW()
WHERE id = $3
`,
      [cName, result.error, dispensaryId]
    );
    console.error(`[MenuDetection] ${dispensary.name}: ${result.error}`);
  }
  return result;
}
/**
 * Run detection for every dispensary that still needs it and aggregate the results.
 *
 * Candidate rows are those with a menu_url, plus (optionally) rows without a menu_url
 * that have a website and are not marked not_crawlable, plus (optionally) dutchie
 * rows without a platform ID. The `onlyUnknown` / `onlyMissingPlatformId` filters are
 * then applied on top. Rows are processed one at a time in name order.
 *
 * @param options.state Restrict to one state (no restriction when omitted).
 * @param options.onlyUnknown Only rows whose menu_type is NULL, '' or 'unknown' (default true).
 * @param options.onlyMissingPlatformId Only dutchie rows without a platform ID (default false).
 * @param options.includeWebsiteCrawl Include rows with a website but no menu_url (default true).
 * @param options.includeDutchieMissingPlatformId Include dutchie rows with a NULL platform ID (default true).
 * @param options.limit Maximum number of rows to process (no limit when omitted or 0).
 * @returns Totals, per-dispensary results and collected error messages.
 */
export async function runBulkDetection(options: {
  state?: string;
  onlyUnknown?: boolean;
  onlyMissingPlatformId?: boolean;
  includeWebsiteCrawl?: boolean; // Include dispensaries with website but no menu_url
  includeDutchieMissingPlatformId?: boolean; // include menu_type='dutchie' with null platform_id
  limit?: number;
} = {}): Promise<BulkDetectionResult> {
  const {
    state,
    onlyUnknown = true,
    onlyMissingPlatformId = false,
    includeWebsiteCrawl = true,
    includeDutchieMissingPlatformId = true,
    limit,
  } = options;
  console.log('[MenuDetection] Starting bulk detection...');

  // Build query to find dispensaries needing detection
  // Includes: dispensaries with menu_url OR (no menu_url but has website and not already marked not_crawlable)
  // Optionally includes dutchie stores missing platform ID
  //
  // `IS NOT TRUE` is deliberate: when provider_detection_data has no 'not_crawlable'
  // key the cast yields NULL, and a plain `NOT (...)` would evaluate to NULL and
  // silently exclude the row.
  let whereClause = `WHERE (
menu_url IS NOT NULL
${includeWebsiteCrawl ? `OR (
menu_url IS NULL
AND website IS NOT NULL
AND website != ''
AND (provider_detection_data IS NULL OR (provider_detection_data->>'not_crawlable')::boolean IS NOT TRUE)
)` : ''}
${includeDutchieMissingPlatformId ? `OR (
menu_type = 'dutchie' AND platform_dispensary_id IS NULL
)` : ''}
)`;
  const params: any[] = [];
  let paramIndex = 1;
  if (state) {
    whereClause += ` AND state = $${paramIndex++}`;
    params.push(state);
  }

  // Handle filters for unknown and/or missing platform IDs
  if (onlyUnknown && onlyMissingPlatformId) {
    whereClause += ` AND (
(menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown')
OR (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)
)`;
  } else if (onlyUnknown) {
    whereClause += ` AND (
(menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown')
${includeDutchieMissingPlatformId ? `OR (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)` : ''}
)`;
  } else if (onlyMissingPlatformId) {
    whereClause += ` AND (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)`;
  } else if (includeDutchieMissingPlatformId) {
    // Always attempt to resolve dutchie stores missing platform IDs
    whereClause += ` AND (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)`;
  }

  let query_str = `
SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
${whereClause}
ORDER BY name
`;
  if (limit) {
    query_str += ` LIMIT $${paramIndex}`;
    params.push(limit);
  }

  const { rows: dispensaries } = await query<any>(query_str, params);
  console.log(`[MenuDetection] Found ${dispensaries.length} dispensaries to process (includeWebsiteCrawl=${includeWebsiteCrawl})`);

  const result: BulkDetectionResult = {
    totalProcessed: 0,
    totalSucceeded: 0,
    totalFailed: 0,
    totalSkipped: 0,
    results: [],
    errors: [],
  };

  for (const row of dispensaries) {
    result.totalProcessed++;
    try {
      const detectionResult = await detectAndResolveDispensary(row.id);
      result.results.push(detectionResult);
      if (detectionResult.success) {
        result.totalSucceeded++;
      } else {
        result.totalFailed++;
        if (detectionResult.error) {
          result.errors.push(`${detectionResult.dispensaryName}: ${detectionResult.error}`);
        }
      }
      // Rate limit between requests
      await new Promise(r => setTimeout(r, 1000));
    } catch (error: unknown) {
      // One failing dispensary must not abort the whole run.
      const reason = error instanceof Error ? error.message : String(error);
      result.totalFailed++;
      result.errors.push(`${row.name || row.id}: ${reason}`);
    }
  }

  console.log(`[MenuDetection] Bulk detection complete: ${result.totalSucceeded} succeeded, ${result.totalFailed} failed`);
  return result;
}
// ============================================================
// SCHEDULED JOB EXECUTOR
// ============================================================
/**
 * Scheduler entry point for the menu detection job.
 *
 * Reads `state` (default 'AZ') and the boolean flags `onlyUnknown`,
 * `onlyMissingPlatformId` and `includeDutchieMissingPlatformId` from `config`; each
 * flag is treated as enabled unless it is explicitly `false`. Runs a bulk detection
 * with those settings and converts the outcome into a job summary.
 *
 * @param config Job configuration as supplied by the scheduler.
 * @returns Job summary. `status` is 'success' when nothing failed, 'error' when
 *          nothing succeeded (or the run itself threw), otherwise 'partial'.
 */
export async function executeMenuDetectionJob(config: Record<string, any> = {}): Promise<{
  status: JobStatus;
  itemsProcessed: number;
  itemsSucceeded: number;
  itemsFailed: number;
  errorMessage?: string;
  metadata?: any;
}> {
  // A flag counts as on unless the config explicitly turns it off.
  const enabledUnlessFalse = (flag: unknown): boolean => flag !== false;

  const state = config.state || 'AZ';
  const onlyUnknown = enabledUnlessFalse(config.onlyUnknown);
  // Default to true - always try to resolve platform IDs for dutchie stores
  const onlyMissingPlatformId = enabledUnlessFalse(config.onlyMissingPlatformId);
  const includeDutchieMissingPlatformId = enabledUnlessFalse(config.includeDutchieMissingPlatformId);

  console.log(`[MenuDetection] Executing scheduled job for state=${state}...`);
  try {
    const bulk = await runBulkDetection({
      state,
      onlyUnknown,
      onlyMissingPlatformId,
      includeDutchieMissingPlatformId,
    });

    let status: JobStatus;
    if (bulk.totalFailed === 0) {
      status = 'success';
    } else if (bulk.totalSucceeded === 0) {
      status = 'error';
    } else {
      status = 'partial';
    }

    // Only the first five error messages are surfaced in the summary.
    const errorMessage = bulk.errors.length > 0 ? bulk.errors.slice(0, 5).join('; ') : undefined;
    const metadata = {
      state,
      onlyUnknown,
      onlyMissingPlatformId,
      providerCounts: countByProvider(bulk.results),
    };

    return {
      status,
      itemsProcessed: bulk.totalProcessed,
      itemsSucceeded: bulk.totalSucceeded,
      itemsFailed: bulk.totalFailed,
      errorMessage,
      metadata,
    };
  } catch (error: any) {
    return {
      status: 'error',
      itemsProcessed: 0,
      itemsSucceeded: 0,
      itemsFailed: 0,
      errorMessage: error.message,
    };
  }
}
/**
 * Tally detection results per detected provider.
 *
 * @param results Detection results to summarise.
 * @returns Map from provider name to the number of results with that provider;
 *          providers that never occur are absent.
 */
function countByProvider(results: DetectionResult[]): Record<string, number> {
  return results.reduce<Record<string, number>>((tally, { detectedProvider }) => {
    tally[detectedProvider] = (tally[detectedProvider] || 0) + 1;
    return tally;
  }, {});
}
// ============================================================
// UTILITY FUNCTIONS
// ============================================================
/**
 * Get detection stats for the dashboard.
 *
 * @param state State whose dispensaries are counted (default 'AZ', the value that
 *              was previously hard-coded).
 * @returns Counts for the state:
 *  - totalDispensaries: all rows;
 *  - withMenuType: rows whose menu_type is set and not 'unknown';
 *  - withPlatformId: rows with a platform_dispensary_id;
 *  - needsDetection: rows with a menu_url whose menu_type is NULL, '' or 'unknown';
 *  - byProvider: row count per non-empty menu_type.
 */
export async function getDetectionStats(state: string = 'AZ'): Promise<{
  totalDispensaries: number;
  withMenuType: number;
  withPlatformId: number;
  needsDetection: number;
  byProvider: Record<string, number>;
}> {
  const { rows } = await query<any>(
    `
SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE menu_type IS NOT NULL AND menu_type != '' AND menu_type != 'unknown') as with_menu_type,
COUNT(*) FILTER (WHERE platform_dispensary_id IS NOT NULL) as with_platform_id,
COUNT(*) FILTER (WHERE menu_url IS NOT NULL AND (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown')) as needs_detection
FROM dispensaries
WHERE state = $1
`,
    [state]
  );
  const stats = rows[0] || {};

  // Get provider breakdown
  const { rows: providerRows } = await query<any>(
    `
SELECT menu_type, COUNT(*) as count
FROM dispensaries
WHERE state = $1 AND menu_type IS NOT NULL AND menu_type != ''
GROUP BY menu_type
ORDER BY count DESC
`,
    [state]
  );
  const byProvider: Record<string, number> = {};
  for (const row of providerRows) {
    // The counts are parsed rather than used as-is - presumably the driver returns
    // COUNT(*) as a string; verify against the query helper.
    byProvider[row.menu_type] = parseInt(row.count, 10);
  }

  return {
    totalDispensaries: parseInt(stats.total || '0', 10),
    withMenuType: parseInt(stats.with_menu_type || '0', 10),
    withPlatformId: parseInt(stats.with_platform_id || '0', 10),
    needsDetection: parseInt(stats.needs_detection || '0', 10),
    byProvider,
  };
}
/**
 * Get dispensaries needing detection.
 *
 * A dispensary qualifies when it has a menu_url and either no usable menu_type
 * (NULL, '' or 'unknown') or is dutchie without a platform ID. With
 * `includeWebsiteCrawl`, dispensaries that have a website but no menu_url also
 * qualify, unless they are already marked not_crawlable.
 *
 * @param options.state State to search (default 'AZ').
 * @param options.limit Maximum number of rows returned (default 100).
 * @param options.includeWebsiteCrawl Include website-only dispensaries (default true).
 * @returns Matching dispensaries ordered by name.
 */
export async function getDispensariesNeedingDetection(options: {
  state?: string;
  limit?: number;
  includeWebsiteCrawl?: boolean;
} = {}): Promise<Dispensary[]> {
  const { state = 'AZ', limit = 100, includeWebsiteCrawl = true } = options;
  // `IS NOT TRUE` is deliberate: when provider_detection_data has no 'not_crawlable'
  // key the cast yields NULL, and a plain `NOT (...)` would evaluate to NULL and
  // silently exclude the row.
  const { rows } = await query<any>(
    `
SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
WHERE state = $1
AND (
(menu_url IS NOT NULL AND (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown'
OR (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)))
${includeWebsiteCrawl ? `OR (
menu_url IS NULL
AND website IS NOT NULL
AND website != ''
AND (provider_detection_data IS NULL OR (provider_detection_data->>'not_crawlable')::boolean IS NOT TRUE)
)` : ''}
)
ORDER BY name
LIMIT $2
`,
    [state, limit]
  );
  return rows.map(mapDbRowToDispensary);
}