/**
 * Menu Detection Service
 *
 * Detects menu provider (dutchie, treez, jane, etc.) from dispensary menu_url
 * and resolves platform_dispensary_id for dutchie stores.
 *
 * This service:
 * 1. Iterates dispensaries with unknown/missing menu_type or platform_dispensary_id
 * 2. Detects provider from menu_url patterns
 * 3. For dutchie: extracts cName and resolves platform_dispensary_id via GraphQL
 * 4. Logs results to job_run_logs
 */
|
|
|
|
import { query } from '../db/connection';
|
|
import { extractCNameFromMenuUrl, extractFromMenuUrl, mapDbRowToDispensary } from './discovery';
|
|
import { resolveDispensaryId } from './graphql-client';
|
|
import { Dispensary, JobStatus } from '../types';
|
|
|
|
// Explicit column list for the dispensaries table (avoids SELECT * issues with schema differences).
// Interpolated verbatim into the SELECT statements in this file, so it must stay a plain
// comma-separated list of column names.
const DISPENSARY_COLUMNS = `
  id, name, slug, city, state, zip, address, latitude, longitude,
  menu_type, menu_url, platform_dispensary_id, website,
  provider_detection_data, created_at, updated_at
`;
|
|
|
|
// ============================================================
// TYPES
// ============================================================
|
|
|
|
/**
 * Menu platforms a dispensary can be classified as.
 *
 * `'custom'` is used for a URL with a real hostname that matches no known
 * provider pattern; `'unknown'` means no provider could be determined.
 */
export type MenuProvider =
  | 'dutchie'
  | 'treez'
  | 'jane'
  | 'iheartjane'
  | 'weedmaps'
  | 'leafly'
  | 'meadow'
  | 'blaze'
  | 'flowhub'
  | 'dispense'
  | 'custom'
  | 'unknown';
|
|
|
|
/**
 * Outcome of running provider detection for a single dispensary.
 */
export interface DetectionResult {
  /** Primary key of the dispensary row that was processed. */
  dispensaryId: number;
  /** Dispensary name, or 'Unknown' when the row was not found. */
  dispensaryName: string;
  /** menu_type stored on the row before this run (null when empty). */
  previousMenuType: string | null;
  /** Provider determined by this run. */
  detectedProvider: MenuProvider;
  /** Dutchie cName extracted from the menu URL, when applicable. */
  cName: string | null;
  /** Resolved Dutchie platform ID, when one was obtained. */
  platformDispensaryId: string | null;
  /** Whether the run completed; may be true even when `error` carries a note. */
  success: boolean;
  /** Human-readable failure reason or informational note. */
  error?: string;
}
|
|
|
|
/**
 * Aggregate outcome of a bulk detection run.
 */
export interface BulkDetectionResult {
  /** Number of dispensaries the run attempted. */
  totalProcessed: number;
  /** Number of dispensaries whose detection reported success. */
  totalSucceeded: number;
  /** Number of dispensaries whose detection failed or threw. */
  totalFailed: number;
  /** Reserved counter; initialised to 0 and not incremented by runBulkDetection. */
  totalSkipped: number;
  /** Per-dispensary results, in processing order. */
  results: DetectionResult[];
  /** "<dispensary>: <message>" strings for failed dispensaries. */
  errors: string[];
}
|
|
|
|
// ============================================================
// PROVIDER DETECTION PATTERNS
// ============================================================
|
|
|
|
/**
 * URL patterns per provider. Order matters: consumers take the FIRST entry
 * with a matching pattern, so earlier providers win when a URL matches several.
 * The patterns carry no `g` flag, so `.test()` is stateless and safe to reuse.
 */
const PROVIDER_URL_PATTERNS: Array<{ provider: MenuProvider; patterns: RegExp[] }> = [
  // We detect provider based on the actual menu link we find, not just the site domain.
  {
    provider: 'dutchie',
    patterns: [
      /dutchie\.com/i,
      /\/embedded-menu\//i,
      /\/dispensary\/[A-Z]{2}-/i, // e.g., /dispensary/AZ-store-name
      /dutchie-plus/i,
    ],
  },
  {
    provider: 'treez',
    patterns: [
      /treez\.io/i,
      /shop\.treez/i,
      /treez-ecommerce/i,
    ],
  },
  {
    provider: 'jane',
    patterns: [
      /jane\.co/i,
      /iheartjane\.com/i,
      /embed\.iheartjane/i,
    ],
  },
  {
    provider: 'weedmaps',
    patterns: [
      /weedmaps\.com/i,
      /menu\.weedmaps/i,
    ],
  },
  {
    provider: 'leafly',
    patterns: [
      /leafly\.com/i,
      /order\.leafly/i,
    ],
  },
  {
    provider: 'meadow',
    patterns: [
      /getmeadow\.com/i,
      /meadow\.co/i,
    ],
  },
  {
    provider: 'blaze',
    patterns: [
      /blaze\.me/i,
      /blazepos\.com/i,
    ],
  },
  {
    provider: 'flowhub',
    patterns: [
      /flowhub\.com/i,
      /flowhub\.co/i,
    ],
  },
  {
    provider: 'dispense',
    patterns: [
      /dispense\.io/i,
      /dispenseapp\.com/i,
    ],
  },
];
|
|
|
|
// ============================================================
// WEBSITE CRAWL FUNCTIONS
// ============================================================
|
|
|
|
/**
 * Result from crawling a website to find menu links
 */
export interface WebsiteCrawlResult {
  /** URL of the discovered menu, or null when none was found. */
  menuUrl: string | null;
  /** Provider matched for `menuUrl`; 'unknown' when nothing matched. */
  provider: MenuProvider;
  /** Every link collected from the pages that were fetched. */
  foundLinks: string[];
  /** Pages the crawl attempted, starting with the homepage. */
  crawledPages: string[];
  /** Dutchie platform ID, when it could be read directly from the page. */
  platformDispensaryId?: string | null;
  /** Reason the crawl stopped early (invalid URL, homepage fetch failure). */
  error?: string;
}
|
|
|
|
/**
 * Link patterns that suggest a menu or ordering page.
 * The first group matches generic path segments; the second matches
 * domains of known menu platforms.
 */
const MENU_LINK_PATTERNS = [
  /\/menu/i,
  /\/order/i,
  /\/shop/i,
  /\/products/i,
  /\/dispensary/i,
  /\/store/i,
  /curaleaf\.com/i,
  /dutchie\.com/i,
  /treez\.io/i,
  /jane\.co/i,
  /iheartjane\.com/i,
  /weedmaps\.com/i,
  /leafly\.com/i,
  /getmeadow\.com/i,
  /blaze\.me/i,
  /flowhub\.com/i,
  /dispense\.io/i,
];
|
|
|
|
/**
 * Check if a URL is a Curaleaf store URL, i.e. it contains
 * `curaleaf.com/stores/` or `curaleaf.com/dispensary/` (case-insensitive).
 *
 * @param url URL to inspect; null, undefined and '' yield false.
 * @returns true when the URL points at a Curaleaf store page.
 */
function isCuraleafUrl(url: string | null | undefined): boolean {
  if (!url) return false;
  return /curaleaf\.com\/(stores|dispensary)\//i.test(url);
}
|
|
|
|
/**
 * Fetch a page and extract the links it references.
 *
 * Collects every `href="..."` value, plus any `src="..."` value that matches a
 * known provider pattern (embedded menus are commonly loaded via iframe/script
 * src). Relative URLs are resolved against `url`; the result is de-duplicated.
 *
 * Never throws: failures are reported through the `error` field.
 *
 * @param url     Absolute URL of the page to fetch.
 * @param timeout Milliseconds allowed for the whole request, including
 *                reading the response body.
 * @returns The links found, and `error` ('Timeout', 'HTTP <status>' or the
 *          underlying message) when the page could not be fetched.
 */
async function fetchPageLinks(url: string, timeout: number = 10000): Promise<{ links: string[]; error?: string }> {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeout);

  try {
    const response = await fetch(url, {
      signal: controller.signal,
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      },
      redirect: 'follow',
    });

    if (!response.ok) {
      return { links: [], error: `HTTP ${response.status}` };
    }

    // The abort timer is still armed here, so a stalled body is bounded too.
    const html = await response.text();

    const links: string[] = [];
    let match: RegExpExecArray | null;

    // Every href attribute value, resolved to an absolute URL.
    const linkRegex = /href=["']([^"']+)["']/gi;
    while ((match = linkRegex.exec(html)) !== null) {
      try {
        links.push(new URL(match[1], url).href);
      } catch {
        // Skip invalid URLs
      }
    }

    // src attributes (common for embedded menus), kept only when they
    // match a known provider pattern.
    const srcRegex = /src=["']([^"']+)["']/gi;
    while ((match = srcRegex.exec(html)) !== null) {
      try {
        const absoluteUrl = new URL(match[1], url).href;
        const isProviderUrl = PROVIDER_URL_PATTERNS.some(({ patterns }) =>
          patterns.some(p => p.test(absoluteUrl))
        );
        if (isProviderUrl) {
          links.push(absoluteUrl);
        }
      } catch {
        // Skip invalid URLs
      }
    }

    return { links: [...new Set(links)] }; // Deduplicate
  } catch (error: unknown) {
    const err = error as { name?: unknown; message?: unknown } | null | undefined;
    if (err?.name === 'AbortError') {
      return { links: [], error: 'Timeout' };
    }
    // Always report a non-empty message so callers never mistake a failure
    // for an empty-but-successful page.
    const message = typeof err?.message === 'string' && err.message ? err.message : String(error);
    return { links: [], error: message };
  } finally {
    // Runs on every path, so the timer cannot outlive the request.
    clearTimeout(timeoutId);
  }
}
|
|
|
|
/**
 * Crawl a dispensary's website to find menu provider links
 *
 * Strategy:
 * 1. Fetch the homepage and extract all links
 * 2. Look for an embedded Dutchie menu (window.reactEnv.dispensaryId) in the homepage HTML
 * 3. Look for links that match known provider patterns (dutchie, treez, etc.)
 * 4. If no direct match, pick same-site menu/order/shop links
 * 5. Follow up to 3 of them and check those pages for provider patterns
 *
 * @param websiteUrl Website address; a bare domain such as "www.example.com"
 *                   is treated as https.
 * @returns The crawl outcome. `error` is set when the URL is invalid or the
 *          homepage could not be fetched; otherwise `provider` stays 'unknown'
 *          when nothing matched.
 */
export async function crawlWebsiteForMenuLinks(websiteUrl: string): Promise<WebsiteCrawlResult> {
  console.log(`[WebsiteCrawl] Crawling ${websiteUrl} for menu links...`);

  const result: WebsiteCrawlResult = {
    menuUrl: null,
    provider: 'unknown',
    foundLinks: [],
    crawledPages: [],
  };

  const requestHeaders = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  };

  // First provider (in PROVIDER_URL_PATTERNS order) with a pattern matching `link`.
  const matchProvider = (link: string): MenuProvider | undefined =>
    PROVIDER_URL_PATTERNS.find(({ patterns }) => patterns.some(p => p.test(link)))?.provider;

  // Parse `candidate`, accepting only http(s) URLs.
  const parseHttpUrl = (candidate: string): URL | null => {
    try {
      const parsed = new URL(candidate);
      return parsed.protocol === 'http:' || parsed.protocol === 'https:' ? parsed : null;
    } catch {
      return null;
    }
  };

  // Normalize URL. `new URL()` throws on scheme-less input, so a bare domain
  // gets an https:// prefix before parsing rather than after a failed parse.
  const trimmedUrl = websiteUrl.trim();
  const hasScheme = /^[a-z][a-z0-9+.-]*:\/\//i.test(trimmedUrl);
  const baseUrl = parseHttpUrl(hasScheme ? trimmedUrl : `https://${trimmedUrl}`);
  if (!baseUrl) {
    result.error = 'Invalid website URL';
    return result;
  }

  // Step 1: Fetch the homepage
  const homepage = baseUrl.href;
  result.crawledPages.push(homepage);

  const { links: homepageLinks, error: homepageError } = await fetchPageLinks(homepage);
  if (homepageError) {
    result.error = `Failed to fetch homepage: ${homepageError}`;
    return result;
  }

  // Copy so later pushes into foundLinks do not mutate homepageLinks.
  result.foundLinks = [...homepageLinks];

  // Step 2: Try to extract reactEnv.dispensaryId (embedded Dutchie menu) from homepage HTML.
  // Best-effort: any failure here is logged and the crawl continues.
  try {
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), 10000);
    try {
      const resp = await fetch(homepage, {
        signal: controller.signal,
        headers: requestHeaders,
        redirect: 'follow',
      });
      if (resp.ok) {
        const html = await resp.text();
        const reactEnvMatch = /window\.reactEnv\s*=\s*\{[^}]*"dispensaryId"\s*:\s*"([a-fA-F0-9]+)"/i.exec(html);
        if (reactEnvMatch && reactEnvMatch[1]) {
          result.provider = 'dutchie';
          result.menuUrl = homepage;
          result.platformDispensaryId = reactEnvMatch[1];
          console.log(`[WebsiteCrawl] Found reactEnv.dispensaryId=${reactEnvMatch[1]} on homepage ${homepage}`);
          return result;
        }
      }
    } finally {
      clearTimeout(timeoutId);
    }
  } catch (err: unknown) {
    const message = err instanceof Error ? err.message : String(err);
    console.log(`[WebsiteCrawl] reactEnv check failed for ${homepage}: ${message}`);
  }

  // Step 3: Check for direct provider matches in homepage links
  for (const link of homepageLinks) {
    const provider = matchProvider(link);
    if (provider) {
      console.log(`[WebsiteCrawl] Found ${provider} link on homepage: ${link}`);
      result.menuUrl = link;
      result.provider = provider;
      return result;
    }
  }

  // Step 4: Find same-site menu/order/shop links to follow.
  // No homepage link matches a provider pattern at this point (step 3 would
  // have returned), so only the same-domain + menu-path rule can apply.
  const menuLinks = homepageLinks.filter(link => {
    try {
      const linkUrl = new URL(link);
      const isSameDomain = linkUrl.hostname === baseUrl.hostname ||
        linkUrl.hostname.endsWith(`.${baseUrl.hostname}`);
      return isSameDomain && MENU_LINK_PATTERNS.some(p => p.test(link));
    } catch {
      return false;
    }
  });

  console.log(`[WebsiteCrawl] Found ${menuLinks.length} potential menu links to follow`);

  // Step 5: Follow menu links (limit to 3 to avoid excessive crawling)
  for (const menuLink of menuLinks.slice(0, 3)) {
    // Skip if we've already crawled this page
    if (result.crawledPages.includes(menuLink)) continue;

    result.crawledPages.push(menuLink);

    // Rate limit
    await new Promise(r => setTimeout(r, 500));

    const { links: pageLinks, error: pageError } = await fetchPageLinks(menuLink);
    if (pageError) {
      console.log(`[WebsiteCrawl] Failed to fetch ${menuLink}: ${pageError}`);
      continue;
    }

    result.foundLinks.push(...pageLinks);

    // Check for provider matches on this page
    for (const link of pageLinks) {
      const provider = matchProvider(link);
      if (provider) {
        console.log(`[WebsiteCrawl] Found ${provider} link on ${menuLink}: ${link}`);
        result.menuUrl = link;
        result.provider = provider;
        return result;
      }
    }
  }

  console.log(`[WebsiteCrawl] No menu provider found on ${websiteUrl}`);
  return result;
}
|
|
|
|
// ============================================================
// CORE DETECTION FUNCTIONS
// ============================================================
|
|
|
|
/**
 * Detect menu provider from a URL
 *
 * @param menuUrl Menu URL to classify; null, undefined and '' yield 'unknown'.
 * @returns The first provider in PROVIDER_URL_PATTERNS with a matching pattern;
 *          otherwise 'custom' when the URL parses and has a hostname that does
 *          not contain "localhost"; otherwise 'unknown'.
 */
export function detectProviderFromUrl(menuUrl: string | null | undefined): MenuProvider {
  if (!menuUrl) return 'unknown';

  const known = PROVIDER_URL_PATTERNS.find(({ patterns }) =>
    patterns.some(pattern => pattern.test(menuUrl))
  );
  if (known) {
    return known.provider;
  }

  // Check if it's a custom website (has a domain but doesn't match known providers)
  try {
    const { hostname } = new URL(menuUrl);
    if (hostname && !hostname.includes('localhost')) {
      return 'custom';
    }
  } catch {
    // Invalid URL
  }

  return 'unknown';
}
|
|
|
|
/**
 * Persist a Dutchie platform ID that the website crawl captured directly
 * (e.g. from window.reactEnv on the homepage) and record it in
 * provider_detection_data.
 */
async function persistCrawledPlatformId(dispensaryId: number, platformDispensaryId: string): Promise<void> {
  await query(
    `
    UPDATE dispensaries SET
      platform_dispensary_id = $1,
      provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
        jsonb_build_object(
          'platform_id_resolved', true,
          'platform_id_resolved_at', NOW(),
          'detected_provider', 'dutchie'::text,
          'detection_method', 'website_crawl'::text
        ),
      updated_at = NOW()
    WHERE id = $2
    `,
    [platformDispensaryId, dispensaryId]
  );
}

/**
 * Detect provider and resolve platform ID for a single dispensary
 *
 * Flow:
 * 1. Load the dispensary row; report failure when it does not exist.
 * 2. If menu_url is empty, crawl the website to discover one. With no website,
 *    or when the crawl finds nothing, the row is marked 'unknown' and
 *    not_crawlable and the function returns.
 * 3. Detect the provider from the menu URL.
 * 4. Non-dutchie providers: store menu_type and return.
 * 5. Dutchie: take the platform ID straight from the URL when present,
 *    otherwise extract the cName and resolve it via resolveDispensaryId.
 *
 * Every outcome is written to the dispensaries row (menu_type,
 * platform_dispensary_id, provider_detection_data).
 *
 * @param dispensaryId Primary key of the dispensary to process.
 * @returns The detection outcome. Errors from resolveDispensaryId are caught
 *          and reported via `error`; errors from `query` are not caught here.
 */
export async function detectAndResolveDispensary(dispensaryId: number): Promise<DetectionResult> {
  console.log(`[MenuDetection] Processing dispensary ${dispensaryId}...`);

  // Get dispensary record
  const { rows } = await query<any>(
    `SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE id = $1`,
    [dispensaryId]
  );

  if (rows.length === 0) {
    return {
      dispensaryId,
      dispensaryName: 'Unknown',
      previousMenuType: null,
      detectedProvider: 'unknown',
      cName: null,
      platformDispensaryId: null,
      success: false,
      error: 'Dispensary not found',
    };
  }

  const dispensary = mapDbRowToDispensary(rows[0]);
  let menuUrl = dispensary.menuUrl;
  const previousMenuType = dispensary.menuType || null;
  const website = dispensary.website;

  // If menu_url is null or empty, try to discover it by crawling the dispensary website
  if (!menuUrl || menuUrl.trim() === '') {
    console.log(`[MenuDetection] ${dispensary.name}: No menu_url - attempting website crawl`);

    // Check if website is available
    if (!website || website.trim() === '') {
      console.log(`[MenuDetection] ${dispensary.name}: No website available - marking as not crawlable`);

      await query(
        `
        UPDATE dispensaries SET
          menu_type = 'unknown',
          provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
            jsonb_build_object(
              'detected_provider', 'unknown'::text,
              'detection_method', 'no_data'::text,
              'detected_at', NOW(),
              'resolution_error', 'No menu_url and no website available'::text,
              'not_crawlable', true,
              'website_crawl_attempted', false
            ),
          updated_at = NOW()
        WHERE id = $1
        `,
        [dispensaryId]
      );

      // Reported as success: the row was processed and marked, nothing more can be done.
      return {
        dispensaryId,
        dispensaryName: dispensary.name,
        previousMenuType,
        detectedProvider: 'unknown',
        cName: null,
        platformDispensaryId: null,
        success: true,
        error: 'No menu_url and no website available - marked as not crawlable',
      };
    }

    // Crawl the website to find menu provider links
    console.log(`[MenuDetection] ${dispensary.name}: Crawling website ${website} for menu links...`);
    const crawlResult = await crawlWebsiteForMenuLinks(website);

    if (crawlResult.menuUrl && crawlResult.provider !== 'unknown') {
      // SUCCESS: Found a menu URL from website crawl!
      console.log(`[MenuDetection] ${dispensary.name}: Found ${crawlResult.provider} menu at ${crawlResult.menuUrl}`);
      menuUrl = crawlResult.menuUrl;

      // Update the dispensary with the discovered menu_url
      await query(
        `
        UPDATE dispensaries SET
          menu_url = $1,
          menu_type = $2,
          provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
            jsonb_build_object(
              'detected_provider', $2::text,
              'detection_method', 'website_crawl'::text,
              'detected_at', NOW(),
              'website_crawled', $3::text,
              'website_crawl_pages', $4::jsonb,
              'not_crawlable', false
            ),
          updated_at = NOW()
        WHERE id = $5
        `,
        [
          crawlResult.menuUrl,
          crawlResult.provider,
          website,
          JSON.stringify(crawlResult.crawledPages),
          dispensaryId
        ]
      );

      // The crawl may already have captured the Dutchie platform ID (embedded
      // menu found via reactEnv). In that case menuUrl is the site's own
      // homepage, which URL-pattern detection below would classify as 'custom'
      // and overwrite the menu_type just stored - so save the ID and stop here.
      if (crawlResult.provider === 'dutchie' && crawlResult.platformDispensaryId) {
        await persistCrawledPlatformId(dispensaryId, crawlResult.platformDispensaryId);
        console.log(`[MenuDetection] ${dispensary.name}: Resolved platform ID from reactEnv = ${crawlResult.platformDispensaryId}`);
        return {
          dispensaryId,
          dispensaryName: dispensary.name,
          previousMenuType,
          detectedProvider: 'dutchie',
          cName: null,
          platformDispensaryId: crawlResult.platformDispensaryId,
          success: true,
        };
      }

      // Continue with full detection flow using the discovered menu_url
    } else {
      // Website crawl failed to find a menu provider
      const errorReason = crawlResult.error || 'No menu provider links found on website';
      console.log(`[MenuDetection] ${dispensary.name}: Website crawl failed - ${errorReason}`);

      await query(
        `
        UPDATE dispensaries SET
          menu_type = 'unknown',
          provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
            jsonb_build_object(
              'detected_provider', 'unknown'::text,
              'detection_method', 'website_crawl'::text,
              'detected_at', NOW(),
              'website_crawled', $1::text,
              'website_crawl_pages', $2::jsonb,
              'resolution_error', $3::text,
              'not_crawlable', true
            ),
          updated_at = NOW()
        WHERE id = $4
        `,
        [
          website,
          JSON.stringify(crawlResult.crawledPages),
          errorReason,
          dispensaryId
        ]
      );

      return {
        dispensaryId,
        dispensaryName: dispensary.name,
        previousMenuType,
        detectedProvider: 'unknown',
        cName: null,
        platformDispensaryId: null,
        success: true,
        error: `Website crawl failed: ${errorReason}`,
      };
    }
  }

  // Detect provider from URL
  const detectedProvider = detectProviderFromUrl(menuUrl);
  console.log(`[MenuDetection] ${dispensary.name}: Detected provider = ${detectedProvider} from URL: ${menuUrl}`);

  // Initialize result
  const result: DetectionResult = {
    dispensaryId,
    dispensaryName: dispensary.name,
    previousMenuType,
    detectedProvider,
    cName: null,
    platformDispensaryId: null,
    success: false,
  };

  // For domains like curaleaf/sol, crawl the website to find the actual menu provider (often Dutchie)
  // NOTE(review): detectProviderFromUrl never returns 'curaleaf' or 'sol', so this
  // branch is currently unreachable. Kept as-is pending a decision on whether such
  // sites should be re-crawled (e.g. keyed off the menu URL's domain) or the branch removed.
  const SPECIAL_DOMAINS = ['curaleaf', 'sol'] as const;
  const isSpecialDomain = (SPECIAL_DOMAINS as readonly string[]).includes(detectedProvider);

  if (isSpecialDomain && website && website.trim() !== '') {
    console.log(`[MenuDetection] ${dispensary.name}: Detected ${detectedProvider} domain - crawling website to find actual menu provider (often Dutchie)...`);
    const crawlResult = await crawlWebsiteForMenuLinks(website);

    if (crawlResult.menuUrl && crawlResult.provider !== 'unknown') {
      // Found an actual menu provider (likely Dutchie) - use that instead!
      console.log(`[MenuDetection] ${dispensary.name}: Website crawl found ${crawlResult.provider} menu at ${crawlResult.menuUrl}`);
      menuUrl = crawlResult.menuUrl;

      // Re-detect provider from the found URL
      const actualProvider = detectProviderFromUrl(menuUrl);

      // Update with the actual discovered provider
      await query(
        `
        UPDATE dispensaries SET
          menu_url = $1,
          menu_type = $2,
          provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
            jsonb_build_object(
              'detected_provider', $2::text,
              'detection_method', 'website_crawl'::text,
              'detected_at', NOW(),
              'original_url_provider', $3::text,
              'website_crawled', $4::text,
              'website_crawl_pages', $5::jsonb,
              'not_crawlable', false
            ),
          updated_at = NOW()
        WHERE id = $6
        `,
        [
          crawlResult.menuUrl,
          actualProvider,
          detectedProvider,
          website,
          JSON.stringify(crawlResult.crawledPages),
          dispensaryId
        ]
      );

      // If the actual provider is dutchie, continue to platform ID resolution
      if (actualProvider === 'dutchie') {
        result.detectedProvider = 'dutchie';
        // If platformDispensaryId was captured (e.g., reactEnv on homepage), save it now and return
        if (crawlResult.platformDispensaryId) {
          await persistCrawledPlatformId(dispensaryId, crawlResult.platformDispensaryId);
          result.platformDispensaryId = crawlResult.platformDispensaryId;
          result.success = true;
          console.log(`[MenuDetection] ${dispensary.name}: Resolved platform ID from reactEnv = ${crawlResult.platformDispensaryId}`);
          return result;
        }
        // Fall through to dutchie platform ID resolution below if no platform ID captured
      } else {
        // Found a different provider (treez, jane, etc.) - we're done
        result.detectedProvider = actualProvider;
        result.success = true;
        console.log(`[MenuDetection] ${dispensary.name}: Updated menu_type to ${actualProvider} (discovered from website crawl)`);
        return result;
      }
    } else {
      // Website crawl didn't find any menu provider - mark unknown with reason
      const notCrawlableReason = `No embedded menu provider found`;
      console.log(`[MenuDetection] ${dispensary.name}: Website crawl found no menu provider - marking as unknown`);

      await query(
        `
        UPDATE dispensaries SET
          menu_type = 'unknown',
          platform_dispensary_id = NULL,
          provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
            jsonb_build_object(
              'detected_provider', 'unknown'::text,
              'detection_method', 'url_pattern_with_crawl'::text,
              'detected_at', NOW(),
              'website_crawled', $1::text,
              'website_crawl_pages', $2::jsonb,
              'not_crawlable', true,
              'not_crawlable_reason', $3::text
            ),
          updated_at = NOW()
        WHERE id = $4
        `,
        [
          website,
          JSON.stringify(crawlResult.crawledPages),
          notCrawlableReason,
          dispensaryId
        ]
      );
      result.success = true;
      return result;
    }
  }

  // If not dutchie, just update menu_type (non-dutchie providers)
  if (detectedProvider !== 'dutchie') {
    await query(
      `
      UPDATE dispensaries SET
        menu_type = $1,
        provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
          jsonb_build_object(
            'detected_provider', $1::text,
            'detection_method', 'url_pattern'::text,
            'detected_at', NOW(),
            'not_crawlable', false
          ),
        updated_at = NOW()
      WHERE id = $2
      `,
      [detectedProvider, dispensaryId]
    );
    result.success = true;
    console.log(`[MenuDetection] ${dispensary.name}: Updated menu_type to ${detectedProvider}`);
    return result;
  }

  // For dutchie: extract cName or platformId from menu_url
  const extraction = extractFromMenuUrl(menuUrl);

  if (!extraction) {
    result.error = `Could not extract cName or platformId from menu_url: ${menuUrl}`;
    await query(
      `
      UPDATE dispensaries SET
        menu_type = 'dutchie',
        provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
          jsonb_build_object(
            'detected_provider', 'dutchie'::text,
            'detection_method', 'url_pattern'::text,
            'detected_at', NOW(),
            'resolution_error', $1::text,
            'not_crawlable', true
          ),
        updated_at = NOW()
      WHERE id = $2
      `,
      [result.error, dispensaryId]
    );
    console.log(`[MenuDetection] ${dispensary.name}: ${result.error}`);
    return result;
  }

  // If URL contains platform_dispensary_id directly (e.g., /api/v2/embedded-menu/<id>.js), skip GraphQL resolution
  if (extraction.type === 'platformId') {
    const platformId = extraction.value;
    result.platformDispensaryId = platformId;
    result.success = true;

    await query(
      `
      UPDATE dispensaries SET
        menu_type = 'dutchie',
        platform_dispensary_id = $1,
        provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
          jsonb_build_object(
            'detected_provider', 'dutchie'::text,
            'detection_method', 'url_direct_platform_id'::text,
            'detected_at', NOW(),
            'platform_id_source', 'url_embedded'::text,
            'platform_id_resolved', true,
            'platform_id_resolved_at', NOW(),
            'resolution_error', NULL::text,
            'not_crawlable', false
          ),
        updated_at = NOW()
      WHERE id = $2
      `,
      [platformId, dispensaryId]
    );
    console.log(`[MenuDetection] ${dispensary.name}: Platform ID extracted directly from URL = ${platformId}`);
    return result;
  }

  // Otherwise, we have a cName that needs GraphQL resolution
  const cName = extraction.value;
  result.cName = cName;

  // Resolve platform_dispensary_id from cName
  console.log(`[MenuDetection] ${dispensary.name}: Resolving platform ID for cName = ${cName}`);

  try {
    const platformId = await resolveDispensaryId(cName);

    if (platformId) {
      result.platformDispensaryId = platformId;
      result.success = true;

      await query(
        `
        UPDATE dispensaries SET
          menu_type = 'dutchie',
          platform_dispensary_id = $1,
          provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
            jsonb_build_object(
              'detected_provider', 'dutchie'::text,
              'detection_method', 'url_pattern'::text,
              'detected_at', NOW(),
              'cname_extracted', $2::text,
              'platform_id_resolved', true,
              'platform_id_resolved_at', NOW(),
              'resolution_error', NULL::text,
              'not_crawlable', false
            ),
          updated_at = NOW()
        WHERE id = $3
        `,
        [platformId, cName, dispensaryId]
      );
      console.log(`[MenuDetection] ${dispensary.name}: Resolved platform ID = ${platformId}`);
    } else {
      result.error = `cName "${cName}" could not be resolved - may not exist on Dutchie`;
      await query(
        `
        UPDATE dispensaries SET
          menu_type = 'dutchie',
          platform_dispensary_id = NULL,
          provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
            jsonb_build_object(
              'detected_provider', 'dutchie'::text,
              'detection_method', 'url_pattern'::text,
              'detected_at', NOW(),
              'cname_extracted', $1::text,
              'platform_id_resolved', false,
              'resolution_error', $2::text,
              'not_crawlable', true
            ),
          updated_at = NOW()
        WHERE id = $3
        `,
        [cName, result.error, dispensaryId]
      );
      console.log(`[MenuDetection] ${dispensary.name}: ${result.error}`);
    }
  } catch (error: unknown) {
    // Reached when resolution (or one of the updates above) throws; record the
    // failure on the row. Unlike the "not resolved" case, the stored platform ID
    // is left untouched here.
    const message = error instanceof Error ? error.message : String(error);
    result.error = `Resolution failed: ${message}`;
    await query(
      `
      UPDATE dispensaries SET
        menu_type = 'dutchie',
        provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
          jsonb_build_object(
            'detected_provider', 'dutchie'::text,
            'detection_method', 'url_pattern'::text,
            'detected_at', NOW(),
            'cname_extracted', $1::text,
            'platform_id_resolved', false,
            'resolution_error', $2::text,
            'not_crawlable', true
          ),
        updated_at = NOW()
      WHERE id = $3
      `,
      [cName, result.error, dispensaryId]
    );
    console.error(`[MenuDetection] ${dispensary.name}: ${result.error}`);
  }

  return result;
}
|
|
|
|
/**
 * Run bulk detection on all dispensaries with unknown/missing menu_type or platform_dispensary_id
 * Also includes dispensaries with no menu_url but with a website (for website crawl discovery)
 *
 * Dispensaries are processed one at a time, ordered by name, with a 1s pause
 * after each successfully completed detection call.
 *
 * @param options.state                           Restrict to one state (no restriction when omitted).
 * @param options.onlyUnknown                     Only rows whose menu_type is NULL, '' or 'unknown' (default true).
 * @param options.onlyMissingPlatformId           Only dutchie rows without a platform ID (default false).
 * @param options.includeWebsiteCrawl             Also select rows with a website but no menu_url,
 *                                                unless already flagged not_crawlable (default true).
 * @param options.includeDutchieMissingPlatformId Also select dutchie rows without a platform ID (default true).
 * @param options.limit                           Maximum number of rows; a falsy value means no limit.
 * @returns Totals, per-dispensary results and collected error messages.
 *          A failure of the selecting query itself propagates to the caller.
 */
export async function runBulkDetection(options: {
  state?: string;
  onlyUnknown?: boolean;
  onlyMissingPlatformId?: boolean;
  includeWebsiteCrawl?: boolean; // Include dispensaries with website but no menu_url
  includeDutchieMissingPlatformId?: boolean; // include menu_type='dutchie' with null platform_id
  limit?: number;
} = {}): Promise<BulkDetectionResult> {
  const {
    state,
    onlyUnknown = true,
    onlyMissingPlatformId = false,
    includeWebsiteCrawl = true,
    includeDutchieMissingPlatformId = true,
    limit,
  } = options;

  console.log('[MenuDetection] Starting bulk detection...');

  // Build query to find dispensaries needing detection
  // Includes: dispensaries with menu_url OR (no menu_url but has website and not already marked not_crawlable)
  // Optionally includes dutchie stores missing platform ID
  //
  // `IS NOT TRUE` (rather than `NOT ...`) keeps rows whose detection data exists
  // but has no not_crawlable key: NOT NULL is NULL in SQL, which would drop them.
  let whereClause = `WHERE (
    menu_url IS NOT NULL
    ${includeWebsiteCrawl ? `OR (
      menu_url IS NULL
      AND website IS NOT NULL
      AND website != ''
      AND (provider_detection_data->>'not_crawlable')::boolean IS NOT TRUE
    )` : ''}
    ${includeDutchieMissingPlatformId ? `OR (
      menu_type = 'dutchie' AND platform_dispensary_id IS NULL
    )` : ''}
  )`;
  const params: any[] = [];
  let paramIndex = 1;

  if (state) {
    whereClause += ` AND state = $${paramIndex++}`;
    params.push(state);
  }

  // Handle filters for unknown and/or missing platform IDs
  if (onlyUnknown && onlyMissingPlatformId) {
    whereClause += ` AND (
      (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown')
      OR (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)
    )`;
  } else if (onlyUnknown) {
    whereClause += ` AND (
      (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown')
      ${includeDutchieMissingPlatformId ? `OR (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)` : ''}
    )`;
  } else if (onlyMissingPlatformId) {
    whereClause += ` AND (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)`;
  } else if (includeDutchieMissingPlatformId) {
    // Always attempt to resolve dutchie stores missing platform IDs
    // NOTE(review): as an AND filter this restricts the run to ONLY those
    // stores when both "only*" flags are false - confirm that is intended.
    whereClause += ` AND (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)`;
  }

  let query_str = `
    SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
    ${whereClause}
    ORDER BY name
  `;

  if (limit) {
    query_str += ` LIMIT $${paramIndex}`;
    params.push(limit);
  }

  const { rows: dispensaries } = await query<any>(query_str, params);
  console.log(`[MenuDetection] Found ${dispensaries.length} dispensaries to process (includeWebsiteCrawl=${includeWebsiteCrawl})`);

  const result: BulkDetectionResult = {
    totalProcessed: 0,
    totalSucceeded: 0,
    totalFailed: 0,
    totalSkipped: 0,
    results: [],
    errors: [],
  };

  for (const row of dispensaries) {
    result.totalProcessed++;

    try {
      const detectionResult = await detectAndResolveDispensary(row.id);
      result.results.push(detectionResult);

      if (detectionResult.success) {
        result.totalSucceeded++;
      } else {
        result.totalFailed++;
        if (detectionResult.error) {
          result.errors.push(`${detectionResult.dispensaryName}: ${detectionResult.error}`);
        }
      }

      // Rate limit between requests
      await new Promise(r => setTimeout(r, 1000));
    } catch (error: unknown) {
      // One failing dispensary must not abort the whole run.
      const message = error instanceof Error ? error.message : String(error);
      result.totalFailed++;
      result.errors.push(`${row.name || row.id}: ${message}`);
    }
  }

  console.log(`[MenuDetection] Bulk detection complete: ${result.totalSucceeded} succeeded, ${result.totalFailed} failed`);
  return result;
}
|
|
|
|
// ============================================================
// SCHEDULED JOB EXECUTOR
// ============================================================
|
|
|
|
/**
 * Execute the menu detection job (called by scheduler)
 *
 * @param config Job configuration. Recognised keys:
 *   - state: state to process; defaults to 'AZ' when missing or empty.
 *   - onlyUnknown, onlyMissingPlatformId, includeDutchieMissingPlatformId:
 *     each is treated as true unless explicitly set to `false`.
 * @returns Job summary. `status` is 'success' when nothing failed, 'error'
 *          when nothing succeeded, otherwise 'partial'. `errorMessage` joins
 *          at most the first five error strings. Never throws: an error from
 *          the bulk run is reported as status 'error' with zeroed counters.
 */
export async function executeMenuDetectionJob(config: Record<string, any> = {}): Promise<{
  status: JobStatus;
  itemsProcessed: number;
  itemsSucceeded: number;
  itemsFailed: number;
  errorMessage?: string;
  metadata?: any;
}> {
  const state = config.state || 'AZ';
  const onlyUnknown = config.onlyUnknown !== false;
  // Default to true - always try to resolve platform IDs for dutchie stores
  const onlyMissingPlatformId = config.onlyMissingPlatformId !== false;
  const includeDutchieMissingPlatformId = config.includeDutchieMissingPlatformId !== false;

  console.log(`[MenuDetection] Executing scheduled job for state=${state}...`);

  try {
    const result = await runBulkDetection({
      state,
      onlyUnknown,
      onlyMissingPlatformId,
      includeDutchieMissingPlatformId,
    });

    let status: JobStatus;
    if (result.totalFailed === 0) {
      status = 'success';
    } else if (result.totalSucceeded === 0) {
      status = 'error';
    } else {
      status = 'partial';
    }

    return {
      status,
      itemsProcessed: result.totalProcessed,
      itemsSucceeded: result.totalSucceeded,
      itemsFailed: result.totalFailed,
      errorMessage: result.errors.length > 0 ? result.errors.slice(0, 5).join('; ') : undefined,
      metadata: {
        state,
        onlyUnknown,
        onlyMissingPlatformId,
        includeDutchieMissingPlatformId,
        providerCounts: countByProvider(result.results),
      },
    };
  } catch (error: unknown) {
    return {
      status: 'error',
      itemsProcessed: 0,
      itemsSucceeded: 0,
      itemsFailed: 0,
      errorMessage: error instanceof Error ? error.message : String(error),
    };
  }
}
|
|
|
|
/**
 * Count results by detected provider
 *
 * @param results Detection results to tally.
 * @returns Map of provider name to number of results with that
 *          `detectedProvider`; providers with no results are absent.
 */
function countByProvider(results: DetectionResult[]): Record<string, number> {
  const counts: Record<string, number> = {};
  for (const { detectedProvider } of results) {
    counts[detectedProvider] = (counts[detectedProvider] ?? 0) + 1;
  }
  return counts;
}
|
|
|
|
// ============================================================
// UTILITY FUNCTIONS
// ============================================================
|
|
|
|
/**
 * Get detection stats for dashboard
 *
 * @param state State whose dispensaries are counted (defaults to 'AZ').
 * @returns Counts for that state:
 *   - totalDispensaries: all rows
 *   - withMenuType: rows whose menu_type is set and not 'unknown'
 *   - withPlatformId: rows with a platform_dispensary_id
 *   - needsDetection: rows with a menu_url but no usable menu_type
 *   - byProvider: row count per non-empty menu_type (includes 'unknown')
 */
export async function getDetectionStats(state: string = 'AZ'): Promise<{
  totalDispensaries: number;
  withMenuType: number;
  withPlatformId: number;
  needsDetection: number;
  byProvider: Record<string, number>;
}> {
  const { rows } = await query<any>(
    `
    SELECT
      COUNT(*) as total,
      COUNT(*) FILTER (WHERE menu_type IS NOT NULL AND menu_type != '' AND menu_type != 'unknown') as with_menu_type,
      COUNT(*) FILTER (WHERE platform_dispensary_id IS NOT NULL) as with_platform_id,
      COUNT(*) FILTER (WHERE menu_url IS NOT NULL AND (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown')) as needs_detection
    FROM dispensaries
    WHERE state = $1
    `,
    [state]
  );

  const stats = rows[0] || {};

  // Get provider breakdown
  const { rows: providerRows } = await query<any>(
    `
    SELECT menu_type, COUNT(*) as count
    FROM dispensaries
    WHERE state = $1 AND menu_type IS NOT NULL AND menu_type != ''
    GROUP BY menu_type
    ORDER BY count DESC
    `,
    [state]
  );

  const byProvider: Record<string, number> = {};
  for (const row of providerRows) {
    byProvider[row.menu_type] = parseInt(row.count, 10);
  }

  // Counts are run through parseInt; a missing value falls back to 0.
  return {
    totalDispensaries: parseInt(stats.total || '0', 10),
    withMenuType: parseInt(stats.with_menu_type || '0', 10),
    withPlatformId: parseInt(stats.with_platform_id || '0', 10),
    needsDetection: parseInt(stats.needs_detection || '0', 10),
    byProvider,
  };
}
|
|
|
|
/**
 * Get dispensaries needing detection
 * Includes dispensaries with website but no menu_url for website crawl discovery
 *
 * A dispensary needs detection when it has a menu_url and either no usable
 * menu_type or is a dutchie store without a platform ID. With
 * `includeWebsiteCrawl`, rows with a website but no menu_url are also returned
 * unless they are already flagged not_crawlable.
 *
 * @param options.state               State to search (default 'AZ').
 * @param options.limit               Maximum rows returned (default 100).
 * @param options.includeWebsiteCrawl Include website-only rows (default true).
 * @returns Matching dispensaries ordered by name.
 */
export async function getDispensariesNeedingDetection(options: {
  state?: string;
  limit?: number;
  includeWebsiteCrawl?: boolean;
} = {}): Promise<Dispensary[]> {
  const { state = 'AZ', limit = 100, includeWebsiteCrawl = true } = options;

  // `IS NOT TRUE` (rather than `NOT ...`) keeps rows whose detection data exists
  // but has no not_crawlable key: NOT NULL is NULL in SQL, which would drop them.
  const { rows } = await query<any>(
    `
    SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
    WHERE state = $1
      AND (
        (menu_url IS NOT NULL AND (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown'
          OR (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)))
        ${includeWebsiteCrawl ? `OR (
          menu_url IS NULL
          AND website IS NOT NULL
          AND website != ''
          AND (provider_detection_data->>'not_crawlable')::boolean IS NOT TRUE
        )` : ''}
      )
    ORDER BY name
    LIMIT $2
    `,
    [state, limit]
  );

  return rows.map(mapDbRowToDispensary);
}
|