fix(monitor): remove non-existent worker columns from job_run_logs query
The job_run_logs table tracks scheduled job orchestration, not individual worker jobs. Worker info (worker_id, worker_hostname) belongs on dispensary_crawl_jobs, not job_run_logs. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
837
backend/dist/dutchie-az/services/menu-detection.js
vendored
Normal file
837
backend/dist/dutchie-az/services/menu-detection.js
vendored
Normal file
@@ -0,0 +1,837 @@
|
||||
"use strict";
|
||||
/**
|
||||
* Menu Detection Service
|
||||
*
|
||||
* Detects menu provider (dutchie, treez, jane, etc.) from dispensary menu_url
|
||||
* and resolves platform_dispensary_id for dutchie stores.
|
||||
*
|
||||
* This service:
|
||||
* 1. Iterates dispensaries with unknown/missing menu_type or platform_dispensary_id
|
||||
* 2. Detects provider from menu_url patterns
|
||||
* 3. For dutchie: extracts cName and resolves platform_dispensary_id via GraphQL
|
||||
* 4. Logs results to job_run_logs
|
||||
*/
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.crawlWebsiteForMenuLinks = crawlWebsiteForMenuLinks;
|
||||
exports.detectProviderFromUrl = detectProviderFromUrl;
|
||||
exports.detectAndResolveDispensary = detectAndResolveDispensary;
|
||||
exports.runBulkDetection = runBulkDetection;
|
||||
exports.executeMenuDetectionJob = executeMenuDetectionJob;
|
||||
exports.getDetectionStats = getDetectionStats;
|
||||
exports.getDispensariesNeedingDetection = getDispensariesNeedingDetection;
|
||||
const connection_1 = require("../db/connection");
|
||||
const discovery_1 = require("./discovery");
|
||||
const graphql_client_1 = require("./graphql-client");
|
||||
// Columns selected from the dispensaries table. Listing them explicitly
// (instead of SELECT *) keeps queries stable across schema differences.
// One array entry per line of the SQL fragment; the leading and trailing
// empty entries reproduce the surrounding newlines.
const DISPENSARY_COLUMNS = [
    '',
    'id, name, slug, city, state, zip, address, latitude, longitude,',
    'menu_type, menu_url, platform_dispensary_id, website,',
    'provider_detection_data, created_at, updated_at',
    '',
].join('\n');
|
||||
// ============================================================
|
||||
// PROVIDER DETECTION PATTERNS
|
||||
// ============================================================
|
||||
// Ordered table of [provider, url-patterns]. Consumers walk the table from the
// top and take the first provider with a matching pattern, so ORDER MATTERS.
const PROVIDER_URL_PATTERNS = [
    // IMPORTANT: Curaleaf and Sol must come BEFORE dutchie to take precedence.
    // These stores have their own proprietary menu systems (not crawlable via Dutchie).
    ['curaleaf', [
        /curaleaf\.com\/stores\//i, // e.g., https://curaleaf.com/stores/curaleaf-az-glendale-east
        /curaleaf\.com\/dispensary\//i, // e.g., https://curaleaf.com/dispensary/arizona
    ]],
    ['sol', [
        /livewithsol\.com/i, // e.g., https://www.livewithsol.com/locations/sun-city/
        /solflower\.com/i, // alternate domain if any
    ]],
    ['dutchie', [
        /dutchie\.com/i,
        /\/embedded-menu\//i,
        /\/dispensary\/[A-Z]{2}-/i, // e.g., /dispensary/AZ-store-name
        /dutchie-plus/i,
    ]],
    ['treez', [
        /treez\.io/i,
        /shop\.treez/i,
        /treez-ecommerce/i,
    ]],
    ['jane', [
        /jane\.co/i,
        /iheartjane\.com/i,
        /embed\.iheartjane/i,
    ]],
    ['weedmaps', [
        /weedmaps\.com/i,
        /menu\.weedmaps/i,
    ]],
    ['leafly', [
        /leafly\.com/i,
        /order\.leafly/i,
    ]],
    ['meadow', [
        /getmeadow\.com/i,
        /meadow\.co/i,
    ]],
    ['blaze', [
        /blaze\.me/i,
        /blazepos\.com/i,
    ]],
    ['flowhub', [
        /flowhub\.com/i,
        /flowhub\.co/i,
    ]],
    ['dispense', [
        /dispense\.io/i,
        /dispenseapp\.com/i,
    ]],
].map(([provider, patterns]) => ({ provider, patterns }));
|
||||
/**
 * Patterns for links that look like they lead to a menu or ordering page:
 * generic path segments first, then the domains of known menu providers.
 */
const MENU_LINK_PATTERNS = [
    // Generic "/<segment>" paths (case-insensitive, matched anywhere in the URL)
    ...['menu', 'order', 'shop', 'products', 'dispensary', 'store']
        .map((segment) => new RegExp(`\\/${segment}`, 'i')),
    // Known provider domains
    /curaleaf\.com/i,
    /dutchie\.com/i,
    /treez\.io/i,
    /jane\.co/i,
    /iheartjane\.com/i,
    /weedmaps\.com/i,
    /leafly\.com/i,
    /getmeadow\.com/i,
    /blaze\.me/i,
    /flowhub\.com/i,
    /dispense\.io/i,
];
|
||||
/**
 * Tell whether a URL points at a Curaleaf store page, i.e. contains
 * "curaleaf.com/stores/" or "curaleaf.com/dispensary/" (case-insensitive).
 * Missing or empty input is never a Curaleaf URL.
 */
function isCuraleafUrl(url) {
    return Boolean(url) && /curaleaf\.com\/(stores|dispensary)\//i.test(url);
}
|
||||
/**
 * Return the given URL when it is already a Curaleaf /stores/ or /dispensary/
 * URL; otherwise (including missing input) return null.
 */
function extractCuraleafStoreUrl(url) {
    return isCuraleafUrl(url) ? url : null;
}
|
||||
/**
 * Fetch a page and collect the URLs it references.
 *
 * Every `href="..."` value is resolved against `url` and returned. `src="..."`
 * values (iframes, but also any other tag carrying a src attribute) are only
 * returned when they match one of PROVIDER_URL_PATTERNS, which is how embedded
 * provider menus are picked up.
 *
 * This function never throws: failures are reported through `error`.
 *
 * @param {string} url - Absolute URL of the page to fetch.
 * @param {number} [timeout=10000] - Milliseconds allowed for the whole request,
 *   including reading the response body.
 * @returns {Promise<{links: string[], error?: string}>} De-duplicated absolute
 *   links in discovery order. On failure `links` is empty and `error` is
 *   `HTTP <status>`, `Timeout`, or the underlying error message.
 */
async function fetchPageLinks(url, timeout = 10000) {
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), timeout);
    try {
        const response = await fetch(url, {
            signal: controller.signal,
            headers: {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            },
            redirect: 'follow',
        });
        if (!response.ok) {
            return { links: [], error: `HTTP ${response.status}` };
        }
        // The abort timer is still armed here on purpose: a server that sends
        // headers and then stalls the body must not hang the crawl.
        const html = await response.text();
        // A Set de-duplicates while preserving first-seen order.
        const links = new Set();
        for (const [, href] of html.matchAll(/href=["']([^"']+)["']/gi)) {
            try {
                // Convert relative URLs to absolute
                links.add(new URL(href, url).href);
            }
            catch {
                // Skip values that cannot be resolved to a URL
            }
        }
        // Also look at src attributes (common for embedded menus), keeping only
        // those that point at a known provider.
        for (const [, src] of html.matchAll(/src=["']([^"']+)["']/gi)) {
            try {
                const absoluteUrl = new URL(src, url).href;
                const isProviderUrl = PROVIDER_URL_PATTERNS.some(({ patterns }) => patterns.some((p) => p.test(absoluteUrl)));
                if (isProviderUrl) {
                    links.add(absoluteUrl);
                }
            }
            catch {
                // Skip values that cannot be resolved to a URL
            }
        }
        return { links: [...links] };
    }
    catch (error) {
        if (error.name === 'AbortError') {
            return { links: [], error: 'Timeout' };
        }
        return { links: [], error: error.message };
    }
    finally {
        // Always release the timer, on success, HTTP error, or thrown error,
        // so it can neither fire late nor keep the process alive.
        clearTimeout(timeoutId);
    }
}
|
||||
/**
 * Parse a dispensary website value into a URL object.
 * Values without a scheme (e.g. "www.example.com") are assumed to be https.
 * Returns null when no usable URL can be derived.
 */
function normalizeWebsiteUrl(websiteUrl) {
    let parsed;
    try {
        parsed = new URL(websiteUrl);
    }
    catch {
        // Not an absolute URL. Most commonly a bare host such as "example.com".
        if (typeof websiteUrl !== 'string') {
            return null;
        }
        try {
            const assumed = new URL(`https://${websiteUrl.trim()}`);
            // Require a dotted host so placeholders like "N/A" are rejected.
            return assumed.hostname.includes('.') ? assumed : null;
        }
        catch {
            return null;
        }
    }
    if (parsed.protocol.startsWith('http')) {
        return parsed;
    }
    // Parsed with a non-http "scheme" (e.g. "example.com:8080" parses as
    // protocol "example.com:"), so retry with https:// in front.
    try {
        return new URL(`https://${websiteUrl}`);
    }
    catch {
        return null;
    }
}
/**
 * Find the first link (in list order) matching any provider pattern.
 * Returns { link, provider } or null when nothing matches.
 */
function findProviderLink(links) {
    for (const link of links) {
        const hit = PROVIDER_URL_PATTERNS.find(({ patterns }) => patterns.some((p) => p.test(link)));
        if (hit) {
            return { link, provider: hit.provider };
        }
    }
    return null;
}
/**
 * Crawl a dispensary's website to find menu provider links.
 *
 * Strategy:
 *  1. Fetch the homepage and extract all links.
 *  2. If any link matches a known provider pattern (dutchie, treez, etc.),
 *     return it immediately.
 *  3. Otherwise pick same-site links that look like menu/order/shop pages.
 *  4. Follow up to 3 of them (500ms apart) and check their links for providers.
 *
 * @param {string} websiteUrl - Dispensary website; a missing scheme is treated
 *   as https.
 * @returns {Promise<{menuUrl: string|null, provider: string, foundLinks: string[],
 *   crawledPages: string[], error?: string}>} `provider` is 'unknown' and
 *   `menuUrl` is null when nothing was found; `error` is set when the URL is
 *   invalid or the homepage could not be fetched.
 */
async function crawlWebsiteForMenuLinks(websiteUrl) {
    console.log(`[WebsiteCrawl] Crawling ${websiteUrl} for menu links...`);
    const result = {
        menuUrl: null,
        provider: 'unknown',
        foundLinks: [],
        crawledPages: [],
    };
    const baseUrl = normalizeWebsiteUrl(websiteUrl);
    if (!baseUrl) {
        result.error = 'Invalid website URL';
        return result;
    }
    // Step 1: Fetch the homepage
    const homepage = baseUrl.href;
    result.crawledPages.push(homepage);
    const { links: homepageLinks, error: homepageError } = await fetchPageLinks(homepage);
    if (homepageError) {
        result.error = `Failed to fetch homepage: ${homepageError}`;
        return result;
    }
    // Copy so that appending links from followed pages does not mutate the
    // array returned by fetchPageLinks.
    result.foundLinks = [...homepageLinks];
    // Step 2: Check for direct provider matches in homepage links
    const homepageHit = findProviderLink(homepageLinks);
    if (homepageHit) {
        console.log(`[WebsiteCrawl] Found ${homepageHit.provider} link on homepage: ${homepageHit.link}`);
        result.menuUrl = homepageHit.link;
        result.provider = homepageHit.provider;
        return result;
    }
    // Step 3: Find same-site menu/order/shop links to follow. No homepage link
    // matches a provider pattern at this point (step 2 would have returned),
    // so only the same-domain + menu-like-path rule can select a link.
    const menuLinks = homepageLinks.filter((link) => {
        try {
            const { hostname } = new URL(link);
            const isSameDomain = hostname === baseUrl.hostname ||
                hostname.endsWith(`.${baseUrl.hostname}`);
            return isSameDomain && MENU_LINK_PATTERNS.some((p) => p.test(link));
        }
        catch {
            return false;
        }
    });
    console.log(`[WebsiteCrawl] Found ${menuLinks.length} potential menu links to follow`);
    // Step 4: Follow menu links (limit to 3 to avoid excessive crawling)
    for (const menuLink of menuLinks.slice(0, 3)) {
        // Skip if we've already crawled this page
        if (result.crawledPages.includes(menuLink)) {
            continue;
        }
        result.crawledPages.push(menuLink);
        // Rate limit
        await new Promise((r) => setTimeout(r, 500));
        const { links: pageLinks, error: pageError } = await fetchPageLinks(menuLink);
        if (pageError) {
            console.log(`[WebsiteCrawl] Failed to fetch ${menuLink}: ${pageError}`);
            continue;
        }
        result.foundLinks.push(...pageLinks);
        // Check for provider matches on this page
        const pageHit = findProviderLink(pageLinks);
        if (pageHit) {
            console.log(`[WebsiteCrawl] Found ${pageHit.provider} link on ${menuLink}: ${pageHit.link}`);
            result.menuUrl = pageHit.link;
            result.provider = pageHit.provider;
            return result;
        }
    }
    console.log(`[WebsiteCrawl] No menu provider found on ${websiteUrl}`);
    return result;
}
|
||||
// ============================================================
|
||||
// CORE DETECTION FUNCTIONS
|
||||
// ============================================================
|
||||
/**
 * Classify a menu URL by provider.
 *
 * Returns the first provider in PROVIDER_URL_PATTERNS with a matching pattern.
 * When none matches, a parseable URL with a non-empty, non-localhost hostname
 * is reported as 'custom'; anything else (including missing input) is 'unknown'.
 */
function detectProviderFromUrl(menuUrl) {
    if (!menuUrl) {
        return 'unknown';
    }
    const known = PROVIDER_URL_PATTERNS.find(({ patterns }) => patterns.some((re) => re.test(menuUrl)));
    if (known) {
        return known.provider;
    }
    // No known provider: decide between a custom website and an unusable value.
    let hostname;
    try {
        hostname = new URL(menuUrl).hostname;
    }
    catch {
        // Invalid URL
        return 'unknown';
    }
    const looksLikeRealHost = hostname !== '' && !hostname.includes('localhost');
    return looksLikeRealHost ? 'custom' : 'unknown';
}
|
||||
/**
 * Detect the menu provider for a single dispensary and, for Dutchie stores,
 * resolve its platform_dispensary_id. Every outcome is written back to the
 * dispensaries row (menu_type, provider_detection_data, ...).
 *
 * Flow:
 *  1. Load the dispensary. A missing row yields success=false.
 *  2. If the website is a Curaleaf store URL, mark the store as 'curaleaf'
 *     (not crawlable) and stop, replacing any stale menu_url.
 *  3. If there is no menu_url, crawl the website for one. Without a website,
 *     or when the crawl finds nothing, mark the store 'unknown' / not crawlable
 *     and stop. These "handled but nothing found" outcomes are reported with
 *     success=true plus an explanatory error.
 *  4. Detect the provider from the menu_url. Non-Dutchie providers only get
 *     menu_type updated (curaleaf/sol are also flagged not crawlable).
 *  5. For Dutchie, extract the cName and resolve the platform ID via GraphQL.
 *
 * @param {number|string} dispensaryId - Primary key of the dispensaries row.
 * @returns {Promise<{dispensaryId: *, dispensaryName: string,
 *   previousMenuType: string|null, detectedProvider: string, cName: string|null,
 *   platformDispensaryId: string|null, success: boolean, error?: string}>}
 * @throws Propagates database errors from the initial SELECT and from the
 *   bookkeeping UPDATEs outside the Dutchie resolution step.
 */
async function detectAndResolveDispensary(dispensaryId) {
    console.log(`[MenuDetection] Processing dispensary ${dispensaryId}...`);
    // Get dispensary record
    const { rows } = await (0, connection_1.query)(`SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE id = $1`, [dispensaryId]);
    if (rows.length === 0) {
        return {
            dispensaryId,
            dispensaryName: 'Unknown',
            previousMenuType: null,
            detectedProvider: 'unknown',
            cName: null,
            platformDispensaryId: null,
            success: false,
            error: 'Dispensary not found',
        };
    }
    const dispensary = (0, discovery_1.mapDbRowToDispensary)(rows[0]);
    let menuUrl = dispensary.menuUrl;
    const previousMenuType = dispensary.menuType || null;
    const website = dispensary.website;
    // ============================================================
    // CURALEAF CHECK: If website is Curaleaf, override any stale Dutchie menu_url
    // This prevents 60s Dutchie timeouts for stores that have migrated to Curaleaf's platform
    // ============================================================
    if (isCuraleafUrl(website)) {
        console.log(`[MenuDetection] ${dispensary.name}: Website is Curaleaf - marking as curaleaf provider`);
        // Use the Curaleaf website URL as the menu_url (clearing stale Dutchie URL if any).
        // website is known to be defined here since isCuraleafUrl returned true.
        const curaleafUrl = extractCuraleafStoreUrl(website) || website;
        await (0, connection_1.query)(`
      UPDATE dispensaries SET
        menu_type = 'curaleaf',
        menu_url = $1,
        platform_dispensary_id = NULL,
        provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
          jsonb_build_object(
            'detected_provider', 'curaleaf'::text,
            'detection_method', 'website_pattern'::text,
            'detected_at', NOW(),
            'curaleaf_store_url', $1::text,
            'stale_dutchie_url', $2::text,
            'not_crawlable', true,
            'not_crawlable_reason', 'Curaleaf proprietary menu - no Dutchie integration'::text
          ),
        updated_at = NOW()
      WHERE id = $3
    `, [curaleafUrl, menuUrl || null, dispensaryId]);
        return {
            dispensaryId,
            dispensaryName: dispensary.name,
            previousMenuType,
            detectedProvider: 'curaleaf',
            cName: null,
            platformDispensaryId: null,
            success: true,
            error: undefined,
        };
    }
    // If menu_url is null or empty, try to discover it by crawling the dispensary website
    if (!menuUrl || menuUrl.trim() === '') {
        console.log(`[MenuDetection] ${dispensary.name}: No menu_url - attempting website crawl`);
        // Check if website is available
        if (!website || website.trim() === '') {
            console.log(`[MenuDetection] ${dispensary.name}: No website available - marking as not crawlable`);
            await (0, connection_1.query)(`
        UPDATE dispensaries SET
          menu_type = 'unknown',
          provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
            jsonb_build_object(
              'detected_provider', 'unknown'::text,
              'detection_method', 'no_data'::text,
              'detected_at', NOW(),
              'resolution_error', 'No menu_url and no website available'::text,
              'not_crawlable', true,
              'website_crawl_attempted', false
            ),
          updated_at = NOW()
        WHERE id = $1
      `, [dispensaryId]);
            return {
                dispensaryId,
                dispensaryName: dispensary.name,
                previousMenuType,
                detectedProvider: 'unknown',
                cName: null,
                platformDispensaryId: null,
                success: true,
                error: 'No menu_url and no website available - marked as not crawlable',
            };
        }
        // Crawl the website to find menu provider links
        console.log(`[MenuDetection] ${dispensary.name}: Crawling website ${website} for menu links...`);
        const crawlResult = await crawlWebsiteForMenuLinks(website);
        if (crawlResult.menuUrl && crawlResult.provider !== 'unknown') {
            // SUCCESS: Found a menu URL from website crawl!
            console.log(`[MenuDetection] ${dispensary.name}: Found ${crawlResult.provider} menu at ${crawlResult.menuUrl}`);
            menuUrl = crawlResult.menuUrl;
            // Update the dispensary with the discovered menu_url
            await (0, connection_1.query)(`
        UPDATE dispensaries SET
          menu_url = $1,
          menu_type = $2,
          provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
            jsonb_build_object(
              'detected_provider', $2::text,
              'detection_method', 'website_crawl'::text,
              'detected_at', NOW(),
              'website_crawled', $3::text,
              'website_crawl_pages', $4::jsonb,
              'not_crawlable', false
            ),
          updated_at = NOW()
        WHERE id = $5
      `, [
                crawlResult.menuUrl,
                crawlResult.provider,
                website,
                JSON.stringify(crawlResult.crawledPages),
                dispensaryId
            ]);
            // Continue with full detection flow using the discovered menu_url
        }
        else {
            // Website crawl failed to find a menu provider
            const errorReason = crawlResult.error || 'No menu provider links found on website';
            console.log(`[MenuDetection] ${dispensary.name}: Website crawl failed - ${errorReason}`);
            await (0, connection_1.query)(`
        UPDATE dispensaries SET
          menu_type = 'unknown',
          provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
            jsonb_build_object(
              'detected_provider', 'unknown'::text,
              'detection_method', 'website_crawl'::text,
              'detected_at', NOW(),
              'website_crawled', $1::text,
              'website_crawl_pages', $2::jsonb,
              'resolution_error', $3::text,
              'not_crawlable', true
            ),
          updated_at = NOW()
        WHERE id = $4
      `, [
                website,
                JSON.stringify(crawlResult.crawledPages),
                errorReason,
                dispensaryId
            ]);
            return {
                dispensaryId,
                dispensaryName: dispensary.name,
                previousMenuType,
                detectedProvider: 'unknown',
                cName: null,
                platformDispensaryId: null,
                success: true,
                error: `Website crawl failed: ${errorReason}`,
            };
        }
    }
    // Detect provider from URL
    const detectedProvider = detectProviderFromUrl(menuUrl);
    console.log(`[MenuDetection] ${dispensary.name}: Detected provider = ${detectedProvider} from URL: ${menuUrl}`);
    // Initialize result
    const result = {
        dispensaryId,
        dispensaryName: dispensary.name,
        previousMenuType,
        detectedProvider,
        cName: null,
        platformDispensaryId: null,
        success: false,
    };
    // If not dutchie, just update menu_type and return
    if (detectedProvider !== 'dutchie') {
        // Special handling for proprietary providers - mark as not_crawlable until we have crawlers
        const PROPRIETARY_PROVIDERS = ['curaleaf', 'sol'];
        const isProprietaryProvider = PROPRIETARY_PROVIDERS.includes(detectedProvider);
        const notCrawlableReason = isProprietaryProvider
            ? `${detectedProvider} proprietary menu - no crawler available`
            : null;
        await (0, connection_1.query)(`
      UPDATE dispensaries SET
        menu_type = $1,
        platform_dispensary_id = CASE WHEN $3 THEN NULL ELSE platform_dispensary_id END,
        provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
          jsonb_build_object(
            'detected_provider', $1::text,
            'detection_method', 'url_pattern'::text,
            'detected_at', NOW(),
            'not_crawlable', $3,
            'not_crawlable_reason', $4::text
          ),
        updated_at = NOW()
      WHERE id = $2
    `, [detectedProvider, dispensaryId, isProprietaryProvider, notCrawlableReason]);
        result.success = true;
        console.log(`[MenuDetection] ${dispensary.name}: Updated menu_type to ${detectedProvider}${isProprietaryProvider ? ' (not crawlable)' : ''}`);
        return result;
    }
    // For dutchie: extract cName and resolve platform ID
    const cName = (0, discovery_1.extractCNameFromMenuUrl)(menuUrl);
    result.cName = cName;
    if (!cName) {
        result.error = `Could not extract cName from menu_url: ${menuUrl}`;
        await (0, connection_1.query)(`
      UPDATE dispensaries SET
        menu_type = 'dutchie',
        provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
          jsonb_build_object(
            'detected_provider', 'dutchie'::text,
            'detection_method', 'url_pattern'::text,
            'detected_at', NOW(),
            'resolution_error', $1::text,
            'not_crawlable', true
          ),
        updated_at = NOW()
      WHERE id = $2
    `, [result.error, dispensaryId]);
        console.log(`[MenuDetection] ${dispensary.name}: ${result.error}`);
        return result;
    }
    // Resolve platform_dispensary_id from cName
    console.log(`[MenuDetection] ${dispensary.name}: Resolving platform ID for cName = ${cName}`);
    try {
        const platformId = await (0, graphql_client_1.resolveDispensaryId)(cName);
        if (platformId) {
            await (0, connection_1.query)(`
        UPDATE dispensaries SET
          menu_type = 'dutchie',
          platform_dispensary_id = $1,
          platform_dispensary_id_resolved_at = NOW(),
          provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
            jsonb_build_object(
              'detected_provider', 'dutchie'::text,
              'detection_method', 'url_pattern'::text,
              'detected_at', NOW(),
              'cname_extracted', $2::text,
              'platform_id_resolved', true,
              'resolution_error', NULL::text,
              'not_crawlable', false
            ),
          updated_at = NOW()
        WHERE id = $3
      `, [platformId, cName, dispensaryId]);
            // Report success only once the ID is actually persisted. If the
            // UPDATE above throws, the catch below records a failure and the
            // result must not claim success or carry an unsaved platform ID.
            result.platformDispensaryId = platformId;
            result.success = true;
            console.log(`[MenuDetection] ${dispensary.name}: Resolved platform ID = ${platformId}`);
        }
        else {
            result.error = `cName "${cName}" could not be resolved - may not exist on Dutchie`;
            await (0, connection_1.query)(`
        UPDATE dispensaries SET
          menu_type = 'dutchie',
          platform_dispensary_id = NULL,
          provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
            jsonb_build_object(
              'detected_provider', 'dutchie'::text,
              'detection_method', 'url_pattern'::text,
              'detected_at', NOW(),
              'cname_extracted', $1::text,
              'platform_id_resolved', false,
              'resolution_error', $2::text,
              'not_crawlable', true
            ),
          updated_at = NOW()
        WHERE id = $3
      `, [cName, result.error, dispensaryId]);
            console.log(`[MenuDetection] ${dispensary.name}: ${result.error}`);
        }
    }
    catch (error) {
        // Reached when the GraphQL lookup or one of the UPDATEs above throws.
        // The existing platform_dispensary_id column is deliberately left
        // untouched here; only the failure is recorded.
        result.error = `Resolution failed: ${error.message}`;
        await (0, connection_1.query)(`
      UPDATE dispensaries SET
        menu_type = 'dutchie',
        provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
          jsonb_build_object(
            'detected_provider', 'dutchie'::text,
            'detection_method', 'url_pattern'::text,
            'detected_at', NOW(),
            'cname_extracted', $1::text,
            'platform_id_resolved', false,
            'resolution_error', $2::text,
            'not_crawlable', true
          ),
        updated_at = NOW()
      WHERE id = $3
    `, [cName, result.error, dispensaryId]);
        console.error(`[MenuDetection] ${dispensary.name}: ${result.error}`);
    }
    return result;
}
|
||||
/**
 * Run detection over every dispensary that still needs it.
 *
 * Candidates are rows that have a menu_url, plus (when includeWebsiteCrawl is
 * set) rows without a menu_url that have a website and are not already flagged
 * not_crawlable. Rows are processed one at a time with a 1s pause after each
 * completed detection.
 *
 * @param {object} [options]
 * @param {string} [options.state] - Restrict to this state; all states if omitted.
 * @param {boolean} [options.onlyUnknown=true] - Include rows whose menu_type is
 *   NULL, empty or 'unknown'.
 * @param {boolean} [options.onlyMissingPlatformId=false] - Include dutchie rows
 *   that have no platform_dispensary_id. When both "only" flags are set, rows
 *   matching EITHER condition are selected.
 * @param {boolean} [options.includeWebsiteCrawl=true] - Also pick up rows that
 *   need a website crawl to discover their menu_url.
 * @param {number} [options.limit] - Maximum number of rows to process.
 * @returns {Promise<{totalProcessed: number, totalSucceeded: number,
 *   totalFailed: number, totalSkipped: number, results: object[], errors: string[]}>}
 */
async function runBulkDetection(options = {}) {
    const { state, onlyUnknown = true, onlyMissingPlatformId = false, includeWebsiteCrawl = true, limit } = options;
    console.log('[MenuDetection] Starting bulk detection...');
    // COALESCE matters: when the not_crawlable key is absent the ->> lookup is
    // NULL, and a bare NOT NULL is NULL, which would silently drop the row.
    const websiteCrawlClause = includeWebsiteCrawl
        ? `OR (
        menu_url IS NULL
        AND website IS NOT NULL
        AND website != ''
        AND NOT COALESCE((provider_detection_data->>'not_crawlable')::boolean, false)
      )`
        : '';
    let whereClause = `WHERE (
      menu_url IS NOT NULL
      ${websiteCrawlClause}
    )`;
    const params = [];
    let paramIndex = 1;
    if (state) {
        whereClause += ` AND state = $${paramIndex++}`;
        params.push(state);
    }
    // The two "only" filters describe disjoint sets (unknown menu_type vs.
    // menu_type = 'dutchie'), so they are OR-ed; AND-ing them matches nothing.
    const statusFilters = [];
    if (onlyUnknown) {
        statusFilters.push(`(menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown')`);
    }
    if (onlyMissingPlatformId) {
        statusFilters.push(`(menu_type = 'dutchie' AND platform_dispensary_id IS NULL)`);
    }
    if (statusFilters.length > 0) {
        whereClause += ` AND (${statusFilters.join(' OR ')})`;
    }
    let query_str = `
    SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
    ${whereClause}
    ORDER BY name
  `;
    if (limit) {
        query_str += ` LIMIT $${paramIndex}`;
        params.push(limit);
    }
    const { rows: dispensaries } = await (0, connection_1.query)(query_str, params);
    console.log(`[MenuDetection] Found ${dispensaries.length} dispensaries to process (includeWebsiteCrawl=${includeWebsiteCrawl})`);
    const result = {
        totalProcessed: 0,
        totalSucceeded: 0,
        totalFailed: 0,
        totalSkipped: 0,
        results: [],
        errors: [],
    };
    for (const row of dispensaries) {
        result.totalProcessed++;
        try {
            const detectionResult = await detectAndResolveDispensary(row.id);
            result.results.push(detectionResult);
            if (detectionResult.success) {
                result.totalSucceeded++;
            }
            else {
                result.totalFailed++;
                if (detectionResult.error) {
                    result.errors.push(`${detectionResult.dispensaryName}: ${detectionResult.error}`);
                }
            }
            // Rate limit between requests
            await new Promise((r) => setTimeout(r, 1000));
        }
        catch (error) {
            // One failing dispensary must not abort the whole batch.
            result.totalFailed++;
            result.errors.push(`${row.name || row.id}: ${error.message}`);
        }
    }
    console.log(`[MenuDetection] Bulk detection complete: ${result.totalSucceeded} succeeded, ${result.totalFailed} failed`);
    return result;
}
|
||||
// ============================================================
|
||||
// SCHEDULED JOB EXECUTOR
|
||||
// ============================================================
|
||||
/**
 * Scheduler entry point for menu detection.
 *
 * Runs bulk detection with the job's config (state defaults to 'AZ',
 * onlyUnknown defaults to true, onlyMissingPlatformId defaults to false) and
 * folds the outcome into a job summary. Never throws: an unexpected failure is
 * reported as status 'error' with zeroed counters.
 *
 * Status is 'success' when nothing failed, 'error' when nothing succeeded,
 * and 'partial' otherwise. errorMessage carries at most the first 5 errors.
 */
async function executeMenuDetectionJob(config = {}) {
    const jobOptions = {
        state: config.state || 'AZ',
        onlyUnknown: config.onlyUnknown !== false,
        onlyMissingPlatformId: config.onlyMissingPlatformId || false,
    };
    console.log(`[MenuDetection] Executing scheduled job for state=${jobOptions.state}...`);
    let bulk;
    let providerCounts;
    try {
        bulk = await runBulkDetection({ ...jobOptions });
        providerCounts = countByProvider(bulk.results);
    }
    catch (error) {
        return {
            status: 'error',
            itemsProcessed: 0,
            itemsSucceeded: 0,
            itemsFailed: 0,
            errorMessage: error.message,
        };
    }
    let status;
    if (bulk.totalFailed === 0) {
        status = 'success';
    }
    else if (bulk.totalSucceeded === 0) {
        status = 'error';
    }
    else {
        status = 'partial';
    }
    // Cap the message so a large batch cannot produce an unbounded string.
    const firstErrors = bulk.errors.slice(0, 5);
    return {
        status,
        itemsProcessed: bulk.totalProcessed,
        itemsSucceeded: bulk.totalSucceeded,
        itemsFailed: bulk.totalFailed,
        errorMessage: bulk.errors.length > 0 ? firstErrors.join('; ') : undefined,
        metadata: { ...jobOptions, providerCounts },
    };
}
|
||||
/**
 * Tally detection results per detected provider.
 * Returns a plain object mapping provider name -> number of results.
 */
function countByProvider(results) {
    return results.reduce((tally, { detectedProvider }) => {
        tally[detectedProvider] = (tally[detectedProvider] || 0) + 1;
        return tally;
    }, {});
}
|
||||
// ============================================================
|
||||
// UTILITY FUNCTIONS
|
||||
// ============================================================
|
||||
/**
 * Get detection stats for the dashboard.
 *
 * @param {object} [options]
 * @param {string} [options.state='AZ'] - State whose dispensaries are counted.
 *   Defaults to 'AZ', so calling with no arguments behaves as before.
 * @returns {Promise<{totalDispensaries: number, withMenuType: number,
 *   withPlatformId: number, needsDetection: number,
 *   byProvider: Object<string, number>}>} Counts for the state, plus a
 *   per-menu_type breakdown (rows with a NULL or empty menu_type are excluded
 *   from the breakdown).
 */
async function getDetectionStats(options = {}) {
    const { state = 'AZ' } = options;
    const { rows } = await (0, connection_1.query)(`
    SELECT
      COUNT(*) as total,
      COUNT(*) FILTER (WHERE menu_type IS NOT NULL AND menu_type != '' AND menu_type != 'unknown') as with_menu_type,
      COUNT(*) FILTER (WHERE platform_dispensary_id IS NOT NULL) as with_platform_id,
      COUNT(*) FILTER (WHERE menu_url IS NOT NULL AND (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown')) as needs_detection
    FROM dispensaries
    WHERE state = $1
  `, [state]);
    const stats = rows[0] || {};
    // Get provider breakdown
    const { rows: providerRows } = await (0, connection_1.query)(`
    SELECT menu_type, COUNT(*) as count
    FROM dispensaries
    WHERE state = $1 AND menu_type IS NOT NULL AND menu_type != ''
    GROUP BY menu_type
    ORDER BY count DESC
  `, [state]);
    const byProvider = {};
    for (const row of providerRows) {
        byProvider[row.menu_type] = parseInt(row.count, 10);
    }
    // The parseInt calls suggest the driver returns COUNT(*) values as strings
    // (TODO confirm); a missing stats row falls back to zeros.
    return {
        totalDispensaries: parseInt(stats.total || '0', 10),
        withMenuType: parseInt(stats.with_menu_type || '0', 10),
        withPlatformId: parseInt(stats.with_platform_id || '0', 10),
        needsDetection: parseInt(stats.needs_detection || '0', 10),
        byProvider,
    };
}
|
||||
/**
 * List dispensaries that still need menu detection.
 *
 * A dispensary qualifies when it has a menu_url and either an unknown/empty
 * menu_type or a dutchie menu_type without a platform_dispensary_id. With
 * includeWebsiteCrawl, dispensaries that have a website but no menu_url (and
 * are not already flagged not_crawlable) qualify too, so their menu_url can be
 * discovered by crawling.
 *
 * @param {object} [options]
 * @param {string} [options.state='AZ'] - State to search.
 * @param {number} [options.limit=100] - Maximum rows returned.
 * @param {boolean} [options.includeWebsiteCrawl=true] - Include website-only rows.
 * @returns {Promise<object[]>} Matching rows, ordered by name, each passed
 *   through discovery's mapDbRowToDispensary.
 */
async function getDispensariesNeedingDetection(options = {}) {
    const { state = 'AZ', limit = 100, includeWebsiteCrawl = true } = options;
    // COALESCE matters: when the not_crawlable key is absent the ->> lookup is
    // NULL, and a bare NOT NULL is NULL, which would silently drop the row.
    const websiteCrawlClause = includeWebsiteCrawl
        ? `OR (
        menu_url IS NULL
        AND website IS NOT NULL
        AND website != ''
        AND NOT COALESCE((provider_detection_data->>'not_crawlable')::boolean, false)
      )`
        : '';
    const { rows } = await (0, connection_1.query)(`
    SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
    WHERE state = $1
      AND (
        (menu_url IS NOT NULL AND (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown'
          OR (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)))
        ${websiteCrawlClause}
      )
    ORDER BY name
    LIMIT $2
  `, [state, limit]);
    // Wrap the mapper so Array#map's index/array arguments never reach it.
    return rows.map((row) => (0, discovery_1.mapDbRowToDispensary)(row));
}
|
||||
Reference in New Issue
Block a user