fix(monitor): remove non-existent worker columns from job_run_logs query

The job_run_logs table tracks scheduled job orchestration, not individual
worker jobs. Worker info (worker_id, worker_hostname) belongs on
dispensary_crawl_jobs, not job_run_logs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-03 18:45:05 -07:00
parent 54f40d26bb
commit 66e07b2009
466 changed files with 84988 additions and 9226 deletions

View File

@@ -0,0 +1,837 @@
"use strict";
/**
* Menu Detection Service
*
* Detects menu provider (dutchie, treez, jane, etc.) from dispensary menu_url
* and resolves platform_dispensary_id for dutchie stores.
*
* This service:
* 1. Iterates dispensaries with unknown/missing menu_type or platform_dispensary_id
* 2. Detects provider from menu_url patterns
* 3. For dutchie: extracts cName and resolves platform_dispensary_id via GraphQL
* 4. Logs results to job_run_logs
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.crawlWebsiteForMenuLinks = crawlWebsiteForMenuLinks;
exports.detectProviderFromUrl = detectProviderFromUrl;
exports.detectAndResolveDispensary = detectAndResolveDispensary;
exports.runBulkDetection = runBulkDetection;
exports.executeMenuDetectionJob = executeMenuDetectionJob;
exports.getDetectionStats = getDetectionStats;
exports.getDispensariesNeedingDetection = getDispensariesNeedingDetection;
const connection_1 = require("../db/connection");
const discovery_1 = require("./discovery");
const graphql_client_1 = require("./graphql-client");
// Explicit projection used by every SELECT against the dispensaries table.
// Listing the columns (rather than SELECT *) keeps queries working when the
// schema differs between environments. The leading and trailing empty entries
// reproduce the newlines that surround the list when it is interpolated.
const DISPENSARY_COLUMNS = [
    '',
    'id, name, slug, city, state, zip, address, latitude, longitude,',
    'menu_type, menu_url, platform_dispensary_id, website,',
    'provider_detection_data, created_at, updated_at',
    '',
].join('\n');
// ============================================================
// PROVIDER DETECTION PATTERNS
// ============================================================
// Ordered provider table: each entry is [provider, url patterns] and is
// expanded into { provider, patterns } objects. Consumers take the FIRST
// provider with a matching pattern, so the order of rows is significant.
const PROVIDER_URL_PATTERNS = [
    // IMPORTANT: Curaleaf and Sol must come BEFORE dutchie to take precedence.
    // These stores have their own proprietary menu systems (not crawlable via Dutchie).
    ['curaleaf', [
        /curaleaf\.com\/stores\//i, // e.g., https://curaleaf.com/stores/curaleaf-az-glendale-east
        /curaleaf\.com\/dispensary\//i, // e.g., https://curaleaf.com/dispensary/arizona
    ]],
    ['sol', [
        /livewithsol\.com/i, // e.g., https://www.livewithsol.com/locations/sun-city/
        /solflower\.com/i, // alternate domain if any
    ]],
    ['dutchie', [
        /dutchie\.com/i,
        /\/embedded-menu\//i,
        /\/dispensary\/[A-Z]{2}-/i, // e.g., /dispensary/AZ-store-name
        /dutchie-plus/i,
    ]],
    ['treez', [/treez\.io/i, /shop\.treez/i, /treez-ecommerce/i]],
    ['jane', [/jane\.co/i, /iheartjane\.com/i, /embed\.iheartjane/i]],
    ['weedmaps', [/weedmaps\.com/i, /menu\.weedmaps/i]],
    ['leafly', [/leafly\.com/i, /order\.leafly/i]],
    ['meadow', [/getmeadow\.com/i, /meadow\.co/i]],
    ['blaze', [/blaze\.me/i, /blazepos\.com/i]],
    ['flowhub', [/flowhub\.com/i, /flowhub\.co/i]],
    ['dispense', [/dispense\.io/i, /dispenseapp\.com/i]],
].map(([provider, patterns]) => ({ provider, patterns }));
/**
 * Case-insensitive patterns marking a link as a likely menu or ordering page.
 * The list is the generic storefront path segments followed by the known
 * menu-provider domains, in that order.
 */
const MENU_LINK_PATTERNS = [
    // Path segments such as "/menu" or "/shop" (the slash is part of the pattern).
    ...['menu', 'order', 'shop', 'products', 'dispensary', 'store']
        .map((segment) => new RegExp(`\\/${segment}`, 'i')),
    // Provider domains; each has exactly one dot, which is escaped to match literally.
    ...[
        'curaleaf.com',
        'dutchie.com',
        'treez.io',
        'jane.co',
        'iheartjane.com',
        'weedmaps.com',
        'leafly.com',
        'getmeadow.com',
        'blaze.me',
        'flowhub.com',
        'dispense.io',
    ].map((domain) => new RegExp(domain.replace('.', '\\.'), 'i')),
];
/**
 * Tell whether a URL points at a Curaleaf store page, i.e. contains
 * "curaleaf.com/stores/" or "curaleaf.com/dispensary/" (case-insensitive).
 *
 * @param {string|null|undefined} url - URL to inspect; falsy values yield false.
 * @returns {boolean} true only for Curaleaf store/dispensary URLs.
 */
function isCuraleafUrl(url) {
    return Boolean(url) && /curaleaf\.com\/(stores|dispensary)\//i.test(url);
}
/**
 * Return the Curaleaf store URL carried by a website URL.
 * Accepts both the /stores/ and /dispensary/ formats; the URL is returned
 * untouched when it already is one, otherwise the result is null.
 *
 * @param {string|null|undefined} url - Website URL to inspect.
 * @returns {string|null} The same URL when it is a Curaleaf store URL, else null.
 */
function extractCuraleafStoreUrl(url) {
    const isStoreUrl = Boolean(url) && isCuraleafUrl(url);
    return isStoreUrl ? url : null;
}
/**
 * Fetch a page and extract the URLs it references.
 *
 * Every `href="..."` value is collected. `src="..."` values (iframes, scripts,
 * images, ...) are collected only when they match a known provider pattern,
 * which is how embedded menus are picked up. Relative URLs are resolved
 * against `url`; values that cannot be parsed as URLs are skipped.
 *
 * The abort timer covers the whole exchange (headers AND body) and is always
 * cleared in `finally`, so a failed request does not leave a live timer behind.
 *
 * @param {string} url - Absolute URL of the page to fetch.
 * @param {number} [timeout=10000] - Milliseconds before the request is aborted.
 * @returns {Promise<{links: string[], error?: string}>} De-duplicated absolute
 *   links in discovery order. On failure `links` is empty and `error` is
 *   `HTTP <status>`, `Timeout`, or the underlying error message. Never rejects.
 */
async function fetchPageLinks(url, timeout = 10000) {
    let timeoutId;
    try {
        const controller = new AbortController();
        timeoutId = setTimeout(() => controller.abort(), timeout);
        const response = await fetch(url, {
            signal: controller.signal,
            headers: {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            },
            redirect: 'follow',
        });
        if (!response.ok) {
            return { links: [], error: `HTTP ${response.status}` };
        }
        // Read the body while the abort timer is still armed so a stalled
        // download is cut off as well.
        const html = await response.text();
        // A Set de-duplicates while preserving first-seen order.
        const links = new Set();
        // All href attributes, resolved to absolute URLs.
        for (const [, href] of html.matchAll(/href=["']([^"']+)["']/gi)) {
            try {
                links.add(new URL(href, url).href);
            }
            catch {
                // Skip invalid URLs
            }
        }
        // src attributes (common for embedded menus): keep provider matches only.
        for (const [, src] of html.matchAll(/src=["']([^"']+)["']/gi)) {
            let absoluteUrl;
            try {
                absoluteUrl = new URL(src, url).href;
            }
            catch {
                // Skip invalid URLs
                continue;
            }
            const isProviderUrl = PROVIDER_URL_PATTERNS.some(({ patterns }) => patterns.some((p) => p.test(absoluteUrl)));
            if (isProviderUrl) {
                links.add(absoluteUrl);
            }
        }
        return { links: [...links] };
    }
    catch (error) {
        if (error?.name === 'AbortError') {
            return { links: [], error: 'Timeout' };
        }
        return { links: [], error: error?.message };
    }
    finally {
        // clearTimeout(undefined) is a no-op, so this is safe on every path.
        clearTimeout(timeoutId);
    }
}
/**
 * Crawl a dispensary's website to find a link to a known menu provider.
 *
 * Strategy:
 * 1. Normalize the website URL (a missing scheme defaults to https://).
 * 2. Fetch the homepage and return the first link matching a provider pattern.
 * 3. Otherwise pick same-domain links that look like menu/order/shop pages.
 * 4. Follow at most three of them (500 ms apart) and return the first
 *    provider link found on any of those pages.
 *
 * @param {string} websiteUrl - Dispensary website, with or without a scheme
 *   (e.g. "https://example.com" or "www.example.com").
 * @returns {Promise<{menuUrl: string|null, provider: string, foundLinks: string[],
 *   crawledPages: string[], error?: string}>} `provider` is 'unknown' and
 *   `menuUrl` is null when nothing was found; `error` is set when the URL is
 *   invalid or the homepage could not be fetched.
 */
async function crawlWebsiteForMenuLinks(websiteUrl) {
    console.log(`[WebsiteCrawl] Crawling ${websiteUrl} for menu links...`);
    const result = {
        menuUrl: null,
        provider: 'unknown',
        foundLinks: [],
        crawledPages: [],
    };
    // Parse a candidate and accept it only when it is an http(s) URL.
    const parseHttpUrl = (candidate) => {
        try {
            const parsed = new URL(candidate);
            return parsed.protocol === 'http:' || parsed.protocol === 'https:' ? parsed : null;
        }
        catch {
            return null;
        }
    };
    // First provider whose patterns match the link, or null.
    const findProvider = (link) => {
        const hit = PROVIDER_URL_PATTERNS.find(({ patterns }) => patterns.some((p) => p.test(link)));
        return hit ? hit.provider : null;
    };
    // Record a provider hit on the result and log where it was found.
    const recordHit = (provider, link, where) => {
        console.log(`[WebsiteCrawl] Found ${provider} link on ${where}: ${link}`);
        result.menuUrl = link;
        result.provider = provider;
    };
    // Normalize URL. `new URL("example.com")` throws, so the https:// fallback
    // has to be attempted when parsing fails, not only when the protocol is odd.
    const rawUrl = typeof websiteUrl === 'string' ? websiteUrl.trim() : '';
    let baseUrl = parseHttpUrl(rawUrl);
    if (!baseUrl && rawUrl !== '' && !rawUrl.includes('://')) {
        baseUrl = parseHttpUrl(`https://${rawUrl}`);
    }
    if (!baseUrl) {
        result.error = 'Invalid website URL';
        return result;
    }
    // Step 1: Fetch the homepage
    const homepage = baseUrl.href;
    result.crawledPages.push(homepage);
    const { links: homepageLinks, error: homepageError } = await fetchPageLinks(homepage);
    if (homepageError) {
        result.error = `Failed to fetch homepage: ${homepageError}`;
        return result;
    }
    result.foundLinks = [...homepageLinks];
    // Step 2: Check for direct provider matches in homepage links
    for (const link of homepageLinks) {
        const provider = findProvider(link);
        if (provider) {
            recordHit(provider, link, 'homepage');
            return result;
        }
    }
    // Step 3: Find same-domain menu/order/shop links to follow. No homepage
    // link matches a provider at this point (step 2 would have returned), so
    // only the same-domain + menu-path rule can select anything.
    const menuLinks = homepageLinks.filter((link) => {
        try {
            const { hostname } = new URL(link);
            const isSameDomain = hostname === baseUrl.hostname ||
                hostname.endsWith(`.${baseUrl.hostname}`);
            return isSameDomain && MENU_LINK_PATTERNS.some((p) => p.test(link));
        }
        catch {
            return false;
        }
    });
    console.log(`[WebsiteCrawl] Found ${menuLinks.length} potential menu links to follow`);
    // Step 4: Follow menu links (limit to 3 to avoid excessive crawling)
    for (const menuLink of menuLinks.slice(0, 3)) {
        // Skip if we've already crawled this page
        if (result.crawledPages.includes(menuLink)) {
            continue;
        }
        result.crawledPages.push(menuLink);
        // Rate limit
        await new Promise((resolve) => setTimeout(resolve, 500));
        const { links: pageLinks, error: pageError } = await fetchPageLinks(menuLink);
        if (pageError) {
            console.log(`[WebsiteCrawl] Failed to fetch ${menuLink}: ${pageError}`);
            continue;
        }
        for (const link of pageLinks) {
            result.foundLinks.push(link);
        }
        // Check for provider matches on this page
        for (const link of pageLinks) {
            const provider = findProvider(link);
            if (provider) {
                recordHit(provider, link, menuLink);
                return result;
            }
        }
    }
    console.log(`[WebsiteCrawl] No menu provider found on ${websiteUrl}`);
    return result;
}
// ============================================================
// CORE DETECTION FUNCTIONS
// ============================================================
/**
 * Classify a menu URL by provider.
 *
 * @param {string|null|undefined} menuUrl - Menu URL to classify.
 * @returns {string} The first provider in PROVIDER_URL_PATTERNS with a matching
 *   pattern; 'custom' when nothing matches but the value parses as a URL whose
 *   hostname does not contain "localhost"; otherwise 'unknown' (including for
 *   empty or unparseable input).
 */
function detectProviderFromUrl(menuUrl) {
    if (!menuUrl) {
        return 'unknown';
    }
    const knownProvider = PROVIDER_URL_PATTERNS.find(({ patterns }) => patterns.some((pattern) => pattern.test(menuUrl)));
    if (knownProvider) {
        return knownProvider.provider;
    }
    // No known provider: a real (non-localhost) hostname means a custom website.
    let hostname;
    try {
        ({ hostname } = new URL(menuUrl));
    }
    catch {
        // Invalid URL
        return 'unknown';
    }
    return hostname && !hostname.includes('localhost') ? 'custom' : 'unknown';
}
/**
 * Detect the menu provider for a single dispensary and, for dutchie stores,
 * resolve its platform_dispensary_id. Every outcome is persisted on the
 * dispensaries row (menu_type plus a merge into provider_detection_data).
 *
 * Flow, in order:
 * 1. Load the row; return a "not found" result if it does not exist.
 * 2. If the website is a Curaleaf store URL, mark the store as curaleaf and return.
 * 3. If menu_url is empty, crawl the website for a provider link; return early
 *    when there is no website or the crawl finds nothing.
 * 4. Classify the (possibly just discovered) menu_url by URL pattern.
 * 5. Non-dutchie providers: store menu_type and return.
 * 6. Dutchie: extract the cName and resolve the platform ID, recording success
 *    or the failure reason.
 *
 * @param dispensaryId - Primary key of the dispensaries row to process.
 * @returns Promise of a result object with dispensaryId, dispensaryName,
 *   previousMenuType, detectedProvider, cName, platformDispensaryId, success
 *   and (optionally) error. Note that the "no website" and "crawl found
 *   nothing" outcomes report success: true together with an error message.
 *   Errors thrown by queries outside the final try block propagate to the caller.
 */
async function detectAndResolveDispensary(dispensaryId) {
console.log(`[MenuDetection] Processing dispensary ${dispensaryId}...`);
// Get dispensary record
const { rows } = await (0, connection_1.query)(`SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE id = $1`, [dispensaryId]);
if (rows.length === 0) {
return {
dispensaryId,
dispensaryName: 'Unknown',
previousMenuType: null,
detectedProvider: 'unknown',
cName: null,
platformDispensaryId: null,
success: false,
error: 'Dispensary not found',
};
}
const dispensary = (0, discovery_1.mapDbRowToDispensary)(rows[0]);
// menuUrl is reassigned below when a website crawl discovers a menu link.
let menuUrl = dispensary.menuUrl;
const previousMenuType = dispensary.menuType || null;
const website = dispensary.website;
// ============================================================
// CURALEAF CHECK: If website is Curaleaf, override any stale Dutchie menu_url
// This prevents 60s Dutchie timeouts for stores that have migrated to Curaleaf's platform
// ============================================================
if (isCuraleafUrl(website)) {
console.log(`[MenuDetection] ${dispensary.name}: Website is Curaleaf - marking as curaleaf provider`);
// Use the Curaleaf website URL as the menu_url (clearing stale Dutchie URL if any)
// At this point we know website is defined since isCuraleafUrl returned true
const curaleafUrl = extractCuraleafStoreUrl(website) || website;
// The previous menu_url ($2) is kept in provider_detection_data as
// 'stale_dutchie_url'; platform_dispensary_id is cleared.
await (0, connection_1.query)(`
UPDATE dispensaries SET
menu_type = 'curaleaf',
menu_url = $1,
platform_dispensary_id = NULL,
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', 'curaleaf'::text,
'detection_method', 'website_pattern'::text,
'detected_at', NOW(),
'curaleaf_store_url', $1::text,
'stale_dutchie_url', $2::text,
'not_crawlable', true,
'not_crawlable_reason', 'Curaleaf proprietary menu - no Dutchie integration'::text
),
updated_at = NOW()
WHERE id = $3
`, [curaleafUrl, menuUrl || null, dispensaryId]);
return {
dispensaryId,
dispensaryName: dispensary.name,
previousMenuType,
detectedProvider: 'curaleaf',
cName: null,
platformDispensaryId: null,
success: true,
error: undefined,
};
}
// If menu_url is null or empty, try to discover it by crawling the dispensary website
if (!menuUrl || menuUrl.trim() === '') {
console.log(`[MenuDetection] ${dispensary.name}: No menu_url - attempting website crawl`);
// Check if website is available
if (!website || website.trim() === '') {
console.log(`[MenuDetection] ${dispensary.name}: No website available - marking as not crawlable`);
// Nothing to work with: record the store as unknown and not crawlable.
await (0, connection_1.query)(`
UPDATE dispensaries SET
menu_type = 'unknown',
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', 'unknown'::text,
'detection_method', 'no_data'::text,
'detected_at', NOW(),
'resolution_error', 'No menu_url and no website available'::text,
'not_crawlable', true,
'website_crawl_attempted', false
),
updated_at = NOW()
WHERE id = $1
`, [dispensaryId]);
// Reported as success: true with an explanatory error, so bulk runs
// count this outcome as succeeded rather than failed.
return {
dispensaryId,
dispensaryName: dispensary.name,
previousMenuType,
detectedProvider: 'unknown',
cName: null,
platformDispensaryId: null,
success: true,
error: 'No menu_url and no website available - marked as not crawlable',
};
}
// Crawl the website to find menu provider links
console.log(`[MenuDetection] ${dispensary.name}: Crawling website ${website} for menu links...`);
const crawlResult = await crawlWebsiteForMenuLinks(website);
if (crawlResult.menuUrl && crawlResult.provider !== 'unknown') {
// SUCCESS: Found a menu URL from website crawl!
console.log(`[MenuDetection] ${dispensary.name}: Found ${crawlResult.provider} menu at ${crawlResult.menuUrl}`);
menuUrl = crawlResult.menuUrl;
// Update the dispensary with the discovered menu_url
await (0, connection_1.query)(`
UPDATE dispensaries SET
menu_url = $1,
menu_type = $2,
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', $2::text,
'detection_method', 'website_crawl'::text,
'detected_at', NOW(),
'website_crawled', $3::text,
'website_crawl_pages', $4::jsonb,
'not_crawlable', false
),
updated_at = NOW()
WHERE id = $5
`, [
crawlResult.menuUrl,
crawlResult.provider,
website,
JSON.stringify(crawlResult.crawledPages),
dispensaryId
]);
// Continue with full detection flow using the discovered menu_url
}
else {
// Website crawl failed to find a menu provider
const errorReason = crawlResult.error || 'No menu provider links found on website';
console.log(`[MenuDetection] ${dispensary.name}: Website crawl failed - ${errorReason}`);
await (0, connection_1.query)(`
UPDATE dispensaries SET
menu_type = 'unknown',
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', 'unknown'::text,
'detection_method', 'website_crawl'::text,
'detected_at', NOW(),
'website_crawled', $1::text,
'website_crawl_pages', $2::jsonb,
'resolution_error', $3::text,
'not_crawlable', true
),
updated_at = NOW()
WHERE id = $4
`, [
website,
JSON.stringify(crawlResult.crawledPages),
errorReason,
dispensaryId
]);
// As above: success: true with an error message describing the failed crawl.
return {
dispensaryId,
dispensaryName: dispensary.name,
previousMenuType,
detectedProvider: 'unknown',
cName: null,
platformDispensaryId: null,
success: true,
error: `Website crawl failed: ${errorReason}`,
};
}
}
// Detect provider from URL
const detectedProvider = detectProviderFromUrl(menuUrl);
console.log(`[MenuDetection] ${dispensary.name}: Detected provider = ${detectedProvider} from URL: ${menuUrl}`);
// Initialize result
// success starts false and is flipped only by the branches below that complete.
const result = {
dispensaryId,
dispensaryName: dispensary.name,
previousMenuType,
detectedProvider,
cName: null,
platformDispensaryId: null,
success: false,
};
// If not dutchie, just update menu_type and return
if (detectedProvider !== 'dutchie') {
// Special handling for proprietary providers - mark as not_crawlable until we have crawlers
const PROPRIETARY_PROVIDERS = ['curaleaf', 'sol'];
const isProprietaryProvider = PROPRIETARY_PROVIDERS.includes(detectedProvider);
const notCrawlableReason = isProprietaryProvider
? `${detectedProvider} proprietary menu - no crawler available`
: null;
// $3 (isProprietaryProvider) both clears platform_dispensary_id and is
// stored as the 'not_crawlable' flag.
await (0, connection_1.query)(`
UPDATE dispensaries SET
menu_type = $1,
platform_dispensary_id = CASE WHEN $3 THEN NULL ELSE platform_dispensary_id END,
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', $1::text,
'detection_method', 'url_pattern'::text,
'detected_at', NOW(),
'not_crawlable', $3,
'not_crawlable_reason', $4::text
),
updated_at = NOW()
WHERE id = $2
`, [detectedProvider, dispensaryId, isProprietaryProvider, notCrawlableReason]);
result.success = true;
console.log(`[MenuDetection] ${dispensary.name}: Updated menu_type to ${detectedProvider}${isProprietaryProvider ? ' (not crawlable)' : ''}`);
return result;
}
// For dutchie: extract cName and resolve platform ID
const cName = (0, discovery_1.extractCNameFromMenuUrl)(menuUrl);
result.cName = cName;
if (!cName) {
// Without a cName the store cannot be resolved: record the reason and
// return with success still false.
result.error = `Could not extract cName from menu_url: ${menuUrl}`;
await (0, connection_1.query)(`
UPDATE dispensaries SET
menu_type = 'dutchie',
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', 'dutchie'::text,
'detection_method', 'url_pattern'::text,
'detected_at', NOW(),
'resolution_error', $1::text,
'not_crawlable', true
),
updated_at = NOW()
WHERE id = $2
`, [result.error, dispensaryId]);
console.log(`[MenuDetection] ${dispensary.name}: ${result.error}`);
return result;
}
// Resolve platform_dispensary_id from cName
console.log(`[MenuDetection] ${dispensary.name}: Resolving platform ID for cName = ${cName}`);
// NOTE: this try also wraps the two UPDATE queries below, so a database error
// raised by either of them is handled by the catch as a "Resolution failed" too.
try {
const platformId = await (0, graphql_client_1.resolveDispensaryId)(cName);
if (platformId) {
result.platformDispensaryId = platformId;
result.success = true;
await (0, connection_1.query)(`
UPDATE dispensaries SET
menu_type = 'dutchie',
platform_dispensary_id = $1,
platform_dispensary_id_resolved_at = NOW(),
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', 'dutchie'::text,
'detection_method', 'url_pattern'::text,
'detected_at', NOW(),
'cname_extracted', $2::text,
'platform_id_resolved', true,
'resolution_error', NULL::text,
'not_crawlable', false
),
updated_at = NOW()
WHERE id = $3
`, [platformId, cName, dispensaryId]);
console.log(`[MenuDetection] ${dispensary.name}: Resolved platform ID = ${platformId}`);
}
else {
// The resolver returned nothing: clear any stored platform ID and
// mark the store as not crawlable.
result.error = `cName "${cName}" could not be resolved - may not exist on Dutchie`;
await (0, connection_1.query)(`
UPDATE dispensaries SET
menu_type = 'dutchie',
platform_dispensary_id = NULL,
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', 'dutchie'::text,
'detection_method', 'url_pattern'::text,
'detected_at', NOW(),
'cname_extracted', $1::text,
'platform_id_resolved', false,
'resolution_error', $2::text,
'not_crawlable', true
),
updated_at = NOW()
WHERE id = $3
`, [cName, result.error, dispensaryId]);
console.log(`[MenuDetection] ${dispensary.name}: ${result.error}`);
}
}
catch (error) {
// Resolution threw: record the error. Unlike the "not resolved" branch this
// UPDATE does not assign platform_dispensary_id, so an existing value is kept.
result.error = `Resolution failed: ${error.message}`;
await (0, connection_1.query)(`
UPDATE dispensaries SET
menu_type = 'dutchie',
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', 'dutchie'::text,
'detection_method', 'url_pattern'::text,
'detected_at', NOW(),
'cname_extracted', $1::text,
'platform_id_resolved', false,
'resolution_error', $2::text,
'not_crawlable', true
),
updated_at = NOW()
WHERE id = $3
`, [cName, result.error, dispensaryId]);
console.error(`[MenuDetection] ${dispensary.name}: ${result.error}`);
}
return result;
}
/**
 * Run detection over every dispensary that matches the given filters.
 *
 * Candidates are rows with a menu_url and, when `includeWebsiteCrawl` is set,
 * rows without a menu_url that have a website and are not flagged
 * not_crawlable. Dispensaries are processed one at a time with a one second
 * pause after each completed detection.
 *
 * @param {object} [options]
 * @param {string} [options.state] - Restrict to this state; all states when omitted.
 * @param {boolean} [options.onlyUnknown=true] - Select rows whose menu_type is
 *   NULL, empty or 'unknown'.
 * @param {boolean} [options.onlyMissingPlatformId=false] - Select dutchie rows
 *   that have no platform_dispensary_id. When combined with `onlyUnknown` a row
 *   qualifies if it matches EITHER filter (the two conditions can never both
 *   hold for one row, so AND-ing them would always select nothing).
 * @param {boolean} [options.includeWebsiteCrawl=true] - Also include rows that
 *   need a website crawl to discover their menu_url.
 * @param {number} [options.limit] - Maximum number of rows; unlimited when falsy.
 * @returns {Promise<{totalProcessed: number, totalSucceeded: number,
 *   totalFailed: number, totalSkipped: number, results: object[], errors: string[]}>}
 *   Per-dispensary results plus counters. Failures of individual dispensaries
 *   are collected in `errors`; only a failure of the selection query rejects.
 */
async function runBulkDetection(options = {}) {
    const { state, onlyUnknown = true, onlyMissingPlatformId = false, includeWebsiteCrawl = true, limit } = options;
    console.log('[MenuDetection] Starting bulk detection...');
    // Candidates: rows with a menu_url, plus (optionally) rows that only have a
    // website. `IS NOT TRUE` is deliberate: `NOT (...)::boolean` is NULL when the
    // 'not_crawlable' key is absent, which would silently drop those rows.
    let whereClause = `WHERE (
        menu_url IS NOT NULL
        ${includeWebsiteCrawl ? `OR (
            menu_url IS NULL
            AND website IS NOT NULL
            AND website != ''
            AND (provider_detection_data->>'not_crawlable')::boolean IS NOT TRUE
        )` : ''}
    )`;
    const params = [];
    let paramIndex = 1;
    if (state) {
        whereClause += ` AND state = $${paramIndex++}`;
        params.push(state);
    }
    // The menu_type filters are alternatives, so they are OR-ed together.
    const menuTypeFilters = [];
    if (onlyUnknown) {
        menuTypeFilters.push(`(menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown')`);
    }
    if (onlyMissingPlatformId) {
        menuTypeFilters.push(`(menu_type = 'dutchie' AND platform_dispensary_id IS NULL)`);
    }
    if (menuTypeFilters.length > 0) {
        whereClause += ` AND (${menuTypeFilters.join(' OR ')})`;
    }
    let query_str = `
        SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
        ${whereClause}
        ORDER BY name
    `;
    if (limit) {
        query_str += ` LIMIT $${paramIndex}`;
        params.push(limit);
    }
    const { rows: dispensaries } = await (0, connection_1.query)(query_str, params);
    console.log(`[MenuDetection] Found ${dispensaries.length} dispensaries to process (includeWebsiteCrawl=${includeWebsiteCrawl})`);
    const result = {
        totalProcessed: 0,
        totalSucceeded: 0,
        totalFailed: 0,
        totalSkipped: 0,
        results: [],
        errors: [],
    };
    for (const row of dispensaries) {
        result.totalProcessed++;
        try {
            const detectionResult = await detectAndResolveDispensary(row.id);
            result.results.push(detectionResult);
            if (detectionResult.success) {
                result.totalSucceeded++;
            }
            else {
                result.totalFailed++;
                if (detectionResult.error) {
                    result.errors.push(`${detectionResult.dispensaryName}: ${detectionResult.error}`);
                }
            }
            // Rate limit between requests
            await new Promise((resolve) => setTimeout(resolve, 1000));
        }
        catch (error) {
            // A thrown detection counts as a failure but does not stop the run.
            result.totalFailed++;
            result.errors.push(`${row.name || row.id}: ${error.message}`);
        }
    }
    console.log(`[MenuDetection] Bulk detection complete: ${result.totalSucceeded} succeeded, ${result.totalFailed} failed`);
    return result;
}
// ============================================================
// SCHEDULED JOB EXECUTOR
// ============================================================
/**
 * Scheduler entry point: run bulk menu detection and summarize the outcome.
 *
 * @param {object} [config]
 * @param {string} [config.state] - State to process; falls back to 'AZ'.
 * @param {boolean} [config.onlyUnknown] - Treated as true unless explicitly false.
 * @param {boolean} [config.onlyMissingPlatformId] - Falls back to false.
 * @returns {Promise<object>} Job summary with `status` ('success' when nothing
 *   failed, 'error' when nothing succeeded, otherwise 'partial'), item counters,
 *   the first five error messages joined by '; ' and a `metadata` object. If
 *   the bulk run throws, an 'error' summary with zeroed counters is returned
 *   instead of rejecting.
 */
async function executeMenuDetectionJob(config = {}) {
    const jobOptions = {
        state: config.state || 'AZ',
        onlyUnknown: config.onlyUnknown !== false,
        onlyMissingPlatformId: config.onlyMissingPlatformId || false,
    };
    console.log(`[MenuDetection] Executing scheduled job for state=${jobOptions.state}...`);
    try {
        const bulk = await runBulkDetection({ ...jobOptions });
        let status = 'partial';
        if (bulk.totalFailed === 0) {
            status = 'success';
        }
        else if (bulk.totalSucceeded === 0) {
            status = 'error';
        }
        // Only the first five messages are reported to keep the summary short.
        const hasErrors = bulk.errors.length > 0;
        const errorMessage = hasErrors ? bulk.errors.slice(0, 5).join('; ') : undefined;
        return {
            status,
            itemsProcessed: bulk.totalProcessed,
            itemsSucceeded: bulk.totalSucceeded,
            itemsFailed: bulk.totalFailed,
            errorMessage,
            metadata: {
                ...jobOptions,
                providerCounts: countByProvider(bulk.results),
            },
        };
    }
    catch (error) {
        return {
            status: 'error',
            itemsProcessed: 0,
            itemsSucceeded: 0,
            itemsFailed: 0,
            errorMessage: error.message,
        };
    }
}
/**
 * Tally detection results by their `detectedProvider` value.
 *
 * @param {Array<{detectedProvider: string}>} results - Detection results.
 * @returns {Object<string, number>} Map of provider name to number of results.
 */
function countByProvider(results) {
    return results.reduce((tally, { detectedProvider }) => {
        tally[detectedProvider] = (tally[detectedProvider] || 0) + 1;
        return tally;
    }, {});
}
// ============================================================
// UTILITY FUNCTIONS
// ============================================================
/**
 * Get detection stats for the dashboard.
 *
 * @param {object} [options]
 * @param {string} [options.state='AZ'] - State whose dispensaries are counted.
 *   Calling without arguments keeps the previous Arizona-only behavior.
 * @returns {Promise<{totalDispensaries: number, withMenuType: number,
 *   withPlatformId: number, needsDetection: number,
 *   byProvider: Object<string, number>}>} Aggregate counts plus a breakdown of
 *   non-empty menu_type values. All counts are 0 when the query returns no row.
 */
async function getDetectionStats(options = {}) {
    const { state = 'AZ' } = options;
    const { rows } = await (0, connection_1.query)(`
        SELECT
        COUNT(*) as total,
        COUNT(*) FILTER (WHERE menu_type IS NOT NULL AND menu_type != '' AND menu_type != 'unknown') as with_menu_type,
        COUNT(*) FILTER (WHERE platform_dispensary_id IS NOT NULL) as with_platform_id,
        COUNT(*) FILTER (WHERE menu_url IS NOT NULL AND (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown')) as needs_detection
        FROM dispensaries
        WHERE state = $1
    `, [state]);
    const stats = rows[0] || {};
    // Get provider breakdown
    const { rows: providerRows } = await (0, connection_1.query)(`
        SELECT menu_type, COUNT(*) as count
        FROM dispensaries
        WHERE state = $1 AND menu_type IS NOT NULL AND menu_type != ''
        GROUP BY menu_type
        ORDER BY count DESC
    `, [state]);
    // Counts are parsed from the values the query layer returns for COUNT(*);
    // a missing value falls back to '0'.
    const toCount = (value) => Number.parseInt(value || '0', 10);
    const byProvider = {};
    for (const row of providerRows) {
        byProvider[row.menu_type] = Number.parseInt(row.count, 10);
    }
    return {
        totalDispensaries: toCount(stats.total),
        withMenuType: toCount(stats.with_menu_type),
        withPlatformId: toCount(stats.with_platform_id),
        needsDetection: toCount(stats.needs_detection),
        byProvider,
    };
}
/**
 * Get dispensaries needing detection.
 *
 * A dispensary qualifies when it has a menu_url but no usable menu_type (or is
 * a dutchie store without a platform_dispensary_id) or, when
 * `includeWebsiteCrawl` is set, when it has no menu_url but does have a
 * website and is not flagged not_crawlable.
 *
 * @param {object} [options]
 * @param {string} [options.state='AZ'] - State to search.
 * @param {number} [options.limit=100] - Maximum number of rows returned.
 * @param {boolean} [options.includeWebsiteCrawl=true] - Include dispensaries
 *   that need a website crawl to discover their menu_url.
 * @returns {Promise<object[]>} Matching rows, ordered by name and mapped with
 *   `mapDbRowToDispensary`.
 */
async function getDispensariesNeedingDetection(options = {}) {
    const { state = 'AZ', limit = 100, includeWebsiteCrawl = true } = options;
    // `IS NOT TRUE` is deliberate: `NOT (...)::boolean` is NULL when the
    // 'not_crawlable' key is absent, which would silently drop those rows.
    // It also covers a NULL provider_detection_data.
    const websiteCrawlClause = includeWebsiteCrawl ? `OR (
            menu_url IS NULL
            AND website IS NOT NULL
            AND website != ''
            AND (provider_detection_data->>'not_crawlable')::boolean IS NOT TRUE
        )` : '';
    const { rows } = await (0, connection_1.query)(`
        SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
        WHERE state = $1
        AND (
            (menu_url IS NOT NULL AND (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown'
            OR (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)))
            ${websiteCrawlClause}
        )
        ORDER BY name
        LIMIT $2
    `, [state, limit]);
    // Call the mapper with the row only; Array.prototype.map would otherwise
    // also pass the index and the whole array.
    return rows.map((row) => (0, discovery_1.mapDbRowToDispensary)(row));
}