Improve menu detection to extract platform ID from URL and crawl proprietary domains
- Add extractFromMenuUrl() to discovery.ts, which extracts either the cName or the platformId directly from Dutchie URLs (handles the /api/v2/embedded-menu/<id>.js pattern)
- Add isObjectId() helper to identify MongoDB ObjectIds in URLs
- Update menu-detection.ts to skip GraphQL resolution when the URL contains the platformId directly
- For proprietary domains (curaleaf, sol), crawl the website to find the actual menu provider instead of blindly marking it as not_crawlable
- If the website crawl finds a Dutchie embedded menu, set menu_type='dutchie' and resolve the platform ID
- Tested successfully with consumeaz.com, which discovers the Dutchie embedded menu JS URL

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -146,24 +146,59 @@ export async function discoverDispensaries(): Promise<{ discovered: number; erro
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract cName (slug) from a Dutchie menu_url
|
||||
* Supports formats:
|
||||
* - https://dutchie.com/embedded-menu/<cName>
|
||||
* - https://dutchie.com/dispensary/<cName>
|
||||
* Check if a string looks like a MongoDB ObjectId (24 hex chars)
|
||||
*/
|
||||
export function extractCNameFromMenuUrl(menuUrl: string | null | undefined): string | null {
|
||||
/**
 * Check whether a string has the shape of a MongoDB ObjectId.
 *
 * @param value - Candidate string, e.g. a path segment taken from a menu URL.
 * @returns `true` when `value` is exactly 24 hexadecimal characters
 *   (upper or lower case); `false` otherwise.
 */
export function isObjectId(value: string): boolean {
  // Anchored at both ends so longer strings that merely contain 24 hex chars are rejected.
  return /^[a-f0-9]{24}$/i.test(value);
}
|
||||
|
||||
/**
|
||||
* Extract cName (slug) or platform_dispensary_id from a Dutchie menu_url
|
||||
*
|
||||
* Supports formats:
|
||||
* - https://dutchie.com/embedded-menu/<cName> -> returns { type: 'cName', value: '<cName>' }
|
||||
* - https://dutchie.com/dispensary/<cName> -> returns { type: 'cName', value: '<cName>' }
|
||||
* - https://dutchie.com/api/v2/embedded-menu/<id>.js -> returns { type: 'platformId', value: '<id>' }
|
||||
*
|
||||
* For backward compatibility, extractCNameFromMenuUrl still returns just the string value.
|
||||
*/
|
||||
/**
 * Result of parsing a Dutchie menu URL.
 */
export interface MenuUrlExtraction {
  /** Which kind of identifier was found in the URL path: a cName slug or a platform dispensary ID. */
  type: 'cName' | 'platformId';
  /** The identifier as captured from the URL path. */
  value: string;
}
|
||||
|
||||
export function extractFromMenuUrl(menuUrl: string | null | undefined): MenuUrlExtraction | null {
|
||||
if (!menuUrl) return null;
|
||||
|
||||
try {
|
||||
const url = new URL(menuUrl);
|
||||
const pathname = url.pathname;
|
||||
|
||||
// Match /api/v2/embedded-menu/<id>.js - this contains the platform_dispensary_id directly
|
||||
const apiMatch = pathname.match(/^\/api\/v2\/embedded-menu\/([a-f0-9]{24})\.js$/i);
|
||||
if (apiMatch) {
|
||||
return { type: 'platformId', value: apiMatch[1] };
|
||||
}
|
||||
|
||||
// Match /embedded-menu/<cName> or /dispensary/<cName>
|
||||
const embeddedMatch = pathname.match(/^\/embedded-menu\/([^/?]+)/);
|
||||
if (embeddedMatch) return embeddedMatch[1];
|
||||
if (embeddedMatch) {
|
||||
const value = embeddedMatch[1];
|
||||
// Check if it's actually an ObjectId (some URLs use ID directly)
|
||||
if (isObjectId(value)) {
|
||||
return { type: 'platformId', value };
|
||||
}
|
||||
return { type: 'cName', value };
|
||||
}
|
||||
|
||||
const dispensaryMatch = pathname.match(/^\/dispensary\/([^/?]+)/);
|
||||
if (dispensaryMatch) return dispensaryMatch[1];
|
||||
if (dispensaryMatch) {
|
||||
const value = dispensaryMatch[1];
|
||||
if (isObjectId(value)) {
|
||||
return { type: 'platformId', value };
|
||||
}
|
||||
return { type: 'cName', value };
|
||||
}
|
||||
|
||||
return null;
|
||||
} catch {
|
||||
@@ -171,6 +206,15 @@ export function extractCNameFromMenuUrl(menuUrl: string | null | undefined): str
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Extract the identifier from a Dutchie menu_url as a plain string.
 *
 * Backward-compatible wrapper around extractFromMenuUrl(); call that function
 * instead when you need to know whether the identifier is a cName or a
 * platform ID.
 *
 * NOTE: the `type` of the extraction is discarded here, so the returned
 * string is whatever extractFromMenuUrl() found - it can be a platform
 * dispensary ID rather than a cName when the URL embeds an ObjectId.
 *
 * @param menuUrl - Menu URL to parse; may be null or undefined.
 * @returns The extracted identifier, or null when nothing could be extracted.
 */
export function extractCNameFromMenuUrl(menuUrl: string | null | undefined): string | null {
  const extraction = extractFromMenuUrl(menuUrl);
  // `|| null` normalizes both "no extraction" and an empty value to null.
  return extraction?.value || null;
}
|
||||
|
||||
/**
|
||||
* Resolve platform dispensary IDs for all dispensaries that don't have one
|
||||
* CRITICAL: Uses cName extracted from menu_url, NOT the slug column!
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
*/
|
||||
|
||||
import { query } from '../db/connection';
|
||||
import { extractCNameFromMenuUrl, mapDbRowToDispensary } from './discovery';
|
||||
import { extractCNameFromMenuUrl, extractFromMenuUrl, mapDbRowToDispensary } from './discovery';
|
||||
import { resolveDispensaryId } from './graphql-client';
|
||||
import { Dispensary, JobStatus } from '../types';
|
||||
|
||||
@@ -461,51 +461,6 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
|
||||
const previousMenuType = dispensary.menuType || null;
|
||||
const website = dispensary.website;
|
||||
|
||||
// ============================================================
|
||||
// CURALEAF CHECK: If website is Curaleaf, override any stale Dutchie menu_url
|
||||
// This prevents 60s Dutchie timeouts for stores that have migrated to Curaleaf's platform
|
||||
// ============================================================
|
||||
if (isCuraleafUrl(website)) {
|
||||
console.log(`[MenuDetection] ${dispensary.name}: Website is Curaleaf - marking as curaleaf provider`);
|
||||
|
||||
// Use the Curaleaf website URL as the menu_url (clearing stale Dutchie URL if any)
|
||||
// At this point we know website is defined since isCuraleafUrl returned true
|
||||
const curaleafUrl = extractCuraleafStoreUrl(website!) || website;
|
||||
|
||||
await query(
|
||||
`
|
||||
UPDATE dispensaries SET
|
||||
menu_type = 'curaleaf',
|
||||
menu_url = $1,
|
||||
platform_dispensary_id = NULL,
|
||||
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
|
||||
jsonb_build_object(
|
||||
'detected_provider', 'curaleaf'::text,
|
||||
'detection_method', 'website_pattern'::text,
|
||||
'detected_at', NOW(),
|
||||
'curaleaf_store_url', $1::text,
|
||||
'stale_dutchie_url', $2::text,
|
||||
'not_crawlable', true,
|
||||
'not_crawlable_reason', 'Curaleaf proprietary menu - no Dutchie integration'::text
|
||||
),
|
||||
updated_at = NOW()
|
||||
WHERE id = $3
|
||||
`,
|
||||
[curaleafUrl, menuUrl || null, dispensaryId]
|
||||
);
|
||||
|
||||
return {
|
||||
dispensaryId,
|
||||
dispensaryName: dispensary.name,
|
||||
previousMenuType,
|
||||
detectedProvider: 'curaleaf',
|
||||
cName: null,
|
||||
platformDispensaryId: null,
|
||||
success: true,
|
||||
error: undefined,
|
||||
};
|
||||
}
|
||||
|
||||
// If menu_url is null or empty, try to discover it by crawling the dispensary website
|
||||
if (!menuUrl || menuUrl.trim() === '') {
|
||||
console.log(`[MenuDetection] ${dispensary.name}: No menu_url - attempting website crawl`);
|
||||
@@ -640,44 +595,127 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
|
||||
success: false,
|
||||
};
|
||||
|
||||
// If not dutchie, just update menu_type and return
|
||||
if (detectedProvider !== 'dutchie') {
|
||||
// Special handling for proprietary providers - mark as not_crawlable until we have crawlers
|
||||
const PROPRIETARY_PROVIDERS = ['curaleaf', 'sol'] as const;
|
||||
const isProprietaryProvider = PROPRIETARY_PROVIDERS.includes(detectedProvider as any);
|
||||
const notCrawlableReason = isProprietaryProvider
|
||||
? `${detectedProvider} proprietary menu - no crawler available`
|
||||
: null;
|
||||
// For proprietary providers (curaleaf, sol), try website crawl to find an actual menu provider
|
||||
// These sites may have Dutchie embedded menus even though their domain suggests proprietary
|
||||
const PROPRIETARY_DOMAINS = ['curaleaf', 'sol'] as const;
|
||||
const isPotentiallyProprietary = PROPRIETARY_DOMAINS.includes(detectedProvider as any);
|
||||
|
||||
if (isPotentiallyProprietary && website && website.trim() !== '') {
|
||||
console.log(`[MenuDetection] ${dispensary.name}: Detected ${detectedProvider} domain - crawling website to find actual menu provider...`);
|
||||
const crawlResult = await crawlWebsiteForMenuLinks(website);
|
||||
|
||||
if (crawlResult.menuUrl && crawlResult.provider !== 'unknown') {
|
||||
// Found an actual menu provider (likely Dutchie) - use that instead!
|
||||
console.log(`[MenuDetection] ${dispensary.name}: Website crawl found ${crawlResult.provider} menu at ${crawlResult.menuUrl}`);
|
||||
menuUrl = crawlResult.menuUrl;
|
||||
|
||||
// Re-detect provider from the found URL
|
||||
const actualProvider = detectProviderFromUrl(menuUrl);
|
||||
|
||||
// Update with the actual discovered provider
|
||||
await query(
|
||||
`
|
||||
UPDATE dispensaries SET
|
||||
menu_url = $1,
|
||||
menu_type = $2,
|
||||
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
|
||||
jsonb_build_object(
|
||||
'detected_provider', $2::text,
|
||||
'detection_method', 'website_crawl'::text,
|
||||
'detected_at', NOW(),
|
||||
'original_url_provider', $3::text,
|
||||
'website_crawled', $4::text,
|
||||
'website_crawl_pages', $5::jsonb,
|
||||
'not_crawlable', false
|
||||
),
|
||||
updated_at = NOW()
|
||||
WHERE id = $6
|
||||
`,
|
||||
[
|
||||
crawlResult.menuUrl,
|
||||
actualProvider,
|
||||
detectedProvider,
|
||||
website,
|
||||
JSON.stringify(crawlResult.crawledPages),
|
||||
dispensaryId
|
||||
]
|
||||
);
|
||||
|
||||
// If the actual provider is dutchie, continue to platform ID resolution
|
||||
if (actualProvider === 'dutchie') {
|
||||
result.detectedProvider = 'dutchie';
|
||||
// Fall through to dutchie platform ID resolution below
|
||||
} else {
|
||||
// Found a different provider (treez, jane, etc.) - we're done
|
||||
result.detectedProvider = actualProvider;
|
||||
result.success = true;
|
||||
console.log(`[MenuDetection] ${dispensary.name}: Updated menu_type to ${actualProvider} (discovered from website crawl)`);
|
||||
return result;
|
||||
}
|
||||
} else {
|
||||
// Website crawl didn't find any menu provider - mark as proprietary
|
||||
const notCrawlableReason = `${detectedProvider} proprietary menu - no embedded menu provider found`;
|
||||
console.log(`[MenuDetection] ${dispensary.name}: Website crawl found no menu provider - marking as ${detectedProvider}`);
|
||||
|
||||
await query(
|
||||
`
|
||||
UPDATE dispensaries SET
|
||||
menu_type = $1,
|
||||
platform_dispensary_id = NULL,
|
||||
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
|
||||
jsonb_build_object(
|
||||
'detected_provider', $1::text,
|
||||
'detection_method', 'url_pattern_with_crawl'::text,
|
||||
'detected_at', NOW(),
|
||||
'website_crawled', $2::text,
|
||||
'website_crawl_pages', $3::jsonb,
|
||||
'not_crawlable', true,
|
||||
'not_crawlable_reason', $4::text
|
||||
),
|
||||
updated_at = NOW()
|
||||
WHERE id = $5
|
||||
`,
|
||||
[
|
||||
detectedProvider,
|
||||
website,
|
||||
JSON.stringify(crawlResult.crawledPages),
|
||||
notCrawlableReason,
|
||||
dispensaryId
|
||||
]
|
||||
);
|
||||
result.success = true;
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
// If not dutchie and not a proprietary domain we need to crawl, just update menu_type
|
||||
if (detectedProvider !== 'dutchie') {
|
||||
await query(
|
||||
`
|
||||
UPDATE dispensaries SET
|
||||
menu_type = $1,
|
||||
platform_dispensary_id = CASE WHEN $3 THEN NULL ELSE platform_dispensary_id END,
|
||||
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
|
||||
jsonb_build_object(
|
||||
'detected_provider', $1::text,
|
||||
'detection_method', 'url_pattern'::text,
|
||||
'detected_at', NOW(),
|
||||
'not_crawlable', $3,
|
||||
'not_crawlable_reason', $4::text
|
||||
'not_crawlable', false
|
||||
),
|
||||
updated_at = NOW()
|
||||
WHERE id = $2
|
||||
`,
|
||||
[detectedProvider, dispensaryId, isProprietaryProvider, notCrawlableReason]
|
||||
[detectedProvider, dispensaryId]
|
||||
);
|
||||
result.success = true;
|
||||
console.log(`[MenuDetection] ${dispensary.name}: Updated menu_type to ${detectedProvider}${isProprietaryProvider ? ' (not crawlable)' : ''}`);
|
||||
console.log(`[MenuDetection] ${dispensary.name}: Updated menu_type to ${detectedProvider}`);
|
||||
return result;
|
||||
}
|
||||
|
||||
// For dutchie: extract cName and resolve platform ID
|
||||
const cName = extractCNameFromMenuUrl(menuUrl);
|
||||
result.cName = cName;
|
||||
// For dutchie: extract cName or platformId from menu_url
|
||||
const extraction = extractFromMenuUrl(menuUrl);
|
||||
|
||||
if (!cName) {
|
||||
result.error = `Could not extract cName from menu_url: ${menuUrl}`;
|
||||
if (!extraction) {
|
||||
result.error = `Could not extract cName or platformId from menu_url: ${menuUrl}`;
|
||||
await query(
|
||||
`
|
||||
UPDATE dispensaries SET
|
||||
@@ -699,6 +737,41 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
|
||||
return result;
|
||||
}
|
||||
|
||||
// If URL contains platform_dispensary_id directly (e.g., /api/v2/embedded-menu/<id>.js), skip GraphQL resolution
|
||||
if (extraction.type === 'platformId') {
|
||||
const platformId = extraction.value;
|
||||
result.platformDispensaryId = platformId;
|
||||
result.success = true;
|
||||
|
||||
await query(
|
||||
`
|
||||
UPDATE dispensaries SET
|
||||
menu_type = 'dutchie',
|
||||
platform_dispensary_id = $1,
|
||||
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
|
||||
jsonb_build_object(
|
||||
'detected_provider', 'dutchie'::text,
|
||||
'detection_method', 'url_direct_platform_id'::text,
|
||||
'detected_at', NOW(),
|
||||
'platform_id_source', 'url_embedded'::text,
|
||||
'platform_id_resolved', true,
|
||||
'platform_id_resolved_at', NOW(),
|
||||
'resolution_error', NULL::text,
|
||||
'not_crawlable', false
|
||||
),
|
||||
updated_at = NOW()
|
||||
WHERE id = $2
|
||||
`,
|
||||
[platformId, dispensaryId]
|
||||
);
|
||||
console.log(`[MenuDetection] ${dispensary.name}: Platform ID extracted directly from URL = ${platformId}`);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Otherwise, we have a cName that needs GraphQL resolution
|
||||
const cName = extraction.value;
|
||||
result.cName = cName;
|
||||
|
||||
// Resolve platform_dispensary_id from cName
|
||||
console.log(`[MenuDetection] ${dispensary.name}: Resolving platform ID for cName = ${cName}`);
|
||||
|
||||
@@ -791,14 +864,23 @@ export async function runBulkDetection(options: {
|
||||
onlyUnknown?: boolean;
|
||||
onlyMissingPlatformId?: boolean;
|
||||
includeWebsiteCrawl?: boolean; // Include dispensaries with website but no menu_url
|
||||
includeDutchieMissingPlatformId?: boolean; // include menu_type='dutchie' with null platform_id
|
||||
limit?: number;
|
||||
} = {}): Promise<BulkDetectionResult> {
|
||||
const { state, onlyUnknown = true, onlyMissingPlatformId = false, includeWebsiteCrawl = true, limit } = options;
|
||||
const {
|
||||
state,
|
||||
onlyUnknown = true,
|
||||
onlyMissingPlatformId = false,
|
||||
includeWebsiteCrawl = true,
|
||||
includeDutchieMissingPlatformId = true,
|
||||
limit,
|
||||
} = options;
|
||||
|
||||
console.log('[MenuDetection] Starting bulk detection...');
|
||||
|
||||
// Build query to find dispensaries needing detection
|
||||
// Now includes: dispensaries with menu_url OR (no menu_url but has website and not already marked not_crawlable)
|
||||
// Includes: dispensaries with menu_url OR (no menu_url but has website and not already marked not_crawlable)
|
||||
// Optionally includes dutchie stores missing platform ID
|
||||
let whereClause = `WHERE (
|
||||
menu_url IS NOT NULL
|
||||
${includeWebsiteCrawl ? `OR (
|
||||
@@ -807,6 +889,9 @@ export async function runBulkDetection(options: {
|
||||
AND website != ''
|
||||
AND (provider_detection_data IS NULL OR NOT (provider_detection_data->>'not_crawlable')::boolean)
|
||||
)` : ''}
|
||||
${includeDutchieMissingPlatformId ? `OR (
|
||||
menu_type = 'dutchie' AND platform_dispensary_id IS NULL
|
||||
)` : ''}
|
||||
)`;
|
||||
const params: any[] = [];
|
||||
let paramIndex = 1;
|
||||
@@ -816,15 +901,17 @@ export async function runBulkDetection(options: {
|
||||
params.push(state);
|
||||
}
|
||||
|
||||
// Handle the combination of onlyUnknown and onlyMissingPlatformId
|
||||
// If both are set, include: unknown menu_type OR (dutchie without platform_id)
|
||||
// Handle filters for unknown and/or missing platform IDs
|
||||
if (onlyUnknown && onlyMissingPlatformId) {
|
||||
whereClause += ` AND (
|
||||
(menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown')
|
||||
OR (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)
|
||||
)`;
|
||||
} else if (onlyUnknown) {
|
||||
whereClause += ` AND (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown')`;
|
||||
whereClause += ` AND (
|
||||
(menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown')
|
||||
${includeDutchieMissingPlatformId ? `OR (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)` : ''}
|
||||
)`;
|
||||
} else if (onlyMissingPlatformId) {
|
||||
whereClause += ` AND (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)`;
|
||||
}
|
||||
@@ -899,6 +986,7 @@ export async function executeMenuDetectionJob(config: Record<string, any> = {}):
|
||||
const onlyUnknown = config.onlyUnknown !== false;
|
||||
// Default to true - always try to resolve platform IDs for dutchie stores
|
||||
const onlyMissingPlatformId = config.onlyMissingPlatformId !== false;
|
||||
const includeDutchieMissingPlatformId = config.includeDutchieMissingPlatformId !== false;
|
||||
|
||||
console.log(`[MenuDetection] Executing scheduled job for state=${state}...`);
|
||||
|
||||
@@ -907,6 +995,7 @@ export async function executeMenuDetectionJob(config: Record<string, any> = {}):
|
||||
state,
|
||||
onlyUnknown,
|
||||
onlyMissingPlatformId,
|
||||
includeDutchieMissingPlatformId,
|
||||
});
|
||||
|
||||
const status: JobStatus =
|
||||
|
||||
Reference in New Issue
Block a user