Improve menu detection to extract platform ID from URL and crawl proprietary domains

- Add extractFromMenuUrl() to discovery.ts that extracts either cName or platformId directly
  from Dutchie URLs (handles /api/v2/embedded-menu/<id>.js pattern)
- Add isObjectId() helper to identify MongoDB ObjectIds in URLs
- Update menu-detection.ts to skip GraphQL resolution when URL contains platformId directly
- For proprietary domains (curaleaf, sol), crawl website to find actual menu provider
  instead of blindly marking as not_crawlable
- If website crawl finds Dutchie embedded menu, set menu_type='dutchie' and resolve platform ID
- Tested successfully against consumeaz.com, where the website crawl discovers the Dutchie embedded-menu JS URL

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-03 21:41:45 -07:00
parent 202a3b92bf
commit 95fc8bb4cc
2 changed files with 209 additions and 76 deletions

View File

@@ -146,24 +146,59 @@ export async function discoverDispensaries(): Promise<{ discovered: number; erro
}
/**
* Extract cName (slug) from a Dutchie menu_url
* Supports formats:
* - https://dutchie.com/embedded-menu/<cName>
* - https://dutchie.com/dispensary/<cName>
* Check if a string looks like a MongoDB ObjectId (24 hex chars)
*/
export function extractCNameFromMenuUrl(menuUrl: string | null | undefined): string | null {
export function isObjectId(value: string): boolean {
  // Exactly 24 hexadecimal characters, upper or lower case, with nothing
  // before or after them - the textual form of a MongoDB ObjectId.
  const objectIdPattern = /^[a-f0-9]{24}$/i;
  const looksLikeObjectId = objectIdPattern.test(value);
  return looksLikeObjectId;
}
/**
* Extract cName (slug) or platform_dispensary_id from a Dutchie menu_url
*
* Supports formats:
* - https://dutchie.com/embedded-menu/<cName> -> returns { type: 'cName', value: '<cName>' }
* - https://dutchie.com/dispensary/<cName> -> returns { type: 'cName', value: '<cName>' }
* - https://dutchie.com/api/v2/embedded-menu/<id>.js -> returns { type: 'platformId', value: '<id>' }
*
* For backward compatibility, extractCNameFromMenuUrl still returns just the string value
* (note: that value is a platform ID rather than a cName when the URL embeds an ObjectId).
*/
export interface MenuUrlExtraction {
// Discriminator telling callers what `value` holds: 'cName' for a slug taken
// from the URL path, 'platformId' for a 24-hex-char ObjectId taken from it.
type: 'cName' | 'platformId';
// The path segment captured from the menu URL (the slug or the ID itself).
value: string;
}
export function extractFromMenuUrl(menuUrl: string | null | undefined): MenuUrlExtraction | null {
if (!menuUrl) return null;
try {
const url = new URL(menuUrl);
const pathname = url.pathname;
// Match /api/v2/embedded-menu/<id>.js - this contains the platform_dispensary_id directly
const apiMatch = pathname.match(/^\/api\/v2\/embedded-menu\/([a-f0-9]{24})\.js$/i);
if (apiMatch) {
return { type: 'platformId', value: apiMatch[1] };
}
// Match /embedded-menu/<cName> or /dispensary/<cName>
const embeddedMatch = pathname.match(/^\/embedded-menu\/([^/?]+)/);
if (embeddedMatch) return embeddedMatch[1];
if (embeddedMatch) {
const value = embeddedMatch[1];
// Check if it's actually an ObjectId (some URLs use ID directly)
if (isObjectId(value)) {
return { type: 'platformId', value };
}
return { type: 'cName', value };
}
const dispensaryMatch = pathname.match(/^\/dispensary\/([^/?]+)/);
if (dispensaryMatch) return dispensaryMatch[1];
if (dispensaryMatch) {
const value = dispensaryMatch[1];
if (isObjectId(value)) {
return { type: 'platformId', value };
}
return { type: 'cName', value };
}
return null;
} catch {
@@ -171,6 +206,15 @@ export function extractCNameFromMenuUrl(menuUrl: string | null | undefined): str
}
}
/**
* Extract cName (slug) from a Dutchie menu_url
* Backward compatible - use extractFromMenuUrl for full info
*/
export function extractCNameFromMenuUrl(menuUrl: string | null | undefined): string | null {
  // Delegate the URL parsing; this wrapper only discards the `type` tag so
  // existing callers keep receiving a plain string (or null).
  const parsed = extractFromMenuUrl(menuUrl);

  // No match at all, or a falsy captured value, is reported as null.
  if (!parsed || !parsed.value) {
    return null;
  }

  // The value is returned whatever its type, so for URLs that embed an
  // ObjectId this is the platform ID rather than a slug.
  return parsed.value;
}
/**
* Resolve platform dispensary IDs for all dispensaries that don't have one
* CRITICAL: Uses cName extracted from menu_url, NOT the slug column!

View File

@@ -12,7 +12,7 @@
*/
import { query } from '../db/connection';
import { extractCNameFromMenuUrl, mapDbRowToDispensary } from './discovery';
import { extractCNameFromMenuUrl, extractFromMenuUrl, mapDbRowToDispensary } from './discovery';
import { resolveDispensaryId } from './graphql-client';
import { Dispensary, JobStatus } from '../types';
@@ -461,51 +461,6 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
const previousMenuType = dispensary.menuType || null;
const website = dispensary.website;
// ============================================================
// CURALEAF CHECK: If website is Curaleaf, override any stale Dutchie menu_url
// This prevents 60s Dutchie timeouts for stores that have migrated to Curaleaf's platform
// ============================================================
if (isCuraleafUrl(website)) {
console.log(`[MenuDetection] ${dispensary.name}: Website is Curaleaf - marking as curaleaf provider`);
// Use the Curaleaf website URL as the menu_url (clearing stale Dutchie URL if any)
// At this point we know website is defined since isCuraleafUrl returned true
const curaleafUrl = extractCuraleafStoreUrl(website!) || website;
await query(
`
UPDATE dispensaries SET
menu_type = 'curaleaf',
menu_url = $1,
platform_dispensary_id = NULL,
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', 'curaleaf'::text,
'detection_method', 'website_pattern'::text,
'detected_at', NOW(),
'curaleaf_store_url', $1::text,
'stale_dutchie_url', $2::text,
'not_crawlable', true,
'not_crawlable_reason', 'Curaleaf proprietary menu - no Dutchie integration'::text
),
updated_at = NOW()
WHERE id = $3
`,
[curaleafUrl, menuUrl || null, dispensaryId]
);
return {
dispensaryId,
dispensaryName: dispensary.name,
previousMenuType,
detectedProvider: 'curaleaf',
cName: null,
platformDispensaryId: null,
success: true,
error: undefined,
};
}
// If menu_url is null or empty, try to discover it by crawling the dispensary website
if (!menuUrl || menuUrl.trim() === '') {
console.log(`[MenuDetection] ${dispensary.name}: No menu_url - attempting website crawl`);
@@ -640,44 +595,127 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
success: false,
};
// If not dutchie, just update menu_type and return
if (detectedProvider !== 'dutchie') {
// Special handling for proprietary providers - mark as not_crawlable until we have crawlers
const PROPRIETARY_PROVIDERS = ['curaleaf', 'sol'] as const;
const isProprietaryProvider = PROPRIETARY_PROVIDERS.includes(detectedProvider as any);
const notCrawlableReason = isProprietaryProvider
? `${detectedProvider} proprietary menu - no crawler available`
: null;
// For proprietary providers (curaleaf, sol), try website crawl to find an actual menu provider
// These sites may have Dutchie embedded menus even though their domain suggests proprietary
const PROPRIETARY_DOMAINS = ['curaleaf', 'sol'] as const;
const isPotentiallyProprietary = PROPRIETARY_DOMAINS.includes(detectedProvider as any);
if (isPotentiallyProprietary && website && website.trim() !== '') {
console.log(`[MenuDetection] ${dispensary.name}: Detected ${detectedProvider} domain - crawling website to find actual menu provider...`);
const crawlResult = await crawlWebsiteForMenuLinks(website);
if (crawlResult.menuUrl && crawlResult.provider !== 'unknown') {
// Found an actual menu provider (likely Dutchie) - use that instead!
console.log(`[MenuDetection] ${dispensary.name}: Website crawl found ${crawlResult.provider} menu at ${crawlResult.menuUrl}`);
menuUrl = crawlResult.menuUrl;
// Re-detect provider from the found URL
const actualProvider = detectProviderFromUrl(menuUrl);
// Update with the actual discovered provider
await query(
`
UPDATE dispensaries SET
menu_url = $1,
menu_type = $2,
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', $2::text,
'detection_method', 'website_crawl'::text,
'detected_at', NOW(),
'original_url_provider', $3::text,
'website_crawled', $4::text,
'website_crawl_pages', $5::jsonb,
'not_crawlable', false
),
updated_at = NOW()
WHERE id = $6
`,
[
crawlResult.menuUrl,
actualProvider,
detectedProvider,
website,
JSON.stringify(crawlResult.crawledPages),
dispensaryId
]
);
// If the actual provider is dutchie, continue to platform ID resolution
if (actualProvider === 'dutchie') {
result.detectedProvider = 'dutchie';
// Fall through to dutchie platform ID resolution below
} else {
// Found a different provider (treez, jane, etc.) - we're done
result.detectedProvider = actualProvider;
result.success = true;
console.log(`[MenuDetection] ${dispensary.name}: Updated menu_type to ${actualProvider} (discovered from website crawl)`);
return result;
}
} else {
// Website crawl didn't find any menu provider - mark as proprietary
const notCrawlableReason = `${detectedProvider} proprietary menu - no embedded menu provider found`;
console.log(`[MenuDetection] ${dispensary.name}: Website crawl found no menu provider - marking as ${detectedProvider}`);
await query(
`
UPDATE dispensaries SET
menu_type = $1,
platform_dispensary_id = CASE WHEN $3 THEN NULL ELSE platform_dispensary_id END,
platform_dispensary_id = NULL,
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', $1::text,
'detection_method', 'url_pattern_with_crawl'::text,
'detected_at', NOW(),
'website_crawled', $2::text,
'website_crawl_pages', $3::jsonb,
'not_crawlable', true,
'not_crawlable_reason', $4::text
),
updated_at = NOW()
WHERE id = $5
`,
[
detectedProvider,
website,
JSON.stringify(crawlResult.crawledPages),
notCrawlableReason,
dispensaryId
]
);
result.success = true;
return result;
}
}
// If not dutchie and not a proprietary domain we need to crawl, just update menu_type
if (detectedProvider !== 'dutchie') {
await query(
`
UPDATE dispensaries SET
menu_type = $1,
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', $1::text,
'detection_method', 'url_pattern'::text,
'detected_at', NOW(),
'not_crawlable', $3,
'not_crawlable_reason', $4::text
'not_crawlable', false
),
updated_at = NOW()
WHERE id = $2
`,
[detectedProvider, dispensaryId, isProprietaryProvider, notCrawlableReason]
[detectedProvider, dispensaryId]
);
result.success = true;
console.log(`[MenuDetection] ${dispensary.name}: Updated menu_type to ${detectedProvider}${isProprietaryProvider ? ' (not crawlable)' : ''}`);
console.log(`[MenuDetection] ${dispensary.name}: Updated menu_type to ${detectedProvider}`);
return result;
}
// For dutchie: extract cName and resolve platform ID
const cName = extractCNameFromMenuUrl(menuUrl);
result.cName = cName;
// For dutchie: extract cName or platformId from menu_url
const extraction = extractFromMenuUrl(menuUrl);
if (!cName) {
result.error = `Could not extract cName from menu_url: ${menuUrl}`;
if (!extraction) {
result.error = `Could not extract cName or platformId from menu_url: ${menuUrl}`;
await query(
`
UPDATE dispensaries SET
@@ -699,6 +737,41 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
return result;
}
// If URL contains platform_dispensary_id directly (e.g., /api/v2/embedded-menu/<id>.js), skip GraphQL resolution
if (extraction.type === 'platformId') {
const platformId = extraction.value;
result.platformDispensaryId = platformId;
result.success = true;
await query(
`
UPDATE dispensaries SET
menu_type = 'dutchie',
platform_dispensary_id = $1,
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', 'dutchie'::text,
'detection_method', 'url_direct_platform_id'::text,
'detected_at', NOW(),
'platform_id_source', 'url_embedded'::text,
'platform_id_resolved', true,
'platform_id_resolved_at', NOW(),
'resolution_error', NULL::text,
'not_crawlable', false
),
updated_at = NOW()
WHERE id = $2
`,
[platformId, dispensaryId]
);
console.log(`[MenuDetection] ${dispensary.name}: Platform ID extracted directly from URL = ${platformId}`);
return result;
}
// Otherwise, we have a cName that needs GraphQL resolution
const cName = extraction.value;
result.cName = cName;
// Resolve platform_dispensary_id from cName
console.log(`[MenuDetection] ${dispensary.name}: Resolving platform ID for cName = ${cName}`);
@@ -791,14 +864,23 @@ export async function runBulkDetection(options: {
onlyUnknown?: boolean;
onlyMissingPlatformId?: boolean;
includeWebsiteCrawl?: boolean; // Include dispensaries with website but no menu_url
includeDutchieMissingPlatformId?: boolean; // include menu_type='dutchie' with null platform_id
limit?: number;
} = {}): Promise<BulkDetectionResult> {
const { state, onlyUnknown = true, onlyMissingPlatformId = false, includeWebsiteCrawl = true, limit } = options;
const {
state,
onlyUnknown = true,
onlyMissingPlatformId = false,
includeWebsiteCrawl = true,
includeDutchieMissingPlatformId = true,
limit,
} = options;
console.log('[MenuDetection] Starting bulk detection...');
// Build query to find dispensaries needing detection
// Now includes: dispensaries with menu_url OR (no menu_url but has website and not already marked not_crawlable)
// Includes: dispensaries with menu_url OR (no menu_url but has website and not already marked not_crawlable)
// Optionally includes dutchie stores missing platform ID
let whereClause = `WHERE (
menu_url IS NOT NULL
${includeWebsiteCrawl ? `OR (
@@ -807,6 +889,9 @@ export async function runBulkDetection(options: {
AND website != ''
AND (provider_detection_data IS NULL OR NOT (provider_detection_data->>'not_crawlable')::boolean)
)` : ''}
${includeDutchieMissingPlatformId ? `OR (
menu_type = 'dutchie' AND platform_dispensary_id IS NULL
)` : ''}
)`;
const params: any[] = [];
let paramIndex = 1;
@@ -816,15 +901,17 @@ export async function runBulkDetection(options: {
params.push(state);
}
// Handle the combination of onlyUnknown and onlyMissingPlatformId
// If both are set, include: unknown menu_type OR (dutchie without platform_id)
// Handle filters for unknown and/or missing platform IDs
if (onlyUnknown && onlyMissingPlatformId) {
whereClause += ` AND (
(menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown')
OR (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)
)`;
} else if (onlyUnknown) {
whereClause += ` AND (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown')`;
whereClause += ` AND (
(menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown')
${includeDutchieMissingPlatformId ? `OR (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)` : ''}
)`;
} else if (onlyMissingPlatformId) {
whereClause += ` AND (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)`;
}
@@ -899,6 +986,7 @@ export async function executeMenuDetectionJob(config: Record<string, any> = {}):
const onlyUnknown = config.onlyUnknown !== false;
// Default to true - always try to resolve platform IDs for dutchie stores
const onlyMissingPlatformId = config.onlyMissingPlatformId !== false;
const includeDutchieMissingPlatformId = config.includeDutchieMissingPlatformId !== false;
console.log(`[MenuDetection] Executing scheduled job for state=${state}...`);
@@ -907,6 +995,7 @@ export async function executeMenuDetectionJob(config: Record<string, any> = {}):
state,
onlyUnknown,
onlyMissingPlatformId,
includeDutchieMissingPlatformId,
});
const status: JobStatus =