feat(detection): extract reactEnv dispensaryId and prefer Dutchie on page

This commit is contained in:
Kelly
2025-12-03 22:31:40 -07:00
parent 129d318314
commit 1083b51e6d

View File

@@ -208,6 +208,12 @@ async function fetchPageLinks(url: string, timeout: number = 10000): Promise<{ l
const html = await response.text();
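// Quick check: if the page contains reactEnv.dispensaryId, treat it as Dutchie and
// return a single synthetic `dutchie-reactenv:<id>` entry instead of the page's links.
// NOTE(review): this pattern relies on [^}]*, which cannot match when reactEnv
// contains a nested object (a closing brace) before "dispensaryId".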
const reactEnvMatch = /window\.reactEnv\s*=\s*\{[^}]*"dispensaryId"\s*:\s*"([a-fA-F0-9]+)"/i.exec(html);
if (reactEnvMatch && reactEnvMatch[1]) {
return { links: [`dutchie-reactenv:${reactEnvMatch[1]}`] };
}
// Extract all href attributes from anchor tags
const linkRegex = /href=["']([^"']+)["']/gi;
const links: string[] = [];
@@ -305,7 +311,8 @@ export async function crawlWebsiteForMenuLinks(websiteUrl: string): Promise<Webs
});
if (resp.ok) {
const html = await resp.text();
const reactEnvMatch = /window\.reactEnv\s*=\s*\{[^}]*"dispensaryId"\s*:\s*"([a-fA-F0-9]+)"/i.exec(html);
// Look for dispensaryId directly - the [^}]* pattern fails with nested braces
const reactEnvMatch = /"dispensaryId"\s*:\s*"([a-fA-F0-9]{24})"/i.exec(html);
if (reactEnvMatch && reactEnvMatch[1]) {
result.provider = 'dutchie';
result.menuUrl = homepage;
@@ -318,7 +325,19 @@ export async function crawlWebsiteForMenuLinks(websiteUrl: string): Promise<Webs
console.log(`[WebsiteCrawl] reactEnv check failed for ${homepage}: ${err.message}`);
}
// Step 2: Check for direct provider matches in homepage links
// Step 2: Check for reactEnv token from fetchPageLinks (encoded as dutchie-reactenv:<id>)
for (const link of homepageLinks) {
const reactEnvToken = /^dutchie-reactenv:(.+)$/.exec(link);
if (reactEnvToken) {
result.menuUrl = homepage;
result.provider = 'dutchie';
result.platformDispensaryId = reactEnvToken[1];
console.log(`[WebsiteCrawl] Found reactEnv.dispensaryId=${reactEnvToken[1]} on ${homepage}`);
return result;
}
}
// Step 3: Check for direct provider matches in homepage links
for (const link of homepageLinks) {
for (const { provider, patterns } of PROVIDER_URL_PATTERNS) {
if (patterns.some(p => p.test(link))) {
@@ -330,7 +349,7 @@ export async function crawlWebsiteForMenuLinks(websiteUrl: string): Promise<Webs
}
}
// Step 3: Find menu/order/shop links to follow
// Step 4: Find menu/order/shop links to follow
const menuLinks = homepageLinks.filter(link => {
// Must be same domain or a known provider domain
try {
@@ -821,6 +840,64 @@ if (isSpecialDomain && website && website.trim() !== '') {
);
console.log(`[MenuDetection] ${dispensary.name}: Resolved platform ID = ${platformId}`);
} else {
// cName resolution failed - try crawling website as fallback
console.log(`[MenuDetection] ${dispensary.name}: cName "${cName}" not found on Dutchie, trying website crawl fallback...`);
if (website && website.trim() !== '') {
const fallbackCrawl = await crawlWebsiteForMenuLinks(website);
if (fallbackCrawl.menuUrl && fallbackCrawl.provider === 'dutchie') {
// Found Dutchie menu via website crawl!
console.log(`[MenuDetection] ${dispensary.name}: Found Dutchie menu via website crawl: ${fallbackCrawl.menuUrl}`);
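// Extract either a platform ID or a cName from the menu URL found by the crawl.
// NOTE(review): when the crawl matched via reactEnv, menuUrl is the homepage and the
// ID is carried in fallbackCrawl.platformDispensaryId - confirm that case is handled here.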
const newExtraction = extractFromMenuUrl(fallbackCrawl.menuUrl);
if (newExtraction) {
let fallbackPlatformId: string | null = null;
if (newExtraction.type === 'platformId') {
fallbackPlatformId = newExtraction.value;
} else {
// Try to resolve the new cName
fallbackPlatformId = await resolveDispensaryId(newExtraction.value);
}
if (fallbackPlatformId) {
result.platformDispensaryId = fallbackPlatformId;
result.success = true;
result.cName = newExtraction.value;
await query(
`
UPDATE dispensaries SET
menu_type = 'dutchie',
menu_url = $1,
platform_dispensary_id = $2,
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', 'dutchie'::text,
'detection_method', 'website_crawl_fallback'::text,
'detected_at', NOW(),
'original_cname', $3::text,
'fallback_cname', $4::text,
'website_crawled', $5::text,
'platform_id_resolved', true,
'platform_id_resolved_at', NOW(),
'not_crawlable', false
),
updated_at = NOW()
WHERE id = $6
`,
[fallbackCrawl.menuUrl, fallbackPlatformId, cName, newExtraction.value, website, dispensaryId]
);
console.log(`[MenuDetection] ${dispensary.name}: Resolved via website crawl, platform ID = ${fallbackPlatformId}`);
return result;
}
}
}
}
// Website crawl fallback didn't work either
result.error = `cName "${cName}" could not be resolved - may not exist on Dutchie`;
await query(
`
@@ -835,6 +912,7 @@ if (isSpecialDomain && website && website.trim() !== '') {
'cname_extracted', $1::text,
'platform_id_resolved', false,
'resolution_error', $2::text,
'website_crawl_attempted', true,
'not_crawlable', true
),
updated_at = NOW()