feat(detection): extract reactEnv dispensaryId and prefer Dutchie on page
This commit is contained in:
@@ -208,6 +208,12 @@ async function fetchPageLinks(url: string, timeout: number = 10000): Promise<{ l
|
||||
|
||||
const html = await response.text();
|
||||
|
||||
// Quick check: if the page embeds window.reactEnv with a dispensaryId, treat it as Dutchie and return only a synthetic `dutchie-reactenv:<id>` link
|
||||
const reactEnvMatch = /window\.reactEnv\s*=\s*\{[^}]*"dispensaryId"\s*:\s*"([a-fA-F0-9]+)"/i.exec(html);
|
||||
if (reactEnvMatch && reactEnvMatch[1]) {
|
||||
return { links: [`dutchie-reactenv:${reactEnvMatch[1]}`] };
|
||||
}
|
||||
|
||||
// Extract all href attributes from anchor tags
|
||||
const linkRegex = /href=["']([^"']+)["']/gi;
|
||||
const links: string[] = [];
|
||||
@@ -305,7 +311,8 @@ export async function crawlWebsiteForMenuLinks(websiteUrl: string): Promise<Webs
|
||||
});
|
||||
if (resp.ok) {
|
||||
const html = await resp.text();
|
||||
const reactEnvMatch = /window\.reactEnv\s*=\s*\{[^}]*"dispensaryId"\s*:\s*"([a-fA-F0-9]+)"/i.exec(html);
|
||||
// Look for a 24-hex-character "dispensaryId" anywhere in the HTML - anchoring on `window.reactEnv = {[^}]*` fails when the object contains nested braces
|
||||
const reactEnvMatch = /"dispensaryId"\s*:\s*"([a-fA-F0-9]{24})"/i.exec(html);
|
||||
if (reactEnvMatch && reactEnvMatch[1]) {
|
||||
result.provider = 'dutchie';
|
||||
result.menuUrl = homepage;
|
||||
@@ -318,7 +325,19 @@ export async function crawlWebsiteForMenuLinks(websiteUrl: string): Promise<Webs
|
||||
console.log(`[WebsiteCrawl] reactEnv check failed for ${homepage}: ${err.message}`);
|
||||
}
|
||||
|
||||
// Step 2: Check for direct provider matches in homepage links
|
||||
// Step 2: Check for reactEnv token from fetchPageLinks (encoded as dutchie-reactenv:<id>)
|
||||
for (const link of homepageLinks) {
|
||||
const reactEnvToken = /^dutchie-reactenv:(.+)$/.exec(link);
|
||||
if (reactEnvToken) {
|
||||
result.menuUrl = homepage;
|
||||
result.provider = 'dutchie';
|
||||
result.platformDispensaryId = reactEnvToken[1];
|
||||
console.log(`[WebsiteCrawl] Found reactEnv.dispensaryId=${reactEnvToken[1]} on ${homepage}`);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
// Step 3: Check for direct provider matches in homepage links
|
||||
for (const link of homepageLinks) {
|
||||
for (const { provider, patterns } of PROVIDER_URL_PATTERNS) {
|
||||
if (patterns.some(p => p.test(link))) {
|
||||
@@ -330,7 +349,7 @@ export async function crawlWebsiteForMenuLinks(websiteUrl: string): Promise<Webs
|
||||
}
|
||||
}
|
||||
|
||||
// Step 3: Find menu/order/shop links to follow
|
||||
// Step 4: Find menu/order/shop links to follow
|
||||
const menuLinks = homepageLinks.filter(link => {
|
||||
// Must be same domain or a known provider domain
|
||||
try {
|
||||
@@ -821,6 +840,64 @@ if (isSpecialDomain && website && website.trim() !== '') {
|
||||
);
|
||||
console.log(`[MenuDetection] ${dispensary.name}: Resolved platform ID = ${platformId}`);
|
||||
} else {
|
||||
// cName resolution failed - try crawling website as fallback
|
||||
console.log(`[MenuDetection] ${dispensary.name}: cName "${cName}" not found on Dutchie, trying website crawl fallback...`);
|
||||
|
||||
if (website && website.trim() !== '') {
|
||||
const fallbackCrawl = await crawlWebsiteForMenuLinks(website);
|
||||
|
||||
if (fallbackCrawl.menuUrl && fallbackCrawl.provider === 'dutchie') {
|
||||
// Found Dutchie menu via website crawl!
|
||||
console.log(`[MenuDetection] ${dispensary.name}: Found Dutchie menu via website crawl: ${fallbackCrawl.menuUrl}`);
|
||||
|
||||
// Extract a platform ID or cName from the menu URL found by the website crawl
|
||||
const newExtraction = extractFromMenuUrl(fallbackCrawl.menuUrl);
|
||||
if (newExtraction) {
|
||||
let fallbackPlatformId: string | null = null;
|
||||
|
||||
if (newExtraction.type === 'platformId') {
|
||||
fallbackPlatformId = newExtraction.value;
|
||||
} else {
|
||||
// Try to resolve the new cName
|
||||
fallbackPlatformId = await resolveDispensaryId(newExtraction.value);
|
||||
}
|
||||
|
||||
if (fallbackPlatformId) {
|
||||
result.platformDispensaryId = fallbackPlatformId;
|
||||
result.success = true;
|
||||
result.cName = newExtraction.value;
|
||||
|
||||
await query(
|
||||
`
|
||||
UPDATE dispensaries SET
|
||||
menu_type = 'dutchie',
|
||||
menu_url = $1,
|
||||
platform_dispensary_id = $2,
|
||||
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
|
||||
jsonb_build_object(
|
||||
'detected_provider', 'dutchie'::text,
|
||||
'detection_method', 'website_crawl_fallback'::text,
|
||||
'detected_at', NOW(),
|
||||
'original_cname', $3::text,
|
||||
'fallback_cname', $4::text,
|
||||
'website_crawled', $5::text,
|
||||
'platform_id_resolved', true,
|
||||
'platform_id_resolved_at', NOW(),
|
||||
'not_crawlable', false
|
||||
),
|
||||
updated_at = NOW()
|
||||
WHERE id = $6
|
||||
`,
|
||||
[fallbackCrawl.menuUrl, fallbackPlatformId, cName, newExtraction.value, website, dispensaryId]
|
||||
);
|
||||
console.log(`[MenuDetection] ${dispensary.name}: Resolved via website crawl, platform ID = ${fallbackPlatformId}`);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Website crawl fallback didn't work either
|
||||
result.error = `cName "${cName}" could not be resolved - may not exist on Dutchie`;
|
||||
await query(
|
||||
`
|
||||
@@ -835,6 +912,7 @@ if (isSpecialDomain && website && website.trim() !== '') {
|
||||
'cname_extracted', $1::text,
|
||||
'platform_id_resolved', false,
|
||||
'resolution_error', $2::text,
|
||||
'website_crawl_attempted', true,
|
||||
'not_crawlable', true
|
||||
),
|
||||
updated_at = NOW()
|
||||
|
||||
Reference in New Issue
Block a user