fix(detection): crawl websites to find Dutchie menus and retry missing platform IDs
This commit is contained in:
@@ -136,6 +136,9 @@
|
||||
20) **Never delete or overwrite historical data**
|
||||
- Do not delete products/snapshots or overwrite historical records. Always append a new snapshot for each change (price/stock/qty), and flag records as `missing_from_feed` instead of removing them. Historical data must remain intact for analytics.
|
||||
|
||||
21) **Deployment via CI/CD only**
|
||||
- Test locally, commit clean changes, and let CI/CD build and deploy to Kubernetes at code.cannabrands.app. Do NOT manually build/push images or tweak prod pods. Deploy backend first, smoke-test APIs, then frontend; roll back via CI/CD if needed.
|
||||
|
||||
18) **Per-location cName and platform_dispensary_id resolution**
|
||||
- For each dispensary, menu_url and cName must be valid for that exact location. Do not use hardcoded defaults, and never share a platform_dispensary_id across locations.
|
||||
- Derive cName from menu_url per store: `/embedded-menu/<cName>` or `/dispensary/<cName>`.
|
||||
|
||||
@@ -148,6 +148,7 @@ export interface WebsiteCrawlResult {
|
||||
provider: MenuProvider;
|
||||
foundLinks: string[];
|
||||
crawledPages: string[];
|
||||
platformDispensaryId?: string | null;
|
||||
error?: string;
|
||||
}
|
||||
|
||||
@@ -293,6 +294,30 @@ export async function crawlWebsiteForMenuLinks(websiteUrl: string): Promise<Webs
|
||||
|
||||
result.foundLinks = homepageLinks;
|
||||
|
||||
// Step 1b: Try to extract reactEnv.dispensaryId (embedded Dutchie menu) from homepage HTML
|
||||
try {
|
||||
const resp = await fetch(homepage, {
|
||||
headers: {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||
},
|
||||
redirect: 'follow',
|
||||
});
|
||||
if (resp.ok) {
|
||||
const html = await resp.text();
|
||||
const reactEnvMatch = /window\.reactEnv\s*=\s*\{[^}]*"dispensaryId"\s*:\s*"([a-fA-F0-9]+)"/i.exec(html);
|
||||
if (reactEnvMatch && reactEnvMatch[1]) {
|
||||
result.provider = 'dutchie';
|
||||
result.menuUrl = homepage;
|
||||
result.platformDispensaryId = reactEnvMatch[1];
|
||||
console.log(`[WebsiteCrawl] Found reactEnv.dispensaryId=${reactEnvMatch[1]} on homepage ${homepage}`);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
} catch (err: any) {
|
||||
console.log(`[WebsiteCrawl] reactEnv check failed for ${homepage}: ${err.message}`);
|
||||
}
|
||||
|
||||
// Step 2: Check for direct provider matches in homepage links
|
||||
for (const link of homepageLinks) {
|
||||
for (const { provider, patterns } of PROVIDER_URL_PATTERNS) {
|
||||
@@ -565,14 +590,13 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
|
||||
success: false,
|
||||
};
|
||||
|
||||
// For proprietary providers (curaleaf, sol), try website crawl to find an actual menu provider
|
||||
// These sites may have Dutchie embedded menus even though their domain suggests proprietary
|
||||
const PROPRIETARY_DOMAINS = ['curaleaf', 'sol'] as const;
|
||||
const isPotentiallyProprietary = PROPRIETARY_DOMAINS.includes(detectedProvider as any);
|
||||
// For domains like curaleaf/sol, crawl the website to find the actual menu provider (often Dutchie)
|
||||
const SPECIAL_DOMAINS = ['curaleaf', 'sol'] as const;
|
||||
const isSpecialDomain = SPECIAL_DOMAINS.includes(detectedProvider as any);
|
||||
|
||||
if (isPotentiallyProprietary && website && website.trim() !== '') {
|
||||
console.log(`[MenuDetection] ${dispensary.name}: Detected ${detectedProvider} domain - crawling website to find actual menu provider...`);
|
||||
const crawlResult = await crawlWebsiteForMenuLinks(website);
|
||||
if (isSpecialDomain && website && website.trim() !== '') {
|
||||
console.log(`[MenuDetection] ${dispensary.name}: Detected ${detectedProvider} domain - crawling website to find actual menu provider (often Dutchie)...`);
|
||||
const crawlResult = await crawlWebsiteForMenuLinks(website);
|
||||
|
||||
if (crawlResult.menuUrl && crawlResult.provider !== 'unknown') {
|
||||
// Found an actual menu provider (likely Dutchie) - use that instead!
|
||||
@@ -614,7 +638,30 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
|
||||
// If the actual provider is dutchie, continue to platform ID resolution
|
||||
if (actualProvider === 'dutchie') {
|
||||
result.detectedProvider = 'dutchie';
|
||||
// Fall through to dutchie platform ID resolution below
|
||||
// If platformDispensaryId was captured (e.g., reactEnv on homepage), save it now and return
|
||||
if (crawlResult.platformDispensaryId) {
|
||||
await query(
|
||||
`
|
||||
UPDATE dispensaries SET
|
||||
platform_dispensary_id = $1,
|
||||
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
|
||||
jsonb_build_object(
|
||||
'platform_id_resolved', true,
|
||||
'platform_id_resolved_at', NOW(),
|
||||
'detected_provider', 'dutchie'::text,
|
||||
'detection_method', 'website_crawl'::text
|
||||
),
|
||||
updated_at = NOW()
|
||||
WHERE id = $2
|
||||
`,
|
||||
[crawlResult.platformDispensaryId, dispensaryId]
|
||||
);
|
||||
result.platformDispensaryId = crawlResult.platformDispensaryId;
|
||||
result.success = true;
|
||||
console.log(`[MenuDetection] ${dispensary.name}: Resolved platform ID from reactEnv = ${crawlResult.platformDispensaryId}`);
|
||||
return result;
|
||||
}
|
||||
// Fall through to dutchie platform ID resolution below if no platform ID captured
|
||||
} else {
|
||||
// Found a different provider (treez, jane, etc.) - we're done
|
||||
result.detectedProvider = actualProvider;
|
||||
@@ -883,6 +930,9 @@ export async function runBulkDetection(options: {
|
||||
)`;
|
||||
} else if (onlyMissingPlatformId) {
|
||||
whereClause += ` AND (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)`;
|
||||
} else if (includeDutchieMissingPlatformId) {
|
||||
// Always attempt to resolve dutchie stores missing platform IDs
|
||||
whereClause += ` AND (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)`;
|
||||
}
|
||||
|
||||
let query_str = `
|
||||
|
||||
Reference in New Issue
Block a user