Fix cName bug: extract cName from menuUrl per dispensary

- Add extractCName() helper to parse cName from dispensary.menuUrl
- Handles /embedded-menu/<cName> and /dispensary/<cName> URL patterns
- Falls back to dispensary.slug if menuUrl extraction fails
- Pass cName to fetchAllProductsBothModes and fetchAllProducts
- Make cName a required option, enforced with a runtime error (no hardcoded defaults)
- Add normBool and normDate helpers for API data normalization
- Refactor graphql-client to use server-side fetch with Puppeteer session cookies

Previously, all stores were using the AZ-Deeply-Rooted cName, which caused 0 products
to be returned for other dispensaries such as Sol Flower.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-02 20:53:28 -07:00
parent 9caa52fd5b
commit c10710d6a7
2 changed files with 431 additions and 381 deletions

View File

@@ -1,24 +1,24 @@
/** /**
* Dutchie GraphQL Client * Dutchie GraphQL Client
* *
* Makes GraphQL requests to Dutchie's API using Puppeteer to bypass Cloudflare. * Uses Puppeteer to establish a session (get CF cookies), then makes
* Uses in-page fetch to maintain browser session/cookies. * SERVER-SIDE fetch calls to api-gw.dutchie.com with those cookies.
* *
* Key features: * DUTCHIE FETCH RULES:
* - Browser session reuse between Mode A and Mode B (single browser per store) * 1. Server-side only - use axios (never browser fetch with CORS)
* - Config-driven GraphQL hashes * 2. Use dispensaryFilter.cNameOrID, NOT dispensaryId directly
* - POST fallback when GET fails with 405 * 3. Headers must mimic Chrome: User-Agent, Origin, Referer
* - Pagination retry logic * 4. If 403, extract CF cookies from Puppeteer session and include them
* - Proper termination on incomplete pages * 5. Log status codes, error bodies, and product counts
*/ */
import axios, { AxiosError } from 'axios';
import puppeteer from 'puppeteer-extra'; import puppeteer from 'puppeteer-extra';
import type { Browser, Page } from 'puppeteer'; import type { Browser, Page, Protocol } from 'puppeteer';
import StealthPlugin from 'puppeteer-extra-plugin-stealth'; import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import { import {
DutchieRawProduct, DutchieRawProduct,
DutchiePOSChild, DutchiePOSChild,
FilteredProductsVariables,
CrawlMode, CrawlMode,
} from '../types'; } from '../types';
import { dutchieConfig, GRAPHQL_HASHES, ARIZONA_CENTERPOINTS } from '../config/dutchie'; import { dutchieConfig, GRAPHQL_HASHES, ARIZONA_CENTERPOINTS } from '../config/dutchie';
@@ -28,162 +28,165 @@ puppeteer.use(StealthPlugin());
// Re-export for backward compatibility // Re-export for backward compatibility
export { GRAPHQL_HASHES, ARIZONA_CENTERPOINTS }; export { GRAPHQL_HASHES, ARIZONA_CENTERPOINTS };
interface BrowserSession { interface SessionCredentials {
cookies: string; // Cookie header string
userAgent: string;
browser: Browser; browser: Browser;
page: Page;
dispensaryId?: string;
} }
// ============================================================ // ============================================================
// BROWSER SESSION MANAGEMENT // SESSION MANAGEMENT - Get CF cookies via Puppeteer
// ============================================================ // ============================================================
/** /**
* Launch a browser session for Dutchie GraphQL requests * Create a session by navigating to the embedded menu page
* and extracting CF clearance cookies for server-side requests
*/ */
async function createBrowserSession(menuUrl?: string): Promise<BrowserSession> { async function createSession(cName: string): Promise<SessionCredentials> {
const browser = await puppeteer.launch({ const browser = await puppeteer.launch({
headless: 'new', headless: 'new',
args: dutchieConfig.browserArgs, args: dutchieConfig.browserArgs,
}); });
const page = await browser.newPage(); const page = await browser.newPage();
const userAgent = dutchieConfig.userAgent;
// Set up stealth await page.setUserAgent(userAgent);
await page.setUserAgent(dutchieConfig.userAgent);
await page.setViewport({ width: 1920, height: 1080 }); await page.setViewport({ width: 1920, height: 1080 });
await page.evaluateOnNewDocument(() => { await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false }); Object.defineProperty(navigator, 'webdriver', { get: () => false });
(window as any).chrome = { runtime: {} }; (window as any).chrome = { runtime: {} };
}); });
// Navigate to establish session // Navigate to the embedded menu page for this dispensary
const url = menuUrl || 'https://dutchie.com/dispensaries'; const embeddedMenuUrl = `https://dutchie.com/embedded-menu/${cName}`;
console.log(`[GraphQL Client] Loading ${url} to establish session...`); console.log(`[GraphQL Client] Loading ${embeddedMenuUrl} to get CF cookies...`);
await page.goto(url, { try {
waitUntil: 'networkidle2', await page.goto(embeddedMenuUrl, {
timeout: dutchieConfig.navigationTimeout, waitUntil: 'networkidle2',
}); timeout: dutchieConfig.navigationTimeout,
await new Promise((r) => setTimeout(r, dutchieConfig.pageLoadDelay)); });
await new Promise((r) => setTimeout(r, dutchieConfig.pageLoadDelay));
// Try to get dispensary ID from page if it's a menu page } catch (error: any) {
let dispensaryId: string | undefined; console.warn(`[GraphQL Client] Navigation warning: ${error.message}`);
if (menuUrl && menuUrl.includes('embedded-menu')) { // Continue anyway - we may have gotten cookies
dispensaryId = await page.evaluate(() => (window as any).reactEnv?.dispensaryId);
} }
return { browser, page, dispensaryId }; // Extract cookies
const cookies = await page.cookies();
const cookieString = cookies.map((c: Protocol.Network.Cookie) => `${c.name}=${c.value}`).join('; ');
console.log(`[GraphQL Client] Got ${cookies.length} cookies`);
if (cookies.length > 0) {
console.log(`[GraphQL Client] Cookie names: ${cookies.map(c => c.name).join(', ')}`);
}
return { cookies: cookieString, userAgent, browser };
} }
/** /**
* Close browser session * Close session (browser)
*/ */
async function closeBrowserSession(session: BrowserSession): Promise<void> { async function closeSession(session: SessionCredentials): Promise<void> {
await session.browser.close(); await session.browser.close();
} }
// ============================================================ // ============================================================
// GRAPHQL EXECUTION WITH POST FALLBACK // SERVER-SIDE GRAPHQL FETCH USING AXIOS
// ============================================================ // ============================================================
/** /**
* Execute a GraphQL query from within the browser context * Build headers that mimic a real browser request
* Supports GET (default) with POST fallback on 405 errors */
function buildHeaders(session: SessionCredentials, cName: string): Record<string, string> {
const embeddedMenuUrl = `https://dutchie.com/embedded-menu/${cName}`;
return {
'accept': 'application/json, text/plain, */*',
'accept-language': 'en-US,en;q=0.9',
'accept-encoding': 'gzip, deflate, br',
'content-type': 'application/json',
'origin': 'https://dutchie.com',
'referer': embeddedMenuUrl,
'user-agent': session.userAgent,
'apollographql-client-name': 'Marketplace (production)',
'sec-ch-ua': '"Chromium";v="120", "Google Chrome";v="120", "Not-A.Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-site',
...(session.cookies ? { 'cookie': session.cookies } : {}),
};
}
/**
* Execute GraphQL query server-side using axios
* Uses cookies from the browser session to bypass CF
*/ */
async function executeGraphQL( async function executeGraphQL(
page: Page, session: SessionCredentials,
operationName: string, operationName: string,
variables: any, variables: any,
hash: string, hash: string,
endpoint: string = 'https://dutchie.com/graphql' cName: string
): Promise<any> { ): Promise<any> {
const headers = dutchieConfig.defaultHeaders; const endpoint = dutchieConfig.graphqlEndpoint;
const preferGet = dutchieConfig.preferGet; const headers = buildHeaders(session, cName);
const enablePostFallback = dutchieConfig.enablePostFallback;
return page.evaluate( // Build request body for POST
async ( const body = {
opName: string,
vars: any,
queryHash: string,
url: string,
hdrs: Record<string, string>,
useGet: boolean,
allowPostFallback: boolean
) => {
const doFetch = async (method: 'GET' | 'POST'): Promise<Response> => {
if (method === 'GET') {
const qs = new URLSearchParams({
operationName: opName,
variables: JSON.stringify(vars),
extensions: JSON.stringify({
persistedQuery: { version: 1, sha256Hash: queryHash },
}),
});
return fetch(`${url}?${qs.toString()}`, {
method: 'GET',
headers: {
...hdrs,
'content-type': 'application/json',
},
credentials: 'include',
});
} else {
// POST request with full body
return fetch(url, {
method: 'POST',
headers: {
...hdrs,
'content-type': 'application/json',
},
credentials: 'include',
body: JSON.stringify({
operationName: opName,
variables: vars,
extensions: {
persistedQuery: { version: 1, sha256Hash: queryHash },
},
}),
});
}
};
// Try GET first if preferred
if (useGet) {
const response = await doFetch('GET');
// If GET fails with 405 and POST fallback is enabled, try POST
if (response.status === 405 && allowPostFallback) {
console.log('[GraphQL] GET returned 405, falling back to POST');
const postResponse = await doFetch('POST');
if (!postResponse.ok) {
throw new Error(`HTTP ${postResponse.status} (POST fallback)`);
}
return postResponse.json();
}
if (!response.ok) {
throw new Error(`HTTP ${response.status}`);
}
return response.json();
} else {
// Use POST directly
const response = await doFetch('POST');
if (!response.ok) {
throw new Error(`HTTP ${response.status}`);
}
return response.json();
}
},
operationName, operationName,
variables, variables,
hash, extensions: {
endpoint, persistedQuery: { version: 1, sha256Hash: hash },
headers, },
preferGet, };
enablePostFallback
); console.log(`[GraphQL Client] POST: ${operationName} -> ${endpoint}`);
console.log(`[GraphQL Client] Variables: ${JSON.stringify(variables).slice(0, 300)}...`);
try {
const response = await axios.post(endpoint, body, {
headers,
timeout: 30000,
validateStatus: () => true, // Don't throw on non-2xx
});
// Log response details
console.log(`[GraphQL Client] Response status: ${response.status}`);
if (response.status !== 200) {
const bodyPreview = typeof response.data === 'string'
? response.data.slice(0, 500)
: JSON.stringify(response.data).slice(0, 500);
console.error(`[GraphQL Client] HTTP ${response.status}: ${bodyPreview}`);
throw new Error(`HTTP ${response.status}`);
}
// Check for GraphQL errors
if (response.data?.errors && response.data.errors.length > 0) {
console.error(`[GraphQL Client] GraphQL errors: ${JSON.stringify(response.data.errors[0])}`);
}
return response.data;
} catch (error: any) {
if (axios.isAxiosError(error)) {
const axiosError = error as AxiosError;
console.error(`[GraphQL Client] Axios error: ${axiosError.message}`);
if (axiosError.response) {
console.error(`[GraphQL Client] Response status: ${axiosError.response.status}`);
console.error(`[GraphQL Client] Response data: ${JSON.stringify(axiosError.response.data).slice(0, 500)}`);
}
if (axiosError.code) {
console.error(`[GraphQL Client] Error code: ${axiosError.code}`);
}
} else {
console.error(`[GraphQL Client] Error: ${error.message}`);
}
throw error;
}
} }
// ============================================================ // ============================================================
@@ -192,120 +195,180 @@ async function executeGraphQL(
/** /**
* Resolve a dispensary slug to its internal platform ID * Resolve a dispensary slug to its internal platform ID
* Uses GetAddressBasedDispensaryData query
*/ */
export async function resolveDispensaryId(slug: string): Promise<string | null> { export async function resolveDispensaryId(slug: string): Promise<string | null> {
const session = await createBrowserSession(`https://dutchie.com/embedded-menu/${slug}`); console.log(`[GraphQL Client] Resolving dispensary ID for slug: ${slug}`);
const session = await createSession(slug);
try { try {
// First check if we got it from the page context const variables = {
if (session.dispensaryId) { dispensaryFilter: {
console.log(`[GraphQL Client] Got dispensaryId from page: ${session.dispensaryId}`); cNameOrID: slug,
return session.dispensaryId; },
} };
// Otherwise try the GetAddressBasedDispensaryData query
const result = await executeGraphQL( const result = await executeGraphQL(
session.page, session,
'GetAddressBasedDispensaryData', 'GetAddressBasedDispensaryData',
{ input: { dispensaryId: slug } }, variables,
GRAPHQL_HASHES.GetAddressBasedDispensaryData GRAPHQL_HASHES.GetAddressBasedDispensaryData,
slug
); );
const dispensaryId = result?.data?.getAddressBasedDispensaryData?.dispensaryId; const dispensaryId = result?.data?.dispensaryBySlug?.id ||
console.log(`[GraphQL Client] Resolved ${slug} -> ${dispensaryId}`); result?.data?.dispensary?.id ||
return dispensaryId || null; result?.data?.getAddressBasedDispensaryData?.dispensary?.id;
} catch (error: any) {
console.error(`[GraphQL Client] Failed to resolve ${slug}:`, error.message); if (dispensaryId) {
console.log(`[GraphQL Client] Resolved ${slug} -> ${dispensaryId}`);
return dispensaryId;
}
console.log(`[GraphQL Client] Could not resolve ${slug}, response:`, JSON.stringify(result).slice(0, 300));
return null; return null;
} finally { } finally {
await closeBrowserSession(session); await closeSession(session);
} }
} }
/**
* Discover Arizona dispensaries via geo-based query
*/
export async function discoverArizonaDispensaries(): Promise<any[]> {
console.log('[GraphQL Client] Discovering Arizona dispensaries...');
// Use Phoenix as the default center
const session = await createSession('AZ-Deeply-Rooted');
const allDispensaries: any[] = [];
const seenIds = new Set<string>();
try {
for (const centerpoint of ARIZONA_CENTERPOINTS) {
console.log(`[GraphQL Client] Scanning ${centerpoint.name}...`);
const variables = {
dispensariesFilter: {
latitude: centerpoint.lat,
longitude: centerpoint.lng,
distance: 100,
state: 'AZ',
},
};
try {
const result = await executeGraphQL(
session,
'ConsumerDispensaries',
variables,
GRAPHQL_HASHES.ConsumerDispensaries,
'AZ-Deeply-Rooted'
);
const dispensaries = result?.data?.consumerDispensaries || [];
for (const d of dispensaries) {
const id = d.id || d.dispensaryId;
if (id && !seenIds.has(id)) {
seenIds.add(id);
allDispensaries.push(d);
}
}
console.log(`[GraphQL Client] Found ${dispensaries.length} in ${centerpoint.name} (${allDispensaries.length} total unique)`);
} catch (error: any) {
console.warn(`[GraphQL Client] Error scanning ${centerpoint.name}: ${error.message}`);
}
// Delay between requests
await new Promise((r) => setTimeout(r, 1000));
}
} finally {
await closeSession(session);
}
console.log(`[GraphQL Client] Discovery complete: ${allDispensaries.length} dispensaries`);
return allDispensaries;
}
// ============================================================ // ============================================================
// FILTER VARIABLE BUILDING // PRODUCT FILTERING VARIABLES
// ============================================================ // ============================================================
/** /**
* Build GraphQL variables based on crawl mode * Build filter variables for FilteredProducts query
* *
* MODE A - "UI parity": Matches what Dutchie website shows * CRITICAL: Uses dispensaryId directly (the MongoDB ObjectId, e.g. "6405ef617056e8014d79101b")
* - Status: 'Active' * NOT dispensaryFilter.cNameOrID!
* - removeProductsBelowOptionThresholds: true (default behavior)
* - bypassOnlineThresholds: false
* *
* MODE B - "MAX COVERAGE": Tries to get out-of-stock products * The actual browser request structure is:
* - Status: undefined (no filter) * {
* - removeProductsBelowOptionThresholds: false * "productsFilter": {
* - bypassOnlineThresholds: true * "dispensaryId": "6405ef617056e8014d79101b",
* "pricingType": "rec",
* "Status": "Active", // Mode A only
* "strainTypes": [],
* "subcategories": [],
* "types": [],
* "useCache": true,
* ...
* },
* "page": 0,
* "perPage": 100
* }
*
* Mode A = UI parity (Status: "Active")
* Mode B = MAX COVERAGE (no Status filter)
*/ */
function buildFilterVariables( function buildFilterVariables(
platformDispensaryId: string, platformDispensaryId: string,
pricingType: 'rec' | 'med', pricingType: 'rec' | 'med',
crawlMode: CrawlMode, crawlMode: CrawlMode,
pageNum: number, page: number,
perPage: number perPage: number
): FilteredProductsVariables { ): any {
if (crawlMode === 'mode_a') { const isModeA = crawlMode === 'mode_a';
// UI parity mode
return { const productsFilter: Record<string, any> = {
includeEnterpriseSpecials: false, dispensaryId: platformDispensaryId,
productsFilter: { pricingType: pricingType,
dispensaryId: platformDispensaryId, strainTypes: [],
pricingType, subcategories: [],
Status: 'Active', types: [],
types: [], useCache: false, // Get fresh data
strainTypes: [], isDefaultSort: true,
subcategories: [], sortBy: 'popular',
useCache: false, sortDirection: 1,
isDefaultSort: true, bypassOnlineThresholds: false,
sortBy: 'popularSortIdx', isKioskMenu: false,
sortDirection: 1, removeProductsBelowOptionThresholds: true,
bypassOnlineThresholds: false, };
isKioskMenu: false,
removeProductsBelowOptionThresholds: true, // Mode A: Only active products (UI parity)
}, if (isModeA) {
page: pageNum, productsFilter.Status = 'Active';
perPage,
};
} else {
// MAX COVERAGE mode (mode_b)
return {
includeEnterpriseSpecials: false,
productsFilter: {
dispensaryId: platformDispensaryId,
pricingType,
// No Status filter - try to get all products
types: [],
strainTypes: [],
subcategories: [],
useCache: false,
isDefaultSort: true,
sortBy: 'popularSortIdx',
sortDirection: 1,
bypassOnlineThresholds: true,
isKioskMenu: false,
removeProductsBelowOptionThresholds: false,
},
page: pageNum,
perPage,
};
} }
return {
includeEnterpriseSpecials: false,
productsFilter,
page,
perPage,
};
} }
// ============================================================ // ============================================================
// PRODUCT FETCHING WITH RETRY & PAGINATION // PRODUCT FETCHING WITH PAGINATION
// ============================================================ // ============================================================
/** /**
* Fetch all products for a dispensary via paginated GraphQL * Fetch products for a single mode with pagination
* Supports retry logic and proper termination
*
* @param session - Existing browser session to reuse
*/ */
async function fetchProductsWithSession( async function fetchProductsForMode(
session: BrowserSession, session: SessionCredentials,
platformDispensaryId: string, platformDispensaryId: string,
cName: string,
pricingType: 'rec' | 'med', pricingType: 'rec' | 'med',
crawlMode: CrawlMode crawlMode: CrawlMode
): Promise<{ products: DutchieRawProduct[]; totalCount: number; crawlMode: CrawlMode }> { ): Promise<{ products: DutchieRawProduct[]; totalCount: number; crawlMode: CrawlMode }> {
@@ -319,51 +382,57 @@ async function fetchProductsWithSession(
let totalCount = 0; let totalCount = 0;
let consecutiveEmptyPages = 0; let consecutiveEmptyPages = 0;
console.log(`[GraphQL Client] Fetching products for ${platformDispensaryId} (${pricingType}, ${crawlMode})...`); console.log(`[GraphQL Client] Fetching products for ${cName} (platformId: ${platformDispensaryId}, ${pricingType}, ${crawlMode})...`);
while (pageNum < maxPages) { while (pageNum < maxPages) {
const variables = buildFilterVariables( const variables = buildFilterVariables(platformDispensaryId, pricingType, crawlMode, pageNum, perPage);
platformDispensaryId,
pricingType,
crawlMode,
pageNum,
perPage
);
let result: any = null; let result: any = null;
let lastError: Error | null = null; let lastError: Error | null = null;
// Retry logic for failed page fetches // Retry logic
for (let attempt = 0; attempt <= maxRetries; attempt++) { for (let attempt = 0; attempt <= maxRetries; attempt++) {
try { try {
result = await executeGraphQL( result = await executeGraphQL(
session.page, session,
'FilteredProducts', 'FilteredProducts',
variables, variables,
GRAPHQL_HASHES.FilteredProducts GRAPHQL_HASHES.FilteredProducts,
cName
); );
lastError = null; lastError = null;
break; // Success, exit retry loop break;
} catch (error: any) { } catch (error: any) {
lastError = error; lastError = error;
console.warn(`[GraphQL Client] Page ${pageNum} attempt ${attempt + 1} failed: ${error.message}`); console.warn(`[GraphQL Client] Page ${pageNum} attempt ${attempt + 1} failed: ${error.message}`);
if (attempt < maxRetries) { if (attempt < maxRetries) {
await new Promise((r) => setTimeout(r, 1000 * (attempt + 1))); // Exponential backoff await new Promise((r) => setTimeout(r, 1000 * (attempt + 1)));
} }
} }
} }
// If all retries failed, log error and break
if (lastError) { if (lastError) {
console.error(`[GraphQL Client] Page ${pageNum} failed after ${maxRetries + 1} attempts: ${lastError.message}`); console.error(`[GraphQL Client] Page ${pageNum} failed after ${maxRetries + 1} attempts`);
break; break;
} }
if (result.errors) { if (result?.errors) {
console.error('[GraphQL Client] GraphQL errors:', result.errors); console.error('[GraphQL Client] GraphQL errors:', JSON.stringify(result.errors));
break; break;
} }
// Log response shape on first page
if (pageNum === 0) {
console.log(`[GraphQL Client] Response keys: ${Object.keys(result || {}).join(', ')}`);
if (result?.data) {
console.log(`[GraphQL Client] data keys: ${Object.keys(result.data || {}).join(', ')}`);
}
if (!result?.data?.filteredProducts) {
console.log(`[GraphQL Client] WARNING: No filteredProducts in response!`);
console.log(`[GraphQL Client] Full response: ${JSON.stringify(result).slice(0, 1000)}`);
}
}
const products = result?.data?.filteredProducts?.products || []; const products = result?.data?.filteredProducts?.products || [];
const queryInfo = result?.data?.filteredProducts?.queryInfo; const queryInfo = result?.data?.filteredProducts?.queryInfo;
@@ -375,7 +444,6 @@ async function fetchProductsWithSession(
`[GraphQL Client] Page ${pageNum}: ${products.length} products (total so far: ${allProducts.length + products.length}/${totalCount})` `[GraphQL Client] Page ${pageNum}: ${products.length} products (total so far: ${allProducts.length + products.length}/${totalCount})`
); );
// PROPER TERMINATION: Stop if products.length < perPage (incomplete page = last page)
if (products.length === 0) { if (products.length === 0) {
consecutiveEmptyPages++; consecutiveEmptyPages++;
if (consecutiveEmptyPages >= 2) { if (consecutiveEmptyPages >= 2) {
@@ -387,15 +455,13 @@ async function fetchProductsWithSession(
allProducts.push(...products); allProducts.push(...products);
} }
// Stop if we got less than a full page (this is the last page) // Stop if incomplete page (last page)
if (products.length < perPage) { if (products.length < perPage) {
console.log(`[GraphQL Client] Incomplete page (${products.length} < ${perPage}), stopping pagination`); console.log(`[GraphQL Client] Incomplete page (${products.length} < ${perPage}), stopping`);
break; break;
} }
pageNum++; pageNum++;
// Small delay between pages
await new Promise((r) => setTimeout(r, pageDelayMs)); await new Promise((r) => setTimeout(r, pageDelayMs));
} }
@@ -403,8 +469,12 @@ async function fetchProductsWithSession(
return { products: allProducts, totalCount: totalCount || allProducts.length, crawlMode }; return { products: allProducts, totalCount: totalCount || allProducts.length, crawlMode };
} }
// ============================================================
// LEGACY SINGLE-MODE INTERFACE
// ============================================================
/** /**
* Fetch all products for a dispensary (legacy interface - creates new browser) * Fetch all products for a dispensary (single mode)
*/ */
export async function fetchAllProducts( export async function fetchAllProducts(
platformDispensaryId: string, platformDispensaryId: string,
@@ -414,28 +484,32 @@ export async function fetchAllProducts(
maxPages?: number; maxPages?: number;
menuUrl?: string; menuUrl?: string;
crawlMode?: CrawlMode; crawlMode?: CrawlMode;
cName?: string;
} = {} } = {}
): Promise<{ products: DutchieRawProduct[]; totalCount: number; crawlMode: CrawlMode }> { ): Promise<{ products: DutchieRawProduct[]; totalCount: number; crawlMode: CrawlMode }> {
const { crawlMode = 'mode_a' } = options; const { crawlMode = 'mode_a' } = options;
const menuUrl = options.menuUrl || `https://dutchie.com/dispensaries`;
const session = await createBrowserSession(menuUrl); // cName is now REQUIRED - no default fallback to avoid using wrong store's session
const cName = options.cName;
if (!cName) {
throw new Error('[GraphQL Client] cName is required for fetchAllProducts - cannot use another store\'s session');
}
const session = await createSession(cName);
try { try {
return await fetchProductsWithSession(session, platformDispensaryId, pricingType, crawlMode); return await fetchProductsForMode(session, platformDispensaryId, cName, pricingType, crawlMode);
} finally { } finally {
await closeBrowserSession(session); await closeSession(session);
} }
} }
// ============================================================ // ============================================================
// MODE A+B MERGING WITH OPTIONS // MODE A+B MERGING
// ============================================================ // ============================================================
/** /**
* Merge POSMetaData.children arrays from Mode A and Mode B products * Merge POSMetaData.children arrays from Mode A and Mode B products
* Uses canonicalID/canonicalSKU/canonicalPackageId as merge key
* Mode B children may have different quantityAvailable for options not in Mode A
*/ */
function mergeProductOptions( function mergeProductOptions(
modeAProduct: DutchieRawProduct, modeAProduct: DutchieRawProduct,
@@ -444,22 +518,17 @@ function mergeProductOptions(
const modeAChildren = modeAProduct.POSMetaData?.children || []; const modeAChildren = modeAProduct.POSMetaData?.children || [];
const modeBChildren = modeBProduct.POSMetaData?.children || []; const modeBChildren = modeBProduct.POSMetaData?.children || [];
// Create a map keyed by option identifier
const getOptionKey = (child: DutchiePOSChild): string => { const getOptionKey = (child: DutchiePOSChild): string => {
return child.canonicalID || child.canonicalSKU || child.canonicalPackageId || child.option || ''; return child.canonicalID || child.canonicalSKU || child.canonicalPackageId || child.option || '';
}; };
const mergedMap = new Map<string, DutchiePOSChild>(); const mergedMap = new Map<string, DutchiePOSChild>();
// Add all Mode A children first (they're "canonical")
for (const child of modeAChildren) { for (const child of modeAChildren) {
const key = getOptionKey(child); const key = getOptionKey(child);
if (key) { if (key) mergedMap.set(key, child);
mergedMap.set(key, child);
}
} }
// Add Mode B children that aren't in Mode A (may include OOS options)
for (const child of modeBChildren) { for (const child of modeBChildren) {
const key = getOptionKey(child); const key = getOptionKey(child);
if (key && !mergedMap.has(key)) { if (key && !mergedMap.has(key)) {
@@ -472,7 +541,6 @@ function mergeProductOptions(
/** /**
* Merge a Mode A product with a Mode B product * Merge a Mode A product with a Mode B product
* Mode A data is preferred, but children are merged for max coverage
*/ */
function mergeProducts( function mergeProducts(
modeAProduct: DutchieRawProduct, modeAProduct: DutchieRawProduct,
@@ -482,10 +550,8 @@ function mergeProducts(
return modeAProduct; return modeAProduct;
} }
// Merge children arrays
const mergedChildren = mergeProductOptions(modeAProduct, modeBProduct); const mergedChildren = mergeProductOptions(modeAProduct, modeBProduct);
// Return Mode A product with merged children
return { return {
...modeAProduct, ...modeAProduct,
POSMetaData: { POSMetaData: {
@@ -495,9 +561,13 @@ function mergeProducts(
}; };
} }
// ============================================================
// MAIN EXPORT: TWO-MODE CRAWL
// ============================================================
/** /**
* Fetch products using BOTH crawl modes with SINGLE browser session * Fetch products using BOTH crawl modes with SINGLE session
* This ensures maximum coverage by running Mode A then Mode B with the same session * Runs Mode A then Mode B, merges results
*/ */
export async function fetchAllProductsBothModes( export async function fetchAllProductsBothModes(
platformDispensaryId: string, platformDispensaryId: string,
@@ -506,161 +576,67 @@ export async function fetchAllProductsBothModes(
perPage?: number; perPage?: number;
maxPages?: number; maxPages?: number;
menuUrl?: string; menuUrl?: string;
cName?: string;
} = {} } = {}
): Promise<{ ): Promise<{
modeA: { products: DutchieRawProduct[]; totalCount: number }; modeA: { products: DutchieRawProduct[]; totalCount: number };
modeB: { products: DutchieRawProduct[]; totalCount: number }; modeB: { products: DutchieRawProduct[]; totalCount: number };
merged: { products: DutchieRawProduct[]; totalCount: number }; merged: { products: DutchieRawProduct[]; totalCount: number };
}> { }> {
console.log(`[GraphQL Client] Running two-mode crawl for ${platformDispensaryId} (${pricingType})...`); // cName is now REQUIRED - no default fallback to avoid using wrong store's session
const cName = options.cName;
if (!cName) {
throw new Error('[GraphQL Client] cName is required for fetchAllProductsBothModes - cannot use another store\'s session');
}
const menuUrl = options.menuUrl || `https://dutchie.com/dispensaries`; console.log(`[GraphQL Client] Running two-mode crawl for ${cName} (${pricingType})...`);
console.log(`[GraphQL Client] Platform ID: ${platformDispensaryId}, cName: ${cName}`);
// Create a SINGLE browser session for both modes const session = await createSession(cName);
const session = await createBrowserSession(menuUrl);
try { try {
// Run Mode A (UI parity) with shared session // Mode A (UI parity)
const modeAResult = await fetchProductsWithSession( const modeAResult = await fetchProductsForMode(session, platformDispensaryId, cName, pricingType, 'mode_a');
session,
platformDispensaryId,
pricingType,
'mode_a'
);
// Small delay between modes // Delay between modes
await new Promise((r) => setTimeout(r, dutchieConfig.modeDelayMs)); await new Promise((r) => setTimeout(r, dutchieConfig.modeDelayMs));
// Run Mode B (MAX COVERAGE) with same session - NO new browser! // Mode B (MAX COVERAGE)
const modeBResult = await fetchProductsWithSession( const modeBResult = await fetchProductsForMode(session, platformDispensaryId, cName, pricingType, 'mode_b');
session,
platformDispensaryId,
pricingType,
'mode_b'
);
// Build lookup map for Mode B products // Merge results
const modeBMap = new Map<string, DutchieRawProduct>(); const modeBMap = new Map<string, DutchieRawProduct>();
for (const product of modeBResult.products) { for (const product of modeBResult.products) {
modeBMap.set(product._id, product); modeBMap.set(product._id, product);
} }
// Merge results - deduplicate by _id, merge options
const productMap = new Map<string, DutchieRawProduct>(); const productMap = new Map<string, DutchieRawProduct>();
// Add Mode A products first (canonical), merging with Mode B if exists // Add Mode A products, merging with Mode B if exists
for (const product of modeAResult.products) { for (const product of modeAResult.products) {
const modeBProduct = modeBMap.get(product._id); const modeBProduct = modeBMap.get(product._id);
const mergedProduct = mergeProducts(product, modeBProduct); const mergedProduct = mergeProducts(product, modeBProduct);
productMap.set(product._id, mergedProduct); productMap.set(product._id, mergedProduct);
} }
// Add Mode B products that aren't in Mode A (may include OOS items) // Add Mode B products not in Mode A
for (const product of modeBResult.products) { for (const product of modeBResult.products) {
if (!productMap.has(product._id)) { if (!productMap.has(product._id)) {
productMap.set(product._id, product); productMap.set(product._id, product);
} }
} }
const merged = Array.from(productMap.values()); const mergedProducts = Array.from(productMap.values());
console.log(`[GraphQL Client] Two-mode crawl complete:`); console.log(`[GraphQL Client] Merged: ${mergedProducts.length} unique products`);
console.log(` Mode A: ${modeAResult.products.length} products`); console.log(`[GraphQL Client] Mode A: ${modeAResult.products.length}, Mode B: ${modeBResult.products.length}`);
console.log(` Mode B: ${modeBResult.products.length} products`);
console.log(` Merged: ${merged.length} unique products`);
return { return {
modeA: { products: modeAResult.products, totalCount: modeAResult.totalCount }, modeA: { products: modeAResult.products, totalCount: modeAResult.totalCount },
modeB: { products: modeBResult.products, totalCount: modeBResult.totalCount }, modeB: { products: modeBResult.products, totalCount: modeBResult.totalCount },
merged: { products: merged, totalCount: merged.length }, merged: { products: mergedProducts, totalCount: mergedProducts.length },
}; };
} finally { } finally {
// Close the shared session when done await closeSession(session);
await closeBrowserSession(session);
} }
} }
// ============================================================
// DISPENSARY DISCOVERY
// ============================================================
/**
 * Look up dispensaries around a latitude/longitude via the
 * `ConsumerDispensaries` GraphQL operation.
 *
 * Returns an empty array without opening a browser session when
 * `dutchieConfig.useDiscovery` is falsy, and also when the query throws
 * (the error message is logged, not rethrown). The browser session is
 * always closed before returning.
 *
 * @param lat - Latitude of the search centre.
 * @param lng - Longitude of the search centre.
 * @param radiusKm - Search radius in kilometres (default 100).
 * @returns The `data.consumerDispensaries` payload, or `[]` if absent.
 */
export async function discoverDispensaries(
  lat: number,
  lng: number,
  radiusKm: number = 100
): Promise<any[]> {
  // Discovery can be switched off in config; bail out before any browser work.
  if (!dutchieConfig.useDiscovery) {
    console.log('[GraphQL Client] Discovery disabled in config, skipping');
    return [];
  }

  const browserSession = await createBrowserSession();

  try {
    console.log(`[GraphQL Client] Discovering dispensaries near ${lat}, ${lng}...`);

    // NOTE(review): the exact operation/variables may need to be captured
    // from live traffic; the radius is sent as km * 1000 on the assumption
    // that the API expects metres - confirm.
    const variables = {
      filter: {
        lat,
        lng,
        radius: radiusKm * 1000,
        isDelivery: false,
      },
    };

    const response = await executeGraphQL(
      browserSession.page,
      'ConsumerDispensaries',
      variables,
      GRAPHQL_HASHES.ConsumerDispensaries
    );

    const found = response?.data?.consumerDispensaries || [];
    console.log(`[GraphQL Client] Found ${found.length} dispensaries`);
    return found;
  } catch (error: any) {
    // Best-effort: a failed discovery yields no results rather than an exception.
    console.error(`[GraphQL Client] Discovery failed:`, error.message);
    return [];
  } finally {
    await closeBrowserSession(browserSession);
  }
}
/**
 * Sweep every entry of `ARIZONA_CENTERPOINTS` with `discoverDispensaries`
 * (150 km radius) and collect the Arizona results, de-duplicated.
 *
 * A dispensary counts as Arizona when `state` (or `address.state`) is
 * `'AZ'` or `'Arizona'`. The de-duplication key is the first truthy value of
 * `slug`, `cName`, `id`; entries with no key are dropped and the first
 * occurrence of a key wins. A failure on one centerpoint is logged and the
 * sweep continues with the next one.
 *
 * @returns The unique Arizona dispensaries found across all centerpoints.
 */
export async function discoverArizonaDispensaries(): Promise<any[]> {
  const uniqueByKey = new Map<string, any>();

  for (const center of ARIZONA_CENTERPOINTS) {
    console.log(`[GraphQL Client] Scanning ${center.name}...`);

    try {
      const nearby = await discoverDispensaries(center.lat, center.lng, 150);

      for (const candidate of nearby) {
        // Keep Arizona entries only.
        const state = candidate.state || candidate.address?.state;
        if (state !== 'AZ' && state !== 'Arizona') {
          continue;
        }

        const key = candidate.slug || candidate.cName || candidate.id;
        if (!key || uniqueByKey.has(key)) {
          continue;
        }
        uniqueByKey.set(key, candidate);
      }

      // Pause for two seconds after each successful scan.
      await new Promise((resolve) => setTimeout(resolve, 2000));
    } catch (error: any) {
      console.error(`[GraphQL Client] Failed to scan ${center.name}:`, error.message);
    }
  }

  const result = [...uniqueByKey.values()];
  console.log(`[GraphQL Client] Total unique AZ dispensaries: ${result.length}`);
  return result;
}

View File

@@ -7,6 +7,7 @@
import { query, getClient } from '../db/connection'; import { query, getClient } from '../db/connection';
import { fetchAllProducts, fetchAllProductsBothModes } from './graphql-client'; import { fetchAllProducts, fetchAllProductsBothModes } from './graphql-client';
import { mapDbRowToDispensary } from './discovery';
import { import {
DutchieRawProduct, DutchieRawProduct,
DutchieProduct, DutchieProduct,
@@ -49,6 +50,71 @@ function getMax(arr?: number[]): number | undefined {
return Math.max(...arr.filter((n) => n !== null && n !== undefined)); return Math.max(...arr.filter((n) => n !== null && n !== undefined));
} }
/**
 * Coerce an arbitrary value to a strict boolean.
 *
 * Real booleans pass through unchanged; everything else (null, undefined,
 * strings, numbers, objects, arrays) becomes `defaultVal`. This guards
 * against API payloads carrying `{}` or `[]` where a boolean is expected,
 * which would otherwise fail with "invalid input syntax for type boolean".
 *
 * A warning is emitted each time a non-null object/array is coerced.
 *
 * @param v - Value to normalize.
 * @param defaultVal - Result for any non-boolean input (default `false`).
 * @returns `v` if it is a boolean, otherwise `defaultVal`.
 */
function normBool(v: any, defaultVal: boolean = false): boolean {
  if (typeof v === 'boolean') {
    return v;
  }

  // Surface unexpected object/array payloads to aid debugging.
  const isUnexpectedObject = typeof v === 'object' && v !== null;
  if (isUnexpectedObject) {
    console.warn(`[normBool] Unexpected object value, coercing to ${defaultVal}:`, JSON.stringify(v));
  }

  return defaultVal;
}
/**
 * Normalize a value to a `Date`, or `undefined` when it is not a usable date.
 *
 * Guards against API payloads carrying `{}`, `[]` or other non-date values
 * that would otherwise fail with "invalid input syntax for type timestamp".
 *
 * Only strings, numbers and `Date` instances are parsed. Every other type
 * is rejected up front: `new Date(true)` would silently produce
 * 1970-01-01T00:00:00.001Z, and `new Date(<bigint | symbol>)` throws.
 *
 * @param v - Raw value to normalize.
 * @returns A new `Date` for parseable input; `undefined` for falsy input
 *   (including `0`, `''` and `NaN`), unsupported types, or unparseable values.
 */
function normDate(v: unknown): Date | undefined {
  if (!v) return undefined;

  let parsed: Date;
  if (v instanceof Date) {
    // Copy so callers never share a mutable Date with the raw payload.
    parsed = new Date(v.getTime());
  } else if (typeof v === 'string' || typeof v === 'number') {
    parsed = new Date(v);
  } else if (typeof v === 'object') {
    // Objects/arrays that aren't dates
    console.warn(`[normDate] Unexpected object value, ignoring:`, JSON.stringify(v));
    return undefined;
  } else {
    // boolean, bigint, symbol, function: never a meaningful timestamp
    console.warn(`[normDate] Invalid date value, ignoring:`, v);
    return undefined;
  }

  if (isNaN(parsed.getTime())) {
    console.warn(`[normDate] Invalid date value, ignoring:`, v);
    return undefined;
  }
  return parsed;
}
/**
 * Extract the cName (Dutchie slug) for a dispensary from its menuUrl,
 * falling back to `dispensary.slug`.
 *
 * Resolution order:
 *  1. The path segment immediately after `embedded-menu` or `dispensary`:
 *     - https://dutchie.com/embedded-menu/AZ-Deeply-Rooted -> AZ-Deeply-Rooted
 *     - https://dutchie.com/dispensary/sol-flower/products/flower -> sol-flower
 *     Anchoring on the marker keeps sub-paths after the cName (category or
 *     product pages) from being mistaken for the cName itself.
 *  2. If neither marker is present and the path has at least two segments,
 *     the last segment.
 *  3. Otherwise (no menuUrl, unparseable URL, nothing usable in the path),
 *     `dispensary.slug`.
 *
 * @param dispensary - Dispensary whose `menuUrl` / `slug` are inspected.
 * @returns The cName to use for this dispensary.
 */
function extractCName(dispensary: Dispensary): string {
  if (dispensary.menuUrl) {
    try {
      const url = new URL(dispensary.menuUrl);
      const segments = url.pathname.split('/').filter(Boolean);

      // Locate the marker segment that precedes the cName, if any.
      const markerIndex = segments.findIndex((segment) => {
        const lowered = segment.toLowerCase();
        return lowered === 'embedded-menu' || lowered === 'dispensary';
      });

      let cName: string | undefined;
      if (markerIndex !== -1) {
        // May be undefined when the marker is the final segment; then use the slug.
        cName = segments[markerIndex + 1];
      } else if (segments.length >= 2) {
        // Unknown URL shape: assume the last segment is the cName.
        cName = segments[segments.length - 1];
      }

      if (cName) {
        console.log(`[ProductCrawler] Extracted cName "${cName}" from menuUrl`);
        return cName;
      }
    } catch {
      // new URL() throws on malformed input; fall through to the slug.
      console.warn(`[ProductCrawler] Failed to parse menuUrl: ${dispensary.menuUrl}`);
    }
  }

  // Fallback to slug
  console.log(`[ProductCrawler] Using dispensary slug "${dispensary.slug}" as cName`);
  return dispensary.slug;
}
/** /**
* Normalize a POSMetaData.children entry to DutchieProductOptionSnapshot * Normalize a POSMetaData.children entry to DutchieProductOptionSnapshot
*/ */
@@ -120,16 +186,16 @@ export function normalizeProduct(
// Status / flags // Status / flags
status: raw.Status, status: raw.Status,
medicalOnly: raw.medicalOnly || false, medicalOnly: normBool(raw.medicalOnly, false),
recOnly: raw.recOnly || false, recOnly: normBool(raw.recOnly, false),
featured: raw.featured || false, featured: normBool(raw.featured, false),
comingSoon: raw.comingSoon || false, comingSoon: normBool(raw.comingSoon, false),
certificateOfAnalysisEnabled: raw.certificateOfAnalysisEnabled || false, certificateOfAnalysisEnabled: normBool(raw.certificateOfAnalysisEnabled, false),
isBelowThreshold: raw.isBelowThreshold || false, isBelowThreshold: normBool(raw.isBelowThreshold, false),
isBelowKioskThreshold: raw.isBelowKioskThreshold || false, isBelowKioskThreshold: normBool(raw.isBelowKioskThreshold, false),
optionsBelowThreshold: raw.optionsBelowThreshold || false, optionsBelowThreshold: normBool(raw.optionsBelowThreshold, false),
optionsBelowKioskThreshold: raw.optionsBelowKioskThreshold || false, optionsBelowKioskThreshold: normBool(raw.optionsBelowKioskThreshold, false),
// Derived stock status // Derived stock status
stockStatus: deriveStockStatus(raw), stockStatus: deriveStockStatus(raw),
@@ -144,8 +210,8 @@ export function normalizeProduct(
weight: typeof raw.weight === 'number' ? String(raw.weight) : raw.weight, weight: typeof raw.weight === 'number' ? String(raw.weight) : raw.weight,
pastCNames: raw.pastCNames, pastCNames: raw.pastCNames,
createdAtDutchie: raw.createdAt ? new Date(raw.createdAt) : undefined, createdAtDutchie: normDate(raw.createdAt),
updatedAtDutchie: raw.updatedAt ? new Date(raw.updatedAt) : undefined, updatedAtDutchie: normDate(raw.updatedAt),
latestRawPayload: raw, latestRawPayload: raw,
}; };
@@ -200,10 +266,10 @@ export function normalizeSnapshot(
crawlMode, crawlMode,
status: raw.Status, status: raw.Status,
featured: raw.featured || false, featured: normBool(raw.featured, false),
special: isOnSpecial, special: normBool(isOnSpecial, false),
medicalOnly: raw.medicalOnly || false, medicalOnly: normBool(raw.medicalOnly, false),
recOnly: raw.recOnly || false, recOnly: normBool(raw.recOnly, false),
// Product was present in feed // Product was present in feed
isPresentInFeed: true, isPresentInFeed: true,
@@ -223,9 +289,9 @@ export function normalizeSnapshot(
// Inventory summary - null = unknown, 0 = all OOS // Inventory summary - null = unknown, 0 = all OOS
totalQuantityAvailable: totalQty, totalQuantityAvailable: totalQty,
totalKioskQuantityAvailable: totalKioskQty, totalKioskQuantityAvailable: totalKioskQty,
manualInventory: raw.manualInventory || false, manualInventory: normBool(raw.manualInventory, false),
isBelowThreshold: raw.isBelowThreshold || false, isBelowThreshold: normBool(raw.isBelowThreshold, false),
isBelowKioskThreshold: raw.isBelowKioskThreshold || false, isBelowKioskThreshold: normBool(raw.isBelowKioskThreshold, false),
options, options,
rawPayload: raw, rawPayload: raw,
@@ -469,13 +535,15 @@ async function updateDispensaryCrawlStats(
dispensaryId: number, dispensaryId: number,
productCount: number productCount: number
): Promise<void> { ): Promise<void> {
// Update last_crawl_at to track when we last crawled
// Skip product_count as that column may not exist
await query( await query(
` `
UPDATE dispensaries UPDATE dispensaries
SET last_crawled_at = NOW(), product_count = $2, updated_at = NOW() SET last_crawl_at = NOW(), updated_at = NOW()
WHERE id = $1 WHERE id = $1
`, `,
[dispensaryId, productCount] [dispensaryId]
); );
} }
@@ -701,11 +769,16 @@ export async function crawlDispensaryProducts(
const modeAProductIds = new Set<string>(); const modeAProductIds = new Set<string>();
const modeBProductIds = new Set<string>(); const modeBProductIds = new Set<string>();
// Extract cName for this specific dispensary (used for Puppeteer session & headers)
const cName = extractCName(dispensary);
console.log(`[ProductCrawler] Using cName="${cName}" for dispensary ${dispensary.name}`);
if (useBothModes) { if (useBothModes) {
// Run two-mode crawl for maximum coverage // Run two-mode crawl for maximum coverage
const bothResults = await fetchAllProductsBothModes( const bothResults = await fetchAllProductsBothModes(
dispensary.platformDispensaryId, dispensary.platformDispensaryId,
pricingType pricingType,
{ cName }
); );
modeAProducts = bothResults.modeA.products.length; modeAProducts = bothResults.modeA.products.length;
@@ -742,7 +815,7 @@ export async function crawlDispensaryProducts(
const { products, crawlMode } = await fetchAllProducts( const { products, crawlMode } = await fetchAllProducts(
dispensary.platformDispensaryId, dispensary.platformDispensaryId,
pricingType, pricingType,
{ crawlMode: 'mode_a' } { crawlMode: 'mode_a', cName }
); );
modeAProducts = products.length; modeAProducts = products.length;
@@ -811,13 +884,14 @@ export async function crawlAllArizonaDispensaries(
const results: CrawlResult[] = []; const results: CrawlResult[] = [];
// Get all AZ dispensaries with platform IDs // Get all AZ dispensaries with platform IDs
const { rows: dispensaries } = await query<Dispensary>( const { rows: rawRows } = await query(
` `
SELECT * FROM dispensaries SELECT * FROM dispensaries
WHERE state = 'AZ' AND menu_type = 'dutchie' AND platform_dispensary_id IS NOT NULL WHERE state = 'AZ' AND menu_type = 'dutchie' AND platform_dispensary_id IS NOT NULL
ORDER BY id ORDER BY id
` `
); );
const dispensaries = rawRows.map(mapDbRowToDispensary);
console.log(`[ProductCrawler] Starting crawl of ${dispensaries.length} dispensaries...`); console.log(`[ProductCrawler] Starting crawl of ${dispensaries.length} dispensaries...`);