diff --git a/backend/src/dutchie-az/services/graphql-client.ts b/backend/src/dutchie-az/services/graphql-client.ts index a3fca3b4..a169a0a2 100644 --- a/backend/src/dutchie-az/services/graphql-client.ts +++ b/backend/src/dutchie-az/services/graphql-client.ts @@ -1,24 +1,24 @@ /** * Dutchie GraphQL Client * - * Makes GraphQL requests to Dutchie's API using Puppeteer to bypass Cloudflare. - * Uses in-page fetch to maintain browser session/cookies. + * Uses Puppeteer to establish a session (get CF cookies), then makes + * SERVER-SIDE fetch calls to api-gw.dutchie.com with those cookies. * - * Key features: - * - Browser session reuse between Mode A and Mode B (single browser per store) - * - Config-driven GraphQL hashes - * - POST fallback when GET fails with 405 - * - Pagination retry logic - * - Proper termination on incomplete pages + * DUTCHIE FETCH RULES: + * 1. Server-side only - use axios (never browser fetch with CORS) + * 2. Use dispensaryFilter.cNameOrID, NOT dispensaryId directly + * 3. Headers must mimic Chrome: User-Agent, Origin, Referer + * 4. If 403, extract CF cookies from Puppeteer session and include them + * 5. Log status codes, error bodies, and product counts */ +import axios, { AxiosError } from 'axios'; import puppeteer from 'puppeteer-extra'; -import type { Browser, Page } from 'puppeteer'; +import type { Browser, Page, Protocol } from 'puppeteer'; import StealthPlugin from 'puppeteer-extra-plugin-stealth'; import { DutchieRawProduct, DutchiePOSChild, - FilteredProductsVariables, CrawlMode, } from '../types'; import { dutchieConfig, GRAPHQL_HASHES, ARIZONA_CENTERPOINTS } from '../config/dutchie'; @@ -28,162 +28,165 @@ puppeteer.use(StealthPlugin()); // Re-export for backward compatibility export { GRAPHQL_HASHES, ARIZONA_CENTERPOINTS }; -interface BrowserSession { +interface SessionCredentials { + cookies: string; // Cookie header string + userAgent: string; browser: Browser; - page: Page; - dispensaryId?: string; } // ============================================================ -// BROWSER SESSION MANAGEMENT +// SESSION MANAGEMENT - Get CF cookies via Puppeteer // ============================================================ /** - * Launch a browser session for Dutchie GraphQL requests + * Create a session by navigating to the embedded menu page + * and extracting CF clearance cookies for server-side requests */ -async function createBrowserSession(menuUrl?: string): Promise { +async function createSession(cName: string): Promise { const browser = await puppeteer.launch({ headless: 'new', args: dutchieConfig.browserArgs, }); const page = await browser.newPage(); + const userAgent = dutchieConfig.userAgent; - // Set up stealth - await page.setUserAgent(dutchieConfig.userAgent); + await page.setUserAgent(userAgent); await page.setViewport({ width: 1920, height: 1080 }); await page.evaluateOnNewDocument(() => { Object.defineProperty(navigator, 'webdriver', { get: () => false }); (window as any).chrome = { runtime: {} }; }); - // Navigate to establish session - const url = menuUrl || 'https://dutchie.com/dispensaries'; - console.log(`[GraphQL Client] Loading ${url} to establish session...`); + // Navigate to the embedded menu page for this dispensary + const embeddedMenuUrl = `https://dutchie.com/embedded-menu/${cName}`; + console.log(`[GraphQL Client] Loading ${embeddedMenuUrl} to get CF cookies...`); - await page.goto(url, { - waitUntil: 'networkidle2', - timeout: dutchieConfig.navigationTimeout, - }); - await new Promise((r) => setTimeout(r, dutchieConfig.pageLoadDelay)); - - // Try to get dispensary ID from page if it's a menu page - let dispensaryId: string | undefined; - if (menuUrl && menuUrl.includes('embedded-menu')) { - dispensaryId = await page.evaluate(() => (window as any).reactEnv?.dispensaryId); + try { + await page.goto(embeddedMenuUrl, { + waitUntil: 'networkidle2', + timeout: dutchieConfig.navigationTimeout, + }); + await new Promise((r) => setTimeout(r, dutchieConfig.pageLoadDelay)); + } catch (error: any) { + console.warn(`[GraphQL Client] Navigation warning: ${error.message}`); + // Continue anyway - we may have gotten cookies } - return { browser, page, dispensaryId }; + // Extract cookies + const cookies = await page.cookies(); + const cookieString = cookies.map((c: Protocol.Network.Cookie) => `${c.name}=${c.value}`).join('; '); + + console.log(`[GraphQL Client] Got ${cookies.length} cookies`); + if (cookies.length > 0) { + console.log(`[GraphQL Client] Cookie names: ${cookies.map(c => c.name).join(', ')}`); + } + + return { cookies: cookieString, userAgent, browser }; } /** - * Close browser session + * Close session (browser) */ -async function closeBrowserSession(session: BrowserSession): Promise { +async function closeSession(session: SessionCredentials): Promise { await session.browser.close(); } // ============================================================ -// GRAPHQL EXECUTION WITH POST FALLBACK +// SERVER-SIDE GRAPHQL FETCH USING AXIOS // ============================================================ /** - * Execute a GraphQL query from within the browser context - * Supports GET (default) with POST fallback on 405 errors + * Build headers that mimic a real browser request + */ +function buildHeaders(session: SessionCredentials, cName: string): Record { + const embeddedMenuUrl = `https://dutchie.com/embedded-menu/${cName}`; + + return { + 'accept': 'application/json, text/plain, */*', + 'accept-language': 'en-US,en;q=0.9', + 'accept-encoding': 'gzip, deflate, br', + 'content-type': 'application/json', + 'origin': 'https://dutchie.com', + 'referer': embeddedMenuUrl, + 'user-agent': session.userAgent, + 'apollographql-client-name': 'Marketplace (production)', + 'sec-ch-ua': '"Chromium";v="120", "Google Chrome";v="120", "Not-A.Brand";v="99"', + 'sec-ch-ua-mobile': '?0', + 'sec-ch-ua-platform': '"Windows"', + 'sec-fetch-dest': 'empty', + 'sec-fetch-mode': 'cors', + 'sec-fetch-site': 'same-site', + ...(session.cookies ? { 'cookie': session.cookies } : {}), + }; +} + +/** + * Execute GraphQL query server-side using axios + * Uses cookies from the browser session to bypass CF */ async function executeGraphQL( - page: Page, + session: SessionCredentials, operationName: string, variables: any, hash: string, - endpoint: string = 'https://dutchie.com/graphql' + cName: string ): Promise { - const headers = dutchieConfig.defaultHeaders; - const preferGet = dutchieConfig.preferGet; - const enablePostFallback = dutchieConfig.enablePostFallback; + const endpoint = dutchieConfig.graphqlEndpoint; + const headers = buildHeaders(session, cName); - return page.evaluate( - async ( - opName: string, - vars: any, - queryHash: string, - url: string, - hdrs: Record, - useGet: boolean, - allowPostFallback: boolean - ) => { - const doFetch = async (method: 'GET' | 'POST'): Promise => { - if (method === 'GET') { - const qs = new URLSearchParams({ - operationName: opName, - variables: JSON.stringify(vars), - extensions: JSON.stringify({ - persistedQuery: { version: 1, sha256Hash: queryHash }, - }), - }); - return fetch(`${url}?${qs.toString()}`, { - method: 'GET', - headers: { - ...hdrs, - 'content-type': 'application/json', - }, - credentials: 'include', - }); - } else { - // POST request with full body - return fetch(url, { - method: 'POST', - headers: { - ...hdrs, - 'content-type': 'application/json', - }, - credentials: 'include', - body: JSON.stringify({ - operationName: opName, - variables: vars, - extensions: { - persistedQuery: { version: 1, sha256Hash: queryHash }, - }, - }), - }); - } - }; - - // Try GET first if preferred - if (useGet) { - const response = await doFetch('GET'); - - // If GET fails with 405 and POST fallback is enabled, try POST - if (response.status === 405 && allowPostFallback) { - console.log('[GraphQL] GET returned 405, falling back to POST'); - const postResponse = await doFetch('POST'); - if (!postResponse.ok) { - throw new Error(`HTTP ${postResponse.status} (POST fallback)`); - } - return postResponse.json(); - } - - if (!response.ok) { - throw new Error(`HTTP ${response.status}`); - } - return response.json(); - } else { - // Use POST directly - const response = await doFetch('POST'); - if (!response.ok) { - throw new Error(`HTTP ${response.status}`); - } - return response.json(); - } - }, + // Build request body for POST + const body = { operationName, variables, - hash, - endpoint, - headers, - preferGet, - enablePostFallback - ); + extensions: { + persistedQuery: { version: 1, sha256Hash: hash }, + }, + }; + + console.log(`[GraphQL Client] POST: ${operationName} -> ${endpoint}`); + console.log(`[GraphQL Client] Variables: ${JSON.stringify(variables).slice(0, 300)}...`); + + try { + const response = await axios.post(endpoint, body, { + headers, + timeout: 30000, + validateStatus: () => true, // Don't throw on non-2xx + }); + + // Log response details + console.log(`[GraphQL Client] Response status: ${response.status}`); + + if (response.status !== 200) { + const bodyPreview = typeof response.data === 'string' + ? response.data.slice(0, 500) + : JSON.stringify(response.data).slice(0, 500); + console.error(`[GraphQL Client] HTTP ${response.status}: ${bodyPreview}`); + throw new Error(`HTTP ${response.status}`); + } + + // Check for GraphQL errors + if (response.data?.errors && response.data.errors.length > 0) { + console.error(`[GraphQL Client] GraphQL errors: ${JSON.stringify(response.data.errors[0])}`); + } + + return response.data; + } catch (error: any) { + if (axios.isAxiosError(error)) { + const axiosError = error as AxiosError; + console.error(`[GraphQL Client] Axios error: ${axiosError.message}`); + if (axiosError.response) { + console.error(`[GraphQL Client] Response status: ${axiosError.response.status}`); + console.error(`[GraphQL Client] Response data: ${JSON.stringify(axiosError.response.data).slice(0, 500)}`); + } + if (axiosError.code) { + console.error(`[GraphQL Client] Error code: ${axiosError.code}`); + } + } else { + console.error(`[GraphQL Client] Error: ${error.message}`); + } + throw error; + } } // ============================================================ @@ -192,120 +195,180 @@ async function executeGraphQL( /** * Resolve a dispensary slug to its internal platform ID + * Uses GetAddressBasedDispensaryData query */ export async function resolveDispensaryId(slug: string): Promise { - const session = await createBrowserSession(`https://dutchie.com/embedded-menu/${slug}`); + console.log(`[GraphQL Client] Resolving dispensary ID for slug: ${slug}`); + + const session = await createSession(slug); try { - // First check if we got it from the page context - if (session.dispensaryId) { - console.log(`[GraphQL Client] Got dispensaryId from page: ${session.dispensaryId}`); - return session.dispensaryId; - } + const variables = { + dispensaryFilter: { + cNameOrID: slug, + }, + }; - // Otherwise try the GetAddressBasedDispensaryData query const result = await executeGraphQL( - session.page, + session, 'GetAddressBasedDispensaryData', - { input: { dispensaryId: slug } }, - GRAPHQL_HASHES.GetAddressBasedDispensaryData + variables, + GRAPHQL_HASHES.GetAddressBasedDispensaryData, + slug ); - const dispensaryId = result?.data?.getAddressBasedDispensaryData?.dispensaryId; - console.log(`[GraphQL Client] Resolved ${slug} -> ${dispensaryId}`); - return dispensaryId || null; - } catch (error: any) { - console.error(`[GraphQL Client] Failed to resolve ${slug}:`, error.message); + const dispensaryId = result?.data?.dispensaryBySlug?.id || + result?.data?.dispensary?.id || + result?.data?.getAddressBasedDispensaryData?.dispensary?.id; + + if (dispensaryId) { + console.log(`[GraphQL Client] Resolved ${slug} -> ${dispensaryId}`); + return dispensaryId; + } + + console.log(`[GraphQL Client] Could not resolve ${slug}, response:`, JSON.stringify(result).slice(0, 300)); return null; } finally { - await closeBrowserSession(session); + await closeSession(session); } } +/** + * Discover Arizona dispensaries via geo-based query + */ +export async function discoverArizonaDispensaries(): Promise { + console.log('[GraphQL Client] Discovering Arizona dispensaries...'); + + // Use Phoenix as the default center + const session = await createSession('AZ-Deeply-Rooted'); + const allDispensaries: any[] = []; + const seenIds = new Set(); + + try { + for (const centerpoint of ARIZONA_CENTERPOINTS) { + console.log(`[GraphQL Client] Scanning ${centerpoint.name}...`); + + const variables = { + dispensariesFilter: { + latitude: centerpoint.lat, + longitude: centerpoint.lng, + distance: 100, + state: 'AZ', + }, + }; + + try { + const result = await executeGraphQL( + session, + 'ConsumerDispensaries', + variables, + GRAPHQL_HASHES.ConsumerDispensaries, + 'AZ-Deeply-Rooted' + ); + + const dispensaries = result?.data?.consumerDispensaries || []; + + for (const d of dispensaries) { + const id = d.id || d.dispensaryId; + if (id && !seenIds.has(id)) { + seenIds.add(id); + allDispensaries.push(d); + } + } + + console.log(`[GraphQL Client] Found ${dispensaries.length} in ${centerpoint.name} (${allDispensaries.length} total unique)`); + } catch (error: any) { + console.warn(`[GraphQL Client] Error scanning ${centerpoint.name}: ${error.message}`); + } + + // Delay between requests + await new Promise((r) => setTimeout(r, 1000)); + } + } finally { + await closeSession(session); + } + + console.log(`[GraphQL Client] Discovery complete: ${allDispensaries.length} dispensaries`); + return allDispensaries; +} + // ============================================================ -// FILTER VARIABLE BUILDING +// PRODUCT FILTERING VARIABLES // ============================================================ /** - * Build GraphQL variables based on crawl mode + * Build filter variables for FilteredProducts query * - * MODE A - "UI parity": Matches what Dutchie website shows - * - Status: 'Active' - * - removeProductsBelowOptionThresholds: true (default behavior) - * - bypassOnlineThresholds: false + * CRITICAL: Uses dispensaryId directly (the MongoDB ObjectId, e.g. "6405ef617056e8014d79101b") + * NOT dispensaryFilter.cNameOrID! * - * MODE B - "MAX COVERAGE": Tries to get out-of-stock products - * - Status: undefined (no filter) - * - removeProductsBelowOptionThresholds: false - * - bypassOnlineThresholds: true + * The actual browser request structure is: + * { + * "productsFilter": { + * "dispensaryId": "6405ef617056e8014d79101b", + * "pricingType": "rec", + * "Status": "Active", // Mode A only + * "strainTypes": [], + * "subcategories": [], + * "types": [], + * "useCache": true, + * ... + * }, + * "page": 0, + * "perPage": 100 + * } + * + * Mode A = UI parity (Status: "Active") + * Mode B = MAX COVERAGE (no Status filter) */ function buildFilterVariables( platformDispensaryId: string, pricingType: 'rec' | 'med', crawlMode: CrawlMode, - pageNum: number, + page: number, perPage: number -): FilteredProductsVariables { - if (crawlMode === 'mode_a') { - // UI parity mode - return { - includeEnterpriseSpecials: false, - productsFilter: { - dispensaryId: platformDispensaryId, - pricingType, - Status: 'Active', - types: [], - strainTypes: [], - subcategories: [], - useCache: false, - isDefaultSort: true, - sortBy: 'popularSortIdx', - sortDirection: 1, - bypassOnlineThresholds: false, - isKioskMenu: false, - removeProductsBelowOptionThresholds: true, - }, - page: pageNum, - perPage, - }; - } else { - // MAX COVERAGE mode (mode_b) - return { - includeEnterpriseSpecials: false, - productsFilter: { - dispensaryId: platformDispensaryId, - pricingType, - // No Status filter - try to get all products - types: [], - strainTypes: [], - subcategories: [], - useCache: false, - isDefaultSort: true, - sortBy: 'popularSortIdx', - sortDirection: 1, - bypassOnlineThresholds: true, - isKioskMenu: false, - removeProductsBelowOptionThresholds: false, - }, - page: pageNum, - perPage, - }; +): any { + const isModeA = crawlMode === 'mode_a'; + + const productsFilter: Record = { + dispensaryId: platformDispensaryId, + pricingType: pricingType, + strainTypes: [], + subcategories: [], + types: [], + useCache: false, // Get fresh data + isDefaultSort: true, + sortBy: 'popular', + sortDirection: 1, + bypassOnlineThresholds: false, + isKioskMenu: false, + removeProductsBelowOptionThresholds: true, + }; + + // Mode A: Only active products (UI parity) + if (isModeA) { + productsFilter.Status = 'Active'; } + + return { + includeEnterpriseSpecials: false, + productsFilter, + page, + perPage, + }; } // ============================================================ -// PRODUCT FETCHING WITH RETRY & PAGINATION +// PRODUCT FETCHING WITH PAGINATION // ============================================================ /** - * Fetch all products for a dispensary via paginated GraphQL - * Supports retry logic and proper termination - * - * @param session - Existing browser session to reuse + * Fetch products for a single mode with pagination */ -async function fetchProductsWithSession( - session: BrowserSession, +async function fetchProductsForMode( + session: SessionCredentials, platformDispensaryId: string, + cName: string, pricingType: 'rec' | 'med', crawlMode: CrawlMode ): Promise<{ products: DutchieRawProduct[]; totalCount: number; crawlMode: CrawlMode }> { @@ -319,51 +382,57 @@ async function fetchProductsWithSession( let totalCount = 0; let consecutiveEmptyPages = 0; - console.log(`[GraphQL Client] Fetching products for ${platformDispensaryId} (${pricingType}, ${crawlMode})...`); + console.log(`[GraphQL Client] Fetching products for ${cName} (platformId: ${platformDispensaryId}, ${pricingType}, ${crawlMode})...`); while (pageNum < maxPages) { - const variables = buildFilterVariables( - platformDispensaryId, - pricingType, - crawlMode, - pageNum, - perPage - ); + const variables = buildFilterVariables(platformDispensaryId, pricingType, crawlMode, pageNum, perPage); let result: any = null; let lastError: Error | null = null; - // Retry logic for failed page fetches + // Retry logic for (let attempt = 0; attempt <= maxRetries; attempt++) { try { result = await executeGraphQL( - session.page, + session, 'FilteredProducts', variables, - GRAPHQL_HASHES.FilteredProducts + GRAPHQL_HASHES.FilteredProducts, + cName ); lastError = null; - break; // Success, exit retry loop + break; } catch (error: any) { lastError = error; console.warn(`[GraphQL Client] Page ${pageNum} attempt ${attempt + 1} failed: ${error.message}`); if (attempt < maxRetries) { - await new Promise((r) => setTimeout(r, 1000 * (attempt + 1))); // Exponential backoff + await new Promise((r) => setTimeout(r, 1000 * (attempt + 1))); } } } - // If all retries failed, log error and break if (lastError) { - console.error(`[GraphQL Client] Page ${pageNum} failed after ${maxRetries + 1} attempts: ${lastError.message}`); + console.error(`[GraphQL Client] Page ${pageNum} failed after ${maxRetries + 1} attempts`); break; } - if (result.errors) { - console.error('[GraphQL Client] GraphQL errors:', result.errors); + if (result?.errors) { + console.error('[GraphQL Client] GraphQL errors:', JSON.stringify(result.errors)); break; } + // Log response shape on first page + if (pageNum === 0) { + console.log(`[GraphQL Client] Response keys: ${Object.keys(result || {}).join(', ')}`); + if (result?.data) { + console.log(`[GraphQL Client] data keys: ${Object.keys(result.data || {}).join(', ')}`); + } + if (!result?.data?.filteredProducts) { + console.log(`[GraphQL Client] WARNING: No filteredProducts in response!`); + console.log(`[GraphQL Client] Full response: ${JSON.stringify(result).slice(0, 1000)}`); + } + } + const products = result?.data?.filteredProducts?.products || []; const queryInfo = result?.data?.filteredProducts?.queryInfo; @@ -375,7 +444,6 @@ async function fetchProductsWithSession( `[GraphQL Client] Page ${pageNum}: ${products.length} products (total so far: ${allProducts.length + products.length}/${totalCount})` ); - // PROPER TERMINATION: Stop if products.length < perPage (incomplete page = last page) if (products.length === 0) { consecutiveEmptyPages++; if (consecutiveEmptyPages >= 2) { @@ -387,15 +455,13 @@ async function fetchProductsWithSession( allProducts.push(...products); } - // Stop if we got less than a full page (this is the last page) + // Stop if incomplete page (last page) if (products.length < perPage) { - console.log(`[GraphQL Client] Incomplete page (${products.length} < ${perPage}), stopping pagination`); + console.log(`[GraphQL Client] Incomplete page (${products.length} < ${perPage}), stopping`); break; } pageNum++; - - // Small delay between pages await new Promise((r) => setTimeout(r, pageDelayMs)); } @@ -403,8 +469,12 @@ async function fetchProductsWithSession( return { products: allProducts, totalCount: totalCount || allProducts.length, crawlMode }; } +// ============================================================ +// LEGACY SINGLE-MODE INTERFACE +// ============================================================ + /** - * Fetch all products for a dispensary (legacy interface - creates new browser) + * Fetch all products for a dispensary (single mode) */ export async function fetchAllProducts( platformDispensaryId: string, @@ -414,28 +484,32 @@ export async function fetchAllProducts( maxPages?: number; menuUrl?: string; crawlMode?: CrawlMode; + cName?: string; } = {} ): Promise<{ products: DutchieRawProduct[]; totalCount: number; crawlMode: CrawlMode }> { const { crawlMode = 'mode_a' } = options; - const menuUrl = options.menuUrl || `https://dutchie.com/dispensaries`; - const session = await createBrowserSession(menuUrl); + // cName is now REQUIRED - no default fallback to avoid using wrong store's session + const cName = options.cName; + if (!cName) { + throw new Error('[GraphQL Client] cName is required for fetchAllProducts - cannot use another store\'s session'); + } + + const session = await createSession(cName); try { - return await fetchProductsWithSession(session, platformDispensaryId, pricingType, crawlMode); + return await fetchProductsForMode(session, platformDispensaryId, cName, pricingType, crawlMode); } finally { - await closeBrowserSession(session); + await closeSession(session); } } // ============================================================ -// MODE A+B MERGING WITH OPTIONS +// MODE A+B MERGING // ============================================================ /** * Merge POSMetaData.children arrays from Mode A and Mode B products - * Uses canonicalID/canonicalSKU/canonicalPackageId as merge key - * Mode B children may have different quantityAvailable for options not in Mode A */ function mergeProductOptions( modeAProduct: DutchieRawProduct, @@ -444,22 +518,17 @@ function mergeProductOptions( const modeAChildren = modeAProduct.POSMetaData?.children || []; const modeBChildren = modeBProduct.POSMetaData?.children || []; - // Create a map keyed by option identifier const getOptionKey = (child: DutchiePOSChild): string => { return child.canonicalID || child.canonicalSKU || child.canonicalPackageId || child.option || ''; }; const mergedMap = new Map(); - // Add all Mode A children first (they're "canonical") for (const child of modeAChildren) { const key = getOptionKey(child); - if (key) { - mergedMap.set(key, child); - } + if (key) mergedMap.set(key, child); } - // Add Mode B children that aren't in Mode A (may include OOS options) for (const child of modeBChildren) { const key = getOptionKey(child); if (key && !mergedMap.has(key)) { @@ -472,7 +541,6 @@ function mergeProductOptions( /** * Merge a Mode A product with a Mode B product - * Mode A data is preferred, but children are merged for max coverage */ function mergeProducts( modeAProduct: DutchieRawProduct, @@ -482,10 +550,8 @@ function mergeProducts( return modeAProduct; } - // Merge children arrays const mergedChildren = mergeProductOptions(modeAProduct, modeBProduct); - // Return Mode A product with merged children return { ...modeAProduct, POSMetaData: { @@ -495,9 +561,13 @@ function mergeProducts( }; } +// ============================================================ +// MAIN EXPORT: TWO-MODE CRAWL +// ============================================================ + /** - * Fetch products using BOTH crawl modes with SINGLE browser session - * This ensures maximum coverage by running Mode A then Mode B with the same session + * Fetch products using BOTH crawl modes with SINGLE session + * Runs Mode A then Mode B, merges results */ export async function fetchAllProductsBothModes( platformDispensaryId: string, @@ -506,161 +576,67 @@ export async function fetchAllProductsBothModes( perPage?: number; maxPages?: number; menuUrl?: string; + cName?: string; } = {} ): Promise<{ modeA: { products: DutchieRawProduct[]; totalCount: number }; modeB: { products: DutchieRawProduct[]; totalCount: number }; merged: { products: DutchieRawProduct[]; totalCount: number }; }> { - console.log(`[GraphQL Client] Running two-mode crawl for ${platformDispensaryId} (${pricingType})...`); + // cName is now REQUIRED - no default fallback to avoid using wrong store's session + const cName = options.cName; + if (!cName) { + throw new Error('[GraphQL Client] cName is required for fetchAllProductsBothModes - cannot use another store\'s session'); + } - const menuUrl = options.menuUrl || `https://dutchie.com/dispensaries`; + console.log(`[GraphQL Client] Running two-mode crawl for ${cName} (${pricingType})...`); + console.log(`[GraphQL Client] Platform ID: ${platformDispensaryId}, cName: ${cName}`); - // Create a SINGLE browser session for both modes - const session = await createBrowserSession(menuUrl); + const session = await createSession(cName); try { - // Run Mode A (UI parity) with shared session - const modeAResult = await fetchProductsWithSession( - session, - platformDispensaryId, - pricingType, - 'mode_a' - ); + // Mode A (UI parity) + const modeAResult = await fetchProductsForMode(session, platformDispensaryId, cName, pricingType, 'mode_a'); - // Small delay between modes + // Delay between modes await new Promise((r) => setTimeout(r, dutchieConfig.modeDelayMs)); - // Run Mode B (MAX COVERAGE) with same session - NO new browser! - const modeBResult = await fetchProductsWithSession( - session, - platformDispensaryId, - pricingType, - 'mode_b' - ); + // Mode B (MAX COVERAGE) + const modeBResult = await fetchProductsForMode(session, platformDispensaryId, cName, pricingType, 'mode_b'); - // Build lookup map for Mode B products + // Merge results const modeBMap = new Map(); for (const product of modeBResult.products) { modeBMap.set(product._id, product); } - // Merge results - deduplicate by _id, merge options const productMap = new Map(); - // Add Mode A products first (canonical), merging with Mode B if exists + // Add Mode A products, merging with Mode B if exists for (const product of modeAResult.products) { const modeBProduct = modeBMap.get(product._id); const mergedProduct = mergeProducts(product, modeBProduct); productMap.set(product._id, mergedProduct); } - // Add Mode B products that aren't in Mode A (may include OOS items) + // Add Mode B products not in Mode A for (const product of modeBResult.products) { if (!productMap.has(product._id)) { productMap.set(product._id, product); } } - const merged = Array.from(productMap.values()); + const mergedProducts = Array.from(productMap.values()); - console.log(`[GraphQL Client] Two-mode crawl complete:`); - console.log(` Mode A: ${modeAResult.products.length} products`); - console.log(` Mode B: ${modeBResult.products.length} products`); - console.log(` Merged: ${merged.length} unique products`); + console.log(`[GraphQL Client] Merged: ${mergedProducts.length} unique products`); + console.log(`[GraphQL Client] Mode A: ${modeAResult.products.length}, Mode B: ${modeBResult.products.length}`); return { modeA: { products: modeAResult.products, totalCount: modeAResult.totalCount }, modeB: { products: modeBResult.products, totalCount: modeBResult.totalCount }, - merged: { products: merged, totalCount: merged.length }, + merged: { products: mergedProducts, totalCount: mergedProducts.length }, }; } finally { - // Close the shared session when done - await closeBrowserSession(session); + await closeSession(session); } } - -// ============================================================ -// DISPENSARY DISCOVERY -// ============================================================ - -/** - * Discover dispensaries near a geographic point - */ -export async function discoverDispensaries( - lat: number, - lng: number, - radiusKm: number = 100 -): Promise { - // Skip discovery if disabled in config - if (!dutchieConfig.useDiscovery) { - console.log('[GraphQL Client] Discovery disabled in config, skipping'); - return []; - } - - const session = await createBrowserSession(); - - try { - console.log(`[GraphQL Client] Discovering dispensaries near ${lat}, ${lng}...`); - - // Try to use ConsumerDispensaries or similar discovery query - // Note: The exact operation may need to be captured from live traffic - const result = await executeGraphQL( - session.page, - 'ConsumerDispensaries', - { - filter: { - lat, - lng, - radius: radiusKm * 1000, // Convert to meters if needed - isDelivery: false, - }, - }, - GRAPHQL_HASHES.ConsumerDispensaries - ); - - const dispensaries = result?.data?.consumerDispensaries || []; - console.log(`[GraphQL Client] Found ${dispensaries.length} dispensaries`); - return dispensaries; - } catch (error: any) { - console.error(`[GraphQL Client] Discovery failed:`, error.message); - return []; - } finally { - await closeBrowserSession(session); - } -} - -/** - * Discover all Arizona Dutchie dispensaries using multiple centerpoints - */ -export async function discoverArizonaDispensaries(): Promise { - const allDispensaries = new Map(); - - for (const center of ARIZONA_CENTERPOINTS) { - console.log(`[GraphQL Client] Scanning ${center.name}...`); - - try { - const dispensaries = await discoverDispensaries(center.lat, center.lng, 150); - - for (const disp of dispensaries) { - // Filter to AZ only - const state = disp.state || disp.address?.state; - if (state === 'AZ' || state === 'Arizona') { - const key = disp.slug || disp.cName || disp.id; - if (key && !allDispensaries.has(key)) { - allDispensaries.set(key, disp); - } - } - } - - // Delay between scans - await new Promise((r) => setTimeout(r, 2000)); - } catch (error: any) { - console.error(`[GraphQL Client] Failed to scan ${center.name}:`, error.message); - } - } - - const result = Array.from(allDispensaries.values()); - console.log(`[GraphQL Client] Total unique AZ dispensaries: ${result.length}`); - return result; -} diff --git a/backend/src/dutchie-az/services/product-crawler.ts b/backend/src/dutchie-az/services/product-crawler.ts index b7fc58da..b3e8fe24 100644 --- a/backend/src/dutchie-az/services/product-crawler.ts +++ b/backend/src/dutchie-az/services/product-crawler.ts @@ -7,6 +7,7 @@ import { query, getClient } from '../db/connection'; import { fetchAllProducts, fetchAllProductsBothModes } from './graphql-client'; +import { mapDbRowToDispensary } from './discovery'; import { DutchieRawProduct, DutchieProduct, @@ -49,6 +50,71 @@ function getMax(arr?: number[]): number | undefined { return Math.max(...arr.filter((n) => n !== null && n !== undefined)); } +/** + * Normalize a value to boolean + * Handles Dutchie API returning {} or [] or other non-boolean values + * that would cause "invalid input syntax for type boolean" errors + */ +function normBool(v: any, defaultVal: boolean = false): boolean { + if (v === true) return true; + if (v === false) return false; + // Log unexpected object/array values once for debugging + if (v !== null && v !== undefined && typeof v === 'object') { + console.warn(`[normBool] Unexpected object value, coercing to ${defaultVal}:`, JSON.stringify(v)); + } + return defaultVal; +} + +/** + * Normalize a value to Date or undefined + * Handles Dutchie API returning {} or [] or other non-date values + * that would cause "invalid input syntax for type timestamp" errors + */ +function normDate(v: any): Date | undefined { + if (!v) return undefined; + // Reject objects/arrays that aren't dates + if (typeof v === 'object' && !(v instanceof Date)) { + console.warn(`[normDate] Unexpected object value, ignoring:`, JSON.stringify(v)); + return undefined; + } + // Try parsing + const d = new Date(v); + if (isNaN(d.getTime())) { + console.warn(`[normDate] Invalid date value, ignoring:`, v); + return undefined; + } + return d; +} + +/** + * Extract cName (Dutchie slug) from menuUrl or dispensary slug + * Handles URL formats: + * - https://dutchie.com/embedded-menu/AZ-Deeply-Rooted -> AZ-Deeply-Rooted + * - https://dutchie.com/dispensary/sol-flower-dispensary-mcclintock -> sol-flower-dispensary-mcclintock + * Falls back to dispensary.slug if menuUrl extraction fails + */ +function extractCName(dispensary: Dispensary): string { + if (dispensary.menuUrl) { + try { + const url = new URL(dispensary.menuUrl); + // Extract last path segment: /embedded-menu/X or /dispensary/X + const segments = url.pathname.split('/').filter(Boolean); + if (segments.length >= 2) { + const cName = segments[segments.length - 1]; + if (cName) { + console.log(`[ProductCrawler] Extracted cName "${cName}" from menuUrl`); + return cName; + } + } + } catch (e) { + console.warn(`[ProductCrawler] Failed to parse menuUrl: ${dispensary.menuUrl}`); + } + } + // Fallback to slug + console.log(`[ProductCrawler] Using dispensary slug "${dispensary.slug}" as cName`); + return dispensary.slug; +} + /** * Normalize a POSMetaData.children entry to DutchieProductOptionSnapshot */ @@ -120,16 +186,16 @@ export function normalizeProduct( // Status / flags status: raw.Status, - medicalOnly: raw.medicalOnly || false, - recOnly: raw.recOnly || false, - featured: raw.featured || false, - comingSoon: raw.comingSoon || false, - certificateOfAnalysisEnabled: raw.certificateOfAnalysisEnabled || false, + medicalOnly: normBool(raw.medicalOnly, false), + recOnly: normBool(raw.recOnly, false), + featured: normBool(raw.featured, false), + comingSoon: normBool(raw.comingSoon, false), + certificateOfAnalysisEnabled: normBool(raw.certificateOfAnalysisEnabled, false), - isBelowThreshold: raw.isBelowThreshold || false, - isBelowKioskThreshold: raw.isBelowKioskThreshold || false, - optionsBelowThreshold: raw.optionsBelowThreshold || false, - optionsBelowKioskThreshold: raw.optionsBelowKioskThreshold || false, + isBelowThreshold: normBool(raw.isBelowThreshold, false), + isBelowKioskThreshold: normBool(raw.isBelowKioskThreshold, false), + optionsBelowThreshold: normBool(raw.optionsBelowThreshold, false), + optionsBelowKioskThreshold: normBool(raw.optionsBelowKioskThreshold, false), // Derived stock status stockStatus: deriveStockStatus(raw), @@ -144,8 +210,8 @@ export function normalizeProduct( weight: typeof raw.weight === 'number' ? String(raw.weight) : raw.weight, pastCNames: raw.pastCNames, - createdAtDutchie: raw.createdAt ? new Date(raw.createdAt) : undefined, - updatedAtDutchie: raw.updatedAt ? new Date(raw.updatedAt) : undefined, + createdAtDutchie: normDate(raw.createdAt), + updatedAtDutchie: normDate(raw.updatedAt), latestRawPayload: raw, }; @@ -200,10 +266,10 @@ export function normalizeSnapshot( crawlMode, status: raw.Status, - featured: raw.featured || false, - special: isOnSpecial, - medicalOnly: raw.medicalOnly || false, - recOnly: raw.recOnly || false, + featured: normBool(raw.featured, false), + special: normBool(isOnSpecial, false), + medicalOnly: normBool(raw.medicalOnly, false), + recOnly: normBool(raw.recOnly, false), // Product was present in feed isPresentInFeed: true, @@ -223,9 +289,9 @@ export function normalizeSnapshot( // Inventory summary - null = unknown, 0 = all OOS totalQuantityAvailable: totalQty, totalKioskQuantityAvailable: totalKioskQty, - manualInventory: raw.manualInventory || false, - isBelowThreshold: raw.isBelowThreshold || false, - isBelowKioskThreshold: raw.isBelowKioskThreshold || false, + manualInventory: normBool(raw.manualInventory, false), + isBelowThreshold: normBool(raw.isBelowThreshold, false), + isBelowKioskThreshold: normBool(raw.isBelowKioskThreshold, false), options, rawPayload: raw, @@ -469,13 +535,15 @@ async function updateDispensaryCrawlStats( dispensaryId: number, productCount: number ): Promise { + // Update last_crawl_at to track when we last crawled + // Skip product_count as that column may not exist await query( ` UPDATE dispensaries - SET last_crawled_at = NOW(), product_count = $2, updated_at = NOW() + SET last_crawl_at = NOW(), updated_at = NOW() WHERE id = $1 `, - [dispensaryId, productCount] + [dispensaryId] ); } @@ -701,11 +769,16 @@ export async function crawlDispensaryProducts( const modeAProductIds = new Set(); const modeBProductIds = new Set(); + // Extract cName for this specific dispensary (used for Puppeteer session & headers) + const cName = extractCName(dispensary); + console.log(`[ProductCrawler] Using cName="${cName}" for dispensary ${dispensary.name}`); + if (useBothModes) { // Run two-mode crawl for maximum coverage const bothResults = await fetchAllProductsBothModes( dispensary.platformDispensaryId, - pricingType + pricingType, + { cName } ); modeAProducts = bothResults.modeA.products.length; @@ -742,7 +815,7 @@ export async function crawlDispensaryProducts( const { products, crawlMode } = await fetchAllProducts( dispensary.platformDispensaryId, pricingType, - { crawlMode: 'mode_a' } + { crawlMode: 'mode_a', cName } ); modeAProducts = products.length; @@ -811,13 +884,14 @@ export async function crawlAllArizonaDispensaries( const results: CrawlResult[] = []; // Get all AZ dispensaries with platform IDs - const { rows: dispensaries } = await query( + const { rows: rawRows } = await query( ` SELECT * FROM dispensaries WHERE state = 'AZ' AND menu_type = 'dutchie' AND platform_dispensary_id IS NOT NULL ORDER BY id ` ); + const dispensaries = rawRows.map(mapDbRowToDispensary); console.log(`[ProductCrawler] Starting crawl of ${dispensaries.length} dispensaries...`);