/**
 * ============================================================
 * DUTCHIE PLATFORM CLIENT - LOCKED MODULE
 * ============================================================
 *
 * DO NOT MODIFY THIS FILE WITHOUT EXPLICIT AUTHORIZATION.
 *
 * This is the canonical HTTP client for all Dutchie communication.
 * All Dutchie workers (Alice, Bella, etc.) MUST use this client.
 *
 * IMPLEMENTATION:
 * - Uses curl via child_process.execSync (bypasses TLS fingerprinting)
 * - NO Puppeteer, NO axios, NO fetch
 * - Fingerprint rotation on 403
 * - Residential IP compatible
 *
 * USAGE:
 *   import { curlPost, curlGet, executeGraphQL } from '@dutchie/client';
 *
 * ============================================================
 */

import { execSync } from 'child_process';

// ============================================================
// TYPES
// ============================================================

/**
 * Result of a single curl invocation.
 */
export interface CurlResponse {
  /** HTTP status code reported by curl; `0` when the curl process itself failed. */
  status: number;
  /** Response body: parsed JSON when the body is valid JSON, otherwise the raw text. */
  data: any;
  /** Set only when the curl process failed (non-zero exit, timeout, ...). */
  error?: string;
}

/**
 * Browser identity presented in request headers.
 */
export interface Fingerprint {
  /** Value of the `user-agent` header. */
  userAgent: string;
  /** Value of the `accept-language` header. */
  acceptLanguage: string;
  /** Value of `sec-ch-ua`; when absent no `sec-ch-*` / `sec-fetch-*` headers are sent. */
  secChUa?: string;
  /** Value of `sec-ch-ua-platform`. */
  secChUaPlatform?: string;
  /** Value of `sec-ch-ua-mobile`. */
  secChUaMobile?: string;
}

// ============================================================
// CONFIGURATION
// ============================================================

/**
 * Shared settings for Dutchie requests.
 *
 * NOTE(review): only `graphqlEndpoint`, `baseUrl` and `timeout` are read in the
 * visible part of this module; the paging/delay values are presumably consumed
 * by callers - confirm before changing them.
 */
export const DUTCHIE_CONFIG = {
  /** Endpoint that receives persisted-query GraphQL POSTs. */
  graphqlEndpoint: 'https://dutchie.com/api-3/graphql',
  /** Origin prepended to page paths. */
  baseUrl: 'https://dutchie.com',
  /** Per-request timeout in milliseconds. */
  timeout: 30000,
  maxRetries: 3,
  perPage: 100,
  maxPages: 200,
  pageDelayMs: 500,
  modeDelayMs: 2000,
};

// ============================================================
// PROXY SUPPORT
// ============================================================
// Integrates with the CrawlRotator system from proxy-rotator.ts
// On 403 errors:
//   1. Record failure on current proxy
//   2. Rotate to next proxy
//   3.
// Retry with new proxy
// ============================================================

import type { CrawlRotator, Proxy } from '../../services/crawl-rotator';

/** Proxy URL applied to every curl invocation; `null` means a direct connection. */
let currentProxy: string | null = null;

/** Rotator consulted when a request is blocked; `null` disables proxy rotation. */
let crawlRotator: CrawlRotator | null = null;

/**
 * Hide the password portion of a proxy URL before it is written to the log.
 */
function maskProxyCredentials(proxyUrl: string): string {
  return proxyUrl.replace(/:[^:@]+@/, ':***@');
}

/**
 * Quote a value for use as one POSIX shell word.
 *
 * The value is wrapped in single quotes and every embedded single quote is
 * rewritten as `'\''`, so the shell passes it to curl unchanged.
 */
function shellQuote(value: string): string {
  return "'" + value.replace(/'/g, "'\\''") + "'";
}

/**
 * Set proxy for all Dutchie requests
 * Format: http://user:pass@host:port or socks5://host:port
 *
 * @param proxy Proxy URL, or `null` to connect directly.
 */
export function setProxy(proxy: string | null): void {
  currentProxy = proxy;
  if (proxy) {
    console.log(`[Dutchie Client] Proxy set: ${maskProxyCredentials(proxy)}`);
  } else {
    console.log('[Dutchie Client] Proxy disabled (direct connection)');
  }
}

/**
 * Get current proxy URL
 */
export function getProxy(): string | null {
  return currentProxy;
}

/**
 * Set CrawlRotator for proxy rotation on 403s
 * This enables automatic proxy rotation when blocked
 *
 * When the rotator already has a current proxy it becomes the active proxy.
 */
export function setCrawlRotator(rotator: CrawlRotator | null): void {
  crawlRotator = rotator;
  if (!rotator) {
    return;
  }

  console.log('[Dutchie Client] CrawlRotator attached - proxy rotation enabled');

  // Set initial proxy from rotator
  const proxy = rotator.proxy.getCurrent();
  if (proxy) {
    const proxyUrl = rotator.proxy.getProxyUrl(proxy);
    currentProxy = proxyUrl;
    console.log(`[Dutchie Client] Initial proxy: ${maskProxyCredentials(proxyUrl)}`);
  }
}

/**
 * Get attached CrawlRotator
 */
export function getCrawlRotator(): CrawlRotator | null {
  return crawlRotator;
}

/**
 * Rotate to next proxy (called on 403)
 *
 * @param error Failure reason recorded against the current proxy.
 * @returns `true` when a new proxy was selected, `false` when no rotator is
 *          attached or the rotator has no further proxy.
 */
async function rotateProxyOn403(error?: string): Promise<boolean> {
  if (!crawlRotator) {
    return false;
  }

  // Record failure on current proxy
  await crawlRotator.recordFailure(error || '403 Forbidden');

  // Rotate to next proxy
  const nextProxy = crawlRotator.rotateProxy();
  if (nextProxy) {
    const proxyUrl = crawlRotator.proxy.getProxyUrl(nextProxy);
    currentProxy = proxyUrl;
    console.log(`[Dutchie Client] Rotated proxy: ${maskProxyCredentials(proxyUrl)}`);
    return true;
  }

  console.warn('[Dutchie Client] No more proxies available');
  return false;
}

/**
 * Record success on current proxy
 *
 * No-op when no rotator is attached.
 */
async function recordProxySuccess(responseTimeMs?: number): Promise<void> {
  if (crawlRotator) {
    await crawlRotator.recordSuccess(responseTimeMs);
  }
}

/**
 * Build curl proxy argument
 *
 * @returns `--proxy '<url>'` with the URL shell-quoted, or `''` when no proxy is set.
 */
function getProxyArg(): string {
  if (!currentProxy) return '';
  return `--proxy ${shellQuote(currentProxy)}`;
}

/** Persisted-query SHA-256 hashes keyed by GraphQL operation name. */
export const GRAPHQL_HASHES = {
  FilteredProducts: 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0',
  GetAddressBasedDispensaryData: '13461f73abf7268770dfd05fe7e10c523084b2bb916a929c08efe3d87531977b',
  ConsumerDispensaries: '0a5bfa6ca1d64ae47bcccb7c8077c87147cbc4e6982c17ceec97a2a4948b311b',
  DispensaryInfo: '13461f73abf7268770dfd05fe7e10c523084b2bb916a929c08efe3d87531977b',
  GetAllCitiesByState: 'ae547a0466ace5a48f91e55bf6699eacd87e3a42841560f0c0eabed5a0a920e6',
};

// ============================================================
// FINGERPRINTS - Browser profiles for anti-detect
// ============================================================

const FINGERPRINTS: Fingerprint[] = [
  // Chrome Windows (latest) - typical residential user, use first
  {
    userAgent:
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
    acceptLanguage: 'en-US,en;q=0.9',
    secChUa: '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
    secChUaPlatform: '"Windows"',
    secChUaMobile: '?0',
  },
  // Chrome Mac (latest)
  {
    userAgent:
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
    acceptLanguage: 'en-US,en;q=0.9',
    secChUa: '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
    secChUaPlatform: '"macOS"',
    secChUaMobile: '?0',
  },
  // Chrome Windows (120)
  {
    userAgent:
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    acceptLanguage: 'en-US,en;q=0.9',
    secChUa: '"Chromium";v="120", "Google Chrome";v="120", "Not-A.Brand";v="99"',
    secChUaPlatform: '"Windows"',
    secChUaMobile: '?0',
  },
  // Firefox Windows
  {
    userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0',
    acceptLanguage: 'en-US,en;q=0.5',
  },
  // Safari Mac
  {
    userAgent:
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
    acceptLanguage: 'en-US,en;q=0.9',
  },
  // Edge Windows
  {
    userAgent:
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0',
    acceptLanguage: 'en-US,en;q=0.9',
    secChUa: '"Microsoft Edge";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
    secChUaPlatform: '"Windows"',
    secChUaMobile: '?0',
  },
];

/** Index into FINGERPRINTS used when no session is active. */
let currentFingerprintIndex = 0;

/** Active crawl session, if any (the CrawlSession interface is declared further down; interfaces are hoisted). */
let currentSession: CrawlSession | null = null;

/**
 * Get current fingerprint - returns session fingerprint if active, otherwise default
 */
export function getFingerprint(): Fingerprint {
  // Use session fingerprint if a session is active
  if (currentSession) {
    return currentSession.fingerprint;
  }
  return FINGERPRINTS[currentFingerprintIndex];
}

/**
 * Advance to the next fingerprint in the pool (wrapping around).
 *
 * When a session is active its fingerprint is replaced as well, keeping the
 * session's Accept-Language; otherwise getFingerprint() would keep returning
 * the fingerprint that was just blocked and the rotation would have no effect.
 *
 * @returns The newly selected pool fingerprint.
 */
export function rotateFingerprint(): Fingerprint {
  currentFingerprintIndex = (currentFingerprintIndex + 1) % FINGERPRINTS.length;
  const fp = FINGERPRINTS[currentFingerprintIndex];

  if (currentSession) {
    currentSession.fingerprint = {
      ...fp,
      acceptLanguage: currentSession.fingerprint.acceptLanguage,
    };
  }

  console.log(`[Dutchie Client] Rotated to fingerprint: ${fp.userAgent.slice(0, 50)}...`);
  return fp;
}

/**
 * Point the non-session fingerprint back at the first pool entry.
 */
export function resetFingerprint(): void {
  currentFingerprintIndex = 0;
}

/**
 * Get a random fingerprint from the pool
 */
export function getRandomFingerprint(): Fingerprint {
  const index = Math.floor(Math.random() * FINGERPRINTS.length);
  return FINGERPRINTS[index];
}

// ============================================================
// SESSION MANAGEMENT
// Per-session fingerprint rotation for stealth
// ============================================================

/**
 * Identity used for one crawl: fingerprint plus the proxy that was active
 * when the session started.
 */
export interface CrawlSession {
  sessionId: string;
  fingerprint: Fingerprint;
  /** Value of the active proxy at the moment the session was started. */
  proxyUrl: string | null;
  stateCode?: string;
  timezone?: string;
  startedAt: Date;
}

/**
 * Timezone to Accept-Language mapping
 * US timezones all use en-US but this can be extended for international
 */
const TIMEZONE_TO_LOCALE: Record<string, string> = {
  'America/Phoenix': 'en-US,en;q=0.9',
  'America/Los_Angeles': 'en-US,en;q=0.9',
  'America/Denver': 'en-US,en;q=0.9',
  'America/Chicago': 'en-US,en;q=0.9',
  'America/New_York': 'en-US,en;q=0.9',
  'America/Detroit': 'en-US,en;q=0.9',
  'America/Anchorage': 'en-US,en;q=0.9',
  'Pacific/Honolulu': 'en-US,en;q=0.9',
};

/**
 * Get Accept-Language header for a given timezone
 *
 * @returns The mapped value, or `en-US,en;q=0.9` for a missing or unknown timezone.
 */
export function getLocaleForTimezone(timezone?: string): string {
  if (!timezone) return 'en-US,en;q=0.9';
  return TIMEZONE_TO_LOCALE[timezone] || 'en-US,en;q=0.9';
}

/**
 * Start a new crawl session with a random fingerprint
 * Call this before crawling a store to get a fresh identity
 *
 * @param stateCode Stored on the session as-is.
 * @param timezone  Used to pick the session's Accept-Language.
 * @returns The session that is now active.
 */
export function startSession(stateCode?: string, timezone?: string): CrawlSession {
  const baseFp = getRandomFingerprint();

  // Override Accept-Language based on timezone for geographic consistency
  const fingerprint: Fingerprint = {
    ...baseFp,
    acceptLanguage: getLocaleForTimezone(timezone),
  };

  const session: CrawlSession = {
    sessionId: `session_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`,
    fingerprint,
    proxyUrl: currentProxy,
    stateCode,
    timezone,
    startedAt: new Date(),
  };
  currentSession = session;

  console.log(`[Dutchie Client] Started session ${session.sessionId}`);
  console.log(`[Dutchie Client] Fingerprint: ${fingerprint.userAgent.slice(0, 50)}...`);
  console.log(`[Dutchie Client] Accept-Language: ${fingerprint.acceptLanguage}`);
  if (timezone) {
    console.log(`[Dutchie Client] Timezone: ${timezone}`);
  }

  return session;
}

/**
 * End the current crawl session
 *
 * No-op when no session is active.
 */
export function endSession(): void {
  if (currentSession) {
    const duration = Math.round((Date.now() - currentSession.startedAt.getTime()) / 1000);
    console.log(`[Dutchie Client] Ended session ${currentSession.sessionId} (${duration}s)`);
    currentSession = null;
  }
}

/**
 * Get current active session
 */
export function getCurrentSession(): CrawlSession | null {
  return currentSession;
}

// ============================================================
// CURL HTTP CLIENT
// ============================================================

/** Marker curl appends (via -w) in front of the HTTP status code. */
const HTTP_STATUS_SEPARATOR = '___HTTP_STATUS___';

/**
 * Build headers for Dutchie requests
 *
 * @param refererPath Path appended to https://dutchie.com for the Referer header.
 * @param fingerprint Identity to present; defaults to getFingerprint().
 */
export function buildHeaders(refererPath: string, fingerprint?: Fingerprint): Record<string, string> {
  const fp = fingerprint || getFingerprint();
  const refererUrl = `https://dutchie.com${refererPath}`;

  const headers: Record<string, string> = {
    'accept': 'application/json, text/plain, */*',
    'accept-language': fp.acceptLanguage,
    'content-type': 'application/json',
    'origin': 'https://dutchie.com',
    'referer': refererUrl,
    'user-agent': fp.userAgent,
    'apollographql-client-name': 'Marketplace (production)',
  };

  // Client-hint / fetch-metadata headers are only sent for fingerprints that define sec-ch-ua.
  if (fp.secChUa) {
    headers['sec-ch-ua'] = fp.secChUa;
    headers['sec-ch-ua-mobile'] = fp.secChUaMobile || '?0';
    headers['sec-ch-ua-platform'] = fp.secChUaPlatform || '"Windows"';
    headers['sec-fetch-dest'] = 'empty';
    headers['sec-fetch-mode'] = 'cors';
    headers['sec-fetch-site'] = 'same-site';
  }

  return headers;
}

/**
 * Turn a header map into shell-quoted `-H` arguments.
 * `accept-encoding` is dropped because curl is run with --compressed.
 */
function buildHeaderArgs(headers: Record<string, string>): string {
  return Object.entries(headers)
    .filter(([name]) => name.toLowerCase() !== 'accept-encoding')
    .map(([name, value]) => `-H ${shellQuote(`${name}: ${value}`)}`)
    .join(' ');
}

/**
 * Assemble the curl command line; a request body (when given) is sent with -d.
 */
function buildCurlCommand(
  url: string,
  headers: Record<string, string>,
  timeout: number,
  bodyJson?: string
): string {
  const parts: string[] = ['curl -s --compressed'];

  const proxyArg = getProxyArg();
  if (proxyArg) parts.push(proxyArg);

  parts.push(`-w '${HTTP_STATUS_SEPARATOR}%{http_code}'`);
  parts.push(`--max-time ${Math.ceil(timeout / 1000)}`);

  const headerArgs = buildHeaderArgs(headers);
  if (headerArgs) parts.push(headerArgs);

  if (bodyJson !== undefined) parts.push(`-d ${shellQuote(bodyJson)}`);
  parts.push(shellQuote(url));

  return parts.join(' ');
}

/** Parse text as JSON, falling back to the text itself when it is not valid JSON. */
function parseJsonOrText(text: string): unknown {
  try {
    return JSON.parse(text);
  } catch {
    return text;
  }
}

/** Parse a status code; anything that is not a number becomes 0. */
function toStatusCode(raw: string): number {
  const code = parseInt(raw, 10);
  return Number.isNaN(code) ? 0 : code;
}

/**
 * Split curl output into body and status code.
 *
 * @param output              Raw stdout of curl.
 * @param parseFallbackAsJson Whether the body is JSON-parsed on the fallback
 *                            path (marker missing, status taken from the last line).
 */
function parseCurlOutput(output: string, parseFallbackAsJson: boolean): CurlResponse {
  const markerAt = output.lastIndexOf(HTTP_STATUS_SEPARATOR);

  if (markerAt === -1) {
    const lines = output.trim().split('\n');
    const status = toStatusCode(lines.pop() || '0');
    const text = lines.join('\n');
    return { status, data: parseFallbackAsJson ? parseJsonOrText(text) : text };
  }

  const text = output.slice(0, markerAt);
  const status = toStatusCode(output.slice(markerAt + HTTP_STATUS_SEPARATOR.length).trim());
  return { status, data: parseJsonOrText(text) };
}

/**
 * Run a curl command synchronously and convert any process failure into a
 * CurlResponse with status 0 instead of throwing.
 */
function runCurl(cmd: string, timeout: number, parseFallbackAsJson: boolean): CurlResponse {
  try {
    const output = execSync(cmd, {
      encoding: 'utf-8',
      maxBuffer: 10 * 1024 * 1024,
      // Give curl's own --max-time a chance to fire before the process is killed.
      timeout: timeout + 5000,
    });
    return parseCurlOutput(output, parseFallbackAsJson);
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : '';
    return { status: 0, data: null, error: message || 'curl request failed' };
  }
}

/**
 * Execute HTTP POST using curl (bypasses TLS fingerprinting)
 *
 * @param url     Target URL.
 * @param body    Value serialised with JSON.stringify and sent as the request body.
 * @param headers Request headers (`accept-encoding` is ignored).
 * @param timeout Timeout in milliseconds.
 * @returns Status and body; `status: 0` plus `error` when curl itself failed.
 */
export function curlPost(
  url: string,
  body: any,
  headers: Record<string, string>,
  timeout = 30000
): CurlResponse {
  const cmd = buildCurlCommand(url, headers, timeout, JSON.stringify(body));
  return runCurl(cmd, timeout, true);
}

/**
 * Execute HTTP GET using curl (bypasses TLS fingerprinting)
 * Returns HTML or JSON depending on whether the response body parses as JSON
 *
 * @param url     Target URL.
 * @param headers Request headers (`accept-encoding` is ignored).
 * @param timeout Timeout in milliseconds.
 * @returns Status and body; `status: 0` plus `error` when curl itself failed.
 */
export function curlGet(url: string, headers: Record<string, string>, timeout = 30000): CurlResponse {
  const cmd = buildCurlCommand(url, headers, timeout);
  return runCurl(cmd, timeout, false);
}

// ============================================================
// GRAPHQL EXECUTION
// ============================================================

export interface ExecuteGraphQLOptions {
  maxRetries?: number;
  retryOn403?: boolean;
  cName?: string; // Optional - used for Referer header, defaults to 'cities'
}

/**
 * Execute GraphQL query with curl (bypasses TLS fingerprinting)
 *
 * Sends a persisted-query POST and retries up to `maxRetries` additional
 * times, waiting `1000 * attempt` ms between attempts. On a 403 (with
 * `retryOn403`) the proxy and fingerprint are rotated before retrying.
 *
 * @returns The parsed response body of the first HTTP 200 (GraphQL-level
 *          errors are logged, not thrown).
 * @throws The error of the last failed attempt once all attempts are used.
 */
export async function executeGraphQL(
  operationName: string,
  variables: any,
  hash: string,
  options: ExecuteGraphQLOptions = {}
): Promise<any> {
  const { maxRetries = 3, retryOn403 = true, cName = 'cities' } = options;

  const body = {
    operationName,
    variables,
    extensions: {
      persistedQuery: { version: 1, sha256Hash: hash },
    },
  };

  let lastError: Error | null = null;
  let attempt = 0;

  while (attempt <= maxRetries) {
    const fingerprint = getFingerprint();
    const headers = buildHeaders(`/embedded-menu/${cName}`, fingerprint);

    console.log(`[Dutchie Client] curl POST ${operationName} (attempt ${attempt + 1}/${maxRetries + 1})`);

    const response = curlPost(DUTCHIE_CONFIG.graphqlEndpoint, body, headers, DUTCHIE_CONFIG.timeout);

    console.log(`[Dutchie Client] Response status: ${response.status}`);

    if (!response.error && response.status === 200) {
      if (response.data?.errors?.length > 0) {
        console.warn(`[Dutchie Client] GraphQL errors: ${JSON.stringify(response.data.errors[0])}`);
      }
      return response.data;
    }

    if (response.error) {
      console.error(`[Dutchie Client] curl error: ${response.error}`);
      lastError = new Error(response.error);
    } else if (response.status === 403 && retryOn403) {
      console.warn(`[Dutchie Client] 403 blocked - rotating proxy and fingerprint...`);
      await rotateProxyOn403('403 Forbidden on GraphQL');
      rotateFingerprint();
      lastError = new Error('HTTP 403');
    } else {
      const bodyPreview =
        typeof response.data === 'string'
          ? response.data.slice(0, 200)
          : (JSON.stringify(response.data) ?? '').slice(0, 200);
      console.error(`[Dutchie Client] HTTP ${response.status}: ${bodyPreview}`);
      lastError = new Error(`HTTP ${response.status}`);
    }

    attempt++;
    // No point in waiting once the attempts are used up.
    if (attempt <= maxRetries) {
      await sleep(1000 * attempt);
    }
  }

  throw lastError || new Error('Max retries exceeded');
}

// ============================================================
// HTML PAGE FETCHING
// ============================================================

export interface FetchPageOptions {
  maxRetries?: number;
  retryOn403?: boolean;
}

/**
 * Fetch HTML page from Dutchie (for city pages, dispensary pages, etc.)
 * Returns raw HTML string
 *
 * Retries follow the same schedule as executeGraphQL.
 *
 * @param path Path appended to DUTCHIE_CONFIG.baseUrl.
 * @returns The body and status of the first HTTP 200, or `null` when every
 *          attempt failed.
 */
export async function fetchPage(
  path: string,
  options: FetchPageOptions = {}
): Promise<{ html: string; status: number } | null> {
  const { maxRetries = 3, retryOn403 = true } = options;
  const url = `${DUTCHIE_CONFIG.baseUrl}${path}`;

  let attempt = 0;

  while (attempt <= maxRetries) {
    const fingerprint = getFingerprint();
    const headers: Record<string, string> = {
      'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
      'accept-language': fingerprint.acceptLanguage,
      'user-agent': fingerprint.userAgent,
    };

    if (fingerprint.secChUa) {
      headers['sec-ch-ua'] = fingerprint.secChUa;
      headers['sec-ch-ua-mobile'] = fingerprint.secChUaMobile || '?0';
      headers['sec-ch-ua-platform'] = fingerprint.secChUaPlatform || '"Windows"';
      headers['sec-fetch-dest'] = 'document';
      headers['sec-fetch-mode'] = 'navigate';
      headers['sec-fetch-site'] = 'none';
      headers['sec-fetch-user'] = '?1';
      headers['upgrade-insecure-requests'] = '1';
    }

    console.log(`[Dutchie Client] curl GET ${path} (attempt ${attempt + 1}/${maxRetries + 1})`);

    const response = curlGet(url, headers, DUTCHIE_CONFIG.timeout);

    console.log(`[Dutchie Client] Response status: ${response.status}`);

    if (!response.error && response.status === 200) {
      // curlGet JSON-parses bodies that happen to be valid JSON; the contract here is a string.
      const html =
        typeof response.data === 'string' ? response.data : JSON.stringify(response.data) ?? '';
      return { html, status: response.status };
    }

    if (response.error) {
      console.error(`[Dutchie Client] curl error: ${response.error}`);
    } else if (response.status === 403 && retryOn403) {
      console.warn(`[Dutchie Client] 403 blocked - rotating proxy and fingerprint...`);
      await rotateProxyOn403('403 Forbidden on page fetch');
      rotateFingerprint();
    } else {
      console.error(`[Dutchie Client] HTTP ${response.status}`);
    }

    attempt++;
    // No point in waiting once the attempts are used up.
    if (attempt <= maxRetries) {
      await sleep(1000 * attempt);
    }
  }

  return null;
}

// NOTE(review): the visible source is cut off inside the next definition (its
// regex literal is truncated). The surviving text is preserved verbatim below
// and deliberately not completed by guesswork - restore the full function from
// version control. `sleep`, which is called above, is not defined in the
// visible source either; presumably it is declared further down - verify.
//
//   /** Extract __NEXT_DATA__ from HTML page */
//   export function extractNextData(html: string): any | null {
//     const match = html.match(/