/**
 * ============================================================
 * DUTCHIE PLATFORM CLIENT - LOCKED MODULE
 * ============================================================
 *
 * DO NOT MODIFY THIS FILE WITHOUT EXPLICIT AUTHORIZATION.
 *
 * Updated: 2025-12-10 per workflow-12102025.md
 *
 * KEY BEHAVIORS (per workflow-12102025.md):
 * 1. startSession() gets identity from PROXY LOCATION, not task params
 * 2. On 403: immediately get new IP + new fingerprint, then retry
 * 3. After 3 consecutive 403s on same proxy → disable it (burned)
 * 4. Language is always English (en-US)
 *
 * IMPLEMENTATION:
 * - Uses curl via child_process.execSync (bypasses TLS fingerprinting)
 * - NO Puppeteer, NO axios, NO fetch
 * - Uses intoli/user-agents via CrawlRotator for realistic fingerprints
 * - Residential IP compatible
 *
 * USAGE:
 *   import { curlPost, curlGet, executeGraphQL, startSession } from '@dutchie/client';
 *
 * ============================================================
 */

import { execSync } from 'child_process';
import {
  buildOrderedHeaders,
  buildRefererFromMenuUrl,
  getCurlBinary,
  isCurlImpersonateAvailable,
  HeaderContext,
  BrowserType,
} from '../../services/http-fingerprint';

// ============================================================
// TYPES
// ============================================================

/**
 * Result of a single curl invocation.
 * `status` is the HTTP status code (0 when curl itself failed), `data` is the
 * parsed JSON body or the raw text when the body is not JSON, and `error`
 * carries the failure message when the curl process could not complete.
 */
export interface CurlResponse {
  status: number;
  data: any;
  error?: string;
}

// Per workflow-12102025.md: fingerprint comes from CrawlRotator's BrowserFingerprint
// We keep a simplified interface here for header building
export interface Fingerprint {
  userAgent: string;
  acceptLanguage: string;
  secChUa?: string;
  secChUaPlatform?: string;
  secChUaMobile?: string;
}

// ============================================================
// CONFIGURATION
// ============================================================

/** Endpoints, timeouts (milliseconds) and paging limits shared by all Dutchie requests. */
export const DUTCHIE_CONFIG = {
  graphqlEndpoint: 'https://dutchie.com/api-3/graphql',
  baseUrl: 'https://dutchie.com',
  timeout: 30000,
  maxRetries: 3,
  perPage: 100,
  maxPages: 200,
  pageDelayMs: 500,
  modeDelayMs: 2000,
};

// ============================================================
// PROXY SUPPORT
// Per workflow-12102025.md:
// - On 403: recordBlock() → increment consecutive_403_count
// - After 3 consecutive 403s → proxy disabled
// - Immediately rotate to new IP + new fingerprint on 403
// ============================================================

import type { CrawlRotator, BrowserFingerprint } from '../../services/crawl-rotator';

let currentProxy: string | null = null;
let crawlRotator: CrawlRotator | null = null;

/**
 * Hide the password portion of a proxy URL before it is written to the log.
 * Replaces the `:<password>@` segment with `:***@`.
 */
function maskProxyCredentials(proxyUrl: string): string {
  return proxyUrl.replace(/:[^:@]+@/, ':***@');
}

/**
 * Set proxy for all Dutchie requests
 * Format: http://user:pass@host:port or socks5://host:port
 *
 * @param proxy - Proxy URL, or `null` to use a direct connection
 */
export function setProxy(proxy: string | null): void {
  currentProxy = proxy;
  if (proxy) {
    console.log(`[Dutchie Client] Proxy set: ${maskProxyCredentials(proxy)}`);
  } else {
    console.log('[Dutchie Client] Proxy disabled (direct connection)');
  }
}

/**
 * Get current proxy URL
 *
 * @returns The proxy URL in use, or `null` for a direct connection
 */
export function getProxy(): string | null {
  return currentProxy;
}

/**
 * Set CrawlRotator for proxy rotation on 403s
 * Per workflow-12102025.md: enables automatic rotation when blocked
 *
 * When a rotator is attached and it already has a current proxy, that proxy
 * becomes the active proxy for subsequent requests.
 *
 * @param rotator - Rotator to attach, or `null` to detach
 */
export function setCrawlRotator(rotator: CrawlRotator | null): void {
  crawlRotator = rotator;
  if (!rotator) {
    return;
  }

  console.log('[Dutchie Client] CrawlRotator attached - proxy rotation enabled');
  const proxy = rotator.proxy.getCurrent();
  if (proxy) {
    const proxyUrl: string = rotator.proxy.getProxyUrl(proxy);
    currentProxy = proxyUrl;
    console.log(`[Dutchie Client] Initial proxy: ${maskProxyCredentials(proxyUrl)}`);
  }
}

/**
 * Get attached CrawlRotator
 *
 * @returns The attached rotator, or `null` when none is attached
 */
export function getCrawlRotator(): CrawlRotator | null {
  return crawlRotator;
}

/**
 * Handle 403 block - per workflow-12102025.md:
 * 1. Record block on current proxy (increments consecutive_403_count)
 * 2. Immediately rotate to new proxy (new IP)
 * 3. Rotate fingerprint
 *
 * @returns `true` when a new proxy was selected; `false` if no rotator is
 *          attached or no more proxies are available
 */
async function handle403Block(): Promise<boolean> {
  if (!crawlRotator) {
    console.warn('[Dutchie Client] No CrawlRotator - cannot handle 403');
    return false;
  }

  // Per workflow-12102025.md: record block (tracks consecutive 403s)
  const wasDisabled = await crawlRotator.recordBlock();
  if (wasDisabled) {
    console.log('[Dutchie Client] Current proxy was disabled (3 consecutive 403s)');
  }

  // Per workflow-12102025.md: immediately get new IP + new fingerprint
  const { proxy: nextProxy, fingerprint } = crawlRotator.rotateBoth();
  if (!nextProxy) {
    console.error('[Dutchie Client] No more proxies available!');
    return false;
  }

  const proxyUrl: string = crawlRotator.proxy.getProxyUrl(nextProxy);
  currentProxy = proxyUrl;
  console.log(`[Dutchie Client] Rotated to new proxy: ${maskProxyCredentials(proxyUrl)}`);
  console.log(`[Dutchie Client] New fingerprint: ${fingerprint.userAgent.slice(0, 50)}...`);
  return true;
}

/**
 * Record success on current proxy
 * Per workflow-12102025.md: resets consecutive_403_count
 *
 * No-op when no rotator is attached.
 *
 * @param responseTimeMs - Request duration forwarded to the rotator
 */
async function recordProxySuccess(responseTimeMs?: number): Promise<void> {
  if (crawlRotator) {
    await crawlRotator.recordSuccess(responseTimeMs);
  }
}

/**
 * Build curl proxy argument
 *
 * @returns `--proxy '<url>'` for the active proxy, or an empty string when no
 *          proxy is set
 */
function getProxyArg(): string {
  if (!currentProxy) return '';
  // The value is placed inside a single-quoted shell word, so any single quote
  // in the URL (e.g. in the password) must be closed, escaped and reopened.
  const quotedProxy = currentProxy.replace(/'/g, "'\\''");
  return `--proxy '${quotedProxy}'`;
}

/** Persisted-query SHA-256 hashes, keyed by GraphQL operation name. */
export const GRAPHQL_HASHES = {
  FilteredProducts: 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0',
  GetAddressBasedDispensaryData: '13461f73abf7268770dfd05fe7e10c523084b2bb916a929c08efe3d87531977b',
  ConsumerDispensaries: '0a5bfa6ca1d64ae47bcccb7c8077c87147cbc4e6982c17ceec97a2a4948b311b',
  // NOTE(review): identical to the GetAddressBasedDispensaryData hash — confirm this is intentional.
  DispensaryInfo: '13461f73abf7268770dfd05fe7e10c523084b2bb916a929c08efe3d87531977b',
  GetAllCitiesByState: 'ae547a0466ace5a48f91e55bf6699eacd87e3a42841560f0c0eabed5a0a920e6',
};

// ============================================================
// SESSION MANAGEMENT
// Per workflow-12102025.md:
// - Session identity comes from PROXY LOCATION
// - NOT from task params (no
// stateCode/timezone params)
// - Language is always English
// ============================================================

/**
 * State of the active crawl session: the fingerprint and proxy identity used
 * for requests, plus the Referer derived from the dispensary's menu URL.
 */
export interface CrawlSession {
  sessionId: string;
  fingerprint: BrowserFingerprint;
  proxyUrl: string | null;
  proxyTimezone?: string;
  proxyState?: string;
  startedAt: Date;
  // Per workflow-12102025.md: Dynamic Referer per dispensary
  menuUrl?: string;
  referer: string;
}

let currentSession: CrawlSession | null = null;

/**
 * Start a new crawl session
 *
 * Per workflow-12102025.md:
 * - NO state/timezone params - identity comes from proxy location
 * - Gets fingerprint from CrawlRotator (uses intoli/user-agents)
 * - Language is always English (en-US)
 * - Dynamic Referer per dispensary (from menuUrl)
 *
 * @param menuUrl - The dispensary's menu URL for dynamic Referer header
 * @returns The newly created session, which also becomes the current session
 * @throws Error when no CrawlRotator has been attached
 */
export function startSession(menuUrl?: string): CrawlSession {
  if (!crawlRotator) {
    throw new Error('[Dutchie Client] Cannot start session without CrawlRotator');
  }

  // Per workflow-12102025.md: get identity from proxy location
  const proxyLocation = crawlRotator.getProxyLocation();
  const fingerprint = crawlRotator.userAgent.getCurrent();

  // Per workflow-12102025.md: Dynamic Referer per dispensary
  const referer = buildRefererFromMenuUrl(menuUrl);

  const session: CrawlSession = {
    sessionId: `session_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`,
    fingerprint,
    proxyUrl: currentProxy,
    proxyTimezone: proxyLocation?.timezone,
    proxyState: proxyLocation?.state,
    startedAt: new Date(),
    menuUrl,
    referer,
  };
  currentSession = session;

  console.log(`[Dutchie Client] Started session ${session.sessionId}`);
  console.log(`[Dutchie Client] Browser: ${fingerprint.browserName} (${fingerprint.deviceCategory})`);
  console.log(`[Dutchie Client] DNT: ${fingerprint.httpFingerprint.hasDNT ? 'enabled' : 'disabled'}`);
  console.log(`[Dutchie Client] TLS: ${fingerprint.httpFingerprint.curlImpersonateBinary}`);
  console.log(`[Dutchie Client] Referer: ${referer}`);
  if (proxyLocation?.timezone) {
    console.log(`[Dutchie Client] Proxy: ${proxyLocation.state || 'unknown'} (${proxyLocation.timezone})`);
  }

  return session;
}

/**
 * End the current crawl session
 *
 * Logs the session duration in seconds and clears the current session.
 * No-op when no session is active.
 */
export function endSession(): void {
  if (!currentSession) {
    return;
  }
  const duration = Math.round((Date.now() - currentSession.startedAt.getTime()) / 1000);
  console.log(`[Dutchie Client] Ended session ${currentSession.sessionId} (${duration}s)`);
  currentSession = null;
}

/**
 * Get current active session
 *
 * @returns The active session, or `null` when none has been started
 */
export function getCurrentSession(): CrawlSession | null {
  return currentSession;
}

/**
 * Bring the given session in line with the rotator after a 403 rotation so the
 * retry is sent with the new identity rather than the one that was blocked.
 */
function refreshSessionAfterRotation(session: CrawlSession): void {
  if (crawlRotator) {
    // Assumes rotateBoth() advanced the rotator's current fingerprint, so
    // getCurrent() now returns the rotated one — confirm against CrawlRotator.
    session.fingerprint = crawlRotator.userAgent.getCurrent();
  }
  session.proxyUrl = currentProxy;
  // Per workflow-12102025.md: Update session referer after rotation
  session.referer = buildRefererFromMenuUrl(session.menuUrl);
}

// ============================================================
// CURL HTTP CLIENT
// ============================================================

/** Marker written by curl's `-w` option so the status code can be split from the body. */
const CURL_STATUS_SEPARATOR = '___HTTP_STATUS___';

/** Wrap a value as a single-quoted shell word, escaping embedded single quotes. */
function shellQuote(value: string): string {
  return `'${value.replace(/'/g, "'\\''")}'`;
}

/** Render `-H` arguments in the given header order, skipping headers curl sets itself. */
function buildHeaderArgs(headers: Record<string, string>, orderedHeaders: string[]): string {
  return orderedHeaders
    .filter((name) => name !== 'Host' && name !== 'Content-Length') // curl handles these
    .map((name) => `-H ${shellQuote(`${name}: ${headers[name]}`)}`)
    .join(' ');
}

/** Split raw curl output into status code and body; the body is JSON-parsed when possible. */
function parseCurlOutput(output: string): CurlResponse {
  const separatorIndex = output.lastIndexOf(CURL_STATUS_SEPARATOR);

  let rawStatus: string;
  let rawBody: string;
  if (separatorIndex === -1) {
    // Marker missing: fall back to treating the last line as the status code.
    const lines = output.trim().split('\n');
    rawStatus = lines.pop() || '0';
    rawBody = lines.join('\n');
  } else {
    rawBody = output.slice(0, separatorIndex);
    rawStatus = output.slice(separatorIndex + CURL_STATUS_SEPARATOR.length).trim();
  }

  // An unparseable status is reported as 0 rather than NaN.
  const parsedStatus = parseInt(rawStatus, 10);
  const status = Number.isNaN(parsedStatus) ? 0 : parsedStatus;

  try {
    return { status, data: JSON.parse(rawBody) };
  } catch {
    // Not JSON (e.g. an HTML page): hand back the raw text.
    return { status, data: rawBody };
  }
}

/** Run a prepared curl command line and convert its output or failure into a CurlResponse. */
function runCurl(cmd: string, timeout: number): CurlResponse {
  try {
    // The process timeout is 5s longer than curl's own --max-time so curl can report the timeout itself.
    const output = execSync(cmd, {
      encoding: 'utf-8',
      maxBuffer: 10 * 1024 * 1024,
      timeout: timeout + 5000,
    });
    return parseCurlOutput(output);
  } catch (error: unknown) {
    // NOTE(review): execSync failure messages appear to embed the full command line,
    // which includes the proxy URL with credentials — confirm before logging downstream.
    const message = error instanceof Error ? error.message : '';
    return { status: 0, data: null, error: message || 'curl request failed' };
  }
}

/**
 * Per workflow-12102025.md: Build headers using HTTP fingerprint system
 * Returns headers in browser-specific order with all natural variations
 *
 * @param isPost - Whether the headers are for a POST request
 * @param contentLength - Body size in bytes, forwarded to the header builder
 * @returns Header values keyed by name, plus the order in which to send them
 * @throws Error when there is no active session or no CrawlRotator
 */
export function buildHeaders(
  isPost: boolean,
  contentLength?: number
): { headers: Record<string, string>; orderedHeaders: string[] } {
  if (!currentSession || !crawlRotator) {
    throw new Error('[Dutchie Client] Cannot build headers without active session');
  }

  const fp = currentSession.fingerprint;
  const httpFp = fp.httpFingerprint;

  // Per workflow-12102025.md: Build context for ordered headers
  const context: HeaderContext = {
    userAgent: fp.userAgent,
    secChUa: fp.secChUa,
    secChUaPlatform: fp.secChUaPlatform,
    secChUaMobile: fp.secChUaMobile,
    referer: currentSession.referer,
    isPost,
    contentLength,
  };

  // Per workflow-12102025.md: Get ordered headers from HTTP fingerprint service
  return buildOrderedHeaders(httpFp, context);
}

/**
 * Per workflow-12102025.md: Get curl binary for current session's browser
 * Uses curl-impersonate for TLS fingerprint matching
 *
 * @returns The curl-impersonate binary for the session's browser when it is
 *          available, otherwise plain `curl`
 */
function getCurlBinaryForSession(): string {
  if (!currentSession) {
    return 'curl'; // Fallback to standard curl
  }

  const browserType = currentSession.fingerprint.browserName as BrowserType;

  // Per workflow-12102025.md: Check if curl-impersonate is available
  if (isCurlImpersonateAvailable(browserType)) {
    return getCurlBinary(browserType);
  }

  // Fallback to standard curl with warning
  console.warn(`[Dutchie Client] curl-impersonate not available for ${browserType}, using standard curl`);
  return 'curl';
}

/**
 * Per workflow-12102025.md: Execute HTTP POST using curl/curl-impersonate
 * - Uses browser-specific TLS fingerprint via curl-impersonate
 * - Headers sent in browser-specific order
 * - Dynamic Referer per dispensary
 *
 * @param url - Absolute URL to post to
 * @param body - Value serialised with JSON.stringify and sent as the request body
 * @param timeout - Request timeout in milliseconds
 * @returns Status and parsed body; `status` is 0 and `error` is set when curl fails
 * @throws Error when there is no active session (raised while building headers)
 */
export function curlPost(url: string, body: any, timeout = 30000): CurlResponse {
  const bodyJson = JSON.stringify(body);

  // Per workflow-12102025.md: Build ordered headers for POST request
  // Content length is measured in bytes, not UTF-16 code units.
  const { headers, orderedHeaders } = buildHeaders(true, Buffer.byteLength(bodyJson, 'utf8'));

  // Per workflow-12102025.md: Build header args in browser-specific order
  const headerArgs = buildHeaderArgs(headers, orderedHeaders);
  const timeoutSec = Math.ceil(timeout / 1000);
  const proxyArg = getProxyArg();

  // Per workflow-12102025.md: Use curl-impersonate for TLS fingerprint matching
  const curlBinary = getCurlBinaryForSession();

  const cmd = `${curlBinary} -s --compressed ${proxyArg} -w '${CURL_STATUS_SEPARATOR}%{http_code}' --max-time ${timeoutSec} ${headerArgs} -d ${shellQuote(bodyJson)} ${shellQuote(url)}`;

  return runCurl(cmd, timeout);
}

/**
 * Per workflow-12102025.md: Execute HTTP GET using curl/curl-impersonate
 * - Uses browser-specific TLS fingerprint via curl-impersonate
 * - Headers sent in browser-specific order
 * - Dynamic Referer per dispensary
 *
 * @param url - Absolute URL to fetch
 * @param timeout - Request timeout in milliseconds
 * @returns Status and body (parsed as JSON when possible, raw text otherwise);
 *          `status` is 0 and `error` is set when curl fails
 * @throws Error when there is no active session (raised while building headers)
 */
export function curlGet(url: string, timeout = 30000): CurlResponse {
  // Per workflow-12102025.md: Build ordered headers for GET request
  const { headers, orderedHeaders } = buildHeaders(false);

  // Per workflow-12102025.md: Build header args in browser-specific order
  const headerArgs = buildHeaderArgs(headers, orderedHeaders);
  const timeoutSec = Math.ceil(timeout / 1000);
  const proxyArg = getProxyArg();

  // Per workflow-12102025.md: Use curl-impersonate for TLS fingerprint matching
  const curlBinary = getCurlBinaryForSession();

  const cmd = `${curlBinary} -s --compressed ${proxyArg} -w '${CURL_STATUS_SEPARATOR}%{http_code}' --max-time ${timeoutSec} ${headerArgs} ${shellQuote(url)}`;

  return runCurl(cmd, timeout);
}

// ============================================================
// GRAPHQL EXECUTION
// Per workflow-12102025.md:
// - On 403: immediately rotate IP + fingerprint (no delay first)
// - Then retry
// ============================================================

/** Retry controls for executeGraphQL. */
export interface ExecuteGraphQLOptions {
  maxRetries?: number;
  retryOn403?: boolean;
  cName?: string;
}

/**
 * Per workflow-12102025.md: Execute GraphQL query with curl/curl-impersonate
 * - Uses browser-specific TLS fingerprint
 * - Headers in browser-specific order
 * - On 403: immediately rotate IP + fingerprint, then retry
 *
 * @param operationName - GraphQL operation name
 * @param variables - Variables object sent with the operation
 * @param hash - SHA-256 hash of the persisted query
 * @param options - Retry controls; defaults to 3 retries with 403 rotation enabled
 * @returns The response body of the first HTTP 200 reply (GraphQL-level errors are
 *          logged but still returned)
 * @throws Error when no session is active, when all proxies are exhausted, or when
 *         every attempt fails
 */
export async function executeGraphQL(
  operationName: string,
  variables: any,
  hash: string,
  options: ExecuteGraphQLOptions = {}
): Promise<any> {
  const { maxRetries = 3, retryOn403 = true } = options;

  // Per workflow-12102025.md: Session must be active for requests
  if (!currentSession) {
    throw new Error('[Dutchie Client] Cannot execute GraphQL without active session - call startSession() first');
  }

  const body = {
    operationName,
    variables,
    extensions: {
      persistedQuery: { version: 1, sha256Hash: hash },
    },
  };

  let lastError: Error | null = null;
  let attempt = 0;

  while (attempt <= maxRetries) {
    console.log(`[Dutchie Client] curl POST ${operationName} (attempt ${attempt + 1}/${maxRetries + 1})`);

    const startTime = Date.now();
    // Per workflow-12102025.md: curlPost now uses ordered headers and curl-impersonate
    const response = curlPost(DUTCHIE_CONFIG.graphqlEndpoint, body, DUTCHIE_CONFIG.timeout);
    const responseTime = Date.now() - startTime;

    console.log(`[Dutchie Client] Response status: ${response.status} (${responseTime}ms)`);

    if (response.error) {
      console.error(`[Dutchie Client] curl error: ${response.error}`);
      lastError = new Error(response.error);
      attempt++;
      if (attempt <= maxRetries) {
        // Linear backoff between transport-level failures.
        await sleep(1000 * attempt);
      }
      continue;
    }

    if (response.status === 200) {
      // Per workflow-12102025.md: success resets consecutive 403 count
      await recordProxySuccess(responseTime);

      if (response.data?.errors?.length > 0) {
        console.warn(`[Dutchie Client] GraphQL errors: ${JSON.stringify(response.data.errors[0])}`);
      }
      return response.data;
    }

    if (response.status === 403 && retryOn403) {
      // Per workflow-12102025.md: immediately rotate IP + fingerprint
      console.warn(`[Dutchie Client] 403 blocked - immediately rotating proxy + fingerprint...`);
      const hasMoreProxies = await handle403Block();

      if (!hasMoreProxies) {
        throw new Error('All proxies exhausted - no more IPs available');
      }

      // The session may have been ended while awaiting; only refresh a live one.
      if (currentSession) {
        refreshSessionAfterRotation(currentSession);
      }

      attempt++;
      // Per workflow-12102025.md: small backoff after rotation
      await sleep(500);
      continue;
    }

    const bodyPreview =
      typeof response.data === 'string'
        ? response.data.slice(0, 200)
        : JSON.stringify(response.data).slice(0, 200);
    console.error(`[Dutchie Client] HTTP ${response.status}: ${bodyPreview}`);
    lastError = new Error(`HTTP ${response.status}`);

    attempt++;
    if (attempt <= maxRetries) {
      await sleep(1000 * attempt);
    }
  }

  throw lastError || new Error('Max retries exceeded');
}

// ============================================================
// HTML PAGE FETCHING
// ============================================================

/** Retry controls for fetchPage. */
export interface FetchPageOptions {
  maxRetries?: number;
  retryOn403?: boolean;
}

/**
 * Per workflow-12102025.md: Fetch HTML page from Dutchie
 * - Uses browser-specific TLS fingerprint
 * - Headers in browser-specific order
 * - Same 403 handling as GraphQL
 *
 * @param path - Path appended to DUTCHIE_CONFIG.baseUrl
 * @param options - Retry controls; defaults to 3 retries with 403 rotation enabled
 * @returns The page body and status of the first HTTP 200 reply, or `null` when
 *          every attempt fails
 * @throws Error when no session is active or when all proxies are exhausted
 */
export async function fetchPage(
  path: string,
  options: FetchPageOptions = {}
): Promise<{ html: string; status: number } | null> {
  const { maxRetries = 3, retryOn403 = true } = options;
  const url = `${DUTCHIE_CONFIG.baseUrl}${path}`;

  // Per workflow-12102025.md: Session must be active for requests
  if (!currentSession) {
    throw new Error('[Dutchie Client] Cannot fetch page without active session - call startSession() first');
  }

  let attempt = 0;

  while (attempt <= maxRetries) {
    // Per workflow-12102025.md: curlGet now uses ordered headers and curl-impersonate
    console.log(`[Dutchie Client] curl GET ${path} (attempt ${attempt + 1}/${maxRetries + 1})`);

    const startTime = Date.now();
    const response = curlGet(url, DUTCHIE_CONFIG.timeout);
    const responseTime = Date.now() - startTime;

    console.log(`[Dutchie Client] Response status: ${response.status} (${responseTime}ms)`);

    if (response.error) {
      console.error(`[Dutchie Client] curl error: ${response.error}`);
      attempt++;
      if (attempt <= maxRetries) {
        // Linear backoff between transport-level failures.
        await sleep(1000 * attempt);
      }
      continue;
    }

    if (response.status === 200) {
      // Per workflow-12102025.md: success resets consecutive 403 count
      await recordProxySuccess(responseTime);
      return { html: response.data, status: response.status };
    }

    if (response.status === 403 && retryOn403) {
      // Per workflow-12102025.md: immediately rotate IP + fingerprint
      console.warn(`[Dutchie Client] 403 blocked - immediately rotating proxy + fingerprint...`);
      const hasMoreProxies = await handle403Block();

      if (!hasMoreProxies) {
        throw new Error('All proxies exhausted - no more IPs available');
      }

      // The session may have been ended while awaiting; only refresh a live one.
      if (currentSession) {
        refreshSessionAfterRotation(currentSession);
      }

      attempt++;
      // Per workflow-12102025.md: small backoff after rotation
      await sleep(500);
      continue;
    }

    console.error(`[Dutchie Client] HTTP ${response.status}`);
    attempt++;
    if (attempt <= maxRetries) {
      await sleep(1000 * attempt);
    }
  }

  return null;
}

/**
 * Extract __NEXT_DATA__ from HTML page
 */
export function extractNextData(html: string): any | null {
  const match = html.match(/