Files
cannaiq/backend/src/platforms/dutchie/client.ts
Kelly 4e84f30f8b feat: Auto-retry tasks, 403 proxy rotation, task deletion
- Fix 403 handler to rotate BOTH proxy and fingerprint (was only fingerprint)
- Add auto-retry logic to task service (retry up to max_retries before failing)
- Add error tooltip on task status badge showing retry count and error message
- Add DELETE /api/tasks/:id endpoint (only for non-running tasks)
- Add delete button to JobQueue task table

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-10 08:41:14 -07:00

662 lines
20 KiB
TypeScript

/**
* ============================================================
* DUTCHIE PLATFORM CLIENT - LOCKED MODULE
* ============================================================
*
* DO NOT MODIFY THIS FILE WITHOUT EXPLICIT AUTHORIZATION.
*
* This is the canonical HTTP client for all Dutchie communication.
* All Dutchie workers (Alice, Bella, etc.) MUST use this client.
*
* IMPLEMENTATION:
* - Uses curl via child_process.execSync (bypasses TLS fingerprinting)
* - NO Puppeteer, NO axios, NO fetch
* - Proxy and fingerprint rotation on 403
* - Residential IP compatible
*
* USAGE:
* import { curlPost, curlGet, executeGraphQL } from '@dutchie/client';
*
* ============================================================
*/
import { execSync } from 'child_process';
// ============================================================
// TYPES
// ============================================================
/**
* Result of one curl invocation made by curlPost / curlGet.
*/
export interface CurlResponse {
// HTTP status code parsed from curl's output; the curl helpers in this file report 0 when the command itself failed.
status: number;
// Response body: the JSON.parse result when the body parses as JSON, otherwise the raw text; null when the command failed.
data: any;
// Present only when running the curl command threw (non-zero exit, timeout, ...).
error?: string;
}
/**
* Browser identity that is turned into request headers by buildHeaders / fetchPage.
*/
export interface Fingerprint {
// Sent as the `user-agent` header.
userAgent: string;
// Sent as the `accept-language` header.
acceptLanguage: string;
// Sent as `sec-ch-ua`. When absent, the header builders in this file omit every sec-ch-ua-* and sec-fetch-* header.
secChUa?: string;
// Sent as `sec-ch-ua-platform`; the header builders fall back to '"Windows"' when missing.
secChUaPlatform?: string;
// Sent as `sec-ch-ua-mobile`; the header builders fall back to '?0' when missing.
secChUaMobile?: string;
}
// ============================================================
// CONFIGURATION
// ============================================================
/**
* Shared settings for Dutchie requests.
* NOTE(review): perPage, maxPages, pageDelayMs and modeDelayMs are not read anywhere in the
* visible part of this file - presumably consumed by callers; confirm before changing them.
*/
export const DUTCHIE_CONFIG = {
// GraphQL endpoint that executeGraphQL posts to.
graphqlEndpoint: 'https://dutchie.com/api-3/graphql',
// Origin that fetchPage prefixes to the requested path.
baseUrl: 'https://dutchie.com',
// Per-request timeout in milliseconds, handed to curlPost / curlGet.
timeout: 30000,
// Retry budget; the request helpers in this file default to the same value.
maxRetries: 3,
// Presumably the page size for paginated queries - TODO confirm at call sites.
perPage: 100,
// Presumably an upper bound on pages fetched per crawl - TODO confirm at call sites.
maxPages: 200,
// Presumably the pause between pages, in milliseconds - TODO confirm at call sites.
pageDelayMs: 500,
// Presumably the pause between crawl modes, in milliseconds - TODO confirm at call sites.
modeDelayMs: 2000,
};
// ============================================================
// PROXY SUPPORT
// ============================================================
// Integrates with the CrawlRotator system from proxy-rotator.ts
// On 403 errors:
// 1. Record failure on current proxy
// 2. Rotate to next proxy
// 3. Retry with new proxy
// ============================================================
import type { CrawlRotator, Proxy } from '../../services/crawl-rotator';
// Proxy URL applied to every curl call made by this module; when null no --proxy argument is added.
let currentProxy: string | null = null;
// Rotator consulted after a 403 to pick a replacement proxy; when null, proxy rotation is skipped.
let crawlRotator: CrawlRotator | null = null;
/**
 * Replace the proxy used by every subsequent Dutchie request.
 *
 * Accepted formats: http://user:pass@host:port or socks5://host:port.
 * Passing null switches back to a direct connection.
 *
 * @param proxy - Proxy URL, or null to disable proxying.
 */
export function setProxy(proxy: string | null): void {
  currentProxy = proxy;

  if (!proxy) {
    console.log('[Dutchie Client] Proxy disabled (direct connection)');
    return;
  }

  // Hide the password portion of the URL before it reaches the log.
  const redacted = proxy.replace(/:[^:@]+@/, ':***@');
  console.log(`[Dutchie Client] Proxy set: ${redacted}`);
}
/**
* Get current proxy URL
*
* @returns The proxy URL currently applied to curl calls, or null when none is set.
*/
export function getProxy(): string | null {
return currentProxy;
}
/**
 * Attach (or detach, with null) the CrawlRotator used for proxy rotation on 403s.
 *
 * When a rotator is attached and it already has a current proxy, that proxy
 * becomes the active one for this client. Detaching leaves the active proxy
 * untouched.
 *
 * @param rotator - Rotator to attach, or null to detach.
 */
export function setCrawlRotator(rotator: CrawlRotator | null): void {
  crawlRotator = rotator;
  if (!rotator) {
    return;
  }

  console.log('[Dutchie Client] CrawlRotator attached - proxy rotation enabled');

  // Seed the active proxy from whatever the rotator currently points at.
  const initial = rotator.proxy.getCurrent();
  if (!initial) {
    return;
  }

  currentProxy = rotator.proxy.getProxyUrl(initial);
  console.log(`[Dutchie Client] Initial proxy: ${currentProxy.replace(/:[^:@]+@/, ':***@')}`);
}
/**
* Get attached CrawlRotator
*
* @returns The rotator passed to setCrawlRotator, or null when none is attached.
*/
export function getCrawlRotator(): CrawlRotator | null {
return crawlRotator;
}
/**
 * React to a block: report the failure to the rotator and switch to its next proxy.
 *
 * @param error - Failure description recorded on the rotator; defaults to '403 Forbidden'.
 * @returns true when a new proxy became active, false when no rotator is attached
 *          or the rotator had no further proxy to offer.
 */
async function rotateProxyOn403(error?: string): Promise<boolean> {
  if (!crawlRotator) {
    return false;
  }

  // The failure is recorded first so it is attributed before the rotator moves on.
  await crawlRotator.recordFailure(error || '403 Forbidden');

  const candidate = crawlRotator.rotateProxy();
  if (!candidate) {
    console.warn('[Dutchie Client] No more proxies available');
    return false;
  }

  currentProxy = crawlRotator.proxy.getProxyUrl(candidate);
  console.log(`[Dutchie Client] Rotated proxy: ${currentProxy.replace(/:[^:@]+@/, ':***@')}`);
  return true;
}
/**
 * Forward a successful request to the attached rotator; does nothing without one.
 *
 * NOTE(review): no caller of this helper is visible in this file - confirm whether
 * success reporting is expected to happen here or in the workers.
 *
 * @param responseTimeMs - Optional request duration handed to the rotator.
 */
async function recordProxySuccess(responseTimeMs?: number): Promise<void> {
  if (!crawlRotator) {
    return;
  }
  await crawlRotator.recordSuccess(responseTimeMs);
}
/**
 * Build the `--proxy` fragment that curlPost / curlGet splice into their shell command.
 *
 * The proxy URL is emitted as one single-quoted shell word. Embedded single quotes
 * (possible in a password) are escaped with the POSIX `'\''` sequence so the URL
 * cannot terminate the quoting and alter the command line.
 *
 * @returns `--proxy '<url>'`, or an empty string when no proxy is configured.
 */
function getProxyArg(): string {
  if (!currentProxy) return '';
  const escaped = currentProxy.replace(/'/g, "'\\''");
  return `--proxy '${escaped}'`;
}
/**
* SHA-256 hashes keyed by GraphQL operation name.
* Presumably passed as the `hash` argument of executeGraphQL (which sends it as
* `extensions.persistedQuery.sha256Hash`) - confirm at call sites.
*
* NOTE(review): GetAddressBasedDispensaryData and DispensaryInfo carry the same hash.
* Confirm this is intentional (an alias) and not a copy/paste slip.
*/
export const GRAPHQL_HASHES = {
FilteredProducts: 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0',
GetAddressBasedDispensaryData: '13461f73abf7268770dfd05fe7e10c523084b2bb916a929c08efe3d87531977b',
ConsumerDispensaries: '0a5bfa6ca1d64ae47bcccb7c8077c87147cbc4e6982c17ceec97a2a4948b311b',
DispensaryInfo: '13461f73abf7268770dfd05fe7e10c523084b2bb916a929c08efe3d87531977b',
GetAllCitiesByState: 'ae547a0466ace5a48f91e55bf6699eacd87e3a42841560f0c0eabed5a0a920e6',
};
// ============================================================
// FINGERPRINTS - Browser profiles for anti-detect
// ============================================================
/**
 * Pool of browser profiles. rotateFingerprint walks this list in order and
 * getFingerprint falls back to it when no session is active.
 */
const FINGERPRINTS: Fingerprint[] = [
  // Chrome Windows (latest) - typical residential user, use first
  {
    userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
    acceptLanguage: 'en-US,en;q=0.9',
    secChUa: '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
    secChUaPlatform: '"Windows"',
    secChUaMobile: '?0',
  },
  // Chrome Mac (latest)
  {
    userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
    acceptLanguage: 'en-US,en;q=0.9',
    secChUa: '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
    secChUaPlatform: '"macOS"',
    secChUaMobile: '?0',
  },
  // Chrome Windows (120)
  {
    userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    acceptLanguage: 'en-US,en;q=0.9',
    secChUa: '"Chromium";v="120", "Google Chrome";v="120", "Not-A.Brand";v="99"',
    secChUaPlatform: '"Windows"',
    secChUaMobile: '?0',
  },
  // Firefox Windows
  {
    userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0',
    acceptLanguage: 'en-US,en;q=0.5',
  },
  // Safari Mac
  {
    userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
    acceptLanguage: 'en-US,en;q=0.9',
  },
  // Edge Windows
  {
    userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0',
    acceptLanguage: 'en-US,en;q=0.9',
    secChUa: '"Microsoft Edge";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
    secChUaPlatform: '"Windows"',
    secChUaMobile: '?0',
  },
];

// Index into FINGERPRINTS used when no session is active; advanced by rotateFingerprint.
let currentFingerprintIndex = 0;

// Active crawl session, or null. The shape mirrors the exported CrawlSession interface.
let currentSession: {
  sessionId: string;
  fingerprint: Fingerprint;
  proxyUrl: string | null;
  stateCode?: string;
  timezone?: string;
  startedAt: Date;
} | null = null;

/**
 * Get current fingerprint - returns session fingerprint if active, otherwise
 * the pool entry selected by the rotation index.
 */
export function getFingerprint(): Fingerprint {
  if (currentSession) {
    return currentSession.fingerprint;
  }
  return FINGERPRINTS[currentFingerprintIndex];
}

/**
 * Advance to the next fingerprint in the pool and make it the active one.
 *
 * While a session is active getFingerprint() serves the session's fingerprint,
 * so advancing only the pool index would leave requests unchanged (a 403 retry
 * would go out with the identity that was just blocked). The session's
 * fingerprint is therefore replaced as well, keeping the session's
 * Accept-Language so the timezone-derived locale stays consistent. If the next
 * pool entry is the one the session already uses, it is skipped.
 *
 * @returns The fingerprint that getFingerprint() will return from now on.
 */
export function rotateFingerprint(): Fingerprint {
  const poolSize = FINGERPRINTS.length;
  currentFingerprintIndex = (currentFingerprintIndex + 1) % poolSize;

  // Session fingerprints are picked at random, so the next pool entry may be
  // the very identity that is already in use - step past it.
  if (
    currentSession &&
    poolSize > 1 &&
    FINGERPRINTS[currentFingerprintIndex].userAgent === currentSession.fingerprint.userAgent
  ) {
    currentFingerprintIndex = (currentFingerprintIndex + 1) % poolSize;
  }

  let fp = FINGERPRINTS[currentFingerprintIndex];
  if (currentSession) {
    fp = { ...fp, acceptLanguage: currentSession.fingerprint.acceptLanguage };
    currentSession.fingerprint = fp;
  }

  console.log(`[Dutchie Client] Rotated to fingerprint: ${fp.userAgent.slice(0, 50)}...`);
  return fp;
}

/**
 * Point the rotation index back at the first pool entry.
 * Does not touch an active session's fingerprint.
 */
export function resetFingerprint(): void {
  currentFingerprintIndex = 0;
}
/**
 * Pick one entry of the fingerprint pool uniformly at random.
 * The rotation index used by rotateFingerprint is not affected.
 */
export function getRandomFingerprint(): Fingerprint {
  return FINGERPRINTS[Math.floor(Math.random() * FINGERPRINTS.length)];
}
// ============================================================
// SESSION MANAGEMENT
// Per-session fingerprint rotation for stealth
// ============================================================
/**
* Identity used for one crawl, created by startSession and cleared by endSession.
*/
export interface CrawlSession {
// Generated by startSession as `session_<timestamp>_<random suffix>`.
sessionId: string;
// Fingerprint served by getFingerprint() while this session is active.
fingerprint: Fingerprint;
// Proxy URL that was active when the session started (null for a direct connection).
proxyUrl: string | null;
// State code passed to startSession; stored as given.
stateCode?: string;
// Timezone passed to startSession; also drives the session's Accept-Language.
timezone?: string;
// Creation time; endSession uses it to log the session duration.
startedAt: Date;
}
// Note: currentSession variable declared earlier in file for proper scoping
/**
 * Timezone to Accept-Language mapping
 * US timezones all use en-US but this can be extended for international
 */
const TIMEZONE_TO_LOCALE: Record<string, string> = {
  'America/Phoenix': 'en-US,en;q=0.9',
  'America/Los_Angeles': 'en-US,en;q=0.9',
  'America/Denver': 'en-US,en;q=0.9',
  'America/Chicago': 'en-US,en;q=0.9',
  'America/New_York': 'en-US,en;q=0.9',
  'America/Detroit': 'en-US,en;q=0.9',
  'America/Anchorage': 'en-US,en;q=0.9',
  'Pacific/Honolulu': 'en-US,en;q=0.9',
};

/**
 * Get Accept-Language header for a given timezone.
 *
 * @param timezone - IANA timezone name; may be omitted.
 * @returns The mapped Accept-Language value, or 'en-US,en;q=0.9' when the
 *          timezone is missing or not in the table.
 */
export function getLocaleForTimezone(timezone?: string): string {
  const fallback = 'en-US,en;q=0.9';
  if (!timezone) return fallback;

  // Only own keys count: a plain-object lookup would otherwise resolve names
  // such as "constructor" or "toString" through the prototype chain and hand
  // back a function instead of a header value.
  if (!Object.prototype.hasOwnProperty.call(TIMEZONE_TO_LOCALE, timezone)) {
    return fallback;
  }
  return TIMEZONE_TO_LOCALE[timezone] || fallback;
}
/**
 * Begin a crawl session with a randomly chosen fingerprint.
 *
 * The fingerprint's Accept-Language is replaced by the locale derived from
 * `timezone`, and the proxy active at this moment is recorded on the session.
 * Until endSession() is called, getFingerprint() returns this session's
 * fingerprint. Any previously active session is overwritten.
 *
 * @param stateCode - Optional state code stored on the session.
 * @param timezone - Optional IANA timezone; selects the Accept-Language value.
 * @returns The newly created session.
 */
export function startSession(stateCode?: string, timezone?: string): CrawlSession {
  // Geographic consistency: language header follows the store's timezone.
  const fingerprint: Fingerprint = {
    ...getRandomFingerprint(),
    acceptLanguage: getLocaleForTimezone(timezone),
  };

  const stamp = Date.now();
  const suffix = Math.random().toString(36).slice(2, 8);
  const session: CrawlSession = {
    sessionId: `session_${stamp}_${suffix}`,
    fingerprint,
    proxyUrl: currentProxy,
    stateCode,
    timezone,
    startedAt: new Date(),
  };
  currentSession = session;

  console.log(`[Dutchie Client] Started session ${session.sessionId}`);
  console.log(`[Dutchie Client] Fingerprint: ${fingerprint.userAgent.slice(0, 50)}...`);
  console.log(`[Dutchie Client] Accept-Language: ${fingerprint.acceptLanguage}`);
  if (timezone) {
    console.log(`[Dutchie Client] Timezone: ${timezone}`);
  }

  return session;
}
/**
 * Close the active crawl session, logging how long it lasted (whole seconds).
 * A no-op when no session is active.
 */
export function endSession(): void {
  const session = currentSession;
  if (!session) {
    return;
  }

  const elapsedMs = Date.now() - session.startedAt.getTime();
  const seconds = Math.round(elapsedMs / 1000);
  console.log(`[Dutchie Client] Ended session ${session.sessionId} (${seconds}s)`);
  currentSession = null;
}
/**
* Get current active session
*
* @returns The session created by the last startSession call, or null when none is active.
*/
export function getCurrentSession(): CrawlSession | null {
return currentSession;
}
// ============================================================
// CURL HTTP CLIENT
// ============================================================
/**
 * Assemble the request headers for a Dutchie GraphQL call.
 *
 * @param refererPath - Path appended to https://dutchie.com to form the Referer.
 * @param fingerprint - Identity to use; defaults to the current fingerprint.
 * @returns Header map in send order. Client-hint and sec-fetch headers are
 *          included only for fingerprints that define `secChUa`.
 *
 * NOTE(review): sec-fetch-site is sent as 'same-site' although Origin and the
 * GraphQL endpoint share the dutchie.com origin - confirm this is deliberate.
 */
export function buildHeaders(refererPath: string, fingerprint?: Fingerprint): Record<string, string> {
  const fp = fingerprint ? fingerprint : getFingerprint();

  const headers: Record<string, string> = {
    'accept': 'application/json, text/plain, */*',
    'accept-language': fp.acceptLanguage,
    'content-type': 'application/json',
    'origin': 'https://dutchie.com',
    'referer': `https://dutchie.com${refererPath}`,
    'user-agent': fp.userAgent,
    'apollographql-client-name': 'Marketplace (production)',
  };

  if (!fp.secChUa) {
    return headers;
  }

  // Chromium-style profiles also advertise client hints and fetch metadata.
  Object.assign(headers, {
    'sec-ch-ua': fp.secChUa,
    'sec-ch-ua-mobile': fp.secChUaMobile || '?0',
    'sec-ch-ua-platform': fp.secChUaPlatform || '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-site',
  });
  return headers;
}
/**
 * Execute HTTP POST using curl (bypasses TLS fingerprinting).
 *
 * The request is run synchronously through a shell command. Every
 * caller-supplied piece (headers, JSON body, URL) is emitted as a single-quoted
 * shell word with embedded quotes escaped, so a stray `'` in a header value or
 * URL cannot break or alter the command.
 *
 * @param url - Target URL.
 * @param body - Value serialised with JSON.stringify and sent as the request body.
 * @param headers - Request headers; any `accept-encoding` entry is dropped
 *                  because `--compressed` lets curl negotiate encoding itself.
 * @param timeout - Request timeout in milliseconds (rounded up to whole seconds
 *                  for curl; the child process gets 5 extra seconds).
 * @returns Status and body (parsed JSON when possible, raw text otherwise).
 *          On command failure: `{ status: 0, data: null, error }`. The error text
 *          has proxy credentials masked, because Node's message echoes the full
 *          command line.
 */
export function curlPost(url: string, body: any, headers: Record<string, string>, timeout = 30000): CurlResponse {
  // POSIX single-quote escaping: close the quote, emit \', reopen the quote.
  const quote = (value: string): string => `'${value.replace(/'/g, "'\\''")}'`;
  // An unparsable status becomes 0 rather than NaN so callers can compare safely.
  const parseStatus = (raw: string): number => {
    const code = parseInt(raw, 10);
    return Number.isNaN(code) ? 0 : code;
  };
  const parseBody = (raw: string): any => {
    try {
      return JSON.parse(raw);
    } catch {
      return raw;
    }
  };
  // Masks "user:password@" inside URLs.
  const redact = (text: string): string => text.replace(/(:\/\/[^:@\/\s']+):[^@\s'\/]+@/g, '$1:***@');

  const headerArgs = Object.entries(headers)
    .filter(([name]) => name.toLowerCase() !== 'accept-encoding')
    .map(([name, value]) => `-H ${quote(`${name}: ${value}`)}`)
    .join(' ');
  const payloadArg = quote(JSON.stringify(body));
  const timeoutSec = Math.ceil(timeout / 1000);
  // Marker written by curl's -w option so the status can be split from the body.
  const separator = '___HTTP_STATUS___';
  const proxyArg = getProxyArg();
  const cmd = `curl -s --compressed ${proxyArg} -w '${separator}%{http_code}' --max-time ${timeoutSec} ${headerArgs} -d ${payloadArg} ${quote(url)}`;

  try {
    const output = execSync(cmd, {
      encoding: 'utf-8',
      maxBuffer: 10 * 1024 * 1024,
      timeout: timeout + 5000,
    });

    const separatorIndex = output.lastIndexOf(separator);
    if (separatorIndex === -1) {
      // Marker missing: fall back to treating the last output line as the status.
      const lines = output.trim().split('\n');
      const statusCode = parseStatus(lines.pop() || '0');
      return { status: statusCode, data: parseBody(lines.join('\n')) };
    }

    const responseBody = output.slice(0, separatorIndex);
    const statusCode = parseStatus(output.slice(separatorIndex + separator.length).trim());
    return { status: statusCode, data: parseBody(responseBody) };
  } catch (error: any) {
    const message: string = (error && error.message) || 'curl request failed';
    return {
      status: 0,
      data: null,
      error: redact(message),
    };
  }
}
/**
 * Execute HTTP GET using curl (bypasses TLS fingerprinting).
 *
 * Headers and URL are emitted as single-quoted shell words with embedded quotes
 * escaped, so a stray `'` cannot break or alter the command.
 *
 * @param url - Target URL.
 * @param headers - Request headers; any `accept-encoding` entry is dropped
 *                  because `--compressed` lets curl negotiate encoding itself.
 * @param timeout - Request timeout in milliseconds (rounded up to whole seconds
 *                  for curl; the child process gets 5 extra seconds).
 * @returns Status and body: parsed JSON when the body is valid JSON, otherwise
 *          the raw text (e.g. HTML). On command failure:
 *          `{ status: 0, data: null, error }`. The error text has proxy
 *          credentials masked, because Node's message echoes the full command line.
 */
export function curlGet(url: string, headers: Record<string, string>, timeout = 30000): CurlResponse {
  // POSIX single-quote escaping: close the quote, emit \', reopen the quote.
  const quote = (value: string): string => `'${value.replace(/'/g, "'\\''")}'`;
  // An unparsable status becomes 0 rather than NaN so callers can compare safely.
  const parseStatus = (raw: string): number => {
    const code = parseInt(raw, 10);
    return Number.isNaN(code) ? 0 : code;
  };
  // Masks "user:password@" inside URLs.
  const redact = (text: string): string => text.replace(/(:\/\/[^:@\/\s']+):[^@\s'\/]+@/g, '$1:***@');

  const headerArgs = Object.entries(headers)
    .filter(([name]) => name.toLowerCase() !== 'accept-encoding')
    .map(([name, value]) => `-H ${quote(`${name}: ${value}`)}`)
    .join(' ');
  const timeoutSec = Math.ceil(timeout / 1000);
  // Marker written by curl's -w option so the status can be split from the body.
  const separator = '___HTTP_STATUS___';
  const proxyArg = getProxyArg();
  const cmd = `curl -s --compressed ${proxyArg} -w '${separator}%{http_code}' --max-time ${timeoutSec} ${headerArgs} ${quote(url)}`;

  try {
    const output = execSync(cmd, {
      encoding: 'utf-8',
      maxBuffer: 10 * 1024 * 1024,
      timeout: timeout + 5000,
    });

    const separatorIndex = output.lastIndexOf(separator);
    if (separatorIndex === -1) {
      // Marker missing: treat the last output line as the status and return
      // the remainder as raw text.
      const lines = output.trim().split('\n');
      const statusCode = parseStatus(lines.pop() || '0');
      return { status: statusCode, data: lines.join('\n') };
    }

    const responseBody = output.slice(0, separatorIndex);
    const statusCode = parseStatus(output.slice(separatorIndex + separator.length).trim());
    // Try to parse as JSON, otherwise return as string (HTML)
    try {
      return { status: statusCode, data: JSON.parse(responseBody) };
    } catch {
      return { status: statusCode, data: responseBody };
    }
  } catch (error: any) {
    const message: string = (error && error.message) || 'curl request failed';
    return {
      status: 0,
      data: null,
      error: redact(message),
    };
  }
}
// ============================================================
// GRAPHQL EXECUTION
// ============================================================
/**
* Tuning knobs for executeGraphQL.
*/
export interface ExecuteGraphQLOptions {
// Number of retries after the first attempt; executeGraphQL defaults it to 3.
maxRetries?: number;
// Whether a 403 triggers proxy/fingerprint rotation followed by a retry; defaults to true.
retryOn403?: boolean;
cName?: string; // Optional - used for Referer header, defaults to 'cities'
}
/**
 * Execute a persisted GraphQL query with curl (bypasses TLS fingerprinting).
 *
 * Each failed attempt is followed by a linear back-off (1s, 2s, 3s, ...) as
 * long as another attempt remains. On a 403 with `retryOn403` enabled, the
 * proxy and fingerprint are rotated before the next attempt.
 *
 * @param operationName - GraphQL operation name.
 * @param variables - Variables object sent with the query.
 * @param hash - SHA-256 hash of the persisted query.
 * @param options - Optional retry/referer settings. `maxRetries` defaults to
 *                  DUTCHIE_CONFIG.maxRetries, `retryOn403` to true, `cName` to 'cities'.
 * @returns The parsed response body of the first HTTP 200 answer. GraphQL-level
 *          `errors` are logged but still returned to the caller.
 * @throws Error carrying the last curl error or `HTTP <status>` once attempts
 *         are exhausted, or immediately on a 403 when `retryOn403` is false.
 */
export async function executeGraphQL(
  operationName: string,
  variables: any,
  hash: string,
  options: ExecuteGraphQLOptions = {}
): Promise<any> {
  const { maxRetries = DUTCHIE_CONFIG.maxRetries, retryOn403 = true, cName = 'cities' } = options;
  const body = {
    operationName,
    variables,
    extensions: {
      persistedQuery: { version: 1, sha256Hash: hash },
    },
  };

  let lastError: Error | null = null;
  let attempt = 0;

  while (attempt <= maxRetries) {
    // Re-read the fingerprint every attempt so a rotation takes effect.
    const fingerprint = getFingerprint();
    const headers = buildHeaders(`/embedded-menu/${cName}`, fingerprint);
    console.log(`[Dutchie Client] curl POST ${operationName} (attempt ${attempt + 1}/${maxRetries + 1})`);
    const response = curlPost(DUTCHIE_CONFIG.graphqlEndpoint, body, headers, DUTCHIE_CONFIG.timeout);
    console.log(`[Dutchie Client] Response status: ${response.status}`);

    if (!response.error && response.status === 200) {
      if (response.data?.errors?.length > 0) {
        console.warn(`[Dutchie Client] GraphQL errors: ${JSON.stringify(response.data.errors[0])}`);
      }
      return response.data;
    }

    if (response.error) {
      console.error(`[Dutchie Client] curl error: ${response.error}`);
      lastError = new Error(response.error);
    } else if (response.status === 403) {
      // Remember the 403 so exhausting the retries reports it instead of a
      // generic "Max retries exceeded".
      lastError = new Error('HTTP 403');
      if (!retryOn403) {
        // The caller opted out of 403 retries: fail fast instead of hammering
        // the endpoint with the same blocked identity.
        console.error(`[Dutchie Client] HTTP 403 (retryOn403 disabled)`);
        throw lastError;
      }
      console.warn(`[Dutchie Client] 403 blocked - rotating proxy and fingerprint...`);
      // Rotate even on the last attempt so the next request starts clean.
      await rotateProxyOn403('403 Forbidden on GraphQL');
      rotateFingerprint();
    } else {
      const bodyPreview = typeof response.data === 'string'
        ? response.data.slice(0, 200)
        : (JSON.stringify(response.data) ?? '').slice(0, 200);
      console.error(`[Dutchie Client] HTTP ${response.status}: ${bodyPreview}`);
      lastError = new Error(`HTTP ${response.status}`);
    }

    attempt++;
    // Back off only when another attempt will actually follow.
    if (attempt <= maxRetries) {
      await sleep(1000 * attempt);
    }
  }

  throw lastError || new Error('Max retries exceeded');
}
// ============================================================
// HTML PAGE FETCHING
// ============================================================
/**
* Tuning knobs for fetchPage.
*/
export interface FetchPageOptions {
// Number of retries after the first attempt; fetchPage defaults it to 3.
maxRetries?: number;
// Whether a 403 triggers proxy/fingerprint rotation followed by a retry; defaults to true.
retryOn403?: boolean;
}
/**
 * Fetch an HTML page from Dutchie (city pages, dispensary pages, etc.).
 *
 * Each failed attempt is followed by a linear back-off (1s, 2s, 3s, ...) as
 * long as another attempt remains. On a 403 with `retryOn403` enabled, the
 * proxy and fingerprint are rotated before the next attempt.
 *
 * @param path - Path appended to DUTCHIE_CONFIG.baseUrl.
 * @param options - Optional retry settings. `maxRetries` defaults to
 *                  DUTCHIE_CONFIG.maxRetries, `retryOn403` to true.
 * @returns `{ html, status }` for the first HTTP 200 answer, or null when all
 *          attempts fail (or on a 403 when `retryOn403` is false). `html` is
 *          always a string: a body that curlGet parsed as JSON is serialised
 *          back to text.
 */
export async function fetchPage(
  path: string,
  options: FetchPageOptions = {}
): Promise<{ html: string; status: number } | null> {
  const { maxRetries = DUTCHIE_CONFIG.maxRetries, retryOn403 = true } = options;
  const url = `${DUTCHIE_CONFIG.baseUrl}${path}`;
  let attempt = 0;

  while (attempt <= maxRetries) {
    // Re-read the fingerprint every attempt so a rotation takes effect.
    const fingerprint = getFingerprint();
    const headers: Record<string, string> = {
      'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
      'accept-language': fingerprint.acceptLanguage,
      'user-agent': fingerprint.userAgent,
    };
    if (fingerprint.secChUa) {
      // Chromium-style profiles also send client hints and navigation metadata.
      headers['sec-ch-ua'] = fingerprint.secChUa;
      headers['sec-ch-ua-mobile'] = fingerprint.secChUaMobile || '?0';
      headers['sec-ch-ua-platform'] = fingerprint.secChUaPlatform || '"Windows"';
      headers['sec-fetch-dest'] = 'document';
      headers['sec-fetch-mode'] = 'navigate';
      headers['sec-fetch-site'] = 'none';
      headers['sec-fetch-user'] = '?1';
      headers['upgrade-insecure-requests'] = '1';
    }

    console.log(`[Dutchie Client] curl GET ${path} (attempt ${attempt + 1}/${maxRetries + 1})`);
    const response = curlGet(url, headers, DUTCHIE_CONFIG.timeout);
    console.log(`[Dutchie Client] Response status: ${response.status}`);

    if (!response.error && response.status === 200) {
      // curlGet JSON-parses bodies that happen to be valid JSON; callers of
      // fetchPage are promised text, so convert such values back.
      const html = typeof response.data === 'string'
        ? response.data
        : (JSON.stringify(response.data) ?? '');
      return { html, status: response.status };
    }

    if (response.error) {
      console.error(`[Dutchie Client] curl error: ${response.error}`);
    } else if (response.status === 403) {
      if (!retryOn403) {
        // The caller opted out of 403 retries: give up instead of repeating
        // the request with the same blocked identity.
        console.error(`[Dutchie Client] HTTP 403 (retryOn403 disabled)`);
        return null;
      }
      console.warn(`[Dutchie Client] 403 blocked - rotating proxy and fingerprint...`);
      // Rotate even on the last attempt so the next request starts clean.
      await rotateProxyOn403('403 Forbidden on page fetch');
      rotateFingerprint();
    } else {
      console.error(`[Dutchie Client] HTTP ${response.status}`);
    }

    attempt++;
    // Back off only when another attempt will actually follow.
    if (attempt <= maxRetries) {
      await sleep(1000 * attempt);
    }
  }

  return null;
}
/**
 * Extract the __NEXT_DATA__ JSON payload from an HTML page.
 *
 * The script tag is located by its `id` attribute alone, so extra attributes
 * (nonce, crossorigin, ...) or a different attribute order do not prevent a
 * match, and the payload may contain `<` characters.
 *
 * @param html - Page markup.
 * @returns The parsed payload, or null when the tag is absent, empty, not
 *          valid JSON, or `html` is not a string.
 */
export function extractNextData(html: string): any | null {
  // fetchPage callers may hand over whatever came back; never throw on non-text.
  if (typeof html !== 'string') {
    return null;
  }

  const match = html.match(/<script[^>]*\sid="__NEXT_DATA__"[^>]*>([\s\S]*?)<\/script>/);
  if (!match || !match[1]) {
    return null;
  }

  try {
    return JSON.parse(match[1]);
  } catch (e) {
    console.error('[Dutchie Client] Failed to parse __NEXT_DATA__:', e);
    return null;
  }
}
// ============================================================
// UTILITY
// ============================================================
/**
 * Resolve after roughly `ms` milliseconds; used for retry back-off.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}