Major changes: - Split crawl into payload_fetch (API → disk) and product_refresh (disk → DB) - Add task chaining: store_discovery → product_discovery → payload_fetch → product_refresh - Add payload storage utilities for gzipped JSON on filesystem - Add /api/payloads endpoints for payload access and diffing - Add DB-driven TaskScheduler with schedule persistence - Track newDispensaryIds through discovery promotion for chaining - Add stealth improvements: HTTP fingerprinting, proxy rotation enhancements - Add Workers dashboard K8s scaling controls New files: - src/tasks/handlers/payload-fetch.ts - Fetches from API, saves to disk - src/services/task-scheduler.ts - DB-driven schedule management - src/utils/payload-storage.ts - Payload save/load utilities - src/routes/payloads.ts - Payload API endpoints - src/services/http-fingerprint.ts - Browser fingerprint generation - docs/TASK_WORKFLOW_2024-12-10.md - Complete workflow documentation Migrations: - 078: Proxy consecutive 403 tracking - 079: task_schedules table - 080: raw_crawl_payloads table - 081: payload column and last_fetch_at 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
316 lines
8.4 KiB
TypeScript
316 lines
8.4 KiB
TypeScript
/**
|
|
* HTTP Fingerprinting Service
|
|
*
|
|
* Per workflow-12102025.md - HTTP Fingerprinting section:
|
|
* - Full header set per browser type
|
|
* - Browser-specific header ordering
|
|
* - Natural randomization (DNT, Accept quality)
|
|
* - Dynamic Referer per dispensary
|
|
*
|
|
* Canonical location: src/services/http-fingerprint.ts
|
|
*/
|
|
|
|
// ============================================================
|
|
// TYPES
|
|
// ============================================================
|
|
|
|
/**
 * Browser families this module can imitate. Each member is a key of the
 * header-order table and of the curl-impersonate binary table below.
 */
export type BrowserType = 'Chrome' | 'Firefox' | 'Safari' | 'Edge';
|
|
|
|
/**
 * Per workflow-12102025.md: the HTTP fingerprint held for one session.
 *
 * Created by generateHTTPFingerprint and read by buildOrderedHeaders.
 */
export interface HTTPFingerprint {
  /** Browser family being imitated. */
  browserType: BrowserType;

  /** Header name/value map; generateHTTPFingerprint initialises it empty. */
  headers: Record<string, string>;

  /** Header names in the order this browser type emits them. */
  headerOrder: string[];

  /** Name of the curl-impersonate executable chosen for this browser type. */
  curlImpersonateBinary: string;

  /** When true, headers built from this fingerprint include `DNT: 1`. */
  hasDNT: boolean;
}
|
|
|
|
/**
 * Per workflow-12102025.md: per-request inputs used by buildOrderedHeaders.
 */
export interface HeaderContext {
  /** Value sent as the `User-Agent` header. */
  userAgent: string;

  /** `sec-ch-ua` value; only emitted for Chrome and Edge fingerprints. */
  secChUa?: string;

  /** `sec-ch-ua-platform` value; only emitted for Chrome and Edge fingerprints. */
  secChUaPlatform?: string;

  /** `sec-ch-ua-mobile` value; only emitted for Chrome and Edge fingerprints. */
  secChUaMobile?: string;

  /** Value sent as the `Referer` header. */
  referer: string;

  /** When true, `Content-Type` and `Origin` (and optionally `Content-Length`) are added. */
  isPost: boolean;

  /** Body size for `Content-Length`; only consulted when `isPost` is true. */
  contentLength?: number;
}
|
|
|
|
// ============================================================
|
|
// CONSTANTS (per workflow-12102025.md)
|
|
// ============================================================
|
|
|
|
/**
 * Per workflow-12102025.md: share of sessions that send the DNT header (~30%).
 * generateHTTPFingerprint compares one Math.random() draw against this value.
 */
const DNT_PROBABILITY = 0.3;
|
|
|
|
/**
 * Per workflow-12102025.md: candidate `Accept` header values.
 * buildOrderedHeaders picks one entry at random on each call.
 */
const ACCEPT_VARIATIONS = [
  'application/json, text/plain, */*',
  'application/json,text/plain,*/*',
  '*/*',
];
|
|
|
|
/**
 * Per workflow-12102025.md: candidate `Accept-Language` header values.
 * buildOrderedHeaders picks one entry at random on each call.
 */
const ACCEPT_LANGUAGE_VARIATIONS = [
  'en-US,en;q=0.9',
  'en-US,en;q=0.8',
  'en-US;q=0.9,en;q=0.8',
];
|
|
|
|
/**
 * Per workflow-12102025.md: curl-impersonate executable name for each browser type.
 * Edge maps to the Chrome binary because Edge is Chromium-based.
 *
 * NOTE(review): confirm that each of these executable names exists in the
 * curl-impersonate build that is actually deployed.
 */
const CURL_IMPERSONATE_BINARIES: Record<BrowserType, string> = {
  Chrome: 'curl_chrome131',
  Edge: 'curl_chrome131',
  Firefox: 'curl_ff133',
  Safari: 'curl_safari17',
};
|
|
|
|
// ============================================================
|
|
// HEADER ORDERING (per workflow-12102025.md)
|
|
// ============================================================
|
|
|
|
/**
 * Per workflow-12102025.md: Chrome header order for GraphQL requests.
 */
const CHROME_HEADER_ORDER = [
  'Host', 'Connection', 'Content-Length',
  'sec-ch-ua', 'DNT', 'sec-ch-ua-mobile', 'User-Agent', 'sec-ch-ua-platform',
  'Content-Type', 'Accept', 'Origin',
  'sec-fetch-site', 'sec-fetch-mode', 'sec-fetch-dest',
  'Referer', 'Accept-Encoding', 'Accept-Language',
];

/**
 * Per workflow-12102025.md: Firefox header order for GraphQL requests.
 */
const FIREFOX_HEADER_ORDER = [
  'Host', 'User-Agent',
  'Accept', 'Accept-Language', 'Accept-Encoding',
  'Content-Type', 'Content-Length', 'Origin',
  'DNT', 'Connection', 'Referer',
  'sec-fetch-dest', 'sec-fetch-mode', 'sec-fetch-site',
];

/**
 * Per workflow-12102025.md: Safari header order for GraphQL requests.
 * The list contains no DNT or sec-* entries.
 */
const SAFARI_HEADER_ORDER = [
  'Host', 'Connection', 'Content-Length',
  'Accept', 'User-Agent', 'Content-Type', 'Origin',
  'Referer', 'Accept-Encoding', 'Accept-Language',
];

/**
 * Header order lookup by browser type.
 * Per workflow-12102025.md: Edge reuses the Chrome order (Chromium-based), so
 * both keys point at the same array instance.
 */
const HEADER_ORDERS: Record<BrowserType, string[]> = {
  Chrome: CHROME_HEADER_ORDER,
  Edge: CHROME_HEADER_ORDER,
  Firefox: FIREFOX_HEADER_ORDER,
  Safari: SAFARI_HEADER_ORDER,
};
|
|
|
|
// ============================================================
|
|
// FINGERPRINT GENERATION
|
|
// ============================================================
|
|
|
|
/**
 * Per workflow-12102025.md: generate the HTTP fingerprint for a session.
 *
 * The DNT decision is drawn once here so it stays constant for every request
 * built from the returned fingerprint.
 *
 * @param browserType Browser family to imitate.
 * @returns A fingerprint with an empty `headers` map, the browser's header
 *   order, the matching curl-impersonate binary name, and the DNT flag.
 */
export function generateHTTPFingerprint(browserType: BrowserType): HTTPFingerprint {
  // Per workflow-12102025.md: DNT randomized per session (~30%)
  const hasDNT = Math.random() < DNT_PROBABILITY;

  return {
    browserType,
    headers: {}, // Built dynamically per request
    // Hand out a copy: the lookup table is module-level state (and Chrome/Edge
    // share one array), so a caller mutating its fingerprint must not be able
    // to change the header order used by other sessions.
    headerOrder: HEADER_ORDERS[browserType].slice(),
    curlImpersonateBinary: CURL_IMPERSONATE_BINARIES[browserType],
    hasDNT,
  };
}
|
|
|
|
/**
 * Per workflow-12102025.md: build the complete header set for one request and
 * return it in the fingerprint's browser-specific order.
 *
 * Every candidate header is assembled first; the result then keeps only the
 * names listed in `fingerprint.headerOrder` whose value is non-empty, in that
 * order. Names in the order list with no value (e.g. `Host`) are skipped.
 *
 * NOTE(review): Accept and Accept-Language are drawn again on every call, so
 * they can differ between requests of the same session — confirm this is the
 * intended "natural randomization".
 *
 * @param fingerprint Session fingerprint (browser type, DNT flag, header order).
 * @param context Per-request values (user agent, client hints, referer, POST info).
 * @returns `headers` as a name/value map and `orderedHeaders` as the emitted
 *   names in order.
 */
export function buildOrderedHeaders(
  fingerprint: HTTPFingerprint,
  context: HeaderContext
): { headers: Record<string, string>; orderedHeaders: string[] } {
  const pickRandom = (choices: readonly string[]): string =>
    choices[Math.floor(Math.random() * choices.length)];

  // Per workflow-12102025.md: natural randomization (Accept is drawn first).
  const accept = pickRandom(ACCEPT_VARIATIONS);
  const acceptLanguage = pickRandom(ACCEPT_LANGUAGE_VARIATIONS);

  // Candidate pool; insertion order is irrelevant, headerOrder decides output order.
  const candidates: Record<string, string> = {
    'Connection': 'keep-alive',
    'User-Agent': context.userAgent,
    'Accept': accept,
    'Accept-Language': acceptLanguage,
    'Accept-Encoding': 'gzip, deflate, br',
    // Per workflow-12102025.md: dynamic Referer per dispensary
    'Referer': context.referer,
  };

  // Per workflow-12102025.md: POST-only headers
  if (context.isPost) {
    candidates['Content-Type'] = 'application/json';
    candidates['Origin'] = 'https://dutchie.com';
    if (context.contentLength !== undefined) {
      candidates['Content-Length'] = String(context.contentLength);
    }
  }

  // Per workflow-12102025.md: DNT is fixed per session by the fingerprint
  if (fingerprint.hasDNT) {
    candidates['DNT'] = '1';
  }

  const isChromium = fingerprint.browserType === 'Chrome' || fingerprint.browserType === 'Edge';
  const isFirefox = fingerprint.browserType === 'Firefox';

  // Per workflow-12102025.md: client hints are Chromium-only (Chrome, Edge)
  if (isChromium) {
    if (context.secChUa) candidates['sec-ch-ua'] = context.secChUa;
    if (context.secChUaMobile) candidates['sec-ch-ua-mobile'] = context.secChUaMobile;
    if (context.secChUaPlatform) candidates['sec-ch-ua-platform'] = context.secChUaPlatform;
  }

  // Per workflow-12102025.md: Chromium and Firefox send sec-fetch-*; Safari sends no sec-* headers
  if (isChromium || isFirefox) {
    candidates['sec-fetch-site'] = 'same-origin';
    candidates['sec-fetch-mode'] = 'cors';
    candidates['sec-fetch-dest'] = 'empty';
  }

  // Keep only names that have a non-empty value, preserving the browser's order.
  const orderedHeaders = fingerprint.headerOrder.filter((name) => Boolean(candidates[name]));
  const headers: Record<string, string> = {};
  for (const name of orderedHeaders) {
    headers[name] = candidates[name];
  }

  return { headers, orderedHeaders };
}
|
|
|
|
/**
 * Per workflow-12102025.md: turn the ordered headers for a request into curl
 * command-line arguments.
 *
 * Each emitted header becomes a `-H` / `Name: value` pair, in the order
 * returned by buildOrderedHeaders. `Host` and `Content-Length` are left out
 * because curl sets those itself.
 *
 * @param fingerprint Session fingerprint passed through to buildOrderedHeaders.
 * @param context Per-request values passed through to buildOrderedHeaders.
 * @returns Flat argument list: `['-H', 'Name: value', '-H', ...]`.
 */
export function buildCurlHeaderArgs(
  fingerprint: HTTPFingerprint,
  context: HeaderContext
): string[] {
  const built = buildOrderedHeaders(fingerprint, context);
  const curlManaged = ['Host', 'Content-Length'];

  const args: string[] = [];
  built.orderedHeaders
    .filter((name) => curlManaged.indexOf(name) === -1)
    .forEach((name) => {
      args.push('-H', `${name}: ${built.headers[name]}`);
    });

  return args;
}
|
|
|
|
/**
 * Per workflow-12102025.md: derive the Referer for a dispensary from its menu_url.
 *
 * Recognised shapes are `/embedded-menu/<slug>` and `/dispensary/<slug>`,
 * either as a bare path or inside a full URL; an embedded-menu match takes
 * precedence. The slug ends at the next `/`, `?`, `#` or whitespace, so query
 * strings, fragments and stray whitespace never end up in the Referer.
 *
 * @param menuUrl The dispensary's menu_url; may be null, undefined or empty.
 * @returns `https://dutchie.com/dispensary/<slug>` when a slug is found,
 *   otherwise `https://dutchie.com/`.
 */
export function buildRefererFromMenuUrl(menuUrl: string | null | undefined): string {
  const fallback = 'https://dutchie.com/';
  if (!menuUrl) {
    return fallback;
  }

  // `#` and whitespace are excluded as well as `/` and `?`: a Referer never
  // carries a fragment, and whitespace is not valid inside a header URL.
  const embeddedMatch = menuUrl.match(/\/embedded-menu\/([^/?#\s]+)/);
  const dispensaryMatch = menuUrl.match(/\/dispensary\/([^/?#\s]+)/);

  const match = embeddedMatch || dispensaryMatch;
  return match ? `https://dutchie.com/dispensary/${match[1]}` : fallback;
}
|
|
|
|
/**
 * Per workflow-12102025.md: look up the curl-impersonate binary name for a browser type.
 *
 * @param browserType Browser family to imitate.
 * @returns The executable name registered for that browser type.
 */
export function getCurlBinary(browserType: BrowserType): string {
  const { [browserType]: binaryName } = CURL_IMPERSONATE_BINARIES;
  return binaryName;
}
|
|
|
|
/**
 * Per workflow-12102025.md: check whether the curl-impersonate binary for a
 * browser type can be found.
 *
 * Runs `which <binary>` synchronously with its output discarded. Any failure
 * inside the probe — a non-zero exit, or `require` / `child_process` being
 * unavailable — is reported as `false` rather than thrown.
 *
 * @param browserType Browser family whose binary should be probed.
 * @returns `true` when `which` succeeds for the binary, otherwise `false`.
 */
export function isCurlImpersonateAvailable(browserType: BrowserType): boolean {
  const binaryName = CURL_IMPERSONATE_BINARIES[browserType];

  let found = false;
  try {
    // Loaded lazily inside the try so a missing `require` also yields false.
    const childProcess = require('child_process');
    childProcess.execSync(`which ${binaryName}`, { stdio: 'ignore' });
    found = true;
  } catch {
    // Probe failed: treat the binary as unavailable.
  }

  return found;
}
|