/**
 * Crawl Rotator - Proxy & User Agent Rotation for Crawlers
 *
 * Updated: 2025-12-10 per workflow-12102025.md
 *
 * KEY BEHAVIORS (per workflow-12102025.md):
 * 1. Task determines WHAT work to do, proxy determines SESSION IDENTITY
 * 2. Proxy location (timezone) sets Accept-Language headers (always English)
 * 3. On 403: immediately get new IP, new fingerprint, retry
 * 4. After 3 consecutive 403s on same proxy with different fingerprints → disable proxy
 *
 * USER-AGENT GENERATION (per workflow-12102025.md):
 * - Device distribution: Mobile 62%, Desktop 36%, Tablet 2%
 * - Browser whitelist: Chrome, Safari, Edge, Firefox only
 * - UA sticks until IP rotates
 * - Failure = alert admin + stop crawl (no fallback)
 *
 * Uses intoli/user-agents for realistic UA generation with daily-updated data.
 *
 * Canonical location: src/services/crawl-rotator.ts
 */

import { Pool } from 'pg';
import UserAgent from 'user-agents';
import {
  HTTPFingerprint,
  generateHTTPFingerprint,
  BrowserType,
} from './http-fingerprint';

// ============================================================
// UA CONSTANTS (per workflow-12102025.md)
// ============================================================

/**
 * Per workflow-12102025.md: Device category distribution (hardcoded)
 * Mobile: 62%, Desktop: 36%, Tablet: 2%
 */
const DEVICE_WEIGHTS = {
  mobile: 62,
  desktop: 36,
  tablet: 2,
} as const;

/**
 * Per workflow-12102025.md: Browser whitelist
 * Only Chrome (67%), Safari (20%), Edge (6%), Firefox (3%)
 * Samsung Internet, Opera, and other niche browsers are filtered out
 */
const ALLOWED_BROWSERS = ['Chrome', 'Safari', 'Edge', 'Firefox'] as const;

/**
 * UA tokens of Chromium-based browsers that also advertise `Chrome/`.
 * They must be recognised BEFORE the generic Chrome check, otherwise they are
 * classified as Chrome and slip through the whitelist.
 */
const NICHE_BROWSER_TOKENS: ReadonlyArray<readonly [string, string]> = [
  ['SamsungBrowser/', 'Samsung Internet'],
  ['OPR/', 'Opera'],
  ['Opera/', 'Opera'],
];

// ============================================================
// PROXY TYPES
// ============================================================

/** One proxy row as held in memory by {@link ProxyRotator}. */
export interface Proxy {
  id: number;
  host: string;
  port: number;
  username?: string;
  password?: string;
  protocol: 'http' | 'https' | 'socks5';
  isActive: boolean;
  lastUsedAt: Date | null;
  failureCount: number;
  successCount: number;
  avgResponseTimeMs: number | null;
  maxConnections: number;
  /**
   * Per workflow-12102025.md: Track consecutive 403s with different fingerprints.
   * After 3 consecutive 403s → disable proxy (it's burned).
   */
  consecutive403Count: number;
  // Location info - determines session headers per workflow-12102025.md
  city?: string;
  state?: string;
  country?: string;
  countryCode?: string;
  timezone?: string;
}

/** Aggregate numbers returned by {@link ProxyRotator.getStats}. */
export interface ProxyStats {
  totalProxies: number;
  activeProxies: number;
  blockedProxies: number;
  avgSuccessRate: number;
}

// ============================================================
// FINGERPRINT TYPE
// Per workflow-12102025.md: Full browser fingerprint from user-agents
// ============================================================

export interface BrowserFingerprint {
  userAgent: string;
  platform: string;
  screenWidth: number;
  screenHeight: number;
  viewportWidth: number;
  viewportHeight: number;
  deviceCategory: string;
  browserName: string; // Per workflow-12102025.md: for session logging
  // Derived headers for anti-detect
  acceptLanguage: string;
  secChUa?: string;
  secChUaPlatform?: string;
  secChUaMobile?: string;
  // Per workflow-12102025.md: HTTP Fingerprinting section
  httpFingerprint: HTTPFingerprint;
}

/**
 * Per workflow-12102025.md: Session log entry for debugging blocked sessions
 */
export interface UASessionLog {
  deviceCategory: string;
  browserName: string;
  userAgent: string;
  proxyIp: string | null;
  sessionStartedAt: Date;
}

// ============================================================
// PROXY ROTATOR CLASS
// ============================================================

/**
 * Round-robin rotation over the active proxies loaded from the `proxies`
 * table, with success / failure / 403 bookkeeping mirrored to the database.
 */
export class ProxyRotator {
  private pool: Pool | null = null;
  private proxies: Proxy[] = [];
  private currentIndex: number = 0;
  private lastRotation: Date = new Date();

  constructor(pool?: Pool) {
    this.pool = pool || null;
  }

  /** Attach (or replace) the database pool used for loading and updates. */
  setPool(pool: Pool): void {
    this.pool = pool;
  }

  /**
   * Load proxies from database.
   *
   * Only rows with `active = true` are loaded. If no pool is configured the
   * call is a no-op; if the query fails the in-memory list is emptied.
   */
  async loadProxies(): Promise<void> {
    if (!this.pool) {
      console.warn('[ProxyRotator] No database pool configured');
      return;
    }

    try {
      const result = await this.pool.query(`
        SELECT
          id, host, port, username, password, protocol,
          active as "isActive",
          last_tested_at as "lastUsedAt",
          failure_count as "failureCount",
          0 as "successCount",
          response_time_ms as "avgResponseTimeMs",
          COALESCE(max_connections, 1) as "maxConnections",
          COALESCE(consecutive_403_count, 0) as "consecutive403Count",
          city, state, country,
          country_code as "countryCode",
          timezone
        FROM proxies
        WHERE active = true
        ORDER BY failure_count ASC, last_tested_at ASC NULLS FIRST
      `);

      this.proxies = result.rows;
      // A reload may return fewer rows than before; keep the cursor valid.
      if (this.currentIndex >= this.proxies.length) {
        this.currentIndex = 0;
      }

      const totalCapacity = this.proxies.reduce((sum, p) => sum + p.maxConnections, 0);
      console.log(`[ProxyRotator] Loaded ${this.proxies.length} active proxies (${totalCapacity} max concurrent connections)`);
    } catch (error) {
      console.warn(`[ProxyRotator] Could not load proxies: ${error}`);
      this.proxies = [];
      this.currentIndex = 0;
    }
  }

  /**
   * Get next proxy in rotation (advances the cursor, wrapping at the end).
   *
   * @returns the newly selected proxy, or `null` when no proxies are loaded.
   */
  getNext(): Proxy | null {
    if (this.proxies.length === 0) return null;

    this.currentIndex = (this.currentIndex + 1) % this.proxies.length;
    this.lastRotation = new Date();
    return this.proxies[this.currentIndex] ?? null;
  }

  /**
   * Get current proxy without rotating.
   *
   * @returns the proxy under the cursor, or `null` when no proxies are loaded.
   */
  getCurrent(): Proxy | null {
    if (this.proxies.length === 0) return null;
    // `?? null` guarantees the declared return type even if the cursor were stale.
    return this.proxies[this.currentIndex] ?? null;
  }

  /**
   * Get proxy by ID (only proxies still in the in-memory rotation are found).
   */
  getById(id: number): Proxy | null {
    return this.proxies.find(p => p.id === id) || null;
  }

  /**
   * Rotate to a specific proxy.
   *
   * @returns `false` when the id is not in the in-memory rotation.
   */
  setProxy(id: number): boolean {
    const index = this.proxies.findIndex(p => p.id === id);
    if (index === -1) return false;

    this.currentIndex = index;
    this.lastRotation = new Date();
    return true;
  }

  /**
   * Drop a proxy from the in-memory rotation and keep the cursor valid.
   *
   * - Removing an entry before the cursor shifts the cursor left so it keeps
   *   pointing at the same surviving proxy.
   * - Removing the entry under the cursor leaves the cursor on its successor
   *   (wrapping to the first proxy when the removed one was last).
   */
  private removeFromRotation(proxyId: number): void {
    const removedIndex = this.proxies.findIndex(p => p.id === proxyId);
    if (removedIndex === -1) return;

    this.proxies = this.proxies.filter(p => p.id !== proxyId);

    if (removedIndex < this.currentIndex) {
      this.currentIndex--;
    }
    if (this.currentIndex >= this.proxies.length) {
      this.currentIndex = 0;
    }
  }

  /**
   * Mark proxy as blocked (403 received)
   * Per workflow-12102025.md:
   * - Increment consecutive_403_count
   * - After 3 consecutive 403s with different fingerprints → disable proxy
   * - This is separate from general failures (timeouts, etc.)
   *
   * @returns `true` when the in-memory proxy was disabled by this call. A
   *          proxy id that is not in memory still gets its database row
   *          updated, but the method returns `false`.
   */
  async markBlocked(proxyId: number): Promise<boolean> {
    const proxy = this.proxies.find(p => p.id === proxyId);
    let shouldDisable = false;

    if (proxy) {
      proxy.consecutive403Count++;

      // Per workflow-12102025.md: 3 consecutive 403s → proxy is burned
      if (proxy.consecutive403Count >= 3) {
        proxy.isActive = false;
        this.removeFromRotation(proxyId);
        console.log(`[ProxyRotator] Proxy ${proxyId} DISABLED after ${proxy.consecutive403Count} consecutive 403s (burned)`);
        shouldDisable = true;
      } else {
        console.log(`[ProxyRotator] Proxy ${proxyId} blocked (403 #${proxy.consecutive403Count}/3)`);
      }
    }

    // Update database
    if (this.pool) {
      try {
        // The CASE sees the pre-update counter, so ">= 2" means this 403 is the third.
        await this.pool.query(`
          UPDATE proxies
          SET
            consecutive_403_count = COALESCE(consecutive_403_count, 0) + 1,
            last_failure_at = NOW(),
            test_result = '403 Forbidden',
            active = CASE WHEN COALESCE(consecutive_403_count, 0) >= 2 THEN false ELSE active END,
            updated_at = NOW()
          WHERE id = $1
        `, [proxyId]);
      } catch (err) {
        console.error(`[ProxyRotator] Failed to update proxy ${proxyId}:`, err);
      }
    }

    return shouldDisable;
  }

  /**
   * Mark proxy as failed (general error - timeout, connection error, etc.)
   * Separate from 403 blocking per workflow-12102025.md
   *
   * The proxy is removed from rotation once it reaches 5 general failures.
   *
   * @param error optional text stored in `test_result` (defaults to 'failed').
   */
  async markFailed(proxyId: number, error?: string): Promise<void> {
    const proxy = this.proxies.find(p => p.id === proxyId);

    if (proxy) {
      proxy.failureCount++;

      // Deactivate if too many general failures
      if (proxy.failureCount >= 5) {
        proxy.isActive = false;
        this.removeFromRotation(proxyId);
        console.log(`[ProxyRotator] Proxy ${proxyId} deactivated after ${proxy.failureCount} general failures`);
      }
    }

    if (this.pool) {
      try {
        // The CASE sees the pre-update counter, so ">= 4" means this failure is the fifth.
        await this.pool.query(`
          UPDATE proxies
          SET
            failure_count = failure_count + 1,
            updated_at = NOW(),
            test_result = $2,
            active = CASE WHEN failure_count >= 4 THEN false ELSE active END
          WHERE id = $1
        `, [proxyId, error || 'failed']);
      } catch (err) {
        console.error(`[ProxyRotator] Failed to update proxy ${proxyId}:`, err);
      }
    }
  }

  /**
   * Mark proxy as successful - resets consecutive 403 count
   * Per workflow-12102025.md: successful request clears the 403 counter
   *
   * @param responseTimeMs optional latency sample; blended into the running
   *        average as 80% old / 20% new. When omitted, the stored average is
   *        left untouched both in memory and in the database.
   */
  async markSuccess(proxyId: number, responseTimeMs?: number): Promise<void> {
    const proxy = this.proxies.find(p => p.id === proxyId);

    if (proxy) {
      proxy.successCount++;
      proxy.consecutive403Count = 0; // Reset on success per workflow-12102025.md
      proxy.lastUsedAt = new Date();

      if (responseTimeMs !== undefined) {
        // Explicit null check: an existing average of 0 is still a real sample.
        proxy.avgResponseTimeMs = proxy.avgResponseTimeMs != null
          ? (proxy.avgResponseTimeMs * 0.8) + (responseTimeMs * 0.2)
          : responseTimeMs;
      }
    }

    if (this.pool) {
      try {
        // COALESCE keeps the stored value when $2 is NULL (no sample supplied);
        // without it the arithmetic yields NULL and wipes response_time_ms.
        await this.pool.query(`
          UPDATE proxies
          SET
            last_tested_at = NOW(),
            test_result = 'success',
            consecutive_403_count = 0,
            response_time_ms = CASE
              WHEN response_time_ms IS NULL THEN $2
              ELSE COALESCE((response_time_ms * 0.8 + $2 * 0.2)::integer, response_time_ms)
            END,
            updated_at = NOW()
          WHERE id = $1
        `, [proxyId, responseTimeMs ?? null]);
      } catch (err) {
        console.error(`[ProxyRotator] Failed to update proxy ${proxyId}:`, err);
      }
    }
  }

  /**
   * Get proxy URL for HTTP client
   *
   * Credentials are included only when BOTH username and password are set.
   * NOTE(review): credentials are interpolated verbatim (not URL-encoded);
   * confirm stored values are URL-safe or already percent-encoded.
   */
  getProxyUrl(proxy: Proxy): string {
    const auth = proxy.username && proxy.password
      ? `${proxy.username}:${proxy.password}@`
      : '';
    return `${proxy.protocol}://${auth}${proxy.host}:${proxy.port}`;
  }

  /**
   * Get stats about proxy pool
   *
   * NOTE(review): `activeProxies` is the SUM of `maxConnections` over the
   * loaded proxies (i.e. connection capacity), not a proxy count — confirm
   * that is what consumers expect. `avgSuccessRate` only considers proxies
   * with at least one recorded success or failure.
   */
  getStats(): ProxyStats {
    const totalProxies = this.proxies.length;
    const activeProxies = this.proxies.reduce((sum, p) => sum + p.maxConnections, 0);
    const blockedProxies = this.proxies.filter(p => p.failureCount >= 5 || p.consecutive403Count >= 3).length;

    const successRates = this.proxies
      .filter(p => p.successCount + p.failureCount > 0)
      .map(p => p.successCount / (p.successCount + p.failureCount));

    const avgSuccessRate = successRates.length > 0
      ? successRates.reduce((a, b) => a + b, 0) / successRates.length
      : 0;

    return {
      totalProxies,
      activeProxies,
      blockedProxies,
      avgSuccessRate,
    };
  }

  /** True when at least one proxy is in the in-memory rotation. */
  hasAvailableProxies(): boolean {
    return this.proxies.length > 0;
  }
}

// ============================================================
// USER AGENT ROTATOR CLASS
// Per workflow-12102025.md: Uses intoli/user-agents for realistic fingerprints
// ============================================================

/**
 * Generates and holds the current browser fingerprint.
 *
 * Note: the constructor generates a fingerprint immediately, so constructing
 * an instance logs a line and may throw (see {@link UserAgentRotator.rotate}).
 */
export class UserAgentRotator {
  private currentFingerprint: BrowserFingerprint | null = null;
  private sessionLog: UASessionLog | null = null;

  constructor() {
    // Per workflow-12102025.md: Initialize with first fingerprint
    this.rotate();
  }

  /**
   * Per workflow-12102025.md: Roll device category based on distribution
   * Mobile: 62%, Desktop: 36%, Tablet: 2%
   */
  private rollDeviceCategory(): 'mobile' | 'desktop' | 'tablet' {
    const roll = Math.random() * 100;
    if (roll < DEVICE_WEIGHTS.mobile) {
      return 'mobile';
    } else if (roll < DEVICE_WEIGHTS.mobile + DEVICE_WEIGHTS.desktop) {
      return 'desktop';
    } else {
      return 'tablet';
    }
  }

  /**
   * Per workflow-12102025.md: Extract browser name from UA string
   *
   * Order matters: Samsung Internet / Opera and Edge all carry a `Chrome/`
   * token, so they are tested before the generic Chrome check.
   *
   * @returns a browser name, or 'Unknown' when nothing matches.
   */
  private extractBrowserName(userAgent: string): string {
    for (const [token, name] of NICHE_BROWSER_TOKENS) {
      if (userAgent.includes(token)) return name;
    }
    if (userAgent.includes('Edg/')) return 'Edge';
    if (userAgent.includes('Firefox/')) return 'Firefox';
    if (userAgent.includes('Safari/') && !userAgent.includes('Chrome/')) return 'Safari';
    if (userAgent.includes('Chrome/')) return 'Chrome';
    return 'Unknown';
  }

  /**
   * Per workflow-12102025.md: Check if browser is in whitelist
   */
  private isAllowedBrowser(userAgent: string): boolean {
    const browserName = this.extractBrowserName(userAgent);
    return ALLOWED_BROWSERS.includes(browserName as typeof ALLOWED_BROWSERS[number]);
  }

  /**
   * Generate a new random fingerprint
   * Per workflow-12102025.md:
   * - Roll device category (62/36/2)
   * - Filter to top 4 browsers only
   * - Failure = alert admin + stop (no fallback)
   *
   * @param proxyIp recorded in the session log only; `null` is stored when omitted.
   * @returns the new fingerprint, which also becomes the current one.
   * @throws Error when no whitelisted browser UA is produced within 50 attempts.
   */
  rotate(proxyIp?: string): BrowserFingerprint {
    // Per workflow-12102025.md: Roll device category
    const deviceCategory = this.rollDeviceCategory();

    // Per workflow-12102025.md: Generate UA filtered to device category
    const generator = new UserAgent({ deviceCategory });

    // Per workflow-12102025.md: Try to get an allowed browser (max 50 attempts)
    const maxAttempts = 50;
    let ua = generator();
    let attempts = 1;
    while (!this.isAllowedBrowser(ua.data.userAgent) && attempts < maxAttempts) {
      ua = generator();
      attempts++;
    }

    // Per workflow-12102025.md: If we can't get allowed browser, this is a failure
    if (!this.isAllowedBrowser(ua.data.userAgent)) {
      const errorMsg = `[UserAgentRotator] CRITICAL: Failed to generate allowed browser after ${maxAttempts} attempts. Device: ${deviceCategory}. Last UA: ${ua.data.userAgent}`;
      console.error(errorMsg);
      // Per workflow-12102025.md: Alert admin + stop crawl
      // TODO: Post alert to admin dashboard
      throw new Error(errorMsg);
    }

    const data = ua.data;
    const browserName = this.extractBrowserName(data.userAgent);

    // Build sec-ch-ua headers from user agent string
    const secChUa = this.buildSecChUa(data.userAgent, deviceCategory);

    // Per workflow-12102025.md: HTTP Fingerprinting - generate full HTTP fingerprint
    const httpFingerprint = generateHTTPFingerprint(browserName as BrowserType);

    this.currentFingerprint = {
      userAgent: data.userAgent,
      platform: data.platform,
      screenWidth: data.screenWidth,
      screenHeight: data.screenHeight,
      viewportWidth: data.viewportWidth,
      viewportHeight: data.viewportHeight,
      deviceCategory: data.deviceCategory,
      browserName, // Per workflow-12102025.md: for session logging
      // Per workflow-12102025.md: always English
      acceptLanguage: 'en-US,en;q=0.9',
      ...secChUa,
      // Per workflow-12102025.md: HTTP Fingerprinting section
      httpFingerprint,
    };

    // Per workflow-12102025.md: Log session data
    this.sessionLog = {
      deviceCategory,
      browserName,
      userAgent: data.userAgent,
      proxyIp: proxyIp || null,
      sessionStartedAt: new Date(),
    };

    console.log(`[UserAgentRotator] New fingerprint: device=${deviceCategory}, browser=${browserName}, UA=${data.userAgent.slice(0, 50)}...`);

    return this.currentFingerprint;
  }

  /**
   * Get current fingerprint without rotating (generates one if none exists yet)
   */
  getCurrent(): BrowserFingerprint {
    if (!this.currentFingerprint) {
      return this.rotate();
    }
    return this.currentFingerprint;
  }

  /**
   * Get a random fingerprint (rotates and returns)
   */
  getRandom(proxyIp?: string): BrowserFingerprint {
    return this.rotate(proxyIp);
  }

  /**
   * Per workflow-12102025.md: Get session log for debugging
   */
  getSessionLog(): UASessionLog | null {
    return this.sessionLog;
  }

  /**
   * Map a UA string to a quoted Sec-CH-UA-Platform value.
   *
   * iPhone/iPad are tested before `Mac` because iOS UAs contain
   * "like Mac OS X"; Android is tested before the Linux fallback because
   * Android UAs contain "Linux".
   */
  private detectClientHintPlatform(userAgent: string): string {
    if (userAgent.includes('Windows')) return '"Windows"';
    if (userAgent.includes('Android')) return '"Android"';
    if (userAgent.includes('iPhone') || userAgent.includes('iPad')) return '"iOS"';
    if (userAgent.includes('CrOS')) return '"Chrome OS"';
    if (userAgent.includes('Mac')) return '"macOS"';
    return '"Linux"';
  }

  /**
   * Build sec-ch-ua headers from user agent string
   * Per workflow-12102025.md: Include mobile indicator based on device category
   *
   * @returns client-hint values for Edge / Chrome UAs; an empty object for
   *          UAs without an `Edg/` or `Chrome/` version token.
   */
  private buildSecChUa(userAgent: string, deviceCategory: string): { secChUa?: string; secChUaPlatform?: string; secChUaMobile?: string } {
    const isMobile = deviceCategory === 'mobile' || deviceCategory === 'tablet';
    const secChUaMobile = isMobile ? '?1' : '?0';

    // Edge is checked first: its UA also contains a Chrome/ token.
    const edgeMatch = userAgent.match(/Edg\/(\d+)/);
    if (edgeMatch) {
      const version = edgeMatch[1];
      return {
        secChUa: `"Microsoft Edge";v="${version}", "Chromium";v="${version}", "Not_A Brand";v="24"`,
        secChUaPlatform: this.detectClientHintPlatform(userAgent),
        secChUaMobile,
      };
    }

    // Extract Chrome version if present
    const chromeMatch = userAgent.match(/Chrome\/(\d+)/);
    if (chromeMatch) {
      const version = chromeMatch[1];
      return {
        secChUa: `"Google Chrome";v="${version}", "Chromium";v="${version}", "Not_A Brand";v="24"`,
        secChUaPlatform: this.detectClientHintPlatform(userAgent),
        secChUaMobile,
      };
    }

    // Firefox/Safari don't send sec-ch-ua
    return {};
  }

  /** Always 1: fingerprints are generated on demand, there is no fixed list. */
  getCount(): number {
    return 1; // user-agents generates dynamically
  }
}

// ============================================================
// COMBINED ROTATOR
// Per workflow-12102025.md: Coordinates proxy + fingerprint rotation
// ============================================================

/** Facade that owns one {@link ProxyRotator} and one {@link UserAgentRotator}. */
export class CrawlRotator {
  public proxy: ProxyRotator;
  public userAgent: UserAgentRotator;

  constructor(pool?: Pool) {
    this.proxy = new ProxyRotator(pool);
    this.userAgent = new UserAgentRotator();
  }

  /** Load the proxy list from the database. */
  async initialize(): Promise<void> {
    await this.proxy.loadProxies();
  }

  /**
   * Rotate proxy only (get new IP)
   */
  rotateProxy(): Proxy | null {
    return this.proxy.getNext();
  }

  /**
   * Rotate fingerprint only (new UA, screen size, etc.)
   * The current proxy host (if any) is recorded in the session log, matching
   * what rotateBoth() does.
   */
  rotateFingerprint(): BrowserFingerprint {
    return this.userAgent.rotate(this.proxy.getCurrent()?.host);
  }

  /**
   * Rotate both proxy and fingerprint
   * Per workflow-12102025.md: called on 403 for fresh identity
   * Passes proxy IP to UA rotation for session logging
   */
  rotateBoth(): { proxy: Proxy | null; fingerprint: BrowserFingerprint } {
    const proxy = this.proxy.getNext();
    const proxyIp = proxy ? proxy.host : undefined;
    return {
      proxy,
      fingerprint: this.userAgent.rotate(proxyIp),
    };
  }

  /**
   * Get current proxy and fingerprint without rotating
   */
  getCurrent(): { proxy: Proxy | null; fingerprint: BrowserFingerprint } {
    return {
      proxy: this.proxy.getCurrent(),
      fingerprint: this.userAgent.getCurrent(),
    };
  }

  /**
   * Record success for current proxy
   * Per workflow-12102025.md: resets consecutive 403 count
   */
  async recordSuccess(responseTimeMs?: number): Promise<void> {
    const current = this.proxy.getCurrent();
    if (current) {
      await this.proxy.markSuccess(current.id, responseTimeMs);
    }
  }

  /**
   * Record 403 block for current proxy
   * Per workflow-12102025.md: increments consecutive_403_count, disables after 3
   * Returns true if proxy was disabled
   */
  async recordBlock(): Promise<boolean> {
    const current = this.proxy.getCurrent();
    if (current) {
      return await this.proxy.markBlocked(current.id);
    }
    return false;
  }

  /**
   * Record general failure (not 403)
   */
  async recordFailure(error?: string): Promise<void> {
    const current = this.proxy.getCurrent();
    if (current) {
      await this.proxy.markFailed(current.id, error);
    }
  }

  /**
   * Get current proxy location info
   * Per workflow-12102025.md: proxy location determines session headers
   *
   * `isRotating` is derived from `maxConnections > 1`.
   *
   * @returns `null` when there is no current proxy.
   */
  getProxyLocation(): { city?: string; state?: string; country?: string; timezone?: string; isRotating: boolean } | null {
    const current = this.proxy.getCurrent();
    if (!current) return null;

    const isRotating = current.maxConnections > 1;

    return {
      city: current.city,
      state: current.state,
      country: current.country,
      timezone: current.timezone,
      isRotating,
    };
  }

  /**
   * Get timezone from current proxy
   * Per workflow-12102025.md: used for Accept-Language header
   */
  getProxyTimezone(): string | undefined {
    const current = this.proxy.getCurrent();
    return current?.timezone;
  }
}

// ============================================================
// SINGLETON INSTANCES
// ============================================================

export const proxyRotator = new ProxyRotator();
export const userAgentRotator = new UserAgentRotator();
export const crawlRotator = new CrawlRotator();