"use strict";
/**
 * Scraper middleware pipeline.
 *
 * Each middleware exposes a `name`, a numeric `priority`, and any of the
 * optional hooks `processRequest`, `processResponse`, `processError`.
 * `MiddlewareEngine` runs the hooks in descending priority order.
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.MiddlewareEngine = exports.StealthMiddleware = exports.BotDetectionMiddleware = exports.RetryMiddleware = exports.RateLimitMiddleware = exports.ProxyMiddleware = exports.UserAgentMiddleware = void 0;
const types_1 = require("./types");
const logger_1 = require("../services/logger");
const proxy_1 = require("../services/proxy");
// Diverse, realistic user agents - updated for 2024/2025
const USER_AGENTS = [
    // Chrome on Windows (most common)
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
    // Chrome on Mac
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    // Chrome on Linux
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    // Firefox
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14.0; rv:121.0) Gecko/20100101 Firefox/121.0',
    // Safari
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
    // Edge
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
];
/**
 * Pick one entry of USER_AGENTS uniformly at random.
 * @returns {string} A User-Agent header value.
 */
function getRandomUserAgent() {
    return USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)];
}
/**
 * Resolve after roughly `ms` milliseconds.
 * @param {number} ms - Delay passed to setTimeout.
 * @returns {Promise<void>}
 */
function sleep(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
}
/**
 * User Agent Rotation Middleware - rotates UA on each request for better evasion
 */
class UserAgentMiddleware {
    name = 'UserAgentMiddleware';
    priority = 100;
    // Last UA handed out by this instance; used to avoid picking the same one twice in a row.
    lastUserAgent = null;
    /**
     * Assign `request.metadata.userAgent` when it is missing, or replace it
     * when the request is a retry or was flagged as bot-detected.
     * @param request - Request object; reads `retryCount` and `metadata`.
     * @returns The same request object (mutated in place).
     */
    async processRequest(request) {
        // Always rotate UA on retries or bot detection
        const forceRotation = request.retryCount > 0 || request.metadata.botDetected;
        if (!request.metadata.userAgent || forceRotation) {
            // Get a different UA than the last one used; the attempt cap keeps
            // the loop bounded even if the random pick keeps repeating.
            let newUA = getRandomUserAgent();
            let attempts = 0;
            while (newUA === this.lastUserAgent && attempts < 5) {
                newUA = getRandomUserAgent();
                attempts++;
            }
            request.metadata.userAgent = newUA;
            this.lastUserAgent = newUA;
            if (forceRotation) {
                logger_1.logger.debug('scraper', `🔄 Rotated User-Agent: ${newUA.substring(0, 50)}...`);
            }
        }
        return request;
    }
}
exports.UserAgentMiddleware = UserAgentMiddleware;
// Domains that should skip proxy (datacenter IPs are blocked)
const PROXY_SKIP_DOMAINS = [
    'dutchie.com',
];
/**
 * Decide whether a URL targets one of PROXY_SKIP_DOMAINS.
 *
 * A host matches when it equals a listed domain or is a subdomain of it
 * (`dutchie.com`, `www.dutchie.com`). A plain substring test is deliberately
 * avoided: it would also match unrelated hosts such as `notdutchie.com` or
 * `dutchie.com.evil.example`.
 *
 * @param {string} url - Absolute URL of the request.
 * @returns {boolean} true when the proxy should be bypassed; false for
 *   non-matching hosts and for URLs that cannot be parsed.
 */
function shouldSkipProxy(url) {
    let hostname;
    try {
        // Drop a trailing root dot so "dutchie.com." is treated like "dutchie.com".
        hostname = new URL(url).hostname.replace(/\.$/, '');
    }
    catch {
        return false;
    }
    return PROXY_SKIP_DOMAINS.some(domain => hostname === domain || hostname.endsWith(`.${domain}`));
}
/**
 * Proxy Rotation Middleware - uses the central proxy service with timeout handling
 */
class ProxyMiddleware {
    name = 'ProxyMiddleware';
    priority = 90;
    // Id of the proxy most recently assigned by this instance.
    currentProxyId = null;
    /**
     * Attach a proxy from the central proxy service to `request.metadata`
     * unless the target domain is on the skip list.
     * @param request - Request object; reads `url`, `retryCount`, `metadata`.
     * @returns The same request object (mutated in place).
     */
    async processRequest(request) {
        // Skip proxy for domains that block datacenter IPs
        if (shouldSkipProxy(request.url)) {
            logger_1.logger.info('scraper', `⏭️ Skipping proxy for ${new URL(request.url).hostname} (datacenter IPs blocked)`);
            return request;
        }
        // Always try to use a proxy from the central proxy service
        // The service handles bot detection timeouts automatically
        const forceRotation = request.retryCount > 0 || request.metadata.botDetected;
        if (!request.metadata.proxy || forceRotation) {
            // Get proxy from central service - it handles timeouts automatically
            const proxy = await (0, proxy_1.getActiveProxy)();
            if (proxy) {
                request.metadata.proxy = {
                    host: proxy.host,
                    port: proxy.port,
                    protocol: proxy.protocol,
                    username: proxy.username,
                    password: proxy.password,
                };
                request.metadata.proxyId = proxy.id;
                this.currentProxyId = proxy.id;
                const reason = forceRotation ? 'rotation' : 'initial';
                logger_1.logger.info('scraper', `🔄 Using proxy (${reason}): ${proxy.protocol}://${proxy.host}:${proxy.port}`);
            }
            else {
                logger_1.logger.warn('scraper', '⚠️ No proxy available - running without proxy');
            }
        }
        return request;
    }
    /**
     * Put the request's proxy in timeout when the request is flagged as bot-detected.
     * @param response - Response object; reads `response.request.metadata`.
     * @returns The unchanged response.
     */
    async processResponse(response) {
        // NOTE(review): with the default priorities this hook runs before
        // BotDetectionMiddleware.processResponse (90 > 60, and the engine uses
        // the same descending order for responses), so `botDetected` here
        // reflects an earlier attempt. Nothing in this file clears the flag
        // afterwards - confirm the caller resets it, otherwise the proxy used
        // for a clean retry may be the one put in timeout.
        // If bot detection was triggered, put the proxy in timeout
        if (response.request.metadata.botDetected && response.request.metadata.proxyId) {
            (0, proxy_1.putProxyInTimeout)(response.request.metadata.proxyId, 'Bot detection triggered');
            logger_1.logger.info('scraper', `🚫 Proxy ${response.request.metadata.proxyId} put in timeout due to bot detection`);
        }
        return response;
    }
    /**
     * Put the request's proxy in timeout when the error message is classified
     * as bot detection by the proxy service.
     * @param error - Error raised for the request.
     * @param request - Request object; reads `metadata.proxyId`.
     * @returns The unchanged error.
     */
    async processError(error, request) {
        // If bot detection error, put proxy in timeout
        if ((0, proxy_1.isBotDetectionError)(error.message) && request.metadata.proxyId) {
            (0, proxy_1.putProxyInTimeout)(request.metadata.proxyId, error.message);
            logger_1.logger.info('scraper', `🚫 Proxy ${request.metadata.proxyId} put in timeout: ${error.message}`);
        }
        return error;
    }
}
exports.ProxyMiddleware = ProxyMiddleware;
/**
 * Rate Limiting Middleware with Adaptive Delays
 */
class RateLimitMiddleware {
    name = 'RateLimitMiddleware';
    priority = 80;
    // Timestamps (ms since epoch) of recent requests, pruned to the last minute.
    requestTimes = [];
    // Grows on errors, shrinks on successes; drives the adaptive delay.
    errorCount = 0;
    baseDelay = 2000; // 2 seconds base delay
    maxDelay = 30000; // 30 seconds max
    /**
     * Wait until the adaptive delay since the previous request has elapsed.
     * @param request - Passed through untouched.
     */
    async processRequest(request) {
        await this.waitForNextRequest();
        return request;
    }
    /**
     * Record a success.
     * @param response - Passed through untouched.
     */
    async processResponse(response) {
        // Record success - gradually reduce error count
        this.errorCount = Math.max(0, this.errorCount - 1);
        return response;
    }
    /**
     * Record a failure.
     * @param error - Passed through untouched.
     */
    async processError(error) {
        // Record error - increase delay
        this.errorCount++;
        return error;
    }
    /**
     * Sleep for whatever remains of the current delay window, then record the
     * request time. The delay is baseDelay * 1.5^min(errorCount, 5), capped at
     * maxDelay, with ±20% random jitter.
     */
    async waitForNextRequest() {
        // Calculate adaptive delay based on error count
        const errorMultiplier = Math.pow(1.5, Math.min(this.errorCount, 5));
        const adaptiveDelay = Math.min(this.baseDelay * errorMultiplier, this.maxDelay);
        // Add random jitter (±20%)
        const jitter = (Math.random() - 0.5) * 0.4 * adaptiveDelay;
        const delay = adaptiveDelay + jitter;
        const now = Date.now();
        // No recorded request yet -> treat the last one as the epoch, so no wait.
        const lastRequest = this.requestTimes[this.requestTimes.length - 1] || 0;
        const timeSinceLast = now - lastRequest;
        if (timeSinceLast < delay) {
            const waitTime = delay - timeSinceLast;
            logger_1.logger.debug('scraper', `Rate limiting: waiting ${Math.round(waitTime)}ms`);
            await sleep(waitTime);
        }
        this.requestTimes.push(Date.now());
        this.cleanup();
    }
    /** Drop recorded request times older than 60 seconds. */
    cleanup() {
        // Keep only last minute of requests
        const cutoff = Date.now() - 60000;
        this.requestTimes = this.requestTimes.filter(t => t > cutoff);
    }
    /**
     * Override the base delay.
     * @param {number} ms - New base delay in milliseconds.
     */
    setBaseDelay(ms) {
        this.baseDelay = ms;
    }
}
exports.RateLimitMiddleware = RateLimitMiddleware;
// Lower-case fragments of error messages that indicate a transient failure.
const RETRYABLE_MESSAGE_FRAGMENTS = ['timeout', 'network', 'econnreset', 'econnrefused'];
// 500 / 502 / 503 as a standalone number: the digit guards stop it from
// matching inside longer numbers such as "1500ms" or "offset 5030".
const RETRYABLE_STATUS_PATTERN = /(?<!\d)50[023](?!\d)/;
/**
 * Retry Middleware with Exponential Backoff
 */
class RetryMiddleware {
    name = 'RetryMiddleware';
    priority = 70;
    /**
     * Classify an error as transient.
     *
     * Errors carrying a `type` property are judged by that type alone
     * (NETWORK_ERROR, TIMEOUT, SERVER_ERROR are retryable). Anything else is
     * judged by its message text. Non-object errors and errors without a
     * message are treated as non-retryable instead of throwing.
     *
     * @param error - The error (or thrown value) to classify.
     * @returns {boolean} true when a retry is worthwhile.
     */
    isRetryable(error) {
        const retryableErrors = [
            types_1.ErrorType.NETWORK_ERROR,
            types_1.ErrorType.TIMEOUT,
            types_1.ErrorType.SERVER_ERROR
        ];
        // `in` throws on primitives, so check for an object first.
        if (error !== null && typeof error === 'object' && 'type' in error) {
            return retryableErrors.includes(error.type);
        }
        // Check error message for common retryable patterns
        const message = String(error?.message ?? '').toLowerCase();
        return (RETRYABLE_MESSAGE_FRAGMENTS.some(fragment => message.includes(fragment)) ||
            RETRYABLE_STATUS_PATTERN.test(message));
    }
    /**
     * Decide whether the failed request should be retried.
     * @param error - Error raised for the request.
     * @param request - Request object; reads `url`, `retryCount`, `maxRetries`.
     * @returns `null` after the backoff sleep when a retry should happen,
     *   otherwise the original error.
     */
    async processError(error, request) {
        if (!this.isRetryable(error)) {
            logger_1.logger.warn('scraper', `Non-retryable error for ${request.url}: ${error.message}`);
            return error;
        }
        if (request.retryCount < request.maxRetries) {
            // Calculate backoff delay: 1s, 2s, 4s, ... capped at 30s
            const backoffDelay = Math.min(1000 * Math.pow(2, request.retryCount), 30000);
            logger_1.logger.info('scraper', `Retry ${request.retryCount + 1}/${request.maxRetries} for ${request.url} after ${backoffDelay}ms`);
            await sleep(backoffDelay);
            // Return null to indicate retry should happen
            return null;
        }
        logger_1.logger.error('scraper', `Max retries exceeded for ${request.url}`);
        return error;
    }
}
exports.RetryMiddleware = RetryMiddleware;
// Patterns whose presence in a response body suggests a bot-blocking page.
// None of them use the g/y flags, so sharing the instances is stateless.
const BOT_INDICATORS = [
    /captcha/i,
    /cloudflare/i,
    /access denied/i,
    /you have been blocked/i,
    /unusual traffic/i,
    /robot/i,
    /verify.*human/i,
    /security check/i,
    /please wait/i,
    /checking your browser/i,
    /ray id/i
];
/**
 * Bot Detection Middleware - detects bot blocking and triggers fingerprint rotation
 */
class BotDetectionMiddleware {
    name = 'BotDetectionMiddleware';
    priority = 60;
    // Rises by 1 per suspicious response, decays by 0.5 per clean one.
    detectedCount = 0;
    DETECTION_THRESHOLD = 3;
    // Export for use by other middlewares
    static shouldRotateFingerprint = false;
    /**
     * Scan the response body for bot-blocking indicators and flag the request
     * for rotation when one is found.
     * @param response - Response object; reads `content`, `url`, `request`.
     * @returns The response (its `request.metadata` may have been flagged).
     * @throws {Error} With `type` BOT_DETECTION, `retryable` true and `request`
     *   set, once detectedCount reaches DETECTION_THRESHOLD.
     */
    async processResponse(response) {
        // JSON.stringify yields undefined for undefined/function content;
        // scan an empty string rather than the coerced text "undefined".
        const content = typeof response.content === 'string'
            ? response.content
            : (JSON.stringify(response.content) ?? '');
        // Check for bot detection indicators
        const detected = BOT_INDICATORS.some(pattern => pattern.test(content));
        if (detected) {
            this.detectedCount++;
            BotDetectionMiddleware.shouldRotateFingerprint = true;
            logger_1.logger.warn('scraper', `Bot detection suspected (${this.detectedCount}/${this.DETECTION_THRESHOLD}): ${response.url}`);
            logger_1.logger.info('scraper', '🔄 Flagging for proxy/UA rotation on next request');
            // Mark the request for rotation on retry
            response.request.metadata.botDetected = true;
            response.request.metadata.needsNewBrowser = true;
            if (this.detectedCount >= this.DETECTION_THRESHOLD) {
                const error = new Error('Bot detection threshold reached - rotating fingerprint');
                error.type = types_1.ErrorType.BOT_DETECTION;
                error.retryable = true;
                error.request = response.request;
                throw error;
            }
        }
        else {
            // Gradually decrease detection count on successful requests
            this.detectedCount = Math.max(0, this.detectedCount - 0.5);
            BotDetectionMiddleware.shouldRotateFingerprint = false;
        }
        return response;
    }
    /**
     * Turn a BOT_DETECTION error into a delayed retry.
     * @param error - Error raised for the request.
     * @param request - Request object; `metadata` is flagged for rotation.
     * @returns `null` (retry) for BOT_DETECTION errors, otherwise the error.
     */
    async processError(error, request) {
        // If bot detection error, flag for rotation and allow retry
        if ('type' in error && error.type === types_1.ErrorType.BOT_DETECTION) {
            request.metadata.botDetected = true;
            request.metadata.needsNewBrowser = true;
            logger_1.logger.info('scraper', '🔄 Bot detection error - will rotate proxy/UA on retry');
            // Add delay before retry to avoid rate limiting (5-10 seconds)
            await sleep(5000 + Math.random() * 5000);
            return null; // Return null to trigger retry
        }
        return error;
    }
}
exports.BotDetectionMiddleware = BotDetectionMiddleware;
/**
 * Stealth Mode Middleware
 */
class StealthMiddleware {
    name = 'StealthMiddleware';
    priority = 95;
    /**
     * Set `request.metadata.requiresStealth` to true.
     * @param request - Request object (mutated in place).
     * @returns The same request object.
     */
    async processRequest(request) {
        // Flag that this request needs stealth mode
        request.metadata.requiresStealth = true;
        return request;
    }
}
exports.StealthMiddleware = StealthMiddleware;
/**
 * Middleware Engine to orchestrate all middlewares
 */
class MiddlewareEngine {
    // Registered middlewares, kept sorted by descending priority.
    middlewares = [];
    /**
     * Register a middleware.
     * @param middleware - Object with a numeric `priority` and optional hooks.
     */
    use(middleware) {
        this.middlewares.push(middleware);
        // Sort by priority (higher first)
        this.middlewares.sort((a, b) => b.priority - a.priority);
    }
    /**
     * Run every `processRequest` hook in priority order, feeding each hook the
     * previous hook's result.
     * @returns The request returned by the last hook.
     */
    async processRequest(request) {
        let current = request;
        for (const middleware of this.middlewares) {
            if (middleware.processRequest) {
                current = await middleware.processRequest(current);
            }
        }
        return current;
    }
    /**
     * Run every `processResponse` hook in priority order, feeding each hook the
     * previous hook's result. A hook that throws aborts the chain.
     * @returns The response returned by the last hook.
     */
    async processResponse(response) {
        let current = response;
        for (const middleware of this.middlewares) {
            if (middleware.processResponse) {
                current = await middleware.processResponse(current);
            }
        }
        return current;
    }
    /**
     * Run `processError` hooks in priority order until one of them handles the
     * error by returning null.
     * @param error - Error raised for the request.
     * @param request - The request that failed.
     * @returns The error left after the chain, or a falsy value when a hook
     *   handled it (null means "retry").
     */
    async processError(error, request) {
        let currentError = error;
        for (const middleware of this.middlewares) {
            if (middleware.processError && currentError) {
                currentError = await middleware.processError(currentError, request);
                if (currentError === null) {
                    // Middleware handled the error (e.g., retry)
                    break;
                }
            }
        }
        return currentError;
    }
}
exports.MiddlewareEngine = MiddlewareEngine;