"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.MiddlewareEngine = exports.StealthMiddleware = exports.BotDetectionMiddleware = exports.RetryMiddleware = exports.RateLimitMiddleware = exports.ProxyMiddleware = exports.UserAgentMiddleware = void 0;
const types_1 = require("./types");
const logger_1 = require("../services/logger");
const migrate_1 = require("../db/migrate");

/** Pool of browser user-agent strings that UserAgentMiddleware picks from. */
const USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15'
];

/**
 * Lower-case substrings that mark an untyped error as retryable.
 * Matched against the lower-cased error message.
 */
const RETRYABLE_MESSAGE_PATTERNS = Object.freeze([
    'timeout',
    'network',
    'econnreset',
    'econnrefused',
    '500',
    '502',
    '503'
]);

/**
 * Patterns whose presence in a response body suggests the site served a
 * bot-detection page. None of them carry the `g`/`y` flag, so sharing the
 * instances across calls is safe (`test` keeps no `lastIndex` state).
 */
const BOT_INDICATORS = Object.freeze([
    /captcha/i,
    /cloudflare/i,
    /access denied/i,
    /you have been blocked/i,
    /unusual traffic/i,
    /robot/i
]);

/**
 * Pick one entry of USER_AGENTS uniformly at random.
 * @returns {string} A user-agent string.
 */
function getRandomUserAgent() {
    return USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)];
}

/**
 * Resolve after the given number of milliseconds.
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
}

/**
 * Extract a message string from any thrown value without throwing.
 * @param {*} error - Thrown value (Error, plain object, primitive, or nullish).
 * @returns {string} The string itself if a string was thrown, the `message`
 *   property if it is a string, otherwise an empty string.
 */
function getErrorMessage(error) {
    if (typeof error === 'string') {
        return error;
    }
    const message = error?.message;
    return typeof message === 'string' ? message : '';
}

/**
 * Turn response content into text for pattern matching without throwing.
 * @param {*} content - Response content; strings are returned unchanged.
 * @returns {string} The content as text, or an empty string when the content
 *   is nullish or cannot be serialized.
 */
function contentToText(content) {
    if (typeof content === 'string') {
        return content;
    }
    if (content == null) {
        return '';
    }
    try {
        // JSON.stringify returns undefined for functions/symbols.
        return JSON.stringify(content) ?? '';
    }
    catch (error) {
        // Circular structures and BigInt values make JSON.stringify throw;
        // an unreadable body must not abort response processing.
        logger_1.logger.warn('scraper', `Could not serialize response content for bot detection: ${error}`);
        return '';
    }
}

/**
 * User Agent Rotation Middleware
 *
 * Fills in `request.metadata.userAgent` with a random entry from USER_AGENTS
 * when the request does not already carry one.
 */
class UserAgentMiddleware {
    name = 'UserAgentMiddleware';
    priority = 100;
    /**
     * @param {object} request - Request whose `metadata.userAgent` may be set.
     * @returns {Promise<object>} The same request object.
     */
    async processRequest(request) {
        if (!request.metadata.userAgent) {
            request.metadata.userAgent = getRandomUserAgent();
        }
        return request;
    }
}
exports.UserAgentMiddleware = UserAgentMiddleware;

/**
 * Proxy Rotation Middleware
 *
 * Attaches a randomly chosen proxy row to requests that are being retried.
 */
class ProxyMiddleware {
    name = 'ProxyMiddleware';
    priority = 90;
    /**
     * Fetch one random row from the `proxies` table where `active` and
     * `is_anonymous` are both true.
     * @returns {Promise<object|null>} The row (host, port, protocol, username,
     *   password), or null when no row matches or the query fails. Query
     *   failures are logged, not thrown.
     */
    async getActiveProxy() {
        try {
            const result = await migrate_1.pool.query(`
        SELECT host, port, protocol, username, password
        FROM proxies
        WHERE active = true AND is_anonymous = true
        ORDER BY RANDOM()
        LIMIT 1
      `);
            if (result.rows.length === 0) {
                return null;
            }
            return result.rows[0];
        }
        catch (error) {
            logger_1.logger.error('scraper', `Failed to get proxy: ${error}`);
            return null;
        }
    }
    /**
     * @param {object} request - Request with `metadata` and `retryCount`.
     * @returns {Promise<object>} The same request object.
     */
    async processRequest(request) {
        // Only add a proxy if none is set, and only on retries.
        if (!request.metadata.proxy && request.retryCount > 0) {
            request.metadata.proxy = await this.getActiveProxy();
            if (request.metadata.proxy) {
                logger_1.logger.debug('scraper', `Using proxy for retry: ${request.metadata.proxy.host}:${request.metadata.proxy.port}`);
            }
        }
        return request;
    }
}
exports.ProxyMiddleware = ProxyMiddleware;

/**
 * Rate Limiting Middleware with Adaptive Delays
 *
 * Spaces requests at least `baseDelay` apart (with jitter); the spacing grows
 * with the running error count and shrinks again as responses succeed.
 */
class RateLimitMiddleware {
    name = 'RateLimitMiddleware';
    priority = 80;
    requestTimes = [];
    errorCount = 0;
    baseDelay = 2000; // 2 seconds base delay
    maxDelay = 30000; // 30 seconds max
    /**
     * Wait until the next request is allowed, then pass the request through.
     * @param {object} request
     * @returns {Promise<object>} The same request object.
     */
    async processRequest(request) {
        await this.waitForNextRequest();
        return request;
    }
    /**
     * Record a success: lower the error count by one, never below zero.
     * @param {object} response
     * @returns {Promise<object>} The same response object.
     */
    async processResponse(response) {
        this.errorCount = Math.max(0, this.errorCount - 1);
        return response;
    }
    /**
     * Record a failure: raise the error count, which lengthens later delays.
     * @param {*} error
     * @returns {Promise<*>} The same error, unchanged.
     */
    async processError(error) {
        this.errorCount++;
        return error;
    }
    /**
     * Sleep for whatever remains of the adaptive delay since the last recorded
     * request, then record the current time.
     * @returns {Promise<void>}
     */
    async waitForNextRequest() {
        // Delay grows by 1.5x per recorded error, capped at 5 errors and at maxDelay.
        const errorMultiplier = Math.pow(1.5, Math.min(this.errorCount, 5));
        const adaptiveDelay = Math.min(this.baseDelay * errorMultiplier, this.maxDelay);
        // Add random jitter (±20%)
        const jitter = (Math.random() - 0.5) * 0.4 * adaptiveDelay;
        const delay = adaptiveDelay + jitter;
        const now = Date.now();
        // No recorded request means "long ago", so no wait is needed.
        const lastRequest = this.requestTimes[this.requestTimes.length - 1] ?? 0;
        const timeSinceLast = now - lastRequest;
        if (timeSinceLast < delay) {
            const waitTime = delay - timeSinceLast;
            logger_1.logger.debug('scraper', `Rate limiting: waiting ${Math.round(waitTime)}ms`);
            await sleep(waitTime);
        }
        this.requestTimes.push(Date.now());
        this.cleanup();
    }
    /** Drop recorded request times older than one minute. */
    cleanup() {
        const cutoff = Date.now() - 60000;
        this.requestTimes = this.requestTimes.filter(t => t > cutoff);
    }
    /**
     * Override the base delay between requests.
     * @param {number} ms - New base delay in milliseconds.
     */
    setBaseDelay(ms) {
        this.baseDelay = ms;
    }
}
exports.RateLimitMiddleware = RateLimitMiddleware;

/**
 * Retry Middleware with Exponential Backoff
 *
 * Decides whether a failed request should be retried and, if so, sleeps for
 * the backoff period before signalling the retry.
 */
class RetryMiddleware {
    name = 'RetryMiddleware';
    priority = 70;
    /**
     * Decide whether an error is worth retrying.
     *
     * Errors carrying a `type` property are judged by that type alone; any
     * other thrown value is judged by its message text. Never throws, even for
     * primitive or nullish thrown values, so the original failure is not masked.
     *
     * NOTE(review): errors may also carry a `retryable` flag (the bot-detection
     * middleware in this file sets `retryable = true`), but that flag is not
     * consulted here and BOT_DETECTION is not in the list below — confirm
     * whether that is intended.
     *
     * @param {*} error - The thrown value.
     * @returns {boolean} True when the error should be retried.
     */
    isRetryable(error) {
        // Built per call so `types_1.ErrorType` is read lazily, as in the
        // compiled original, rather than at module load.
        const retryableErrors = [
            types_1.ErrorType.NETWORK_ERROR,
            types_1.ErrorType.TIMEOUT,
            types_1.ErrorType.SERVER_ERROR
        ];
        // The `in` operator throws a TypeError on primitives and null.
        const isObject = typeof error === 'object' && error !== null;
        if (isObject && 'type' in error) {
            return retryableErrors.includes(error.type);
        }
        // Check error message for common retryable patterns
        const message = getErrorMessage(error).toLowerCase();
        return RETRYABLE_MESSAGE_PATTERNS.some(pattern => message.includes(pattern));
    }
    /**
     * @param {*} error - The failure being handled.
     * @param {object} request - Request with `url`, `retryCount`, `maxRetries`.
     * @returns {Promise<*|null>} `null` when a retry should happen (after the
     *   backoff sleep); otherwise the original error.
     */
    async processError(error, request) {
        if (!this.isRetryable(error)) {
            logger_1.logger.warn('scraper', `Non-retryable error for ${request.url}: ${getErrorMessage(error)}`);
            return error;
        }
        if (request.retryCount < request.maxRetries) {
            // Exponential backoff: 1s, 2s, 4s, ... capped at 30s.
            const backoffDelay = Math.min(1000 * Math.pow(2, request.retryCount), 30000);
            logger_1.logger.info('scraper', `Retry ${request.retryCount + 1}/${request.maxRetries} for ${request.url} after ${backoffDelay}ms`);
            await sleep(backoffDelay);
            // Return null to indicate retry should happen
            return null;
        }
        logger_1.logger.error('scraper', `Max retries exceeded for ${request.url}`);
        return error;
    }
}
exports.RetryMiddleware = RetryMiddleware;

/**
 * Bot Detection Middleware
 *
 * Scans response content for bot-detection indicators and throws once the
 * running suspicion count reaches DETECTION_THRESHOLD.
 */
class BotDetectionMiddleware {
    name = 'BotDetectionMiddleware';
    priority = 60;
    detectedCount = 0;
    DETECTION_THRESHOLD = 3;
    /**
     * @param {object} response - Response with `content`, `url`, `request`.
     * @returns {Promise<object>} The same response object.
     * @throws {Error} With `type = ErrorType.BOT_DETECTION`, `retryable = true`
     *   and `request` set, when the detection count reaches the threshold.
     */
    async processResponse(response) {
        const content = contentToText(response.content);
        const detected = BOT_INDICATORS.some(pattern => pattern.test(content));
        if (detected) {
            this.detectedCount++;
            logger_1.logger.warn('scraper', `Bot detection suspected (${this.detectedCount}/${this.DETECTION_THRESHOLD}): ${response.url}`);
            if (this.detectedCount >= this.DETECTION_THRESHOLD) {
                const error = new Error('Bot detection threshold reached');
                error.type = types_1.ErrorType.BOT_DETECTION;
                error.retryable = true;
                error.request = response.request;
                throw error;
            }
        }
        else {
            // Clean responses lower the count by half a step, so it takes two
            // of them to cancel one detection.
            this.detectedCount = Math.max(0, this.detectedCount - 0.5);
        }
        return response;
    }
}
exports.BotDetectionMiddleware = BotDetectionMiddleware;

/**
 * Stealth Mode Middleware
 *
 * Marks every request with `metadata.requiresStealth = true`.
 */
class StealthMiddleware {
    name = 'StealthMiddleware';
    priority = 95;
    /**
     * @param {object} request
     * @returns {Promise<object>} The same request object.
     */
    async processRequest(request) {
        // Flag that this request needs stealth mode
        request.metadata.requiresStealth = true;
        return request;
    }
}
exports.StealthMiddleware = StealthMiddleware;

/**
 * Middleware Engine to orchestrate all middlewares
 *
 * Holds the registered middlewares ordered by descending `priority` and runs
 * each hook through them in that order.
 */
class MiddlewareEngine {
    middlewares = [];
    /**
     * Register a middleware and re-sort the chain.
     * @param {object} middleware - Object with a numeric `priority` and any of
     *   `processRequest`, `processResponse`, `processError`.
     */
    use(middleware) {
        this.middlewares.push(middleware);
        // Sort by priority (higher first)
        this.middlewares.sort((a, b) => b.priority - a.priority);
    }
    /**
     * Run the request through every middleware that defines `processRequest`.
     * @param {object} request
     * @returns {Promise<object>} The request returned by the last middleware.
     */
    async processRequest(request) {
        let current = request;
        for (const middleware of this.middlewares) {
            if (middleware.processRequest) {
                current = await middleware.processRequest(current);
            }
        }
        return current;
    }
    /**
     * Run the response through every middleware that defines `processResponse`.
     * @param {object} response
     * @returns {Promise<object>} The response returned by the last middleware.
     */
    async processResponse(response) {
        let current = response;
        for (const middleware of this.middlewares) {
            if (middleware.processResponse) {
                current = await middleware.processResponse(current);
            }
        }
        return current;
    }
    /**
     * Run the error through the middlewares that define `processError`,
     * stopping as soon as one returns `null`.
     * @param {*} error - The failure being handled.
     * @param {object} request - The request that failed.
     * @returns {Promise<*|null>} `null` when a middleware handled the error
     *   (e.g. scheduled a retry); otherwise the last error value returned.
     */
    async processError(error, request) {
        let currentError = error;
        for (const middleware of this.middlewares) {
            if (middleware.processError && currentError) {
                currentError = await middleware.processError(currentError, request);
                if (currentError === null) {
                    // Middleware handled the error (e.g., retry)
                    break;
                }
            }
        }
        return currentError;
    }
}
exports.MiddlewareEngine = MiddlewareEngine;