264 lines
9.3 KiB
JavaScript
264 lines
9.3 KiB
JavaScript
"use strict";
|
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
exports.MiddlewareEngine = exports.StealthMiddleware = exports.BotDetectionMiddleware = exports.RetryMiddleware = exports.RateLimitMiddleware = exports.ProxyMiddleware = exports.UserAgentMiddleware = void 0;
|
|
const types_1 = require("./types");
|
|
const logger_1 = require("../services/logger");
|
|
const migrate_1 = require("../db/migrate");
|
|
// Pool of desktop browser User-Agent strings; getRandomUserAgent() picks one
// entry from this list uniformly at random.
const USER_AGENTS = [
    // Chrome 120 on Windows 10 (x64)
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    // Chrome 120 on macOS 10.15.7
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    // Chrome 120 on Linux x86_64
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    // Firefox 121 on Windows 10 (x64)
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
    // Safari 17.1 on macOS 10.15.7
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
];
|
|
/**
 * Pick one entry of USER_AGENTS at random.
 *
 * @returns {string} A User-Agent string from the USER_AGENTS list.
 */
function getRandomUserAgent() {
    // Math.random() is in [0, 1), so the floored product is always a valid index.
    const pick = Math.floor(Math.random() * USER_AGENTS.length);
    return USER_AGENTS[pick];
}
|
|
/**
 * Pause the calling async function for a while.
 *
 * @param {number} ms - Delay handed to setTimeout, in milliseconds.
 * @returns {Promise<void>} Promise that resolves (with no value) once the timer fires.
 */
function sleep(ms) {
    return new Promise((wake) => {
        setTimeout(wake, ms);
    });
}
|
|
/**
 * User Agent Rotation Middleware
 *
 * Makes sure every outgoing request carries a User-Agent in its metadata.
 */
class UserAgentMiddleware {
    name = 'UserAgentMiddleware';
    priority = 100;

    /**
     * Fill in `request.metadata.userAgent` when it is missing or falsy.
     *
     * @param {object} request - Request whose `metadata` object is updated in place.
     * @returns {Promise<object>} The same request object.
     */
    async processRequest(request) {
        // A User-Agent already chosen for this request is left alone; a random
        // one is only drawn when the slot is empty.
        request.metadata.userAgent ||= getRandomUserAgent();
        return request;
    }
}
|
|
exports.UserAgentMiddleware = UserAgentMiddleware;
|
|
/**
 * Proxy Rotation Middleware
 *
 * Attaches a proxy from the `proxies` table to requests that are being retried.
 */
class ProxyMiddleware {
    name = 'ProxyMiddleware';
    priority = 90;

    /**
     * Fetch one random proxy row that is flagged both active and anonymous.
     *
     * @returns {Promise<object|null>} The row (host, port, protocol, username,
     *   password), or null when no row matches or the query fails.
     */
    async getActiveProxy() {
        try {
            const { rows } = await migrate_1.pool.query(`
                SELECT host, port, protocol, username, password
                FROM proxies
                WHERE active = true AND is_anonymous = true
                ORDER BY RANDOM()
                LIMIT 1
            `);
            return rows.length === 0 ? null : rows[0];
        }
        catch (lookupError) {
            // Best effort: a failed lookup is logged and reported as "no proxy".
            logger_1.logger.error('scraper', `Failed to get proxy: ${lookupError}`);
            return null;
        }
    }

    /**
     * Assign a proxy to a retried request that does not have one yet.
     *
     * @param {object} request - Request whose `metadata.proxy` may be set in place.
     * @returns {Promise<object>} The same request object.
     */
    async processRequest(request) {
        // First attempts (retryCount not above 0) and requests that already
        // carry a proxy pass through untouched.
        if (request.metadata.proxy || !(request.retryCount > 0)) {
            return request;
        }
        // Use proxy on retries; the slot becomes null when none is available.
        const proxy = await this.getActiveProxy();
        request.metadata.proxy = proxy;
        if (request.metadata.proxy) {
            logger_1.logger.debug('scraper', `Using proxy for retry: ${request.metadata.proxy.host}:${request.metadata.proxy.port}`);
        }
        return request;
    }
}
|
|
exports.ProxyMiddleware = ProxyMiddleware;
|
|
/**
 * Rate Limiting Middleware with Adaptive Delays
 *
 * Spaces requests out by a base delay that grows while errors accumulate and
 * shrinks again as responses succeed.
 */
class RateLimitMiddleware {
    name = 'RateLimitMiddleware';
    priority = 80;
    // Timestamps (ms since epoch) of requests released during the last minute.
    requestTimes = [];
    // Raised by processError, lowered (never below 0) by processResponse.
    errorCount = 0;
    baseDelay = 2000; // 2 seconds base delay
    maxDelay = 30000; // 30 seconds max

    /**
     * Hold the request until the current delay since the previous one has passed.
     *
     * @param {object} request - Passed through unchanged.
     * @returns {Promise<object>} The same request object.
     */
    async processRequest(request) {
        await this.waitForNextRequest();
        return request;
    }

    /**
     * Count a success: step the error counter down by one, floored at zero.
     *
     * @param {object} response - Passed through unchanged.
     * @returns {Promise<object>} The same response object.
     */
    async processResponse(response) {
        this.errorCount = Math.max(this.errorCount - 1, 0);
        return response;
    }

    /**
     * Count a failure so that later requests wait longer.
     *
     * @param {*} error - Passed through unchanged.
     * @returns {Promise<*>} The same error.
     */
    async processError(error) {
        this.errorCount++;
        return error;
    }

    /**
     * Sleep for whatever part of the adaptive delay has not yet elapsed since
     * the last recorded request, then record the current time.
     *
     * @returns {Promise<void>}
     */
    async waitForNextRequest() {
        // Delay grows by a factor of 1.5 per outstanding error; only the first
        // five errors count, and the result is capped at maxDelay.
        const countedErrors = Math.min(this.errorCount, 5);
        const targetDelay = Math.min(this.baseDelay * 1.5 ** countedErrors, this.maxDelay);
        // Random jitter of up to 20% in either direction.
        const jitteredDelay = targetDelay + (Math.random() - 0.5) * 0.4 * targetDelay;
        // With no request on record the previous time is 0, so nothing is waited.
        const previousRequest = this.requestTimes[this.requestTimes.length - 1] || 0;
        const elapsed = Date.now() - previousRequest;
        const remaining = jitteredDelay - elapsed;
        if (remaining > 0) {
            logger_1.logger.debug('scraper', `Rate limiting: waiting ${Math.round(remaining)}ms`);
            await sleep(remaining);
        }
        this.requestTimes.push(Date.now());
        this.cleanup();
    }

    /**
     * Drop recorded request times that are 60 seconds old or older.
     */
    cleanup() {
        const oldestKept = Date.now() - 60000;
        this.requestTimes = this.requestTimes.filter((stamp) => stamp > oldestKept);
    }

    /**
     * Replace the base delay.
     *
     * @param {number} ms - New base delay in milliseconds.
     */
    setBaseDelay(ms) {
        this.baseDelay = ms;
    }
}
|
|
exports.RateLimitMiddleware = RateLimitMiddleware;
|
|
/**
 * Retry Middleware with Exponential Backoff
 *
 * Classifies a failure as retryable or not and, when a retry is allowed,
 * waits out the backoff delay before signalling the caller to try again.
 */
class RetryMiddleware {
    name = 'RetryMiddleware';
    priority = 70;

    /**
     * Decide whether a failure is worth retrying.
     *
     * Checked in order:
     *  1. an explicit `retryable === true` flag on the error (set, for example,
     *     by BotDetectionMiddleware on the error it throws);
     *  2. the error's `type`, when it has one, against the retryable ErrorType list;
     *  3. otherwise, well-known transient patterns in the error text.
     *
     * `retryable === false` is deliberately not treated as a veto, so every
     * error that was retried on the basis of its type or message still is.
     *
     * @param {*} error - The thrown value; may be something other than an Error.
     * @returns {boolean} True when the failure looks transient.
     */
    isRetryable(error) {
        const retryableErrors = [
            types_1.ErrorType.NETWORK_ERROR,
            types_1.ErrorType.TIMEOUT,
            types_1.ErrorType.SERVER_ERROR
        ];
        // Anything can be thrown in JavaScript. The `in` operator raises a
        // TypeError on primitives and null, which would replace the original
        // failure with an unrelated one inside the error handler.
        const isObjectLike = error !== null && (typeof error === 'object' || typeof error === 'function');
        if (!isObjectLike) {
            return RetryMiddleware.#hasRetryablePattern(String(error));
        }
        if (error.retryable === true) {
            return true;
        }
        if ('type' in error) {
            return retryableErrors.includes(error.type);
        }
        // Check error message for common retryable patterns; a missing or
        // non-string message is treated as text rather than dereferenced blindly.
        return RetryMiddleware.#hasRetryablePattern(String(error.message ?? ''));
    }

    /**
     * Handle a failed request.
     *
     * This method does not modify `request.retryCount`.
     *
     * @param {*} error - The failure to classify.
     * @param {object} request - Reads `url`, `retryCount` and `maxRetries`.
     * @returns {Promise<*|null>} `null` after the backoff sleep when the caller
     *   should retry; otherwise the original error (non-retryable, or retries
     *   exhausted).
     */
    async processError(error, request) {
        if (!this.isRetryable(error)) {
            const detail = error?.message ?? String(error);
            logger_1.logger.warn('scraper', `Non-retryable error for ${request.url}: ${detail}`);
            return error;
        }
        if (request.retryCount < request.maxRetries) {
            // Backoff doubles per attempt starting at 1 second, capped at 30 seconds.
            const backoffDelay = Math.min(1000 * Math.pow(2, request.retryCount), 30000);
            logger_1.logger.info('scraper', `Retry ${request.retryCount + 1}/${request.maxRetries} for ${request.url} after ${backoffDelay}ms`);
            await sleep(backoffDelay);
            // Return null to indicate retry should happen
            return null;
        }
        logger_1.logger.error('scraper', `Max retries exceeded for ${request.url}`);
        return error;
    }

    /** True when the text mentions a transient network or server failure. */
    static #hasRetryablePattern(text) {
        const message = text.toLowerCase();
        return (message.includes('timeout') ||
            message.includes('network') ||
            message.includes('econnreset') ||
            message.includes('econnrefused') ||
            message.includes('500') ||
            message.includes('502') ||
            message.includes('503'));
    }
}
|
|
exports.RetryMiddleware = RetryMiddleware;
|
|
/**
 * Bot Detection Middleware
 *
 * Scans response bodies for block-page wording and raises an error once the
 * running suspicion counter reaches DETECTION_THRESHOLD.
 */
class BotDetectionMiddleware {
    name = 'BotDetectionMiddleware';
    priority = 60;
    // Running suspicion score: +1 per flagged response, -0.5 per clean one.
    detectedCount = 0;
    DETECTION_THRESHOLD = 3;

    /**
     * Inspect a response for bot-detection indicators.
     *
     * @param {object} response - Reads `content`, `url` and `request`.
     * @returns {Promise<object>} The same response when the threshold is not reached.
     * @throws {Error} With `type` = ErrorType.BOT_DETECTION, `retryable` = true
     *   and `request` = response.request once the counter reaches the threshold.
     */
    async processResponse(response) {
        // Non-string content is matched against its JSON serialisation.
        let body;
        if (typeof response.content === 'string') {
            body = response.content;
        }
        else {
            body = JSON.stringify(response.content);
        }
        // NOTE(review): /robot/i also matches ordinary text such as "robots"
        // (e.g. a robots meta tag) — confirm this breadth is intended.
        const signatures = [
            /captcha/i,
            /cloudflare/i,
            /access denied/i,
            /you have been blocked/i,
            /unusual traffic/i,
            /robot/i
        ];
        let flagged = false;
        for (const signature of signatures) {
            if (signature.test(body)) {
                flagged = true;
                break;
            }
        }
        if (!flagged) {
            // Gradually decrease detection count on successful requests
            this.detectedCount = Math.max(0, this.detectedCount - 0.5);
            return response;
        }
        this.detectedCount++;
        logger_1.logger.warn('scraper', `Bot detection suspected (${this.detectedCount}/${this.DETECTION_THRESHOLD}): ${response.url}`);
        if (this.detectedCount >= this.DETECTION_THRESHOLD) {
            // The counter is not reset here, so it stays at or above the
            // threshold until clean responses bring it back down.
            throw Object.assign(new Error('Bot detection threshold reached'), {
                type: types_1.ErrorType.BOT_DETECTION,
                retryable: true,
                request: response.request,
            });
        }
        return response;
    }
}
|
|
exports.BotDetectionMiddleware = BotDetectionMiddleware;
|
|
/**
 * Stealth Mode Middleware
 *
 * Marks every request that passes through it as needing stealth mode.
 */
class StealthMiddleware {
    name = 'StealthMiddleware';
    priority = 95;

    /**
     * Set `metadata.requiresStealth` on the request.
     *
     * @param {object} request - Request whose `metadata` object is updated in place.
     * @returns {Promise<object>} The same request object.
     */
    async processRequest(request) {
        const { metadata } = request;
        // Only the flag is written here; nothing in this class acts on it.
        metadata.requiresStealth = true;
        return request;
    }
}
|
|
exports.StealthMiddleware = StealthMiddleware;
|
|
/**
 * Middleware Engine to orchestrate all middlewares
 *
 * Keeps the registered middlewares ordered by descending priority and runs
 * their optional hooks in that order.
 */
class MiddlewareEngine {
    middlewares = [];

    /**
     * Register a middleware.
     *
     * @param {object} middleware - Object with a numeric `priority` and any of
     *   the optional hooks `processRequest`, `processResponse`, `processError`.
     */
    use(middleware) {
        this.middlewares.push(middleware);
        // Re-sort on every registration so the highest priority runs first.
        this.middlewares.sort((left, right) => right.priority - left.priority);
    }

    /**
     * Thread a request through each middleware's `processRequest` hook.
     *
     * @param {object} request - Initial request.
     * @returns {Promise<object>} Whatever the last hook returned.
     */
    async processRequest(request) {
        let outgoing = request;
        for (const stage of this.middlewares) {
            if (!stage.processRequest) {
                continue;
            }
            outgoing = await stage.processRequest(outgoing);
        }
        return outgoing;
    }

    /**
     * Thread a response through each middleware's `processResponse` hook,
     * in the same order as requests.
     *
     * @param {object} response - Initial response.
     * @returns {Promise<object>} Whatever the last hook returned.
     */
    async processResponse(response) {
        let incoming = response;
        for (const stage of this.middlewares) {
            if (!stage.processResponse) {
                continue;
            }
            incoming = await stage.processResponse(incoming);
        }
        return incoming;
    }

    /**
     * Offer an error to each middleware's `processError` hook.
     *
     * @param {*} error - The failure being handled.
     * @param {object} request - The request that failed; passed to every hook.
     * @returns {Promise<*>} The error as left by the last hook that ran; `null`
     *   means a middleware handled it (e.g., retry).
     */
    async processError(error, request) {
        let pending = error;
        for (const stage of this.middlewares) {
            // Hooks are skipped once the pending error has become falsy.
            if (!stage.processError || !pending) {
                continue;
            }
            pending = await stage.processError(pending, request);
            if (pending === null) {
                // Handled: later middlewares never see this error.
                break;
            }
        }
        return pending;
    }
}
|
|
exports.MiddlewareEngine = MiddlewareEngine;
|