Files
cannaiq/backend/dist/scraper-v2/middlewares.js
2025-11-28 19:45:44 -07:00

264 lines
9.3 KiB
JavaScript

"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.MiddlewareEngine = exports.StealthMiddleware = exports.BotDetectionMiddleware = exports.RetryMiddleware = exports.RateLimitMiddleware = exports.ProxyMiddleware = exports.UserAgentMiddleware = void 0;
const types_1 = require("./types");
const logger_1 = require("../services/logger");
const migrate_1 = require("../db/migrate");
// Fixed pool of browser User-Agent header values; getRandomUserAgent() draws from it.
const USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15'
];
/**
 * Pick one entry of USER_AGENTS using Math.random().
 *
 * @returns {string} One of the strings stored in USER_AGENTS.
 */
function getRandomUserAgent() {
    // Math.random() is in [0, 1), so the floored product is always a valid index.
    const index = Math.floor(Math.random() * USER_AGENTS.length);
    return USER_AGENTS[index];
}
/**
 * Promise-based delay built on setTimeout.
 *
 * @param {number} ms - Delay handed to setTimeout, in milliseconds.
 * @returns {Promise<void>} Resolves (with no value) once the timer fires.
 */
function sleep(ms) {
    return new Promise((wake) => {
        setTimeout(wake, ms);
    });
}
/**
 * User Agent Rotation Middleware
 *
 * Fills in `request.metadata.userAgent` with a value from getRandomUserAgent()
 * whenever the request does not already carry a truthy one.
 */
class UserAgentMiddleware {
    name = 'UserAgentMiddleware';
    priority = 100;
    /**
     * @param {object} request - Request whose `metadata` object is updated in place.
     * @returns {Promise<object>} The same request object that was passed in.
     */
    async processRequest(request) {
        const { metadata } = request;
        if (metadata.userAgent) {
            // A user agent is already set: leave it untouched.
            return request;
        }
        metadata.userAgent = getRandomUserAgent();
        return request;
    }
}
exports.UserAgentMiddleware = UserAgentMiddleware;
/**
 * Proxy Rotation Middleware
 *
 * Attaches a proxy taken from the `proxies` table to requests that are being
 * retried and do not have a proxy yet.
 */
class ProxyMiddleware {
    name = 'ProxyMiddleware';
    priority = 90;
    /**
     * Query one random proxy row flagged `active` and `is_anonymous`.
     *
     * @returns {Promise<object|null>} The first returned row, or null when the
     *   query yields no rows or fails (the failure is logged, not rethrown).
     */
    async getActiveProxy() {
        const sql = `
SELECT host, port, protocol, username, password
FROM proxies
WHERE active = true AND is_anonymous = true
ORDER BY RANDOM()
LIMIT 1
`;
        try {
            const { rows } = await migrate_1.pool.query(sql);
            return rows.length === 0 ? null : rows[0];
        }
        catch (error) {
            logger_1.logger.error('scraper', `Failed to get proxy: ${error}`);
            return null;
        }
    }
    /**
     * @param {object} request - Request with `metadata` and `retryCount`.
     * @returns {Promise<object>} The same request, with `metadata.proxy`
     *   possibly assigned (it may be assigned null when no proxy is found).
     */
    async processRequest(request) {
        const { metadata } = request;
        // First attempts and requests that already have a proxy pass through untouched.
        if (metadata.proxy || !(request.retryCount > 0)) {
            return request;
        }
        const proxy = await this.getActiveProxy();
        metadata.proxy = proxy;
        if (proxy) {
            logger_1.logger.debug('scraper', `Using proxy for retry: ${proxy.host}:${proxy.port}`);
        }
        return request;
    }
}
exports.ProxyMiddleware = ProxyMiddleware;
/**
 * Rate Limiting Middleware with Adaptive Delays
 *
 * Spaces requests out by a delay that grows with the number of recent errors
 * and shrinks again as responses come back.
 */
class RateLimitMiddleware {
    name = 'RateLimitMiddleware';
    priority = 80;
    requestTimes = [];
    errorCount = 0;
    baseDelay = 2000; // 2 seconds base delay
    maxDelay = 30000; // 30 seconds max
    /**
     * Wait until the next request slot, then pass the request through unchanged.
     */
    async processRequest(request) {
        await this.waitForNextRequest();
        return request;
    }
    /**
     * A response lowers the error count by one (never below zero).
     */
    async processResponse(response) {
        const lowered = this.errorCount - 1;
        this.errorCount = Math.max(0, lowered);
        return response;
    }
    /**
     * An error raises the error count by one; the error itself is returned as-is.
     */
    async processError(error) {
        this.errorCount += 1;
        return error;
    }
    /**
     * Sleep for whatever remains of the current delay window, then record the
     * time of this request.
     */
    async waitForNextRequest() {
        // Delay = baseDelay * 1.5^min(errorCount, 5), capped at maxDelay.
        const cappedErrors = Math.min(this.errorCount, 5);
        const scaledDelay = this.baseDelay * Math.pow(1.5, cappedErrors);
        const adaptiveDelay = Math.min(scaledDelay, this.maxDelay);
        // Random jitter of up to ±20% of the adaptive delay.
        const jitterFraction = (Math.random() - 0.5) * 0.4;
        const delay = adaptiveDelay + jitterFraction * adaptiveDelay;
        const history = this.requestTimes;
        const lastRequest = history[history.length - 1] || 0;
        const timeSinceLast = Date.now() - lastRequest;
        const waitTime = delay - timeSinceLast;
        if (waitTime > 0) {
            logger_1.logger.debug('scraper', `Rate limiting: waiting ${Math.round(waitTime)}ms`);
            await sleep(waitTime);
        }
        this.requestTimes.push(Date.now());
        this.cleanup();
    }
    /**
     * Drop recorded request times that are 60 seconds old or older.
     */
    cleanup() {
        const now = Date.now();
        this.requestTimes = this.requestTimes.filter((stamp) => now - stamp < 60000);
    }
    /**
     * @param {number} ms - New base delay in milliseconds.
     */
    setBaseDelay(ms) {
        this.baseDelay = ms;
    }
}
exports.RateLimitMiddleware = RateLimitMiddleware;
/**
 * Retry Middleware with Exponential Backoff
 *
 * Decides whether a failed request should be attempted again and, when it
 * should, waits for a backoff period before signalling the retry.
 */
class RetryMiddleware {
    name = 'RetryMiddleware';
    priority = 70;
    /**
     * Classify a failure as retryable or not.
     *
     * Failures that carry a `type` property are judged by that type alone
     * (network, timeout and server errors are retryable). Anything else is
     * judged by substring-matching its message text. Never throws: a failure
     * without a usable message (or a thrown primitive) is classified instead
     * of raising a secondary TypeError that would mask the original failure.
     *
     * @param {*} error - The failure to classify; usually an Error, but any
     *   thrown value is accepted.
     * @returns {boolean} true when the failure is worth retrying.
     */
    isRetryable(error) {
        const retryableErrors = [
            types_1.ErrorType.NETWORK_ERROR,
            types_1.ErrorType.TIMEOUT,
            types_1.ErrorType.SERVER_ERROR
        ];
        // The `in` operator throws a TypeError on primitives, so only probe object-like values.
        const isObjectLike = error !== null &&
            (typeof error === 'object' || typeof error === 'function');
        if (isObjectLike && 'type' in error) {
            return retryableErrors.includes(error.type);
        }
        // A thrown string is its own message; a missing message becomes ''.
        const rawMessage = typeof error === 'string' ? error : error?.message;
        const message = rawMessage == null ? '' : String(rawMessage).toLowerCase();
        // Check error message for common retryable patterns
        const retryablePatterns = [
            'timeout',
            'network',
            'econnreset',
            'econnrefused',
            '500',
            '502',
            '503'
        ];
        return retryablePatterns.some((pattern) => message.includes(pattern));
    }
    /**
     * Handle a failed request.
     *
     * @param {*} error - The failure that occurred.
     * @param {object} request - Request with `url`, `retryCount` and `maxRetries`.
     * @returns {Promise<*>} null when the request should be retried (after the
     *   backoff sleep has elapsed); otherwise the original error, unchanged.
     */
    async processError(error, request) {
        if (!this.isRetryable(error)) {
            const detail = typeof error === 'string' ? error : error?.message;
            logger_1.logger.warn('scraper', `Non-retryable error for ${request.url}: ${detail}`);
            return error;
        }
        if (request.retryCount < request.maxRetries) {
            // Backoff doubles per attempt (1s, 2s, 4s, ...) and is capped at 30s.
            const backoffDelay = Math.min(1000 * Math.pow(2, request.retryCount), 30000);
            logger_1.logger.info('scraper', `Retry ${request.retryCount + 1}/${request.maxRetries} for ${request.url} after ${backoffDelay}ms`);
            await sleep(backoffDelay);
            // Return null to indicate retry should happen
            return null;
        }
        logger_1.logger.error('scraper', `Max retries exceeded for ${request.url}`);
        return error;
    }
}
exports.RetryMiddleware = RetryMiddleware;
/**
 * Bot Detection Middleware
 *
 * Scans response content for phrases that suggest the scraper has been
 * flagged, keeps a running score, and throws once the score reaches
 * DETECTION_THRESHOLD.
 */
class BotDetectionMiddleware {
    name = 'BotDetectionMiddleware';
    priority = 60;
    detectedCount = 0;
    DETECTION_THRESHOLD = 3;
    /**
     * @param {object} response - Response with `content`, `url` and `request`.
     * @returns {Promise<object>} The same response when the threshold is not reached.
     * @throws {Error} 'Bot detection threshold reached', carrying `type`,
     *   `retryable` and `request` properties, once the score hits the threshold.
     */
    async processResponse(response) {
        // Check for bot detection indicators
        const botIndicators = [
            /captcha/i,
            /cloudflare/i,
            /access denied/i,
            /you have been blocked/i,
            /unusual traffic/i,
            /robot/i
        ];
        // Non-string content is serialised so the patterns can be run against it.
        const { content } = response;
        const text = typeof content === 'string' ? content : JSON.stringify(content);
        const detected = botIndicators.some((indicator) => indicator.test(text));
        if (!detected) {
            // A clean response lowers the score by half a point (never below zero).
            this.detectedCount = Math.max(0, this.detectedCount - 0.5);
            return response;
        }
        this.detectedCount += 1;
        logger_1.logger.warn('scraper', `Bot detection suspected (${this.detectedCount}/${this.DETECTION_THRESHOLD}): ${response.url}`);
        if (this.detectedCount >= this.DETECTION_THRESHOLD) {
            throw Object.assign(new Error('Bot detection threshold reached'), {
                type: types_1.ErrorType.BOT_DETECTION,
                retryable: true,
                request: response.request
            });
        }
        return response;
    }
}
exports.BotDetectionMiddleware = BotDetectionMiddleware;
/**
 * Stealth Mode Middleware
 *
 * Marks every request it sees by setting `metadata.requiresStealth` to true.
 */
class StealthMiddleware {
    name = 'StealthMiddleware';
    priority = 95;
    /**
     * @param {object} request - Request whose `metadata` object is updated in place.
     * @returns {Promise<object>} The same request object that was passed in.
     */
    async processRequest(request) {
        const { metadata } = request;
        // Flag only; nothing else about the request is changed here.
        metadata.requiresStealth = true;
        return request;
    }
}
exports.StealthMiddleware = StealthMiddleware;
/**
 * Middleware Engine to orchestrate all middlewares
 *
 * Holds the registered middlewares ordered by descending `priority` and runs
 * their optional `processRequest` / `processResponse` / `processError` hooks
 * one after another in that order.
 */
class MiddlewareEngine {
    middlewares = [];
    /**
     * Register a middleware and re-sort the list so higher priorities come first.
     *
     * @param {object} middleware - Object with a numeric `priority` and any of the hooks.
     */
    use(middleware) {
        const byPriorityDescending = (left, right) => right.priority - left.priority;
        this.middlewares.push(middleware);
        this.middlewares.sort(byPriorityDescending);
    }
    /**
     * Pass the request through every `processRequest` hook, feeding each
     * hook's result into the next one.
     *
     * @returns {Promise<*>} Whatever the last hook returned (or the input when there are no hooks).
     */
    async processRequest(request) {
        let current = request;
        for (const middleware of this.middlewares) {
            if (!middleware.processRequest) {
                continue;
            }
            current = await middleware.processRequest(current);
        }
        return current;
    }
    /**
     * Pass the response through every `processResponse` hook, feeding each
     * hook's result into the next one.
     *
     * @returns {Promise<*>} Whatever the last hook returned (or the input when there are no hooks).
     */
    async processResponse(response) {
        let current = response;
        for (const middleware of this.middlewares) {
            if (!middleware.processResponse) {
                continue;
            }
            current = await middleware.processResponse(current);
        }
        return current;
    }
    /**
     * Offer the error to each `processError` hook in turn.
     *
     * @param {*} error - The failure being handled.
     * @param {object} request - The request that failed; forwarded to every hook.
     * @returns {Promise<*>} The value left after the chain; null means a hook
     *   handled the error (e.g., retry).
     */
    async processError(error, request) {
        let currentError = error;
        for (const middleware of this.middlewares) {
            // Hooks are only invoked while there is still a truthy error to hand over.
            if (!middleware.processError || !currentError) {
                continue;
            }
            currentError = await middleware.processError(currentError, request);
            if (currentError === null) {
                // Middleware handled the error (e.g., retry)
                break;
            }
        }
        return currentError;
    }
}
exports.MiddlewareEngine = MiddlewareEngine;