chore: Clean up deprecated code and docs
- Move deprecated directories to src/_deprecated/: - hydration/ (old pipeline approach) - scraper-v2/ (old Puppeteer scraper) - canonical-hydration/ (merged into tasks) - Unused services: availability, crawler-logger, geolocation, etc - Unused utils: age-gate-playwright, HomepageValidator, stealthBrowser - Archive outdated docs to docs/_archive/: - ANALYTICS_RUNBOOK.md - ANALYTICS_V2_EXAMPLES.md - BRAND_INTELLIGENCE_API.md - CRAWL_PIPELINE.md - TASK_WORKFLOW_2024-12-10.md - WORKER_TASK_ARCHITECTURE.md - ORGANIC_SCRAPING_GUIDE.md - Add docs/CODEBASE_MAP.md as single source of truth - Add warning files to deprecated/archived directories - Slim down CLAUDE.md to essential rules only 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
402
backend/src/_deprecated/scraper-v2/middlewares.ts
Normal file
402
backend/src/_deprecated/scraper-v2/middlewares.ts
Normal file
@@ -0,0 +1,402 @@
|
||||
import { Middleware, ScraperRequest, ScraperResponse, ScraperError, ErrorType, ProxyConfig } from './types';
|
||||
import { logger } from '../services/logger';
|
||||
import { pool } from '../db/pool';
|
||||
import { getActiveProxy, putProxyInTimeout, isBotDetectionError } from '../services/proxy';
|
||||
|
||||
// Diverse, realistic user agents - updated for 2024/2025
|
||||
const USER_AGENTS = [
|
||||
// Chrome on Windows (most common)
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
|
||||
// Chrome on Mac
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
|
||||
// Chrome on Linux
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
|
||||
// Firefox
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 14.0; rv:121.0) Gecko/20100101 Firefox/121.0',
|
||||
// Safari
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
|
||||
// Edge
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
|
||||
];
|
||||
|
||||
function getRandomUserAgent(): string {
|
||||
return USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)];
|
||||
}
|
||||
|
||||
function sleep(ms: number): Promise<void> {
|
||||
return new Promise(resolve => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
/**
|
||||
* User Agent Rotation Middleware - rotates UA on each request for better evasion
|
||||
*/
|
||||
export class UserAgentMiddleware implements Middleware {
|
||||
name = 'UserAgentMiddleware';
|
||||
priority = 100;
|
||||
|
||||
private lastUserAgent: string | null = null;
|
||||
|
||||
async processRequest(request: ScraperRequest): Promise<ScraperRequest> {
|
||||
// Always rotate UA on retries or bot detection
|
||||
const forceRotation = request.retryCount > 0 || request.metadata.botDetected;
|
||||
|
||||
if (!request.metadata.userAgent || forceRotation) {
|
||||
// Get a different UA than the last one used
|
||||
let newUA = getRandomUserAgent();
|
||||
let attempts = 0;
|
||||
while (newUA === this.lastUserAgent && attempts < 5) {
|
||||
newUA = getRandomUserAgent();
|
||||
attempts++;
|
||||
}
|
||||
request.metadata.userAgent = newUA;
|
||||
this.lastUserAgent = newUA;
|
||||
|
||||
if (forceRotation) {
|
||||
logger.debug('scraper', `🔄 Rotated User-Agent: ${newUA.substring(0, 50)}...`);
|
||||
}
|
||||
}
|
||||
return request;
|
||||
}
|
||||
}
|
||||
|
||||
// Domains that should skip proxy (datacenter IPs are blocked)
|
||||
const PROXY_SKIP_DOMAINS = [
|
||||
'dutchie.com',
|
||||
];
|
||||
|
||||
function shouldSkipProxy(url: string): boolean {
|
||||
try {
|
||||
const urlObj = new URL(url);
|
||||
return PROXY_SKIP_DOMAINS.some(domain => urlObj.hostname.includes(domain));
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Proxy Rotation Middleware - uses the central proxy service with timeout handling
|
||||
*/
|
||||
export class ProxyMiddleware implements Middleware {
|
||||
name = 'ProxyMiddleware';
|
||||
priority = 90;
|
||||
|
||||
private currentProxyId: number | null = null;
|
||||
|
||||
async processRequest(request: ScraperRequest): Promise<ScraperRequest> {
|
||||
// Skip proxy for domains that block datacenter IPs
|
||||
if (shouldSkipProxy(request.url)) {
|
||||
logger.info('scraper', `⏭️ Skipping proxy for ${new URL(request.url).hostname} (datacenter IPs blocked)`);
|
||||
return request;
|
||||
}
|
||||
|
||||
// Always try to use a proxy from the central proxy service
|
||||
// The service handles bot detection timeouts automatically
|
||||
const forceRotation = request.retryCount > 0 || request.metadata.botDetected;
|
||||
|
||||
if (!request.metadata.proxy || forceRotation) {
|
||||
// Get proxy from central service - it handles timeouts automatically
|
||||
const proxy = await getActiveProxy();
|
||||
if (proxy) {
|
||||
request.metadata.proxy = {
|
||||
host: proxy.host,
|
||||
port: proxy.port,
|
||||
protocol: proxy.protocol,
|
||||
username: proxy.username,
|
||||
password: proxy.password,
|
||||
};
|
||||
request.metadata.proxyId = proxy.id;
|
||||
this.currentProxyId = proxy.id;
|
||||
const reason = forceRotation ? 'rotation' : 'initial';
|
||||
logger.info('scraper', `🔄 Using proxy (${reason}): ${proxy.protocol}://${proxy.host}:${proxy.port}`);
|
||||
} else {
|
||||
logger.warn('scraper', '⚠️ No proxy available - running without proxy');
|
||||
}
|
||||
}
|
||||
return request;
|
||||
}
|
||||
|
||||
async processResponse(response: ScraperResponse): Promise<ScraperResponse> {
|
||||
// If bot detection was triggered, put the proxy in timeout
|
||||
if (response.request.metadata.botDetected && response.request.metadata.proxyId) {
|
||||
putProxyInTimeout(response.request.metadata.proxyId, 'Bot detection triggered');
|
||||
logger.info('scraper', `🚫 Proxy ${response.request.metadata.proxyId} put in timeout due to bot detection`);
|
||||
}
|
||||
return response;
|
||||
}
|
||||
|
||||
async processError(error: Error, request: ScraperRequest): Promise<Error | null> {
|
||||
// If bot detection error, put proxy in timeout
|
||||
if (isBotDetectionError(error.message) && request.metadata.proxyId) {
|
||||
putProxyInTimeout(request.metadata.proxyId, error.message);
|
||||
logger.info('scraper', `🚫 Proxy ${request.metadata.proxyId} put in timeout: ${error.message}`);
|
||||
}
|
||||
return error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Rate Limiting Middleware with Adaptive Delays
|
||||
*/
|
||||
export class RateLimitMiddleware implements Middleware {
|
||||
name = 'RateLimitMiddleware';
|
||||
priority = 80;
|
||||
|
||||
private requestTimes: number[] = [];
|
||||
private errorCount: number = 0;
|
||||
private baseDelay: number = 2000; // 2 seconds base delay
|
||||
private maxDelay: number = 30000; // 30 seconds max
|
||||
|
||||
async processRequest(request: ScraperRequest): Promise<ScraperRequest> {
|
||||
await this.waitForNextRequest();
|
||||
return request;
|
||||
}
|
||||
|
||||
async processResponse(response: ScraperResponse): Promise<ScraperResponse> {
|
||||
// Record success - gradually reduce error count
|
||||
this.errorCount = Math.max(0, this.errorCount - 1);
|
||||
return response;
|
||||
}
|
||||
|
||||
async processError(error: Error): Promise<Error | null> {
|
||||
// Record error - increase delay
|
||||
this.errorCount++;
|
||||
return error;
|
||||
}
|
||||
|
||||
private async waitForNextRequest(): Promise<void> {
|
||||
// Calculate adaptive delay based on error count
|
||||
const errorMultiplier = Math.pow(1.5, Math.min(this.errorCount, 5));
|
||||
const adaptiveDelay = Math.min(this.baseDelay * errorMultiplier, this.maxDelay);
|
||||
|
||||
// Add random jitter (±20%)
|
||||
const jitter = (Math.random() - 0.5) * 0.4 * adaptiveDelay;
|
||||
const delay = adaptiveDelay + jitter;
|
||||
|
||||
const now = Date.now();
|
||||
const lastRequest = this.requestTimes[this.requestTimes.length - 1] || 0;
|
||||
const timeSinceLast = now - lastRequest;
|
||||
|
||||
if (timeSinceLast < delay) {
|
||||
const waitTime = delay - timeSinceLast;
|
||||
logger.debug('scraper', `Rate limiting: waiting ${Math.round(waitTime)}ms`);
|
||||
await sleep(waitTime);
|
||||
}
|
||||
|
||||
this.requestTimes.push(Date.now());
|
||||
this.cleanup();
|
||||
}
|
||||
|
||||
private cleanup(): void {
|
||||
// Keep only last minute of requests
|
||||
const cutoff = Date.now() - 60000;
|
||||
this.requestTimes = this.requestTimes.filter(t => t > cutoff);
|
||||
}
|
||||
|
||||
public setBaseDelay(ms: number): void {
|
||||
this.baseDelay = ms;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Retry Middleware with Exponential Backoff
|
||||
*/
|
||||
export class RetryMiddleware implements Middleware {
|
||||
name = 'RetryMiddleware';
|
||||
priority = 70;
|
||||
|
||||
private isRetryable(error: Error): boolean {
|
||||
const retryableErrors = [
|
||||
ErrorType.NETWORK_ERROR,
|
||||
ErrorType.TIMEOUT,
|
||||
ErrorType.SERVER_ERROR
|
||||
];
|
||||
|
||||
if ('type' in error) {
|
||||
return retryableErrors.includes((error as ScraperError).type);
|
||||
}
|
||||
|
||||
// Check error message for common retryable patterns
|
||||
const message = error.message.toLowerCase();
|
||||
return (
|
||||
message.includes('timeout') ||
|
||||
message.includes('network') ||
|
||||
message.includes('econnreset') ||
|
||||
message.includes('econnrefused') ||
|
||||
message.includes('500') ||
|
||||
message.includes('502') ||
|
||||
message.includes('503')
|
||||
);
|
||||
}
|
||||
|
||||
async processError(error: Error, request: ScraperRequest): Promise<Error | null> {
|
||||
if (!this.isRetryable(error)) {
|
||||
logger.warn('scraper', `Non-retryable error for ${request.url}: ${error.message}`);
|
||||
return error;
|
||||
}
|
||||
|
||||
if (request.retryCount < request.maxRetries) {
|
||||
// Calculate backoff delay
|
||||
const backoffDelay = Math.min(
|
||||
1000 * Math.pow(2, request.retryCount),
|
||||
30000
|
||||
);
|
||||
|
||||
logger.info('scraper', `Retry ${request.retryCount + 1}/${request.maxRetries} for ${request.url} after ${backoffDelay}ms`);
|
||||
await sleep(backoffDelay);
|
||||
|
||||
// Return null to indicate retry should happen
|
||||
return null;
|
||||
}
|
||||
|
||||
logger.error('scraper', `Max retries exceeded for ${request.url}`);
|
||||
return error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Bot Detection Middleware - detects bot blocking and triggers fingerprint rotation.
 *
 * Scans each response body for blocking indicators (captcha, Cloudflare
 * interstitials, block pages). Each sighting increments a counter and flags
 * the request for proxy/UA rotation; once DETECTION_THRESHOLD is reached a
 * retryable BOT_DETECTION error is thrown so the engine re-runs the request
 * with a fresh fingerprint. Clean responses decay the counter by 0.5.
 */
export class BotDetectionMiddleware implements Middleware {
  name = 'BotDetectionMiddleware';
  priority = 60;

  // Running count of suspected detections; fractional decay on clean responses.
  private detectedCount: number = 0;
  private readonly DETECTION_THRESHOLD = 3;

  // Export for use by other middlewares
  // NOTE(review): this static flag is shared across all instances of the
  // class — confirm only one scraper pipeline uses it at a time.
  static shouldRotateFingerprint: boolean = false;

  async processResponse(response: ScraperResponse): Promise<ScraperResponse> {
    // Normalize the body to a string so the regexes below can scan it.
    const content = typeof response.content === 'string'
      ? response.content
      : JSON.stringify(response.content);

    // Check for bot detection indicators
    // NOTE(review): several patterns (/robot/i, /please wait/i) are broad
    // and may false-positive on legitimate page content.
    const botIndicators = [
      /captcha/i,
      /cloudflare/i,
      /access denied/i,
      /you have been blocked/i,
      /unusual traffic/i,
      /robot/i,
      /verify.*human/i,
      /security check/i,
      /please wait/i,
      /checking your browser/i,
      /ray id/i
    ];

    const detected = botIndicators.some(pattern => pattern.test(content));

    if (detected) {
      this.detectedCount++;
      BotDetectionMiddleware.shouldRotateFingerprint = true;

      logger.warn('scraper', `Bot detection suspected (${this.detectedCount}/${this.DETECTION_THRESHOLD}): ${response.url}`);
      logger.info('scraper', '🔄 Flagging for proxy/UA rotation on next request');

      // Mark the request for rotation on retry
      response.request.metadata.botDetected = true;
      response.request.metadata.needsNewBrowser = true;

      // Past the threshold: escalate to a retryable BOT_DETECTION error so
      // the engine retries with a new proxy/UA instead of continuing.
      if (this.detectedCount >= this.DETECTION_THRESHOLD) {
        const error: ScraperError = new Error('Bot detection threshold reached - rotating fingerprint') as ScraperError;
        error.type = ErrorType.BOT_DETECTION;
        error.retryable = true;
        error.request = response.request;
        throw error;
      }
    } else {
      // Gradually decrease detection count on successful requests
      this.detectedCount = Math.max(0, this.detectedCount - 0.5);
      BotDetectionMiddleware.shouldRotateFingerprint = false;
    }

    return response;
  }

  async processError(error: Error, request: ScraperRequest): Promise<Error | null> {
    // If bot detection error, flag for rotation and allow retry
    if ('type' in error && (error as ScraperError).type === ErrorType.BOT_DETECTION) {
      request.metadata.botDetected = true;
      request.metadata.needsNewBrowser = true;
      logger.info('scraper', '🔄 Bot detection error - will rotate proxy/UA on retry');

      // Add delay before retry to avoid rate limiting (5-10s random backoff).
      await sleep(5000 + Math.random() * 5000);
      return null; // Return null to trigger retry
    }
    return error;
  }
}
|
||||
|
||||
/**
|
||||
* Stealth Mode Middleware
|
||||
*/
|
||||
export class StealthMiddleware implements Middleware {
|
||||
name = 'StealthMiddleware';
|
||||
priority = 95;
|
||||
|
||||
async processRequest(request: ScraperRequest): Promise<ScraperRequest> {
|
||||
// Flag that this request needs stealth mode
|
||||
request.metadata.requiresStealth = true;
|
||||
return request;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Middleware Engine to orchestrate all middlewares
|
||||
*/
|
||||
export class MiddlewareEngine {
|
||||
private middlewares: Middleware[] = [];
|
||||
|
||||
use(middleware: Middleware): void {
|
||||
this.middlewares.push(middleware);
|
||||
// Sort by priority (higher first)
|
||||
this.middlewares.sort((a, b) => b.priority - a.priority);
|
||||
}
|
||||
|
||||
async processRequest(request: ScraperRequest): Promise<ScraperRequest> {
|
||||
let current = request;
|
||||
for (const middleware of this.middlewares) {
|
||||
if (middleware.processRequest) {
|
||||
current = await middleware.processRequest(current);
|
||||
}
|
||||
}
|
||||
return current;
|
||||
}
|
||||
|
||||
async processResponse(response: ScraperResponse): Promise<ScraperResponse> {
|
||||
let current = response;
|
||||
for (const middleware of this.middlewares) {
|
||||
if (middleware.processResponse) {
|
||||
current = await middleware.processResponse(current);
|
||||
}
|
||||
}
|
||||
return current;
|
||||
}
|
||||
|
||||
async processError(error: Error, request: ScraperRequest): Promise<Error | null> {
|
||||
let currentError: Error | null = error;
|
||||
for (const middleware of this.middlewares) {
|
||||
if (middleware.processError && currentError) {
|
||||
currentError = await middleware.processError(currentError, request);
|
||||
if (currentError === null) {
|
||||
// Middleware handled the error (e.g., retry)
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return currentError;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user