chore: Clean up deprecated code and docs

- Move deprecated directories to src/_deprecated/:
  - hydration/ (old pipeline approach)
  - scraper-v2/ (old Puppeteer scraper)
  - canonical-hydration/ (merged into tasks)
  - Unused services: availability, crawler-logger, geolocation, etc.
  - Unused utils: age-gate-playwright, HomepageValidator, stealthBrowser

- Archive outdated docs to docs/_archive/:
  - ANALYTICS_RUNBOOK.md
  - ANALYTICS_V2_EXAMPLES.md
  - BRAND_INTELLIGENCE_API.md
  - CRAWL_PIPELINE.md
  - TASK_WORKFLOW_2024-12-10.md
  - WORKER_TASK_ARCHITECTURE.md
  - ORGANIC_SCRAPING_GUIDE.md

- Add docs/CODEBASE_MAP.md as single source of truth
- Add warning files to deprecated/archived directories
- Slim down CLAUDE.md to essential rules only

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-11 22:17:40 -07:00
parent f2864bd2ad
commit a35976b9e9
61 changed files with 856 additions and 1281 deletions

View File

@@ -0,0 +1,402 @@
import { Middleware, ScraperRequest, ScraperResponse, ScraperError, ErrorType, ProxyConfig } from './types';
import { logger } from '../services/logger';
import { pool } from '../db/pool';
import { getActiveProxy, putProxyInTimeout, isBotDetectionError } from '../services/proxy';
// Diverse, realistic user agents - updated for 2024/2025.
// This is the pool that getRandomUserAgent() samples from with equal
// probability per entry, so the number of entries per browser family sets
// how often that family is presented (Chrome-based strings dominate).
// All entries are desktop browsers; no mobile user agents are included.
const USER_AGENTS = [
// Chrome on Windows (most common)
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
// Chrome on Mac
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
// Chrome on Linux
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
// Firefox (Windows and Mac)
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 14.0; rv:121.0) Gecko/20100101 Firefox/121.0',
// Safari (Mac)
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
// Edge on Windows (a Chrome user agent with a trailing Edg/<version> token)
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
];
/**
 * Pick one entry from USER_AGENTS uniformly at random.
 *
 * @returns A User-Agent header value taken from the USER_AGENTS pool.
 */
function getRandomUserAgent(): string {
  const pick = Math.floor(Math.random() * USER_AGENTS.length);
  return USER_AGENTS[pick];
}
/**
 * Promise-based delay built on setTimeout.
 *
 * @param ms Number of milliseconds to wait before the promise settles.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>(wake => {
    setTimeout(wake, ms);
  });
}
/**
 * User Agent Rotation Middleware.
 *
 * Assigns `request.metadata.userAgent` when the request has none yet, and
 * replaces it whenever the request is a retry (`retryCount > 0`) or has been
 * flagged with `metadata.botDetected`.
 */
export class UserAgentMiddleware implements Middleware {
  name = 'UserAgentMiddleware';
  priority = 100;
  /** The UA most recently handed out by this instance (null before first use). */
  private lastUserAgent: string | null = null;

  /**
   * Ensure the request carries a User-Agent, rotating it when required.
   *
   * @param request The outgoing request; its metadata is mutated in place.
   * @returns The same request object.
   */
  async processRequest(request: ScraperRequest): Promise<ScraperRequest> {
    // Retries and bot-detected requests always get a fresh UA.
    const mustRotate = request.retryCount > 0 || request.metadata.botDetected;
    if (request.metadata.userAgent && !mustRotate) {
      return request;
    }

    // Try to avoid repeating the previous UA, but give up after 5 re-draws
    // so a tiny pool can never cause an endless loop.
    let candidate = getRandomUserAgent();
    for (let redraws = 0; candidate === this.lastUserAgent && redraws < 5; redraws++) {
      candidate = getRandomUserAgent();
    }

    request.metadata.userAgent = candidate;
    this.lastUserAgent = candidate;

    // Only forced rotations are logged; the initial assignment is silent.
    if (mustRotate) {
      logger.debug('scraper', `🔄 Rotated User-Agent: ${candidate.substring(0, 50)}...`);
    }
    return request;
  }
}
// Domains that should skip proxy (datacenter IPs are blocked)
const PROXY_SKIP_DOMAINS = [
  'dutchie.com',
];

/**
 * Decide whether a request to `url` must bypass the proxy.
 *
 * A URL qualifies when its hostname is exactly one of PROXY_SKIP_DOMAINS or a
 * subdomain of one (e.g. `api.dutchie.com`). Matching is done on whole DNS
 * labels: a plain substring test would also match unrelated hosts such as
 * `notdutchie.com` or `dutchie.com.evil.example`.
 *
 * @param url Absolute URL of the request.
 * @returns `true` when the proxy should be skipped; `false` otherwise,
 *          including when `url` cannot be parsed.
 */
function shouldSkipProxy(url: string): boolean {
  let hostname: string;
  try {
    // The URL parser lower-cases the hostname for http(s) URLs.
    hostname = new URL(url).hostname;
  } catch {
    return false;
  }

  // A fully-qualified hostname may carry a trailing root dot ("dutchie.com.").
  const host = hostname.endsWith('.') ? hostname.slice(0, -1) : hostname;

  return PROXY_SKIP_DOMAINS.some(
    domain => host === domain || host.endsWith(`.${domain}`)
  );
}
/**
 * Proxy Rotation Middleware.
 *
 * Attaches a proxy obtained from `getActiveProxy()` to each request (unless
 * the target domain is on the skip list) and reports proxies to
 * `putProxyInTimeout()` when bot detection is observed.
 */
export class ProxyMiddleware implements Middleware {
  name = 'ProxyMiddleware';
  priority = 90;
  /** Id of the proxy most recently assigned by this instance. */
  private currentProxyId: number | null = null;

  /**
   * Attach (or rotate) the proxy on an outgoing request.
   *
   * @param request The outgoing request; its metadata is mutated in place.
   * @returns The same request object.
   */
  async processRequest(request: ScraperRequest): Promise<ScraperRequest> {
    // Some domains block datacenter IPs; those go out without a proxy.
    if (shouldSkipProxy(request.url)) {
      const { hostname } = new URL(request.url);
      logger.info('scraper', `⏭️  Skipping proxy for ${hostname} (datacenter IPs blocked)`);
      return request;
    }

    // A retry or a bot-detection flag forces a new proxy even if one is set.
    const rotating = request.retryCount > 0 || request.metadata.botDetected;
    if (request.metadata.proxy && !rotating) {
      return request;
    }

    const selected = await getActiveProxy();
    if (!selected) {
      // Any proxy already present in the metadata is left untouched here.
      logger.warn('scraper', '⚠️  No proxy available - running without proxy');
      return request;
    }

    const { host, port, protocol, username, password } = selected;
    request.metadata.proxy = { host, port, protocol, username, password };
    request.metadata.proxyId = selected.id;
    this.currentProxyId = selected.id;

    const reason = rotating ? 'rotation' : 'initial';
    logger.info('scraper', `🔄 Using proxy (${reason}): ${selected.protocol}://${selected.host}:${selected.port}`);
    return request;
  }

  /**
   * Time out the proxy when the response's request was flagged as bot-detected.
   *
   * @param response The response being processed.
   * @returns The same response object.
   */
  async processResponse(response: ScraperResponse): Promise<ScraperResponse> {
    const meta = response.request.metadata;
    if (meta.botDetected && meta.proxyId) {
      // NOTE(review): the result of putProxyInTimeout is not awaited — confirm
      // whether it is synchronous or intentionally fire-and-forget.
      putProxyInTimeout(meta.proxyId, 'Bot detection triggered');
      logger.info('scraper', `🚫 Proxy ${meta.proxyId} put in timeout due to bot detection`);
    }
    return response;
  }

  /**
   * Time out the proxy when the error message looks like bot detection.
   *
   * @param error   The error raised while handling the request.
   * @param request The request that failed.
   * @returns The error, unchanged — this middleware never swallows errors.
   */
  async processError(error: Error, request: ScraperRequest): Promise<Error | null> {
    const blocked = isBotDetectionError(error.message) && request.metadata.proxyId;
    if (blocked) {
      putProxyInTimeout(request.metadata.proxyId, error.message);
      logger.info('scraper', `🚫 Proxy ${request.metadata.proxyId} put in timeout: ${error.message}`);
    }
    return error;
  }
}
/**
 * Rate Limiting Middleware with Adaptive Delays.
 *
 * Enforces a minimum gap between consecutive requests. The gap starts at
 * `baseDelay`, grows by a factor of 1.5 per recorded error (at most 5 steps),
 * is capped at `maxDelay`, and is then randomised by ±20%.
 */
export class RateLimitMiddleware implements Middleware {
  name = 'RateLimitMiddleware';
  priority = 80;
  /** Timestamps (ms since epoch) of requests released in the last minute. */
  private requestTimes: number[] = [];
  /** Error counter: +1 per error, -1 per response, never below zero. */
  private errorCount: number = 0;
  private baseDelay: number = 2000; // 2 seconds base delay
  private maxDelay: number = 30000; // 30 seconds max

  /** Hold the request until the current minimum gap has elapsed. */
  async processRequest(request: ScraperRequest): Promise<ScraperRequest> {
    await this.waitForNextRequest();
    return request;
  }

  /** A response counts as a success and lowers the error counter by one. */
  async processResponse(response: ScraperResponse): Promise<ScraperResponse> {
    const lowered = this.errorCount - 1;
    this.errorCount = Math.max(0, lowered);
    return response;
  }

  /** An error raises the error counter, which lengthens later delays. */
  async processError(error: Error): Promise<Error | null> {
    this.errorCount += 1;
    return error;
  }

  /**
   * Sleep for whatever remains of the required gap since the last request,
   * then record the current time as the newest request timestamp.
   */
  private async waitForNextRequest(): Promise<void> {
    // Exponential penalty, limited to 5 steps and capped at maxDelay.
    const penaltySteps = Math.min(this.errorCount, 5);
    const cappedDelay = Math.min(this.baseDelay * Math.pow(1.5, penaltySteps), this.maxDelay);

    // Random jitter in the range ±20% of the capped delay.
    const targetGap = cappedDelay + (Math.random() - 0.5) * 0.4 * cappedDelay;

    // With no recorded request the previous timestamp is 0, so no wait occurs.
    const previous = this.requestTimes[this.requestTimes.length - 1] || 0;
    const elapsed = Date.now() - previous;

    if (elapsed < targetGap) {
      const remaining = targetGap - elapsed;
      logger.debug('scraper', `Rate limiting: waiting ${Math.round(remaining)}ms`);
      await sleep(remaining);
    }

    this.requestTimes.push(Date.now());
    this.cleanup();
  }

  /** Drop timestamps older than 60 seconds. */
  private cleanup(): void {
    const oldestAllowed = Date.now() - 60000;
    this.requestTimes = this.requestTimes.filter(stamp => stamp > oldestAllowed);
  }

  /**
   * Override the base delay between requests.
   *
   * @param ms New base delay in milliseconds.
   */
  public setBaseDelay(ms: number): void {
    this.baseDelay = ms;
  }
}
/**
 * Retry Middleware with Exponential Backoff.
 *
 * For retryable errors it waits (1s, 2s, 4s, ... capped at 30s) and returns
 * `null` to signal that the request should be retried; otherwise it passes
 * the error through.
 */
export class RetryMiddleware implements Middleware {
  name = 'RetryMiddleware';
  priority = 70;

  /**
   * Classify an error as transient or not.
   *
   * Errors carrying a `type` property are judged by that type alone; all
   * others are judged by substrings of their lower-cased message.
   */
  private isRetryable(error: Error): boolean {
    if ('type' in error) {
      const transientTypes = [
        ErrorType.NETWORK_ERROR,
        ErrorType.TIMEOUT,
        ErrorType.SERVER_ERROR
      ];
      return transientTypes.includes((error as ScraperError).type);
    }

    // Untyped error: fall back to common retryable patterns in the message.
    const text = error.message.toLowerCase();
    const transientHints = [
      'timeout',
      'network',
      'econnreset',
      'econnrefused',
      '500',
      '502',
      '503'
    ];
    return transientHints.some(hint => text.includes(hint));
  }

  /**
   * Decide whether a failed request should be retried.
   *
   * @param error   The error raised for the request.
   * @param request The failed request (`retryCount` / `maxRetries` are read).
   * @returns `null` after the backoff sleep when a retry should happen,
   *          otherwise the original error.
   */
  async processError(error: Error, request: ScraperRequest): Promise<Error | null> {
    if (!this.isRetryable(error)) {
      logger.warn('scraper', `Non-retryable error for ${request.url}: ${error.message}`);
      return error;
    }

    const attemptsLeft = request.retryCount < request.maxRetries;
    if (!attemptsLeft) {
      logger.error('scraper', `Max retries exceeded for ${request.url}`);
      return error;
    }

    // Backoff doubles with each retry and never exceeds 30 seconds.
    const pauseMs = Math.min(1000 * Math.pow(2, request.retryCount), 30000);
    logger.info('scraper', `Retry ${request.retryCount + 1}/${request.maxRetries} for ${request.url} after ${pauseMs}ms`);
    await sleep(pauseMs);

    // null tells the caller that the retry should happen.
    return null;
  }
}
/**
 * Bot Detection Middleware.
 *
 * Scans response content for phrases typical of bot-blocking pages, flags the
 * request for proxy/UA rotation, and throws a BOT_DETECTION error once the
 * running detection count reaches the threshold.
 */
export class BotDetectionMiddleware implements Middleware {
  name = 'BotDetectionMiddleware';
  priority = 60;
  /** Running detection score: +1 per detection, -0.5 per clean response. */
  private detectedCount: number = 0;
  private readonly DETECTION_THRESHOLD = 3;
  // Export for use by other middlewares
  static shouldRotateFingerprint: boolean = false;

  /**
   * Inspect a response for bot-blocking indicators.
   *
   * @param response The response to inspect; its request metadata is mutated
   *                 when blocking is suspected.
   * @returns The same response object.
   * @throws ScraperError of type BOT_DETECTION when the detection count has
   *         reached DETECTION_THRESHOLD.
   */
  async processResponse(response: ScraperResponse): Promise<ScraperResponse> {
    // Non-string content is serialised so the patterns can be applied to it.
    const body = typeof response.content === 'string'
      ? response.content
      : JSON.stringify(response.content);

    const blockSignals = [
      /captcha/i,
      /cloudflare/i,
      /access denied/i,
      /you have been blocked/i,
      /unusual traffic/i,
      /robot/i,
      /verify.*human/i,
      /security check/i,
      /please wait/i,
      /checking your browser/i,
      /ray id/i
    ];
    const looksBlocked = blockSignals.some(signal => signal.test(body));

    if (!looksBlocked) {
      // Clean response: decay the score slowly and clear the rotation flag.
      this.detectedCount = Math.max(0, this.detectedCount - 0.5);
      BotDetectionMiddleware.shouldRotateFingerprint = false;
      return response;
    }

    this.detectedCount++;
    BotDetectionMiddleware.shouldRotateFingerprint = true;
    logger.warn('scraper', `Bot detection suspected (${this.detectedCount}/${this.DETECTION_THRESHOLD}): ${response.url}`);
    logger.info('scraper', '🔄 Flagging for proxy/UA rotation on next request');

    // Mark the request so a retry rotates its proxy/UA.
    const meta = response.request.metadata;
    meta.botDetected = true;
    meta.needsNewBrowser = true;

    if (this.detectedCount >= this.DETECTION_THRESHOLD) {
      throw Object.assign(
        new Error('Bot detection threshold reached - rotating fingerprint'),
        { type: ErrorType.BOT_DETECTION, retryable: true, request: response.request }
      ) as ScraperError;
    }
    return response;
  }

  /**
   * Turn BOT_DETECTION errors into a delayed retry.
   *
   * @param error   The error raised for the request.
   * @param request The failed request; flagged for rotation on a match.
   * @returns `null` (retry) for BOT_DETECTION errors after a 5-10 second
   *          pause, otherwise the original error.
   */
  async processError(error: Error, request: ScraperRequest): Promise<Error | null> {
    const isBotBlock = 'type' in error && (error as ScraperError).type === ErrorType.BOT_DETECTION;
    if (!isBotBlock) {
      return error;
    }

    request.metadata.botDetected = true;
    request.metadata.needsNewBrowser = true;
    logger.info('scraper', '🔄 Bot detection error - will rotate proxy/UA on retry');

    // Randomised pause before the retry to avoid rate limiting.
    await sleep(5000 + Math.random() * 5000);
    return null; // null triggers the retry
  }
}
/**
 * Stealth Mode Middleware.
 *
 * Marks every request that passes through it as requiring stealth mode.
 */
export class StealthMiddleware implements Middleware {
  name = 'StealthMiddleware';
  priority = 95;

  /**
   * Set `metadata.requiresStealth` on the request.
   *
   * @param request The outgoing request; its metadata is mutated in place.
   * @returns The same request object.
   */
  async processRequest(request: ScraperRequest): Promise<ScraperRequest> {
    const { metadata } = request;
    metadata.requiresStealth = true;
    return request;
  }
}
/**
 * Middleware Engine.
 *
 * Holds the registered middlewares ordered by descending priority and runs
 * their request, response and error hooks one after another in that order.
 */
export class MiddlewareEngine {
  private middlewares: Middleware[] = [];

  /**
   * Register a middleware and re-sort the chain (higher priority first).
   *
   * @param middleware The middleware to add.
   */
  use(middleware: Middleware): void {
    this.middlewares.push(middleware);
    this.middlewares.sort((left, right) => right.priority - left.priority);
  }

  /**
   * Pass a request through every middleware that has a `processRequest` hook.
   *
   * @param request The initial request.
   * @returns The request returned by the last hook in the chain.
   */
  async processRequest(request: ScraperRequest): Promise<ScraperRequest> {
    let pending = request;
    for (const mw of this.middlewares) {
      if (!mw.processRequest) {
        continue;
      }
      pending = await mw.processRequest(pending);
    }
    return pending;
  }

  /**
   * Pass a response through every middleware that has a `processResponse` hook.
   *
   * @param response The initial response.
   * @returns The response returned by the last hook in the chain.
   */
  async processResponse(response: ScraperResponse): Promise<ScraperResponse> {
    let pending = response;
    for (const mw of this.middlewares) {
      if (!mw.processResponse) {
        continue;
      }
      pending = await mw.processResponse(pending);
    }
    return pending;
  }

  /**
   * Offer an error to each middleware's `processError` hook in turn.
   *
   * The chain stops at the first hook that returns `null`, which means the
   * middleware handled the error (e.g. by scheduling a retry).
   *
   * @param error   The error to process.
   * @param request The request the error belongs to.
   * @returns `null` when some middleware handled the error, otherwise the
   *          error produced by the last hook that ran.
   */
  async processError(error: Error, request: ScraperRequest): Promise<Error | null> {
    let outcome: Error | null = error;
    for (const mw of this.middlewares) {
      if (!mw.processError || !outcome) {
        continue;
      }
      outcome = await mw.processError(outcome, request);
      if (outcome === null) {
        // Handled: later middlewares are not consulted.
        return outcome;
      }
    }
    return outcome;
  }
}