The job_run_logs table tracks scheduled job orchestration, not individual worker jobs. Worker info (worker_id, worker_hostname) belongs on dispensary_crawl_jobs, not job_run_logs. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
352 lines
14 KiB
JavaScript
352 lines
14 KiB
JavaScript
"use strict";
|
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
exports.MiddlewareEngine = exports.StealthMiddleware = exports.BotDetectionMiddleware = exports.RetryMiddleware = exports.RateLimitMiddleware = exports.ProxyMiddleware = exports.UserAgentMiddleware = void 0;
|
|
const types_1 = require("./types");
|
|
const logger_1 = require("../services/logger");
|
|
const proxy_1 = require("../services/proxy");
|
|
// Pool of desktop browser User-Agent strings (2024/2025 era) used for rotation.
// Entries are grouped by browser/platform; selection elsewhere is uniform over
// the whole list, so a group with more entries is picked proportionally more often.
const USER_AGENTS = [
    // --- Chrome / Windows (largest group) ---
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',

    // --- Chrome / macOS ---
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',

    // --- Chrome / Linux ---
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',

    // --- Firefox ---
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14.0; rv:121.0) Gecko/20100101 Firefox/121.0',

    // --- Safari ---
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',

    // --- Edge ---
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
];
|
|
/**
 * Pick one entry of USER_AGENTS uniformly at random.
 *
 * @returns {string} A User-Agent string from the pool.
 */
function getRandomUserAgent() {
    const index = Math.floor(Math.random() * USER_AGENTS.length);
    return USER_AGENTS[index];
}
|
|
/**
 * Promise-based delay.
 *
 * @param {number} ms - Delay handed to setTimeout, in milliseconds.
 * @returns {Promise<void>} Resolves (with no value) once the timer fires.
 */
function sleep(ms) {
    return new Promise((done) => {
        setTimeout(done, ms);
    });
}
|
|
/**
 * User-Agent rotation middleware.
 *
 * Assigns `request.metadata.userAgent` when the request has none, and replaces
 * it whenever the request is a retry (`retryCount > 0`) or has been flagged
 * with `metadata.botDetected`.
 */
class UserAgentMiddleware {
    name = 'UserAgentMiddleware';
    priority = 100;
    // Most recent UA handed out by this instance; used to avoid immediate repeats.
    lastUserAgent = null;

    /**
     * @param {object} request - Request carrying `retryCount` and a `metadata` object.
     * @returns {Promise<object>} The same request object, possibly with a new `metadata.userAgent`.
     */
    async processRequest(request) {
        const forceRotation = request.retryCount > 0 || request.metadata.botDetected;

        // Nothing to do when a UA is already set and no rotation was requested.
        if (request.metadata.userAgent && !forceRotation) {
            return request;
        }

        // Draw once, then re-draw (at most 5 more times) while the draw repeats
        // the previous UA. The bound keeps the loop finite; a repeat is still
        // possible if every draw collides.
        let candidate = getRandomUserAgent();
        for (let redraws = 0; redraws < 5 && candidate === this.lastUserAgent; redraws++) {
            candidate = getRandomUserAgent();
        }

        request.metadata.userAgent = candidate;
        this.lastUserAgent = candidate;

        // Only forced rotations are logged; the initial assignment is silent.
        if (forceRotation) {
            logger_1.logger.debug('scraper', `🔄 Rotated User-Agent: ${candidate.substring(0, 50)}...`);
        }
        return request;
    }
}
|
|
exports.UserAgentMiddleware = UserAgentMiddleware;
|
|
// Hosts that must be fetched without a proxy because they block datacenter IPs.
const PROXY_SKIP_DOMAINS = ['dutchie.com'];
|
|
/**
 * Decide whether a request URL must bypass the proxy.
 *
 * A URL matches when its hostname is exactly one of PROXY_SKIP_DOMAINS or a
 * subdomain of one (e.g. "api.dutchie.com" for "dutchie.com"). A plain
 * substring test is deliberately NOT used: it would also match unrelated hosts
 * such as "notdutchie.com" or "dutchie.com.evil.example".
 *
 * @param {string} url - Absolute URL of the request.
 * @returns {boolean} true when the proxy should be skipped; false otherwise,
 *   including when `url` cannot be parsed.
 */
function shouldSkipProxy(url) {
    let hostname;
    try {
        hostname = new URL(url).hostname;
    }
    catch {
        // Unparseable URL: fall back to the default behaviour (use a proxy).
        return false;
    }
    // Treat the fully-qualified form "dutchie.com." like "dutchie.com".
    const host = hostname.endsWith('.') ? hostname.slice(0, -1) : hostname;
    return PROXY_SKIP_DOMAINS.some((domain) => {
        const wanted = domain.toLowerCase();
        return host === wanted || host.endsWith(`.${wanted}`);
    });
}
|
|
/**
 * Proxy rotation middleware backed by the central proxy service (`proxy_1`).
 *
 * - processRequest: attaches a proxy to `request.metadata` unless the URL is
 *   on the skip list or a proxy is already attached and no rotation is forced.
 * - processResponse / processError: report a proxy to the service via
 *   `putProxyInTimeout` when bot detection is indicated.
 */
class ProxyMiddleware {
    name = 'ProxyMiddleware';
    priority = 90;
    // Id of the proxy most recently attached by this instance (written here, never read here).
    currentProxyId = null;

    /**
     * @param {object} request - Request with `url`, `retryCount` and `metadata`.
     * @returns {Promise<object>} The same request object.
     */
    async processRequest(request) {
        if (shouldSkipProxy(request.url)) {
            // shouldSkipProxy only returns true for a parseable URL, so this parse is safe.
            const { hostname } = new URL(request.url);
            logger_1.logger.info('scraper', `⏭️ Skipping proxy for ${hostname} (datacenter IPs blocked)`);
            return request;
        }

        // Retries and bot-detection flags force a fresh proxy even if one is attached.
        const forceRotation = request.retryCount > 0 || request.metadata.botDetected;
        if (request.metadata.proxy && !forceRotation) {
            return request;
        }

        const { getActiveProxy } = proxy_1;
        const proxy = await getActiveProxy();
        if (!proxy) {
            // NOTE(review): on a forced rotation any proxy already stored in
            // request.metadata is left in place here, although the message says
            // the request runs without one - confirm which behaviour is intended.
            logger_1.logger.warn('scraper', '⚠️ No proxy available - running without proxy');
            return request;
        }

        const { id, host, port, protocol, username, password } = proxy;
        request.metadata.proxy = { host, port, protocol, username, password };
        request.metadata.proxyId = id;
        this.currentProxyId = id;

        const reason = forceRotation ? 'rotation' : 'initial';
        logger_1.logger.info('scraper', `🔄 Using proxy (${reason}): ${protocol}://${host}:${port}`);
        return request;
    }

    /**
     * Reports the proxy used for a response whose request is flagged `botDetected`.
     *
     * @param {object} response - Response exposing `request.metadata`.
     * @returns {Promise<object>} The same response object.
     */
    async processResponse(response) {
        const { metadata } = response.request;
        if (metadata.botDetected && metadata.proxyId) {
            // NOTE(review): the result of putProxyInTimeout is not awaited; if it
            // returns a promise, a rejection would go unhandled - verify.
            const { putProxyInTimeout } = proxy_1;
            putProxyInTimeout(metadata.proxyId, 'Bot detection triggered');
            logger_1.logger.info('scraper', `🚫 Proxy ${metadata.proxyId} put in timeout due to bot detection`);
        }
        return response;
    }

    /**
     * Reports the proxy when the error message is classified as bot detection
     * by `isBotDetectionError`.
     *
     * @param {Error} error - Error raised for the request.
     * @param {object} request - The failed request.
     * @returns {Promise<Error>} The error, unchanged (this middleware never swallows it).
     */
    async processError(error, request) {
        const { isBotDetectionError, putProxyInTimeout } = proxy_1;
        if (isBotDetectionError(error.message) && request.metadata.proxyId) {
            putProxyInTimeout(request.metadata.proxyId, error.message);
            logger_1.logger.info('scraper', `🚫 Proxy ${request.metadata.proxyId} put in timeout: ${error.message}`);
        }
        return error;
    }
}
|
|
exports.ProxyMiddleware = ProxyMiddleware;
|
|
/**
 * Rate limiting middleware with adaptive delays.
 *
 * Enforces a minimum gap between consecutive requests. The gap starts at
 * `baseDelay`, grows by a factor of 1.5 per recorded error (up to
 * `maxErrorCount` steps), gets +/-20% random jitter, and never exceeds
 * `maxDelay`. Each successful response removes one error step.
 */
class RateLimitMiddleware {
    name = 'RateLimitMiddleware';
    priority = 80;
    // Timestamps (ms since epoch) of requests released during the last minute.
    requestTimes = [];
    // Current back-off level; kept within [0, maxErrorCount].
    errorCount = 0;
    baseDelay = 2000; // 2 seconds base delay
    maxDelay = 30000; // 30 seconds max
    // Upper bound for errorCount. Without a bound on the stored counter, a long
    // error burst would need an equally long run of successes before the delay
    // starts to shrink again.
    maxErrorCount = 5;

    /**
     * Waits until the next request may be sent.
     *
     * @param {object} request - Outgoing request (not inspected).
     * @returns {Promise<object>} The same request object.
     */
    async processRequest(request) {
        await this.waitForNextRequest();
        return request;
    }

    /**
     * Records a success: lowers the back-off level by one step.
     *
     * @param {object} response - Response (not inspected).
     * @returns {Promise<object>} The same response object.
     */
    async processResponse(response) {
        this.errorCount = Math.max(0, this.errorCount - 1);
        return response;
    }

    /**
     * Records a failure: raises the back-off level by one step, saturating at
     * `maxErrorCount`.
     *
     * @param {Error} error - Error (not inspected).
     * @returns {Promise<Error>} The same error.
     */
    async processError(error) {
        this.errorCount = Math.min(this.errorCount + 1, this.maxErrorCount);
        return error;
    }

    /**
     * Sleeps for whatever remains of the current adaptive delay since the last
     * recorded request, then records the new request time.
     *
     * @returns {Promise<void>}
     */
    async waitForNextRequest() {
        // Exponential growth in the (bounded) number of recent errors.
        const errorSteps = Math.min(this.errorCount, this.maxErrorCount);
        const adaptiveDelay = Math.min(this.baseDelay * Math.pow(1.5, errorSteps), this.maxDelay);
        // Random jitter of +/-20%, clamped so the documented maximum still holds.
        const jitter = (Math.random() - 0.5) * 0.4 * adaptiveDelay;
        const delay = Math.min(adaptiveDelay + jitter, this.maxDelay);

        const lastRequest = this.requestTimes[this.requestTimes.length - 1] || 0;
        const timeSinceLast = Date.now() - lastRequest;
        if (timeSinceLast < delay) {
            const waitTime = delay - timeSinceLast;
            logger_1.logger.debug('scraper', `Rate limiting: waiting ${Math.round(waitTime)}ms`);
            await sleep(waitTime);
        }
        this.requestTimes.push(Date.now());
        this.cleanup();
    }

    /** Drops recorded request times older than one minute. */
    cleanup() {
        const cutoff = Date.now() - 60000;
        this.requestTimes = this.requestTimes.filter((t) => t > cutoff);
    }

    /**
     * @param {number} ms - New base delay in milliseconds.
     */
    setBaseDelay(ms) {
        this.baseDelay = ms;
    }
}
|
|
exports.RateLimitMiddleware = RateLimitMiddleware;
|
|
/**
 * Retry middleware with exponential backoff.
 *
 * For retryable errors it sleeps for a backoff delay and returns `null`,
 * which the middleware engine treats as "handled, retry the request".
 * This class does not modify `request.retryCount` itself.
 */
class RetryMiddleware {
    name = 'RetryMiddleware';
    priority = 70;

    /**
     * Classify an error as retryable or not.
     *
     * Typed errors (those with a `type` property) are retryable only for
     * network, timeout and server error types. Untyped errors are classified
     * by substrings of their lower-cased message. Errors without a usable
     * message, and non-object errors, are treated as non-retryable instead
     * of throwing from inside the error handler.
     *
     * @param {*} error - The thrown value.
     * @returns {boolean} true when the request should be retried.
     */
    isRetryable(error) {
        const retryableErrors = [
            types_1.ErrorType.NETWORK_ERROR,
            types_1.ErrorType.TIMEOUT,
            types_1.ErrorType.SERVER_ERROR
        ];
        // `in` throws on primitives/null, so only use it on real objects.
        const isObject = error !== null && typeof error === 'object';
        if (isObject && 'type' in error) {
            return retryableErrors.includes(error.type);
        }
        // A missing or non-string message becomes '' / its string form rather
        // than crashing on `.toLowerCase()`.
        const message = String((isObject ? error.message : undefined) ?? '').toLowerCase();
        const retryableFragments = [
            'timeout',
            'network',
            'econnreset',
            'econnrefused',
            '500',
            '502',
            '503'
        ];
        return retryableFragments.some((fragment) => message.includes(fragment));
    }

    /**
     * @param {Error} error - Error raised for the request.
     * @param {object} request - Request with `url`, `retryCount` and `maxRetries`.
     * @returns {Promise<Error|null>} `null` after the backoff sleep when a retry
     *   should happen; otherwise the original error (non-retryable, or retries
     *   exhausted).
     */
    async processError(error, request) {
        if (!this.isRetryable(error)) {
            logger_1.logger.warn('scraper', `Non-retryable error for ${request.url}: ${error.message}`);
            return error;
        }
        if (request.retryCount < request.maxRetries) {
            // 1s, 2s, 4s, ... doubling per attempt, capped at 30s.
            const backoffDelay = Math.min(1000 * Math.pow(2, request.retryCount), 30000);
            logger_1.logger.info('scraper', `Retry ${request.retryCount + 1}/${request.maxRetries} for ${request.url} after ${backoffDelay}ms`);
            await sleep(backoffDelay);
            // Return null to indicate retry should happen
            return null;
        }
        logger_1.logger.error('scraper', `Max retries exceeded for ${request.url}`);
        return error;
    }
}
|
|
exports.RetryMiddleware = RetryMiddleware;
|
|
/**
 * Bot detection middleware.
 *
 * Scans response content for block-page phrases. A hit flags the request for
 * rotation (`botDetected`, `needsNewBrowser`) and, once the running hit count
 * reaches DETECTION_THRESHOLD, throws a BOT_DETECTION error. Clean responses
 * lower the hit count by 0.5.
 */
class BotDetectionMiddleware {
    // Class-level flag other code can read to learn whether the latest
    // inspected response looked like a bot block.
    static shouldRotateFingerprint = false;

    name = 'BotDetectionMiddleware';
    priority = 60;
    detectedCount = 0;
    DETECTION_THRESHOLD = 3;

    /**
     * @param {object} response - Response with `content`, `url` and `request.metadata`.
     * @returns {Promise<object>} The same response when below the threshold.
     * @throws {Error} With `type` BOT_DETECTION, `retryable` true and `request`
     *   set, once the hit count reaches DETECTION_THRESHOLD.
     */
    async processResponse(response) {
        // Non-string content is serialised so the same patterns can be applied.
        let text = response.content;
        if (typeof text !== 'string') {
            text = JSON.stringify(text);
        }

        // NOTE(review): several of these phrases are broad (e.g. "robot",
        // "cloudflare") and may also occur in ordinary pages - confirm the
        // false-positive rate is acceptable.
        const botIndicators = [
            /captcha/i,
            /cloudflare/i,
            /access denied/i,
            /you have been blocked/i,
            /unusual traffic/i,
            /robot/i,
            /verify.*human/i,
            /security check/i,
            /please wait/i,
            /checking your browser/i,
            /ray id/i
        ];
        let blocked = false;
        for (const indicator of botIndicators) {
            if (indicator.test(text)) {
                blocked = true;
                break;
            }
        }

        if (!blocked) {
            // Clean response: decay the hit count (half a step) and clear the flag.
            this.detectedCount = Math.max(0, this.detectedCount - 0.5);
            BotDetectionMiddleware.shouldRotateFingerprint = false;
            return response;
        }

        this.detectedCount++;
        BotDetectionMiddleware.shouldRotateFingerprint = true;
        logger_1.logger.warn('scraper', `Bot detection suspected (${this.detectedCount}/${this.DETECTION_THRESHOLD}): ${response.url}`);
        logger_1.logger.info('scraper', '🔄 Flagging for proxy/UA rotation on next request');

        // Flags read by the rotation middlewares on the next attempt.
        response.request.metadata.botDetected = true;
        response.request.metadata.needsNewBrowser = true;

        if (this.detectedCount < this.DETECTION_THRESHOLD) {
            return response;
        }

        // The hit count is not reset here, so further hits keep throwing until
        // enough clean responses bring it back under the threshold.
        throw Object.assign(new Error('Bot detection threshold reached - rotating fingerprint'), {
            type: types_1.ErrorType.BOT_DETECTION,
            retryable: true,
            request: response.request,
        });
    }

    /**
     * Turns a BOT_DETECTION error into a retry: flags the request, waits 5-10
     * seconds, and returns `null`. Any other error is passed through.
     *
     * @param {Error} error - Error raised for the request.
     * @param {object} request - The failed request.
     * @returns {Promise<Error|null>}
     */
    async processError(error, request) {
        const isBotBlock = 'type' in error && error.type === types_1.ErrorType.BOT_DETECTION;
        if (!isBotBlock) {
            return error;
        }
        request.metadata.botDetected = true;
        request.metadata.needsNewBrowser = true;
        logger_1.logger.info('scraper', '🔄 Bot detection error - will rotate proxy/UA on retry');
        // Pause before the retry so it does not hit the site immediately again.
        await sleep(5000 + Math.random() * 5000);
        return null; // null tells the engine the error was handled (retry)
    }
}
|
|
exports.BotDetectionMiddleware = BotDetectionMiddleware;
|
|
/**
 * Stealth mode middleware: marks every request as requiring stealth handling
 * by setting `metadata.requiresStealth`. It performs no other work itself.
 */
class StealthMiddleware {
    name = 'StealthMiddleware';
    priority = 95;

    /**
     * @param {object} request - Request with a `metadata` object.
     * @returns {Promise<object>} The same request, with `metadata.requiresStealth === true`.
     */
    async processRequest(request) {
        const { metadata } = request;
        metadata.requiresStealth = true;
        return request;
    }
}
|
|
exports.StealthMiddleware = StealthMiddleware;
|
|
/**
 * Middleware engine: keeps registered middlewares ordered by descending
 * `priority` and runs their optional hooks (`processRequest`,
 * `processResponse`, `processError`) one after another in that order.
 */
class MiddlewareEngine {
    middlewares = [];

    /**
     * Register a middleware and restore the priority ordering.
     *
     * @param {object} middleware - Object with a numeric `priority` and any of the hooks.
     */
    use(middleware) {
        const byPriorityDescending = (left, right) => right.priority - left.priority;
        this.middlewares.push(middleware);
        this.middlewares.sort(byPriorityDescending);
    }

    /**
     * Feed the request through every `processRequest` hook; each hook receives
     * the value returned by the previous one.
     *
     * @param {object} request
     * @returns {Promise<object>} Result of the last hook (or the input if none ran).
     */
    async processRequest(request) {
        let result = request;
        for (const mw of this.middlewares) {
            if (!mw.processRequest) {
                continue;
            }
            result = await mw.processRequest(result);
        }
        return result;
    }

    /**
     * Feed the response through every `processResponse` hook.
     *
     * Hooks run in the same (highest priority first) order as request hooks,
     * not in reverse.
     *
     * @param {object} response
     * @returns {Promise<object>} Result of the last hook (or the input if none ran).
     */
    async processResponse(response) {
        let result = response;
        for (const mw of this.middlewares) {
            if (!mw.processResponse) {
                continue;
            }
            result = await mw.processResponse(result);
        }
        return result;
    }

    /**
     * Offer the error to each `processError` hook in turn.
     *
     * A hook returning `null` marks the error as handled (e.g. a retry was
     * scheduled) and stops the chain; later middlewares are not consulted.
     *
     * @param {Error} error
     * @param {object} request - The request the error belongs to.
     * @returns {Promise<Error|null>} The remaining error, or `null` when handled.
     */
    async processError(error, request) {
        let remaining = error;
        for (const mw of this.middlewares) {
            // Skip middlewares without the hook, and stop calling hooks once
            // the error value has become falsy.
            if (!mw.processError || !remaining) {
                continue;
            }
            remaining = await mw.processError(remaining, request);
            if (remaining === null) {
                break;
            }
        }
        return remaining;
    }
}
|
|
exports.MiddlewareEngine = MiddlewareEngine;
|