fix(monitor): remove non-existent worker columns from job_run_logs query
The job_run_logs table tracks scheduled job orchestration, not individual worker jobs. Worker info (worker_id, worker_hostname) belongs on dispensary_crawl_jobs, not job_run_logs. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
156
backend/dist/scraper-v2/middlewares.js
vendored
156
backend/dist/scraper-v2/middlewares.js
vendored
@@ -3,13 +3,31 @@ Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.MiddlewareEngine = exports.StealthMiddleware = exports.BotDetectionMiddleware = exports.RetryMiddleware = exports.RateLimitMiddleware = exports.ProxyMiddleware = exports.UserAgentMiddleware = void 0;
|
||||
const types_1 = require("./types");
|
||||
const logger_1 = require("../services/logger");
|
||||
const migrate_1 = require("../db/migrate");
|
||||
const proxy_1 = require("../services/proxy");
|
||||
// Diverse, realistic user agents - updated for 2024/2025
//
// Pool that getRandomUserAgent() samples from uniformly. Entries are grouped
// by browser family; each string appears exactly once so that no single
// User-Agent is over-represented in the random draw.
const USER_AGENTS = [
    // Chrome on Windows (most common)
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
    // Chrome on Mac
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    // Chrome on Linux
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    // Firefox
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14.0; rv:121.0) Gecko/20100101 Firefox/121.0',
    // Safari
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
    // Edge
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
];
|
||||
function getRandomUserAgent() {
|
||||
return USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)];
|
||||
@@ -18,55 +36,100 @@ function sleep(ms) {
|
||||
return new Promise(resolve => setTimeout(resolve, ms));
|
||||
}
|
||||
/**
 * User Agent Rotation Middleware.
 *
 * Assigns a User-Agent to a request that has none, and replaces the current
 * one whenever the request is a retry (retryCount > 0) or has been flagged
 * with metadata.botDetected. A request that already carries a User-Agent and
 * is neither a retry nor flagged is passed through untouched.
 */
class UserAgentMiddleware {
    name = 'UserAgentMiddleware';
    priority = 100;
    // Most recent User-Agent handed out by this instance; used only to avoid
    // issuing the same string twice in a row.
    lastUserAgent = null;
    /**
     * Ensure the request carries a User-Agent, rotating it when required.
     *
     * @param {object} request - Request whose `metadata.userAgent` may be set
     *   or replaced; `retryCount` and `metadata.botDetected` are read.
     * @returns {Promise<object>} The same request object, possibly mutated.
     */
    async processRequest(request) {
        // Always rotate UA on retries or bot detection
        const forceRotation = request.retryCount > 0 || request.metadata.botDetected;
        if (!request.metadata.userAgent || forceRotation) {
            // Get a different UA than the last one used. The redraw is capped
            // at 5 attempts, so a repeated UA is still possible but the loop
            // always terminates.
            let newUA = getRandomUserAgent();
            let attempts = 0;
            while (newUA === this.lastUserAgent && attempts < 5) {
                newUA = getRandomUserAgent();
                attempts++;
            }
            request.metadata.userAgent = newUA;
            this.lastUserAgent = newUA;
            // Only log forced rotations; the initial assignment is silent.
            if (forceRotation) {
                logger_1.logger.debug('scraper', `🔄 Rotated User-Agent: ${newUA.substring(0, 50)}...`);
            }
        }
        return request;
    }
}
|
||||
exports.UserAgentMiddleware = UserAgentMiddleware;
|
||||
// Domains that should skip proxy (datacenter IPs are blocked)
const PROXY_SKIP_DOMAINS = [
    'dutchie.com',
];
/**
 * Decide whether a request URL must bypass the proxy.
 *
 * A URL matches when its hostname is exactly one of PROXY_SKIP_DOMAINS or a
 * subdomain of one. The comparison is anchored on a label boundary rather
 * than being a substring search, so look-alike hosts such as
 * "notdutchie.com" or "dutchie.com.example.org" are not treated as
 * skip-listed.
 *
 * @param {string} url - Absolute URL of the request.
 * @returns {boolean} true when the proxy should be skipped; false otherwise,
 *   including when `url` cannot be parsed as a URL.
 */
function shouldSkipProxy(url) {
    let hostname;
    try {
        // Normalise case and drop a fully-qualified trailing dot
        // ("dutchie.com.") so both spellings compare equal.
        hostname = new URL(url).hostname.toLowerCase().replace(/\.$/, '');
    }
    catch {
        // Unparseable URL: nothing to match against, so do not skip.
        return false;
    }
    return PROXY_SKIP_DOMAINS.some(domain => hostname === domain || hostname.endsWith(`.${domain}`));
}
|
||||
/**
 * Proxy Rotation Middleware - uses the central proxy service with timeout handling.
 *
 * On each request it attaches a proxy obtained from proxy_1.getActiveProxy()
 * unless the target host is skip-listed (shouldSkipProxy). On responses and
 * errors that indicate bot detection it reports the proxy that was used via
 * proxy_1.putProxyInTimeout().
 */
class ProxyMiddleware {
    name = 'ProxyMiddleware';
    priority = 90;
    // Id of the proxy most recently attached by processRequest, or null.
    currentProxyId = null;
    /**
     * Fetch one random active, anonymous proxy row directly from the database.
     *
     * processRequest does not call this method; it obtains proxies from the
     * central proxy service instead.
     *
     * @returns {Promise<object|null>} A row with host, port, protocol,
     *   username and password, or null when no row matches or the query fails
     *   (the failure is logged, not thrown).
     */
    async getActiveProxy() {
        try {
            const result = await migrate_1.pool.query(`
        SELECT host, port, protocol, username, password
        FROM proxies
        WHERE active = true AND is_anonymous = true
        ORDER BY RANDOM()
        LIMIT 1
      `);
            if (result.rows.length === 0) {
                return null;
            }
            return result.rows[0];
        }
        catch (error) {
            logger_1.logger.error('scraper', `Failed to get proxy: ${error}`);
            return null;
        }
    }
    /**
     * Attach a proxy to the request when one is needed.
     *
     * @param {object} request - Request to decorate; reads `url`,
     *   `retryCount`, `metadata.proxy` and `metadata.botDetected`, and may set
     *   `metadata.proxy` / `metadata.proxyId`.
     * @returns {Promise<object>} The same request object, possibly mutated.
     */
    async processRequest(request) {
        // Skip proxy for domains that block datacenter IPs. This check must
        // come first so that no proxy is ever assigned for such hosts.
        if (shouldSkipProxy(request.url)) {
            logger_1.logger.info('scraper', `⏭️ Skipping proxy for ${new URL(request.url).hostname} (datacenter IPs blocked)`);
            return request;
        }
        // Always try to use a proxy from the central proxy service
        // The service handles bot detection timeouts automatically
        const forceRotation = request.retryCount > 0 || request.metadata.botDetected;
        if (!request.metadata.proxy || forceRotation) {
            // Get proxy from central service - it handles timeouts automatically
            const proxy = await (0, proxy_1.getActiveProxy)();
            if (proxy) {
                // Copy only the connection fields; the id is tracked
                // separately so it can be reported on bot detection.
                request.metadata.proxy = {
                    host: proxy.host,
                    port: proxy.port,
                    protocol: proxy.protocol,
                    username: proxy.username,
                    password: proxy.password,
                };
                request.metadata.proxyId = proxy.id;
                this.currentProxyId = proxy.id;
                const reason = forceRotation ? 'rotation' : 'initial';
                logger_1.logger.info('scraper', `🔄 Using proxy (${reason}): ${proxy.protocol}://${proxy.host}:${proxy.port}`);
            }
            else {
                logger_1.logger.warn('scraper', '⚠️ No proxy available - running without proxy');
            }
        }
        return request;
    }
    /**
     * Report the proxy used for a response that was flagged as bot-detected.
     *
     * @param {object} response - Response whose `request.metadata` is inspected.
     * @returns {Promise<object>} The same response object, unchanged.
     */
    async processResponse(response) {
        // If bot detection was triggered, put the proxy in timeout
        if (response.request.metadata.botDetected && response.request.metadata.proxyId) {
            (0, proxy_1.putProxyInTimeout)(response.request.metadata.proxyId, 'Bot detection triggered');
            logger_1.logger.info('scraper', `🚫 Proxy ${response.request.metadata.proxyId} put in timeout due to bot detection`);
        }
        return response;
    }
    /**
     * Report the proxy used for a request that failed with a bot-detection error.
     *
     * @param {Error} error - The failure; its `message` is classified by
     *   proxy_1.isBotDetectionError.
     * @param {object} request - The failed request; reads `metadata.proxyId`.
     * @returns {Promise<Error>} The same error object, unchanged.
     */
    async processError(error, request) {
        // If bot detection error, put proxy in timeout
        if ((0, proxy_1.isBotDetectionError)(error.message) && request.metadata.proxyId) {
            (0, proxy_1.putProxyInTimeout)(request.metadata.proxyId, error.message);
            logger_1.logger.info('scraper', `🚫 Proxy ${request.metadata.proxyId} put in timeout: ${error.message}`);
        }
        return error;
    }
}
|
||||
exports.ProxyMiddleware = ProxyMiddleware;
|
||||
/**
|
||||
@@ -165,13 +228,15 @@ class RetryMiddleware {
|
||||
}
|
||||
exports.RetryMiddleware = RetryMiddleware;
|
||||
/**
|
||||
* Bot Detection Middleware
|
||||
* Bot Detection Middleware - detects bot blocking and triggers fingerprint rotation
|
||||
*/
|
||||
class BotDetectionMiddleware {
|
||||
name = 'BotDetectionMiddleware';
|
||||
priority = 60;
|
||||
detectedCount = 0;
|
||||
DETECTION_THRESHOLD = 3;
|
||||
// Export for use by other middlewares
|
||||
static shouldRotateFingerprint = false;
|
||||
async processResponse(response) {
|
||||
const content = typeof response.content === 'string'
|
||||
? response.content
|
||||
@@ -183,14 +248,24 @@ class BotDetectionMiddleware {
|
||||
/access denied/i,
|
||||
/you have been blocked/i,
|
||||
/unusual traffic/i,
|
||||
/robot/i
|
||||
/robot/i,
|
||||
/verify.*human/i,
|
||||
/security check/i,
|
||||
/please wait/i,
|
||||
/checking your browser/i,
|
||||
/ray id/i
|
||||
];
|
||||
const detected = botIndicators.some(pattern => pattern.test(content));
|
||||
if (detected) {
|
||||
this.detectedCount++;
|
||||
BotDetectionMiddleware.shouldRotateFingerprint = true;
|
||||
logger_1.logger.warn('scraper', `Bot detection suspected (${this.detectedCount}/${this.DETECTION_THRESHOLD}): ${response.url}`);
|
||||
logger_1.logger.info('scraper', '🔄 Flagging for proxy/UA rotation on next request');
|
||||
// Mark the request for rotation on retry
|
||||
response.request.metadata.botDetected = true;
|
||||
response.request.metadata.needsNewBrowser = true;
|
||||
if (this.detectedCount >= this.DETECTION_THRESHOLD) {
|
||||
const error = new Error('Bot detection threshold reached');
|
||||
const error = new Error('Bot detection threshold reached - rotating fingerprint');
|
||||
error.type = types_1.ErrorType.BOT_DETECTION;
|
||||
error.retryable = true;
|
||||
error.request = response.request;
|
||||
@@ -200,9 +275,22 @@ class BotDetectionMiddleware {
|
||||
else {
|
||||
// Gradually decrease detection count on successful requests
|
||||
this.detectedCount = Math.max(0, this.detectedCount - 0.5);
|
||||
BotDetectionMiddleware.shouldRotateFingerprint = false;
|
||||
}
|
||||
return response;
|
||||
}
|
||||
async processError(error, request) {
|
||||
// If bot detection error, flag for rotation and allow retry
|
||||
if ('type' in error && error.type === types_1.ErrorType.BOT_DETECTION) {
|
||||
request.metadata.botDetected = true;
|
||||
request.metadata.needsNewBrowser = true;
|
||||
logger_1.logger.info('scraper', '🔄 Bot detection error - will rotate proxy/UA on retry');
|
||||
// Add delay before retry to avoid rate limiting
|
||||
await sleep(5000 + Math.random() * 5000);
|
||||
return null; // Return null to trigger retry
|
||||
}
|
||||
return error;
|
||||
}
|
||||
}
|
||||
exports.BotDetectionMiddleware = BotDetectionMiddleware;
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user