fix(monitor): remove non-existent worker columns from job_run_logs query

The job_run_logs table tracks scheduled job orchestration, not individual
worker jobs. Worker info (worker_id, worker_hostname) belongs on
dispensary_crawl_jobs, not job_run_logs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-03 18:45:05 -07:00
parent 54f40d26bb
commit 66e07b2009
466 changed files with 84988 additions and 9226 deletions

View File

@@ -3,13 +3,31 @@ Object.defineProperty(exports, "__esModule", { value: true });
exports.MiddlewareEngine = exports.StealthMiddleware = exports.BotDetectionMiddleware = exports.RetryMiddleware = exports.RateLimitMiddleware = exports.ProxyMiddleware = exports.UserAgentMiddleware = void 0;
const types_1 = require("./types");
const logger_1 = require("../services/logger");
const migrate_1 = require("../db/migrate");
const proxy_1 = require("../services/proxy");
// Diverse, realistic user agents - updated for 2024/2025
// Every entry is listed exactly once and is comma-terminated; a stray
// duplicate of the Safari 17.1 string used to sit in the Firefox group
// without a trailing comma, which made the literal unparsable.
const USER_AGENTS = [
    // Chrome on Windows (most common)
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
    // Chrome on Mac
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    // Chrome on Linux
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    // Firefox
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14.0; rv:121.0) Gecko/20100101 Firefox/121.0',
    // Safari
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
    // Edge
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
];
function getRandomUserAgent() {
return USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)];
@@ -18,55 +36,100 @@ function sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}
/**
 * User Agent Rotation Middleware.
 *
 * Assigns a random User-Agent to any request that does not carry one yet,
 * and replaces the existing one when the request is a retry or has been
 * flagged with `metadata.botDetected`.
 */
class UserAgentMiddleware {
    name = 'UserAgentMiddleware';
    priority = 100;
    // Last User-Agent handed out by this instance; used to avoid assigning
    // the same value twice in a row.
    lastUserAgent = null;
    /**
     * Ensure the request has a User-Agent, rotating it when required.
     *
     * @param request - request object; reads `retryCount`,
     *   `metadata.botDetected` and `metadata.userAgent`, and may overwrite
     *   `metadata.userAgent`.
     * @returns the same request object.
     */
    async processRequest(request) {
        // Always rotate UA on retries or bot detection
        const forceRotation = request.retryCount > 0 || request.metadata.botDetected;
        if (!request.metadata.userAgent || forceRotation) {
            // Get a different UA than the last one used. The re-draw is capped
            // at 5 attempts so the loop always terminates, even if the random
            // source keeps returning the same value.
            let newUA = getRandomUserAgent();
            let attempts = 0;
            while (newUA === this.lastUserAgent && attempts < 5) {
                newUA = getRandomUserAgent();
                attempts++;
            }
            request.metadata.userAgent = newUA;
            this.lastUserAgent = newUA;
            if (forceRotation) {
                logger_1.logger.debug('scraper', `🔄 Rotated User-Agent: ${newUA.substring(0, 50)}...`);
            }
        }
        return request;
    }
}
exports.UserAgentMiddleware = UserAgentMiddleware;
// Domains that should skip proxy (datacenter IPs are blocked)
const PROXY_SKIP_DOMAINS = [
    'dutchie.com',
];
/**
 * Report whether a URL targets a host on the proxy skip list.
 *
 * A host matches when it equals a listed domain or is a subdomain of it
 * (for example `www.dutchie.com`). Look-alike hosts that merely contain the
 * text, such as `notdutchie.com` or `dutchie.com.evil.net`, do not match.
 *
 * @param {string} url - absolute URL of the request.
 * @returns {boolean} true when the proxy should be bypassed; false otherwise,
 *   including when `url` cannot be parsed.
 */
function shouldSkipProxy(url) {
    let hostname;
    try {
        hostname = new URL(url).hostname;
    }
    catch {
        // Unparsable URL: do not skip the proxy.
        return false;
    }
    // A fully-qualified host may carry a trailing dot; ignore it when comparing.
    const host = hostname.replace(/\.$/, '');
    return PROXY_SKIP_DOMAINS.some(domain => host === domain || host.endsWith(`.${domain}`));
}
/**
 * Proxy Rotation Middleware - uses the central proxy service with timeout handling.
 *
 * Proxies are obtained through `proxy_1.getActiveProxy()`. When a response or
 * error indicates bot detection, the proxy recorded on the request is reported
 * through `proxy_1.putProxyInTimeout()`.
 */
class ProxyMiddleware {
    name = 'ProxyMiddleware';
    priority = 90;
    // Id of the proxy most recently attached to a request by this instance.
    currentProxyId = null;
    /**
     * Attach a proxy to the request unless its domain is on the skip list.
     *
     * @param request - request object; reads `url`, `retryCount`,
     *   `metadata.botDetected` and `metadata.proxy`, and may set
     *   `metadata.proxy` and `metadata.proxyId`.
     * @returns the same request object.
     */
    async processRequest(request) {
        // Skip proxy for domains that block datacenter IPs
        if (shouldSkipProxy(request.url)) {
            logger_1.logger.info('scraper', `⏭️ Skipping proxy for ${new URL(request.url).hostname} (datacenter IPs blocked)`);
            return request;
        }
        // A proxy is fetched when none is set yet, and re-fetched on retries
        // or after bot detection so the next attempt can use a different one.
        const forceRotation = request.retryCount > 0 || request.metadata.botDetected;
        if (!request.metadata.proxy || forceRotation) {
            // Get proxy from central service
            const proxy = await (0, proxy_1.getActiveProxy)();
            if (proxy) {
                // Copy only the connection fields; the id is tracked separately
                // so it can be reported back on bot detection.
                request.metadata.proxy = {
                    host: proxy.host,
                    port: proxy.port,
                    protocol: proxy.protocol,
                    username: proxy.username,
                    password: proxy.password,
                };
                request.metadata.proxyId = proxy.id;
                this.currentProxyId = proxy.id;
                const reason = forceRotation ? 'rotation' : 'initial';
                logger_1.logger.info('scraper', `🔄 Using proxy (${reason}): ${proxy.protocol}://${proxy.host}:${proxy.port}`);
            }
            else {
                // Any proxy already on the request is left untouched.
                logger_1.logger.warn('scraper', '⚠️ No proxy available - running without proxy');
            }
        }
        return request;
    }
    /**
     * Report the request's proxy to the proxy service when the response was
     * flagged as bot-detected.
     *
     * @param response - response object; reads `request.metadata.botDetected`
     *   and `request.metadata.proxyId`.
     * @returns the same response object.
     */
    async processResponse(response) {
        // If bot detection was triggered, put the proxy in timeout
        if (response.request.metadata.botDetected && response.request.metadata.proxyId) {
            (0, proxy_1.putProxyInTimeout)(response.request.metadata.proxyId, 'Bot detection triggered');
            logger_1.logger.info('scraper', `🚫 Proxy ${response.request.metadata.proxyId} put in timeout due to bot detection`);
        }
        return response;
    }
    /**
     * Report the request's proxy to the proxy service when the error message
     * is classified as bot detection by `proxy_1.isBotDetectionError()`.
     *
     * @param error - error raised for the request; its `message` is inspected.
     * @param request - request object; reads `metadata.proxyId`.
     * @returns the same error object, so later handlers still see it.
     */
    async processError(error, request) {
        // If bot detection error, put proxy in timeout
        if ((0, proxy_1.isBotDetectionError)(error.message) && request.metadata.proxyId) {
            (0, proxy_1.putProxyInTimeout)(request.metadata.proxyId, error.message);
            logger_1.logger.info('scraper', `🚫 Proxy ${request.metadata.proxyId} put in timeout: ${error.message}`);
        }
        return error;
    }
}
exports.ProxyMiddleware = ProxyMiddleware;
/**
@@ -165,13 +228,15 @@ class RetryMiddleware {
}
exports.RetryMiddleware = RetryMiddleware;
/**
* Bot Detection Middleware - detects bot blocking and triggers fingerprint rotation
*/
class BotDetectionMiddleware {
name = 'BotDetectionMiddleware';
priority = 60;
detectedCount = 0;
DETECTION_THRESHOLD = 3;
// Export for use by other middlewares
static shouldRotateFingerprint = false;
async processResponse(response) {
const content = typeof response.content === 'string'
? response.content
@@ -183,14 +248,24 @@ class BotDetectionMiddleware {
/access denied/i,
/you have been blocked/i,
/unusual traffic/i,
/robot/i
/robot/i,
/verify.*human/i,
/security check/i,
/please wait/i,
/checking your browser/i,
/ray id/i
];
const detected = botIndicators.some(pattern => pattern.test(content));
if (detected) {
this.detectedCount++;
BotDetectionMiddleware.shouldRotateFingerprint = true;
logger_1.logger.warn('scraper', `Bot detection suspected (${this.detectedCount}/${this.DETECTION_THRESHOLD}): ${response.url}`);
logger_1.logger.info('scraper', '🔄 Flagging for proxy/UA rotation on next request');
// Mark the request for rotation on retry
response.request.metadata.botDetected = true;
response.request.metadata.needsNewBrowser = true;
if (this.detectedCount >= this.DETECTION_THRESHOLD) {
const error = new Error('Bot detection threshold reached');
const error = new Error('Bot detection threshold reached - rotating fingerprint');
error.type = types_1.ErrorType.BOT_DETECTION;
error.retryable = true;
error.request = response.request;
@@ -200,9 +275,22 @@ class BotDetectionMiddleware {
else {
// Gradually decrease detection count on successful requests
this.detectedCount = Math.max(0, this.detectedCount - 0.5);
BotDetectionMiddleware.shouldRotateFingerprint = false;
}
return response;
}
/**
 * Handle errors tagged as bot detection.
 *
 * Errors whose `type` is `ErrorType.BOT_DETECTION` flag the request for
 * rotation, wait a randomized 5-10 second back-off, and resolve to `null`.
 * Every other error is returned unchanged.
 *
 * @param error - error raised for the request.
 * @param request - request whose `metadata` receives the rotation flags.
 * @returns `null` for bot-detection errors, otherwise the original error.
 */
async processError(error, request) {
    const isBotDetection = 'type' in error && error.type === types_1.ErrorType.BOT_DETECTION;
    if (!isBotDetection) {
        return error;
    }
    // Flag the request so the retry rotates proxy and User-Agent.
    const meta = request.metadata;
    meta.botDetected = true;
    meta.needsNewBrowser = true;
    logger_1.logger.info('scraper', '🔄 Bot detection error - will rotate proxy/UA on retry');
    // Back off before the retry to avoid rate limiting.
    const backoffMs = 5000 + Math.random() * 5000;
    await sleep(backoffMs);
    // Return null to trigger retry
    return null;
}
}
exports.BotDetectionMiddleware = BotDetectionMiddleware;
/**