diff --git a/backend/src/scraper-v2/middlewares.ts b/backend/src/scraper-v2/middlewares.ts
index d3de1a95..62b36eed 100644
--- a/backend/src/scraper-v2/middlewares.ts
+++ b/backend/src/scraper-v2/middlewares.ts
@@ -69,6 +69,37 @@ export class UserAgentMiddleware implements Middleware {
   }
 }
 
+/**
+ * Domains whose requests bypass the proxy because they block datacenter IPs.
+ * Each entry matches the domain itself and any of its subdomains.
+ */
+const PROXY_SKIP_DOMAINS = [
+  'dutchie.com',
+];
+
+/**
+ * Reports whether a request URL targets a domain listed in PROXY_SKIP_DOMAINS.
+ *
+ * The hostname must equal a listed domain or be a subdomain of it; a plain
+ * substring test would also match unrelated hosts such as `notdutchie.com`
+ * or `dutchie.com.example.org`.
+ *
+ * @param url - URL of the outgoing request.
+ * @returns `true` when the proxy should be skipped; `false` otherwise,
+ *   including when `url` cannot be parsed.
+ */
+function shouldSkipProxy(url: string): boolean {
+  try {
+    const { hostname } = new URL(url);
+    return PROXY_SKIP_DOMAINS.some(
+      domain => hostname === domain || hostname.endsWith(`.${domain}`),
+    );
+  } catch {
+    // Unparseable URL: report no skip so the caller proceeds as usual.
+    return false;
+  }
+}
+
 /**
  * Proxy Rotation Middleware - uses the central proxy service with timeout handling
  */
@@ -79,6 +110,12 @@ export class ProxyMiddleware implements Middleware {
   private currentProxyId: number | null = null;
 
   async processRequest(request: ScraperRequest): Promise {
+    // Skip proxy for domains that block datacenter IPs
+    if (shouldSkipProxy(request.url)) {
+      logger.info('scraper', `⏭️ Skipping proxy for ${new URL(request.url).hostname} (datacenter IPs blocked)`);
+      return request;
+    }
+
     // Always try to use a proxy from the central proxy service
     // The service handles bot detection timeouts automatically
     const forceRotation = request.retryCount > 0 || request.metadata.botDetected;