Skip proxies for Dutchie - datacenter IPs are blocked
Dutchie blocks all our datacenter proxy IPs, returning empty/different content. Direct connection from pod IP works fine (100 products found). Added PROXY_SKIP_DOMAINS list for sites that block datacenter IPs. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -69,6 +69,20 @@ export class UserAgentMiddleware implements Middleware {
|
||||
}
|
||||
}
|
||||
|
||||
// Domains that should skip proxy (datacenter IPs are blocked).
// List bare domains in lowercase; subdomains of each entry are matched as well.
const PROXY_SKIP_DOMAINS = [
  'dutchie.com',
];

/**
 * Reports whether a request URL targets a host listed in PROXY_SKIP_DOMAINS.
 *
 * A host matches when it equals a listed domain or is a subdomain of it
 * (e.g. `dutchie.com` and `www.dutchie.com`). Matching is done on whole
 * dot-separated labels, so unrelated hosts that merely contain the text
 * (`notdutchie.com`, `dutchie.com.evil.example`) do not match.
 *
 * @param url - Absolute URL of the outgoing request.
 * @returns `true` when the host is on the skip list; `false` otherwise,
 *   including when `url` cannot be parsed as an absolute URL.
 */
function shouldSkipProxy(url: string): boolean {
  let hostname: string;
  try {
    // For http(s) URLs the parser already lowercases the hostname.
    hostname = new URL(url).hostname;
  } catch {
    // Unparseable input: fall through to the normal proxy path.
    return false;
  }

  // A fully-qualified host may carry a trailing root dot ("dutchie.com.").
  if (hostname.endsWith('.')) {
    hostname = hostname.slice(0, -1);
  }

  return PROXY_SKIP_DOMAINS.some(
    domain => hostname === domain || hostname.endsWith(`.${domain}`),
  );
}
|
||||
|
||||
/**
|
||||
* Proxy Rotation Middleware - uses the central proxy service with timeout handling
|
||||
*/
|
||||
@@ -79,6 +93,12 @@ export class ProxyMiddleware implements Middleware {
|
||||
private currentProxyId: number | null = null;
|
||||
|
||||
async processRequest(request: ScraperRequest): Promise<ScraperRequest> {
|
||||
// Skip proxy for domains that block datacenter IPs
|
||||
if (shouldSkipProxy(request.url)) {
|
||||
logger.info('scraper', `⏭️ Skipping proxy for ${new URL(request.url).hostname} (datacenter IPs blocked)`);
|
||||
return request;
|
||||
}
|
||||
|
||||
// Always try to use a proxy from the central proxy service
|
||||
// The service handles bot detection timeouts automatically
|
||||
const forceRotation = request.retryCount > 0 || request.metadata.botDetected;
|
||||
|
||||
Reference in New Issue
Block a user