/*
 * Commit notes:
 * - Remove cache_from/cache_to from CI (plugin bug splitting commas)
 * - Add preflight() method to CrawlRotator - tests proxy + anti-detect
 * - Add pre-task preflight check - workers MUST pass before executing
 * - Add releaseTask() to release tasks back to pending on preflight fail
 * - Rename proxy_test task to whoami for clarity
 *
 * 🤖 Generated with [Claude Code](https://claude.com/claude-code)
 * Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
 *
 * File info: 807 lines, 24 KiB, TypeScript
 */
/**
 * Crawl Rotator - Proxy & User Agent Rotation for Crawlers
 *
 * Updated: 2025-12-10 per workflow-12102025.md
 *
 * KEY BEHAVIORS (per workflow-12102025.md):
 * 1. Task determines WHAT work to do, proxy determines SESSION IDENTITY
 * 2. Proxy location (timezone) sets Accept-Language headers (always English)
 * 3. On 403: immediately get new IP, new fingerprint, retry
 * 4. After 3 consecutive 403s on same proxy with different fingerprints → disable proxy
 *
 * USER-AGENT GENERATION (per workflow-12102025.md):
 * - Device distribution: Mobile 62%, Desktop 36%, Tablet 2%
 * - Browser whitelist: Chrome, Safari, Edge, Firefox only
 * - UA sticks until IP rotates
 * - Failure = alert admin + stop crawl (no fallback)
 *
 * Uses intoli/user-agents for realistic UA generation with daily-updated data.
 *
 * Canonical location: src/services/crawl-rotator.ts
 */
|
|
|
|
import { Pool } from 'pg';
|
|
import UserAgent from 'user-agents';
|
|
import {
|
|
HTTPFingerprint,
|
|
generateHTTPFingerprint,
|
|
BrowserType,
|
|
} from './http-fingerprint';
|
|
|
|
// ============================================================
// UA CONSTANTS (per workflow-12102025.md)
// ============================================================

/**
 * Device-category distribution used when rolling a new fingerprint
 * (hardcoded per workflow-12102025.md).
 *
 * Values are percentages on a 0-100 scale:
 * Mobile 62%, Desktop 36%, Tablet 2%.
 */
const DEVICE_WEIGHTS = {
  mobile: 62,
  desktop: 36,
  tablet: 2,
} as const;
|
|
|
|
/**
 * Browser whitelist (per workflow-12102025.md).
 *
 * Only Chrome (67%), Safari (20%), Edge (6%) and Firefox (3%) are accepted.
 * Samsung Internet, Opera, and other niche browsers are filtered out.
 */
const ALLOWED_BROWSERS = ['Chrome', 'Safari', 'Edge', 'Firefox'] as const;
|
|
|
|
// ============================================================
// PROXY TYPES
// ============================================================

/**
 * One proxy endpoint plus the health counters the rotator keeps for it.
 */
export interface Proxy {
  id: number;
  host: string;
  port: number;
  /** Optional credentials; both must be set for them to be used in the proxy URL. */
  username?: string;
  password?: string;
  protocol: 'http' | 'https' | 'socks5';
  isActive: boolean;
  lastUsedAt: Date | null;
  /** General (non-403) failures such as timeouts or connection errors. */
  failureCount: number;
  successCount: number;
  avgResponseTimeMs: number | null;
  maxConnections: number;
  /**
   * Per workflow-12102025.md: Track consecutive 403s with different fingerprints.
   * After 3 consecutive 403s → disable proxy (it's burned).
   */
  consecutive403Count: number;
  // Location info - determines session headers per workflow-12102025.md
  city?: string;
  state?: string;
  country?: string;
  countryCode?: string;
  timezone?: string;
}
|
|
|
|
/**
 * Aggregate numbers describing the in-memory proxy pool.
 */
export interface ProxyStats {
  /** Number of proxies currently held in the rotation list. */
  totalProxies: number;
  /** As computed by ProxyRotator.getStats(): the SUM of maxConnections, i.e. connection capacity. */
  activeProxies: number;
  /** Proxies in the list whose failure or consecutive-403 counters reached the disable thresholds. */
  blockedProxies: number;
  /** Mean per-proxy success ratio (0..1) over proxies with at least one recorded outcome. */
  avgSuccessRate: number;
}
|
|
|
|
// ============================================================
// FINGERPRINT TYPE
// Per workflow-12102025.md: Full browser fingerprint from user-agents
// ============================================================

/**
 * Browser identity presented for one session: the generated user agent,
 * its screen/viewport metrics, and the request headers derived from it.
 */
export interface BrowserFingerprint {
  userAgent: string;
  platform: string;
  screenWidth: number;
  screenHeight: number;
  viewportWidth: number;
  viewportHeight: number;
  deviceCategory: string;
  browserName: string; // Per workflow-12102025.md: for session logging
  // Derived headers for anti-detect
  acceptLanguage: string;
  /** Client-hint headers; absent when none were derived for the user agent. */
  secChUa?: string;
  secChUaPlatform?: string;
  secChUaMobile?: string;
  // Per workflow-12102025.md: HTTP Fingerprinting section
  httpFingerprint: HTTPFingerprint;
}
|
|
|
|
/**
 * Per workflow-12102025.md: Session log entry for debugging blocked sessions.
 * Captures which identity (device, browser, UA) was paired with which proxy IP
 * and when that pairing started.
 */
export interface UASessionLog {
  deviceCategory: string;
  browserName: string;
  userAgent: string;
  /** Proxy host in use when the fingerprint was generated, or null if none was supplied. */
  proxyIp: string | null;
  sessionStartedAt: Date;
}
|
|
|
|
// ============================================================
// PROXY ROTATOR CLASS
// ============================================================

/**
 * Round-robin rotation over the active proxies, with health tracking.
 *
 * The list is held in memory and, when a pg pool is configured, mirrored to
 * the `proxies` table. Proxies are dropped from rotation after 3 consecutive
 * 403s (markBlocked) or 5 general failures (markFailed).
 */
export class ProxyRotator {
  private pool: Pool | null = null;
  private proxies: Proxy[] = [];
  private currentIndex: number = 0;
  private lastRotation: Date = new Date();

  /**
   * @param pool Optional pg pool. Without one, nothing is loaded or persisted.
   */
  constructor(pool?: Pool) {
    this.pool = pool || null;
  }

  /** Attach (or replace) the database pool used for loading and persisting. */
  setPool(pool: Pool): void {
    this.pool = pool;
  }

  /**
   * Load proxies from database.
   *
   * Replaces the in-memory list with all rows where `active = true`, ordered
   * by fewest failures, then least recently tested. On query failure the list
   * is emptied and a warning is logged; nothing is thrown.
   */
  async loadProxies(): Promise<void> {
    if (!this.pool) {
      console.warn('[ProxyRotator] No database pool configured');
      return;
    }

    try {
      const result = await this.pool.query<Proxy>(`
        SELECT
          id,
          host,
          port,
          username,
          password,
          protocol,
          active as "isActive",
          last_tested_at as "lastUsedAt",
          failure_count as "failureCount",
          0 as "successCount",
          response_time_ms as "avgResponseTimeMs",
          COALESCE(max_connections, 1) as "maxConnections",
          COALESCE(consecutive_403_count, 0) as "consecutive403Count",
          city,
          state,
          country,
          country_code as "countryCode",
          timezone
        FROM proxies
        WHERE active = true
        ORDER BY failure_count ASC, last_tested_at ASC NULLS FIRST
      `);

      this.proxies = result.rows;

      const totalCapacity = this.proxies.reduce((sum, p) => sum + p.maxConnections, 0);
      console.log(`[ProxyRotator] Loaded ${this.proxies.length} active proxies (${totalCapacity} max concurrent connections)`);
    } catch (error) {
      console.warn(`[ProxyRotator] Could not load proxies: ${error}`);
      this.proxies = [];
    }

    // A reload can return fewer rows than before; never leave the cursor past the end.
    if (this.currentIndex >= this.proxies.length) {
      this.currentIndex = 0;
    }
  }

  /**
   * Get next proxy in rotation.
   *
   * Advances the cursor by one (wrapping around) and returns that proxy.
   * @returns The new current proxy, or null when the list is empty.
   */
  getNext(): Proxy | null {
    if (this.proxies.length === 0) return null;

    this.currentIndex = (this.currentIndex + 1) % this.proxies.length;
    this.lastRotation = new Date();

    return this.proxies[this.currentIndex] ?? null;
  }

  /**
   * Get current proxy without rotating.
   * @returns The proxy under the cursor, or null when none is available.
   */
  getCurrent(): Proxy | null {
    if (this.proxies.length === 0) return null;
    // `?? null` keeps the declared contract even if the cursor were ever stale.
    return this.proxies[this.currentIndex] ?? null;
  }

  /**
   * Get proxy by ID.
   * @returns The matching in-rotation proxy, or null if it is not in the list.
   */
  getById(id: number): Proxy | null {
    return this.proxies.find(p => p.id === id) || null;
  }

  /**
   * Rotate to a specific proxy.
   * @returns true if the proxy was found and made current, false otherwise.
   */
  setProxy(id: number): boolean {
    const index = this.proxies.findIndex(p => p.id === id);
    if (index === -1) return false;

    this.currentIndex = index;
    this.lastRotation = new Date();
    return true;
  }

  /**
   * Drop a proxy from the rotation list and keep the cursor valid.
   *
   * - Removing an entry before the cursor shifts the cursor back so the
   *   current proxy stays the same.
   * - Removing the current entry leaves the cursor on the entry that followed
   *   it, wrapping to the start when the removed entry was last.
   */
  private removeFromRotation(proxyId: number): void {
    const removedIndex = this.proxies.findIndex(p => p.id === proxyId);
    if (removedIndex === -1) return;

    this.proxies = this.proxies.filter(p => p.id !== proxyId);

    if (removedIndex < this.currentIndex) {
      this.currentIndex--;
    }
    if (this.currentIndex >= this.proxies.length) {
      this.currentIndex = 0;
    }
  }

  /**
   * Mark proxy as blocked (403 received)
   * Per workflow-12102025.md:
   * - Increment consecutive_403_count
   * - After 3 consecutive 403s with different fingerprints → disable proxy
   * - This is separate from general failures (timeouts, etc.)
   *
   * The database row is updated even when the proxy is not in the in-memory
   * list. Database errors are logged, not thrown.
   *
   * @returns true if this call removed the proxy from the in-memory rotation.
   */
  async markBlocked(proxyId: number): Promise<boolean> {
    const proxy = this.proxies.find(p => p.id === proxyId);
    let shouldDisable = false;

    if (proxy) {
      proxy.consecutive403Count++;

      // Per workflow-12102025.md: 3 consecutive 403s → proxy is burned
      if (proxy.consecutive403Count >= 3) {
        proxy.isActive = false;
        this.removeFromRotation(proxyId);
        console.log(`[ProxyRotator] Proxy ${proxyId} DISABLED after ${proxy.consecutive403Count} consecutive 403s (burned)`);
        shouldDisable = true;
      } else {
        console.log(`[ProxyRotator] Proxy ${proxyId} blocked (403 #${proxy.consecutive403Count}/3)`);
      }
    }

    // Update database
    if (this.pool) {
      try {
        // SET expressions see the pre-update row, so ">= 2" means "this 403 is the third".
        await this.pool.query(`
          UPDATE proxies
          SET
            consecutive_403_count = COALESCE(consecutive_403_count, 0) + 1,
            last_failure_at = NOW(),
            test_result = '403 Forbidden',
            active = CASE WHEN COALESCE(consecutive_403_count, 0) >= 2 THEN false ELSE active END,
            updated_at = NOW()
          WHERE id = $1
        `, [proxyId]);
      } catch (err) {
        console.error(`[ProxyRotator] Failed to update proxy ${proxyId}:`, err);
      }
    }

    return shouldDisable;
  }

  /**
   * Mark proxy as failed (general error - timeout, connection error, etc.)
   * Separate from 403 blocking per workflow-12102025.md
   *
   * After 5 general failures the proxy is removed from the in-memory rotation
   * and deactivated in the database. Database errors are logged, not thrown.
   *
   * @param error Short description stored as `test_result` (defaults to 'failed').
   */
  async markFailed(proxyId: number, error?: string): Promise<void> {
    const proxy = this.proxies.find(p => p.id === proxyId);
    if (proxy) {
      proxy.failureCount++;

      // Deactivate if too many general failures
      if (proxy.failureCount >= 5) {
        proxy.isActive = false;
        this.removeFromRotation(proxyId);
        console.log(`[ProxyRotator] Proxy ${proxyId} deactivated after ${proxy.failureCount} general failures`);
      }
    }

    if (this.pool) {
      try {
        // SET expressions see the pre-update row, so ">= 4" means "this failure is the fifth".
        await this.pool.query(`
          UPDATE proxies
          SET
            failure_count = failure_count + 1,
            updated_at = NOW(),
            test_result = $2,
            active = CASE WHEN failure_count >= 4 THEN false ELSE active END
          WHERE id = $1
        `, [proxyId, error || 'failed']);
      } catch (err) {
        console.error(`[ProxyRotator] Failed to update proxy ${proxyId}:`, err);
      }
    }
  }

  /**
   * Mark proxy as successful - resets consecutive 403 count
   * Per workflow-12102025.md: successful request clears the 403 counter
   *
   * When a response time is supplied it is blended into the running average
   * (80% old / 20% new). When it is omitted the stored average is left as is.
   * Database errors are logged, not thrown.
   */
  async markSuccess(proxyId: number, responseTimeMs?: number): Promise<void> {
    const proxy = this.proxies.find(p => p.id === proxyId);
    if (proxy) {
      proxy.successCount++;
      proxy.consecutive403Count = 0; // Reset on success per workflow-12102025.md
      proxy.lastUsedAt = new Date();
      if (responseTimeMs !== undefined) {
        proxy.avgResponseTimeMs = proxy.avgResponseTimeMs == null
          ? responseTimeMs
          : (proxy.avgResponseTimeMs * 0.8) + (responseTimeMs * 0.2);
      }
    }

    if (this.pool) {
      try {
        // COALESCE keeps the stored value when no timing was supplied ($2 is NULL);
        // without it the CASE evaluates to NULL and wipes response_time_ms.
        await this.pool.query(`
          UPDATE proxies
          SET
            last_tested_at = NOW(),
            test_result = 'success',
            consecutive_403_count = 0,
            response_time_ms = COALESCE(
              CASE
                WHEN response_time_ms IS NULL THEN $2
                ELSE (response_time_ms * 0.8 + $2 * 0.2)::integer
              END,
              response_time_ms
            ),
            updated_at = NOW()
          WHERE id = $1
        `, [proxyId, responseTimeMs ?? null]); // `??` so a genuine 0 ms is not turned into NULL
      } catch (err) {
        console.error(`[ProxyRotator] Failed to update proxy ${proxyId}:`, err);
      }
    }
  }

  /**
   * Get proxy URL for HTTP client.
   *
   * Credentials are included only when both username and password are set,
   * and are percent-encoded so characters such as `@`, `:` or `/` cannot
   * corrupt the URL.
   *
   * @returns `protocol://[user:pass@]host:port`
   */
  getProxyUrl(proxy: Proxy): string {
    const auth = proxy.username && proxy.password
      ? `${encodeURIComponent(proxy.username)}:${encodeURIComponent(proxy.password)}@`
      : '';
    return `${proxy.protocol}://${auth}${proxy.host}:${proxy.port}`;
  }

  /**
   * Get stats about proxy pool.
   *
   * Note: `activeProxies` is the sum of `maxConnections` over the list
   * (connection capacity), not a count of proxies.
   */
  getStats(): ProxyStats {
    const totalProxies = this.proxies.length;
    const activeProxies = this.proxies.reduce((sum, p) => sum + p.maxConnections, 0);
    const blockedProxies = this.proxies.filter(p => p.failureCount >= 5 || p.consecutive403Count >= 3).length;

    // Only proxies with at least one recorded outcome contribute to the average.
    const successRates = this.proxies
      .filter(p => p.successCount + p.failureCount > 0)
      .map(p => p.successCount / (p.successCount + p.failureCount));

    const avgSuccessRate = successRates.length > 0
      ? successRates.reduce((a, b) => a + b, 0) / successRates.length
      : 0;

    return {
      totalProxies,
      activeProxies,
      blockedProxies,
      avgSuccessRate,
    };
  }

  /** @returns true when at least one proxy is in the rotation list. */
  hasAvailableProxies(): boolean {
    return this.proxies.length > 0;
  }
}
|
|
|
|
// ============================================================
// USER AGENT ROTATOR CLASS
// Per workflow-12102025.md: Uses intoli/user-agents for realistic fingerprints
// ============================================================

/**
 * Generates and holds the browser fingerprint for the current session.
 *
 * A fingerprint is created in the constructor and replaced on every rotate().
 */
export class UserAgentRotator {
  /**
   * Product tokens of browsers that are NOT on the whitelist but whose user
   * agents also contain "Chrome/" and "Safari/". They must be recognised
   * first, otherwise they would be reported as Chrome and pass the whitelist.
   */
  private static readonly NICHE_BROWSER_PATTERN =
    /(?:SamsungBrowser|OPR|OPT|Opera|UCBrowser|YaBrowser|Vivaldi|MiuiBrowser|HuaweiBrowser|DuckDuckGo)\//;

  private currentFingerprint: BrowserFingerprint | null = null;
  private sessionLog: UASessionLog | null = null;

  constructor() {
    // Per workflow-12102025.md: Initialize with first fingerprint
    this.rotate();
  }

  /**
   * Per workflow-12102025.md: Roll device category based on distribution
   * Mobile: 62%, Desktop: 36%, Tablet: 2%
   */
  private rollDeviceCategory(): 'mobile' | 'desktop' | 'tablet' {
    const roll = Math.random() * 100;
    if (roll < DEVICE_WEIGHTS.mobile) {
      return 'mobile';
    } else if (roll < DEVICE_WEIGHTS.mobile + DEVICE_WEIGHTS.desktop) {
      return 'desktop';
    } else {
      return 'tablet';
    }
  }

  /**
   * Per workflow-12102025.md: Extract browser name from UA string
   *
   * @returns 'Edge', 'Firefox', 'Safari' or 'Chrome', or 'Unknown' for
   *          anything else, including Chromium-based niche browsers
   *          (Samsung Internet, Opera, ...).
   */
  private extractBrowserName(userAgent: string): string {
    // Must run before the Chrome/Safari checks: these UAs contain both tokens.
    if (UserAgentRotator.NICHE_BROWSER_PATTERN.test(userAgent)) return 'Unknown';
    if (userAgent.includes('Edg/')) return 'Edge';
    if (userAgent.includes('Firefox/')) return 'Firefox';
    // Chrome UAs also carry "Safari/", so Safari means "Safari/ without Chrome/".
    if (userAgent.includes('Safari/') && !userAgent.includes('Chrome/')) return 'Safari';
    if (userAgent.includes('Chrome/')) return 'Chrome';
    return 'Unknown';
  }

  /**
   * Per workflow-12102025.md: Check if browser is in whitelist
   */
  private isAllowedBrowser(userAgent: string): boolean {
    const browserName = this.extractBrowserName(userAgent);
    return ALLOWED_BROWSERS.includes(browserName as typeof ALLOWED_BROWSERS[number]);
  }

  /**
   * Generate a new random fingerprint
   * Per workflow-12102025.md:
   * - Roll device category (62/36/2)
   * - Filter to top 4 browsers only
   * - Failure = alert admin + stop (no fallback)
   *
   * @param proxyIp Proxy host recorded in the session log (null when omitted).
   * @returns The new fingerprint, which also becomes the current one.
   * @throws Error when no whitelisted browser is produced within 50 attempts.
   */
  rotate(proxyIp?: string): BrowserFingerprint {
    // Per workflow-12102025.md: Roll device category
    const deviceCategory = this.rollDeviceCategory();

    // Per workflow-12102025.md: Generate UA filtered to device category
    const generator = new UserAgent({ deviceCategory });

    // Per workflow-12102025.md: Try to get an allowed browser (max 50 attempts)
    let ua: ReturnType<typeof generator>;
    let attempts = 0;
    const maxAttempts = 50;

    do {
      ua = generator();
      attempts++;
    } while (!this.isAllowedBrowser(ua.data.userAgent) && attempts < maxAttempts);

    // Per workflow-12102025.md: If we can't get allowed browser, this is a failure
    if (!this.isAllowedBrowser(ua.data.userAgent)) {
      const errorMsg = `[UserAgentRotator] CRITICAL: Failed to generate allowed browser after ${maxAttempts} attempts. Device: ${deviceCategory}. Last UA: ${ua.data.userAgent}`;
      console.error(errorMsg);
      // Per workflow-12102025.md: Alert admin + stop crawl
      // TODO: Post alert to admin dashboard
      throw new Error(errorMsg);
    }

    const data = ua.data;
    const browserName = this.extractBrowserName(data.userAgent);

    // Build sec-ch-ua headers from user agent string
    const secChUa = this.buildSecChUa(data.userAgent, deviceCategory);

    // Per workflow-12102025.md: HTTP Fingerprinting - generate full HTTP fingerprint
    const httpFingerprint = generateHTTPFingerprint(browserName as BrowserType);

    this.currentFingerprint = {
      userAgent: data.userAgent,
      platform: data.platform,
      screenWidth: data.screenWidth,
      screenHeight: data.screenHeight,
      viewportWidth: data.viewportWidth,
      viewportHeight: data.viewportHeight,
      deviceCategory: data.deviceCategory,
      browserName, // Per workflow-12102025.md: for session logging
      // Per workflow-12102025.md: always English
      acceptLanguage: 'en-US,en;q=0.9',
      ...secChUa,
      // Per workflow-12102025.md: HTTP Fingerprinting section
      httpFingerprint,
    };

    // Per workflow-12102025.md: Log session data
    this.sessionLog = {
      deviceCategory,
      browserName,
      userAgent: data.userAgent,
      proxyIp: proxyIp || null,
      sessionStartedAt: new Date(),
    };

    console.log(`[UserAgentRotator] New fingerprint: device=${deviceCategory}, browser=${browserName}, UA=${data.userAgent.slice(0, 50)}...`);
    return this.currentFingerprint;
  }

  /**
   * Get current fingerprint without rotating
   * (generates one first if none exists yet).
   */
  getCurrent(): BrowserFingerprint {
    if (!this.currentFingerprint) {
      return this.rotate();
    }
    return this.currentFingerprint;
  }

  /**
   * Get a random fingerprint (rotates and returns)
   */
  getRandom(proxyIp?: string): BrowserFingerprint {
    return this.rotate(proxyIp);
  }

  /**
   * Per workflow-12102025.md: Get session log for debugging
   */
  getSessionLog(): UASessionLog | null {
    return this.sessionLog;
  }

  /**
   * Map a user agent string to the quoted value used for sec-ch-ua-platform.
   *
   * iPhone/iPad are tested before "Mac" because iOS user agents contain
   * "like Mac OS X"; anything unrecognised falls back to "Linux".
   */
  private detectClientHintPlatform(userAgent: string): string {
    if (userAgent.includes('Windows')) return '"Windows"';
    if (userAgent.includes('Android')) return '"Android"';
    if (userAgent.includes('iPhone') || userAgent.includes('iPad')) return '"iOS"';
    if (userAgent.includes('Mac')) return '"macOS"';
    if (userAgent.includes('CrOS')) return '"Chrome OS"';
    return '"Linux"';
  }

  /**
   * Build sec-ch-ua headers from user agent string
   * Per workflow-12102025.md: Include mobile indicator based on device category
   *
   * @returns The three client-hint header values for Edge and Chrome user
   *          agents, or an empty object for anything else.
   */
  private buildSecChUa(userAgent: string, deviceCategory: string): { secChUa?: string; secChUaPlatform?: string; secChUaMobile?: string } {
    const isMobile = deviceCategory === 'mobile' || deviceCategory === 'tablet';

    // Edge UAs also contain "Chrome/", so the Edge token takes precedence.
    const edgeMatch = userAgent.match(/Edg\/(\d+)/);
    const chromeMatch = userAgent.match(/Chrome\/(\d+)/);

    const brand = edgeMatch ? 'Microsoft Edge' : chromeMatch ? 'Google Chrome' : null;
    const version = (edgeMatch ?? chromeMatch)?.[1];

    if (!brand || !version) {
      // Firefox/Safari don't send sec-ch-ua
      return {};
    }

    return {
      secChUa: `"${brand}";v="${version}", "Chromium";v="${version}", "Not_A Brand";v="24"`,
      secChUaPlatform: this.detectClientHintPlatform(userAgent),
      secChUaMobile: isMobile ? '?1' : '?0',
    };
  }

  /** Always 1: user agents are generated on demand rather than drawn from a fixed list. */
  getCount(): number {
    return 1; // user-agents generates dynamically
  }
}
|
|
|
|
// ============================================================
// COMBINED ROTATOR
// Per workflow-12102025.md: Coordinates proxy + fingerprint rotation
// ============================================================

/**
 * Facade that owns one ProxyRotator and one UserAgentRotator and keeps the
 * two in step (new IP + new fingerprint on a 403, outcome recording against
 * the current proxy, and a preflight connectivity check).
 */
export class CrawlRotator {
  public proxy: ProxyRotator;
  public userAgent: UserAgentRotator;

  /**
   * @param pool Optional pg pool handed to the internal ProxyRotator.
   */
  constructor(pool?: Pool) {
    this.proxy = new ProxyRotator(pool);
    this.userAgent = new UserAgentRotator();
  }

  /** Load the proxy list from the database. */
  async initialize(): Promise<void> {
    await this.proxy.loadProxies();
  }

  /**
   * Rotate proxy only (get new IP)
   */
  rotateProxy(): Proxy | null {
    return this.proxy.getNext();
  }

  /**
   * Rotate fingerprint only (new UA, screen size, etc.)
   *
   * The current proxy's host is passed along so the session log still records
   * which IP the new fingerprint is paired with.
   */
  rotateFingerprint(): BrowserFingerprint {
    const current = this.proxy.getCurrent();
    return this.userAgent.rotate(current ? current.host : undefined);
  }

  /**
   * Rotate both proxy and fingerprint
   * Per workflow-12102025.md: called on 403 for fresh identity
   * Passes proxy IP to UA rotation for session logging
   */
  rotateBoth(): { proxy: Proxy | null; fingerprint: BrowserFingerprint } {
    const proxy = this.proxy.getNext();
    const proxyIp = proxy ? proxy.host : undefined;
    return {
      proxy,
      fingerprint: this.userAgent.rotate(proxyIp),
    };
  }

  /**
   * Get current proxy and fingerprint without rotating
   */
  getCurrent(): { proxy: Proxy | null; fingerprint: BrowserFingerprint } {
    return {
      proxy: this.proxy.getCurrent(),
      fingerprint: this.userAgent.getCurrent(),
    };
  }

  /**
   * Record success for current proxy
   * Per workflow-12102025.md: resets consecutive 403 count
   */
  async recordSuccess(responseTimeMs?: number): Promise<void> {
    const current = this.proxy.getCurrent();
    if (current) {
      await this.proxy.markSuccess(current.id, responseTimeMs);
    }
  }

  /**
   * Record 403 block for current proxy
   * Per workflow-12102025.md: increments consecutive_403_count, disables after 3
   * Returns true if proxy was disabled
   */
  async recordBlock(): Promise<boolean> {
    const current = this.proxy.getCurrent();
    if (current) {
      return await this.proxy.markBlocked(current.id);
    }
    return false;
  }

  /**
   * Record general failure (not 403)
   */
  async recordFailure(error?: string): Promise<void> {
    const current = this.proxy.getCurrent();
    if (current) {
      await this.proxy.markFailed(current.id, error);
    }
  }

  /**
   * Get current proxy location info
   * Per workflow-12102025.md: proxy location determines session headers
   *
   * `isRotating` is true when the proxy allows more than one connection.
   * @returns Location fields of the current proxy, or null when there is none.
   */
  getProxyLocation(): { city?: string; state?: string; country?: string; timezone?: string; isRotating: boolean } | null {
    const current = this.proxy.getCurrent();
    if (!current) return null;

    const isRotating = current.maxConnections > 1;

    return {
      city: current.city,
      state: current.state,
      country: current.country,
      timezone: current.timezone,
      isRotating
    };
  }

  /**
   * Get timezone from current proxy
   * Per workflow-12102025.md: used for Accept-Language header
   */
  getProxyTimezone(): string | undefined {
    const current = this.proxy.getCurrent();
    return current?.timezone;
  }

  /**
   * Preflight check - verifies proxy and anti-detect are working
   * MUST be called before any task execution to ensure anonymity.
   *
   * Checks, in the order they run:
   * 1. Proxy available - a proxy must be loaded and active
   * 2. Anti-detect headers - a fingerprint with a user agent must exist
   * 3. Proxy connectivity - GET https://httpbin.org/ip through the proxy
   *
   * The outcome of check 3 is recorded on the proxy via markSuccess/markFailed.
   * Failures are reported in the result; connection errors are not thrown.
   *
   * NOTE(review): the request always goes through HttpsProxyAgent, whatever the
   * proxy's `protocol` is - confirm that socks5 proxies are expected to work here.
   *
   * @returns Promise<PreflightResult> with pass/fail status and details
   */
  async preflight(): Promise<PreflightResult> {
    const result: PreflightResult = {
      passed: false,
      proxyAvailable: false,
      proxyConnected: false,
      antidetectReady: false,
      proxyIp: null,
      fingerprint: null,
      error: null,
      responseTimeMs: null,
    };

    // Step 1: Check proxy is available
    const currentProxy = this.proxy.getCurrent();
    if (!currentProxy) {
      result.error = 'No proxy available';
      console.log('[Preflight] FAILED - No proxy available');
      return result;
    }
    result.proxyAvailable = true;
    result.proxyIp = currentProxy.host;

    // Step 2: Check fingerprint/anti-detect is ready
    const fingerprint = this.userAgent.getCurrent();
    if (!fingerprint || !fingerprint.userAgent) {
      result.error = 'Anti-detect fingerprint not initialized';
      console.log('[Preflight] FAILED - No fingerprint');
      return result;
    }
    result.antidetectReady = true;
    result.fingerprint = {
      userAgent: fingerprint.userAgent,
      browserName: fingerprint.browserName,
      deviceCategory: fingerprint.deviceCategory,
    };

    // Step 3: Test proxy connectivity with an actual HTTP request
    // Use httpbin.org/ip to verify request goes through proxy
    const proxyUrl = this.proxy.getProxyUrl(currentProxy);
    const testUrl = 'https://httpbin.org/ip';

    try {
      const { default: axios } = await import('axios');
      const { HttpsProxyAgent } = await import('https-proxy-agent');

      const agent = new HttpsProxyAgent(proxyUrl);
      const startTime = Date.now();

      await axios.get(testUrl, {
        httpsAgent: agent,
        // The agent does the proxying; stop axios from also applying a proxy
        // taken from HTTP(S)_PROXY environment variables.
        proxy: false,
        timeout: 15000, // 15 second timeout
        headers: {
          'User-Agent': fingerprint.userAgent,
          'Accept-Language': fingerprint.acceptLanguage,
          ...(fingerprint.secChUa && { 'sec-ch-ua': fingerprint.secChUa }),
          ...(fingerprint.secChUaPlatform && { 'sec-ch-ua-platform': fingerprint.secChUaPlatform }),
          ...(fingerprint.secChUaMobile && { 'sec-ch-ua-mobile': fingerprint.secChUaMobile }),
        },
      });

      result.responseTimeMs = Date.now() - startTime;
      result.proxyConnected = true;
      result.passed = true;

      // Mark success on proxy stats
      await this.proxy.markSuccess(currentProxy.id, result.responseTimeMs);

      console.log(`[Preflight] PASSED - Proxy ${currentProxy.host} connected (${result.responseTimeMs}ms), UA: ${fingerprint.browserName}/${fingerprint.deviceCategory}`);
    } catch (err: unknown) {
      // Anything can be thrown; only Error instances are guaranteed a message.
      const message = err instanceof Error ? err.message : String(err);
      result.error = `Proxy connection failed: ${message || 'Unknown error'}`;
      console.log(`[Preflight] FAILED - Proxy connection error: ${message}`);

      // Mark failure on proxy stats
      await this.proxy.markFailed(currentProxy.id, message);
    }

    return result;
  }
}
|
|
|
|
/**
 * Result from preflight check
 */
export interface PreflightResult {
  /** Overall pass/fail; true only when the request through the proxy succeeded. */
  passed: boolean;
  /** Is a proxy loaded? */
  proxyAvailable: boolean;
  /** Did HTTP request through proxy succeed? */
  proxyConnected: boolean;
  /** Is fingerprint/anti-detect ready? */
  antidetectReady: boolean;
  /** Current proxy IP (the proxy's host), or null when no proxy was available. */
  proxyIp: string | null;
  /** Fingerprint summary */
  fingerprint: { userAgent: string; browserName: string; deviceCategory: string } | null;
  /** Error message if failed */
  error: string | null;
  /** Proxy response time in ms */
  responseTimeMs: number | null;
}
|
|
|
|
// ============================================================
// SINGLETON INSTANCES
// ============================================================

// Module-level shared instances, created without a database pool.
// Note: crawlRotator constructs its own internal ProxyRotator and
// UserAgentRotator, so the three exports below are independent objects -
// state recorded on proxyRotator is not visible through crawlRotator.proxy.
export const proxyRotator = new ProxyRotator();
export const userAgentRotator = new UserAgentRotator();
export const crawlRotator = new CrawlRotator();
|