feat(tasks): Refactor task workflow with payload/refresh separation
Major changes: - Split crawl into payload_fetch (API → disk) and product_refresh (disk → DB) - Add task chaining: store_discovery → product_discovery → payload_fetch → product_refresh - Add payload storage utilities for gzipped JSON on filesystem - Add /api/payloads endpoints for payload access and diffing - Add DB-driven TaskScheduler with schedule persistence - Track newDispensaryIds through discovery promotion for chaining - Add stealth improvements: HTTP fingerprinting, proxy rotation enhancements - Add Workers dashboard K8s scaling controls New files: - src/tasks/handlers/payload-fetch.ts - Fetches from API, saves to disk - src/services/task-scheduler.ts - DB-driven schedule management - src/utils/payload-storage.ts - Payload save/load utilities - src/routes/payloads.ts - Payload API endpoints - src/services/http-fingerprint.ts - Browser fingerprint generation - docs/TASK_WORKFLOW_2024-12-10.md - Complete workflow documentation Migrations: - 078: Proxy consecutive 403 tracking - 079: task_schedules table - 080: raw_crawl_payloads table - 081: payload column and last_fetch_at 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,49 +1,53 @@
|
||||
/**
|
||||
* Crawl Rotator - Proxy & User Agent Rotation for Crawlers
|
||||
*
|
||||
* Manages rotation of proxies and user agents to avoid blocks.
|
||||
* Used by platform-specific crawlers (Dutchie, Jane, etc.)
|
||||
* Updated: 2025-12-10 per workflow-12102025.md
|
||||
*
|
||||
* KEY BEHAVIORS (per workflow-12102025.md):
|
||||
* 1. Task determines WHAT work to do, proxy determines SESSION IDENTITY
|
||||
* 2. Proxy location (timezone) sets Accept-Language headers (always English)
|
||||
* 3. On 403: immediately get new IP, new fingerprint, retry
|
||||
* 4. After 3 consecutive 403s on same proxy with different fingerprints → disable proxy
|
||||
*
|
||||
* USER-AGENT GENERATION (per workflow-12102025.md):
|
||||
* - Device distribution: Mobile 62%, Desktop 36%, Tablet 2%
|
||||
* - Browser whitelist: Chrome, Safari, Edge, Firefox only
|
||||
* - UA sticks until IP rotates
|
||||
* - Failure = alert admin + stop crawl (no fallback)
|
||||
*
|
||||
* Uses intoli/user-agents for realistic UA generation with daily-updated data.
|
||||
*
|
||||
* Canonical location: src/services/crawl-rotator.ts
|
||||
*/
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import UserAgent from 'user-agents';
|
||||
import {
|
||||
HTTPFingerprint,
|
||||
generateHTTPFingerprint,
|
||||
BrowserType,
|
||||
} from './http-fingerprint';
|
||||
|
||||
// ============================================================
|
||||
// USER AGENT CONFIGURATION
|
||||
// UA CONSTANTS (per workflow-12102025.md)
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* Modern browser user agents (Chrome, Firefox, Safari, Edge on various platforms)
|
||||
* Updated: 2024
|
||||
* Per workflow-12102025.md: Device category distribution (hardcoded)
|
||||
* Mobile: 62%, Desktop: 36%, Tablet: 2%
|
||||
*/
|
||||
export const USER_AGENTS = [
|
||||
// Chrome on Windows
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
|
||||
const DEVICE_WEIGHTS = {
|
||||
mobile: 62,
|
||||
desktop: 36,
|
||||
tablet: 2,
|
||||
} as const;
|
||||
|
||||
// Chrome on macOS
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
|
||||
|
||||
// Firefox on Windows
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0',
|
||||
|
||||
// Firefox on macOS
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0',
|
||||
|
||||
// Safari on macOS
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
|
||||
|
||||
// Edge on Windows
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
|
||||
|
||||
// Chrome on Linux
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
];
|
||||
/**
|
||||
* Per workflow-12102025.md: Browser whitelist
|
||||
* Only Chrome (67%), Safari (20%), Edge (6%), Firefox (3%)
|
||||
* Samsung Internet, Opera, and other niche browsers are filtered out
|
||||
*/
|
||||
const ALLOWED_BROWSERS = ['Chrome', 'Safari', 'Edge', 'Firefox'] as const;
|
||||
|
||||
// ============================================================
|
||||
// PROXY TYPES
|
||||
@@ -61,8 +65,13 @@ export interface Proxy {
|
||||
failureCount: number;
|
||||
successCount: number;
|
||||
avgResponseTimeMs: number | null;
|
||||
maxConnections: number; // Number of concurrent connections allowed (for rotating proxies)
|
||||
// Location info (if known)
|
||||
maxConnections: number;
|
||||
/**
|
||||
* Per workflow-12102025.md: Track consecutive 403s with different fingerprints.
|
||||
* After 3 consecutive 403s → disable proxy (it's burned).
|
||||
*/
|
||||
consecutive403Count: number;
|
||||
// Location info - determines session headers per workflow-12102025.md
|
||||
city?: string;
|
||||
state?: string;
|
||||
country?: string;
|
||||
@@ -77,6 +86,40 @@ export interface ProxyStats {
|
||||
avgSuccessRate: number;
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// FINGERPRINT TYPE
|
||||
// Per workflow-12102025.md: Full browser fingerprint from user-agents
|
||||
// ============================================================
|
||||
|
||||
/**
 * Complete browser identity used for one crawl session.
 *
 * Combines the device data produced by the user-agents generator with the
 * request headers derived from it, so every request in a session presents
 * a consistent browser.
 */
export interface BrowserFingerprint {
  /** Full User-Agent header value. */
  userAgent: string;
  /** Platform string reported by the generated user agent. */
  platform: string;
  /** Screen width reported by the generated user agent. */
  screenWidth: number;
  /** Screen height reported by the generated user agent. */
  screenHeight: number;
  /** Viewport width reported by the generated user agent. */
  viewportWidth: number;
  /** Viewport height reported by the generated user agent. */
  viewportHeight: number;
  /** Device category of the generated user agent (mobile / desktop / tablet). */
  deviceCategory: string;
  /** Browser family name extracted from the UA string; kept for session logging. */
  browserName: string;

  // ---- Headers derived from the user agent (anti-detect) ----

  /** Accept-Language header value. */
  acceptLanguage: string;
  /** sec-ch-ua header value; only present for Chrome/Edge user agents. */
  secChUa?: string;
  /** sec-ch-ua-platform header value; only present for Chrome/Edge user agents. */
  secChUaPlatform?: string;
  /** sec-ch-ua-mobile header value; only present for Chrome/Edge user agents. */
  secChUaMobile?: string;

  /** HTTP-level fingerprint generated for the same browser family. */
  httpFingerprint: HTTPFingerprint;
}
|
||||
|
||||
/**
 * Snapshot of the identity a session started with.
 *
 * Recorded each time a fingerprint is rotated so that a blocked session can
 * be traced back to the device, browser, UA string and proxy it used.
 */
export interface UASessionLog {
  /** Device category rolled for the session (mobile / desktop / tablet). */
  deviceCategory: string;
  /** Browser family name extracted from the UA string. */
  browserName: string;
  /** Full User-Agent header value used by the session. */
  userAgent: string;
  /** Proxy address the session was bound to, or null when none was supplied. */
  proxyIp: string | null;
  /** Moment the fingerprint was generated. */
  sessionStartedAt: Date;
}
|
||||
|
||||
// ============================================================
|
||||
// PROXY ROTATOR CLASS
|
||||
// ============================================================
|
||||
@@ -91,9 +134,6 @@ export class ProxyRotator {
|
||||
this.pool = pool || null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize with database pool
|
||||
*/
|
||||
setPool(pool: Pool): void {
|
||||
this.pool = pool;
|
||||
}
|
||||
@@ -122,6 +162,7 @@ export class ProxyRotator {
|
||||
0 as "successCount",
|
||||
response_time_ms as "avgResponseTimeMs",
|
||||
COALESCE(max_connections, 1) as "maxConnections",
|
||||
COALESCE(consecutive_403_count, 0) as "consecutive403Count",
|
||||
city,
|
||||
state,
|
||||
country,
|
||||
@@ -134,11 +175,9 @@ export class ProxyRotator {
|
||||
|
||||
this.proxies = result.rows;
|
||||
|
||||
// Calculate total concurrent capacity
|
||||
const totalCapacity = this.proxies.reduce((sum, p) => sum + p.maxConnections, 0);
|
||||
console.log(`[ProxyRotator] Loaded ${this.proxies.length} active proxies (${totalCapacity} max concurrent connections)`);
|
||||
} catch (error) {
|
||||
// Table might not exist - that's okay
|
||||
console.warn(`[ProxyRotator] Could not load proxies: ${error}`);
|
||||
this.proxies = [];
|
||||
}
|
||||
@@ -150,7 +189,6 @@ export class ProxyRotator {
|
||||
getNext(): Proxy | null {
|
||||
if (this.proxies.length === 0) return null;
|
||||
|
||||
// Round-robin rotation
|
||||
this.currentIndex = (this.currentIndex + 1) % this.proxies.length;
|
||||
this.lastRotation = new Date();
|
||||
|
||||
@@ -185,23 +223,68 @@ export class ProxyRotator {
|
||||
}
|
||||
|
||||
/**
|
||||
* Mark proxy as failed (temporarily remove from rotation)
|
||||
* Mark proxy as blocked (403 received)
|
||||
* Per workflow-12102025.md:
|
||||
* - Increment consecutive_403_count
|
||||
* - After 3 consecutive 403s with different fingerprints → disable proxy
|
||||
* - This is separate from general failures (timeouts, etc.)
|
||||
*/
|
||||
async markFailed(proxyId: number, error?: string): Promise<void> {
|
||||
// Update in-memory
|
||||
async markBlocked(proxyId: number): Promise<boolean> {
|
||||
const proxy = this.proxies.find(p => p.id === proxyId);
|
||||
if (proxy) {
|
||||
proxy.failureCount++;
|
||||
let shouldDisable = false;
|
||||
|
||||
// Deactivate if too many failures
|
||||
if (proxy.failureCount >= 5) {
|
||||
if (proxy) {
|
||||
proxy.consecutive403Count++;
|
||||
|
||||
// Per workflow-12102025.md: 3 consecutive 403s → proxy is burned
|
||||
if (proxy.consecutive403Count >= 3) {
|
||||
proxy.isActive = false;
|
||||
this.proxies = this.proxies.filter(p => p.id !== proxyId);
|
||||
console.log(`[ProxyRotator] Proxy ${proxyId} deactivated after ${proxy.failureCount} failures`);
|
||||
console.log(`[ProxyRotator] Proxy ${proxyId} DISABLED after ${proxy.consecutive403Count} consecutive 403s (burned)`);
|
||||
shouldDisable = true;
|
||||
} else {
|
||||
console.log(`[ProxyRotator] Proxy ${proxyId} blocked (403 #${proxy.consecutive403Count}/3)`);
|
||||
}
|
||||
}
|
||||
|
||||
// Update database
|
||||
if (this.pool) {
|
||||
try {
|
||||
await this.pool.query(`
|
||||
UPDATE proxies
|
||||
SET
|
||||
consecutive_403_count = COALESCE(consecutive_403_count, 0) + 1,
|
||||
last_failure_at = NOW(),
|
||||
test_result = '403 Forbidden',
|
||||
active = CASE WHEN COALESCE(consecutive_403_count, 0) >= 2 THEN false ELSE active END,
|
||||
updated_at = NOW()
|
||||
WHERE id = $1
|
||||
`, [proxyId]);
|
||||
} catch (err) {
|
||||
console.error(`[ProxyRotator] Failed to update proxy ${proxyId}:`, err);
|
||||
}
|
||||
}
|
||||
|
||||
return shouldDisable;
|
||||
}
|
||||
|
||||
/**
|
||||
* Mark proxy as failed (general error - timeout, connection error, etc.)
|
||||
* Separate from 403 blocking per workflow-12102025.md
|
||||
*/
|
||||
async markFailed(proxyId: number, error?: string): Promise<void> {
|
||||
const proxy = this.proxies.find(p => p.id === proxyId);
|
||||
if (proxy) {
|
||||
proxy.failureCount++;
|
||||
|
||||
// Deactivate if too many general failures
|
||||
if (proxy.failureCount >= 5) {
|
||||
proxy.isActive = false;
|
||||
this.proxies = this.proxies.filter(p => p.id !== proxyId);
|
||||
console.log(`[ProxyRotator] Proxy ${proxyId} deactivated after ${proxy.failureCount} general failures`);
|
||||
}
|
||||
}
|
||||
|
||||
if (this.pool) {
|
||||
try {
|
||||
await this.pool.query(`
|
||||
@@ -220,23 +303,22 @@ export class ProxyRotator {
|
||||
}
|
||||
|
||||
/**
|
||||
* Mark proxy as successful
|
||||
* Mark proxy as successful - resets consecutive 403 count
|
||||
* Per workflow-12102025.md: successful request clears the 403 counter
|
||||
*/
|
||||
async markSuccess(proxyId: number, responseTimeMs?: number): Promise<void> {
|
||||
// Update in-memory
|
||||
const proxy = this.proxies.find(p => p.id === proxyId);
|
||||
if (proxy) {
|
||||
proxy.successCount++;
|
||||
proxy.consecutive403Count = 0; // Reset on success per workflow-12102025.md
|
||||
proxy.lastUsedAt = new Date();
|
||||
if (responseTimeMs !== undefined) {
|
||||
// Rolling average
|
||||
proxy.avgResponseTimeMs = proxy.avgResponseTimeMs
|
||||
? (proxy.avgResponseTimeMs * 0.8) + (responseTimeMs * 0.2)
|
||||
: responseTimeMs;
|
||||
}
|
||||
}
|
||||
|
||||
// Update database
|
||||
if (this.pool) {
|
||||
try {
|
||||
await this.pool.query(`
|
||||
@@ -244,6 +326,7 @@ export class ProxyRotator {
|
||||
SET
|
||||
last_tested_at = NOW(),
|
||||
test_result = 'success',
|
||||
consecutive_403_count = 0,
|
||||
response_time_ms = CASE
|
||||
WHEN response_time_ms IS NULL THEN $2
|
||||
ELSE (response_time_ms * 0.8 + $2 * 0.2)::integer
|
||||
@@ -272,8 +355,8 @@ export class ProxyRotator {
|
||||
*/
|
||||
getStats(): ProxyStats {
|
||||
const totalProxies = this.proxies.length;
|
||||
const activeProxies = this.proxies.reduce((sum, p) => sum + p.maxConnections, 0); // Total concurrent capacity
|
||||
const blockedProxies = this.proxies.filter(p => p.failureCount >= 5).length;
|
||||
const activeProxies = this.proxies.reduce((sum, p) => sum + p.maxConnections, 0);
|
||||
const blockedProxies = this.proxies.filter(p => p.failureCount >= 5 || p.consecutive403Count >= 3).length;
|
||||
|
||||
const successRates = this.proxies
|
||||
.filter(p => p.successCount + p.failureCount > 0)
|
||||
@@ -285,15 +368,12 @@ export class ProxyRotator {
|
||||
|
||||
return {
|
||||
totalProxies,
|
||||
activeProxies, // Total concurrent capacity across all proxies
|
||||
activeProxies,
|
||||
blockedProxies,
|
||||
avgSuccessRate,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if proxy pool has available proxies
|
||||
*/
|
||||
hasAvailableProxies(): boolean {
|
||||
return this.proxies.length > 0;
|
||||
}
|
||||
@@ -301,53 +381,194 @@ export class ProxyRotator {
|
||||
|
||||
// ============================================================
|
||||
// USER AGENT ROTATOR CLASS
|
||||
// Per workflow-12102025.md: Uses intoli/user-agents for realistic fingerprints
|
||||
// ============================================================
|
||||
|
||||
export class UserAgentRotator {
|
||||
private userAgents: string[];
|
||||
private currentIndex: number = 0;
|
||||
private lastRotation: Date = new Date();
|
||||
private currentFingerprint: BrowserFingerprint | null = null;
|
||||
private sessionLog: UASessionLog | null = null;
|
||||
|
||||
constructor(userAgents: string[] = USER_AGENTS) {
|
||||
this.userAgents = userAgents;
|
||||
// Start at random index to avoid patterns
|
||||
this.currentIndex = Math.floor(Math.random() * userAgents.length);
|
||||
constructor() {
|
||||
// Per workflow-12102025.md: Initialize with first fingerprint
|
||||
this.rotate();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get next user agent in rotation
|
||||
* Per workflow-12102025.md: Roll device category based on distribution
|
||||
* Mobile: 62%, Desktop: 36%, Tablet: 2%
|
||||
*/
|
||||
getNext(): string {
|
||||
this.currentIndex = (this.currentIndex + 1) % this.userAgents.length;
|
||||
this.lastRotation = new Date();
|
||||
return this.userAgents[this.currentIndex];
|
||||
/**
 * Pick a device category at random, weighted by DEVICE_WEIGHTS.
 *
 * A uniform draw in [0, 100) is compared against cumulative thresholds:
 * the first band is mobile, the next is desktop, the remainder is tablet.
 *
 * @returns The selected device category.
 */
private rollDeviceCategory(): 'mobile' | 'desktop' | 'tablet' {
  const draw = Math.random() * 100;
  const mobileCeiling = DEVICE_WEIGHTS.mobile;
  const desktopCeiling = DEVICE_WEIGHTS.mobile + DEVICE_WEIGHTS.desktop;

  if (draw < mobileCeiling) {
    return 'mobile';
  }
  if (draw < desktopCeiling) {
    return 'desktop';
  }
  return 'tablet';
}
|
||||
|
||||
/**
|
||||
* Get current user agent without rotating
|
||||
* Per workflow-12102025.md: Extract browser name from UA string
|
||||
*/
|
||||
getCurrent(): string {
|
||||
return this.userAgents[this.currentIndex];
|
||||
/**
 * Classify a User-Agent string into a browser family name.
 *
 * Chromium-based derivatives (Samsung Internet, Opera, UC Browser, ...)
 * advertise `Chrome/` and `Safari/` tokens alongside their own product
 * token. They are detected first and reported as 'Unknown' so the browser
 * whitelist rejects them instead of mistaking them for Chrome.
 *
 * @param userAgent Raw User-Agent header value.
 * @returns 'Edge', 'Firefox', 'Safari', 'Chrome', or 'Unknown' when the UA
 *          belongs to none of those families.
 */
private extractBrowserName(userAgent: string): string {
  // Product tokens of browsers that embed Chrome/ in their UA but are not Chrome.
  const derivativeTokens = [
    'SamsungBrowser/',
    'OPR/',
    'Opera',
    'UCBrowser/',
    'YaBrowser/',
    'Vivaldi/',
    'MiuiBrowser/',
    'Silk/',
  ];
  if (derivativeTokens.some((token) => userAgent.includes(token))) {
    return 'Unknown';
  }

  // Edge must be tested before Chrome: Edge UAs also contain Chrome/ and Safari/.
  if (userAgent.includes('Edg/')) return 'Edge';
  if (userAgent.includes('Firefox/')) return 'Firefox';
  // Chrome UAs end with Safari/ too, so Safari requires the absence of Chrome/.
  if (userAgent.includes('Safari/') && !userAgent.includes('Chrome/')) return 'Safari';
  if (userAgent.includes('Chrome/')) return 'Chrome';
  return 'Unknown';
}
|
||||
|
||||
/**
|
||||
* Get a random user agent
|
||||
* Per workflow-12102025.md: Check if browser is in whitelist
|
||||
*/
|
||||
getRandom(): string {
|
||||
const index = Math.floor(Math.random() * this.userAgents.length);
|
||||
return this.userAgents[index];
|
||||
/**
 * Tell whether a User-Agent string belongs to a whitelisted browser family.
 *
 * @param userAgent Raw User-Agent header value.
 * @returns true when the extracted browser name is listed in ALLOWED_BROWSERS.
 */
private isAllowedBrowser(userAgent: string): boolean {
  const detected = this.extractBrowserName(userAgent);
  return ALLOWED_BROWSERS.some((allowed) => allowed === detected);
}
|
||||
|
||||
/**
 * Generate a new random fingerprint and make it the current one.
 *
 * Steps:
 *  - roll a device category,
 *  - draw user agents for that category until one belongs to a whitelisted
 *    browser (at most 50 draws),
 *  - derive the sec-ch-ua headers and the HTTP fingerprint,
 *  - store the fingerprint and a session log entry.
 *
 * @param proxyIp Optional proxy address recorded in the session log.
 * @returns The newly generated fingerprint.
 * @throws Error when no whitelisted browser is produced within the draw
 *         limit; there is deliberately no fallback.
 */
rotate(proxyIp?: string): BrowserFingerprint {
  const maxAttempts = 50;
  const deviceCategory = this.rollDeviceCategory();

  // Generator restricted to the rolled device category.
  const generator = new UserAgent({ deviceCategory });

  // First draw always happens; keep redrawing while the browser is not allowed.
  let ua = generator();
  for (
    let attempts = 1;
    !this.isAllowedBrowser(ua.data.userAgent) && attempts < maxAttempts;
    attempts++
  ) {
    ua = generator();
  }

  // Still not an allowed browser after the last draw: hard failure.
  if (!this.isAllowedBrowser(ua.data.userAgent)) {
    const errorMsg = `[UserAgentRotator] CRITICAL: Failed to generate allowed browser after ${maxAttempts} attempts. Device: ${deviceCategory}. Last UA: ${ua.data.userAgent}`;
    console.error(errorMsg);
    // TODO: Post alert to admin dashboard
    throw new Error(errorMsg);
  }

  const generated = ua.data;
  const browserName = this.extractBrowserName(generated.userAgent);
  const clientHints = this.buildSecChUa(generated.userAgent, deviceCategory);
  const httpFingerprint = generateHTTPFingerprint(browserName as BrowserType);

  const fingerprint: BrowserFingerprint = {
    userAgent: generated.userAgent,
    platform: generated.platform,
    screenWidth: generated.screenWidth,
    screenHeight: generated.screenHeight,
    viewportWidth: generated.viewportWidth,
    viewportHeight: generated.viewportHeight,
    deviceCategory: generated.deviceCategory,
    browserName,
    // Language is always English regardless of proxy location.
    acceptLanguage: 'en-US,en;q=0.9',
    ...clientHints,
    httpFingerprint,
  };
  this.currentFingerprint = fingerprint;

  // Remember what this session started with, for debugging blocks later.
  this.sessionLog = {
    deviceCategory,
    browserName,
    userAgent: generated.userAgent,
    proxyIp: proxyIp || null,
    sessionStartedAt: new Date(),
  };

  console.log(`[UserAgentRotator] New fingerprint: device=${deviceCategory}, browser=${browserName}, UA=${generated.userAgent.slice(0, 50)}...`);
  return fingerprint;
}
|
||||
|
||||
/**
 * Return the active fingerprint without rotating.
 *
 * If no fingerprint has been generated yet, one is created via rotate()
 * and returned.
 *
 * @returns The current (or freshly generated) fingerprint.
 */
getCurrent(): BrowserFingerprint {
  return this.currentFingerprint ?? this.rotate();
}
|
||||
|
||||
/**
|
||||
* Get a random fingerprint (rotates and returns)
|
||||
*/
|
||||
getRandom(proxyIp?: string): BrowserFingerprint {
|
||||
return this.rotate(proxyIp);
|
||||
}
|
||||
|
||||
/**
|
||||
* Per workflow-12102025.md: Get session log for debugging
|
||||
*/
|
||||
getSessionLog(): UASessionLog | null {
|
||||
return this.sessionLog;
|
||||
}
|
||||
|
||||
/**
 * Derive the sec-ch-ua client-hint headers that match a User-Agent string.
 *
 * Only UAs carrying an `Edg/` or `Chrome/` version token get hints; for any
 * other UA (Firefox, Safari, ...) an empty object is returned because those
 * browsers do not send sec-ch-ua.
 *
 * Platform detection is shared by both brands and ordered so that the more
 * specific marker wins: Android UAs also contain "Linux", and iOS UAs also
 * contain "like Mac OS X".
 *
 * @param userAgent Raw User-Agent header value.
 * @param deviceCategory Device category of the fingerprint; 'mobile' and
 *        'tablet' are both reported as mobile (`?1`).
 * @returns The sec-ch-ua, sec-ch-ua-platform and sec-ch-ua-mobile values,
 *          or `{}` when the UA is not Chrome/Edge.
 */
private buildSecChUa(userAgent: string, deviceCategory: string): { secChUa?: string; secChUaPlatform?: string; secChUaMobile?: string } {
  const edgeMatch = userAgent.match(/Edg\/(\d+)/);
  const chromeMatch = userAgent.match(/Chrome\/(\d+)/);

  // Edge UAs also contain a Chrome/ token, so Edge takes precedence.
  const brand = edgeMatch ? 'Microsoft Edge' : chromeMatch ? 'Google Chrome' : null;
  const version = (edgeMatch ?? chromeMatch)?.[1];
  if (brand === null || version === undefined) {
    // Firefox/Safari don't send sec-ch-ua
    return {};
  }

  // Default to Linux, then refine from most to least specific marker.
  let platform = '"Linux"';
  if (userAgent.includes('Windows')) platform = '"Windows"';
  else if (userAgent.includes('Android')) platform = '"Android"';
  else if (/iPhone|iPad|iPod/.test(userAgent)) platform = '"iOS"';
  else if (userAgent.includes('CrOS')) platform = '"Chrome OS"';
  else if (userAgent.includes('Mac')) platform = '"macOS"';

  const isMobile = deviceCategory === 'mobile' || deviceCategory === 'tablet';

  return {
    secChUa: `"${brand}";v="${version}", "Chromium";v="${version}", "Not_A Brand";v="24"`,
    secChUaPlatform: platform,
    secChUaMobile: isMobile ? '?1' : '?0',
  };
}
|
||||
|
||||
getCount(): number {
|
||||
return this.userAgents.length;
|
||||
return 1; // user-agents generates dynamically
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// COMBINED ROTATOR (for convenience)
|
||||
// COMBINED ROTATOR
|
||||
// Per workflow-12102025.md: Coordinates proxy + fingerprint rotation
|
||||
// ============================================================
|
||||
|
||||
export class CrawlRotator {
|
||||
@@ -359,49 +580,51 @@ export class CrawlRotator {
|
||||
this.userAgent = new UserAgentRotator();
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize rotator (load proxies from DB)
|
||||
*/
|
||||
async initialize(): Promise<void> {
|
||||
await this.proxy.loadProxies();
|
||||
}
|
||||
|
||||
/**
|
||||
* Rotate proxy only
|
||||
* Rotate proxy only (get new IP)
|
||||
*/
|
||||
rotateProxy(): Proxy | null {
|
||||
return this.proxy.getNext();
|
||||
}
|
||||
|
||||
/**
|
||||
* Rotate user agent only
|
||||
* Rotate fingerprint only (new UA, screen size, etc.)
|
||||
*/
|
||||
rotateUserAgent(): string {
|
||||
return this.userAgent.getNext();
|
||||
rotateFingerprint(): BrowserFingerprint {
|
||||
return this.userAgent.rotate();
|
||||
}
|
||||
|
||||
/**
|
||||
* Rotate both proxy and user agent
|
||||
* Rotate both proxy and fingerprint
|
||||
* Per workflow-12102025.md: called on 403 for fresh identity
|
||||
* Passes proxy IP to UA rotation for session logging
|
||||
*/
|
||||
rotateBoth(): { proxy: Proxy | null; userAgent: string } {
|
||||
rotateBoth(): { proxy: Proxy | null; fingerprint: BrowserFingerprint } {
|
||||
const proxy = this.proxy.getNext();
|
||||
const proxyIp = proxy ? proxy.host : undefined;
|
||||
return {
|
||||
proxy: this.proxy.getNext(),
|
||||
userAgent: this.userAgent.getNext(),
|
||||
proxy,
|
||||
fingerprint: this.userAgent.rotate(proxyIp),
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Get current proxy and user agent without rotating
|
||||
* Get current proxy and fingerprint without rotating
|
||||
*/
|
||||
getCurrent(): { proxy: Proxy | null; userAgent: string } {
|
||||
getCurrent(): { proxy: Proxy | null; fingerprint: BrowserFingerprint } {
|
||||
return {
|
||||
proxy: this.proxy.getCurrent(),
|
||||
userAgent: this.userAgent.getCurrent(),
|
||||
fingerprint: this.userAgent.getCurrent(),
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Record success for current proxy
|
||||
* Per workflow-12102025.md: resets consecutive 403 count
|
||||
*/
|
||||
async recordSuccess(responseTimeMs?: number): Promise<void> {
|
||||
const current = this.proxy.getCurrent();
|
||||
@@ -411,7 +634,20 @@ export class CrawlRotator {
|
||||
}
|
||||
|
||||
/**
|
||||
* Record failure for current proxy
|
||||
* Record 403 block for current proxy
|
||||
* Per workflow-12102025.md: increments consecutive_403_count, disables after 3
|
||||
* Returns true if proxy was disabled
|
||||
*/
|
||||
async recordBlock(): Promise<boolean> {
|
||||
const current = this.proxy.getCurrent();
|
||||
if (current) {
|
||||
return await this.proxy.markBlocked(current.id);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Record general failure (not 403)
|
||||
*/
|
||||
async recordFailure(error?: string): Promise<void> {
|
||||
const current = this.proxy.getCurrent();
|
||||
@@ -421,14 +657,13 @@ export class CrawlRotator {
|
||||
}
|
||||
|
||||
/**
|
||||
* Get current proxy location info (for reporting)
|
||||
* Note: For rotating proxies (like IPRoyal), the actual exit location varies per request
|
||||
* Get current proxy location info
|
||||
* Per workflow-12102025.md: proxy location determines session headers
|
||||
*/
|
||||
getProxyLocation(): { city?: string; state?: string; country?: string; timezone?: string; isRotating: boolean } | null {
|
||||
const current = this.proxy.getCurrent();
|
||||
if (!current) return null;
|
||||
|
||||
// Check if this is a rotating proxy (max_connections > 1 usually indicates rotating)
|
||||
const isRotating = current.maxConnections > 1;
|
||||
|
||||
return {
|
||||
@@ -439,6 +674,15 @@ export class CrawlRotator {
|
||||
isRotating
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Get timezone from current proxy
|
||||
* Per workflow-12102025.md: used for Accept-Language header
|
||||
*/
|
||||
getProxyTimezone(): string | undefined {
|
||||
const current = this.proxy.getCurrent();
|
||||
return current?.timezone;
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
|
||||
Reference in New Issue
Block a user