feat: Add identity pool for diverse IP/fingerprint rotation
- Add worker_identities table and metro_areas for city groupings
- Create IdentityPoolService for claiming/releasing identities
- Each identity is used for 3-5 tasks, then gets a 2-3 hour cooldown
- Integrate with task-worker via USE_IDENTITY_POOL feature flag
- Update puppeteer-preflight to accept custom proxy URLs

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
493
backend/src/services/identity-pool.ts
Normal file
493
backend/src/services/identity-pool.ts
Normal file
@@ -0,0 +1,493 @@
|
||||
/**
|
||||
* Identity Pool Service
|
||||
*
|
||||
* Manages IP/fingerprint identities for diverse worker rotation.
|
||||
* Each identity is used for 3-5 tasks, then cools down 2-3 hours.
|
||||
*
|
||||
* Flow:
|
||||
* 1. Worker queries pending tasks by geo
|
||||
* 2. Claims identity for target city/state (or creates new)
|
||||
* 3. Runs preflight with identity
|
||||
* 4. Completes 3-5 tasks
|
||||
* 5. Releases identity (goes on cooldown)
|
||||
* 6. Immediately claims new identity, repeats
|
||||
*/
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { buildEvomiProxyUrl, getEvomiConfig } from './crawl-rotator';
|
||||
|
||||
// ============================================================
|
||||
// TYPES
|
||||
// ============================================================
|
||||
|
||||
/**
 * A pooled identity as stored in the `worker_identities` table:
 * one proxy session plus the browser fingerprint that goes with it.
 */
export interface WorkerIdentity {
  /** Primary key of the `worker_identities` row. */
  id: number;
  /** Session identifier passed to the Evomi proxy URL builder. */
  session_id: string;
  /** Exit IP seen through the proxy; null when the lookup did not succeed. */
  ip_address: string | null;
  /** State code the identity is geo-targeted to (e.g. 'AZ'). */
  state_code: string;
  /** Optional city the identity is geo-targeted to. */
  city: string | null;
  /** Browser/device fingerprint generated for this identity. */
  fingerprint: IdentityFingerprint;

  created_at: Date;
  last_used_at: Date | null;
  /** Presumably the moment the identity becomes claimable again - confirm against the SQL functions. */
  cooldown_until: Date | null;

  total_tasks_completed: number;
  total_sessions: number;

  /** True while a worker holds the identity. */
  is_active: boolean;
  /** Id of the worker currently holding the identity, if any. */
  active_worker_id: string | null;
  /** Set to false when the identity is marked unhealthy (e.g. IP got blocked). */
  is_healthy: boolean;
}
|
||||
|
||||
/**
 * Browser/device characteristics presented by an identity.
 */
export interface IdentityFingerprint {
  /** Full User-Agent header value. */
  userAgent: string;

  /** Browser name and version. */
  browser: string;
  browserVersion: string;

  /** Operating system name and version. */
  os: string;
  osVersion: string;

  /** Device class. */
  device: 'desktop' | 'mobile' | 'tablet';

  /** Screen size in pixels. */
  screenWidth: number;
  screenHeight: number;

  /** IANA timezone name, e.g. 'America/Phoenix'. */
  timezone: string;
  /** Locale tag, e.g. 'en-US'. */
  locale: string;

  // Additional anti-detect properties (all optional)
  webglVendor?: string;
  webglRenderer?: string;
  languages?: string[];
}
|
||||
|
||||
/**
 * One row returned by the `get_pending_tasks_by_geo` SQL function:
 * a state/city bucket with its pending-task and identity counts.
 */
export interface PendingTaskGeo {
  state_code: string;
  /** Null when the bucket is not tied to a specific city. */
  city: string | null;
  /** Number of pending tasks in this bucket. */
  pending_count: number;
  /** Number of identities available for this bucket. */
  available_identities: number;
}
|
||||
|
||||
/**
 * One row returned by the `get_tasks_for_identity` SQL function:
 * a task that matches an identity's state/city.
 */
export interface TaskForIdentity {
  task_id: number;
  dispensary_id: number;
  dispensary_name: string;
  dispensary_city: string | null;
  /** Task role name. */
  role: string;
}
|
||||
|
||||
// ============================================================
|
||||
// FINGERPRINT GENERATION
|
||||
// Following market share distributions from crawl-rotator.ts
|
||||
// ============================================================
|
||||
|
||||
// Relative device-type weights (must match crawl-rotator.ts).
const DEVICE_WEIGHTS = { mobile: 62, desktop: 36, tablet: 2 };

// Relative browser weights (must match crawl-rotator.ts).
const BROWSER_WEIGHTS = { chrome: 67, safari: 20, edge: 6, firefox: 3 };

// Common screen resolutions, keyed by device type.
const SCREEN_RESOLUTIONS = {
  desktop: [
    { width: 1920, height: 1080 },
    { width: 1366, height: 768 },
    { width: 1536, height: 864 },
    { width: 1440, height: 900 },
    { width: 1280, height: 720 },
    { width: 2560, height: 1440 },
  ],
  mobile: [
    { width: 390, height: 844 }, // iPhone 12/13/14
    { width: 393, height: 873 }, // iPhone 14 Pro
    { width: 430, height: 932 }, // iPhone 14 Pro Max
    { width: 360, height: 800 }, // Android common
    { width: 412, height: 915 }, // Pixel 6
    { width: 384, height: 854 }, // Android common
  ],
  tablet: [
    { width: 768, height: 1024 }, // iPad
    { width: 810, height: 1080 }, // iPad 10th gen
    { width: 820, height: 1180 }, // iPad Air
    { width: 800, height: 1280 }, // Android tablet
  ],
};

// Candidate OS versions, keyed by operating system.
const OS_VERSIONS = {
  windows: ['10', '11'],
  macos: ['13.0', '13.5', '14.0', '14.1', '14.2'],
  ios: ['17.0', '17.1', '17.2', '17.3', '17.4', '18.0', '18.1'],
  android: ['13', '14'],
};

// Candidate browser versions, keyed by browser.
const BROWSER_VERSIONS = {
  chrome: ['120', '121', '122', '123', '124', '125'],
  safari: ['17.0', '17.1', '17.2', '17.3', '17.4'],
  firefox: ['121', '122', '123', '124', '125'],
  edge: ['120', '121', '122', '123', '124'],
};

// IANA timezones per state code. States spanning several zones list each one.
const STATE_TIMEZONES: Record<string, string[]> = {
  AZ: ['America/Phoenix'], // Arizona doesn't do DST
  CA: ['America/Los_Angeles'],
  CO: ['America/Denver'],
  FL: ['America/New_York', 'America/Chicago'], // FL spans two zones
  IL: ['America/Chicago'],
  MA: ['America/New_York'],
  MI: ['America/Detroit', 'America/New_York'],
  NV: ['America/Los_Angeles'],
  NY: ['America/New_York'],
  OH: ['America/New_York'],
  OR: ['America/Los_Angeles'],
  PA: ['America/New_York'],
  WA: ['America/Los_Angeles'],
  // Add more as needed
};
|
||||
|
||||
/**
 * Pick one key at random, with probability proportional to its weight.
 *
 * @param weights - Map of key to non-negative weight. Weights need not sum to 100.
 * @returns One of the keys of `weights`; keys with weight 0 are never returned.
 * @throws RangeError if `weights` is empty or its total weight is not a positive number.
 */
function weightedRandom<T extends string>(weights: Record<T, number>): T {
  const entries = Object.entries(weights) as [T, number][];
  const total = entries.reduce((sum, [, w]) => sum + w, 0);

  // Fail loudly instead of crashing on entries[0] or silently returning a bogus key.
  if (entries.length === 0 || !(total > 0)) {
    throw new RangeError('weightedRandom: weights must contain at least one positive weight');
  }

  // `remaining` starts in [0, total); the selected key is the one whose
  // cumulative weight first exceeds it.
  let remaining = Math.random() * total;
  for (const [key, weight] of entries) {
    remaining -= weight;
    // Strict comparison so a zero-weight key cannot win when remaining is exactly 0.
    if (remaining < 0) return key;
  }

  // Only reachable through floating-point rounding at the very top of the
  // range, which belongs to the last entry.
  return entries[entries.length - 1][0];
}
|
||||
|
||||
/**
 * Pick a uniformly random element from an array.
 *
 * @param arr - Non-empty list of candidates (readonly arrays are accepted).
 * @returns One element of `arr`.
 * @throws RangeError if `arr` is empty, rather than returning `undefined` disguised as `T`.
 */
function randomFrom<T>(arr: readonly T[]): T {
  if (arr.length === 0) {
    throw new RangeError('randomFrom: cannot pick from an empty array');
  }
  return arr[Math.floor(Math.random() * arr.length)];
}
|
||||
|
||||
/**
 * Choose an operating system (and version) that is plausible for the given
 * device type and browser.
 */
function pickOperatingSystem(
  device: 'desktop' | 'mobile' | 'tablet',
  browser: 'chrome' | 'safari' | 'edge' | 'firefox'
): { os: string; osVersion: string } {
  let os: 'Windows' | 'macOS' | 'iOS' | 'Android';

  if (device === 'desktop') {
    if (browser === 'safari') {
      os = 'macOS';
    } else if (browser === 'edge') {
      os = 'Windows';
    } else {
      // Chrome/Firefox - mix of Windows and macOS
      os = Math.random() < 0.75 ? 'Windows' : 'macOS';
    }
  } else if (browser === 'safari') {
    // Safari only ships on Apple platforms, so phones AND tablets must be iOS.
    os = 'iOS';
  } else if (device === 'mobile') {
    // Chrome/Edge/Firefox on mobile - mostly Android
    os = Math.random() < 0.7 ? 'Android' : 'iOS';
  } else {
    // Non-Safari tablet
    os = Math.random() < 0.6 ? 'iOS' : 'Android';
  }

  switch (os) {
    case 'Windows':
      return { os, osVersion: randomFrom(OS_VERSIONS.windows) };
    case 'macOS':
      return { os, osVersion: randomFrom(OS_VERSIONS.macos) };
    case 'iOS':
      return { os, osVersion: randomFrom(OS_VERSIONS.ios) };
    default:
      return { os, osVersion: randomFrom(OS_VERSIONS.android) };
  }
}

/**
 * Generate a diverse fingerprint for a given state.
 *
 * Device type and browser are drawn from the weighted tables, the OS is chosen
 * to be consistent with both, and the timezone is drawn from the state's
 * timezone list.
 *
 * @param stateCode - Two-letter state code (case-insensitive). States missing
 *   from the timezone table fall back to 'America/New_York'.
 * @returns A fingerprint with locale fixed to 'en-US'.
 */
export function generateFingerprint(stateCode: string): IdentityFingerprint {
  // Device type and browser follow the market-share weights.
  const device: IdentityFingerprint['device'] = weightedRandom(DEVICE_WEIGHTS);
  const browser: 'chrome' | 'safari' | 'edge' | 'firefox' = weightedRandom(BROWSER_WEIGHTS);
  const browserVersion = randomFrom(BROWSER_VERSIONS[browser]);

  const { os, osVersion } = pickOperatingSystem(device, browser);

  const resolution = randomFrom(SCREEN_RESOLUTIONS[device]);

  // The timezone table is keyed by upper-case state code.
  const timezones = STATE_TIMEZONES[stateCode.toUpperCase()] || ['America/New_York'];
  const timezone = randomFrom(timezones);

  const userAgent = buildUserAgent(browser, browserVersion, os, osVersion, device);

  return {
    userAgent,
    // Stored capitalized, e.g. 'Chrome'.
    browser: `${browser.charAt(0).toUpperCase()}${browser.slice(1)}`,
    browserVersion,
    os,
    osVersion,
    device,
    screenWidth: resolution.width,
    screenHeight: resolution.height,
    timezone,
    locale: 'en-US',
    languages: ['en-US', 'en'],
  };
}
|
||||
|
||||
/**
 * Build a realistic User-Agent string for a browser/OS/device combination.
 *
 * @param browser - Lowercase browser key: 'chrome', 'edge', 'safari' or 'firefox'.
 *   Any other value yields only the prefix and platform token.
 * @param version - Browser version: major for Chrome/Edge/Firefox (e.g. '120'),
 *   major.minor for Safari (e.g. '17.2').
 * @param os - 'Windows', 'macOS', 'iOS' or 'Android'. Any other value yields
 *   no platform token.
 * @param osVersion - OS version, e.g. '11', '14.1', '17.4'.
 * @param device - 'desktop', 'mobile' or 'tablet'.
 * @returns The assembled User-Agent header value.
 */
function buildUserAgent(
  browser: string,
  version: string,
  os: string,
  osVersion: string,
  device: string
): string {
  const isFirefox = browser === 'firefox';
  const onIos = os === 'iOS';
  const formFactor = device === 'tablet' ? 'Tablet' : 'Mobile';
  // WebKit/Blink user agents spell OS versions with underscores (all dots replaced).
  const underscoredOs = osVersion.replace(/\./g, '_');

  // Platform token (the parenthesized part)
  let platform = '';
  if (os === 'Windows') {
    // Windows 10 and Windows 11 both report "Windows NT 10.0".
    platform = 'Windows NT 10.0; Win64; x64';
  } else if (os === 'macOS') {
    // Firefox keeps the dotted macOS version; other browsers use underscores.
    platform = `Macintosh; Intel Mac OS X ${isFirefox ? osVersion : underscoredOs}`;
  } else if (onIos) {
    platform = device === 'tablet'
      ? `iPad; CPU OS ${underscoredOs} like Mac OS X`
      : `iPhone; CPU iPhone OS ${underscoredOs} like Mac OS X`;
  } else if (os === 'Android') {
    // Firefox for Android omits the leading "Linux;".
    platform = isFirefox
      ? `Android ${osVersion}; ${formFactor}`
      : `Linux; Android ${osVersion}; ${formFactor}`;
  }

  // Gecko-based Firefox always carries an rv: token inside the platform part.
  if (platform && isFirefox && !onIos) {
    platform += `; rv:${version}.0`;
  }

  let ua = 'Mozilla/5.0 ';
  if (platform) {
    ua += `(${platform}) `;
  }

  // Every browser on iOS runs on WebKit and identifies itself with its own
  // product token (CriOS / EdgiOS / FxiOS) instead of Chrome/ or Firefox/.
  if (onIos && browser !== 'safari') {
    const webkit = 'AppleWebKit/605.1.15 (KHTML, like Gecko) ';
    const tail = 'Mobile/15E148 Safari/604.1';
    if (browser === 'chrome') {
      return `${ua}${webkit}CriOS/${version}.0.0.0 ${tail}`;
    }
    if (browser === 'edge') {
      return `${ua}${webkit}Version/${osVersion} EdgiOS/${version}.0.0.0 ${tail}`;
    }
    if (isFirefox) {
      return `${ua}${webkit}FxiOS/${version}.0 ${tail}`;
    }
    return ua;
  }

  // Browser engine and version
  if (browser === 'chrome' || browser === 'edge') {
    const safariToken = device === 'mobile' ? 'Mobile Safari/537.36' : 'Safari/537.36';
    ua += `AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version}.0.0.0 ${safariToken}`;
    if (browser === 'edge') {
      // Edge appends its own token AFTER the Safari token (EdgA on Android).
      ua += os === 'Android' ? ` EdgA/${version}.0.0.0` : ` Edg/${version}.0.0.0`;
    }
  } else if (browser === 'safari') {
    ua += `AppleWebKit/605.1.15 (KHTML, like Gecko) Version/${version} `;
    ua += device === 'mobile' ? 'Mobile/15E148 Safari/604.1' : 'Safari/605.1.15';
  } else if (isFirefox) {
    // Desktop Firefox freezes the Gecko trail at 20100101; on Android it
    // mirrors the browser version.
    const geckoTrail = os === 'Android' ? `${version}.0` : '20100101';
    ua += `Gecko/${geckoTrail} Firefox/${version}.0`;
  }

  return ua;
}
|
||||
|
||||
// ============================================================
|
||||
// IDENTITY POOL SERVICE
|
||||
// ============================================================
|
||||
|
||||
/**
 * Database-backed pool of worker identities (Evomi proxy session + fingerprint).
 *
 * Claiming, releasing and task lookup are delegated to SQL functions
 * (`claim_identity`, `release_identity`, `get_tasks_for_identity`,
 * `get_pending_tasks_by_geo`); this class wraps them and creates new
 * identities when none can be claimed.
 */
export class IdentityPoolService {
  constructor(private readonly pool: Pool) {}

  /**
   * Convert a display city name into the form passed to the Evomi URL builder:
   * lower-cased, whitespace runs replaced by '.'.
   * Missing or empty names yield `undefined` (no city targeting).
   */
  private static toGeoCity(city?: string | null): string | undefined {
    return city?.toLowerCase().replace(/\s+/g, '.') || undefined;
  }

  /**
   * Get pending tasks grouped by geo.
   *
   * @param limit - Passed to `get_pending_tasks_by_geo` (default 10).
   */
  async getPendingTasksByGeo(limit: number = 10): Promise<PendingTaskGeo[]> {
    const result = await this.pool.query(
      `SELECT * FROM get_pending_tasks_by_geo($1)`,
      [limit]
    );
    return result.rows;
  }

  /**
   * Claim an existing identity for a worker.
   *
   * @returns The claimed identity, or null if none is available
   *   (the caller should then create a new one).
   */
  async claimIdentity(
    workerId: string,
    stateCode: string,
    city?: string
  ): Promise<WorkerIdentity | null> {
    const result = await this.pool.query(
      `SELECT * FROM claim_identity($1, $2, $3)`,
      [workerId, stateCode, city || null]
    );

    // A returned row without an id is treated as "nothing claimed".
    const row = result.rows[0];
    return row?.id ? this.rowToIdentity(row) : null;
  }

  /**
   * Create a new identity backed by an Evomi proxy session.
   *
   * Generates a session id, tries to discover the exit IP through the proxy,
   * generates a fingerprint, and inserts the row already marked active for
   * `workerId`.
   *
   * @returns The new identity, or null when Evomi is not configured or no
   *   proxy URL could be built. Database errors propagate to the caller.
   */
  async createIdentity(
    workerId: string,
    stateCode: string,
    city?: string
  ): Promise<WorkerIdentity | null> {
    const evomiConfig = getEvomiConfig();
    if (!evomiConfig.enabled) {
      console.error('[IdentityPool] Evomi not configured - cannot create identity');
      return null;
    }

    // Unique session id: worker prefix + timestamp + random suffix
    const sessionId = `${workerId.slice(0, 8)}-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;

    const proxyResult = buildEvomiProxyUrl(stateCode, sessionId, IdentityPoolService.toGeoCity(city));
    if (!proxyResult) {
      console.error(`[IdentityPool] Failed to build proxy URL for ${stateCode}/${city}`);
      return null;
    }

    // Best-effort IP discovery through the proxy
    let ipAddress: string | null = null;
    try {
      const axios = require('axios');
      const HttpsProxyAgent = require('https-proxy-agent').HttpsProxyAgent;
      const agent = new HttpsProxyAgent(proxyResult.url);

      const response = await axios.get('https://api.ipify.org?format=json', {
        httpsAgent: agent,
        timeout: 15000,
      });
      ipAddress = response.data?.ip || null;
      console.log(`[IdentityPool] New identity IP: ${ipAddress} (${proxyResult.geo})`);
    } catch (err: unknown) {
      const message = err instanceof Error ? err.message : String(err);
      console.error(`[IdentityPool] Failed to get IP for new identity: ${message}`);
      // Still create identity - IP will be detected during preflight
    }

    const fingerprint = generateFingerprint(stateCode);

    const insertResult = await this.pool.query(`
      INSERT INTO worker_identities (
        session_id, ip_address, state_code, city, fingerprint,
        is_active, active_worker_id, last_used_at
      ) VALUES ($1, $2, $3, $4, $5, TRUE, $6, NOW())
      RETURNING *
    `, [
      sessionId,
      ipAddress,
      stateCode,
      city || null,
      JSON.stringify(fingerprint),
      workerId,
    ]);

    const created = insertResult.rows[0];
    if (!created) {
      return null;
    }

    console.log(`[IdentityPool] Created new identity #${created.id} for ${stateCode}/${city || 'any'}`);
    return this.rowToIdentity(created);
  }

  /**
   * Release an identity back to the pool via the `release_identity` SQL function.
   *
   * @param identityId - Identity to release.
   * @param tasksCompleted - Tasks finished with this identity during the session.
   * @param failed - True when the identity is released because it failed
   *   (e.g. preflight did not pass).
   */
  async releaseIdentity(
    identityId: number,
    tasksCompleted: number = 0,
    failed: boolean = false
  ): Promise<void> {
    await this.pool.query(
      `SELECT release_identity($1, $2, $3)`,
      [identityId, tasksCompleted, failed]
    );
    console.log(`[IdentityPool] Released identity #${identityId} (${tasksCompleted} tasks, failed=${failed})`);
  }

  /**
   * Get tasks matching an identity's geo.
   *
   * @param limit - Passed to `get_tasks_for_identity` (default 5).
   */
  async getTasksForIdentity(
    stateCode: string,
    city: string | null,
    limit: number = 5
  ): Promise<TaskForIdentity[]> {
    const result = await this.pool.query(
      `SELECT * FROM get_tasks_for_identity($1, $2, $3)`,
      [stateCode, city, limit]
    );
    return result.rows;
  }

  /**
   * Rebuild the proxy URL for an identity from its state, session id and city.
   *
   * @returns The proxy URL, or null if the URL builder returns nothing.
   */
  getProxyUrl(identity: WorkerIdentity): string | null {
    const proxyResult = buildEvomiProxyUrl(
      identity.state_code,
      identity.session_id,
      IdentityPoolService.toGeoCity(identity.city)
    );
    return proxyResult?.url || null;
  }

  /**
   * Get identity pool status (all rows of `identity_pool_status`).
   */
  async getPoolStatus(): Promise<any[]> {
    const result = await this.pool.query(`SELECT * FROM identity_pool_status`);
    return result.rows;
  }

  /**
   * Mark identity as unhealthy (e.g., IP got blocked).
   * Also clears the active flag and the owning worker.
   */
  async markUnhealthy(identityId: number): Promise<void> {
    await this.pool.query(`
      UPDATE worker_identities
      SET is_healthy = FALSE, is_active = FALSE, active_worker_id = NULL
      WHERE id = $1
    `, [identityId]);
    console.log(`[IdentityPool] Marked identity #${identityId} as unhealthy`);
  }

  /**
   * Convert a DB row to a WorkerIdentity.
   * The fingerprint column may arrive as a JSON string or as an
   * already-parsed object; both are accepted.
   */
  private rowToIdentity(row: any): WorkerIdentity {
    const fingerprint = typeof row.fingerprint === 'string'
      ? JSON.parse(row.fingerprint)
      : row.fingerprint;

    return {
      id: row.id,
      session_id: row.session_id,
      ip_address: row.ip_address,
      state_code: row.state_code,
      city: row.city,
      fingerprint,
      created_at: row.created_at,
      last_used_at: row.last_used_at,
      cooldown_until: row.cooldown_until,
      total_tasks_completed: row.total_tasks_completed,
      total_sessions: row.total_sessions,
      is_active: row.is_active,
      active_worker_id: row.active_worker_id,
      is_healthy: row.is_healthy,
    };
  }
}
|
||||
|
||||
/**
 * Number of tasks an identity should run in one session.
 *
 * @returns A random integer: 3, 4 or 5.
 */
export function getRandomTaskCount(): number {
  const minimum = 3;
  const choices = 3; // 3, 4, or 5
  const offset = Math.floor(Math.random() * choices);
  return minimum + offset;
}
|
||||
@@ -84,9 +84,11 @@ export interface PuppeteerPreflightResult extends PreflightResult {
|
||||
* Tests browser-based access with anti-detect verification via fingerprint.com
|
||||
*
|
||||
* @param crawlRotator - CrawlRotator instance to get proxy from pool
|
||||
* @param customProxyUrl - Optional custom proxy URL (for identity pool system)
|
||||
*/
|
||||
export async function runPuppeteerPreflight(
|
||||
crawlRotator?: CrawlRotator
|
||||
crawlRotator?: CrawlRotator,
|
||||
customProxyUrl?: string
|
||||
): Promise<PuppeteerPreflightResult> {
|
||||
const result: PuppeteerPreflightResult = {
|
||||
method: 'http',
|
||||
@@ -105,21 +107,37 @@ export async function runPuppeteerPreflight(
|
||||
let browser: any = null;
|
||||
|
||||
try {
|
||||
// Step 0: Get a proxy - prefer Evomi API, fall back to DB pool
|
||||
// Step 0: Get a proxy - custom URL, Evomi API, or DB pool
|
||||
let proxyUrl: string | null = null;
|
||||
let expectedProxyHost: string | null = null;
|
||||
|
||||
// Try Evomi first (dynamic residential proxies)
|
||||
const evomiConfig = getEvomiConfig();
|
||||
if (evomiConfig.enabled) {
|
||||
// Use AZ as default state for preflight testing
|
||||
const evomiProxy = buildEvomiProxyUrl('AZ', 'preflight-test');
|
||||
if (evomiProxy) {
|
||||
result.proxyAvailable = true;
|
||||
proxyUrl = evomiProxy.url;
|
||||
expectedProxyHost = evomiConfig.host;
|
||||
result.expectedProxyIp = expectedProxyHost;
|
||||
console.log(`[PuppeteerPreflight] Using Evomi proxy: ${evomiProxy.geo}`);
|
||||
// Use custom proxy URL if provided (for identity pool system)
|
||||
if (customProxyUrl) {
|
||||
result.proxyAvailable = true;
|
||||
proxyUrl = customProxyUrl;
|
||||
try {
|
||||
const parsedUrl = new URL(customProxyUrl);
|
||||
expectedProxyHost = parsedUrl.hostname;
|
||||
} catch {
|
||||
expectedProxyHost = 'custom';
|
||||
}
|
||||
result.expectedProxyIp = expectedProxyHost;
|
||||
console.log(`[PuppeteerPreflight] Using custom proxy URL (identity pool)`);
|
||||
}
|
||||
|
||||
// Try Evomi if no custom proxy (dynamic residential proxies)
|
||||
if (!proxyUrl) {
|
||||
const evomiConfig = getEvomiConfig();
|
||||
if (evomiConfig.enabled) {
|
||||
// Use AZ as default state for preflight testing
|
||||
const evomiProxy = buildEvomiProxyUrl('AZ', 'preflight-test');
|
||||
if (evomiProxy) {
|
||||
result.proxyAvailable = true;
|
||||
proxyUrl = evomiProxy.url;
|
||||
expectedProxyHost = evomiConfig.host;
|
||||
result.expectedProxyIp = expectedProxyHost;
|
||||
console.log(`[PuppeteerPreflight] Using Evomi proxy: ${evomiProxy.geo}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -135,9 +153,9 @@ export async function runPuppeteerPreflight(
|
||||
}
|
||||
}
|
||||
|
||||
// No proxy available from either source
|
||||
// No proxy available from any source
|
||||
if (!proxyUrl) {
|
||||
result.error = 'No proxy available (Evomi not configured, DB pool empty)';
|
||||
result.error = 'No proxy available (no custom URL, Evomi not configured, DB pool empty)';
|
||||
console.log(`[PuppeteerPreflight] FAILED - No proxy available`);
|
||||
return result;
|
||||
}
|
||||
@@ -318,10 +336,12 @@ export async function runPuppeteerPreflight(
|
||||
*
|
||||
* @param crawlRotator - CrawlRotator instance to get proxy from pool
|
||||
* @param maxRetries - Number of retry attempts (default 1)
|
||||
* @param customProxyUrl - Optional custom proxy URL (for identity pool system)
|
||||
*/
|
||||
export async function runPuppeteerPreflightWithRetry(
|
||||
crawlRotator?: CrawlRotator,
|
||||
maxRetries: number = 1
|
||||
maxRetries: number = 1,
|
||||
customProxyUrl?: string
|
||||
): Promise<PuppeteerPreflightResult> {
|
||||
let lastResult: PuppeteerPreflightResult | null = null;
|
||||
|
||||
@@ -331,7 +351,7 @@ export async function runPuppeteerPreflightWithRetry(
|
||||
await new Promise((r) => setTimeout(r, 5000)); // Wait 5s between retries
|
||||
}
|
||||
|
||||
lastResult = await runPuppeteerPreflight(crawlRotator);
|
||||
lastResult = await runPuppeteerPreflight(crawlRotator, customProxyUrl);
|
||||
|
||||
if (lastResult.passed) {
|
||||
return lastResult;
|
||||
|
||||
@@ -70,6 +70,12 @@ import { runPuppeteerPreflightWithRetry, PuppeteerPreflightResult } from '../ser
|
||||
// Geo-targeted proxy support
|
||||
import { buildEvomiProxyUrl, getEvomiConfig } from '../services/crawl-rotator';
|
||||
|
||||
// Identity pool for diverse IP/fingerprint rotation
|
||||
import { IdentityPoolService, WorkerIdentity, getRandomTaskCount } from '../services/identity-pool';
|
||||
|
||||
// Feature flag: Use new identity pool system (set via env var)
|
||||
const USE_IDENTITY_POOL = process.env.USE_IDENTITY_POOL === 'true';
|
||||
|
||||
// Task handlers by role
|
||||
// Platform-based handlers: {task}-{platform}.ts convention
|
||||
import { handleProductRefresh } from './handlers/product-refresh';
|
||||
@@ -357,12 +363,31 @@ export class TaskWorker {
|
||||
private storedTimezone: string | null = null;
|
||||
private storedFingerprint: WorkerFingerprint | null = null;
|
||||
|
||||
// ==========================================================================
|
||||
// IDENTITY POOL TRACKING (new system - enabled via USE_IDENTITY_POOL)
|
||||
// ==========================================================================
|
||||
// Workers claim identities (IP + fingerprint) from pool.
|
||||
// Each identity used for 3-5 tasks, then cools down 2-3 hours.
|
||||
// This creates diverse, natural browsing patterns.
|
||||
// ==========================================================================
|
||||
private identityPool: IdentityPoolService | null = null;
|
||||
private currentIdentity: WorkerIdentity | null = null;
|
||||
private identityTasksCompleted: number = 0;
|
||||
private identityMaxTasks: number = 5; // Random 3-5, set when identity claimed
|
||||
private identityProxyUrl: string | null = null;
|
||||
|
||||
constructor(role: TaskRole | null = null, workerId?: string) {
|
||||
this.pool = getPool();
|
||||
this.role = role;
|
||||
this.workerId = workerId || `worker-${uuidv4().slice(0, 8)}`;
|
||||
this.crawlRotator = new CrawlRotator(this.pool);
|
||||
|
||||
// Initialize identity pool if feature enabled
|
||||
if (USE_IDENTITY_POOL) {
|
||||
this.identityPool = new IdentityPoolService(this.pool);
|
||||
console.log(`[TaskWorker] Identity pool system ENABLED`);
|
||||
}
|
||||
|
||||
// Initialize CPU tracking
|
||||
const cpuUsage = process.cpuUsage();
|
||||
this.lastCpuUsage = { user: cpuUsage.user, system: cpuUsage.system };
|
||||
@@ -899,6 +924,179 @@ export class TaskWorker {
|
||||
}
|
||||
}
|
||||
|
||||
// ==========================================================================
|
||||
// IDENTITY POOL METHODS (new system - enabled via USE_IDENTITY_POOL)
|
||||
// ==========================================================================
|
||||
|
||||
/**
 * Ensure worker has a valid identity from the pool.
 * If there is no identity, or the current one is exhausted, claim/create a new one.
 *
 * Flow:
 * 1. Keep the current identity while it has tasks remaining
 * 2. Otherwise release it (if any)
 * 3. Query pending tasks to find the target city/state
 * 4. Claim an existing identity or create a new one
 * 5. Run preflight with the identity's proxy
 *
 * @returns true if the worker is ready to claim tasks; false if no tasks are
 *   pending, no identity could be obtained, preflight failed, or an error occurred.
 */
private async ensureIdentity(): Promise<boolean> {
  const identityPool = this.identityPool;
  if (!identityPool) {
    console.error(`[TaskWorker] Identity pool not initialized`);
    return false;
  }

  try {
    // Current identity still has tasks remaining
    if (this.currentIdentity && this.identityTasksCompleted < this.identityMaxTasks) {
      return true;
    }

    // Release current identity if exhausted
    if (this.currentIdentity) {
      await this.releaseCurrentIdentity();
    }

    // Find target city/state based on pending tasks
    const pendingGeo = await identityPool.getPendingTasksByGeo(5);
    if (pendingGeo.length === 0) {
      console.log(`[TaskWorker] ${this.friendlyName} no pending tasks available`);
      return false;
    }

    // Take the first bucket returned (expected to be the one with the most
    // pending tasks - ordering is decided by the SQL function).
    const target = pendingGeo[0];
    console.log(`[TaskWorker] ${this.friendlyName} targeting ${target.city || 'any'}, ${target.state_code} (${target.pending_count} pending tasks)`);

    // Try to claim existing identity
    let identity = await identityPool.claimIdentity(
      this.workerId,
      target.state_code,
      target.city || undefined
    );

    // Create new identity if none available
    if (!identity) {
      console.log(`[TaskWorker] ${this.friendlyName} no available identity, creating new one...`);
      identity = await identityPool.createIdentity(
        this.workerId,
        target.state_code,
        target.city || undefined
      );
    }

    if (!identity) {
      console.error(`[TaskWorker] ${this.friendlyName} failed to claim/create identity`);
      return false;
    }

    // Set up identity
    this.currentIdentity = identity;
    this.identityTasksCompleted = 0;
    this.identityMaxTasks = getRandomTaskCount(); // 3-5 random
    this.identityProxyUrl = identityPool.getProxyUrl(identity);

    // Update stored fingerprint from identity
    this.storedFingerprint = {
      timezone: identity.fingerprint.timezone,
      city: identity.city || undefined,
      state: identity.state_code,
      ip: identity.ip_address || undefined,
      locale: identity.fingerprint.locale,
    };
    this.storedTimezone = identity.fingerprint.timezone;

    console.log(`[TaskWorker] ${this.friendlyName} claimed identity #${identity.id}: ${identity.city || 'any'}, ${identity.state_code} (max ${this.identityMaxTasks} tasks)`);
    console.log(`[TaskWorker] ${this.friendlyName} fingerprint: ${identity.fingerprint.browser} on ${identity.fingerprint.os}, ${identity.fingerprint.device}`);

    // Run preflight with this identity's proxy
    const preflightPassed = await this.runIdentityPreflight();
    if (!preflightPassed) {
      console.error(`[TaskWorker] ${this.friendlyName} preflight failed for identity #${identity.id}`);

      // Forget the identity BEFORE the release query. If the query throws,
      // the catch below returns false and the next call must not find a
      // cached identity with 0 tasks used - it would be accepted as valid
      // even though it never passed preflight.
      this.currentIdentity = null;
      this.identityProxyUrl = null;

      await identityPool.releaseIdentity(identity.id, 0, true); // Release as failed
      return false;
    }

    return true;
  } catch (err: unknown) {
    const message = err instanceof Error ? err.message : String(err);
    console.error(`[TaskWorker] ${this.friendlyName} ensureIdentity error: ${message}`);
    return false;
  }
}
|
||||
|
||||
/**
 * Run the puppeteer preflight through the current identity's proxy.
 *
 * On success the detected proxy IP (if any) is written back to the identity's
 * row and the worker's HTTP preflight flag/result are updated.
 *
 * @returns true if preflight passed; false if there is no current identity or
 *   proxy URL, preflight failed, or an error was thrown.
 */
private async runIdentityPreflight(): Promise<boolean> {
  const identity = this.currentIdentity;
  const proxyUrl = this.identityProxyUrl;
  if (!identity || !proxyUrl) {
    return false;
  }

  console.log(`[TaskWorker] ${this.friendlyName} running preflight for identity #${identity.id}...`);
  console.log(`[TaskWorker] ${this.friendlyName} using identity proxy URL for ${identity.state_code}/${identity.city || 'any'}`);

  try {
    // Single attempt, using the identity's specific proxy URL
    const result = await runPuppeteerPreflightWithRetry(this.crawlRotator, 1, proxyUrl);

    if (!result.passed) {
      console.error(`[TaskWorker] ${this.friendlyName} identity preflight FAILED: ${result.error}`);
      return false;
    }

    console.log(`[TaskWorker] ${this.friendlyName} identity preflight PASSED (IP: ${result.proxyIp})`);

    // Persist the IP only if preflight reported one and the identity is still current
    if (result.proxyIp && this.currentIdentity) {
      await this.pool.query(`
          UPDATE worker_identities SET ip_address = $2 WHERE id = $1
        `, [this.currentIdentity.id, result.proxyIp]);
    }

    this.preflightHttpPassed = true;
    this.preflightHttpResult = result;
    return true;
  } catch (err: any) {
    console.error(`[TaskWorker] ${this.friendlyName} identity preflight error: ${err.message}`);
    return false;
  }
}
|
||||
|
||||
/**
 * Hand the current identity back to the pool (not marked as failed) and reset
 * the worker's per-identity state. Does nothing when there is no current
 * identity or no identity pool.
 */
private async releaseCurrentIdentity(): Promise<void> {
  const identity = this.currentIdentity;
  const identityPool = this.identityPool;
  if (!identity || !identityPool) {
    return;
  }

  const completed = this.identityTasksCompleted;
  console.log(`[TaskWorker] ${this.friendlyName} releasing identity #${identity.id} (${completed} tasks completed)`);

  // Third argument false: a normal release, not a failure.
  await identityPool.releaseIdentity(identity.id, completed, false);

  // Local state is reset only after the release succeeded.
  this.currentIdentity = null;
  this.identityTasksCompleted = 0;
  this.identityProxyUrl = null;
  this.preflightHttpPassed = false; // Need new preflight for next identity
}
|
||||
|
||||
/**
 * Count one more task against the current identity and log the progress.
 * Call this after each successful task completion; it is a no-op when the
 * worker holds no identity.
 */
private incrementIdentityTaskCount(): void {
  const identity = this.currentIdentity;
  if (!identity) {
    return;
  }

  this.identityTasksCompleted += 1;
  console.log(`[TaskWorker] ${this.friendlyName} identity #${identity.id} task ${this.identityTasksCompleted}/${this.identityMaxTasks}`);
}
|
||||
|
||||
/**
|
||||
* Get the effective max concurrent tasks based on working hours.
|
||||
* Uses the worker's timezone (from preflight IP geolocation) to determine
|
||||
@@ -1288,17 +1486,39 @@ export class TaskWorker {
|
||||
}
|
||||
|
||||
// =================================================================
|
||||
// GEO SESSION GATE - Ensure worker has valid geo assignment
|
||||
// Worker must have a state assignment to claim tasks.
|
||||
// Session = 60 min OR 7 stores, whichever comes first.
|
||||
// If no valid session, assign one based on demand.
|
||||
// GEO SESSION / IDENTITY GATE
|
||||
// Worker must have a geo assignment or identity to claim tasks.
|
||||
//
|
||||
// USE_IDENTITY_POOL=true (new system):
|
||||
// - Worker claims identity (IP + fingerprint) from pool
|
||||
// - Each identity used for 3-5 tasks, then 2-3 hour cooldown
|
||||
// - Creates diverse, natural browsing patterns
|
||||
//
|
||||
// USE_IDENTITY_POOL=false (legacy system):
|
||||
// - Session = 60 min OR 7 stores, whichever comes first
|
||||
// - State assigned based on demand
|
||||
// =================================================================
|
||||
const geoValid = await this.ensureGeoSession();
|
||||
if (!geoValid) {
|
||||
// No tasks available in any state, or assignment failed
|
||||
console.log(`[TaskWorker] ${this.friendlyName} no geo session available, waiting...`);
|
||||
await this.sleep(30000);
|
||||
return;
|
||||
let geoValid: boolean;
|
||||
if (USE_IDENTITY_POOL) {
|
||||
geoValid = await this.ensureIdentity();
|
||||
if (!geoValid) {
|
||||
console.log(`[TaskWorker] ${this.friendlyName} no identity available, waiting...`);
|
||||
await this.sleep(30000);
|
||||
return;
|
||||
}
|
||||
// Update geoState for task claiming filter
|
||||
if (this.currentIdentity) {
|
||||
this.geoState = this.currentIdentity.state_code;
|
||||
this.geoCity = this.currentIdentity.city;
|
||||
}
|
||||
} else {
|
||||
geoValid = await this.ensureGeoSession();
|
||||
if (!geoValid) {
|
||||
// No tasks available in any state, or assignment failed
|
||||
console.log(`[TaskWorker] ${this.friendlyName} no geo session available, waiting...`);
|
||||
await this.sleep(30000);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// =================================================================
|
||||
@@ -1450,6 +1670,11 @@ export class TaskWorker {
|
||||
await this.reportTaskCompletion(true);
|
||||
console.log(`[TaskWorker] ${this.friendlyName} completed task ${task.id} [${this.activeTasks.size}/${this.maxConcurrentTasks} active]`);
|
||||
|
||||
// Track identity task count (for identity pool rotation)
|
||||
if (USE_IDENTITY_POOL) {
|
||||
this.incrementIdentityTaskCount();
|
||||
}
|
||||
|
||||
// Chain next task if applicable
|
||||
const chainedTask = await taskService.chainNextTask({
|
||||
...task,
|
||||
|
||||
Reference in New Issue
Block a user