feat: Stealth worker system with mandatory proxy rotation
## Worker System
- Role-agnostic workers that can handle any task type
- Pod-based architecture with StatefulSet (5-15 pods, 5 workers each)
- Custom pod names (Aethelgard, Xylos, Kryll, etc.)
- Worker registry with friendly names and resource monitoring
- Hub-and-spoke visualization on JobQueue page

## Stealth & Anti-Detection (REQUIRED)
- Proxies are MANDATORY - workers fail to start without active proxies
- CrawlRotator initializes on worker startup
- Loads proxies from `proxies` table
- Auto-rotates proxy + fingerprint on 403 errors
- 12 browser fingerprints (Chrome, Firefox, Safari, Edge)
- Locale/timezone matching for geographic consistency

## Task System
- Renamed product_resync → product_refresh
- Task chaining: store_discovery → entry_point → product_discovery
- Priority-based claiming with FOR UPDATE SKIP LOCKED
- Heartbeat and stale task recovery

## UI Updates
- JobQueue: Pod visualization, resource monitoring on hover
- WorkersDashboard: Simplified worker list
- Removed unused filters from task list

## Other
- IP2Location service for visitor analytics
- Findagram consumer features scaffolding
- Documentation updates

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,26 +1,58 @@
|
||||
/**
|
||||
* Task Worker
|
||||
*
|
||||
* A unified worker that processes tasks from the worker_tasks queue.
|
||||
* Replaces the fragmented job systems (job_schedules, dispensary_crawl_jobs, etc.)
|
||||
* A unified worker that pulls tasks from the worker_tasks queue.
|
||||
* Workers register on startup, get a friendly name, and pull tasks.
|
||||
*
|
||||
* Architecture:
|
||||
* - Tasks are generated on schedule (by scheduler or API)
|
||||
* - Workers PULL tasks from the pool (not assigned to them)
|
||||
* - Tasks are claimed in order of priority (DESC) then creation time (ASC)
|
||||
* - Workers report heartbeats to worker_registry
|
||||
* - Workers are ROLE-AGNOSTIC by default (can handle any task type)
|
||||
*
|
||||
* Stealth & Anti-Detection:
|
||||
* PROXIES ARE REQUIRED - workers will fail to start if no proxies available.
|
||||
*
|
||||
* On startup, workers initialize the CrawlRotator which provides:
|
||||
* - Proxy rotation: Loads proxies from `proxies` table, ALL requests use proxy
|
||||
* - User-Agent rotation: Cycles through realistic browser fingerprints
|
||||
* - Fingerprint rotation: Changes browser profile on blocks
|
||||
* - Locale/timezone: Matches Accept-Language to target state
|
||||
*
|
||||
* The CrawlRotator is wired to the Dutchie client via setCrawlRotator().
|
||||
* Task handlers call startSession() which picks a random fingerprint.
|
||||
* On 403 errors, the client automatically:
|
||||
* 1. Records failure on current proxy
|
||||
* 2. Rotates to next proxy
|
||||
* 3. Rotates fingerprint
|
||||
* 4. Retries the request
|
||||
*
|
||||
* Usage:
|
||||
* WORKER_ROLE=product_resync npx tsx src/tasks/task-worker.ts
|
||||
* npx tsx src/tasks/task-worker.ts # Role-agnostic (any task)
|
||||
* WORKER_ROLE=product_refresh npx tsx src/tasks/task-worker.ts # Role-specific
|
||||
*
|
||||
* Environment:
|
||||
* WORKER_ROLE - Which task role to process (required)
|
||||
* WORKER_ID - Optional custom worker ID
|
||||
* WORKER_ROLE - Which task role to process (optional, null = any task)
|
||||
* WORKER_ID - Optional custom worker ID (auto-generated if not provided)
|
||||
* POD_NAME - Kubernetes pod name (optional)
|
||||
* POLL_INTERVAL_MS - How often to check for tasks (default: 5000)
|
||||
* HEARTBEAT_INTERVAL_MS - How often to update heartbeat (default: 30000)
|
||||
* API_BASE_URL - Backend API URL for registration (default: http://localhost:3010)
|
||||
*/
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { v4 as uuidv4 } from 'uuid';
|
||||
import { taskService, TaskRole, WorkerTask } from './task-service';
|
||||
import { getPool } from '../db/pool';
|
||||
import os from 'os';
|
||||
|
||||
// Stealth/rotation support
|
||||
import { CrawlRotator } from '../services/crawl-rotator';
|
||||
import { setCrawlRotator } from '../platforms/dutchie';
|
||||
|
||||
// Task handlers by role
|
||||
import { handleProductResync } from './handlers/product-resync';
|
||||
import { handleProductRefresh } from './handlers/product-refresh';
|
||||
import { handleProductDiscovery } from './handlers/product-discovery';
|
||||
import { handleStoreDiscovery } from './handlers/store-discovery';
|
||||
import { handleEntryPointDiscovery } from './handlers/entry-point-discovery';
|
||||
@@ -28,6 +60,7 @@ import { handleAnalyticsRefresh } from './handlers/analytics-refresh';
|
||||
|
||||
const POLL_INTERVAL_MS = parseInt(process.env.POLL_INTERVAL_MS || '5000');
|
||||
const HEARTBEAT_INTERVAL_MS = parseInt(process.env.HEARTBEAT_INTERVAL_MS || '30000');
|
||||
const API_BASE_URL = process.env.API_BASE_URL || 'http://localhost:3010';
|
||||
|
||||
export interface TaskContext {
|
||||
pool: Pool;
|
||||
@@ -48,7 +81,7 @@ export interface TaskResult {
|
||||
type TaskHandler = (ctx: TaskContext) => Promise<TaskResult>;
|
||||
|
||||
const TASK_HANDLERS: Record<TaskRole, TaskHandler> = {
|
||||
product_resync: handleProductResync,
|
||||
product_refresh: handleProductRefresh,
|
||||
product_discovery: handleProductDiscovery,
|
||||
store_discovery: handleStoreDiscovery,
|
||||
entry_point_discovery: handleEntryPointDiscovery,
|
||||
@@ -58,15 +91,160 @@ const TASK_HANDLERS: Record<TaskRole, TaskHandler> = {
|
||||
export class TaskWorker {
|
||||
private pool: Pool;
|
||||
private workerId: string;
|
||||
private role: TaskRole;
|
||||
private role: TaskRole | null; // null = role-agnostic (any task)
|
||||
private friendlyName: string = '';
|
||||
private isRunning: boolean = false;
|
||||
private heartbeatInterval: NodeJS.Timeout | null = null;
|
||||
private registryHeartbeatInterval: NodeJS.Timeout | null = null;
|
||||
private currentTask: WorkerTask | null = null;
|
||||
private crawlRotator: CrawlRotator;
|
||||
|
||||
constructor(role: TaskRole, workerId?: string) {
|
||||
constructor(role: TaskRole | null = null, workerId?: string) {
|
||||
this.pool = getPool();
|
||||
this.role = role;
|
||||
this.workerId = workerId || `worker-${role}-${uuidv4().slice(0, 8)}`;
|
||||
this.workerId = workerId || `worker-${uuidv4().slice(0, 8)}`;
|
||||
this.crawlRotator = new CrawlRotator(this.pool);
|
||||
}
|
||||
|
||||
/**
 * Bring up the stealth layer (proxy rotation and browser fingerprints).
 * Runs once during worker startup, before any task is processed.
 *
 * Proxies are mandatory: startup is aborted when the rotator reports zero
 * active proxies after it has been initialized.
 *
 * @throws Error when no active proxies are available.
 */
private async initializeStealth(): Promise<void> {
  const rotator = this.crawlRotator;

  // Let the rotator load its proxy list from the database
  await rotator.initialize();

  const { activeProxies, avgSuccessRate } = rotator.proxy.getStats();
  if (activeProxies === 0) {
    throw new Error('No active proxies available. Workers MUST use proxies for all requests. Add proxies to the database before starting workers.');
  }

  console.log(`[TaskWorker] Loaded ${activeProxies} proxies (${avgSuccessRate.toFixed(1)}% avg success rate)`);

  // Hand the rotator to the Dutchie client so its requests go through it
  setCrawlRotator(rotator);

  const fingerprintCount = rotator.userAgent.getCount();
  console.log(`[TaskWorker] Stealth initialized: ${fingerprintCount} fingerprints, proxy REQUIRED for all requests`);
}
|
||||
|
||||
/**
 * Register this worker with the worker registry to obtain a friendly name.
 *
 * Registration is best-effort. On any failure (network error, unparsable
 * response body, a response without `success`, or a successful response
 * that carries no usable name) the worker falls back to the first 12
 * characters of its worker ID and keeps running.
 */
private async register(): Promise<void> {
  // Display name used whenever the registry does not supply one
  const fallbackName = this.workerId.slice(0, 12);

  try {
    const response = await fetch(`${API_BASE_URL}/api/worker-registry/register`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        role: this.role,
        worker_id: this.workerId,
        pod_name: process.env.POD_NAME || process.env.HOSTNAME,
        hostname: os.hostname(),
        metadata: {
          pid: process.pid,
          node_version: process.version,
          started_at: new Date().toISOString()
        }
      })
    });

    const data = await response.json();
    if (data.success) {
      // A "successful" payload without a name must not leave friendlyName
      // undefined, otherwise later log lines would print "undefined".
      const registryName = data.friendly_name;
      this.friendlyName =
        typeof registryName === 'string' && registryName.length > 0 ? registryName : fallbackName;
      console.log(`[TaskWorker] ${data.message}`);
    } else {
      console.warn(`[TaskWorker] Registration warning: ${data.error}`);
      this.friendlyName = fallbackName;
    }
  } catch (error: unknown) {
    // Registration is optional - worker can still function without it.
    // Thrown values are not guaranteed to be Error instances, so narrow first.
    const reason = error instanceof Error ? error.message : String(error);
    console.warn(`[TaskWorker] Could not register with API (will continue): ${reason}`);
    this.friendlyName = fallbackName;
  }
}
|
||||
|
||||
/**
 * Tell the worker registry that this worker is going away.
 *
 * Best-effort: if sending the request throws, the error is swallowed and
 * the sign-off message is simply not logged.
 */
private async deregister(): Promise<void> {
  const endpoint = `${API_BASE_URL}/api/worker-registry/deregister`;
  const payload = JSON.stringify({ worker_id: this.workerId });

  try {
    await fetch(endpoint, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: payload
    });
    console.log(`[TaskWorker] ${this.friendlyName} signed off`);
  } catch {
    // Deregistration failures are deliberately ignored
  }
}
|
||||
|
||||
/**
 * Post a heartbeat to the worker registry, including the current task (if
 * any), an active/idle status, and a snapshot of this process's memory and
 * CPU usage.
 *
 * Best-effort: any error raised while building or sending the heartbeat is
 * swallowed.
 */
private async sendRegistryHeartbeat(): Promise<void> {
  // Unit helpers: bytes -> whole megabytes, and divide-by-1000 for CPU times
  const toMb = (bytes: number): number => Math.round(bytes / 1024 / 1024);
  const toMs = (value: number): number => Math.round(value / 1000);

  try {
    const { heapUsed, heapTotal, rss } = process.memoryUsage();
    const { user, system } = process.cpuUsage();
    const activeTask = this.currentTask;

    const payload = {
      worker_id: this.workerId,
      current_task_id: activeTask?.id || null,
      status: activeTask ? 'active' : 'idle',
      resources: {
        memory_mb: toMb(heapUsed),
        memory_total_mb: toMb(heapTotal),
        memory_rss_mb: toMb(rss),
        cpu_user_ms: toMs(user),
        cpu_system_ms: toMs(system),
      }
    };

    await fetch(`${API_BASE_URL}/api/worker-registry/heartbeat`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload)
    });
  } catch {
    // Heartbeat failures are deliberately ignored
  }
}
|
||||
|
||||
/**
 * Notify the worker registry that this worker finished a task.
 *
 * Best-effort: errors thrown while sending the request are swallowed.
 *
 * @param success - Whether the task finished successfully.
 */
private async reportTaskCompletion(success: boolean): Promise<void> {
  const endpoint = `${API_BASE_URL}/api/worker-registry/task-completed`;
  const payload = JSON.stringify({ worker_id: this.workerId, success });

  try {
    await fetch(endpoint, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: payload
    });
  } catch {
    // Reporting failures are deliberately ignored
  }
}
|
||||
|
||||
/**
 * Start sending registry heartbeats every HEARTBEAT_INTERVAL_MS.
 *
 * Safe to call more than once: a timer left over from a previous call is
 * cleared first, so at most one heartbeat timer exists per worker.
 */
private startRegistryHeartbeat(): void {
  // Overwriting the handle without clearing it would leak a timer that
  // could never be stopped afterwards.
  if (this.registryHeartbeatInterval) {
    clearInterval(this.registryHeartbeatInterval);
  }

  this.registryHeartbeatInterval = setInterval(() => {
    // Fire-and-forget: the timer callback does not use the result
    void this.sendRegistryHeartbeat();
  }, HEARTBEAT_INTERVAL_MS);
}
|
||||
|
||||
/**
 * Cancel the registry heartbeat timer, if one is running, and reset the
 * stored handle to null. Does nothing when no timer is active.
 */
private stopRegistryHeartbeat(): void {
  const timer = this.registryHeartbeatInterval;
  if (!timer) {
    return;
  }

  clearInterval(timer);
  this.registryHeartbeatInterval = null;
}
|
||||
|
||||
/**
|
||||
@@ -74,7 +252,18 @@ export class TaskWorker {
|
||||
*/
|
||||
async start(): Promise<void> {
|
||||
this.isRunning = true;
|
||||
console.log(`[TaskWorker] Starting worker ${this.workerId} for role: ${this.role}`);
|
||||
|
||||
// Initialize stealth systems (proxy rotation, fingerprints)
|
||||
await this.initializeStealth();
|
||||
|
||||
// Register with the API to get a friendly name
|
||||
await this.register();
|
||||
|
||||
// Start registry heartbeat
|
||||
this.startRegistryHeartbeat();
|
||||
|
||||
const roleMsg = this.role ? `for role: ${this.role}` : '(role-agnostic - any task)';
|
||||
console.log(`[TaskWorker] ${this.friendlyName} starting ${roleMsg}`);
|
||||
|
||||
while (this.isRunning) {
|
||||
try {
|
||||
@@ -91,10 +280,12 @@ export class TaskWorker {
|
||||
/**
|
||||
* Stop the worker
|
||||
*/
|
||||
stop(): void {
|
||||
async stop(): Promise<void> {
|
||||
this.isRunning = false;
|
||||
this.stopHeartbeat();
|
||||
console.log(`[TaskWorker] Stopping worker ${this.workerId}...`);
|
||||
this.stopRegistryHeartbeat();
|
||||
await this.deregister();
|
||||
console.log(`[TaskWorker] ${this.friendlyName} stopped`);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -142,7 +333,8 @@ export class TaskWorker {
|
||||
if (result.success) {
|
||||
// Mark as completed
|
||||
await taskService.completeTask(task.id, result);
|
||||
console.log(`[TaskWorker] Task ${task.id} completed successfully`);
|
||||
await this.reportTaskCompletion(true);
|
||||
console.log(`[TaskWorker] ${this.friendlyName} completed task ${task.id}`);
|
||||
|
||||
// Chain next task if applicable
|
||||
const chainedTask = await taskService.chainNextTask({
|
||||
@@ -156,12 +348,14 @@ export class TaskWorker {
|
||||
} else {
|
||||
// Mark as failed
|
||||
await taskService.failTask(task.id, result.error || 'Unknown error');
|
||||
console.log(`[TaskWorker] Task ${task.id} failed: ${result.error}`);
|
||||
await this.reportTaskCompletion(false);
|
||||
console.log(`[TaskWorker] ${this.friendlyName} failed task ${task.id}: ${result.error}`);
|
||||
}
|
||||
} catch (error: any) {
|
||||
// Mark as failed
|
||||
await taskService.failTask(task.id, error.message);
|
||||
console.error(`[TaskWorker] Task ${task.id} threw error:`, error.message);
|
||||
await this.reportTaskCompletion(false);
|
||||
console.error(`[TaskWorker] ${this.friendlyName} task ${task.id} error:`, error.message);
|
||||
} finally {
|
||||
this.stopHeartbeat();
|
||||
this.currentTask = null;
|
||||
@@ -201,7 +395,7 @@ export class TaskWorker {
|
||||
/**
|
||||
* Get worker info
|
||||
*/
|
||||
getInfo(): { workerId: string; role: TaskRole; isRunning: boolean; currentTaskId: number | null } {
|
||||
getInfo(): { workerId: string; role: TaskRole | null; isRunning: boolean; currentTaskId: number | null } {
|
||||
return {
|
||||
workerId: this.workerId,
|
||||
role: this.role,
|
||||
@@ -216,30 +410,27 @@ export class TaskWorker {
|
||||
// ============================================================
|
||||
|
||||
async function main(): Promise<void> {
|
||||
const role = process.env.WORKER_ROLE as TaskRole;
|
||||
|
||||
if (!role) {
|
||||
console.error('Error: WORKER_ROLE environment variable is required');
|
||||
console.error('Valid roles: store_discovery, entry_point_discovery, product_discovery, product_resync, analytics_refresh');
|
||||
process.exit(1);
|
||||
}
|
||||
const role = process.env.WORKER_ROLE as TaskRole | undefined;
|
||||
|
||||
const validRoles: TaskRole[] = [
|
||||
'store_discovery',
|
||||
'entry_point_discovery',
|
||||
'product_discovery',
|
||||
'product_resync',
|
||||
'product_refresh',
|
||||
'analytics_refresh',
|
||||
];
|
||||
|
||||
if (!validRoles.includes(role)) {
|
||||
// If role specified, validate it
|
||||
if (role && !validRoles.includes(role)) {
|
||||
console.error(`Error: Invalid WORKER_ROLE: ${role}`);
|
||||
console.error(`Valid roles: ${validRoles.join(', ')}`);
|
||||
console.error('Or omit WORKER_ROLE for role-agnostic worker (any task)');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const workerId = process.env.WORKER_ID;
|
||||
const worker = new TaskWorker(role, workerId);
|
||||
// Pass null for role-agnostic, or the specific role
|
||||
const worker = new TaskWorker(role || null, workerId);
|
||||
|
||||
// Handle graceful shutdown
|
||||
process.on('SIGTERM', () => {
|
||||
|
||||
Reference in New Issue
Block a user