feat: Stealth worker system with mandatory proxy rotation

## Worker System
- Role-agnostic workers that can handle any task type
- Pod-based architecture with StatefulSet (5-15 pods, 5 workers each)
- Custom pod names (Aethelgard, Xylos, Kryll, etc.)
- Worker registry with friendly names and resource monitoring
- Hub-and-spoke visualization on JobQueue page

## Stealth & Anti-Detection (REQUIRED)
- Proxies are MANDATORY - workers fail to start without active proxies
- CrawlRotator initializes on worker startup
- Loads proxies from `proxies` table
- Auto-rotates proxy + fingerprint on 403 errors
- 12 browser fingerprints (Chrome, Firefox, Safari, Edge)
- Locale/timezone matching for geographic consistency

## Task System
- Renamed product_resync → product_refresh
- Task chaining: store_discovery → entry_point → product_discovery
- Priority-based claiming with FOR UPDATE SKIP LOCKED
- Heartbeat and stale task recovery

## UI Updates
- JobQueue: Pod visualization, resource monitoring on hover
- WorkersDashboard: Simplified worker list
- Removed unused filters from task list

## Other
- IP2Location service for visitor analytics
- Findagram consumer features scaffolding
- Documentation updates

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-10 00:44:59 -07:00
parent 0295637ed6
commit 56cc171287
61 changed files with 8591 additions and 2076 deletions

View File

@@ -1,26 +1,58 @@
/**
* Task Worker
*
* A unified worker that processes tasks from the worker_tasks queue.
* Replaces the fragmented job systems (job_schedules, dispensary_crawl_jobs, etc.)
* A unified worker that pulls tasks from the worker_tasks queue.
* Workers register on startup, get a friendly name, and pull tasks.
*
* Architecture:
* - Tasks are generated on schedule (by scheduler or API)
* - Workers PULL tasks from the pool (not assigned to them)
* - Tasks are claimed in order of priority (DESC) then creation time (ASC)
* - Workers report heartbeats to worker_registry
* - Workers are ROLE-AGNOSTIC by default (can handle any task type)
*
* Stealth & Anti-Detection:
* PROXIES ARE REQUIRED - workers will fail to start if no proxies available.
*
* On startup, workers initialize the CrawlRotator which provides:
* - Proxy rotation: Loads proxies from `proxies` table, ALL requests use proxy
* - User-Agent rotation: Cycles through realistic browser fingerprints
* - Fingerprint rotation: Changes browser profile on blocks
* - Locale/timezone: Matches Accept-Language to target state
*
* The CrawlRotator is wired to the Dutchie client via setCrawlRotator().
* Task handlers call startSession() which picks a random fingerprint.
* On 403 errors, the client automatically:
* 1. Records failure on current proxy
* 2. Rotates to next proxy
* 3. Rotates fingerprint
* 4. Retries the request
*
* Usage:
* WORKER_ROLE=product_resync npx tsx src/tasks/task-worker.ts
* npx tsx src/tasks/task-worker.ts # Role-agnostic (any task)
* WORKER_ROLE=product_refresh npx tsx src/tasks/task-worker.ts # Role-specific
*
* Environment:
* WORKER_ROLE - Which task role to process (required)
* WORKER_ID - Optional custom worker ID
* WORKER_ROLE - Which task role to process (optional, null = any task)
* WORKER_ID - Optional custom worker ID (auto-generated if not provided)
* POD_NAME - Kubernetes pod name (optional)
* POLL_INTERVAL_MS - How often to check for tasks (default: 5000)
* HEARTBEAT_INTERVAL_MS - How often to update heartbeat (default: 30000)
* API_BASE_URL - Backend API URL for registration (default: http://localhost:3010)
*/
import { Pool } from 'pg';
import { v4 as uuidv4 } from 'uuid';
import { taskService, TaskRole, WorkerTask } from './task-service';
import { getPool } from '../db/pool';
import os from 'os';
// Stealth/rotation support
import { CrawlRotator } from '../services/crawl-rotator';
import { setCrawlRotator } from '../platforms/dutchie';
// Task handlers by role
import { handleProductResync } from './handlers/product-resync';
import { handleProductRefresh } from './handlers/product-refresh';
import { handleProductDiscovery } from './handlers/product-discovery';
import { handleStoreDiscovery } from './handlers/store-discovery';
import { handleEntryPointDiscovery } from './handlers/entry-point-discovery';
@@ -28,6 +60,7 @@ import { handleAnalyticsRefresh } from './handlers/analytics-refresh';
const POLL_INTERVAL_MS = parseInt(process.env.POLL_INTERVAL_MS || '5000');
const HEARTBEAT_INTERVAL_MS = parseInt(process.env.HEARTBEAT_INTERVAL_MS || '30000');
const API_BASE_URL = process.env.API_BASE_URL || 'http://localhost:3010';
export interface TaskContext {
pool: Pool;
@@ -48,7 +81,7 @@ export interface TaskResult {
type TaskHandler = (ctx: TaskContext) => Promise<TaskResult>;
const TASK_HANDLERS: Record<TaskRole, TaskHandler> = {
product_resync: handleProductResync,
product_refresh: handleProductRefresh,
product_discovery: handleProductDiscovery,
store_discovery: handleStoreDiscovery,
entry_point_discovery: handleEntryPointDiscovery,
@@ -58,15 +91,160 @@ const TASK_HANDLERS: Record<TaskRole, TaskHandler> = {
export class TaskWorker {
private pool: Pool;
private workerId: string;
private role: TaskRole;
private role: TaskRole | null; // null = role-agnostic (any task)
private friendlyName: string = '';
private isRunning: boolean = false;
private heartbeatInterval: NodeJS.Timeout | null = null;
private registryHeartbeatInterval: NodeJS.Timeout | null = null;
private currentTask: WorkerTask | null = null;
private crawlRotator: CrawlRotator;
constructor(role: TaskRole, workerId?: string) {
constructor(role: TaskRole | null = null, workerId?: string) {
this.pool = getPool();
this.role = role;
this.workerId = workerId || `worker-${role}-${uuidv4().slice(0, 8)}`;
this.workerId = workerId || `worker-${uuidv4().slice(0, 8)}`;
this.crawlRotator = new CrawlRotator(this.pool);
}
/**
* Initialize stealth systems (proxy rotation, fingerprints)
* Called once on worker startup before processing any tasks.
*
* IMPORTANT: Proxies are REQUIRED. Workers will fail to start if no proxies available.
*/
private async initializeStealth(): Promise<void> {
// Load proxies from database
await this.crawlRotator.initialize();
const stats = this.crawlRotator.proxy.getStats();
if (stats.activeProxies === 0) {
throw new Error('No active proxies available. Workers MUST use proxies for all requests. Add proxies to the database before starting workers.');
}
console.log(`[TaskWorker] Loaded ${stats.activeProxies} proxies (${stats.avgSuccessRate.toFixed(1)}% avg success rate)`);
// Wire rotator to Dutchie client - proxies will be used for ALL requests
setCrawlRotator(this.crawlRotator);
console.log(`[TaskWorker] Stealth initialized: ${this.crawlRotator.userAgent.getCount()} fingerprints, proxy REQUIRED for all requests`);
}
/**
* Register worker with the registry (get friendly name)
*/
private async register(): Promise<void> {
try {
const response = await fetch(`${API_BASE_URL}/api/worker-registry/register`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
role: this.role,
worker_id: this.workerId,
pod_name: process.env.POD_NAME || process.env.HOSTNAME,
hostname: os.hostname(),
metadata: {
pid: process.pid,
node_version: process.version,
started_at: new Date().toISOString()
}
})
});
const data = await response.json();
if (data.success) {
this.friendlyName = data.friendly_name;
console.log(`[TaskWorker] ${data.message}`);
} else {
console.warn(`[TaskWorker] Registration warning: ${data.error}`);
this.friendlyName = this.workerId.slice(0, 12);
}
} catch (error: any) {
// Registration is optional - worker can still function without it
console.warn(`[TaskWorker] Could not register with API (will continue): ${error.message}`);
this.friendlyName = this.workerId.slice(0, 12);
}
}
/**
* Deregister worker from the registry
*/
private async deregister(): Promise<void> {
try {
await fetch(`${API_BASE_URL}/api/worker-registry/deregister`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ worker_id: this.workerId })
});
console.log(`[TaskWorker] ${this.friendlyName} signed off`);
} catch {
// Ignore deregistration errors
}
}
/**
* Send heartbeat to registry with resource usage
*/
private async sendRegistryHeartbeat(): Promise<void> {
try {
const memUsage = process.memoryUsage();
const cpuUsage = process.cpuUsage();
await fetch(`${API_BASE_URL}/api/worker-registry/heartbeat`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
worker_id: this.workerId,
current_task_id: this.currentTask?.id || null,
status: this.currentTask ? 'active' : 'idle',
resources: {
memory_mb: Math.round(memUsage.heapUsed / 1024 / 1024),
memory_total_mb: Math.round(memUsage.heapTotal / 1024 / 1024),
memory_rss_mb: Math.round(memUsage.rss / 1024 / 1024),
cpu_user_ms: Math.round(cpuUsage.user / 1000),
cpu_system_ms: Math.round(cpuUsage.system / 1000),
}
})
});
} catch {
// Ignore heartbeat errors
}
}
/**
* Report task completion to registry
*/
private async reportTaskCompletion(success: boolean): Promise<void> {
try {
await fetch(`${API_BASE_URL}/api/worker-registry/task-completed`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
worker_id: this.workerId,
success
})
});
} catch {
// Ignore errors
}
}
/**
* Start registry heartbeat interval
*/
private startRegistryHeartbeat(): void {
this.registryHeartbeatInterval = setInterval(async () => {
await this.sendRegistryHeartbeat();
}, HEARTBEAT_INTERVAL_MS);
}
/**
* Stop registry heartbeat interval
*/
private stopRegistryHeartbeat(): void {
if (this.registryHeartbeatInterval) {
clearInterval(this.registryHeartbeatInterval);
this.registryHeartbeatInterval = null;
}
}
/**
@@ -74,7 +252,18 @@ export class TaskWorker {
*/
async start(): Promise<void> {
this.isRunning = true;
console.log(`[TaskWorker] Starting worker ${this.workerId} for role: ${this.role}`);
// Initialize stealth systems (proxy rotation, fingerprints)
await this.initializeStealth();
// Register with the API to get a friendly name
await this.register();
// Start registry heartbeat
this.startRegistryHeartbeat();
const roleMsg = this.role ? `for role: ${this.role}` : '(role-agnostic - any task)';
console.log(`[TaskWorker] ${this.friendlyName} starting ${roleMsg}`);
while (this.isRunning) {
try {
@@ -91,10 +280,12 @@ export class TaskWorker {
/**
* Stop the worker
*/
stop(): void {
async stop(): Promise<void> {
this.isRunning = false;
this.stopHeartbeat();
console.log(`[TaskWorker] Stopping worker ${this.workerId}...`);
this.stopRegistryHeartbeat();
await this.deregister();
console.log(`[TaskWorker] ${this.friendlyName} stopped`);
}
/**
@@ -142,7 +333,8 @@ export class TaskWorker {
if (result.success) {
// Mark as completed
await taskService.completeTask(task.id, result);
console.log(`[TaskWorker] Task ${task.id} completed successfully`);
await this.reportTaskCompletion(true);
console.log(`[TaskWorker] ${this.friendlyName} completed task ${task.id}`);
// Chain next task if applicable
const chainedTask = await taskService.chainNextTask({
@@ -156,12 +348,14 @@ export class TaskWorker {
} else {
// Mark as failed
await taskService.failTask(task.id, result.error || 'Unknown error');
console.log(`[TaskWorker] Task ${task.id} failed: ${result.error}`);
await this.reportTaskCompletion(false);
console.log(`[TaskWorker] ${this.friendlyName} failed task ${task.id}: ${result.error}`);
}
} catch (error: any) {
// Mark as failed
await taskService.failTask(task.id, error.message);
console.error(`[TaskWorker] Task ${task.id} threw error:`, error.message);
await this.reportTaskCompletion(false);
console.error(`[TaskWorker] ${this.friendlyName} task ${task.id} error:`, error.message);
} finally {
this.stopHeartbeat();
this.currentTask = null;
@@ -201,7 +395,7 @@ export class TaskWorker {
/**
* Get worker info
*/
getInfo(): { workerId: string; role: TaskRole; isRunning: boolean; currentTaskId: number | null } {
getInfo(): { workerId: string; role: TaskRole | null; isRunning: boolean; currentTaskId: number | null } {
return {
workerId: this.workerId,
role: this.role,
@@ -216,30 +410,27 @@ export class TaskWorker {
// ============================================================
async function main(): Promise<void> {
const role = process.env.WORKER_ROLE as TaskRole;
if (!role) {
console.error('Error: WORKER_ROLE environment variable is required');
console.error('Valid roles: store_discovery, entry_point_discovery, product_discovery, product_resync, analytics_refresh');
process.exit(1);
}
const role = process.env.WORKER_ROLE as TaskRole | undefined;
const validRoles: TaskRole[] = [
'store_discovery',
'entry_point_discovery',
'product_discovery',
'product_resync',
'product_refresh',
'analytics_refresh',
];
if (!validRoles.includes(role)) {
// If role specified, validate it
if (role && !validRoles.includes(role)) {
console.error(`Error: Invalid WORKER_ROLE: ${role}`);
console.error(`Valid roles: ${validRoles.join(', ')}`);
console.error('Or omit WORKER_ROLE for role-agnostic worker (any task)');
process.exit(1);
}
const workerId = process.env.WORKER_ID;
const worker = new TaskWorker(role, workerId);
// Pass null for role-agnostic, or the specific role
const worker = new TaskWorker(role || null, workerId);
// Handle graceful shutdown
process.on('SIGTERM', () => {