/** * Task Worker * * A unified worker that pulls tasks from the worker_tasks queue. * Workers register on startup, get a friendly name, and pull tasks. * * Architecture: * - Tasks are generated on schedule (by scheduler or API) * - Workers PULL tasks from the pool (not assigned to them) * - Tasks are claimed in order of priority (DESC) then creation time (ASC) * - Workers report heartbeats to worker_registry * - Workers are ROLE-AGNOSTIC by default (can handle any task type) * * Stealth & Anti-Detection (LAZY INITIALIZATION): * Workers start IMMEDIATELY without waiting for proxies. * Stealth systems (proxies, fingerprints, preflights) are initialized * on first task claim, not at worker startup. * * This allows workers to: * - Register and send heartbeats immediately * - Wait in main loop without blocking on proxy availability * - Initialize proxies/preflights only when tasks are actually available * * On first task claim attempt, workers initialize the CrawlRotator which provides: * - Proxy rotation: Loads proxies from `proxies` table, ALL requests use proxy * - User-Agent rotation: Cycles through realistic browser fingerprints * - Fingerprint rotation: Changes browser profile on blocks * - Locale/timezone: Matches Accept-Language to target state * * The CrawlRotator is wired to the Dutchie client via setCrawlRotator(). * Task handlers call startSession() which picks a random fingerprint. * On 403 errors, the client automatically: * 1. Records failure on current proxy * 2. Rotates to next proxy * 3. Rotates fingerprint * 4. 
Retries the request * * Usage: * npx tsx src/tasks/task-worker.ts # Role-agnostic (any task) * WORKER_ROLE=product_refresh npx tsx src/tasks/task-worker.ts # Role-specific * * Environment: * WORKER_ROLE - Which task role to process (optional, null = any task) * POD_NAME - K8s StatefulSet pod name (PRIMARY - use this for persistent identity) * WORKER_ID - Custom worker ID (fallback if POD_NAME not set) * POLL_INTERVAL_MS - How often to check for tasks (default: 5000) * HEARTBEAT_INTERVAL_MS - How often to update heartbeat (default: 30000) * API_BASE_URL - Backend API URL for registration (default: http://localhost:3010) * * Worker Identity: * Workers use POD_NAME as their worker_id for persistent identity across restarts. * In K8s StatefulSet, POD_NAME = "scraper-worker-0" through "scraper-worker-7". * This ensures workers re-register with the same ID instead of creating new entries. */ import { Pool } from 'pg'; import { v4 as uuidv4 } from 'uuid'; import { taskService, TaskRole, WorkerTask } from './task-service'; import { getPool } from '../db/pool'; import os from 'os'; // Stealth/rotation support import { CrawlRotator } from '../services/crawl-rotator'; import { setCrawlRotator } from '../platforms/dutchie'; // Dual-transport preflight system import { runCurlPreflight, CurlPreflightResult } from '../services/curl-preflight'; import { runPuppeteerPreflightWithRetry, PuppeteerPreflightResult } from '../services/puppeteer-preflight'; // Task handlers by role // Platform-based handlers: {task}-{platform}.ts convention import { handleProductRefresh } from './handlers/product-refresh'; import { handleStoreDiscoveryState } from './handlers/store-discovery-state'; import { handleEntryPointDiscovery } from './handlers/entry-point-discovery'; import { handleAnalyticsRefresh } from './handlers/analytics-refresh'; import { handleWhoami } from './handlers/whoami'; // Dutchie Platform Handlers import { handleProductDiscoveryDutchie } from 
'./handlers/product-discovery-dutchie'; import { handleStoreDiscoveryDutchie } from './handlers/store-discovery-dutchie'; // Jane Platform Handlers import { handleStoreDiscoveryJane } from './handlers/store-discovery-jane'; import { handleEntryPointDiscoveryJane } from './handlers/entry-point-discovery-jane'; import { handleProductDiscoveryJane } from './handlers/product-discovery-jane'; const POLL_INTERVAL_MS = parseInt(process.env.POLL_INTERVAL_MS || '5000'); const HEARTBEAT_INTERVAL_MS = parseInt(process.env.HEARTBEAT_INTERVAL_MS || '30000'); const API_BASE_URL = process.env.API_BASE_URL || 'http://localhost:3010'; // ============================================================================= // CONCURRENT TASK PROCESSING SETTINGS // ============================================================================= // Workers can process multiple tasks simultaneously using async I/O. // This improves throughput for I/O-bound tasks (network calls, DB queries). // // Resource thresholds trigger "backoff" - the worker stops claiming new tasks // but continues processing existing ones until resources return to normal. // // See: docs/WORKER_TASK_ARCHITECTURE.md#concurrent-task-processing // ============================================================================= // Maximum number of tasks this worker will run concurrently // Browser tasks (Puppeteer) use ~400MB RAM each. 
// With 2GB pod limit:
//   - 3 browsers = ~1.3GB = SAFE
//   - 4 browsers = ~1.7GB = RISKY
//   - 5+ browsers = OOM CRASH
// See: docs/WORKER_TASK_ARCHITECTURE.md#browser-task-memory-limits
const MAX_CONCURRENT_TASKS = parseInt(process.env.MAX_CONCURRENT_TASKS || '3', 10);

// When heap memory usage exceeds this threshold (as decimal 0.0-1.0), stop claiming new tasks
// Default 85% - gives headroom before OOM
const MEMORY_BACKOFF_THRESHOLD = parseFloat(process.env.MEMORY_BACKOFF_THRESHOLD || '0.85');

/**
 * Parse the max heap size (in MB) from NODE_OPTIONS (e.g. --max-old-space-size=1500).
 *
 * The result is used as the denominator for the memory percentage calculation.
 * V8's heapTotal is dynamic and stays small when idle, causing false high percentages.
 *
 * Node accepts option words separated by dashes or underscores, so both
 * `--max-old-space-size=N` and `--max_old_space_size=N` are recognised.
 *
 * @returns The configured heap ceiling in MB, or 512 when the flag is absent
 *          or not a positive number (a zero ceiling would make every
 *          percentage computed from it infinite).
 */
function getMaxHeapSizeMb(): number {
  const FALLBACK_HEAP_MB = 512;
  const nodeOptions = process.env.NODE_OPTIONS || '';
  const match = nodeOptions.match(/--max[-_]old[-_]space[-_]size=(\d+)/);
  if (!match) {
    // Fallback: use 512MB if not specified
    return FALLBACK_HEAP_MB;
  }
  const parsed = parseInt(match[1], 10);
  return parsed > 0 ? parsed : FALLBACK_HEAP_MB;
}
const MAX_HEAP_SIZE_MB = getMaxHeapSizeMb();

// When CPU usage exceeds this threshold (as decimal 0.0-1.0), stop claiming new tasks
// Default 90% - allows some burst capacity
const CPU_BACKOFF_THRESHOLD = parseFloat(process.env.CPU_BACKOFF_THRESHOLD || '0.90');

// How long to wait (ms) when in backoff state before rechecking resources
const BACKOFF_DURATION_MS = parseInt(process.env.BACKOFF_DURATION_MS || '10000', 10);

/** Everything a task handler receives when it is invoked by the worker. */
export interface TaskContext {
  pool: Pool;
  workerId: string;
  task: WorkerTask;
  heartbeat: () => Promise<void>;
  crawlRotator?: CrawlRotator;
  /** Update the current step being executed (shown in dashboard) */
  updateStep: (step: string, detail?: string) => void;
}

/** Outcome reported by a task handler; handlers may attach extra keys. */
export interface TaskResult {
  success: boolean;
  productsProcessed?: number;
  snapshotsCreated?: number;
  storesDiscovered?: number;
  error?: string;
  [key: string]: unknown;
}

type TaskHandler = (ctx: TaskContext) => Promise<TaskResult>;

// Per TASK_WORKFLOW_2024-12-10.md: Handler registry
// Platform-agnostic handlers (shared across Dutchie and Jane)
//
// product_refresh: Reads local payload, uses platform-aware normalizer, upserts to DB
const SHARED_HANDLERS: Partial<Record<TaskRole, TaskHandler>> = {
  product_refresh: handleProductRefresh,
  store_discovery_state: handleStoreDiscoveryState,
  entry_point_discovery: handleEntryPointDiscovery,
  analytics_refresh: handleAnalyticsRefresh,
  whoami: handleWhoami,
};

/**
 * Get the appropriate handler for a task based on platform.
 *
 * Naming convention: {task}-{platform}.ts
 * - product-discovery-dutchie.ts
 * - product-discovery-jane.ts
 * - store-discovery-dutchie.ts
 * - store-discovery-jane.ts
 *
 * All handlers use HTTP/Puppeteer transport (curl transport is deprecated).
 *
 * Resolution order: Jane-specific handlers (only when task.platform is 'jane'),
 * then Dutchie handlers, then the platform-agnostic SHARED_HANDLERS table.
 * A missing platform is treated as 'dutchie'.
 *
 * @param task - The claimed task; only `role` and `platform` are read.
 * @returns The handler to run, or undefined when no handler is registered for the role.
 */
function getHandlerForTask(task: WorkerTask): TaskHandler | undefined {
  const role = task.role as TaskRole;
  const platform = task.platform || 'dutchie';

  // ==========================================================================
  // JANE PLATFORM ROUTING
  // ==========================================================================
  if (platform === 'jane') {
    // Both store discovery roles map to the single Jane store discovery handler.
    if (role === 'store_discovery' || role === 'store_discovery_state') {
      console.log(`[TaskWorker] Using Jane handler for store_discovery`);
      return handleStoreDiscoveryJane;
    }
    if (role === 'entry_point_discovery') {
      console.log(`[TaskWorker] Using Jane handler for entry_point_discovery`);
      return handleEntryPointDiscoveryJane;
    }
    if (role === 'product_discovery') {
      console.log(`[TaskWorker] Using Jane handler for product_discovery`);
      return handleProductDiscoveryJane;
    }
    // Any other Jane role falls through to the Dutchie/shared routing below.
  }

  // ==========================================================================
  // DUTCHIE PLATFORM ROUTING (default)
  // ==========================================================================
  if (role === 'product_discovery') {
    console.log(`[TaskWorker] Using Dutchie handler for product_discovery`);
    return handleProductDiscoveryDutchie;
  }
  if (role === 'store_discovery') {
    console.log(`[TaskWorker] Using Dutchie handler for store_discovery`);
    return handleStoreDiscoveryDutchie;
  }

  // ==========================================================================
  // SHARED HANDLERS (platform-agnostic)
  // ==========================================================================
  return SHARED_HANDLERS[role];
}

/**
 * Resource usage stats reported to the registry and used for backoff decisions.
 * These values are included in worker heartbeats and displayed in the UI.
 */
interface ResourceStats {
  /** Current heap memory usage as decimal (0.0 to 1.0) */
  memoryPercent: number;
  /** Current heap used in MB */
  memoryMb: number;
  /** Total heap available in MB */
  memoryTotalMb: number;
  /** CPU usage percentage since last check (0 to 100) */
  cpuPercent: number;
  /** True if worker is currently in backoff state */
  isBackingOff: boolean;
  /** Reason for backoff (e.g., "Memory at 87.3% (threshold: 85%)") */
  backoffReason: string | null;
}

export class TaskWorker {
  private pool: Pool;
  private workerId: string;
  private role: TaskRole | null; // null = role-agnostic (any task)
  private friendlyName: string = '';
  private isRunning: boolean = false;
  private heartbeatInterval: NodeJS.Timeout | null = null;
  private registryHeartbeatInterval: NodeJS.Timeout | null = null;
  private staleCleanupInterval: NodeJS.Timeout | null = null;
  private crawlRotator: CrawlRotator;

  // ==========================================================================
  // CONCURRENT TASK TRACKING
  // ==========================================================================
  // activeTasks: Map of task ID -> task object for all currently running tasks
  // taskPromises: Map of task ID -> Promise for cleanup when task completes
  // maxConcurrentTasks: How many tasks this worker will run in parallel
  // ==========================================================================
  private activeTasks: Map<number, WorkerTask> = new Map();
  // NOTE(review): the promise payload type was lost in this copy of the file;
  // Promise<void> is assumed - confirm against where taskPromises is populated.
  private taskPromises: Map<number, Promise<void>> = new Map();
  private maxConcurrentTasks: number = MAX_CONCURRENT_TASKS;

  // ==========================================================================
  // RESOURCE MONITORING
// FOR BACKOFF
  // ==========================================================================
  // CPU tracking uses differential measurement - we track last values and
  // calculate percentage based on elapsed time since last check.
  // ==========================================================================
  private lastCpuUsage: { user: number; system: number } = { user: 0, system: 0 };
  private lastCpuCheck: number = Date.now();
  private isBackingOff: boolean = false;
  private backoffReason: string | null = null;

  // ==========================================================================
  // DUAL-TRANSPORT PREFLIGHT STATUS
  // ==========================================================================
  // Workers run BOTH preflights during lazy initialization (first task claim),
  // not at process startup:
  // - curl: axios/proxy transport - fast, for simple API calls
  // - http: Puppeteer/browser transport - anti-detect, for Dutchie GraphQL
  //
  // Task claiming checks method compatibility - worker must have passed
  // the preflight for the task's required method.
  // ==========================================================================
  private preflightCurlPassed: boolean = false;
  private preflightHttpPassed: boolean = false;
  private preflightCurlResult: CurlPreflightResult | null = null;
  private preflightHttpResult: PuppeteerPreflightResult | null = null;

  // ==========================================================================
  // LAZY INITIALIZATION FLAGS
  // ==========================================================================
  // Stealth/proxy initialization is deferred until first task claim.
  // Workers register immediately and enter main loop without blocking.
  // ==========================================================================
  private stealthInitialized: boolean = false;
  private preflightsCompleted: boolean = false;
  // Non-null only while an initialization run is in flight, so concurrent
  // callers can await the same run instead of starting a second one.
  private initializingPromise: Promise<void> | null = null;

  // ==========================================================================
  // PREFLIGHT RETRY SETTINGS
  // ==========================================================================
  // If preflight fails, worker retries every PREFLIGHT_RETRY_INTERVAL_MS
  // Worker is BLOCKED from claiming ANY tasks until preflight passes.
  // This ensures unqualified workers never touch the task pool.
  // ==========================================================================
  private static readonly PREFLIGHT_RETRY_INTERVAL_MS = 30000; // 30 seconds
  private isRetryingPreflight: boolean = false;

  // ==========================================================================
  // STEP TRACKING FOR DASHBOARD VISIBILITY
  // ==========================================================================
  // Workers report their current step in heartbeats so the dashboard can show
  // real-time progress like "preflight", "loading page", "processing products"
  // ==========================================================================
  private currentStep: string = 'idle';
  private currentStepDetail: string | null = null;
  private currentStepStartedAt: Date | null = null;
  /** Map of task ID -> step info for concurrent tasks */
  private taskSteps: Map<number, { step: string; detail: string | null; startedAt: Date }> = new Map();

  /**
   * Create a worker bound to the shared connection pool.
   *
   * @param role - Task role to process; null (default) means role-agnostic.
   * @param workerId - Persistent worker ID; when omitted a random
   *                   `worker-xxxxxxxx` ID is generated.
   */
  constructor(role: TaskRole | null = null, workerId?: string) {
    this.pool = getPool();
    this.role = role;
    this.workerId = workerId || `worker-${uuidv4().slice(0, 8)}`;
    this.crawlRotator = new CrawlRotator(this.pool);

    // Initialize CPU tracking so the first differential reading has a baseline
    const cpuUsage = process.cpuUsage();
    this.lastCpuUsage = { user: cpuUsage.user, system: cpuUsage.system };
    this.lastCpuCheck = Date.now();
  }

  /**
   * Get current resource usage
   * Memory percentage is calculated against MAX_HEAP_SIZE_MB (from --max-old-space-size)
   * NOT against V8's dynamic
* heapTotal which stays small when idle
   */
  private getResourceStats(): ResourceStats {
    // Heap usage is measured against the configured ceiling (MAX_HEAP_SIZE_MB),
    // not V8's dynamic heapTotal, which stays small when idle (e.g., 36MB)
    // and would produce false 95%+ readings.
    const { heapUsed } = process.memoryUsage();
    const usedMb = heapUsed / 1024 / 1024;

    // Differential CPU sample: CPU time consumed since the previous call,
    // divided by the wall-clock time elapsed since the previous call.
    const sample = process.cpuUsage();
    const sampledAt = Date.now();
    const windowMs = sampledAt - this.lastCpuCheck;
    const userMs = (sample.user - this.lastCpuUsage.user) / 1000; // microseconds to ms
    const systemMs = (sample.system - this.lastCpuUsage.system) / 1000;
    const rawCpuPercent = windowMs > 0 ? ((userMs + systemMs) / windowMs) * 100 : 0;

    // This call becomes the baseline for the next one
    this.lastCpuUsage = { user: sample.user, system: sample.system };
    this.lastCpuCheck = sampledAt;

    return {
      memoryPercent: usedMb / MAX_HEAP_SIZE_MB,
      memoryMb: Math.round(usedMb),
      memoryTotalMb: MAX_HEAP_SIZE_MB, // max-old-space-size, not dynamic heapTotal
      cpuPercent: Math.min(100, rawCpuPercent), // Cap at 100%
      isBackingOff: this.isBackingOff,
      backoffReason: this.backoffReason,
    };
  }

  /**
   * Decide whether the worker should stop claiming new tasks.
   * Memory is checked first, then CPU; the first exceeded threshold wins.
   */
  private shouldBackOff(): { backoff: boolean; reason: string | null } {
    const { memoryPercent, cpuPercent } = this.getResourceStats();

    if (memoryPercent > MEMORY_BACKOFF_THRESHOLD) {
      const reason = `Memory at ${(memoryPercent * 100).toFixed(1)}% (threshold: ${MEMORY_BACKOFF_THRESHOLD * 100}%)`;
      return { backoff: true, reason };
    }

    if (cpuPercent > CPU_BACKOFF_THRESHOLD * 100) {
      const reason = `CPU at ${cpuPercent.toFixed(1)}% (threshold: ${CPU_BACKOFF_THRESHOLD * 100}%)`;
      return { backoff: true, reason };
    }

    return { backoff: false, reason: null };
  }

  /**
   * Number of tasks this worker is running right now
   */
  get activeTaskCount(): number {
    return this.activeTasks.size;
  }

  /**
   * True while the worker is below its concurrent task limit
   */
  private canAcceptMoreTasks(): boolean {
    const running = this.activeTasks.size;
    return running < this.maxConcurrentTasks;
  }

  // ==========================================================================
  // STEP TRACKING METHODS
  // ==========================================================================

  /**
   * Record the step a task is currently executing (for dashboard visibility)
   * @param taskId - The task ID to update
   * @param step - Short step name (e.g., "preflight", "loading", "processing")
   * @param detail - Optional detail (e.g., "Verifying IP 1.2.3.4")
   */
  public updateTaskStep(taskId: number, step: string, detail?: string): void {
    const detailOrNull = detail || null;
    this.taskSteps.set(taskId, { step, detail: detailOrNull, startedAt: new Date() });

    // Mirror into the single "primary" step fields for single-task backwards
    // compat: when exactly one task is active, or this is the first active task.
    const firstActiveId = this.activeTasks.keys().next().value;
    const isPrimary = this.activeTasks.size === 1 || taskId === firstActiveId;
    if (isPrimary) {
      this.currentStep = step;
      this.currentStepDetail = detailOrNull;
      this.currentStepStartedAt = new Date();
    }

    const detailSuffix = detail ? ` - ${detail}` : '';
    console.log(`[TaskWorker] Step: ${step}${detailSuffix} (task #${taskId})`);
  }

  /**
   * Drop step tracking for a task (when task completes)
   */
  private clearTaskStep(taskId: number): void {
    this.taskSteps.delete(taskId);

    if (this.activeTasks.size !== 0) {
      return; // other tasks still running - keep the primary step as is
    }

    // No more active tasks: reset the primary step
    this.currentStep = 'idle';
    this.currentStepDetail = null;
    this.currentStepStartedAt = null;
  }

  /**
   * Snapshot of the step every tracked task is on (for heartbeat),
   * with the time spent in that step so far.
   */
  private getTaskStepsInfo(): Array<{
    task_id: number;
    step: string;
    detail: string | null;
    elapsed_ms: number;
  }> {
    const snapshotAt = Date.now();
    const steps: Array<{ task_id: number; step: string; detail: string | null; elapsed_ms: number }> = [];
    for (const [trackedId, entry] of this.taskSteps.entries()) {
      steps.push({
        task_id: trackedId,
        step: entry.step,
        detail: entry.detail,
        elapsed_ms: snapshotAt - entry.startedAt.getTime(),
      });
    }
    return steps;
  }

  /**
   * Initialize stealth systems (proxy rotation, fingerprints)
   * Called LAZILY on first task claim attempt (NOT at worker startup).
   *
   * IMPORTANT: Proxies are REQUIRED to claim tasks. This method waits until proxies are available.
* Workers listen for PostgreSQL NOTIFY 'proxy_added' to wake up immediately when proxies are added.
   *
   * The dedicated LISTEN connection is always handed back to the pool on exit:
   * the notification listener is detached first, and if UNLISTEN fails the
   * connection is destroyed instead of being reused while still subscribed.
   *
   * @throws Error when no active proxy becomes available within MAX_WAIT_MINUTES.
   */
  private async initializeStealth(): Promise<void> {
    const MAX_WAIT_MINUTES = 60;
    const POLL_INTERVAL_MS = 30000; // 30 seconds fallback polling
    const maxAttempts = (MAX_WAIT_MINUTES * 60 * 1000) / POLL_INTERVAL_MS;
    let attempts = 0;
    let notifyClient: any = null;

    // Set by the wait below; invoking it ends the current wait early
    let wakeOnNotify: (() => void) | null = null;
    const onNotification = (msg: any): void => {
      if (msg.channel === 'proxy_added') {
        console.log(`[TaskWorker] Received proxy_added notification!`);
        if (wakeOnNotify) wakeOnNotify();
      }
    };

    // Set up PostgreSQL LISTEN for proxy notifications
    try {
      notifyClient = await this.pool.connect();
      await notifyClient.query('LISTEN proxy_added');
      console.log(`[TaskWorker] Listening for proxy_added notifications...`);
    } catch (err: any) {
      console.log(`[TaskWorker] Could not set up LISTEN (will poll): ${err.message}`);
    }

    if (notifyClient) {
      notifyClient.on('notification', onNotification);
    }

    try {
      while (attempts < maxAttempts) {
        try {
          // Load proxies from database
          await this.crawlRotator.initialize();

          const stats = this.crawlRotator.proxy.getStats();
          if (stats.activeProxies > 0) {
            console.log(`[TaskWorker] Loaded ${stats.activeProxies} proxies (${stats.avgSuccessRate.toFixed(1)}% avg success rate)`);

            // Wire rotator to Dutchie client - proxies will be used for ALL requests
            setCrawlRotator(this.crawlRotator);

            console.log(`[TaskWorker] Stealth initialized: ${this.crawlRotator.userAgent.getCount()} fingerprints, proxy REQUIRED for all requests`);
            return;
          }

          attempts++;
          console.log(`[TaskWorker] No active proxies available (attempt ${attempts}). Waiting for proxies...`);

          // Wait for either notification or timeout. Whichever fires first
          // cancels the timer and unregisters the wake-up callback so nothing
          // lingers after the wait is over.
          await new Promise<void>((resolve) => {
            const finish = (): void => {
              clearTimeout(timer);
              wakeOnNotify = null;
              resolve();
            };
            const timer = setTimeout(finish, POLL_INTERVAL_MS);
            wakeOnNotify = finish;
          });
        } catch (error: any) {
          attempts++;
          console.log(`[TaskWorker] Error loading proxies (attempt ${attempts}): ${error.message}. Retrying...`);
          await this.sleep(POLL_INTERVAL_MS);
        }
      }

      throw new Error(`No active proxies available after waiting ${MAX_WAIT_MINUTES} minutes. Add proxies to the database.`);
    } finally {
      // Clean up LISTEN connection
      if (notifyClient) {
        // The client goes back to a shared pool - do not leave our listener on it
        notifyClient.removeListener('notification', onNotification);

        let unlistenFailed = false;
        try {
          await notifyClient.query('UNLISTEN proxy_added');
        } catch {
          unlistenFailed = true;
        }

        try {
          // A truthy argument tells the pool to destroy the connection
          // rather than reuse one that may still be subscribed.
          notifyClient.release(unlistenFailed);
        } catch {
          // Ignore cleanup errors
        }
      }
    }
  }

  /**
   * Run dual-transport preflights (part of lazy initialization).
   * Tests both curl (axios/proxy) and http (Puppeteer/browser) transport methods.
   * Results are reported to worker_registry and used for task claiming.
   *
   * A preflight that throws is converted into a failed result, so this method
   * records a result for both transports.
   *
   * NOTE: All current tasks require 'http' method, so http preflight must pass
   * for the worker to claim any tasks. Curl preflight is for future use.
   */
  private async runDualPreflights(): Promise<void> {
    console.log(`[TaskWorker] Running dual-transport preflights...`);

    // Run both preflights in parallel for efficiency
    const [curlResult, httpResult] = await Promise.all([
      runCurlPreflight(this.crawlRotator).catch((err): CurlPreflightResult => ({
        method: 'curl',
        passed: false,
        proxyAvailable: false,
        proxyConnected: false,
        antidetectReady: false,
        proxyIp: null,
        fingerprint: null,
        error: `Preflight error: ${err.message}`,
        responseTimeMs: null,
      })),
      runPuppeteerPreflightWithRetry(this.crawlRotator, 1).catch((err): PuppeteerPreflightResult => ({
        method: 'http',
        passed: false,
        proxyAvailable: false,
        proxyConnected: false,
        antidetectReady: false,
        proxyIp: null,
        fingerprint: null,
        error: `Preflight error: ${err.message}`,
        responseTimeMs: null,
        productsReturned: 0,
      })),
    ]);

    // Store results
    this.preflightCurlResult = curlResult;
    this.preflightHttpResult = httpResult;
    this.preflightCurlPassed = curlResult.passed;
    this.preflightHttpPassed = httpResult.passed;

    // Log results
    console.log(`[TaskWorker] CURL preflight: ${curlResult.passed ? 'PASSED' : 'FAILED'}${curlResult.error ? ` - ${curlResult.error}` : ''}`);
    console.log(`[TaskWorker] HTTP preflight: ${httpResult.passed ? 'PASSED' : 'FAILED'}${httpResult.error ? ` - ${httpResult.error}` : ''}`);

    if (httpResult.passed && httpResult.productsReturned) {
      console.log(`[TaskWorker] HTTP preflight returned ${httpResult.productsReturned} products from test store`);
    }

    // Report to worker_registry (written directly via SQL)
    await this.reportPreflightStatus();

    // Since all tasks require 'http', warn if http preflight failed
    if (!this.preflightHttpPassed) {
      console.warn(`[TaskWorker] WARNING: HTTP preflight failed - this worker cannot claim any tasks!`);
      console.warn(`[TaskWorker] Error: ${httpResult.error}`);
    }
  }

  /**
   * Report preflight status to worker_registry
   * Function signature: update_worker_preflight(worker_id, transport, status, ip, response_ms, error, fingerprint)
   *
   * Failures are logged and swallowed: reporting is non-fatal for the worker.
   */
  private async reportPreflightStatus(): Promise<void> {
    try {
      // Update worker_registry directly via SQL (more reliable than API)
      // CURL preflight - includes IP address
      await this.pool.query(`
        SELECT update_worker_preflight($1, 'curl', $2, $3, $4, $5, $6)
      `, [
        this.workerId,
        this.preflightCurlPassed ? 'passed' : 'failed',
        this.preflightCurlResult?.proxyIp || null,
        this.preflightCurlResult?.responseTimeMs || null,
        this.preflightCurlResult?.error || null,
        null, // No fingerprint for curl
      ]);

      // HTTP preflight - includes IP, fingerprint, and timezone data
      const httpFingerprint = this.preflightHttpResult ? {
        ...this.preflightHttpResult.fingerprint,
        detectedTimezone: (this.preflightHttpResult as any).detectedTimezone,
        detectedLocation: (this.preflightHttpResult as any).detectedLocation,
        productsReturned: this.preflightHttpResult.productsReturned,
        botDetection: (this.preflightHttpResult as any).botDetection,
      } : null;

      await this.pool.query(`
        SELECT update_worker_preflight($1, 'http', $2, $3, $4, $5, $6)
      `, [
        this.workerId,
        this.preflightHttpPassed ? 'passed' : 'failed',
        this.preflightHttpResult?.proxyIp || null,
        this.preflightHttpResult?.responseTimeMs || null,
        this.preflightHttpResult?.error || null,
        httpFingerprint ? JSON.stringify(httpFingerprint) : null,
      ]);

      console.log(`[TaskWorker] Preflight status reported to worker_registry`);
      if (this.preflightHttpResult?.proxyIp) {
        console.log(`[TaskWorker] HTTP IP: ${this.preflightHttpResult.proxyIp}, Timezone: ${(this.preflightHttpResult as any).detectedTimezone || 'unknown'}`);
      }
    } catch (err: any) {
      // Non-fatal - worker can still function
      console.warn(`[TaskWorker] Could not report preflight status: ${err.message}`);
    }
  }

  /**
   * Retry preflight until it passes.
   * Worker is BLOCKED from claiming ANY tasks until HTTP preflight passes.
   * This ensures unqualified workers never touch the task pool.
   *
   * All current tasks require 'http' method, so HTTP preflight is mandatory.
   *
   * Returns immediately when the preflight already passed or another retry
   * loop is in progress. The loop also ends when the worker is stopped.
   */
  private async retryPreflightUntilPass(): Promise<void> {
    if (this.preflightHttpPassed) {
      return; // Already passed
    }

    if (this.isRetryingPreflight) {
      return; // Already retrying
    }

    this.isRetryingPreflight = true;
    let retryCount = 0;

    try {
      console.log(`[TaskWorker] ${this.friendlyName} HTTP preflight FAILED - entering retry loop (every ${TaskWorker.PREFLIGHT_RETRY_INTERVAL_MS / 1000}s)`);
      console.log(`[TaskWorker] ${this.friendlyName} BLOCKED from task pool until preflight passes`);

      while (!this.preflightHttpPassed && this.isRunning) {
        retryCount++;

        // Wait before retry
        await this.sleep(TaskWorker.PREFLIGHT_RETRY_INTERVAL_MS);

        if (!this.isRunning) {
          break; // Worker stopping
        }

        console.log(`[TaskWorker] ${this.friendlyName} preflight retry #${retryCount}...`);

        // Reload proxies before retry (might have new ones)
        try {
          await this.crawlRotator.initialize();
          const stats = this.crawlRotator.proxy.getStats();
          console.log(`[TaskWorker] Proxies available: ${stats.activeProxies}`);
        } catch (err: any) {
          console.warn(`[TaskWorker] Proxy reload failed: ${err.message}`);
        }

        // Re-run HTTP preflight
        try {
          const httpResult = await runPuppeteerPreflightWithRetry(this.crawlRotator, 1);
          this.preflightHttpResult = httpResult;
          this.preflightHttpPassed = httpResult.passed;

          if (httpResult.passed) {
            console.log(`[TaskWorker] ${this.friendlyName} HTTP preflight PASSED on retry #${retryCount}!`);
            console.log(`[TaskWorker] ${this.friendlyName} IP: ${httpResult.proxyIp}, Products: ${httpResult.productsReturned}`);
            console.log(`[TaskWorker] ${this.friendlyName} now QUALIFIED to claim tasks`);

            // Report updated status
            await this.reportPreflightStatus();
            break;
          } else {
            console.log(`[TaskWorker] ${this.friendlyName} HTTP preflight still FAILED: ${httpResult.error}`);
            console.log(`[TaskWorker] ${this.friendlyName} will retry in ${TaskWorker.PREFLIGHT_RETRY_INTERVAL_MS / 1000}s...`);
          }
        } catch (err: any) {
          console.error(`[TaskWorker] ${this.friendlyName} preflight retry error: ${err.message}`);
        }
      }
    } finally {
      // Always clear the guard; a stuck flag would turn every later call
      // into a no-op and leave the worker permanently blocked.
      this.isRetryingPreflight = false;
    }
  }

  /**
   * Get the effective max concurrent tasks based on working hours.
   * Uses the worker's timezone (from preflight IP geolocation) to determine
   * the current hour's weight, then scales max concurrent tasks accordingly.
   *
   * This creates natural traffic patterns - workers run fewer concurrent
   * tasks during off-peak hours, full capacity during peak hours.
*
   * Example with MAX_CONCURRENT_TASKS = 3:
   *   3 AM (5% weight):    effectiveMax = max(1, round(3 × 0.05)) = 1
   *   12 PM (75% weight):  effectiveMax = max(1, round(3 × 0.75)) = 2
   *   6 PM (100% weight):  effectiveMax = max(1, round(3 × 1.00)) = 3
   *
   * A weight of 0 is honoured (it scales down to the 1-slot minimum). Only a
   * missing or non-numeric weight falls back to 100%.
   *
   * @returns The slot limit to apply now, the weight it was derived from, and
   *          a human-readable explanation. On query failure or an empty result
   *          the worker's full capacity is returned.
   */
  private async getEffectiveMaxTasks(): Promise<{ effectiveMax: number; hourWeight: number; reason: string }> {
    try {
      const result = await this.pool.query(`
        SELECT current_hour, hour_weight, worker_timezone
        FROM check_working_hours($1, 'natural_traffic')
      `, [this.workerId]);

      if (result.rows.length === 0) {
        // Function returned nothing - default to full capacity
        return {
          effectiveMax: this.maxConcurrentTasks,
          hourWeight: 100,
          reason: 'Working hours check returned no result - using full capacity'
        };
      }

      const row = result.rows[0];

      // Use a null check rather than `||` so a weight of 0 is not mistaken
      // for "missing" and promoted to full capacity. Number() also accepts
      // the string form a NUMERIC column would be returned as.
      const parsedWeight = row.hour_weight == null ? NaN : Number(row.hour_weight);
      const hourWeight = Number.isFinite(parsedWeight) ? parsedWeight : 100;

      // Scale max concurrent tasks by hour weight, minimum 1
      const effectiveMax = Math.max(1, Math.round(this.maxConcurrentTasks * hourWeight / 100));

      return {
        effectiveMax,
        hourWeight,
        reason: `Hour ${row.current_hour} (${row.worker_timezone}): ${hourWeight}% → ${effectiveMax}/${this.maxConcurrentTasks} slots`
      };
    } catch (err: any) {
      // On error, default to full capacity (don't block workers due to DB issues)
      console.warn(`[TaskWorker] Working hours check failed: ${err.message} - using full capacity`);
      return {
        effectiveMax: this.maxConcurrentTasks,
        hourWeight: 100,
        reason: 'Working hours check error - using full capacity'
      };
    }
  }

  /**
   * Lazy initialization of stealth systems.
   * Called BEFORE claiming first task (not at worker startup).
   * This allows workers to register and enter main loop immediately.
   *
   * Returns true if initialization succeeded, false otherwise.
*/
  private async ensureStealthInitialized(): Promise<boolean> {
    // Fast path: rotation is wired up and preflights have already run.
    if (this.stealthInitialized && this.preflightsCompleted) {
      return true;
    }

    // Another caller is already initializing - wait for it and share its outcome
    // instead of starting a second, concurrent initialization.
    if (this.initializingPromise) {
      await this.initializingPromise;
      return this.stealthInitialized && this.preflightsCompleted;
    }

    console.log(`[TaskWorker] ${this.friendlyName} lazy-initializing stealth systems (first task claim)...`);

    this.initializingPromise = (async () => {
      try {
        // Initialize proxy/fingerprint rotation
        await this.initializeStealth();
        this.stealthInitialized = true;

        // Run dual-transport preflights
        await this.runDualPreflights();
        this.preflightsCompleted = true;

        const preflightMsg = `curl=${this.preflightCurlPassed ? '✓' : '✗'} http=${this.preflightHttpPassed ? '✓' : '✗'}`;
        console.log(`[TaskWorker] ${this.friendlyName} stealth ready (${preflightMsg})`);
      } catch (err: any) {
        console.error(`[TaskWorker] ${this.friendlyName} stealth init failed: ${err.message}`);
        // Reset both flags so the next call retries the whole sequence.
        this.stealthInitialized = false;
        this.preflightsCompleted = false;
      }
    })();

    try {
      await this.initializingPromise;
    } finally {
      // Always clear the in-flight marker so a later call can retry.
      this.initializingPromise = null;
    }
    return this.stealthInitialized && this.preflightsCompleted;
  }

  /**
   * Register worker with the registry (get friendly name).
   *
   * Registration is best-effort: on a rejected registration or any request
   * error, the friendly name falls back to the first 12 characters of the
   * worker ID and the worker keeps going.
   */
  private async register(): Promise<void> {
    try {
      const response = await fetch(`${API_BASE_URL}/api/worker-registry/register`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          role: this.role,
          worker_id: this.workerId,
          pod_name: process.env.POD_NAME || process.env.HOSTNAME,
          hostname: os.hostname(),
          metadata: {
            pid: process.pid,
            node_version: process.version,
            started_at: new Date().toISOString(),
          },
        }),
      });

      const data = await response.json();
      if (data.success) {
        this.friendlyName = data.friendly_name;
        console.log(`[TaskWorker] ${data.message}`);
      } else {
        console.warn(`[TaskWorker] Registration warning: ${data.error}`);
        this.friendlyName = this.workerId.slice(0, 12);
      }
    } catch (error: any) {
      // Registration is optional - worker can still function without it
      console.warn(`[TaskWorker] Could not register with API (will continue): ${error.message}`);
      this.friendlyName = this.workerId.slice(0, 12);
    }
  }

  /**
   * Deregister worker from the registry.
   * Errors are deliberately ignored so shutdown is never blocked by the API.
   */
  private async deregister(): Promise<void> {
    try {
      await fetch(`${API_BASE_URL}/api/worker-registry/deregister`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ worker_id: this.workerId }),
      });
      console.log(`[TaskWorker] ${this.friendlyName} signed off`);
    } catch {
      // Ignore deregistration errors
    }
  }

  /**
   * Send heartbeat to registry with resource usage, proxy location, and step info.
   * Best-effort: any failure is swallowed so a flaky API never disturbs task processing.
   */
  private async sendRegistryHeartbeat(): Promise<void> {
    try {
      const memUsage = process.memoryUsage();
      const cpuUsage = process.cpuUsage();
      const proxyLocation = this.crawlRotator.getProxyLocation();
      const resourceStats = this.getResourceStats();

      // Get array of active task IDs
      const activeTaskIds = Array.from(this.activeTasks.keys());

      // Get step info for all active tasks
      const taskSteps = this.getTaskStepsInfo();

      await fetch(`${API_BASE_URL}/api/worker-registry/heartbeat`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          worker_id: this.workerId,
          current_task_id: activeTaskIds[0] ?? null, // Primary task for backwards compat
          current_task_ids: activeTaskIds, // All active tasks
          active_task_count: this.activeTasks.size,
          max_concurrent_tasks: this.maxConcurrentTasks,
          status: this.activeTasks.size > 0 ? 'active' : 'idle',
          // Step tracking for dashboard visibility
          current_step: this.currentStep,
          current_step_detail: this.currentStepDetail,
          current_step_started_at: this.currentStepStartedAt?.toISOString() || null,
          task_steps: taskSteps, // Per-task step info for concurrent workers
          resources: {
            memory_mb: Math.round(memUsage.heapUsed / 1024 / 1024),
            memory_total_mb: Math.round(memUsage.heapTotal / 1024 / 1024),
            memory_rss_mb: Math.round(memUsage.rss / 1024 / 1024),
            memory_percent: Math.round(resourceStats.memoryPercent * 100),
            cpu_user_ms: Math.round(cpuUsage.user / 1000),
            cpu_system_ms: Math.round(cpuUsage.system / 1000),
            cpu_percent: Math.round(resourceStats.cpuPercent),
            proxy_location: proxyLocation,
            is_backing_off: this.isBackingOff,
            backoff_reason: this.backoffReason,
          },
        }),
      });
    } catch {
      // Ignore heartbeat errors
    }
  }

  /**
   * Report task completion to registry.
   *
   * @param success - Whether the task finished successfully.
   */
  private async reportTaskCompletion(success: boolean): Promise<void> {
    try {
      await fetch(`${API_BASE_URL}/api/worker-registry/task-completed`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ worker_id: this.workerId, success }),
      });
    } catch {
      // Ignore errors
    }
  }

  /**
   * Start registry heartbeat interval (fires every HEARTBEAT_INTERVAL_MS).
   */
  private startRegistryHeartbeat(): void {
    this.registryHeartbeatInterval = setInterval(async () => {
      await this.sendRegistryHeartbeat();
    }, HEARTBEAT_INTERVAL_MS);
  }

  /**
   * Stop registry heartbeat interval. Safe to call when no interval is active.
   */
  private stopRegistryHeartbeat(): void {
    if (this.registryHeartbeatInterval) {
      clearInterval(this.registryHeartbeatInterval);
      this.registryHeartbeatInterval = null;
    }
  }

  /**
   * Run stale task cleanup once.
   * Recovers tasks left in claimed/running status after worker crashes.
   * Errors are logged and swallowed so cleanup can never stop the worker.
   */
  private async runStaleTaskCleanup(): Promise<void> {
    try {
      console.log(`[TaskWorker] ${this.friendlyName} running stale task cleanup...`);
      const cleanupResult = await taskService.cleanupStaleTasks(30); // 30 minute threshold
      if (cleanupResult.cleaned > 0) {
        console.log(`[TaskWorker] Cleaned up ${cleanupResult.cleaned} stale tasks`);
      }
    } catch (err: any) {
      console.error(`[TaskWorker] Stale task cleanup error:`, err.message);
    }
  }

  /**
   * Start periodic stale task cleanup (every 10 minutes).
   * start() calls this unconditionally, so every worker runs the cleanup.
   */
  private startPeriodicStaleCleanup(): void {
    const STALE_CLEANUP_INTERVAL_MS = 10 * 60 * 1000; // 10 minutes
    this.staleCleanupInterval = setInterval(async () => {
      await this.runStaleTaskCleanup();
    }, STALE_CLEANUP_INTERVAL_MS);
    console.log(`[TaskWorker] ${this.friendlyName} started periodic stale cleanup (every 10 min)`);
  }

  /**
   * Stop periodic stale task cleanup. Safe to call when no interval is active.
   */
  private stopPeriodicStaleCleanup(): void {
    if (this.staleCleanupInterval) {
      clearInterval(this.staleCleanupInterval);
      this.staleCleanupInterval = null;
    }
  }

  /**
   * Start the worker loop
   *
   * Workers start IMMEDIATELY without blocking on proxy/preflight init.
   * Stealth systems are lazy-initialized on first task claim.
   * This allows workers to register and send heartbeats even when proxies aren't ready.
   *
   * Resolves once the loop has been asked to stop (stop() or a decommission
   * signal) and every in-flight task has settled.
   */
  async start(): Promise<void> {
    this.isRunning = true;

    // Register with the API to get a friendly name (failures are tolerated)
    await this.register();

    // Start registry heartbeat immediately
    this.startRegistryHeartbeat();

    // Cleanup stale tasks on startup and periodically
    // ALL workers run cleanup to ensure stale tasks are recovered even if some workers crash
    // The cleanup query uses SELECT FOR UPDATE SKIP LOCKED to avoid races
    // Run immediately on startup
    await this.runStaleTaskCleanup();

    // Start periodic cleanup every 10 minutes
    this.startPeriodicStaleCleanup();

    const roleMsg = this.role ? `for role: ${this.role}` : '(role-agnostic - any task)';
    console.log(`[TaskWorker] ${this.friendlyName} starting ${roleMsg} (stealth=lazy, max ${this.maxConcurrentTasks} concurrent tasks)`);

    while (this.isRunning) {
      try {
        await this.mainLoop();
      } catch (error: any) {
        console.error(`[TaskWorker] Loop error:`, error.message);
        await this.sleep(POLL_INTERVAL_MS);
      }
    }

    // Wait for any remaining tasks to complete
    if (this.taskPromises.size > 0) {
      console.log(`[TaskWorker] Waiting for ${this.taskPromises.size} active tasks to complete...`);
      await Promise.allSettled(this.taskPromises.values());
    }

    // The loop can also end via the decommission path, which never goes through
    // stop(). Clear the timers here (both calls are idempotent) so a stopped
    // worker does not keep heartbeating or hold the event loop open.
    this.stopRegistryHeartbeat();
    this.stopPeriodicStaleCleanup();

    console.log(`[TaskWorker] Worker ${this.workerId} stopped`);
  }

  /**
   * Main loop - tries to fill up to maxConcurrentTasks.
   *
   * One iteration runs these gates in order and returns early from whichever
   * one applies: resource back-off, decommission check, lazy stealth init,
   * HTTP preflight gate, working-hours capacity, then a single task claim.
   */
  private async mainLoop(): Promise<void> {
    // Check resource usage and backoff if needed
    const { backoff, reason } = this.shouldBackOff();
    if (backoff) {
      if (!this.isBackingOff) {
        // Log only on the transition into back-off, not on every iteration
        console.log(`[TaskWorker] ${this.friendlyName} backing off: ${reason}`);
      }
      this.isBackingOff = true;
      this.backoffReason = reason;
      await this.sleep(BACKOFF_DURATION_MS);
      return;
    }

    // Clear backoff state
    if (this.isBackingOff) {
      console.log(`[TaskWorker] ${this.friendlyName} resuming normal operation`);
      this.isBackingOff = false;
      this.backoffReason = null;
    }

    // Periodically reload proxies to pick up changes (new proxies, disabled proxies)
    // This runs every ~60 seconds (controlled by setProxyReloadInterval)
    if (this.stealthInitialized) {
      await this.crawlRotator.reloadIfStale();
    }

    // Check for decommission signal
    const shouldDecommission = await this.checkDecommission();
    if (shouldDecommission) {
      console.log(`[TaskWorker] ${this.friendlyName} received decommission signal - waiting for ${this.activeTasks.size} tasks to complete`);
      // Stop accepting new tasks, wait for current to finish
      this.isRunning = false;
      return;
    }

    // Try to claim more tasks if we have capacity
    if (this.canAcceptMoreTasks()) {
      // =================================================================
      // LAZY INITIALIZATION - Initialize stealth on first task claim
      // Workers start immediately and init proxies only when needed
      // =================================================================
      if (!this.stealthInitialized) {
        const initSuccess = await this.ensureStealthInitialized();
        if (!initSuccess) {
          // Init failed - wait and retry next loop
          console.log(`[TaskWorker] ${this.friendlyName} stealth init failed, waiting before retry...`);
          await this.sleep(30000);
          return;
        }
      }

      // =================================================================
      // PREFLIGHT GATE - BLOCK unqualified workers from task pool
      // All tasks require HTTP method, so HTTP preflight MUST pass.
      // If preflight failed, worker retries every 30 seconds.
      // Worker CANNOT claim ANY tasks until preflight passes.
      // =================================================================
      if (!this.preflightHttpPassed) {
        console.log(`[TaskWorker] ${this.friendlyName} BLOCKED - HTTP preflight not passed, cannot claim tasks`);
        await this.retryPreflightUntilPass();
        return; // Return to main loop, will re-check on next iteration
      }

      // =================================================================
      // WORKING HOURS GATE - Simulate natural traffic patterns
      // Workers scale their concurrent task limit based on the current
      // hour's weight in their timezone. This creates natural throughput:
      //   3 AM (5%):   1 slot  → worker runs 1 task at a time
      //   6 PM (100%): 3 slots → worker runs full capacity
      // =================================================================
      const { effectiveMax, reason: capacityReason } = await this.getEffectiveMaxTasks();
      if (this.activeTasks.size >= effectiveMax) {
        // Already at working hours capacity - wait before checking again
        const sleepMs = 10000 + Math.random() * 20000; // 10-30 seconds
        if (this.activeTasks.size > 0) {
          // Only log if we're actually throttled (have tasks but can't take more)
          console.log(`[TaskWorker] ${this.friendlyName} AT CAPACITY - ${capacityReason} (${this.activeTasks.size}/${effectiveMax} active)`);
        }
        await this.sleep(sleepMs);
        return; // Return to main loop, will re-check on next iteration
      }

      // Pass preflight capabilities to only claim compatible tasks
      const task = await taskService.claimTask(
        this.role,
        this.workerId,
        this.preflightCurlPassed,
        this.preflightHttpPassed
      );

      if (task) {
        console.log(`[TaskWorker] ${this.friendlyName} claimed task ${task.id} (${task.role}) [${this.activeTasks.size + 1}/${this.maxConcurrentTasks}]`);

        // =================================================================
        // PREFLIGHT CHECK - Use stored preflight results based on task method
        // Dual-transport preflights already ran during lazy stealth
        // initialization, so just verify the correct preflight passed for
        // this task's required method.
        // =================================================================
        const taskMethod = task.method || 'http'; // Default to http if not specified
        let preflightPassed = false;
        let preflightMsg = '';

        if (taskMethod === 'http' && this.preflightHttpPassed) {
          preflightPassed = true;
          preflightMsg = `HTTP preflight passed (IP: ${this.preflightHttpResult?.proxyIp || 'unknown'})`;
        } else if (taskMethod === 'curl' && this.preflightCurlPassed) {
          preflightPassed = true;
          preflightMsg = `CURL preflight passed (IP: ${this.preflightCurlResult?.proxyIp || 'unknown'})`;
        } else if (!task.method && (this.preflightHttpPassed || this.preflightCurlPassed)) {
          // No method preference - either transport works
          preflightPassed = true;
          preflightMsg = this.preflightHttpPassed ? 'HTTP preflight passed' : 'CURL preflight passed';
        }

        if (!preflightPassed) {
          const errorMsg = taskMethod === 'http'
            ? 'HTTP preflight not passed - cannot execute http tasks'
            : 'CURL preflight not passed - cannot execute curl tasks';
          console.log(`[TaskWorker] ${this.friendlyName} PREFLIGHT FAILED for task ${task.id}: ${errorMsg}`);
          console.log(`[TaskWorker] Releasing task ${task.id} back to pending - worker cannot proceed without preflight`);

          // Release task back to pending so another worker can pick it up
          await taskService.releaseTask(task.id);

          // Wait before trying again - give proxies time to recover
          await this.sleep(30000); // 30 second wait on preflight failure
          return;
        }

        console.log(`[TaskWorker] ${this.friendlyName} preflight verified for task ${task.id}: ${preflightMsg}`);

        this.activeTasks.set(task.id, task);

        // Start task in background (don't await)
        const taskPromise = this.executeTask(task);
        this.taskPromises.set(task.id, taskPromise);

        // Clean up when done. The catch() comes first so that a rejected task
        // promise is logged here instead of surfacing as an unhandled rejection
        // on the promise derived by finally().
        void taskPromise
          .catch((err: any) => {
            console.error(`[TaskWorker] ${this.friendlyName} task ${task.id} rejected unexpectedly:`, err?.message ?? err);
          })
          .finally(() => {
            this.activeTasks.delete(task.id);
            this.taskPromises.delete(task.id);
          });

        // Immediately try to claim more tasks (don't wait for poll interval)
        return;
      }
    }

    // No task claimed or at capacity - wait before next poll
    await this.sleep(POLL_INTERVAL_MS);
  }

  /**
   * Stop the worker.
   *
   * Flags the loop to exit, clears every timer and deregisters from the
   * registry. In-flight tasks are not cancelled here; start() waits for them.
   */
  async stop(): Promise<void> {
    this.isRunning = false;
    this.stopHeartbeat();
    this.stopRegistryHeartbeat();
    this.stopPeriodicStaleCleanup();
    await this.deregister();
    console.log(`[TaskWorker] ${this.friendlyName} stopped`);
  }

  /**
   * Execute a single task (runs concurrently with other tasks).
   *
   * The outcome is recorded through taskService (complete / fail) and reported
   * to the registry. Failures to chain a follow-up task or to record a failure
   * are logged rather than thrown, so a task that already completed is never
   * re-marked as failed.
   *
   * @param task - The claimed task to run.
   */
  private async executeTask(task: WorkerTask): Promise<void> {
    console.log(`[TaskWorker] ${this.friendlyName} starting task ${task.id} (${task.role}) for dispensary ${task.dispensary_id || 'N/A'}`);

    try {
      // Mark as running
      await taskService.startTask(task.id);

      // Get handler for this role (considers method for dual-transport)
      const handler = getHandlerForTask(task);
      if (!handler) {
        throw new Error(`No handler registered for role: ${task.role}`);
      }

      // Create context with step tracking
      const ctx: TaskContext = {
        pool: this.pool,
        workerId: this.workerId,
        task,
        heartbeat: async () => {
          await taskService.heartbeat(task.id);
        },
        crawlRotator: this.crawlRotator,
        updateStep: (step: string, detail?: string) => {
          this.updateTaskStep(task.id, step, detail);
        },
      };

      // Initialize step tracking for this task
      this.updateTaskStep(task.id, 'starting', `Initializing ${task.role}`);

      // Execute the task
      const result = await handler(ctx);

      // Clear step tracking (the handler is done whether it succeeded or not)
      this.clearTaskStep(task.id);

      if (result.success) {
        // Mark as completed
        await taskService.completeTask(task.id, result);
        await this.reportTaskCompletion(true);
        console.log(`[TaskWorker] ${this.friendlyName} completed task ${task.id} [${this.activeTasks.size}/${this.maxConcurrentTasks} active]`);

        // Chain next task if applicable. Isolated from the outer catch: the
        // task is already recorded as completed, so a chaining error must not
        // flip it to failed or report its completion a second time.
        try {
          const chainedTask = await taskService.chainNextTask({
            ...task,
            status: 'completed',
            result,
          });
          if (chainedTask) {
            console.log(`[TaskWorker] Chained new task ${chainedTask.id} (${chainedTask.role})`);
          }
        } catch (chainError: any) {
          console.error(`[TaskWorker] ${this.friendlyName} could not chain follow-up for task ${task.id}:`, chainError.message);
        }
      } else {
        // Mark as failed
        await taskService.failTask(task.id, result.error || 'Unknown error');
        await this.reportTaskCompletion(false);
        console.log(`[TaskWorker] ${this.friendlyName} failed task ${task.id}: ${result.error}`);
      }
    } catch (error: any) {
      // Clear step tracking
      this.clearTaskStep(task.id);

      // Mark as failed. Guarded so that an error while recording the failure
      // cannot reject this method's promise.
      try {
        await taskService.failTask(task.id, error.message);
      } catch (failError: any) {
        console.error(`[TaskWorker] ${this.friendlyName} could not record failure of task ${task.id}:`, failError.message);
      }
      await this.reportTaskCompletion(false);
      console.error(`[TaskWorker] ${this.friendlyName} task ${task.id} error:`, error.message);
    }
    // Note: cleanup (removing from activeTasks) is handled by the finally() callback attached in mainLoop
  }

  /**
   * Check if this worker has been flagged for decommission.
   *
   * @returns true if the worker_registry row for this worker has
   *   decommission_requested set; false otherwise, including when the
   *   lookup itself fails.
   */
  private async checkDecommission(): Promise<boolean> {
    try {
      // Check worker_registry for decommission flag
      const result = await this.pool.query(
        `SELECT decommission_requested, decommission_reason FROM worker_registry WHERE worker_id = $1`,
        [this.workerId]
      );

      if (result.rows.length > 0 && result.rows[0].decommission_requested) {
        const reason = result.rows[0].decommission_reason || 'No reason provided';
        console.log(`[TaskWorker] Decommission requested: ${reason}`);
        return true;
      }

      return false;
    } catch (error: any) {
      // If we can't check, continue running
      console.warn(`[TaskWorker] Could not check decommission status: ${error.message}`);
      return false;
    }
  }

  /**
   * Start heartbeat interval for a single task.
   *
   * @param taskId - ID of the task whose heartbeat is refreshed every HEARTBEAT_INTERVAL_MS.
   */
  private startHeartbeat(taskId: number): void {
    // Clear any interval that is still running so its handle is not orphaned.
    this.stopHeartbeat();
    this.heartbeatInterval = setInterval(async () => {
      try {
        await taskService.heartbeat(taskId);
      } catch (error: any) {
        console.warn(`[TaskWorker] Heartbeat failed:`, error.message);
      }
    }, HEARTBEAT_INTERVAL_MS);
  }

  /**
   * Stop heartbeat interval. Safe to call when no interval is active.
   */
  private stopHeartbeat(): void {
    if (this.heartbeatInterval) {
      clearInterval(this.heartbeatInterval);
      this.heartbeatInterval = null;
    }
  }

  /**
   * Sleep helper.
   *
   * @param ms - Delay in milliseconds.
   */
  private sleep(ms: number): Promise<void> {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  /**
   * Get worker info: a snapshot of identity, running state, active tasks,
   * back-off state and preflight results.
   */
  getInfo(): {
    workerId: string;
    role: TaskRole | null;
    isRunning: boolean;
    activeTaskIds: number[];
    activeTaskCount: number;
    maxConcurrentTasks: number;
    isBackingOff: boolean;
    backoffReason: string | null;
    preflightCurlPassed: boolean;
    preflightHttpPassed: boolean;
  } {
    return {
      workerId: this.workerId,
      role: this.role,
      isRunning: this.isRunning,
      activeTaskIds: Array.from(this.activeTasks.keys()),
      activeTaskCount: this.activeTasks.size,
      maxConcurrentTasks: this.maxConcurrentTasks,
      isBackingOff: this.isBackingOff,
      backoffReason: this.backoffReason,
      preflightCurlPassed: this.preflightCurlPassed,
      preflightHttpPassed: this.preflightHttpPassed,
    };
  }
}

// ============================================================
// CLI ENTRY POINT
// ============================================================

/**
 * CLI entry point: validates WORKER_ROLE, builds a TaskWorker, installs
 * SIGTERM/SIGINT handlers and runs the worker until it stops.
 * Exits the process with code 1 when WORKER_ROLE is set to an unknown role.
 */
async function main(): Promise<void> {
  const role = process.env.WORKER_ROLE as TaskRole | undefined;

  // Per TASK_WORKFLOW_2024-12-10.md: Valid task roles
  const validRoles: TaskRole[] = [
    'store_discovery',
    'entry_point_discovery',
    'product_discovery',
    'payload_fetch', // Fetches from API, saves to disk
    'product_refresh', // Reads from disk, processes to DB
    'analytics_refresh',
  ];

  // If role specified, validate it
  if (role && !validRoles.includes(role)) {
    console.error(`Error: Invalid WORKER_ROLE: ${role}`);
    console.error(`Valid roles: ${validRoles.join(', ')}`);
    console.error('Or omit WORKER_ROLE for role-agnostic worker (any task)');
    process.exit(1);
  }

  // Use POD_NAME for persistent identity in K8s StatefulSet
  // This ensures workers keep the same ID across restarts
  // Falls back to WORKER_ID, then generates UUID if neither is set
  const workerId = process.env.POD_NAME || process.env.WORKER_ID;

  // Pass null for role-agnostic, or the specific role
  const worker = new TaskWorker(role || null, workerId);

  // Handle graceful shutdown. stop() is async, so its rejection is handled
  // here rather than being left as a floating promise in the signal handler.
  const shutdown = (signal: string): void => {
    console.log(`[TaskWorker] Received ${signal}, shutting down...`);
    worker.stop().catch((error: any) => {
      console.error('[TaskWorker] Error during shutdown:', error?.message ?? error);
    });
  };

  process.on('SIGTERM', () => shutdown('SIGTERM'));
  process.on('SIGINT', () => shutdown('SIGINT'));

  await worker.start();
}

// Run if this is the main module
if (require.main === module) {
  main().catch((error) => {
    console.error('[TaskWorker] Fatal error:', error);
    process.exit(1);
  });
}

export { main };