Platform isolation:
- Rename handlers to {task}-{platform}.ts convention
- Deprecate -curl variants (now _deprecated-*)
- Platform-based routing in task-worker.ts
- Add Jane platform handlers and client
Evomi geo-targeting:
- Add dynamic proxy URL builder with state/city targeting
- Session stickiness per worker per state (30 min)
- Fallback to static proxy table when API unavailable
- Add proxy tracking columns to worker_tasks
Proxy management:
- New /proxies admin page for visibility
- Track proxy_ip, proxy_geo, proxy_source per task
- Show active sessions and task history
Validation filtering:
- Filter by validated stores (platform_dispensary_id + menu_url)
- Mark incomplete stores as deprecated
- Update all dashboard/stats queries
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1424 lines
54 KiB
TypeScript
1424 lines
54 KiB
TypeScript
/**
|
||
* Task Worker
|
||
*
|
||
* A unified worker that pulls tasks from the worker_tasks queue.
|
||
* Workers register on startup, get a friendly name, and pull tasks.
|
||
*
|
||
* Architecture:
|
||
* - Tasks are generated on schedule (by scheduler or API)
|
||
* - Workers PULL tasks from the pool (not assigned to them)
|
||
* - Tasks are claimed in order of priority (DESC) then creation time (ASC)
|
||
* - Workers report heartbeats to worker_registry
|
||
* - Workers are ROLE-AGNOSTIC by default (can handle any task type)
|
||
*
|
||
* Stealth & Anti-Detection (LAZY INITIALIZATION):
|
||
* Workers start IMMEDIATELY without waiting for proxies.
|
||
* Stealth systems (proxies, fingerprints, preflights) are initialized
|
||
* on first task claim, not at worker startup.
|
||
*
|
||
* This allows workers to:
|
||
* - Register and send heartbeats immediately
|
||
* - Wait in main loop without blocking on proxy availability
|
||
* - Initialize proxies/preflights only when tasks are actually available
|
||
*
|
||
* On first task claim attempt, workers initialize the CrawlRotator which provides:
|
||
* - Proxy rotation: Loads proxies from `proxies` table, ALL requests use proxy
|
||
* - User-Agent rotation: Cycles through realistic browser fingerprints
|
||
* - Fingerprint rotation: Changes browser profile on blocks
|
||
* - Locale/timezone: Matches Accept-Language to target state
|
||
*
|
||
* The CrawlRotator is wired to the Dutchie client via setCrawlRotator().
|
||
* Task handlers call startSession() which picks a random fingerprint.
|
||
* On 403 errors, the client automatically:
|
||
* 1. Records failure on current proxy
|
||
* 2. Rotates to next proxy
|
||
* 3. Rotates fingerprint
|
||
* 4. Retries the request
|
||
*
|
||
* Usage:
|
||
* npx tsx src/tasks/task-worker.ts # Role-agnostic (any task)
|
||
* WORKER_ROLE=product_refresh npx tsx src/tasks/task-worker.ts # Role-specific
|
||
*
|
||
* Environment:
|
||
* WORKER_ROLE - Which task role to process (optional, null = any task)
|
||
* POD_NAME - K8s StatefulSet pod name (PRIMARY - use this for persistent identity)
|
||
* WORKER_ID - Custom worker ID (fallback if POD_NAME not set)
|
||
* POLL_INTERVAL_MS - How often to check for tasks (default: 5000)
|
||
* HEARTBEAT_INTERVAL_MS - How often to update heartbeat (default: 30000)
|
||
* API_BASE_URL - Backend API URL for registration (default: http://localhost:3010)
|
||
*
|
||
* Worker Identity:
|
||
* Workers use POD_NAME as their worker_id for persistent identity across restarts.
|
||
* In K8s StatefulSet, POD_NAME = "scraper-worker-0" through "scraper-worker-7".
|
||
* This ensures workers re-register with the same ID instead of creating new entries.
|
||
*/
|
||
|
||
import { Pool } from 'pg';
|
||
import { v4 as uuidv4 } from 'uuid';
|
||
import { taskService, TaskRole, WorkerTask } from './task-service';
|
||
import { getPool } from '../db/pool';
|
||
import os from 'os';
|
||
|
||
// Stealth/rotation support
|
||
import { CrawlRotator } from '../services/crawl-rotator';
|
||
import { setCrawlRotator } from '../platforms/dutchie';
|
||
|
||
// Dual-transport preflight system
|
||
import { runCurlPreflight, CurlPreflightResult } from '../services/curl-preflight';
|
||
import { runPuppeteerPreflightWithRetry, PuppeteerPreflightResult } from '../services/puppeteer-preflight';
|
||
|
||
// Task handlers by role
|
||
// Platform-based handlers: {task}-{platform}.ts convention
|
||
import { handleProductRefresh } from './handlers/product-refresh';
|
||
import { handleStoreDiscoveryState } from './handlers/store-discovery-state';
|
||
import { handleEntryPointDiscovery } from './handlers/entry-point-discovery';
|
||
import { handleAnalyticsRefresh } from './handlers/analytics-refresh';
|
||
import { handleWhoami } from './handlers/whoami';
|
||
|
||
// Dutchie Platform Handlers
|
||
import { handleProductDiscoveryDutchie } from './handlers/product-discovery-dutchie';
|
||
import { handleStoreDiscoveryDutchie } from './handlers/store-discovery-dutchie';
|
||
|
||
// Jane Platform Handlers
|
||
import { handleStoreDiscoveryJane } from './handlers/store-discovery-jane';
|
||
import { handleEntryPointDiscoveryJane } from './handlers/entry-point-discovery-jane';
|
||
import { handleProductDiscoveryJane } from './handlers/product-discovery-jane';
|
||
|
||
// Timing and endpoint settings; each one can be overridden through the environment.
// An unset or empty variable falls back to the literal default shown here.
const POLL_INTERVAL_MS = Number.parseInt(process.env.POLL_INTERVAL_MS || '5000');
const HEARTBEAT_INTERVAL_MS = Number.parseInt(process.env.HEARTBEAT_INTERVAL_MS || '30000');
const API_BASE_URL = process.env.API_BASE_URL || 'http://localhost:3010';

// =============================================================================
// CONCURRENT TASK PROCESSING SETTINGS
// =============================================================================
// A worker may run several tasks at once via async I/O, which raises throughput
// for I/O-bound work (network calls, DB queries).
//
// Resource thresholds trigger "backoff": the worker stops claiming new tasks
// but keeps processing the ones it already holds until resources recover.
//
// See: docs/WORKER_TASK_ARCHITECTURE.md#concurrent-task-processing
// =============================================================================

// Upper bound on tasks this worker runs concurrently.
// Browser tasks (Puppeteer) use ~400MB RAM each. With a 2GB pod limit:
//   - 3 browsers  = ~1.3GB = SAFE
//   - 4 browsers  = ~1.7GB = RISKY
//   - 5+ browsers = OOM CRASH
// See: docs/WORKER_TASK_ARCHITECTURE.md#browser-task-memory-limits
const MAX_CONCURRENT_TASKS = Number.parseInt(process.env.MAX_CONCURRENT_TASKS || '3');

// Heap usage fraction (decimal 0.0-1.0) above which no new tasks are claimed.
// Default 85% - leaves headroom before OOM.
const MEMORY_BACKOFF_THRESHOLD = Number.parseFloat(process.env.MEMORY_BACKOFF_THRESHOLD || '0.85');
|
||
|
||
/**
 * Resolve the maximum old-space heap size (in MB) from NODE_OPTIONS.
 *
 * The value is used as the denominator for the memory percentage calculation.
 * V8's heapTotal is dynamic and stays small when idle, which would otherwise
 * produce falsely high percentages.
 *
 * Both flag spellings accepted by Node are recognised:
 * `--max-old-space-size=1500` and `--max_old_space_size=1500`.
 *
 * @returns The configured limit in MB, or 512 when the flag is absent or does
 *          not carry a positive integer.
 */
function getMaxHeapSizeMb(): number {
  const DEFAULT_MAX_HEAP_MB = 512;
  const nodeOptions = process.env.NODE_OPTIONS || '';

  // Node treats dashes and underscores in option names as interchangeable.
  const match = nodeOptions.match(/--max[-_]old[-_]space[-_]size=(\d+)/);
  const parsed = Number.parseInt(match?.[1] ?? '', 10);

  // A zero (or unparsable) limit would make the memory percentage Infinity/NaN
  // and keep the worker in permanent backoff, so fall back to the default.
  return Number.isFinite(parsed) && parsed > 0 ? parsed : DEFAULT_MAX_HEAP_MB;
}
|
||
// Heap ceiling in MB, resolved once when the module loads.
const MAX_HEAP_SIZE_MB = getMaxHeapSizeMb();

// CPU usage fraction (decimal 0.0-1.0) above which no new tasks are claimed.
// Default 90% - leaves room for short bursts.
const CPU_BACKOFF_THRESHOLD = Number.parseFloat(process.env.CPU_BACKOFF_THRESHOLD || '0.90');

// Delay (ms) between resource re-checks while the worker is backing off.
const BACKOFF_DURATION_MS = Number.parseInt(process.env.BACKOFF_DURATION_MS || '10000');
|
||
|
||
/**
 * Everything a task handler receives when it is invoked for one task.
 */
export interface TaskContext {
  /** PostgreSQL connection pool (pg `Pool`). */
  pool: Pool;

  /** Identifier of the worker running the task. */
  workerId: string;

  /** The queue row being processed. */
  task: WorkerTask;

  /**
   * Async callback supplied by the worker.
   * NOTE(review): presumably refreshes the task heartbeat - confirm where the context is built.
   */
  heartbeat: () => Promise<void>;

  /** Optional rotator for proxy / fingerprint rotation. */
  crawlRotator?: CrawlRotator;

  /** Update the current step being executed (shown in dashboard) */
  updateStep: (step: string, detail?: string) => void;
}
|
||
|
||
/**
 * Outcome returned by a task handler.
 * Only `success` is required; handlers may attach any additional keys.
 */
export interface TaskResult {
  /** Whether the handler completed successfully. */
  success: boolean;

  /** Optional counters reported by handlers. */
  productsProcessed?: number;
  snapshotsCreated?: number;
  storesDiscovered?: number;

  /** Error description when the task did not succeed. */
  error?: string;

  /** Handler-specific extras. */
  [key: string]: unknown;
}
|
||
|
||
/** Signature shared by every task handler: takes the task context, resolves to a result. */
type TaskHandler = (ctx: TaskContext) => Promise<TaskResult>;

// Per TASK_WORKFLOW_2024-12-10.md: Handler registry
// These handlers are platform-agnostic (shared across Dutchie and Jane) and are
// looked up by role once platform-specific routing has not matched.
// product_refresh: Reads local payload, uses platform-aware normalizer, upserts to DB
const SHARED_HANDLERS: Partial<Record<TaskRole, TaskHandler>> = {
  product_refresh: handleProductRefresh,
  store_discovery_state: handleStoreDiscoveryState,
  entry_point_discovery: handleEntryPointDiscovery,
  analytics_refresh: handleAnalyticsRefresh,
  whoami: handleWhoami,
};
|
||
|
||
/**
 * Pick the handler for a task from its platform and role.
 *
 * Handler files follow the {task}-{platform}.ts naming convention:
 *   - product-discovery-dutchie.ts
 *   - product-discovery-jane.ts
 *   - store-discovery-dutchie.ts
 *   - store-discovery-jane.ts
 *
 * All handlers use HTTP/Puppeteer transport (curl transport is deprecated).
 *
 * Resolution order:
 *   1. Jane-specific handlers when the task's platform is 'jane'
 *   2. Dutchie handlers for product_discovery / store_discovery
 *   3. The shared, platform-agnostic registry
 *
 * @param task - Task whose `role` and `platform` drive the routing; a missing
 *               platform is treated as 'dutchie'.
 * @returns The matching handler, or undefined when no handler is registered for the role.
 */
function getHandlerForTask(task: WorkerTask): TaskHandler | undefined {
  const role = task.role as TaskRole;
  const platform = task.platform || 'dutchie';

  // --- Jane platform routing ------------------------------------------------
  if (platform === 'jane') {
    switch (role) {
      case 'store_discovery':
      case 'store_discovery_state':
        console.log(`[TaskWorker] Using Jane handler for store_discovery`);
        return handleStoreDiscoveryJane;
      case 'entry_point_discovery':
        console.log(`[TaskWorker] Using Jane handler for entry_point_discovery`);
        return handleEntryPointDiscoveryJane;
      case 'product_discovery':
        console.log(`[TaskWorker] Using Jane handler for product_discovery`);
        return handleProductDiscoveryJane;
      default:
        // Roles without a Jane-specific handler continue to the routing below.
        break;
    }
  }

  // --- Dutchie platform routing (default) -----------------------------------
  switch (role) {
    case 'product_discovery':
      console.log(`[TaskWorker] Using Dutchie handler for product_discovery`);
      return handleProductDiscoveryDutchie;
    case 'store_discovery':
      console.log(`[TaskWorker] Using Dutchie handler for store_discovery`);
      return handleStoreDiscoveryDutchie;
    default:
      break;
  }

  // --- Shared handlers (platform-agnostic) ----------------------------------
  return SHARED_HANDLERS[role];
}
|
||
|
||
/**
 * Snapshot of the worker's resource usage.
 * Reported to the registry, used for backoff decisions, included in worker
 * heartbeats and displayed in the UI.
 */
interface ResourceStats {
  /** Heap memory in use, as a decimal fraction (0.0 to 1.0) */
  memoryPercent: number;

  /** Heap currently used, in MB */
  memoryMb: number;

  /** Total heap available, in MB */
  memoryTotalMb: number;

  /** CPU usage percentage since the previous check (0 to 100) */
  cpuPercent: number;

  /** True while the worker is in backoff state */
  isBackingOff: boolean;

  /** Why the worker is backing off (e.g., "Memory at 87.3% (threshold: 85%)") */
  backoffReason: string | null;
}
|
||
|
||
export class TaskWorker {
|
||
private pool: Pool;
|
||
private workerId: string;
|
||
private role: TaskRole | null; // null = role-agnostic (any task)
|
||
private friendlyName: string = '';
|
||
private isRunning: boolean = false;
|
||
private heartbeatInterval: NodeJS.Timeout | null = null;
|
||
private registryHeartbeatInterval: NodeJS.Timeout | null = null;
|
||
private staleCleanupInterval: NodeJS.Timeout | null = null;
|
||
private crawlRotator: CrawlRotator;
|
||
|
||
// ==========================================================================
|
||
// CONCURRENT TASK TRACKING
|
||
// ==========================================================================
|
||
// activeTasks: Map of task ID -> task object for all currently running tasks
|
||
// taskPromises: Map of task ID -> Promise for cleanup when task completes
|
||
// maxConcurrentTasks: How many tasks this worker will run in parallel
|
||
// ==========================================================================
|
||
private activeTasks: Map<number, WorkerTask> = new Map();
|
||
private taskPromises: Map<number, Promise<void>> = new Map();
|
||
private maxConcurrentTasks: number = MAX_CONCURRENT_TASKS;
|
||
|
||
// ==========================================================================
|
||
// RESOURCE MONITORING FOR BACKOFF
|
||
// ==========================================================================
|
||
// CPU tracking uses differential measurement - we track last values and
|
||
// calculate percentage based on elapsed time since last check.
|
||
// ==========================================================================
|
||
private lastCpuUsage: { user: number; system: number } = { user: 0, system: 0 };
|
||
private lastCpuCheck: number = Date.now();
|
||
private isBackingOff: boolean = false;
|
||
private backoffReason: string | null = null;
|
||
|
||
// ==========================================================================
|
||
// DUAL-TRANSPORT PREFLIGHT STATUS
|
||
// ==========================================================================
|
||
// Workers run BOTH preflights during lazy initialization (before the first task claim), not at process startup:
|
||
// - curl: axios/proxy transport - fast, for simple API calls
|
||
// - http: Puppeteer/browser transport - anti-detect, for Dutchie GraphQL
|
||
//
|
||
// Task claiming checks method compatibility - worker must have passed
|
||
// the preflight for the task's required method.
|
||
// ==========================================================================
|
||
private preflightCurlPassed: boolean = false;
|
||
private preflightHttpPassed: boolean = false;
|
||
private preflightCurlResult: CurlPreflightResult | null = null;
|
||
private preflightHttpResult: PuppeteerPreflightResult | null = null;
|
||
|
||
// ==========================================================================
|
||
// LAZY INITIALIZATION FLAGS
|
||
// ==========================================================================
|
||
// Stealth/proxy initialization is deferred until first task claim.
|
||
// Workers register immediately and enter main loop without blocking.
|
||
// ==========================================================================
|
||
private stealthInitialized: boolean = false;
|
||
private preflightsCompleted: boolean = false;
|
||
private initializingPromise: Promise<void> | null = null;
|
||
|
||
// ==========================================================================
|
||
// PREFLIGHT RETRY SETTINGS
|
||
// ==========================================================================
|
||
// If preflight fails, worker retries every PREFLIGHT_RETRY_INTERVAL_MS
|
||
// Worker is BLOCKED from claiming ANY tasks until preflight passes.
|
||
// This ensures unqualified workers never touch the task pool.
|
||
// ==========================================================================
|
||
private static readonly PREFLIGHT_RETRY_INTERVAL_MS = 30000; // 30 seconds
|
||
private isRetryingPreflight: boolean = false;
|
||
|
||
// ==========================================================================
|
||
// STEP TRACKING FOR DASHBOARD VISIBILITY
|
||
// ==========================================================================
|
||
// Workers report their current step in heartbeats so the dashboard can show
|
||
// real-time progress like "preflight", "loading page", "processing products"
|
||
// ==========================================================================
|
||
private currentStep: string = 'idle';
|
||
private currentStepDetail: string | null = null;
|
||
private currentStepStartedAt: Date | null = null;
|
||
/** Map of task ID -> step info for concurrent tasks */
|
||
private taskSteps: Map<number, { step: string; detail: string | null; startedAt: Date }> = new Map();
|
||
|
||
constructor(role: TaskRole | null = null, workerId?: string) {
  // role:     restricts the worker to one task role; null means role-agnostic.
  // workerId: explicit identity; when omitted a random "worker-xxxxxxxx" id is generated.
  this.pool = getPool();
  this.role = role;
  this.workerId = workerId || `worker-${uuidv4().slice(0, 8)}`;

  // The rotator is only constructed here; nothing is loaded into it yet.
  this.crawlRotator = new CrawlRotator(this.pool);

  // Seed the CPU baseline so the first differential reading has a reference point.
  const { user, system } = process.cpuUsage();
  this.lastCpuUsage = { user, system };
  this.lastCpuCheck = Date.now();
}
|
||
|
||
/**
|
||
* Get current resource usage
|
||
* Memory percentage is calculated against MAX_HEAP_SIZE_MB (from --max-old-space-size)
|
||
* NOT against V8's dynamic heapTotal which stays small when idle
|
||
*/
|
||
private getResourceStats(): ResourceStats {
  // Memory is measured against MAX_HEAP_SIZE_MB (from --max-old-space-size),
  // not V8's dynamic heapTotal: heapTotal stays small when idle (e.g., 36MB)
  // and would produce false 95%+ readings.
  const heapUsedMb = process.memoryUsage().heapUsed / 1024 / 1024;
  const memoryPercent = heapUsedMb / MAX_HEAP_SIZE_MB;

  // CPU is differential: process CPU time consumed since the previous call,
  // divided by the wall-clock time elapsed since then.
  const { user, system } = process.cpuUsage();
  const now = Date.now();
  const elapsedMs = now - this.lastCpuCheck;

  let cpuPercent = 0;
  if (elapsedMs > 0) {
    // process.cpuUsage() reports microseconds; convert to milliseconds.
    const userMs = (user - this.lastCpuUsage.user) / 1000;
    const systemMs = (system - this.lastCpuUsage.system) / 1000;
    cpuPercent = ((userMs + systemMs) / elapsedMs) * 100;
  }

  // Every call moves the baseline forward, so the next reading covers only
  // the interval since this one.
  this.lastCpuUsage = { user, system };
  this.lastCpuCheck = now;

  return {
    memoryPercent,
    memoryMb: Math.round(heapUsedMb),
    memoryTotalMb: MAX_HEAP_SIZE_MB, // max-old-space-size, not dynamic heapTotal
    cpuPercent: Math.min(100, cpuPercent), // capped at 100%
    isBackingOff: this.isBackingOff,
    backoffReason: this.backoffReason,
  };
}
|
||
|
||
/**
|
||
* Check if we should back off from taking new tasks
|
||
*/
|
||
private shouldBackOff(): { backoff: boolean; reason: string | null } {
  // Decide whether to stop claiming new tasks. Memory is checked first, then CPU;
  // the first threshold exceeded supplies the reason.
  // Note: reading the stats also advances the CPU measurement baseline.
  const { memoryPercent, cpuPercent } = this.getResourceStats();

  let reason: string | null = null;
  if (memoryPercent > MEMORY_BACKOFF_THRESHOLD) {
    reason = `Memory at ${(memoryPercent * 100).toFixed(1)}% (threshold: ${MEMORY_BACKOFF_THRESHOLD * 100}%)`;
  } else if (cpuPercent > CPU_BACKOFF_THRESHOLD * 100) {
    // cpuPercent is on a 0-100 scale while the threshold is a 0.0-1.0 fraction.
    reason = `CPU at ${cpuPercent.toFixed(1)}% (threshold: ${CPU_BACKOFF_THRESHOLD * 100}%)`;
  }

  return { backoff: reason !== null, reason };
}
|
||
|
||
/**
|
||
* Get count of currently running tasks
|
||
*/
|
||
get activeTaskCount(): number {
  // Number of entries currently tracked in the activeTasks map.
  const { size } = this.activeTasks;
  return size;
}
|
||
|
||
/**
|
||
* Check if we can accept more tasks
|
||
*/
|
||
private canAcceptMoreTasks(): boolean {
  // True while the number of running tasks is below the concurrency limit.
  const running = this.activeTasks.size;
  return running < this.maxConcurrentTasks;
}
|
||
|
||
// ==========================================================================
|
||
// STEP TRACKING METHODS
|
||
// ==========================================================================
|
||
|
||
/**
|
||
* Update the current step for a task (for dashboard visibility)
|
||
* @param taskId - The task ID to update
|
||
* @param step - Short step name (e.g., "preflight", "loading", "processing")
|
||
* @param detail - Optional detail (e.g., "Verifying IP 1.2.3.4")
|
||
*/
|
||
public updateTaskStep(taskId: number, step: string, detail?: string): void {
  // Record the step a task is currently executing (for dashboard visibility).
  //   taskId - task being updated
  //   step   - short step name (e.g., "preflight", "loading", "processing")
  //   detail - optional detail (e.g., "Verifying IP 1.2.3.4"); stored as null when empty
  const detailOrNull = detail || null;
  this.taskSteps.set(taskId, { step, detail: detailOrNull, startedAt: new Date() });

  // Single-task backwards compat: mirror the step into the "primary" fields when
  // this is the only active task or the first one in the active map.
  const [firstActiveId] = this.activeTasks.keys();
  const isPrimary = this.activeTasks.size === 1 || taskId === firstActiveId;
  if (isPrimary) {
    this.currentStep = step;
    this.currentStepDetail = detailOrNull;
    this.currentStepStartedAt = new Date();
  }

  const detailSuffix = detail ? ` - ${detail}` : '';
  console.log(`[TaskWorker] Step: ${step}${detailSuffix} (task #${taskId})`);
}
|
||
|
||
/**
|
||
* Clear step tracking for a task (when task completes)
|
||
*/
|
||
private clearTaskStep(taskId: number): void {
  // Drop the step record for a finished task.
  this.taskSteps.delete(taskId);

  // The primary step is only reset once no tasks remain active.
  if (this.activeTasks.size !== 0) {
    return;
  }

  this.currentStep = 'idle';
  this.currentStepDetail = null;
  this.currentStepStartedAt = null;
}
|
||
|
||
/**
|
||
* Get current step info for all active tasks (for heartbeat)
|
||
*/
|
||
private getTaskStepsInfo(): Array<{
  task_id: number;
  step: string;
  detail: string | null;
  elapsed_ms: number;
}> {
  // Build the per-task step report sent with heartbeats. elapsed_ms is measured
  // from when the step was recorded to a single "now" shared by all entries.
  const snapshotAt = Date.now();
  const report: Array<{
    task_id: number;
    step: string;
    detail: string | null;
    elapsed_ms: number;
  }> = [];

  for (const [taskId, { step, detail, startedAt }] of this.taskSteps) {
    report.push({
      task_id: taskId,
      step,
      detail,
      elapsed_ms: snapshotAt - startedAt.getTime(),
    });
  }

  return report;
}
|
||
|
||
/**
|
||
* Initialize stealth systems (proxy rotation, fingerprints)
|
||
* Called LAZILY on first task claim attempt (NOT at worker startup).
|
||
*
|
||
* IMPORTANT: Proxies are REQUIRED to claim tasks. This method waits until proxies are available.
|
||
* Workers listen for PostgreSQL NOTIFY 'proxy_added' to wake up immediately when proxies are added.
|
||
*/
|
||
private async initializeStealth(): Promise<void> {
  // Load proxies into the CrawlRotator and wire it to the Dutchie client.
  //
  // Blocks until at least one active proxy is available, re-checking whenever a
  // PostgreSQL 'proxy_added' notification arrives or, as a fallback, every 30s.
  // Throws after MAX_WAIT_MINUTES without any active proxy.
  //
  // The LISTEN connection is borrowed from the shared pool, so before it is
  // returned the notification listener is detached and UNLISTEN is issued; if
  // UNLISTEN fails the client is destroyed instead of being reused.
  const MAX_WAIT_MINUTES = 60;
  // Named distinctly so it does not shadow the module-level POLL_INTERVAL_MS.
  const PROXY_POLL_INTERVAL_MS = 30000; // 30 seconds fallback polling
  const maxAttempts = (MAX_WAIT_MINUTES * 60 * 1000) / PROXY_POLL_INTERVAL_MS;
  let attempts = 0;
  let notifyClient: any = null;

  // Resolver of the wait currently in progress (null when not waiting).
  let notifyResolve: (() => void) | null = null;

  const onNotification = (msg: any): void => {
    if (msg.channel === 'proxy_added') {
      console.log(`[TaskWorker] Received proxy_added notification!`);
      if (notifyResolve) notifyResolve();
    }
  };

  // Set up PostgreSQL LISTEN for proxy notifications (best effort).
  try {
    notifyClient = await this.pool.connect();
    await notifyClient.query('LISTEN proxy_added');
    console.log(`[TaskWorker] Listening for proxy_added notifications...`);
  } catch (err: any) {
    console.log(`[TaskWorker] Could not set up LISTEN (will poll): ${err.message}`);
  }

  if (notifyClient) {
    notifyClient.on('notification', onNotification);
  }

  try {
    while (attempts < maxAttempts) {
      try {
        // Load proxies from database
        await this.crawlRotator.initialize();

        const stats = this.crawlRotator.proxy.getStats();
        if (stats.activeProxies > 0) {
          console.log(`[TaskWorker] Loaded ${stats.activeProxies} proxies (${stats.avgSuccessRate.toFixed(1)}% avg success rate)`);

          // Wire rotator to Dutchie client - proxies will be used for ALL requests
          setCrawlRotator(this.crawlRotator);

          console.log(`[TaskWorker] Stealth initialized: ${this.crawlRotator.userAgent.getCount()} fingerprints, proxy REQUIRED for all requests`);
          return;
        }

        attempts++;
        console.log(`[TaskWorker] No active proxies available (attempt ${attempts}). Waiting for proxies...`);

        // Wait for either a notification or the fallback timeout. Whichever
        // fires first clears the timer so no stray timeout is left pending.
        await new Promise<void>((resolve) => {
          const wake = (): void => {
            clearTimeout(timer);
            notifyResolve = null;
            resolve();
          };
          const timer = setTimeout(wake, PROXY_POLL_INTERVAL_MS);
          notifyResolve = wake;
        });
      } catch (error: any) {
        attempts++;
        console.log(`[TaskWorker] Error loading proxies (attempt ${attempts}): ${error.message}. Retrying...`);
        await this.sleep(PROXY_POLL_INTERVAL_MS);
      }
    }

    throw new Error(`No active proxies available after waiting ${MAX_WAIT_MINUTES} minutes. Add proxies to the database.`);
  } finally {
    // Clean up the LISTEN connection before handing it back to the pool.
    if (notifyClient) {
      // Without this the handler would stay attached to a reused pooled client.
      notifyClient.removeListener('notification', onNotification);

      let cleanupError: Error | undefined;
      try {
        await notifyClient.query('UNLISTEN proxy_added');
      } catch (err: any) {
        cleanupError = err instanceof Error ? err : new Error(String(err));
      }

      try {
        // Always release. Passing an error makes the pool discard the client,
        // which avoids reusing a connection that may still be LISTENing.
        notifyClient.release(cleanupError);
      } catch {
        // Ignore cleanup errors
      }
    }
  }
}
|
||
|
||
/**
|
||
* Run dual-transport preflights as part of lazy initialization (before the first task claim)
|
||
* Tests both curl (axios/proxy) and http (Puppeteer/browser) transport methods.
|
||
* Results are reported to worker_registry and used for task claiming.
|
||
*
|
||
* NOTE: All current tasks require 'http' method, so http preflight must pass
|
||
* for the worker to claim any tasks. Curl preflight is for future use.
|
||
*/
|
||
private async runDualPreflights(): Promise<void> {
  // Run the curl (axios/proxy) and http (Puppeteer/browser) preflights in
  // parallel, store both results on the worker, log them, and report them to
  // worker_registry. A preflight that rejects is converted into a failed result
  // rather than propagating.
  console.log(`[TaskWorker] Running dual-transport preflights...`);

  // Fields common to both synthetic "failed" results.
  const failureFields = (err: any) => ({
    passed: false,
    proxyAvailable: false,
    proxyConnected: false,
    antidetectReady: false,
    proxyIp: null,
    fingerprint: null,
    error: `Preflight error: ${err.message}`,
    responseTimeMs: null,
  });

  // Both attempts are started before either is awaited, so they overlap.
  const curlAttempt = runCurlPreflight(this.crawlRotator).catch(
    (err): CurlPreflightResult => ({ method: 'curl', ...failureFields(err) }),
  );
  const httpAttempt = runPuppeteerPreflightWithRetry(this.crawlRotator, 1).catch(
    (err): PuppeteerPreflightResult => ({ method: 'http', ...failureFields(err), productsReturned: 0 }),
  );
  const [curlResult, httpResult] = await Promise.all([curlAttempt, httpAttempt]);

  // Store results
  this.preflightCurlResult = curlResult;
  this.preflightHttpResult = httpResult;
  this.preflightCurlPassed = curlResult.passed;
  this.preflightHttpPassed = httpResult.passed;

  // Log results
  const curlVerdict = curlResult.passed ? 'PASSED' : 'FAILED';
  const curlDetail = curlResult.error ? ` - ${curlResult.error}` : '';
  console.log(`[TaskWorker] CURL preflight: ${curlVerdict}${curlDetail}`);

  const httpVerdict = httpResult.passed ? 'PASSED' : 'FAILED';
  const httpDetail = httpResult.error ? ` - ${httpResult.error}` : '';
  console.log(`[TaskWorker] HTTP preflight: ${httpVerdict}${httpDetail}`);

  if (httpResult.passed && httpResult.productsReturned) {
    console.log(`[TaskWorker] HTTP preflight returned ${httpResult.productsReturned} products from test store`);
  }

  // Report to worker_registry
  await this.reportPreflightStatus();

  // All current tasks require 'http', so a failed http preflight blocks claiming.
  if (this.preflightHttpPassed) {
    return;
  }
  console.warn(`[TaskWorker] WARNING: HTTP preflight failed - this worker cannot claim any tasks!`);
  console.warn(`[TaskWorker] Error: ${httpResult.error}`);
}
|
||
|
||
/**
|
||
* Report preflight status to worker_registry
|
||
* Function signature: update_worker_preflight(worker_id, transport, status, ip, response_ms, error, fingerprint)
|
||
*/
|
||
private async reportPreflightStatus(): Promise<void> {
  // Persist both preflight outcomes through the SQL function
  //   update_worker_preflight(worker_id, transport, status, ip, response_ms, error, fingerprint)
  // Failures are logged and swallowed: reporting is non-fatal for the worker.
  const statusLabel = (passed: boolean): string => (passed ? 'passed' : 'failed');

  try {
    // CURL preflight - includes IP address, never a fingerprint.
    const curl = this.preflightCurlResult;
    const curlParams = [
      this.workerId,
      statusLabel(this.preflightCurlPassed),
      curl?.proxyIp || null,
      curl?.responseTimeMs || null,
      curl?.error || null,
      null, // No fingerprint for curl
    ];
    await this.pool.query(`
      SELECT update_worker_preflight($1, 'curl', $2, $3, $4, $5, $6)
    `, curlParams);

    // HTTP preflight - includes IP, fingerprint, and timezone data.
    // Some fields are read through `any` because they are not on the declared result type.
    const http = this.preflightHttpResult;
    const httpExtras = http as any;
    let fingerprintJson: string | null = null;
    if (http) {
      fingerprintJson = JSON.stringify({
        ...http.fingerprint,
        detectedTimezone: httpExtras.detectedTimezone,
        detectedLocation: httpExtras.detectedLocation,
        productsReturned: http.productsReturned,
        botDetection: httpExtras.botDetection,
      });
    }

    const httpParams = [
      this.workerId,
      statusLabel(this.preflightHttpPassed),
      http?.proxyIp || null,
      http?.responseTimeMs || null,
      http?.error || null,
      fingerprintJson,
    ];
    await this.pool.query(`
      SELECT update_worker_preflight($1, 'http', $2, $3, $4, $5, $6)
    `, httpParams);

    console.log(`[TaskWorker] Preflight status reported to worker_registry`);
    if (this.preflightHttpResult?.proxyIp) {
      console.log(`[TaskWorker] HTTP IP: ${this.preflightHttpResult.proxyIp}, Timezone: ${(this.preflightHttpResult as any).detectedTimezone || 'unknown'}`);
    }
  } catch (err: any) {
    // Non-fatal - worker can still function
    console.warn(`[TaskWorker] Could not report preflight status: ${err.message}`);
  }
}
|
||
|
||
/**
|
||
* Retry preflight until it passes.
|
||
* Worker is BLOCKED from claiming ANY tasks until HTTP preflight passes.
|
||
* This ensures unqualified workers never touch the task pool.
|
||
*
|
||
* All current tasks require 'http' method, so HTTP preflight is mandatory.
|
||
*/
|
||
private async retryPreflightUntilPass(): Promise<void> {
  // Re-run the HTTP preflight every PREFLIGHT_RETRY_INTERVAL_MS until it passes
  // or the worker stops. Returns immediately when the preflight has already
  // passed or another retry loop is in progress.
  //
  // Each round reloads proxies (new ones may have been added) before re-running
  // the preflight. On success the new status is reported to worker_registry.
  if (this.preflightHttpPassed || this.isRetryingPreflight) {
    return;
  }

  this.isRetryingPreflight = true;
  const retrySeconds = TaskWorker.PREFLIGHT_RETRY_INTERVAL_MS / 1000;
  let retryCount = 0;

  // The flag is reset in `finally` so an unexpected throw (e.g. from sleep)
  // cannot leave it stuck at true, which would make every later call a no-op.
  try {
    console.log(`[TaskWorker] ${this.friendlyName} HTTP preflight FAILED - entering retry loop (every ${retrySeconds}s)`);
    console.log(`[TaskWorker] ${this.friendlyName} BLOCKED from task pool until preflight passes`);

    while (!this.preflightHttpPassed && this.isRunning) {
      retryCount++;

      // Wait before retry
      await this.sleep(TaskWorker.PREFLIGHT_RETRY_INTERVAL_MS);

      if (!this.isRunning) {
        break; // Worker stopping
      }

      console.log(`[TaskWorker] ${this.friendlyName} preflight retry #${retryCount}...`);

      // Reload proxies before retry (might have new ones)
      try {
        await this.crawlRotator.initialize();
        const stats = this.crawlRotator.proxy.getStats();
        console.log(`[TaskWorker] Proxies available: ${stats.activeProxies}`);
      } catch (err: any) {
        console.warn(`[TaskWorker] Proxy reload failed: ${err.message}`);
      }

      // Re-run HTTP preflight
      try {
        const httpResult = await runPuppeteerPreflightWithRetry(this.crawlRotator, 1);
        this.preflightHttpResult = httpResult;
        this.preflightHttpPassed = httpResult.passed;

        if (httpResult.passed) {
          console.log(`[TaskWorker] ${this.friendlyName} HTTP preflight PASSED on retry #${retryCount}!`);
          console.log(`[TaskWorker] ${this.friendlyName} IP: ${httpResult.proxyIp}, Products: ${httpResult.productsReturned}`);
          console.log(`[TaskWorker] ${this.friendlyName} now QUALIFIED to claim tasks`);

          // Report updated status
          await this.reportPreflightStatus();
          break;
        }

        console.log(`[TaskWorker] ${this.friendlyName} HTTP preflight still FAILED: ${httpResult.error}`);
        console.log(`[TaskWorker] ${this.friendlyName} will retry in ${retrySeconds}s...`);
      } catch (err: any) {
        console.error(`[TaskWorker] ${this.friendlyName} preflight retry error: ${err.message}`);
      }
    }
  } finally {
    this.isRetryingPreflight = false;
  }
}
|
||
|
||
/**
 * Get the effective max concurrent tasks based on working hours.
 *
 * Asks the `check_working_hours($1, 'natural_traffic')` database function for
 * this worker's current hour, that hour's weight (a percentage) and the
 * worker's timezone, then scales `maxConcurrentTasks` by the weight.
 * The result is never lower than 1, so a worker can always run one task.
 *
 * Example with maxConcurrentTasks = 3:
 *   0% weight:   effectiveMax = max(1, round(3 × 0.00)) = 1
 *   5% weight:   effectiveMax = max(1, round(3 × 0.05)) = 1
 *   75% weight:  effectiveMax = max(1, round(3 × 0.75)) = 2
 *   100% weight: effectiveMax = max(1, round(3 × 1.00)) = 3
 *
 * @returns The slot limit to honour right now, the weight it was derived
 *          from, and a human-readable explanation for logging. Never throws:
 *          any lookup problem falls back to full capacity.
 */
private async getEffectiveMaxTasks(): Promise<{ effectiveMax: number; hourWeight: number; reason: string }> {
  // Shared fallback: run at full capacity and say why.
  const fullCapacity = (reason: string): { effectiveMax: number; hourWeight: number; reason: string } => ({
    effectiveMax: this.maxConcurrentTasks,
    hourWeight: 100,
    reason,
  });

  try {
    const result = await this.pool.query(`
      SELECT current_hour, hour_weight, worker_timezone
      FROM check_working_hours($1, 'natural_traffic')
    `, [this.workerId]);

    if (result.rows.length === 0) {
      // Function returned nothing - default to full capacity
      return fullCapacity('Working hours check returned no result - using full capacity');
    }

    const row = result.rows[0];

    // Only a *missing* weight defaults to 100. A weight of 0 is a legitimate
    // value and must throttle the worker, so `??` is used instead of `||`.
    // Number() also normalises drivers that hand numeric columns back as strings.
    const parsedWeight = Number(row.hour_weight ?? 100);
    const hourWeight = Number.isFinite(parsedWeight) ? parsedWeight : 100;

    // Scale max concurrent tasks by hour weight, minimum 1
    const effectiveMax = Math.max(1, Math.round(this.maxConcurrentTasks * hourWeight / 100));

    return {
      effectiveMax,
      hourWeight,
      reason: `Hour ${row.current_hour} (${row.worker_timezone}): ${hourWeight}% → ${effectiveMax}/${this.maxConcurrentTasks} slots`,
    };
  } catch (err: unknown) {
    // On error, default to full capacity (don't block workers due to DB issues)
    const message = err instanceof Error ? err.message : String(err);
    console.warn(`[TaskWorker] Working hours check failed: ${message} - using full capacity`);
    return fullCapacity('Working hours check error - using full capacity');
  }
}
|
||
|
||
/**
 * Bring up the stealth systems on demand.
 *
 * Runs `initializeStealth()` followed by `runDualPreflights()` the first time
 * it is needed, and records progress in `stealthInitialized` /
 * `preflightsCompleted`. Concurrent callers share a single in-flight attempt
 * through `initializingPromise`.
 *
 * @returns true when both phases have completed, false when the attempt
 *          failed (both flags are reset so a later call can retry).
 */
private async ensureStealthInitialized(): Promise<boolean> {
  const isReady = (): boolean => this.stealthInitialized && this.preflightsCompleted;

  // Fast path: both phases already finished.
  if (isReady()) {
    return true;
  }

  // Another caller is mid-initialization: wait for it instead of starting a second run.
  if (this.initializingPromise) {
    await this.initializingPromise;
    return isReady();
  }

  console.log(`[TaskWorker] ${this.friendlyName} lazy-initializing stealth systems (first task claim)...`);

  // The attempt swallows its own errors, so awaiting it below never throws.
  const attempt = async (): Promise<void> => {
    try {
      // Phase 1: proxy/fingerprint rotation
      await this.initializeStealth();
      this.stealthInitialized = true;

      // Phase 2: dual-transport preflights
      await this.runDualPreflights();
      this.preflightsCompleted = true;

      const transportSummary = `curl=${this.preflightCurlPassed ? '✓' : '✗'} http=${this.preflightHttpPassed ? '✓' : '✗'}`;
      console.log(`[TaskWorker] ${this.friendlyName} stealth ready (${transportSummary})`);
    } catch (err: any) {
      console.error(`[TaskWorker] ${this.friendlyName} stealth init failed: ${err.message}`);
      // Roll back both flags so the next call starts from scratch.
      this.stealthInitialized = false;
      this.preflightsCompleted = false;
    }
  };

  this.initializingPromise = attempt();
  await this.initializingPromise;
  this.initializingPromise = null;

  return isReady();
}
|
||
|
||
/**
 * Announce this worker to the registry API and adopt the friendly name it
 * hands back.
 *
 * Registration is best-effort: if the API reports failure, or the request
 * itself fails, the first 12 characters of the worker ID are used as the
 * friendly name and the worker carries on.
 */
private async register(): Promise<void> {
  // Display name used whenever the registry does not supply one.
  const shortIdName = (): string => this.workerId.slice(0, 12);

  try {
    const registration = {
      role: this.role,
      worker_id: this.workerId,
      pod_name: process.env.POD_NAME || process.env.HOSTNAME,
      hostname: os.hostname(),
      metadata: {
        pid: process.pid,
        node_version: process.version,
        started_at: new Date().toISOString()
      }
    };

    const response = await fetch(`${API_BASE_URL}/api/worker-registry/register`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(registration)
    });
    const data = await response.json();

    if (!data.success) {
      console.warn(`[TaskWorker] Registration warning: ${data.error}`);
      this.friendlyName = shortIdName();
      return;
    }

    this.friendlyName = data.friendly_name;
    console.log(`[TaskWorker] ${data.message}`);
  } catch (error: any) {
    // Registration is optional - worker can still function without it
    console.warn(`[TaskWorker] Could not register with API (will continue): ${error.message}`);
    this.friendlyName = shortIdName();
  }
}
|
||
|
||
/**
 * Tell the registry API this worker is going away.
 *
 * Best-effort: any failure while sending the request is silently dropped,
 * and the sign-off line is only logged when the request did not throw.
 */
private async deregister(): Promise<void> {
  try {
    const endpoint = `${API_BASE_URL}/api/worker-registry/deregister`;
    const signOff = JSON.stringify({ worker_id: this.workerId });

    await fetch(endpoint, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: signOff
    });

    console.log(`[TaskWorker] ${this.friendlyName} signed off`);
  } catch {
    // Deliberately ignored - the worker is shutting down regardless.
  }
}
|
||
|
||
/**
 * Post one heartbeat to the registry API.
 *
 * The payload carries the active task IDs, the worker-level and per-task
 * step info, and a `resources` section (memory, CPU, proxy location and
 * backoff state). Any failure while gathering or sending is ignored so a
 * missed heartbeat never disturbs task execution.
 */
private async sendRegistryHeartbeat(): Promise<void> {
  const bytesToMb = (bytes: number): number => Math.round(bytes / 1024 / 1024);
  // process.cpuUsage() reports microseconds; the payload fields are milliseconds.
  const microsToMs = (micros: number): number => Math.round(micros / 1000);

  try {
    const memUsage = process.memoryUsage();
    const cpuUsage = process.cpuUsage();
    const proxyLocation = this.crawlRotator.getProxyLocation();
    const resourceStats = this.getResourceStats();

    // IDs of every task currently in flight
    const activeTaskIds = Array.from(this.activeTasks.keys());

    // Per-task step info for all active tasks
    const taskSteps = this.getTaskStepsInfo();

    const resources = {
      memory_mb: bytesToMb(memUsage.heapUsed),
      memory_total_mb: bytesToMb(memUsage.heapTotal),
      memory_rss_mb: bytesToMb(memUsage.rss),
      memory_percent: Math.round(resourceStats.memoryPercent * 100),
      cpu_user_ms: microsToMs(cpuUsage.user),
      cpu_system_ms: microsToMs(cpuUsage.system),
      cpu_percent: Math.round(resourceStats.cpuPercent),
      proxy_location: proxyLocation,
      is_backing_off: this.isBackingOff,
      backoff_reason: this.backoffReason,
    };

    const heartbeat = {
      worker_id: this.workerId,
      current_task_id: activeTaskIds[0] || null, // Primary task for backwards compat
      current_task_ids: activeTaskIds, // All active tasks
      active_task_count: this.activeTasks.size,
      max_concurrent_tasks: this.maxConcurrentTasks,
      status: this.activeTasks.size > 0 ? 'active' : 'idle',
      // Step tracking for dashboard visibility
      current_step: this.currentStep,
      current_step_detail: this.currentStepDetail,
      current_step_started_at: this.currentStepStartedAt?.toISOString() || null,
      task_steps: taskSteps, // Per-task step info for concurrent workers
      resources,
    };

    await fetch(`${API_BASE_URL}/api/worker-registry/heartbeat`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(heartbeat)
    });
  } catch {
    // Heartbeats are best-effort; failures are ignored.
  }
}
|
||
|
||
/**
 * Notify the registry API that a task finished.
 *
 * @param success Forwarded as the `success` field of the request body.
 *
 * Best-effort: request failures are ignored.
 */
private async reportTaskCompletion(success: boolean): Promise<void> {
  const completionReport = JSON.stringify({
    worker_id: this.workerId,
    success
  });

  try {
    await fetch(`${API_BASE_URL}/api/worker-registry/task-completed`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: completionReport
    });
  } catch {
    // Reporting is optional; swallow the failure.
  }
}
|
||
|
||
/**
 * Arm the timer that sends a registry heartbeat every HEARTBEAT_INTERVAL_MS.
 * The handle is kept in `registryHeartbeatInterval` so it can be cleared later.
 */
private startRegistryHeartbeat(): void {
  const sendBeat = async (): Promise<void> => {
    await this.sendRegistryHeartbeat();
  };

  this.registryHeartbeatInterval = setInterval(sendBeat, HEARTBEAT_INTERVAL_MS);
}
|
||
|
||
/**
 * Cancel the registry heartbeat timer, if one is armed, and forget its handle.
 */
private stopRegistryHeartbeat(): void {
  const timer = this.registryHeartbeatInterval;
  if (!timer) {
    return; // nothing armed
  }

  clearInterval(timer);
  this.registryHeartbeatInterval = null;
}
|
||
|
||
/**
 * Run a single stale-task sweep.
 *
 * Delegates to `taskService.cleanupStaleTasks` with a 30-minute threshold and
 * logs how many tasks were cleaned when the count is non-zero. Errors are
 * logged and swallowed so a failed sweep never stops the worker.
 */
private async runStaleTaskCleanup(): Promise<void> {
  const STALE_THRESHOLD_MINUTES = 30;

  try {
    console.log(`[TaskWorker] ${this.friendlyName} running stale task cleanup...`);

    const sweep = await taskService.cleanupStaleTasks(STALE_THRESHOLD_MINUTES);
    const cleanedCount = sweep.cleaned;

    if (cleanedCount > 0) {
      console.log(`[TaskWorker] Cleaned up ${cleanedCount} stale tasks`);
    }
  } catch (err: any) {
    console.error(`[TaskWorker] Stale task cleanup error:`, err.message);
  }
}
|
||
|
||
/**
 * Arm the timer that repeats the stale-task sweep every 10 minutes.
 *
 * `start()` calls this unconditionally, so every worker runs the sweep
 * (not just a single designated one).
 */
private startPeriodicStaleCleanup(): void {
  const STALE_CLEANUP_INTERVAL_MS = 10 * 60 * 1000; // 10 minutes

  const sweep = async (): Promise<void> => {
    await this.runStaleTaskCleanup();
  };

  this.staleCleanupInterval = setInterval(sweep, STALE_CLEANUP_INTERVAL_MS);
  console.log(`[TaskWorker] ${this.friendlyName} started periodic stale cleanup (every 10 min)`);
}
|
||
|
||
/**
 * Cancel the periodic stale-task sweep, if one is armed, and forget its handle.
 */
private stopPeriodicStaleCleanup(): void {
  const timer = this.staleCleanupInterval;
  if (!timer) {
    return; // nothing armed
  }

  clearInterval(timer);
  this.staleCleanupInterval = null;
}
|
||
|
||
/**
 * Run the worker until `isRunning` is cleared.
 *
 * Startup order: register, start registry heartbeats, run one stale-task
 * sweep, arm the periodic sweep, then poll `mainLoop()` repeatedly.
 * No proxy or preflight work happens here; `mainLoop()` triggers that lazily
 * before the first task claim, so the worker is visible to the registry even
 * when proxies are not ready.
 *
 * Once the loop exits, any still-running tasks are awaited before returning.
 */
async start(): Promise<void> {
  this.isRunning = true;

  // register() handles its own failures, so this cannot abort startup.
  await this.register();

  // Heartbeats begin right away, before any task is claimed.
  this.startRegistryHeartbeat();

  // Every worker sweeps stale tasks: once now, then on a 10-minute timer.
  // NOTE(review): the original comment says the cleanup query relies on
  // SELECT FOR UPDATE SKIP LOCKED to avoid races - confirm in taskService.
  await this.runStaleTaskCleanup();
  this.startPeriodicStaleCleanup();

  const roleMsg = this.role ? `for role: ${this.role}` : '(role-agnostic - any task)';
  console.log(`[TaskWorker] ${this.friendlyName} starting ${roleMsg} (stealth=lazy, max ${this.maxConcurrentTasks} concurrent tasks)`);

  // Polling loop: a failing iteration is logged and followed by a pause.
  while (this.isRunning) {
    try {
      await this.mainLoop();
    } catch (error: any) {
      console.error(`[TaskWorker] Loop error:`, error.message);
      await this.sleep(POLL_INTERVAL_MS);
    }
  }

  // Drain: let in-flight tasks settle before reporting the worker as stopped.
  const stillRunning = this.taskPromises.size;
  if (stillRunning > 0) {
    console.log(`[TaskWorker] Waiting for ${stillRunning} active tasks to complete...`);
    await Promise.allSettled(this.taskPromises.values());
  }

  console.log(`[TaskWorker] Worker ${this.workerId} stopped`);
}
|
||
|
||
/**
 * One iteration of the worker loop: claim at most one task, or wait.
 *
 * Gates, in order:
 *  1. Resource backoff - sleep BACKOFF_DURATION_MS while `shouldBackOff()` says so.
 *  2. Proxy refresh - `crawlRotator.reloadIfStale()` once stealth is initialized.
 *  3. Decommission - clear `isRunning` and return when flagged.
 *  4. Capacity - sleep POLL_INTERVAL_MS when `canAcceptMoreTasks()` is false.
 *  5. Lazy stealth init - first claim attempt initializes proxies/preflights.
 *  6. HTTP preflight - the worker may not claim anything until it has passed.
 *  7. Working hours - respect the slot limit from `getEffectiveMaxTasks()`.
 *
 * A claimed task is started in the background and tracked in `activeTasks` /
 * `taskPromises`; both entries are removed when the task settles. The method
 * returns immediately after a claim so the caller can try for another task
 * without waiting for the poll interval.
 */
private async mainLoop(): Promise<void> {
  // ---- 1. Resource backoff -------------------------------------------------
  const { backoff, reason } = this.shouldBackOff();
  if (backoff) {
    if (!this.isBackingOff) {
      // Log only on the transition into backoff, not on every iteration.
      console.log(`[TaskWorker] ${this.friendlyName} backing off: ${reason}`);
    }
    this.isBackingOff = true;
    this.backoffReason = reason;
    await this.sleep(BACKOFF_DURATION_MS);
    return;
  }

  // Clear backoff state
  if (this.isBackingOff) {
    console.log(`[TaskWorker] ${this.friendlyName} resuming normal operation`);
    this.isBackingOff = false;
    this.backoffReason = null;
  }

  // ---- 2. Proxy refresh ----------------------------------------------------
  // Pick up proxy changes (new or disabled proxies). reloadIfStale() is
  // expected to rate-limit itself - NOTE(review): confirm in CrawlRotator.
  if (this.stealthInitialized) {
    await this.crawlRotator.reloadIfStale();
  }

  // ---- 3. Decommission -----------------------------------------------------
  const shouldDecommission = await this.checkDecommission();
  if (shouldDecommission) {
    console.log(`[TaskWorker] ${this.friendlyName} received decommission signal - waiting for ${this.activeTasks.size} tasks to complete`);
    // Stop accepting new tasks; start() waits for the running ones.
    this.isRunning = false;
    return;
  }

  // ---- 4. Capacity ---------------------------------------------------------
  if (!this.canAcceptMoreTasks()) {
    // At capacity - wait before next poll
    await this.sleep(POLL_INTERVAL_MS);
    return;
  }

  // ---- 5. Lazy stealth initialization --------------------------------------
  // Workers start immediately and init proxies only when needed.
  if (!this.stealthInitialized) {
    const initSuccess = await this.ensureStealthInitialized();
    if (!initSuccess) {
      // Init failed - wait and retry next loop
      console.log(`[TaskWorker] ${this.friendlyName} stealth init failed, waiting before retry...`);
      await this.sleep(30000);
      return;
    }
  }

  // ---- 6. HTTP preflight gate ----------------------------------------------
  // The worker cannot claim ANY task until the HTTP preflight has passed.
  if (!this.preflightHttpPassed) {
    console.log(`[TaskWorker] ${this.friendlyName} BLOCKED - HTTP preflight not passed, cannot claim tasks`);
    await this.retryPreflightUntilPass();
    return; // Re-check everything on the next iteration
  }

  // ---- 7. Working hours gate -----------------------------------------------
  // The concurrent-task limit is scaled by the current hour's weight.
  const { effectiveMax, reason: capacityReason } = await this.getEffectiveMaxTasks();
  if (this.activeTasks.size >= effectiveMax) {
    const sleepMs = 10000 + Math.random() * 20000; // 10-30 seconds
    if (this.activeTasks.size > 0) {
      // Only log if we're actually throttled (have tasks but can't take more)
      console.log(`[TaskWorker] ${this.friendlyName} AT CAPACITY - ${capacityReason} (${this.activeTasks.size}/${effectiveMax} active)`);
    }
    await this.sleep(sleepMs);
    return; // Re-check on the next iteration
  }

  // ---- Claim ---------------------------------------------------------------
  // Pass preflight capabilities so only compatible tasks are claimed.
  const task = await taskService.claimTask(
    this.role,
    this.workerId,
    this.preflightCurlPassed,
    this.preflightHttpPassed
  );

  if (!task) {
    // No task claimed - wait before next poll
    await this.sleep(POLL_INTERVAL_MS);
    return;
  }

  console.log(`[TaskWorker] ${this.friendlyName} claimed task ${task.id} (${task.role}) [${this.activeTasks.size + 1}/${this.maxConcurrentTasks}]`);

  // ---- Per-task preflight check --------------------------------------------
  // Verify the stored preflight result for the transport this task needs.
  const taskMethod = task.method || 'http'; // Default to http if not specified
  let preflightPassed = false;
  let preflightMsg = '';

  if (taskMethod === 'http' && this.preflightHttpPassed) {
    preflightPassed = true;
    preflightMsg = `HTTP preflight passed (IP: ${this.preflightHttpResult?.proxyIp || 'unknown'})`;
  } else if (taskMethod === 'curl' && this.preflightCurlPassed) {
    preflightPassed = true;
    preflightMsg = `CURL preflight passed (IP: ${this.preflightCurlResult?.proxyIp || 'unknown'})`;
  } else if (!task.method && (this.preflightHttpPassed || this.preflightCurlPassed)) {
    // No method preference - either transport works
    preflightPassed = true;
    preflightMsg = this.preflightHttpPassed ? 'HTTP preflight passed' : 'CURL preflight passed';
  }

  if (!preflightPassed) {
    const errorMsg = taskMethod === 'http'
      ? 'HTTP preflight not passed - cannot execute http tasks'
      : 'CURL preflight not passed - cannot execute curl tasks';
    console.log(`[TaskWorker] ${this.friendlyName} PREFLIGHT FAILED for task ${task.id}: ${errorMsg}`);
    console.log(`[TaskWorker] Releasing task ${task.id} back to pending - worker cannot proceed without preflight`);

    // Release task back to pending so another worker can pick it up
    await taskService.releaseTask(task.id);

    // Wait before trying again - give proxies time to recover
    await this.sleep(30000);
    return;
  }

  console.log(`[TaskWorker] ${this.friendlyName} preflight verified for task ${task.id}: ${preflightMsg}`);

  // ---- Launch --------------------------------------------------------------
  this.activeTasks.set(task.id, task);

  // Start task in background (don't await)
  const taskPromise = this.executeTask(task);
  this.taskPromises.set(task.id, taskPromise);

  // Clean up when done. `.finally()` returns a NEW promise that rejects if
  // the task rejected; without the trailing `.catch()` that rejection would
  // be unhandled, which terminates the process on current Node versions.
  void taskPromise
    .finally(() => {
      this.activeTasks.delete(task.id);
      this.taskPromises.delete(task.id);
    })
    .catch((err: unknown) => {
      const message = err instanceof Error ? err.message : String(err);
      console.error(`[TaskWorker] ${this.friendlyName} task ${task.id} rejected outside its own error handling:`, message);
    });

  // Return without sleeping so the next iteration can claim another task.
}
|
||
|
||
/**
 * Stop the worker.
 *
 * Clears `isRunning`, cancels all three timers (task heartbeat, registry
 * heartbeat, stale-task sweep) and deregisters from the registry API.
 * It does not itself wait for running tasks.
 */
async stop(): Promise<void> {
  // Flip the flag first so the polling loop in start() exits at its next check.
  this.isRunning = false;

  // Cancel every timer this worker may have armed, in the original order.
  const timerStoppers: Array<() => void> = [
    () => this.stopHeartbeat(),
    () => this.stopRegistryHeartbeat(),
    () => this.stopPeriodicStaleCleanup(),
  ];
  for (const stopTimer of timerStoppers) {
    stopTimer();
  }

  await this.deregister();
  console.log(`[TaskWorker] ${this.friendlyName} stopped`);
}
|
||
|
||
/**
 * Execute a single task (runs concurrently with other tasks).
 *
 * Marks the task as running, resolves its handler via `getHandlerForTask`,
 * runs it with a `TaskContext`, then records the outcome through
 * `taskService` and the registry. After a successful task it asks
 * `taskService.chainNextTask` for a follow-up.
 *
 * Error handling:
 *  - A handler result with `success === false`, or anything thrown before the
 *    task was marked completed, marks the task as failed.
 *  - A failure while chaining the follow-up is only logged: the task itself
 *    already completed and must not be turned into a failure.
 *  - A failure while recording the failure is also only logged, so this
 *    method never rejects.
 *
 * @param task The claimed task to run.
 */
private async executeTask(task: WorkerTask): Promise<void> {
  console.log(`[TaskWorker] ${this.friendlyName} starting task ${task.id} (${task.role}) for dispensary ${task.dispensary_id || 'N/A'}`);

  const describe = (err: unknown): string => (err instanceof Error ? err.message : String(err));

  try {
    // Mark as running
    await taskService.startTask(task.id);

    // Get handler for this role (considers method for dual-transport)
    const handler = getHandlerForTask(task);
    if (!handler) {
      throw new Error(`No handler registered for role: ${task.role}`);
    }

    // Create context with step tracking
    const ctx: TaskContext = {
      pool: this.pool,
      workerId: this.workerId,
      task,
      heartbeat: async () => {
        await taskService.heartbeat(task.id);
      },
      crawlRotator: this.crawlRotator,
      updateStep: (step: string, detail?: string) => {
        this.updateTaskStep(task.id, step, detail);
      },
    };

    // Initialize step tracking for this task
    this.updateTaskStep(task.id, 'starting', `Initializing ${task.role}`);

    // Execute the task
    const result = await handler(ctx);

    // The handler is done either way, so step tracking is no longer needed.
    this.clearTaskStep(task.id);

    if (!result.success) {
      // Mark as failed
      await taskService.failTask(task.id, result.error || 'Unknown error');
      await this.reportTaskCompletion(false);
      console.log(`[TaskWorker] ${this.friendlyName} failed task ${task.id}: ${result.error}`);
      return;
    }

    // Mark as completed
    await taskService.completeTask(task.id, result);
    await this.reportTaskCompletion(true);
    console.log(`[TaskWorker] ${this.friendlyName} completed task ${task.id} [${this.activeTasks.size}/${this.maxConcurrentTasks} active]`);

    // Chain next task if applicable. Isolated in its own try/catch: the task
    // is already recorded as completed, so a chaining error must not reach
    // the outer catch and flip it to failed.
    try {
      const chainedTask = await taskService.chainNextTask({
        ...task,
        status: 'completed',
        result,
      });
      if (chainedTask) {
        console.log(`[TaskWorker] Chained new task ${chainedTask.id} (${chainedTask.role})`);
      }
    } catch (chainError: unknown) {
      console.error(`[TaskWorker] ${this.friendlyName} could not chain follow-up for task ${task.id}:`, describe(chainError));
    }
  } catch (error: unknown) {
    const message = describe(error);

    // Clear step tracking
    this.clearTaskStep(task.id);

    console.error(`[TaskWorker] ${this.friendlyName} task ${task.id} error:`, message);

    // Mark as failed. Guarded so that a second failure (e.g. the database is
    // unreachable) cannot turn into a rejected, unobserved promise.
    try {
      await taskService.failTask(task.id, message);
      await this.reportTaskCompletion(false);
    } catch (recordError: unknown) {
      console.error(`[TaskWorker] ${this.friendlyName} could not record failure of task ${task.id}:`, describe(recordError));
    }
  }
  // Note: removal from activeTasks/taskPromises is done by the `.finally()`
  // handler that mainLoop attaches to this method's promise.
}
|
||
|
||
/**
 * Check whether this worker has been flagged for decommission.
 *
 * Reads `decommission_requested` / `decommission_reason` for this worker from
 * the `worker_registry` table.
 *
 * @returns true when the flag is set; false when it is not set, when the
 *          worker has no registry row, or when the lookup fails (the worker
 *          keeps running if it cannot check).
 */
private async checkDecommission(): Promise<boolean> {
  try {
    const lookup = await this.pool.query(
      `SELECT decommission_requested, decommission_reason
       FROM worker_registry
       WHERE worker_id = $1`,
      [this.workerId]
    );

    const hasRow = lookup.rows.length > 0;
    if (!hasRow || !lookup.rows[0].decommission_requested) {
      return false;
    }

    const reason = lookup.rows[0].decommission_reason || 'No reason provided';
    console.log(`[TaskWorker] Decommission requested: ${reason}`);
    return true;
  } catch (error: any) {
    // If we can't check, continue running
    console.warn(`[TaskWorker] Could not check decommission status: ${error.message}`);
    return false;
  }
}
|
||
|
||
/**
 * Arm a timer that heartbeats one task every HEARTBEAT_INTERVAL_MS.
 * A failed heartbeat is logged as a warning and the timer keeps running.
 *
 * @param taskId ID passed to `taskService.heartbeat` on every tick.
 */
private startHeartbeat(taskId: number): void {
  const beat = async (): Promise<void> => {
    try {
      await taskService.heartbeat(taskId);
    } catch (error: any) {
      console.warn(`[TaskWorker] Heartbeat failed:`, error.message);
    }
  };

  this.heartbeatInterval = setInterval(beat, HEARTBEAT_INTERVAL_MS);
}
|
||
|
||
/**
 * Cancel the task heartbeat timer, if one is armed, and forget its handle.
 */
private stopHeartbeat(): void {
  const timer = this.heartbeatInterval;
  if (!timer) {
    return; // nothing armed
  }

  clearInterval(timer);
  this.heartbeatInterval = null;
}
|
||
|
||
/**
 * Resolve after the given delay.
 *
 * @param ms Delay in milliseconds, handed straight to `setTimeout`.
 */
private sleep(ms: number): Promise<void> {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
||
|
||
/**
 * Snapshot of the worker's current state: identity, running flag, active
 * task IDs and count, concurrency limit, backoff state and both preflight
 * results. The task ID array is a fresh copy built from `activeTasks`.
 */
getInfo(): {
  workerId: string;
  role: TaskRole | null;
  isRunning: boolean;
  activeTaskIds: number[];
  activeTaskCount: number;
  maxConcurrentTasks: number;
  isBackingOff: boolean;
  backoffReason: string | null;
  preflightCurlPassed: boolean;
  preflightHttpPassed: boolean;
} {
  const activeTaskIds = Array.from(this.activeTasks.keys());
  const activeTaskCount = this.activeTasks.size;

  const snapshot = {
    workerId: this.workerId,
    role: this.role,
    isRunning: this.isRunning,
    activeTaskIds,
    activeTaskCount,
    maxConcurrentTasks: this.maxConcurrentTasks,
    isBackingOff: this.isBackingOff,
    backoffReason: this.backoffReason,
    preflightCurlPassed: this.preflightCurlPassed,
    preflightHttpPassed: this.preflightHttpPassed,
  };

  return snapshot;
}
|
||
}
|
||
|
||
// ============================================================
|
||
// CLI ENTRY POINT
|
||
// ============================================================
|
||
|
||
/**
 * CLI entry point: validate the environment, build a TaskWorker, wire up
 * signal handling and run the worker until it stops.
 *
 * Environment:
 *  - WORKER_ROLE (optional): restricts the worker to one task role; the
 *    process exits with code 1 if the value is not a known role.
 *  - POD_NAME / WORKER_ID (optional): worker identity, POD_NAME preferred.
 */
async function main(): Promise<void> {
  const role = process.env.WORKER_ROLE as TaskRole | undefined;

  // Per TASK_WORKFLOW_2024-12-10.md: Valid task roles
  const validRoles: TaskRole[] = [
    'store_discovery',
    'entry_point_discovery',
    'product_discovery',
    'payload_fetch', // Fetches from API, saves to disk
    'product_refresh', // Reads from disk, processes to DB
    'analytics_refresh',
  ];

  // A role is optional, but when given it must be one of the known roles.
  if (role && !validRoles.includes(role)) {
    const complaint = [
      `Error: Invalid WORKER_ROLE: ${role}`,
      `Valid roles: ${validRoles.join(', ')}`,
      'Or omit WORKER_ROLE for role-agnostic worker (any task)',
    ];
    for (const line of complaint) {
      console.error(line);
    }
    process.exit(1);
  }

  // POD_NAME gives a stable identity across restarts (e.g. in a K8s
  // StatefulSet); WORKER_ID is the fallback.
  // NOTE(review): when both are unset this is undefined - presumably the
  // TaskWorker constructor generates an ID in that case; confirm there.
  const workerId = process.env.POD_NAME || process.env.WORKER_ID;

  // null means role-agnostic; otherwise the specific role.
  const worker = new TaskWorker(role || null, workerId);

  // Graceful shutdown: log the announcement, then ask the worker to stop.
  const shutdownWith = (announcement: string) => (): void => {
    console.log(announcement);
    worker.stop();
  };
  process.on('SIGTERM', shutdownWith('[TaskWorker] Received SIGTERM, shutting down...'));
  process.on('SIGINT', shutdownWith('[TaskWorker] Received SIGINT, shutting down...'));

  await worker.start();
}
|
||
|
||
// Only start a worker when this file is executed directly, not when it is
// imported by another module.
if (require.main === module) {
  // Anything that escapes main() is fatal: log it and exit non-zero.
  const abortOnFatal = (error: unknown): void => {
    console.error('[TaskWorker] Fatal error:', error);
    process.exit(1);
  };

  main().catch(abortOnFatal);
}

export { main };
|