cannaiq/backend/src/tasks/task-worker.ts
Kelly c215d11a84 feat: Platform isolation, Evomi geo-targeting, proxy management
Platform isolation:
- Rename handlers to {task}-{platform}.ts convention
- Deprecate -curl variants (now _deprecated-*)
- Platform-based routing in task-worker.ts
- Add Jane platform handlers and client

Evomi geo-targeting:
- Add dynamic proxy URL builder with state/city targeting
- Session stickiness per worker per state (30 min)
- Fallback to static proxy table when API unavailable
- Add proxy tracking columns to worker_tasks

Proxy management:
- New /proxies admin page for visibility
- Track proxy_ip, proxy_geo, proxy_source per task
- Show active sessions and task history

Validation filtering:
- Filter by validated stores (platform_dispensary_id + menu_url)
- Mark incomplete stores as deprecated
- Update all dashboard/stats queries

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-13 15:16:48 -07:00
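For illustration only (this sketch is not part of the task-worker.ts source below): the commit describes a dynamic Evomi proxy URL builder with state/city targeting and 30-minute per-worker, per-state session stickiness. A minimal sketch of that idea follows; buildGeoProxyUrl, the EVOMI_* environment variables, and the username parameter syntax are all assumptions, since provider-specific geo/session parameters vary.

const SESSION_TTL_MS = 30 * 60 * 1000; // 30-minute stickiness window (per commit message)

interface StickySession { id: string; expiresAt: number; }

const stickySessions = new Map<string, StickySession>(); // key: `${workerId}:${state}`

function buildGeoProxyUrl(workerId: string, state: string, city?: string): string {
  const key = `${workerId}:${state}`;
  let session = stickySessions.get(key);
  if (!session || session.expiresAt < Date.now()) {
    session = { id: Math.random().toString(36).slice(2, 10), expiresAt: Date.now() + SESSION_TTL_MS };
    stickySessions.set(key, session);
  }
  // Geo targeting and session stickiness are usually encoded in the proxy username;
  // the parameter names below are placeholders, not Evomi's documented syntax.
  const username = [
    process.env.EVOMI_USER,
    'country-us',
    `state-${state.toLowerCase()}`,
    city ? `city-${city.toLowerCase().replace(/\s+/g, '')}` : null,
    `session-${session.id}`,
  ].filter(Boolean).join('-');
  return `http://${username}:${process.env.EVOMI_PASS}@${process.env.EVOMI_HOST}:${process.env.EVOMI_PORT}`;
}

The commit's fallback to the static proxy table when the Evomi API is unavailable would then amount to swapping this URL for a row from the `proxies` table.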

/**
* Task Worker
*
* A unified worker that pulls tasks from the worker_tasks queue.
* Workers register on startup, get a friendly name, and pull tasks.
*
* Architecture:
* - Tasks are generated on schedule (by scheduler or API)
* - Workers PULL tasks from the pool (not assigned to them)
* - Tasks are claimed in order of priority (DESC) then creation time (ASC)
* - Workers report heartbeats to worker_registry
* - Workers are ROLE-AGNOSTIC by default (can handle any task type)
*
* Stealth & Anti-Detection (LAZY INITIALIZATION):
* Workers start IMMEDIATELY without waiting for proxies.
* Stealth systems (proxies, fingerprints, preflights) are initialized
* on first task claim, not at worker startup.
*
* This allows workers to:
* - Register and send heartbeats immediately
* - Wait in main loop without blocking on proxy availability
* - Initialize proxies/preflights only when tasks are actually available
*
* On first task claim attempt, workers initialize the CrawlRotator which provides:
* - Proxy rotation: Loads proxies from `proxies` table, ALL requests use proxy
* - User-Agent rotation: Cycles through realistic browser fingerprints
* - Fingerprint rotation: Changes browser profile on blocks
* - Locale/timezone: Matches Accept-Language to target state
*
* The CrawlRotator is wired to the Dutchie client via setCrawlRotator().
* Task handlers call startSession() which picks a random fingerprint.
* On 403 errors, the client automatically:
* 1. Records failure on current proxy
* 2. Rotates to next proxy
* 3. Rotates fingerprint
* 4. Retries the request
*
* Usage:
* npx tsx src/tasks/task-worker.ts # Role-agnostic (any task)
* WORKER_ROLE=product_refresh npx tsx src/tasks/task-worker.ts # Role-specific
*
* Environment:
* WORKER_ROLE - Which task role to process (optional, null = any task)
* POD_NAME - K8s StatefulSet pod name (PRIMARY - use this for persistent identity)
* WORKER_ID - Custom worker ID (fallback if POD_NAME not set)
* POLL_INTERVAL_MS - How often to check for tasks (default: 5000)
* HEARTBEAT_INTERVAL_MS - How often to update heartbeat (default: 30000)
* API_BASE_URL - Backend API URL for registration (default: http://localhost:3010)
*
* Worker Identity:
* Workers use POD_NAME as their worker_id for persistent identity across restarts.
* In K8s StatefulSet, POD_NAME = "scraper-worker-0" through "scraper-worker-7".
* This ensures workers re-register with the same ID instead of creating new entries.
*/
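/*
 * Conceptual sketch of the 403 handling described above. This is NOT the actual
 * client code (that lives in ../platforms/dutchie); the recordFailure()/rotate()
 * calls below are assumptions about the CrawlRotator surface, shown only to make
 * the retry sequence concrete.
 *
 *   async function fetchWithRotation(doRequest: () => Promise<Response>, rotator: CrawlRotator) {
 *     for (let attempt = 0; attempt < 3; attempt++) {
 *       const res = await doRequest();
 *       if (res.status !== 403) return res;   // not blocked - done
 *       rotator.proxy.recordFailure();        // 1. record failure on current proxy
 *       rotator.proxy.rotate();               // 2. rotate to next proxy
 *       rotator.userAgent.rotate();           // 3. rotate fingerprint
 *     }                                       // 4. loop retries the request
 *     throw new Error('Still blocked after 3 rotation attempts');
 *   }
 */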
import { Pool } from 'pg';
import { v4 as uuidv4 } from 'uuid';
import { taskService, TaskRole, WorkerTask } from './task-service';
import { getPool } from '../db/pool';
import os from 'os';
// Stealth/rotation support
import { CrawlRotator } from '../services/crawl-rotator';
import { setCrawlRotator } from '../platforms/dutchie';
// Dual-transport preflight system
import { runCurlPreflight, CurlPreflightResult } from '../services/curl-preflight';
import { runPuppeteerPreflightWithRetry, PuppeteerPreflightResult } from '../services/puppeteer-preflight';
// Task handlers by role
// Platform-based handlers: {task}-{platform}.ts convention
import { handleProductRefresh } from './handlers/product-refresh';
import { handleStoreDiscoveryState } from './handlers/store-discovery-state';
import { handleEntryPointDiscovery } from './handlers/entry-point-discovery';
import { handleAnalyticsRefresh } from './handlers/analytics-refresh';
import { handleWhoami } from './handlers/whoami';
// Dutchie Platform Handlers
import { handleProductDiscoveryDutchie } from './handlers/product-discovery-dutchie';
import { handleStoreDiscoveryDutchie } from './handlers/store-discovery-dutchie';
// Jane Platform Handlers
import { handleStoreDiscoveryJane } from './handlers/store-discovery-jane';
import { handleEntryPointDiscoveryJane } from './handlers/entry-point-discovery-jane';
import { handleProductDiscoveryJane } from './handlers/product-discovery-jane';
const POLL_INTERVAL_MS = parseInt(process.env.POLL_INTERVAL_MS || '5000');
const HEARTBEAT_INTERVAL_MS = parseInt(process.env.HEARTBEAT_INTERVAL_MS || '30000');
const API_BASE_URL = process.env.API_BASE_URL || 'http://localhost:3010';
// =============================================================================
// CONCURRENT TASK PROCESSING SETTINGS
// =============================================================================
// Workers can process multiple tasks simultaneously using async I/O.
// This improves throughput for I/O-bound tasks (network calls, DB queries).
//
// Resource thresholds trigger "backoff" - the worker stops claiming new tasks
// but continues processing existing ones until resources return to normal.
//
// See: docs/WORKER_TASK_ARCHITECTURE.md#concurrent-task-processing
// =============================================================================
// Maximum number of tasks this worker will run concurrently
// Browser tasks (Puppeteer) use ~400MB RAM each. With 2GB pod limit:
// - 3 browsers = ~1.3GB = SAFE
// - 4 browsers = ~1.7GB = RISKY
// - 5+ browsers = OOM CRASH
// See: docs/WORKER_TASK_ARCHITECTURE.md#browser-task-memory-limits
const MAX_CONCURRENT_TASKS = parseInt(process.env.MAX_CONCURRENT_TASKS || '3');
// When heap memory usage exceeds this threshold (as decimal 0.0-1.0), stop claiming new tasks
// Default 85% - gives headroom before OOM
const MEMORY_BACKOFF_THRESHOLD = parseFloat(process.env.MEMORY_BACKOFF_THRESHOLD || '0.85');
// Parse max heap size from NODE_OPTIONS (--max-old-space-size=1500)
// This is used as the denominator for memory percentage calculation
// V8's heapTotal is dynamic and stays small when idle, causing false high percentages
function getMaxHeapSizeMb(): number {
const nodeOptions = process.env.NODE_OPTIONS || '';
const match = nodeOptions.match(/--max-old-space-size=(\d+)/);
if (match) {
return parseInt(match[1], 10);
}
// Fallback: use 512MB if not specified
return 512;
}
const MAX_HEAP_SIZE_MB = getMaxHeapSizeMb();
// When CPU usage exceeds this threshold (as decimal 0.0-1.0), stop claiming new tasks
// Default 90% - allows some burst capacity
const CPU_BACKOFF_THRESHOLD = parseFloat(process.env.CPU_BACKOFF_THRESHOLD || '0.90');
// How long to wait (ms) when in backoff state before rechecking resources
const BACKOFF_DURATION_MS = parseInt(process.env.BACKOFF_DURATION_MS || '10000');
export interface TaskContext {
pool: Pool;
workerId: string;
task: WorkerTask;
heartbeat: () => Promise<void>;
crawlRotator?: CrawlRotator;
/** Update the current step being executed (shown in dashboard) */
updateStep: (step: string, detail?: string) => void;
}
export interface TaskResult {
success: boolean;
productsProcessed?: number;
snapshotsCreated?: number;
storesDiscovered?: number;
error?: string;
[key: string]: unknown;
}
type TaskHandler = (ctx: TaskContext) => Promise<TaskResult>;
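/*
 * Minimal handler sketch (illustrative only - the real handlers are imported above).
 * It shows how a handler uses the TaskContext: report progress with updateStep(),
 * keep the task alive with heartbeat(), and return a TaskResult. The query is a
 * placeholder, not real handler logic.
 *
 *   const handleExample: TaskHandler = async (ctx) => {
 *     ctx.updateStep('loading', `dispensary ${ctx.task.dispensary_id ?? 'n/a'}`);
 *     await ctx.heartbeat();                      // keeps the task from going stale
 *     const { rows } = await ctx.pool.query('SELECT 1 AS ok');
 *     ctx.updateStep('processing');
 *     return { success: rows[0].ok === 1, productsProcessed: 0 };
 *   };
 */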
// Per TASK_WORKFLOW_2024-12-10.md: Handler registry
// Platform-agnostic handlers (shared across Dutchie and Jane)
// product_refresh: Reads local payload, uses platform-aware normalizer, upserts to DB
const SHARED_HANDLERS: Partial<Record<TaskRole, TaskHandler>> = {
product_refresh: handleProductRefresh,
store_discovery_state: handleStoreDiscoveryState,
entry_point_discovery: handleEntryPointDiscovery,
analytics_refresh: handleAnalyticsRefresh,
whoami: handleWhoami,
};
/**
* Get the appropriate handler for a task based on platform.
*
* Naming convention: {task}-{platform}.ts
* - product-discovery-dutchie.ts
* - product-discovery-jane.ts
* - store-discovery-dutchie.ts
* - store-discovery-jane.ts
*
* All handlers use HTTP/Puppeteer transport (curl transport is deprecated).
*/
function getHandlerForTask(task: WorkerTask): TaskHandler | undefined {
const role = task.role as TaskRole;
const platform = task.platform || 'dutchie';
// ==========================================================================
// JANE PLATFORM ROUTING
// ==========================================================================
if (platform === 'jane') {
if (role === 'store_discovery' || role === 'store_discovery_state') {
console.log(`[TaskWorker] Using Jane handler for ${role}`);
return handleStoreDiscoveryJane;
}
if (role === 'entry_point_discovery') {
console.log(`[TaskWorker] Using Jane handler for entry_point_discovery`);
return handleEntryPointDiscoveryJane;
}
if (role === 'product_discovery') {
console.log(`[TaskWorker] Using Jane handler for product_discovery`);
return handleProductDiscoveryJane;
}
}
// ==========================================================================
// DUTCHIE PLATFORM ROUTING (default)
// ==========================================================================
if (role === 'product_discovery') {
console.log(`[TaskWorker] Using Dutchie handler for product_discovery`);
return handleProductDiscoveryDutchie;
}
if (role === 'store_discovery') {
console.log(`[TaskWorker] Using Dutchie handler for store_discovery`);
return handleStoreDiscoveryDutchie;
}
// ==========================================================================
// SHARED HANDLERS (platform-agnostic)
// ==========================================================================
return SHARED_HANDLERS[role];
}
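/*
 * Routing examples (derived from the logic above):
 *
 *   getHandlerForTask({ role: 'product_discovery', platform: 'jane' } as WorkerTask)
 *     // -> handleProductDiscoveryJane
 *   getHandlerForTask({ role: 'product_discovery' } as WorkerTask)
 *     // -> handleProductDiscoveryDutchie (platform defaults to 'dutchie')
 *   getHandlerForTask({ role: 'product_refresh', platform: 'jane' } as WorkerTask)
 *     // -> handleProductRefresh (shared, platform-agnostic)
 */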
/**
* Resource usage stats reported to the registry and used for backoff decisions.
* These values are included in worker heartbeats and displayed in the UI.
*/
interface ResourceStats {
/** Current heap memory usage as decimal (0.0 to 1.0) */
memoryPercent: number;
/** Current heap used in MB */
memoryMb: number;
/** Total heap available in MB */
memoryTotalMb: number;
/** CPU usage percentage since last check (0 to 100) */
cpuPercent: number;
/** True if worker is currently in backoff state */
isBackingOff: boolean;
/** Reason for backoff (e.g., "Memory at 87.3% (threshold: 85%)") */
backoffReason: string | null;
}
export class TaskWorker {
private pool: Pool;
private workerId: string;
private role: TaskRole | null; // null = role-agnostic (any task)
private friendlyName: string = '';
private isRunning: boolean = false;
private heartbeatInterval: NodeJS.Timeout | null = null;
private registryHeartbeatInterval: NodeJS.Timeout | null = null;
private staleCleanupInterval: NodeJS.Timeout | null = null;
private crawlRotator: CrawlRotator;
// ==========================================================================
// CONCURRENT TASK TRACKING
// ==========================================================================
// activeTasks: Map of task ID -> task object for all currently running tasks
// taskPromises: Map of task ID -> Promise for cleanup when task completes
// maxConcurrentTasks: How many tasks this worker will run in parallel
// ==========================================================================
private activeTasks: Map<number, WorkerTask> = new Map();
private taskPromises: Map<number, Promise<void>> = new Map();
private maxConcurrentTasks: number = MAX_CONCURRENT_TASKS;
// ==========================================================================
// RESOURCE MONITORING FOR BACKOFF
// ==========================================================================
// CPU tracking uses differential measurement - we track last values and
// calculate percentage based on elapsed time since last check.
// ==========================================================================
private lastCpuUsage: { user: number; system: number } = { user: 0, system: 0 };
private lastCpuCheck: number = Date.now();
private isBackingOff: boolean = false;
private backoffReason: string | null = null;
// ==========================================================================
// DUAL-TRANSPORT PREFLIGHT STATUS
// ==========================================================================
// Workers run BOTH preflights on startup:
// - curl: axios/proxy transport - fast, for simple API calls
// - http: Puppeteer/browser transport - anti-detect, for Dutchie GraphQL
//
// Task claiming checks method compatibility - worker must have passed
// the preflight for the task's required method.
// ==========================================================================
private preflightCurlPassed: boolean = false;
private preflightHttpPassed: boolean = false;
private preflightCurlResult: CurlPreflightResult | null = null;
private preflightHttpResult: PuppeteerPreflightResult | null = null;
// ==========================================================================
// LAZY INITIALIZATION FLAGS
// ==========================================================================
// Stealth/proxy initialization is deferred until first task claim.
// Workers register immediately and enter main loop without blocking.
// ==========================================================================
private stealthInitialized: boolean = false;
private preflightsCompleted: boolean = false;
private initializingPromise: Promise<void> | null = null;
// ==========================================================================
// PREFLIGHT RETRY SETTINGS
// ==========================================================================
// If preflight fails, worker retries every PREFLIGHT_RETRY_INTERVAL_MS
// Worker is BLOCKED from claiming ANY tasks until preflight passes.
// This ensures unqualified workers never touch the task pool.
// ==========================================================================
private static readonly PREFLIGHT_RETRY_INTERVAL_MS = 30000; // 30 seconds
private isRetryingPreflight: boolean = false;
// ==========================================================================
// STEP TRACKING FOR DASHBOARD VISIBILITY
// ==========================================================================
// Workers report their current step in heartbeats so the dashboard can show
// real-time progress like "preflight", "loading page", "processing products"
// ==========================================================================
private currentStep: string = 'idle';
private currentStepDetail: string | null = null;
private currentStepStartedAt: Date | null = null;
/** Map of task ID -> step info for concurrent tasks */
private taskSteps: Map<number, { step: string; detail: string | null; startedAt: Date }> = new Map();
constructor(role: TaskRole | null = null, workerId?: string) {
this.pool = getPool();
this.role = role;
this.workerId = workerId || `worker-${uuidv4().slice(0, 8)}`;
this.crawlRotator = new CrawlRotator(this.pool);
// Initialize CPU tracking
const cpuUsage = process.cpuUsage();
this.lastCpuUsage = { user: cpuUsage.user, system: cpuUsage.system };
this.lastCpuCheck = Date.now();
}
/**
* Get current resource usage
* Memory percentage is calculated against MAX_HEAP_SIZE_MB (from --max-old-space-size)
* NOT against V8's dynamic heapTotal which stays small when idle
*/
private getResourceStats(): ResourceStats {
const memUsage = process.memoryUsage();
const heapUsedMb = memUsage.heapUsed / 1024 / 1024;
// Use MAX_HEAP_SIZE_MB as ceiling, not dynamic heapTotal
// V8's heapTotal stays small when idle (e.g., 36MB) causing false 95%+ readings
// With --max-old-space-size=1500, we should calculate against 1500MB
const memoryPercent = heapUsedMb / MAX_HEAP_SIZE_MB;
// Calculate CPU usage since last check
const cpuUsage = process.cpuUsage();
const now = Date.now();
const elapsed = now - this.lastCpuCheck;
let cpuPercent = 0;
if (elapsed > 0) {
const userDiff = (cpuUsage.user - this.lastCpuUsage.user) / 1000; // microseconds to ms
const systemDiff = (cpuUsage.system - this.lastCpuUsage.system) / 1000;
cpuPercent = ((userDiff + systemDiff) / elapsed) * 100;
}
// Update last values
this.lastCpuUsage = { user: cpuUsage.user, system: cpuUsage.system };
this.lastCpuCheck = now;
return {
memoryPercent,
memoryMb: Math.round(heapUsedMb),
memoryTotalMb: MAX_HEAP_SIZE_MB, // Use max-old-space-size, not dynamic heapTotal
cpuPercent: Math.min(100, cpuPercent), // Cap at 100%
isBackingOff: this.isBackingOff,
backoffReason: this.backoffReason,
};
}
/**
* Check if we should back off from taking new tasks
*/
private shouldBackOff(): { backoff: boolean; reason: string | null } {
const stats = this.getResourceStats();
if (stats.memoryPercent > MEMORY_BACKOFF_THRESHOLD) {
return { backoff: true, reason: `Memory at ${(stats.memoryPercent * 100).toFixed(1)}% (threshold: ${MEMORY_BACKOFF_THRESHOLD * 100}%)` };
}
if (stats.cpuPercent > CPU_BACKOFF_THRESHOLD * 100) {
return { backoff: true, reason: `CPU at ${stats.cpuPercent.toFixed(1)}% (threshold: ${CPU_BACKOFF_THRESHOLD * 100}%)` };
}
return { backoff: false, reason: null };
}
/**
* Get count of currently running tasks
*/
get activeTaskCount(): number {
return this.activeTasks.size;
}
/**
* Check if we can accept more tasks
*/
private canAcceptMoreTasks(): boolean {
return this.activeTasks.size < this.maxConcurrentTasks;
}
// ==========================================================================
// STEP TRACKING METHODS
// ==========================================================================
/**
* Update the current step for a task (for dashboard visibility)
* @param taskId - The task ID to update
* @param step - Short step name (e.g., "preflight", "loading", "processing")
* @param detail - Optional detail (e.g., "Verifying IP 1.2.3.4")
*/
public updateTaskStep(taskId: number, step: string, detail?: string): void {
this.taskSteps.set(taskId, {
step,
detail: detail || null,
startedAt: new Date(),
});
// Also update the "primary" step for single-task backwards compat
if (this.activeTasks.size === 1 || taskId === Array.from(this.activeTasks.keys())[0]) {
this.currentStep = step;
this.currentStepDetail = detail || null;
this.currentStepStartedAt = new Date();
}
console.log(`[TaskWorker] Step: ${step}${detail ? ` - ${detail}` : ''} (task #${taskId})`);
}
/**
* Clear step tracking for a task (when task completes)
*/
private clearTaskStep(taskId: number): void {
this.taskSteps.delete(taskId);
// Reset primary step if no more active tasks
if (this.activeTasks.size === 0) {
this.currentStep = 'idle';
this.currentStepDetail = null;
this.currentStepStartedAt = null;
}
}
/**
* Get current step info for all active tasks (for heartbeat)
*/
private getTaskStepsInfo(): Array<{
task_id: number;
step: string;
detail: string | null;
elapsed_ms: number;
}> {
const now = Date.now();
return Array.from(this.taskSteps.entries()).map(([taskId, info]) => ({
task_id: taskId,
step: info.step,
detail: info.detail,
elapsed_ms: now - info.startedAt.getTime(),
}));
}
/**
* Initialize stealth systems (proxy rotation, fingerprints)
* Called LAZILY on first task claim attempt (NOT at worker startup).
*
* IMPORTANT: Proxies are REQUIRED to claim tasks. This method waits until proxies are available.
* Workers listen for PostgreSQL NOTIFY 'proxy_added' to wake up immediately when proxies are added.
*/
private async initializeStealth(): Promise<void> {
const MAX_WAIT_MINUTES = 60;
const POLL_INTERVAL_MS = 30000; // 30 seconds fallback polling
const maxAttempts = (MAX_WAIT_MINUTES * 60 * 1000) / POLL_INTERVAL_MS;
let attempts = 0;
let notifyClient: any = null;
// Set up PostgreSQL LISTEN for proxy notifications
try {
notifyClient = await this.pool.connect();
await notifyClient.query('LISTEN proxy_added');
console.log(`[TaskWorker] Listening for proxy_added notifications...`);
} catch (err: any) {
console.log(`[TaskWorker] Could not set up LISTEN (will poll): ${err.message}`);
}
// Create a promise that resolves when notified
let notifyResolve: (() => void) | null = null;
if (notifyClient) {
notifyClient.on('notification', (msg: any) => {
if (msg.channel === 'proxy_added') {
console.log(`[TaskWorker] Received proxy_added notification!`);
if (notifyResolve) notifyResolve();
}
});
}
try {
while (attempts < maxAttempts) {
try {
// Load proxies from database
await this.crawlRotator.initialize();
const stats = this.crawlRotator.proxy.getStats();
if (stats.activeProxies > 0) {
console.log(`[TaskWorker] Loaded ${stats.activeProxies} proxies (${stats.avgSuccessRate.toFixed(1)}% avg success rate)`);
// Wire rotator to Dutchie client - proxies will be used for ALL requests
setCrawlRotator(this.crawlRotator);
console.log(`[TaskWorker] Stealth initialized: ${this.crawlRotator.userAgent.getCount()} fingerprints, proxy REQUIRED for all requests`);
return;
}
attempts++;
console.log(`[TaskWorker] No active proxies available (attempt ${attempts}). Waiting for proxies...`);
// Wait for either notification or timeout
await new Promise<void>((resolve) => {
notifyResolve = resolve;
setTimeout(resolve, POLL_INTERVAL_MS);
});
} catch (error: any) {
attempts++;
console.log(`[TaskWorker] Error loading proxies (attempt ${attempts}): ${error.message}. Retrying...`);
await this.sleep(POLL_INTERVAL_MS);
}
}
throw new Error(`No active proxies available after waiting ${MAX_WAIT_MINUTES} minutes. Add proxies to the database.`);
} finally {
// Clean up LISTEN connection
if (notifyClient) {
try {
await notifyClient.query('UNLISTEN proxy_added');
notifyClient.release();
} catch {
// Ignore cleanup errors
}
}
}
}
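/*
 * Producer-side sketch (an assumption - the proxy-import/admin code is not in this
 * file, and the `proxies` column names below are illustrative): waking idle workers
 * only requires a NOTIFY on the channel this method LISTENs on after new proxies
 * are inserted.
 *
 *   await pool.query(
 *     `INSERT INTO proxies (host, port, username, password, is_active)
 *      VALUES ($1, $2, $3, $4, true)`,
 *     [host, port, username, password],
 *   );
 *   await pool.query('NOTIFY proxy_added');
 */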
/**
* Run dual-transport preflights on startup
* Tests both curl (axios/proxy) and http (Puppeteer/browser) transport methods.
* Results are reported to worker_registry and used for task claiming.
*
* NOTE: All current tasks require 'http' method, so http preflight must pass
* for the worker to claim any tasks. Curl preflight is for future use.
*/
private async runDualPreflights(): Promise<void> {
console.log(`[TaskWorker] Running dual-transport preflights...`);
// Run both preflights in parallel for efficiency
const [curlResult, httpResult] = await Promise.all([
runCurlPreflight(this.crawlRotator).catch((err): CurlPreflightResult => ({
method: 'curl',
passed: false,
proxyAvailable: false,
proxyConnected: false,
antidetectReady: false,
proxyIp: null,
fingerprint: null,
error: `Preflight error: ${err.message}`,
responseTimeMs: null,
})),
runPuppeteerPreflightWithRetry(this.crawlRotator, 1).catch((err): PuppeteerPreflightResult => ({
method: 'http',
passed: false,
proxyAvailable: false,
proxyConnected: false,
antidetectReady: false,
proxyIp: null,
fingerprint: null,
error: `Preflight error: ${err.message}`,
responseTimeMs: null,
productsReturned: 0,
})),
]);
// Store results
this.preflightCurlResult = curlResult;
this.preflightHttpResult = httpResult;
this.preflightCurlPassed = curlResult.passed;
this.preflightHttpPassed = httpResult.passed;
// Log results
console.log(`[TaskWorker] CURL preflight: ${curlResult.passed ? 'PASSED' : 'FAILED'}${curlResult.error ? ` - ${curlResult.error}` : ''}`);
console.log(`[TaskWorker] HTTP preflight: ${httpResult.passed ? 'PASSED' : 'FAILED'}${httpResult.error ? ` - ${httpResult.error}` : ''}`);
if (httpResult.passed && httpResult.productsReturned) {
console.log(`[TaskWorker] HTTP preflight returned ${httpResult.productsReturned} products from test store`);
}
// Report to worker_registry via API
await this.reportPreflightStatus();
// Since all tasks require 'http', warn if http preflight failed
if (!this.preflightHttpPassed) {
console.warn(`[TaskWorker] WARNING: HTTP preflight failed - this worker cannot claim any tasks!`);
console.warn(`[TaskWorker] Error: ${httpResult.error}`);
}
}
/**
* Report preflight status to worker_registry
* Function signature: update_worker_preflight(worker_id, transport, status, ip, response_ms, error, fingerprint)
*/
private async reportPreflightStatus(): Promise<void> {
try {
// Update worker_registry directly via SQL (more reliable than API)
// CURL preflight - includes IP address
await this.pool.query(`
SELECT update_worker_preflight($1, 'curl', $2, $3, $4, $5, $6)
`, [
this.workerId,
this.preflightCurlPassed ? 'passed' : 'failed',
this.preflightCurlResult?.proxyIp || null,
this.preflightCurlResult?.responseTimeMs || null,
this.preflightCurlResult?.error || null,
null, // No fingerprint for curl
]);
// HTTP preflight - includes IP, fingerprint, and timezone data
const httpFingerprint = this.preflightHttpResult ? {
...this.preflightHttpResult.fingerprint,
detectedTimezone: (this.preflightHttpResult as any).detectedTimezone,
detectedLocation: (this.preflightHttpResult as any).detectedLocation,
productsReturned: this.preflightHttpResult.productsReturned,
botDetection: (this.preflightHttpResult as any).botDetection,
} : null;
await this.pool.query(`
SELECT update_worker_preflight($1, 'http', $2, $3, $4, $5, $6)
`, [
this.workerId,
this.preflightHttpPassed ? 'passed' : 'failed',
this.preflightHttpResult?.proxyIp || null,
this.preflightHttpResult?.responseTimeMs || null,
this.preflightHttpResult?.error || null,
httpFingerprint ? JSON.stringify(httpFingerprint) : null,
]);
console.log(`[TaskWorker] Preflight status reported to worker_registry`);
if (this.preflightHttpResult?.proxyIp) {
console.log(`[TaskWorker] HTTP IP: ${this.preflightHttpResult.proxyIp}, Timezone: ${(this.preflightHttpResult as any).detectedTimezone || 'unknown'}`);
}
} catch (err: any) {
// Non-fatal - worker can still function
console.warn(`[TaskWorker] Could not report preflight status: ${err.message}`);
}
}
/**
* Retry preflight until it passes.
* Worker is BLOCKED from claiming ANY tasks until HTTP preflight passes.
* This ensures unqualified workers never touch the task pool.
*
* All current tasks require 'http' method, so HTTP preflight is mandatory.
*/
private async retryPreflightUntilPass(): Promise<void> {
if (this.preflightHttpPassed) {
return; // Already passed
}
if (this.isRetryingPreflight) {
return; // Already retrying
}
this.isRetryingPreflight = true;
let retryCount = 0;
console.log(`[TaskWorker] ${this.friendlyName} HTTP preflight FAILED - entering retry loop (every ${TaskWorker.PREFLIGHT_RETRY_INTERVAL_MS / 1000}s)`);
console.log(`[TaskWorker] ${this.friendlyName} BLOCKED from task pool until preflight passes`);
while (!this.preflightHttpPassed && this.isRunning) {
retryCount++;
// Wait before retry
await this.sleep(TaskWorker.PREFLIGHT_RETRY_INTERVAL_MS);
if (!this.isRunning) {
break; // Worker stopping
}
console.log(`[TaskWorker] ${this.friendlyName} preflight retry #${retryCount}...`);
// Reload proxies before retry (might have new ones)
try {
await this.crawlRotator.initialize();
const stats = this.crawlRotator.proxy.getStats();
console.log(`[TaskWorker] Proxies available: ${stats.activeProxies}`);
} catch (err: any) {
console.warn(`[TaskWorker] Proxy reload failed: ${err.message}`);
}
// Re-run HTTP preflight
try {
const httpResult = await runPuppeteerPreflightWithRetry(this.crawlRotator, 1);
this.preflightHttpResult = httpResult;
this.preflightHttpPassed = httpResult.passed;
if (httpResult.passed) {
console.log(`[TaskWorker] ${this.friendlyName} HTTP preflight PASSED on retry #${retryCount}!`);
console.log(`[TaskWorker] ${this.friendlyName} IP: ${httpResult.proxyIp}, Products: ${httpResult.productsReturned}`);
console.log(`[TaskWorker] ${this.friendlyName} now QUALIFIED to claim tasks`);
// Report updated status
await this.reportPreflightStatus();
break;
} else {
console.log(`[TaskWorker] ${this.friendlyName} HTTP preflight still FAILED: ${httpResult.error}`);
console.log(`[TaskWorker] ${this.friendlyName} will retry in ${TaskWorker.PREFLIGHT_RETRY_INTERVAL_MS / 1000}s...`);
}
} catch (err: any) {
console.error(`[TaskWorker] ${this.friendlyName} preflight retry error: ${err.message}`);
}
}
this.isRetryingPreflight = false;
}
/**
* Get the effective max concurrent tasks based on working hours.
* Uses the worker's timezone (from preflight IP geolocation) to determine
* the current hour's weight, then scales max concurrent tasks accordingly.
*
* This creates natural traffic patterns - workers run fewer concurrent
* tasks during off-peak hours, full capacity during peak hours.
*
* Example with MAX_CONCURRENT_TASKS = 3:
* 3 AM (5% weight): effectiveMax = max(1, round(3 × 0.05)) = 1
* 12 PM (75% weight): effectiveMax = max(1, round(3 × 0.75)) = 2
* 6 PM (100% weight): effectiveMax = max(1, round(3 × 1.00)) = 3
*/
private async getEffectiveMaxTasks(): Promise<{ effectiveMax: number; hourWeight: number; reason: string }> {
try {
const result = await this.pool.query(`
SELECT current_hour, hour_weight, worker_timezone
FROM check_working_hours($1, 'natural_traffic')
`, [this.workerId]);
if (result.rows.length === 0) {
// Function returned nothing - default to full capacity
return {
effectiveMax: this.maxConcurrentTasks,
hourWeight: 100,
reason: 'Working hours check returned no result - using full capacity'
};
}
const row = result.rows[0];
const hourWeight = row.hour_weight || 100;
// Scale max concurrent tasks by hour weight, minimum 1
const effectiveMax = Math.max(1, Math.round(this.maxConcurrentTasks * hourWeight / 100));
return {
effectiveMax,
hourWeight,
reason: `Hour ${row.current_hour} (${row.worker_timezone}): ${hourWeight}% → ${effectiveMax}/${this.maxConcurrentTasks} slots`
};
} catch (err: any) {
// On error, default to full capacity (don't block workers due to DB issues)
console.warn(`[TaskWorker] Working hours check failed: ${err.message} - using full capacity`);
return {
effectiveMax: this.maxConcurrentTasks,
hourWeight: 100,
reason: 'Working hours check error - using full capacity'
};
}
}
/**
* Lazy initialization of stealth systems.
* Called BEFORE claiming first task (not at worker startup).
* This allows workers to register and enter main loop immediately.
*
* Returns true if initialization succeeded, false otherwise.
*/
private async ensureStealthInitialized(): Promise<boolean> {
// Already initialized
if (this.stealthInitialized && this.preflightsCompleted) {
return true;
}
// Already initializing (prevent concurrent init attempts)
if (this.initializingPromise) {
await this.initializingPromise;
return this.stealthInitialized && this.preflightsCompleted;
}
console.log(`[TaskWorker] ${this.friendlyName} lazy-initializing stealth systems (first task claim)...`);
this.initializingPromise = (async () => {
try {
// Initialize proxy/fingerprint rotation
await this.initializeStealth();
this.stealthInitialized = true;
// Run dual-transport preflights
await this.runDualPreflights();
this.preflightsCompleted = true;
const preflightMsg = `curl=${this.preflightCurlPassed ? '✓' : '✗'} http=${this.preflightHttpPassed ? '✓' : '✗'}`;
console.log(`[TaskWorker] ${this.friendlyName} stealth ready (${preflightMsg})`);
} catch (err: any) {
console.error(`[TaskWorker] ${this.friendlyName} stealth init failed: ${err.message}`);
this.stealthInitialized = false;
this.preflightsCompleted = false;
}
})();
await this.initializingPromise;
this.initializingPromise = null;
return this.stealthInitialized && this.preflightsCompleted;
}
/**
* Register worker with the registry (get friendly name)
*/
private async register(): Promise<void> {
try {
const response = await fetch(`${API_BASE_URL}/api/worker-registry/register`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
role: this.role,
worker_id: this.workerId,
pod_name: process.env.POD_NAME || process.env.HOSTNAME,
hostname: os.hostname(),
metadata: {
pid: process.pid,
node_version: process.version,
started_at: new Date().toISOString()
}
})
});
const data = await response.json();
if (data.success) {
this.friendlyName = data.friendly_name;
console.log(`[TaskWorker] ${data.message}`);
} else {
console.warn(`[TaskWorker] Registration warning: ${data.error}`);
this.friendlyName = this.workerId.slice(0, 12);
}
} catch (error: any) {
// Registration is optional - worker can still function without it
console.warn(`[TaskWorker] Could not register with API (will continue): ${error.message}`);
this.friendlyName = this.workerId.slice(0, 12);
}
}
/**
* Deregister worker from the registry
*/
private async deregister(): Promise<void> {
try {
await fetch(`${API_BASE_URL}/api/worker-registry/deregister`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ worker_id: this.workerId })
});
console.log(`[TaskWorker] ${this.friendlyName} signed off`);
} catch {
// Ignore deregistration errors
}
}
/**
* Send heartbeat to registry with resource usage, proxy location, and step info
*/
private async sendRegistryHeartbeat(): Promise<void> {
try {
const memUsage = process.memoryUsage();
const cpuUsage = process.cpuUsage();
const proxyLocation = this.crawlRotator.getProxyLocation();
const resourceStats = this.getResourceStats();
// Get array of active task IDs
const activeTaskIds = Array.from(this.activeTasks.keys());
// Get step info for all active tasks
const taskSteps = this.getTaskStepsInfo();
await fetch(`${API_BASE_URL}/api/worker-registry/heartbeat`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
worker_id: this.workerId,
current_task_id: activeTaskIds[0] || null, // Primary task for backwards compat
current_task_ids: activeTaskIds, // All active tasks
active_task_count: this.activeTasks.size,
max_concurrent_tasks: this.maxConcurrentTasks,
status: this.activeTasks.size > 0 ? 'active' : 'idle',
// Step tracking for dashboard visibility
current_step: this.currentStep,
current_step_detail: this.currentStepDetail,
current_step_started_at: this.currentStepStartedAt?.toISOString() || null,
task_steps: taskSteps, // Per-task step info for concurrent workers
resources: {
memory_mb: Math.round(memUsage.heapUsed / 1024 / 1024),
memory_total_mb: Math.round(memUsage.heapTotal / 1024 / 1024),
memory_rss_mb: Math.round(memUsage.rss / 1024 / 1024),
memory_percent: Math.round(resourceStats.memoryPercent * 100),
cpu_user_ms: Math.round(cpuUsage.user / 1000),
cpu_system_ms: Math.round(cpuUsage.system / 1000),
cpu_percent: Math.round(resourceStats.cpuPercent),
proxy_location: proxyLocation,
is_backing_off: this.isBackingOff,
backoff_reason: this.backoffReason,
}
})
});
} catch {
// Ignore heartbeat errors
}
}
/**
* Report task completion to registry
*/
private async reportTaskCompletion(success: boolean): Promise<void> {
try {
await fetch(`${API_BASE_URL}/api/worker-registry/task-completed`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
worker_id: this.workerId,
success
})
});
} catch {
// Ignore errors
}
}
/**
* Start registry heartbeat interval
*/
private startRegistryHeartbeat(): void {
this.registryHeartbeatInterval = setInterval(async () => {
await this.sendRegistryHeartbeat();
}, HEARTBEAT_INTERVAL_MS);
}
/**
* Stop registry heartbeat interval
*/
private stopRegistryHeartbeat(): void {
if (this.registryHeartbeatInterval) {
clearInterval(this.registryHeartbeatInterval);
this.registryHeartbeatInterval = null;
}
}
/**
* Run stale task cleanup once
* Recovers tasks left in claimed/running status after worker crashes
*/
private async runStaleTaskCleanup(): Promise<void> {
try {
console.log(`[TaskWorker] ${this.friendlyName} running stale task cleanup...`);
const cleanupResult = await taskService.cleanupStaleTasks(30); // 30 minute threshold
if (cleanupResult.cleaned > 0) {
console.log(`[TaskWorker] Cleaned up ${cleanupResult.cleaned} stale tasks`);
}
} catch (err: any) {
console.error(`[TaskWorker] Stale task cleanup error:`, err.message);
}
}
/**
* Start periodic stale task cleanup (every 10 minutes)
* Runs on every worker; the cleanup query uses SELECT FOR UPDATE SKIP LOCKED to avoid races
*/
private startPeriodicStaleCleanup(): void {
const STALE_CLEANUP_INTERVAL_MS = 10 * 60 * 1000; // 10 minutes
this.staleCleanupInterval = setInterval(async () => {
await this.runStaleTaskCleanup();
}, STALE_CLEANUP_INTERVAL_MS);
console.log(`[TaskWorker] ${this.friendlyName} started periodic stale cleanup (every 10 min)`);
}
/**
* Stop periodic stale task cleanup
*/
private stopPeriodicStaleCleanup(): void {
if (this.staleCleanupInterval) {
clearInterval(this.staleCleanupInterval);
this.staleCleanupInterval = null;
}
}
/**
* Start the worker loop
*
* Workers start IMMEDIATELY without blocking on proxy/preflight init.
* Stealth systems are lazy-initialized on first task claim.
* This allows workers to register and send heartbeats even when proxies aren't ready.
*/
async start(): Promise<void> {
this.isRunning = true;
// Register with the API to get a friendly name (non-blocking)
await this.register();
// Start registry heartbeat immediately
this.startRegistryHeartbeat();
// Cleanup stale tasks on startup and periodically
// ALL workers run cleanup to ensure stale tasks are recovered even if some workers crash
// The cleanup query uses SELECT FOR UPDATE SKIP LOCKED to avoid races
// Run immediately on startup
await this.runStaleTaskCleanup();
// Start periodic cleanup every 10 minutes
this.startPeriodicStaleCleanup();
const roleMsg = this.role ? `for role: ${this.role}` : '(role-agnostic - any task)';
console.log(`[TaskWorker] ${this.friendlyName} starting ${roleMsg} (stealth=lazy, max ${this.maxConcurrentTasks} concurrent tasks)`);
while (this.isRunning) {
try {
await this.mainLoop();
} catch (error: any) {
console.error(`[TaskWorker] Loop error:`, error.message);
await this.sleep(POLL_INTERVAL_MS);
}
}
// Wait for any remaining tasks to complete
if (this.taskPromises.size > 0) {
console.log(`[TaskWorker] Waiting for ${this.taskPromises.size} active tasks to complete...`);
await Promise.allSettled(this.taskPromises.values());
}
console.log(`[TaskWorker] Worker ${this.workerId} stopped`);
}
/**
* Main loop - tries to fill up to maxConcurrentTasks
*/
private async mainLoop(): Promise<void> {
// Check resource usage and backoff if needed
const { backoff, reason } = this.shouldBackOff();
if (backoff) {
if (!this.isBackingOff) {
console.log(`[TaskWorker] ${this.friendlyName} backing off: ${reason}`);
}
this.isBackingOff = true;
this.backoffReason = reason;
await this.sleep(BACKOFF_DURATION_MS);
return;
}
// Clear backoff state
if (this.isBackingOff) {
console.log(`[TaskWorker] ${this.friendlyName} resuming normal operation`);
this.isBackingOff = false;
this.backoffReason = null;
}
// Periodically reload proxies to pick up changes (new proxies, disabled proxies)
// This runs every ~60 seconds (controlled by setProxyReloadInterval)
if (this.stealthInitialized) {
await this.crawlRotator.reloadIfStale();
}
// Check for decommission signal
const shouldDecommission = await this.checkDecommission();
if (shouldDecommission) {
console.log(`[TaskWorker] ${this.friendlyName} received decommission signal - waiting for ${this.activeTasks.size} tasks to complete`);
// Stop accepting new tasks, wait for current to finish
this.isRunning = false;
return;
}
// Try to claim more tasks if we have capacity
if (this.canAcceptMoreTasks()) {
// =================================================================
// LAZY INITIALIZATION - Initialize stealth on first task claim
// Workers start immediately and init proxies only when needed
// =================================================================
if (!this.stealthInitialized) {
const initSuccess = await this.ensureStealthInitialized();
if (!initSuccess) {
// Init failed - wait and retry next loop
console.log(`[TaskWorker] ${this.friendlyName} stealth init failed, waiting before retry...`);
await this.sleep(30000);
return;
}
}
// =================================================================
// PREFLIGHT GATE - BLOCK unqualified workers from task pool
// All tasks require HTTP method, so HTTP preflight MUST pass.
// If preflight failed, worker retries every 30 seconds.
// Worker CANNOT claim ANY tasks until preflight passes.
// =================================================================
if (!this.preflightHttpPassed) {
console.log(`[TaskWorker] ${this.friendlyName} BLOCKED - HTTP preflight not passed, cannot claim tasks`);
await this.retryPreflightUntilPass();
return; // Return to main loop, will re-check on next iteration
}
// =================================================================
// WORKING HOURS GATE - Simulate natural traffic patterns
// Workers scale their concurrent task limit based on the current
// hour's weight in their timezone. This creates natural throughput:
// 3 AM (5%): 1 slot → worker runs 1 task at a time
// 6 PM (100%): 3 slots → worker runs full capacity
// =================================================================
const { effectiveMax, hourWeight, reason } = await this.getEffectiveMaxTasks();
if (this.activeTasks.size >= effectiveMax) {
// Already at working hours capacity - wait before checking again
const sleepMs = 10000 + Math.random() * 20000; // 10-30 seconds
if (this.activeTasks.size > 0) {
// Only log if we're actually throttled (have tasks but can't take more)
console.log(`[TaskWorker] ${this.friendlyName} AT CAPACITY - ${reason} (${this.activeTasks.size}/${effectiveMax} active)`);
}
await this.sleep(sleepMs);
return; // Return to main loop, will re-check on next iteration
}
// Pass preflight capabilities to only claim compatible tasks
const task = await taskService.claimTask(
this.role,
this.workerId,
this.preflightCurlPassed,
this.preflightHttpPassed
);
if (task) {
console.log(`[TaskWorker] ${this.friendlyName} claimed task ${task.id} (${task.role}) [${this.activeTasks.size + 1}/${this.maxConcurrentTasks}]`);
// =================================================================
// PREFLIGHT CHECK - Use stored preflight results based on task method
// We already ran dual-transport preflights at startup, so just verify
// the correct preflight passed for this task's required method.
// =================================================================
const taskMethod = task.method || 'http'; // Default to http if not specified
let preflightPassed = false;
let preflightMsg = '';
if (taskMethod === 'http' && this.preflightHttpPassed) {
preflightPassed = true;
preflightMsg = `HTTP preflight passed (IP: ${this.preflightHttpResult?.proxyIp || 'unknown'})`;
} else if (taskMethod === 'curl' && this.preflightCurlPassed) {
preflightPassed = true;
preflightMsg = `CURL preflight passed (IP: ${this.preflightCurlResult?.proxyIp || 'unknown'})`;
} else if (!task.method && (this.preflightHttpPassed || this.preflightCurlPassed)) {
// No method preference - either transport works
preflightPassed = true;
preflightMsg = this.preflightHttpPassed ? 'HTTP preflight passed' : 'CURL preflight passed';
}
if (!preflightPassed) {
const errorMsg = taskMethod === 'http'
? 'HTTP preflight not passed - cannot execute http tasks'
: 'CURL preflight not passed - cannot execute curl tasks';
console.log(`[TaskWorker] ${this.friendlyName} PREFLIGHT FAILED for task ${task.id}: ${errorMsg}`);
console.log(`[TaskWorker] Releasing task ${task.id} back to pending - worker cannot proceed without preflight`);
// Release task back to pending so another worker can pick it up
await taskService.releaseTask(task.id);
// Wait before trying again - give proxies time to recover
await this.sleep(30000); // 30 second wait on preflight failure
return;
}
console.log(`[TaskWorker] ${this.friendlyName} preflight verified for task ${task.id}: ${preflightMsg}`);
this.activeTasks.set(task.id, task);
// Start task in background (don't await)
const taskPromise = this.executeTask(task);
this.taskPromises.set(task.id, taskPromise);
// Clean up when done
taskPromise.finally(() => {
this.activeTasks.delete(task.id);
this.taskPromises.delete(task.id);
});
// Immediately try to claim more tasks (don't wait for poll interval)
return;
}
}
// No task claimed or at capacity - wait before next poll
await this.sleep(POLL_INTERVAL_MS);
}
/**
* Stop the worker
*/
async stop(): Promise<void> {
this.isRunning = false;
this.stopHeartbeat();
this.stopRegistryHeartbeat();
this.stopPeriodicStaleCleanup();
await this.deregister();
console.log(`[TaskWorker] ${this.friendlyName} stopped`);
}
/**
* Execute a single task (runs concurrently with other tasks)
*/
private async executeTask(task: WorkerTask): Promise<void> {
console.log(`[TaskWorker] ${this.friendlyName} starting task ${task.id} (${task.role}) for dispensary ${task.dispensary_id || 'N/A'}`);
try {
// Mark as running
await taskService.startTask(task.id);
// Get handler for this role (considers method for dual-transport)
const handler = getHandlerForTask(task);
if (!handler) {
throw new Error(`No handler registered for role: ${task.role}`);
}
// Create context with step tracking
const ctx: TaskContext = {
pool: this.pool,
workerId: this.workerId,
task,
heartbeat: async () => {
await taskService.heartbeat(task.id);
},
crawlRotator: this.crawlRotator,
updateStep: (step: string, detail?: string) => {
this.updateTaskStep(task.id, step, detail);
},
};
// Initialize step tracking for this task
this.updateTaskStep(task.id, 'starting', `Initializing ${task.role}`);
// Execute the task
const result = await handler(ctx);
if (result.success) {
// Clear step tracking
this.clearTaskStep(task.id);
// Mark as completed
await taskService.completeTask(task.id, result);
await this.reportTaskCompletion(true);
console.log(`[TaskWorker] ${this.friendlyName} completed task ${task.id} [${this.activeTasks.size}/${this.maxConcurrentTasks} active]`);
// Chain next task if applicable
const chainedTask = await taskService.chainNextTask({
...task,
status: 'completed',
result,
});
if (chainedTask) {
console.log(`[TaskWorker] Chained new task ${chainedTask.id} (${chainedTask.role})`);
}
} else {
// Clear step tracking
this.clearTaskStep(task.id);
// Mark as failed
await taskService.failTask(task.id, result.error || 'Unknown error');
await this.reportTaskCompletion(false);
console.log(`[TaskWorker] ${this.friendlyName} failed task ${task.id}: ${result.error}`);
}
} catch (error: any) {
// Clear step tracking
this.clearTaskStep(task.id);
// Mark as failed
await taskService.failTask(task.id, error.message);
await this.reportTaskCompletion(false);
console.error(`[TaskWorker] ${this.friendlyName} task ${task.id} error:`, error.message);
}
// Note: cleanup (removing from activeTasks) is handled by the .finally() attached to this task's promise in mainLoop
}
/**
* Check if this worker has been flagged for decommission
* Returns true if worker should stop after current task
*/
private async checkDecommission(): Promise<boolean> {
try {
// Check worker_registry for decommission flag
const result = await this.pool.query(
`SELECT decommission_requested, decommission_reason
FROM worker_registry
WHERE worker_id = $1`,
[this.workerId]
);
if (result.rows.length > 0 && result.rows[0].decommission_requested) {
const reason = result.rows[0].decommission_reason || 'No reason provided';
console.log(`[TaskWorker] Decommission requested: ${reason}`);
return true;
}
return false;
} catch (error: any) {
// If we can't check, continue running
console.warn(`[TaskWorker] Could not check decommission status: ${error.message}`);
return false;
}
}
/**
* Start heartbeat interval
*/
private startHeartbeat(taskId: number): void {
this.heartbeatInterval = setInterval(async () => {
try {
await taskService.heartbeat(taskId);
} catch (error: any) {
console.warn(`[TaskWorker] Heartbeat failed:`, error.message);
}
}, HEARTBEAT_INTERVAL_MS);
}
/**
* Stop heartbeat interval
*/
private stopHeartbeat(): void {
if (this.heartbeatInterval) {
clearInterval(this.heartbeatInterval);
this.heartbeatInterval = null;
}
}
/**
* Sleep helper
*/
private sleep(ms: number): Promise<void> {
return new Promise((resolve) => setTimeout(resolve, ms));
}
/**
* Get worker info
*/
getInfo(): {
workerId: string;
role: TaskRole | null;
isRunning: boolean;
activeTaskIds: number[];
activeTaskCount: number;
maxConcurrentTasks: number;
isBackingOff: boolean;
backoffReason: string | null;
preflightCurlPassed: boolean;
preflightHttpPassed: boolean;
} {
return {
workerId: this.workerId,
role: this.role,
isRunning: this.isRunning,
activeTaskIds: Array.from(this.activeTasks.keys()),
activeTaskCount: this.activeTasks.size,
maxConcurrentTasks: this.maxConcurrentTasks,
isBackingOff: this.isBackingOff,
backoffReason: this.backoffReason,
preflightCurlPassed: this.preflightCurlPassed,
preflightHttpPassed: this.preflightHttpPassed,
};
}
}
// ============================================================
// CLI ENTRY POINT
// ============================================================
async function main(): Promise<void> {
const role = process.env.WORKER_ROLE as TaskRole | undefined;
// Per TASK_WORKFLOW_2024-12-10.md: Valid task roles
const validRoles: TaskRole[] = [
'store_discovery',
'entry_point_discovery',
'product_discovery',
'payload_fetch', // Fetches from API, saves to disk
'product_refresh', // Reads from disk, processes to DB
'analytics_refresh',
];
// If role specified, validate it
if (role && !validRoles.includes(role)) {
console.error(`Error: Invalid WORKER_ROLE: ${role}`);
console.error(`Valid roles: ${validRoles.join(', ')}`);
console.error('Or omit WORKER_ROLE for role-agnostic worker (any task)');
process.exit(1);
}
// Use POD_NAME for persistent identity in K8s StatefulSet
// This ensures workers keep the same ID across restarts
// Falls back to WORKER_ID, then generates UUID if neither is set
const workerId = process.env.POD_NAME || process.env.WORKER_ID;
// Pass null for role-agnostic, or the specific role
const worker = new TaskWorker(role || null, workerId);
// Handle graceful shutdown
process.on('SIGTERM', () => {
console.log('[TaskWorker] Received SIGTERM, shutting down...');
worker.stop();
});
process.on('SIGINT', () => {
console.log('[TaskWorker] Received SIGINT, shutting down...');
worker.stop();
});
await worker.start();
}
// Run if this is the main module
if (require.main === module) {
main().catch((error) => {
console.error('[TaskWorker] Fatal error:', error);
process.exit(1);
});
}
export { main };