feat(workers): Concurrent task processing with resource-based backoff
Workers can now process multiple tasks concurrently (default: 3 max). Self-regulate based on resource usage - back off at 85% memory or 90% CPU.

Backend changes:
- TaskWorker handles concurrent tasks using async Maps
- Resource monitoring (memory %, CPU %) with backoff logic
- Heartbeat reports active_task_count, max_concurrent_tasks, resource stats
- Decommission support via worker_commands table

Frontend changes:
- Workers Dashboard shows tasks per worker (N/M format)
- Resource badges with color-coded thresholds
- Pod visualization with clickable selection
- Decommission controls per worker

New env vars:
- MAX_CONCURRENT_TASKS (default: 3)
- MEMORY_BACKOFF_THRESHOLD (default: 0.85)
- CPU_BACKOFF_THRESHOLD (default: 0.90)
- BACKOFF_DURATION_MS (default: 10000)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -64,6 +64,33 @@ const POLL_INTERVAL_MS = parseInt(process.env.POLL_INTERVAL_MS || '5000');
|
||||
const HEARTBEAT_INTERVAL_MS = parseInt(process.env.HEARTBEAT_INTERVAL_MS || '30000');
|
||||
const API_BASE_URL = process.env.API_BASE_URL || 'http://localhost:3010';
|
||||
|
||||
// =============================================================================
|
||||
// CONCURRENT TASK PROCESSING SETTINGS
|
||||
// =============================================================================
|
||||
// Workers can process multiple tasks simultaneously using async I/O.
|
||||
// This improves throughput for I/O-bound tasks (network calls, DB queries).
|
||||
//
|
||||
// Resource thresholds trigger "backoff" - the worker stops claiming new tasks
|
||||
// but continues processing existing ones until resources return to normal.
|
||||
//
|
||||
// See: docs/WORKER_TASK_ARCHITECTURE.md#concurrent-task-processing
|
||||
// =============================================================================
|
||||
|
||||
// Maximum number of tasks this worker will run concurrently
|
||||
// Tune based on workload: I/O-bound tasks benefit from higher concurrency
|
||||
const MAX_CONCURRENT_TASKS = parseInt(process.env.MAX_CONCURRENT_TASKS || '3');
|
||||
|
||||
// When heap memory usage exceeds this threshold (as decimal 0.0-1.0), stop claiming new tasks
|
||||
// Default 85% - gives headroom before OOM
|
||||
const MEMORY_BACKOFF_THRESHOLD = parseFloat(process.env.MEMORY_BACKOFF_THRESHOLD || '0.85');
|
||||
|
||||
// When CPU usage exceeds this threshold (as decimal 0.0-1.0), stop claiming new tasks
|
||||
// Default 90% - allows some burst capacity
|
||||
const CPU_BACKOFF_THRESHOLD = parseFloat(process.env.CPU_BACKOFF_THRESHOLD || '0.90');
|
||||
|
||||
// How long to wait (ms) when in backoff state before rechecking resources
|
||||
const BACKOFF_DURATION_MS = parseInt(process.env.BACKOFF_DURATION_MS || '10000');
|
||||
|
||||
export interface TaskContext {
|
||||
pool: Pool;
|
||||
workerId: string;
|
||||
@@ -94,6 +121,25 @@ const TASK_HANDLERS: Record<TaskRole, TaskHandler> = {
|
||||
analytics_refresh: handleAnalyticsRefresh,
|
||||
};
|
||||
|
||||
/**
|
||||
* Resource usage stats reported to the registry and used for backoff decisions.
|
||||
* These values are included in worker heartbeats and displayed in the UI.
|
||||
*/
|
||||
interface ResourceStats {
|
||||
/** Current heap memory usage as decimal (0.0 to 1.0) */
|
||||
memoryPercent: number;
|
||||
/** Current heap used in MB */
|
||||
memoryMb: number;
|
||||
/** Total heap available in MB */
|
||||
memoryTotalMb: number;
|
||||
/** CPU usage percentage since last check (0 to 100) */
|
||||
cpuPercent: number;
|
||||
/** True if worker is currently in backoff state */
|
||||
isBackingOff: boolean;
|
||||
/** Reason for backoff (e.g., "Memory at 87.3% (threshold: 85%)") */
|
||||
backoffReason: string | null;
|
||||
}
|
||||
|
||||
export class TaskWorker {
|
||||
private pool: Pool;
|
||||
private workerId: string;
|
||||
@@ -102,14 +148,106 @@ export class TaskWorker {
|
||||
private isRunning: boolean = false;
|
||||
private heartbeatInterval: NodeJS.Timeout | null = null;
|
||||
private registryHeartbeatInterval: NodeJS.Timeout | null = null;
|
||||
private currentTask: WorkerTask | null = null;
|
||||
private crawlRotator: CrawlRotator;
|
||||
|
||||
// ==========================================================================
|
||||
// CONCURRENT TASK TRACKING
|
||||
// ==========================================================================
|
||||
// activeTasks: Map of task ID -> task object for all currently running tasks
|
||||
// taskPromises: Map of task ID -> Promise for cleanup when task completes
|
||||
// maxConcurrentTasks: How many tasks this worker will run in parallel
|
||||
// ==========================================================================
|
||||
private activeTasks: Map<number, WorkerTask> = new Map();
|
||||
private taskPromises: Map<number, Promise<void>> = new Map();
|
||||
private maxConcurrentTasks: number = MAX_CONCURRENT_TASKS;
|
||||
|
||||
// ==========================================================================
|
||||
// RESOURCE MONITORING FOR BACKOFF
|
||||
// ==========================================================================
|
||||
// CPU tracking uses differential measurement - we track last values and
|
||||
// calculate percentage based on elapsed time since last check.
|
||||
// ==========================================================================
|
||||
private lastCpuUsage: { user: number; system: number } = { user: 0, system: 0 };
|
||||
private lastCpuCheck: number = Date.now();
|
||||
private isBackingOff: boolean = false;
|
||||
private backoffReason: string | null = null;
|
||||
|
||||
/**
 * @param role     Task role this worker is pinned to, or null to accept any role.
 * @param workerId Stable identifier for the registry; auto-generated when omitted.
 */
constructor(role: TaskRole | null = null, workerId?: string) {
  this.pool = getPool();
  this.role = role;
  this.workerId = workerId || `worker-${uuidv4().slice(0, 8)}`;
  this.crawlRotator = new CrawlRotator(this.pool);

  // Seed the differential CPU sampler so the first getResourceStats() call
  // measures usage relative to construction time, not process start.
  const initialCpu = process.cpuUsage();
  this.lastCpuUsage = { user: initialCpu.user, system: initialCpu.system };
  this.lastCpuCheck = Date.now();
}
|
||||
|
||||
/**
|
||||
* Get current resource usage
|
||||
*/
|
||||
private getResourceStats(): ResourceStats {
|
||||
const memUsage = process.memoryUsage();
|
||||
const heapUsedMb = memUsage.heapUsed / 1024 / 1024;
|
||||
const heapTotalMb = memUsage.heapTotal / 1024 / 1024;
|
||||
const memoryPercent = heapUsedMb / heapTotalMb;
|
||||
|
||||
// Calculate CPU usage since last check
|
||||
const cpuUsage = process.cpuUsage();
|
||||
const now = Date.now();
|
||||
const elapsed = now - this.lastCpuCheck;
|
||||
|
||||
let cpuPercent = 0;
|
||||
if (elapsed > 0) {
|
||||
const userDiff = (cpuUsage.user - this.lastCpuUsage.user) / 1000; // microseconds to ms
|
||||
const systemDiff = (cpuUsage.system - this.lastCpuUsage.system) / 1000;
|
||||
cpuPercent = ((userDiff + systemDiff) / elapsed) * 100;
|
||||
}
|
||||
|
||||
// Update last values
|
||||
this.lastCpuUsage = { user: cpuUsage.user, system: cpuUsage.system };
|
||||
this.lastCpuCheck = now;
|
||||
|
||||
return {
|
||||
memoryPercent,
|
||||
memoryMb: Math.round(heapUsedMb),
|
||||
memoryTotalMb: Math.round(heapTotalMb),
|
||||
cpuPercent: Math.min(100, cpuPercent), // Cap at 100%
|
||||
isBackingOff: this.isBackingOff,
|
||||
backoffReason: this.backoffReason,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if we should back off from taking new tasks
|
||||
*/
|
||||
private shouldBackOff(): { backoff: boolean; reason: string | null } {
|
||||
const stats = this.getResourceStats();
|
||||
|
||||
if (stats.memoryPercent > MEMORY_BACKOFF_THRESHOLD) {
|
||||
return { backoff: true, reason: `Memory at ${(stats.memoryPercent * 100).toFixed(1)}% (threshold: ${MEMORY_BACKOFF_THRESHOLD * 100}%)` };
|
||||
}
|
||||
|
||||
if (stats.cpuPercent > CPU_BACKOFF_THRESHOLD * 100) {
|
||||
return { backoff: true, reason: `CPU at ${stats.cpuPercent.toFixed(1)}% (threshold: ${CPU_BACKOFF_THRESHOLD * 100}%)` };
|
||||
}
|
||||
|
||||
return { backoff: false, reason: null };
|
||||
}
|
||||
|
||||
/**
 * Number of tasks currently executing on this worker
 * (size of the activeTasks map).
 */
get activeTaskCount(): number {
  return this.activeTasks.size;
}
|
||||
|
||||
/**
|
||||
* Check if we can accept more tasks
|
||||
*/
|
||||
private canAcceptMoreTasks(): boolean {
|
||||
return this.activeTasks.size < this.maxConcurrentTasks;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -252,21 +390,32 @@ export class TaskWorker {
|
||||
const memUsage = process.memoryUsage();
|
||||
const cpuUsage = process.cpuUsage();
|
||||
const proxyLocation = this.crawlRotator.getProxyLocation();
|
||||
const resourceStats = this.getResourceStats();
|
||||
|
||||
// Get array of active task IDs
|
||||
const activeTaskIds = Array.from(this.activeTasks.keys());
|
||||
|
||||
await fetch(`${API_BASE_URL}/api/worker-registry/heartbeat`, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
worker_id: this.workerId,
|
||||
current_task_id: this.currentTask?.id || null,
|
||||
status: this.currentTask ? 'active' : 'idle',
|
||||
current_task_id: activeTaskIds[0] || null, // Primary task for backwards compat
|
||||
current_task_ids: activeTaskIds, // All active tasks
|
||||
active_task_count: this.activeTasks.size,
|
||||
max_concurrent_tasks: this.maxConcurrentTasks,
|
||||
status: this.activeTasks.size > 0 ? 'active' : 'idle',
|
||||
resources: {
|
||||
memory_mb: Math.round(memUsage.heapUsed / 1024 / 1024),
|
||||
memory_total_mb: Math.round(memUsage.heapTotal / 1024 / 1024),
|
||||
memory_rss_mb: Math.round(memUsage.rss / 1024 / 1024),
|
||||
memory_percent: Math.round(resourceStats.memoryPercent * 100),
|
||||
cpu_user_ms: Math.round(cpuUsage.user / 1000),
|
||||
cpu_system_ms: Math.round(cpuUsage.system / 1000),
|
||||
cpu_percent: Math.round(resourceStats.cpuPercent),
|
||||
proxy_location: proxyLocation,
|
||||
is_backing_off: this.isBackingOff,
|
||||
backoff_reason: this.backoffReason,
|
||||
}
|
||||
})
|
||||
});
|
||||
@@ -328,20 +477,85 @@ export class TaskWorker {
|
||||
this.startRegistryHeartbeat();
|
||||
|
||||
const roleMsg = this.role ? `for role: ${this.role}` : '(role-agnostic - any task)';
|
||||
console.log(`[TaskWorker] ${this.friendlyName} starting ${roleMsg}`);
|
||||
console.log(`[TaskWorker] ${this.friendlyName} starting ${roleMsg} (max ${this.maxConcurrentTasks} concurrent tasks)`);
|
||||
|
||||
while (this.isRunning) {
|
||||
try {
|
||||
await this.processNextTask();
|
||||
await this.mainLoop();
|
||||
} catch (error: any) {
|
||||
console.error(`[TaskWorker] Loop error:`, error.message);
|
||||
await this.sleep(POLL_INTERVAL_MS);
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for any remaining tasks to complete
|
||||
if (this.taskPromises.size > 0) {
|
||||
console.log(`[TaskWorker] Waiting for ${this.taskPromises.size} active tasks to complete...`);
|
||||
await Promise.allSettled(this.taskPromises.values());
|
||||
}
|
||||
|
||||
console.log(`[TaskWorker] Worker ${this.workerId} stopped`);
|
||||
}
|
||||
|
||||
/**
|
||||
* Main loop - tries to fill up to maxConcurrentTasks
|
||||
*/
|
||||
private async mainLoop(): Promise<void> {
|
||||
// Check resource usage and backoff if needed
|
||||
const { backoff, reason } = this.shouldBackOff();
|
||||
if (backoff) {
|
||||
if (!this.isBackingOff) {
|
||||
console.log(`[TaskWorker] ${this.friendlyName} backing off: ${reason}`);
|
||||
}
|
||||
this.isBackingOff = true;
|
||||
this.backoffReason = reason;
|
||||
await this.sleep(BACKOFF_DURATION_MS);
|
||||
return;
|
||||
}
|
||||
|
||||
// Clear backoff state
|
||||
if (this.isBackingOff) {
|
||||
console.log(`[TaskWorker] ${this.friendlyName} resuming normal operation`);
|
||||
this.isBackingOff = false;
|
||||
this.backoffReason = null;
|
||||
}
|
||||
|
||||
// Check for decommission signal
|
||||
const shouldDecommission = await this.checkDecommission();
|
||||
if (shouldDecommission) {
|
||||
console.log(`[TaskWorker] ${this.friendlyName} received decommission signal - waiting for ${this.activeTasks.size} tasks to complete`);
|
||||
// Stop accepting new tasks, wait for current to finish
|
||||
this.isRunning = false;
|
||||
return;
|
||||
}
|
||||
|
||||
// Try to claim more tasks if we have capacity
|
||||
if (this.canAcceptMoreTasks()) {
|
||||
const task = await taskService.claimTask(this.role, this.workerId);
|
||||
|
||||
if (task) {
|
||||
console.log(`[TaskWorker] ${this.friendlyName} claimed task ${task.id} (${task.role}) [${this.activeTasks.size + 1}/${this.maxConcurrentTasks}]`);
|
||||
this.activeTasks.set(task.id, task);
|
||||
|
||||
// Start task in background (don't await)
|
||||
const taskPromise = this.executeTask(task);
|
||||
this.taskPromises.set(task.id, taskPromise);
|
||||
|
||||
// Clean up when done
|
||||
taskPromise.finally(() => {
|
||||
this.activeTasks.delete(task.id);
|
||||
this.taskPromises.delete(task.id);
|
||||
});
|
||||
|
||||
// Immediately try to claim more tasks (don't wait for poll interval)
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// No task claimed or at capacity - wait before next poll
|
||||
await this.sleep(POLL_INTERVAL_MS);
|
||||
}
|
||||
|
||||
/**
|
||||
* Stop the worker
|
||||
*/
|
||||
@@ -354,23 +568,10 @@ export class TaskWorker {
|
||||
}
|
||||
|
||||
/**
|
||||
* Process the next available task
|
||||
* Execute a single task (runs concurrently with other tasks)
|
||||
*/
|
||||
private async processNextTask(): Promise<void> {
|
||||
// Try to claim a task
|
||||
const task = await taskService.claimTask(this.role, this.workerId);
|
||||
|
||||
if (!task) {
|
||||
// No tasks available, wait and retry
|
||||
await this.sleep(POLL_INTERVAL_MS);
|
||||
return;
|
||||
}
|
||||
|
||||
this.currentTask = task;
|
||||
console.log(`[TaskWorker] Claimed task ${task.id} (${task.role}) for dispensary ${task.dispensary_id || 'N/A'}`);
|
||||
|
||||
// Start heartbeat
|
||||
this.startHeartbeat(task.id);
|
||||
private async executeTask(task: WorkerTask): Promise<void> {
|
||||
console.log(`[TaskWorker] ${this.friendlyName} starting task ${task.id} (${task.role}) for dispensary ${task.dispensary_id || 'N/A'}`);
|
||||
|
||||
try {
|
||||
// Mark as running
|
||||
@@ -399,7 +600,7 @@ export class TaskWorker {
|
||||
// Mark as completed
|
||||
await taskService.completeTask(task.id, result);
|
||||
await this.reportTaskCompletion(true);
|
||||
console.log(`[TaskWorker] ${this.friendlyName} completed task ${task.id}`);
|
||||
console.log(`[TaskWorker] ${this.friendlyName} completed task ${task.id} [${this.activeTasks.size}/${this.maxConcurrentTasks} active]`);
|
||||
|
||||
// Chain next task if applicable
|
||||
const chainedTask = await taskService.chainNextTask({
|
||||
@@ -421,9 +622,35 @@ export class TaskWorker {
|
||||
await taskService.failTask(task.id, error.message);
|
||||
await this.reportTaskCompletion(false);
|
||||
console.error(`[TaskWorker] ${this.friendlyName} task ${task.id} error:`, error.message);
|
||||
} finally {
|
||||
this.stopHeartbeat();
|
||||
this.currentTask = null;
|
||||
}
|
||||
// Note: cleanup (removing from activeTasks/taskPromises) is handled by the .finally() handler attached to this promise in mainLoop
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if this worker has been flagged for decommission
|
||||
* Returns true if worker should stop after current task
|
||||
*/
|
||||
private async checkDecommission(): Promise<boolean> {
|
||||
try {
|
||||
// Check worker_registry for decommission flag
|
||||
const result = await this.pool.query(
|
||||
`SELECT decommission_requested, decommission_reason
|
||||
FROM worker_registry
|
||||
WHERE worker_id = $1`,
|
||||
[this.workerId]
|
||||
);
|
||||
|
||||
if (result.rows.length > 0 && result.rows[0].decommission_requested) {
|
||||
const reason = result.rows[0].decommission_reason || 'No reason provided';
|
||||
console.log(`[TaskWorker] Decommission requested: ${reason}`);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
} catch (error: any) {
|
||||
// If we can't check, continue running
|
||||
console.warn(`[TaskWorker] Could not check decommission status: ${error.message}`);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -460,12 +687,25 @@ export class TaskWorker {
|
||||
/**
|
||||
* Get worker info
|
||||
*/
|
||||
getInfo(): { workerId: string; role: TaskRole | null; isRunning: boolean; currentTaskId: number | null } {
|
||||
getInfo(): {
|
||||
workerId: string;
|
||||
role: TaskRole | null;
|
||||
isRunning: boolean;
|
||||
activeTaskIds: number[];
|
||||
activeTaskCount: number;
|
||||
maxConcurrentTasks: number;
|
||||
isBackingOff: boolean;
|
||||
backoffReason: string | null;
|
||||
} {
|
||||
return {
|
||||
workerId: this.workerId,
|
||||
role: this.role,
|
||||
isRunning: this.isRunning,
|
||||
currentTaskId: this.currentTask?.id || null,
|
||||
activeTaskIds: Array.from(this.activeTasks.keys()),
|
||||
activeTaskCount: this.activeTasks.size,
|
||||
maxConcurrentTasks: this.maxConcurrentTasks,
|
||||
isBackingOff: this.isBackingOff,
|
||||
backoffReason: this.backoffReason,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user