feat(workers): Concurrent task processing with resource-based backoff

Workers can now process multiple tasks concurrently (default: 3 max).
Self-regulate based on resource usage - back off at 85% memory or 90% CPU.

Backend changes:
- TaskWorker handles concurrent tasks using async Maps
- Resource monitoring (memory %, CPU %) with backoff logic
- Heartbeat reports active_task_count, max_concurrent_tasks, resource stats
- Decommission support via worker_commands table

Frontend changes:
- Workers Dashboard shows tasks per worker (N/M format)
- Resource badges with color-coded thresholds
- Pod visualization with clickable selection
- Decommission controls per worker

New env vars:
- MAX_CONCURRENT_TASKS (default: 3)
- MEMORY_BACKOFF_THRESHOLD (default: 0.85)
- CPU_BACKOFF_THRESHOLD (default: 0.90)
- BACKOFF_DURATION_MS (default: 10000)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-11 11:47:24 -07:00
parent be251c6fb3
commit 8e2f07c941
10 changed files with 1243 additions and 416 deletions

View File

@@ -64,6 +64,33 @@ const POLL_INTERVAL_MS = parseInt(process.env.POLL_INTERVAL_MS || '5000');
const HEARTBEAT_INTERVAL_MS = parseInt(process.env.HEARTBEAT_INTERVAL_MS || '30000');
const API_BASE_URL = process.env.API_BASE_URL || 'http://localhost:3010';
// =============================================================================
// CONCURRENT TASK PROCESSING SETTINGS
// =============================================================================
// Workers can process multiple tasks simultaneously using async I/O.
// This improves throughput for I/O-bound tasks (network calls, DB queries).
//
// Resource thresholds trigger "backoff" - the worker stops claiming new tasks
// but continues processing existing ones until resources return to normal.
//
// See: docs/WORKER_TASK_ARCHITECTURE.md#concurrent-task-processing
// =============================================================================
// Maximum number of tasks this worker will run concurrently
// Tune based on workload: I/O-bound tasks benefit from higher concurrency
const MAX_CONCURRENT_TASKS = parseInt(process.env.MAX_CONCURRENT_TASKS || '3');
// When heap memory usage exceeds this threshold (as decimal 0.0-1.0), stop claiming new tasks
// Default 85% - gives headroom before OOM
const MEMORY_BACKOFF_THRESHOLD = parseFloat(process.env.MEMORY_BACKOFF_THRESHOLD || '0.85');
// When CPU usage exceeds this threshold (as decimal 0.0-1.0), stop claiming new tasks
// Default 90% - allows some burst capacity
const CPU_BACKOFF_THRESHOLD = parseFloat(process.env.CPU_BACKOFF_THRESHOLD || '0.90');
// How long to wait (ms) when in backoff state before rechecking resources
const BACKOFF_DURATION_MS = parseInt(process.env.BACKOFF_DURATION_MS || '10000');
export interface TaskContext {
pool: Pool;
workerId: string;
@@ -94,6 +121,25 @@ const TASK_HANDLERS: Record<TaskRole, TaskHandler> = {
analytics_refresh: handleAnalyticsRefresh,
};
/**
* Resource usage stats reported to the registry and used for backoff decisions.
* These values are included in worker heartbeats and displayed in the UI.
*/
interface ResourceStats {
/** Current heap memory usage as decimal (0.0 to 1.0) */
memoryPercent: number;
/** Current heap used in MB */
memoryMb: number;
/** Total heap available in MB */
memoryTotalMb: number;
/** CPU usage percentage since last check (0 to 100) */
cpuPercent: number;
/** True if worker is currently in backoff state */
isBackingOff: boolean;
/** Reason for backoff (e.g., "Memory at 87.3% (threshold: 85%)") */
backoffReason: string | null;
}
export class TaskWorker {
private pool: Pool;
private workerId: string;
@@ -102,14 +148,106 @@ export class TaskWorker {
private isRunning: boolean = false;
private heartbeatInterval: NodeJS.Timeout | null = null;
private registryHeartbeatInterval: NodeJS.Timeout | null = null;
private currentTask: WorkerTask | null = null;
private crawlRotator: CrawlRotator;
// ==========================================================================
// CONCURRENT TASK TRACKING
// ==========================================================================
// activeTasks: Map of task ID -> task object for all currently running tasks
// taskPromises: Map of task ID -> Promise for cleanup when task completes
// maxConcurrentTasks: How many tasks this worker will run in parallel
// ==========================================================================
private activeTasks: Map<number, WorkerTask> = new Map();
private taskPromises: Map<number, Promise<void>> = new Map();
private maxConcurrentTasks: number = MAX_CONCURRENT_TASKS;
// ==========================================================================
// RESOURCE MONITORING FOR BACKOFF
// ==========================================================================
// CPU tracking uses differential measurement - we track last values and
// calculate percentage based on elapsed time since last check.
// ==========================================================================
private lastCpuUsage: { user: number; system: number } = { user: 0, system: 0 };
private lastCpuCheck: number = Date.now();
private isBackingOff: boolean = false;
private backoffReason: string | null = null;
constructor(role: TaskRole | null = null, workerId?: string) {
this.pool = getPool();
this.role = role;
this.workerId = workerId || `worker-${uuidv4().slice(0, 8)}`;
this.crawlRotator = new CrawlRotator(this.pool);
// Initialize CPU tracking
const cpuUsage = process.cpuUsage();
this.lastCpuUsage = { user: cpuUsage.user, system: cpuUsage.system };
this.lastCpuCheck = Date.now();
}
/**
* Get current resource usage
*/
private getResourceStats(): ResourceStats {
const memUsage = process.memoryUsage();
const heapUsedMb = memUsage.heapUsed / 1024 / 1024;
const heapTotalMb = memUsage.heapTotal / 1024 / 1024;
const memoryPercent = heapUsedMb / heapTotalMb;
// Calculate CPU usage since last check
const cpuUsage = process.cpuUsage();
const now = Date.now();
const elapsed = now - this.lastCpuCheck;
let cpuPercent = 0;
if (elapsed > 0) {
const userDiff = (cpuUsage.user - this.lastCpuUsage.user) / 1000; // microseconds to ms
const systemDiff = (cpuUsage.system - this.lastCpuUsage.system) / 1000;
cpuPercent = ((userDiff + systemDiff) / elapsed) * 100;
}
// Update last values
this.lastCpuUsage = { user: cpuUsage.user, system: cpuUsage.system };
this.lastCpuCheck = now;
return {
memoryPercent,
memoryMb: Math.round(heapUsedMb),
memoryTotalMb: Math.round(heapTotalMb),
cpuPercent: Math.min(100, cpuPercent), // Cap at 100%
isBackingOff: this.isBackingOff,
backoffReason: this.backoffReason,
};
}
/**
* Check if we should back off from taking new tasks
*/
private shouldBackOff(): { backoff: boolean; reason: string | null } {
const stats = this.getResourceStats();
if (stats.memoryPercent > MEMORY_BACKOFF_THRESHOLD) {
return { backoff: true, reason: `Memory at ${(stats.memoryPercent * 100).toFixed(1)}% (threshold: ${MEMORY_BACKOFF_THRESHOLD * 100}%)` };
}
if (stats.cpuPercent > CPU_BACKOFF_THRESHOLD * 100) {
return { backoff: true, reason: `CPU at ${stats.cpuPercent.toFixed(1)}% (threshold: ${CPU_BACKOFF_THRESHOLD * 100}%)` };
}
return { backoff: false, reason: null };
}
/**
* Get count of currently running tasks
*/
get activeTaskCount(): number {
return this.activeTasks.size;
}
/**
* Check if we can accept more tasks
*/
private canAcceptMoreTasks(): boolean {
return this.activeTasks.size < this.maxConcurrentTasks;
}
/**
@@ -252,21 +390,32 @@ export class TaskWorker {
const memUsage = process.memoryUsage();
const cpuUsage = process.cpuUsage();
const proxyLocation = this.crawlRotator.getProxyLocation();
const resourceStats = this.getResourceStats();
// Get array of active task IDs
const activeTaskIds = Array.from(this.activeTasks.keys());
await fetch(`${API_BASE_URL}/api/worker-registry/heartbeat`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
worker_id: this.workerId,
current_task_id: this.currentTask?.id || null,
status: this.currentTask ? 'active' : 'idle',
current_task_id: activeTaskIds[0] || null, // Primary task for backwards compat
current_task_ids: activeTaskIds, // All active tasks
active_task_count: this.activeTasks.size,
max_concurrent_tasks: this.maxConcurrentTasks,
status: this.activeTasks.size > 0 ? 'active' : 'idle',
resources: {
memory_mb: Math.round(memUsage.heapUsed / 1024 / 1024),
memory_total_mb: Math.round(memUsage.heapTotal / 1024 / 1024),
memory_rss_mb: Math.round(memUsage.rss / 1024 / 1024),
memory_percent: Math.round(resourceStats.memoryPercent * 100),
cpu_user_ms: Math.round(cpuUsage.user / 1000),
cpu_system_ms: Math.round(cpuUsage.system / 1000),
cpu_percent: Math.round(resourceStats.cpuPercent),
proxy_location: proxyLocation,
is_backing_off: this.isBackingOff,
backoff_reason: this.backoffReason,
}
})
});
@@ -328,20 +477,85 @@ export class TaskWorker {
this.startRegistryHeartbeat();
const roleMsg = this.role ? `for role: ${this.role}` : '(role-agnostic - any task)';
console.log(`[TaskWorker] ${this.friendlyName} starting ${roleMsg}`);
console.log(`[TaskWorker] ${this.friendlyName} starting ${roleMsg} (max ${this.maxConcurrentTasks} concurrent tasks)`);
while (this.isRunning) {
try {
await this.processNextTask();
await this.mainLoop();
} catch (error: any) {
console.error(`[TaskWorker] Loop error:`, error.message);
await this.sleep(POLL_INTERVAL_MS);
}
}
// Wait for any remaining tasks to complete
if (this.taskPromises.size > 0) {
console.log(`[TaskWorker] Waiting for ${this.taskPromises.size} active tasks to complete...`);
await Promise.allSettled(this.taskPromises.values());
}
console.log(`[TaskWorker] Worker ${this.workerId} stopped`);
}
/**
* Main loop - tries to fill up to maxConcurrentTasks
*/
private async mainLoop(): Promise<void> {
// Check resource usage and backoff if needed
const { backoff, reason } = this.shouldBackOff();
if (backoff) {
if (!this.isBackingOff) {
console.log(`[TaskWorker] ${this.friendlyName} backing off: ${reason}`);
}
this.isBackingOff = true;
this.backoffReason = reason;
await this.sleep(BACKOFF_DURATION_MS);
return;
}
// Clear backoff state
if (this.isBackingOff) {
console.log(`[TaskWorker] ${this.friendlyName} resuming normal operation`);
this.isBackingOff = false;
this.backoffReason = null;
}
// Check for decommission signal
const shouldDecommission = await this.checkDecommission();
if (shouldDecommission) {
console.log(`[TaskWorker] ${this.friendlyName} received decommission signal - waiting for ${this.activeTasks.size} tasks to complete`);
// Stop accepting new tasks, wait for current to finish
this.isRunning = false;
return;
}
// Try to claim more tasks if we have capacity
if (this.canAcceptMoreTasks()) {
const task = await taskService.claimTask(this.role, this.workerId);
if (task) {
console.log(`[TaskWorker] ${this.friendlyName} claimed task ${task.id} (${task.role}) [${this.activeTasks.size + 1}/${this.maxConcurrentTasks}]`);
this.activeTasks.set(task.id, task);
// Start task in background (don't await)
const taskPromise = this.executeTask(task);
this.taskPromises.set(task.id, taskPromise);
// Clean up when done
taskPromise.finally(() => {
this.activeTasks.delete(task.id);
this.taskPromises.delete(task.id);
});
// Immediately try to claim more tasks (don't wait for poll interval)
return;
}
}
// No task claimed or at capacity - wait before next poll
await this.sleep(POLL_INTERVAL_MS);
}
/**
* Stop the worker
*/
@@ -354,23 +568,10 @@ export class TaskWorker {
}
/**
* Process the next available task
* Execute a single task (runs concurrently with other tasks)
*/
private async processNextTask(): Promise<void> {
// Try to claim a task
const task = await taskService.claimTask(this.role, this.workerId);
if (!task) {
// No tasks available, wait and retry
await this.sleep(POLL_INTERVAL_MS);
return;
}
this.currentTask = task;
console.log(`[TaskWorker] Claimed task ${task.id} (${task.role}) for dispensary ${task.dispensary_id || 'N/A'}`);
// Start heartbeat
this.startHeartbeat(task.id);
private async executeTask(task: WorkerTask): Promise<void> {
console.log(`[TaskWorker] ${this.friendlyName} starting task ${task.id} (${task.role}) for dispensary ${task.dispensary_id || 'N/A'}`);
try {
// Mark as running
@@ -399,7 +600,7 @@ export class TaskWorker {
// Mark as completed
await taskService.completeTask(task.id, result);
await this.reportTaskCompletion(true);
console.log(`[TaskWorker] ${this.friendlyName} completed task ${task.id}`);
console.log(`[TaskWorker] ${this.friendlyName} completed task ${task.id} [${this.activeTasks.size}/${this.maxConcurrentTasks} active]`);
// Chain next task if applicable
const chainedTask = await taskService.chainNextTask({
@@ -421,9 +622,35 @@ export class TaskWorker {
await taskService.failTask(task.id, error.message);
await this.reportTaskCompletion(false);
console.error(`[TaskWorker] ${this.friendlyName} task ${task.id} error:`, error.message);
} finally {
this.stopHeartbeat();
this.currentTask = null;
}
// Note: cleanup (removing from activeTasks) is handled in mainLoop's finally block
}
/**
* Check if this worker has been flagged for decommission
* Returns true if worker should stop after current task
*/
private async checkDecommission(): Promise<boolean> {
try {
// Check worker_registry for decommission flag
const result = await this.pool.query(
`SELECT decommission_requested, decommission_reason
FROM worker_registry
WHERE worker_id = $1`,
[this.workerId]
);
if (result.rows.length > 0 && result.rows[0].decommission_requested) {
const reason = result.rows[0].decommission_reason || 'No reason provided';
console.log(`[TaskWorker] Decommission requested: ${reason}`);
return true;
}
return false;
} catch (error: any) {
// If we can't check, continue running
console.warn(`[TaskWorker] Could not check decommission status: ${error.message}`);
return false;
}
}
@@ -460,12 +687,25 @@ export class TaskWorker {
/**
* Get worker info
*/
getInfo(): { workerId: string; role: TaskRole | null; isRunning: boolean; currentTaskId: number | null } {
getInfo(): {
workerId: string;
role: TaskRole | null;
isRunning: boolean;
activeTaskIds: number[];
activeTaskCount: number;
maxConcurrentTasks: number;
isBackingOff: boolean;
backoffReason: string | null;
} {
return {
workerId: this.workerId,
role: this.role,
isRunning: this.isRunning,
currentTaskId: this.currentTask?.id || null,
activeTaskIds: Array.from(this.activeTasks.keys()),
activeTaskCount: this.activeTasks.size,
maxConcurrentTasks: this.maxConcurrentTasks,
isBackingOff: this.isBackingOff,
backoffReason: this.backoffReason,
};
}
}