feat: Worker improvements and Run Now duplicate prevention

- Fix Run Now to prevent duplicate task creation
- Add loading state to Run Now button in UI
- Return early when no stores need refresh
- Worker dashboard improvements
- Browser pooling architecture updates
- K8s worker config updates (8 replicas, 3 concurrent tasks)
This commit is contained in:
Kelly
2025-12-12 20:11:31 -07:00
parent c98c409f59
commit 63023a4061
12 changed files with 809 additions and 239 deletions

View File

@@ -97,8 +97,12 @@ const API_BASE_URL = process.env.API_BASE_URL || 'http://localhost:3010';
// =============================================================================
// Maximum number of tasks this worker will run concurrently
// Tune based on workload: I/O-bound tasks benefit from higher concurrency
const MAX_CONCURRENT_TASKS = parseInt(process.env.MAX_CONCURRENT_TASKS || '15');
// Browser tasks (Puppeteer) use ~400MB RAM each. With 2GB pod limit:
// - 3 browsers = ~1.3GB = SAFE
// - 4 browsers = ~1.7GB = RISKY
// - 5+ browsers = OOM CRASH
// See: docs/WORKER_TASK_ARCHITECTURE.md#browser-task-memory-limits
const MAX_CONCURRENT_TASKS = parseInt(process.env.MAX_CONCURRENT_TASKS || '3');
// When heap memory usage exceeds this threshold (as decimal 0.0-1.0), stop claiming new tasks
// Default 85% - gives headroom before OOM
@@ -131,6 +135,8 @@ export interface TaskContext {
task: WorkerTask;
heartbeat: () => Promise<void>;
crawlRotator?: CrawlRotator;
/** Update the current step being executed (shown in dashboard) */
updateStep: (step: string, detail?: string) => void;
}
export interface TaskResult {
@@ -264,6 +270,18 @@ export class TaskWorker {
private preflightsCompleted: boolean = false;
private initializingPromise: Promise<void> | null = null;
// ==========================================================================
// STEP TRACKING FOR DASHBOARD VISIBILITY
// ==========================================================================
// Workers report their current step in heartbeats so the dashboard can show
// real-time progress like "preflight", "loading page", "processing products"
// ==========================================================================
private currentStep: string = 'idle';
private currentStepDetail: string | null = null;
private currentStepStartedAt: Date | null = null;
/** Map of task ID -> step info for concurrent tasks */
private taskSteps: Map<number, { step: string; detail: string | null; startedAt: Date }> = new Map();
constructor(role: TaskRole | null = null, workerId?: string) {
this.pool = getPool();
this.role = role;
@@ -346,6 +364,65 @@ export class TaskWorker {
return this.activeTasks.size < this.maxConcurrentTasks;
}
// ==========================================================================
// STEP TRACKING METHODS
// ==========================================================================
/**
* Update the current step for a task (for dashboard visibility)
* @param taskId - The task ID to update
* @param step - Short step name (e.g., "preflight", "loading", "processing")
* @param detail - Optional detail (e.g., "Verifying IP 1.2.3.4")
*/
public updateTaskStep(taskId: number, step: string, detail?: string): void {
this.taskSteps.set(taskId, {
step,
detail: detail || null,
startedAt: new Date(),
});
// Also update the "primary" step for single-task backwards compat
if (this.activeTasks.size === 1 || taskId === Array.from(this.activeTasks.keys())[0]) {
this.currentStep = step;
this.currentStepDetail = detail || null;
this.currentStepStartedAt = new Date();
}
console.log(`[TaskWorker] Step: ${step}${detail ? ` - ${detail}` : ''} (task #${taskId})`);
}
/**
* Clear step tracking for a task (when task completes)
*/
private clearTaskStep(taskId: number): void {
this.taskSteps.delete(taskId);
// Reset primary step if no more active tasks
if (this.activeTasks.size === 0) {
this.currentStep = 'idle';
this.currentStepDetail = null;
this.currentStepStartedAt = null;
}
}
/**
* Get current step info for all active tasks (for heartbeat)
*/
private getTaskStepsInfo(): Array<{
task_id: number;
step: string;
detail: string | null;
elapsed_ms: number;
}> {
const now = Date.now();
return Array.from(this.taskSteps.entries()).map(([taskId, info]) => ({
task_id: taskId,
step: info.step,
detail: info.detail,
elapsed_ms: now - info.startedAt.getTime(),
}));
}
/**
* Initialize stealth systems (proxy rotation, fingerprints)
* Called LAZILY on first task claim attempt (NOT at worker startup).
@@ -635,7 +712,7 @@ export class TaskWorker {
}
/**
* Send heartbeat to registry with resource usage and proxy location
* Send heartbeat to registry with resource usage, proxy location, and step info
*/
private async sendRegistryHeartbeat(): Promise<void> {
try {
@@ -647,6 +724,9 @@ export class TaskWorker {
// Get array of active task IDs
const activeTaskIds = Array.from(this.activeTasks.keys());
// Get step info for all active tasks
const taskSteps = this.getTaskStepsInfo();
await fetch(`${API_BASE_URL}/api/worker-registry/heartbeat`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
@@ -657,6 +737,11 @@ export class TaskWorker {
active_task_count: this.activeTasks.size,
max_concurrent_tasks: this.maxConcurrentTasks,
status: this.activeTasks.size > 0 ? 'active' : 'idle',
// Step tracking for dashboard visibility
current_step: this.currentStep,
current_step_detail: this.currentStepDetail,
current_step_started_at: this.currentStepStartedAt?.toISOString() || null,
task_steps: taskSteps, // Per-task step info for concurrent workers
resources: {
memory_mb: Math.round(memUsage.heapUsed / 1024 / 1024),
memory_total_mb: Math.round(memUsage.heapTotal / 1024 / 1024),
@@ -915,7 +1000,7 @@ export class TaskWorker {
throw new Error(`No handler registered for role: ${task.role}`);
}
// Create context
// Create context with step tracking
const ctx: TaskContext = {
pool: this.pool,
workerId: this.workerId,
@@ -924,12 +1009,21 @@ export class TaskWorker {
await taskService.heartbeat(task.id);
},
crawlRotator: this.crawlRotator,
updateStep: (step: string, detail?: string) => {
this.updateTaskStep(task.id, step, detail);
},
};
// Initialize step tracking for this task
this.updateTaskStep(task.id, 'starting', `Initializing ${task.role}`);
// Execute the task
const result = await handler(ctx);
if (result.success) {
// Clear step tracking
this.clearTaskStep(task.id);
// Mark as completed
await taskService.completeTask(task.id, result);
await this.reportTaskCompletion(true);
@@ -945,12 +1039,18 @@ export class TaskWorker {
console.log(`[TaskWorker] Chained new task ${chainedTask.id} (${chainedTask.role})`);
}
} else {
// Clear step tracking
this.clearTaskStep(task.id);
// Mark as failed
await taskService.failTask(task.id, result.error || 'Unknown error');
await this.reportTaskCompletion(false);
console.log(`[TaskWorker] ${this.friendlyName} failed task ${task.id}: ${result.error}`);
}
} catch (error: any) {
// Clear step tracking
this.clearTaskStep(task.id);
// Mark as failed
await taskService.failTask(task.id, error.message);
await this.reportTaskCompletion(false);