diff --git a/backend/src/tasks/task-service.ts b/backend/src/tasks/task-service.ts
index 94a1775b..a8a65081 100644
--- a/backend/src/tasks/task-service.ts
+++ b/backend/src/tasks/task-service.ts
@@ -301,12 +301,24 @@ class TaskService {
   /**
    * Mark a task as failed
    *
-   * Soft failures (timeouts, connection issues): Requeue back to pending for later pickup
-   * Hard failures (business logic errors): Mark as failed permanently
+   * Soft failures (timeouts, connection issues): Requeue immediately
+   * Hard failures: Auto-retry up to MAX_RETRIES with exponential backoff
+   *
+   * @param taskId - id of the worker_tasks row that failed
+   * @param errorMessage - failure description; classified via isSoftFailure
+   * @returns true if the task was requeued, false if it was marked permanently failed
    */
   async failTask(taskId: number, errorMessage: string): Promise<boolean> {
+    const MAX_RETRIES = 3;
     const isSoft = this.isSoftFailure(errorMessage);
 
+    // Get current retry count
+    const { rows } = await pool.query(
+      `SELECT retry_count FROM worker_tasks WHERE id = $1`,
+      [taskId]
+    );
+    const retryCount = rows[0]?.retry_count ?? 0;
+
     if (isSoft) {
       // Soft failure: put back in queue immediately for another worker
       await pool.query(
@@ -325,16 +337,37 @@ class TaskService {
       return true;
     }
 
-    // Hard failure: mark as permanently failed
+    // Hard failure: auto-retry with exponential backoff if under max retries
+    if (retryCount < MAX_RETRIES) {
+      const delayMinutes = Math.pow(2, retryCount) * 5; // 5, 10, 20 minutes
+      // The delay is bound as $3 instead of being interpolated into the SQL text
+      await pool.query(
+        `UPDATE worker_tasks
+         SET status = 'pending',
+             worker_id = NULL,
+             claimed_at = NULL,
+             started_at = NULL,
+             error_message = $2,
+             retry_count = retry_count + 1,
+             scheduled_for = NOW() + ($3::int * INTERVAL '1 minute'),
+             updated_at = NOW()
+         WHERE id = $1`,
+        [taskId, `Retry ${retryCount + 1}/${MAX_RETRIES}: ${errorMessage}`, delayMinutes]
+      );
+      console.log(`[TaskService] Task ${taskId} scheduled for retry ${retryCount + 1}/${MAX_RETRIES} in ${delayMinutes} minutes`);
+      return true;
+    }
+
+    // Max retries exceeded: mark as permanently failed
     await pool.query(
       `UPDATE worker_tasks
        SET status = 'failed',
            completed_at = NOW(),
            error_message = $2
        WHERE id = $1`,
-      [taskId, `Hard failure: ${errorMessage}`]
+      [taskId, `Failed after ${MAX_RETRIES} retries: ${errorMessage}`]
     );
-    console.log(`[TaskService] Task ${taskId} hard failed: ${errorMessage}`);
+    console.log(`[TaskService] Task ${taskId} permanently failed after ${MAX_RETRIES} retries`);
     return false;
   }
 