feat: Auto-retry failed tasks with exponential backoff

- Hard failures now auto-retry up to 3 times
- Exponential backoff: 5, 10, 20 minutes
- Tasks are marked permanently failed only after max retries are exceeded
- Soft failures still requeue immediately

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-13 01:19:48 -07:00
parent b1c1955082
commit b7e96359ef

View File

@@ -301,12 +301,20 @@ class TaskService {
/**
* Mark a task as failed
*
* Soft failures (timeouts, connection issues): Requeue back to pending for later pickup
* Hard failures (business logic errors): Mark as failed permanently
* Soft failures (timeouts, connection issues): Requeue immediately
* Hard failures: Auto-retry up to MAX_RETRIES with exponential backoff (5, 10, 20 minutes); marked as failed permanently once retries are exhausted
*/
async failTask(taskId: number, errorMessage: string): Promise<boolean> {
const MAX_RETRIES = 3;
const isSoft = this.isSoftFailure(errorMessage);
// Get current retry count
const { rows } = await pool.query(
`SELECT retry_count FROM worker_tasks WHERE id = $1`,
[taskId]
);
const retryCount = rows[0]?.retry_count ?? 0;
if (isSoft) {
// Soft failure: put back in queue immediately for another worker
await pool.query(
@@ -325,16 +333,36 @@ class TaskService {
return true;
}
// Hard failure: mark as permanently failed
// Hard failure: auto-retry with exponential backoff if under max retries
if (retryCount < MAX_RETRIES) {
const delayMinutes = Math.pow(2, retryCount) * 5; // 5, 10, 20 minutes
await pool.query(
`UPDATE worker_tasks
SET status = 'pending',
worker_id = NULL,
claimed_at = NULL,
started_at = NULL,
error_message = $2,
retry_count = retry_count + 1,
scheduled_for = NOW() + INTERVAL '${delayMinutes} minutes',
updated_at = NOW()
WHERE id = $1`,
[taskId, `Retry ${retryCount + 1}/${MAX_RETRIES}: ${errorMessage}`]
);
console.log(`[TaskService] Task ${taskId} scheduled for retry ${retryCount + 1}/${MAX_RETRIES} in ${delayMinutes} minutes`);
return true;
}
// Max retries exceeded: mark as permanently failed
await pool.query(
`UPDATE worker_tasks
SET status = 'failed',
completed_at = NOW(),
error_message = $2
WHERE id = $1`,
[taskId, `Hard failure: ${errorMessage}`]
[taskId, `Failed after ${MAX_RETRIES} retries: ${errorMessage}`]
);
console.log(`[TaskService] Task ${taskId} hard failed: ${errorMessage}`);
console.log(`[TaskService] Task ${taskId} permanently failed after ${MAX_RETRIES} retries`);
return false;
}