feat: Auto-retry failed tasks with exponential backoff
- Hard failures now auto-retry up to 3 times
- Exponential backoff: 5, 10, 20 minutes
- Only permanently fails after max retries exceeded
- Soft failures still requeue immediately

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -301,12 +301,20 @@ class TaskService {
|
||||
/**
|
||||
* Mark a task as failed
|
||||
*
|
||||
* Soft failures (timeouts, connection issues): Requeue back to pending for later pickup
|
||||
* Hard failures (business logic errors): Mark as failed permanently
|
||||
* Soft failures (timeouts, connection issues): Requeue immediately
|
||||
* Hard failures: Auto-retry up to MAX_RETRIES with exponential backoff
|
||||
*/
|
||||
async failTask(taskId: number, errorMessage: string): Promise<boolean> {
|
||||
const MAX_RETRIES = 3;
|
||||
const isSoft = this.isSoftFailure(errorMessage);
|
||||
|
||||
// Get current retry count
|
||||
const { rows } = await pool.query(
|
||||
`SELECT retry_count FROM worker_tasks WHERE id = $1`,
|
||||
[taskId]
|
||||
);
|
||||
const retryCount = rows[0]?.retry_count ?? 0;
|
||||
|
||||
if (isSoft) {
|
||||
// Soft failure: put back in queue immediately for another worker
|
||||
await pool.query(
|
||||
@@ -325,16 +333,36 @@ class TaskService {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Hard failure: mark as permanently failed
|
||||
// Hard failure: auto-retry with exponential backoff if under max retries
|
||||
if (retryCount < MAX_RETRIES) {
|
||||
const delayMinutes = Math.pow(2, retryCount) * 5; // 5, 10, 20 minutes
|
||||
await pool.query(
|
||||
`UPDATE worker_tasks
|
||||
SET status = 'pending',
|
||||
worker_id = NULL,
|
||||
claimed_at = NULL,
|
||||
started_at = NULL,
|
||||
error_message = $2,
|
||||
retry_count = retry_count + 1,
|
||||
scheduled_for = NOW() + INTERVAL '${delayMinutes} minutes',
|
||||
updated_at = NOW()
|
||||
WHERE id = $1`,
|
||||
[taskId, `Retry ${retryCount + 1}/${MAX_RETRIES}: ${errorMessage}`]
|
||||
);
|
||||
console.log(`[TaskService] Task ${taskId} scheduled for retry ${retryCount + 1}/${MAX_RETRIES} in ${delayMinutes} minutes`);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Max retries exceeded: mark as permanently failed
|
||||
await pool.query(
|
||||
`UPDATE worker_tasks
|
||||
SET status = 'failed',
|
||||
completed_at = NOW(),
|
||||
error_message = $2
|
||||
WHERE id = $1`,
|
||||
[taskId, `Hard failure: ${errorMessage}`]
|
||||
[taskId, `Failed after ${MAX_RETRIES} retries: ${errorMessage}`]
|
||||
);
|
||||
console.log(`[TaskService] Task ${taskId} hard failed: ${errorMessage}`);
|
||||
console.log(`[TaskService] Task ${taskId} permanently failed after ${MAX_RETRIES} retries`);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user