feat: Auto-retry failed tasks with exponential backoff
- Hard failures now auto-retry up to 3 times
- Exponential backoff: 5, 10, 20 minutes
- Only permanently fails after max retries exceeded
- Soft failures still requeue immediately

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -301,12 +301,20 @@ class TaskService {
|
|||||||
/**
|
/**
|
||||||
* Mark a task as failed
|
* Mark a task as failed
|
||||||
*
|
*
|
||||||
* Soft failures (timeouts, connection issues): Requeue back to pending for later pickup
|
* Soft failures (timeouts, connection issues): Requeue immediately
|
||||||
* Hard failures (business logic errors): Mark as failed permanently
|
* Hard failures: Auto-retry up to MAX_RETRIES with exponential backoff
|
||||||
*/
|
*/
|
||||||
async failTask(taskId: number, errorMessage: string): Promise<boolean> {
|
async failTask(taskId: number, errorMessage: string): Promise<boolean> {
|
||||||
|
const MAX_RETRIES = 3;
|
||||||
const isSoft = this.isSoftFailure(errorMessage);
|
const isSoft = this.isSoftFailure(errorMessage);
|
||||||
|
|
||||||
|
// Get current retry count
|
||||||
|
const { rows } = await pool.query(
|
||||||
|
`SELECT retry_count FROM worker_tasks WHERE id = $1`,
|
||||||
|
[taskId]
|
||||||
|
);
|
||||||
|
const retryCount = rows[0]?.retry_count ?? 0;
|
||||||
|
|
||||||
if (isSoft) {
|
if (isSoft) {
|
||||||
// Soft failure: put back in queue immediately for another worker
|
// Soft failure: put back in queue immediately for another worker
|
||||||
await pool.query(
|
await pool.query(
|
||||||
@@ -325,16 +333,36 @@ class TaskService {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Hard failure: mark as permanently failed
|
// Hard failure: auto-retry with exponential backoff if under max retries
|
||||||
|
if (retryCount < MAX_RETRIES) {
|
||||||
|
const delayMinutes = Math.pow(2, retryCount) * 5; // 5, 10, 20 minutes
|
||||||
|
await pool.query(
|
||||||
|
`UPDATE worker_tasks
|
||||||
|
SET status = 'pending',
|
||||||
|
worker_id = NULL,
|
||||||
|
claimed_at = NULL,
|
||||||
|
started_at = NULL,
|
||||||
|
error_message = $2,
|
||||||
|
retry_count = retry_count + 1,
|
||||||
|
scheduled_for = NOW() + INTERVAL '${delayMinutes} minutes',
|
||||||
|
updated_at = NOW()
|
||||||
|
WHERE id = $1`,
|
||||||
|
[taskId, `Retry ${retryCount + 1}/${MAX_RETRIES}: ${errorMessage}`]
|
||||||
|
);
|
||||||
|
console.log(`[TaskService] Task ${taskId} scheduled for retry ${retryCount + 1}/${MAX_RETRIES} in ${delayMinutes} minutes`);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Max retries exceeded: mark as permanently failed
|
||||||
await pool.query(
|
await pool.query(
|
||||||
`UPDATE worker_tasks
|
`UPDATE worker_tasks
|
||||||
SET status = 'failed',
|
SET status = 'failed',
|
||||||
completed_at = NOW(),
|
completed_at = NOW(),
|
||||||
error_message = $2
|
error_message = $2
|
||||||
WHERE id = $1`,
|
WHERE id = $1`,
|
||||||
[taskId, `Hard failure: ${errorMessage}`]
|
[taskId, `Failed after ${MAX_RETRIES} retries: ${errorMessage}`]
|
||||||
);
|
);
|
||||||
console.log(`[TaskService] Task ${taskId} hard failed: ${errorMessage}`);
|
console.log(`[TaskService] Task ${taskId} permanently failed after ${MAX_RETRIES} retries`);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user