feat: Auto-retry failed proxies after cooldown period

- Add last_failed_at column to track failure time - Failed proxies auto-retry after 4 hours (configurable) - Proxies permanently failed after 10 failures - Add /retry-stats and /reenable-failed API endpoints - markProxySuccess() re-enables recovered proxies 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-13 00:08:44 -07:00
parent 675f42841e
commit e62f927218
3 changed files with 194 additions and 8 deletions
--- a/backend/migrations/095_proxy_auto_retry.sql
+++ b/backend/migrations/095_proxy_auto_retry.sql
@@ -0,0 +1,81 @@
+-- Migration: Auto-retry failed proxies after cooldown period
+-- Proxies that fail will be retried after a configurable interval
+
+-- Add last_failed_at column to track when proxy last failed
+ALTER TABLE proxies ADD COLUMN IF NOT EXISTS last_failed_at TIMESTAMP;
+
+-- Add retry settings
+INSERT INTO settings (key, value, description)
+VALUES
+  ('proxy_retry_interval_hours', '4', 'Hours to wait before retrying a failed proxy'),
+  ('proxy_max_failures_before_permanent', '10', 'Max failures before proxy is permanently disabled')
+ON CONFLICT (key) DO NOTHING;
+
+-- Create function to get eligible proxies (active OR failed but past retry interval)
+CREATE OR REPLACE FUNCTION get_eligible_proxy_ids()
+RETURNS TABLE(proxy_id INT) AS $$
+DECLARE
+  retry_hours INT;
+BEGIN
+  -- Get retry interval from settings (default 4 hours)
+  SELECT COALESCE(value::int, 4) INTO retry_hours
+  FROM settings WHERE key = 'proxy_retry_interval_hours';
+
+  RETURN QUERY
+  SELECT p.id
+  FROM proxies p
+  WHERE p.active = true
+     OR (
+       p.active = false
+       AND p.last_failed_at IS NOT NULL
+       AND p.last_failed_at < NOW() - (retry_hours || ' hours')::interval
+       AND p.failure_count < 10  -- Don't retry if too many failures
+     )
+  ORDER BY
+    p.active DESC,  -- Prefer active proxies
+    p.failure_count ASC,  -- Then prefer proxies with fewer failures
+    RANDOM();
+END;
+$$ LANGUAGE plpgsql;
+
+-- Create scheduled job to periodically re-enable proxies past their retry window
+-- This runs every hour and marks proxies as active if they're past retry interval
+CREATE OR REPLACE FUNCTION auto_reenable_proxies()
+RETURNS INT AS $$
+DECLARE
+  retry_hours INT;
+  max_failures INT;
+  reenabled_count INT;
+BEGIN
+  -- Get settings
+  SELECT COALESCE(value::int, 4) INTO retry_hours
+  FROM settings WHERE key = 'proxy_retry_interval_hours';
+
+  SELECT COALESCE(value::int, 10) INTO max_failures
+  FROM settings WHERE key = 'proxy_max_failures_before_permanent';
+
+  -- Re-enable proxies that have cooled down
+  UPDATE proxies
+  SET active = true,
+      updated_at = NOW()
+  WHERE active = false
+    AND last_failed_at IS NOT NULL
+    AND last_failed_at < NOW() - (retry_hours || ' hours')::interval
+    AND failure_count < max_failures;
+
+  GET DIAGNOSTICS reenabled_count = ROW_COUNT;
+
+  IF reenabled_count > 0 THEN
+    RAISE NOTICE 'Auto-reenabled % proxies after % hour cooldown', reenabled_count, retry_hours;
+  END IF;
+
+  RETURN reenabled_count;
+END;
+$$ LANGUAGE plpgsql;
+
+-- Add index for efficient querying
+CREATE INDEX IF NOT EXISTS idx_proxies_retry
+ON proxies(active, last_failed_at, failure_count);
+
+COMMENT ON COLUMN proxies.last_failed_at IS 'Timestamp of last failure - used for auto-retry logic';
+COMMENT ON FUNCTION auto_reenable_proxies() IS 'Call periodically to re-enable failed proxies that have cooled down';
--- a/backend/src/routes/proxies.ts
+++ b/backend/src/routes/proxies.ts
@@ -288,4 +288,56 @@ router.post('/update-locations', requireRole('superadmin', 'admin'), async (req,
  }
 });

+// Get proxy retry stats
+router.get('/retry-stats', async (req, res) => {
+  try {
+    const stats = await pool.query(`
+      SELECT
+        COUNT(*) FILTER (WHERE active = true) as active_count,
+        COUNT(*) FILTER (WHERE active = false) as inactive_count,
+        COUNT(*) FILTER (WHERE active = false AND last_failed_at IS NOT NULL
+          AND last_failed_at < NOW() - INTERVAL '4 hours' AND failure_count < 10) as ready_for_retry,
+        COUNT(*) FILTER (WHERE failure_count >= 10) as permanently_failed
+      FROM proxies
+    `);
+
+    res.json(stats.rows[0]);
+  } catch (error) {
+    console.error('Error fetching retry stats:', error);
+    res.status(500).json({ error: 'Failed to fetch retry stats' });
+  }
+});
+
+// Manually re-enable proxies that have passed their retry interval
+router.post('/reenable-failed', requireRole('superadmin', 'admin'), async (req, res) => {
+  try {
+    // Get retry interval from settings
+    const settingsResult = await pool.query(`
+      SELECT value::int as hours FROM settings WHERE key = 'proxy_retry_interval_hours'
+    `);
+    const retryHours = settingsResult.rows[0]?.hours || 4;
+
+    // Re-enable proxies
+    const result = await pool.query(`
+      UPDATE proxies
+      SET active = true,
+          updated_at = NOW()
+      WHERE active = false
+        AND last_failed_at IS NOT NULL
+        AND last_failed_at < NOW() - ($1 || ' hours')::interval
+        AND failure_count < 10
+      RETURNING id
+    `, [retryHours]);
+
+    res.json({
+      message: `Re-enabled ${result.rowCount} proxies`,
+      count: result.rowCount,
+      retryIntervalHours: retryHours
+    });
+  } catch (error) {
+    console.error('Error re-enabling proxies:', error);
+    res.status(500).json({ error: 'Failed to re-enable proxies' });
+  }
+});
+
 export default router;
--- a/backend/src/services/proxy.ts
+++ b/backend/src/services/proxy.ts
@@ -54,18 +54,57 @@ export function isProxyInTimeout(proxyId: number): boolean {
  return true;
 }

+// Get retry interval from settings (cached for 5 minutes)
+let cachedRetryHours: number | null = null;
+let retryHoursCacheTime = 0;
+const RETRY_CACHE_TTL = 5 * 60 * 1000; // 5 minutes
+
+async function getRetryIntervalHours(): Promise<number> {
+  const now = Date.now();
+  if (cachedRetryHours !== null && now - retryHoursCacheTime < RETRY_CACHE_TTL) {
+    return cachedRetryHours;
+  }
+
+  try {
+    const result = await pool.query(`
+      SELECT value::int FROM settings WHERE key = 'proxy_retry_interval_hours'
+    `);
+    cachedRetryHours = result.rows[0]?.value || 4;
+    retryHoursCacheTime = now;
+  } catch {
+    cachedRetryHours = 4; // default
+  }
+  return cachedRetryHours as number;
+}
+
 // Get active proxy that's not in timeout
+// Also includes failed proxies that have passed their retry interval
 export async function getActiveProxy(): Promise<{ id: number; host: string; port: number; protocol: string; username?: string; password?: string } | null> {
+  const retryHours = await getRetryIntervalHours();
+
  const result = await pool.query(`
-    SELECT id, host, port, protocol, username, password
+    SELECT id, host, port, protocol, username, password, active, failure_count
    FROM proxies
    WHERE active = true
-    ORDER BY RANDOM()
-  `);
+       OR (
+         active = false
+         AND last_failed_at IS NOT NULL
+         AND last_failed_at < NOW() - ($1 || ' hours')::interval
+         AND failure_count < 10
+       )
+    ORDER BY
+      active DESC,
+      failure_count ASC,
+      RANDOM()
+  `, [retryHours]);

  // Filter out proxies in timeout
  for (const proxy of result.rows) {
    if (!isProxyInTimeout(proxy.id)) {
+      // If this is a retry of a failed proxy, log it
+      if (!proxy.active) {
+        console.log(`🔄 Retrying previously failed proxy ${proxy.id} (${proxy.failure_count} failures)`);
+      }
      return proxy;
    }
  }
@@ -341,11 +380,12 @@ export async function moveProxyToFailed(proxyId: number, errorMsg: string): Prom
 }

 export async function incrementProxyFailure(proxyId: number, errorMsg: string): Promise<boolean> {
-  // Increment failure count
+  // Increment failure count and set last_failed_at
  const result = await pool.query(`
    UPDATE proxies
    SET failure_count = failure_count + 1,
        active = false,
+        last_failed_at = CURRENT_TIMESTAMP,
        updated_at = CURRENT_TIMESTAMP
    WHERE id = $1
    RETURNING failure_count, host, port, protocol
@@ -358,13 +398,26 @@ export async function incrementProxyFailure(proxyId: number, errorMsg: string):
  const proxy = result.rows[0];
  const failureCount = proxy.failure_count;

-  console.log(`⚠️  Proxy failure #${failureCount}: ${proxy.protocol}://${proxy.host}:${proxy.port}`);
+  console.log(`⚠️  Proxy failure #${failureCount}: ${proxy.protocol}://${proxy.host}:${proxy.port} (will retry after cooldown)`);

-  // If failed 3 times, move to failed table
-  if (failureCount >= 3) {
+  // If failed 10+ times, move to permanently failed table
+  if (failureCount >= 10) {
    await moveProxyToFailed(proxyId, errorMsg);
    return true; // Moved to failed
  }

-  return false; // Still in active proxies
+  return false; // Still in proxies table, will be retried after cooldown
+}
+
+// Mark proxy as successful (re-enable if it was being retried)
+export async function markProxySuccess(proxyId: number): Promise<void> {
+  await pool.query(`
+    UPDATE proxies
+    SET active = true,
+        failure_count = GREATEST(0, failure_count - 1),
+        success_count = success_count + 1,
+        last_failed_at = NULL,
+        updated_at = CURRENT_TIMESTAMP
+    WHERE id = $1
+  `, [proxyId]);
 }