From fdce5e03029fd25400d31092638017066f9844bc Mon Sep 17 00:00:00 2001 From: Kelly Date: Thu, 11 Dec 2025 19:11:42 -0700 Subject: [PATCH] fix(workers): Fix false memory backoff and add backing-off color coding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix memory calculation to use max-old-space-size (1500MB) instead of V8's dynamic heapTotal. This prevents false 95%+ readings when idle. - Add yellow color for backing-off workers in pod visualization - Update legend and tooltips with backing-off status - Remove pool toggle from TasksDashboard (moved to Workers page) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- backend/src/tasks/task-worker.ts | 24 +++++++++++-- cannaiq/src/pages/TasksDashboard.tsx | 48 ++++---------------------- cannaiq/src/pages/WorkersDashboard.tsx | 14 ++++---- k8s/scraper-worker.yaml | 8 +++-- 4 files changed, 42 insertions(+), 52 deletions(-) diff --git a/backend/src/tasks/task-worker.ts b/backend/src/tasks/task-worker.ts index bdc44506..baee5773 100644 --- a/backend/src/tasks/task-worker.ts +++ b/backend/src/tasks/task-worker.ts @@ -84,6 +84,20 @@ const MAX_CONCURRENT_TASKS = parseInt(process.env.MAX_CONCURRENT_TASKS || '3'); // Default 85% - gives headroom before OOM const MEMORY_BACKOFF_THRESHOLD = parseFloat(process.env.MEMORY_BACKOFF_THRESHOLD || '0.85'); +// Parse max heap size from NODE_OPTIONS (--max-old-space-size=1500) +// This is used as the denominator for memory percentage calculation +// V8's heapTotal is dynamic and stays small when idle, causing false high percentages +function getMaxHeapSizeMb(): number { + const nodeOptions = process.env.NODE_OPTIONS || ''; + const match = nodeOptions.match(/--max-old-space-size=(\d+)/); + if (match) { + return parseInt(match[1], 10); + } + // Fallback: use 512MB if not specified + return 512; +} +const MAX_HEAP_SIZE_MB = getMaxHeapSizeMb(); + // When CPU usage exceeds this threshold (as decimal 0.0-1.0), stop claiming new tasks // Default 90% - allows some burst capacity const CPU_BACKOFF_THRESHOLD = parseFloat(process.env.CPU_BACKOFF_THRESHOLD || '0.90'); @@ -186,12 +200,16 @@ export class TaskWorker { /** * Get current resource usage + * Memory percentage is calculated against MAX_HEAP_SIZE_MB (from --max-old-space-size) + * NOT against V8's dynamic heapTotal which stays small when idle */ private getResourceStats(): ResourceStats { const memUsage = process.memoryUsage(); const heapUsedMb = memUsage.heapUsed / 1024 / 1024; - const heapTotalMb = memUsage.heapTotal / 1024 / 1024; - const memoryPercent = heapUsedMb / heapTotalMb; + // Use MAX_HEAP_SIZE_MB as ceiling, not dynamic heapTotal + // V8's heapTotal stays small when idle (e.g., 36MB) causing false 95%+ readings + // With --max-old-space-size=1500, we should calculate against 1500MB + const memoryPercent = heapUsedMb / MAX_HEAP_SIZE_MB; // Calculate CPU usage since last check const cpuUsage = process.cpuUsage(); @@ -212,7 +230,7 @@ export class TaskWorker { return { memoryPercent, memoryMb: Math.round(heapUsedMb), - memoryTotalMb: Math.round(heapTotalMb), + memoryTotalMb: MAX_HEAP_SIZE_MB, // Use max-old-space-size, not dynamic heapTotal cpuPercent: Math.min(100, cpuPercent), // Cap at 100% isBackingOff: this.isBackingOff, backoffReason: this.backoffReason, diff --git a/cannaiq/src/pages/TasksDashboard.tsx b/cannaiq/src/pages/TasksDashboard.tsx index c472f571..1600cc3e 100644 --- a/cannaiq/src/pages/TasksDashboard.tsx +++ b/cannaiq/src/pages/TasksDashboard.tsx @@ -16,7 +16,6 @@ import { ChevronRight, Gauge, Users, - Play, Square, Plus, X, @@ -451,7 +450,6 @@ export default function TasksDashboard() { const [loading, setLoading] = useState(true); const [error, setError] = useState(null); const [poolPaused, setPoolPaused] = useState(false); - const [poolLoading, setPoolLoading] = useState(false); const [showCreateModal, setShowCreateModal] = useState(false); // Pagination @@ -490,23 +488,6 @@ export default function TasksDashboard() { } }; - const togglePool = async () => { - setPoolLoading(true); - try { - if (poolPaused) { - await api.resumeTaskPool(); - setPoolPaused(false); - } else { - await api.pauseTaskPool(); - setPoolPaused(true); - } - } catch (err: any) { - setError(err.message || 'Failed to toggle pool'); - } finally { - setPoolLoading(false); - } - }; - const handleDeleteTask = async (taskId: number) => { if (!confirm('Delete this task?')) return; try { @@ -579,28 +560,13 @@ export default function TasksDashboard() { Create Task - {/* Pool Toggle */} - + {/* Pool status indicator */} + {poolPaused && ( + + + Pool Paused + + )} Auto-refreshes every 15s diff --git a/cannaiq/src/pages/WorkersDashboard.tsx b/cannaiq/src/pages/WorkersDashboard.tsx index be704dbd..3540e33a 100644 --- a/cannaiq/src/pages/WorkersDashboard.tsx +++ b/cannaiq/src/pages/WorkersDashboard.tsx @@ -369,8 +369,10 @@ function PodVisualization({ const isBusy = worker.current_task_id !== null; const isDecommissioning = worker.decommission_requested; - const workerColor = isDecommissioning ? 'bg-orange-500' : isBusy ? 'bg-blue-500' : 'bg-emerald-500'; - const workerBorder = isDecommissioning ? 'border-orange-300' : isBusy ? 'border-blue-300' : 'border-emerald-300'; + const isBackingOff = worker.metadata?.is_backing_off; + // Color priority: decommissioning > backing off > busy > idle + const workerColor = isDecommissioning ? 'bg-orange-500' : isBackingOff ? 'bg-yellow-500' : isBusy ? 'bg-blue-500' : 'bg-emerald-500'; + const workerBorder = isDecommissioning ? 'border-orange-300' : isBackingOff ? 'border-yellow-300' : isBusy ? 'border-blue-300' : 'border-emerald-300'; // Line from center to worker const lineLength = radius - 10; @@ -381,7 +383,7 @@ function PodVisualization({
{/* Connection line */}
{index + 1}
@@ -700,11 +702,11 @@ export function WorkersDashboard() { Worker Pods ({Array.from(groupWorkersByPod(workers)).length} pods, {activeWorkers.length} workers)

- idle + ready | busy | - mixed + backing off | stopping

diff --git a/k8s/scraper-worker.yaml b/k8s/scraper-worker.yaml index 2e490d22..4d6179d2 100644 --- a/k8s/scraper-worker.yaml +++ b/k8s/scraper-worker.yaml @@ -40,12 +40,16 @@ spec: valueFrom: fieldRef: fieldPath: metadata.name + - name: API_BASE_URL + value: "http://scraper" + - name: NODE_OPTIONS + value: "--max-old-space-size=1500" resources: requests: - memory: "256Mi" + memory: "1Gi" cpu: "100m" limits: - memory: "512Mi" + memory: "2Gi" cpu: "500m" livenessProbe: exec: