fix(workers): Fix false memory backoff and add backing-off color coding

- Fix memory calculation to use max-old-space-size (1500MB, set via
  NODE_OPTIONS in the deployment manifest) instead of V8's dynamic
  heapTotal. This prevents false 95%+ readings when idle.
- Add yellow color for backing-off workers in pod visualization
- Update legend and tooltips with backing-off status
- Remove pool toggle from TasksDashboard (moved to Workers page)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-11 19:11:42 -07:00
parent 4679b245de
commit fdce5e0302
4 changed files with 42 additions and 52 deletions

View File

@@ -84,6 +84,20 @@ const MAX_CONCURRENT_TASKS = parseInt(process.env.MAX_CONCURRENT_TASKS || '3');
// Default 85% - gives headroom before OOM // Default 85% - gives headroom before OOM
const MEMORY_BACKOFF_THRESHOLD = parseFloat(process.env.MEMORY_BACKOFF_THRESHOLD || '0.85'); const MEMORY_BACKOFF_THRESHOLD = parseFloat(process.env.MEMORY_BACKOFF_THRESHOLD || '0.85');
/**
 * Read the configured old-space heap limit, in MB, from NODE_OPTIONS
 * (e.g. `--max-old-space-size=1500`).
 *
 * The result is used as the denominator for the memory percentage calculation.
 * V8's heapTotal is dynamic and stays small when idle, which causes false high
 * percentages if it is used instead.
 *
 * Both spellings of the flag are accepted (`--max-old-space-size=N` and
 * `--max_old_space_size=N`), since Node allows dashes and underscores
 * interchangeably in option names.
 *
 * @returns The positive limit parsed from NODE_OPTIONS, or 512 when the flag
 *   is absent or does not yield a positive finite number.
 */
function getMaxHeapSizeMb(): number {
  // Fallback: use 512MB if the flag is not specified (or is unusable).
  const fallbackMb = 512;
  const nodeOptions = process.env.NODE_OPTIONS ?? '';
  const match = /--max[-_]old[-_]space[-_]size=(\d+)/.exec(nodeOptions);
  const digits = match?.[1];
  const parsedMb = digits !== undefined ? parseInt(digits, 10) : NaN;
  // A zero (or overflowing) value must never be returned: callers divide by
  // this number, and 0 would turn every memory percentage into Infinity/NaN.
  return Number.isFinite(parsedMb) && parsedMb > 0 ? parsedMb : fallbackMb;
}
const MAX_HEAP_SIZE_MB = getMaxHeapSizeMb();
// When CPU usage exceeds this threshold (as decimal 0.0-1.0), stop claiming new tasks // When CPU usage exceeds this threshold (as decimal 0.0-1.0), stop claiming new tasks
// Default 90% - allows some burst capacity // Default 90% - allows some burst capacity
const CPU_BACKOFF_THRESHOLD = parseFloat(process.env.CPU_BACKOFF_THRESHOLD || '0.90'); const CPU_BACKOFF_THRESHOLD = parseFloat(process.env.CPU_BACKOFF_THRESHOLD || '0.90');
@@ -186,12 +200,16 @@ export class TaskWorker {
/** /**
* Get current resource usage * Get current resource usage
* Memory percentage is calculated against MAX_HEAP_SIZE_MB (from --max-old-space-size)
* NOT against V8's dynamic heapTotal which stays small when idle
*/ */
private getResourceStats(): ResourceStats { private getResourceStats(): ResourceStats {
const memUsage = process.memoryUsage(); const memUsage = process.memoryUsage();
const heapUsedMb = memUsage.heapUsed / 1024 / 1024; const heapUsedMb = memUsage.heapUsed / 1024 / 1024;
const heapTotalMb = memUsage.heapTotal / 1024 / 1024; // Use MAX_HEAP_SIZE_MB as ceiling, not dynamic heapTotal
const memoryPercent = heapUsedMb / heapTotalMb; // V8's heapTotal stays small when idle (e.g., 36MB) causing false 95%+ readings
// With --max-old-space-size=1500, we should calculate against 1500MB
const memoryPercent = heapUsedMb / MAX_HEAP_SIZE_MB;
// Calculate CPU usage since last check // Calculate CPU usage since last check
const cpuUsage = process.cpuUsage(); const cpuUsage = process.cpuUsage();
@@ -212,7 +230,7 @@ export class TaskWorker {
return { return {
memoryPercent, memoryPercent,
memoryMb: Math.round(heapUsedMb), memoryMb: Math.round(heapUsedMb),
memoryTotalMb: Math.round(heapTotalMb), memoryTotalMb: MAX_HEAP_SIZE_MB, // Use max-old-space-size, not dynamic heapTotal
cpuPercent: Math.min(100, cpuPercent), // Cap at 100% cpuPercent: Math.min(100, cpuPercent), // Cap at 100%
isBackingOff: this.isBackingOff, isBackingOff: this.isBackingOff,
backoffReason: this.backoffReason, backoffReason: this.backoffReason,

View File

@@ -16,7 +16,6 @@ import {
ChevronRight, ChevronRight,
Gauge, Gauge,
Users, Users,
Play,
Square, Square,
Plus, Plus,
X, X,
@@ -451,7 +450,6 @@ export default function TasksDashboard() {
const [loading, setLoading] = useState(true); const [loading, setLoading] = useState(true);
const [error, setError] = useState<string | null>(null); const [error, setError] = useState<string | null>(null);
const [poolPaused, setPoolPaused] = useState(false); const [poolPaused, setPoolPaused] = useState(false);
const [poolLoading, setPoolLoading] = useState(false);
const [showCreateModal, setShowCreateModal] = useState(false); const [showCreateModal, setShowCreateModal] = useState(false);
// Pagination // Pagination
@@ -490,23 +488,6 @@ export default function TasksDashboard() {
} }
}; };
// Toggle the task pool: resume it when it is currently paused, pause it
// otherwise. The loading flag is raised for the duration of the request and
// always cleared in `finally`; a failure is surfaced through setError.
const togglePool = async (): Promise<void> => {
  setPoolLoading(true);
  try {
    if (poolPaused) {
      await api.resumeTaskPool();
      setPoolPaused(false);
    } else {
      await api.pauseTaskPool();
      setPoolPaused(true);
    }
  } catch (err: unknown) {
    // A rejection is not guaranteed to be an Error (it can be null, a string,
    // or a plain object), so read `message` defensively instead of via `any`.
    const message =
      typeof err === 'object' && err !== null && 'message' in err
        ? (err as { message?: unknown }).message
        : undefined;
    // Only a non-empty string may reach the error state.
    setError(typeof message === 'string' && message ? message : 'Failed to toggle pool');
  } finally {
    setPoolLoading(false);
  }
};
const handleDeleteTask = async (taskId: number) => { const handleDeleteTask = async (taskId: number) => {
if (!confirm('Delete this task?')) return; if (!confirm('Delete this task?')) return;
try { try {
@@ -579,28 +560,13 @@ export default function TasksDashboard() {
<Plus className="w-4 h-4" /> <Plus className="w-4 h-4" />
Create Task Create Task
</button> </button>
{/* Pool Toggle */} {/* Pool status indicator */}
<button {poolPaused && (
onClick={togglePool} <span className="inline-flex items-center gap-1.5 px-3 py-1.5 rounded-full text-sm font-medium bg-yellow-100 text-yellow-800">
disabled={poolLoading} <Square className="w-4 h-4" />
className={`flex items-center gap-2 px-4 py-2 rounded-lg font-medium transition-colors ${ Pool Paused
poolPaused </span>
? 'bg-emerald-100 text-emerald-700 hover:bg-emerald-200'
: 'bg-red-100 text-red-700 hover:bg-red-200'
}`}
>
{poolPaused ? (
<>
<Play className={`w-5 h-5 ${poolLoading ? 'animate-pulse' : ''}`} />
Resume Pool
</>
) : (
<>
<Square className={`w-5 h-5 ${poolLoading ? 'animate-pulse' : ''}`} />
Pause Pool
</>
)} )}
</button>
<span className="text-sm text-gray-400">Auto-refreshes every 15s</span> <span className="text-sm text-gray-400">Auto-refreshes every 15s</span>
</div> </div>
</div> </div>

View File

@@ -369,8 +369,10 @@ function PodVisualization({
const isBusy = worker.current_task_id !== null; const isBusy = worker.current_task_id !== null;
const isDecommissioning = worker.decommission_requested; const isDecommissioning = worker.decommission_requested;
const workerColor = isDecommissioning ? 'bg-orange-500' : isBusy ? 'bg-blue-500' : 'bg-emerald-500'; const isBackingOff = worker.metadata?.is_backing_off;
const workerBorder = isDecommissioning ? 'border-orange-300' : isBusy ? 'border-blue-300' : 'border-emerald-300'; // Color priority: decommissioning > backing off > busy > idle
const workerColor = isDecommissioning ? 'bg-orange-500' : isBackingOff ? 'bg-yellow-500' : isBusy ? 'bg-blue-500' : 'bg-emerald-500';
const workerBorder = isDecommissioning ? 'border-orange-300' : isBackingOff ? 'border-yellow-300' : isBusy ? 'border-blue-300' : 'border-emerald-300';
// Line from center to worker // Line from center to worker
const lineLength = radius - 10; const lineLength = radius - 10;
@@ -381,7 +383,7 @@ function PodVisualization({
<div key={worker.id}> <div key={worker.id}>
{/* Connection line */} {/* Connection line */}
<div <div
className={`absolute w-0.5 ${isDecommissioning ? 'bg-orange-300' : isBusy ? 'bg-blue-300' : 'bg-emerald-300'}`} className={`absolute w-0.5 ${isDecommissioning ? 'bg-orange-300' : isBackingOff ? 'bg-yellow-300' : isBusy ? 'bg-blue-300' : 'bg-emerald-300'}`}
style={{ style={{
height: `${lineLength}px`, height: `${lineLength}px`,
left: '50%', left: '50%',
@@ -398,7 +400,7 @@ function PodVisualization({
top: '50%', top: '50%',
transform: `translate(-50%, -50%) translate(${x}px, ${y}px)`, transform: `translate(-50%, -50%) translate(${x}px, ${y}px)`,
}} }}
title={`${worker.friendly_name}\nStatus: ${isDecommissioning ? 'Stopping after current task' : isBusy ? `Working on task #${worker.current_task_id}` : 'Idle - waiting for tasks'}\nMemory: ${worker.metadata?.memory_mb || 0} MB\nCPU: ${formatCpuTime(worker.metadata?.cpu_user_ms || 0)} user, ${formatCpuTime(worker.metadata?.cpu_system_ms || 0)} sys\nCompleted: ${worker.tasks_completed} | Failed: ${worker.tasks_failed}\nLast heartbeat: ${new Date(worker.last_heartbeat_at).toLocaleTimeString()}`} title={`${worker.friendly_name}\nStatus: ${isDecommissioning ? 'Stopping after current task' : isBackingOff ? `Backing off: ${worker.metadata?.backoff_reason || 'resource pressure'}` : isBusy ? `Working on task #${worker.current_task_id}` : 'Ready - waiting for tasks'}\nMemory: ${worker.metadata?.memory_mb || 0} MB (${worker.metadata?.memory_percent || 0}%)\nCPU: ${formatCpuTime(worker.metadata?.cpu_user_ms || 0)} user, ${formatCpuTime(worker.metadata?.cpu_system_ms || 0)} sys\nCompleted: ${worker.tasks_completed} | Failed: ${worker.tasks_failed}\nLast heartbeat: ${new Date(worker.last_heartbeat_at).toLocaleTimeString()}`}
> >
{index + 1} {index + 1}
</div> </div>
@@ -700,11 +702,11 @@ export function WorkersDashboard() {
Worker Pods ({Array.from(groupWorkersByPod(workers)).length} pods, {activeWorkers.length} workers) Worker Pods ({Array.from(groupWorkersByPod(workers)).length} pods, {activeWorkers.length} workers)
</h3> </h3>
<p className="text-xs text-gray-500 mt-0.5"> <p className="text-xs text-gray-500 mt-0.5">
<span className="inline-flex items-center gap-1"><span className="w-2 h-2 rounded-full bg-emerald-500"></span> idle</span> <span className="inline-flex items-center gap-1"><span className="w-2 h-2 rounded-full bg-emerald-500"></span> ready</span>
<span className="mx-2">|</span> <span className="mx-2">|</span>
<span className="inline-flex items-center gap-1"><span className="w-2 h-2 rounded-full bg-blue-500"></span> busy</span> <span className="inline-flex items-center gap-1"><span className="w-2 h-2 rounded-full bg-blue-500"></span> busy</span>
<span className="mx-2">|</span> <span className="mx-2">|</span>
<span className="inline-flex items-center gap-1"><span className="w-2 h-2 rounded-full bg-yellow-500"></span> mixed</span> <span className="inline-flex items-center gap-1"><span className="w-2 h-2 rounded-full bg-yellow-500"></span> backing off</span>
<span className="mx-2">|</span> <span className="mx-2">|</span>
<span className="inline-flex items-center gap-1"><span className="w-2 h-2 rounded-full bg-orange-500"></span> stopping</span> <span className="inline-flex items-center gap-1"><span className="w-2 h-2 rounded-full bg-orange-500"></span> stopping</span>
</p> </p>

View File

@@ -40,12 +40,16 @@ spec:
valueFrom: valueFrom:
fieldRef: fieldRef:
fieldPath: metadata.name fieldPath: metadata.name
- name: API_BASE_URL
value: "http://scraper"
- name: NODE_OPTIONS
value: "--max-old-space-size=1500"
resources: resources:
requests: requests:
memory: "256Mi" memory: "1Gi"
cpu: "100m" cpu: "100m"
limits: limits:
memory: "512Mi" memory: "2Gi"
cpu: "500m" cpu: "500m"
livenessProbe: livenessProbe:
exec: exec: