fix(workers): Fix false memory backoff and add backing-off color coding
- Fix memory calculation to use max-old-space-size (1500MB) instead of V8's
  dynamic heapTotal. This prevents false 95%+ readings when idle.
- Add yellow color for backing-off workers in pod visualization
- Update legend and tooltips with backing-off status
- Remove pool toggle from TasksDashboard (moved to Workers page)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -84,6 +84,20 @@ const MAX_CONCURRENT_TASKS = parseInt(process.env.MAX_CONCURRENT_TASKS || '3');
|
|||||||
// Default 85% - gives headroom before OOM
|
// Default 85% - gives headroom before OOM
|
||||||
const MEMORY_BACKOFF_THRESHOLD = parseFloat(process.env.MEMORY_BACKOFF_THRESHOLD || '0.85');
|
const MEMORY_BACKOFF_THRESHOLD = parseFloat(process.env.MEMORY_BACKOFF_THRESHOLD || '0.85');
|
||||||
|
|
||||||
|
// Parse max heap size from NODE_OPTIONS (--max-old-space-size=1500)
|
||||||
|
// This is used as the denominator for memory percentage calculation
|
||||||
|
// V8's heapTotal is dynamic and stays small when idle, causing false high percentages
|
||||||
|
/**
 * Resolve the configured V8 old-space limit, in megabytes, from NODE_OPTIONS.
 *
 * The result is used as the denominator of the memory-percentage calculation,
 * so it must always be a positive number.
 *
 * @returns The value of the max-old-space-size flag when present and positive,
 *          otherwise the 512 MB fallback.
 */
function getMaxHeapSizeMb(): number {
  const FALLBACK_MB = 512;
  const nodeOptions = process.env.NODE_OPTIONS || '';

  // V8 flags may be spelled with dashes or underscores
  // (--max-old-space-size / --max_old_space_size); accept both.
  const match = nodeOptions.match(/--max[-_]old[-_]space[-_]size=(\d+)/);
  if (match) {
    const sizeMb = parseInt(match[1], 10);
    // Reject 0: dividing heap usage by it would produce Infinity/NaN.
    if (sizeMb > 0) {
      return sizeMb;
    }
  }

  // Fallback: use 512MB if not specified (or not usable)
  return FALLBACK_MB;
}
|
||||||
|
const MAX_HEAP_SIZE_MB = getMaxHeapSizeMb();
|
||||||
|
|
||||||
// When CPU usage exceeds this threshold (as decimal 0.0-1.0), stop claiming new tasks
|
// When CPU usage exceeds this threshold (as decimal 0.0-1.0), stop claiming new tasks
|
||||||
// Default 90% - allows some burst capacity
|
// Default 90% - allows some burst capacity
|
||||||
const CPU_BACKOFF_THRESHOLD = parseFloat(process.env.CPU_BACKOFF_THRESHOLD || '0.90');
|
const CPU_BACKOFF_THRESHOLD = parseFloat(process.env.CPU_BACKOFF_THRESHOLD || '0.90');
|
||||||
@@ -186,12 +200,16 @@ export class TaskWorker {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Get current resource usage
|
* Get current resource usage
|
||||||
|
* Memory percentage is calculated against MAX_HEAP_SIZE_MB (from --max-old-space-size)
|
||||||
|
* NOT against V8's dynamic heapTotal which stays small when idle
|
||||||
*/
|
*/
|
||||||
private getResourceStats(): ResourceStats {
|
private getResourceStats(): ResourceStats {
|
||||||
const memUsage = process.memoryUsage();
|
const memUsage = process.memoryUsage();
|
||||||
const heapUsedMb = memUsage.heapUsed / 1024 / 1024;
|
const heapUsedMb = memUsage.heapUsed / 1024 / 1024;
|
||||||
const heapTotalMb = memUsage.heapTotal / 1024 / 1024;
|
// Use MAX_HEAP_SIZE_MB as ceiling, not dynamic heapTotal
|
||||||
const memoryPercent = heapUsedMb / heapTotalMb;
|
// V8's heapTotal stays small when idle (e.g., 36MB) causing false 95%+ readings
|
||||||
|
// With --max-old-space-size=1500, we should calculate against 1500MB
|
||||||
|
const memoryPercent = heapUsedMb / MAX_HEAP_SIZE_MB;
|
||||||
|
|
||||||
// Calculate CPU usage since last check
|
// Calculate CPU usage since last check
|
||||||
const cpuUsage = process.cpuUsage();
|
const cpuUsage = process.cpuUsage();
|
||||||
@@ -212,7 +230,7 @@ export class TaskWorker {
|
|||||||
return {
|
return {
|
||||||
memoryPercent,
|
memoryPercent,
|
||||||
memoryMb: Math.round(heapUsedMb),
|
memoryMb: Math.round(heapUsedMb),
|
||||||
memoryTotalMb: Math.round(heapTotalMb),
|
memoryTotalMb: MAX_HEAP_SIZE_MB, // Use max-old-space-size, not dynamic heapTotal
|
||||||
cpuPercent: Math.min(100, cpuPercent), // Cap at 100%
|
cpuPercent: Math.min(100, cpuPercent), // Cap at 100%
|
||||||
isBackingOff: this.isBackingOff,
|
isBackingOff: this.isBackingOff,
|
||||||
backoffReason: this.backoffReason,
|
backoffReason: this.backoffReason,
|
||||||
|
|||||||
@@ -16,7 +16,6 @@ import {
|
|||||||
ChevronRight,
|
ChevronRight,
|
||||||
Gauge,
|
Gauge,
|
||||||
Users,
|
Users,
|
||||||
Play,
|
|
||||||
Square,
|
Square,
|
||||||
Plus,
|
Plus,
|
||||||
X,
|
X,
|
||||||
@@ -451,7 +450,6 @@ export default function TasksDashboard() {
|
|||||||
const [loading, setLoading] = useState(true);
|
const [loading, setLoading] = useState(true);
|
||||||
const [error, setError] = useState<string | null>(null);
|
const [error, setError] = useState<string | null>(null);
|
||||||
const [poolPaused, setPoolPaused] = useState(false);
|
const [poolPaused, setPoolPaused] = useState(false);
|
||||||
const [poolLoading, setPoolLoading] = useState(false);
|
|
||||||
const [showCreateModal, setShowCreateModal] = useState(false);
|
const [showCreateModal, setShowCreateModal] = useState(false);
|
||||||
|
|
||||||
// Pagination
|
// Pagination
|
||||||
@@ -490,23 +488,6 @@ export default function TasksDashboard() {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
const togglePool = async () => {
|
|
||||||
setPoolLoading(true);
|
|
||||||
try {
|
|
||||||
if (poolPaused) {
|
|
||||||
await api.resumeTaskPool();
|
|
||||||
setPoolPaused(false);
|
|
||||||
} else {
|
|
||||||
await api.pauseTaskPool();
|
|
||||||
setPoolPaused(true);
|
|
||||||
}
|
|
||||||
} catch (err: any) {
|
|
||||||
setError(err.message || 'Failed to toggle pool');
|
|
||||||
} finally {
|
|
||||||
setPoolLoading(false);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
const handleDeleteTask = async (taskId: number) => {
|
const handleDeleteTask = async (taskId: number) => {
|
||||||
if (!confirm('Delete this task?')) return;
|
if (!confirm('Delete this task?')) return;
|
||||||
try {
|
try {
|
||||||
@@ -579,28 +560,13 @@ export default function TasksDashboard() {
|
|||||||
<Plus className="w-4 h-4" />
|
<Plus className="w-4 h-4" />
|
||||||
Create Task
|
Create Task
|
||||||
</button>
|
</button>
|
||||||
{/* Pool Toggle */}
|
{/* Pool status indicator */}
|
||||||
<button
|
{poolPaused && (
|
||||||
onClick={togglePool}
|
<span className="inline-flex items-center gap-1.5 px-3 py-1.5 rounded-full text-sm font-medium bg-yellow-100 text-yellow-800">
|
||||||
disabled={poolLoading}
|
<Square className="w-4 h-4" />
|
||||||
className={`flex items-center gap-2 px-4 py-2 rounded-lg font-medium transition-colors ${
|
Pool Paused
|
||||||
poolPaused
|
</span>
|
||||||
? 'bg-emerald-100 text-emerald-700 hover:bg-emerald-200'
|
)}
|
||||||
: 'bg-red-100 text-red-700 hover:bg-red-200'
|
|
||||||
}`}
|
|
||||||
>
|
|
||||||
{poolPaused ? (
|
|
||||||
<>
|
|
||||||
<Play className={`w-5 h-5 ${poolLoading ? 'animate-pulse' : ''}`} />
|
|
||||||
Resume Pool
|
|
||||||
</>
|
|
||||||
) : (
|
|
||||||
<>
|
|
||||||
<Square className={`w-5 h-5 ${poolLoading ? 'animate-pulse' : ''}`} />
|
|
||||||
Pause Pool
|
|
||||||
</>
|
|
||||||
)}
|
|
||||||
</button>
|
|
||||||
<span className="text-sm text-gray-400">Auto-refreshes every 15s</span>
|
<span className="text-sm text-gray-400">Auto-refreshes every 15s</span>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
@@ -369,8 +369,10 @@ function PodVisualization({
|
|||||||
|
|
||||||
const isBusy = worker.current_task_id !== null;
|
const isBusy = worker.current_task_id !== null;
|
||||||
const isDecommissioning = worker.decommission_requested;
|
const isDecommissioning = worker.decommission_requested;
|
||||||
const workerColor = isDecommissioning ? 'bg-orange-500' : isBusy ? 'bg-blue-500' : 'bg-emerald-500';
|
const isBackingOff = worker.metadata?.is_backing_off;
|
||||||
const workerBorder = isDecommissioning ? 'border-orange-300' : isBusy ? 'border-blue-300' : 'border-emerald-300';
|
// Color priority: decommissioning > backing off > busy > idle
|
||||||
|
const workerColor = isDecommissioning ? 'bg-orange-500' : isBackingOff ? 'bg-yellow-500' : isBusy ? 'bg-blue-500' : 'bg-emerald-500';
|
||||||
|
const workerBorder = isDecommissioning ? 'border-orange-300' : isBackingOff ? 'border-yellow-300' : isBusy ? 'border-blue-300' : 'border-emerald-300';
|
||||||
|
|
||||||
// Line from center to worker
|
// Line from center to worker
|
||||||
const lineLength = radius - 10;
|
const lineLength = radius - 10;
|
||||||
@@ -381,7 +383,7 @@ function PodVisualization({
|
|||||||
<div key={worker.id}>
|
<div key={worker.id}>
|
||||||
{/* Connection line */}
|
{/* Connection line */}
|
||||||
<div
|
<div
|
||||||
className={`absolute w-0.5 ${isDecommissioning ? 'bg-orange-300' : isBusy ? 'bg-blue-300' : 'bg-emerald-300'}`}
|
className={`absolute w-0.5 ${isDecommissioning ? 'bg-orange-300' : isBackingOff ? 'bg-yellow-300' : isBusy ? 'bg-blue-300' : 'bg-emerald-300'}`}
|
||||||
style={{
|
style={{
|
||||||
height: `${lineLength}px`,
|
height: `${lineLength}px`,
|
||||||
left: '50%',
|
left: '50%',
|
||||||
@@ -398,7 +400,7 @@ function PodVisualization({
|
|||||||
top: '50%',
|
top: '50%',
|
||||||
transform: `translate(-50%, -50%) translate(${x}px, ${y}px)`,
|
transform: `translate(-50%, -50%) translate(${x}px, ${y}px)`,
|
||||||
}}
|
}}
|
||||||
title={`${worker.friendly_name}\nStatus: ${isDecommissioning ? 'Stopping after current task' : isBusy ? `Working on task #${worker.current_task_id}` : 'Idle - waiting for tasks'}\nMemory: ${worker.metadata?.memory_mb || 0} MB\nCPU: ${formatCpuTime(worker.metadata?.cpu_user_ms || 0)} user, ${formatCpuTime(worker.metadata?.cpu_system_ms || 0)} sys\nCompleted: ${worker.tasks_completed} | Failed: ${worker.tasks_failed}\nLast heartbeat: ${new Date(worker.last_heartbeat_at).toLocaleTimeString()}`}
|
title={`${worker.friendly_name}\nStatus: ${isDecommissioning ? 'Stopping after current task' : isBackingOff ? `Backing off: ${worker.metadata?.backoff_reason || 'resource pressure'}` : isBusy ? `Working on task #${worker.current_task_id}` : 'Ready - waiting for tasks'}\nMemory: ${worker.metadata?.memory_mb || 0} MB (${worker.metadata?.memory_percent || 0}%)\nCPU: ${formatCpuTime(worker.metadata?.cpu_user_ms || 0)} user, ${formatCpuTime(worker.metadata?.cpu_system_ms || 0)} sys\nCompleted: ${worker.tasks_completed} | Failed: ${worker.tasks_failed}\nLast heartbeat: ${new Date(worker.last_heartbeat_at).toLocaleTimeString()}`}
|
||||||
>
|
>
|
||||||
{index + 1}
|
{index + 1}
|
||||||
</div>
|
</div>
|
||||||
@@ -700,11 +702,11 @@ export function WorkersDashboard() {
|
|||||||
Worker Pods ({Array.from(groupWorkersByPod(workers)).length} pods, {activeWorkers.length} workers)
|
Worker Pods ({Array.from(groupWorkersByPod(workers)).length} pods, {activeWorkers.length} workers)
|
||||||
</h3>
|
</h3>
|
||||||
<p className="text-xs text-gray-500 mt-0.5">
|
<p className="text-xs text-gray-500 mt-0.5">
|
||||||
<span className="inline-flex items-center gap-1"><span className="w-2 h-2 rounded-full bg-emerald-500"></span> idle</span>
|
<span className="inline-flex items-center gap-1"><span className="w-2 h-2 rounded-full bg-emerald-500"></span> ready</span>
|
||||||
<span className="mx-2">|</span>
|
<span className="mx-2">|</span>
|
||||||
<span className="inline-flex items-center gap-1"><span className="w-2 h-2 rounded-full bg-blue-500"></span> busy</span>
|
<span className="inline-flex items-center gap-1"><span className="w-2 h-2 rounded-full bg-blue-500"></span> busy</span>
|
||||||
<span className="mx-2">|</span>
|
<span className="mx-2">|</span>
|
||||||
<span className="inline-flex items-center gap-1"><span className="w-2 h-2 rounded-full bg-yellow-500"></span> mixed</span>
|
<span className="inline-flex items-center gap-1"><span className="w-2 h-2 rounded-full bg-yellow-500"></span> backing off</span>
|
||||||
<span className="mx-2">|</span>
|
<span className="mx-2">|</span>
|
||||||
<span className="inline-flex items-center gap-1"><span className="w-2 h-2 rounded-full bg-orange-500"></span> stopping</span>
|
<span className="inline-flex items-center gap-1"><span className="w-2 h-2 rounded-full bg-orange-500"></span> stopping</span>
|
||||||
</p>
|
</p>
|
||||||
|
|||||||
@@ -40,12 +40,16 @@ spec:
|
|||||||
valueFrom:
|
valueFrom:
|
||||||
fieldRef:
|
fieldRef:
|
||||||
fieldPath: metadata.name
|
fieldPath: metadata.name
|
||||||
|
- name: API_BASE_URL
|
||||||
|
value: "http://scraper"
|
||||||
|
- name: NODE_OPTIONS
|
||||||
|
value: "--max-old-space-size=1500"
|
||||||
resources:
|
resources:
|
||||||
requests:
|
requests:
|
||||||
memory: "256Mi"
|
memory: "1Gi"
|
||||||
cpu: "100m"
|
cpu: "100m"
|
||||||
limits:
|
limits:
|
||||||
memory: "512Mi"
|
memory: "2Gi"
|
||||||
cpu: "500m"
|
cpu: "500m"
|
||||||
livenessProbe:
|
livenessProbe:
|
||||||
exec:
|
exec:
|
||||||
|
|||||||
Reference in New Issue
Block a user