feat(workers): Add preflight schema and StatefulSet
- Migration 085: Add curl_ip, http_ip, fingerprint_data, preflight_status, preflight_at columns to worker_registry - StatefulSet manifest for 8 persistent workers with OnDelete update strategy 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
77
backend/k8s/scraper-worker-statefulset.yaml
Normal file
77
backend/k8s/scraper-worker-statefulset.yaml
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: scraper-worker
|
||||||
|
namespace: dispensary-scraper
|
||||||
|
labels:
|
||||||
|
app: scraper-worker
|
||||||
|
spec:
|
||||||
|
clusterIP: None # Headless service required for StatefulSet
|
||||||
|
selector:
|
||||||
|
app: scraper-worker
|
||||||
|
ports:
|
||||||
|
- port: 3010
|
||||||
|
name: http
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: StatefulSet
|
||||||
|
metadata:
|
||||||
|
name: scraper-worker
|
||||||
|
namespace: dispensary-scraper
|
||||||
|
spec:
|
||||||
|
serviceName: scraper-worker
|
||||||
|
replicas: 8
|
||||||
|
podManagementPolicy: Parallel # Start all pods at once
|
||||||
|
updateStrategy:
|
||||||
|
type: OnDelete # Pods only update when manually deleted - no automatic restarts
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app: scraper-worker
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app: scraper-worker
|
||||||
|
spec:
|
||||||
|
terminationGracePeriodSeconds: 60
|
||||||
|
imagePullSecrets:
|
||||||
|
- name: regcred
|
||||||
|
containers:
|
||||||
|
- name: worker
|
||||||
|
image: code.cannabrands.app/creationshop/dispensary-scraper:latest
|
||||||
|
imagePullPolicy: Always
|
||||||
|
command: ["node"]
|
||||||
|
args: ["dist/tasks/task-worker.js"]
|
||||||
|
env:
|
||||||
|
- name: WORKER_MODE
|
||||||
|
value: "true"
|
||||||
|
- name: POD_NAME
|
||||||
|
valueFrom:
|
||||||
|
fieldRef:
|
||||||
|
fieldPath: metadata.name
|
||||||
|
- name: MAX_CONCURRENT_TASKS
|
||||||
|
value: "50"
|
||||||
|
- name: API_BASE_URL
|
||||||
|
value: http://scraper
|
||||||
|
- name: NODE_OPTIONS
|
||||||
|
value: --max-old-space-size=1500
|
||||||
|
envFrom:
|
||||||
|
- configMapRef:
|
||||||
|
name: scraper-config
|
||||||
|
- secretRef:
|
||||||
|
name: scraper-secrets
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 100m
|
||||||
|
memory: 1Gi
|
||||||
|
limits:
|
||||||
|
cpu: 500m
|
||||||
|
memory: 2Gi
|
||||||
|
livenessProbe:
|
||||||
|
exec:
|
||||||
|
command:
|
||||||
|
- /bin/sh
|
||||||
|
- -c
|
||||||
|
- pgrep -f 'task-worker' > /dev/null
|
||||||
|
initialDelaySeconds: 10
|
||||||
|
periodSeconds: 30
|
||||||
|
failureThreshold: 3
|
||||||
168
backend/migrations/085_preflight_ip_fingerprint.sql
Normal file
168
backend/migrations/085_preflight_ip_fingerprint.sql
Normal file
@@ -0,0 +1,168 @@
|
|||||||
|
-- Migration 085: Add IP and fingerprint columns for preflight reporting
|
||||||
|
-- These columns were missing from migration 084
|
||||||
|
|
||||||
|
-- ===================================================================
|
||||||
|
-- PART 1: Add IP address columns to worker_registry
|
||||||
|
-- ===================================================================
|
||||||
|
|
||||||
|
-- IP address detected during curl/axios preflight
|
||||||
|
ALTER TABLE worker_registry
|
||||||
|
ADD COLUMN IF NOT EXISTS curl_ip VARCHAR(45);
|
||||||
|
|
||||||
|
-- IP address detected during http/Puppeteer preflight
|
||||||
|
ALTER TABLE worker_registry
|
||||||
|
ADD COLUMN IF NOT EXISTS http_ip VARCHAR(45);
|
||||||
|
|
||||||
|
-- ===================================================================
|
||||||
|
-- PART 2: Add fingerprint data column
|
||||||
|
-- ===================================================================
|
||||||
|
|
||||||
|
-- Browser fingerprint data captured during Puppeteer preflight
|
||||||
|
ALTER TABLE worker_registry
|
||||||
|
ADD COLUMN IF NOT EXISTS fingerprint_data JSONB;
|
||||||
|
|
||||||
|
-- ===================================================================
|
||||||
|
-- PART 3: Add combined preflight status/timestamp for convenience
|
||||||
|
-- ===================================================================
|
||||||
|
|
||||||
|
-- Overall preflight status (computed from both transports)
|
||||||
|
-- Values: 'pending', 'passed', 'partial', 'failed'
|
||||||
|
-- - 'pending': neither transport tested
|
||||||
|
-- - 'passed': both transports passed (or http passed for browser-only)
|
||||||
|
-- - 'partial': at least one passed
|
||||||
|
-- - 'failed': no transport passed
|
||||||
|
ALTER TABLE worker_registry
|
||||||
|
ADD COLUMN IF NOT EXISTS preflight_status VARCHAR(20) DEFAULT 'pending';
|
||||||
|
|
||||||
|
-- Most recent preflight completion timestamp
|
||||||
|
ALTER TABLE worker_registry
|
||||||
|
ADD COLUMN IF NOT EXISTS preflight_at TIMESTAMPTZ;
|
||||||
|
|
||||||
|
-- ===================================================================
|
||||||
|
-- PART 4: Update function to set preflight status
|
||||||
|
-- ===================================================================
|
||||||
|
|
||||||
|
CREATE OR REPLACE FUNCTION update_worker_preflight(
|
||||||
|
p_worker_id VARCHAR(100),
|
||||||
|
p_transport VARCHAR(10), -- 'curl' or 'http'
|
||||||
|
p_status VARCHAR(20), -- 'passed', 'failed', 'skipped'
|
||||||
|
p_ip VARCHAR(45) DEFAULT NULL,
|
||||||
|
p_response_ms INTEGER DEFAULT NULL,
|
||||||
|
p_error TEXT DEFAULT NULL,
|
||||||
|
p_fingerprint JSONB DEFAULT NULL
|
||||||
|
) RETURNS VOID AS $$
|
||||||
|
DECLARE
|
||||||
|
v_curl_status VARCHAR(20);
|
||||||
|
v_http_status VARCHAR(20);
|
||||||
|
v_overall_status VARCHAR(20);
|
||||||
|
BEGIN
|
||||||
|
IF p_transport = 'curl' THEN
|
||||||
|
UPDATE worker_registry
|
||||||
|
SET
|
||||||
|
preflight_curl_status = p_status,
|
||||||
|
preflight_curl_at = NOW(),
|
||||||
|
preflight_curl_ms = p_response_ms,
|
||||||
|
preflight_curl_error = p_error,
|
||||||
|
curl_ip = p_ip,
|
||||||
|
updated_at = NOW()
|
||||||
|
WHERE worker_id = p_worker_id;
|
||||||
|
ELSIF p_transport = 'http' THEN
|
||||||
|
UPDATE worker_registry
|
||||||
|
SET
|
||||||
|
preflight_http_status = p_status,
|
||||||
|
preflight_http_at = NOW(),
|
||||||
|
preflight_http_ms = p_response_ms,
|
||||||
|
preflight_http_error = p_error,
|
||||||
|
http_ip = p_ip,
|
||||||
|
fingerprint_data = COALESCE(p_fingerprint, fingerprint_data),
|
||||||
|
updated_at = NOW()
|
||||||
|
WHERE worker_id = p_worker_id;
|
||||||
|
END IF;
|
||||||
|
|
||||||
|
-- Update overall preflight status
|
||||||
|
SELECT preflight_curl_status, preflight_http_status
|
||||||
|
INTO v_curl_status, v_http_status
|
||||||
|
FROM worker_registry
|
||||||
|
WHERE worker_id = p_worker_id;
|
||||||
|
|
||||||
|
-- Compute overall status
|
||||||
|
IF v_curl_status = 'passed' AND v_http_status = 'passed' THEN
|
||||||
|
v_overall_status := 'passed';
|
||||||
|
ELSIF v_curl_status = 'passed' OR v_http_status = 'passed' THEN
|
||||||
|
v_overall_status := 'partial';
|
||||||
|
ELSIF v_curl_status = 'failed' OR v_http_status = 'failed' THEN
|
||||||
|
v_overall_status := 'failed';
|
||||||
|
ELSE
|
||||||
|
v_overall_status := 'pending';
|
||||||
|
END IF;
|
||||||
|
|
||||||
|
UPDATE worker_registry
|
||||||
|
SET
|
||||||
|
preflight_status = v_overall_status,
|
||||||
|
preflight_at = NOW()
|
||||||
|
WHERE worker_id = p_worker_id;
|
||||||
|
END;
|
||||||
|
$$ LANGUAGE plpgsql;
|
||||||
|
|
||||||
|
-- ===================================================================
|
||||||
|
-- PART 5: Update v_active_workers view
|
||||||
|
-- ===================================================================
|
||||||
|
|
||||||
|
DROP VIEW IF EXISTS v_active_workers;
|
||||||
|
|
||||||
|
CREATE VIEW v_active_workers AS
|
||||||
|
SELECT
|
||||||
|
wr.id,
|
||||||
|
wr.worker_id,
|
||||||
|
wr.friendly_name,
|
||||||
|
wr.role,
|
||||||
|
wr.status,
|
||||||
|
wr.pod_name,
|
||||||
|
wr.hostname,
|
||||||
|
wr.started_at,
|
||||||
|
wr.last_heartbeat_at,
|
||||||
|
wr.last_task_at,
|
||||||
|
wr.tasks_completed,
|
||||||
|
wr.tasks_failed,
|
||||||
|
wr.current_task_id,
|
||||||
|
-- IP addresses from preflights
|
||||||
|
wr.curl_ip,
|
||||||
|
wr.http_ip,
|
||||||
|
-- Combined preflight status
|
||||||
|
wr.preflight_status,
|
||||||
|
wr.preflight_at,
|
||||||
|
-- Detailed preflight status per transport
|
||||||
|
wr.preflight_curl_status,
|
||||||
|
wr.preflight_http_status,
|
||||||
|
wr.preflight_curl_at,
|
||||||
|
wr.preflight_http_at,
|
||||||
|
wr.preflight_curl_error,
|
||||||
|
wr.preflight_http_error,
|
||||||
|
wr.preflight_curl_ms,
|
||||||
|
wr.preflight_http_ms,
|
||||||
|
-- Fingerprint data
|
||||||
|
wr.fingerprint_data,
|
||||||
|
-- Computed fields
|
||||||
|
EXTRACT(EPOCH FROM (NOW() - wr.last_heartbeat_at)) as seconds_since_heartbeat,
|
||||||
|
CASE
|
||||||
|
WHEN wr.status = 'offline' THEN 'offline'
|
||||||
|
WHEN wr.last_heartbeat_at < NOW() - INTERVAL '2 minutes' THEN 'stale'
|
||||||
|
WHEN wr.current_task_id IS NOT NULL THEN 'busy'
|
||||||
|
ELSE 'ready'
|
||||||
|
END as health_status,
|
||||||
|
-- Capability flags (can this worker handle curl/http tasks?)
|
||||||
|
(wr.preflight_curl_status = 'passed') as can_curl,
|
||||||
|
(wr.preflight_http_status = 'passed') as can_http
|
||||||
|
FROM worker_registry wr
|
||||||
|
WHERE wr.status != 'terminated'
|
||||||
|
ORDER BY wr.status = 'active' DESC, wr.last_heartbeat_at DESC;
|
||||||
|
|
||||||
|
-- ===================================================================
|
||||||
|
-- Comments
|
||||||
|
-- ===================================================================
|
||||||
|
|
||||||
|
COMMENT ON COLUMN worker_registry.curl_ip IS 'IP address detected during curl/axios preflight';
|
||||||
|
COMMENT ON COLUMN worker_registry.http_ip IS 'IP address detected during Puppeteer preflight';
|
||||||
|
COMMENT ON COLUMN worker_registry.fingerprint_data IS 'Browser fingerprint captured during Puppeteer preflight';
|
||||||
|
COMMENT ON COLUMN worker_registry.preflight_status IS 'Overall preflight status: pending, passed, partial, failed';
|
||||||
|
COMMENT ON COLUMN worker_registry.preflight_at IS 'Most recent preflight completion timestamp';
|
||||||
Reference in New Issue
Block a user