diff --git a/backend/k8s/scraper-worker-statefulset.yaml b/backend/k8s/scraper-worker-statefulset.yaml
new file mode 100644
index 00000000..25973abd
--- /dev/null
+++ b/backend/k8s/scraper-worker-statefulset.yaml
@@ -0,0 +1,77 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: scraper-worker
+  namespace: dispensary-scraper
+  labels:
+    app: scraper-worker
+spec:
+  clusterIP: None # Headless service required for StatefulSet
+  selector:
+    app: scraper-worker
+  ports:
+    - port: 3010
+      name: http
+---
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: scraper-worker
+  namespace: dispensary-scraper
+spec:
+  serviceName: scraper-worker
+  replicas: 8
+  podManagementPolicy: Parallel # Start all pods at once
+  updateStrategy:
+    type: OnDelete # Pods only update when manually deleted - no automatic restarts
+  selector:
+    matchLabels:
+      app: scraper-worker
+  template:
+    metadata:
+      labels:
+        app: scraper-worker
+    spec:
+      terminationGracePeriodSeconds: 60
+      imagePullSecrets:
+        - name: regcred
+      containers:
+        - name: worker
+          image: code.cannabrands.app/creationshop/dispensary-scraper:latest
+          imagePullPolicy: Always
+          command: ["node"]
+          args: ["dist/tasks/task-worker.js"]
+          env:
+            - name: WORKER_MODE
+              value: "true"
+            - name: POD_NAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.name
+            - name: MAX_CONCURRENT_TASKS
+              value: "50"
+            - name: API_BASE_URL
+              value: http://scraper
+            - name: NODE_OPTIONS
+              value: --max-old-space-size=1500
+          envFrom:
+            - configMapRef:
+                name: scraper-config
+            - secretRef:
+                name: scraper-secrets
+          resources:
+            requests:
+              cpu: 100m
+              memory: 1Gi
+            limits:
+              cpu: 500m
+              memory: 2Gi
+          livenessProbe:
+            exec:
+              command:
+                - /bin/sh
+                - -c
+                - pgrep -f 'task-worker' > /dev/null
+            initialDelaySeconds: 10
+            periodSeconds: 30
+            failureThreshold: 3
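Each replica gets a stable name (`scraper-worker-0` through `scraper-worker-7`), injected into the container as `POD_NAME` via the downward API. A natural convention — assumed here, since the worker code itself is not part of this diff — is to reuse that name as the `worker_id` the pod registers in `worker_registry`, the table the migration below extends. A minimal sketch of such a registration upsert; the column list comes from the `v_active_workers` view further down, and the `ON CONFLICT (worker_id)` target presumes a unique constraint from migration 084:

```sql
-- Illustrative only: register (or re-activate) the worker running in pod scraper-worker-3.
-- Assumes worker_registry.worker_id has a unique constraint (presumably from migration 084);
-- the role and friendly_name values are placeholders.
INSERT INTO worker_registry
    (worker_id, friendly_name, role, status, pod_name, hostname,
     started_at, last_heartbeat_at)
VALUES
    ('scraper-worker-3', 'worker-3', 'scraper', 'active',
     'scraper-worker-3', 'scraper-worker-3', NOW(), NOW())
ON CONFLICT (worker_id) DO UPDATE
SET status            = 'active',
    started_at        = NOW(),
    last_heartbeat_at = NOW();
```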
diff --git a/backend/migrations/085_preflight_ip_fingerprint.sql b/backend/migrations/085_preflight_ip_fingerprint.sql
new file mode 100644
index 00000000..f60501e7
--- /dev/null
+++ b/backend/migrations/085_preflight_ip_fingerprint.sql
@@ -0,0 +1,168 @@
+-- Migration 085: Add IP and fingerprint columns for preflight reporting
+-- These columns were missing from migration 084
+
+-- ===================================================================
+-- PART 1: Add IP address columns to worker_registry
+-- ===================================================================
+
+-- IP address detected during curl/axios preflight
+ALTER TABLE worker_registry
+ADD COLUMN IF NOT EXISTS curl_ip VARCHAR(45);
+
+-- IP address detected during http/Puppeteer preflight
+ALTER TABLE worker_registry
+ADD COLUMN IF NOT EXISTS http_ip VARCHAR(45);
+
+-- ===================================================================
+-- PART 2: Add fingerprint data column
+-- ===================================================================
+
+-- Browser fingerprint data captured during Puppeteer preflight
+ALTER TABLE worker_registry
+ADD COLUMN IF NOT EXISTS fingerprint_data JSONB;
+
+-- ===================================================================
+-- PART 3: Add combined preflight status/timestamp for convenience
+-- ===================================================================
+
+-- Overall preflight status (computed from both transports)
+-- Values: 'pending', 'passed', 'partial', 'failed'
+-- - 'pending': neither transport tested
+-- - 'passed': both transports passed (or http passed for browser-only)
+-- - 'partial': at least one passed
+-- - 'failed': no transport passed
+ALTER TABLE worker_registry
+ADD COLUMN IF NOT EXISTS preflight_status VARCHAR(20) DEFAULT 'pending';
+
+-- Most recent preflight completion timestamp
+ALTER TABLE worker_registry
+ADD COLUMN IF NOT EXISTS preflight_at TIMESTAMPTZ;
+
+-- ===================================================================
+-- PART 4: Update function to set preflight status
+-- ===================================================================
+
+CREATE OR REPLACE FUNCTION update_worker_preflight(
+    p_worker_id VARCHAR(100),
+    p_transport VARCHAR(10),  -- 'curl' or 'http'
+    p_status VARCHAR(20),     -- 'passed', 'failed', 'skipped'
+    p_ip VARCHAR(45) DEFAULT NULL,
+    p_response_ms INTEGER DEFAULT NULL,
+    p_error TEXT DEFAULT NULL,
+    p_fingerprint JSONB DEFAULT NULL
+) RETURNS VOID AS $$
+DECLARE
+    v_curl_status VARCHAR(20);
+    v_http_status VARCHAR(20);
+    v_overall_status VARCHAR(20);
+BEGIN
+    IF p_transport = 'curl' THEN
+        UPDATE worker_registry
+        SET
+            preflight_curl_status = p_status,
+            preflight_curl_at = NOW(),
+            preflight_curl_ms = p_response_ms,
+            preflight_curl_error = p_error,
+            curl_ip = p_ip,
+            updated_at = NOW()
+        WHERE worker_id = p_worker_id;
+    ELSIF p_transport = 'http' THEN
+        UPDATE worker_registry
+        SET
+            preflight_http_status = p_status,
+            preflight_http_at = NOW(),
+            preflight_http_ms = p_response_ms,
+            preflight_http_error = p_error,
+            http_ip = p_ip,
+            fingerprint_data = COALESCE(p_fingerprint, fingerprint_data),
+            updated_at = NOW()
+        WHERE worker_id = p_worker_id;
+    END IF;
+
+    -- Update overall preflight status
+    SELECT preflight_curl_status, preflight_http_status
+    INTO v_curl_status, v_http_status
+    FROM worker_registry
+    WHERE worker_id = p_worker_id;
+
+    -- Compute overall status
+    IF v_curl_status = 'passed' AND v_http_status = 'passed' THEN
+        v_overall_status := 'passed';
+    ELSIF v_curl_status = 'passed' OR v_http_status = 'passed' THEN
+        v_overall_status := 'partial';
+    ELSIF v_curl_status = 'failed' OR v_http_status = 'failed' THEN
+        v_overall_status := 'failed';
+    ELSE
+        v_overall_status := 'pending';
+    END IF;
+
+    UPDATE worker_registry
+    SET
+        preflight_status = v_overall_status,
+        preflight_at = NOW()
+    WHERE worker_id = p_worker_id;
+END;
+$$ LANGUAGE plpgsql;
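`update_worker_preflight` records one transport at a time and then recomputes the combined status, so each worker is expected to call it twice. An illustrative sequence (worker id, IP, timing, and error text are made up) that ends with `preflight_status = 'partial'`, because only the curl transport passed:

```sql
-- Hypothetical preflight results reported for one worker.
SELECT update_worker_preflight('scraper-worker-3', 'curl', 'passed', '203.0.113.7', 412);
SELECT update_worker_preflight('scraper-worker-3', 'http', 'failed', NULL, NULL, 'navigation timeout');

-- curl passed, http failed => overall status 'partial'
SELECT worker_id, preflight_status, curl_ip, http_ip
FROM worker_registry
WHERE worker_id = 'scraper-worker-3';
```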
+
+-- ===================================================================
+-- PART 5: Update v_active_workers view
+-- ===================================================================
+
+DROP VIEW IF EXISTS v_active_workers;
+
+CREATE VIEW v_active_workers AS
+SELECT
+    wr.id,
+    wr.worker_id,
+    wr.friendly_name,
+    wr.role,
+    wr.status,
+    wr.pod_name,
+    wr.hostname,
+    wr.started_at,
+    wr.last_heartbeat_at,
+    wr.last_task_at,
+    wr.tasks_completed,
+    wr.tasks_failed,
+    wr.current_task_id,
+    -- IP addresses from preflights
+    wr.curl_ip,
+    wr.http_ip,
+    -- Combined preflight status
+    wr.preflight_status,
+    wr.preflight_at,
+    -- Detailed preflight status per transport
+    wr.preflight_curl_status,
+    wr.preflight_http_status,
+    wr.preflight_curl_at,
+    wr.preflight_http_at,
+    wr.preflight_curl_error,
+    wr.preflight_http_error,
+    wr.preflight_curl_ms,
+    wr.preflight_http_ms,
+    -- Fingerprint data
+    wr.fingerprint_data,
+    -- Computed fields
+    EXTRACT(EPOCH FROM (NOW() - wr.last_heartbeat_at)) as seconds_since_heartbeat,
+    CASE
+        WHEN wr.status = 'offline' THEN 'offline'
+        WHEN wr.last_heartbeat_at < NOW() - INTERVAL '2 minutes' THEN 'stale'
+        WHEN wr.current_task_id IS NOT NULL THEN 'busy'
+        ELSE 'ready'
+    END as health_status,
+    -- Capability flags (can this worker handle curl/http tasks?)
+    (wr.preflight_curl_status = 'passed') as can_curl,
+    (wr.preflight_http_status = 'passed') as can_http
+FROM worker_registry wr
+WHERE wr.status != 'terminated'
+ORDER BY wr.status = 'active' DESC, wr.last_heartbeat_at DESC;
+
+-- ===================================================================
+-- Comments
+-- ===================================================================
+
+COMMENT ON COLUMN worker_registry.curl_ip IS 'IP address detected during curl/axios preflight';
+COMMENT ON COLUMN worker_registry.http_ip IS 'IP address detected during Puppeteer preflight';
+COMMENT ON COLUMN worker_registry.fingerprint_data IS 'Browser fingerprint captured during Puppeteer preflight';
+COMMENT ON COLUMN worker_registry.preflight_status IS 'Overall preflight status: pending, passed, partial, failed';
+COMMENT ON COLUMN worker_registry.preflight_at IS 'Most recent preflight completion timestamp';
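With the reshaped view, a dispatcher can filter workers by transport capability and health before handing out work. The dispatch code itself is not part of this diff; the following is only a sketch of the kind of query it might run:

```sql
-- Illustrative dispatch query: ready workers that passed the Puppeteer preflight.
SELECT worker_id, pod_name, http_ip, seconds_since_heartbeat
FROM v_active_workers
WHERE can_http
  AND health_status = 'ready'
ORDER BY last_heartbeat_at DESC
LIMIT 5;
```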