feat: Stealth worker system with mandatory proxy rotation
## Worker System - Role-agnostic workers that can handle any task type - Pod-based architecture with StatefulSet (5-15 pods, 5 workers each) - Custom pod names (Aethelgard, Xylos, Kryll, etc.) - Worker registry with friendly names and resource monitoring - Hub-and-spoke visualization on JobQueue page ## Stealth & Anti-Detection (REQUIRED) - Proxies are MANDATORY - workers fail to start without active proxies - CrawlRotator initializes on worker startup - Loads proxies from `proxies` table - Auto-rotates proxy + fingerprint on 403 errors - 12 browser fingerprints (Chrome, Firefox, Safari, Edge) - Locale/timezone matching for geographic consistency ## Task System - Renamed product_resync → product_refresh - Task chaining: store_discovery → entry_point → product_discovery - Priority-based claiming with FOR UPDATE SKIP LOCKED - Heartbeat and stale task recovery ## UI Updates - JobQueue: Pod visualization, resource monitoring on hover - WorkersDashboard: Simplified worker list - Removed unused filters from task list ## Other - IP2Location service for visitor analytics - Findagram consumer features scaffolding - Documentation updates 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,65 +1,162 @@
|
||||
# Task Worker Pods
# Each pod runs 5 role-agnostic workers that pull tasks from worker_tasks queue.
#
# Architecture:
# - Pods are named from a predefined list (Aethelgard, Xylos, etc.)
# - Each pod spawns 5 worker processes
# - Workers register with API and show their pod name
# - HPA scales pods 5-15 based on pending task count
# - Workers use DB-level locking (FOR UPDATE SKIP LOCKED) to prevent conflicts
#
# NOTE: the former 'scraper-worker' hydration Deployment (run-hydration.js) was
# replaced by the 'worker-pod' StatefulSet below; its header text and the
# metadata-less `kind: Deployment` stub were removed so the first YAML document
# is no longer an invalid resource.
#
# Pod Names (up to 25):
# Aethelgard, Xylos, Kryll, Coriolis, Dimidium, Veridia, Zetani, Talos IV,
# Onyx, Celestia, Gormand, Betha, Ragnar, Syphon, Axiom, Nadir, Terra Nova,
# Acheron, Nexus, Vespera, Helios Prime, Oasis, Mordina, Cygnus, Umbra
|
||||
---
|
||||
# Friendly pod names, one per line. The worker StatefulSet mounts this at
# /etc/pod-names/names and picks line (ordinal + 1) for each pod, so ORDER
# MATTERS: inserting or reordering lines renames running pods' workers.
# Some names contain spaces (e.g. "Talos IV"); consumers must quote them.
apiVersion: v1
kind: ConfigMap
metadata:
  name: pod-names
  namespace: dispensary-scraper
data:
  names: |
    Aethelgard
    Xylos
    Kryll
    Coriolis
    Dimidium
    Veridia
    Zetani
    Talos IV
    Onyx
    Celestia
    Gormand
    Betha
    Ragnar
    Syphon
    Axiom
    Nadir
    Terra Nova
    Acheron
    Nexus
    Vespera
    Helios Prime
    Oasis
    Mordina
    Cygnus
    Umbra
|
||||
---
|
||||
# StatefulSet running the task workers. A StatefulSet (rather than a
# Deployment) is used so every pod gets a stable ordinal hostname
# (worker-pod-0, worker-pod-1, ...) that the startup script maps to a
# friendly name from the pod-names ConfigMap.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: worker-pod
  namespace: dispensary-scraper
spec:
  # Must match the headless Service name.
  serviceName: worker-pods
  # NOTE(review): worker-pod-hpa also manages the replica count; re-applying
  # this manifest resets it to 5 until the HPA reconciles again.
  replicas: 5
  # Pods are independent, so start/stop them all at once instead of in order.
  podManagementPolicy: Parallel
  selector:
    matchLabels:
      app: worker-pod
  template:
    metadata:
      labels:
        app: worker-pod
    spec:
      imagePullSecrets:
        - name: regcred
      containers:
        - name: workers
          image: code.cannabrands.app/creationshop/dispensary-scraper:latest
          # Run WORKERS_PER_POD workers per pod (default 5)
          command: ["/bin/sh", "-c"]
          args:
            - |
              # Get pod ordinal (0, 1, 2, etc.) from the StatefulSet hostname.
              # POSIX parameter expansion: no dependency on `rev`/`cut`.
              ORDINAL="${HOSTNAME##*-}"
              # Get pod name from configmap (line ORDINAL+1)
              POD_NAME=$(sed -n "$((ORDINAL + 1))p" /etc/pod-names/names)
              # Fall back to the hostname if the list has no line for this
              # ordinal, so worker IDs stay unique across pods.
              [ -n "$POD_NAME" ] || POD_NAME="$HOSTNAME"
              echo "Starting pod: $POD_NAME (ordinal: $ORDINAL)"

              # This shell is PID 1 and would otherwise ignore SIGTERM, so
              # forward it to the workers; without this they are SIGKILLed
              # when terminationGracePeriodSeconds expires.
              PIDS=""
              trap 'kill -TERM $PIDS 2>/dev/null; wait' TERM INT

              # Start the workers in this pod
              i=1
              while [ "$i" -le "${WORKERS_PER_POD:-5}" ]; do
                WORKER_ID="${POD_NAME}-worker-${i}" \
                POD_NAME="$POD_NAME" \
                node dist/tasks/task-worker.js &
                PIDS="$PIDS $!"
                i=$((i + 1))
              done

              # Wait for all workers
              wait
          envFrom:
            - configMapRef:
                name: scraper-config
            - secretRef:
                name: scraper-secrets
          env:
            # Worker-specific environment variables
            - name: WORKER_MODE
              value: "true"
            # Kubernetes pod name (e.g. worker-pod-0). The startup script
            # overrides POD_NAME per worker with the friendly name.
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: API_BASE_URL
              value: "http://scraper:3010"
            - name: WORKERS_PER_POD
              value: "5"
          volumeMounts:
            - name: pod-names
              mountPath: /etc/pod-names
          resources:
            requests:
              memory: "256Mi"
              cpu: "200m"
            limits:
              memory: "1Gi"
              cpu: "1000m"
          # Health check - workers don't expose ports, so check for a live
          # worker process. The pattern is anchored to the worker's own
          # command line: an unanchored 'task-worker' also matches the
          # /bin/sh launcher (its -c script text contains that string), which
          # made the probe always succeed. Passes while >= 1 worker is alive.
          livenessProbe:
            exec:
              command:
                - /bin/sh
                - -c
                - "pgrep -f '^node dist/tasks/task-worker' > /dev/null"
            initialDelaySeconds: 15
            periodSeconds: 30
            failureThreshold: 3
      volumes:
        - name: pod-names
          configMap:
            name: pod-names
      # Graceful shutdown - give workers time to complete their current task
      # after SIGTERM (assumes task-worker.js handles SIGTERM - TODO confirm).
      terminationGracePeriodSeconds: 60
|
||||
---
|
||||
# Headless Service (clusterIP: None) backing the worker-pod StatefulSet.
# It is the target of the StatefulSet's `serviceName` field. The workers do
# not listen on any port; port 80 below is only a placeholder entry.
apiVersion: v1
kind: Service
metadata:
  name: worker-pods
  namespace: dispensary-scraper
spec:
  clusterIP: None
  selector:
    app: worker-pod
  ports:
    - name: placeholder
      port: 80
|
||||
---
|
||||
# HorizontalPodAutoscaler for the worker-pod StatefulSet.
# Scales between 5 and 15 pods, targeting an average of 10 pending tasks per
# pod, using the external metric `pending_tasks` filtered by
# queue=worker_tasks.
# NOTE(review): an External metric needs an external-metrics API provider to
# serve `pending_tasks`; none is defined in this file - confirm one is deployed.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: worker-pod-hpa
  namespace: dispensary-scraper
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: StatefulSet
    name: worker-pod
  minReplicas: 5
  maxReplicas: 15
  metrics:
    - type: External
      external:
        metric:
          name: pending_tasks
          selector:
            matchLabels:
              queue: worker_tasks
        target:
          type: AverageValue
          averageValue: "10"
|
||||
|
||||
Reference in New Issue
Block a user