diff --git a/k8s/scraper-worker.yaml b/k8s/scraper-worker.yaml
index 6435afa7..a8dd1a84 100644
--- a/k8s/scraper-worker.yaml
+++ b/k8s/scraper-worker.yaml
@@ -1,12 +1,12 @@
-# Dutchie AZ Worker Deployment
-# These workers poll the job queue and process crawl jobs.
-# Scale this deployment to increase crawl throughput.
+# Hydration Worker Deployment
+# These workers process raw_payloads → canonical tables.
+# Scale this deployment to increase hydration throughput.
 #
 # Architecture:
 # - The main 'scraper' deployment runs the API server + scheduler (1 replica)
-# - This 'scraper-worker' deployment runs workers that poll and claim jobs (5 replicas)
-# - Workers use DB-level locking (FOR UPDATE SKIP LOCKED) to prevent double-crawls
-# - Each worker sends heartbeats; stale jobs are recovered automatically
+# - This 'scraper-worker' deployment runs hydration workers (5 replicas)
+# - Workers use DB-level locking to prevent double-processing
+# - Each worker processes payloads in batches with configurable limits
 apiVersion: apps/v1
 kind: Deployment
 metadata:
@@ -27,9 +27,9 @@ spec:
       containers:
       - name: worker
         image: code.cannabrands.app/creationshop/dispensary-scraper:latest
-        # Run the worker process instead of the main server
+        # Run the hydration worker in loop mode
         command: ["node"]
-        args: ["dist/dutchie-az/services/worker.js"]
+        args: ["dist/scripts/run-hydration.js", "--mode=payload", "--loop"]
         envFrom:
         - configMapRef:
             name: scraper-config
@@ -57,9 +57,9 @@ spec:
           command:
           - /bin/sh
          - -c
-          - "pgrep -f 'worker.js' > /dev/null"
+          - "pgrep -f 'run-hydration' > /dev/null"
          initialDelaySeconds: 10
          periodSeconds: 30
          failureThreshold: 3
-      # Graceful shutdown - give workers time to complete current job
+      # Graceful shutdown - give workers time to complete current batch
       terminationGracePeriodSeconds: 60