feat(tasks): Add unified task-based worker architecture

Replace fragmented job systems (job_schedules, dispensary_crawl_jobs, SyncOrchestrator) with a single unified task queue: - Add worker_tasks table with atomic task claiming via SELECT FOR UPDATE SKIP LOCKED - Add TaskService for CRUD, claiming, and capacity metrics - Add TaskWorker with role-based handlers (resync, discovery, analytics) - Add /api/tasks endpoints for management and migration from legacy systems - Add TasksDashboard UI and integrate task counts into main dashboard - Add comprehensive documentation Task roles: store_discovery, entry_point_discovery, product_discovery, product_resync, analytics_refresh Run workers with: WORKER_ROLE=product_resync npx tsx src/tasks/task-worker.ts 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-09 16:27:03 -07:00
parent 7f9cf559cf
commit 89c262ee20
18 changed files with 3167 additions and 2 deletions
--- a/backend/migrations/074_worker_task_queue.sql
+++ b/backend/migrations/074_worker_task_queue.sql
@@ -0,0 +1,322 @@
+-- Migration 074: Worker Task Queue System
+-- Implements role-based task queue with per-store locking and capacity tracking
+
+-- Task queue table
+CREATE TABLE IF NOT EXISTS worker_tasks (
+  id SERIAL PRIMARY KEY,
+
+  -- Task identification
+  role VARCHAR(50) NOT NULL,           -- store_discovery, entry_point_discovery, product_discovery, product_resync, analytics_refresh
+  dispensary_id INTEGER REFERENCES dispensaries(id) ON DELETE CASCADE,
+  platform VARCHAR(20),                 -- dutchie, jane, treez, etc.
+
+  -- Task state
+  status VARCHAR(20) NOT NULL DEFAULT 'pending',
+  priority INTEGER DEFAULT 0,           -- Higher = more urgent
+
+  -- Scheduling
+  scheduled_for TIMESTAMPTZ,            -- For batch scheduling (e.g., every 4 hours)
+
+  -- Ownership
+  worker_id VARCHAR(100),               -- Pod name or worker ID
+  claimed_at TIMESTAMPTZ,
+  started_at TIMESTAMPTZ,
+  completed_at TIMESTAMPTZ,
+  last_heartbeat_at TIMESTAMPTZ,
+
+  -- Results
+  result JSONB,                         -- Task output data
+  error_message TEXT,
+  retry_count INTEGER DEFAULT 0,
+  max_retries INTEGER DEFAULT 3,
+
+  -- Metadata
+  created_at TIMESTAMPTZ DEFAULT NOW(),
+  updated_at TIMESTAMPTZ DEFAULT NOW(),
+
+  -- Constraints
+  CONSTRAINT valid_status CHECK (status IN ('pending', 'claimed', 'running', 'completed', 'failed', 'stale'))
+);
+
+-- Indexes for efficient task claiming
+CREATE INDEX IF NOT EXISTS idx_worker_tasks_pending
+  ON worker_tasks(role, priority DESC, created_at ASC)
+  WHERE status = 'pending';
+
+CREATE INDEX IF NOT EXISTS idx_worker_tasks_claimed
+  ON worker_tasks(worker_id, claimed_at)
+  WHERE status = 'claimed';
+
+CREATE INDEX IF NOT EXISTS idx_worker_tasks_running
+  ON worker_tasks(worker_id, last_heartbeat_at)
+  WHERE status = 'running';
+
+CREATE INDEX IF NOT EXISTS idx_worker_tasks_dispensary
+  ON worker_tasks(dispensary_id)
+  WHERE dispensary_id IS NOT NULL;
+
+CREATE INDEX IF NOT EXISTS idx_worker_tasks_scheduled
+  ON worker_tasks(scheduled_for)
+  WHERE status = 'pending' AND scheduled_for IS NOT NULL;
+
+CREATE INDEX IF NOT EXISTS idx_worker_tasks_history
+  ON worker_tasks(role, completed_at DESC)
+  WHERE status IN ('completed', 'failed');
+
+-- Partial unique index to prevent duplicate active tasks per store
+-- Only one task can be claimed/running for a given dispensary at a time
+CREATE UNIQUE INDEX IF NOT EXISTS idx_worker_tasks_unique_active_store
+  ON worker_tasks(dispensary_id)
+  WHERE status IN ('claimed', 'running') AND dispensary_id IS NOT NULL;
+
+-- Worker registration table (tracks active workers)
+CREATE TABLE IF NOT EXISTS worker_registry (
+  id SERIAL PRIMARY KEY,
+  worker_id VARCHAR(100) UNIQUE NOT NULL,
+  role VARCHAR(50) NOT NULL,
+  pod_name VARCHAR(100),
+  hostname VARCHAR(100),
+  started_at TIMESTAMPTZ DEFAULT NOW(),
+  last_heartbeat_at TIMESTAMPTZ DEFAULT NOW(),
+  tasks_completed INTEGER DEFAULT 0,
+  tasks_failed INTEGER DEFAULT 0,
+  status VARCHAR(20) DEFAULT 'active',
+
+  CONSTRAINT valid_worker_status CHECK (status IN ('active', 'idle', 'offline'))
+);
+
+CREATE INDEX IF NOT EXISTS idx_worker_registry_role
+  ON worker_registry(role, status);
+
+CREATE INDEX IF NOT EXISTS idx_worker_registry_heartbeat
+  ON worker_registry(last_heartbeat_at)
+  WHERE status = 'active';
+
+-- Task completion tracking (summarized history)
+CREATE TABLE IF NOT EXISTS task_completion_log (
+  id SERIAL PRIMARY KEY,
+  role VARCHAR(50) NOT NULL,
+  date DATE NOT NULL DEFAULT CURRENT_DATE,
+  hour INTEGER NOT NULL DEFAULT EXTRACT(HOUR FROM NOW()),
+
+  tasks_created INTEGER DEFAULT 0,
+  tasks_completed INTEGER DEFAULT 0,
+  tasks_failed INTEGER DEFAULT 0,
+
+  avg_duration_sec NUMERIC(10,2),
+  min_duration_sec NUMERIC(10,2),
+  max_duration_sec NUMERIC(10,2),
+
+  updated_at TIMESTAMPTZ DEFAULT NOW(),
+
+  UNIQUE(role, date, hour)
+);
+
+-- Capacity planning view
+CREATE OR REPLACE VIEW v_worker_capacity AS
+SELECT
+  role,
+  COUNT(*) FILTER (WHERE status = 'pending') as pending_tasks,
+  COUNT(*) FILTER (WHERE status = 'pending' AND (scheduled_for IS NULL OR scheduled_for <= NOW())) as ready_tasks,
+  COUNT(*) FILTER (WHERE status = 'claimed') as claimed_tasks,
+  COUNT(*) FILTER (WHERE status = 'running') as running_tasks,
+  COUNT(*) FILTER (WHERE status = 'completed' AND completed_at > NOW() - INTERVAL '1 hour') as completed_last_hour,
+  COUNT(*) FILTER (WHERE status = 'failed' AND completed_at > NOW() - INTERVAL '1 hour') as failed_last_hour,
+  COUNT(DISTINCT worker_id) FILTER (WHERE status IN ('claimed', 'running')) as active_workers,
+  AVG(EXTRACT(EPOCH FROM (completed_at - started_at)))
+    FILTER (WHERE status = 'completed' AND completed_at > NOW() - INTERVAL '1 hour') as avg_duration_sec,
+  -- Capacity planning metrics
+  CASE
+    WHEN COUNT(*) FILTER (WHERE status = 'completed' AND completed_at > NOW() - INTERVAL '1 hour') > 0
+    THEN 3600.0 / NULLIF(AVG(EXTRACT(EPOCH FROM (completed_at - started_at)))
+      FILTER (WHERE status = 'completed' AND completed_at > NOW() - INTERVAL '1 hour'), 0)
+    ELSE NULL
+  END as tasks_per_worker_hour,
+  -- Estimated time to drain queue
+  CASE
+    WHEN COUNT(DISTINCT worker_id) FILTER (WHERE status IN ('claimed', 'running')) > 0
+      AND COUNT(*) FILTER (WHERE status = 'completed' AND completed_at > NOW() - INTERVAL '1 hour') > 0
+    THEN COUNT(*) FILTER (WHERE status = 'pending') / NULLIF(
+      COUNT(DISTINCT worker_id) FILTER (WHERE status IN ('claimed', 'running')) *
+      (3600.0 / NULLIF(AVG(EXTRACT(EPOCH FROM (completed_at - started_at)))
+        FILTER (WHERE status = 'completed' AND completed_at > NOW() - INTERVAL '1 hour'), 0)),
+      0
+    )
+    ELSE NULL
+  END as estimated_hours_to_drain
+FROM worker_tasks
+GROUP BY role;
+
+-- Task history view (for UI)
+CREATE OR REPLACE VIEW v_task_history AS
+SELECT
+  t.id,
+  t.role,
+  t.dispensary_id,
+  d.name as dispensary_name,
+  t.platform,
+  t.status,
+  t.priority,
+  t.worker_id,
+  t.scheduled_for,
+  t.claimed_at,
+  t.started_at,
+  t.completed_at,
+  t.error_message,
+  t.retry_count,
+  t.created_at,
+  EXTRACT(EPOCH FROM (t.completed_at - t.started_at)) as duration_sec
+FROM worker_tasks t
+LEFT JOIN dispensaries d ON d.id = t.dispensary_id
+ORDER BY t.created_at DESC;
+
+-- Function to claim a task atomically
+CREATE OR REPLACE FUNCTION claim_task(
+  p_role VARCHAR(50),
+  p_worker_id VARCHAR(100)
+) RETURNS worker_tasks AS $$
+DECLARE
+  claimed_task worker_tasks;
+BEGIN
+  UPDATE worker_tasks
+  SET
+    status = 'claimed',
+    worker_id = p_worker_id,
+    claimed_at = NOW(),
+    updated_at = NOW()
+  WHERE id = (
+    SELECT id FROM worker_tasks
+    WHERE role = p_role
+      AND status = 'pending'
+      AND (scheduled_for IS NULL OR scheduled_for <= NOW())
+      -- Exclude stores that already have an active task
+      AND (dispensary_id IS NULL OR dispensary_id NOT IN (
+        SELECT dispensary_id FROM worker_tasks
+        WHERE status IN ('claimed', 'running')
+        AND dispensary_id IS NOT NULL
+      ))
+    ORDER BY priority DESC, created_at ASC
+    LIMIT 1
+    FOR UPDATE SKIP LOCKED
+  )
+  RETURNING * INTO claimed_task;
+
+  RETURN claimed_task;
+END;
+$$ LANGUAGE plpgsql;
+
+-- Function to mark stale tasks (workers that died)
+CREATE OR REPLACE FUNCTION recover_stale_tasks(
+  stale_threshold_minutes INTEGER DEFAULT 10
+) RETURNS INTEGER AS $$
+DECLARE
+  recovered_count INTEGER;
+BEGIN
+  WITH stale AS (
+    UPDATE worker_tasks
+    SET
+      status = 'pending',
+      worker_id = NULL,
+      claimed_at = NULL,
+      started_at = NULL,
+      retry_count = retry_count + 1,
+      updated_at = NOW()
+    WHERE status IN ('claimed', 'running')
+      AND last_heartbeat_at < NOW() - (stale_threshold_minutes || ' minutes')::INTERVAL
+      AND retry_count < max_retries
+    RETURNING id
+  )
+  SELECT COUNT(*) INTO recovered_count FROM stale;
+
+  -- Mark tasks that exceeded retries as failed
+  UPDATE worker_tasks
+  SET
+    status = 'failed',
+    error_message = 'Exceeded max retries after worker failures',
+    completed_at = NOW(),
+    updated_at = NOW()
+  WHERE status IN ('claimed', 'running')
+    AND last_heartbeat_at < NOW() - (stale_threshold_minutes || ' minutes')::INTERVAL
+    AND retry_count >= max_retries;
+
+  RETURN recovered_count;
+END;
+$$ LANGUAGE plpgsql;
+
+-- Function to generate daily resync tasks
+CREATE OR REPLACE FUNCTION generate_resync_tasks(
+  p_batches_per_day INTEGER DEFAULT 6,  -- Every 4 hours
+  p_date DATE DEFAULT CURRENT_DATE
+) RETURNS INTEGER AS $$
+DECLARE
+  store_count INTEGER;
+  stores_per_batch INTEGER;
+  batch_num INTEGER;
+  scheduled_time TIMESTAMPTZ;
+  created_count INTEGER := 0;
+BEGIN
+  -- Count active stores that need resync
+  SELECT COUNT(*) INTO store_count
+  FROM dispensaries
+  WHERE crawl_enabled = true
+    AND menu_type = 'dutchie'
+    AND platform_dispensary_id IS NOT NULL;
+
+  IF store_count = 0 THEN
+    RETURN 0;
+  END IF;
+
+  stores_per_batch := CEIL(store_count::NUMERIC / p_batches_per_day);
+
+  FOR batch_num IN 0..(p_batches_per_day - 1) LOOP
+    scheduled_time := p_date + (batch_num * 4 || ' hours')::INTERVAL;
+
+    INSERT INTO worker_tasks (role, dispensary_id, platform, scheduled_for, priority)
+    SELECT
+      'product_resync',
+      d.id,
+      'dutchie',
+      scheduled_time,
+      0
+    FROM (
+      SELECT id, ROW_NUMBER() OVER (ORDER BY id) as rn
+      FROM dispensaries
+      WHERE crawl_enabled = true
+        AND menu_type = 'dutchie'
+        AND platform_dispensary_id IS NOT NULL
+    ) d
+    WHERE d.rn > (batch_num * stores_per_batch)
+      AND d.rn <= ((batch_num + 1) * stores_per_batch)
+    ON CONFLICT DO NOTHING;
+
+    GET DIAGNOSTICS created_count = created_count + ROW_COUNT;
+  END LOOP;
+
+  RETURN created_count;
+END;
+$$ LANGUAGE plpgsql;
+
+-- Trigger to update timestamp
+CREATE OR REPLACE FUNCTION update_worker_tasks_timestamp()
+RETURNS TRIGGER AS $$
+BEGIN
+  NEW.updated_at = NOW();
+  RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+
+DROP TRIGGER IF EXISTS worker_tasks_updated_at ON worker_tasks;
+CREATE TRIGGER worker_tasks_updated_at
+  BEFORE UPDATE ON worker_tasks
+  FOR EACH ROW
+  EXECUTE FUNCTION update_worker_tasks_timestamp();
+
+-- Comments
+COMMENT ON TABLE worker_tasks IS 'Central task queue for all worker roles';
+COMMENT ON TABLE worker_registry IS 'Registry of active workers and their stats';
+COMMENT ON TABLE task_completion_log IS 'Hourly aggregated task completion metrics';
+COMMENT ON VIEW v_worker_capacity IS 'Real-time capacity planning metrics per role';
+COMMENT ON VIEW v_task_history IS 'Task history with dispensary details for UI';
+COMMENT ON FUNCTION claim_task IS 'Atomically claim a task for a worker, respecting per-store locking';
+COMMENT ON FUNCTION recover_stale_tasks IS 'Release tasks from dead workers back to pending';
+COMMENT ON FUNCTION generate_resync_tasks IS 'Generate daily product resync tasks in batches';