Major changes:
- Split crawl into payload_fetch (API → disk) and product_refresh (disk → DB)
- Add task chaining: store_discovery → product_discovery → payload_fetch → product_refresh
- Add payload storage utilities for gzipped JSON on filesystem
- Add /api/payloads endpoints for payload access and diffing
- Add DB-driven TaskScheduler with schedule persistence
- Track newDispensaryIds through discovery promotion for chaining
- Add stealth improvements: HTTP fingerprinting, proxy rotation enhancements
- Add Workers dashboard K8s scaling controls

New files:
- src/tasks/handlers/payload-fetch.ts - Fetches from API, saves to disk
- src/services/task-scheduler.ts - DB-driven schedule management
- src/utils/payload-storage.ts - Payload save/load utilities
- src/routes/payloads.ts - Payload API endpoints
- src/services/http-fingerprint.ts - Browser fingerprint generation
- docs/TASK_WORKFLOW_2024-12-10.md - Complete workflow documentation

Migrations:
- 078: Proxy consecutive 403 tracking
- 079: task_schedules table
- 080: raw_crawl_payloads table
- 081: payload column and last_fetch_at

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
50 lines
2.1 KiB
SQL
50 lines
2.1 KiB
SQL
-- Migration 079: Task Schedules for Database-Driven Scheduler
|
|
-- Per TASK_WORKFLOW_2024-12-10.md: Replaces node-cron with DB-driven scheduling
|
|
--
|
|
-- 2024-12-10: Created for reliable, multi-replica-safe task scheduling
|
|
|
|
-- task_schedules: one row per recurring schedule. Each row carries the
-- schedule's configuration, its optional scope filters, and the execution
-- state columns that are updated by the scheduler.
--
-- NOTE: CREATE TABLE IF NOT EXISTS is a no-op when the table already exists,
-- so the NOT NULL / CHECK constraints below only take effect on databases
-- where this statement actually creates the table.
CREATE TABLE IF NOT EXISTS task_schedules (
    id              SERIAL PRIMARY KEY,
    name            VARCHAR(100) NOT NULL UNIQUE,
    role            VARCHAR(50)  NOT NULL,  -- TaskRole: product_refresh, store_discovery, etc.
    description     TEXT,

    -- Schedule configuration
    -- NOT NULL: a NULL flag matches neither "enabled = TRUE" nor
    -- "enabled = FALSE", and would be left out of the partial indexes on
    -- this table (both filter on enabled = TRUE).
    enabled         BOOLEAN NOT NULL DEFAULT TRUE,
    -- A recurrence period of zero or fewer hours has no meaningful
    -- interpretation, so reject it at write time.
    interval_hours  INTEGER NOT NULL DEFAULT 4
        CONSTRAINT task_schedules_interval_hours_check CHECK (interval_hours > 0),
    priority        INTEGER DEFAULT 0,

    -- Optional scope filters
    state_code      VARCHAR(2),   -- NULL = all states
    platform        VARCHAR(50),  -- NULL = all platforms

    -- Execution state (updated by scheduler)
    last_run_at     TIMESTAMPTZ,
    next_run_at     TIMESTAMPTZ,
    last_task_count INTEGER DEFAULT 0,
    last_error      TEXT,

    created_at      TIMESTAMPTZ DEFAULT NOW(),
    updated_at      TIMESTAMPTZ DEFAULT NOW()
);
|
|
|
|
-- Partial indexes for scheduler lookups. Both cover only rows where
-- enabled = TRUE, so disabled schedules add nothing to either index.
--
-- NOTE(review): every entry in idx_task_schedules_enabled carries the same
-- key value (TRUE), so it may be redundant with idx_task_schedules_next_run,
-- which has the same predicate -- confirm with EXPLAIN before dropping it.
CREATE INDEX IF NOT EXISTS idx_task_schedules_enabled
    ON task_schedules USING btree (enabled)
    WHERE enabled = TRUE;

CREATE INDEX IF NOT EXISTS idx_task_schedules_next_run
    ON task_schedules USING btree (next_run_at)
    WHERE enabled = TRUE;
|
|
|
|
-- Seed the default schedules. Every row is due immediately (next_run_at = NOW()).
-- Re-running the migration is safe: a row whose name already exists is skipped.
INSERT INTO task_schedules
    (name, role, description, interval_hours, priority, next_run_at)
VALUES
    (
        'product_refresh_all',
        'product_refresh',
        'Generate product refresh tasks for all crawl-enabled stores every 4 hours',
        4,
        0,
        NOW()
    ),
    (
        'store_discovery_dutchie',
        'store_discovery',
        'Discover new Dutchie stores daily',
        24,
        5,
        NOW()
    ),
    (
        'analytics_refresh',
        'analytics_refresh',
        'Refresh analytics materialized views every 6 hours',
        6,
        0,
        NOW()
    )
ON CONFLICT (name) DO NOTHING;
|
|
|
|
-- Attach table-level documentation to task_schedules in the catalog.
-- The literal spans several lines on purpose; its line breaks are part of
-- the stored comment text.
COMMENT ON TABLE task_schedules IS 'Database-driven task scheduler configuration. Per TASK_WORKFLOW_2024-12-10.md:
- Schedules persist in DB (survive restarts)
- Uses SELECT FOR UPDATE SKIP LOCKED for multi-replica safety
- Scheduler polls every 60s and executes due schedules
- Creates tasks in worker_tasks for task-worker.ts to process';
|