feat(tasks): Refactor task workflow with payload/refresh separation

Major changes:
- Split crawl into payload_fetch (API → disk) and product_refresh (disk → DB)
- Add task chaining: store_discovery → product_discovery → payload_fetch → product_refresh
- Add payload storage utilities for gzipped JSON on filesystem
- Add /api/payloads endpoints for payload access and diffing
- Add DB-driven TaskScheduler with schedule persistence
- Track newDispensaryIds through discovery promotion for chaining
- Add stealth improvements: HTTP fingerprinting, proxy rotation enhancements
- Add K8s scaling controls to the Workers dashboard

New files:
- src/tasks/handlers/payload-fetch.ts - Fetches from API, saves to disk
- src/services/task-scheduler.ts - DB-driven schedule management
- src/utils/payload-storage.ts - Payload save/load utilities
- src/routes/payloads.ts - Payload API endpoints
- src/services/http-fingerprint.ts - Browser fingerprint generation
- docs/TASK_WORKFLOW_2024-12-10.md - Complete workflow documentation

Migrations:
- 078: Proxy consecutive 403 tracking
- 079: task_schedules table
- 080: raw_crawl_payloads table
- 081: payload column and last_fetch_at

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-10 22:15:35 -07:00
parent 1fb0eb94c2
commit 4949b22457
33 changed files with 4064 additions and 737 deletions

View File

@@ -52,6 +52,8 @@ import { CrawlRotator } from '../services/crawl-rotator';
import { setCrawlRotator } from '../platforms/dutchie';
// Task handlers by role
// Per TASK_WORKFLOW_2024-12-10.md: payload_fetch and product_refresh are now separate
import { handlePayloadFetch } from './handlers/payload-fetch';
import { handleProductRefresh } from './handlers/product-refresh';
import { handleProductDiscovery } from './handlers/product-discovery';
import { handleStoreDiscovery } from './handlers/store-discovery';
@@ -80,8 +82,12 @@ export interface TaskResult {
type TaskHandler = (ctx: TaskContext) => Promise<TaskResult>;
// Per TASK_WORKFLOW_2024-12-10.md: Handler registry
// payload_fetch: Fetches from Dutchie API, saves to disk, chains to product_refresh
// product_refresh: Reads local payload, normalizes, upserts to DB
const TASK_HANDLERS: Record<TaskRole, TaskHandler> = {
- product_refresh: handleProductRefresh,
payload_fetch: handlePayloadFetch, // NEW: API fetch -> disk
product_refresh: handleProductRefresh, // CHANGED: disk -> DB
product_discovery: handleProductDiscovery,
store_discovery: handleStoreDiscovery,
entry_point_discovery: handleEntryPointDiscovery,
@@ -414,11 +420,13 @@ export class TaskWorker {
async function main(): Promise<void> {
const role = process.env.WORKER_ROLE as TaskRole | undefined;
// Per TASK_WORKFLOW_2024-12-10.md: Valid task roles
const validRoles: TaskRole[] = [
'store_discovery',
'entry_point_discovery',
'product_discovery',
- 'product_refresh',
'payload_fetch', // NEW: Fetches from API, saves to disk
'product_refresh', // CHANGED: Reads from disk, processes to DB
'analytics_refresh',
];