feat(tasks): Refactor task workflow with payload/refresh separation

Major changes:
- Split crawl into payload_fetch (API → disk) and product_refresh (disk → DB)
- Add task chaining: store_discovery → product_discovery → payload_fetch → product_refresh
- Add payload storage utilities for gzipped JSON on filesystem
- Add /api/payloads endpoints for payload access and diffing
- Add DB-driven TaskScheduler with schedule persistence
- Track newDispensaryIds through discovery promotion for chaining
- Add stealth improvements: HTTP fingerprinting, proxy rotation enhancements
- Add Workers dashboard K8s scaling controls

New files:
- src/tasks/handlers/payload-fetch.ts - Fetches from API, saves to disk
- src/services/task-scheduler.ts - DB-driven schedule management
- src/utils/payload-storage.ts - Payload save/load utilities
- src/routes/payloads.ts - Payload API endpoints
- src/services/http-fingerprint.ts - Browser fingerprint generation
- docs/TASK_WORKFLOW_2024-12-10.md - Complete workflow documentation

Migrations:
- 078: Proxy consecutive 403 tracking
- 079: task_schedules table
- 080: raw_crawl_payloads table
- 081: payload column and last_fetch_at

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-10 22:15:35 -07:00
parent 1fb0eb94c2
commit 4949b22457
33 changed files with 4064 additions and 737 deletions

View File

@@ -127,6 +127,8 @@ export interface PromotionSummary {
errors: string[];
}>;
durationMs: number;
// Per TASK_WORKFLOW_2024-12-10.md: Track new dispensary IDs for task chaining
newDispensaryIds: number[];
}
/**
@@ -469,6 +471,8 @@ export async function promoteDiscoveredLocations(
const results: PromotionResult[] = [];
const rejectedRecords: PromotionSummary['rejectedRecords'] = [];
// Per TASK_WORKFLOW_2024-12-10.md: Track new dispensary IDs for task chaining
const newDispensaryIds: number[] = [];
let created = 0;
let updated = 0;
let skipped = 0;
@@ -525,6 +529,8 @@ export async function promoteDiscoveredLocations(
if (promotionResult.action === 'created') {
created++;
// Per TASK_WORKFLOW_2024-12-10.md: Track new IDs for task chaining
newDispensaryIds.push(promotionResult.dispensaryId);
} else {
updated++;
}
@@ -548,6 +554,8 @@ export async function promoteDiscoveredLocations(
results,
rejectedRecords,
durationMs: Date.now() - startTime,
// Per TASK_WORKFLOW_2024-12-10.md: Return new IDs for task chaining
newDispensaryIds,
};
}