feat(tasks): Refactor task workflow with payload/refresh separation
Major changes:
- Split crawl into payload_fetch (API → disk) and product_refresh (disk → DB)
- Add task chaining: store_discovery → product_discovery → payload_fetch → product_refresh
- Add payload storage utilities for gzipped JSON on filesystem
- Add /api/payloads endpoints for payload access and diffing
- Add DB-driven TaskScheduler with schedule persistence
- Track newDispensaryIds through discovery promotion for chaining
- Add stealth improvements: HTTP fingerprinting, proxy rotation enhancements
- Add Workers dashboard K8s scaling controls

New files:
- src/tasks/handlers/payload-fetch.ts - Fetches from API, saves to disk
- src/services/task-scheduler.ts - DB-driven schedule management
- src/utils/payload-storage.ts - Payload save/load utilities
- src/routes/payloads.ts - Payload API endpoints
- src/services/http-fingerprint.ts - Browser fingerprint generation
- docs/TASK_WORKFLOW_2024-12-10.md - Complete workflow documentation

Migrations:
- 078: Proxy consecutive 403 tracking
- 079: task_schedules table
- 080: raw_crawl_payloads table
- 081: payload column and last_fetch_at

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -21,11 +21,15 @@ async function tableExists(tableName: string): Promise<boolean> {
|
||||
return result.rows[0].exists;
|
||||
}
|
||||
|
||||
// Per TASK_WORKFLOW_2024-12-10.md: Task roles
|
||||
// payload_fetch: Hits Dutchie API, saves raw payload to filesystem
|
||||
// product_refresh: Reads local payload, normalizes, upserts to DB
|
||||
export type TaskRole =
|
||||
| 'store_discovery'
|
||||
| 'entry_point_discovery'
|
||||
| 'product_discovery'
|
||||
| 'product_refresh'
|
||||
| 'payload_fetch' // NEW: Fetches from API, saves to disk
|
||||
| 'product_refresh' // CHANGED: Now reads from local payload
|
||||
| 'analytics_refresh';
|
||||
|
||||
export type TaskStatus =
|
||||
@@ -55,6 +59,7 @@ export interface WorkerTask {
|
||||
error_message: string | null;
|
||||
retry_count: number;
|
||||
max_retries: number;
|
||||
payload: Record<string, unknown> | null; // Per TASK_WORKFLOW_2024-12-10.md: Task chaining data
|
||||
created_at: Date;
|
||||
updated_at: Date;
|
||||
}
|
||||
@@ -65,6 +70,7 @@ export interface CreateTaskParams {
|
||||
platform?: string;
|
||||
priority?: number;
|
||||
scheduled_for?: Date;
|
||||
payload?: Record<string, unknown>; // Per TASK_WORKFLOW_2024-12-10.md: For task chaining data
|
||||
}
|
||||
|
||||
export interface CapacityMetrics {
|
||||
@@ -96,8 +102,8 @@ class TaskService {
|
||||
*/
|
||||
async createTask(params: CreateTaskParams): Promise<WorkerTask> {
|
||||
const result = await pool.query(
|
||||
`INSERT INTO worker_tasks (role, dispensary_id, platform, priority, scheduled_for)
|
||||
VALUES ($1, $2, $3, $4, $5)
|
||||
`INSERT INTO worker_tasks (role, dispensary_id, platform, priority, scheduled_for, payload)
|
||||
VALUES ($1, $2, $3, $4, $5, $6)
|
||||
RETURNING *`,
|
||||
[
|
||||
params.role,
|
||||
@@ -105,6 +111,7 @@ class TaskService {
|
||||
params.platform ?? null,
|
||||
params.priority ?? 0,
|
||||
params.scheduled_for ?? null,
|
||||
params.payload ? JSON.stringify(params.payload) : null,
|
||||
]
|
||||
);
|
||||
return result.rows[0] as WorkerTask;
|
||||
@@ -401,6 +408,17 @@ class TaskService {
|
||||
/**
|
||||
* Chain next task after completion
|
||||
* Called automatically when a task completes successfully
|
||||
*
|
||||
* Per TASK_WORKFLOW_2024-12-10.md: Task chaining flow:
|
||||
*
|
||||
* Discovery flow (new stores):
|
||||
* store_discovery → product_discovery → payload_fetch → product_refresh
|
||||
*
|
||||
* Scheduled flow (existing stores):
|
||||
* payload_fetch → product_refresh
|
||||
*
|
||||
* Note: entry_point_discovery is deprecated since platform_dispensary_id
|
||||
* is now resolved during store promotion.
|
||||
*/
|
||||
async chainNextTask(completedTask: WorkerTask): Promise<WorkerTask | null> {
|
||||
if (completedTask.status !== 'completed') {
|
||||
@@ -409,12 +427,14 @@ class TaskService {
|
||||
|
||||
switch (completedTask.role) {
|
||||
case 'store_discovery': {
|
||||
// New stores discovered -> create entry_point_discovery tasks
|
||||
// Per TASK_WORKFLOW_2024-12-10.md: New stores discovered -> create product_discovery tasks
|
||||
// Skip entry_point_discovery since platform_dispensary_id is set during promotion
|
||||
const newStoreIds = (completedTask.result as { newStoreIds?: number[] })?.newStoreIds;
|
||||
if (newStoreIds && newStoreIds.length > 0) {
|
||||
console.log(`[TaskService] Chaining ${newStoreIds.length} product_discovery tasks for new stores`);
|
||||
for (const storeId of newStoreIds) {
|
||||
await this.createTask({
|
||||
role: 'entry_point_discovery',
|
||||
role: 'product_discovery',
|
||||
dispensary_id: storeId,
|
||||
platform: completedTask.platform ?? undefined,
|
||||
priority: 10, // High priority for new stores
|
||||
@@ -425,7 +445,8 @@ class TaskService {
|
||||
}
|
||||
|
||||
case 'entry_point_discovery': {
|
||||
// Entry point resolved -> create product_discovery task
|
||||
// DEPRECATED: Entry point resolution now happens during store promotion
|
||||
// Kept for backward compatibility with any in-flight tasks
|
||||
const success = (completedTask.result as { success?: boolean })?.success;
|
||||
if (success && completedTask.dispensary_id) {
|
||||
return this.createTask({
|
||||
@@ -439,8 +460,15 @@ class TaskService {
|
||||
}
|
||||
|
||||
case 'product_discovery': {
|
||||
// Product discovery done -> store is now ready for regular resync
|
||||
// No immediate chaining needed; will be picked up by daily batch generation
|
||||
// Per TASK_WORKFLOW_2024-12-10.md: Product discovery chains internally to payload_fetch
|
||||
// No external chaining needed - handleProductDiscovery calls handlePayloadFetch directly
|
||||
break;
|
||||
}
|
||||
|
||||
case 'payload_fetch': {
|
||||
// Per TASK_WORKFLOW_2024-12-10.md: payload_fetch chains to product_refresh
|
||||
// This is handled internally by the payload_fetch handler via taskService.createTask
|
||||
// No external chaining needed here
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user