feat(tasks): Refactor task workflow with payload/refresh separation

Major changes:
- Split crawl into payload_fetch (API → disk) and product_refresh (disk → DB)
- Add task chaining: store_discovery → product_discovery → payload_fetch → product_refresh
- Add payload storage utilities for gzipped JSON on the filesystem
- Add /api/payloads endpoints for payload access and diffing
- Add DB-driven TaskScheduler with schedule persistence
- Track newDispensaryIds through discovery promotion so store_discovery can return the new store IDs for task chaining
- Add stealth improvements: HTTP fingerprinting, proxy rotation enhancements
- Add Workers dashboard K8s scaling controls

New files:
- src/tasks/handlers/payload-fetch.ts - Fetches from API, saves to disk
- src/services/task-scheduler.ts - DB-driven schedule management
- src/utils/payload-storage.ts - Payload save/load utilities
- src/routes/payloads.ts - Payload API endpoints
- src/services/http-fingerprint.ts - Browser fingerprint generation
- docs/TASK_WORKFLOW_2024-12-10.md - Complete workflow documentation

Migrations:
- 078: Proxy consecutive 403 tracking
- 079: task_schedules table
- 080: raw_crawl_payloads table
- 081: payload column and last_fetch_at

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-10 22:15:35 -07:00
parent 1fb0eb94c2
commit 4949b22457
33 changed files with 4064 additions and 737 deletions

View File

@@ -1,8 +1,16 @@
/**
* Store Discovery Handler
*
* Discovers new stores by crawling location APIs and adding them
* to discovery_locations table.
* Per TASK_WORKFLOW_2024-12-10.md: Discovers new stores and returns their IDs for task chaining.
*
* Flow:
* 1. For each active state, run Dutchie discovery
* 2. Discover locations via GraphQL
* 3. Auto-promote valid locations to dispensaries table
* 4. Return newStoreIds[] for chaining to payload_fetch
*
* Chaining:
* store_discovery → (returns newStoreIds) → payload_fetch → product_refresh
*/
import { TaskContext, TaskResult } from '../task-worker';
@@ -10,7 +18,7 @@ import { discoverState } from '../../discovery';
export async function handleStoreDiscovery(ctx: TaskContext): Promise<TaskResult> {
const { pool, task } = ctx;
const platform = task.platform || 'default';
const platform = task.platform || 'dutchie';
console.log(`[StoreDiscovery] Starting discovery for platform: ${platform}`);
@@ -22,11 +30,13 @@ export async function handleStoreDiscovery(ctx: TaskContext): Promise<TaskResult
const stateCodes = statesResult.rows.map(r => r.code);
if (stateCodes.length === 0) {
return { success: true, storesDiscovered: 0, message: 'No active states to discover' };
return { success: true, storesDiscovered: 0, newStoreIds: [], message: 'No active states to discover' };
}
let totalDiscovered = 0;
let totalPromoted = 0;
// Per TASK_WORKFLOW_2024-12-10.md: Collect all new store IDs for task chaining
const allNewStoreIds: number[] = [];
// Run discovery for each state
for (const stateCode of stateCodes) {
@@ -39,6 +49,13 @@ export async function handleStoreDiscovery(ctx: TaskContext): Promise<TaskResult
const result = await discoverState(pool, stateCode);
totalDiscovered += result.totalLocationsFound || 0;
totalPromoted += result.totalLocationsUpserted || 0;
// Per TASK_WORKFLOW_2024-12-10.md: Collect new IDs for chaining
if (result.newDispensaryIds && result.newDispensaryIds.length > 0) {
allNewStoreIds.push(...result.newDispensaryIds);
console.log(`[StoreDiscovery] ${stateCode}: ${result.newDispensaryIds.length} new stores`);
}
console.log(`[StoreDiscovery] ${stateCode}: found ${result.totalLocationsFound}, upserted ${result.totalLocationsUpserted}`);
} catch (error: unknown) {
const errorMessage = error instanceof Error ? error.message : 'Unknown error';
@@ -47,13 +64,15 @@ export async function handleStoreDiscovery(ctx: TaskContext): Promise<TaskResult
}
}
console.log(`[StoreDiscovery] Complete: ${totalDiscovered} discovered, ${totalPromoted} promoted`);
console.log(`[StoreDiscovery] Complete: ${totalDiscovered} discovered, ${totalPromoted} promoted, ${allNewStoreIds.length} new stores`);
return {
success: true,
storesDiscovered: totalDiscovered,
storesPromoted: totalPromoted,
statesProcessed: stateCodes.length,
// Per TASK_WORKFLOW_2024-12-10.md: Return new IDs for task chaining
newStoreIds: allNewStoreIds,
};
} catch (error: unknown) {
const errorMessage = error instanceof Error ? error.message : 'Unknown error';
@@ -61,6 +80,7 @@ export async function handleStoreDiscovery(ctx: TaskContext): Promise<TaskResult
return {
success: false,
error: errorMessage,
newStoreIds: [],
};
}
}