feat(tasks): Refactor task workflow with payload/refresh separation
Major changes:
- Split crawl into payload_fetch (API → disk) and product_refresh (disk → DB)
- Add task chaining: store_discovery → product_discovery → payload_fetch → product_refresh
- Add payload storage utilities for gzipped JSON on filesystem
- Add /api/payloads endpoints for payload access and diffing
- Add DB-driven TaskScheduler with schedule persistence
- Track newDispensaryIds through discovery promotion for chaining
- Add stealth improvements: HTTP fingerprinting, proxy rotation enhancements
- Add Workers dashboard K8s scaling controls

New files:
- src/tasks/handlers/payload-fetch.ts - Fetches from API, saves to disk
- src/services/task-scheduler.ts - DB-driven schedule management
- src/utils/payload-storage.ts - Payload save/load utilities
- src/routes/payloads.ts - Payload API endpoints
- src/services/http-fingerprint.ts - Browser fingerprint generation
- docs/TASK_WORKFLOW_2024-12-10.md - Complete workflow documentation

Migrations:
- 078: Proxy consecutive 403 tracking
- 079: task_schedules table
- 080: raw_crawl_payloads table
- 081: payload column and last_fetch_at

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,8 +1,16 @@
|
||||
/**
|
||||
* Store Discovery Handler
|
||||
*
|
||||
* Discovers new stores by crawling location APIs and adding them
|
||||
* to discovery_locations table.
|
||||
* Per TASK_WORKFLOW_2024-12-10.md: Discovers new stores and returns their IDs for task chaining.
|
||||
*
|
||||
* Flow:
|
||||
* 1. For each active state, run Dutchie discovery
|
||||
* 2. Discover locations via GraphQL
|
||||
* 3. Auto-promote valid locations to dispensaries table
|
||||
* 4. Return newStoreIds[] for chaining to payload_fetch
|
||||
*
|
||||
* Chaining:
|
||||
* store_discovery → (returns newStoreIds) → payload_fetch → product_refresh
|
||||
*/
|
||||
|
||||
import { TaskContext, TaskResult } from '../task-worker';
|
||||
@@ -10,7 +18,7 @@ import { discoverState } from '../../discovery';
|
||||
|
||||
export async function handleStoreDiscovery(ctx: TaskContext): Promise<TaskResult> {
|
||||
const { pool, task } = ctx;
|
||||
const platform = task.platform || 'default';
|
||||
const platform = task.platform || 'dutchie';
|
||||
|
||||
console.log(`[StoreDiscovery] Starting discovery for platform: ${platform}`);
|
||||
|
||||
@@ -22,11 +30,13 @@ export async function handleStoreDiscovery(ctx: TaskContext): Promise<TaskResult
|
||||
const stateCodes = statesResult.rows.map(r => r.code);
|
||||
|
||||
if (stateCodes.length === 0) {
|
||||
return { success: true, storesDiscovered: 0, message: 'No active states to discover' };
|
||||
return { success: true, storesDiscovered: 0, newStoreIds: [], message: 'No active states to discover' };
|
||||
}
|
||||
|
||||
let totalDiscovered = 0;
|
||||
let totalPromoted = 0;
|
||||
// Per TASK_WORKFLOW_2024-12-10.md: Collect all new store IDs for task chaining
|
||||
const allNewStoreIds: number[] = [];
|
||||
|
||||
// Run discovery for each state
|
||||
for (const stateCode of stateCodes) {
|
||||
@@ -39,6 +49,13 @@ export async function handleStoreDiscovery(ctx: TaskContext): Promise<TaskResult
|
||||
const result = await discoverState(pool, stateCode);
|
||||
totalDiscovered += result.totalLocationsFound || 0;
|
||||
totalPromoted += result.totalLocationsUpserted || 0;
|
||||
|
||||
// Per TASK_WORKFLOW_2024-12-10.md: Collect new IDs for chaining
|
||||
if (result.newDispensaryIds && result.newDispensaryIds.length > 0) {
|
||||
allNewStoreIds.push(...result.newDispensaryIds);
|
||||
console.log(`[StoreDiscovery] ${stateCode}: ${result.newDispensaryIds.length} new stores`);
|
||||
}
|
||||
|
||||
console.log(`[StoreDiscovery] ${stateCode}: found ${result.totalLocationsFound}, upserted ${result.totalLocationsUpserted}`);
|
||||
} catch (error: unknown) {
|
||||
const errorMessage = error instanceof Error ? error.message : 'Unknown error';
|
||||
@@ -47,13 +64,15 @@ export async function handleStoreDiscovery(ctx: TaskContext): Promise<TaskResult
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`[StoreDiscovery] Complete: ${totalDiscovered} discovered, ${totalPromoted} promoted`);
|
||||
console.log(`[StoreDiscovery] Complete: ${totalDiscovered} discovered, ${totalPromoted} promoted, ${allNewStoreIds.length} new stores`);
|
||||
|
||||
return {
|
||||
success: true,
|
||||
storesDiscovered: totalDiscovered,
|
||||
storesPromoted: totalPromoted,
|
||||
statesProcessed: stateCodes.length,
|
||||
// Per TASK_WORKFLOW_2024-12-10.md: Return new IDs for task chaining
|
||||
newStoreIds: allNewStoreIds,
|
||||
};
|
||||
} catch (error: unknown) {
|
||||
const errorMessage = error instanceof Error ? error.message : 'Unknown error';
|
||||
@@ -61,6 +80,7 @@ export async function handleStoreDiscovery(ctx: TaskContext): Promise<TaskResult
|
||||
return {
|
||||
success: false,
|
||||
error: errorMessage,
|
||||
newStoreIds: [],
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user