Major changes:
- Split crawl into payload_fetch (API → disk) and product_refresh (disk → DB)
- Add task chaining: store_discovery → product_discovery → payload_fetch → product_refresh
- Add payload storage utilities for gzipped JSON on filesystem
- Add /api/payloads endpoints for payload access and diffing
- Add DB-driven TaskScheduler with schedule persistence
- Track newDispensaryIds through discovery promotion for chaining
- Add stealth improvements: HTTP fingerprinting, proxy rotation enhancements
- Add Workers dashboard K8s scaling controls

New files:
- src/tasks/handlers/payload-fetch.ts - Fetches from API, saves to disk
- src/services/task-scheduler.ts - DB-driven schedule management
- src/utils/payload-storage.ts - Payload save/load utilities
- src/routes/payloads.ts - Payload API endpoints
- src/services/http-fingerprint.ts - Browser fingerprint generation
- docs/TASK_WORKFLOW_2024-12-10.md - Complete workflow documentation

Migrations:
- 078: Proxy consecutive 403 tracking
- 079: task_schedules table
- 080: raw_crawl_payloads table
- 081: payload column and last_fetch_at

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
32 lines
1.1 KiB
TypeScript
32 lines
1.1 KiB
TypeScript
/**
 * Product Discovery Handler
 *
 * Per TASK_WORKFLOW_2024-12-10.md: Initial product fetch for newly discovered stores.
 *
 * Flow:
 * 1. Triggered after store_discovery promotes a new dispensary
 * 2. Chains to payload_fetch to get initial product data
 * 3. payload_fetch chains to product_refresh for DB upsert
 *
 * Chaining:
 * store_discovery → (newStoreIds) → product_discovery → payload_fetch → product_refresh
 */
|
|
|
|
import { TaskContext, TaskResult } from '../task-worker';
|
|
import { handlePayloadFetch } from './payload-fetch';
|
|
|
|
/**
 * Task handler for the product_discovery step.
 *
 * Reads `dispensary_id` from the task on the supplied context. When it is
 * present, logs the start of discovery and hands the same context to
 * `handlePayloadFetch`, returning whatever that handler resolves to.
 *
 * @param ctx - Task context; only `ctx.task.dispensary_id` is read here, the
 *   whole context is forwarded unchanged to `handlePayloadFetch`.
 * @returns The result of `handlePayloadFetch(ctx)`, or a failed result with
 *   the error `'No dispensary_id provided'` when the task carries no
 *   (truthy) `dispensary_id`.
 */
export async function handleProductDiscovery(ctx: TaskContext): Promise<TaskResult> {
  const targetDispensary = ctx.task.dispensary_id;

  if (targetDispensary) {
    console.log(`[ProductDiscovery] Starting initial product discovery for dispensary ${targetDispensary}`);

    // Per TASK_WORKFLOW_2024-12-10.md: delegate to payload_fetch (API → disk).
    // That handler is documented to chain onward to product_refresh (disk → DB).
    return handlePayloadFetch(ctx);
  }

  // Without a dispensary there is nothing to fetch; report failure to the worker.
  const missingIdResult: TaskResult = { success: false, error: 'No dispensary_id provided' };
  return missingIdResult;
}
|