feat(tasks): Refactor task workflow with payload/refresh separation
Major changes:
- Split crawl into payload_fetch (API → disk) and product_refresh (disk → DB)
- Add task chaining: store_discovery → product_discovery → payload_fetch → product_refresh
- Add payload storage utilities for gzipped JSON on filesystem
- Add /api/payloads endpoints for payload access and diffing
- Add DB-driven TaskScheduler with schedule persistence
- Track newDispensaryIds through discovery promotion for chaining
- Add stealth improvements: HTTP fingerprinting, proxy rotation enhancements
- Add Workers dashboard K8s scaling controls

New files:
- src/tasks/handlers/payload-fetch.ts - Fetches from API, saves to disk
- src/services/task-scheduler.ts - DB-driven schedule management
- src/utils/payload-storage.ts - Payload save/load utilities
- src/routes/payloads.ts - Payload API endpoints
- src/services/http-fingerprint.ts - Browser fingerprint generation
- docs/TASK_WORKFLOW_2024-12-10.md - Complete workflow documentation

Migrations:
- 078: Proxy consecutive 403 tracking
- 079: task_schedules table
- 080: raw_crawl_payloads table
- 081: payload column and last_fetch_at

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -6,6 +6,8 @@ import { initializeMinio, isMinioEnabled } from './utils/minio';
|
||||
import { initializeImageStorage } from './utils/image-storage';
|
||||
import { logger } from './services/logger';
|
||||
import { cleanupOrphanedJobs } from './services/proxyTestQueue';
|
||||
// Per TASK_WORKFLOW_2024-12-10.md: Database-driven task scheduler
|
||||
import { taskScheduler } from './services/task-scheduler';
|
||||
import { runAutoMigrations } from './db/auto-migrate';
|
||||
import { getPool } from './db/pool';
|
||||
import healthRoutes from './routes/health';
|
||||
@@ -142,6 +144,8 @@ import seoRoutes from './routes/seo';
|
||||
import priceAnalyticsRoutes from './routes/price-analytics';
|
||||
import tasksRoutes from './routes/tasks';
|
||||
import workerRegistryRoutes from './routes/worker-registry';
|
||||
// Per TASK_WORKFLOW_2024-12-10.md: Raw payload access API
|
||||
import payloadsRoutes from './routes/payloads';
|
||||
|
||||
// Mark requests from trusted domains (cannaiq.co, findagram.co, findadispo.com)
|
||||
// These domains can access the API without authentication
|
||||
@@ -222,6 +226,10 @@ console.log('[Tasks] Routes registered at /api/tasks');
|
||||
app.use('/api/worker-registry', workerRegistryRoutes);
|
||||
console.log('[WorkerRegistry] Routes registered at /api/worker-registry');
|
||||
|
||||
// Per TASK_WORKFLOW_2024-12-10.md: Raw payload access API
|
||||
app.use('/api/payloads', payloadsRoutes);
|
||||
console.log('[Payloads] Routes registered at /api/payloads');
|
||||
|
||||
// Phase 3: Analytics V2 - Enhanced analytics with rec/med state segmentation
|
||||
try {
|
||||
const analyticsV2Router = createAnalyticsV2Router(getPool());
|
||||
@@ -326,6 +334,17 @@ async function startServer() {
|
||||
// Clean up any orphaned proxy test jobs from previous server runs
|
||||
await cleanupOrphanedJobs();
|
||||
|
||||
// Per TASK_WORKFLOW_2024-12-10.md: Start database-driven task scheduler
|
||||
// This replaces node-cron - schedules are stored in DB and survive restarts
|
||||
// Uses SELECT FOR UPDATE SKIP LOCKED for multi-replica safety
|
||||
try {
|
||||
await taskScheduler.start();
|
||||
logger.info('system', 'Task scheduler started');
|
||||
} catch (err: any) {
|
||||
// Non-fatal - scheduler can recover on next poll
|
||||
logger.warn('system', `Task scheduler startup warning: ${err.message}`);
|
||||
}
|
||||
|
||||
app.listen(PORT, () => {
|
||||
logger.info('system', `Server running on port ${PORT}`);
|
||||
console.log(`🚀 Server running on port ${PORT}`);
|
||||
|
||||
Reference in New Issue
Block a user