feat(tasks): Refactor task workflow with payload/refresh separation
Major changes: - Split crawl into payload_fetch (API → disk) and product_refresh (disk → DB) - Add task chaining: store_discovery → product_discovery → payload_fetch → product_refresh - Add payload storage utilities for gzipped JSON on filesystem - Add /api/payloads endpoints for payload access and diffing - Add DB-driven TaskScheduler with schedule persistence - Track newDispensaryIds through discovery promotion for chaining - Add stealth improvements: HTTP fingerprinting, proxy rotation enhancements - Add Workers dashboard K8s scaling controls New files: - src/tasks/handlers/payload-fetch.ts - Fetches from API, saves to disk - src/services/task-scheduler.ts - DB-driven schedule management - src/utils/payload-storage.ts - Payload save/load utilities - src/routes/payloads.ts - Payload API endpoints - src/services/http-fingerprint.ts - Browser fingerprint generation - docs/TASK_WORKFLOW_2024-12-10.md - Complete workflow documentation Migrations: - 078: Proxy consecutive 403 tracking - 079: task_schedules table - 080: raw_crawl_payloads table - 081: payload column and last_fetch_at 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
375
backend/src/services/task-scheduler.ts
Normal file
375
backend/src/services/task-scheduler.ts
Normal file
@@ -0,0 +1,375 @@
|
||||
/**
|
||||
* Database-Driven Task Scheduler
|
||||
*
|
||||
* Per TASK_WORKFLOW_2024-12-10.md:
|
||||
* - Schedules stored in DB (survives restarts)
|
||||
* - Uses SELECT FOR UPDATE to prevent duplicate execution across replicas
|
||||
* - Polls every 60s to check if schedules are due
|
||||
* - Generates tasks into worker_tasks table for task-worker.ts to process
|
||||
*
|
||||
* 2024-12-10: Created to replace legacy node-cron scheduler
|
||||
*/
|
||||
|
||||
import { pool } from '../db/pool';
|
||||
import { taskService, TaskRole } from '../tasks/task-service';
|
||||
|
||||
// Per TASK_WORKFLOW_2024-12-10.md: Poll interval for checking schedules
const POLL_INTERVAL_MS = 60_000; // 60 seconds

/**
 * Row shape of the `task_schedules` table (created by migration 079).
 * One row describes a recurring task-generation job.
 */
interface TaskSchedule {
  id: number;
  // Unique schedule name (DB ON CONFLICT key), e.g. 'payload_fetch_all'.
  name: string;
  // Task role this schedule generates; dispatched in executeSchedule().
  role: TaskRole;
  // Disabled schedules are skipped by the due-schedule query.
  enabled: boolean;
  // Hours between runs; also used as the staleness window for payload fetches.
  interval_hours: number;
  last_run_at: Date | null;
  // NULL means "never run" — treated as immediately due.
  next_run_at: Date | null;
  // Optional state filter for payload_fetch schedules (matched against states.code).
  state_code: string | null;
  // Priority copied onto generated worker tasks.
  priority: number;
}
|
||||
|
||||
class TaskScheduler {
  // Handle for the 60s polling loop; null while the scheduler is stopped.
  private pollTimer: NodeJS.Timeout | null = null;
  // Guards start() against being invoked twice.
  private isRunning = false;
|
||||
|
||||
/**
|
||||
* Start the scheduler
|
||||
* Per TASK_WORKFLOW_2024-12-10.md: Called on API server startup
|
||||
*/
|
||||
async start(): Promise<void> {
|
||||
if (this.isRunning) {
|
||||
console.log('[TaskScheduler] Already running');
|
||||
return;
|
||||
}
|
||||
|
||||
console.log('[TaskScheduler] Starting database-driven scheduler...');
|
||||
this.isRunning = true;
|
||||
|
||||
// Per TASK_WORKFLOW_2024-12-10.md: On startup, recover stale tasks
|
||||
try {
|
||||
const recovered = await taskService.recoverStaleTasks(10);
|
||||
if (recovered > 0) {
|
||||
console.log(`[TaskScheduler] Recovered ${recovered} stale tasks from dead workers`);
|
||||
}
|
||||
} catch (err: any) {
|
||||
console.error('[TaskScheduler] Failed to recover stale tasks:', err.message);
|
||||
}
|
||||
|
||||
// Per TASK_WORKFLOW_2024-12-10.md: Ensure default schedules exist
|
||||
await this.ensureDefaultSchedules();
|
||||
|
||||
// Per TASK_WORKFLOW_2024-12-10.md: Check immediately on startup
|
||||
await this.checkAndRunDueSchedules();
|
||||
|
||||
// Per TASK_WORKFLOW_2024-12-10.md: Then poll every 60 seconds
|
||||
this.pollTimer = setInterval(async () => {
|
||||
await this.checkAndRunDueSchedules();
|
||||
}, POLL_INTERVAL_MS);
|
||||
|
||||
console.log('[TaskScheduler] Started - polling every 60s');
|
||||
}
|
||||
|
||||
/**
|
||||
* Stop the scheduler
|
||||
*/
|
||||
stop(): void {
|
||||
if (this.pollTimer) {
|
||||
clearInterval(this.pollTimer);
|
||||
this.pollTimer = null;
|
||||
}
|
||||
this.isRunning = false;
|
||||
console.log('[TaskScheduler] Stopped');
|
||||
}
|
||||
|
||||
/**
|
||||
* Ensure default schedules exist in the database
|
||||
* Per TASK_WORKFLOW_2024-12-10.md: Creates schedules if they don't exist
|
||||
*/
|
||||
private async ensureDefaultSchedules(): Promise<void> {
|
||||
// Per TASK_WORKFLOW_2024-12-10.md: Default schedules for task generation
|
||||
// NOTE: payload_fetch replaces direct product_refresh - it chains to product_refresh
|
||||
const defaults = [
|
||||
{
|
||||
name: 'payload_fetch_all',
|
||||
role: 'payload_fetch' as TaskRole,
|
||||
interval_hours: 4,
|
||||
priority: 0,
|
||||
description: 'Fetch payloads from Dutchie API for all crawl-enabled stores every 4 hours. Chains to product_refresh.',
|
||||
},
|
||||
{
|
||||
name: 'store_discovery_dutchie',
|
||||
role: 'store_discovery' as TaskRole,
|
||||
interval_hours: 24,
|
||||
priority: 5,
|
||||
description: 'Discover new Dutchie stores daily',
|
||||
},
|
||||
{
|
||||
name: 'analytics_refresh',
|
||||
role: 'analytics_refresh' as TaskRole,
|
||||
interval_hours: 6,
|
||||
priority: 0,
|
||||
description: 'Refresh analytics materialized views every 6 hours',
|
||||
},
|
||||
];
|
||||
|
||||
for (const sched of defaults) {
|
||||
try {
|
||||
await pool.query(`
|
||||
INSERT INTO task_schedules (name, role, interval_hours, priority, description, enabled, next_run_at)
|
||||
VALUES ($1, $2, $3, $4, $5, true, NOW())
|
||||
ON CONFLICT (name) DO NOTHING
|
||||
`, [sched.name, sched.role, sched.interval_hours, sched.priority, sched.description]);
|
||||
} catch (err: any) {
|
||||
// Table may not exist yet - will be created by migration
|
||||
if (!err.message.includes('does not exist')) {
|
||||
console.error(`[TaskScheduler] Failed to create default schedule ${sched.name}:`, err.message);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check for and run any due schedules
|
||||
* Per TASK_WORKFLOW_2024-12-10.md: Uses SELECT FOR UPDATE SKIP LOCKED to prevent duplicates
|
||||
*/
|
||||
private async checkAndRunDueSchedules(): Promise<void> {
|
||||
const client = await pool.connect();
|
||||
|
||||
try {
|
||||
await client.query('BEGIN');
|
||||
|
||||
// Per TASK_WORKFLOW_2024-12-10.md: Atomic claim of due schedules
|
||||
const result = await client.query<TaskSchedule>(`
|
||||
SELECT *
|
||||
FROM task_schedules
|
||||
WHERE enabled = true
|
||||
AND (next_run_at IS NULL OR next_run_at <= NOW())
|
||||
FOR UPDATE SKIP LOCKED
|
||||
`);
|
||||
|
||||
for (const schedule of result.rows) {
|
||||
console.log(`[TaskScheduler] Running schedule: ${schedule.name} (${schedule.role})`);
|
||||
|
||||
try {
|
||||
const tasksCreated = await this.executeSchedule(schedule);
|
||||
console.log(`[TaskScheduler] Schedule ${schedule.name} created ${tasksCreated} tasks`);
|
||||
|
||||
// Per TASK_WORKFLOW_2024-12-10.md: Update last_run_at and calculate next_run_at
|
||||
await client.query(`
|
||||
UPDATE task_schedules
|
||||
SET
|
||||
last_run_at = NOW(),
|
||||
next_run_at = NOW() + ($1 || ' hours')::interval,
|
||||
last_task_count = $2,
|
||||
updated_at = NOW()
|
||||
WHERE id = $3
|
||||
`, [schedule.interval_hours, tasksCreated, schedule.id]);
|
||||
|
||||
} catch (err: any) {
|
||||
console.error(`[TaskScheduler] Schedule ${schedule.name} failed:`, err.message);
|
||||
|
||||
// Still update next_run_at to prevent infinite retry loop
|
||||
await client.query(`
|
||||
UPDATE task_schedules
|
||||
SET
|
||||
next_run_at = NOW() + ($1 || ' hours')::interval,
|
||||
last_error = $2,
|
||||
updated_at = NOW()
|
||||
WHERE id = $3
|
||||
`, [schedule.interval_hours, err.message, schedule.id]);
|
||||
}
|
||||
}
|
||||
|
||||
await client.query('COMMIT');
|
||||
} catch (err: any) {
|
||||
await client.query('ROLLBACK');
|
||||
console.error('[TaskScheduler] Failed to check schedules:', err.message);
|
||||
} finally {
|
||||
client.release();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Execute a schedule and create tasks
|
||||
* Per TASK_WORKFLOW_2024-12-10.md: Different logic per role
|
||||
*/
|
||||
private async executeSchedule(schedule: TaskSchedule): Promise<number> {
|
||||
switch (schedule.role) {
|
||||
case 'payload_fetch':
|
||||
// Per TASK_WORKFLOW_2024-12-10.md: payload_fetch replaces direct product_refresh
|
||||
return this.generatePayloadFetchTasks(schedule);
|
||||
|
||||
case 'product_refresh':
|
||||
// Legacy - kept for manual triggers, but scheduled crawls use payload_fetch
|
||||
return this.generatePayloadFetchTasks(schedule);
|
||||
|
||||
case 'store_discovery':
|
||||
return this.generateStoreDiscoveryTasks(schedule);
|
||||
|
||||
case 'analytics_refresh':
|
||||
return this.generateAnalyticsRefreshTasks(schedule);
|
||||
|
||||
default:
|
||||
console.warn(`[TaskScheduler] Unknown role: ${schedule.role}`);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate payload_fetch tasks for stores that need crawling
|
||||
* Per TASK_WORKFLOW_2024-12-10.md: payload_fetch hits API, saves to disk, chains to product_refresh
|
||||
*/
|
||||
private async generatePayloadFetchTasks(schedule: TaskSchedule): Promise<number> {
|
||||
// Per TASK_WORKFLOW_2024-12-10.md: Find stores needing refresh
|
||||
const result = await pool.query(`
|
||||
SELECT d.id
|
||||
FROM dispensaries d
|
||||
WHERE d.crawl_enabled = true
|
||||
AND d.platform_dispensary_id IS NOT NULL
|
||||
-- No pending/running payload_fetch or product_refresh task already
|
||||
AND NOT EXISTS (
|
||||
SELECT 1 FROM worker_tasks t
|
||||
WHERE t.dispensary_id = d.id
|
||||
AND t.role IN ('payload_fetch', 'product_refresh')
|
||||
AND t.status IN ('pending', 'claimed', 'running')
|
||||
)
|
||||
-- Never fetched OR last fetch > interval ago
|
||||
AND (
|
||||
d.last_fetch_at IS NULL
|
||||
OR d.last_fetch_at < NOW() - ($1 || ' hours')::interval
|
||||
)
|
||||
${schedule.state_code ? 'AND d.state_id = (SELECT id FROM states WHERE code = $2)' : ''}
|
||||
`, schedule.state_code ? [schedule.interval_hours, schedule.state_code] : [schedule.interval_hours]);
|
||||
|
||||
const dispensaryIds = result.rows.map((r: { id: number }) => r.id);
|
||||
|
||||
if (dispensaryIds.length === 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Per TASK_WORKFLOW_2024-12-10.md: Create payload_fetch tasks (they chain to product_refresh)
|
||||
const tasks = dispensaryIds.map((id: number) => ({
|
||||
role: 'payload_fetch' as TaskRole,
|
||||
dispensary_id: id,
|
||||
priority: schedule.priority,
|
||||
}));
|
||||
|
||||
return taskService.createTasks(tasks);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate store_discovery tasks
|
||||
* Per TASK_WORKFLOW_2024-12-10.md: One task per platform
|
||||
*/
|
||||
private async generateStoreDiscoveryTasks(schedule: TaskSchedule): Promise<number> {
|
||||
// Check if discovery task already pending
|
||||
const existing = await taskService.listTasks({
|
||||
role: 'store_discovery',
|
||||
status: ['pending', 'claimed', 'running'],
|
||||
limit: 1,
|
||||
});
|
||||
|
||||
if (existing.length > 0) {
|
||||
console.log('[TaskScheduler] Store discovery task already pending, skipping');
|
||||
return 0;
|
||||
}
|
||||
|
||||
await taskService.createTask({
|
||||
role: 'store_discovery',
|
||||
platform: 'dutchie',
|
||||
priority: schedule.priority,
|
||||
});
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate analytics_refresh tasks
|
||||
* Per TASK_WORKFLOW_2024-12-10.md: Single task to refresh all MVs
|
||||
*/
|
||||
private async generateAnalyticsRefreshTasks(schedule: TaskSchedule): Promise<number> {
|
||||
// Check if analytics task already pending
|
||||
const existing = await taskService.listTasks({
|
||||
role: 'analytics_refresh',
|
||||
status: ['pending', 'claimed', 'running'],
|
||||
limit: 1,
|
||||
});
|
||||
|
||||
if (existing.length > 0) {
|
||||
console.log('[TaskScheduler] Analytics refresh task already pending, skipping');
|
||||
return 0;
|
||||
}
|
||||
|
||||
await taskService.createTask({
|
||||
role: 'analytics_refresh',
|
||||
priority: schedule.priority,
|
||||
});
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all schedules for dashboard display
|
||||
*/
|
||||
async getSchedules(): Promise<TaskSchedule[]> {
|
||||
try {
|
||||
const result = await pool.query(`
|
||||
SELECT * FROM task_schedules ORDER BY name
|
||||
`);
|
||||
return result.rows as TaskSchedule[];
|
||||
} catch {
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Update a schedule
|
||||
*/
|
||||
async updateSchedule(id: number, updates: Partial<TaskSchedule>): Promise<void> {
|
||||
const setClauses: string[] = [];
|
||||
const values: any[] = [];
|
||||
let paramIndex = 1;
|
||||
|
||||
if (updates.enabled !== undefined) {
|
||||
setClauses.push(`enabled = $${paramIndex++}`);
|
||||
values.push(updates.enabled);
|
||||
}
|
||||
if (updates.interval_hours !== undefined) {
|
||||
setClauses.push(`interval_hours = $${paramIndex++}`);
|
||||
values.push(updates.interval_hours);
|
||||
}
|
||||
if (updates.priority !== undefined) {
|
||||
setClauses.push(`priority = $${paramIndex++}`);
|
||||
values.push(updates.priority);
|
||||
}
|
||||
|
||||
if (setClauses.length === 0) return;
|
||||
|
||||
setClauses.push('updated_at = NOW()');
|
||||
values.push(id);
|
||||
|
||||
await pool.query(`
|
||||
UPDATE task_schedules
|
||||
SET ${setClauses.join(', ')}
|
||||
WHERE id = $${paramIndex}
|
||||
`, values);
|
||||
}
|
||||
|
||||
/**
|
||||
* Trigger a schedule to run immediately
|
||||
*/
|
||||
async triggerSchedule(id: number): Promise<number> {
|
||||
const result = await pool.query(`
|
||||
SELECT * FROM task_schedules WHERE id = $1
|
||||
`, [id]);
|
||||
|
||||
if (result.rows.length === 0) {
|
||||
throw new Error(`Schedule ${id} not found`);
|
||||
}
|
||||
|
||||
return this.executeSchedule(result.rows[0] as TaskSchedule);
|
||||
}
|
||||
}
|
||||
|
||||
// Per TASK_WORKFLOW_2024-12-10.md: Singleton instance — start() is invoked once
// on API server startup; the same instance serves schedule reads/updates/triggers.
export const taskScheduler = new TaskScheduler();
|
||||
Reference in New Issue
Block a user