cannaiq/backend/src/services/task-scheduler.ts
Kelly 4949b22457 feat(tasks): Refactor task workflow with payload/refresh separation
Major changes:
- Split crawl into payload_fetch (API → disk) and product_refresh (disk → DB)
- Add task chaining: store_discovery → product_discovery → payload_fetch → product_refresh (see the sketch after this list)
- Add payload storage utilities for gzipped JSON on filesystem
- Add /api/payloads endpoints for payload access and diffing
- Add DB-driven TaskScheduler with schedule persistence
- Track newDispensaryIds through discovery promotion for chaining
- Add stealth improvements: HTTP fingerprinting, proxy rotation enhancements
- Add Workers dashboard K8s scaling controls
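
Chaining sketch (illustrative only — handler and helper names below are assumptions; the real logic lives in src/tasks/handlers/payload-fetch.ts):

    import { taskService } from '../task-service';

    // Stand-in for the real API-to-disk fetch (actual code in payload-fetch.ts, not shown here)
    declare function fetchAndSavePayload(dispensaryId: number): Promise<string>;

    export async function handlePayloadFetch(task: { dispensary_id: number }): Promise<void> {
      await fetchAndSavePayload(task.dispensary_id); // step 1: Dutchie API → gzipped JSON on disk
      // step 2: chain to product_refresh, which loads the saved payload into the DB
      // (assumes createTask accepts dispensary_id, as createTasks does in task-scheduler.ts)
      await taskService.createTask({
        role: 'product_refresh',
        dispensary_id: task.dispensary_id,
        priority: 0,
      });
    }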

New files:
- src/tasks/handlers/payload-fetch.ts - Fetches from API, saves to disk
- src/services/task-scheduler.ts - DB-driven schedule management
- src/utils/payload-storage.ts - Payload save/load utilities (sketched below)
- src/routes/payloads.ts - Payload API endpoints
- src/services/http-fingerprint.ts - Browser fingerprint generation
- docs/TASK_WORKFLOW_2024-12-10.md - Complete workflow documentation
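
Payload storage sketch (illustrative; the real utilities live in src/utils/payload-storage.ts and may differ — function names and file layout here are assumptions):

    import { gzipSync, gunzipSync } from 'node:zlib';
    import { mkdir, readFile, writeFile } from 'node:fs/promises';
    import { dirname } from 'node:path';

    // Save a crawl payload as gzipped JSON, creating parent directories as needed
    export async function savePayload(filePath: string, payload: unknown): Promise<void> {
      await mkdir(dirname(filePath), { recursive: true });
      await writeFile(filePath, gzipSync(JSON.stringify(payload)));
    }

    // Load and parse a previously saved gzipped JSON payload
    export async function loadPayload<T>(filePath: string): Promise<T> {
      return JSON.parse(gunzipSync(await readFile(filePath)).toString('utf8')) as T;
    }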

Migrations:
- 078: Proxy consecutive 403 tracking
- 079: task_schedules table (shape sketched below)
- 080: raw_crawl_payloads table
- 081: payload column and last_fetch_at
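
task_schedules shape (a plausible sketch inferred from the columns task-scheduler.ts reads and writes; the actual migration 079 may define more or different columns):

    import { pool } from '../db/pool';

    await pool.query(`
      CREATE TABLE IF NOT EXISTS task_schedules (
        id              SERIAL PRIMARY KEY,
        name            TEXT UNIQUE NOT NULL,
        role            TEXT NOT NULL,
        enabled         BOOLEAN NOT NULL DEFAULT true,
        interval_hours  INTEGER NOT NULL,
        priority        INTEGER NOT NULL DEFAULT 0,
        description     TEXT,
        state_code      TEXT,
        last_run_at     TIMESTAMPTZ,
        next_run_at     TIMESTAMPTZ,
        last_task_count INTEGER,
        last_error      TEXT,
        updated_at      TIMESTAMPTZ NOT NULL DEFAULT NOW()
      )
    `);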

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-10 22:15:35 -07:00

376 lines
12 KiB
TypeScript

/**
 * Database-Driven Task Scheduler
 *
 * Per TASK_WORKFLOW_2024-12-10.md:
 * - Schedules stored in DB (survives restarts)
 * - Uses SELECT FOR UPDATE to prevent duplicate execution across replicas
 * - Polls every 60s to check if schedules are due
 * - Generates tasks into worker_tasks table for task-worker.ts to process
 *
 * 2024-12-10: Created to replace legacy node-cron scheduler
 */
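// Concurrency note: checkAndRunDueSchedules() claims due rows inside a transaction
// using SELECT ... FOR UPDATE SKIP LOCKED, so when several API replicas poll at the
// same moment each due schedule is picked up by at most one of them; the others skip
// the locked rows and move on.
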
import { pool } from '../db/pool';
import { taskService, TaskRole } from '../tasks/task-service';

// Per TASK_WORKFLOW_2024-12-10.md: Poll interval for checking schedules
const POLL_INTERVAL_MS = 60_000; // 60 seconds

interface TaskSchedule {
  id: number;
  name: string;
  role: TaskRole;
  enabled: boolean;
  interval_hours: number;
  last_run_at: Date | null;
  next_run_at: Date | null;
  state_code: string | null;
  priority: number;
}
class TaskScheduler {
  private pollTimer: NodeJS.Timeout | null = null;
  private isRunning = false;

  /**
   * Start the scheduler
   * Per TASK_WORKFLOW_2024-12-10.md: Called on API server startup
   */
  async start(): Promise<void> {
    if (this.isRunning) {
      console.log('[TaskScheduler] Already running');
      return;
    }
    console.log('[TaskScheduler] Starting database-driven scheduler...');
    this.isRunning = true;

    // Per TASK_WORKFLOW_2024-12-10.md: On startup, recover stale tasks
    try {
      const recovered = await taskService.recoverStaleTasks(10);
      if (recovered > 0) {
        console.log(`[TaskScheduler] Recovered ${recovered} stale tasks from dead workers`);
      }
    } catch (err: any) {
      console.error('[TaskScheduler] Failed to recover stale tasks:', err.message);
    }

    // Per TASK_WORKFLOW_2024-12-10.md: Ensure default schedules exist
    await this.ensureDefaultSchedules();

    // Per TASK_WORKFLOW_2024-12-10.md: Check immediately on startup
    await this.checkAndRunDueSchedules();

    // Per TASK_WORKFLOW_2024-12-10.md: Then poll every 60 seconds
    this.pollTimer = setInterval(async () => {
      await this.checkAndRunDueSchedules();
    }, POLL_INTERVAL_MS);

    console.log('[TaskScheduler] Started - polling every 60s');
  }

  /**
   * Stop the scheduler
   */
  stop(): void {
    if (this.pollTimer) {
      clearInterval(this.pollTimer);
      this.pollTimer = null;
    }
    this.isRunning = false;
    console.log('[TaskScheduler] Stopped');
  }
  /**
   * Ensure default schedules exist in the database
   * Per TASK_WORKFLOW_2024-12-10.md: Creates schedules if they don't exist
   */
  private async ensureDefaultSchedules(): Promise<void> {
    // Per TASK_WORKFLOW_2024-12-10.md: Default schedules for task generation
    // NOTE: payload_fetch replaces direct product_refresh - it chains to product_refresh
    const defaults = [
      {
        name: 'payload_fetch_all',
        role: 'payload_fetch' as TaskRole,
        interval_hours: 4,
        priority: 0,
        description: 'Fetch payloads from Dutchie API for all crawl-enabled stores every 4 hours. Chains to product_refresh.',
      },
      {
        name: 'store_discovery_dutchie',
        role: 'store_discovery' as TaskRole,
        interval_hours: 24,
        priority: 5,
        description: 'Discover new Dutchie stores daily',
      },
      {
        name: 'analytics_refresh',
        role: 'analytics_refresh' as TaskRole,
        interval_hours: 6,
        priority: 0,
        description: 'Refresh analytics materialized views every 6 hours',
      },
    ];

    for (const sched of defaults) {
      try {
        await pool.query(`
          INSERT INTO task_schedules (name, role, interval_hours, priority, description, enabled, next_run_at)
          VALUES ($1, $2, $3, $4, $5, true, NOW())
          ON CONFLICT (name) DO NOTHING
        `, [sched.name, sched.role, sched.interval_hours, sched.priority, sched.description]);
      } catch (err: any) {
        // Table may not exist yet - will be created by migration
        if (!err.message.includes('does not exist')) {
          console.error(`[TaskScheduler] Failed to create default schedule ${sched.name}:`, err.message);
        }
      }
    }
  }
  /**
   * Check for and run any due schedules
   * Per TASK_WORKFLOW_2024-12-10.md: Uses SELECT FOR UPDATE SKIP LOCKED to prevent duplicates
   */
  private async checkAndRunDueSchedules(): Promise<void> {
    const client = await pool.connect();
    try {
      await client.query('BEGIN');

      // Per TASK_WORKFLOW_2024-12-10.md: Atomic claim of due schedules
      const result = await client.query<TaskSchedule>(`
        SELECT *
        FROM task_schedules
        WHERE enabled = true
          AND (next_run_at IS NULL OR next_run_at <= NOW())
        FOR UPDATE SKIP LOCKED
      `);

      for (const schedule of result.rows) {
        console.log(`[TaskScheduler] Running schedule: ${schedule.name} (${schedule.role})`);
        try {
          const tasksCreated = await this.executeSchedule(schedule);
          console.log(`[TaskScheduler] Schedule ${schedule.name} created ${tasksCreated} tasks`);

          // Per TASK_WORKFLOW_2024-12-10.md: Update last_run_at and calculate next_run_at
          await client.query(`
            UPDATE task_schedules
            SET
              last_run_at = NOW(),
              next_run_at = NOW() + ($1 || ' hours')::interval,
              last_task_count = $2,
              updated_at = NOW()
            WHERE id = $3
          `, [schedule.interval_hours, tasksCreated, schedule.id]);
        } catch (err: any) {
          console.error(`[TaskScheduler] Schedule ${schedule.name} failed:`, err.message);
          // Still update next_run_at to prevent infinite retry loop
          await client.query(`
            UPDATE task_schedules
            SET
              next_run_at = NOW() + ($1 || ' hours')::interval,
              last_error = $2,
              updated_at = NOW()
            WHERE id = $3
          `, [schedule.interval_hours, err.message, schedule.id]);
        }
      }

      await client.query('COMMIT');
    } catch (err: any) {
      await client.query('ROLLBACK');
      console.error('[TaskScheduler] Failed to check schedules:', err.message);
    } finally {
      client.release();
    }
  }
  /**
   * Execute a schedule and create tasks
   * Per TASK_WORKFLOW_2024-12-10.md: Different logic per role
   */
  private async executeSchedule(schedule: TaskSchedule): Promise<number> {
    switch (schedule.role) {
      case 'payload_fetch':
        // Per TASK_WORKFLOW_2024-12-10.md: payload_fetch replaces direct product_refresh
        return this.generatePayloadFetchTasks(schedule);
      case 'product_refresh':
        // Legacy - kept for manual triggers, but scheduled crawls use payload_fetch
        return this.generatePayloadFetchTasks(schedule);
      case 'store_discovery':
        return this.generateStoreDiscoveryTasks(schedule);
      case 'analytics_refresh':
        return this.generateAnalyticsRefreshTasks(schedule);
      default:
        console.warn(`[TaskScheduler] Unknown role: ${schedule.role}`);
        return 0;
    }
  }

  /**
   * Generate payload_fetch tasks for stores that need crawling
   * Per TASK_WORKFLOW_2024-12-10.md: payload_fetch hits API, saves to disk, chains to product_refresh
   */
  private async generatePayloadFetchTasks(schedule: TaskSchedule): Promise<number> {
    // Per TASK_WORKFLOW_2024-12-10.md: Find stores needing refresh
    const result = await pool.query(`
      SELECT d.id
      FROM dispensaries d
      WHERE d.crawl_enabled = true
        AND d.platform_dispensary_id IS NOT NULL
        -- No pending/running payload_fetch or product_refresh task already
        AND NOT EXISTS (
          SELECT 1 FROM worker_tasks t
          WHERE t.dispensary_id = d.id
            AND t.role IN ('payload_fetch', 'product_refresh')
            AND t.status IN ('pending', 'claimed', 'running')
        )
        -- Never fetched OR last fetch > interval ago
        AND (
          d.last_fetch_at IS NULL
          OR d.last_fetch_at < NOW() - ($1 || ' hours')::interval
        )
        ${schedule.state_code ? 'AND d.state_id = (SELECT id FROM states WHERE code = $2)' : ''}
    `, schedule.state_code ? [schedule.interval_hours, schedule.state_code] : [schedule.interval_hours]);

    const dispensaryIds = result.rows.map((r: { id: number }) => r.id);
    if (dispensaryIds.length === 0) {
      return 0;
    }

    // Per TASK_WORKFLOW_2024-12-10.md: Create payload_fetch tasks (they chain to product_refresh)
    const tasks = dispensaryIds.map((id: number) => ({
      role: 'payload_fetch' as TaskRole,
      dispensary_id: id,
      priority: schedule.priority,
    }));
    return taskService.createTasks(tasks);
  }
  /**
   * Generate store_discovery tasks
   * Per TASK_WORKFLOW_2024-12-10.md: One task per platform
   */
  private async generateStoreDiscoveryTasks(schedule: TaskSchedule): Promise<number> {
    // Check if discovery task already pending
    const existing = await taskService.listTasks({
      role: 'store_discovery',
      status: ['pending', 'claimed', 'running'],
      limit: 1,
    });
    if (existing.length > 0) {
      console.log('[TaskScheduler] Store discovery task already pending, skipping');
      return 0;
    }

    await taskService.createTask({
      role: 'store_discovery',
      platform: 'dutchie',
      priority: schedule.priority,
    });
    return 1;
  }

  /**
   * Generate analytics_refresh tasks
   * Per TASK_WORKFLOW_2024-12-10.md: Single task to refresh all MVs
   */
  private async generateAnalyticsRefreshTasks(schedule: TaskSchedule): Promise<number> {
    // Check if analytics task already pending
    const existing = await taskService.listTasks({
      role: 'analytics_refresh',
      status: ['pending', 'claimed', 'running'],
      limit: 1,
    });
    if (existing.length > 0) {
      console.log('[TaskScheduler] Analytics refresh task already pending, skipping');
      return 0;
    }

    await taskService.createTask({
      role: 'analytics_refresh',
      priority: schedule.priority,
    });
    return 1;
  }
  /**
   * Get all schedules for dashboard display
   */
  async getSchedules(): Promise<TaskSchedule[]> {
    try {
      const result = await pool.query(`
        SELECT * FROM task_schedules ORDER BY name
      `);
      return result.rows as TaskSchedule[];
    } catch {
      return [];
    }
  }

  /**
   * Update a schedule
   */
  async updateSchedule(id: number, updates: Partial<TaskSchedule>): Promise<void> {
    const setClauses: string[] = [];
    const values: any[] = [];
    let paramIndex = 1;

    if (updates.enabled !== undefined) {
      setClauses.push(`enabled = $${paramIndex++}`);
      values.push(updates.enabled);
    }
    if (updates.interval_hours !== undefined) {
      setClauses.push(`interval_hours = $${paramIndex++}`);
      values.push(updates.interval_hours);
    }
    if (updates.priority !== undefined) {
      setClauses.push(`priority = $${paramIndex++}`);
      values.push(updates.priority);
    }
    if (setClauses.length === 0) return;

    setClauses.push('updated_at = NOW()');
    values.push(id);
    await pool.query(`
      UPDATE task_schedules
      SET ${setClauses.join(', ')}
      WHERE id = $${paramIndex}
    `, values);
  }

  /**
   * Trigger a schedule to run immediately
   */
  async triggerSchedule(id: number): Promise<number> {
    const result = await pool.query(`
      SELECT * FROM task_schedules WHERE id = $1
    `, [id]);
    if (result.rows.length === 0) {
      throw new Error(`Schedule ${id} not found`);
    }
    return this.executeSchedule(result.rows[0] as TaskSchedule);
  }
}

// Per TASK_WORKFLOW_2024-12-10.md: Singleton instance
export const taskScheduler = new TaskScheduler();
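
// Illustrative wiring (the real startup code lives in the API server entrypoint, not in this file):
//
//   import { taskScheduler } from './services/task-scheduler';
//   await taskScheduler.start(); // begin polling on server boot
//   taskScheduler.stop();        // stop polling on graceful shutdown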