Major changes: - Split crawl into payload_fetch (API → disk) and product_refresh (disk → DB) - Add task chaining: store_discovery → product_discovery → payload_fetch → product_refresh - Add payload storage utilities for gzipped JSON on filesystem - Add /api/payloads endpoints for payload access and diffing - Add DB-driven TaskScheduler with schedule persistence - Track newDispensaryIds through discovery promotion for chaining - Add stealth improvements: HTTP fingerprinting, proxy rotation enhancements - Add Workers dashboard K8s scaling controls New files: - src/tasks/handlers/payload-fetch.ts - Fetches from API, saves to disk - src/services/task-scheduler.ts - DB-driven schedule management - src/utils/payload-storage.ts - Payload save/load utilities - src/routes/payloads.ts - Payload API endpoints - src/services/http-fingerprint.ts - Browser fingerprint generation - docs/TASK_WORKFLOW_2024-12-10.md - Complete workflow documentation Migrations: - 078: Proxy consecutive 403 tracking - 079: task_schedules table - 080: raw_crawl_payloads table - 081: payload column and last_fetch_at 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
376 lines · 12 KiB · TypeScript
/**
 * Database-Driven Task Scheduler
 *
 * Per TASK_WORKFLOW_2024-12-10.md:
 * - Schedules stored in DB (survives restarts)
 * - Uses SELECT FOR UPDATE to prevent duplicate execution across replicas
 * - Polls every 60s to check if schedules are due
 * - Generates tasks into worker_tasks table for task-worker.ts to process
 *
 * 2024-12-10: Created to replace legacy node-cron scheduler
 */

import { pool } from '../db/pool';
import { taskService, TaskRole } from '../tasks/task-service';

// Per TASK_WORKFLOW_2024-12-10.md: Poll interval for checking schedules.
// Used by TaskScheduler.start() as the setInterval tick between due-schedule checks.
const POLL_INTERVAL_MS = 60_000; // 60 seconds
/**
 * Row shape of the task_schedules table (created by migration 079).
 * Read via SELECT * in checkAndRunDueSchedules/getSchedules, so field
 * names must match the table's column names exactly.
 */
interface TaskSchedule {
  // Primary key of the schedule row.
  id: number;
  // Unique, human-readable schedule name (the ON CONFLICT (name) target).
  name: string;
  // Task role this schedule generates (payload_fetch, store_discovery, ...).
  role: TaskRole;
  // Disabled schedules are skipped by the due-schedule query.
  enabled: boolean;
  // Hours between runs; next_run_at is advanced by this amount after each run.
  interval_hours: number;
  // When the schedule last ran; null if it has never run.
  last_run_at: Date | null;
  // When the schedule is next due; null is treated as "due immediately".
  next_run_at: Date | null;
  // Optional state filter applied when generating payload_fetch tasks;
  // null means all states.
  state_code: string | null;
  // Priority copied onto the worker tasks this schedule creates.
  priority: number;
}
class TaskScheduler {
|
|
private pollTimer: NodeJS.Timeout | null = null;
|
|
private isRunning = false;
|
|
|
|
/**
|
|
* Start the scheduler
|
|
* Per TASK_WORKFLOW_2024-12-10.md: Called on API server startup
|
|
*/
|
|
async start(): Promise<void> {
|
|
if (this.isRunning) {
|
|
console.log('[TaskScheduler] Already running');
|
|
return;
|
|
}
|
|
|
|
console.log('[TaskScheduler] Starting database-driven scheduler...');
|
|
this.isRunning = true;
|
|
|
|
// Per TASK_WORKFLOW_2024-12-10.md: On startup, recover stale tasks
|
|
try {
|
|
const recovered = await taskService.recoverStaleTasks(10);
|
|
if (recovered > 0) {
|
|
console.log(`[TaskScheduler] Recovered ${recovered} stale tasks from dead workers`);
|
|
}
|
|
} catch (err: any) {
|
|
console.error('[TaskScheduler] Failed to recover stale tasks:', err.message);
|
|
}
|
|
|
|
// Per TASK_WORKFLOW_2024-12-10.md: Ensure default schedules exist
|
|
await this.ensureDefaultSchedules();
|
|
|
|
// Per TASK_WORKFLOW_2024-12-10.md: Check immediately on startup
|
|
await this.checkAndRunDueSchedules();
|
|
|
|
// Per TASK_WORKFLOW_2024-12-10.md: Then poll every 60 seconds
|
|
this.pollTimer = setInterval(async () => {
|
|
await this.checkAndRunDueSchedules();
|
|
}, POLL_INTERVAL_MS);
|
|
|
|
console.log('[TaskScheduler] Started - polling every 60s');
|
|
}
|
|
|
|
/**
|
|
* Stop the scheduler
|
|
*/
|
|
stop(): void {
|
|
if (this.pollTimer) {
|
|
clearInterval(this.pollTimer);
|
|
this.pollTimer = null;
|
|
}
|
|
this.isRunning = false;
|
|
console.log('[TaskScheduler] Stopped');
|
|
}
|
|
|
|
/**
|
|
* Ensure default schedules exist in the database
|
|
* Per TASK_WORKFLOW_2024-12-10.md: Creates schedules if they don't exist
|
|
*/
|
|
private async ensureDefaultSchedules(): Promise<void> {
|
|
// Per TASK_WORKFLOW_2024-12-10.md: Default schedules for task generation
|
|
// NOTE: payload_fetch replaces direct product_refresh - it chains to product_refresh
|
|
const defaults = [
|
|
{
|
|
name: 'payload_fetch_all',
|
|
role: 'payload_fetch' as TaskRole,
|
|
interval_hours: 4,
|
|
priority: 0,
|
|
description: 'Fetch payloads from Dutchie API for all crawl-enabled stores every 4 hours. Chains to product_refresh.',
|
|
},
|
|
{
|
|
name: 'store_discovery_dutchie',
|
|
role: 'store_discovery' as TaskRole,
|
|
interval_hours: 24,
|
|
priority: 5,
|
|
description: 'Discover new Dutchie stores daily',
|
|
},
|
|
{
|
|
name: 'analytics_refresh',
|
|
role: 'analytics_refresh' as TaskRole,
|
|
interval_hours: 6,
|
|
priority: 0,
|
|
description: 'Refresh analytics materialized views every 6 hours',
|
|
},
|
|
];
|
|
|
|
for (const sched of defaults) {
|
|
try {
|
|
await pool.query(`
|
|
INSERT INTO task_schedules (name, role, interval_hours, priority, description, enabled, next_run_at)
|
|
VALUES ($1, $2, $3, $4, $5, true, NOW())
|
|
ON CONFLICT (name) DO NOTHING
|
|
`, [sched.name, sched.role, sched.interval_hours, sched.priority, sched.description]);
|
|
} catch (err: any) {
|
|
// Table may not exist yet - will be created by migration
|
|
if (!err.message.includes('does not exist')) {
|
|
console.error(`[TaskScheduler] Failed to create default schedule ${sched.name}:`, err.message);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Check for and run any due schedules
|
|
* Per TASK_WORKFLOW_2024-12-10.md: Uses SELECT FOR UPDATE SKIP LOCKED to prevent duplicates
|
|
*/
|
|
private async checkAndRunDueSchedules(): Promise<void> {
|
|
const client = await pool.connect();
|
|
|
|
try {
|
|
await client.query('BEGIN');
|
|
|
|
// Per TASK_WORKFLOW_2024-12-10.md: Atomic claim of due schedules
|
|
const result = await client.query<TaskSchedule>(`
|
|
SELECT *
|
|
FROM task_schedules
|
|
WHERE enabled = true
|
|
AND (next_run_at IS NULL OR next_run_at <= NOW())
|
|
FOR UPDATE SKIP LOCKED
|
|
`);
|
|
|
|
for (const schedule of result.rows) {
|
|
console.log(`[TaskScheduler] Running schedule: ${schedule.name} (${schedule.role})`);
|
|
|
|
try {
|
|
const tasksCreated = await this.executeSchedule(schedule);
|
|
console.log(`[TaskScheduler] Schedule ${schedule.name} created ${tasksCreated} tasks`);
|
|
|
|
// Per TASK_WORKFLOW_2024-12-10.md: Update last_run_at and calculate next_run_at
|
|
await client.query(`
|
|
UPDATE task_schedules
|
|
SET
|
|
last_run_at = NOW(),
|
|
next_run_at = NOW() + ($1 || ' hours')::interval,
|
|
last_task_count = $2,
|
|
updated_at = NOW()
|
|
WHERE id = $3
|
|
`, [schedule.interval_hours, tasksCreated, schedule.id]);
|
|
|
|
} catch (err: any) {
|
|
console.error(`[TaskScheduler] Schedule ${schedule.name} failed:`, err.message);
|
|
|
|
// Still update next_run_at to prevent infinite retry loop
|
|
await client.query(`
|
|
UPDATE task_schedules
|
|
SET
|
|
next_run_at = NOW() + ($1 || ' hours')::interval,
|
|
last_error = $2,
|
|
updated_at = NOW()
|
|
WHERE id = $3
|
|
`, [schedule.interval_hours, err.message, schedule.id]);
|
|
}
|
|
}
|
|
|
|
await client.query('COMMIT');
|
|
} catch (err: any) {
|
|
await client.query('ROLLBACK');
|
|
console.error('[TaskScheduler] Failed to check schedules:', err.message);
|
|
} finally {
|
|
client.release();
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Execute a schedule and create tasks
|
|
* Per TASK_WORKFLOW_2024-12-10.md: Different logic per role
|
|
*/
|
|
private async executeSchedule(schedule: TaskSchedule): Promise<number> {
|
|
switch (schedule.role) {
|
|
case 'payload_fetch':
|
|
// Per TASK_WORKFLOW_2024-12-10.md: payload_fetch replaces direct product_refresh
|
|
return this.generatePayloadFetchTasks(schedule);
|
|
|
|
case 'product_refresh':
|
|
// Legacy - kept for manual triggers, but scheduled crawls use payload_fetch
|
|
return this.generatePayloadFetchTasks(schedule);
|
|
|
|
case 'store_discovery':
|
|
return this.generateStoreDiscoveryTasks(schedule);
|
|
|
|
case 'analytics_refresh':
|
|
return this.generateAnalyticsRefreshTasks(schedule);
|
|
|
|
default:
|
|
console.warn(`[TaskScheduler] Unknown role: ${schedule.role}`);
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Generate payload_fetch tasks for stores that need crawling
|
|
* Per TASK_WORKFLOW_2024-12-10.md: payload_fetch hits API, saves to disk, chains to product_refresh
|
|
*/
|
|
private async generatePayloadFetchTasks(schedule: TaskSchedule): Promise<number> {
|
|
// Per TASK_WORKFLOW_2024-12-10.md: Find stores needing refresh
|
|
const result = await pool.query(`
|
|
SELECT d.id
|
|
FROM dispensaries d
|
|
WHERE d.crawl_enabled = true
|
|
AND d.platform_dispensary_id IS NOT NULL
|
|
-- No pending/running payload_fetch or product_refresh task already
|
|
AND NOT EXISTS (
|
|
SELECT 1 FROM worker_tasks t
|
|
WHERE t.dispensary_id = d.id
|
|
AND t.role IN ('payload_fetch', 'product_refresh')
|
|
AND t.status IN ('pending', 'claimed', 'running')
|
|
)
|
|
-- Never fetched OR last fetch > interval ago
|
|
AND (
|
|
d.last_fetch_at IS NULL
|
|
OR d.last_fetch_at < NOW() - ($1 || ' hours')::interval
|
|
)
|
|
${schedule.state_code ? 'AND d.state_id = (SELECT id FROM states WHERE code = $2)' : ''}
|
|
`, schedule.state_code ? [schedule.interval_hours, schedule.state_code] : [schedule.interval_hours]);
|
|
|
|
const dispensaryIds = result.rows.map((r: { id: number }) => r.id);
|
|
|
|
if (dispensaryIds.length === 0) {
|
|
return 0;
|
|
}
|
|
|
|
// Per TASK_WORKFLOW_2024-12-10.md: Create payload_fetch tasks (they chain to product_refresh)
|
|
const tasks = dispensaryIds.map((id: number) => ({
|
|
role: 'payload_fetch' as TaskRole,
|
|
dispensary_id: id,
|
|
priority: schedule.priority,
|
|
}));
|
|
|
|
return taskService.createTasks(tasks);
|
|
}
|
|
|
|
/**
|
|
* Generate store_discovery tasks
|
|
* Per TASK_WORKFLOW_2024-12-10.md: One task per platform
|
|
*/
|
|
private async generateStoreDiscoveryTasks(schedule: TaskSchedule): Promise<number> {
|
|
// Check if discovery task already pending
|
|
const existing = await taskService.listTasks({
|
|
role: 'store_discovery',
|
|
status: ['pending', 'claimed', 'running'],
|
|
limit: 1,
|
|
});
|
|
|
|
if (existing.length > 0) {
|
|
console.log('[TaskScheduler] Store discovery task already pending, skipping');
|
|
return 0;
|
|
}
|
|
|
|
await taskService.createTask({
|
|
role: 'store_discovery',
|
|
platform: 'dutchie',
|
|
priority: schedule.priority,
|
|
});
|
|
|
|
return 1;
|
|
}
|
|
|
|
/**
|
|
* Generate analytics_refresh tasks
|
|
* Per TASK_WORKFLOW_2024-12-10.md: Single task to refresh all MVs
|
|
*/
|
|
private async generateAnalyticsRefreshTasks(schedule: TaskSchedule): Promise<number> {
|
|
// Check if analytics task already pending
|
|
const existing = await taskService.listTasks({
|
|
role: 'analytics_refresh',
|
|
status: ['pending', 'claimed', 'running'],
|
|
limit: 1,
|
|
});
|
|
|
|
if (existing.length > 0) {
|
|
console.log('[TaskScheduler] Analytics refresh task already pending, skipping');
|
|
return 0;
|
|
}
|
|
|
|
await taskService.createTask({
|
|
role: 'analytics_refresh',
|
|
priority: schedule.priority,
|
|
});
|
|
|
|
return 1;
|
|
}
|
|
|
|
/**
|
|
* Get all schedules for dashboard display
|
|
*/
|
|
async getSchedules(): Promise<TaskSchedule[]> {
|
|
try {
|
|
const result = await pool.query(`
|
|
SELECT * FROM task_schedules ORDER BY name
|
|
`);
|
|
return result.rows as TaskSchedule[];
|
|
} catch {
|
|
return [];
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Update a schedule
|
|
*/
|
|
async updateSchedule(id: number, updates: Partial<TaskSchedule>): Promise<void> {
|
|
const setClauses: string[] = [];
|
|
const values: any[] = [];
|
|
let paramIndex = 1;
|
|
|
|
if (updates.enabled !== undefined) {
|
|
setClauses.push(`enabled = $${paramIndex++}`);
|
|
values.push(updates.enabled);
|
|
}
|
|
if (updates.interval_hours !== undefined) {
|
|
setClauses.push(`interval_hours = $${paramIndex++}`);
|
|
values.push(updates.interval_hours);
|
|
}
|
|
if (updates.priority !== undefined) {
|
|
setClauses.push(`priority = $${paramIndex++}`);
|
|
values.push(updates.priority);
|
|
}
|
|
|
|
if (setClauses.length === 0) return;
|
|
|
|
setClauses.push('updated_at = NOW()');
|
|
values.push(id);
|
|
|
|
await pool.query(`
|
|
UPDATE task_schedules
|
|
SET ${setClauses.join(', ')}
|
|
WHERE id = $${paramIndex}
|
|
`, values);
|
|
}
|
|
|
|
/**
|
|
* Trigger a schedule to run immediately
|
|
*/
|
|
async triggerSchedule(id: number): Promise<number> {
|
|
const result = await pool.query(`
|
|
SELECT * FROM task_schedules WHERE id = $1
|
|
`, [id]);
|
|
|
|
if (result.rows.length === 0) {
|
|
throw new Error(`Schedule ${id} not found`);
|
|
}
|
|
|
|
return this.executeSchedule(result.rows[0] as TaskSchedule);
|
|
}
|
|
}
|
|
|
|
// Per TASK_WORKFLOW_2024-12-10.md: Singleton instance shared app-wide;
// start() is expected to be called once on API server startup.
export const taskScheduler = new TaskScheduler();