Merge pull request 'feat(tasks): Consolidate schedule management into task_schedules' (#57) from feat/task-schedules-consolidation into master

Reviewed-on: https://code.cannabrands.app/Creationshop/dispensary-scraper/pulls/57
This commit is contained in:
kelly
2025-12-12 08:31:29 +00:00
6 changed files with 1124 additions and 22 deletions

View File

@@ -3,6 +3,24 @@
*
* Endpoints for managing worker tasks, viewing capacity metrics,
* and generating batch tasks.
*
* SCHEDULE MANAGEMENT (added 2025-12-12):
* This file now contains the canonical schedule management endpoints.
* The job_schedules table has been deprecated and all schedule management
* is now consolidated into task_schedules:
*
* Schedule endpoints:
* GET /api/tasks/schedules - List all schedules
* POST /api/tasks/schedules - Create new schedule
* GET /api/tasks/schedules/:id - Get schedule by ID
* PUT /api/tasks/schedules/:id - Update schedule
* DELETE /api/tasks/schedules/:id - Delete schedule
* DELETE /api/tasks/schedules - Bulk delete schedules
* POST /api/tasks/schedules/:id/run-now - Trigger schedule immediately
* POST /api/tasks/schedules/:id/toggle - Toggle schedule enabled/disabled
*
* Note: Schedule routes are defined BEFORE /:id to avoid route conflicts
* (Express matches routes in order, and "schedules" would match /:id otherwise)
*/
import { Router, Request, Response } from 'express';
@@ -131,6 +149,366 @@ router.get('/capacity/:role', async (req: Request, res: Response) => {
}
});
// ============================================================
// SCHEDULE MANAGEMENT ROUTES
// (Must be before /:id to avoid route conflicts)
// ============================================================
/**
 * GET /api/tasks/schedules
 *
 * Lists rows from task_schedules ordered by name.
 *
 * Query:
 * - enabled: when exactly the string "true", only enabled schedules are
 *   returned; any other value (or none) returns every schedule.
 *
 * Responds with `{ schedules: [...] }`, or 500 `{ error }` if the query fails.
 */
router.get('/schedules', async (req: Request, res: Response) => {
  try {
    // Only the literal "true" switches the filter on.
    const filterClause = req.query.enabled === 'true' ? ` WHERE enabled = true` : '';

    const selectSql = [
      '',
      'SELECT id, name, role, description, enabled, interval_hours,',
      'priority, state_code, platform, last_run_at, next_run_at,',
      'last_task_count, last_error, created_at, updated_at',
      'FROM task_schedules',
      '',
    ].join('\n');

    const { rows } = await pool.query(selectSql + filterClause + ` ORDER BY name`);
    res.json({ schedules: rows });
  } catch (error: unknown) {
    console.error('Error listing schedules:', error);
    res.status(500).json({ error: 'Failed to list schedules' });
  }
});
/**
 * DELETE /api/tasks/schedules
 * Bulk delete schedules
 *
 * Body:
 * - ids: number[] (required unless all=true) - schedule IDs to delete;
 *   integer-looking strings are accepted and converted
 * - all: boolean (optional) - if exactly true, delete all schedules (ids ignored)
 *
 * Responses:
 * - 200 { success, deleted_count, deleted, message }
 * - 400 when neither a usable ids array nor all=true is supplied, or when
 *   ids contains a value that is not an integer
 * - 500 on query failure
 */
router.delete('/schedules', async (req: Request, res: Response) => {
  try {
    // DELETE requests frequently arrive without a parsed body; treat that as
    // an empty object so the caller gets a 400 rather than a 500.
    const { ids, all } = req.body ?? {};

    let result;
    if (all === true) {
      // Delete all schedules
      result = await pool.query(`
        DELETE FROM task_schedules RETURNING id, name
      `);
    } else if (Array.isArray(ids) && ids.length > 0) {
      // Normalise every entry to an integer up front so malformed input is
      // rejected here instead of surfacing as a database error.
      const scheduleIds = ids.map((value: unknown): number => {
        if (typeof value === 'number') {
          return value;
        }
        if (typeof value === 'string' && /^-?\d+$/.test(value.trim())) {
          return Number(value);
        }
        return NaN;
      });

      if (!scheduleIds.every((value) => Number.isInteger(value))) {
        return res.status(400).json({
          error: 'ids must be an array of integer schedule IDs',
        });
      }

      // Delete specific schedules by IDs
      result = await pool.query(`
        DELETE FROM task_schedules WHERE id = ANY($1) RETURNING id, name
      `, [scheduleIds]);
    } else {
      return res.status(400).json({
        error: 'Either provide ids array or set all=true',
      });
    }

    res.json({
      success: true,
      deleted_count: result.rowCount,
      deleted: result.rows,
      message: `Deleted ${result.rowCount} schedule(s)`,
    });
  } catch (error: unknown) {
    console.error('Error bulk deleting schedules:', error);
    res.status(500).json({ error: 'Failed to delete schedules' });
  }
});
/**
 * POST /api/tasks/schedules
 * Create a new schedule
 *
 * Body:
 * - name: string (required, unique)
 * - role: TaskRole (required)
 * - description: string (optional)
 * - enabled: boolean (default true)
 * - interval_hours: number (required, must be a positive finite number;
 *   numeric strings are accepted)
 * - priority: number (default 0)
 * - state_code: string (optional)
 * - platform: string (optional)
 *
 * Responses:
 * - 201 with the created row
 * - 400 when a required field is missing or interval_hours is not a
 *   positive number
 * - 409 when the insert hits a unique-constraint violation (code 23505)
 * - 500 on any other failure
 */
router.post('/schedules', async (req: Request, res: Response) => {
  try {
    const {
      name,
      role,
      description,
      enabled = true,
      interval_hours,
      priority = 0,
      state_code,
      platform,
    } = req.body ?? {};

    if (!name || !role || !interval_hours) {
      return res.status(400).json({
        error: 'name, role, and interval_hours are required',
      });
    }

    // Coerce numeric strings, then insist on a usable interval. Without this
    // a value such as "abc" or -1 produces an Invalid Date / a next_run_at in
    // the past, or a database error reported as a 500.
    const intervalHours: unknown =
      typeof interval_hours === 'string'
        ? Number(interval_hours.trim() || NaN)
        : interval_hours;
    if (
      typeof intervalHours !== 'number' ||
      !Number.isFinite(intervalHours) ||
      intervalHours <= 0
    ) {
      return res.status(400).json({
        error: 'interval_hours must be a positive number',
      });
    }

    // Calculate next_run_at based on interval
    const nextRunAt = new Date(Date.now() + intervalHours * 60 * 60 * 1000);

    const result = await pool.query(`
      INSERT INTO task_schedules
      (name, role, description, enabled, interval_hours, priority, state_code, platform, next_run_at)
      VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
      RETURNING id, name, role, description, enabled, interval_hours,
      priority, state_code, platform, last_run_at, next_run_at,
      last_task_count, last_error, created_at, updated_at
    `, [name, role, description, enabled, intervalHours, priority, state_code, platform, nextRunAt]);

    res.status(201).json(result.rows[0]);
  } catch (error: unknown) {
    const errorCode = (error as { code?: unknown } | null | undefined)?.code;
    if (errorCode === '23505') {
      // Unique constraint violation
      return res.status(409).json({ error: 'A schedule with this name already exists' });
    }
    console.error('Error creating schedule:', error);
    res.status(500).json({ error: 'Failed to create schedule' });
  }
});
/**
 * GET /api/tasks/schedules/:id
 * Get a specific schedule by ID
 *
 * Responses:
 * - 200 with the schedule row
 * - 400 when :id does not parse to an integer
 * - 404 when no schedule has that ID
 * - 500 on query failure
 */
router.get('/schedules/:id', async (req: Request, res: Response) => {
  try {
    const scheduleId = parseInt(req.params.id, 10);
    // parseInt yields NaN for non-numeric input; reject it here instead of
    // letting the database raise an error that would be reported as a 500.
    if (!Number.isInteger(scheduleId)) {
      return res.status(400).json({ error: 'Invalid schedule ID' });
    }

    const result = await pool.query(`
      SELECT id, name, role, description, enabled, interval_hours,
      priority, state_code, platform, last_run_at, next_run_at,
      last_task_count, last_error, created_at, updated_at
      FROM task_schedules
      WHERE id = $1
    `, [scheduleId]);

    if (result.rows.length === 0) {
      return res.status(404).json({ error: 'Schedule not found' });
    }

    res.json(result.rows[0]);
  } catch (error: unknown) {
    console.error('Error getting schedule:', error);
    res.status(500).json({ error: 'Failed to get schedule' });
  }
});
/**
 * PUT /api/tasks/schedules/:id
 * Update an existing schedule
 *
 * Body (all optional; only fields that are present are updated):
 * - name, role, description, enabled, priority
 * - interval_hours: positive finite number (numeric strings accepted);
 *   changing it also resets next_run_at to now + interval
 * - state_code, platform: falsy values (e.g. "") are stored as NULL
 *
 * Responses:
 * - 200 with the updated row
 * - 400 when :id is not an integer, interval_hours is invalid, or the body
 *   contains no updatable field
 * - 404 when no schedule has that ID
 * - 409 on a unique-constraint violation (code 23505)
 * - 500 on any other failure
 */
router.put('/schedules/:id', async (req: Request, res: Response) => {
  try {
    const scheduleId = parseInt(req.params.id, 10);
    // Reject NaN up front; otherwise the database error becomes a 500.
    if (!Number.isInteger(scheduleId)) {
      return res.status(400).json({ error: 'Invalid schedule ID' });
    }

    const {
      name,
      role,
      description,
      enabled,
      interval_hours,
      priority,
      state_code,
      platform,
    } = req.body ?? {};

    // Build dynamic update query. Column names are fixed literals below;
    // only values travel as bind parameters.
    const updates: string[] = [];
    const values: unknown[] = [];
    const setColumn = (column: string, value: unknown): void => {
      values.push(value);
      // Placeholder number is simply the position of the value just pushed.
      updates.push(`${column} = $${values.length}`);
    };

    if (name !== undefined) {
      setColumn('name', name);
    }
    if (role !== undefined) {
      setColumn('role', role);
    }
    if (description !== undefined) {
      setColumn('description', description);
    }
    if (enabled !== undefined) {
      setColumn('enabled', enabled);
    }
    if (interval_hours !== undefined) {
      // Coerce numeric strings, then require a usable interval so that
      // next_run_at is never an Invalid Date or a moment in the past.
      const intervalHours: unknown =
        typeof interval_hours === 'string'
          ? Number(interval_hours.trim() || NaN)
          : interval_hours;
      if (
        typeof intervalHours !== 'number' ||
        !Number.isFinite(intervalHours) ||
        intervalHours <= 0
      ) {
        return res.status(400).json({
          error: 'interval_hours must be a positive number',
        });
      }

      setColumn('interval_hours', intervalHours);
      // Recalculate next_run_at if interval changed
      setColumn('next_run_at', new Date(Date.now() + intervalHours * 60 * 60 * 1000));
    }
    if (priority !== undefined) {
      setColumn('priority', priority);
    }
    if (state_code !== undefined) {
      setColumn('state_code', state_code || null);
    }
    if (platform !== undefined) {
      setColumn('platform', platform || null);
    }

    if (updates.length === 0) {
      return res.status(400).json({ error: 'No fields to update' });
    }

    updates.push('updated_at = NOW()');
    // The schedule ID is always the last bind parameter.
    values.push(scheduleId);

    const result = await pool.query(`
      UPDATE task_schedules
      SET ${updates.join(', ')}
      WHERE id = $${values.length}
      RETURNING id, name, role, description, enabled, interval_hours,
      priority, state_code, platform, last_run_at, next_run_at,
      last_task_count, last_error, created_at, updated_at
    `, values);

    if (result.rows.length === 0) {
      return res.status(404).json({ error: 'Schedule not found' });
    }

    res.json(result.rows[0]);
  } catch (error: unknown) {
    const errorCode = (error as { code?: unknown } | null | undefined)?.code;
    if (errorCode === '23505') {
      return res.status(409).json({ error: 'A schedule with this name already exists' });
    }
    console.error('Error updating schedule:', error);
    res.status(500).json({ error: 'Failed to update schedule' });
  }
});
/**
 * DELETE /api/tasks/schedules/:id
 * Delete a schedule
 *
 * Responses:
 * - 200 { success, message } naming the deleted schedule
 * - 400 when :id does not parse to an integer
 * - 404 when no schedule has that ID
 * - 500 on query failure
 */
router.delete('/schedules/:id', async (req: Request, res: Response) => {
  try {
    const scheduleId = parseInt(req.params.id, 10);
    // parseInt yields NaN for non-numeric input; answer 400 instead of
    // forwarding it to the database and reporting the failure as a 500.
    if (!Number.isInteger(scheduleId)) {
      return res.status(400).json({ error: 'Invalid schedule ID' });
    }

    const result = await pool.query(`
      DELETE FROM task_schedules WHERE id = $1 RETURNING id, name
    `, [scheduleId]);

    if (result.rows.length === 0) {
      return res.status(404).json({ error: 'Schedule not found' });
    }

    res.json({
      success: true,
      message: `Schedule "${result.rows[0].name}" deleted`,
    });
  } catch (error: unknown) {
    console.error('Error deleting schedule:', error);
    res.status(500).json({ error: 'Failed to delete schedule' });
  }
});
/**
 * POST /api/tasks/schedules/:id/run-now
 * Manually trigger a scheduled task to run immediately
 *
 * Creates one task from the schedule's role/platform with its priority
 * raised by 10, then stamps last_run_at = NOW() and pushes next_run_at out
 * by the schedule's interval_hours.
 *
 * Responses:
 * - 200 { success, message, task }
 * - 400 when :id does not parse to an integer
 * - 404 when no schedule has that ID
 * - 500 on any failure (note: the task may already have been created if the
 *   follow-up UPDATE is what failed)
 */
router.post('/schedules/:id/run-now', async (req: Request, res: Response) => {
  try {
    const scheduleId = parseInt(req.params.id, 10);
    // Reject NaN here rather than letting the database error become a 500.
    if (!Number.isInteger(scheduleId)) {
      return res.status(400).json({ error: 'Invalid schedule ID' });
    }

    // Get the schedule
    const scheduleResult = await pool.query(`
      SELECT id, name, role, state_code, platform, priority
      FROM task_schedules WHERE id = $1
    `, [scheduleId]);

    if (scheduleResult.rows.length === 0) {
      return res.status(404).json({ error: 'Schedule not found' });
    }

    const schedule = scheduleResult.rows[0];

    // Create a task based on the schedule
    // NOTE(review): state_code is selected above but not forwarded to
    // createTask - confirm whether manual runs are meant to ignore the
    // schedule's state filter.
    const task = await taskService.createTask({
      role: schedule.role,
      platform: schedule.platform,
      priority: schedule.priority + 10, // Boost priority for manual runs
    });

    // Update last_run_at on the schedule
    await pool.query(`
      UPDATE task_schedules
      SET last_run_at = NOW(),
      next_run_at = NOW() + (interval_hours || ' hours')::interval,
      updated_at = NOW()
      WHERE id = $1
    `, [scheduleId]);

    res.json({
      success: true,
      message: `Schedule "${schedule.name}" triggered`,
      task,
    });
  } catch (error: unknown) {
    console.error('Error running schedule:', error);
    res.status(500).json({ error: 'Failed to run schedule' });
  }
});
/**
 * POST /api/tasks/schedules/:id/toggle
 * Toggle a schedule's enabled status
 *
 * The flip is done in a single UPDATE (enabled = NOT enabled), so the
 * response reflects the value after the toggle.
 *
 * Responses:
 * - 200 { success, schedule: { id, name, enabled }, message }
 * - 400 when :id does not parse to an integer
 * - 404 when no schedule has that ID
 * - 500 on query failure
 */
router.post('/schedules/:id/toggle', async (req: Request, res: Response) => {
  try {
    const scheduleId = parseInt(req.params.id, 10);
    // Reject NaN here rather than letting the database error become a 500.
    if (!Number.isInteger(scheduleId)) {
      return res.status(400).json({ error: 'Invalid schedule ID' });
    }

    const result = await pool.query(`
      UPDATE task_schedules
      SET enabled = NOT enabled,
      updated_at = NOW()
      WHERE id = $1
      RETURNING id, name, enabled
    `, [scheduleId]);

    if (result.rows.length === 0) {
      return res.status(404).json({ error: 'Schedule not found' });
    }

    const toggled = result.rows[0];
    res.json({
      success: true,
      schedule: toggled,
      message: toggled.enabled
        ? `Schedule "${toggled.name}" enabled`
        : `Schedule "${toggled.name}" disabled`,
    });
  } catch (error: unknown) {
    console.error('Error toggling schedule:', error);
    res.status(500).json({ error: 'Failed to toggle schedule' });
  }
});
// ============================================================
// TASK-SPECIFIC ROUTES (with :id parameter)
// ============================================================
/**
* GET /api/tasks/:id
* Get a specific task by ID

View File

@@ -4,10 +4,25 @@
* Provider-agnostic worker management and job monitoring.
* Replaces legacy /api/dutchie-az/admin/schedules and /api/dutchie-az/monitor/* routes.
*
* DEPRECATION NOTE (2025-12-12):
* This file still queries job_schedules for backwards compatibility with
* the /api/workers endpoints that display worker status. However, the
* job_schedules table is DEPRECATED - all entries have been disabled.
*
* Schedule management has been consolidated into task_schedules:
* - Use /api/tasks/schedules for schedule CRUD operations
* - Use TasksDashboard.tsx (/admin/tasks) for schedule management UI
* - task_schedules uses interval_hours (simpler than base_interval_minutes + jitter)
*
* The /api/workers endpoints remain useful for:
* - Monitoring active workers and job status
* - K8s scaling controls
* - Job history and logs
*
* Endpoints:
* GET /api/workers - List all workers/schedules
* GET /api/workers/active - List currently active workers
* GET /api/workers/schedule - Get all job schedules
* GET /api/workers/schedule - Get all job schedules (DEPRECATED - use /api/tasks/schedules)
* GET /api/workers/:workerName - Get specific worker details
* GET /api/workers/:workerName/scope - Get worker's scope (states, etc.)
* GET /api/workers/:workerName/stats - Get worker statistics

View File

@@ -11,10 +11,17 @@
* - Workers report heartbeats to worker_registry
* - Workers are ROLE-AGNOSTIC by default (can handle any task type)
*
* Stealth & Anti-Detection:
* PROXIES ARE REQUIRED - workers will fail to start if no proxies available.
* Stealth & Anti-Detection (LAZY INITIALIZATION):
* Workers start IMMEDIATELY without waiting for proxies.
* Stealth systems (proxies, fingerprints, preflights) are initialized
* on first task claim, not at worker startup.
*
* On startup, workers initialize the CrawlRotator which provides:
* This allows workers to:
* - Register and send heartbeats immediately
* - Wait in main loop without blocking on proxy availability
* - Initialize proxies/preflights only when tasks are actually available
*
* On first task claim attempt, workers initialize the CrawlRotator which provides:
* - Proxy rotation: Loads proxies from `proxies` table, ALL requests use proxy
* - User-Agent rotation: Cycles through realistic browser fingerprints
* - Fingerprint rotation: Changes browser profile on blocks
@@ -34,11 +41,16 @@
*
* Environment:
* WORKER_ROLE - Which task role to process (optional, null = any task)
* WORKER_ID - Optional custom worker ID (auto-generated if not provided)
* POD_NAME - Kubernetes pod name (optional)
* POD_NAME - K8s StatefulSet pod name (PRIMARY - use this for persistent identity)
* WORKER_ID - Custom worker ID (fallback if POD_NAME not set)
* POLL_INTERVAL_MS - How often to check for tasks (default: 5000)
* HEARTBEAT_INTERVAL_MS - How often to update heartbeat (default: 30000)
* API_BASE_URL - Backend API URL for registration (default: http://localhost:3010)
*
* Worker Identity:
* Workers use POD_NAME as their worker_id for persistent identity across restarts.
* In K8s StatefulSet, POD_NAME = "scraper-worker-0" through "scraper-worker-7".
* This ensures workers re-register with the same ID instead of creating new entries.
*/
import { Pool } from 'pg';
@@ -209,6 +221,16 @@ export class TaskWorker {
private preflightCurlResult: CurlPreflightResult | null = null;
private preflightHttpResult: PuppeteerPreflightResult | null = null;
// ==========================================================================
// LAZY INITIALIZATION FLAGS
// ==========================================================================
// Stealth/proxy initialization is deferred until first task claim.
// Workers register immediately and enter main loop without blocking.
// ==========================================================================
private stealthInitialized: boolean = false;
private preflightsCompleted: boolean = false;
private initializingPromise: Promise<void> | null = null;
constructor(role: TaskRole | null = null, workerId?: string) {
this.pool = getPool();
this.role = role;
@@ -293,9 +315,9 @@ export class TaskWorker {
/**
* Initialize stealth systems (proxy rotation, fingerprints)
* Called once on worker startup before processing any tasks.
* Called LAZILY on first task claim attempt (NOT at worker startup).
*
* IMPORTANT: Proxies are REQUIRED. Workers will wait until proxies are available.
* IMPORTANT: Proxies are REQUIRED to claim tasks. This method waits until proxies are available.
* Workers listen for PostgreSQL NOTIFY 'proxy_added' to wake up immediately when proxies are added.
*/
private async initializeStealth(): Promise<void> {
@@ -482,6 +504,51 @@ export class TaskWorker {
}
}
/**
 * Lazy initialization of stealth systems.
 * Called BEFORE claiming first task (not at worker startup).
 * This allows workers to register and enter main loop immediately.
 *
 * Runs initializeStealth() followed by runDualPreflights(). If either step
 * throws, both readiness flags are reset so a later call starts over.
 * Concurrent callers share the single in-flight attempt instead of starting
 * a second one.
 *
 * @returns true if both stealth init and preflights have completed,
 *          false otherwise.
 */
private async ensureStealthInitialized(): Promise<boolean> {
  // Already initialized
  if (this.stealthInitialized && this.preflightsCompleted) {
    return true;
  }

  // Already initializing (prevent concurrent init attempts)
  if (this.initializingPromise) {
    await this.initializingPromise;
    return this.stealthInitialized && this.preflightsCompleted;
  }

  console.log(`[TaskWorker] ${this.friendlyName} lazy-initializing stealth systems (first task claim)...`);

  const attempt = (async (): Promise<void> => {
    try {
      // Initialize proxy/fingerprint rotation
      await this.initializeStealth();
      this.stealthInitialized = true;

      // Run dual-transport preflights
      await this.runDualPreflights();
      this.preflightsCompleted = true;

      const preflightMsg = `curl=${this.preflightCurlPassed ? '✓' : '✗'} http=${this.preflightHttpPassed ? '✓' : '✗'}`;
      console.log(`[TaskWorker] ${this.friendlyName} stealth ready (${preflightMsg})`);
    } catch (err: unknown) {
      // Narrow before reading .message: a non-Error rejection (e.g. null)
      // must not throw from inside this handler.
      const reason = err instanceof Error ? err.message : String(err);
      console.error(`[TaskWorker] ${this.friendlyName} stealth init failed: ${reason}`);
      this.stealthInitialized = false;
      this.preflightsCompleted = false;
    }
  })();
  this.initializingPromise = attempt;

  try {
    await attempt;
  } finally {
    // Always drop the cached promise, even on an unexpected rejection, so a
    // failed attempt can never be re-awaited forever by later calls.
    this.initializingPromise = null;
  }

  return this.stealthInitialized && this.preflightsCompleted;
}
/**
* Register worker with the registry (get friendly name)
*/
@@ -615,25 +682,22 @@ export class TaskWorker {
/**
* Start the worker loop
*
* Workers start IMMEDIATELY without blocking on proxy/preflight init.
* Stealth systems are lazy-initialized on first task claim.
* This allows workers to register and send heartbeats even when proxies aren't ready.
*/
async start(): Promise<void> {
this.isRunning = true;
// Initialize stealth systems (proxy rotation, fingerprints)
await this.initializeStealth();
// Register with the API to get a friendly name
// Register with the API to get a friendly name (awaited, but does not wait on proxy/preflight init)
await this.register();
// Run dual-transport preflights
await this.runDualPreflights();
// Start registry heartbeat
// Start registry heartbeat immediately
this.startRegistryHeartbeat();
const roleMsg = this.role ? `for role: ${this.role}` : '(role-agnostic - any task)';
const preflightMsg = `curl=${this.preflightCurlPassed ? '✓' : '✗'} http=${this.preflightHttpPassed ? '✓' : '✗'}`;
console.log(`[TaskWorker] ${this.friendlyName} starting ${roleMsg} (${preflightMsg}, max ${this.maxConcurrentTasks} concurrent tasks)`);
console.log(`[TaskWorker] ${this.friendlyName} starting ${roleMsg} (stealth=lazy, max ${this.maxConcurrentTasks} concurrent tasks)`);
while (this.isRunning) {
try {
@@ -687,6 +751,20 @@ export class TaskWorker {
// Try to claim more tasks if we have capacity
if (this.canAcceptMoreTasks()) {
// =================================================================
// LAZY INITIALIZATION - Initialize stealth on first task claim
// Workers start immediately and init proxies only when needed
// =================================================================
if (!this.stealthInitialized) {
const initSuccess = await this.ensureStealthInitialized();
if (!initSuccess) {
// Init failed - wait and retry next loop
console.log(`[TaskWorker] ${this.friendlyName} stealth init failed, waiting before retry...`);
await this.sleep(30000);
return;
}
}
// Pass preflight capabilities to only claim compatible tasks
const task = await taskService.claimTask(
this.role,
@@ -922,7 +1000,10 @@ async function main(): Promise<void> {
process.exit(1);
}
const workerId = process.env.WORKER_ID;
// Use POD_NAME for persistent identity in K8s StatefulSet
// This ensures workers keep the same ID across restarts
// Falls back to WORKER_ID, then generates UUID if neither is set
const workerId = process.env.POD_NAME || process.env.WORKER_ID;
// Pass null for role-agnostic, or the specific role
const worker = new TaskWorker(role || null, workerId);