feat(tasks): Refactor task workflow with payload/refresh separation
Major changes:
- Split crawl into payload_fetch (API → disk) and product_refresh (disk → DB)
- Add task chaining: store_discovery → product_discovery → payload_fetch → product_refresh
- Add payload storage utilities for gzipped JSON on filesystem
- Add /api/payloads endpoints for payload access and diffing
- Add DB-driven TaskScheduler with schedule persistence
- Track newDispensaryIds through discovery promotion for chaining
- Add stealth improvements: HTTP fingerprinting, proxy rotation enhancements
- Add Workers dashboard K8s scaling controls

New files:
- src/tasks/handlers/payload-fetch.ts - Fetches from API, saves to disk
- src/services/task-scheduler.ts - DB-driven schedule management
- src/utils/payload-storage.ts - Payload save/load utilities
- src/routes/payloads.ts - Payload API endpoints
- src/services/http-fingerprint.ts - Browser fingerprint generation
- docs/TASK_WORKFLOW_2024-12-10.md - Complete workflow documentation

Migrations:
- 078: Proxy consecutive 403 tracking
- 079: task_schedules table
- 080: raw_crawl_payloads table
- 081: payload column and last_fetch_at

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -543,6 +543,9 @@ router.post('/bulk-priority', async (req: Request, res: Response) => {
|
||||
|
||||
/**
|
||||
* POST /api/job-queue/enqueue - Add a new job to the queue
|
||||
*
|
||||
* 2024-12-10: Rewired to use worker_tasks via taskService.
|
||||
* Legacy dispensary_crawl_jobs code commented out below.
|
||||
*/
|
||||
router.post('/enqueue', async (req: Request, res: Response) => {
|
||||
try {
|
||||
@@ -552,6 +555,59 @@ router.post('/enqueue', async (req: Request, res: Response) => {
|
||||
return res.status(400).json({ success: false, error: 'dispensary_id is required' });
|
||||
}
|
||||
|
||||
// 2024-12-10: Map legacy job_type to new task role
|
||||
const roleMap: Record<string, string> = {
|
||||
'dutchie_product_crawl': 'product_refresh',
|
||||
'menu_detection': 'entry_point_discovery',
|
||||
'menu_detection_single': 'entry_point_discovery',
|
||||
'product_discovery': 'product_discovery',
|
||||
'store_discovery': 'store_discovery',
|
||||
};
|
||||
const role = roleMap[job_type] || 'product_refresh';
|
||||
|
||||
// 2024-12-10: Use taskService to create task in worker_tasks table
|
||||
const { taskService } = await import('../tasks/task-service');
|
||||
|
||||
// Check if task already pending for this dispensary
|
||||
const existingTasks = await taskService.listTasks({
|
||||
dispensary_id,
|
||||
role: role as any,
|
||||
status: ['pending', 'claimed', 'running'],
|
||||
limit: 1,
|
||||
});
|
||||
|
||||
if (existingTasks.length > 0) {
|
||||
return res.json({
|
||||
success: true,
|
||||
task_id: existingTasks[0].id,
|
||||
message: 'Task already queued'
|
||||
});
|
||||
}
|
||||
|
||||
const task = await taskService.createTask({
|
||||
role: role as any,
|
||||
dispensary_id,
|
||||
priority,
|
||||
});
|
||||
|
||||
res.json({ success: true, task_id: task.id, message: 'Task enqueued' });
|
||||
} catch (error: any) {
|
||||
console.error('[JobQueue] Error enqueuing task:', error);
|
||||
res.status(500).json({ success: false, error: error.message });
|
||||
}
|
||||
});
|
||||
|
||||
/*
|
||||
* LEGACY CODE - 2024-12-10: Commented out, was using orphaned dispensary_crawl_jobs table
|
||||
*
|
||||
router.post('/enqueue', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const { dispensary_id, job_type = 'dutchie_product_crawl', priority = 0 } = req.body;
|
||||
|
||||
if (!dispensary_id) {
|
||||
return res.status(400).json({ success: false, error: 'dispensary_id is required' });
|
||||
}
|
||||
|
||||
// Check if job already pending for this dispensary
|
||||
const existing = await pool.query(`
|
||||
SELECT id FROM dispensary_crawl_jobs
|
||||
@@ -585,6 +641,7 @@ router.post('/enqueue', async (req: Request, res: Response) => {
|
||||
res.status(500).json({ success: false, error: error.message });
|
||||
}
|
||||
});
|
||||
*/
|
||||
|
||||
/**
|
||||
* POST /api/job-queue/pause - Pause queue processing
|
||||
@@ -612,6 +669,8 @@ router.get('/paused', async (_req: Request, res: Response) => {
|
||||
/**
|
||||
* POST /api/job-queue/enqueue-batch - Queue multiple dispensaries at once
|
||||
* Body: { dispensary_ids: number[], job_type?: string, priority?: number }
|
||||
*
|
||||
* 2024-12-10: Rewired to use worker_tasks via taskService.
|
||||
*/
|
||||
router.post('/enqueue-batch', async (req: Request, res: Response) => {
|
||||
try {
|
||||
@@ -625,35 +684,30 @@ router.post('/enqueue-batch', async (req: Request, res: Response) => {
|
||||
return res.status(400).json({ success: false, error: 'Maximum 500 dispensaries per batch' });
|
||||
}
|
||||
|
||||
// Insert jobs, skipping duplicates
|
||||
const { rows } = await pool.query(`
|
||||
INSERT INTO dispensary_crawl_jobs (dispensary_id, job_type, priority, trigger_type, status, created_at)
|
||||
SELECT
|
||||
d.id,
|
||||
$2::text,
|
||||
$3::integer,
|
||||
'api_batch',
|
||||
'pending',
|
||||
NOW()
|
||||
FROM dispensaries d
|
||||
WHERE d.id = ANY($1::int[])
|
||||
AND d.crawl_enabled = true
|
||||
AND d.platform_dispensary_id IS NOT NULL
|
||||
AND NOT EXISTS (
|
||||
SELECT 1 FROM dispensary_crawl_jobs cj
|
||||
WHERE cj.dispensary_id = d.id
|
||||
AND cj.job_type = $2::text
|
||||
AND cj.status IN ('pending', 'running')
|
||||
)
|
||||
RETURNING id, dispensary_id
|
||||
`, [dispensary_ids, job_type, priority]);
|
||||
// 2024-12-10: Map legacy job_type to new task role
|
||||
const roleMap: Record<string, string> = {
|
||||
'dutchie_product_crawl': 'product_refresh',
|
||||
'menu_detection': 'entry_point_discovery',
|
||||
'product_discovery': 'product_discovery',
|
||||
};
|
||||
const role = roleMap[job_type] || 'product_refresh';
|
||||
|
||||
// 2024-12-10: Use taskService to create tasks in worker_tasks table
|
||||
const { taskService } = await import('../tasks/task-service');
|
||||
|
||||
const tasks = dispensary_ids.map(dispensary_id => ({
|
||||
role: role as any,
|
||||
dispensary_id,
|
||||
priority,
|
||||
}));
|
||||
|
||||
const createdCount = await taskService.createTasks(tasks);
|
||||
|
||||
res.json({
|
||||
success: true,
|
||||
queued: rows.length,
|
||||
queued: createdCount,
|
||||
requested: dispensary_ids.length,
|
||||
job_ids: rows.map(r => r.id),
|
||||
message: `Queued ${rows.length} of ${dispensary_ids.length} dispensaries`
|
||||
message: `Queued ${createdCount} of ${dispensary_ids.length} dispensaries`
|
||||
});
|
||||
} catch (error: any) {
|
||||
console.error('[JobQueue] Error batch enqueuing:', error);
|
||||
@@ -664,6 +718,8 @@ router.post('/enqueue-batch', async (req: Request, res: Response) => {
|
||||
/**
|
||||
* POST /api/job-queue/enqueue-state - Queue all crawl-enabled dispensaries for a state
|
||||
* Body: { state_code: string, job_type?: string, priority?: number, limit?: number }
|
||||
*
|
||||
* 2024-12-10: Rewired to use worker_tasks via taskService.
|
||||
*/
|
||||
router.post('/enqueue-state', async (req: Request, res: Response) => {
|
||||
try {
|
||||
@@ -673,52 +729,55 @@ router.post('/enqueue-state', async (req: Request, res: Response) => {
|
||||
return res.status(400).json({ success: false, error: 'state_code is required (e.g., "AZ")' });
|
||||
}
|
||||
|
||||
// Get state_id and queue jobs
|
||||
const { rows } = await pool.query(`
|
||||
WITH target_state AS (
|
||||
SELECT id FROM states WHERE code = $1
|
||||
)
|
||||
INSERT INTO dispensary_crawl_jobs (dispensary_id, job_type, priority, trigger_type, status, created_at)
|
||||
SELECT
|
||||
d.id,
|
||||
$2::text,
|
||||
$3::integer,
|
||||
'api_state',
|
||||
'pending',
|
||||
NOW()
|
||||
FROM dispensaries d, target_state
|
||||
WHERE d.state_id = target_state.id
|
||||
// 2024-12-10: Map legacy job_type to new task role
|
||||
const roleMap: Record<string, string> = {
|
||||
'dutchie_product_crawl': 'product_refresh',
|
||||
'menu_detection': 'entry_point_discovery',
|
||||
'product_discovery': 'product_discovery',
|
||||
};
|
||||
const role = roleMap[job_type] || 'product_refresh';
|
||||
|
||||
// Get dispensary IDs for the state
|
||||
const dispensaryResult = await pool.query(`
|
||||
SELECT d.id
|
||||
FROM dispensaries d
|
||||
JOIN states s ON s.id = d.state_id
|
||||
WHERE s.code = $1
|
||||
AND d.crawl_enabled = true
|
||||
AND d.platform_dispensary_id IS NOT NULL
|
||||
AND NOT EXISTS (
|
||||
SELECT 1 FROM dispensary_crawl_jobs cj
|
||||
WHERE cj.dispensary_id = d.id
|
||||
AND cj.job_type = $2::text
|
||||
AND cj.status IN ('pending', 'running')
|
||||
)
|
||||
LIMIT $4::integer
|
||||
RETURNING id, dispensary_id
|
||||
`, [state_code.toUpperCase(), job_type, priority, limit]);
|
||||
LIMIT $2
|
||||
`, [state_code.toUpperCase(), limit]);
|
||||
|
||||
const dispensary_ids = dispensaryResult.rows.map((r: any) => r.id);
|
||||
|
||||
// 2024-12-10: Use taskService to create tasks in worker_tasks table
|
||||
const { taskService } = await import('../tasks/task-service');
|
||||
|
||||
const tasks = dispensary_ids.map((dispensary_id: number) => ({
|
||||
role: role as any,
|
||||
dispensary_id,
|
||||
priority,
|
||||
}));
|
||||
|
||||
const createdCount = await taskService.createTasks(tasks);
|
||||
|
||||
// Get total available count
|
||||
const countResult = await pool.query(`
|
||||
WITH target_state AS (
|
||||
SELECT id FROM states WHERE code = $1
|
||||
)
|
||||
SELECT COUNT(*) as total
|
||||
FROM dispensaries d, target_state
|
||||
WHERE d.state_id = target_state.id
|
||||
FROM dispensaries d
|
||||
JOIN states s ON s.id = d.state_id
|
||||
WHERE s.code = $1
|
||||
AND d.crawl_enabled = true
|
||||
AND d.platform_dispensary_id IS NOT NULL
|
||||
`, [state_code.toUpperCase()]);
|
||||
|
||||
res.json({
|
||||
success: true,
|
||||
queued: rows.length,
|
||||
queued: createdCount,
|
||||
total_available: parseInt(countResult.rows[0].total),
|
||||
state: state_code.toUpperCase(),
|
||||
job_type,
|
||||
message: `Queued ${rows.length} dispensaries for ${state_code.toUpperCase()}`
|
||||
role,
|
||||
message: `Queued ${createdCount} dispensaries for ${state_code.toUpperCase()}`
|
||||
});
|
||||
} catch (error: any) {
|
||||
console.error('[JobQueue] Error enqueuing state:', error);
|
||||
|
||||
Reference in New Issue
Block a user