feat(tasks): Refactor task workflow with payload/refresh separation

Major changes:
- Split crawl into payload_fetch (API → disk) and product_refresh (disk → DB)
- Add task chaining: store_discovery → product_discovery → payload_fetch → product_refresh
- Add payload storage utilities for gzipped JSON on filesystem
- Add /api/payloads endpoints for payload access and diffing
- Add DB-driven TaskScheduler with schedule persistence
- Track newDispensaryIds through discovery promotion for chaining
- Add stealth improvements: HTTP fingerprinting, proxy rotation enhancements
- Add Workers dashboard K8s scaling controls

New files:
- src/tasks/handlers/payload-fetch.ts - Fetches from API, saves to disk
- src/services/task-scheduler.ts - DB-driven schedule management
- src/utils/payload-storage.ts - Payload save/load utilities
- src/routes/payloads.ts - Payload API endpoints
- src/services/http-fingerprint.ts - Browser fingerprint generation
- docs/TASK_WORKFLOW_2024-12-10.md - Complete workflow documentation

Migrations:
- 078: Proxy consecutive 403 tracking
- 079: task_schedules table
- 080: raw_crawl_payloads table
- 081: payload column and last_fetch_at

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-10 22:15:35 -07:00
parent 1fb0eb94c2
commit 4949b22457
33 changed files with 4064 additions and 737 deletions

View File

@@ -543,6 +543,9 @@ router.post('/bulk-priority', async (req: Request, res: Response) => {
/**
* POST /api/job-queue/enqueue - Add a new job to the queue
*
* 2024-12-10: Rewired to use worker_tasks via taskService.
* Legacy dispensary_crawl_jobs code commented out below.
*/
router.post('/enqueue', async (req: Request, res: Response) => {
try {
@@ -552,6 +555,59 @@ router.post('/enqueue', async (req: Request, res: Response) => {
return res.status(400).json({ success: false, error: 'dispensary_id is required' });
}
// 2024-12-10: Map legacy job_type to new task role
const roleMap: Record<string, string> = {
'dutchie_product_crawl': 'product_refresh',
'menu_detection': 'entry_point_discovery',
'menu_detection_single': 'entry_point_discovery',
'product_discovery': 'product_discovery',
'store_discovery': 'store_discovery',
};
const role = roleMap[job_type] || 'product_refresh';
// 2024-12-10: Use taskService to create task in worker_tasks table
const { taskService } = await import('../tasks/task-service');
// Check if a task with this role is already pending, claimed, or running for this dispensary
const existingTasks = await taskService.listTasks({
dispensary_id,
role: role as any,
status: ['pending', 'claimed', 'running'],
limit: 1,
});
if (existingTasks.length > 0) {
return res.json({
success: true,
task_id: existingTasks[0].id,
message: 'Task already queued'
});
}
const task = await taskService.createTask({
role: role as any,
dispensary_id,
priority,
});
res.json({ success: true, task_id: task.id, message: 'Task enqueued' });
} catch (error: any) {
console.error('[JobQueue] Error enqueuing task:', error);
res.status(500).json({ success: false, error: error.message });
}
});
/*
* LEGACY CODE - 2024-12-10: Commented out, was using orphaned dispensary_crawl_jobs table
*
router.post('/enqueue', async (req: Request, res: Response) => {
try {
const { dispensary_id, job_type = 'dutchie_product_crawl', priority = 0 } = req.body;
if (!dispensary_id) {
return res.status(400).json({ success: false, error: 'dispensary_id is required' });
}
// Check if job already pending for this dispensary
const existing = await pool.query(`
SELECT id FROM dispensary_crawl_jobs
@@ -585,6 +641,7 @@ router.post('/enqueue', async (req: Request, res: Response) => {
res.status(500).json({ success: false, error: error.message });
}
});
*/
/**
* POST /api/job-queue/pause - Pause queue processing
@@ -612,6 +669,8 @@ router.get('/paused', async (_req: Request, res: Response) => {
/**
* POST /api/job-queue/enqueue-batch - Queue multiple dispensaries at once
* Body: { dispensary_ids: number[], job_type?: string, priority?: number }
*
* 2024-12-10: Rewired to use worker_tasks via taskService.
*/
router.post('/enqueue-batch', async (req: Request, res: Response) => {
try {
@@ -625,35 +684,30 @@ router.post('/enqueue-batch', async (req: Request, res: Response) => {
return res.status(400).json({ success: false, error: 'Maximum 500 dispensaries per batch' });
}
// Insert jobs, skipping duplicates
const { rows } = await pool.query(`
INSERT INTO dispensary_crawl_jobs (dispensary_id, job_type, priority, trigger_type, status, created_at)
SELECT
d.id,
$2::text,
$3::integer,
'api_batch',
'pending',
NOW()
FROM dispensaries d
WHERE d.id = ANY($1::int[])
AND d.crawl_enabled = true
AND d.platform_dispensary_id IS NOT NULL
AND NOT EXISTS (
SELECT 1 FROM dispensary_crawl_jobs cj
WHERE cj.dispensary_id = d.id
AND cj.job_type = $2::text
AND cj.status IN ('pending', 'running')
)
RETURNING id, dispensary_id
`, [dispensary_ids, job_type, priority]);
// 2024-12-10: Map legacy job_type to new task role
const roleMap: Record<string, string> = {
'dutchie_product_crawl': 'product_refresh',
'menu_detection': 'entry_point_discovery',
'product_discovery': 'product_discovery',
};
const role = roleMap[job_type] || 'product_refresh';
// 2024-12-10: Use taskService to create tasks in worker_tasks table
const { taskService } = await import('../tasks/task-service');
const tasks = dispensary_ids.map(dispensary_id => ({
role: role as any,
dispensary_id,
priority,
}));
const createdCount = await taskService.createTasks(tasks);
res.json({
success: true,
queued: rows.length,
queued: createdCount,
requested: dispensary_ids.length,
job_ids: rows.map(r => r.id),
message: `Queued ${rows.length} of ${dispensary_ids.length} dispensaries`
message: `Queued ${createdCount} of ${dispensary_ids.length} dispensaries`
});
} catch (error: any) {
console.error('[JobQueue] Error batch enqueuing:', error);
@@ -664,6 +718,8 @@ router.post('/enqueue-batch', async (req: Request, res: Response) => {
/**
* POST /api/job-queue/enqueue-state - Queue all crawl-enabled dispensaries for a state
* Body: { state_code: string, job_type?: string, priority?: number, limit?: number }
*
* 2024-12-10: Rewired to use worker_tasks via taskService.
*/
router.post('/enqueue-state', async (req: Request, res: Response) => {
try {
@@ -673,52 +729,55 @@ router.post('/enqueue-state', async (req: Request, res: Response) => {
return res.status(400).json({ success: false, error: 'state_code is required (e.g., "AZ")' });
}
// Get state_id and queue jobs
const { rows } = await pool.query(`
WITH target_state AS (
SELECT id FROM states WHERE code = $1
)
INSERT INTO dispensary_crawl_jobs (dispensary_id, job_type, priority, trigger_type, status, created_at)
SELECT
d.id,
$2::text,
$3::integer,
'api_state',
'pending',
NOW()
FROM dispensaries d, target_state
WHERE d.state_id = target_state.id
// 2024-12-10: Map legacy job_type to new task role
const roleMap: Record<string, string> = {
'dutchie_product_crawl': 'product_refresh',
'menu_detection': 'entry_point_discovery',
'product_discovery': 'product_discovery',
};
const role = roleMap[job_type] || 'product_refresh';
// Get IDs of crawl-enabled dispensaries (with a platform_dispensary_id) in the state, capped at `limit`
const dispensaryResult = await pool.query(`
SELECT d.id
FROM dispensaries d
JOIN states s ON s.id = d.state_id
WHERE s.code = $1
AND d.crawl_enabled = true
AND d.platform_dispensary_id IS NOT NULL
AND NOT EXISTS (
SELECT 1 FROM dispensary_crawl_jobs cj
WHERE cj.dispensary_id = d.id
AND cj.job_type = $2::text
AND cj.status IN ('pending', 'running')
)
LIMIT $4::integer
RETURNING id, dispensary_id
`, [state_code.toUpperCase(), job_type, priority, limit]);
LIMIT $2
`, [state_code.toUpperCase(), limit]);
const dispensary_ids = dispensaryResult.rows.map((r: any) => r.id);
// 2024-12-10: Use taskService to create tasks in worker_tasks table
const { taskService } = await import('../tasks/task-service');
const tasks = dispensary_ids.map((dispensary_id: number) => ({
role: role as any,
dispensary_id,
priority,
}));
const createdCount = await taskService.createTasks(tasks);
// Count all crawl-enabled dispensaries (with a platform_dispensary_id) in the state, ignoring `limit`
const countResult = await pool.query(`
WITH target_state AS (
SELECT id FROM states WHERE code = $1
)
SELECT COUNT(*) as total
FROM dispensaries d, target_state
WHERE d.state_id = target_state.id
FROM dispensaries d
JOIN states s ON s.id = d.state_id
WHERE s.code = $1
AND d.crawl_enabled = true
AND d.platform_dispensary_id IS NOT NULL
`, [state_code.toUpperCase()]);
res.json({
success: true,
queued: rows.length,
queued: createdCount,
total_available: parseInt(countResult.rows[0].total),
state: state_code.toUpperCase(),
job_type,
message: `Queued ${rows.length} dispensaries for ${state_code.toUpperCase()}`
role,
message: `Queued ${createdCount} dispensaries for ${state_code.toUpperCase()}`
});
} catch (error: any) {
console.error('[JobQueue] Error enqueuing state:', error);