feat: Parallelized store discovery, modification tracking, and task deduplication
Store Discovery Parallelization:
- Add store_discovery_state handler for per-state parallel discovery
- Add POST /api/tasks/batch/store-discovery endpoint
- 8 workers can now process states in parallel (~30-45 min vs 3+ hours)

Modification Tracking (Migration 090):
- Add last_modified_at, last_modified_by_task, last_modified_task_id to dispensaries
- Add the same columns to store_products
- Update all handlers to set tracking info on modifications

Stale Task Recovery:
- Add periodic stale cleanup every 10 minutes (worker-0 only)
- Prevents orphaned tasks from blocking the queue after worker crashes

Task Deduplication:
- createStaggeredTasks now skips if a pending/active task exists for the same role
- Skips if the same role completed within the last 4 hours
- API responses include a skipped count

🤖 Generated with [Claude Code](https://claude.com/claude-code)
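The Migration 090 columns and the handler changes that write them are not part of the hunks below. A minimal sketch of what a handler-side write might look like, assuming a node-postgres pool and a task context carrying the worker role and task id; the helper name updateDispensaryWithTracking and the name/address fields are illustrative, not the actual code:

import { Pool } from 'pg';

const pool = new Pool();

// Illustrative helper: persist a dispensary change and stamp the
// Migration 090 tracking columns in the same statement.
async function updateDispensaryWithTracking(
  dispensaryId: number,
  fields: { name?: string; address?: string },
  task: { role: string; id: number }
): Promise<void> {
  await pool.query(
    `UPDATE dispensaries
        SET name = COALESCE($2, name),
            address = COALESCE($3, address),
            last_modified_at = NOW(),
            last_modified_by_task = $4,
            last_modified_task_id = $5
      WHERE id = $1`,
    [dispensaryId, fields.name ?? null, fields.address ?? null, task.role, task.id]
  );
}

Stamping the tracking columns in the same UPDATE keeps the modification and its provenance atomic; store_products would get the same treatment.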
@@ -1182,17 +1182,20 @@ router.post('/batch/staggered', async (req: Request, res: Response) => {
       method
     );
 
-    const totalDuration = (dispensary_ids.length - 1) * stagger_seconds;
+    const totalDuration = (result.created - 1) * stagger_seconds;
     const estimatedEndTime = new Date(Date.now() + totalDuration * 1000);
 
     res.status(201).json({
       success: true,
       created: result.created,
+      skipped: result.skipped,
       task_ids: result.taskIds,
       stagger_seconds,
       total_duration_seconds: totalDuration,
       estimated_completion: estimatedEndTime.toISOString(),
-      message: `Created ${result.created} staggered ${role} tasks (${stagger_seconds}s apart, ~${Math.ceil(totalDuration / 60)} min total)`,
+      message: result.skipped > 0
+        ? `Created ${result.created} staggered ${role} tasks, skipped ${result.skipped} (duplicate/recently completed)`
+        : `Created ${result.created} staggered ${role} tasks (${stagger_seconds}s apart, ~${Math.ceil(totalDuration / 60)} min total)`,
     });
   } catch (error: unknown) {
     console.error('Error creating staggered tasks:', error);
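The hunk above only changes what the endpoint reports; the skip logic itself lives in createStaggeredTasks, which is not shown in this diff. A minimal sketch of that check, assuming the pool instance already used in this file, a completed_at column on worker_tasks, and a payload that stores the dispensary id (the function name shouldSkipTask is illustrative):

// Sketch only: mirrors the deduplication described in the commit message.
async function shouldSkipTask(dispensaryId: number, role: string): Promise<boolean> {
  const dup = await pool.query(
    `SELECT 1
       FROM worker_tasks
      WHERE role = $1
        AND payload->>'dispensary_id' = $2
        AND (
          status IN ('pending', 'claimed', 'running')
          OR (status = 'completed' AND completed_at > NOW() - INTERVAL '4 hours')
        )
      LIMIT 1`,
    [role, String(dispensaryId)]
  );
  return dup.rows.length > 0;
}

With a check like this, result.skipped is simply the count of dispensaries for which the check returned true.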
@@ -1326,6 +1329,107 @@ router.post('/batch/entry-point-discovery', async (req: Request, res: Response)
   }
 });
 
+/**
+ * POST /api/tasks/batch/store-discovery
+ * Create parallelized store_discovery_state tasks for all active states
+ *
+ * Instead of one monolithic store_discovery task that takes hours,
+ * this creates individual tasks for each state that can run in parallel.
+ *
+ * Body (optional):
+ * - stagger_seconds: number (default: 10) - Seconds between each state task
+ * - priority: number (default: 5) - Task priority
+ * - states: string[] (optional) - Specific state codes to discover (default: all active)
+ */
+router.post('/batch/store-discovery', async (req: Request, res: Response) => {
+  try {
+    const {
+      stagger_seconds = 10,
+      priority = 5,
+      states: specificStates,
+    } = req.body;
+
+    // Get active states
+    let statesQuery = `
+      SELECT code, name FROM states WHERE is_active = true
+    `;
+    const params: any[] = [];
+
+    if (specificStates && Array.isArray(specificStates) && specificStates.length > 0) {
+      statesQuery += ` AND code = ANY($1)`;
+      params.push(specificStates.map((s: string) => s.toUpperCase()));
+    }
+
+    statesQuery += ` ORDER BY code`;
+
+    const statesResult = await pool.query(statesQuery, params);
+
+    if (statesResult.rows.length === 0) {
+      return res.json({
+        success: true,
+        message: 'No active states to discover',
+        tasks_created: 0,
+      });
+    }
+
+    // Check for existing pending/running store_discovery_state tasks
+    const existingResult = await pool.query(`
+      SELECT payload->>'state_code' as state_code
+      FROM worker_tasks
+      WHERE role = 'store_discovery_state'
+        AND status IN ('pending', 'claimed', 'running')
+    `);
+    const existingStates = new Set(existingResult.rows.map((r: any) => r.state_code));
+
+    // Filter out states that already have pending tasks
+    const statesToCreate = statesResult.rows.filter(
+      (s: { code: string }) => !existingStates.has(s.code)
+    );
+
+    if (statesToCreate.length === 0) {
+      return res.json({
+        success: true,
+        message: 'All states already have pending store_discovery_state tasks',
+        tasks_created: 0,
+        skipped: statesResult.rows.length,
+      });
+    }
+
+    // Create staggered tasks for each state
+    const taskIds: number[] = [];
+    for (let i = 0; i < statesToCreate.length; i++) {
+      const state = statesToCreate[i];
+      const scheduledFor = new Date(Date.now() + i * stagger_seconds * 1000);
+
+      const result = await pool.query(`
+        INSERT INTO worker_tasks (role, priority, scheduled_for, method, payload)
+        VALUES ('store_discovery_state', $1, $2, 'http', $3)
+        RETURNING id
+      `, [priority, scheduledFor, JSON.stringify({ state_code: state.code })]);
+
+      taskIds.push(result.rows[0].id);
+    }
+
+    const totalDuration = statesToCreate.length * stagger_seconds;
+    const estimatedEndTime = new Date(Date.now() + totalDuration * 1000);
+
+    res.status(201).json({
+      success: true,
+      tasks_created: taskIds.length,
+      task_ids: taskIds,
+      states: statesToCreate.map((s: { code: string; name: string }) => s.code),
+      skipped: statesResult.rows.length - statesToCreate.length,
+      stagger_seconds,
+      total_duration_seconds: totalDuration,
+      estimated_start_completion: estimatedEndTime.toISOString(),
+      message: `Created ${taskIds.length} store_discovery_state tasks for parallel execution`,
+    });
+  } catch (error: unknown) {
+    console.error('Error creating store discovery tasks:', error);
+    res.status(500).json({ error: 'Failed to create store discovery tasks' });
+  }
+});
+
 // ============================================================
 // STATE-BASED CRAWL ENDPOINTS
 // ============================================================
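For reference, a call to the endpoint added above might look like the following. The base URL is a placeholder and the response values are illustrative; the response keys mirror the handler shown in the hunk:

// Kick off parallel store discovery for two specific states.
// http://localhost:3000 is a placeholder base URL.
const response = await fetch('http://localhost:3000/api/tasks/batch/store-discovery', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({ states: ['ca', 'mi'], stagger_seconds: 10, priority: 5 }),
});

const body = await response.json();
// Example of a successful response (values illustrative, keys from the handler above):
// {
//   "success": true,
//   "tasks_created": 2,
//   "task_ids": [101, 102],
//   "states": ["CA", "MI"],
//   "skipped": 0,
//   "stagger_seconds": 10,
//   "total_duration_seconds": 20,
//   "estimated_start_completion": "2025-01-01T00:00:20.000Z",
//   "message": "Created 2 store_discovery_state tasks for parallel execution"
// }
console.log(`${body.tasks_created} tasks created:`, body.task_ids);

State codes are uppercased by the handler, so lowercase input is accepted.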
@@ -1414,11 +1518,13 @@ router.post('/crawl-state/:stateCode', async (req: Request, res: Response) => {
       state_name: state.name,
       tasks_created: result.created,
       stores_in_state: dispensariesResult.rows.length,
-      skipped: dispensariesResult.rows.length - result.created,
+      skipped: result.skipped,
       stagger_seconds,
       total_duration_seconds: totalDuration,
       estimated_completion: estimatedEndTime.toISOString(),
-      message: `Created ${result.created} product_discovery tasks for ${state.name} (${stagger_seconds}s apart, ~${Math.ceil(totalDuration / 60)} min total)`,
+      message: result.skipped > 0
+        ? `Created ${result.created} product_discovery tasks for ${state.name}, skipped ${result.skipped} (duplicate/recently completed)`
+        : `Created ${result.created} product_discovery tasks for ${state.name} (${stagger_seconds}s apart, ~${Math.ceil(totalDuration / 60)} min total)`,
     });
   } catch (error: unknown) {
     console.error('Error creating state crawl tasks:', error);
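The stale task recovery described in the commit message is not part of the hunks above. A minimal sketch of the periodic cleanup, assuming worker identity comes from a WORKER_ID environment variable and that a stale task is a claimed or running row whose heartbeat has gone quiet; the last_heartbeat_at and claimed_by columns and the 15-minute threshold are assumptions:

import { Pool } from 'pg';

// Sketch only: requeue tasks whose worker stopped heartbeating.
// Runs every 10 minutes, and only on worker-0 so a single node owns cleanup.
const STALE_CLEANUP_INTERVAL_MS = 10 * 60 * 1000;

function startStaleTaskCleanup(pool: Pool, workerId: string): void {
  if (workerId !== 'worker-0') return;

  setInterval(async () => {
    try {
      const result = await pool.query(
        `UPDATE worker_tasks
            SET status = 'pending', claimed_by = NULL
          WHERE status IN ('claimed', 'running')
            AND last_heartbeat_at < NOW() - INTERVAL '15 minutes'`
      );
      console.log(`Stale task cleanup: requeued ${result.rowCount} tasks`);
    } catch (err) {
      console.error('Stale task cleanup failed:', err);
    }
  }, STALE_CLEANUP_INTERVAL_MS);
}

// Called once at worker startup, e.g.
// startStaleTaskCleanup(pool, process.env.WORKER_ID ?? '');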