feat: Parallelized store discovery, modification tracking, and task deduplication

Store Discovery Parallelization:
- Add store_discovery_state handler for per-state parallel discovery
- Add POST /api/tasks/batch/store-discovery endpoint
- 8 workers can now process states in parallel (~30-45 min vs 3+ hours)

Modification Tracking (Migration 090):
- Add last_modified_at, last_modified_by_task, last_modified_task_id to dispensaries
- Add same columns to store_products
- Update all handlers to set tracking info on modifications

Stale Task Recovery:
- Add periodic stale cleanup every 10 minutes (worker-0 only)
- Prevents orphaned tasks from blocking queue after worker crashes

Task Deduplication:
- createStaggeredTasks now skips task creation if a pending/active task already exists for the same role
- Also skips if a task for the same role completed within the last 4 hours
- API responses include skipped count

🤖 Generated with [Claude Code](https://claude.com/claude-code)
This commit is contained in:
Kelly
2025-12-12 22:15:04 -07:00
parent e4e8438d8b
commit c62f8cbf06
11 changed files with 815 additions and 51 deletions

View File

@@ -1182,17 +1182,20 @@ router.post('/batch/staggered', async (req: Request, res: Response) => {
method
);
const totalDuration = (dispensary_ids.length - 1) * stagger_seconds;
const totalDuration = (result.created - 1) * stagger_seconds;
const estimatedEndTime = new Date(Date.now() + totalDuration * 1000);
res.status(201).json({
success: true,
created: result.created,
skipped: result.skipped,
task_ids: result.taskIds,
stagger_seconds,
total_duration_seconds: totalDuration,
estimated_completion: estimatedEndTime.toISOString(),
message: `Created ${result.created} staggered ${role} tasks (${stagger_seconds}s apart, ~${Math.ceil(totalDuration / 60)} min total)`,
message: result.skipped > 0
? `Created ${result.created} staggered ${role} tasks, skipped ${result.skipped} (duplicate/recently completed)`
: `Created ${result.created} staggered ${role} tasks (${stagger_seconds}s apart, ~${Math.ceil(totalDuration / 60)} min total)`,
});
} catch (error: unknown) {
console.error('Error creating staggered tasks:', error);
@@ -1326,6 +1329,107 @@ router.post('/batch/entry-point-discovery', async (req: Request, res: Response)
}
});
/**
 * POST /api/tasks/batch/store-discovery
 * Create parallelized store_discovery_state tasks for all active states
 *
 * Instead of one monolithic store_discovery task that takes hours,
 * this creates individual tasks for each state that can run in parallel.
 *
 * Body (optional):
 * - stagger_seconds: number (default: 10) - Seconds between each state task;
 *   must be a finite, non-negative number
 * - priority: number (default: 5) - Task priority; must be a finite number
 * - states: string[] (optional) - Specific state codes to discover (default: all active).
 *   Codes are upper-cased before matching; non-string entries are ignored.
 *
 * Responses:
 * - 201: tasks were created. `total_duration_seconds` is the offset of the LAST
 *   task's scheduled start, i.e. (tasks_created - 1) * stagger_seconds, and
 *   `estimated_start_completion` is that offset applied to the request time.
 * - 200: nothing to do (no matching active states, or every matching state
 *   already has a pending/claimed/running store_discovery_state task).
 * - 400: stagger_seconds or priority is not a usable number.
 * - 500: unexpected failure (e.g. a query error).
 */
router.post('/batch/store-discovery', async (req: Request, res: Response) => {
  try {
    // The body is optional; guard against req.body being undefined so a
    // body-less request uses the defaults instead of failing on property access.
    const body = req.body ?? {};
    const staggerSeconds = Number(body.stagger_seconds ?? 10);
    const taskPriority = Number(body.priority ?? 5);
    const specificStates: unknown = body.states;

    // Reject values that would otherwise yield an Invalid Date / bad INSERT
    // parameter and only fail once they reach the database.
    if (!Number.isFinite(staggerSeconds) || staggerSeconds < 0) {
      return res.status(400).json({ error: 'stagger_seconds must be a non-negative number' });
    }
    if (!Number.isFinite(taskPriority)) {
      return res.status(400).json({ error: 'priority must be a number' });
    }

    // Get active states, optionally restricted to the requested codes
    let statesQuery = `
      SELECT code, name FROM states WHERE is_active = true
    `;
    const params: unknown[] = [];
    if (Array.isArray(specificStates) && specificStates.length > 0) {
      // Ignore non-string entries rather than throwing on .toUpperCase().
      // If nothing usable remains, the empty array matches no state, so an
      // explicit (but invalid) filter never widens to "all active states".
      const requestedCodes = specificStates
        .filter((s): s is string => typeof s === 'string')
        .map((s) => s.toUpperCase());
      statesQuery += ` AND code = ANY($1)`;
      params.push(requestedCodes);
    }
    statesQuery += ` ORDER BY code`;

    const statesResult = await pool.query(statesQuery, params);
    if (statesResult.rows.length === 0) {
      return res.json({
        success: true,
        message: 'No active states to discover',
        tasks_created: 0,
      });
    }

    // Check for existing pending/claimed/running store_discovery_state tasks
    const existingResult = await pool.query(`
      SELECT payload->>'state_code' as state_code
      FROM worker_tasks
      WHERE role = 'store_discovery_state'
        AND status IN ('pending', 'claimed', 'running')
    `);
    const existingStates = new Set<string | null>(
      existingResult.rows.map((r: { state_code: string | null }) => r.state_code)
    );

    // Filter out states that already have an in-flight task
    const statesToCreate = statesResult.rows.filter(
      (s: { code: string }) => !existingStates.has(s.code)
    );
    if (statesToCreate.length === 0) {
      return res.json({
        success: true,
        message: 'All states already have pending store_discovery_state tasks',
        tasks_created: 0,
        skipped: statesResult.rows.length,
      });
    }

    // Create staggered tasks for each state. A single base timestamp keeps the
    // spacing exactly stagger_seconds apart regardless of how long each INSERT takes.
    const baseTimeMs = Date.now();
    const taskIds: number[] = [];
    for (let i = 0; i < statesToCreate.length; i++) {
      const state = statesToCreate[i];
      const scheduledFor = new Date(baseTimeMs + i * staggerSeconds * 1000);
      const result = await pool.query(`
        INSERT INTO worker_tasks (role, priority, scheduled_for, method, payload)
        VALUES ('store_discovery_state', $1, $2, 'http', $3)
        RETURNING id
      `, [taskPriority, scheduledFor, JSON.stringify({ state_code: state.code })]);
      taskIds.push(result.rows[0].id);
    }

    // The last task starts (n - 1) intervals after the first, matching the
    // formula used by the other staggered batch endpoints.
    const totalDuration = (statesToCreate.length - 1) * staggerSeconds;
    const estimatedEndTime = new Date(baseTimeMs + totalDuration * 1000);

    res.status(201).json({
      success: true,
      tasks_created: taskIds.length,
      task_ids: taskIds,
      states: statesToCreate.map((s: { code: string; name: string }) => s.code),
      skipped: statesResult.rows.length - statesToCreate.length,
      stagger_seconds: staggerSeconds,
      total_duration_seconds: totalDuration,
      estimated_start_completion: estimatedEndTime.toISOString(),
      message: `Created ${taskIds.length} store_discovery_state tasks for parallel execution`,
    });
  } catch (error: unknown) {
    console.error('Error creating store discovery tasks:', error);
    res.status(500).json({ error: 'Failed to create store discovery tasks' });
  }
});
// ============================================================
// STATE-BASED CRAWL ENDPOINTS
// ============================================================
@@ -1414,11 +1518,13 @@ router.post('/crawl-state/:stateCode', async (req: Request, res: Response) => {
state_name: state.name,
tasks_created: result.created,
stores_in_state: dispensariesResult.rows.length,
skipped: dispensariesResult.rows.length - result.created,
skipped: result.skipped,
stagger_seconds,
total_duration_seconds: totalDuration,
estimated_completion: estimatedEndTime.toISOString(),
message: `Created ${result.created} product_discovery tasks for ${state.name} (${stagger_seconds}s apart, ~${Math.ceil(totalDuration / 60)} min total)`,
message: result.skipped > 0
? `Created ${result.created} product_discovery tasks for ${state.name}, skipped ${result.skipped} (duplicate/recently completed)`
: `Created ${result.created} product_discovery tasks for ${state.name} (${stagger_seconds}s apart, ~${Math.ceil(totalDuration / 60)} min total)`,
});
} catch (error: unknown) {
console.error('Error creating state crawl tasks:', error);