feat: Parallelized store discovery, modification tracking, and task deduplication

Store Discovery Parallelization:
- Add store_discovery_state handler for per-state parallel discovery
- Add POST /api/tasks/batch/store-discovery endpoint
- 8 workers can now process states in parallel (~30-45 min vs 3+ hours)

Modification Tracking (Migration 090):
- Add last_modified_at, last_modified_by_task, last_modified_task_id to dispensaries
- Add same columns to store_products
- Update all handlers to set tracking info on modifications

Stale Task Recovery:
- Add periodic stale task cleanup every 10 minutes (worker-0 only)
- Prevents orphaned tasks from blocking queue after worker crashes

Task Deduplication:
- createStaggeredTasks now skips task creation if a pending/active task already exists for the same role
- Also skips task creation if a task for the same role completed within the last 4 hours
- API responses include skipped count

🤖 Generated with [Claude Code](https://claude.com/claude-code)
This commit is contained in:
Kelly
2025-12-12 22:15:04 -07:00
parent e4e8438d8b
commit c62f8cbf06
11 changed files with 815 additions and 51 deletions

View File

@@ -76,6 +76,7 @@ import { handleProductDiscovery } from './handlers/product-discovery-curl';
import { handleProductDiscoveryHttp } from './handlers/product-discovery-http';
import { handleStoreDiscovery } from './handlers/store-discovery';
import { handleStoreDiscoveryHttp } from './handlers/store-discovery-http';
import { handleStoreDiscoveryState } from './handlers/store-discovery-state';
import { handleEntryPointDiscovery } from './handlers/entry-point-discovery';
import { handleAnalyticsRefresh } from './handlers/analytics-refresh';
import { handleWhoami } from './handlers/whoami';
@@ -159,6 +160,7 @@ const TASK_HANDLERS: Record<TaskRole, TaskHandler> = {
product_refresh: handleProductRefresh, // disk -> DB
product_discovery: handleProductDiscovery, // Default: curl (see getHandlerForTask for http override)
store_discovery: handleStoreDiscovery,
store_discovery_state: handleStoreDiscoveryState, // Per-state parallelized discovery
entry_point_discovery: handleEntryPointDiscovery,
analytics_refresh: handleAnalyticsRefresh,
whoami: handleWhoami, // Tests proxy + anti-detect
@@ -221,6 +223,7 @@ export class TaskWorker {
private isRunning: boolean = false;
private heartbeatInterval: NodeJS.Timeout | null = null;
private registryHeartbeatInterval: NodeJS.Timeout | null = null;
private staleCleanupInterval: NodeJS.Timeout | null = null;
private crawlRotator: CrawlRotator;
// ==========================================================================
@@ -798,6 +801,44 @@ export class TaskWorker {
}
}
/**
 * Run stale task cleanup once.
 *
 * Recovers tasks left in claimed/running status after worker crashes by
 * calling `taskService.cleanupStaleTasks` with a 30 minute threshold, and
 * logs how many tasks were cleaned when that number is greater than zero.
 *
 * Failures are logged rather than propagated, so a failed cleanup does not
 * abort the caller.
 */
private async runStaleTaskCleanup(): Promise<void> {
  try {
    console.log(`[TaskWorker] ${this.friendlyName} running stale task cleanup...`);
    const cleanupResult = await taskService.cleanupStaleTasks(30); // 30 minute threshold
    if (cleanupResult.cleaned > 0) {
      console.log(`[TaskWorker] Cleaned up ${cleanupResult.cleaned} stale tasks`);
    }
  } catch (err: unknown) {
    // Narrow before reading `.message`: a thrown null/undefined would otherwise
    // make this handler throw and turn a logged failure into a rejected promise.
    const detail = err instanceof Error ? err.message : String(err);
    console.error(`[TaskWorker] Stale task cleanup error:`, detail);
  }
}
/**
 * Start periodic stale task cleanup (every 10 minutes).
 *
 * Intended to be run by worker-0 only, to avoid races. This method does not
 * check the worker id itself; the caller is responsible for that gate.
 *
 * Calling it while a cleanup timer is already active is a no-op, so the
 * existing timer handle is never overwritten and can always be cleared by
 * `stopPeriodicStaleCleanup`.
 */
private startPeriodicStaleCleanup(): void {
  const STALE_CLEANUP_INTERVAL_MS = 10 * 60 * 1000; // 10 minutes

  // Already running: starting a second timer would orphan the first one,
  // because only the most recently stored handle can be cleared later.
  if (this.staleCleanupInterval) {
    return;
  }

  this.staleCleanupInterval = setInterval(() => {
    // Fire-and-forget: attach an explicit rejection handler so an unexpected
    // failure in a cleanup run can never surface as an unhandled rejection.
    this.runStaleTaskCleanup().catch((err: unknown) => {
      const detail = err instanceof Error ? err.message : String(err);
      console.error(`[TaskWorker] Stale task cleanup error:`, detail);
    });
  }, STALE_CLEANUP_INTERVAL_MS);

  console.log(`[TaskWorker] ${this.friendlyName} started periodic stale cleanup (every 10 min)`);
}
/**
 * Stop periodic stale task cleanup.
 *
 * Clears the stored cleanup timer, if there is one, and resets the handle to
 * null. Does nothing when no timer is stored.
 */
private stopPeriodicStaleCleanup(): void {
  const activeTimer = this.staleCleanupInterval;
  if (!activeTimer) {
    return;
  }

  clearInterval(activeTimer);
  this.staleCleanupInterval = null;
}
/**
* Start the worker loop
*
@@ -814,18 +855,14 @@ export class TaskWorker {
// Start registry heartbeat immediately
this.startRegistryHeartbeat();
// Cleanup stale tasks on startup (only worker-0 does this to avoid races)
// This handles tasks left in 'claimed'/'running' status when workers restart
// Cleanup stale tasks on startup and periodically (only worker-0 does this to avoid races)
// This handles tasks left in 'claimed'/'running' status when workers restart or crash
if (this.workerId.endsWith('-0') || this.workerId === 'scraper-worker-0') {
try {
console.log(`[TaskWorker] ${this.friendlyName} running stale task cleanup...`);
const cleanupResult = await taskService.cleanupStaleTasks(30); // 30 minute threshold
if (cleanupResult.cleaned > 0) {
console.log(`[TaskWorker] Cleaned up ${cleanupResult.cleaned} stale tasks`);
}
} catch (err: any) {
console.error(`[TaskWorker] Stale task cleanup error:`, err.message);
}
// Run immediately on startup
await this.runStaleTaskCleanup();
// Start periodic cleanup every 10 minutes
this.startPeriodicStaleCleanup();
}
const roleMsg = this.role ? `for role: ${this.role}` : '(role-agnostic - any task)';
@@ -980,6 +1017,7 @@ export class TaskWorker {
this.isRunning = false;
this.stopHeartbeat();
this.stopRegistryHeartbeat();
this.stopPeriodicStaleCleanup();
await this.deregister();
console.log(`[TaskWorker] ${this.friendlyName} stopped`);
}