feat: Parallelized store discovery, modification tracking, and task deduplication
Store Discovery Parallelization:
- Add store_discovery_state handler for per-state parallel discovery
- Add POST /api/tasks/batch/store-discovery endpoint
- 8 workers can now process states in parallel (~30-45 min vs 3+ hours)

Modification Tracking (Migration 090):
- Add last_modified_at, last_modified_by_task, last_modified_task_id to dispensaries
- Add same columns to store_products
- Update all handlers to set tracking info on modifications

Stale Task Recovery:
- Add periodic stale cleanup every 10 minutes (worker-0 only)
- Prevents orphaned tasks from blocking queue after worker crashes

Task Deduplication:
- createStaggeredTasks now skips if pending/active task exists for same role
- Skips if same role completed within last 4 hours
- API responses include skipped count

🤖 Generated with [Claude Code](https://claude.com/claude-code)
This commit is contained in:
@@ -76,6 +76,7 @@ import { handleProductDiscovery } from './handlers/product-discovery-curl';
|
||||
import { handleProductDiscoveryHttp } from './handlers/product-discovery-http';
|
||||
import { handleStoreDiscovery } from './handlers/store-discovery';
|
||||
import { handleStoreDiscoveryHttp } from './handlers/store-discovery-http';
|
||||
import { handleStoreDiscoveryState } from './handlers/store-discovery-state';
|
||||
import { handleEntryPointDiscovery } from './handlers/entry-point-discovery';
|
||||
import { handleAnalyticsRefresh } from './handlers/analytics-refresh';
|
||||
import { handleWhoami } from './handlers/whoami';
|
||||
@@ -159,6 +160,7 @@ const TASK_HANDLERS: Record<TaskRole, TaskHandler> = {
|
||||
product_refresh: handleProductRefresh, // disk -> DB
|
||||
product_discovery: handleProductDiscovery, // Default: curl (see getHandlerForTask for http override)
|
||||
store_discovery: handleStoreDiscovery,
|
||||
store_discovery_state: handleStoreDiscoveryState, // Per-state parallelized discovery
|
||||
entry_point_discovery: handleEntryPointDiscovery,
|
||||
analytics_refresh: handleAnalyticsRefresh,
|
||||
whoami: handleWhoami, // Tests proxy + anti-detect
|
||||
@@ -221,6 +223,7 @@ export class TaskWorker {
|
||||
private isRunning: boolean = false;
|
||||
private heartbeatInterval: NodeJS.Timeout | null = null;
|
||||
private registryHeartbeatInterval: NodeJS.Timeout | null = null;
|
||||
private staleCleanupInterval: NodeJS.Timeout | null = null;
|
||||
private crawlRotator: CrawlRotator;
|
||||
|
||||
// ==========================================================================
|
||||
@@ -798,6 +801,44 @@ export class TaskWorker {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Run stale task cleanup once.
 *
 * Calls `taskService.cleanupStaleTasks` with a 30 minute threshold and logs
 * the number of tasks cleaned when it is greater than zero. Intended to
 * recover tasks left in claimed/running status after worker crashes.
 *
 * Failures are logged and swallowed rather than rethrown, so the returned
 * promise resolves even when the cleanup call fails.
 */
private async runStaleTaskCleanup(): Promise<void> {
  try {
    console.log(`[TaskWorker] ${this.friendlyName} running stale task cleanup...`);
    const cleanupResult = await taskService.cleanupStaleTasks(30); // 30 minute threshold
    if (cleanupResult.cleaned > 0) {
      console.log(`[TaskWorker] Cleaned up ${cleanupResult.cleaned} stale tasks`);
    }
  } catch (err: unknown) {
    // A thrown value is not guaranteed to be an Error. Reading `.message`
    // off null/undefined would throw from inside this handler, and a plain
    // string/object would log `undefined`, so narrow before formatting.
    const message = err instanceof Error ? err.message : String(err);
    console.error(`[TaskWorker] Stale task cleanup error:`, message);
  }
}
|
||||
|
||||
/**
 * Start periodic stale task cleanup (every 10 minutes).
 *
 * Schedules `runStaleTaskCleanup` on a repeating timer and stores the handle
 * in `staleCleanupInterval` so `stopPeriodicStaleCleanup` can cancel it.
 * Callers restrict this to worker-0 to avoid races between workers.
 *
 * Safe to call more than once: a timer left over from an earlier call is
 * cleared before the new one is scheduled, so at most one timer is active.
 */
private startPeriodicStaleCleanup(): void {
  const STALE_CLEANUP_INTERVAL_MS = 10 * 60 * 1000; // 10 minutes

  // Without this, a repeated call would overwrite the stored handle and the
  // earlier timer could never be cancelled.
  if (this.staleCleanupInterval) {
    clearInterval(this.staleCleanupInterval);
  }

  this.staleCleanupInterval = setInterval(() => {
    // Fire-and-forget: the timer ignores the callback's result, and
    // runStaleTaskCleanup logs its own failures.
    void this.runStaleTaskCleanup();
  }, STALE_CLEANUP_INTERVAL_MS);

  console.log(`[TaskWorker] ${this.friendlyName} started periodic stale cleanup (every 10 min)`);
}
|
||||
|
||||
/**
 * Cancel the periodic stale task cleanup timer, if one is active.
 *
 * Clears the timer held in `staleCleanupInterval` and resets the field to
 * null. Does nothing when no timer is currently stored.
 */
private stopPeriodicStaleCleanup(): void {
  const activeTimer = this.staleCleanupInterval;

  // Nothing scheduled, so nothing to cancel.
  if (!activeTimer) {
    return;
  }

  clearInterval(activeTimer);
  this.staleCleanupInterval = null;
}
|
||||
|
||||
/**
|
||||
* Start the worker loop
|
||||
*
|
||||
@@ -814,18 +855,14 @@ export class TaskWorker {
|
||||
// Start registry heartbeat immediately
|
||||
this.startRegistryHeartbeat();
|
||||
|
||||
// Cleanup stale tasks on startup (only worker-0 does this to avoid races)
|
||||
// This handles tasks left in 'claimed'/'running' status when workers restart
|
||||
// Cleanup stale tasks on startup and periodically (only worker-0 does this to avoid races)
|
||||
// This handles tasks left in 'claimed'/'running' status when workers restart or crash
|
||||
if (this.workerId.endsWith('-0') || this.workerId === 'scraper-worker-0') {
|
||||
try {
|
||||
console.log(`[TaskWorker] ${this.friendlyName} running stale task cleanup...`);
|
||||
const cleanupResult = await taskService.cleanupStaleTasks(30); // 30 minute threshold
|
||||
if (cleanupResult.cleaned > 0) {
|
||||
console.log(`[TaskWorker] Cleaned up ${cleanupResult.cleaned} stale tasks`);
|
||||
}
|
||||
} catch (err: any) {
|
||||
console.error(`[TaskWorker] Stale task cleanup error:`, err.message);
|
||||
}
|
||||
// Run immediately on startup
|
||||
await this.runStaleTaskCleanup();
|
||||
|
||||
// Start periodic cleanup every 10 minutes
|
||||
this.startPeriodicStaleCleanup();
|
||||
}
|
||||
|
||||
const roleMsg = this.role ? `for role: ${this.role}` : '(role-agnostic - any task)';
|
||||
@@ -980,6 +1017,7 @@ export class TaskWorker {
|
||||
this.isRunning = false;
|
||||
this.stopHeartbeat();
|
||||
this.stopRegistryHeartbeat();
|
||||
this.stopPeriodicStaleCleanup();
|
||||
await this.deregister();
|
||||
console.log(`[TaskWorker] ${this.friendlyName} stopped`);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user