feat: Performance optimizations and preflight improvements

- Add missing /api/analytics/national/summary endpoint
- Optimize dashboard activity queries (subquery vs JOIN+GROUP BY)
- Add PreflightSummary component to Workers page with gold qualified badge
- Add preflight retry logic - workers retry every 30s until qualified
- Run stale task cleanup on ALL workers (not just worker-0)
- Add preflight fields to worker-registry API (ip, fingerprint, is_qualified)

Database indexes added:
- idx_store_products_created_at (for recent products)
- idx_dispensaries_last_crawl_at (for recent scrapes)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-13 02:06:33 -07:00
parent 7849ee0256
commit 983cd71fc2
5 changed files with 301 additions and 73 deletions

View File

@@ -273,6 +273,16 @@ export class TaskWorker {
private preflightsCompleted = false;
private initializingPromise: Promise<void> | null = null;
// ==========================================================================
// PREFLIGHT RETRY SETTINGS
// ==========================================================================
// A worker whose HTTP preflight has not passed re-runs it on a fixed
// interval (PREFLIGHT_RETRY_INTERVAL_MS) and is kept out of the task pool
// until a run succeeds, so unqualified workers never claim work.
// isRetryingPreflight marks that a retry loop is already in progress so a
// second one is not started.
// ==========================================================================
private static readonly PREFLIGHT_RETRY_INTERVAL_MS = 30 * 1000; // 30 seconds
private isRetryingPreflight = false;
// ==========================================================================
// STEP TRACKING FOR DASHBOARD VISIBILITY
// ==========================================================================
@@ -617,6 +627,75 @@ export class TaskWorker {
}
}
/**
 * Retry the HTTP preflight until it passes or the worker stops.
 *
 * The worker is BLOCKED from claiming ANY tasks until the HTTP preflight
 * passes, which keeps unqualified workers out of the task pool. All current
 * tasks require the 'http' method, so the HTTP preflight is mandatory.
 *
 * Each iteration waits PREFLIGHT_RETRY_INTERVAL_MS, re-initializes the crawl
 * rotator (new proxies may have become available), and re-runs the preflight
 * with a single attempt. On success the updated status is reported.
 *
 * Returns immediately if the preflight has already passed or if a retry loop
 * is already in progress. Never throws for preflight, proxy-reload, or
 * status-report failures; those are logged and the loop continues or exits.
 */
private async retryPreflightUntilPass(): Promise<void> {
  if (this.preflightHttpPassed) {
    return; // Already passed
  }
  if (this.isRetryingPreflight) {
    return; // Already retrying
  }

  // Thrown values are not guaranteed to be Error instances.
  const describeError = (err: unknown): string =>
    err instanceof Error ? err.message : String(err);

  this.isRetryingPreflight = true;
  let retryCount = 0;

  console.log(`[TaskWorker] ${this.friendlyName} HTTP preflight FAILED - entering retry loop (every ${TaskWorker.PREFLIGHT_RETRY_INTERVAL_MS / 1000}s)`);
  console.log(`[TaskWorker] ${this.friendlyName} BLOCKED from task pool until preflight passes`);

  try {
    while (!this.preflightHttpPassed && this.isRunning) {
      retryCount++;

      // Wait before retry
      await this.sleep(TaskWorker.PREFLIGHT_RETRY_INTERVAL_MS);
      if (!this.isRunning) {
        break; // Worker stopping
      }

      console.log(`[TaskWorker] ${this.friendlyName} preflight retry #${retryCount}...`);

      // Reload proxies before retry (might have new ones). Best-effort: a
      // failed reload must not prevent the preflight attempt itself.
      try {
        await this.crawlRotator.initialize();
        const stats = this.crawlRotator.proxy.getStats();
        console.log(`[TaskWorker] Proxies available: ${stats.activeProxies}`);
      } catch (err: unknown) {
        console.warn(`[TaskWorker] Proxy reload failed: ${describeError(err)}`);
      }

      // Re-run HTTP preflight
      try {
        const httpResult = await runPuppeteerPreflightWithRetry(this.crawlRotator, 1);
        this.preflightHttpResult = httpResult;
        this.preflightHttpPassed = httpResult.passed;

        if (httpResult.passed) {
          console.log(`[TaskWorker] ${this.friendlyName} HTTP preflight PASSED on retry #${retryCount}!`);
          console.log(`[TaskWorker] ${this.friendlyName} IP: ${httpResult.proxyIp}, Products: ${httpResult.productsReturned}`);
          console.log(`[TaskWorker] ${this.friendlyName} now QUALIFIED to claim tasks`);

          // Report updated status. Handled separately so a reporting failure
          // is not logged as if the preflight itself had failed.
          try {
            await this.reportPreflightStatus();
          } catch (err: unknown) {
            console.error(`[TaskWorker] ${this.friendlyName} failed to report preflight status: ${describeError(err)}`);
          }
          break;
        }

        console.log(`[TaskWorker] ${this.friendlyName} HTTP preflight still FAILED: ${httpResult.error}`);
        console.log(`[TaskWorker] ${this.friendlyName} will retry in ${TaskWorker.PREFLIGHT_RETRY_INTERVAL_MS / 1000}s...`);
      } catch (err: unknown) {
        console.error(`[TaskWorker] ${this.friendlyName} preflight retry error: ${describeError(err)}`);
      }
    }
  } finally {
    // Always clear the flag, even on an unexpected throw; otherwise every
    // later call would return early and the worker could never requalify.
    this.isRetryingPreflight = false;
  }
}
/**
* Lazy initialization of stealth systems.
* Called BEFORE claiming first task (not at worker startup).
@@ -855,15 +934,14 @@ export class TaskWorker {
// Start registry heartbeat immediately
this.startRegistryHeartbeat();
// Cleanup stale tasks on startup and periodically (only worker-0 does this to avoid races)
// This handles tasks left in 'claimed'/'running' status when workers restart or crash
if (this.workerId.endsWith('-0') || this.workerId === 'scraper-worker-0') {
// Run immediately on startup
await this.runStaleTaskCleanup();
// Cleanup stale tasks on startup and periodically
// ALL workers run cleanup to ensure stale tasks are recovered even if some workers crash
// The cleanup query uses SELECT FOR UPDATE SKIP LOCKED to avoid races
// Run immediately on startup
await this.runStaleTaskCleanup();
// Start periodic cleanup every 10 minutes
this.startPeriodicStaleCleanup();
}
// Start periodic cleanup every 10 minutes
this.startPeriodicStaleCleanup();
const roleMsg = this.role ? `for role: ${this.role}` : '(role-agnostic - any task)';
console.log(`[TaskWorker] ${this.friendlyName} starting ${roleMsg} (stealth=lazy, max ${this.maxConcurrentTasks} concurrent tasks)`);
@@ -940,6 +1018,18 @@ export class TaskWorker {
}
}
// =================================================================
// PREFLIGHT GATE - BLOCK unqualified workers from task pool
// All tasks require HTTP method, so HTTP preflight MUST pass.
// If preflight failed, worker retries every PREFLIGHT_RETRY_INTERVAL_MS (currently 30 seconds).
// Worker CANNOT claim ANY tasks until preflight passes.
// =================================================================
if (!this.preflightHttpPassed) {
console.log(`[TaskWorker] ${this.friendlyName} BLOCKED - HTTP preflight not passed, cannot claim tasks`);
await this.retryPreflightUntilPass();
return; // Return to main loop, will re-check on next iteration
}
// Pass preflight capabilities to only claim compatible tasks
const task = await taskService.claimTask(
this.role,