- Move deprecated directories to src/_deprecated/:
  - hydration/ (old pipeline approach)
  - scraper-v2/ (old Puppeteer scraper)
  - canonical-hydration/ (merged into tasks)
  - Unused services: availability, crawler-logger, geolocation, etc.
  - Unused utils: age-gate-playwright, HomepageValidator, stealthBrowser
- Archive outdated docs to docs/_archive/:
  - ANALYTICS_RUNBOOK.md
  - ANALYTICS_V2_EXAMPLES.md
  - BRAND_INTELLIGENCE_API.md
  - CRAWL_PIPELINE.md
  - TASK_WORKFLOW_2024-12-10.md
  - WORKER_TASK_ARCHITECTURE.md
  - ORGANIC_SCRAPING_GUIDE.md
- Add docs/CODEBASE_MAP.md as single source of truth
- Add warning files to deprecated/archived directories
- Slim down CLAUDE.md to essential rules only

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
161 lines
3.8 KiB
TypeScript
161 lines
3.8 KiB
TypeScript
import { ScraperRequest } from './types';
|
|
import { logger } from '../services/logger';
|
|
import crypto from 'crypto';
|
|
|
|
/**
 * In-memory, priority-ordered request queue with optional de-duplication.
 *
 * Three collections are maintained:
 *  - `queue`      — pending requests, kept sorted by descending priority;
 *  - `inProgress` — fingerprints of requests handed out by `dequeue()` and
 *                   not yet passed to `markComplete()` / `requeueForRetry()`;
 *  - `seen`       — fingerprints of every request accepted by `enqueue()`,
 *                   consulted for de-duplication when it is enabled.
 */
export class RequestScheduler {
  private queue: ScraperRequest[] = [];
  private readonly inProgress: Set<string> = new Set();
  private readonly seen: Set<string> = new Set();
  private readonly deduplicationEnabled: boolean;

  /**
   * @param deduplicationEnabled When true (the default), `enqueue()` rejects
   *   any request whose fingerprint has already been accepted.
   */
  constructor(deduplicationEnabled: boolean = true) {
    this.deduplicationEnabled = deduplicationEnabled;
  }

  /**
   * Generate fingerprint for request deduplication.
   *
   * A fingerprint already present on the request is returned as-is.
   * Otherwise it is the hex MD5 digest of the JSON-serialised
   * `{ url, method, body }`, where `method` falls back to `'GET'`.
   * MD5 serves only as an identity key here, not as a security primitive.
   */
  private generateFingerprint(request: Partial<ScraperRequest>): string {
    if (request.fingerprint) {
      return request.fingerprint;
    }

    // Generate fingerprint based on URL and relevant metadata
    const data = {
      url: request.url,
      // `||` is kept on purpose: switching to `??` would change the
      // fingerprint produced for an empty-string method.
      method: request.metadata?.method || 'GET',
      body: request.metadata?.body
    };

    return crypto.createHash('md5').update(JSON.stringify(data)).digest('hex');
  }

  /**
   * Add a request to the queue.
   *
   * Missing fields are defaulted: `priority` 0, `retryCount` 0,
   * `maxRetries` 3, `metadata` `{}`.
   *
   * @param partialRequest Request description; `url` is mandatory.
   * @returns `true` when the request was queued; `false` when it has no URL
   *   or (with de-duplication enabled) its fingerprint was already seen.
   */
  enqueue(partialRequest: Partial<ScraperRequest>): boolean {
    if (!partialRequest.url) {
      logger.warn('scraper', 'Cannot enqueue request without URL');
      return false;
    }

    const fingerprint = this.generateFingerprint(partialRequest);

    // Check for duplicates
    if (this.deduplicationEnabled && this.seen.has(fingerprint)) {
      logger.debug('scraper', `Request already seen: ${partialRequest.url}`);
      return false;
    }

    // Create full request with defaults
    const request: ScraperRequest = {
      url: partialRequest.url,
      priority: partialRequest.priority ?? 0,
      retryCount: partialRequest.retryCount ?? 0,
      maxRetries: partialRequest.maxRetries ?? 3,
      metadata: partialRequest.metadata ?? {},
      // NOTE(review): a missing callback is not rejected here; the assertion
      // only silences the compiler. Confirm consumers tolerate `undefined`.
      callback: partialRequest.callback!,
      errorHandler: partialRequest.errorHandler,
      fingerprint
    };

    this.queue.push(request);
    this.seen.add(fingerprint);

    // Sort by priority (higher priority first). Array.prototype.sort is
    // stable, so requests of equal priority keep their insertion order.
    this.queue.sort((a, b) => b.priority - a.priority);

    logger.debug('scraper', `Enqueued: ${request.url} (priority: ${request.priority})`);
    return true;
  }

  /**
   * Get the next request from the queue.
   *
   * The returned request's fingerprint is recorded as in-progress.
   *
   * @returns The highest-priority pending request, or `null` when the queue
   *   is empty.
   */
  dequeue(): ScraperRequest | null {
    const request = this.queue.shift();
    if (!request) {
      return null;
    }

    if (request.fingerprint) {
      this.inProgress.add(request.fingerprint);
    }

    return request;
  }

  /**
   * Mark a request as complete by dropping its fingerprint from the
   * in-progress set. The fingerprint stays in `seen`.
   */
  markComplete(request: ScraperRequest): void {
    if (request.fingerprint) {
      this.inProgress.delete(request.fingerprint);
    }
  }

  /**
   * Requeue a failed request (for retry).
   *
   * Mutates `request`: `retryCount` is incremented and, when the request is
   * re-queued, `priority` is lowered by one (never below 0).
   *
   * @returns `true` when the request was queued again; `false` when
   *   `retryCount` now exceeds `maxRetries` (or `enqueue()` refused it).
   */
  requeueForRetry(request: ScraperRequest): boolean {
    if (request.fingerprint) {
      this.inProgress.delete(request.fingerprint);
    }

    request.retryCount++;

    if (request.retryCount > request.maxRetries) {
      // The fingerprint deliberately stays in `seen`: a request that has
      // exhausted its retries must still count as a duplicate, otherwise the
      // same URL could be enqueued again with a fresh retry budget.
      logger.warn('scraper', `Max retries exceeded for: ${request.url}`);
      return false;
    }

    // Decrease priority for retried requests
    request.priority = Math.max(0, request.priority - 1);

    // Forget the fingerprint only now, so that enqueue()'s duplicate check
    // lets this retry through.
    if (request.fingerprint) {
      this.seen.delete(request.fingerprint);
    }

    return this.enqueue(request);
  }

  /**
   * Get queue stats.
   *
   * @returns `pending` — queued requests; `inProgress` — dequeued but
   *   unfinished requests; `total` — number of fingerprints in `seen`.
   */
  getStats(): {
    pending: number;
    inProgress: number;
    total: number;
  } {
    return {
      pending: this.queue.length,
      inProgress: this.inProgress.size,
      total: this.seen.size
    };
  }

  /**
   * Check if queue is empty, i.e. nothing is pending and nothing is
   * in progress.
   */
  isEmpty(): boolean {
    return this.queue.length === 0 && this.inProgress.size === 0;
  }

  /**
   * Clear all queues: pending requests, in-progress markers and the
   * de-duplication history.
   */
  clear(): void {
    this.queue = [];
    this.inProgress.clear();
    this.seen.clear();
  }

  /**
   * Get pending requests count.
   */
  getPendingCount(): number {
    return this.queue.length;
  }

  /**
   * Get in-progress count.
   */
  getInProgressCount(): number {
    return this.inProgress.size;
  }
}
|