Files
cannaiq/backend/src/_deprecated/scraper-v2/scheduler.ts
Kelly a35976b9e9 chore: Clean up deprecated code and docs
- Move deprecated directories to src/_deprecated/:
  - hydration/ (old pipeline approach)
  - scraper-v2/ (old Puppeteer scraper)
  - canonical-hydration/ (merged into tasks)
  - Unused services: availability, crawler-logger, geolocation, etc
  - Unused utils: age-gate-playwright, HomepageValidator, stealthBrowser

- Archive outdated docs to docs/_archive/:
  - ANALYTICS_RUNBOOK.md
  - ANALYTICS_V2_EXAMPLES.md
  - BRAND_INTELLIGENCE_API.md
  - CRAWL_PIPELINE.md
  - TASK_WORKFLOW_2024-12-10.md
  - WORKER_TASK_ARCHITECTURE.md
  - ORGANIC_SCRAPING_GUIDE.md

- Add docs/CODEBASE_MAP.md as single source of truth
- Add warning files to deprecated/archived directories
- Slim down CLAUDE.md to essential rules only

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-11 22:17:40 -07:00

161 lines
3.8 KiB
TypeScript

import { ScraperRequest } from './types';
import { logger } from '../services/logger';
import crypto from 'crypto';
/**
 * In-memory priority queue for scraper requests with optional
 * fingerprint-based de-duplication.
 *
 * A request moves through: `enqueue` -> `dequeue` (tracked as in progress)
 * -> `markComplete` or `requeueForRetry`.
 */
export class RequestScheduler {
  /** Pending requests, kept sorted by descending priority. */
  private queue: ScraperRequest[] = [];
  /** Fingerprints handed out by `dequeue` that have not been finished yet. */
  private readonly inProgress = new Set<string>();
  /** Fingerprints of every accepted request; consulted for de-duplication. */
  private readonly seen = new Set<string>();
  /** When false, `enqueue` accepts requests whose fingerprint was already seen. */
  private readonly deduplicationEnabled: boolean;

  /**
   * @param deduplicationEnabled - Reject requests whose fingerprint has
   *   already been accepted. Defaults to `true`.
   */
  constructor(deduplicationEnabled: boolean = true) {
    this.deduplicationEnabled = deduplicationEnabled;
  }

  /**
   * Generate fingerprint for request deduplication.
   *
   * An explicit `request.fingerprint` wins. Otherwise the fingerprint is the
   * MD5 hex digest of the JSON-serialized `{ url, method, body }` triple,
   * where `method` falls back to `'GET'` when absent or empty.
   * MD5 is used purely as a cheap identity hash here, not for security.
   */
  private generateFingerprint(request: Partial<ScraperRequest>): string {
    if (request.fingerprint) {
      return request.fingerprint;
    }

    const identity = {
      url: request.url,
      method: request.metadata?.method || 'GET',
      body: request.metadata?.body
    };
    return crypto.createHash('md5').update(JSON.stringify(identity)).digest('hex');
  }

  /**
   * Add a request to the queue.
   *
   * Missing fields are defaulted: `priority` 0, `retryCount` 0,
   * `maxRetries` 3, `metadata` `{}`.
   *
   * @returns `true` when the request was queued; `false` when it has no URL
   *   or (with de-duplication enabled) its fingerprint was already seen.
   */
  enqueue(partialRequest: Partial<ScraperRequest>): boolean {
    if (!partialRequest.url) {
      logger.warn('scraper', 'Cannot enqueue request without URL');
      return false;
    }

    const fingerprint = this.generateFingerprint(partialRequest);

    if (this.deduplicationEnabled && this.seen.has(fingerprint)) {
      logger.debug('scraper', `Request already seen: ${partialRequest.url}`);
      return false;
    }

    // The queue stores its own copy, so later mutation of the caller's
    // object cannot disturb the queue ordering.
    const request: ScraperRequest = {
      url: partialRequest.url,
      priority: partialRequest.priority ?? 0,
      retryCount: partialRequest.retryCount ?? 0,
      maxRetries: partialRequest.maxRetries ?? 3,
      metadata: partialRequest.metadata ?? {},
      // NOTE(review): callback is not validated; a request enqueued without
      // one carries `undefined` here despite the non-null assertion.
      callback: partialRequest.callback!,
      errorHandler: partialRequest.errorHandler,
      fingerprint
    };

    this.queue.push(request);
    this.seen.add(fingerprint);

    // Higher priority first. Array.prototype.sort is stable (ES2019+), so
    // requests of equal priority stay in insertion (FIFO) order.
    this.queue.sort((a, b) => b.priority - a.priority);

    logger.debug('scraper', `Enqueued: ${request.url} (priority: ${request.priority})`);
    return true;
  }

  /**
   * Get the next request from the queue and mark it as in progress.
   *
   * @returns The highest-priority pending request, or `null` when the queue
   *   is empty.
   */
  dequeue(): ScraperRequest | null {
    const request = this.queue.shift();
    if (!request) {
      return null;
    }
    // `enqueue` always assigns a fingerprint to queued requests.
    this.inProgress.add(request.fingerprint!);
    return request;
  }

  /**
   * Mark a request as complete, removing it from the in-progress set.
   * Its fingerprint stays in `seen`, so it will not be accepted again while
   * de-duplication is enabled.
   */
  markComplete(request: ScraperRequest): void {
    if (request.fingerprint) {
      this.inProgress.delete(request.fingerprint);
    }
  }

  /**
   * Requeue a failed request (for retry).
   *
   * Increments `request.retryCount` in place. If the retry budget is
   * exhausted the request is dropped and its fingerprint is deliberately
   * left in `seen`, so a permanently failing request is not silently
   * re-admitted with a fresh retry budget and `getStats().total` does not
   * shrink. Otherwise the request is re-enqueued one priority level lower.
   *
   * @returns `true` when the request was queued again, `false` when it was
   *   dropped (retries exhausted) or rejected by `enqueue`.
   */
  requeueForRetry(request: ScraperRequest): boolean {
    if (request.fingerprint) {
      this.inProgress.delete(request.fingerprint);
    }

    request.retryCount++;

    if (request.retryCount > request.maxRetries) {
      logger.warn('scraper', `Max retries exceeded for: ${request.url}`);
      return false;
    }

    // Forget the fingerprint only now that the request really is going back
    // into the queue; otherwise `enqueue` would reject it as a duplicate.
    if (request.fingerprint) {
      this.seen.delete(request.fingerprint);
    }

    // Demote retried requests by one level, flooring at 0. Priorities that
    // are already <= 0 are left alone so a retry can never raise priority.
    if (request.priority > 0) {
      request.priority = Math.max(0, request.priority - 1);
    }

    return this.enqueue(request);
  }

  /**
   * Get queue stats.
   *
   * @returns `pending` (queued requests), `inProgress` (dequeued but not
   *   finished) and `total` (distinct fingerprints currently in `seen`).
   */
  getStats(): {
    pending: number;
    inProgress: number;
    total: number;
  } {
    return {
      pending: this.queue.length,
      inProgress: this.inProgress.size,
      total: this.seen.size
    };
  }

  /**
   * Check if the scheduler is idle: nothing pending and nothing in progress.
   */
  isEmpty(): boolean {
    return this.queue.length === 0 && this.inProgress.size === 0;
  }

  /**
   * Clear all state: pending queue, in-progress set and seen fingerprints.
   */
  clear(): void {
    this.queue = [];
    this.inProgress.clear();
    this.seen.clear();
  }

  /**
   * Get pending requests count.
   */
  getPendingCount(): number {
    return this.queue.length;
  }

  /**
   * Get in-progress count.
   */
  getInProgressCount(): number {
    return this.inProgress.size;
  }
}