137 lines
4.1 KiB
JavaScript
137 lines
4.1 KiB
JavaScript
"use strict";
|
|
// Interop helper for default imports: reuse a helper already present on
// `this` when there is one; otherwise wrap modules that are not flagged
// `__esModule` so their value is reachable through a `default` property.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        // Already shaped like an ES module namespace; hand it back untouched.
        return mod;
    }
    return { "default": mod };
};
|
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
exports.RequestScheduler = void 0;
|
|
const logger_1 = require("../services/logger");
|
|
const crypto_1 = __importDefault(require("crypto"));
|
|
/**
 * In-memory priority queue of scrape requests with optional
 * fingerprint-based deduplication and retry bookkeeping.
 *
 * State:
 *  - `queue`      pending requests, kept sorted by descending priority
 *  - `inProgress` fingerprints handed out by `dequeue()` and not yet
 *                 completed or requeued
 *  - `seen`       every fingerprint accepted by `enqueue()`
 *
 * NOTE(review): `inProgress` is a Set keyed by fingerprint, so when
 * deduplication is disabled two identical requests that are in flight at the
 * same time occupy a single entry; completing one of them makes `isEmpty()`
 * ignore the other. Confirm whether callers rely on that mode.
 */
class RequestScheduler {
    queue = [];
    inProgress = new Set();
    seen = new Set();
    deduplicationEnabled = true;

    /**
     * @param {boolean} [deduplicationEnabled=true] When true, `enqueue()`
     *   rejects requests whose fingerprint has already been accepted.
     */
    constructor(deduplicationEnabled = true) {
        this.deduplicationEnabled = deduplicationEnabled;
    }

    /**
     * Generate fingerprint for request deduplication.
     *
     * An explicit `request.fingerprint` wins; otherwise the fingerprint is
     * the MD5 hex digest of the JSON-serialised `{ url, method, body }`,
     * where `method` falls back to 'GET' when `metadata.method` is falsy.
     *
     * @param {object} request Request or partial request.
     * @returns {string} The fingerprint.
     */
    generateFingerprint(request) {
        if (request.fingerprint) {
            return request.fingerprint;
        }
        // Generate fingerprint based on URL and relevant metadata
        const data = {
            url: request.url,
            method: request.metadata?.method || 'GET',
            body: request.metadata?.body
        };
        return crypto_1.default.createHash('md5').update(JSON.stringify(data)).digest('hex');
    }

    /**
     * Add a request to the queue.
     *
     * Missing fields are defaulted (`priority` 0, `retryCount` 0,
     * `maxRetries` 3, `metadata` {}), and a new request object is stored, so
     * the caller's object is not the one kept in the queue.
     *
     * @param {object} partialRequest Must carry a truthy `url`.
     * @returns {boolean} `true` if queued; `false` when the URL is missing
     *   or, with deduplication enabled, the fingerprint was already seen.
     */
    enqueue(partialRequest) {
        if (!partialRequest.url) {
            logger_1.logger.warn('scraper', 'Cannot enqueue request without URL');
            return false;
        }
        const fingerprint = this.generateFingerprint(partialRequest);
        // Check for duplicates
        if (this.deduplicationEnabled && this.seen.has(fingerprint)) {
            logger_1.logger.debug('scraper', `Request already seen: ${partialRequest.url}`);
            return false;
        }
        // Create full request with defaults
        const request = {
            url: partialRequest.url,
            priority: partialRequest.priority ?? 0,
            retryCount: partialRequest.retryCount ?? 0,
            maxRetries: partialRequest.maxRetries ?? 3,
            metadata: partialRequest.metadata || {},
            callback: partialRequest.callback,
            errorHandler: partialRequest.errorHandler,
            fingerprint
        };
        this.queue.push(request);
        this.seen.add(fingerprint);
        // Sort by priority (higher priority first)
        this.queue.sort((a, b) => b.priority - a.priority);
        logger_1.logger.debug('scraper', `Enqueued: ${request.url} (priority: ${request.priority})`);
        return true;
    }

    /**
     * Get the next request from the queue and mark it as in progress.
     *
     * @returns {object|null} The head of the queue, or `null` when no
     *   request is pending.
     */
    dequeue() {
        const request = this.queue.shift();
        if (request) {
            this.inProgress.add(request.fingerprint);
        }
        return request || null;
    }

    /**
     * Mark a request as complete by removing it from the in-progress set.
     * Its fingerprint stays in `seen`.
     *
     * @param {object} request A request previously returned by `dequeue()`.
     */
    markComplete(request) {
        if (request.fingerprint) {
            this.inProgress.delete(request.fingerprint);
        }
    }

    /**
     * Requeue a failed request (for retry).
     *
     * The request leaves the in-progress set and its `retryCount` is
     * incremented on the passed-in object. If the retry budget is exhausted
     * the request is dropped and its fingerprint deliberately stays in
     * `seen`, so a permanently failing request cannot re-enter the queue and
     * `getStats().total` keeps counting it. Otherwise its priority is
     * lowered by one (never below 0 for non-negative priorities, and never
     * raised for negative ones) and it is enqueued again.
     *
     * @param {object} request A request previously returned by `dequeue()`.
     * @returns {boolean} `true` if the request was re-enqueued, `false` if
     *   the retry budget is exhausted or `enqueue()` rejected it.
     */
    requeueForRetry(request) {
        if (request.fingerprint) {
            this.inProgress.delete(request.fingerprint);
        }
        // Same defaults as enqueue(), so a request lacking these fields does
        // not produce NaN counters or a NaN priority that breaks the sort.
        request.retryCount = (request.retryCount ?? 0) + 1;
        const maxRetries = request.maxRetries ?? 3;
        if (request.retryCount > maxRetries) {
            logger_1.logger.warn('scraper', `Max retries exceeded for: ${request.url}`);
            return false;
        }
        // Decrease priority for retried requests; the outer Math.min keeps a
        // negative priority from being bumped up to 0 by the clamp.
        const currentPriority = request.priority ?? 0;
        request.priority = Math.min(currentPriority, Math.max(0, currentPriority - 1));
        // Only now forget the fingerprint, so enqueue() accepts the retry.
        if (request.fingerprint) {
            this.seen.delete(request.fingerprint);
        }
        return this.enqueue(request);
    }

    /**
     * Get queue stats.
     *
     * @returns {{pending: number, inProgress: number, total: number}}
     *   Pending queue length, in-progress fingerprint count, and number of
     *   distinct fingerprints currently in `seen`.
     */
    getStats() {
        return {
            pending: this.queue.length,
            inProgress: this.inProgress.size,
            total: this.seen.size
        };
    }

    /**
     * Check if queue is empty.
     *
     * @returns {boolean} `true` when nothing is pending and nothing is in
     *   progress.
     */
    isEmpty() {
        return this.queue.length === 0 && this.inProgress.size === 0;
    }

    /**
     * Clear all queues: pending requests, in-progress markers and the
     * deduplication history.
     */
    clear() {
        this.queue = [];
        this.inProgress.clear();
        this.seen.clear();
    }

    /**
     * Get pending requests count.
     *
     * @returns {number} Number of requests waiting in the queue.
     */
    getPendingCount() {
        return this.queue.length;
    }

    /**
     * Get in-progress count.
     *
     * @returns {number} Number of fingerprints currently in progress.
     */
    getInProgressCount() {
        return this.inProgress.size;
    }
}
|
|
exports.RequestScheduler = RequestScheduler;
|