Files
cannaiq/backend/dist/scraper-v2/scheduler.js
2025-11-28 19:45:44 -07:00

137 lines
4.1 KiB
JavaScript

"use strict";
// Interop helper (appears to be compiler-generated): reuses an existing
// this.__importDefault when one is present; otherwise returns the module
// unchanged if it is flagged __esModule, or wraps it as { default: mod } so
// callers can always read `.default`.
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
// Flag this module's exports object with __esModule.
Object.defineProperty(exports, "__esModule", { value: true });
// Placeholder; the real class is assigned to exports.RequestScheduler at the end of the file.
exports.RequestScheduler = void 0;
// logger_1.logger is used below for 'scraper' debug/warn messages.
const logger_1 = require("../services/logger");
// crypto is used below to hash request data into fingerprints.
const crypto_1 = __importDefault(require("crypto"));
/**
 * In-memory priority queue of scrape requests with optional de-duplication.
 *
 * State:
 *  - queue:      pending requests, re-sorted by descending priority on enqueue
 *  - inProgress: fingerprints handed out by dequeue() and not yet finished
 *  - seen:       fingerprints of accepted requests (duplicate filter and the
 *                `total` stat)
 *
 * NOTE(review): inProgress is a Set of fingerprints, so when de-duplication
 * is disabled two identical requests share a single in-progress entry.
 */
class RequestScheduler {
    queue = [];
    inProgress = new Set();
    seen = new Set();
    deduplicationEnabled = true;

    /**
     * @param {boolean} [deduplicationEnabled=true] When true, enqueue() rejects
     *   requests whose fingerprint has already been seen.
     */
    constructor(deduplicationEnabled = true) {
        this.deduplicationEnabled = deduplicationEnabled;
    }

    /**
     * Generate fingerprint for request deduplication.
     *
     * @param {object} request Request (or partial request).
     * @returns {string} `request.fingerprint` when it is set; otherwise the MD5
     *   hex digest of the JSON-serialised { url, method, body }, where method
     *   falls back to 'GET'.
     */
    generateFingerprint(request) {
        if (request.fingerprint) {
            return request.fingerprint;
        }
        // Generate fingerprint based on URL and relevant metadata
        const data = {
            url: request.url,
            method: request.metadata?.method || 'GET',
            body: request.metadata?.body
        };
        return crypto_1.default.createHash('md5').update(JSON.stringify(data)).digest('hex');
    }

    /**
     * Add a request to the queue.
     *
     * @param {object} partialRequest Must carry `url`; priority, retryCount,
     *   maxRetries and metadata default to 0, 0, 3 and {} respectively.
     * @returns {boolean} true when queued; false when the URL is missing or
     *   (with de-duplication enabled) the fingerprint was already seen.
     */
    enqueue(partialRequest) {
        if (!partialRequest.url) {
            logger_1.logger.warn('scraper', 'Cannot enqueue request without URL');
            return false;
        }
        const fingerprint = this.generateFingerprint(partialRequest);
        // Check for duplicates
        if (this.deduplicationEnabled && this.seen.has(fingerprint)) {
            logger_1.logger.debug('scraper', `Request already seen: ${partialRequest.url}`);
            return false;
        }
        // Create full request with defaults; the caller's object is not stored.
        const request = {
            url: partialRequest.url,
            priority: partialRequest.priority ?? 0,
            retryCount: partialRequest.retryCount ?? 0,
            maxRetries: partialRequest.maxRetries ?? 3,
            metadata: partialRequest.metadata || {},
            callback: partialRequest.callback,
            errorHandler: partialRequest.errorHandler,
            fingerprint
        };
        this.queue.push(request);
        this.seen.add(fingerprint);
        // Sort by priority (higher priority first)
        this.queue.sort((a, b) => b.priority - a.priority);
        logger_1.logger.debug('scraper', `Enqueued: ${request.url} (priority: ${request.priority})`);
        return true;
    }

    /**
     * Get the next request from the queue and mark it as in progress.
     *
     * @returns {object|null} The request at the head of the queue, or null
     *   when the queue is empty.
     */
    dequeue() {
        const request = this.queue.shift();
        if (request) {
            this.inProgress.add(request.fingerprint);
        }
        return request || null;
    }

    /**
     * Mark a request as complete by dropping its fingerprint from the
     * in-progress set. Requests without a fingerprint are ignored.
     *
     * @param {object} request A request previously returned by dequeue().
     */
    markComplete(request) {
        if (request.fingerprint) {
            this.inProgress.delete(request.fingerprint);
        }
    }

    /**
     * Requeue a failed request (for retry).
     *
     * Increments `request.retryCount` in place. While the count does not
     * exceed `request.maxRetries`, the request's priority is lowered by one
     * (never below 0) and it is enqueued again. Once retries are exhausted the
     * request is dropped and its fingerprint stays in `seen`, so the failed
     * request is not accepted again as if it were new and still counts toward
     * the `total` stat.
     *
     * @param {object} request The failed request.
     * @returns {boolean} true when re-queued, false when retries are exhausted
     *   (or enqueue() rejected it).
     */
    requeueForRetry(request) {
        // Derive the fingerprint instead of relying on request.fingerprint so a
        // request passed in without one is still released from inProgress and
        // is not rejected below as a duplicate of itself.
        const fingerprint = this.generateFingerprint(request);
        this.inProgress.delete(fingerprint);
        request.retryCount++;
        if (request.retryCount > request.maxRetries) {
            logger_1.logger.warn('scraper', `Max retries exceeded for: ${request.url}`);
            return false;
        }
        // Forget the fingerprint only now that the request really is going
        // back into the queue; otherwise enqueue() would treat it as a duplicate.
        this.seen.delete(fingerprint);
        // Decrease priority for retried requests
        request.priority = Math.max(0, request.priority - 1);
        return this.enqueue(request);
    }

    /**
     * Get queue stats.
     *
     * @returns {{pending: number, inProgress: number, total: number}} Sizes of
     *   the queue, the in-progress set and the seen set.
     */
    getStats() {
        return {
            pending: this.queue.length,
            inProgress: this.inProgress.size,
            total: this.seen.size
        };
    }

    /**
     * Check if queue is empty.
     *
     * @returns {boolean} true when nothing is pending and nothing is in progress.
     */
    isEmpty() {
        return this.queue.length === 0 && this.inProgress.size === 0;
    }

    /**
     * Clear all queues: pending requests, in-progress fingerprints and the
     * seen set.
     */
    clear() {
        this.queue = [];
        this.inProgress.clear();
        this.seen.clear();
    }

    /**
     * Get pending requests count.
     *
     * @returns {number} Number of requests waiting in the queue.
     */
    getPendingCount() {
        return this.queue.length;
    }

    /**
     * Get in-progress count.
     *
     * @returns {number} Number of fingerprints currently marked in progress.
     */
    getInProgressCount() {
        return this.inProgress.size;
    }
}
// Publish the class on the CommonJS exports object.
exports.RequestScheduler = RequestScheduler;