Files
cannaiq/backend/dist/scraper-v2/index.js
2025-11-28 19:45:44 -07:00

109 lines
6.0 KiB
JavaScript

"use strict";
/**
* Scraper V2 - Scrapy-inspired web scraping framework
*
* Architecture:
* - Engine: Main orchestrator
* - Scheduler: Priority queue with deduplication
* - Downloader: HTTP + Browser hybrid fetcher
* - Middlewares: Request/response processing chain
* - Pipelines: Item processing and persistence
* - Navigation: Category discovery
*/
/*
 * Compiler-emitted interop helper: exposes property `k` of module `m` on
 * object `o` under the name `k2` (defaults to `k`).
 *
 * When Object.create is available the property is installed through
 * Object.defineProperty. The source descriptor is reused as-is unless it is
 * missing, is an accessor on a non-ES-module object, or is a writable or
 * configurable data property; in those cases an enumerable getter that reads
 * `m[k]` on every access is installed instead, so later changes on `m` stay
 * visible through `o`. Without Object.create the value is simply copied.
 */
var __createBinding = (this && this.__createBinding) || (Object.create
    ? function (o, m, k, k2) {
        if (k2 === undefined) {
            k2 = k;
        }
        var descriptor = Object.getOwnPropertyDescriptor(m, k);
        var useLiveGetter;
        if (!descriptor) {
            useLiveGetter = true;
        }
        else if ("get" in descriptor) {
            // Accessors are only trusted when they come from a real ES module.
            useLiveGetter = !m.__esModule;
        }
        else {
            useLiveGetter = descriptor.writable || descriptor.configurable;
        }
        if (useLiveGetter) {
            descriptor = { enumerable: true, get: function () { return m[k]; } };
        }
        Object.defineProperty(o, k2, descriptor);
    }
    : function (o, m, k, k2) {
        // Legacy path: plain one-time value copy.
        o[k2 === undefined ? k : k2] = m[k];
    });
/*
 * Compiler-emitted helper behind `export * from "..."`: binds every enumerable
 * key of module `m` onto `exports`, skipping "default" and any name that
 * `exports` already owns (existing exports are never overwritten).
 */
var __exportStar = (this && this.__exportStar) || function (m, exports) {
    for (var key in m) {
        if (key === "default") {
            continue;
        }
        if (Object.prototype.hasOwnProperty.call(exports, key)) {
            continue;
        }
        __createBinding(exports, m, key);
    }
};
// Flag this CommonJS module as transpiled from ES-module syntax.
Object.defineProperty(exports, "__esModule", { value: true });
// Initialise every named class export to undefined (void 0) before the real
// getter-backed bindings are installed further down.
exports.StatsPipeline = exports.DatabasePipeline = exports.ImagePipeline = exports.DeduplicationPipeline = exports.SanitizationPipeline = exports.ValidationPipeline = exports.PipelineEngine = exports.StealthMiddleware = exports.BotDetectionMiddleware = exports.RetryMiddleware = exports.RateLimitMiddleware = exports.ProxyMiddleware = exports.UserAgentMiddleware = exports.MiddlewareEngine = exports.NavigationDiscovery = exports.Downloader = exports.RequestScheduler = exports.DutchieSpider = exports.ScraperEngine = void 0;
// The three API functions are function declarations defined later in this
// file; hoisting makes them available for export here.
exports.scrapeCategory = scrapeCategory;
exports.scrapeStore = scrapeStore;
exports.discoverCategories = discoverCategories;
// Re-exports from ./engine. Each export is a getter so that it always reads
// the current value from the required module object.
var engine_1 = require("./engine");
Object.defineProperty(exports, "ScraperEngine", { enumerable: true, get: function () { return engine_1.ScraperEngine; } });
Object.defineProperty(exports, "DutchieSpider", { enumerable: true, get: function () { return engine_1.DutchieSpider; } });
// Re-export from ./scheduler.
var scheduler_1 = require("./scheduler");
Object.defineProperty(exports, "RequestScheduler", { enumerable: true, get: function () { return scheduler_1.RequestScheduler; } });
// Re-export from ./downloader.
var downloader_1 = require("./downloader");
Object.defineProperty(exports, "Downloader", { enumerable: true, get: function () { return downloader_1.Downloader; } });
// Re-export from ./navigation.
var navigation_1 = require("./navigation");
Object.defineProperty(exports, "NavigationDiscovery", { enumerable: true, get: function () { return navigation_1.NavigationDiscovery; } });
// Re-exports from ./middlewares.
var middlewares_1 = require("./middlewares");
Object.defineProperty(exports, "MiddlewareEngine", { enumerable: true, get: function () { return middlewares_1.MiddlewareEngine; } });
Object.defineProperty(exports, "UserAgentMiddleware", { enumerable: true, get: function () { return middlewares_1.UserAgentMiddleware; } });
Object.defineProperty(exports, "ProxyMiddleware", { enumerable: true, get: function () { return middlewares_1.ProxyMiddleware; } });
Object.defineProperty(exports, "RateLimitMiddleware", { enumerable: true, get: function () { return middlewares_1.RateLimitMiddleware; } });
Object.defineProperty(exports, "RetryMiddleware", { enumerable: true, get: function () { return middlewares_1.RetryMiddleware; } });
Object.defineProperty(exports, "BotDetectionMiddleware", { enumerable: true, get: function () { return middlewares_1.BotDetectionMiddleware; } });
Object.defineProperty(exports, "StealthMiddleware", { enumerable: true, get: function () { return middlewares_1.StealthMiddleware; } });
// Re-exports from ./pipelines.
var pipelines_1 = require("./pipelines");
Object.defineProperty(exports, "PipelineEngine", { enumerable: true, get: function () { return pipelines_1.PipelineEngine; } });
Object.defineProperty(exports, "ValidationPipeline", { enumerable: true, get: function () { return pipelines_1.ValidationPipeline; } });
Object.defineProperty(exports, "SanitizationPipeline", { enumerable: true, get: function () { return pipelines_1.SanitizationPipeline; } });
Object.defineProperty(exports, "DeduplicationPipeline", { enumerable: true, get: function () { return pipelines_1.DeduplicationPipeline; } });
Object.defineProperty(exports, "ImagePipeline", { enumerable: true, get: function () { return pipelines_1.ImagePipeline; } });
Object.defineProperty(exports, "DatabasePipeline", { enumerable: true, get: function () { return pipelines_1.DatabasePipeline; } });
Object.defineProperty(exports, "StatsPipeline", { enumerable: true, get: function () { return pipelines_1.StatsPipeline; } });
// Star re-export of ./types: every enumerable key except "default" and names
// already present on `exports` is forwarded by __exportStar.
__exportStar(require("./types"), exports);
// Main API functions
// The modules below are required a second time under separate local names for
// use by the API functions in this file; Node's module cache means the
// repeated require calls do not load the modules again.
const engine_2 = require("./engine");
const navigation_2 = require("./navigation");
const downloader_2 = require("./downloader");
const logger_1 = require("../services/logger");
/**
 * Scrape a single category of a store.
 *
 * Builds a fresh ScraperEngine (constructed with the argument 1) and a
 * DutchieSpider bound to it, then delegates to the spider.
 *
 * @param storeId - forwarded unchanged to `spider.scrapeCategory`
 * @param categoryId - forwarded unchanged to `spider.scrapeCategory`
 * @returns a promise that resolves with no value once the spider finishes
 * @throws rethrows whatever the spider call throws, after logging it under
 *         the 'scraper' tag
 */
async function scrapeCategory(storeId, categoryId) {
    const spider = new engine_2.DutchieSpider(new engine_2.ScraperEngine(1));
    try {
        await spider.scrapeCategory(storeId, categoryId);
    }
    catch (err) {
        // Log for visibility, then let the caller decide how to handle it.
        logger_1.logger.error('scraper', `scrapeCategory failed: ${err}`);
        throw err;
    }
}
/**
 * Scrape an entire store.
 *
 * Builds a fresh ScraperEngine (constructed with the argument 1) and a
 * DutchieSpider bound to it, then delegates to the spider.
 *
 * @param storeId - forwarded unchanged to `spider.scrapeStore`
 * @param parallel - forwarded unchanged to `spider.scrapeStore`; defaults to 3
 * @returns a promise that resolves with no value once the spider finishes
 * @throws rethrows whatever the spider call throws, after logging it under
 *         the 'scraper' tag
 */
async function scrapeStore(storeId, parallel = 3) {
    // NOTE(review): the engine always receives 1 regardless of `parallel`;
    // presumably intentional, confirm against the ScraperEngine constructor.
    const spider = new engine_2.DutchieSpider(new engine_2.ScraperEngine(1));
    try {
        await spider.scrapeStore(storeId, parallel);
    }
    catch (err) {
        // Log for visibility, then let the caller decide how to handle it.
        logger_1.logger.error('scraper', `scrapeStore failed: ${err}`);
        throw err;
    }
}
/**
 * Discover categories for a store.
 *
 * Creates a dedicated Downloader, hands it to a NavigationDiscovery instance
 * and runs discovery. The downloader's `cleanup()` is always awaited once
 * discovery has been attempted, whether it succeeded or failed.
 *
 * @param storeId - forwarded unchanged to `discovery.discoverCategories`
 * @returns a promise that resolves with no value when discovery and cleanup
 *          are done
 * @throws rethrows whatever discovery throws, after logging it under the
 *         'scraper' tag
 */
async function discoverCategories(storeId) {
    const fetcher = new downloader_2.Downloader();
    const categoryFinder = new navigation_2.NavigationDiscovery(fetcher);
    try {
        // Per the original note: relies on the existing Dutchie category structure.
        await categoryFinder.discoverCategories(storeId);
    }
    catch (err) {
        logger_1.logger.error('scraper', `discoverCategories failed: ${err}`);
        throw err;
    }
    finally {
        // Release downloader resources on both success and failure paths.
        await fetcher.cleanup();
    }
}