109 lines
6.0 KiB
JavaScript
109 lines
6.0 KiB
JavaScript
"use strict";
|
|
/**
 * Scraper V2 - Scrapy-inspired web scraping framework
 *
 * Architecture:
 * - Engine: Main orchestrator
 * - Scheduler: Priority queue with deduplication
 * - Downloader: HTTP + Browser hybrid fetcher
 * - Middlewares: Request/response processing chain
 * - Pipelines: Item processing and persistence
 * - Navigation: Category discovery
 */
|
|
/**
 * Re-export helper: exposes property `k` of module object `m` on `o` under
 * the name `k2` (defaults to `k`).
 *
 * When `Object.create` exists, the property is installed with
 * `Object.defineProperty`:
 *   - if `m` has no own descriptor for `k`, or the descriptor is an accessor
 *     and `m.__esModule` is falsy, or it is a data descriptor that is
 *     writable or configurable, an enumerable getter returning `m[k]` is
 *     used, so later changes to `m[k]` are visible through `o[k2]`;
 *   - otherwise the existing descriptor of `m[k]` is reused unchanged.
 * Without `Object.create`, the current value is simply copied.
 *
 * An already-present `this.__createBinding` takes precedence over this
 * definition. `var` is kept so that redeclaring the helper in the same scope
 * stays legal.
 */
var __createBinding = (this && this.__createBinding) || (Object.create ? (function (o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        // Replace the descriptor with a live getter that reads through to the source module.
        desc = { enumerable: true, get: function () { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function (o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    // Fallback path: one-time snapshot of the value, not a live binding.
    o[k2] = m[k];
}));
|
|
/**
 * `export *` helper: for every enumerable key `p` of `m` (as visited by
 * `for...in`), binds `m[p]` onto `exports` via `__createBinding`, skipping
 * the key "default" and any key `exports` already owns, so existing exports
 * are never overwritten.
 *
 * An already-present `this.__exportStar` takes precedence over this
 * definition.
 */
var __exportStar = (this && this.__exportStar) || function (m, exports) {
    for (var p in m) {
        if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) {
            __createBinding(exports, m, p);
        }
    }
};
|
|
// Mark this CommonJS module as an ES-module interop target.
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare every named re-export as undefined; the getters below replace them.
exports.StatsPipeline = exports.DatabasePipeline = exports.ImagePipeline = exports.DeduplicationPipeline = exports.SanitizationPipeline = exports.ValidationPipeline = exports.PipelineEngine = exports.StealthMiddleware = exports.BotDetectionMiddleware = exports.RetryMiddleware = exports.RateLimitMiddleware = exports.ProxyMiddleware = exports.UserAgentMiddleware = exports.MiddlewareEngine = exports.NavigationDiscovery = exports.Downloader = exports.RequestScheduler = exports.DutchieSpider = exports.ScraperEngine = void 0;
// The API functions are function declarations further down; hoisting makes
// them available here.
exports.scrapeCategory = scrapeCategory;
exports.scrapeStore = scrapeStore;
exports.discoverCategories = discoverCategories;

// Re-exports from ./engine
var engine_1 = require("./engine");
Object.defineProperty(exports, "ScraperEngine", { enumerable: true, get: function () { return engine_1.ScraperEngine; } });
Object.defineProperty(exports, "DutchieSpider", { enumerable: true, get: function () { return engine_1.DutchieSpider; } });

// Re-exports from ./scheduler
var scheduler_1 = require("./scheduler");
Object.defineProperty(exports, "RequestScheduler", { enumerable: true, get: function () { return scheduler_1.RequestScheduler; } });

// Re-exports from ./downloader
var downloader_1 = require("./downloader");
Object.defineProperty(exports, "Downloader", { enumerable: true, get: function () { return downloader_1.Downloader; } });

// Re-exports from ./navigation
var navigation_1 = require("./navigation");
Object.defineProperty(exports, "NavigationDiscovery", { enumerable: true, get: function () { return navigation_1.NavigationDiscovery; } });

// Re-exports from ./middlewares
var middlewares_1 = require("./middlewares");
Object.defineProperty(exports, "MiddlewareEngine", { enumerable: true, get: function () { return middlewares_1.MiddlewareEngine; } });
Object.defineProperty(exports, "UserAgentMiddleware", { enumerable: true, get: function () { return middlewares_1.UserAgentMiddleware; } });
Object.defineProperty(exports, "ProxyMiddleware", { enumerable: true, get: function () { return middlewares_1.ProxyMiddleware; } });
Object.defineProperty(exports, "RateLimitMiddleware", { enumerable: true, get: function () { return middlewares_1.RateLimitMiddleware; } });
Object.defineProperty(exports, "RetryMiddleware", { enumerable: true, get: function () { return middlewares_1.RetryMiddleware; } });
Object.defineProperty(exports, "BotDetectionMiddleware", { enumerable: true, get: function () { return middlewares_1.BotDetectionMiddleware; } });
Object.defineProperty(exports, "StealthMiddleware", { enumerable: true, get: function () { return middlewares_1.StealthMiddleware; } });

// Re-exports from ./pipelines
var pipelines_1 = require("./pipelines");
Object.defineProperty(exports, "PipelineEngine", { enumerable: true, get: function () { return pipelines_1.PipelineEngine; } });
Object.defineProperty(exports, "ValidationPipeline", { enumerable: true, get: function () { return pipelines_1.ValidationPipeline; } });
Object.defineProperty(exports, "SanitizationPipeline", { enumerable: true, get: function () { return pipelines_1.SanitizationPipeline; } });
Object.defineProperty(exports, "DeduplicationPipeline", { enumerable: true, get: function () { return pipelines_1.DeduplicationPipeline; } });
Object.defineProperty(exports, "ImagePipeline", { enumerable: true, get: function () { return pipelines_1.ImagePipeline; } });
Object.defineProperty(exports, "DatabasePipeline", { enumerable: true, get: function () { return pipelines_1.DatabasePipeline; } });
Object.defineProperty(exports, "StatsPipeline", { enumerable: true, get: function () { return pipelines_1.StatsPipeline; } });

// Everything ./types exposes (except "default" and names already exported above).
__exportStar(require("./types"), exports);

// Main API functions
// The *_2 bindings are the ones the API functions below reference; they
// require the same module paths as the *_1 bindings above.
const engine_2 = require("./engine");
const navigation_2 = require("./navigation");
const downloader_2 = require("./downloader");
const logger_1 = require("../services/logger");
|
|
/**
 * Scrape a single category.
 *
 * Creates a `ScraperEngine` (constructed with the argument `1`) and a
 * `DutchieSpider` wrapping it, then awaits
 * `spider.scrapeCategory(storeId, categoryId)`.
 *
 * @param storeId - Forwarded unchanged to the spider.
 * @param categoryId - Forwarded unchanged to the spider.
 * @returns {Promise<void>} Resolves with no value; the spider's result is not returned.
 * @throws Re-throws whatever the spider call rejects with, after logging it
 *   through `logger.error` under the 'scraper' tag.
 */
async function scrapeCategory(storeId, categoryId) {
    const engine = new engine_2.ScraperEngine(1);
    const spider = new engine_2.DutchieSpider(engine);
    try {
        await spider.scrapeCategory(storeId, categoryId);
    }
    catch (error) {
        // Log for visibility, then let the caller decide how to handle the failure.
        logger_1.logger.error('scraper', `scrapeCategory failed: ${error}`);
        throw error;
    }
}
|
|
/**
 * Scrape an entire store.
 *
 * Creates a `ScraperEngine` (constructed with the argument `1`) and a
 * `DutchieSpider` wrapping it, then awaits
 * `spider.scrapeStore(storeId, parallel)`.
 *
 * @param storeId - Forwarded unchanged to the spider.
 * @param [parallel=3] - Forwarded unchanged to the spider; its meaning is
 *   defined by `DutchieSpider.scrapeStore`, which is not visible here.
 * @returns {Promise<void>} Resolves with no value; the spider's result is not returned.
 * @throws Re-throws whatever the spider call rejects with, after logging it
 *   through `logger.error` under the 'scraper' tag.
 */
async function scrapeStore(storeId, parallel = 3) {
    const engine = new engine_2.ScraperEngine(1);
    const spider = new engine_2.DutchieSpider(engine);
    try {
        await spider.scrapeStore(storeId, parallel);
    }
    catch (error) {
        // Log for visibility, then let the caller decide how to handle the failure.
        logger_1.logger.error('scraper', `scrapeStore failed: ${error}`);
        throw error;
    }
}
|
|
/**
 * Discover categories for a store.
 *
 * Creates a `Downloader` and a `NavigationDiscovery` that uses it, awaits
 * `discovery.discoverCategories(storeId)`, and always awaits
 * `downloader.cleanup()` afterwards, whether discovery succeeded or failed.
 *
 * @param storeId - Forwarded unchanged to `NavigationDiscovery.discoverCategories`.
 * @returns {Promise<void>} Resolves with no value; the discovery result is not returned.
 * @throws Re-throws whatever the discovery call rejects with, after logging
 *   it through `logger.error` under the 'scraper' tag.
 */
async function discoverCategories(storeId) {
    const downloader = new downloader_2.Downloader();
    const discovery = new navigation_2.NavigationDiscovery(downloader);
    try {
        // Discover categories (uses your existing Dutchie category structure)
        await discovery.discoverCategories(storeId);
    }
    catch (error) {
        logger_1.logger.error('scraper', `discoverCategories failed: ${error}`);
        throw error;
    }
    finally {
        // Release the downloader on both the success and the failure path.
        await downloader.cleanup();
    }
}
|