/**
 * Scraper V2 - Scrapy-inspired web scraping framework
 *
 * IMPORTANT: For Dutchie stores, DO NOT USE scrapeStore() from this module.
 * Dutchie crawling must go through the dutchie-az GraphQL pipeline:
 *   src/dutchie-az/services/product-crawler.ts
 *
 * This scraper-v2 module uses DOM-based extraction which is unreliable
 * for Dutchie. The new dutchie-az pipeline uses GraphQL directly.
 *
 * Architecture:
 * - Engine: Main orchestrator
 * - Scheduler: Priority queue with deduplication
 * - Downloader: HTTP + Browser hybrid fetcher
 * - Middlewares: Request/response processing chain
 * - Pipelines: Item processing and persistence
 * - Navigation: Category discovery
 */

export { ScraperEngine, DutchieSpider } from './engine';
export { RequestScheduler } from './scheduler';
export { Downloader } from './downloader';
export { NavigationDiscovery } from './navigation';
export {
  MiddlewareEngine,
  UserAgentMiddleware,
  ProxyMiddleware,
  RateLimitMiddleware,
  RetryMiddleware,
  BotDetectionMiddleware,
  StealthMiddleware
} from './middlewares';
export {
  PipelineEngine,
  ValidationPipeline,
  SanitizationPipeline,
  DeduplicationPipeline,
  ImagePipeline,
  DatabasePipeline,
  StatsPipeline
} from './pipelines';
export { CanonicalDatabasePipeline, createCrawlRun, completeCrawlRun } from './canonical-pipeline';
export * from './types';

// Main API functions
import { ScraperEngine, DutchieSpider } from './engine';
import { NavigationDiscovery } from './navigation';
import { Downloader } from './downloader';
import { logger } from '../services/logger';

/**
 * Scrape a single category.
 *
 * Builds a fresh `ScraperEngine` and `DutchieSpider` for the call and
 * delegates to `spider.scrapeCategory`.
 *
 * @param storeId - Identifier of the store, forwarded to the spider.
 * @param categoryId - Identifier of the category, forwarded to the spider.
 * @returns A promise that resolves with no value once the spider call settles.
 * @throws Rethrows whatever `spider.scrapeCategory` rejects with, after
 *   logging it under the `'scraper'` tag.
 */
export async function scrapeCategory(storeId: number, categoryId: number): Promise<void> {
  // NOTE(review): the meaning of the constructor argument `1` is defined in
  // ./engine and is not visible from this file.
  const engine = new ScraperEngine(1);
  const spider = new DutchieSpider(engine);

  try {
    await spider.scrapeCategory(storeId, categoryId);
  } catch (error) {
    logger.error('scraper', `scrapeCategory failed: ${error}`);
    throw error;
  }
}

/**
 * Scrape an entire store.
 *
 * Per the module header, this must not be used for Dutchie stores; those go
 * through the dutchie-az GraphQL pipeline instead.
 *
 * @param storeId - Identifier of the store, forwarded to the spider.
 * @param parallel - Forwarded as the second argument of `spider.scrapeStore`
 *   (defaults to 3). The engine itself is always constructed with `1`.
 * @param _userAgent - Accepted for call-site compatibility; not read by this
 *   function.
 * @returns A promise that resolves with no value once the spider call settles.
 * @throws Rethrows whatever `spider.scrapeStore` rejects with, after logging
 *   it under the `'scraper'` tag.
 */
export async function scrapeStore(
  storeId: number,
  parallel: number = 3,
  _userAgent?: string
): Promise<void> {
  const engine = new ScraperEngine(1);
  const spider = new DutchieSpider(engine);

  try {
    await spider.scrapeStore(storeId, parallel);
  } catch (error) {
    logger.error('scraper', `scrapeStore failed: ${error}`);
    throw error;
  }
}

/**
 * Discover categories for a store.
 *
 * Creates a dedicated `Downloader` for the call, runs
 * `NavigationDiscovery.discoverCategories` on it, and always calls
 * `downloader.cleanup()` afterwards, whether discovery succeeded or failed.
 *
 * @param storeId - Identifier of the store, forwarded to the discovery step.
 * @returns A promise that resolves with no value after cleanup has completed.
 * @throws Rethrows whatever `discovery.discoverCategories` rejects with,
 *   after logging it under the `'scraper'` tag.
 */
export async function discoverCategories(storeId: number): Promise<void> {
  const downloader = new Downloader();
  const discovery = new NavigationDiscovery(downloader);

  try {
    // Discover categories (uses your existing Dutchie category structure)
    await discovery.discoverCategories(storeId);
  } catch (error) {
    logger.error('scraper', `discoverCategories failed: ${error}`);
    throw error;
  } finally {
    // Runs on both the success and the failure path.
    await downloader.cleanup();
  }
}