## SEO Template Library

- Add complete template library with 7 page types (state, city, category, brand, product, search, regeneration)
- Add Template Library tab in SEO Orchestrator with accordion-based editors
- Add template preview, validation, and variable injection engine
- Add API endpoints: /api/seo/templates, preview, validate, generate, regenerate

## Discovery Pipeline

- Add promotion.ts for discovery location validation and promotion
- Add discover-all-states.ts script for multi-state discovery
- Add promotion log migration (067)
- Enhance discovery routes and types

## Orchestrator & Admin

- Add crawl_enabled filter to stores page
- Add API permissions page
- Add job queue management
- Add price analytics routes
- Add markets and intelligence routes
- Enhance dashboard and worker monitoring

## Infrastructure

- Add migrations for worker definitions, SEO settings, field alignment
- Add canonical pipeline for scraper v2
- Update hydration and sync orchestrator
- Enhance multi-state query service

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
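The variable injection engine itself is not shown in this file; as a rough sketch of the idea, an engine of this kind substitutes `{{placeholder}}` tokens in a page template with page-specific values. The names below (`renderTemplate`, `TemplateVars`) are illustrative assumptions, not the actual API:

```typescript
// Illustrative sketch only - the real engine lives in the SEO Orchestrator
// and its actual API may differ.
type TemplateVars = Record<string, string>;

function renderTemplate(template: string, vars: TemplateVars): string {
  // Substitute each {{name}} token; leave unknown placeholders intact so
  // a validation step can flag them instead of emitting blank text.
  return template.replace(/\{\{(\w+)\}\}/g, (match, name: string) => vars[name] ?? match);
}

// Example: a city page title template
renderTemplate('Dispensaries in {{city}}, {{state}}', { city: 'Phoenix', state: 'AZ' });
// => 'Dispensaries in Phoenix, AZ'
```

Leaving unknown placeholders in place, rather than replacing them with empty strings, gives a validate endpoint something concrete to report.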
/**
 * Scraper V2 - Scrapy-inspired web scraping framework
 *
 * IMPORTANT: For Dutchie stores, DO NOT USE scrapeStore() from this module.
 * Dutchie crawling must go through the dutchie-az GraphQL pipeline:
 * src/dutchie-az/services/product-crawler.ts
 *
 * This scraper-v2 module uses DOM-based extraction, which is unreliable
 * for Dutchie. The new dutchie-az pipeline uses GraphQL directly.
 *
 * Architecture:
 * - Engine: Main orchestrator
 * - Scheduler: Priority queue with deduplication
 * - Downloader: HTTP + Browser hybrid fetcher
 * - Middlewares: Request/response processing chain
 * - Pipelines: Item processing and persistence
 * - Navigation: Category discovery
 */

export { ScraperEngine, DutchieSpider } from './engine';
export { RequestScheduler } from './scheduler';
export { Downloader } from './downloader';
export { NavigationDiscovery } from './navigation';
export {
  MiddlewareEngine,
  UserAgentMiddleware,
  ProxyMiddleware,
  RateLimitMiddleware,
  RetryMiddleware,
  BotDetectionMiddleware,
  StealthMiddleware
} from './middlewares';
export {
  PipelineEngine,
  ValidationPipeline,
  SanitizationPipeline,
  DeduplicationPipeline,
  ImagePipeline,
  DatabasePipeline,
  StatsPipeline
} from './pipelines';
export {
  CanonicalDatabasePipeline,
  createCrawlRun,
  completeCrawlRun
} from './canonical-pipeline';
export * from './types';

// Main API functions
import { ScraperEngine, DutchieSpider } from './engine';
import { NavigationDiscovery } from './navigation';
import { Downloader } from './downloader';
import { logger } from '../services/logger';

/**
 * Scrape a single category
 */
export async function scrapeCategory(storeId: number, categoryId: number): Promise<void> {
  const engine = new ScraperEngine(1);
  const spider = new DutchieSpider(engine);

  try {
    await spider.scrapeCategory(storeId, categoryId);
  } catch (error) {
    logger.error('scraper', `scrapeCategory failed: ${error}`);
    throw error;
  }
}

/**
 * Scrape an entire store.
 *
 * NOTE: Do not use this for Dutchie stores; see the module header above.
 */
export async function scrapeStore(storeId: number, parallel: number = 3, _userAgent?: string): Promise<void> {
  const engine = new ScraperEngine(1);
  const spider = new DutchieSpider(engine);

  try {
    await spider.scrapeStore(storeId, parallel);
  } catch (error) {
    logger.error('scraper', `scrapeStore failed: ${error}`);
    throw error;
  }
}

/**
 * Discover categories for a store
 */
export async function discoverCategories(storeId: number): Promise<void> {
  const downloader = new Downloader();
  const discovery = new NavigationDiscovery(downloader);

  try {
    // Discover categories (uses your existing Dutchie category structure)
    await discovery.discoverCategories(storeId);
  } catch (error) {
    logger.error('scraper', `discoverCategories failed: ${error}`);
    throw error;
  } finally {
    await downloader.cleanup();
  }
}
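
// Usage sketch (illustrative, not part of this module): for a non-Dutchie
// store, discover categories first, then crawl with bounded parallelism.
// The wrapper function name below is hypothetical.
//
//   import { discoverCategories, scrapeStore } from './scraper-v2';
//
//   async function refreshStore(storeId: number): Promise<void> {
//     await discoverCategories(storeId); // populate the store's category list
//     await scrapeStore(storeId, 3);     // crawl with parallel = 3
//   }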