Files
cannaiq/backend/src/scraper-v2/index.ts
Kelly 2f483b3084 feat: SEO template library, discovery pipeline, and orchestrator enhancements
## SEO Template Library
- Add complete template library with 7 page types (state, city, category, brand, product, search, regeneration)
- Add Template Library tab in SEO Orchestrator with accordion-based editors
- Add template preview, validation, and variable injection engine
- Add API endpoints: /api/seo/templates, preview, validate, generate, regenerate

## Discovery Pipeline
- Add promotion.ts for discovery location validation and promotion
- Add discover-all-states.ts script for multi-state discovery
- Add promotion log migration (067)
- Enhance discovery routes and types

## Orchestrator & Admin
- Add crawl_enabled filter to stores page
- Add API permissions page
- Add job queue management
- Add price analytics routes
- Add markets and intelligence routes
- Enhance dashboard and worker monitoring

## Infrastructure
- Add migrations for worker definitions, SEO settings, field alignment
- Add canonical pipeline for scraper v2
- Update hydration and sync orchestrator
- Enhance multi-state query service

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-09 00:05:34 -07:00

102 lines
2.8 KiB
TypeScript

/**
* Scraper V2 - Scrapy-inspired web scraping framework
*
* IMPORTANT: For Dutchie stores, DO NOT USE scrapeStore() from this module.
* Dutchie crawling must go through the dutchie-az GraphQL pipeline:
* src/dutchie-az/services/product-crawler.ts
*
* This scraper-v2 module uses DOM-based extraction which is unreliable
* for Dutchie. The new dutchie-az pipeline uses GraphQL directly.
*
* Architecture:
* - Engine: Main orchestrator
* - Scheduler: Priority queue with deduplication
* - Downloader: HTTP + Browser hybrid fetcher
* - Middlewares: Request/response processing chain
* - Pipelines: Item processing and persistence
* - Navigation: Category discovery
*/
// --- Public re-exports -----------------------------------------------------

// Engine orchestrator and the Dutchie spider built on top of it.
export { ScraperEngine, DutchieSpider } from './engine';

// Priority-queue request scheduler with deduplication.
export { RequestScheduler } from './scheduler';

// Hybrid HTTP + browser fetcher.
export { Downloader } from './downloader';

// Category discovery via site navigation.
export { NavigationDiscovery } from './navigation';

// Request/response middleware chain (names sorted alphabetically).
export {
  BotDetectionMiddleware,
  MiddlewareEngine,
  ProxyMiddleware,
  RateLimitMiddleware,
  RetryMiddleware,
  StealthMiddleware,
  UserAgentMiddleware
} from './middlewares';

// Item-processing pipelines, from validation through persistence/stats.
export {
  DatabasePipeline,
  DeduplicationPipeline,
  ImagePipeline,
  PipelineEngine,
  SanitizationPipeline,
  StatsPipeline,
  ValidationPipeline
} from './pipelines';

// Canonical persistence pipeline plus crawl-run lifecycle helpers.
export {
  CanonicalDatabasePipeline,
  completeCrawlRun,
  createCrawlRun
} from './canonical-pipeline';

// Shared scraper types.
export * from './types';
// Main API functions
import { ScraperEngine, DutchieSpider } from './engine';
import { NavigationDiscovery } from './navigation';
import { Downloader } from './downloader';
import { logger } from '../services/logger';
/**
* Scrape a single category
*/
export async function scrapeCategory(storeId: number, categoryId: number): Promise<void> {
const engine = new ScraperEngine(1);
const spider = new DutchieSpider(engine);
try {
await spider.scrapeCategory(storeId, categoryId);
} catch (error) {
logger.error('scraper', `scrapeCategory failed: ${error}`);
throw error;
}
}
/**
* Scrape an entire store
*/
export async function scrapeStore(storeId: number, parallel: number = 3, _userAgent?: string): Promise<void> {
const engine = new ScraperEngine(1);
const spider = new DutchieSpider(engine);
try {
await spider.scrapeStore(storeId, parallel);
} catch (error) {
logger.error('scraper', `scrapeStore failed: ${error}`);
throw error;
}
}
/**
* Discover categories for a store
*/
export async function discoverCategories(storeId: number): Promise<void> {
const downloader = new Downloader();
const discovery = new NavigationDiscovery(downloader);
try {
// Discover categories (uses your existing Dutchie category structure)
await discovery.discoverCategories(storeId);
} catch (error) {
logger.error('scraper', `discoverCategories failed: ${error}`);
throw error;
} finally {
await downloader.cleanup();
}
}