Files
cannaiq/backend/src/_deprecated/scraper-v2/index.ts
Kelly a35976b9e9 chore: Clean up deprecated code and docs
- Move deprecated directories to src/_deprecated/:
  - hydration/ (old pipeline approach)
  - scraper-v2/ (old Puppeteer scraper)
  - canonical-hydration/ (merged into tasks)
  - Unused services: availability, crawler-logger, geolocation, etc.
  - Unused utils: age-gate-playwright, HomepageValidator, stealthBrowser

- Archive outdated docs to docs/_archive/:
  - ANALYTICS_RUNBOOK.md
  - ANALYTICS_V2_EXAMPLES.md
  - BRAND_INTELLIGENCE_API.md
  - CRAWL_PIPELINE.md
  - TASK_WORKFLOW_2024-12-10.md
  - WORKER_TASK_ARCHITECTURE.md
  - ORGANIC_SCRAPING_GUIDE.md

- Add docs/CODEBASE_MAP.md as single source of truth
- Add warning files to deprecated/archived directories
- Slim down CLAUDE.md to essential rules only

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-11 22:17:40 -07:00

102 lines
2.8 KiB
TypeScript

/**
* Scraper V2 - Scrapy-inspired web scraping framework
*
* IMPORTANT: For Dutchie stores, DO NOT USE scrapeStore() from this module.
* Dutchie crawling must go through the dutchie-az GraphQL pipeline:
* src/dutchie-az/services/product-crawler.ts
*
* This scraper-v2 module uses DOM-based extraction which is unreliable
* for Dutchie. The new dutchie-az pipeline uses GraphQL directly.
*
* Architecture:
* - Engine: Main orchestrator
* - Scheduler: Priority queue with deduplication
* - Downloader: HTTP + Browser hybrid fetcher
* - Middlewares: Request/response processing chain
* - Pipelines: Item processing and persistence
* - Navigation: Category discovery
*/
// --- Core framework classes (engine orchestration, scheduling, fetching, nav) ---
export { ScraperEngine, DutchieSpider } from './engine';
export { RequestScheduler } from './scheduler';
export { Downloader } from './downloader';
export { NavigationDiscovery } from './navigation';
// --- Request/response middleware chain ---
export {
MiddlewareEngine,
UserAgentMiddleware,
ProxyMiddleware,
RateLimitMiddleware,
RetryMiddleware,
BotDetectionMiddleware,
StealthMiddleware
} from './middlewares';
// --- Item-processing pipelines (validation, dedup, persistence, stats) ---
export {
PipelineEngine,
ValidationPipeline,
SanitizationPipeline,
DeduplicationPipeline,
ImagePipeline,
DatabasePipeline,
StatsPipeline
} from './pipelines';
// --- Canonical-schema persistence and crawl-run lifecycle helpers ---
export {
CanonicalDatabasePipeline,
createCrawlRun,
completeCrawlRun
} from './canonical-pipeline';
// Re-export all shared type definitions for consumers of this module.
export * from './types';
// Main API functions
import { ScraperEngine, DutchieSpider } from './engine';
import { NavigationDiscovery } from './navigation';
import { Downloader } from './downloader';
import { logger } from '../services/logger';
/**
* Scrape a single category
*/
/**
 * Scrape a single category of a store.
 *
 * @param storeId - ID of the store to scrape
 * @param categoryId - ID of the category within that store
 * @throws Re-throws whatever the spider throws, after logging it.
 */
export async function scrapeCategory(storeId: number, categoryId: number): Promise<void> {
  // NOTE(review): engine concurrency is hard-coded to 1 here — presumably
  // intentional for a single-category scrape; confirm.
  const engine = new ScraperEngine(1);
  const spider = new DutchieSpider(engine);
  try {
    await spider.scrapeCategory(storeId, categoryId);
  } catch (error) {
    // Log the stack when available: default string coercion of an Error
    // drops the stack, and a non-Error object would log "[object Object]".
    const detail = error instanceof Error ? error.stack ?? error.message : String(error);
    logger.error('scraper', `scrapeCategory failed: ${detail}`);
    throw error;
  }
}
/**
* Scrape an entire store
*/
/**
 * Scrape an entire store.
 *
 * @deprecated For Dutchie stores, DO NOT use this function — per the module
 * header, Dutchie crawling must go through the dutchie-az GraphQL pipeline
 * (src/dutchie-az/services/product-crawler.ts). DOM-based extraction here is
 * unreliable for Dutchie.
 *
 * @param storeId - ID of the store to scrape
 * @param parallel - Concurrency passed to the spider's store scrape (default 3)
 * @param _userAgent - Unused; retained for backward call-site compatibility
 * @throws Re-throws whatever the spider throws, after logging it.
 */
export async function scrapeStore(storeId: number, parallel: number = 3, _userAgent?: string): Promise<void> {
  // NOTE(review): the engine is constructed with concurrency 1 while
  // `parallel` is forwarded to the spider — confirm that split is intended.
  const engine = new ScraperEngine(1);
  const spider = new DutchieSpider(engine);
  try {
    await spider.scrapeStore(storeId, parallel);
  } catch (error) {
    // Log the stack when available: default string coercion of an Error
    // drops the stack, and a non-Error object would log "[object Object]".
    const detail = error instanceof Error ? error.stack ?? error.message : String(error);
    logger.error('scraper', `scrapeStore failed: ${detail}`);
    throw error;
  }
}
/**
* Discover categories for a store
*/
export async function discoverCategories(storeId: number): Promise<void> {
const downloader = new Downloader();
const discovery = new NavigationDiscovery(downloader);
try {
// Discover categories (uses your existing Dutchie category structure)
await discovery.discoverCategories(storeId);
} catch (error) {
logger.error('scraper', `discoverCategories failed: ${error}`);
throw error;
} finally {
await downloader.cleanup();
}
}