Initial commit - Dutchie dispensary scraper

This commit is contained in:
Kelly
2025-11-28 19:45:44 -07:00
commit 5757a8e9bd
23375 changed files with 3788799 additions and 0 deletions

View File

@@ -0,0 +1,89 @@
/**
* Scraper V2 - Scrapy-inspired web scraping framework
*
* Architecture:
* - Engine: Main orchestrator
* - Scheduler: Priority queue with deduplication
* - Downloader: HTTP + Browser hybrid fetcher
* - Middlewares: Request/response processing chain
* - Pipelines: Item processing and persistence
* - Navigation: Category discovery
*/
// Public surface of the scraper package: every symbol below is re-exported
// unchanged from its defining module so consumers can import from one place.

// Core components: engine and spider, scheduler, downloader, navigation discovery.
export { ScraperEngine, DutchieSpider } from './engine';
export { RequestScheduler } from './scheduler';
export { Downloader } from './downloader';
export { NavigationDiscovery } from './navigation';
// The middleware engine plus the individual middlewares defined in './middlewares'.
export {
MiddlewareEngine,
UserAgentMiddleware,
ProxyMiddleware,
RateLimitMiddleware,
RetryMiddleware,
BotDetectionMiddleware,
StealthMiddleware
} from './middlewares';
// The pipeline engine plus the individual pipelines defined in './pipelines'.
export {
PipelineEngine,
ValidationPipeline,
SanitizationPipeline,
DeduplicationPipeline,
ImagePipeline,
DatabasePipeline,
StatsPipeline
} from './pipelines';
// Everything exported by './types' is re-exported wholesale.
export * from './types';
// Main API functions
import { ScraperEngine, DutchieSpider } from './engine';
import { NavigationDiscovery } from './navigation';
import { Downloader } from './downloader';
import { logger } from '../services/logger';
/**
 * Runs a scrape of one category of one store.
 *
 * Builds a fresh `ScraperEngine` (constructed with the argument `1`), wraps it
 * in a `DutchieSpider`, and delegates to `spider.scrapeCategory`.
 *
 * NOTE(review): unlike `discoverCategories`, nothing is cleaned up after the
 * run — confirm the engine holds no resources that need to be released.
 *
 * @param storeId - Store identifier, forwarded to the spider unchanged.
 * @param categoryId - Category identifier, forwarded to the spider unchanged.
 * @returns A promise that settles once the spider call has finished.
 * @throws Rethrows whatever the spider call fails with, after logging it
 *         under the `'scraper'` tag.
 */
export async function scrapeCategory(storeId: number, categoryId: number): Promise<void> {
  const spider = new DutchieSpider(new ScraperEngine(1));

  try {
    await spider.scrapeCategory(storeId, categoryId);
  } catch (err) {
    // Log first, then propagate the original failure to the caller untouched.
    const message = `scrapeCategory failed: ${err}`;
    logger.error('scraper', message);
    throw err;
  }
}
/**
 * Runs a scrape of a whole store.
 *
 * Builds a fresh `ScraperEngine` (constructed with the argument `1`), wraps it
 * in a `DutchieSpider`, and delegates to `spider.scrapeStore`. Note that
 * `parallel` is only forwarded to the spider; the engine is always constructed
 * with `1` regardless of its value.
 *
 * NOTE(review): unlike `discoverCategories`, nothing is cleaned up after the
 * run — confirm the engine holds no resources that need to be released.
 *
 * @param storeId - Store identifier, forwarded to the spider unchanged.
 * @param parallel - Forwarded to `spider.scrapeStore`; defaults to `3`.
 * @returns A promise that settles once the spider call has finished.
 * @throws Rethrows whatever the spider call fails with, after logging it
 *         under the `'scraper'` tag.
 */
export async function scrapeStore(storeId: number, parallel: number = 3): Promise<void> {
  const spider = new DutchieSpider(new ScraperEngine(1));

  try {
    await spider.scrapeStore(storeId, parallel);
  } catch (err) {
    // Log first, then propagate the original failure to the caller untouched.
    const message = `scrapeStore failed: ${err}`;
    logger.error('scraper', message);
    throw err;
  }
}
/**
 * Runs category discovery for one store.
 *
 * Creates a `Downloader`, hands it to a new `NavigationDiscovery`, and calls
 * `discoverCategories` on it. The downloader's `cleanup()` is awaited in a
 * `finally` clause, so it runs whether discovery succeeds or fails.
 *
 * @param storeId - Store identifier, forwarded to the discovery call unchanged.
 * @returns A promise that settles once discovery and cleanup have finished.
 * @throws Rethrows whatever the discovery call fails with, after logging it
 *         under the `'scraper'` tag. An error thrown by `cleanup()` itself
 *         also propagates to the caller.
 */
export async function discoverCategories(storeId: number): Promise<void> {
  const fetcher = new Downloader();
  const navigation = new NavigationDiscovery(fetcher);

  try {
    await navigation.discoverCategories(storeId);
  } catch (err) {
    // Log first, then propagate the original failure to the caller untouched.
    const message = `discoverCategories failed: ${err}`;
    logger.error('scraper', message);
    throw err;
  } finally {
    // Always release the downloader, on success and on failure alike.
    await fetcher.cleanup();
  }
}