"use strict"; var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || (function () { var ownKeys = function(o) { ownKeys = Object.getOwnPropertyNames || function (o) { var ar = []; for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; return ar; }; return ownKeys(o); }; return function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); __setModuleDefault(result, mod); return result; }; })(); Object.defineProperty(exports, "__esModule", { value: true }); exports.DutchieSpider = exports.ScraperEngine = void 0; const scheduler_1 = require("./scheduler"); const downloader_1 = require("./downloader"); const middlewares_1 = require("./middlewares"); const pipelines_1 = require("./pipelines"); const logger_1 = require("../services/logger"); const migrate_1 = require("../db/migrate"); /** * Main Scraper Engine - orchestrates the entire scraping process */ class ScraperEngine { scheduler; downloader; middlewareEngine; pipelineEngine; stats; isRunning = false; concurrency = 1; // Conservative default constructor(concurrency = 1) { this.scheduler = new scheduler_1.RequestScheduler(); this.downloader = new downloader_1.Downloader(); this.middlewareEngine = new middlewares_1.MiddlewareEngine(); this.pipelineEngine = new pipelines_1.PipelineEngine(); this.concurrency = concurrency; // Initialize stats this.stats = { requestsTotal: 0, requestsSuccess: 0, requestsFailed: 0, itemsScraped: 0, itemsSaved: 0, itemsDropped: 0, errorsCount: 0, startTime: new Date() }; // Setup middlewares this.setupMiddlewares(); // Setup pipelines this.setupPipelines(); } /** * Setup middleware chain */ setupMiddlewares() { this.middlewareEngine.use(new middlewares_1.UserAgentMiddleware()); this.middlewareEngine.use(new middlewares_1.ProxyMiddleware()); this.middlewareEngine.use(new middlewares_1.RateLimitMiddleware()); this.middlewareEngine.use(new middlewares_1.RetryMiddleware()); this.middlewareEngine.use(new middlewares_1.BotDetectionMiddleware()); this.middlewareEngine.use(new middlewares_1.StealthMiddleware()); } /** * Setup pipeline chain */ setupPipelines() { this.pipelineEngine.use(new pipelines_1.ValidationPipeline()); this.pipelineEngine.use(new pipelines_1.SanitizationPipeline()); this.pipelineEngine.use(new pipelines_1.DeduplicationPipeline()); this.pipelineEngine.use(new pipelines_1.ImagePipeline()); this.pipelineEngine.use(new pipelines_1.StatsPipeline()); this.pipelineEngine.use(new pipelines_1.DatabasePipeline()); } /** * Add a request to the queue */ enqueue(request) { this.scheduler.enqueue(request); } /** * Start the scraping engine */ async start() { if (this.isRunning) { logger_1.logger.warn('scraper', 'Engine is already running'); return; } this.isRunning = true; this.stats.startTime = new Date(); logger_1.logger.info('scraper', `🚀 Starting scraper engine (concurrency: ${this.concurrency})`); // Process queue await this.processQueue(); this.isRunning = false; this.stats.endTime = new Date(); this.stats.duration = this.stats.endTime.getTime() - this.stats.startTime.getTime(); logger_1.logger.info('scraper', `✅ Scraper engine finished`); this.logStats(); // Cleanup await this.downloader.cleanup(); } /** * Process the request queue */ async processQueue() { while (!this.scheduler.isEmpty() && this.isRunning) { const request = this.scheduler.dequeue(); if (!request) { // Wait a bit and check again await new Promise(resolve => setTimeout(resolve, 100)); continue; } try { await this.processRequest(request); } catch (error) { logger_1.logger.error('scraper', `Failed to process request: ${error}`); } } } /** * Process a single request */ async processRequest(request) { this.stats.requestsTotal++; try { logger_1.logger.debug('scraper', `Processing: ${request.url}`); // Apply request middlewares const processedRequest = await this.middlewareEngine.processRequest(request); // Download let response = await this.downloader.fetch(processedRequest); // Apply response middlewares response = await this.middlewareEngine.processResponse(response); // Parse response using callback const parseResult = await request.callback(response); // Process items through pipeline if (parseResult.items && parseResult.items.length > 0) { for (const item of parseResult.items) { await this.processItem(item, 'default'); } } // Enqueue follow-up requests if (parseResult.requests && parseResult.requests.length > 0) { for (const followUpRequest of parseResult.requests) { this.scheduler.enqueue(followUpRequest); } } this.stats.requestsSuccess++; this.scheduler.markComplete(request); } catch (error) { this.stats.requestsFailed++; this.stats.errorsCount++; logger_1.logger.error('scraper', `Request failed: ${request.url} - ${error.message}`); // Apply error middlewares const handledError = await this.middlewareEngine.processError(error, request); // If error is null, it was handled (e.g., retry) if (handledError === null) { this.scheduler.requeueForRetry(request); } else { this.scheduler.markComplete(request); // Call error handler if provided if (request.errorHandler) { await request.errorHandler(error, request); } } } } /** * Process an item through pipelines */ async processItem(item, spider) { this.stats.itemsScraped++; try { const processedItem = await this.pipelineEngine.processItem(item, spider); if (processedItem) { this.stats.itemsSaved++; } else { this.stats.itemsDropped++; } } catch (error) { logger_1.logger.error('scraper', `Failed to process item: ${error}`); this.stats.itemsDropped++; } } /** * Log statistics */ logStats() { logger_1.logger.info('scraper', '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'); logger_1.logger.info('scraper', '📊 Scraper Statistics'); logger_1.logger.info('scraper', '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'); logger_1.logger.info('scraper', ` Requests: ${this.stats.requestsSuccess}/${this.stats.requestsTotal} successful`); logger_1.logger.info('scraper', ` Items: ${this.stats.itemsSaved} saved, ${this.stats.itemsDropped} dropped`); logger_1.logger.info('scraper', ` Errors: ${this.stats.errorsCount}`); logger_1.logger.info('scraper', ` Duration: ${Math.round((this.stats.duration || 0) / 1000)}s`); // Get stats from StatsPipeline const statsPipeline = this.pipelineEngine.getPipeline('StatsPipeline'); if (statsPipeline) { const itemStats = statsPipeline.getStats(); logger_1.logger.info('scraper', ` Items with images: ${itemStats.withImages}/${itemStats.total}`); logger_1.logger.info('scraper', ` Items with THC: ${itemStats.withThc}/${itemStats.total}`); logger_1.logger.info('scraper', ` Items with descriptions: ${itemStats.withDescription}/${itemStats.total}`); } logger_1.logger.info('scraper', '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'); } /** * Stop the engine */ stop() { this.isRunning = false; logger_1.logger.info('scraper', 'Stopping scraper engine...'); } /** * Get current stats */ getStats() { return { ...this.stats }; } /** * Get queue stats */ getQueueStats() { return this.scheduler.getStats(); } } exports.ScraperEngine = ScraperEngine; /** * Spider for scraping Dutchie categories */ class DutchieSpider { engine; constructor(engine) { this.engine = engine; } /** * Scrape a category */ async scrapeCategory(storeId, categoryId) { logger_1.logger.info('scraper', `Starting category scrape: store=${storeId}, category=${categoryId}`); const scraperId = `scraper-${storeId}-${categoryId}-${Date.now()}`; let registerScraper, updateScraperStats, completeScraper; try { // Import monitoring functions const monitor = await Promise.resolve().then(() => __importStar(require('../routes/scraper-monitor'))); registerScraper = monitor.registerScraper; updateScraperStats = monitor.updateScraperStats; completeScraper = monitor.completeScraper; } catch (e) { // Monitoring not available } try { // Get category info const categoryResult = await migrate_1.pool.query(` SELECT c.*, s.slug as store_slug, s.name as store_name FROM categories c JOIN stores s ON c.store_id = s.id WHERE c.id = $1 `, [categoryId]); if (categoryResult.rows.length === 0) { throw new Error('Category not found'); } const category = categoryResult.rows[0]; logger_1.logger.info('scraper', `Category: ${category.name} (${category.dutchie_url})`); // Register with monitoring system if (registerScraper) { registerScraper(scraperId, storeId, category.store_name, categoryId, category.name); } // Mark products as out of stock before scraping await migrate_1.pool.query(` UPDATE products SET in_stock = false WHERE store_id = $1 AND category_id = $2 `, [storeId, categoryId]); if (updateScraperStats) { updateScraperStats(scraperId, {}, 'Marking products as out of stock'); } // Enqueue category page request this.engine.enqueue({ url: category.dutchie_url, priority: 100, maxRetries: 3, metadata: { requiresBrowser: true, storeId, categoryId, categorySlug: category.slug, storeSlug: category.store_slug }, callback: this.parseCategoryPage.bind(this) }); // Start the engine if (updateScraperStats) { updateScraperStats(scraperId, {}, 'Scraping category page'); } await this.engine.start(); // Update stats from engine const engineStats = this.engine.getStats(); if (updateScraperStats) { updateScraperStats(scraperId, { requestsTotal: engineStats.requestsTotal, requestsSuccess: engineStats.requestsSuccess, itemsSaved: engineStats.itemsSaved, itemsDropped: engineStats.itemsDropped, errorsCount: engineStats.errorsCount }, 'Finalizing'); } // Update category last_scraped_at await migrate_1.pool.query(` UPDATE categories SET last_scraped_at = CURRENT_TIMESTAMP WHERE id = $1 `, [categoryId]); logger_1.logger.info('scraper', `✅ Category scrape completed: ${category.name}`); if (completeScraper) { completeScraper(scraperId); } } catch (error) { logger_1.logger.error('scraper', `Category scrape failed: ${error}`); if (completeScraper) { completeScraper(scraperId, String(error)); } throw error; } } /** * Parse category page (product listing) */ async parseCategoryPage(response) { const page = await this.engine['downloader'].getCurrentPage(); if (!page) { throw new Error('No active page'); } logger_1.logger.info('scraper', 'Parsing category page...'); // Extract product cards const productCards = await page.evaluate(() => { // @ts-ignore - runs in browser context const cards = document.querySelectorAll('[data-testid="product-list-item"]'); const items = []; cards.forEach((card) => { try { const allText = card.textContent || ''; // Extract name let name = ''; const nameSelectors = ['a[href*="/product/"]', 'h1', 'h2', 'h3', 'h4']; for (const sel of nameSelectors) { const el = card.querySelector(sel); if (el?.textContent?.trim()) { name = el.textContent.trim().split('\n')[0].trim(); break; } } if (!name || name.length < 2) return; // Extract price let price = null; let originalPrice = null; const priceMatches = allText.match(/\$(\d+\.?\d*)/g); if (priceMatches && priceMatches.length > 0) { price = parseFloat(priceMatches[0].replace('$', '')); if (priceMatches.length > 1) { originalPrice = parseFloat(priceMatches[1].replace('$', '')); } } // Extract link const linkEl = card.querySelector('a[href*="/product/"]'); let href = linkEl?.getAttribute('href') || ''; if (href && href.startsWith('/')) { // @ts-ignore - runs in browser context href = window.location.origin + href; } // Extract image URL from product card let imageUrl = null; const imgSelectors = [ 'img[src*="images.dutchie.com"]', 'img[src*="dutchie"]', 'img[data-testid*="product"]', 'img[class*="product"]', 'img[class*="Product"]', 'picture img', 'img' ]; for (const sel of imgSelectors) { const img = card.querySelector(sel); if (img) { const src = img.getAttribute('src') || img.getAttribute('data-src') || ''; if (src && (src.includes('dutchie.com') || src.includes('images.'))) { imageUrl = src; break; } } } items.push({ name, price, originalPrice, href, imageUrl }); } catch (err) { console.error('Error parsing product card:', err); } }); return items; }); logger_1.logger.info('scraper', `Found ${productCards.length} products on listing page`); // Create follow-up requests for each product const requests = productCards.map((card, index) => ({ url: card.href, priority: 50, maxRetries: 3, metadata: { ...response.request.metadata, productName: card.name, productPrice: card.price, productOriginalPrice: card.originalPrice, productImageUrl: card.imageUrl, // Pass image from category page requiresBrowser: true }, callback: this.parseProductPage.bind(this) })); return { items: [], requests }; } /** * Parse individual product page */ async parseProductPage(response) { const page = await this.engine['downloader'].getCurrentPage(); if (!page) { throw new Error('No active page'); } const productName = response.request.metadata.productName; logger_1.logger.debug('scraper', `Parsing product: ${productName}`); // Extract product details const details = await page.evaluate(() => { // @ts-ignore - runs in browser context const allText = document.body.textContent || ''; // Extract image - expanded selectors for better coverage let fullSizeImage = null; const mainImageSelectors = [ 'img[src*="images.dutchie.com"]', 'img[src*="dutchie"]', 'img[class*="ProductImage"]', 'img[class*="product-image"]', 'img[class*="Product"]', '[class*="ImageGallery"] img', '[data-testid*="product"] img', '[data-testid*="image"] img', 'picture img', 'main img' ]; for (const sel of mainImageSelectors) { // @ts-ignore - runs in browser context const img = document.querySelector(sel); const src = img?.src || img?.getAttribute('data-src') || ''; if (src && (src.includes('dutchie.com') || src.includes('images.'))) { fullSizeImage = src; break; } } // Extract description let description = ''; const descSelectors = [ '[class*="description"]', '[class*="Description"]', '[data-testid*="description"]', 'p[class*="product"]' ]; for (const sel of descSelectors) { // @ts-ignore - runs in browser context const el = document.querySelector(sel); if (el?.textContent?.trim() && el.textContent.length > 20) { description = el.textContent.trim(); break; } } // Extract THC/CBD let thc = null; const thcPatterns = [ /THC[:\s]*(\d+\.?\d*)\s*%/i, /Total\s+THC[:\s]*(\d+\.?\d*)\s*%/i, /(\d+\.?\d*)\s*%\s+THC/i ]; for (const pattern of thcPatterns) { const match = allText.match(pattern); if (match) { thc = parseFloat(match[1]); break; } } let cbd = null; const cbdPatterns = [ /CBD[:\s]*(\d+\.?\d*)\s*%/i, /Total\s+CBD[:\s]*(\d+\.?\d*)\s*%/i, /(\d+\.?\d*)\s*%\s+CBD/i ]; for (const pattern of cbdPatterns) { const match = allText.match(pattern); if (match) { cbd = parseFloat(match[1]); break; } } // Extract strain type let strainType = null; if (allText.match(/\bindica\b/i)) strainType = 'Indica'; else if (allText.match(/\bsativa\b/i)) strainType = 'Sativa'; else if (allText.match(/\bhybrid\b/i)) strainType = 'Hybrid'; // Extract brand let brand = null; const brandSelectors = [ '[class*="brand"]', '[class*="Brand"]', '[data-testid*="brand"]' ]; for (const sel of brandSelectors) { // @ts-ignore - runs in browser context const el = document.querySelector(sel); if (el?.textContent?.trim()) { brand = el.textContent.trim(); break; } } // Extract metadata const terpenes = []; const terpeneNames = ['Myrcene', 'Limonene', 'Caryophyllene', 'Pinene', 'Linalool', 'Humulene']; terpeneNames.forEach(terp => { if (allText.match(new RegExp(`\\b${terp}\\b`, 'i'))) { terpenes.push(terp); } }); const effects = []; const effectNames = ['Relaxed', 'Happy', 'Euphoric', 'Uplifted', 'Creative', 'Energetic']; effectNames.forEach(effect => { if (allText.match(new RegExp(`\\b${effect}\\b`, 'i'))) { effects.push(effect); } }); return { fullSizeImage, description, thc, cbd, strainType, brand, terpenes, effects }; }); // Create product item // Use image from product page, fallback to category page image const imageUrl = details.fullSizeImage || response.request.metadata.productImageUrl || undefined; const product = { dutchieProductId: `${response.request.metadata.storeSlug}-${response.request.metadata.categorySlug}-${Date.now()}-${Math.random()}`, name: productName || 'Unknown Product', description: details.description, price: response.request.metadata.productPrice, originalPrice: response.request.metadata.productOriginalPrice, thcPercentage: details.thc || undefined, cbdPercentage: details.cbd || undefined, strainType: details.strainType || undefined, brand: details.brand || undefined, imageUrl: imageUrl, dutchieUrl: response.url, metadata: { terpenes: details.terpenes, effects: details.effects }, storeId: response.request.metadata.storeId, categoryId: response.request.metadata.categoryId }; return { items: [product], requests: [] }; } /** * Scrape entire store */ async scrapeStore(storeId, parallel = 3) { logger_1.logger.info('scraper', `🏪 Starting store scrape: ${storeId} (${parallel} parallel scrapers)`); try { // Check if categories exist, if not, discover them first const categoryCountResult = await migrate_1.pool.query(` SELECT COUNT(*) as count FROM categories WHERE store_id = $1 `, [storeId]); if (parseInt(categoryCountResult.rows[0].count) === 0) { logger_1.logger.info('scraper', 'No categories found - running discovery first'); const { discoverCategories } = await Promise.resolve().then(() => __importStar(require('./index'))); await discoverCategories(storeId); } // Get all leaf categories (no children) const categoriesResult = await migrate_1.pool.query(` SELECT c.id, c.name FROM categories c WHERE c.store_id = $1 AND c.scrape_enabled = true AND NOT EXISTS ( SELECT 1 FROM categories child WHERE child.parent_id = c.id ) ORDER BY c.name `, [storeId]); const categories = categoriesResult.rows; logger_1.logger.info('scraper', `Found ${categories.length} categories to scrape`); if (parallel === 1) { // Sequential scraping (original behavior) for (const category of categories) { try { await this.scrapeCategory(storeId, category.id); await new Promise(resolve => setTimeout(resolve, 3000)); } catch (error) { logger_1.logger.error('scraper', `Failed to scrape category ${category.name}: ${error}`); } } } else { // Parallel scraping with concurrency limit const results = await this.scrapeMultipleCategoriesParallel(storeId, categories, parallel); const successful = results.filter(r => r.status === 'fulfilled').length; const failed = results.filter(r => r.status === 'rejected').length; logger_1.logger.info('scraper', `Parallel scrape results: ${successful} successful, ${failed} failed`); } // Update store last_scraped_at await migrate_1.pool.query(` UPDATE stores SET last_scraped_at = CURRENT_TIMESTAMP WHERE id = $1 `, [storeId]); logger_1.logger.info('scraper', `🎉 Store scrape completed: ${storeId}`); } catch (error) { logger_1.logger.error('scraper', `Store scrape failed: ${error}`); throw error; } } /** * Scrape multiple categories in parallel with concurrency limit */ async scrapeMultipleCategoriesParallel(storeId, categories, concurrency) { const results = []; // Process categories in batches for (let i = 0; i < categories.length; i += concurrency) { const batch = categories.slice(i, i + concurrency); logger_1.logger.info('scraper', `Scraping batch ${Math.floor(i / concurrency) + 1}: ${batch.map(c => c.name).join(', ')}`); const batchPromises = batch.map(category => { // Create a new spider instance for each category const engine = new ScraperEngine(1); // 1 concurrent request per spider const spider = new DutchieSpider(engine); return spider.scrapeCategory(storeId, category.id) .catch(error => { logger_1.logger.error('scraper', `Category ${category.name} failed: ${error}`); throw error; }); }); const batchResults = await Promise.allSettled(batchPromises); results.push(...batchResults); // Delay between batches to avoid overwhelming the server if (i + concurrency < categories.length) { logger_1.logger.info('scraper', 'Waiting 5s before next batch...'); await new Promise(resolve => setTimeout(resolve, 5000)); } } return results; } } exports.DutchieSpider = DutchieSpider;