Files
cannaiq/backend/dist/scraper-v2/engine.js
2025-11-28 19:45:44 -07:00

653 lines
26 KiB
JavaScript

"use strict";
// TypeScript emit helper: re-export a property from module `source` onto
// `target` (optionally under a different name), preserving live-binding
// semantics via a getter when property descriptors are available.
var __createBinding = (this && this.__createBinding) || (Object.create
    ? function (target, source, key, alias) {
        var bindName = alias === undefined ? key : alias;
        var desc = Object.getOwnPropertyDescriptor(source, key);
        // Fall back to a forwarding getter when there is no usable descriptor,
        // or when the descriptor would not reflect later writes to source[key].
        var needsGetter = !desc || ("get" in desc ? !source.__esModule : desc.writable || desc.configurable);
        if (needsGetter) {
            desc = { enumerable: true, get: function () { return source[key]; } };
        }
        Object.defineProperty(target, bindName, desc);
    }
    : function (target, source, key, alias) {
        // Legacy environments: plain copy (no live binding).
        target[alias === undefined ? key : alias] = source[key];
    });
// TypeScript emit helper: attach the original module as the `default`
// property of the namespace object produced by __importStar.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create
    ? function (target, mod) {
        Object.defineProperty(target, "default", { enumerable: true, value: mod });
    }
    : function (target, mod) {
        target["default"] = mod;
    });
// TypeScript emit helper: emulate `import * as ns from "mod"` for CommonJS
// modules. ES modules pass through unchanged; plain CJS exports are copied
// onto a fresh namespace object with the module itself as `default`.
var __importStar = (this && this.__importStar) || (function () {
    // Lazily resolve the key-enumeration strategy on first use, then memoize
    // it by reassigning `ownKeys` so subsequent calls skip the feature check.
    var ownKeys = function (o) {
        ownKeys = Object.getOwnPropertyNames || function (obj) {
            var names = [];
            for (var key in obj) {
                if (Object.prototype.hasOwnProperty.call(obj, key)) names.push(key);
            }
            return names;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) {
            var keys = ownKeys(mod);
            for (var i = 0; i < keys.length; i++) {
                if (keys[i] !== "default") __createBinding(result, mod, keys[i]);
            }
        }
        __setModuleDefault(result, mod);
        return result;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.DutchieSpider = exports.ScraperEngine = void 0;
const scheduler_1 = require("./scheduler");
const downloader_1 = require("./downloader");
const middlewares_1 = require("./middlewares");
const pipelines_1 = require("./pipelines");
const logger_1 = require("../services/logger");
const migrate_1 = require("../db/migrate");
/**
 * Main Scraper Engine - orchestrates the entire scraping process.
 *
 * Wires together the request scheduler, downloader, middleware chain and
 * item pipelines, then drains the request queue with a pool of workers
 * sized by `concurrency`.
 */
class ScraperEngine {
    scheduler;
    downloader;
    middlewareEngine;
    pipelineEngine;
    stats;          // run statistics, reset on each start()
    isRunning = false;
    concurrency = 1; // Conservative default

    /**
     * @param {number} [concurrency=1] - Maximum number of requests processed
     *   in parallel. 1 reproduces strictly sequential crawling.
     */
    constructor(concurrency = 1) {
        this.scheduler = new scheduler_1.RequestScheduler();
        this.downloader = new downloader_1.Downloader();
        this.middlewareEngine = new middlewares_1.MiddlewareEngine();
        this.pipelineEngine = new pipelines_1.PipelineEngine();
        this.concurrency = concurrency;
        // Initialize stats
        this.stats = {
            requestsTotal: 0,
            requestsSuccess: 0,
            requestsFailed: 0,
            itemsScraped: 0,
            itemsSaved: 0,
            itemsDropped: 0,
            errorsCount: 0,
            startTime: new Date()
        };
        // Setup middlewares
        this.setupMiddlewares();
        // Setup pipelines
        this.setupPipelines();
    }
    /**
     * Setup middleware chain (order matters: executed in registration order).
     */
    setupMiddlewares() {
        this.middlewareEngine.use(new middlewares_1.UserAgentMiddleware());
        this.middlewareEngine.use(new middlewares_1.ProxyMiddleware());
        this.middlewareEngine.use(new middlewares_1.RateLimitMiddleware());
        this.middlewareEngine.use(new middlewares_1.RetryMiddleware());
        this.middlewareEngine.use(new middlewares_1.BotDetectionMiddleware());
        this.middlewareEngine.use(new middlewares_1.StealthMiddleware());
    }
    /**
     * Setup item pipeline chain (validation → sanitization → dedup → images
     * → stats → database).
     */
    setupPipelines() {
        this.pipelineEngine.use(new pipelines_1.ValidationPipeline());
        this.pipelineEngine.use(new pipelines_1.SanitizationPipeline());
        this.pipelineEngine.use(new pipelines_1.DeduplicationPipeline());
        this.pipelineEngine.use(new pipelines_1.ImagePipeline());
        this.pipelineEngine.use(new pipelines_1.StatsPipeline());
        this.pipelineEngine.use(new pipelines_1.DatabasePipeline());
    }
    /**
     * Add a request to the queue.
     * @param request - Request descriptor (url, priority, metadata, callback).
     */
    enqueue(request) {
        this.scheduler.enqueue(request);
    }
    /**
     * Start the scraping engine. Resolves once the queue has been fully
     * drained (including follow-up requests) and the downloader is cleaned up.
     * No-op if the engine is already running.
     */
    async start() {
        if (this.isRunning) {
            logger_1.logger.warn('scraper', 'Engine is already running');
            return;
        }
        this.isRunning = true;
        this.stats.startTime = new Date();
        logger_1.logger.info('scraper', `🚀 Starting scraper engine (concurrency: ${this.concurrency})`);
        // Process queue
        await this.processQueue();
        this.isRunning = false;
        this.stats.endTime = new Date();
        this.stats.duration = this.stats.endTime.getTime() - this.stats.startTime.getTime();
        logger_1.logger.info('scraper', `✅ Scraper engine finished`);
        this.logStats();
        // Cleanup
        await this.downloader.cleanup();
    }
    /**
     * Process the request queue with a pool of concurrent workers.
     *
     * FIX: previously the queue was drained strictly sequentially and the
     * configured `concurrency` was never used. We now run up to
     * `concurrency` workers; an in-flight counter prevents a worker from
     * exiting while a sibling request is still being processed (it may
     * enqueue follow-up requests). With concurrency = 1 this behaves
     * exactly like the original sequential loop.
     */
    async processQueue() {
        let inFlight = 0;
        const worker = async () => {
            while (this.isRunning) {
                const request = this.scheduler.dequeue();
                if (!request) {
                    // Queue is empty (or the scheduler is throttling). If no
                    // request is in flight either, no follow-ups can appear.
                    if (inFlight === 0 && this.scheduler.isEmpty()) {
                        break;
                    }
                    // Wait a bit and check again
                    await new Promise(resolve => setTimeout(resolve, 100));
                    continue;
                }
                inFlight++;
                try {
                    await this.processRequest(request);
                }
                catch (error) {
                    logger_1.logger.error('scraper', `Failed to process request: ${error}`);
                }
                finally {
                    inFlight--;
                }
            }
        };
        const workerCount = Math.max(1, this.concurrency);
        await Promise.all(Array.from({ length: workerCount }, () => worker()));
    }
    /**
     * Process a single request: middleware → download → middleware → parse
     * callback → pipelines, enqueueing any follow-up requests the callback
     * returns. Errors are routed through the error middlewares; a `null`
     * result signals "handled, requeue for retry".
     * NOTE: each failed attempt increments requestsFailed, so a request that
     * eventually succeeds after retries is counted in both tallies.
     */
    async processRequest(request) {
        this.stats.requestsTotal++;
        try {
            logger_1.logger.debug('scraper', `Processing: ${request.url}`);
            // Apply request middlewares
            const processedRequest = await this.middlewareEngine.processRequest(request);
            // Download
            let response = await this.downloader.fetch(processedRequest);
            // Apply response middlewares
            response = await this.middlewareEngine.processResponse(response);
            // Parse response using callback
            const parseResult = await request.callback(response);
            // Process items through pipeline
            if (parseResult.items && parseResult.items.length > 0) {
                for (const item of parseResult.items) {
                    await this.processItem(item, 'default');
                }
            }
            // Enqueue follow-up requests
            if (parseResult.requests && parseResult.requests.length > 0) {
                for (const followUpRequest of parseResult.requests) {
                    this.scheduler.enqueue(followUpRequest);
                }
            }
            this.stats.requestsSuccess++;
            this.scheduler.markComplete(request);
        }
        catch (error) {
            this.stats.requestsFailed++;
            this.stats.errorsCount++;
            logger_1.logger.error('scraper', `Request failed: ${request.url} - ${error.message}`);
            // Apply error middlewares
            const handledError = await this.middlewareEngine.processError(error, request);
            // If error is null, it was handled (e.g., retry)
            if (handledError === null) {
                this.scheduler.requeueForRetry(request);
            }
            else {
                this.scheduler.markComplete(request);
                // Call error handler if provided
                if (request.errorHandler) {
                    await request.errorHandler(error, request);
                }
            }
        }
    }
    /**
     * Process an item through the pipeline chain.
     * A falsy pipeline result (or a pipeline throw) counts as dropped.
     * @param item - Scraped item.
     * @param spider - Spider name passed through to the pipelines.
     */
    async processItem(item, spider) {
        this.stats.itemsScraped++;
        try {
            const processedItem = await this.pipelineEngine.processItem(item, spider);
            if (processedItem) {
                this.stats.itemsSaved++;
            }
            else {
                this.stats.itemsDropped++;
            }
        }
        catch (error) {
            logger_1.logger.error('scraper', `Failed to process item: ${error}`);
            this.stats.itemsDropped++;
        }
    }
    /**
     * Log run statistics, including per-item quality stats from the
     * StatsPipeline when available.
     */
    logStats() {
        logger_1.logger.info('scraper', '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
        logger_1.logger.info('scraper', '📊 Scraper Statistics');
        logger_1.logger.info('scraper', '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
        logger_1.logger.info('scraper', `  Requests: ${this.stats.requestsSuccess}/${this.stats.requestsTotal} successful`);
        logger_1.logger.info('scraper', `  Items: ${this.stats.itemsSaved} saved, ${this.stats.itemsDropped} dropped`);
        logger_1.logger.info('scraper', `  Errors: ${this.stats.errorsCount}`);
        logger_1.logger.info('scraper', `  Duration: ${Math.round((this.stats.duration || 0) / 1000)}s`);
        // Get stats from StatsPipeline
        const statsPipeline = this.pipelineEngine.getPipeline('StatsPipeline');
        if (statsPipeline) {
            const itemStats = statsPipeline.getStats();
            logger_1.logger.info('scraper', `  Items with images: ${itemStats.withImages}/${itemStats.total}`);
            logger_1.logger.info('scraper', `  Items with THC: ${itemStats.withThc}/${itemStats.total}`);
            logger_1.logger.info('scraper', `  Items with descriptions: ${itemStats.withDescription}/${itemStats.total}`);
        }
        logger_1.logger.info('scraper', '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
    }
    /**
     * Stop the engine. Workers observe the flag and exit after finishing
     * their current request.
     */
    stop() {
        this.isRunning = false;
        logger_1.logger.info('scraper', 'Stopping scraper engine...');
    }
    /**
     * Get a snapshot copy of the current stats.
     */
    getStats() {
        return { ...this.stats };
    }
    /**
     * Get queue stats from the scheduler.
     */
    getQueueStats() {
        return this.scheduler.getStats();
    }
}
exports.ScraperEngine = ScraperEngine;
/**
 * Spider for scraping Dutchie categories.
 *
 * Drives a ScraperEngine: enqueues the category listing page, whose parse
 * callback fans out one request per product, whose parse callback yields a
 * product item for the pipelines. Also records progress in an optional
 * monitoring module and stamps scrape timestamps in the database.
 */
class DutchieSpider {
    engine; // ScraperEngine that executes this spider's requests
    constructor(engine) {
        this.engine = engine;
    }
    /**
     * Scrape a single category: mark its products out of stock, enqueue the
     * category page, run the engine to completion, then stamp
     * categories.last_scraped_at. Rethrows on failure after notifying the
     * monitor (if available).
     * @param storeId - DB id of the store.
     * @param categoryId - DB id of the category (must exist, else throws).
     */
    async scrapeCategory(storeId, categoryId) {
        logger_1.logger.info('scraper', `Starting category scrape: store=${storeId}, category=${categoryId}`);
        const scraperId = `scraper-${storeId}-${categoryId}-${Date.now()}`;
        let registerScraper, updateScraperStats, completeScraper;
        try {
            // Import monitoring functions (best-effort: the route module may
            // not exist in every deployment, so failures are swallowed and
            // the hooks stay undefined).
            const monitor = await Promise.resolve().then(() => __importStar(require('../routes/scraper-monitor')));
            registerScraper = monitor.registerScraper;
            updateScraperStats = monitor.updateScraperStats;
            completeScraper = monitor.completeScraper;
        }
        catch (e) {
            // Monitoring not available
        }
        try {
            // Get category info (joined with its store for slug/name)
            const categoryResult = await migrate_1.pool.query(`
                SELECT c.*, s.slug as store_slug, s.name as store_name
                FROM categories c
                JOIN stores s ON c.store_id = s.id
                WHERE c.id = $1
            `, [categoryId]);
            if (categoryResult.rows.length === 0) {
                throw new Error('Category not found');
            }
            const category = categoryResult.rows[0];
            logger_1.logger.info('scraper', `Category: ${category.name} (${category.dutchie_url})`);
            // Register with monitoring system
            if (registerScraper) {
                registerScraper(scraperId, storeId, category.store_name, categoryId, category.name);
            }
            // Mark products as out of stock before scraping; the pipeline
            // presumably flips in_stock back on for products seen this run
            // — TODO confirm against DatabasePipeline.
            await migrate_1.pool.query(`
                UPDATE products
                SET in_stock = false
                WHERE store_id = $1 AND category_id = $2
            `, [storeId, categoryId]);
            if (updateScraperStats) {
                updateScraperStats(scraperId, {}, 'Marking products as out of stock');
            }
            // Enqueue category page request (high priority, browser-rendered)
            this.engine.enqueue({
                url: category.dutchie_url,
                priority: 100,
                maxRetries: 3,
                metadata: {
                    requiresBrowser: true,
                    storeId,
                    categoryId,
                    categorySlug: category.slug,
                    storeSlug: category.store_slug
                },
                callback: this.parseCategoryPage.bind(this)
            });
            // Start the engine (resolves when the queue is drained)
            if (updateScraperStats) {
                updateScraperStats(scraperId, {}, 'Scraping category page');
            }
            await this.engine.start();
            // Update stats from engine
            const engineStats = this.engine.getStats();
            if (updateScraperStats) {
                updateScraperStats(scraperId, {
                    requestsTotal: engineStats.requestsTotal,
                    requestsSuccess: engineStats.requestsSuccess,
                    itemsSaved: engineStats.itemsSaved,
                    itemsDropped: engineStats.itemsDropped,
                    errorsCount: engineStats.errorsCount
                }, 'Finalizing');
            }
            // Update category last_scraped_at
            await migrate_1.pool.query(`
                UPDATE categories
                SET last_scraped_at = CURRENT_TIMESTAMP
                WHERE id = $1
            `, [categoryId]);
            logger_1.logger.info('scraper', `✅ Category scrape completed: ${category.name}`);
            if (completeScraper) {
                completeScraper(scraperId);
            }
        }
        catch (error) {
            logger_1.logger.error('scraper', `Category scrape failed: ${error}`);
            if (completeScraper) {
                completeScraper(scraperId, error.toString());
            }
            throw error;
        }
    }
    /**
     * Parse category page (product listing).
     * Extracts name/price/link from each product card in the live browser
     * page and returns one follow-up request per product (no items yet).
     * NOTE(review): reaches into the engine's downloader via bracket access
     * to bypass TS visibility; relies on a single "current page", so this
     * assumes one in-flight browser request per engine — confirm if engine
     * concurrency is ever raised above 1.
     * @param response - Downloader response whose request carries the
     *   store/category metadata to propagate.
     * @returns {{items: [], requests: object[]}} follow-up product requests.
     */
    async parseCategoryPage(response) {
        const page = await this.engine['downloader'].getCurrentPage();
        if (!page) {
            throw new Error('No active page');
        }
        logger_1.logger.info('scraper', 'Parsing category page...');
        // Extract product cards (this function body runs inside the browser)
        const productCards = await page.evaluate(() => {
            // @ts-ignore - runs in browser context
            const cards = document.querySelectorAll('[data-testid="product-list-item"]');
            const items = [];
            cards.forEach((card) => {
                try {
                    const allText = card.textContent || '';
                    // Extract name: first matching selector wins; only the
                    // first text line is kept (cards concatenate name/price)
                    let name = '';
                    const nameSelectors = ['a[href*="/product/"]', 'h1', 'h2', 'h3', 'h4'];
                    for (const sel of nameSelectors) {
                        const el = card.querySelector(sel);
                        if (el?.textContent?.trim()) {
                            name = el.textContent.trim().split('\n')[0].trim();
                            break;
                        }
                    }
                    if (!name || name.length < 2)
                        return; // skip cards without a usable name
                    // Extract price — assumes the first $-amount in the card
                    // is the current price and the second (if any) is the
                    // pre-discount price; TODO confirm against live markup
                    let price = null;
                    let originalPrice = null;
                    const priceMatches = allText.match(/\$(\d+\.?\d*)/g);
                    if (priceMatches && priceMatches.length > 0) {
                        price = parseFloat(priceMatches[0].replace('$', ''));
                        if (priceMatches.length > 1) {
                            originalPrice = parseFloat(priceMatches[1].replace('$', ''));
                        }
                    }
                    // Extract link (resolve relative hrefs against the page origin)
                    const linkEl = card.querySelector('a[href*="/product/"]');
                    let href = linkEl?.getAttribute('href') || '';
                    if (href && href.startsWith('/')) {
                        // @ts-ignore - runs in browser context
                        href = window.location.origin + href;
                    }
                    items.push({ name, price, originalPrice, href });
                }
                catch (err) {
                    console.error('Error parsing product card:', err);
                }
            });
            return items;
        });
        logger_1.logger.info('scraper', `Found ${productCards.length} products on listing page`);
        // Create follow-up requests for each product, carrying listing-page
        // data (name/prices) in metadata so the product page parse can use it
        const requests = productCards.map((card, index) => ({
            url: card.href,
            priority: 50,
            maxRetries: 3,
            metadata: {
                ...response.request.metadata,
                productName: card.name,
                productPrice: card.price,
                productOriginalPrice: card.originalPrice,
                requiresBrowser: true
            },
            callback: this.parseProductPage.bind(this)
        }));
        return { items: [], requests };
    }
    /**
     * Parse individual product page.
     * Extracts image, description, THC/CBD percentages, strain type, brand,
     * terpenes and effects from the live page, then combines them with the
     * listing-page metadata into a single product item.
     * @param response - Downloader response for the product page.
     * @returns {{items: object[], requests: []}} exactly one product item.
     */
    async parseProductPage(response) {
        const page = await this.engine['downloader'].getCurrentPage();
        if (!page) {
            throw new Error('No active page');
        }
        const productName = response.request.metadata.productName;
        logger_1.logger.debug('scraper', `Parsing product: ${productName}`);
        // Extract product details (this function body runs inside the browser)
        const details = await page.evaluate(() => {
            // @ts-ignore - runs in browser context
            const allText = document.body.textContent || '';
            // Extract image: first selector hit that points at a dutchie.com asset
            let fullSizeImage = null;
            const mainImageSelectors = [
                'img[class*="ProductImage"]',
                'img[class*="product-image"]',
                '[class*="ImageGallery"] img',
                'main img',
                'img[src*="images.dutchie.com"]'
            ];
            for (const sel of mainImageSelectors) {
                // @ts-ignore - runs in browser context
                const img = document.querySelector(sel);
                if (img?.src && img.src.includes('dutchie.com')) {
                    fullSizeImage = img.src;
                    break;
                }
            }
            // Extract description (requires > 20 chars to skip labels/stubs)
            let description = '';
            const descSelectors = [
                '[class*="description"]',
                '[class*="Description"]',
                '[data-testid*="description"]',
                'p[class*="product"]'
            ];
            for (const sel of descSelectors) {
                // @ts-ignore - runs in browser context
                const el = document.querySelector(sel);
                if (el?.textContent?.trim() && el.textContent.length > 20) {
                    description = el.textContent.trim();
                    break;
                }
            }
            // Extract THC/CBD percentages from the page's full text;
            // first matching pattern wins
            let thc = null;
            const thcPatterns = [
                /THC[:\s]*(\d+\.?\d*)\s*%/i,
                /Total\s+THC[:\s]*(\d+\.?\d*)\s*%/i,
                /(\d+\.?\d*)\s*%\s+THC/i
            ];
            for (const pattern of thcPatterns) {
                const match = allText.match(pattern);
                if (match) {
                    thc = parseFloat(match[1]);
                    break;
                }
            }
            let cbd = null;
            const cbdPatterns = [
                /CBD[:\s]*(\d+\.?\d*)\s*%/i,
                /Total\s+CBD[:\s]*(\d+\.?\d*)\s*%/i,
                /(\d+\.?\d*)\s*%\s+CBD/i
            ];
            for (const pattern of cbdPatterns) {
                const match = allText.match(pattern);
                if (match) {
                    cbd = parseFloat(match[1]);
                    break;
                }
            }
            // Extract strain type by keyword (Indica checked before Sativa/Hybrid)
            let strainType = null;
            if (allText.match(/\bindica\b/i))
                strainType = 'Indica';
            else if (allText.match(/\bsativa\b/i))
                strainType = 'Sativa';
            else if (allText.match(/\bhybrid\b/i))
                strainType = 'Hybrid';
            // Extract brand
            let brand = null;
            const brandSelectors = [
                '[class*="brand"]',
                '[class*="Brand"]',
                '[data-testid*="brand"]'
            ];
            for (const sel of brandSelectors) {
                // @ts-ignore - runs in browser context
                const el = document.querySelector(sel);
                if (el?.textContent?.trim()) {
                    brand = el.textContent.trim();
                    break;
                }
            }
            // Extract metadata: keyword presence anywhere in page text
            // (may over-match if a terpene/effect word appears elsewhere)
            const terpenes = [];
            const terpeneNames = ['Myrcene', 'Limonene', 'Caryophyllene', 'Pinene', 'Linalool', 'Humulene'];
            terpeneNames.forEach(terp => {
                if (allText.match(new RegExp(`\\b${terp}\\b`, 'i'))) {
                    terpenes.push(terp);
                }
            });
            const effects = [];
            const effectNames = ['Relaxed', 'Happy', 'Euphoric', 'Uplifted', 'Creative', 'Energetic'];
            effectNames.forEach(effect => {
                if (allText.match(new RegExp(`\\b${effect}\\b`, 'i'))) {
                    effects.push(effect);
                }
            });
            return {
                fullSizeImage,
                description,
                thc,
                cbd,
                strainType,
                brand,
                terpenes,
                effects
            };
        });
        // Create product item.
        // NOTE(review): dutchieProductId embeds Date.now() and Math.random(),
        // so the same product gets a NEW id on every scrape — verify the
        // Deduplication/Database pipelines key on something stable (e.g. the
        // dutchieUrl) or products will duplicate across runs.
        const product = {
            dutchieProductId: `${response.request.metadata.storeSlug}-${response.request.metadata.categorySlug}-${Date.now()}-${Math.random()}`,
            name: productName || 'Unknown Product',
            description: details.description,
            price: response.request.metadata.productPrice,
            originalPrice: response.request.metadata.productOriginalPrice,
            thcPercentage: details.thc || undefined,
            cbdPercentage: details.cbd || undefined,
            strainType: details.strainType || undefined,
            brand: details.brand || undefined,
            imageUrl: details.fullSizeImage || undefined,
            dutchieUrl: response.url,
            metadata: {
                terpenes: details.terpenes,
                effects: details.effects
            },
            storeId: response.request.metadata.storeId,
            categoryId: response.request.metadata.categoryId
        };
        return { items: [product], requests: [] };
    }
    /**
     * Scrape entire store: all enabled leaf categories (those without
     * children), either sequentially (parallel=1) or in batches of
     * `parallel` categories at a time, then stamp stores.last_scraped_at.
     * @param storeId - DB id of the store.
     * @param parallel - Batch size for parallel category scrapes (default 3).
     */
    async scrapeStore(storeId, parallel = 3) {
        logger_1.logger.info('scraper', `🏪 Starting store scrape: ${storeId} (${parallel} parallel scrapers)`);
        try {
            // Get all leaf categories (no children)
            const categoriesResult = await migrate_1.pool.query(`
                SELECT c.id, c.name
                FROM categories c
                WHERE c.store_id = $1
                AND c.scrape_enabled = true
                AND NOT EXISTS (
                    SELECT 1 FROM categories child
                    WHERE child.parent_id = c.id
                )
                ORDER BY c.name
            `, [storeId]);
            const categories = categoriesResult.rows;
            logger_1.logger.info('scraper', `Found ${categories.length} categories to scrape`);
            if (parallel === 1) {
                // Sequential scraping (original behavior); per-category errors
                // are logged and skipped so one failure doesn't stop the store
                for (const category of categories) {
                    try {
                        await this.scrapeCategory(storeId, category.id);
                        // 3s politeness delay between categories
                        await new Promise(resolve => setTimeout(resolve, 3000));
                    }
                    catch (error) {
                        logger_1.logger.error('scraper', `Failed to scrape category ${category.name}: ${error}`);
                    }
                }
            }
            else {
                // Parallel scraping with concurrency limit
                const results = await this.scrapeMultipleCategoriesParallel(storeId, categories, parallel);
                const successful = results.filter(r => r.status === 'fulfilled').length;
                const failed = results.filter(r => r.status === 'rejected').length;
                logger_1.logger.info('scraper', `Parallel scrape results: ${successful} successful, ${failed} failed`);
            }
            // Update store last_scraped_at
            await migrate_1.pool.query(`
                UPDATE stores
                SET last_scraped_at = CURRENT_TIMESTAMP
                WHERE id = $1
            `, [storeId]);
            logger_1.logger.info('scraper', `🎉 Store scrape completed: ${storeId}`);
        }
        catch (error) {
            logger_1.logger.error('scraper', `Store scrape failed: ${error}`);
            throw error;
        }
    }
    /**
     * Scrape multiple categories in parallel with a concurrency limit.
     * Categories are processed in batches of `concurrency`; each category
     * gets its own fresh ScraperEngine + DutchieSpider (so browser/session
     * state is isolated per category — presumably one browser instance
     * each; confirm against Downloader). A 5s pause separates batches.
     * @returns Promise.allSettled-style result objects, one per category.
     */
    async scrapeMultipleCategoriesParallel(storeId, categories, concurrency) {
        const results = [];
        // Process categories in batches
        for (let i = 0; i < categories.length; i += concurrency) {
            const batch = categories.slice(i, i + concurrency);
            logger_1.logger.info('scraper', `Scraping batch ${Math.floor(i / concurrency) + 1}: ${batch.map(c => c.name).join(', ')}`);
            const batchPromises = batch.map(category => {
                // Create a new spider instance for each category
                const engine = new ScraperEngine(1); // 1 concurrent request per spider
                const spider = new DutchieSpider(engine);
                return spider.scrapeCategory(storeId, category.id)
                    .catch(error => {
                    // Log, then rethrow so allSettled records the rejection
                    logger_1.logger.error('scraper', `Category ${category.name} failed: ${error}`);
                    throw error;
                });
            });
            const batchResults = await Promise.allSettled(batchPromises);
            results.push(...batchResults);
            // Delay between batches to avoid overwhelming the server
            if (i + concurrency < categories.length) {
                logger_1.logger.info('scraper', 'Waiting 5s before next batch...');
                await new Promise(resolve => setTimeout(resolve, 5000));
            }
        }
        return results;
    }
}
exports.DutchieSpider = DutchieSpider;