"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.DutchieSpider = exports.ScraperEngine = void 0;
const scheduler_1 = require("./scheduler");
const downloader_1 = require("./downloader");
const middlewares_1 = require("./middlewares");
const pipelines_1 = require("./pipelines");
const logger_1 = require("../services/logger");
const migrate_1 = require("../db/migrate");
/**
 * Main Scraper Engine - orchestrates the entire scraping process
 */
class ScraperEngine {
    scheduler;
    downloader;
    middlewareEngine;
    pipelineEngine;
    stats;
    isRunning = false;
    concurrency = 1; // Conservative default
    constructor(concurrency = 1) {
        this.scheduler = new scheduler_1.RequestScheduler();
        this.downloader = new downloader_1.Downloader();
        this.middlewareEngine = new middlewares_1.MiddlewareEngine();
        this.pipelineEngine = new pipelines_1.PipelineEngine();
        this.concurrency = concurrency;
        // Initialize stats
        this.stats = {
            requestsTotal: 0,
            requestsSuccess: 0,
            requestsFailed: 0,
            itemsScraped: 0,
            itemsSaved: 0,
            itemsDropped: 0,
            errorsCount: 0,
            startTime: new Date()
        };
        // Set up middlewares
        this.setupMiddlewares();
        // Set up pipelines
        this.setupPipelines();
    }
    /**
     * Set up the middleware chain
     */
    setupMiddlewares() {
        this.middlewareEngine.use(new middlewares_1.UserAgentMiddleware());
        this.middlewareEngine.use(new middlewares_1.ProxyMiddleware());
        this.middlewareEngine.use(new middlewares_1.RateLimitMiddleware());
        this.middlewareEngine.use(new middlewares_1.RetryMiddleware());
        this.middlewareEngine.use(new middlewares_1.BotDetectionMiddleware());
        this.middlewareEngine.use(new middlewares_1.StealthMiddleware());
    }
    /**
     * Set up the pipeline chain
     */
    setupPipelines() {
        this.pipelineEngine.use(new pipelines_1.ValidationPipeline());
        this.pipelineEngine.use(new pipelines_1.SanitizationPipeline());
        this.pipelineEngine.use(new pipelines_1.DeduplicationPipeline());
        this.pipelineEngine.use(new pipelines_1.ImagePipeline());
        this.pipelineEngine.use(new pipelines_1.StatsPipeline());
        this.pipelineEngine.use(new pipelines_1.DatabasePipeline());
    }
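    // Note: pipelines are registered in the order they are presumably meant to
    // run: validate → sanitize → dedupe → fetch images → collect stats → persist.
    // processItem() below counts a falsy result from this chain as a dropped item.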
    /**
     * Add a request to the queue
     */
    enqueue(request) {
        this.scheduler.enqueue(request);
    }
    /**
     * Start the scraping engine
     */
    async start() {
        if (this.isRunning) {
            logger_1.logger.warn('scraper', 'Engine is already running');
            return;
        }
        this.isRunning = true;
        this.stats.startTime = new Date();
        logger_1.logger.info('scraper', `🚀 Starting scraper engine (concurrency: ${this.concurrency})`);
        // Process queue
        await this.processQueue();
        this.isRunning = false;
        this.stats.endTime = new Date();
        this.stats.duration = this.stats.endTime.getTime() - this.stats.startTime.getTime();
        logger_1.logger.info('scraper', `✅ Scraper engine finished`);
        this.logStats();
        // Cleanup
        await this.downloader.cleanup();
    }
    /**
     * Process the request queue
     */
    async processQueue() {
        while (!this.scheduler.isEmpty() && this.isRunning) {
            const request = this.scheduler.dequeue();
            if (!request) {
                // Wait a bit and check again
                await new Promise(resolve => setTimeout(resolve, 100));
                continue;
            }
            try {
                await this.processRequest(request);
            }
            catch (error) {
                logger_1.logger.error('scraper', `Failed to process request: ${error}`);
            }
        }
    }
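    // Note: this loop awaits each request before dequeuing the next, so the
    // queue drains serially; the `concurrency` field is currently only
    // reported in the startup log, not used to run requests in parallel.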
    /**
     * Process a single request
     */
    async processRequest(request) {
        this.stats.requestsTotal++;
        try {
            logger_1.logger.debug('scraper', `Processing: ${request.url}`);
            // Apply request middlewares
            const processedRequest = await this.middlewareEngine.processRequest(request);
            // Download
            let response = await this.downloader.fetch(processedRequest);
            // Apply response middlewares
            response = await this.middlewareEngine.processResponse(response);
            // Parse response using callback
            const parseResult = await request.callback(response);
            // Process items through pipeline
            if (parseResult.items && parseResult.items.length > 0) {
                for (const item of parseResult.items) {
                    await this.processItem(item, 'default');
                }
            }
            // Enqueue follow-up requests
            if (parseResult.requests && parseResult.requests.length > 0) {
                for (const followUpRequest of parseResult.requests) {
                    this.scheduler.enqueue(followUpRequest);
                }
            }
            this.stats.requestsSuccess++;
            this.scheduler.markComplete(request);
        }
        catch (error) {
            this.stats.requestsFailed++;
            this.stats.errorsCount++;
            logger_1.logger.error('scraper', `Request failed: ${request.url} - ${error.message}`);
            // Apply error middlewares
            const handledError = await this.middlewareEngine.processError(error, request);
            // If error is null, it was handled (e.g., retry)
            if (handledError === null) {
                this.scheduler.requeueForRetry(request);
            }
            else {
                this.scheduler.markComplete(request);
                // Call error handler if provided
                if (request.errorHandler) {
                    await request.errorHandler(error, request);
                }
            }
        }
    }
    /**
     * Process an item through pipelines
     */
    async processItem(item, spider) {
        this.stats.itemsScraped++;
        try {
            const processedItem = await this.pipelineEngine.processItem(item, spider);
            if (processedItem) {
                this.stats.itemsSaved++;
            }
            else {
                this.stats.itemsDropped++;
            }
        }
        catch (error) {
            logger_1.logger.error('scraper', `Failed to process item: ${error}`);
            this.stats.itemsDropped++;
        }
    }
    /**
     * Log statistics
     */
    logStats() {
        logger_1.logger.info('scraper', '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
        logger_1.logger.info('scraper', '📊 Scraper Statistics');
        logger_1.logger.info('scraper', '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
        logger_1.logger.info('scraper', `  Requests: ${this.stats.requestsSuccess}/${this.stats.requestsTotal} successful`);
        logger_1.logger.info('scraper', `  Items: ${this.stats.itemsSaved} saved, ${this.stats.itemsDropped} dropped`);
        logger_1.logger.info('scraper', `  Errors: ${this.stats.errorsCount}`);
        logger_1.logger.info('scraper', `  Duration: ${Math.round((this.stats.duration || 0) / 1000)}s`);
        // Get stats from StatsPipeline
        const statsPipeline = this.pipelineEngine.getPipeline('StatsPipeline');
        if (statsPipeline) {
            const itemStats = statsPipeline.getStats();
            logger_1.logger.info('scraper', `  Items with images: ${itemStats.withImages}/${itemStats.total}`);
            logger_1.logger.info('scraper', `  Items with THC: ${itemStats.withThc}/${itemStats.total}`);
            logger_1.logger.info('scraper', `  Items with descriptions: ${itemStats.withDescription}/${itemStats.total}`);
        }
        logger_1.logger.info('scraper', '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
    }
    /**
     * Stop the engine
     */
    stop() {
        this.isRunning = false;
        logger_1.logger.info('scraper', 'Stopping scraper engine...');
    }
    /**
     * Get current stats
     */
    getStats() {
        return { ...this.stats };
    }
    /**
     * Get queue stats
     */
    getQueueStats() {
        return this.scheduler.getStats();
    }
}
exports.ScraperEngine = ScraperEngine;
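// Example usage (a minimal sketch; `parseCallback` is a hypothetical async
// function that receives a response and returns a `{ items, requests }`
// parse result, like the DutchieSpider callbacks below):
//
//   const engine = new ScraperEngine(1);
//   engine.enqueue({
//       url: 'https://example.com/listing',
//       priority: 100,
//       maxRetries: 3,
//       metadata: { requiresBrowser: true },
//       callback: parseCallback
//   });
//   await engine.start(); // drains the queue, logs stats, cleans up the downloader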
/**
 * Spider for scraping Dutchie categories
 */
class DutchieSpider {
    engine;
    constructor(engine) {
        this.engine = engine;
    }
    /**
     * Scrape a category
     */
    async scrapeCategory(storeId, categoryId) {
        logger_1.logger.info('scraper', `Starting category scrape: store=${storeId}, category=${categoryId}`);
        const scraperId = `scraper-${storeId}-${categoryId}-${Date.now()}`;
        let registerScraper, updateScraperStats, completeScraper;
        try {
            // Import monitoring functions (monitoring is optional; failures are swallowed)
            const monitor = await Promise.resolve().then(() => __importStar(require('../routes/scraper-monitor')));
            registerScraper = monitor.registerScraper;
            updateScraperStats = monitor.updateScraperStats;
            completeScraper = monitor.completeScraper;
        }
        catch (e) {
            // Monitoring not available
        }
        try {
            // Get category info
            const categoryResult = await migrate_1.pool.query(`
                SELECT c.*, s.slug as store_slug, s.name as store_name
                FROM categories c
                JOIN stores s ON c.store_id = s.id
                WHERE c.id = $1
            `, [categoryId]);
            if (categoryResult.rows.length === 0) {
                throw new Error('Category not found');
            }
            const category = categoryResult.rows[0];
            logger_1.logger.info('scraper', `Category: ${category.name} (${category.dutchie_url})`);
            // Register with monitoring system
            if (registerScraper) {
                registerScraper(scraperId, storeId, category.store_name, categoryId, category.name);
            }
            // Mark products as out of stock before scraping
            await migrate_1.pool.query(`
                UPDATE products
                SET in_stock = false
                WHERE store_id = $1 AND category_id = $2
            `, [storeId, categoryId]);
            if (updateScraperStats) {
                updateScraperStats(scraperId, {}, 'Marking products as out of stock');
            }
            // Enqueue category page request
            this.engine.enqueue({
                url: category.dutchie_url,
                priority: 100,
                maxRetries: 3,
                metadata: {
                    requiresBrowser: true,
                    storeId,
                    categoryId,
                    categorySlug: category.slug,
                    storeSlug: category.store_slug
                },
                callback: this.parseCategoryPage.bind(this)
            });
            // Start the engine
            if (updateScraperStats) {
                updateScraperStats(scraperId, {}, 'Scraping category page');
            }
            await this.engine.start();
            // Update stats from engine
            const engineStats = this.engine.getStats();
            if (updateScraperStats) {
                updateScraperStats(scraperId, {
                    requestsTotal: engineStats.requestsTotal,
                    requestsSuccess: engineStats.requestsSuccess,
                    itemsSaved: engineStats.itemsSaved,
                    itemsDropped: engineStats.itemsDropped,
                    errorsCount: engineStats.errorsCount
                }, 'Finalizing');
            }
            // Update category last_scraped_at
            await migrate_1.pool.query(`
                UPDATE categories
                SET last_scraped_at = CURRENT_TIMESTAMP
                WHERE id = $1
            `, [categoryId]);
            logger_1.logger.info('scraper', `✅ Category scrape completed: ${category.name}`);
            if (completeScraper) {
                completeScraper(scraperId);
            }
        }
        catch (error) {
            logger_1.logger.error('scraper', `Category scrape failed: ${error}`);
            if (completeScraper) {
                completeScraper(scraperId, error.toString());
            }
            throw error;
        }
    }
    /**
     * Parse category page (product listing)
     */
    async parseCategoryPage(response) {
        const page = await this.engine['downloader'].getCurrentPage();
        if (!page) {
            throw new Error('No active page');
        }
        logger_1.logger.info('scraper', 'Parsing category page...');
        // Extract product cards
        const productCards = await page.evaluate(() => {
            // @ts-ignore - runs in browser context
            const cards = document.querySelectorAll('[data-testid="product-list-item"]');
            const items = [];
            cards.forEach((card) => {
                try {
                    const allText = card.textContent || '';
                    // Extract name
                    let name = '';
                    const nameSelectors = ['a[href*="/product/"]', 'h1', 'h2', 'h3', 'h4'];
                    for (const sel of nameSelectors) {
                        const el = card.querySelector(sel);
                        if (el?.textContent?.trim()) {
                            name = el.textContent.trim().split('\n')[0].trim();
                            break;
                        }
                    }
                    if (!name || name.length < 2)
                        return;
                    // Extract price: the first $-amount in the card text is taken
                    // as the current price, a second one as the pre-discount price
                    let price = null;
                    let originalPrice = null;
                    const priceMatches = allText.match(/\$(\d+\.?\d*)/g);
                    if (priceMatches && priceMatches.length > 0) {
                        price = parseFloat(priceMatches[0].replace('$', ''));
                        if (priceMatches.length > 1) {
                            originalPrice = parseFloat(priceMatches[1].replace('$', ''));
                        }
                    }
                    // Extract link
                    const linkEl = card.querySelector('a[href*="/product/"]');
                    let href = linkEl?.getAttribute('href') || '';
                    if (href && href.startsWith('/')) {
                        // @ts-ignore - runs in browser context
                        href = window.location.origin + href;
                    }
                    items.push({ name, price, originalPrice, href });
                }
                catch (err) {
                    console.error('Error parsing product card:', err);
                }
            });
            return items;
        });
        logger_1.logger.info('scraper', `Found ${productCards.length} products on listing page`);
        // Create follow-up requests for each product
        const requests = productCards.map((card) => ({
            url: card.href,
            priority: 50,
            maxRetries: 3,
            metadata: {
                ...response.request.metadata,
                productName: card.name,
                productPrice: card.price,
                productOriginalPrice: card.originalPrice,
                requiresBrowser: true
            },
            callback: this.parseProductPage.bind(this)
        }));
        return { items: [], requests };
    }
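    // Parse callbacks return `{ items, requests }`: the engine pipes `items`
    // through the item pipelines and enqueues `requests` as follow-ups, so the
    // listing page above yields no items itself, only one request per product.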
    /**
     * Parse individual product page
     */
    async parseProductPage(response) {
        const page = await this.engine['downloader'].getCurrentPage();
        if (!page) {
            throw new Error('No active page');
        }
        const productName = response.request.metadata.productName;
        logger_1.logger.debug('scraper', `Parsing product: ${productName}`);
        // Extract product details
        const details = await page.evaluate(() => {
            // @ts-ignore - runs in browser context
            const allText = document.body.textContent || '';
            // Extract image
            let fullSizeImage = null;
            const mainImageSelectors = [
                'img[class*="ProductImage"]',
                'img[class*="product-image"]',
                '[class*="ImageGallery"] img',
                'main img',
                'img[src*="images.dutchie.com"]'
            ];
            for (const sel of mainImageSelectors) {
                // @ts-ignore - runs in browser context
                const img = document.querySelector(sel);
                if (img?.src && img.src.includes('dutchie.com')) {
                    fullSizeImage = img.src;
                    break;
                }
            }
            // Extract description
            let description = '';
            const descSelectors = [
                '[class*="description"]',
                '[class*="Description"]',
                '[data-testid*="description"]',
                'p[class*="product"]'
            ];
            for (const sel of descSelectors) {
                // @ts-ignore - runs in browser context
                const el = document.querySelector(sel);
                if (el?.textContent?.trim() && el.textContent.length > 20) {
                    description = el.textContent.trim();
                    break;
                }
            }
            // Extract THC/CBD percentages from the page text,
            // matching e.g. "THC: 24.5%" or "24.5% THC"
            let thc = null;
            const thcPatterns = [
                /THC[:\s]*(\d+\.?\d*)\s*%/i,
                /Total\s+THC[:\s]*(\d+\.?\d*)\s*%/i,
                /(\d+\.?\d*)\s*%\s+THC/i
            ];
            for (const pattern of thcPatterns) {
                const match = allText.match(pattern);
                if (match) {
                    thc = parseFloat(match[1]);
                    break;
                }
            }
            let cbd = null;
            const cbdPatterns = [
                /CBD[:\s]*(\d+\.?\d*)\s*%/i,
                /Total\s+CBD[:\s]*(\d+\.?\d*)\s*%/i,
                /(\d+\.?\d*)\s*%\s+CBD/i
            ];
            for (const pattern of cbdPatterns) {
                const match = allText.match(pattern);
                if (match) {
                    cbd = parseFloat(match[1]);
                    break;
                }
            }
            // Extract strain type
            let strainType = null;
            if (allText.match(/\bindica\b/i))
                strainType = 'Indica';
            else if (allText.match(/\bsativa\b/i))
                strainType = 'Sativa';
            else if (allText.match(/\bhybrid\b/i))
                strainType = 'Hybrid';
            // Extract brand
            let brand = null;
            const brandSelectors = [
                '[class*="brand"]',
                '[class*="Brand"]',
                '[data-testid*="brand"]'
            ];
            for (const sel of brandSelectors) {
                // @ts-ignore - runs in browser context
                const el = document.querySelector(sel);
                if (el?.textContent?.trim()) {
                    brand = el.textContent.trim();
                    break;
                }
            }
            // Extract terpene and effect mentions from the page text
            const terpenes = [];
            const terpeneNames = ['Myrcene', 'Limonene', 'Caryophyllene', 'Pinene', 'Linalool', 'Humulene'];
            terpeneNames.forEach(terp => {
                if (allText.match(new RegExp(`\\b${terp}\\b`, 'i'))) {
                    terpenes.push(terp);
                }
            });
            const effects = [];
            const effectNames = ['Relaxed', 'Happy', 'Euphoric', 'Uplifted', 'Creative', 'Energetic'];
            effectNames.forEach(effect => {
                if (allText.match(new RegExp(`\\b${effect}\\b`, 'i'))) {
                    effects.push(effect);
                }
            });
            return {
                fullSizeImage,
                description,
                thc,
                cbd,
                strainType,
                brand,
                terpenes,
                effects
            };
        });
        // Create product item
        const product = {
            dutchieProductId: `${response.request.metadata.storeSlug}-${response.request.metadata.categorySlug}-${Date.now()}-${Math.random()}`,
            name: productName || 'Unknown Product',
            description: details.description,
            price: response.request.metadata.productPrice,
            originalPrice: response.request.metadata.productOriginalPrice,
            thcPercentage: details.thc || undefined,
            cbdPercentage: details.cbd || undefined,
            strainType: details.strainType || undefined,
            brand: details.brand || undefined,
            imageUrl: details.fullSizeImage || undefined,
            dutchieUrl: response.url,
            metadata: {
                terpenes: details.terpenes,
                effects: details.effects
            },
            storeId: response.request.metadata.storeId,
            categoryId: response.request.metadata.categoryId
        };
        return { items: [product], requests: [] };
    }
    /**
     * Scrape entire store
     */
    async scrapeStore(storeId, parallel = 3) {
        logger_1.logger.info('scraper', `🏪 Starting store scrape: ${storeId} (${parallel} parallel scrapers)`);
        try {
            // Get all leaf categories (no children)
            const categoriesResult = await migrate_1.pool.query(`
                SELECT c.id, c.name
                FROM categories c
                WHERE c.store_id = $1
                    AND c.scrape_enabled = true
                    AND NOT EXISTS (
                        SELECT 1 FROM categories child
                        WHERE child.parent_id = c.id
                    )
                ORDER BY c.name
            `, [storeId]);
            const categories = categoriesResult.rows;
            logger_1.logger.info('scraper', `Found ${categories.length} categories to scrape`);
            if (parallel === 1) {
                // Sequential scraping (original behavior)
                for (const category of categories) {
                    try {
                        await this.scrapeCategory(storeId, category.id);
                        await new Promise(resolve => setTimeout(resolve, 3000));
                    }
                    catch (error) {
                        logger_1.logger.error('scraper', `Failed to scrape category ${category.name}: ${error}`);
                    }
                }
            }
            else {
                // Parallel scraping with concurrency limit
                const results = await this.scrapeMultipleCategoriesParallel(storeId, categories, parallel);
                const successful = results.filter(r => r.status === 'fulfilled').length;
                const failed = results.filter(r => r.status === 'rejected').length;
                logger_1.logger.info('scraper', `Parallel scrape results: ${successful} successful, ${failed} failed`);
            }
            // Update store last_scraped_at
            await migrate_1.pool.query(`
                UPDATE stores
                SET last_scraped_at = CURRENT_TIMESTAMP
                WHERE id = $1
            `, [storeId]);
            logger_1.logger.info('scraper', `🎉 Store scrape completed: ${storeId}`);
        }
        catch (error) {
            logger_1.logger.error('scraper', `Store scrape failed: ${error}`);
            throw error;
        }
    }
    /**
     * Scrape multiple categories in parallel with concurrency limit
     */
    async scrapeMultipleCategoriesParallel(storeId, categories, concurrency) {
        const results = [];
        // Process categories in batches of `concurrency`
        for (let i = 0; i < categories.length; i += concurrency) {
            const batch = categories.slice(i, i + concurrency);
            logger_1.logger.info('scraper', `Scraping batch ${Math.floor(i / concurrency) + 1}: ${batch.map(c => c.name).join(', ')}`);
            const batchPromises = batch.map(category => {
                // Create a new engine/spider pair per category so each runs independently
                const engine = new ScraperEngine(1); // 1 concurrent request per spider
                const spider = new DutchieSpider(engine);
                return spider.scrapeCategory(storeId, category.id)
                    .catch(error => {
                    logger_1.logger.error('scraper', `Category ${category.name} failed: ${error}`);
                    throw error;
                });
            });
            const batchResults = await Promise.allSettled(batchPromises);
            results.push(...batchResults);
            // Delay between batches to avoid overwhelming the server
            if (i + concurrency < categories.length) {
                logger_1.logger.info('scraper', 'Waiting 5s before next batch...');
                await new Promise(resolve => setTimeout(resolve, 5000));
            }
        }
        return results;
    }
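    // Batch math example: 8 categories with concurrency 3 run as batches of
    // 3, 3, and 2, with a 5s pause between batches; Promise.allSettled keeps
    // one failed category from aborting the rest of its batch.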
}
exports.DutchieSpider = DutchieSpider;
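// Example usage (a minimal sketch; `storeId` and `categoryId` are assumed to
// be existing ids in the stores/categories tables, inside an async context):
//
//   const engine = new ScraperEngine(1);
//   const spider = new DutchieSpider(engine);
//
//   // Scrape a single category...
//   await spider.scrapeCategory(storeId, categoryId);
//
//   // ...or every enabled leaf category of a store, 3 at a time
//   await spider.scrapeStore(storeId, 3);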