"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); const express_1 = require("express"); const migrate_1 = require("../db/migrate"); const proxy_1 = require("../services/proxy"); const middleware_1 = require("../auth/middleware"); const router = (0, express_1.Router)(); router.use(middleware_1.authMiddleware); const FIREFOX_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0'; // In-memory job tracking const activeJobs = new Map(); // Get job status router.get('/status/:jobId', (req, res) => { const job = activeJobs.get(req.params.jobId); if (!job) { return res.status(404).json({ error: 'Job not found' }); } res.json(job); }); // List active jobs router.get('/jobs', (req, res) => { const jobs = Array.from(activeJobs.values()); res.json({ jobs }); }); // Start parallel scrape router.post('/start', async (req, res) => { const { storeName = 'Deeply Rooted', workers = 15, useProxies = true } = req.body; try { // Find the store const storeResult = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url FROM stores WHERE name ILIKE $1 LIMIT 1`, [`%${storeName}%`]); if (storeResult.rows.length === 0) { return res.status(404).json({ error: `Store not found: ${storeName}` }); } const store = storeResult.rows[0]; // Get categories const categoriesResult = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url as url FROM categories WHERE store_id = $1 AND scrape_enabled = true`, [store.id]); if (categoriesResult.rows.length === 0) { return res.status(404).json({ error: 'No categories found for this store' }); } const categories = categoriesResult.rows; // Create job const jobId = `scrape-${Date.now()}`; const job = { id: jobId, storeName: store.name, status: 'running', workers, startedAt: new Date(), results: [] }; activeJobs.set(jobId, job); // Start scraping in background runParallelScrape(job, store, categories, workers, useProxies).catch(err => { console.error('Parallel scrape error:', err); job.status = 'failed'; }); res.json({ message: 'Parallel scrape started', jobId, store: store.name, categories: categories.length, workers }); } catch (error) { console.error('Failed to start parallel scrape:', error); res.status(500).json({ error: error.message }); } }); async function runParallelScrape(job, store, categories, numWorkers, useProxies) { const puppeteer = require('puppeteer-extra'); const StealthPlugin = require('puppeteer-extra-plugin-stealth'); puppeteer.use(StealthPlugin()); // Expand categories for multiple passes const expandedCategories = []; const passes = Math.ceil(numWorkers / Math.max(categories.length, 1)); for (let i = 0; i < passes; i++) { expandedCategories.push(...categories); } const categoryIndex = { current: 0 }; const worker = async (workerId) => { while (categoryIndex.current < expandedCategories.length) { const idx = categoryIndex.current++; const category = expandedCategories[idx]; if (!category) break; const result = await scrapeCategory(puppeteer, workerId, category, useProxies); job.results.push({ category: category.name, success: result.success, products: result.products, error: result.error }); // Delay between requests await new Promise(resolve => setTimeout(resolve, 2000 + Math.random() * 3000)); } }; // Start workers with staggered starts const workers = []; for (let i = 0; i < numWorkers; i++) { workers.push(worker(i + 1)); await new Promise(resolve => setTimeout(resolve, 500)); } await Promise.all(workers); job.status = 'completed'; job.completedAt = new Date(); // Clean up job 
// Lazily load puppeteer-extra on first use; the guard ensures the stealth
// plugin is registered only once, even across repeated scrape jobs.
let puppeteerExtra = null;
function getPuppeteer() {
    if (!puppeteerExtra) {
        puppeteerExtra = require('puppeteer-extra');
        const StealthPlugin = require('puppeteer-extra-plugin-stealth');
        puppeteerExtra.use(StealthPlugin());
    }
    return puppeteerExtra;
}
async function runParallelScrape(job, store, categories, numWorkers, useProxies) {
    const puppeteer = getPuppeteer();
    // Expand categories for multiple passes so every worker has work
    const expandedCategories = [];
    const passes = Math.ceil(numWorkers / Math.max(categories.length, 1));
    for (let i = 0; i < passes; i++) {
        expandedCategories.push(...categories);
    }
    const categoryIndex = { current: 0 };
    const worker = async (workerId) => {
        while (categoryIndex.current < expandedCategories.length) {
            const idx = categoryIndex.current++;
            const category = expandedCategories[idx];
            if (!category)
                break;
            const result = await scrapeCategory(puppeteer, workerId, category, useProxies);
            job.results.push({
                category: category.name,
                success: result.success,
                products: result.products,
                error: result.error
            });
            // Delay between requests
            await new Promise(resolve => setTimeout(resolve, 2000 + Math.random() * 3000));
        }
    };
    // Start workers with staggered starts
    const workers = [];
    for (let i = 0; i < numWorkers; i++) {
        workers.push(worker(i + 1));
        await new Promise(resolve => setTimeout(resolve, 500));
    }
    await Promise.all(workers);
    job.status = 'completed';
    job.completedAt = new Date();
    // Clean up job after 1 hour
    setTimeout(() => activeJobs.delete(job.id), 60 * 60 * 1000);
}
async function scrapeCategory(puppeteer, workerId, category, useProxies) {
    let browser = null;
    let proxyId = null;
    try {
        let proxy = null;
        if (useProxies) {
            proxy = await (0, proxy_1.getActiveProxy)();
        }
        const args = [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-accelerated-2d-canvas',
            '--disable-gpu',
            '--window-size=1920,1080',
        ];
        if (proxy) {
            proxyId = proxy.id;
            if (proxy.protocol === 'socks5' || proxy.protocol === 'socks') {
                args.push(`--proxy-server=socks5://${proxy.host}:${proxy.port}`);
            }
            else {
                args.push(`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`);
            }
        }
        browser = await puppeteer.launch({
            headless: 'new',
            args,
            executablePath: process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium',
        });
        const page = await browser.newPage();
        await page.setUserAgent(FIREFOX_USER_AGENT);
        await page.setViewport({ width: 1920, height: 1080 });
        if (proxy?.username && proxy?.password) {
            await page.authenticate({
                username: proxy.username,
                password: proxy.password,
            });
        }
        console.log(`[Worker ${workerId}] Scraping: ${category.name} (${category.url})`);
        const response = await page.goto(category.url, {
            waitUntil: 'networkidle2',
            timeout: 60000,
        });
        if (!response || !response.ok()) {
            throw new Error(`Failed to load page: ${response?.status()}`);
        }
        // Tolerate a timeout here: the count below simply comes back as 0
        await page.waitForSelector('[data-testid="product-list-item"], a[href*="/product/"]', {
            timeout: 30000,
        }).catch(() => { });
        const products = await page.evaluate(() => {
            // Try data-testid first, then fall back to product links
            const listItems = document.querySelectorAll('[data-testid="product-list-item"]');
            if (listItems.length > 0)
                return listItems.length;
            return document.querySelectorAll('a[href*="/product/"]').length;
        });
        console.log(`[Worker ${workerId}] Found ${products} products in ${category.name}`);
        await browser.close();
        return { success: true, products };
    }
    catch (error) {
        console.error(`[Worker ${workerId}] Error:`, error.message);
        // Bench the proxy if the failure looks like bot detection
        if (proxyId && (0, proxy_1.isBotDetectionError)(error.message)) {
            (0, proxy_1.putProxyInTimeout)(proxyId, error.message);
        }
        if (browser) {
            await browser.close().catch(() => { });
        }
        return { success: false, products: 0, error: error.message };
    }
}
exports.default = router;
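/*
 * Usage sketch. The mount path below is an assumption, not confirmed by this
 * file (it depends on how the parent app calls `app.use(...)`), and every
 * request must first pass authMiddleware:
 *
 *   POST /api/parallel-scrape/start
 *     body: { "storeName": "Deeply Rooted", "workers": 15, "useProxies": true }
 *     -> { message, jobId, store, categories, workers }
 *
 *   GET /api/parallel-scrape/status/:jobId  -> job record with per-category results
 *   GET /api/parallel-scrape/jobs           -> { jobs: [...] }
 */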