"use strict"; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); const migrate_1 = require("../db/migrate"); const proxy_1 = require("../services/proxy"); const puppeteer_extra_1 = __importDefault(require("puppeteer-extra")); const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth")); puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)()); const FIREFOX_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0'; const NUM_WORKERS = parseInt(process.argv[2] || '15'); const DISPENSARY_NAME = process.argv[3] || 'Deeply Rooted'; const USE_PROXIES = process.argv[4] !== 'no-proxy'; async function getStore(name) { const result = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url FROM stores WHERE name ILIKE $1 LIMIT 1`, [`%${name}%`]); return result.rows[0] || null; } async function getCategories(storeId) { const result = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url as url FROM categories WHERE store_id = $1 AND scrape_enabled = true`, [storeId]); return result.rows; } async function scrapeWithProxy(workerId, store, category) { let browser = null; let proxyId = null; try { // Get a proxy (if enabled) let proxy = null; if (USE_PROXIES) { proxy = await (0, proxy_1.getActiveProxy)(); if (proxy) { proxyId = proxy.id; console.log(`[Worker ${workerId}] Using proxy: ${proxy.protocol}://${proxy.host}:${proxy.port}`); } else { console.log(`[Worker ${workerId}] No proxy available, using direct connection`); } } else { console.log(`[Worker ${workerId}] Direct connection (proxies disabled)`); } // Build browser args const args = [ '--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-accelerated-2d-canvas', '--disable-gpu', '--window-size=1920,1080', ]; if (proxy) { if (proxy.protocol === 'socks5' || proxy.protocol === 'socks') { args.push(`--proxy-server=socks5://${proxy.host}:${proxy.port}`); } else { args.push(`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`); } } browser = await puppeteer_extra_1.default.launch({ headless: true, args, executablePath: process.env.PUPPETEER_EXECUTABLE_PATH, }); const page = await browser.newPage(); await page.setUserAgent(FIREFOX_USER_AGENT); await page.setViewport({ width: 1920, height: 1080 }); // Handle proxy auth if needed if (proxy?.username && proxy?.password) { await page.authenticate({ username: proxy.username, password: proxy.password, }); } console.log(`[Worker ${workerId}] Scraping category: ${category.name} (${category.url})`); // Navigate to the category page const response = await page.goto(category.url, { waitUntil: 'networkidle2', timeout: 60000, }); if (!response || !response.ok()) { throw new Error(`Failed to load page: ${response?.status()}`); } // Wait for products to load await page.waitForSelector('[data-testid="product-list-item"], a[href*="/product/"]', { timeout: 30000, }).catch(() => { console.log(`[Worker ${workerId}] No products found on page`); }); // Extract products const products = await page.evaluate(() => { // Try data-testid first, then fall back to product links const listItems = document.querySelectorAll('[data-testid="product-list-item"]'); if (listItems.length > 0) return listItems.length; return document.querySelectorAll('a[href*="/product/"]').length; }); console.log(`[Worker ${workerId}] Found ${products} products in ${category.name}`); await browser.close(); return { success: true, products }; } catch (error) { console.error(`[Worker ${workerId}] Error:`, error.message); // Check for bot detection if (proxyId && (0, proxy_1.isBotDetectionError)(error.message)) { (0, proxy_1.putProxyInTimeout)(proxyId, error.message); } if (browser) { await browser.close().catch(() => { }); } return { success: false, products: 0, error: error.message }; } } async function worker(workerId, store, categories, categoryIndex) { while (categoryIndex.current < categories.length) { const idx = categoryIndex.current++; const category = categories[idx]; if (!category) break; console.log(`[Worker ${workerId}] Starting category ${idx + 1}/${categories.length}: ${category.name}`); const result = await scrapeWithProxy(workerId, store, category); if (result.success) { console.log(`[Worker ${workerId}] Completed ${category.name}: ${result.products} products`); } else { console.log(`[Worker ${workerId}] Failed ${category.name}: ${result.error}`); } // Small delay between requests await new Promise(resolve => setTimeout(resolve, 2000 + Math.random() * 3000)); } console.log(`[Worker ${workerId}] Finished all assigned work`); } async function main() { console.log(`\n${'='.repeat(60)}`); console.log(`Parallel Scraper - ${NUM_WORKERS} workers`); console.log(`Target: ${DISPENSARY_NAME}`); console.log(`User Agent: Firefox`); console.log(`Proxies: ${USE_PROXIES ? 'Enabled' : 'Disabled'}`); console.log(`${'='.repeat(60)}\n`); // Find the store const store = await getStore(DISPENSARY_NAME); if (!store) { console.error(`Store not found: ${DISPENSARY_NAME}`); process.exit(1); } console.log(`Found store: ${store.name} (ID: ${store.id})`); // Get categories const categories = await getCategories(store.id); if (categories.length === 0) { console.error('No categories found for this store'); process.exit(1); } console.log(`Found ${categories.length} categories to scrape`); console.log(`Categories: ${categories.map(c => c.name).join(', ')}\n`); // Check proxies const proxyResult = await migrate_1.pool.query('SELECT COUNT(*) as total, COUNT(*) FILTER (WHERE active = true) as active FROM proxies'); console.log(`Proxies: ${proxyResult.rows[0].active} active / ${proxyResult.rows[0].total} total\n`); // Shared index for work distribution const categoryIndex = { current: 0 }; // For a store with few categories, we'll run multiple passes // Expand the work by duplicating categories for parallel workers const expandedCategories = []; const passes = Math.ceil(NUM_WORKERS / Math.max(categories.length, 1)); for (let i = 0; i < passes; i++) { expandedCategories.push(...categories); } console.log(`Running ${NUM_WORKERS} workers across ${expandedCategories.length} category scrapes\n`); // Start workers const workers = []; for (let i = 0; i < NUM_WORKERS; i++) { workers.push(worker(i + 1, store, expandedCategories, categoryIndex)); // Stagger worker starts await new Promise(resolve => setTimeout(resolve, 500)); } // Wait for all workers await Promise.all(workers); console.log(`\n${'='.repeat(60)}`); console.log('All workers completed!'); console.log(`${'='.repeat(60)}\n`); await migrate_1.pool.end(); } main().catch(console.error);