"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.Downloader = void 0;
const puppeteer_1 = __importDefault(require("puppeteer"));
const axios_1 = __importDefault(require("axios"));
const types_1 = require("./types");
const logger_1 = require("../services/logger");

// Fingerprint profiles for randomization. Each list is sampled independently
// by generateRandomFingerprint().
const SCREEN_RESOLUTIONS = [
    { width: 1920, height: 1080 },
    { width: 1366, height: 768 },
    { width: 1536, height: 864 },
    { width: 1440, height: 900 },
    { width: 1280, height: 720 },
    { width: 2560, height: 1440 },
    { width: 1680, height: 1050 },
    { width: 1600, height: 900 },
];
const TIMEZONES = [
    'America/New_York',
    'America/Chicago',
    'America/Denver',
    'America/Los_Angeles',
    'America/Phoenix',
];
const LANGUAGES = [
    ['en-US', 'en'],
    ['en-US', 'en', 'es'],
    ['en-US'],
];
const PLATFORMS = [
    'Win32',
    'MacIntel',
    'Linux x86_64',
];
const WEBGL_VENDORS = [
    'Google Inc. (NVIDIA)',
    'Google Inc. (Intel)',
    'Google Inc. (AMD)',
    'Intel Inc.',
    'NVIDIA Corporation',
];
const WEBGL_RENDERERS = [
    'ANGLE (NVIDIA GeForce GTX 1080 Direct3D11 vs_5_0 ps_5_0)',
    'ANGLE (Intel(R) UHD Graphics 630 Direct3D11 vs_5_0 ps_5_0)',
    'ANGLE (AMD Radeon RX 580 Series Direct3D11 vs_5_0 ps_5_0)',
    'Intel Iris OpenGL Engine',
    'NVIDIA GeForce RTX 3070/PCIe/SSE2',
    'AMD Radeon Pro 5500M OpenGL Engine',
];
const HARDWARE_CONCURRENCY_OPTIONS = [4, 8, 12, 16];
const DEVICE_MEMORY_OPTIONS = [4, 8, 16, 32];

/**
 * Return one uniformly chosen element of a non-empty array.
 * Uses Math.random(), so the choice is not cryptographically secure.
 */
function pickRandom(items) {
    return items[Math.floor(Math.random() * items.length)];
}

/**
 * Resolve after `ms` milliseconds.
 * Stands in for Puppeteer's deprecated `page.waitForTimeout`.
 */
function sleep(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Build a fingerprint by sampling every profile list independently.
 *
 * @returns {{screen: {width: number, height: number}, timezone: string,
 *   languages: string[], platform: string, hardwareConcurrency: number,
 *   deviceMemory: number, webglVendor: string, webglRenderer: string}}
 */
function generateRandomFingerprint() {
    return {
        screen: pickRandom(SCREEN_RESOLUTIONS),
        timezone: pickRandom(TIMEZONES),
        languages: pickRandom(LANGUAGES),
        platform: pickRandom(PLATFORMS),
        hardwareConcurrency: pickRandom(HARDWARE_CONCURRENCY_OPTIONS),
        deviceMemory: pickRandom(DEVICE_MEMORY_OPTIONS),
        webglVendor: pickRandom(WEBGL_VENDORS),
        webglRenderer: pickRandom(WEBGL_RENDERERS),
    };
}

/**
 * Fetches pages over plain HTTP (axios) or through a single shared Puppeteer
 * page that carries a randomized browser fingerprint.
 */
class Downloader {
    browser = null;
    page = null;
    // Simple flag used by browserFetch() to serialize access to the shared page.
    pageInUse = false;
    currentFingerprint = generateRandomFingerprint();
    needsNewFingerprint = false;
    // Pages that already received the fingerprint script, so that
    // applyFingerprint() never stacks duplicate init scripts on one page.
    fingerprintedPages = new WeakSet();

    /**
     * Force new fingerprint on next browser creation
     */
    rotateFingerprint() {
        this.needsNewFingerprint = true;
        logger_1.logger.info('scraper', '🔄 Fingerprint rotation scheduled');
    }

    /**
     * Initialize browser instance with fingerprint.
     *
     * @param {boolean} [forceNew=false] Close the current browser and start a
     *   new one with a freshly generated fingerprint.
     * @returns {Promise<object>} The connected Puppeteer browser.
     */
    async getBrowser(forceNew = false) {
        // Create new browser if needed for fingerprint rotation
        if (forceNew || this.needsNewFingerprint) {
            await this.close();
            this.currentFingerprint = generateRandomFingerprint();
            this.needsNewFingerprint = false;
            logger_1.logger.info('scraper', `🎭 New fingerprint: ${this.currentFingerprint.screen.width}x${this.currentFingerprint.screen.height}, ${this.currentFingerprint.timezone}, ${this.currentFingerprint.platform}`);
        }
        if (!this.browser || !this.browser.isConnected()) {
            const { screen } = this.currentFingerprint;
            const launchOptions = {
                headless: 'new',
                args: [
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-blink-features=AutomationControlled',
                    `--window-size=${screen.width},${screen.height}`,
                    '--disable-web-security',
                    '--disable-features=IsolateOrigins,site-per-process',
                    '--disable-infobars',
                    '--disable-extensions',
                ]
            };
            this.browser = await puppeteer_1.default.launch(launchOptions);
            logger_1.logger.info('scraper', 'Browser instance created');
        }
        return this.browser;
    }

    /**
     * Get or create a page instance with current fingerprint.
     *
     * @param {boolean} [forceNew=false] Also forces a new browser (and a new
     *   fingerprint) via getBrowser(forceNew).
     * @returns {Promise<object>} The shared Puppeteer page.
     */
    async getPage(forceNew = false) {
        if (!this.page || this.page.isClosed() || forceNew) {
            const browser = await this.getBrowser(forceNew);
            this.page = await browser.newPage();
            const { screen } = this.currentFingerprint;
            await this.page.setViewport({
                width: screen.width,
                height: screen.height,
                deviceScaleFactor: 1,
            });
            // Apply fingerprint
            await this.applyFingerprint(this.page);
            logger_1.logger.debug('scraper', 'New page created with fingerprint');
        }
        return this.page;
    }

    /**
     * Apply full fingerprint to page.
     *
     * Registers a script that runs before every document's own scripts and
     * overrides navigator, WebGL, canvas and screen properties, then sets the
     * page timezone. A page is only ever fingerprinted once: repeated calls
     * for the same page are no-ops, which keeps init scripts from piling up.
     *
     * @param {object} page Puppeteer page to fingerprint.
     */
    async applyFingerprint(page) {
        if (this.fingerprintedPages.has(page)) {
            return;
        }
        const fp = this.currentFingerprint;
        // NOTE: the callback is serialized and executed inside the browser, so
        // it must not reference anything from this module except `fingerprint`.
        await page.evaluateOnNewDocument((fingerprint) => {
            // Hide webdriver
            Object.defineProperty(navigator, 'webdriver', {
                get: () => false,
            });
            // Spoof platform
            Object.defineProperty(navigator, 'platform', {
                get: () => fingerprint.platform,
            });
            // Spoof languages
            Object.defineProperty(navigator, 'languages', {
                get: () => fingerprint.languages,
            });
            // Spoof hardware concurrency
            Object.defineProperty(navigator, 'hardwareConcurrency', {
                get: () => fingerprint.hardwareConcurrency,
            });
            // Spoof device memory
            Object.defineProperty(navigator, 'deviceMemory', {
                get: () => fingerprint.deviceMemory,
            });
            // Spoof plugins (realistic count)
            Object.defineProperty(navigator, 'plugins', {
                get: () => {
                    const plugins = [];
                    for (let i = 0; i < 5; i++) {
                        plugins.push({
                            name: `Plugin ${i}`,
                            filename: `plugin${i}.dll`,
                            description: `Description ${i}`,
                        });
                    }
                    return plugins;
                },
            });
            // Chrome object
            window.chrome = {
                runtime: {},
                loadTimes: () => ({}),
                csi: () => ({}),
                app: {},
            };
            // Permissions. The native method must stay bound to the Permissions
            // object, otherwise calling it throws "Illegal invocation". The
            // guard keeps a missing API from aborting the rest of this script.
            const permissions = window.navigator.permissions;
            if (permissions && typeof permissions.query === 'function') {
                const originalQuery = permissions.query.bind(permissions);
                permissions.query = (parameters) => parameters.name === 'notifications'
                    ? Promise.resolve({ state: 'denied' })
                    : originalQuery(parameters);
            }
            // WebGL fingerprint spoofing
            const getParameterProxyHandler = {
                apply: function (target, thisArg, argumentsList) {
                    const param = argumentsList[0];
                    // UNMASKED_VENDOR_WEBGL
                    if (param === 37445) {
                        return fingerprint.webglVendor;
                    }
                    // UNMASKED_RENDERER_WEBGL
                    if (param === 37446) {
                        return fingerprint.webglRenderer;
                    }
                    return Reflect.apply(target, thisArg, argumentsList);
                }
            };
            // Override WebGL
            const originalGetContext = HTMLCanvasElement.prototype.getContext;
            HTMLCanvasElement.prototype.getContext = function (type, ...args) {
                const context = originalGetContext.call(this, type, ...args);
                if (context && (type === 'webgl' || type === 'webgl2' || type === 'experimental-webgl')) {
                    const glContext = context;
                    const originalGetParameter = glContext.getParameter.bind(glContext);
                    glContext.getParameter = new Proxy(originalGetParameter, getParameterProxyHandler);
                }
                return context;
            };
            // Canvas fingerprint noise
            const originalToDataURL = HTMLCanvasElement.prototype.toDataURL;
            HTMLCanvasElement.prototype.toDataURL = function (...args) {
                const context = this.getContext('2d');
                // getImageData throws on a zero-sized canvas, so skip the noise there.
                if (context && this.width > 0 && this.height > 0) {
                    const imageData = context.getImageData(0, 0, this.width, this.height);
                    for (let i = 0; i < imageData.data.length; i += 4) {
                        // Flip the lowest bit of the red channel at random
                        imageData.data[i] = imageData.data[i] ^ (Math.random() > 0.5 ? 1 : 0);
                    }
                    context.putImageData(imageData, 0, 0);
                }
                // Forward every argument (type AND quality) to the native method.
                return originalToDataURL.apply(this, args);
            };
            // Screen dimensions
            Object.defineProperty(window.screen, 'width', { get: () => fingerprint.screen.width });
            Object.defineProperty(window.screen, 'height', { get: () => fingerprint.screen.height });
            Object.defineProperty(window.screen, 'availWidth', { get: () => fingerprint.screen.width });
            Object.defineProperty(window.screen, 'availHeight', { get: () => fingerprint.screen.height - 40 });
            Object.defineProperty(window, 'innerWidth', { get: () => fingerprint.screen.width });
            Object.defineProperty(window, 'innerHeight', { get: () => fingerprint.screen.height - 140 });
            Object.defineProperty(window, 'outerWidth', { get: () => fingerprint.screen.width });
            Object.defineProperty(window, 'outerHeight', { get: () => fingerprint.screen.height });
        }, fp);
        // Set the timezone through the page's own emulation API rather than
        // opening an extra CDP session that would never be detached.
        await page.emulateTimezone(fp.timezone);
        // Mark only after both steps succeeded so a failed attempt can be retried.
        this.fingerprintedPages.add(page);
    }

    /**
     * Apply stealth mode to page (legacy - now uses applyFingerprint)
     */
    async makePageStealthy(page) {
        // Now handled by applyFingerprint
        await this.applyFingerprint(page);
    }

    /**
     * Configure proxy for browser.
     *
     * @param {{protocol: string, host: string, port: (string|number)}} proxy
     * @returns {string[]} Chromium `--proxy-server` argument, or an empty array
     *   for protocols other than socks5/http/https.
     */
    getProxyArgs(proxy) {
        if (proxy.protocol === 'socks5') {
            return [`--proxy-server=socks5://${proxy.host}:${proxy.port}`];
        }
        else if (proxy.protocol === 'http' || proxy.protocol === 'https') {
            return [`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`];
        }
        return [];
    }

    /**
     * HTTP-based fetch (lightweight, fast).
     *
     * Never rejects because of the HTTP status; the caller inspects
     * `statusCode`. Transport failures are rethrown as an Error carrying
     * `type`, `retryable`, `request` and the original error as `cause`.
     *
     * @param {object} request Scrape request; reads `url` and `metadata`.
     * @returns {Promise<object>} Response with `url`, `statusCode`, `content`,
     *   `metadata` and the originating `request`.
     */
    async httpFetch(request) {
        try {
            const config = {
                timeout: 30000,
                headers: {
                    'User-Agent': request.metadata.userAgent || 'Mozilla/5.0',
                    ...request.metadata.headers
                },
                validateStatus: () => true // Don't throw on any status
            };
            // Add proxy if available
            if (request.metadata.proxy) {
                const proxy = request.metadata.proxy;
                config.proxy = {
                    host: proxy.host,
                    port: proxy.port,
                    protocol: proxy.protocol
                };
                if (proxy.username && proxy.password) {
                    config.proxy.auth = {
                        username: proxy.username,
                        password: proxy.password
                    };
                }
            }
            const response = await axios_1.default.get(request.url, config);
            return {
                url: request.url,
                statusCode: response.status,
                content: response.data,
                metadata: {
                    headers: response.headers,
                    method: 'http'
                },
                request
            };
        }
        catch (error) {
            // Keep the original error (and its stack) reachable through `cause`.
            const scraperError = new Error(error.message, { cause: error });
            if (error.code === 'ETIMEDOUT' || error.code === 'ECONNABORTED') {
                scraperError.type = types_1.ErrorType.TIMEOUT;
            }
            else if (error.code === 'ECONNREFUSED' || error.code === 'ENOTFOUND') {
                scraperError.type = types_1.ErrorType.NETWORK_ERROR;
            }
            else {
                scraperError.type = types_1.ErrorType.UNKNOWN;
            }
            scraperError.retryable = true;
            scraperError.request = request;
            throw scraperError;
        }
    }

    /**
     * Browser-based fetch (for JS-heavy sites).
     *
     * Waits until the shared page is free, navigates, waits for product
     * markup, scrolls to trigger lazy loading and returns the rendered HTML.
     * Failures are rethrown as an Error carrying `type`, `retryable`,
     * `request` and the original error as `cause`.
     *
     * @param {object} request Scrape request; reads `url` and `metadata`.
     * @returns {Promise<object>} Response with `url`, `statusCode`, `content`,
     *   `metadata` and the originating `request`.
     */
    async browserFetch(request) {
        // Wait if page is in use
        while (this.pageInUse) {
            await sleep(100);
        }
        this.pageInUse = true;
        try {
            const page = await this.getPage();
            // Apply stealth mode if required
            if (request.metadata.requiresStealth) {
                await this.makePageStealthy(page);
            }
            // Set user agent
            if (request.metadata.userAgent) {
                await page.setUserAgent(request.metadata.userAgent);
            }
            // Navigate to page - use networkidle2 for SPAs like Dutchie
            // Increased timeout to 90s - Dutchie pages can take 30-40s to fully load
            const response = await page.goto(request.url, {
                waitUntil: 'networkidle2',
                timeout: 90000
            });
            if (!response) {
                throw new Error('Navigation failed - no response');
            }
            // Wait for React to render product content
            // Try to wait for products, but don't fail if they don't appear (empty category)
            try {
                await page.waitForSelector('[data-testid="product-list-item"], [data-testid="empty-state"]', {
                    timeout: 10000
                });
            }
            catch {
                // Products might not exist in this category - continue anyway
                logger_1.logger.debug('scraper', 'No products found within timeout - continuing');
            }
            // Additional wait for any lazy-loaded content
            await sleep(2000);
            // Check for lazy-loaded content
            await this.autoScroll(page);
            // Get page content
            const content = await page.content();
            const statusCode = response.status();
            return {
                url: request.url,
                statusCode,
                content,
                metadata: {
                    method: 'browser',
                    finalUrl: page.url()
                },
                request
            };
        }
        catch (error) {
            // Keep the original error (and its stack) reachable through `cause`.
            const scraperError = new Error(error.message, { cause: error });
            if (error.message.includes('timeout') || error.message.includes('Navigation timeout')) {
                scraperError.type = types_1.ErrorType.TIMEOUT;
            }
            else if (error.message.includes('net::')) {
                scraperError.type = types_1.ErrorType.NETWORK_ERROR;
            }
            else if (error.message.includes('404')) {
                scraperError.type = types_1.ErrorType.NOT_FOUND;
            }
            else {
                scraperError.type = types_1.ErrorType.UNKNOWN;
            }
            scraperError.retryable = scraperError.type !== types_1.ErrorType.NOT_FOUND;
            scraperError.request = request;
            throw scraperError;
        }
        finally {
            this.pageInUse = false;
        }
    }

    /**
     * Auto-scroll to load lazy content.
     *
     * Scrolls down in 500px steps every 200ms until the bottom is reached or
     * 20 steps were made, then scrolls back to the top. Failures are logged
     * and never propagate.
     *
     * @param {object} page Puppeteer page to scroll.
     */
    async autoScroll(page) {
        try {
            await page.evaluate(async () => {
                await new Promise((resolve) => {
                    let totalHeight = 0;
                    const distance = 500;
                    const maxScrolls = 20; // Prevent infinite scrolling
                    let scrollCount = 0;
                    const timer = setInterval(() => {
                        const scrollHeight = document.body.scrollHeight;
                        window.scrollBy(0, distance);
                        totalHeight += distance;
                        scrollCount++;
                        if (totalHeight >= scrollHeight || scrollCount >= maxScrolls) {
                            clearInterval(timer);
                            // Scroll back to top
                            window.scrollTo(0, 0);
                            resolve();
                        }
                    }, 200);
                });
            });
            // Wait for any lazy-loaded content
            await sleep(1000);
        }
        catch (error) {
            logger_1.logger.warn('scraper', `Auto-scroll failed: ${error}`);
        }
    }

    /**
     * Main fetch method - tries HTTP first, falls back to browser.
     *
     * When the HTTP attempt fails or returns a non-2xx status, the request is
     * flagged with `metadata.requiresBrowser = true` (the caller's object is
     * modified) and fetched again through the browser.
     *
     * @param {object} request Scrape request; reads `url` and `metadata`.
     * @returns {Promise<object>} Response from httpFetch() or browserFetch().
     */
    async fetch(request) {
        const startTime = Date.now();
        try {
            // Force browser mode if required
            if (request.metadata.requiresBrowser) {
                logger_1.logger.debug('scraper', `Browser fetch: ${request.url}`);
                const response = await this.browserFetch(request);
                logger_1.logger.debug('scraper', `Fetch completed in ${Date.now() - startTime}ms`);
                return response;
            }
            // Try HTTP first (faster)
            try {
                logger_1.logger.debug('scraper', `HTTP fetch: ${request.url}`);
                const response = await this.httpFetch(request);
                // Check if we got a meaningful response
                if (response.statusCode && response.statusCode >= 200 && response.statusCode < 300) {
                    logger_1.logger.debug('scraper', `HTTP fetch succeeded in ${Date.now() - startTime}ms`);
                    return response;
                }
                // Fall through to browser mode for non-2xx responses
                logger_1.logger.debug('scraper', `HTTP got ${response.statusCode || 'unknown'}, trying browser`);
            }
            catch (httpError) {
                logger_1.logger.debug('scraper', `HTTP failed, falling back to browser: ${httpError}`);
            }
            // Fall back to browser
            request.metadata.requiresBrowser = true;
            const response = await this.browserFetch(request);
            logger_1.logger.debug('scraper', `Browser fetch completed in ${Date.now() - startTime}ms`);
            return response;
        }
        catch (error) {
            logger_1.logger.error('scraper', `Fetch failed after ${Date.now() - startTime}ms: ${error}`);
            throw error;
        }
    }

    /**
     * Evaluate JavaScript in the current page context.
     *
     * @param {Function|string} fn Code to run inside the page.
     * @throws {Error} When there is no open page.
     */
    async evaluate(fn) {
        if (!this.page || this.page.isClosed()) {
            throw new Error('No active page for evaluation');
        }
        return await this.page.evaluate(fn);
    }

    /**
     * Get the current page (for custom operations)
     */
    async getCurrentPage() {
        return this.page;
    }

    /**
     * Close the browser.
     *
     * References are cleared before anything is closed, so a failure while
     * closing can never leave a stale page or browser on the instance.
     */
    async close() {
        const page = this.page;
        const browser = this.browser;
        this.page = null;
        this.browser = null;
        if (page && !page.isClosed()) {
            try {
                await page.close();
            }
            catch (error) {
                // The page's target may already be gone (e.g. browser
                // disconnected); that must not stop the browser shutdown below.
                logger_1.logger.warn('scraper', `Page close failed: ${error}`);
            }
        }
        if (browser && browser.isConnected()) {
            await browser.close();
            logger_1.logger.info('scraper', 'Browser closed');
        }
    }

    /**
     * Clean up resources
     */
    async cleanup() {
        await this.close();
    }
}
exports.Downloader = Downloader;