"use strict"; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.Downloader = void 0; const puppeteer_1 = __importDefault(require("puppeteer")); const axios_1 = __importDefault(require("axios")); const types_1 = require("./types"); const logger_1 = require("../services/logger"); class Downloader { browser = null; page = null; pageInUse = false; /** * Initialize browser instance (lazy initialization) */ async getBrowser() { if (!this.browser || !this.browser.isConnected()) { const launchOptions = { headless: 'new', args: [ '--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-blink-features=AutomationControlled', '--window-size=1920,1080', '--disable-web-security', '--disable-features=IsolateOrigins,site-per-process' ] }; this.browser = await puppeteer_1.default.launch(launchOptions); logger_1.logger.info('scraper', 'Browser instance created'); } return this.browser; } /** * Get or create a page instance */ async getPage() { if (!this.page || this.page.isClosed()) { const browser = await this.getBrowser(); this.page = await browser.newPage(); await this.page.setViewport({ width: 1920, height: 1080 }); logger_1.logger.debug('scraper', 'New page created'); } return this.page; } /** * Apply stealth mode to page */ async makePageStealthy(page) { await page.evaluateOnNewDocument(() => { // @ts-ignore - runs in browser context Object.defineProperty(navigator, 'webdriver', { get: () => false, }); // @ts-ignore - runs in browser context Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5], }); // @ts-ignore - runs in browser context Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'], }); // @ts-ignore - runs in browser context window.chrome = { runtime: {}, }; // @ts-ignore - runs in browser context const originalQuery = window.navigator.permissions.query; // @ts-ignore - runs in browser context window.navigator.permissions.query = (parameters) => parameters.name === 'notifications' ? Promise.resolve({ state: 'denied' }) : originalQuery(parameters); }); } /** * Configure proxy for browser */ getProxyArgs(proxy) { if (proxy.protocol === 'socks5') { return [`--proxy-server=socks5://${proxy.host}:${proxy.port}`]; } else if (proxy.protocol === 'http' || proxy.protocol === 'https') { return [`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`]; } return []; } /** * HTTP-based fetch (lightweight, fast) */ async httpFetch(request) { try { const config = { timeout: 30000, headers: { 'User-Agent': request.metadata.userAgent || 'Mozilla/5.0', ...request.metadata.headers }, validateStatus: () => true // Don't throw on any status }; // Add proxy if available if (request.metadata.proxy) { const proxy = request.metadata.proxy; config.proxy = { host: proxy.host, port: proxy.port, protocol: proxy.protocol }; if (proxy.username && proxy.password) { config.proxy.auth = { username: proxy.username, password: proxy.password }; } } const response = await axios_1.default.get(request.url, config); return { url: request.url, statusCode: response.status, content: response.data, metadata: { headers: response.headers, method: 'http' }, request }; } catch (error) { const scraperError = new Error(error.message); if (error.code === 'ETIMEDOUT' || error.code === 'ECONNABORTED') { scraperError.type = types_1.ErrorType.TIMEOUT; } else if (error.code === 'ECONNREFUSED' || error.code === 'ENOTFOUND') { scraperError.type = types_1.ErrorType.NETWORK_ERROR; } else { scraperError.type = types_1.ErrorType.UNKNOWN; } scraperError.retryable = true; scraperError.request = request; throw scraperError; } } /** * Browser-based fetch (for JS-heavy sites) */ async browserFetch(request) { // Wait if page is in use while (this.pageInUse) { await new Promise(resolve => setTimeout(resolve, 100)); } this.pageInUse = true; try { const page = await this.getPage(); // Apply stealth mode if required if (request.metadata.requiresStealth) { await this.makePageStealthy(page); } // Set user agent if (request.metadata.userAgent) { await page.setUserAgent(request.metadata.userAgent); } // Navigate to page const navigationPromise = page.goto(request.url, { waitUntil: 'domcontentloaded', timeout: 60000 }); const response = await navigationPromise; if (!response) { throw new Error('Navigation failed - no response'); } // Wait for initial render await page.waitForTimeout(3000); // Check for lazy-loaded content await this.autoScroll(page); // Get page content const content = await page.content(); const statusCode = response.status(); return { url: request.url, statusCode, content, metadata: { method: 'browser', finalUrl: page.url() }, request }; } catch (error) { const scraperError = new Error(error.message); if (error.message.includes('timeout') || error.message.includes('Navigation timeout')) { scraperError.type = types_1.ErrorType.TIMEOUT; } else if (error.message.includes('net::')) { scraperError.type = types_1.ErrorType.NETWORK_ERROR; } else if (error.message.includes('404')) { scraperError.type = types_1.ErrorType.NOT_FOUND; } else { scraperError.type = types_1.ErrorType.UNKNOWN; } scraperError.retryable = scraperError.type !== types_1.ErrorType.NOT_FOUND; scraperError.request = request; throw scraperError; } finally { this.pageInUse = false; } } /** * Auto-scroll to load lazy content */ async autoScroll(page) { try { await page.evaluate(async () => { await new Promise((resolve) => { let totalHeight = 0; const distance = 500; const maxScrolls = 20; // Prevent infinite scrolling let scrollCount = 0; const timer = setInterval(() => { // @ts-ignore - runs in browser context const scrollHeight = document.body.scrollHeight; // @ts-ignore - runs in browser context window.scrollBy(0, distance); totalHeight += distance; scrollCount++; if (totalHeight >= scrollHeight || scrollCount >= maxScrolls) { clearInterval(timer); // Scroll back to top // @ts-ignore - runs in browser context window.scrollTo(0, 0); resolve(); } }, 200); }); }); // Wait for any lazy-loaded content await page.waitForTimeout(1000); } catch (error) { logger_1.logger.warn('scraper', `Auto-scroll failed: ${error}`); } } /** * Main fetch method - tries HTTP first, falls back to browser */ async fetch(request) { const startTime = Date.now(); try { // Force browser mode if required if (request.metadata.requiresBrowser) { logger_1.logger.debug('scraper', `Browser fetch: ${request.url}`); const response = await this.browserFetch(request); logger_1.logger.debug('scraper', `Fetch completed in ${Date.now() - startTime}ms`); return response; } // Try HTTP first (faster) try { logger_1.logger.debug('scraper', `HTTP fetch: ${request.url}`); const response = await this.httpFetch(request); // Check if we got a meaningful response if (response.statusCode && response.statusCode >= 200 && response.statusCode < 300) { logger_1.logger.debug('scraper', `HTTP fetch succeeded in ${Date.now() - startTime}ms`); return response; } // Fall through to browser mode for non-2xx responses logger_1.logger.debug('scraper', `HTTP got ${response.statusCode || 'unknown'}, trying browser`); } catch (httpError) { logger_1.logger.debug('scraper', `HTTP failed, falling back to browser: ${httpError}`); } // Fall back to browser request.metadata.requiresBrowser = true; const response = await this.browserFetch(request); logger_1.logger.debug('scraper', `Browser fetch completed in ${Date.now() - startTime}ms`); return response; } catch (error) { logger_1.logger.error('scraper', `Fetch failed after ${Date.now() - startTime}ms: ${error}`); throw error; } } /** * Evaluate JavaScript in the current page context */ async evaluate(fn) { if (!this.page || this.page.isClosed()) { throw new Error('No active page for evaluation'); } return await this.page.evaluate(fn); } /** * Get the current page (for custom operations) */ async getCurrentPage() { return this.page; } /** * Close the browser */ async close() { if (this.page && !this.page.isClosed()) { await this.page.close(); this.page = null; } if (this.browser && this.browser.isConnected()) { await this.browser.close(); this.browser = null; logger_1.logger.info('scraper', 'Browser closed'); } } /** * Clean up resources */ async cleanup() { await this.close(); } } exports.Downloader = Downloader;