"use strict";
// TypeScript-compiler CommonJS interop helper: wraps a plain CJS export so
// `.default` access works whether or not the module was an ES module.
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.Downloader = void 0;
const puppeteer_1 = __importDefault(require("puppeteer"));
const axios_1 = __importDefault(require("axios"));
const types_1 = require("./types");
const logger_1 = require("../services/logger");
/**
 * Downloader: fetches pages over plain HTTP (axios) when possible and falls
 * back to a shared headless-Chromium page (Puppeteer) for JS-heavy sites.
 * One browser and one page are lazily created and reused across requests;
 * concurrent browser fetches are serialized via the `pageInUse` flag.
 */
class Downloader {
    // Lazily-launched shared Puppeteer browser (null until first browser fetch).
    browser = null;
    // Single shared page reused across browser fetches; guarded by `pageInUse`.
    page = null;
    // Cooperative lock: true while a browserFetch() call owns the shared page.
    pageInUse = false;
    // Whether stealth init scripts are already registered on the current page.
    // evaluateOnNewDocument() ADDS a script on every call, so without this guard
    // repeated stealth fetches on the reused page stack duplicate scripts.
    stealthApplied = false;
    /**
     * Initialize browser instance (lazy initialization).
     * Relaunches automatically if a previous browser has disconnected.
     * @returns {Promise<import('puppeteer').Browser>} connected browser
     */
    async getBrowser() {
        if (!this.browser || !this.browser.isConnected()) {
            const launchOptions = {
                headless: 'new',
                args: [
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-blink-features=AutomationControlled',
                    '--window-size=1920,1080',
                    '--disable-web-security',
                    '--disable-features=IsolateOrigins,site-per-process'
                ]
            };
            this.browser = await puppeteer_1.default.launch(launchOptions);
            logger_1.logger.info('scraper', 'Browser instance created');
        }
        return this.browser;
    }
    /**
     * Get or create a page instance.
     * Recreates the page when it is missing, closed, or belongs to a browser
     * that has since disconnected — a stale page from a dead browser can still
     * report isClosed() === false, so the browser state must be checked too.
     * @returns {Promise<import('puppeteer').Page>} usable page
     */
    async getPage() {
        const browserDead = !this.browser || !this.browser.isConnected();
        if (!this.page || this.page.isClosed() || browserDead) {
            const browser = await this.getBrowser();
            this.page = await browser.newPage();
            this.stealthApplied = false; // fresh page: no stealth scripts registered yet
            await this.page.setViewport({ width: 1920, height: 1080 });
            logger_1.logger.debug('scraper', 'New page created');
        }
        return this.page;
    }
    /**
     * Apply stealth mode to page: registers an init script (run before every
     * new document) that masks common headless-detection signals — webdriver
     * flag, empty plugin/language lists, missing window.chrome, and the
     * notifications-permission probe.
     * @param {import('puppeteer').Page} page
     */
    async makePageStealthy(page) {
        await page.evaluateOnNewDocument(() => {
            // @ts-ignore - runs in browser context
            Object.defineProperty(navigator, 'webdriver', {
                get: () => false,
            });
            // @ts-ignore - runs in browser context
            Object.defineProperty(navigator, 'plugins', {
                get: () => [1, 2, 3, 4, 5],
            });
            // @ts-ignore - runs in browser context
            Object.defineProperty(navigator, 'languages', {
                get: () => ['en-US', 'en'],
            });
            // @ts-ignore - runs in browser context
            window.chrome = {
                runtime: {},
            };
            // @ts-ignore - runs in browser context
            const originalQuery = window.navigator.permissions.query;
            // @ts-ignore - runs in browser context
            window.navigator.permissions.query = (parameters) => parameters.name === 'notifications'
                ? Promise.resolve({ state: 'denied' })
                : originalQuery(parameters);
        });
    }
    /**
     * Configure proxy for browser.
     * @param {{protocol: string, host: string, port: number}} proxy
     * @returns {string[]} Chromium --proxy-server args (empty for unsupported protocols)
     */
    getProxyArgs(proxy) {
        if (proxy.protocol === 'socks5') {
            return [`--proxy-server=socks5://${proxy.host}:${proxy.port}`];
        }
        else if (proxy.protocol === 'http' || proxy.protocol === 'https') {
            return [`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`];
        }
        return [];
    }
    /**
     * HTTP-based fetch (lightweight, fast).
     * Never throws on HTTP status (validateStatus accepts everything); throws a
     * typed, retryable error only on transport failures.
     * @param {object} request - scrape request with url and metadata
     * @returns {Promise<object>} response envelope { url, statusCode, content, metadata, request }
     * @throws {Error} with .type (ErrorType), .retryable, .request on network failure
     */
    async httpFetch(request) {
        try {
            const config = {
                timeout: 30000,
                headers: {
                    'User-Agent': request.metadata.userAgent || 'Mozilla/5.0',
                    ...request.metadata.headers
                },
                validateStatus: () => true // Don't throw on any status
            };
            // Add proxy if available.
            // NOTE(review): axios's `proxy` option only supports HTTP(S) proxies;
            // a socks5 proxy passed here will not work without a SOCKS agent —
            // confirm upstream routes socks5 requests to browserFetch instead.
            if (request.metadata.proxy) {
                const proxy = request.metadata.proxy;
                config.proxy = {
                    host: proxy.host,
                    port: proxy.port,
                    protocol: proxy.protocol
                };
                if (proxy.username && proxy.password) {
                    config.proxy.auth = {
                        username: proxy.username,
                        password: proxy.password
                    };
                }
            }
            const response = await axios_1.default.get(request.url, config);
            return {
                url: request.url,
                statusCode: response.status,
                content: response.data,
                metadata: {
                    headers: response.headers,
                    method: 'http'
                },
                request
            };
        }
        catch (error) {
            // Wrap in a typed error; keep the original as `cause` for the stack.
            const scraperError = new Error(error.message, { cause: error });
            if (error.code === 'ETIMEDOUT' || error.code === 'ECONNABORTED') {
                scraperError.type = types_1.ErrorType.TIMEOUT;
            }
            else if (error.code === 'ECONNREFUSED' || error.code === 'ENOTFOUND') {
                scraperError.type = types_1.ErrorType.NETWORK_ERROR;
            }
            else {
                scraperError.type = types_1.ErrorType.UNKNOWN;
            }
            scraperError.retryable = true;
            scraperError.request = request;
            throw scraperError;
        }
    }
    /**
     * Browser-based fetch (for JS-heavy sites).
     * Serializes access to the single shared page via `pageInUse`, navigates,
     * waits for render, auto-scrolls to trigger lazy loading, and returns the
     * rendered HTML.
     * @param {object} request - scrape request with url and metadata
     * @returns {Promise<object>} response envelope { url, statusCode, content, metadata, request }
     * @throws {Error} with .type, .retryable (false only for NOT_FOUND), .request
     */
    async browserFetch(request) {
        // Wait if page is in use (safe: no await between the final check and the
        // set below, so two waiters cannot both acquire the lock).
        while (this.pageInUse) {
            await new Promise(resolve => setTimeout(resolve, 100));
        }
        this.pageInUse = true;
        try {
            const page = await this.getPage();
            // Apply stealth mode if required — only once per page; the registered
            // init scripts persist across navigations, and re-registering would
            // stack duplicates on the reused page.
            if (request.metadata.requiresStealth && !this.stealthApplied) {
                await this.makePageStealthy(page);
                this.stealthApplied = true;
            }
            // Set user agent
            if (request.metadata.userAgent) {
                await page.setUserAgent(request.metadata.userAgent);
            }
            // Navigate to page
            const response = await page.goto(request.url, {
                waitUntil: 'domcontentloaded',
                timeout: 60000
            });
            if (!response) {
                throw new Error('Navigation failed - no response');
            }
            // Wait for initial render.
            // NOTE(review): page.waitForTimeout was removed in Puppeteer v22+ —
            // fine for the version pinned here; confirm before upgrading.
            await page.waitForTimeout(3000);
            // Check for lazy-loaded content
            await this.autoScroll(page);
            // Get page content
            const content = await page.content();
            const statusCode = response.status();
            return {
                url: request.url,
                statusCode,
                content,
                metadata: {
                    method: 'browser',
                    finalUrl: page.url()
                },
                request
            };
        }
        catch (error) {
            // Classify by message text (Puppeteer errors carry no stable codes).
            const scraperError = new Error(error.message, { cause: error });
            if (error.message.includes('timeout') || error.message.includes('Navigation timeout')) {
                scraperError.type = types_1.ErrorType.TIMEOUT;
            }
            else if (error.message.includes('net::')) {
                scraperError.type = types_1.ErrorType.NETWORK_ERROR;
            }
            else if (error.message.includes('404')) {
                scraperError.type = types_1.ErrorType.NOT_FOUND;
            }
            else {
                scraperError.type = types_1.ErrorType.UNKNOWN;
            }
            scraperError.retryable = scraperError.type !== types_1.ErrorType.NOT_FOUND;
            scraperError.request = request;
            throw scraperError;
        }
        finally {
            // Always release the page lock, even on error.
            this.pageInUse = false;
        }
    }
    /**
     * Auto-scroll to load lazy content: scrolls down in 500px steps (capped at
     * 20 steps), then returns to the top. Best-effort — failures are logged,
     * never thrown.
     * @param {import('puppeteer').Page} page
     */
    async autoScroll(page) {
        try {
            await page.evaluate(async () => {
                await new Promise((resolve) => {
                    let totalHeight = 0;
                    const distance = 500;
                    const maxScrolls = 20; // Prevent infinite scrolling
                    let scrollCount = 0;
                    const timer = setInterval(() => {
                        // @ts-ignore - runs in browser context
                        const scrollHeight = document.body.scrollHeight;
                        // @ts-ignore - runs in browser context
                        window.scrollBy(0, distance);
                        totalHeight += distance;
                        scrollCount++;
                        if (totalHeight >= scrollHeight || scrollCount >= maxScrolls) {
                            clearInterval(timer);
                            // Scroll back to top
                            // @ts-ignore - runs in browser context
                            window.scrollTo(0, 0);
                            resolve();
                        }
                    }, 200);
                });
            });
            // Wait for any lazy-loaded content
            await page.waitForTimeout(1000);
        }
        catch (error) {
            logger_1.logger.warn('scraper', `Auto-scroll failed: ${error}`);
        }
    }
    /**
     * Main fetch method - tries HTTP first, falls back to browser.
     * Goes straight to the browser when request.metadata.requiresBrowser is set;
     * otherwise attempts httpFetch and falls back to the browser on any non-2xx
     * status or transport error.
     * @param {object} request - scrape request with url and metadata
     * @returns {Promise<object>} response envelope from httpFetch or browserFetch
     * @throws whatever the final (browser) attempt throws
     */
    async fetch(request) {
        const startTime = Date.now();
        try {
            // Force browser mode if required
            if (request.metadata.requiresBrowser) {
                logger_1.logger.debug('scraper', `Browser fetch: ${request.url}`);
                const response = await this.browserFetch(request);
                logger_1.logger.debug('scraper', `Fetch completed in ${Date.now() - startTime}ms`);
                return response;
            }
            // Try HTTP first (faster)
            try {
                logger_1.logger.debug('scraper', `HTTP fetch: ${request.url}`);
                const response = await this.httpFetch(request);
                // Check if we got a meaningful response
                if (response.statusCode && response.statusCode >= 200 && response.statusCode < 300) {
                    logger_1.logger.debug('scraper', `HTTP fetch succeeded in ${Date.now() - startTime}ms`);
                    return response;
                }
                // Fall through to browser mode for non-2xx responses
                logger_1.logger.debug('scraper', `HTTP got ${response.statusCode || 'unknown'}, trying browser`);
            }
            catch (httpError) {
                logger_1.logger.debug('scraper', `HTTP failed, falling back to browser: ${httpError}`);
            }
            // Fall back to browser. Deliberately mutates the caller's request so
            // retries of the same request skip the failed HTTP path.
            request.metadata.requiresBrowser = true;
            const response = await this.browserFetch(request);
            logger_1.logger.debug('scraper', `Browser fetch completed in ${Date.now() - startTime}ms`);
            return response;
        }
        catch (error) {
            logger_1.logger.error('scraper', `Fetch failed after ${Date.now() - startTime}ms: ${error}`);
            throw error;
        }
    }
    /**
     * Evaluate JavaScript in the current page context.
     * @param {Function} fn - function serialized into the page
     * @returns {Promise<*>} the evaluation result
     * @throws {Error} if no page is currently open
     */
    async evaluate(fn) {
        if (!this.page || this.page.isClosed()) {
            throw new Error('No active page for evaluation');
        }
        return await this.page.evaluate(fn);
    }
    /**
     * Get the current page (for custom operations).
     * @returns {Promise<import('puppeteer').Page | null>} the shared page, or null if none
     */
    async getCurrentPage() {
        return this.page;
    }
    /**
     * Close the browser.
     * The browser is closed (and state reset) even if closing the page throws,
     * so a page-level failure cannot leak the Chromium process.
     */
    async close() {
        try {
            if (this.page && !this.page.isClosed()) {
                await this.page.close();
            }
        }
        finally {
            this.page = null;
            this.stealthApplied = false;
            if (this.browser && this.browser.isConnected()) {
                await this.browser.close();
                logger_1.logger.info('scraper', 'Browser closed');
            }
            this.browser = null;
        }
    }
    /**
     * Clean up resources (alias for close()).
     */
    async cleanup() {
        await this.close();
    }
}
|
|
exports.Downloader = Downloader;