Files
cannaiq/backend/dist/scraper-v2/downloader.js
2025-11-28 19:45:44 -07:00

325 lines
12 KiB
JavaScript

"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.Downloader = void 0;
const puppeteer_1 = __importDefault(require("puppeteer"));
const axios_1 = __importDefault(require("axios"));
const types_1 = require("./types");
const logger_1 = require("../services/logger");
/**
 * Fetches pages either over plain HTTP (axios) or through a shared
 * Puppeteer browser page, and normalises failures into errors tagged with
 * `type`, `retryable` and `request`.
 *
 * A single browser and a single page are created lazily and reused for all
 * browser-based fetches; access to the page is serialised with `pageInUse`.
 */
class Downloader {
    browser = null;
    page = null;
    pageInUse = false;
    /** Pages on which the stealth init script has already been registered. */
    stealthPages = new WeakSet();

    /**
     * Resolve after `ms` milliseconds.
     * Used instead of `page.waitForTimeout`, which newer Puppeteer releases no longer provide.
     * @param {number} ms - delay in milliseconds
     * @returns {Promise<void>}
     */
    static sleep(ms) {
        return new Promise((resolve) => setTimeout(resolve, ms));
    }

    /**
     * Initialize browser instance (lazy initialization).
     * Launches a new browser when none exists or the existing one is disconnected.
     * @returns {Promise<object>} the connected Puppeteer browser
     */
    async getBrowser() {
        if (!this.browser || !this.browser.isConnected()) {
            // NOTE(review): '--disable-web-security' and the sandbox flags relax
            // browser protections; confirm they are acceptable for the deployment.
            const launchOptions = {
                headless: 'new',
                args: [
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-blink-features=AutomationControlled',
                    '--window-size=1920,1080',
                    '--disable-web-security',
                    '--disable-features=IsolateOrigins,site-per-process'
                ]
            };
            this.browser = await puppeteer_1.default.launch(launchOptions);
            logger_1.logger.info('scraper', 'Browser instance created');
        }
        return this.browser;
    }

    /**
     * Get or create the shared page instance (1920x1080 viewport).
     * @returns {Promise<object>} an open Puppeteer page
     */
    async getPage() {
        if (!this.page || this.page.isClosed()) {
            const browser = await this.getBrowser();
            this.page = await browser.newPage();
            await this.page.setViewport({ width: 1920, height: 1080 });
            logger_1.logger.debug('scraper', 'New page created');
        }
        return this.page;
    }

    /**
     * Apply stealth mode to page by registering an init script that masks
     * common automation fingerprints on every new document.
     *
     * The script is registered at most once per page: the page is reused across
     * fetches, and every `evaluateOnNewDocument` call adds another script.
     * @param {object} page - Puppeteer page
     * @returns {Promise<void>}
     */
    async makePageStealthy(page) {
        if (this.stealthPages.has(page)) {
            return;
        }
        // Everything inside this callback runs in the browser context.
        await page.evaluateOnNewDocument(() => {
            Object.defineProperty(navigator, 'webdriver', {
                get: () => false,
            });
            Object.defineProperty(navigator, 'plugins', {
                get: () => [1, 2, 3, 4, 5],
            });
            Object.defineProperty(navigator, 'languages', {
                get: () => ['en-US', 'en'],
            });
            window.chrome = {
                runtime: {},
            };
            const permissions = window.navigator.permissions;
            // Bind to the Permissions object: calling the detached method
            // loses `this` and fails with "Illegal invocation".
            const originalQuery = permissions.query.bind(permissions);
            permissions.query = (parameters) => parameters.name === 'notifications'
                ? Promise.resolve({ state: 'denied' })
                : originalQuery(parameters);
        });
        this.stealthPages.add(page);
    }

    /**
     * Build the Chromium `--proxy-server` argument for a proxy.
     * NOTE(review): nothing in this class passes the result to `getBrowser`,
     * so browser fetches currently do not go through the proxy - confirm intent.
     * @param {{protocol: string, host: string, port: (number|string)}} proxy
     * @returns {string[]} a one-element argument list, or [] for unsupported protocols
     */
    getProxyArgs(proxy) {
        if (proxy.protocol === 'socks5') {
            return [`--proxy-server=socks5://${proxy.host}:${proxy.port}`];
        }
        else if (proxy.protocol === 'http' || proxy.protocol === 'https') {
            return [`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`];
        }
        return [];
    }

    /**
     * HTTP-based fetch (lightweight, fast).
     * Never rejects on HTTP status; callers inspect `statusCode` themselves.
     * @param {object} request - must carry `url` and a `metadata` object
     * @returns {Promise<object>} `{ url, statusCode, content, metadata, request }`
     * @throws {Error} tagged with `type`, `retryable = true`, `request`, and the
     *   original failure as `cause`
     */
    async httpFetch(request) {
        try {
            const config = {
                timeout: 30000,
                headers: {
                    'User-Agent': request.metadata.userAgent || 'Mozilla/5.0',
                    ...request.metadata.headers
                },
                validateStatus: () => true // Don't throw on any status
            };
            // Add proxy if available
            // NOTE(review): the protocol is forwarded as-is, including 'socks5';
            // verify the HTTP client's proxy option supports it.
            if (request.metadata.proxy) {
                const proxy = request.metadata.proxy;
                config.proxy = {
                    host: proxy.host,
                    port: proxy.port,
                    protocol: proxy.protocol
                };
                if (proxy.username && proxy.password) {
                    config.proxy.auth = {
                        username: proxy.username,
                        password: proxy.password
                    };
                }
            }
            const response = await axios_1.default.get(request.url, config);
            return {
                url: request.url,
                statusCode: response.status,
                content: response.data,
                metadata: {
                    headers: response.headers,
                    method: 'http'
                },
                request
            };
        }
        catch (error) {
            // Tolerate non-Error throwables and keep the original for diagnostics.
            const message = error?.message ?? String(error);
            const code = error?.code;
            const scraperError = new Error(message, { cause: error });
            if (code === 'ETIMEDOUT' || code === 'ECONNABORTED') {
                scraperError.type = types_1.ErrorType.TIMEOUT;
            }
            else if (code === 'ECONNREFUSED' || code === 'ENOTFOUND') {
                scraperError.type = types_1.ErrorType.NETWORK_ERROR;
            }
            else {
                scraperError.type = types_1.ErrorType.UNKNOWN;
            }
            scraperError.retryable = true;
            scraperError.request = request;
            throw scraperError;
        }
    }

    /**
     * Browser-based fetch (for JS-heavy sites).
     * Waits for exclusive use of the shared page, navigates, pauses for the
     * initial render, scrolls to trigger lazy loading and returns the HTML.
     * @param {object} request - must carry `url` and a `metadata` object
     * @returns {Promise<object>} `{ url, statusCode, content, metadata, request }`
     * @throws {Error} tagged with `type`, `retryable`, `request`, and the
     *   original failure as `cause`
     */
    async browserFetch(request) {
        // Wait if page is in use
        while (this.pageInUse) {
            await Downloader.sleep(100);
        }
        this.pageInUse = true;
        try {
            const page = await this.getPage();
            // Apply stealth mode if required
            if (request.metadata.requiresStealth) {
                await this.makePageStealthy(page);
            }
            // Set user agent
            if (request.metadata.userAgent) {
                await page.setUserAgent(request.metadata.userAgent);
            }
            // Navigate to page
            const response = await page.goto(request.url, {
                waitUntil: 'domcontentloaded',
                timeout: 60000
            });
            if (!response) {
                throw new Error('Navigation failed - no response');
            }
            // Wait for initial render
            await Downloader.sleep(3000);
            // Check for lazy-loaded content
            await this.autoScroll(page);
            // Get page content
            const content = await page.content();
            const statusCode = response.status();
            return {
                url: request.url,
                statusCode,
                content,
                metadata: {
                    method: 'browser',
                    finalUrl: page.url()
                },
                request
            };
        }
        catch (error) {
            // Classification is a heuristic on the message text; guard against
            // throwables without a string message so the catch itself cannot fail.
            const message = String(error?.message ?? error);
            const scraperError = new Error(message, { cause: error });
            if (message.includes('timeout') || message.includes('Navigation timeout')) {
                scraperError.type = types_1.ErrorType.TIMEOUT;
            }
            else if (message.includes('net::')) {
                scraperError.type = types_1.ErrorType.NETWORK_ERROR;
            }
            else if (message.includes('404')) {
                scraperError.type = types_1.ErrorType.NOT_FOUND;
            }
            else {
                scraperError.type = types_1.ErrorType.UNKNOWN;
            }
            scraperError.retryable = scraperError.type !== types_1.ErrorType.NOT_FOUND;
            scraperError.request = request;
            throw scraperError;
        }
        finally {
            // Always release the page, even when navigation failed.
            this.pageInUse = false;
        }
    }

    /**
     * Auto-scroll to load lazy content.
     * Scrolls 500px every 200ms until the bottom is reached or 20 scrolls have
     * happened, returns to the top, then waits one second. Best-effort:
     * failures are logged as warnings and not rethrown.
     * @param {object} page - Puppeteer page
     * @returns {Promise<void>}
     */
    async autoScroll(page) {
        try {
            // The callback below runs in the browser context.
            await page.evaluate(async () => {
                await new Promise((resolve) => {
                    let totalHeight = 0;
                    const distance = 500;
                    const maxScrolls = 20; // Prevent infinite scrolling
                    let scrollCount = 0;
                    const timer = setInterval(() => {
                        const scrollHeight = document.body.scrollHeight;
                        window.scrollBy(0, distance);
                        totalHeight += distance;
                        scrollCount++;
                        if (totalHeight >= scrollHeight || scrollCount >= maxScrolls) {
                            clearInterval(timer);
                            // Scroll back to top
                            window.scrollTo(0, 0);
                            resolve();
                        }
                    }, 200);
                });
            });
            // Wait for any lazy-loaded content
            await Downloader.sleep(1000);
        }
        catch (error) {
            logger_1.logger.warn('scraper', `Auto-scroll failed: ${error}`);
        }
    }

    /**
     * Main fetch method - tries HTTP first, falls back to browser.
     * When `request.metadata.requiresBrowser` is set, HTTP is skipped. On
     * fallback the flag is set on the caller's request object.
     * @param {object} request - must carry `url` and a `metadata` object
     * @returns {Promise<object>} the response from `httpFetch` or `browserFetch`
     * @throws {Error} whatever `browserFetch` throws; logged before rethrowing
     */
    async fetch(request) {
        const startTime = Date.now();
        try {
            // Force browser mode if required
            if (request.metadata.requiresBrowser) {
                logger_1.logger.debug('scraper', `Browser fetch: ${request.url}`);
                const response = await this.browserFetch(request);
                logger_1.logger.debug('scraper', `Fetch completed in ${Date.now() - startTime}ms`);
                return response;
            }
            // Try HTTP first (faster)
            try {
                logger_1.logger.debug('scraper', `HTTP fetch: ${request.url}`);
                const response = await this.httpFetch(request);
                // Check if we got a meaningful response
                if (response.statusCode && response.statusCode >= 200 && response.statusCode < 300) {
                    logger_1.logger.debug('scraper', `HTTP fetch succeeded in ${Date.now() - startTime}ms`);
                    return response;
                }
                // Fall through to browser mode for non-2xx responses
                logger_1.logger.debug('scraper', `HTTP got ${response.statusCode || 'unknown'}, trying browser`);
            }
            catch (httpError) {
                logger_1.logger.debug('scraper', `HTTP failed, falling back to browser: ${httpError}`);
            }
            // Fall back to browser
            request.metadata.requiresBrowser = true;
            const response = await this.browserFetch(request);
            logger_1.logger.debug('scraper', `Browser fetch completed in ${Date.now() - startTime}ms`);
            return response;
        }
        catch (error) {
            logger_1.logger.error('scraper', `Fetch failed after ${Date.now() - startTime}ms: ${error}`);
            throw error;
        }
    }

    /**
     * Evaluate JavaScript in the current page context.
     * @param {Function|string} fn - function (or expression) to run in the page
     * @param {...*} args - optional arguments forwarded to `fn`
     * @returns {Promise<*>} the value produced by `page.evaluate`
     * @throws {Error} when there is no open page
     */
    async evaluate(fn, ...args) {
        if (!this.page || this.page.isClosed()) {
            throw new Error('No active page for evaluation');
        }
        return await this.page.evaluate(fn, ...args);
    }

    /**
     * Get the current page (for custom operations).
     * @returns {Promise<object|null>} the shared page, or null if none exists
     */
    async getCurrentPage() {
        return this.page;
    }

    /**
     * Close the page and the browser.
     * The browser is closed even when closing the page fails, so a page error
     * cannot leak the browser process; the page error still propagates.
     * @returns {Promise<void>}
     */
    async close() {
        const page = this.page;
        const browser = this.browser;
        // Drop references up front so later calls start from a clean state.
        this.page = null;
        this.browser = null;
        try {
            if (page && !page.isClosed()) {
                await page.close();
            }
        }
        finally {
            if (browser && browser.isConnected()) {
                await browser.close();
                logger_1.logger.info('scraper', 'Browser closed');
            }
        }
    }

    /**
     * Clean up resources (alias for `close`).
     * @returns {Promise<void>}
     */
    async cleanup() {
        await this.close();
    }
}
exports.Downloader = Downloader;