Files
cannaiq/backend/dist/scraper-v2/downloader.js
Kelly 66e07b2009 fix(monitor): remove non-existent worker columns from job_run_logs query
The job_run_logs table tracks scheduled job orchestration, not individual
worker jobs. Worker info (worker_id, worker_hostname) belongs on
dispensary_crawl_jobs, not job_run_logs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-03 18:45:05 -07:00

503 lines
20 KiB
JavaScript

"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.Downloader = void 0;
const puppeteer_1 = __importDefault(require("puppeteer"));
const axios_1 = __importDefault(require("axios"));
const types_1 = require("./types");
const logger_1 = require("../services/logger");
// Fingerprint profiles for randomization.
// Each array below is a pool of candidate values; generateRandomFingerprint()
// draws one entry from every pool, independently of the others.

// Candidate screen sizes in pixels (also used for the browser window size and
// page viewport by the Downloader class).
const SCREEN_RESOLUTIONS = [
{ width: 1920, height: 1080 },
{ width: 1366, height: 768 },
{ width: 1536, height: 864 },
{ width: 1440, height: 900 },
{ width: 1280, height: 720 },
{ width: 2560, height: 1440 },
{ width: 1680, height: 1050 },
{ width: 1600, height: 900 },
];
// Candidate IANA timezone identifiers (all US zones) used for the page's
// timezone override.
const TIMEZONES = [
'America/New_York',
'America/Chicago',
'America/Denver',
'America/Los_Angeles',
'America/Phoenix',
];
// Candidate values reported through navigator.languages; each entry is a
// complete language list, not a single language.
const LANGUAGES = [
['en-US', 'en'],
['en-US', 'en', 'es'],
['en-US'],
];
// Candidate values reported through navigator.platform.
const PLATFORMS = [
'Win32',
'MacIntel',
'Linux x86_64',
];
// Candidate strings returned for the unmasked WebGL vendor query.
// NOTE(review): vendor and renderer are drawn independently, so mismatched
// combinations (e.g. an Intel vendor with an NVIDIA renderer) are possible.
const WEBGL_VENDORS = [
'Google Inc. (NVIDIA)',
'Google Inc. (Intel)',
'Google Inc. (AMD)',
'Intel Inc.',
'NVIDIA Corporation',
];
// Candidate strings returned for the unmasked WebGL renderer query.
const WEBGL_RENDERERS = [
'ANGLE (NVIDIA GeForce GTX 1080 Direct3D11 vs_5_0 ps_5_0)',
'ANGLE (Intel(R) UHD Graphics 630 Direct3D11 vs_5_0 ps_5_0)',
'ANGLE (AMD Radeon RX 580 Series Direct3D11 vs_5_0 ps_5_0)',
'Intel Iris OpenGL Engine',
'NVIDIA GeForce RTX 3070/PCIe/SSE2',
'AMD Radeon Pro 5500M OpenGL Engine',
];
/**
 * Assemble a browser fingerprint profile by drawing one random entry from
 * each candidate pool.
 *
 * Every attribute is drawn independently, so the combination is not
 * guaranteed to be mutually consistent (e.g. platform vs. WebGL renderer).
 *
 * @returns {{screen: {width: number, height: number}, timezone: string,
 *   languages: string[], platform: string, hardwareConcurrency: number,
 *   deviceMemory: number, webglVendor: string, webglRenderer: string}}
 */
function generateRandomFingerprint() {
    // Draw a single random element from a non-empty pool.
    const pick = (pool) => pool[Math.floor(Math.random() * pool.length)];

    const screen = pick(SCREEN_RESOLUTIONS);
    const timezone = pick(TIMEZONES);
    const languages = pick(LANGUAGES);
    const platform = pick(PLATFORMS);
    const hardwareConcurrency = pick([4, 8, 12, 16]);
    const deviceMemory = pick([4, 8, 16, 32]);
    const webglVendor = pick(WEBGL_VENDORS);
    const webglRenderer = pick(WEBGL_RENDERERS);

    return {
        screen,
        timezone,
        languages,
        platform,
        hardwareConcurrency,
        deviceMemory,
        webglVendor,
        webglRenderer,
    };
}
/**
 * Fetches pages for the scraper, either with a plain HTTP request (axios) or
 * through a single shared Puppeteer browser/page that carries a randomized
 * fingerprint.
 *
 * One page is reused across browser fetches; `pageInUse` serializes access to
 * it. Call `rotateFingerprint()` to have the next page acquisition relaunch
 * the browser with a fresh fingerprint.
 */
class Downloader {
    browser = null;
    page = null;
    pageInUse = false;
    currentFingerprint = generateRandomFingerprint();
    needsNewFingerprint = false;
    // Pages that already carry the fingerprint script. Re-applying it would
    // only stack duplicate init scripts (which fail when they try to redefine
    // the same non-configurable properties), so application is once per page.
    fingerprintedPages = new WeakSet();

    /**
     * Schedule a fingerprint rotation. The browser is relaunched with a new
     * fingerprint the next time `getBrowser()` or `getPage()` is called.
     */
    rotateFingerprint() {
        this.needsNewFingerprint = true;
        logger_1.logger.info('scraper', '🔄 Fingerprint rotation scheduled');
    }

    /**
     * Resolve after `ms` milliseconds.
     *
     * Used instead of Puppeteer's `page.waitForTimeout`, which is deprecated
     * and absent from newer Puppeteer releases.
     *
     * @param {number} ms - Delay in milliseconds.
     * @returns {Promise<void>}
     */
    delay(ms) {
        return new Promise((resolve) => setTimeout(resolve, ms));
    }

    /**
     * Return the shared browser, launching it if there is none or it has
     * disconnected.
     *
     * @param {boolean} [forceNew=false] - Close any existing browser and
     *   launch a new one with a freshly generated fingerprint.
     * @returns {Promise<object>} The Puppeteer browser instance.
     */
    async getBrowser(forceNew = false) {
        // Create new browser if needed for fingerprint rotation
        if (forceNew || this.needsNewFingerprint) {
            await this.close();
            this.currentFingerprint = generateRandomFingerprint();
            this.needsNewFingerprint = false;
            const { screen, timezone, platform } = this.currentFingerprint;
            logger_1.logger.info('scraper', `🎭 New fingerprint: ${screen.width}x${screen.height}, ${timezone}, ${platform}`);
        }
        if (!this.browser || !this.browser.isConnected()) {
            const { screen } = this.currentFingerprint;
            const launchOptions = {
                headless: 'new',
                args: [
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-blink-features=AutomationControlled',
                    `--window-size=${screen.width},${screen.height}`,
                    '--disable-web-security',
                    '--disable-features=IsolateOrigins,site-per-process',
                    '--disable-infobars',
                    '--disable-extensions',
                ],
            };
            this.browser = await puppeteer_1.default.launch(launchOptions);
            logger_1.logger.info('scraper', 'Browser instance created');
        }
        return this.browser;
    }

    /**
     * Return the shared page, creating it (with viewport and fingerprint
     * applied) when there is none, it was closed, `forceNew` is set, or a
     * fingerprint rotation is pending.
     *
     * @param {boolean} [forceNew=false] - Discard the current browser/page
     *   and start over with a new fingerprint.
     * @returns {Promise<object>} The Puppeteer page instance.
     */
    async getPage(forceNew = false) {
        // A pending rotation must invalidate the cached page; otherwise
        // rotateFingerprint() would never take effect while a page is open.
        const mustRecreate = forceNew
            || this.needsNewFingerprint
            || !this.page
            || this.page.isClosed();
        if (mustRecreate) {
            const browser = await this.getBrowser(forceNew);
            const page = await browser.newPage();
            try {
                const { screen } = this.currentFingerprint;
                await page.setViewport({
                    width: screen.width,
                    height: screen.height,
                    deviceScaleFactor: 1,
                });
                // Apply fingerprint
                await this.applyFingerprint(page);
            }
            catch (error) {
                // Do not publish a half-configured page. Closing is best
                // effort; the setup error is the one worth reporting.
                await page.close().catch(() => { });
                throw error;
            }
            this.page = page;
            logger_1.logger.debug('scraper', 'New page created with fingerprint');
        }
        return this.page;
    }

    /**
     * Install the current fingerprint on `page`: an init script that runs in
     * every new document plus a timezone override. Applied at most once per
     * page; later calls for the same page are no-ops.
     *
     * @param {object} page - Puppeteer page to fingerprint.
     * @returns {Promise<void>}
     */
    async applyFingerprint(page) {
        if (this.fingerprintedPages.has(page)) {
            return;
        }
        const fp = this.currentFingerprint;
        // NOTE: this callback is serialized and executed inside the browser,
        // so it must not reference anything from the enclosing Node scope
        // other than its `fingerprint` argument.
        await page.evaluateOnNewDocument((fingerprint) => {
            // Hide webdriver
            Object.defineProperty(navigator, 'webdriver', {
                get: () => false,
            });
            // Spoof platform
            Object.defineProperty(navigator, 'platform', {
                get: () => fingerprint.platform,
            });
            // Spoof languages
            Object.defineProperty(navigator, 'languages', {
                get: () => fingerprint.languages,
            });
            // Spoof hardware concurrency
            Object.defineProperty(navigator, 'hardwareConcurrency', {
                get: () => fingerprint.hardwareConcurrency,
            });
            // Spoof device memory
            Object.defineProperty(navigator, 'deviceMemory', {
                get: () => fingerprint.deviceMemory,
            });
            // Spoof plugins (fixed list of five placeholder entries)
            Object.defineProperty(navigator, 'plugins', {
                get: () => {
                    const plugins = [];
                    for (let i = 0; i < 5; i++) {
                        plugins.push({
                            name: `Plugin ${i}`,
                            filename: `plugin${i}.dll`,
                            description: `Description ${i}`,
                        });
                    }
                    return plugins;
                },
            });
            // Chrome object
            window.chrome = {
                runtime: {},
                loadTimes: () => ({}),
                csi: () => ({}),
                app: {},
            };
            // Permissions: report notifications as denied, delegate the rest.
            // The original method must stay bound to the Permissions object,
            // otherwise calling it throws "Illegal invocation".
            const permissions = window.navigator.permissions;
            if (permissions && typeof permissions.query === 'function') {
                const originalQuery = permissions.query.bind(permissions);
                permissions.query = (parameters) => (parameters && parameters.name === 'notifications'
                    ? Promise.resolve({ state: 'denied' })
                    : originalQuery(parameters));
            }
            // WebGL fingerprint spoofing
            const getParameterProxyHandler = {
                apply(target, thisArg, argumentsList) {
                    const param = argumentsList[0];
                    // UNMASKED_VENDOR_WEBGL
                    if (param === 37445) {
                        return fingerprint.webglVendor;
                    }
                    // UNMASKED_RENDERER_WEBGL
                    if (param === 37446) {
                        return fingerprint.webglRenderer;
                    }
                    return Reflect.apply(target, thisArg, argumentsList);
                },
            };
            // Override WebGL. getContext() hands back the same context object
            // on repeated calls, so remember which ones are already patched
            // instead of wrapping getParameter in another proxy every time.
            const patchedContexts = new WeakSet();
            const originalGetContext = HTMLCanvasElement.prototype.getContext;
            HTMLCanvasElement.prototype.getContext = function (type, ...args) {
                const context = originalGetContext.call(this, type, ...args);
                const isWebgl = type === 'webgl' || type === 'webgl2' || type === 'experimental-webgl';
                if (context && isWebgl && !patchedContexts.has(context)) {
                    const originalGetParameter = context.getParameter.bind(context);
                    context.getParameter = new Proxy(originalGetParameter, getParameterProxyHandler);
                    patchedContexts.add(context);
                }
                return context;
            };
            // Canvas fingerprint noise
            const originalToDataURL = HTMLCanvasElement.prototype.toDataURL;
            HTMLCanvasElement.prototype.toDataURL = function (...args) {
                // getImageData throws on zero-sized canvases, so skip those.
                if (this.width > 0 && this.height > 0) {
                    const context = this.getContext('2d');
                    if (context) {
                        const imageData = context.getImageData(0, 0, this.width, this.height);
                        for (let i = 0; i < imageData.data.length; i += 4) {
                            // Randomly flip the low bit of each pixel's red channel
                            imageData.data[i] = imageData.data[i] ^ (Math.random() > 0.5 ? 1 : 0);
                        }
                        context.putImageData(imageData, 0, 0);
                    }
                }
                // Forward every argument (type AND quality) to the real method.
                return originalToDataURL.apply(this, args);
            };
            // Screen dimensions
            Object.defineProperty(window.screen, 'width', { get: () => fingerprint.screen.width });
            Object.defineProperty(window.screen, 'height', { get: () => fingerprint.screen.height });
            Object.defineProperty(window.screen, 'availWidth', { get: () => fingerprint.screen.width });
            Object.defineProperty(window.screen, 'availHeight', { get: () => fingerprint.screen.height - 40 });
            Object.defineProperty(window, 'innerWidth', { get: () => fingerprint.screen.width });
            Object.defineProperty(window, 'innerHeight', { get: () => fingerprint.screen.height - 140 });
            Object.defineProperty(window, 'outerWidth', { get: () => fingerprint.screen.width });
            Object.defineProperty(window, 'outerHeight', { get: () => fingerprint.screen.height });
        }, fp);
        // Timezone override; the page-level API avoids opening a separate CDP
        // session that would never be detached.
        await page.emulateTimezone(fp.timezone);
        this.fingerprintedPages.add(page);
    }

    /**
     * Apply stealth mode to page (legacy - delegates to applyFingerprint).
     *
     * @param {object} page - Puppeteer page.
     * @returns {Promise<void>}
     */
    async makePageStealthy(page) {
        await this.applyFingerprint(page);
    }

    /**
     * Build the Chromium `--proxy-server` argument for a proxy description.
     *
     * NOTE(review): nothing in this class passes the result to
     * `puppeteer.launch`; browser fetches therefore do not go through the
     * request's proxy unless a caller wires this in elsewhere.
     *
     * @param {{protocol: string, host: string, port: number}} proxy
     * @returns {string[]} A one-element argument list, or `[]` for an
     *   unsupported protocol.
     */
    getProxyArgs(proxy) {
        if (proxy.protocol === 'socks5') {
            return [`--proxy-server=socks5://${proxy.host}:${proxy.port}`];
        }
        else if (proxy.protocol === 'http' || proxy.protocol === 'https') {
            return [`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`];
        }
        return [];
    }

    /**
     * HTTP-based fetch (lightweight, fast). Any HTTP status is returned as a
     * response; only transport-level failures throw.
     *
     * @param {object} request - Scraper request; reads `url` and
     *   `metadata.{userAgent, headers, proxy}`.
     * @returns {Promise<object>} Response with `url`, `statusCode`,
     *   `content`, `metadata` and the originating `request`.
     * @throws {Error} Error with `type`, `retryable` (always true),
     *   `request` and `cause` (the underlying error) attached.
     */
    async httpFetch(request) {
        try {
            const config = {
                timeout: 30000,
                headers: {
                    'User-Agent': request.metadata.userAgent || 'Mozilla/5.0',
                    ...request.metadata.headers,
                },
                validateStatus: () => true, // Don't throw on any status
            };
            // Add proxy if available
            const proxy = request.metadata.proxy;
            if (proxy) {
                config.proxy = {
                    host: proxy.host,
                    port: proxy.port,
                    protocol: proxy.protocol,
                };
                if (proxy.username && proxy.password) {
                    config.proxy.auth = {
                        username: proxy.username,
                        password: proxy.password,
                    };
                }
            }
            const response = await axios_1.default.get(request.url, config);
            return {
                url: request.url,
                statusCode: response.status,
                content: response.data,
                metadata: {
                    headers: response.headers,
                    method: 'http',
                },
                request,
            };
        }
        catch (error) {
            const message = error instanceof Error ? error.message : String(error);
            const code = error && error.code;
            const scraperError = new Error(message);
            // Keep the original error (and its stack) reachable for debugging.
            scraperError.cause = error;
            if (code === 'ETIMEDOUT' || code === 'ECONNABORTED') {
                scraperError.type = types_1.ErrorType.TIMEOUT;
            }
            else if (code === 'ECONNREFUSED' || code === 'ENOTFOUND') {
                scraperError.type = types_1.ErrorType.NETWORK_ERROR;
            }
            else {
                scraperError.type = types_1.ErrorType.UNKNOWN;
            }
            scraperError.retryable = true;
            scraperError.request = request;
            throw scraperError;
        }
    }

    /**
     * Browser-based fetch (for JS-heavy sites). Uses the shared page, so
     * concurrent calls wait for each other.
     *
     * @param {object} request - Scraper request; reads `url` and
     *   `metadata.{requiresStealth, userAgent}`.
     * @returns {Promise<object>} Response with `url`, `statusCode`, rendered
     *   HTML `content`, `metadata.finalUrl` and the originating `request`.
     * @throws {Error} Error with `type`, `retryable` (false only for
     *   NOT_FOUND), `request` and `cause` attached.
     */
    async browserFetch(request) {
        // Wait if page is in use
        while (this.pageInUse) {
            await this.delay(100);
        }
        this.pageInUse = true;
        try {
            const page = await this.getPage();
            // Apply stealth mode if required (no-op when already applied)
            if (request.metadata.requiresStealth) {
                await this.makePageStealthy(page);
            }
            // Set user agent
            if (request.metadata.userAgent) {
                await page.setUserAgent(request.metadata.userAgent);
            }
            // Navigate to page - use networkidle2 for SPAs like Dutchie
            // Increased timeout to 90s - Dutchie pages can take 30-40s to fully load
            const response = await page.goto(request.url, {
                waitUntil: 'networkidle2',
                timeout: 90000,
            });
            if (!response) {
                throw new Error('Navigation failed - no response');
            }
            // Wait for React to render product content
            // Try to wait for products, but don't fail if they don't appear (empty category)
            try {
                await page.waitForSelector('[data-testid="product-list-item"], [data-testid="empty-state"]', {
                    timeout: 10000,
                });
            }
            catch {
                // Products might not exist in this category - continue anyway
                logger_1.logger.debug('scraper', 'No products found within timeout - continuing');
            }
            // Additional wait for any lazy-loaded content
            await this.delay(2000);
            // Check for lazy-loaded content
            await this.autoScroll(page);
            // Get page content
            const content = await page.content();
            const statusCode = response.status();
            return {
                url: request.url,
                statusCode,
                content,
                metadata: {
                    method: 'browser',
                    finalUrl: page.url(),
                },
                request,
            };
        }
        catch (error) {
            // Non-Error throwables have no `.message`; normalise first so the
            // classification below cannot itself throw.
            const message = error instanceof Error ? error.message : String(error);
            const scraperError = new Error(message);
            scraperError.cause = error;
            const isTimeout = (error && error.name === 'TimeoutError')
                || message.includes('timeout')
                || message.includes('Navigation timeout');
            if (isTimeout) {
                scraperError.type = types_1.ErrorType.TIMEOUT;
            }
            else if (message.includes('net::')) {
                scraperError.type = types_1.ErrorType.NETWORK_ERROR;
            }
            else if (message.includes('404')) {
                scraperError.type = types_1.ErrorType.NOT_FOUND;
            }
            else {
                scraperError.type = types_1.ErrorType.UNKNOWN;
            }
            scraperError.retryable = scraperError.type !== types_1.ErrorType.NOT_FOUND;
            scraperError.request = request;
            throw scraperError;
        }
        finally {
            this.pageInUse = false;
        }
    }

    /**
     * Auto-scroll to load lazy content: scroll down in 500px steps every
     * 200ms (at most 20 steps), return to the top, then pause briefly.
     * Failures are logged and otherwise ignored.
     *
     * @param {object} page - Puppeteer page to scroll.
     * @returns {Promise<void>}
     */
    async autoScroll(page) {
        try {
            // Runs in the browser context.
            await page.evaluate(async () => {
                await new Promise((resolve) => {
                    let totalHeight = 0;
                    const distance = 500;
                    const maxScrolls = 20; // Prevent infinite scrolling
                    let scrollCount = 0;
                    const timer = setInterval(() => {
                        // A document without <body> would otherwise throw on
                        // every tick and leave this promise pending forever.
                        const scrollHeight = document.body ? document.body.scrollHeight : 0;
                        window.scrollBy(0, distance);
                        totalHeight += distance;
                        scrollCount++;
                        if (totalHeight >= scrollHeight || scrollCount >= maxScrolls) {
                            clearInterval(timer);
                            // Scroll back to top
                            window.scrollTo(0, 0);
                            resolve();
                        }
                    }, 200);
                });
            });
            // Wait for any lazy-loaded content
            await this.delay(1000);
        }
        catch (error) {
            logger_1.logger.warn('scraper', `Auto-scroll failed: ${error}`);
        }
    }

    /**
     * Main fetch method - tries HTTP first, falls back to browser.
     *
     * When the HTTP attempt fails or returns a non-2xx status, the request is
     * retried in the browser and `request.metadata.requiresBrowser` is set to
     * true on the caller's request object, so later fetches of the same
     * request go straight to the browser.
     *
     * @param {object} request - Scraper request.
     * @returns {Promise<object>} The response from whichever path succeeded.
     * @throws {Error} The error raised by the browser fetch.
     */
    async fetch(request) {
        const startTime = Date.now();
        try {
            // Force browser mode if required
            if (request.metadata.requiresBrowser) {
                logger_1.logger.debug('scraper', `Browser fetch: ${request.url}`);
                const response = await this.browserFetch(request);
                logger_1.logger.debug('scraper', `Fetch completed in ${Date.now() - startTime}ms`);
                return response;
            }
            // Try HTTP first (faster)
            try {
                logger_1.logger.debug('scraper', `HTTP fetch: ${request.url}`);
                const response = await this.httpFetch(request);
                // Check if we got a meaningful response
                if (response.statusCode && response.statusCode >= 200 && response.statusCode < 300) {
                    logger_1.logger.debug('scraper', `HTTP fetch succeeded in ${Date.now() - startTime}ms`);
                    return response;
                }
                // Fall through to browser mode for non-2xx responses
                logger_1.logger.debug('scraper', `HTTP got ${response.statusCode || 'unknown'}, trying browser`);
            }
            catch (httpError) {
                logger_1.logger.debug('scraper', `HTTP failed, falling back to browser: ${httpError}`);
            }
            // Fall back to browser
            request.metadata.requiresBrowser = true;
            const response = await this.browserFetch(request);
            logger_1.logger.debug('scraper', `Browser fetch completed in ${Date.now() - startTime}ms`);
            return response;
        }
        catch (error) {
            logger_1.logger.error('scraper', `Fetch failed after ${Date.now() - startTime}ms: ${error}`);
            throw error;
        }
    }

    /**
     * Evaluate JavaScript in the current page context.
     *
     * @param {Function|string} fn - Passed straight to `page.evaluate`.
     * @returns {Promise<*>} Whatever `page.evaluate` resolves to.
     * @throws {Error} If there is no open page.
     */
    async evaluate(fn) {
        if (!this.page || this.page.isClosed()) {
            throw new Error('No active page for evaluation');
        }
        return await this.page.evaluate(fn);
    }

    /**
     * Get the current page (for custom operations).
     *
     * @returns {Promise<object|null>} The shared page, or null if none.
     */
    async getCurrentPage() {
        return this.page;
    }

    /**
     * Close the page and the browser.
     *
     * Cleanup is best effort: a failure while closing the page must not stop
     * the browser from being closed, and the cached references are always
     * dropped so the next `getBrowser()`/`getPage()` starts from scratch.
     * Close failures are logged rather than thrown.
     *
     * @returns {Promise<void>}
     */
    async close() {
        const page = this.page;
        const browser = this.browser;
        this.page = null;
        this.browser = null;
        if (page && !page.isClosed()) {
            try {
                await page.close();
            }
            catch (error) {
                logger_1.logger.warn('scraper', `Failed to close page: ${error}`);
            }
        }
        if (browser && browser.isConnected()) {
            try {
                await browser.close();
                logger_1.logger.info('scraper', 'Browser closed');
            }
            catch (error) {
                logger_1.logger.warn('scraper', `Failed to close browser: ${error}`);
            }
        }
    }

    /**
     * Clean up resources (alias for `close()`).
     *
     * @returns {Promise<void>}
     */
    async cleanup() {
        await this.close();
    }
}
exports.Downloader = Downloader;