The job_run_logs table tracks scheduled job orchestration, not individual worker jobs. Worker info (worker_id, worker_hostname) belongs on dispensary_crawl_jobs, not job_run_logs. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
503 lines
20 KiB
JavaScript
503 lines
20 KiB
JavaScript
"use strict";
|
|
// TypeScript CommonJS interop helper: reuse a helper already present on
// `this`, otherwise define one. A module flagged with `__esModule` is
// returned untouched; any other value is wrapped so that it can be reached
// through `.default`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
|
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
exports.Downloader = void 0;
|
|
const puppeteer_1 = __importDefault(require("puppeteer"));
|
|
const axios_1 = __importDefault(require("axios"));
|
|
const types_1 = require("./types");
|
|
const logger_1 = require("../services/logger");
|
|
// Candidate values for randomized browser fingerprints.
// generateRandomFingerprint() draws one entry from each table independently.

// Screen sizes, written as [width, height] pairs and expanded to
// { width, height } objects.
const SCREEN_RESOLUTIONS = [
    [1920, 1080],
    [1366, 768],
    [1536, 864],
    [1440, 900],
    [1280, 720],
    [2560, 1440],
    [1680, 1050],
    [1600, 900],
].map(([width, height]) => ({ width, height }));

// IANA timezone identifiers.
const TIMEZONES = [
    'America/New_York',
    'America/Chicago',
    'America/Denver',
    'America/Los_Angeles',
    'America/Phoenix',
];

// Each entry is a complete `navigator.languages` list.
const LANGUAGES = [
    ['en-US', 'en'],
    ['en-US', 'en', 'es'],
    ['en-US'],
];

// Values reported as `navigator.platform`.
const PLATFORMS = [
    'Win32',
    'MacIntel',
    'Linux x86_64',
];

// Strings reported for the unmasked WebGL vendor query.
const WEBGL_VENDORS = [
    'Google Inc. (NVIDIA)',
    'Google Inc. (Intel)',
    'Google Inc. (AMD)',
    'Intel Inc.',
    'NVIDIA Corporation',
];

// Strings reported for the unmasked WebGL renderer query.
// NOTE(review): vendor and renderer are picked independently of each other,
// so mismatched pairs are possible - confirm that is acceptable.
const WEBGL_RENDERERS = [
    'ANGLE (NVIDIA GeForce GTX 1080 Direct3D11 vs_5_0 ps_5_0)',
    'ANGLE (Intel(R) UHD Graphics 630 Direct3D11 vs_5_0 ps_5_0)',
    'ANGLE (AMD Radeon RX 580 Series Direct3D11 vs_5_0 ps_5_0)',
    'Intel Iris OpenGL Engine',
    'NVIDIA GeForce RTX 3070/PCIe/SSE2',
    'AMD Radeon Pro 5500M OpenGL Engine',
];
|
|
/**
 * Pick one element of `items` at a uniformly random index.
 *
 * @param {Array} items - Non-empty list to draw from.
 * @returns {*} The chosen element itself (not a copy).
 */
function pickRandom(items) {
    return items[Math.floor(Math.random() * items.length)];
}
/**
 * Build a browser fingerprint by drawing one value for each attribute.
 *
 * Every attribute is drawn independently, so any combination of the table
 * entries can occur. `screen` and `languages` are the table entries
 * themselves, not copies - callers should treat them as read-only.
 *
 * @returns {{screen: {width: number, height: number}, timezone: string,
 *   languages: string[], platform: string, hardwareConcurrency: number,
 *   deviceMemory: number, webglVendor: string, webglRenderer: string}}
 */
function generateRandomFingerprint() {
    return {
        screen: pickRandom(SCREEN_RESOLUTIONS),
        timezone: pickRandom(TIMEZONES),
        languages: pickRandom(LANGUAGES),
        platform: pickRandom(PLATFORMS),
        // pickRandom derives the bound from the list itself, so these inline
        // lists can grow or shrink without a separate length to keep in sync.
        hardwareConcurrency: pickRandom([4, 8, 12, 16]),
        deviceMemory: pickRandom([4, 8, 16, 32]),
        webglVendor: pickRandom(WEBGL_VENDORS),
        webglRenderer: pickRandom(WEBGL_RENDERERS),
    };
}
|
|
/**
 * Resolve after `ms` milliseconds.
 *
 * Used in place of `page.waitForTimeout`, which Puppeteer deprecated and
 * which is absent from newer major versions.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
}
/**
 * Downloads pages either with a plain HTTP GET (axios) or through a single
 * shared headless Puppeteer page that carries a randomized fingerprint.
 *
 * One browser and one page are kept per instance; `browserFetch` calls are
 * serialized through the `pageInUse` flag.
 */
class Downloader {
    browser = null;
    page = null;
    pageInUse = false;
    currentFingerprint = generateRandomFingerprint();
    needsNewFingerprint = false;
    // Pages that already received the fingerprint init script, so repeated
    // applyFingerprint/makePageStealthy calls do not register it again.
    fingerprintedPages = new WeakSet();
    /**
     * Schedule a fingerprint rotation. The next getBrowser()/getPage() call
     * closes the current browser and launches a new one with a fresh
     * fingerprint.
     */
    rotateFingerprint() {
        this.needsNewFingerprint = true;
        logger_1.logger.info('scraper', '🔄 Fingerprint rotation scheduled');
    }
    /**
     * Return the shared browser, launching it when there is none or it has
     * disconnected.
     *
     * @param {boolean} [forceNew=false] - Close the current browser and start
     *   over with a newly generated fingerprint. A pending rotateFingerprint()
     *   request has the same effect.
     * @returns {Promise<object>} The connected Puppeteer browser.
     */
    async getBrowser(forceNew = false) {
        // Create new browser if needed for fingerprint rotation
        if (forceNew || this.needsNewFingerprint) {
            await this.close();
            this.currentFingerprint = generateRandomFingerprint();
            this.needsNewFingerprint = false;
            logger_1.logger.info('scraper', `🎭 New fingerprint: ${this.currentFingerprint.screen.width}x${this.currentFingerprint.screen.height}, ${this.currentFingerprint.timezone}, ${this.currentFingerprint.platform}`);
        }
        if (!this.browser || !this.browser.isConnected()) {
            const { screen } = this.currentFingerprint;
            const launchOptions = {
                headless: 'new',
                args: [
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-blink-features=AutomationControlled',
                    `--window-size=${screen.width},${screen.height}`,
                    '--disable-web-security',
                    '--disable-features=IsolateOrigins,site-per-process',
                    '--disable-infobars',
                    '--disable-extensions',
                ]
            };
            this.browser = await puppeteer_1.default.launch(launchOptions);
            logger_1.logger.info('scraper', 'Browser instance created');
        }
        return this.browser;
    }
    /**
     * Return the shared page, creating and fingerprinting a new one when
     * there is none, it is closed, `forceNew` is set, or a fingerprint
     * rotation is pending.
     *
     * @param {boolean} [forceNew=false] - Discard the current browser/page and
     *   create new ones.
     * @returns {Promise<object>} The ready-to-use Puppeteer page.
     */
    async getPage(forceNew = false) {
        // A pending rotation must replace the cached page too; otherwise
        // getBrowser() is never reached and the rotation never happens while
        // the old page stays open.
        const mustReplace = forceNew || this.needsNewFingerprint;
        if (!this.page || this.page.isClosed() || mustReplace) {
            const browser = await this.getBrowser(forceNew);
            const page = await browser.newPage();
            try {
                const { screen } = this.currentFingerprint;
                await page.setViewport({
                    width: screen.width,
                    height: screen.height,
                    deviceScaleFactor: 1,
                });
                // Apply fingerprint
                await this.applyFingerprint(page);
            }
            catch (error) {
                // Do not cache a half-configured page. Closing is best-effort:
                // the setup failure is the error worth reporting.
                try {
                    await page.close();
                }
                catch (closeError) {
                    logger_1.logger.debug('scraper', `Failed to close unconfigured page: ${closeError}`);
                }
                throw error;
            }
            this.page = page;
            logger_1.logger.debug('scraper', 'New page created with fingerprint');
        }
        return this.page;
    }
    /**
     * Install the current fingerprint on `page`: an init script that runs in
     * every new document, plus a timezone override sent over CDP.
     *
     * Does nothing for a page that was already fingerprinted by this
     * instance, because each application would register one more init script
     * and open one more CDP session.
     *
     * @param {object} page - Puppeteer page to fingerprint.
     * @returns {Promise<void>}
     */
    async applyFingerprint(page) {
        if (this.fingerprintedPages.has(page)) {
            return;
        }
        const fp = this.currentFingerprint;
        // The callback below is serialized and executed in the browser, so it
        // may only use its `fingerprint` argument and browser globals.
        await page.evaluateOnNewDocument((fingerprint) => {
            // Hide webdriver
            Object.defineProperty(navigator, 'webdriver', {
                get: () => false,
            });
            // Spoof platform
            Object.defineProperty(navigator, 'platform', {
                get: () => fingerprint.platform,
            });
            // Spoof languages
            Object.defineProperty(navigator, 'languages', {
                get: () => fingerprint.languages,
            });
            // Spoof hardware concurrency
            Object.defineProperty(navigator, 'hardwareConcurrency', {
                get: () => fingerprint.hardwareConcurrency,
            });
            // Spoof device memory
            Object.defineProperty(navigator, 'deviceMemory', {
                get: () => fingerprint.deviceMemory,
            });
            // Spoof plugins (realistic count)
            Object.defineProperty(navigator, 'plugins', {
                get: () => {
                    const plugins = [];
                    for (let i = 0; i < 5; i++) {
                        plugins.push({
                            name: `Plugin ${i}`,
                            filename: `plugin${i}.dll`,
                            description: `Description ${i}`,
                        });
                    }
                    plugins.length = 5;
                    return plugins;
                },
            });
            // Chrome object
            window.chrome = {
                runtime: {},
                loadTimes: () => ({}),
                csi: () => ({}),
                app: {},
            };
            // Permissions: report notifications as denied and delegate every
            // other query to the native implementation. The native method
            // must stay bound to the permissions object; calling it detached
            // fails with an illegal-invocation TypeError.
            const permissions = window.navigator.permissions;
            if (permissions && typeof permissions.query === 'function') {
                const originalQuery = permissions.query.bind(permissions);
                permissions.query = (parameters) => parameters.name === 'notifications'
                    ? Promise.resolve({ state: 'denied' })
                    : originalQuery(parameters);
            }
            // WebGL fingerprint spoofing
            const getParameterProxyHandler = {
                apply: function (target, thisArg, argumentsList) {
                    const param = argumentsList[0];
                    // UNMASKED_VENDOR_WEBGL
                    if (param === 37445) {
                        return fingerprint.webglVendor;
                    }
                    // UNMASKED_RENDERER_WEBGL
                    if (param === 37446) {
                        return fingerprint.webglRenderer;
                    }
                    return Reflect.apply(target, thisArg, argumentsList);
                }
            };
            // Override WebGL
            const originalGetContext = HTMLCanvasElement.prototype.getContext;
            HTMLCanvasElement.prototype.getContext = function (type, ...args) {
                const context = originalGetContext.call(this, type, ...args);
                if (context && (type === 'webgl' || type === 'webgl2' || type === 'experimental-webgl')) {
                    const glContext = context;
                    const originalGetParameter = glContext.getParameter.bind(glContext);
                    glContext.getParameter = new Proxy(originalGetParameter, getParameterProxyHandler);
                }
                return context;
            };
            // Canvas fingerprint noise
            const originalToDataURL = HTMLCanvasElement.prototype.toDataURL;
            HTMLCanvasElement.prototype.toDataURL = function (...args) {
                const context = this.getContext('2d');
                // getImageData throws for a zero-sized canvas, so only add
                // noise when there are pixels to touch.
                if (context && this.width > 0 && this.height > 0) {
                    const imageData = context.getImageData(0, 0, this.width, this.height);
                    for (let i = 0; i < imageData.data.length; i += 4) {
                        // Randomly flip the lowest bit of the first (red)
                        // channel of each pixel.
                        imageData.data[i] = imageData.data[i] ^ (Math.random() > 0.5 ? 1 : 0);
                    }
                    context.putImageData(imageData, 0, 0);
                }
                // Forward every argument (type and quality), not just the type.
                return originalToDataURL.apply(this, args);
            };
            // Screen dimensions
            Object.defineProperty(window.screen, 'width', { get: () => fingerprint.screen.width });
            Object.defineProperty(window.screen, 'height', { get: () => fingerprint.screen.height });
            Object.defineProperty(window.screen, 'availWidth', { get: () => fingerprint.screen.width });
            Object.defineProperty(window.screen, 'availHeight', { get: () => fingerprint.screen.height - 40 });
            Object.defineProperty(window, 'innerWidth', { get: () => fingerprint.screen.width });
            Object.defineProperty(window, 'innerHeight', { get: () => fingerprint.screen.height - 140 });
            Object.defineProperty(window, 'outerWidth', { get: () => fingerprint.screen.width });
            Object.defineProperty(window, 'outerHeight', { get: () => fingerprint.screen.height });
        }, fp);
        // Set timezone via CDP
        const client = await page.target().createCDPSession();
        await client.send('Emulation.setTimezoneOverride', { timezoneId: fp.timezone });
        // Mark only after everything succeeded so a failed attempt can be retried.
        this.fingerprintedPages.add(page);
    }
    /**
     * Apply stealth mode to page (legacy alias of applyFingerprint).
     *
     * @param {object} page - Puppeteer page to fingerprint.
     * @returns {Promise<void>}
     */
    async makePageStealthy(page) {
        // Now handled by applyFingerprint
        await this.applyFingerprint(page);
    }
    /**
     * Build the Chromium `--proxy-server` launch argument for a proxy.
     *
     * @param {{protocol: string, host: string, port: (number|string)}} proxy
     * @returns {string[]} A one-element argument list for socks5/http/https
     *   proxies, or an empty list for any other protocol.
     */
    getProxyArgs(proxy) {
        if (proxy.protocol === 'socks5') {
            return [`--proxy-server=socks5://${proxy.host}:${proxy.port}`];
        }
        else if (proxy.protocol === 'http' || proxy.protocol === 'https') {
            return [`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`];
        }
        return [];
    }
    /**
     * HTTP-based fetch (lightweight, fast): a single axios GET with a 30s
     * timeout that resolves for any HTTP status.
     *
     * @param {object} request - Must provide `url` and `metadata`; optional
     *   `metadata.userAgent`, `metadata.headers` and `metadata.proxy` are used.
     * @returns {Promise<object>} `{ url, statusCode, content, metadata, request }`
     *   with `metadata.method === 'http'`.
     * @throws {Error} Carrying `type`, `retryable` (always true), `request`
     *   and the original error as `cause`.
     */
    async httpFetch(request) {
        try {
            const config = {
                timeout: 30000,
                headers: {
                    'User-Agent': request.metadata.userAgent || 'Mozilla/5.0',
                    ...request.metadata.headers
                },
                validateStatus: () => true // Don't throw on any status
            };
            // Add proxy if available
            if (request.metadata.proxy) {
                const proxy = request.metadata.proxy;
                config.proxy = {
                    host: proxy.host,
                    port: proxy.port,
                    protocol: proxy.protocol
                };
                if (proxy.username && proxy.password) {
                    config.proxy.auth = {
                        username: proxy.username,
                        password: proxy.password
                    };
                }
            }
            const response = await axios_1.default.get(request.url, config);
            return {
                url: request.url,
                statusCode: response.status,
                content: response.data,
                metadata: {
                    headers: response.headers,
                    method: 'http'
                },
                request
            };
        }
        catch (error) {
            // Tolerate non-Error throwables and keep the original as `cause`.
            const message = error instanceof Error ? error.message : String(error);
            const code = error == null ? undefined : error.code;
            const scraperError = new Error(message, { cause: error });
            if (code === 'ETIMEDOUT' || code === 'ECONNABORTED') {
                scraperError.type = types_1.ErrorType.TIMEOUT;
            }
            else if (code === 'ECONNREFUSED' || code === 'ENOTFOUND') {
                scraperError.type = types_1.ErrorType.NETWORK_ERROR;
            }
            else {
                scraperError.type = types_1.ErrorType.UNKNOWN;
            }
            scraperError.retryable = true;
            scraperError.request = request;
            throw scraperError;
        }
    }
    /**
     * Browser-based fetch (for JS-heavy sites). Waits until the shared page
     * is free, navigates it, waits for the product list (or empty state),
     * scrolls to trigger lazy loading and returns the rendered HTML.
     *
     * @param {object} request - Must provide `url` and `metadata`; optional
     *   `metadata.requiresStealth` and `metadata.userAgent` are honoured.
     * @returns {Promise<object>} `{ url, statusCode, content, metadata, request }`
     *   with `metadata.method === 'browser'` and `metadata.finalUrl`.
     * @throws {Error} Carrying `type`, `retryable` (false only for NOT_FOUND),
     *   `request` and the original error as `cause`.
     */
    async browserFetch(request) {
        // Wait if page is in use
        while (this.pageInUse) {
            await sleep(100);
        }
        this.pageInUse = true;
        try {
            const page = await this.getPage();
            // Apply stealth mode if required
            if (request.metadata.requiresStealth) {
                await this.makePageStealthy(page);
            }
            // Set user agent
            if (request.metadata.userAgent) {
                await page.setUserAgent(request.metadata.userAgent);
            }
            // Navigate to page - use networkidle2 for SPAs like Dutchie
            // Increased timeout to 90s - Dutchie pages can take 30-40s to fully load
            const response = await page.goto(request.url, {
                waitUntil: 'networkidle2',
                timeout: 90000
            });
            if (!response) {
                throw new Error('Navigation failed - no response');
            }
            // Wait for React to render product content
            // Try to wait for products, but don't fail if they don't appear (empty category)
            try {
                await page.waitForSelector('[data-testid="product-list-item"], [data-testid="empty-state"]', {
                    timeout: 10000
                });
            }
            catch {
                // Products might not exist in this category - continue anyway
                logger_1.logger.debug('scraper', 'No products found within timeout - continuing');
            }
            // Additional wait for any lazy-loaded content
            await sleep(2000);
            // Check for lazy-loaded content
            await this.autoScroll(page);
            // Get page content
            const content = await page.content();
            const statusCode = response.status();
            return {
                url: request.url,
                statusCode,
                content,
                metadata: {
                    method: 'browser',
                    finalUrl: page.url()
                },
                request
            };
        }
        catch (error) {
            // Tolerate non-Error throwables and keep the original as `cause`.
            const message = error instanceof Error ? error.message : String(error);
            const scraperError = new Error(message, { cause: error });
            // 'timeout' also matches Puppeteer's "Navigation timeout ..." text.
            if (message.includes('timeout')) {
                scraperError.type = types_1.ErrorType.TIMEOUT;
            }
            else if (message.includes('net::')) {
                scraperError.type = types_1.ErrorType.NETWORK_ERROR;
            }
            else if (message.includes('404')) {
                scraperError.type = types_1.ErrorType.NOT_FOUND;
            }
            else {
                scraperError.type = types_1.ErrorType.UNKNOWN;
            }
            scraperError.retryable = scraperError.type !== types_1.ErrorType.NOT_FOUND;
            scraperError.request = request;
            throw scraperError;
        }
        finally {
            // Always release the page, even when the fetch failed.
            this.pageInUse = false;
        }
    }
    /**
     * Auto-scroll to load lazy content: scroll down 500px every 200ms until
     * the accumulated distance reaches the body height or 20 steps were made,
     * then jump back to the top and wait one more second. Failures are
     * logged as warnings and not rethrown.
     *
     * @param {object} page - Puppeteer page to scroll.
     * @returns {Promise<void>}
     */
    async autoScroll(page) {
        try {
            await page.evaluate(async () => {
                await new Promise((resolve) => {
                    let totalHeight = 0;
                    const distance = 500;
                    const maxScrolls = 20; // Prevent infinite scrolling
                    let scrollCount = 0;
                    const timer = setInterval(() => {
                        // @ts-ignore - runs in browser context
                        const scrollHeight = document.body.scrollHeight;
                        // @ts-ignore - runs in browser context
                        window.scrollBy(0, distance);
                        totalHeight += distance;
                        scrollCount++;
                        if (totalHeight >= scrollHeight || scrollCount >= maxScrolls) {
                            clearInterval(timer);
                            // Scroll back to top
                            // @ts-ignore - runs in browser context
                            window.scrollTo(0, 0);
                            resolve();
                        }
                    }, 200);
                });
            });
            // Wait for any lazy-loaded content
            await sleep(1000);
        }
        catch (error) {
            logger_1.logger.warn('scraper', `Auto-scroll failed: ${error}`);
        }
    }
    /**
     * Main fetch method - tries HTTP first, falls back to browser.
     *
     * When `request.metadata.requiresBrowser` is set the HTTP attempt is
     * skipped. A non-2xx HTTP status or an HTTP failure triggers the browser
     * fallback, which also sets `request.metadata.requiresBrowser = true` on
     * the caller's request object.
     *
     * @param {object} request - Must provide `url` and `metadata`.
     * @returns {Promise<object>} The response of whichever fetcher succeeded.
     * @throws {Error} The browserFetch error, after logging it.
     */
    async fetch(request) {
        const startTime = Date.now();
        try {
            // Force browser mode if required
            if (request.metadata.requiresBrowser) {
                logger_1.logger.debug('scraper', `Browser fetch: ${request.url}`);
                const response = await this.browserFetch(request);
                logger_1.logger.debug('scraper', `Fetch completed in ${Date.now() - startTime}ms`);
                return response;
            }
            // Try HTTP first (faster)
            try {
                logger_1.logger.debug('scraper', `HTTP fetch: ${request.url}`);
                const response = await this.httpFetch(request);
                // Check if we got a meaningful response
                if (response.statusCode && response.statusCode >= 200 && response.statusCode < 300) {
                    logger_1.logger.debug('scraper', `HTTP fetch succeeded in ${Date.now() - startTime}ms`);
                    return response;
                }
                // Fall through to browser mode for non-2xx responses
                logger_1.logger.debug('scraper', `HTTP got ${response.statusCode || 'unknown'}, trying browser`);
            }
            catch (httpError) {
                logger_1.logger.debug('scraper', `HTTP failed, falling back to browser: ${httpError}`);
            }
            // Fall back to browser
            request.metadata.requiresBrowser = true;
            const response = await this.browserFetch(request);
            logger_1.logger.debug('scraper', `Browser fetch completed in ${Date.now() - startTime}ms`);
            return response;
        }
        catch (error) {
            logger_1.logger.error('scraper', `Fetch failed after ${Date.now() - startTime}ms: ${error}`);
            throw error;
        }
    }
    /**
     * Evaluate JavaScript in the current page context.
     *
     * @param {Function|string} fn - Passed straight to `page.evaluate`.
     * @returns {Promise<*>} Whatever `page.evaluate` resolves to.
     * @throws {Error} When there is no open page.
     */
    async evaluate(fn) {
        if (!this.page || this.page.isClosed()) {
            throw new Error('No active page for evaluation');
        }
        return await this.page.evaluate(fn);
    }
    /**
     * Get the current page (for custom operations).
     *
     * @returns {Promise<object|null>} The cached page, or null when none exists.
     */
    async getCurrentPage() {
        return this.page;
    }
    /**
     * Close the page and the browser and drop both references.
     *
     * The browser is closed even when closing the page fails; the page error
     * is still propagated afterwards.
     *
     * @returns {Promise<void>}
     */
    async close() {
        try {
            if (this.page && !this.page.isClosed()) {
                await this.page.close();
            }
        }
        finally {
            this.page = null;
            if (this.browser && this.browser.isConnected()) {
                await this.browser.close();
                logger_1.logger.info('scraper', 'Browser closed');
            }
            // Reached only when the browser is gone (closed or already
            // disconnected); a failed browser.close() keeps the handle.
            this.browser = null;
        }
    }
    /**
     * Clean up resources (alias of close()).
     *
     * @returns {Promise<void>}
     */
    async cleanup() {
        await this.close();
    }
}
|
|
exports.Downloader = Downloader;
|