The job_run_logs table tracks scheduled job orchestration, not individual worker jobs. Worker info (worker_id, worker_hostname) belongs on dispensary_crawl_jobs, not job_run_logs.
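A sketch of that split, for illustration only (every identifier below except dispensary_crawl_jobs, worker_id, and worker_hostname is an assumption, not confirmed schema):

// Hypothetical write path: worker metadata lands on the per-job row...
// await pool.query(
//     `UPDATE dispensary_crawl_jobs SET worker_id = $1, worker_hostname = $2 WHERE id = $3`,
//     [workerId, require('os').hostname(), crawlJobId]
// );
// ...while job_run_logs keeps one orchestration-level row per scheduled run.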
"use strict";
|
|
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
};
|
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
const migrate_1 = require("../db/migrate");
|
|
const proxy_1 = require("../services/proxy");
|
|
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
|
|
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
|
|
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
|
|
const FIREFOX_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0';
// CLI arguments: worker count, target dispensary, and an optional 'no-proxy' flag.
const NUM_WORKERS = parseInt(process.argv[2] || '15', 10);
const DISPENSARY_NAME = process.argv[3] || 'Deeply Rooted';
const USE_PROXIES = process.argv[4] !== 'no-proxy';
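// Usage sketch (the script path is an assumption; argument order and defaults
// come from the three constants above):
//   node parallel-scrape.js                              -> 15 workers, 'Deeply Rooted', proxies on
//   node parallel-scrape.js 10 "Some Store"              -> 10 workers, ILIKE match on "Some Store"
//   node parallel-scrape.js 15 "Deeply Rooted" no-proxy  -> direct connections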
async function getStore(name) {
    const result = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url FROM stores WHERE name ILIKE $1 LIMIT 1`, [`%${name}%`]);
    return result.rows[0] || null;
}
async function getCategories(storeId) {
    const result = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url as url FROM categories WHERE store_id = $1 AND scrape_enabled = true`, [storeId]);
    return result.rows;
}
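// Row shapes implied by the SELECTs above (field list from the queries, values illustrative):
//   store:    { id, name, slug, dutchie_url }
//   category: { id, name, slug, url }   // url is dutchie_url aliased in the query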
async function scrapeWithProxy(workerId, store, category) {
    let browser = null;
    let proxyId = null;
    try {
        // Get a proxy (if enabled)
        let proxy = null;
        if (USE_PROXIES) {
            proxy = await (0, proxy_1.getActiveProxy)();
            if (proxy) {
                proxyId = proxy.id;
                console.log(`[Worker ${workerId}] Using proxy: ${proxy.protocol}://${proxy.host}:${proxy.port}`);
            }
            else {
                console.log(`[Worker ${workerId}] No proxy available, using direct connection`);
            }
        }
        else {
            console.log(`[Worker ${workerId}] Direct connection (proxies disabled)`);
        }
        // Build browser args
        const args = [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-accelerated-2d-canvas',
            '--disable-gpu',
            '--window-size=1920,1080',
        ];
        if (proxy) {
            if (proxy.protocol === 'socks5' || proxy.protocol === 'socks') {
                args.push(`--proxy-server=socks5://${proxy.host}:${proxy.port}`);
            }
            else {
                args.push(`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`);
            }
        }
        browser = await puppeteer_extra_1.default.launch({
            headless: true,
            args,
            executablePath: process.env.PUPPETEER_EXECUTABLE_PATH,
        });
        const page = await browser.newPage();
        await page.setUserAgent(FIREFOX_USER_AGENT);
        await page.setViewport({ width: 1920, height: 1080 });
        // Handle proxy auth if needed. page.authenticate() answers HTTP(S) proxy
        // challenges; Chromium cannot authenticate to SOCKS proxies, so credentialed
        // SOCKS entries would not work here.
        if (proxy?.username && proxy?.password) {
            await page.authenticate({
                username: proxy.username,
                password: proxy.password,
            });
        }
        console.log(`[Worker ${workerId}] Scraping category: ${category.name} (${category.url})`);
        // Navigate to the category page
        const response = await page.goto(category.url, {
            waitUntil: 'networkidle2',
            timeout: 60000,
        });
        if (!response || !response.ok()) {
            throw new Error(`Failed to load page: ${response?.status()}`);
        }
        // Wait for products to load; a timeout here is swallowed so an empty
        // category logs a message instead of failing the whole scrape
        await page.waitForSelector('[data-testid="product-list-item"], a[href*="/product/"]', {
            timeout: 30000,
        }).catch(() => {
            console.log(`[Worker ${workerId}] No products found on page`);
        });
        // Count products (the evaluate returns a number, not product data)
        const products = await page.evaluate(() => {
            // Try data-testid first, then fall back to product links
            const listItems = document.querySelectorAll('[data-testid="product-list-item"]');
            if (listItems.length > 0)
                return listItems.length;
            return document.querySelectorAll('a[href*="/product/"]').length;
        });
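        // A fuller extractor would map the matched nodes instead of counting
        // them. Sketch only -- the per-card markup is an assumption here:
        //   const names = await page.evaluate(() =>
        //       Array.from(document.querySelectorAll('[data-testid="product-list-item"]'))
        //           .map(el => el.textContent?.trim() ?? ''));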
        console.log(`[Worker ${workerId}] Found ${products} products in ${category.name}`);
        await browser.close();
        return { success: true, products };
    }
    catch (error) {
        console.error(`[Worker ${workerId}] Error:`, error.message);
        // Check for bot detection
        if (proxyId && (0, proxy_1.isBotDetectionError)(error.message)) {
            (0, proxy_1.putProxyInTimeout)(proxyId, error.message);
        }
        if (browser) {
            await browser.close().catch(() => { });
        }
        return { success: false, products: 0, error: error.message };
    }
}
async function worker(workerId, store, categories, categoryIndex) {
    // Claim the next category off the shared counter until none remain. The
    // check-and-increment is synchronous, so concurrent workers on Node's
    // single event loop cannot double-claim an index.
    while (categoryIndex.current < categories.length) {
        const idx = categoryIndex.current++;
        const category = categories[idx];
        if (!category)
            break;
        console.log(`[Worker ${workerId}] Starting category ${idx + 1}/${categories.length}: ${category.name}`);
        const result = await scrapeWithProxy(workerId, store, category);
        if (result.success) {
            console.log(`[Worker ${workerId}] Completed ${category.name}: ${result.products} products`);
        }
        else {
            console.log(`[Worker ${workerId}] Failed ${category.name}: ${result.error}`);
        }
        // Small delay between requests
        await new Promise(resolve => setTimeout(resolve, 2000 + Math.random() * 3000));
    }
    console.log(`[Worker ${workerId}] Finished all assigned work`);
}
async function main() {
    console.log(`\n${'='.repeat(60)}`);
    console.log(`Parallel Scraper - ${NUM_WORKERS} workers`);
    console.log(`Target: ${DISPENSARY_NAME}`);
    console.log(`User Agent: Firefox`);
    console.log(`Proxies: ${USE_PROXIES ? 'Enabled' : 'Disabled'}`);
    console.log(`${'='.repeat(60)}\n`);
    // Find the store
    const store = await getStore(DISPENSARY_NAME);
    if (!store) {
        console.error(`Store not found: ${DISPENSARY_NAME}`);
        process.exit(1);
    }
    console.log(`Found store: ${store.name} (ID: ${store.id})`);
    // Get categories
    const categories = await getCategories(store.id);
    if (categories.length === 0) {
        console.error('No categories found for this store');
        process.exit(1);
    }
    console.log(`Found ${categories.length} categories to scrape`);
    console.log(`Categories: ${categories.map(c => c.name).join(', ')}\n`);
    // Check proxies
    const proxyResult = await migrate_1.pool.query('SELECT COUNT(*) as total, COUNT(*) FILTER (WHERE active = true) as active FROM proxies');
    console.log(`Proxies: ${proxyResult.rows[0].active} active / ${proxyResult.rows[0].total} total\n`);
    // Shared index for work distribution
    const categoryIndex = { current: 0 };
    // For a store with few categories, we'll run multiple passes:
    // expand the work by duplicating categories for parallel workers
    const expandedCategories = [];
    const passes = Math.ceil(NUM_WORKERS / Math.max(categories.length, 1));
    for (let i = 0; i < passes; i++) {
        expandedCategories.push(...categories);
    }
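    // Worked example under the defaults: 15 workers over 4 enabled categories
    // (an illustrative count; the real one comes from the query above)
    // -> ceil(15/4) = 4 passes -> 16 queued scrapes, so every worker can claim one.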
    console.log(`Running ${NUM_WORKERS} workers across ${expandedCategories.length} category scrapes\n`);
    // Start workers
    const workers = [];
    for (let i = 0; i < NUM_WORKERS; i++) {
        workers.push(worker(i + 1, store, expandedCategories, categoryIndex));
        // Stagger worker starts
        await new Promise(resolve => setTimeout(resolve, 500));
    }
    // Wait for all workers
    await Promise.all(workers);
    console.log(`\n${'='.repeat(60)}`);
    console.log('All workers completed!');
    console.log(`${'='.repeat(60)}\n`);
    await migrate_1.pool.end();
}
main().catch(console.error);