cannaiq/backend/dist/scripts/parallel-scrape.js
Kelly 66e07b2009 fix(monitor): remove non-existent worker columns from job_run_logs query
The job_run_logs table tracks scheduled job orchestration, not individual
worker jobs. Worker info (worker_id, worker_hostname) belongs on
dispensary_crawl_jobs, not job_run_logs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-03 18:45:05 -07:00
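
A minimal sketch of the split the commit describes, for context: the table and column names below come from the message itself, while the query shapes and any other columns are illustrative assumptions, not the actual monitor code.

// job_run_logs tracks scheduled-job orchestration, so no worker columns are queried here.
const runs = await pool.query(`SELECT job_name, started_at FROM job_run_logs`);
// Per-worker detail (worker_id, worker_hostname) lives on dispensary_crawl_jobs instead.
const crawls = await pool.query(`SELECT worker_id, worker_hostname FROM dispensary_crawl_jobs`);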

182 lines · 7.8 KiB · JavaScript

"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const migrate_1 = require("../db/migrate");
const proxy_1 = require("../services/proxy");
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
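// Register the stealth plugin so headless Chromium is harder to fingerprint.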
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
const FIREFOX_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0';
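// CLI: node parallel-scrape.js [numWorkers] [dispensaryName] [proxyFlag]
// Defaults: 15 workers, 'Deeply Rooted'; pass 'no-proxy' as the third argument to disable proxies.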
const NUM_WORKERS = parseInt(process.argv[2] || '15', 10);
const DISPENSARY_NAME = process.argv[3] || 'Deeply Rooted';
const USE_PROXIES = process.argv[4] !== 'no-proxy';
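// Look up a single store by case-insensitive partial name match.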
async function getStore(name) {
const result = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url FROM stores WHERE name ILIKE $1 LIMIT 1`, [`%${name}%`]);
return result.rows[0] || null;
}
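// Fetch all scrape-enabled categories for the store.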
async function getCategories(storeId) {
const result = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url as url FROM categories WHERE store_id = $1 AND scrape_enabled = true`, [storeId]);
return result.rows;
}
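// Launch a stealth browser (optionally through a proxy), load one category
// page, and count the product cards found there.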
async function scrapeWithProxy(workerId, store, category) {
let browser = null;
let proxyId = null;
try {
// Get a proxy (if enabled)
let proxy = null;
if (USE_PROXIES) {
proxy = await (0, proxy_1.getActiveProxy)();
if (proxy) {
proxyId = proxy.id;
console.log(`[Worker ${workerId}] Using proxy: ${proxy.protocol}://${proxy.host}:${proxy.port}`);
}
else {
console.log(`[Worker ${workerId}] No proxy available, using direct connection`);
}
}
else {
console.log(`[Worker ${workerId}] Direct connection (proxies disabled)`);
}
// Build browser args
const args = [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-accelerated-2d-canvas',
'--disable-gpu',
'--window-size=1920,1080',
];
if (proxy) {
if (proxy.protocol === 'socks5' || proxy.protocol === 'socks') {
args.push(`--proxy-server=socks5://${proxy.host}:${proxy.port}`);
}
else {
args.push(`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`);
}
}
browser = await puppeteer_extra_1.default.launch({
headless: true,
args,
executablePath: process.env.PUPPETEER_EXECUTABLE_PATH,
});
const page = await browser.newPage();
await page.setUserAgent(FIREFOX_USER_AGENT);
await page.setViewport({ width: 1920, height: 1080 });
// Handle proxy auth if needed
if (proxy?.username && proxy?.password) {
await page.authenticate({
username: proxy.username,
password: proxy.password,
});
}
console.log(`[Worker ${workerId}] Scraping category: ${category.name} (${category.url})`);
// Navigate to the category page
const response = await page.goto(category.url, {
waitUntil: 'networkidle2',
timeout: 60000,
});
if (!response || !response.ok()) {
throw new Error(`Failed to load page: ${response?.status()}`);
}
// Wait for products to load
await page.waitForSelector('[data-testid="product-list-item"], a[href*="/product/"]', {
timeout: 30000,
}).catch(() => {
console.log(`[Worker ${workerId}] No products found on page`);
});
// Count products on the page
const products = await page.evaluate(() => {
// Try data-testid first, then fall back to product links
const listItems = document.querySelectorAll('[data-testid="product-list-item"]');
if (listItems.length > 0)
return listItems.length;
return document.querySelectorAll('a[href*="/product/"]').length;
});
console.log(`[Worker ${workerId}] Found ${products} products in ${category.name}`);
await browser.close();
return { success: true, products };
}
catch (error) {
console.error(`[Worker ${workerId}] Error:`, error.message);
// Check for bot detection
if (proxyId && (0, proxy_1.isBotDetectionError)(error.message)) {
(0, proxy_1.putProxyInTimeout)(proxyId, error.message);
}
if (browser) {
await browser.close().catch(() => { });
}
return { success: false, products: 0, error: error.message };
}
}
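// Worker loop: each worker pulls the next category from a shared index until
// the queue is exhausted, sleeping 2-5 seconds between requests.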
async function worker(workerId, store, categories, categoryIndex) {
while (categoryIndex.current < categories.length) {
const idx = categoryIndex.current++;
const category = categories[idx];
if (!category)
break;
console.log(`[Worker ${workerId}] Starting category ${idx + 1}/${categories.length}: ${category.name}`);
const result = await scrapeWithProxy(workerId, store, category);
if (result.success) {
console.log(`[Worker ${workerId}] Completed ${category.name}: ${result.products} products`);
}
else {
console.log(`[Worker ${workerId}] Failed ${category.name}: ${result.error}`);
}
// Small delay between requests
await new Promise(resolve => setTimeout(resolve, 2000 + Math.random() * 3000));
}
console.log(`[Worker ${workerId}] Finished all assigned work`);
}
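// Entry point: resolve the store, gather its categories, and fan out the workers.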
async function main() {
console.log(`\n${'='.repeat(60)}`);
console.log(`Parallel Scraper - ${NUM_WORKERS} workers`);
console.log(`Target: ${DISPENSARY_NAME}`);
console.log(`User Agent: Firefox`);
console.log(`Proxies: ${USE_PROXIES ? 'Enabled' : 'Disabled'}`);
console.log(`${'='.repeat(60)}\n`);
// Find the store
const store = await getStore(DISPENSARY_NAME);
if (!store) {
console.error(`Store not found: ${DISPENSARY_NAME}`);
process.exit(1);
}
console.log(`Found store: ${store.name} (ID: ${store.id})`);
// Get categories
const categories = await getCategories(store.id);
if (categories.length === 0) {
console.error('No categories found for this store');
process.exit(1);
}
console.log(`Found ${categories.length} categories to scrape`);
console.log(`Categories: ${categories.map(c => c.name).join(', ')}\n`);
// Check proxies
const proxyResult = await migrate_1.pool.query('SELECT COUNT(*) as total, COUNT(*) FILTER (WHERE active = true) as active FROM proxies');
console.log(`Proxies: ${proxyResult.rows[0].active} active / ${proxyResult.rows[0].total} total\n`);
// Shared index for work distribution
const categoryIndex = { current: 0 };
// For a store with few categories, we'll run multiple passes
// Expand the work by duplicating categories for parallel workers
const expandedCategories = [];
const passes = Math.ceil(NUM_WORKERS / Math.max(categories.length, 1));
for (let i = 0; i < passes; i++) {
expandedCategories.push(...categories);
}
console.log(`Running ${NUM_WORKERS} workers across ${expandedCategories.length} category scrapes\n`);
// Start workers
const workers = [];
for (let i = 0; i < NUM_WORKERS; i++) {
workers.push(worker(i + 1, store, expandedCategories, categoryIndex));
// Stagger worker starts
await new Promise(resolve => setTimeout(resolve, 500));
}
// Wait for all workers
await Promise.all(workers);
console.log(`\n${'='.repeat(60)}`);
console.log('All workers completed!');
console.log(`${'='.repeat(60)}\n`);
await migrate_1.pool.end();
}
main().catch((error) => { console.error(error); process.exit(1); });