fix(monitor): remove non-existent worker columns from job_run_logs query
The job_run_logs table tracks scheduled job orchestration, not individual worker jobs. Worker info (worker_id, worker_hostname) belongs on dispensary_crawl_jobs, not on job_run_logs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
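For context, a minimal sketch of the split this commit enforces, assuming a node-postgres pool; only worker_id, worker_hostname, and the two table names come from this commit, and all other column and function names are illustrative:

```js
// Hypothetical sketch only; column lists beyond worker_id/worker_hostname are assumptions.
const { Pool } = require('pg');
const pool = new Pool();

async function logJobRun(jobName) {
    // job_run_logs tracks orchestration only: no worker columns here.
    await pool.query(
        `INSERT INTO job_run_logs (job_name, status, started_at)
         VALUES ($1, 'running', NOW())`,
        [jobName]
    );
}

async function claimCrawlJob(crawlJobId, workerId, workerHostname) {
    // Worker attribution lives on dispensary_crawl_jobs.
    await pool.query(
        `UPDATE dispensary_crawl_jobs
         SET worker_id = $1, worker_hostname = $2
         WHERE id = $3`,
        [workerId, workerHostname, crawlJobId]
    );
}
```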
backend/dist/scripts/parallel-scrape.js (vendored, normal file, 181 lines added)
@@ -0,0 +1,181 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const migrate_1 = require("../db/migrate");
const proxy_1 = require("../services/proxy");
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
const FIREFOX_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0';
const NUM_WORKERS = parseInt(process.argv[2] || '15');
const DISPENSARY_NAME = process.argv[3] || 'Deeply Rooted';
const USE_PROXIES = process.argv[4] !== 'no-proxy';
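// Usage (inferred from the positional argv handling above):
//   node backend/dist/scripts/parallel-scrape.js [workers=15] [storeName="Deeply Rooted"] [no-proxy]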
async function getStore(name) {
    const result = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url FROM stores WHERE name ILIKE $1 LIMIT 1`, [`%${name}%`]);
    return result.rows[0] || null;
}
async function getCategories(storeId) {
    const result = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url as url FROM categories WHERE store_id = $1 AND scrape_enabled = true`, [storeId]);
    return result.rows;
}
async function scrapeWithProxy(workerId, store, category) {
    let browser = null;
    let proxyId = null;
    try {
        // Get a proxy (if enabled)
        let proxy = null;
        if (USE_PROXIES) {
            proxy = await (0, proxy_1.getActiveProxy)();
            if (proxy) {
                proxyId = proxy.id;
                console.log(`[Worker ${workerId}] Using proxy: ${proxy.protocol}://${proxy.host}:${proxy.port}`);
            }
            else {
                console.log(`[Worker ${workerId}] No proxy available, using direct connection`);
            }
        }
        else {
            console.log(`[Worker ${workerId}] Direct connection (proxies disabled)`);
        }
        // Build browser args
        const args = [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-accelerated-2d-canvas',
            '--disable-gpu',
            '--window-size=1920,1080',
        ];
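        // The sandbox and /dev/shm flags above are the usual settings for running
        // Chromium inside containers, where /dev/shm is often too small for page rendering.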
        if (proxy) {
            if (proxy.protocol === 'socks5' || proxy.protocol === 'socks') {
                args.push(`--proxy-server=socks5://${proxy.host}:${proxy.port}`);
            }
            else {
                args.push(`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`);
            }
        }
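        // Chromium's --proxy-server flag carries no credentials, so authenticated
        // proxies are handled separately via page.authenticate() after launch.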
        browser = await puppeteer_extra_1.default.launch({
            headless: true,
            args,
            executablePath: process.env.PUPPETEER_EXECUTABLE_PATH,
        });
        const page = await browser.newPage();
        await page.setUserAgent(FIREFOX_USER_AGENT);
        await page.setViewport({ width: 1920, height: 1080 });
        // Handle proxy auth if needed
        if (proxy?.username && proxy?.password) {
            await page.authenticate({
                username: proxy.username,
                password: proxy.password,
            });
        }
        console.log(`[Worker ${workerId}] Scraping category: ${category.name} (${category.url})`);
        // Navigate to the category page
        const response = await page.goto(category.url, {
            waitUntil: 'networkidle2',
            timeout: 60000,
        });
        if (!response || !response.ok()) {
            throw new Error(`Failed to load page: ${response?.status()}`);
        }
        // Wait for products to load
        await page.waitForSelector('[data-testid="product-list-item"], a[href*="/product/"]', {
            timeout: 30000,
        }).catch(() => {
            console.log(`[Worker ${workerId}] No products found on page`);
        });
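        // The .catch() above swallows the selector timeout, so a page with no
        // visible products falls through to the count below instead of failing the worker.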
        // Extract products
        const products = await page.evaluate(() => {
            // Try data-testid first, then fall back to product links
            const listItems = document.querySelectorAll('[data-testid="product-list-item"]');
            if (listItems.length > 0)
                return listItems.length;
            return document.querySelectorAll('a[href*="/product/"]').length;
        });
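        // page.evaluate() runs in the browser context and only the serializable
        // count comes back, so this script measures reachability, not product data.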
        console.log(`[Worker ${workerId}] Found ${products} products in ${category.name}`);
        await browser.close();
        return { success: true, products };
    }
    catch (error) {
        console.error(`[Worker ${workerId}] Error:`, error.message);
        // Check for bot detection
        if (proxyId && (0, proxy_1.isBotDetectionError)(error.message)) {
            (0, proxy_1.putProxyInTimeout)(proxyId, error.message);
        }
        if (browser) {
            await browser.close().catch(() => { });
        }
        return { success: false, products: 0, error: error.message };
    }
}
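// Workers pull from a shared cursor object rather than a pre-partitioned list.
// The read-and-increment below is safe without locks because Node is
// single-threaded and no await sits between the bounds check and the increment.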
async function worker(workerId, store, categories, categoryIndex) {
    while (categoryIndex.current < categories.length) {
        const idx = categoryIndex.current++;
        const category = categories[idx];
        if (!category)
            break;
        console.log(`[Worker ${workerId}] Starting category ${idx + 1}/${categories.length}: ${category.name}`);
        const result = await scrapeWithProxy(workerId, store, category);
        if (result.success) {
            console.log(`[Worker ${workerId}] Completed ${category.name}: ${result.products} products`);
        }
        else {
            console.log(`[Worker ${workerId}] Failed ${category.name}: ${result.error}`);
        }
        // Small delay between requests
        await new Promise(resolve => setTimeout(resolve, 2000 + Math.random() * 3000));
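        // (2s base plus up to 3s of jitter: 2-5s between categories per worker.)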
    }
    console.log(`[Worker ${workerId}] Finished all assigned work`);
}
async function main() {
    console.log(`\n${'='.repeat(60)}`);
    console.log(`Parallel Scraper - ${NUM_WORKERS} workers`);
    console.log(`Target: ${DISPENSARY_NAME}`);
    console.log(`User Agent: Firefox`);
    console.log(`Proxies: ${USE_PROXIES ? 'Enabled' : 'Disabled'}`);
    console.log(`${'='.repeat(60)}\n`);
    // Find the store
    const store = await getStore(DISPENSARY_NAME);
    if (!store) {
        console.error(`Store not found: ${DISPENSARY_NAME}`);
        process.exit(1);
    }
    console.log(`Found store: ${store.name} (ID: ${store.id})`);
    // Get categories
    const categories = await getCategories(store.id);
    if (categories.length === 0) {
        console.error('No categories found for this store');
        process.exit(1);
    }
    console.log(`Found ${categories.length} categories to scrape`);
    console.log(`Categories: ${categories.map(c => c.name).join(', ')}\n`);
    // Check proxies
    const proxyResult = await migrate_1.pool.query('SELECT COUNT(*) as total, COUNT(*) FILTER (WHERE active = true) as active FROM proxies');
    console.log(`Proxies: ${proxyResult.rows[0].active} active / ${proxyResult.rows[0].total} total\n`);
    // Shared index for work distribution
    const categoryIndex = { current: 0 };
    // For a store with few categories, we'll run multiple passes
    // Expand the work by duplicating categories for parallel workers
    const expandedCategories = [];
    const passes = Math.ceil(NUM_WORKERS / Math.max(categories.length, 1));
    for (let i = 0; i < passes; i++) {
        expandedCategories.push(...categories);
    }
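    // Example: 15 workers over 5 categories gives passes = 3, i.e. 15 queued
    // category scrapes, so every worker has at least one item to pick up.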
    console.log(`Running ${NUM_WORKERS} workers across ${expandedCategories.length} category scrapes\n`);
    // Start workers
    const workers = [];
    for (let i = 0; i < NUM_WORKERS; i++) {
        workers.push(worker(i + 1, store, expandedCategories, categoryIndex));
        // Stagger worker starts
        await new Promise(resolve => setTimeout(resolve, 500));
    }
    // Wait for all workers
    await Promise.all(workers);
    console.log(`\n${'='.repeat(60)}`);
    console.log('All workers completed!');
    console.log(`${'='.repeat(60)}\n`);
    await migrate_1.pool.end();
}
main().catch(console.error);