fix(monitor): remove non-existent worker columns from job_run_logs query
The job_run_logs table tracks scheduled job orchestration, not individual worker jobs. Worker info (worker_id, worker_hostname) belongs on dispensary_crawl_jobs, not job_run_logs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
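For context, a rough sketch of the kind of query split this describes (not part of the diff below; only the table names and worker_id / worker_hostname come from the commit message — every other column name is hypothetical, since the actual monitor query is not shown here):

```js
// Hypothetical sketch of the corrected monitor queries.
const { pool } = require('../db/migrate');

async function getMonitorRows() {
    // job_run_logs: scheduled job orchestration only (no worker columns).
    const runs = await pool.query(
        `SELECT id, job_name, status, started_at FROM job_run_logs ORDER BY started_at DESC LIMIT 50`
    );
    // dispensary_crawl_jobs: per-worker crawl jobs, where worker_id / worker_hostname actually live.
    const workerJobs = await pool.query(
        `SELECT id, worker_id, worker_hostname, status FROM dispensary_crawl_jobs ORDER BY started_at DESC LIMIT 50`
    );
    return { runs: runs.rows, workerJobs: workerJobs.rows };
}
```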
backend/dist/routes/parallel-scrape.js (vendored, new file, 182 lines)
@@ -0,0 +1,182 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const migrate_1 = require("../db/migrate");
const proxy_1 = require("../services/proxy");
const middleware_1 = require("../auth/middleware");
const router = (0, express_1.Router)();
router.use(middleware_1.authMiddleware);
const FIREFOX_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0';
// In-memory job tracking
const activeJobs = new Map();
// Get job status
router.get('/status/:jobId', (req, res) => {
    const job = activeJobs.get(req.params.jobId);
    if (!job) {
        return res.status(404).json({ error: 'Job not found' });
    }
    res.json(job);
});
// List active jobs
router.get('/jobs', (req, res) => {
    const jobs = Array.from(activeJobs.values());
    res.json({ jobs });
});
// Start parallel scrape
router.post('/start', async (req, res) => {
    const { storeName = 'Deeply Rooted', workers = 15, useProxies = true } = req.body;
    try {
        // Find the store
        const storeResult = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url FROM stores WHERE name ILIKE $1 LIMIT 1`, [`%${storeName}%`]);
        if (storeResult.rows.length === 0) {
            return res.status(404).json({ error: `Store not found: ${storeName}` });
        }
        const store = storeResult.rows[0];
        // Get categories
        const categoriesResult = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url as url FROM categories WHERE store_id = $1 AND scrape_enabled = true`, [store.id]);
        if (categoriesResult.rows.length === 0) {
            return res.status(404).json({ error: 'No categories found for this store' });
        }
        const categories = categoriesResult.rows;
        // Create job
        const jobId = `scrape-${Date.now()}`;
        const job = {
            id: jobId,
            storeName: store.name,
            status: 'running',
            workers,
            startedAt: new Date(),
            results: []
        };
        activeJobs.set(jobId, job);
        // Start scraping in background
        runParallelScrape(job, store, categories, workers, useProxies).catch(err => {
            console.error('Parallel scrape error:', err);
            job.status = 'failed';
        });
        res.json({
            message: 'Parallel scrape started',
            jobId,
            store: store.name,
            categories: categories.length,
            workers
        });
    }
    catch (error) {
        console.error('Failed to start parallel scrape:', error);
        res.status(500).json({ error: error.message });
    }
});
async function runParallelScrape(job, store, categories, numWorkers, useProxies) {
    const puppeteer = require('puppeteer-extra');
    const StealthPlugin = require('puppeteer-extra-plugin-stealth');
    puppeteer.use(StealthPlugin());
    // Expand categories for multiple passes
    const expandedCategories = [];
    const passes = Math.ceil(numWorkers / Math.max(categories.length, 1));
    for (let i = 0; i < passes; i++) {
        expandedCategories.push(...categories);
    }
    const categoryIndex = { current: 0 };
    const worker = async (workerId) => {
        while (categoryIndex.current < expandedCategories.length) {
            const idx = categoryIndex.current++;
            const category = expandedCategories[idx];
            if (!category)
                break;
            const result = await scrapeCategory(puppeteer, workerId, category, useProxies);
            job.results.push({
                category: category.name,
                success: result.success,
                products: result.products,
                error: result.error
            });
            // Delay between requests
            await new Promise(resolve => setTimeout(resolve, 2000 + Math.random() * 3000));
        }
    };
    // Start workers with staggered starts
    const workers = [];
    for (let i = 0; i < numWorkers; i++) {
        workers.push(worker(i + 1));
        await new Promise(resolve => setTimeout(resolve, 500));
    }
    await Promise.all(workers);
    job.status = 'completed';
    job.completedAt = new Date();
    // Clean up job after 1 hour
    setTimeout(() => activeJobs.delete(job.id), 60 * 60 * 1000);
}
async function scrapeCategory(puppeteer, workerId, category, useProxies) {
    let browser = null;
    let proxyId = null;
    try {
        let proxy = null;
        if (useProxies) {
            proxy = await (0, proxy_1.getActiveProxy)();
        }
        const args = [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-accelerated-2d-canvas',
            '--disable-gpu',
            '--window-size=1920,1080',
        ];
        if (proxy) {
            proxyId = proxy.id;
            if (proxy.protocol === 'socks5' || proxy.protocol === 'socks') {
                args.push(`--proxy-server=socks5://${proxy.host}:${proxy.port}`);
            }
            else {
                args.push(`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`);
            }
        }
        browser = await puppeteer.launch({
            headless: 'new',
            args,
            executablePath: process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium',
        });
        const page = await browser.newPage();
        await page.setUserAgent(FIREFOX_USER_AGENT);
        await page.setViewport({ width: 1920, height: 1080 });
        if (proxy?.username && proxy?.password) {
            await page.authenticate({
                username: proxy.username,
                password: proxy.password,
            });
        }
        console.log(`[Worker ${workerId}] Scraping: ${category.name} (${category.url})`);
        const response = await page.goto(category.url, {
            waitUntil: 'networkidle2',
            timeout: 60000,
        });
        if (!response || !response.ok()) {
            throw new Error(`Failed to load page: ${response?.status()}`);
        }
        await page.waitForSelector('[data-testid="product-list-item"], a[href*="/product/"]', {
            timeout: 30000,
        }).catch(() => { });
        const products = await page.evaluate(() => {
            // Try data-testid first, then fall back to product links
            const listItems = document.querySelectorAll('[data-testid="product-list-item"]');
            if (listItems.length > 0)
                return listItems.length;
            return document.querySelectorAll('a[href*="/product/"]').length;
        });
        console.log(`[Worker ${workerId}] Found ${products} products in ${category.name}`);
        await browser.close();
        return { success: true, products };
    }
    catch (error) {
        console.error(`[Worker ${workerId}] Error:`, error.message);
        if (proxyId && (0, proxy_1.isBotDetectionError)(error.message)) {
            (0, proxy_1.putProxyInTimeout)(proxyId, error.message);
        }
        if (browser) {
            await browser.close().catch(() => { });
        }
        return { success: false, products: 0, error: error.message };
    }
}
exports.default = router;
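For reference, a minimal usage sketch of the routes added above. The /api/parallel-scrape mount point, the port, and the bearer-token header are assumptions; they depend on how this router and the auth middleware are wired up elsewhere in the backend:

```js
// Assumed base URL and auth header; adjust to the actual mount point and auth scheme.
const BASE = 'http://localhost:3000/api/parallel-scrape';
const headers = { 'Content-Type': 'application/json', Authorization: `Bearer ${process.env.API_TOKEN}` };

async function runAndWait() {
    // Kick off a run; body fields match the defaults accepted by POST /start above.
    const started = await fetch(`${BASE}/start`, {
        method: 'POST',
        headers,
        body: JSON.stringify({ storeName: 'Deeply Rooted', workers: 15, useProxies: true }),
    }).then(r => r.json());

    // Poll the in-memory job record until the background scrape completes or fails.
    let job;
    do {
        await new Promise(resolve => setTimeout(resolve, 5000));
        job = await fetch(`${BASE}/status/${started.jobId}`, { headers }).then(r => r.json());
    } while (job.status === 'running');
    return job;
}
```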