The job_run_logs table tracks scheduled job orchestration, not individual worker jobs. Worker info (worker_id, worker_hostname) belongs on dispensary_crawl_jobs, not job_run_logs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
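A minimal sketch of what that split implies for the schema, reusing the pg pool already exported from ../db/migrate; the helper name, the TEXT column types, and the IF NOT EXISTS guards are illustrative assumptions, not part of this change:

```js
// Hypothetical sketch only: worker attribution belongs on dispensary_crawl_jobs,
// not job_run_logs. Column types and the helper name are assumed.
const { pool } = require('../db/migrate');

async function addWorkerColumnsToCrawlJobs() {
    await pool.query(`
        ALTER TABLE dispensary_crawl_jobs
            ADD COLUMN IF NOT EXISTS worker_id TEXT,
            ADD COLUMN IF NOT EXISTS worker_hostname TEXT
    `);
}
```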
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const migrate_1 = require("../db/migrate");
const proxy_1 = require("../services/proxy");
const middleware_1 = require("../auth/middleware");
const router = (0, express_1.Router)();
router.use(middleware_1.authMiddleware);
const FIREFOX_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0';
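// Note: this Firefox UA string is applied to each page even though the launched browser is Chromium.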
// In-memory job tracking
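// Jobs are held only in this process's memory: they are lost on restart and removed one hour after completion.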
const activeJobs = new Map();
// Get job status
router.get('/status/:jobId', (req, res) => {
    const job = activeJobs.get(req.params.jobId);
    if (!job) {
        return res.status(404).json({ error: 'Job not found' });
    }
    res.json(job);
});
// List active jobs
router.get('/jobs', (req, res) => {
    const jobs = Array.from(activeJobs.values());
    res.json({ jobs });
});
// Start parallel scrape
router.post('/start', async (req, res) => {
    const { storeName = 'Deeply Rooted', workers = 15, useProxies = true } = req.body;
    try {
        // Find the store
        const storeResult = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url FROM stores WHERE name ILIKE $1 LIMIT 1`, [`%${storeName}%`]);
        if (storeResult.rows.length === 0) {
            return res.status(404).json({ error: `Store not found: ${storeName}` });
        }
        const store = storeResult.rows[0];
        // Get categories
        const categoriesResult = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url as url FROM categories WHERE store_id = $1 AND scrape_enabled = true`, [store.id]);
        if (categoriesResult.rows.length === 0) {
            return res.status(404).json({ error: 'No categories found for this store' });
        }
        const categories = categoriesResult.rows;
        // Create job
        const jobId = `scrape-${Date.now()}`;
        const job = {
            id: jobId,
            storeName: store.name,
            status: 'running',
            workers,
            startedAt: new Date(),
            results: []
        };
        activeJobs.set(jobId, job);
        // Start scraping in background
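        // Fire-and-forget: the response below returns immediately; progress is polled via GET /status/:jobId.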
        runParallelScrape(job, store, categories, workers, useProxies).catch(err => {
            console.error('Parallel scrape error:', err);
            job.status = 'failed';
        });
        res.json({
            message: 'Parallel scrape started',
            jobId,
            store: store.name,
            categories: categories.length,
            workers
        });
    }
    catch (error) {
        console.error('Failed to start parallel scrape:', error);
        res.status(500).json({ error: error.message });
    }
});
async function runParallelScrape(job, store, categories, numWorkers, useProxies) {
    const puppeteer = require('puppeteer-extra');
    const StealthPlugin = require('puppeteer-extra-plugin-stealth');
    puppeteer.use(StealthPlugin());
    // Expand categories for multiple passes
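    // Each category is queued `passes` times so every requested worker has work;
    // e.g. 15 workers across 6 categories gives 3 passes (18 category slots).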
    const expandedCategories = [];
    const passes = Math.ceil(numWorkers / Math.max(categories.length, 1));
    for (let i = 0; i < passes; i++) {
        expandedCategories.push(...categories);
    }
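    // Shared cursor into expandedCategories; each worker pulls the next index.
    // Safe without locking because the increment runs on Node's single-threaded event loop.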
    const categoryIndex = { current: 0 };
    const worker = async (workerId) => {
        while (categoryIndex.current < expandedCategories.length) {
            const idx = categoryIndex.current++;
            const category = expandedCategories[idx];
            if (!category)
                break;
            const result = await scrapeCategory(puppeteer, workerId, category, useProxies);
            job.results.push({
                category: category.name,
                success: result.success,
                products: result.products,
                error: result.error
            });
            // Randomized 2-5 s delay between category fetches
            await new Promise(resolve => setTimeout(resolve, 2000 + Math.random() * 3000));
        }
    };
    // Start workers with staggered starts
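    // A 500 ms gap between launches avoids starting all Chromium instances at once.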
    const workers = [];
    for (let i = 0; i < numWorkers; i++) {
        workers.push(worker(i + 1));
        await new Promise(resolve => setTimeout(resolve, 500));
    }
    await Promise.all(workers);
    job.status = 'completed';
    job.completedAt = new Date();
    // Clean up job after 1 hour
    setTimeout(() => activeJobs.delete(job.id), 60 * 60 * 1000);
}
async function scrapeCategory(puppeteer, workerId, category, useProxies) {
    let browser = null;
    let proxyId = null;
    try {
        let proxy = null;
        if (useProxies) {
            proxy = await (0, proxy_1.getActiveProxy)();
        }
        const args = [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-accelerated-2d-canvas',
            '--disable-gpu',
            '--window-size=1920,1080',
        ];
        if (proxy) {
            proxyId = proxy.id;
            if (proxy.protocol === 'socks5' || proxy.protocol === 'socks') {
                args.push(`--proxy-server=socks5://${proxy.host}:${proxy.port}`);
            }
            else {
                args.push(`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`);
            }
        }
        browser = await puppeteer.launch({
            headless: 'new',
            args,
            executablePath: process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium',
        });
        const page = await browser.newPage();
        await page.setUserAgent(FIREFOX_USER_AGENT);
        await page.setViewport({ width: 1920, height: 1080 });
        if (proxy?.username && proxy?.password) {
            await page.authenticate({
                username: proxy.username,
                password: proxy.password,
            });
        }
        console.log(`[Worker ${workerId}] Scraping: ${category.name} (${category.url})`);
        const response = await page.goto(category.url, {
            waitUntil: 'networkidle2',
            timeout: 60000,
        });
        if (!response || !response.ok()) {
            throw new Error(`Failed to load page: ${response?.status()}`);
        }
        await page.waitForSelector('[data-testid="product-list-item"], a[href*="/product/"]', {
            timeout: 30000,
        }).catch(() => { });
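        // A missing selector is not fatal: the evaluate below falls back to counting product links.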
        const products = await page.evaluate(() => {
            // Try data-testid first, then fall back to product links
            const listItems = document.querySelectorAll('[data-testid="product-list-item"]');
            if (listItems.length > 0)
                return listItems.length;
            return document.querySelectorAll('a[href*="/product/"]').length;
        });
        console.log(`[Worker ${workerId}] Found ${products} products in ${category.name}`);
        await browser.close();
        return { success: true, products };
    }
    catch (error) {
        console.error(`[Worker ${workerId}] Error:`, error.message);
        if (proxyId && (0, proxy_1.isBotDetectionError)(error.message)) {
            (0, proxy_1.putProxyInTimeout)(proxyId, error.message);
        }
        if (browser) {
            await browser.close().catch(() => { });
        }
        return { success: false, products: 0, error: error.message };
    }
}
exports.default = router;