cannaiq/backend/dist/routes/parallel-scrape.js

"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const migrate_1 = require("../db/migrate");
const proxy_1 = require("../services/proxy");
const middleware_1 = require("../auth/middleware");
const router = (0, express_1.Router)();
router.use(middleware_1.authMiddleware);
const FIREFOX_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0';
// In-memory job tracking
const activeJobs = new Map();
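// Note: this Map is process-local; a server restart discards all job records,
// after which status lookups for older jobIds return 404.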
// Get job status
router.get('/status/:jobId', (req, res) => {
    const job = activeJobs.get(req.params.jobId);
    if (!job) {
        return res.status(404).json({ error: 'Job not found' });
    }
    res.json(job);
});
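// Example (the mount point below is illustrative; the router is attached elsewhere,
// and authMiddleware above must also accept the request):
//   GET /api/parallel-scrape/status/scrape-1733276705000  ->  200 with the job, or 404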
// List active jobs
router.get('/jobs', (req, res) => {
    const jobs = Array.from(activeJobs.values());
    res.json({ jobs });
});
// Start parallel scrape
router.post('/start', async (req, res) => {
    const { storeName = 'Deeply Rooted', workers = 15, useProxies = true } = req.body;
    try {
        // Find the store
        const storeResult = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url FROM stores WHERE name ILIKE $1 LIMIT 1`, [`%${storeName}%`]);
        if (storeResult.rows.length === 0) {
            return res.status(404).json({ error: `Store not found: ${storeName}` });
        }
        const store = storeResult.rows[0];
        // Get categories
        const categoriesResult = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url as url FROM categories WHERE store_id = $1 AND scrape_enabled = true`, [store.id]);
        if (categoriesResult.rows.length === 0) {
            return res.status(404).json({ error: 'No categories found for this store' });
        }
        const categories = categoriesResult.rows;
        // Create job
        const jobId = `scrape-${Date.now()}`;
        const job = {
            id: jobId,
            storeName: store.name,
            status: 'running',
            workers,
            startedAt: new Date(),
            results: []
        };
        activeJobs.set(jobId, job);
        // Start scraping in background
        runParallelScrape(job, store, categories, workers, useProxies).catch(err => {
            console.error('Parallel scrape error:', err);
            job.status = 'failed';
        });
        res.json({
            message: 'Parallel scrape started',
            jobId,
            store: store.name,
            categories: categories.length,
            workers
        });
    }
    catch (error) {
        console.error('Failed to start parallel scrape:', error);
        res.status(500).json({ error: error.message });
    }
});
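// Example request (illustrative mount point; every body field is optional and
// falls back to the defaults destructured above):
//   POST /api/parallel-scrape/start
//   { "storeName": "Deeply Rooted", "workers": 10, "useProxies": false }
// The response includes the jobId to poll via GET /status/:jobId.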
async function runParallelScrape(job, store, categories, numWorkers, useProxies) {
    const puppeteer = require('puppeteer-extra');
    const StealthPlugin = require('puppeteer-extra-plugin-stealth');
    puppeteer.use(StealthPlugin());
    // Expand categories for multiple passes
    const expandedCategories = [];
    const passes = Math.ceil(numWorkers / Math.max(categories.length, 1));
    for (let i = 0; i < passes; i++) {
        expandedCategories.push(...categories);
    }
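    // e.g. the default 15 workers over, say, 6 enabled categories gives ceil(15/6) = 3
    // passes, so the queue holds 18 entries and each category is attempted 3 times.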
    const categoryIndex = { current: 0 };
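    // Workers pull the next queue entry from this shared cursor instead of taking
    // fixed slices; Node's single-threaded event loop makes the post-increment safe.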
    const worker = async (workerId) => {
        while (categoryIndex.current < expandedCategories.length) {
            const idx = categoryIndex.current++;
            const category = expandedCategories[idx];
            if (!category)
                break;
            const result = await scrapeCategory(puppeteer, workerId, category, useProxies);
            job.results.push({
                category: category.name,
                success: result.success,
                products: result.products,
                error: result.error
            });
            // Delay between requests
            await new Promise(resolve => setTimeout(resolve, 2000 + Math.random() * 3000));
        }
    };
    // Start workers with staggered starts
    const workers = [];
    for (let i = 0; i < numWorkers; i++) {
        workers.push(worker(i + 1));
        await new Promise(resolve => setTimeout(resolve, 500));
    }
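    // The 500ms offset above spreads out Chromium launches; Promise.all then waits
    // for every worker to drain the shared queue.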
    await Promise.all(workers);
    job.status = 'completed';
    job.completedAt = new Date();
    // Clean up job after 1 hour
    setTimeout(() => activeJobs.delete(job.id), 60 * 60 * 1000);
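    // Until then the finished job, including its per-category results, stays
    // readable via GET /status/:jobId.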
}
async function scrapeCategory(puppeteer, workerId, category, useProxies) {
    let browser = null;
    let proxyId = null;
    try {
        let proxy = null;
        if (useProxies) {
            proxy = await (0, proxy_1.getActiveProxy)();
        }
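        // Flags commonly required for headless Chromium in containers: sandbox off,
        // /dev/shm workaround, GPU and canvas acceleration disabled.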
        const args = [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-accelerated-2d-canvas',
            '--disable-gpu',
            '--window-size=1920,1080',
        ];
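        // Route the browser through the assigned proxy; --proxy-server expects a
        // scheme-prefixed address, with socks/socks5 normalized to socks5://.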
        if (proxy) {
            proxyId = proxy.id;
            if (proxy.protocol === 'socks5' || proxy.protocol === 'socks') {
                args.push(`--proxy-server=socks5://${proxy.host}:${proxy.port}`);
            }
            else {
                args.push(`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`);
            }
        }
        browser = await puppeteer.launch({
            headless: 'new',
            args,
            executablePath: process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium',
        });
        const page = await browser.newPage();
        await page.setUserAgent(FIREFOX_USER_AGENT);
        await page.setViewport({ width: 1920, height: 1080 });
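        // page.authenticate() answers the proxy's HTTP 407 challenge for credentialed
        // HTTP(S) proxies; Chromium does not support credentials for SOCKS proxies
        // passed via --proxy-server, so those only work here without auth.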
        if (proxy?.username && proxy?.password) {
            await page.authenticate({
                username: proxy.username,
                password: proxy.password,
            });
        }
        console.log(`[Worker ${workerId}] Scraping: ${category.name} (${category.url})`);
        const response = await page.goto(category.url, {
            waitUntil: 'networkidle2',
            timeout: 60000,
        });
        if (!response || !response.ok()) {
            throw new Error(`Failed to load page: ${response?.status()}`);
        }
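        // Wait for product cards to render; the timeout is deliberately swallowed so
        // the count below still runs (and may legitimately report 0 products).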
        await page.waitForSelector('[data-testid="product-list-item"], a[href*="/product/"]', {
            timeout: 30000,
        }).catch(() => { });
        const products = await page.evaluate(() => {
            // Try data-testid first, then fall back to product links
            const listItems = document.querySelectorAll('[data-testid="product-list-item"]');
            if (listItems.length > 0)
                return listItems.length;
            return document.querySelectorAll('a[href*="/product/"]').length;
        });
        console.log(`[Worker ${workerId}] Found ${products} products in ${category.name}`);
        await browser.close();
        return { success: true, products };
    }
    catch (error) {
        console.error(`[Worker ${workerId}] Error:`, error.message);
        if (proxyId && (0, proxy_1.isBotDetectionError)(error.message)) {
            (0, proxy_1.putProxyInTimeout)(proxyId, error.message);
        }
        if (browser) {
            await browser.close().catch(() => { });
        }
        return { success: false, products: 0, error: error.message };
    }
}
exports.default = router;