Add local product detail page with Dutchie comparison
- Add ProductDetail page for viewing products locally
- Add Dutchie and Details buttons to product cards in Products and StoreDetail pages
- Add Last Updated display showing data freshness
- Add parallel scrape scripts and routes
- Add K8s deployment configurations
- Add frontend Dockerfile with nginx

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
backend/src/scripts/parallel-scrape.ts (new file, 241 lines)
@@ -0,0 +1,241 @@
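// Parallel Dutchie category scraper.
//   argv[2]: number of workers (default 15)
//   argv[3]: store name to match (default 'Deeply Rooted')
//   argv[4]: pass 'no-proxy' to scrape over a direct connection
// Usage sketch (assuming ts-node; adjust to the project's actual runner):
//   ts-node backend/src/scripts/parallel-scrape.ts 15 "Deeply Rooted" no-proxy
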
import { pool } from '../db/migrate';
import { getActiveProxy, putProxyInTimeout, isBotDetectionError } from '../services/proxy';
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';

puppeteer.use(StealthPlugin());

const FIREFOX_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0';
const NUM_WORKERS = parseInt(process.argv[2] || '15', 10);
const DISPENSARY_NAME = process.argv[3] || 'Deeply Rooted';
const USE_PROXIES = process.argv[4] !== 'no-proxy';

interface Category {
  id: number;
  name: string;
  slug: string;
  url: string;
}

interface Store {
  id: number;
  name: string;
  slug: string;
  dutchie_url: string;
}

async function getStore(name: string): Promise<Store | null> {
  const result = await pool.query(
    `SELECT id, name, slug, dutchie_url FROM stores WHERE name ILIKE $1 LIMIT 1`,
    [`%${name}%`]
  );
  return result.rows[0] || null;
}

async function getCategories(storeId: number): Promise<Category[]> {
  const result = await pool.query(
    `SELECT id, name, slug, dutchie_url as url FROM categories WHERE store_id = $1 AND scrape_enabled = true`,
    [storeId]
  );
  return result.rows;
}

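// Scrape a single category page: launch a stealth browser (optionally behind
// a proxy), load the category URL, and count the product cards found.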
async function scrapeWithProxy(
  workerId: number,
  store: Store,
  category: Category
): Promise<{ success: boolean; products: number; error?: string }> {
  let browser = null;
  let proxyId: number | null = null;

  try {
    // Get a proxy (if enabled)
    let proxy = null;
    if (USE_PROXIES) {
      proxy = await getActiveProxy();
      if (proxy) {
        proxyId = proxy.id;
        console.log(`[Worker ${workerId}] Using proxy: ${proxy.protocol}://${proxy.host}:${proxy.port}`);
      } else {
        console.log(`[Worker ${workerId}] No proxy available, using direct connection`);
      }
    } else {
      console.log(`[Worker ${workerId}] Direct connection (proxies disabled)`);
    }

    // Build browser args
    const args = [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-accelerated-2d-canvas',
      '--disable-gpu',
      '--window-size=1920,1080',
    ];

    if (proxy) {
      if (proxy.protocol === 'socks5' || proxy.protocol === 'socks') {
        args.push(`--proxy-server=socks5://${proxy.host}:${proxy.port}`);
      } else {
        args.push(`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`);
      }
    }

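    // A fresh browser per category keeps proxy and session state isolated.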
    browser = await puppeteer.launch({
      headless: true,
      args,
      executablePath: process.env.PUPPETEER_EXECUTABLE_PATH,
    });

    const page = await browser.newPage();
    await page.setUserAgent(FIREFOX_USER_AGENT);
    await page.setViewport({ width: 1920, height: 1080 });

    // Handle proxy auth if needed
    if (proxy?.username && proxy?.password) {
      await page.authenticate({
        username: proxy.username,
        password: proxy.password,
      });
    }

    console.log(`[Worker ${workerId}] Scraping category: ${category.name} (${category.url})`);

    // Navigate to the category page
    const response = await page.goto(category.url, {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });

    if (!response || !response.ok()) {
      throw new Error(`Failed to load page: ${response?.status()}`);
    }

    // Wait for products to load
    await page.waitForSelector('[data-testid="product-list-item"], a[href*="/product/"]', {
      timeout: 30000,
    }).catch(() => {
      console.log(`[Worker ${workerId}] No products found on page`);
    });

    // Extract products
    const products = await page.evaluate(() => {
      // Try data-testid first, then fall back to product links
      const listItems = document.querySelectorAll('[data-testid="product-list-item"]');
      if (listItems.length > 0) return listItems.length;
      return document.querySelectorAll('a[href*="/product/"]').length;
    });

    console.log(`[Worker ${workerId}] Found ${products} products in ${category.name}`);

    await browser.close();
    return { success: true, products };

  } catch (error: any) {
    console.error(`[Worker ${workerId}] Error:`, error.message);

    // Check for bot detection
    if (proxyId && isBotDetectionError(error.message)) {
      putProxyInTimeout(proxyId, error.message);
    }

    if (browser) {
      await browser.close().catch(() => {});
    }

    return { success: false, products: 0, error: error.message };
  }
}

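// Each worker repeatedly claims the next index off the shared counter until
// the queue is drained. No locking is needed: Node is single-threaded, so the
// read-and-increment happens atomically between awaits.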
async function worker(
  workerId: number,
  store: Store,
  categories: Category[],
  categoryIndex: { current: number }
): Promise<void> {
  while (categoryIndex.current < categories.length) {
    const idx = categoryIndex.current++;
    const category = categories[idx];

    if (!category) break;

    console.log(`[Worker ${workerId}] Starting category ${idx + 1}/${categories.length}: ${category.name}`);

    const result = await scrapeWithProxy(workerId, store, category);

    if (result.success) {
      console.log(`[Worker ${workerId}] Completed ${category.name}: ${result.products} products`);
    } else {
      console.log(`[Worker ${workerId}] Failed ${category.name}: ${result.error}`);
    }

    // Small delay between requests
    await new Promise(resolve => setTimeout(resolve, 2000 + Math.random() * 3000));
  }

  console.log(`[Worker ${workerId}] Finished all assigned work`);
}

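// Entry point: resolve the store, expand its categories into enough passes to
// keep every worker busy, then fan out staggered workers over the shared queue.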
async function main() {
  console.log(`\n${'='.repeat(60)}`);
  console.log(`Parallel Scraper - ${NUM_WORKERS} workers`);
  console.log(`Target: ${DISPENSARY_NAME}`);
  console.log(`User Agent: Firefox`);
  console.log(`Proxies: ${USE_PROXIES ? 'Enabled' : 'Disabled'}`);
  console.log(`${'='.repeat(60)}\n`);

  // Find the store
  const store = await getStore(DISPENSARY_NAME);
  if (!store) {
    console.error(`Store not found: ${DISPENSARY_NAME}`);
    process.exit(1);
  }

  console.log(`Found store: ${store.name} (ID: ${store.id})`);

  // Get categories
  const categories = await getCategories(store.id);
  if (categories.length === 0) {
    console.error('No categories found for this store');
    process.exit(1);
  }

  console.log(`Found ${categories.length} categories to scrape`);
  console.log(`Categories: ${categories.map(c => c.name).join(', ')}\n`);

  // Check proxies
  const proxyResult = await pool.query('SELECT COUNT(*) as total, COUNT(*) FILTER (WHERE active = true) as active FROM proxies');
  console.log(`Proxies: ${proxyResult.rows[0].active} active / ${proxyResult.rows[0].total} total\n`);

  // Shared index for work distribution
  const categoryIndex = { current: 0 };

  // For a store with few categories, we'll run multiple passes
  // Expand the work by duplicating categories for parallel workers
  const expandedCategories: Category[] = [];
  const passes = Math.ceil(NUM_WORKERS / Math.max(categories.length, 1));
  for (let i = 0; i < passes; i++) {
    expandedCategories.push(...categories);
  }
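  // Example: 15 workers over 4 categories gives ceil(15/4) = 4 passes,
  // i.e. 16 queued scrapes, so every worker has at least one unit of work.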

  console.log(`Running ${NUM_WORKERS} workers across ${expandedCategories.length} category scrapes\n`);

  // Start workers
  const workers: Promise<void>[] = [];
  for (let i = 0; i < NUM_WORKERS; i++) {
    workers.push(worker(i + 1, store, expandedCategories, categoryIndex));
    // Stagger worker starts
    await new Promise(resolve => setTimeout(resolve, 500));
  }

  // Wait for all workers
  await Promise.all(workers);

  console.log(`\n${'='.repeat(60)}`);
  console.log('All workers completed!');
  console.log(`${'='.repeat(60)}\n`);

  await pool.end();
}

main().catch(console.error);