Add local product detail page with Dutchie comparison

- Add ProductDetail page for viewing products locally
- Add Dutchie and Details buttons to product cards in Products and StoreDetail pages
- Add Last Updated display showing data freshness
- Add parallel scrape scripts and routes
- Add K8s deployment configurations
- Add frontend Dockerfile with nginx

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Author: Kelly
Date: 2025-11-30 06:34:38 -07:00
Parent: 6e597f15ca
Commit: 8b4292fbb2
34 changed files with 1613 additions and 552 deletions


@@ -0,0 +1,241 @@
import { pool } from '../db/migrate';
import { getActiveProxy, putProxyInTimeout, isBotDetectionError } from '../services/proxy';
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
puppeteer.use(StealthPlugin());
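// Desktop Firefox user agent applied to each page, layered on top of the stealth plugin.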
const FIREFOX_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0';
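// CLI arguments: worker count (default 15), dispensary name (default 'Deeply Rooted'), and 'no-proxy' to force direct connections.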
const NUM_WORKERS = parseInt(process.argv[2] || '15', 10);
const DISPENSARY_NAME = process.argv[3] || 'Deeply Rooted';
const USE_PROXIES = process.argv[4] !== 'no-proxy';
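// Minimal shapes of the rows read from the categories and stores tables.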
interface Category {
id: number;
name: string;
slug: string;
url: string;
}
interface Store {
id: number;
name: string;
slug: string;
dutchie_url: string;
}
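// Look up a store by case-insensitive partial name match.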
async function getStore(name: string): Promise<Store | null> {
const result = await pool.query(
`SELECT id, name, slug, dutchie_url FROM stores WHERE name ILIKE $1 LIMIT 1`,
[`%${name}%`]
);
return result.rows[0] || null;
}
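// Fetch the store's categories that are enabled for scraping.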
async function getCategories(storeId: number): Promise<Category[]> {
const result = await pool.query(
`SELECT id, name, slug, dutchie_url as url FROM categories WHERE store_id = $1 AND scrape_enabled = true`,
[storeId]
);
return result.rows;
}
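// Scrape one category page in a fresh headless browser, optionally routed through a proxy, and return a product count.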
async function scrapeWithProxy(
workerId: number,
store: Store,
category: Category
): Promise<{ success: boolean; products: number; error?: string }> {
let browser = null;
let proxyId: number | null = null;
try {
// Get a proxy (if enabled)
let proxy = null;
if (USE_PROXIES) {
proxy = await getActiveProxy();
if (proxy) {
proxyId = proxy.id;
console.log(`[Worker ${workerId}] Using proxy: ${proxy.protocol}://${proxy.host}:${proxy.port}`);
} else {
console.log(`[Worker ${workerId}] No proxy available, using direct connection`);
}
} else {
console.log(`[Worker ${workerId}] Direct connection (proxies disabled)`);
}
// Build browser args
const args = [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-accelerated-2d-canvas',
'--disable-gpu',
'--window-size=1920,1080',
];
if (proxy) {
if (proxy.protocol === 'socks5' || proxy.protocol === 'socks') {
args.push(`--proxy-server=socks5://${proxy.host}:${proxy.port}`);
} else {
args.push(`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`);
}
}
browser = await puppeteer.launch({
headless: true,
args,
executablePath: process.env.PUPPETEER_EXECUTABLE_PATH,
});
const page = await browser.newPage();
await page.setUserAgent(FIREFOX_USER_AGENT);
await page.setViewport({ width: 1920, height: 1080 });
// Handle proxy auth if needed
if (proxy?.username && proxy?.password) {
await page.authenticate({
username: proxy.username,
password: proxy.password,
});
}
console.log(`[Worker ${workerId}] Scraping category: ${category.name} (${category.url})`);
// Navigate to the category page
const response = await page.goto(category.url, {
waitUntil: 'networkidle2',
timeout: 60000,
});
if (!response || !response.ok()) {
throw new Error(`Failed to load page: ${response?.status()}`);
}
// Wait for products to load
await page.waitForSelector('[data-testid="product-list-item"], a[href*="/product/"]', {
timeout: 30000,
}).catch(() => {
console.log(`[Worker ${workerId}] No products found on page`);
});
// Extract products
const products = await page.evaluate(() => {
// Try data-testid first, then fall back to product links
const listItems = document.querySelectorAll('[data-testid="product-list-item"]');
if (listItems.length > 0) return listItems.length;
return document.querySelectorAll('a[href*="/product/"]').length;
});
console.log(`[Worker ${workerId}] Found ${products} products in ${category.name}`);
await browser.close();
return { success: true, products };
} catch (error: any) {
console.error(`[Worker ${workerId}] Error:`, error.message);
// Check for bot detection
if (proxyId && isBotDetectionError(error.message)) {
putProxyInTimeout(proxyId, error.message);
}
if (browser) {
await browser.close().catch(() => {});
}
return { success: false, products: 0, error: error.message };
}
}
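// Worker loop: claim the next category from the shared index until the list is exhausted, with a randomized delay between requests.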
async function worker(
workerId: number,
store: Store,
categories: Category[],
categoryIndex: { current: number }
): Promise<void> {
while (categoryIndex.current < categories.length) {
const idx = categoryIndex.current++;
const category = categories[idx];
if (!category) break;
console.log(`[Worker ${workerId}] Starting category ${idx + 1}/${categories.length}: ${category.name}`);
const result = await scrapeWithProxy(workerId, store, category);
if (result.success) {
console.log(`[Worker ${workerId}] Completed ${category.name}: ${result.products} products`);
} else {
console.log(`[Worker ${workerId}] Failed ${category.name}: ${result.error}`);
}
// Small delay between requests
await new Promise(resolve => setTimeout(resolve, 2000 + Math.random() * 3000));
}
console.log(`[Worker ${workerId}] Finished all assigned work`);
}
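// Entry point: resolve the store and its categories, then fan the work out across staggered workers.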
async function main() {
console.log(`\n${'='.repeat(60)}`);
console.log(`Parallel Scraper - ${NUM_WORKERS} workers`);
console.log(`Target: ${DISPENSARY_NAME}`);
console.log(`User Agent: Firefox`);
console.log(`Proxies: ${USE_PROXIES ? 'Enabled' : 'Disabled'}`);
console.log(`${'='.repeat(60)}\n`);
// Find the store
const store = await getStore(DISPENSARY_NAME);
if (!store) {
console.error(`Store not found: ${DISPENSARY_NAME}`);
process.exit(1);
}
console.log(`Found store: ${store.name} (ID: ${store.id})`);
// Get categories
const categories = await getCategories(store.id);
if (categories.length === 0) {
console.error('No categories found for this store');
process.exit(1);
}
console.log(`Found ${categories.length} categories to scrape`);
console.log(`Categories: ${categories.map(c => c.name).join(', ')}\n`);
// Check proxies
const proxyResult = await pool.query('SELECT COUNT(*) as total, COUNT(*) FILTER (WHERE active = true) as active FROM proxies');
console.log(`Proxies: ${proxyResult.rows[0].active} active / ${proxyResult.rows[0].total} total\n`);
// Shared index for work distribution
const categoryIndex = { current: 0 };
// For a store with few categories, we'll run multiple passes
// Expand the work by duplicating categories for parallel workers
const expandedCategories: Category[] = [];
const passes = Math.ceil(NUM_WORKERS / Math.max(categories.length, 1));
for (let i = 0; i < passes; i++) {
expandedCategories.push(...categories);
}
console.log(`Running ${NUM_WORKERS} workers across ${expandedCategories.length} category scrapes\n`);
// Start workers
const workers: Promise<void>[] = [];
for (let i = 0; i < NUM_WORKERS; i++) {
workers.push(worker(i + 1, store, expandedCategories, categoryIndex));
// Stagger worker starts
await new Promise(resolve => setTimeout(resolve, 500));
}
// Wait for all workers
await Promise.all(workers);
console.log(`\n${'='.repeat(60)}`);
console.log('All workers completed!');
console.log(`${'='.repeat(60)}\n`);
await pool.end();
}
main().catch(console.error);