import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import { Pool } from 'pg';

puppeteer.use(StealthPlugin());

/** Store page whose product cards are scraped. */
const STORE_URL = 'https://curaleaf.com/stores/curaleaf-dispensary-phoenix-airport';

/** Slug of the `stores` row that scraped products are attached to. */
const STORE_SLUG = 'curaleaf-az-48th-street';

/** Fixed delay after navigation that gives client-side rendering time to finish. */
const RENDER_DELAY_MS = 5000;

// The connection string can be overridden through DATABASE_URL; the literal is
// kept as the default so existing local setups continue to work unchanged.
const pool = new Pool({
  connectionString:
    process.env.DATABASE_URL ?? 'postgresql://sail:password@localhost:5432/dutchie_menus',
});

/** Raw text extracted from a single product element on the page. */
interface ScrapedProduct {
  name: string;
  brand?: string;
  price?: string;
}

/** Row shape returned by the proxy lookup query. */
interface ProxyRow {
  host: string;
  port: number;
  protocol: string;
}

/** Returns a printable message for any thrown value. */
function errorMessage(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Converts scraped price text into a number.
 *
 * Takes the first dollar amount in the text (or, failing that, the first
 * number) instead of stripping every non-numeric character, so a range such as
 * "$25.00 - $30.00" yields 25 rather than a concatenation of both prices.
 *
 * @param text - Raw price text; may be missing.
 * @returns The parsed price, or 0 when no number can be found.
 */
function parsePrice(text: string | undefined): number {
  if (!text) return 0;

  // Drop thousands separators so "$1,250.00" parses as 1250.
  const normalized = text.replace(/,/g, '');
  const match =
    /\$\s*(\d+(?:\.\d+)?|\.\d+)/.exec(normalized) ?? /(\d+(?:\.\d+)?|\.\d+)/.exec(normalized);
  if (!match) return 0;

  const value = Number.parseFloat(match[1] ?? '');
  return Number.isFinite(value) ? value : 0;
}

/**
 * Scrapes product cards from the store page through a randomly chosen proxy
 * and upserts them into the `products` table.
 *
 * Failures are logged rather than thrown. The browser (if launched) and the
 * database pool are closed before the returned promise settles.
 */
async function scrape(): Promise<void> {
  let browser: Awaited<ReturnType<typeof puppeteer.launch>> | undefined;

  try {
    // Get random proxy.
    // NOTE(review): this selects rows with `active = false`, which looks
    // inverted for a "pick a usable proxy" query — confirm against the schema.
    const proxyResult = await pool.query<ProxyRow>(`
      SELECT host, port, protocol
      FROM proxies
      WHERE active = false
      ORDER BY RANDOM()
      LIMIT 1
    `);

    const proxy = proxyResult.rows[0];
    if (!proxy) {
      console.log('❌ No proxies available');
      return;
    }

    const proxyUrl = `${proxy.protocol}://${proxy.host}:${proxy.port}`;
    console.log(`🔌 Using proxy: ${proxy.host}:${proxy.port}\n`);

    // Launch browser with proxy
    browser = await puppeteer.launch({
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox', `--proxy-server=${proxyUrl}`],
    });

    const page = await browser.newPage();
    await page.setUserAgent(
      'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
    );

    console.log(`🌐 Going to: ${STORE_URL}\n`);
    await page.goto(STORE_URL, { waitUntil: 'networkidle2', timeout: 60000 });

    // Wait for products to load. `page.waitForTimeout` was removed from
    // Puppeteer, so a plain timer is used for the fixed delay.
    console.log('⏳ Waiting for products to load...\n');
    await new Promise<void>((resolve) => setTimeout(resolve, RENDER_DELAY_MS));

    // Scrape products from DOM. This callback runs inside the page, so it must
    // not reference anything from the surrounding Node scope.
    const products: ScrapedProduct[] = await page.evaluate(() => {
      const productElements = document.querySelectorAll(
        '[data-testid="product-card"], .product-card, [class*="Product"], [class*="product"]',
      );
      const results: { name: string; brand?: string; price?: string }[] = [];

      productElements.forEach((el) => {
        try {
          // Try to extract product info from the element
          const nameEl = el.querySelector('[class*="name"], [class*="Name"], h3, h4');
          const brandEl = el.querySelector('[class*="brand"], [class*="Brand"]');
          const priceEl = el.querySelector('[class*="price"], [class*="Price"]');

          const name = nameEl?.textContent?.trim();
          const brand = brandEl?.textContent?.trim();
          const price = priceEl?.textContent?.trim();

          if (name) {
            results.push({ name, brand, price });
          }
        } catch {
          // Best effort: skip an element that cannot be read.
        }
      });

      return results;
    });

    console.log(`📦 Found ${products.length} products\n`);

    if (products.length > 0) {
      // Extract unique brands
      const brands = new Set(products.map((p) => p.brand).filter(Boolean));
      console.log(`🏷️ Brands found: ${Array.from(brands).join(', ')}\n`);

      // Get store ID
      const storeResult = await pool.query<{ id: number }>(
        'SELECT id FROM stores WHERE slug = $1',
        [STORE_SLUG],
      );

      const store = storeResult.rows[0];
      if (!store) {
        console.log('❌ Store not found in database');
        return;
      }
      const storeId = store.id;

      // Save to database. Rows are written one at a time so a single failing
      // product is logged and skipped without aborting the rest.
      let saved = 0;
      for (const product of products) {
        if (!product.name || !product.brand) continue;

        try {
          await pool.query(
            `
            INSERT INTO products (store_id, name, brand, price, dutchie_url, in_stock)
            VALUES ($1, $2, $3, $4, $5, true)
            ON CONFLICT (store_id, name, brand)
            DO UPDATE SET price = $4, in_stock = true
            `,
            [storeId, product.name, product.brand, parsePrice(product.price), STORE_URL],
          );
          saved++;
        } catch (error: unknown) {
          console.log(`⚠️ Skip: ${errorMessage(error)}`);
        }
      }

      console.log(`✅ Saved ${saved} products to database\n`);
    } else {
      console.log('⚠️ No products found on page\n');

      // Debug: show what we found
      const pageContent = await page.evaluate(() => ({
        title: document.title,
        bodyText: document.body.innerText.substring(0, 500),
      }));

      console.log('Page title:', pageContent.title);
      console.log('Page preview:', pageContent.bodyText);
    }
  } catch (error: unknown) {
    console.error('❌ Error:', errorMessage(error));
  } finally {
    // Close the pool even when closing the browser fails.
    try {
      if (browser) await browser.close();
    } catch (error: unknown) {
      console.error('⚠️ Failed to close browser:', errorMessage(error));
    } finally {
      await pool.end();
    }
  }
}

// scrape() handles its own errors; this catch only covers a failure during
// cleanup (e.g. pool.end() rejecting) so it is reported, not left unhandled.
scrape().catch((error: unknown) => {
  console.error('❌ Error:', errorMessage(error));
  process.exitCode = 1;
});