import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import { Pool } from 'pg';

puppeteer.use(StealthPlugin());

// NOTE(review): database credentials are hard-coded here; consider moving them
// to configuration before this script is used outside local development.
const pool = new Pool({
  connectionString: 'postgresql://sail:password@localhost:5432/dutchie_menus'
});

// NOTE(review): several log prefixes below look like mis-encoded emoji. They are
// reproduced as found so the script's output is unchanged — confirm the file encoding.

/** Browser handle as returned by `puppeteer.launch`. */
type LaunchedBrowser = Awaited<ReturnType<typeof puppeteer.launch>>;

/** Page handle as returned by `browser.newPage`. */
type BrowserPage = Awaited<ReturnType<LaunchedBrowser['newPage']>>;

/** Columns selected from the `proxies` table. */
interface ProxyRow {
  id: unknown;
  host: string;
  port: number | string;
  protocol: string;
}

/**
 * Fields this script reads from a product entry in a captured API payload.
 * The payload is third-party JSON and is not validated, so everything is optional.
 */
interface ApiProduct {
  name?: string;
  brand?: string;
  brandName?: string;
  price?: string | number;
  potencyThc?: { formatted?: string };
  category?: string;
}

/** A JSON response captured from the page together with the URL it came from. */
interface CapturedResponse {
  url: string;
  data: { data?: { filteredProducts?: { products?: ApiProduct[] } } } | null | undefined;
}

/** Resolves after `ms` milliseconds (stand-in for the removed `page.waitForTimeout`). */
function sleep(ms: number): Promise<void> {
  return new Promise(resolve => setTimeout(resolve, ms));
}

/**
 * Picks one random proxy flagged as active.
 *
 * @returns The selected proxy row, or `null` when no active proxy exists.
 */
async function getRandomActiveProxy(): Promise<ProxyRow | null> {
  // The original query filtered on `active = false`, which contradicted the
  // function's name and handed back proxies marked as inactive.
  const result = await pool.query(`
    SELECT id, host, port, protocol
    FROM proxies
    WHERE active = true
    ORDER BY RANDOM()
    LIMIT 1
  `);
  return result.rows[0] ?? null;
}

/** Installs the in-page overrides (timezone, geolocation, navigator fields) before any page script runs. */
async function applyStealthOverrides(page: BrowserPage): Promise<void> {
  await page.evaluateOnNewDocument(() => {
    // Override timezone to Arizona
    Object.defineProperty(Intl.DateTimeFormat.prototype, 'resolvedOptions', {
      value: function () {
        return { timeZone: 'America/Phoenix' };
      }
    });

    // Spoof geolocation
    Object.defineProperty(navigator, 'geolocation', {
      get: () => ({
        getCurrentPosition: (success: (position: unknown) => void) => {
          setTimeout(() => {
            success({
              coords: {
                latitude: 33.4484, // Phoenix, AZ
                longitude: -112.0740,
                accuracy: 100
              }
            });
          }, 100);
        }
      })
    });

    // Remove webdriver flag
    Object.defineProperty(navigator, 'webdriver', { get: () => false });

    // Chrome runtime
    (window as any).chrome = { runtime: {} };

    // Languages
    Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });

    // Plugins
    Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
  });
}

/** Flattens the `data.filteredProducts.products` arrays of all captured responses into one list. */
function extractProducts(responses: readonly CapturedResponse[]): ApiProduct[] {
  const allProducts: ApiProduct[] = [];
  for (const { data } of responses) {
    const filtered = data?.data?.filteredProducts;
    if (!filtered) continue;

    const products = filtered.products || [];
    allProducts.push(...products);
    console.log(`āœ… Found ${products.length} products in API response`);
  }
  return allProducts;
}

/** Collects the distinct `brand` / `brandName` values present on the given products. */
function collectBrands(products: readonly ApiProduct[]): Set<string> {
  const brands = new Set<string>();
  for (const product of products) {
    if (product.brand) brands.add(product.brand);
    if (product.brandName) brands.add(product.brandName);
  }
  return brands;
}

/**
 * Upserts the given products for a store, one statement per product.
 * A failure on one product is logged and does not stop the remaining inserts.
 *
 * @param storeId   Value written to `products.store_id`.
 * @param products  Products to persist.
 * @param sourceUrl Value written to `products.dutchie_url`.
 * @returns Number of products saved without error.
 */
async function saveProducts(
  storeId: unknown,
  products: readonly ApiProduct[],
  sourceUrl: string
): Promise<number> {
  let saved = 0;
  for (const product of products) {
    try {
      await pool.query(
        `
        INSERT INTO products (
          store_id, name, brand, price, thc_percentage, dutchie_url, in_stock, category
        )
        VALUES ($1, $2, $3, $4, $5, $6, true, $7)
        ON CONFLICT (store_id, name, brand) DO UPDATE
        SET price = $4, thc_percentage = $5, in_stock = true
        `,
        [
          storeId,
          product.name || 'Unknown',
          product.brand || product.brandName || 'Unknown',
          parseFloat(String(product.price)) || 0,
          parseFloat(product.potencyThc?.formatted?.replace('%', '') ?? '') || null,
          sourceUrl,
          product.category || 'other'
        ]
      );
      saved++;
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      console.log(`āŒ Error saving product: ${message}`);
    }
  }
  return saved;
}

/**
 * Opens the Curaleaf store page through a random active proxy, captures the JSON
 * product responses the page triggers, prints the unique brands found and upserts
 * the first 50 products into the `products` table.
 *
 * The browser and the connection pool are always released in `finally`, so the
 * early-return paths must not close them themselves: pg rejects a second
 * `pool.end()` call.
 */
async function scrapeCuraleafBrands(): Promise<void> {
  let browser: LaunchedBrowser | undefined;

  try {
    // Get proxy
    const proxy = await getRandomActiveProxy();
    if (!proxy) {
      console.log('āš ļø No proxies available');
      return;
    }

    console.log(`šŸ”Œ Using proxy: ${proxy.host}:${proxy.port}`);

    // Launch browser with proxy
    const proxyUrl = `${proxy.protocol}://${proxy.host}:${proxy.port}`;
    browser = await puppeteer.launch({
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
        '--disable-features=IsolateOrigins,site-per-process',
        `--proxy-server=${proxyUrl}`,
        '--disable-web-security',
        '--disable-features=VizDisplayCompositor'
      ]
    });

    const page = await browser.newPage();

    // Set Googlebot user-agent
    await page.setUserAgent('Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');

    // Set viewport
    await page.setViewport({ width: 1920, height: 1080 });

    // Additional stealth measures
    await applyStealthOverrides(page);

    // Get store from database
    const storeResult = await pool.query(`
      SELECT id, name, dutchie_url
      FROM stores
      WHERE slug = 'curaleaf-az-48th-street'
    `);

    if (storeResult.rows.length === 0) {
      console.log('āŒ Store not found');
      return;
    }

    const store = storeResult.rows[0];
    // NOTE(review): the navigated URL is hard-coded and `store.dutchie_url` is
    // never used — confirm this is intentional.
    const testUrl = 'https://curaleaf.com/stores/curaleaf-dispensary-phoenix-airport';

    console.log(`\n🌐 Navigating to: ${testUrl}`);
    console.log(`šŸ“¦ Store: ${store.name}\n`);

    // Track API responses
    const apiResponses: CapturedResponse[] = [];

    page.on('response', async response => {
      const url = response.url();
      try {
        const contentType = response.headers()['content-type'] || '';
        if (!contentType.includes('application/json')) return;

        const data = await response.json();

        // Look for product data
        const looksLikeProductApi =
          url.includes('filteredProducts') ||
          url.includes('products') ||
          url.includes('menu') ||
          Boolean(data?.data?.filteredProducts);

        if (looksLikeProductApi) {
          console.log(`šŸ“” Found product API: ${url.substring(0, 80)}...`);
          apiResponses.push({ url, data });
        }
      } catch {
        // Body was not readable as JSON; ignore this response.
      }
    });

    await page.goto(testUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });

    // Check for Dutchie
    const isDutchie = await page.evaluate(() => typeof (window as any).reactEnv !== 'undefined');

    console.log(`āœ… Is Dutchie menu: ${isDutchie}\n`);

    if (!isDutchie) {
      return;
    }

    // Get reactEnv
    const reactEnv = await page.evaluate(() => (window as any).reactEnv);

    console.log('šŸ“‹ Dutchie Info:');
    console.log(` Chain ID: ${reactEnv.chainId}`);
    console.log(` Dispensary ID: ${reactEnv.dispensaryId}`);
    console.log(` Retailer ID: ${reactEnv.retailerId}\n`);

    // Scroll to trigger lazy loading
    console.log('šŸ“œ Scrolling page to trigger product loading...');
    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight / 2));
    await sleep(3000);
    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    await sleep(3000);

    console.log(`\nšŸ“Š Captured ${apiResponses.length} API responses\n`);

    // Extract products from API responses
    const allProducts = extractProducts(apiResponses);

    if (allProducts.length === 0) {
      console.log('āš ļø No products found in API responses\n');
      return;
    }

    // Extract unique brands
    const brands = collectBrands(allProducts);

    console.log(`\nšŸ·ļø Unique Brands Found (${brands.size}):`);
    console.log('─'.repeat(60));
    Array.from(brands)
      .sort()
      .forEach((brand, i) => {
        console.log(`${i + 1}. ${brand}`);
      });
    console.log('─'.repeat(60));

    // Save products to database (first 50 only)
    console.log(`\nšŸ’¾ Saving ${allProducts.length} products to database...`);
    const saved = await saveProducts(store.id, allProducts.slice(0, 50), testUrl);
    console.log(`āœ… Saved ${saved} products to database\n`);
  } catch (error: unknown) {
    console.error('āŒ Error:', error instanceof Error ? error.message : error);
  } finally {
    // Release the pool even when closing the browser fails.
    try {
      if (browser) {
        await browser.close();
      }
    } finally {
      await pool.end();
    }
  }
}

// Failures during cleanup can still reject; report them instead of leaving a floating promise.
scrapeCuraleafBrands().catch((error: unknown) => {
  console.error('āŒ Error:', error instanceof Error ? error.message : error);
  process.exitCode = 1;
});