import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import { Pool } from 'pg';

puppeteer.use(StealthPlugin());

// NOTE(review): the connection string (including the password) is hard-coded;
// consider moving it to configuration before running outside local development.
const pool = new Pool({
  connectionString: 'postgresql://sail:password@localhost:5432/dutchie_menus'
});

/** Brands listing page that is scraped. */
const BRANDS_URL = 'https://curaleaf.com/stores/curaleaf-dispensary-phoenix-airport/brands';

/** Slug of the `stores` row the scraped brands are saved under. */
const STORE_SLUG = 'curaleaf-az-48th-street';

// Derived from the launcher so no additional type import is required.
type Browser = Awaited<ReturnType<typeof puppeteer.launch>>;
type Page = Awaited<ReturnType<Browser['newPage']>>;

/**
 * Resolves after `ms` milliseconds.
 *
 * Used instead of `page.waitForTimeout`, which is not available in current
 * Puppeteer releases.
 */
function sleep(ms: number): Promise<void> {
  return new Promise(resolve => setTimeout(resolve, ms));
}

/**
 * Walks through the two-step age gate: picks "Arizona" in the state dropdown,
 * then clicks the "I'm over 21" confirmation button.
 *
 * @param page Page currently showing the age gate.
 * @throws If the dropdown, its options, or the confirmation button do not
 *         appear within their timeouts.
 */
async function passAgeGate(page: Page): Promise<void> {
  // Gate 1: State selector
  await page.waitForSelector('button[role="combobox"]', { timeout: 10000 });
  await page.click('button[role="combobox"]');
  console.log(' āœ… Opened dropdown');
  await sleep(2000);

  // Find and click Arizona with a real Puppeteer click (not a synthetic DOM click).
  await page.waitForSelector('[role="option"]', { timeout: 5000 });
  const options = await page.$$('[role="option"]');
  let selectedState = false;
  for (const option of options) {
    const text = await option.evaluate(el => el.textContent?.toLowerCase().trim());
    if (text === 'arizona') {
      await option.click();
      console.log(' āœ… Selected Arizona');
      selectedState = true;
      break;
    }
  }
  if (!selectedState) {
    // Keep going (best effort), but make the likely cause of a later timeout visible.
    console.warn(' Arizona option not found in state dropdown');
  }
  await sleep(3000);

  // Gate 2: Age confirmation - wait for button to appear.
  // waitForFunction either resolves with a handle or throws on timeout, so
  // reaching the next line means the button exists.
  const ageButtonHandle = await page.waitForFunction(() => {
    const buttons = Array.from(document.querySelectorAll('button'));
    return buttons.some(btn => btn.textContent?.trim().toLowerCase().includes("i'm over 21"));
  }, { timeout: 10000 });
  await ageButtonHandle.dispose();
  console.log(' āœ… Age button appeared');

  // Click it with page.evaluate since we know the text
  await page.evaluate(() => {
    const buttons = Array.from(document.querySelectorAll('button'));
    const ageBtn = buttons.find(btn =>
      btn.textContent?.trim().toLowerCase().includes("i'm over 21")
    );
    ageBtn?.click();
  });
  console.log(' āœ… Clicked age confirmation');
  await sleep(5000);
}

/**
 * Collects candidate brand names from the current page.
 *
 * Text is taken from every element matching a brand-like selector, trimmed,
 * and kept when it is 2-49 characters long and is not one of the known
 * non-brand labels ("Brands", "Search", "BrandsSearch").
 *
 * @param page Page showing the brands listing.
 * @returns Unique brand names in sorted order.
 */
async function extractBrandNames(page: Page): Promise<string[]> {
  return page.evaluate(() => {
    const selectors = [
      '[data-testid*="brand"]',
      '[class*="Brand"]',
      '[class*="brand"]',
      'a[href*="/brand/"]'
    ];
    const found = new Set<string>();
    for (const selector of selectors) {
      document.querySelectorAll(selector).forEach(el => {
        const text = el.textContent?.trim();
        // Filter out single letters, "Brands", "Search", etc.
        if (text && text.length > 1 && text.length < 50 &&
            text !== 'Brands' && text !== 'Search' && text !== 'BrandsSearch' &&
            !/^[A-Z]$/.test(text)) {
          found.add(text);
        }
      });
    }
    return Array.from(found).sort();
  });
}

/**
 * Replaces all brands of a store with the given list inside one transaction,
 * so a failure part-way through leaves the previous rows untouched.
 *
 * @param storeId `stores.id` of the store whose brands are replaced.
 * @param brands Brand names to store.
 * @returns Number of brand rows written.
 * @throws Re-throws any database error after rolling the transaction back.
 */
async function replaceStoreBrands(
  storeId: number | string,
  brands: readonly string[]
): Promise<number> {
  // A transaction must run on a single client, not on the pool itself.
  const client = await pool.connect();
  try {
    await client.query('BEGIN');

    // Delete existing brands for this store
    await client.query('DELETE FROM brands WHERE store_id = $1', [storeId]);
    console.log(` šŸ—‘ļø Deleted old brands for store ${storeId}`);

    // Insert new brands using ON CONFLICT to handle duplicates
    let inserted = 0;
    for (const brandName of brands) {
      await client.query(`
        INSERT INTO brands (store_id, name, created_at, updated_at)
        VALUES ($1, $2, NOW(), NOW())
        ON CONFLICT (store_id, name) DO UPDATE SET updated_at = NOW()
      `, [storeId, brandName]);
      inserted++;
    }

    await client.query('COMMIT');
    return inserted;
  } catch (error: unknown) {
    await client.query('ROLLBACK');
    throw error;
  } finally {
    client.release();
  }
}

/**
 * Scrapes the brand list from the Curaleaf brands page through a randomly
 * chosen proxy and stores it for the configured store.
 *
 * Steps: pick a proxy row, launch a headless browser with a mobile Chrome
 * user agent, pass the age gate when redirected to it, collect brand names,
 * then replace the store's rows in `brands`. Errors are logged rather than
 * re-thrown; the browser and the connection pool are always closed.
 */
async function scrapeCuraleafBrands(): Promise<void> {
  let browser: Browser | undefined;

  try {
    // Get random proxy
    const proxyResult = await pool.query(`
      SELECT host, port, protocol
      FROM proxies
      ORDER BY RANDOM()
      LIMIT 1
    `);
    const proxy = proxyResult.rows[0];
    if (!proxy) {
      throw new Error('No proxy available: the proxies table returned no rows');
    }
    const proxyUrl = `${proxy.protocol}://${proxy.host}:${proxy.port}`;
    console.log('šŸ”Œ Proxy:', `${proxy.host}:${proxy.port}`);

    browser = await puppeteer.launch({
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        `--proxy-server=${proxyUrl}`
      ]
    });

    const page = await browser.newPage();

    // Mobile Chrome UA
    const mobileUA = 'Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Mobile Safari/537.36';
    await page.setUserAgent(mobileUA);
    console.log('šŸ“± UA: Mobile Chrome');
    console.log('');

    console.log('🌐 Going to:', BRANDS_URL);
    await page.goto(BRANDS_URL, { waitUntil: 'networkidle2', timeout: 60000 });
    await sleep(3000);

    if (page.url().includes('/age-gate')) {
      console.log('šŸ”’ Handling age gate...');
      await passAgeGate(page);
    }

    console.log('');
    console.log('šŸ“¦ Scraping brands...');
    console.log('šŸ“ URL:', page.url());
    await sleep(3000);

    // Scrape brands with better filtering
    const brands = await extractBrandNames(page);

    console.log(`\nāœ… Found ${brands.length} brands`);
    console.log('─'.repeat(60));
    brands.forEach((b, i) => console.log(` ${i + 1}. ${b}`));
    console.log('─'.repeat(60));

    if (brands.length === 0) {
      // An empty result most likely means the page was blocked or its markup
      // changed; do not wipe the brands that are already stored.
      console.log('No brands scraped; leaving existing database rows untouched');
      return;
    }

    // Save to database
    console.log('');
    console.log('šŸ’¾ Saving to database...');

    // Get the store ID
    const storeResult = await pool.query('SELECT id FROM stores WHERE slug = $1', [STORE_SLUG]);
    if (storeResult.rows.length === 0) {
      console.log(`āŒ Store not found: ${STORE_SLUG}`);
      return;
    }
    const storeId = storeResult.rows[0].id;

    const inserted = await replaceStoreBrands(storeId, brands);

    console.log(` āœ… Saved ${inserted} brands`);
    console.log('');
    console.log(`šŸŽ‰ Complete! View at: http://localhost:5174/stores/az/curaleaf/${STORE_SLUG}/brands`);
  } catch (error: unknown) {
    if (error instanceof Error) {
      console.error('āŒ Error:', error.message);
      console.error(error.stack);
    } else {
      console.error('āŒ Error:', error);
    }
  } finally {
    // Close the pool even when closing the browser fails.
    try {
      if (browser) await browser.close();
    } finally {
      await pool.end();
    }
  }
}

// Cleanup in the finally block can still reject; report it instead of
// leaving an unhandled rejection.
scrapeCuraleafBrands().catch((error: unknown) => {
  console.error('Unhandled failure while shutting down:', error);
});