import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import { Pool } from 'pg';

puppeteer.use(StealthPlugin());

// NOTE(review): credentials are hard-coded in the connection string below;
// consider sourcing them from configuration before using this outside local dev.
const pool = new Pool({
  connectionString: 'postgresql://sail:password@localhost:5432/dutchie_menus'
});

/** Shape handed to the spoofed geolocation success callback. */
type SpoofedPosition = {
  coords: { latitude: number; longitude: number; accuracy: number };
};

/**
 * Resolves after `ms` milliseconds.
 *
 * Used instead of `page.waitForTimeout`, which is not available in current
 * Puppeteer releases.
 */
function delay(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * End-to-end scrape run:
 *  1. picks a random row from the `proxies` table,
 *  2. launches a stealth browser through that proxy with a Googlebot UA and
 *     spoofed timezone / geolocation / `navigator.webdriver`,
 *  3. loads the Curaleaf Phoenix Airport brands page and collects short text
 *     snippets from elements matching a list of brand-like selectors,
 *  4. inserts one placeholder product row per brand into `products`.
 *
 * Errors are logged rather than rethrown; the browser and the pg pool are
 * always released in `finally`.
 */
async function main(): Promise<void> {
  let browser: Awaited<ReturnType<typeof puppeteer.launch>> | undefined;

  try {
    console.log('STEP 2: Getting random proxy from pool...');
    const proxyResult = await pool.query(`
      SELECT host, port, protocol
      FROM proxies
      ORDER BY RANDOM()
      LIMIT 1
    `);

    const proxy = proxyResult.rows[0];
    // An empty proxies table would otherwise surface as an opaque
    // "cannot read properties of undefined" TypeError below.
    if (!proxy) {
      throw new Error('No proxies available in the proxies table');
    }
    console.log(`✅ Selected proxy: ${proxy.host}:${proxy.port}\n`);

    console.log('STEP 3: Launching browser with proxy + anti-fingerprint...');
    const proxyUrl = `${proxy.protocol}://${proxy.host}:${proxy.port}`;

    browser = await puppeteer.launch({
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        `--proxy-server=${proxyUrl}`,
        '--disable-blink-features=AutomationControlled'
      ]
    });

    const page = await browser.newPage();

    // Set Googlebot user-agent
    await page.setUserAgent('Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');
    console.log('✅ Set UA to Googlebot\n');

    // Anti-fingerprint: spoof timezone, geolocation, remove webdriver.
    // This callback is serialized and executed inside the page, so it must
    // not reference anything from the surrounding Node scope.
    await page.evaluateOnNewDocument(() => {
      // Timezone (Arizona)
      Object.defineProperty(Intl.DateTimeFormat.prototype, 'resolvedOptions', {
        value: function() {
          return { timeZone: 'America/Phoenix' };
        }
      });

      // Geolocation (Phoenix)
      Object.defineProperty(navigator, 'geolocation', {
        get: () => ({
          getCurrentPosition: (success: (position: SpoofedPosition) => void) => {
            setTimeout(() => success({
              coords: { latitude: 33.4484, longitude: -112.0740, accuracy: 100 }
            }), 100);
          }
        })
      });

      // Remove webdriver
      Object.defineProperty(navigator, 'webdriver', { get: () => false });
    });

    console.log('✅ Fingerprint spoofed (timezone=Arizona, geo=Phoenix, webdriver=hidden)\n');

    console.log('STEP 4: Navigating to Curaleaf Phoenix Airport brands page...');
    const url = 'https://curaleaf.com/stores/curaleaf-dispensary-phoenix-airport/brands';
    console.log(`URL: ${url}\n`);

    await page.goto(url, { waitUntil: 'networkidle2', timeout: 60000 });
    // Extra settle time after network idle for late client-side rendering.
    await delay(5000);

    console.log('STEP 5: Scraping brand data from page...');

    // Get page info for debugging
    const pageInfo = await page.evaluate(() => ({
      title: document.title,
      url: window.location.href,
      bodyLength: document.body.innerHTML.length
    }));

    console.log(`Page title: "${pageInfo.title}"`);
    console.log(`Current URL: ${pageInfo.url}`);
    console.log(`Body HTML length: ${pageInfo.bodyLength} chars\n`);

    // Scrape brands
    const brands = await page.evaluate(() => {
      // Try multiple selectors; the Set de-duplicates text matched by several.
      const selectors = [
        '[data-testid*="brand"]',
        '[class*="Brand"]',
        '[class*="brand"]',
        'a[href*="/brand/"]',
        '.brand-card',
        '.brand-item'
      ];

      const found = new Set<string>();
      for (const selector of selectors) {
        document.querySelectorAll(selector).forEach((el) => {
          const text = el.textContent?.trim();
          // Only keep short, non-empty snippets (under 50 characters).
          if (text && text.length > 0 && text.length < 50) {
            found.add(text);
          }
        });
      }

      return Array.from(found);
    });

    console.log(`✅ Found ${brands.length} brands:\n`);
    brands.forEach((b, i) => console.log(` ${i + 1}. ${b}`));

    if (brands.length === 0) {
      console.log('\n⚠️ No brands found. Possible reasons:');
      console.log(' - IP/proxy is blocked');
      console.log(' - Page requires different selectors');
      console.log(' - Brands load asynchronously');
      return;
    }

    console.log('\n\nSTEP 6: Saving brands to database...');
    let saved = 0;
    let failed = 0;

    for (const brand of brands) {
      try {
        const result = await pool.query(`
          INSERT INTO products (store_id, name, brand, dutchie_url, in_stock)
          VALUES (1, $1, $2, $3, true)
          ON CONFLICT (store_id, name, brand) DO NOTHING
        `, [`${brand} Product`, brand, url]);
        // rowCount is 0 when ON CONFLICT skipped the row, so only rows that
        // were actually inserted are counted as saved.
        saved += result.rowCount ?? 0;
      } catch (e: unknown) {
        // Keep going on a per-brand failure, but do not hide it.
        failed++;
        const reason = e instanceof Error ? e.message : String(e);
        console.error(`⚠️ Failed to save brand "${brand}": ${reason}`);
      }
    }

    console.log(`✅ Saved ${saved} brands to database\n`);
    if (saved < brands.length) {
      console.log(`ℹ️ ${brands.length - saved - failed} already present, ${failed} failed\n`);
    }
  } catch (error: unknown) {
    console.error('❌ ERROR:', error instanceof Error ? error.message : error);
  } finally {
    if (browser) {
      try {
        await browser.close();
      } catch (closeError: unknown) {
        // A failed close must not prevent the pool from being released.
        console.error(
          '❌ ERROR closing browser:',
          closeError instanceof Error ? closeError.message : closeError
        );
      }
    }
    await pool.end();
  }
}

// main() handles its own errors, but cleanup in `finally` can still reject.
main().catch((error: unknown) => {
  console.error('❌ ERROR:', error instanceof Error ? error.message : error);
});