import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import { Pool } from 'pg';

puppeteer.use(StealthPlugin());

const pool = new Pool({
  connectionString: 'postgresql://sail:password@localhost:5432/dutchie_menus'
});

/** Store page that is scraped for brand names. */
const STORE_URL = 'https://curaleaf.com/stores/curaleaf-dispensary-48th-street';

/** Slug the store row is looked up by before it is renamed. */
const LEGACY_STORE_SLUG = 'curaleaf-az-48th-street-med';

/** Slug the store row is renamed to, and later looked up by when inserting brands. */
const STORE_SLUG = 'curaleaf-az-48th-street';

/** Fixed pause after navigation that gives client-side rendering time to finish. */
const RENDER_DELAY_MS = 3000;

/** Columns read from the `proxies` table. */
type ProxyRow = {
  host: string;
  port: number | string;
  protocol: string;
  username: string | null;
  password: string | null;
};

/** Resolves after `ms` milliseconds. */
function delay(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/** Builds the Chromium launch flags, adding `--proxy-server` when a proxy row is given. */
function buildBrowserArgs(proxy: ProxyRow | undefined): string[] {
  const args = [
    '--no-sandbox',
    '--disable-setuid-sandbox',
    '--disable-dev-shm-usage',
    '--disable-blink-features=AutomationControlled'
  ];

  if (proxy) {
    args.push(`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`);
  }

  return args;
}

/** Points the store row at `storeUrl`, renames its slug, and inserts the given brands for it. */
async function saveBrands(storeUrl: string, brands: readonly string[]): Promise<void> {
  console.log('Updating store URL in database...');
  await pool.query(
    `UPDATE stores
     SET dutchie_url = $1,
         slug = $2,
         updated_at = NOW()
     WHERE slug = $3`,
    [storeUrl, STORE_SLUG, LEGACY_STORE_SLUG]
  );
  console.log('āœ“ Store URL updated\n');

  console.log('Inserting brands into database...');
  const storeResult = await pool.query<{ id: number | string }>(
    'SELECT id FROM stores WHERE slug = $1',
    [STORE_SLUG]
  );

  const store = storeResult.rows[0];
  if (!store) {
    // Previously this case was skipped without any output.
    console.warn(`No store found with slug "${STORE_SLUG}"; brands were not saved.`);
    return;
  }

  let inserted = 0;
  for (const brandName of brands) {
    const result = await pool.query(
      `INSERT INTO brands (store_id, name, created_at, updated_at)
       VALUES ($1, $2, NOW(), NOW())
       ON CONFLICT (store_id, name) DO NOTHING`,
      [store.id, brandName]
    );
    // rowCount is 0 when ON CONFLICT skipped the row, so this counts real inserts only.
    inserted += result.rowCount ?? 0;
  }

  const skipped = brands.length - inserted;
  const skippedNote = skipped > 0 ? ` (${skipped} already existed)` : '';
  console.log(`āœ“ Inserted ${inserted} brands${skippedNote}\n`);
}

/**
 * Scrapes brand names from the Curaleaf 48th Street store page and stores them.
 *
 * Steps: read one proxy row (if any), launch a headless browser, load the store
 * page, collect the text of brand-like elements, print them, then update the
 * store row and insert the brands. When no brands are found, a short dump of the
 * page is printed instead and the database is left untouched.
 *
 * Errors are logged rather than rethrown and set `process.exitCode` to 1. The
 * browser and the connection pool are always closed before the promise settles.
 */
async function scrapeBrands(): Promise<void> {
  let browser: Awaited<ReturnType<typeof puppeteer.launch>> | undefined;

  try {
    console.log('\nšŸ” Scraping brands from Curaleaf 48th Street...\n');

    // Get proxy
    const proxyResult = await pool.query<ProxyRow>(`
      SELECT host, port, protocol, username, password
      FROM proxies
      LIMIT 1
    `);
    const proxy: ProxyRow | undefined = proxyResult.rows[0];

    const browserArgs = buildBrowserArgs(proxy);
    if (proxy) {
      console.log(`Using proxy: ${proxy.host}:${proxy.port}`);
    }

    browser = await puppeteer.launch({
      headless: true,
      args: browserArgs
    });

    const page = await browser.newPage();

    // Chromium ignores credentials embedded in --proxy-server, so they are
    // supplied through the page before the first request is made.
    if (proxy?.username) {
      await page.authenticate({
        username: proxy.username,
        password: proxy.password ?? ''
      });
    }

    // Set Googlebot user agent
    await page.setUserAgent('Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');

    // Set age verification cookie
    await page.setCookie({
      name: 'age_verified',
      value: 'true',
      domain: '.curaleaf.com',
      path: '/'
    });

    // Anti-detection
    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, 'webdriver', {
        get: () => false,
      });
    });

    console.log(`Navigating to: ${STORE_URL}`);
    console.log('─'.repeat(80));

    await page.goto(STORE_URL, {
      waitUntil: 'networkidle2',
      timeout: 60000
    });

    const currentUrl = page.url();
    console.log(`Current URL after navigation: ${currentUrl}\n`);
    console.log('Page loaded, waiting for content to render...\n');

    // Wait a bit for JavaScript to render content. page.waitForTimeout is not
    // available in current Puppeteer releases, so a plain timer is used.
    await delay(RENDER_DELAY_MS);

    // Check for iframes
    const iframeInfo = await page.evaluate(() => {
      const iframes = Array.from(document.querySelectorAll('iframe'));
      return iframes.map(iframe => ({
        src: iframe.src,
        id: iframe.id,
        className: iframe.className
      }));
    });

    if (iframeInfo.length > 0) {
      console.log('Found iframes:');
      iframeInfo.forEach((iframe, i) => {
        console.log(`  ${i + 1}. ${iframe.src}`);
      });
      console.log('');
    }

    // Extract brands from the page. The callback runs inside the browser, so it
    // must not reference anything from this module's scope.
    const brands = await page.evaluate(() => {
      const brandSet = new Set<string>();

      // Try multiple selectors for brands
      const selectors = [
        '[class*="brand"]',
        '[class*="Brand"]',
        '[data-testid*="brand"]',
        '[class*="product"] [class*="brand"]',
        '[class*="Product"] [class*="Brand"]'
      ];

      for (const selector of selectors) {
        document.querySelectorAll(selector).forEach(el => {
          const text = el.textContent?.trim();
          if (text && text.length > 0 && text.length < 100) {
            brandSet.add(text);
          }
        });
      }

      // Also look for product cards and extract any brand info
      const productCards = document.querySelectorAll('[class*="product"], [class*="Product"], [class*="card"]');
      productCards.forEach(card => {
        const brandElement = card.querySelector('[class*="brand"], [class*="Brand"]');
        if (brandElement) {
          const text = brandElement.textContent?.trim();
          if (text && text.length > 0 && text.length < 100) {
            brandSet.add(text);
          }
        }
      });

      return Array.from(brandSet);
    });

    console.log('BRANDS FOUND:');
    console.log('─'.repeat(80));

    if (brands.length === 0) {
      console.log('No brands found!');
      console.log('\nDumping page structure for debugging...\n');

      const pageInfo = await page.evaluate(() => {
        return {
          title: document.title,
          productElements: document.querySelectorAll('[class*="product"], [class*="Product"]').length,
          hasIframe: document.querySelectorAll('iframe').length,
          allText: document.body.innerText?.substring(0, 500)
        };
      });

      console.log('Page Title:', pageInfo.title);
      console.log('Product Elements:', pageInfo.productElements);
      console.log('Iframes:', pageInfo.hasIframe);
      console.log('\nFirst 500 chars of visible text:');
      console.log(pageInfo.allText);
      return;
    }

    // Sorted copy: used for both the printed list and the insert order.
    const sortedBrands = [...brands].sort();
    sortedBrands.forEach((brand, i) => {
      console.log(`${i + 1}. ${brand}`);
    });

    console.log('─'.repeat(80));
    console.log(`Total unique brands: ${sortedBrands.length}\n`);

    await saveBrands(STORE_URL, sortedBrands);
  } catch (error: unknown) {
    if (error instanceof Error) {
      console.error('Error:', error.message);
      if (error.stack) {
        console.error(error.stack);
      }
    } else {
      console.error('Error:', error);
    }
    // Let callers (shell scripts, schedulers) see that the run failed.
    process.exitCode = 1;
  } finally {
    // Nested so the pool is closed even when closing the browser throws.
    try {
      if (browser) {
        await browser.close();
      }
    } finally {
      await pool.end();
    }
  }
}

scrapeBrands().catch((error: unknown) => {
  // Only reachable when cleanup in the finally block itself fails.
  console.error('Error during cleanup:', error);
  process.exitCode = 1;
});