import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import { Pool } from 'pg';

puppeteer.use(StealthPlugin());

// NOTE(review): credentials are hard-coded here; consider sourcing the
// connection string from configuration before this leaves a dev machine.
const pool = new Pool({
  connectionString: 'postgresql://sail:password@localhost:5432/dutchie_menus',
});

/** Identifiers read from the page's `window.reactEnv` object. */
interface DutchieInfo {
  dispensaryId: unknown;
  chainId: unknown;
  retailerId: unknown;
}

/** Outcome of a single scrape attempt. */
interface ScrapeResult {
  /** True when the page exposed `window.reactEnv`. */
  isDutchie: boolean;
  /** Unique brand names found, sorted when non-empty. */
  brands: string[];
  /** Present only when the page was detected as a Dutchie menu. */
  dutchieInfo?: DutchieInfo;
  /** Present only when the scrape failed with an exception. */
  error?: string;
}

/**
 * Resolves after `ms` milliseconds.
 * Stand-in for `page.waitForTimeout`, which is not available in current
 * Puppeteer releases.
 */
function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Pulls unique brand names out of an intercepted products payload.
 *
 * Looks for the product list in every location the response listener treats
 * as "product data": `data.filteredProducts.products`, `data.products` and
 * the top-level `products`.
 *
 * @param apiData Parsed JSON body of the intercepted response.
 * @returns The unique brand names and the number of products inspected.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- unvalidated third-party JSON
function extractBrandsFromApiData(apiData: any): { brands: string[]; productCount: number } {
  const candidates: unknown[] = [
    apiData?.data?.filteredProducts?.products,
    apiData?.data?.products,
    apiData?.products,
  ];
  // Only accept an actual array so a differently shaped payload cannot crash the loop.
  const found = candidates.find((candidate) => Array.isArray(candidate));
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- unvalidated third-party JSON
  const products: any[] = Array.isArray(found) ? found : [];

  const brandSet = new Set<string>();
  for (const product of products) {
    // NOTE(review): `brand.name` assumes the brand may arrive as a nested
    // object - confirm against a real response.
    const nameCandidates: unknown[] = [product?.brand, product?.brand?.name, product?.brandName];
    const brandName = nameCandidates.find(
      (value): value is string => typeof value === 'string' && value.length > 0,
    );
    // Very long values are not treated as brand names.
    if (brandName !== undefined && brandName.length < 100) {
      brandSet.add(brandName);
    }
  }

  return { brands: Array.from(brandSet), productCount: products.length };
}

/**
 * Opens `storeUrl` in a headless browser, checks whether the page is a Dutchie
 * menu (by the presence of `window.reactEnv`) and collects the brand names it
 * can find, preferring intercepted JSON responses and falling back to the DOM.
 *
 * The first row of the `proxies` table, if any, is used as the browser proxy.
 * Progress and results are printed to the console.
 *
 * @param storeUrl Address of the store page to inspect.
 * @returns The detection flag and brands; on failure `isDutchie` is false and
 *   `error` carries the message. Exceptions raised inside the scrape are
 *   caught and reported through the result rather than rethrown.
 */
async function scrapeDutchieBrands(storeUrl: string): Promise<ScrapeResult> {
  let browser: Awaited<ReturnType<typeof puppeteer.launch>> | undefined;

  try {
    console.log(`\nšŸ” Checking if ${storeUrl} is a Dutchie menu...\n`);

    // Get proxy
    const proxyResult = await pool.query(`SELECT host, port, protocol FROM proxies LIMIT 1`);
    const proxy = proxyResult.rows[0];

    const browserArgs = [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-blink-features=AutomationControlled',
    ];

    if (proxy) {
      const proxyUrl = `${proxy.protocol}://${proxy.host}:${proxy.port}`;
      browserArgs.push(`--proxy-server=${proxyUrl}`);
      console.log(`Using proxy: ${proxy.host}:${proxy.port}`);
    }

    browser = await puppeteer.launch({
      headless: true,
      args: browserArgs,
    });

    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');

    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, 'webdriver', {
        get: () => false,
      });
    });

    // Enable request interception to capture API calls
    await page.setRequestInterception(true);

    // Last intercepted JSON body that looked like product data (null until one is seen).
    // eslint-disable-next-line @typescript-eslint/no-explicit-any -- unvalidated third-party JSON
    let productsApiData: any = null;
    const apiCalls: string[] = [];

    page.on('request', (request) => {
      const url = request.url();

      // Log API calls
      if (url.includes('api') || url.includes('graphql') || url.includes('.json')) {
        apiCalls.push(url);
      }

      // continue() rejects when the request can no longer be resumed (for
      // example the page is closing); that must not surface as an unhandled
      // rejection.
      request.continue().catch(() => {
        // Best effort: nothing to recover for a request that is already gone.
      });
    });

    page.on('response', async (response) => {
      const url = response.url();

      // Capture ANY API calls that look like they might have product data
      if (
        url.includes('api.dutchie.com') ||
        url.includes('/graphql') ||
        url.includes('/api/') ||
        url.includes('products')
      ) {
        console.log(`šŸ“” API call detected: ${url.substring(0, 100)}...`);

        try {
          const contentType = response.headers()['content-type'] || '';
          if (contentType.includes('application/json')) {
            const data = await response.json();
            console.log(` Response keys: ${Object.keys(data).join(', ')}`);

            // Look for product data in the response
            if (data && (data.data?.filteredProducts || data.data?.products || data.products)) {
              console.log(' āœ… Found product data in response!');
              productsApiData = data;
            }
          }
        } catch (e) {
          // Ignore JSON parse errors: bodies that cannot be read are skipped on purpose.
        }
      }
    });

    console.log('Navigating to store page...');
    await page.goto(storeUrl, { waitUntil: 'networkidle2', timeout: 60000 });

    // Check if it's a Dutchie menu by looking for window.reactEnv
    const isDutchie = await page.evaluate(() => {
      return typeof (window as any).reactEnv !== 'undefined';
    });

    if (!isDutchie) {
      console.log('āŒ This is not a Dutchie menu page');
      return { isDutchie: false, brands: [] };
    }

    console.log('āœ… Detected Dutchie menu!');

    // Extract dispensary info from reactEnv
    const dutchieInfo: DutchieInfo = await page.evaluate(() => {
      const env = (window as any).reactEnv;
      return {
        dispensaryId: env?.dispensaryId,
        chainId: env?.chainId,
        retailerId: env?.retailerId,
      };
    });

    console.log('\nDutchie Menu Info:');
    console.log('─'.repeat(80));
    console.log(`Chain ID: ${dutchieInfo.chainId}`);
    console.log(`Dispensary ID: ${dutchieInfo.dispensaryId}`);
    console.log(`Retailer ID: ${dutchieInfo.retailerId}`);
    console.log('─'.repeat(80));

    // Scroll page to trigger product loading
    console.log('\nScrolling page to trigger product loading...');
    await page.evaluate(() => {
      window.scrollTo(0, document.body.scrollHeight / 2);
    });
    await sleep(3000);

    await page.evaluate(() => {
      window.scrollTo(0, document.body.scrollHeight);
    });

    console.log('Waiting for products to load via API...');
    await sleep(10000);

    console.log(`\nšŸ“Š Total API calls detected: ${apiCalls.length}`);
    if (apiCalls.length > 0) {
      console.log('API endpoints called:');
      apiCalls.slice(0, 10).forEach((url, i) => {
        console.log(` ${i + 1}. ${url.substring(0, 120)}`);
      });
      if (apiCalls.length > 10) {
        console.log(` ... and ${apiCalls.length - 10} more\n`);
      }
    }

    // Extract brands from intercepted API data or DOM
    let brands: string[] = [];

    if (productsApiData) {
      console.log('āœ… Successfully intercepted products API data!');

      // Extract brands from API response
      const extracted = extractBrandsFromApiData(productsApiData);
      brands = extracted.brands;
      console.log(`Found ${extracted.productCount} products in API response`);
    } else {
      console.log('āš ļø No API data intercepted, trying DOM extraction...');

      // Fallback: Extract brands from DOM. This callback runs inside the page,
      // so it must stay self-contained and cannot call helpers from this file.
      brands = await page.evaluate(() => {
        const brandSet = new Set<string>();

        // Dutchie uses specific selectors for products and brands
        const selectors = [
          '[class*="ProductCard"] [class*="brand"]',
          '[class*="product-card"] [class*="brand"]',
          '[data-testid*="product"] [data-testid*="brand"]',
          '[class*="Brand"]',
          '[class*="brand-name"]',
        ];

        for (const selector of selectors) {
          const elements = document.querySelectorAll(selector);
          elements.forEach((el) => {
            const text = el.textContent?.trim();
            // Text containing "$" is skipped (it is not treated as a brand name).
            if (text && text.length > 0 && text.length < 100 && !text.includes('$')) {
              brandSet.add(text);
            }
          });
        }

        // Also look for any element with "brand" in the class containing text
        const allElements = document.querySelectorAll('[class*="brand" i], [class*="Brand"]');
        allElements.forEach((el) => {
          const text = el.textContent?.trim();
          if (
            text &&
            text.length > 1 &&
            text.length < 100 &&
            !text.includes('$') &&
            !text.includes('Add to cart')
          ) {
            brandSet.add(text);
          }
        });

        return Array.from(brandSet);
      });
    }

    console.log('\nšŸ“¦ BRANDS FOUND:');
    console.log('─'.repeat(80));

    if (brands.length === 0) {
      console.log('No brands found.');

      // Debug: show what's on the page
      const pageContent = await page.evaluate(() => {
        return {
          hasProducts: document.querySelectorAll('[class*="product" i], [class*="Product"]').length,
          bodyPreview: document.body.innerText?.substring(0, 500),
        };
      });

      console.log('\nDebug Info:');
      console.log(`Product elements found: ${pageContent.hasProducts}`);
      console.log(`\nPage preview:\n${pageContent.bodyPreview}\n`);
    } else {
      // Sorted in place on purpose: the returned list is the sorted one.
      brands.sort();
      brands.forEach((brand, i) => {
        console.log(`${i + 1}. ${brand}`);
      });
      console.log('─'.repeat(80));
      console.log(`Total: ${brands.length} unique brands\n`);
    }

    return { isDutchie: true, brands, dutchieInfo };
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    console.error('āŒ Error:', message);
    return { isDutchie: false, brands: [], error: message };
  } finally {
    if (browser) {
      await browser.close();
    }
  }
}

/**
 * Runs the scraper once and prints follow-up steps when brands were found.
 * The target URL can be given as the first command-line argument; without one
 * the built-in test store is used. The database pool is always closed, even
 * when the run fails.
 */
async function main(): Promise<void> {
  const testUrl =
    process.argv[2] ?? 'https://curaleaf.com/stores/curaleaf-dispensary-phoenix-airport';

  try {
    console.log('Testing Dutchie brand scraper...');
    console.log('═'.repeat(80));

    const result = await scrapeDutchieBrands(testUrl);

    if (result.isDutchie && result.brands.length > 0) {
      console.log('\nāœ… SUCCESS! Found Dutchie menu with brands.');
      console.log('\nNext steps:');
      console.log('1. Update all Curaleaf store URLs to use the correct Dutchie slugs');
      console.log('2. Scrape products and brands from each store');
      console.log('3. Populate the database with real product data');
    }
  } finally {
    await pool.end();
  }
}

main().catch((error: unknown) => {
  // Anything that escapes main() is reported and reflected in the exit code
  // instead of surfacing as an unhandled rejection.
  console.error('Fatal error:', error instanceof Error ? error.message : error);
  process.exitCode = 1;
});