/**
 * Extract ALL product elements and find unique products.
 *
 * Diagnostic script: loads the store's online "brands" menu in headless
 * Chromium, counts the elements whose class contains `product_product__`,
 * compares several selector strategies for reading product names, and prints
 * a de-duplicated product list to the console.
 */
import puppeteer, { Page } from 'puppeteer';

const STORE_ID = 'best';

/** Summary of one selector strategy evaluated inside the page. */
interface ApproachResult {
  count: number;
  unique: number;
  sample: string[];
}

/** One de-duplicated product as read from the page. */
interface ExtractedProduct {
  name: string;
  href: string;
  price: string;
}

/**
 * Resolve after `ms` milliseconds.
 *
 * @param ms - Delay in milliseconds.
 */
async function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => setTimeout(resolve, ms));
}

/**
 * Dismiss the age-gate modal if one is present.
 *
 * When an element matching `[data-testid="age-gate-modal"]` exists, the
 * submit button (if found) is clicked and the function then waits two
 * seconds. When no modal is found the function returns immediately.
 *
 * @param page - Puppeteer page to operate on.
 */
async function bypassAgeGate(page: Page): Promise<void> {
  const ageGate = await page.$('[data-testid="age-gate-modal"]');
  if (!ageGate) return;

  const btn = await page.$('[data-testid="age-gate-submit-button"]');
  if (btn) await btn.click();
  await sleep(2000);
}

/**
 * Run the three-step analysis and log the results.
 *
 * The browser is always closed, even when navigation or an in-page
 * evaluation rejects, so a failed run does not leave a Chromium process
 * behind. Errors still propagate to the caller.
 */
async function main(): Promise<void> {
  console.log('='.repeat(60));
  console.log('Extracting ALL product elements');
  console.log('='.repeat(60));

  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Abort image, font and media requests; let everything else through.
    await page.setRequestInterception(true);
    page.on('request', (req) => {
      if (['image', 'font', 'media'].includes(req.resourceType())) {
        void req.abort();
      } else {
        void req.continue();
      }
    });

    const url = `https://${STORE_ID}.treez.io/onlinemenu/brands?customerType=ADULT`;
    await page.goto(url, { waitUntil: 'networkidle2', timeout: 60000 });
    await sleep(3000);
    await bypassAgeGate(page);
    await sleep(2000);

    // Get ALL elements with product_product__ class
    console.log('\n[1] Counting all product_product__ elements...');
    const elementAnalysis = await page.evaluate(() => {
      const all = document.querySelectorAll('[class*="product_product__"]');
      const byTag: Record<string, number> = {};
      const anchorHrefs: string[] = [];
      const imgAlts: string[] = [];

      all.forEach((el) => {
        const tag = el.tagName;
        byTag[tag] = (byTag[tag] || 0) + 1;

        if (tag === 'A') {
          const href = el.getAttribute('href');
          if (href && href.includes('/product/')) {
            anchorHrefs.push(href);
          }
        }

        if (tag === 'IMG') {
          const alt = el.getAttribute('alt');
          if (alt) imgAlts.push(alt);
        }
      });

      // Samples are capped at 20; the unique counts use the full lists.
      return {
        total: all.length,
        byTag,
        anchorHrefs: anchorHrefs.slice(0, 20),
        uniqueAnchors: new Set(anchorHrefs).size,
        imgAlts: imgAlts.slice(0, 20),
        uniqueImgAlts: new Set(imgAlts).size,
      };
    });

    console.log(`Total elements: ${elementAnalysis.total}`);
    console.log(`By tag:`, elementAnalysis.byTag);
    console.log(`Unique anchor hrefs: ${elementAnalysis.uniqueAnchors}`);
    console.log(`Unique image alts: ${elementAnalysis.uniqueImgAlts}`);
    console.log(`\nSample anchor hrefs:`, elementAnalysis.anchorHrefs.slice(0, 5));
    console.log(`Sample image alts:`, elementAnalysis.imgAlts.slice(0, 5));

    // Try to extract using different approaches
    console.log('\n[2] Testing extraction approaches...');
    const approaches = await page.evaluate(() => {
      const results: Record<string, ApproachResult> = {};

      // Approach 1: Anchor elements with product links
      const anchors = document.querySelectorAll('a[href*="/product/"]');
      const anchorNames = new Set<string>();
      anchors.forEach((a) => {
        const img = a.querySelector('img');
        // Prefer the image alt; otherwise use the first line of the link text.
        const name = img?.getAttribute('alt') || a.textContent?.trim().split('\n')[0] || '';
        if (name) anchorNames.add(name);
      });
      results['a[href*="/product/"]'] = {
        count: anchors.length,
        unique: anchorNames.size,
        sample: Array.from(anchorNames).slice(0, 5),
      };

      // Approach 2: Images with alt text inside product areas
      const productImgs = document.querySelectorAll('[class*="product_product__"] img[alt]');
      const imgNames = new Set<string>();
      productImgs.forEach((img) => {
        const alt = img.getAttribute('alt');
        if (alt && alt.length > 2) imgNames.add(alt);
      });
      results['[class*="product_product__"] img[alt]'] = {
        count: productImgs.length,
        unique: imgNames.size,
        sample: Array.from(imgNames).slice(0, 5),
      };

      // Approach 3: H5 elements (product names)
      const h5s = document.querySelectorAll('h5.product_product__name__JcEk0, h5[class*="product__name"]');
      const h5Names = new Set<string>();
      h5s.forEach((h5) => {
        const text = h5.textContent?.trim();
        if (text) h5Names.add(text);
      });
      results['h5[class*="product__name"]'] = {
        count: h5s.length,
        unique: h5Names.size,
        sample: Array.from(h5Names).slice(0, 5),
      };

      // Approach 4: Link class with product_product__
      const links = document.querySelectorAll('a.product_product__ERWtJ, a[class*="product_product__"][class*="link"]');
      const linkNames = new Set<string>();
      links.forEach((link) => {
        const h5 = link.querySelector('h5');
        const img = link.querySelector('img');
        const name = h5?.textContent?.trim() || img?.getAttribute('alt') || '';
        if (name) linkNames.add(name);
      });
      results['a.product_product__ERWtJ'] = {
        count: links.length,
        unique: linkNames.size,
        sample: Array.from(linkNames).slice(0, 5),
      };

      return results;
    });

    Object.entries(approaches).forEach(([sel, data]) => {
      console.log(`\n${sel}:`);
      console.log(` Count: ${data.count}, Unique: ${data.unique}`);
      console.log(` Sample: ${data.sample.join(', ')}`);
    });

    // The best approach: use images with alt as the source of truth
    console.log('\n[3] Full product extraction using img[alt] approach...');
    const products = await page.evaluate(() => {
      const seen = new Set<string>();
      const products: ExtractedProduct[] = [];

      // Get all product links
      document.querySelectorAll('a[href*="/product/"]').forEach((a) => {
        const img = a.querySelector('img');
        const name = img?.getAttribute('alt') || '';
        // Skip links without an image alt and names already collected.
        if (!name || seen.has(name)) return;
        seen.add(name);

        const href = a.getAttribute('href') || '';

        // Get price from within the link (ancestors are not searched)
        let price = '';
        const priceEl = a.querySelector('[class*="price"]');
        if (priceEl) {
          const priceMatch = priceEl.textContent?.match(/\$(\d+(?:\.\d{2})?)/);
          price = priceMatch ? priceMatch[1] : '';
        }

        products.push({ name, href, price });
      });

      return products;
    });

    console.log(`Extracted ${products.length} unique products`);
    console.log('\nSample products:');
    products.slice(0, 10).forEach((p) => {
      console.log(` - ${p.name} | ${p.price ? '$' + p.price : 'N/A'} | ${p.href.slice(0, 40)}...`);
    });
  } finally {
    // Close on every path so a failed run does not leak the browser process.
    await browser.close();
  }
}

main().catch(console.error);