/**
 * Full crawl: Visit each brand page and aggregate all products
 */

import puppeteer, { Page } from 'puppeteer';

const STORE_ID = 'best';

/** Origin that root-relative links and brand slugs are resolved against. */
const STORE_ORIGIN = `https://${STORE_ID}.treez.io`;

/** Resource types that are aborted during the crawl (only the DOM is read). */
const BLOCKED_RESOURCE_TYPES = new Set<string>(['image', 'font', 'media']);

/** One product card as scraped from a listing page. All fields are raw strings. */
interface Product {
  /** Image alt text, falling back to the card's <h5> text. */
  name: string;
  /** First hyphen-separated token of the product slug (heuristic). */
  brand: string;
  /** Numeric part of the first "$N" / "$N.NN" found in the price element, or ''. */
  price: string;
  /** The anchor's raw `href` attribute; used as the de-duplication key. */
  href: string;
}

/**
 * Resolve after `ms` milliseconds.
 *
 * @param ms Delay in milliseconds.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}

/**
 * Dismiss the age-gate modal if it is currently in the DOM.
 *
 * Looks for the modal by its `data-testid`; when present, clicks the submit
 * button (if found) and then waits two seconds. Does nothing when the modal
 * is absent.
 *
 * @param page Page to inspect.
 */
async function bypassAgeGate(page: Page): Promise<void> {
  const ageGate = await page.$('[data-testid="age-gate-modal"]');
  if (!ageGate) return;

  const btn = await page.$('[data-testid="age-gate-submit-button"]');
  if (btn) await btn.click();
  await sleep(2000);
}

/**
 * Scroll to the bottom of the page repeatedly so lazily loaded content renders.
 *
 * Performs at most 30 scroll steps, one second apart, and stops early once the
 * document height has been unchanged for three consecutive checks.
 *
 * @param page Page to scroll.
 */
async function scrollToLoadAll(page: Page): Promise<void> {
  const maxSteps = 30;
  const stableChecksToStop = 3;

  let previousHeight = 0;
  let sameCount = 0;

  for (let i = 0; i < maxSteps; i++) {
    const currentHeight = await page.evaluate(() => document.body.scrollHeight);

    if (currentHeight === previousHeight) {
      sameCount++;
      if (sameCount >= stableChecksToStop) break;
    } else {
      sameCount = 0;
    }

    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    await sleep(1000);
    previousHeight = currentHeight;
  }
}

/**
 * Scrape every product link currently in the page's DOM.
 *
 * A product is any anchor whose `href` contains `/product/`. Anchors without
 * a name (no image alt text and no <h5> text) and anchors whose `href` was
 * already seen on this page are skipped.
 *
 * @param page Page to scrape.
 * @returns One entry per unique product `href`, in document order.
 */
async function extractProducts(page: Page): Promise<Product[]> {
  // The callback is serialized and executed in the browser context, so it
  // must be self-contained: no references to module-level helpers or values.
  return page.evaluate(() => {
    const products: Product[] = [];
    const seen = new Set<string>();

    document.querySelectorAll('a[href*="/product/"]').forEach((a) => {
      const href = a.getAttribute('href') || '';
      const img = a.querySelector('img');
      const h5 = a.querySelector('h5');
      // `||` (not `??`) on purpose: an empty alt must fall through to the <h5>.
      const name = img?.getAttribute('alt') || h5?.textContent?.trim() || '';

      if (!name || seen.has(href)) return;
      seen.add(href);

      // Extract brand from href pattern: /product/{brand}-{product}
      const brandMatch = href.match(/\/product\/([^\/]+)/);
      const productSlug = brandMatch ? brandMatch[1] : '';

      const priceEl = a.querySelector('[class*="price"]');
      const priceMatch = priceEl?.textContent?.match(/\$(\d+(?:\.\d{2})?)/);
      const price = priceMatch ? priceMatch[1] : '';

      products.push({ name, brand: productSlug.split('-')[0] || '', price, href });
    });

    return products;
  });
}

/**
 * Resolve a possibly relative link against `base`.
 *
 * @returns The absolute URL, or `null` when the link cannot be parsed.
 */
function resolveUrl(link: string, base: string): string | null {
  try {
    return new URL(link, base).toString();
  } catch {
    return null;
  }
}

/**
 * Return `rawUrl` with `customerType=ADULT` added, unless a `customerType`
 * query parameter is already present. Any existing query string is preserved.
 */
function withAdultCustomerType(rawUrl: string): string {
  const url = new URL(rawUrl);
  if (!url.searchParams.has('customerType')) {
    url.searchParams.set('customerType', 'ADULT');
  }
  return url.toString();
}

/**
 * Build the set of brand page URLs to visit from the currently loaded page.
 *
 * Two sources are combined: anchors whose `href` contains `/brand/`, and brand
 * slugs guessed from the leading one or two hyphen-separated tokens of
 * `/product/` links.
 *
 * @param page    Page that is already showing the brands listing.
 * @param pageUrl URL the page was loaded from; base for relative links.
 */
async function collectBrandUrls(page: Page, pageUrl: string): Promise<Set<string>> {
  // Get all brand links from the page
  const brandLinks = await page.evaluate(() => {
    const links: string[] = [];
    const seen = new Set<string>();

    // Get all /brand/ links
    document.querySelectorAll('a[href*="/brand/"]').forEach((a) => {
      const href = a.getAttribute('href') || '';
      if (href && !seen.has(href)) {
        seen.add(href);
        links.push(href);
      }
    });

    return links;
  });
  console.log(`Found ${brandLinks.length} brand links: ${brandLinks.join(', ')}`);

  // Step 2: Also extract unique brands from product URLs
  const productBrands = await page.evaluate(() => {
    const brands = new Set<string>();

    document.querySelectorAll('a[href*="/product/"]').forEach((a) => {
      const href = a.getAttribute('href') || '';
      // Pattern: /product/{brand}-{product}-...
      // Extract first part before first hyphen that looks like brand
      const match = href.match(/\/product\/([a-z0-9]+(?:-[a-z0-9]+)?)-/i);
      if (match) {
        brands.add(match[1].toLowerCase());
      }
    });

    return Array.from(brands);
  });
  console.log(`Found ${productBrands.length} brands from product URLs`);

  // Step 3: Build full brand URL list
  const allBrandUrls = new Set<string>();

  // Add direct brand links (relative links are resolved against the page URL)
  for (const link of brandLinks) {
    const resolved = resolveUrl(link, pageUrl);
    if (resolved === null) {
      console.log(`  Skipping unparseable brand link: ${link}`);
      continue;
    }
    allBrandUrls.add(resolved);
  }

  // Add brand URLs from product slugs.
  // NOTE(review): the /brand/{slug} path is a guess derived from product
  // slugs; pages that do not exist are expected to yield zero products.
  for (const brand of productBrands) {
    allBrandUrls.add(`${STORE_ORIGIN}/brand/${encodeURIComponent(brand)}`);
  }

  return allBrandUrls;
}

/**
 * Print crawl totals, the 20 brands with the most products, and 10 sample
 * products.
 *
 * @param attempted   Number of brand pages the crawl tried to visit.
 * @param failed      Number of those visits that threw.
 * @param allProducts Unique products keyed by `href`.
 */
function printSummary(
  attempted: number,
  failed: number,
  allProducts: ReadonlyMap<string, Product>,
): void {
  console.log('\n' + '='.repeat(60));
  console.log('SUMMARY');
  console.log('='.repeat(60));
  console.log(`Brands visited: ${attempted}`);
  console.log(`Brands failed: ${failed}`);
  console.log(`Total unique products: ${allProducts.size}`);

  // Count by brand. A Map (not a plain object) so that slugs such as
  // "constructor" cannot collide with Object.prototype members.
  const brandCounts = new Map<string, number>();
  for (const product of allProducts.values()) {
    brandCounts.set(product.brand, (brandCounts.get(product.brand) ?? 0) + 1);
  }

  console.log('\nProducts by brand:');
  [...brandCounts.entries()]
    .sort((a, b) => b[1] - a[1])
    .slice(0, 20)
    .forEach(([brand, count]) => {
      console.log(`  ${brand}: ${count}`);
    });

  // Sample products
  console.log('\nSample products:');
  Array.from(allProducts.values())
    .slice(0, 10)
    .forEach((p) => {
      console.log(`  - ${p.name} | ${p.brand} | $${p.price || 'N/A'}`);
    });
}

/**
 * Crawl the store's brands page, visit every discovered brand page, and print
 * a summary of the unique products found.
 *
 * A failure on an individual brand page is logged and the crawl continues.
 * Any other failure propagates to the caller; the browser is closed either way.
 */
async function main(): Promise<void> {
  console.log('='.repeat(60));
  console.log('Full Treez Crawl - All Brands');
  console.log('='.repeat(60));

  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    await page.setRequestInterception(true);
    page.on('request', (req) => {
      const pending = BLOCKED_RESOURCE_TYPES.has(req.resourceType())
        ? req.abort()
        : req.continue();
      // abort()/continue() return promises; attach a handler so a rejection
      // from an in-flight request cannot surface as an unhandled rejection.
      pending.catch(() => undefined);
    });

    // Step 1: Go to brands page and extract all brand links
    const brandsUrl = `${STORE_ORIGIN}/onlinemenu/brands?customerType=ADULT`;
    console.log(`\n[1] Getting brand list...`);

    await page.goto(brandsUrl, { waitUntil: 'networkidle2', timeout: 60000 });
    await sleep(3000);
    await bypassAgeGate(page);
    await sleep(2000);

    const allBrandUrls = await collectBrandUrls(page, brandsUrl);
    console.log(`Total brand URLs to visit: ${allBrandUrls.size}`);

    // Step 4: Visit each brand page and collect products
    const allProducts = new Map<string, Product>();
    let visitedBrands = 0;
    let failedBrands = 0;

    for (const brandUrl of allBrandUrls) {
      visitedBrands++;
      try {
        const fullUrl = withAdultCustomerType(brandUrl);
        console.log(`\n[${visitedBrands}/${allBrandUrls.size}] Visiting: ${fullUrl}`);

        await page.goto(fullUrl, { waitUntil: 'networkidle2', timeout: 30000 });
        await sleep(1500);

        // Scroll to load all
        await scrollToLoadAll(page);

        const products = await extractProducts(page);
        console.log(`  Found ${products.length} products`);

        // First occurrence wins when the same href shows up under several brands.
        for (const product of products) {
          if (!allProducts.has(product.href)) {
            allProducts.set(product.href, product);
          }
        }

        console.log(`  Total unique so far: ${allProducts.size}`);
      } catch (error: unknown) {
        failedBrands++;
        const message = error instanceof Error ? error.message : String(error);
        console.log(`  Error: ${message.slice(0, 50)}`);
      }

      // Small delay between requests
      await sleep(500);
    }

    printSummary(visitedBrands, failedBrands, allProducts);
  } finally {
    // Always release the Chromium process, even when the crawl throws.
    await browser.close();
  }
}

main().catch((error: unknown) => {
  console.error(error);
  // Signal failure to the calling shell without cutting off pending output.
  process.exitCode = 1;
});