import { chromium } from 'playwright';
import type { Frame, Page } from 'playwright';
import fs from 'fs/promises';
import path from 'path';

/** One menu entry as scraped from a product card. Only `name` is guaranteed. */
type Product = {
  name: string;
  brand?: string;
  price?: number;
  size?: string;
  category?: string;
  url?: string;
  imageUrl?: string;
};

/** A product plus the local path of its downloaded image, when the download succeeded. */
type ProductWithImagePath = Product & { imagePath?: string };

const TARGET_URL = 'https://azdeeplyrooted.com/menu';
const OUTPUT_DIR = path.join(process.cwd(), 'scrape-output', 'deeply-rooted');
const IMAGE_DIR = path.join(OUTPUT_DIR, 'images');
const JSON_PATH = path.join(OUTPUT_DIR, 'products.json');

/** Loose selector used only to detect/count rendered cards while the menu loads. */
const CARD_PROBE_SELECTOR = '[data-testid*="product"], [data-testid*="card"]';

/** URL path extensions that are trusted as image file extensions. */
const KNOWN_IMAGE_EXTENSIONS = new Set(['png', 'jpg', 'jpeg', 'webp', 'gif', 'avif', 'svg']);

/** Creates the image directory (and, through `recursive`, the output directory above it). */
async function ensureDirs(): Promise<void> {
  await fs.mkdir(IMAGE_DIR, { recursive: true });
}

/**
 * Waits for the embedded menu iframe and returns its frame.
 *
 * @param page - Page that has already navigated to the menu URL.
 * @returns The iframe's content frame once it reached `domcontentloaded`.
 * @throws Error when the iframe element has no accessible content frame;
 *   Playwright timeout errors propagate when the iframe never appears.
 */
async function getDutchieFrame(page: Page): Promise<Frame> {
  const iframeHandle = await page.waitForSelector(
    'iframe[src*="dutchie"], iframe[srcdoc*="dutchie"], iframe[id^="iframe-"]',
    { timeout: 45000 }
  );

  const frame = await iframeHandle.contentFrame();
  if (!frame) {
    throw new Error('Unable to access embedded Dutchie iframe.');
  }

  await frame.waitForLoadState('domcontentloaded', { timeout: 30000 });
  return frame;
}

/**
 * Scrolls until the number of rendered cards stops growing (at most 30 steps),
 * then scrolls the frame back to the top.
 *
 * @param frame - The menu iframe's frame.
 */
async function loadAllProducts(frame: Frame): Promise<void> {
  const maxScrolls = 30;
  const countCards = (): Promise<number> =>
    frame.$$eval(CARD_PROBE_SELECTOR, els => els.length);

  let previousCount = await countCards();
  for (let i = 0; i < maxScrolls; i++) {
    // `mouse` lives on Page, not Frame. Scroll both the frame's own window and
    // the owning page, since either one may be the actual scroll container
    // depending on how the iframe is sized (not verifiable from this script).
    await frame.evaluate(() => window.scrollBy(0, 1200));
    await frame.page().mouse.wheel(0, 1200);
    await frame.waitForTimeout(900);

    const currentCount = await countCards();
    if (currentCount <= previousCount) break;
    previousCount = currentCount;
  }

  await frame.evaluate(() => window.scrollTo({ top: 0 }));
}

/**
 * Reads every product card currently rendered in the frame.
 *
 * @param frame - The menu iframe's frame.
 * @returns One record per card that yielded a non-empty name, with
 *   byte-identical records collapsed into one.
 */
async function extractProducts(frame: Frame): Promise<Product[]> {
  // The callback is serialized and executed in the browser, so everything it
  // needs must be declared inside it.
  const scraped: Product[] = await frame.evaluate(() => {
    // These test ids are read as *fields* below, yet they also match the
    // `*="product-card"` card selector; they must not be treated as cards.
    const fieldTestIds = ['product-card-name', 'product-card-brand', 'product-card-price'];

    const cards = Array.from(
      document.querySelectorAll(
        '[data-testid="product-list-item"], [data-testid="card-link"], [data-testid*="product-card"]'
      )
    ).filter(el => !fieldTestIds.includes(el.getAttribute('data-testid') ?? ''));

    /** Trimmed visible text of the first match, or undefined when missing/empty. */
    const textOf = (root: Element, selector: string): string | undefined =>
      root.querySelector<HTMLElement>(selector)?.innerText?.trim() || undefined;

    const pickImage = (card: Element): string | undefined => {
      const img = card.querySelector<HTMLImageElement>('img[src^="http"]');
      if (img && img.src.startsWith('http')) return img.src;

      // In a srcset, the URL is the first whitespace-delimited token; commas
      // are only separators at its edges, so commas inside the URL are kept.
      const srcsetHolder = img ?? card.querySelector<HTMLSourceElement>('source[srcset]');
      const firstCandidate = srcsetHolder?.srcset
        .replace(/^[\s,]+/, '')
        .split(/\s+/)[0]
        ?.replace(/,+$/, '');
      if (firstCandidate?.startsWith('http')) return firstCandidate;

      const lazySrc = card.querySelector('img[data-src]')?.getAttribute('data-src');
      if (lazySrc?.startsWith('http')) return lazySrc;

      return undefined;
    };

    const parsePrice = (card: Element): number | undefined => {
      const dedicatedText =
        card.querySelector<HTMLElement>('[data-testid="product-card-price"]')?.innerText || undefined;
      // Prefer a `$`-prefixed amount. A bare number is accepted only inside the
      // dedicated price element; in the whole-card fallback it could be a
      // weight or a percentage.
      const dollarMatch = (dedicatedText ?? card.textContent ?? '').match(/\$\s*(\d+(?:\.\d{1,2})?)/);
      const bareMatch = dedicatedText?.match(/(\d+(?:\.\d{1,2})?)/);
      const digits = (dollarMatch ?? bareMatch)?.[1];
      return digits !== undefined ? parseFloat(digits) : undefined;
    };

    return cards
      .map(card => {
        const name =
          textOf(card, '[data-testid="product-card-name"]') ??
          textOf(card, '[data-testid="product-name"]') ??
          textOf(card, 'h3, h4') ??
          (card.textContent || '')
            .split('\n')
            .map(t => t.trim())
            .find(t => t.length > 3) ??
          '';

        const brand =
          textOf(card, '[data-testid="product-card-brand"]') ??
          textOf(card, '[data-testid="product-brand"]');

        const size = textOf(card, '[data-testid*="size"]') ?? textOf(card, '[data-testid*="weight"]');
        const category = textOf(card, '[data-testid*="category"]');
        const url = card.querySelector<HTMLAnchorElement>('a[href*="/product/"]')?.href;

        return { name, brand, price: parsePrice(card), size, category, url, imageUrl: pickImage(card) };
      })
      .filter(p => p.name);
  });

  // Nested elements can match the card selector for the same product and
  // produce identical records; keep the first of each.
  const seen = new Set<string>();
  return scraped.filter(product => {
    const key = JSON.stringify(product);
    if (seen.has(key)) return false;
    seen.add(key);
    return true;
  });
}

/**
 * Builds a lowercase, dash-separated file name.
 *
 * @param base - Free text; runs of characters outside `a-z0-9` become one `-`.
 * @param ext - Extension without the leading dot.
 * @returns `<slug>.<ext>`, with `product` as the slug when nothing survives.
 */
function safeFileName(base: string, ext: string): string {
  const slug = base.toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-|-$/g, '');
  return `${slug || 'product'}.${ext}`;
}

/** Like `safeFileName`, but appends `-2`, `-3`, ... until the name is unused, then records it. */
function uniqueFileName(base: string, ext: string, used: Set<string>): string {
  const first = safeFileName(base, ext);
  const stem = first.slice(0, first.length - ext.length - 1);

  let candidate = first;
  for (let n = 2; used.has(candidate); n++) {
    candidate = `${stem}-${n}.${ext}`;
  }
  used.add(candidate);
  return candidate;
}

/** Maps a Content-Type header value to a file extension, or undefined when unrecognised. */
function extensionFromContentType(contentType: string): string | undefined {
  if (contentType.includes('png')) return 'png';
  if (contentType.includes('jpeg') || contentType.includes('jpg')) return 'jpg';
  if (contentType.includes('webp')) return 'webp';
  if (contentType.includes('gif')) return 'gif';
  return undefined;
}

/** Chooses an extension: a known image extension from the URL path, else the content type, else `bin`. */
function pickImageExtension(imageUrl: string, contentType: string): string {
  const urlExt = path.extname(new URL(imageUrl).pathname).slice(1).toLowerCase();
  if (KNOWN_IMAGE_EXTENSIONS.has(urlExt)) return urlExt;
  return extensionFromContentType(contentType.toLowerCase()) ?? 'bin';
}

/**
 * Downloads each product's image into `IMAGE_DIR`, one request at a time.
 *
 * A failed download is logged and the product is returned without `imagePath`;
 * it never aborts the run.
 *
 * @param products - Scraped products; those without `imageUrl` pass through unchanged.
 * @returns The products in input order, with `imagePath` set where a file was written.
 */
async function downloadImages(products: Product[]): Promise<ProductWithImagePath[]> {
  const results: ProductWithImagePath[] = [];
  // Products with the same name slug would otherwise overwrite each other's file.
  const usedFileNames = new Set<string>();

  for (const product of products) {
    if (!product.imageUrl) {
      results.push(product);
      continue;
    }

    try {
      const res = await fetch(product.imageUrl);
      if (!res.ok) throw new Error(`HTTP ${res.status}`);

      const arrayBuffer = await res.arrayBuffer();
      const ext = pickImageExtension(product.imageUrl, res.headers.get('content-type') || '');
      const fileName = uniqueFileName(product.name || 'product', ext, usedFileNames);
      const filePath = path.join(IMAGE_DIR, fileName);

      await fs.writeFile(filePath, Buffer.from(arrayBuffer));
      results.push({ ...product, imagePath: filePath });
    } catch (err: unknown) {
      const reason = err instanceof Error ? err.message : String(err);
      console.warn(`Failed to download image for ${product.name}: ${reason}`);
      results.push(product);
    }
  }

  return results;
}

/**
 * Runs the scrape: opens the menu page, reads the embedded frame, downloads
 * images and writes `products.json`. Scrape failures are logged and set
 * `process.exitCode` to 1; the browser is always closed.
 */
async function main(): Promise<void> {
  await ensureDirs();

  const browser = await chromium.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-blink-features=AutomationControlled'],
  });

  try {
    const page = await browser.newPage({
      viewport: { width: 1300, height: 900 },
      userAgent:
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    });

    console.log(`Navigating to ${TARGET_URL}...`);
    await page.goto(TARGET_URL, { waitUntil: 'domcontentloaded', timeout: 60000 });

    const frame = await getDutchieFrame(page);

    // Best effort: a timeout here is not fatal, extraction simply proceeds
    // with whatever is rendered.
    try {
      await frame.waitForSelector(CARD_PROBE_SELECTOR, { timeout: 60000 });
    } catch {
      console.warn('No product cards appeared within 60s; continuing with whatever is rendered.');
    }

    await loadAllProducts(frame);
    const products = await extractProducts(frame);
    console.log(`Found ${products.length} products, downloading images...`);

    const withImages = await downloadImages(products);
    await fs.writeFile(JSON_PATH, JSON.stringify(withImages, null, 2));

    console.log(`Saved data to ${JSON_PATH}`);
    console.log(`Images stored in ${IMAGE_DIR}`);
  } catch (err: unknown) {
    console.error('Scrape failed:', err);
    process.exitCode = 1;
  } finally {
    await browser.close();
  }
}

// Directory creation and browser launch happen outside main's try block;
// report those failures instead of leaving an unhandled rejection.
main().catch((err: unknown) => {
  console.error('Scrape failed:', err);
  process.exitCode = 1;
});