import { chromium as playwright } from 'playwright-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import fs from 'fs/promises';
import path from 'path';

// Register the stealth plugin on the playwright-extra chromium launcher
// before anything is launched.
playwright.use(StealthPlugin());

/** One selectable option scraped from the product page. */
type ProductVariant = {
  label: string;
  price?: number;
  /** Declared for completeness; nothing in this script populates it. */
  inventory?: string;
};

/** Shape of the record produced by {@link extractProduct}. */
type ProductData = {
  name: string;
  brand?: string;
  price?: number;
  description?: string;
  thc?: string;
  cbd?: string;
  category?: string;
  variants?: ProductVariant[];
  images: string[];
  productUrl: string;
};

const PRODUCT_URL =
  'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted/product/mfused-loud-liquid-diamonds-aio-stoopid-gas';
const OUTPUT_DIR = path.join(process.cwd(), 'scrape-output', 'dutchie-product');
const IMAGE_DIR = path.join(OUTPUT_DIR, 'images');
const JSON_PATH = path.join(OUTPUT_DIR, 'product.json');

/**
 * Creates the image directory. Because the call is recursive and IMAGE_DIR is
 * nested inside OUTPUT_DIR, the JSON output directory is created as well.
 */
async function ensureDirs(): Promise<void> {
  await fs.mkdir(IMAGE_DIR, { recursive: true });
}

/**
 * Polls the page until it no longer looks like a challenge interstitial.
 *
 * A page counts as "challenged" while its title contains 'Attention Required'
 * or 'Just a moment', or its HTML contains 'challenge-platform' or
 * 'cf-challenge'. Failures while reading the title/content are treated as
 * empty strings, i.e. as "no marker found".
 *
 * @param page - Playwright page. Typed as `any` because the script does not
 *   import Playwright's `Page` type.
 * @param maxWaitMs - Upper bound on the total polling time, in milliseconds.
 * @returns `true` once no marker is present, `false` if the deadline passes.
 */
async function waitForCloudflare(page: any, maxWaitMs = 60000): Promise<boolean> {
  const start = Date.now();
  while (Date.now() - start < maxWaitMs) {
    const title: string = await page.title().catch(() => '');
    const content: string = await page.content().catch(() => '');
    const challenge =
      title.includes('Attention Required') ||
      title.includes('Just a moment') ||
      content.includes('challenge-platform') ||
      content.includes('cf-challenge');
    if (!challenge) return true;
    await page.waitForTimeout(2000);
  }
  return false;
}

/**
 * Scrapes product fields out of the currently loaded page.
 *
 * Everything inside the `page.evaluate` callback runs in the browser, so the
 * helpers it needs are declared inside the callback and it must not reference
 * anything from this module's scope.
 *
 * @param page - Playwright page that has already navigated to the product.
 * @returns The scraped product; fields whose selectors match nothing are
 *   `undefined` (or `''` for `name`).
 */
async function extractProduct(page: any): Promise<ProductData> {
  return page.evaluate((): ProductData => {
    // First non-empty innerText among the selectors, tried in order.
    const pickText = (selectorList: string[]): string | undefined => {
      for (const sel of selectorList) {
        const el = document.querySelector(sel) as HTMLElement | null;
        const txt = el?.innerText?.trim();
        if (txt) return txt;
      }
      return undefined;
    };

    // Non-empty innerText of every element matching the selector.
    const pickAllTexts = (selector: string): string[] =>
      Array.from(document.querySelectorAll(selector))
        .map(el => (el as HTMLElement).innerText?.trim())
        .filter((t): t is string => !!t);

    // Prefer an amount that is explicitly prefixed with '$' so that text such
    // as "1g $45.00" yields 45 rather than the leading quantity; fall back to
    // the first bare number. Thousands separators ("1,299.00") are accepted.
    const parsePrice = (text?: string | null): number | undefined => {
      if (!text) return undefined;
      const amount = '((?:\\d{1,3}(?:,\\d{3})+|\\d+)(?:\\.\\d{1,2})?)';
      const match =
        text.match(new RegExp(`\\$\\s*${amount}`)) ?? text.match(new RegExp(amount));
      const digits = match?.[1];
      if (!digits) return undefined;
      const value = parseFloat(digits.replace(/,/g, ''));
      return Number.isNaN(value) ? undefined : value;
    };

    const name = pickText(['[data-testid="product-name"]', 'h1', '[class*="ProductTitle"]']) || '';
    const brand = pickText(['[data-testid="product-brand"]', '[class*="Brand"]']);
    const priceText = pickText([
      '[data-testid="product-price"]',
      '[data-testid*="price"]',
      '[class*="Price"]',
    ]);
    const description = pickText([
      '[data-testid="product-description"]',
      'article',
      '[class*="Description"]',
    ]);

    const potencyTexts = pickAllTexts(
      '[data-testid*="thc"], [data-testid*="cbd"], [class*="Potency"]'
    );
    const thc = potencyTexts.find(t => t.toLowerCase().includes('thc'));
    const cbd = potencyTexts.find(t => t.toLowerCase().includes('cbd'));

    const category = pickText([
      '[data-testid="breadcrumb"]',
      '[class*="Breadcrumb"]',
      '[data-testid*="category"]',
    ]);

    const variantEls = Array.from(
      document.querySelectorAll(
        '[data-testid*="variant"], [data-testid*="option"], [class*="Variant"]'
      )
    );
    const variants = variantEls
      .map(el => {
        // Label comes from the first nested span/div, else the element's own text.
        const label =
          (el.querySelector('span,div') as HTMLElement | null)?.innerText?.trim() ||
          el.textContent?.trim() ||
          '';
        return { label, price: parsePrice(el.textContent) };
      })
      .filter(v => v.label);

    const imageUrls = Array.from(
      document.querySelectorAll(
        'img[src*="images.dutchie.com"], source[srcset*="images.dutchie.com"], img[src*="https://images.dutchie.com"]'
      )
    )
      .map(el => {
        if (el instanceof HTMLImageElement) return el.src;
        // For <source>, take the URL part of the first srcset candidate.
        const srcset = (el as HTMLSourceElement).srcset || '';
        return srcset.split(',')[0]?.trim().split(' ')[0];
      })
      .filter((u): u is string => !!u);

    return {
      name,
      brand,
      price: parsePrice(priceText),
      description,
      thc,
      cbd,
      category,
      variants,
      images: Array.from(new Set(imageUrls)),
      productUrl: window.location.href,
    };
  });
}

/**
 * Builds a file name from `base` and `ext`: `base` is lower-cased, every run
 * of characters outside `[a-z0-9]` becomes a single '-', and one leading and
 * one trailing '-' are stripped. An empty result falls back to 'image'.
 * `ext` is appended unchanged.
 */
function safeFileName(base: string, ext: string): string {
  const slug = base.toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-|-$/g, '');
  return `${slug || 'image'}.${ext}`;
}

/**
 * Downloads each URL into IMAGE_DIR, one at a time and in input order.
 *
 * A failed download is logged with `console.warn` and skipped; it does not
 * abort the remaining downloads. File names are derived from the URL's
 * pathname (so query strings never leak into the name) and a numeric suffix
 * is appended when two URLs would otherwise map to the same file, so an
 * earlier download is never overwritten by a later one in the same call.
 *
 * @param urls - Absolute image URLs.
 * @returns Paths of the files that were written successfully.
 */
async function downloadImages(urls: string[]): Promise<string[]> {
  const saved: string[] = [];
  const usedNames = new Set<string>();

  for (const url of urls) {
    try {
      const res = await fetch(url);
      if (!res.ok) throw new Error(`HTTP ${res.status}`);
      const buf = Buffer.from(await res.arrayBuffer());

      const contentType = res.headers.get('content-type') || '';
      const { pathname } = new URL(url);
      const rawExt = path.extname(pathname);
      // Extension from the URL wins; otherwise guess from the content type.
      const ext =
        rawExt.replace('.', '') ||
        (contentType.includes('png')
          ? 'png'
          : contentType.includes('jpeg')
            ? 'jpg'
            : contentType.includes('webp')
              ? 'webp'
              : 'bin');

      const base = path.basename(pathname, rawExt) || 'image';
      let fileName = safeFileName(base, ext);
      for (let n = 2; usedNames.has(fileName); n++) {
        fileName = safeFileName(`${base}-${n}`, ext);
      }

      const filePath = path.join(IMAGE_DIR, fileName);
      await fs.writeFile(filePath, buf);
      usedNames.add(fileName);
      saved.push(filePath);
    } catch (err) {
      console.warn(`Failed to download image ${url}:`, err);
    }
  }

  return saved;
}

/**
 * Script entry point: launches a headless browser, loads PRODUCT_URL, waits
 * for the challenge check to pass, extracts the product, downloads its images
 * and writes the combined record to JSON_PATH.
 *
 * Scrape failures are logged and reported through `process.exitCode = 1`.
 * The browser is closed in all cases once it has been launched.
 */
async function main(): Promise<void> {
  await ensureDirs();

  const browser = await playwright.launch({
    headless: true,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-blink-features=AutomationControlled',
    ],
  });

  try {
    // Context/page creation lives inside the try so the browser is still
    // closed if either of them fails.
    const context = await browser.newContext({
      viewport: { width: 1280, height: 900 },
      userAgent:
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    });
    const page = await context.newPage();

    console.log(`Navigating to product page...`);
    await page.goto(PRODUCT_URL, { waitUntil: 'domcontentloaded', timeout: 90000 });

    const cfOk = await waitForCloudflare(page, 60000);
    if (!cfOk) {
      throw new Error('Cloudflare challenge not passed in time');
    }

    // Best effort: extraction has fallback selectors, so a timeout here is
    // reported but does not abort the run.
    await page
      .waitForSelector('[data-testid*="product"]', { timeout: 60000 })
      .catch(() => {
        console.warn('Product selector did not appear in time; extracting anyway');
      });
    await page.waitForTimeout(2000);

    const product = await extractProduct(page);
    console.log('Extracted product:');
    console.log(product);

    const imagePaths = await downloadImages(product.images);
    const finalProduct = { ...product, imagePaths };

    await fs.writeFile(JSON_PATH, JSON.stringify(finalProduct, null, 2));
    console.log(`Saved product JSON to ${JSON_PATH}`);
    if (imagePaths.length) {
      console.log(`Saved ${imagePaths.length} images to ${IMAGE_DIR}`);
    }
  } catch (err) {
    console.error('Failed to scrape product:', err);
    process.exitCode = 1;
  } finally {
    await browser.close();
  }
}

// Errors that escape main() (directory creation, browser launch, browser
// close) are handled here instead of surfacing as an unhandled rejection.
main().catch((err: unknown) => {
  console.error('Unexpected scraper failure:', err);
  process.exitCode = 1;
});