"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.scrapeCategoryPlaywright = scrapeCategoryPlaywright; exports.testScrapeCategoryPlaywright = testScrapeCategoryPlaywright; const age_gate_playwright_1 = require("../utils/age-gate-playwright"); const logger_1 = require("./logger"); const stealthBrowser_1 = require("../utils/stealthBrowser"); const dutchie_1 = require("../scrapers/templates/dutchie"); /** * Scrapes a category page using Playwright with stealth mode to extract product information */ async function scrapeCategoryPlaywright(categoryUrl, categoryName, state = 'Arizona', proxy) { logger_1.logger.info('scraper', `Scraping category: ${categoryName}`); logger_1.logger.info('scraper', `URL: ${categoryUrl}`); // Create stealth browser with optional proxy const browser = await (0, stealthBrowser_1.createStealthBrowser)({ proxy, headless: true }); try { // Create stealth context with age gate cookies const context = await (0, stealthBrowser_1.createStealthContext)(browser, { state }); // Try to load saved session cookies const cookiesPath = `/tmp/dutchie-session-${state.toLowerCase()}.json`; await (0, stealthBrowser_1.loadCookies)(context, cookiesPath); const page = await context.newPage(); // Navigate to category page logger_1.logger.info('scraper', `Loading page: ${categoryUrl}`); await page.goto(categoryUrl, { waitUntil: 'domcontentloaded', timeout: 60000 }); // Random delay to appear more human await (0, stealthBrowser_1.randomDelay)(1000, 2000); // Check for Cloudflare challenge if (await (0, stealthBrowser_1.isCloudflareChallenge)(page)) { logger_1.logger.info('scraper', 'šŸ›”ļø Cloudflare challenge detected, waiting...'); const passed = await (0, stealthBrowser_1.waitForCloudflareChallenge)(page, 30000); if (!passed) { logger_1.logger.error('scraper', 'āŒ Failed to pass Cloudflare challenge'); await browser.close(); return []; } // Save successful session cookies await (0, stealthBrowser_1.saveCookies)(context, cookiesPath); } // Wait for page to be fully loaded await (0, stealthBrowser_1.waitForPageLoad)(page); // Simulate human behavior await (0, stealthBrowser_1.simulateHumanBehavior)(page); // Check for and bypass age gate const bypassed = await (0, age_gate_playwright_1.bypassAgeGatePlaywright)(page, state); if (!bypassed) { logger_1.logger.error('scraper', 'Failed to bypass age gate'); await browser.close(); return []; } // Wait for products to load with random delay logger_1.logger.info('scraper', 'Waiting for products to load...'); await (0, stealthBrowser_1.randomDelay)(2000, 4000); // Scroll to load all products with human-like behavior logger_1.logger.info('scraper', 'Scrolling to load all products...'); await scrollToBottomHuman(page); // Extract products logger_1.logger.info('scraper', 'Extracting products from page...'); const products = await extractProducts(page, categoryUrl, categoryName); logger_1.logger.info('scraper', `Found ${products.length} products`); await browser.close(); return products; } catch (error) { logger_1.logger.error('scraper', `Error scraping category: ${error}`); await browser.close(); return []; } } /** * Scrolls to the bottom of the page with human-like behavior */ async function scrollToBottomHuman(page) { let previousHeight = 0; let currentHeight = await page.evaluate(() => document.body.scrollHeight); let attempts = 0; const maxAttempts = 20; while (previousHeight < currentHeight && attempts < maxAttempts) { previousHeight = currentHeight; // Scroll down in chunks with randomized delays const scrollAmount = Math.floor(Math.random() * 200) + 300; // 300-500px await (0, stealthBrowser_1.humanScroll)(page, scrollAmount); // Random pause like a human reading await (0, stealthBrowser_1.randomDelay)(500, 1500); // Check new height currentHeight = await page.evaluate(() => document.body.scrollHeight); attempts++; } // Final wait for any lazy-loaded content await (0, stealthBrowser_1.randomDelay)(1000, 2000); } /** * Extracts product information from the page */ async function extractProducts(page, categoryUrl, categoryName) { let products = []; // Check if we have a template for this URL const template = (0, dutchie_1.getTemplateForUrl)(categoryUrl); if (template) { logger_1.logger.info('scraper', `Using ${template.name} template for extraction`); try { const templateProducts = await template.extractProducts(page); // Add category to products from template products = templateProducts.map(p => ({ ...p, category: categoryName, })); logger_1.logger.info('scraper', `Template extracted ${products.length} products`); return products; } catch (err) { logger_1.logger.error('scraper', `Template extraction failed: ${err}`); // Fall through to fallback methods } } // Fallback Method 1: Dutchie products (for Sol Flower, etc.) try { const dutchieProducts = await page.locator('[data-testid^="product-"], .product-card, [class*="ProductCard"]').all(); if (dutchieProducts.length > 0) { logger_1.logger.info('scraper', `Found ${dutchieProducts.length} Dutchie-style products`); for (const productEl of dutchieProducts) { try { const name = await productEl.locator('[data-testid="product-name"], .product-name, h3, h4').first().textContent() || ''; const brand = await productEl.locator('[data-testid="product-brand"], .product-brand, .brand').first().textContent().catch(() => ''); const priceText = await productEl.locator('[data-testid="product-price"], .product-price, .price').first().textContent().catch(() => ''); const imageUrl = await productEl.locator('img').first().getAttribute('src').catch(() => ''); const productLink = await productEl.locator('a').first().getAttribute('href').catch(() => ''); // Parse price const price = priceText ? parseFloat(priceText.replace(/[^0-9.]/g, '')) : undefined; if (name) { products.push({ name: name.trim(), brand: brand ? brand.trim() : undefined, category: categoryName, price, image_url: imageUrl || undefined, product_url: productLink ? new URL(productLink, categoryUrl).toString() : categoryUrl, in_stock: true }); } } catch (err) { logger_1.logger.warn('scraper', `Error extracting Dutchie product: ${err}`); } } } } catch (err) { logger_1.logger.warn('scraper', `Dutchie product extraction failed: ${err}`); } // Method 2: Curaleaf products if (products.length === 0) { try { const curaleafProducts = await page.locator('.product, [class*="Product"], [class*="item"]').all(); if (curaleafProducts.length > 0) { logger_1.logger.info('scraper', `Found ${curaleafProducts.length} Curaleaf-style products`); for (const productEl of curaleafProducts) { try { const name = await productEl.locator('h1, h2, h3, h4, .title, .name').first().textContent() || ''; const priceText = await productEl.locator('.price, [class*="price"]').first().textContent().catch(() => ''); const imageUrl = await productEl.locator('img').first().getAttribute('src').catch(() => ''); const price = priceText ? parseFloat(priceText.replace(/[^0-9.]/g, '')) : undefined; if (name && name.length > 3) { products.push({ name: name.trim(), category: categoryName, price, image_url: imageUrl || undefined, product_url: categoryUrl, in_stock: true }); } } catch (err) { logger_1.logger.warn('scraper', `Error extracting Curaleaf product: ${err}`); } } } } catch (err) { logger_1.logger.warn('scraper', `Curaleaf product extraction failed: ${err}`); } } // Method 3: Generic product cards if (products.length === 0) { try { const genericProducts = await page.locator('article, [role="article"], .card, [class*="card"]').all(); logger_1.logger.info('scraper', `Trying generic selectors, found ${genericProducts.length} elements`); for (const productEl of genericProducts) { try { const text = await productEl.textContent() || ''; // Only consider elements that look like products if (text.includes('$') || text.toLowerCase().includes('price') || text.toLowerCase().includes('thc')) { const name = await productEl.locator('h1, h2, h3, h4').first().textContent() || ''; if (name && name.length > 3) { products.push({ name: name.trim(), category: categoryName, product_url: categoryUrl, in_stock: true }); } } } catch (err) { // Skip this element } } } catch (err) { logger_1.logger.warn('scraper', `Generic product extraction failed: ${err}`); } } return products; } /** * Test function to scrape a single category */ async function testScrapeCategoryPlaywright(url, categoryName, state = 'Arizona') { console.log(`\nšŸŽ­ Testing Playwright Category Scraper\n`); console.log(`Category: ${categoryName}`); console.log(`URL: ${url}\n`); const products = await scrapeCategoryPlaywright(url, categoryName, state); console.log(`\nāœ… Found ${products.length} products\n`); products.slice(0, 5).forEach((p, i) => { console.log(`${i + 1}. ${p.name}`); if (p.brand) console.log(` Brand: ${p.brand}`); if (p.price) console.log(` Price: $${p.price}`); console.log(` URL: ${p.product_url}`); console.log(''); }); return products; }