import { firefox } from 'playwright';
import type { BrowserContext, Page } from 'playwright';
import { pool } from './src/db/migrate.js';
import { getRandomProxy } from './src/utils/proxyManager.js';

/** A product as scraped from the menu and persisted by `saveProduct`. */
interface Product {
  slug: string;
  name: string;
  brand?: string;
  variant?: string;
  description?: string;
  regularPrice?: number;
  salePrice?: number;
  thcPercentage?: number;
  cbdPercentage?: number;
  strainType?: string;
  terpenes: string[];
  effects: string[];
  flavors: string[];
  imageUrl?: string;
  dutchieUrl: string;
  inStock: boolean;
  stockQuantity?: number;
  stockStatus?: string;
}

/** A brand link found on the `/brands` listing page. */
interface Brand {
  slug: string;
  name: string;
  url: string;
}

/** Fields available from a brand listing card, before the detail page is visited. */
type ListedProduct = Omit<Product, 'description' | 'terpenes' | 'effects' | 'flavors'>;

/** Fields extracted from a product detail page. */
interface ProductDetails {
  brand?: string;
  description?: string;
  terpenes: string[];
  effects: string[];
  flavors: string[];
  categories: string[];
}

/** Non-null result of `getRandomProxy()`. */
type ProxyConfig = NonNullable<Awaited<ReturnType<typeof getRandomProxy>>>;

// Pool of realistic user agents - desktop and mobile
const DESKTOP_USER_AGENTS: readonly string[] = [
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1 Safari/605.1.15',
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/131.0.0.0 Safari/537.36',
];

const MOBILE_USER_AGENTS: readonly string[] = [
  'Mozilla/5.0 (iPhone; CPU iPhone OS 17_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Mobile/15E148 Safari/604.1',
  'Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/131.0.0.0 Mobile/15E148 Safari/604.1',
  'Mozilla/5.0 (Linux; Android 14; Pixel 8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Mobile Safari/537.36',
  'Mozilla/5.0 (Linux; Android 13; SM-S918U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Mobile Safari/537.36',
];

/** Geolocation reported by every browser context (coordinates are in Phoenix, AZ). */
const GEOLOCATION = { latitude: 33.4484, longitude: -112.0740 };

/** Dispensary used when no ID is passed on the command line (Deeply Rooted). */
const DEFAULT_DISPENSARY_ID = 112;

/** Returns a printable message for any thrown value. */
function errorMessage(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Returns true for failures this script treats as "the proxy is bad":
 * a refused proxy connection or any timeout. These are retried with a new proxy.
 */
function isProxyError(error: unknown): boolean {
  const message = errorMessage(error);
  return message.includes('NS_ERROR_PROXY_CONNECTION_REFUSED') || message.includes('Timeout');
}

/** Strips trailing slashes so `${base}/brands` never contains `//`. */
function trimTrailingSlash(url: string): string {
  return url.replace(/\/+$/, '');
}

/** Picks a random user agent from the mobile or desktop pool. */
function getRandomUserAgent(preferMobile = false): string {
  const agents = preferMobile ? MOBILE_USER_AGENTS : DESKTOP_USER_AGENTS;
  return agents[Math.floor(Math.random() * agents.length)];
}

/** Returns the viewport matching the chosen device class. */
function getRandomViewport(isMobile = false): { width: number; height: number } {
  if (isMobile) {
    return { width: 390, height: 844 }; // iPhone 14 Pro
  }
  return { width: 1920, height: 1080 }; // Desktop
}

/**
 * Scrolls to the bottom of the page repeatedly to trigger lazy loading.
 * Stops as soon as the document height stops growing, or after `maxScrolls` scrolls.
 */
async function scrollToBottom(page: Page, maxScrolls: number, pauseMs: number): Promise<void> {
  let previousHeight = 0;
  for (let scrollCount = 1; scrollCount <= maxScrolls; scrollCount++) {
    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    await page.waitForTimeout(pauseMs);
    const currentHeight = await page.evaluate(() => document.body.scrollHeight);
    if (currentHeight === previousHeight) break;
    previousHeight = currentHeight;
    console.log(` Scroll ${scrollCount}/${maxScrolls}...`);
  }
}

/**
 * Loads `${menuUrl}/brands` and returns the unique brand links found on it.
 *
 * @param menuUrl - Base menu URL of the dispensary.
 * @param context - Unused; kept so existing call sites stay valid.
 * @param page - Page used for navigation and extraction.
 * @returns The brands found, or an empty array when the page fails for a
 *   non-proxy reason (the run continues without brand scraping).
 * @throws Rethrows proxy/timeout errors so the caller can rotate the proxy and retry.
 */
async function scrapeBrandsList(menuUrl: string, context: BrowserContext, page: Page): Promise<Brand[]> {
  console.log(`\n${'='.repeat(60)}`);
  console.log(`🏷️ SCRAPING BRANDS LIST`);
  console.log(`${'='.repeat(60)}\n`);

  try {
    const brandsUrl = `${trimTrailingSlash(menuUrl)}/brands`;
    console.log(`📄 Loading brands page: ${brandsUrl}`);
    await page.goto(brandsUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });

    // Wait for brand links to render
    console.log('⏳ Waiting for brands to render...');
    await page.waitForSelector('a[href*="/brands/"]', { timeout: 45000 });
    console.log('✅ Brands appeared!');
    await page.waitForTimeout(3000);

    // Scroll to load all brands (in case of lazy loading)
    console.log('📜 Scrolling to load all brands...');
    await scrollToBottom(page, 10, 1500);

    // Extract all brand links. This callback runs inside the browser, so it
    // must be self-contained and its console.log output goes to the page console.
    const brands = await page.evaluate(() => {
      const brandLinks = Array.from(document.querySelectorAll<HTMLAnchorElement>('a[href*="/brands/"]'));
      console.log(`Found ${brandLinks.length} brand links`);

      const extracted = brandLinks.map((link) => {
        const href = link.getAttribute('href') || '';
        const slug = href.split('/brands/')[1]?.replace(/\/$/, '') || '';
        // Get brand name from link text, falling back to the slug
        const name = link.textContent?.trim() || slug;
        // `link.href` is the attribute resolved against the document URL,
        // so relative links become absolute.
        return { slug, name, url: link.href };
      });

      // Filter out duplicates and invalid entries
      const seen = new Set<string>();
      const unique = extracted.filter((b) => {
        if (!b.slug || !b.name || seen.has(b.slug)) return false;
        seen.add(b.slug);
        return true;
      });

      console.log(`Extracted ${unique.length} unique brands`);
      return unique;
    });

    console.log(`✅ Found ${brands.length} brands to scrape\n`);

    // Log first few brands for verification
    brands.slice(0, 5).forEach((brand, idx) => {
      console.log(` ${idx + 1}. ${brand.name} (${brand.slug})`);
    });
    if (brands.length > 5) {
      console.log(` ... and ${brands.length - 5} more brands\n`);
    }

    return brands;
  } catch (error: unknown) {
    // Proxy failures must reach the caller, otherwise its rotate-and-retry
    // loop can never run.
    if (isProxyError(error)) throw error;
    console.error('❌ Error scraping brands list:', errorMessage(error));
    // Best effort: any other failure yields no brands instead of aborting the run.
    return [];
  }
}

/**
 * Extracts every product card (`a[href*="/product/"]`) from the brand page
 * currently loaded in `page`. All values are parsed from the card's text.
 */
async function extractListedProducts(page: Page, brandName: string): Promise<ListedProduct[]> {
  return page.evaluate((brand) => {
    const productCards = Array.from(document.querySelectorAll('a[href*="/product/"]'));
    console.log(`Found ${productCards.length} product links for brand: ${brand}`);

    const extracted = productCards.map((card, idx) => {
      const href = card.getAttribute('href') || '';
      const slug = href.split('/product/')[1]?.replace(/\/$/, '') || '';

      // Get all text content from the card
      const allText = card.textContent || '';

      // Name - first non-empty line of the card text.
      // NOTE(review): textContent only contains newlines present in the markup;
      // if the cards have none, the whole card text becomes the name - confirm on live pages.
      const lines = allText.split('\n').map((l) => l.trim()).filter(Boolean);
      const name = lines[0] || '';

      // Debug first few products
      if (idx < 3) {
        console.log(`Product ${idx}: slug="${slug}", name="${name}", allText="${allText.substring(0, 100)}"`);
      }

      // Prices - look for dollar amounts
      const priceMatches = allText.match(/\$(\d+\.?\d*)/g) || [];
      const prices = priceMatches.map((p) => parseFloat(p.replace('$', '')));

      // If 2 prices, first is sale, second is regular.
      // NOTE(review): this ordering is assumed from the card layout - confirm.
      const salePrice = prices.length === 2 ? prices[0] : undefined;
      const regularPrice = prices.length === 2 ? prices[1] : prices.length === 1 ? prices[0] : undefined;

      // THC/CBD
      const thcMatch = allText.match(/THC[:\s]*(\d+\.?\d*)\s*%/i);
      const cbdMatch = allText.match(/CBD[:\s]*(\d+\.?\d*)\s*%/i);
      const thc = thcMatch ? parseFloat(thcMatch[1]) : undefined;
      const cbd = cbdMatch ? parseFloat(cbdMatch[1]) : undefined;

      // Strain type - first keyword that matches wins, in this order
      let strainType: string | undefined;
      if (/\bindica\b/i.test(allText)) strainType = 'Indica';
      else if (/\bsativa\b/i.test(allText)) strainType = 'Sativa';
      else if (/\bhybrid\b/i.test(allText)) strainType = 'Hybrid';

      // Variant/weight
      const weightMatch = allText.match(/(\d+\.?\d*\s*(?:g|oz|mg|ml|gram|ounce))/i);
      const variant = weightMatch ? weightMatch[1].trim() : undefined;

      // Image
      const img = card.querySelector('img');
      const imageUrl = img?.src || undefined;

      // Stock status parsing
      const allTextLower = allText.toLowerCase();
      const inStock = !allTextLower.includes('out of stock');

      // Extract stock quantity (e.g., "5 left in stock", "Only 3 left")
      let stockQuantity: number | undefined;
      const quantityMatch = allText.match(/(\d+)\s+left/i);
      if (quantityMatch) {
        stockQuantity = parseInt(quantityMatch[1], 10);
      }

      // Extract stock status messages
      let stockStatus: string | undefined;
      if (allTextLower.includes('out of stock')) {
        stockStatus = 'out of stock';
      } else if (allTextLower.includes('low stock') || allTextLower.includes('limited')) {
        stockStatus = 'low stock';
      } else if (quantityMatch) {
        stockStatus = `${stockQuantity} left in stock`;
      } else if (allTextLower.includes('order soon')) {
        stockStatus = 'order soon';
      }

      return {
        slug,
        name,
        brand, // Set the brand from the brand page
        variant,
        regularPrice,
        salePrice,
        thcPercentage: thc,
        cbdPercentage: cbd,
        strainType,
        imageUrl,
        inStock,
        stockQuantity,
        stockStatus,
        dutchieUrl: href.startsWith('http') ? href : `https://dutchie.com${href}`,
      };
    });

    const filtered = extracted.filter((p) => p.slug && p.name);
    console.log(`Extracted ${extracted.length} products, ${filtered.length} with slug+name for brand: ${brand}`);
    if (filtered.length === 0 && extracted.length > 0) {
      console.log('Sample failed product:', JSON.stringify(extracted[0]));
    }
    return filtered;
  }, brandName);
}

/**
 * Extracts brand, description, terpene/effect/flavor keywords and breadcrumb
 * categories from the product detail page currently loaded in `page`.
 */
async function extractProductDetails(page: Page): Promise<ProductDetails> {
  return page.evaluate(() => {
    const allText = document.body.textContent || '';
    // Keeps the keywords that appear as whole words anywhere in the page text.
    const presentIn = (names: string[]) => names.filter((n) => new RegExp(`\\b${n}\\b`, 'i').test(allText));

    // Brand
    // NOTE(review): this selector matches any element whose class contains "brand" - confirm it hits the product's brand.
    let brand: string | undefined;
    const brandEl = document.querySelector('[class*="brand" i], [data-testid*="brand" i]');
    if (brandEl?.textContent) {
      brand = brandEl.textContent.trim();
    }

    // Description - ignore very short matches, cap stored length at 1000 chars
    let description: string | undefined;
    const descEl = document.querySelector('[class*="description" i], p[class*="product" i]');
    if (descEl?.textContent && descEl.textContent.length > 20) {
      description = descEl.textContent.trim().substring(0, 1000);
    }

    // Terpenes
    const terpenes = presentIn(['Myrcene', 'Limonene', 'Caryophyllene', 'Pinene', 'Linalool', 'Humulene', 'Terpinolene', 'Ocimene']);

    // Effects
    const effects = presentIn(['Relaxed', 'Happy', 'Euphoric', 'Uplifted', 'Creative', 'Energetic', 'Focused', 'Calm', 'Sleepy', 'Hungry', 'Aroused', 'Giggly', 'Tingly']);

    // Flavors
    const flavors = presentIn(['Sweet', 'Citrus', 'Earthy', 'Pine', 'Berry', 'Diesel', 'Sour', 'Floral', 'Spicy', 'Fruity', 'Herbal', 'Mint', 'Woody']);

    // Categories (from breadcrumbs)
    // NOTE(review): `nav a` also matches ordinary navigation links - confirm these should be stored as categories.
    const breadcrumbs = Array.from(document.querySelectorAll('[class*="breadcrumb" i] a, nav a'));
    const categories = breadcrumbs
      .map((b) => b.textContent?.trim())
      .filter((c): c is string => !!c && c !== 'Home');

    return { brand, description, terpenes, effects, flavors, categories };
  });
}

/**
 * Scrapes every brand of a dispensary menu and saves each product.
 *
 * Flow: open a proxied Firefox context, read the brand list, then for each
 * brand load its page, extract the product cards, visit each product's detail
 * page and persist the merged result with `saveProduct`. Proxy/timeout failures
 * rotate to a new proxy (and, every third rotation, a new user agent).
 *
 * Side effects: closes the browser and ends the database pool before
 * returning, and exits the process with code 1 on a fatal error or when no
 * proxy is available.
 *
 * @param menuUrl - Base menu URL of the dispensary.
 * @param dispensaryId - ID stored with every saved product.
 */
async function scrapeProductsPage(menuUrl: string, dispensaryId: number): Promise<void> {
  const baseUrl = trimTrailingSlash(menuUrl);

  const initialProxy = await getRandomProxy();
  if (!initialProxy) {
    console.log('❌ No proxy available');
    process.exit(1);
  }

  // Randomly choose mobile or desktop for this session
  let isMobile = Math.random() > 0.5;
  let userAgent = getRandomUserAgent(isMobile);
  let viewport = getRandomViewport(isMobile);
  let attemptsWithCurrentUA = 0;
  const maxAttemptsPerUA = 3; // Try 3 proxies before changing UA

  console.log(`🔐 Using proxy: ${initialProxy.server}`);
  console.log(`🌐 Using ${isMobile ? 'MOBILE' : 'DESKTOP'} UA: ${userAgent.substring(0, 50)}...`);

  const browser = await firefox.launch({
    headless: true,
    firefoxUserPrefs: {
      'geo.enabled': true,
    },
  });

  // Creates a context with the current UA/viewport and the given proxy.
  const openContext = (activeProxy: ProxyConfig): Promise<BrowserContext> =>
    browser.newContext({
      viewport,
      userAgent,
      geolocation: GEOLOCATION,
      permissions: ['geolocation'],
      proxy: {
        server: activeProxy.server,
        username: activeProxy.username,
        password: activeProxy.password,
      },
    });

  let fatal = false;
  try {
    let context = await openContext(initialProxy);
    let page = await context.newPage();

    // Helper function to rotate proxy (and occasionally UA)
    const rotateProxy = async (): Promise<void> => {
      await context.close();
      attemptsWithCurrentUA++;

      // Change UA after every 3 proxy attempts, or switch mobile/desktop
      const shouldChangeUA = attemptsWithCurrentUA >= maxAttemptsPerUA;
      if (shouldChangeUA) {
        attemptsWithCurrentUA = 0;
        // 50% chance to switch between mobile/desktop
        if (Math.random() > 0.5) {
          isMobile = !isMobile;
        }
        userAgent = getRandomUserAgent(isMobile);
        viewport = getRandomViewport(isMobile);
        console.log(` 🔄 Rotating to new proxy + NEW ${isMobile ? 'MOBILE' : 'DESKTOP'} UA...`);
      } else {
        console.log(` 🔄 Rotating to new proxy (keeping same UA, ${attemptsWithCurrentUA + 1}/${maxAttemptsPerUA})...`);
      }

      const nextProxy = await getRandomProxy();
      if (!nextProxy) {
        throw new Error('No proxy available');
      }
      console.log(` 🔐 New proxy: ${nextProxy.server}`);
      if (shouldChangeUA) {
        console.log(` 🌐 New UA: ${userAgent.substring(0, 50)}...`);
      }

      context = await openContext(nextProxy);
      page = await context.newPage();
    };

    // First, scrape the brands list
    let brands: Brand[] = [];
    const maxBrandsRetries = 10;
    for (let attempt = 1; attempt <= maxBrandsRetries; attempt++) {
      try {
        brands = await scrapeBrandsList(baseUrl, context, page);
        break;
      } catch (error: unknown) {
        if (isProxyError(error) && attempt < maxBrandsRetries) {
          console.log(`⚠️ Proxy failed loading brands page (attempt ${attempt}/${maxBrandsRetries}), rotating...`);
          await rotateProxy();
        } else {
          console.error('⚠️ Failed to load brands, will skip brand scraping');
          break;
        }
      }
    }

    let totalProductsFound = 0;
    let totalSuccessCount = 0;
    let totalErrorCount = 0;

    // Scrape each brand page
    for (const [brandIndex, brand] of brands.entries()) {
      console.log(`\n${'='.repeat(60)}`);
      console.log(`🏷️ BRAND ${brandIndex + 1}/${brands.length}: ${brand.name}`);
      console.log(`${'='.repeat(60)}\n`);

      // Retry loading brand page if proxy fails
      let loadSuccess = false;
      const maxLoadRetries = 10;
      for (let attempt = 1; attempt <= maxLoadRetries; attempt++) {
        try {
          const brandUrl = `${baseUrl}/brands/${brand.slug}`;
          console.log(`📄 Loading brand page: ${brandUrl}`);
          await page.goto(brandUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });

          // Wait for React to render products
          console.log('⏳ Waiting for products to render...');
          await page.waitForSelector('a[href*="/product/"]', { timeout: 45000 });
          console.log('✅ Products appeared!');
          await page.waitForTimeout(5000);
          loadSuccess = true;
          break;
        } catch (error: unknown) {
          if (isProxyError(error) && attempt < maxLoadRetries) {
            console.log(`⚠️ Proxy failed loading brand page (attempt ${attempt}/${maxLoadRetries}), rotating...`);
            await rotateProxy();
          } else {
            // Report the real attempt count: non-proxy errors give up immediately.
            console.error(
              `❌ Failed to load brand page (attempt ${attempt}/${maxLoadRetries}), skipping brand: ${brand.name}:`,
              errorMessage(error),
            );
            break;
          }
        }
      }

      if (!loadSuccess) {
        console.log(`⏭️ Skipping brand: ${brand.name}`);
        continue;
      }

      // Scroll to load all products (lazy loading)
      console.log('📜 Scrolling to load all products...');
      await scrollToBottom(page, 20, 2000);

      console.log('🔍 Extracting product data...');
      const products = await extractListedProducts(page, brand.name);
      totalProductsFound += products.length;
      console.log(`✅ Extracted ${products.length} products from brand: ${brand.name}`);

      // Now fetch detailed data for each product in this brand
      for (const [i, product] of products.entries()) {
        console.log(`\n[${i + 1}/${products.length}] Processing: ${product.name}`);

        const maxRetries = 3;
        let success = false;
        for (let attempt = 1; attempt <= maxRetries; attempt++) {
          try {
            // Navigate to product detail page
            const productUrl = `${baseUrl}/product/${product.slug}`;
            await page.goto(productUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
            await page.waitForTimeout(3000);

            const detailedData = await extractProductDetails(page);

            // Merge detailed data - use brand from the detail page, or fall back to brand page name
            const fullProduct: Product = {
              ...product,
              brand: detailedData.brand || product.brand,
              description: detailedData.description,
              terpenes: detailedData.terpenes,
              effects: detailedData.effects,
              flavors: detailedData.flavors,
            };

            // Save to database
            await saveProduct(dispensaryId, fullProduct, detailedData.categories);
            totalSuccessCount++;
            console.log(` ✅ Saved (${totalSuccessCount} successful, ${totalErrorCount} errors)`);
            success = true;
            break;
          } catch (error: unknown) {
            if (isProxyError(error) && attempt < maxRetries) {
              console.log(` ⚠️ Proxy error (attempt ${attempt}/${maxRetries}), rotating...`);
              await rotateProxy();
              continue;
            }

            totalErrorCount++;
            if (attempt >= maxRetries) {
              console.error(` ❌ Failed after ${maxRetries} retries:`, errorMessage(error));
            } else {
              // Non-proxy error, don't retry
              console.error(` ❌ Error processing ${product.name}:`, errorMessage(error));
            }
            break;
          }
        }

        // Rate limiting
        if (success) {
          await page.waitForTimeout(1000 + Math.random() * 2000);
        }
      }

      console.log(`\n 📊 Brand "${brand.name}" complete: ${products.length} products processed`);
    }

    console.log(`\n${'='.repeat(60)}`);
    console.log(`✅ ALL BRANDS SCRAPING COMPLETE!`);
    console.log(` Total brands scraped: ${brands.length}`);
    console.log(` Total products found: ${totalProductsFound}`);
    console.log(` Successfully saved: ${totalSuccessCount}`);
    console.log(` Errors: ${totalErrorCount}`);
    console.log(`${'='.repeat(60)}\n`);
  } catch (error: unknown) {
    fatal = true;
    console.error('❌ Fatal error:', error);
  } finally {
    // Release resources on both paths; a failed browser shutdown must not
    // prevent the pool from being closed.
    await browser.close().catch((closeError: unknown) => {
      console.error('⚠️ Failed to close browser:', errorMessage(closeError));
    });
    await pool.end();
  }

  if (fatal) {
    process.exit(1);
  }
}

/**
 * Inserts or updates one product and records its history rows in a single
 * transaction.
 *
 * The product is matched on `(dispensary_id, slug)`. A `price_history` row is
 * added when any price is present, a `batch_history` row when THC or CBD is
 * present, and every category is upserted into `product_categories`.
 *
 * @param storeId - Dispensary ID the product belongs to.
 * @param product - Product data to persist.
 * @param categories - Category names; stored lower-cased as `category_slug`.
 * @throws Rethrows any query error after rolling the transaction back.
 */
async function saveProduct(storeId: number, product: Product, categories: string[]): Promise<void> {
  const client = await pool.connect();
  try {
    await client.query('BEGIN');

    // Check if product exists
    const existing = await client.query(
      'SELECT id FROM products WHERE dispensary_id = $1 AND slug = $2',
      [storeId, product.slug],
    );

    let productId: number;
    if (existing.rows.length > 0) {
      // UPDATE existing product
      productId = existing.rows[0].id;
      await client.query(
        `
        UPDATE products SET
          name = $1, brand = $2, variant = $3, description = $4,
          regular_price = $5, sale_price = $6,
          thc_percentage = $7, cbd_percentage = $8, strain_type = $9,
          terpenes = $10, effects = $11, flavors = $12,
          image_url = $13, dutchie_url = $14,
          in_stock = $15, stock_quantity = $16, stock_status = $17,
          last_seen_at = CURRENT_TIMESTAMP, updated_at = CURRENT_TIMESTAMP
        WHERE id = $18
        `,
        [
          product.name, product.brand, product.variant, product.description,
          product.regularPrice, product.salePrice,
          product.thcPercentage, product.cbdPercentage, product.strainType,
          product.terpenes, product.effects, product.flavors,
          product.imageUrl, product.dutchieUrl,
          product.inStock, product.stockQuantity, product.stockStatus,
          productId,
        ],
      );
    } else {
      // INSERT new product
      const result = await client.query(
        `
        INSERT INTO products (
          dispensary_id, slug, name, brand, variant, description,
          regular_price, sale_price, thc_percentage, cbd_percentage, strain_type,
          terpenes, effects, flavors, image_url, dutchie_url,
          in_stock, stock_quantity, stock_status
        ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19)
        RETURNING id
        `,
        [
          storeId, product.slug, product.name, product.brand, product.variant, product.description,
          product.regularPrice, product.salePrice,
          product.thcPercentage, product.cbdPercentage, product.strainType,
          product.terpenes, product.effects, product.flavors,
          product.imageUrl, product.dutchieUrl,
          product.inStock, product.stockQuantity, product.stockStatus,
        ],
      );
      productId = result.rows[0].id;
    }

    // Record price history (if we have price data)
    if (product.regularPrice !== undefined || product.salePrice !== undefined) {
      await client.query(
        `
        INSERT INTO price_history (product_id, regular_price, sale_price)
        VALUES ($1, $2, $3)
        `,
        [productId, product.regularPrice, product.salePrice],
      );
    }

    // Record batch history (if we have cannabinoid data)
    if (product.thcPercentage !== undefined || product.cbdPercentage !== undefined) {
      await client.query(
        `
        INSERT INTO batch_history (product_id, thc_percentage, cbd_percentage, terpenes, strain_type)
        VALUES ($1, $2, $3, $4, $5)
        `,
        [productId, product.thcPercentage, product.cbdPercentage, product.terpenes, product.strainType],
      );
    }

    // Update product_categories
    for (const categorySlug of categories) {
      await client.query(
        `
        INSERT INTO product_categories (product_id, category_slug, last_seen_at)
        VALUES ($1, $2, CURRENT_TIMESTAMP)
        ON CONFLICT (product_id, category_slug)
        DO UPDATE SET last_seen_at = CURRENT_TIMESTAMP
        `,
        [productId, categorySlug.toLowerCase()],
      );
    }

    await client.query('COMMIT');
  } catch (error: unknown) {
    // A failing ROLLBACK must not hide the error that caused it.
    try {
      await client.query('ROLLBACK');
    } catch (rollbackError: unknown) {
      console.error('⚠️ Rollback failed:', errorMessage(rollbackError));
    }
    throw error;
  } finally {
    client.release();
  }
}

/**
 * Entry point: resolves the dispensary from the first command line argument
 * (default: ID 112, Deeply Rooted), validates that it exists and has a
 * `menu_url`, then scrapes it. Exits with code 1 when validation fails.
 */
async function main(): Promise<void> {
  const dispensaryIdArg = process.argv[2];
  const dispensaryId = dispensaryIdArg ? parseInt(dispensaryIdArg, 10) : DEFAULT_DISPENSARY_ID;

  if (Number.isNaN(dispensaryId)) {
    console.error(`❌ Invalid dispensary ID: ${dispensaryIdArg}`);
    process.exit(1);
  }

  const dispensaryResult = await pool.query(
    'SELECT id, name, menu_url FROM dispensaries WHERE id = $1',
    [dispensaryId],
  );
  const dispensary = dispensaryResult.rows[0];

  if (!dispensary) {
    console.error(
      dispensaryIdArg
        ? `❌ Dispensary ID ${dispensaryId} not found`
        : '❌ Deeply Rooted dispensary not found',
    );
    process.exit(1);
  }

  // Checked on both paths: without a menu URL every page load would target "undefined/...".
  if (!dispensary.menu_url) {
    console.error(`❌ Dispensary ${dispensary.name} has no menu_url configured`);
    process.exit(1);
  }

  console.log(`✅ Found dispensary ID: ${dispensaryId} - ${dispensary.name}`);

  await scrapeProductsPage(dispensary.menu_url, dispensaryId);
}

// Main execution: report the failure and exit non-zero so callers (cron, CI)
// can detect it, and so an open pool cannot keep the process alive.
main().catch((error: unknown) => {
  console.error(error);
  process.exit(1);
});