import { firefox } from 'playwright';
import type { Browser, BrowserContext, Page } from 'playwright';
import { pool } from './src/db/migrate.js';
import { getRandomProxy } from './src/utils/proxyManager.js';

// Command line arguments: <dispensaryId> <startBrandIndex> <endBrandIndex> <workerNum>
const dispensaryId = parseInt(process.argv[2] || '112', 10);
const startBrandIndex = parseInt(process.argv[3] || '0', 10);
const endBrandIndex = parseInt(process.argv[4] || '89', 10);
const workerNum = process.argv[5] || '1';

/** A product row as scraped from a brand page and persisted by `saveProduct`. */
interface Product {
  slug: string;
  name: string;
  brand?: string;
  variant?: string;
  description?: string;
  regularPrice?: number;
  salePrice?: number;
  thcPercentage?: number;
  cbdPercentage?: number;
  strainType?: string;
  terpenes: string[];
  effects: string[];
  flavors: string[];
  imageUrl?: string;
  dutchieUrl: string;
  inStock: boolean;
  stockQuantity?: number;
  stockStatus?: string;
}

/** A brand link discovered on the menu's `/brands` page. */
interface Brand {
  slug: string;
  name: string;
  url: string;
}

/** Extracts a printable message from an arbitrary thrown value. */
function errorMessage(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Loads `${menuUrl}/brands`, scrolls until the page height stops growing
 * (at most 10 scrolls) and collects every unique brand link.
 *
 * @param menuUrl Base menu URL of the dispensary.
 * @param context Browser context; not used by this function, kept so existing callers keep working.
 * @param page Page used for navigation.
 * @returns The unique brands found, or an empty array if anything fails (the error is logged).
 */
async function scrapeBrandsList(
  menuUrl: string,
  context: BrowserContext,
  page: Page
): Promise<Brand[]> {
  try {
    const brandsUrl = `${menuUrl}/brands`;
    console.log(`[W${workerNum}] 📄 Loading brands page: ${brandsUrl}`);
    await page.goto(brandsUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });

    console.log(`[W${workerNum}] ⏳ Waiting for brands to render...`);
    await page.waitForSelector('a[href*="/brands/"]', { timeout: 45000 });
    console.log(`[W${workerNum}] ✅ Brands appeared!`);
    await page.waitForTimeout(3000);

    // Scroll to the bottom repeatedly; stop as soon as the document height
    // no longer changes between two consecutive scrolls.
    console.log(`[W${workerNum}] 📜 Scrolling to load all brands...`);
    const maxScrolls = 10;
    let previousHeight = 0;
    for (let attempt = 0; attempt < maxScrolls; attempt++) {
      await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
      await page.waitForTimeout(1500);
      const currentHeight = await page.evaluate(() => document.body.scrollHeight);
      if (currentHeight === previousHeight) break;
      previousHeight = currentHeight;
    }

    // Extract all brand links. This callback runs inside the browser page,
    // so it must not reference anything from the Node scope.
    const brands: Brand[] = await page.evaluate(() => {
      const seen = new Set<string>();
      const unique: { slug: string; name: string; url: string }[] = [];

      for (const link of Array.from(document.querySelectorAll('a[href*="/brands/"]'))) {
        const href = link.getAttribute('href') ?? '';
        const slug = href.split('/brands/')[1]?.replace(/\/$/, '') || '';
        const name = link.textContent?.trim() || slug;

        // Skip invalid entries and duplicates (first occurrence wins).
        if (!slug || !name || seen.has(slug)) continue;
        seen.add(slug);

        // Resolve relative hrefs against the current page so `url` is absolute;
        // fall back to the raw attribute if it cannot be parsed.
        let url = href;
        if (!href.startsWith('http')) {
          try {
            url = new URL(href, window.location.href).href;
          } catch {
            url = href;
          }
        }

        unique.push({ slug, name, url });
      }

      return unique;
    });

    console.log(`[W${workerNum}] ✅ Found ${brands.length} total brands`);
    return brands;
  } catch (error: unknown) {
    console.error(`[W${workerNum}] ❌ Error scraping brands list:`, errorMessage(error));
    return [];
  }
}

/**
 * Loads a single brand page, scrolls it 20 viewport heights to trigger
 * loading, and extracts one product per `a[href*="/product/"]` link.
 *
 * Stock information is derived purely from the link's text content.
 *
 * @param menuUrl Base menu URL of the dispensary.
 * @param brand Brand whose page is scraped; its name is stamped on each product.
 * @param dispensaryId Not used by this function; kept so existing callers keep working.
 * @param page Page used for navigation.
 * @returns The extracted products, or an empty array if anything fails (the error is logged).
 */
async function scrapeProductsFromBrand(
  menuUrl: string,
  brand: Brand,
  dispensaryId: number,
  page: Page
): Promise<Product[]> {
  try {
    const brandUrl = `${menuUrl}/brands/${brand.slug}`;
    console.log(`[W${workerNum}] 📄 Loading brand page: ${brandUrl}`);
    await page.goto(brandUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });
    await page.waitForTimeout(3000);

    // Scroll to load all products
    for (let i = 0; i < 20; i++) {
      await page.evaluate(() => window.scrollBy(0, window.innerHeight));
      await page.waitForTimeout(1500);
    }

    // Extract products. This callback runs inside the browser page.
    const products: Product[] = await page.evaluate((brandName: string) => {
      const productCards = Array.from(document.querySelectorAll('a[href*="/product/"]'));

      return productCards
        .map(card => {
          const href = card.getAttribute('href') ?? '';
          const slug = href.split('/product/')[1]?.replace(/\/$/, '') || '';
          const allText = card.textContent || '';
          const allTextLower = allText.toLowerCase();
          const isOutOfStock = allTextLower.includes('out of stock');

          // Extract stock quantity (e.g., "5 left in stock", "Only 3 left")
          let stockQuantity: number | undefined;
          const quantityMatch = allText.match(/(\d+)\s+left/i);
          if (quantityMatch) {
            stockQuantity = parseInt(quantityMatch[1], 10);
          }

          // Extract stock status messages; the first matching rule wins.
          let stockStatus: string | undefined;
          if (isOutOfStock) {
            stockStatus = 'out of stock';
          } else if (allTextLower.includes('low stock') || allTextLower.includes('limited')) {
            stockStatus = 'low stock';
          } else if (quantityMatch) {
            stockStatus = `${stockQuantity} left in stock`;
          } else if (allTextLower.includes('order soon')) {
            stockStatus = 'order soon';
          }

          return {
            slug,
            name: card.textContent?.trim().substring(0, 200) || '',
            brand: brandName,
            dutchieUrl: href,
            inStock: !isOutOfStock,
            stockQuantity,
            stockStatus,
            terpenes: [],
            effects: [],
            flavors: []
          };
        })
        // A link without a slug cannot be keyed in the database; drop it
        // instead of persisting a row with an empty slug.
        .filter(product => product.slug !== '');
    }, brand.name);

    console.log(`[W${workerNum}] ✅ Extracted ${products.length} products from brand: ${brand.name}`);
    return products;
  } catch (error: unknown) {
    console.error(`[W${workerNum}] ❌ Error scraping brand ${brand.name}:`, errorMessage(error));
    return [];
  }
}

/**
 * Inserts or updates one product inside a transaction, keyed by
 * (dispensary_id, slug).
 *
 * @param product Product to persist.
 * @param dispensaryId Dispensary the product belongs to.
 * @returns `true` when the transaction committed, `false` when it failed (the error is logged).
 */
async function saveProduct(product: Product, dispensaryId: number): Promise<boolean> {
  const client = await pool.connect();
  try {
    await client.query('BEGIN');

    // Check if product exists
    const existing = await client.query(
      'SELECT id FROM products WHERE dispensary_id = $1 AND slug = $2',
      [dispensaryId, product.slug]
    );

    if (existing.rows.length > 0) {
      // UPDATE existing product
      const productId: number = existing.rows[0].id;
      await client.query(`
        UPDATE products
        SET name = $1, brand = $2, variant = $3, description = $4,
            regular_price = $5, sale_price = $6,
            thc_percentage = $7, cbd_percentage = $8, strain_type = $9,
            terpenes = $10, effects = $11, flavors = $12,
            image_url = $13, dutchie_url = $14, in_stock = $15,
            stock_quantity = $16, stock_status = $17,
            last_seen_at = CURRENT_TIMESTAMP, updated_at = CURRENT_TIMESTAMP
        WHERE id = $18
      `, [
        product.name, product.brand, product.variant, product.description,
        product.regularPrice, product.salePrice,
        product.thcPercentage, product.cbdPercentage, product.strainType,
        product.terpenes, product.effects, product.flavors,
        product.imageUrl, product.dutchieUrl, product.inStock,
        product.stockQuantity, product.stockStatus,
        productId
      ]);
    } else {
      // INSERT new product
      await client.query(`
        INSERT INTO products (
          dispensary_id, slug, name, brand, variant, description,
          regular_price, sale_price,
          thc_percentage, cbd_percentage, strain_type,
          terpenes, effects, flavors,
          image_url, dutchie_url, in_stock,
          stock_quantity, stock_status
        ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19)
        RETURNING id
      `, [
        dispensaryId, product.slug, product.name, product.brand, product.variant, product.description,
        product.regularPrice, product.salePrice,
        product.thcPercentage, product.cbdPercentage, product.strainType,
        product.terpenes, product.effects, product.flavors,
        product.imageUrl, product.dutchieUrl, product.inStock,
        product.stockQuantity, product.stockStatus
      ]);
    }

    await client.query('COMMIT');
    return true;
  } catch (error: unknown) {
    // A failing ROLLBACK must not escape: it would hide the original error
    // and break the "returns false on failure" contract.
    try {
      await client.query('ROLLBACK');
    } catch (rollbackError: unknown) {
      console.error(`[W${workerNum}] ❌ Error rolling back transaction:`, errorMessage(rollbackError));
    }
    console.error(`[W${workerNum}] ❌ Error saving product:`, errorMessage(error));
    return false;
  } finally {
    client.release();
  }
}

/**
 * Worker entry point: looks up the dispensary, launches a proxied Firefox,
 * scrapes the brand list, then scrapes and saves the products of the brands
 * in the inclusive range [startBrandIndex, endBrandIndex].
 *
 * On a fatal condition (bad arguments, unknown dispensary, no proxy, no
 * brands) it sets `process.exitCode = 1` and returns. The browser and the
 * database pool are always released, including when an error is thrown.
 */
async function main(): Promise<void> {
  console.log(`\n${'='.repeat(60)}`);
  console.log(`🚀 PARALLEL SCRAPER - WORKER ${workerNum}`);
  console.log(`   Dispensary ID: ${dispensaryId}`);
  console.log(`   Brand Range: ${startBrandIndex} - ${endBrandIndex}`);
  console.log(`${'='.repeat(60)}\n`);

  let browser: Browser | undefined;
  try {
    // parseInt yields NaN for non-numeric input; reject it up front instead
    // of sending NaN to the database or to Array.prototype.slice.
    const numericArgs = [dispensaryId, startBrandIndex, endBrandIndex];
    if (numericArgs.some(Number.isNaN) || startBrandIndex < 0 || endBrandIndex < 0) {
      console.error(`[W${workerNum}] ❌ Invalid arguments: dispensaryId, startBrandIndex and endBrandIndex must be non-negative integers`);
      process.exitCode = 1;
      return;
    }

    // Get dispensary info
    const dispensaryResult = await pool.query(
      'SELECT id, name, menu_url FROM dispensaries WHERE id = $1',
      [dispensaryId]
    );
    if (dispensaryResult.rows.length === 0) {
      console.error(`[W${workerNum}] ❌ Dispensary ID ${dispensaryId} not found`);
      process.exitCode = 1;
      return;
    }
    const menuUrl: string = dispensaryResult.rows[0].menu_url;
    console.log(`[W${workerNum}] ✅ Dispensary: ${dispensaryResult.rows[0].name}`);

    // Get proxy
    const proxy = await getRandomProxy();
    if (!proxy) {
      console.log(`[W${workerNum}] ❌ No proxy available`);
      process.exitCode = 1;
      return;
    }
    console.log(`[W${workerNum}] 🔐 Using proxy: ${proxy.server}`);

    // Launch browser
    browser = await firefox.launch({ headless: true });
    const context = await browser.newContext({
      viewport: { width: 1920, height: 1080 },
      // NOTE(review): this is a Chrome user agent sent from Firefox — confirm it is intentional.
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
      proxy: {
        server: proxy.server,
        username: proxy.username,
        password: proxy.password
      }
    });
    const page = await context.newPage();

    // Get all brands
    const allBrands = await scrapeBrandsList(menuUrl, context, page);
    if (allBrands.length === 0) {
      console.log(`[W${workerNum}] ❌ No brands found`);
      process.exitCode = 1;
      return;
    }

    // Filter to assigned range (endBrandIndex is inclusive)
    const brandsToScrape = allBrands.slice(startBrandIndex, endBrandIndex + 1);
    console.log(`[W${workerNum}] 📋 Processing ${brandsToScrape.length} brands (${startBrandIndex}-${endBrandIndex})`);

    let totalProducts = 0;
    let totalSaved = 0;

    // Scrape each brand
    for (const [i, brand] of brandsToScrape.entries()) {
      const globalIndex = startBrandIndex + i;

      console.log(`\n[W${workerNum}] ${'='.repeat(60)}`);
      console.log(`[W${workerNum}] 🏷️  BRAND ${i + 1}/${brandsToScrape.length} (Global: ${globalIndex + 1}/${allBrands.length}): ${brand.name}`);
      console.log(`[W${workerNum}] ${'='.repeat(60)}\n`);

      const products = await scrapeProductsFromBrand(menuUrl, brand, dispensaryId, page);
      totalProducts += products.length;

      // Save products one at a time; count only the ones that committed.
      let savedForBrand = 0;
      for (const product of products) {
        if (await saveProduct(product, dispensaryId)) savedForBrand++;
      }
      totalSaved += savedForBrand;

      console.log(`[W${workerNum}] ✅ Brand "${brand.name}" complete: ${savedForBrand}/${products.length} products saved`);
      await page.waitForTimeout(2000);
    }

    console.log(`\n[W${workerNum}] ${'='.repeat(60)}`);
    console.log(`[W${workerNum}] ✅ WORKER ${workerNum} COMPLETE`);
    console.log(`[W${workerNum}] Brands scraped: ${brandsToScrape.length}`);
    console.log(`[W${workerNum}] Total products: ${totalProducts}`);
    console.log(`[W${workerNum}] Products saved: ${totalSaved}`);
    console.log(`[W${workerNum}] ${'='.repeat(60)}\n`);
  } finally {
    // Release resources on every path so the process can exit on its own.
    if (browser) {
      try {
        await browser.close();
      } catch (closeError: unknown) {
        console.error(`[W${workerNum}] ❌ Error closing browser:`, errorMessage(closeError));
      }
    }
    await pool.end();
  }
}

main().catch((error: unknown) => {
  console.error(error);
  // Signal failure to whatever launched this worker.
  process.exitCode = 1;
});