import { chromium } from 'playwright-extra';
import stealth from 'puppeteer-extra-plugin-stealth';
import { pool } from './src/db/migrate';

chromium.use(stealth());

/**
 * One dispensary listing as scraped from a Leafly directory card.
 *
 * Only `name`, `state`, `website`, `dispensary_type`, `rating`,
 * `review_count`, `address`, `city` and `zip` are populated by this script;
 * `latitude`, `longitude` and `phone` are never set by the scraper and are
 * therefore always passed to the database as NULL.
 */
interface LeaflyDispensary {
  name: string;
  address?: string;
  city?: string;
  state: string;
  zip?: string;
  latitude?: number;
  longitude?: number;
  phone?: string;
  website?: string;
  rating?: number;
  review_count?: number;
  dispensary_type?: string; // MED, REC, or MED & REC
}

/** Scope tag attached to warnings and errors emitted by this script. */
const LOG_SCOPE = 'leafly-scraper';

/** Number of directory pages to walk (total pages according to the website). */
const MAX_PAGES = 62;

/** Pause between page loads, to be respectful to the remote site. */
const PAGE_DELAY_MS = 1000;

/**
 * Minimal console-backed logger.
 *
 * The script calls `logger.info/warn/error(scope, message)` but no logger was
 * ever imported, which made the first call throw a ReferenceError. Progress
 * messages are printed as-is (like the start-up banner); warnings and errors
 * are prefixed with their scope.
 */
const logger = {
  info: (_scope: string, message: string): void => {
    console.log(message);
  },
  warn: (scope: string, message: string): void => {
    console.warn(`[${scope}] ${message}`);
  },
  error: (scope: string, message: string): void => {
    console.error(`[${scope}] ${message}`);
  },
};

/** Renders a caught value of unknown type as a log-friendly string. */
function formatError(error: unknown): string {
  return error instanceof Error ? `${error.name}: ${error.message}` : String(error);
}

/**
 * Collapses entries that share the same (case-insensitive) name and URL.
 *
 * The card selector matches both the card container and the anchor that links
 * to the dispensary, so a single listing can be extracted more than once.
 * The first occurrence keeps its position and its values; later duplicates
 * only fill in fields the first one is missing.
 */
function dedupeDispensaries(dispensaries: LeaflyDispensary[]): LeaflyDispensary[] {
  const byKey = new Map<string, LeaflyDispensary>();
  for (const dispensary of dispensaries) {
    const key = `${dispensary.name.toLowerCase()}|${dispensary.website ?? ''}`;
    const seen = byKey.get(key);
    byKey.set(key, seen ? { ...dispensary, ...seen } : dispensary);
  }
  return Array.from(byKey.values());
}

/**
 * Inserts or updates each dispensary in the `stores` table.
 *
 * A row is considered the same store when its name matches
 * case-insensitively within the same state. Updates use COALESCE so a value
 * missing from the scrape never overwrites an existing column. A failure on
 * one row is logged and does not stop the remaining rows.
 *
 * @returns how many rows were inserted and how many were updated.
 */
async function saveDispensaries(
  dispensaries: LeaflyDispensary[],
): Promise<{ savedCount: number; updatedCount: number }> {
  let savedCount = 0;
  let updatedCount = 0;

  for (const dispensary of dispensaries) {
    try {
      // Check if dispensary already exists by name
      const existing = await pool.query(
        'SELECT id FROM stores WHERE LOWER(name) = LOWER($1) AND state = $2',
        [dispensary.name, dispensary.state],
      );

      if (existing.rows.length > 0) {
        // Update existing
        await pool.query(
          `
          UPDATE stores
          SET address = COALESCE($1, address),
              city = COALESCE($2, city),
              zip = COALESCE($3, zip),
              phone = COALESCE($4, phone),
              website = COALESCE($5, website),
              latitude = COALESCE($6, latitude),
              longitude = COALESCE($7, longitude),
              updated_at = CURRENT_TIMESTAMP
          WHERE id = $8
          `,
          [
            dispensary.address,
            dispensary.city,
            dispensary.zip,
            dispensary.phone,
            dispensary.website,
            dispensary.latitude,
            dispensary.longitude,
            existing.rows[0].id,
          ],
        );
        updatedCount++;
      } else {
        // Insert new
        await pool.query(
          `
          INSERT INTO stores (
            name, address, city, state, zip, phone, website,
            latitude, longitude, active, created_at, updated_at
          )
          VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, true, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
          `,
          [
            dispensary.name,
            dispensary.address,
            dispensary.city,
            dispensary.state,
            dispensary.zip,
            dispensary.phone,
            dispensary.website,
            dispensary.latitude,
            dispensary.longitude,
          ],
        );
        savedCount++;
      }
    } catch (error: unknown) {
      logger.error(LOG_SCOPE, `Error saving ${dispensary.name}: ${formatError(error)}`);
    }
  }

  return { savedCount, updatedCount };
}

/**
 * Walks the Leafly Arizona dispensary directory page by page, extracts the
 * listing cards, and upserts them into the `stores` table.
 *
 * The browser and the database pool are always released, whether the run
 * succeeds or fails. Any scraping error is logged and rethrown.
 */
async function scrapeLeaflyArizona(): Promise<void> {
  console.log('🌿 Scraping Leafly Arizona dispensaries...\n');

  const browser = await chromium.launch({
    headless: true,
  });

  try {
    // Context/page creation lives inside the try so a failure here still
    // reaches the cleanup in `finally`.
    const context = await browser.newContext({
      viewport: { width: 1920, height: 1080 },
      userAgent:
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    });
    const page = await context.newPage();

    const allDispensaries: LeaflyDispensary[] = [];
    const maxPages = MAX_PAGES;

    for (let currentPage = 1; currentPage <= maxPages; currentPage++) {
      const url =
        currentPage === 1
          ? 'https://www.leafly.com/dispensaries/arizona'
          : `https://www.leafly.com/dispensaries/arizona?page=${currentPage}`;

      console.log(`📄 Loading page ${currentPage}/${maxPages}...`);
      await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });

      // Wait for dispensary cards to load; a timeout is only a warning because
      // the fallback anchor selector below may still find listings.
      await page
        .waitForSelector('[data-testid="dispensary-card"]', { timeout: 10000 })
        .catch(() => {
          logger.warn(LOG_SCOPE, 'No dispensary cards found on page');
        });

      // Extract dispensaries from current page. This callback is serialized
      // and executed in the browser, so it must not reference anything from
      // the enclosing Node scope (types are fine: they are erased).
      const dispensaries = await page.evaluate(() => {
        const results: LeaflyDispensary[] = [];

        // Try to find dispensary cards
        const cards = document.querySelectorAll(
          '[data-testid="dispensary-card"], a[href*="/dispensary/"]',
        );

        cards.forEach((card) => {
          // Extract name; elements without one are not listings
          const name = card.querySelector('h3, h4, [class*="name"]')?.textContent?.trim();
          if (!name) {
            return;
          }

          const dispensary: LeaflyDispensary = { name, state: 'AZ' };

          // Extract URL (site-relative links are made absolute)
          const linkEl = card.closest('a') || card.querySelector('a');
          const href = linkEl?.getAttribute('href');
          if (href) {
            dispensary.website = href.startsWith('http') ? href : `https://www.leafly.com${href}`;
          }

          // Extract type (MED/REC)
          const typeEl = card.querySelector('[class*="badge"], [class*="type"]');
          if (typeEl) {
            dispensary.dispensary_type = typeEl.textContent?.trim();
          }

          // Extract rating
          const ratingMatch = card
            .querySelector('[class*="rating"], [aria-label*="star"]')
            ?.textContent?.match(/(\d+\.?\d*)/);
          if (ratingMatch) {
            dispensary.rating = parseFloat(ratingMatch[1]);
          }

          // Extract review count
          const reviewMatch = card.querySelector('[class*="review"]')?.textContent?.match(/(\d+)/);
          if (reviewMatch) {
            dispensary.review_count = parseInt(reviewMatch[1], 10);
          }

          // Extract address from text content
          const addressText = card.textContent || '';
          const addressMatch = addressText.match(
            /(\d+\s+[\w\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Lane|Ln|Way|Circle|Cir|Court|Ct)\.?\s*(?:#\d+)?)/i,
          );
          if (addressMatch) {
            dispensary.address = addressMatch[1].trim();
          }

          // Extract city from text
          const cityMatch = addressText.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),\s*AZ/);
          if (cityMatch) {
            dispensary.city = cityMatch[1].trim();
          }

          // Extract ZIP
          const zipMatch = addressText.match(/\b(\d{5})\b/);
          if (zipMatch) {
            dispensary.zip = zipMatch[1];
          }

          results.push(dispensary);
        });

        return results;
      });

      logger.info(
        LOG_SCOPE,
        `   ✅ Found ${dispensaries.length} dispensaries on page ${currentPage}`,
      );
      allDispensaries.push(...dispensaries);

      // Small delay between pages to be respectful
      await new Promise((resolve) => setTimeout(resolve, PAGE_DELAY_MS));
    }

    // The selector can match a listing more than once; count each store once.
    const uniqueDispensaries = dedupeDispensaries(allDispensaries);
    logger.info(LOG_SCOPE, `\n✅ Scraped ${uniqueDispensaries.length} total dispensaries`);

    // Save to database
    logger.info(LOG_SCOPE, '\n💾 Saving to database...');
    const { savedCount, updatedCount } = await saveDispensaries(uniqueDispensaries);

    logger.info(LOG_SCOPE, `\n✅ Saved ${savedCount} new dispensaries`);
    logger.info(LOG_SCOPE, `✅ Updated ${updatedCount} existing dispensaries`);

    // Show sample of saved data
    const sample = await pool.query(`
      SELECT name, city, address, website
      FROM stores
      WHERE state = 'AZ'
      ORDER BY created_at DESC
      LIMIT 5
    `);

    logger.info(LOG_SCOPE, '\n📋 Sample of saved dispensaries:');
    console.table(sample.rows);
  } catch (error: unknown) {
    logger.error(LOG_SCOPE, `Error: ${formatError(error)}`);
    throw error;
  } finally {
    // Nested so the pool is released even when closing the browser fails.
    try {
      await browser.close();
    } finally {
      await pool.end();
    }
  }
}

// The failure has already been logged inside scrapeLeaflyArizona; here we only
// make sure the rejection is handled and the process reports a failing status.
scrapeLeaflyArizona().catch(() => {
  process.exitCode = 1;
});