import { chromium } from 'playwright-extra';
import stealth from 'puppeteer-extra-plugin-stealth';
import { pool } from './src/db/migrate';

// Register the stealth plugin before any browser is launched.
chromium.use(stealth());

/** Normalized dispensary record, shaped like the columns written to `stores`. */
interface MapDispensary {
  name: string;
  address?: string;
  city?: string;
  state: string;
  zip?: string;
  latitude?: number;
  longitude?: number;
  phone?: string;
  website?: string;
}

/**
 * Returns the first usable text value found under any of `keys`.
 *
 * Non-empty strings are returned trimmed; finite numbers (e.g. a numeric zip)
 * are stringified. Anything else (objects, booleans, blank strings) is treated
 * as missing so that it is never written into a text column.
 */
function pickText(raw: Record<string, unknown>, keys: readonly string[]): string | undefined {
  for (const key of keys) {
    const value = raw[key];
    if (typeof value === 'string') {
      const trimmed = value.trim();
      if (trimmed !== '') return trimmed;
    } else if (typeof value === 'number' && Number.isFinite(value)) {
      return String(value);
    }
  }
  return undefined;
}

/**
 * Returns the first finite coordinate found under any of `keys`.
 *
 * Uses an explicit finiteness check instead of `||` so that a legitimate `0`
 * is kept; numeric strings are parsed.
 */
function pickCoordinate(raw: Record<string, unknown>, keys: readonly string[]): number | undefined {
  for (const key of keys) {
    const value = raw[key];
    if (typeof value === 'number' && Number.isFinite(value)) return value;
    if (typeof value === 'string' && value.trim() !== '') {
      const parsed = Number(value);
      if (Number.isFinite(parsed)) return parsed;
    }
  }
  return undefined;
}

/**
 * Maps one raw entry from the page data onto {@link MapDispensary}, trying the
 * alternative property names the scraper knows about.
 *
 * @returns the normalized record, or `undefined` when the entry is not an
 *          object or has no usable name.
 */
function normalizeDispensary(raw: unknown): MapDispensary | undefined {
  if (typeof raw !== 'object' || raw === null) return undefined;
  const record = raw as Record<string, unknown>;

  const name = pickText(record, ['name', 'storeName', 'title']);
  if (name === undefined) return undefined;

  return {
    name,
    address: pickText(record, ['address', 'streetAddress']),
    city: pickText(record, ['city', 'locality']),
    // This scraper targets the Arizona listing, so default the state to AZ.
    state: pickText(record, ['state', 'region']) ?? 'AZ',
    zip: pickText(record, ['zip', 'postalCode']),
    latitude: pickCoordinate(record, ['latitude', 'lat']),
    longitude: pickCoordinate(record, ['longitude', 'lng', 'lon']),
    phone: pickText(record, ['phone', 'telephone']),
    website: pickText(record, ['website', 'url']),
  };
}

/**
 * Inserts the dispensary into `stores`, or updates the existing row that has
 * the same (case-insensitive) name and state. On update, only fields that have
 * a new non-null value are overwritten (via COALESCE).
 *
 * NOTE(review): rows are matched on name + state only, so several locations
 * sharing one name would all map to the same row — confirm this is intended.
 *
 * @returns whether a row was inserted or updated.
 */
async function upsertDispensary(dispensary: MapDispensary): Promise<'inserted' | 'updated'> {
  const { name, state } = dispensary;
  // Missing optional fields are sent as SQL NULL.
  const address = dispensary.address ?? null;
  const city = dispensary.city ?? null;
  const zip = dispensary.zip ?? null;
  const phone = dispensary.phone ?? null;
  const website = dispensary.website ?? null;
  const latitude = dispensary.latitude ?? null;
  const longitude = dispensary.longitude ?? null;

  const existing = await pool.query(
    'SELECT id FROM stores WHERE LOWER(name) = LOWER($1) AND state = $2',
    [name, state]
  );

  if (existing.rows.length > 0) {
    await pool.query(
      `
        UPDATE stores
        SET address = COALESCE($1, address),
            city = COALESCE($2, city),
            zip = COALESCE($3, zip),
            phone = COALESCE($4, phone),
            website = COALESCE($5, website),
            latitude = COALESCE($6, latitude),
            longitude = COALESCE($7, longitude),
            updated_at = CURRENT_TIMESTAMP
        WHERE id = $8
      `,
      [address, city, zip, phone, website, latitude, longitude, existing.rows[0].id]
    );
    return 'updated';
  }

  await pool.query(
    `
      INSERT INTO stores (
        name, address, city, state, zip, phone, website,
        latitude, longitude, active, created_at, updated_at
      )
      VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, true, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
    `,
    [name, address, city, state, zip, phone, website, latitude, longitude]
  );
  return 'inserted';
}

/**
 * Opens the Leafly Arizona dispensary listing in a visible browser, pulls the
 * store array out of the page's `__NEXT_DATA__` payload and upserts every
 * entry into the `stores` table.
 *
 * The browser and the database pool are always released, even when launching
 * the browser or loading the page fails.
 *
 * @throws rethrows any browser/navigation/pool error; failures while saving a
 *         single dispensary are logged and do not abort the run.
 */
async function scrapeLeaflyMap(): Promise<void> {
  console.log('🗺️ Scraping dispensaries from Leafly Arizona map...\n');

  try {
    const browser = await chromium.launch({
      headless: false, // Show browser to see what's happening
    });

    try {
      const context = await browser.newContext({
        viewport: { width: 1920, height: 1080 },
        userAgent:
          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      });
      const page = await context.newPage();

      console.log('📄 Loading Leafly Arizona page...');
      await page.goto('https://www.leafly.com/dispensaries/arizona', {
        waitUntil: 'networkidle',
        timeout: 60000,
      });

      // Give client-side rendering some extra time after the network is idle.
      await page.waitForTimeout(5000);

      // This callback runs inside the page: it cannot reference anything from
      // this module, and its console output goes to the browser console.
      const mapData: unknown = await page.evaluate(() => {
        // Method 1: report any JSON-LD structured data (diagnostic only).
        document.querySelectorAll('script[type="application/ld+json"]').forEach(script => {
          try {
            const data = JSON.parse(script.textContent || '');
            if (data && typeof data === 'object') {
              console.log('Found JSON-LD:', Object.keys(data));
            }
          } catch {
            // Skip invalid JSON
          }
        });

        // Method 2: report window globals whose name hints at store data.
        const hints = ['store', 'dispensar', 'location', 'map', 'data'];
        const windowKeys = Object.keys(window).filter(key => {
          const lowered = key.toLowerCase();
          return hints.some(hint => lowered.includes(hint));
        });
        console.log('Interesting window keys:', windowKeys);

        // Method 3: __NEXT_DATA__ (Next.js apps often embed page props here).
        const nextData = document.getElementById('__NEXT_DATA__');
        if (nextData) {
          try {
            const data = JSON.parse(nextData.textContent || '');
            console.log('Found __NEXT_DATA__:', Object.keys(data));

            const pageProps = data?.props?.pageProps;
            if (pageProps) {
              console.log('PageProps keys:', Object.keys(pageProps));

              // Common patterns for store data
              if (pageProps.stores) {
                console.log('Found stores array:', pageProps.stores.length);
                return pageProps.stores;
              }
              if (pageProps.dispensaries) {
                console.log('Found dispensaries array:', pageProps.dispensaries.length);
                return pageProps.dispensaries;
              }
              if (pageProps.locations) {
                console.log('Found locations array:', pageProps.locations.length);
                return pageProps.locations;
              }
              if (pageProps.initialData) {
                // Returned as-is so its shape shows up in the dump below; it
                // is only processed further if it happens to be an array.
                console.log('Found initialData:', Object.keys(pageProps.initialData));
                return pageProps.initialData;
              }
            }
          } catch (e) {
            console.error('Error parsing __NEXT_DATA__:', e);
          }
        }

        // Method 4: count map marker elements (diagnostic only).
        const markers = document.querySelectorAll('[class*="marker"], [class*="pin"], [data-marker]');
        console.log('Found map markers:', markers.length);

        return [];
      });

      console.log('\n📊 Map data extracted:');
      console.log(JSON.stringify(mapData, null, 2));

      if (!Array.isArray(mapData) || mapData.length === 0) {
        // Make the "nothing to do" outcome visible instead of exiting silently.
        console.warn('No dispensary array found in page data; nothing was saved.');
        return;
      }

      console.log(`\n✅ Found ${mapData.length} dispensaries from map data`);

      let savedCount = 0;
      let updatedCount = 0;

      // Sequential on purpose: each upsert is a SELECT followed by a write.
      for (const raw of mapData) {
        try {
          const dispensary = normalizeDispensary(raw);
          if (!dispensary) continue; // no usable name

          const outcome = await upsertDispensary(dispensary);
          if (outcome === 'inserted') {
            savedCount++;
          } else {
            updatedCount++;
          }
        } catch (error) {
          console.error(`Error saving dispensary: ${error}`);
        }
      }

      console.log(`\n✅ Saved ${savedCount} new dispensaries`);
      console.log(`✅ Updated ${updatedCount} existing dispensaries`);
    } finally {
      await browser.close();
    }
  } finally {
    // Runs even if the browser never launched or failed to close.
    await pool.end();
  }
}

// Script entry point: report the failure once and exit non-zero instead of
// leaving an unhandled promise rejection.
scrapeLeaflyMap().catch((error: unknown) => {
  console.error(`Error: ${error}`);
  process.exitCode = 1;
});