import { chromium } from 'playwright-extra';
import stealth from 'puppeteer-extra-plugin-stealth';
import { pool } from './src/db/migrate';

chromium.use(stealth());

/**
 * A store entry as found in the page's embedded data. The shape comes from a
 * third-party page, so it is deliberately loose; every field is read
 * defensively in `normalizeStore`.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
type RawStore = Record<string, any>;

/** Flat set of values bound to the `stores` table queries. */
interface StoreRecord {
  name: string;
  slug: string;
  dutchieUrl: string;
  address: unknown;
  city: unknown;
  state: unknown;
  zip: unknown;
  phone: unknown;
  website: unknown;
  lat: unknown;
  lng: unknown;
}

/**
 * Maps a raw store entry onto the columns written to the database.
 *
 * Several alternative property names are tried for each field. `||` (not `??`)
 * is used on purpose so that empty strings fall through to the next candidate
 * and ultimately to `undefined`, which lets the UPDATE's COALESCE keep the
 * existing column value.
 *
 * @param item Raw store entry collected from the page.
 * @returns The normalized record, or `null` when the entry has no slug or no
 *          usable name and must be skipped.
 */
function normalizeStore(item: RawStore): StoreRecord | null {
  const slug: string = item.slug || '';
  // Fall back to a title-cased version of the slug when no name field is present.
  const name: string =
    item.name ||
    item.storeName ||
    item.dispensaryName ||
    item.title ||
    (slug ? slug.replace(/-/g, ' ').replace(/\b\w/g, (l: string) => l.toUpperCase()) : '');

  if (!name || !slug) return null;

  const leaflyUrl = slug ? `https://www.leafly.com/dispensaries/${slug}` : '';

  return {
    name,
    slug,
    // Use the Leafly URL as fallback when no menu URL is present.
    dutchieUrl: item.dutchieUrl || item.menuUrl || leaflyUrl || '',
    address: item.address?.address1 || item.address?.streetAddress || item.streetAddress,
    city: item.address?.city || item.city,
    state: item.address?.state || item.state || 'AZ',
    zip: item.address?.zip || item.zip || item.zipCode,
    phone: item.phone || item.phoneNumber,
    website: item.website || item.url || leaflyUrl,
    lat: item.address?.lat || item.latitude || item.lat,
    lng: item.address?.lon || item.address?.lng || item.longitude || item.lng,
  };
}

/**
 * Inserts a store, or updates the existing row that has the same name
 * (case-insensitive) and state.
 *
 * @param store Normalized store values.
 * @returns `'updated'` when an existing row was modified, `'inserted'` otherwise.
 */
async function upsertStore(store: StoreRecord): Promise<'inserted' | 'updated'> {
  const existing = await pool.query(
    'SELECT id FROM stores WHERE LOWER(name) = LOWER($1) AND state = $2',
    [store.name, store.state]
  );

  if (existing.rows.length > 0) {
    // COALESCE keeps the current column value whenever the scraped value is null.
    await pool.query(
      `
      UPDATE stores SET
        address = COALESCE($1, address),
        city = COALESCE($2, city),
        zip = COALESCE($3, zip),
        phone = COALESCE($4, phone),
        website = COALESCE($5, website),
        latitude = COALESCE($6, latitude),
        longitude = COALESCE($7, longitude),
        updated_at = CURRENT_TIMESTAMP
      WHERE id = $8
      `,
      [
        store.address,
        store.city,
        store.zip,
        store.phone,
        store.website,
        store.lat,
        store.lng,
        existing.rows[0].id,
      ]
    );
    return 'updated';
  }

  await pool.query(
    `
    INSERT INTO stores (
      name, slug, dutchie_url, address, city, state, zip, phone, website,
      latitude, longitude, data_source, active, created_at, updated_at
    )
    VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, 'leafly', true, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
    `,
    [
      store.name,
      store.slug,
      store.dutchieUrl,
      store.address,
      store.city,
      store.state,
      store.zip,
      store.phone,
      store.website,
      store.lat,
      store.lng,
    ]
  );
  return 'inserted';
}

/**
 * Scrapes the Leafly Arizona dispensary listing pages and upserts every store
 * found into the `stores` table.
 *
 * For each listing page the store list is read from the page's
 * `__NEXT_DATA__` JSON (with a scan of `window` globals as a fallback). After
 * all pages are visited, each store is inserted or updated, and a per-source
 * count of Arizona stores is printed.
 *
 * The browser and the database pool are always released before returning,
 * so this function is meant to be the last user of `pool` in the process.
 *
 * @throws Rethrows any error raised while scraping or while running the final
 *         summary query (after logging it). Failures on individual stores are
 *         logged and skipped instead.
 */
async function scrapeLeaflyMapMarkers(): Promise<void> {
  console.log('🗺️ Extracting dispensaries from Leafly Arizona map...\n');

  const browser = await chromium.launch({
    headless: false,
  });

  try {
    // Created inside the try so the browser is closed even if context/page setup fails.
    const context = await browser.newContext({
      viewport: { width: 1920, height: 1080 },
      userAgent:
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    });
    const page = await context.newPage();

    const allStores: RawStore[] = [];
    const maxPages = 6; // Fetch 6 pages (25 per page = ~150 total)

    for (let currentPage = 1; currentPage <= maxPages; currentPage++) {
      const url =
        currentPage === 1
          ? 'https://www.leafly.com/dispensaries/arizona'
          : `https://www.leafly.com/dispensaries/arizona?page=${currentPage}`;

      console.log(`📄 Loading page ${currentPage}/${maxPages}: ${url}`);
      await page.goto(url, { waitUntil: 'networkidle', timeout: 60000 });

      // Extra settle time after network idle before reading the page data.
      await page.waitForTimeout(3000);

      console.log(`🔍 Extracting map data from page ${currentPage}...\n`);

      // Runs in the browser context: must be self-contained (no outer-scope references).
      const mapData = await page.evaluate(() => {
        // Method 1: Check __NEXT_DATA__ which Next.js uses
        const nextDataEl = document.getElementById('__NEXT_DATA__');
        if (nextDataEl && nextDataEl.textContent) {
          try {
            const data = JSON.parse(nextDataEl.textContent);

            // Navigate through the data structure to find stores
            if (data?.props?.pageProps) {
              const pageProps = data.props.pageProps;

              // Try different common property names
              if (pageProps.stores) return pageProps.stores;
              if (pageProps.dispensaries) return pageProps.dispensaries;
              if (pageProps.locations) return pageProps.locations;
              if (pageProps.storeLocatorResults?.data) {
                // Return the whole data object alongside the stores so pagination info is available
                return {
                  stores: pageProps.storeLocatorResults.data.organicStores,
                  fullData: pageProps.storeLocatorResults.data,
                };
              }
              if (pageProps.storeLocatorResults) return pageProps.storeLocatorResults;
              if (pageProps.initMapData) return pageProps.initMapData;
              if (pageProps.initialState?.stores) return pageProps.initialState.stores;
              if (pageProps.initialData?.stores) return pageProps.initialData.stores;
              if (pageProps.initialData?.dispensaries) return pageProps.initialData.dispensaries;

              // Sometimes the data is nested deeper
              const keys = Object.keys(pageProps);
              for (const key of keys) {
                if (pageProps[key]?.stores) return pageProps[key].stores;
                if (pageProps[key]?.dispensaries) return pageProps[key].dispensaries;
                if (Array.isArray(pageProps[key]) && pageProps[key].length > 100) {
                  // Likely the stores array
                  return pageProps[key];
                }
              }

              // Return the whole pageProps for inspection
              return { debug: 'pageProps', keys: Object.keys(pageProps), data: pageProps };
            }
          } catch (e) {
            return { error: 'Error parsing __NEXT_DATA__', message: String(e) };
          }
        }

        // Method 2: Check window object for any map-related data
        const windowKeys = Object.keys(window);
        for (const key of windowKeys) {
          try {
            const value = (window as any)[key];
            if (Array.isArray(value) && value.length > 100) {
              // Check if it looks like store data
              const sample = value[0];
              if (sample && (sample.name || sample.storeName || sample.dispensaryName)) {
                return value;
              }
            }
          } catch (e) {
            // Skip: some window properties throw when accessed
          }
        }

        return { error: 'No map data found' };
      });

      // Handle different return formats
      let stores: RawStore[] = [];
      if (mapData && mapData.stores && mapData.fullData) {
        // Got our custom format with pagination info
        stores = mapData.stores;
        console.log(` ✅ Found ${stores.length} stores on page ${currentPage}`);
      } else if (Array.isArray(mapData)) {
        stores = mapData;
        console.log(` ✅ Found ${stores.length} stores on page ${currentPage}`);
      } else {
        console.log(` ⚠️ No stores found on page ${currentPage}`);
        // Surface the diagnostic payload from the in-page extraction instead of discarding it.
        if (mapData?.error) {
          console.log(`    Reason: ${mapData.error}${mapData.message ? ` (${mapData.message})` : ''}`);
        } else if (mapData?.debug && Array.isArray(mapData.keys)) {
          console.log(`    Unrecognized ${mapData.debug} keys: ${mapData.keys.join(', ')}`);
        }
      }

      // Add stores from this page to allStores
      if (Array.isArray(stores) && stores.length > 0) {
        allStores.push(...stores);
      }
    }

    console.log(`\n📊 Total stores collected from ${maxPages} pages: ${allStores.length}`);

    // Now save all collected stores
    if (allStores.length > 0) {
      console.log('\n💾 Saving all dispensaries to database...\n');

      let savedCount = 0;
      let updatedCount = 0;

      // Sequential on purpose: each store does a lookup followed by a write.
      for (const item of allStores) {
        try {
          const store = normalizeStore(item);
          if (!store) continue;

          if ((await upsertStore(store)) === 'updated') {
            updatedCount++;
          } else {
            savedCount++;
          }
        } catch (error) {
          // One bad record must not abort the whole run.
          console.error(`Error saving dispensary: ${error}`);
        }
      }

      console.log(`\n✅ Saved ${savedCount} new dispensaries`);
      console.log(`✅ Updated ${updatedCount} existing dispensaries`);

      // Show total by source
      const total = await pool.query(
        `SELECT COUNT(*) as total, data_source FROM stores WHERE state = 'AZ' GROUP BY data_source`
      );
      console.log('\n📊 Arizona dispensaries by source:');
      console.table(total.rows);
    }
  } catch (error) {
    console.error(`❌ Error: ${error}`);
    throw error;
  } finally {
    // Release both resources independently: a failure closing the browser must
    // not prevent the pool from being ended, nor mask the original error.
    const cleanup = await Promise.allSettled([browser.close(), pool.end()]);
    for (const result of cleanup) {
      if (result.status === 'rejected') {
        console.error(`Cleanup failed: ${result.reason}`);
      }
    }
  }
}

// The error has already been logged inside the function; just make the process exit non-zero.
void scrapeLeaflyMapMarkers().catch(() => {
  process.exitCode = 1;
});