Files
cannaiq/backend/scrape-leafly-map-markers.ts
2025-11-28 19:45:44 -07:00

213 lines
8.2 KiB
TypeScript

import { chromium } from 'playwright-extra';
import stealth from 'puppeteer-extra-plugin-stealth';
import { pool } from './src/db/migrate';
// Register the stealth plugin on playwright-extra's chromium at module load,
// i.e. before the scraper below launches a browser.
// NOTE(review): presumably this is here to make the automated browser harder
// for Leafly to detect — confirm against the plugin's documentation.
chromium.use(stealth());
/**
 * Scrapes the Leafly Arizona dispensary listing and upserts the results into
 * the `stores` table.
 *
 * Loads up to six listing pages in a Chromium browser, pulls the store list
 * out of each page's `__NEXT_DATA__` payload (falling back to scanning
 * `window` globals), then for every item that has both a name and a slug
 * either updates the existing row (matched case-insensitively on name plus
 * state) or inserts a new row tagged `data_source = 'leafly'`.
 *
 * The browser is closed and the shared `pool` is ended before the returned
 * promise settles, whether the run succeeded or failed.
 *
 * @returns A promise that resolves once scraping, saving and cleanup are done.
 * @throws Rethrows (after logging) any error raised outside the per-store
 *   save loop, e.g. a navigation timeout or a failed summary query. Errors
 *   while saving an individual store are logged and skipped.
 */
async function scrapeLeaflyMapMarkers(): Promise<void> {
  console.log('🗺️ Extracting dispensaries from Leafly Arizona map...\n');

  // Declared outside the try so that `finally` can close it, while the launch
  // itself (and context/page creation) still happens under the try: a failure
  // during setup must not leave a browser process or the pool behind.
  let browser: Awaited<ReturnType<typeof chromium.launch>> | undefined;

  try {
    browser = await chromium.launch({
      headless: false,
    });
    const context = await browser.newContext({
      viewport: { width: 1920, height: 1080 },
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    });
    const page = await context.newPage();

    const allStores: any[] = [];
    const maxPages = 6; // Fetch 6 pages (25 per page = ~150 total)

    for (let currentPage = 1; currentPage <= maxPages; currentPage++) {
      const url = currentPage === 1
        ? 'https://www.leafly.com/dispensaries/arizona'
        : `https://www.leafly.com/dispensaries/arizona?page=${currentPage}`;
      console.log(`📄 Loading page ${currentPage}/${maxPages}: ${url}`);
      await page.goto(url, {
        waitUntil: 'networkidle',
        timeout: 60000
      });
      // Wait for page to fully load
      await page.waitForTimeout(3000);
      console.log(`🔍 Extracting map data from page ${currentPage}...\n`);

      // This callback is serialized and executed inside the page, so it must
      // stay self-contained: it cannot reference anything from this module.
      const mapData = await page.evaluate(() => {
        // Method 1: Check __NEXT_DATA__ which Next.js uses
        const nextDataEl = document.getElementById('__NEXT_DATA__');
        if (nextDataEl && nextDataEl.textContent) {
          try {
            const data = JSON.parse(nextDataEl.textContent);
            // Navigate through the data structure to find stores
            if (data?.props?.pageProps) {
              const pageProps = data.props.pageProps;
              // Try different common property names
              if (pageProps.stores) return pageProps.stores;
              if (pageProps.dispensaries) return pageProps.dispensaries;
              if (pageProps.locations) return pageProps.locations;
              if (pageProps.storeLocatorResults?.data) {
                // Return the whole data object to see pagination info
                return { stores: pageProps.storeLocatorResults.data.organicStores, fullData: pageProps.storeLocatorResults.data };
              }
              if (pageProps.storeLocatorResults) return pageProps.storeLocatorResults;
              if (pageProps.initMapData) return pageProps.initMapData;
              if (pageProps.initialState?.stores) return pageProps.initialState.stores;
              if (pageProps.initialData?.stores) return pageProps.initialData.stores;
              if (pageProps.initialData?.dispensaries) return pageProps.initialData.dispensaries;
              // Sometimes the data is nested deeper
              const keys = Object.keys(pageProps);
              for (const key of keys) {
                if (pageProps[key]?.stores) return pageProps[key].stores;
                if (pageProps[key]?.dispensaries) return pageProps[key].dispensaries;
                if (Array.isArray(pageProps[key]) && pageProps[key].length > 100) {
                  // Likely the stores array
                  return pageProps[key];
                }
              }
              // Return the whole pageProps for inspection
              return { debug: 'pageProps', keys: Object.keys(pageProps), data: pageProps };
            }
          } catch (e) {
            return { error: 'Error parsing __NEXT_DATA__', message: String(e) };
          }
        }
        // Method 2: Check window object for any map-related data
        const windowKeys = Object.keys(window);
        for (const key of windowKeys) {
          try {
            const value = (window as any)[key];
            if (Array.isArray(value) && value.length > 100) {
              // Check if it looks like store data
              const sample = value[0];
              if (sample && (sample.name || sample.storeName || sample.dispensaryName)) {
                return value;
              }
            }
          } catch (e) {
            // Skip: some window properties throw on access
          }
        }
        return { error: 'No map data found' };
      });

      // Handle different return formats
      let stores: any[] = [];
      if (mapData && mapData.stores && mapData.fullData) {
        // Got our custom format with pagination info
        stores = mapData.stores;
        console.log(` ✅ Found ${stores.length} stores on page ${currentPage}`);
      } else if (Array.isArray(mapData)) {
        stores = mapData;
        console.log(` ✅ Found ${stores.length} stores on page ${currentPage}`);
      } else {
        console.log(` ⚠️ No stores found on page ${currentPage}`);
        // Surface whatever the in-page extractor reported instead of
        // discarding it, so a layout change on the site can be diagnosed.
        if (mapData?.error) {
          const detail = mapData.message ? `: ${mapData.message}` : '';
          console.log(`    ↳ ${mapData.error}${detail}`);
        } else if (Array.isArray(mapData?.keys)) {
          console.log(`    ↳ pageProps keys: ${mapData.keys.join(', ')}`);
        }
      }

      // Add stores from this page to allStores
      if (Array.isArray(stores) && stores.length > 0) {
        allStores.push(...stores);
      }
    }

    console.log(`\n📊 Total stores collected from ${maxPages} pages: ${allStores.length}`);

    // Now save all collected stores
    if (allStores.length > 0) {
      console.log('\n💾 Saving all dispensaries to database...\n');
      let savedCount = 0;
      let updatedCount = 0;

      for (const item of allStores) {
        try {
          // Extract fields - Leafly structure
          const slug = item.slug || '';
          // Fall back to a title-cased version of the slug when no name field is present.
          const name = item.name || item.storeName || item.dispensaryName || item.title ||
            (slug ? slug.replace(/-/g, ' ').replace(/\b\w/g, (l: string) => l.toUpperCase()) : '');
          const address = item.address?.address1 || item.address?.streetAddress || item.streetAddress;
          const city = item.address?.city || item.city;
          const state = item.address?.state || item.state || 'AZ';
          const zip = item.address?.zip || item.zip || item.zipCode;
          const lat = item.address?.lat || item.latitude || item.lat;
          const lng = item.address?.lon || item.address?.lng || item.longitude || item.lng;
          const phone = item.phone || item.phoneNumber;
          const leaflyUrl = slug ? `https://www.leafly.com/dispensaries/${slug}` : '';
          const website = item.website || item.url || leaflyUrl;
          const dutchieUrl = item.dutchieUrl || item.menuUrl || leaflyUrl || ''; // Use Leafly URL as fallback

          if (!name || !slug) continue;

          // Check if exists
          const existing = await pool.query(
            'SELECT id FROM stores WHERE LOWER(name) = LOWER($1) AND state = $2',
            [name, state]
          );

          if (existing.rows.length > 0) {
            // COALESCE keeps the stored value whenever the scraped one is missing.
            await pool.query(`
              UPDATE stores SET
                address = COALESCE($1, address),
                city = COALESCE($2, city),
                zip = COALESCE($3, zip),
                phone = COALESCE($4, phone),
                website = COALESCE($5, website),
                latitude = COALESCE($6, latitude),
                longitude = COALESCE($7, longitude),
                updated_at = CURRENT_TIMESTAMP
              WHERE id = $8
            `, [address, city, zip, phone, website, lat, lng, existing.rows[0].id]);
            updatedCount++;
          } else {
            await pool.query(`
              INSERT INTO stores (
                name, slug, dutchie_url, address, city, state, zip, phone, website,
                latitude, longitude, data_source, active, created_at, updated_at
              ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, 'leafly', true, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
            `, [name, slug, dutchieUrl, address, city, state, zip, phone, website, lat, lng]);
            savedCount++;
          }
        } catch (error) {
          // Best effort: one bad record must not abort the rest of the batch.
          console.error(`Error saving dispensary: ${error}`);
        }
      }

      console.log(`\n✅ Saved ${savedCount} new dispensaries`);
      console.log(`✅ Updated ${updatedCount} existing dispensaries`);

      // Show total by source
      const total = await pool.query(`SELECT COUNT(*) as total, data_source FROM stores WHERE state = 'AZ' GROUP BY data_source`);
      console.log('\n📊 Arizona dispensaries by source:');
      console.table(total.rows);
    }
  } catch (error) {
    console.error(`❌ Error: ${error}`);
    throw error;
  } finally {
    // Nested so that the pool is ended even if closing the browser throws,
    // and so that a browser that never launched is simply skipped.
    try {
      await browser?.close();
    } finally {
      await pool.end();
    }
  }
}
// Run the scraper when this file is executed. The function rethrows its
// failures, so the rejection is handled here: report it and mark the process
// as failed rather than leaving an unhandled promise rejection.
scrapeLeaflyMapMarkers().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});