213 lines
8.2 KiB
TypeScript
213 lines
8.2 KiB
TypeScript
import { chromium } from 'playwright-extra';
|
|
import stealth from 'puppeteer-extra-plugin-stealth';
|
|
import { pool } from './src/db/migrate';
|
|
|
|
// Register the stealth plugin on the shared `chromium` launcher at module load,
// before scrapeLeaflyMapMarkers() launches a browser.
// NOTE(review): presumably this is here to make the automated browser harder
// for the site to detect — confirm against the plugin's documentation.
chromium.use(stealth());
|
|
|
|
/**
 * Scrapes the Leafly Arizona dispensary listing and upserts every store found
 * into the `stores` table.
 *
 * Flow:
 *  1. Launch a headed Chromium instance and load listing pages 1..6.
 *  2. On each page, pull the store list out of the `__NEXT_DATA__` JSON blob
 *     (falling back to scanning `window` for a large store-like array).
 *  3. For each store, update the row whose name/state matches, or insert a
 *     new row tagged with data_source = 'leafly'.
 *  4. Print a per-source count of Arizona stores.
 *
 * The browser is closed whenever it was launched, and the shared `pool` is
 * always ended, even when launching, scraping, or closing the browser fails.
 *
 * @throws Rethrows any error raised outside the per-store save loop, after
 *   logging it. Errors while saving an individual store are logged and skipped.
 */
async function scrapeLeaflyMapMarkers() {
  console.log('🗺️ Extracting dispensaries from Leafly Arizona map...\n');

  const maxPages = 6; // Fetch 6 pages (25 per page = ~150 total)

  try {
    const browser = await chromium.launch({
      headless: false,
    });

    try {
      // Context and page creation live inside the try so that a failure here
      // still closes the browser that was just launched.
      const context = await browser.newContext({
        viewport: { width: 1920, height: 1080 },
        userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      });
      const page = await context.newPage();

      // Raw, unvalidated store objects exactly as the site serialised them.
      const allStores: any[] = [];

      for (let currentPage = 1; currentPage <= maxPages; currentPage++) {
        const url = buildLeaflyListingUrl(currentPage);
        console.log(`📄 Loading page ${currentPage}/${maxPages}: ${url}`);

        await page.goto(url, {
          waitUntil: 'networkidle',
          timeout: 60000,
        });

        // Wait for page to fully load
        await page.waitForTimeout(3000);

        console.log(`🔍 Extracting map data from page ${currentPage}...\n`);

        // This callback is executed inside the browser page, so it must stay
        // self-contained: it cannot call helpers defined in this module.
        const mapData = await page.evaluate(() => {
          // Method 1: Check __NEXT_DATA__ which Next.js uses
          const nextDataEl = document.getElementById('__NEXT_DATA__');
          if (nextDataEl && nextDataEl.textContent) {
            try {
              const data = JSON.parse(nextDataEl.textContent);

              // Navigate through the data structure to find stores
              if (data?.props?.pageProps) {
                const pageProps = data.props.pageProps;

                // Try different common property names
                if (pageProps.stores) return pageProps.stores;
                if (pageProps.dispensaries) return pageProps.dispensaries;
                if (pageProps.locations) return pageProps.locations;
                if (pageProps.storeLocatorResults?.data) {
                  // Return the whole data object to see pagination info
                  return {
                    stores: pageProps.storeLocatorResults.data.organicStores,
                    fullData: pageProps.storeLocatorResults.data,
                  };
                }
                if (pageProps.storeLocatorResults) return pageProps.storeLocatorResults;
                if (pageProps.initMapData) return pageProps.initMapData;
                if (pageProps.initialState?.stores) return pageProps.initialState.stores;
                if (pageProps.initialData?.stores) return pageProps.initialData.stores;
                if (pageProps.initialData?.dispensaries) return pageProps.initialData.dispensaries;

                // Sometimes the data is nested deeper
                for (const key of Object.keys(pageProps)) {
                  if (pageProps[key]?.stores) return pageProps[key].stores;
                  if (pageProps[key]?.dispensaries) return pageProps[key].dispensaries;
                  if (Array.isArray(pageProps[key]) && pageProps[key].length > 100) {
                    // Likely the stores array
                    return pageProps[key];
                  }
                }

                // Return the whole pageProps for inspection
                return { debug: 'pageProps', keys: Object.keys(pageProps), data: pageProps };
              }
            } catch (e) {
              return { error: 'Error parsing __NEXT_DATA__', message: String(e) };
            }
          }

          // Method 2: Check window object for any map-related data
          for (const key of Object.keys(window)) {
            try {
              const value = (window as any)[key];
              if (Array.isArray(value) && value.length > 100) {
                // Check if it looks like store data
                const sample = value[0];
                if (sample && (sample.name || sample.storeName || sample.dispensaryName)) {
                  return value;
                }
              }
            } catch {
              // Reading this window property threw; skip it and keep scanning.
            }
          }

          return { error: 'No map data found' };
        });

        const stores = storesFromMapData(mapData);
        if (stores) {
          console.log(` ✅ Found ${stores.length} stores on page ${currentPage}`);
          allStores.push(...stores);
        } else {
          console.log(` ⚠️ No stores found on page ${currentPage}`);
        }
      }

      console.log(`\n📊 Total stores collected from ${maxPages} pages: ${allStores.length}`);

      // Now save all collected stores
      if (allStores.length > 0) {
        console.log('\n💾 Saving all dispensaries to database...\n');

        let savedCount = 0;
        let updatedCount = 0;

        for (const item of allStores) {
          try {
            const store = toStoreRecord(item);
            if (!store) continue; // No usable name/slug: nothing to key the row on.

            if ((await upsertStore(store)) === 'updated') {
              updatedCount++;
            } else {
              savedCount++;
            }
          } catch (error) {
            // Best effort: one bad record must not abort the rest of the batch.
            console.error(`Error saving dispensary: ${error}`);
          }
        }

        console.log(`\n✅ Saved ${savedCount} new dispensaries`);
        console.log(`✅ Updated ${updatedCount} existing dispensaries`);

        // Show total by source
        const total = await pool.query(`SELECT COUNT(*) as total, data_source FROM stores WHERE state = 'AZ' GROUP BY data_source`);
        console.log('\n📊 Arizona dispensaries by source:');
        console.table(total.rows);
      }
    } finally {
      await browser.close();
    }
  } catch (error) {
    console.error(`❌ Error: ${error}`);
    throw error;
  } finally {
    // Runs even if launching or closing the browser failed.
    await pool.end();
  }
}

/** Returns the listing URL for a 1-based page number (page 1 has no query string). */
function buildLeaflyListingUrl(pageNumber: number): string {
  return pageNumber === 1
    ? 'https://www.leafly.com/dispensaries/arizona'
    : `https://www.leafly.com/dispensaries/arizona?page=${pageNumber}`;
}

/**
 * Picks the store array out of whatever the in-page extractor returned.
 * Returns null for the `{ error }` / `{ debug }` diagnostic shapes and for a
 * `stores` value that is not actually an array.
 */
function storesFromMapData(mapData: any): any[] | null {
  if (mapData && mapData.stores && mapData.fullData) {
    // Custom `{ stores, fullData }` shape built from storeLocatorResults.data.
    return Array.isArray(mapData.stores) ? mapData.stores : null;
  }
  return Array.isArray(mapData) ? mapData : null;
}

/**
 * Normalises one raw store object into the column values written to `stores`.
 * Returns null when the item has no slug (or no name can be derived), since
 * such an item is skipped rather than saved.
 */
function toStoreRecord(item: any) {
  const slug: string = item.slug || '';
  // Fall back to a title-cased slug ("my-store" -> "My Store") when no name field exists.
  const name: string = item.name || item.storeName || item.dispensaryName || item.title ||
    (slug ? slug.replace(/-/g, ' ').replace(/\b\w/g, (l: string) => l.toUpperCase()) : '');

  if (!name || !slug) return null;

  const leaflyUrl = `https://www.leafly.com/dispensaries/${slug}`;

  return {
    name,
    slug,
    state: item.address?.state || item.state || 'AZ',
    address: item.address?.address1 || item.address?.streetAddress || item.streetAddress || null,
    city: item.address?.city || item.city || null,
    zip: item.address?.zip || item.zip || item.zipCode || null,
    lat: item.address?.lat || item.latitude || item.lat || null,
    lng: item.address?.lon || item.address?.lng || item.longitude || item.lng || null,
    phone: item.phone || item.phoneNumber || null,
    // Website supplied by the listing itself, if any (no Leafly fallback applied).
    ownWebsite: item.website || item.url || null,
    leaflyUrl,
    dutchieUrl: item.dutchieUrl || item.menuUrl || leaflyUrl, // Use Leafly URL as fallback
  };
}

type StoreRecord = NonNullable<ReturnType<typeof toStoreRecord>>;

/**
 * Updates the existing row with the same (case-insensitive) name and state, or
 * inserts a new one. Returns which of the two happened.
 */
async function upsertStore(store: StoreRecord): Promise<'inserted' | 'updated'> {
  // Check if exists
  const existing = await pool.query(
    'SELECT id FROM stores WHERE LOWER(name) = LOWER($1) AND state = $2',
    [store.name, store.state]
  );

  if (existing.rows.length > 0) {
    // Website precedence: the listing's own site, then whatever is already
    // stored, then the Leafly page. Passing the Leafly fallback as $5 would
    // make COALESCE overwrite a stored website on every run.
    await pool.query(`
      UPDATE stores SET
        address = COALESCE($1, address),
        city = COALESCE($2, city),
        zip = COALESCE($3, zip),
        phone = COALESCE($4, phone),
        website = COALESCE($5, website, $8),
        latitude = COALESCE($6, latitude),
        longitude = COALESCE($7, longitude),
        updated_at = CURRENT_TIMESTAMP
      WHERE id = $9
    `, [
      store.address, store.city, store.zip, store.phone, store.ownWebsite,
      store.lat, store.lng, store.leaflyUrl, existing.rows[0].id,
    ]);
    return 'updated';
  }

  await pool.query(`
    INSERT INTO stores (
      name, slug, dutchie_url, address, city, state, zip, phone, website,
      latitude, longitude, data_source, active, created_at, updated_at
    ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, 'leafly', true, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
  `, [
    store.name, store.slug, store.dutchieUrl, store.address, store.city, store.state,
    store.zip, store.phone, store.ownWebsite || store.leaflyUrl, store.lat, store.lng,
  ]);
  return 'inserted';
}
|
|
|
|
// Script entry point. Attach a rejection handler instead of leaving a floating
// promise: report the failure and mark the process as failed, rather than
// relying on the runtime's unhandled-rejection behaviour.
scrapeLeaflyMapMarkers().catch((error: unknown) => {
  console.error('❌ Scrape failed:', error);
  process.exitCode = 1;
});
|