- Add backend stale process monitoring API (/api/stale-processes)
- Add users management route
- Add frontend landing page and stale process monitor UI on /scraper-tools
- Move old development scripts to backend/archive/
- Update frontend build with new features

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
185 lines
6.3 KiB
TypeScript
185 lines
6.3 KiB
TypeScript
import { chromium } from 'playwright-extra';
|
|
import stealth from 'puppeteer-extra-plugin-stealth';
|
|
import { pool } from './src/db/migrate';
|
|
|
|
// Register the stealth plugin on the playwright-extra Chromium launcher at
// module load, before scrapeLeaflyMap() launches a browser.
// NOTE(review): presumably this makes the automated browser harder to detect —
// confirm against the plugin's documentation.
chromium.use(stealth());
|
|
|
|
/**
 * Shape of a single dispensary record as this script expects to persist it.
 *
 * Only `name` and `state` are required; every other field is optional because
 * the scraped payload may omit it.
 */
interface MapDispensary {
  /** Display name of the dispensary. */
  name: string;
  /** Street address, when available. */
  address?: string;
  /** City, when available. */
  city?: string;
  /** State value stored alongside the name (the scraper defaults this to 'AZ'). */
  state: string;
  /** Postal code, when available. */
  zip?: string;
  /** Latitude coordinate, when available. */
  latitude?: number;
  /** Longitude coordinate, when available. */
  longitude?: number;
  /** Contact phone number, when available. */
  phone?: string;
  /** Website URL, when available. */
  website?: string;
}
|
|
|
|
/**
 * Returns the first candidate that is a non-blank string or a finite number.
 * Numbers are converted to strings; anything else (objects, null, '') is skipped
 * so that arbitrary scraped JSON is never bound to a text column.
 */
function firstText(...candidates: unknown[]): string | undefined {
  for (const candidate of candidates) {
    if (typeof candidate === 'string' && candidate.trim() !== '') return candidate;
    if (typeof candidate === 'number' && Number.isFinite(candidate)) return String(candidate);
  }
  return undefined;
}

/**
 * Returns the first candidate that is a finite number or a numeric string.
 * Unlike a `||` chain this keeps a legitimate coordinate of 0.
 */
function firstCoordinate(...candidates: unknown[]): number | undefined {
  for (const candidate of candidates) {
    let value = Number.NaN;
    if (typeof candidate === 'number') {
      value = candidate;
    } else if (typeof candidate === 'string' && candidate.trim() !== '') {
      value = Number(candidate);
    }
    if (Number.isFinite(value)) return value;
  }
  return undefined;
}

/**
 * Normalises one scraped entry into the column values written to `stores`.
 * Returns null when the entry is not an object or carries no usable name.
 * Missing optional values become null so the SQL COALESCE keeps existing data.
 */
function toStoreRecord(raw: unknown) {
  if (typeof raw !== 'object' || raw === null) return null;
  // Narrowed to a non-null object above; individual fields are still validated.
  const entry = raw as Record<string, unknown>;

  const name = firstText(entry.name, entry.storeName, entry.title);
  if (name === undefined) return null;

  return {
    name,
    address: firstText(entry.address, entry.streetAddress) ?? null,
    city: firstText(entry.city, entry.locality) ?? null,
    state: firstText(entry.state, entry.region) ?? 'AZ',
    zip: firstText(entry.zip, entry.postalCode) ?? null,
    latitude: firstCoordinate(entry.latitude, entry.lat) ?? null,
    longitude: firstCoordinate(entry.longitude, entry.lng, entry.lon) ?? null,
    phone: firstText(entry.phone, entry.telephone) ?? null,
    website: firstText(entry.website, entry.url) ?? null,
  };
}

/**
 * Inserts or updates one `stores` row per scraped entry, matching existing rows
 * on case-insensitive name plus state. A failure on one entry is logged and does
 * not stop the remaining entries.
 *
 * @returns how many rows were inserted and how many were updated.
 */
async function saveDispensaries(
  entries: readonly unknown[]
): Promise<{ savedCount: number; updatedCount: number }> {
  let savedCount = 0;
  let updatedCount = 0;

  for (const entry of entries) {
    try {
      const record = toStoreRecord(entry);
      if (record === null) continue;

      // Check if exists
      const existing = await pool.query(
        'SELECT id FROM stores WHERE LOWER(name) = LOWER($1) AND state = $2',
        [record.name, record.state]
      );

      if (existing.rows.length > 0) {
        // COALESCE keeps the stored value whenever the scrape had nothing for a column.
        await pool.query(
          `
          UPDATE stores SET
            address = COALESCE($1, address),
            city = COALESCE($2, city),
            zip = COALESCE($3, zip),
            phone = COALESCE($4, phone),
            website = COALESCE($5, website),
            latitude = COALESCE($6, latitude),
            longitude = COALESCE($7, longitude),
            updated_at = CURRENT_TIMESTAMP
          WHERE id = $8
        `,
          [
            record.address,
            record.city,
            record.zip,
            record.phone,
            record.website,
            record.latitude,
            record.longitude,
            existing.rows[0].id,
          ]
        );
        updatedCount++;
      } else {
        await pool.query(
          `
          INSERT INTO stores (
            name, address, city, state, zip, phone, website,
            latitude, longitude, active, created_at, updated_at
          ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, true, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
        `,
          [
            record.name,
            record.address,
            record.city,
            record.state,
            record.zip,
            record.phone,
            record.website,
            record.latitude,
            record.longitude,
          ]
        );
        savedCount++;
      }
    } catch (error) {
      console.error(`Error saving dispensary: ${error}`);
    }
  }

  return { savedCount, updatedCount };
}

/**
 * Opens the Leafly Arizona dispensary listing in a visible Chromium window,
 * looks for embedded store data (primarily the `__NEXT_DATA__` payload), prints
 * what it finds, and upserts any resulting array of entries into `stores`.
 *
 * The browser is always closed and the database pool is always ended, even when
 * setup or scraping fails.
 *
 * @throws rethrows any error from browser setup, navigation or extraction after
 *   logging it.
 */
async function scrapeLeaflyMap(): Promise<void> {
  console.log('🗺️ Scraping dispensaries from Leafly Arizona map...\n');

  try {
    const browser = await chromium.launch({
      headless: false, // Show browser to see what's happening
    });

    try {
      // Context and page are created inside the try so a failure here still closes the browser.
      const context = await browser.newContext({
        viewport: { width: 1920, height: 1080 },
        userAgent:
          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      });
      const page = await context.newPage();

      console.log('📄 Loading Leafly Arizona page...');

      await page.goto('https://www.leafly.com/dispensaries/arizona', {
        waitUntil: 'networkidle',
        timeout: 60000,
      });

      // Wait for page to fully load
      await page.waitForTimeout(5000);

      // Try to extract data from window object or JSON-LD.
      // This callback executes inside the page, so it must stay self-contained and
      // its console output goes to the browser console rather than this process.
      const mapData = await page.evaluate(() => {
        // Method 1: Check for JSON-LD structured data (diagnostic logging only)
        document.querySelectorAll('script[type="application/ld+json"]').forEach(script => {
          try {
            const data = JSON.parse(script.textContent || '');
            if (data && typeof data === 'object') {
              console.log('Found JSON-LD:', Object.keys(data));
            }
          } catch {
            // Skip invalid JSON
          }
        });

        // Method 2: Check window object for data (diagnostic logging only)
        const keywords = ['store', 'dispensar', 'location', 'map', 'data'];
        const windowKeys = Object.keys(window).filter(key => {
          const lowered = key.toLowerCase();
          return keywords.some(keyword => lowered.includes(keyword));
        });
        console.log('Interesting window keys:', windowKeys);

        // Method 3: Check for __NEXT_DATA__ (Next.js apps often use this)
        const nextData = document.getElementById('__NEXT_DATA__');
        if (nextData) {
          try {
            const data = JSON.parse(nextData.textContent || '');
            console.log('Found __NEXT_DATA__:', Object.keys(data));

            // Navigate through the data to find dispensaries
            const pageProps = data?.props?.pageProps;
            if (pageProps) {
              console.log('PageProps keys:', Object.keys(pageProps));

              // Common patterns for store data, checked in priority order
              for (const key of ['stores', 'dispensaries', 'locations']) {
                if (pageProps[key]) {
                  console.log(`Found ${key} array:`, pageProps[key].length);
                  return pageProps[key];
                }
              }
              if (pageProps.initialData) {
                console.log('Found initialData:', Object.keys(pageProps.initialData));
                return pageProps.initialData;
              }
            }
          } catch (e) {
            console.error('Error parsing __NEXT_DATA__:', e);
          }
        }

        // Method 4: Check for map markers (diagnostic logging only)
        const markers = document.querySelectorAll(
          '[class*="marker"], [class*="pin"], [data-marker]'
        );
        console.log('Found map markers:', markers.length);

        // Nothing usable was found.
        return [];
      });

      console.log('\n📊 Map data extracted:');
      console.log(JSON.stringify(mapData, null, 2));

      // If we found structured data, process it
      if (Array.isArray(mapData) && mapData.length > 0) {
        console.log(`\n✅ Found ${mapData.length} dispensaries from map data`);

        const { savedCount, updatedCount } = await saveDispensaries(mapData);

        console.log(`\n✅ Saved ${savedCount} new dispensaries`);
        console.log(`✅ Updated ${updatedCount} existing dispensaries`);
      } else {
        // Make the no-op explicit instead of exiting silently.
        console.log('\n⚠️ No dispensary array found in page data; nothing was saved');
      }
    } finally {
      await browser.close();
    }
  } catch (error) {
    console.error(`Error: ${error}`);
    throw error;
  } finally {
    // Runs even if launching or closing the browser failed.
    await pool.end();
  }
}
|
|
|
|
// Script entry point. The returned promise is handled explicitly so a failed
// run is reported and yields a non-zero exit code instead of surfacing as an
// unhandled rejection.
scrapeLeaflyMap().catch((error: unknown) => {
  const reason = error instanceof Error ? error.message : String(error);
  console.error(`Scrape failed: ${reason}`);
  process.exitCode = 1;
});
|