// File-viewer metadata (extraction residue): 259 lines · 7.6 KiB · TypeScript
import { chromium } from 'playwright-extra';
|
|
import stealth from 'puppeteer-extra-plugin-stealth';
|
|
import { pool } from './src/db/migrate';
|
|
|
|
// Register the stealth plugin on the shared `chromium` object at module load,
// i.e. before the scraper below launches a browser.
// NOTE(review): presumably this is meant to make the automated browser harder
// to detect — confirm against the plugin's documentation.
chromium.use(stealth());
|
|
|
|
/**
 * One business listing scraped from a Google Maps search-results page.
 *
 * Only `name` is required; every other field is filled in on a best-effort
 * basis from whatever text or link data the result card exposes.
 */
interface GoogleMapsPlace {
  /** Display name read from the result card. */
  name: string;
  /** Full address text as it appears on the card (unparsed). */
  address?: string;
  /** City, taken from the second-to-last comma-separated part of the address. */
  city?: string;
  /** State code; the scraper in this file always sets it to 'AZ'. */
  state?: string;
  /** Five-digit ZIP found in the last comma-separated part of the address. */
  zip?: string;
  /** Latitude parsed from an `@lat,lng` fragment in the result link. */
  latitude?: number;
  /** Longitude parsed from an `@lat,lng` fragment in the result link. */
  longitude?: number;
  /** Phone number. Not populated by the scraper in this file. */
  phone?: string;
  /** Website URL. Not populated by the scraper in this file. */
  website?: string;
  /** Star rating parsed from the card's "… stars" aria-label. */
  rating?: number;
  /** Review count parsed from the card's "… reviews" aria-label. */
  review_count?: number;
  /**
   * Path segment that follows `place/` in the result link.
   * NOTE(review): this looks like the URL name slug rather than a stable
   * Google place identifier — confirm before using it as a key.
   */
  place_id?: string;
  /** Absolute URL of the listing, built from the result link's `href`. */
  google_maps_url?: string;
}
|
|
|
|
/** Unparsed strings collected from one result link inside the browser page. */
interface RawPlaceCard {
  /** Trimmed text of the headline element, or null when none was found. */
  nameText: string | null;
  /** `aria-label` of the "… stars" rating element, or null when none was found. */
  ratingLabel: string | null;
  /** Trimmed text of every `[class*="fontBody"]` element under the link. */
  bodyTexts: string[];
  /** Raw `href` attribute of the link. */
  href: string | null;
}

/** A parsed listing, ready to be written to the `stores` table. */
interface ScrapedPlace {
  name: string;
  state: string;
  address?: string;
  city?: string;
  zip?: string;
  latitude?: number;
  longitude?: number;
  rating?: number;
  review_count?: number;
  place_id?: string;
  google_maps_url?: string;
}

/**
 * Parses a rating aria-label such as "4.5 stars 1,234 Reviews".
 *
 * @param label Text of the rating element's `aria-label`.
 * @returns The rating and/or review count that could be recognised; keys are
 *          omitted (not set to undefined) when the label lacks them.
 */
function parseRatingLabel(label: string): Pick<ScrapedPlace, 'rating' | 'review_count'> {
  const parsed: Pick<ScrapedPlace, 'rating' | 'review_count'> = {};

  const ratingText = label.match(/(\d+\.?\d*)\s*stars?/i)?.[1];
  if (ratingText !== undefined) {
    parsed.rating = parseFloat(ratingText);
  }

  // Accept comma-grouped counts: a plain `\d+` would read "1,234 Reviews" as 234.
  const reviewText = label.match(/(\d[\d,]*)\s*reviews?/i)?.[1];
  if (reviewText !== undefined) {
    parsed.review_count = parseInt(reviewText.replace(/,/g, ''), 10);
  }

  return parsed;
}

/**
 * Looks for address-shaped strings ("<digits> <words>,") among card texts and
 * splits them into address / city / zip.
 *
 * Texts are processed in order and later matches overwrite earlier ones field
 * by field, so a field set by an earlier text survives if a later text does not
 * provide it.
 *
 * @param texts Trimmed text contents of the card's body elements.
 * @returns Whichever of `address`, `city`, `zip` could be derived.
 */
function parseAddressTexts(texts: readonly string[]): Pick<ScrapedPlace, 'address' | 'city' | 'zip'> {
  const parsed: Pick<ScrapedPlace, 'address' | 'city' | 'zip'> = {};

  for (const text of texts) {
    if (!/\d+\s+[\w\s]+,/.test(text)) {
      continue;
    }
    parsed.address = text;

    const parts = text.split(',').map((part) => part.trim());
    if (parts.length < 2) {
      continue;
    }

    const zip = (parts[parts.length - 1] ?? '').match(/\b(\d{5})\b/)?.[1];
    if (zip !== undefined) {
      parsed.zip = zip;
    }

    // With three or more parts, the second-to-last is taken to be the city.
    const city = parts[parts.length - 2];
    if (parts.length >= 3 && city !== undefined) {
      parsed.city = city;
    }
  }

  return parsed;
}

/**
 * Derives link-based fields from a result link's `href`.
 *
 * @param href Non-empty `href` attribute value (absolute or site-relative).
 * @returns The absolute URL, plus the segment after `place/` and `@lat,lng`
 *          coordinates when the href contains them.
 */
function parsePlaceHref(
  href: string
): Pick<ScrapedPlace, 'google_maps_url' | 'place_id' | 'latitude' | 'longitude'> {
  const parsed: Pick<ScrapedPlace, 'google_maps_url' | 'place_id' | 'latitude' | 'longitude'> = {
    google_maps_url: href.startsWith('http') ? href : `https://www.google.com${href}`,
  };

  // Captures the path segment right after `place/`.
  const placeSegment = href.match(/place\/(.*?)(?:\/|$)/)?.[1];
  if (placeSegment !== undefined) {
    parsed.place_id = placeSegment;
  }

  const coordMatch = href.match(/@(-?\d+\.\d+),(-?\d+\.\d+)/);
  if (coordMatch) {
    const [, lat, lng] = coordMatch;
    if (lat !== undefined && lng !== undefined) {
      parsed.latitude = parseFloat(lat);
      parsed.longitude = parseFloat(lng);
    }
  }

  return parsed;
}

/**
 * Turns the raw strings of one card into a place record.
 *
 * @returns The parsed place, or null when the card has no (non-empty) name.
 */
function buildPlace(raw: RawPlaceCard): ScrapedPlace | null {
  if (!raw.nameText) {
    return null;
  }
  return {
    name: raw.nameText,
    state: 'AZ',
    ...(raw.ratingLabel ? parseRatingLabel(raw.ratingLabel) : {}),
    ...parseAddressTexts(raw.bodyTexts),
    ...(raw.href ? parsePlaceHref(raw.href) : {}),
  };
}

/**
 * Updates the AZ store with the same (case-insensitive) name, or inserts a new
 * row when there is none.
 *
 * @returns Whether a row was updated or inserted.
 * @throws Whatever `pool.query` rejects with.
 */
async function upsertArizonaStore(place: ScrapedPlace): Promise<'inserted' | 'updated'> {
  const existing = await pool.query(
    'SELECT id FROM stores WHERE LOWER(name) = LOWER($1) AND state = $2',
    [place.name, 'AZ']
  );

  if (existing.rows.length > 0) {
    // COALESCE keeps the stored value whenever the scrape did not yield one.
    await pool.query(`
      UPDATE stores SET
        address = COALESCE($1, address),
        city = COALESCE($2, city),
        zip = COALESCE($3, zip),
        latitude = COALESCE($4, latitude),
        longitude = COALESCE($5, longitude),
        updated_at = CURRENT_TIMESTAMP
      WHERE id = $6
    `, [
      place.address,
      place.city,
      place.zip,
      place.latitude,
      place.longitude,
      existing.rows[0].id
    ]);
    return 'updated';
  }

  await pool.query(`
    INSERT INTO stores (
      name, address, city, state, zip,
      latitude, longitude, active,
      created_at, updated_at
    ) VALUES ($1, $2, $3, $4, $5, $6, $7, true, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
  `, [
    place.name,
    place.address,
    place.city,
    'AZ',
    place.zip,
    place.latitude,
    place.longitude
  ]);
  return 'inserted';
}

/**
 * Searches Google Maps for "dispensaries in Arizona", scrolls the results feed
 * until it stops growing (at most 50 scrolls), parses every result link and
 * inserts/updates the matching rows in the `stores` table.
 *
 * The browser and the database pool are always released, including when the
 * browser fails to start or a page operation throws.
 *
 * @returns A promise that resolves once results are saved and resources closed.
 * @throws Rethrows (after logging) any error from browser start-up, navigation,
 *         extraction or the final count query. Per-place save errors are logged
 *         and skipped instead.
 */
async function scrapeGoogleMaps(): Promise<void> {
  console.log('🗺️ Scraping Arizona dispensaries from Google Maps...\n');

  try {
    const browser = await chromium.launch({
      headless: false,
    });

    try {
      const context = await browser.newContext({
        viewport: { width: 1920, height: 1080 },
        userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        geolocation: { latitude: 33.4484, longitude: -112.0740 }, // Phoenix, AZ
        permissions: ['geolocation'],
      });
      const page = await context.newPage();

      const searchQuery = 'dispensaries in Arizona';
      const url = `https://www.google.com/maps/search/${encodeURIComponent(searchQuery)}`;

      console.log(`🔍 Searching: ${searchQuery}`);
      console.log(`📄 Loading: ${url}\n`);

      await page.goto(url, {
        waitUntil: 'domcontentloaded',
        timeout: 60000
      });

      // Fixed pause to give the results time to render.
      await page.waitForTimeout(5000);

      console.log('📜 Scrolling to load all results...');

      const scrollPanel = page.locator('[role="feed"]').first();
      const maxScrolls = 50; // Upper bound so the loop always terminates
      let previousHeight = 0;

      for (let attempt = 1; attempt <= maxScrolls; attempt++) {
        // Best-effort scroll: a failure is logged and handled by the height check below.
        await scrollPanel.evaluate((el) => {
          el.scrollTop = el.scrollHeight;
        }).catch(() => {
          console.log('Could not scroll panel');
        });

        await page.waitForTimeout(2000);

        const currentHeight = await scrollPanel.evaluate((el) => el.scrollHeight).catch(() => null);

        // A failed read is not the same as "no more results"; report it as such.
        if (currentHeight === null) {
          console.log(' ⚠️ Could not read results panel height; stopping scroll');
          break;
        }

        if (currentHeight === previousHeight) {
          console.log(' ✅ Reached end of results');
          break;
        }

        previousHeight = currentHeight;
        console.log(` Scroll ${attempt}/${maxScrolls}`);
      }

      console.log('\n📦 Extracting dispensary data...');

      // Only raw strings are collected inside the page; the callback runs in the
      // browser and cannot call the Node-side parsing helpers.
      const rawCards: RawPlaceCard[] = await page.evaluate(() =>
        Array.from(document.querySelectorAll('a[href*="/maps/place/"]'), (card) => ({
          nameText: card
            .querySelector('[class*="fontHeadline"], [class*="fontBody"] div[class*="fontHeadline"]')
            ?.textContent?.trim() ?? null,
          ratingLabel: card
            .querySelector('[role="img"][aria-label*="stars"]')
            ?.getAttribute('aria-label') ?? null,
          bodyTexts: Array.from(
            card.querySelectorAll('[class*="fontBody"]'),
            (el) => el.textContent?.trim() ?? ''
          ),
          href: card.getAttribute('href'),
        }))
      );

      const places: ScrapedPlace[] = [];
      for (const raw of rawCards) {
        const place = buildPlace(raw);
        if (place) {
          places.push(place);
        }
      }

      console.log(`✅ Found ${places.length} dispensaries on map\n`);

      if (places.length > 0) {
        console.log('📋 Sample of first 5:');
        console.table(places.slice(0, 5).map((p) => ({
          name: p.name,
          city: p.city,
          rating: p.rating,
          reviews: p.review_count
        })));
      }

      console.log('\n💾 Saving to database...');

      let savedCount = 0;
      let updatedCount = 0;

      // Saved one at a time so that a repeated name later in the list is seen
      // as "existing" by the lookup instead of being inserted twice.
      for (const place of places) {
        try {
          const outcome = await upsertArizonaStore(place);
          if (outcome === 'updated') {
            updatedCount++;
          } else {
            savedCount++;
          }
        } catch (error) {
          // One bad row must not abort the rest of the batch.
          console.error(`Error saving ${place.name}: ${error}`);
        }
      }

      console.log(`\n✅ Saved ${savedCount} new dispensaries`);
      console.log(`✅ Updated ${updatedCount} existing dispensaries`);

      const total = await pool.query('SELECT COUNT(*) FROM stores WHERE state = $1', ['AZ']);
      console.log(`\n📊 Total Arizona dispensaries in database: ${total.rows[0].count}`);
    } finally {
      await browser.close();
    }
  } catch (error) {
    console.error(`❌ Error: ${error}`);
    throw error;
  } finally {
    // Runs even when the browser never started or failed to close.
    await pool.end();
  }
}
|
|
|
|
// Script entry point. The returned promise is handled explicitly so a failed
// run is reported and ends with a non-zero exit status instead of surfacing as
// an unhandled rejection. Some failures may already have been logged inside
// scrapeGoogleMaps; logging here also covers those that were not.
scrapeGoogleMaps().catch((error: unknown) => {
  console.error('❌ Scrape run failed:', error);
  process.exitCode = 1;
});
|