233 lines
7.3 KiB
TypeScript
233 lines
7.3 KiB
TypeScript
import { chromium } from 'playwright-extra';
|
|
import stealth from 'puppeteer-extra-plugin-stealth';
|
|
import { pool } from './src/db/migrate';
|
|
|
|
// Register the stealth plugin on the Chromium launcher at module load, so it is
// in place before scrapeLeaflyArizona() launches a browser.
// NOTE(review): the plugin presumably masks automation fingerprints — confirm
// against the puppeteer-extra-plugin-stealth documentation.
chromium.use(stealth());
|
|
|
|
/**
 * One dispensary listing as scraped from a Leafly directory page.
 *
 * Only `name` and `state` are always present; every other field is filled in
 * on a best-effort basis from whatever the listing card exposes.
 */
interface LeaflyDispensary {
  /** Display name taken from the card heading. */
  name: string;
  /** Street address, when one could be recognised in the card text. */
  address?: string;
  /** City name, when one could be recognised in the card text. */
  city?: string;
  /** State code; this scraper always sets it to 'AZ'. */
  state: string;
  /** Five-digit ZIP code, when one could be recognised in the card text. */
  zip?: string;
  latitude?: number;
  longitude?: number;
  phone?: string;
  /** Absolute URL built from the card's link. */
  website?: string;
  /** Rating parsed from the card's rating element. */
  rating?: number;
  /** Number of reviews parsed from the card's review element. */
  review_count?: number;
  /** MED, REC, or MED & REC */
  dispensary_type?: string;
}
|
|
|
|
/** Render a caught value as a short, log-friendly message. */
function formatError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Insert a dispensary into `stores`, or refresh the existing row that has the
 * same (case-insensitive) name and state.
 *
 * On update, COALESCE keeps the stored value whenever the scraped value is
 * null, so a sparse scrape never erases data that is already in the table.
 *
 * @returns 'updated' when a matching row already existed, otherwise 'inserted'.
 */
async function saveDispensary(dispensary: LeaflyDispensary): Promise<'inserted' | 'updated'> {
  const existing = await pool.query(
    'SELECT id FROM stores WHERE LOWER(name) = LOWER($1) AND state = $2',
    [dispensary.name, dispensary.state]
  );

  if (existing.rows.length > 0) {
    await pool.query(
      `
        UPDATE stores SET
          address = COALESCE($1, address),
          city = COALESCE($2, city),
          zip = COALESCE($3, zip),
          phone = COALESCE($4, phone),
          website = COALESCE($5, website),
          latitude = COALESCE($6, latitude),
          longitude = COALESCE($7, longitude),
          updated_at = CURRENT_TIMESTAMP
        WHERE id = $8
      `,
      [
        dispensary.address,
        dispensary.city,
        dispensary.zip,
        dispensary.phone,
        dispensary.website,
        dispensary.latitude,
        dispensary.longitude,
        existing.rows[0].id,
      ]
    );
    return 'updated';
  }

  await pool.query(
    `
      INSERT INTO stores (
        name, address, city, state, zip, phone, website,
        latitude, longitude, active, created_at, updated_at
      ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, true, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
    `,
    [
      dispensary.name,
      dispensary.address,
      dispensary.city,
      dispensary.state,
      dispensary.zip,
      dispensary.phone,
      dispensary.website,
      dispensary.latitude,
      dispensary.longitude,
    ]
  );
  return 'inserted';
}

/**
 * Scrape the Leafly Arizona dispensary directory and upsert every listing into
 * the `stores` table.
 *
 * Pages are fetched sequentially with a short pause between them. Listings are
 * de-duplicated on name + URL before saving, because the card selector can
 * match both a card and the link belonging to that same card. The browser and
 * the database pool are always closed before the function settles.
 *
 * @param maxPages Number of directory pages to walk. Defaults to 62, the page
 *   count the site reported when this script was written.
 * @throws Re-throws any error raised while scraping or while reading the
 *   final sample; per-row save failures are logged and skipped instead.
 */
async function scrapeLeaflyArizona(maxPages = 62): Promise<void> {
  console.log('🌿 Scraping Leafly Arizona dispensaries...\n');

  const browser = await chromium.launch({
    headless: true,
  });

  try {
    // Context and page are created inside the try so that a failure here
    // still reaches the cleanup in `finally`.
    const context = await browser.newContext({
      viewport: { width: 1920, height: 1080 },
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    });
    const page = await context.newPage();

    const allDispensaries: LeaflyDispensary[] = [];
    const seenKeys = new Set<string>();

    for (let currentPage = 1; currentPage <= maxPages; currentPage++) {
      const url = currentPage === 1
        ? 'https://www.leafly.com/dispensaries/arizona'
        : `https://www.leafly.com/dispensaries/arizona?page=${currentPage}`;

      console.log(`📄 Loading page ${currentPage}/${maxPages}...`);

      await page.goto(url, {
        waitUntil: 'domcontentloaded',
        timeout: 30000,
      });

      // Best effort: a page without cards is reported, not treated as fatal.
      try {
        await page.waitForSelector('[data-testid="dispensary-card"]', { timeout: 10000 });
      } catch {
        console.warn(`No dispensary cards found on page ${currentPage}`);
      }

      // The callback below is serialised and executed inside the browser, so
      // it must be self-contained: no outer variables, no module-level
      // helpers, and no syntax that a transpiler would rewrite into a helper
      // call (e.g. object spread on older targets).
      const dispensaries = await page.evaluate(() => {
        const results: LeaflyDispensary[] = [];

        const cards = document.querySelectorAll('[data-testid="dispensary-card"], a[href*="/dispensary/"]');

        cards.forEach((card) => {
          // A listing without a name cannot be matched in the database.
          const name = card.querySelector('h3, h4, [class*="name"]')?.textContent?.trim();
          if (!name) {
            return;
          }

          const dispensary: LeaflyDispensary = { name, state: 'AZ' };

          // URL: the card may itself sit inside the link, or contain it.
          const href = (card.closest('a') ?? card.querySelector('a'))?.getAttribute('href');
          if (href) {
            dispensary.website = href.startsWith('http') ? href : `https://www.leafly.com${href}`;
          }

          // Type badge (MED/REC)
          const typeText = card.querySelector('[class*="badge"], [class*="type"]')?.textContent?.trim();
          if (typeText) {
            dispensary.dispensary_type = typeText;
          }

          // Rating: first decimal number in the rating element.
          const ratingMatch = card
            .querySelector('[class*="rating"], [aria-label*="star"]')
            ?.textContent?.match(/(\d+\.?\d*)/);
          if (ratingMatch) {
            dispensary.rating = parseFloat(ratingMatch[1]);
          }

          // Review count: accept thousands separators so "1,234" is not read as 1.
          const reviewMatch = card.querySelector('[class*="review"]')?.textContent?.match(/(\d[\d,]*)/);
          if (reviewMatch) {
            dispensary.review_count = parseInt(reviewMatch[1].replace(/,/g, ''), 10);
          }

          // Address, city and ZIP are pattern-matched out of the card's text.
          const cardText = card.textContent || '';

          const addressMatch = cardText.match(/(\d+\s+[\w\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Lane|Ln|Way|Circle|Cir|Court|Ct)\.?\s*(?:#\d+)?)/i);
          if (addressMatch) {
            dispensary.address = addressMatch[1].trim();
          }

          const cityMatch = cardText.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),\s*AZ/);
          if (cityMatch) {
            dispensary.city = cityMatch[1].trim();
          }

          const zipMatch = cardText.match(/\b(\d{5})\b/);
          if (zipMatch) {
            dispensary.zip = zipMatch[1];
          }

          results.push(dispensary);
        });

        return results;
      });

      // Drop repeats of the same listing (same name and URL), whether they
      // come from overlapping selector matches or from an earlier page.
      let addedOnPage = 0;
      for (const dispensary of dispensaries) {
        const key = `${dispensary.name.toLowerCase()}|${dispensary.website ?? ''}`;
        if (seenKeys.has(key)) {
          continue;
        }
        seenKeys.add(key);
        allDispensaries.push(dispensary);
        addedOnPage++;
      }

      console.log(`   ✅ Found ${addedOnPage} dispensaries on page ${currentPage}`);

      // Small delay between pages to be respectful; pointless after the last one.
      if (currentPage < maxPages) {
        await new Promise((resolve) => setTimeout(resolve, 1000));
      }
    }

    console.log(`\n✅ Scraped ${allDispensaries.length} total dispensaries`);

    // Save to database
    console.log('\n💾 Saving to database...');

    let savedCount = 0;
    let updatedCount = 0;

    for (const dispensary of allDispensaries) {
      try {
        const outcome = await saveDispensary(dispensary);
        if (outcome === 'updated') {
          updatedCount++;
        } else {
          savedCount++;
        }
      } catch (error) {
        // One bad row must not abort the whole import.
        console.error(`Error saving ${dispensary.name}: ${formatError(error)}`);
      }
    }

    console.log(`\n✅ Saved ${savedCount} new dispensaries`);
    console.log(`✅ Updated ${updatedCount} existing dispensaries`);

    // Show sample of saved data
    const sample = await pool.query(`
      SELECT name, city, address, website
      FROM stores
      WHERE state = 'AZ'
      ORDER BY created_at DESC
      LIMIT 5
    `);

    console.log('\n📋 Sample of saved dispensaries:');
    console.table(sample.rows);
  } catch (error) {
    console.error(`Error: ${formatError(error)}`);
    throw error;
  } finally {
    // Release the pool even when closing the browser fails.
    try {
      await browser.close();
    } finally {
      await pool.end();
    }
  }
}
|
|
|
|
// Script entry point. The promise is handled explicitly so a failed run exits
// with a non-zero status instead of surfacing as an unhandled rejection.
scrapeLeaflyArizona().catch((error: unknown) => {
  console.error('Leafly Arizona scrape failed:', error);
  process.exitCode = 1;
});
|