Files
cannaiq/backend/scrape-azdhs-better.ts
2025-11-28 19:45:44 -07:00

109 lines
3.4 KiB
TypeScript

import { chromium } from 'playwright-extra';
import stealth from 'puppeteer-extra-plugin-stealth';
import { pool } from './src/db/migrate';
// Register the stealth plugin on playwright-extra's chromium launcher. This runs at
// module load, i.e. before scrapeAZDHSBetter() launches a browser.
// NOTE(review): presumably used to make the automated browser harder to detect —
// confirm against the plugin's documentation.
chromium.use(stealth());
/**
 * Exploratory scraper for the AZDHS "AZ Care Check" facility page.
 *
 * Opens the page in a visible (non-headless) Chromium window, records every JSON
 * response whose URL contains "dispensar", "facility" or "location", clicks a
 * "View All" / "Show All" control when one is visible, and then collects DOM
 * elements whose text looks like a facility entry. Findings are only printed to
 * the console; this function runs no database queries.
 *
 * Resource handling: the browser is closed whenever it was successfully launched,
 * and the shared `pool` is ended in every case (including launch failures and a
 * failing `browser.close()`).
 *
 * @returns Resolves once the report has been printed and resources are released.
 * @throws Rethrows any launch, navigation or extraction error after logging it.
 */
async function scrapeAZDHSBetter(): Promise<void> {
  console.log('🏛️ Scraping AZDHS official map (improved approach)...\n');

  try {
    const browser = await chromium.launch({
      headless: false,
    });

    // Everything that uses the browser lives in this inner try so that a failure
    // while creating the context or page still closes the browser.
    try {
      const context = await browser.newContext({
        viewport: { width: 1920, height: 1080 },
        userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      });
      const page = await context.newPage();

      // Capture API requests
      const apiData: { url: string; data: unknown }[] = [];
      page.on('response', async (response) => {
        const url = response.url();
        const looksRelevant =
          url.includes('dispensar') || url.includes('facility') || url.includes('location');
        if (!looksRelevant) {
          return;
        }
        try {
          const json: unknown = await response.json();
          console.log(`📡 Captured API response from: ${url.substring(0, 100)}...`);
          apiData.push({ url, data: json });
        } catch {
          // Deliberate best-effort: the body was not JSON (or could not be read),
          // so this response is simply not recorded.
        }
      });

      console.log('📄 Loading AZDHS page (waiting up to 60s for JavaScript)...');
      await page.goto('https://azcarecheck.azdhs.gov/s/?facilityId=001t000000L0TApAAN', {
        waitUntil: 'domcontentloaded',
        timeout: 60000,
      });

      // Wait longer for JavaScript to execute
      console.log('⏳ Waiting 20 seconds for Salesforce to fully load...');
      await page.waitForTimeout(20000);

      // Try to find and click "View All" or expand the map
      console.log('🔍 Looking for buttons to expand results...');
      const viewAllButton = page
        .locator('button:has-text("View All"), button:has-text("Show All"), a:has-text("View All")')
        .first();
      // A failing visibility check is treated the same as "button not present".
      if (await viewAllButton.isVisible().catch(() => false)) {
        console.log(' ✅ Found View All button, clicking...');
        await viewAllButton.click();
        await page.waitForTimeout(5000);
      }

      // Try extracting data directly from page
      console.log('\n📦 Extracting dispensary data from page...');
      const dispensaries = await page.evaluate(() => {
        // This callback is executed inside the page, so everything it needs
        // (including the regex) has to be defined within it.
        const namePattern = /([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,5})/;
        const results: { rawText: string; element: string }[] = [];

        // Look for various data patterns
        const elements = document.querySelectorAll(
          '[data-facility], [data-location], article, .facility, .location, .dispensary',
        );
        elements.forEach((el) => {
          const text = el.textContent ?? '';
          // Keep only elements with a plausible amount of text that contains
          // at least two consecutive capitalized words (a name-like phrase).
          if (text.length > 20 && text.length < 500 && namePattern.test(text)) {
            results.push({
              rawText: text.substring(0, 200),
              // Read the attribute rather than `className`: on SVG elements
              // `className` is an SVGAnimatedString object, not a string.
              element: el.getAttribute('class') ?? '',
            });
          }
        });
        return results;
      });

      console.log(`\n📊 Found ${dispensaries.length} potential dispensary elements`);
      console.log(`📊 Captured ${apiData.length} API responses`);

      if (apiData.length > 0) {
        console.log('\n🎯 Analyzing API data...');
        // Only the first captured payload is shown, truncated to 1000 characters.
        console.log(JSON.stringify(apiData[0], null, 2).substring(0, 1000));
      }

      if (dispensaries.length > 0) {
        console.log('\n📋 Sample dispensary elements:');
        console.log(dispensaries.slice(0, 3));
      }
    } finally {
      await browser.close();
    }
  } catch (error) {
    console.error(`❌ Error: ${error}`);
    throw error;
  } finally {
    // Separate from the browser cleanup so the pool is ended even when
    // launching or closing the browser fails.
    await pool.end();
  }
}
// Entry point. scrapeAZDHSBetter() rethrows on failure, so the returned promise must
// be handled here: report the error and make the process exit with a non-zero code
// instead of leaving an unhandled rejection.
scrapeAZDHSBetter().catch((error: unknown) => {
  console.error('❌ AZDHS scrape failed:', error);
  process.exitCode = 1;
});