Files
cannaiq/backend/archive/scrape-azdhs-manual-scroll.ts
Kelly d91c55a344 feat: Add stale process monitor, users route, landing page, archive old scripts
- Add backend stale process monitoring API (/api/stale-processes)
- Add users management route
- Add frontend landing page and stale process monitor UI on /scraper-tools
- Move old development scripts to backend/archive/
- Update frontend build with new features

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-05 04:07:31 -07:00

197 lines
6.4 KiB
TypeScript

import { chromium } from 'playwright-extra';
import stealth from 'puppeteer-extra-plugin-stealth';
import { pool } from './src/db/migrate';
// Register the stealth plugin on the playwright-extra Chromium launcher
// before any browser is launched by the code below.
const stealthPlugin = stealth();
chromium.use(stealthPlugin);
/** One listing as collected from the rendered page; every field except `index` is best-effort. */
interface ScrapedDispensary {
  name?: string | null;
  address?: string;
  phone?: string;
  email?: string;
  /** First 500 characters of the card's text, kept so address parts can be parsed afterwards. */
  rawText?: string;
  /** Position of the card in the DOM query result. */
  index: number;
}

/** Address parts recovered for a listing; a field is undefined when nothing usable was found. */
interface ParsedLocation {
  address?: string;
  city?: string;
  zip?: string;
}

/**
 * Resolves once the operator sends input on stdin (i.e. presses ENTER).
 *
 * stdin is paused again afterwards: a flowing stdin keeps the Node process
 * alive, so without the pause the script would never exit on its own.
 */
function waitForEnterKey(): Promise<void> {
  return new Promise<void>((resolve) => {
    process.stdin.once('data', () => {
      process.stdin.pause();
      resolve();
    });
  });
}

/** Resolves after `ms` milliseconds; independent of the browser so it is safe during cleanup. */
function delay(ms: number): Promise<void> {
  return new Promise<void>((resolve) => setTimeout(resolve, ms));
}

/**
 * Builds a URL slug from a store name: lower-cases it, collapses every run of
 * non-alphanumeric characters into one hyphen and strips hyphens from both ends.
 */
function slugifyStoreName(name: string): string {
  return name
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-+|-+$/g, '');
}

/**
 * Derives street address, city and ZIP for a scraped listing.
 *
 * The address element found in the DOM wins; the regex over `rawText` is only
 * a fallback. The ZIP prefers the five digits that follow "AZ" so a five-digit
 * street number is not mistaken for it, and falls back to the first
 * five-digit run otherwise.
 */
function parseLocation(disp: ScrapedDispensary): ParsedLocation {
  const location: ParsedLocation = {};
  if (disp.address) {
    location.address = disp.address;
  }

  const rawText = disp.rawText;
  if (!rawText) {
    return location;
  }

  const zipMatch = rawText.match(/\bAZ\s+(\d{5})\b/) ?? rawText.match(/\b(\d{5})\b/);
  if (zipMatch) {
    location.zip = zipMatch[1];
  }

  if (!location.address) {
    const addressMatch = rawText.match(
      /(\d+\s+[\w\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Lane|Ln|Way)\.?)/i
    );
    if (addressMatch) {
      location.address = addressMatch[1];
    }
  }

  // NOTE(review): heuristic — any capitalised words directly before ", AZ" are
  // taken as the city, which can include trailing street words.
  const cityMatch = rawText.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),\s*AZ/);
  if (cityMatch) {
    location.city = cityMatch[1];
  }

  return location;
}

/**
 * Interactive scrape of the AZDHS facility list.
 *
 * Opens a visible Chromium window on the AZDHS page, waits for the operator to
 * scroll until every listing is loaded and press ENTER, extracts the listing
 * cards from the DOM, then inserts or updates one `stores` row per named
 * listing with `data_source = 'azdhs'`. Finally prints per-source totals for
 * Arizona.
 *
 * The browser and the database pool are always closed before returning.
 *
 * @throws Re-throws any error raised while loading, extracting or querying
 *         totals. Per-row save failures are logged and counted as skipped.
 */
async function scrapeAZDHSManualScroll(): Promise<void> {
  console.log('🏛️ Scraping AZDHS - Manual Scroll Mode\n');
  console.log('📖 Instructions:');
  console.log(' 1. Browser will open');
  console.log(' 2. Scroll down to load ALL 182 dispensaries');
  console.log(' 3. Press ENTER in this terminal when done scrolling\n');

  const browser = await chromium.launch({
    headless: false, // Visible browser so the operator can scroll
  });

  try {
    // Created inside the try so a failure here still closes the browser.
    const context = await browser.newContext({
      viewport: { width: 1920, height: 1080 },
    });
    const page = await context.newPage();

    console.log('📄 Loading AZDHS page...');
    await page.goto('https://azcarecheck.azdhs.gov/s/?facilityId=001t000000L0TApAAN', {
      waitUntil: 'domcontentloaded',
      timeout: 60000,
    });
    // Fixed pause after DOMContentLoaded before prompting the operator.
    await page.waitForTimeout(10000);

    console.log('\n✅ Page loaded!');
    console.log('👉 Now scroll down to load all dispensaries...');
    console.log('👉 Press ENTER when you\'re done scrolling and all 182 are loaded\n');

    await waitForEnterKey();
    console.log('\n📦 Extracting all dispensaries...\n');

    // Runs inside the browser page: must be self-contained (no outer helpers).
    const dispensaries: ScrapedDispensary[] = await page.evaluate(() => {
      const results: ScrapedDispensary[] = [];
      // Broad selector: the exact card markup is not known, so several candidates are tried.
      const cards = document.querySelectorAll(
        'article, [class*="facility"], [class*="dispensary"], [class*="location"], .slds-card, lightning-card'
      );
      cards.forEach((card, index) => {
        const disp: ScrapedDispensary = { index };

        const nameEl = card.querySelector('h3, h2, [class*="title"], [class*="name"], a[title]');
        if (nameEl) {
          disp.name = nameEl.textContent?.trim() || nameEl.getAttribute('title');
        }

        const addressEl = card.querySelector('[class*="address"], address');
        if (addressEl) {
          disp.address = addressEl.textContent?.trim();
        }

        const phoneEl = card.querySelector('a[href^="tel:"], [class*="phone"]');
        if (phoneEl) {
          disp.phone = phoneEl.textContent?.trim() || phoneEl.getAttribute('href')?.replace('tel:', '');
        }

        const emailEl = card.querySelector('a[href^="mailto:"]');
        if (emailEl) {
          disp.email = emailEl.getAttribute('href')?.replace('mailto:', '');
        }

        // Keep a bounded text snapshot so address parts can be parsed later.
        disp.rawText = card.textContent?.trim().substring(0, 500);

        if (disp.name || disp.rawText) {
          results.push(disp);
        }
      });
      return results;
    });

    console.log(`✅ Found ${dispensaries.length} dispensary entries!`);
    if (dispensaries.length > 0) {
      console.log('\n📋 Sample of first 3:');
      console.table(dispensaries.slice(0, 3).map((d) => ({
        name: d.name?.substring(0, 40),
        phone: d.phone,
        email: d.email,
      })));
    }

    console.log('\n💾 Saving to database with data_source="azdhs"...\n');
    let savedCount = 0;
    let updatedCount = 0;
    let skippedCount = 0;

    for (const disp of dispensaries) {
      const name = disp.name;
      if (!name) {
        // Entries with only raw text cannot be matched or inserted.
        skippedCount++;
        continue;
      }
      try {
        const { address, city, zip } = parseLocation(disp);

        // Match an existing AZDHS row by case-insensitive name.
        const existing = await pool.query(
          'SELECT id FROM stores WHERE LOWER(name) = LOWER($1) AND state = $2 AND data_source = $3',
          [name, 'AZ', 'azdhs']
        );

        if (existing.rows.length > 0) {
          // COALESCE keeps the stored value whenever the scrape produced nothing.
          await pool.query(`
            UPDATE stores SET
              address = COALESCE($1, address),
              city = COALESCE($2, city),
              zip = COALESCE($3, zip),
              phone = COALESCE($4, phone),
              email = COALESCE($5, email),
              updated_at = CURRENT_TIMESTAMP
            WHERE id = $6
          `, [
            address ?? null,
            city ?? null,
            zip ?? null,
            disp.phone ?? null,
            disp.email ?? null,
            existing.rows[0].id,
          ]);
          updatedCount++;
        } else {
          const slug = slugifyStoreName(name);
          // Stored in the dutchie_url column, but points at the AZDHS lookup page.
          const sourceUrl = `https://azcarecheck.azdhs.gov/s/?name=${encodeURIComponent(name)}`;
          await pool.query(`
            INSERT INTO stores (
              name, slug, dutchie_url, address, city, state, zip, phone, email,
              data_source, active, created_at, updated_at
            ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, 'azdhs', true, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
          `, [
            name,
            slug,
            sourceUrl,
            address ?? null,
            city ?? null,
            'AZ',
            zip ?? null,
            disp.phone ?? null,
            disp.email ?? null,
          ]);
          savedCount++;
        }
      } catch (error) {
        // One bad row must not abort the whole import.
        console.error(`Error saving ${name}: ${error}`);
        skippedCount++;
      }
    }

    console.log(`\n✅ Saved ${savedCount} new AZDHS dispensaries`);
    console.log(`✅ Updated ${updatedCount} existing AZDHS dispensaries`);
    if (skippedCount > 0) console.log(`⚠️ Skipped ${skippedCount} entries`);

    const totals = await pool.query(`
      SELECT data_source, COUNT(*) as count
      FROM stores
      WHERE state = 'AZ'
      GROUP BY data_source
      ORDER BY data_source
    `);
    console.log('\n📊 Arizona dispensaries by source:');
    console.table(totals.rows);
  } catch (error) {
    console.error(`❌ Error: ${error}`);
    throw error;
  } finally {
    console.log('\n👉 Browser will stay open for 10 seconds so you can review...');
    // Plain timer: unlike a page-bound wait it cannot throw if the page is already gone.
    await delay(10000);
    try {
      await browser.close();
    } catch (closeError) {
      console.error(`⚠️ Failed to close browser: ${closeError}`);
    } finally {
      // Always release the pool, even when closing the browser failed.
      await pool.end();
    }
  }
}
// Script entry point. The rejection is handled explicitly so a failed run is
// reported and exits with a non-zero status instead of leaving a floating promise.
scrapeAZDHSManualScroll().catch((error: unknown) => {
  console.error('❌ AZDHS manual-scroll scrape failed:', error);
  process.exitCode = 1;
});