- Add backend stale process monitoring API (/api/stale-processes) - Add users management route - Add frontend landing page and stale process monitor UI on /scraper-tools - Move old development scripts to backend/archive/ - Update frontend build with new features 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
194 lines
6.3 KiB
TypeScript
194 lines
6.3 KiB
TypeScript
import { chromium } from 'playwright-extra';
|
|
import stealth from 'puppeteer-extra-plugin-stealth';
|
|
import { pool } from './src/db/migrate';
|
|
|
|
// Register the stealth plugin (from puppeteer-extra-plugin-stealth) on the
// playwright-extra chromium launcher so it applies to the browser launched below.
chromium.use(stealth());
|
|
|
|
/** One dispensary record extracted from the AZDHS Care Check page. */
interface ScrapedDispensary {
  /** Text of the first heading-like element found inside the container. */
  name: string;
  /** First 500 characters of the container's text; not persisted by this script. */
  rawText: string;
  /** Digits-only phone number. */
  phone?: string;
  email?: string;
  address?: string;
  city?: string;
  zip?: string;
}

/**
 * Inserts the dispensary into `stores`, or updates the existing row with
 * state 'AZ' and data_source 'azdhs' whose name matches case-insensitively.
 *
 * @param disp - Scraped record to persist.
 * @returns `'updated'` when a matching row already existed, `'inserted'` otherwise.
 * @throws Whatever `pool.query` rejects with; the caller decides how to count it.
 */
async function upsertAzdhsStore(disp: ScrapedDispensary): Promise<'inserted' | 'updated'> {
  // Check if exists by name
  const existing = await pool.query(
    'SELECT id FROM stores WHERE LOWER(name) = LOWER($1) AND state = $2 AND data_source = $3',
    [disp.name, 'AZ', 'azdhs']
  );

  // Absent fields are sent as NULL so COALESCE keeps the stored value on update.
  const address = disp.address ?? null;
  const city = disp.city ?? null;
  const zip = disp.zip ?? null;
  const phone = disp.phone ?? null;
  const email = disp.email ?? null;

  if (existing.rows.length > 0) {
    await pool.query(`
      UPDATE stores SET
        address = COALESCE($1, address),
        city = COALESCE($2, city),
        zip = COALESCE($3, zip),
        phone = COALESCE($4, phone),
        email = COALESCE($5, email),
        updated_at = CURRENT_TIMESTAMP
      WHERE id = $6
    `, [address, city, zip, phone, email, existing.rows[0].id]);
    return 'updated';
  }

  const slug = disp.name.toLowerCase().replace(/[^a-z0-9]+/g, '-');
  // Stored in the `dutchie_url` column, but for this source it is an AZDHS link.
  const dutchieUrl = `https://azcarecheck.azdhs.gov/s/?name=${encodeURIComponent(disp.name)}`;

  await pool.query(`
    INSERT INTO stores (
      name, slug, dutchie_url, address, city, state, zip, phone, email,
      data_source, active, created_at, updated_at
    ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, 'azdhs', true, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
  `, [disp.name, slug, dutchieUrl, address, city, 'AZ', zip, phone, email]);
  return 'inserted';
}

/**
 * Opens the AZDHS Care Check page in a visible Chromium window, waits 30
 * seconds so the page can load (and the operator can scroll), extracts every
 * dispensary-like container from the DOM, and saves the results to `stores`
 * with data_source 'azdhs'.
 *
 * The browser and the database pool are always released, even when scraping
 * or saving fails.
 *
 * @throws Rethrows any error raised while loading, extracting or reporting,
 *         after logging it. Per-record save errors are logged and counted as
 *         skipped instead of aborting the run.
 */
async function scrapeAZDHSAuto(): Promise<void> {
  console.log('🏛️ Scraping AZDHS - Automatic Mode\n');

  const browser = await chromium.launch({
    headless: false, // Visible so you can see it working
  });

  try {
    // Context/page creation lives inside the try so a failure here still
    // reaches the cleanup in `finally`.
    const context = await browser.newContext({
      viewport: { width: 1920, height: 1080 },
    });
    const page = await context.newPage();

    console.log('📄 Loading AZDHS page...');

    await page.goto('https://azcarecheck.azdhs.gov/s/?facilityId=001t000000L0TApAAN', {
      waitUntil: 'domcontentloaded',
      timeout: 60000
    });

    console.log('⏳ Waiting 30 seconds for page to fully load and for you to scroll...\n');
    await page.waitForTimeout(30000);

    console.log('📦 Extracting all dispensaries from the page...\n');

    // Extract all dispensaries. This callback runs inside the browser page,
    // so it must not reference anything from the Node module scope at runtime
    // (types are fine; they are erased).
    const dispensaries = await page.evaluate(() => {
      const results: ScrapedDispensary[] = [];

      // Try various selectors for name, in priority order
      const nameSelectors = ['h3', 'h2', 'h4', '[class*="title"]', '[class*="name"]', 'strong', 'b'];

      // Look for all possible dispensary container elements
      // NOTE(review): these selectors can match nested elements, so one
      // dispensary may be reported more than once; later duplicates end up
      // as UPDATEs of the row inserted for the first one.
      const containers = document.querySelectorAll(
        'article, [class*="facility"], [class*="dispensary"], [class*="location"], ' +
        '.slds-card, lightning-card, [data-id], [data-facility-id]'
      );

      containers.forEach((card) => {
        // Get all text from the card
        const fullText = card.textContent?.trim() || '';

        let name: string | undefined;
        for (const selector of nameSelectors) {
          const text = card.querySelector(selector)?.textContent?.trim();
          if (text && text.length > 3) {
            name = text;
            break;
          }
        }

        // Only keep containers where we found at least a name
        if (!name) return;

        const record: ScrapedDispensary = {
          name,
          rawText: fullText.substring(0, 500),
        };

        // Extract phone: prefer a tel: link, otherwise look for a phone
        // pattern in the text. Both paths are reduced to digits so stored
        // values have one consistent format, and an empty result is dropped
        // rather than stored as ''.
        const phoneLink = card.querySelector('a[href^="tel:"]');
        const phoneCandidate = phoneLink
          ? phoneLink.getAttribute('href')?.replace('tel:', '')
          : fullText.match(/(\d{3}[-.]?\d{3}[-.]?\d{4})/)?.[1];
        const phoneDigits = phoneCandidate?.replace(/\D/g, '');
        if (phoneDigits) record.phone = phoneDigits;

        // Extract email: strip the scheme and any "?subject=..." style query
        // so only the address itself is stored; ignore empty results.
        const mailHref = card.querySelector('a[href^="mailto:"]')?.getAttribute('href');
        const email = mailHref?.replace('mailto:', '').split('?')[0]?.trim();
        if (email) record.email = email;

        // Extract address - look for street address pattern
        const address = fullText.match(/(\d+\s+[A-Za-z0-9\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Lane|Ln|Way|Circle|Court|Parkway)\.?(?:\s+(?:Suite|Ste|Unit|#)\s*[\w-]+)?)/i)?.[1]?.trim();
        if (address) record.address = address;

        // Extract city: capitalized word(s) immediately before ", AZ"
        const city = fullText.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),\s*AZ/)?.[1];
        if (city) record.city = city;

        // Extract ZIP (first five digits of a ZIP or ZIP+4)
        const zip = fullText.match(/\b(\d{5})(?:-\d{4})?\b/)?.[1];
        if (zip) record.zip = zip;

        results.push(record);
      });

      return results;
    });

    console.log(`✅ Found ${dispensaries.length} dispensary entries!\n`);

    if (dispensaries.length > 0) {
      console.log('📋 Sample of first 5:');
      console.table(dispensaries.slice(0, 5).map(d => ({
        name: d.name.substring(0, 40),
        phone: d.phone,
        city: d.city,
      })));
    }

    // Save to database
    console.log('\n💾 Saving to database with data_source="azdhs"...\n');

    let savedCount = 0;
    let updatedCount = 0;
    let skippedCount = 0;

    // Saved one at a time on purpose: a later record with the same name must
    // see the row inserted by an earlier one.
    for (const disp of dispensaries) {
      try {
        const outcome = await upsertAzdhsStore(disp);
        if (outcome === 'updated') {
          updatedCount++;
        } else {
          savedCount++;
        }
      } catch (error) {
        console.error(`Error saving ${disp.name}: ${error}`);
        skippedCount++;
      }
    }

    console.log(`\n✅ Saved ${savedCount} new AZDHS dispensaries`);
    console.log(`✅ Updated ${updatedCount} existing AZDHS dispensaries`);
    if (skippedCount > 0) console.log(`⚠️ Skipped ${skippedCount} entries`);

    // Show totals by source
    const totals = await pool.query(`
      SELECT data_source, COUNT(*) as count
      FROM stores
      WHERE state = 'AZ'
      GROUP BY data_source
      ORDER BY data_source
    `);

    console.log('\n📊 Arizona dispensaries by source:');
    console.table(totals.rows);

    console.log('\n✅ AZDHS scraping complete!');
  } catch (error) {
    console.error(`❌ Error: ${error}`);
    throw error;
  } finally {
    console.log('\n👉 Browser will close in 5 seconds...');
    // Plain timer instead of page.waitForTimeout: the page may already be
    // closed or crashed here, and a rejection would skip the cleanup below.
    await new Promise<void>((resolve) => setTimeout(resolve, 5000));

    // Each cleanup step is isolated so one failing does not prevent the other
    // and does not replace the error being propagated from the try block.
    try {
      await browser.close();
    } catch (closeError) {
      console.error(`Error closing browser: ${closeError}`);
    }
    try {
      await pool.end();
    } catch (poolError) {
      console.error(`Error closing database pool: ${poolError}`);
    }
  }
}
|
|
|
|
// Entry point: run the scraper and handle a rejection explicitly instead of
// leaving the promise floating, so a failed run is reported and exits non-zero.
scrapeAZDHSAuto().catch((error: unknown) => {
  console.error('AZDHS scrape failed:', error);
  process.exitCode = 1;
});
|