/*
 * Commit notes:
 * - Add backend stale process monitoring API (/api/stale-processes)
 * - Add users management route
 * - Add frontend landing page and stale process monitor UI on /scraper-tools
 * - Move old development scripts to backend/archive/
 * - Update frontend build with new features
 *
 * 🤖 Generated with [Claude Code](https://claude.com/claude-code)
 * Co-Authored-By: Claude <noreply@anthropic.com>
 *
 * File view metadata: 320 lines, 10 KiB, TypeScript
 */
import { firefox } from 'playwright';
|
|
import { pool } from './src/db/migrate';
|
|
import { getRandomProxy } from './src/utils/proxyManager';
|
|
|
|
/**
 * Shape of one enriched dispensary record: the base identity/address fields
 * plus optional values discovered during enrichment.
 *
 * NOTE(review): this interface is not referenced by the code visible in this
 * file; it appears to describe the combined result of an enrichment pass —
 * confirm whether it is still needed.
 */
interface DispensaryEnrichment {
  /** Primary key of the dispensary row. */
  id: number;
  /** Name as listed by AZDHS (presumably the state licensing source — TODO confirm). */
  azdhs_name: string;
  address: string;
  city: string;
  state: string;
  zip: string;
  /** "Doing business as" name, when one was found. */
  dba_name?: string;
  /** Business website URL, when one was found. */
  website?: string;
  /** Phone number as shown on Google, when one was found. */
  google_phone?: string;
  /** Google star rating, when one was found. */
  google_rating?: number;
  /** Number of Google reviews, when one was found. */
  google_review_count?: number;
  /** How much the enriched values can be trusted. */
  confidence: 'high' | 'medium' | 'low';
  /** Free-form remarks about the enrichment. */
  notes?: string;
}
|
|
|
|
/** Confidence label stored with every proposed change. */
type EnrichmentConfidence = 'high' | 'medium' | 'low';

/**
 * Values scraped from a Google Maps page. Every field is optional because
 * any of the selectors may fail to match.
 */
interface ScrapedBusinessData {
  name?: string;
  website?: string;
  phone?: string;
  rating?: number;
  reviewCount?: number;
}

/**
 * Row returned by the dispensary SELECT in `enrichFromGoogleMaps`.
 * NOTE(review): column types are assumed from how the values are used in this
 * file — confirm against the actual schema.
 */
interface DispensaryRow {
  id: number;
  slug: string;
  name: string;
  address: string;
  city: string;
  state: string;
  zip: string;
  phone: string | null;
  website: string | null;
  dba_name: string | null;
}

/** One pending row for the `dispensary_changes` table. */
interface ProposedChange {
  fieldName: string;
  oldValue: string | null;
  newValue: string;
  notes: string;
  /** Human-readable field description used only in the progress log. */
  label: string;
  /** When true, the `requires_recrawl` column is written as `true`. */
  requiresRecrawl?: boolean;
}

/**
 * Reads name, website, phone, rating and review count from the current
 * Google Maps document.
 *
 * This function is passed to `page.evaluate`, so it executes inside the
 * browser page: it must stay self-contained and must not reference anything
 * from module scope (imports, constants, other helpers).
 */
function extractBusinessDataFromDom(): ScrapedBusinessData {
  const data: ScrapedBusinessData = {};

  // Try to find the place name from the side panel
  const nameSelectors = [
    'h1[class*="fontHeadline"]',
    'h1.DUwDvf',
    '[data-item-id*="name"] h1',
  ];
  for (const selector of nameSelectors) {
    const el = document.querySelector(selector);
    if (el?.textContent) {
      data.name = el.textContent.trim();
      break;
    }
  }

  // Try to find website (links that point back at google.com are ignored)
  const websiteSelectors = [
    'a[data-item-id="authority"]',
    'a[data-tooltip="Open website"]',
    'a[aria-label*="Website"]',
  ];
  for (const selector of websiteSelectors) {
    const el = document.querySelector<HTMLAnchorElement>(selector);
    if (el?.href && !el.href.includes('google.com')) {
      data.website = el.href;
      break;
    }
  }

  // Try to find phone
  const phoneSelectors = [
    'button[data-item-id*="phone"]',
    'button[aria-label*="Phone"]',
    '[data-tooltip*="Copy phone number"]',
  ];
  for (const selector of phoneSelectors) {
    const el = document.querySelector(selector);
    if (el?.textContent) {
      const phoneMatch = el.textContent.match(/\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/);
      if (phoneMatch) {
        data.phone = phoneMatch[0];
        break;
      }
    }
  }

  // Try to find rating
  const ratingEl = document.querySelector('[role="img"][aria-label*="stars"]');
  if (ratingEl) {
    const label = ratingEl.getAttribute('aria-label');
    const match = label?.match(/(\d+\.?\d*)\s*stars?/);
    if (match) {
      data.rating = parseFloat(match[1]);
    }
  }

  // Try to find review count
  const reviewEl = document.querySelector('[aria-label*="reviews"]');
  if (reviewEl) {
    const label = reviewEl.getAttribute('aria-label');
    const match = label?.match(/([\d,]+)\s*reviews?/);
    if (match) {
      // Explicit radix so the thousands-stripped digits are always read as decimal.
      data.reviewCount = parseInt(match[1].replace(/,/g, ''), 10);
    }
  }

  return data;
}

/**
 * Grades a scrape: `high` when name, website and phone were all found,
 * `medium` when the name plus one of website/phone was found, else `low`.
 */
function classifyConfidence(scraped: ScrapedBusinessData): EnrichmentConfidence {
  if (scraped.name && scraped.website && scraped.phone) {
    return 'high';
  }
  if (scraped.name && (scraped.website || scraped.phone)) {
    return 'medium';
  }
  return 'low';
}

/**
 * Builds the ordered list of change records to propose for one dispensary:
 * DBA name, website, phone (each only when it differs from the stored value),
 * then Google rating and review count (whenever a truthy value was scraped).
 */
function collectProposedChanges(disp: DispensaryRow, scraped: ScrapedBusinessData): ProposedChange[] {
  const searchNote = `Found via Google Maps search for "${disp.name}"`;
  const changes: ProposedChange[] = [];

  if (scraped.name && scraped.name !== disp.dba_name) {
    changes.push({
      fieldName: 'dba_name',
      oldValue: disp.dba_name || null,
      newValue: scraped.name,
      notes: searchNote,
      label: 'DBA name',
    });
  }

  if (scraped.website && scraped.website !== disp.website) {
    changes.push({
      fieldName: 'website',
      oldValue: disp.website || null,
      newValue: scraped.website,
      notes: searchNote,
      label: 'website (requires recrawl)',
      requiresRecrawl: true,
    });
  }

  if (scraped.phone && scraped.phone !== disp.phone) {
    changes.push({
      fieldName: 'phone',
      oldValue: disp.phone || null,
      newValue: scraped.phone,
      notes: searchNote,
      label: 'phone',
    });
  }

  if (scraped.rating) {
    changes.push({
      fieldName: 'google_rating',
      oldValue: null,
      newValue: scraped.rating.toString(),
      notes: `Google rating from Maps search`,
      label: 'Google rating',
    });
  }

  if (scraped.reviewCount) {
    changes.push({
      fieldName: 'google_review_count',
      oldValue: null,
      newValue: scraped.reviewCount.toString(),
      notes: `Google review count from Maps search`,
      label: 'Google review count',
    });
  }

  return changes;
}

/**
 * Inserts one row into `dispensary_changes` with source `google_maps`.
 * The `requires_recrawl` column is only named in the statement when the
 * change asks for it, so other rows keep whatever default the table defines.
 */
async function insertProposedChange(
  dispensaryId: number,
  confidence: EnrichmentConfidence,
  change: ProposedChange,
): Promise<void> {
  const baseValues: unknown[] = [
    dispensaryId,
    change.fieldName,
    change.oldValue,
    change.newValue,
    confidence,
    'google_maps',
    change.notes,
  ];

  if (change.requiresRecrawl) {
    await pool.query(
      `
        INSERT INTO dispensary_changes (
          dispensary_id, field_name, old_value, new_value,
          confidence_score, source, change_notes, requires_recrawl
        ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
      `,
      [...baseValues, true],
    );
    return;
  }

  await pool.query(
    `
      INSERT INTO dispensary_changes (
        dispensary_id, field_name, old_value, new_value,
        confidence_score, source, change_notes
      ) VALUES ($1, $2, $3, $4, $5, $6, $7)
    `,
    baseValues,
  );
}

/**
 * Looks up to 50 dispensaries without a website on Google Maps (through a
 * proxied headless Firefox) and writes every newly found value as a pending
 * row in `dispensary_changes`.
 *
 * A failure on one dispensary is logged and counted; the run then continues
 * with the next one. The database pool is always ended before this function
 * settles, and the browser is always closed once it has been launched.
 */
async function enrichFromGoogleMaps(): Promise<void> {
  console.log('🦊 Enriching AZDHS dispensaries from Google Maps using Firefox\n');

  try {
    // Get a proxy
    const proxy = await getRandomProxy();
    if (!proxy) {
      console.log('❌ No proxies available');
      return; // the outer finally ends the pool
    }

    console.log(`🔌 Using proxy: ${proxy.server}\n`);

    const browser = await firefox.launch({
      headless: true,
      firefoxUserPrefs: {
        'geo.enabled': true,
        'geo.provider.use_corelocation': true,
        'geo.prompt.testing': true,
        'geo.prompt.testing.allow': true,
      },
    });

    try {
      // Context and page are created inside the try so the browser is still
      // closed if either call fails.
      // eslint-disable-next-line @typescript-eslint/no-explicit-any -- kept loose: the proxy record's type is declared outside this file
      const contextOptions: any = {
        viewport: { width: 1920, height: 1080 },
        userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
        geolocation: { latitude: 33.4484, longitude: -112.0740 }, // Phoenix, AZ
        permissions: ['geolocation'],
        proxy: {
          server: proxy.server,
          username: proxy.username,
          password: proxy.password,
        },
      };

      const context = await browser.newContext(contextOptions);
      const page = await context.newPage();

      // Get all dispensaries that don't have website yet
      const result = await pool.query(`
        SELECT id, slug, name, address, city, state, zip, phone, website, dba_name
        FROM dispensaries
        WHERE website IS NULL OR website = ''
        ORDER BY id
        LIMIT 50
      `);

      const dispensaries: DispensaryRow[] = result.rows;
      console.log(`📋 Found ${dispensaries.length} dispensaries to enrich\n`);

      let changesCreated = 0;
      let failed = 0;
      let skipped = 0;

      for (const disp of dispensaries) {
        console.log(`\n🔍 Processing: ${disp.name}`);
        console.log(` Address: ${disp.address}, ${disp.city}, ${disp.state} ${disp.zip}`);

        try {
          // Search Google Maps with dispensary name + address for better results
          const searchQuery = `${disp.name} ${disp.address}, ${disp.city}, ${disp.state} ${disp.zip}`;
          const url = `https://www.google.com/maps/search/${encodeURIComponent(searchQuery)}`;

          console.log(` 📍 Searching Maps: ${searchQuery}`);
          await page.goto(url, {
            waitUntil: 'domcontentloaded',
            timeout: 30000,
          });

          // Wait for results
          await page.waitForTimeout(3000);

          // Extract business data from the first result
          const businessData = await page.evaluate(extractBusinessDataFromDom);
          console.log(` Found data:`, businessData);

          const confidence = classifyConfidence(businessData);

          // Create change records for each field that has new data
          let changesMadeForDispensary = 0;
          for (const change of collectProposedChanges(disp, businessData)) {
            await insertProposedChange(disp.id, confidence, change);
            console.log(` 📝 Created change record for ${change.label}`);
            changesMadeForDispensary++;
          }

          if (changesMadeForDispensary > 0) {
            console.log(` ✅ Created ${changesMadeForDispensary} change record(s) for review (${confidence} confidence)`);
            changesCreated += changesMadeForDispensary;
          } else {
            console.log(` ⏭️ No new data found`);
            skipped++;
          }
        } catch (error) {
          console.log(` ❌ Error: ${error}`);
          failed++;
        }

        // Rate limiting - wait between requests
        await page.waitForTimeout(3000 + Math.random() * 2000);
      }

      console.log('\n' + '='.repeat(80));
      console.log(`\n📊 Summary:`);
      console.log(` 📝 Change records created: ${changesCreated}`);
      console.log(` ⏭️ Skipped (no new data): ${skipped}`);
      console.log(` ❌ Failed: ${failed}`);
      console.log(`\n💡 Visit the Change Approval page to review and approve these changes.`);
    } finally {
      await browser.close();
    }
  } finally {
    await pool.end();
  }
}
|
|
|
|
/**
 * Script entry point: runs the Google Maps enrichment and, if it rejects,
 * logs the error and exits the process with status 1.
 */
async function main() {
  return enrichFromGoogleMaps().catch((fatal) => {
    console.error('Fatal error:', fatal);
    process.exit(1);
  });
}

// Rejections are already handled inside main, so the promise is intentionally not awaited.
void main();