Files
cannaiq/backend/archive/enrich-dispensaries-from-google.ts
Kelly d91c55a344 feat: Add stale process monitor, users route, landing page, archive old scripts
- Add backend stale process monitoring API (/api/stale-processes)
- Add users management route
- Add frontend landing page and stale process monitor UI on /scraper-tools
- Move old development scripts to backend/archive/
- Update frontend build with new features

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-05 04:07:31 -07:00

285 lines
8.5 KiB
TypeScript

import { chromium } from 'playwright';
import { pool } from './src/db/migrate';
import { getStateProxy, getRandomProxy } from './src/utils/proxyManager';
/** How strongly the scraped Google data is believed to match the AZDHS record. */
type EnrichmentConfidence = 'high' | 'medium' | 'low';

/**
 * One AZDHS dispensary record paired with whatever business details were
 * scraped from Google for it. Optional fields are absent when the scrape did
 * not find a value.
 */
interface DispensaryEnrichment {
  /** Primary key of the row in `azdhs_list`. */
  id: number;
  /** Name as recorded in the AZDHS list. */
  azdhs_name: string;
  /** Street address from the AZDHS list. */
  address: string;
  city: string;
  state: string;
  zip: string;
  /** Business name found on the Google results page, if any. */
  dba_name?: string;
  /** Website URL found on the Google results page, if any. */
  website?: string;
  /** Phone number text found on the Google results page, if any. */
  google_phone?: string;
  /** Numeric rating parsed from the Google results page, if any. */
  google_rating?: number;
  /** Review count parsed from the Google results page, if any. */
  google_review_count?: number;
  /** Match confidence derived from which scraped fields were present. */
  confidence: EnrichmentConfidence;
  /** Free-form reviewer notes. */
  notes?: string;
}
/**
 * Enriches `azdhs_list` rows that have no website by searching Google for each
 * dispensary's address and scraping the results page for a business name,
 * website, phone, rating and review count.
 *
 * Flow:
 *  1. Pick a proxy (Arizona first, then any); bail out if none is available.
 *  2. Launch headless Chromium through that proxy with a few stealth tweaks.
 *  3. For each candidate row, run the search and scrape the page.
 *  4. Rows where name, website AND phone were all found ("high" confidence)
 *     are updated in place; everything else is collected and printed for
 *     manual review.
 *
 * The function always ends the shared `pool` before returning (including on
 * the no-proxy path and when the browser fails to start or close), so it is
 * meant to be the last database user in the process.
 *
 * @returns A promise that resolves once the run is finished and the browser
 *          and pool have been released. Per-dispensary errors are logged and
 *          counted, not rethrown.
 */
async function enrichDispensariesFromGoogle() {
  console.log('🔍 Starting Google enrichment for AZDHS dispensaries\n');

  // Get an Arizona proxy if available, otherwise any proxy
  let proxy = await getStateProxy('Arizona');
  if (!proxy) {
    console.log('⚠️ No Arizona proxy available, trying any US proxy...');
    proxy = await getRandomProxy();
  }
  if (!proxy) {
    console.log('❌ No proxies available. Please add proxies to the database.');
    await pool.end();
    return;
  }
  console.log(`🔌 Using proxy: ${proxy.server}\n`);

  // Outer try/finally guarantees the pool is ended even if the browser fails
  // to launch or to close; inner try/finally guarantees the browser is closed
  // even if context/page setup throws.
  try {
    const browser = await chromium.launch({
      headless: true,
      args: [
        '--disable-blink-features=AutomationControlled',
      ]
    });

    try {
      // Typed loosely because the proxy record's exact shape is declared
      // outside this file.
      const contextOptions: any = {
        userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        viewport: { width: 1920, height: 1080 },
        locale: 'en-US',
        timezoneId: 'America/Phoenix',
        geolocation: { latitude: 33.4484, longitude: -112.0740 }, // Phoenix, AZ
        permissions: ['geolocation'],
        proxy: {
          server: proxy.server,
          username: proxy.username,
          password: proxy.password
        }
      };
      const context = await browser.newContext(contextOptions);

      // Add stealth techniques
      await context.addInitScript(() => {
        // Remove webdriver flag
        Object.defineProperty(navigator, 'webdriver', { get: () => false });
        // Chrome runtime
        (window as any).chrome = { runtime: {} };
        // Permissions: keep the native query bound to its receiver. Calling
        // the detached method fails with an illegal-invocation TypeError for
        // every non-notification query.
        const permissions = window.navigator.permissions;
        const originalQuery = permissions.query.bind(permissions);
        permissions.query = (parameters: any) => (
          parameters.name === 'notifications' ?
            Promise.resolve({ state: Notification.permission } as PermissionStatus) :
            originalQuery(parameters)
        );
      });

      const page = await context.newPage();

      // Get all dispensaries that don't have website yet.
      // NOTE(review): `LIMIT 2` looks like a debugging cap — confirm before a
      // full run.
      const result = await pool.query(`
SELECT id, name, address, city, state, zip, phone
FROM azdhs_list
WHERE website IS NULL OR website = ''
ORDER BY id
LIMIT 2
`);
      const dispensaries = result.rows;
      console.log(`📋 Found ${dispensaries.length} dispensaries to enrich\n`);

      let enriched = 0;
      let failed = 0;
      const needsReview: DispensaryEnrichment[] = [];

      for (const disp of dispensaries) {
        console.log(`\n🔍 Processing: ${disp.name}`);
        console.log(` Address: ${disp.address}, ${disp.city}, ${disp.state} ${disp.zip}`);
        try {
          // Search Google for the address + dispensary
          const searchQuery = `${disp.address}, ${disp.city}, ${disp.state} ${disp.zip} dispensary`;
          const searchUrl = `https://www.google.com/search?q=${encodeURIComponent(searchQuery)}`;
          console.log(` Searching: ${searchQuery}`);
          await page.goto(searchUrl, { waitUntil: 'domcontentloaded', timeout: 15000 });
          // Fixed pause after navigation before scraping the page.
          await page.waitForTimeout(2000);

          // Try to extract Google Business info. This callback runs inside the
          // page, so it must be self-contained (no outer-scope references).
          const businessData = await page.evaluate(() => {
            const data: {
              name?: string;
              website?: string;
              phone?: string;
              rating?: number;
              reviewCount?: number;
            } = {};

            // Try to find business name: first selector with text wins.
            const nameSelectors = [
              '[data-attrid="title"]',
              'h2[data-attrid="title"]',
              '.SPZz6b h2',
              'h3.LC20lb',
              '.kp-header .SPZz6b'
            ];
            for (const selector of nameSelectors) {
              const el = document.querySelector(selector);
              if (el?.textContent) {
                data.name = el.textContent.trim();
                break;
              }
            }

            // Try to find website: first non-Google link wins.
            const websiteSelectors = [
              'a[data-dtype="d3ph"]',
              '.yuRUbf a',
              'a.ab_button[href^="http"]'
            ];
            for (const selector of websiteSelectors) {
              const el = document.querySelector<HTMLAnchorElement>(selector);
              if (el?.href && !el.href.includes('google.com')) {
                data.website = el.href;
                break;
              }
            }

            // Try to find phone: text must look like a 3-3-4 digit number.
            const phoneSelectors = [
              '[data-dtype="d3ph"]',
              'span[data-dtype="d3ph"]',
              '.LrzXr.zdqRlf'
            ];
            for (const selector of phoneSelectors) {
              const el = document.querySelector(selector);
              if (el?.textContent && /\d{3}.*\d{3}.*\d{4}/.test(el.textContent)) {
                data.phone = el.textContent.trim();
                break;
              }
            }

            // Try to find rating
            const ratingEl = document.querySelector('.Aq14fc');
            if (ratingEl?.textContent) {
              const match = ratingEl.textContent.match(/(\d+\.?\d*)/);
              if (match) data.rating = parseFloat(match[1]);
            }

            // Try to find review count. Accept thousands separators so that
            // "1,234" is stored as 1234 rather than being truncated to 1.
            const reviewEl = document.querySelector('.hqzQac span');
            if (reviewEl?.textContent) {
              const match = reviewEl.textContent.match(/(\d[\d,]*)/);
              if (match) data.reviewCount = parseInt(match[1].replace(/,/g, ''), 10);
            }

            return data;
          });
          console.log(` Found data:`, businessData);

          // Determine confidence level: all three fields -> high; a name plus
          // either website or phone -> medium; anything else -> low.
          let confidence: 'high' | 'medium' | 'low' = 'low';
          if (businessData.name && businessData.website && businessData.phone) {
            confidence = 'high';
          } else if (businessData.name && (businessData.website || businessData.phone)) {
            confidence = 'medium';
          }

          const enrichment: DispensaryEnrichment = {
            id: disp.id,
            azdhs_name: disp.name,
            address: disp.address,
            city: disp.city,
            state: disp.state,
            zip: disp.zip,
            dba_name: businessData.name,
            website: businessData.website,
            google_phone: businessData.phone,
            google_rating: businessData.rating,
            google_review_count: businessData.reviewCount,
            confidence
          };

          if (confidence === 'high') {
            // Auto-update high confidence matches. The scraped phone only
            // feeds the confidence decision; this UPDATE does not store it.
            await pool.query(`
UPDATE azdhs_list
SET
dba_name = $1,
website = $2,
google_rating = $3,
google_review_count = $4,
updated_at = CURRENT_TIMESTAMP
WHERE id = $5
`, [
              businessData.name,
              businessData.website,
              businessData.rating,
              businessData.reviewCount,
              disp.id
            ]);
            console.log(` ✅ Updated (high confidence)`);
            enriched++;
          } else {
            // Flag for manual review
            needsReview.push(enrichment);
            console.log(` ⚠️ Needs review (${confidence} confidence)`);
          }
        } catch (error) {
          // One bad lookup must not abort the whole run.
          console.log(` ❌ Error: ${error}`);
          failed++;
        }

        // Rate limiting - wait 3-5 seconds (randomised) between requests
        await page.waitForTimeout(3000 + Math.random() * 2000);
      }

      console.log('\n' + '='.repeat(80));
      console.log(`\n📊 Summary:`);
      console.log(` ✅ Enriched: ${enriched}`);
      console.log(` ⚠️ Needs review: ${needsReview.length}`);
      console.log(` ❌ Failed: ${failed}`);

      if (needsReview.length > 0) {
        console.log('\n📋 Dispensaries needing manual review:\n');
        console.table(needsReview.map(d => ({
          ID: d.id,
          'AZDHS Name': d.azdhs_name.substring(0, 30),
          'Google Name': d.dba_name?.substring(0, 30) || '-',
          Website: d.website ? 'Yes' : 'No',
          Phone: d.google_phone ? 'Yes' : 'No',
          Confidence: d.confidence
        })));
      }
    } finally {
      await browser.close();
    }
  } finally {
    await pool.end();
  }
}
/**
 * Ensures `azdhs_list` has the columns this script writes to.
 *
 * Issues a single `ALTER TABLE` that adds `dba_name`, `google_rating` and
 * `google_review_count`, each guarded by `IF NOT EXISTS`.
 */
async function setupDatabase() {
  const columnClauses = [
    'ADD COLUMN IF NOT EXISTS dba_name VARCHAR(255),',
    'ADD COLUMN IF NOT EXISTS google_rating DECIMAL(2,1),',
    'ADD COLUMN IF NOT EXISTS google_review_count INTEGER',
  ];
  // Leading and trailing empty entries reproduce the newline that opens and
  // closes the statement text.
  const ddl = ['', 'ALTER TABLE azdhs_list', ...columnClauses, ''].join('\n');
  await pool.query(ddl);
}
/**
 * Script entry point: prepares the schema, then runs the Google enrichment.
 *
 * Any rejection from either step is logged as a fatal error and the process
 * exits with status 1.
 */
function main() {
  return setupDatabase()
    .then(() => enrichDispensariesFromGoogle())
    .catch((fatal) => {
      console.error('Fatal error:', fatal);
      process.exit(1);
    });
}

// Failures are handled inside main(), so the returned promise is deliberately
// not awaited here.
void main();