// Source listing metadata: 285 lines · 8.5 KiB · TypeScript
import { chromium } from 'playwright';
|
|
import { pool } from './src/db/migrate';
|
|
import { getStateProxy, getRandomProxy } from './src/utils/proxyManager';
|
|
|
|
/** How strongly the scraped Google data is believed to match the AZDHS record. */
type EnrichmentConfidence = 'high' | 'medium' | 'low';

/**
 * One AZDHS dispensary row paired with whatever business details were
 * scraped from Google for it.
 *
 * The first six fields are copied from the `azdhs_list` row being processed;
 * the optional fields are only present when the corresponding element was
 * found on the results page.
 */
interface DispensaryEnrichment {
  /** `azdhs_list.id` of the row being enriched. */
  id: number;
  /** Name as recorded in `azdhs_list`. */
  azdhs_name: string;
  /** Street address from `azdhs_list`. */
  address: string;
  /** City from `azdhs_list`. */
  city: string;
  /** State from `azdhs_list`. */
  state: string;
  /** ZIP code from `azdhs_list`. */
  zip: string;

  /** Business name shown by Google, if one was found. */
  dba_name?: string;
  /** First non-Google link found on the results page, if any. */
  website?: string;
  /** Phone number text shown by Google, if any. */
  google_phone?: string;
  /** Star rating parsed from the results page, if any. */
  google_rating?: number;
  /** Review count parsed from the results page, if any. */
  google_review_count?: number;

  /** Match confidence assigned by the enrichment run. */
  confidence: EnrichmentConfidence;
  /** Free-form reviewer notes. */
  notes?: string;
}
|
|
|
|
/** Text scraped from the rendered Google results page, before numeric parsing. */
interface ScrapedGoogleFields {
  name?: string;
  website?: string;
  phone?: string;
  ratingText?: string;
  reviewText?: string;
}

/** Business details derived from a Google results page; a field is undefined when nothing matched. */
interface GoogleBusinessData {
  name?: string;
  website?: string;
  phone?: string;
  rating?: number;
  reviewCount?: number;
}

/** Extracts the first decimal number (e.g. "4.7") from a rating label. */
function parseGoogleRating(text: string | null | undefined): number | undefined {
  const match = text?.match(/\d+\.?\d*/);
  return match ? parseFloat(match[0]) : undefined;
}

/** Extracts the first integer from a review label, honouring thousands separators ("1,234" -> 1234). */
function parseGoogleReviewCount(text: string | null | undefined): number | undefined {
  const match = text?.match(/\d[\d,]*/);
  return match ? parseInt(match[0].replace(/,/g, ''), 10) : undefined;
}

/** Rates a scrape: all of name/website/phone is high; name plus one of the others is medium; else low. */
function classifyConfidence(
  data: Pick<GoogleBusinessData, 'name' | 'website' | 'phone'>
): 'high' | 'medium' | 'low' {
  if (data.name && data.website && data.phone) {
    return 'high';
  }
  if (data.name && (data.website || data.phone)) {
    return 'medium';
  }
  return 'low';
}

/**
 * Looks up AZDHS dispensaries that have no website on Google (through a proxy)
 * and writes back the business name, website, rating and review count for
 * high-confidence matches. Everything else is collected and printed as a
 * table for manual review.
 *
 * The database pool is always ended before this function settles, and the
 * browser is always closed once it has been launched, even when context or
 * page setup fails.
 *
 * @param limit Maximum number of `azdhs_list` rows to process in this run.
 *   Must be a non-negative integer; defaults to 2, the previously hard-coded value.
 * @returns A promise that resolves once the run has finished and resources are released.
 * @throws Re-throws any error raised outside the per-dispensary loop
 *   (proxy lookup, browser launch, the initial SELECT, resource shutdown).
 */
async function enrichDispensariesFromGoogle(limit = 2): Promise<void> {
  console.log('🔍 Starting Google enrichment for AZDHS dispensaries\n');

  try {
    // Get an Arizona proxy if available, otherwise any proxy
    let proxy = await getStateProxy('Arizona');
    if (!proxy) {
      console.log('⚠️ No Arizona proxy available, trying any US proxy...');
      proxy = await getRandomProxy();
    }

    if (!proxy) {
      console.log('❌ No proxies available. Please add proxies to the database.');
      return;
    }

    console.log(`🔌 Using proxy: ${proxy.server}\n`);

    const browser = await chromium.launch({
      headless: true,
      args: ['--disable-blink-features=AutomationControlled'],
    });

    // Everything after launch runs under this try so the browser is closed
    // even if context/page creation throws.
    try {
      // Left as `any`: the proxy record's type is declared elsewhere and may
      // not line up exactly with Playwright's context option types.
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      const contextOptions: any = {
        userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        viewport: { width: 1920, height: 1080 },
        locale: 'en-US',
        timezoneId: 'America/Phoenix',
        geolocation: { latitude: 33.4484, longitude: -112.0740 }, // Phoenix, AZ
        permissions: ['geolocation'],
        proxy: {
          server: proxy.server,
          username: proxy.username,
          password: proxy.password,
        },
      };

      const context = await browser.newContext(contextOptions);

      // Add stealth techniques
      await context.addInitScript(() => {
        // Remove webdriver flag
        Object.defineProperty(navigator, 'webdriver', { get: () => false });

        // Chrome runtime
        (window as any).chrome = { runtime: {} };

        // Permissions: bind the native method, otherwise calling it detached
        // from `navigator.permissions` throws "Illegal invocation".
        const originalQuery = window.navigator.permissions.query.bind(window.navigator.permissions);
        window.navigator.permissions.query = (parameters: any) => (
          parameters.name === 'notifications' ?
            Promise.resolve({ state: Notification.permission } as PermissionStatus) :
            originalQuery(parameters)
        );
      });

      const page = await context.newPage();

      // Get all dispensaries that don't have website yet
      const result = await pool.query(
        `
        SELECT id, name, address, city, state, zip, phone
        FROM azdhs_list
        WHERE website IS NULL OR website = ''
        ORDER BY id
        LIMIT $1
        `,
        [limit]
      );

      const dispensaries = result.rows;
      console.log(`📋 Found ${dispensaries.length} dispensaries to enrich\n`);

      let enriched = 0;
      let failed = 0;
      const needsReview: DispensaryEnrichment[] = [];

      for (const disp of dispensaries) {
        console.log(`\n🔍 Processing: ${disp.name}`);
        console.log(` Address: ${disp.address}, ${disp.city}, ${disp.state} ${disp.zip}`);

        try {
          // Search Google for the address + dispensary
          const searchQuery = `${disp.address}, ${disp.city}, ${disp.state} ${disp.zip} dispensary`;
          const searchUrl = `https://www.google.com/search?q=${encodeURIComponent(searchQuery)}`;

          console.log(` Searching: ${searchQuery}`);
          await page.goto(searchUrl, { waitUntil: 'domcontentloaded', timeout: 15000 });
          await page.waitForTimeout(2000);

          // Try to extract Google Business info. The callback is serialized and
          // run inside the page, so it must not reference anything from Node scope.
          const scraped: ScrapedGoogleFields = await page.evaluate(() => {
            // First non-blank text among the selectors that passes the optional check.
            const firstText = (
              selectors: string[],
              accept?: (text: string) => boolean
            ): string | undefined => {
              for (const selector of selectors) {
                const text = document.querySelector(selector)?.textContent?.trim();
                if (text && (!accept || accept(text))) {
                  return text;
                }
              }
              return undefined;
            };

            // First link among the selectors that does not point back at Google.
            const firstExternalHref = (selectors: string[]): string | undefined => {
              for (const selector of selectors) {
                const href = document.querySelector<HTMLAnchorElement>(selector)?.href;
                if (href && !href.includes('google.com')) {
                  return href;
                }
              }
              return undefined;
            };

            return {
              name: firstText([
                '[data-attrid="title"]',
                'h2[data-attrid="title"]',
                '.SPZz6b h2',
                'h3.LC20lb',
                '.kp-header .SPZz6b',
              ]),
              website: firstExternalHref([
                'a[data-dtype="d3ph"]',
                '.yuRUbf a',
                'a.ab_button[href^="http"]',
              ]),
              phone: firstText(
                [
                  '[data-dtype="d3ph"]',
                  'span[data-dtype="d3ph"]',
                  '.LrzXr.zdqRlf',
                ],
                (text) => /\d{3}.*\d{3}.*\d{4}/.test(text)
              ),
              ratingText: document.querySelector('.Aq14fc')?.textContent ?? undefined,
              reviewText: document.querySelector('.hqzQac span')?.textContent ?? undefined,
            };
          });

          const businessData: GoogleBusinessData = {
            name: scraped.name,
            website: scraped.website,
            phone: scraped.phone,
            rating: parseGoogleRating(scraped.ratingText),
            reviewCount: parseGoogleReviewCount(scraped.reviewText),
          };

          console.log(` Found data:`, businessData);

          // Determine confidence level
          const confidence = classifyConfidence(businessData);

          const enrichment: DispensaryEnrichment = {
            id: disp.id,
            azdhs_name: disp.name,
            address: disp.address,
            city: disp.city,
            state: disp.state,
            zip: disp.zip,
            dba_name: businessData.name,
            website: businessData.website,
            google_phone: businessData.phone,
            google_rating: businessData.rating,
            google_review_count: businessData.reviewCount,
            confidence,
          };

          if (confidence === 'high') {
            // Auto-update high confidence matches
            await pool.query(
              `
              UPDATE azdhs_list
              SET
                dba_name = $1,
                website = $2,
                google_rating = $3,
                google_review_count = $4,
                updated_at = CURRENT_TIMESTAMP
              WHERE id = $5
              `,
              [
                businessData.name,
                businessData.website,
                businessData.rating,
                businessData.reviewCount,
                disp.id,
              ]
            );

            console.log(` ✅ Updated (high confidence)`);
            enriched++;
          } else {
            // Flag for manual review
            needsReview.push(enrichment);
            console.log(` ⚠️ Needs review (${confidence} confidence)`);
          }
        } catch (error: unknown) {
          // A failure on one dispensary (navigation, scrape or UPDATE) must not abort the run.
          console.log(` ❌ Error: ${error}`);
          failed++;
        }

        // Rate limiting - wait between requests
        await page.waitForTimeout(3000 + Math.random() * 2000);
      }

      console.log('\n' + '='.repeat(80));
      console.log(`\n📊 Summary:`);
      console.log(` ✅ Enriched: ${enriched}`);
      console.log(` ⚠️ Needs review: ${needsReview.length}`);
      console.log(` ❌ Failed: ${failed}`);

      if (needsReview.length > 0) {
        console.log('\n📋 Dispensaries needing manual review:\n');
        console.table(needsReview.map(d => ({
          ID: d.id,
          'AZDHS Name': d.azdhs_name.substring(0, 30),
          'Google Name': d.dba_name?.substring(0, 30) || '-',
          Website: d.website ? 'Yes' : 'No',
          Phone: d.google_phone ? 'Yes' : 'No',
          Confidence: d.confidence,
        })));
      }
    } finally {
      await browser.close();
    }
  } finally {
    // Runs on every path (no proxy, success, failure), including when browser.close() throws.
    await pool.end();
  }
}
|
|
|
|
/**
 * Ensures `azdhs_list` has the columns the enrichment run writes to.
 *
 * Each column is added with `IF NOT EXISTS`, so running this against a table
 * that already has them leaves it unchanged.
 */
async function setupDatabase(): Promise<void> {
  const addEnrichmentColumnsSql = `
    ALTER TABLE azdhs_list
    ADD COLUMN IF NOT EXISTS dba_name VARCHAR(255),
    ADD COLUMN IF NOT EXISTS google_rating DECIMAL(2,1),
    ADD COLUMN IF NOT EXISTS google_review_count INTEGER
  `;

  await pool.query(addEnrichmentColumnsSql);
}
|
|
|
|
/**
 * Script entry point: prepares the schema, then runs the Google enrichment.
 *
 * Any error escaping either step is logged and the process exits with
 * status 1.
 */
async function main(): Promise<void> {
  try {
    await setupDatabase();
    await enrichDispensariesFromGoogle();
  } catch (fatal: unknown) {
    console.error('Fatal error:', fatal);
    process.exit(1);
  }
}

// Fire-and-forget is safe here: main() handles its own failures.
void main();
|