172 lines
5.5 KiB
TypeScript
172 lines
5.5 KiB
TypeScript
import { chromium } from 'playwright';
|
||
import { pool } from './src/db/migrate';
|
||
import { getRandomProxy } from './src/utils/proxyManager';
|
||
import * as fs from 'fs';
|
||
|
||
/**
 * Manual debugging aid for the Google scraper.
 *
 * Picks a proxy via `getRandomProxy()`, opens a visible Chromium window routed
 * through it, runs one hard-coded Google search and prints diagnostics:
 *  - a full-page screenshot and an HTML dump written under /tmp,
 *  - whether (and where in the DOM) the expected business name appears,
 *  - which of the candidate name selectors match,
 *  - the first ten links that do not point at Google/YouTube/Facebook,
 *  - phone-number-like strings found in the page text.
 *
 * The database pool is always ended, and the browser is always closed once it
 * has been launched, even when a step fails part-way through.
 *
 * @returns Resolves once diagnostics are printed and resources are released;
 *          rejects if any step (proxy lookup, launch, navigation, ...) fails.
 */
async function debugGoogleScraper(): Promise<void> {
  console.log('🔍 Debugging Google scraper with proxy\n');

  // Business name the test search is expected to surface. Matching is
  // case-insensitive, so the lowercase form is what gets compared.
  const expectedName = 'All Greens';
  const expectedNameLower = expectedName.toLowerCase();

  // Shape of one DOM hit reported by the text-node scan below.
  type TextHit = {
    text: string;
    tagName: string | undefined;
    className: string | undefined;
    id: string | undefined;
    dataAttrs: string[];
  };

  try {
    // Get a proxy
    const proxy = await getRandomProxy();
    if (!proxy) {
      console.log('❌ No proxies available');
      return; // the outer `finally` still ends the pool
    }

    console.log(`🔌 Using proxy: ${proxy.server}\n`);

    const browser = await chromium.launch({
      headless: false, // Run in visible mode
      args: ['--disable-blink-features=AutomationControlled']
    });

    // Everything after launch lives inside this try so that a failure while
    // creating the context/page can no longer leak the browser process.
    try {
      // NOTE(review): `any` bypasses type-checking of these options; it is kept
      // because the proxy object's field types are not visible in this file.
      const contextOptions: any = {
        userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        viewport: { width: 1920, height: 1080 },
        locale: 'en-US',
        timezoneId: 'America/Phoenix',
        geolocation: { latitude: 33.4484, longitude: -112.0740 },
        permissions: ['geolocation'],
        proxy: {
          server: proxy.server,
          username: proxy.username,
          password: proxy.password
        }
      };

      const context = await browser.newContext(contextOptions);

      // Add stealth: mask the automation flag and fake a `window.chrome` object.
      await context.addInitScript(() => {
        Object.defineProperty(navigator, 'webdriver', { get: () => false });
        (window as any).chrome = { runtime: {} };
      });

      const page = await context.newPage();

      // Test with the "All Greens Dispensary" example
      const testAddress = '1035 W Main St, Quartzsite, AZ 85346';
      const searchQuery = `${testAddress} dispensary`;
      const searchUrl = `https://www.google.com/search?q=${encodeURIComponent(searchQuery)}`;

      console.log(`🔍 Testing search: ${searchQuery}`);
      console.log(`📍 URL: ${searchUrl}\n`);

      await page.goto(searchUrl, { waitUntil: 'networkidle', timeout: 30000 });
      // Extra settle time after navigation before capturing the page.
      await page.waitForTimeout(3000);

      // Take screenshot
      await page.screenshot({ path: '/tmp/google-search-debug.png', fullPage: true });
      console.log('📸 Screenshot saved to /tmp/google-search-debug.png\n');

      // Get the full HTML
      const html = await page.content();
      fs.writeFileSync('/tmp/google-search-debug.html', html);
      console.log('💾 HTML saved to /tmp/google-search-debug.html\n');

      // Try to find any text that looks like the expected business name
      const pageText = await page.evaluate(() => document.body.innerText);
      const hasAllGreens = pageText.toLowerCase().includes(expectedNameLower);
      console.log(`🔍 Page contains "${expectedName}": ${hasAllGreens}\n`);

      if (hasAllGreens) {
        console.log('✅ Google found the business!\n');

        // Locate where the name appears in the DOM. The callback runs inside
        // the page, so the search string is passed in as an argument rather
        // than captured from this scope.
        const nameInfo = await page.evaluate((needle: string) => {
          const results: TextHit[] = [];
          const walker = document.createTreeWalker(
            document.body,
            NodeFilter.SHOW_TEXT,
            null
          );

          for (let node = walker.nextNode(); node !== null; node = walker.nextNode()) {
            const text = node.textContent?.trim() ?? '';
            if (!text.toLowerCase().includes(needle)) {
              continue;
            }

            const element = node.parentElement;
            results.push({
              text,
              tagName: element?.tagName,
              className: element?.className,
              id: element?.id,
              dataAttrs: Array.from(element?.attributes || [])
                .filter(attr => attr.name.startsWith('data-'))
                .map(attr => `${attr.name}="${attr.value}"`)
            });
          }

          return results;
        }, expectedNameLower);

        console.log(`📍 Found "${expectedName}" in these elements:`);
        console.log(JSON.stringify(nameInfo, null, 2));
      }

      // Try current selectors
      console.log('\n🧪 Testing current selectors:\n');

      const nameSelectors = [
        '[data-attrid="title"]',
        'h2[data-attrid="title"]',
        '.SPZz6b h2',
        'h3.LC20lb',
        '.kp-header .SPZz6b'
      ];

      for (const selector of nameSelectors) {
        const element = await page.$(selector);
        if (element) {
          const text = await element.textContent();
          console.log(`✅ ${selector}: "${text?.trim()}"`);
        } else {
          console.log(`❌ ${selector}: not found`);
        }
      }

      // Look for website links
      console.log('\n🔗 Looking for website links:\n');
      const links = await page.evaluate(() => {
        const allLinks = Array.from(document.querySelectorAll('a[href]'));
        return allLinks
          .filter(a => {
            const href = (a as HTMLAnchorElement).href;
            return href &&
              !href.includes('google.com') &&
              !href.includes('youtube.com') &&
              !href.includes('facebook.com');
          })
          .slice(0, 10)
          .map(a => ({
            href: (a as HTMLAnchorElement).href,
            text: a.textContent?.trim().substring(0, 50),
            className: a.className
          }));
      });

      console.log('First 10 non-Google links:');
      console.log(JSON.stringify(links, null, 2));

      // Look for phone numbers (3-3-4 digit groups, optional parentheses/separators)
      console.log('\n📞 Looking for phone numbers:\n');
      const phoneMatches = pageText.match(/\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/g);
      if (phoneMatches) {
        console.log('Found phone numbers:', phoneMatches);
      } else {
        console.log('No phone numbers found in page text');
      }

      console.log('\n⏸️ Browser will stay open for 30 seconds for manual inspection...');
      await page.waitForTimeout(30000);
    } finally {
      await browser.close();
    }
  } finally {
    // Runs on every path (no proxy, success, failure), and even if closing
    // the browser itself throws.
    await pool.end();
  }
}
|
||
|
||
// Run the debug session. A failure is logged and also reported through a
// non-zero exit code, so shells and CI do not mistake a crashed run for success.
debugGoogleScraper().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
|