Files
cannaiq/backend/archive/debug-google-scraper.ts
Kelly d91c55a344 feat: Add stale process monitor, users route, landing page, archive old scripts
- Add backend stale process monitoring API (/api/stale-processes)
- Add users management route
- Add frontend landing page and stale process monitor UI on /scraper-tools
- Move old development scripts to backend/archive/
- Update frontend build with new features

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-05 04:07:31 -07:00

172 lines
5.5 KiB
TypeScript
Raw Permalink Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { chromium } from 'playwright';
import { pool } from './src/db/migrate';
import { getRandomProxy } from './src/utils/proxyManager';
import * as fs from 'fs';
/**
 * Manual debugging harness for the Google-search scraping path.
 *
 * Opens a visible Chromium window routed through a proxy obtained from
 * `getRandomProxy()`, runs one hard-coded search, and prints diagnostics:
 *   - a full-page screenshot and the raw HTML (written under /tmp),
 *   - every text node mentioning "All Greens" with its parent element details,
 *   - the result of each candidate business-name selector,
 *   - the first 10 links that do not mention google/youtube/facebook,
 *   - phone-number-like strings found in the page text.
 *
 * Cleanup is guaranteed: the shared `pool` is ended on every exit path, and the
 * browser is closed whenever it was launched, even if context/page setup or the
 * proxy lookup throws.
 *
 * @returns A promise that resolves once the browser is closed and the pool ended.
 */
async function debugGoogleScraper(): Promise<void> {
  console.log('🔍 Debugging Google scraper with proxy\n');

  try {
    // Get a proxy
    const proxy = await getRandomProxy();
    if (!proxy) {
      console.log('❌ No proxies available');
      // The outer `finally` ends the pool.
      return;
    }

    console.log(`🔌 Using proxy: ${proxy.server}\n`);

    const browser = await chromium.launch({
      headless: false, // Run in visible mode
      args: ['--disable-blink-features=AutomationControlled']
    });

    // Everything after a successful launch lives inside this try so the browser
    // is closed even when creating the context or the page fails.
    try {
      const contextOptions: any = {
        userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        viewport: { width: 1920, height: 1080 },
        locale: 'en-US',
        timezoneId: 'America/Phoenix',
        geolocation: { latitude: 33.4484, longitude: -112.0740 },
        permissions: ['geolocation'],
        proxy: {
          server: proxy.server,
          username: proxy.username,
          password: proxy.password
        }
      };

      const context = await browser.newContext(contextOptions);

      // Add stealth: runs in every page before its own scripts.
      await context.addInitScript(() => {
        Object.defineProperty(navigator, 'webdriver', { get: () => false });
        (window as any).chrome = { runtime: {} };
      });

      const page = await context.newPage();

      // Test with the "All Greens Dispensary" example
      const testAddress = '1035 W Main St, Quartzsite, AZ 85346';
      const searchQuery = `${testAddress} dispensary`;
      const searchUrl = `https://www.google.com/search?q=${encodeURIComponent(searchQuery)}`;

      console.log(`🔍 Testing search: ${searchQuery}`);
      console.log(`📍 URL: ${searchUrl}\n`);

      await page.goto(searchUrl, { waitUntil: 'networkidle', timeout: 30000 });
      // Extra settle time after navigation before capturing anything.
      await page.waitForTimeout(3000);

      // Take screenshot
      await page.screenshot({ path: '/tmp/google-search-debug.png', fullPage: true });
      console.log('📸 Screenshot saved to /tmp/google-search-debug.png\n');

      // Get the full HTML
      const html = await page.content();
      fs.writeFileSync('/tmp/google-search-debug.html', html);
      console.log('💾 HTML saved to /tmp/google-search-debug.html\n');

      // Try to find any text that looks like "All Greens"
      const pageText = await page.evaluate(() => document.body.innerText);
      const hasAllGreens = pageText.toLowerCase().includes('all greens');
      console.log(`🔍 Page contains "All Greens": ${hasAllGreens}\n`);

      if (hasAllGreens) {
        console.log('✅ Google found the business!\n');

        // Locate where the name appears in the DOM. The callback executes in the
        // page, so it must be self-contained (no references to outer variables).
        const nameInfo = await page.evaluate(() => {
          const results: Array<{
            text: string;
            tagName: string | undefined;
            className: string | undefined;
            id: string | undefined;
            dataAttrs: string[];
          }> = [];

          const walker = document.createTreeWalker(
            document.body,
            NodeFilter.SHOW_TEXT,
            null
          );

          for (let node = walker.nextNode(); node !== null; node = walker.nextNode()) {
            const text = node.textContent?.trim() ?? '';
            if (!text.toLowerCase().includes('all greens')) {
              continue;
            }

            const element = node.parentElement;
            results.push({
              text: text,
              tagName: element?.tagName,
              className: element?.className,
              id: element?.id,
              // Only data-* attributes, rendered as name="value".
              dataAttrs: Array.from(element?.attributes ?? [])
                .filter(attr => attr.name.startsWith('data-'))
                .map(attr => `${attr.name}="${attr.value}"`)
            });
          }

          return results;
        });

        console.log('📍 Found "All Greens" in these elements:');
        console.log(JSON.stringify(nameInfo, null, 2));
      }

      // Try current selectors
      console.log('\n🧪 Testing current selectors:\n');
      const nameSelectors = [
        '[data-attrid="title"]',
        'h2[data-attrid="title"]',
        '.SPZz6b h2',
        'h3.LC20lb',
        '.kp-header .SPZz6b'
      ];

      for (const selector of nameSelectors) {
        const element = await page.$(selector);
        if (element) {
          const text = await element.textContent();
          console.log(`${selector}: "${text?.trim()}"`);
        } else {
          console.log(`${selector}: not found`);
        }
      }

      // Look for website links
      console.log('\n🔗 Looking for website links:\n');
      const links = await page.evaluate(() => {
        // Substring match against the whole resolved href, not just the hostname.
        const excluded = ['google.com', 'youtube.com', 'facebook.com'];
        const anchors = Array.from(document.querySelectorAll('a[href]'));

        return anchors
          .filter(a => {
            const href = (a as HTMLAnchorElement).href;
            return href !== '' && !excluded.some(domain => href.includes(domain));
          })
          .slice(0, 10)
          .map(a => ({
            href: (a as HTMLAnchorElement).href,
            text: a.textContent?.trim().substring(0, 50),
            className: a.className
          }));
      });

      console.log('First 10 non-Google links:');
      console.log(JSON.stringify(links, null, 2));

      // Look for phone numbers: 3-3-4 digit groups, optional parentheses around
      // the first group, and an optional '-', '.' or whitespace between groups.
      console.log('\n📞 Looking for phone numbers:\n');
      const phoneMatches = pageText.match(/\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/g);
      if (phoneMatches) {
        console.log('Found phone numbers:', phoneMatches);
      } else {
        console.log('No phone numbers found in page text');
      }

      console.log('\n⏸ Browser will stay open for 30 seconds for manual inspection...');
      await page.waitForTimeout(30000);
    } finally {
      await browser.close();
    }
  } finally {
    // Runs on every path (no proxy, setup failure, scrape failure, success), and
    // also when browser.close() itself rejects.
    await pool.end();
  }
}
// Script entry point: start the debug session and print any rejection to stderr.
debugGoogleScraper().catch((error: unknown) => {
  console.error(error);
});