Initial commit - Dutchie dispensary scraper

This commit is contained in:
Kelly
2025-11-28 19:45:44 -07:00
commit 5757a8e9bd
23375 changed files with 3788799 additions and 0 deletions

View File

@@ -0,0 +1,171 @@
import { chromium } from 'playwright';
import { pool } from './src/db/migrate';
import { getRandomProxy } from './src/utils/proxyManager';
import * as fs from 'fs';
/**
 * Manual debugging aid for the Google-search scraping path.
 *
 * Opens a visible Chromium window routed through a proxy obtained from
 * `getRandomProxy()`, runs one hard-coded Google search, and dumps diagnostics:
 *
 * - a full-page screenshot to `/tmp/google-search-debug.png`
 * - the page HTML to `/tmp/google-search-debug.html`
 * - every text node containing "all greens" (case-insensitive) with its parent
 *   element's tag, class, id and `data-*` attributes
 * - the first match for each candidate business-name selector
 * - the first 10 links whose href does not mention google/youtube/facebook
 * - anything in the page text that looks like a US phone number
 *
 * The browser is then kept open for 30 seconds for manual inspection.
 *
 * Cleanup is guaranteed: the imported `pool` is ended on every exit path
 * (including "no proxy" and any thrown error), and the browser is closed
 * whenever it was successfully launched.
 *
 * @returns Resolves once diagnostics are printed and resources are released.
 * @throws Re-throws any error raised by the proxy lookup, Playwright, or the
 *   file write, after cleanup has run.
 */
async function debugGoogleScraper(): Promise<void> {
  console.log('🔍 Debugging Google scraper with proxy\n');

  try {
    // Get a proxy
    const proxy = await getRandomProxy();
    if (!proxy) {
      console.log('❌ No proxies available');
      return; // the outer `finally` still ends the pool
    }
    console.log(`🔌 Using proxy: ${proxy.server}\n`);

    const browser = await chromium.launch({
      headless: false, // Run in visible mode
      args: ['--disable-blink-features=AutomationControlled']
    });

    // Everything after a successful launch lives inside this try so that a
    // failure while creating the context/page cannot leak the browser process.
    try {
      // NOTE(review): `any` is kept because the shape returned by
      // getRandomProxy() is not visible from this file; tighten to Playwright's
      // context-options type once the proxy field types are confirmed.
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      const contextOptions: any = {
        userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        viewport: { width: 1920, height: 1080 },
        locale: 'en-US',
        timezoneId: 'America/Phoenix',
        geolocation: { latitude: 33.4484, longitude: -112.0740 },
        permissions: ['geolocation'],
        proxy: {
          server: proxy.server,
          username: proxy.username,
          password: proxy.password
        }
      };
      const context = await browser.newContext(contextOptions);

      // Add stealth: mask the two most common automation fingerprints before
      // any page script runs.
      await context.addInitScript(() => {
        Object.defineProperty(navigator, 'webdriver', { get: () => false });
        (window as any).chrome = { runtime: {} };
      });

      const page = await context.newPage();

      // Test with the "All Greens Dispensary" example
      const testAddress = '1035 W Main St, Quartzsite, AZ 85346';
      const searchQuery = `${testAddress} dispensary`;
      const searchUrl = `https://www.google.com/search?q=${encodeURIComponent(searchQuery)}`;
      console.log(`🔍 Testing search: ${searchQuery}`);
      console.log(`📍 URL: ${searchUrl}\n`);

      await page.goto(searchUrl, { waitUntil: 'networkidle', timeout: 30000 });
      // Extra settle time after the network goes idle.
      await page.waitForTimeout(3000);

      // Take screenshot
      await page.screenshot({ path: '/tmp/google-search-debug.png', fullPage: true });
      console.log('📸 Screenshot saved to /tmp/google-search-debug.png\n');

      // Get the full HTML
      const html = await page.content();
      fs.writeFileSync('/tmp/google-search-debug.html', html);
      console.log('💾 HTML saved to /tmp/google-search-debug.html\n');

      // Try to find any text that looks like "All Greens"
      const pageText = await page.evaluate(() => document.body.innerText);
      const hasAllGreens = pageText.toLowerCase().includes('all greens');
      console.log(`🔍 Page contains "All Greens": ${hasAllGreens}\n`);

      if (hasAllGreens) {
        console.log('✅ Google found the business!\n');

        // Shape of one match reported back from the browser context. Only used
        // as a type, so it is erased and safe to reference inside `evaluate`.
        type TextMatch = {
          text: string;
          tagName: string | undefined;
          className: string | undefined;
          id: string | undefined;
          dataAttrs: string[];
        };

        // Let's try to find where the name appears in the DOM.
        // The callback runs in the page, so it must not close over Node-side values.
        const nameInfo = await page.evaluate(() => {
          const results: TextMatch[] = [];
          const walker = document.createTreeWalker(
            document.body,
            NodeFilter.SHOW_TEXT,
            null
          );
          let node: Node | null;
          while ((node = walker.nextNode()) !== null) {
            const text = node.textContent?.trim() || '';
            if (text.toLowerCase().includes('all greens')) {
              const element = node.parentElement;
              results.push({
                text: text,
                tagName: element?.tagName,
                className: element?.className,
                id: element?.id,
                dataAttrs: Array.from(element?.attributes || [])
                  .filter(attr => attr.name.startsWith('data-'))
                  .map(attr => `${attr.name}="${attr.value}"`)
              });
            }
          }
          return results;
        });
        console.log('📍 Found "All Greens" in these elements:');
        console.log(JSON.stringify(nameInfo, null, 2));
      }

      // Try current selectors
      console.log('\n🧪 Testing current selectors:\n');
      const nameSelectors = [
        '[data-attrid="title"]',
        'h2[data-attrid="title"]',
        '.SPZz6b h2',
        'h3.LC20lb',
        '.kp-header .SPZz6b'
      ];
      for (const selector of nameSelectors) {
        const element = await page.$(selector);
        if (element) {
          const text = await element.textContent();
          console.log(`${selector}: "${text?.trim()}"`);
        } else {
          console.log(`${selector}: not found`);
        }
      }

      // Look for website links
      console.log('\n🔗 Looking for website links:\n');
      const links = await page.evaluate(() => {
        const allLinks = Array.from(document.querySelectorAll('a[href]'));
        return allLinks
          .filter(a => {
            const href = (a as HTMLAnchorElement).href;
            // Substring match on the whole URL, not just the hostname.
            return href &&
              !href.includes('google.com') &&
              !href.includes('youtube.com') &&
              !href.includes('facebook.com');
          })
          .slice(0, 10)
          .map(a => ({
            href: (a as HTMLAnchorElement).href,
            text: a.textContent?.trim().substring(0, 50),
            className: a.className
          }));
      });
      console.log('First 10 non-Google links:');
      console.log(JSON.stringify(links, null, 2));

      // Look for phone numbers: optional parenthesised 3-digit area code,
      // then 3 and 4 digits, each optionally separated by '-', '.' or whitespace.
      console.log('\n📞 Looking for phone numbers:\n');
      const phoneMatches = pageText.match(/\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/g);
      if (phoneMatches) {
        console.log('Found phone numbers:', phoneMatches);
      } else {
        console.log('No phone numbers found in page text');
      }

      console.log('\n⏸ Browser will stay open for 30 seconds for manual inspection...');
      await page.waitForTimeout(30000);
    } finally {
      await browser.close();
    }
  } finally {
    // Runs even if browser.close() itself throws.
    await pool.end();
  }
}
// Script entry point: start the debug session and print any rejection to stderr.
debugGoogleScraper().catch((error) => {
  console.error(error);
});