Files
cannaiq/backend/archive/debug-scrape.ts
Kelly d91c55a344 feat: Add stale process monitor, users route, landing page, archive old scripts
- Add backend stale process monitoring API (/api/stale-processes)
- Add users management route
- Add frontend landing page and stale process monitor UI on /scraper-tools
- Move old development scripts to backend/archive/
- Update frontend build with new features

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-05 04:07:31 -07:00

92 lines
2.9 KiB
TypeScript

import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import { Pool } from 'pg';
import fs from 'fs';
// Register the stealth plugin on the shared puppeteer-extra instance before
// any browser is launched.
puppeteer.use(StealthPlugin());

// Prefer DATABASE_URL from the environment so credentials do not have to live
// in source control; the literal below is kept only as the local-development
// fallback, which preserves the script's previous behavior when the variable
// is unset.
const pool = new Pool({
  connectionString:
    process.env.DATABASE_URL ??
    'postgresql://sail:password@localhost:5432/dutchie_menus'
});
/**
 * One-off diagnostic scrape of a single store page.
 *
 * Picks a random row from the `proxies` table, launches a headless browser
 * through that proxy with a Googlebot user agent, logs every request and
 * response (with headers), then dumps what the page rendered: a summary to
 * stdout, the body HTML to /tmp/page.html and a full-page screenshot to
 * /tmp/screenshot.png.
 *
 * Errors raised during the scrape are logged and not rethrown. The browser
 * (if launched) and the pg pool are always released afterwards, so this
 * function can only be run once per process.
 *
 * @returns A promise that resolves once cleanup has finished. It rejects only
 *   if closing the browser or ending the pool fails.
 */
async function debug(): Promise<void> {
  const targetUrl = 'https://curaleaf.com/stores/curaleaf-dispensary-phoenix-airport/brands';
  const settleDelayMs = 3000;
  const previewLength = 500;

  // Assigned inside `try`, read in `finally` so a partially started run still
  // gets its browser closed.
  let browser;
  try {
    // Get proxy
    const proxyResult = await pool.query(
      `SELECT host, port, protocol FROM proxies ORDER BY RANDOM() LIMIT 1`
    );
    const proxy = proxyResult.rows[0];
    if (!proxy) {
      // An empty table would otherwise surface as an opaque TypeError on
      // `proxy.protocol`; fail with a message that names the real cause.
      throw new Error('No proxies found in the proxies table');
    }
    const proxyUrl = `${proxy.protocol}://${proxy.host}:${proxy.port}`;
    console.log('🔌 Proxy:', proxyUrl);

    browser = await puppeteer.launch({
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox', `--proxy-server=${proxyUrl}`]
    });
    const page = await browser.newPage();

    // Set Googlebot UA
    await page.setUserAgent('Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');

    // Log all requests being made
    page.on('request', request => {
      console.log('\n📤 REQUEST:', request.method(), request.url());
      console.log(' Headers:', JSON.stringify(request.headers(), null, 2));
    });

    // Log all responses
    page.on('response', response => {
      console.log('\n📥 RESPONSE:', response.status(), response.url());
      console.log(' Headers:', JSON.stringify(response.headers(), null, 2));
    });

    console.log('\n🌐 Going to:', targetUrl);
    await page.goto(targetUrl, { waitUntil: 'networkidle2', timeout: 60000 });

    // Fixed pause after navigation. Uses a plain timer instead of
    // page.waitForTimeout, which is deprecated and absent from newer
    // Puppeteer releases.
    await new Promise<void>(resolve => setTimeout(resolve, settleDelayMs));

    // Get what the browser sees
    const pageData = await page.evaluate(() => ({
      title: document.title,
      url: window.location.href,
      userAgent: navigator.userAgent,
      bodyHTML: document.body.innerHTML,
      bodyText: document.body.innerText
    }));

    console.log('\n📄 PAGE DATA:');
    console.log('Title:', pageData.title);
    console.log('URL:', pageData.url);
    console.log('User Agent (browser sees):', pageData.userAgent);
    console.log('Body HTML length:', pageData.bodyHTML.length, 'chars');
    console.log('Body text length:', pageData.bodyText.length, 'chars');

    // Save HTML to file
    fs.writeFileSync('/tmp/page.html', pageData.bodyHTML);
    console.log('\n💾 Saved HTML to /tmp/page.html');

    // Save screenshot
    await page.screenshot({ path: '/tmp/screenshot.png', fullPage: true });
    console.log('📸 Saved screenshot to /tmp/screenshot.png');

    // Show first 500 chars of HTML
    console.log('\n📝 First 500 chars of HTML:');
    console.log(pageData.bodyHTML.substring(0, previewLength));

    // Show first 500 chars of text
    console.log('\n📝 First 500 chars of text:');
    console.log(pageData.bodyText.substring(0, previewLength));
  } catch (error: unknown) {
    // Thrown values are not guaranteed to be Error instances; narrow first.
    const message = error instanceof Error ? error.message : String(error);
    console.error('❌ Error:', message);
  } finally {
    // Nested try/finally so a failing browser.close() cannot skip pool.end()
    // and leave the pg pool open.
    try {
      if (browser) await browser.close();
    } finally {
      await pool.end();
    }
  }
}
// Script entry point. debug() logs scrape errors itself, but its cleanup
// (closing the browser, ending the pool) can still reject; handle that
// explicitly instead of leaving a floating promise, and keep a non-zero exit
// status for that case.
debug().catch((error: unknown) => {
  console.error('❌ Fatal:', error);
  process.exitCode = 1;
});