Files
cannaiq/backend/scrape-with-age-gate.ts
2025-11-28 19:45:44 -07:00

207 lines
6.4 KiB
TypeScript

import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import { Pool } from 'pg';
// Register the stealth plugin on the shared puppeteer-extra instance before
// any browser is launched.
puppeteer.use(StealthPlugin());

/**
 * PostgreSQL connection pool used to look up a proxy for the scrape.
 *
 * The connection string is taken from the DATABASE_URL environment variable
 * when it is set, so credentials do not have to live in source control. The
 * original hard-coded local development DSN is kept as the fallback so
 * existing local runs keep working unchanged.
 */
const pool = new Pool({
  connectionString:
    process.env.DATABASE_URL ??
    'postgresql://sail:password@localhost:5432/dutchie_menus'
});
/** Browser handle type, derived from `puppeteer.launch()` so no extra import is required. */
type LaunchedBrowser = Awaited<ReturnType<typeof puppeteer.launch>>;

/** Page (tab) handle type, derived from `browser.newPage()`. */
type BrowserPage = Awaited<ReturnType<LaunchedBrowser['newPage']>>;

/**
 * Resolves after `ms` milliseconds.
 *
 * Used instead of `page.waitForTimeout`, which is deprecated and no longer
 * available in recent Puppeteer releases.
 */
function delay(ms: number): Promise<void> {
  return new Promise(resolve => setTimeout(resolve, ms));
}

/**
 * Attempts to get past the age gate: opens the state dropdown, picks the
 * Arizona option, clicks a submit-like control and waits for the resulting
 * navigation. Progress and failures are logged; the function returns normally
 * even when the gate could not be passed so the caller can still inspect the
 * page.
 *
 * @param page Page currently showing the age gate.
 * @throws If the state dropdown does not appear within 10 seconds.
 */
async function passAgeGate(page: BrowserPage): Promise<void> {
  console.log('🔒 Age gate detected, handling...');

  // Wait for the state selector, then open it.
  await page.waitForSelector('button[role="combobox"]', { timeout: 10000 });
  await page.click('button[role="combobox"]');
  console.log(' ✅ Clicked state dropdown');
  await delay(1000);

  // Runs inside the page: click the first option whose text mentions Arizona.
  const arizonaClicked = await page.evaluate(() => {
    const options = Array.from(document.querySelectorAll('[role="option"]'));
    const azOption = options.find(opt => {
      const text = opt.textContent?.toLowerCase() ?? '';
      return text.includes('arizona') || text.includes('az');
    });
    if (azOption) {
      (azOption as HTMLElement).click();
      return true;
    }
    return false;
  });

  if (!arizonaClicked) {
    console.log(' ❌ Could not find Arizona option');
    // Debug aid: list whatever options the dropdown actually rendered.
    const availableOptions = await page.evaluate(() => {
      return Array.from(document.querySelectorAll('[role="option"]'))
        .map(opt => opt.textContent?.trim())
        .filter(Boolean);
    });
    console.log(' Available options:', availableOptions);
    return;
  }

  console.log(' ✅ Selected Arizona');
  await delay(2000);

  // Arm the navigation wait BEFORE the click is dispatched. If it were armed
  // afterwards, a fast navigation could complete first and the wait would
  // sit idle until its timeout. Both outcomes are mapped to a boolean so the
  // promise can never surface as an unhandled rejection.
  const navigation = page
    .waitForNavigation({ waitUntil: 'networkidle2', timeout: 30000 })
    .then(
      () => true,
      () => false
    );

  // Runs inside the page: click the first button/link that looks like the
  // age-gate confirmation, judged by its visible text or aria-label.
  const submitButtonClicked = await page.evaluate(() => {
    const candidates = Array.from(document.querySelectorAll('button, a'));
    const submitBtn = candidates.find(btn => {
      const text = btn.textContent?.toLowerCase() || '';
      const ariaLabel = btn.getAttribute('aria-label')?.toLowerCase() || '';
      return (
        text.includes('enter') ||
        text.includes('submit') ||
        text.includes('continue') ||
        ariaLabel.includes('age')
      );
    });
    if (submitBtn) {
      (submitBtn as HTMLElement).click();
      return true;
    }
    return false;
  });

  if (submitButtonClicked) {
    console.log(' ✅ Clicked submit button');
    if (await navigation) {
      console.log(' ✅ Age gate passed!');
      console.log(' 📍 New URL:', page.url());
    } else {
      console.log(' ⚠️ Navigation timeout - checking current page...');
      console.log(' 📍 Current URL:', page.url());
    }
  } else {
    console.log(' ⚠️ Could not find submit button, checking if redirect happened anyway...');
    await delay(3000);
    console.log(' 📍 Current URL:', page.url());
  }
}

/**
 * Collects candidate brand names from the current page.
 *
 * Every element matching one of several brand-looking selectors contributes
 * its trimmed text, provided the text is non-empty and shorter than 50
 * characters. Duplicates are removed.
 *
 * @param page Page to read from.
 * @returns Unique text snippets in first-seen order.
 */
async function collectBrandNames(page: BrowserPage): Promise<string[]> {
  return page.evaluate(() => {
    const selectors = [
      '[data-testid*="brand"]',
      '[class*="Brand"]',
      '[class*="brand"]',
      'a[href*="/brand/"]',
      '.brand-card',
      '.brand-item'
    ];
    const found = new Set<string>();
    for (const selector of selectors) {
      for (const el of Array.from(document.querySelectorAll(selector))) {
        const text = el.textContent?.trim();
        if (text && text.length > 0 && text.length < 50) {
          found.add(text);
        }
      }
    }
    return Array.from(found);
  });
}

/**
 * Scrapes the brand list of the Curaleaf Phoenix Airport store page through a
 * randomly chosen proxy, handling the age gate if the site redirects to it.
 *
 * Results (or a debug dump of the page when nothing was found) are written to
 * the console. Any error is logged and recorded as a non-zero process exit
 * code rather than rethrown. The browser and the database pool are always
 * closed before the function returns.
 */
async function scrapeWithAgeGate(): Promise<void> {
  let browser: LaunchedBrowser | undefined;
  try {
    // Get random proxy
    const proxyResult = await pool.query(`
      SELECT host, port, protocol FROM proxies
      ORDER BY RANDOM() LIMIT 1
    `);
    const proxy = proxyResult.rows[0];
    if (!proxy) {
      // Without this guard an empty table fails later with an opaque
      // "cannot read properties of undefined" error.
      throw new Error('No proxies available in the proxies table');
    }
    const proxyUrl = `${proxy.protocol}://${proxy.host}:${proxy.port}`;
    console.log('🔌 Proxy:', `${proxy.host}:${proxy.port}`);

    // Launch browser with proxy
    browser = await puppeteer.launch({
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        `--proxy-server=${proxyUrl}`
      ]
    });
    const page = await browser.newPage();

    // Mobile Chrome UA
    const mobileUA = 'Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Mobile Safari/537.36';
    await page.setUserAgent(mobileUA);
    console.log('📱 User-Agent:', mobileUA);
    console.log('');

    const url = 'https://curaleaf.com/stores/curaleaf-dispensary-phoenix-airport/brands';
    console.log('🌐 Going to:', url);
    console.log('');
    await page.goto(url, { waitUntil: 'networkidle2', timeout: 60000 });

    // Check if we're on age gate
    const currentUrl = page.url();
    console.log('📍 Current URL:', currentUrl);
    if (currentUrl.includes('/age-gate')) {
      await passAgeGate(page);
    }

    // Now try to scrape brands
    console.log('');
    console.log('📦 Scraping brands...');
    await delay(3000);
    const brands = await collectBrandNames(page);
    console.log(`Found ${brands.length} brands`);

    if (brands.length > 0) {
      console.log('─'.repeat(60));
      brands.forEach((b, i) => console.log(` ${i + 1}. ${b}`));
      console.log('─'.repeat(60));
    } else {
      // Debug: show page content
      const pageData = await page.evaluate(() => ({
        title: document.title,
        url: window.location.href,
        bodyText: document.body.innerText.substring(0, 500)
      }));
      console.log('');
      console.log('📄 PAGE DEBUG:');
      console.log('Title:', pageData.title);
      console.log('URL:', pageData.url);
      console.log('Text preview:', pageData.bodyText);
    }
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    console.error('❌ Error:', message);
    // Let callers (shell scripts, schedulers) see that the run failed.
    process.exitCode = 1;
  } finally {
    // Nested so the pool is closed even if closing the browser throws.
    try {
      if (browser) await browser.close();
    } finally {
      await pool.end();
    }
  }
}
// Script entry point. The function handles its own scraping errors, but its
// cleanup step (closing the browser / database pool) can still reject; catch
// that here so it is reported instead of becoming an unhandled rejection.
scrapeWithAgeGate().catch((error: unknown) => {
  console.error('❌ Fatal:', error instanceof Error ? error.message : error);
  process.exitCode = 1;
});