180 lines
5.1 KiB
TypeScript
180 lines
5.1 KiB
TypeScript
import puppeteer from 'puppeteer-extra';
|
|
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
|
import { Pool } from 'pg';
|
|
|
|
// Register the stealth plugin with puppeteer-extra before any browser is launched.
puppeteer.use(StealthPlugin());

// Shared Postgres connection pool used to look up proxies.
// The connection string can be overridden through the DATABASE_URL environment
// variable so credentials do not have to live in source; when it is unset (or
// empty, hence `||` rather than `??`) the original local-development database
// is used.
const pool = new Pool({
  connectionString:
    process.env.DATABASE_URL ||
    'postgresql://sail:password@localhost:5432/dutchie_menus',
});
|
|
|
|
/**
 * Tries to open a Curaleaf store "brands" page with the age gate pre-satisfied
 * and prints the brand names it can find.
 *
 * Steps:
 *  1. Pick one random row from the `proxies` table and launch a headless
 *     browser that routes through it.
 *  2. Pretend to be mobile Chrome, then seed candidate age-verification
 *     localStorage keys and cookies for `.curaleaf.com`.
 *  3. Navigate to the brands page, report whether the URL still contains
 *     `/age-gate`, and collect short text snippets from brand-like elements.
 *  4. Print the brands, or a debug summary of the page when none were found.
 *
 * Errors raised inside the main body are logged and swallowed. The browser and
 * the shared pool are always closed afterwards; an error raised while closing
 * them is not caught here and rejects the returned promise.
 *
 * @returns A promise that resolves once scraping and cleanup have finished.
 */
async function scrapeBypassAgeGate(): Promise<void> {
  let browser: Awaited<ReturnType<typeof puppeteer.launch>> | undefined;

  try {
    // Get random proxy
    const proxyResult = await pool.query(`
      SELECT host, port, protocol FROM proxies
      ORDER BY RANDOM() LIMIT 1
    `);

    const proxy = proxyResult.rows[0];
    if (!proxy) {
      // An empty table would otherwise surface as an opaque
      // "Cannot read properties of undefined" error below.
      throw new Error('No proxies available in the proxies table');
    }
    const proxyUrl = `${proxy.protocol}://${proxy.host}:${proxy.port}`;

    console.log('🔌 Proxy:', `${proxy.host}:${proxy.port}`);

    // Launch browser with proxy
    browser = await puppeteer.launch({
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        `--proxy-server=${proxyUrl}`,
      ],
    });

    const page = await browser.newPage();

    // Mobile Chrome UA
    const mobileUA =
      'Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Mobile Safari/537.36';
    await page.setUserAgent(mobileUA);

    console.log('📱 User-Agent:', mobileUA);
    console.log('');

    // Visit the main domain first, before seeding cookies/localStorage for it.
    console.log('🍪 Setting age verification cookies...');
    await page.goto('https://curaleaf.com', { waitUntil: 'domcontentloaded' });

    // Registered scripts run in documents created *after* this call, so these
    // keys are written on the next navigation, not on the page loaded above.
    await page.evaluateOnNewDocument(() => {
      // Candidate keys only; which of them (if any) the site actually reads is
      // not established by this script.
      try {
        localStorage.setItem('age-verified', 'true');
        localStorage.setItem('curaleaf-age-verified', 'true');
        localStorage.setItem('state', 'arizona');
        localStorage.setItem('selectedState', 'arizona');
      } catch {
        // Best effort: storage may be unavailable in some documents.
      }
    });

    // Set cookies manually (same candidate-key caveat as above).
    const cookieScope = { domain: '.curaleaf.com', path: '/' };
    await page.setCookie(
      { name: 'age-verified', value: 'true', ...cookieScope },
      { name: 'curaleaf-age-gate', value: 'passed', ...cookieScope },
      { name: 'state', value: 'arizona', ...cookieScope },
    );

    console.log(' ✅ Cookies set');
    console.log('');

    const url = 'https://curaleaf.com/stores/curaleaf-dispensary-phoenix-airport/brands';
    console.log('🌐 Going to:', url);
    console.log('');

    await page.goto(url, { waitUntil: 'networkidle2', timeout: 60000 });
    // Fixed 5 s settle time. `page.waitForTimeout` was removed from newer
    // Puppeteer releases, so a plain timer is used instead.
    await new Promise<void>((resolve) => setTimeout(resolve, 5000));

    // Check current URL
    const currentUrl = page.url();
    console.log('📍 Current URL:', currentUrl);

    if (currentUrl.includes('/age-gate')) {
      console.log('⚠️ Still on age gate - cookies didn\'t work');
      console.log('');

      // Debug: dump the cookies/localStorage the page can actually see.
      const storageData = await page.evaluate(() => {
        const cookies = document.cookie;
        const localStorageItems: Record<string, string | null> = {};
        for (let i = 0; i < localStorage.length; i++) {
          const key = localStorage.key(i);
          if (key) {
            localStorageItems[key] = localStorage.getItem(key);
          }
        }
        return { cookies, localStorage: localStorageItems };
      });

      console.log('Current cookies:', storageData.cookies);
      console.log('Current localStorage:', storageData.localStorage);
    } else {
      console.log('✅ Age gate bypassed!');
    }

    // Try to scrape brands (attempted even if the age gate is still showing).
    console.log('');
    console.log('📦 Scraping brands...');

    const brands = await page.evaluate(() => {
      const selectors = [
        '[data-testid*="brand"]',
        '[class*="Brand"]',
        '[class*="brand"]',
        'a[href*="/brand/"]',
        '.brand-card',
        '.brand-item',
      ];

      // A Set de-duplicates text matched by more than one selector.
      const found = new Set<string>();

      selectors.forEach((selector) => {
        document.querySelectorAll(selector).forEach((el) => {
          const text = el.textContent?.trim();
          // Keep only non-empty text shorter than 50 characters.
          if (text && text.length > 0 && text.length < 50) {
            found.add(text);
          }
        });
      });

      return Array.from(found);
    });

    console.log(`Found ${brands.length} brands`);
    if (brands.length > 0) {
      console.log('─'.repeat(60));
      brands.forEach((b, i) => console.log(` ${i + 1}. ${b}`));
      console.log('─'.repeat(60));
    } else {
      // Debug: nothing matched, so show what the page actually contains.
      const pageData = await page.evaluate(() => ({
        title: document.title,
        url: window.location.href,
        bodyText: document.body.innerText.substring(0, 500),
        hasNextRoot: document.getElementById('__next') !== null,
      }));

      console.log('');
      console.log('📄 PAGE DEBUG:');
      console.log('Title:', pageData.title);
      console.log('URL:', pageData.url);
      console.log('Has __next:', pageData.hasNextRoot);
      console.log('Text preview:', pageData.bodyText);
    }
  } catch (error: unknown) {
    // Non-Error throwables are stringified instead of printing `undefined`.
    const message = error instanceof Error ? error.message : String(error);
    console.error('❌ Error:', message);
  } finally {
    // Nested try/finally: the pool is released even if closing the browser throws.
    try {
      if (browser) await browser.close();
    } finally {
      await pool.end();
    }
  }
}
|
|
|
|
// Run the scraper. Errors in its main body are logged by the function itself;
// this handler covers rejections from its cleanup (closing the browser or the
// pool) so they are reported instead of becoming an unhandled rejection.
scrapeBypassAgeGate().catch((error: unknown) => {
  console.error('❌ Fatal:', error instanceof Error ? error.message : error);
  process.exitCode = 1;
});
|