/**
 * Scrapes the Curaleaf Arizona dispensary listing, then visits each store's
 * details page to look for an embedded Dutchie menu (or a shop/menu/order link).
 * Results are only logged to the console.
 *
 * NOTE(review): the emoji / box-drawing glyphs inside the log strings look
 * mis-encoded (mojibake). They are reproduced unchanged here so output stays
 * identical — confirm the file's intended encoding before "fixing" them.
 */
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import { Pool } from 'pg';

puppeteer.use(StealthPlugin());

// Derived from the already-imported puppeteer instance so no extra import is needed.
type Browser = Awaited<ReturnType<typeof puppeteer.launch>>;
type Page = Awaited<ReturnType<Browser['newPage']>>;

/** A store link as collected inside the browser context (before slug derivation). */
interface RawStoreLink {
  url: string;
  name: string;
  text: string;
}

/** A de-duplicated store, keyed by the last path segment of its URL. */
interface Store {
  slug: string;
  url: string;
  name: string;
  text: string;
}

/** A store for which a menu URL was found on its details page. */
interface StoreWithMenu extends Store {
  dutchie_url: string;
  details_url: string;
}

// Prefer DATABASE_URL so credentials need not live in source; the fallback is
// the original hard-coded local development connection string.
const DATABASE_URL =
  process.env.DATABASE_URL ?? 'postgresql://sail:password@localhost:5432/dutchie_menus';

// NOTE(review): the pool is only ever closed in this file, never queried —
// presumably reserved for a later "insert stores" step; confirm.
const pool = new Pool({ connectionString: DATABASE_URL });

const LAUNCH_ARGS = [
  '--no-sandbox',
  '--disable-setuid-sandbox',
  '--disable-dev-shm-usage',
  '--disable-blink-features=AutomationControlled',
];

const GOOGLEBOT_USER_AGENT =
  'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)';

// Cookie set before navigation in an attempt to skip the age gate.
const AGE_VERIFIED_COOKIE = {
  name: 'age_verified',
  value: 'true',
  domain: '.curaleaf.com',
  path: '/',
};

const SEPARATOR = '─'.repeat(80);

/** Launches a headless browser with the flags shared by every scrape in this file. */
function launchBrowser(): Promise<Browser> {
  return puppeteer.launch({ headless: true, args: LAUNCH_ARGS });
}

/**
 * Opens a new page with the Googlebot user agent and the age-verification cookie.
 *
 * @param browser - Browser to open the page in.
 * @param maskWebdriver - When true, also overrides `navigator.webdriver` to
 *   report `false` on every new document.
 */
async function openPreparedPage(browser: Browser, maskWebdriver: boolean): Promise<Page> {
  const page = await browser.newPage();
  await page.setUserAgent(GOOGLEBOT_USER_AGENT);

  if (maskWebdriver) {
    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, 'webdriver', {
        get: () => false,
      });
    });
  }

  await page.setCookie(AGE_VERIFIED_COOKIE);
  return page;
}

/**
 * Returns the last non-empty path segment of `url`, ignoring any trailing
 * slash, query string or hash. Returns `''` when there is no such segment.
 */
function slugFromUrl(url: string): string {
  let path: string;
  try {
    path = new URL(url).pathname;
  } catch {
    // Not an absolute URL: strip query/hash by hand and use what is left.
    path = url.split(/[?#]/)[0] ?? url;
  }
  const segments = path.split('/').filter(segment => segment.length > 0);
  return segments[segments.length - 1] ?? '';
}

/**
 * Best-effort attempt to get past an age gate: selects "Arizona" in a state
 * dropdown if one exists, otherwise clicks the first continue/confirm/enter
 * button or link. Failures are logged and ignored.
 */
async function bypassAgeGateIfPresent(page: Page): Promise<void> {
  // NOTE(review): 'age' also matches words such as "page", so this check is
  // likely true on most pages; likewise 'enter' below matches "center".
  // Behavior is kept as-is — tighten once the real gate markup is confirmed.
  const hasAgeGate = await page.evaluate(() => {
    const bodyText = document.body.textContent || '';
    return (
      bodyText.includes('Welcome to Curaleaf') ||
      bodyText.includes('age') ||
      bodyText.includes('verify')
    );
  });

  if (!hasAgeGate) {
    return;
  }

  console.log('Age gate detected, attempting to bypass...');

  try {
    const interacted = await page.evaluate(() => {
      // First choice: a <select> offering an "Arizona" option.
      const stateSelect = Array.from(document.querySelectorAll('select')).find(select =>
        Array.from(select.querySelectorAll('option')).some(opt =>
          opt.textContent?.includes('Arizona'),
        ),
      );

      if (stateSelect) {
        const azOption = Array.from(stateSelect.querySelectorAll('option')).find(opt =>
          opt.textContent?.includes('Arizona'),
        );
        if (azOption) {
          azOption.selected = true;
          stateSelect.dispatchEvent(new Event('change', { bubbles: true }));
          return true;
        }
      }

      // Fallback: a confirm/continue style button or link.
      const continueBtn = Array.from(
        document.querySelectorAll<HTMLElement>('button, a'),
      ).find(btn => {
        const text = btn.textContent?.toLowerCase() || '';
        return text.includes('continue') || text.includes('confirm') || text.includes('enter');
      });

      if (continueBtn) {
        continueBtn.click();
        return true;
      }

      return false;
    });

    if (interacted) {
      console.log('Age gate interaction attempted, waiting for navigation...');
      // Deliberately swallowed: the interaction may not trigger a navigation
      // at all, in which case this simply times out after 10s.
      await page
        .waitForNavigation({ waitUntil: 'networkidle2', timeout: 10000 })
        .catch(() => {});
    }
  } catch {
    console.log('Could not interact with age gate, proceeding anyway...');
  }
}

/** Logs a diagnostic snapshot of the current page (title, link counts, body excerpt). */
async function logPageDebugInfo(page: Page): Promise<void> {
  const pageDebug = await page.evaluate(() => {
    const anchors = Array.from(document.querySelectorAll('a'));
    return {
      title: document.title,
      bodyText: document.body.textContent?.substring(0, 500),
      allLinks: anchors.length,
      storeLinks: anchors.filter(a => a.href.includes('/stores/')).length,
      hasArizona: (document.body.textContent || '').includes('Arizona'),
      sampleLinks: anchors.slice(0, 10).map(a => ({
        href: a.href,
        text: a.textContent?.substring(0, 50),
      })),
    };
  });

  console.log('Page Debug Info:');
  console.log('Title:', pageDebug.title);
  console.log('Total Links:', pageDebug.allLinks);
  console.log('Store Links:', pageDebug.storeLinks);
  console.log('Has "Arizona":', pageDebug.hasArizona);
  console.log('\nFirst 10 links:');
  pageDebug.sampleLinks.forEach((link, i) => {
    console.log(` ${i + 1}. ${link.text} -> ${link.href}`);
  });
  console.log('\nFirst 500 chars of body:');
  console.log(pageDebug.bodyText);
  console.log('\n' + SEPARATOR + '\n');
}

/**
 * Collects every unique `/stores/` link on the current page together with a
 * display name. When the link text is just "Shop" or "Details", the name is
 * taken from the surrounding location/card/store element instead.
 */
async function extractStoreLinks(page: Page): Promise<RawStoreLink[]> {
  return page.evaluate(() => {
    const found: RawStoreLink[] = [];
    const seenHrefs = new Set<string>();

    // The page is Arizona-specific, so every /stores/ link is treated as an
    // Arizona store.
    document.querySelectorAll('a').forEach(link => {
      const href = link.href;
      const isStoreLink = Boolean(href) && href.includes('/stores/') && href.includes('curaleaf');
      if (!isStoreLink || seenHrefs.has(href)) {
        return;
      }
      seenHrefs.add(href);

      let locationName = link.textContent?.trim() ?? '';

      if (locationName === 'Shop' || locationName === 'Details') {
        const card = link.closest('[class*="location"], [class*="card"], [class*="store"]');
        if (card) {
          const candidates = Array.from(card.querySelectorAll('*'))
            .map(el => el.textContent?.trim() ?? '')
            .filter(text => text.length > 3 && text.length < 100);

          // Prefer text that mentions "AZ" or contains a comma (city, state);
          // otherwise settle for the first plausible candidate.
          const best =
            candidates.find(text => text.includes('AZ') || text.includes(',')) ?? candidates[0];
          if (best) {
            locationName = best;
          }
        }
      }

      found.push({ url: href, name: locationName, text: locationName });
    });

    return found;
  });
}

/**
 * Collapses raw links to one store per slug (first occurrence wins). Links
 * whose URL yields no slug are skipped; an empty name falls back to the slug.
 */
function dedupeStores(rawLinks: readonly RawStoreLink[]): Store[] {
  const bySlug = new Map<string, Store>();

  for (const raw of rawLinks) {
    const slug = slugFromUrl(raw.url);
    if (slug && !bySlug.has(slug)) {
      bySlug.set(slug, {
        slug,
        url: raw.url,
        name: raw.name || slug,
        text: raw.text,
      });
    }
  }

  return Array.from(bySlug.values());
}

/**
 * Loads the Curaleaf Arizona dispensary page and returns the unique stores
 * linked from it.
 *
 * @returns The de-duplicated stores, or an empty array if anything fails
 *   (the error is logged, not rethrown).
 */
async function scrapeCuraleafStores(): Promise<Store[]> {
  let browser: Browser | undefined;

  try {
    console.log('\nšŸ” Scraping Curaleaf store locator...\n');

    browser = await launchBrowser();
    const page = await openPreparedPage(browser, true);

    console.log('Navigating to Curaleaf Arizona dispensaries page...');
    await page.goto('https://curaleaf.com/dispensary/arizona', {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });

    console.log('Page loaded, checking for age gate...');
    await bypassAgeGateIfPresent(page);

    console.log('Extracting store data...\n');
    await logPageDebugInfo(page);

    const rawLinks = await extractStoreLinks(page);
    console.log('Raw stores found:', rawLinks.length);
    console.log(SEPARATOR);

    const uniqueStores = dedupeStores(rawLinks);

    console.log('\nUnique Arizona stores found:');
    console.log(SEPARATOR);
    uniqueStores.forEach((store, i) => {
      console.log(`${i + 1}. ${store.name}`);
      console.log(` Slug: ${store.slug}`);
      console.log(` URL: ${store.url}`);
      console.log(` Text: ${store.text}`);
      console.log(SEPARATOR);
    });

    console.log(`\nāœ… Total unique stores: ${uniqueStores.length}`);

    return uniqueStores;
  } catch (error: unknown) {
    if (error instanceof Error) {
      console.error('āŒ Error:', error.message);
      if (error.stack) {
        console.error(error.stack);
      }
    } else {
      console.error('āŒ Error:', error);
    }
    return [];
  } finally {
    if (browser) {
      await browser.close();
    }
  }
}

/**
 * Opens a store details page and looks for its menu URL.
 *
 * @param detailsUrl - Absolute URL of the store's details page.
 * @returns The `src` of the first iframe whose source contains "dutchie";
 *   otherwise the `href` of the first link whose text contains "shop", "menu"
 *   or "order"; otherwise `null`. Also `null` if the page fails to load
 *   (the error is logged, not rethrown).
 */
async function getStoreMenuUrl(detailsUrl: string): Promise<string | null> {
  let browser: Browser | undefined;

  try {
    browser = await launchBrowser();
    const page = await openPreparedPage(browser, false);

    await page.goto(detailsUrl, {
      waitUntil: 'networkidle2',
      timeout: 30000,
    });

    return await page.evaluate(() => {
      const iframe = document.querySelector<HTMLIFrameElement>('iframe[src*="dutchie"]');
      if (iframe) {
        return iframe.src;
      }

      const shopLink = Array.from(document.querySelectorAll('a')).find(a => {
        const text = a.textContent?.toLowerCase() ?? '';
        const looksLikeMenu =
          text.includes('shop') || text.includes('menu') || text.includes('order');
        // Length check filters out empty or placeholder hrefs.
        return looksLikeMenu && a.href.length > 10;
      });

      return shopLink ? shopLink.href : null;
    });
  } catch (error) {
    console.error(` Error fetching menu URL: ${error}`);
    return null;
  } finally {
    if (browser) {
      await browser.close();
    }
  }
}

/**
 * Script entry point: scrapes the store list, resolves a menu URL for each
 * store (one request per second), logs a summary, and always closes the
 * database pool.
 */
async function main(): Promise<void> {
  try {
    const stores = await scrapeCuraleafStores();

    if (stores.length > 0) {
      console.log('\nšŸ“ Fetching actual menu URLs for each store...\n');
      console.log(SEPARATOR);

      const storesWithMenus: StoreWithMenu[] = [];

      // Sequential on purpose: each lookup launches its own browser and the
      // delay below throttles requests to the site.
      for (const store of stores) {
        const detailsUrl = store.url.replace('/stores/', '/dispensary/arizona/');
        console.log(`\nChecking: ${store.slug}`);
        console.log(`Details URL: ${detailsUrl}`);

        const menuUrl = await getStoreMenuUrl(detailsUrl);

        if (menuUrl) {
          console.log(`āœ“ Menu URL: ${menuUrl}`);
          storesWithMenus.push({
            ...store,
            dutchie_url: menuUrl,
            details_url: detailsUrl,
          });
        } else {
          console.log(`āœ— No menu URL found`);
        }

        // Small delay between requests
        await new Promise<void>(resolve => setTimeout(resolve, 1000));
      }

      console.log('\n' + SEPARATOR);
      console.log(`\nāœ… Found menu URLs for ${storesWithMenus.length}/${stores.length} stores\n`);

      if (storesWithMenus.length > 0) {
        console.log('Stores with menu URLs:');
        console.log(SEPARATOR);
        storesWithMenus.forEach((store, i) => {
          console.log(`${i + 1}. ${store.slug}`);
          console.log(` Menu: ${store.dutchie_url}`);
          console.log(SEPARATOR);
        });
      }
    }
  } finally {
    // Always release the pool so the process can exit, even on failure.
    await pool.end();
  }
}

main().catch((error: unknown) => {
  console.error('Fatal error:', error instanceof Error ? error.message : error);
  process.exitCode = 1;
});