350 lines
11 KiB
TypeScript
350 lines
11 KiB
TypeScript
import puppeteer from 'puppeteer-extra';
|
|
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
|
import { Pool } from 'pg';
|
|
|
|
// Register the stealth plugin on the shared puppeteer-extra instance before any browser is launched.
puppeteer.use(StealthPlugin());

// Postgres connection pool. DATABASE_URL, when set, overrides the local
// development default so credentials do not have to live in source control.
const pool = new Pool({
  connectionString:
    process.env.DATABASE_URL ?? 'postgresql://sail:password@localhost:5432/dutchie_menus'
});
|
|
|
|
/** A store link discovered on the Curaleaf Arizona dispensary listing page. */
interface CuraleafStore {
  /** Path segment that follows `/stores/` in the link URL; used as the de-duplication key. */
  slug: string;
  /** Absolute URL of the store link as found on the page. */
  url: string;
  /** Display name: link text (or nearby card text), falling back to the slug. */
  name: string;
  /** Text found for the link; may be empty. */
  text: string;
}

/**
 * Scrapes https://curaleaf.com/dispensary/arizona for links whose URL contains
 * `/stores/` and returns one record per unique store slug.
 *
 * The function launches its own headless browser, sets an `age_verified`
 * cookie, makes a best-effort attempt to get past an age gate, logs a debug
 * summary of the page and then collects the store links.
 *
 * @returns The unique stores found, or an empty array if anything fails
 *          (the error is logged, never thrown). The browser is always closed.
 */
async function scrapeCuraleafStores(): Promise<CuraleafStore[]> {
  let browser: Awaited<ReturnType<typeof puppeteer.launch>> | undefined;

  try {
    console.log('\n🔍 Scraping Curaleaf store locator...\n');

    browser = await puppeteer.launch({
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled'
      ]
    });

    const page = await browser.newPage();

    // Use Googlebot UA
    await page.setUserAgent('Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');

    // Anti-detection: make navigator.webdriver report false on every new document.
    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, 'webdriver', {
        get: () => false,
      });
    });

    // Set age verification cookie to bypass age gate
    await page.setCookie({
      name: 'age_verified',
      value: 'true',
      domain: '.curaleaf.com',
      path: '/'
    });

    console.log('Navigating to Curaleaf Arizona dispensaries page...');
    await page.goto('https://curaleaf.com/dispensary/arizona', {
      waitUntil: 'networkidle2',
      timeout: 60000
    });

    console.log('Page loaded, checking for age gate...');

    // Check if we hit an age gate and try to bypass it.
    // NOTE(review): this is a loose text heuristic. The substring 'age' also
    // matches words such as "page" or "image", so the bypass below may run on
    // pages that have no age gate — confirm against the live markup.
    const hasAgeGate = await page.evaluate(() => {
      const bodyText = document.body.textContent || '';
      return bodyText.includes('Welcome to Curaleaf') || bodyText.includes('age') || bodyText.includes('verify');
    });

    if (hasAgeGate) {
      console.log('Age gate detected, attempting to bypass...');

      // Try to find and click the state selector or confirm button
      try {
        const interacted = await page.evaluate(() => {
          // First choice: a <select> that offers an "Arizona" option.
          const stateSelect = Array.from(document.querySelectorAll('select')).find(select =>
            Array.from(select.querySelectorAll('option')).some(opt => opt.textContent?.includes('Arizona'))
          );

          if (stateSelect) {
            const azOption = Array.from(stateSelect.querySelectorAll('option'))
              .find(opt => opt.textContent?.includes('Arizona'));
            if (azOption) {
              azOption.selected = true;
              // Notify listeners on the page that the selection changed.
              stateSelect.dispatchEvent(new Event('change', { bubbles: true }));
              return true;
            }
          }

          // Fallback: the first button/link whose text mentions continue/confirm/enter.
          // NOTE(review): 'enter' also matches e.g. "Center"; verify this cannot
          // click an unrelated navigation link.
          const continueBtn = Array.from(document.querySelectorAll<HTMLElement>('button, a')).find(btn => {
            const text = btn.textContent?.toLowerCase() || '';
            return text.includes('continue') || text.includes('confirm') || text.includes('enter');
          });

          if (continueBtn) {
            continueBtn.click();
            return true;
          }

          return false;
        });

        if (interacted) {
          console.log('Age gate interaction attempted, waiting for navigation...');
          // Best effort: a timeout here is expected when the gate closes without navigating.
          await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 10000 }).catch(() => {});
        }
      } catch {
        console.log('Could not interact with age gate, proceeding anyway...');
      }
    }

    console.log('Extracting store data...\n');

    // Debug: Check what's actually on the page
    const pageDebug = await page.evaluate(() => {
      const anchors = Array.from(document.querySelectorAll('a'));
      const bodyText = document.body.textContent;
      return {
        title: document.title,
        bodyText: bodyText?.substring(0, 500),
        allLinks: anchors.length,
        storeLinks: anchors.filter(a => a.href.includes('/stores/')).length,
        hasArizona: (bodyText || '').includes('Arizona'),
        sampleLinks: anchors.slice(0, 10).map(a => ({
          href: a.href,
          text: a.textContent?.substring(0, 50)
        }))
      };
    });

    console.log('Page Debug Info:');
    console.log('Title:', pageDebug.title);
    console.log('Total Links:', pageDebug.allLinks);
    console.log('Store Links:', pageDebug.storeLinks);
    console.log('Has "Arizona":', pageDebug.hasArizona);
    console.log('\nFirst 10 links:');
    pageDebug.sampleLinks.forEach((link, i) => {
      console.log(` ${i + 1}. ${link.text} -> ${link.href}`);
    });
    console.log('\nFirst 500 chars of body:');
    console.log(pageDebug.bodyText);
    console.log('\n' + '─'.repeat(80) + '\n');

    // Extract all Arizona stores.
    // The callback runs inside the page, so it must be self-contained and
    // cannot call helpers defined in this module.
    const stores = await page.evaluate(() => {
      const storeList: CuraleafStore[] = [];
      const seenUrls = new Set<string>();

      // Since we're on the Arizona-specific page, ALL /stores/ links are treated as Arizona stores
      document.querySelectorAll('a').forEach(link => {
        const href = link.href;

        // Only capture unique /stores/ URLs
        if (!href || !href.includes('/stores/') || !href.includes('curaleaf') || seenUrls.has(href)) {
          return;
        }
        seenUrls.add(href);

        let locationName = link.textContent?.trim() || '';

        // If the link just says "Shop"/"Details", look for nearby text instead
        if (locationName === 'Shop' || locationName === 'Details') {
          const card = link.closest('[class*="location"], [class*="card"], [class*="store"]');
          if (card) {
            // Candidate texts from every descendant, restricted to a plausible name length
            const candidates = Array.from(card.querySelectorAll('*'))
              .map(el => el.textContent?.trim())
              .filter((text): text is string => !!text && text.length > 3 && text.length < 100);

            // Prefer one that includes "AZ" or a comma; otherwise take the first candidate
            const locationText =
              candidates.find(text => text.includes('AZ') || text.includes(',')) ?? candidates[0];

            if (locationText) {
              locationName = locationText;
            }
          }
        }

        // The slug is the path segment right after /stores/. Matching it
        // explicitly (instead of taking the last '/'-separated piece) keeps it
        // stable when the URL has a trailing slash, query string or fragment.
        const slug = /\/stores\/([^/?#]+)/.exec(href)?.[1] ?? '';

        storeList.push({
          url: href,
          name: locationName || slug,
          text: locationName,
          slug
        });
      });

      return storeList;
    });

    console.log('Raw stores found:', stores.length);
    console.log('─'.repeat(80));

    // Deduplicate by slug (first occurrence wins) and drop links without a slug
    const uniqueStores = new Map<string, CuraleafStore>();
    for (const store of stores) {
      if (store.slug && !uniqueStores.has(store.slug)) {
        uniqueStores.set(store.slug, {
          slug: store.slug,
          url: store.url,
          name: store.name,
          text: store.text
        });
      }
    }

    console.log('\nUnique Arizona stores found:');
    console.log('─'.repeat(80));

    const result = Array.from(uniqueStores.values());
    result.forEach((store, i) => {
      console.log(`${i + 1}. ${store.name}`);
      console.log(` Slug: ${store.slug}`);
      console.log(` URL: ${store.url}`);
      console.log(` Text: ${store.text}`);
      console.log('─'.repeat(80));
    });

    console.log(`\n✅ Total unique stores: ${uniqueStores.size}`);

    // Return the stores for database insertion
    return result;
  } catch (error: unknown) {
    // Anything thrown is logged and converted into an empty result.
    if (error instanceof Error) {
      console.error('❌ Error:', error.message);
      if (error.stack) {
        console.error(error.stack);
      }
    } else {
      console.error('❌ Error:', error);
    }
    return [];
  } finally {
    if (browser) {
      await browser.close();
    }
  }
}
|
|
|
|
/**
 * Loads a store details page in its own headless browser and looks for the
 * store's menu URL.
 *
 * Lookup order inside the page:
 *  1. the `src` of the first iframe whose `src` contains "dutchie";
 *  2. the `href` of the first link whose lower-cased text contains "shop",
 *     "menu" or "order" and whose `href` is longer than 10 characters.
 *
 * @param detailsUrl URL of the store details page to open.
 * @returns The URL found, or `null` when nothing matches or any step throws
 *          (the error is logged). The browser is always closed.
 */
async function getStoreMenuUrl(detailsUrl: string): Promise<string | null> {
  let browser: Awaited<ReturnType<typeof puppeteer.launch>> | undefined;

  try {
    browser = await puppeteer.launch({
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled'
      ]
    });

    const tab = await browser.newPage();
    await tab.setUserAgent('Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');

    // Pre-set the age verification cookie before navigating.
    await tab.setCookie({ name: 'age_verified', value: 'true', domain: '.curaleaf.com', path: '/' });

    await tab.goto(detailsUrl, { waitUntil: 'networkidle2', timeout: 30000 });

    // Runs inside the page: embedded dutchie iframe first, then shop/menu/order links.
    return await tab.evaluate(() => {
      const embedded = document.querySelector<HTMLIFrameElement>('iframe[src*="dutchie"]');
      if (embedded) {
        return embedded.src;
      }

      const keywords = ['shop', 'menu', 'order'];
      for (const anchor of Array.from(document.querySelectorAll('a'))) {
        // Skip anchors without a usable href.
        if (!anchor.href || anchor.href.length <= 10) {
          continue;
        }
        const label = anchor.textContent?.toLowerCase() || '';
        if (keywords.some(word => label.includes(word))) {
          return anchor.href;
        }
      }

      return null;
    });
  } catch (error) {
    console.error(` Error fetching menu URL: ${error}`);
    return null;
  } finally {
    await browser?.close();
  }
}
|
|
|
|
/**
 * Script driver: scrapes the store list, then visits each store's details
 * page (one at a time, with a one-second pause after each) to find its menu
 * URL, prints a summary, and finally closes the Postgres pool.
 */
async function main() {
  const divider = '─'.repeat(80);
  const pause = (ms: number) => new Promise(resolve => setTimeout(resolve, ms));

  const stores = await scrapeCuraleafStores();

  if (stores.length !== 0) {
    console.log('\n📝 Fetching actual menu URLs for each store...\n');
    console.log(divider);

    const storesWithMenus: Array<(typeof stores)[number] & { dutchie_url: string; details_url: string }> = [];

    for (const store of stores) {
      // The details page lives under /dispensary/arizona/ instead of /stores/.
      const detailsUrl = store.url.replace('/stores/', '/dispensary/arizona/');
      console.log(`\nChecking: ${store.slug}`);
      console.log(`Details URL: ${detailsUrl}`);

      const menuUrl = await getStoreMenuUrl(detailsUrl);

      if (!menuUrl) {
        console.log(`✗ No menu URL found`);
      } else {
        console.log(`✓ Menu URL: ${menuUrl}`);
        storesWithMenus.push({ ...store, dutchie_url: menuUrl, details_url: detailsUrl });
      }

      // Small delay between requests
      await pause(1000);
    }

    console.log('\n' + divider);
    console.log(`\n✅ Found menu URLs for ${storesWithMenus.length}/${stores.length} stores\n`);

    if (storesWithMenus.length !== 0) {
      console.log('Stores with menu URLs:');
      console.log(divider);
      for (const [index, entry] of storesWithMenus.entries()) {
        console.log(`${index + 1}. ${entry.slug}`);
        console.log(` Menu: ${entry.dutchie_url}`);
        console.log(divider);
      }
    }
  }

  await pool.end();
}
|
|
|
|
// Entry point. The promise is handled explicitly so a rejection (for example
// from pool.end()) is logged and reflected in the process exit code instead
// of being left as a floating promise.
main().catch((error: unknown) => {
  console.error('❌ Fatal error:', error);
  process.exitCode = 1;
});
|