Files
cannaiq/backend/archive/scrape-curaleaf-stores.ts
Kelly d91c55a344 feat: Add stale process monitor, users route, landing page, archive old scripts
- Add backend stale process monitoring API (/api/stale-processes)
- Add users management route
- Add frontend landing page and stale process monitor UI on /scraper-tools
- Move old development scripts to backend/archive/
- Update frontend build with new features

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-05 04:07:31 -07:00

350 lines
11 KiB
TypeScript

import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import { Pool } from 'pg';
// Register the stealth plugin with puppeteer-extra before any browser is launched.
puppeteer.use(StealthPlugin());

// Connection pool for the menus database. When DATABASE_URL is set it takes
// precedence, so credentials do not have to live in source; the literal below
// is the local development default and is used when the variable is absent.
const pool = new Pool({
  connectionString:
    process.env.DATABASE_URL ?? 'postgresql://sail:password@localhost:5432/dutchie_menus'
});
/** One store link discovered on the Curaleaf Arizona dispensaries page. */
interface CuraleafStoreLink {
  /** Last non-empty path segment after `/stores/`; used as the de-duplication key. */
  slug: string;
  /** Absolute href of the `/stores/` link exactly as found on the page. */
  url: string;
  /** Link text (or nearby location text), falling back to the slug when empty. */
  name: string;
  /** Link text (or nearby location text) without the slug fallback; may be empty. */
  text: string;
}

/**
 * Scrapes https://curaleaf.com/dispensary/arizona for links to individual store pages.
 *
 * Steps: launch a headless browser, pre-set an `age_verified` cookie, load the
 * page, make a best-effort attempt to dismiss an age gate, log debug
 * information about what was loaded, then collect every anchor whose href
 * contains both `/stores/` and `curaleaf`, de-duplicated by slug.
 *
 * Failures inside the scrape are logged and produce an empty array rather than
 * a rejection. The browser is closed in a `finally` block.
 *
 * @returns The unique stores found, in the order they appear on the page.
 */
async function scrapeCuraleafStores(): Promise<CuraleafStoreLink[]> {
  let browser;
  try {
    console.log('\n🔍 Scraping Curaleaf store locator...\n');
    browser = await puppeteer.launch({
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled'
      ]
    });
    const page = await browser.newPage();

    // Use Googlebot UA
    await page.setUserAgent('Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');

    // Anti-detection: report navigator.webdriver as false on every new document.
    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, 'webdriver', {
        get: () => false,
      });
    });

    // Set age verification cookie to bypass age gate
    await page.setCookie({
      name: 'age_verified',
      value: 'true',
      domain: '.curaleaf.com',
      path: '/'
    });

    console.log('Navigating to Curaleaf Arizona dispensaries page...');
    await page.goto('https://curaleaf.com/dispensary/arizona', {
      waitUntil: 'networkidle2',
      timeout: 60000
    });
    console.log('Page loaded, checking for age gate...');

    // NOTE: callbacks handed to page.evaluate() execute inside the page, so
    // they must be self-contained and cannot call helpers defined in this file.

    // Detect an age gate from the rendered text. Whole-word, case-insensitive
    // matching is used because a plain substring test for "age" also matches
    // "page", "image", etc. and would fire on practically every document.
    // innerText is preferred so inline <script> contents are not scanned.
    const hasAgeGate = await page.evaluate(() => {
      const visibleText = document.body.innerText || document.body.textContent || '';
      return visibleText.includes('Welcome to Curaleaf') || /\b(?:age|verify)\b/i.test(visibleText);
    });

    if (hasAgeGate) {
      console.log('Age gate detected, attempting to bypass...');
      // Try to find and click the state selector or confirm button
      try {
        const stateSelected = await page.evaluate(() => {
          // First choice: a <select> that offers Arizona.
          for (const select of Array.from(document.querySelectorAll('select'))) {
            const azOption = Array.from(select.querySelectorAll('option'))
              .find(opt => opt.textContent?.includes('Arizona'));
            if (azOption) {
              azOption.selected = true;
              select.dispatchEvent(new Event('change', { bubbles: true }));
              return true;
            }
          }

          // Second choice: a confirm/continue/enter control. Whole words only,
          // so that e.g. "Help Center" is not mistaken for an "enter" button.
          const continueBtn = Array.from(document.querySelectorAll<HTMLElement>('button, a'))
            .find(btn => /\b(?:continue|confirm|enter)\b/i.test(btn.textContent ?? ''));
          if (continueBtn) {
            continueBtn.click();
            return true;
          }
          return false;
        });

        if (stateSelected) {
          console.log('Age gate interaction attempted, waiting for navigation...');
          // Best effort: a timeout here just means no navigation followed the interaction.
          await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 10000 }).catch(() => {});
        }
      } catch {
        console.log('Could not interact with age gate, proceeding anyway...');
      }
    }

    console.log('Extracting store data...\n');

    // Debug: Check what's actually on the page
    const pageDebug = await page.evaluate(() => {
      const anchors = Array.from(document.querySelectorAll('a'));
      return {
        title: document.title,
        bodyText: document.body.textContent?.substring(0, 500),
        allLinks: anchors.length,
        storeLinks: anchors.filter(a => a.href.includes('/stores/')).length,
        hasArizona: (document.body.textContent || '').includes('Arizona'),
        sampleLinks: anchors.slice(0, 10).map(a => ({
          href: a.href,
          text: a.textContent?.substring(0, 50)
        }))
      };
    });
    console.log('Page Debug Info:');
    console.log('Title:', pageDebug.title);
    console.log('Total Links:', pageDebug.allLinks);
    console.log('Store Links:', pageDebug.storeLinks);
    console.log('Has "Arizona":', pageDebug.hasArizona);
    console.log('\nFirst 10 links:');
    pageDebug.sampleLinks.forEach((link, i) => {
      console.log(` ${i + 1}. ${link.text} -> ${link.href}`);
    });
    console.log('\nFirst 500 chars of body:');
    console.log(pageDebug.bodyText);
    console.log('\n' + '─'.repeat(80) + '\n');

    // Extract all Arizona stores
    const stores = await page.evaluate(() => {
      const storeList: CuraleafStoreLink[] = [];
      const storeUrls = new Set<string>();

      // Since we're on the Arizona-specific page, ALL /stores/ links are Arizona stores
      document.querySelectorAll('a').forEach(link => {
        const href = link.href;
        // Only capture unique /stores/ URLs
        if (!href || !href.includes('/stores/') || !href.includes('curaleaf')) {
          return;
        }
        if (storeUrls.has(href)) {
          return;
        }
        storeUrls.add(href);

        // Try to find a nearby location name
        let locationName = link.textContent?.trim() || '';

        // If the link just says "Shop"/"Details", look for nearby text
        if (locationName === 'Shop' || locationName === 'Details') {
          const parent = link.closest('[class*="location"], [class*="card"], [class*="store"]');
          if (parent) {
            // Candidate strings: descendant text of a plausible length.
            const candidates = Array.from(parent.querySelectorAll('*'))
              .map(el => el.textContent?.trim())
              .filter((text): text is string =>
                text !== undefined && text.length > 3 && text.length < 100);
            // Prefer one that includes "AZ" or looks like "City, ST".
            const locationText =
              candidates.find(text => text.includes('AZ') || text.includes(',')) ?? candidates[0];
            if (locationText) {
              locationName = locationText;
            }
          }
        }

        // Slug = last non-empty path segment after "/stores/", ignoring any
        // query string or fragment, so ".../stores/foo/" and ".../stores/foo?x=1"
        // both yield "foo" instead of "" or "foo?x=1".
        const withoutQuery = href.split(/[?#]/)[0] ?? '';
        const afterStores = withoutQuery.split('/stores/').pop() ?? '';
        const slug = afterStores.split('/').filter(part => part.length > 0).pop() ?? '';

        storeList.push({
          url: href,
          name: locationName || slug,
          text: locationName,
          slug: slug
        });
      });
      return storeList;
    });

    console.log('Raw stores found:', stores.length);
    console.log('─'.repeat(80));

    // Deduplicate by slug (first occurrence wins); entries without a slug are dropped.
    const uniqueStores = new Map<string, CuraleafStoreLink>();
    for (const store of stores) {
      if (store.slug && !uniqueStores.has(store.slug)) {
        uniqueStores.set(store.slug, {
          slug: store.slug,
          url: store.url,
          name: store.name,
          text: store.text
        });
      }
    }

    console.log('\nUnique Arizona stores found:');
    console.log('─'.repeat(80));
    Array.from(uniqueStores.values()).forEach((store, i) => {
      console.log(`${i + 1}. ${store.name}`);
      console.log(` Slug: ${store.slug}`);
      console.log(` URL: ${store.url}`);
      console.log(` Text: ${store.text}`);
      console.log('─'.repeat(80));
    });
    console.log(`\n✅ Total unique stores: ${uniqueStores.size}`);

    // Return the stores for database insertion
    return Array.from(uniqueStores.values());
  } catch (error: unknown) {
    if (error instanceof Error) {
      console.error('❌ Error:', error.message);
      if (error.stack) {
        console.error(error.stack);
      }
    } else {
      // Non-Error throwables have no message/stack; log the value itself.
      console.error('❌ Error:', error);
    }
    return [];
  } finally {
    if (browser) {
      await browser.close();
    }
  }
}
/**
 * Loads a store details page in its own headless browser and tries to find
 * the URL of that store's menu.
 *
 * A dutchie iframe wins if one is present; otherwise the first anchor whose
 * text mentions "shop", "menu" or "order" and whose href is longer than ten
 * characters is used.
 *
 * @param detailsUrl Absolute URL of the store details page to open.
 * @returns The menu URL, or `null` when nothing matched or an error occurred
 *          (errors are logged, not rethrown).
 */
async function getStoreMenuUrl(detailsUrl: string): Promise<string | null> {
  let browser;
  try {
    browser = await puppeteer.launch({
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled'
      ]
    });
    const tab = await browser.newPage();
    await tab.setUserAgent('Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');

    // Pre-set the age verification cookie before navigating.
    await tab.setCookie({
      name: 'age_verified',
      value: 'true',
      domain: '.curaleaf.com',
      path: '/'
    });

    await tab.goto(detailsUrl, { waitUntil: 'networkidle2', timeout: 30000 });

    // The callback runs inside the page; `return await` keeps a rejection
    // inside this try block so it is logged below.
    return await tab.evaluate(() => {
      // Preferred source: an embedded dutchie iframe.
      const embedded = document.querySelector<HTMLIFrameElement>('iframe[src*="dutchie"]');
      if (embedded !== null) {
        return embedded.src;
      }

      // Fallback: the first "shop" / "menu" / "order" link with a non-trivial href.
      const keywords = ['shop', 'menu', 'order'];
      for (const anchor of Array.from(document.querySelectorAll('a'))) {
        const label = (anchor.textContent ?? '').toLowerCase();
        const mentionsMenu = keywords.some(word => label.includes(word));
        if (mentionsMenu && anchor.href.length > 10) {
          return anchor.href;
        }
      }
      return null;
    });
  } catch (err) {
    console.error(` Error fetching menu URL: ${err}`);
    return null;
  } finally {
    await browser?.close();
  }
}
/**
 * Script entry point.
 *
 * Scrapes the store list, then visits each store's details page (one at a
 * time, with a one-second pause between requests) to look up its menu URL,
 * and prints a summary of the stores for which a menu URL was found.
 *
 * The database pool is always closed, even if a step throws.
 */
async function main(): Promise<void> {
  try {
    const stores = await scrapeCuraleafStores();
    if (stores.length > 0) {
      console.log('\n📝 Fetching actual menu URLs for each store...\n');
      console.log('─'.repeat(80));

      const storesWithMenus: Array<{ slug: string; dutchie_url: string; details_url: string }> = [];

      // Sequential on purpose: each lookup launches its own browser and the
      // delay below throttles requests to the site.
      for (const store of stores) {
        const detailsUrl = store.url.replace('/stores/', '/dispensary/arizona/');
        console.log(`\nChecking: ${store.slug}`);
        console.log(`Details URL: ${detailsUrl}`);

        const menuUrl = await getStoreMenuUrl(detailsUrl);
        if (menuUrl) {
          console.log(`✓ Menu URL: ${menuUrl}`);
          storesWithMenus.push({
            ...store,
            dutchie_url: menuUrl,
            details_url: detailsUrl
          });
        } else {
          console.log(`✗ No menu URL found`);
        }

        // Small delay between requests
        await new Promise<void>(resolve => setTimeout(resolve, 1000));
      }

      console.log('\n' + '─'.repeat(80));
      console.log(`\n✅ Found menu URLs for ${storesWithMenus.length}/${stores.length} stores\n`);

      if (storesWithMenus.length > 0) {
        console.log('Stores with menu URLs:');
        console.log('─'.repeat(80));
        storesWithMenus.forEach((store, i) => {
          console.log(`${i + 1}. ${store.slug}`);
          console.log(` Menu: ${store.dutchie_url}`);
          console.log('─'.repeat(80));
        });
      }
    }
  } finally {
    // Release the pool on every path so a thrown error cannot leave it open.
    await pool.end();
  }
}

// Attach a rejection handler so a failure is reported and reflected in the
// exit code instead of surfacing as an unhandled promise rejection.
main().catch((error: unknown) => {
  console.error('❌ Fatal error:', error);
  process.exitCode = 1;
});