import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import { Pool } from 'pg';

puppeteer.use(StealthPlugin());

// NOTE(review): the fallback connection string embeds local-development
// credentials. Set DATABASE_URL in any shared or production environment.
const pool = new Pool({
  connectionString:
    process.env.DATABASE_URL ||
    'postgresql://dutchie:dutchie_local_pass@postgres:5432/dutchie_menus',
});

const USER_AGENT =
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

/** Extra time given to the page after `networkidle2` before links are read. */
const SETTLE_DELAY_MS = 3000;

/** A store link scraped from the Curaleaf store locator page. */
interface ScrapedStore {
  name: string;
  slug: string;
  url: string;
}

/**
 * Columns selected from the `stores` table.
 * NOTE(review): the table schema is not visible in this file; `id` type and
 * the nullability of `slug` / `dutchie_url` are assumptions handled defensively.
 */
interface DbStoreRow {
  id: number | string;
  name: string;
  slug: string | null;
  dutchie_url: string | null;
}

/** A pending change to one `stores` row; `slug` is set only for fuzzy matches. */
interface StoreUpdate {
  id: DbStoreRow['id'];
  url: string;
  slug?: string;
}

/** Resolves after `ms` milliseconds (replacement for the removed `page.waitForTimeout`). */
function delay(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Collapses scraped links to one entry per slug, in first-seen order.
 *
 * A later link for the same slug replaces an earlier one unless the later
 * link has empty text, so a text-less duplicate (e.g. an image link) cannot
 * wipe out a store name that was already captured.
 */
function dedupeStoresBySlug(stores: readonly ScrapedStore[]): ScrapedStore[] {
  const bySlug = new Map<string, ScrapedStore>();
  for (const store of stores) {
    if (!bySlug.has(store.slug) || store.name !== '') {
      bySlug.set(store.slug, store);
    }
  }
  return Array.from(bySlug.values());
}

/**
 * Opens the Curaleaf store locator in a headless browser and returns the
 * store links that look like Arizona locations, de-duplicated by slug.
 *
 * @returns One entry per distinct store slug found on the page.
 * @throws Propagates any Puppeteer launch, navigation or evaluation error;
 *         the browser is closed in every case.
 */
async function scrapeArizonaStores(): Promise<ScrapedStore[]> {
  const browser = await puppeteer.launch({
    headless: 'new',
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-blink-features=AutomationControlled',
    ],
  });

  try {
    const page = await browser.newPage();
    await page.setUserAgent(USER_AGENT);

    console.log('Navigating to Curaleaf stores page...');
    await page.goto('https://curaleaf.com/stores/', {
      waitUntil: 'networkidle2',
      timeout: 30000,
    });
    await delay(SETTLE_DELAY_MS);

    // This callback is serialized and executed inside the page, so it must
    // not reference anything from the surrounding Node scope.
    const stores: ScrapedStore[] = await page.evaluate(() => {
      const results: Array<{ name: string; slug: string; url: string }> = [];
      const links = Array.from(document.querySelectorAll('a[href*="/stores/"]'));

      for (const link of links) {
        // Non-HTML anchors (e.g. SVG <a>) do not expose `href` as a string.
        if (!(link instanceof HTMLAnchorElement)) continue;

        const href = link.href;
        if (!href.includes('/stores/curaleaf')) continue;

        const text = (link.textContent ?? '').trim();
        const hrefLower = href.toLowerCase();
        const textLower = text.toLowerCase();
        const looksLikeArizona =
          hrefLower.includes('-az-') ||
          hrefLower.includes('arizona') ||
          hrefLower.includes('dispensary-peoria') ||
          textLower.includes('arizona') ||
          textLower.includes(', az');
        if (!looksLikeArizona) continue;

        const slug = href.match(/\/stores\/([^\/\?#]+)/)?.[1];
        if (!slug) continue;

        // Drop the query string and fragment so URLs compare cleanly.
        results.push({ name: text, slug, url: href.replace(/[?#][\s\S]*$/, '') });
      }

      return results;
    });

    const uniqueStores = dedupeStoresBySlug(stores);
    console.log(`\nFound ${uniqueStores.length} Arizona stores\n`);
    return uniqueStores;
  } finally {
    await browser.close();
  }
}

/**
 * Looks for a scraped store whose name or slug contains the DB store's name
 * with the "Curaleaf - " / "Curaleaf-" prefix stripped.
 *
 * Returns `undefined` when the stripped name is empty or is just "curaleaf":
 * such a needle is contained in every candidate and would "match" an
 * arbitrary store.
 */
function findPossibleMatch(
  dbStore: DbStoreRow,
  candidates: readonly ScrapedStore[],
): ScrapedStore | undefined {
  const storeName = dbStore.name
    .toLowerCase()
    .replace('curaleaf - ', '')
    .replace('curaleaf-', '')
    .trim();
  if (storeName === '' || storeName === 'curaleaf') return undefined;

  return candidates.find(
    (s) => s.name.toLowerCase().includes(storeName) || s.slug.toLowerCase().includes(storeName),
  );
}

/**
 * Compares DB rows against scraped stores, logs the comparison, and returns
 * the row updates that should be applied.
 *
 * Rows whose slug is found on the website get a URL-only update when the URL
 * differs. Rows whose slug is missing fall back to a name-based match, which
 * only considers scraped stores whose slug is not already used by a DB row
 * or claimed by an earlier fallback match, so no slug is assigned twice.
 */
function planUpdates(
  dbStores: readonly DbStoreRow[],
  scrapedStores: readonly ScrapedStore[],
): StoreUpdate[] {
  const scrapedBySlug = new Map<string, ScrapedStore>(
    scrapedStores.map((s) => [s.slug, s]),
  );
  const dbSlugs = new Set<string>();
  for (const row of dbStores) {
    if (row.slug) dbSlugs.add(row.slug);
  }
  const claimedSlugs = new Set<string>();
  const updates: StoreUpdate[] = [];

  for (const dbStore of dbStores) {
    const scraped = dbStore.slug ? scrapedBySlug.get(dbStore.slug) : undefined;

    if (scraped) {
      if (dbStore.dutchie_url !== scraped.url) {
        console.log(`⚠️ URL mismatch for "${dbStore.name}"`);
        console.log(` DB: ${dbStore.dutchie_url}`);
        console.log(` Web: ${scraped.url}`);
        updates.push({ id: dbStore.id, url: scraped.url });
      } else {
        console.log(`✅ "${dbStore.name}" - correct`);
      }
      continue;
    }

    console.log(`⚠️ "${dbStore.name}" (${dbStore.slug}) - NOT FOUND on website`);

    // Try to find by name matching, restricted to slugs nobody owns yet.
    const candidates = scrapedStores.filter(
      (s) => !dbSlugs.has(s.slug) && !claimedSlugs.has(s.slug),
    );
    const possibleMatch = findPossibleMatch(dbStore, candidates);
    if (possibleMatch) {
      claimedSlugs.add(possibleMatch.slug);
      console.log(` → Possible match: ${possibleMatch.slug}`);
      console.log(` → URL: ${possibleMatch.url}`);
      updates.push({ id: dbStore.id, slug: possibleMatch.slug, url: possibleMatch.url });
    }
  }

  // Check for stores on website but not in DB
  for (const scraped of scrapedStores) {
    if (!dbSlugs.has(scraped.slug)) {
      console.log(`\n➕ "${scraped.name}" (${scraped.slug}) - ON WEBSITE but not in DB`);
      console.log(` URL: ${scraped.url}`);
    }
  }

  return updates;
}

/**
 * Scrapes the Curaleaf website, compares the result with the Curaleaf rows
 * in the `stores` table, and writes back corrected slugs / URLs.
 *
 * All row updates run in a single transaction: either every planned update
 * is committed or none is. The client is released and the pool is closed
 * whether the run succeeds or fails.
 *
 * @throws Propagates scraping and database errors after cleanup.
 */
async function compareAndUpdate(): Promise<void> {
  try {
    const client = await pool.connect();
    try {
      console.log('Scraping Curaleaf website...\n');
      const scrapedStores = await scrapeArizonaStores();

      console.log('\nQuerying database...\n');
      const result = await client.query(
        "SELECT id, name, slug, dutchie_url FROM stores WHERE name LIKE 'Curaleaf%' ORDER BY name",
      );
      const dbStores: DbStoreRow[] = result.rows;

      console.log('\n=== COMPARISON ===\n');
      const updates = planUpdates(dbStores, scrapedStores);

      if (updates.length === 0) {
        console.log('\n✅ All stores are up to date!');
        return;
      }

      console.log(`\n\n=== APPLYING ${updates.length} UPDATES ===\n`);
      await client.query('BEGIN');
      try {
        for (const update of updates) {
          if (update.slug) {
            await client.query(
              'UPDATE stores SET slug = $1, dutchie_url = $2 WHERE id = $3 RETURNING name',
              [update.slug, update.url, update.id],
            );
            console.log(`✅ Updated store ${update.id} with new slug: ${update.slug}`);
          } else {
            await client.query(
              'UPDATE stores SET dutchie_url = $1 WHERE id = $2 RETURNING name',
              [update.url, update.id],
            );
            console.log(`✅ Updated store ${update.id} with new URL`);
          }
        }
        await client.query('COMMIT');
      } catch (err) {
        // Undo the partial batch; keep the original failure as the one reported.
        try {
          await client.query('ROLLBACK');
        } catch (rollbackErr) {
          console.error('ROLLBACK failed:', rollbackErr);
        }
        throw err;
      }

      console.log(`\n🎉 Successfully updated ${updates.length} stores!`);
    } finally {
      client.release();
    }
  } finally {
    // Runs even when pool.connect() itself rejects.
    await pool.end();
  }
}

compareAndUpdate().catch((err: unknown) => {
  console.error(err);
  // Signal failure to the calling shell / CI instead of exiting with 0.
  process.exitCode = 1;
});