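// Source listing metadata (from the repository file viewer):
//   Path:     cannaiq/backend/scrape-curaleaf-brands.ts
//   Modified: 2025-11-28 19:45:44 -07:00
//   Size:     185 lines, 5.6 KiB
//   Language: TypeScript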

import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import { Pool } from 'pg';
// Register the stealth plugin with puppeteer-extra before any browser is launched.
puppeteer.use(StealthPlugin());

// Shared PostgreSQL pool for the whole script.
// The connection string is taken from the DATABASE_URL environment variable
// when it is set, so credentials do not have to live in source control; the
// original local-development DSN is kept as the fallback for compatibility.
const pool = new Pool({
  connectionString:
    process.env.DATABASE_URL ??
    'postgresql://sail:password@localhost:5432/dutchie_menus'
});
/**
 * Scrapes the brand names listed on the Curaleaf "Phoenix Airport" store
 * brands page and replaces the rows in the `brands` table for the store
 * whose slug is `curaleaf-az-48th-street`.
 *
 * Steps:
 *  1. Pick a random proxy from the `proxies` table and launch a headless
 *     browser through it with a mobile Chrome user agent.
 *  2. Open the brands page; if redirected to the age gate, select "Arizona"
 *     and click the "I'm over 21" button.
 *  3. Collect candidate brand names from the DOM, filtered and de-duplicated.
 *  4. Inside a single transaction, delete the store's existing brands and
 *     insert the scraped ones.
 *
 * Errors are logged rather than rethrown. The browser is closed and the
 * shared pool is ended in all cases, so this function is intended to be run
 * once per process.
 *
 * @returns A promise that resolves once the scrape (and cleanup) has finished.
 */
async function scrapeCuraleafBrands(): Promise<void> {
  // `page.waitForTimeout` is not available in current Puppeteer releases,
  // so fixed delays use a plain timer instead.
  const sleep = (ms: number): Promise<void> =>
    new Promise(resolve => setTimeout(resolve, ms));

  const storeSlug = 'curaleaf-az-48th-street';

  let browser;
  try {
    // Get random proxy
    const proxyResult = await pool.query(`
      SELECT host, port, protocol FROM proxies
      ORDER BY RANDOM() LIMIT 1
    `);
    const proxy = proxyResult.rows[0];
    if (!proxy) {
      // Without this guard an empty table surfaces as an opaque TypeError.
      throw new Error('No proxies available in the proxies table');
    }
    const proxyUrl = `${proxy.protocol}://${proxy.host}:${proxy.port}`;
    console.log('🔌 Proxy:', `${proxy.host}:${proxy.port}`);

    browser = await puppeteer.launch({
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        `--proxy-server=${proxyUrl}`
      ]
    });
    const page = await browser.newPage();

    // Mobile Chrome UA
    const mobileUA = 'Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Mobile Safari/537.36';
    await page.setUserAgent(mobileUA);
    console.log('📱 UA: Mobile Chrome');
    console.log('');

    const url = 'https://curaleaf.com/stores/curaleaf-dispensary-phoenix-airport/brands';
    console.log('🌐 Going to:', url);
    await page.goto(url, { waitUntil: 'networkidle2', timeout: 60000 });
    await sleep(3000);

    if (page.url().includes('/age-gate')) {
      console.log('🔒 Handling age gate...');

      // Gate 1: State selector
      await page.waitForSelector('button[role="combobox"]', { timeout: 10000 });
      await page.click('button[role="combobox"]');
      console.log(' ✅ Opened dropdown');
      await sleep(2000);

      // Find and click Arizona with a real Puppeteer click
      await page.waitForSelector('[role="option"]', { timeout: 5000 });
      const options = await page.$$('[role="option"]');
      let stateSelected = false;
      for (const option of options) {
        const text = await option.evaluate(el => el.textContent?.toLowerCase().trim());
        if (text === 'arizona') {
          await option.click();
          console.log(' ✅ Selected Arizona');
          stateSelected = true;
          break;
        }
      }
      if (!stateSelected) {
        // Keep going (best effort), but make the likely cause of a later
        // timeout visible in the log.
        console.warn(' ⚠️ Arizona option not found in state dropdown');
      }
      await sleep(3000);

      // Gate 2: Age confirmation - wait for the button to appear.
      // waitForFunction throws on timeout, so reaching the next line means
      // the button is present.
      await page.waitForFunction(() => {
        const buttons = Array.from(document.querySelectorAll('button'));
        return buttons.some(btn => btn.textContent?.trim().toLowerCase().includes("i'm over 21"));
      }, { timeout: 10000 });
      console.log(' ✅ Age button appeared');

      // Click it with page.evaluate since we know the text
      const clicked = await page.evaluate(() => {
        const buttons = Array.from(document.querySelectorAll('button'));
        const ageBtn = buttons.find(btn =>
          btn.textContent?.trim().toLowerCase().includes("i'm over 21")
        );
        if (!ageBtn) return false;
        ageBtn.click();
        return true;
      });
      if (clicked) {
        console.log(' ✅ Clicked age confirmation');
      } else {
        console.warn(' ⚠️ Age confirmation button disappeared before it could be clicked');
      }
      await sleep(5000);
    }

    console.log('');
    console.log('📦 Scraping brands...');
    console.log('📍 URL:', page.url());
    await sleep(3000);

    // Scrape brands with better filtering. This callback runs in the page
    // context, so it must not reference variables from this scope.
    const brands = await page.evaluate(() => {
      const selectors = [
        '[data-testid*="brand"]',
        '[class*="Brand"]',
        '[class*="brand"]',
        'a[href*="/brand/"]'
      ];
      const found = new Set<string>();
      selectors.forEach(selector => {
        document.querySelectorAll(selector).forEach(el => {
          const text = el.textContent?.trim();
          // Filter out single letters, "Brands", "Search", etc.
          if (text &&
            text.length > 1 &&
            text.length < 50 &&
            text !== 'Brands' &&
            text !== 'Search' &&
            text !== 'BrandsSearch' &&
            !/^[A-Z]$/.test(text)) {
            found.add(text);
          }
        });
      });
      return Array.from(found).sort();
    });

    console.log(`\n✅ Found ${brands.length} brands`);
    console.log('─'.repeat(60));
    brands.forEach((b, i) => console.log(` ${i + 1}. ${b}`));
    console.log('─'.repeat(60));

    if (brands.length === 0) {
      // An empty result almost certainly means the scrape failed (blocked,
      // layout change, age gate); do not wipe the existing data for it.
      console.log('⚠️ No brands scraped; leaving existing database rows untouched');
      return;
    }

    // Save to database
    console.log('');
    console.log('💾 Saving to database...');

    // Get the store ID
    const storeResult = await pool.query(
      'SELECT id FROM stores WHERE slug = $1',
      [storeSlug]
    );
    if (storeResult.rows.length === 0) {
      console.log(`❌ Store not found: ${storeSlug}`);
      return;
    }
    const storeId = storeResult.rows[0].id;

    // Replace the store's brands atomically: a transaction needs a single
    // dedicated client, not the pool's per-query connections.
    const client = await pool.connect();
    try {
      await client.query('BEGIN');

      // Delete existing brands for this store
      await client.query('DELETE FROM brands WHERE store_id = $1', [storeId]);
      console.log(` 🗑️ Deleted old brands for store ${storeId}`);

      // Insert new brands using ON CONFLICT to handle duplicates
      let inserted = 0;
      for (const brandName of brands) {
        await client.query(`
          INSERT INTO brands (store_id, name, created_at, updated_at)
          VALUES ($1, $2, NOW(), NOW())
          ON CONFLICT (store_id, name) DO UPDATE
          SET updated_at = NOW()
        `, [storeId, brandName]);
        inserted++;
      }

      await client.query('COMMIT');
      console.log(` ✅ Saved ${inserted} brands`);
    } catch (dbError: unknown) {
      // Undo the DELETE and any partial inserts, then let the outer handler log.
      await client.query('ROLLBACK');
      throw dbError;
    } finally {
      client.release();
    }

    console.log('');
    console.log(`🎉 Complete! View at: http://localhost:5174/stores/az/curaleaf/${storeSlug}/brands`);
  } catch (error: unknown) {
    if (error instanceof Error) {
      console.error('❌ Error:', error.message);
      console.error(error.stack);
    } else {
      console.error('❌ Error:', error);
    }
  } finally {
    // End the pool even if closing the browser fails.
    try {
      if (browser) await browser.close();
    } finally {
      await pool.end();
    }
  }
}
// Script entry point. The scraper logs its own errors, but cleanup in its
// `finally` block can still reject; catch that here so the promise is never
// left floating and the process reports failure.
scrapeCuraleafBrands().catch((error: unknown) => {
  console.error('❌ Unhandled error:', error);
  process.exitCode = 1;
});