- Add backend stale process monitoring API (/api/stale-processes) - Add users management route - Add frontend landing page and stale process monitor UI on /scraper-tools - Move old development scripts to backend/archive/ - Update frontend build with new features 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
185 lines
5.6 KiB
TypeScript
185 lines
5.6 KiB
TypeScript
import puppeteer from 'puppeteer-extra';
|
|
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
|
import { Pool } from 'pg';
|
|
|
|
// Register the stealth plugin on the shared puppeteer-extra instance before any
// browser is launched.
puppeteer.use(StealthPlugin());

// Shared PostgreSQL connection pool for the whole script.
// DATABASE_URL takes precedence so credentials do not have to live in source
// control; the literal below is only the local development fallback.
const pool = new Pool({
  connectionString:
    process.env.DATABASE_URL ??
    'postgresql://sail:password@localhost:5432/dutchie_menus'
});
|
|
|
|
/**
 * Scrapes the brand list from the Curaleaf "phoenix-airport" brands page and
 * replaces the brands stored for the `curaleaf-az-48th-street` store.
 *
 * Flow:
 *  1. Pick a random proxy from the `proxies` table and launch a headless
 *     browser through it with a mobile Chrome user agent.
 *  2. If redirected to the age gate, select "Arizona" and confirm the
 *     "I'm over 21" prompt.
 *  3. Collect candidate brand names from the page.
 *  4. Replace the store's rows in `brands` inside a single transaction.
 *
 * Errors are logged, not rethrown. The browser and the connection pool are
 * always closed on exit, so this can only run once per process.
 *
 * NOTE(review): the scraped URL slug and the store slug written to differ;
 * presumably they refer to the same location — confirm.
 */
async function scrapeCuraleafBrands(): Promise<void> {
  let browser;

  try {
    // Get random proxy
    const proxyResult = await pool.query(`
      SELECT host, port, protocol FROM proxies
      ORDER BY RANDOM() LIMIT 1
    `);

    const proxy = proxyResult.rows[0];
    if (!proxy) {
      // Fail with a clear message instead of a TypeError on `proxy.protocol`.
      throw new Error('No proxies available in the proxies table');
    }
    const proxyUrl = `${proxy.protocol}://${proxy.host}:${proxy.port}`;

    console.log('🔌 Proxy:', `${proxy.host}:${proxy.port}`);

    browser = await puppeteer.launch({
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        `--proxy-server=${proxyUrl}`
      ]
    });

    const page = await browser.newPage();

    // Mobile Chrome UA
    const mobileUA = 'Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Mobile Safari/537.36';
    await page.setUserAgent(mobileUA);

    console.log('📱 UA: Mobile Chrome');
    console.log('');

    const url = 'https://curaleaf.com/stores/curaleaf-dispensary-phoenix-airport/brands';
    console.log('🌐 Going to:', url);

    await page.goto(url, { waitUntil: 'networkidle2', timeout: 60000 });
    await sleep(3000);

    if (page.url().includes('/age-gate')) {
      console.log('🔒 Handling age gate...');

      // Gate 1: State selector
      await page.waitForSelector('button[role="combobox"]', { timeout: 10000 });
      await page.click('button[role="combobox"]');
      console.log(' ✅ Opened dropdown');

      await sleep(2000);

      // Find and click Arizona with a real Puppeteer click (not a DOM click).
      await page.waitForSelector('[role="option"]', { timeout: 5000 });
      const options = await page.$$('[role="option"]');

      let stateSelected = false;
      for (const option of options) {
        const text = await option.evaluate(el => el.textContent?.toLowerCase().trim());
        if (text === 'arizona') {
          await option.click();
          console.log(' ✅ Selected Arizona');
          stateSelected = true;
          break;
        }
      }
      if (!stateSelected) {
        // Keep going (the next wait will time out if the gate is stuck), but
        // leave a trace of the likely root cause.
        console.warn(' ⚠️ Arizona option not found in state selector');
      }

      await sleep(3000);

      // Gate 2: Age confirmation. waitForFunction rejects on timeout, so
      // reaching the next statement means the button is present.
      await page.waitForFunction(() => {
        const buttons = Array.from(document.querySelectorAll('button'));
        return buttons.some(btn => btn.textContent?.trim().toLowerCase().includes("i'm over 21"));
      }, { timeout: 10000 });
      console.log(' ✅ Age button appeared');

      // Click it from inside the page since it is located by its text.
      await page.evaluate(() => {
        const buttons = Array.from(document.querySelectorAll('button'));
        const ageBtn = buttons.find(btn =>
          btn.textContent?.trim().toLowerCase().includes("i'm over 21")
        );
        ageBtn?.click();
      });

      console.log(' ✅ Clicked age confirmation');
      await sleep(5000);
    }

    console.log('');
    console.log('📦 Scraping brands...');
    console.log('📍 URL:', page.url());

    await sleep(3000);

    // Scrape brands with better filtering. The callback runs in the browser
    // context, so it must not reference anything from this module.
    const brands = await page.evaluate(() => {
      const selectors = [
        '[data-testid*="brand"]',
        '[class*="Brand"]',
        '[class*="brand"]',
        'a[href*="/brand/"]'
      ];

      const found = new Set<string>();

      selectors.forEach(selector => {
        document.querySelectorAll(selector).forEach(el => {
          const text = el.textContent?.trim();
          // Filter out single letters, "Brands", "Search", etc.
          if (text &&
              text.length > 1 &&
              text.length < 50 &&
              text !== 'Brands' &&
              text !== 'Search' &&
              text !== 'BrandsSearch' &&
              !/^[A-Z]$/.test(text)) {
            found.add(text);
          }
        });
      });

      return Array.from(found).sort();
    });

    console.log(`\n✅ Found ${brands.length} brands`);
    console.log('─'.repeat(60));
    brands.forEach((b: string, i: number) => console.log(` ${i + 1}. ${b}`));
    console.log('─'.repeat(60));

    if (brands.length === 0) {
      // An empty result almost certainly means the page did not load as
      // expected; do not wipe the brands already stored for the store.
      console.warn('⚠️ No brands scraped; leaving existing brands untouched');
      return;
    }

    // Save to database
    console.log('');
    console.log('💾 Saving to database...');

    // Get the store ID
    const storeResult = await pool.query(`
      SELECT id FROM stores WHERE slug = 'curaleaf-az-48th-street'
    `);

    if (storeResult.rows.length === 0) {
      console.log('❌ Store not found: curaleaf-az-48th-street');
      return;
    }

    const storeId = storeResult.rows[0].id;

    const inserted = await replaceStoreBrands(storeId, brands);

    console.log(` ✅ Saved ${inserted} brands`);
    console.log('');
    console.log('🎉 Complete! View at: http://localhost:5174/stores/az/curaleaf/curaleaf-az-48th-street/brands');

  } catch (error: unknown) {
    if (error instanceof Error) {
      console.error('❌ Error:', error.message);
      console.error(error.stack);
    } else {
      console.error('❌ Error:', error);
    }
  } finally {
    // Nested so the pool is closed even if closing the browser throws.
    try {
      if (browser) await browser.close();
    } finally {
      await pool.end();
    }
  }
}

/**
 * Resolves after `ms` milliseconds. Used for fixed settle delays in place of
 * `page.waitForTimeout`, which is no longer available in current Puppeteer.
 */
function sleep(ms: number): Promise<void> {
  return new Promise(resolve => setTimeout(resolve, ms));
}

/**
 * Replaces every row in `brands` for `storeId` with `brandNames`, inside one
 * transaction so a failure part-way through leaves the previous rows intact.
 *
 * @returns the number of brand names written.
 * @throws whatever the database driver throws; the transaction is rolled back first.
 */
async function replaceStoreBrands(
  storeId: number | string,
  brandNames: readonly string[]
): Promise<number> {
  // A dedicated client is required: BEGIN/COMMIT must run on one connection.
  const client = await pool.connect();
  try {
    await client.query('BEGIN');

    // Delete existing brands for this store
    await client.query('DELETE FROM brands WHERE store_id = $1', [storeId]);

    // Insert new brands using ON CONFLICT to handle duplicates
    let inserted = 0;
    for (const brandName of brandNames) {
      await client.query(`
        INSERT INTO brands (store_id, name, created_at, updated_at)
        VALUES ($1, $2, NOW(), NOW())
        ON CONFLICT (store_id, name) DO UPDATE
        SET updated_at = NOW()
      `, [storeId, brandName]);
      inserted++;
    }

    await client.query('COMMIT');
    console.log(` 🗑️ Deleted old brands for store ${storeId}`);
    return inserted;
  } catch (error) {
    await client.query('ROLLBACK');
    throw error;
  } finally {
    client.release();
  }
}
|
|
|
|
// Script entry point. The scraper logs its own errors, but cleanup in its
// `finally` can still reject; report that and signal failure to the caller.
scrapeCuraleafBrands().catch((error: unknown) => {
  console.error('❌ Unhandled error:', error);
  process.exitCode = 1;
});
|