Files
cannaiq/backend/archive/scrape-curaleaf-simple.ts
Kelly d91c55a344 feat: Add stale process monitor, users route, landing page, archive old scripts
- Add backend stale process monitoring API (/api/stale-processes)
- Add users management route
- Add frontend landing page and stale process monitor UI on /scraper-tools
- Move old development scripts to backend/archive/
- Update frontend build with new features

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-05 04:07:31 -07:00

151 lines
4.3 KiB
TypeScript

import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import { Pool } from 'pg';
// Register the stealth plugin on the shared puppeteer instance before any
// browser is launched.
puppeteer.use(StealthPlugin());

// Shared PostgreSQL connection pool for this script.
// DATABASE_URL, when set, overrides the connection string so credentials do
// not have to live in source; the literal below is kept as the fallback so
// the script behaves as before when the variable is absent.
const pool = new Pool({
  connectionString:
    process.env.DATABASE_URL ??
    'postgresql://sail:password@localhost:5432/dutchie_menus'
});
/** One product card as extracted from the page DOM (all values are raw text). */
interface ScrapedProduct {
  name: string;
  brand?: string;
  price?: string;
}

/** Returns a printable message for any thrown value. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Extracts the first numeric amount from a price label.
 *
 * Stripping every non-numeric character (the previous approach) glues
 * multiple amounts together: "$12.00 - $24.00" became "12.0024.00" and was
 * stored as 12.0024. Taking the first amount avoids that.
 *
 * @param text Raw price text, possibly undefined.
 * @returns The first amount found, or 0 when there is none.
 */
function parsePrice(text: string | undefined): number {
  // Drop thousands separators first so "1,200.50" is read as one number.
  const match = /\d+(?:\.\d+)?|\.\d+/.exec((text ?? '').replace(/,/g, ''));
  return parseFloat(match?.[0] ?? '0');
}

/**
 * Scrapes product cards from a single hard-coded Curaleaf store page through
 * a randomly chosen proxy and upserts them into the `products` table.
 *
 * Steps: pick a proxy row, launch a headless browser behind it, load the
 * store page, read name/brand/price from product-like elements, then insert
 * or update one row per product that has both a name and a brand. When no
 * products are found, the page title and the first 500 characters of body
 * text are logged for debugging.
 *
 * Errors are logged rather than rethrown. The browser (if launched) is
 * closed and the pool is ended on every exit path, so the pool cannot be
 * used after this function has run.
 */
async function scrape(): Promise<void> {
  let browser: Awaited<ReturnType<typeof puppeteer.launch>> | undefined;

  try {
    // Get random proxy.
    // NOTE(review): this previously filtered on `active = false`, which picks
    // only disabled proxies; flipped to `true` on the assumption that
    // `active` marks usable proxies — confirm against the proxies schema.
    const proxyResult = await pool.query(`
      SELECT host, port, protocol FROM proxies
      WHERE active = true
      ORDER BY RANDOM()
      LIMIT 1
    `);

    if (proxyResult.rows.length === 0) {
      console.log('❌ No proxies available');
      return;
    }

    const proxy = proxyResult.rows[0];
    const proxyUrl = `${proxy.protocol}://${proxy.host}:${proxy.port}`;
    console.log(`🔌 Using proxy: ${proxy.host}:${proxy.port}\n`);

    // Launch browser with proxy
    browser = await puppeteer.launch({
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        `--proxy-server=${proxyUrl}`
      ]
    });

    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');

    const url = 'https://curaleaf.com/stores/curaleaf-dispensary-phoenix-airport';
    console.log(`🌐 Going to: ${url}\n`);
    await page.goto(url, { waitUntil: 'networkidle2', timeout: 60000 });

    // Wait for products to load.
    // A plain timer is used because page.waitForTimeout() is not available
    // in newer Puppeteer releases.
    console.log('⏳ Waiting for products to load...\n');
    await new Promise<void>((resolve) => setTimeout(resolve, 5000));

    // Scrape products from DOM. The callback runs inside the page, so it must
    // not reference anything from this module's scope (types are erased).
    const products = await page.evaluate(() => {
      const productElements = document.querySelectorAll(
        '[data-testid="product-card"], .product-card, [class*="Product"], [class*="product"]'
      );
      const results: ScrapedProduct[] = [];

      productElements.forEach((el) => {
        try {
          const name = el
            .querySelector('[class*="name"], [class*="Name"], h3, h4')
            ?.textContent?.trim();
          const brand = el
            .querySelector('[class*="brand"], [class*="Brand"]')
            ?.textContent?.trim();
          const price = el
            .querySelector('[class*="price"], [class*="Price"]')
            ?.textContent?.trim();

          if (name) {
            results.push({ name, brand, price });
          }
        } catch {
          // Deliberate best effort: one malformed element must not abort the
          // whole extraction, so it is skipped.
        }
      });

      return results;
    });

    console.log(`📦 Found ${products.length} products\n`);

    if (products.length > 0) {
      // Extract unique brands
      const brands = new Set(products.map((p) => p.brand).filter(Boolean));
      console.log(`🏷️ Brands found: ${Array.from(brands).join(', ')}\n`);

      // Get store ID
      const storeResult = await pool.query(`
        SELECT id FROM stores WHERE slug = 'curaleaf-az-48th-street'
      `);

      if (storeResult.rows.length === 0) {
        console.log('❌ Store not found in database');
        return;
      }

      const storeId = storeResult.rows[0].id;

      // Save to database. Rows are written one at a time so a failure on one
      // product is logged and skipped without losing the others.
      let saved = 0;
      for (const product of products) {
        // The upsert key is (store_id, name, brand), so both are required.
        if (!product.name || !product.brand) continue;

        try {
          await pool.query(`
            INSERT INTO products (store_id, name, brand, price, dutchie_url, in_stock)
            VALUES ($1, $2, $3, $4, $5, true)
            ON CONFLICT (store_id, name, brand) DO UPDATE
            SET price = $4, in_stock = true
          `, [
            storeId,
            product.name,
            product.brand,
            parsePrice(product.price),
            url
          ]);
          saved++;
        } catch (error: unknown) {
          console.log(`⚠️ Skip: ${describeError(error)}`);
        }
      }

      console.log(`✅ Saved ${saved} products to database\n`);
    } else {
      console.log('⚠️ No products found on page\n');

      // Debug: show what we found
      const pageContent = await page.evaluate(() => {
        return {
          title: document.title,
          bodyText: document.body.innerText.substring(0, 500)
        };
      });

      console.log('Page title:', pageContent.title);
      console.log('Page preview:', pageContent.bodyText);
    }
  } catch (error: unknown) {
    console.error('❌ Error:', describeError(error));
  } finally {
    // Close the browser in its own guard so a failure here cannot prevent
    // the pool from being ended.
    if (browser) {
      try {
        await browser.close();
      } catch (error: unknown) {
        console.error('❌ Error:', describeError(error));
      }
    }
    await pool.end();
  }
}
// Script entry point. scrape() returns a promise; attach a rejection handler
// so anything that escapes its internal error handling is reported and the
// process exits non-zero instead of ending in an unhandled rejection.
scrape().catch((error: unknown) => {
  console.error('❌ Error:', error instanceof Error ? error.message : error);
  process.exitCode = 1;
});