- Add backend stale process monitoring API (/api/stale-processes)
- Add users management route
- Add frontend landing page and stale process monitor UI on /scraper-tools
- Move old development scripts to backend/archive/
- Update frontend build with new features

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
151 lines
4.3 KiB
TypeScript
151 lines
4.3 KiB
TypeScript
import puppeteer from 'puppeteer-extra';
|
|
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
|
import { Pool } from 'pg';
|
|
|
|
// Register the stealth plugin on the shared puppeteer-extra instance. This must
// happen before any browser is launched so the plugin applies to it.
puppeteer.use(StealthPlugin());

// Shared PostgreSQL connection pool used by every query in this script.
// The connection string can be overridden through the DATABASE_URL environment
// variable so credentials do not have to live in source control; the original
// local-development URL is kept as the fallback. `||` (not `??`) is deliberate:
// an empty DATABASE_URL should also fall back to the default.
const pool = new Pool({
  connectionString:
    process.env.DATABASE_URL || 'postgresql://sail:password@localhost:5432/dutchie_menus'
});
|
|
|
|
/** Shape of one product record extracted from the page DOM. */
type ScrapedProduct = {
  name: string;
  brand?: string;
  price?: string;
};

/** Delay (ms) given to the page to render products after navigation settles. */
const PRODUCT_RENDER_DELAY_MS = 5000;

/** Returns a printable message for any thrown value (not only `Error` instances). */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Converts a scraped price label into a number.
 *
 * Takes the first numeric token in the text (thousands separators removed), so a
 * label holding several prices such as "$10.00 - $20.00" yields 10 instead of the
 * digits of both prices being glued together. Returns 0 when no number is present,
 * which also avoids passing NaN to the database for inputs such as ".".
 */
function parsePrice(raw: string | undefined): number {
  const match = raw?.replace(/,/g, '').match(/\d+(?:\.\d+)?|\.\d+/);
  return match ? parseFloat(match[0]) : 0;
}

/**
 * Scrapes products from a hard-coded Curaleaf store page and upserts them into
 * the `products` table.
 *
 * Steps:
 *  1. Pick one random row from `proxies` and launch a headless browser through it.
 *  2. Load the store page, wait for rendering, and collect name/brand/price text
 *     from every element matching the product selectors.
 *  3. Look up the store with slug `curaleaf-az-48th-street` and upsert each product
 *     that has both a name and a brand.
 *
 * Errors are logged rather than rethrown. The browser (if launched) and the
 * connection pool are always closed before the function settles, so `pool` is
 * unusable afterwards.
 */
async function scrape(): Promise<void> {
  let browser;

  try {
    // Get random proxy
    // NOTE(review): this selects proxies with `active = false`, which looks inverted
    // for "pick a usable proxy" — confirm against the `proxies` schema before changing.
    const proxyResult = await pool.query(`
      SELECT host, port, protocol FROM proxies
      WHERE active = false
      ORDER BY RANDOM()
      LIMIT 1
    `);

    if (proxyResult.rows.length === 0) {
      console.log('❌ No proxies available');
      return;
    }

    const proxy = proxyResult.rows[0];
    const proxyUrl = `${proxy.protocol}://${proxy.host}:${proxy.port}`;

    console.log(`🔌 Using proxy: ${proxy.host}:${proxy.port}\n`);

    // Launch browser with proxy
    browser = await puppeteer.launch({
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        `--proxy-server=${proxyUrl}`
      ]
    });

    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');

    const url = 'https://curaleaf.com/stores/curaleaf-dispensary-phoenix-airport';
    console.log(`🌐 Going to: ${url}\n`);

    await page.goto(url, { waitUntil: 'networkidle2', timeout: 60000 });

    // Wait for products to load. `page.waitForTimeout` has been removed from
    // current Puppeteer releases, so use a plain timer for the same fixed delay.
    console.log('⏳ Waiting for products to load...\n');
    await new Promise<void>((resolve) => setTimeout(resolve, PRODUCT_RENDER_DELAY_MS));

    // Scrape products from DOM. The callback runs inside the browser page, so it
    // must be self-contained and cannot reference helpers defined in this file.
    const products = await page.evaluate(() => {
      const productElements = document.querySelectorAll('[data-testid="product-card"], .product-card, [class*="Product"], [class*="product"]');
      const results: { name: string; brand?: string; price?: string }[] = [];

      productElements.forEach(el => {
        try {
          // Try to extract product info from the element
          const nameEl = el.querySelector('[class*="name"], [class*="Name"], h3, h4');
          const brandEl = el.querySelector('[class*="brand"], [class*="Brand"]');
          const priceEl = el.querySelector('[class*="price"], [class*="Price"]');

          const name = nameEl?.textContent?.trim();
          const brand = brandEl?.textContent?.trim();
          const price = priceEl?.textContent?.trim();

          // Elements without a name are not treated as products.
          if (name) {
            results.push({ name, brand, price });
          }
        } catch {
          // Best effort: skip any element that cannot be read.
        }
      });

      return results;
    }) as ScrapedProduct[];

    console.log(`📦 Found ${products.length} products\n`);

    if (products.length > 0) {
      // Extract unique brands
      const brands = new Set(products.map(p => p.brand).filter(Boolean));

      console.log(`🏷️ Brands found: ${Array.from(brands).join(', ')}\n`);

      // Get store ID
      const storeResult = await pool.query(`
        SELECT id FROM stores WHERE slug = 'curaleaf-az-48th-street'
      `);

      if (storeResult.rows.length === 0) {
        console.log('❌ Store not found in database');
        return;
      }

      const storeId = storeResult.rows[0].id;

      // Save to database. Rows are written one at a time so a failure on one
      // product is logged and skipped without aborting the rest.
      let saved = 0;
      for (const product of products) {
        // The upsert key is (store_id, name, brand), so both fields are required.
        if (!product.name || !product.brand) continue;

        try {
          await pool.query(`
            INSERT INTO products (store_id, name, brand, price, dutchie_url, in_stock)
            VALUES ($1, $2, $3, $4, $5, true)
            ON CONFLICT (store_id, name, brand) DO UPDATE
            SET price = $4, in_stock = true
          `, [
            storeId,
            product.name,
            product.brand,
            parsePrice(product.price),
            url
          ]);
          saved++;
        } catch (error: unknown) {
          console.log(`⚠️ Skip: ${describeError(error)}`);
        }
      }

      console.log(`✅ Saved ${saved} products to database\n`);
    } else {
      console.log('⚠️ No products found on page\n');

      // Debug: show what we found
      const pageContent = await page.evaluate(() => {
        return {
          title: document.title,
          bodyText: document.body.innerText.substring(0, 500)
        };
      });

      console.log('Page title:', pageContent.title);
      console.log('Page preview:', pageContent.bodyText);
    }

  } catch (error: unknown) {
    console.error('❌ Error:', describeError(error));
  } finally {
    // A failed browser shutdown must not prevent the pool from being released,
    // otherwise open connections keep the process alive.
    if (browser) {
      try {
        await browser.close();
      } catch (closeError: unknown) {
        console.error('⚠️ Failed to close browser:', describeError(closeError));
      }
    }
    await pool.end();
  }
}
|
|
|
|
// Run the scraper as soon as this file is executed. `scrape` logs errors from its
// main body itself, but the cleanup in its `finally` block can still reject, so
// handle that here instead of leaving a floating promise with an unhandled rejection.
scrape().catch((error: unknown) => {
  console.error('❌ Error:', error instanceof Error ? error.message : error);
  process.exitCode = 1;
});
|