- Add backend stale process monitoring API (/api/stale-processes)
- Add users management route
- Add frontend landing page and stale process monitor UI on /scraper-tools
- Move old development scripts to backend/archive/
- Update frontend build with new features

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
260 lines
7.5 KiB
TypeScript
260 lines
7.5 KiB
TypeScript
import puppeteer from 'puppeteer-extra';
|
||
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||
import { Pool } from 'pg';
|
||
|
||
// Register the stealth plugin once, before any browser is launched.
puppeteer.use(StealthPlugin());

// Shared PostgreSQL connection pool used by every query in this script.
// DATABASE_URL takes precedence so credentials do not have to live in source
// control; the literal below is kept only as the local-development fallback.
const pool = new Pool({
  connectionString:
    process.env.DATABASE_URL ??
    'postgresql://sail:password@localhost:5432/dutchie_menus'
});
|
||
|
||
/**
 * Picks one proxy at random from the `proxies` table.
 *
 * Only rows with `active = true` are eligible. The previous filter
 * (`active = false`) returned exclusively disabled proxies, which contradicts
 * both the function name and the way the caller uses the result.
 *
 * @returns The selected row (`id`, `host`, `port`, `protocol`), or `null`
 *          when the query returns no rows.
 */
async function getRandomActiveProxy() {
  const result = await pool.query(`
    SELECT id, host, port, protocol
    FROM proxies
    WHERE active = true
    ORDER BY RANDOM()
    LIMIT 1
  `);

  return result.rows[0] ?? null;
}
|
||
|
||
/**
 * Resolves after `ms` milliseconds.
 *
 * Used instead of `page.waitForTimeout`, which newer Puppeteer releases no
 * longer provide.
 */
function delay(ms: number): Promise<void> {
  return new Promise(resolve => setTimeout(resolve, ms));
}

/**
 * Loads a Curaleaf store page through a randomly chosen proxy, records the
 * JSON responses that look like product data, prints the unique brand names
 * found in them, and upserts up to 50 products into the `products` table.
 *
 * Errors raised while scraping are logged, not rethrown. The browser (if one
 * was launched) and the shared pool are always closed in the `finally` block,
 * so the early-return paths must NOT close them themselves: calling
 * `pool.end()` a second time rejects.
 */
async function scrapeCuraleafBrands(): Promise<void> {
  let browser;

  try {
    // Get proxy
    const proxy = await getRandomActiveProxy();

    if (!proxy) {
      console.log('⚠️ No proxies available');
      return; // cleanup happens in `finally`
    }

    console.log(`🔌 Using proxy: ${proxy.host}:${proxy.port}`);

    // Launch browser with proxy
    const proxyUrl = `${proxy.protocol}://${proxy.host}:${proxy.port}`;

    browser = await puppeteer.launch({
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
        // A single --disable-features flag: when the switch is repeated, one
        // occurrence can override the other instead of the lists being merged.
        '--disable-features=IsolateOrigins,site-per-process,VizDisplayCompositor',
        `--proxy-server=${proxyUrl}`,
        '--disable-web-security'
      ]
    });

    const page = await browser.newPage();

    // Set Googlebot user-agent
    await page.setUserAgent('Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');

    // Set viewport
    await page.setViewport({ width: 1920, height: 1080 });

    // Additional stealth measures, installed before any page script runs
    await page.evaluateOnNewDocument(() => {
      // Override timezone to Arizona
      Object.defineProperty(Intl.DateTimeFormat.prototype, 'resolvedOptions', {
        value: function() {
          return { timeZone: 'America/Phoenix' };
        }
      });

      // Spoof geolocation
      Object.defineProperty(navigator, 'geolocation', {
        get: () => ({
          getCurrentPosition: (success: (position: unknown) => void) => {
            setTimeout(() => {
              success({
                coords: {
                  latitude: 33.4484, // Phoenix, AZ
                  longitude: -112.0740,
                  accuracy: 100
                }
              });
            }, 100);
          }
        })
      });

      // Remove webdriver flag
      Object.defineProperty(navigator, 'webdriver', {
        get: () => false
      });

      // Chrome runtime
      (window as any).chrome = {
        runtime: {}
      };

      // Languages
      Object.defineProperty(navigator, 'languages', {
        get: () => ['en-US', 'en']
      });

      // Plugins
      Object.defineProperty(navigator, 'plugins', {
        get: () => [1, 2, 3, 4, 5]
      });
    });

    // Get store from database
    const storeResult = await pool.query(`
      SELECT id, name, dutchie_url
      FROM stores
      WHERE slug = 'curaleaf-az-48th-street'
    `);

    if (storeResult.rows.length === 0) {
      console.log('❌ Store not found');
      return; // browser and pool are closed in `finally`
    }

    const store = storeResult.rows[0];
    const testUrl = 'https://curaleaf.com/stores/curaleaf-dispensary-phoenix-airport';

    console.log(`\n🌐 Navigating to: ${testUrl}`);
    console.log(`📦 Store: ${store.name}\n`);

    // Track API responses
    const apiResponses: Array<{ url: string; data: any }> = [];

    page.on('response', async response => {
      const url = response.url();

      try {
        const contentType = response.headers()['content-type'] || '';
        if (!contentType.includes('application/json')) {
          return;
        }

        const data = await response.json();

        // Look for product data
        const looksLikeProducts =
          url.includes('filteredProducts') ||
          url.includes('products') ||
          url.includes('menu') ||
          (data.data && data.data.filteredProducts);

        if (looksLikeProducts) {
          console.log(`📡 Found product API: ${url.substring(0, 80)}...`);
          apiResponses.push({ url, data });
        }
      } catch {
        // Best effort: the body could not be read or parsed as JSON, so skip it.
      }
    });

    await page.goto(testUrl, {
      waitUntil: 'domcontentloaded',
      timeout: 60000
    });

    // Check for Dutchie: the page is treated as a Dutchie menu when it
    // defines a global `reactEnv`.
    const isDutchie = await page.evaluate(() => {
      return typeof (window as any).reactEnv !== 'undefined';
    });

    console.log(`✅ Is Dutchie menu: ${isDutchie}\n`);

    if (isDutchie) {
      // Get reactEnv
      const reactEnv = await page.evaluate(() => {
        return (window as any).reactEnv;
      });

      console.log('📋 Dutchie Info:');
      console.log(` Chain ID: ${reactEnv.chainId}`);
      console.log(` Dispensary ID: ${reactEnv.dispensaryId}`);
      console.log(` Retailer ID: ${reactEnv.retailerId}\n`);

      // Scroll to trigger lazy loading
      console.log('📜 Scrolling page to trigger product loading...');
      await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight / 2));
      await delay(3000);

      await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
      await delay(3000);

      console.log(`\n📊 Captured ${apiResponses.length} API responses\n`);

      // Extract products from API responses
      const allProducts: any[] = [];

      for (const resp of apiResponses) {
        const filtered = resp.data?.data?.filteredProducts;
        if (filtered) {
          const products = filtered.products || [];
          allProducts.push(...products);
          console.log(`✅ Found ${products.length} products in API response`);
        }
      }

      if (allProducts.length > 0) {
        // Extract unique brands
        const brands = new Set<string>();
        for (const product of allProducts) {
          if (product.brand) brands.add(product.brand);
          if (product.brandName) brands.add(product.brandName);
        }

        console.log(`\n🏷️ Unique Brands Found (${brands.size}):`);
        console.log('─'.repeat(60));
        Array.from(brands).sort().forEach((brand, i) => {
          console.log(`${i + 1}. ${brand}`);
        });
        console.log('─'.repeat(60));

        // Save products to database (first 50 only). The log reports the
        // number actually attempted rather than the total captured.
        const productsToSave = allProducts.slice(0, 50);
        console.log(`\n💾 Saving ${productsToSave.length} products to database...`);

        let saved = 0;
        for (const product of productsToSave) {
          try {
            await pool.query(`
              INSERT INTO products (
                store_id, name, brand, price, thc_percentage,
                dutchie_url, in_stock, category
              )
              VALUES ($1, $2, $3, $4, $5, $6, true, $7)
              ON CONFLICT (store_id, name, brand) DO UPDATE
              SET price = $4, thc_percentage = $5, in_stock = true
            `, [
              store.id,
              product.name || 'Unknown',
              product.brand || product.brandName || 'Unknown',
              parseFloat(product.price) || 0,
              parseFloat(product.potencyThc?.formatted?.replace('%', '')) || null,
              testUrl,
              product.category || 'other'
            ]);
            saved++;
          } catch (error: unknown) {
            // One failed row must not abort the rest of the batch.
            const message = error instanceof Error ? error.message : String(error);
            console.log(`❌ Error saving product: ${message}`);
          }
        }

        console.log(`✅ Saved ${saved} products to database\n`);
      } else {
        console.log('⚠️ No products found in API responses\n');
      }
    }

  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    console.error('❌ Error:', message);
  } finally {
    // Nested so the pool is closed even if closing the browser throws.
    try {
      if (browser) {
        await browser.close();
      }
    } finally {
      await pool.end();
    }
  }
}
|
||
|
||
// Script entry point. Any rejection that escapes scrapeCuraleafBrands (for
// example from its cleanup code) is reported here instead of surfacing as an
// unhandled promise rejection, and the process exits with a failure status.
scrapeCuraleafBrands().catch((error: unknown) => {
  console.error('❌ Fatal error:', error instanceof Error ? error.message : error);
  process.exitCode = 1;
});
|