Files
cannaiq/backend/archive/scrape-curaleaf-with-proxy.ts
Kelly d91c55a344 feat: Add stale process monitor, users route, landing page, archive old scripts
- Add backend stale process monitoring API (/api/stale-processes)
- Add users management route
- Add frontend landing page and stale process monitor UI on /scraper-tools
- Move old development scripts to backend/archive/
- Update frontend build with new features

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-05 04:07:31 -07:00

260 lines
7.5 KiB
TypeScript
Raw Permalink Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import { Pool } from 'pg';
// Register the stealth plugin before any browser is launched.
const stealthPlugin = StealthPlugin();
puppeteer.use(stealthPlugin);

// Shared PostgreSQL connection pool used by every query in this script.
const DATABASE_CONNECTION_STRING =
  'postgresql://sail:password@localhost:5432/dutchie_menus';
const pool = new Pool({ connectionString: DATABASE_CONNECTION_STRING });
/**
 * Picks one proxy at random from the `proxies` table, considering only rows
 * whose `active` flag is set.
 *
 * @returns The selected row (`id`, `host`, `port`, `protocol`), or `null`
 *          when the query returns no rows.
 */
async function getRandomActiveProxy() {
  // The filter must be `active = true`: the previous `active = false`
  // selected only the proxies that were flagged as NOT active.
  const result = await pool.query(`
    SELECT id, host, port, protocol
    FROM proxies
    WHERE active = true
    ORDER BY RANDOM()
    LIMIT 1
  `);
  return result.rows[0] ?? null;
}
/**
 * Loads a Curaleaf store page through a randomly selected proxy, records the
 * JSON responses that look like product data, prints the unique brand names
 * found in them and upserts up to 50 of the products into the `products`
 * table for the `curaleaf-az-48th-street` store.
 *
 * All failures are logged rather than rethrown. The browser (if one was
 * launched) and the shared `pool` are always released in the `finally`
 * block, so the pool cannot be used after this function resolves.
 */
async function scrapeCuraleafBrands(): Promise<void> {
  // Upper bound on how many captured products are written per run.
  const MAX_PRODUCTS_TO_SAVE = 50;

  // Timer-based delay; avoids depending on the deprecated page.waitForTimeout.
  const sleep = (ms: number): Promise<void> =>
    new Promise<void>(resolve => setTimeout(resolve, ms));

  // Normalises a caught value into a printable message.
  const describeError = (error: unknown): string =>
    error instanceof Error ? error.message : String(error);

  let browser: Awaited<ReturnType<typeof puppeteer.launch>> | undefined;
  try {
    // Get proxy
    const proxy = await getRandomActiveProxy();
    if (!proxy) {
      console.log('⚠️ No proxies available');
      // Cleanup is left to `finally`; ending the pool here as well would make
      // the second pool.end() reject.
      return;
    }
    console.log(`🔌 Using proxy: ${proxy.host}:${proxy.port}`);

    // Launch browser with proxy
    const proxyUrl = `${proxy.protocol}://${proxy.host}:${proxy.port}`;
    browser = await puppeteer.launch({
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
        // A repeated switch keeps only its last value, so every feature to
        // disable has to be listed in a single --disable-features flag.
        '--disable-features=IsolateOrigins,site-per-process,VizDisplayCompositor',
        `--proxy-server=${proxyUrl}`,
        '--disable-web-security'
      ]
    });
    const page = await browser.newPage();

    // Set Googlebot user-agent
    await page.setUserAgent('Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');
    // Set viewport
    await page.setViewport({ width: 1920, height: 1080 });

    // Additional stealth measures, installed before any page script runs.
    // This callback is serialised and executed inside the browser, so it must
    // not reference anything from the surrounding Node scope.
    await page.evaluateOnNewDocument(() => {
      // Override timezone to Arizona while keeping the other resolved options
      // (locale, calendar, ...) that the native implementation reports.
      const nativeResolvedOptions = Intl.DateTimeFormat.prototype.resolvedOptions;
      Object.defineProperty(Intl.DateTimeFormat.prototype, 'resolvedOptions', {
        value: function (this: Intl.DateTimeFormat) {
          return Object.assign({}, nativeResolvedOptions.call(this), {
            timeZone: 'America/Phoenix'
          });
        }
      });

      // Spoof geolocation
      Object.defineProperty(navigator, 'geolocation', {
        get: () => ({
          getCurrentPosition: (success: any) => {
            setTimeout(() => {
              success({
                coords: {
                  latitude: 33.4484, // Phoenix, AZ
                  longitude: -112.0740,
                  accuracy: 100
                }
              });
            }, 100);
          }
        })
      });

      // Remove webdriver flag
      Object.defineProperty(navigator, 'webdriver', {
        get: () => false
      });

      // Chrome runtime
      (window as any).chrome = {
        runtime: {}
      };

      // Languages
      Object.defineProperty(navigator, 'languages', {
        get: () => ['en-US', 'en']
      });

      // Plugins
      Object.defineProperty(navigator, 'plugins', {
        get: () => [1, 2, 3, 4, 5]
      });
    });

    // Get store from database
    const storeResult = await pool.query(`
      SELECT id, name, dutchie_url
      FROM stores
      WHERE slug = 'curaleaf-az-48th-street'
    `);
    if (storeResult.rows.length === 0) {
      console.log('❌ Store not found');
      // Browser and pool are released in `finally`.
      return;
    }
    const store = storeResult.rows[0];

    const testUrl = 'https://curaleaf.com/stores/curaleaf-dispensary-phoenix-airport';
    console.log(`\n🌐 Navigating to: ${testUrl}`);
    console.log(`📦 Store: ${store.name}\n`);

    // Track API responses. The listener is attached before navigation so
    // responses issued during the initial load are captured too.
    const apiResponses: any[] = [];
    page.on('response', async response => {
      const url = response.url();
      try {
        const contentType = response.headers()['content-type'] || '';
        if (contentType.includes('application/json')) {
          const data = await response.json();
          // Look for product data
          if (url.includes('filteredProducts') ||
              url.includes('products') ||
              url.includes('menu') ||
              (data.data && data.data.filteredProducts)) {
            console.log(`📡 Found product API: ${url.substring(0, 80)}...`);
            apiResponses.push({ url, data });
          }
        }
      } catch {
        // Not JSON (or the body could not be read): deliberately ignored,
        // this capture is best-effort.
      }
    });

    await page.goto(testUrl, {
      waitUntil: 'domcontentloaded',
      timeout: 60000
    });

    // Check for Dutchie: the page is treated as a Dutchie menu when it
    // defines a global `reactEnv`.
    const isDutchie = await page.evaluate(() => {
      return typeof (window as any).reactEnv !== 'undefined';
    });
    console.log(`✅ Is Dutchie menu: ${isDutchie}\n`);
    if (!isDutchie) {
      return;
    }

    // Get reactEnv
    const reactEnv = await page.evaluate(() => {
      return (window as any).reactEnv;
    });
    console.log('📋 Dutchie Info:');
    console.log(` Chain ID: ${reactEnv.chainId}`);
    console.log(` Dispensary ID: ${reactEnv.dispensaryId}`);
    console.log(` Retailer ID: ${reactEnv.retailerId}\n`);

    // Scroll to trigger lazy loading
    console.log('📜 Scrolling page to trigger product loading...');
    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight / 2));
    await sleep(3000);
    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    await sleep(3000);
    console.log(`\n📊 Captured ${apiResponses.length} API responses\n`);

    // Extract products from API responses
    const allProducts: any[] = [];
    for (const resp of apiResponses) {
      if (resp.data && resp.data.data && resp.data.data.filteredProducts) {
        const products = resp.data.data.filteredProducts.products || [];
        allProducts.push(...products);
        console.log(`✅ Found ${products.length} products in API response`);
      }
    }

    if (allProducts.length === 0) {
      console.log('⚠️ No products found in API responses\n');
      return;
    }

    // Extract unique brands
    const brands = new Set<string>();
    for (const product of allProducts) {
      if (product.brand) brands.add(product.brand);
      if (product.brandName) brands.add(product.brandName);
    }
    console.log(`\n🏷 Unique Brands Found (${brands.size}):`);
    console.log('─'.repeat(60));
    Array.from(brands).sort().forEach((brand, i) => {
      console.log(`${i + 1}. ${brand}`);
    });
    console.log('─'.repeat(60));

    // Save products to database. Only the first MAX_PRODUCTS_TO_SAVE are
    // written, and the log reports that number rather than the total captured.
    const productsToSave = allProducts.slice(0, MAX_PRODUCTS_TO_SAVE);
    console.log(`\n💾 Saving ${productsToSave.length} products to database...`);
    let saved = 0;
    for (const product of productsToSave) {
      try {
        // Only an unparseable potency becomes NULL; a genuine 0% is kept.
        const thcPercentage = parseFloat(product.potencyThc?.formatted?.replace('%', ''));
        await pool.query(`
          INSERT INTO products (
            store_id, name, brand, price, thc_percentage,
            dutchie_url, in_stock, category
          )
          VALUES ($1, $2, $3, $4, $5, $6, true, $7)
          ON CONFLICT (store_id, name, brand) DO UPDATE
          SET price = $4, thc_percentage = $5, in_stock = true
        `, [
          store.id,
          product.name || 'Unknown',
          product.brand || product.brandName || 'Unknown',
          parseFloat(product.price) || 0,
          Number.isNaN(thcPercentage) ? null : thcPercentage,
          testUrl,
          product.category || 'other'
        ]);
        saved++;
      } catch (error: unknown) {
        // One failed row must not abort the rest of the batch.
        console.log(`❌ Error saving product: ${describeError(error)}`);
      }
    }
    console.log(`✅ Saved ${saved} products to database\n`);
  } catch (error: unknown) {
    console.error('❌ Error:', describeError(error));
  } finally {
    // Single cleanup point for every exit path above.
    if (browser) {
      await browser.close();
    }
    await pool.end();
  }
}
// Script entry point: start the scrape. The returned promise is intentionally
// not awaited; the function logs its own errors.
void scrapeCuraleafBrands();