Files
cannaiq/backend/archive/scrape-dutchie-brands.ts
Kelly d91c55a344 feat: Add stale process monitor, users route, landing page, archive old scripts
- Add backend stale process monitoring API (/api/stale-processes)
- Add users management route
- Add frontend landing page and stale process monitor UI on /scraper-tools
- Move old development scripts to backend/archive/
- Update frontend build with new features

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-05 04:07:31 -07:00

263 lines
8.5 KiB
TypeScript

import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import { Pool } from 'pg';
// Register the stealth plugin with puppeteer-extra before any browser is launched.
puppeteer.use(StealthPlugin());

// Shared Postgres pool (used below to look up a proxy row).
// DATABASE_URL, when set and non-empty, overrides the local development
// default so credentials do not have to live in source control.
const pool = new Pool({
  connectionString:
    process.env.DATABASE_URL || 'postgresql://sail:password@localhost:5432/dutchie_menus'
});
/**
 * Resolves after `ms` milliseconds.
 * Replaces `page.waitForTimeout`, which newer Puppeteer releases no longer provide.
 */
function delay(ms: number): Promise<void> {
  return new Promise(resolve => setTimeout(resolve, ms));
}

/**
 * Returns the product array carried by an intercepted API payload, or `[]`
 * when none of the recognised locations holds an array.
 */
function productsFromPayload(payload: any): any[] {
  // Same three locations the response listener uses to decide a payload is interesting.
  const candidates = [
    payload?.data?.filteredProducts?.products,
    payload?.data?.products,
    payload?.products
  ];
  const found = candidates.find(candidate => Array.isArray(candidate));
  return found ?? [];
}

/**
 * Picks a usable brand name (1-99 characters) from a product record.
 * `brand` may be a plain string; an object carrying `name` is also accepted
 * defensively (assumed shape - confirm against real payloads).
 */
function brandNameOf(product: any): string | undefined {
  const candidates = [product?.brand, product?.brand?.name, product?.brandName];
  return candidates.find(
    candidate => typeof candidate === 'string' && candidate.length > 0 && candidate.length < 100
  );
}

/** Collects the unique brand names and total product count across all intercepted payloads. */
function extractBrandsFromPayloads(payloads: any[]): { brands: string[]; productCount: number } {
  const brandSet = new Set<string>();
  let productCount = 0;
  for (const payload of payloads) {
    const products = productsFromPayload(payload);
    productCount += products.length;
    for (const product of products) {
      const name = brandNameOf(product);
      if (name !== undefined) {
        brandSet.add(name);
      }
    }
  }
  return { brands: Array.from(brandSet), productCount };
}

/**
 * Loads `storeUrl` in a headless browser, decides whether the page is a Dutchie
 * menu (by the presence of `window.reactEnv`) and, if so, lists the brand names
 * it can find.
 *
 * Brand names come from JSON API responses intercepted while the page loads and
 * scrolls; when no product payload was intercepted, they are scraped from the DOM.
 * If the `proxies` table has a row, the browser is launched through that proxy.
 *
 * @param storeUrl URL of the store page to inspect.
 * @returns `{ isDutchie: false, brands: [] }` when the page is not a Dutchie menu,
 *   `{ isDutchie: true, brands, dutchieInfo }` (brands sorted) on success, or
 *   `{ isDutchie: false, brands: [], error }` when anything in the scrape throws.
 *   Errors are reported in the result rather than rethrown.
 */
async function scrapeDutchieBrands(storeUrl: string) {
  let browser: Awaited<ReturnType<typeof puppeteer.launch>> | undefined;
  try {
    console.log(`\n🔍 Checking if ${storeUrl} is a Dutchie menu...\n`);

    // Get proxy
    const proxyResult = await pool.query(`SELECT host, port, protocol FROM proxies LIMIT 1`);
    const proxy = proxyResult.rows[0];
    const browserArgs = [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-blink-features=AutomationControlled'
    ];
    if (proxy) {
      const proxyUrl = `${proxy.protocol}://${proxy.host}:${proxy.port}`;
      browserArgs.push(`--proxy-server=${proxyUrl}`);
      console.log(`Using proxy: ${proxy.host}:${proxy.port}`);
    }

    browser = await puppeteer.launch({
      headless: true,
      args: browserArgs
    });
    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');
    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, 'webdriver', {
        get: () => false,
      });
    });

    // Enable request interception to capture API calls
    await page.setRequestInterception(true);
    // Every payload that looks like product data is kept, so brands from
    // several responses are merged instead of the last response winning.
    const productPayloads: any[] = [];
    const apiCalls: string[] = [];

    page.on('request', request => {
      const url = request.url();
      // Log API calls
      if (url.includes('api') || url.includes('graphql') || url.includes('.json')) {
        apiCalls.push(url);
      }
      // continue() returns a promise that can reject (e.g. page closed mid-flight);
      // swallow that so it cannot surface as an unhandled rejection.
      void request.continue().catch(() => undefined);
    });

    page.on('response', async response => {
      const url = response.url();
      // Capture ANY API calls that look like they might have product data
      const looksLikeProductApi =
        url.includes('api.dutchie.com') ||
        url.includes('/graphql') ||
        url.includes('/api/') ||
        url.includes('products');
      if (!looksLikeProductApi) {
        return;
      }
      console.log(`📡 API call detected: ${url.substring(0, 100)}...`);
      try {
        const contentType = response.headers()['content-type'] || '';
        if (!contentType.includes('application/json')) {
          return;
        }
        const data = await response.json();
        console.log(` Response keys: ${Object.keys(data).join(', ')}`);
        // Look for product data in the response
        if (data && (data.data?.filteredProducts || data.data?.products || data.products)) {
          console.log(' ✅ Found product data in response!');
          productPayloads.push(data);
        }
      } catch (e) {
        // Best effort: ignore bodies that cannot be read or parsed as JSON
      }
    });

    console.log('Navigating to store page...');
    await page.goto(storeUrl, { waitUntil: 'networkidle2', timeout: 60000 });

    // Check if it's a Dutchie menu by looking for window.reactEnv
    const isDutchie = await page.evaluate(() => {
      return typeof (window as any).reactEnv !== 'undefined';
    });
    if (!isDutchie) {
      console.log('❌ This is not a Dutchie menu page');
      return { isDutchie: false, brands: [] };
    }
    console.log('✅ Detected Dutchie menu!');

    // Extract dispensary info from reactEnv
    const dutchieInfo = await page.evaluate(() => {
      const env = (window as any).reactEnv;
      return {
        dispensaryId: env?.dispensaryId,
        chainId: env?.chainId,
        retailerId: env?.retailerId
      };
    });
    console.log('\nDutchie Menu Info:');
    console.log('─'.repeat(80));
    console.log(`Chain ID: ${dutchieInfo.chainId}`);
    console.log(`Dispensary ID: ${dutchieInfo.dispensaryId}`);
    console.log(`Retailer ID: ${dutchieInfo.retailerId}`);
    console.log('─'.repeat(80));

    // Scroll page to trigger product loading
    console.log('\nScrolling page to trigger product loading...');
    await page.evaluate(() => {
      window.scrollTo(0, document.body.scrollHeight / 2);
    });
    await delay(3000);
    await page.evaluate(() => {
      window.scrollTo(0, document.body.scrollHeight);
    });
    console.log('Waiting for products to load via API...');
    await delay(10000);

    console.log(`\n📊 Total API calls detected: ${apiCalls.length}`);
    if (apiCalls.length > 0) {
      console.log('API endpoints called:');
      apiCalls.slice(0, 10).forEach((url, i) => {
        console.log(` ${i + 1}. ${url.substring(0, 120)}`);
      });
      if (apiCalls.length > 10) {
        console.log(` ... and ${apiCalls.length - 10} more\n`);
      }
    }

    // Extract brands from intercepted API data or DOM
    let brands: string[] = [];
    if (productPayloads.length > 0) {
      console.log('✅ Successfully intercepted products API data!');
      const extracted = extractBrandsFromPayloads(productPayloads);
      brands = extracted.brands;
      console.log(`Found ${extracted.productCount} products in API response`);
    } else {
      console.log('⚠️ No API data intercepted, trying DOM extraction...');
      // Fallback: Extract brands from DOM
      brands = await page.evaluate(() => {
        const brandSet = new Set<string>();
        // Candidate selectors for brand labels on product cards
        const selectors = [
          '[class*="ProductCard"] [class*="brand"]',
          '[class*="product-card"] [class*="brand"]',
          '[data-testid*="product"] [data-testid*="brand"]',
          '[class*="Brand"]',
          '[class*="brand-name"]'
        ];
        for (const selector of selectors) {
          const elements = document.querySelectorAll(selector);
          elements.forEach(el => {
            const text = el.textContent?.trim();
            if (text && text.length > 0 && text.length < 100 && !text.includes('$')) {
              brandSet.add(text);
            }
          });
        }
        // Also look for any element with "brand" in the class containing text
        const allElements = document.querySelectorAll('[class*="brand" i], [class*="Brand"]');
        allElements.forEach(el => {
          const text = el.textContent?.trim();
          if (text && text.length > 1 && text.length < 100 && !text.includes('$') && !text.includes('Add to cart')) {
            brandSet.add(text);
          }
        });
        return Array.from(brandSet);
      });
    }
    // Sort once, up front, so both the printed list and the returned list are ordered.
    brands.sort();

    console.log('\n📦 BRANDS FOUND:');
    console.log('─'.repeat(80));
    if (brands.length === 0) {
      console.log('No brands found.');
      // Debug: show what's on the page
      const pageContent = await page.evaluate(() => {
        return {
          hasProducts: document.querySelectorAll('[class*="product" i], [class*="Product"]').length,
          bodyPreview: document.body.innerText?.substring(0, 500)
        };
      });
      console.log('\nDebug Info:');
      console.log(`Product elements found: ${pageContent.hasProducts}`);
      console.log(`\nPage preview:\n${pageContent.bodyPreview}\n`);
    } else {
      brands.forEach((brand, i) => {
        console.log(`${i + 1}. ${brand}`);
      });
      console.log('─'.repeat(80));
      console.log(`Total: ${brands.length} unique brands\n`);
    }
    return { isDutchie: true, brands, dutchieInfo };
  } catch (error: unknown) {
    // Anything thrown is not guaranteed to be an Error instance.
    const message = error instanceof Error ? error.message : String(error);
    console.error('❌ Error:', message);
    return { isDutchie: false, brands: [], error: message };
  } finally {
    // Always release the browser, including on the early "not Dutchie" return.
    if (browser) {
      await browser.close();
    }
  }
}
/**
 * Manual test driver: runs the brand scraper against one hard-coded store URL,
 * prints follow-up steps when a Dutchie menu with brands was found, and then
 * shuts the database pool down.
 */
async function main() {
  const testUrl = 'https://curaleaf.com/stores/curaleaf-dispensary-phoenix-airport';
  console.log('Testing Dutchie brand scraper...');
  console.log('═'.repeat(80));
  try {
    const result = await scrapeDutchieBrands(testUrl);
    if (result.isDutchie && result.brands.length > 0) {
      console.log('\n✅ SUCCESS! Found Dutchie menu with brands.');
      console.log('\nNext steps:');
      console.log('1. Update all Curaleaf store URLs to use the correct Dutchie slugs');
      console.log('2. Scrape products and brands from each store');
      console.log('3. Populate the database with real product data');
    }
  } finally {
    // Close the pool even if the scrape rejects (e.g. browser.close() throwing
    // in its finally block), so open connections do not keep the process alive.
    await pool.end();
  }
}
// Run the driver; report any rejection and signal failure through the exit code
// instead of leaving a floating promise.
main().catch((error: unknown) => {
  console.error('❌ Fatal error:', error instanceof Error ? error.message : error);
  process.exitCode = 1;
});