263 lines
8.5 KiB
TypeScript
263 lines
8.5 KiB
TypeScript
import puppeteer from 'puppeteer-extra';
|
|
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
|
import { Pool } from 'pg';
|
|
|
|
// Register the stealth plugin on the shared puppeteer-extra instance before any
// browser is launched.
puppeteer.use(StealthPlugin());

// Shared Postgres pool. The connection string is taken from DATABASE_URL when it
// is set so credentials do not have to live in source control; the previous
// hard-coded local value remains the default so existing setups keep working.
const pool = new Pool({
  connectionString:
    process.env.DATABASE_URL ?? 'postgresql://sail:password@localhost:5432/dutchie_menus'
});
|
|
|
|
/** Resolves after `ms` milliseconds (version-independent stand-in for `page.waitForTimeout`). */
function delay(ms: number): Promise<void> {
  return new Promise(resolve => setTimeout(resolve, ms));
}

/** Type guard for non-null objects so unknown JSON can be inspected safely. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null;
}

/**
 * Returns the first array found at `data.filteredProducts.products`,
 * `data.products` or `products` in a JSON payload, or `undefined` when none of
 * those locations holds an array.
 */
function findProductList(payload: unknown): unknown[] | undefined {
  if (!isRecord(payload)) {
    return undefined;
  }
  const inner = isRecord(payload.data) ? payload.data : undefined;
  const filtered = inner && isRecord(inner.filteredProducts) ? inner.filteredProducts : undefined;
  const candidates: unknown[] = [filtered?.products, inner?.products, payload.products];
  return candidates.find((candidate): candidate is unknown[] => Array.isArray(candidate));
}

/**
 * Picks a brand name for one product: the first non-empty string among
 * `brand`, `brand.name` and `brandName`, rejected when it is 100+ characters.
 * NOTE(review): `brand.name` is a tolerance for `brand` being an object; the
 * real payload shape is not confirmed here.
 */
function readBrandName(product: unknown): string | undefined {
  if (!isRecord(product)) {
    return undefined;
  }
  const { brand, brandName } = product;
  const candidates: unknown[] = [brand, isRecord(brand) ? brand.name : undefined, brandName];
  const name = candidates.find(
    (candidate): candidate is string => typeof candidate === 'string' && candidate.length > 0
  );
  return name !== undefined && name.length < 100 ? name : undefined;
}

/**
 * Loads `storeUrl` in a headless browser, decides whether the page is a Dutchie
 * menu (presence of `window.reactEnv`), and collects the brand names it can find.
 *
 * Brand names come from JSON responses intercepted while the page loads and is
 * scrolled; when those yield nothing, the DOM is scanned for brand-like elements.
 * Progress and results are logged to the console.
 *
 * @param storeUrl - URL of the store page to inspect.
 * @returns `{ isDutchie: false, brands: [] }` when `window.reactEnv` is absent;
 *   `{ isDutchie: true, brands, dutchieInfo }` on success (brands sorted);
 *   `{ isDutchie: false, brands: [], error }` when anything inside the scrape throws.
 */
async function scrapeDutchieBrands(storeUrl: string) {
  let browser: Awaited<ReturnType<typeof puppeteer.launch>> | undefined;

  try {
    console.log(`\n🔍 Checking if ${storeUrl} is a Dutchie menu...\n`);

    // Get proxy
    const proxyResult = await pool.query(`SELECT host, port, protocol FROM proxies LIMIT 1`);
    const proxy = proxyResult.rows[0];

    const browserArgs = [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-blink-features=AutomationControlled'
    ];

    if (proxy) {
      const proxyUrl = `${proxy.protocol}://${proxy.host}:${proxy.port}`;
      browserArgs.push(`--proxy-server=${proxyUrl}`);
      console.log(`Using proxy: ${proxy.host}:${proxy.port}`);
    }

    browser = await puppeteer.launch({
      headless: true,
      args: browserArgs
    });

    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');

    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, 'webdriver', {
        get: () => false,
      });
    });

    // Enable request interception to capture API calls
    await page.setRequestInterception(true);

    // Brands and product counts are accumulated across every matching response,
    // so a later response cannot wipe out what an earlier one delivered.
    const apiBrands = new Set<string>();
    let apiProductCount = 0;
    const apiCalls: string[] = [];

    page.on('request', request => {
      const url = request.url();
      // Log API calls
      if (url.includes('api') || url.includes('graphql') || url.includes('.json')) {
        apiCalls.push(url);
      }
      // continue() returns a promise; swallow its rejection so a request that
      // can no longer be continued does not surface as an unhandled rejection.
      void request.continue().catch(() => undefined);
    });

    page.on('response', async response => {
      const url = response.url();

      // Capture ANY API calls that look like they might have product data
      const looksLikeApi =
        url.includes('api.dutchie.com') ||
        url.includes('/graphql') ||
        url.includes('/api/') ||
        url.includes('products');
      if (!looksLikeApi) {
        return;
      }

      console.log(`📡 API call detected: ${url.substring(0, 100)}...`);
      try {
        const contentType = response.headers()['content-type'] || '';
        if (!contentType.includes('application/json')) {
          return;
        }

        const data: unknown = await response.json();
        if (!isRecord(data)) {
          return;
        }
        console.log(` Response keys: ${Object.keys(data).join(', ')}`);

        // Look for product data in the response
        const products = findProductList(data);
        if (products) {
          console.log(' ✅ Found product data in response!');
          apiProductCount += products.length;
          for (const product of products) {
            const brandName = readBrandName(product);
            if (brandName) {
              apiBrands.add(brandName);
            }
          }
        }
      } catch {
        // Best effort: a body that is unavailable or not valid JSON is skipped.
      }
    });

    console.log('Navigating to store page...');
    await page.goto(storeUrl, { waitUntil: 'networkidle2', timeout: 60000 });

    // Check if it's a Dutchie menu by looking for window.reactEnv
    const isDutchie = await page.evaluate(() => {
      return typeof (window as any).reactEnv !== 'undefined';
    });

    if (!isDutchie) {
      console.log('❌ This is not a Dutchie menu page');
      return { isDutchie: false, brands: [] };
    }

    console.log('✅ Detected Dutchie menu!');

    // Extract dispensary info from reactEnv
    const dutchieInfo = await page.evaluate(() => {
      const env = (window as any).reactEnv;
      return {
        dispensaryId: env?.dispensaryId,
        chainId: env?.chainId,
        retailerId: env?.retailerId
      };
    });

    console.log('\nDutchie Menu Info:');
    console.log('─'.repeat(80));
    console.log(`Chain ID: ${dutchieInfo.chainId}`);
    console.log(`Dispensary ID: ${dutchieInfo.dispensaryId}`);
    console.log(`Retailer ID: ${dutchieInfo.retailerId}`);
    console.log('─'.repeat(80));

    // Scroll page to trigger product loading
    console.log('\nScrolling page to trigger product loading...');
    await page.evaluate(() => {
      window.scrollTo(0, document.body.scrollHeight / 2);
    });
    await delay(3000);

    await page.evaluate(() => {
      window.scrollTo(0, document.body.scrollHeight);
    });

    console.log('Waiting for products to load via API...');
    await delay(10000);

    console.log(`\n📊 Total API calls detected: ${apiCalls.length}`);
    if (apiCalls.length > 0) {
      console.log('API endpoints called:');
      apiCalls.slice(0, 10).forEach((url, i) => {
        console.log(` ${i + 1}. ${url.substring(0, 120)}`);
      });
      if (apiCalls.length > 10) {
        console.log(` ... and ${apiCalls.length - 10} more\n`);
      }
    }

    // Extract brands from intercepted API data or DOM
    let brands: string[] = [];

    if (apiBrands.size > 0) {
      console.log('✅ Successfully intercepted products API data!');
      brands = Array.from(apiBrands);
      // The count is the sum over all intercepted product responses.
      console.log(`Found ${apiProductCount} products in API response`);
    } else {
      console.log(
        apiProductCount > 0
          ? '⚠️ API data had no usable brand names, trying DOM extraction...'
          : '⚠️ No API data intercepted, trying DOM extraction...'
      );

      // Fallback: Extract brands from DOM. The callback runs inside the page,
      // so it must be self-contained and cannot use helpers from this module.
      brands = await page.evaluate(() => {
        const brandSet = new Set<string>();

        // Selectors that target brand labels inside product cards
        const selectors = [
          '[class*="ProductCard"] [class*="brand"]',
          '[class*="product-card"] [class*="brand"]',
          '[data-testid*="product"] [data-testid*="brand"]',
          '[class*="Brand"]',
          '[class*="brand-name"]'
        ];

        for (const selector of selectors) {
          const elements = document.querySelectorAll(selector);
          elements.forEach(el => {
            const text = el.textContent?.trim();
            if (text && text.length > 0 && text.length < 100 && !text.includes('$')) {
              brandSet.add(text);
            }
          });
        }

        // Also look for any element with "brand" in the class containing text
        const allElements = document.querySelectorAll('[class*="brand" i], [class*="Brand"]');
        allElements.forEach(el => {
          const text = el.textContent?.trim();
          if (text && text.length > 1 && text.length < 100 && !text.includes('$') && !text.includes('Add to cart')) {
            brandSet.add(text);
          }
        });

        return Array.from(brandSet);
      });
    }

    console.log('\n📦 BRANDS FOUND:');
    console.log('─'.repeat(80));

    if (brands.length === 0) {
      console.log('No brands found.');

      // Debug: show what's on the page
      const pageContent = await page.evaluate(() => {
        return {
          hasProducts: document.querySelectorAll('[class*="product" i], [class*="Product"]').length,
          bodyPreview: document.body.innerText?.substring(0, 500)
        };
      });

      console.log('\nDebug Info:');
      console.log(`Product elements found: ${pageContent.hasProducts}`);
      console.log(`\nPage preview:\n${pageContent.bodyPreview}\n`);
    } else {
      // Sorting in place also orders the array that is returned to the caller.
      brands.sort().forEach((brand, i) => {
        console.log(`${i + 1}. ${brand}`);
      });
      console.log('─'.repeat(80));
      console.log(`Total: ${brands.length} unique brands\n`);
    }

    return { isDutchie: true, brands, dutchieInfo };

  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    console.error('❌ Error:', message);
    return { isDutchie: false, brands: [], error: message };
  } finally {
    if (browser) {
      await browser.close();
    }
  }
}
|
|
|
|
/**
 * Runs the brand scraper against one hard-coded test store URL and prints a
 * short follow-up checklist when a Dutchie menu with brands was found.
 * The Postgres pool is closed afterwards even if the scrape throws.
 */
async function main() {
  const testUrl = 'https://curaleaf.com/stores/curaleaf-dispensary-phoenix-airport';

  try {
    console.log('Testing Dutchie brand scraper...');
    console.log('═'.repeat(80));

    const result = await scrapeDutchieBrands(testUrl);

    if (result.isDutchie && result.brands.length > 0) {
      console.log('\n✅ SUCCESS! Found Dutchie menu with brands.');
      console.log('\nNext steps:');
      console.log('1. Update all Curaleaf store URLs to use the correct Dutchie slugs');
      console.log('2. Scrape products and brands from each store');
      console.log('3. Populate the database with real product data');
    }
  } finally {
    // Always release the pool; otherwise open connections keep the process alive
    // when the scrape rejects (for example if closing the browser fails).
    await pool.end();
  }
}
|
|
|
|
// Script entry point. A rejection from main() is reported and turned into a
// non-zero exit code instead of surfacing as an unhandled promise rejection.
main().catch((error: unknown) => {
  console.error('❌ Fatal error:', error instanceof Error ? error.message : error);
  process.exitCode = 1;
});
|