Files
cannaiq/backend/scrape-products.ts
2025-11-28 19:45:44 -07:00

698 lines
26 KiB
TypeScript

import { firefox } from 'playwright';
import { pool } from './src/db/migrate.js';
import { getRandomProxy } from './src/utils/proxyManager.js';
/**
 * A single menu item as collected by the scraper and persisted by `saveProduct`.
 *
 * Card-level fields come from the product link on a brand page; `description`,
 * `terpenes`, `effects` and `flavors` come from the product detail page.
 */
interface Product {
  // --- Identity ---------------------------------------------------------
  /** Path segment following `/product/` in the product link (trailing slash removed). */
  slug: string;
  /** First non-empty line of text inside the product card. */
  name: string;
  /** Brand text found on the detail page, otherwise the name of the brand page being scraped. */
  brand?: string;
  /** First weight/volume-looking token in the card text (e.g. a number followed by g, oz, mg, ml). */
  variant?: string;
  /** Detail-page description, truncated to 1000 characters. */
  description?: string;

  // --- Pricing ----------------------------------------------------------
  /** The only price on the card, or the second one when exactly two prices are shown. */
  regularPrice?: number;
  /** The first price on the card when exactly two prices are shown. */
  salePrice?: number;

  // --- Cannabinoids / strain --------------------------------------------
  /** Number captured from a `THC ... %` pattern in the card text. */
  thcPercentage?: number;
  /** Number captured from a `CBD ... %` pattern in the card text. */
  cbdPercentage?: number;
  /** `'Indica'`, `'Sativa'` or `'Hybrid'` when one of those words appears in the card text. */
  strainType?: string;
  /** Known terpene names that appear anywhere in the detail page text. */
  terpenes: string[];
  /** Known effect names that appear anywhere in the detail page text. */
  effects: string[];
  /** Known flavor names that appear anywhere in the detail page text. */
  flavors: string[];

  // --- Links ------------------------------------------------------------
  /** `src` of the first `<img>` inside the product card. */
  imageUrl?: string;
  /** Product link; hrefs not starting with `http` are prefixed with `https://dutchie.com`. */
  dutchieUrl: string;

  // --- Availability -----------------------------------------------------
  /** `false` when the card text contains "out of stock" (case-insensitive). */
  inStock: boolean;
  /** Number captured from an "N left" phrase in the card text. */
  stockQuantity?: number;
  /** Short availability label derived from the card text (e.g. `'out of stock'`, `'low stock'`). */
  stockStatus?: string;
}
/** A brand entry discovered on the menu's `/brands` listing page. */
interface Brand {
  /** Path segment following `/brands/` in the brand link (trailing slash removed). */
  slug: string;
  /** Trimmed link text, or the slug when the link has no text. */
  name: string;
  /** Link target of the brand's anchor element. */
  url: string;
}
/**
 * Loads `${menuUrl}/brands` in the supplied page and collects every brand linked from it.
 *
 * The page is scrolled to the bottom repeatedly (up to 10 times, stopping early once the
 * document height stops growing) so lazily rendered brands are included. Brands are
 * de-duplicated by slug; links without a slug are dropped.
 *
 * @param menuUrl Base URL of the dispensary menu (no trailing slash expected).
 * @param context Unused; kept so existing call sites remain valid.
 * @param page    Playwright page (or compatible object) used for navigation and evaluation.
 * @returns The unique brands found, or an empty array when the page fails for a reason
 *          that rotating the proxy would not fix.
 * @throws Re-throws errors whose message contains `NS_ERROR_PROXY_CONNECTION_REFUSED` or
 *         `Timeout`, so the caller's retry loop can rotate the proxy and try again.
 */
async function scrapeBrandsList(menuUrl: string, context: any, page: any): Promise<Brand[]> {
  console.log(`\n${'='.repeat(60)}`);
  console.log(`🏷️ SCRAPING BRANDS LIST`);
  console.log(`${'='.repeat(60)}\n`);
  try {
    const brandsUrl = `${menuUrl}/brands`;
    console.log(`📄 Loading brands page: ${brandsUrl}`);
    await page.goto(brandsUrl, {
      waitUntil: 'domcontentloaded',
      timeout: 60000
    });

    // Wait for brand links to render
    console.log('⏳ Waiting for brands to render...');
    await page.waitForSelector('a[href*="/brands/"]', { timeout: 45000 });
    console.log('✅ Brands appeared!');
    await page.waitForTimeout(3000);

    // Scroll to load all brands (in case of lazy loading)
    console.log('📜 Scrolling to load all brands...');
    let previousHeight = 0;
    let scrollAttempts = 0;
    const maxScrolls = 10;
    while (scrollAttempts < maxScrolls) {
      await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
      await page.waitForTimeout(1500);
      const currentHeight = await page.evaluate(() => document.body.scrollHeight);
      // Height unchanged after a scroll => nothing more was loaded.
      if (currentHeight === previousHeight) break;
      previousHeight = currentHeight;
      scrollAttempts++;
      console.log(` Scroll ${scrollAttempts}/${maxScrolls}...`);
    }

    // Extract all brand links. This callback is executed inside the page, so it must be
    // self-contained and its console.log output goes to the page's console.
    const brands: Brand[] = await page.evaluate(() => {
      const brandLinks = Array.from(
        document.querySelectorAll<HTMLAnchorElement>('a[href*="/brands/"]')
      );
      console.log(`Found ${brandLinks.length} brand links`);
      const extracted = brandLinks.map(link => {
        const href = link.getAttribute('href') || '';
        const slug = href.split('/brands/')[1]?.replace(/\/$/, '') || '';
        // Get brand name from link text or any text content
        const name = link.textContent?.trim() || slug;
        return {
          slug,
          name,
          // The anchor's `href` property is already resolved against the document URL,
          // so relative links become absolute; fall back to the raw attribute.
          url: href.startsWith('http') ? href : (link.href || href)
        };
      });
      // Filter out duplicates and invalid entries
      const seen = new Set<string>();
      const unique = extracted.filter(b => {
        if (!b.slug || !b.name || seen.has(b.slug)) return false;
        seen.add(b.slug);
        return true;
      });
      console.log(`Extracted ${unique.length} unique brands`);
      return unique;
    });
    console.log(`✅ Found ${brands.length} brands to scrape\n`);

    // Log first few brands for verification
    brands.slice(0, 5).forEach((brand, idx) => {
      console.log(` ${idx + 1}. ${brand.name} (${brand.slug})`);
    });
    if (brands.length > 5) {
      console.log(` ... and ${brands.length - 5} more brands\n`);
    }
    return brands;
  } catch (error: any) {
    console.error('❌ Error scraping brands list:', error?.message);
    // Proxy/timeout failures are recoverable by rotating the proxy, which only the caller
    // can do. Swallowing them here would make the caller's retry loop unreachable.
    const message = String(error?.message ?? '');
    if (message.includes('NS_ERROR_PROXY_CONNECTION_REFUSED') || message.includes('Timeout')) {
      throw error;
    }
    // Return empty array if brands page fails - we can still scrape main products page
    return [];
  }
}
/**
 * Scrapes every brand page of a dispensary menu and saves each product it finds.
 *
 * Flow:
 *  1. Pick a random proxy and a random desktop/mobile user agent, then launch headless Firefox.
 *  2. Fetch the brand list via `scrapeBrandsList`, rotating the proxy when that call throws a
 *     proxy/timeout error.
 *  3. For each brand: load the brand page (rotating proxies on failure), scroll to trigger
 *     lazy loading, extract the product cards, then open each product's detail page and
 *     persist the merged record through `saveProduct`.
 *  4. Print a summary, close the browser and end the database pool.
 *
 * Exits the process with status 1 when no proxy is available at startup or when a fatal
 * error escapes the scrape loop.
 *
 * @param menuUrl      Base URL of the dispensary menu (no trailing slash expected).
 * @param dispensaryId Database id the saved products are attached to.
 */
async function scrapeProductsPage(menuUrl: string, dispensaryId: number) {
  // Pool of realistic user agents - desktop and mobile
  const desktopUserAgents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/131.0.0.0 Safari/537.36',
  ];
  const mobileUserAgents = [
    'Mozilla/5.0 (iPhone; CPU iPhone OS 17_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Mobile/15E148 Safari/604.1',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/131.0.0.0 Mobile/15E148 Safari/604.1',
    'Mozilla/5.0 (Linux; Android 14; Pixel 8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Mobile Safari/537.36',
    'Mozilla/5.0 (Linux; Android 13; SM-S918U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Mobile Safari/537.36',
  ];
  const getRandomUserAgent = (preferMobile = false) => {
    const agents = preferMobile ? mobileUserAgents : desktopUserAgents;
    return agents[Math.floor(Math.random() * agents.length)];
  };
  const getRandomViewport = (isMobile = false) => {
    if (isMobile) {
      return { width: 390, height: 844 }; // iPhone 14 Pro
    }
    return { width: 1920, height: 1080 }; // Desktop
  };

  // Errors that are treated as "this proxy is bad": rotating to another proxy may help.
  const isProxyFailure = (error: any): boolean => {
    const message = error?.message;
    return Boolean(
      message?.includes('NS_ERROR_PROXY_CONNECTION_REFUSED') ||
      message?.includes('Timeout')
    );
  };

  let proxy = await getRandomProxy();
  if (!proxy) {
    console.log('❌ No proxy available');
    process.exit(1);
  }

  // Randomly choose mobile or desktop for this session
  let isMobile = Math.random() > 0.5;
  let userAgent = getRandomUserAgent(isMobile);
  let viewport = getRandomViewport(isMobile);
  let attemptsWithCurrentUA = 0;
  const maxAttemptsPerUA = 3; // Try 3 proxies before changing UA
  console.log(`🔐 Using proxy: ${proxy.server}`);
  console.log(`🌐 Using ${isMobile ? 'MOBILE' : 'DESKTOP'} UA: ${userAgent.substring(0, 50)}...`);

  const browser = await firefox.launch({
    headless: true,
    firefoxUserPrefs: {
      'geo.enabled': true,
    }
  });

  // Builds a browser context from the current viewport/user agent and the given proxy.
  // The proxy is passed in (not read from the closure) so callers hand over a value they
  // have already checked for null.
  const openContext = (via: NonNullable<typeof proxy>) =>
    browser.newContext({
      viewport,
      userAgent,
      geolocation: { latitude: 33.4484, longitude: -112.0740 },
      permissions: ['geolocation'],
      proxy: {
        server: via.server,
        username: via.username,
        password: via.password
      }
    });

  let context = await openContext(proxy);
  let page = await context.newPage();

  // Helper function to rotate proxy (and occasionally UA)
  const rotateProxy = async () => {
    await context.close();
    attemptsWithCurrentUA++;
    // Change UA after every 3 proxy attempts, or switch mobile/desktop
    const shouldChangeUA = attemptsWithCurrentUA >= maxAttemptsPerUA;
    if (shouldChangeUA) {
      attemptsWithCurrentUA = 0;
      // 50% chance to switch between mobile/desktop
      if (Math.random() > 0.5) {
        isMobile = !isMobile;
      }
      userAgent = getRandomUserAgent(isMobile);
      viewport = getRandomViewport(isMobile);
      console.log(` 🔄 Rotating to new proxy + NEW ${isMobile ? 'MOBILE' : 'DESKTOP'} UA...`);
    } else {
      console.log(` 🔄 Rotating to new proxy (keeping same UA, ${attemptsWithCurrentUA + 1}/${maxAttemptsPerUA})...`);
    }
    proxy = await getRandomProxy();
    if (!proxy) {
      throw new Error('No proxy available');
    }
    console.log(` 🔐 New proxy: ${proxy.server}`);
    if (shouldChangeUA) {
      console.log(` 🌐 New UA: ${userAgent.substring(0, 50)}...`);
    }
    context = await openContext(proxy);
    page = await context.newPage();
  };

  let fatal = false;
  try {
    // First, scrape the brands list
    let brands: Brand[] = [];
    let brandsLoadRetries = 0;
    const maxBrandsRetries = 10;
    while (brands.length === 0 && brandsLoadRetries < maxBrandsRetries) {
      try {
        brands = await scrapeBrandsList(menuUrl, context, page);
        break;
      } catch (error: any) {
        brandsLoadRetries++;
        if (isProxyFailure(error) && brandsLoadRetries < maxBrandsRetries) {
          console.log(`⚠️ Proxy failed loading brands page (attempt ${brandsLoadRetries}/${maxBrandsRetries}), rotating...`);
          await rotateProxy();
        } else {
          console.error('⚠️ Failed to load brands, will skip brand scraping');
          break;
        }
      }
    }

    // Running totals across all brands
    let totalProductsFound = 0;
    let totalSuccessCount = 0;
    let totalErrorCount = 0;

    // Scrape each brand page
    for (let brandIndex = 0; brandIndex < brands.length; brandIndex++) {
      const brand = brands[brandIndex];
      console.log(`\n${'='.repeat(60)}`);
      console.log(`🏷️ BRAND ${brandIndex + 1}/${brands.length}: ${brand.name}`);
      console.log(`${'='.repeat(60)}\n`);

      // Retry loading brand page if proxy fails
      let loadSuccess = false;
      let loadRetries = 0;
      const maxLoadRetries = 10;
      while (!loadSuccess && loadRetries < maxLoadRetries) {
        try {
          const brandUrl = `${menuUrl}/brands/${brand.slug}`;
          console.log(`📄 Loading brand page: ${brandUrl}`);
          await page.goto(brandUrl, {
            waitUntil: 'domcontentloaded',
            timeout: 60000
          });
          // Wait for React to render products
          console.log('⏳ Waiting for products to render...');
          await page.waitForSelector('a[href*="/product/"]', { timeout: 45000 });
          console.log('✅ Products appeared!');
          await page.waitForTimeout(5000);
          loadSuccess = true;
        } catch (error: any) {
          loadRetries++;
          if (isProxyFailure(error) && loadRetries < maxLoadRetries) {
            console.log(`⚠️ Proxy failed loading brand page (attempt ${loadRetries}/${maxLoadRetries}), rotating...`);
            await rotateProxy();
          } else {
            console.error(`❌ Failed to load brand page after ${maxLoadRetries} retries, skipping brand: ${brand.name}`);
            break;
          }
        }
      }
      if (!loadSuccess) {
        console.log(`⏭️ Skipping brand: ${brand.name}`);
        continue;
      }

      // Scroll to load all products (lazy loading)
      console.log('📜 Scrolling to load all products...');
      let previousHeight = 0;
      let scrollAttempts = 0;
      const maxScrolls = 20;
      while (scrollAttempts < maxScrolls) {
        await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
        await page.waitForTimeout(2000);
        const currentHeight = await page.evaluate(() => document.body.scrollHeight);
        // Height unchanged after a scroll => nothing more was loaded.
        if (currentHeight === previousHeight) break;
        previousHeight = currentHeight;
        scrollAttempts++;
        console.log(` Scroll ${scrollAttempts}/${maxScrolls}...`);
      }

      console.log('🔍 Extracting product data...');
      // Extract all product cards from this brand page. The callback runs inside the page,
      // so it must stay self-contained; its console.log output goes to the page console.
      const products = await page.evaluate((brandName: string) => {
        const productCards = Array.from(document.querySelectorAll('a[href*="/product/"]'));
        console.log(`Found ${productCards.length} product links for brand: ${brandName}`);
        const extracted = productCards.map((card, idx) => {
          const href = card.getAttribute('href') || '';
          const slug = href.split('/product/')[1]?.replace(/\/$/, '') || '';
          // Get all text content from the card
          const allText = card.textContent || '';
          // Name - extract first line of text (usually the product name)
          // Split by newlines and take first non-empty line
          const lines = allText.split('\n').map(l => l.trim()).filter(Boolean);
          const name = lines[0] || '';
          // Debug first few products
          if (idx < 3) {
            console.log(`Product ${idx}: slug="${slug}", name="${name}", allText="${allText.substring(0, 100)}"`);
          }
          // Prices - look for dollar amounts
          const priceMatches = allText.match(/\$(\d+\.?\d*)/g) || [];
          const prices = priceMatches.map(p => parseFloat(p.replace('$', '')));
          // If 2 prices, first is sale, second is regular
          const salePrice = prices.length === 2 ? prices[0] : undefined;
          const regularPrice = prices.length === 2 ? prices[1] : (prices.length === 1 ? prices[0] : undefined);
          // THC/CBD
          const thcMatch = allText.match(/THC[:\s]*(\d+\.?\d*)\s*%/i);
          const cbdMatch = allText.match(/CBD[:\s]*(\d+\.?\d*)\s*%/i);
          const thc = thcMatch ? parseFloat(thcMatch[1]) : undefined;
          const cbd = cbdMatch ? parseFloat(cbdMatch[1]) : undefined;
          // Strain type
          let strainType: string | undefined = undefined;
          if (allText.match(/\bindica\b/i)) strainType = 'Indica';
          else if (allText.match(/\bsativa\b/i)) strainType = 'Sativa';
          else if (allText.match(/\bhybrid\b/i)) strainType = 'Hybrid';
          // Variant/weight
          const weightMatch = allText.match(/(\d+\.?\d*\s*(?:g|oz|mg|ml|gram|ounce))/i);
          const variant = weightMatch ? weightMatch[1].trim() : undefined;
          // Image
          const img = card.querySelector('img');
          const imageUrl = img?.src || undefined;
          // Stock status parsing
          const allTextLower = allText.toLowerCase();
          const inStock = !allTextLower.includes('out of stock');
          // Extract stock quantity (e.g., "5 left in stock", "Only 3 left")
          let stockQuantity: number | undefined = undefined;
          const quantityMatch = allText.match(/(\d+)\s+left/i);
          if (quantityMatch) {
            stockQuantity = parseInt(quantityMatch[1], 10);
          }
          // Extract stock status messages
          let stockStatus: string | undefined = undefined;
          if (allTextLower.includes('out of stock')) {
            stockStatus = 'out of stock';
          } else if (allTextLower.includes('low stock') || allTextLower.includes('limited')) {
            stockStatus = 'low stock';
          } else if (quantityMatch) {
            stockStatus = `${stockQuantity} left in stock`;
          } else if (allTextLower.includes('order soon')) {
            stockStatus = 'order soon';
          }
          return {
            slug,
            name,
            brand: brandName, // Set the brand from the brand page
            variant,
            regularPrice,
            salePrice,
            thcPercentage: thc,
            cbdPercentage: cbd,
            strainType,
            imageUrl,
            inStock,
            stockQuantity,
            stockStatus,
            dutchieUrl: href.startsWith('http') ? href : `https://dutchie.com${href}`
          };
        });
        const filtered = extracted.filter(p => p.slug && p.name);
        console.log(`Extracted ${extracted.length} products, ${filtered.length} with slug+name for brand: ${brandName}`);
        if (filtered.length === 0 && extracted.length > 0) {
          console.log('Sample failed product:', JSON.stringify(extracted[0]));
        }
        return filtered;
      }, brand.name);
      console.log(`✅ Extracted ${products.length} products from brand: ${brand.name}`);
      // Feed the end-of-run summary; previously nothing was ever counted as "found".
      totalProductsFound += products.length;

      // Now fetch detailed data for each product in this brand
      for (let i = 0; i < products.length; i++) {
        const product = products[i];
        console.log(`\n[${i + 1}/${products.length}] Processing: ${product.name}`);
        let retries = 0;
        const maxRetries = 3;
        let success = false;
        while (retries < maxRetries && !success) {
          try {
            // Navigate to product detail page
            const productUrl = `${menuUrl}/product/${product.slug}`;
            await page.goto(productUrl, {
              waitUntil: 'domcontentloaded',
              timeout: 30000
            });
            await page.waitForTimeout(3000);

            // Extract detailed data (runs inside the page; must stay self-contained)
            const detailedData = await page.evaluate(() => {
              const allText = document.body.textContent || '';
              // Brand
              let brand: string | undefined = undefined;
              const brandEl = document.querySelector('[class*="brand" i], [data-testid*="brand" i]');
              if (brandEl?.textContent) {
                brand = brandEl.textContent.trim();
              }
              // Description
              let description: string | undefined = undefined;
              const descEl = document.querySelector('[class*="description" i], p[class*="product" i]');
              if (descEl?.textContent && descEl.textContent.length > 20) {
                description = descEl.textContent.trim().substring(0, 1000);
              }
              // Terpenes
              const terpeneNames = ['Myrcene', 'Limonene', 'Caryophyllene', 'Pinene', 'Linalool', 'Humulene', 'Terpinolene', 'Ocimene'];
              const terpenes = terpeneNames.filter(t => allText.match(new RegExp(`\\b${t}\\b`, 'i')));
              // Effects
              const effectNames = ['Relaxed', 'Happy', 'Euphoric', 'Uplifted', 'Creative', 'Energetic', 'Focused', 'Calm', 'Sleepy', 'Hungry', 'Aroused', 'Giggly', 'Tingly'];
              const effects = effectNames.filter(e => allText.match(new RegExp(`\\b${e}\\b`, 'i')));
              // Flavors
              const flavorNames = ['Sweet', 'Citrus', 'Earthy', 'Pine', 'Berry', 'Diesel', 'Sour', 'Floral', 'Spicy', 'Fruity', 'Herbal', 'Mint', 'Woody'];
              const flavors = flavorNames.filter(f => allText.match(new RegExp(`\\b${f}\\b`, 'i')));
              // Categories (from breadcrumbs)
              const breadcrumbs = Array.from(document.querySelectorAll('[class*="breadcrumb" i] a, nav a'));
              const categories = breadcrumbs
                .map(b => b.textContent?.trim())
                // Drops empty/missing link text and the "Home" crumb.
                .filter((c): c is string => Boolean(c) && c !== 'Home');
              return { brand, description, terpenes, effects, flavors, categories };
            });

            // Merge detailed data - use brand from page evaluation, or fall back to brand page name
            const fullProduct: Product = {
              ...product,
              brand: detailedData.brand || product.brand,
              description: detailedData.description,
              terpenes: detailedData.terpenes,
              effects: detailedData.effects,
              flavors: detailedData.flavors,
            };

            // Save to database
            await saveProduct(dispensaryId, fullProduct, detailedData.categories);
            totalSuccessCount++;
            console.log(` ✅ Saved (${totalSuccessCount} successful, ${totalErrorCount} errors)`);
            success = true;
          } catch (error: any) {
            retries++;
            if (isProxyFailure(error) && retries < maxRetries) {
              console.log(` ⚠️ Proxy error (attempt ${retries}/${maxRetries}), rotating...`);
              await rotateProxy();
            } else if (retries >= maxRetries) {
              totalErrorCount++;
              console.error(` ❌ Failed after ${maxRetries} retries:`, error?.message);
            } else {
              totalErrorCount++;
              console.error(` ❌ Error processing ${product.name}:`, error?.message);
              break; // Non-proxy error, don't retry
            }
          }
        }
        // Rate limiting
        if (success) {
          await page.waitForTimeout(1000 + Math.random() * 2000);
        }
      }
      console.log(`\n 📊 Brand "${brand.name}" complete: ${products.length} products processed`);
    }

    console.log(`\n${'='.repeat(60)}`);
    console.log(`✅ ALL BRANDS SCRAPING COMPLETE!`);
    console.log(` Total brands scraped: ${brands.length}`);
    console.log(` Total products found: ${totalProductsFound}`);
    console.log(` Successfully saved: ${totalSuccessCount}`);
    console.log(` Errors: ${totalErrorCount}`);
    console.log(`${'='.repeat(60)}\n`);
  } catch (error) {
    console.error('❌ Fatal error:', error);
    fatal = true;
  } finally {
    // Clean up exactly once on every path; the pool is ended even if closing the
    // browser throws.
    try {
      await browser.close();
    } finally {
      await pool.end();
    }
  }
  if (fatal) {
    process.exit(1);
  }
}
/**
 * Persists one scraped product for a dispensary inside a single transaction.
 *
 * Looks the product up by `(dispensary_id, slug)`, updating the existing row or inserting
 * a new one. It then appends a `price_history` row when either price is defined, appends a
 * `batch_history` row when THC or CBD is defined, and upserts one `product_categories`
 * row per category (lower-cased). Any failure rolls the transaction back and is re-thrown;
 * the pooled client is always released.
 *
 * @param storeId    Dispensary id the product belongs to.
 * @param product    Product data to store.
 * @param categories Category labels to associate with the product.
 */
async function saveProduct(storeId: number, product: Product, categories: string[]) {
  const client = await pool.connect();

  // Column values shared, in the same order, by the UPDATE and the INSERT statements.
  const details = [
    product.name, product.brand, product.variant, product.description,
    product.regularPrice, product.salePrice,
    product.thcPercentage, product.cbdPercentage, product.strainType,
    product.terpenes, product.effects, product.flavors,
    product.imageUrl, product.dutchieUrl, product.inStock,
    product.stockQuantity, product.stockStatus,
  ];
  const hasPrice = !(product.regularPrice === undefined && product.salePrice === undefined);
  const hasCannabinoids = !(product.thcPercentage === undefined && product.cbdPercentage === undefined);

  try {
    await client.query('BEGIN');

    // Check if product exists
    const lookup = await client.query(
      'SELECT id FROM products WHERE dispensary_id = $1 AND slug = $2',
      [storeId, product.slug]
    );

    let productId: number;
    if (lookup.rows.length === 0) {
      // INSERT new product
      const inserted = await client.query(`
INSERT INTO products (
dispensary_id, slug, name, brand, variant, description,
regular_price, sale_price,
thc_percentage, cbd_percentage, strain_type,
terpenes, effects, flavors,
image_url, dutchie_url, in_stock, stock_quantity, stock_status
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19)
RETURNING id
`, [storeId, product.slug, ...details]);
      productId = inserted.rows[0].id;
    } else {
      // UPDATE existing product
      productId = lookup.rows[0].id;
      await client.query(`
UPDATE products
SET name = $1, brand = $2, variant = $3, description = $4,
regular_price = $5, sale_price = $6,
thc_percentage = $7, cbd_percentage = $8, strain_type = $9,
terpenes = $10, effects = $11, flavors = $12,
image_url = $13, dutchie_url = $14, in_stock = $15,
stock_quantity = $16, stock_status = $17,
last_seen_at = CURRENT_TIMESTAMP, updated_at = CURRENT_TIMESTAMP
WHERE id = $18
`, [...details, productId]);
    }

    // Record price history (if we have price data)
    if (hasPrice) {
      await client.query(`
INSERT INTO price_history (product_id, regular_price, sale_price)
VALUES ($1, $2, $3)
`, [productId, product.regularPrice, product.salePrice]);
    }

    // Record batch history (if we have cannabinoid data)
    if (hasCannabinoids) {
      await client.query(`
INSERT INTO batch_history (product_id, thc_percentage, cbd_percentage, terpenes, strain_type)
VALUES ($1, $2, $3, $4, $5)
`, [productId, product.thcPercentage, product.cbdPercentage, product.terpenes, product.strainType]);
    }

    // Update product_categories, one statement at a time in the given order
    for (const label of categories) {
      const categorySlug = label.toLowerCase();
      await client.query(`
INSERT INTO product_categories (product_id, category_slug, last_seen_at)
VALUES ($1, $2, CURRENT_TIMESTAMP)
ON CONFLICT (product_id, category_slug)
DO UPDATE SET last_seen_at = CURRENT_TIMESTAMP
`, [productId, categorySlug]);
    }

    await client.query('COMMIT');
  } catch (error) {
    await client.query('ROLLBACK');
    throw error;
  } finally {
    client.release();
  }
}
// Main execution
/**
 * CLI entry point: resolves which dispensary to scrape and starts the scrape.
 *
 * The dispensary id is taken from the first command-line argument, defaulting to 112
 * (Deeply Rooted) when none is given. The process exits with status 1 when the argument
 * is not a number, the dispensary does not exist, or it has no `menu_url` configured.
 */
async function main() {
  const defaultDispensaryId = 112; // Deeply Rooted

  // Get dispensary ID from command line argument or fall back to the default
  const dispensaryIdArg = process.argv[2];
  const usingDefault = !dispensaryIdArg;
  const dispensaryId = usingDefault ? defaultDispensaryId : parseInt(dispensaryIdArg, 10);

  // Reject non-numeric input up front instead of sending NaN to the database.
  if (Number.isNaN(dispensaryId)) {
    console.error(`❌ Invalid dispensary ID: ${dispensaryIdArg}`);
    process.exit(1);
  }

  // One lookup provides existence, display name and menu URL.
  const dispensaryResult = await pool.query("SELECT id, name, menu_url FROM dispensaries WHERE id = $1", [dispensaryId]);
  const dispensary = dispensaryResult.rows[0];
  if (!dispensary) {
    console.error(
      usingDefault
        ? '❌ Deeply Rooted dispensary not found'
        : `❌ Dispensary ID ${dispensaryId} not found`
    );
    process.exit(1);
  }

  // Checked for the default dispensary too; otherwise a null URL would reach the scraper.
  if (!dispensary.menu_url) {
    console.error(`❌ Dispensary ${dispensary.name} has no menu_url configured`);
    process.exit(1);
  }
  console.log(`✅ Found dispensary ID: ${dispensaryId} - ${dispensary.name}`);

  await scrapeProductsPage(dispensary.menu_url, dispensaryId);
}
// Log any error that escapes main() and make the process report failure via its exit status.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});