// - Add backend stale process monitoring API (/api/stale-processes)
// - Add users management route
// - Add frontend landing page and stale process monitor UI on /scraper-tools
// - Move old development scripts to backend/archive/
// - Update frontend build with new features
//
// 🤖 Generated with [Claude Code](https://claude.com/claude-code)
// Co-Authored-By: Claude <noreply@anthropic.com>
//
// 698 lines · 26 KiB · TypeScript
import { firefox } from 'playwright';
|
|
import { pool } from './src/db/migrate.js';
|
|
import { getRandomProxy } from './src/utils/proxyManager.js';
|
|
|
|
/**
 * A single menu product as assembled by the scraper and persisted by `saveProduct`.
 *
 * Listing-page fields (prices, potency, stock) are parsed from the product card text;
 * `description`, `terpenes`, `effects` and `flavors` come from the product detail page.
 */
interface Product {
  /** URL segment that follows `/product/` in the product link; used with the dispensary id to find existing rows. */
  slug: string;
  /** First non-empty text line of the product card. */
  name: string;
  /** Brand name, taken from the detail page when present, otherwise from the brand page being scraped. */
  brand?: string;
  /** Weight/size text matched on the card (e.g. a number followed by g, oz, mg or ml). */
  variant?: string;
  /** Detail-page description, truncated to 1000 characters by the scraper. */
  description?: string;
  /** Regular (non-discounted) price in dollars. */
  regularPrice?: number;
  /** Sale price in dollars; only set when the card shows exactly two prices. */
  salePrice?: number;
  /** THC percentage parsed from the card text. */
  thcPercentage?: number;
  /** CBD percentage parsed from the card text. */
  cbdPercentage?: number;
  /** 'Indica', 'Sativa' or 'Hybrid' when one of those words appears on the card. */
  strainType?: string;
  /** Known terpene names found in the detail page text. */
  terpenes: string[];
  /** Known effect names found in the detail page text. */
  effects: string[];
  /** Known flavor names found in the detail page text. */
  flavors: string[];
  /** `src` of the first image inside the product card, if any. */
  imageUrl?: string;
  /** Absolute product URL; relative links are prefixed with `https://dutchie.com`. */
  dutchieUrl: string;
  /** False when the card text contains "out of stock". */
  inStock: boolean;
  /** Number parsed from "<n> left" style messages. */
  stockQuantity?: number;
  /** Normalised stock message such as 'out of stock', 'low stock' or 'order soon'. */
  stockStatus?: string;
}
|
|
|
|
/** A brand entry discovered on the menu's `/brands` listing page. */
interface Brand {
  /** URL segment that follows `/brands/` in the brand link; used to build the brand page URL. */
  slug: string;
  /** Link text of the brand anchor, falling back to the slug when the link has no text. */
  name: string;
  /** The brand link's URL as found on the listing page. */
  url: string;
}
|
|
|
|
/**
 * Loads `${menuUrl}/brands` in the given page and collects the brand links found there.
 *
 * @param menuUrl Base menu URL of the dispensary (no trailing `/brands`).
 * @param context Browser context the page belongs to (currently unused, kept for callers).
 * @param page    Page used for navigation, scrolling and DOM extraction.
 * @returns Unique brands keyed by slug, in document order. Returns an empty array when the
 *          page fails for a reason that a different proxy would not fix.
 * @throws Re-throws proxy-connection and timeout errors so the caller can rotate proxies
 *         and retry instead of silently continuing with zero brands.
 */
async function scrapeBrandsList(menuUrl: string, context: any, page: any): Promise<Brand[]> {
  const banner = '='.repeat(60);
  console.log(`\n${banner}`);
  console.log(`🏷️ SCRAPING BRANDS LIST`);
  console.log(`${banner}\n`);

  try {
    const brandsUrl = `${menuUrl}/brands`;
    console.log(`📄 Loading brands page: ${brandsUrl}`);

    await page.goto(brandsUrl, {
      waitUntil: 'domcontentloaded',
      timeout: 60000
    });

    // Wait for brand links to render
    console.log('⏳ Waiting for brands to render...');
    await page.waitForSelector('a[href*="/brands/"]', { timeout: 45000 });
    console.log('✅ Brands appeared!');
    await page.waitForTimeout(3000);

    // Scroll until the document stops growing (lazy loading) or the scroll budget is spent.
    console.log('📜 Scrolling to load all brands...');
    const maxScrolls = 10;
    let previousHeight = 0;
    for (let scrollAttempts = 1; scrollAttempts <= maxScrolls; scrollAttempts++) {
      await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
      await page.waitForTimeout(1500);

      const currentHeight: number = await page.evaluate(() => document.body.scrollHeight);
      if (currentHeight === previousHeight) break;

      previousHeight = currentHeight;
      console.log(` Scroll ${scrollAttempts}/${maxScrolls}...`);
    }

    // Extract all brand links. This callback runs inside the browser, so it must be
    // self-contained and cannot reference anything from the Node-side scope.
    const brands: Brand[] = await page.evaluate(() => {
      const anchors = Array.from(document.querySelectorAll<HTMLAnchorElement>('a[href*="/brands/"]'));
      const seenSlugs = new Set<string>();
      const unique: { slug: string; name: string; url: string }[] = [];

      for (const link of anchors) {
        const href = link.getAttribute('href') || '';
        // Keep only the path part after /brands/ - drop query string, fragment and trailing slash.
        const afterBrands = href.split('/brands/')[1] ?? '';
        const slug = (afterBrands.split(/[?#]/)[0] ?? '').replace(/\/$/, '');
        if (!slug || seenSlugs.has(slug)) continue;
        seenSlugs.add(slug);

        // Get brand name from link text, falling back to the slug
        const name = link.textContent?.trim() || slug;
        // The anchor's `href` property is the resolved absolute URL; the attribute may be relative.
        const url = typeof link.href === 'string' && link.href ? link.href : href;

        unique.push({ slug, name, url });
      }

      return unique;
    });

    console.log(`✅ Found ${brands.length} brands to scrape\n`);

    // Log first few brands for verification
    brands.slice(0, 5).forEach((brand, idx) => {
      console.log(` ${idx + 1}. ${brand.name} (${brand.slug})`);
    });
    if (brands.length > 5) {
      console.log(` ... and ${brands.length - 5} more brands\n`);
    }

    return brands;
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    console.error('❌ Error scraping brands list:', message);

    // Proxy/timeout failures are worth retrying through another proxy, so let the caller see them.
    if (message.includes('NS_ERROR_PROXY_CONNECTION_REFUSED') || message.includes('Timeout')) {
      throw error;
    }

    // Any other failure: best effort, report no brands.
    return [];
  }
}
|
|
|
|
/**
 * Scrapes a dispensary menu brand by brand and stores every product via `saveProduct`.
 *
 * Flow: pick a proxy and a random desktop/mobile identity, launch Firefox, load the brand
 * list, then for each brand load its page, scroll to trigger lazy loading, parse the product
 * cards, visit each product's detail page for extra attributes and save the merged record.
 * Proxy-connection and timeout errors trigger a proxy rotation (and, every third rotation,
 * a new user agent) before retrying.
 *
 * Side effects: closes the browser and ends the shared database pool before returning, and
 * terminates the process with exit code 1 when no proxy is available at start or when an
 * unrecoverable error occurs.
 *
 * @param menuUrl      Base menu URL of the dispensary.
 * @param dispensaryId Database id the scraped products are stored under.
 */
async function scrapeProductsPage(menuUrl: string, dispensaryId: number) {
  // Pool of realistic user agents - desktop and mobile
  const desktopUserAgents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
    // Chromium-based Edge identifies itself with a trailing "Edg/<version>" token after the Chrome token.
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0',
  ];

  const mobileUserAgents = [
    'Mozilla/5.0 (iPhone; CPU iPhone OS 17_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Mobile/15E148 Safari/604.1',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/131.0.0.0 Mobile/15E148 Safari/604.1',
    'Mozilla/5.0 (Linux; Android 14; Pixel 8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Mobile Safari/537.36',
    'Mozilla/5.0 (Linux; Android 13; SM-S918U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Mobile Safari/537.36',
  ];

  const getRandomUserAgent = (preferMobile = false) => {
    const agents = preferMobile ? mobileUserAgents : desktopUserAgents;
    return agents[Math.floor(Math.random() * agents.length)];
  };

  const getRandomViewport = (isMobile = false) =>
    isMobile
      ? { width: 390, height: 844 } // iPhone 14 Pro
      : { width: 1920, height: 1080 }; // Desktop

  // Error helpers shared by the three retry loops below.
  const errorMessage = (error: unknown): string =>
    error instanceof Error ? error.message : String(error);
  const isProxyFailure = (error: unknown): boolean => {
    const message = errorMessage(error);
    return message.includes('NS_ERROR_PROXY_CONNECTION_REFUSED') || message.includes('Timeout');
  };

  let proxy = await getRandomProxy();
  if (!proxy) {
    console.log('❌ No proxy available');
    process.exit(1);
  }

  // Randomly choose mobile or desktop for this session
  let isMobile = Math.random() > 0.5;
  let userAgent = getRandomUserAgent(isMobile);
  let viewport = getRandomViewport(isMobile);
  let attemptsWithCurrentUA = 0;
  const maxAttemptsPerUA = 3; // Try 3 proxies before changing UA

  console.log(`🔐 Using proxy: ${proxy.server}`);
  console.log(`🌐 Using ${isMobile ? 'MOBILE' : 'DESKTOP'} UA: ${userAgent.substring(0, 50)}...`);

  const browser = await firefox.launch({
    headless: true,
    firefoxUserPrefs: {
      'geo.enabled': true,
    }
  });

  // Single place that builds a context from the *current* proxy / UA / viewport.
  const openContext = async () => {
    if (!proxy) {
      throw new Error('No proxy available');
    }
    return browser.newContext({
      viewport,
      userAgent,
      geolocation: { latitude: 33.4484, longitude: -112.0740 },
      permissions: ['geolocation'],
      proxy: {
        server: proxy.server,
        username: proxy.username,
        password: proxy.password
      }
    });
  };

  let failed = false;

  try {
    // Created inside the try so a failure here still reaches the cleanup in `finally`.
    let context = await openContext();
    let page = await context.newPage();

    // Helper function to rotate proxy (and occasionally UA)
    const rotateProxy = async () => {
      await context.close();
      attemptsWithCurrentUA++;

      // Change UA after every 3 proxy attempts, or switch mobile/desktop
      const shouldChangeUA = attemptsWithCurrentUA >= maxAttemptsPerUA;
      if (shouldChangeUA) {
        attemptsWithCurrentUA = 0;
        // 50% chance to switch between mobile/desktop
        if (Math.random() > 0.5) {
          isMobile = !isMobile;
        }
        userAgent = getRandomUserAgent(isMobile);
        viewport = getRandomViewport(isMobile);
        console.log(` 🔄 Rotating to new proxy + NEW ${isMobile ? 'MOBILE' : 'DESKTOP'} UA...`);
      } else {
        console.log(` 🔄 Rotating to new proxy (keeping same UA, ${attemptsWithCurrentUA + 1}/${maxAttemptsPerUA})...`);
      }

      proxy = await getRandomProxy();
      if (!proxy) {
        throw new Error('No proxy available');
      }

      console.log(` 🔐 New proxy: ${proxy.server}`);
      if (shouldChangeUA) {
        console.log(` 🌐 New UA: ${userAgent.substring(0, 50)}...`);
      }

      context = await openContext();
      page = await context.newPage();
    };

    // First, scrape the brands list. A retry happens only when scrapeBrandsList rejects
    // with a proxy/timeout error; any returned value (even an empty list) ends the loop.
    let brands: Brand[] = [];
    let brandsLoadRetries = 0;
    const maxBrandsRetries = 10;

    while (brands.length === 0 && brandsLoadRetries < maxBrandsRetries) {
      try {
        brands = await scrapeBrandsList(menuUrl, context, page);
        break;
      } catch (error: unknown) {
        brandsLoadRetries++;
        if (isProxyFailure(error) && brandsLoadRetries < maxBrandsRetries) {
          console.log(`⚠️ Proxy failed loading brands page (attempt ${brandsLoadRetries}/${maxBrandsRetries}), rotating...`);
          await rotateProxy();
        } else {
          console.error('⚠️ Failed to load brands, will skip brand scraping');
          break;
        }
      }
    }

    // Running totals across all brands
    let totalProductsFound = 0;
    let totalSuccessCount = 0;
    let totalErrorCount = 0;

    // Scrape each brand page
    for (const [brandIndex, brand] of brands.entries()) {
      console.log(`\n${'='.repeat(60)}`);
      console.log(`🏷️ BRAND ${brandIndex + 1}/${brands.length}: ${brand.name}`);
      console.log(`${'='.repeat(60)}\n`);

      // Retry loading brand page if proxy fails
      let loadSuccess = false;
      let loadRetries = 0;
      const maxLoadRetries = 10;

      while (!loadSuccess && loadRetries < maxLoadRetries) {
        try {
          const brandUrl = `${menuUrl}/brands/${brand.slug}`;
          console.log(`📄 Loading brand page: ${brandUrl}`);
          await page.goto(brandUrl, {
            waitUntil: 'domcontentloaded',
            timeout: 60000
          });

          // Wait for React to render products
          console.log('⏳ Waiting for products to render...');
          await page.waitForSelector('a[href*="/product/"]', { timeout: 45000 });
          console.log('✅ Products appeared!');
          await page.waitForTimeout(5000);
          loadSuccess = true;
        } catch (error: unknown) {
          loadRetries++;
          if (isProxyFailure(error) && loadRetries < maxLoadRetries) {
            console.log(`⚠️ Proxy failed loading brand page (attempt ${loadRetries}/${maxLoadRetries}), rotating...`);
            await rotateProxy();
          } else {
            // Report the real attempt count: a non-proxy error gives up on the first try.
            console.error(`❌ Failed to load brand page after ${loadRetries} attempt(s), skipping brand: ${brand.name}`);
            break;
          }
        }
      }

      if (!loadSuccess) {
        console.log(`⏭️ Skipping brand: ${brand.name}`);
        continue;
      }

      // Scroll to load all products (lazy loading); stop once the page height is stable.
      console.log('📜 Scrolling to load all products...');
      const maxScrolls = 20;
      let previousHeight = 0;
      for (let scrollAttempts = 1; scrollAttempts <= maxScrolls; scrollAttempts++) {
        await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
        await page.waitForTimeout(2000);

        const currentHeight = await page.evaluate(() => document.body.scrollHeight);
        if (currentHeight === previousHeight) break;

        previousHeight = currentHeight;
        console.log(` Scroll ${scrollAttempts}/${maxScrolls}...`);
      }

      console.log('🔍 Extracting product data...');

      // Extract all product cards from this brand page. The callback executes in the
      // browser, so everything it needs is defined inside it or passed as an argument.
      const products = await page.evaluate((brandName) => {
        const productCards = Array.from(document.querySelectorAll('a[href*="/product/"]'));

        const extracted = productCards.map((card) => {
          const href = card.getAttribute('href') || '';
          const slug = href.split('/product/')[1]?.replace(/\/$/, '') || '';

          // Get all text content from the card
          const allText = card.textContent || '';
          const allTextLower = allText.toLowerCase();

          // Name - first non-empty line of text (usually the product name)
          const lines = allText.split('\n').map(l => l.trim()).filter(Boolean);
          const name = lines[0] || '';

          // Prices - look for dollar amounts
          const priceMatches = allText.match(/\$(\d+\.?\d*)/g) || [];
          const prices = priceMatches.map(p => parseFloat(p.replace('$', '')));

          // If 2 prices, first is sale, second is regular
          const salePrice = prices.length === 2 ? prices[0] : undefined;
          const regularPrice = prices.length === 2 ? prices[1] : (prices.length === 1 ? prices[0] : undefined);

          // THC/CBD
          const thcMatch = allText.match(/THC[:\s]*(\d+\.?\d*)\s*%/i);
          const cbdMatch = allText.match(/CBD[:\s]*(\d+\.?\d*)\s*%/i);
          const thc = thcMatch ? parseFloat(thcMatch[1]) : undefined;
          const cbd = cbdMatch ? parseFloat(cbdMatch[1]) : undefined;

          // Strain type
          let strainType: string | undefined;
          if (allText.match(/\bindica\b/i)) strainType = 'Indica';
          else if (allText.match(/\bsativa\b/i)) strainType = 'Sativa';
          else if (allText.match(/\bhybrid\b/i)) strainType = 'Hybrid';

          // Variant/weight
          const weightMatch = allText.match(/(\d+\.?\d*\s*(?:g|oz|mg|ml|gram|ounce))/i);
          const variant = weightMatch ? weightMatch[1].trim() : undefined;

          // Image
          const img = card.querySelector('img');
          const imageUrl = img?.src || undefined;

          // Stock status parsing
          const inStock = !allTextLower.includes('out of stock');

          // Extract stock quantity (e.g., "5 left in stock", "Only 3 left")
          const quantityMatch = allText.match(/(\d+)\s+left/i);
          const stockQuantity = quantityMatch ? parseInt(quantityMatch[1], 10) : undefined;

          // Extract stock status messages
          let stockStatus: string | undefined;
          if (allTextLower.includes('out of stock')) {
            stockStatus = 'out of stock';
          } else if (allTextLower.includes('low stock') || allTextLower.includes('limited')) {
            stockStatus = 'low stock';
          } else if (quantityMatch) {
            stockStatus = `${stockQuantity} left in stock`;
          } else if (allTextLower.includes('order soon')) {
            stockStatus = 'order soon';
          }

          return {
            slug,
            name,
            brand: brandName, // Set the brand from the brand page
            variant,
            regularPrice,
            salePrice,
            thcPercentage: thc,
            cbdPercentage: cbd,
            strainType,
            imageUrl,
            inStock,
            stockQuantity,
            stockStatus,
            dutchieUrl: href.startsWith('http') ? href : `https://dutchie.com${href}`
          };
        });

        // Cards without a slug or a name cannot be stored, so drop them.
        return extracted.filter(p => p.slug && p.name);
      }, brand.name);

      totalProductsFound += products.length;
      console.log(`✅ Extracted ${products.length} products from brand: ${brand.name}`);

      // Now fetch detailed data for each product in this brand
      for (const [i, product] of products.entries()) {
        console.log(`\n[${i + 1}/${products.length}] Processing: ${product.name}`);

        let retries = 0;
        const maxRetries = 3;
        let success = false;

        while (retries < maxRetries && !success) {
          try {
            // Navigate to product detail page
            const productUrl = `${menuUrl}/product/${product.slug}`;
            await page.goto(productUrl, {
              waitUntil: 'domcontentloaded',
              timeout: 30000
            });

            await page.waitForTimeout(3000);

            // Extract detailed data (runs in the browser)
            const detailedData = await page.evaluate(() => {
              const allText = document.body.textContent || '';
              // Names from a fixed vocabulary that appear as whole words in the page text.
              const mentioned = (names: string[]) =>
                names.filter(n => new RegExp(`\\b${n}\\b`, 'i').test(allText));

              // Brand
              const brandEl = document.querySelector('[class*="brand" i], [data-testid*="brand" i]');
              const brand = brandEl?.textContent ? brandEl.textContent.trim() : undefined;

              // Description
              let description: string | undefined;
              const descEl = document.querySelector('[class*="description" i], p[class*="product" i]');
              if (descEl?.textContent && descEl.textContent.length > 20) {
                description = descEl.textContent.trim().substring(0, 1000);
              }

              const terpenes = mentioned(['Myrcene', 'Limonene', 'Caryophyllene', 'Pinene', 'Linalool', 'Humulene', 'Terpinolene', 'Ocimene']);
              const effects = mentioned(['Relaxed', 'Happy', 'Euphoric', 'Uplifted', 'Creative', 'Energetic', 'Focused', 'Calm', 'Sleepy', 'Hungry', 'Aroused', 'Giggly', 'Tingly']);
              const flavors = mentioned(['Sweet', 'Citrus', 'Earthy', 'Pine', 'Berry', 'Diesel', 'Sour', 'Floral', 'Spicy', 'Fruity', 'Herbal', 'Mint', 'Woody']);

              // Categories (from breadcrumbs); empty labels and the "Home" crumb are ignored
              const categories = Array.from(document.querySelectorAll('[class*="breadcrumb" i] a, nav a'))
                .map(b => b.textContent?.trim() ?? '')
                .filter(c => c.length > 0 && c !== 'Home');

              return { brand, description, terpenes, effects, flavors, categories };
            });

            // Merge detailed data - use brand from page evaluation, or fall back to brand page name
            const fullProduct: Product = {
              ...product,
              brand: detailedData.brand || product.brand,
              description: detailedData.description,
              terpenes: detailedData.terpenes,
              effects: detailedData.effects,
              flavors: detailedData.flavors,
            };

            // Save to database
            await saveProduct(dispensaryId, fullProduct, detailedData.categories);
            totalSuccessCount++;
            console.log(` ✅ Saved (${totalSuccessCount} successful, ${totalErrorCount} errors)`);
            success = true;
          } catch (error: unknown) {
            retries++;

            if (isProxyFailure(error) && retries < maxRetries) {
              console.log(` ⚠️ Proxy error (attempt ${retries}/${maxRetries}), rotating...`);
              await rotateProxy();
            } else if (retries >= maxRetries) {
              totalErrorCount++;
              console.error(` ❌ Failed after ${maxRetries} retries:`, errorMessage(error));
            } else {
              totalErrorCount++;
              console.error(` ❌ Error processing ${product.name}:`, errorMessage(error));
              break; // Non-proxy error, don't retry
            }
          }
        }

        // Rate limiting
        if (success) {
          await page.waitForTimeout(1000 + Math.random() * 2000);
        }
      }

      console.log(`\n 📊 Brand "${brand.name}" complete: ${products.length} products processed`);
    }

    console.log(`\n${'='.repeat(60)}`);
    console.log(`✅ ALL BRANDS SCRAPING COMPLETE!`);
    console.log(` Total brands scraped: ${brands.length}`);
    console.log(` Total products found: ${totalProductsFound}`);
    console.log(` Successfully saved: ${totalSuccessCount}`);
    console.log(` Errors: ${totalErrorCount}`);
    console.log(`${'='.repeat(60)}\n`);
  } catch (error) {
    console.error('❌ Fatal error:', error);
    failed = true;
  } finally {
    // Always release the browser and the DB pool; a failing close must not skip pool.end().
    await browser.close().catch((closeError: unknown) => {
      console.error('⚠️ Failed to close browser:', closeError);
    });
    await pool.end();
  }

  // Exit only after cleanup has run (process.exit inside the catch would skip `finally`).
  if (failed) {
    process.exit(1);
  }
}
|
|
|
|
/**
 * Inserts or updates one product for a dispensary and records its history rows,
 * all inside a single database transaction.
 *
 * Steps: look the product up by (dispensary_id, slug); UPDATE it if found, otherwise INSERT;
 * append a `price_history` row when any price is known; append a `batch_history` row when
 * THC or CBD is known; upsert one `product_categories` row per category.
 *
 * @param storeId    Dispensary id the product belongs to.
 * @param product    Scraped product data.
 * @param categories Category labels; each is lower-cased before being stored.
 * @throws Re-throws any query error after attempting a ROLLBACK.
 */
async function saveProduct(storeId: number, product: Product, categories: string[]) {
  const client = await pool.connect();

  // Column values shared by UPDATE ($1..$17) and INSERT ($3..$19), in column order.
  const fieldValues = [
    product.name, product.brand, product.variant, product.description,
    product.regularPrice, product.salePrice,
    product.thcPercentage, product.cbdPercentage, product.strainType,
    product.terpenes, product.effects, product.flavors,
    product.imageUrl, product.dutchieUrl, product.inStock,
    product.stockQuantity, product.stockStatus,
  ];

  try {
    await client.query('BEGIN');

    // Check if product exists
    const existing = await client.query(
      'SELECT id FROM products WHERE dispensary_id = $1 AND slug = $2',
      [storeId, product.slug]
    );

    let productId: number;

    if (existing.rows.length > 0) {
      // UPDATE existing product
      productId = existing.rows[0].id;

      await client.query(`
        UPDATE products
        SET name = $1, brand = $2, variant = $3, description = $4,
            regular_price = $5, sale_price = $6,
            thc_percentage = $7, cbd_percentage = $8, strain_type = $9,
            terpenes = $10, effects = $11, flavors = $12,
            image_url = $13, dutchie_url = $14, in_stock = $15,
            stock_quantity = $16, stock_status = $17,
            last_seen_at = CURRENT_TIMESTAMP, updated_at = CURRENT_TIMESTAMP
        WHERE id = $18
      `, [...fieldValues, productId]);
    } else {
      // INSERT new product
      const result = await client.query(`
        INSERT INTO products (
          dispensary_id, slug, name, brand, variant, description,
          regular_price, sale_price,
          thc_percentage, cbd_percentage, strain_type,
          terpenes, effects, flavors,
          image_url, dutchie_url, in_stock, stock_quantity, stock_status
        ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19)
        RETURNING id
      `, [storeId, product.slug, ...fieldValues]);

      productId = result.rows[0].id;
    }

    // Record price history (if we have price data)
    if (product.regularPrice !== undefined || product.salePrice !== undefined) {
      await client.query(`
        INSERT INTO price_history (product_id, regular_price, sale_price)
        VALUES ($1, $2, $3)
      `, [productId, product.regularPrice, product.salePrice]);
    }

    // Record batch history (if we have cannabinoid data)
    if (product.thcPercentage !== undefined || product.cbdPercentage !== undefined) {
      await client.query(`
        INSERT INTO batch_history (product_id, thc_percentage, cbd_percentage, terpenes, strain_type)
        VALUES ($1, $2, $3, $4, $5)
      `, [productId, product.thcPercentage, product.cbdPercentage, product.terpenes, product.strainType]);
    }

    // Update product_categories. Labels are de-duplicated after lower-casing so the same
    // (product, category) pair is not upserted more than once per save.
    const categorySlugs = new Set(categories.map(category => category.toLowerCase()));
    for (const categorySlug of categorySlugs) {
      await client.query(`
        INSERT INTO product_categories (product_id, category_slug, last_seen_at)
        VALUES ($1, $2, CURRENT_TIMESTAMP)
        ON CONFLICT (product_id, category_slug)
        DO UPDATE SET last_seen_at = CURRENT_TIMESTAMP
      `, [productId, categorySlug]);
    }

    await client.query('COMMIT');
  } catch (error) {
    // A failing ROLLBACK (e.g. dropped connection) must not hide the error that caused it.
    try {
      await client.query('ROLLBACK');
    } catch (rollbackError) {
      console.error('⚠️ ROLLBACK failed:', rollbackError);
    }
    throw error;
  } finally {
    client.release();
  }
}
|
|
|
|
// Main execution
/**
 * CLI entry point: resolves the dispensary to scrape and starts the scrape.
 *
 * The dispensary id is read from the first command line argument; without one the default
 * id 112 (Deeply Rooted) is used. The process exits with code 1 when the id is not a
 * number, the dispensary does not exist, or it has no `menu_url` configured.
 */
async function main() {
  const defaultDispensaryId = 112; // Deeply Rooted

  // Get dispensary ID from command line argument or fall back to the default
  const dispensaryIdArg = process.argv[2];
  const dispensaryId = dispensaryIdArg ? parseInt(dispensaryIdArg, 10) : defaultDispensaryId;

  // Reject non-numeric input before it reaches the database as NaN.
  if (!Number.isInteger(dispensaryId)) {
    console.error(`❌ Invalid dispensary ID: ${dispensaryIdArg}`);
    process.exit(1);
  }

  // One lookup provides existence check, display name and menu URL.
  const dispensaryResult = await pool.query("SELECT id, name, menu_url FROM dispensaries WHERE id = $1", [dispensaryId]);
  const dispensary = dispensaryResult.rows[0];

  if (!dispensary) {
    console.error(
      dispensaryIdArg
        ? `❌ Dispensary ID ${dispensaryId} not found`
        : '❌ Deeply Rooted dispensary not found'
    );
    process.exit(1);
  }

  // Applies to the default dispensary too: scraping cannot start without a menu URL.
  if (!dispensary.menu_url) {
    console.error(`❌ Dispensary ${dispensary.name} has no menu_url configured`);
    process.exit(1);
  }

  console.log(`✅ Found dispensary ID: ${dispensaryId} - ${dispensary.name}`);

  await scrapeProductsPage(dispensary.menu_url, dispensaryId);
}
|
|
|
|
// Run the script; an unhandled failure is logged and reported through a non-zero exit code.
main().catch((error) => {
  console.error(error);
  process.exit(1);
});
|