Files
cannaiq/backend/scrape-parallel-brands.ts
2025-11-28 19:45:44 -07:00

348 lines
11 KiB
TypeScript

import { firefox } from 'playwright';
import { pool } from './src/db/migrate.js';
import { getRandomProxy } from './src/utils/proxyManager.js';
// Positional CLI arguments (after the script path):
//   <dispensaryId> <startBrandIndex> <endBrandIndex> <workerNum>
// A missing or empty argument falls back to the default shown below.
const [, , dispensaryArg, startArg, endArg, workerArg] = process.argv;
const dispensaryId = Number.parseInt(dispensaryArg || '112', 10);
const startBrandIndex = Number.parseInt(startArg || '0', 10);
// Inclusive upper bound: main() slices the brand list with `endBrandIndex + 1`.
const endBrandIndex = Number.parseInt(endArg || '89', 10);
// Kept as a string; it is interpolated into the `[W…]` log prefix.
const workerNum = workerArg || '1';
/**
 * A product row as scraped from a brand page and written to the `products`
 * table by `saveProduct`.
 */
interface Product {
  // --- Identity -----------------------------------------------------------
  /** Part of the product link after `/product/`; with the dispensary id it is the lookup key for upserts. */
  slug: string;
  /** Display name; the brand-page scraper fills it with the link's trimmed text (max 200 chars). */
  name: string;
  /** Raw `href` of the product link. */
  dutchieUrl: string;
  /** Name of the brand whose page the product was found on. */
  brand?: string;

  // --- Details not populated by the brand-page scraper in this file --------
  variant?: string;
  description?: string;
  regularPrice?: number;
  salePrice?: number;
  thcPercentage?: number;
  cbdPercentage?: number;
  strainType?: string;
  imageUrl?: string;

  // --- Tag lists (the brand-page scraper always sets these to []) ----------
  terpenes: string[];
  effects: string[];
  flavors: string[];

  // --- Availability ---------------------------------------------------------
  /** False when the card text contains "out of stock". */
  inStock: boolean;
  /** Number captured from text such as "3 left", when present. */
  stockQuantity?: number;
  /** Short status label derived from the card text (e.g. "out of stock", "low stock"). */
  stockStatus?: string;
}
/** A brand entry collected from the dispensary's `/brands` listing page. */
interface Brand {
  /** Part of the brand link after `/brands/` (trailing slash removed); used to build the brand page URL. */
  slug: string;
  /** Trimmed link text, or the slug when the link has no text. */
  name: string;
  /** Link target of the brand anchor found on the listing page. */
  url: string;
}
/**
 * Loads `${menuUrl}/brands`, scrolls until the page stops growing (at most
 * 10 scrolls) and collects every link whose href contains `/brands/`.
 *
 * @param menuUrl Base menu URL; `/brands` is appended to it.
 * @param context Not used by this function; kept so existing call sites still match.
 * @param page Playwright-style page; `goto`, `waitForSelector`, `waitForTimeout` and `evaluate` are called on it.
 * @returns Brands de-duplicated by slug, with `url` resolved to an absolute
 *          URL. Any failure is logged and yields an empty array.
 */
async function scrapeBrandsList(menuUrl: string, context: any, page: any): Promise<Brand[]> {
  try {
    const brandsUrl = `${menuUrl}/brands`;
    console.log(`[W${workerNum}] 📄 Loading brands page: ${brandsUrl}`);
    await page.goto(brandsUrl, {
      waitUntil: 'domcontentloaded',
      timeout: 60000
    });

    console.log(`[W${workerNum}] ⏳ Waiting for brands to render...`);
    await page.waitForSelector('a[href*="/brands/"]', { timeout: 45000 });
    console.log(`[W${workerNum}] ✅ Brands appeared!`);
    await page.waitForTimeout(3000);

    // Scroll to the bottom repeatedly; stop as soon as the document height
    // no longer changes between two scrolls.
    console.log(`[W${workerNum}] 📜 Scrolling to load all brands...`);
    const maxScrolls = 10;
    let previousHeight = 0;
    for (let scrollAttempts = 0; scrollAttempts < maxScrolls; scrollAttempts++) {
      await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
      await page.waitForTimeout(1500);
      const currentHeight = await page.evaluate(() => document.body.scrollHeight);
      if (currentHeight === previousHeight) break;
      previousHeight = currentHeight;
    }

    // The callback runs inside the page, so it must not reference anything
    // from the Node-side scope.
    const brands: Brand[] = await page.evaluate(() => {
      const seen = new Set<string>();
      const unique: Array<{ slug: string; name: string; url: string }> = [];

      for (const link of Array.from(document.querySelectorAll('a[href*="/brands/"]'))) {
        const href = link.getAttribute('href') || '';
        const slug = href.split('/brands/')[1]?.replace(/\/$/, '') || '';
        const name = link.textContent?.trim() || slug;

        // Skip invalid entries and slugs that were already collected.
        if (!slug || !name || seen.has(slug)) continue;
        seen.add(slug);

        // Resolve relative hrefs against the document so `url` is always
        // absolute; fall back to the raw href if it cannot be parsed.
        let url = href;
        try {
          url = new URL(href, document.baseURI).href;
        } catch {
          url = href;
        }

        unique.push({ slug, name, url });
      }

      return unique;
    });

    console.log(`[W${workerNum}] ✅ Found ${brands.length} total brands`);
    return brands;
  } catch (error: unknown) {
    // Best effort: the caller treats an empty list as "no brands found".
    const message = error instanceof Error ? error.message : String(error);
    console.error(`[W${workerNum}] ❌ Error scraping brands list:`, message);
    return [];
  }
}
/**
 * Loads `${menuUrl}/brands/${brand.slug}`, scrolls down 20 viewport heights
 * to trigger lazy loading, and extracts one product per link whose href
 * contains `/product/`.
 *
 * Links that yield an empty slug are dropped: the slug is the key
 * `saveProduct` uses to find an existing row, so slug-less entries would all
 * be written to the same row.
 *
 * @param menuUrl Base menu URL of the dispensary.
 * @param brand Brand whose page is scraped; its `name` is copied onto every product.
 * @param dispensaryId Not used by this function; kept so existing call sites still match.
 * @param page Playwright-style page; `goto`, `waitForTimeout` and `evaluate` are called on it.
 * @returns The extracted products. Any failure is logged and yields an empty array.
 */
async function scrapeProductsFromBrand(
  menuUrl: string,
  brand: Brand,
  dispensaryId: number,
  page: any
): Promise<Product[]> {
  try {
    const brandUrl = `${menuUrl}/brands/${brand.slug}`;
    console.log(`[W${workerNum}] 📄 Loading brand page: ${brandUrl}`);
    await page.goto(brandUrl, {
      waitUntil: 'domcontentloaded',
      timeout: 60000
    });
    await page.waitForTimeout(3000);

    // Scroll one viewport at a time so lazily rendered cards get loaded.
    for (let i = 0; i < 20; i++) {
      await page.evaluate(() => window.scrollBy(0, window.innerHeight));
      await page.waitForTimeout(1500);
    }

    // The callback runs inside the page; the brand name is passed in as an
    // argument because the Node-side scope is not available there.
    const products: Product[] = await page.evaluate((brandName: string) => {
      const productCards = Array.from(document.querySelectorAll('a[href*="/product/"]'));

      return productCards
        .map(card => {
          const href = card.getAttribute('href') || '';
          const slug = href.split('/product/')[1]?.replace(/\/$/, '') || '';
          const allText = card.textContent || '';
          const allTextLower = allText.toLowerCase();
          const inStock = !allTextLower.includes('out of stock');

          // Stock quantity, e.g. "5 left in stock", "Only 3 left".
          const quantityMatch = allText.match(/(\d+)\s+left/i);
          let stockQuantity: number | undefined;
          if (quantityMatch) {
            stockQuantity = parseInt(quantityMatch[1], 10);
          }

          // Stock status label; the first matching rule wins.
          let stockStatus: string | undefined;
          if (allTextLower.includes('out of stock')) {
            stockStatus = 'out of stock';
          } else if (allTextLower.includes('low stock') || allTextLower.includes('limited')) {
            stockStatus = 'low stock';
          } else if (quantityMatch) {
            stockStatus = `${stockQuantity} left in stock`;
          } else if (allTextLower.includes('order soon')) {
            stockStatus = 'order soon';
          }

          return {
            slug,
            name: card.textContent?.trim().substring(0, 200) || '',
            brand: brandName,
            dutchieUrl: href,
            inStock,
            stockQuantity,
            stockStatus,
            terpenes: [] as string[],
            effects: [] as string[],
            flavors: [] as string[]
          };
        })
        // A product without a slug cannot be identified in the database.
        .filter(product => product.slug !== '');
    }, brand.name);

    console.log(`[W${workerNum}] ✅ Extracted ${products.length} products from brand: ${brand.name}`);
    return products;
  } catch (error: unknown) {
    // Best effort: a failing brand page must not abort the whole worker.
    const message = error instanceof Error ? error.message : String(error);
    console.error(`[W${workerNum}] ❌ Error scraping brand ${brand.name}:`, message);
    return [];
  }
}
/**
 * Inserts or updates one product inside a single transaction.
 *
 * The row is looked up by `(dispensary_id, slug)`. An existing row is
 * updated (including `last_seen_at` / `updated_at`); otherwise a new row is
 * inserted.
 *
 * @param product Scraped product to persist.
 * @param dispensaryId Dispensary the product belongs to.
 * @returns `true` when the transaction committed, `false` when any statement
 *          failed (the error is logged and a rollback is attempted).
 *          A failure to obtain a client from the pool is not caught here and
 *          rejects the returned promise.
 */
async function saveProduct(product: Product, dispensaryId: number): Promise<boolean> {
  const client = await pool.connect();
  try {
    await client.query('BEGIN');

    // Check if product exists
    const existing = await client.query(
      'SELECT id FROM products WHERE dispensary_id = $1 AND slug = $2',
      [dispensaryId, product.slug]
    );

    if (existing.rows.length > 0) {
      // UPDATE existing product
      const productId: number = existing.rows[0].id;
      await client.query(`
        UPDATE products
        SET name = $1, brand = $2, variant = $3, description = $4,
            regular_price = $5, sale_price = $6,
            thc_percentage = $7, cbd_percentage = $8, strain_type = $9,
            terpenes = $10, effects = $11, flavors = $12,
            image_url = $13, dutchie_url = $14, in_stock = $15,
            stock_quantity = $16, stock_status = $17,
            last_seen_at = CURRENT_TIMESTAMP, updated_at = CURRENT_TIMESTAMP
        WHERE id = $18
      `, [
        product.name, product.brand, product.variant, product.description,
        product.regularPrice, product.salePrice,
        product.thcPercentage, product.cbdPercentage, product.strainType,
        product.terpenes, product.effects, product.flavors,
        product.imageUrl, product.dutchieUrl, product.inStock,
        product.stockQuantity, product.stockStatus,
        productId
      ]);
    } else {
      // INSERT new product
      await client.query(`
        INSERT INTO products (
          dispensary_id, slug, name, brand, variant, description,
          regular_price, sale_price,
          thc_percentage, cbd_percentage, strain_type,
          terpenes, effects, flavors,
          image_url, dutchie_url, in_stock, stock_quantity, stock_status
        ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19)
        RETURNING id
      `, [
        dispensaryId, product.slug, product.name, product.brand, product.variant, product.description,
        product.regularPrice, product.salePrice,
        product.thcPercentage, product.cbdPercentage, product.strainType,
        product.terpenes, product.effects, product.flavors,
        product.imageUrl, product.dutchieUrl, product.inStock, product.stockQuantity, product.stockStatus
      ]);
    }

    await client.query('COMMIT');
    return true;
  } catch (error: unknown) {
    // ROLLBACK can fail too (for example when the connection itself is the
    // problem). Guard it so this function still resolves to `false` instead
    // of rejecting with the rollback error.
    try {
      await client.query('ROLLBACK');
    } catch (rollbackError: unknown) {
      const rollbackMessage = rollbackError instanceof Error ? rollbackError.message : String(rollbackError);
      console.error(`[W${workerNum}] ❌ Error rolling back transaction:`, rollbackMessage);
    }
    const message = error instanceof Error ? error.message : String(error);
    console.error(`[W${workerNum}] ❌ Error saving product:`, message);
    return false;
  } finally {
    client.release();
  }
}
/**
 * Worker entry point.
 *
 * Looks up the dispensary's menu URL, picks a proxy, launches a headless
 * Firefox, scrapes the brand list, and then scrapes and saves the products
 * of the brands in the inclusive range
 * `startBrandIndex..endBrandIndex`.
 *
 * The process exits with code 1 when the dispensary, a proxy, or the brand
 * list is unavailable. Once the browser is launched, it and the database
 * pool are always shut down, even when scraping or saving throws.
 */
async function main(): Promise<void> {
  console.log(`\n${'='.repeat(60)}`);
  console.log(`🚀 PARALLEL SCRAPER - WORKER ${workerNum}`);
  console.log(` Dispensary ID: ${dispensaryId}`);
  console.log(` Brand Range: ${startBrandIndex} - ${endBrandIndex}`);
  console.log(`${'='.repeat(60)}\n`);

  // Get dispensary info
  const dispensaryResult = await pool.query(
    "SELECT id, name, menu_url FROM dispensaries WHERE id = $1",
    [dispensaryId]
  );
  if (dispensaryResult.rows.length === 0) {
    console.error(`[W${workerNum}] ❌ Dispensary ID ${dispensaryId} not found`);
    process.exit(1);
  }
  const menuUrl = dispensaryResult.rows[0].menu_url;
  console.log(`[W${workerNum}] ✅ Dispensary: ${dispensaryResult.rows[0].name}`);

  // Get proxy
  const proxy = await getRandomProxy();
  if (!proxy) {
    console.log(`[W${workerNum}] ❌ No proxy available`);
    process.exit(1);
  }
  console.log(`[W${workerNum}] 🔐 Using proxy: ${proxy.server}`);

  // Launch browser
  const browser = await firefox.launch({ headless: true });
  try {
    const context = await browser.newContext({
      viewport: { width: 1920, height: 1080 },
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
      proxy: {
        server: proxy.server,
        username: proxy.username,
        password: proxy.password
      }
    });
    const page = await context.newPage();

    // Get all brands
    const allBrands = await scrapeBrandsList(menuUrl, context, page);
    if (allBrands.length === 0) {
      console.log(`[W${workerNum}] ❌ No brands found`);
      // process.exit() does not run the `finally` below, so close explicitly.
      await browser.close();
      process.exit(1);
    }

    // Filter to assigned range (endBrandIndex is inclusive)
    const brandsToScrape = allBrands.slice(startBrandIndex, endBrandIndex + 1);
    console.log(`[W${workerNum}] 📋 Processing ${brandsToScrape.length} brands (${startBrandIndex}-${endBrandIndex})`);

    let totalProducts = 0;
    let totalSaved = 0;

    // Scrape each brand
    for (let i = 0; i < brandsToScrape.length; i++) {
      const brand = brandsToScrape[i];
      const globalIndex = startBrandIndex + i;
      console.log(`\n[W${workerNum}] ${'='.repeat(60)}`);
      // Report progress against the real brand count, not a fixed total.
      console.log(`[W${workerNum}] 🏷️ BRAND ${i + 1}/${brandsToScrape.length} (Global: ${globalIndex + 1}/${allBrands.length}): ${brand.name}`);
      console.log(`[W${workerNum}] ${'='.repeat(60)}\n`);

      const products = await scrapeProductsFromBrand(menuUrl, brand, dispensaryId, page);
      totalProducts += products.length;

      // Save products, counting only the ones that were actually persisted.
      let savedForBrand = 0;
      for (const product of products) {
        const saved = await saveProduct(product, dispensaryId);
        if (saved) savedForBrand++;
      }
      totalSaved += savedForBrand;

      console.log(`[W${workerNum}] ✅ Brand "${brand.name}" complete: ${savedForBrand}/${products.length} products saved`);
      // Short pause between brand pages.
      await page.waitForTimeout(2000);
    }

    console.log(`\n[W${workerNum}] ${'='.repeat(60)}`);
    console.log(`[W${workerNum}] ✅ WORKER ${workerNum} COMPLETE`);
    console.log(`[W${workerNum}] Brands scraped: ${brandsToScrape.length}`);
    console.log(`[W${workerNum}] Total products: ${totalProducts}`);
    console.log(`[W${workerNum}] Products saved: ${totalSaved}`);
    console.log(`[W${workerNum}] ${'='.repeat(60)}\n`);
  } finally {
    // Release the browser and the pool on success and on failure alike, so a
    // thrown error does not leave a browser process or open connections behind.
    try {
      await browser.close();
    } finally {
      await pool.end();
    }
  }
}
// Run the worker. A failure is logged and reflected in the exit code so that
// whatever launched the parallel workers can tell this one did not finish.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});