- Add backend stale process monitoring API (/api/stale-processes)
- Add users management route
- Add frontend landing page and stale process monitor UI on /scraper-tools
- Move old development scripts to backend/archive/
- Update frontend build with new features

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
348 lines
11 KiB
TypeScript
348 lines
11 KiB
TypeScript
import { firefox } from 'playwright';
|
|
import { pool } from './src/db/migrate.js';
|
|
import { getRandomProxy } from './src/utils/proxyManager.js';
|
|
|
|
// Positional CLI arguments; each falls back to its default when missing or empty:
//   argv[2]  dispensary id                      (default 112)
//   argv[3]  first brand index to process       (default 0)
//   argv[4]  last brand index, inclusive        (default 89)
//   argv[5]  worker label used as a log prefix  (default '1')
const dispensaryId = Number.parseInt(process.argv[2] || '112', 10);
const startBrandIndex = Number.parseInt(process.argv[3] || '0', 10);
const endBrandIndex = Number.parseInt(process.argv[4] || '89', 10);
const workerNum = process.argv[5] || '1';
|
|
|
|
/**
 * One product scraped from a brand page, in the shape written to the
 * `products` table by `saveProduct`.
 */
interface Product {
  // --- Identity ---
  /** Portion of the product link after `/product/`; with the dispensary id it identifies the row. */
  slug: string;
  /** Display name (the scraper uses the card's trimmed text, capped at 200 characters). */
  name: string;
  /** Name of the brand whose page the product was found on. */
  brand?: string;
  variant?: string;
  description?: string;

  // --- Pricing ---
  regularPrice?: number;
  salePrice?: number;

  // --- Potency / strain ---
  thcPercentage?: number;
  cbdPercentage?: number;
  strainType?: string;

  // --- Descriptive lists (the list scraper leaves these empty) ---
  terpenes: string[];
  effects: string[];
  flavors: string[];

  // --- Links ---
  imageUrl?: string;
  /** Raw `href` of the product link as found on the page. */
  dutchieUrl: string;

  // --- Availability ---
  /** False when the card text contains "out of stock". */
  inStock: boolean;
  /** Number parsed from "<n> left" in the card text, when present. */
  stockQuantity?: number;
  /** Short status label such as 'out of stock', 'low stock' or 'order soon'. */
  stockStatus?: string;
}
|
|
|
|
/** A brand link discovered on a dispensary menu's brands page. */
interface Brand {
  /** Portion of the link after `/brands/`, without a trailing slash. */
  slug: string;
  /** Trimmed link text; the scraper falls back to the slug when the text is empty. */
  name: string;
  /** Link target of the brand anchor. */
  url: string;
}
|
|
|
|
/**
 * Collects the brands listed on a dispensary menu's `/brands` page.
 *
 * Navigates `page` to `<menuUrl>/brands`, waits for at least one brand link,
 * scrolls to the bottom until the document height stops growing (at most 10
 * rounds), then reads every `a[href*="/brands/"]` anchor in the page.
 *
 * @param menuUrl - Base menu URL of the dispensary; trailing slashes are ignored.
 * @param context - Not used by this function; kept so existing call sites stay valid.
 * @param page - Page object providing `goto`, `waitForSelector`, `waitForTimeout`
 *   and `evaluate` (`main` passes a Playwright page).
 * @returns Brands in document order, de-duplicated by slug, with entries lacking
 *   a slug removed. Returns `[]` (after logging) if any step throws.
 */
async function scrapeBrandsList(menuUrl: string, context: any, page: any): Promise<Brand[]> {
  try {
    // Strip trailing slashes so ".../menu/" does not become ".../menu//brands".
    const brandsUrl = `${menuUrl.replace(/\/+$/, '')}/brands`;
    console.log(`[W${workerNum}] 📄 Loading brands page: ${brandsUrl}`);

    await page.goto(brandsUrl, {
      waitUntil: 'domcontentloaded',
      timeout: 60000
    });

    console.log(`[W${workerNum}] ⏳ Waiting for brands to render...`);
    await page.waitForSelector('a[href*="/brands/"]', { timeout: 45000 });
    console.log(`[W${workerNum}] ✅ Brands appeared!`);
    await page.waitForTimeout(3000);

    // Scroll to the bottom repeatedly; stop as soon as the page stops growing.
    console.log(`[W${workerNum}] 📜 Scrolling to load all brands...`);
    const maxScrolls = 10;
    let previousHeight = 0;
    for (let attempt = 0; attempt < maxScrolls; attempt++) {
      await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
      await page.waitForTimeout(1500);

      const currentHeight: number = await page.evaluate(() => document.body.scrollHeight);
      if (currentHeight === previousHeight) break;
      previousHeight = currentHeight;
    }

    // This callback runs inside the page, so it must not reference anything
    // from the enclosing Node scope.
    const brands: Brand[] = await page.evaluate(() => {
      const seen = new Set<string>();
      const unique: { slug: string; name: string; url: string }[] = [];

      for (const link of Array.from(document.querySelectorAll('a[href*="/brands/"]'))) {
        const href = link.getAttribute('href') || '';
        const slug = href.split('/brands/')[1]?.replace(/\/$/, '') || '';
        const name = link.textContent?.trim() || slug;

        // Skip invalid entries and repeated links to the same brand.
        if (!slug || !name || seen.has(slug)) continue;
        seen.add(slug);

        unique.push({
          slug,
          name,
          // The anchor's `href` property is the browser-resolved absolute URL;
          // fall back to the raw attribute for non-HTML anchors.
          url: link instanceof HTMLAnchorElement ? link.href : href
        });
      }

      return unique;
    });

    console.log(`[W${workerNum}] ✅ Found ${brands.length} total brands`);
    return brands;
  } catch (error: unknown) {
    // Best-effort: report the failure and let the caller treat it as "no brands".
    const message = error instanceof Error ? error.message : String(error);
    console.error(`[W${workerNum}] ❌ Error scraping brands list:`, message);
    return [];
  }
}
|
|
|
|
/**
 * Scrapes the product cards shown on one brand's page.
 *
 * Navigates `page` to `<menuUrl>/brands/<brand.slug>`, scrolls down one
 * viewport at a time (20 steps) so more cards can load, then reads every
 * `a[href*="/product/"]` anchor and derives stock information from its text.
 *
 * @param menuUrl - Base menu URL of the dispensary; trailing slashes are ignored.
 * @param brand - Brand to scrape; its `slug` builds the URL and its `name` is
 *   stamped on every returned product.
 * @param dispensaryId - Not used by this function; kept so existing call sites stay valid.
 * @param page - Page object providing `goto`, `waitForTimeout` and `evaluate`
 *   (`main` passes a Playwright page).
 * @returns One product per matching link that has a non-empty slug, in document
 *   order. Returns `[]` (after logging) if any step throws.
 */
async function scrapeProductsFromBrand(
  menuUrl: string,
  brand: Brand,
  dispensaryId: number,
  page: any
): Promise<Product[]> {
  try {
    // Strip trailing slashes so ".../menu/" does not become ".../menu//brands/...".
    const brandUrl = `${menuUrl.replace(/\/+$/, '')}/brands/${brand.slug}`;
    console.log(`[W${workerNum}] 📄 Loading brand page: ${brandUrl}`);

    await page.goto(brandUrl, {
      waitUntil: 'domcontentloaded',
      timeout: 60000
    });

    await page.waitForTimeout(3000);

    // Scroll to load all products
    for (let i = 0; i < 20; i++) {
      await page.evaluate(() => window.scrollBy(0, window.innerHeight));
      await page.waitForTimeout(1500);
    }

    // This callback runs inside the page, so it must not reference anything
    // from the enclosing Node scope; the brand name is passed in as an argument.
    const products: Product[] = await page.evaluate((brandName: string) => {
      const productCards = Array.from(document.querySelectorAll('a[href*="/product/"]'));

      return productCards
        .map(card => {
          const href = card.getAttribute('href') || '';
          const slug = href.split('/product/')[1]?.replace(/\/$/, '') || '';
          const allText = card.textContent || '';
          const allTextLower = allText.toLowerCase();
          const inStock = !allTextLower.includes('out of stock');

          // Extract stock quantity (e.g., "5 left in stock", "Only 3 left")
          let stockQuantity: number | undefined;
          const quantityMatch = allText.match(/(\d+)\s+left/i);
          if (quantityMatch) {
            stockQuantity = parseInt(quantityMatch[1], 10);
          }

          // Status label; the first matching rule wins.
          let stockStatus: string | undefined;
          if (allTextLower.includes('out of stock')) {
            stockStatus = 'out of stock';
          } else if (allTextLower.includes('low stock') || allTextLower.includes('limited')) {
            stockStatus = 'low stock';
          } else if (quantityMatch) {
            stockStatus = `${stockQuantity} left in stock`;
          } else if (allTextLower.includes('order soon')) {
            stockStatus = 'order soon';
          }

          return {
            slug,
            name: allText.trim().substring(0, 200),
            brand: brandName,
            dutchieUrl: href,
            inStock,
            stockQuantity,
            stockStatus,
            terpenes: [] as string[],
            effects: [] as string[],
            flavors: [] as string[]
          };
        })
        // A link with nothing after "/product/" cannot be identified; saving it
        // would make all such entries share the empty slug.
        .filter(product => product.slug !== '');
    }, brand.name);

    console.log(`[W${workerNum}] ✅ Extracted ${products.length} products from brand: ${brand.name}`);
    return products;
  } catch (error: unknown) {
    // Best-effort: report the failure and let the caller move on to the next brand.
    const message = error instanceof Error ? error.message : String(error);
    console.error(`[W${workerNum}] ❌ Error scraping brand ${brand.name}:`, message);
    return [];
  }
}
|
|
|
|
/**
 * Inserts or updates one product row for a dispensary inside a transaction.
 *
 * Looks up `products` by `(dispensary_id, slug)`. An existing row is updated
 * and its `last_seen_at` / `updated_at` are set to the current timestamp;
 * otherwise a new row is inserted.
 *
 * @param product - Scraped product to persist.
 * @param dispensaryId - Id of the dispensary that owns the product.
 * @returns `true` once the transaction commits; `false` if any query fails
 *   (the error is logged and a rollback is attempted).
 * @throws Only if acquiring a client from the pool fails.
 */
async function saveProduct(product: Product, dispensaryId: number): Promise<boolean> {
  const client = await pool.connect();

  try {
    await client.query('BEGIN');

    // Check if product exists
    const existing = await client.query(
      'SELECT id FROM products WHERE dispensary_id = $1 AND slug = $2',
      [dispensaryId, product.slug]
    );

    if (existing.rows.length > 0) {
      // UPDATE existing product
      const productId: number = existing.rows[0].id;

      await client.query(`
        UPDATE products
        SET name = $1, brand = $2, variant = $3, description = $4,
            regular_price = $5, sale_price = $6,
            thc_percentage = $7, cbd_percentage = $8, strain_type = $9,
            terpenes = $10, effects = $11, flavors = $12,
            image_url = $13, dutchie_url = $14, in_stock = $15,
            stock_quantity = $16, stock_status = $17,
            last_seen_at = CURRENT_TIMESTAMP, updated_at = CURRENT_TIMESTAMP
        WHERE id = $18
      `, [
        product.name, product.brand, product.variant, product.description,
        product.regularPrice, product.salePrice,
        product.thcPercentage, product.cbdPercentage, product.strainType,
        product.terpenes, product.effects, product.flavors,
        product.imageUrl, product.dutchieUrl, product.inStock,
        product.stockQuantity, product.stockStatus,
        productId
      ]);
    } else {
      // INSERT new product (the returned id is not needed here)
      await client.query(`
        INSERT INTO products (
          dispensary_id, slug, name, brand, variant, description,
          regular_price, sale_price,
          thc_percentage, cbd_percentage, strain_type,
          terpenes, effects, flavors,
          image_url, dutchie_url, in_stock, stock_quantity, stock_status
        ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19)
        RETURNING id
      `, [
        dispensaryId, product.slug, product.name, product.brand, product.variant, product.description,
        product.regularPrice, product.salePrice,
        product.thcPercentage, product.cbdPercentage, product.strainType,
        product.terpenes, product.effects, product.flavors,
        product.imageUrl, product.dutchieUrl, product.inStock, product.stockQuantity, product.stockStatus
      ]);
    }

    await client.query('COMMIT');
    return true;
  } catch (error: unknown) {
    // A failing ROLLBACK (e.g. the connection dropped) must not replace the
    // original error or turn the documented `false` result into a rejection.
    try {
      await client.query('ROLLBACK');
    } catch (rollbackError: unknown) {
      const rollbackMessage =
        rollbackError instanceof Error ? rollbackError.message : String(rollbackError);
      console.error(`[W${workerNum}] ❌ Error rolling back transaction:`, rollbackMessage);
    }

    const message = error instanceof Error ? error.message : String(error);
    console.error(`[W${workerNum}] ❌ Error saving product:`, message);
    return false;
  } finally {
    // Always hand the client back to the pool.
    client.release();
  }
}
|
|
|
|
/**
 * Worker entry point: scrapes the assigned slice of brands for one dispensary
 * and stores every product found.
 *
 * Steps: load the dispensary's `menu_url` from the database, pick a proxy,
 * launch headless Firefox through it, list all brands, then for each brand in
 * `[startBrandIndex, endBrandIndex]` scrape its products and save them one by one.
 *
 * Exits the process with status 1 when the dispensary row is missing, no proxy
 * is available, or no brands are found. The browser and the database pool are
 * closed on both normal completion and when an error propagates.
 */
async function main(): Promise<void> {
  const rule = '='.repeat(60);

  console.log(`\n${rule}`);
  console.log(`🚀 PARALLEL SCRAPER - WORKER ${workerNum}`);
  console.log(` Dispensary ID: ${dispensaryId}`);
  console.log(` Brand Range: ${startBrandIndex} - ${endBrandIndex}`);
  console.log(`${rule}\n`);

  try {
    // Get dispensary info
    const dispensaryResult = await pool.query(
      "SELECT id, name, menu_url FROM dispensaries WHERE id = $1",
      [dispensaryId]
    );

    if (dispensaryResult.rows.length === 0) {
      console.error(`[W${workerNum}] ❌ Dispensary ID ${dispensaryId} not found`);
      process.exit(1);
    }

    const menuUrl = dispensaryResult.rows[0].menu_url;
    console.log(`[W${workerNum}] ✅ Dispensary: ${dispensaryResult.rows[0].name}`);

    // Get proxy
    const proxy = await getRandomProxy();
    if (!proxy) {
      console.log(`[W${workerNum}] ❌ No proxy available`);
      process.exit(1);
    }

    console.log(`[W${workerNum}] 🔐 Using proxy: ${proxy.server}`);

    // Launch browser
    const browser = await firefox.launch({ headless: true });

    try {
      const context = await browser.newContext({
        viewport: { width: 1920, height: 1080 },
        userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        proxy: {
          server: proxy.server,
          username: proxy.username,
          password: proxy.password
        }
      });

      const page = await context.newPage();

      // Get all brands
      const allBrands = await scrapeBrandsList(menuUrl, context, page);

      if (allBrands.length === 0) {
        console.log(`[W${workerNum}] ❌ No brands found`);
        // process.exit() does not run `finally` blocks, so close explicitly first.
        await browser.close();
        process.exit(1);
      }

      // Filter to assigned range (end index is inclusive)
      const brandsToScrape = allBrands.slice(startBrandIndex, endBrandIndex + 1);
      console.log(`[W${workerNum}] 📋 Processing ${brandsToScrape.length} brands (${startBrandIndex}-${endBrandIndex})`);

      let totalProducts = 0;
      let totalSaved = 0;

      // Scrape each brand
      for (let i = 0; i < brandsToScrape.length; i++) {
        const brand = brandsToScrape[i];
        const globalIndex = startBrandIndex + i;

        console.log(`\n[W${workerNum}] ${rule}`);
        // Report progress against the real brand count rather than a fixed total.
        console.log(`[W${workerNum}] 🏷️ BRAND ${i + 1}/${brandsToScrape.length} (Global: ${globalIndex + 1}/${allBrands.length}): ${brand.name}`);
        console.log(`[W${workerNum}] ${rule}\n`);

        const products = await scrapeProductsFromBrand(menuUrl, brand, dispensaryId, page);
        totalProducts += products.length;

        // Save products one at a time, counting only successful saves.
        let savedForBrand = 0;
        for (const product of products) {
          const saved = await saveProduct(product, dispensaryId);
          if (saved) savedForBrand++;
        }
        totalSaved += savedForBrand;

        console.log(`[W${workerNum}] ✅ Brand "${brand.name}" complete: ${savedForBrand}/${products.length} products saved`);
        // Pause between brands.
        await page.waitForTimeout(2000);
      }

      console.log(`\n[W${workerNum}] ${rule}`);
      console.log(`[W${workerNum}] ✅ WORKER ${workerNum} COMPLETE`);
      console.log(`[W${workerNum}] Brands scraped: ${brandsToScrape.length}`);
      console.log(`[W${workerNum}] Total products: ${totalProducts}`);
      console.log(`[W${workerNum}] Products saved: ${totalSaved}`);
      console.log(`[W${workerNum}] ${rule}\n`);
    } finally {
      // Close the browser even when scraping throws, so no browser process is left behind.
      await browser.close();
    }
  } finally {
    // Close the pool on every path so open connections cannot keep the process alive.
    await pool.end();
  }
}
|
|
|
|
// Run the worker. On an unhandled failure, log it and mark the process as
// failed so whatever launched this worker can detect it from the exit status.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
|