/*
 * Commit message and file metadata that accompanied this file:
 *
 * - Add backend stale process monitoring API (/api/stale-processes)
 * - Add users management route
 * - Add frontend landing page and stale process monitor UI on /scraper-tools
 * - Move old development scripts to backend/archive/
 * - Update frontend build with new features
 *
 * 🤖 Generated with [Claude Code](https://claude.com/claude-code)
 * Co-Authored-By: Claude <noreply@anthropic.com>
 *
 * 253 lines · 8.0 KiB · TypeScript
 */
import { chromium } from 'playwright';
|
|
import { pool } from './src/db/migrate.js';
|
|
|
|
// Command-line arguments: argv[2] is the worker label used as the log prefix,
// argv[3] is the id of the dispensary to scrape.
const [, , workerArg, dispensaryArg] = process.argv;

// Without a label, fall back to "T" plus the last four digits of the current
// epoch-millisecond timestamp.
const workerNum = workerArg || `T${Date.now().toString().slice(-4)}`;

// Dispensary id defaults to 149 when no argument is given.
const dispensaryId = Number.parseInt(dispensaryArg || '149', 10);

// A non-numeric argument would otherwise reach the SQL query as NaN; fail fast
// with a readable message instead.
if (Number.isNaN(dispensaryId)) {
  console.error(`[${workerNum}] Invalid dispensary id: ${dispensaryArg}`);
  process.exit(1);
}

// Desktop Chrome 120 (Windows 10) user-agent string.
const GOOGLE_UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
/**
 * One menu entry as parsed from a `.menu-item` element, in the shape that is
 * written to the `products` table.
 */
interface Product {
  /** Product line text before the weight token; the whole line when that is empty. */
  name: string;
  /** Trimmed text of the item's `.item-text-1` element, or '' when absent. */
  brand: string;
  /** Weight token found in the product line (digits followed by "G"), or ''. */
  variant: string;
  /** Path segment following `/category/` in the product link, or 'unknown'. */
  category: string;
  /** SATIVA, INDICA, HYBRID, I/S or S/I as it appears in the item text. */
  strain_type?: string;
  /** Number that follows "THC" and precedes "%" in the item text. */
  thc_percentage?: number;
  /** Number that follows "CBD" and precedes "%" in the item text. */
  cbd_percentage?: number;
  /** First `$` amount in the item text. */
  regular_price?: number;
  /** Filled from the same `$` amount as `regular_price`. */
  sale_price?: number;
  /** Matched text of the form `$<amount>/<unit>`, when present. */
  price_per_unit?: string;
  /** Link to the product page (the field name matches the `products.dutchie_url` column). */
  dutchie_url: string;
  /** Set to true for every item found on the menu page. */
  in_stock: boolean;
}
/** Weight token inside a product line: digits (optionally decimal) followed by "G". */
const WEIGHT_TOKEN = /\d+\.?\d*\s*G/i;

/** Returns a printable message for whatever value a `catch` clause received. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Parses a captured numeric string.
 * Returns undefined for a missing capture or an unparsable value, and keeps a
 * legitimate 0 (a plain `|| undefined` would discard it).
 */
function toFiniteNumber(raw: string | undefined): number | undefined {
  if (raw === undefined) return undefined;
  const value = Number.parseFloat(raw);
  return Number.isFinite(value) ? value : undefined;
}

/**
 * Scrolls to the bottom of the page until the number of `.menu-item` elements
 * stops growing (or 50 attempts were made), then returns all of them.
 */
async function loadAllMenuItems(
  page: import('playwright').Page
): Promise<import('playwright').Locator[]> {
  const maxScrollAttempts = 50; // Increased to capture hundreds of products
  let previousCount = 0;
  let currentCount = 0;
  let scrollAttempts = 0;

  do {
    previousCount = currentCount;
    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    // Give lazily loaded items time to render before counting again.
    await page.waitForTimeout(2000);
    currentCount = await page.locator('.menu-item').count();
    scrollAttempts++;
    console.log(`[${workerNum}] Scroll ${scrollAttempts}: ${currentCount} items loaded`);
  } while (currentCount > previousCount && scrollAttempts < maxScrollAttempts);

  return page.locator('.menu-item').all();
}

/**
 * Extracts a Product from one `.menu-item` element.
 *
 * @param item    Locator for the menu item.
 * @param baseUrl URL that relative product links are resolved against.
 * @returns The parsed product, or null when the item has no `a.link` href.
 */
async function parseMenuItem(
  item: import('playwright').Locator,
  baseUrl: string
): Promise<Product | null> {
  const href = await item.locator('a.link').first().getAttribute('href');
  if (!href) return null;

  // Brand lives in .item-text-1, the product line (name + weight) in .item-text-2.
  const brand = await item.locator('.item-text-1').first().textContent();
  const productLine = await item.locator('.item-text-2').first().textContent();

  // Name is everything before the weight token; the token itself is the variant.
  const name = productLine?.split(WEIGHT_TOKEN)[0].trim() || '';
  const variant = productLine?.match(WEIGHT_TOKEN)?.[0] || '';

  // Strain, cannabinoids and prices are matched against the item's full text.
  const text = await item.textContent();
  const strainMatch = text?.match(/(SATIVA|INDICA|HYBRID|I\/S|S\/I)/i);
  const thcMatch = text?.match(/THC\s+([\d.]+)%/i);
  const cbdMatch = text?.match(/CBD\s+([\d.]+)%/i);
  const priceMatch = text?.match(/\$(\d+\.?\d*)/);
  const pricePerMatch = text?.match(/\$\d+\.?\d*\/\s*([^\s]+)/);
  const categoryMatch = href.match(/\/category\/([^\/]+)\//);

  const price = toFiniteNumber(priceMatch?.[1]);

  // Absolute links are kept verbatim; relative ones are resolved against the
  // page they were found on instead of a hard-coded host.
  const fullUrl = href.startsWith('http') ? href : new URL(href, baseUrl).toString();

  return {
    name: name || productLine || '',
    brand: brand?.trim() || '',
    variant: variant.trim(),
    category: categoryMatch ? categoryMatch[1] : 'unknown',
    strain_type: strainMatch ? strainMatch[1] : undefined,
    thc_percentage: toFiniteNumber(thcMatch?.[1]),
    cbd_percentage: toFiniteNumber(cbdMatch?.[1]),
    // The page exposes a single price, so it is stored as both regular and sale price.
    regular_price: price,
    sale_price: price,
    price_per_unit: pricePerMatch ? pricePerMatch[0] : undefined,
    dutchie_url: fullUrl,
    in_stock: true
  };
}

/**
 * Writes one product for the current dispensary: updates the row that has the
 * same slug, or inserts a new row when none exists.
 * Rejects with the database error when a query fails.
 */
async function saveProduct(product: Product): Promise<void> {
  // Slug: "brand-name-variant", lower-cased, non-alphanumeric runs collapsed to
  // "-", with a leading/trailing "-" removed.
  const slug = `${product.brand}-${product.name}-${product.variant}`
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-|-$/g, '');

  const existing = await pool.query(`
    SELECT id FROM products
    WHERE dispensary_id = $1 AND slug = $2
  `, [dispensaryId, slug]);

  if (existing.rows.length > 0) {
    await pool.query(`
      UPDATE products SET
        name = $1,
        brand = $2,
        variant = $3,
        strain_type = $4,
        thc_percentage = $5,
        cbd_percentage = $6,
        regular_price = $7,
        sale_price = $8,
        dutchie_url = $9,
        in_stock = $10,
        updated_at = NOW()
      WHERE dispensary_id = $11 AND slug = $12
    `, [
      product.name,
      product.brand,
      product.variant,
      product.strain_type,
      product.thc_percentage,
      product.cbd_percentage,
      product.regular_price,
      product.sale_price,
      product.dutchie_url,
      product.in_stock,
      dispensaryId,
      slug
    ]);
    return;
  }

  await pool.query(`
    INSERT INTO products (
      dispensary_id, slug, name, brand, variant,
      strain_type, thc_percentage, cbd_percentage,
      regular_price, sale_price, dutchie_url, in_stock,
      created_at, updated_at
    ) VALUES (
      $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, NOW(), NOW()
    )
  `, [
    dispensaryId,
    slug,
    product.name,
    product.brand,
    product.variant,
    product.strain_type,
    product.thc_percentage,
    product.cbd_percentage,
    product.regular_price,
    product.sale_price,
    product.dutchie_url,
    product.in_stock
  ]);
}

/**
 * Scrapes the menu page of the configured dispensary and stores the products.
 *
 * Looks up the dispensary's `menu_url`, loads it in headless Chromium, scrolls
 * until all `.menu-item` elements are present, parses each one and writes it
 * to the `products` table. Failures on a single item or product are logged and
 * skipped. The browser (if it was started) and the database pool are always
 * released, including when the lookup or the browser start-up fails.
 */
async function main() {
  console.log(`[${workerNum}] Starting Treez scraper for dispensary ${dispensaryId}`);

  let browser: import('playwright').Browser | undefined;

  try {
    // Get dispensary details
    const dispensaryResult = await pool.query(`
      SELECT id, name, menu_url
      FROM dispensaries
      WHERE id = $1
    `, [dispensaryId]);

    if (dispensaryResult.rows.length === 0) {
      console.error(`[${workerNum}] Dispensary ${dispensaryId} not found`);
      return;
    }

    const dispensary = dispensaryResult.rows[0];
    const menuUrl = dispensary.menu_url;

    if (!menuUrl) {
      console.error(`[${workerNum}] No menu URL for dispensary ${dispensary.name}`);
      return;
    }

    console.log(`[${workerNum}] Scraping: ${dispensary.name}`);
    console.log(`[${workerNum}] Menu URL: ${menuUrl}`);

    browser = await chromium.launch({ headless: true });
    const context = await browser.newContext({ userAgent: GOOGLE_UA });
    const page = await context.newPage();

    console.log(`[${workerNum}] Loading menu page...`);
    await page.goto(menuUrl, { waitUntil: 'networkidle', timeout: 30000 });
    await page.waitForTimeout(3000);

    // Scroll to bottom to trigger lazy loading
    console.log(`[${workerNum}] Scrolling to load all products...`);
    const menuItems = await loadAllMenuItems(page);
    console.log(`[${workerNum}] Found ${menuItems.length} menu items after scrolling`);

    // Relative product links are resolved against the page that was actually loaded.
    const baseUrl = page.url();
    const products: Product[] = [];

    for (let i = 0; i < menuItems.length; i++) {
      try {
        const product = await parseMenuItem(menuItems[i], baseUrl);
        if (product) products.push(product);
      } catch (error: unknown) {
        console.error(`[${workerNum}] Error parsing item ${i}:`, describeError(error));
      }
    }

    console.log(`[${workerNum}] Parsed ${products.length} products`);

    // Save to database
    let saved = 0;
    for (const product of products) {
      try {
        await saveProduct(product);
        saved++;
      } catch (error: unknown) {
        console.error(`[${workerNum}] Error saving product:`, describeError(error));
      }
    }

    console.log(`[${workerNum}] ✅ Saved ${saved}/${products.length} products`);
  } catch (error: unknown) {
    console.error(`[${workerNum}] ❌ Error:`, describeError(error));
  } finally {
    // End the pool even if closing the browser fails.
    try {
      await browser?.close();
    } finally {
      await pool.end();
    }
  }
}

main().catch(console.error);