Files
cannaiq/backend/archive/scrape-treez.ts
Kelly d91c55a344 feat: Add stale process monitor, users route, landing page, archive old scripts
- Add backend stale process monitoring API (/api/stale-processes)
- Add users management route
- Add frontend landing page and stale process monitor UI on /scraper-tools
- Move old development scripts to backend/archive/
- Update frontend build with new features

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-05 04:07:31 -07:00

253 lines
8.0 KiB
TypeScript

import { chromium } from 'playwright';
import { pool } from './src/db/migrate.js';
// CLI usage: <script> [workerLabel] [dispensaryId]
const [, , workerArg, dispensaryArg] = process.argv;

// Label prefixed to every log line; when no label is given, falls back to
// "T" plus the last four digits of the current epoch-millisecond timestamp.
const workerNum = workerArg || `T${String(Date.now()).slice(-4)}`;

// Numeric id of the dispensaries row to scrape; 149 is used when the argument is omitted.
const dispensaryId = parseInt(dispensaryArg || '149', 10);

// User-Agent header sent by the browser context (a desktop Chrome 120 on Windows string).
const GOOGLE_UA =
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
/**
 * One product parsed from a menu item on the scraped page.
 * Optional fields are left undefined when the corresponding text was not
 * found in the menu item.
 */
interface Product {
  // --- identity ---
  /** Brand text taken from the item's `.item-text-1` element. */
  brand: string;
  /** Product name: the part of the `.item-text-2` text before the weight token. */
  name: string;
  /** Weight token found in the product line (e.g. "3.5G"); empty when none matched. */
  variant: string;
  /** Category segment taken from the item link (`/category/<x>/`), or 'unknown'. */
  category: string;
  /** Absolute link to the product page; written to the `dutchie_url` column. */
  dutchie_url: string;

  // --- attributes extracted from the item text ---
  /** Strain label as it appears in the text (SATIVA, INDICA, HYBRID, I/S, S/I). */
  strain_type?: string;
  /** Number preceding "%" after "THC" in the item text. */
  thc_percentage?: number;
  /** Number preceding "%" after "CBD" in the item text. */
  cbd_percentage?: number;

  // --- pricing ---
  /** First dollar amount found in the item text. */
  regular_price?: number;
  /** Set to the same value as regular_price by this scraper. */
  sale_price?: number;
  /** Raw "$<amount>/<unit>" text when present. */
  price_per_unit?: string;

  // --- availability ---
  /** Always true for items that appear on the menu. */
  in_stock: boolean;
}
/** Returns a printable message for any thrown value, not only Error instances. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Converts a regex-captured numeric string to a number.
 * Returns undefined when the capture is missing or does not parse to a finite
 * number (e.g. a lone "."), while keeping a legitimate 0.
 */
function toFiniteNumber(raw: string | undefined): number | undefined {
  if (raw === undefined) return undefined;
  const value = parseFloat(raw);
  return Number.isFinite(value) ? value : undefined;
}

/**
 * Builds a Product from the strings scraped out of one `.menu-item` element.
 *
 * @param href - href of the item's `a.link` anchor (absolute or relative).
 * @param brandText - text of `.item-text-1`, used as the brand.
 * @param productLine - text of `.item-text-2`: the name followed by a weight token such as "3.5G".
 * @param itemText - full text of the item; searched for strain, THC/CBD and price.
 * @param pageUrl - URL of the page the item was found on; relative hrefs are resolved against it.
 * @returns the parsed product; fields that could not be found are left undefined.
 */
function parseMenuItem(
  href: string,
  brandText: string | null,
  productLine: string | null,
  itemText: string | null,
  pageUrl: string
): Product {
  // Weight token ("3.5G", "1 g") that separates the name from the variant.
  const weightPattern = /\d+\.?\d*\s*G/i;
  const text = itemText ?? '';

  const name = productLine?.split(weightPattern)[0]?.trim() ?? '';
  const variant = productLine?.match(weightPattern)?.[0] ?? '';

  // The first dollar amount in the text is used for both regular and sale price.
  const price = toFiniteNumber(text.match(/\$(\d+\.?\d*)/)?.[1]);

  return {
    // Fall back to the raw product line when nothing precedes the weight token.
    name: name || productLine || '',
    brand: brandText?.trim() || '',
    variant: variant.trim(),
    category: href.match(/\/category\/([^\/]+)\//)?.[1] ?? 'unknown',
    strain_type: text.match(/(SATIVA|INDICA|HYBRID|I\/S|S\/I)/i)?.[1],
    thc_percentage: toFiniteNumber(text.match(/THC\s+([\d.]+)%/i)?.[1]),
    cbd_percentage: toFiniteNumber(text.match(/CBD\s+([\d.]+)%/i)?.[1]),
    regular_price: price,
    sale_price: price,
    price_per_unit: text.match(/\$\d+\.?\d*\/\s*([^\s]+)/)?.[0],
    // Resolve relative links against the page that was actually loaded rather
    // than a fixed host, so menus served from other hosts get correct URLs.
    dutchie_url: href.startsWith('http') ? href : new URL(href, pageUrl).toString(),
    in_stock: true
  };
}

/** Derives the per-dispensary slug (lowercase, dash-separated) from brand, name and variant. */
function buildProductSlug(product: Product): string {
  return `${product.brand}-${product.name}-${product.variant}`
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-|-$/g, '');
}

/**
 * Updates the products row matching (dispensary_id, slug), or inserts one when none exists.
 * Note: `category` and `price_per_unit` are parsed but not written by these statements.
 */
async function saveProduct(product: Product, forDispensaryId: number): Promise<void> {
  const slug = buildProductSlug(product);

  const existing = await pool.query(`
    SELECT id FROM products
    WHERE dispensary_id = $1 AND slug = $2
  `, [forDispensaryId, slug]);

  if (existing.rows.length > 0) {
    await pool.query(`
      UPDATE products SET
        name = $1,
        brand = $2,
        variant = $3,
        strain_type = $4,
        thc_percentage = $5,
        cbd_percentage = $6,
        regular_price = $7,
        sale_price = $8,
        dutchie_url = $9,
        in_stock = $10,
        updated_at = NOW()
      WHERE dispensary_id = $11 AND slug = $12
    `, [
      product.name,
      product.brand,
      product.variant,
      product.strain_type,
      product.thc_percentage,
      product.cbd_percentage,
      product.regular_price,
      product.sale_price,
      product.dutchie_url,
      product.in_stock,
      forDispensaryId,
      slug
    ]);
    return;
  }

  await pool.query(`
    INSERT INTO products (
      dispensary_id, slug, name, brand, variant,
      strain_type, thc_percentage, cbd_percentage,
      regular_price, sale_price, dutchie_url, in_stock,
      created_at, updated_at
    ) VALUES (
      $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, NOW(), NOW()
    )
  `, [
    forDispensaryId,
    slug,
    product.name,
    product.brand,
    product.variant,
    product.strain_type,
    product.thc_percentage,
    product.cbd_percentage,
    product.regular_price,
    product.sale_price,
    product.dutchie_url,
    product.in_stock
  ]);
}

/**
 * Scrapes the menu page of the configured dispensary and stores its products.
 *
 * Steps: look up the dispensary's menu_url, load it in headless Chromium,
 * scroll until the number of `.menu-item` elements stops growing (or the
 * attempt limit is hit), parse every item, then update/insert each product.
 *
 * Errors raised while scraping or saving are logged and swallowed; errors from
 * the dispensary lookup or the browser launch propagate to the caller. The
 * database pool is closed in every case, and the browser whenever it was launched.
 */
async function main(): Promise<void> {
  console.log(`[${workerNum}] Starting Treez scraper for dispensary ${dispensaryId}`);

  try {
    // Get dispensary details
    const dispensaryResult = await pool.query(`
      SELECT id, name, menu_url
      FROM dispensaries
      WHERE id = $1
    `, [dispensaryId]);

    if (dispensaryResult.rows.length === 0) {
      console.error(`[${workerNum}] Dispensary ${dispensaryId} not found`);
      return;
    }

    const dispensary = dispensaryResult.rows[0];
    const menuUrl = dispensary.menu_url;
    if (!menuUrl) {
      console.error(`[${workerNum}] No menu URL for dispensary ${dispensary.name}`);
      return;
    }

    console.log(`[${workerNum}] Scraping: ${dispensary.name}`);
    console.log(`[${workerNum}] Menu URL: ${menuUrl}`);

    const browser = await chromium.launch({ headless: true });
    try {
      // Context/page creation lives inside the try so a failure here still closes the browser.
      const context = await browser.newContext({
        userAgent: GOOGLE_UA
      });
      const page = await context.newPage();

      console.log(`[${workerNum}] Loading menu page...`);
      await page.goto(menuUrl, {
        waitUntil: 'networkidle',
        timeout: 30000
      });
      await page.waitForTimeout(3000);

      // Scroll to bottom to trigger lazy loading; stop once a scroll adds no new items.
      console.log(`[${workerNum}] Scrolling to load all products...`);
      const maxScrollAttempts = 50;
      let previousCount = 0;
      let currentCount = 0;
      let scrollAttempts = 0;
      do {
        previousCount = currentCount;
        await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
        await page.waitForTimeout(2000);
        currentCount = await page.locator('.menu-item').count();
        scrollAttempts++;
        console.log(`[${workerNum}] Scroll ${scrollAttempts}: ${currentCount} items loaded`);
      } while (currentCount > previousCount && scrollAttempts < maxScrollAttempts);

      // Get all menu items
      const menuItems = await page.locator('.menu-item').all();
      console.log(`[${workerNum}] Found ${menuItems.length} menu items after scrolling`);

      const pageUrl = page.url();
      const products: Product[] = [];
      for (let i = 0; i < menuItems.length; i++) {
        try {
          const item = menuItems[i];
          const href = await item.locator('a.link').first().getAttribute('href');
          if (!href) continue; // items without a link are skipped

          const brandText = await item.locator('.item-text-1').first().textContent();
          const productLine = await item.locator('.item-text-2').first().textContent();
          const itemText = await item.textContent();

          products.push(parseMenuItem(href, brandText, productLine, itemText, pageUrl));
        } catch (error: unknown) {
          // One bad item must not abort the whole scrape.
          console.error(`[${workerNum}] Error parsing item ${i}:`, describeError(error));
        }
      }
      console.log(`[${workerNum}] Parsed ${products.length} products`);

      // Save to database
      let saved = 0;
      for (const product of products) {
        try {
          await saveProduct(product, dispensaryId);
          saved++;
        } catch (error: unknown) {
          console.error(`[${workerNum}] Error saving product:`, describeError(error));
        }
      }
      console.log(`[${workerNum}] ✅ Saved ${saved}/${products.length} products`);
    } catch (error: unknown) {
      console.error(`[${workerNum}] ❌ Error:`, describeError(error));
    } finally {
      await browser.close();
    }
  } finally {
    // Runs on every exit path, including early returns and a failing browser.close().
    await pool.end();
  }
}
// Script entry point. Any error that escapes main() is logged, and the process
// is marked as failed so the invoking shell or supervisor sees a non-zero exit
// status. exitCode is set (rather than calling process.exit) so pending output
// can still flush before the process ends.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});