fix(monitor): remove non-existent worker columns from job_run_logs query

The job_run_logs table tracks scheduled job orchestration, not individual
worker jobs. Worker info (worker_id, worker_hostname) belongs on
dispensary_crawl_jobs, not job_run_logs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-03 18:45:05 -07:00
parent 54f40d26bb
commit 66e07b2009
466 changed files with 84988 additions and 9226 deletions

View File

@@ -0,0 +1,236 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.scrapeCategoryPlaywright = scrapeCategoryPlaywright;
exports.testScrapeCategoryPlaywright = testScrapeCategoryPlaywright;
const age_gate_playwright_1 = require("../utils/age-gate-playwright");
const logger_1 = require("./logger");
const stealthBrowser_1 = require("../utils/stealthBrowser");
const dutchie_1 = require("../scrapers/templates/dutchie");
/**
 * Scrapes a category page using Playwright with stealth mode to extract product information.
 *
 * Steps: launch a stealth browser, create a context for `state`, try to restore
 * saved session cookies, load the page, wait out a Cloudflare challenge when one
 * is detected (saving cookies after passing it), bypass the age gate, scroll to
 * load more products, then extract them.
 *
 * The browser is closed exactly once on every exit path. A failure while closing
 * is logged and never replaces the result that was already computed.
 *
 * @param {string} categoryUrl - URL of the category page to load.
 * @param {string} categoryName - Category label; logged and handed to extraction.
 * @param {string} [state='Arizona'] - Passed to the stealth context and the age-gate
 *   bypass; its lower-cased form is also part of the cookie file name.
 * @param {*} [proxy] - Optional proxy settings forwarded to `createStealthBrowser`.
 * @returns {Promise<Array>} The extracted products, or `[]` when the Cloudflare
 *   challenge is not passed, the age gate is not bypassed, or any step throws.
 *   Rejects only when the browser itself cannot be created.
 */
async function scrapeCategoryPlaywright(categoryUrl, categoryName, state = 'Arizona', proxy) {
    logger_1.logger.info('scraper', `Scraping category: ${categoryName}`);
    logger_1.logger.info('scraper', `URL: ${categoryUrl}`);
    // Create stealth browser with optional proxy
    const browser = await (0, stealthBrowser_1.createStealthBrowser)({ proxy, headless: true });
    try {
        // Create stealth context with age gate cookies
        const context = await (0, stealthBrowser_1.createStealthContext)(browser, { state });
        // Try to load saved session cookies
        const cookiesPath = `/tmp/dutchie-session-${state.toLowerCase()}.json`;
        await (0, stealthBrowser_1.loadCookies)(context, cookiesPath);
        const page = await context.newPage();
        // Navigate to category page
        logger_1.logger.info('scraper', `Loading page: ${categoryUrl}`);
        await page.goto(categoryUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });
        // Random delay to appear more human
        await (0, stealthBrowser_1.randomDelay)(1000, 2000);
        // Check for Cloudflare challenge
        if (await (0, stealthBrowser_1.isCloudflareChallenge)(page)) {
            logger_1.logger.info('scraper', '🛡️ Cloudflare challenge detected, waiting...');
            const passed = await (0, stealthBrowser_1.waitForCloudflareChallenge)(page, 30000);
            if (!passed) {
                logger_1.logger.error('scraper', '❌ Failed to pass Cloudflare challenge');
                return [];
            }
            // Save successful session cookies
            await (0, stealthBrowser_1.saveCookies)(context, cookiesPath);
        }
        // Wait for page to be fully loaded
        await (0, stealthBrowser_1.waitForPageLoad)(page);
        // Simulate human behavior
        await (0, stealthBrowser_1.simulateHumanBehavior)(page);
        // Check for and bypass age gate
        const bypassed = await (0, age_gate_playwright_1.bypassAgeGatePlaywright)(page, state);
        if (!bypassed) {
            logger_1.logger.error('scraper', 'Failed to bypass age gate');
            return [];
        }
        // Wait for products to load with random delay
        logger_1.logger.info('scraper', 'Waiting for products to load...');
        await (0, stealthBrowser_1.randomDelay)(2000, 4000);
        // Scroll to load all products with human-like behavior
        logger_1.logger.info('scraper', 'Scrolling to load all products...');
        await scrollToBottomHuman(page);
        // Extract products
        logger_1.logger.info('scraper', 'Extracting products from page...');
        const products = await extractProducts(page, categoryUrl, categoryName);
        logger_1.logger.info('scraper', `Found ${products.length} products`);
        return products;
    }
    catch (error) {
        logger_1.logger.error('scraper', `Error scraping category: ${error}`);
        return [];
    }
    finally {
        // Single cleanup point: runs after every return above. A close failure is
        // only logged so it cannot discard the products or reject the call.
        try {
            await browser.close();
        }
        catch (closeError) {
            logger_1.logger.warn('scraper', `Failed to close browser: ${closeError}`);
        }
    }
}
/**
 * Scrolls to the bottom of the page with human-like behavior.
 *
 * Scrolls in random 300-499px steps with a random pause after each one. It stops
 * once the viewport has reached the bottom of the document AND the document
 * height did not grow since the previous step, or after 20 steps, whichever
 * comes first. Checking only for height growth is not enough: a single short
 * step on a long page usually loads nothing new, which would end the loop long
 * before the bottom is reached.
 *
 * NOTE(review): assumes `humanScroll(page, amount)` scrolls the window by roughly
 * `amount` pixels - confirm against its implementation.
 *
 * @param page - Playwright page to scroll; only `page.evaluate` is used here.
 * @returns {Promise<void>}
 */
async function scrollToBottomHuman(page) {
    const maxAttempts = 20;
    // Read document height and the viewport's bottom edge in a single round trip.
    const measure = () => page.evaluate(() => ({
        height: document.body.scrollHeight,
        viewportBottom: window.innerHeight + window.scrollY,
    }));
    let previousHeight = 0;
    for (let attempts = 0; attempts < maxAttempts; attempts++) {
        const { height, viewportBottom } = await measure();
        const grew = height > previousHeight;
        // 1px tolerance: scroll offsets can be fractional.
        const atBottom = viewportBottom >= height - 1;
        if (!grew && atBottom) {
            break;
        }
        previousHeight = height;
        // Scroll down in chunks with randomized delays
        const scrollAmount = Math.floor(Math.random() * 200) + 300; // 300-499px
        await (0, stealthBrowser_1.humanScroll)(page, scrollAmount);
        // Random pause like a human reading
        await (0, stealthBrowser_1.randomDelay)(500, 1500);
    }
    // Final wait for any lazy-loaded content
    await (0, stealthBrowser_1.randomDelay)(1000, 2000);
}
/**
 * Parses the first numeric amount found in a price label.
 *
 * Thousands separators are removed first. Returns `undefined` when the text is
 * empty or contains no number (e.g. "Call for price"), instead of `NaN`. For a
 * range such as "$10.00 - $20.00" the first amount (10) is returned rather than
 * a value produced by gluing both amounts' digits together.
 */
function parsePriceText(priceText) {
    if (!priceText) {
        return undefined;
    }
    const match = String(priceText).replace(/,/g, '').match(/\d+(?:\.\d+)?|\.\d+/);
    if (!match) {
        return undefined;
    }
    const value = Number.parseFloat(match[0]);
    return Number.isFinite(value) ? value : undefined;
}
/**
 * Fallback 1: reads Dutchie-style product cards (name, brand, price, image, link).
 */
async function extractDutchieStyleProducts(page, categoryUrl, categoryName) {
    const products = [];
    try {
        const cards = await page.locator('[data-testid^="product-"], .product-card, [class*="ProductCard"]').all();
        if (cards.length === 0) {
            return products;
        }
        logger_1.logger.info('scraper', `Found ${cards.length} Dutchie-style products`);
        for (const card of cards) {
            try {
                const rawName = await card.locator('[data-testid="product-name"], .product-name, h3, h4').first().textContent() || '';
                const rawBrand = await card.locator('[data-testid="product-brand"], .product-brand, .brand').first().textContent().catch(() => '');
                const priceText = await card.locator('[data-testid="product-price"], .product-price, .price').first().textContent().catch(() => '');
                const imageUrl = await card.locator('img').first().getAttribute('src').catch(() => '');
                const productLink = await card.locator('a').first().getAttribute('href').catch(() => '');
                // Trim before validating so whitespace-only names are not stored as ''.
                const name = rawName.trim();
                if (!name) {
                    continue;
                }
                const brand = rawBrand ? rawBrand.trim() : '';
                products.push({
                    name,
                    brand: brand || undefined,
                    category: categoryName,
                    price: parsePriceText(priceText),
                    image_url: imageUrl || undefined,
                    // Resolve relative links against the category page; a malformed
                    // href throws and drops only this card (caught below).
                    product_url: productLink ? new URL(productLink, categoryUrl).toString() : categoryUrl,
                    in_stock: true
                });
            }
            catch (err) {
                logger_1.logger.warn('scraper', `Error extracting Dutchie product: ${err}`);
            }
        }
    }
    catch (err) {
        logger_1.logger.warn('scraper', `Dutchie product extraction failed: ${err}`);
    }
    return products;
}
/**
 * Fallback 2: reads Curaleaf-style product elements (name, price, image).
 */
async function extractCuraleafStyleProducts(page, categoryUrl, categoryName) {
    const products = [];
    try {
        const items = await page.locator('.product, [class*="Product"], [class*="item"]').all();
        if (items.length === 0) {
            return products;
        }
        logger_1.logger.info('scraper', `Found ${items.length} Curaleaf-style products`);
        for (const item of items) {
            try {
                const rawName = await item.locator('h1, h2, h3, h4, .title, .name').first().textContent() || '';
                const priceText = await item.locator('.price, [class*="price"]').first().textContent().catch(() => '');
                const imageUrl = await item.locator('img').first().getAttribute('src').catch(() => '');
                // The length filter applies to the trimmed name so padding cannot
                // make a too-short heading look like a product name.
                const name = rawName.trim();
                if (name.length > 3) {
                    products.push({
                        name,
                        category: categoryName,
                        price: parsePriceText(priceText),
                        image_url: imageUrl || undefined,
                        product_url: categoryUrl,
                        in_stock: true
                    });
                }
            }
            catch (err) {
                logger_1.logger.warn('scraper', `Error extracting Curaleaf product: ${err}`);
            }
        }
    }
    catch (err) {
        logger_1.logger.warn('scraper', `Curaleaf product extraction failed: ${err}`);
    }
    return products;
}
/**
 * Fallback 3: scans generic card/article elements and keeps those that look like products.
 */
async function extractGenericCardProducts(page, categoryUrl, categoryName) {
    const products = [];
    try {
        const cards = await page.locator('article, [role="article"], .card, [class*="card"]').all();
        logger_1.logger.info('scraper', `Trying generic selectors, found ${cards.length} elements`);
        for (const card of cards) {
            try {
                const text = await card.textContent() || '';
                const lowered = text.toLowerCase();
                // Only consider elements that look like products
                if (!(text.includes('$') || lowered.includes('price') || lowered.includes('thc'))) {
                    continue;
                }
                const rawName = await card.locator('h1, h2, h3, h4').first().textContent() || '';
                const name = rawName.trim();
                if (name.length > 3) {
                    products.push({
                        name,
                        category: categoryName,
                        product_url: categoryUrl,
                        in_stock: true
                    });
                }
            }
            catch (err) {
                // Deliberately silent: these selectors match many non-product
                // elements, so a failure here just means "skip this element".
            }
        }
    }
    catch (err) {
        logger_1.logger.warn('scraper', `Generic product extraction failed: ${err}`);
    }
    return products;
}
/**
 * Extracts product information from the page.
 *
 * Uses the template returned by `getTemplateForUrl(categoryUrl)` when there is
 * one, tagging each template product with `categoryName`. When there is no
 * template, or template extraction throws, falls back in order to Dutchie-style
 * cards, Curaleaf-style elements and generic cards; each fallback runs only if
 * the previous one produced no products.
 *
 * @param page - Playwright page that has already been loaded and scrolled.
 * @param {string} categoryUrl - Used for template lookup, as the base for relative
 *   product links, and as the product URL when no link is available.
 * @param {string} categoryName - Stored as `category` on every product.
 * @returns {Promise<Array>} Extracted products; empty when nothing matched.
 */
async function extractProducts(page, categoryUrl, categoryName) {
    // Check if we have a template for this URL
    const template = (0, dutchie_1.getTemplateForUrl)(categoryUrl);
    if (template) {
        logger_1.logger.info('scraper', `Using ${template.name} template for extraction`);
        try {
            const templateProducts = await template.extractProducts(page);
            // Add category to products from template
            const tagged = templateProducts.map((p) => ({
                ...p,
                category: categoryName,
            }));
            logger_1.logger.info('scraper', `Template extracted ${tagged.length} products`);
            return tagged;
        }
        catch (err) {
            logger_1.logger.error('scraper', `Template extraction failed: ${err}`);
            // Fall through to fallback methods
        }
    }
    let products = await extractDutchieStyleProducts(page, categoryUrl, categoryName);
    if (products.length === 0) {
        products = await extractCuraleafStyleProducts(page, categoryUrl, categoryName);
    }
    if (products.length === 0) {
        products = await extractGenericCardProducts(page, categoryUrl, categoryName);
    }
    return products;
}
/**
 * Manual smoke test: scrapes one category and prints a short preview.
 *
 * Prints a banner, runs `scrapeCategoryPlaywright(url, categoryName, state)`,
 * prints the product count, then lists the first five products (name, plus
 * brand and price when truthy, and the product URL).
 *
 * @param {string} url - Category page URL to scrape.
 * @param {string} categoryName - Category label passed to the scraper.
 * @param {string} [state='Arizona'] - State passed to the scraper.
 * @returns {Promise<Array>} All scraped products, not just the previewed ones.
 */
async function testScrapeCategoryPlaywright(url, categoryName, state = 'Arizona') {
    const print = (line) => console.log(line);
    print(`\n🎭 Testing Playwright Category Scraper\n`);
    print(`Category: ${categoryName}`);
    print(`URL: ${url}\n`);
    const products = await scrapeCategoryPlaywright(url, categoryName, state);
    print(`\n✅ Found ${products.length} products\n`);
    const preview = products.slice(0, 5);
    for (const [index, product] of preview.entries()) {
        const { name, brand, price, product_url: productUrl } = product;
        print(`${index + 1}. ${name}`);
        if (brand) {
            print(` Brand: ${brand}`);
        }
        if (price) {
            print(` Price: $${price}`);
        }
        print(` URL: ${productUrl}`);
        print('');
    }
    return products;
}