fix(monitor): remove non-existent worker columns from job_run_logs query
The job_run_logs table tracks scheduled job orchestration, not individual worker jobs. Worker info (worker_id, worker_hostname) belongs on dispensary_crawl_jobs, not job_run_logs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
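The corrected query itself is not included in this diff; purely as an illustration of the change described above, it amounts to something like the sketch below. The table name and the two removed columns come from the message; everything else (the node-postgres pool, the helper name, and the remaining column names) is assumed.

    // Hypothetical sketch only, not the code changed in this commit.
    // job_run_logs holds scheduled-job orchestration rows, so the monitor query
    // selects orchestration-level fields and no longer references worker_id or
    // worker_hostname (those columns live on dispensary_crawl_jobs instead).
    async function getRecentJobRuns(pool) {
        // Columns other than the two removed ones are placeholders.
        const { rows } = await pool.query(
            `SELECT id, job_name, status, started_at, finished_at
               FROM job_run_logs
              ORDER BY started_at DESC
              LIMIT 50`
        );
        return rows;
    }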
backend/dist/services/scraper-playwright.js (vendored, normal file, 236 lines)
@@ -0,0 +1,236 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.scrapeCategoryPlaywright = scrapeCategoryPlaywright;
exports.testScrapeCategoryPlaywright = testScrapeCategoryPlaywright;
const age_gate_playwright_1 = require("../utils/age-gate-playwright");
const logger_1 = require("./logger");
const stealthBrowser_1 = require("../utils/stealthBrowser");
const dutchie_1 = require("../scrapers/templates/dutchie");
/**
 * Scrapes a category page using Playwright with stealth mode to extract product information
 */
async function scrapeCategoryPlaywright(categoryUrl, categoryName, state = 'Arizona', proxy) {
    logger_1.logger.info('scraper', `Scraping category: ${categoryName}`);
    logger_1.logger.info('scraper', `URL: ${categoryUrl}`);
    // Create stealth browser with optional proxy
    const browser = await (0, stealthBrowser_1.createStealthBrowser)({ proxy, headless: true });
    try {
        // Create stealth context with age gate cookies
        const context = await (0, stealthBrowser_1.createStealthContext)(browser, { state });
        // Try to load saved session cookies
        const cookiesPath = `/tmp/dutchie-session-${state.toLowerCase()}.json`;
        await (0, stealthBrowser_1.loadCookies)(context, cookiesPath);
        const page = await context.newPage();
        // Navigate to category page
        logger_1.logger.info('scraper', `Loading page: ${categoryUrl}`);
        await page.goto(categoryUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });
        // Random delay to appear more human
        await (0, stealthBrowser_1.randomDelay)(1000, 2000);
        // Check for Cloudflare challenge
        if (await (0, stealthBrowser_1.isCloudflareChallenge)(page)) {
            logger_1.logger.info('scraper', '🛡️ Cloudflare challenge detected, waiting...');
            const passed = await (0, stealthBrowser_1.waitForCloudflareChallenge)(page, 30000);
            if (!passed) {
                logger_1.logger.error('scraper', '❌ Failed to pass Cloudflare challenge');
                await browser.close();
                return [];
            }
            // Save successful session cookies
            await (0, stealthBrowser_1.saveCookies)(context, cookiesPath);
        }
        // Wait for page to be fully loaded
        await (0, stealthBrowser_1.waitForPageLoad)(page);
        // Simulate human behavior
        await (0, stealthBrowser_1.simulateHumanBehavior)(page);
        // Check for and bypass age gate
        const bypassed = await (0, age_gate_playwright_1.bypassAgeGatePlaywright)(page, state);
        if (!bypassed) {
            logger_1.logger.error('scraper', 'Failed to bypass age gate');
            await browser.close();
            return [];
        }
        // Wait for products to load with random delay
        logger_1.logger.info('scraper', 'Waiting for products to load...');
        await (0, stealthBrowser_1.randomDelay)(2000, 4000);
        // Scroll to load all products with human-like behavior
        logger_1.logger.info('scraper', 'Scrolling to load all products...');
        await scrollToBottomHuman(page);
        // Extract products
        logger_1.logger.info('scraper', 'Extracting products from page...');
        const products = await extractProducts(page, categoryUrl, categoryName);
        logger_1.logger.info('scraper', `Found ${products.length} products`);
        await browser.close();
        return products;
    }
    catch (error) {
        logger_1.logger.error('scraper', `Error scraping category: ${error}`);
        await browser.close();
        return [];
    }
}
/**
 * Scrolls to the bottom of the page with human-like behavior
 */
async function scrollToBottomHuman(page) {
    let previousHeight = 0;
    let currentHeight = await page.evaluate(() => document.body.scrollHeight);
    let attempts = 0;
    const maxAttempts = 20;
    while (previousHeight < currentHeight && attempts < maxAttempts) {
        previousHeight = currentHeight;
        // Scroll down in chunks with randomized delays
        const scrollAmount = Math.floor(Math.random() * 200) + 300; // 300-500px
        await (0, stealthBrowser_1.humanScroll)(page, scrollAmount);
        // Random pause like a human reading
        await (0, stealthBrowser_1.randomDelay)(500, 1500);
        // Check new height
        currentHeight = await page.evaluate(() => document.body.scrollHeight);
        attempts++;
    }
    // Final wait for any lazy-loaded content
    await (0, stealthBrowser_1.randomDelay)(1000, 2000);
}
/**
 * Extracts product information from the page
 */
async function extractProducts(page, categoryUrl, categoryName) {
    let products = [];
    // Check if we have a template for this URL
    const template = (0, dutchie_1.getTemplateForUrl)(categoryUrl);
    if (template) {
        logger_1.logger.info('scraper', `Using ${template.name} template for extraction`);
        try {
            const templateProducts = await template.extractProducts(page);
            // Add category to products from template
            products = templateProducts.map(p => ({
                ...p,
                category: categoryName,
            }));
            logger_1.logger.info('scraper', `Template extracted ${products.length} products`);
            return products;
        }
        catch (err) {
            logger_1.logger.error('scraper', `Template extraction failed: ${err}`);
            // Fall through to fallback methods
        }
    }
    // Fallback Method 1: Dutchie products (for Sol Flower, etc.)
    try {
        const dutchieProducts = await page.locator('[data-testid^="product-"], .product-card, [class*="ProductCard"]').all();
        if (dutchieProducts.length > 0) {
            logger_1.logger.info('scraper', `Found ${dutchieProducts.length} Dutchie-style products`);
            for (const productEl of dutchieProducts) {
                try {
                    const name = await productEl.locator('[data-testid="product-name"], .product-name, h3, h4').first().textContent() || '';
                    const brand = await productEl.locator('[data-testid="product-brand"], .product-brand, .brand').first().textContent().catch(() => '');
                    const priceText = await productEl.locator('[data-testid="product-price"], .product-price, .price').first().textContent().catch(() => '');
                    const imageUrl = await productEl.locator('img').first().getAttribute('src').catch(() => '');
                    const productLink = await productEl.locator('a').first().getAttribute('href').catch(() => '');
                    // Parse price
                    const price = priceText ? parseFloat(priceText.replace(/[^0-9.]/g, '')) : undefined;
                    if (name) {
                        products.push({
                            name: name.trim(),
                            brand: brand ? brand.trim() : undefined,
                            category: categoryName,
                            price,
                            image_url: imageUrl || undefined,
                            product_url: productLink ? new URL(productLink, categoryUrl).toString() : categoryUrl,
                            in_stock: true
                        });
                    }
                }
                catch (err) {
                    logger_1.logger.warn('scraper', `Error extracting Dutchie product: ${err}`);
                }
            }
        }
    }
    catch (err) {
        logger_1.logger.warn('scraper', `Dutchie product extraction failed: ${err}`);
    }
    // Method 2: Curaleaf products
    if (products.length === 0) {
        try {
            const curaleafProducts = await page.locator('.product, [class*="Product"], [class*="item"]').all();
            if (curaleafProducts.length > 0) {
                logger_1.logger.info('scraper', `Found ${curaleafProducts.length} Curaleaf-style products`);
                for (const productEl of curaleafProducts) {
                    try {
                        const name = await productEl.locator('h1, h2, h3, h4, .title, .name').first().textContent() || '';
                        const priceText = await productEl.locator('.price, [class*="price"]').first().textContent().catch(() => '');
                        const imageUrl = await productEl.locator('img').first().getAttribute('src').catch(() => '');
                        const price = priceText ? parseFloat(priceText.replace(/[^0-9.]/g, '')) : undefined;
                        if (name && name.length > 3) {
                            products.push({
                                name: name.trim(),
                                category: categoryName,
                                price,
                                image_url: imageUrl || undefined,
                                product_url: categoryUrl,
                                in_stock: true
                            });
                        }
                    }
                    catch (err) {
                        logger_1.logger.warn('scraper', `Error extracting Curaleaf product: ${err}`);
                    }
                }
            }
        }
        catch (err) {
            logger_1.logger.warn('scraper', `Curaleaf product extraction failed: ${err}`);
        }
    }
    // Method 3: Generic product cards
    if (products.length === 0) {
        try {
            const genericProducts = await page.locator('article, [role="article"], .card, [class*="card"]').all();
            logger_1.logger.info('scraper', `Trying generic selectors, found ${genericProducts.length} elements`);
            for (const productEl of genericProducts) {
                try {
                    const text = await productEl.textContent() || '';
                    // Only consider elements that look like products
                    if (text.includes('$') || text.toLowerCase().includes('price') || text.toLowerCase().includes('thc')) {
                        const name = await productEl.locator('h1, h2, h3, h4').first().textContent() || '';
                        if (name && name.length > 3) {
                            products.push({
                                name: name.trim(),
                                category: categoryName,
                                product_url: categoryUrl,
                                in_stock: true
                            });
                        }
                    }
                }
                catch (err) {
                    // Skip this element
                }
            }
        }
        catch (err) {
            logger_1.logger.warn('scraper', `Generic product extraction failed: ${err}`);
        }
    }
    return products;
}
/**
 * Test function to scrape a single category
 */
async function testScrapeCategoryPlaywright(url, categoryName, state = 'Arizona') {
    console.log(`\n🎭 Testing Playwright Category Scraper\n`);
    console.log(`Category: ${categoryName}`);
    console.log(`URL: ${url}\n`);
    const products = await scrapeCategoryPlaywright(url, categoryName, state);
    console.log(`\n✅ Found ${products.length} products\n`);
    products.slice(0, 5).forEach((p, i) => {
        console.log(`${i + 1}. ${p.name}`);
        if (p.brand)
            console.log(`   Brand: ${p.brand}`);
        if (p.price)
            console.log(`   Price: $${p.price}`);
        console.log(`   URL: ${p.product_url}`);
        console.log('');
    });
    return products;
}
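For quick manual verification, the exported test helper above could be driven from a one-off script roughly like this; the require path, URL, and category name are placeholders, only the function signature comes from the file itself.

    // One-off driver for the vendored scraper above (path, URL, and category are placeholders).
    const { testScrapeCategoryPlaywright } = require('./backend/dist/services/scraper-playwright');

    testScrapeCategoryPlaywright('https://dutchie.com/dispensary/example/products/flower', 'Flower', 'Arizona')
        .then((products) => console.log(`Scraped ${products.length} products`))
        .catch((err) => console.error(err));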