The job_run_logs table tracks scheduled job orchestration, not individual worker jobs. Worker info (worker_id, worker_hostname) belongs on dispensary_crawl_jobs, not job_run_logs. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
718 lines
33 KiB
JavaScript
718 lines
33 KiB
JavaScript
"use strict";
|
|
// CommonJS/ES-module interop helper: values that are already flagged as ES
// modules are passed through untouched, anything else is wrapped so it can be
// reached through a `default` property.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
|
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
exports.USER_AGENT_GROUPS = exports.USER_AGENTS = void 0;
|
|
exports.getUserAgent = getUserAgent;
|
|
exports.scrapeCategory = scrapeCategory;
|
|
exports.saveProducts = saveProducts;
|
|
exports.scrapeStore = scrapeStore;
|
|
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
|
|
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
|
|
const migrate_1 = require("../db/migrate");
|
|
const minio_1 = require("../utils/minio");
|
|
const logger_1 = require("./logger");
|
|
const scraper_monitor_1 = require("../routes/scraper-monitor");
|
|
const proxy_1 = require("./proxy");
|
|
const age_gate_1 = require("../utils/age-gate");
|
|
const availability_1 = require("./availability");
|
|
// Apply stealth plugin for antidetect/anti-fingerprinting
|
|
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
|
|
exports.USER_AGENTS = {
|
|
'chrome-windows': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
|
'chrome-mac': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
|
'chrome-linux': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
|
'mobile-ios': 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1',
|
|
'mobile-android': 'Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36',
|
|
'googlebot': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
|
|
'bingbot': 'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)'
|
|
};
|
|
exports.USER_AGENT_GROUPS = {
|
|
desktop: ['chrome-windows', 'chrome-mac', 'chrome-linux'],
|
|
mobile: ['mobile-ios', 'mobile-android'],
|
|
serp: ['googlebot', 'bingbot']
|
|
};
|
|
/**
 * Returns the User-Agent string for a randomly chosen key of `group`.
 *
 * @param {string[]} group - Keys of `exports.USER_AGENTS` to choose from.
 * @returns {string|undefined} The matching User-Agent string (undefined when
 *   the chosen key has no entry in `exports.USER_AGENTS`).
 */
function getRandomUserAgentFromGroup(group) {
    const pick = Math.floor(Math.random() * group.length);
    return exports.USER_AGENTS[group[pick]];
}
|
|
/**
 * Resolves a User-Agent selector to a concrete User-Agent string.
 *
 * - no key: random desktop User-Agent
 * - 'rotate-desktop' | 'rotate-mobile' | 'rotate-serp': random member of that group
 * - a key of `exports.USER_AGENTS`: that exact User-Agent
 * - anything else: random desktop User-Agent
 *
 * @param {string} [key] - Selector as described above.
 * @returns {string} A User-Agent string.
 */
function getUserAgent(key) {
    const groups = exports.USER_AGENT_GROUPS;
    if (!key) {
        return getRandomUserAgentFromGroup(groups.desktop);
    }
    switch (key) {
        case 'rotate-desktop':
            return getRandomUserAgentFromGroup(groups.desktop);
        case 'rotate-mobile':
            return getRandomUserAgentFromGroup(groups.mobile);
        case 'rotate-serp':
            return getRandomUserAgentFromGroup(groups.serp);
        default:
            break;
    }
    // Only own entries count: a plain `USER_AGENTS[key]` lookup would also
    // resolve inherited names such as 'constructor' or 'toString' and hand
    // back a function instead of a User-Agent string.
    const isKnownKey = Object.prototype.hasOwnProperty.call(exports.USER_AGENTS, key);
    const specific = isKnownKey ? exports.USER_AGENTS[key] : undefined;
    return specific || getRandomUserAgentFromGroup(groups.desktop);
}
|
|
/**
 * Pulls the hexadecimal image id out of an `images.dutchie.com/<id>` URL.
 *
 * @param {string} url - Image URL to inspect.
 * @returns {string|null} The image id, or null when the URL does not match
 *   or the lookup throws (for example when `url` is not a string).
 */
function extractImageIdFromUrl(url) {
    let found;
    try {
        found = url.match(/images\.dutchie\.com\/([a-f0-9]+)/i);
    }
    catch (e) {
        return null;
    }
    if (!found) {
        return null;
    }
    return found[1];
}
|
|
/**
 * Rewrites a Dutchie image URL so it requests a 2000x2000, quality-95 rendition.
 *
 * @param {string} imageUrl - Image URL as found on the page.
 * @returns {string} The rewritten URL, or `imageUrl` unchanged when no image
 *   id can be extracted from it.
 */
function getFullSizeImageUrl(imageUrl) {
    const imageId = extractImageIdFromUrl(imageUrl);
    return imageId
        ? `https://images.dutchie.com/${imageId}?auto=format&fit=max&q=95&w=2000&h=2000`
        : imageUrl;
}
|
|
/**
 * Returns a copy of `product` with its text fields truncated and its
 * cannabinoid percentages validated.
 *
 * - `name` is cut to 500 characters and defaults to 'Unnamed Product'.
 * - `brand` is cut to 500 and `weight` to 100 characters; empty values become null.
 * - `description` is kept as-is, with empty values normalised to null.
 * - `thc` / `cbd` are kept only when they are finite numbers in [0, 100);
 *   everything else becomes null.
 *
 * @param {object} product - Scraped product fields.
 * @returns {object} Shallow copy of `product` with the fields above sanitised.
 */
function sanitizeProductData(product) {
    // An explicit range check instead of a truthiness test, so that a real
    // 0 % reading is preserved rather than being turned into null.
    const validPercentage = (value) => {
        if (value === null || value === undefined || value === '') {
            return null;
        }
        const numeric = Number(value);
        return Number.isFinite(numeric) && numeric >= 0 && numeric < 100 ? value : null;
    };
    return {
        ...product,
        name: product.name?.substring(0, 500) || 'Unnamed Product',
        description: product.description || null,
        brand: product.brand?.substring(0, 500) || null,
        weight: product.weight?.substring(0, 100) || null,
        thc: validPercentage(product.thc),
        cbd: validPercentage(product.cbd)
    };
}
|
|
/**
 * Registers scripts that run before every document loaded in `page` and mask
 * common automation fingerprints: `navigator.webdriver`, `navigator.plugins`,
 * `navigator.languages`, `window.chrome` and the notifications permission query.
 *
 * Each callback is handed to `page.evaluateOnNewDocument`, so it must stay
 * self-contained and must not reference variables from this module.
 *
 * @param {object} page - Page object exposing `evaluateOnNewDocument(fn)`.
 * @returns {Promise<void>}
 */
async function makePageStealthy(page) {
    await page.evaluateOnNewDocument(() => {
        Object.defineProperty(navigator, 'webdriver', {
            get: () => false,
        });
    });
    await page.evaluateOnNewDocument(() => {
        Object.defineProperty(navigator, 'plugins', {
            get: () => [1, 2, 3, 4, 5],
        });
    });
    await page.evaluateOnNewDocument(() => {
        Object.defineProperty(navigator, 'languages', {
            get: () => ['en-US', 'en'],
        });
    });
    await page.evaluateOnNewDocument(() => {
        window.chrome = {
            runtime: {},
        };
    });
    await page.evaluateOnNewDocument(() => {
        const permissions = window.navigator.permissions;
        if (!permissions || typeof permissions.query !== 'function') {
            return;
        }
        // Keep the original method bound to its owner: calling the detached
        // function would run it with the wrong `this` and fail for every
        // permission other than 'notifications'.
        const originalQuery = permissions.query.bind(permissions);
        permissions.query = (parameters) => parameters.name === 'notifications'
            ? Promise.resolve({ state: 'denied' })
            : originalQuery(parameters);
    });
}
|
|
/**
 * Opens a product page and extracts its details with text and selector heuristics.
 *
 * Navigation plus extraction is attempted up to three times, back to back.
 * The function never rejects: when every attempt fails it logs the last
 * error and resolves with an "empty" details object.
 *
 * @param {object} page - Page object exposing `goto` and `evaluate`.
 * @param {string} productUrl - URL of the product detail page.
 * @param {string} productName - Product name, used only in log messages.
 * @returns {Promise<{fullSizeImage: (string|null), description: (string|null),
 *   thc: (number|null), cbd: (number|null), strainType: (string|null),
 *   terpenes: string[], effects: string[], brand: (string|null),
 *   lineage: (string|null), flavors: string[], weights: string[]}>}
 */
async function scrapeProductDetails(page, productUrl, productName) {
    const maxRetries = 3;
    let lastError = null;
    for (let attempt = 1; attempt <= maxRetries; attempt++) {
        try {
            await page.goto(productUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
            // The callback runs inside the page, so every helper it needs is
            // declared inside it.
            return await page.evaluate(() => {
                const allText = document.body.textContent || '';
                // First element (in selector order) accepted by `accept`, or null.
                const firstElement = (selectors, accept) => {
                    for (const sel of selectors) {
                        const el = document.querySelector(sel);
                        if (accept(el)) {
                            return el;
                        }
                    }
                    return null;
                };
                // Number captured by the first pattern that matches the page text.
                const firstPercentage = (patterns) => {
                    for (const pattern of patterns) {
                        const match = allText.match(pattern);
                        if (match) {
                            return parseFloat(match[1]);
                        }
                    }
                    return null;
                };
                // Names that occur as whole words (case-insensitive) in the page text.
                const mentioned = (names) => names.filter((name) => new RegExp(`\\b${name}\\b`, 'i').test(allText));

                const imageEl = firstElement([
                    'img[class*="ProductImage"]',
                    'img[class*="product-image"]',
                    '[class*="ImageGallery"] img',
                    'main img',
                    'img[src*="images.dutchie.com"]'
                ], (img) => Boolean(img?.src && img.src.includes('dutchie.com')));
                const fullSizeImage = imageEl ? imageEl.src : null;

                // Requires more than 20 characters to skip short labels.
                const descriptionEl = firstElement([
                    '[class*="description"]',
                    '[class*="Description"]',
                    '[data-testid*="description"]',
                    'p[class*="product"]'
                ], (el) => Boolean(el?.textContent?.trim() && el.textContent.length > 20));
                const description = descriptionEl ? descriptionEl.textContent.trim() : '';

                const thc = firstPercentage([
                    /THC[:\s]*(\d+\.?\d*)\s*%/i,
                    /Total\s+THC[:\s]*(\d+\.?\d*)\s*%/i,
                    /(\d+\.?\d*)\s*%\s+THC/i
                ]);
                const cbd = firstPercentage([
                    /CBD[:\s]*(\d+\.?\d*)\s*%/i,
                    /Total\s+CBD[:\s]*(\d+\.?\d*)\s*%/i,
                    /(\d+\.?\d*)\s*%\s+CBD/i
                ]);

                let strainType = null;
                if (allText.match(/\bindica\b/i)) {
                    strainType = 'Indica';
                }
                else if (allText.match(/\bsativa\b/i)) {
                    strainType = 'Sativa';
                }
                else if (allText.match(/\bhybrid\b/i)) {
                    strainType = 'Hybrid';
                }

                const terpenes = mentioned([
                    'Myrcene', 'Limonene', 'Caryophyllene', 'Pinene', 'Linalool',
                    'Humulene', 'Terpinolene', 'Ocimene', 'Bisabolol', 'Valencene'
                ]);
                const effects = mentioned([
                    'Relaxed', 'Happy', 'Euphoric', 'Uplifted', 'Creative',
                    'Energetic', 'Focused', 'Calm', 'Sleepy', 'Hungry',
                    'Talkative', 'Giggly', 'Aroused'
                ]);

                const brandEl = firstElement([
                    '[class*="brand"]',
                    '[class*="Brand"]',
                    '[data-testid*="brand"]'
                ], (el) => Boolean(el?.textContent?.trim()));
                const brand = brandEl ? brandEl.textContent.trim() : null;

                const lineageMatch = allText.match(/(?:Lineage|Genetics|Parents?)[:\s]*([^\n]+)/i);
                const lineage = lineageMatch ? lineageMatch[1].trim() : null;

                const flavors = mentioned([
                    'Sweet', 'Citrus', 'Earthy', 'Pine', 'Berry', 'Diesel',
                    'Sour', 'Floral', 'Spicy', 'Woody', 'Tropical', 'Fruity',
                    'Vanilla', 'Mint', 'Cheese', 'Grape', 'Lemon', 'Orange'
                ]);

                // Distinct weight mentions, in order of first appearance.
                const weightMatches = allText.matchAll(/(\d+\.?\d*\s*(?:g|oz|mg|gram))/gi);
                const weights = [...new Set(Array.from(weightMatches, (match) => match[1].trim()))];

                return {
                    fullSizeImage,
                    description,
                    thc,
                    cbd,
                    strainType,
                    terpenes,
                    effects,
                    brand,
                    lineage,
                    flavors,
                    weights
                };
            });
        }
        catch (error) {
            lastError = error;
            logger_1.logger.warn('scraper', ` Attempt ${attempt}/${maxRetries} failed for ${productName}: ${error}`);
            // No delays - just retry immediately
        }
    }
    // Include the last failure so the log shows why the product was given up on.
    logger_1.logger.error('scraper', ` ✗ All attempts failed for ${productName}: ${lastError}`);
    return {
        fullSizeImage: null,
        description: null,
        thc: null,
        cbd: null,
        strainType: null,
        terpenes: [],
        effects: [],
        brand: null,
        lineage: null,
        flavors: [],
        weights: []
    };
}
|
|
/**
 * Builds the browser launch options, adding a `--proxy-server` flag when a
 * proxy with a supported protocol (socks5, http, https) is supplied.
 */
function buildLaunchOptions(proxy) {
    const args = [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
        '--window-size=1920,1080'
    ];
    if (proxy && ['socks5', 'http', 'https'].includes(proxy.protocol)) {
        args.push(`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`);
    }
    return { headless: 'new', args };
}

/**
 * Scrapes one category page: loads the listing, collects the product cards,
 * visits every product page for details and returns the products in the
 * shape expected by `saveProducts`.
 *
 * Side effects: registers/updates/completes the run with the scraper monitor,
 * stamps `categories.last_scraped_at` on success, and reports proxy problems
 * (timeout for bot detection, failure count for network errors). Each error
 * is reported against the proxy at most once.
 *
 * @param {*} storeId - Store id; only forwarded to the scraper monitor.
 * @param {*} categoryId - Id of the row in `categories` to scrape.
 * @param {string} [userAgent] - Selector passed to `getUserAgent`.
 * @returns {Promise<object[]>} One formatted product per product card found.
 * @throws {Error} When the category does not exist, navigation fails
 *   (bot detection is rethrown as `Bot detection: ...`), or any other step fails.
 */
async function scrapeCategory(storeId, categoryId, userAgent) {
    let browser = null;
    const scraperId = `cat-${categoryId}-${Date.now()}`;
    let proxyId = null;
    // Set once the navigation handler has penalised the proxy, so the outer
    // handler does not count the same (rethrown) error a second time.
    let proxyErrorRecorded = false;
    try {
        const categoryResult = await migrate_1.pool.query(`
      SELECT c.*, s.slug as store_slug, s.name as store_name
      FROM categories c
      JOIN stores s ON c.store_id = s.id
      WHERE c.id = $1
    `, [categoryId]);
        if (categoryResult.rows.length === 0) {
            throw new Error('Category not found');
        }
        const category = categoryResult.rows[0];
        logger_1.logger.info('scraper', `Scraping category: ${category.name} for ${category.store_name}`);
        // Register scraper with monitoring system
        (0, scraper_monitor_1.registerScraper)(scraperId, storeId, category.store_name, categoryId, category.name);
        const proxy = await (0, proxy_1.getActiveProxy)();
        if (proxy) {
            proxyId = proxy.id;
        }
        const launchOptions = buildLaunchOptions(proxy);
        if (proxy) {
            logger_1.logger.info('scraper', `Using proxy: ${proxy.protocol}://${proxy.host}:${proxy.port}`);
        }
        browser = await puppeteer_extra_1.default.launch(launchOptions);
        const page = await browser.newPage();
        await makePageStealthy(page);
        await page.setViewport({ width: 1920, height: 1080 });
        // Use provided userAgent or random if not specified
        const ua = getUserAgent(userAgent);
        await page.setUserAgent(ua);
        // Set age gate bypass cookies BEFORE navigation (standard for all cannabis sites)
        const state = (0, age_gate_1.detectStateFromUrl)(category.dutchie_url);
        await (0, age_gate_1.setAgeGateCookies)(page, category.dutchie_url, state);
        logger_1.logger.info('scraper', `Loading page: ${category.dutchie_url}`);
        try {
            await page.goto(category.dutchie_url, {
                waitUntil: 'networkidle2',
                timeout: 60000
            });
            // If age gate still appears, try to bypass it
            await (0, age_gate_1.bypassAgeGate)(page, state);
            // Wait for products to load; a missing selector is not fatal
            await page.waitForSelector('[data-testid="product-list-item"], a[href*="/product/"]', {
                timeout: 30000,
            }).catch(() => {
                logger_1.logger.warn('scraper', 'No product selectors found, trying anyway...');
            });
            logger_1.logger.info('scraper', 'Scrolling to load all products...');
            await autoScroll(page);
        }
        catch (navError) {
            logger_1.logger.error('scraper', `Navigation error: ${navError}`);
            // Check if this is bot detection - put proxy in timeout instead of hard failure
            if (proxyId) {
                const errorMsg = String(navError);
                if ((0, proxy_1.isBotDetectionError)(errorMsg)) {
                    logger_1.logger.warn('scraper', `🤖 Bot detection triggered for proxy ${proxyId}!`);
                    (0, proxy_1.putProxyInTimeout)(proxyId, errorMsg);
                    proxyErrorRecorded = true;
                    throw new Error(`Bot detection: ${errorMsg}`, { cause: navError });
                }
                else if (errorMsg.includes('timeout') || errorMsg.includes('net::') ||
                    errorMsg.includes('ERR_') || errorMsg.includes('Navigation')) {
                    // Regular proxy failure - increment failure count
                    logger_1.logger.warn('scraper', `Proxy failure detected, incrementing failure count for proxy ${proxyId}`);
                    await (0, proxy_1.incrementProxyFailure)(proxyId, errorMsg);
                    proxyErrorRecorded = true;
                }
            }
            throw navError;
        }
        logger_1.logger.info('scraper', 'Extracting product list from page...');
        // Runs inside the page: must not reference anything from this module.
        const products = await page.evaluate(() => {
            const items = [];
            const cards = document.querySelectorAll('[data-testid="product-list-item"]');
            console.log(`Found ${cards.length} product cards`);
            cards.forEach((card) => {
                try {
                    const allText = card.textContent || '';
                    let name = '';
                    const nameSelectors = ['a[href*="/product/"]', 'h1', 'h2', 'h3', 'h4'];
                    for (const sel of nameSelectors) {
                        const el = card.querySelector(sel);
                        if (el?.textContent?.trim()) {
                            // Keep only the first line of the element text
                            name = el.textContent.trim().split('\n')[0].trim();
                            break;
                        }
                    }
                    if (!name || name.length < 2) {
                        return;
                    }
                    // First dollar amount is taken as the price, the second as the original price
                    let price = null;
                    let originalPrice = null;
                    const priceMatches = allText.match(/\$(\d+\.?\d*)/g);
                    if (priceMatches && priceMatches.length > 0) {
                        price = parseFloat(priceMatches[0].replace('$', ''));
                        if (priceMatches.length > 1) {
                            originalPrice = parseFloat(priceMatches[1].replace('$', ''));
                        }
                    }
                    // Extract variant (weight/size) - look for common patterns.
                    // NOTE(review): any text the multi-pack pattern matches is already
                    // matched by the weight pattern, which is tried first, so the
                    // multi-pack pattern never wins. Left unchanged because `variant`
                    // is part of the key used to find existing products.
                    let variant = null;
                    const variantPatterns = [
                        /(\d+\.?\d*\s*(?:g|oz|mg|ml|gram|ounce))/i, // Weight units
                        /(\d+\s*pack)/i, // Pack sizes
                        /(\d+\s*ct)/i, // Count
                        /(\d+\s*x\s*\d+\.?\d*\s*(?:g|mg|ml))/i // Multi-pack (e.g., 5x0.5g)
                    ];
                    for (const pattern of variantPatterns) {
                        const match = allText.match(pattern);
                        if (match) {
                            variant = match[1].trim();
                            break;
                        }
                    }
                    const linkEl = card.querySelector('a[href*="/product/"]');
                    let href = linkEl?.href || linkEl?.getAttribute('href') || '';
                    if (href && href.startsWith('/')) {
                        href = 'https://dutchie.com' + href;
                    }
                    items.push({
                        name,
                        variant,
                        price,
                        originalPrice,
                        href: href || window.location.href
                    });
                }
                catch (err) {
                    console.error('Error parsing product card:', err);
                }
            });
            return items;
        });
        logger_1.logger.info('scraper', `Found ${products.length} products total`);
        logger_1.logger.info('scraper', `Now visiting each product page for complete details...`);
        let successCount = 0;
        let failCount = 0;
        // Update initial stats
        (0, scraper_monitor_1.updateScraperStats)(scraperId, {
            productsProcessed: 0,
            productsTotal: products.length
        });
        for (const [i, product] of products.entries()) {
            try {
                logger_1.logger.info('scraper', ` [${i + 1}/${products.length}] ${product.name}`);
                (0, scraper_monitor_1.updateScraperStats)(scraperId, {
                    productsProcessed: i + 1,
                    productsTotal: products.length
                }, `Processing: ${product.name}`);
                if (!product.href) {
                    logger_1.logger.warn('scraper', ` ⚠ No product URL, skipping details`);
                    product.metadata = {};
                    failCount++;
                    continue;
                }
                const details = await scrapeProductDetails(page, product.href, product.name);
                product.imageUrl = details.fullSizeImage ? getFullSizeImageUrl(details.fullSizeImage) : null;
                product.description = details.description;
                product.thc = details.thc;
                product.cbd = details.cbd;
                product.strainType = details.strainType;
                product.brand = details.brand;
                product.weight = details.weights.length > 0 ? details.weights[0] : null;
                product.metadata = {
                    terpenes: details.terpenes,
                    effects: details.effects,
                    lineage: details.lineage,
                    flavors: details.flavors,
                    allWeights: details.weights
                };
                if (details.thc || details.cbd || details.description) {
                    logger_1.logger.info('scraper', ` ✓ THC: ${details.thc}%, CBD: ${details.cbd}%`);
                    successCount++;
                }
                else {
                    logger_1.logger.warn('scraper', ` ⚠ Limited data extracted`);
                    failCount++;
                }
                // No delays - scrape fast!
            }
            catch (error) {
                logger_1.logger.error('scraper', ` ✗ Unexpected error: ${error}`);
                product.metadata = {};
                failCount++;
            }
        }
        await browser.close();
        // Already closed: keep the error handler from closing it a second time.
        browser = null;
        logger_1.logger.info('scraper', `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
        logger_1.logger.info('scraper', `✅ Category complete: ${category.name}`);
        logger_1.logger.info('scraper', ` Total products: ${products.length}`);
        logger_1.logger.info('scraper', ` Success: ${successCount}`);
        logger_1.logger.info('scraper', ` Failed: ${failCount}`);
        logger_1.logger.info('scraper', `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
        await migrate_1.pool.query(`
      UPDATE categories
      SET last_scraped_at = CURRENT_TIMESTAMP
      WHERE id = $1
    `, [categoryId]);
        // Mark scraper as complete
        (0, scraper_monitor_1.completeScraper)(scraperId);
        const formattedProducts = products.map((p, index) => {
            const sanitized = sanitizeProductData(p);
            // Normalize availability from Dutchie product data
            const availability = (0, availability_1.normalizeAvailability)(p);
            return {
                dutchieProductId: `${category.store_slug}-${category.slug}-${Date.now()}-${index}`,
                name: sanitized.name,
                variant: p.variant || null,
                description: sanitized.description,
                price: p.price,
                originalPrice: p.originalPrice,
                thcPercentage: sanitized.thc,
                cbdPercentage: sanitized.cbd,
                strainType: p.strainType,
                brand: sanitized.brand,
                weight: sanitized.weight,
                imageUrl: p.imageUrl,
                dutchieUrl: p.href,
                metadata: p.metadata || {},
                availabilityStatus: availability.status,
                availabilityRaw: availability.raw,
                stockQuantity: availability.quantity
            };
        });
        return formattedProducts;
    }
    catch (error) {
        logger_1.logger.error('scraper', `❌ Category scraping error: ${error}`);
        // Smart proxy error handling (skipped when the navigation handler
        // already reported this error against the proxy)
        if (proxyId && !proxyErrorRecorded) {
            const errorMsg = String(error);
            if ((0, proxy_1.isBotDetectionError)(errorMsg)) {
                // Bot detection! Put this proxy in timeout
                logger_1.logger.warn('scraper', `🤖 Bot detection triggered for proxy ${proxyId}!`);
                (0, proxy_1.putProxyInTimeout)(proxyId, errorMsg);
            }
            else if (errorMsg.includes('timeout') || errorMsg.includes('net::') ||
                errorMsg.includes('ERR_') || errorMsg.includes('Navigation') ||
                errorMsg.includes('Protocol error') || errorMsg.includes('Target closed')) {
                // Regular proxy failure - increment failure count
                logger_1.logger.warn('scraper', `Proxy failure detected, incrementing failure count for proxy ${proxyId}`);
                await (0, proxy_1.incrementProxyFailure)(proxyId, errorMsg);
            }
        }
        // Mark scraper as failed
        (0, scraper_monitor_1.completeScraper)(scraperId, String(error));
        if (browser) {
            try {
                await browser.close();
            }
            catch (e) {
                logger_1.logger.error('scraper', `Error closing browser: ${e}`);
            }
        }
        throw error;
    }
}
|
|
/**
 * Scrolls the page down in 500px steps every 200ms until the distance
 * scrolled reaches the document body's current scroll height.
 *
 * The callback runs inside the page via `page.evaluate`.
 *
 * @param {object} page - Page object exposing `evaluate(fn)`.
 * @returns {Promise<void>}
 */
async function autoScroll(page) {
    await page.evaluate(async () => {
        await new Promise((done) => {
            const step = 500;
            let scrolled = 0;
            const ticker = setInterval(() => {
                // Re-read on every tick: the height may grow as content loads.
                const pageHeight = document.body.scrollHeight;
                window.scrollBy(0, step);
                scrolled += step;
                if (scrolled < pageHeight) {
                    return;
                }
                clearInterval(ticker);
                done();
            }, 200);
        });
    });
}
|
|
/**
 * Persists the scraped products of one category inside a single transaction.
 *
 * Steps:
 *  1. Mark every in-stock product of the store/category as out of stock.
 *  2. For each scraped product, update the existing row (matched on store,
 *     category, name and variant) or insert a new one, then download its
 *     image if no local copy is recorded yet.
 *
 * Each product is written under its own savepoint. In PostgreSQL a failed
 * statement aborts the surrounding transaction, so without the savepoint one
 * bad product would make every later statement fail and the final COMMIT
 * would silently roll everything back.
 *
 * @param {*} storeId - Store the products belong to.
 * @param {*} categoryId - Category the products belong to.
 * @param {object[]} products - Products as returned by `scrapeCategory`.
 * @returns {Promise<void>}
 * @throws Rethrows any error outside the per-product handling, after
 *   attempting to roll the transaction back.
 */
async function saveProducts(storeId, categoryId, products) {
    const client = await migrate_1.pool.connect();
    try {
        await client.query('BEGIN');
        logger_1.logger.info('scraper', `Saving ${products.length} products to database...`);
        // Mark all products as out-of-stock before processing (they'll be re-marked if found)
        // Also update availability_status and last_seen_out_of_stock_at for state transition tracking
        await client.query(`
      UPDATE products
      SET in_stock = false,
          availability_status = 'out_of_stock',
          last_seen_out_of_stock_at = CASE
            WHEN availability_status != 'out_of_stock' THEN CURRENT_TIMESTAMP
            ELSE last_seen_out_of_stock_at
          END
      WHERE store_id = $1 AND category_id = $2 AND in_stock = true
    `, [storeId, categoryId]);
        let savedCount = 0;
        for (const product of products) {
            await client.query('SAVEPOINT save_product');
            try {
                // Get availability from product (defaults to in_stock if product exists in scraped data)
                const availStatus = product.availabilityStatus || 'in_stock';
                const availRaw = product.availabilityRaw ? JSON.stringify(product.availabilityRaw) : null;
                const stockQty = product.stockQuantity ?? null;
                const existingResult = await client.query(`
          SELECT id, image_url, local_image_path, availability_status
          FROM products
          WHERE store_id = $1 AND name = $2 AND category_id = $3
          AND (variant = $4 OR (variant IS NULL AND $4 IS NULL))
        `, [storeId, product.name, categoryId, product.variant || null]);
                let localImagePath = null;
                let productId;
                if (existingResult.rows.length > 0) {
                    const existing = existingResult.rows[0];
                    productId = existing.id;
                    localImagePath = existing.local_image_path;
                    const prevStatus = existing.availability_status;
                    // last_seen_in_stock_at only moves on an out-of-stock -> in-stock transition
                    const isNowInStock = availStatus === 'in_stock' || availStatus === 'limited';
                    const wasOutOfStock = prevStatus === 'out_of_stock' || prevStatus === 'unknown';
                    await client.query(`
            UPDATE products
            SET name = $1, variant = $2, description = $3, price = $4,
                strain_type = $5, thc_percentage = $6, cbd_percentage = $7,
                brand = $8, weight = $9, image_url = $10, dutchie_url = $11,
                in_stock = true, metadata = $12, last_seen_at = CURRENT_TIMESTAMP,
                updated_at = CURRENT_TIMESTAMP,
                availability_status = $14,
                availability_raw = $15,
                stock_quantity = $16,
                last_seen_in_stock_at = CASE
                  WHEN $17 THEN CURRENT_TIMESTAMP
                  ELSE last_seen_in_stock_at
                END
            WHERE id = $13
          `, [
                        product.name, product.variant, product.description, product.price,
                        product.strainType, product.thcPercentage, product.cbdPercentage,
                        product.brand, product.weight, product.imageUrl, product.dutchieUrl,
                        JSON.stringify(product.metadata), productId, availStatus, availRaw, stockQty,
                        isNowInStock && wasOutOfStock
                    ]);
                }
                else {
                    // Generate unique slug from product name + timestamp + random suffix
                    const baseSlug = product.name
                        .toLowerCase()
                        .replace(/[^a-z0-9]+/g, '-')
                        .replace(/^-|-$/g, '')
                        .substring(0, 150);
                    const uniqueSuffix = `${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
                    const slug = `${baseSlug}-${uniqueSuffix}`;
                    const insertResult = await client.query(`
            INSERT INTO products (
              store_id, category_id, dutchie_product_id, name, slug, variant, description,
              price, strain_type, thc_percentage, cbd_percentage,
              brand, weight, image_url, dutchie_url, in_stock, metadata,
              availability_status, availability_raw, stock_quantity, last_seen_in_stock_at
            ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, true, $16, $17, $18, $19, CURRENT_TIMESTAMP)
            RETURNING id
          `, [
                        storeId, categoryId, product.dutchieProductId, product.name, slug, product.variant, product.description,
                        product.price, product.strainType, product.thcPercentage, product.cbdPercentage,
                        product.brand, product.weight, product.imageUrl, product.dutchieUrl,
                        JSON.stringify(product.metadata), availStatus, availRaw, stockQty
                    ]);
                    productId = insertResult.rows[0].id;
                }
                if (product.imageUrl && !localImagePath) {
                    // The download is best-effort; only a successful download is recorded.
                    let downloadedPath = null;
                    try {
                        downloadedPath = await (0, minio_1.uploadImageFromUrl)(product.imageUrl, productId);
                    }
                    catch (error) {
                        logger_1.logger.error('images', `Failed to download image for ${product.name}: ${error}`);
                    }
                    if (downloadedPath) {
                        await client.query(`
              UPDATE products
              SET local_image_path = $1
              WHERE id = $2
            `, [downloadedPath, productId]);
                    }
                }
                await client.query('RELEASE SAVEPOINT save_product');
                savedCount++;
            }
            catch (productError) {
                // Undo only this product and keep the transaction usable for the rest.
                await client.query('ROLLBACK TO SAVEPOINT save_product');
                logger_1.logger.error('scraper', `Failed to save product ${product.name}: ${productError}`);
            }
        }
        await client.query('COMMIT');
        logger_1.logger.info('scraper', `✅ Saved ${savedCount} products successfully`);
        if (savedCount < products.length) {
            logger_1.logger.warn('scraper', `${products.length - savedCount} products could not be saved`);
        }
    }
    catch (error) {
        try {
            await client.query('ROLLBACK');
        }
        catch (rollbackError) {
            // Do not let a failed rollback hide the original error.
            logger_1.logger.error('scraper', `Error rolling back transaction: ${rollbackError}`);
        }
        logger_1.logger.error('scraper', `Error saving products: ${error}`);
        throw error;
    }
    finally {
        client.release();
    }
}
|
|
/**
 * Scrapes every scrape-enabled category of a store, one after another, and
 * saves the products of each category as soon as it has been scraped.
 *
 * A failing category is logged and skipped; the remaining categories still
 * run. After the loop `stores.last_scraped_at` is stamped.
 *
 * @param {*} storeId - Id of the store to scrape.
 * @param {number} [parallel=3] - Only written to the start-up log line here;
 *   categories are processed sequentially.
 * @param {string} [userAgent] - User-Agent selector forwarded to `scrapeCategory`.
 * @returns {Promise<void>}
 * @throws Rethrows errors from the category lookup or the final store update.
 */
async function scrapeStore(storeId, parallel = 3, userAgent) {
    const info = (message) => logger_1.logger.info('scraper', message);
    const divider = `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`;
    try {
        info(`🏪 Starting scrape for store ID: ${storeId} (${parallel} parallel, UA: ${userAgent || 'random'})`);
        const { rows: categories } = await migrate_1.pool.query(`
      SELECT c.id, c.name, c.slug, c.dutchie_url
      FROM categories c
      WHERE c.store_id = $1
      AND c.scrape_enabled = true
      ORDER BY c.name
    `, [storeId]);
        info(`Found ${categories.length} categories to scrape`);
        for (const category of categories) {
            try {
                info(divider);
                info(`📂 Scraping: ${category.name}`);
                info(divider);
                const scraped = await scrapeCategory(storeId, category.id, userAgent);
                await saveProducts(storeId, category.id, scraped);
                info(`✅ Completed ${category.name} - ${scraped.length} products saved`);
            }
            catch (categoryError) {
                // One broken category must not stop the rest of the store.
                logger_1.logger.error('scraper', `❌ Failed to scrape ${category.name}: ${categoryError}`);
            }
            // No delays - scrape fast!
        }
        await migrate_1.pool.query(`
      UPDATE stores
      SET last_scraped_at = CURRENT_TIMESTAMP
      WHERE id = $1
    `, [storeId]);
        info(`🎉 Store scrape completed: ID ${storeId}`);
    }
    catch (storeError) {
        logger_1.logger.error('scraper', `❌ Store scrape failed: ${storeError}`);
        throw storeError;
    }
}
|