Files
cannaiq/backend/src/_deprecated/services/scraper.ts
Kelly a35976b9e9 chore: Clean up deprecated code and docs
- Move deprecated directories to src/_deprecated/:
  - hydration/ (old pipeline approach)
  - scraper-v2/ (old Puppeteer scraper)
  - canonical-hydration/ (merged into tasks)
  - Unused services: availability, crawler-logger, geolocation, etc
  - Unused utils: age-gate-playwright, HomepageValidator, stealthBrowser

- Archive outdated docs to docs/_archive/:
  - ANALYTICS_RUNBOOK.md
  - ANALYTICS_V2_EXAMPLES.md
  - BRAND_INTELLIGENCE_API.md
  - CRAWL_PIPELINE.md
  - TASK_WORKFLOW_2024-12-10.md
  - WORKER_TASK_ARCHITECTURE.md
  - ORGANIC_SCRAPING_GUIDE.md

- Add docs/CODEBASE_MAP.md as single source of truth
- Add warning files to deprecated/archived directories
- Slim down CLAUDE.md to essential rules only

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-11 22:17:40 -07:00

842 lines
29 KiB
TypeScript
Executable File

import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import { Browser, Page } from 'puppeteer';
import { SocksProxyAgent } from 'socks-proxy-agent';
import { pool } from '../db/pool';
import { downloadProductImageLegacy } from '../utils/image-storage';
import { logger } from './logger';
import { registerScraper, updateScraperStats, completeScraper } from '../routes/scraper-monitor';
import { incrementProxyFailure, getActiveProxy, isBotDetectionError, putProxyInTimeout } from './proxy';
import { bypassAgeGate, detectStateFromUrl, setAgeGateCookies } from '../utils/age-gate';
import { normalizeAvailability, AvailabilityStatus } from './availability';
// Apply stealth plugin for antidetect/anti-fingerprinting.
// This is a top-level statement, so it runs once when the module is loaded,
// before any browser is launched by the functions in this file.
puppeteer.use(StealthPlugin());
/**
 * Connection details for a proxy server.
 *
 * NOTE(review): this interface is not referenced anywhere in the visible part
 * of this file — confirm whether it is still needed.
 */
interface ProxyConfig {
// Proxy host name or address.
host: string;
// Proxy port.
port: number;
// Proxy scheme; scrapeCategory in this file handles 'socks5', 'http' and 'https'.
protocol: string;
// Optional credentials.
username?: string;
password?: string;
}
/**
 * A scraped product as returned by scrapeCategory() and persisted by
 * saveProducts().
 */
interface Product {
// Written to products.dutchie_product_id on insert. scrapeCategory() builds it
// from the store slug, category slug, the current timestamp and the list index.
dutchieProductId: string;
name: string;
// Size / pack descriptor matched from the listing card text (e.g. a weight or pack count).
variant?: string;
description?: string;
// First "$<amount>" found in the listing card text.
price?: number;
// Second "$<amount>" found in the listing card text, when present.
originalPrice?: number;
// 'Indica' | 'Sativa' | 'Hybrid' as detected from the product page text.
strainType?: string;
thcPercentage?: number;
cbdPercentage?: number;
brand?: string;
weight?: string;
imageUrl?: string;
// URL of the product page (taken from the listing card's link in scrapeCategory()).
dutchieUrl: string;
// Free-form extras; scrapeCategory() stores terpenes, effects, lineage, flavors
// and allWeights here. Serialized with JSON.stringify when saved.
metadata?: any;
// Availability tracking
// These three are filled from normalizeAvailability() in scrapeCategory().
availabilityStatus?: AvailabilityStatus;
availabilityRaw?: any;
stockQuantity?: number | null;
}
/**
 * Preset User-Agent header strings keyed by a short preset name.
 * The keys are the values accepted by getUserAgent() and listed in
 * USER_AGENT_GROUPS.
 */
export const USER_AGENTS = {
'chrome-windows': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'chrome-mac': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'chrome-linux': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'mobile-ios': 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1',
'mobile-android': 'Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36',
'googlebot': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
'bingbot': 'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)'
};
/**
 * Named groups of USER_AGENTS keys. getUserAgent() picks a random member of
 * a group for the 'rotate-desktop', 'rotate-mobile' and 'rotate-serp' options.
 */
export const USER_AGENT_GROUPS = {
desktop: ['chrome-windows', 'chrome-mac', 'chrome-linux'],
mobile: ['mobile-ios', 'mobile-android'],
serp: ['googlebot', 'bingbot']
};
/**
 * Picks one preset key at random from `group` and returns the matching
 * User-Agent string from USER_AGENTS.
 *
 * @param group List of USER_AGENTS keys to choose from.
 * @returns The User-Agent string registered under the chosen key.
 */
function getRandomUserAgentFromGroup(group: string[]): string {
  const pickedIndex = Math.floor(Math.random() * group.length);
  const presetKey = group[pickedIndex] as keyof typeof USER_AGENTS;
  return USER_AGENTS[presetKey];
}
/**
 * Resolves a User-Agent string from a preset name or rotation keyword.
 *
 * - `'rotate-desktop'`, `'rotate-mobile'`, `'rotate-serp'`: a random member of
 *   the corresponding USER_AGENT_GROUPS entry.
 * - A key of USER_AGENTS: that exact User-Agent string.
 * - Anything else (including no key or an empty string): a random desktop
 *   User-Agent.
 *
 * @param key Optional preset name or rotation keyword.
 * @returns A User-Agent header value.
 */
export function getUserAgent(key?: string): string {
  switch (key) {
    case 'rotate-desktop':
      return getRandomUserAgentFromGroup(USER_AGENT_GROUPS.desktop);
    case 'rotate-mobile':
      return getRandomUserAgentFromGroup(USER_AGENT_GROUPS.mobile);
    case 'rotate-serp':
      return getRandomUserAgentFromGroup(USER_AGENT_GROUPS.serp);
    default:
      break;
  }
  // Only accept keys defined on USER_AGENTS itself. A plain index lookup would
  // also resolve inherited members such as 'toString' or 'constructor' and
  // hand a function back to a caller expecting a string.
  if (key && Object.prototype.hasOwnProperty.call(USER_AGENTS, key)) {
    return USER_AGENTS[key as keyof typeof USER_AGENTS];
  }
  return getRandomUserAgentFromGroup(USER_AGENT_GROUPS.desktop);
}
/**
 * Extracts the hexadecimal image id that follows `images.dutchie.com/` in a URL.
 *
 * @param url Image URL to inspect.
 * @returns The id (case preserved), or null when the URL does not contain the
 *          pattern or matching throws.
 */
function extractImageIdFromUrl(url: string): string | null {
  let imageId: string | null = null;
  try {
    const found = url.match(/images\.dutchie\.com\/([a-f0-9]+)/i);
    if (found) {
      imageId = found[1];
    }
  } catch (e) {
    imageId = null;
  }
  return imageId;
}
/**
 * Rewrites a Dutchie image URL to request a large (2000x2000 max, q=95)
 * rendition of the same image id.
 *
 * @param imageUrl Original image URL.
 * @returns The rewritten URL, or `imageUrl` unchanged when no image id can be
 *          extracted from it.
 */
function getFullSizeImageUrl(imageUrl: string): string {
  const imageId = extractImageIdFromUrl(imageUrl);
  return imageId
    ? `https://images.dutchie.com/${imageId}?auto=format&fit=max&q=95&w=2000&h=2000`
    : imageUrl;
}
/**
 * Returns a copy of `product` with text fields length-limited and potency
 * values range-checked.
 *
 * - `name`: first 500 characters, or 'Unnamed Product' when missing, empty or
 *   not a string.
 * - `brand` / `weight`: first 500 / 100 characters, or null when missing,
 *   empty or not a string.
 * - `description`: passed through, with falsy values normalised to null.
 * - `thc` / `cbd`: kept when in the range [0, 100), otherwise null. A value of
 *   exactly 0 is preserved (it is a real measurement, not "unknown").
 *
 * All other properties of `product` are copied unchanged.
 */
function sanitizeProductData(product: any): any {
  // Non-string values yield '' so the caller's `||` fallback applies instead
  // of throwing on a missing `.substring`.
  const truncate = (value: any, maxLength: number): string =>
    typeof value === 'string' ? value.substring(0, maxLength) : '';
  // Explicitly accept 0; a bare truthiness check would turn 0% into null.
  const keepPercentage = (value: any): any =>
    value === 0 || (value && value >= 0 && value < 100) ? value : null;

  return {
    ...product,
    name: truncate(product.name, 500) || 'Unnamed Product',
    description: product.description || null,
    brand: truncate(product.brand, 500) || null,
    weight: truncate(product.weight, 100) || null,
    thc: keepPercentage(product.thc),
    cbd: keepPercentage(product.cbd)
  };
}
/**
 * Installs init scripts on `page` that override a few browser properties
 * before any page script runs:
 *
 * - `navigator.webdriver` reports false
 * - `navigator.plugins` reports a non-empty list
 * - `navigator.languages` reports ['en-US', 'en']
 * - `window.chrome` exists with a `runtime` object
 * - `navigator.permissions.query` reports 'denied' for notifications and
 *   defers to the real implementation for everything else
 *
 * These are applied in addition to the stealth plugin registered at module load.
 *
 * @param page Page to prepare; call before navigating.
 */
async function makePageStealthy(page: Page): Promise<void> {
  await page.evaluateOnNewDocument(() => {
    Object.defineProperty(navigator, 'webdriver', {
      get: () => false,
    });
  });
  await page.evaluateOnNewDocument(() => {
    Object.defineProperty(navigator, 'plugins', {
      get: () => [1, 2, 3, 4, 5],
    });
  });
  await page.evaluateOnNewDocument(() => {
    Object.defineProperty(navigator, 'languages', {
      get: () => ['en-US', 'en'],
    });
  });
  await page.evaluateOnNewDocument(() => {
    (window as any).chrome = {
      runtime: {},
    };
  });
  await page.evaluateOnNewDocument(() => {
    const permissions = window.navigator.permissions;
    // The native method must keep its receiver: calling a detached reference
    // throws "Illegal invocation" for every non-notification query.
    const originalQuery = permissions.query.bind(permissions);
    permissions.query = (parameters: any) =>
      parameters.name === 'notifications'
        ? Promise.resolve({ state: 'denied' } as PermissionStatus)
        : originalQuery(parameters);
  });
}
/**
 * Loads a product page and extracts detail fields from its DOM and text.
 *
 * Navigation plus extraction is attempted up to three times, with no delay
 * between attempts. The extraction is heuristic: selectors are tried in order
 * and potency / strain / terpene / effect / flavor / weight values are matched
 * against the full text of the page body.
 *
 * @param page Puppeteer page to navigate (it is left on the product URL).
 * @param productUrl URL of the product page.
 * @param productName Product name, used only in log messages.
 * @returns An object with fullSizeImage, description, thc, cbd, strainType,
 *          terpenes, effects, brand, lineage, flavors and weights. When every
 *          attempt fails, the scalar fields are null and the lists are empty;
 *          this function does not throw for page errors.
 */
async function scrapeProductDetails(page: Page, productUrl: string, productName: string): Promise<any> {
  const maxRetries = 3;
  let lastError: unknown = null;
  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    try {
      await page.goto(productUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
      // NOTE: this callback is serialized and executed inside the browser, so
      // it must stay self-contained (no references to module-level helpers).
      const details = await page.evaluate(() => {
        const allText = document.body.textContent || '';

        // Main image: first selector that yields a dutchie.com image wins.
        let fullSizeImage = null;
        const mainImageSelectors = [
          'img[class*="ProductImage"]',
          'img[class*="product-image"]',
          '[class*="ImageGallery"] img',
          'main img',
          'img[src*="images.dutchie.com"]'
        ];
        for (const sel of mainImageSelectors) {
          const img = document.querySelector(sel) as HTMLImageElement;
          if (img?.src && img.src.includes('dutchie.com')) {
            fullSizeImage = img.src;
            break;
          }
        }

        // Description: first matching element with more than 20 characters of text.
        let description = '';
        const descSelectors = [
          '[class*="description"]',
          '[class*="Description"]',
          '[data-testid*="description"]',
          'p[class*="product"]'
        ];
        for (const sel of descSelectors) {
          const el = document.querySelector(sel);
          if (el?.textContent?.trim() && el.textContent.length > 20) {
            description = el.textContent.trim();
            break;
          }
        }

        // Potency: first pattern that matches anywhere in the page text.
        let thc = null;
        const thcPatterns = [
          /THC[:\s]*(\d+\.?\d*)\s*%/i,
          /Total\s+THC[:\s]*(\d+\.?\d*)\s*%/i,
          /(\d+\.?\d*)\s*%\s+THC/i
        ];
        for (const pattern of thcPatterns) {
          const match = allText.match(pattern);
          if (match) {
            thc = parseFloat(match[1]);
            break;
          }
        }
        let cbd = null;
        const cbdPatterns = [
          /CBD[:\s]*(\d+\.?\d*)\s*%/i,
          /Total\s+CBD[:\s]*(\d+\.?\d*)\s*%/i,
          /(\d+\.?\d*)\s*%\s+CBD/i
        ];
        for (const pattern of cbdPatterns) {
          const match = allText.match(pattern);
          if (match) {
            cbd = parseFloat(match[1]);
            break;
          }
        }

        // Strain type: checked in a fixed order, so 'Indica' wins when several words appear.
        let strainType = null;
        if (allText.match(/\bindica\b/i)) strainType = 'Indica';
        else if (allText.match(/\bsativa\b/i)) strainType = 'Sativa';
        else if (allText.match(/\bhybrid\b/i)) strainType = 'Hybrid';

        // Keyword lists: a name is reported when it appears as a whole word.
        const terpenes: string[] = [];
        const terpeneNames = [
          'Myrcene', 'Limonene', 'Caryophyllene', 'Pinene', 'Linalool',
          'Humulene', 'Terpinolene', 'Ocimene', 'Bisabolol', 'Valencene'
        ];
        terpeneNames.forEach(terp => {
          if (allText.match(new RegExp(`\\b${terp}\\b`, 'i'))) {
            terpenes.push(terp);
          }
        });
        const effects: string[] = [];
        const effectNames = [
          'Relaxed', 'Happy', 'Euphoric', 'Uplifted', 'Creative',
          'Energetic', 'Focused', 'Calm', 'Sleepy', 'Hungry',
          'Talkative', 'Giggly', 'Aroused'
        ];
        effectNames.forEach(effect => {
          if (allText.match(new RegExp(`\\b${effect}\\b`, 'i'))) {
            effects.push(effect);
          }
        });

        let brand = null;
        const brandSelectors = [
          '[class*="brand"]',
          '[class*="Brand"]',
          '[data-testid*="brand"]'
        ];
        for (const sel of brandSelectors) {
          const el = document.querySelector(sel);
          if (el?.textContent?.trim()) {
            brand = el.textContent.trim();
            break;
          }
        }

        // Lineage: rest of the line following a "Lineage" / "Genetics" / "Parent(s)" label.
        let lineage = null;
        const lineageMatch = allText.match(/(?:Lineage|Genetics|Parents?)[:\s]*([^\n]+)/i);
        if (lineageMatch) {
          lineage = lineageMatch[1].trim();
        }

        const flavors: string[] = [];
        const flavorNames = [
          'Sweet', 'Citrus', 'Earthy', 'Pine', 'Berry', 'Diesel',
          'Sour', 'Floral', 'Spicy', 'Woody', 'Tropical', 'Fruity',
          'Vanilla', 'Mint', 'Cheese', 'Grape', 'Lemon', 'Orange'
        ];
        flavorNames.forEach(flavor => {
          if (allText.match(new RegExp(`\\b${flavor}\\b`, 'i'))) {
            flavors.push(flavor);
          }
        });

        // Weights: every distinct "<number><unit>" occurrence, in page order.
        const weights: string[] = [];
        const weightMatches = allText.matchAll(/(\d+\.?\d*\s*(?:g|oz|mg|gram))/gi);
        for (const match of weightMatches) {
          const weight = match[1].trim();
          if (!weights.includes(weight)) {
            weights.push(weight);
          }
        }

        return {
          fullSizeImage,
          description,
          thc,
          cbd,
          strainType,
          terpenes,
          effects,
          brand,
          lineage,
          flavors,
          weights
        };
      });
      return details;
    } catch (error) {
      lastError = error;
      logger.warn('scraper', ` Attempt ${attempt}/${maxRetries} failed for ${productName}: ${error}`);
      // No delays - just retry immediately
    }
  }
  // Surface the last failure so the final log line explains why the product was skipped.
  logger.error('scraper', ` ✗ All attempts failed for ${productName}: ${lastError}`);
  return {
    fullSizeImage: null,
    description: null,
    thc: null,
    cbd: null,
    strainType: null,
    terpenes: [],
    effects: [],
    brand: null,
    lineage: null,
    flavors: [],
    weights: []
  };
}
/**
 * Scrapes one category listing page and then every product page linked from it.
 *
 * Steps: load the category row (joined with its store), register the run with
 * the scraper monitor, launch a browser (through the active proxy when one is
 * available), set age-gate cookies, open the category URL, scroll to load all
 * cards, read name / variant / prices / link from each card, visit each
 * product page for details, stamp `categories.last_scraped_at`, and return the
 * products in `Product` shape.
 *
 * @param storeId Store id; used here only when registering with the scraper monitor.
 * @param categoryId Id of the category row to scrape.
 * @param userAgent Optional preset / rotation key passed to getUserAgent().
 * @returns The scraped products. Products whose detail page could not be read
 *          are still returned, with the detail fields empty.
 * @throws Rethrows any failure (unknown category, launch or navigation error,
 *         database error) after marking the scraper run as failed and closing
 *         the browser. A navigation failure classified as bot detection is
 *         rethrown as `Error('Bot detection: ...')` when a proxy is in use.
 */
export async function scrapeCategory(storeId: number, categoryId: number, userAgent?: string): Promise<Product[]> {
  let browser: Browser | null = null;
  const scraperId = `cat-${categoryId}-${Date.now()}`;
  let proxyId: number | null = null;
  // Set once a failure has been reported to the proxy layer, so that the
  // outer error handler does not report the same failure a second time.
  let proxyFailureRecorded = false;
  try {
    const categoryResult = await pool.query(`
SELECT c.*, s.slug as store_slug, s.name as store_name
FROM categories c
JOIN stores s ON c.store_id = s.id
WHERE c.id = $1
`, [categoryId]);
    if (categoryResult.rows.length === 0) {
      throw new Error('Category not found');
    }
    const category = categoryResult.rows[0];
    logger.info('scraper', `Scraping category: ${category.name} for ${category.store_name}`);
    // Register scraper with monitoring system
    registerScraper(scraperId, storeId, category.store_name, categoryId, category.name);
    const proxy = await getActiveProxy();
    if (proxy) {
      proxyId = proxy.id;
    }
    const launchOptions: any = {
      headless: 'new',
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
        '--window-size=1920,1080'
      ]
    };
    if (proxy) {
      if (proxy.protocol === 'socks5') {
        launchOptions.args.push(`--proxy-server=socks5://${proxy.host}:${proxy.port}`);
      } else if (proxy.protocol === 'http' || proxy.protocol === 'https') {
        launchOptions.args.push(`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`);
      }
      logger.info('scraper', `Using proxy: ${proxy.protocol}://${proxy.host}:${proxy.port}`);
    }
    browser = await puppeteer.launch(launchOptions);
    const page = await browser.newPage();
    await makePageStealthy(page);
    await page.setViewport({ width: 1920, height: 1080 });
    // Use provided userAgent or random if not specified
    const ua = getUserAgent(userAgent);
    await page.setUserAgent(ua);
    // Set age gate bypass cookies BEFORE navigation (standard for all cannabis sites)
    const state = detectStateFromUrl(category.dutchie_url);
    await setAgeGateCookies(page, category.dutchie_url, state);
    logger.info('scraper', `Loading page: ${category.dutchie_url}`);
    try {
      await page.goto(category.dutchie_url, {
        waitUntil: 'networkidle2',
        timeout: 60000
      });
      // If age gate still appears, try to bypass it
      await bypassAgeGate(page, state);
      // Wait for products to load
      await page.waitForSelector('[data-testid="product-list-item"], a[href*="/product/"]', {
        timeout: 30000,
      }).catch(() => {
        logger.warn('scraper', 'No product selectors found, trying anyway...');
      });
      logger.info('scraper', 'Scrolling to load all products...');
      await autoScroll(page);
    } catch (navError) {
      logger.error('scraper', `Navigation error: ${navError}`);
      // Check if this is bot detection - put proxy in timeout instead of hard failure
      if (proxyId) {
        const errorMsg = String(navError);
        if (isBotDetectionError(errorMsg)) {
          // Bot detection! Put this proxy in timeout and get a new one
          logger.warn('scraper', `🤖 Bot detection triggered for proxy ${proxyId}!`);
          proxyFailureRecorded = true;
          putProxyInTimeout(proxyId, errorMsg);
          throw new Error(`Bot detection: ${errorMsg}`);
        } else if (errorMsg.includes('timeout') || errorMsg.includes('net::') ||
          errorMsg.includes('ERR_') || errorMsg.includes('Navigation')) {
          // Regular proxy failure - increment failure count
          logger.warn('scraper', `Proxy failure detected, incrementing failure count for proxy ${proxyId}`);
          proxyFailureRecorded = true;
          await incrementProxyFailure(proxyId, errorMsg);
        }
      }
      throw navError;
    }
    logger.info('scraper', 'Extracting product list from page...');
    // NOTE: this callback runs inside the browser and must stay self-contained.
    const products = await page.evaluate(() => {
      const items: any[] = [];
      const cards = document.querySelectorAll('[data-testid="product-list-item"]');
      console.log(`Found ${cards.length} product cards`);
      cards.forEach((card) => {
        try {
          const allText = card.textContent || '';
          // Name: first line of text of the first matching element.
          let name = '';
          const nameSelectors = ['a[href*="/product/"]', 'h1', 'h2', 'h3', 'h4'];
          for (const sel of nameSelectors) {
            const el = card.querySelector(sel);
            if (el?.textContent?.trim()) {
              name = el.textContent.trim();
              name = name.split('\n')[0].trim();
              break;
            }
          }
          if (!name || name.length < 2) return;
          // Prices: first "$n" is the price, second (if any) the original price.
          let price = null;
          let originalPrice = null;
          const priceMatches = allText.match(/\$(\d+\.?\d*)/g);
          if (priceMatches && priceMatches.length > 0) {
            price = parseFloat(priceMatches[0].replace('$', ''));
            if (priceMatches.length > 1) {
              originalPrice = parseFloat(priceMatches[1].replace('$', ''));
            }
          }
          // Extract variant (weight/size) - look for common patterns
          let variant = null;
          const variantPatterns = [
            /(\d+\.?\d*\s*(?:g|oz|mg|ml|gram|ounce))/i, // Weight units
            /(\d+\s*pack)/i, // Pack sizes
            /(\d+\s*ct)/i, // Count
            /(\d+\s*x\s*\d+\.?\d*\s*(?:g|mg|ml))/i // Multi-pack (e.g., 5x0.5g)
          ];
          for (const pattern of variantPatterns) {
            const match = allText.match(pattern);
            if (match) {
              variant = match[1].trim();
              break;
            }
          }
          const linkEl = card.querySelector('a[href*="/product/"]') as HTMLAnchorElement | null;
          let href = linkEl?.href || linkEl?.getAttribute('href') || '';
          if (href && href.startsWith('/')) {
            href = 'https://dutchie.com' + href;
          }
          items.push({
            name,
            variant,
            price,
            originalPrice,
            href: href || window.location.href
          });
        } catch (err) {
          console.error('Error parsing product card:', err);
        }
      });
      return items;
    });
    logger.info('scraper', `Found ${products.length} products total`);
    logger.info('scraper', `Now visiting each product page for complete details...`);
    let successCount = 0;
    let failCount = 0;
    // Update initial stats
    updateScraperStats(scraperId, {
      productsProcessed: 0,
      productsTotal: products.length
    });
    for (let i = 0; i < products.length; i++) {
      const product = products[i];
      try {
        logger.info('scraper', ` [${i + 1}/${products.length}] ${product.name}`);
        updateScraperStats(scraperId, {
          productsProcessed: i + 1,
          productsTotal: products.length
        }, `Processing: ${product.name}`);
        if (!product.href) {
          logger.warn('scraper', ` ⚠ No product URL, skipping details`);
          product.metadata = {};
          failCount++;
          continue;
        }
        const details = await scrapeProductDetails(page, product.href, product.name);
        product.imageUrl = details.fullSizeImage ? getFullSizeImageUrl(details.fullSizeImage) : null;
        product.description = details.description;
        product.thc = details.thc;
        product.cbd = details.cbd;
        product.strainType = details.strainType;
        product.brand = details.brand;
        product.weight = details.weights.length > 0 ? details.weights[0] : null;
        product.metadata = {
          terpenes: details.terpenes,
          effects: details.effects,
          lineage: details.lineage,
          flavors: details.flavors,
          allWeights: details.weights
        };
        if (details.thc || details.cbd || details.description) {
          logger.info('scraper', ` ✓ THC: ${details.thc}%, CBD: ${details.cbd}%`);
          successCount++;
        } else {
          logger.warn('scraper', ` ⚠ Limited data extracted`);
          failCount++;
        }
        // No delays - scrape fast!
      } catch (error) {
        logger.error('scraper', ` ✗ Unexpected error: ${error}`);
        product.metadata = {};
        failCount++;
      }
    }
    await browser.close();
    logger.info('scraper', `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
    logger.info('scraper', `✅ Category complete: ${category.name}`);
    logger.info('scraper', ` Total products: ${products.length}`);
    logger.info('scraper', ` Success: ${successCount}`);
    logger.info('scraper', ` Failed: ${failCount}`);
    logger.info('scraper', `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
    await pool.query(`
UPDATE categories
SET last_scraped_at = CURRENT_TIMESTAMP
WHERE id = $1
`, [categoryId]);
    // Mark scraper as complete
    completeScraper(scraperId);
    const formattedProducts: Product[] = products.map((p, index) => {
      const sanitized = sanitizeProductData(p);
      // Normalize availability from Dutchie product data
      const availability = normalizeAvailability(p);
      return {
        dutchieProductId: `${category.store_slug}-${category.slug}-${Date.now()}-${index}`,
        name: sanitized.name,
        variant: p.variant || null,
        description: sanitized.description,
        price: p.price,
        originalPrice: p.originalPrice,
        thcPercentage: sanitized.thc,
        cbdPercentage: sanitized.cbd,
        strainType: p.strainType,
        brand: sanitized.brand,
        weight: sanitized.weight,
        imageUrl: p.imageUrl,
        dutchieUrl: p.href,
        metadata: p.metadata || {},
        availabilityStatus: availability.status,
        availabilityRaw: availability.raw,
        stockQuantity: availability.quantity
      };
    });
    return formattedProducts;
  } catch (error) {
    logger.error('scraper', `❌ Category scraping error: ${error}`);
    // Smart proxy error handling.
    // Skipped when the navigation handler above already reported this failure;
    // otherwise one navigation error would be counted against the proxy twice.
    if (proxyId && !proxyFailureRecorded) {
      const errorMsg = String(error);
      if (isBotDetectionError(errorMsg)) {
        // Bot detection! Put this proxy in timeout
        logger.warn('scraper', `🤖 Bot detection triggered for proxy ${proxyId}!`);
        putProxyInTimeout(proxyId, errorMsg);
      } else if (errorMsg.includes('timeout') || errorMsg.includes('net::') ||
        errorMsg.includes('ERR_') || errorMsg.includes('Navigation') ||
        errorMsg.includes('Protocol error') || errorMsg.includes('Target closed')) {
        // Regular proxy failure - increment failure count
        logger.warn('scraper', `Proxy failure detected, incrementing failure count for proxy ${proxyId}`);
        await incrementProxyFailure(proxyId, errorMsg);
      }
    }
    // Mark scraper as failed
    completeScraper(scraperId, String(error));
    if (browser) {
      try {
        await browser.close();
      } catch (e) {
        logger.error('scraper', `Error closing browser: ${e}`);
      }
    }
    throw error;
  }
}
/**
 * Scrolls the page down in 500px steps every 200ms until the distance
 * scrolled reaches the document body's current scroll height, so that
 * lazily loaded content has a chance to render.
 *
 * @param page Page to scroll; the scrolling runs inside the browser.
 */
async function autoScroll(page: Page) {
  await page.evaluate(async () => {
    const stepPx = 500;
    let scrolledPx = 0;
    await new Promise<void>((done) => {
      const ticker = setInterval(() => {
        // Re-read the height each tick: it can grow as more content loads.
        const pageHeight = document.body.scrollHeight;
        window.scrollBy(0, stepPx);
        scrolledPx += stepPx;
        if (scrolledPx < pageHeight) {
          return;
        }
        clearInterval(ticker);
        done();
      }, 200);
    });
  });
}
/**
 * Persists scraped products for one store category inside a single
 * database transaction.
 *
 * 1. Marks every currently in-stock product of the category as out of stock.
 * 2. For each scraped product, updates the matching row (same store, category,
 *    name and variant) or inserts a new one, marking it in stock.
 * 3. Downloads the product image when the row has no local image path yet.
 *
 * Each product is written under its own SAVEPOINT. Without it, the first
 * failing statement would put the whole transaction into the aborted state,
 * every later statement would fail, and the final COMMIT would silently roll
 * everything back even though the per-product errors are only logged.
 *
 * NOTE(review): image downloads happen while the transaction is open; consider
 * moving them after COMMIT if they are slow.
 *
 * @param storeId Store the products belong to.
 * @param categoryId Category the products belong to.
 * @param products Products as returned by scrapeCategory().
 * @throws Rethrows, after ROLLBACK, any error raised outside the per-product
 *         handling (BEGIN, the bulk out-of-stock update, savepoint handling,
 *         COMMIT).
 */
export async function saveProducts(storeId: number, categoryId: number, products: Product[]): Promise<void> {
  const client = await pool.connect();
  try {
    await client.query('BEGIN');
    logger.info('scraper', `Saving ${products.length} products to database...`);
    // Mark all products as out-of-stock before processing (they'll be re-marked if found)
    // Also update availability_status and last_seen_out_of_stock_at for state transition tracking
    await client.query(`
UPDATE products
SET in_stock = false,
availability_status = 'out_of_stock',
last_seen_out_of_stock_at = CASE
WHEN availability_status != 'out_of_stock' THEN CURRENT_TIMESTAMP
ELSE last_seen_out_of_stock_at
END
WHERE store_id = $1 AND category_id = $2 AND in_stock = true
`, [storeId, categoryId]);
    let savedCount = 0;
    for (const product of products) {
      // Isolate this product so a failure only undoes its own statements.
      await client.query('SAVEPOINT save_product');
      try {
        // Get availability from product (defaults to in_stock if product exists in scraped data)
        const availStatus = product.availabilityStatus || 'in_stock';
        const availRaw = product.availabilityRaw ? JSON.stringify(product.availabilityRaw) : null;
        const stockQty = product.stockQuantity ?? null;
        const existingResult = await client.query(`
SELECT id, image_url, local_image_path, availability_status
FROM products
WHERE store_id = $1 AND name = $2 AND category_id = $3
AND (variant = $4 OR (variant IS NULL AND $4 IS NULL))
`, [storeId, product.name, categoryId, product.variant || null]);
        let localImagePath = null;
        let productId: number;
        if (existingResult.rows.length > 0) {
          productId = existingResult.rows[0].id;
          localImagePath = existingResult.rows[0].local_image_path;
          const prevStatus = existingResult.rows[0].availability_status;
          // Determine if we need to update last_seen_in_stock_at
          const isNowInStock = availStatus === 'in_stock' || availStatus === 'limited';
          const wasOutOfStock = prevStatus === 'out_of_stock' || prevStatus === 'unknown';
          await client.query(`
UPDATE products
SET name = $1, variant = $2, description = $3, price = $4,
strain_type = $5, thc_percentage = $6, cbd_percentage = $7,
brand = $8, weight = $9, image_url = $10, dutchie_url = $11,
in_stock = true, metadata = $12, last_seen_at = CURRENT_TIMESTAMP,
updated_at = CURRENT_TIMESTAMP,
availability_status = $14,
availability_raw = $15,
stock_quantity = $16,
last_seen_in_stock_at = CASE
WHEN $17 THEN CURRENT_TIMESTAMP
ELSE last_seen_in_stock_at
END
WHERE id = $13
`, [
            product.name, product.variant, product.description, product.price,
            product.strainType, product.thcPercentage, product.cbdPercentage,
            product.brand, product.weight, product.imageUrl, product.dutchieUrl,
            JSON.stringify(product.metadata), productId, availStatus, availRaw, stockQty,
            isNowInStock && wasOutOfStock
          ]);
        } else {
          // Generate unique slug from product name + timestamp + random suffix
          const baseSlug = product.name
            .toLowerCase()
            .replace(/[^a-z0-9]+/g, '-')
            .replace(/^-|-$/g, '')
            .substring(0, 150);
          // slice(2, 8) takes the same six characters as the deprecated substr(2, 6).
          const uniqueSuffix = `${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
          const slug = `${baseSlug}-${uniqueSuffix}`;
          const insertResult = await client.query(`
INSERT INTO products (
store_id, category_id, dutchie_product_id, name, slug, variant, description,
price, strain_type, thc_percentage, cbd_percentage,
brand, weight, image_url, dutchie_url, in_stock, metadata,
availability_status, availability_raw, stock_quantity, last_seen_in_stock_at
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, true, $16, $17, $18, $19, CURRENT_TIMESTAMP)
RETURNING id
`, [
            storeId, categoryId, product.dutchieProductId, product.name, slug, product.variant, product.description,
            product.price, product.strainType, product.thcPercentage, product.cbdPercentage,
            product.brand, product.weight, product.imageUrl, product.dutchieUrl,
            JSON.stringify(product.metadata), availStatus, availRaw, stockQty
          ]);
          productId = insertResult.rows[0].id;
        }
        if (product.imageUrl && !localImagePath) {
          // `undefined` means the download itself failed; a failed download is
          // logged and tolerated, the product row is still saved.
          let downloadedPath: string | null | undefined;
          try {
            const result = await downloadProductImageLegacy(product.imageUrl, 0, productId);
            downloadedPath = result.urls?.original || null;
          } catch (error) {
            logger.error('images', `Failed to download image for ${product.name}: ${error}`);
          }
          // The database write is kept outside the download handler so that a
          // failed statement is handled by the savepoint rollback below rather
          // than being swallowed.
          if (downloadedPath !== undefined) {
            await client.query(`
UPDATE products
SET local_image_path = $1
WHERE id = $2
`, [downloadedPath, productId]);
          }
        }
        await client.query('RELEASE SAVEPOINT save_product');
        savedCount++;
      } catch (productError) {
        logger.error('scraper', `Failed to save product ${product.name}: ${productError}`);
        // Undo only this product's statements and keep the transaction usable.
        await client.query('ROLLBACK TO SAVEPOINT save_product');
        await client.query('RELEASE SAVEPOINT save_product');
      }
    }
    await client.query('COMMIT');
    if (savedCount < products.length) {
      logger.warn('scraper', `${products.length - savedCount} of ${products.length} products could not be saved`);
    }
    logger.info('scraper', `✅ Saved ${savedCount} products successfully`);
  } catch (error) {
    await client.query('ROLLBACK');
    logger.error('scraper', `Error saving products: ${error}`);
    throw error;
  } finally {
    client.release();
  }
}
/**
 * Scrapes and saves every scrape-enabled category of a store, one category
 * after another in name order, then stamps `stores.last_scraped_at`.
 *
 * A failure in one category is logged and does not stop the remaining
 * categories.
 *
 * @param storeId Store to scrape.
 * @param parallel Only echoed in the start-up log message here; categories
 *                 are processed sequentially.
 * @param userAgent Optional preset / rotation key forwarded to scrapeCategory().
 * @throws Rethrows errors from the category lookup or the final store update.
 */
export async function scrapeStore(storeId: number, parallel: number = 3, userAgent?: string): Promise<void> {
  try {
    logger.info('scraper', `🏪 Starting scrape for store ID: ${storeId} (${parallel} parallel, UA: ${userAgent || 'random'})`);
    const { rows: categories } = await pool.query(`
SELECT c.id, c.name, c.slug, c.dutchie_url
FROM categories c
WHERE c.store_id = $1
AND c.scrape_enabled = true
ORDER BY c.name
`, [storeId]);
    logger.info('scraper', `Found ${categories.length} categories to scrape`);

    for (const row of categories) {
      const categoryName = row.name;
      try {
        logger.info('scraper', `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
        logger.info('scraper', `📂 Scraping: ${categoryName}`);
        logger.info('scraper', `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
        const scraped = await scrapeCategory(storeId, row.id, userAgent);
        await saveProducts(storeId, row.id, scraped);
        logger.info('scraper', `✅ Completed ${categoryName} - ${scraped.length} products saved`);
      } catch (categoryError) {
        // Best effort: keep going with the next category.
        logger.error('scraper', `❌ Failed to scrape ${categoryName}: ${categoryError}`);
      }
      // No delays - scrape fast!
    }

    await pool.query(`
UPDATE stores
SET last_scraped_at = CURRENT_TIMESTAMP
WHERE id = $1
`, [storeId]);
    logger.info('scraper', `🎉 Store scrape completed: ID ${storeId}`);
  } catch (storeError) {
    logger.error('scraper', `❌ Store scrape failed: ${storeError}`);
    throw storeError;
  }
}