Initial commit - Dutchie dispensary scraper

Author: Kelly
Date: 2025-11-28 19:45:44 -07:00
Commit: 5757a8e9bd
23375 changed files with 3788799 additions and 0 deletions

backend/dist/scraper-v2/downloader.js (vendored, new file, 324 lines)

@@ -0,0 +1,324 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.Downloader = void 0;
const puppeteer_1 = __importDefault(require("puppeteer"));
const axios_1 = __importDefault(require("axios"));
const types_1 = require("./types");
const logger_1 = require("../services/logger");
class Downloader {
browser = null;
page = null;
pageInUse = false;
/**
* Initialize browser instance (lazy initialization)
*/
async getBrowser() {
if (!this.browser || !this.browser.isConnected()) {
const launchOptions = {
headless: 'new',
args: [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-blink-features=AutomationControlled',
'--window-size=1920,1080',
'--disable-web-security',
'--disable-features=IsolateOrigins,site-per-process'
]
};
this.browser = await puppeteer_1.default.launch(launchOptions);
logger_1.logger.info('scraper', 'Browser instance created');
}
return this.browser;
}
/**
* Get or create a page instance
*/
async getPage() {
if (!this.page || this.page.isClosed()) {
const browser = await this.getBrowser();
this.page = await browser.newPage();
await this.page.setViewport({ width: 1920, height: 1080 });
logger_1.logger.debug('scraper', 'New page created');
}
return this.page;
}
/**
* Apply stealth mode to page
*/
async makePageStealthy(page) {
await page.evaluateOnNewDocument(() => {
// @ts-ignore - runs in browser context
Object.defineProperty(navigator, 'webdriver', {
get: () => false,
});
// @ts-ignore - runs in browser context
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
});
// @ts-ignore - runs in browser context
Object.defineProperty(navigator, 'languages', {
get: () => ['en-US', 'en'],
});
// @ts-ignore - runs in browser context
window.chrome = {
runtime: {},
};
// @ts-ignore - runs in browser context
const originalQuery = window.navigator.permissions.query;
// @ts-ignore - runs in browser context
window.navigator.permissions.query = (parameters) => parameters.name === 'notifications'
? Promise.resolve({ state: 'denied' })
: originalQuery.call(window.navigator.permissions, parameters); // keep `this` bound to avoid an "Illegal invocation" TypeError
});
}
/**
* Configure proxy for browser
*/
getProxyArgs(proxy) {
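// e.g., { protocol: 'socks5', host: '127.0.0.1', port: 1080 } yields
// ['--proxy-server=socks5://127.0.0.1:1080'] (illustrative values)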
if (proxy.protocol === 'socks5') {
return [`--proxy-server=socks5://${proxy.host}:${proxy.port}`];
}
else if (proxy.protocol === 'http' || proxy.protocol === 'https') {
return [`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`];
}
return [];
}
/**
* HTTP-based fetch (lightweight, fast)
*/
async httpFetch(request) {
try {
const config = {
timeout: 30000,
headers: {
'User-Agent': request.metadata.userAgent || 'Mozilla/5.0',
...request.metadata.headers
},
validateStatus: () => true // Don't throw on any status
};
// Add proxy if available
if (request.metadata.proxy) {
const proxy = request.metadata.proxy;
config.proxy = {
host: proxy.host,
port: proxy.port,
protocol: proxy.protocol
};
if (proxy.username && proxy.password) {
config.proxy.auth = {
username: proxy.username,
password: proxy.password
};
}
}
const response = await axios_1.default.get(request.url, config);
return {
url: request.url,
statusCode: response.status,
content: response.data,
metadata: {
headers: response.headers,
method: 'http'
},
request
};
}
catch (error) {
const scraperError = new Error(error.message);
if (error.code === 'ETIMEDOUT' || error.code === 'ECONNABORTED') {
scraperError.type = types_1.ErrorType.TIMEOUT;
}
else if (error.code === 'ECONNREFUSED' || error.code === 'ENOTFOUND') {
scraperError.type = types_1.ErrorType.NETWORK_ERROR;
}
else {
scraperError.type = types_1.ErrorType.UNKNOWN;
}
scraperError.retryable = true;
scraperError.request = request;
throw scraperError;
}
}
/**
* Browser-based fetch (for JS-heavy sites)
*/
async browserFetch(request) {
// Wait if page is in use
while (this.pageInUse) {
await new Promise(resolve => setTimeout(resolve, 100));
}
this.pageInUse = true;
try {
const page = await this.getPage();
// Apply stealth mode if required
if (request.metadata.requiresStealth) {
await this.makePageStealthy(page);
}
// Set user agent
if (request.metadata.userAgent) {
await page.setUserAgent(request.metadata.userAgent);
}
// Navigate to page
const navigationPromise = page.goto(request.url, {
waitUntil: 'domcontentloaded',
timeout: 60000
});
const response = await navigationPromise;
if (!response) {
throw new Error('Navigation failed - no response');
}
// Wait for initial render
await page.waitForTimeout(3000);
// Check for lazy-loaded content
await this.autoScroll(page);
// Get page content
const content = await page.content();
const statusCode = response.status();
return {
url: request.url,
statusCode,
content,
metadata: {
method: 'browser',
finalUrl: page.url()
},
request
};
}
catch (error) {
const scraperError = new Error(error.message);
if (error.message.includes('timeout') || error.message.includes('Navigation timeout')) {
scraperError.type = types_1.ErrorType.TIMEOUT;
}
else if (error.message.includes('net::')) {
scraperError.type = types_1.ErrorType.NETWORK_ERROR;
}
else if (error.message.includes('404')) {
scraperError.type = types_1.ErrorType.NOT_FOUND;
}
else {
scraperError.type = types_1.ErrorType.UNKNOWN;
}
scraperError.retryable = scraperError.type !== types_1.ErrorType.NOT_FOUND;
scraperError.request = request;
throw scraperError;
}
finally {
this.pageInUse = false;
}
}
/**
* Auto-scroll to load lazy content
*/
async autoScroll(page) {
try {
await page.evaluate(async () => {
await new Promise((resolve) => {
let totalHeight = 0;
const distance = 500;
const maxScrolls = 20; // Prevent infinite scrolling
let scrollCount = 0;
const timer = setInterval(() => {
// @ts-ignore - runs in browser context
const scrollHeight = document.body.scrollHeight;
// @ts-ignore - runs in browser context
window.scrollBy(0, distance);
totalHeight += distance;
scrollCount++;
if (totalHeight >= scrollHeight || scrollCount >= maxScrolls) {
clearInterval(timer);
// Scroll back to top
// @ts-ignore - runs in browser context
window.scrollTo(0, 0);
resolve();
}
}, 200);
});
});
// Wait for any lazy-loaded content
await page.waitForTimeout(1000);
}
catch (error) {
logger_1.logger.warn('scraper', `Auto-scroll failed: ${error}`);
}
}
/**
* Main fetch method - tries HTTP first, falls back to browser
*/
async fetch(request) {
const startTime = Date.now();
try {
// Force browser mode if required
if (request.metadata.requiresBrowser) {
logger_1.logger.debug('scraper', `Browser fetch: ${request.url}`);
const response = await this.browserFetch(request);
logger_1.logger.debug('scraper', `Fetch completed in ${Date.now() - startTime}ms`);
return response;
}
// Try HTTP first (faster)
try {
logger_1.logger.debug('scraper', `HTTP fetch: ${request.url}`);
const response = await this.httpFetch(request);
// Check if we got a meaningful response
if (response.statusCode && response.statusCode >= 200 && response.statusCode < 300) {
logger_1.logger.debug('scraper', `HTTP fetch succeeded in ${Date.now() - startTime}ms`);
return response;
}
// Fall through to browser mode for non-2xx responses
logger_1.logger.debug('scraper', `HTTP got ${response.statusCode || 'unknown'}, trying browser`);
}
catch (httpError) {
logger_1.logger.debug('scraper', `HTTP failed, falling back to browser: ${httpError}`);
}
// Fall back to browser
request.metadata.requiresBrowser = true;
const response = await this.browserFetch(request);
logger_1.logger.debug('scraper', `Browser fetch completed in ${Date.now() - startTime}ms`);
return response;
}
catch (error) {
logger_1.logger.error('scraper', `Fetch failed after ${Date.now() - startTime}ms: ${error}`);
throw error;
}
}
/**
* Evaluate JavaScript in the current page context
*/
async evaluate(fn) {
if (!this.page || this.page.isClosed()) {
throw new Error('No active page for evaluation');
}
return await this.page.evaluate(fn);
}
/**
* Get the current page (for custom operations)
*/
async getCurrentPage() {
return this.page;
}
/**
* Close the browser
*/
async close() {
if (this.page && !this.page.isClosed()) {
await this.page.close();
this.page = null;
}
if (this.browser && this.browser.isConnected()) {
await this.browser.close();
this.browser = null;
logger_1.logger.info('scraper', 'Browser closed');
}
}
/**
* Clean up resources
*/
async cleanup() {
await this.close();
}
}
exports.Downloader = Downloader;
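For orientation, a minimal usage sketch of the hybrid fetcher. The URL is hypothetical, and fetch() only reads the url and metadata fields of the request here, so the other Request fields are omitted:

const { Downloader } = require('./downloader');
async function fetchOnce() {
    const downloader = new Downloader();
    try {
        const response = await downloader.fetch({
            url: 'https://example.com/menu', // hypothetical
            metadata: { requiresBrowser: false }
        });
        console.log(response.statusCode, response.metadata.method); // 'http' or 'browser'
    }
    finally {
        await downloader.cleanup(); // closes the page and browser if one was launched
    }
}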

backend/dist/scraper-v2/engine.js (vendored, new file, 652 lines)

@@ -0,0 +1,652 @@
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.DutchieSpider = exports.ScraperEngine = void 0;
const scheduler_1 = require("./scheduler");
const downloader_1 = require("./downloader");
const middlewares_1 = require("./middlewares");
const pipelines_1 = require("./pipelines");
const logger_1 = require("../services/logger");
const migrate_1 = require("../db/migrate");
/**
* Main Scraper Engine - orchestrates the entire scraping process
*/
class ScraperEngine {
scheduler;
downloader;
middlewareEngine;
pipelineEngine;
stats;
isRunning = false;
concurrency = 1; // Conservative default
constructor(concurrency = 1) {
this.scheduler = new scheduler_1.RequestScheduler();
this.downloader = new downloader_1.Downloader();
this.middlewareEngine = new middlewares_1.MiddlewareEngine();
this.pipelineEngine = new pipelines_1.PipelineEngine();
this.concurrency = concurrency;
// Initialize stats
this.stats = {
requestsTotal: 0,
requestsSuccess: 0,
requestsFailed: 0,
itemsScraped: 0,
itemsSaved: 0,
itemsDropped: 0,
errorsCount: 0,
startTime: new Date()
};
// Setup middlewares
this.setupMiddlewares();
// Setup pipelines
this.setupPipelines();
}
/**
* Setup middleware chain
*/
setupMiddlewares() {
this.middlewareEngine.use(new middlewares_1.UserAgentMiddleware());
this.middlewareEngine.use(new middlewares_1.ProxyMiddleware());
this.middlewareEngine.use(new middlewares_1.RateLimitMiddleware());
this.middlewareEngine.use(new middlewares_1.RetryMiddleware());
this.middlewareEngine.use(new middlewares_1.BotDetectionMiddleware());
this.middlewareEngine.use(new middlewares_1.StealthMiddleware());
}
/**
* Setup pipeline chain
*/
setupPipelines() {
this.pipelineEngine.use(new pipelines_1.ValidationPipeline());
this.pipelineEngine.use(new pipelines_1.SanitizationPipeline());
this.pipelineEngine.use(new pipelines_1.DeduplicationPipeline());
this.pipelineEngine.use(new pipelines_1.ImagePipeline());
this.pipelineEngine.use(new pipelines_1.StatsPipeline());
this.pipelineEngine.use(new pipelines_1.DatabasePipeline());
}
/**
* Add a request to the queue
*/
enqueue(request) {
this.scheduler.enqueue(request);
}
/**
* Start the scraping engine
*/
async start() {
if (this.isRunning) {
logger_1.logger.warn('scraper', 'Engine is already running');
return;
}
this.isRunning = true;
this.stats.startTime = new Date();
logger_1.logger.info('scraper', `🚀 Starting scraper engine (concurrency: ${this.concurrency})`);
// Process queue
await this.processQueue();
this.isRunning = false;
this.stats.endTime = new Date();
this.stats.duration = this.stats.endTime.getTime() - this.stats.startTime.getTime();
logger_1.logger.info('scraper', `✅ Scraper engine finished`);
this.logStats();
// Cleanup
await this.downloader.cleanup();
}
/**
* Process the request queue
*/
async processQueue() {
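// Note: requests are drained one at a time here; parallelism comes from
// running multiple engines (see DutchieSpider.scrapeMultipleCategoriesParallel)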
while (!this.scheduler.isEmpty() && this.isRunning) {
const request = this.scheduler.dequeue();
if (!request) {
// Wait a bit and check again
await new Promise(resolve => setTimeout(resolve, 100));
continue;
}
try {
await this.processRequest(request);
}
catch (error) {
logger_1.logger.error('scraper', `Failed to process request: ${error}`);
}
}
}
/**
* Process a single request
*/
async processRequest(request) {
this.stats.requestsTotal++;
try {
logger_1.logger.debug('scraper', `Processing: ${request.url}`);
// Apply request middlewares
const processedRequest = await this.middlewareEngine.processRequest(request);
// Download
let response = await this.downloader.fetch(processedRequest);
// Apply response middlewares
response = await this.middlewareEngine.processResponse(response);
// Parse response using callback
const parseResult = await request.callback(response);
// Process items through pipeline
if (parseResult.items && parseResult.items.length > 0) {
for (const item of parseResult.items) {
await this.processItem(item, 'default');
}
}
// Enqueue follow-up requests
if (parseResult.requests && parseResult.requests.length > 0) {
for (const followUpRequest of parseResult.requests) {
this.scheduler.enqueue(followUpRequest);
}
}
this.stats.requestsSuccess++;
this.scheduler.markComplete(request);
}
catch (error) {
this.stats.requestsFailed++;
this.stats.errorsCount++;
logger_1.logger.error('scraper', `Request failed: ${request.url} - ${error.message}`);
// Apply error middlewares
const handledError = await this.middlewareEngine.processError(error, request);
// If error is null, it was handled (e.g., retry)
if (handledError === null) {
this.scheduler.requeueForRetry(request);
}
else {
this.scheduler.markComplete(request);
// Call error handler if provided
if (request.errorHandler) {
await request.errorHandler(error, request);
}
}
}
}
/**
* Process an item through pipelines
*/
async processItem(item, spider) {
this.stats.itemsScraped++;
try {
const processedItem = await this.pipelineEngine.processItem(item, spider);
if (processedItem) {
this.stats.itemsSaved++;
}
else {
this.stats.itemsDropped++;
}
}
catch (error) {
logger_1.logger.error('scraper', `Failed to process item: ${error}`);
this.stats.itemsDropped++;
}
}
/**
* Log statistics
*/
logStats() {
logger_1.logger.info('scraper', '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
logger_1.logger.info('scraper', '📊 Scraper Statistics');
logger_1.logger.info('scraper', '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
logger_1.logger.info('scraper', ` Requests: ${this.stats.requestsSuccess}/${this.stats.requestsTotal} successful`);
logger_1.logger.info('scraper', ` Items: ${this.stats.itemsSaved} saved, ${this.stats.itemsDropped} dropped`);
logger_1.logger.info('scraper', ` Errors: ${this.stats.errorsCount}`);
logger_1.logger.info('scraper', ` Duration: ${Math.round((this.stats.duration || 0) / 1000)}s`);
// Get stats from StatsPipeline
const statsPipeline = this.pipelineEngine.getPipeline('StatsPipeline');
if (statsPipeline) {
const itemStats = statsPipeline.getStats();
logger_1.logger.info('scraper', ` Items with images: ${itemStats.withImages}/${itemStats.total}`);
logger_1.logger.info('scraper', ` Items with THC: ${itemStats.withThc}/${itemStats.total}`);
logger_1.logger.info('scraper', ` Items with descriptions: ${itemStats.withDescription}/${itemStats.total}`);
}
logger_1.logger.info('scraper', '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
}
/**
* Stop the engine
*/
stop() {
this.isRunning = false;
logger_1.logger.info('scraper', 'Stopping scraper engine...');
}
/**
* Get current stats
*/
getStats() {
return { ...this.stats };
}
/**
* Get queue stats
*/
getQueueStats() {
return this.scheduler.getStats();
}
}
exports.ScraperEngine = ScraperEngine;
/**
* Spider for scraping Dutchie categories
*/
class DutchieSpider {
engine;
constructor(engine) {
this.engine = engine;
}
/**
* Scrape a category
*/
async scrapeCategory(storeId, categoryId) {
logger_1.logger.info('scraper', `Starting category scrape: store=${storeId}, category=${categoryId}`);
const scraperId = `scraper-${storeId}-${categoryId}-${Date.now()}`;
let registerScraper, updateScraperStats, completeScraper;
try {
// Import monitoring functions
const monitor = await Promise.resolve().then(() => __importStar(require('../routes/scraper-monitor')));
registerScraper = monitor.registerScraper;
updateScraperStats = monitor.updateScraperStats;
completeScraper = monitor.completeScraper;
}
catch (e) {
// Monitoring not available
}
try {
// Get category info
const categoryResult = await migrate_1.pool.query(`
SELECT c.*, s.slug as store_slug, s.name as store_name
FROM categories c
JOIN stores s ON c.store_id = s.id
WHERE c.id = $1
`, [categoryId]);
if (categoryResult.rows.length === 0) {
throw new Error('Category not found');
}
const category = categoryResult.rows[0];
logger_1.logger.info('scraper', `Category: ${category.name} (${category.dutchie_url})`);
// Register with monitoring system
if (registerScraper) {
registerScraper(scraperId, storeId, category.store_name, categoryId, category.name);
}
// Mark products as out of stock before scraping
await migrate_1.pool.query(`
UPDATE products
SET in_stock = false
WHERE store_id = $1 AND category_id = $2
`, [storeId, categoryId]);
if (updateScraperStats) {
updateScraperStats(scraperId, {}, 'Marking products as out of stock');
}
// Enqueue category page request
this.engine.enqueue({
url: category.dutchie_url,
priority: 100,
maxRetries: 3,
metadata: {
requiresBrowser: true,
storeId,
categoryId,
categorySlug: category.slug,
storeSlug: category.store_slug
},
callback: this.parseCategoryPage.bind(this)
});
// Start the engine
if (updateScraperStats) {
updateScraperStats(scraperId, {}, 'Scraping category page');
}
await this.engine.start();
// Update stats from engine
const engineStats = this.engine.getStats();
if (updateScraperStats) {
updateScraperStats(scraperId, {
requestsTotal: engineStats.requestsTotal,
requestsSuccess: engineStats.requestsSuccess,
itemsSaved: engineStats.itemsSaved,
itemsDropped: engineStats.itemsDropped,
errorsCount: engineStats.errorsCount
}, 'Finalizing');
}
// Update category last_scraped_at
await migrate_1.pool.query(`
UPDATE categories
SET last_scraped_at = CURRENT_TIMESTAMP
WHERE id = $1
`, [categoryId]);
logger_1.logger.info('scraper', `✅ Category scrape completed: ${category.name}`);
if (completeScraper) {
completeScraper(scraperId);
}
}
catch (error) {
logger_1.logger.error('scraper', `Category scrape failed: ${error}`);
if (completeScraper) {
completeScraper(scraperId, error.toString());
}
throw error;
}
}
/**
* Parse category page (product listing)
*/
async parseCategoryPage(response) {
const page = await this.engine['downloader'].getCurrentPage();
if (!page) {
throw new Error('No active page');
}
logger_1.logger.info('scraper', 'Parsing category page...');
// Extract product cards
const productCards = await page.evaluate(() => {
// @ts-ignore - runs in browser context
const cards = document.querySelectorAll('[data-testid="product-list-item"]');
const items = [];
cards.forEach((card) => {
try {
const allText = card.textContent || '';
// Extract name
let name = '';
const nameSelectors = ['a[href*="/product/"]', 'h1', 'h2', 'h3', 'h4'];
for (const sel of nameSelectors) {
const el = card.querySelector(sel);
if (el?.textContent?.trim()) {
name = el.textContent.trim().split('\n')[0].trim();
break;
}
}
if (!name || name.length < 2)
return;
// Extract price
let price = null;
let originalPrice = null;
const priceMatches = allText.match(/\$(\d+\.?\d*)/g);
if (priceMatches && priceMatches.length > 0) {
price = parseFloat(priceMatches[0].replace('$', ''));
if (priceMatches.length > 1) {
originalPrice = parseFloat(priceMatches[1].replace('$', ''));
}
}
// Extract link
const linkEl = card.querySelector('a[href*="/product/"]');
let href = linkEl?.getAttribute('href') || '';
if (href && href.startsWith('/')) {
// @ts-ignore - runs in browser context
href = window.location.origin + href;
}
items.push({ name, price, originalPrice, href });
}
catch (err) {
console.error('Error parsing product card:', err);
}
});
return items;
});
logger_1.logger.info('scraper', `Found ${productCards.length} products on listing page`);
// Create follow-up requests for each product
const requests = productCards.map((card, index) => ({
url: card.href,
priority: 50,
maxRetries: 3,
metadata: {
...response.request.metadata,
productName: card.name,
productPrice: card.price,
productOriginalPrice: card.originalPrice,
requiresBrowser: true
},
callback: this.parseProductPage.bind(this)
}));
return { items: [], requests };
}
/**
* Parse individual product page
*/
async parseProductPage(response) {
const page = await this.engine['downloader'].getCurrentPage();
if (!page) {
throw new Error('No active page');
}
const productName = response.request.metadata.productName;
logger_1.logger.debug('scraper', `Parsing product: ${productName}`);
// Extract product details
const details = await page.evaluate(() => {
// @ts-ignore - runs in browser context
const allText = document.body.textContent || '';
// Extract image
let fullSizeImage = null;
const mainImageSelectors = [
'img[class*="ProductImage"]',
'img[class*="product-image"]',
'[class*="ImageGallery"] img',
'main img',
'img[src*="images.dutchie.com"]'
];
for (const sel of mainImageSelectors) {
// @ts-ignore - runs in browser context
const img = document.querySelector(sel);
if (img?.src && img.src.includes('dutchie.com')) {
fullSizeImage = img.src;
break;
}
}
// Extract description
let description = '';
const descSelectors = [
'[class*="description"]',
'[class*="Description"]',
'[data-testid*="description"]',
'p[class*="product"]'
];
for (const sel of descSelectors) {
// @ts-ignore - runs in browser context
const el = document.querySelector(sel);
if (el?.textContent?.trim() && el.textContent.length > 20) {
description = el.textContent.trim();
break;
}
}
// Extract THC/CBD
let thc = null;
const thcPatterns = [
/THC[:\s]*(\d+\.?\d*)\s*%/i,
/Total\s+THC[:\s]*(\d+\.?\d*)\s*%/i,
/(\d+\.?\d*)\s*%\s+THC/i
];
for (const pattern of thcPatterns) {
const match = allText.match(pattern);
if (match) {
thc = parseFloat(match[1]);
break;
}
}
let cbd = null;
const cbdPatterns = [
/CBD[:\s]*(\d+\.?\d*)\s*%/i,
/Total\s+CBD[:\s]*(\d+\.?\d*)\s*%/i,
/(\d+\.?\d*)\s*%\s+CBD/i
];
for (const pattern of cbdPatterns) {
const match = allText.match(pattern);
if (match) {
cbd = parseFloat(match[1]);
break;
}
}
// Extract strain type
let strainType = null;
if (allText.match(/\bindica\b/i))
strainType = 'Indica';
else if (allText.match(/\bsativa\b/i))
strainType = 'Sativa';
else if (allText.match(/\bhybrid\b/i))
strainType = 'Hybrid';
// Extract brand
let brand = null;
const brandSelectors = [
'[class*="brand"]',
'[class*="Brand"]',
'[data-testid*="brand"]'
];
for (const sel of brandSelectors) {
// @ts-ignore - runs in browser context
const el = document.querySelector(sel);
if (el?.textContent?.trim()) {
brand = el.textContent.trim();
break;
}
}
// Extract metadata
const terpenes = [];
const terpeneNames = ['Myrcene', 'Limonene', 'Caryophyllene', 'Pinene', 'Linalool', 'Humulene'];
terpeneNames.forEach(terp => {
if (allText.match(new RegExp(`\\b${terp}\\b`, 'i'))) {
terpenes.push(terp);
}
});
const effects = [];
const effectNames = ['Relaxed', 'Happy', 'Euphoric', 'Uplifted', 'Creative', 'Energetic'];
effectNames.forEach(effect => {
if (allText.match(new RegExp(`\\b${effect}\\b`, 'i'))) {
effects.push(effect);
}
});
return {
fullSizeImage,
description,
thc,
cbd,
strainType,
brand,
terpenes,
effects
};
});
// Create product item
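// Note: this generated id is unique per scrape run; DatabasePipeline matches
// existing rows by (store_id, name, category_id) rather than by this value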
const product = {
dutchieProductId: `${response.request.metadata.storeSlug}-${response.request.metadata.categorySlug}-${Date.now()}-${Math.random()}`,
name: productName || 'Unknown Product',
description: details.description,
price: response.request.metadata.productPrice,
originalPrice: response.request.metadata.productOriginalPrice,
thcPercentage: details.thc || undefined,
cbdPercentage: details.cbd || undefined,
strainType: details.strainType || undefined,
brand: details.brand || undefined,
imageUrl: details.fullSizeImage || undefined,
dutchieUrl: response.url,
metadata: {
terpenes: details.terpenes,
effects: details.effects
},
storeId: response.request.metadata.storeId,
categoryId: response.request.metadata.categoryId
};
return { items: [product], requests: [] };
}
/**
* Scrape entire store
*/
async scrapeStore(storeId, parallel = 3) {
logger_1.logger.info('scraper', `🏪 Starting store scrape: ${storeId} (${parallel} parallel scrapers)`);
try {
// Get all leaf categories (no children)
const categoriesResult = await migrate_1.pool.query(`
SELECT c.id, c.name
FROM categories c
WHERE c.store_id = $1
AND c.scrape_enabled = true
AND NOT EXISTS (
SELECT 1 FROM categories child
WHERE child.parent_id = c.id
)
ORDER BY c.name
`, [storeId]);
const categories = categoriesResult.rows;
logger_1.logger.info('scraper', `Found ${categories.length} categories to scrape`);
if (parallel === 1) {
// Sequential scraping (original behavior)
for (const category of categories) {
try {
await this.scrapeCategory(storeId, category.id);
await new Promise(resolve => setTimeout(resolve, 3000));
}
catch (error) {
logger_1.logger.error('scraper', `Failed to scrape category ${category.name}: ${error}`);
}
}
}
else {
// Parallel scraping with concurrency limit
const results = await this.scrapeMultipleCategoriesParallel(storeId, categories, parallel);
const successful = results.filter(r => r.status === 'fulfilled').length;
const failed = results.filter(r => r.status === 'rejected').length;
logger_1.logger.info('scraper', `Parallel scrape results: ${successful} successful, ${failed} failed`);
}
// Update store last_scraped_at
await migrate_1.pool.query(`
UPDATE stores
SET last_scraped_at = CURRENT_TIMESTAMP
WHERE id = $1
`, [storeId]);
logger_1.logger.info('scraper', `🎉 Store scrape completed: ${storeId}`);
}
catch (error) {
logger_1.logger.error('scraper', `Store scrape failed: ${error}`);
throw error;
}
}
/**
* Scrape multiple categories in parallel with concurrency limit
*/
async scrapeMultipleCategoriesParallel(storeId, categories, concurrency) {
const results = [];
// Process categories in batches
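// e.g., 10 categories at concurrency 3 run as batches of 3, 3, 3, 1 with a 5s pause between batches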
for (let i = 0; i < categories.length; i += concurrency) {
const batch = categories.slice(i, i + concurrency);
logger_1.logger.info('scraper', `Scraping batch ${Math.floor(i / concurrency) + 1}: ${batch.map(c => c.name).join(', ')}`);
const batchPromises = batch.map(category => {
// Create a new spider instance for each category
const engine = new ScraperEngine(1); // 1 concurrent request per spider
const spider = new DutchieSpider(engine);
return spider.scrapeCategory(storeId, category.id)
.catch(error => {
logger_1.logger.error('scraper', `Category ${category.name} failed: ${error}`);
throw error;
});
});
const batchResults = await Promise.allSettled(batchPromises);
results.push(...batchResults);
// Delay between batches to avoid overwhelming the server
if (i + concurrency < categories.length) {
logger_1.logger.info('scraper', 'Waiting 5s before next batch...');
await new Promise(resolve => setTimeout(resolve, 5000));
}
}
return results;
}
}
exports.DutchieSpider = DutchieSpider;
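A minimal sketch of driving the engine directly, without the spider (the URL is hypothetical; assumes the database and logger used by the default middlewares and pipelines are configured):

const { ScraperEngine } = require('./engine');
async function crawl() {
    const engine = new ScraperEngine(1);
    engine.enqueue({
        url: 'https://example.com/shop', // hypothetical
        priority: 100,
        maxRetries: 3,
        metadata: {},
        callback: async (response) => {
            console.log(`fetched ${response.url} (${response.statusCode})`);
            return { items: [], requests: [] }; // items feed the pipelines; requests are re-enqueued
        }
    });
    await engine.start(); // drains the queue, logs stats, closes the browser
}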

backend/dist/scraper-v2/index.js (vendored, new file, 108 lines)

@@ -0,0 +1,108 @@
"use strict";
/**
* Scraper V2 - Scrapy-inspired web scraping framework
*
* Architecture:
* - Engine: Main orchestrator
* - Scheduler: Priority queue with deduplication
* - Downloader: HTTP + Browser hybrid fetcher
* - Middlewares: Request/response processing chain
* - Pipelines: Item processing and persistence
* - Navigation: Category discovery
*/
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __exportStar = (this && this.__exportStar) || function(m, exports) {
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.StatsPipeline = exports.DatabasePipeline = exports.ImagePipeline = exports.DeduplicationPipeline = exports.SanitizationPipeline = exports.ValidationPipeline = exports.PipelineEngine = exports.StealthMiddleware = exports.BotDetectionMiddleware = exports.RetryMiddleware = exports.RateLimitMiddleware = exports.ProxyMiddleware = exports.UserAgentMiddleware = exports.MiddlewareEngine = exports.NavigationDiscovery = exports.Downloader = exports.RequestScheduler = exports.DutchieSpider = exports.ScraperEngine = void 0;
exports.scrapeCategory = scrapeCategory;
exports.scrapeStore = scrapeStore;
exports.discoverCategories = discoverCategories;
var engine_1 = require("./engine");
Object.defineProperty(exports, "ScraperEngine", { enumerable: true, get: function () { return engine_1.ScraperEngine; } });
Object.defineProperty(exports, "DutchieSpider", { enumerable: true, get: function () { return engine_1.DutchieSpider; } });
var scheduler_1 = require("./scheduler");
Object.defineProperty(exports, "RequestScheduler", { enumerable: true, get: function () { return scheduler_1.RequestScheduler; } });
var downloader_1 = require("./downloader");
Object.defineProperty(exports, "Downloader", { enumerable: true, get: function () { return downloader_1.Downloader; } });
var navigation_1 = require("./navigation");
Object.defineProperty(exports, "NavigationDiscovery", { enumerable: true, get: function () { return navigation_1.NavigationDiscovery; } });
var middlewares_1 = require("./middlewares");
Object.defineProperty(exports, "MiddlewareEngine", { enumerable: true, get: function () { return middlewares_1.MiddlewareEngine; } });
Object.defineProperty(exports, "UserAgentMiddleware", { enumerable: true, get: function () { return middlewares_1.UserAgentMiddleware; } });
Object.defineProperty(exports, "ProxyMiddleware", { enumerable: true, get: function () { return middlewares_1.ProxyMiddleware; } });
Object.defineProperty(exports, "RateLimitMiddleware", { enumerable: true, get: function () { return middlewares_1.RateLimitMiddleware; } });
Object.defineProperty(exports, "RetryMiddleware", { enumerable: true, get: function () { return middlewares_1.RetryMiddleware; } });
Object.defineProperty(exports, "BotDetectionMiddleware", { enumerable: true, get: function () { return middlewares_1.BotDetectionMiddleware; } });
Object.defineProperty(exports, "StealthMiddleware", { enumerable: true, get: function () { return middlewares_1.StealthMiddleware; } });
var pipelines_1 = require("./pipelines");
Object.defineProperty(exports, "PipelineEngine", { enumerable: true, get: function () { return pipelines_1.PipelineEngine; } });
Object.defineProperty(exports, "ValidationPipeline", { enumerable: true, get: function () { return pipelines_1.ValidationPipeline; } });
Object.defineProperty(exports, "SanitizationPipeline", { enumerable: true, get: function () { return pipelines_1.SanitizationPipeline; } });
Object.defineProperty(exports, "DeduplicationPipeline", { enumerable: true, get: function () { return pipelines_1.DeduplicationPipeline; } });
Object.defineProperty(exports, "ImagePipeline", { enumerable: true, get: function () { return pipelines_1.ImagePipeline; } });
Object.defineProperty(exports, "DatabasePipeline", { enumerable: true, get: function () { return pipelines_1.DatabasePipeline; } });
Object.defineProperty(exports, "StatsPipeline", { enumerable: true, get: function () { return pipelines_1.StatsPipeline; } });
__exportStar(require("./types"), exports);
// Main API functions
const engine_2 = require("./engine");
const navigation_2 = require("./navigation");
const downloader_2 = require("./downloader");
const logger_1 = require("../services/logger");
/**
* Scrape a single category
*/
async function scrapeCategory(storeId, categoryId) {
const engine = new engine_2.ScraperEngine(1);
const spider = new engine_2.DutchieSpider(engine);
try {
await spider.scrapeCategory(storeId, categoryId);
}
catch (error) {
logger_1.logger.error('scraper', `scrapeCategory failed: ${error}`);
throw error;
}
}
/**
* Scrape an entire store
*/
async function scrapeStore(storeId, parallel = 3) {
const engine = new engine_2.ScraperEngine(1);
const spider = new engine_2.DutchieSpider(engine);
try {
await spider.scrapeStore(storeId, parallel);
}
catch (error) {
logger_1.logger.error('scraper', `scrapeStore failed: ${error}`);
throw error;
}
}
/**
* Discover categories for a store
*/
async function discoverCategories(storeId) {
const downloader = new downloader_2.Downloader();
const discovery = new navigation_2.NavigationDiscovery(downloader);
try {
// Discover categories (uses the predefined Dutchie category structure)
await discovery.discoverCategories(storeId);
}
catch (error) {
logger_1.logger.error('scraper', `discoverCategories failed: ${error}`);
throw error;
}
finally {
await downloader.cleanup();
}
}
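Putting the exported API together, a nightly job might look like this sketch (storeId is a hypothetical database id):

const { discoverCategories, scrapeStore } = require('./scraper-v2'); // from backend/dist
async function nightly(storeId) {
    await discoverCategories(storeId); // build or refresh the category tree
    await scrapeStore(storeId, 3); // then scrape three categories at a time
}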

backend/dist/scraper-v2/middlewares.js (vendored, new file, 263 lines)

@@ -0,0 +1,263 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.MiddlewareEngine = exports.StealthMiddleware = exports.BotDetectionMiddleware = exports.RetryMiddleware = exports.RateLimitMiddleware = exports.ProxyMiddleware = exports.UserAgentMiddleware = void 0;
const types_1 = require("./types");
const logger_1 = require("../services/logger");
const migrate_1 = require("../db/migrate");
const USER_AGENTS = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15'
];
function getRandomUserAgent() {
return USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)];
}
function sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}
/**
* User Agent Rotation Middleware
*/
class UserAgentMiddleware {
name = 'UserAgentMiddleware';
priority = 100;
async processRequest(request) {
if (!request.metadata.userAgent) {
request.metadata.userAgent = getRandomUserAgent();
}
return request;
}
}
exports.UserAgentMiddleware = UserAgentMiddleware;
/**
* Proxy Rotation Middleware
*/
class ProxyMiddleware {
name = 'ProxyMiddleware';
priority = 90;
async getActiveProxy() {
try {
const result = await migrate_1.pool.query(`
SELECT host, port, protocol, username, password
FROM proxies
WHERE active = true AND is_anonymous = true
ORDER BY RANDOM()
LIMIT 1
`);
if (result.rows.length === 0) {
return null;
}
return result.rows[0];
}
catch (error) {
logger_1.logger.error('scraper', `Failed to get proxy: ${error}`);
return null;
}
}
async processRequest(request) {
// Only add proxy if not already set
if (!request.metadata.proxy && request.retryCount > 0) {
// Use proxy on retries
request.metadata.proxy = await this.getActiveProxy();
if (request.metadata.proxy) {
logger_1.logger.debug('scraper', `Using proxy for retry: ${request.metadata.proxy.host}:${request.metadata.proxy.port}`);
}
}
return request;
}
}
exports.ProxyMiddleware = ProxyMiddleware;
/**
* Rate Limiting Middleware with Adaptive Delays
*/
class RateLimitMiddleware {
name = 'RateLimitMiddleware';
priority = 80;
requestTimes = [];
errorCount = 0;
baseDelay = 2000; // 2 seconds base delay
maxDelay = 30000; // 30 seconds max
async processRequest(request) {
await this.waitForNextRequest();
return request;
}
async processResponse(response) {
// Record success - gradually reduce error count
this.errorCount = Math.max(0, this.errorCount - 1);
return response;
}
async processError(error) {
// Record error - increase delay
this.errorCount++;
return error;
}
async waitForNextRequest() {
// Calculate adaptive delay based on error count
const errorMultiplier = Math.pow(1.5, Math.min(this.errorCount, 5));
const adaptiveDelay = Math.min(this.baseDelay * errorMultiplier, this.maxDelay);
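// e.g., errorCount = 3 gives multiplier 1.5^3 = 3.375, so adaptiveDelay = min(6750, 30000) = 6750ms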
// Add random jitter (±20%)
const jitter = (Math.random() - 0.5) * 0.4 * adaptiveDelay;
const delay = adaptiveDelay + jitter;
const now = Date.now();
const lastRequest = this.requestTimes[this.requestTimes.length - 1] || 0;
const timeSinceLast = now - lastRequest;
if (timeSinceLast < delay) {
const waitTime = delay - timeSinceLast;
logger_1.logger.debug('scraper', `Rate limiting: waiting ${Math.round(waitTime)}ms`);
await sleep(waitTime);
}
this.requestTimes.push(Date.now());
this.cleanup();
}
cleanup() {
// Keep only last minute of requests
const cutoff = Date.now() - 60000;
this.requestTimes = this.requestTimes.filter(t => t > cutoff);
}
setBaseDelay(ms) {
this.baseDelay = ms;
}
}
exports.RateLimitMiddleware = RateLimitMiddleware;
/**
* Retry Middleware with Exponential Backoff
*/
class RetryMiddleware {
name = 'RetryMiddleware';
priority = 70;
isRetryable(error) {
const retryableErrors = [
types_1.ErrorType.NETWORK_ERROR,
types_1.ErrorType.TIMEOUT,
types_1.ErrorType.SERVER_ERROR
];
if ('type' in error) {
return retryableErrors.includes(error.type);
}
// Check error message for common retryable patterns
const message = error.message.toLowerCase();
return (message.includes('timeout') ||
message.includes('network') ||
message.includes('econnreset') ||
message.includes('econnrefused') ||
message.includes('500') ||
message.includes('502') ||
message.includes('503'));
}
async processError(error, request) {
if (!this.isRetryable(error)) {
logger_1.logger.warn('scraper', `Non-retryable error for ${request.url}: ${error.message}`);
return error;
}
if (request.retryCount < request.maxRetries) {
// Calculate backoff delay
const backoffDelay = Math.min(1000 * Math.pow(2, request.retryCount), 30000);
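// e.g., retryCount 0 waits 1s, 1 waits 2s, 2 waits 4s, 3 waits 8s, capped at 30s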
logger_1.logger.info('scraper', `Retry ${request.retryCount + 1}/${request.maxRetries} for ${request.url} after ${backoffDelay}ms`);
await sleep(backoffDelay);
// Return null to indicate retry should happen
return null;
}
logger_1.logger.error('scraper', `Max retries exceeded for ${request.url}`);
return error;
}
}
exports.RetryMiddleware = RetryMiddleware;
/**
* Bot Detection Middleware
*/
class BotDetectionMiddleware {
name = 'BotDetectionMiddleware';
priority = 60;
detectedCount = 0;
DETECTION_THRESHOLD = 3;
async processResponse(response) {
const content = typeof response.content === 'string'
? response.content
: JSON.stringify(response.content);
// Check for bot detection indicators
const botIndicators = [
/captcha/i,
/cloudflare/i,
/access denied/i,
/you have been blocked/i,
/unusual traffic/i,
/robot/i
];
const detected = botIndicators.some(pattern => pattern.test(content));
if (detected) {
this.detectedCount++;
logger_1.logger.warn('scraper', `Bot detection suspected (${this.detectedCount}/${this.DETECTION_THRESHOLD}): ${response.url}`);
if (this.detectedCount >= this.DETECTION_THRESHOLD) {
const error = new Error('Bot detection threshold reached');
error.type = types_1.ErrorType.BOT_DETECTION;
error.retryable = true;
error.request = response.request;
throw error;
}
}
else {
// Gradually decrease detection count on successful requests
this.detectedCount = Math.max(0, this.detectedCount - 0.5);
}
return response;
}
}
exports.BotDetectionMiddleware = BotDetectionMiddleware;
/**
* Stealth Mode Middleware
*/
class StealthMiddleware {
name = 'StealthMiddleware';
priority = 95;
async processRequest(request) {
// Flag that this request needs stealth mode
request.metadata.requiresStealth = true;
return request;
}
}
exports.StealthMiddleware = StealthMiddleware;
/**
* Middleware Engine to orchestrate all middlewares
*/
class MiddlewareEngine {
middlewares = [];
use(middleware) {
this.middlewares.push(middleware);
// Sort by priority (higher first)
this.middlewares.sort((a, b) => b.priority - a.priority);
}
async processRequest(request) {
let current = request;
for (const middleware of this.middlewares) {
if (middleware.processRequest) {
current = await middleware.processRequest(current);
}
}
return current;
}
async processResponse(response) {
let current = response;
for (const middleware of this.middlewares) {
if (middleware.processResponse) {
current = await middleware.processResponse(current);
}
}
return current;
}
async processError(error, request) {
let currentError = error;
for (const middleware of this.middlewares) {
if (middleware.processError && currentError) {
currentError = await middleware.processError(currentError, request);
if (currentError === null) {
// Middleware handled the error (e.g., retry)
break;
}
}
}
return currentError;
}
}
exports.MiddlewareEngine = MiddlewareEngine;
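Extending the chain is a matter of implementing the middleware shape; a hypothetical sketch (HeaderMiddleware and its priority are illustrative, not part of this commit):

const { MiddlewareEngine } = require('./middlewares');
class HeaderMiddleware {
    name = 'HeaderMiddleware';
    priority = 85; // runs after Proxy (90), before RateLimit (80)
    async processRequest(request) {
        request.metadata.headers = { ...request.metadata.headers, 'Accept-Language': 'en-US' };
        return request;
    }
}
const middlewares = new MiddlewareEngine();
middlewares.use(new HeaderMiddleware()); // use() re-sorts the chain by priority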

backend/dist/scraper-v2/navigation.js (vendored, new file, 278 lines)

@@ -0,0 +1,278 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.NavigationDiscovery = void 0;
const migrate_1 = require("../db/migrate");
const logger_1 = require("../services/logger");
/**
* Navigation Discovery - finds and builds category structure
*/
class NavigationDiscovery {
downloader;
constructor(downloader) {
this.downloader = downloader;
}
/**
* Discover categories from a store's main page
*/
async discoverCategories(storeId) {
logger_1.logger.info('categories', `Starting category discovery for store ${storeId}`);
try {
// Get store info
const storeResult = await migrate_1.pool.query(`
SELECT id, name, slug, dutchie_url
FROM stores
WHERE id = $1
`, [storeId]);
if (storeResult.rows.length === 0) {
throw new Error('Store not found');
}
const store = storeResult.rows[0];
const baseUrl = store.dutchie_url;
// Create request to fetch the main page
const request = {
url: baseUrl,
priority: 100,
retryCount: 0,
maxRetries: 3,
metadata: {
requiresBrowser: true,
requiresStealth: true
},
callback: async () => ({ items: [], requests: [] })
};
// Fetch the page
const response = await this.downloader.fetch(request);
// Extract navigation links
const page = await this.downloader.getCurrentPage();
if (!page) {
throw new Error('No active page for navigation extraction');
}
const links = await this.extractNavigationLinks(page, baseUrl);
logger_1.logger.info('categories', `Found ${links.length} navigation links`);
// Check if it's a Dutchie menu
const isDutchie = await this.isDutchieMenu(page);
if (isDutchie) {
logger_1.logger.info('categories', 'Detected Dutchie menu - using predefined structure');
await this.createDutchieCategories(storeId, store, links);
}
else {
logger_1.logger.info('categories', 'Custom menu detected - extracting from navigation');
await this.createCustomCategories(storeId, store, links);
}
logger_1.logger.info('categories', `✅ Category discovery completed for ${store.name}`);
}
catch (error) {
logger_1.logger.error('categories', `Category discovery failed: ${error}`);
throw error;
}
}
/**
* Extract navigation links from page
*/
async extractNavigationLinks(page, baseUrl) {
return await page.evaluate((base) => {
const links = [];
// Look for navigation elements
const navSelectors = [
'nav a',
'[role="navigation"] a',
'[class*="nav"] a',
'[class*="menu"] a',
'[class*="category"] a',
'header a'
];
const foundLinks = new Set();
for (const selector of navSelectors) {
// @ts-ignore - runs in browser context
const elements = document.querySelectorAll(selector);
elements.forEach((el) => {
const text = el.textContent?.trim();
let href = el.href || el.getAttribute('href');
if (!text || !href || text.length < 2)
return;
// Normalize href
if (href.startsWith('/')) {
// @ts-ignore - runs in browser context
const url = new URL(base);
href = `${url.origin}${href}`;
}
// Skip external links and anchors
if (!href.includes(base) || href.includes('#'))
return;
// Skip duplicates
const linkKey = `${text}:${href}`;
if (foundLinks.has(linkKey))
return;
foundLinks.add(linkKey);
// Determine if it's likely a category
const categoryKeywords = [
'flower', 'pre-roll', 'vape', 'edible', 'concentrate',
'topical', 'accessory', 'brand', 'special', 'shop',
'indica', 'sativa', 'hybrid', 'cbd', 'thc'
];
const isCategory = categoryKeywords.some(kw => text.toLowerCase().includes(kw) ||
href.toLowerCase().includes(kw));
links.push({
text,
href,
isCategory
});
});
}
return links;
}, baseUrl);
}
/**
* Check if it's a Dutchie menu
*/
async isDutchieMenu(page) {
return await page.evaluate(() => {
// Check for Dutchie markers
// @ts-ignore - runs in browser context
if (window.reactEnv) {
// @ts-ignore - runs in browser context
const env = window.reactEnv;
if (env.adminUrl?.includes('dutchie.com') ||
env.apiUrl?.includes('dutchie.com') ||
env.consumerUrl?.includes('dutchie.com')) {
return true;
}
}
// @ts-ignore - runs in browser context
const htmlContent = document.documentElement.innerHTML;
return (htmlContent.includes('admin.dutchie.com') ||
htmlContent.includes('api.dutchie.com') ||
htmlContent.includes('embedded-menu') ||
htmlContent.includes('window.reactEnv'));
});
}
/**
* Create categories for Dutchie menus (predefined structure)
* Uses the predefined Dutchie category structure
*/
async createDutchieCategories(storeId, store, discoveredLinks) {
const client = await migrate_1.pool.connect();
try {
await client.query('BEGIN');
logger_1.logger.info('categories', `Creating predefined Dutchie category structure`);
const baseUrl = store.dutchie_url;
// Predefined Dutchie category structure
const DUTCHIE_CATEGORIES = [
{ name: 'Shop', slug: 'shop', parentSlug: undefined },
{ name: 'Flower', slug: 'flower', parentSlug: 'shop' },
{ name: 'Pre-Rolls', slug: 'pre-rolls', parentSlug: 'shop' },
{ name: 'Vaporizers', slug: 'vaporizers', parentSlug: 'shop' },
{ name: 'Concentrates', slug: 'concentrates', parentSlug: 'shop' },
{ name: 'Edibles', slug: 'edibles', parentSlug: 'shop' },
{ name: 'Topicals', slug: 'topicals', parentSlug: 'shop' },
{ name: 'Accessories', slug: 'accessories', parentSlug: 'shop' },
{ name: 'Brands', slug: 'brands', parentSlug: undefined },
{ name: 'Specials', slug: 'specials', parentSlug: undefined }
];
for (const category of DUTCHIE_CATEGORIES) {
let categoryUrl;
if (category.parentSlug) {
// Subcategory: /embedded-menu/{slug}/shop/flower
categoryUrl = `${baseUrl}/${category.parentSlug}/${category.slug}`;
}
else {
// Top-level: /embedded-menu/{slug}/shop
categoryUrl = `${baseUrl}/${category.slug}`;
}
const path = category.parentSlug ? `${category.parentSlug}/${category.slug}` : category.slug;
if (!category.parentSlug) {
// Create parent category
await client.query(`
INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, parent_id)
VALUES ($1, $2, $3, $4, $5, true, NULL)
ON CONFLICT (store_id, slug)
DO UPDATE SET name = $2, dutchie_url = $4, path = $5
RETURNING id
`, [storeId, category.name, category.slug, categoryUrl, path]);
logger_1.logger.info('categories', `📁 ${category.name}`);
}
else {
// Create subcategory
const parentResult = await client.query(`
SELECT id FROM categories
WHERE store_id = $1 AND slug = $2
`, [storeId, category.parentSlug]);
if (parentResult.rows.length > 0) {
const parentId = parentResult.rows[0].id;
await client.query(`
INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, parent_id)
VALUES ($1, $2, $3, $4, $5, true, $6)
ON CONFLICT (store_id, slug)
DO UPDATE SET name = $2, dutchie_url = $4, path = $5, parent_id = $6
`, [storeId, category.name, category.slug, categoryUrl, path, parentId]);
logger_1.logger.info('categories', ` └── ${category.name}`);
}
}
}
await client.query('COMMIT');
logger_1.logger.info('categories', `✅ Created ${DUTCHIE_CATEGORIES.length} Dutchie categories successfully`);
}
catch (error) {
await client.query('ROLLBACK');
logger_1.logger.error('categories', `Failed to create Dutchie categories: ${error}`);
throw error;
}
finally {
client.release();
}
}
/**
* Create categories from discovered links (custom menus)
*/
async createCustomCategories(storeId, store, links) {
const client = await migrate_1.pool.connect();
try {
await client.query('BEGIN');
// Filter to likely category links
const categoryLinks = links.filter(link => link.isCategory);
let displayOrder = 0;
for (const link of categoryLinks) {
// Generate slug from text
const slug = link.text
.toLowerCase()
.replace(/[^a-z0-9]+/g, '-')
.replace(/^-|-$/g, '');
// Determine path from URL
const url = new URL(link.href);
const path = url.pathname.replace(/^\//, '');
await client.query(`
INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, display_order)
VALUES ($1, $2, $3, $4, $5, true, $6)
ON CONFLICT (store_id, slug)
DO UPDATE SET name = $2, dutchie_url = $4, path = $5, display_order = $6
`, [storeId, link.text, slug, link.href, path, displayOrder++]);
logger_1.logger.info('categories', `📁 ${link.text} -> ${link.href}`);
}
await client.query('COMMIT');
logger_1.logger.info('categories', `✅ Created ${categoryLinks.length} custom categories`);
}
catch (error) {
await client.query('ROLLBACK');
throw error;
}
finally {
client.release();
}
}
/**
* Ensure the display_order column exists on the categories table
*/
async ensureDisplayOrderColumn() {
try {
await migrate_1.pool.query(`
ALTER TABLE categories
ADD COLUMN IF NOT EXISTS display_order INTEGER DEFAULT 0
`);
logger_1.logger.info('categories', 'Ensured display_order column exists');
}
catch (error) {
logger_1.logger.warn('categories', `Could not add display_order column: ${error}`);
}
}
}
exports.NavigationDiscovery = NavigationDiscovery;

backend/dist/scraper-v2/pipelines.js (vendored, new file, 300 lines)

@@ -0,0 +1,300 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.PipelineEngine = exports.StatsPipeline = exports.DatabasePipeline = exports.ImagePipeline = exports.DeduplicationPipeline = exports.SanitizationPipeline = exports.ValidationPipeline = void 0;
const logger_1 = require("../services/logger");
const migrate_1 = require("../db/migrate");
const minio_1 = require("../utils/minio");
/**
* Validation Pipeline - ensures data quality
*/
class ValidationPipeline {
name = 'ValidationPipeline';
priority = 100;
async process(item, spider) {
// Required fields
if (!item.name || item.name.trim().length < 2) {
logger_1.logger.warn('pipeline', `Dropping product: invalid name`);
return null;
}
if (!item.dutchieUrl) {
logger_1.logger.warn('pipeline', `Dropping product ${item.name}: no URL`);
return null;
}
// Validate numeric fields
if (item.price !== undefined && (item.price < 0 || item.price > 10000)) {
logger_1.logger.warn('pipeline', `Invalid price for ${item.name}: ${item.price}`);
item.price = undefined;
}
if (item.thcPercentage !== undefined && (item.thcPercentage < 0 || item.thcPercentage > 100)) {
logger_1.logger.warn('pipeline', `Invalid THC for ${item.name}: ${item.thcPercentage}`);
item.thcPercentage = undefined;
}
if (item.cbdPercentage !== undefined && (item.cbdPercentage < 0 || item.cbdPercentage > 100)) {
logger_1.logger.warn('pipeline', `Invalid CBD for ${item.name}: ${item.cbdPercentage}`);
item.cbdPercentage = undefined;
}
return item;
}
}
exports.ValidationPipeline = ValidationPipeline;
/**
* Sanitization Pipeline - cleans and normalizes data
*/
class SanitizationPipeline {
name = 'SanitizationPipeline';
priority = 90;
async process(item, spider) {
// Truncate long strings
if (item.name) {
item.name = item.name.substring(0, 500).trim();
}
if (item.description) {
item.description = item.description.substring(0, 5000).trim();
}
if (item.brand) {
item.brand = item.brand.substring(0, 255).trim();
}
if (item.weight) {
item.weight = item.weight.substring(0, 100).trim();
}
// Normalize strain type
if (item.strainType) {
const normalized = item.strainType.toLowerCase();
if (normalized.includes('indica')) {
item.strainType = 'Indica';
}
else if (normalized.includes('sativa')) {
item.strainType = 'Sativa';
}
else if (normalized.includes('hybrid')) {
item.strainType = 'Hybrid';
}
else {
item.strainType = undefined;
}
}
// Clean up metadata
if (item.metadata) {
// Remove empty arrays
Object.keys(item.metadata).forEach(key => {
if (Array.isArray(item.metadata[key]) && item.metadata[key].length === 0) {
delete item.metadata[key];
}
});
}
return item;
}
}
exports.SanitizationPipeline = SanitizationPipeline;
/**
* Deduplication Pipeline - prevents duplicate items
*/
class DeduplicationPipeline {
name = 'DeduplicationPipeline';
priority = 80;
seen = new Set();
async process(item, spider) {
const fingerprint = `${item.dutchieProductId}`;
if (this.seen.has(fingerprint)) {
logger_1.logger.debug('pipeline', `Duplicate product detected: ${item.name}`);
return null;
}
this.seen.add(fingerprint);
return item;
}
clear() {
this.seen.clear();
}
}
exports.DeduplicationPipeline = DeduplicationPipeline;
/**
* Image Processing Pipeline - handles image downloads
*/
class ImagePipeline {
name = 'ImagePipeline';
priority = 70;
extractImageId(url) {
try {
const match = url.match(/images\.dutchie\.com\/([a-f0-9]+)/i);
return match ? match[1] : null;
}
catch (e) {
return null;
}
}
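// Rewrites a thumbnail URL to request the full-size asset, e.g. (illustrative id):
// https://images.dutchie.com/abc123?w=300 -> https://images.dutchie.com/abc123?auto=format&fit=max&q=95&w=2000&h=2000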
getFullSizeImageUrl(imageUrl) {
const imageId = this.extractImageId(imageUrl);
if (!imageId)
return imageUrl;
return `https://images.dutchie.com/${imageId}?auto=format&fit=max&q=95&w=2000&h=2000`;
}
async process(item, spider) {
if (item.imageUrl) {
// Convert to full-size URL
item.imageUrl = this.getFullSizeImageUrl(item.imageUrl);
}
return item;
}
}
exports.ImagePipeline = ImagePipeline;
/**
* Database Pipeline - saves items to database
*/
class DatabasePipeline {
name = 'DatabasePipeline';
priority = 10; // Low priority - runs last
async process(item, spider) {
const client = await migrate_1.pool.connect();
try {
// Extract store and category from metadata (set by spider)
const storeId = item.storeId;
const categoryId = item.categoryId;
if (!storeId || !categoryId) {
logger_1.logger.error('pipeline', `Missing storeId or categoryId for ${item.name}`);
return null;
}
// Check if product exists
const existingResult = await client.query(`
SELECT id, image_url, local_image_path
FROM products
WHERE store_id = $1 AND name = $2 AND category_id = $3
`, [storeId, item.name, categoryId]);
let localImagePath = null;
let productId;
if (existingResult.rows.length > 0) {
// Update existing product
productId = existingResult.rows[0].id;
localImagePath = existingResult.rows[0].local_image_path;
await client.query(`
UPDATE products
SET name = $1, description = $2, price = $3,
strain_type = $4, thc_percentage = $5, cbd_percentage = $6,
brand = $7, weight = $8, image_url = $9, dutchie_url = $10,
in_stock = true, metadata = $11, last_seen_at = CURRENT_TIMESTAMP,
updated_at = CURRENT_TIMESTAMP
WHERE id = $12
`, [
item.name, item.description, item.price,
item.strainType, item.thcPercentage, item.cbdPercentage,
item.brand, item.weight, item.imageUrl, item.dutchieUrl,
JSON.stringify(item.metadata || {}), productId
]);
logger_1.logger.debug('pipeline', `Updated product: ${item.name}`);
}
else {
// Insert new product
const insertResult = await client.query(`
INSERT INTO products (
store_id, category_id, dutchie_product_id, name, description,
price, strain_type, thc_percentage, cbd_percentage,
brand, weight, image_url, dutchie_url, in_stock, metadata
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, true, $14)
RETURNING id
`, [
storeId, categoryId, item.dutchieProductId, item.name, item.description,
item.price, item.strainType, item.thcPercentage, item.cbdPercentage,
item.brand, item.weight, item.imageUrl, item.dutchieUrl,
JSON.stringify(item.metadata || {})
]);
productId = insertResult.rows[0].id;
logger_1.logger.debug('pipeline', `Inserted new product: ${item.name}`);
}
// Download image if needed
if (item.imageUrl && !localImagePath) {
try {
localImagePath = await (0, minio_1.uploadImageFromUrl)(item.imageUrl, productId);
await client.query(`
UPDATE products
SET local_image_path = $1
WHERE id = $2
`, [localImagePath, productId]);
logger_1.logger.debug('pipeline', `Downloaded image for: ${item.name}`);
}
catch (error) {
logger_1.logger.error('pipeline', `Failed to download image for ${item.name}: ${error}`);
}
}
return item;
}
catch (error) {
logger_1.logger.error('pipeline', `Failed to save product ${item.name}: ${error}`);
return null;
}
finally {
client.release();
}
}
}
exports.DatabasePipeline = DatabasePipeline;
/**
* Stats Pipeline - tracks statistics
*/
class StatsPipeline {
name = 'StatsPipeline';
priority = 50;
stats = {
total: 0,
withImages: 0,
withThc: 0,
withCbd: 0,
withDescription: 0
};
async process(item, spider) {
this.stats.total++;
if (item.imageUrl)
this.stats.withImages++;
if (item.thcPercentage)
this.stats.withThc++;
if (item.cbdPercentage)
this.stats.withCbd++;
if (item.description)
this.stats.withDescription++;
return item;
}
getStats() {
return { ...this.stats };
}
clear() {
this.stats = {
total: 0,
withImages: 0,
withThc: 0,
withCbd: 0,
withDescription: 0
};
}
}
exports.StatsPipeline = StatsPipeline;
/**
* Pipeline Engine - orchestrates all pipelines
*/
class PipelineEngine {
pipelines = [];
use(pipeline) {
this.pipelines.push(pipeline);
// Sort by priority (higher first)
this.pipelines.sort((a, b) => b.priority - a.priority);
}
async processItem(item, spider) {
let current = item;
for (const pipeline of this.pipelines) {
try {
current = await pipeline.process(current, spider);
if (!current) {
// Item was filtered out
logger_1.logger.debug('pipeline', `Item filtered by ${pipeline.name}`);
return null;
}
}
catch (error) {
logger_1.logger.error('pipeline', `Error in ${pipeline.name}: ${error}`);
// Continue with other pipelines
}
}
return current;
}
getPipeline(name) {
return this.pipelines.find(p => p.name === name);
}
}
exports.PipelineEngine = PipelineEngine;
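Custom pipelines follow the same shape; a hypothetical sketch (PriceFloorPipeline is illustrative). Return the item to pass it along, or null to drop it:

const { PipelineEngine } = require('./pipelines');
class PriceFloorPipeline {
    name = 'PriceFloorPipeline';
    priority = 85; // between Sanitization (90) and Deduplication (80)
    async process(item, spider) {
        return item.price !== undefined && item.price < 1 ? null : item;
    }
}
const pipelines = new PipelineEngine();
pipelines.use(new PriceFloorPipeline());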

backend/dist/scraper-v2/scheduler.js (vendored, new file, 136 lines)

@@ -0,0 +1,136 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.RequestScheduler = void 0;
const logger_1 = require("../services/logger");
const crypto_1 = __importDefault(require("crypto"));
class RequestScheduler {
queue = [];
inProgress = new Set();
seen = new Set();
deduplicationEnabled = true;
constructor(deduplicationEnabled = true) {
this.deduplicationEnabled = deduplicationEnabled;
}
/**
* Generate fingerprint for request deduplication
*/
generateFingerprint(request) {
if (request.fingerprint) {
return request.fingerprint;
}
// Generate fingerprint based on URL and relevant metadata
const data = {
url: request.url,
method: request.metadata?.method || 'GET',
body: request.metadata?.body
};
return crypto_1.default.createHash('md5').update(JSON.stringify(data)).digest('hex');
}
/**
* Add a request to the queue
*/
enqueue(partialRequest) {
if (!partialRequest.url) {
logger_1.logger.warn('scraper', 'Cannot enqueue request without URL');
return false;
}
const fingerprint = this.generateFingerprint(partialRequest);
// Check for duplicates
if (this.deduplicationEnabled && this.seen.has(fingerprint)) {
logger_1.logger.debug('scraper', `Request already seen: ${partialRequest.url}`);
return false;
}
// Create full request with defaults
const request = {
url: partialRequest.url,
priority: partialRequest.priority ?? 0,
retryCount: partialRequest.retryCount ?? 0,
maxRetries: partialRequest.maxRetries ?? 3,
metadata: partialRequest.metadata || {},
callback: partialRequest.callback,
errorHandler: partialRequest.errorHandler,
fingerprint
};
this.queue.push(request);
this.seen.add(fingerprint);
// Sort by priority (higher priority first)
this.queue.sort((a, b) => b.priority - a.priority);
logger_1.logger.debug('scraper', `Enqueued: ${request.url} (priority: ${request.priority})`);
return true;
}
/**
* Get the next request from the queue
*/
dequeue() {
const request = this.queue.shift();
if (request) {
this.inProgress.add(request.fingerprint);
}
return request || null;
}
/**
* Mark a request as complete
*/
markComplete(request) {
if (request.fingerprint) {
this.inProgress.delete(request.fingerprint);
}
}
/**
* Requeue a failed request (for retry)
*/
requeueForRetry(request) {
if (request.fingerprint) {
this.inProgress.delete(request.fingerprint);
this.seen.delete(request.fingerprint);
}
request.retryCount++;
if (request.retryCount > request.maxRetries) {
logger_1.logger.warn('scraper', `Max retries exceeded for: ${request.url}`);
return false;
}
// Decrease priority for retried requests
request.priority = Math.max(0, request.priority - 1);
return this.enqueue(request);
}
/**
* Get queue stats
*/
getStats() {
return {
pending: this.queue.length,
inProgress: this.inProgress.size,
total: this.seen.size
};
}
/**
* Check if queue is empty
*/
isEmpty() {
return this.queue.length === 0 && this.inProgress.size === 0;
}
/**
* Clear all queues
*/
clear() {
this.queue = [];
this.inProgress.clear();
this.seen.clear();
}
/**
* Get pending requests count
*/
getPendingCount() {
return this.queue.length;
}
/**
* Get in-progress count
*/
getInProgressCount() {
return this.inProgress.size;
}
}
exports.RequestScheduler = RequestScheduler;
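A minimal sketch of the deduplication and priority behaviour (URLs are hypothetical):

const { RequestScheduler } = require('./scheduler');
const scheduler = new RequestScheduler();
const callback = async () => ({ items: [], requests: [] });
scheduler.enqueue({ url: 'https://example.com/a', priority: 10, callback }); // true
scheduler.enqueue({ url: 'https://example.com/a', priority: 99, callback }); // false: same fingerprint
const next = scheduler.dequeue(); // highest-priority pending request, now in-progress
scheduler.markComplete(next);
console.log(scheduler.isEmpty()); // true: nothing pending or in-progress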

backend/dist/scraper-v2/types.js (vendored, new file, 13 lines)

@@ -0,0 +1,13 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.ErrorType = void 0;
var ErrorType;
(function (ErrorType) {
ErrorType["NETWORK_ERROR"] = "NETWORK_ERROR";
ErrorType["TIMEOUT"] = "TIMEOUT";
ErrorType["PARSE_ERROR"] = "PARSE_ERROR";
ErrorType["BOT_DETECTION"] = "BOT_DETECTION";
ErrorType["NOT_FOUND"] = "NOT_FOUND";
ErrorType["SERVER_ERROR"] = "SERVER_ERROR";
ErrorType["UNKNOWN"] = "UNKNOWN";
})(ErrorType || (exports.ErrorType = ErrorType = {}));