Files
cannaiq/backend/dist/scrapers/templates/dutchie.js
Kelly 66e07b2009 fix(monitor): remove non-existent worker columns from job_run_logs query
The job_run_logs table tracks scheduled job orchestration, not individual
worker jobs. Worker info (worker_id, worker_hostname) belongs on
dispensary_crawl_jobs, not job_run_logs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-03 18:45:05 -07:00

86 lines
4.1 KiB
JavaScript

"use strict";
// ============================================================================
// DEPRECATED: Dutchie now crawled via GraphQL only (see dutchie-az pipeline)
// DO NOT USE - This HTML scraper is unreliable and targets the legacy products table.
// All Dutchie crawling must go through: src/dutchie-az/services/product-crawler.ts
// ============================================================================
Object.defineProperty(exports, "__esModule", { value: true });
exports.dutchieTemplate = void 0;
exports.getTemplateForUrl = getTemplateForUrl;
const logger_1 = require("../../services/logger");
/**
* @deprecated DEPRECATED - Dutchie HTML scraping is no longer supported.
* Use the dutchie-az GraphQL pipeline instead: src/dutchie-az/services/product-crawler.ts
* This template relied on unstable DOM selectors and wrote to legacy tables.
*/
exports.dutchieTemplate = {
name: 'Dutchie Marketplace',
urlPattern: /dutchie\.com\/dispensary\//,
buildCategoryUrl: (baseUrl, category) => {
// Remove trailing slash
const base = baseUrl.replace(/\/$/, '');
// Convert category name to URL-friendly slug
const categorySlug = category.toLowerCase().replace(/\s+/g, '-');
return `${base}/products/${categorySlug}`;
},
extractProducts: async (page) => {
const products = [];
try {
// Wait for product cards to load
await page.waitForSelector('a[data-testid="card-link"]', { timeout: 10000 }).catch(() => {
logger_1.logger.warn('scraper', 'No product cards found with data-testid="card-link"');
});
// Get all product card links
const productCards = await page.locator('a[href*="/product/"][data-testid="card-link"]').all();
logger_1.logger.info('scraper', `Found ${productCards.length} Dutchie product cards`);
for (const card of productCards) {
try {
// Extract all data at once using evaluate for speed
const cardData = await card.evaluate((el) => {
const href = el.getAttribute('href') || '';
const img = el.querySelector('img');
const imageUrl = img ? img.getAttribute('src') || '' : '';
// Get all text nodes in order
const textElements = Array.from(el.querySelectorAll('*'))
.filter(el => el.textContent && el.children.length === 0)
.map(el => (el.textContent || '').trim())
.filter(text => text.length > 0);
const name = textElements[0] || '';
const brand = textElements[1] || '';
// Look for price
const priceMatch = el.textContent?.match(/\$(\d+(?:\.\d{2})?)/);
const price = priceMatch ? parseFloat(priceMatch[1]) : undefined;
return { href, imageUrl, name, brand, price };
});
if (cardData.name && cardData.href) {
products.push({
name: cardData.name,
brand: cardData.brand || undefined,
product_url: cardData.href.startsWith('http') ? cardData.href : `https://dutchie.com${cardData.href}`,
image_url: cardData.imageUrl || undefined,
price: cardData.price,
in_stock: true,
});
}
}
catch (err) {
logger_1.logger.warn('scraper', `Error extracting Dutchie product card: ${err}`);
}
}
}
catch (err) {
logger_1.logger.error('scraper', `Error in Dutchie product extraction: ${err}`);
}
return products;
},
};
/**
 * Resolve which scraper template handles a URL.
 *
 * @param {string} url - URL to check against the exported template's pattern.
 * @returns {object|null} The Dutchie template when its `urlPattern` matches
 *   the URL, otherwise `null`.
 */
function getTemplateForUrl(url) {
    const { dutchieTemplate: candidate } = exports;
    return candidate.urlPattern.test(url) ? candidate : null;
}