Files
cannaiq/backend/src/scrapers/templates/dutchie.ts

103 lines
3.7 KiB
TypeScript

// ============================================================================
// DEPRECATED: Dutchie now crawled via GraphQL only (see dutchie-az pipeline)
// DO NOT USE - This HTML scraper is unreliable and targets the legacy products table.
// All Dutchie crawling must go through: src/dutchie-az/services/product-crawler.ts
// ============================================================================
import { Page } from 'playwright';
import { logger } from '../../services/logger';
/**
* Contract implemented by a site-specific scraper template.
* A template is picked by testing a URL against `urlPattern`
* (see `getTemplateForUrl` in this file).
*/
export interface ScraperTemplate {
/** Human-readable name of the template. */
name: string;
/** Pattern tested against a URL to decide whether this template applies to it. */
urlPattern: RegExp;
/** Builds the listing URL for `category` from the given `baseUrl`. */
buildCategoryUrl: (baseUrl: string, category: string) => string;
/**
* Extracts product records from an already-loaded Playwright page.
* The element type is `any`: the record shape is not constrained by this interface.
*/
extractProducts: (page: Page) => Promise<any[]>;
}
/** One product record produced from a Dutchie product card by `dutchieTemplate.extractProducts`. */
interface DutchieScrapedProduct {
  name: string;
  brand: string | undefined;
  product_url: string;
  image_url: string | undefined;
  price: number | undefined;
  in_stock: boolean;
}

/**
 * @deprecated DEPRECATED - Dutchie HTML scraping is no longer supported.
 * Use the dutchie-az GraphQL pipeline instead: src/dutchie-az/services/product-crawler.ts
 * This template relied on unstable DOM selectors and wrote to legacy tables.
 */
export const dutchieTemplate: ScraperTemplate = {
  name: 'Dutchie Marketplace',
  urlPattern: /dutchie\.com\/dispensary\//,

  /**
   * Builds `<baseUrl>/products/<category-slug>`.
   *
   * @param baseUrl - Dispensary base URL; any trailing slashes are removed.
   * @param category - Category name; surrounding whitespace is dropped, the rest is
   *   lower-cased and each run of whitespace becomes a single hyphen.
   * @returns The category listing URL.
   */
  buildCategoryUrl: (baseUrl: string, category: string) => {
    // Strip every trailing slash so the join below never produces "//products".
    const base = baseUrl.replace(/\/+$/, '');
    // Trim first: otherwise leading/trailing spaces turn into stray hyphens in the slug.
    const categorySlug = category.trim().toLowerCase().replace(/\s+/g, '-');
    return `${base}/products/${categorySlug}`;
  },

  /**
   * Reads every product card link on the page and converts it to a product record.
   *
   * Failures are logged and never thrown: a card that cannot be read is skipped, and a
   * failure outside the per-card loop ends extraction with whatever was collected so far.
   *
   * @param page - Playwright page already navigated to a Dutchie listing.
   * @returns The extracted products (possibly empty).
   */
  extractProducts: async (page: Page) => {
    const products: DutchieScrapedProduct[] = [];
    try {
      // Wait for product cards to load; a timeout is only a warning, extraction continues.
      await page.waitForSelector('a[data-testid="card-link"]', { timeout: 10000 }).catch(() => {
        logger.warn('scraper', 'No product cards found with data-testid="card-link"');
      });

      // Only card links that point at a product page are considered.
      const productCards = await page.locator('a[href*="/product/"][data-testid="card-link"]').all();
      logger.info('scraper', `Found ${productCards.length} Dutchie product cards`);

      for (const card of productCards) {
        try {
          // Extract all data in a single evaluate call per card. The callback runs in the
          // page context, so it must stay self-contained (no references to outer scope).
          const cardData = await card.evaluate((el) => {
            const href = el.getAttribute('href') || '';
            const img = el.querySelector('img');
            const imageUrl = img ? img.getAttribute('src') || '' : '';

            // Non-empty text of leaf elements, in document order.
            const textElements = Array.from(el.querySelectorAll('*'))
              .filter((node) => node.textContent && node.children.length === 0)
              .map((node) => (node.textContent || '').trim())
              .filter((text) => text.length > 0);

            // Positional heuristic: first leaf text is the name, second is the brand.
            const name = textElements[0] || '';
            const brand = textElements[1] || '';

            // First dollar amount in the card text. Thousands separators are accepted so
            // "$1,200.00" parses as 1200 rather than stopping at the comma.
            const priceMatch = el.textContent?.match(/\$((?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d{2})?)/);
            const price = priceMatch ? parseFloat(priceMatch[1].replace(/,/g, '')) : undefined;

            return { href, imageUrl, name, brand, price };
          });

          // A card without both a name and a link is not a usable product.
          if (cardData.name && cardData.href) {
            products.push({
              name: cardData.name,
              brand: cardData.brand || undefined,
              product_url: cardData.href.startsWith('http') ? cardData.href : `https://dutchie.com${cardData.href}`,
              image_url: cardData.imageUrl || undefined,
              price: cardData.price,
              // Every card found on the page is reported as in stock.
              in_stock: true,
            });
          }
        } catch (err) {
          logger.warn('scraper', `Error extracting Dutchie product card: ${err}`);
        }
      }
    } catch (err) {
      logger.error('scraper', `Error in Dutchie product extraction: ${err}`);
    }
    return products;
  },
};
/**
 * Resolve which scraper template, if any, handles the given URL.
 *
 * @param url - URL to test against the Dutchie template's `urlPattern`.
 * @returns `dutchieTemplate` when the pattern matches, otherwise `null`.
 */
export function getTemplateForUrl(url: string): ScraperTemplate | null {
  const isDutchieUrl = dutchieTemplate.urlPattern.test(url);
  return isDutchieUrl ? dutchieTemplate : null;
}