// ============================================================================
// DEPRECATED: Dutchie is now crawled via GraphQL only (see dutchie-az pipeline)
// DO NOT USE - This HTML scraper is unreliable and targets the legacy products table.
// All Dutchie crawling must go through: src/dutchie-az/services/product-crawler.ts
// ============================================================================
|
|
|
|
import { Page } from 'playwright';
|
|
import { logger } from '../../services/logger';
|
|
|
|
/**
 * Contract implemented by every site-specific HTML scraper template.
 *
 * A template bundles the URL pattern used to recognise a site together with
 * the two operations needed to scrape it: building a category listing URL
 * and pulling product records out of a loaded page.
 */
export interface ScraperTemplate {
  /** Human-readable label for the template. */
  name: string;

  /** Pattern tested against a URL to decide whether this template applies. */
  urlPattern: RegExp;

  /**
   * Builds the listing URL for one category.
   *
   * @param baseUrl - Base URL of the store being scraped.
   * @param category - Category name to build the listing URL for.
   * @returns The URL of the category listing page.
   */
  buildCategoryUrl: (baseUrl: string, category: string) => string;

  /**
   * Extracts product records from an already-navigated Playwright page.
   *
   * @param page - Playwright page showing a product listing.
   * @returns A promise resolving to the extracted records (untyped).
   */
  extractProducts: (page: Page) => Promise<any[]>;
}
|
|
|
|
/** Origin used to absolutize product links that are not already absolute. */
const DUTCHIE_ORIGIN = 'https://dutchie.com';

/** Shape of one record produced by `dutchieTemplate.extractProducts`. */
interface DutchieScrapedProduct {
  name: string;
  brand: string | undefined;
  product_url: string;
  image_url: string | undefined;
  price: number | undefined;
  in_stock: boolean;
}

/**
 * Converts a product card `href` into an absolute URL.
 *
 * Hrefs that already start with `http` are returned untouched. Anything else
 * is resolved against `DUTCHIE_ORIGIN` with the WHATWG `URL` parser, so that
 * root-relative (`/dispensary/...`), path-relative (`product/x`) and
 * protocol-relative (`//host/...`) links all come out well formed. If the
 * parser rejects the value, plain concatenation is used as a fallback so a
 * record is still produced.
 *
 * @param href - Raw `href` attribute value read from the card link.
 * @returns An absolute URL string.
 */
function toAbsoluteDutchieUrl(href: string): string {
  if (href.startsWith('http')) {
    return href;
  }
  try {
    return new URL(href, DUTCHIE_ORIGIN).href;
  } catch {
    return `${DUTCHIE_ORIGIN}${href}`;
  }
}

/**
 * Scraper template for Dutchie marketplace dispensary pages.
 *
 * @deprecated DEPRECATED - Dutchie HTML scraping is no longer supported.
 * Use the dutchie-az GraphQL pipeline instead: src/dutchie-az/services/product-crawler.ts
 * This template relied on unstable DOM selectors and wrote to legacy tables.
 */
export const dutchieTemplate: ScraperTemplate = {
  name: 'Dutchie Marketplace',
  urlPattern: /dutchie\.com\/dispensary\//,

  /**
   * Builds `<baseUrl>/products/<category-slug>`.
   *
   * All trailing slashes are stripped from `baseUrl` so the result never
   * contains `//products`. The category is trimmed, lower-cased and has each
   * run of whitespace replaced by a single hyphen.
   *
   * @param baseUrl - Dispensary base URL, with or without trailing slashes.
   * @param category - Category name, e.g. `Pre Rolls`.
   * @returns The category listing URL.
   */
  buildCategoryUrl: (baseUrl: string, category: string) => {
    const base = baseUrl.replace(/\/+$/, '');
    // Trim first so surrounding whitespace does not turn into stray hyphens.
    const categorySlug = category.trim().toLowerCase().replace(/\s+/g, '-');
    return `${base}/products/${categorySlug}`;
  },

  /**
   * Collects one record per product card link found on the page.
   *
   * Failures are logged rather than thrown: a card that cannot be read is
   * skipped with a warning, and any other error ends extraction early while
   * still returning whatever was collected so far.
   *
   * @param page - Playwright page already showing a product listing.
   * @returns The extracted product records (possibly empty).
   */
  extractProducts: async (page: Page) => {
    const products: DutchieScrapedProduct[] = [];

    try {
      // Best-effort wait: if it rejects (e.g. on timeout) we only warn and
      // still attempt to locate cards below.
      try {
        await page.waitForSelector('a[data-testid="card-link"]', { timeout: 10000 });
      } catch {
        logger.warn('scraper', 'No product cards found with data-testid="card-link"');
      }

      const productCards = await page.locator('a[href*="/product/"][data-testid="card-link"]').all();

      logger.info('scraper', `Found ${productCards.length} Dutchie product cards`);

      for (const card of productCards) {
        try {
          // One evaluate call per card reads everything in a single round trip.
          // The callback is executed in the page, so it must stay
          // self-contained and cannot reference anything from this module.
          const cardData = await card.evaluate((el) => {
            const href = el.getAttribute('href') ?? '';
            const img = el.querySelector('img');
            const imageUrl = img ? img.getAttribute('src') ?? '' : '';

            // Non-empty text of leaf elements (elements without element
            // children), in document order.
            const leafTexts = Array.from(el.querySelectorAll('*'))
              .filter((node) => node.textContent && node.children.length === 0)
              .map((node) => (node.textContent || '').trim())
              .filter((text) => text.length > 0);

            // Positional heuristic: first leaf text is taken as the product
            // name, second as the brand.
            const name = leafTexts[0] ?? '';
            const brand = leafTexts[1] ?? '';

            // First dollar amount anywhere in the card's text.
            const priceMatch = el.textContent?.match(/\$(\d+(?:\.\d{2})?)/);
            const priceText = priceMatch?.[1];
            const price = priceText !== undefined ? parseFloat(priceText) : undefined;

            return { href, imageUrl, name, brand, price };
          });

          // A card without a name or link is not usable; skip it.
          if (cardData.name && cardData.href) {
            products.push({
              name: cardData.name,
              brand: cardData.brand || undefined,
              product_url: toAbsoluteDutchieUrl(cardData.href),
              image_url: cardData.imageUrl || undefined,
              price: cardData.price,
              // No stock signal is read from the card; every listed product
              // is reported as in stock.
              in_stock: true,
            });
          }
        } catch (err) {
          logger.warn('scraper', `Error extracting Dutchie product card: ${err}`);
        }
      }
    } catch (err) {
      logger.error('scraper', `Error in Dutchie product extraction: ${err}`);
    }

    return products;
  },
};
|
|
|
|
/**
 * Looks up the scraper template that applies to a URL.
 *
 * Only the Dutchie template is consulted: the URL is tested against its
 * `urlPattern`.
 *
 * @param url - URL of the page to be scraped.
 * @returns The matching template, or `null` when the URL is not recognised.
 */
export function getTemplateForUrl(url: string): ScraperTemplate | null {
  return dutchieTemplate.urlPattern.test(url) ? dutchieTemplate : null;
}
|