Files
cannaiq/backend/dist/scrapers/dutchie-graphql.js
Kelly 66e07b2009 fix(monitor): remove non-existent worker columns from job_run_logs query
The job_run_logs table tracks scheduled job orchestration, not individual
worker jobs. Worker info (worker_id, worker_hostname) belongs on
dispensary_crawl_jobs, not job_run_logs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-03 18:45:05 -07:00

447 lines
18 KiB
JavaScript

"use strict";
// ============================================================================
// DEPRECATED: This scraper writes to the LEGACY products table.
// DO NOT USE - All Dutchie crawling must use the new dutchie-az pipeline.
//
// New pipeline location: src/dutchie-az/services/product-crawler.ts
// - Uses fetch-based GraphQL (no Puppeteer needed)
// - Writes to isolated dutchie_az_* tables with snapshot model
// - Tracks stockStatus, isPresentInFeed, missing_from_feed
//
// The normalizer functions in this file (normalizeDutchieProduct) may still
// be imported for reference, but do NOT call scrapeDutchieMenu() or upsertProducts().
// ============================================================================
// Compiler-emitted interop helper: a module flagged with `__esModule` is handed
// back untouched; anything else is wrapped so it can be read through `.default`.
// An already-installed helper on `this` is reused when one exists.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
// Flag this CommonJS module as compiled from ES-module syntax.
Object.defineProperty(exports, "__esModule", { value: true });
// Public surface of the module. The right-hand names are function declarations
// defined further down in this file; they are hoisted, so assigning them here works.
exports.normalizeDutchieProduct = normalizeDutchieProduct;
exports.fetchDutchieMenuViaPuppeteer = fetchDutchieMenuViaPuppeteer;
exports.upsertProducts = upsertProducts;
// NOTE: scrapeDutchieMenu is still exported, but it throws on every call (see its definition).
exports.scrapeDutchieMenu = scrapeDutchieMenu;
/**
* @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead.
* This scraper writes to the legacy products table, not the new dutchie_az tables.
*
* Fetches product data via Puppeteer interception of Dutchie's GraphQL API.
* This bypasses Cloudflare by using a real browser to load the menu page.
*
* GraphQL Operations:
* - FilteredProducts: Returns paginated product list with full details
* - GetAddressBasedDispensaryData: Resolves dispensary cName to dispensaryId
*/
// Third-party browser automation dependencies, loaded through the interop helper
// so that `.default` is available regardless of how each package exports itself.
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
// Module-load side effect: the stealth plugin is registered on the puppeteer-extra
// instance as soon as this file is required, before any browser is launched.
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
// =====================================================
// NORMALIZER: Dutchie GraphQL → DB Schema
// =====================================================
/**
 * Interprets a value as milliseconds since the Unix epoch.
 *
 * Only numbers and strings that consist entirely of a (possibly signed,
 * possibly fractional) decimal number qualify; anything else is rejected so
 * that a date string such as "2024-10-16T..." is never truncated to its
 * leading digits.
 *
 * @param {*} value - Candidate epoch value.
 * @returns {Date|undefined} A valid Date, or undefined when the value is not
 *   a usable epoch number.
 */
function epochMillisToDate(value) {
    const looksNumeric = typeof value === 'number' ||
        (typeof value === 'string' && /^\s*[+-]?\d+(\.\d+)?\s*$/.test(value));
    if (!looksNumeric) {
        return undefined;
    }
    const parsed = new Date(Number(value));
    return Number.isNaN(parsed.getTime()) ? undefined : parsed;
}
/**
 * Parses a value with the Date constructor, rejecting unparseable input.
 *
 * @param {*} value - Candidate date string (or anything `new Date` accepts).
 * @returns {Date|undefined} A valid Date, or undefined instead of an Invalid Date.
 */
function dateLikeToDate(value) {
    const parsed = new Date(value);
    return Number.isNaN(parsed.getTime()) ? undefined : parsed;
}
/**
 * Maps one raw Dutchie GraphQL product object onto the flat record shape
 * consumed by upsertProducts().
 *
 * The function is pure: it reads fields from `product` and returns a new
 * object. The original product is attached unchanged as `raw_data`.
 *
 * Timestamp handling:
 * - `createdAt` is expected to be a millisecond-epoch string such as
 *   "1729044510543"; a date string is accepted as a fallback.
 * - `updatedAt` is parsed as a date string first; a millisecond-epoch value is
 *   accepted as a fallback.
 * - A missing or unparseable timestamp yields `undefined`, never an Invalid Date.
 *
 * @param {object} product - Raw product as returned by Dutchie's GraphQL API.
 * @returns {object} Normalized product record (snake_case keys).
 */
function normalizeDutchieProduct(product) {
    // Only the first sale special (if any) feeds the special_* columns.
    const saleSpecial = product.specialData?.saleSpecials?.[0];
    // Inventory is the sum over POSMetaData children; missing counts contribute 0.
    const children = product.POSMetaData?.children || [];
    const totalQuantity = children.reduce((sum, c) => sum + (c.quantity || 0), 0);
    const availableQuantity = children.reduce((sum, c) => sum + (c.quantityAvailable || 0), 0);
    // Falsy timestamps (undefined, null, '', 0) are treated as absent.
    const sourceCreatedAt = product.createdAt
        ? (epochMillisToDate(product.createdAt) ?? dateLikeToDate(product.createdAt))
        : undefined;
    const sourceUpdatedAt = product.updatedAt
        ? (dateLikeToDate(product.updatedAt) ?? epochMillisToDate(product.updatedAt))
        : undefined;
    return {
        // Identity
        external_id: product._id || product.id,
        slug: product.cName,
        name: product.Name,
        enterprise_product_id: product.enterpriseProductId,
        // Brand
        brand: product.brandName || product.brand?.name,
        brand_external_id: product.brandId || product.brand?.id,
        brand_logo_url: product.brandLogo || product.brand?.imageUrl,
        // Category
        subcategory: product.subcategory,
        strain_type: product.strainType,
        canonical_category: product.POSMetaData?.canonicalCategory,
        // Pricing (first entry of each price list)
        price: product.Prices?.[0],
        rec_price: product.recPrices?.[0],
        med_price: product.medicalPrices?.[0],
        rec_special_price: product.recSpecialPrices?.[0],
        med_special_price: product.medicalSpecialPrices?.[0],
        // Specials
        is_on_special: product.special === true,
        special_name: saleSpecial?.specialName,
        // The discount value is only recorded when the special is flagged as a percent discount.
        discount_percent: saleSpecial?.percentDiscount ? saleSpecial.discount : undefined,
        special_data: product.specialData,
        // Inventory (a total of 0 is reported as undefined, same as "no data")
        sku: product.POSMetaData?.canonicalSKU,
        inventory_quantity: totalQuantity || undefined,
        inventory_available: availableQuantity || undefined,
        is_below_threshold: product.isBelowThreshold === true,
        status: product.Status,
        // Potency (first value of each range)
        thc_percentage: product.THCContent?.range?.[0],
        cbd_percentage: product.CBDContent?.range?.[0],
        cannabinoids: product.cannabinoidsV2,
        // Weight/Options
        weight_mg: product.weight,
        net_weight_value: product.measurements?.netWeight?.values?.[0],
        net_weight_unit: product.measurements?.netWeight?.unit,
        options: product.Options,
        raw_options: product.rawOptions,
        // Images
        image_url: product.Image,
        additional_images: product.images?.length ? product.images : undefined,
        // Flags (strict booleans: anything other than literal true becomes false)
        is_featured: product.featured === true,
        medical_only: product.medicalOnly === true,
        rec_only: product.recOnly === true,
        // Timestamps
        source_created_at: sourceCreatedAt,
        source_updated_at: sourceUpdatedAt,
        // Description (non-string descriptions are dropped)
        description: typeof product.description === 'string' ? product.description : undefined,
        // Raw
        raw_data: product,
    };
}
/**
 * @deprecated Part of the legacy scraper; see the deprecation banner at the top of this file.
 *
 * Loads a Dutchie menu page in a Puppeteer-controlled browser and collects the
 * product objects found in intercepted GraphQL JSON responses
 * (`data.filteredProducts.products`). The page is scrolled to trigger further
 * loads, then every link matching `a[href*="/products/"]` is visited and
 * scrolled as well.
 *
 * Products are de-duplicated by `_id`, falling back to `id` — the same
 * identity rule normalizeDutchieProduct() uses for `external_id`.
 *
 * @param {string} menuUrl - URL of the menu page to load.
 * @param {object} [options]
 * @param {boolean|string} [options.headless='new'] - Passed to puppeteer launch.
 * @param {number} [options.timeout=90000] - Navigation timeout (ms) for the initial page load.
 * @param {number} [options.maxScrolls=30] - Maximum scroll attempts on the main page.
 * @returns {Promise<{products: object[], dispensaryId: string, menuUrl: string}>}
 *   Raw (un-normalized) products, the dispensary id if one was found ('' otherwise),
 *   and the menu URL that was requested.
 * @throws Propagates launch/navigation errors from the main page; errors on
 *   individual category pages are logged and skipped.
 */
async function fetchDutchieMenuViaPuppeteer(menuUrl, options = {}) {
    const { headless = 'new', timeout = 90000, maxScrolls = 30, // Increased for full menu capture
     } = options;
    let browser;
    const capturedProducts = [];
    let dispensaryId = '';
    try {
        browser = await puppeteer_extra_1.default.launch({
            headless,
            args: [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-blink-features=AutomationControlled',
            ],
        });
        const page = await browser.newPage();
        // Stealth configuration
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
        await page.setViewport({ width: 1920, height: 1080 });
        await page.evaluateOnNewDocument(() => {
            Object.defineProperty(navigator, 'webdriver', { get: () => false });
            window.chrome = { runtime: {} };
        });
        // Track seen product IDs to avoid duplicates
        const seenIds = new Set();
        // Intercept GraphQL responses
        page.on('response', async (response) => {
            const url = response.url();
            if (!url.includes('graphql'))
                return;
            try {
                const contentType = response.headers()['content-type'] || '';
                if (!contentType.includes('application/json'))
                    return;
                const data = await response.json();
                // Capture dispensary ID
                if (data?.data?.getAddressBasedDispensaryData?.dispensaryData?.dispensaryId) {
                    dispensaryId = data.data.getAddressBasedDispensaryData.dispensaryData.dispensaryId;
                }
                // Capture products from FilteredProducts
                if (data?.data?.filteredProducts?.products) {
                    const products = data.data.filteredProducts.products;
                    for (const product of products) {
                        // A null/undefined entry must not abort the rest of the batch.
                        if (!product) {
                            continue;
                        }
                        // Same identity rule as normalizeDutchieProduct(): `_id`, else `id`.
                        // Keying on `_id` alone collapsed every `id`-only product onto one key.
                        const productKey = product._id || product.id;
                        if (!seenIds.has(productKey)) {
                            seenIds.add(productKey);
                            capturedProducts.push(product);
                        }
                    }
                }
            }
            catch {
                // Best effort: ignore responses whose body cannot be read or parsed as JSON
            }
        });
        // Navigate to menu
        console.log('[DutchieGraphQL] Loading menu page...');
        await page.goto(menuUrl, {
            waitUntil: 'networkidle2',
            timeout,
        });
        // Get dispensary ID from window.reactEnv if not captured
        if (!dispensaryId) {
            dispensaryId = await page.evaluate(() => {
                const env = window.reactEnv;
                return env?.dispensaryId || env?.retailerId || '';
            });
        }
        // Scrolls to the bottom repeatedly; stops after `maxScrollAttempts` scrolls
        // or after 3 consecutive scrolls that captured no new products.
        const scrollToLoadAll = async (maxScrollAttempts = maxScrolls) => {
            let scrollCount = 0;
            let previousCount = 0;
            let noNewProductsCount = 0;
            while (scrollCount < maxScrollAttempts && noNewProductsCount < 3) {
                await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
                // Give the page time to issue and receive the next GraphQL request.
                await new Promise((r) => setTimeout(r, 1500));
                const currentCount = seenIds.size;
                if (currentCount === previousCount) {
                    noNewProductsCount++;
                }
                else {
                    noNewProductsCount = 0;
                }
                previousCount = currentCount;
                scrollCount++;
            }
        };
        // First, scroll through the main page (all products)
        console.log('[DutchieGraphQL] Scrolling main page...');
        await scrollToLoadAll();
        console.log(`[DutchieGraphQL] After main page: ${seenIds.size} products`);
        // Get category links from the navigation
        const categoryLinks = await page.evaluate(() => {
            const links = [];
            // Look for category navigation links
            const navLinks = document.querySelectorAll('a[href*="/products/"]');
            navLinks.forEach((link) => {
                const href = link.href;
                if (href && !links.includes(href)) {
                    links.push(href);
                }
            });
            return links;
        });
        console.log(`[DutchieGraphQL] Found ${categoryLinks.length} category links`);
        // Visit each category page to capture all products
        for (const categoryUrl of categoryLinks) {
            try {
                console.log(`[DutchieGraphQL] Visiting category: ${categoryUrl.split('/').pop()}`);
                await page.goto(categoryUrl, {
                    waitUntil: 'networkidle2',
                    timeout: 30000,
                });
                await scrollToLoadAll(15); // Fewer scrolls per category
                console.log(`[DutchieGraphQL] Total products: ${seenIds.size}`);
            }
            catch (e) {
                // A failing category is logged and skipped; already-captured products are kept.
                console.log(`[DutchieGraphQL] Category error: ${e.message}`);
            }
        }
        // Wait for any final responses
        await new Promise((r) => setTimeout(r, 2000));
        return {
            products: capturedProducts,
            dispensaryId,
            menuUrl,
        };
    }
    finally {
        // Always shut the browser down, even when navigation or scraping failed.
        if (browser) {
            await browser.close();
        }
    }
}
// =====================================================
// DATABASE OPERATIONS
// =====================================================
/**
 * @deprecated Writes to the LEGACY products table; see the deprecation banner at the top of this file.
 *
 * Upserts normalized products into `products` inside a single transaction,
 * one statement per product, keyed on `(store_id, slug)`.
 *
 * `special_data`, `cannabinoids` and `raw_data` are serialized with
 * JSON.stringify (or sent as null when falsy); every other field is passed to
 * the driver as-is.
 *
 * On any failure the transaction is rolled back and the ORIGINAL error is
 * rethrown. If the ROLLBACK itself fails, that secondary failure is logged and
 * does not replace the original error.
 *
 * @param {object} pool - Connection pool exposing `connect()`; the returned
 *   client must provide `query(sql, params)` and `release()`.
 * @param {*} storeId - Value written to `store_id` for every row.
 * @param {object[]} products - Records shaped like normalizeDutchieProduct() output.
 * @returns {Promise<{inserted: number, updated: number}>} Row counts, split using
 *   the `was_inserted` flag returned by each statement.
 * @throws The error raised by the failing statement (after rollback).
 */
async function upsertProducts(pool, storeId, products) {
    const client = await pool.connect();
    let inserted = 0;
    let updated = 0;
    try {
        await client.query('BEGIN');
        for (const product of products) {
            // Upsert product
            const result = await client.query(`
INSERT INTO products (
store_id, external_id, slug, name, enterprise_product_id,
brand, brand_external_id, brand_logo_url,
subcategory, strain_type, canonical_category,
price, rec_price, med_price, rec_special_price, med_special_price,
is_on_special, special_name, discount_percent, special_data,
sku, inventory_quantity, inventory_available, is_below_threshold, status,
thc_percentage, cbd_percentage, cannabinoids,
weight_mg, net_weight_value, net_weight_unit, options, raw_options,
image_url, additional_images,
is_featured, medical_only, rec_only,
source_created_at, source_updated_at,
description, raw_data,
dutchie_url, last_seen_at, updated_at
)
VALUES (
$1, $2, $3, $4, $5,
$6, $7, $8,
$9, $10, $11,
$12, $13, $14, $15, $16,
$17, $18, $19, $20,
$21, $22, $23, $24, $25,
$26, $27, $28,
$29, $30, $31, $32, $33,
$34, $35,
$36, $37, $38,
$39, $40,
$41, $42,
'', NOW(), NOW()
)
ON CONFLICT (store_id, slug) DO UPDATE SET
name = EXCLUDED.name,
enterprise_product_id = EXCLUDED.enterprise_product_id,
brand = EXCLUDED.brand,
brand_external_id = EXCLUDED.brand_external_id,
brand_logo_url = EXCLUDED.brand_logo_url,
subcategory = EXCLUDED.subcategory,
strain_type = EXCLUDED.strain_type,
canonical_category = EXCLUDED.canonical_category,
price = EXCLUDED.price,
rec_price = EXCLUDED.rec_price,
med_price = EXCLUDED.med_price,
rec_special_price = EXCLUDED.rec_special_price,
med_special_price = EXCLUDED.med_special_price,
is_on_special = EXCLUDED.is_on_special,
special_name = EXCLUDED.special_name,
discount_percent = EXCLUDED.discount_percent,
special_data = EXCLUDED.special_data,
sku = EXCLUDED.sku,
inventory_quantity = EXCLUDED.inventory_quantity,
inventory_available = EXCLUDED.inventory_available,
is_below_threshold = EXCLUDED.is_below_threshold,
status = EXCLUDED.status,
thc_percentage = EXCLUDED.thc_percentage,
cbd_percentage = EXCLUDED.cbd_percentage,
cannabinoids = EXCLUDED.cannabinoids,
weight_mg = EXCLUDED.weight_mg,
net_weight_value = EXCLUDED.net_weight_value,
net_weight_unit = EXCLUDED.net_weight_unit,
options = EXCLUDED.options,
raw_options = EXCLUDED.raw_options,
image_url = EXCLUDED.image_url,
additional_images = EXCLUDED.additional_images,
is_featured = EXCLUDED.is_featured,
medical_only = EXCLUDED.medical_only,
rec_only = EXCLUDED.rec_only,
source_created_at = EXCLUDED.source_created_at,
source_updated_at = EXCLUDED.source_updated_at,
description = EXCLUDED.description,
raw_data = EXCLUDED.raw_data,
last_seen_at = NOW(),
updated_at = NOW()
RETURNING (xmax = 0) AS was_inserted
`, [
                // Order must match the $1..$42 placeholders above.
                storeId,
                product.external_id,
                product.slug,
                product.name,
                product.enterprise_product_id,
                product.brand,
                product.brand_external_id,
                product.brand_logo_url,
                product.subcategory,
                product.strain_type,
                product.canonical_category,
                product.price,
                product.rec_price,
                product.med_price,
                product.rec_special_price,
                product.med_special_price,
                product.is_on_special,
                product.special_name,
                product.discount_percent,
                product.special_data ? JSON.stringify(product.special_data) : null,
                product.sku,
                product.inventory_quantity,
                product.inventory_available,
                product.is_below_threshold,
                product.status,
                product.thc_percentage,
                product.cbd_percentage,
                product.cannabinoids ? JSON.stringify(product.cannabinoids) : null,
                product.weight_mg,
                product.net_weight_value,
                product.net_weight_unit,
                product.options,
                product.raw_options,
                product.image_url,
                product.additional_images,
                product.is_featured,
                product.medical_only,
                product.rec_only,
                product.source_created_at,
                product.source_updated_at,
                product.description,
                product.raw_data ? JSON.stringify(product.raw_data) : null,
            ]);
            // The statement reports whether the row was newly inserted or updated.
            if (result.rows[0]?.was_inserted) {
                inserted++;
            }
            else {
                updated++;
            }
        }
        await client.query('COMMIT');
        return { inserted, updated };
    }
    catch (error) {
        try {
            await client.query('ROLLBACK');
        }
        catch (rollbackError) {
            // Keep the root cause: a failing ROLLBACK must not mask the error that triggered it.
            console.error('[DutchieGraphQL] Rollback failed:', rollbackError?.message ?? rollbackError);
        }
        throw error;
    }
    finally {
        client.release();
    }
}
// =====================================================
// MAIN ENTRY POINT
// =====================================================
/**
 * @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead.
 * This function is disabled: every call rejects with an Error and performs no
 * scraping and no database writes.
 *
 * The previous implementation (fetchDutchieMenuViaPuppeteer → normalizeDutchieProduct
 * → upsertProducts, returning a { success, productsFound, inserted, updated, error }
 * summary) sat after the unconditional throw and could never execute, so it has
 * been removed. Consult version control if the old flow is needed for reference.
 *
 * @param {*} pool - Unused; kept so existing call sites keep the same signature.
 * @param {*} storeId - Unused.
 * @param {*} menuUrl - Unused.
 * @returns {Promise<never>} Always a rejected promise.
 * @throws {Error} Always, with a message pointing at the replacement pipeline.
 */
async function scrapeDutchieMenu(pool, storeId, menuUrl) {
    // DEPRECATED: Throw error to prevent accidental use
    throw new Error('DEPRECATED: scrapeDutchieMenu() is deprecated. ' +
        'Use src/dutchie-az/services/product-crawler.ts instead. ' +
        'This scraper writes to the legacy products table.');
}