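// NOTE: The lines below appear to be repository-viewer metadata captured together with this
// file (last commit message and file statistics); they are not part of the program.
// Commit message: The job_run_logs table tracks scheduled job orchestration, not individual
// worker jobs. Worker info (worker_id, worker_hostname) belongs on dispensary_crawl_jobs,
// not job_run_logs.
// 🤖 Generated with [Claude Code](https://claude.com/claude-code)
// Co-Authored-By: Claude <noreply@anthropic.com>
// File statistics: 447 lines, 18 KiB, JavaScript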
"use strict";
// ============================================================================
// DEPRECATED: This scraper writes to the LEGACY products table.
// DO NOT USE - All Dutchie crawling must use the new dutchie-az pipeline.
//
// New pipeline location: src/dutchie-az/services/product-crawler.ts
// - Uses fetch-based GraphQL (no Puppeteer needed)
// - Writes to isolated dutchie_az_* tables with snapshot model
// - Tracks stockStatus, isPresentInFeed, missing_from_feed
//
// The normalizer functions in this file (normalizeDutchieProduct) may still
// be imported for reference, but do NOT call scrapeDutchieMenu() or upsertProducts().
// ============================================================================
/**
 * CommonJS / ES-module interop helper.
 *
 * Returns `mod` unchanged when it is truthy and flagged with `__esModule`; otherwise wraps
 * it as `{ default: mod }` so callers can always read `.default`. If the module-level
 * `this` already carries an `__importDefault`, that existing helper is reused instead.
 *
 * @param {*} mod - The value returned by `require(...)`.
 * @returns {*} `mod` itself, or `{ default: mod }`.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
// Flag the module as transpiled ESM and publish the public API.
// The exported functions are function declarations, so they are hoisted and already
// defined when these assignments run.
Object.defineProperty(exports, "__esModule", { value: true });
exports.normalizeDutchieProduct = normalizeDutchieProduct;
exports.fetchDutchieMenuViaPuppeteer = fetchDutchieMenuViaPuppeteer;
exports.upsertProducts = upsertProducts;
exports.scrapeDutchieMenu = scrapeDutchieMenu;
/**
 * @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead.
 * This scraper writes to the legacy products table, not the new dutchie_az tables.
 *
 * Fetches product data via Puppeteer interception of Dutchie's GraphQL API.
 * This bypasses Cloudflare by using a real browser to load the menu page.
 *
 * GraphQL Operations:
 * - FilteredProducts: Returns paginated product list with full details
 * - GetAddressBasedDispensaryData: Resolves dispensary cName to dispensaryId
 */
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
// Register the stealth plugin once at module load; browsers are launched through this
// same puppeteer-extra instance further down in the file.
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
// =====================================================
// NORMALIZER: Dutchie GraphQL → DB Schema
// =====================================================
/**
 * Converts a raw feed timestamp into a valid `Date`, or `undefined` when it cannot be parsed.
 *
 * Accepted inputs:
 * - epoch milliseconds as a number or an all-digit string (e.g. "1729044510543")
 * - any string the `Date` constructor can parse (e.g. ISO 8601)
 *
 * Falsy input (undefined, null, '', 0) yields `undefined`.
 *
 * @param {string|number|Date|null|undefined} value - Raw timestamp from the product payload.
 * @returns {Date|undefined} A valid Date, never an Invalid Date.
 */
function parseSourceTimestamp(value) {
    if (!value) {
        return undefined;
    }
    let parsed;
    if (typeof value === 'string' && /^\d+$/.test(value.trim())) {
        // All-digit strings are epoch milliseconds; `new Date("1729...")` would be invalid.
        parsed = new Date(Number(value.trim()));
    }
    else {
        parsed = new Date(value);
    }
    return Number.isNaN(parsed.getTime()) ? undefined : parsed;
}
/**
 * Maps one raw Dutchie GraphQL product onto the flat shape used by the legacy
 * `products` table upsert.
 *
 * Only the first entry of each price array, of `saleSpecials`, of the THC/CBD ranges and
 * of the net-weight values is used. Missing source fields come through as `undefined`;
 * boolean flags are `true` only when the source value is exactly `true`.
 *
 * @param {object} product - Raw product object from a FilteredProducts response.
 * @returns {object} Normalized product; the untouched input is attached as `raw_data`.
 */
function normalizeDutchieProduct(product) {
    const posMeta = product.POSMetaData;
    // Only the first sale special is surfaced as flat columns; the full object is kept in special_data.
    const firstSpecial = product.specialData?.saleSpecials?.[0];
    // Inventory is the sum over all POS child entries; missing quantities count as 0.
    const posChildren = posMeta?.children || [];
    let totalQuantity = 0;
    let availableQuantity = 0;
    for (const child of posChildren) {
        totalQuantity += child.quantity || 0;
        availableQuantity += child.quantityAvailable || 0;
    }
    return {
        // Identity
        external_id: product._id || product.id,
        slug: product.cName,
        name: product.Name,
        enterprise_product_id: product.enterpriseProductId,
        // Brand (flat fields win over the nested brand object)
        brand: product.brandName || product.brand?.name,
        brand_external_id: product.brandId || product.brand?.id,
        brand_logo_url: product.brandLogo || product.brand?.imageUrl,
        // Category
        subcategory: product.subcategory,
        strain_type: product.strainType,
        canonical_category: posMeta?.canonicalCategory,
        // Pricing
        price: product.Prices?.[0],
        rec_price: product.recPrices?.[0],
        med_price: product.medicalPrices?.[0],
        rec_special_price: product.recSpecialPrices?.[0],
        med_special_price: product.medicalSpecialPrices?.[0],
        // Specials
        is_on_special: product.special === true,
        special_name: firstSpecial?.specialName,
        // `discount` is only reported when the special is flagged as a percent discount.
        discount_percent: firstSpecial?.percentDiscount ? firstSpecial.discount : undefined,
        special_data: product.specialData,
        // Inventory (a summed total of 0 is reported as undefined, same as "no data")
        sku: posMeta?.canonicalSKU,
        inventory_quantity: totalQuantity || undefined,
        inventory_available: availableQuantity || undefined,
        is_below_threshold: product.isBelowThreshold === true,
        status: product.Status,
        // Potency
        thc_percentage: product.THCContent?.range?.[0],
        cbd_percentage: product.CBDContent?.range?.[0],
        cannabinoids: product.cannabinoidsV2,
        // Weight/Options
        weight_mg: product.weight,
        net_weight_value: product.measurements?.netWeight?.values?.[0],
        net_weight_unit: product.measurements?.netWeight?.unit,
        options: product.Options,
        raw_options: product.rawOptions,
        // Images (an empty images array is treated as absent)
        image_url: product.Image,
        additional_images: product.images?.length ? product.images : undefined,
        // Flags
        is_featured: product.featured === true,
        medical_only: product.medicalOnly === true,
        rec_only: product.recOnly === true,
        // Timestamps (always a valid Date or undefined)
        source_created_at: parseSourceTimestamp(product.createdAt),
        source_updated_at: parseSourceTimestamp(product.updatedAt),
        // Description (non-string descriptions are dropped)
        description: typeof product.description === 'string' ? product.description : undefined,
        // Raw
        raw_data: product,
    };
}
/**
 * @deprecated Legacy Puppeteer scraper - use src/dutchie-az/services/product-crawler.ts instead.
 *
 * Opens `menuUrl` in a browser launched through puppeteer-extra, listens to every response
 * whose URL contains "graphql", and collects the products found under
 * `data.filteredProducts.products`. The page is scrolled to trigger further loading, then
 * every link matching `a[href*="/products/"]` is visited and scrolled as well.
 *
 * Products are de-duplicated on `_id || id` (the same identity the normalizer uses).
 * Products exposing neither field share the single key `undefined`, so only the first
 * such product is kept.
 *
 * @param {string} menuUrl - URL of the Dutchie menu page to load.
 * @param {object} [options]
 * @param {boolean|string} [options.headless='new'] - Forwarded to the browser launch call.
 * @param {number} [options.timeout=90000] - Navigation timeout (ms) for the initial page load.
 * @param {number} [options.maxScrolls=30] - Maximum scroll attempts on the main page.
 * @param {number} [options.scrollDelayMs=1500] - Pause (ms) after each scroll.
 * @param {number} [options.settleDelayMs=2000] - Final pause (ms) for late responses.
 * @returns {Promise<{products: object[], dispensaryId: string, menuUrl: string}>}
 *   Raw captured products, the dispensary id ('' when not found) and the input URL.
 * @throws Propagates launch and initial-navigation errors; the browser is always closed.
 */
async function fetchDutchieMenuViaPuppeteer(menuUrl, options = {}) {
    const {
        headless = 'new',
        timeout = 90000,
        maxScrolls = 30, // Increased for full menu capture
        scrollDelayMs = 1500,
        settleDelayMs = 2000,
    } = options;
    const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
    const capturedProducts = [];
    // Track seen product IDs to avoid duplicates
    const seenIds = new Set();
    let dispensaryId = '';
    let browser;
    try {
        browser = await puppeteer_extra_1.default.launch({
            headless,
            args: [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-blink-features=AutomationControlled',
            ],
        });
        const page = await browser.newPage();
        // Stealth configuration
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
        await page.setViewport({ width: 1920, height: 1080 });
        await page.evaluateOnNewDocument(() => {
            Object.defineProperty(navigator, 'webdriver', { get: () => false });
            window.chrome = { runtime: {} };
        });
        // Intercept GraphQL responses
        page.on('response', async (response) => {
            if (!response.url().includes('graphql')) {
                return;
            }
            try {
                const contentType = response.headers()['content-type'] || '';
                if (!contentType.includes('application/json')) {
                    return;
                }
                const payload = await response.json();
                // Capture dispensary ID
                const resolvedDispensaryId = payload?.data?.getAddressBasedDispensaryData?.dispensaryData?.dispensaryId;
                if (resolvedDispensaryId) {
                    dispensaryId = resolvedDispensaryId;
                }
                // Capture products from FilteredProducts
                const batch = payload?.data?.filteredProducts?.products;
                if (!batch) {
                    return;
                }
                for (const product of batch) {
                    if (!product) {
                        // A null entry must not abort the rest of the batch.
                        continue;
                    }
                    // Same identity rule as normalizeDutchieProduct's external_id.
                    const productKey = product._id || product.id;
                    if (seenIds.has(productKey)) {
                        continue;
                    }
                    seenIds.add(productKey);
                    capturedProducts.push(product);
                }
            }
            catch {
                // Best effort: ignore bodies that cannot be read or parsed
            }
        });
        // Scrolls to the bottom repeatedly until three consecutive scrolls bring no new
        // product ids, or the attempt limit is reached.
        const scrollToLoadAll = async (maxScrollAttempts = maxScrolls) => {
            let previousCount = 0;
            let idleRounds = 0;
            for (let attempt = 0; attempt < maxScrollAttempts && idleRounds < 3; attempt++) {
                await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
                await pause(scrollDelayMs);
                const currentCount = seenIds.size;
                idleRounds = currentCount === previousCount ? idleRounds + 1 : 0;
                previousCount = currentCount;
            }
        };
        // Navigate to menu
        console.log('[DutchieGraphQL] Loading menu page...');
        await page.goto(menuUrl, {
            waitUntil: 'networkidle2',
            timeout,
        });
        // Get dispensary ID from window.reactEnv if not captured
        if (!dispensaryId) {
            dispensaryId = await page.evaluate(() => {
                const env = window.reactEnv;
                return env?.dispensaryId || env?.retailerId || '';
            });
        }
        // First, scroll through the main page (all products)
        console.log('[DutchieGraphQL] Scrolling main page...');
        await scrollToLoadAll();
        console.log(`[DutchieGraphQL] After main page: ${seenIds.size} products`);
        // Get category links from the navigation (unique hrefs, in document order)
        const categoryLinks = await page.evaluate(() => {
            const links = [];
            document.querySelectorAll('a[href*="/products/"]').forEach((link) => {
                const href = link.href;
                if (href && !links.includes(href)) {
                    links.push(href);
                }
            });
            return links;
        });
        console.log(`[DutchieGraphQL] Found ${categoryLinks.length} category links`);
        // Visit each category page to capture all products
        for (const categoryUrl of categoryLinks) {
            try {
                console.log(`[DutchieGraphQL] Visiting category: ${categoryUrl.split('/').pop()}`);
                await page.goto(categoryUrl, {
                    waitUntil: 'networkidle2',
                    timeout: 30000,
                });
                await scrollToLoadAll(15); // Fewer scrolls per category
                console.log(`[DutchieGraphQL] Total products: ${seenIds.size}`);
            }
            catch (e) {
                // A failing category is logged and skipped; the remaining ones still run.
                console.log(`[DutchieGraphQL] Category error: ${e.message}`);
            }
        }
        // Wait for any final responses
        await pause(settleDelayMs);
        return {
            products: capturedProducts,
            dispensaryId,
            menuUrl,
        };
    }
    finally {
        if (browser) {
            await browser.close();
        }
    }
}
// =====================================================
// DATABASE OPERATIONS
// =====================================================
/**
 * @deprecated Writes to the legacy `products` table - see the deprecation notice at the top
 * of this file.
 *
 * Upserts normalized products for one store inside a single transaction. Rows are matched
 * on `(store_id, slug)`; on conflict every data column except `store_id`, `external_id`,
 * `slug` and `dutchie_url` is overwritten and `last_seen_at` / `updated_at` are set to NOW().
 *
 * @param {object} pool - Connection pool; must provide `connect()` resolving to a client
 *   with `query(sql, params)` and `release()`.
 * @param {*} storeId - Value written to `store_id`.
 * @param {Iterable<object>} products - Products shaped like normalizeDutchieProduct's output.
 * @returns {Promise<{inserted: number, updated: number}>} Counts of new vs. existing rows.
 * @throws Rethrows the first query error after attempting a ROLLBACK; the client is
 *   always released.
 */
async function upsertProducts(pool, storeId, products) {
    // `RETURNING (xmax = 0)` is used to tell a fresh insert from a conflict update.
    const upsertSql = `
      INSERT INTO products (
        store_id, external_id, slug, name, enterprise_product_id,
        brand, brand_external_id, brand_logo_url,
        subcategory, strain_type, canonical_category,
        price, rec_price, med_price, rec_special_price, med_special_price,
        is_on_special, special_name, discount_percent, special_data,
        sku, inventory_quantity, inventory_available, is_below_threshold, status,
        thc_percentage, cbd_percentage, cannabinoids,
        weight_mg, net_weight_value, net_weight_unit, options, raw_options,
        image_url, additional_images,
        is_featured, medical_only, rec_only,
        source_created_at, source_updated_at,
        description, raw_data,
        dutchie_url, last_seen_at, updated_at
      )
      VALUES (
        $1, $2, $3, $4, $5,
        $6, $7, $8,
        $9, $10, $11,
        $12, $13, $14, $15, $16,
        $17, $18, $19, $20,
        $21, $22, $23, $24, $25,
        $26, $27, $28,
        $29, $30, $31, $32, $33,
        $34, $35,
        $36, $37, $38,
        $39, $40,
        $41, $42,
        '', NOW(), NOW()
      )
      ON CONFLICT (store_id, slug) DO UPDATE SET
        name = EXCLUDED.name,
        enterprise_product_id = EXCLUDED.enterprise_product_id,
        brand = EXCLUDED.brand,
        brand_external_id = EXCLUDED.brand_external_id,
        brand_logo_url = EXCLUDED.brand_logo_url,
        subcategory = EXCLUDED.subcategory,
        strain_type = EXCLUDED.strain_type,
        canonical_category = EXCLUDED.canonical_category,
        price = EXCLUDED.price,
        rec_price = EXCLUDED.rec_price,
        med_price = EXCLUDED.med_price,
        rec_special_price = EXCLUDED.rec_special_price,
        med_special_price = EXCLUDED.med_special_price,
        is_on_special = EXCLUDED.is_on_special,
        special_name = EXCLUDED.special_name,
        discount_percent = EXCLUDED.discount_percent,
        special_data = EXCLUDED.special_data,
        sku = EXCLUDED.sku,
        inventory_quantity = EXCLUDED.inventory_quantity,
        inventory_available = EXCLUDED.inventory_available,
        is_below_threshold = EXCLUDED.is_below_threshold,
        status = EXCLUDED.status,
        thc_percentage = EXCLUDED.thc_percentage,
        cbd_percentage = EXCLUDED.cbd_percentage,
        cannabinoids = EXCLUDED.cannabinoids,
        weight_mg = EXCLUDED.weight_mg,
        net_weight_value = EXCLUDED.net_weight_value,
        net_weight_unit = EXCLUDED.net_weight_unit,
        options = EXCLUDED.options,
        raw_options = EXCLUDED.raw_options,
        image_url = EXCLUDED.image_url,
        additional_images = EXCLUDED.additional_images,
        is_featured = EXCLUDED.is_featured,
        medical_only = EXCLUDED.medical_only,
        rec_only = EXCLUDED.rec_only,
        source_created_at = EXCLUDED.source_created_at,
        source_updated_at = EXCLUDED.source_updated_at,
        description = EXCLUDED.description,
        raw_data = EXCLUDED.raw_data,
        last_seen_at = NOW(),
        updated_at = NOW()
      RETURNING (xmax = 0) AS was_inserted
    `;
    // Object-valued columns are sent as JSON text; absent values become SQL NULL.
    const toJsonOrNull = (value) => (value ? JSON.stringify(value) : null);
    const client = await pool.connect();
    let inserted = 0;
    let updated = 0;
    try {
        await client.query('BEGIN');
        for (const product of products) {
            // Parameter order must match the $1..$42 placeholders above.
            const result = await client.query(upsertSql, [
                storeId,
                product.external_id,
                product.slug,
                product.name,
                product.enterprise_product_id,
                product.brand,
                product.brand_external_id,
                product.brand_logo_url,
                product.subcategory,
                product.strain_type,
                product.canonical_category,
                product.price,
                product.rec_price,
                product.med_price,
                product.rec_special_price,
                product.med_special_price,
                product.is_on_special,
                product.special_name,
                product.discount_percent,
                toJsonOrNull(product.special_data),
                product.sku,
                product.inventory_quantity,
                product.inventory_available,
                product.is_below_threshold,
                product.status,
                product.thc_percentage,
                product.cbd_percentage,
                toJsonOrNull(product.cannabinoids),
                product.weight_mg,
                product.net_weight_value,
                product.net_weight_unit,
                product.options,
                product.raw_options,
                product.image_url,
                product.additional_images,
                product.is_featured,
                product.medical_only,
                product.rec_only,
                product.source_created_at,
                product.source_updated_at,
                product.description,
                toJsonOrNull(product.raw_data),
            ]);
            if (result.rows[0]?.was_inserted) {
                inserted++;
            }
            else {
                updated++;
            }
        }
        await client.query('COMMIT');
        return { inserted, updated };
    }
    catch (error) {
        try {
            await client.query('ROLLBACK');
        }
        catch (rollbackError) {
            // A failed rollback must not hide the error that caused it.
            console.error('[DutchieGraphQL] Rollback failed:', rollbackError.message);
        }
        throw error;
    }
    finally {
        client.release();
    }
}
// =====================================================
// MAIN ENTRY POINT
// =====================================================
/**
 * @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead.
 * This function is disabled: the returned promise always rejects, so nothing is scraped
 * and nothing is written.
 *
 * Before it was disabled, the function fetched the menu with
 * fetchDutchieMenuViaPuppeteer(menuUrl), mapped the results through
 * normalizeDutchieProduct, stored them with upsertProducts(pool, storeId, ...) and
 * resolved to `{ success, productsFound, inserted, updated, error? }`. That code sat
 * after the unconditional throw and could never run, so it has been removed.
 *
 * @param {object} pool - Unused; kept for signature compatibility.
 * @param {*} storeId - Unused; kept for signature compatibility.
 * @param {string} menuUrl - Unused; kept for signature compatibility.
 * @returns {Promise<never>} Always rejects.
 * @throws {Error} Always, with a message pointing at the replacement pipeline.
 */
async function scrapeDutchieMenu(pool, storeId, menuUrl) {
    // DEPRECATED: Throw error to prevent accidental use
    throw new Error('DEPRECATED: scrapeDutchieMenu() is deprecated. ' +
        'Use src/dutchie-az/services/product-crawler.ts instead. ' +
        'This scraper writes to the legacy products table.');
}