The job_run_logs table tracks scheduled job orchestration, not individual worker jobs. Worker info (worker_id, worker_hostname) belongs on dispensary_crawl_jobs, not job_run_logs. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
844 lines
35 KiB
JavaScript
844 lines
35 KiB
JavaScript
"use strict";
|
|
/**
|
|
* Dutchie AZ Product Crawler Service
|
|
*
|
|
* Crawls products from Dutchie dispensaries and stores them in the dutchie_az database.
|
|
* Handles normalization from GraphQL response to database entities.
|
|
*
|
|
* IMPORTANT: Uses chunked batch processing per CLAUDE.md Rule #15 to avoid OOM.
|
|
*/
|
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
exports.normalizeProduct = normalizeProduct;
|
|
exports.normalizeSnapshot = normalizeSnapshot;
|
|
exports.crawlDispensaryProducts = crawlDispensaryProducts;
|
|
exports.crawlAllArizonaDispensaries = crawlAllArizonaDispensaries;
|
|
const connection_1 = require("../db/connection");
|
|
const graphql_client_1 = require("./graphql-client");
|
|
const discovery_1 = require("./discovery");
|
|
const types_1 = require("../types");
|
|
const image_storage_1 = require("../../utils/image-storage");
|
|
// Explicit column list for the dispensaries table (avoids SELECT * issues with schema differences).
// Interpolated into the SELECT that loads dispensaries in crawlAllArizonaDispensaries().
const DISPENSARY_COLUMNS = `
id, name, slug, city, state, zip, address, latitude, longitude,
menu_type, menu_url, platform_dispensary_id, website,
provider_detection_data, created_at, updated_at
`;
|
|
// ============================================================
|
|
// BATCH PROCESSING CONFIGURATION
|
|
// ============================================================
|
|
/**
 * Chunk size for batch DB writes (per CLAUDE.md Rule #15).
 * Passed to chunkArray() by the batch upsert, snapshot insert, status update
 * and image download loops in this file.
 */
const BATCH_CHUNK_SIZE = 100;
|
|
// ============================================================
|
|
// NORMALIZATION FUNCTIONS
|
|
// ============================================================
|
|
/**
 * Convert a price expressed in currency units to integer cents.
 *
 * @param {number|null|undefined} price - Price value taken from the payload.
 * @returns {number|undefined} The price multiplied by 100 and rounded, or
 *   undefined when the price is missing or the result is not a finite number.
 */
function toCents(price) {
    if (price === undefined || price === null) {
        return undefined;
    }
    const cents = Math.round(price * 100);
    // Non-numeric payload values produce NaN and aggregated extremes can be
    // +/-Infinity; neither is a valid value for an integer cents column.
    return Number.isFinite(cents) ? cents : undefined;
}
|
|
/**
 * Get the smallest value in an array, ignoring null/undefined entries.
 *
 * @param {Array<number|null|undefined>} arr - Candidate values.
 * @returns {number|undefined} The minimum, or undefined when the array is
 *   missing, empty, or contains only null/undefined entries.
 */
function getMin(arr) {
    if (!arr || arr.length === 0) {
        return undefined;
    }
    const values = arr.filter((n) => n !== null && n !== undefined);
    // Math.min() with no arguments is Infinity, so an all-null list must be
    // reported as "no value" instead.
    if (values.length === 0) {
        return undefined;
    }
    // Folding avoids spreading a large array into call arguments.
    return values.reduce((lowest, n) => Math.min(lowest, n), Infinity);
}
|
|
/**
 * Get the largest value in an array, ignoring null/undefined entries.
 *
 * @param {Array<number|null|undefined>} arr - Candidate values.
 * @returns {number|undefined} The maximum, or undefined when the array is
 *   missing, empty, or contains only null/undefined entries.
 */
function getMax(arr) {
    if (!arr || arr.length === 0) {
        return undefined;
    }
    const values = arr.filter((n) => n !== null && n !== undefined);
    // Math.max() with no arguments is -Infinity, so an all-null list must be
    // reported as "no value" instead.
    if (values.length === 0) {
        return undefined;
    }
    // Folding avoids spreading a large array into call arguments.
    return values.reduce((highest, n) => Math.max(highest, n), -Infinity);
}
|
|
/**
 * Coerce a payload value to a strict boolean.
 *
 * Real booleans are returned unchanged; everything else becomes `defaultVal`.
 * The Dutchie API can return {} or [] where a boolean is expected, which would
 * otherwise cause "invalid input syntax for type boolean" errors, so object
 * values are also logged with a warning for debugging.
 *
 * @param {*} v - Value to normalize.
 * @param {boolean} [defaultVal=false] - Result for any non-boolean input.
 * @returns {boolean} `v` when it is a boolean, otherwise `defaultVal`.
 */
function normBool(v, defaultVal = false) {
    if (typeof v === 'boolean') {
        return v;
    }
    const isUnexpectedObject = typeof v === 'object' && v !== null;
    if (isUnexpectedObject) {
        console.warn(`[normBool] Unexpected object value, coercing to ${defaultVal}:`, JSON.stringify(v));
    }
    return defaultVal;
}
|
|
/**
 * Coerce a payload value to a Date, or undefined when it is not usable.
 *
 * The Dutchie API can return {} or [] where a timestamp is expected, which
 * would otherwise cause "invalid input syntax for type timestamp" errors.
 *
 * @param {*} v - Value to normalize (string, number or Date).
 * @returns {Date|undefined} A new Date built from `v`; undefined when `v` is
 *   falsy, a non-Date object, or does not parse to a valid date.
 */
function normDate(v) {
    if (!v) {
        return undefined;
    }
    const isForeignObject = typeof v === 'object' && !(v instanceof Date);
    if (isForeignObject) {
        console.warn(`[normDate] Unexpected object value, ignoring:`, JSON.stringify(v));
        return undefined;
    }
    const parsed = new Date(v);
    if (!Number.isNaN(parsed.getTime())) {
        return parsed;
    }
    console.warn(`[normDate] Invalid date value, ignoring:`, v);
    return undefined;
}
|
|
/**
 * Extract the cName (Dutchie slug) for a dispensary from its menuUrl,
 * falling back to dispensary.slug.
 *
 * Handled URL formats:
 * - https://dutchie.com/embedded-menu/AZ-Deeply-Rooted -> AZ-Deeply-Rooted
 * - https://dutchie.com/dispensary/sol-flower-dispensary-mcclintock -> sol-flower-dispensary-mcclintock
 * - the same URLs with trailing sub-paths (e.g. .../dispensary/<slug>/products/flower)
 *   still resolve to <slug>
 *
 * When the path has at least two segments but neither marker segment, the
 * last path segment is used.
 *
 * @param {{menuUrl?: string, slug: string}} dispensary - Dispensary record.
 * @returns {string} The extracted cName, or dispensary.slug when menuUrl is
 *   absent, unparseable, or has fewer than two path segments.
 */
function extractCName(dispensary) {
    if (dispensary.menuUrl) {
        try {
            const segments = new URL(dispensary.menuUrl).pathname.split('/').filter(Boolean);
            if (segments.length >= 2) {
                // The slug is the segment right after a known marker; taking
                // the last segment would pick up sub-pages such as "flower".
                const markers = ['embedded-menu', 'dispensary'];
                const markerIndex = segments.findIndex((segment) => markers.includes(segment));
                const hasSlugAfterMarker = markerIndex !== -1 && markerIndex + 1 < segments.length;
                const cName = hasSlugAfterMarker
                    ? segments[markerIndex + 1]
                    : segments[segments.length - 1];
                console.log(`[ProductCrawler] Extracted cName "${cName}" from menuUrl`);
                return cName;
            }
        }
        catch {
            // new URL() throws on malformed input; fall through to the slug.
            console.warn(`[ProductCrawler] Failed to parse menuUrl: ${dispensary.menuUrl}`);
        }
    }
    // Fallback to slug
    console.log(`[ProductCrawler] Using dispensary slug "${dispensary.slug}" as cName`);
    return dispensary.slug;
}
|
|
/**
 * Normalize one POSMetaData.children entry into an option snapshot record.
 *
 * @param {object} child - Raw child entry from a product's POSMetaData.children.
 * @returns {object} Option record. `optionId` is the first truthy value among
 *   canonicalID, canonicalPackageId, canonicalSKU and option, else 'unknown'.
 *   The three price fields are converted with toCents() and the untouched
 *   entry is kept under `rawChildPayload`.
 */
function normalizeOption(child) {
    const { canonicalID, canonicalPackageId, canonicalSKU, option } = child;
    const optionId = canonicalID || canonicalPackageId || canonicalSKU || option || 'unknown';
    return {
        optionId,
        canonicalId: canonicalID,
        canonicalPackageId,
        canonicalSKU,
        canonicalName: child.canonicalName,
        canonicalCategory: child.canonicalCategory,
        canonicalCategoryId: child.canonicalCategoryId,
        canonicalBrandId: child.canonicalBrandId,
        canonicalBrandName: child.canonicalBrandName,
        canonicalStrainId: child.canonicalStrainId,
        canonicalVendorId: child.canonicalVendorId,
        optionLabel: option,
        packageQuantity: child.packageQuantity,
        recEquivalent: child.recEquivalent,
        standardEquivalent: child.standardEquivalent,
        priceCents: toCents(child.price),
        recPriceCents: toCents(child.recPrice),
        medPriceCents: toCents(child.medPrice),
        quantity: child.quantity,
        quantityAvailable: child.quantityAvailable,
        kioskQuantityAvailable: child.kioskQuantityAvailable,
        activeBatchTags: child.activeBatchTags,
        canonicalImgUrl: child.canonicalImgUrl,
        canonicalLabResultUrl: child.canonicalLabResultUrl,
        canonicalEffectivePotencyMg: child.canonicalEffectivePotencyMg,
        rawChildPayload: child,
    };
}
|
|
/**
 * Normalize a raw Dutchie product into the canonical product record
 * (the identity row, as opposed to the per-crawl snapshot).
 *
 * @param {object} raw - Raw product object from the GraphQL response.
 * @param {*} dispensaryId - Internal dispensary id stored on the record.
 * @param {*} platformDispensaryId - Dutchie's id for the dispensary.
 * @returns {object} Product record. `externalProductId` is raw._id, else
 *   raw.id, else ''. Flag fields are coerced with normBool(), timestamps with
 *   normDate(), stock fields come from deriveStockStatus() and
 *   calculateTotalQuantity(), and the raw payload is kept under
 *   `latestRawPayload`.
 */
function normalizeProduct(raw, dispensaryId, platformDispensaryId) {
    const { brand, images, weight } = raw;
    // Every flag defaults to false when the payload value is not a boolean.
    const flag = (value) => normBool(value, false);
    const externalProductId = raw._id || raw.id || '';
    return {
        dispensaryId,
        platform: 'dutchie',
        externalProductId,
        platformDispensaryId,
        cName: raw.cName,
        name: raw.Name,
        // Brand: flat fields win over the nested brand object
        brandName: raw.brandName || brand?.name,
        brandId: raw.brandId || brand?.id,
        brandLogoUrl: raw.brandLogo || brand?.imageUrl,
        // Classification
        type: raw.type,
        subcategory: raw.subcategory,
        strainType: raw.strainType,
        provider: raw.provider,
        // Potency
        thc: raw.THC,
        thcContent: raw.THCContent?.range?.[0],
        cbd: raw.CBD,
        cbdContent: raw.CBDContent?.range?.[0],
        cannabinoidsV2: raw.cannabinoidsV2,
        effects: raw.effects,
        // Status / flags
        status: raw.Status,
        medicalOnly: flag(raw.medicalOnly),
        recOnly: flag(raw.recOnly),
        featured: flag(raw.featured),
        comingSoon: flag(raw.comingSoon),
        certificateOfAnalysisEnabled: flag(raw.certificateOfAnalysisEnabled),
        isBelowThreshold: flag(raw.isBelowThreshold),
        isBelowKioskThreshold: flag(raw.isBelowKioskThreshold),
        optionsBelowThreshold: flag(raw.optionsBelowThreshold),
        optionsBelowKioskThreshold: flag(raw.optionsBelowKioskThreshold),
        // Derived stock status
        stockStatus: (0, types_1.deriveStockStatus)(raw),
        totalQuantityAvailable: (0, types_1.calculateTotalQuantity)(raw),
        // Images
        primaryImageUrl: raw.Image || images?.[0]?.url,
        images,
        // Misc
        measurements: raw.measurements,
        // Numeric weights are stored as strings; anything else passes through.
        weight: typeof weight === 'number' ? String(weight) : weight,
        pastCNames: raw.pastCNames,
        createdAtDutchie: normDate(raw.createdAt),
        updatedAtDutchie: normDate(raw.updatedAt),
        latestRawPayload: raw,
    };
}
|
|
/**
 * Normalize a raw Dutchie product into a point-in-time snapshot record
 * (time-series data: prices, inventory and flags at crawl time).
 *
 * @param {object} raw - Raw product object from the GraphQL response.
 * @param {*} dutchieProductId - Database id of the already-upserted product.
 * @param {*} dispensaryId - Internal dispensary id.
 * @param {*} platformDispensaryId - Dutchie's id for the dispensary.
 * @param {*} pricingType - Pricing type the crawl was run with.
 * @param {string} [crawlMode='mode_a'] - Crawl mode recorded on the snapshot.
 * @returns {object} Snapshot record with `isPresentInFeed: true`, aggregated
 *   min/max prices in cents, inventory totals, the normalized options and the
 *   raw payload. `crawledAt` is the time of this call.
 */
function normalizeSnapshot(raw, dutchieProductId, dispensaryId, platformDispensaryId, pricingType, crawlMode = 'mode_a') {
    const children = raw.POSMetaData?.children || [];
    const options = children.map(normalizeOption);
    // Product-level price lists
    const recPrices = raw.recPrices || [];
    const medPrices = raw.medicalPrices || [];
    const recSpecialPrices = raw.recSpecialPrices || [];
    const medSpecialPrices = raw.medicalSpecialPrices || [];
    const wholesalePrices = raw.wholesalePrices || [];
    // Child-level prices (undefined entries dropped)
    const pluckDefined = (key) => children.map((c) => c[key]).filter((p) => p !== undefined);
    const childRecPrices = pluckDefined('recPrice');
    const childMedPrices = pluckDefined('medPrice');
    const childPrices = pluckDefined('price');
    const recPool = [...recPrices, ...childRecPrices, ...childPrices];
    const medPool = [...medPrices, ...childMedPrices];
    // Inventory - calculateTotalQuantity handles the null (unknown) case
    const totalQty = (0, types_1.calculateTotalQuantity)(raw);
    // Kiosk total stays null unless at least one child reports a number
    let totalKioskQty = null;
    if (children.some((c) => typeof c.kioskQuantityAvailable === 'number')) {
        totalKioskQty = 0;
        for (const c of children) {
            totalKioskQty = totalKioskQty + (c.kioskQuantityAvailable || 0);
        }
    }
    // On special: explicit flag, sale specials present, or a non-null first special price
    const hasSaleSpecials = Boolean(raw.specialData?.saleSpecials && raw.specialData.saleSpecials.length > 0);
    const hasRecSpecial = recSpecialPrices.length > 0 && recSpecialPrices[0] !== null;
    const hasMedSpecial = medSpecialPrices.length > 0 && medSpecialPrices[0] !== null;
    const isOnSpecial = raw.special === true || hasSaleSpecials || hasRecSpecial || hasMedSpecial;
    return {
        dutchieProductId,
        dispensaryId,
        platformDispensaryId,
        externalProductId: raw._id || raw.id || '',
        pricingType,
        crawlMode,
        status: raw.Status,
        featured: normBool(raw.featured, false),
        special: normBool(isOnSpecial, false),
        medicalOnly: normBool(raw.medicalOnly, false),
        recOnly: normBool(raw.recOnly, false),
        // Product was present in feed
        isPresentInFeed: true,
        // Derived stock status
        stockStatus: (0, types_1.deriveStockStatus)(raw),
        // Price summary
        recMinPriceCents: toCents(getMin(recPool)),
        recMaxPriceCents: toCents(getMax(recPool)),
        recMinSpecialPriceCents: toCents(getMin(recSpecialPrices)),
        medMinPriceCents: toCents(getMin(medPool)),
        medMaxPriceCents: toCents(getMax(medPool)),
        medMinSpecialPriceCents: toCents(getMin(medSpecialPrices)),
        wholesaleMinPriceCents: toCents(getMin(wholesalePrices)),
        // Inventory summary - null = unknown, 0 = all OOS
        totalQuantityAvailable: totalQty,
        totalKioskQuantityAvailable: totalKioskQty,
        manualInventory: normBool(raw.manualInventory, false),
        isBelowThreshold: normBool(raw.isBelowThreshold, false),
        isBelowKioskThreshold: normBool(raw.isBelowKioskThreshold, false),
        options,
        rawPayload: raw,
        crawledAt: new Date(),
    };
}
|
|
// ============================================================
|
|
// DATABASE OPERATIONS
|
|
// ============================================================
|
|
/**
 * Insert a product row into dutchie_products, or update the existing row
 * that has the same (dispensary_id, external_product_id).
 *
 * JSON-valued fields (cannabinoidsV2, effects, images, measurements,
 * latestRawPayload) are serialized with JSON.stringify, or sent as null when
 * falsy. The parameter array below must stay in the same order as the
 * $1..$39 placeholders.
 *
 * @param {object} product - Normalized product (see normalizeProduct()).
 * @returns {Promise<*>} The `id` of the inserted or updated row.
 */
async function upsertProduct(product) {
    const toJson = (value) => (value ? JSON.stringify(value) : null);
    const sql = `
        INSERT INTO dutchie_products (
        dispensary_id, platform, external_product_id, platform_dispensary_id,
        c_name, name, brand_name, brand_id, brand_logo_url,
        type, subcategory, strain_type, provider,
        thc, thc_content, cbd, cbd_content, cannabinoids_v2, effects,
        status, medical_only, rec_only, featured, coming_soon, certificate_of_analysis_enabled,
        is_below_threshold, is_below_kiosk_threshold, options_below_threshold, options_below_kiosk_threshold,
        stock_status, total_quantity_available,
        primary_image_url, images, measurements, weight, past_c_names,
        created_at_dutchie, updated_at_dutchie, latest_raw_payload, updated_at
        ) VALUES (
        $1, $2, $3, $4,
        $5, $6, $7, $8, $9,
        $10, $11, $12, $13,
        $14, $15, $16, $17, $18, $19,
        $20, $21, $22, $23, $24, $25,
        $26, $27, $28, $29,
        $30, $31,
        $32, $33, $34, $35, $36,
        $37, $38, $39, NOW()
        )
        ON CONFLICT (dispensary_id, external_product_id) DO UPDATE SET
        c_name = EXCLUDED.c_name,
        name = EXCLUDED.name,
        brand_name = EXCLUDED.brand_name,
        brand_id = EXCLUDED.brand_id,
        brand_logo_url = EXCLUDED.brand_logo_url,
        type = EXCLUDED.type,
        subcategory = EXCLUDED.subcategory,
        strain_type = EXCLUDED.strain_type,
        provider = EXCLUDED.provider,
        thc = EXCLUDED.thc,
        thc_content = EXCLUDED.thc_content,
        cbd = EXCLUDED.cbd,
        cbd_content = EXCLUDED.cbd_content,
        cannabinoids_v2 = EXCLUDED.cannabinoids_v2,
        effects = EXCLUDED.effects,
        status = EXCLUDED.status,
        medical_only = EXCLUDED.medical_only,
        rec_only = EXCLUDED.rec_only,
        featured = EXCLUDED.featured,
        coming_soon = EXCLUDED.coming_soon,
        certificate_of_analysis_enabled = EXCLUDED.certificate_of_analysis_enabled,
        is_below_threshold = EXCLUDED.is_below_threshold,
        is_below_kiosk_threshold = EXCLUDED.is_below_kiosk_threshold,
        options_below_threshold = EXCLUDED.options_below_threshold,
        options_below_kiosk_threshold = EXCLUDED.options_below_kiosk_threshold,
        stock_status = EXCLUDED.stock_status,
        total_quantity_available = EXCLUDED.total_quantity_available,
        primary_image_url = EXCLUDED.primary_image_url,
        images = EXCLUDED.images,
        measurements = EXCLUDED.measurements,
        weight = EXCLUDED.weight,
        past_c_names = EXCLUDED.past_c_names,
        created_at_dutchie = EXCLUDED.created_at_dutchie,
        updated_at_dutchie = EXCLUDED.updated_at_dutchie,
        latest_raw_payload = EXCLUDED.latest_raw_payload,
        updated_at = NOW()
        RETURNING id
    `;
    const params = [
        // $1-$4: identity
        product.dispensaryId,
        product.platform,
        product.externalProductId,
        product.platformDispensaryId,
        // $5-$9: names and brand
        product.cName,
        product.name,
        product.brandName,
        product.brandId,
        product.brandLogoUrl,
        // $10-$13: classification
        product.type,
        product.subcategory,
        product.strainType,
        product.provider,
        // $14-$19: potency
        product.thc,
        product.thcContent,
        product.cbd,
        product.cbdContent,
        toJson(product.cannabinoidsV2),
        toJson(product.effects),
        // $20-$25: status and flags
        product.status,
        product.medicalOnly,
        product.recOnly,
        product.featured,
        product.comingSoon,
        product.certificateOfAnalysisEnabled,
        // $26-$29: thresholds
        product.isBelowThreshold,
        product.isBelowKioskThreshold,
        product.optionsBelowThreshold,
        product.optionsBelowKioskThreshold,
        // $30-$31: stock
        product.stockStatus,
        product.totalQuantityAvailable,
        // $32-$36: images and misc
        product.primaryImageUrl,
        toJson(product.images),
        toJson(product.measurements),
        product.weight,
        product.pastCNames,
        // $37-$39: Dutchie timestamps and raw payload
        product.createdAtDutchie,
        product.updatedAtDutchie,
        toJson(product.latestRawPayload),
    ];
    const { rows } = await (0, connection_1.query)(sql, params);
    return rows[0].id;
}
|
|
/**
 * Download a product's primary image and store the resulting local image
 * URLs on its dutchie_products row.
 *
 * Nothing is downloaded when imageExists() reports that a local image is
 * already present for this dispensary, product and URL. Any exception is
 * caught and reported through the returned object rather than thrown.
 *
 * @param {*} productId - Database id of the dutchie_products row to update.
 * @param {*} dispensaryId - Internal dispensary id.
 * @param {string} externalProductId - Dutchie product id.
 * @param {string|undefined} primaryImageUrl - Remote image URL.
 * @returns {Promise<{downloaded: boolean, error?: string}>} `downloaded` is
 *   true only when a new image was fetched and the row was updated.
 */
async function downloadAndUpdateProductImage(productId, dispensaryId, externalProductId, primaryImageUrl) {
    if (!primaryImageUrl) {
        return { downloaded: false, error: 'No image URL' };
    }
    try {
        // Skip work when this exact image is already stored locally.
        const alreadyStored = await (0, image_storage_1.imageExists)(dispensaryId, externalProductId, primaryImageUrl);
        if (alreadyStored) {
            return { downloaded: false };
        }
        const result = await (0, image_storage_1.downloadProductImage)(primaryImageUrl, dispensaryId, externalProductId);
        const failed = !result.success || !result.urls;
        if (failed) {
            return { downloaded: false, error: result.error };
        }
        // original_image_url is only filled in the first time (COALESCE).
        const updateSql = `
            UPDATE dutchie_products
            SET
            local_image_url = $1,
            local_image_thumb_url = $2,
            local_image_medium_url = $3,
            original_image_url = COALESCE(original_image_url, primary_image_url),
            updated_at = NOW()
            WHERE id = $4
        `;
        const updateParams = [result.urls.full, result.urls.thumb, result.urls.medium, productId];
        await (0, connection_1.query)(updateSql, updateParams);
        return { downloaded: true };
    }
    catch (error) {
        return { downloaded: false, error: error.message };
    }
}
|
|
/**
 * Insert one row into dutchie_product_snapshots.
 *
 * `isPresentInFeed` defaults to true when null/undefined; `options` and
 * `rawPayload` are serialized with JSON.stringify (defaulting to [] and {}).
 * The parameter array below must stay in the same order as the $1..$28
 * placeholders.
 *
 * @param {object} snapshot - Snapshot record (see normalizeSnapshot()).
 * @returns {Promise<*>} The `id` of the inserted row.
 */
async function insertSnapshot(snapshot) {
    const sql = `
        INSERT INTO dutchie_product_snapshots (
        dutchie_product_id, dispensary_id, platform_dispensary_id, external_product_id,
        pricing_type, crawl_mode, status, featured, special, medical_only, rec_only,
        is_present_in_feed, stock_status,
        rec_min_price_cents, rec_max_price_cents, rec_min_special_price_cents,
        med_min_price_cents, med_max_price_cents, med_min_special_price_cents,
        wholesale_min_price_cents,
        total_quantity_available, total_kiosk_quantity_available, manual_inventory,
        is_below_threshold, is_below_kiosk_threshold,
        options, raw_payload, crawled_at
        ) VALUES (
        $1, $2, $3, $4,
        $5, $6, $7, $8, $9, $10, $11,
        $12, $13,
        $14, $15, $16,
        $17, $18, $19,
        $20,
        $21, $22, $23,
        $24, $25,
        $26, $27, $28
        )
        RETURNING id
    `;
    const params = [
        // $1-$4: identity
        snapshot.dutchieProductId,
        snapshot.dispensaryId,
        snapshot.platformDispensaryId,
        snapshot.externalProductId,
        // $5-$11: crawl context and flags
        snapshot.pricingType,
        snapshot.crawlMode,
        snapshot.status,
        snapshot.featured,
        snapshot.special,
        snapshot.medicalOnly,
        snapshot.recOnly,
        // $12-$13: feed presence and stock status
        snapshot.isPresentInFeed ?? true,
        snapshot.stockStatus,
        // $14-$20: price summary
        snapshot.recMinPriceCents,
        snapshot.recMaxPriceCents,
        snapshot.recMinSpecialPriceCents,
        snapshot.medMinPriceCents,
        snapshot.medMaxPriceCents,
        snapshot.medMinSpecialPriceCents,
        snapshot.wholesaleMinPriceCents,
        // $21-$25: inventory summary
        snapshot.totalQuantityAvailable,
        snapshot.totalKioskQuantityAvailable,
        snapshot.manualInventory,
        snapshot.isBelowThreshold,
        snapshot.isBelowKioskThreshold,
        // $26-$28: payloads and timestamp
        JSON.stringify(snapshot.options || []),
        JSON.stringify(snapshot.rawPayload || {}),
        snapshot.crawledAt,
    ];
    const { rows } = await (0, connection_1.query)(sql, params);
    return rows[0].id;
}
|
|
// ============================================================
|
|
// BATCH DATABASE OPERATIONS (per CLAUDE.md Rule #15)
|
|
// ============================================================
|
|
/**
 * Split an array into consecutive chunks of at most `size` elements.
 *
 * @param {Array} array - Items to split; not modified.
 * @param {number} size - Maximum chunk length; must be a positive integer.
 * @returns {Array<Array>} Chunks in original order (empty for an empty array).
 * @throws {RangeError} When `size` is not a positive integer. A zero or
 *   negative step would otherwise never terminate the loop.
 */
function chunkArray(array, size) {
    if (!Number.isInteger(size) || size <= 0) {
        throw new RangeError(`chunkArray size must be a positive integer, got ${size}`);
    }
    const chunks = [];
    for (let start = 0; start < array.length; start += size) {
        chunks.push(array.slice(start, start + size));
    }
    return chunks;
}
|
|
/**
 * Upsert a list of products, working through it in chunks of
 * BATCH_CHUNK_SIZE. Products are upserted one at a time; a failure is logged
 * and the remaining products are still processed.
 *
 * @param {Array<object>} products - Normalized products.
 * @returns {Promise<Map<string, *>>} Map of externalProductId -> database id
 *   for every product that upserted successfully and has a truthy
 *   externalProductId.
 */
async function batchUpsertProducts(products) {
    const chunks = chunkArray(products, BATCH_CHUNK_SIZE);
    const productIdMap = new Map();
    console.log(`[ProductCrawler] Batch upserting ${products.length} products in ${chunks.length} chunks of ${BATCH_CHUNK_SIZE}...`);
    const lastIndex = chunks.length - 1;
    for (const [index, chunk] of chunks.entries()) {
        for (const product of chunk) {
            try {
                const id = await upsertProduct(product);
                if (product.externalProductId) {
                    productIdMap.set(product.externalProductId, id);
                }
            }
            catch (error) {
                console.error(`[ProductCrawler] Error upserting product ${product.externalProductId}:`, error.message);
            }
        }
        // Progress is logged every fifth chunk and after the final one.
        const chunkNumber = index + 1;
        if (chunkNumber % 5 === 0 || index === lastIndex) {
            console.log(`[ProductCrawler] Upserted chunk ${chunkNumber}/${chunks.length} (${productIdMap.size} products so far)`);
        }
    }
    return productIdMap;
}
|
|
/**
 * Insert a list of snapshots, working through it in chunks of
 * BATCH_CHUNK_SIZE. Snapshots are inserted one at a time; a failure is logged
 * and the remaining snapshots are still processed.
 *
 * @param {Array<object>} snapshots - Snapshot records to insert.
 * @returns {Promise<number>} Number of snapshots inserted successfully.
 */
async function batchInsertSnapshots(snapshots) {
    const chunks = chunkArray(snapshots, BATCH_CHUNK_SIZE);
    console.log(`[ProductCrawler] Batch inserting ${snapshots.length} snapshots in ${chunks.length} chunks of ${BATCH_CHUNK_SIZE}...`);
    const lastIndex = chunks.length - 1;
    let inserted = 0;
    for (const [index, chunk] of chunks.entries()) {
        for (const snapshot of chunk) {
            try {
                await insertSnapshot(snapshot);
                inserted += 1;
            }
            catch (error) {
                console.error(`[ProductCrawler] Error inserting snapshot for ${snapshot.externalProductId}:`, error.message);
            }
        }
        // Progress is logged every fifth chunk and after the final one.
        const chunkNumber = index + 1;
        if (chunkNumber % 5 === 0 || index === lastIndex) {
            console.log(`[ProductCrawler] Inserted snapshot chunk ${chunkNumber}/${chunks.length} (${inserted} snapshots so far)`);
        }
    }
    return inserted;
}
|
|
/**
 * Record that a dispensary was just crawled by setting last_crawl_at and
 * updated_at to NOW() on its dispensaries row.
 *
 * @param {*} dispensaryId - Id of the dispensaries row to update.
 * @param {number} productCount - Accepted for interface compatibility but not
 *   written: product_count is skipped because that column may not exist.
 * @returns {Promise<void>}
 */
async function updateDispensaryCrawlStats(dispensaryId, productCount) {
    const sql = `
        UPDATE dispensaries
        SET last_crawl_at = NOW(), updated_at = NOW()
        WHERE id = $1
    `;
    await (0, connection_1.query)(sql, [dispensaryId]);
}
|
|
/**
 * Mark products as missing from the feed.
 *
 * For every existing product of the dispensary whose external id is NOT in
 * the UNION of the Mode A and Mode B id sets, this inserts a snapshot with
 * isPresentInFeed=false and stockStatus='missing_from_feed', then sets the
 * product's stock_status to 'missing_from_feed' and clears its quantity.
 *
 * IMPORTANT: the UNION of both modes is used to avoid false positives. If the
 * union holds no usable ids (possible outage) marking is skipped entirely to
 * avoid data corruption.
 *
 * @param {*} dispensaryId - Internal dispensary id.
 * @param {*} platformDispensaryId - Dutchie's id for the dispensary.
 * @param {Set<string>} modeAProductIds - External ids seen in Mode A.
 * @param {Set<string>} modeBProductIds - External ids seen in Mode B.
 * @param {*} pricingType - Pricing type recorded on the missing snapshots.
 * @returns {Promise<number>} Number of missing-from-feed snapshots inserted.
 */
async function markMissingProducts(dispensaryId, platformDispensaryId, modeAProductIds, modeBProductIds, pricingType) {
    // Build UNION of Mode A + Mode B product IDs.
    // null/undefined ids are dropped: sent to SQL they become NULL, and
    // "x NOT IN (..., NULL)" is never true, which would silently disable
    // missing-product detection for the whole dispensary.
    const unionProductIds = new Set();
    for (const id of [...Array.from(modeAProductIds), ...Array.from(modeBProductIds)]) {
        if (id !== null && id !== undefined) {
            unionProductIds.add(id);
        }
    }
    // OUTAGE DETECTION: If union is empty, something went wrong - don't mark anything as missing
    if (unionProductIds.size === 0) {
        console.warn('[ProductCrawler] OUTAGE DETECTED: Both Mode A and Mode B returned 0 products. Skipping missing product marking.');
        return 0;
    }
    // Get all existing products for this dispensary that were not in the UNION
    const { rows: missingProducts } = await (0, connection_1.query)(`
        SELECT id, external_product_id, name
        FROM dutchie_products
        WHERE dispensary_id = $1
        AND external_product_id NOT IN (SELECT unnest($2::text[]))
    `, [dispensaryId, Array.from(unionProductIds)]);
    if (missingProducts.length === 0) {
        return 0;
    }
    console.log(`[ProductCrawler] Marking ${missingProducts.length} products as missing from feed (union of ${modeAProductIds.size} Mode A + ${modeBProductIds.size} Mode B = ${unionProductIds.size} unique)...`);
    const crawledAt = new Date();
    // Build all missing snapshots first (per CLAUDE.md Rule #15 - batch writes)
    const missingSnapshots = missingProducts.map((product) => ({
        dutchieProductId: product.id,
        dispensaryId,
        platformDispensaryId,
        externalProductId: product.external_product_id,
        pricingType,
        crawlMode: 'mode_a', // Use mode_a for missing snapshots (convention)
        status: undefined,
        featured: false,
        special: false,
        medicalOnly: false,
        recOnly: false,
        isPresentInFeed: false,
        stockStatus: 'missing_from_feed',
        totalQuantityAvailable: undefined, // null = unknown, not 0
        manualInventory: false,
        isBelowThreshold: false,
        isBelowKioskThreshold: false,
        options: [],
        rawPayload: { _missingFromFeed: true, lastKnownName: product.name },
        crawledAt,
    }));
    // Batch insert missing snapshots
    const snapshotsInserted = await batchInsertSnapshots(missingSnapshots);
    // Batch update product stock status in chunks
    const productIds = missingProducts.map((p) => p.id);
    const productChunks = chunkArray(productIds, BATCH_CHUNK_SIZE);
    console.log(`[ProductCrawler] Updating ${productIds.length} product statuses in ${productChunks.length} chunks...`);
    for (const chunk of productChunks) {
        await (0, connection_1.query)(`
            UPDATE dutchie_products
            SET stock_status = 'missing_from_feed', total_quantity_available = NULL, updated_at = NOW()
            WHERE id = ANY($1::int[])
        `, [chunk]);
    }
    console.log(`[ProductCrawler] Marked ${snapshotsInserted} products as missing from feed`);
    return snapshotsInserted;
}
|
|
/**
 * Process the products returned by a single crawl mode.
 *
 * Steps, in order: normalize every raw product, upsert them, insert one
 * snapshot per successfully upserted product, then (optionally) download
 * product images. Snapshots are written BEFORE image downloads so core data
 * is saved even if images fail.
 *
 * IMPORTANT: stores ALL products, never filters before DB. Uses chunked batch
 * processing per CLAUDE.md Rule #15 to avoid OOM.
 *
 * @param {Array<object>} products - Raw products from the GraphQL client.
 * @param {object} dispensary - Dispensary record; `id` and
 *   `platformDispensaryId` are read.
 * @param {*} pricingType - Pricing type recorded on snapshots.
 * @param {string} crawlMode - Crawl mode recorded on snapshots.
 * @param {{downloadImages?: boolean}} [options] - `downloadImages` defaults to true.
 * @returns {Promise<{upserted: number, snapshots: number, productIds: Set<string>,
 *   imagesDownloaded: number, imageErrors: number}>} Counters plus the set of
 *   external product ids seen in `products`.
 */
async function processProducts(products, dispensary, pricingType, crawlMode, options = {}) {
    const { downloadImages = true } = options;
    const productIds = new Set();
    let imagesDownloaded = 0;
    let imageErrors = 0;
    console.log(`[ProductCrawler] Processing ${products.length} products using chunked batch processing...`);
    // Step 1: Normalize all products and collect IDs
    const normalizedProducts = [];
    const rawByExternalId = new Map();
    // Image URL per external id, so the download loop does not have to scan
    // normalizedProducts for every product. The first product seen for an id
    // wins, matching a front-to-back search.
    const imageUrlByExternalId = new Map();
    for (const raw of products) {
        const externalId = raw._id || raw.id || '';
        productIds.add(externalId);
        rawByExternalId.set(externalId, raw);
        const normalized = normalizeProduct(raw, dispensary.id, dispensary.platformDispensaryId);
        normalizedProducts.push(normalized);
        if (!imageUrlByExternalId.has(normalized.externalProductId)) {
            imageUrlByExternalId.set(normalized.externalProductId, normalized.primaryImageUrl);
        }
    }
    // Step 2: Batch upsert products (chunked)
    const productIdMap = await batchUpsertProducts(normalizedProducts);
    const upserted = productIdMap.size;
    // Step 3: Create and batch insert snapshots (chunked)
    const snapshots = [];
    for (const [externalId, productId] of productIdMap.entries()) {
        const raw = rawByExternalId.get(externalId);
        if (raw) {
            snapshots.push(normalizeSnapshot(raw, productId, dispensary.id, dispensary.platformDispensaryId, pricingType, crawlMode));
        }
    }
    const snapshotsInserted = await batchInsertSnapshots(snapshots);
    // Step 4: Download images in chunks (if enabled)
    if (downloadImages) {
        const imageChunks = chunkArray(Array.from(productIdMap.entries()), BATCH_CHUNK_SIZE);
        console.log(`[ProductCrawler] Downloading images in ${imageChunks.length} chunks...`);
        for (let i = 0; i < imageChunks.length; i++) {
            for (const [externalId, productId] of imageChunks[i]) {
                const primaryImageUrl = imageUrlByExternalId.get(externalId);
                if (!primaryImageUrl) {
                    continue;
                }
                try {
                    const imageResult = await downloadAndUpdateProductImage(productId, dispensary.id, externalId, primaryImageUrl);
                    if (imageResult.downloaded) {
                        imagesDownloaded++;
                    }
                    else if (imageResult.error && imageResult.error !== 'No image URL') {
                        imageErrors++;
                    }
                }
                catch (error) {
                    // Image failures never abort the crawl, but they are no
                    // longer swallowed without a trace.
                    imageErrors++;
                    console.warn(`[ProductCrawler] Image download failed for product ${externalId}:`, error?.message ?? error);
                }
            }
            if ((i + 1) % 5 === 0 || i === imageChunks.length - 1) {
                console.log(`[ProductCrawler] Image download chunk ${i + 1}/${imageChunks.length} (${imagesDownloaded} downloaded, ${imageErrors} errors)`);
            }
        }
    }
    // Clear references to help GC
    normalizedProducts.length = 0;
    rawByExternalId.clear();
    imageUrlByExternalId.clear();
    return { upserted, snapshots: snapshotsInserted, productIds, imagesDownloaded, imageErrors };
}
|
|
/**
 * Crawl all products for one dispensary and persist them.
 *
 * Fetches products (both crawl modes merged, or Mode A only), stores products
 * and snapshots via processProducts(), marks products absent from the feed as
 * missing, and updates the dispensary's crawl timestamp. Errors are caught
 * and reported in the returned object rather than thrown.
 *
 * @param {object} dispensary - Dispensary record; must have
 *   `platformDispensaryId`, otherwise a failure result is returned at once.
 * @param {string} [pricingType='rec'] - Pricing type to crawl.
 * @param {{useBothModes?: boolean, downloadImages?: boolean,
 *   onProgress?: function(object): *}} [options] - `useBothModes` and
 *   `downloadImages` default to true; `onProgress` is awaited after products
 *   have been processed.
 * @returns {Promise<object>} Result with `success`, `dispensaryId`, product
 *   and snapshot counters and `durationMs`; `errorMessage` is set on failure.
 */
async function crawlDispensaryProducts(dispensary, pricingType = 'rec', options = {}) {
    const { useBothModes = true, downloadImages = true, onProgress } = options;
    const startTime = Date.now();
    // Shared shape for the two failure exits (missing platform id, exception).
    const failure = (errorMessage) => ({
        success: false,
        dispensaryId: dispensary.id,
        productsFound: 0,
        productsFetched: 0,
        productsUpserted: 0,
        snapshotsCreated: 0,
        errorMessage,
        durationMs: Date.now() - startTime,
    });
    // Collect external ids with the same derivation used when products are
    // stored (_id, then id, then ''). Reading only `_id` would put `undefined`
    // into the set for products keyed by `id`, and those products would then
    // be treated as absent from the feed.
    const collectIds = (products, target) => {
        for (const p of products) {
            target.add(p._id || p.id || '');
        }
    };
    if (!dispensary.platformDispensaryId) {
        return failure('Missing platformDispensaryId');
    }
    try {
        console.log(`[ProductCrawler] Crawling ${dispensary.name} (${dispensary.platformDispensaryId})...`);
        let totalUpserted = 0;
        let totalSnapshots = 0;
        let totalImagesDownloaded = 0;
        let totalImageErrors = 0;
        let modeAProducts = 0;
        let modeBProducts = 0;
        // Track product IDs separately for each mode (needed for missing product detection)
        const modeAProductIds = new Set();
        const modeBProductIds = new Set();
        const applyResult = (result) => {
            totalUpserted = result.upserted;
            totalSnapshots = result.snapshots;
            totalImagesDownloaded = result.imagesDownloaded;
            totalImageErrors = result.imageErrors;
        };
        const reportProgress = async (productsFound) => {
            if (onProgress) {
                await onProgress({
                    productsFound,
                    productsUpserted: totalUpserted,
                    snapshotsCreated: totalSnapshots,
                    currentPage: 1,
                    totalPages: 1,
                });
            }
        };
        // Extract cName for this specific dispensary (used for Puppeteer session & headers)
        const cName = extractCName(dispensary);
        console.log(`[ProductCrawler] Using cName="${cName}" for dispensary ${dispensary.name}`);
        if (useBothModes) {
            // Run two-mode crawl for maximum coverage
            const bothResults = await (0, graphql_client_1.fetchAllProductsBothModes)(dispensary.platformDispensaryId, pricingType, { cName });
            modeAProducts = bothResults.modeA.products.length;
            modeBProducts = bothResults.modeB.products.length;
            console.log(`[ProductCrawler] Two-mode crawl: Mode A=${modeAProducts}, Mode B=${modeBProducts}, Merged=${bothResults.merged.products.length}`);
            collectIds(bothResults.modeA.products, modeAProductIds);
            collectIds(bothResults.modeB.products, modeBProductIds);
            // Process MERGED products (includes options from both modes)
            if (bothResults.merged.products.length > 0) {
                // Use mode_a for merged products (convention)
                applyResult(await processProducts(bothResults.merged.products, dispensary, pricingType, 'mode_a', { downloadImages }));
                await reportProgress(bothResults.merged.products.length);
            }
        }
        else {
            // Single mode crawl (Mode A only)
            const { products, crawlMode } = await (0, graphql_client_1.fetchAllProducts)(dispensary.platformDispensaryId, pricingType, { crawlMode: 'mode_a', cName });
            modeAProducts = products.length;
            collectIds(products, modeAProductIds);
            applyResult(await processProducts(products, dispensary, pricingType, crawlMode, { downloadImages }));
            await reportProgress(products.length);
        }
        // Mark products as missing using UNION of Mode A + Mode B
        // The function handles outage detection (empty union = skip marking)
        const missingMarked = await markMissingProducts(dispensary.id, dispensary.platformDispensaryId, modeAProductIds, modeBProductIds, pricingType);
        totalSnapshots += missingMarked;
        // Update dispensary stats
        await updateDispensaryCrawlStats(dispensary.id, totalUpserted);
        console.log(`[ProductCrawler] Completed: ${totalUpserted} products, ${totalSnapshots} snapshots, ${missingMarked} marked missing, ${totalImagesDownloaded} images downloaded`);
        const totalProductsFound = modeAProducts + modeBProducts;
        return {
            success: true,
            dispensaryId: dispensary.id,
            productsFound: totalProductsFound,
            productsFetched: totalProductsFound,
            productsUpserted: totalUpserted,
            snapshotsCreated: totalSnapshots,
            modeAProducts,
            modeBProducts,
            missingProductsMarked: missingMarked,
            imagesDownloaded: totalImagesDownloaded,
            imageErrors: totalImageErrors,
            durationMs: Date.now() - startTime,
        };
    }
    catch (error) {
        console.error(`[ProductCrawler] Failed to crawl ${dispensary.name}:`, error.message);
        return failure(error.message);
    }
}
|
|
/**
 * Crawl every Arizona Dutchie dispensary that has a platform id.
 *
 * Dispensaries are loaded ordered by id and crawled one after another with
 * crawlDispensaryProducts() using its default options, pausing 2000 ms after
 * each crawl.
 *
 * @param {string} [pricingType='rec'] - Pricing type passed to each crawl.
 * @returns {Promise<Array<object>>} One crawl result per dispensary, in crawl order.
 */
async function crawlAllArizonaDispensaries(pricingType = 'rec') {
    // Get all AZ dispensaries with platform IDs
    const dispensaryQuery = `
        SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
        WHERE state = 'AZ' AND menu_type = 'dutchie' AND platform_dispensary_id IS NOT NULL
        ORDER BY id
    `;
    const { rows: rawRows } = await (0, connection_1.query)(dispensaryQuery);
    const dispensaries = rawRows.map(discovery_1.mapDbRowToDispensary);
    console.log(`[ProductCrawler] Starting crawl of ${dispensaries.length} dispensaries...`);
    const results = [];
    for (const dispensary of dispensaries) {
        results.push(await crawlDispensaryProducts(dispensary, pricingType));
        // Delay between dispensaries
        await new Promise((resolve) => setTimeout(resolve, 2000));
    }
    // Summary totals for the completion log
    let successful = 0;
    let totalProducts = 0;
    let totalSnapshots = 0;
    for (const outcome of results) {
        if (outcome.success) {
            successful += 1;
        }
        totalProducts = totalProducts + outcome.productsUpserted;
        totalSnapshots = totalSnapshots + outcome.snapshotsCreated;
    }
    console.log(`[ProductCrawler] Completed: ${successful}/${dispensaries.length} stores, ${totalProducts} products, ${totalSnapshots} snapshots`);
    return results;
}
|