Files
cannaiq/backend/dist/dutchie-az/services/product-crawler.js
Kelly 66e07b2009 fix(monitor): remove non-existent worker columns from job_run_logs query
The job_run_logs table tracks scheduled job orchestration, not individual
worker jobs. Worker info (worker_id, worker_hostname) belongs on
dispensary_crawl_jobs, not job_run_logs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-03 18:45:05 -07:00

844 lines
35 KiB
JavaScript

"use strict";
/**
* Dutchie AZ Product Crawler Service
*
* Crawls products from Dutchie dispensaries and stores them in the dutchie_az database.
* Handles normalization from GraphQL response to database entities.
*
* IMPORTANT: Uses chunked batch processing per CLAUDE.md Rule #15 to avoid OOM.
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.normalizeProduct = normalizeProduct;
exports.normalizeSnapshot = normalizeSnapshot;
exports.crawlDispensaryProducts = crawlDispensaryProducts;
exports.crawlAllArizonaDispensaries = crawlAllArizonaDispensaries;
const connection_1 = require("../db/connection");
const graphql_client_1 = require("./graphql-client");
const discovery_1 = require("./discovery");
const types_1 = require("../types");
const image_storage_1 = require("../../utils/image-storage");
// Explicit column list for the dispensaries table (avoids SELECT * issues with schema differences).
// The text is interpolated verbatim into the SELECT issued by crawlAllArizonaDispensaries,
// so it must remain a valid comma-separated SQL column list.
const DISPENSARY_COLUMNS = `
id, name, slug, city, state, zip, address, latitude, longitude,
menu_type, menu_url, platform_dispensary_id, website,
provider_detection_data, created_at, updated_at
`;
// ============================================================
// BATCH PROCESSING CONFIGURATION
// ============================================================
/**
* Chunk size for batch DB writes (per CLAUDE.md Rule #15).
* Passed to chunkArray as the maximum number of items per chunk for product
* upserts, snapshot inserts, stock-status updates and image downloads.
*/
const BATCH_CHUNK_SIZE = 100;
// ============================================================
// NORMALIZATION FUNCTIONS
// ============================================================
/**
 * Convert a price expressed in currency units to integer cents.
 *
 * @param {number|string|null|undefined} price - Price to convert. Numeric
 *   strings are coerced the same way the multiplication operator would.
 * @returns {number|undefined} The price rounded to whole cents, or undefined
 *   when the price is missing, is an object/array, or is not a finite number.
 */
function toCents(price) {
    if (price === undefined || price === null)
        return undefined;
    // Objects and arrays coerce to NaN or 0 rather than a real price, so a
    // malformed payload value is treated as "no price" instead.
    if (typeof price === 'object')
        return undefined;
    const cents = Math.round(Number(price) * 100);
    // NaN / +-Infinity are not meaningful cent amounts; report "no price".
    return Number.isFinite(cents) ? cents : undefined;
}
/**
 * Get the smallest value in an array, ignoring null/undefined entries.
 *
 * @param {Array<number|null|undefined>|null|undefined} arr - Candidate values.
 * @returns {number|undefined} The minimum of the present values, or undefined
 *   when the array is missing, empty, or holds only null/undefined entries.
 */
function getMin(arr) {
    if (!arr)
        return undefined;
    const present = arr.filter((n) => n !== null && n !== undefined);
    // Math.min() with no arguments returns Infinity, so the emptiness check
    // has to happen after the null/undefined entries are removed.
    return present.length > 0 ? Math.min(...present) : undefined;
}
/**
 * Get the largest value in an array, ignoring null/undefined entries.
 *
 * @param {Array<number|null|undefined>|null|undefined} arr - Candidate values.
 * @returns {number|undefined} The maximum of the present values, or undefined
 *   when the array is missing, empty, or holds only null/undefined entries.
 */
function getMax(arr) {
    if (!arr)
        return undefined;
    const present = arr.filter((n) => n !== null && n !== undefined);
    // Math.max() with no arguments returns -Infinity, so the emptiness check
    // has to happen after the null/undefined entries are removed.
    return present.length > 0 ? Math.max(...present) : undefined;
}
/**
 * Coerce an arbitrary value to a strict boolean.
 *
 * Real booleans pass through untouched. Anything else (including {} or []
 * payload values, which would otherwise trigger "invalid input syntax for
 * type boolean" errors) becomes `defaultVal`. Non-null object values are
 * additionally reported with console.warn on every call, for debugging.
 *
 * @param {*} v - Value to normalize.
 * @param {boolean} [defaultVal=false] - Result for non-boolean input.
 * @returns {boolean} `v` when it is a boolean, otherwise `defaultVal`.
 */
function normBool(v, defaultVal = false) {
    if (typeof v === 'boolean') {
        return v;
    }
    const isObjectLike = typeof v === 'object' && v !== null;
    if (isObjectLike) {
        console.warn(`[normBool] Unexpected object value, coercing to ${defaultVal}:`, JSON.stringify(v));
    }
    return defaultVal;
}
/**
 * Coerce an arbitrary value to a valid Date, or undefined.
 *
 * Guards against payload values such as {} or [] that would otherwise
 * trigger "invalid input syntax for type timestamp" errors.
 *
 * @param {*} v - Value to normalize (Date, date string, timestamp, ...).
 * @returns {Date|undefined} A new Date built from `v`, or undefined when `v`
 *   is falsy, is a non-Date object, or does not parse to a valid date.
 */
function normDate(v) {
    if (!v) {
        return undefined;
    }
    // Plain objects and arrays are never acceptable date input.
    const isForeignObject = typeof v === 'object' && !(v instanceof Date);
    if (isForeignObject) {
        console.warn(`[normDate] Unexpected object value, ignoring:`, JSON.stringify(v));
        return undefined;
    }
    const parsed = new Date(v);
    if (Number.isNaN(parsed.getTime())) {
        console.warn(`[normDate] Invalid date value, ignoring:`, v);
        return undefined;
    }
    return parsed;
}
/**
 * Resolve the cName (Dutchie slug) for a dispensary.
 *
 * The last path segment of `menuUrl` is used when the URL parses and its path
 * has at least two non-empty segments, e.g.:
 *   - https://dutchie.com/embedded-menu/AZ-Deeply-Rooted -> AZ-Deeply-Rooted
 *   - https://dutchie.com/dispensary/sol-flower-dispensary-mcclintock -> sol-flower-dispensary-mcclintock
 * Otherwise (no menuUrl, unparsable URL, or too few segments) the
 * dispensary's own slug is returned.
 *
 * @param {{menuUrl?: string, slug: string}} dispensary - Dispensary record.
 * @returns {string} The cName to use for this dispensary.
 */
function extractCName(dispensary) {
    const menuUrl = dispensary.menuUrl;
    if (menuUrl) {
        let pathParts = null;
        try {
            pathParts = new URL(menuUrl).pathname.split('/').filter(Boolean);
        }
        catch (e) {
            console.warn(`[ProductCrawler] Failed to parse menuUrl: ${menuUrl}`);
        }
        // Need a prefix segment plus the slug itself: /embedded-menu/X or /dispensary/X
        if (pathParts !== null && pathParts.length >= 2) {
            const lastPart = pathParts[pathParts.length - 1];
            console.log(`[ProductCrawler] Extracted cName "${lastPart}" from menuUrl`);
            return lastPart;
        }
    }
    // No usable menuUrl: fall back to the stored slug.
    console.log(`[ProductCrawler] Using dispensary slug "${dispensary.slug}" as cName`);
    return dispensary.slug;
}
/**
 * Map one POSMetaData.children entry onto the option-snapshot shape.
 *
 * The option id is the first truthy value among canonicalID,
 * canonicalPackageId, canonicalSKU and the option label, or 'unknown' when
 * none is set. Prices are converted to cents with toCents; the untouched
 * child is kept under `rawChildPayload`.
 *
 * @param {object} child - Raw child entry from the product payload.
 * @returns {object} Normalized option snapshot.
 */
function normalizeOption(child) {
    const { canonicalID, canonicalPackageId, canonicalSKU, option } = child;
    return {
        optionId: canonicalID || canonicalPackageId || canonicalSKU || option || 'unknown',
        canonicalId: canonicalID,
        canonicalPackageId,
        canonicalSKU,
        canonicalName: child.canonicalName,
        canonicalCategory: child.canonicalCategory,
        canonicalCategoryId: child.canonicalCategoryId,
        canonicalBrandId: child.canonicalBrandId,
        canonicalBrandName: child.canonicalBrandName,
        canonicalStrainId: child.canonicalStrainId,
        canonicalVendorId: child.canonicalVendorId,
        optionLabel: option,
        packageQuantity: child.packageQuantity,
        recEquivalent: child.recEquivalent,
        standardEquivalent: child.standardEquivalent,
        priceCents: toCents(child.price),
        recPriceCents: toCents(child.recPrice),
        medPriceCents: toCents(child.medPrice),
        quantity: child.quantity,
        quantityAvailable: child.quantityAvailable,
        kioskQuantityAvailable: child.kioskQuantityAvailable,
        activeBatchTags: child.activeBatchTags,
        canonicalImgUrl: child.canonicalImgUrl,
        canonicalLabResultUrl: child.canonicalLabResultUrl,
        canonicalEffectivePotencyMg: child.canonicalEffectivePotencyMg,
        rawChildPayload: child,
    };
}
/**
 * Build the canonical product record from a raw Dutchie product payload.
 *
 * @param {object} raw - Raw product as returned by the GraphQL client.
 * @param {*} dispensaryId - Local dispensary id stored on the record.
 * @param {*} platformDispensaryId - Dutchie-side dispensary id.
 * @returns {object} Normalized product. `externalProductId` is `raw._id`,
 *   else `raw.id`, else ''. Flag fields are forced to booleans with normBool,
 *   the Dutchie timestamps go through normDate, and the untouched payload is
 *   kept under `latestRawPayload`.
 */
function normalizeProduct(raw, dispensaryId, platformDispensaryId) {
    const brand = raw.brand;
    const flag = (value) => normBool(value, false);
    // Numeric weights are stored as strings; other values pass through as-is.
    const weight = typeof raw.weight === 'number' ? String(raw.weight) : raw.weight;
    return {
        dispensaryId,
        platform: 'dutchie',
        externalProductId: raw._id || raw.id || '',
        platformDispensaryId,
        cName: raw.cName,
        name: raw.Name,
        // Brand: flat fields win over the nested brand object
        brandName: raw.brandName || brand?.name,
        brandId: raw.brandId || brand?.id,
        brandLogoUrl: raw.brandLogo || brand?.imageUrl,
        // Classification
        type: raw.type,
        subcategory: raw.subcategory,
        strainType: raw.strainType,
        provider: raw.provider,
        // Potency (content fields take the first entry of the range)
        thc: raw.THC,
        thcContent: raw.THCContent?.range?.[0],
        cbd: raw.CBD,
        cbdContent: raw.CBDContent?.range?.[0],
        cannabinoidsV2: raw.cannabinoidsV2,
        effects: raw.effects,
        // Status / flags
        status: raw.Status,
        medicalOnly: flag(raw.medicalOnly),
        recOnly: flag(raw.recOnly),
        featured: flag(raw.featured),
        comingSoon: flag(raw.comingSoon),
        certificateOfAnalysisEnabled: flag(raw.certificateOfAnalysisEnabled),
        isBelowThreshold: flag(raw.isBelowThreshold),
        isBelowKioskThreshold: flag(raw.isBelowKioskThreshold),
        optionsBelowThreshold: flag(raw.optionsBelowThreshold),
        optionsBelowKioskThreshold: flag(raw.optionsBelowKioskThreshold),
        // Derived stock status
        stockStatus: (0, types_1.deriveStockStatus)(raw),
        totalQuantityAvailable: (0, types_1.calculateTotalQuantity)(raw),
        // Images: explicit Image field first, then the first gallery entry
        primaryImageUrl: raw.Image || raw.images?.[0]?.url,
        images: raw.images,
        // Misc
        measurements: raw.measurements,
        weight,
        pastCNames: raw.pastCNames,
        createdAtDutchie: normDate(raw.createdAt),
        updatedAtDutchie: normDate(raw.updatedAt),
        latestRawPayload: raw,
    };
}
/**
 * Build a point-in-time snapshot (pricing, inventory, flags) from a raw
 * Dutchie product payload.
 *
 * @param {object} raw - Raw product as returned by the GraphQL client.
 * @param {*} dutchieProductId - Database id of the owning product row.
 * @param {*} dispensaryId - Local dispensary id.
 * @param {*} platformDispensaryId - Dutchie-side dispensary id.
 * @param {*} pricingType - Pricing type stored on the snapshot.
 * @param {string} [crawlMode='mode_a'] - Crawl mode stored on the snapshot.
 * @returns {object} Snapshot with `isPresentInFeed` set to true and
 *   `crawledAt` set to the current time.
 */
function normalizeSnapshot(raw, dutchieProductId, dispensaryId, platformDispensaryId, pricingType, crawlMode = 'mode_a') {
    const children = raw.POSMetaData?.children || [];
    const options = children.map(normalizeOption);
    // Values of one per-option field, skipping options that do not define it.
    const definedChildValues = (field) => children.map((c) => c[field]).filter((p) => p !== undefined);
    const recSpecialPrices = raw.recSpecialPrices || [];
    const medSpecialPrices = raw.medicalSpecialPrices || [];
    const wholesalePrices = raw.wholesalePrices || [];
    // Price candidates: product-level lists plus the per-option prices.
    const recCandidates = [
        ...(raw.recPrices || []),
        ...definedChildValues('recPrice'),
        ...definedChildValues('price'),
    ];
    const medCandidates = [
        ...(raw.medicalPrices || []),
        ...definedChildValues('medPrice'),
    ];
    // Inventory: calculateTotalQuantity owns the null handling for the total.
    const totalQty = (0, types_1.calculateTotalQuantity)(raw);
    // Kiosk total stays null (unknown) unless at least one option reports a number.
    let totalKioskQty = null;
    if (children.some((c) => typeof c.kioskQuantityAvailable === 'number')) {
        totalKioskQty = 0;
        for (const c of children) {
            totalKioskQty = totalKioskQty + (c.kioskQuantityAvailable || 0);
        }
    }
    // On special when flagged, when sale specials exist, or when a leading special price is set.
    const saleSpecials = raw.specialData?.saleSpecials;
    const hasSaleSpecials = Boolean(saleSpecials && saleSpecials.length > 0);
    const hasRecSpecialPrice = recSpecialPrices.length > 0 && recSpecialPrices[0] !== null;
    const hasMedSpecialPrice = medSpecialPrices.length > 0 && medSpecialPrices[0] !== null;
    const isOnSpecial = raw.special === true || hasSaleSpecials || hasRecSpecialPrice || hasMedSpecialPrice;
    return {
        dutchieProductId,
        dispensaryId,
        platformDispensaryId,
        externalProductId: raw._id || raw.id || '',
        pricingType,
        crawlMode,
        status: raw.Status,
        featured: normBool(raw.featured, false),
        special: normBool(isOnSpecial, false),
        medicalOnly: normBool(raw.medicalOnly, false),
        recOnly: normBool(raw.recOnly, false),
        // Product was present in feed
        isPresentInFeed: true,
        // Derived stock status
        stockStatus: (0, types_1.deriveStockStatus)(raw),
        // Price summary
        recMinPriceCents: toCents(getMin(recCandidates)),
        recMaxPriceCents: toCents(getMax(recCandidates)),
        recMinSpecialPriceCents: toCents(getMin(recSpecialPrices)),
        medMinPriceCents: toCents(getMin(medCandidates)),
        medMaxPriceCents: toCents(getMax(medCandidates)),
        medMinSpecialPriceCents: toCents(getMin(medSpecialPrices)),
        wholesaleMinPriceCents: toCents(getMin(wholesalePrices)),
        // Inventory summary - null = unknown, 0 = all OOS
        totalQuantityAvailable: totalQty,
        totalKioskQuantityAvailable: totalKioskQty,
        manualInventory: normBool(raw.manualInventory, false),
        isBelowThreshold: normBool(raw.isBelowThreshold, false),
        isBelowKioskThreshold: normBool(raw.isBelowKioskThreshold, false),
        options,
        rawPayload: raw,
        crawledAt: new Date(),
    };
}
// ============================================================
// DATABASE OPERATIONS
// ============================================================
/**
* Upsert a DutchieProduct record.
*
* Inserts one row into dutchie_products. When a row with the same
* (dispensary_id, external_product_id) already exists, every column listed in
* the DO UPDATE clause is overwritten with the incoming value and updated_at
* is refreshed. dispensary_id, platform, external_product_id and
* platform_dispensary_id are not part of that clause and keep their stored
* values.
*
* The order of the parameter array must match the $1..$39 placeholders, which
* follow the order of the INSERT column list.
*
* @param product Normalized product (the shape produced by normalizeProduct).
* @returns The id of the inserted or updated row.
*/
async function upsertProduct(product) {
const result = await (0, connection_1.query)(`
INSERT INTO dutchie_products (
dispensary_id, platform, external_product_id, platform_dispensary_id,
c_name, name, brand_name, brand_id, brand_logo_url,
type, subcategory, strain_type, provider,
thc, thc_content, cbd, cbd_content, cannabinoids_v2, effects,
status, medical_only, rec_only, featured, coming_soon, certificate_of_analysis_enabled,
is_below_threshold, is_below_kiosk_threshold, options_below_threshold, options_below_kiosk_threshold,
stock_status, total_quantity_available,
primary_image_url, images, measurements, weight, past_c_names,
created_at_dutchie, updated_at_dutchie, latest_raw_payload, updated_at
) VALUES (
$1, $2, $3, $4,
$5, $6, $7, $8, $9,
$10, $11, $12, $13,
$14, $15, $16, $17, $18, $19,
$20, $21, $22, $23, $24, $25,
$26, $27, $28, $29,
$30, $31,
$32, $33, $34, $35, $36,
$37, $38, $39, NOW()
)
ON CONFLICT (dispensary_id, external_product_id) DO UPDATE SET
c_name = EXCLUDED.c_name,
name = EXCLUDED.name,
brand_name = EXCLUDED.brand_name,
brand_id = EXCLUDED.brand_id,
brand_logo_url = EXCLUDED.brand_logo_url,
type = EXCLUDED.type,
subcategory = EXCLUDED.subcategory,
strain_type = EXCLUDED.strain_type,
provider = EXCLUDED.provider,
thc = EXCLUDED.thc,
thc_content = EXCLUDED.thc_content,
cbd = EXCLUDED.cbd,
cbd_content = EXCLUDED.cbd_content,
cannabinoids_v2 = EXCLUDED.cannabinoids_v2,
effects = EXCLUDED.effects,
status = EXCLUDED.status,
medical_only = EXCLUDED.medical_only,
rec_only = EXCLUDED.rec_only,
featured = EXCLUDED.featured,
coming_soon = EXCLUDED.coming_soon,
certificate_of_analysis_enabled = EXCLUDED.certificate_of_analysis_enabled,
is_below_threshold = EXCLUDED.is_below_threshold,
is_below_kiosk_threshold = EXCLUDED.is_below_kiosk_threshold,
options_below_threshold = EXCLUDED.options_below_threshold,
options_below_kiosk_threshold = EXCLUDED.options_below_kiosk_threshold,
stock_status = EXCLUDED.stock_status,
total_quantity_available = EXCLUDED.total_quantity_available,
primary_image_url = EXCLUDED.primary_image_url,
images = EXCLUDED.images,
measurements = EXCLUDED.measurements,
weight = EXCLUDED.weight,
past_c_names = EXCLUDED.past_c_names,
created_at_dutchie = EXCLUDED.created_at_dutchie,
updated_at_dutchie = EXCLUDED.updated_at_dutchie,
latest_raw_payload = EXCLUDED.latest_raw_payload,
updated_at = NOW()
RETURNING id
`, [
// $1-$4: identity
product.dispensaryId,
product.platform,
product.externalProductId,
product.platformDispensaryId,
// $5-$9: names and brand
product.cName,
product.name,
product.brandName,
product.brandId,
product.brandLogoUrl,
// $10-$13: classification
product.type,
product.subcategory,
product.strainType,
product.provider,
// $14-$19: potency; the last two are serialized to JSON text, or null when falsy
product.thc,
product.thcContent,
product.cbd,
product.cbdContent,
product.cannabinoidsV2 ? JSON.stringify(product.cannabinoidsV2) : null,
product.effects ? JSON.stringify(product.effects) : null,
// $20-$29: status and flags
product.status,
product.medicalOnly,
product.recOnly,
product.featured,
product.comingSoon,
product.certificateOfAnalysisEnabled,
product.isBelowThreshold,
product.isBelowKioskThreshold,
product.optionsBelowThreshold,
product.optionsBelowKioskThreshold,
// $30-$31: stock
product.stockStatus,
product.totalQuantityAvailable,
// $32-$36: images and misc; images/measurements are serialized to JSON text, or null when falsy
product.primaryImageUrl,
product.images ? JSON.stringify(product.images) : null,
product.measurements ? JSON.stringify(product.measurements) : null,
product.weight,
product.pastCNames,
// $37-$39: Dutchie timestamps and the raw payload (JSON text, or null when falsy)
product.createdAtDutchie,
product.updatedAtDutchie,
product.latestRawPayload ? JSON.stringify(product.latestRawPayload) : null,
]);
// RETURNING id yields the affected row for both the insert and the update path.
return result.rows[0].id;
}
/**
 * Download a product's primary image and record the local copies on the
 * product row. Nothing is downloaded when a local image already exists for
 * this dispensary / product / URL combination.
 *
 * Never throws: any failure is reported through the `error` field.
 *
 * @param {*} productId - Database id of the dutchie_products row to update.
 * @param {*} dispensaryId - Local dispensary id (used for image lookup/storage).
 * @param {*} externalProductId - Dutchie product id (used for image lookup/storage).
 * @param {string|undefined} primaryImageUrl - Remote image URL.
 * @returns {Promise<{downloaded: boolean, error?: string}>} `downloaded` is
 *   true only when a new image was fetched and the row was updated.
 */
async function downloadAndUpdateProductImage(productId, dispensaryId, externalProductId, primaryImageUrl) {
    if (!primaryImageUrl) {
        return { downloaded: false, error: 'No image URL' };
    }
    try {
        // Skip the download when this exact image is already stored locally.
        const alreadyStored = await (0, image_storage_1.imageExists)(dispensaryId, externalProductId, primaryImageUrl);
        if (alreadyStored) {
            return { downloaded: false };
        }
        const download = await (0, image_storage_1.downloadProductImage)(primaryImageUrl, dispensaryId, externalProductId);
        const localUrls = download.success ? download.urls : undefined;
        if (!localUrls) {
            return { downloaded: false, error: download.error };
        }
        // Point the product at the local copies; original_image_url is only
        // filled in the first time (COALESCE keeps an existing value).
        const updateParams = [localUrls.full, localUrls.thumb, localUrls.medium, productId];
        await (0, connection_1.query)(`
UPDATE dutchie_products
SET
local_image_url = $1,
local_image_thumb_url = $2,
local_image_medium_url = $3,
original_image_url = COALESCE(original_image_url, primary_image_url),
updated_at = NOW()
WHERE id = $4
`, updateParams);
        return { downloaded: true };
    }
    catch (error) {
        return { downloaded: false, error: error.message };
    }
}
/**
* Insert a snapshot record.
*
* Adds one row to dutchie_product_snapshots; unlike upsertProduct there is no
* ON CONFLICT clause, so every call creates a new row.
*
* The order of the parameter array must match the $1..$28 placeholders, which
* follow the order of the INSERT column list.
*
* @param snapshot Normalized snapshot (the shape produced by normalizeSnapshot,
* or the reduced "missing from feed" shape built in markMissingProducts).
* @returns The id of the inserted row.
*/
async function insertSnapshot(snapshot) {
const result = await (0, connection_1.query)(`
INSERT INTO dutchie_product_snapshots (
dutchie_product_id, dispensary_id, platform_dispensary_id, external_product_id,
pricing_type, crawl_mode, status, featured, special, medical_only, rec_only,
is_present_in_feed, stock_status,
rec_min_price_cents, rec_max_price_cents, rec_min_special_price_cents,
med_min_price_cents, med_max_price_cents, med_min_special_price_cents,
wholesale_min_price_cents,
total_quantity_available, total_kiosk_quantity_available, manual_inventory,
is_below_threshold, is_below_kiosk_threshold,
options, raw_payload, crawled_at
) VALUES (
$1, $2, $3, $4,
$5, $6, $7, $8, $9, $10, $11,
$12, $13,
$14, $15, $16,
$17, $18, $19,
$20,
$21, $22, $23,
$24, $25,
$26, $27, $28
)
RETURNING id
`, [
// $1-$4: identity
snapshot.dutchieProductId,
snapshot.dispensaryId,
snapshot.platformDispensaryId,
snapshot.externalProductId,
// $5-$11: crawl context and flags
snapshot.pricingType,
snapshot.crawlMode,
snapshot.status,
snapshot.featured,
snapshot.special,
snapshot.medicalOnly,
snapshot.recOnly,
// $12-$13: feed presence (treated as present when not specified) and stock status
snapshot.isPresentInFeed ?? true,
snapshot.stockStatus,
// $14-$20: price summary in cents
snapshot.recMinPriceCents,
snapshot.recMaxPriceCents,
snapshot.recMinSpecialPriceCents,
snapshot.medMinPriceCents,
snapshot.medMaxPriceCents,
snapshot.medMinSpecialPriceCents,
snapshot.wholesaleMinPriceCents,
// $21-$25: inventory summary
snapshot.totalQuantityAvailable,
snapshot.totalKioskQuantityAvailable,
snapshot.manualInventory,
snapshot.isBelowThreshold,
snapshot.isBelowKioskThreshold,
// $26-$28: JSON text for options / raw payload (empty array / object when missing), crawl time
JSON.stringify(snapshot.options || []),
JSON.stringify(snapshot.rawPayload || {}),
snapshot.crawledAt,
]);
return result.rows[0].id;
}
// ============================================================
// BATCH DATABASE OPERATIONS (per CLAUDE.md Rule #15)
// ============================================================
/**
 * Split an array into consecutive chunks of at most `size` items.
 *
 * @param {Array} array - Items to split; the input is not modified.
 * @param {number} size - Maximum chunk length; must be a positive integer.
 * @returns {Array<Array>} The chunks in original order (empty for empty input).
 * @throws {RangeError} When `size` is not a positive integer. A zero,
 *   negative or NaN size would otherwise never advance the loop and grow the
 *   result without bound.
 */
function chunkArray(array, size) {
    if (!Number.isInteger(size) || size <= 0) {
        throw new RangeError(`chunkArray: size must be a positive integer, got ${size}`);
    }
    const chunks = [];
    for (let i = 0; i < array.length; i += size) {
        chunks.push(array.slice(i, i + size));
    }
    return chunks;
}
/**
 * Upsert products chunk by chunk (BATCH_CHUNK_SIZE items per chunk), one
 * upsertProduct call at a time.
 *
 * A failing product is logged and skipped so the rest of the batch still
 * runs. Progress is logged after every fifth chunk and after the last one.
 *
 * @param {Array<object>} products - Normalized products to store.
 * @returns {Promise<Map>} externalProductId -> database id, for every product
 *   that was stored and has a truthy externalProductId.
 */
async function batchUpsertProducts(products) {
    const idsByExternalId = new Map();
    const batches = chunkArray(products, BATCH_CHUNK_SIZE);
    const batchCount = batches.length;
    console.log(`[ProductCrawler] Batch upserting ${products.length} products in ${batchCount} chunks of ${BATCH_CHUNK_SIZE}...`);
    let batchNumber = 0;
    for (const batch of batches) {
        batchNumber += 1;
        for (const product of batch) {
            try {
                const dbId = await upsertProduct(product);
                // Products without an external id cannot be keyed, so they are not mapped.
                if (product.externalProductId) {
                    idsByExternalId.set(product.externalProductId, dbId);
                }
            }
            catch (error) {
                console.error(`[ProductCrawler] Error upserting product ${product.externalProductId}:`, error.message);
            }
        }
        const isReportingPoint = batchNumber % 5 === 0 || batchNumber === batchCount;
        if (isReportingPoint) {
            console.log(`[ProductCrawler] Upserted chunk ${batchNumber}/${batchCount} (${idsByExternalId.size} products so far)`);
        }
    }
    return idsByExternalId;
}
/**
 * Insert snapshots chunk by chunk (BATCH_CHUNK_SIZE items per chunk), one
 * insertSnapshot call at a time.
 *
 * A failing snapshot is logged and skipped so the rest of the batch still
 * runs. Progress is logged after every fifth chunk and after the last one.
 *
 * @param {Array<object>} snapshots - Normalized snapshots to store.
 * @returns {Promise<number>} How many snapshots were inserted successfully.
 */
async function batchInsertSnapshots(snapshots) {
    const batches = chunkArray(snapshots, BATCH_CHUNK_SIZE);
    const batchCount = batches.length;
    let insertedCount = 0;
    console.log(`[ProductCrawler] Batch inserting ${snapshots.length} snapshots in ${batchCount} chunks of ${BATCH_CHUNK_SIZE}...`);
    let batchNumber = 0;
    for (const batch of batches) {
        batchNumber += 1;
        for (const snapshot of batch) {
            try {
                await insertSnapshot(snapshot);
                insertedCount += 1;
            }
            catch (error) {
                console.error(`[ProductCrawler] Error inserting snapshot for ${snapshot.externalProductId}:`, error.message);
            }
        }
        const isReportingPoint = batchNumber % 5 === 0 || batchNumber === batchCount;
        if (isReportingPoint) {
            console.log(`[ProductCrawler] Inserted snapshot chunk ${batchNumber}/${batchCount} (${insertedCount} snapshots so far)`);
        }
    }
    return insertedCount;
}
/**
 * Stamp a dispensary as just crawled: sets last_crawl_at and updated_at to
 * NOW() on its dispensaries row.
 *
 * @param {*} dispensaryId - Id of the dispensaries row to update.
 * @param {number} productCount - Currently unused. It is accepted for the
 *   caller's benefit but not written, because the product_count column may
 *   not exist.
 * @returns {Promise<void>}
 */
async function updateDispensaryCrawlStats(dispensaryId, productCount) {
    const touchCrawlTimestampSql = `
UPDATE dispensaries
SET last_crawl_at = NOW(), updated_at = NOW()
WHERE id = $1
`;
    const params = [dispensaryId];
    await (0, connection_1.query)(touchCrawlTimestampSql, params);
}
/**
 * Mark products as missing from feed.
 *
 * Every stored product of the dispensary whose external id is NOT in the
 * UNION of the Mode A and Mode B id sets gets a snapshot with
 * isPresentInFeed=false and stockStatus='missing_from_feed', and its product
 * row is switched to stock_status 'missing_from_feed' with a NULL quantity.
 *
 * IMPORTANT: the UNION of both modes is used to avoid false positives. If the
 * union is empty (possible outage) nothing is marked, to avoid data corruption.
 *
 * Null, undefined and empty-string ids are dropped from the union before it
 * is used. A NULL element inside `NOT IN (...)` makes the predicate evaluate
 * to NULL for every row, which would silently disable missing-product
 * detection for the whole dispensary.
 *
 * @param {*} dispensaryId - Local dispensary id.
 * @param {*} platformDispensaryId - Dutchie-side dispensary id (stored on the snapshots).
 * @param {Iterable<string>} modeAProductIds - External ids seen in Mode A (a Set in practice; `.size` is logged).
 * @param {Iterable<string>} modeBProductIds - External ids seen in Mode B (a Set in practice; `.size` is logged).
 * @param {*} pricingType - Pricing type stored on the snapshots.
 * @returns {Promise<number>} Number of "missing" snapshots inserted.
 */
async function markMissingProducts(dispensaryId, platformDispensaryId, modeAProductIds, modeBProductIds, pricingType) {
    // Build UNION of Mode A + Mode B product IDs, ignoring unusable ids
    const unionProductIds = new Set();
    for (const modeIds of [modeAProductIds, modeBProductIds]) {
        for (const id of modeIds) {
            if (id !== undefined && id !== null && id !== '') {
                unionProductIds.add(id);
            }
        }
    }
    // OUTAGE DETECTION: If union is empty, something went wrong - don't mark anything as missing
    if (unionProductIds.size === 0) {
        console.warn('[ProductCrawler] OUTAGE DETECTED: Both Mode A and Mode B returned 0 products. Skipping missing product marking.');
        return 0;
    }
    // Get all existing products for this dispensary that were not in the UNION
    const { rows: missingProducts } = await (0, connection_1.query)(`
SELECT id, external_product_id, name
FROM dutchie_products
WHERE dispensary_id = $1
AND external_product_id NOT IN (SELECT unnest($2::text[]))
`, [dispensaryId, Array.from(unionProductIds)]);
    if (missingProducts.length === 0) {
        return 0;
    }
    console.log(`[ProductCrawler] Marking ${missingProducts.length} products as missing from feed (union of ${modeAProductIds.size} Mode A + ${modeBProductIds.size} Mode B = ${unionProductIds.size} unique)...`);
    const crawledAt = new Date();
    // Build all missing snapshots first (per CLAUDE.md Rule #15 - batch writes)
    const missingSnapshots = missingProducts.map((product) => ({
        dutchieProductId: product.id,
        dispensaryId,
        platformDispensaryId,
        externalProductId: product.external_product_id,
        pricingType,
        crawlMode: 'mode_a', // Use mode_a for missing snapshots (convention)
        status: undefined,
        featured: false,
        special: false,
        medicalOnly: false,
        recOnly: false,
        isPresentInFeed: false,
        stockStatus: 'missing_from_feed',
        totalQuantityAvailable: undefined, // null = unknown, not 0
        manualInventory: false,
        isBelowThreshold: false,
        isBelowKioskThreshold: false,
        options: [],
        rawPayload: { _missingFromFeed: true, lastKnownName: product.name },
        crawledAt,
    }));
    // Batch insert missing snapshots
    const snapshotsInserted = await batchInsertSnapshots(missingSnapshots);
    // Batch update product stock status in chunks
    const productIds = missingProducts.map((p) => p.id);
    const productChunks = chunkArray(productIds, BATCH_CHUNK_SIZE);
    console.log(`[ProductCrawler] Updating ${productIds.length} product statuses in ${productChunks.length} chunks...`);
    for (const chunk of productChunks) {
        await (0, connection_1.query)(`
UPDATE dutchie_products
SET stock_status = 'missing_from_feed', total_quantity_available = NULL, updated_at = NOW()
WHERE id = ANY($1::int[])
`, [chunk]);
    }
    console.log(`[ProductCrawler] Marked ${snapshotsInserted} products as missing from feed`);
    return snapshotsInserted;
}
/**
 * Process a batch of products from a single crawl mode.
 *
 * IMPORTANT: Stores ALL products, never filters before DB.
 * Uses chunked batch processing per CLAUDE.md Rule #15 to avoid OOM.
 *
 * Steps: normalize every product, upsert them, insert one snapshot per
 * stored product, then (optionally) download primary images. Snapshots are
 * written before images so core data is saved even if image downloads fail.
 *
 * @param {Array<object>} products - Raw products from the GraphQL client.
 * @param {object} dispensary - Dispensary record; `id` and `platformDispensaryId` are read.
 * @param {*} pricingType - Pricing type stored on the snapshots.
 * @param {string} crawlMode - Crawl mode stored on the snapshots.
 * @param {{downloadImages?: boolean}} [options] - `downloadImages` defaults to true.
 * @returns {Promise<{upserted: number, snapshots: number, productIds: Set, imagesDownloaded: number, imageErrors: number}>}
 *   `productIds` holds the external id of every input product ('' for
 *   products that carry neither `_id` nor `id`).
 */
async function processProducts(products, dispensary, pricingType, crawlMode, options = {}) {
    const { downloadImages = true } = options;
    const productIds = new Set();
    let imagesDownloaded = 0;
    let imageErrors = 0;
    console.log(`[ProductCrawler] Processing ${products.length} products using chunked batch processing...`);
    // Step 1: Normalize all products and collect IDs
    const normalizedProducts = [];
    const rawByExternalId = new Map();
    // externalProductId -> primaryImageUrl, so the image step can look a
    // product up in O(1) instead of scanning normalizedProducts per product.
    const imageUrlByExternalId = new Map();
    for (const raw of products) {
        const externalId = raw._id || raw.id || '';
        productIds.add(externalId);
        rawByExternalId.set(externalId, raw);
        const normalized = normalizeProduct(raw, dispensary.id, dispensary.platformDispensaryId);
        normalizedProducts.push(normalized);
        // Keep the first product seen for an id, as a linear find() would.
        if (!imageUrlByExternalId.has(normalized.externalProductId)) {
            imageUrlByExternalId.set(normalized.externalProductId, normalized.primaryImageUrl);
        }
    }
    // Step 2: Batch upsert products (chunked)
    const productIdMap = await batchUpsertProducts(normalizedProducts);
    const upserted = productIdMap.size;
    // Step 3: Create and batch insert snapshots (chunked)
    // IMPORTANT: Do this BEFORE image downloads to ensure snapshots are created even if images fail
    const snapshots = [];
    for (const [externalId, productId] of productIdMap.entries()) {
        const raw = rawByExternalId.get(externalId);
        if (raw) {
            snapshots.push(normalizeSnapshot(raw, productId, dispensary.id, dispensary.platformDispensaryId, pricingType, crawlMode));
        }
    }
    const snapshotsInserted = await batchInsertSnapshots(snapshots);
    // Step 4: Download images in chunks (if enabled)
    // This is done AFTER snapshots to ensure core data is saved even if image downloads fail
    if (downloadImages) {
        const imageChunks = chunkArray(Array.from(productIdMap.entries()), BATCH_CHUNK_SIZE);
        console.log(`[ProductCrawler] Downloading images in ${imageChunks.length} chunks...`);
        for (let i = 0; i < imageChunks.length; i++) {
            for (const [externalId, productId] of imageChunks[i]) {
                const primaryImageUrl = imageUrlByExternalId.get(externalId);
                if (!primaryImageUrl) {
                    continue;
                }
                try {
                    const imageResult = await downloadAndUpdateProductImage(productId, dispensary.id, externalId, primaryImageUrl);
                    if (imageResult.downloaded) {
                        imagesDownloaded++;
                    }
                    else if (imageResult.error && imageResult.error !== 'No image URL') {
                        imageErrors++;
                    }
                }
                catch (error) {
                    // An image failure must never abort the crawl; it is only counted.
                    imageErrors++;
                }
            }
            if ((i + 1) % 5 === 0 || i === imageChunks.length - 1) {
                console.log(`[ProductCrawler] Image download chunk ${i + 1}/${imageChunks.length} (${imagesDownloaded} downloaded, ${imageErrors} errors)`);
            }
        }
    }
    // Clear references to help GC
    normalizedProducts.length = 0;
    rawByExternalId.clear();
    imageUrlByExternalId.clear();
    return { upserted, snapshots: snapshotsInserted, productIds, imagesDownloaded, imageErrors };
}
/**
 * Add the external id of each raw product to `target`.
 * Uses the same `_id`-then-`id` fallback as the normalizers and skips
 * products that carry neither, so the set never contains undefined or ''.
 */
function collectExternalProductIds(products, target) {
    for (const p of products) {
        const externalId = p._id || p.id;
        if (externalId) {
            target.add(externalId);
        }
    }
}
/**
 * Crawl one dispensary: fetch its products, store products and snapshots,
 * mark products that disappeared from the feed, and stamp the dispensary as
 * crawled.
 *
 * Never throws: failures are reported through `success: false` and
 * `errorMessage` on the returned result.
 *
 * @param {object} dispensary - Dispensary record; `platformDispensaryId` is required.
 * @param {string} [pricingType='rec'] - Pricing type passed to the fetchers and stored on snapshots.
 * @param {object} [options]
 * @param {boolean} [options.useBothModes=true] - Fetch Mode A and Mode B and
 *   process the merged list; when false only Mode A is fetched.
 * @param {boolean} [options.downloadImages=true] - Download product images.
 * @param {Function} [options.onProgress] - Awaited once after products are processed.
 * @returns {Promise<object>} Crawl result with counts and `durationMs`.
 */
async function crawlDispensaryProducts(dispensary, pricingType = 'rec', options = {}) {
    const { useBothModes = true, downloadImages = true, onProgress } = options;
    const startTime = Date.now();
    if (!dispensary.platformDispensaryId) {
        return {
            success: false,
            dispensaryId: dispensary.id,
            productsFound: 0,
            productsFetched: 0,
            productsUpserted: 0,
            snapshotsCreated: 0,
            errorMessage: 'Missing platformDispensaryId',
            durationMs: Date.now() - startTime,
        };
    }
    try {
        console.log(`[ProductCrawler] Crawling ${dispensary.name} (${dispensary.platformDispensaryId})...`);
        let totalUpserted = 0;
        let totalSnapshots = 0;
        let totalImagesDownloaded = 0;
        let totalImageErrors = 0;
        let modeAProducts = 0;
        let modeBProducts = 0;
        let missingMarked = 0;
        // Track product IDs separately for each mode (needed for missing product detection)
        const modeAProductIds = new Set();
        const modeBProductIds = new Set();
        // Extract cName for this specific dispensary (used for Puppeteer session & headers)
        const cName = extractCName(dispensary);
        console.log(`[ProductCrawler] Using cName="${cName}" for dispensary ${dispensary.name}`);
        if (useBothModes) {
            // Run two-mode crawl for maximum coverage
            const bothResults = await (0, graphql_client_1.fetchAllProductsBothModes)(dispensary.platformDispensaryId, pricingType, { cName });
            modeAProducts = bothResults.modeA.products.length;
            modeBProducts = bothResults.modeB.products.length;
            console.log(`[ProductCrawler] Two-mode crawl: Mode A=${modeAProducts}, Mode B=${modeBProducts}, Merged=${bothResults.merged.products.length}`);
            // Collect the ids seen by each mode
            collectExternalProductIds(bothResults.modeA.products, modeAProductIds);
            collectExternalProductIds(bothResults.modeB.products, modeBProductIds);
            // Process MERGED products (includes options from both modes)
            if (bothResults.merged.products.length > 0) {
                const mergedResult = await processProducts(bothResults.merged.products, dispensary, pricingType, 'mode_a', // Use mode_a for merged products (convention)
                { downloadImages });
                totalUpserted = mergedResult.upserted;
                totalSnapshots = mergedResult.snapshots;
                totalImagesDownloaded = mergedResult.imagesDownloaded;
                totalImageErrors = mergedResult.imageErrors;
                // Report progress
                if (onProgress) {
                    await onProgress({
                        productsFound: bothResults.merged.products.length,
                        productsUpserted: totalUpserted,
                        snapshotsCreated: totalSnapshots,
                        currentPage: 1,
                        totalPages: 1,
                    });
                }
            }
        }
        else {
            // Single mode crawl (Mode A only)
            const { products, crawlMode } = await (0, graphql_client_1.fetchAllProducts)(dispensary.platformDispensaryId, pricingType, { crawlMode: 'mode_a', cName });
            modeAProducts = products.length;
            // Collect Mode A product IDs
            collectExternalProductIds(products, modeAProductIds);
            const result = await processProducts(products, dispensary, pricingType, crawlMode, { downloadImages });
            totalUpserted = result.upserted;
            totalSnapshots = result.snapshots;
            totalImagesDownloaded = result.imagesDownloaded;
            totalImageErrors = result.imageErrors;
            // Report progress
            if (onProgress) {
                await onProgress({
                    productsFound: products.length,
                    productsUpserted: totalUpserted,
                    snapshotsCreated: totalSnapshots,
                    currentPage: 1,
                    totalPages: 1,
                });
            }
        }
        // Mark products as missing using UNION of Mode A + Mode B
        // The function handles outage detection (empty union = skip marking)
        missingMarked = await markMissingProducts(dispensary.id, dispensary.platformDispensaryId, modeAProductIds, modeBProductIds, pricingType);
        totalSnapshots += missingMarked;
        // Update dispensary stats
        await updateDispensaryCrawlStats(dispensary.id, totalUpserted);
        console.log(`[ProductCrawler] Completed: ${totalUpserted} products, ${totalSnapshots} snapshots, ${missingMarked} marked missing, ${totalImagesDownloaded} images downloaded`);
        // NOTE(review): this is the sum of both mode counts, so a product returned
        // by both modes is counted twice - confirm consumers expect that.
        const totalProductsFound = modeAProducts + modeBProducts;
        return {
            success: true,
            dispensaryId: dispensary.id,
            productsFound: totalProductsFound,
            productsFetched: totalProductsFound,
            productsUpserted: totalUpserted,
            snapshotsCreated: totalSnapshots,
            modeAProducts,
            modeBProducts,
            missingProductsMarked: missingMarked,
            imagesDownloaded: totalImagesDownloaded,
            imageErrors: totalImageErrors,
            durationMs: Date.now() - startTime,
        };
    }
    catch (error) {
        console.error(`[ProductCrawler] Failed to crawl ${dispensary.name}:`, error.message);
        return {
            success: false,
            dispensaryId: dispensary.id,
            productsFound: 0,
            productsFetched: 0,
            productsUpserted: 0,
            snapshotsCreated: 0,
            errorMessage: error.message,
            durationMs: Date.now() - startTime,
        };
    }
}
/**
 * Crawl every Arizona Dutchie dispensary that has a platform id, one at a
 * time in id order, waiting two seconds after each dispensary.
 *
 * @param {string} [pricingType='rec'] - Pricing type passed to each crawl.
 * @returns {Promise<Array<object>>} One crawlDispensaryProducts result per
 *   dispensary, in crawl order.
 */
async function crawlAllArizonaDispensaries(pricingType = 'rec') {
    const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
    // Get all AZ dispensaries with platform IDs
    const { rows: rawRows } = await (0, connection_1.query)(`
SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
WHERE state = 'AZ' AND menu_type = 'dutchie' AND platform_dispensary_id IS NOT NULL
ORDER BY id
`);
    const dispensaries = rawRows.map(discovery_1.mapDbRowToDispensary);
    console.log(`[ProductCrawler] Starting crawl of ${dispensaries.length} dispensaries...`);
    const results = [];
    for (const dispensary of dispensaries) {
        results.push(await crawlDispensaryProducts(dispensary, pricingType));
        // Delay between dispensaries
        await pause(2000);
    }
    // Summarize the run in a single pass over the results.
    let successful = 0;
    let totalProducts = 0;
    let totalSnapshots = 0;
    for (const r of results) {
        if (r.success) {
            successful += 1;
        }
        totalProducts = totalProducts + r.productsUpserted;
        totalSnapshots = totalSnapshots + r.snapshotsCreated;
    }
    console.log(`[ProductCrawler] Completed: ${successful}/${dispensaries.length} stores, ${totalProducts} products, ${totalSnapshots} snapshots`);
    return results;
}