fix(monitor): remove non-existent worker columns from job_run_logs query
The job_run_logs table tracks scheduled job orchestration, not individual worker jobs. Worker info (worker_id, worker_hostname) belongs on dispensary_crawl_jobs, not job_run_logs. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
360
backend/dist/scrapers/dutchie-graphql-direct.js
vendored
Normal file
360
backend/dist/scrapers/dutchie-graphql-direct.js
vendored
Normal file
@@ -0,0 +1,360 @@
|
||||
"use strict";
|
||||
// ============================================================================
|
||||
// DEPRECATED: This scraper writes to the LEGACY products table.
|
||||
// DO NOT USE - All Dutchie crawling must use the new dutchie-az pipeline.
|
||||
//
|
||||
// New pipeline location: src/dutchie-az/services/product-crawler.ts
|
||||
// - Uses fetch-based GraphQL (no Puppeteer needed)
|
||||
// - Writes to isolated dutchie_az_* tables with snapshot model
|
||||
// - Tracks stockStatus, isPresentInFeed, missing_from_feed
|
||||
// ============================================================================
|
||||
var __importDefault = (this && this.__importDefault) || function (mod) {
|
||||
return (mod && mod.__esModule) ? mod : { "default": mod };
|
||||
};
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.fetchAllDutchieProducts = fetchAllDutchieProducts;
|
||||
exports.upsertProductsDirect = upsertProductsDirect;
|
||||
exports.scrapeAllDutchieProducts = scrapeAllDutchieProducts;
|
||||
/**
|
||||
* @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead.
|
||||
* This scraper writes to the legacy products table, not the new dutchie_az tables.
|
||||
*
|
||||
* Makes direct GraphQL requests from within the browser context to:
|
||||
* 1. Bypass Cloudflare (using browser session)
|
||||
* 2. Fetch ALL products including out-of-stock (Status: null)
|
||||
* 3. Paginate through complete menu
|
||||
*/
|
||||
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
|
||||
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
|
||||
const dutchie_graphql_1 = require("./dutchie-graphql");
|
||||
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
|
||||
/**
 * SHA-256 hashes of Dutchie persisted GraphQL queries, keyed by operation name.
 *
 * The `FilteredProducts` hash is sent as `extensions.persistedQuery.sha256Hash`
 * when paging through a menu. The object is frozen so this shared lookup table
 * cannot be modified at runtime.
 */
const GRAPHQL_HASHES = Object.freeze({
    FilteredProducts: 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0',
    GetAddressBasedDispensaryData: '13461f73abf7268770dfd05fe7e10c523084b2bb916a929c08efe3d87531977b',
});
|
||||
/**
 * Fetch all products of a Dutchie menu via in-page GraphQL requests.
 *
 * A Puppeteer browser loads `menuUrl`, reads the dispensary id from
 * `window.reactEnv`, and then issues the persisted `FilteredProducts` query
 * from inside the page (with `credentials: 'include'`) page by page until an
 * empty page is returned, a GraphQL error is reported, or the page limit is hit.
 *
 * @param {string} menuUrl - Menu page URL that is loaded before any GraphQL request is made.
 * @param {object} [options]
 * @param {boolean|string} [options.headless='new'] - Forwarded to the Puppeteer launch options.
 * @param {number} [options.timeout=90000] - Navigation timeout (ms) for the menu page.
 * @param {number} [options.perPage=100] - `perPage` value sent with every query.
 * @param {boolean} [options.includeOutOfStock=true] - When true the filter sends `Status: null`,
 *   otherwise `Status: 'Active'`.
 * @returns {Promise<{products: object[], dispensaryId: string, totalProducts: number,
 *   activeCount: number, inactiveCount: number}>} Raw products plus counts split on `Status === 'Active'`.
 * @throws {Error} When no dispensary id can be read from the page, or when navigation or an
 *   in-page request (non-2xx response) fails.
 */
async function fetchAllDutchieProducts(menuUrl, options = {}) {
    const { headless = 'new', timeout = 90000, perPage = 100, includeOutOfStock = true, } = options;
    // Safety limit: highest 0-based page index that is still followed by another request.
    const MAX_PAGE_INDEX = 50;
    let browser;
    try {
        browser = await puppeteer_extra_1.default.launch({
            headless,
            args: [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-blink-features=AutomationControlled',
            ],
        });
        const page = await browser.newPage();
        // Stealth configuration
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
        await page.setViewport({ width: 1920, height: 1080 });
        await page.evaluateOnNewDocument(() => {
            Object.defineProperty(navigator, 'webdriver', { get: () => false });
            window.chrome = { runtime: {} };
        });
        // Navigate to menu page to establish session
        console.log('[DutchieGraphQL] Loading menu page to establish session...');
        await page.goto(menuUrl, {
            waitUntil: 'networkidle2',
            timeout,
        });
        // Get dispensary ID from page
        const dispensaryId = await page.evaluate(() => {
            const env = window.reactEnv;
            return env?.dispensaryId || env?.retailerId || '';
        });
        if (!dispensaryId) {
            throw new Error('Could not determine dispensaryId from page');
        }
        console.log(`[DutchieGraphQL] Dispensary ID: ${dispensaryId}`);
        // Fetch all products via in-page GraphQL requests
        const allProducts = [];
        let page_num = 0;
        let hasMore = true;
        while (hasMore) {
            console.log(`[DutchieGraphQL] Fetching page ${page_num} (perPage=${perPage})...`);
            // The callback runs in the browser context, so everything it needs is passed as arguments.
            const result = await page.evaluate(async (dispensaryId, page_num, perPage, includeOutOfStock, hash) => {
                const variables = {
                    includeEnterpriseSpecials: false,
                    productsFilter: {
                        dispensaryId,
                        pricingType: 'rec',
                        Status: includeOutOfStock ? null : 'Active', // null = include out-of-stock
                        types: [],
                        useCache: false, // Don't cache to get fresh data
                        isDefaultSort: true,
                        sortBy: 'popularSortIdx',
                        sortDirection: 1,
                        bypassOnlineThresholds: true,
                        isKioskMenu: false,
                        removeProductsBelowOptionThresholds: false,
                    },
                    page: page_num,
                    perPage,
                };
                const qs = new URLSearchParams({
                    operationName: 'FilteredProducts',
                    variables: JSON.stringify(variables),
                    extensions: JSON.stringify({
                        persistedQuery: { version: 1, sha256Hash: hash },
                    }),
                });
                const response = await fetch(`https://dutchie.com/graphql?${qs.toString()}`, {
                    method: 'GET',
                    headers: {
                        'content-type': 'application/json',
                        'apollographql-client-name': 'Marketplace (production)',
                    },
                    credentials: 'include', // Include cookies/session
                });
                if (!response.ok) {
                    throw new Error(`HTTP ${response.status}`);
                }
                return response.json();
            }, dispensaryId, page_num, perPage, includeOutOfStock, GRAPHQL_HASHES.FilteredProducts);
            // Guard against a null/empty JSON body, matching the optional chaining used below.
            if (result?.errors) {
                // Best effort: keep whatever was collected so far instead of failing the whole crawl.
                console.error('[DutchieGraphQL] GraphQL errors:', result.errors);
                break;
            }
            const products = result?.data?.filteredProducts?.products || [];
            console.log(`[DutchieGraphQL] Page ${page_num}: ${products.length} products`);
            if (products.length === 0) {
                hasMore = false;
            }
            else {
                allProducts.push(...products);
                page_num++;
                if (page_num > MAX_PAGE_INDEX) {
                    console.log('[DutchieGraphQL] Reached page limit, stopping');
                    hasMore = false;
                }
            }
        }
        // Count active vs inactive (everything that is not 'Active' counts as inactive)
        const activeCount = allProducts.filter((p) => p.Status === 'Active').length;
        const inactiveCount = allProducts.length - activeCount;
        console.log(`[DutchieGraphQL] Total: ${allProducts.length} products (${activeCount} active, ${inactiveCount} inactive)`);
        return {
            products: allProducts,
            dispensaryId,
            totalProducts: allProducts.length,
            activeCount,
            inactiveCount,
        };
    }
    finally {
        // Always release the browser, including when navigation or a request throws.
        if (browser) {
            await browser.close();
        }
    }
}
|
||||
/**
 * Upsert normalized products into the legacy `products` table in one transaction.
 *
 * Each product is inserted, or updated when a row with the same
 * (store_id, slug) already exists. `special_data`, `cannabinoids` and
 * `raw_data` are JSON-encoded (or sent as null when falsy). The statement
 * returns `was_inserted` (`xmax = 0`), which drives the inserted/updated counts.
 *
 * @param {object} pool - Object exposing `connect()` that resolves to a client with
 *   `query()` and `release()` (presumably a node-postgres Pool; confirm against callers).
 * @param {*} storeId - Value written to `store_id` for every row.
 * @param {object[]} products - Normalized products (snake_case fields matching the column names).
 * @returns {Promise<{inserted: number, updated: number}>}
 * @throws Rethrows the first query error after attempting a ROLLBACK; a failing ROLLBACK is
 *   logged and never replaces the original error.
 */
async function upsertProductsDirect(pool, storeId, products) {
    const client = await pool.connect();
    let inserted = 0;
    let updated = 0;
    try {
        await client.query('BEGIN');
        for (const product of products) {
            const result = await client.query(`
        INSERT INTO products (
          store_id, external_id, slug, name, enterprise_product_id,
          brand, brand_external_id, brand_logo_url,
          subcategory, strain_type, canonical_category,
          price, rec_price, med_price, rec_special_price, med_special_price,
          is_on_special, special_name, discount_percent, special_data,
          sku, inventory_quantity, inventory_available, is_below_threshold, status,
          thc_percentage, cbd_percentage, cannabinoids,
          weight_mg, net_weight_value, net_weight_unit, options, raw_options,
          image_url, additional_images,
          is_featured, medical_only, rec_only,
          source_created_at, source_updated_at,
          description, raw_data,
          dutchie_url, last_seen_at, updated_at
        )
        VALUES (
          $1, $2, $3, $4, $5,
          $6, $7, $8,
          $9, $10, $11,
          $12, $13, $14, $15, $16,
          $17, $18, $19, $20,
          $21, $22, $23, $24, $25,
          $26, $27, $28,
          $29, $30, $31, $32, $33,
          $34, $35,
          $36, $37, $38,
          $39, $40,
          $41, $42,
          '', NOW(), NOW()
        )
        ON CONFLICT (store_id, slug) DO UPDATE SET
          name = EXCLUDED.name,
          enterprise_product_id = EXCLUDED.enterprise_product_id,
          brand = EXCLUDED.brand,
          brand_external_id = EXCLUDED.brand_external_id,
          brand_logo_url = EXCLUDED.brand_logo_url,
          subcategory = EXCLUDED.subcategory,
          strain_type = EXCLUDED.strain_type,
          canonical_category = EXCLUDED.canonical_category,
          price = EXCLUDED.price,
          rec_price = EXCLUDED.rec_price,
          med_price = EXCLUDED.med_price,
          rec_special_price = EXCLUDED.rec_special_price,
          med_special_price = EXCLUDED.med_special_price,
          is_on_special = EXCLUDED.is_on_special,
          special_name = EXCLUDED.special_name,
          discount_percent = EXCLUDED.discount_percent,
          special_data = EXCLUDED.special_data,
          sku = EXCLUDED.sku,
          inventory_quantity = EXCLUDED.inventory_quantity,
          inventory_available = EXCLUDED.inventory_available,
          is_below_threshold = EXCLUDED.is_below_threshold,
          status = EXCLUDED.status,
          thc_percentage = EXCLUDED.thc_percentage,
          cbd_percentage = EXCLUDED.cbd_percentage,
          cannabinoids = EXCLUDED.cannabinoids,
          weight_mg = EXCLUDED.weight_mg,
          net_weight_value = EXCLUDED.net_weight_value,
          net_weight_unit = EXCLUDED.net_weight_unit,
          options = EXCLUDED.options,
          raw_options = EXCLUDED.raw_options,
          image_url = EXCLUDED.image_url,
          additional_images = EXCLUDED.additional_images,
          is_featured = EXCLUDED.is_featured,
          medical_only = EXCLUDED.medical_only,
          rec_only = EXCLUDED.rec_only,
          source_created_at = EXCLUDED.source_created_at,
          source_updated_at = EXCLUDED.source_updated_at,
          description = EXCLUDED.description,
          raw_data = EXCLUDED.raw_data,
          last_seen_at = NOW(),
          updated_at = NOW()
        RETURNING (xmax = 0) AS was_inserted
      `, [
                storeId,
                product.external_id,
                product.slug,
                product.name,
                product.enterprise_product_id,
                product.brand,
                product.brand_external_id,
                product.brand_logo_url,
                product.subcategory,
                product.strain_type,
                product.canonical_category,
                product.price,
                product.rec_price,
                product.med_price,
                product.rec_special_price,
                product.med_special_price,
                product.is_on_special,
                product.special_name,
                product.discount_percent,
                product.special_data ? JSON.stringify(product.special_data) : null,
                product.sku,
                product.inventory_quantity,
                product.inventory_available,
                product.is_below_threshold,
                product.status,
                product.thc_percentage,
                product.cbd_percentage,
                product.cannabinoids ? JSON.stringify(product.cannabinoids) : null,
                product.weight_mg,
                product.net_weight_value,
                product.net_weight_unit,
                product.options,
                product.raw_options,
                product.image_url,
                product.additional_images,
                product.is_featured,
                product.medical_only,
                product.rec_only,
                product.source_created_at,
                product.source_updated_at,
                product.description,
                product.raw_data ? JSON.stringify(product.raw_data) : null,
            ]);
            if (result.rows[0]?.was_inserted) {
                inserted++;
            }
            else {
                updated++;
            }
        }
        await client.query('COMMIT');
        return { inserted, updated };
    }
    catch (error) {
        // A failing ROLLBACK (e.g. broken connection) must not hide the error that caused it.
        try {
            await client.query('ROLLBACK');
        }
        catch (rollbackError) {
            console.error('[DutchieGraphQL] ROLLBACK failed:', rollbackError);
        }
        throw error;
    }
    finally {
        client.release();
    }
}
|
||||
/**
 * @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead.
 *
 * Formerly the main entry point that scraped all products (including
 * out-of-stock) and wrote them to the legacy products table. It is disabled:
 * every call rejects. The unreachable legacy implementation that used to
 * follow the `throw` has been removed.
 *
 * @param {*} pool - Unused; kept so the signature stays compatible with existing callers.
 * @param {*} storeId - Unused; kept for signature compatibility.
 * @param {string} menuUrl - Unused; kept for signature compatibility.
 * @returns {Promise<never>} Always rejects.
 * @throws {Error} Always, with a message pointing at the replacement pipeline.
 */
async function scrapeAllDutchieProducts(pool, storeId, menuUrl) {
    // DEPRECATED: Throw error to prevent accidental use
    throw new Error('DEPRECATED: scrapeAllDutchieProducts() is deprecated. ' +
        'Use src/dutchie-az/services/product-crawler.ts instead. ' +
        'This scraper writes to the legacy products table.');
}
|
||||
446
backend/dist/scrapers/dutchie-graphql.js
vendored
Normal file
446
backend/dist/scrapers/dutchie-graphql.js
vendored
Normal file
@@ -0,0 +1,446 @@
|
||||
"use strict";
|
||||
// ============================================================================
|
||||
// DEPRECATED: This scraper writes to the LEGACY products table.
|
||||
// DO NOT USE - All Dutchie crawling must use the new dutchie-az pipeline.
|
||||
//
|
||||
// New pipeline location: src/dutchie-az/services/product-crawler.ts
|
||||
// - Uses fetch-based GraphQL (no Puppeteer needed)
|
||||
// - Writes to isolated dutchie_az_* tables with snapshot model
|
||||
// - Tracks stockStatus, isPresentInFeed, missing_from_feed
|
||||
//
|
||||
// The normalizer functions in this file (normalizeDutchieProduct) may still
|
||||
// be imported for reference, but do NOT call scrapeDutchieMenu() or upsertProducts().
|
||||
// ============================================================================
|
||||
var __importDefault = (this && this.__importDefault) || function (mod) {
|
||||
return (mod && mod.__esModule) ? mod : { "default": mod };
|
||||
};
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.normalizeDutchieProduct = normalizeDutchieProduct;
|
||||
exports.fetchDutchieMenuViaPuppeteer = fetchDutchieMenuViaPuppeteer;
|
||||
exports.upsertProducts = upsertProducts;
|
||||
exports.scrapeDutchieMenu = scrapeDutchieMenu;
|
||||
/**
|
||||
* @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead.
|
||||
* This scraper writes to the legacy products table, not the new dutchie_az tables.
|
||||
*
|
||||
* Fetches product data via Puppeteer interception of Dutchie's GraphQL API.
|
||||
* This bypasses Cloudflare by using a real browser to load the menu page.
|
||||
*
|
||||
* GraphQL Operations:
|
||||
* - FilteredProducts: Returns paginated product list with full details
|
||||
* - GetAddressBasedDispensaryData: Resolves dispensary cName to dispensaryId
|
||||
*/
|
||||
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
|
||||
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
|
||||
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
|
||||
// =====================================================
|
||||
// NORMALIZER: Dutchie GraphQL → DB Schema
|
||||
// =====================================================
|
||||
/**
 * Convert a source timestamp into a Date, or undefined when it is absent or unparseable.
 *
 * Accepts epoch milliseconds (number or all-digit string such as "1729044510543")
 * as well as date strings understood by the Date constructor.
 *
 * @param {*} value - Raw timestamp taken from the product payload.
 * @returns {Date|undefined} A valid Date, never an Invalid Date.
 */
function parseSourceTimestamp(value) {
    if (!value) {
        return undefined;
    }
    const isEpochString = typeof value === 'string' && /^\d+$/.test(value.trim());
    const parsed = new Date(isEpochString ? Number(value) : value);
    return Number.isNaN(parsed.getTime()) ? undefined : parsed;
}
/**
 * Map a raw Dutchie GraphQL product onto the flat, snake_case shape used for the DB upsert.
 *
 * Only the first entry of each price array, of the potency ranges and of the
 * sale specials is used. Inventory is summed over `POSMetaData.children`; a sum
 * of 0 is reported as undefined. Boolean flags are true only when the source
 * value is strictly `true`.
 *
 * @param {object} product - Raw product object from a `filteredProducts` response.
 * @returns {object} Normalized product; `raw_data` references the original `product` object.
 */
function normalizeDutchieProduct(product) {
    // Extract first special if exists
    const saleSpecial = product.specialData?.saleSpecials?.[0];
    // Calculate inventory from POSMetaData children
    const children = product.POSMetaData?.children || [];
    const totalQuantity = children.reduce((sum, c) => sum + (c.quantity || 0), 0);
    const availableQuantity = children.reduce((sum, c) => sum + (c.quantityAvailable || 0), 0);
    // Parse timestamps; both fields go through the same validation so an
    // unparseable value becomes undefined instead of an Invalid Date.
    const sourceCreatedAt = parseSourceTimestamp(product.createdAt);
    const sourceUpdatedAt = parseSourceTimestamp(product.updatedAt);
    return {
        // Identity
        external_id: product._id || product.id,
        slug: product.cName,
        name: product.Name,
        enterprise_product_id: product.enterpriseProductId,
        // Brand
        brand: product.brandName || product.brand?.name,
        brand_external_id: product.brandId || product.brand?.id,
        brand_logo_url: product.brandLogo || product.brand?.imageUrl,
        // Category
        subcategory: product.subcategory,
        strain_type: product.strainType,
        canonical_category: product.POSMetaData?.canonicalCategory,
        // Pricing
        price: product.Prices?.[0],
        rec_price: product.recPrices?.[0],
        med_price: product.medicalPrices?.[0],
        rec_special_price: product.recSpecialPrices?.[0],
        med_special_price: product.medicalSpecialPrices?.[0],
        // Specials
        is_on_special: product.special === true,
        special_name: saleSpecial?.specialName,
        // The discount is only recorded when the special is flagged as a percent discount.
        discount_percent: saleSpecial?.percentDiscount ? saleSpecial.discount : undefined,
        special_data: product.specialData,
        // Inventory
        sku: product.POSMetaData?.canonicalSKU,
        inventory_quantity: totalQuantity || undefined,
        inventory_available: availableQuantity || undefined,
        is_below_threshold: product.isBelowThreshold === true,
        status: product.Status,
        // Potency
        thc_percentage: product.THCContent?.range?.[0],
        cbd_percentage: product.CBDContent?.range?.[0],
        cannabinoids: product.cannabinoidsV2,
        // Weight/Options
        weight_mg: product.weight,
        net_weight_value: product.measurements?.netWeight?.values?.[0],
        net_weight_unit: product.measurements?.netWeight?.unit,
        options: product.Options,
        raw_options: product.rawOptions,
        // Images
        image_url: product.Image,
        additional_images: product.images?.length ? product.images : undefined,
        // Flags
        is_featured: product.featured === true,
        medical_only: product.medicalOnly === true,
        rec_only: product.recOnly === true,
        // Timestamps
        source_created_at: sourceCreatedAt,
        source_updated_at: sourceUpdatedAt,
        // Description
        description: typeof product.description === 'string' ? product.description : undefined,
        // Raw
        raw_data: product,
    };
}
|
||||
/**
 * Collect products of a Dutchie menu by intercepting the GraphQL responses the page itself triggers.
 *
 * A Puppeteer browser loads `menuUrl`, scrolls to the bottom repeatedly, then
 * visits every link matching `a[href*="/products/"]` and scrolls again. Every
 * JSON response whose URL contains "graphql" is inspected for
 * `filteredProducts.products` (collected, de-duplicated by product id) and for
 * the dispensary id.
 *
 * @param {string} menuUrl - Menu page URL to load first.
 * @param {object} [options]
 * @param {boolean|string} [options.headless='new'] - Forwarded to the Puppeteer launch options.
 * @param {number} [options.timeout=90000] - Navigation timeout (ms) for the menu page.
 * @param {number} [options.maxScrolls=30] - Maximum scroll attempts on the main menu page.
 * @returns {Promise<{products: object[], dispensaryId: string, menuUrl: string}>}
 * @throws {Error} When launching the browser or loading the menu page fails; errors on
 *   individual category pages are logged and skipped.
 */
async function fetchDutchieMenuViaPuppeteer(menuUrl, options = {}) {
    const { headless = 'new', timeout = 90000, maxScrolls = 30, // Increased for full menu capture
     } = options;
    let browser;
    const capturedProducts = [];
    let dispensaryId = '';
    try {
        browser = await puppeteer_extra_1.default.launch({
            headless,
            args: [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-blink-features=AutomationControlled',
            ],
        });
        const page = await browser.newPage();
        // Stealth configuration
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
        await page.setViewport({ width: 1920, height: 1080 });
        await page.evaluateOnNewDocument(() => {
            Object.defineProperty(navigator, 'webdriver', { get: () => false });
            window.chrome = { runtime: {} };
        });
        // Track seen product IDs to avoid duplicates
        const seenIds = new Set();
        // Intercept GraphQL responses
        page.on('response', async (response) => {
            const url = response.url();
            if (!url.includes('graphql')) {
                return;
            }
            try {
                const contentType = response.headers()['content-type'] || '';
                if (!contentType.includes('application/json')) {
                    return;
                }
                const data = await response.json();
                // Capture dispensary ID
                const capturedId = data?.data?.getAddressBasedDispensaryData?.dispensaryData?.dispensaryId;
                if (capturedId) {
                    dispensaryId = capturedId;
                }
                // Capture products from FilteredProducts
                const products = data?.data?.filteredProducts?.products;
                if (products) {
                    for (const product of products) {
                        // Same id fallback as normalizeDutchieProduct, so products that only
                        // carry `id` are not all collapsed onto a single `undefined` key.
                        const productId = product._id || product.id;
                        if (!seenIds.has(productId)) {
                            seenIds.add(productId);
                            capturedProducts.push(product);
                        }
                    }
                }
            }
            catch {
                // Ignore parse errors (deliberate best effort: unreadable responses are skipped)
            }
        });
        // Navigate to menu
        console.log('[DutchieGraphQL] Loading menu page...');
        await page.goto(menuUrl, {
            waitUntil: 'networkidle2',
            timeout,
        });
        // Get dispensary ID from window.reactEnv if not captured
        if (!dispensaryId) {
            dispensaryId = await page.evaluate(() => {
                const env = window.reactEnv;
                return env?.dispensaryId || env?.retailerId || '';
            });
        }
        // Scroll to the bottom until the attempt budget is used up or three
        // consecutive scrolls bring no new product ids.
        const scrollToLoadAll = async (maxScrollAttempts = maxScrolls) => {
            let scrollCount = 0;
            let previousCount = 0;
            let noNewProductsCount = 0;
            while (scrollCount < maxScrollAttempts && noNewProductsCount < 3) {
                await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
                // Give the page time to request and deliver the next batch.
                await new Promise((r) => setTimeout(r, 1500));
                const currentCount = seenIds.size;
                if (currentCount === previousCount) {
                    noNewProductsCount++;
                }
                else {
                    noNewProductsCount = 0;
                }
                previousCount = currentCount;
                scrollCount++;
            }
        };
        // First, scroll through the main page (all products)
        console.log('[DutchieGraphQL] Scrolling main page...');
        await scrollToLoadAll();
        console.log(`[DutchieGraphQL] After main page: ${seenIds.size} products`);
        // Get category links from the navigation (unique, in document order)
        const categoryLinks = await page.evaluate(() => {
            const navLinks = document.querySelectorAll('a[href*="/products/"]');
            const hrefs = Array.from(navLinks, (link) => link.href).filter(Boolean);
            return [...new Set(hrefs)];
        });
        console.log(`[DutchieGraphQL] Found ${categoryLinks.length} category links`);
        // Visit each category page to capture all products
        for (const categoryUrl of categoryLinks) {
            try {
                console.log(`[DutchieGraphQL] Visiting category: ${categoryUrl.split('/').pop()}`);
                await page.goto(categoryUrl, {
                    waitUntil: 'networkidle2',
                    timeout: 30000,
                });
                await scrollToLoadAll(15); // Fewer scrolls per category
                console.log(`[DutchieGraphQL] Total products: ${seenIds.size}`);
            }
            catch (e) {
                // A failing category must not abort the remaining ones.
                console.log(`[DutchieGraphQL] Category error: ${e.message}`);
            }
        }
        // Wait for any final responses
        await new Promise((r) => setTimeout(r, 2000));
        return {
            products: capturedProducts,
            dispensaryId,
            menuUrl,
        };
    }
    finally {
        // Always release the browser, including when navigation throws.
        if (browser) {
            await browser.close();
        }
    }
}
|
||||
// =====================================================
|
||||
// DATABASE OPERATIONS
|
||||
// =====================================================
|
||||
/**
 * Upsert normalized products into the legacy `products` table in one transaction.
 *
 * Each product is inserted, or updated when a row with the same
 * (store_id, slug) already exists. `special_data`, `cannabinoids` and
 * `raw_data` are JSON-encoded (or sent as null when falsy). The statement
 * returns `was_inserted` (`xmax = 0`), which drives the inserted/updated counts.
 *
 * @param {object} pool - Object exposing `connect()` that resolves to a client with
 *   `query()` and `release()` (presumably a node-postgres Pool; confirm against callers).
 * @param {*} storeId - Value written to `store_id` for every row.
 * @param {object[]} products - Normalized products (snake_case fields matching the column names).
 * @returns {Promise<{inserted: number, updated: number}>}
 * @throws Rethrows the first query error after attempting a ROLLBACK; a failing ROLLBACK is
 *   logged and never replaces the original error.
 */
async function upsertProducts(pool, storeId, products) {
    const client = await pool.connect();
    let inserted = 0;
    let updated = 0;
    try {
        await client.query('BEGIN');
        for (const product of products) {
            // Upsert product
            const result = await client.query(`
        INSERT INTO products (
          store_id, external_id, slug, name, enterprise_product_id,
          brand, brand_external_id, brand_logo_url,
          subcategory, strain_type, canonical_category,
          price, rec_price, med_price, rec_special_price, med_special_price,
          is_on_special, special_name, discount_percent, special_data,
          sku, inventory_quantity, inventory_available, is_below_threshold, status,
          thc_percentage, cbd_percentage, cannabinoids,
          weight_mg, net_weight_value, net_weight_unit, options, raw_options,
          image_url, additional_images,
          is_featured, medical_only, rec_only,
          source_created_at, source_updated_at,
          description, raw_data,
          dutchie_url, last_seen_at, updated_at
        )
        VALUES (
          $1, $2, $3, $4, $5,
          $6, $7, $8,
          $9, $10, $11,
          $12, $13, $14, $15, $16,
          $17, $18, $19, $20,
          $21, $22, $23, $24, $25,
          $26, $27, $28,
          $29, $30, $31, $32, $33,
          $34, $35,
          $36, $37, $38,
          $39, $40,
          $41, $42,
          '', NOW(), NOW()
        )
        ON CONFLICT (store_id, slug) DO UPDATE SET
          name = EXCLUDED.name,
          enterprise_product_id = EXCLUDED.enterprise_product_id,
          brand = EXCLUDED.brand,
          brand_external_id = EXCLUDED.brand_external_id,
          brand_logo_url = EXCLUDED.brand_logo_url,
          subcategory = EXCLUDED.subcategory,
          strain_type = EXCLUDED.strain_type,
          canonical_category = EXCLUDED.canonical_category,
          price = EXCLUDED.price,
          rec_price = EXCLUDED.rec_price,
          med_price = EXCLUDED.med_price,
          rec_special_price = EXCLUDED.rec_special_price,
          med_special_price = EXCLUDED.med_special_price,
          is_on_special = EXCLUDED.is_on_special,
          special_name = EXCLUDED.special_name,
          discount_percent = EXCLUDED.discount_percent,
          special_data = EXCLUDED.special_data,
          sku = EXCLUDED.sku,
          inventory_quantity = EXCLUDED.inventory_quantity,
          inventory_available = EXCLUDED.inventory_available,
          is_below_threshold = EXCLUDED.is_below_threshold,
          status = EXCLUDED.status,
          thc_percentage = EXCLUDED.thc_percentage,
          cbd_percentage = EXCLUDED.cbd_percentage,
          cannabinoids = EXCLUDED.cannabinoids,
          weight_mg = EXCLUDED.weight_mg,
          net_weight_value = EXCLUDED.net_weight_value,
          net_weight_unit = EXCLUDED.net_weight_unit,
          options = EXCLUDED.options,
          raw_options = EXCLUDED.raw_options,
          image_url = EXCLUDED.image_url,
          additional_images = EXCLUDED.additional_images,
          is_featured = EXCLUDED.is_featured,
          medical_only = EXCLUDED.medical_only,
          rec_only = EXCLUDED.rec_only,
          source_created_at = EXCLUDED.source_created_at,
          source_updated_at = EXCLUDED.source_updated_at,
          description = EXCLUDED.description,
          raw_data = EXCLUDED.raw_data,
          last_seen_at = NOW(),
          updated_at = NOW()
        RETURNING (xmax = 0) AS was_inserted
      `, [
                storeId,
                product.external_id,
                product.slug,
                product.name,
                product.enterprise_product_id,
                product.brand,
                product.brand_external_id,
                product.brand_logo_url,
                product.subcategory,
                product.strain_type,
                product.canonical_category,
                product.price,
                product.rec_price,
                product.med_price,
                product.rec_special_price,
                product.med_special_price,
                product.is_on_special,
                product.special_name,
                product.discount_percent,
                product.special_data ? JSON.stringify(product.special_data) : null,
                product.sku,
                product.inventory_quantity,
                product.inventory_available,
                product.is_below_threshold,
                product.status,
                product.thc_percentage,
                product.cbd_percentage,
                product.cannabinoids ? JSON.stringify(product.cannabinoids) : null,
                product.weight_mg,
                product.net_weight_value,
                product.net_weight_unit,
                product.options,
                product.raw_options,
                product.image_url,
                product.additional_images,
                product.is_featured,
                product.medical_only,
                product.rec_only,
                product.source_created_at,
                product.source_updated_at,
                product.description,
                product.raw_data ? JSON.stringify(product.raw_data) : null,
            ]);
            if (result.rows[0]?.was_inserted) {
                inserted++;
            }
            else {
                updated++;
            }
        }
        await client.query('COMMIT');
        return { inserted, updated };
    }
    catch (error) {
        // A failing ROLLBACK (e.g. broken connection) must not hide the error that caused it.
        try {
            await client.query('ROLLBACK');
        }
        catch (rollbackError) {
            console.error('[DutchieGraphQL] ROLLBACK failed:', rollbackError);
        }
        throw error;
    }
    finally {
        client.release();
    }
}
|
||||
// =====================================================
|
||||
// MAIN ENTRY POINT
|
||||
// =====================================================
|
||||
/**
 * @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead.
 *
 * This function is disabled: every call rejects. The unreachable legacy
 * implementation that used to follow the `throw` has been removed.
 *
 * @param {*} pool - Unused; kept so the signature stays compatible with existing callers.
 * @param {*} storeId - Unused; kept for signature compatibility.
 * @param {string} menuUrl - Unused; kept for signature compatibility.
 * @returns {Promise<never>} Always rejects.
 * @throws {Error} Always, with a message pointing at the replacement pipeline.
 */
async function scrapeDutchieMenu(pool, storeId, menuUrl) {
    // DEPRECATED: Throw error to prevent accidental use
    throw new Error('DEPRECATED: scrapeDutchieMenu() is deprecated. ' +
        'Use src/dutchie-az/services/product-crawler.ts instead. ' +
        'This scraper writes to the legacy products table.');
}
|
||||
85
backend/dist/scrapers/templates/dutchie.js
vendored
Normal file
85
backend/dist/scrapers/templates/dutchie.js
vendored
Normal file
@@ -0,0 +1,85 @@
|
||||
"use strict";
|
||||
// ============================================================================
|
||||
// DEPRECATED: Dutchie now crawled via GraphQL only (see dutchie-az pipeline)
|
||||
// DO NOT USE - This HTML scraper is unreliable and targets the legacy products table.
|
||||
// All Dutchie crawling must go through: src/dutchie-az/services/product-crawler.ts
|
||||
// ============================================================================
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.dutchieTemplate = void 0;
|
||||
exports.getTemplateForUrl = getTemplateForUrl;
|
||||
const logger_1 = require("../../services/logger");
|
||||
/**
|
||||
* @deprecated DEPRECATED - Dutchie HTML scraping is no longer supported.
|
||||
* Use the dutchie-az GraphQL pipeline instead: src/dutchie-az/services/product-crawler.ts
|
||||
* This template relied on unstable DOM selectors and wrote to legacy tables.
|
||||
*/
|
||||
exports.dutchieTemplate = {
|
||||
name: 'Dutchie Marketplace',
|
||||
urlPattern: /dutchie\.com\/dispensary\//,
|
||||
buildCategoryUrl: (baseUrl, category) => {
|
||||
// Remove trailing slash
|
||||
const base = baseUrl.replace(/\/$/, '');
|
||||
// Convert category name to URL-friendly slug
|
||||
const categorySlug = category.toLowerCase().replace(/\s+/g, '-');
|
||||
return `${base}/products/${categorySlug}`;
|
||||
},
|
||||
extractProducts: async (page) => {
|
||||
const products = [];
|
||||
try {
|
||||
// Wait for product cards to load
|
||||
await page.waitForSelector('a[data-testid="card-link"]', { timeout: 10000 }).catch(() => {
|
||||
logger_1.logger.warn('scraper', 'No product cards found with data-testid="card-link"');
|
||||
});
|
||||
// Get all product card links
|
||||
const productCards = await page.locator('a[href*="/product/"][data-testid="card-link"]').all();
|
||||
logger_1.logger.info('scraper', `Found ${productCards.length} Dutchie product cards`);
|
||||
for (const card of productCards) {
|
||||
try {
|
||||
// Extract all data at once using evaluate for speed
|
||||
const cardData = await card.evaluate((el) => {
|
||||
const href = el.getAttribute('href') || '';
|
||||
const img = el.querySelector('img');
|
||||
const imageUrl = img ? img.getAttribute('src') || '' : '';
|
||||
// Get all text nodes in order
|
||||
const textElements = Array.from(el.querySelectorAll('*'))
|
||||
.filter(el => el.textContent && el.children.length === 0)
|
||||
.map(el => (el.textContent || '').trim())
|
||||
.filter(text => text.length > 0);
|
||||
const name = textElements[0] || '';
|
||||
const brand = textElements[1] || '';
|
||||
// Look for price
|
||||
const priceMatch = el.textContent?.match(/\$(\d+(?:\.\d{2})?)/);
|
||||
const price = priceMatch ? parseFloat(priceMatch[1]) : undefined;
|
||||
return { href, imageUrl, name, brand, price };
|
||||
});
|
||||
if (cardData.name && cardData.href) {
|
||||
products.push({
|
||||
name: cardData.name,
|
||||
brand: cardData.brand || undefined,
|
||||
product_url: cardData.href.startsWith('http') ? cardData.href : `https://dutchie.com${cardData.href}`,
|
||||
image_url: cardData.imageUrl || undefined,
|
||||
price: cardData.price,
|
||||
in_stock: true,
|
||||
});
|
||||
}
|
||||
}
|
||||
catch (err) {
|
||||
logger_1.logger.warn('scraper', `Error extracting Dutchie product card: ${err}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (err) {
|
||||
logger_1.logger.error('scraper', `Error in Dutchie product extraction: ${err}`);
|
||||
}
|
||||
return products;
|
||||
},
|
||||
};
|
||||
/**
 * Get the appropriate scraper template based on URL.
 *
 * @param {string} url - URL to test against the known template patterns.
 * @returns {object|null} `exports.dutchieTemplate` when its `urlPattern`
 *   matches the URL, otherwise `null`.
 */
function getTemplateForUrl(url) {
    const template = exports.dutchieTemplate;
    return template.urlPattern.test(url) ? template : null;
}
|
||||
Reference in New Issue
Block a user