Files
cannaiq/backend/src/scrapers/dutchie-graphql-direct.ts

440 lines
14 KiB
TypeScript

// ============================================================================
// DEPRECATED: This scraper writes to the LEGACY products table.
// DO NOT USE - All Dutchie crawling must use the new dutchie-az pipeline.
//
// New pipeline location: src/dutchie-az/services/product-crawler.ts
// - Uses fetch-based GraphQL (no Puppeteer needed)
// - Writes to isolated dutchie_az_* tables with snapshot model
// - Tracks stockStatus, isPresentInFeed, missing_from_feed
// ============================================================================
/**
* @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead.
* This scraper writes to the legacy products table, not the new dutchie_az tables.
*
* Makes direct GraphQL requests from within the browser context to:
* 1. Bypass Cloudflare (using browser session)
* 2. Fetch ALL products including out-of-stock (Status: null)
* 3. Paginate through complete menu
*/
import puppeteer from 'puppeteer-extra';
import type { Browser, Page } from 'puppeteer';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import { Pool } from 'pg';
import { DutchieProduct, NormalizedProduct, normalizeDutchieProduct } from './dutchie-graphql';
puppeteer.use(StealthPlugin());
// GraphQL persisted query hashes.
// Keys are GraphQL operation names; each value is the hash sent as
// `extensions.persistedQuery.sha256Hash` (version 1). Requests built from
// these carry only the operation name, variables and hash - no query text.
// NOTE(review): presumably these must match the hashes Dutchie's own web
// client currently uses - confirm if requests start being rejected.
const GRAPHQL_HASHES = {
// Used by fetchAllDutchieProducts for the paginated product listing.
FilteredProducts: 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0',
// Not referenced in the visible part of this file.
GetAddressBasedDispensaryData: '13461f73abf7268770dfd05fe7e10c523084b2bb916a929c08efe3d87531977b',
};
/** Result of a full menu fetch performed by fetchAllDutchieProducts. */
interface FetchResult {
/** Raw products accumulated across every fetched page, in fetch order. */
products: DutchieProduct[];
/** Dispensary ID read from the menu page's `window.reactEnv`. */
dispensaryId: string;
/** Length of `products`. */
totalProducts: number;
/** Number of products whose `Status` is exactly 'Active'. */
activeCount: number;
/** Number of products whose `Status` is anything other than 'Active'. */
inactiveCount: number;
}
/**
 * Fetch all products of a Dutchie menu via in-page GraphQL requests.
 *
 * A browser is launched and navigated to `menuUrl` so that the GraphQL calls
 * run inside the page and reuse its session (`credentials: 'include'`). The
 * `FilteredProducts` persisted query is then requested page by page until a
 * page comes back empty, a GraphQL error is reported, or the safety page
 * limit is hit.
 *
 * Note: when the GraphQL response contains `errors`, the errors are logged
 * and pagination stops; products collected so far are still returned.
 *
 * @param menuUrl - URL of the Dutchie menu page used to establish the session
 * @param options - optional settings:
 *   - `headless`: passed to `puppeteer.launch` (default `'new'`)
 *   - `timeout`: navigation timeout in ms for loading `menuUrl` (default 90000)
 *   - `perPage`: page size sent to the GraphQL query (default 100)
 *   - `includeOutOfStock`: when true (default) the `Status` filter is `null`
 *     so out-of-stock items are included; otherwise only `'Active'` items
 * @returns the collected products with the dispensary ID and active/inactive counts
 * @throws Error if the dispensary ID cannot be read from the page, or if an
 *   in-page request returns a non-OK HTTP status
 */
export async function fetchAllDutchieProducts(
  menuUrl: string,
  options: {
    headless?: boolean | 'new';
    timeout?: number;
    perPage?: number;
    includeOutOfStock?: boolean;
  } = {}
): Promise<FetchResult> {
  const {
    headless = 'new',
    timeout = 90000,
    perPage = 100,
    includeOutOfStock = true,
  } = options;

  // Safety limit: highest zero-based page index that will ever be requested.
  const maxPageIndex = 50;

  let browser: Browser | undefined;
  try {
    browser = await puppeteer.launch({
      headless,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
      ],
    });
    const page = await browser.newPage();

    // Stealth configuration
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    );
    await page.setViewport({ width: 1920, height: 1080 });
    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, 'webdriver', { get: () => false });
      (window as any).chrome = { runtime: {} };
    });

    // Navigate to menu page to establish session
    console.log('[DutchieGraphQL] Loading menu page to establish session...');
    await page.goto(menuUrl, {
      waitUntil: 'networkidle2',
      timeout,
    });

    // Get dispensary ID from page
    const dispensaryId: string = await page.evaluate(() => {
      const env = (window as any).reactEnv;
      return env?.dispensaryId || env?.retailerId || '';
    });
    if (!dispensaryId) {
      throw new Error('Could not determine dispensaryId from page');
    }
    console.log(`[DutchieGraphQL] Dispensary ID: ${dispensaryId}`);

    // Fetch all products via in-page GraphQL requests
    const allProducts: DutchieProduct[] = [];
    for (let pageIndex = 0; ; pageIndex++) {
      console.log(`[DutchieGraphQL] Fetching page ${pageIndex} (perPage=${perPage})...`);

      // The callback is executed inside the browser page, so it must be
      // self-contained: everything it needs is passed in as an argument.
      const result = await page.evaluate(
        async (
          dispensaryId: string,
          pageIndex: number,
          perPage: number,
          includeOutOfStock: boolean,
          hash: string
        ) => {
          const variables = {
            includeEnterpriseSpecials: false,
            productsFilter: {
              dispensaryId,
              pricingType: 'rec',
              Status: includeOutOfStock ? null : 'Active', // null = include out-of-stock
              types: [],
              useCache: false, // Don't cache to get fresh data
              isDefaultSort: true,
              sortBy: 'popularSortIdx',
              sortDirection: 1,
              bypassOnlineThresholds: true,
              isKioskMenu: false,
              removeProductsBelowOptionThresholds: false,
            },
            page: pageIndex,
            perPage,
          };
          const qs = new URLSearchParams({
            operationName: 'FilteredProducts',
            variables: JSON.stringify(variables),
            extensions: JSON.stringify({
              persistedQuery: { version: 1, sha256Hash: hash },
            }),
          });
          const response = await fetch(`https://dutchie.com/graphql?${qs.toString()}`, {
            method: 'GET',
            headers: {
              'content-type': 'application/json',
              'apollographql-client-name': 'Marketplace (production)',
            },
            credentials: 'include', // Include cookies/session
          });
          if (!response.ok) {
            throw new Error(`HTTP ${response.status}`);
          }
          return response.json();
        },
        dispensaryId,
        pageIndex,
        perPage,
        includeOutOfStock,
        GRAPHQL_HASHES.FilteredProducts
      );

      // `result` is whatever JSON the endpoint returned; guard against a
      // null body before looking at its fields.
      if (result?.errors) {
        console.error('[DutchieGraphQL] GraphQL errors:', result.errors);
        break;
      }

      // Treat a missing or malformed products payload as an empty page.
      const rawProducts: unknown = result?.data?.filteredProducts?.products;
      const products: DutchieProduct[] = Array.isArray(rawProducts) ? rawProducts : [];
      console.log(`[DutchieGraphQL] Page ${pageIndex}: ${products.length} products`);

      if (products.length === 0) {
        break;
      }
      allProducts.push(...products);

      // Safety limit
      if (pageIndex >= maxPageIndex) {
        console.log('[DutchieGraphQL] Reached page limit, stopping');
        break;
      }
    }

    // Count active vs inactive (everything that is not 'Active' is inactive)
    const activeCount = allProducts.filter((p) => p.Status === 'Active').length;
    const inactiveCount = allProducts.length - activeCount;
    console.log(`[DutchieGraphQL] Total: ${allProducts.length} products (${activeCount} active, ${inactiveCount} inactive)`);

    return {
      products: allProducts,
      dispensaryId,
      totalProducts: allProducts.length,
      activeCount,
      inactiveCount,
    };
  } finally {
    // Always shut the browser down, including on errors.
    if (browser) {
      await browser.close();
    }
  }
}
/**
 * Upsert products to database (legacy `products` table).
 *
 * All rows are written inside one transaction on a single pooled client, so
 * either every product is persisted or none is. Rows are matched on
 * `(store_id, slug)`; on conflict every column except `store_id`, `slug`,
 * `external_id` and `dutchie_url` is overwritten and `last_seen_at` /
 * `updated_at` are refreshed. `RETURNING (xmax = 0) AS was_inserted` is used
 * to tell fresh inserts from conflict-updates.
 *
 * @param pool - pg pool the client is taken from
 * @param storeId - value written to `store_id` for every row
 * @param products - normalized products to write
 * @returns how many rows were inserted and how many were updated
 * @throws the error of the statement that failed. A failure of the follow-up
 *   ROLLBACK is only logged so it cannot mask the original error.
 */
export async function upsertProductsDirect(
  pool: Pool,
  storeId: number,
  products: NormalizedProduct[]
): Promise<{ inserted: number; updated: number }> {
  // Statement text is loop-invariant, so build it once.
  const upsertSql = `
    INSERT INTO products (
      store_id, external_id, slug, name, enterprise_product_id,
      brand, brand_external_id, brand_logo_url,
      subcategory, strain_type, canonical_category,
      price, rec_price, med_price, rec_special_price, med_special_price,
      is_on_special, special_name, discount_percent, special_data,
      sku, inventory_quantity, inventory_available, is_below_threshold, status,
      thc_percentage, cbd_percentage, cannabinoids,
      weight_mg, net_weight_value, net_weight_unit, options, raw_options,
      image_url, additional_images,
      is_featured, medical_only, rec_only,
      source_created_at, source_updated_at,
      description, raw_data,
      dutchie_url, last_seen_at, updated_at
    )
    VALUES (
      $1, $2, $3, $4, $5,
      $6, $7, $8,
      $9, $10, $11,
      $12, $13, $14, $15, $16,
      $17, $18, $19, $20,
      $21, $22, $23, $24, $25,
      $26, $27, $28,
      $29, $30, $31, $32, $33,
      $34, $35,
      $36, $37, $38,
      $39, $40,
      $41, $42,
      '', NOW(), NOW()
    )
    ON CONFLICT (store_id, slug) DO UPDATE SET
      name = EXCLUDED.name,
      enterprise_product_id = EXCLUDED.enterprise_product_id,
      brand = EXCLUDED.brand,
      brand_external_id = EXCLUDED.brand_external_id,
      brand_logo_url = EXCLUDED.brand_logo_url,
      subcategory = EXCLUDED.subcategory,
      strain_type = EXCLUDED.strain_type,
      canonical_category = EXCLUDED.canonical_category,
      price = EXCLUDED.price,
      rec_price = EXCLUDED.rec_price,
      med_price = EXCLUDED.med_price,
      rec_special_price = EXCLUDED.rec_special_price,
      med_special_price = EXCLUDED.med_special_price,
      is_on_special = EXCLUDED.is_on_special,
      special_name = EXCLUDED.special_name,
      discount_percent = EXCLUDED.discount_percent,
      special_data = EXCLUDED.special_data,
      sku = EXCLUDED.sku,
      inventory_quantity = EXCLUDED.inventory_quantity,
      inventory_available = EXCLUDED.inventory_available,
      is_below_threshold = EXCLUDED.is_below_threshold,
      status = EXCLUDED.status,
      thc_percentage = EXCLUDED.thc_percentage,
      cbd_percentage = EXCLUDED.cbd_percentage,
      cannabinoids = EXCLUDED.cannabinoids,
      weight_mg = EXCLUDED.weight_mg,
      net_weight_value = EXCLUDED.net_weight_value,
      net_weight_unit = EXCLUDED.net_weight_unit,
      options = EXCLUDED.options,
      raw_options = EXCLUDED.raw_options,
      image_url = EXCLUDED.image_url,
      additional_images = EXCLUDED.additional_images,
      is_featured = EXCLUDED.is_featured,
      medical_only = EXCLUDED.medical_only,
      rec_only = EXCLUDED.rec_only,
      source_created_at = EXCLUDED.source_created_at,
      source_updated_at = EXCLUDED.source_updated_at,
      description = EXCLUDED.description,
      raw_data = EXCLUDED.raw_data,
      last_seen_at = NOW(),
      updated_at = NOW()
    RETURNING (xmax = 0) AS was_inserted
  `;

  const client = await pool.connect();
  let inserted = 0;
  let updated = 0;
  // Set when ROLLBACK itself fails: the connection is then in an unknown
  // transaction state and must not be handed back to the pool for reuse.
  let destroyClient = false;

  try {
    await client.query('BEGIN');

    for (const product of products) {
      // Order must match the $1..$42 placeholders above.
      const result = await client.query(upsertSql, [
        storeId,
        product.external_id,
        product.slug,
        product.name,
        product.enterprise_product_id,
        product.brand,
        product.brand_external_id,
        product.brand_logo_url,
        product.subcategory,
        product.strain_type,
        product.canonical_category,
        product.price,
        product.rec_price,
        product.med_price,
        product.rec_special_price,
        product.med_special_price,
        product.is_on_special,
        product.special_name,
        product.discount_percent,
        product.special_data ? JSON.stringify(product.special_data) : null,
        product.sku,
        product.inventory_quantity,
        product.inventory_available,
        product.is_below_threshold,
        product.status,
        product.thc_percentage,
        product.cbd_percentage,
        product.cannabinoids ? JSON.stringify(product.cannabinoids) : null,
        product.weight_mg,
        product.net_weight_value,
        product.net_weight_unit,
        product.options,
        product.raw_options,
        product.image_url,
        product.additional_images,
        product.is_featured,
        product.medical_only,
        product.rec_only,
        product.source_created_at,
        product.source_updated_at,
        product.description,
        product.raw_data ? JSON.stringify(product.raw_data) : null,
      ]);

      if (result.rows[0]?.was_inserted) {
        inserted++;
      } else {
        updated++;
      }
    }

    await client.query('COMMIT');
    return { inserted, updated };
  } catch (error) {
    try {
      await client.query('ROLLBACK');
    } catch (rollbackError) {
      // Keep the original failure as the error the caller sees.
      destroyClient = true;
      console.error('[DutchieGraphQL] ROLLBACK failed:', rollbackError);
    }
    throw error;
  } finally {
    if (destroyClient) {
      // A truthy argument tells the pool to discard this client.
      client.release(true);
    } else {
      client.release();
    }
  }
}
/**
 * @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead.
 *
 * Former main entry point for scraping a full menu (including out-of-stock
 * items) into the legacy products table. The function is disabled: every
 * call rejects with a deprecation error before any work is done. The old
 * implementation is kept after the throw purely as reference.
 */
export async function scrapeAllDutchieProducts(
  pool: Pool,
  storeId: number,
  menuUrl: string
): Promise<{
  success: boolean;
  totalProducts: number;
  activeCount: number;
  inactiveCount: number;
  inserted: number;
  updated: number;
  error?: string;
}> {
  // DEPRECATED: fail immediately so nobody uses this path by accident.
  const deprecationMessage =
    'DEPRECATED: scrapeAllDutchieProducts() is deprecated. ' +
    'Use src/dutchie-az/services/product-crawler.ts instead. ' +
    'This scraper writes to the legacy products table.';
  throw new Error(deprecationMessage);

  // ---- Unreachable: previous implementation, retained for reference ----

  // Builds the all-zero failure result carrying the given error text.
  const failed = (message: string) => ({
    success: false,
    totalProducts: 0,
    activeCount: 0,
    inactiveCount: 0,
    inserted: 0,
    updated: 0,
    error: message,
  });

  try {
    console.log(`[DutchieGraphQL] Scraping ALL products (including out-of-stock): ${menuUrl}`);

    // Step 1: pull the whole catalogue through direct GraphQL calls
    const fetched = await fetchAllDutchieProducts(menuUrl, {
      includeOutOfStock: true,
      perPage: 100,
    });
    if (fetched.products.length === 0) {
      return failed('No products returned from GraphQL');
    }

    // Step 2: convert raw products to the normalized shape
    const normalized = fetched.products.map(normalizeDutchieProduct);

    // Step 3: persist them
    const counts = await upsertProductsDirect(pool, storeId, normalized);

    console.log(`[DutchieGraphQL] Complete: ${fetched.totalProducts} products (${fetched.activeCount} active, ${fetched.inactiveCount} inactive)`);
    console.log(`[DutchieGraphQL] Database: ${counts.inserted} inserted, ${counts.updated} updated`);

    return {
      success: true,
      totalProducts: fetched.totalProducts,
      activeCount: fetched.activeCount,
      inactiveCount: fetched.inactiveCount,
      inserted: counts.inserted,
      updated: counts.updated,
    };
  } catch (error: any) {
    console.error(`[DutchieGraphQL] Error:`, error.message);
    return failed(error.message);
  }
}