// (Listing metadata from the source viewer: 440 lines, 14 KiB, TypeScript.)
// ============================================================================
// DEPRECATED: This scraper writes to the LEGACY products table.
// DO NOT USE - All Dutchie crawling must use the new dutchie-az pipeline.
//
// New pipeline location: src/dutchie-az/services/product-crawler.ts
// - Uses fetch-based GraphQL (no Puppeteer needed)
// - Writes to isolated dutchie_az_* tables with snapshot model
// - Tracks stockStatus, isPresentInFeed, missing_from_feed
// ============================================================================
|
|
|
|
/**
 * @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead.
 * This scraper writes to the legacy products table, not the new dutchie_az tables.
 *
 * Makes direct GraphQL requests from within the browser context to:
 * 1. Bypass Cloudflare (using browser session)
 * 2. Fetch ALL products including out-of-stock (Status: null)
 * 3. Paginate through complete menu
 */
|
|
|
|
import puppeteer from 'puppeteer-extra';
|
|
import type { Browser, Page } from 'puppeteer';
|
|
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
|
import { Pool } from 'pg';
|
|
import { DutchieProduct, NormalizedProduct, normalizeDutchieProduct } from './dutchie-graphql';
|
|
|
|
// Register the stealth plugin on the shared puppeteer-extra instance at module
// load, so it is already installed when fetchAllDutchieProducts() launches a
// browser. NOTE(review): presumably this hides common automation fingerprints —
// confirm against the puppeteer-extra-plugin-stealth documentation.
puppeteer.use(StealthPlugin());
|
|
|
|
/**
 * GraphQL persisted-query hashes, keyed by operation name.
 *
 * Each value is sent as `extensions.persistedQuery.sha256Hash` alongside the
 * matching `operationName`. Declared `as const` so the table is read-only and
 * each entry keeps its literal type.
 */
const GRAPHQL_HASHES = {
  FilteredProducts: 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0',
  GetAddressBasedDispensaryData: '13461f73abf7268770dfd05fe7e10c523084b2bb916a929c08efe3d87531977b',
} as const;
|
|
|
|
/**
 * Result of a full menu fetch, as returned by `fetchAllDutchieProducts`.
 */
interface FetchResult {
  /** Every product collected across all fetched pages, in fetch order. */
  products: DutchieProduct[];

  /** Dispensary ID read from the menu page (`window.reactEnv`). */
  dispensaryId: string;

  /** Number of entries in `products`. */
  totalProducts: number;

  /** Number of products whose `Status` is exactly `'Active'`. */
  activeCount: number;

  /** Number of products whose `Status` is anything other than `'Active'`. */
  inactiveCount: number;
}
|
|
|
|
/**
 * Fetch all products of a Dutchie menu via in-page GraphQL requests.
 *
 * Launches a Puppeteer browser, loads `menuUrl` so the page holds a browser
 * session, reads the dispensary ID from `window.reactEnv`, and then requests
 * the `FilteredProducts` persisted query page by page from inside the page
 * context (with `credentials: 'include'`) until a page comes back empty or
 * the page limit is reached. The browser is always closed before returning.
 *
 * If a response carries GraphQL `errors`, they are logged and paging stops;
 * the products collected up to that point are still returned.
 *
 * @param menuUrl - URL of the menu page to open before issuing requests.
 * @param options - Optional settings:
 *   - `headless`: forwarded to `puppeteer.launch` (default `'new'`).
 *   - `timeout`: navigation timeout in ms for the initial page load (default 90000).
 *   - `perPage`: page size sent with each GraphQL request (default 100).
 *   - `includeOutOfStock`: when true, `Status: null` is sent so out-of-stock
 *     items are included; otherwise `Status: 'Active'` (default true).
 *   - `maxPages`: maximum number of non-empty pages to collect (default 51,
 *     the previous hard-coded limit). The first page is always requested.
 * @returns The collected products together with the dispensary ID and
 *   total / active / inactive counts.
 * @throws Error when the dispensary ID cannot be read from the page, when
 *   navigation fails, or when a GraphQL request returns a non-OK HTTP status
 *   (`HTTP <status>`).
 */
export async function fetchAllDutchieProducts(
  menuUrl: string,
  options: {
    headless?: boolean | 'new';
    timeout?: number;
    perPage?: number;
    includeOutOfStock?: boolean;
    maxPages?: number;
  } = {}
): Promise<FetchResult> {
  const {
    headless = 'new',
    timeout = 90000,
    perPage = 100,
    includeOutOfStock = true,
    maxPages = 51,
  } = options;

  let browser: Browser | undefined;

  try {
    browser = await puppeteer.launch({
      headless,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
      ],
    });

    const page = await browser.newPage();

    // Stealth configuration
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    );
    await page.setViewport({ width: 1920, height: 1080 });
    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, 'webdriver', { get: () => false });
      (window as any).chrome = { runtime: {} };
    });

    // Navigate to the menu page first so later in-page requests reuse its session.
    console.log('[DutchieGraphQL] Loading menu page to establish session...');
    await page.goto(menuUrl, {
      waitUntil: 'networkidle2',
      timeout,
    });

    // The page exposes its dispensary (or retailer) ID on window.reactEnv.
    const dispensaryId = await page.evaluate(() => {
      const env = (window as any).reactEnv;
      return env?.dispensaryId || env?.retailerId || '';
    });

    if (!dispensaryId) {
      throw new Error('Could not determine dispensaryId from page');
    }

    console.log(`[DutchieGraphQL] Dispensary ID: ${dispensaryId}`);

    // Fetch all products via in-page GraphQL requests
    const allProducts: DutchieProduct[] = [];
    let pageNum = 0;

    while (true) {
      console.log(`[DutchieGraphQL] Fetching page ${pageNum} (perPage=${perPage})...`);

      // This callback runs inside the browser page, so it must not reference
      // anything from this module; every input is passed in as an argument.
      const result = await page.evaluate(
        async (
          dispensaryId: string,
          pageIndex: number,
          perPage: number,
          includeOutOfStock: boolean,
          hash: string
        ) => {
          const variables = {
            includeEnterpriseSpecials: false,
            productsFilter: {
              dispensaryId,
              pricingType: 'rec',
              Status: includeOutOfStock ? null : 'Active', // null = include out-of-stock
              types: [],
              useCache: false, // Don't cache to get fresh data
              isDefaultSort: true,
              sortBy: 'popularSortIdx',
              sortDirection: 1,
              bypassOnlineThresholds: true,
              isKioskMenu: false,
              removeProductsBelowOptionThresholds: false,
            },
            page: pageIndex,
            perPage,
          };

          const qs = new URLSearchParams({
            operationName: 'FilteredProducts',
            variables: JSON.stringify(variables),
            extensions: JSON.stringify({
              persistedQuery: { version: 1, sha256Hash: hash },
            }),
          });

          const response = await fetch(`https://dutchie.com/graphql?${qs.toString()}`, {
            method: 'GET',
            headers: {
              'content-type': 'application/json',
              'apollographql-client-name': 'Marketplace (production)',
            },
            credentials: 'include', // Include cookies/session
          });

          if (!response.ok) {
            throw new Error(`HTTP ${response.status}`);
          }

          return response.json();
        },
        dispensaryId,
        pageNum,
        perPage,
        includeOutOfStock,
        GRAPHQL_HASHES.FilteredProducts
      );

      // Guard with `?.`: a null/empty JSON body must not raise a TypeError here.
      if (result?.errors) {
        console.error('[DutchieGraphQL] GraphQL errors:', result.errors);
        break;
      }

      const products: DutchieProduct[] = result?.data?.filteredProducts?.products || [];
      console.log(`[DutchieGraphQL] Page ${pageNum}: ${products.length} products`);

      // An empty page marks the end of the menu.
      if (products.length === 0) {
        break;
      }

      allProducts.push(...products);
      pageNum++;

      // Safety limit so an endpoint that never returns an empty page cannot loop forever.
      if (pageNum >= maxPages) {
        console.log('[DutchieGraphQL] Reached page limit, stopping');
        break;
      }
    }

    // Count active vs inactive (everything that is not 'Active' counts as inactive).
    const activeCount = allProducts.filter((p) => p.Status === 'Active').length;
    const inactiveCount = allProducts.length - activeCount;

    console.log(`[DutchieGraphQL] Total: ${allProducts.length} products (${activeCount} active, ${inactiveCount} inactive)`);

    return {
      products: allProducts,
      dispensaryId,
      totalProducts: allProducts.length,
      activeCount,
      inactiveCount,
    };
  } finally {
    if (browser) {
      await browser.close();
    }
  }
}
|
|
|
|
/**
 * Upsert statement for the legacy `products` table.
 *
 * Conflicts on `(store_id, slug)` update the existing row. `external_id` and
 * `dutchie_url` are only written on insert; they are not part of the
 * `DO UPDATE SET` list. `(xmax = 0) AS was_inserted` is returned so the caller
 * can tell inserts from updates.
 */
const UPSERT_PRODUCT_SQL = `
  INSERT INTO products (
    store_id, external_id, slug, name, enterprise_product_id,
    brand, brand_external_id, brand_logo_url,
    subcategory, strain_type, canonical_category,
    price, rec_price, med_price, rec_special_price, med_special_price,
    is_on_special, special_name, discount_percent, special_data,
    sku, inventory_quantity, inventory_available, is_below_threshold, status,
    thc_percentage, cbd_percentage, cannabinoids,
    weight_mg, net_weight_value, net_weight_unit, options, raw_options,
    image_url, additional_images,
    is_featured, medical_only, rec_only,
    source_created_at, source_updated_at,
    description, raw_data,
    dutchie_url, last_seen_at, updated_at
  )
  VALUES (
    $1, $2, $3, $4, $5,
    $6, $7, $8,
    $9, $10, $11,
    $12, $13, $14, $15, $16,
    $17, $18, $19, $20,
    $21, $22, $23, $24, $25,
    $26, $27, $28,
    $29, $30, $31, $32, $33,
    $34, $35,
    $36, $37, $38,
    $39, $40,
    $41, $42,
    '', NOW(), NOW()
  )
  ON CONFLICT (store_id, slug) DO UPDATE SET
    name = EXCLUDED.name,
    enterprise_product_id = EXCLUDED.enterprise_product_id,
    brand = EXCLUDED.brand,
    brand_external_id = EXCLUDED.brand_external_id,
    brand_logo_url = EXCLUDED.brand_logo_url,
    subcategory = EXCLUDED.subcategory,
    strain_type = EXCLUDED.strain_type,
    canonical_category = EXCLUDED.canonical_category,
    price = EXCLUDED.price,
    rec_price = EXCLUDED.rec_price,
    med_price = EXCLUDED.med_price,
    rec_special_price = EXCLUDED.rec_special_price,
    med_special_price = EXCLUDED.med_special_price,
    is_on_special = EXCLUDED.is_on_special,
    special_name = EXCLUDED.special_name,
    discount_percent = EXCLUDED.discount_percent,
    special_data = EXCLUDED.special_data,
    sku = EXCLUDED.sku,
    inventory_quantity = EXCLUDED.inventory_quantity,
    inventory_available = EXCLUDED.inventory_available,
    is_below_threshold = EXCLUDED.is_below_threshold,
    status = EXCLUDED.status,
    thc_percentage = EXCLUDED.thc_percentage,
    cbd_percentage = EXCLUDED.cbd_percentage,
    cannabinoids = EXCLUDED.cannabinoids,
    weight_mg = EXCLUDED.weight_mg,
    net_weight_value = EXCLUDED.net_weight_value,
    net_weight_unit = EXCLUDED.net_weight_unit,
    options = EXCLUDED.options,
    raw_options = EXCLUDED.raw_options,
    image_url = EXCLUDED.image_url,
    additional_images = EXCLUDED.additional_images,
    is_featured = EXCLUDED.is_featured,
    medical_only = EXCLUDED.medical_only,
    rec_only = EXCLUDED.rec_only,
    source_created_at = EXCLUDED.source_created_at,
    source_updated_at = EXCLUDED.source_updated_at,
    description = EXCLUDED.description,
    raw_data = EXCLUDED.raw_data,
    last_seen_at = NOW(),
    updated_at = NOW()
  RETURNING (xmax = 0) AS was_inserted
`;

/**
 * Build the positional parameter list ($1..$42) for `UPSERT_PRODUCT_SQL`.
 * The order here must match the column order of the INSERT exactly.
 */
function buildUpsertParams(storeId: number, product: NormalizedProduct): unknown[] {
  return [
    storeId,
    product.external_id,
    product.slug,
    product.name,
    product.enterprise_product_id,
    product.brand,
    product.brand_external_id,
    product.brand_logo_url,
    product.subcategory,
    product.strain_type,
    product.canonical_category,
    product.price,
    product.rec_price,
    product.med_price,
    product.rec_special_price,
    product.med_special_price,
    product.is_on_special,
    product.special_name,
    product.discount_percent,
    // Structured values are sent as JSON text; falsy values become SQL NULL.
    product.special_data ? JSON.stringify(product.special_data) : null,
    product.sku,
    product.inventory_quantity,
    product.inventory_available,
    product.is_below_threshold,
    product.status,
    product.thc_percentage,
    product.cbd_percentage,
    product.cannabinoids ? JSON.stringify(product.cannabinoids) : null,
    product.weight_mg,
    product.net_weight_value,
    product.net_weight_unit,
    product.options,
    product.raw_options,
    product.image_url,
    product.additional_images,
    product.is_featured,
    product.medical_only,
    product.rec_only,
    product.source_created_at,
    product.source_updated_at,
    product.description,
    product.raw_data ? JSON.stringify(product.raw_data) : null,
  ];
}

/**
 * Upsert products into the legacy `products` table inside one transaction.
 *
 * Either every product is written or, on any failure, the transaction is
 * rolled back and the original error is rethrown. An empty `products` list
 * returns immediately without acquiring a connection.
 *
 * @param pool - pg connection pool to take a client from.
 * @param storeId - Store the products belong to (written to `store_id`).
 * @param products - Normalized products to insert or update.
 * @returns How many rows were newly inserted and how many were updated.
 * @throws Whatever error the failing query raised. A failure of the ROLLBACK
 *   itself is logged and does not replace the original error.
 */
export async function upsertProductsDirect(
  pool: Pool,
  storeId: number,
  products: NormalizedProduct[]
): Promise<{ inserted: number; updated: number }> {
  // Nothing to write: skip the connection and the empty transaction.
  if (products.length === 0) {
    return { inserted: 0, updated: 0 };
  }

  const client = await pool.connect();
  let inserted = 0;
  let updated = 0;
  // Set when ROLLBACK fails: the client's transaction state is then unknown,
  // so it must be discarded instead of being returned to the pool.
  let discardClient = false;

  try {
    await client.query('BEGIN');

    for (const product of products) {
      const result = await client.query(UPSERT_PRODUCT_SQL, buildUpsertParams(storeId, product));

      if (result.rows[0]?.was_inserted) {
        inserted++;
      } else {
        updated++;
      }
    }

    await client.query('COMMIT');
    return { inserted, updated };
  } catch (error) {
    try {
      await client.query('ROLLBACK');
    } catch (rollbackError) {
      // Keep the original failure as the error the caller sees.
      discardClient = true;
      console.error('[DutchieGraphQL] ROLLBACK failed:', rollbackError);
    }
    throw error;
  } finally {
    client.release(discardClient);
  }
}
|
|
|
|
/**
 * @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead.
 *
 * Former main entry point for scraping all products (including out-of-stock)
 * into the legacy products table. It is disabled: every call rejects with a
 * deprecation error so the legacy table cannot be written by accident.
 *
 * The old flow (fetchAllDutchieProducts -> normalizeDutchieProduct ->
 * upsertProductsDirect) sat unreachable behind the throw and has been removed;
 * it remains available in version control history.
 *
 * @param pool - Unused; kept so existing call sites still type-check.
 * @param storeId - Unused; kept so existing call sites still type-check.
 * @param menuUrl - Unused; kept so existing call sites still type-check.
 * @returns Never resolves; the returned promise always rejects.
 * @throws Error with a `DEPRECATED:` message on every call.
 */
export async function scrapeAllDutchieProducts(
  pool: Pool,
  storeId: number,
  menuUrl: string
): Promise<{
  success: boolean;
  totalProducts: number;
  activeCount: number;
  inactiveCount: number;
  inserted: number;
  updated: number;
  error?: string;
}> {
  // DEPRECATED: Throw error to prevent accidental use
  throw new Error(
    'DEPRECATED: scrapeAllDutchieProducts() is deprecated. ' +
      'Use src/dutchie-az/services/product-crawler.ts instead. ' +
      'This scraper writes to the legacy products table.'
  );
}
|