feat(treez): CDP interception client for Elasticsearch API capture
Rewrites Treez platform client to use CDP (Chrome DevTools Protocol) interception instead of DOM scraping. Key changes: - Uses Puppeteer Stealth plugin to bypass headless detection - Intercepts Elasticsearch API responses via CDP Network.responseReceived - Captures full product data including inventory levels (availableUnits) - Adds comprehensive TypeScript types for all Treez data structures - Updates queries.ts with automatic session management - Fixes product-discovery-treez handler for new API shape Tested with Best Dispensary: 142 products across 10 categories captured with inventory data, pricing, and lab results. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -1,107 +1,172 @@
|
||||
/**
|
||||
* Test script for Treez platform client
|
||||
* Tests the new Treez integration with Best Dispensary
|
||||
* ============================================================
|
||||
* TREEZ CLIENT TEST SCRIPT
|
||||
* ============================================================
|
||||
*
|
||||
* Tests the Treez CDP interception client using Best Dispensary.
|
||||
*
|
||||
* This verifies:
|
||||
* - Stealth plugin bypasses headless detection
|
||||
* - CDP intercepts Elasticsearch API responses
|
||||
* - Products are captured and normalized correctly
|
||||
* - Inventory data is available
|
||||
*
|
||||
* Usage: npx ts-node scripts/test-treez-client.ts
|
||||
*
|
||||
* ============================================================
|
||||
*/
|
||||
|
||||
import {
|
||||
fetchProductsByStoreId,
|
||||
} from '../src/platforms/treez';
|
||||
import { TreezNormalizer } from '../src/hydration/normalizers/treez';
|
||||
import { fetchProductsFromUrl } from '../src/platforms/treez';
|
||||
|
||||
const TEST_STORE_ID = 'best';
|
||||
const TEST_URL = 'https://shop.bestdispensary.com/shop';
|
||||
|
||||
async function main() {
|
||||
console.log('='.repeat(60));
|
||||
console.log('Treez Platform Client Test');
|
||||
console.log('TREEZ CLIENT TEST - CDP INTERCEPTION');
|
||||
console.log('='.repeat(60));
|
||||
console.log(`Test Store: ${TEST_STORE_ID}`);
|
||||
console.log(`URL: ${TEST_URL}`);
|
||||
console.log('Method: Puppeteer + Stealth + CDP response capture');
|
||||
console.log('');
|
||||
|
||||
try {
|
||||
// Test 1: Fetch products from store
|
||||
console.log('[Test 1] Fetching products from Treez store...');
|
||||
const result = await fetchProductsByStoreId(TEST_STORE_ID);
|
||||
console.log('[Starting] Launching browser with Stealth plugin...\n');
|
||||
|
||||
console.log('');
|
||||
console.log('[Results]');
|
||||
console.log(` Store: ${result.store.name}`);
|
||||
console.log(` Store ID: ${result.store.storeId}`);
|
||||
console.log(` Products captured: ${result.products.length}`);
|
||||
console.log(` Scroll count: ${result.scrollCount}`);
|
||||
const result = await fetchProductsFromUrl(TEST_URL);
|
||||
|
||||
if (result.products.length > 0) {
|
||||
console.log('');
|
||||
console.log('[Sample Products (first 5)]');
|
||||
for (const p of result.products.slice(0, 5)) {
|
||||
console.log(` - ${p.name}`);
|
||||
console.log(` Brand: ${p.brand || 'N/A'}`);
|
||||
console.log(` Category: ${p.category || 'N/A'} / ${p.subcategory || 'N/A'}`);
|
||||
console.log(` Price: ${p.price ? '$' + p.price : 'N/A'}`);
|
||||
console.log(` THC: ${p.thcPercent !== null ? p.thcPercent + '%' : 'N/A'}`);
|
||||
}
|
||||
console.log('\n' + '='.repeat(60));
|
||||
console.log('RESULTS');
|
||||
console.log('='.repeat(60));
|
||||
console.log(`Total products: ${result.totalCaptured}`);
|
||||
console.log(`Store ID: ${result.storeId || 'N/A (custom domain)'}`);
|
||||
console.log(`Source URL: ${result.sourceUrl}`);
|
||||
console.log(`Fetched at: ${result.fetchedAt.toISOString()}`);
|
||||
|
||||
// Test 2: Normalize products
|
||||
console.log('');
|
||||
console.log('[Test 2] Testing normalizer...');
|
||||
const normalizer = new TreezNormalizer();
|
||||
|
||||
// Build a fake payload structure
|
||||
const fakePayload = {
|
||||
id: 'test-payload',
|
||||
dispensary_id: 9999,
|
||||
crawl_run_id: null,
|
||||
platform: 'treez',
|
||||
payload_version: 1,
|
||||
raw_json: { products: result.products },
|
||||
product_count: result.products.length,
|
||||
pricing_type: null,
|
||||
crawl_mode: null,
|
||||
fetched_at: new Date(),
|
||||
processed: false,
|
||||
normalized_at: null,
|
||||
hydration_error: null,
|
||||
hydration_attempts: 0,
|
||||
created_at: new Date(),
|
||||
};
|
||||
|
||||
const normalized = normalizer.normalize(fakePayload);
|
||||
|
||||
console.log(` Products normalized: ${normalized.products.length}`);
|
||||
console.log(` Brands extracted: ${normalized.brands.length}`);
|
||||
console.log(` Categories extracted: ${normalized.categories.length}`);
|
||||
console.log(` Errors: ${normalized.errors.length}`);
|
||||
|
||||
if (normalized.products.length > 0) {
|
||||
console.log('');
|
||||
console.log('[Sample Normalized Product]');
|
||||
const np = normalized.products[0];
|
||||
console.log(` External ID: ${np.externalProductId}`);
|
||||
console.log(` Name: ${np.name}`);
|
||||
console.log(` Brand: ${np.brandName}`);
|
||||
console.log(` Category: ${np.category}`);
|
||||
console.log(` Type: ${np.type}`);
|
||||
console.log(` Strain: ${np.strainType}`);
|
||||
console.log(` THC: ${np.thcPercent !== null ? np.thcPercent + '%' : 'N/A'}`);
|
||||
console.log(` CBD: ${np.cbdPercent !== null ? np.cbdPercent + '%' : 'N/A'}`);
|
||||
console.log(` Image: ${np.primaryImageUrl?.slice(0, 60) || 'N/A'}...`);
|
||||
|
||||
const pricing = normalized.pricing.get(np.externalProductId);
|
||||
if (pricing) {
|
||||
console.log(` Price (cents): ${pricing.priceRec}`);
|
||||
}
|
||||
}
|
||||
if (result.products.length === 0) {
|
||||
console.log('\n[WARNING] No products captured!');
|
||||
console.log('This could mean:');
|
||||
console.log(' - Stealth plugin is not bypassing detection');
|
||||
console.log(' - CDP is not intercepting the correct URLs');
|
||||
console.log(' - Page structure has changed');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
console.log('');
|
||||
// Show sample raw product
|
||||
console.log('\n' + '='.repeat(60));
|
||||
console.log('SAMPLE RAW PRODUCT (from Elasticsearch)');
|
||||
console.log('='.repeat(60));
|
||||
const raw = result.products[0];
|
||||
console.log(JSON.stringify({
|
||||
id: raw.id,
|
||||
name: raw.name,
|
||||
menuTitle: raw.menuTitle,
|
||||
brand: raw.brand,
|
||||
category: raw.category,
|
||||
subtype: raw.subtype,
|
||||
status: raw.status,
|
||||
availableUnits: raw.availableUnits,
|
||||
customMinPrice: raw.customMinPrice,
|
||||
customMaxPrice: raw.customMaxPrice,
|
||||
isActive: raw.isActive,
|
||||
isAboveThreshold: raw.isAboveThreshold,
|
||||
}, null, 2));
|
||||
|
||||
// Show sample normalized product
|
||||
console.log('\n' + '='.repeat(60));
|
||||
console.log('SAMPLE NORMALIZED PRODUCT');
|
||||
console.log('='.repeat(60));
|
||||
const normalized = result.normalized[0];
|
||||
console.log(JSON.stringify({
|
||||
id: normalized.id,
|
||||
name: normalized.name,
|
||||
brand: normalized.brand,
|
||||
category: normalized.category,
|
||||
subtype: normalized.subtype,
|
||||
price: normalized.price,
|
||||
priceMin: normalized.priceMin,
|
||||
priceMax: normalized.priceMax,
|
||||
discountedPrice: normalized.discountedPrice,
|
||||
discountPercent: normalized.discountPercent,
|
||||
availableUnits: normalized.availableUnits,
|
||||
inStock: normalized.inStock,
|
||||
thcPercent: normalized.thcPercent,
|
||||
cbdPercent: normalized.cbdPercent,
|
||||
strainType: normalized.strainType,
|
||||
effects: normalized.effects,
|
||||
flavors: normalized.flavors,
|
||||
imageUrl: normalized.imageUrl,
|
||||
images: normalized.images?.slice(0, 2),
|
||||
}, null, 2));
|
||||
|
||||
// Brand breakdown
|
||||
console.log('\n' + '='.repeat(60));
|
||||
console.log('BRANDS (top 15)');
|
||||
console.log('='.repeat(60));
|
||||
const brandCounts = new Map<string, number>();
|
||||
for (const p of result.normalized) {
|
||||
const brand = p.brand || 'Unknown';
|
||||
brandCounts.set(brand, (brandCounts.get(brand) || 0) + 1);
|
||||
}
|
||||
|
||||
const sorted = [...brandCounts.entries()].sort((a, b) => b[1] - a[1]);
|
||||
console.log(`Total unique brands: ${sorted.length}\n`);
|
||||
sorted.slice(0, 15).forEach(([brand, count]) => {
|
||||
console.log(` ${brand}: ${count} products`);
|
||||
});
|
||||
|
||||
// Category breakdown
|
||||
console.log('\n' + '='.repeat(60));
|
||||
console.log('CATEGORIES');
|
||||
console.log('='.repeat(60));
|
||||
const categoryCounts = new Map<string, number>();
|
||||
for (const p of result.normalized) {
|
||||
const cat = p.category || 'Unknown';
|
||||
categoryCounts.set(cat, (categoryCounts.get(cat) || 0) + 1);
|
||||
}
|
||||
|
||||
const catSorted = [...categoryCounts.entries()].sort((a, b) => b[1] - a[1]);
|
||||
catSorted.forEach(([cat, count]) => {
|
||||
console.log(` ${cat}: ${count} products`);
|
||||
});
|
||||
|
||||
// Inventory stats
|
||||
console.log('\n' + '='.repeat(60));
|
||||
console.log('INVENTORY STATS');
|
||||
console.log('='.repeat(60));
|
||||
const inStock = result.normalized.filter(p => p.inStock).length;
|
||||
const outOfStock = result.normalized.filter(p => !p.inStock).length;
|
||||
const hasInventoryData = result.normalized.filter(p => p.availableUnits > 0).length;
|
||||
|
||||
console.log(`In stock: ${inStock}`);
|
||||
console.log(`Out of stock: ${outOfStock}`);
|
||||
console.log(`With inventory levels: ${hasInventoryData}`);
|
||||
|
||||
// Show inventory examples
|
||||
if (hasInventoryData > 0) {
|
||||
console.log('\nSample inventory levels:');
|
||||
result.normalized
|
||||
.filter(p => p.availableUnits > 0)
|
||||
.slice(0, 5)
|
||||
.forEach(p => {
|
||||
console.log(` ${p.name}: ${p.availableUnits} units`);
|
||||
});
|
||||
}
|
||||
|
||||
// Check for THC/CBD data
|
||||
const hasThc = result.normalized.filter(p => p.thcPercent !== null).length;
|
||||
const hasCbd = result.normalized.filter(p => p.cbdPercent !== null).length;
|
||||
console.log(`\nWith THC data: ${hasThc} (${Math.round(hasThc / result.totalCaptured * 100)}%)`);
|
||||
console.log(`With CBD data: ${hasCbd} (${Math.round(hasCbd / result.totalCaptured * 100)}%)`);
|
||||
|
||||
// Check for images
|
||||
const hasImages = result.normalized.filter(p => p.imageUrl).length;
|
||||
console.log(`With images: ${hasImages} (${Math.round(hasImages / result.totalCaptured * 100)}%)`);
|
||||
|
||||
console.log('\n' + '='.repeat(60));
|
||||
console.log('TEST PASSED');
|
||||
console.log('='.repeat(60));
|
||||
|
||||
} catch (error: any) {
|
||||
console.error('');
|
||||
console.error('='.repeat(60));
|
||||
console.error('\n' + '='.repeat(60));
|
||||
console.error('TEST FAILED');
|
||||
console.error('='.repeat(60));
|
||||
console.error(`Error: ${error.message}`);
|
||||
|
||||
@@ -3,77 +3,63 @@
|
||||
* TREEZ PLATFORM CLIENT
|
||||
* ============================================================
|
||||
*
|
||||
* Treez is a fully client-side rendered platform (React/Next.js).
|
||||
* Unlike Dutchie (GraphQL) or Jane (Algolia), Treez requires DOM
|
||||
* parsing after page render. No API endpoints are available.
|
||||
* Treez uses Cloudflare protection + headless detection on their
|
||||
* Elasticsearch API. This client uses:
|
||||
*
|
||||
* Key differences:
|
||||
* - No Cloudflare protection (simpler than Jane)
|
||||
* - Products loaded via infinite scroll
|
||||
* - Data extracted from DOM elements
|
||||
* - Age gate must be bypassed
|
||||
* 1. Puppeteer with Stealth plugin to bypass detection
|
||||
* 2. CDP (Chrome DevTools Protocol) to intercept API responses
|
||||
* 3. Scrolling/pagination to trigger all product loads
|
||||
*
|
||||
* API Endpoints (intercepted, not called directly):
|
||||
* - Products: POST https://search-{tenant}.gapcommerceapi.com/product/search
|
||||
* - Discounts: GET https://headless.treez.io/v2.0/dispensary/{storeId}/ecommerce/discounts
|
||||
*
|
||||
* URL Pattern: https://{storeId}.treez.io/onlinemenu/?customerType=ADULT
|
||||
* Store ID Format: String slug (e.g., "best")
|
||||
* Menu URL: https://{storeId}.treez.io/onlinemenu/ or custom domain
|
||||
*
|
||||
* Data captured includes:
|
||||
* - Full product details (name, brand, category, subtype)
|
||||
* - Inventory levels (availableUnits)
|
||||
* - Pricing with discounts
|
||||
* - Lab results (THC/CBD when available)
|
||||
*
|
||||
* ============================================================
|
||||
*/
|
||||
|
||||
import puppeteer, { Browser, Page } from 'puppeteer';
|
||||
import puppeteerExtra from 'puppeteer-extra';
|
||||
import puppeteer from 'puppeteer-extra';
|
||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||
import type { Browser, Page, CDPSession } from 'puppeteer';
|
||||
|
||||
import type { CrawlRotator, BrowserFingerprint } from '../../services/crawl-rotator';
|
||||
import type {
|
||||
TreezSession,
|
||||
TreezProductRaw,
|
||||
TreezProduct,
|
||||
TreezConfig,
|
||||
TreezESResponse,
|
||||
} from './types';
|
||||
|
||||
// Register stealth plugin (good practice even without Cloudflare)
|
||||
puppeteerExtra.use(StealthPlugin());
|
||||
|
||||
// ============================================================
|
||||
// TYPES
|
||||
// ============================================================
|
||||
|
||||
export interface TreezProductRaw {
|
||||
productId: string;
|
||||
name: string;
|
||||
brand: string;
|
||||
category: string;
|
||||
subcategory: string; // indica, sativa, hybrid
|
||||
thcPercent: number | null;
|
||||
cbdPercent: number | null;
|
||||
price: number | null;
|
||||
priceUnit: string;
|
||||
imageUrl: string | null;
|
||||
inStock: boolean;
|
||||
weight: string | null;
|
||||
}
|
||||
|
||||
export interface TreezSession {
|
||||
sessionId: string;
|
||||
browser: Browser;
|
||||
page: Page;
|
||||
fingerprint: BrowserFingerprint;
|
||||
proxyUrl: string | null;
|
||||
startedAt: Date;
|
||||
storeId?: string;
|
||||
}
|
||||
|
||||
export interface TreezStoreInfo {
|
||||
storeId: string;
|
||||
name: string;
|
||||
url: string;
|
||||
}
|
||||
// Register stealth plugin - REQUIRED for Treez
|
||||
puppeteer.use(StealthPlugin());
|
||||
|
||||
// ============================================================
|
||||
// CONFIGURATION
|
||||
// ============================================================
|
||||
|
||||
export const TREEZ_CONFIG = {
|
||||
baseUrl: 'https://{storeId}.treez.io/onlinemenu/',
|
||||
timeout: 60000,
|
||||
export const TREEZ_CONFIG: TreezConfig = {
|
||||
// Elasticsearch API (product data) - intercepted via CDP
|
||||
esEndpoint: 'gapcommerceapi.com/product/search',
|
||||
esApiKey: 'V3jHL9dFzi3Gj4UISM4lr38Nm0GSxcps5OBz1PbS',
|
||||
|
||||
// Treez Headless API (discounts, store info)
|
||||
headlessApiBase: 'https://headless.treez.io/v2.0/dispensary',
|
||||
clientId: '29dce682258145c6b1cf71027282d083',
|
||||
clientSecret: 'A57bB49AfD7F4233B1750a0B501B4E16',
|
||||
|
||||
// Timing
|
||||
navigationTimeout: 60000,
|
||||
scrollDelay: 1500,
|
||||
maxScrollAttempts: 50,
|
||||
ageGateDelay: 2000,
|
||||
};
|
||||
|
||||
// ============================================================
|
||||
@@ -102,6 +88,7 @@ export function getCrawlRotator(): CrawlRotator | null {
|
||||
|
||||
/**
|
||||
* Start a new Treez browser session
|
||||
* Uses Puppeteer + Stealth plugin with CDP for response interception
|
||||
*/
|
||||
export async function startSession(storeId?: string): Promise<TreezSession> {
|
||||
if (currentSession) {
|
||||
@@ -122,7 +109,8 @@ export async function startSession(storeId?: string): Promise<TreezSession> {
|
||||
} else {
|
||||
// Default fingerprint for local testing
|
||||
fingerprint = {
|
||||
userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
|
||||
userAgent:
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
|
||||
browserName: 'Chrome',
|
||||
deviceCategory: 'desktop',
|
||||
platform: 'Windows',
|
||||
@@ -159,9 +147,9 @@ export async function startSession(storeId?: string): Promise<TreezSession> {
|
||||
}
|
||||
}
|
||||
|
||||
console.log('[Treez Client] Launching browser...');
|
||||
const browser = await puppeteerExtra.launch({
|
||||
headless: true,
|
||||
console.log('[Treez Client] Launching browser with Stealth plugin...');
|
||||
const browser = await puppeteer.launch({
|
||||
headless: 'new',
|
||||
args: browserArgs,
|
||||
});
|
||||
|
||||
@@ -176,18 +164,6 @@ export async function startSession(storeId?: string): Promise<TreezSession> {
|
||||
// Set user agent
|
||||
await page.setUserAgent(fingerprint.userAgent);
|
||||
|
||||
// Block unnecessary resources to save bandwidth
|
||||
// We only need HTML/JS for DOM extraction - not images, fonts, etc.
|
||||
await page.setRequestInterception(true);
|
||||
page.on('request', (request) => {
|
||||
const resourceType = request.resourceType();
|
||||
if (['image', 'font', 'media', 'stylesheet'].includes(resourceType)) {
|
||||
request.abort();
|
||||
} else {
|
||||
request.continue();
|
||||
}
|
||||
});
|
||||
|
||||
// Handle proxy authentication if needed
|
||||
if (proxyUrl) {
|
||||
const authMatch = proxyUrl.match(/:\/\/([^:]+):([^@]+)@/);
|
||||
@@ -199,16 +175,22 @@ export async function startSession(storeId?: string): Promise<TreezSession> {
|
||||
}
|
||||
}
|
||||
|
||||
// Create CDP session for response interception
|
||||
const cdpClient = await page.target().createCDPSession();
|
||||
await cdpClient.send('Network.enable');
|
||||
|
||||
const sessionId = `treez_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`;
|
||||
|
||||
currentSession = {
|
||||
sessionId,
|
||||
browser,
|
||||
page,
|
||||
cdpClient,
|
||||
fingerprint,
|
||||
proxyUrl,
|
||||
startedAt: new Date(),
|
||||
storeId,
|
||||
capturedProducts: [],
|
||||
};
|
||||
|
||||
console.log(`[Treez Client] Started session ${sessionId}`);
|
||||
@@ -226,7 +208,10 @@ export async function startSession(storeId?: string): Promise<TreezSession> {
|
||||
export async function endSession(): Promise<void> {
|
||||
if (currentSession) {
|
||||
const duration = Math.round((Date.now() - currentSession.startedAt.getTime()) / 1000);
|
||||
console.log(`[Treez Client] Ending session ${currentSession.sessionId} (${duration}s)`);
|
||||
const productCount = currentSession.capturedProducts.length;
|
||||
console.log(
|
||||
`[Treez Client] Ending session ${currentSession.sessionId} (${duration}s, ${productCount} products)`
|
||||
);
|
||||
|
||||
try {
|
||||
await currentSession.browser.close();
|
||||
@@ -246,320 +231,400 @@ export function getCurrentSession(): TreezSession | null {
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// AGE GATE HANDLING
|
||||
// CDP RESPONSE INTERCEPTION
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* Bypass age gate if present
|
||||
* Setup CDP listener to capture Elasticsearch product responses
|
||||
*/
|
||||
export async function bypassAgeGate(page: Page): Promise<boolean> {
|
||||
console.log('[Treez Client] Checking for age gate...');
|
||||
function setupProductCapture(session: TreezSession): void {
|
||||
const { cdpClient } = session;
|
||||
|
||||
try {
|
||||
const ageGate = await page.$('[data-testid="age-gate-modal"], [class*="AgeGate"]');
|
||||
cdpClient.on('Network.responseReceived', async (event: any) => {
|
||||
const url = event.response.url;
|
||||
|
||||
if (ageGate) {
|
||||
console.log('[Treez Client] Age gate detected, clicking confirm button...');
|
||||
|
||||
const submitBtn = await page.$('[data-testid="age-gate-submit-button"], button[type="submit"]');
|
||||
if (submitBtn) {
|
||||
await submitBtn.click();
|
||||
console.log('[Treez Client] Clicked confirm button');
|
||||
|
||||
await sleep(TREEZ_CONFIG.ageGateDelay);
|
||||
|
||||
// Wait for age gate to disappear
|
||||
await page.waitForFunction(
|
||||
() => !document.querySelector('[data-testid="age-gate-modal"]'),
|
||||
{ timeout: 10000 }
|
||||
).catch(() => {
|
||||
console.log('[Treez Client] Gate may still be visible, continuing anyway');
|
||||
// Check if this is an ES product search response
|
||||
if (url.includes('gapcommerceapi.com/product/search') && event.response.status === 200) {
|
||||
try {
|
||||
const response = await cdpClient.send('Network.getResponseBody', {
|
||||
requestId: event.requestId,
|
||||
});
|
||||
|
||||
console.log('[Treez Client] Age gate bypassed');
|
||||
return true;
|
||||
} else {
|
||||
console.log('[Treez Client] No submit button found');
|
||||
const body = response.base64Encoded
|
||||
? Buffer.from(response.body, 'base64').toString('utf8')
|
||||
: response.body;
|
||||
|
||||
const json: TreezESResponse = JSON.parse(body);
|
||||
const products = json.hits?.hits?.map((h) => h._source) || [];
|
||||
|
||||
if (products.length > 0) {
|
||||
session.capturedProducts.push(...products);
|
||||
console.log(
|
||||
`[Treez Client] Captured ${products.length} products (total: ${session.capturedProducts.length})`
|
||||
);
|
||||
}
|
||||
} catch {
|
||||
// Response body may not be available, skip silently
|
||||
}
|
||||
} else {
|
||||
console.log('[Treez Client] No age gate detected');
|
||||
}
|
||||
|
||||
return false;
|
||||
} catch (err: any) {
|
||||
console.log(`[Treez Client] Age gate error: ${err.message}`);
|
||||
return false;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// NAVIGATION & SCRAPING
|
||||
// PRODUCT FETCHING
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* Build menu URL for a store
|
||||
* Uses /brands page which contains all products (not just homepage carousels)
|
||||
* Navigate to store menu and capture all products via CDP interception
|
||||
* This is the main method for fetching products
|
||||
*/
|
||||
export function buildMenuUrl(storeId: string, customerType: 'ADULT' | 'MEDICAL' = 'ADULT'): string {
|
||||
return `https://${storeId}.treez.io/onlinemenu/brands?customerType=${customerType}`;
|
||||
}
|
||||
export async function fetchAllProducts(
|
||||
menuUrl: string,
|
||||
options: {
|
||||
maxScrolls?: number;
|
||||
scrollDelay?: number;
|
||||
bypassAgeGate?: boolean;
|
||||
} = {}
|
||||
): Promise<TreezProductRaw[]> {
|
||||
const {
|
||||
maxScrolls = TREEZ_CONFIG.maxScrollAttempts,
|
||||
scrollDelay = TREEZ_CONFIG.scrollDelay,
|
||||
bypassAgeGate = true,
|
||||
} = options;
|
||||
|
||||
/**
|
||||
* Navigate to a store's menu page
|
||||
*/
|
||||
export async function navigateToMenu(storeId: string): Promise<void> {
|
||||
if (!currentSession) {
|
||||
throw new Error('[Treez Client] No active session - call startSession() first');
|
||||
}
|
||||
|
||||
const { page } = currentSession;
|
||||
const url = buildMenuUrl(storeId);
|
||||
|
||||
console.log(`[Treez Client] Navigating to ${url}`);
|
||||
// Reset captured products
|
||||
currentSession.capturedProducts = [];
|
||||
|
||||
await page.goto(url, {
|
||||
// Setup CDP listener for product responses
|
||||
setupProductCapture(currentSession);
|
||||
|
||||
console.log(`[Treez Client] Navigating to ${menuUrl}`);
|
||||
|
||||
try {
|
||||
await page.goto(menuUrl, {
|
||||
waitUntil: 'networkidle2',
|
||||
timeout: TREEZ_CONFIG.navigationTimeout,
|
||||
});
|
||||
|
||||
await sleep(3000);
|
||||
|
||||
// Bypass age gate if present
|
||||
if (bypassAgeGate) {
|
||||
await tryBypassAgeGate(page);
|
||||
}
|
||||
|
||||
// Wait for initial products to load
|
||||
await sleep(3000);
|
||||
console.log(`[Treez Client] Initial capture: ${currentSession.capturedProducts.length} products`);
|
||||
|
||||
// Scroll and click "Load More" to get all products
|
||||
console.log('[Treez Client] Scrolling to load all products...');
|
||||
|
||||
let previousCount = 0;
|
||||
let noNewDataCount = 0;
|
||||
|
||||
for (let i = 0; i < maxScrolls; i++) {
|
||||
// Scroll to bottom
|
||||
await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
|
||||
await sleep(scrollDelay);
|
||||
|
||||
// Try clicking "Load More" button
|
||||
try {
|
||||
const loadMoreBtn = await page.$('button.collection__load-more');
|
||||
if (loadMoreBtn) {
|
||||
const isVisible = await page.evaluate((btn: Element) => {
|
||||
const rect = btn.getBoundingClientRect();
|
||||
return rect.width > 0 && rect.height > 0;
|
||||
}, loadMoreBtn);
|
||||
|
||||
if (isVisible) {
|
||||
await loadMoreBtn.click();
|
||||
await sleep(scrollDelay);
|
||||
}
|
||||
}
|
||||
} catch {
|
||||
// No load more button or click failed
|
||||
}
|
||||
|
||||
const currentCount = currentSession.capturedProducts.length;
|
||||
if (currentCount === previousCount) {
|
||||
noNewDataCount++;
|
||||
if (noNewDataCount >= 5) {
|
||||
console.log(`[Treez Client] No new products for 5 scrolls, stopping`);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
noNewDataCount = 0;
|
||||
if ((i + 1) % 5 === 0) {
|
||||
console.log(`[Treez Client] Scroll ${i + 1}: ${currentCount} products`);
|
||||
}
|
||||
}
|
||||
previousCount = currentCount;
|
||||
}
|
||||
} catch (error: any) {
|
||||
console.error(`[Treez Client] Navigation error: ${error.message}`);
|
||||
throw error;
|
||||
}
|
||||
|
||||
// Deduplicate products by ID
|
||||
const seen = new Set<string>();
|
||||
const uniqueProducts = currentSession.capturedProducts.filter((p) => {
|
||||
if (!p.id || seen.has(p.id)) return false;
|
||||
seen.add(p.id);
|
||||
return true;
|
||||
});
|
||||
|
||||
console.log(`[Treez Client] Total unique products: ${uniqueProducts.length}`);
|
||||
|
||||
// Record success with rotator
|
||||
if (crawlRotator && uniqueProducts.length > 0) {
|
||||
await crawlRotator.recordSuccess();
|
||||
}
|
||||
|
||||
return uniqueProducts;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch products from a specific brand page
|
||||
*/
|
||||
export async function fetchBrandProducts(
|
||||
storeUrl: string,
|
||||
brandSlug: string
|
||||
): Promise<TreezProductRaw[]> {
|
||||
const brandUrl = `${storeUrl}/brand/${encodeURIComponent(brandSlug)}`;
|
||||
return fetchAllProducts(brandUrl, { maxScrolls: 30 });
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch products from a specific category page
|
||||
*/
|
||||
export async function fetchCategoryProducts(
|
||||
storeUrl: string,
|
||||
categorySlug: string
|
||||
): Promise<TreezProductRaw[]> {
|
||||
const categoryUrl = `${storeUrl}/collection/${encodeURIComponent(categorySlug)}`;
|
||||
return fetchAllProducts(categoryUrl, { maxScrolls: 30 });
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// BRAND DISCOVERY
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* Fetch all brands from the /brands page
|
||||
*/
|
||||
export async function fetchAllBrands(
|
||||
storeUrl: string
|
||||
): Promise<Array<{ name: string; href: string }>> {
|
||||
if (!currentSession) {
|
||||
throw new Error('[Treez Client] No active session - call startSession() first');
|
||||
}
|
||||
|
||||
const { page } = currentSession;
|
||||
const brandsUrl = `${storeUrl}/brands`;
|
||||
|
||||
console.log(`[Treez Client] Fetching brands from ${brandsUrl}`);
|
||||
|
||||
await page.goto(brandsUrl, {
|
||||
waitUntil: 'networkidle2',
|
||||
timeout: TREEZ_CONFIG.navigationTimeout,
|
||||
});
|
||||
|
||||
// Wait for React app to render
|
||||
await sleep(3000);
|
||||
await tryBypassAgeGate(page);
|
||||
await sleep(2000);
|
||||
|
||||
// Bypass age gate
|
||||
await bypassAgeGate(page);
|
||||
// Click "Load More" to get all brands
|
||||
for (let i = 0; i < 20; i++) {
|
||||
try {
|
||||
const btn = await page.$('button.collection__load-more');
|
||||
if (!btn) break;
|
||||
|
||||
// Wait for content to load
|
||||
await sleep(2000);
|
||||
const isVisible = await page.evaluate((b: Element) => {
|
||||
const rect = b.getBoundingClientRect();
|
||||
return rect.width > 0 && rect.height > 0;
|
||||
}, btn);
|
||||
|
||||
console.log('[Treez Client] Menu page loaded');
|
||||
}
|
||||
if (!isVisible) break;
|
||||
|
||||
/**
|
||||
* Scroll to load all products (infinite scroll)
|
||||
*/
|
||||
export async function scrollToLoadAll(page: Page): Promise<number> {
|
||||
let previousHeight = 0;
|
||||
let scrollCount = 0;
|
||||
let sameHeightCount = 0;
|
||||
|
||||
console.log('[Treez Client] Starting infinite scroll...');
|
||||
|
||||
while (scrollCount < TREEZ_CONFIG.maxScrollAttempts) {
|
||||
const currentHeight = await page.evaluate(() => document.body.scrollHeight);
|
||||
|
||||
if (currentHeight === previousHeight) {
|
||||
sameHeightCount++;
|
||||
if (sameHeightCount >= 3) {
|
||||
console.log('[Treez Client] No new content after 3 attempts, stopping');
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
sameHeightCount = 0;
|
||||
}
|
||||
|
||||
await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
|
||||
await sleep(TREEZ_CONFIG.scrollDelay);
|
||||
|
||||
previousHeight = currentHeight;
|
||||
scrollCount++;
|
||||
|
||||
if (scrollCount % 5 === 0) {
|
||||
const productCount = await page.evaluate(() => {
|
||||
return document.querySelectorAll('[class*="product_product__"]').length;
|
||||
});
|
||||
console.log(`[Treez Client] Scroll ${scrollCount}: ${productCount} products loaded`);
|
||||
await btn.click();
|
||||
await sleep(1500);
|
||||
} catch {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return scrollCount;
|
||||
}
|
||||
// Extract brand links
|
||||
const brands = await page.evaluate(() => {
|
||||
const results: Array<{ name: string; href: string }> = [];
|
||||
|
||||
/**
|
||||
* Extract products from the current page
|
||||
*/
|
||||
export async function extractProducts(page: Page): Promise<TreezProductRaw[]> {
|
||||
console.log('[Treez Client] Extracting products from DOM...');
|
||||
|
||||
const products = await page.evaluate(() => {
|
||||
const results: any[] = [];
|
||||
|
||||
// Find all product cards
|
||||
const productElements = Array.from(
|
||||
document.querySelectorAll('[class*="product_product__"]')
|
||||
).filter(el => {
|
||||
const hasName = el.querySelector('[class*="product__name"]') || el.querySelector('[class*="name__"]');
|
||||
const hasPrice = el.querySelector('[class*="price"]');
|
||||
return hasName || hasPrice;
|
||||
});
|
||||
|
||||
const seen = new Set<string>();
|
||||
|
||||
for (const el of productElements) {
|
||||
try {
|
||||
// Get product name
|
||||
const nameEl = el.querySelector('[class*="product__name"], [class*="name__"]');
|
||||
const name = nameEl?.textContent?.trim() || '';
|
||||
|
||||
if (!name || seen.has(name)) continue;
|
||||
seen.add(name);
|
||||
|
||||
// Get product ID from link
|
||||
const linkEl = el.querySelector('a[href*="/product/"]');
|
||||
let productId = '';
|
||||
if (linkEl) {
|
||||
const href = linkEl.getAttribute('href') || '';
|
||||
const match = href.match(/\/product\/([^\/\?]+)/);
|
||||
productId = match ? match[1] : '';
|
||||
}
|
||||
if (!productId) {
|
||||
productId = `treez_${name.replace(/\s+/g, '_').toLowerCase().slice(0, 30)}`;
|
||||
}
|
||||
|
||||
// Get brand
|
||||
const brandEl = el.querySelector('[class*="brand"], [class*="Brand"]');
|
||||
const brand = brandEl?.textContent?.trim() || '';
|
||||
|
||||
// Get price
|
||||
const priceEl = el.querySelector('[class*="price__ins"], [class*="price"]');
|
||||
const priceText = priceEl?.textContent || '';
|
||||
const priceMatch = priceText.match(/\$(\d+(?:\.\d{2})?)/);
|
||||
const price = priceMatch ? parseFloat(priceMatch[1]) : null;
|
||||
|
||||
// Get image URL
|
||||
const imgEl = el.querySelector('img');
|
||||
let imageUrl = imgEl?.getAttribute('src') || null;
|
||||
if (imageUrl && imageUrl.includes('/_next/image')) {
|
||||
const urlMatch = imageUrl.match(/url=([^&]+)/);
|
||||
if (urlMatch) {
|
||||
imageUrl = decodeURIComponent(urlMatch[1]);
|
||||
}
|
||||
}
|
||||
|
||||
// Get text content for data extraction
|
||||
const text = el.textContent || '';
|
||||
const textLower = text.toLowerCase();
|
||||
|
||||
// Get THC/CBD
|
||||
const thcMatch = text.match(/(?:THC[:\s]*)?(\d+(?:\.\d+)?)\s*%?\s*THC/i) ||
|
||||
text.match(/THC[:\s]*(\d+(?:\.\d+)?)\s*%?/i);
|
||||
const cbdMatch = text.match(/(?:CBD[:\s]*)?(\d+(?:\.\d+)?)\s*%?\s*CBD/i) ||
|
||||
text.match(/CBD[:\s]*(\d+(?:\.\d+)?)\s*%?/i);
|
||||
const thcPercent = thcMatch ? parseFloat(thcMatch[1]) : null;
|
||||
const cbdPercent = cbdMatch ? parseFloat(cbdMatch[1]) : null;
|
||||
|
||||
// Get weight from name
|
||||
const weightMatch = name.match(/(\d+(?:\.\d+)?)\s*(G|g|MG|mg|OZ|oz)/i);
|
||||
const weight = weightMatch ? `${weightMatch[1]}${weightMatch[2].toLowerCase()}` : null;
|
||||
|
||||
// Determine category from weight and name (not full text to avoid nav pollution)
|
||||
let category = '';
|
||||
|
||||
// Check explicit category patterns in NAME ONLY (not full text)
|
||||
// This avoids false positives from navigation elements
|
||||
const categoryPatterns = [
|
||||
{ pattern: /vape|cart(?:ridge)?|pen|pod/i, category: 'vape' },
|
||||
{ pattern: /edible|gummy|gummies|chocolate|candy/i, category: 'edible' },
|
||||
{ pattern: /concentrate|dab|wax|shatter|rosin|resin/i, category: 'concentrate' },
|
||||
{ pattern: /pre.?roll|joint|blunt/i, category: 'pre-roll' },
|
||||
{ pattern: /topical|balm|cream|lotion/i, category: 'topical' },
|
||||
{ pattern: /tincture/i, category: 'tincture' },
|
||||
];
|
||||
for (const { pattern, category: cat } of categoryPatterns) {
|
||||
if (pattern.test(name)) {
|
||||
category = cat;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// If no explicit category found, infer from weight
|
||||
if (!category && weight) {
|
||||
const weightLower = weight.toLowerCase();
|
||||
if (weightLower.includes('g') && !weightLower.includes('mg')) {
|
||||
// Gram weights (3.5g, 1g, 7g, etc.) are typically flower
|
||||
category = 'flower';
|
||||
} else if (weightLower.includes('mg')) {
|
||||
// Milligram weights are typically edibles
|
||||
category = 'edible';
|
||||
}
|
||||
}
|
||||
|
||||
// Get strain type
|
||||
const strainTypes = ['indica', 'sativa', 'hybrid'];
|
||||
let subcategory = '';
|
||||
for (const strain of strainTypes) {
|
||||
if (textLower.includes(strain)) {
|
||||
subcategory = strain;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Check stock status
|
||||
const inStock = !textLower.includes('out of stock') && !textLower.includes('sold out');
|
||||
|
||||
results.push({
|
||||
productId,
|
||||
name,
|
||||
brand,
|
||||
category,
|
||||
subcategory,
|
||||
thcPercent,
|
||||
cbdPercent,
|
||||
price,
|
||||
priceUnit: weight || '',
|
||||
imageUrl,
|
||||
inStock,
|
||||
weight,
|
||||
});
|
||||
} catch (err) {
|
||||
// Skip products that fail extraction
|
||||
document.querySelectorAll('.brands-page__list a[href*="/brand/"]').forEach((a: Element) => {
|
||||
const href = a.getAttribute('href') || '';
|
||||
const name = a.textContent?.trim() || '';
|
||||
if (name && href) {
|
||||
results.push({ name, href });
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return results;
|
||||
});
|
||||
|
||||
console.log(`[Treez Client] Extracted ${products.length} products`);
|
||||
return products;
|
||||
console.log(`[Treez Client] Found ${brands.length} brands`);
|
||||
return brands;
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// DATA NORMALIZATION
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* Parse raw Treez product into normalized structure
|
||||
*/
|
||||
export function normalizeProduct(raw: TreezProductRaw): TreezProduct {
|
||||
const productData = raw.productData || ({} as any);
|
||||
const pricing = productData.pricing || {};
|
||||
const labResults = productData.labResults || [];
|
||||
|
||||
// Extract THC/CBD from lab results
|
||||
let thcPercent: number | null = null;
|
||||
let cbdPercent: number | null = null;
|
||||
|
||||
for (const result of labResults) {
|
||||
const cannabinoid = (result.cannabinoid || '').toLowerCase();
|
||||
if (cannabinoid.includes('thc') && result.value != null) {
|
||||
thcPercent = result.value;
|
||||
} else if (cannabinoid.includes('cbd') && result.value != null) {
|
||||
cbdPercent = result.value;
|
||||
}
|
||||
}
|
||||
|
||||
// Extract strain type from subtype
|
||||
let strainType: string | null = null;
|
||||
const subtypeLower = (raw.subtype || '').toLowerCase();
|
||||
|
||||
if (subtypeLower.includes('indica')) {
|
||||
strainType = 'Indica';
|
||||
} else if (subtypeLower.includes('sativa')) {
|
||||
strainType = 'Sativa';
|
||||
} else if (subtypeLower.includes('hybrid')) {
|
||||
strainType = 'Hybrid';
|
||||
}
|
||||
|
||||
// Extract images
|
||||
const images = (productData.images || []).map((img: any) => img.url).filter(Boolean);
|
||||
const imageUrl = images[0] || null;
|
||||
|
||||
// Extract inventory by location
|
||||
const inventoryByLocation = (productData.inventory || []).map((inv: any) => ({
|
||||
locationId: inv.locationId,
|
||||
locationName: inv.locationName,
|
||||
availableUnits: inv.availableUnits || 0,
|
||||
}));
|
||||
|
||||
return {
|
||||
id: raw.id,
|
||||
name: raw.menuTitle || raw.name,
|
||||
brand: raw.brand,
|
||||
slug: raw.slug,
|
||||
category: raw.category,
|
||||
subtype: raw.subtype,
|
||||
|
||||
availableUnits: raw.availableUnits || 0,
|
||||
inStock: (raw.availableUnits || 0) > 0,
|
||||
inventoryByLocation,
|
||||
|
||||
price: pricing.priceSell || raw.customMinPrice || 0,
|
||||
priceMin: raw.customMinPrice || 0,
|
||||
priceMax: raw.customMaxPrice || 0,
|
||||
discountedPrice:
|
||||
pricing.discountedPrice !== pricing.priceSell ? pricing.discountedPrice : null,
|
||||
discountPercent: pricing.discountPercent || 0,
|
||||
|
||||
thcPercent,
|
||||
cbdPercent,
|
||||
strainType,
|
||||
effects: raw.effects || [],
|
||||
flavors: raw.flavors || [],
|
||||
isCannabis: productData.isCannabis ?? true,
|
||||
|
||||
imageUrl,
|
||||
images,
|
||||
|
||||
isActive: raw.isActive,
|
||||
customerType: raw.customCustomerType,
|
||||
|
||||
lastUpdated: productData.lastUpdateDate || raw.customInjectionDate,
|
||||
createdAt: productData.createdDate || raw.customInjectionDate,
|
||||
|
||||
raw,
|
||||
};
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// URL HELPERS
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* Build menu URL for a Treez store
|
||||
*/
|
||||
export function buildMenuUrl(
|
||||
storeId: string,
|
||||
customerType: 'ADULT' | 'MEDICAL' = 'ADULT'
|
||||
): string {
|
||||
return `https://${storeId}.treez.io/onlinemenu/shop?customerType=${customerType}`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch all products from a store
|
||||
* Main entry point for product discovery
|
||||
* Build custom domain menu URL
|
||||
*/
|
||||
export async function fetchAllProducts(storeId: string): Promise<{
|
||||
products: TreezProductRaw[];
|
||||
storeInfo: TreezStoreInfo;
|
||||
scrollCount: number;
|
||||
}> {
|
||||
if (!currentSession) {
|
||||
throw new Error('[Treez Client] No active session - call startSession() first');
|
||||
export function buildCustomDomainUrl(domain: string, path: string = '/shop'): string {
|
||||
const cleanDomain = domain.replace(/^https?:\/\//, '').replace(/\/$/, '');
|
||||
return `https://${cleanDomain}${path}`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract store ID from a Treez URL
|
||||
*/
|
||||
export function extractStoreId(url: string): string | null {
|
||||
// Pattern: {storeId}.treez.io
|
||||
const treezMatch = url.match(/https?:\/\/([^.]+)\.treez\.io/);
|
||||
if (treezMatch) {
|
||||
return treezMatch[1];
|
||||
}
|
||||
|
||||
const { page } = currentSession;
|
||||
// Custom domains need store ID from config
|
||||
return null;
|
||||
}
|
||||
|
||||
// Navigate to menu
|
||||
await navigateToMenu(storeId);
|
||||
// ============================================================
|
||||
// AGE GATE HANDLING
|
||||
// ============================================================
|
||||
|
||||
// Get page title for store info
|
||||
const pageTitle = await page.title();
|
||||
const storeInfo: TreezStoreInfo = {
|
||||
storeId,
|
||||
name: pageTitle.split('|')[1]?.trim() || pageTitle,
|
||||
url: buildMenuUrl(storeId),
|
||||
};
|
||||
|
||||
// Scroll to load all products
|
||||
const scrollCount = await scrollToLoadAll(page);
|
||||
|
||||
// Extract products
|
||||
const products = await extractProducts(page);
|
||||
|
||||
// Record success if we got products
|
||||
if (crawlRotator && products.length > 0) {
|
||||
await crawlRotator.recordSuccess();
|
||||
/**
|
||||
* Try to bypass age gate popup
|
||||
*/
|
||||
async function tryBypassAgeGate(page: Page): Promise<boolean> {
|
||||
try {
|
||||
const ageGate = await page.$('[data-testid="age-gate-modal"]');
|
||||
if (ageGate) {
|
||||
console.log('[Treez Client] Age gate detected, bypassing...');
|
||||
const btn = await page.$('[data-testid="age-gate-submit-button"]');
|
||||
if (btn) {
|
||||
await btn.click();
|
||||
await sleep(2000);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
} catch {
|
||||
// No age gate or error bypassing
|
||||
}
|
||||
|
||||
return { products, storeInfo, scrollCount };
|
||||
return false;
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
@@ -569,3 +634,9 @@ export async function fetchAllProducts(storeId: string): Promise<{
|
||||
function sleep(ms: number): Promise<void> {
|
||||
return new Promise((resolve) => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// LEGACY EXPORTS (for backward compatibility)
|
||||
// ============================================================
|
||||
|
||||
export { tryBypassAgeGate as bypassAgeGate };
|
||||
|
||||
@@ -1,50 +1,124 @@
|
||||
/**
|
||||
* Treez Platform Module
|
||||
* ============================================================
|
||||
* TREEZ PLATFORM MODULE
|
||||
* ============================================================
|
||||
*
|
||||
* Single export point for all Treez communication.
|
||||
* All Treez workers MUST import from this module.
|
||||
*
|
||||
* ARCHITECTURE:
|
||||
* Unlike Dutchie (GraphQL API) and Jane (Algolia API), Treez uses
|
||||
* a client-side rendered React app with Elasticsearch backend.
|
||||
* Direct API access is blocked by Cloudflare + headless detection.
|
||||
*
|
||||
* SOLUTION:
|
||||
* We use Puppeteer with Stealth plugin + CDP (Chrome DevTools Protocol)
|
||||
* to intercept the Elasticsearch API responses as the page loads.
|
||||
*
|
||||
* KEY COMPONENTS:
|
||||
* - client.ts: Low-level browser session and CDP interception
|
||||
* - queries.ts: High-level operations with automatic session management
|
||||
* - types.ts: TypeScript interfaces for Treez data structures
|
||||
*
|
||||
* USAGE EXAMPLE:
|
||||
* ```typescript
|
||||
* import { fetchProductsByStoreId } from '../platforms/treez';
|
||||
*
|
||||
* const result = await fetchProductsByStoreId('best');
|
||||
* console.log(`Found ${result.totalCaptured} products`);
|
||||
* console.log(`First product: ${result.normalized[0].name}`);
|
||||
* ```
|
||||
*
|
||||
* ============================================================
|
||||
*/
|
||||
|
||||
// ============================================================
|
||||
// HIGH-LEVEL OPERATIONS (Recommended for most use cases)
|
||||
// ============================================================
|
||||
|
||||
export {
|
||||
// Session Management
|
||||
// Product fetching with automatic session management
|
||||
fetchProductsByStoreId,
|
||||
fetchProductsFromUrl,
|
||||
|
||||
// Brand discovery
|
||||
fetchBrandsFromStore,
|
||||
|
||||
// Store validation
|
||||
validateStoreId,
|
||||
extractStoreIdFromUrl,
|
||||
|
||||
// URL building
|
||||
getMenuUrl,
|
||||
getCustomDomainUrl,
|
||||
|
||||
// Result types
|
||||
type FetchProductsResult,
|
||||
type FetchBrandsResult,
|
||||
} from './queries';
|
||||
|
||||
// ============================================================
|
||||
// LOW-LEVEL CLIENT (For advanced use cases)
|
||||
// ============================================================
|
||||
|
||||
export {
|
||||
// Session management
|
||||
startSession,
|
||||
endSession,
|
||||
getCurrentSession,
|
||||
|
||||
// Proxy/Rotation
|
||||
// Proxy/rotation integration
|
||||
setCrawlRotator,
|
||||
getCrawlRotator,
|
||||
|
||||
// Core Operations
|
||||
navigateToMenu,
|
||||
scrollToLoadAll,
|
||||
extractProducts,
|
||||
// Core operations (require active session)
|
||||
fetchAllProducts,
|
||||
bypassAgeGate,
|
||||
fetchAllBrands,
|
||||
fetchBrandProducts,
|
||||
fetchCategoryProducts,
|
||||
|
||||
// URL Building
|
||||
// Data normalization
|
||||
normalizeProduct,
|
||||
|
||||
// URL helpers
|
||||
buildMenuUrl,
|
||||
buildCustomDomainUrl,
|
||||
extractStoreId,
|
||||
|
||||
// Age gate
|
||||
bypassAgeGate,
|
||||
|
||||
// Configuration
|
||||
TREEZ_CONFIG,
|
||||
|
||||
// Types
|
||||
type TreezSession,
|
||||
type TreezStoreInfo,
|
||||
type TreezProductRaw,
|
||||
} from './client';
|
||||
|
||||
// High-level Query Functions
|
||||
export {
|
||||
fetchProductsByStoreId,
|
||||
fetchProductsFromUrl,
|
||||
extractStoreIdFromUrl,
|
||||
validateStoreId,
|
||||
getMenuUrl,
|
||||
// ============================================================
|
||||
// TYPES
|
||||
// ============================================================
|
||||
|
||||
// Types
|
||||
type FetchProductsResult,
|
||||
} from './queries';
|
||||
export type {
|
||||
// Raw API response types
|
||||
TreezProductRaw,
|
||||
TreezProductDataRaw,
|
||||
TreezDiscountRaw,
|
||||
TreezImageRaw,
|
||||
TreezInventoryRaw,
|
||||
TreezLabResultRaw,
|
||||
TreezPricingRaw,
|
||||
TreezProductGroupRaw,
|
||||
|
||||
// Re-export CrawlRotator types from canonical location
|
||||
export type { CrawlRotator, Proxy, ProxyStats } from '../../services/crawl-rotator';
|
||||
// Normalized types
|
||||
TreezProduct,
|
||||
TreezStore,
|
||||
|
||||
// Session types
|
||||
TreezSession,
|
||||
TreezConfig,
|
||||
|
||||
// Response types
|
||||
TreezESResponse,
|
||||
CapturedResponse,
|
||||
} from './types';
|
||||
|
||||
// Re-export CrawlRotator types for convenience
|
||||
export type { CrawlRotator, Proxy, ProxyStats, BrowserFingerprint } from '../../services/crawl-rotator';
|
||||
|
||||
@@ -1,71 +1,212 @@
|
||||
/**
|
||||
* Treez High-Level Query Functions
|
||||
* ============================================================
|
||||
* TREEZ HIGH-LEVEL QUERY FUNCTIONS
|
||||
* ============================================================
|
||||
*
|
||||
* Wraps the low-level client methods with business logic
|
||||
* for common operations like product fetching.
|
||||
*
|
||||
* Use these functions for most Treez operations - they handle
|
||||
* session management automatically.
|
||||
*
|
||||
* ============================================================
|
||||
*/
|
||||
|
||||
import {
|
||||
startSession,
|
||||
endSession,
|
||||
fetchAllProducts,
|
||||
fetchAllBrands,
|
||||
normalizeProduct,
|
||||
buildMenuUrl,
|
||||
TreezProductRaw,
|
||||
TreezStoreInfo,
|
||||
buildCustomDomainUrl,
|
||||
extractStoreId,
|
||||
setCrawlRotator,
|
||||
} from './client';
|
||||
|
||||
import type { TreezProductRaw, TreezProduct } from './types';
|
||||
import type { CrawlRotator } from '../../services/crawl-rotator';
|
||||
|
||||
// ============================================================
|
||||
// RESULT TYPES
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* Result from a product fetch operation
|
||||
*/
|
||||
export interface FetchProductsResult {
|
||||
/** Raw products from Elasticsearch API */
|
||||
products: TreezProductRaw[];
|
||||
|
||||
/** Normalized products ready for database */
|
||||
normalized: TreezProduct[];
|
||||
|
||||
/** Total unique products captured */
|
||||
totalCaptured: number;
|
||||
|
||||
/** Store ID extracted from URL */
|
||||
storeId: string | null;
|
||||
|
||||
/** Original URL fetched */
|
||||
sourceUrl: string;
|
||||
|
||||
/** Timestamp when fetch completed */
|
||||
fetchedAt: Date;
|
||||
}
|
||||
|
||||
/**
|
||||
* Result from a brand fetch operation
|
||||
*/
|
||||
export interface FetchBrandsResult {
|
||||
/** List of brands with names and URLs */
|
||||
brands: Array<{ name: string; href: string }>;
|
||||
|
||||
/** Total brands found */
|
||||
totalBrands: number;
|
||||
|
||||
/** Store URL used */
|
||||
sourceUrl: string;
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// PRODUCT OPERATIONS
|
||||
// ============================================================
|
||||
|
||||
export interface FetchProductsResult {
|
||||
store: TreezStoreInfo;
|
||||
products: TreezProductRaw[];
|
||||
totalCaptured: number;
|
||||
scrollCount: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch all products from a Treez store
|
||||
* Fetch all products from a Treez store by store ID
|
||||
*
|
||||
* @param storeId - Treez store ID (slug like "best")
|
||||
* @returns Products and store data captured from the page
|
||||
* This is the main entry point for product discovery.
|
||||
* Handles session management, CDP interception, and normalization.
|
||||
*
|
||||
* @param storeId - Treez store slug (e.g., "best")
|
||||
* @param rotator - Optional CrawlRotator for proxy/fingerprint management
|
||||
* @returns Products and metadata
|
||||
*
|
||||
* @example
|
||||
* ```typescript
|
||||
* const result = await fetchProductsByStoreId('best');
|
||||
* console.log(`Found ${result.totalCaptured} products`);
|
||||
* ```
|
||||
*/
|
||||
export async function fetchProductsByStoreId(storeId: string): Promise<FetchProductsResult> {
|
||||
try {
|
||||
await startSession(storeId);
|
||||
export async function fetchProductsByStoreId(
|
||||
storeId: string,
|
||||
rotator?: CrawlRotator
|
||||
): Promise<FetchProductsResult> {
|
||||
const menuUrl = buildMenuUrl(storeId);
|
||||
|
||||
const { products, storeInfo, scrollCount } = await fetchAllProducts(storeId);
|
||||
try {
|
||||
// Set rotator if provided
|
||||
if (rotator) {
|
||||
setCrawlRotator(rotator);
|
||||
}
|
||||
|
||||
// Start session and fetch
|
||||
await startSession(storeId);
|
||||
const products = await fetchAllProducts(menuUrl);
|
||||
|
||||
// Normalize all products
|
||||
const normalized = products.map(normalizeProduct);
|
||||
|
||||
return {
|
||||
store: storeInfo,
|
||||
products,
|
||||
normalized,
|
||||
totalCaptured: products.length,
|
||||
scrollCount,
|
||||
storeId,
|
||||
sourceUrl: menuUrl,
|
||||
fetchedAt: new Date(),
|
||||
};
|
||||
} finally {
|
||||
await endSession();
|
||||
setCrawlRotator(null);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch products from a Treez menu URL
|
||||
* Extracts store ID from URL and fetches products
|
||||
* Fetch all products from a custom domain URL
|
||||
*
|
||||
* @param menuUrl - Full Treez menu URL
|
||||
* @returns Products and store data
|
||||
* Use this for stores with custom domains like shop.bestdispensary.com
|
||||
* instead of best.treez.io
|
||||
*
|
||||
* @param menuUrl - Full URL to the store menu
|
||||
* @param rotator - Optional CrawlRotator for proxy/fingerprint management
|
||||
* @returns Products and metadata
|
||||
*
|
||||
* @example
|
||||
* ```typescript
|
||||
* const result = await fetchProductsFromUrl('https://shop.bestdispensary.com/shop');
|
||||
* ```
|
||||
*/
|
||||
export async function fetchProductsFromUrl(menuUrl: string): Promise<FetchProductsResult> {
|
||||
const storeId = extractStoreIdFromUrl(menuUrl);
|
||||
if (!storeId) {
|
||||
throw new Error(`Could not extract store ID from URL: ${menuUrl}`);
|
||||
}
|
||||
export async function fetchProductsFromUrl(
|
||||
menuUrl: string,
|
||||
rotator?: CrawlRotator
|
||||
): Promise<FetchProductsResult> {
|
||||
const storeId = extractStoreId(menuUrl);
|
||||
|
||||
return fetchProductsByStoreId(storeId);
|
||||
try {
|
||||
if (rotator) {
|
||||
setCrawlRotator(rotator);
|
||||
}
|
||||
|
||||
await startSession(storeId || undefined);
|
||||
const products = await fetchAllProducts(menuUrl);
|
||||
const normalized = products.map(normalizeProduct);
|
||||
|
||||
return {
|
||||
products,
|
||||
normalized,
|
||||
totalCaptured: products.length,
|
||||
storeId,
|
||||
sourceUrl: menuUrl,
|
||||
fetchedAt: new Date(),
|
||||
};
|
||||
} finally {
|
||||
await endSession();
|
||||
setCrawlRotator(null);
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// STORE OPERATIONS
|
||||
// BRAND OPERATIONS
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* Fetch all brands from a Treez store
|
||||
*
|
||||
* @param storeUrl - Base store URL (e.g., https://shop.bestdispensary.com)
|
||||
* @param rotator - Optional CrawlRotator
|
||||
* @returns List of brands with their page URLs
|
||||
*
|
||||
* @example
|
||||
* ```typescript
|
||||
* const result = await fetchBrandsFromStore('https://shop.bestdispensary.com');
|
||||
* result.brands.forEach(b => console.log(b.name));
|
||||
* ```
|
||||
*/
|
||||
export async function fetchBrandsFromStore(
|
||||
storeUrl: string,
|
||||
rotator?: CrawlRotator
|
||||
): Promise<FetchBrandsResult> {
|
||||
try {
|
||||
if (rotator) {
|
||||
setCrawlRotator(rotator);
|
||||
}
|
||||
|
||||
await startSession();
|
||||
const brands = await fetchAllBrands(storeUrl);
|
||||
|
||||
return {
|
||||
brands,
|
||||
totalBrands: brands.length,
|
||||
sourceUrl: storeUrl,
|
||||
};
|
||||
} finally {
|
||||
await endSession();
|
||||
setCrawlRotator(null);
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// STORE VALIDATION
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
@@ -73,26 +214,20 @@ export async function fetchProductsFromUrl(menuUrl: string): Promise<FetchProduc
|
||||
*
|
||||
* Supports formats:
|
||||
* - https://best.treez.io/onlinemenu/
|
||||
* - https://shop.bestdispensary.com/ (resolves to best.treez.io)
|
||||
* - Custom domains return null (need to follow redirect)
|
||||
*
|
||||
* @param url - Treez menu URL
|
||||
* @returns Store ID or null if not found
|
||||
*/
|
||||
export function extractStoreIdFromUrl(url: string): string | null {
|
||||
// Pattern 1: {storeId}.treez.io
|
||||
const treezMatch = url.match(/https?:\/\/([^.]+)\.treez\.io/i);
|
||||
if (treezMatch) {
|
||||
return treezMatch[1];
|
||||
}
|
||||
|
||||
// Pattern 2: Custom domain - would need to follow redirect
|
||||
// For now, return null and let the caller handle it
|
||||
return null;
|
||||
return extractStoreId(url);
|
||||
}
|
||||
|
||||
/**
|
||||
* Validate that a store ID exists and is accessible
|
||||
*
|
||||
* Attempts to load the store page and checks for 404
|
||||
*
|
||||
* @param storeId - Treez store ID
|
||||
* @returns True if store is accessible
|
||||
*/
|
||||
@@ -100,7 +235,11 @@ export async function validateStoreId(storeId: string): Promise<boolean> {
|
||||
try {
|
||||
await startSession(storeId);
|
||||
|
||||
const { page } = (await import('./client')).getCurrentSession()!;
|
||||
const { getCurrentSession } = await import('./client');
|
||||
const session = getCurrentSession();
|
||||
if (!session) return false;
|
||||
|
||||
const { page } = session;
|
||||
const url = buildMenuUrl(storeId);
|
||||
|
||||
await page.goto(url, {
|
||||
@@ -121,12 +260,27 @@ export async function validateStoreId(storeId: string): Promise<boolean> {
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// UTILITY FUNCTIONS
|
||||
// URL HELPERS
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* Get the direct Treez menu URL for a store
|
||||
*
|
||||
* @param storeId - Store slug (e.g., "best")
|
||||
* @param customerType - ADULT (recreational) or MEDICAL
|
||||
* @returns Full menu URL
|
||||
*/
|
||||
export function getMenuUrl(storeId: string, customerType: 'ADULT' | 'MEDICAL' = 'ADULT'): string {
|
||||
return buildMenuUrl(storeId, customerType);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get menu URL for a custom domain
|
||||
*
|
||||
* @param domain - Custom domain (e.g., shop.bestdispensary.com)
|
||||
* @param path - Path to menu (default: /shop)
|
||||
* @returns Full menu URL
|
||||
*/
|
||||
export function getCustomDomainUrl(domain: string, path: string = '/shop'): string {
|
||||
return buildCustomDomainUrl(domain, path);
|
||||
}
|
||||
|
||||
285
backend/src/platforms/treez/types.ts
Normal file
285
backend/src/platforms/treez/types.ts
Normal file
@@ -0,0 +1,285 @@
|
||||
/**
|
||||
* ============================================================
|
||||
* TREEZ PLATFORM TYPES
|
||||
* ============================================================
|
||||
*
|
||||
* TypeScript interfaces for Treez platform data structures.
|
||||
* Based on Elasticsearch API responses captured via CDP interception.
|
||||
*
|
||||
* ============================================================
|
||||
*/
|
||||
|
||||
// ============================================================
|
||||
// RAW API RESPONSE TYPES
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* Raw product data from Treez Elasticsearch API
|
||||
*/
|
||||
export interface TreezProductRaw {
|
||||
id: string;
|
||||
name: string;
|
||||
menuTitle: string;
|
||||
brand: string;
|
||||
category: string;
|
||||
subtype: string;
|
||||
slug: string;
|
||||
oldSlug?: string;
|
||||
status: string;
|
||||
|
||||
// Inventory
|
||||
availableUnits: number;
|
||||
|
||||
// Pricing
|
||||
customMinPrice: number;
|
||||
customMaxPrice: number;
|
||||
customOnSaleValue?: number;
|
||||
|
||||
// Visibility
|
||||
isAboveThreshold: boolean;
|
||||
isActive: boolean;
|
||||
isHideFromMenu: boolean;
|
||||
customCustomerType: 'ADULT' | 'MEDICAL' | 'BOTH';
|
||||
|
||||
// Attributes
|
||||
effects: string[];
|
||||
flavors: string[];
|
||||
generals: string[];
|
||||
ingredients: string[];
|
||||
internalTags: string[];
|
||||
|
||||
// Inventory IDs
|
||||
customInventoryIds: string[];
|
||||
customInjectionDate: string;
|
||||
|
||||
// Extended product data
|
||||
productData: TreezProductDataRaw;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extended product data from productData field
|
||||
*/
|
||||
export interface TreezProductDataRaw {
|
||||
barcodes: string[];
|
||||
discounts: TreezDiscountRaw[];
|
||||
images: TreezImageRaw[];
|
||||
inventory: TreezInventoryRaw[];
|
||||
isCannabis: boolean;
|
||||
labResults: TreezLabResultRaw[];
|
||||
pricing: TreezPricingRaw;
|
||||
productGroups: TreezProductGroupRaw[];
|
||||
lastUpdateDate: string;
|
||||
createdDate: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Discount information
|
||||
*/
|
||||
export interface TreezDiscountRaw {
|
||||
discountId: string;
|
||||
discountTitle: string;
|
||||
discountAffinity: string;
|
||||
discountAmount: number;
|
||||
discountMethod: 'PERCENT' | 'FLAT';
|
||||
discountStackable: string;
|
||||
discountConditions: Array<{ type: string; value: string }>;
|
||||
discountProductGroups: string[];
|
||||
discountProductGroupsRequired: string[];
|
||||
}
|
||||
|
||||
/**
|
||||
* Product image
|
||||
*/
|
||||
export interface TreezImageRaw {
|
||||
url: string;
|
||||
isPrimary?: boolean;
|
||||
}
|
||||
|
||||
/**
|
||||
* Location-level inventory
|
||||
*/
|
||||
export interface TreezInventoryRaw {
|
||||
locationId: string;
|
||||
locationName: string;
|
||||
customerType: string;
|
||||
availableUnits: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Lab test results
|
||||
*/
|
||||
export interface TreezLabResultRaw {
|
||||
cannabinoid?: string;
|
||||
value?: number;
|
||||
unit?: string;
|
||||
testDate?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Pricing information
|
||||
*/
|
||||
export interface TreezPricingRaw {
|
||||
priceType: string;
|
||||
priceSell: number;
|
||||
postTaxPriceSell: number;
|
||||
discountedPrice: number;
|
||||
discountAmount: number;
|
||||
discountPercent: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Product group membership
|
||||
*/
|
||||
export interface TreezProductGroupRaw {
|
||||
id: string;
|
||||
name: string;
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// NORMALIZED TYPES (for use in handlers)
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* Normalized Treez product for internal use
|
||||
*/
|
||||
export interface TreezProduct {
|
||||
// Identity
|
||||
id: string;
|
||||
name: string;
|
||||
brand: string;
|
||||
slug: string;
|
||||
|
||||
// Classification
|
||||
category: string;
|
||||
subtype: string;
|
||||
|
||||
// Inventory
|
||||
availableUnits: number;
|
||||
inStock: boolean;
|
||||
inventoryByLocation: Array<{
|
||||
locationId: string;
|
||||
locationName: string;
|
||||
availableUnits: number;
|
||||
}>;
|
||||
|
||||
// Pricing
|
||||
price: number;
|
||||
priceMin: number;
|
||||
priceMax: number;
|
||||
discountedPrice: number | null;
|
||||
discountPercent: number;
|
||||
|
||||
// Cannabinoids
|
||||
thcPercent: number | null;
|
||||
cbdPercent: number | null;
|
||||
|
||||
// Attributes
|
||||
strainType: string | null; // Indica, Sativa, Hybrid
|
||||
effects: string[];
|
||||
flavors: string[];
|
||||
isCannabis: boolean;
|
||||
|
||||
// Media
|
||||
imageUrl: string | null;
|
||||
images: string[];
|
||||
|
||||
// Status
|
||||
isActive: boolean;
|
||||
customerType: 'ADULT' | 'MEDICAL' | 'BOTH';
|
||||
|
||||
// Timestamps
|
||||
lastUpdated: string;
|
||||
createdAt: string;
|
||||
|
||||
// Full raw data preserved
|
||||
raw: TreezProductRaw;
|
||||
}
|
||||
|
||||
/**
|
||||
* Store/dispensary information from Treez
|
||||
*/
|
||||
export interface TreezStore {
|
||||
storeId: string;
|
||||
name: string;
|
||||
address?: string;
|
||||
city?: string;
|
||||
state?: string;
|
||||
zip?: string;
|
||||
lat?: number;
|
||||
lng?: number;
|
||||
phone?: string;
|
||||
isRecreational: boolean;
|
||||
isMedical: boolean;
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// SESSION TYPES
|
||||
// ============================================================
|
||||
|
||||
import type { Browser, Page, CDPSession } from 'puppeteer';
|
||||
import type { BrowserFingerprint } from '../../services/crawl-rotator';
|
||||
|
||||
/**
|
||||
* Active Treez browser session
|
||||
*/
|
||||
export interface TreezSession {
|
||||
sessionId: string;
|
||||
browser: Browser;
|
||||
page: Page;
|
||||
cdpClient: CDPSession;
|
||||
fingerprint: BrowserFingerprint;
|
||||
proxyUrl: string | null;
|
||||
startedAt: Date;
|
||||
storeId?: string;
|
||||
capturedProducts: TreezProductRaw[];
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// API CONFIGURATION
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* Treez API endpoints and configuration
|
||||
*/
|
||||
export interface TreezConfig {
|
||||
// Elasticsearch API (main product data)
|
||||
esEndpoint: string;
|
||||
esApiKey: string;
|
||||
|
||||
// Treez Headless API (discounts, etc.)
|
||||
headlessApiBase: string;
|
||||
clientId: string;
|
||||
clientSecret: string;
|
||||
|
||||
// Timeouts
|
||||
navigationTimeout: number;
|
||||
scrollDelay: number;
|
||||
maxScrollAttempts: number;
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// RESPONSE TYPES
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* Elasticsearch API response structure
|
||||
*/
|
||||
export interface TreezESResponse {
|
||||
hits: {
|
||||
total: { value: number };
|
||||
hits: Array<{
|
||||
_source: TreezProductRaw;
|
||||
}>;
|
||||
};
|
||||
aggregations?: any;
|
||||
}
|
||||
|
||||
/**
|
||||
* Captured API response
|
||||
*/
|
||||
export interface CapturedResponse {
|
||||
type: 'products' | 'discounts' | 'other';
|
||||
url: string;
|
||||
data: any;
|
||||
timestamp: Date;
|
||||
}
|
||||
@@ -1,15 +1,27 @@
|
||||
/**
|
||||
* Treez Product Discovery Handler
|
||||
* ============================================================
|
||||
* TREEZ PRODUCT DISCOVERY HANDLER
|
||||
* ============================================================
|
||||
*
|
||||
* Fetches all products from a Treez store via Puppeteer + DOM scraping.
|
||||
* Fetches all products from a Treez store via Puppeteer + CDP interception.
|
||||
*
|
||||
* Flow:
|
||||
* HOW IT WORKS:
|
||||
* Treez uses Cloudflare + headless detection on their Elasticsearch API.
|
||||
* We bypass this by:
|
||||
* 1. Using Puppeteer with Stealth plugin
|
||||
* 2. Intercepting ES API responses via CDP (Chrome DevTools Protocol)
|
||||
* 3. Scrolling to trigger all product loads
|
||||
*
|
||||
* FLOW:
|
||||
* 1. Load dispensary with platform_dispensary_id (store slug)
|
||||
* 2. Navigate to menu URL, bypass age gate
|
||||
* 3. Scroll to load all products (infinite scroll)
|
||||
* 4. Extract products from DOM
|
||||
* 5. Save raw payload to filesystem
|
||||
* 6. Queue product_refresh task for normalization
|
||||
* 2. Start Puppeteer session with Stealth plugin
|
||||
* 3. Navigate to menu, bypass age gate if present
|
||||
* 4. Scroll to load all products (triggers ES API calls)
|
||||
* 5. CDP intercepts ES responses and captures product data
|
||||
* 6. Save raw payload to filesystem
|
||||
* 7. Queue product_refresh task for normalization
|
||||
*
|
||||
* ============================================================
|
||||
*/
|
||||
|
||||
import { TaskContext, TaskResult } from '../task-worker';
|
||||
@@ -90,17 +102,16 @@ export async function handleProductDiscoveryTreez(ctx: TaskContext): Promise<Tas
|
||||
console.log(`[TreezProductDiscovery] Captured ${result.products.length} products`);
|
||||
|
||||
// Build payload for storage
|
||||
// result.products = raw Elasticsearch data
|
||||
// result.normalized = parsed/normalized products
|
||||
const rawPayload = {
|
||||
products: result.products, // Store the scraped product data
|
||||
store: {
|
||||
storeId: result.store.storeId,
|
||||
name: result.store.name,
|
||||
url: result.store.url,
|
||||
},
|
||||
capturedAt: new Date().toISOString(),
|
||||
products: result.products, // Raw ES product data
|
||||
normalized: result.normalized, // Parsed product data
|
||||
storeId: result.storeId,
|
||||
sourceUrl: result.sourceUrl,
|
||||
capturedAt: result.fetchedAt.toISOString(),
|
||||
platform: 'treez',
|
||||
dispensaryId,
|
||||
scrollCount: result.scrollCount,
|
||||
};
|
||||
|
||||
// Save raw payload to filesystem (platform = 'treez')
|
||||
@@ -140,14 +151,11 @@ export async function handleProductDiscoveryTreez(ctx: TaskContext): Promise<Tas
|
||||
|
||||
return {
|
||||
success: true,
|
||||
productCount: result.products.length,
|
||||
productCount: result.totalCaptured,
|
||||
payloadId,
|
||||
payloadSizeKB: Math.round(sizeBytes / 1024),
|
||||
storeInfo: {
|
||||
storeId: result.store.storeId,
|
||||
name: result.store.name,
|
||||
},
|
||||
scrollCount: result.scrollCount,
|
||||
storeId: result.storeId,
|
||||
sourceUrl: result.sourceUrl,
|
||||
queuedProductRefresh: true,
|
||||
};
|
||||
} catch (error: unknown) {
|
||||
|
||||
Reference in New Issue
Block a user