feat(treez): CDP interception client for Elasticsearch API capture
Rewrites the Treez platform client to use CDP (Chrome DevTools Protocol) interception instead of DOM scraping.

Key changes:
- Uses Puppeteer Stealth plugin to bypass headless detection
- Intercepts Elasticsearch API responses via CDP Network.responseReceived
- Captures full product data including inventory levels (availableUnits)
- Adds comprehensive TypeScript types for all Treez data structures
- Updates queries.ts with automatic session management
- Fixes product-discovery-treez handler for new API shape

Tested with Best Dispensary: 142 products across 10 categories captured with inventory data, pricing, and lab results.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -1,107 +1,172 @@
|
|||||||
/**
|
/**
|
||||||
* Test script for Treez platform client
|
* ============================================================
|
||||||
* Tests the new Treez integration with Best Dispensary
|
* TREEZ CLIENT TEST SCRIPT
|
||||||
|
* ============================================================
|
||||||
|
*
|
||||||
|
* Tests the Treez CDP interception client using Best Dispensary.
|
||||||
|
*
|
||||||
|
* This verifies:
|
||||||
|
* - Stealth plugin bypasses headless detection
|
||||||
|
* - CDP intercepts Elasticsearch API responses
|
||||||
|
* - Products are captured and normalized correctly
|
||||||
|
* - Inventory data is available
|
||||||
*
|
*
|
||||||
* Usage: npx ts-node scripts/test-treez-client.ts
|
* Usage: npx ts-node scripts/test-treez-client.ts
|
||||||
|
*
|
||||||
|
* ============================================================
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import {
|
import { fetchProductsFromUrl } from '../src/platforms/treez';
|
||||||
fetchProductsByStoreId,
|
|
||||||
} from '../src/platforms/treez';
|
|
||||||
import { TreezNormalizer } from '../src/hydration/normalizers/treez';
|
|
||||||
|
|
||||||
const TEST_STORE_ID = 'best';
|
const TEST_URL = 'https://shop.bestdispensary.com/shop';
|
||||||
|
|
||||||
async function main() {
|
async function main() {
|
||||||
console.log('='.repeat(60));
|
console.log('='.repeat(60));
|
||||||
console.log('Treez Platform Client Test');
|
console.log('TREEZ CLIENT TEST - CDP INTERCEPTION');
|
||||||
console.log('='.repeat(60));
|
console.log('='.repeat(60));
|
||||||
console.log(`Test Store: ${TEST_STORE_ID}`);
|
console.log(`URL: ${TEST_URL}`);
|
||||||
|
console.log('Method: Puppeteer + Stealth + CDP response capture');
|
||||||
console.log('');
|
console.log('');
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Test 1: Fetch products from store
|
console.log('[Starting] Launching browser with Stealth plugin...\n');
|
||||||
console.log('[Test 1] Fetching products from Treez store...');
|
|
||||||
const result = await fetchProductsByStoreId(TEST_STORE_ID);
|
|
||||||
|
|
||||||
console.log('');
|
const result = await fetchProductsFromUrl(TEST_URL);
|
||||||
console.log('[Results]');
|
|
||||||
console.log(` Store: ${result.store.name}`);
|
|
||||||
console.log(` Store ID: ${result.store.storeId}`);
|
|
||||||
console.log(` Products captured: ${result.products.length}`);
|
|
||||||
console.log(` Scroll count: ${result.scrollCount}`);
|
|
||||||
|
|
||||||
if (result.products.length > 0) {
|
console.log('\n' + '='.repeat(60));
|
||||||
console.log('');
|
console.log('RESULTS');
|
||||||
console.log('[Sample Products (first 5)]');
|
|
||||||
for (const p of result.products.slice(0, 5)) {
|
|
||||||
console.log(` - ${p.name}`);
|
|
||||||
console.log(` Brand: ${p.brand || 'N/A'}`);
|
|
||||||
console.log(` Category: ${p.category || 'N/A'} / ${p.subcategory || 'N/A'}`);
|
|
||||||
console.log(` Price: ${p.price ? '$' + p.price : 'N/A'}`);
|
|
||||||
console.log(` THC: ${p.thcPercent !== null ? p.thcPercent + '%' : 'N/A'}`);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test 2: Normalize products
|
|
||||||
console.log('');
|
|
||||||
console.log('[Test 2] Testing normalizer...');
|
|
||||||
const normalizer = new TreezNormalizer();
|
|
||||||
|
|
||||||
// Build a fake payload structure
|
|
||||||
const fakePayload = {
|
|
||||||
id: 'test-payload',
|
|
||||||
dispensary_id: 9999,
|
|
||||||
crawl_run_id: null,
|
|
||||||
platform: 'treez',
|
|
||||||
payload_version: 1,
|
|
||||||
raw_json: { products: result.products },
|
|
||||||
product_count: result.products.length,
|
|
||||||
pricing_type: null,
|
|
||||||
crawl_mode: null,
|
|
||||||
fetched_at: new Date(),
|
|
||||||
processed: false,
|
|
||||||
normalized_at: null,
|
|
||||||
hydration_error: null,
|
|
||||||
hydration_attempts: 0,
|
|
||||||
created_at: new Date(),
|
|
||||||
};
|
|
||||||
|
|
||||||
const normalized = normalizer.normalize(fakePayload);
|
|
||||||
|
|
||||||
console.log(` Products normalized: ${normalized.products.length}`);
|
|
||||||
console.log(` Brands extracted: ${normalized.brands.length}`);
|
|
||||||
console.log(` Categories extracted: ${normalized.categories.length}`);
|
|
||||||
console.log(` Errors: ${normalized.errors.length}`);
|
|
||||||
|
|
||||||
if (normalized.products.length > 0) {
|
|
||||||
console.log('');
|
|
||||||
console.log('[Sample Normalized Product]');
|
|
||||||
const np = normalized.products[0];
|
|
||||||
console.log(` External ID: ${np.externalProductId}`);
|
|
||||||
console.log(` Name: ${np.name}`);
|
|
||||||
console.log(` Brand: ${np.brandName}`);
|
|
||||||
console.log(` Category: ${np.category}`);
|
|
||||||
console.log(` Type: ${np.type}`);
|
|
||||||
console.log(` Strain: ${np.strainType}`);
|
|
||||||
console.log(` THC: ${np.thcPercent !== null ? np.thcPercent + '%' : 'N/A'}`);
|
|
||||||
console.log(` CBD: ${np.cbdPercent !== null ? np.cbdPercent + '%' : 'N/A'}`);
|
|
||||||
console.log(` Image: ${np.primaryImageUrl?.slice(0, 60) || 'N/A'}...`);
|
|
||||||
|
|
||||||
const pricing = normalized.pricing.get(np.externalProductId);
|
|
||||||
if (pricing) {
|
|
||||||
console.log(` Price (cents): ${pricing.priceRec}`);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
console.log('');
|
|
||||||
console.log('='.repeat(60));
|
console.log('='.repeat(60));
|
||||||
|
console.log(`Total products: ${result.totalCaptured}`);
|
||||||
|
console.log(`Store ID: ${result.storeId || 'N/A (custom domain)'}`);
|
||||||
|
console.log(`Source URL: ${result.sourceUrl}`);
|
||||||
|
console.log(`Fetched at: ${result.fetchedAt.toISOString()}`);
|
||||||
|
|
||||||
|
if (result.products.length === 0) {
|
||||||
|
console.log('\n[WARNING] No products captured!');
|
||||||
|
console.log('This could mean:');
|
||||||
|
console.log(' - Stealth plugin is not bypassing detection');
|
||||||
|
console.log(' - CDP is not intercepting the correct URLs');
|
||||||
|
console.log(' - Page structure has changed');
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Show sample raw product
|
||||||
|
console.log('\n' + '='.repeat(60));
|
||||||
|
console.log('SAMPLE RAW PRODUCT (from Elasticsearch)');
|
||||||
|
console.log('='.repeat(60));
|
||||||
|
const raw = result.products[0];
|
||||||
|
console.log(JSON.stringify({
|
||||||
|
id: raw.id,
|
||||||
|
name: raw.name,
|
||||||
|
menuTitle: raw.menuTitle,
|
||||||
|
brand: raw.brand,
|
||||||
|
category: raw.category,
|
||||||
|
subtype: raw.subtype,
|
||||||
|
status: raw.status,
|
||||||
|
availableUnits: raw.availableUnits,
|
||||||
|
customMinPrice: raw.customMinPrice,
|
||||||
|
customMaxPrice: raw.customMaxPrice,
|
||||||
|
isActive: raw.isActive,
|
||||||
|
isAboveThreshold: raw.isAboveThreshold,
|
||||||
|
}, null, 2));
|
||||||
|
|
||||||
|
// Show sample normalized product
|
||||||
|
console.log('\n' + '='.repeat(60));
|
||||||
|
console.log('SAMPLE NORMALIZED PRODUCT');
|
||||||
|
console.log('='.repeat(60));
|
||||||
|
const normalized = result.normalized[0];
|
||||||
|
console.log(JSON.stringify({
|
||||||
|
id: normalized.id,
|
||||||
|
name: normalized.name,
|
||||||
|
brand: normalized.brand,
|
||||||
|
category: normalized.category,
|
||||||
|
subtype: normalized.subtype,
|
||||||
|
price: normalized.price,
|
||||||
|
priceMin: normalized.priceMin,
|
||||||
|
priceMax: normalized.priceMax,
|
||||||
|
discountedPrice: normalized.discountedPrice,
|
||||||
|
discountPercent: normalized.discountPercent,
|
||||||
|
availableUnits: normalized.availableUnits,
|
||||||
|
inStock: normalized.inStock,
|
||||||
|
thcPercent: normalized.thcPercent,
|
||||||
|
cbdPercent: normalized.cbdPercent,
|
||||||
|
strainType: normalized.strainType,
|
||||||
|
effects: normalized.effects,
|
||||||
|
flavors: normalized.flavors,
|
||||||
|
imageUrl: normalized.imageUrl,
|
||||||
|
images: normalized.images?.slice(0, 2),
|
||||||
|
}, null, 2));
|
||||||
|
|
||||||
|
// Brand breakdown
|
||||||
|
console.log('\n' + '='.repeat(60));
|
||||||
|
console.log('BRANDS (top 15)');
|
||||||
|
console.log('='.repeat(60));
|
||||||
|
const brandCounts = new Map<string, number>();
|
||||||
|
for (const p of result.normalized) {
|
||||||
|
const brand = p.brand || 'Unknown';
|
||||||
|
brandCounts.set(brand, (brandCounts.get(brand) || 0) + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
const sorted = [...brandCounts.entries()].sort((a, b) => b[1] - a[1]);
|
||||||
|
console.log(`Total unique brands: ${sorted.length}\n`);
|
||||||
|
sorted.slice(0, 15).forEach(([brand, count]) => {
|
||||||
|
console.log(` ${brand}: ${count} products`);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Category breakdown
|
||||||
|
console.log('\n' + '='.repeat(60));
|
||||||
|
console.log('CATEGORIES');
|
||||||
|
console.log('='.repeat(60));
|
||||||
|
const categoryCounts = new Map<string, number>();
|
||||||
|
for (const p of result.normalized) {
|
||||||
|
const cat = p.category || 'Unknown';
|
||||||
|
categoryCounts.set(cat, (categoryCounts.get(cat) || 0) + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
const catSorted = [...categoryCounts.entries()].sort((a, b) => b[1] - a[1]);
|
||||||
|
catSorted.forEach(([cat, count]) => {
|
||||||
|
console.log(` ${cat}: ${count} products`);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Inventory stats
|
||||||
|
console.log('\n' + '='.repeat(60));
|
||||||
|
console.log('INVENTORY STATS');
|
||||||
|
console.log('='.repeat(60));
|
||||||
|
const inStock = result.normalized.filter(p => p.inStock).length;
|
||||||
|
const outOfStock = result.normalized.filter(p => !p.inStock).length;
|
||||||
|
const hasInventoryData = result.normalized.filter(p => p.availableUnits > 0).length;
|
||||||
|
|
||||||
|
console.log(`In stock: ${inStock}`);
|
||||||
|
console.log(`Out of stock: ${outOfStock}`);
|
||||||
|
console.log(`With inventory levels: ${hasInventoryData}`);
|
||||||
|
|
||||||
|
// Show inventory examples
|
||||||
|
if (hasInventoryData > 0) {
|
||||||
|
console.log('\nSample inventory levels:');
|
||||||
|
result.normalized
|
||||||
|
.filter(p => p.availableUnits > 0)
|
||||||
|
.slice(0, 5)
|
||||||
|
.forEach(p => {
|
||||||
|
console.log(` ${p.name}: ${p.availableUnits} units`);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check for THC/CBD data
|
||||||
|
const hasThc = result.normalized.filter(p => p.thcPercent !== null).length;
|
||||||
|
const hasCbd = result.normalized.filter(p => p.cbdPercent !== null).length;
|
||||||
|
console.log(`\nWith THC data: ${hasThc} (${Math.round(hasThc / result.totalCaptured * 100)}%)`);
|
||||||
|
console.log(`With CBD data: ${hasCbd} (${Math.round(hasCbd / result.totalCaptured * 100)}%)`);
|
||||||
|
|
||||||
|
// Check for images
|
||||||
|
const hasImages = result.normalized.filter(p => p.imageUrl).length;
|
||||||
|
console.log(`With images: ${hasImages} (${Math.round(hasImages / result.totalCaptured * 100)}%)`);
|
||||||
|
|
||||||
|
console.log('\n' + '='.repeat(60));
|
||||||
console.log('TEST PASSED');
|
console.log('TEST PASSED');
|
||||||
console.log('='.repeat(60));
|
console.log('='.repeat(60));
|
||||||
|
|
||||||
} catch (error: any) {
|
} catch (error: any) {
|
||||||
console.error('');
|
console.error('\n' + '='.repeat(60));
|
||||||
console.error('='.repeat(60));
|
|
||||||
console.error('TEST FAILED');
|
console.error('TEST FAILED');
|
||||||
console.error('='.repeat(60));
|
console.error('='.repeat(60));
|
||||||
console.error(`Error: ${error.message}`);
|
console.error(`Error: ${error.message}`);
|
||||||
|
|||||||
@@ -3,77 +3,63 @@
|
|||||||
* TREEZ PLATFORM CLIENT
|
* TREEZ PLATFORM CLIENT
|
||||||
* ============================================================
|
* ============================================================
|
||||||
*
|
*
|
||||||
* Treez is a fully client-side rendered platform (React/Next.js).
|
* Treez uses Cloudflare protection + headless detection on their
|
||||||
* Unlike Dutchie (GraphQL) or Jane (Algolia), Treez requires DOM
|
* Elasticsearch API. This client uses:
|
||||||
* parsing after page render. No API endpoints are available.
|
|
||||||
*
|
*
|
||||||
* Key differences:
|
* 1. Puppeteer with Stealth plugin to bypass detection
|
||||||
* - No Cloudflare protection (simpler than Jane)
|
* 2. CDP (Chrome DevTools Protocol) to intercept API responses
|
||||||
* - Products loaded via infinite scroll
|
* 3. Scrolling/pagination to trigger all product loads
|
||||||
* - Data extracted from DOM elements
|
*
|
||||||
* - Age gate must be bypassed
|
* API Endpoints (intercepted, not called directly):
|
||||||
|
* - Products: POST https://search-{tenant}.gapcommerceapi.com/product/search
|
||||||
|
* - Discounts: GET https://headless.treez.io/v2.0/dispensary/{storeId}/ecommerce/discounts
|
||||||
*
|
*
|
||||||
* URL Pattern: https://{storeId}.treez.io/onlinemenu/?customerType=ADULT
|
|
||||||
* Store ID Format: String slug (e.g., "best")
|
* Store ID Format: String slug (e.g., "best")
|
||||||
|
* Menu URL: https://{storeId}.treez.io/onlinemenu/ or custom domain
|
||||||
|
*
|
||||||
|
* Data captured includes:
|
||||||
|
* - Full product details (name, brand, category, subtype)
|
||||||
|
* - Inventory levels (availableUnits)
|
||||||
|
* - Pricing with discounts
|
||||||
|
* - Lab results (THC/CBD when available)
|
||||||
*
|
*
|
||||||
* ============================================================
|
* ============================================================
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import puppeteer, { Browser, Page } from 'puppeteer';
|
import puppeteer from 'puppeteer-extra';
|
||||||
import puppeteerExtra from 'puppeteer-extra';
|
|
||||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||||
|
import type { Browser, Page, CDPSession } from 'puppeteer';
|
||||||
|
|
||||||
import type { CrawlRotator, BrowserFingerprint } from '../../services/crawl-rotator';
|
import type { CrawlRotator, BrowserFingerprint } from '../../services/crawl-rotator';
|
||||||
|
import type {
|
||||||
|
TreezSession,
|
||||||
|
TreezProductRaw,
|
||||||
|
TreezProduct,
|
||||||
|
TreezConfig,
|
||||||
|
TreezESResponse,
|
||||||
|
} from './types';
|
||||||
|
|
||||||
// Register stealth plugin (good practice even without Cloudflare)
|
// Register stealth plugin - REQUIRED for Treez
|
||||||
puppeteerExtra.use(StealthPlugin());
|
puppeteer.use(StealthPlugin());
|
||||||
|
|
||||||
// ============================================================
|
|
||||||
// TYPES
|
|
||||||
// ============================================================
|
|
||||||
|
|
||||||
export interface TreezProductRaw {
|
|
||||||
productId: string;
|
|
||||||
name: string;
|
|
||||||
brand: string;
|
|
||||||
category: string;
|
|
||||||
subcategory: string; // indica, sativa, hybrid
|
|
||||||
thcPercent: number | null;
|
|
||||||
cbdPercent: number | null;
|
|
||||||
price: number | null;
|
|
||||||
priceUnit: string;
|
|
||||||
imageUrl: string | null;
|
|
||||||
inStock: boolean;
|
|
||||||
weight: string | null;
|
|
||||||
}
|
|
||||||
|
|
||||||
export interface TreezSession {
|
|
||||||
sessionId: string;
|
|
||||||
browser: Browser;
|
|
||||||
page: Page;
|
|
||||||
fingerprint: BrowserFingerprint;
|
|
||||||
proxyUrl: string | null;
|
|
||||||
startedAt: Date;
|
|
||||||
storeId?: string;
|
|
||||||
}
|
|
||||||
|
|
||||||
export interface TreezStoreInfo {
|
|
||||||
storeId: string;
|
|
||||||
name: string;
|
|
||||||
url: string;
|
|
||||||
}
|
|
||||||
|
|
||||||
// ============================================================
|
// ============================================================
|
||||||
// CONFIGURATION
|
// CONFIGURATION
|
||||||
// ============================================================
|
// ============================================================
|
||||||
|
|
||||||
export const TREEZ_CONFIG = {
|
export const TREEZ_CONFIG: TreezConfig = {
|
||||||
baseUrl: 'https://{storeId}.treez.io/onlinemenu/',
|
// Elasticsearch API (product data) - intercepted via CDP
|
||||||
timeout: 60000,
|
esEndpoint: 'gapcommerceapi.com/product/search',
|
||||||
|
esApiKey: 'V3jHL9dFzi3Gj4UISM4lr38Nm0GSxcps5OBz1PbS',
|
||||||
|
|
||||||
|
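// Treez Headless API (discounts, store info). NOTE(review): esApiKey, clientId and clientSecret are hard-coded in this config - confirm they are public client-side values; otherwise load them from environment/secret storage.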
|
||||||
|
headlessApiBase: 'https://headless.treez.io/v2.0/dispensary',
|
||||||
|
clientId: '29dce682258145c6b1cf71027282d083',
|
||||||
|
clientSecret: 'A57bB49AfD7F4233B1750a0B501B4E16',
|
||||||
|
|
||||||
|
// Timing
|
||||||
navigationTimeout: 60000,
|
navigationTimeout: 60000,
|
||||||
scrollDelay: 1500,
|
scrollDelay: 1500,
|
||||||
maxScrollAttempts: 50,
|
maxScrollAttempts: 50,
|
||||||
ageGateDelay: 2000,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// ============================================================
|
// ============================================================
|
||||||
@@ -102,6 +88,7 @@ export function getCrawlRotator(): CrawlRotator | null {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Start a new Treez browser session
|
* Start a new Treez browser session
|
||||||
|
* Uses Puppeteer + Stealth plugin with CDP for response interception
|
||||||
*/
|
*/
|
||||||
export async function startSession(storeId?: string): Promise<TreezSession> {
|
export async function startSession(storeId?: string): Promise<TreezSession> {
|
||||||
if (currentSession) {
|
if (currentSession) {
|
||||||
@@ -122,7 +109,8 @@ export async function startSession(storeId?: string): Promise<TreezSession> {
|
|||||||
} else {
|
} else {
|
||||||
// Default fingerprint for local testing
|
// Default fingerprint for local testing
|
||||||
fingerprint = {
|
fingerprint = {
|
||||||
userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
|
userAgent:
|
||||||
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
|
||||||
browserName: 'Chrome',
|
browserName: 'Chrome',
|
||||||
deviceCategory: 'desktop',
|
deviceCategory: 'desktop',
|
||||||
platform: 'Windows',
|
platform: 'Windows',
|
||||||
@@ -159,9 +147,9 @@ export async function startSession(storeId?: string): Promise<TreezSession> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log('[Treez Client] Launching browser...');
|
console.log('[Treez Client] Launching browser with Stealth plugin...');
|
||||||
const browser = await puppeteerExtra.launch({
|
const browser = await puppeteer.launch({
|
||||||
headless: true,
|
headless: 'new',
|
||||||
args: browserArgs,
|
args: browserArgs,
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -176,18 +164,6 @@ export async function startSession(storeId?: string): Promise<TreezSession> {
|
|||||||
// Set user agent
|
// Set user agent
|
||||||
await page.setUserAgent(fingerprint.userAgent);
|
await page.setUserAgent(fingerprint.userAgent);
|
||||||
|
|
||||||
// Block unnecessary resources to save bandwidth
|
|
||||||
// We only need HTML/JS for DOM extraction - not images, fonts, etc.
|
|
||||||
await page.setRequestInterception(true);
|
|
||||||
page.on('request', (request) => {
|
|
||||||
const resourceType = request.resourceType();
|
|
||||||
if (['image', 'font', 'media', 'stylesheet'].includes(resourceType)) {
|
|
||||||
request.abort();
|
|
||||||
} else {
|
|
||||||
request.continue();
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
// Handle proxy authentication if needed
|
// Handle proxy authentication if needed
|
||||||
if (proxyUrl) {
|
if (proxyUrl) {
|
||||||
const authMatch = proxyUrl.match(/:\/\/([^:]+):([^@]+)@/);
|
const authMatch = proxyUrl.match(/:\/\/([^:]+):([^@]+)@/);
|
||||||
@@ -199,16 +175,22 @@ export async function startSession(storeId?: string): Promise<TreezSession> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Create CDP session for response interception
|
||||||
|
const cdpClient = await page.target().createCDPSession();
|
||||||
|
await cdpClient.send('Network.enable');
|
||||||
|
|
||||||
const sessionId = `treez_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`;
|
const sessionId = `treez_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`;
|
||||||
|
|
||||||
currentSession = {
|
currentSession = {
|
||||||
sessionId,
|
sessionId,
|
||||||
browser,
|
browser,
|
||||||
page,
|
page,
|
||||||
|
cdpClient,
|
||||||
fingerprint,
|
fingerprint,
|
||||||
proxyUrl,
|
proxyUrl,
|
||||||
startedAt: new Date(),
|
startedAt: new Date(),
|
||||||
storeId,
|
storeId,
|
||||||
|
capturedProducts: [],
|
||||||
};
|
};
|
||||||
|
|
||||||
console.log(`[Treez Client] Started session ${sessionId}`);
|
console.log(`[Treez Client] Started session ${sessionId}`);
|
||||||
@@ -226,7 +208,10 @@ export async function startSession(storeId?: string): Promise<TreezSession> {
|
|||||||
export async function endSession(): Promise<void> {
|
export async function endSession(): Promise<void> {
|
||||||
if (currentSession) {
|
if (currentSession) {
|
||||||
const duration = Math.round((Date.now() - currentSession.startedAt.getTime()) / 1000);
|
const duration = Math.round((Date.now() - currentSession.startedAt.getTime()) / 1000);
|
||||||
console.log(`[Treez Client] Ending session ${currentSession.sessionId} (${duration}s)`);
|
const productCount = currentSession.capturedProducts.length;
|
||||||
|
console.log(
|
||||||
|
`[Treez Client] Ending session ${currentSession.sessionId} (${duration}s, ${productCount} products)`
|
||||||
|
);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
await currentSession.browser.close();
|
await currentSession.browser.close();
|
||||||
@@ -246,320 +231,400 @@ export function getCurrentSession(): TreezSession | null {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// ============================================================
|
// ============================================================
|
||||||
// AGE GATE HANDLING
|
// CDP RESPONSE INTERCEPTION
|
||||||
// ============================================================
|
// ============================================================
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Bypass age gate if present
|
* Set up a CDP listener to capture Elasticsearch product responses
|
||||||
*/
|
*/
|
||||||
export async function bypassAgeGate(page: Page): Promise<boolean> {
|
function setupProductCapture(session: TreezSession): void {
|
||||||
console.log('[Treez Client] Checking for age gate...');
|
const { cdpClient } = session;
|
||||||
|
|
||||||
|
cdpClient.on('Network.responseReceived', async (event: any) => {
|
||||||
|
const url = event.response.url;
|
||||||
|
|
||||||
|
// Check if this is an ES product search response
|
||||||
|
if (url.includes('gapcommerceapi.com/product/search') && event.response.status === 200) {
|
||||||
try {
|
try {
|
||||||
const ageGate = await page.$('[data-testid="age-gate-modal"], [class*="AgeGate"]');
|
const response = await cdpClient.send('Network.getResponseBody', {
|
||||||
|
requestId: event.requestId,
|
||||||
if (ageGate) {
|
|
||||||
console.log('[Treez Client] Age gate detected, clicking confirm button...');
|
|
||||||
|
|
||||||
const submitBtn = await page.$('[data-testid="age-gate-submit-button"], button[type="submit"]');
|
|
||||||
if (submitBtn) {
|
|
||||||
await submitBtn.click();
|
|
||||||
console.log('[Treez Client] Clicked confirm button');
|
|
||||||
|
|
||||||
await sleep(TREEZ_CONFIG.ageGateDelay);
|
|
||||||
|
|
||||||
// Wait for age gate to disappear
|
|
||||||
await page.waitForFunction(
|
|
||||||
() => !document.querySelector('[data-testid="age-gate-modal"]'),
|
|
||||||
{ timeout: 10000 }
|
|
||||||
).catch(() => {
|
|
||||||
console.log('[Treez Client] Gate may still be visible, continuing anyway');
|
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log('[Treez Client] Age gate bypassed');
|
const body = response.base64Encoded
|
||||||
return true;
|
? Buffer.from(response.body, 'base64').toString('utf8')
|
||||||
} else {
|
: response.body;
|
||||||
console.log('[Treez Client] No submit button found');
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
console.log('[Treez Client] No age gate detected');
|
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
const json: TreezESResponse = JSON.parse(body);
|
||||||
} catch (err: any) {
|
const products = json.hits?.hits?.map((h) => h._source) || [];
|
||||||
console.log(`[Treez Client] Age gate error: ${err.message}`);
|
|
||||||
return false;
|
if (products.length > 0) {
|
||||||
|
session.capturedProducts.push(...products);
|
||||||
|
console.log(
|
||||||
|
`[Treez Client] Captured ${products.length} products (total: ${session.capturedProducts.length})`
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
} catch {
|
||||||
|
// Response body may not be available, or may not parse as JSON; either failure is skipped silently
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// ============================================================
|
// ============================================================
|
||||||
// NAVIGATION & SCRAPING
|
// PRODUCT FETCHING
|
||||||
// ============================================================
|
// ============================================================
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Build menu URL for a store
|
* Navigate to store menu and capture all products via CDP interception
|
||||||
* Uses /brands page which contains all products (not just homepage carousels)
|
* This is the main method for fetching products
|
||||||
*/
|
*/
|
||||||
export function buildMenuUrl(storeId: string, customerType: 'ADULT' | 'MEDICAL' = 'ADULT'): string {
|
export async function fetchAllProducts(
|
||||||
return `https://${storeId}.treez.io/onlinemenu/brands?customerType=${customerType}`;
|
menuUrl: string,
|
||||||
}
|
options: {
|
||||||
|
maxScrolls?: number;
|
||||||
|
scrollDelay?: number;
|
||||||
|
bypassAgeGate?: boolean;
|
||||||
|
} = {}
|
||||||
|
): Promise<TreezProductRaw[]> {
|
||||||
|
const {
|
||||||
|
maxScrolls = TREEZ_CONFIG.maxScrollAttempts,
|
||||||
|
scrollDelay = TREEZ_CONFIG.scrollDelay,
|
||||||
|
bypassAgeGate = true,
|
||||||
|
} = options;
|
||||||
|
|
||||||
/**
|
|
||||||
* Navigate to a store's menu page
|
|
||||||
*/
|
|
||||||
export async function navigateToMenu(storeId: string): Promise<void> {
|
|
||||||
if (!currentSession) {
|
if (!currentSession) {
|
||||||
throw new Error('[Treez Client] No active session - call startSession() first');
|
throw new Error('[Treez Client] No active session - call startSession() first');
|
||||||
}
|
}
|
||||||
|
|
||||||
const { page } = currentSession;
|
const { page } = currentSession;
|
||||||
const url = buildMenuUrl(storeId);
|
|
||||||
|
|
||||||
console.log(`[Treez Client] Navigating to ${url}`);
|
// Reset captured products
|
||||||
|
currentSession.capturedProducts = [];
|
||||||
|
|
||||||
await page.goto(url, {
|
// Set up CDP listener for product responses
|
||||||
|
setupProductCapture(currentSession);
|
||||||
|
|
||||||
|
console.log(`[Treez Client] Navigating to ${menuUrl}`);
|
||||||
|
|
||||||
|
try {
|
||||||
|
await page.goto(menuUrl, {
|
||||||
waitUntil: 'networkidle2',
|
waitUntil: 'networkidle2',
|
||||||
timeout: TREEZ_CONFIG.navigationTimeout,
|
timeout: TREEZ_CONFIG.navigationTimeout,
|
||||||
});
|
});
|
||||||
|
|
||||||
// Wait for React app to render
|
await sleep(3000);
|
||||||
await sleep(2000);
|
|
||||||
|
|
||||||
// Bypass age gate
|
// Bypass age gate if present
|
||||||
await bypassAgeGate(page);
|
if (bypassAgeGate) {
|
||||||
|
await tryBypassAgeGate(page);
|
||||||
// Wait for content to load
|
|
||||||
await sleep(2000);
|
|
||||||
|
|
||||||
console.log('[Treez Client] Menu page loaded');
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
// Wait for initial products to load
|
||||||
* Scroll to load all products (infinite scroll)
|
await sleep(3000);
|
||||||
*/
|
console.log(`[Treez Client] Initial capture: ${currentSession.capturedProducts.length} products`);
|
||||||
export async function scrollToLoadAll(page: Page): Promise<number> {
|
|
||||||
let previousHeight = 0;
|
|
||||||
let scrollCount = 0;
|
|
||||||
let sameHeightCount = 0;
|
|
||||||
|
|
||||||
console.log('[Treez Client] Starting infinite scroll...');
|
// Scroll and click "Load More" to get all products
|
||||||
|
console.log('[Treez Client] Scrolling to load all products...');
|
||||||
|
|
||||||
while (scrollCount < TREEZ_CONFIG.maxScrollAttempts) {
|
let previousCount = 0;
|
||||||
const currentHeight = await page.evaluate(() => document.body.scrollHeight);
|
let noNewDataCount = 0;
|
||||||
|
|
||||||
if (currentHeight === previousHeight) {
|
for (let i = 0; i < maxScrolls; i++) {
|
||||||
sameHeightCount++;
|
// Scroll to bottom
|
||||||
if (sameHeightCount >= 3) {
|
await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
|
||||||
console.log('[Treez Client] No new content after 3 attempts, stopping');
|
await sleep(scrollDelay);
|
||||||
|
|
||||||
|
// Try clicking "Load More" button
|
||||||
|
try {
|
||||||
|
const loadMoreBtn = await page.$('button.collection__load-more');
|
||||||
|
if (loadMoreBtn) {
|
||||||
|
const isVisible = await page.evaluate((btn: Element) => {
|
||||||
|
const rect = btn.getBoundingClientRect();
|
||||||
|
return rect.width > 0 && rect.height > 0;
|
||||||
|
}, loadMoreBtn);
|
||||||
|
|
||||||
|
if (isVisible) {
|
||||||
|
await loadMoreBtn.click();
|
||||||
|
await sleep(scrollDelay);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// No load more button or click failed
|
||||||
|
}
|
||||||
|
|
||||||
|
const currentCount = currentSession.capturedProducts.length;
|
||||||
|
if (currentCount === previousCount) {
|
||||||
|
noNewDataCount++;
|
||||||
|
if (noNewDataCount >= 5) {
|
||||||
|
console.log(`[Treez Client] No new products for 5 scrolls, stopping`);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
sameHeightCount = 0;
|
noNewDataCount = 0;
|
||||||
}
|
if ((i + 1) % 5 === 0) {
|
||||||
|
console.log(`[Treez Client] Scroll ${i + 1}: ${currentCount} products`);
|
||||||
await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
|
|
||||||
await sleep(TREEZ_CONFIG.scrollDelay);
|
|
||||||
|
|
||||||
previousHeight = currentHeight;
|
|
||||||
scrollCount++;
|
|
||||||
|
|
||||||
if (scrollCount % 5 === 0) {
|
|
||||||
const productCount = await page.evaluate(() => {
|
|
||||||
return document.querySelectorAll('[class*="product_product__"]').length;
|
|
||||||
});
|
|
||||||
console.log(`[Treez Client] Scroll ${scrollCount}: ${productCount} products loaded`);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
previousCount = currentCount;
|
||||||
return scrollCount;
|
}
|
||||||
|
} catch (error: any) {
|
||||||
|
console.error(`[Treez Client] Navigation error: ${error.message}`);
|
||||||
|
throw error;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
// Deduplicate products by ID
|
||||||
* Extract products from the current page
|
|
||||||
*/
|
|
||||||
export async function extractProducts(page: Page): Promise<TreezProductRaw[]> {
|
|
||||||
console.log('[Treez Client] Extracting products from DOM...');
|
|
||||||
|
|
||||||
const products = await page.evaluate(() => {
|
|
||||||
const results: any[] = [];
|
|
||||||
|
|
||||||
// Find all product cards
|
|
||||||
const productElements = Array.from(
|
|
||||||
document.querySelectorAll('[class*="product_product__"]')
|
|
||||||
).filter(el => {
|
|
||||||
const hasName = el.querySelector('[class*="product__name"]') || el.querySelector('[class*="name__"]');
|
|
||||||
const hasPrice = el.querySelector('[class*="price"]');
|
|
||||||
return hasName || hasPrice;
|
|
||||||
});
|
|
||||||
|
|
||||||
const seen = new Set<string>();
|
const seen = new Set<string>();
|
||||||
|
const uniqueProducts = currentSession.capturedProducts.filter((p) => {
|
||||||
for (const el of productElements) {
|
if (!p.id || seen.has(p.id)) return false;
|
||||||
try {
|
seen.add(p.id);
|
||||||
// Get product name
|
return true;
|
||||||
const nameEl = el.querySelector('[class*="product__name"], [class*="name__"]');
|
|
||||||
const name = nameEl?.textContent?.trim() || '';
|
|
||||||
|
|
||||||
if (!name || seen.has(name)) continue;
|
|
||||||
seen.add(name);
|
|
||||||
|
|
||||||
// Get product ID from link
|
|
||||||
const linkEl = el.querySelector('a[href*="/product/"]');
|
|
||||||
let productId = '';
|
|
||||||
if (linkEl) {
|
|
||||||
const href = linkEl.getAttribute('href') || '';
|
|
||||||
const match = href.match(/\/product\/([^\/\?]+)/);
|
|
||||||
productId = match ? match[1] : '';
|
|
||||||
}
|
|
||||||
if (!productId) {
|
|
||||||
productId = `treez_${name.replace(/\s+/g, '_').toLowerCase().slice(0, 30)}`;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get brand
|
|
||||||
const brandEl = el.querySelector('[class*="brand"], [class*="Brand"]');
|
|
||||||
const brand = brandEl?.textContent?.trim() || '';
|
|
||||||
|
|
||||||
// Get price
|
|
||||||
const priceEl = el.querySelector('[class*="price__ins"], [class*="price"]');
|
|
||||||
const priceText = priceEl?.textContent || '';
|
|
||||||
const priceMatch = priceText.match(/\$(\d+(?:\.\d{2})?)/);
|
|
||||||
const price = priceMatch ? parseFloat(priceMatch[1]) : null;
|
|
||||||
|
|
||||||
// Get image URL
|
|
||||||
const imgEl = el.querySelector('img');
|
|
||||||
let imageUrl = imgEl?.getAttribute('src') || null;
|
|
||||||
if (imageUrl && imageUrl.includes('/_next/image')) {
|
|
||||||
const urlMatch = imageUrl.match(/url=([^&]+)/);
|
|
||||||
if (urlMatch) {
|
|
||||||
imageUrl = decodeURIComponent(urlMatch[1]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get text content for data extraction
|
|
||||||
const text = el.textContent || '';
|
|
||||||
const textLower = text.toLowerCase();
|
|
||||||
|
|
||||||
// Get THC/CBD
|
|
||||||
const thcMatch = text.match(/(?:THC[:\s]*)?(\d+(?:\.\d+)?)\s*%?\s*THC/i) ||
|
|
||||||
text.match(/THC[:\s]*(\d+(?:\.\d+)?)\s*%?/i);
|
|
||||||
const cbdMatch = text.match(/(?:CBD[:\s]*)?(\d+(?:\.\d+)?)\s*%?\s*CBD/i) ||
|
|
||||||
text.match(/CBD[:\s]*(\d+(?:\.\d+)?)\s*%?/i);
|
|
||||||
const thcPercent = thcMatch ? parseFloat(thcMatch[1]) : null;
|
|
||||||
const cbdPercent = cbdMatch ? parseFloat(cbdMatch[1]) : null;
|
|
||||||
|
|
||||||
// Get weight from name
|
|
||||||
const weightMatch = name.match(/(\d+(?:\.\d+)?)\s*(G|g|MG|mg|OZ|oz)/i);
|
|
||||||
const weight = weightMatch ? `${weightMatch[1]}${weightMatch[2].toLowerCase()}` : null;
|
|
||||||
|
|
||||||
// Determine category from weight and name (not full text to avoid nav pollution)
|
|
||||||
let category = '';
|
|
||||||
|
|
||||||
// Check explicit category patterns in NAME ONLY (not full text)
|
|
||||||
// This avoids false positives from navigation elements
|
|
||||||
const categoryPatterns = [
|
|
||||||
{ pattern: /vape|cart(?:ridge)?|pen|pod/i, category: 'vape' },
|
|
||||||
{ pattern: /edible|gummy|gummies|chocolate|candy/i, category: 'edible' },
|
|
||||||
{ pattern: /concentrate|dab|wax|shatter|rosin|resin/i, category: 'concentrate' },
|
|
||||||
{ pattern: /pre.?roll|joint|blunt/i, category: 'pre-roll' },
|
|
||||||
{ pattern: /topical|balm|cream|lotion/i, category: 'topical' },
|
|
||||||
{ pattern: /tincture/i, category: 'tincture' },
|
|
||||||
];
|
|
||||||
for (const { pattern, category: cat } of categoryPatterns) {
|
|
||||||
if (pattern.test(name)) {
|
|
||||||
category = cat;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// If no explicit category found, infer from weight
|
|
||||||
if (!category && weight) {
|
|
||||||
const weightLower = weight.toLowerCase();
|
|
||||||
if (weightLower.includes('g') && !weightLower.includes('mg')) {
|
|
||||||
// Gram weights (3.5g, 1g, 7g, etc.) are typically flower
|
|
||||||
category = 'flower';
|
|
||||||
} else if (weightLower.includes('mg')) {
|
|
||||||
// Milligram weights are typically edibles
|
|
||||||
category = 'edible';
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get strain type
|
|
||||||
const strainTypes = ['indica', 'sativa', 'hybrid'];
|
|
||||||
let subcategory = '';
|
|
||||||
for (const strain of strainTypes) {
|
|
||||||
if (textLower.includes(strain)) {
|
|
||||||
subcategory = strain;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check stock status
|
|
||||||
const inStock = !textLower.includes('out of stock') && !textLower.includes('sold out');
|
|
||||||
|
|
||||||
results.push({
|
|
||||||
productId,
|
|
||||||
name,
|
|
||||||
brand,
|
|
||||||
category,
|
|
||||||
subcategory,
|
|
||||||
thcPercent,
|
|
||||||
cbdPercent,
|
|
||||||
price,
|
|
||||||
priceUnit: weight || '',
|
|
||||||
imageUrl,
|
|
||||||
inStock,
|
|
||||||
weight,
|
|
||||||
});
|
|
||||||
} catch (err) {
|
|
||||||
// Skip products that fail extraction
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return results;
|
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log(`[Treez Client] Extracted ${products.length} products`);
|
console.log(`[Treez Client] Total unique products: ${uniqueProducts.length}`);
|
||||||
return products;
|
|
||||||
|
// Record success with rotator
|
||||||
|
if (crawlRotator && uniqueProducts.length > 0) {
|
||||||
|
await crawlRotator.recordSuccess();
|
||||||
|
}
|
||||||
|
|
||||||
|
return uniqueProducts;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Fetch products from a specific brand page.
 *
 * Builds `<storeUrl>/brand/<brandSlug>` (the slug is URI-encoded) and
 * delegates to `fetchAllProducts` with a cap of 30 scrolls.
 *
 * @param storeUrl - Base store URL, e.g. `https://shop.bestdispensary.com`.
 *   Trailing slashes are ignored so the result never contains `//brand/`.
 * @param brandSlug - Brand slug as it appears in the brand page URL.
 * @returns Raw products captured while loading the brand page.
 */
export async function fetchBrandProducts(
  storeUrl: string,
  brandSlug: string
): Promise<TreezProductRaw[]> {
  // Drop trailing slashes so "https://host/" and "https://host" build the same URL.
  const baseUrl = storeUrl.replace(/\/+$/, '');
  const brandUrl = `${baseUrl}/brand/${encodeURIComponent(brandSlug)}`;
  return fetchAllProducts(brandUrl, { maxScrolls: 30 });
}
|
||||||
|
|
||||||
|
/**
 * Fetch products from a specific category page.
 *
 * Builds `<storeUrl>/collection/<categorySlug>` (the slug is URI-encoded) and
 * delegates to `fetchAllProducts` with a cap of 30 scrolls.
 *
 * @param storeUrl - Base store URL, e.g. `https://shop.bestdispensary.com`.
 *   Trailing slashes are ignored so the result never contains `//collection/`.
 * @param categorySlug - Category slug as it appears in the collection URL.
 * @returns Raw products captured while loading the category page.
 */
export async function fetchCategoryProducts(
  storeUrl: string,
  categorySlug: string
): Promise<TreezProductRaw[]> {
  // Drop trailing slashes so "https://host/" and "https://host" build the same URL.
  const baseUrl = storeUrl.replace(/\/+$/, '');
  const categoryUrl = `${baseUrl}/collection/${encodeURIComponent(categorySlug)}`;
  return fetchAllProducts(categoryUrl, { maxScrolls: 30 });
}
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// BRAND DISCOVERY
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
 * Fetch all brands from the store's `/brands` page.
 *
 * Navigates the active session's page to `<storeUrl>/brands`, attempts the
 * age-gate bypass, clicks the "Load More" button until it disappears (at most
 * 20 times), then collects every brand link inside `.brands-page__list`.
 *
 * @param storeUrl - Base store URL; trailing slashes are ignored.
 * @returns One `{ name, href }` entry per brand link that has both a
 *   non-empty text and a non-empty `href`.
 * @throws Error when no session is active (call `startSession()` first).
 */
export async function fetchAllBrands(
  storeUrl: string
): Promise<Array<{ name: string; href: string }>> {
  if (!currentSession) {
    throw new Error('[Treez Client] No active session - call startSession() first');
  }

  const { page } = currentSession;
  // Drop trailing slashes so "https://host/" does not produce "https://host//brands".
  const brandsUrl = `${storeUrl.replace(/\/+$/, '')}/brands`;

  console.log(`[Treez Client] Fetching brands from ${brandsUrl}`);

  await page.goto(brandsUrl, {
    waitUntil: 'networkidle2',
    timeout: TREEZ_CONFIG.navigationTimeout,
  });

  // Give the page time to settle before and after the age-gate attempt.
  await sleep(3000);
  await tryBypassAgeGate(page);
  await sleep(2000);

  // Click "Load More" to get all brands. The loop is bounded so a button that
  // never goes away cannot spin forever; any click/lookup failure ends paging
  // and we extract whatever is already rendered.
  for (let i = 0; i < 20; i++) {
    try {
      const btn = await page.$('button.collection__load-more');
      if (!btn) break;

      const isVisible = await page.evaluate((b: Element) => {
        const rect = b.getBoundingClientRect();
        return rect.width > 0 && rect.height > 0;
      }, btn);

      if (!isVisible) break;

      await btn.click();
      await sleep(1500);
    } catch {
      break;
    }
  }

  // Extract brand links (this callback runs inside the browser page).
  const brands = await page.evaluate(() => {
    const results: Array<{ name: string; href: string }> = [];

    document.querySelectorAll('.brands-page__list a[href*="/brand/"]').forEach((a: Element) => {
      const href = a.getAttribute('href') || '';
      const name = a.textContent?.trim() || '';
      if (name && href) {
        results.push({ name, href });
      }
    });

    return results;
  });

  console.log(`[Treez Client] Found ${brands.length} brands`);
  return brands;
}
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// DATA NORMALIZATION
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
 * Parse a raw Treez product into the normalized structure.
 *
 * Reads pricing, lab results, images and inventory from `raw.productData`
 * (every one of them may be missing) and combines them with the top-level
 * fields of `raw`. The untouched input is kept on the result as `raw`.
 *
 * @param raw - Product document as captured from the Treez API response.
 * @returns The normalized product.
 */
export function normalizeProduct(raw: TreezProductRaw): TreezProduct {
  const productData = raw.productData || ({} as any);
  const pricing = productData.pricing || {};
  const labResults = productData.labResults || [];

  // Extract THC/CBD from lab results.
  // NOTE(review): this is a substring match, so any cannabinoid label that
  // merely contains "thc"/"cbd" matches too, and the last matching entry
  // wins - confirm against real lab result payloads.
  let thcPercent: number | null = null;
  let cbdPercent: number | null = null;

  for (const result of labResults) {
    if (!result) continue; // tolerate null entries instead of throwing
    const cannabinoid = (result.cannabinoid || '').toLowerCase();
    if (cannabinoid.includes('thc') && result.value != null) {
      thcPercent = result.value;
    } else if (cannabinoid.includes('cbd') && result.value != null) {
      cbdPercent = result.value;
    }
  }

  // Extract strain type from subtype.
  let strainType: string | null = null;
  const subtypeLower = (raw.subtype || '').toLowerCase();

  if (subtypeLower.includes('indica')) {
    strainType = 'Indica';
  } else if (subtypeLower.includes('sativa')) {
    strainType = 'Sativa';
  } else if (subtypeLower.includes('hybrid')) {
    strainType = 'Hybrid';
  }

  // Extract images; entries that are null or have no url are skipped.
  const images = (productData.images || []).map((img: any) => img?.url).filter(Boolean);
  const imageUrl = images[0] || null;

  // Extract inventory by location; null entries are skipped.
  const inventoryByLocation = (productData.inventory || [])
    .filter((inv: any) => inv != null)
    .map((inv: any) => ({
      locationId: inv.locationId,
      locationName: inv.locationName,
      availableUnits: inv.availableUnits || 0,
    }));

  // Only report a discounted price when one is actually present and differs
  // from the sell price. Without the null check a missing discountedPrice
  // compared unequal to priceSell and leaked `undefined` into the result.
  const hasDiscount =
    pricing.discountedPrice != null && pricing.discountedPrice !== pricing.priceSell;

  return {
    id: raw.id,
    name: raw.menuTitle || raw.name,
    brand: raw.brand,
    slug: raw.slug,
    category: raw.category,
    subtype: raw.subtype,

    availableUnits: raw.availableUnits || 0,
    inStock: (raw.availableUnits || 0) > 0,
    inventoryByLocation,

    price: pricing.priceSell || raw.customMinPrice || 0,
    priceMin: raw.customMinPrice || 0,
    priceMax: raw.customMaxPrice || 0,
    discountedPrice: hasDiscount ? pricing.discountedPrice : null,
    discountPercent: pricing.discountPercent || 0,

    thcPercent,
    cbdPercent,
    strainType,
    effects: raw.effects || [],
    flavors: raw.flavors || [],
    isCannabis: productData.isCannabis ?? true,

    imageUrl,
    images,

    isActive: raw.isActive,
    customerType: raw.customCustomerType,

    lastUpdated: productData.lastUpdateDate || raw.customInjectionDate,
    createdAt: productData.createdDate || raw.customInjectionDate,

    raw,
  };
}
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// URL HELPERS
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
 * Compose the hosted online-menu URL of a Treez store.
 *
 * @param storeId - Store slug used as the `treez.io` subdomain (e.g. "best").
 * @param customerType - Value of the `customerType` query parameter;
 *   defaults to `'ADULT'`.
 * @returns The shop URL on `<storeId>.treez.io`.
 */
export function buildMenuUrl(
  storeId: string,
  customerType: 'ADULT' | 'MEDICAL' = 'ADULT'
): string {
  const menuUrl = `https://${storeId}.treez.io/onlinemenu/shop?customerType=${customerType}`;
  return menuUrl;
}
|
||||||
|
|
||||||
|
/**
 * Build a menu URL for a store served from a custom domain.
 *
 * The domain may be given with or without an `http(s)://` prefix (any
 * letter case) and with any number of trailing slashes; the result always
 * uses `https://`.
 *
 * @param domain - Custom domain, e.g. `shop.bestdispensary.com`.
 * @param path - Path appended to the domain; defaults to `/shop`. A path
 *   without a leading `/` gets one added so it cannot be glued onto the host.
 *   An empty path, or one starting with `?` or `#`, is appended unchanged.
 * @returns The full `https://` URL.
 */
export function buildCustomDomainUrl(domain: string, path: string = '/shop'): string {
  const cleanDomain = domain
    .trim()
    .replace(/^https?:\/\//i, '')
    .replace(/\/+$/, '');

  // "shop" -> "/shop"; "", "/shop", "?a=1" and "#top" are left as they are.
  const needsSlash = path !== '' && !/^[/?#]/.test(path);
  const cleanPath = needsSlash ? `/${path}` : path;

  return `https://${cleanDomain}${cleanPath}`;
}
|
||||||
|
|
||||||
|
/**
 * Extract the store ID from a Treez URL.
 *
 * Recognizes `http(s)://{storeId}.treez.io` where the store ID is the single
 * host label directly in front of `treez.io`. Matching is case-insensitive
 * and the ID is returned in lower case. The host must end at `treez.io`
 * (followed by a port, path, query, fragment or nothing), so look-alike
 * hosts such as `best.treez.io.example.com` and paths that merely contain
 * `.treez.io` are rejected.
 *
 * @param url - URL to inspect.
 * @returns The store ID, or `null` when the URL is not on `treez.io`
 *   (custom domains need the store ID from config).
 */
export function extractStoreId(url: string): string | null {
  // Pattern: {storeId}.treez.io, anchored to the host part of the URL.
  const treezMatch = /^\s*https?:\/\/([a-z0-9-]+)\.treez\.io(?=[:/?#]|\s*$)/i.exec(url);
  return treezMatch?.[1]?.toLowerCase() ?? null;
}
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// AGE GATE HANDLING
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
 * Dismiss the age-gate popup when one is present.
 *
 * Looks for the age-gate modal and its submit button by `data-testid`,
 * clicks the button and waits two seconds. Any error along the way is
 * treated as "nothing bypassed".
 *
 * @param page - Page to inspect.
 * @returns `true` only when the submit button was found and clicked.
 */
async function tryBypassAgeGate(page: Page): Promise<boolean> {
  try {
    const modal = await page.$('[data-testid="age-gate-modal"]');
    if (!modal) {
      return false;
    }

    console.log('[Treez Client] Age gate detected, bypassing...');

    const submitButton = await page.$('[data-testid="age-gate-submit-button"]');
    if (!submitButton) {
      return false;
    }

    await submitButton.click();
    await sleep(2000);
    return true;
  } catch {
    // No age gate or error bypassing
    return false;
  }
}
|
||||||
|
|
||||||
// ============================================================
|
// ============================================================
|
||||||
@@ -569,3 +634,9 @@ export async function fetchAllProducts(storeId: string): Promise<{
|
|||||||
// Resolve after `ms` milliseconds; used to pace page interactions.
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// LEGACY EXPORTS (for backward compatibility)
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
export { tryBypassAgeGate as bypassAgeGate };
|
||||||
|
|||||||
@@ -1,50 +1,124 @@
|
|||||||
/**
|
/**
|
||||||
* Treez Platform Module
|
* ============================================================
|
||||||
|
* TREEZ PLATFORM MODULE
|
||||||
|
* ============================================================
|
||||||
*
|
*
|
||||||
* Single export point for all Treez communication.
|
* Single export point for all Treez communication.
|
||||||
* All Treez workers MUST import from this module.
|
* All Treez workers MUST import from this module.
|
||||||
|
*
|
||||||
|
* ARCHITECTURE:
|
||||||
|
* Unlike Dutchie (GraphQL API) and Jane (Algolia API), Treez uses
|
||||||
|
* a client-side rendered React app with Elasticsearch backend.
|
||||||
|
* Direct API access is blocked by Cloudflare + headless detection.
|
||||||
|
*
|
||||||
|
* SOLUTION:
|
||||||
|
* We use Puppeteer with Stealth plugin + CDP (Chrome DevTools Protocol)
|
||||||
|
* to intercept the Elasticsearch API responses as the page loads.
|
||||||
|
*
|
||||||
|
* KEY COMPONENTS:
|
||||||
|
* - client.ts: Low-level browser session and CDP interception
|
||||||
|
* - queries.ts: High-level operations with automatic session management
|
||||||
|
* - types.ts: TypeScript interfaces for Treez data structures
|
||||||
|
*
|
||||||
|
* USAGE EXAMPLE:
|
||||||
|
* ```typescript
|
||||||
|
* import { fetchProductsByStoreId } from '../platforms/treez';
|
||||||
|
*
|
||||||
|
* const result = await fetchProductsByStoreId('best');
|
||||||
|
* console.log(`Found ${result.totalCaptured} products`);
|
||||||
|
* console.log(`First product: ${result.normalized[0].name}`);
|
||||||
|
* ```
|
||||||
|
*
|
||||||
|
* ============================================================
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// HIGH-LEVEL OPERATIONS (Recommended for most use cases)
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
export {
|
export {
|
||||||
// Session Management
|
// Product fetching with automatic session management
|
||||||
|
fetchProductsByStoreId,
|
||||||
|
fetchProductsFromUrl,
|
||||||
|
|
||||||
|
// Brand discovery
|
||||||
|
fetchBrandsFromStore,
|
||||||
|
|
||||||
|
// Store validation
|
||||||
|
validateStoreId,
|
||||||
|
extractStoreIdFromUrl,
|
||||||
|
|
||||||
|
// URL building
|
||||||
|
getMenuUrl,
|
||||||
|
getCustomDomainUrl,
|
||||||
|
|
||||||
|
// Result types
|
||||||
|
type FetchProductsResult,
|
||||||
|
type FetchBrandsResult,
|
||||||
|
} from './queries';
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// LOW-LEVEL CLIENT (For advanced use cases)
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
export {
|
||||||
|
// Session management
|
||||||
startSession,
|
startSession,
|
||||||
endSession,
|
endSession,
|
||||||
getCurrentSession,
|
getCurrentSession,
|
||||||
|
|
||||||
// Proxy/Rotation
|
// Proxy/rotation integration
|
||||||
setCrawlRotator,
|
setCrawlRotator,
|
||||||
getCrawlRotator,
|
getCrawlRotator,
|
||||||
|
|
||||||
// Core Operations
|
// Core operations (require active session)
|
||||||
navigateToMenu,
|
|
||||||
scrollToLoadAll,
|
|
||||||
extractProducts,
|
|
||||||
fetchAllProducts,
|
fetchAllProducts,
|
||||||
bypassAgeGate,
|
fetchAllBrands,
|
||||||
|
fetchBrandProducts,
|
||||||
|
fetchCategoryProducts,
|
||||||
|
|
||||||
// URL Building
|
// Data normalization
|
||||||
|
normalizeProduct,
|
||||||
|
|
||||||
|
// URL helpers
|
||||||
buildMenuUrl,
|
buildMenuUrl,
|
||||||
|
buildCustomDomainUrl,
|
||||||
|
extractStoreId,
|
||||||
|
|
||||||
|
// Age gate
|
||||||
|
bypassAgeGate,
|
||||||
|
|
||||||
// Configuration
|
// Configuration
|
||||||
TREEZ_CONFIG,
|
TREEZ_CONFIG,
|
||||||
|
|
||||||
// Types
|
|
||||||
type TreezSession,
|
|
||||||
type TreezStoreInfo,
|
|
||||||
type TreezProductRaw,
|
|
||||||
} from './client';
|
} from './client';
|
||||||
|
|
||||||
// High-level Query Functions
|
// ============================================================
|
||||||
export {
|
// TYPES
|
||||||
fetchProductsByStoreId,
|
// ============================================================
|
||||||
fetchProductsFromUrl,
|
|
||||||
extractStoreIdFromUrl,
|
|
||||||
validateStoreId,
|
|
||||||
getMenuUrl,
|
|
||||||
|
|
||||||
// Types
|
export type {
|
||||||
type FetchProductsResult,
|
// Raw API response types
|
||||||
} from './queries';
|
TreezProductRaw,
|
||||||
|
TreezProductDataRaw,
|
||||||
|
TreezDiscountRaw,
|
||||||
|
TreezImageRaw,
|
||||||
|
TreezInventoryRaw,
|
||||||
|
TreezLabResultRaw,
|
||||||
|
TreezPricingRaw,
|
||||||
|
TreezProductGroupRaw,
|
||||||
|
|
||||||
// Re-export CrawlRotator types from canonical location
|
// Normalized types
|
||||||
export type { CrawlRotator, Proxy, ProxyStats } from '../../services/crawl-rotator';
|
TreezProduct,
|
||||||
|
TreezStore,
|
||||||
|
|
||||||
|
// Session types
|
||||||
|
TreezSession,
|
||||||
|
TreezConfig,
|
||||||
|
|
||||||
|
// Response types
|
||||||
|
TreezESResponse,
|
||||||
|
CapturedResponse,
|
||||||
|
} from './types';
|
||||||
|
|
||||||
|
// Re-export CrawlRotator types for convenience
|
||||||
|
export type { CrawlRotator, Proxy, ProxyStats, BrowserFingerprint } from '../../services/crawl-rotator';
|
||||||
|
|||||||
@@ -1,71 +1,212 @@
|
|||||||
/**
|
/**
|
||||||
* Treez High-Level Query Functions
|
* ============================================================
|
||||||
|
* TREEZ HIGH-LEVEL QUERY FUNCTIONS
|
||||||
|
* ============================================================
|
||||||
*
|
*
|
||||||
* Wraps the low-level client methods with business logic
|
* Wraps the low-level client methods with business logic
|
||||||
* for common operations like product fetching.
|
* for common operations like product fetching.
|
||||||
|
*
|
||||||
|
* Use these functions for most Treez operations - they handle
|
||||||
|
* session management automatically.
|
||||||
|
*
|
||||||
|
* ============================================================
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import {
|
import {
|
||||||
startSession,
|
startSession,
|
||||||
endSession,
|
endSession,
|
||||||
fetchAllProducts,
|
fetchAllProducts,
|
||||||
|
fetchAllBrands,
|
||||||
|
normalizeProduct,
|
||||||
buildMenuUrl,
|
buildMenuUrl,
|
||||||
TreezProductRaw,
|
buildCustomDomainUrl,
|
||||||
TreezStoreInfo,
|
extractStoreId,
|
||||||
|
setCrawlRotator,
|
||||||
} from './client';
|
} from './client';
|
||||||
|
|
||||||
|
import type { TreezProductRaw, TreezProduct } from './types';
|
||||||
|
import type { CrawlRotator } from '../../services/crawl-rotator';
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// RESULT TYPES
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
 * Outcome of a product fetch operation.
 */
export interface FetchProductsResult {
  // Products exactly as captured from the Elasticsearch API.
  products: TreezProductRaw[];

  // The same products after normalization, ready for the database.
  normalized: TreezProduct[];

  // Number of unique products captured.
  totalCaptured: number;

  // Store ID extracted from the URL; null when none could be extracted.
  storeId: string | null;

  // URL that was fetched.
  sourceUrl: string;

  // Time at which the fetch completed.
  fetchedAt: Date;
}
|
||||||
|
|
||||||
|
/**
 * Outcome of a brand fetch operation.
 */
export interface FetchBrandsResult {
  // Brands found, each with its display name and link target.
  brands: { name: string; href: string }[];

  // Number of brands found.
  totalBrands: number;

  // Store URL the brands were fetched from.
  sourceUrl: string;
}
|
||||||
|
|
||||||
// ============================================================
|
// ============================================================
|
||||||
// PRODUCT OPERATIONS
|
// PRODUCT OPERATIONS
|
||||||
// ============================================================
|
// ============================================================
|
||||||
|
|
||||||
/**
 * Fetch all products from a Treez store by store ID
 *
 * This is the main entry point for product discovery.
 * Handles session management, CDP interception, and normalization.
 *
 * The session is always ended and the crawl rotator is always reset to
 * `null` when the call finishes, whether it succeeds or fails. This also
 * clears a rotator installed before the call.
 *
 * @param storeId - Treez store slug (e.g., "best")
 * @param rotator - Optional CrawlRotator for proxy/fingerprint management
 * @returns Products and metadata
 *
 * @example
 * ```typescript
 * const result = await fetchProductsByStoreId('best');
 * console.log(`Found ${result.totalCaptured} products`);
 * ```
 */
export async function fetchProductsByStoreId(
  storeId: string,
  rotator?: CrawlRotator
): Promise<FetchProductsResult> {
  const menuUrl = buildMenuUrl(storeId);

  try {
    // Set rotator if provided
    if (rotator) {
      setCrawlRotator(rotator);
    }

    // Start session and fetch
    await startSession(storeId);
    const products = await fetchAllProducts(menuUrl);

    // Normalize all products
    const normalized = products.map((product) => normalizeProduct(product));

    return {
      products,
      normalized,
      totalCaptured: products.length,
      storeId,
      sourceUrl: menuUrl,
      fetchedAt: new Date(),
    };
  } finally {
    // Reset the rotator even when endSession() rejects; previously a failing
    // endSession() skipped the reset and left the rotator installed.
    try {
      await endSession();
    } finally {
      setCrawlRotator(null);
    }
  }
}
|
||||||
|
|
||||||
/**
 * Fetch all products from a custom domain URL
 *
 * Use this for stores with custom domains like shop.bestdispensary.com
 * instead of best.treez.io
 *
 * The store ID is taken from the URL when it is a `treez.io` URL; otherwise
 * the session is started without one and `storeId` in the result is `null`.
 * The session is always ended and the crawl rotator is always reset to
 * `null` when the call finishes, whether it succeeds or fails.
 *
 * @param menuUrl - Full URL to the store menu
 * @param rotator - Optional CrawlRotator for proxy/fingerprint management
 * @returns Products and metadata
 *
 * @example
 * ```typescript
 * const result = await fetchProductsFromUrl('https://shop.bestdispensary.com/shop');
 * ```
 */
export async function fetchProductsFromUrl(
  menuUrl: string,
  rotator?: CrawlRotator
): Promise<FetchProductsResult> {
  const storeId = extractStoreId(menuUrl);

  try {
    if (rotator) {
      setCrawlRotator(rotator);
    }

    await startSession(storeId || undefined);
    const products = await fetchAllProducts(menuUrl);
    const normalized = products.map((product) => normalizeProduct(product));

    return {
      products,
      normalized,
      totalCaptured: products.length,
      storeId,
      sourceUrl: menuUrl,
      fetchedAt: new Date(),
    };
  } finally {
    // Reset the rotator even when endSession() rejects; previously a failing
    // endSession() skipped the reset and left the rotator installed.
    try {
      await endSession();
    } finally {
      setCrawlRotator(null);
    }
  }
}
|
||||||
|
|
||||||
// ============================================================
|
// ============================================================
|
||||||
// STORE OPERATIONS
|
// BRAND OPERATIONS
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
 * Fetch all brands from a Treez store
 *
 * Starts a session, collects the brand list via `fetchAllBrands`, and
 * always ends the session and resets the crawl rotator to `null` when the
 * call finishes, whether it succeeds or fails.
 *
 * @param storeUrl - Base store URL (e.g., https://shop.bestdispensary.com)
 * @param rotator - Optional CrawlRotator
 * @returns List of brands with their page URLs
 *
 * @example
 * ```typescript
 * const result = await fetchBrandsFromStore('https://shop.bestdispensary.com');
 * result.brands.forEach(b => console.log(b.name));
 * ```
 */
export async function fetchBrandsFromStore(
  storeUrl: string,
  rotator?: CrawlRotator
): Promise<FetchBrandsResult> {
  try {
    if (rotator) {
      setCrawlRotator(rotator);
    }

    await startSession();
    const brands = await fetchAllBrands(storeUrl);

    return {
      brands,
      totalBrands: brands.length,
      sourceUrl: storeUrl,
    };
  } finally {
    // Reset the rotator even when endSession() rejects; previously a failing
    // endSession() skipped the reset and left the rotator installed.
    try {
      await endSession();
    } finally {
      setCrawlRotator(null);
    }
  }
}
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// STORE VALIDATION
|
||||||
// ============================================================
|
// ============================================================
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -73,26 +214,20 @@ export async function fetchProductsFromUrl(menuUrl: string): Promise<FetchProduc
|
|||||||
*
|
*
|
||||||
* Supports formats:
|
* Supports formats:
|
||||||
* - https://best.treez.io/onlinemenu/
|
* - https://best.treez.io/onlinemenu/
|
||||||
* - https://shop.bestdispensary.com/ (resolves to best.treez.io)
|
* - Custom domains return null (need to follow redirect)
|
||||||
*
|
*
|
||||||
* @param url - Treez menu URL
|
* @param url - Treez menu URL
|
||||||
* @returns Store ID or null if not found
|
* @returns Store ID or null if not found
|
||||||
*/
|
*/
|
||||||
export function extractStoreIdFromUrl(url: string): string | null {
  // Thin alias: the actual parsing lives in the client's extractStoreId.
  const storeId = extractStoreId(url);
  return storeId;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Validate that a store ID exists and is accessible
|
* Validate that a store ID exists and is accessible
|
||||||
*
|
*
|
||||||
|
* Attempts to load the store page and checks for 404
|
||||||
|
*
|
||||||
* @param storeId - Treez store ID
|
* @param storeId - Treez store ID
|
||||||
* @returns True if store is accessible
|
* @returns True if store is accessible
|
||||||
*/
|
*/
|
||||||
@@ -100,7 +235,11 @@ export async function validateStoreId(storeId: string): Promise<boolean> {
|
|||||||
try {
|
try {
|
||||||
await startSession(storeId);
|
await startSession(storeId);
|
||||||
|
|
||||||
const { page } = (await import('./client')).getCurrentSession()!;
|
const { getCurrentSession } = await import('./client');
|
||||||
|
const session = getCurrentSession();
|
||||||
|
if (!session) return false;
|
||||||
|
|
||||||
|
const { page } = session;
|
||||||
const url = buildMenuUrl(storeId);
|
const url = buildMenuUrl(storeId);
|
||||||
|
|
||||||
await page.goto(url, {
|
await page.goto(url, {
|
||||||
@@ -121,12 +260,27 @@ export async function validateStoreId(storeId: string): Promise<boolean> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// ============================================================
|
// ============================================================
|
||||||
// UTILITY FUNCTIONS
|
// URL HELPERS
|
||||||
// ============================================================
|
// ============================================================
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the direct Treez menu URL for a store
|
* Get the direct Treez menu URL for a store
|
||||||
|
*
|
||||||
|
* @param storeId - Store slug (e.g., "best")
|
||||||
|
* @param customerType - ADULT (recreational) or MEDICAL
|
||||||
|
* @returns Full menu URL
|
||||||
*/
|
*/
|
||||||
export function getMenuUrl(storeId: string, customerType: 'ADULT' | 'MEDICAL' = 'ADULT'): string {
|
export function getMenuUrl(storeId: string, customerType: 'ADULT' | 'MEDICAL' = 'ADULT'): string {
|
||||||
return buildMenuUrl(storeId, customerType);
|
return buildMenuUrl(storeId, customerType);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get menu URL for a custom domain
|
||||||
|
*
|
||||||
|
* @param domain - Custom domain (e.g., shop.bestdispensary.com)
|
||||||
|
* @param path - Path to menu (default: /shop)
|
||||||
|
* @returns Full menu URL
|
||||||
|
*/
|
||||||
|
export function getCustomDomainUrl(domain: string, path: string = '/shop'): string {
|
||||||
|
return buildCustomDomainUrl(domain, path);
|
||||||
|
}
|
||||||
|
|||||||
285
backend/src/platforms/treez/types.ts
Normal file
285
backend/src/platforms/treez/types.ts
Normal file
@@ -0,0 +1,285 @@
|
|||||||
|
/**
|
||||||
|
* ============================================================
|
||||||
|
* TREEZ PLATFORM TYPES
|
||||||
|
* ============================================================
|
||||||
|
*
|
||||||
|
* TypeScript interfaces for Treez platform data structures.
|
||||||
|
* Based on Elasticsearch API responses captured via CDP interception.
|
||||||
|
*
|
||||||
|
* ============================================================
|
||||||
|
*/
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// RAW API RESPONSE TYPES
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Raw product data from Treez Elasticsearch API
|
||||||
|
*/
|
||||||
|
export interface TreezProductRaw {
|
||||||
|
id: string;
|
||||||
|
name: string;
|
||||||
|
menuTitle: string;
|
||||||
|
brand: string;
|
||||||
|
category: string;
|
||||||
|
subtype: string;
|
||||||
|
slug: string;
|
||||||
|
oldSlug?: string;
|
||||||
|
status: string;
|
||||||
|
|
||||||
|
// Inventory
|
||||||
|
availableUnits: number;
|
||||||
|
|
||||||
|
// Pricing
|
||||||
|
customMinPrice: number;
|
||||||
|
customMaxPrice: number;
|
||||||
|
customOnSaleValue?: number;
|
||||||
|
|
||||||
|
// Visibility
|
||||||
|
isAboveThreshold: boolean;
|
||||||
|
isActive: boolean;
|
||||||
|
isHideFromMenu: boolean;
|
||||||
|
customCustomerType: 'ADULT' | 'MEDICAL' | 'BOTH';
|
||||||
|
|
||||||
|
// Attributes
|
||||||
|
effects: string[];
|
||||||
|
flavors: string[];
|
||||||
|
generals: string[];
|
||||||
|
ingredients: string[];
|
||||||
|
internalTags: string[];
|
||||||
|
|
||||||
|
// Inventory IDs
|
||||||
|
customInventoryIds: string[];
|
||||||
|
customInjectionDate: string;
|
||||||
|
|
||||||
|
// Extended product data
|
||||||
|
productData: TreezProductDataRaw;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extended product data from productData field
|
||||||
|
*/
|
||||||
|
export interface TreezProductDataRaw {
|
||||||
|
barcodes: string[];
|
||||||
|
discounts: TreezDiscountRaw[];
|
||||||
|
images: TreezImageRaw[];
|
||||||
|
inventory: TreezInventoryRaw[];
|
||||||
|
isCannabis: boolean;
|
||||||
|
labResults: TreezLabResultRaw[];
|
||||||
|
pricing: TreezPricingRaw;
|
||||||
|
productGroups: TreezProductGroupRaw[];
|
||||||
|
lastUpdateDate: string;
|
||||||
|
createdDate: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Discount information
|
||||||
|
*/
|
||||||
|
export interface TreezDiscountRaw {
|
||||||
|
discountId: string;
|
||||||
|
discountTitle: string;
|
||||||
|
discountAffinity: string;
|
||||||
|
discountAmount: number;
|
||||||
|
discountMethod: 'PERCENT' | 'FLAT';
|
||||||
|
discountStackable: string;
|
||||||
|
discountConditions: Array<{ type: string; value: string }>;
|
||||||
|
discountProductGroups: string[];
|
||||||
|
discountProductGroupsRequired: string[];
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Product image
|
||||||
|
*/
|
||||||
|
export interface TreezImageRaw {
|
||||||
|
url: string;
|
||||||
|
isPrimary?: boolean;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Location-level inventory
|
||||||
|
*/
|
||||||
|
export interface TreezInventoryRaw {
|
||||||
|
locationId: string;
|
||||||
|
locationName: string;
|
||||||
|
customerType: string;
|
||||||
|
availableUnits: number;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Lab test results
|
||||||
|
*/
|
||||||
|
export interface TreezLabResultRaw {
|
||||||
|
cannabinoid?: string;
|
||||||
|
value?: number;
|
||||||
|
unit?: string;
|
||||||
|
testDate?: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Pricing information
|
||||||
|
*/
|
||||||
|
export interface TreezPricingRaw {
|
||||||
|
priceType: string;
|
||||||
|
priceSell: number;
|
||||||
|
postTaxPriceSell: number;
|
||||||
|
discountedPrice: number;
|
||||||
|
discountAmount: number;
|
||||||
|
discountPercent: number;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Product group membership
|
||||||
|
*/
|
||||||
|
export interface TreezProductGroupRaw {
|
||||||
|
id: string;
|
||||||
|
name: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// NORMALIZED TYPES (for use in handlers)
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalized Treez product for internal use
|
||||||
|
*/
|
||||||
|
export interface TreezProduct {
|
||||||
|
// Identity
|
||||||
|
id: string;
|
||||||
|
name: string;
|
||||||
|
brand: string;
|
||||||
|
slug: string;
|
||||||
|
|
||||||
|
// Classification
|
||||||
|
category: string;
|
||||||
|
subtype: string;
|
||||||
|
|
||||||
|
// Inventory
|
||||||
|
availableUnits: number;
|
||||||
|
inStock: boolean;
|
||||||
|
inventoryByLocation: Array<{
|
||||||
|
locationId: string;
|
||||||
|
locationName: string;
|
||||||
|
availableUnits: number;
|
||||||
|
}>;
|
||||||
|
|
||||||
|
// Pricing
|
||||||
|
price: number;
|
||||||
|
priceMin: number;
|
||||||
|
priceMax: number;
|
||||||
|
discountedPrice: number | null;
|
||||||
|
discountPercent: number;
|
||||||
|
|
||||||
|
// Cannabinoids
|
||||||
|
thcPercent: number | null;
|
||||||
|
cbdPercent: number | null;
|
||||||
|
|
||||||
|
// Attributes
|
||||||
|
strainType: string | null; // Indica, Sativa, Hybrid
|
||||||
|
effects: string[];
|
||||||
|
flavors: string[];
|
||||||
|
isCannabis: boolean;
|
||||||
|
|
||||||
|
// Media
|
||||||
|
imageUrl: string | null;
|
||||||
|
images: string[];
|
||||||
|
|
||||||
|
// Status
|
||||||
|
isActive: boolean;
|
||||||
|
customerType: 'ADULT' | 'MEDICAL' | 'BOTH';
|
||||||
|
|
||||||
|
// Timestamps
|
||||||
|
lastUpdated: string;
|
||||||
|
createdAt: string;
|
||||||
|
|
||||||
|
// Full raw data preserved
|
||||||
|
raw: TreezProductRaw;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Store/dispensary information from Treez
|
||||||
|
*/
|
||||||
|
export interface TreezStore {
|
||||||
|
storeId: string;
|
||||||
|
name: string;
|
||||||
|
address?: string;
|
||||||
|
city?: string;
|
||||||
|
state?: string;
|
||||||
|
zip?: string;
|
||||||
|
lat?: number;
|
||||||
|
lng?: number;
|
||||||
|
phone?: string;
|
||||||
|
isRecreational: boolean;
|
||||||
|
isMedical: boolean;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// SESSION TYPES
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
import type { Browser, Page, CDPSession } from 'puppeteer';
|
||||||
|
import type { BrowserFingerprint } from '../../services/crawl-rotator';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Active Treez browser session
|
||||||
|
*/
|
||||||
|
export interface TreezSession {
|
||||||
|
sessionId: string;
|
||||||
|
browser: Browser;
|
||||||
|
page: Page;
|
||||||
|
cdpClient: CDPSession;
|
||||||
|
fingerprint: BrowserFingerprint;
|
||||||
|
proxyUrl: string | null;
|
||||||
|
startedAt: Date;
|
||||||
|
storeId?: string;
|
||||||
|
capturedProducts: TreezProductRaw[];
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// API CONFIGURATION
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Treez API endpoints and configuration
|
||||||
|
*/
|
||||||
|
export interface TreezConfig {
|
||||||
|
// Elasticsearch API (main product data)
|
||||||
|
esEndpoint: string;
|
||||||
|
esApiKey: string;
|
||||||
|
|
||||||
|
// Treez Headless API (discounts, etc.)
|
||||||
|
headlessApiBase: string;
|
||||||
|
clientId: string;
|
||||||
|
clientSecret: string;
|
||||||
|
|
||||||
|
// Timeouts
|
||||||
|
navigationTimeout: number;
|
||||||
|
scrollDelay: number;
|
||||||
|
maxScrollAttempts: number;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// RESPONSE TYPES
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Elasticsearch API response structure
|
||||||
|
*/
|
||||||
|
export interface TreezESResponse {
|
||||||
|
hits: {
|
||||||
|
total: { value: number };
|
||||||
|
hits: Array<{
|
||||||
|
_source: TreezProductRaw;
|
||||||
|
}>;
|
||||||
|
};
|
||||||
|
aggregations?: any;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Captured API response
|
||||||
|
*/
|
||||||
|
export interface CapturedResponse {
|
||||||
|
type: 'products' | 'discounts' | 'other';
|
||||||
|
url: string;
|
||||||
|
data: any;
|
||||||
|
timestamp: Date;
|
||||||
|
}
|
||||||
@@ -1,15 +1,27 @@
|
|||||||
/**
|
/**
|
||||||
* Treez Product Discovery Handler
|
* ============================================================
|
||||||
|
* TREEZ PRODUCT DISCOVERY HANDLER
|
||||||
|
* ============================================================
|
||||||
*
|
*
|
||||||
* Fetches all products from a Treez store via Puppeteer + DOM scraping.
|
* Fetches all products from a Treez store via Puppeteer + CDP interception.
|
||||||
*
|
*
|
||||||
* Flow:
|
* HOW IT WORKS:
|
||||||
|
* Treez uses Cloudflare + headless detection on their Elasticsearch API.
|
||||||
|
* We bypass this by:
|
||||||
|
* 1. Using Puppeteer with Stealth plugin
|
||||||
|
* 2. Intercepting ES API responses via CDP (Chrome DevTools Protocol)
|
||||||
|
* 3. Scrolling to trigger all product loads
|
||||||
|
*
|
||||||
|
* FLOW:
|
||||||
* 1. Load dispensary with platform_dispensary_id (store slug)
|
* 1. Load dispensary with platform_dispensary_id (store slug)
|
||||||
* 2. Navigate to menu URL, bypass age gate
|
* 2. Start Puppeteer session with Stealth plugin
|
||||||
* 3. Scroll to load all products (infinite scroll)
|
* 3. Navigate to menu, bypass age gate if present
|
||||||
* 4. Extract products from DOM
|
* 4. Scroll to load all products (triggers ES API calls)
|
||||||
* 5. Save raw payload to filesystem
|
* 5. CDP intercepts ES responses and captures product data
|
||||||
* 6. Queue product_refresh task for normalization
|
* 6. Save raw payload to filesystem
|
||||||
|
* 7. Queue product_refresh task for normalization
|
||||||
|
*
|
||||||
|
* ============================================================
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import { TaskContext, TaskResult } from '../task-worker';
|
import { TaskContext, TaskResult } from '../task-worker';
|
||||||
@@ -90,17 +102,16 @@ export async function handleProductDiscoveryTreez(ctx: TaskContext): Promise<Tas
|
|||||||
console.log(`[TreezProductDiscovery] Captured ${result.products.length} products`);
|
console.log(`[TreezProductDiscovery] Captured ${result.products.length} products`);
|
||||||
|
|
||||||
// Build payload for storage
|
// Build payload for storage
|
||||||
|
// result.products = raw Elasticsearch data
|
||||||
|
// result.normalized = parsed/normalized products
|
||||||
const rawPayload = {
|
const rawPayload = {
|
||||||
products: result.products, // Store the scraped product data
|
products: result.products, // Raw ES product data
|
||||||
store: {
|
normalized: result.normalized, // Parsed product data
|
||||||
storeId: result.store.storeId,
|
storeId: result.storeId,
|
||||||
name: result.store.name,
|
sourceUrl: result.sourceUrl,
|
||||||
url: result.store.url,
|
capturedAt: result.fetchedAt.toISOString(),
|
||||||
},
|
|
||||||
capturedAt: new Date().toISOString(),
|
|
||||||
platform: 'treez',
|
platform: 'treez',
|
||||||
dispensaryId,
|
dispensaryId,
|
||||||
scrollCount: result.scrollCount,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// Save raw payload to filesystem (platform = 'treez')
|
// Save raw payload to filesystem (platform = 'treez')
|
||||||
@@ -140,14 +151,11 @@ export async function handleProductDiscoveryTreez(ctx: TaskContext): Promise<Tas
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
success: true,
|
success: true,
|
||||||
productCount: result.products.length,
|
productCount: result.totalCaptured,
|
||||||
payloadId,
|
payloadId,
|
||||||
payloadSizeKB: Math.round(sizeBytes / 1024),
|
payloadSizeKB: Math.round(sizeBytes / 1024),
|
||||||
storeInfo: {
|
storeId: result.storeId,
|
||||||
storeId: result.store.storeId,
|
sourceUrl: result.sourceUrl,
|
||||||
name: result.store.name,
|
|
||||||
},
|
|
||||||
scrollCount: result.scrollCount,
|
|
||||||
queuedProductRefresh: true,
|
queuedProductRefresh: true,
|
||||||
};
|
};
|
||||||
} catch (error: unknown) {
|
} catch (error: unknown) {
|
||||||
|
|||||||
Reference in New Issue
Block a user