From 023cfc127f5c664c6681adb4fa1b33703161669f Mon Sep 17 00:00:00 2001 From: Kelly Date: Sat, 13 Dec 2025 16:40:52 -0700 Subject: [PATCH] fix(preflight): Apply stored fingerprint to task browser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add WorkerFingerprint interface with timezone, city, state, ip, locale - Store fingerprint in TaskWorker after preflight passes - Pass fingerprint through TaskContext to handlers - Apply timezone via CDP and locale via Accept-Language header - Ensure the browser fingerprint matches the proxy IP location This fixes bot detection where a timezone/locale mismatch between the browser and the proxy IP caused requests to be blocked by Cloudflare. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- backend/scripts/count-jane-stores-v2.ts | 130 ++++ backend/scripts/count-jane-stores.ts | 98 +++ backend/scripts/explore-treez-structure.ts | 247 ++++++++ backend/scripts/test-iheartjane.ts | 188 ++++++ backend/scripts/test-jane-api-explore.ts | 224 +++++++ backend/scripts/test-jane-client.ts | 126 ++++ backend/scripts/test-jane-med-rec-compare.ts | 55 ++ backend/scripts/test-jane-med-rec-diff.ts | 79 +++ backend/scripts/test-jane-payload.ts | 35 ++ backend/scripts/test-treez-client.ts | 113 ++++ backend/scripts/test-treez-discovery.ts | 559 +++++++++++++++++ backend/src/hydration/normalizers/treez.ts | 227 +++++++ backend/src/platforms/treez/client.ts | 570 ++++++++++++++++++ backend/src/platforms/treez/index.ts | 50 ++ backend/src/platforms/treez/queries.ts | 132 ++++ backend/src/tasks/handlers/index.ts | 3 + .../handlers/product-discovery-dutchie.ts | 22 + .../tasks/handlers/product-discovery-treez.ts | 172 ++++++ backend/src/tasks/handlers/product-refresh.ts | 2 + backend/src/tasks/task-worker.ts | 47 +- cannaiq/dist/index.html | 4 +- cannaiq/src/pages/TasksDashboard.tsx | 5 +- 22 files changed, 3083 insertions(+), 5 deletions(-) create mode 100644 
backend/scripts/count-jane-stores-v2.ts create mode 100644 backend/scripts/count-jane-stores.ts create mode 100644 backend/scripts/explore-treez-structure.ts create mode 100644 backend/scripts/test-iheartjane.ts create mode 100644 backend/scripts/test-jane-api-explore.ts create mode 100644 backend/scripts/test-jane-client.ts create mode 100644 backend/scripts/test-jane-med-rec-compare.ts create mode 100644 backend/scripts/test-jane-med-rec-diff.ts create mode 100644 backend/scripts/test-jane-payload.ts create mode 100644 backend/scripts/test-treez-client.ts create mode 100644 backend/scripts/test-treez-discovery.ts create mode 100644 backend/src/hydration/normalizers/treez.ts create mode 100644 backend/src/platforms/treez/client.ts create mode 100644 backend/src/platforms/treez/index.ts create mode 100644 backend/src/platforms/treez/queries.ts create mode 100644 backend/src/tasks/handlers/product-discovery-treez.ts diff --git a/backend/scripts/count-jane-stores-v2.ts b/backend/scripts/count-jane-stores-v2.ts new file mode 100644 index 00000000..21aa55a6 --- /dev/null +++ b/backend/scripts/count-jane-stores-v2.ts @@ -0,0 +1,130 @@ +/** + * Count Jane stores - v2: Try Algolia store search + * Usage: npx ts-node scripts/count-jane-stores-v2.ts + */ + +import puppeteer from 'puppeteer-extra'; +import StealthPlugin from 'puppeteer-extra-plugin-stealth'; + +puppeteer.use(StealthPlugin()); + +const STATES = [ + 'AZ', 'CA', 'CO', 'FL', 'IL', 'MA', 'MI', 'NV', 'NJ', 'NY', 'OH', 'PA', 'WA', 'OR' +]; + +async function main() { + console.log('Counting Jane stores by exploring state pages...\n'); + + const browser = await puppeteer.launch({ + headless: true, + args: ['--no-sandbox', '--disable-setuid-sandbox'], + }); + + const page = await browser.newPage(); + const allStores: Map = new Map(); + + await page.setRequestInterception(true); + page.on('request', (req) => { + const type = req.resourceType(); + if (['image', 'font', 'media', 'stylesheet'].includes(type)) { + 
req.abort(); + } else { + req.continue(); + } + }); + + page.on('response', async (response) => { + const url = response.url(); + const contentType = response.headers()['content-type'] || ''; + if (url.includes('iheartjane.com') && contentType.includes('json')) { + try { + const json = await response.json(); + // Look for stores in any response + if (json.stores && Array.isArray(json.stores)) { + for (const s of json.stores) { + if (s.id) allStores.set(s.id, s); + } + } + // Also check hits (Algolia format) + if (json.hits && Array.isArray(json.hits)) { + for (const s of json.hits) { + if (s.id) allStores.set(s.id, s); + } + } + } catch {} + } + }); + + // First visit the main stores page + console.log('Visiting main stores page...'); + await page.goto('https://www.iheartjane.com/stores', { + waitUntil: 'networkidle0', + timeout: 60000, + }); + await new Promise(r => setTimeout(r, 3000)); + + // Try to scroll to load more stores + console.log('Scrolling to load more...'); + for (let i = 0; i < 5; i++) { + await page.evaluate(() => window.scrollBy(0, 1000)); + await new Promise(r => setTimeout(r, 1000)); + } + + // Try clicking "Load More" if it exists + try { + const loadMore = await page.$('button:has-text("Load More"), [class*="load-more"]'); + if (loadMore) { + console.log('Clicking Load More...'); + await loadMore.click(); + await new Promise(r => setTimeout(r, 3000)); + } + } catch {} + + // Extract stores from DOM as fallback + const domStores = await page.evaluate(() => { + const storeElements = document.querySelectorAll('[data-store-id], [class*="StoreCard"], [class*="store-card"]'); + return storeElements.length; + }); + + console.log(`\nStores from DOM elements: ${domStores}`); + + await browser.close(); + + // Count by state + const byState: Record = {}; + for (const store of allStores.values()) { + const state = store.state || 'Unknown'; + byState[state] = (byState[state] || 0) + 1; + } + + console.log('\n=== JANE STORE COUNTS ===\n'); + 
console.log(`Unique stores captured: ${allStores.size}`); + + if (allStores.size > 0) { + console.log('\nBy State:'); + const sorted = Object.entries(byState).sort((a, b) => b[1] - a[1]); + for (const [state, count] of sorted.slice(0, 20)) { + console.log(` ${state}: ${count}`); + } + + // Check Arizona specifically + const azStores = Array.from(allStores.values()).filter(s => + s.state === 'Arizona' || s.state === 'AZ' + ); + console.log(`\nArizona stores: ${azStores.length}`); + if (azStores.length > 0) { + console.log('AZ stores:'); + for (const s of azStores.slice(0, 10)) { + console.log(` - ${s.name} (ID: ${s.id}) - ${s.city}`); + } + } + } + + // Note about total + console.log('\n--- Note ---'); + console.log('Jane uses server-side rendering. To get full store count,'); + console.log('you may need to check their public marketing materials or'); + console.log('iterate through known store IDs.'); +} + +main().catch(console.error); diff --git a/backend/scripts/count-jane-stores.ts b/backend/scripts/count-jane-stores.ts new file mode 100644 index 00000000..32720a29 --- /dev/null +++ b/backend/scripts/count-jane-stores.ts @@ -0,0 +1,98 @@ +/** + * Count Jane stores by state + * Usage: npx ts-node scripts/count-jane-stores.ts + */ + +import puppeteer from 'puppeteer-extra'; +import StealthPlugin from 'puppeteer-extra-plugin-stealth'; + +puppeteer.use(StealthPlugin()); + +async function main() { + console.log('Counting Jane stores...\n'); + + const browser = await puppeteer.launch({ + headless: true, + args: ['--no-sandbox', '--disable-setuid-sandbox'], + }); + + const page = await browser.newPage(); + + // Capture store data from API + const stores: any[] = []; + + await page.setRequestInterception(true); + page.on('request', (req) => { + const type = req.resourceType(); + if (['image', 'font', 'media', 'stylesheet'].includes(type)) { + req.abort(); + } else { + req.continue(); + } + }); + + page.on('response', async (response) => { + const url = response.url(); + 
if (url.includes('iheartjane.com') && url.includes('stores')) { + try { + const json = await response.json(); + if (json.stores && Array.isArray(json.stores)) { + stores.push(...json.stores); + } + } catch {} + } + }); + + // Visit the store directory + console.log('Loading Jane store directory...'); + await page.goto('https://www.iheartjane.com/stores', { + waitUntil: 'networkidle2', + timeout: 60000, + }); + + // Wait for stores to load + await new Promise(r => setTimeout(r, 5000)); + + // Also try to get store count from page content + const pageStoreCount = await page.evaluate(() => { + // Look for store count in page text + const text = document.body.innerText; + const match = text.match(/(\d+)\s*stores?/i); + return match ? parseInt(match[1]) : null; + }); + + await browser.close(); + + // Count by state + const byState: Record = {}; + for (const store of stores) { + const state = store.state || 'Unknown'; + byState[state] = (byState[state] || 0) + 1; + } + + console.log('\n=== JANE STORE COUNTS ===\n'); + console.log(`Total stores captured from API: ${stores.length}`); + if (pageStoreCount) { + console.log(`Page claims: ${pageStoreCount} stores`); + } + + console.log('\nBy State:'); + const sorted = Object.entries(byState).sort((a, b) => b[1] - a[1]); + for (const [state, count] of sorted) { + console.log(` ${state}: ${count}`); + } + + // Check Arizona specifically + const azStores = stores.filter(s => + s.state === 'Arizona' || s.state === 'AZ' + ); + console.log(`\nArizona stores: ${azStores.length}`); + if (azStores.length > 0) { + console.log('Sample AZ stores:'); + for (const s of azStores.slice(0, 5)) { + console.log(` - ${s.name} (ID: ${s.id}) - ${s.city}`); + } + } +} + +main().catch(console.error); diff --git a/backend/scripts/explore-treez-structure.ts b/backend/scripts/explore-treez-structure.ts new file mode 100644 index 00000000..dff1f38b --- /dev/null +++ b/backend/scripts/explore-treez-structure.ts @@ -0,0 +1,247 @@ +/** + * Explore Treez 
site structure to find full product catalog + * + * Usage: npx ts-node scripts/explore-treez-structure.ts + */ + +import puppeteer from 'puppeteer'; + +const STORE_ID = 'best'; + +async function sleep(ms: number): Promise { + return new Promise(resolve => setTimeout(resolve, ms)); +} + +async function main() { + console.log('='.repeat(60)); + console.log('Exploring Treez Site Structure'); + console.log('='.repeat(60)); + + const browser = await puppeteer.launch({ + headless: true, + args: ['--no-sandbox', '--disable-setuid-sandbox'], + }); + + const page = await browser.newPage(); + await page.setViewport({ width: 1920, height: 1080 }); + + try { + // Navigate to base menu URL + const baseUrl = `https://${STORE_ID}.treez.io/onlinemenu/?customerType=ADULT`; + console.log(`\n[1] Navigating to: ${baseUrl}`); + await page.goto(baseUrl, { waitUntil: 'networkidle2', timeout: 60000 }); + await sleep(3000); + + // Bypass age gate if present + const ageGate = await page.$('[data-testid="age-gate-modal"]'); + if (ageGate) { + console.log('[1] Age gate detected, bypassing...'); + const btn = await page.$('[data-testid="age-gate-submit-button"]'); + if (btn) await btn.click(); + await sleep(2000); + } + + // Get all navigation links + console.log('\n[2] Extracting navigation structure...'); + const navInfo = await page.evaluate(() => { + const links: { text: string; href: string }[] = []; + + // Look for nav links + document.querySelectorAll('nav a, [class*="nav"] a, [class*="menu"] a, header a').forEach(el => { + const text = el.textContent?.trim() || ''; + const href = el.getAttribute('href') || ''; + if (text && href && !links.some(l => l.href === href)) { + links.push({ text, href }); + } + }); + + // Look for category tabs/buttons + document.querySelectorAll('[class*="category"], [class*="tab"], [role="tab"]').forEach(el => { + const text = el.textContent?.trim() || ''; + const href = el.getAttribute('href') || el.getAttribute('data-href') || ''; + if (text && 
!links.some(l => l.text === text)) { + links.push({ text, href: href || `(click: ${el.className})` }); + } + }); + + // Get current URL + const currentUrl = window.location.href; + + // Count products on page + const productCount = document.querySelectorAll('[class*="product_product__"]').length; + + return { links, currentUrl, productCount }; + }); + + console.log(`Current URL: ${navInfo.currentUrl}`); + console.log(`Products on homepage: ${navInfo.productCount}`); + console.log('\nNavigation links found:'); + navInfo.links.forEach(l => { + console.log(` "${l.text}" → ${l.href}`); + }); + + // Look for category buttons/tabs specifically + console.log('\n[3] Looking for category navigation...'); + const categories = await page.evaluate(() => { + const cats: { text: string; className: string; tagName: string }[] = []; + + // Find all clickable elements that might be categories + const selectors = [ + '[class*="CategoryNav"]', + '[class*="category"]', + '[class*="Category"]', + '[class*="nav"] button', + '[class*="tab"]', + '[role="tablist"] *', + '.MuiTab-root', + '[class*="filter"]', + ]; + + selectors.forEach(sel => { + document.querySelectorAll(sel).forEach(el => { + const text = el.textContent?.trim() || ''; + if (text && text.length < 50 && !cats.some(c => c.text === text)) { + cats.push({ + text, + className: el.className?.toString().slice(0, 80) || '', + tagName: el.tagName, + }); + } + }); + }); + + return cats; + }); + + console.log('Category-like elements:'); + categories.forEach(c => { + console.log(` [${c.tagName}] "${c.text}" (class: ${c.className})`); + }); + + // Try clicking on "Flower" or "All" if found + console.log('\n[4] Looking for "Flower" or "All Products" link...'); + const clickTargets = ['Flower', 'All', 'All Products', 'Shop All', 'View All']; + + for (const target of clickTargets) { + const element = await page.evaluate((targetText) => { + const els = Array.from(document.querySelectorAll('a, button, [role="tab"], [class*="category"]')); + 
const match = els.find(el => + el.textContent?.trim().toLowerCase() === targetText.toLowerCase() + ); + if (match) { + return { + found: true, + text: match.textContent?.trim(), + tag: match.tagName, + }; + } + return { found: false }; + }, target); + + if (element.found) { + console.log(`Found "${element.text}" (${element.tag}), clicking...`); + + await page.evaluate((targetText) => { + const els = Array.from(document.querySelectorAll('a, button, [role="tab"], [class*="category"]')); + const match = els.find(el => + el.textContent?.trim().toLowerCase() === targetText.toLowerCase() + ); + if (match) (match as HTMLElement).click(); + }, target); + + await sleep(3000); + + const newUrl = page.url(); + const newCount = await page.evaluate(() => + document.querySelectorAll('[class*="product_product__"]').length + ); + + console.log(` New URL: ${newUrl}`); + console.log(` Products after click: ${newCount}`); + + if (newCount > navInfo.productCount) { + console.log(` ✓ Found more products! (${navInfo.productCount} → ${newCount})`); + } + + break; + } + } + + // Check page height and scroll behavior + console.log('\n[5] Checking scroll behavior on current page...'); + let previousHeight = 0; + let scrollCount = 0; + let previousProductCount = await page.evaluate(() => + document.querySelectorAll('[class*="product_product__"]').length + ); + + while (scrollCount < 10) { + const currentHeight = await page.evaluate(() => document.body.scrollHeight); + + if (currentHeight === previousHeight) { + console.log(` Scroll ${scrollCount + 1}: No height change, stopping`); + break; + } + + await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight)); + await sleep(1500); + + const currentProductCount = await page.evaluate(() => + document.querySelectorAll('[class*="product_product__"]').length + ); + + console.log(` Scroll ${scrollCount + 1}: height=${currentHeight}, products=${currentProductCount}`); + + if (currentProductCount === previousProductCount && scrollCount > 
2) { + console.log(' No new products loading, stopping'); + break; + } + + previousHeight = currentHeight; + previousProductCount = currentProductCount; + scrollCount++; + } + + // Try direct URL patterns + console.log('\n[6] Testing URL patterns...'); + const urlPatterns = [ + '/onlinemenu/flower?customerType=ADULT', + '/onlinemenu/all?customerType=ADULT', + '/onlinemenu?category=flower&customerType=ADULT', + '/onlinemenu?view=all&customerType=ADULT', + ]; + + for (const pattern of urlPatterns) { + const testUrl = `https://${STORE_ID}.treez.io${pattern}`; + console.log(`\nTrying: ${testUrl}`); + + await page.goto(testUrl, { waitUntil: 'networkidle2', timeout: 30000 }); + await sleep(2000); + + // Bypass age gate again if needed + const gate = await page.$('[data-testid="age-gate-modal"]'); + if (gate) { + const btn = await page.$('[data-testid="age-gate-submit-button"]'); + if (btn) await btn.click(); + await sleep(2000); + } + + const productCount = await page.evaluate(() => + document.querySelectorAll('[class*="product_product__"]').length + ); + + console.log(` Products found: ${productCount}`); + } + + // Screenshot the final state + await page.screenshot({ path: '/tmp/treez-explore.png', fullPage: true }); + console.log('\n[7] Screenshot saved to /tmp/treez-explore.png'); + + } catch (error: any) { + console.error('Error:', error.message); + } finally { + await browser.close(); + } +} + +main().catch(console.error); diff --git a/backend/scripts/test-iheartjane.ts b/backend/scripts/test-iheartjane.ts new file mode 100644 index 00000000..caa43d4e --- /dev/null +++ b/backend/scripts/test-iheartjane.ts @@ -0,0 +1,188 @@ +/** + * One-off script to test iHeartJane scraping + * Mimics remote worker: Puppeteer + stealth + proxy + * + * Usage: npx ts-node scripts/test-iheartjane.ts + */ + +import puppeteer from 'puppeteer-extra'; +import StealthPlugin from 'puppeteer-extra-plugin-stealth'; + +puppeteer.use(StealthPlugin()); + +const TARGET_URL = 
'https://theflowershopusa.com/mesa/menu/'; +const STORE_ID = 2788; + +async function main() { + console.log('[iHeartJane Test] Starting...'); + + // No proxy for local testing + const browser = await puppeteer.launch({ + headless: true, + args: [ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + '--disable-blink-features=AutomationControlled', + ], + }); + + const page = await browser.newPage(); + await page.setViewport({ width: 1920, height: 1080 }); + + // Intercept network requests to capture API calls + const apiResponses: any[] = []; + + await page.setRequestInterception(true); + page.on('request', (req) => { + // Block heavy resources + const type = req.resourceType(); + if (['image', 'font', 'media', 'stylesheet'].includes(type)) { + req.abort(); + } else { + req.continue(); + } + }); + + page.on('response', async (response) => { + const url = response.url(); + const contentType = response.headers()['content-type'] || ''; + + // Capture any JSON response from iheartjane domains + if ((url.includes('iheartjane.com') || url.includes('algolia')) && contentType.includes('json')) { + try { + const json = await response.json(); + const type = url.includes('store') ? 'STORE' : + url.includes('product') ? 'PRODUCT' : + url.includes('algolia') ? 
'ALGOLIA' : 'API'; + apiResponses.push({ type, url, data: json }); + console.log(`[${type}] ${url.substring(0, 120)}...`); + } catch { + // Not JSON + } + } + }); + + console.log(`[iHeartJane Test] Navigating to ${TARGET_URL}`); + + try { + await page.goto(TARGET_URL, { + waitUntil: 'networkidle2', + timeout: 60000, + }); + + console.log('[iHeartJane Test] Menu page loaded, waiting for data...'); + + // Wait a bit for all API calls to complete + await new Promise(r => setTimeout(r, 3000)); + + // Also try to get store info by visiting the store page + console.log('[iHeartJane Test] Fetching store info...'); + const storeInfoUrl = `https://api.iheartjane.com/v1/stores/${STORE_ID}`; + + // Try to fetch store info via page.evaluate (uses browser context) + const storeInfo = await page.evaluate(async (storeId) => { + try { + const resp = await fetch(`https://api.iheartjane.com/v1/stores/${storeId}`); + if (resp.ok) return await resp.json(); + return { error: resp.status }; + } catch (e: any) { + return { error: e.message }; + } + }, STORE_ID); + + if (storeInfo && !storeInfo.error) { + apiResponses.push({ type: 'STORE_DIRECT', url: storeInfoUrl, data: storeInfo }); + console.log('[STORE_DIRECT] Got store info via fetch'); + } else { + console.log(`[STORE_DIRECT] Failed: ${JSON.stringify(storeInfo)}`); + } + + console.log('[iHeartJane Test] Processing results...'); + + // Wait for products to load + await page.waitForSelector('[data-testid="product-card"], .product-card, [class*="ProductCard"]', { + timeout: 30000, + }).catch(() => console.log('[iHeartJane Test] No product cards found via selector')); + + // Try to extract product data from the page + const products = await page.evaluate(() => { + // Look for product data in various places + const results: any[] = []; + + // Method 1: Look for __INITIAL_STATE__ or similar + const scripts = Array.from(document.querySelectorAll('script')); + for (const script of scripts) { + const text = script.textContent || ''; + if 
(text.includes('products') && text.includes('price')) { + try { + // Try to find JSON object + const match = text.match(/\{[\s\S]*"products"[\s\S]*\}/); + if (match) { + results.push({ source: 'script', data: match[0].substring(0, 500) }); + } + } catch {} + } + } + + // Method 2: Look for product elements in DOM + const productElements = document.querySelectorAll('[data-testid="product-card"], .product-card, [class*="product"]'); + for (const el of Array.from(productElements).slice(0, 5)) { + const name = el.querySelector('[class*="name"], h3, h4')?.textContent; + const price = el.querySelector('[class*="price"]')?.textContent; + if (name) { + results.push({ source: 'dom', name, price }); + } + } + + return results; + }); + + console.log('\n[iHeartJane Test] === RESULTS ==='); + console.log(`Total API responses captured: ${apiResponses.length}`); + + // Group by type + const byType: Record = {}; + for (const r of apiResponses) { + byType[r.type] = byType[r.type] || []; + byType[r.type].push(r); + } + + for (const [type, items] of Object.entries(byType)) { + console.log(`\n--- ${type} (${items.length} responses) ---`); + for (const item of items) { + console.log(`URL: ${item.url}`); + // Show structure + if (item.data.hits) { + console.log(` Products: ${item.data.hits.length} hits`); + if (item.data.hits[0]) { + console.log(` Fields: ${Object.keys(item.data.hits[0]).join(', ')}`); + } + } else if (item.data.store) { + console.log(` Store: ${JSON.stringify(item.data.store, null, 2).substring(0, 1000)}`); + } else { + console.log(` Keys: ${Object.keys(item.data).join(', ')}`); + } + } + } + + // Write full data to file + const fs = await import('fs'); + fs.writeFileSync('/tmp/iheartjane-data.json', JSON.stringify(apiResponses, null, 2)); + console.log('\n[iHeartJane Test] Full data saved to /tmp/iheartjane-data.json'); + + // Take screenshot + await page.screenshot({ path: '/tmp/iheartjane-test.png', fullPage: false }); + console.log('[iHeartJane Test] Screenshot 
saved to /tmp/iheartjane-test.png'); + + } catch (error: any) { + console.error('[iHeartJane Test] Error:', error.message); + await page.screenshot({ path: '/tmp/iheartjane-error.png' }); + } finally { + await browser.close(); + } + + console.log('[iHeartJane Test] Done'); +} + +main().catch(console.error); diff --git a/backend/scripts/test-jane-api-explore.ts b/backend/scripts/test-jane-api-explore.ts new file mode 100644 index 00000000..e28c6be5 --- /dev/null +++ b/backend/scripts/test-jane-api-explore.ts @@ -0,0 +1,224 @@ +/** + * Explore Jane API to understand data structure + * Usage: npx ts-node scripts/test-jane-api-explore.ts + */ + +import puppeteer from 'puppeteer-extra'; +import StealthPlugin from 'puppeteer-extra-plugin-stealth'; + +puppeteer.use(StealthPlugin()); + +async function main() { + console.log('Exploring Jane API from browser context...\n'); + + const browser = await puppeteer.launch({ + headless: 'new', + args: ['--no-sandbox', '--disable-setuid-sandbox'], + }); + + const page = await browser.newPage(); + + // Intercept network requests to find store data API calls + const capturedResponses: Array<{ url: string; data: any }> = []; + + await page.setRequestInterception(true); + page.on('request', (req) => req.continue()); + + page.on('response', async (response) => { + const url = response.url(); + if (url.includes('iheartjane.com') && + (url.includes('/stores') || url.includes('/search') || url.includes('algolia'))) { + try { + const text = await response.text(); + if (text.startsWith('{') || text.startsWith('[')) { + const data = JSON.parse(text); + capturedResponses.push({ url, data }); + console.log(`Captured: ${url.substring(0, 100)}...`); + } + } catch { + // Not JSON + } + } + }); + + // Visit Jane to establish session + console.log('Visiting Jane stores page to capture network requests...'); + await page.goto('https://www.iheartjane.com/stores', { + waitUntil: 'networkidle2', + timeout: 60000, + }); + + console.log(`\nCaptured 
${capturedResponses.length} API responses`); + + for (const resp of capturedResponses) { + console.log(`\n--- ${resp.url.substring(0, 80)} ---`); + const keys = Object.keys(resp.data); + console.log('Keys:', keys); + + // Check for stores array + if (resp.data.stores && Array.isArray(resp.data.stores)) { + console.log(`Stores count: ${resp.data.stores.length}`); + const firstStore = resp.data.stores[0]; + if (firstStore) { + console.log('First store keys:', Object.keys(firstStore)); + console.log('Sample:', JSON.stringify(firstStore, null, 2).substring(0, 500)); + } + } + + // Check for hits (Algolia) + if (resp.data.hits && Array.isArray(resp.data.hits)) { + console.log(`Hits count: ${resp.data.hits.length}`); + const firstHit = resp.data.hits[0]; + if (firstHit) { + console.log('First hit keys:', Object.keys(firstHit)); + } + } + } + + // Look for __NEXT_DATA__ or similar embedded data + console.log('\n--- Checking for embedded page data ---'); + const pageData = await page.evaluate(() => { + // Check for Next.js data + const nextData = (window as any).__NEXT_DATA__; + if (nextData?.props?.pageProps?.stores) { + return { + source: '__NEXT_DATA__', + storeCount: nextData.props.pageProps.stores.length, + firstStore: nextData.props.pageProps.stores[0], + }; + } + + // Check for any global store data + const win = window as any; + if (win.stores) return { source: 'window.stores', data: win.stores }; + if (win.__stores) return { source: 'window.__stores', data: win.__stores }; + + return null; + }); + + if (pageData) { + console.log('Found embedded data:', pageData.source); + console.log('Store count:', pageData.storeCount); + if (pageData.firstStore) { + console.log('First store keys:', Object.keys(pageData.firstStore)); + console.log('Sample:', JSON.stringify({ + id: pageData.firstStore.id, + name: pageData.firstStore.name, + city: pageData.firstStore.city, + state: pageData.firstStore.state, + }, null, 2)); + } + } else { + console.log('No embedded page data 
found'); + } + + // Try alternative API endpoints from browser context + console.log('\n--- Testing alternative API endpoints ---'); + + // Try the map endpoint + const mapData = await page.evaluate(async () => { + try { + const res = await fetch('https://api.iheartjane.com/v1/stores/map?per_page=100'); + if (res.ok) return await res.json(); + } catch {} + return null; + }); + + if (mapData) { + console.log('\n/v1/stores/map response:'); + console.log('Keys:', Object.keys(mapData)); + if (mapData.stores?.[0]) { + console.log('First store keys:', Object.keys(mapData.stores[0])); + } + } + + // Try index endpoint + const indexData = await page.evaluate(async () => { + try { + const res = await fetch('https://api.iheartjane.com/v1/stores/index?per_page=10'); + if (res.ok) return await res.json(); + } catch {} + return null; + }); + + if (indexData) { + console.log('\n/v1/stores/index response:'); + console.log('Keys:', Object.keys(indexData)); + if (indexData.stores?.[0]) { + console.log('First store keys:', Object.keys(indexData.stores[0])); + } + } + + // Try with state parameter + const stateData = await page.evaluate(async () => { + try { + const res = await fetch('https://api.iheartjane.com/v1/stores?state=AZ&per_page=10'); + if (res.ok) return await res.json(); + } catch {} + return null; + }); + + if (stateData) { + console.log('\n/v1/stores?state=AZ response:'); + console.log('Keys:', Object.keys(stateData)); + console.log('Stores count:', stateData.stores?.length); + if (stateData.stores?.[0]) { + console.log('First store keys:', Object.keys(stateData.stores[0])); + console.log('Sample:', JSON.stringify(stateData.stores[0], null, 2).substring(0, 300)); + } + } + + // Try Algolia directly for stores + console.log('\n--- Testing Algolia for stores ---'); + const algoliaStores = await page.evaluate(async () => { + try { + // Common Algolia search pattern + const res = await fetch('https://search.iheartjane.com/1/indexes/stores-production/query', { + method: 
'POST', + headers: { + 'Content-Type': 'application/json', + 'X-Algolia-Application-Id': 'HKXSXRD7RA', + 'X-Algolia-API-Key': 'YjZhYjQxZjU4ZTNjMTRhYzExZTk2YjU2MzliMGE4ZTE5YjJkMmZkZTI2ODllYTY2MThlMzQ3Y2QxOTFkMjI5Y3RhZ0ZpbHRlcnM9', + }, + body: JSON.stringify({ + query: 'Arizona', + hitsPerPage: 20, + }), + }); + if (res.ok) return await res.json(); + } catch {} + return null; + }); + + if (algoliaStores) { + console.log('Algolia stores-production response:'); + console.log('Keys:', Object.keys(algoliaStores)); + console.log('Hits count:', algoliaStores.hits?.length); + if (algoliaStores.hits?.[0]) { + console.log('First hit keys:', Object.keys(algoliaStores.hits[0])); + console.log('Sample:', JSON.stringify(algoliaStores.hits[0], null, 2).substring(0, 500)); + } + } + + // Check if there's a /v2 endpoint + const v2Data = await page.evaluate(async () => { + try { + const res = await fetch('https://api.iheartjane.com/v2/stores?per_page=10'); + if (res.ok) return await res.json(); + } catch {} + return null; + }); + + if (v2Data) { + console.log('\n/v2/stores response:'); + console.log('Keys:', Object.keys(v2Data)); + if (v2Data.stores?.[0]) { + console.log('First store keys:', Object.keys(v2Data.stores[0])); + } + } + + await browser.close(); + console.log('\nDone!'); +} + +main().catch(console.error); diff --git a/backend/scripts/test-jane-client.ts b/backend/scripts/test-jane-client.ts new file mode 100644 index 00000000..16be3850 --- /dev/null +++ b/backend/scripts/test-jane-client.ts @@ -0,0 +1,126 @@ +/** + * Test script for Jane platform client + * Tests the new Jane integration with The Flower Shop Mesa + * + * Usage: npx ts-node scripts/test-jane-client.ts + */ + +import { + startSession, + endSession, + fetchProductsFromUrl, + resolveStoreFromUrl, +} from '../src/platforms/jane'; +import { JaneNormalizer } from '../src/hydration/normalizers/jane'; + +const TEST_URL = 'https://theflowershopusa.com/mesa/menu/'; + +async function main() { + 
console.log('='.repeat(60)); + console.log('Jane Platform Client Test'); + console.log('='.repeat(60)); + console.log(`Test URL: ${TEST_URL}`); + console.log(''); + + try { + // Test 1: Fetch products from URL + console.log('[Test 1] Fetching products from menu URL...'); + const result = await fetchProductsFromUrl(TEST_URL); + + console.log(''); + console.log('[Results]'); + console.log(` Store: ${result.store?.name || 'Not captured'}`); + console.log(` Store ID: ${result.store?.id || 'N/A'}`); + console.log(` Products captured: ${result.products.length}`); + console.log(` API responses: ${result.responses.length}`); + + if (result.store) { + console.log(''); + console.log('[Store Info]'); + console.log(` Address: ${result.store.address}, ${result.store.city}, ${result.store.state} ${result.store.zip}`); + console.log(` Phone: ${result.store.phone}`); + console.log(` Coordinates: ${result.store.lat}, ${result.store.long}`); + console.log(` Medical: ${result.store.medical}, Recreational: ${result.store.recreational}`); + console.log(` Rating: ${result.store.rating} (${result.store.reviews_count} reviews)`); + console.log(` Product count (store): ${result.store.product_count}`); + } + + if (result.products.length > 0) { + console.log(''); + console.log('[Sample Products (first 5)]'); + for (const p of result.products.slice(0, 5)) { + const price = p.price_gram || p.price_each || 'N/A'; + console.log(` - ${p.name} (${p.brand}) - $${price}`); + console.log(` Kind: ${p.kind}, Category: ${p.category}, THC: ${p.percent_thc}%`); + } + + // Test 2: Normalize products + console.log(''); + console.log('[Test 2] Testing normalizer...'); + const normalizer = new JaneNormalizer(); + + // Build a fake payload structure + const fakePayload = { + id: 'test-payload', + dispensary_id: 9999, + crawl_run_id: null, + platform: 'jane', + payload_version: 1, + raw_json: { hits: result.products.map(p => p.raw) }, + product_count: result.products.length, + pricing_type: null, + crawl_mode: 
null, + fetched_at: new Date(), + processed: false, + normalized_at: null, + hydration_error: null, + hydration_attempts: 0, + created_at: new Date(), + }; + + const normalized = normalizer.normalize(fakePayload); + + console.log(` Products normalized: ${normalized.products.length}`); + console.log(` Brands extracted: ${normalized.brands.length}`); + console.log(` Categories extracted: ${normalized.categories.length}`); + console.log(` Errors: ${normalized.errors.length}`); + + if (normalized.products.length > 0) { + console.log(''); + console.log('[Sample Normalized Product]'); + const np = normalized.products[0]; + console.log(` External ID: ${np.externalProductId}`); + console.log(` Name: ${np.name}`); + console.log(` Brand: ${np.brandName}`); + console.log(` Category: ${np.category}`); + console.log(` Type: ${np.type}`); + console.log(` Strain: ${np.strainType}`); + console.log(` THC: ${np.thcPercent}%`); + console.log(` CBD: ${np.cbdPercent}%`); + console.log(` Image: ${np.primaryImageUrl?.slice(0, 60)}...`); + + const pricing = normalized.pricing.get(np.externalProductId); + if (pricing) { + console.log(` Price (cents): ${pricing.priceRec}`); + console.log(` On Special: ${pricing.isOnSpecial}`); + } + } + } + + console.log(''); + console.log('='.repeat(60)); + console.log('TEST PASSED'); + console.log('='.repeat(60)); + + } catch (error: any) { + console.error(''); + console.error('='.repeat(60)); + console.error('TEST FAILED'); + console.error('='.repeat(60)); + console.error(`Error: ${error.message}`); + console.error(error.stack); + process.exit(1); + } +} + +main().catch(console.error); diff --git a/backend/scripts/test-jane-med-rec-compare.ts b/backend/scripts/test-jane-med-rec-compare.ts new file mode 100644 index 00000000..96d4ec7a --- /dev/null +++ b/backend/scripts/test-jane-med-rec-compare.ts @@ -0,0 +1,55 @@ +/** + * Compare MED vs REC product menus for same location + */ +import puppeteer from 'puppeteer-extra'; +import StealthPlugin from 
'puppeteer-extra-plugin-stealth'; +puppeteer.use(StealthPlugin()); + +async function main() { + const browser = await puppeteer.launch({ headless: 'new', args: ['--no-sandbox'] }); + const page = await browser.newPage(); + + await page.goto('https://www.iheartjane.com/stores', { waitUntil: 'domcontentloaded' }); + await new Promise(r => setTimeout(r, 2000)); + + // Fetch REC products (store 3379) + const recProducts: number[] = await page.evaluate(async () => { + const res = await fetch('https://search.iheartjane.com/1/indexes/menu-products-production/query', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ query: '', hitsPerPage: 100, filters: 'store_id=3379' }), + }); + const data = await res.json(); + return data.hits?.map((h: any) => h.product_id) || []; + }); + + // Fetch MED products (store 4540) + const medProducts: number[] = await page.evaluate(async () => { + const res = await fetch('https://search.iheartjane.com/1/indexes/menu-products-production/query', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ query: '', hitsPerPage: 100, filters: 'store_id=4540' }), + }); + const data = await res.json(); + return data.hits?.map((h: any) => h.product_id) || []; + }); + + const recSet = new Set(recProducts); + const medSet = new Set(medProducts); + + const recOnly = recProducts.filter(id => !medSet.has(id)).length; + const medOnly = medProducts.filter(id => !recSet.has(id)).length; + const shared = recProducts.filter(id => medSet.has(id)).length; + + console.log('\nHana Phoenix - MED vs REC comparison (100 products each):'); + console.log(' REC products fetched:', recProducts.length); + console.log(' MED products fetched:', medProducts.length); + console.log(' REC-only:', recOnly); + console.log(' MED-only:', medOnly); + console.log(' Shared:', shared); + console.log(' Menus are:', shared === 0 ? 'COMPLETELY DIFFERENT' : shared === recProducts.length ? 
'IDENTICAL' : 'PARTIALLY OVERLAPPING'); + + await browser.close(); +} + +main().catch(console.error); diff --git a/backend/scripts/test-jane-med-rec-diff.ts b/backend/scripts/test-jane-med-rec-diff.ts new file mode 100644 index 00000000..c2831d48 --- /dev/null +++ b/backend/scripts/test-jane-med-rec-diff.ts @@ -0,0 +1,79 @@ +/** + * Find ALL differing fields between MED and REC product payloads + */ +import puppeteer from 'puppeteer-extra'; +import StealthPlugin from 'puppeteer-extra-plugin-stealth'; +puppeteer.use(StealthPlugin()); + +async function main() { + const browser = await puppeteer.launch({ headless: 'new', args: ['--no-sandbox'] }); + const page = await browser.newPage(); + + await page.goto('https://www.iheartjane.com/stores', { waitUntil: 'domcontentloaded' }); + await new Promise(r => setTimeout(r, 2000)); + + // Get full product payload from REC store + const recProduct = await page.evaluate(async () => { + const res = await fetch('https://search.iheartjane.com/1/indexes/menu-products-production/query', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ query: '', hitsPerPage: 1, filters: 'store_id=3379' }), + }); + const data = await res.json(); + return data.hits?.[0]; + }); + + const productId = recProduct?.product_id; + + // Get same product from MED store + const medProduct = await page.evaluate(async (pid: number) => { + const res = await fetch('https://search.iheartjane.com/1/indexes/menu-products-production/query', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ query: '', hitsPerPage: 100, filters: 'store_id=4540' }), + }); + const data = await res.json(); + return data.hits?.find((h: any) => h.product_id === pid); + }, productId); + + console.log('Product:', recProduct?.name, '(ID:', productId, ')\n'); + + // Get all keys + const allKeys = new Set([...Object.keys(recProduct || {}), ...Object.keys(medProduct || {})]); + const sortedKeys = 
[...allKeys].sort(); + + console.log('=== ALL KEYS IN PAYLOAD ==='); + console.log(sortedKeys.join(', ')); + + console.log('\n=== FIELDS THAT DIFFER ==='); + let diffCount = 0; + for (const key of sortedKeys) { + const recVal = JSON.stringify(recProduct?.[key]); + const medVal = JSON.stringify(medProduct?.[key]); + if (recVal !== medVal) { + diffCount++; + console.log(`${key}:`); + console.log(` REC: ${recVal?.substring(0, 100)}`); + console.log(` MED: ${medVal?.substring(0, 100)}`); + } + } + + if (diffCount === 0) { + console.log('(none - payloads are identical)'); + } + + // Check for limit/allowance related fields + console.log('\n=== LIMIT-RELATED FIELDS ==='); + const limitFields = sortedKeys.filter(k => + k.includes('limit') || k.includes('max') || k.includes('allow') || + k.includes('quantity') || k.includes('cart') || k.includes('medical') || + k.includes('rec') || k.includes('weight') + ); + for (const key of limitFields) { + console.log(`${key}: REC=${JSON.stringify(recProduct?.[key])} | MED=${JSON.stringify(medProduct?.[key])}`); + } + + await browser.close(); +} + +main().catch(console.error); diff --git a/backend/scripts/test-jane-payload.ts b/backend/scripts/test-jane-payload.ts new file mode 100644 index 00000000..6f7438d0 --- /dev/null +++ b/backend/scripts/test-jane-payload.ts @@ -0,0 +1,35 @@ +/** + * Test script to capture and save full Jane payload + * Usage: npx ts-node scripts/test-jane-payload.ts + */ + +import * as fs from 'fs'; +import { fetchProductsFromUrl } from '../src/platforms/jane'; + +const TEST_URL = 'https://theflowershopusa.com/mesa/menu/'; +const OUTPUT_FILE = '/tmp/jane-test-payload.json'; + +async function main() { + console.log('Fetching Jane payload...'); + + const result = await fetchProductsFromUrl(TEST_URL); + + // Build payload structure matching what would be saved + const payload = { + hits: result.products.map(p => p.raw), + store: result.store?.raw || null, + capturedAt: new Date().toISOString(), + platform: 'jane', 
+ storeId: result.store?.id, + productCount: result.products.length, + responseCount: result.responses.length, + }; + + // Save to file + fs.writeFileSync(OUTPUT_FILE, JSON.stringify(payload, null, 2)); + console.log(`\nPayload saved to: ${OUTPUT_FILE}`); + console.log(`Products: ${result.products.length}`); + console.log(`Size: ${Math.round(fs.statSync(OUTPUT_FILE).size / 1024)}KB`); +} + +main().catch(console.error); diff --git a/backend/scripts/test-treez-client.ts b/backend/scripts/test-treez-client.ts new file mode 100644 index 00000000..5770c59b --- /dev/null +++ b/backend/scripts/test-treez-client.ts @@ -0,0 +1,113 @@ +/** + * Test script for Treez platform client + * Tests the new Treez integration with Best Dispensary + * + * Usage: npx ts-node scripts/test-treez-client.ts + */ + +import { + fetchProductsByStoreId, +} from '../src/platforms/treez'; +import { TreezNormalizer } from '../src/hydration/normalizers/treez'; + +const TEST_STORE_ID = 'best'; + +async function main() { + console.log('='.repeat(60)); + console.log('Treez Platform Client Test'); + console.log('='.repeat(60)); + console.log(`Test Store: ${TEST_STORE_ID}`); + console.log(''); + + try { + // Test 1: Fetch products from store + console.log('[Test 1] Fetching products from Treez store...'); + const result = await fetchProductsByStoreId(TEST_STORE_ID); + + console.log(''); + console.log('[Results]'); + console.log(` Store: ${result.store.name}`); + console.log(` Store ID: ${result.store.storeId}`); + console.log(` Products captured: ${result.products.length}`); + console.log(` Scroll count: ${result.scrollCount}`); + + if (result.products.length > 0) { + console.log(''); + console.log('[Sample Products (first 5)]'); + for (const p of result.products.slice(0, 5)) { + console.log(` - ${p.name}`); + console.log(` Brand: ${p.brand || 'N/A'}`); + console.log(` Category: ${p.category || 'N/A'} / ${p.subcategory || 'N/A'}`); + console.log(` Price: ${p.price ? 
'$' + p.price : 'N/A'}`); + console.log(` THC: ${p.thcPercent !== null ? p.thcPercent + '%' : 'N/A'}`); + } + + // Test 2: Normalize products + console.log(''); + console.log('[Test 2] Testing normalizer...'); + const normalizer = new TreezNormalizer(); + + // Build a fake payload structure + const fakePayload = { + id: 'test-payload', + dispensary_id: 9999, + crawl_run_id: null, + platform: 'treez', + payload_version: 1, + raw_json: { products: result.products }, + product_count: result.products.length, + pricing_type: null, + crawl_mode: null, + fetched_at: new Date(), + processed: false, + normalized_at: null, + hydration_error: null, + hydration_attempts: 0, + created_at: new Date(), + }; + + const normalized = normalizer.normalize(fakePayload); + + console.log(` Products normalized: ${normalized.products.length}`); + console.log(` Brands extracted: ${normalized.brands.length}`); + console.log(` Categories extracted: ${normalized.categories.length}`); + console.log(` Errors: ${normalized.errors.length}`); + + if (normalized.products.length > 0) { + console.log(''); + console.log('[Sample Normalized Product]'); + const np = normalized.products[0]; + console.log(` External ID: ${np.externalProductId}`); + console.log(` Name: ${np.name}`); + console.log(` Brand: ${np.brandName}`); + console.log(` Category: ${np.category}`); + console.log(` Type: ${np.type}`); + console.log(` Strain: ${np.strainType}`); + console.log(` THC: ${np.thcPercent !== null ? np.thcPercent + '%' : 'N/A'}`); + console.log(` CBD: ${np.cbdPercent !== null ? 
np.cbdPercent + '%' : 'N/A'}`); + console.log(` Image: ${np.primaryImageUrl?.slice(0, 60) || 'N/A'}...`); + + const pricing = normalized.pricing.get(np.externalProductId); + if (pricing) { + console.log(` Price (cents): ${pricing.priceRec}`); + } + } + } + + console.log(''); + console.log('='.repeat(60)); + console.log('TEST PASSED'); + console.log('='.repeat(60)); + + } catch (error: any) { + console.error(''); + console.error('='.repeat(60)); + console.error('TEST FAILED'); + console.error('='.repeat(60)); + console.error(`Error: ${error.message}`); + console.error(error.stack); + process.exit(1); + } +} + +main().catch(console.error); diff --git a/backend/scripts/test-treez-discovery.ts b/backend/scripts/test-treez-discovery.ts new file mode 100644 index 00000000..8e279006 --- /dev/null +++ b/backend/scripts/test-treez-discovery.ts @@ -0,0 +1,559 @@ +/** + * Treez Platform Smoke Test + * + * Discovers DOM structure and extracts products from Treez menu pages. + * Used to determine actual CSS selectors for the platform client. 
+ * + * Usage: npx ts-node scripts/test-treez-discovery.ts + */ + +import puppeteer, { Page } from 'puppeteer'; +import puppeteerExtra from 'puppeteer-extra'; +import StealthPlugin from 'puppeteer-extra-plugin-stealth'; + +// Register stealth plugin (even though Treez doesn't use Cloudflare, good practice) +puppeteerExtra.use(StealthPlugin()); + +const TEST_URL = 'https://best.treez.io/onlinemenu/?customerType=ADULT'; +const STORE_ID = 'best'; + +interface TreezProductRaw { + productId: string; + name: string; + brand: string; + category: string; + subcategory: string; + thcPercent: number | null; + cbdPercent: number | null; + price: number | null; + priceUnit: string; + imageUrl: string | null; + inStock: boolean; + weight: string | null; +} + +async function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +/** + * Scroll to load all products (infinite scroll) + */ +async function scrollToLoadAll(page: Page, maxScrolls = 30): Promise { + let previousHeight = 0; + let scrollCount = 0; + let sameHeightCount = 0; + + console.log('[Scroll] Starting infinite scroll...'); + + while (scrollCount < maxScrolls) { + const currentHeight = await page.evaluate(() => document.body.scrollHeight); + + if (currentHeight === previousHeight) { + sameHeightCount++; + if (sameHeightCount >= 3) { + console.log('[Scroll] No new content after 3 attempts, stopping'); + break; + } + } else { + sameHeightCount = 0; + } + + await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight)); + await sleep(1500); // Wait for products to load + + previousHeight = currentHeight; + scrollCount++; + + // Check how many products we have + const productCount = await page.evaluate(() => { + // Try multiple possible selectors + const selectors = [ + '[class*="product"]', + '[class*="Product"]', + '[data-product]', + '.menu-item', + '[class*="card"]', + '[class*="Card"]', + ]; + + for (const sel of selectors) { + const els = 
document.querySelectorAll(sel); + if (els.length > 10) return els.length; + } + return 0; + }); + + console.log(`[Scroll] Scroll ${scrollCount}: height=${currentHeight}, products~${productCount}`); + } + + return scrollCount; +} + +/** + * Analyze DOM structure to find product selectors + */ +async function analyzeDOM(page: Page): Promise { + console.log('\n' + '='.repeat(60)); + console.log('DOM STRUCTURE ANALYSIS'); + console.log('='.repeat(60)); + + // Find elements with "product" in class name + const productClasses = await page.evaluate(() => { + const classes = new Set(); + document.querySelectorAll('*').forEach((el) => { + const className = el.className; + if (typeof className === 'string' && className.toLowerCase().includes('product')) { + className.split(' ').forEach((c) => { + if (c.toLowerCase().includes('product')) { + classes.add(c); + } + }); + } + }); + return Array.from(classes).slice(0, 20); + }); + + console.log('\n[Classes containing "product"]:'); + productClasses.forEach((c: string) => console.log(` .${c}`)); + + // Find elements with "card" in class name + const cardClasses = await page.evaluate(() => { + const classes = new Set(); + document.querySelectorAll('*').forEach((el) => { + const className = el.className; + if (typeof className === 'string' && className.toLowerCase().includes('card')) { + className.split(' ').forEach((c) => { + if (c.toLowerCase().includes('card')) { + classes.add(c); + } + }); + } + }); + return Array.from(classes).slice(0, 20); + }); + + console.log('\n[Classes containing "card"]:'); + cardClasses.forEach((c: string) => console.log(` .${c}`)); + + // Find data attributes + const dataAttrs = await page.evaluate(() => { + const attrs = new Set(); + document.querySelectorAll('*').forEach((el) => { + Array.from(el.attributes).forEach((attr) => { + if (attr.name.startsWith('data-') && !attr.name.includes('reactid')) { + attrs.add(attr.name); + } + }); + }); + return Array.from(attrs).slice(0, 30); + }); + + 
console.log('\n[Data attributes found]:'); + dataAttrs.forEach((attr: string) => console.log(` ${attr}`)); + + // Get sample HTML of potential product container + const sampleHTML = await page.evaluate(() => { + // Try to find a product container + const selectors = [ + '[class*="ProductCard"]', + '[class*="product-card"]', + '[class*="menuItem"]', + '[class*="menu-item"]', + '[data-testid*="product"]', + ]; + + for (const sel of selectors) { + const el = document.querySelector(sel); + if (el) { + return { + selector: sel, + html: el.outerHTML.slice(0, 2000), + childCount: el.children.length, + }; + } + } + + // Fallback: find repeating structures + const containers = document.querySelectorAll('div[class]'); + const classCounts = new Map(); + + containers.forEach((el) => { + if (el.children.length > 2 && el.className) { + classCounts.set(el.className, (classCounts.get(el.className) || 0) + 1); + } + }); + + // Find class that appears many times (likely product cards) + let bestClass = ''; + let bestCount = 0; + classCounts.forEach((count, className) => { + if (count > bestCount && count > 5) { + bestCount = count; + bestClass = className; + } + }); + + if (bestClass) { + const el = document.querySelector(`.${bestClass.split(' ')[0]}`); + if (el) { + return { + selector: `.${bestClass.split(' ')[0]}`, + html: el.outerHTML.slice(0, 2000), + childCount: el.children.length, + count: bestCount, + }; + } + } + + return null; + }); + + if (sampleHTML) { + console.log('\n[Sample Product Container]:'); + console.log(` Selector: ${sampleHTML.selector}`); + console.log(` Children: ${sampleHTML.childCount}`); + if ((sampleHTML as any).count) { + console.log(` Occurrences: ${(sampleHTML as any).count}`); + } + console.log('\n[Sample HTML (first 1000 chars)]:'); + console.log(sampleHTML.html.slice(0, 1000)); + } +} + +/** + * Extract products using discovered selectors + * Based on DOM analysis of Treez/GapCommerce React app + */ +async function extractProducts(page: Page): 
Promise { + console.log('\n' + '='.repeat(60)); + console.log('PRODUCT EXTRACTION'); + console.log('='.repeat(60)); + + const products = await page.evaluate(() => { + const results: any[] = []; + + // Treez uses classes like: product_product__ERWtJ + // Find all product cards using the discovered class patterns + const productSelectors = [ + '[class*="product_product__"]', // Main product container + '[class*="ProductCard"]', // Alternative pattern + ]; + + let productElements: Element[] = []; + + for (const selector of productSelectors) { + const elements = document.querySelectorAll(selector); + // Filter to only get the actual product cards, not child elements + const filtered = Array.from(elements).filter(el => { + // Must have a name element and price + const hasName = el.querySelector('[class*="product__name"]') || el.querySelector('[class*="name__"]'); + const hasPrice = el.querySelector('[class*="price"]'); + return hasName || hasPrice; + }); + + if (filtered.length > 0) { + productElements = filtered; + console.log(`Found ${filtered.length} products with selector: ${selector}`); + break; + } + } + + // Dedupe - some cards may be captured multiple times + const seen = new Set(); + + // Extract data from each product element + for (const el of productElements) { + try { + // Get product name - look for name class + const nameEl = el.querySelector('[class*="product__name"], [class*="name__"]'); + const name = nameEl?.textContent?.trim() || ''; + + if (!name || seen.has(name)) continue; + seen.add(name); + + // Get product ID from link + const linkEl = el.querySelector('a[href*="/product/"]'); + let productId = ''; + if (linkEl) { + const href = linkEl.getAttribute('href') || ''; + const match = href.match(/\/product\/([^\/\?]+)/); + productId = match ? 
match[1] : ''; + } + if (!productId) { + productId = `treez_${name.replace(/\s+/g, '_').toLowerCase().slice(0, 30)}`; + } + + // Get brand from the info section or product name parsing + const brandEl = el.querySelector('[class*="brand"], [class*="Brand"]'); + let brand = brandEl?.textContent?.trim() || ''; + + // Get price - look for price class with $ symbol + const priceEl = el.querySelector('[class*="price__ins"], [class*="price"]'); + const priceText = priceEl?.textContent || ''; + const priceMatch = priceText.match(/\$(\d+(?:\.\d{2})?)/); + const price = priceMatch ? parseFloat(priceMatch[1]) : null; + + // Get image URL + const imgEl = el.querySelector('img'); + let imageUrl = imgEl?.getAttribute('src') || null; + // Handle Next.js image optimization URLs + if (imageUrl && imageUrl.includes('/_next/image')) { + const urlMatch = imageUrl.match(/url=([^&]+)/); + if (urlMatch) { + imageUrl = decodeURIComponent(urlMatch[1]); + } + } + + // Get text content for THC/CBD extraction + const text = el.textContent || ''; + + // Get THC/CBD - look for patterns like "THC 25.5%" or "25.5% THC" + const thcMatch = text.match(/(?:THC[:\s]*)?(\d+(?:\.\d+)?)\s*%?\s*THC/i) || + text.match(/THC[:\s]*(\d+(?:\.\d+)?)\s*%?/i); + const cbdMatch = text.match(/(?:CBD[:\s]*)?(\d+(?:\.\d+)?)\s*%?\s*CBD/i) || + text.match(/CBD[:\s]*(\d+(?:\.\d+)?)\s*%?/i); + const thcPercent = thcMatch ? parseFloat(thcMatch[1]) : null; + const cbdPercent = cbdMatch ? parseFloat(cbdMatch[1]) : null; + + // Get weight from name or text (e.g., "3.5G", "1G") + const weightMatch = name.match(/(\d+(?:\.\d+)?)\s*(G|g|MG|mg|OZ|oz)/i) || + text.match(/(\d+(?:\.\d+)?)\s*(G|g|MG|mg|OZ|oz)/i); + const weight = weightMatch ? 
`${weightMatch[1]}${weightMatch[2].toLowerCase()}` : null; + + // Price unit from weight + let priceUnit = ''; + if (weight) { + priceUnit = weight; + } + + // Get category/strain type + const strainTypes = ['indica', 'sativa', 'hybrid']; + let subcategory = ''; + const textLower = text.toLowerCase(); + for (const strain of strainTypes) { + if (textLower.includes(strain)) { + subcategory = strain; + break; + } + } + + // Determine category from various signals + let category = ''; + const categoryPatterns = [ + { pattern: /flower|bud/i, category: 'flower' }, + { pattern: /vape|cart|pen/i, category: 'vape' }, + { pattern: /edible|gummy|chocolate/i, category: 'edible' }, + { pattern: /concentrate|dab|wax|shatter/i, category: 'concentrate' }, + { pattern: /pre.?roll|joint/i, category: 'pre-roll' }, + { pattern: /topical|balm|cream/i, category: 'topical' }, + { pattern: /tincture/i, category: 'tincture' }, + ]; + for (const { pattern, category: cat } of categoryPatterns) { + if (pattern.test(text)) { + category = cat; + break; + } + } + + // Check stock status + const inStock = !textLower.includes('out of stock') && !textLower.includes('sold out'); + + results.push({ + productId, + name, + brand, + category, + subcategory, + thcPercent, + cbdPercent, + price, + priceUnit, + imageUrl, + inStock, + weight, + }); + } catch (err) { + console.log('Error extracting product:', err); + } + } + + return results; + }); + + return products; +} + +/** + * Bypass age gate if present + */ +async function bypassAgeGate(page: Page): Promise { + console.log('[Age Gate] Checking for age gate...'); + + try { + // Wait for either age gate or main content + const ageGate = await page.$('[data-testid="age-gate-modal"], [class*="AgeGate"]'); + + if (ageGate) { + console.log('[Age Gate] Age gate detected, clicking confirm button...'); + + // Click the submit button + const submitBtn = await page.$('[data-testid="age-gate-submit-button"], button[type="submit"]'); + if (submitBtn) { + await 
submitBtn.click(); + console.log('[Age Gate] Clicked confirm button'); + + // Wait for age gate to disappear and menu to load + await sleep(2000); + + // Wait for navigation or content change + await page.waitForFunction( + () => !document.querySelector('[data-testid="age-gate-modal"]'), + { timeout: 10000 } + ).catch(() => { + console.log('[Age Gate] Gate may still be visible, continuing anyway'); + }); + + console.log('[Age Gate] Age gate bypassed'); + return true; + } else { + console.log('[Age Gate] No submit button found'); + } + } else { + console.log('[Age Gate] No age gate detected'); + } + + return false; + } catch (err: any) { + console.log(`[Age Gate] Error: ${err.message}`); + return false; + } +} + +async function main() { + console.log('='.repeat(60)); + console.log('TREEZ PLATFORM SMOKE TEST'); + console.log('='.repeat(60)); + console.log(`Store ID: ${STORE_ID}`); + console.log(`URL: ${TEST_URL}`); + console.log(''); + + const browser = await puppeteerExtra.launch({ + headless: true, + args: [ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + '--disable-blink-features=AutomationControlled', + ], + }); + + try { + const page = await browser.newPage(); + + // Set viewport + await page.setViewport({ width: 1920, height: 1080 }); + + // Set user agent + await page.setUserAgent( + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36' + ); + + console.log('[Navigation] Going to Treez menu page...'); + await page.goto(TEST_URL, { + waitUntil: 'networkidle2', + timeout: 60000, + }); + + console.log('[Navigation] Page loaded, waiting for React app...'); + await sleep(2000); + + // Bypass age gate + await bypassAgeGate(page); + + // Wait for menu content to load + console.log('[Navigation] Waiting for menu content...'); + await sleep(3000); + + // Check if page loaded correctly + const pageTitle = await page.title(); + console.log(`[Navigation] Page title: ${pageTitle}`); 
+ + // Take a screenshot for debugging + await page.screenshot({ path: '/tmp/treez-smoke-test.png', fullPage: false }); + console.log('[Debug] Screenshot saved to /tmp/treez-smoke-test.png'); + + // Analyze DOM structure + await analyzeDOM(page); + + // Scroll to load all products + await scrollToLoadAll(page); + + // Extract products + const products = await extractProducts(page); + + console.log('\n' + '='.repeat(60)); + console.log('RESULTS'); + console.log('='.repeat(60)); + console.log(`Total products extracted: ${products.length}`); + + if (products.length > 0) { + // Show statistics + const withPrice = products.filter((p) => p.price !== null).length; + const withThc = products.filter((p) => p.thcPercent !== null).length; + const withBrand = products.filter((p) => p.brand).length; + const withImage = products.filter((p) => p.imageUrl).length; + + console.log(`\n[Data Quality]`); + console.log(` With price: ${withPrice}/${products.length} (${Math.round((withPrice / products.length) * 100)}%)`); + console.log(` With THC%: ${withThc}/${products.length} (${Math.round((withThc / products.length) * 100)}%)`); + console.log(` With brand: ${withBrand}/${products.length} (${Math.round((withBrand / products.length) * 100)}%)`); + console.log(` With image: ${withImage}/${products.length} (${Math.round((withImage / products.length) * 100)}%)`); + + // Show sample products + console.log('\n[Sample Products (first 10)]:'); + for (const p of products.slice(0, 10)) { + console.log(`\n ${p.name}`); + console.log(` ID: ${p.productId}`); + console.log(` Brand: ${p.brand || 'N/A'}`); + console.log(` Category: ${p.category || 'N/A'} / ${p.subcategory || 'N/A'}`); + console.log(` THC: ${p.thcPercent !== null ? p.thcPercent + '%' : 'N/A'}`); + console.log(` CBD: ${p.cbdPercent !== null ? p.cbdPercent + '%' : 'N/A'}`); + console.log(` Price: ${p.price !== null ? 
'$' + p.price : 'N/A'} ${p.priceUnit}`); + console.log(` Weight: ${p.weight || 'N/A'}`); + console.log(` Image: ${p.imageUrl?.slice(0, 60) || 'N/A'}...`); + console.log(` In Stock: ${p.inStock}`); + } + + // Save full results to file + const fs = await import('fs'); + fs.writeFileSync('/tmp/treez-products.json', JSON.stringify(products, null, 2)); + console.log('\n[Debug] Full product list saved to /tmp/treez-products.json'); + } else { + console.log('\n[WARNING] No products extracted!'); + console.log('Check /tmp/treez-smoke-test.png for page state'); + + // Dump page HTML for debugging + const html = await page.content(); + const fs = await import('fs'); + fs.writeFileSync('/tmp/treez-page.html', html); + console.log('[Debug] Page HTML saved to /tmp/treez-page.html'); + } + + console.log('\n' + '='.repeat(60)); + console.log(products.length > 0 ? 'SMOKE TEST PASSED' : 'SMOKE TEST NEEDS ADJUSTMENT'); + console.log('='.repeat(60)); + + } catch (error: any) { + console.error('\n' + '='.repeat(60)); + console.error('SMOKE TEST FAILED'); + console.error('='.repeat(60)); + console.error(`Error: ${error.message}`); + console.error(error.stack); + process.exit(1); + } finally { + await browser.close(); + } +} + +main().catch(console.error); diff --git a/backend/src/hydration/normalizers/treez.ts b/backend/src/hydration/normalizers/treez.ts new file mode 100644 index 00000000..a92de299 --- /dev/null +++ b/backend/src/hydration/normalizers/treez.ts @@ -0,0 +1,227 @@ +/** + * Treez Platform Normalizer + * + * Normalizes raw Treez DOM-scraped product data to canonical format. + * + * Treez is scraped via Puppeteer (no API), so the raw format is + * the TreezProductRaw interface from our client. 
+ *
+ * Key differences from Dutchie/Jane:
+ * - Data comes from DOM parsing, not API response
+ * - Price is a single value (not multiple weights like Jane)
+ * - Product ID is generated from product name or URL slug
+ * - Less structured data (category/strain inferred from text)
+ */
+
+import { BaseNormalizer } from './base';
+import {
+  NormalizedProduct,
+  NormalizedPricing,
+  NormalizedAvailability,
+  NormalizedBrand,
+  NormalizedCategory,
+} from '../types';
+
+/**
+ * Scraped category text (lowercased, trimmed) -> canonical display name.
+ *
+ * Kept in a Map rather than an object literal so that inherited keys such as
+ * "constructor" or "toString" can never resolve to a non-string value.
+ */
+const TREEZ_CATEGORY_NAMES: ReadonlyMap<string, string> = new Map([
+  ['flower', 'Flower'],
+  ['vape', 'Vape'],
+  ['vapes', 'Vape'],
+  ['cartridge', 'Vape'],
+  ['edible', 'Edible'],
+  ['edibles', 'Edible'],
+  ['concentrate', 'Concentrate'],
+  ['concentrates', 'Concentrate'],
+  ['pre-roll', 'Pre-Roll'],
+  ['preroll', 'Pre-Roll'],
+  ['pre-rolls', 'Pre-Roll'],
+  ['prerolls', 'Pre-Roll'],
+  ['topical', 'Topical'],
+  ['topicals', 'Topical'],
+  ['tincture', 'Tincture'],
+  ['tinctures', 'Tincture'],
+  ['accessory', 'Accessory'],
+  ['accessories', 'Accessory'],
+  ['gear', 'Gear'],
+]);
+
+/**
+ * Normalizer for Treez payloads.
+ *
+ * Every per-product method keys its output on `rawProduct.productId` and
+ * returns null when that ID is absent, so a product without an ID is dropped
+ * from all outputs rather than partially recorded.
+ */
+export class TreezNormalizer extends BaseNormalizer {
+  readonly platform = 'treez';
+  readonly supportedVersions = [1];
+
+  // ============================================================
+  // EXTRACTION
+  // ============================================================
+
+  /**
+   * Pull the raw product array out of a stored payload.
+   *
+   * Accepts `{ products: [...] }`, a bare array, or `{ hits: [...] }`.
+   * Anything else logs a warning and yields an empty array.
+   */
+  extractProducts(rawJson: any): any[] {
+    // Treez payload format: { products: [...] }
+    if (Array.isArray(rawJson?.products)) {
+      return rawJson.products;
+    }
+
+    // Direct array of products
+    if (Array.isArray(rawJson)) {
+      return rawJson;
+    }
+
+    // Hits array (normalized format)
+    if (Array.isArray(rawJson?.hits)) {
+      return rawJson.hits;
+    }
+
+    console.warn('[TreezNormalizer] Could not extract products from payload');
+    return [];
+  }
+
+  /**
+   * Check that a payload is present and contains at least one product.
+   *
+   * @returns `valid: true` only when no error was recorded.
+   */
+  validatePayload(rawJson: any): { valid: boolean; errors: string[] } {
+    const errors: string[] = [];
+
+    if (!rawJson) {
+      errors.push('Payload is null or undefined');
+      return { valid: false, errors };
+    }
+
+    const products = this.extractProducts(rawJson);
+    if (products.length === 0) {
+      errors.push('No products found in payload');
+    }
+
+    return { valid: errors.length === 0, errors };
+  }
+
+  // ============================================================
+  // NORMALIZATION
+  // ============================================================
+
+  /**
+   * Map one scraped product onto the shared product shape.
+   *
+   * @returns null (with a warning) when the entry has no ID or no name.
+   */
+  protected normalizeProduct(rawProduct: any, dispensaryId: number): NormalizedProduct | null {
+    // `?.` so a null/undefined array entry is skipped instead of throwing
+    const externalId = rawProduct?.productId;
+    if (!externalId) {
+      console.warn('[TreezNormalizer] Product missing ID, skipping');
+      return null;
+    }
+
+    const name = rawProduct.name;
+    if (!name) {
+      console.warn(`[TreezNormalizer] Product ${externalId} missing name, skipping`);
+      return null;
+    }
+
+    return {
+      externalProductId: String(externalId),
+      dispensaryId,
+      platform: 'treez',
+      platformDispensaryId: '', // Will be set by handler
+
+      // Core fields
+      name,
+      brandName: rawProduct.brand || null,
+      brandId: null, // Treez doesn't expose brand IDs
+      category: this.normalizeCategory(rawProduct.category) || null,
+      subcategory: rawProduct.subcategory || null,
+      type: rawProduct.category || null,
+      strainType: rawProduct.subcategory || null, // indica, sativa, hybrid
+
+      // Potency
+      thcPercent: rawProduct.thcPercent ?? null,
+      cbdPercent: rawProduct.cbdPercent ?? null,
+      thcContent: rawProduct.thcPercent ?? null,
+      cbdContent: rawProduct.cbdPercent ?? null,
+
+      // Status - scraped products are active
+      status: 'Active',
+      isActive: rawProduct.inStock !== false,
+      medicalOnly: false,
+      recOnly: false,
+
+      // Images
+      primaryImageUrl: rawProduct.imageUrl || null,
+      images: rawProduct.imageUrl
+        ? [{ url: rawProduct.imageUrl, position: 0 }]
+        : [],
+
+      // Raw reference
+      rawProduct,
+    };
+  }
+
+  /**
+   * Build the pricing record. The single scraped price fills the rec price
+   * and both ends of its range; medical and special fields stay empty.
+   */
+  protected normalizePricing(rawProduct: any): NormalizedPricing | null {
+    const externalId = rawProduct?.productId;
+    if (!externalId) return null;
+
+    // NOTE(review): toCents is not defined in this class - presumably inherited
+    // from BaseNormalizer and side-effect free; confirm before relying on the
+    // single call below.
+    const priceCents = this.toCents(rawProduct.price);
+
+    return {
+      externalProductId: String(externalId),
+
+      // Treez typically shows a single price
+      priceRec: priceCents,
+      priceRecMin: priceCents,
+      priceRecMax: priceCents,
+      priceRecSpecial: null,
+
+      // Treez doesn't distinguish med pricing in DOM
+      priceMed: null,
+      priceMedMin: null,
+      priceMedMax: null,
+      priceMedSpecial: null,
+
+      isOnSpecial: false,
+      specialName: null,
+      discountPercent: null,
+    };
+  }
+
+  /**
+   * Build the availability record. A product counts as in stock unless the
+   * scraper explicitly recorded `inStock: false`.
+   */
+  protected normalizeAvailability(rawProduct: any): NormalizedAvailability | null {
+    const externalId = rawProduct?.productId;
+    if (!externalId) return null;
+
+    const inStock = rawProduct.inStock !== false;
+
+    return {
+      externalProductId: String(externalId),
+      inStock,
+      stockStatus: inStock ? 'in_stock' : 'out_of_stock',
+      quantity: null, // Treez doesn't expose quantity in DOM
+      quantityAvailable: null,
+      isBelowThreshold: false,
+      optionsBelowThreshold: false,
+    };
+  }
+
+  /**
+   * Build a brand record from the scraped brand text, or null when it is empty.
+   */
+  protected extractBrand(rawProduct: any): NormalizedBrand | null {
+    const brandName = rawProduct?.brand;
+    if (!brandName) return null;
+
+    return {
+      externalBrandId: null, // Treez doesn't expose brand IDs
+      name: brandName,
+      slug: this.slugify(brandName),
+      logoUrl: null,
+    };
+  }
+
+  /**
+   * Build a category record. The name is canonicalised; the slug is derived
+   * from the category text exactly as scraped.
+   */
+  protected extractCategory(rawProduct: any): NormalizedCategory | null {
+    const categoryName = rawProduct?.category;
+    if (!categoryName) return null;
+
+    return {
+      name: this.normalizeCategory(categoryName) || categoryName,
+      slug: this.slugify(categoryName),
+      parentCategory: null,
+    };
+  }
+
+  // ============================================================
+  // HELPERS
+  // ============================================================
+
+  /**
+   * Normalize category name to standard format.
+   *
+   * Matching is case-insensitive and ignores surrounding whitespace. Text with
+   * no canonical form is returned unchanged; empty input returns null.
+   */
+  private normalizeCategory(category: string | null | undefined): string | null {
+    if (!category) return null;
+
+    const lookupKey = category.toLowerCase().trim();
+    return TREEZ_CATEGORY_NAMES.get(lookupKey) || category;
+  }
+}
diff --git a/backend/src/platforms/treez/client.ts b/backend/src/platforms/treez/client.ts
new file mode 100644
index 00000000..d7cc3bcd
--- /dev/null
+++ b/backend/src/platforms/treez/client.ts
@@ -0,0 +1,570 @@
+/**
+ * ============================================================
+ * TREEZ PLATFORM CLIENT
+ * 
============================================================
+ *
+ * Treez is a fully client-side rendered platform (React/Next.js).
+ * Unlike Dutchie (GraphQL) or Jane (Algolia), Treez requires DOM
+ * parsing after page render. No API endpoints are available.
+ *
+ * Key differences:
+ * - No Cloudflare protection (simpler than Jane)
+ * - Products loaded via infinite scroll
+ * - Data extracted from DOM elements
+ * - Age gate must be bypassed
+ *
+ * URL Pattern: https://{storeId}.treez.io/onlinemenu/?customerType=ADULT
+ * Store ID Format: String slug (e.g., "best")
+ *
+ * ============================================================
+ */
+
+import puppeteer, { Browser, Page } from 'puppeteer';
+import puppeteerExtra from 'puppeteer-extra';
+import StealthPlugin from 'puppeteer-extra-plugin-stealth';
+
+import type { CrawlRotator, BrowserFingerprint } from '../../services/crawl-rotator';
+
+// Register stealth plugin (good practice even without Cloudflare)
+puppeteerExtra.use(StealthPlugin());
+
+// ============================================================
+// TYPES
+// ============================================================
+
+/** One product as scraped from a Treez menu card. */
+export interface TreezProductRaw {
+  productId: string;
+  name: string;
+  brand: string;
+  category: string;
+  subcategory: string; // indica, sativa, hybrid
+  thcPercent: number | null;
+  cbdPercent: number | null;
+  price: number | null;
+  priceUnit: string;
+  imageUrl: string | null;
+  inStock: boolean;
+  weight: string | null;
+}
+
+/** A live browser session together with the identity it was launched with. */
+export interface TreezSession {
+  sessionId: string;
+  browser: Browser;
+  page: Page;
+  fingerprint: BrowserFingerprint;
+  proxyUrl: string | null;
+  startedAt: Date;
+  storeId?: string;
+}
+
+/** Basic store identity captured alongside a product fetch. */
+export interface TreezStoreInfo {
+  storeId: string;
+  name: string;
+  url: string;
+}
+
+// ============================================================
+// CONFIGURATION
+// ============================================================
+
+export const TREEZ_CONFIG = {
+  baseUrl: 'https://{storeId}.treez.io/onlinemenu/',
+  timeout: 60000,
+  navigationTimeout: 60000,
+  scrollDelay: 1500,
+  maxScrollAttempts: 50,
+  ageGateDelay: 2000,
+};
+
+// ============================================================
+// SESSION MANAGEMENT
+// ============================================================
+
+// Module-level singletons: at most one session and one rotator per process.
+let currentSession: TreezSession | null = null;
+let crawlRotator: CrawlRotator | null = null;
+
+/**
+ * Set CrawlRotator for proxy/fingerprint management
+ */
+export function setCrawlRotator(rotator: CrawlRotator | null): void {
+  crawlRotator = rotator;
+  if (rotator) {
+    console.log('[Treez Client] CrawlRotator attached');
+  }
+}
+
+/**
+ * Get attached CrawlRotator
+ */
+export function getCrawlRotator(): CrawlRotator | null {
+  return crawlRotator;
+}
+
+/**
+ * Start a new Treez browser session
+ *
+ * Any session that is already open is ended first. The fingerprint and proxy
+ * come from the attached CrawlRotator; without one, a fixed desktop-Chrome
+ * fingerprint and no proxy are used. If page setup fails after the browser has
+ * launched, the browser is closed before the error is rethrown.
+ *
+ * @param storeId - Optional store ID recorded on the session.
+ * @returns The newly created session (also stored as the current session).
+ */
+export async function startSession(storeId?: string): Promise<TreezSession> {
+  if (currentSession) {
+    console.log('[Treez Client] Closing existing session before starting new one');
+    await endSession();
+  }
+
+  // Get fingerprint from rotator or use defaults
+  let fingerprint: BrowserFingerprint;
+  let proxyUrl: string | null = null;
+
+  if (crawlRotator) {
+    fingerprint = crawlRotator.userAgent.getCurrent();
+    const proxy = crawlRotator.proxy.getCurrent();
+    if (proxy) {
+      proxyUrl = crawlRotator.proxy.getProxyUrl(proxy);
+    }
+  } else {
+    // Default fingerprint for local testing
+    fingerprint = {
+      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
+      browserName: 'Chrome',
+      deviceCategory: 'desktop',
+      platform: 'Windows',
+      screenWidth: 1920,
+      screenHeight: 1080,
+      viewportWidth: 1920,
+      viewportHeight: 1080,
+      acceptLanguage: 'en-US,en;q=0.9',
+      secChUa: '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
+      secChUaPlatform: '"Windows"',
+      secChUaMobile: '?0',
+      httpFingerprint: {
+        browserType: 'Chrome' as const,
+        headers: {},
+        headerOrder: [],
+        curlImpersonateBinary: 'curl_chrome131',
+        hasDNT: false,
+      },
+    };
+  }
+
+  // Build browser args
+  const browserArgs = [
+    '--no-sandbox',
+    '--disable-setuid-sandbox',
+    '--disable-dev-shm-usage',
+    '--disable-blink-features=AutomationControlled',
+  ];
+
+  if (proxyUrl) {
+    // Chromium's --proxy-server takes host:port only; credentials are supplied
+    // separately through page.authenticate() below.
+    const proxyMatch = proxyUrl.match(/:\/\/([^@]+@)?([^/]+)/);
+    if (proxyMatch) {
+      browserArgs.push(`--proxy-server=${proxyMatch[2]}`);
+    }
+  }
+
+  console.log('[Treez Client] Launching browser...');
+  const browser = await puppeteerExtra.launch({
+    headless: true,
+    args: browserArgs,
+  });
+
+  try {
+    const page = await browser.newPage();
+
+    // Set viewport
+    await page.setViewport({
+      width: fingerprint.viewportWidth || 1920,
+      height: fingerprint.viewportHeight || 1080,
+    });
+
+    // Set user agent
+    await page.setUserAgent(fingerprint.userAgent);
+
+    // Block unnecessary resources to save bandwidth
+    // We only need HTML/JS for DOM extraction - not images, fonts, etc.
+    await page.setRequestInterception(true);
+    page.on('request', (request) => {
+      const resourceType = request.resourceType();
+      if (['image', 'font', 'media', 'stylesheet'].includes(resourceType)) {
+        request.abort();
+      } else {
+        request.continue();
+      }
+    });
+
+    // Handle proxy authentication if needed
+    if (proxyUrl) {
+      const authMatch = proxyUrl.match(/:\/\/([^:]+):([^@]+)@/);
+      if (authMatch) {
+        await page.authenticate({
+          username: authMatch[1],
+          password: authMatch[2],
+        });
+      }
+    }
+
+    const sessionId = `treez_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`;
+
+    currentSession = {
+      sessionId,
+      browser,
+      page,
+      fingerprint,
+      proxyUrl,
+      startedAt: new Date(),
+      storeId,
+    };
+
+    console.log(`[Treez Client] Started session ${sessionId}`);
+    console.log(`[Treez Client] Browser: ${fingerprint.browserName} (${fingerprint.deviceCategory})`);
+    if (proxyUrl) {
+      // Mask the password before logging
+      console.log(`[Treez Client] Proxy: ${proxyUrl.replace(/:[^:@]+@/, ':***@')}`);
+    }
+
+    return currentSession;
+  } catch (err) {
+    // The browser is not yet registered as currentSession, so endSession()
+    // could never reach it - close it here or the process is orphaned.
+    // A close failure is ignored on purpose so the original error surfaces.
+    await browser.close().catch(() => undefined);
+    throw err;
+  }
+}
+
+/**
+ * End the current browser session
+ *
+ * No-op when no session is active. The session handle is cleared before the
+ * browser is closed, and a failing close is logged rather than thrown.
+ */
+export async function endSession(): Promise<void> {
+  const session = currentSession;
+  if (!session) return;
+
+  // Clear first: a call that arrives while close() is pending must not try to
+  // close the same browser a second time.
+  currentSession = null;
+
+  const duration = Math.round((Date.now() - session.startedAt.getTime()) / 1000);
+  console.log(`[Treez Client] Ending session ${session.sessionId} (${duration}s)`);
+
+  try {
+    await session.browser.close();
+  } catch (e) {
+    console.warn('[Treez Client] Error closing browser:', e);
+  }
+}
+
+/**
+ * Get current active session
+ */
+export function getCurrentSession(): TreezSession | null {
+  return currentSession;
+}
+
+// ============================================================
+// AGE GATE HANDLING
+// ============================================================
+
+/**
+ * Bypass age gate if present
+ *
+ * @returns true when a gate was found and its confirm button was clicked;
+ *          false when there was no gate, no button, or an error occurred.
+ *          Errors are logged, never thrown.
+ */
+export async function bypassAgeGate(page: Page): Promise<boolean> {
+  console.log('[Treez Client] Checking for age gate...');
+
+  // One selector for both detection and the dismissal wait, so a gate matched
+  // only by class name is waited for as well.
+  const gateSelector = '[data-testid="age-gate-modal"], [class*="AgeGate"]';
+
+  try {
+    const ageGate = await page.$(gateSelector);
+    if (!ageGate) {
+      console.log('[Treez Client] No age gate detected');
+      return false;
+    }
+
+    console.log('[Treez Client] Age gate detected, clicking confirm button...');
+
+    const submitBtn = await page.$('[data-testid="age-gate-submit-button"], button[type="submit"]');
+    if (!submitBtn) {
+      console.log('[Treez Client] No submit button found');
+      return false;
+    }
+
+    await submitBtn.click();
+    console.log('[Treez Client] Clicked confirm button');
+
+    await sleep(TREEZ_CONFIG.ageGateDelay);
+
+    // Wait for age gate to disappear; a timeout here is tolerated
+    await page.waitForFunction(
+      (selector: string) => !document.querySelector(selector),
+      { timeout: 10000 },
+      gateSelector
+    ).catch(() => {
+      console.log('[Treez Client] Gate may still be visible, continuing anyway');
+    });
+
+    console.log('[Treez Client] Age gate bypassed');
+    return true;
+  } catch (err: unknown) {
+    const message = err instanceof Error ? err.message : String(err);
+    console.log(`[Treez Client] Age gate error: ${message}`);
+    return false;
+  }
+}
+
+// ============================================================
+// NAVIGATION & SCRAPING
+// 
============================================================
+
+/**
+ * Build menu URL for a store
+ */
+export function buildMenuUrl(storeId: string, customerType: 'ADULT' | 'MEDICAL' = 'ADULT'): string {
+  return `https://${storeId}.treez.io/onlinemenu/?customerType=${customerType}`;
+}
+
+/**
+ * Navigate to a store's menu page
+ *
+ * Loads the adult menu URL, pauses for rendering, attempts the age gate, then
+ * pauses again for content.
+ *
+ * @throws Error when no session is active.
+ */
+export async function navigateToMenu(storeId: string): Promise<void> {
+  if (!currentSession) {
+    throw new Error('[Treez Client] No active session - call startSession() first');
+  }
+
+  const { page } = currentSession;
+  const url = buildMenuUrl(storeId);
+
+  console.log(`[Treez Client] Navigating to ${url}`);
+
+  await page.goto(url, {
+    waitUntil: 'networkidle2',
+    timeout: TREEZ_CONFIG.navigationTimeout,
+  });
+
+  // Wait for React app to render
+  await sleep(2000);
+
+  // Bypass age gate
+  await bypassAgeGate(page);
+
+  // Wait for content to load
+  await sleep(2000);
+
+  console.log('[Treez Client] Menu page loaded');
+}
+
+/**
+ * Scroll to load all products (infinite scroll)
+ *
+ * Scrolls to the bottom repeatedly until the document height is unchanged on
+ * three consecutive checks or `TREEZ_CONFIG.maxScrollAttempts` is reached.
+ *
+ * @returns Number of scrolls performed.
+ */
+export async function scrollToLoadAll(page: Page): Promise<number> {
+  let previousHeight = 0;
+  let scrollCount = 0;
+  let sameHeightCount = 0;
+
+  console.log('[Treez Client] Starting infinite scroll...');
+
+  while (scrollCount < TREEZ_CONFIG.maxScrollAttempts) {
+    const currentHeight = await page.evaluate(() => document.body.scrollHeight);
+
+    if (currentHeight === previousHeight) {
+      sameHeightCount++;
+      if (sameHeightCount >= 3) {
+        console.log('[Treez Client] No new content after 3 attempts, stopping');
+        break;
+      }
+    } else {
+      sameHeightCount = 0;
+    }
+
+    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
+    await sleep(TREEZ_CONFIG.scrollDelay);
+
+    previousHeight = currentHeight;
+    scrollCount++;
+
+    // Progress log every fifth scroll
+    if (scrollCount % 5 === 0) {
+      const productCount = await page.evaluate(() => {
+        return document.querySelectorAll('[class*="product_product__"]').length;
+      });
+      console.log(`[Treez Client] Scroll ${scrollCount}: ${productCount} products loaded`);
+    }
+  }
+
+  return scrollCount;
+}
+
+/**
+ * Extract products from the current page
+ *
+ * Runs inside the page and reads every product card currently in the DOM.
+ * Cards are de-duplicated by product name (first occurrence wins). When a card
+ * has no `/product/...` link, the ID falls back to a slug of the first 30
+ * characters of the name. A card whose extraction throws is skipped.
+ */
+export async function extractProducts(page: Page): Promise<TreezProductRaw[]> {
+  console.log('[Treez Client] Extracting products from DOM...');
+
+  const products = await page.evaluate(() => {
+    const results: any[] = [];
+
+    // Find all product cards
+    const productElements = Array.from(
+      document.querySelectorAll('[class*="product_product__"]')
+    ).filter(el => {
+      const hasName = el.querySelector('[class*="product__name"]') || el.querySelector('[class*="name__"]');
+      const hasPrice = el.querySelector('[class*="price"]');
+      return hasName || hasPrice;
+    });
+
+    const seen = new Set<string>();
+
+    for (const el of productElements) {
+      try {
+        // Get product name
+        const nameEl = el.querySelector('[class*="product__name"], [class*="name__"]');
+        const name = nameEl?.textContent?.trim() || '';
+
+        if (!name || seen.has(name)) continue;
+        seen.add(name);
+
+        // Get product ID from link
+        const linkEl = el.querySelector('a[href*="/product/"]');
+        let productId = '';
+        if (linkEl) {
+          const href = linkEl.getAttribute('href') || '';
+          const match = href.match(/\/product\/([^\/\?]+)/);
+          productId = match ? match[1] : '';
+        }
+        if (!productId) {
+          productId = `treez_${name.replace(/\s+/g, '_').toLowerCase().slice(0, 30)}`;
+        }
+
+        // Get brand
+        const brandEl = el.querySelector('[class*="brand"], [class*="Brand"]');
+        const brand = brandEl?.textContent?.trim() || '';
+
+        // Get price. Thousands separators and 1- or 2-digit cents are accepted
+        // so "$1,250.00" parses as 1250 rather than 1.
+        const priceEl = el.querySelector('[class*="price__ins"], [class*="price"]');
+        const priceText = priceEl?.textContent || '';
+        const priceMatch = priceText.match(/\$(\d[\d,]*(?:\.\d{1,2})?)/);
+        const price = priceMatch ? parseFloat(priceMatch[1].replace(/,/g, '')) : null;
+
+        // Get image URL (unwrap the original from a Next.js image-proxy URL)
+        const imgEl = el.querySelector('img');
+        let imageUrl = imgEl?.getAttribute('src') || null;
+        if (imageUrl && imageUrl.includes('/_next/image')) {
+          const urlMatch = imageUrl.match(/url=([^&]+)/);
+          if (urlMatch) {
+            imageUrl = decodeURIComponent(urlMatch[1]);
+          }
+        }
+
+        // Get text content for data extraction
+        const text = el.textContent || '';
+        const textLower = text.toLowerCase();
+
+        // Get THC/CBD
+        const thcMatch = text.match(/(?:THC[:\s]*)?(\d+(?:\.\d+)?)\s*%?\s*THC/i) ||
+                         text.match(/THC[:\s]*(\d+(?:\.\d+)?)\s*%?/i);
+        const cbdMatch = text.match(/(?:CBD[:\s]*)?(\d+(?:\.\d+)?)\s*%?\s*CBD/i) ||
+                         text.match(/CBD[:\s]*(\d+(?:\.\d+)?)\s*%?/i);
+        const thcPercent = thcMatch ? parseFloat(thcMatch[1]) : null;
+        const cbdPercent = cbdMatch ? parseFloat(cbdMatch[1]) : null;
+
+        // Get weight from name
+        const weightMatch = name.match(/(\d+(?:\.\d+)?)\s*(G|g|MG|mg|OZ|oz)/i);
+        const weight = weightMatch ? `${weightMatch[1]}${weightMatch[2].toLowerCase()}` : null;
+
+        // Determine category from weight and name (not full text to avoid nav pollution)
+        let category = '';
+
+        // Check explicit category patterns in NAME ONLY (not full text)
+        // This avoids false positives from navigation elements
+        const categoryPatterns = [
+          { pattern: /vape|cart(?:ridge)?|pen|pod/i, category: 'vape' },
+          { pattern: /edible|gummy|gummies|chocolate|candy/i, category: 'edible' },
+          { pattern: /concentrate|dab|wax|shatter|rosin|resin/i, category: 'concentrate' },
+          { pattern: /pre.?roll|joint|blunt/i, category: 'pre-roll' },
+          { pattern: /topical|balm|cream|lotion/i, category: 'topical' },
+          { pattern: /tincture/i, category: 'tincture' },
+        ];
+        for (const { pattern, category: cat } of categoryPatterns) {
+          if (pattern.test(name)) {
+            category = cat;
+            break;
+          }
+        }
+
+        // If no explicit category found, infer from weight
+        if (!category && weight) {
+          const weightLower = weight.toLowerCase();
+          if (weightLower.includes('g') && !weightLower.includes('mg')) {
+            // Gram weights (3.5g, 1g, 7g, etc.) are typically flower
+            category = 'flower';
+          } else if (weightLower.includes('mg')) {
+            // Milligram weights are typically edibles
+            category = 'edible';
+          }
+        }
+
+        // Get strain type
+        const strainTypes = ['indica', 'sativa', 'hybrid'];
+        let subcategory = '';
+        for (const strain of strainTypes) {
+          if (textLower.includes(strain)) {
+            subcategory = strain;
+            break;
+          }
+        }
+
+        // Check stock status
+        const inStock = !textLower.includes('out of stock') && !textLower.includes('sold out');
+
+        results.push({
+          productId,
+          name,
+          brand,
+          category,
+          subcategory,
+          thcPercent,
+          cbdPercent,
+          price,
+          priceUnit: weight || '',
+          imageUrl,
+          inStock,
+          weight,
+        });
+      } catch (err) {
+        // Skip products that fail extraction
+      }
+    }
+
+    return results;
+  });
+
+  console.log(`[Treez Client] Extracted ${products.length} products`);
+  return products;
+}
+
+/**
+ * Fetch all products from a store
+ * Main entry point for product discovery
+ *
+ * Navigates, scrolls, extracts, and reports success to the attached
+ * CrawlRotator when at least one product was captured.
+ *
+ * @throws Error when no session is active.
+ */
+export async function fetchAllProducts(storeId: string): Promise<{
+  products: TreezProductRaw[];
+  storeInfo: TreezStoreInfo;
+  scrollCount: number;
+}> {
+  if (!currentSession) {
+    throw new Error('[Treez Client] No active session - call startSession() first');
+  }
+
+  const { page } = currentSession;
+
+  // Navigate to menu
+  await navigateToMenu(storeId);
+
+  // Get page title for store info; the segment after the first "|" is used
+  // as the store name when present
+  const pageTitle = await page.title();
+  const storeInfo: TreezStoreInfo = {
+    storeId,
+    name: pageTitle.split('|')[1]?.trim() || pageTitle,
+    url: buildMenuUrl(storeId),
+  };
+
+  // Scroll to load all products
+  const scrollCount = await scrollToLoadAll(page);
+
+  // Extract products
+  const products = await extractProducts(page);
+
+  // Record success if we got products
+  if (crawlRotator && products.length > 0) {
+    await crawlRotator.recordSuccess();
+  }
+
+  return { products, storeInfo, scrollCount };
+}
+
+// 
============================================================
+// UTILITY
+// ============================================================
+
+/** Resolve after `ms` milliseconds. */
+function sleep(ms: number): Promise<void> {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
diff --git a/backend/src/platforms/treez/index.ts b/backend/src/platforms/treez/index.ts
new file mode 100644
index 00000000..1a764ac9
--- /dev/null
+++ b/backend/src/platforms/treez/index.ts
@@ -0,0 +1,50 @@
+/**
+ * Treez Platform Module
+ *
+ * Single export point for all Treez communication.
+ * All Treez workers MUST import from this module.
+ */
+
+export {
+  // Session Management
+  startSession,
+  endSession,
+  getCurrentSession,
+
+  // Proxy/Rotation
+  setCrawlRotator,
+  getCrawlRotator,
+
+  // Core Operations
+  navigateToMenu,
+  scrollToLoadAll,
+  extractProducts,
+  fetchAllProducts,
+  bypassAgeGate,
+
+  // URL Building
+  buildMenuUrl,
+
+  // Configuration
+  TREEZ_CONFIG,
+
+  // Types
+  type TreezSession,
+  type TreezStoreInfo,
+  type TreezProductRaw,
+} from './client';
+
+// High-level Query Functions
+export {
+  fetchProductsByStoreId,
+  fetchProductsFromUrl,
+  extractStoreIdFromUrl,
+  validateStoreId,
+  getMenuUrl,
+
+  // Types
+  type FetchProductsResult,
+} from './queries';
+
+// Re-export CrawlRotator types from canonical location
+export type { CrawlRotator, Proxy, ProxyStats } from '../../services/crawl-rotator';
diff --git a/backend/src/platforms/treez/queries.ts b/backend/src/platforms/treez/queries.ts
new file mode 100644
index 00000000..a07790ee
--- /dev/null
+++ b/backend/src/platforms/treez/queries.ts
@@ -0,0 +1,132 @@
+/**
+ * Treez High-Level Query Functions
+ *
+ * Wraps the low-level client methods with business logic
+ * for common operations like product fetching.
+ */
+
+import {
+  startSession,
+  endSession,
+  fetchAllProducts,
+  buildMenuUrl,
+  TreezProductRaw,
+  TreezStoreInfo,
+} from './client';
+
+// ============================================================
+// PRODUCT OPERATIONS
+// ============================================================
+
+/** Result of a full product fetch for one store. */
+export interface FetchProductsResult {
+  store: TreezStoreInfo;
+  products: TreezProductRaw[];
+  totalCaptured: number;
+  scrollCount: number;
+}
+
+/**
+ * Fetch all products from a Treez store
+ *
+ * Opens a browser session for the duration of the call and always ends it,
+ * whether the fetch succeeds or throws.
+ *
+ * @param storeId - Treez store ID (slug like "best")
+ * @returns Products and store data captured from the page
+ */
+export async function fetchProductsByStoreId(storeId: string): Promise<FetchProductsResult> {
+  try {
+    await startSession(storeId);
+
+    const { products, storeInfo, scrollCount } = await fetchAllProducts(storeId);
+
+    return {
+      store: storeInfo,
+      products,
+      totalCaptured: products.length,
+      scrollCount,
+    };
+  } finally {
+    await endSession();
+  }
+}
+
+/**
+ * Fetch products from a Treez menu URL
+ * Extracts store ID from URL and fetches products
+ *
+ * @param menuUrl - Full Treez menu URL
+ * @returns Products and store data
+ * @throws Error when no store ID can be extracted from the URL
+ */
+export async function fetchProductsFromUrl(menuUrl: string): Promise<FetchProductsResult> {
+  const storeId = extractStoreIdFromUrl(menuUrl);
+  if (!storeId) {
+    throw new Error(`Could not extract store ID from URL: ${menuUrl}`);
+  }
+
+  return fetchProductsByStoreId(storeId);
+}
+
+// ============================================================
+// STORE OPERATIONS
+// ============================================================
+
+/**
+ * Extract store ID from a Treez URL
+ *
+ * Supports formats:
+ * - https://best.treez.io/onlinemenu/
+ *
+ * Custom domains (e.g. https://shop.bestdispensary.com/) are not resolved
+ * and return null.
+ *
+ * @param url - Treez menu URL
+ * @returns Store ID or null if not found
+ */
+export function extractStoreIdFromUrl(url: string): string | null {
+  // Pattern 1: {storeId}.treez.io
+  // Only hostname-label characters are accepted. The ID is interpolated into
+  // the host of the menu URL, so "/", "@" or ":" in it would redirect the
+  // crawl to a different host.
+  const treezMatch = url.match(/https?:\/\/([a-z0-9-]+)\.treez\.io/i);
+  if (treezMatch) {
+    return treezMatch[1];
+  }
+
+  // Pattern 2: Custom domain - would need to follow redirect
+  // For now, return null and let the caller handle it
+  return null;
+}
+
+/**
+ * Validate that a store ID exists and is accessible
+ *
+ * Loads the store's menu URL and treats a page title containing "404" or
+ * "not found" as missing. Any error while launching or navigating also counts
+ * as not accessible. The session opened for the check is always ended.
+ *
+ * @param storeId - Treez store ID
+ * @returns True if store is accessible
+ */
+export async function validateStoreId(storeId: string): Promise<boolean> {
+  try {
+    // startSession() returns the session it just created
+    const { page } = await startSession(storeId);
+
+    await page.goto(buildMenuUrl(storeId), {
+      waitUntil: 'domcontentloaded',
+      timeout: 30000,
+    });
+
+    // Check if we got a valid page (not 404)
+    const title = (await page.title()).toLowerCase();
+    const is404 = title.includes('404') || title.includes('not found');
+
+    return !is404;
+  } catch {
+    return false;
+  } finally {
+    await endSession();
+  }
+}
+
+// ============================================================
+// UTILITY FUNCTIONS
+// ============================================================
+
+/**
+ * Get the direct Treez menu URL for a store
+ */
+export function getMenuUrl(storeId: string, customerType: 'ADULT' | 'MEDICAL' = 'ADULT'): string {
+  return buildMenuUrl(storeId, customerType);
+}
diff --git a/backend/src/tasks/handlers/index.ts b/backend/src/tasks/handlers/index.ts
index 24525c77..90c5eb68 100644
--- a/backend/src/tasks/handlers/index.ts
+++ b/backend/src/tasks/handlers/index.ts
@@ -27,3 +27,6 @@ export { handleStoreDiscoveryDutchie } from './store-discovery-dutchie';
 export { handleStoreDiscoveryJane } from './store-discovery-jane';
 export { handleEntryPointDiscoveryJane } from './entry-point-discovery-jane';
 export { handleProductDiscoveryJane } from './product-discovery-jane';
+
+// Treez Platform Handlers
+export { handleProductDiscoveryTreez } from './product-discovery-treez';
diff --git a/backend/src/tasks/handlers/product-discovery-dutchie.ts b/backend/src/tasks/handlers/product-discovery-dutchie.ts
index 23889069..55c2826c 100644 --- a/backend/src/tasks/handlers/product-discovery-dutchie.ts +++ b/backend/src/tasks/handlers/product-discovery-dutchie.ts @@ -126,6 +126,28 @@ export async function handleProductDiscoveryDutchie(ctx: TaskContext): Promise { + const { pool, task, crawlRotator } = ctx; + const dispensaryId = task.dispensary_id; + + if (!dispensaryId) { + return { + success: false, + error: 'Missing dispensary_id in task', + }; + } + + console.log(`[TreezProductDiscovery] Starting for dispensary ${dispensaryId}`); + + try { + // Load dispensary + const dispResult = await pool.query( + `SELECT id, name, menu_url, platform_dispensary_id, menu_type, platform + FROM dispensaries WHERE id = $1`, + [dispensaryId] + ); + + if (dispResult.rows.length === 0) { + return { + success: false, + error: `Dispensary ${dispensaryId} not found`, + }; + } + + const dispensary = dispResult.rows[0]; + + if (!dispensary.platform_dispensary_id) { + return { + success: false, + error: `Dispensary ${dispensaryId} has no platform_dispensary_id (Treez store ID)`, + }; + } + + const storeId = dispensary.platform_dispensary_id; + console.log(`[TreezProductDiscovery] Fetching products for Treez store "${storeId}"`); + + // Attach crawl rotator + if (crawlRotator) { + setCrawlRotator(crawlRotator); + } + + // Fetch products via DOM scraping + const result = await fetchProductsByStoreId(storeId); + + if (result.products.length === 0) { + console.warn(`[TreezProductDiscovery] No products captured for dispensary ${dispensaryId}`); + + // Update dispensary with failure + await pool.query( + `UPDATE dispensaries + SET consecutive_failures = consecutive_failures + 1, + updated_at = NOW() + WHERE id = $1`, + [dispensaryId] + ); + + return { + success: false, + error: 'No products captured from Treez menu page', + productCount: 0, + }; + } + + console.log(`[TreezProductDiscovery] Captured ${result.products.length} products`); + + // Build payload for storage + const rawPayload = { + 
products: result.products, // Store the scraped product data + store: { + storeId: result.store.storeId, + name: result.store.name, + url: result.store.url, + }, + capturedAt: new Date().toISOString(), + platform: 'treez', + dispensaryId, + scrollCount: result.scrollCount, + }; + + // Save raw payload to filesystem (platform = 'treez') + const { id: payloadId, sizeBytes } = await saveRawPayload( + pool, + dispensaryId, + rawPayload, + null, // crawl_run_id + result.products.length, + 'treez' // platform + ); + + console.log(`[TreezProductDiscovery] Saved payload ${payloadId} (${Math.round(sizeBytes / 1024)}KB)`); + + // Update dispensary stage and timestamps + await pool.query( + `UPDATE dispensaries + SET stage = 'hydrating', + last_fetch_at = NOW(), + product_count = $2, + consecutive_successes = consecutive_successes + 1, + consecutive_failures = 0, + updated_at = NOW() + WHERE id = $1`, + [dispensaryId, result.products.length] + ); + + // Queue product_refresh task for normalization + console.log(`[TreezProductDiscovery] Queuing product_refresh for payload ${payloadId}`); + await taskService.createTask({ + role: 'product_refresh', + dispensary_id: dispensaryId, + platform: 'treez', + priority: task.priority || 0, + payload: { payload_id: payloadId }, + }); + + return { + success: true, + productCount: result.products.length, + payloadId, + payloadSizeKB: Math.round(sizeBytes / 1024), + storeInfo: { + storeId: result.store.storeId, + name: result.store.name, + }, + scrollCount: result.scrollCount, + queuedProductRefresh: true, + }; + } catch (error: unknown) { + const errorMessage = error instanceof Error ? 
error.message : 'Unknown error'; + console.error(`[TreezProductDiscovery] Error:`, errorMessage); + + // Update dispensary with failure + await pool.query( + `UPDATE dispensaries + SET consecutive_failures = consecutive_failures + 1, + stage = CASE WHEN consecutive_failures >= 2 THEN 'failing' ELSE stage END, + updated_at = NOW() + WHERE id = $1`, + [dispensaryId] + ).catch(() => {}); + + return { + success: false, + error: errorMessage, + }; + } +} diff --git a/backend/src/tasks/handlers/product-refresh.ts b/backend/src/tasks/handlers/product-refresh.ts index a2ca532c..acf037f6 100644 --- a/backend/src/tasks/handlers/product-refresh.ts +++ b/backend/src/tasks/handlers/product-refresh.ts @@ -24,6 +24,7 @@ import { TaskContext, TaskResult } from '../task-worker'; import { DutchieNormalizer } from '../../hydration/normalizers/dutchie'; import { JaneNormalizer } from '../../hydration/normalizers/jane'; +import { TreezNormalizer } from '../../hydration/normalizers/treez'; import { BaseNormalizer } from '../../hydration/normalizers/base'; import { upsertStoreProducts, @@ -37,6 +38,7 @@ import { taskService } from '../task-service'; const NORMALIZERS: Record = { dutchie: new DutchieNormalizer(), jane: new JaneNormalizer(), + treez: new TreezNormalizer(), }; /** diff --git a/backend/src/tasks/task-worker.ts b/backend/src/tasks/task-worker.ts index 475ac9b8..a4eb6609 100644 --- a/backend/src/tasks/task-worker.ts +++ b/backend/src/tasks/task-worker.ts @@ -87,6 +87,9 @@ import { handleStoreDiscoveryJane } from './handlers/store-discovery-jane'; import { handleEntryPointDiscoveryJane } from './handlers/entry-point-discovery-jane'; import { handleProductDiscoveryJane } from './handlers/product-discovery-jane'; +// Treez Platform Handlers +import { handleProductDiscoveryTreez } from './handlers/product-discovery-treez'; + const POLL_INTERVAL_MS = parseInt(process.env.POLL_INTERVAL_MS || '5000'); const HEARTBEAT_INTERVAL_MS = parseInt(process.env.HEARTBEAT_INTERVAL_MS || 
'30000'); const API_BASE_URL = process.env.API_BASE_URL || 'http://localhost:3010'; @@ -136,6 +139,14 @@ const CPU_BACKOFF_THRESHOLD = parseFloat(process.env.CPU_BACKOFF_THRESHOLD || '0 // How long to wait (ms) when in backoff state before rechecking resources const BACKOFF_DURATION_MS = parseInt(process.env.BACKOFF_DURATION_MS || '10000'); +export interface WorkerFingerprint { + timezone?: string; + city?: string; + state?: string; + ip?: string; + locale?: string; +} + export interface TaskContext { pool: Pool; workerId: string; @@ -144,6 +155,8 @@ export interface TaskContext { crawlRotator?: CrawlRotator; /** Update the current step being executed (shown in dashboard) */ updateStep: (step: string, detail?: string) => void; + /** Worker's stored fingerprint from preflight (timezone, locale, etc.) */ + fingerprint?: WorkerFingerprint; } export interface TaskResult { @@ -201,6 +214,17 @@ function getHandlerForTask(task: WorkerTask): TaskHandler | undefined { } } + // ========================================================================== + // TREEZ PLATFORM ROUTING + // ========================================================================== + if (platform === 'treez') { + if (role === 'product_discovery') { + console.log(`[TaskWorker] Using Treez handler for product_discovery`); + return handleProductDiscoveryTreez; + } + // Treez uses shared product_refresh handler via normalizer registry + } + // ========================================================================== // DUTCHIE PLATFORM ROUTING (default) // ========================================================================== @@ -330,6 +354,8 @@ export class TaskWorker { private geoCity: string | null = null; private geoProxyUrl: string | null = null; private geoSessionStartedAt: Date | null = null; + private storedTimezone: string | null = null; + private storedFingerprint: WorkerFingerprint | null = null; constructor(role: TaskRole | null = null, workerId?: string) { this.pool = getPool(); @@ 
-655,7 +681,22 @@ export class TaskWorker { console.log(`[TaskWorker] Preflight status reported to worker_registry`); if (this.preflightHttpResult?.proxyIp) { - console.log(`[TaskWorker] HTTP IP: ${this.preflightHttpResult.proxyIp}, Timezone: ${(this.preflightHttpResult as any).detectedTimezone || 'unknown'}`); + const detectedTimezone = (this.preflightHttpResult as any).detectedTimezone; + const detectedLocation = (this.preflightHttpResult as any).detectedLocation; + console.log(`[TaskWorker] HTTP IP: ${this.preflightHttpResult.proxyIp}, Timezone: ${detectedTimezone || 'unknown'}`); + + // Store fingerprint for task execution - CRITICAL for anti-detect consistency + if (this.preflightHttpPassed) { + this.storedTimezone = detectedTimezone || null; + this.storedFingerprint = { + timezone: detectedTimezone, + city: detectedLocation?.city, + state: detectedLocation?.region, + ip: this.preflightHttpResult.proxyIp, + locale: 'en-US', // US proxies use English + }; + console.log(`[TaskWorker] Stored fingerprint: ${JSON.stringify(this.storedFingerprint)}`); + } } } catch (err: any) { // Non-fatal - worker can still function @@ -1349,7 +1390,7 @@ export class TaskWorker { throw new Error(`No handler registered for role: ${task.role}`); } - // Create context with step tracking + // Create context with step tracking and fingerprint const ctx: TaskContext = { pool: this.pool, workerId: this.workerId, @@ -1361,6 +1402,8 @@ export class TaskWorker { updateStep: (step: string, detail?: string) => { this.updateTaskStep(task.id, step, detail); }, + // Pass stored fingerprint for browser configuration + fingerprint: this.storedFingerprint || undefined, }; // Initialize step tracking for this task diff --git a/cannaiq/dist/index.html b/cannaiq/dist/index.html index 000f5df2..67c322d2 100644 --- a/cannaiq/dist/index.html +++ b/cannaiq/dist/index.html @@ -7,8 +7,8 @@ CannaIQ - Cannabis Menu Intelligence Platform - - + +
diff --git a/cannaiq/src/pages/TasksDashboard.tsx b/cannaiq/src/pages/TasksDashboard.tsx index d0a4ab5b..7315f5a1 100644 --- a/cannaiq/src/pages/TasksDashboard.tsx +++ b/cannaiq/src/pages/TasksDashboard.tsx @@ -1421,7 +1421,10 @@ export default function TasksDashboard() { {schedule.state_code} ) : ( - - + + + All + )}