diff --git a/backend/scripts/explore-treez-pages.ts b/backend/scripts/explore-treez-pages.ts
new file mode 100644
index 00000000..b97bc501
--- /dev/null
+++ b/backend/scripts/explore-treez-pages.ts
@@ -0,0 +1,207 @@
+/**
+ * Explore all Treez page URLs to find the full product catalog
+ */
+
+import puppeteer, { Page } from 'puppeteer';
+
+const STORE_ID = 'best';
+
+/** Resolves after `ms` milliseconds. */
+async function sleep(ms: number): Promise<void> {
+  return new Promise(resolve => setTimeout(resolve, ms));
+}
+
+/**
+ * Clicks the age-gate submit button when the age-gate modal is present;
+ * does nothing otherwise.
+ */
+async function bypassAgeGate(page: Page): Promise<void> {
+  const ageGate = await page.$('[data-testid="age-gate-modal"]');
+  if (ageGate) {
+    console.log(' Age gate detected, bypassing...');
+    const btn = await page.$('[data-testid="age-gate-submit-button"]');
+    if (btn) await btn.click();
+    await sleep(2000);
+  }
+}
+
+/** Counts the product elements currently present in the DOM. */
+async function countProducts(page: Page): Promise<number> {
+  return page.evaluate(() =>
+    document.querySelectorAll('[class*="product_product__"]').length
+  );
+}
+
+/**
+ * Scrolls to the bottom repeatedly, stopping after `maxScrolls` scrolls or once
+ * the page height has been unchanged for 3 consecutive checks, then counts products.
+ */
+async function scrollAndCount(page: Page, maxScrolls: number = 30): Promise<{ products: number; scrolls: number }> {
+  let previousHeight = 0;
+  let scrollCount = 0;
+  let sameHeightCount = 0;
+
+  while (scrollCount < maxScrolls) {
+    const currentHeight = await page.evaluate(() => document.body.scrollHeight);
+
+    if (currentHeight === previousHeight) {
+      sameHeightCount++;
+      if (sameHeightCount >= 3) break;
+    } else {
+      sameHeightCount = 0;
+    }
+
+    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
+    await sleep(1500);
+
+    previousHeight = currentHeight;
+    scrollCount++;
+  }
+
+  const products = await countProducts(page);
+  return { products, scrolls: scrollCount };
+}
+
+/**
+ * Loads `path` on the store's Treez site and reports how many products it shows.
+ * Failures inside the try block are logged and returned in `error` instead of thrown.
+ */
+async function testUrl(page: Page, path: string): Promise<{ products: number; scrolls: number; error?: string }> {
+  const url = `https://${STORE_ID}.treez.io${path}`;
+  console.log(`\nTesting: ${url}`);
+
+  try {
+    await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 });
+    await sleep(2000);
+    await bypassAgeGate(page);
+    await sleep(1000);
+
+    const initialCount = await countProducts(page);
+    console.log(` Initial products: ${initialCount}`);
+
+    if (initialCount > 0) {
+      const result = await scrollAndCount(page);
+      console.log(` After scroll: ${result.products} products (${result.scrolls} scrolls)`);
+      return result;
+    }
+
+    // Check for brand/category cards instead
+    const cardCount = await page.evaluate(() => {
+      const selectors = [
+        '[class*="brand"]',
+        '[class*="Brand"]',
+        '[class*="category"]',
+        '[class*="Category"]',
+        '[class*="card"]',
+        'a[href*="/brand/"]',
+        'a[href*="/category/"]',
+      ];
+      // One combined selector list so an element matching several patterns is counted once
+      return document.querySelectorAll(selectors.join(', ')).length;
+    });
+    console.log(` Cards/links found: ${cardCount}`);
+
+    return { products: initialCount, scrolls: 0 };
+  } catch (error: unknown) {
+    // Anything can be thrown, so only read `.message` from real Error instances
+    const message = error instanceof Error ? error.message : String(error);
+    console.log(` Error: ${message}`);
+    return { products: 0, scrolls: 0, error: message };
+  }
+}
+
+/**
+ * Probes each candidate menu URL, lists the homepage navigation links and prints
+ * a summary sorted by product count. The browser is closed even when a step fails.
+ */
+async function main(): Promise<void> {
+  console.log('='.repeat(60));
+  console.log('Exploring Treez Page URLs');
+  console.log('='.repeat(60));
+
+  const browser = await puppeteer.launch({
+    headless: true,
+    args: ['--no-sandbox', '--disable-setuid-sandbox'],
+  });
+
+  try {
+    const page = await browser.newPage();
+    await page.setViewport({ width: 1920, height: 1080 });
+
+    // Block images, fonts, media and stylesheets to speed up page loads
+    await page.setRequestInterception(true);
+    page.on('request', (req) => {
+      if (['image', 'font', 'media', 'stylesheet'].includes(req.resourceType())) {
+        req.abort();
+      } else {
+        req.continue();
+      }
+    });
+
+    const urlsToTest = [
+      '/onlinemenu/?customerType=ADULT', // Homepage
+      '/onlinemenu/brands?customerType=ADULT', // Brands page
+      '/onlinemenu/shop?customerType=ADULT', // Shop page?
+      '/onlinemenu/products?customerType=ADULT', // Products page?
+      '/onlinemenu/menu?customerType=ADULT', // Menu page?
+      '/onlinemenu/all?customerType=ADULT', // All products?
+      '/onlinemenu/flower?customerType=ADULT', // Flower category
+      '/onlinemenu/vapes?customerType=ADULT', // Vapes category
+      '/onlinemenu/edibles?customerType=ADULT', // Edibles category
+      '/onlinemenu/concentrates?customerType=ADULT', // Concentrates category
+    ];
+
+    const results: { path: string; products: number; scrolls: number }[] = [];
+
+    for (const path of urlsToTest) {
+      const result = await testUrl(page, path);
+      results.push({ path, ...result });
+    }
+
+    // Look for navigation links on the main page
+    console.log('\n' + '='.repeat(60));
+    console.log('Checking navigation structure on homepage...');
+    console.log('='.repeat(60));
+
+    await page.goto(`https://${STORE_ID}.treez.io/onlinemenu/?customerType=ADULT`, {
+      waitUntil: 'networkidle2',
+      timeout: 30000,
+    });
+    await sleep(2000);
+    await bypassAgeGate(page);
+    await sleep(1000);
+
+    const navLinks = await page.evaluate(() => {
+      const links: { text: string; href: string }[] = [];
+      document.querySelectorAll('a[href*="/onlinemenu/"]').forEach(el => {
+        const text = el.textContent?.trim() || '';
+        const href = el.getAttribute('href') || '';
+        if (text && !links.some(l => l.href === href)) {
+          links.push({ text: text.slice(0, 50), href });
+        }
+      });
+      return links;
+    });
+
+    console.log('\nNavigation links found:');
+    navLinks.forEach(l => console.log(` "${l.text}" → ${l.href}`));
+
+    // Summary
+    console.log('\n' + '='.repeat(60));
+    console.log('Summary');
+    console.log('='.repeat(60));
+
+    results.sort((a, b) => b.products - a.products);
+    results.forEach(r => {
+      console.log(`${r.products.toString().padStart(4)} products | ${r.path}`);
+    });
+  } finally {
+    // Always close the browser so a failure above does not skip the shutdown
+    await browser.close();
+  }
+}
+
+main().catch((error: unknown) => {
+  console.error(error);
+  process.exitCode = 1;
+});
diff --git a/backend/src/platforms/treez/client.ts b/backend/src/platforms/treez/client.ts
index d7cc3bcd..c4488a0c 100644
--- a/backend/src/platforms/treez/client.ts
+++ b/backend/src/platforms/treez/client.ts
@@ -298,9 +298,10 @@ export async function bypassAgeGate(page: Page): Promise {
 
 /**
  * Build menu URL for a store
+ * Uses /brands page which contains all products (not just homepage carousels)
  */
 export function buildMenuUrl(storeId: string, customerType: 'ADULT' | 'MEDICAL' = 'ADULT'): string {
-  return `https://${storeId}.treez.io/onlinemenu/?customerType=${customerType}`;
+  return `https://${storeId}.treez.io/onlinemenu/brands?customerType=${customerType}`;
 }
 
 /**
diff --git a/backend/src/tasks/task-worker.ts b/backend/src/tasks/task-worker.ts
index a4eb6609..8aa5a126 100644
--- a/backend/src/tasks/task-worker.ts
+++ b/backend/src/tasks/task-worker.ts
@@ -533,24 +533,53 @@ export class TaskWorker {
     }
 
     try {
+      // ============================================================
+      // PROXY INITIALIZATION ORDER:
+      // 1. Check Evomi API first (dynamic residential proxies)
+      // 2. Fall back to DB proxies if Evomi not configured
+      //
+      // Evomi provides geo-targeted proxies on-demand via API.
+      // DB proxies are static/datacenter proxies as fallback.
+      // ============================================================
+
+      // Import Evomi config checker
+      const { getEvomiConfig } = await import('../services/crawl-rotator');
+      const evomiConfig = getEvomiConfig();
+
+      if (evomiConfig.enabled) {
+        // Evomi API is configured - we can get proxies on-demand
+        // No need to wait for DB proxies
+        console.log(`[TaskWorker] Evomi API configured (${evomiConfig.host}:${evomiConfig.port}) - proxies available on-demand`);
+
+        // Still initialize rotator for user-agent rotation
+        await this.crawlRotator.initialize();
+        setCrawlRotator(this.crawlRotator);
+
+        console.log(`[TaskWorker] Stealth initialized: ${this.crawlRotator.userAgent.getCount()} fingerprints, Evomi API for proxies`);
+        return;
+      }
+
+      // Evomi not configured - fall back to DB proxies
+      console.log(`[TaskWorker] Evomi API not configured, falling back to DB proxies...`);
+
       while (attempts < maxAttempts) {
         try {
-          // Load proxies from database
+          // Load proxies from database (fallback)
           await this.crawlRotator.initialize();
 
           const stats = this.crawlRotator.proxy.getStats();
           if (stats.activeProxies > 0) {
-            console.log(`[TaskWorker] Loaded ${stats.activeProxies} proxies (${stats.avgSuccessRate.toFixed(1)}% avg success rate)`);
+            console.log(`[TaskWorker] Loaded ${stats.activeProxies} DB proxies (${stats.avgSuccessRate.toFixed(1)}% avg success rate)`);
 
             // Wire rotator to Dutchie client - proxies will be used for ALL requests
             setCrawlRotator(this.crawlRotator);
 
-            console.log(`[TaskWorker] Stealth initialized: ${this.crawlRotator.userAgent.getCount()} fingerprints, proxy REQUIRED for all requests`);
+            console.log(`[TaskWorker] Stealth initialized: ${this.crawlRotator.userAgent.getCount()} fingerprints, DB proxies`);
             return;
           }
 
           attempts++;
-          console.log(`[TaskWorker] No active proxies available (attempt ${attempts}). Waiting for proxies...`);
+          console.log(`[TaskWorker] No DB proxies available (attempt ${attempts}). Waiting...`);
 
           // Wait for either notification or timeout
           await new Promise((resolve) => {
@@ -564,7 +593,7 @@ export class TaskWorker {
         }
       }
 
-      throw new Error(`No active proxies available after waiting ${MAX_WAIT_MINUTES} minutes. Add proxies to the database.`);
+      throw new Error(`No proxies available after ${MAX_WAIT_MINUTES} minutes. Configure EVOMI_USER/EVOMI_PASS or add proxies to database.`);
     } finally {
       // Clean up LISTEN connection
       if (notifyClient) {