From d8a22fba53135728375a4093249a5d7f8743b5d9 Mon Sep 17 00:00:00 2001
From: Kelly
Date: Sat, 13 Dec 2025 16:47:58 -0700
Subject: [PATCH] docs: Add Evomi residential proxy API documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Document priority order (Evomi API first, DB fallback)
- List environment variables and defaults
- Show K8s secret location
- Explain proxy URL format with geo targeting
- Add exploratory Treez brand-based extraction test script

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5
---
 CLAUDE.md                            |  29 +++
 backend/scripts/test-treez-brands.ts | 264 +++++++++++++++++++++++++++
 2 files changed, 293 insertions(+)
 create mode 100644 backend/scripts/test-treez-brands.ts

diff --git a/CLAUDE.md b/CLAUDE.md
index 14cb20b0..3d7396e8 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -248,6 +248,35 @@ All other browsers are filtered out. Uses `intoli/user-agents` library for reali
 
 These binaries mimic real browser TLS fingerprints to avoid detection.
 
+### Evomi Residential Proxy API
+
+Workers use Evomi's residential proxy API for geo-targeted proxies on-demand.
+
+**Priority Order**:
+1. Evomi API (if EVOMI_USER/EVOMI_PASS configured)
+2. DB proxies (fallback if Evomi not configured)
+
+**Environment Variables**:
+| Variable | Description | Default |
+|----------|-------------|---------|
+| `EVOMI_USER` | API username | - |
+| `EVOMI_PASS` | API key | - |
+| `EVOMI_HOST` | Proxy host | `rpc.evomi.com` |
+| `EVOMI_PORT` | Proxy port | `1000` |
+
+**K8s Secret**: Credentials stored in `scraper-secrets`:
+```bash
+kubectl get secret scraper-secrets -n dispensary-scraper -o jsonpath='{.data.EVOMI_PASS}' | base64 -d
+```
+
+**Proxy URL Format**: `http://{user}_{session}_{geo}:{pass}@{host}:{port}`
+- `session`: Worker ID for sticky sessions
+- `geo`: Target state as a lowercase name (e.g., `arizona`, `california`)
+
+**Files**:
+- `src/services/crawl-rotator.ts` - `getEvomiConfig()`, `buildEvomiProxyUrl()`
+- `src/tasks/task-worker.ts` - Proxy initialization order
+
 ---
 
 ## Bulk Task Workflow (Updated 2025-12-13)
diff --git a/backend/scripts/test-treez-brands.ts b/backend/scripts/test-treez-brands.ts
new file mode 100644
index 00000000..caadbde0
--- /dev/null
+++ b/backend/scripts/test-treez-brands.ts
@@ -0,0 +1,264 @@
+/**
+ * Test Treez brand-based product extraction
+ * 1. Load /brands page
+ * 2. Click "load more brands" to get all brands
+ * 3. Extract brand URLs
+ * 4. Visit each brand and extract products
+ */
+
+import puppeteer, { Page } from 'puppeteer';
+
+const STORE_ID = 'best';
+
+/** Resolve after `ms` milliseconds. */
+async function sleep(ms: number): Promise<void> {
+  return new Promise(resolve => setTimeout(resolve, ms));
+}
+
+/** If the age-gate modal is present, click its submit button (when found) and wait 2s. */
+async function bypassAgeGate(page: Page): Promise<void> {
+  const ageGate = await page.$('[data-testid="age-gate-modal"]');
+  if (ageGate) {
+    console.log('[AgeGate] Detected, bypassing...');
+    const btn = await page.$('[data-testid="age-gate-submit-button"]');
+    if (btn) await btn.click();
+    await sleep(2000);
+  }
+}
+
+/** Diagnostic only: log select options, "load more"-style buttons, and brand/select-classed elements; clicks nothing. */
+async function loadAllBrands(page: Page): Promise<void> {
+  console.log('[Brands] Looking for "load more" option...');
+
+  // Look for select/dropdown with "load more" or "all brands" option
+  const selectInfo = await page.evaluate(() => {
+    const selects = document.querySelectorAll('select');
+    const info: { selector: string; options: string[] }[] = [];
+
+    selects.forEach((sel, i) => {
+      const options = Array.from(sel.options).map(o => o.text);
+      info.push({ selector: `select:nth-of-type(${i + 1})`, options });
+    });
+
+    return info;
+  });
+
+  console.log('[Brands] Found selects:', JSON.stringify(selectInfo, null, 2));
+
+  // Look for any button or link with "load more" or "show all"
+  const loadMoreButtons = await page.evaluate(() => {
+    const elements = document.querySelectorAll('button, a, [role="button"]');
+    const matches: { text: string; tag: string }[] = [];
+
+    elements.forEach(el => {
+      const text = el.textContent?.toLowerCase() || '';
+      if (text.includes('load more') || text.includes('show all') || text.includes('view all')) {
+        matches.push({ text: el.textContent?.trim() || '', tag: el.tagName });
+      }
+    });
+
+    return matches;
+  });
+
+  console.log('[Brands] Found load more buttons:', loadMoreButtons);
+
+  // Try to find and interact with the brands dropdown
+  // First, let's see all interactive elements with "brand" in them
+  const brandElements = await page.evaluate(() => {
+    const all = document.querySelectorAll('*');
+    const matches: { tag: string; class: string; text: string }[] = [];
+
+    all.forEach(el => {
+      const className = el.className?.toString?.() || '';
+      const text = el.textContent?.trim().slice(0, 100) || '';
+      if (className.toLowerCase().includes('brand') || className.toLowerCase().includes('select')) {
+        matches.push({
+          tag: el.tagName,
+          class: className.slice(0, 100),
+          text: text.slice(0, 50),
+        });
+      }
+    });
+
+    return matches.slice(0, 20);
+  });
+
+  console.log('[Brands] Brand-related elements:', JSON.stringify(brandElements.slice(0, 10), null, 2));
+}
+
+/** Collect brand links (unique by href, non-empty text) from anchors matching brand URL/class selectors. */
+async function extractBrandLinks(page: Page): Promise<{ name: string; url: string }[]> {
+  const brands = await page.evaluate(() => {
+    const links: { name: string; url: string }[] = [];
+
+    // Look for brand cards/links
+    const selectors = [
+      'a[href*="/brand/"]',
+      'a[href*="/brands/"]',
+      '[class*="brand"] a',
+      '[class*="Brand"] a',
+    ];
+
+    selectors.forEach(sel => {
+      document.querySelectorAll(sel).forEach(el => {
+        const href = el.getAttribute('href');
+        const name = el.textContent?.trim() || '';
+        if (href && name && !links.some(l => l.url === href)) {
+          links.push({ name, url: href });
+        }
+      });
+    });
+
+    return links;
+  });
+
+  return brands;
+}
+
+/** Scroll until page height is stable (max 20 scrolls), then scrape productId/name/price from product cards, de-duplicated by name. */
+async function extractProductsFromBrandPage(page: Page): Promise<any[]> {
+  // Scroll to load all products
+  let previousHeight = 0;
+  let scrollCount = 0;
+  let sameHeightCount = 0;
+
+  while (scrollCount < 20) {
+    const currentHeight = await page.evaluate(() => document.body.scrollHeight);
+
+    if (currentHeight === previousHeight) {
+      sameHeightCount++;
+      if (sameHeightCount >= 3) break;
+    } else {
+      sameHeightCount = 0;
+    }
+
+    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
+    await sleep(1000);
+
+    previousHeight = currentHeight;
+    scrollCount++;
+  }
+
+  // Extract products
+  const products = await page.evaluate(() => {
+    const results: any[] = [];
+    const seen = new Set<string>();
+
+    document.querySelectorAll('[class*="product_product__"]').forEach(el => {
+      const nameEl = el.querySelector('[class*="product__name"], [class*="name__"]');
+      const name = nameEl?.textContent?.trim() || '';
+
+      if (!name || seen.has(name)) return;
+      seen.add(name);
+
+      const priceEl = el.querySelector('[class*="price"]');
+      const priceText = priceEl?.textContent || '';
+      const priceMatch = priceText.match(/\$(\d+(?:\.\d{2})?)/);
+      const price = priceMatch ? parseFloat(priceMatch[1]) : null;
+
+      const linkEl = el.querySelector('a[href*="/product/"]');
+      let productId = '';
+      if (linkEl) {
+        const href = linkEl.getAttribute('href') || '';
+        const match = href.match(/\/product\/([^\/?]+)/);
+        productId = match ? match[1] : '';
+      }
+
+      results.push({
+        productId: productId || `treez_${name.replace(/\s+/g, '_').toLowerCase().slice(0, 30)}`,
+        name,
+        price,
+      });
+    });
+
+    return results;
+  });
+
+  return products;
+}
+
+/** Load the store's brands page, log brand-selection diagnostics, and sample product extraction from up to 3 brands. */
+async function main() {
+  console.log('='.repeat(60));
+  console.log('Testing Treez Brand-Based Extraction');
+  console.log('='.repeat(60));
+
+  const browser = await puppeteer.launch({
+    headless: true,
+    args: ['--no-sandbox', '--disable-setuid-sandbox'],
+  });
+
+  const page = await browser.newPage();
+  await page.setViewport({ width: 1920, height: 1080 });
+
+  // Block image, font, and media requests
+  await page.setRequestInterception(true);
+  page.on('request', (req) => {
+    if (['image', 'font', 'media'].includes(req.resourceType())) {
+      req.abort();
+    } else {
+      req.continue();
+    }
+  });
+
+  try {
+    // Navigate to brands page
+    const brandsUrl = `https://${STORE_ID}.treez.io/onlinemenu/brands?customerType=ADULT`;
+    console.log(`\n[1] Navigating to ${brandsUrl}`);
+    await page.goto(brandsUrl, { waitUntil: 'networkidle2', timeout: 60000 });
+    await sleep(2000);
+    await bypassAgeGate(page);
+    await sleep(1000);
+
+    // Screenshot to see what we're working with
+    await page.screenshot({ path: '/tmp/treez-brands-page.png', fullPage: false });
+    console.log('[1] Screenshot saved to /tmp/treez-brands-page.png');
+
+    // Try to load all brands
+    console.log('\n[2] Exploring brand selection options...');
+    await loadAllBrands(page);
+
+    // Extract brand links
+    console.log('\n[3] Extracting brand links...');
+    const brandLinks = await extractBrandLinks(page);
+    console.log(`Found ${brandLinks.length} brand links:`);
+    brandLinks.slice(0, 10).forEach(b => console.log(` - ${b.name}: ${b.url}`));
+
+    // If we found brand links, visit a couple to test
+    if (brandLinks.length > 0) {
+      console.log('\n[4] Testing product extraction from first 3 brands...');
+
+      const sampleBrands = brandLinks.slice(0, 3);
+      let totalProducts = 0;
+      const allProducts: any[] = [];
+
+      for (const brand of sampleBrands) {
+        const brandUrl = brand.url.startsWith('http')
+          ? brand.url
+          : `https://${STORE_ID}.treez.io${brand.url}`;
+
+        console.log(`\n Visiting brand: ${brand.name}`);
+        console.log(` URL: ${brandUrl}`);
+
+        await page.goto(brandUrl, { waitUntil: 'networkidle2', timeout: 30000 });
+        await sleep(2000);
+
+        const products = await extractProductsFromBrandPage(page);
+        console.log(` Products found: ${products.length}`);
+
+        allProducts.push(...products.map(p => ({ ...p, brand: brand.name })));
+        totalProducts += products.length;
+      }
+
+      console.log(`\n[5] Summary from ${sampleBrands.length} brands: ${totalProducts} products`);
+      console.log(`Estimated total (${brandLinks.length} brands): ~${Math.round(totalProducts / sampleBrands.length * brandLinks.length)} products`);
+    }
+
+  } catch (error: unknown) {
+    console.error('Error:', error instanceof Error ? error.message : error);
+  } finally {
+    await browser.close();
+  }
+}
+
+main().catch(console.error);