docs: Add Evomi residential proxy API documentation

- Document priority order (Evomi API first, DB fallback)
- List environment variables and defaults
- Show K8s secret location
- Explain proxy URL format with geo targeting

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-13 16:47:58 -07:00
parent cf99ef9e09
commit d8a22fba53
2 changed files with 286 additions and 0 deletions

View File

@@ -248,6 +248,35 @@ All other browsers are filtered out. Uses `intoli/user-agents` library for reali
These binaries mimic real browser TLS fingerprints to avoid detection.
### Evomi Residential Proxy API
Workers use Evomi's residential proxy API for geo-targeted proxies on-demand.
**Priority Order**:
1. Evomi API (if EVOMI_USER/EVOMI_PASS configured)
2. DB proxies (fallback if Evomi not configured)
**Environment Variables**:
| Variable | Description | Default |
|----------|-------------|---------|
| `EVOMI_USER` | API username | - |
| `EVOMI_PASS` | API key | - |
| `EVOMI_HOST` | Proxy host | `rpc.evomi.com` |
| `EVOMI_PORT` | Proxy port | `1000` |
**K8s Secret**: Credentials stored in `scraper-secrets`:
```bash
kubectl get secret scraper-secrets -n dispensary-scraper -o jsonpath='{.data.EVOMI_PASS}' | base64 -d
```
**Proxy URL Format**: `http://{user}_{session}_{geo}:{pass}@{host}:{port}`
- `session`: Worker ID for sticky sessions
- `geo`: State name in lowercase (e.g., `arizona`, `california`)
**Files**:
- `src/services/crawl-rotator.ts` - `getEvomiConfig()`, `buildEvomiProxyUrl()`
- `src/tasks/task-worker.ts` - Proxy initialization order
---
## Bulk Task Workflow (Updated 2025-12-13)

View File

@@ -0,0 +1,257 @@
/**
 * Test Treez brand-based product extraction
 * 1. Load /brands page
 * 2. Look for a "load more brands" control (candidates are only logged, not clicked)
 * 3. Extract brand URLs
 * 4. Visit the first 3 brands and extract products
 */
import puppeteer, { Page } from 'puppeteer';
// Treez store identifier; used as the subdomain when building menu URLs
// (https://<STORE_ID>.treez.io/...).
const STORE_ID = 'best';
/**
 * Pause for a fixed amount of time.
 *
 * @param ms - Delay in milliseconds before the returned promise settles.
 * @returns A promise that resolves with no value once the timer fires.
 */
async function sleep(ms: number): Promise<void> {
  await new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Dismiss the age-verification modal if the page is currently showing one.
 *
 * When the modal is present, its submit button is clicked (if found) and the
 * function then waits two seconds before returning. When no modal is present
 * the function returns immediately.
 *
 * @param page - Puppeteer page to inspect.
 */
async function bypassAgeGate(page: Page): Promise<void> {
  const modal = await page.$('[data-testid="age-gate-modal"]');
  if (!modal) {
    return;
  }

  console.log('[AgeGate] Detected, bypassing...');
  const confirmButton = await page.$('[data-testid="age-gate-submit-button"]');
  if (confirmButton) {
    await confirmButton.click();
  }
  // The wait happens whenever the modal was detected, even if no button was found.
  await sleep(2000);
}
/**
 * Diagnostic probe for the brands page.
 *
 * Logs three things: every `<select>` with its option labels, every
 * button/link whose text mentions "load more" / "show all" / "view all", and
 * a sample of elements whose class name contains "brand" or "select".
 * Nothing is clicked or modified; the function only reports what it finds.
 *
 * @param page - Puppeteer page to inspect.
 */
async function loadAllBrands(page: Page): Promise<void> {
  console.log('[Brands] Looking for "load more" option...');

  // Every <select> on the page together with its option labels.
  // The callback runs in the browser, so it must not reference outer variables.
  const dropdowns = await page.evaluate(() =>
    Array.from(document.querySelectorAll('select')).map((dropdown, position) => ({
      selector: `select:nth-of-type(${position + 1})`,
      options: Array.from(dropdown.options).map((option) => option.text),
    })),
  );
  console.log('[Brands] Found selects:', JSON.stringify(dropdowns, null, 2));

  // Clickable elements whose text suggests they expand the list.
  const expanders = await page.evaluate(() => {
    const phrases = ['load more', 'show all', 'view all'];
    return Array.from(document.querySelectorAll('button, a, [role="button"]'))
      .filter((node) => {
        const lowered = node.textContent?.toLowerCase() || '';
        return phrases.some((phrase) => lowered.includes(phrase));
      })
      .map((node) => ({ text: node.textContent?.trim() || '', tag: node.tagName }));
  });
  console.log('[Brands] Found load more buttons:', expanders);

  // Any element whose class name hints at a brand list or a select-like widget.
  const candidates = await page.evaluate(() => {
    const hits: { tag: string; class: string; text: string }[] = [];
    for (const node of Array.from(document.querySelectorAll('*'))) {
      const classText = node.className?.toString?.() || '';
      const lowered = classText.toLowerCase();
      if (!lowered.includes('brand') && !lowered.includes('select')) {
        continue;
      }
      const snippet = node.textContent?.trim().slice(0, 100) || '';
      hits.push({
        tag: node.tagName,
        class: classText.slice(0, 100),
        text: snippet.slice(0, 50),
      });
    }
    // Cap what is sent back from the browser.
    return hits.slice(0, 20);
  });
  console.log('[Brands] Brand-related elements:', JSON.stringify(candidates.slice(0, 10), null, 2));
}
/**
 * Collect brand links from the current page.
 *
 * Anchors are gathered with several selectors, tried in order. An anchor is
 * kept only if it has both an `href` and non-empty text; when the same `href`
 * shows up more than once, the first occurrence wins.
 *
 * @param page - Puppeteer page to scan.
 * @returns Link text and raw `href` attribute value for each unique link.
 */
async function extractBrandLinks(page: Page): Promise<{ name: string; url: string }[]> {
  return page.evaluate(() => {
    const anchorQueries = [
      'a[href*="/brand/"]',
      'a[href*="/brands/"]',
      '[class*="brand"] a',
      '[class*="Brand"] a',
    ];

    // href -> link text; a Map keeps first-seen insertion order.
    const labelByHref = new Map<string, string>();
    for (const query of anchorQueries) {
      for (const anchor of Array.from(document.querySelectorAll(query))) {
        const href = anchor.getAttribute('href');
        const label = anchor.textContent?.trim() || '';
        if (!href || !label || labelByHref.has(href)) {
          continue;
        }
        labelByHref.set(href, label);
      }
    }

    return Array.from(labelByHref, ([url, name]) => ({ name, url }));
  });
}
/**
 * Scroll the current page to the bottom repeatedly, then scrape the product
 * cards that are present.
 *
 * Scrolling stops after 20 passes, or earlier once the document height has
 * matched the previous reading three times in a row. Products are
 * de-duplicated by name (first one wins).
 *
 * @param page - Puppeteer page to scroll and scrape.
 * @returns Objects with `productId`, `name` and `price`. `price` is `null`
 *   when no `$<amount>` is found in the card's price text; `productId` falls
 *   back to a `treez_`-prefixed slug of the name when the card has no
 *   `/product/` link.
 */
async function extractProductsFromBrandPage(page: Page): Promise<any[]> {
  let lastHeight = 0;
  let stableChecks = 0;
  for (let pass = 0; pass < 20; pass++) {
    const height = await page.evaluate(() => document.body.scrollHeight);
    stableChecks = height === lastHeight ? stableChecks + 1 : 0;
    if (stableChecks >= 3) {
      break;
    }

    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    await sleep(1000);
    lastHeight = height;
  }

  // The callback runs in the browser, so it must not reference outer variables.
  return page.evaluate(() => {
    const collected: any[] = [];
    const namesSeen = new Set<string>();

    for (const card of Array.from(document.querySelectorAll('[class*="product_product__"]'))) {
      const title =
        card.querySelector('[class*="product__name"], [class*="name__"]')?.textContent?.trim() || '';
      if (!title || namesSeen.has(title)) {
        continue;
      }
      namesSeen.add(title);

      // First "$<digits>" (optionally with two decimals) in the price text.
      const priceText = card.querySelector('[class*="price"]')?.textContent || '';
      const dollars = /\$(\d+(?:\.\d{2})?)/.exec(priceText);
      const price = dollars ? parseFloat(dollars[1]) : null;

      // Path segment following "/product/" in the card's product link, if any.
      const href = card.querySelector('a[href*="/product/"]')?.getAttribute('href') || '';
      const slug = /\/product\/([^\/?]+)/.exec(href)?.[1] ?? '';

      collected.push({
        productId: slug || `treez_${title.replace(/\s+/g, '_').toLowerCase().slice(0, 30)}`,
        name: title,
        price,
      });
    }

    return collected;
  });
}
/**
 * Script entry point: opens the store's brands page, logs candidate
 * "load more" controls, extracts brand links, then scrapes products from up
 * to the first three brands and prints a projected total.
 *
 * Errors raised after the browser is launched are logged rather than
 * rethrown, and the browser is always closed.
 */
async function main() {
  console.log('='.repeat(60));
  console.log('Testing Treez Brand-Based Extraction');
  console.log('='.repeat(60));

  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  try {
    // Page setup lives inside the try so the browser is closed even if it fails.
    const page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Abort image, font and media requests; let everything else through.
    await page.setRequestInterception(true);
    page.on('request', (req) => {
      if (['image', 'font', 'media'].includes(req.resourceType())) {
        req.abort();
      } else {
        req.continue();
      }
    });

    // Navigate to brands page
    const brandsUrl = `https://${STORE_ID}.treez.io/onlinemenu/brands?customerType=ADULT`;
    console.log(`\n[1] Navigating to ${brandsUrl}`);
    await page.goto(brandsUrl, { waitUntil: 'networkidle2', timeout: 60000 });
    await sleep(2000);
    await bypassAgeGate(page);
    await sleep(1000);

    // Screenshot to see what we're working with
    await page.screenshot({ path: '/tmp/treez-brands-page.png', fullPage: false });
    console.log('[1] Screenshot saved to /tmp/treez-brands-page.png');

    // Log the controls that might expand the brand list
    console.log('\n[2] Exploring brand selection options...');
    await loadAllBrands(page);

    // Extract brand links
    console.log('\n[3] Extracting brand links...');
    const brandLinks = await extractBrandLinks(page);
    console.log(`Found ${brandLinks.length} brand links:`);
    brandLinks.slice(0, 10).forEach(b => console.log(` - ${b.name}: ${b.url}`));

    // If we found brand links, visit a few to test
    if (brandLinks.length > 0) {
      // The page may list fewer than three brands, so the sample size is
      // taken from the slice instead of being assumed to be 3.
      const sample = brandLinks.slice(0, 3);
      console.log(`\n[4] Testing product extraction from first ${sample.length} brands...`);
      let totalProducts = 0;

      for (const brand of sample) {
        // Links may be relative; resolve them against the store origin.
        const brandUrl = brand.url.startsWith('http')
          ? brand.url
          : `https://${STORE_ID}.treez.io${brand.url}`;
        console.log(`\n Visiting brand: ${brand.name}`);
        console.log(` URL: ${brandUrl}`);
        await page.goto(brandUrl, { waitUntil: 'networkidle2', timeout: 30000 });
        await sleep(2000);
        const products = await extractProductsFromBrandPage(page);
        console.log(` Products found: ${products.length}`);
        totalProducts += products.length;
      }

      // Project the per-brand average of the sample onto every brand found.
      const averagePerBrand = totalProducts / sample.length;
      console.log(`\n[5] Summary from ${sample.length} brands: ${totalProducts} products`);
      console.log(`Estimated total (${brandLinks.length} brands): ~${Math.round(averagePerBrand * brandLinks.length)} products`);
    }
  } catch (error: unknown) {
    // Thrown values are not guaranteed to be Error instances.
    console.error('Error:', error instanceof Error ? error.message : error);
  } finally {
    await browser.close();
  }
}
// Run the script; log any rejection that escapes main().
main().catch(console.error);