docs: Add Evomi residential proxy API documentation
- Document priority order (Evomi API first, DB fallback)
- List environment variables and defaults
- Show K8s secret location
- Explain proxy URL format with geo targeting

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
29
CLAUDE.md
29
CLAUDE.md
@@ -248,6 +248,35 @@ All other browsers are filtered out. Uses `intoli/user-agents` library for reali
|
|||||||
|
|
||||||
These binaries mimic real browser TLS fingerprints to avoid detection.
|
These binaries mimic real browser TLS fingerprints to avoid detection.
|
||||||
|
|
||||||
|
### Evomi Residential Proxy API
|
||||||
|
|
||||||
|
Workers use Evomi's residential proxy API to obtain geo-targeted proxies on demand.
|
||||||
|
|
||||||
|
**Priority Order**:
|
||||||
|
1. Evomi API (if EVOMI_USER/EVOMI_PASS configured)
|
||||||
|
2. DB proxies (fallback if Evomi is not configured)
|
||||||
|
|
||||||
|
**Environment Variables**:
|
||||||
|
| Variable | Description | Default |
|
||||||
|
|----------|-------------|---------|
|
||||||
|
| `EVOMI_USER` | API username | - |
|
||||||
|
| `EVOMI_PASS` | API key | - |
|
||||||
|
| `EVOMI_HOST` | Proxy host | `rpc.evomi.com` |
|
||||||
|
| `EVOMI_PORT` | Proxy port | `1000` |
|
||||||
|
|
||||||
|
**K8s Secret**: Credentials stored in `scraper-secrets`:
|
||||||
|
```bash
|
||||||
|
kubectl get secret scraper-secrets -n dispensary-scraper -o jsonpath='{.data.EVOMI_PASS}' | base64 -d
|
||||||
|
```
|
||||||
|
|
||||||
|
**Proxy URL Format**: `http://{user}_{session}_{geo}:{pass}@{host}:{port}`
|
||||||
|
- `session`: Worker ID for sticky sessions
|
||||||
|
- `geo`: State to target, given as the lowercase state name (e.g., `arizona`, `california`)
|
||||||
|
|
||||||
|
**Files**:
|
||||||
|
- `src/services/crawl-rotator.ts` - `getEvomiConfig()`, `buildEvomiProxyUrl()`
|
||||||
|
- `src/tasks/task-worker.ts` - Proxy initialization order
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Bulk Task Workflow (Updated 2025-12-13)
|
## Bulk Task Workflow (Updated 2025-12-13)
|
||||||
|
|||||||
257
backend/scripts/test-treez-brands.ts
Normal file
257
backend/scripts/test-treez-brands.ts
Normal file
@@ -0,0 +1,257 @@
|
|||||||
|
/**
|
||||||
|
* Test Treez brand-based product extraction
|
||||||
|
* 1. Load /brands page
|
||||||
|
* 2. Click "load more brands" to get all brands
|
||||||
|
* 3. Extract brand URLs
|
||||||
|
* 4. Visit each brand and extract products
|
||||||
|
*/
|
||||||
|
|
||||||
|
import puppeteer, { Page } from 'puppeteer';
|
||||||
|
|
||||||
|
// Store identifier used as the subdomain of the `https://${STORE_ID}.treez.io` URLs built in main().
const STORE_ID = 'best';
|
||||||
|
|
||||||
|
/**
 * Pause the current async flow for a fixed duration.
 *
 * @param ms - Delay in milliseconds before the returned promise resolves.
 * @returns A promise that resolves with no value once the timer fires.
 */
async function sleep(ms: number): Promise<void> {
  await new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||||
|
|
||||||
|
/**
 * Dismiss the age-verification modal when the page is currently showing one.
 *
 * The modal is located by its `data-testid`. When it is absent the function
 * returns immediately. When it is present, the submit button is clicked if it
 * can be found, and the function then waits two seconds; the wait happens
 * whether or not a button was actually clicked.
 *
 * @param page - Puppeteer page to inspect.
 */
async function bypassAgeGate(page: Page): Promise<void> {
  const modal = await page.$('[data-testid="age-gate-modal"]');
  if (!modal) {
    return;
  }

  console.log('[AgeGate] Detected, bypassing...');

  const submitButton = await page.$('[data-testid="age-gate-submit-button"]');
  if (submitButton) {
    await submitButton.click();
  }

  await sleep(2000);
}
|
||||||
|
|
||||||
|
/**
 * Log diagnostics about controls that might reveal the full brand list.
 *
 * Despite its name, this function does not click or change anything on the
 * page. It runs three read-only DOM surveys and prints the results:
 *   1. every `<select>` with its option labels,
 *   2. every button/link whose text mentions "load more", "show all" or "view all",
 *   3. up to 20 elements whose class name contains "brand" or "select"
 *      (only the first 10 are printed).
 *
 * @param page - Puppeteer page already navigated to the brands listing.
 */
async function loadAllBrands(page: Page): Promise<void> {
  console.log('[Brands] Looking for "load more" option...');

  // Survey 1: dropdowns and their option labels.
  // NOTE(review): the index counts selects document-wide, whereas
  // `:nth-of-type` counts among siblings, so the logged selector is a label
  // and may not resolve to the same element.
  const selectInfo = await page.evaluate(() =>
    Array.from(document.querySelectorAll('select')).map((dropdown, index) => ({
      selector: `select:nth-of-type(${index + 1})`,
      options: Array.from(dropdown.options).map((option) => option.text),
    })),
  );

  console.log('[Brands] Found selects:', JSON.stringify(selectInfo, null, 2));

  // Survey 2: clickable elements whose text looks like an "expand the list" control.
  const loadMoreButtons = await page.evaluate(() => {
    const phrases = ['load more', 'show all', 'view all'];

    return Array.from(document.querySelectorAll('button, a, [role="button"]'))
      .filter((candidate) => {
        const lowered = candidate.textContent?.toLowerCase() || '';
        return phrases.some((phrase) => lowered.includes(phrase));
      })
      .map((candidate) => ({
        text: candidate.textContent?.trim() || '',
        tag: candidate.tagName,
      }));
  });

  console.log('[Brands] Found load more buttons:', loadMoreButtons);

  // Survey 3: any element whose class name hints at brand or select widgets.
  const brandElements = await page.evaluate(() => {
    const hits: { tag: string; class: string; text: string }[] = [];

    for (const node of Array.from(document.querySelectorAll('*'))) {
      // className is not a plain string on every element type, hence the defensive toString.
      const classText = node.className?.toString?.() || '';
      const lowered = classText.toLowerCase();
      if (!lowered.includes('brand') && !lowered.includes('select')) {
        continue;
      }

      const snippet = node.textContent?.trim().slice(0, 100) || '';
      hits.push({
        tag: node.tagName,
        class: classText.slice(0, 100),
        text: snippet.slice(0, 50),
      });
    }

    return hits.slice(0, 20);
  });

  console.log('[Brands] Brand-related elements:', JSON.stringify(brandElements.slice(0, 10), null, 2));
}
|
||||||
|
|
||||||
|
/**
 * Collect brand links from the current page.
 *
 * Several selectors are tried in order (brand URL patterns first, then any
 * anchor inside an element whose class mentions "brand"/"Brand"). An anchor is
 * kept only when it has both an `href` and non-empty text, and each `href` is
 * reported once: the first occurrence wins.
 *
 * @param page - Puppeteer page showing the brands listing.
 * @returns Brand display names paired with the raw `href` value, which may be
 *          relative.
 */
async function extractBrandLinks(page: Page): Promise<{ name: string; url: string }[]> {
  return page.evaluate(() => {
    const candidateSelectors = [
      'a[href*="/brand/"]',
      'a[href*="/brands/"]',
      '[class*="brand"] a',
      '[class*="Brand"] a',
    ];

    const seenUrls = new Set<string>();
    const found: { name: string; url: string }[] = [];

    for (const selector of candidateSelectors) {
      for (const anchor of Array.from(document.querySelectorAll(selector))) {
        const url = anchor.getAttribute('href');
        const name = anchor.textContent?.trim() || '';

        if (!url || !name || seenUrls.has(url)) {
          continue;
        }

        seenUrls.add(url);
        found.push({ name, url });
      }
    }

    return found;
  });
}
|
||||||
|
|
||||||
|
/**
 * Scroll a brand page to the bottom and scrape the product cards it shows.
 *
 * Scrolling: at most 20 passes, one second apart. The loop ends early once the
 * document height has matched the previous pass three times in a row.
 *
 * Extraction: each element whose class contains `product_product__` yields one
 * record. Cards without a name, or whose name was already seen, are skipped.
 *
 * @param page - Puppeteer page already navigated to a brand page.
 * @returns Records of the form `{ productId, name, price }`. `price` is `null`
 *          when no `$` amount is found. `productId` falls back to a slug built
 *          from the name when the card has no `/product/` link.
 */
async function extractProductsFromBrandPage(page: Page): Promise<any[]> {
  let lastHeight = 0;
  let stablePasses = 0;

  for (let pass = 0; pass < 20; pass += 1) {
    const height = await page.evaluate(() => document.body.scrollHeight);

    // Count consecutive passes with no growth; any growth resets the streak.
    stablePasses = height === lastHeight ? stablePasses + 1 : 0;
    if (stablePasses >= 3) {
      break;
    }

    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    await sleep(1000);

    lastHeight = height;
  }

  return page.evaluate(() => {
    const collected: any[] = [];
    const knownNames = new Set<string>();

    for (const card of Array.from(document.querySelectorAll('[class*="product_product__"]'))) {
      const title = card.querySelector('[class*="product__name"], [class*="name__"]');
      const name = title?.textContent?.trim() || '';
      if (!name || knownNames.has(name)) {
        continue;
      }
      knownNames.add(name);

      // First "$12" / "$12.50"-style amount in the price element's text.
      const priceText = card.querySelector('[class*="price"]')?.textContent || '';
      const amount = priceText.match(/\$(\d+(?:\.\d{2})?)/);
      const price = amount ? parseFloat(amount[1]) : null;

      // Product id is the path segment following "/product/" in the card's link.
      const anchor = card.querySelector('a[href*="/product/"]');
      const hrefId = anchor
        ? ((anchor.getAttribute('href') || '').match(/\/product\/([^\/?]+)/)?.[1] ?? '')
        : '';

      collected.push({
        productId: hrefId || `treez_${name.replace(/\s+/g, '_').toLowerCase().slice(0, 30)}`,
        name,
        price,
      });
    }

    return collected;
  });
}
|
||||||
|
|
||||||
|
/**
 * Entry point: explore the Treez brands page and sample product extraction.
 *
 * Steps:
 *   1. Open the store's brands page, dismiss the age gate, save a screenshot.
 *   2. Log diagnostics about brand-list controls.
 *   3. Extract brand links and print the first ten.
 *   4. Visit up to three brands and count the products on each.
 *   5. Print the sample total and a projection across all brands.
 *
 * Errors are logged rather than rethrown, and the browser is always closed.
 */
async function main() {
  // Maximum number of brand pages to visit for the sample.
  const SAMPLE_SIZE = 3;

  console.log('='.repeat(60));
  console.log('Testing Treez Brand-Based Extraction');
  console.log('='.repeat(60));

  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  try {
    // Page setup lives inside the try so a failure here still closes the browser.
    const page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Skip heavy assets that are irrelevant to scraping text content.
    await page.setRequestInterception(true);
    page.on('request', (req) => {
      if (['image', 'font', 'media'].includes(req.resourceType())) {
        req.abort();
      } else {
        req.continue();
      }
    });

    // Navigate to brands page
    const brandsUrl = `https://${STORE_ID}.treez.io/onlinemenu/brands?customerType=ADULT`;
    console.log(`\n[1] Navigating to ${brandsUrl}`);
    await page.goto(brandsUrl, { waitUntil: 'networkidle2', timeout: 60000 });
    await sleep(2000);
    await bypassAgeGate(page);
    await sleep(1000);

    // Screenshot to see what we're working with
    await page.screenshot({ path: '/tmp/treez-brands-page.png', fullPage: false });
    console.log('[1] Screenshot saved to /tmp/treez-brands-page.png');

    // Inspect the page for controls that could reveal every brand
    console.log('\n[2] Exploring brand selection options...');
    await loadAllBrands(page);

    // Extract brand links
    console.log('\n[3] Extracting brand links...');
    const brandLinks = await extractBrandLinks(page);
    console.log(`Found ${brandLinks.length} brand links:`);
    brandLinks.slice(0, 10).forEach(b => console.log(`  - ${b.name}: ${b.url}`));

    // If we found brand links, visit a few to test
    if (brandLinks.length > 0) {
      // May hold fewer than SAMPLE_SIZE entries when the page lists few brands.
      const sampledBrands = brandLinks.slice(0, SAMPLE_SIZE);
      console.log(`\n[4] Testing product extraction from first ${sampledBrands.length} brands...`);

      let totalProducts = 0;

      for (const brand of sampledBrands) {
        // Brand hrefs may be relative; resolve them against the store origin.
        const brandUrl = brand.url.startsWith('http')
          ? brand.url
          : `https://${STORE_ID}.treez.io${brand.url}`;

        console.log(`\n  Visiting brand: ${brand.name}`);
        console.log(`  URL: ${brandUrl}`);

        await page.goto(brandUrl, { waitUntil: 'networkidle2', timeout: 30000 });
        await sleep(2000);

        const products = await extractProductsFromBrandPage(page);
        console.log(`  Products found: ${products.length}`);

        totalProducts += products.length;
      }

      // Average over the brands actually visited, not a fixed 3, so the
      // projection stays correct when fewer brands were available.
      const estimatedTotal = Math.round((totalProducts / sampledBrands.length) * brandLinks.length);
      console.log(`\n[5] Summary from ${sampledBrands.length} brands: ${totalProducts} products`);
      console.log(`Estimated total (${brandLinks.length} brands): ~${estimatedTotal} products`);
    }
  } catch (error: unknown) {
    // Thrown values are not guaranteed to be Error instances.
    console.error('Error:', error instanceof Error ? error.message : String(error));
  } finally {
    await browser.close();
  }
}

main().catch(console.error);
|
||||||
Reference in New Issue
Block a user