perf(puppeteer): Block analytics/tracking domains to save proxy bandwidth
Block requests to non-essential domains:
- googletagmanager.com, google-analytics.com (analytics)
- launchdarkly.com (feature flags)
- assets2.dutchie.com (CDN assets - we only need GraphQL)
- sentry.io (error tracking)
- segment.io/segment.com, amplitude.com, mixpanel.com (analytics)
- hotjar.com, fullstory.com (session recording)

Applied to both product-discovery-dutchie.ts and puppeteer-preflight.ts

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
108
backend/scripts/test-treez-styles.ts
Normal file
108
backend/scripts/test-treez-styles.ts
Normal file
@@ -0,0 +1,108 @@
|
|||||||
|
/**
|
||||||
|
* Test if blocking stylesheets affects product detection
|
||||||
|
*/
|
||||||
|
|
||||||
|
import puppeteer, { Page } from 'puppeteer';
|
||||||
|
|
||||||
|
// Store identifier; interpolated as the subdomain of the treez.io menu URL this script loads.
const STORE_ID = 'best';
|
||||||
|
|
||||||
|
/**
 * Pause for a fixed amount of time.
 *
 * @param ms - delay in milliseconds, passed straight to `setTimeout`.
 * @returns a promise that resolves (with no value) once the timer fires.
 */
async function sleep(ms: number): Promise<void> {
  await new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||||
|
|
||||||
|
/**
 * Dismiss the age-gate modal if the page is currently showing one.
 *
 * The modal is located by its `data-testid`. When it is present, the submit
 * button is clicked (if it can be found) and the function then waits two
 * seconds before returning. When no modal is present, nothing happens.
 *
 * @param page - the Puppeteer page to inspect.
 */
async function bypassAgeGate(page: Page): Promise<void> {
  const modal = await page.$('[data-testid="age-gate-modal"]');
  if (!modal) {
    return;
  }

  console.log(' Age gate detected, bypassing...');

  const submitButton = await page.$('[data-testid="age-gate-submit-button"]');
  if (submitButton) {
    await submitButton.click();
  }

  // The pause happens whenever the modal was seen, even if no button was found.
  await sleep(2000);
}
|
||||||
|
|
||||||
|
/**
 * Count product elements on the page and how many of them contain a
 * name-like or price-like descendant.
 *
 * All matching is done with substring class selectors inside the page:
 * - product element: class contains `product_product__`
 * - name: a descendant whose class contains `product__name` or `name__`
 * - price: a descendant whose class contains `price`
 *
 * @param page - the Puppeteer page to inspect.
 * @returns the total number of product elements plus the name/price hit counts.
 */
async function countProducts(page: Page): Promise<{ total: number; withName: number; withPrice: number }> {
  return page.evaluate(() => {
    const cards = Array.from(document.querySelectorAll('[class*="product_product__"]'));

    const named = cards.filter(
      (card) =>
        card.querySelector('[class*="product__name"]') !== null ||
        card.querySelector('[class*="name__"]') !== null,
    );
    const priced = cards.filter((card) => card.querySelector('[class*="price"]') !== null);

    return { total: cards.length, withName: named.length, withPrice: priced.length };
  });
}
|
||||||
|
|
||||||
|
/**
 * Load the Treez brands menu in a fresh headless browser and report how many
 * product elements the selectors find, with stylesheet requests either
 * blocked or allowed.
 *
 * Image, font and media requests are always aborted; stylesheet requests are
 * aborted only when `blockStylesheets` is true. After the counts, the tag and
 * class of up to 20 descendants of the first product element are printed to
 * help see which class names are present.
 *
 * @param blockStylesheets - when true, stylesheet requests are aborted too.
 */
async function testWithBlocking(blockStylesheets: boolean): Promise<void> {
  console.log(`\n${'='.repeat(50)}`);
  console.log(`Testing with ${blockStylesheets ? 'BLOCKED' : 'ALLOWED'} stylesheets`);
  console.log('='.repeat(50));

  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  // Everything after launch sits in try/finally so the browser is closed even
  // when navigation times out or an evaluate call throws.
  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    await page.setRequestInterception(true);
    page.on('request', (req) => {
      const type = req.resourceType();
      if (type === 'image' || type === 'font' || type === 'media') {
        req.abort();
      } else if (type === 'stylesheet' && blockStylesheets) {
        req.abort();
      } else {
        req.continue();
      }
    });

    const url = `https://${STORE_ID}.treez.io/onlinemenu/brands?customerType=ADULT`;
    console.log(`Navigating to ${url}`);

    await page.goto(url, { waitUntil: 'networkidle2', timeout: 60000 });
    await sleep(3000);
    await bypassAgeGate(page);
    await sleep(2000);

    const counts = await countProducts(page);
    console.log(`Total product elements: ${counts.total}`);
    console.log(`With name selector: ${counts.withName}`);
    console.log(`With price selector: ${counts.withPrice}`);

    // Check what classes exist on product elements
    const sampleClasses = await page.evaluate(() => {
      const products = document.querySelectorAll('[class*="product_product__"]');
      const sample = products[0];
      if (!sample) return 'No products found';

      const children = Array.from(sample.querySelectorAll('*')).slice(0, 20);
      return children.map(el => ({
        tag: el.tagName,
        // Read the attribute rather than `className`: on SVG elements
        // `className` is an SVGAnimatedString object, which would print as
        // "[object SVGAnimatedString]" instead of the class list.
        class: (el.getAttribute('class') ?? '').slice(0, 80),
      }));
    });

    console.log('\nSample product children:');
    if (Array.isArray(sampleClasses)) {
      sampleClasses.forEach(c => console.log(` [${c.tag}] ${c.class}`));
    } else {
      console.log(` ${sampleClasses}`);
    }
  } finally {
    await browser.close();
  }
}
|
||||||
|
|
||||||
|
/**
 * Script entry point: run the detection test twice, first with stylesheets
 * blocked and then with them allowed, one run after the other.
 */
async function main() {
  console.log('Testing stylesheet impact on Treez product detection');

  // Order matters for the printed report: blocked run first, allowed run second.
  for (const blockStylesheets of [true, false]) {
    await testWithBlocking(blockStylesheets);
  }
}
|
||||||
|
|
||||||
|
// Log any failure and mark the process as failed, so a shell or CI job
// running this script does not see exit code 0 after an error.
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||||
@@ -152,13 +152,39 @@ export async function runPuppeteerPreflight(
|
|||||||
|
|
||||||
// Block unnecessary resources to save bandwidth
|
// Block unnecessary resources to save bandwidth
|
||||||
await page.setRequestInterception(true);
|
await page.setRequestInterception(true);
|
||||||
|
|
||||||
|
// Domains to block - analytics, tracking, feature flags
|
||||||
|
const BLOCKED_DOMAINS = [
|
||||||
|
'googletagmanager.com',
|
||||||
|
'google-analytics.com',
|
||||||
|
'launchdarkly.com',
|
||||||
|
'assets2.dutchie.com',
|
||||||
|
'sentry.io',
|
||||||
|
'segment.io',
|
||||||
|
'segment.com',
|
||||||
|
'amplitude.com',
|
||||||
|
'mixpanel.com',
|
||||||
|
'hotjar.com',
|
||||||
|
'fullstory.com',
|
||||||
|
];
|
||||||
|
|
||||||
page.on('request', (request: any) => {
|
page.on('request', (request: any) => {
|
||||||
|
const url = request.url();
|
||||||
const resourceType = request.resourceType();
|
const resourceType = request.resourceType();
|
||||||
|
|
||||||
|
// Block by domain
|
||||||
|
if (BLOCKED_DOMAINS.some(domain => url.includes(domain))) {
|
||||||
|
request.abort();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Block by resource type
|
||||||
if (['image', 'font', 'media', 'stylesheet'].includes(resourceType)) {
|
if (['image', 'font', 'media', 'stylesheet'].includes(resourceType)) {
|
||||||
request.abort();
|
request.abort();
|
||||||
} else {
|
return;
|
||||||
request.continue();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
request.continue();
|
||||||
});
|
});
|
||||||
|
|
||||||
// If proxy has auth, set it up
|
// If proxy has auth, set it up
|
||||||
|
|||||||
@@ -105,14 +105,39 @@ export async function handleProductDiscoveryDutchie(ctx: TaskContext): Promise<T
|
|||||||
// Block unnecessary resources to save bandwidth
|
// Block unnecessary resources to save bandwidth
|
||||||
// We only need HTML/JS for session, then GraphQL JSON
|
// We only need HTML/JS for session, then GraphQL JSON
|
||||||
await page.setRequestInterception(true);
|
await page.setRequestInterception(true);
|
||||||
|
|
||||||
|
// Domains to block - analytics, tracking, feature flags (not needed for GraphQL)
|
||||||
|
const BLOCKED_DOMAINS = [
|
||||||
|
'googletagmanager.com',
|
||||||
|
'google-analytics.com',
|
||||||
|
'launchdarkly.com',
|
||||||
|
'assets2.dutchie.com', // CDN assets - we only need GraphQL
|
||||||
|
'sentry.io',
|
||||||
|
'segment.io',
|
||||||
|
'segment.com',
|
||||||
|
'amplitude.com',
|
||||||
|
'mixpanel.com',
|
||||||
|
'hotjar.com',
|
||||||
|
'fullstory.com',
|
||||||
|
];
|
||||||
|
|
||||||
page.on('request', (request: any) => {
|
page.on('request', (request: any) => {
|
||||||
|
const url = request.url();
|
||||||
const resourceType = request.resourceType();
|
const resourceType = request.resourceType();
|
||||||
// Block images, fonts, media, and stylesheets - we don't need them
|
|
||||||
|
// Block by domain - saves significant proxy bandwidth
|
||||||
|
if (BLOCKED_DOMAINS.some(domain => url.includes(domain))) {
|
||||||
|
request.abort();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Block by resource type - images, fonts, media, stylesheets
|
||||||
if (['image', 'font', 'media', 'stylesheet'].includes(resourceType)) {
|
if (['image', 'font', 'media', 'stylesheet'].includes(resourceType)) {
|
||||||
request.abort();
|
request.abort();
|
||||||
} else {
|
return;
|
||||||
request.continue();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
request.continue();
|
||||||
});
|
});
|
||||||
|
|
||||||
// Setup proxy auth if needed
|
// Setup proxy auth if needed
|
||||||
|
|||||||
Reference in New Issue
Block a user