fix: Block images/fonts/media/stylesheets in Puppeteer to save bandwidth

Add request interception to all Puppeteer handlers to block unnecessary
resources (images, fonts, media, stylesheets). The browser is only needed
to load the HTML/JS that establishes the session cookie; after that we
only need the GraphQL JSON response.

Without this blocking, page visits consumed 2.4GB of bandwidth from
assets2.dutchie.com - every page visit downloaded all product
thumbnails, logos, etc.

Files updated:
- product-discovery-http.ts
- entry-point-discovery.ts
- store-discovery-http.ts
- store-discovery-state.ts
- puppeteer-preflight.ts

Note: Product images referenced in the payload are still downloaded once
to MinIO via image-storage.ts - this change only blocks images requested
by the browser while it renders a page.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-13 03:28:12 -07:00
parent ec6843dfd6
commit 5084cb1a85
5 changed files with 59 additions and 0 deletions

View File

@@ -150,6 +150,17 @@ export async function runPuppeteerPreflight(
const page = await browser.newPage(); const page = await browser.newPage();
// Block unnecessary resources to save bandwidth
await page.setRequestInterception(true);
page.on('request', (request: any) => {
const resourceType = request.resourceType();
if (['image', 'font', 'media', 'stylesheet'].includes(resourceType)) {
request.abort();
} else {
request.continue();
}
});
// If proxy has auth, set it up // If proxy has auth, set it up
if (proxyUrl) { if (proxyUrl) {
const proxyUrlParsed = new URL(proxyUrl); const proxyUrlParsed = new URL(proxyUrl);

View File

@@ -233,6 +233,18 @@ export async function handleEntryPointDiscovery(ctx: TaskContext): Promise<TaskR
const page = await browser.newPage(); const page = await browser.newPage();
// Block unnecessary resources to save bandwidth
// We only need HTML/JS for session, then GraphQL JSON
await page.setRequestInterception(true);
page.on('request', (request: any) => {
const resourceType = request.resourceType();
if (['image', 'font', 'media', 'stylesheet'].includes(resourceType)) {
request.abort();
} else {
request.continue();
}
});
// Setup proxy auth if needed // Setup proxy auth if needed
if (proxyUrl) { if (proxyUrl) {
const proxyUrlParsed = new URL(proxyUrl); const proxyUrlParsed = new URL(proxyUrl);

View File

@@ -100,6 +100,19 @@ export async function handleProductDiscoveryHttp(ctx: TaskContext): Promise<Task
const page = await browser.newPage(); const page = await browser.newPage();
// Block unnecessary resources to save bandwidth
// We only need HTML/JS for session, then GraphQL JSON
await page.setRequestInterception(true);
page.on('request', (request: any) => {
const resourceType = request.resourceType();
// Block images, fonts, media, and stylesheets - we don't need them
if (['image', 'font', 'media', 'stylesheet'].includes(resourceType)) {
request.abort();
} else {
request.continue();
}
});
// Setup proxy auth if needed // Setup proxy auth if needed
if (proxyUrl) { if (proxyUrl) {
const proxyUrlParsed = new URL(proxyUrl); const proxyUrlParsed = new URL(proxyUrl);

View File

@@ -112,6 +112,18 @@ export async function handleStoreDiscoveryHttp(ctx: TaskContext): Promise<TaskRe
const page = await browser.newPage(); const page = await browser.newPage();
// Block unnecessary resources to save bandwidth
// We only need HTML/JS for session, then GraphQL JSON
await page.setRequestInterception(true);
page.on('request', (request: any) => {
const resourceType = request.resourceType();
if (['image', 'font', 'media', 'stylesheet'].includes(resourceType)) {
request.abort();
} else {
request.continue();
}
});
// Setup proxy auth if needed // Setup proxy auth if needed
if (proxyUrl) { if (proxyUrl) {
const proxyUrlParsed = new URL(proxyUrl); const proxyUrlParsed = new URL(proxyUrl);

View File

@@ -111,6 +111,17 @@ export async function handleStoreDiscoveryState(ctx: TaskContext): Promise<TaskR
const page = await browser.newPage(); const page = await browser.newPage();
// Block unnecessary resources to save bandwidth
await page.setRequestInterception(true);
page.on('request', (request: any) => {
const resourceType = request.resourceType();
if (['image', 'font', 'media', 'stylesheet'].includes(resourceType)) {
request.abort();
} else {
request.continue();
}
});
// Setup proxy auth if needed // Setup proxy auth if needed
if (proxyUrl) { if (proxyUrl) {
const proxyUrlParsed = new URL(proxyUrl); const proxyUrlParsed = new URL(proxyUrl);