NOTE(review): this patch was reconstructed from a copy whose line breaks and
indentation had been collapsed. Indentation of context lines in the two
modified files is a best guess - confirm against the tree or apply with
whitespace tolerance. The blob hash on the new file's `index` line was not
recomputed. The product-discovery-dutchie.ts hunk is truncated in that copy
(its header announces 39 new lines; fewer are present) and must be
regenerated before applying.

diff --git a/backend/scripts/test-treez-styles.ts b/backend/scripts/test-treez-styles.ts
new file mode 100644
index 00000000..4ef028fc
--- /dev/null
+++ b/backend/scripts/test-treez-styles.ts
@@ -0,0 +1,141 @@
+/**
+ * Test if blocking stylesheets affects product detection
+ *
+ * Loads the store's Treez online menu twice in a headless browser - once with
+ * stylesheet requests aborted and once with them allowed - and logs how many
+ * product elements the class-name selectors match in each run.
+ */
+
+import puppeteer, { Page } from 'puppeteer';
+
+/** Subdomain of treez.io whose online menu is loaded. */
+const STORE_ID = 'best';
+
+/** Resolves after `ms` milliseconds. */
+async function sleep(ms: number): Promise<void> {
+  return new Promise(resolve => setTimeout(resolve, ms));
+}
+
+/**
+ * Clicks the age-gate submit button if the age-gate modal is on the page,
+ * then waits two seconds. Does nothing when the modal is absent.
+ */
+async function bypassAgeGate(page: Page): Promise<void> {
+  const ageGate = await page.$('[data-testid="age-gate-modal"]');
+  if (ageGate) {
+    console.log(' Age gate detected, bypassing...');
+    const btn = await page.$('[data-testid="age-gate-submit-button"]');
+    if (btn) await btn.click();
+    await sleep(2000);
+  }
+}
+
+/**
+ * Counts product elements in the page.
+ *
+ * @returns `total` - elements whose class contains `product_product__`;
+ *   `withName` / `withPrice` - how many of those contain a descendant
+ *   matching the name or price class selectors.
+ */
+async function countProducts(page: Page): Promise<{ total: number; withName: number; withPrice: number }> {
+  return page.evaluate(() => {
+    const all = document.querySelectorAll('[class*="product_product__"]');
+    let withName = 0;
+    let withPrice = 0;
+
+    all.forEach(el => {
+      const hasName = el.querySelector('[class*="product__name"]') || el.querySelector('[class*="name__"]');
+      const hasPrice = el.querySelector('[class*="price"]');
+      if (hasName) withName++;
+      if (hasPrice) withPrice++;
+    });
+
+    return { total: all.length, withName, withPrice };
+  });
+}
+
+/**
+ * Loads the brands menu in a fresh browser and logs product-selector counts
+ * plus the tag and class of up to 20 descendants of the first product element.
+ *
+ * @param blockStylesheets - abort stylesheet requests when true; image, font
+ *   and media requests are aborted in both modes.
+ */
+async function testWithBlocking(blockStylesheets: boolean): Promise<void> {
+  console.log(`\n${'='.repeat(50)}`);
+  console.log(`Testing with ${blockStylesheets ? 'BLOCKED' : 'ALLOWED'} stylesheets`);
+  console.log('='.repeat(50));
+
+  const browser = await puppeteer.launch({
+    headless: true,
+    args: ['--no-sandbox', '--disable-setuid-sandbox'],
+  });
+
+  // Close the browser even when navigation or evaluation throws, so a failed
+  // run does not leave a browser process behind.
+  try {
+    const page = await browser.newPage();
+    await page.setViewport({ width: 1920, height: 1080 });
+
+    await page.setRequestInterception(true);
+    page.on('request', (req) => {
+      const type = req.resourceType();
+      if (type === 'image' || type === 'font' || type === 'media') {
+        req.abort();
+      } else if (type === 'stylesheet' && blockStylesheets) {
+        req.abort();
+      } else {
+        req.continue();
+      }
+    });
+
+    const url = `https://${STORE_ID}.treez.io/onlinemenu/brands?customerType=ADULT`;
+    console.log(`Navigating to ${url}`);
+
+    await page.goto(url, { waitUntil: 'networkidle2', timeout: 60000 });
+    await sleep(3000);
+    await bypassAgeGate(page);
+    await sleep(2000);
+
+    const counts = await countProducts(page);
+    console.log(`Total product elements: ${counts.total}`);
+    console.log(`With name selector: ${counts.withName}`);
+    console.log(`With price selector: ${counts.withPrice}`);
+
+    // Check what classes exist on product elements
+    const sampleClasses = await page.evaluate(() => {
+      const products = document.querySelectorAll('[class*="product_product__"]');
+      const sample = products[0];
+      if (!sample) return 'No products found';
+
+      const children = Array.from(sample.querySelectorAll('*')).slice(0, 20);
+      return children.map(el => ({
+        tag: el.tagName,
+        class: el.className?.toString?.().slice(0, 80) || '',
+      }));
+    });
+
+    console.log('\nSample product children:');
+    if (Array.isArray(sampleClasses)) {
+      sampleClasses.forEach(c => console.log(` [${c.tag}] ${c.class}`));
+    } else {
+      console.log(` ${sampleClasses}`);
+    }
+  } finally {
+    await browser.close();
+  }
+}
+
+/** Runs the blocked-stylesheet pass, then the allowed-stylesheet pass. */
+async function main(): Promise<void> {
+  console.log('Testing stylesheet impact on Treez product detection');
+
+  await testWithBlocking(true); // Block stylesheets
+  await testWithBlocking(false); // Allow stylesheets
+}
+
+main().catch((err) => {
+  // Report the failure and exit non-zero so callers can detect it.
+  console.error(err);
+  process.exitCode = 1;
+});
diff --git a/backend/src/services/puppeteer-preflight.ts b/backend/src/services/puppeteer-preflight.ts
index 40f1008e..276c0dd4 100644
--- a/backend/src/services/puppeteer-preflight.ts
+++ b/backend/src/services/puppeteer-preflight.ts
@@ -152,13 +152,39 @@ export async function runPuppeteerPreflight(
 
     // Block unnecessary resources to save bandwidth
     await page.setRequestInterception(true);
+
+    // Domains to block - analytics, tracking, feature flags
+    const BLOCKED_DOMAINS = [
+      'googletagmanager.com',
+      'google-analytics.com',
+      'launchdarkly.com',
+      'assets2.dutchie.com',
+      'sentry.io',
+      'segment.io',
+      'segment.com',
+      'amplitude.com',
+      'mixpanel.com',
+      'hotjar.com',
+      'fullstory.com',
+    ];
+
     page.on('request', (request: any) => {
+      const url = request.url();
       const resourceType = request.resourceType();
+
+      // Block by domain
+      if (BLOCKED_DOMAINS.some(domain => url.includes(domain))) {
+        request.abort();
+        return;
+      }
+
+      // Block by resource type
       if (['image', 'font', 'media', 'stylesheet'].includes(resourceType)) {
         request.abort();
-      } else {
-        request.continue();
+        return;
       }
+
+      request.continue();
     });
 
     // If proxy has auth, set it up
diff --git a/backend/src/tasks/handlers/product-discovery-dutchie.ts b/backend/src/tasks/handlers/product-discovery-dutchie.ts
index 55c2826c..9391bbe0 100644
--- a/backend/src/tasks/handlers/product-discovery-dutchie.ts
+++ b/backend/src/tasks/handlers/product-discovery-dutchie.ts
@@ -105,14 +105,39 @@ export async function handleProductDiscoveryDutchie(ctx: TaskContext): Promise {
+      const url = request.url();
       const resourceType = request.resourceType();
-      // Block images, fonts, media, and stylesheets - we don't need them
+
+      // Block by domain - saves significant proxy bandwidth
+      if (BLOCKED_DOMAINS.some(domain => url.includes(domain))) {
+        request.abort();
+        return;
+      }
+
+      // Block by resource type - images, fonts, media, stylesheets
       if (['image', 'font', 'media', 'stylesheet'].includes(resourceType)) {
         request.abort();
-      } else {
-        request.continue();
+        return;
       }
+
+      request.continue();
     });
 
     // Setup proxy auth if needed