Force new git SHA to avoid CI scientific notation bug. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
204 lines
6.4 KiB
TypeScript
204 lines
6.4 KiB
TypeScript
/**
 * Extract ALL product elements from the store's online menu and find the unique products.
 */
|
|
|
|
import puppeteer, { Page } from 'puppeteer';
|
|
|
|
// Store identifier; used as the subdomain of the treez.io online-menu URL that main() loads.
const STORE_ID = 'best';
|
|
|
|
/**
 * Pauses the calling async flow for the given duration.
 *
 * @param ms - Delay in milliseconds, passed straight to `setTimeout`.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
async function sleep(ms: number): Promise<void> {
  await new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
|
|
|
/**
 * Dismisses the age-gate modal when the page is showing one.
 *
 * Looks for the modal by its `data-testid`; if it is absent the function
 * returns immediately. Otherwise it clicks the submit button (when that
 * button can be found) and then waits two seconds before returning.
 *
 * @param page - Puppeteer page that has already navigated to the menu.
 */
async function bypassAgeGate(page: Page): Promise<void> {
  const modal = await page.$('[data-testid="age-gate-modal"]');
  if (!modal) {
    return;
  }

  const submitButton = await page.$('[data-testid="age-gate-submit-button"]');
  if (submitButton) {
    await submitButton.click();
  }

  // The pause happens whenever the modal was present, even if no button was found.
  await sleep(2000);
}
|
|
|
|
/**
 * Turns on request interception and aborts image, font and media requests;
 * every other request is continued unchanged.
 * NOTE(review): presumably done to speed up page loads — confirm intent.
 */
async function blockHeavyResources(page: Page): Promise<void> {
  await page.setRequestInterception(true);
  page.on('request', (req) => {
    if (['image', 'font', 'media'].includes(req.resourceType())) {
      req.abort();
    } else {
      req.continue();
    }
  });
}

/**
 * Step [1]: counts every element whose class contains `product_product__`,
 * grouped by tag name, and logs the totals plus a few sample hrefs / alts.
 */
async function reportElementCounts(page: Page): Promise<void> {
  // Get ALL elements with product_product__ class
  console.log('\n[1] Counting all product_product__ elements...');

  // The callback runs inside the browser page, so it must be self-contained:
  // it cannot reference variables or helpers from this Node-side module.
  const elementAnalysis = await page.evaluate(() => {
    const all = document.querySelectorAll('[class*="product_product__"]');
    const byTag: Record<string, number> = {};
    const anchorHrefs: string[] = [];
    const imgAlts: string[] = [];

    all.forEach(el => {
      const tag = el.tagName;
      byTag[tag] = (byTag[tag] || 0) + 1;

      if (tag === 'A') {
        const href = el.getAttribute('href');
        if (href && href.includes('/product/')) {
          anchorHrefs.push(href);
        }
      }

      if (tag === 'IMG') {
        const alt = el.getAttribute('alt');
        if (alt) imgAlts.push(alt);
      }
    });

    // Only the first 20 samples are sent back; uniqueness is computed over the full lists.
    return {
      total: all.length,
      byTag,
      anchorHrefs: anchorHrefs.slice(0, 20),
      uniqueAnchors: new Set(anchorHrefs).size,
      imgAlts: imgAlts.slice(0, 20),
      uniqueImgAlts: new Set(imgAlts).size,
    };
  });

  console.log(`Total elements: ${elementAnalysis.total}`);
  console.log(`By tag:`, elementAnalysis.byTag);
  console.log(`Unique anchor hrefs: ${elementAnalysis.uniqueAnchors}`);
  console.log(`Unique image alts: ${elementAnalysis.uniqueImgAlts}`);
  console.log(`\nSample anchor hrefs:`, elementAnalysis.anchorHrefs.slice(0, 5));
  console.log(`Sample image alts:`, elementAnalysis.imgAlts.slice(0, 5));
}

/**
 * Step [2]: runs four selector strategies in the page and logs, for each one,
 * how many elements matched, how many distinct names they yield, and up to
 * five sample names.
 */
async function reportExtractionApproaches(page: Page): Promise<void> {
  // Try to extract using different approaches
  console.log('\n[2] Testing extraction approaches...');

  // Runs in the browser context; keep the callback free of outer references.
  const approaches = await page.evaluate(() => {
    const results: Record<string, { count: number; unique: number; sample: string[] }> = {};

    // Approach 1: Anchor elements with product links
    const anchors = document.querySelectorAll('a[href*="/product/"]');
    const anchorNames = new Set<string>();
    anchors.forEach(a => {
      const img = a.querySelector('img');
      // Prefer the image alt; otherwise fall back to the first line of the link text.
      const name = img?.getAttribute('alt') || a.textContent?.trim().split('\n')[0] || '';
      if (name) anchorNames.add(name);
    });
    results['a[href*="/product/"]'] = {
      count: anchors.length,
      unique: anchorNames.size,
      sample: Array.from(anchorNames).slice(0, 5),
    };

    // Approach 2: Images with alt text inside product areas
    const productImgs = document.querySelectorAll('[class*="product_product__"] img[alt]');
    const imgNames = new Set<string>();
    productImgs.forEach(img => {
      const alt = img.getAttribute('alt');
      // Alts of two characters or fewer are skipped.
      if (alt && alt.length > 2) imgNames.add(alt);
    });
    results['[class*="product_product__"] img[alt]'] = {
      count: productImgs.length,
      unique: imgNames.size,
      sample: Array.from(imgNames).slice(0, 5),
    };

    // Approach 3: H5 elements (product names)
    const h5s = document.querySelectorAll('h5.product_product__name__JcEk0, h5[class*="product__name"]');
    const h5Names = new Set<string>();
    h5s.forEach(h5 => {
      const text = h5.textContent?.trim();
      if (text) h5Names.add(text);
    });
    results['h5[class*="product__name"]'] = {
      count: h5s.length,
      unique: h5Names.size,
      sample: Array.from(h5Names).slice(0, 5),
    };

    // Approach 4: Link class with product_product__
    const links = document.querySelectorAll('a.product_product__ERWtJ, a[class*="product_product__"][class*="link"]');
    const linkNames = new Set<string>();
    links.forEach(link => {
      const h5 = link.querySelector('h5');
      const img = link.querySelector('img');
      const name = h5?.textContent?.trim() || img?.getAttribute('alt') || '';
      if (name) linkNames.add(name);
    });
    results['a.product_product__ERWtJ'] = {
      count: links.length,
      unique: linkNames.size,
      sample: Array.from(linkNames).slice(0, 5),
    };

    return results;
  });

  Object.entries(approaches).forEach(([sel, data]) => {
    console.log(`\n${sel}:`);
    console.log(` Count: ${data.count}, Unique: ${data.unique}`);
    console.log(` Sample: ${data.sample.join(', ')}`);
  });
}

/**
 * Step [3]: walks every `/product/` link, de-duplicates by the alt text of the
 * link's image, and logs the number of unique products plus the first ten.
 */
async function reportUniqueProducts(page: Page): Promise<void> {
  // The best approach: use images with alt as the source of truth
  console.log('\n[3] Full product extraction using img[alt] approach...');

  // Runs in the browser context; keep the callback free of outer references.
  const products = await page.evaluate(() => {
    const seen = new Set<string>();
    const found: { name: string; href: string; price: string }[] = [];

    // Get all product links
    document.querySelectorAll('a[href*="/product/"]').forEach(a => {
      const img = a.querySelector('img');
      const name = img?.getAttribute('alt') || '';

      // Links without an image alt, or whose alt was already recorded, are skipped.
      if (!name || seen.has(name)) return;
      seen.add(name);

      const href = a.getAttribute('href') || '';

      // Get price from within the link or parent
      let price = '';
      const priceEl = a.querySelector('[class*="price"]');
      if (priceEl) {
        // Captures the digits after the first "$", with an optional two-digit fraction.
        const priceMatch = priceEl.textContent?.match(/\$(\d+(?:\.\d{2})?)/);
        price = priceMatch ? priceMatch[1] : '';
      }

      found.push({ name, href, price });
    });

    return found;
  });

  console.log(`Extracted ${products.length} unique products`);
  console.log('\nSample products:');
  products.slice(0, 10).forEach(p => {
    console.log(` - ${p.name} | ${p.price ? '$' + p.price : 'N/A'} | ${p.href.slice(0, 40)}...`);
  });
}

/**
 * Entry point: launches headless Chromium, opens the store's brands menu,
 * dismisses the age gate, then runs the three reporting steps in order.
 *
 * The browser is closed in a `finally` block so that a navigation timeout or
 * an error thrown by any step does not leave a Chromium process running.
 * Errors are not swallowed here; they propagate to the caller.
 */
async function main(): Promise<void> {
  console.log('='.repeat(60));
  console.log('Extracting ALL product elements');
  console.log('='.repeat(60));

  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    await blockHeavyResources(page);

    const url = `https://${STORE_ID}.treez.io/onlinemenu/brands?customerType=ADULT`;
    await page.goto(url, { waitUntil: 'networkidle2', timeout: 60000 });
    // Fixed pauses around the age gate; NOTE(review): presumably to let the
    // client-side menu finish rendering — confirm before tuning.
    await sleep(3000);
    await bypassAgeGate(page);
    await sleep(2000);

    await reportElementCounts(page);
    await reportExtractionApproaches(page);
    await reportUniqueProducts(page);
  } finally {
    await browser.close();
  }
}
|
|
|
|
// Run the script. On failure, log the error and set a non-zero exit code so
// shells and CI can tell the run did not succeed.
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
|