Force new git SHA to avoid CI scientific notation bug. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
222 lines
6.5 KiB
TypeScript
222 lines
6.5 KiB
TypeScript
/**
 * Full crawl of a Treez online menu: visit each brand page and aggregate all products.
 */
|
|
|
|
import puppeteer, { Page } from 'puppeteer';
|
|
|
|
// Store identifier; interpolated as the subdomain of the treez.io URLs built in main().
const STORE_ID = 'best';
|
|
|
|
/**
 * Pauses the calling async flow for a fixed amount of time.
 *
 * @param ms - Delay in milliseconds, passed straight to `setTimeout`.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
async function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
|
|
|
/**
 * Dismisses the age-gate modal when one is present on the page.
 *
 * The modal is looked up by its `data-testid`. If it is found, the submit
 * button is clicked (when that button exists) and the function then waits two
 * seconds. When no modal is found the function returns immediately.
 *
 * @param page - Puppeteer page to inspect.
 */
async function bypassAgeGate(page: Page): Promise<void> {
  const modal = await page.$('[data-testid="age-gate-modal"]');
  if (!modal) {
    return;
  }

  const submitButton = await page.$('[data-testid="age-gate-submit-button"]');
  if (submitButton) {
    await submitButton.click();
  }

  // The pause happens whenever the modal was found, even if no button was clicked.
  await sleep(2000);
}
|
|
|
|
/**
 * Repeatedly scrolls to the bottom of the page until its height stops changing.
 *
 * Each pass reads `document.body.scrollHeight`, scrolls to that offset and
 * waits one second. The loop ends after 30 passes, or earlier once the height
 * has matched the previous pass's height three times in a row.
 *
 * @param page - Puppeteer page to scroll.
 */
async function scrollToLoadAll(page: Page): Promise<void> {
  const maxPasses = 30;
  const stableLimit = 3;

  let lastHeight = 0;
  let stablePasses = 0;
  let pass = 0;

  while (pass < maxPasses) {
    const height = await page.evaluate(() => document.body.scrollHeight);

    // Count consecutive passes with an unchanged height; any change resets the streak.
    stablePasses = height === lastHeight ? stablePasses + 1 : 0;
    if (stablePasses >= stableLimit) {
      break;
    }

    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    await sleep(1000);

    lastHeight = height;
    pass += 1;
  }
}
|
|
|
|
/**
 * Collects product entries from every `/product/` link on the current page.
 *
 * Runs inside the page via `page.evaluate`, so the callback may only use
 * values available in the browser context.
 *
 * For each anchor:
 * - `name` is the nested image's `alt` text, falling back to the trimmed text
 *   of a nested `<h5>`; anchors without a name are skipped.
 * - `href` is the raw `href` attribute; repeated hrefs are skipped.
 * - `brand` is the part of the path segment after `/product/` that precedes
 *   its first hyphen.
 * - `price` is the number following the first `$` in a nested element whose
 *   class contains "price", or an empty string when none is found.
 *
 * @param page - Puppeteer page to read from.
 * @returns The products found, in document order.
 */
async function extractProducts(page: Page): Promise<{ name: string; brand: string; price: string; href: string }[]> {
  return page.evaluate(() => {
    const collected: { name: string; brand: string; price: string; href: string }[] = [];
    const visitedHrefs = new Set<string>();

    for (const anchor of Array.from(document.querySelectorAll('a[href*="/product/"]'))) {
      const href = anchor.getAttribute('href') || '';
      const altText = anchor.querySelector('img')?.getAttribute('alt');
      const headingText = anchor.querySelector('h5')?.textContent?.trim();
      const name = altText || headingText || '';

      if (!name || visitedHrefs.has(href)) {
        continue;
      }
      visitedHrefs.add(href);

      // Brand comes from the href pattern: /product/{brand}-{product}
      const slugMatch = href.match(/\/product\/([^\/]+)/);
      const productSlug = slugMatch ? slugMatch[1] : '';

      const priceText = anchor.querySelector('[class*="price"]')?.textContent;
      const priceMatch = priceText?.match(/\$(\d+(?:\.\d{2})?)/);
      const price = priceMatch ? priceMatch[1] : '';

      collected.push({ name, brand: productSlug.split('-')[0] || '', price, href });
    }

    return collected;
  });
}
|
|
|
|
/**
 * Resolves a scraped brand href into an absolute URL that carries a
 * `customerType` query parameter.
 *
 * Relative and protocol-relative hrefs are resolved against `baseUrl`. An
 * existing `customerType` parameter is left untouched; otherwise
 * `customerType=ADULT` is added without disturbing other query parameters.
 *
 * @param href - Raw href taken from the page, or an already absolute URL.
 * @param baseUrl - Absolute URL used to resolve relative hrefs.
 * @returns The normalised absolute URL, or `null` when `href` cannot be parsed.
 */
function toBrandPageUrl(href: string, baseUrl: string): string | null {
  try {
    const url = new URL(href, baseUrl);
    if (!url.searchParams.has('customerType')) {
      url.searchParams.set('customerType', 'ADULT');
    }
    return url.toString();
  } catch {
    return null;
  }
}

/**
 * Crawls every brand page of the store's online menu and prints a summary.
 *
 * Steps:
 * 1. Open the brands page and collect all `/brand/` links.
 * 2. Derive additional brand slugs from the `/product/` links on that page.
 * 3. Normalise both sources into one de-duplicated set of brand URLs.
 * 4. Visit each brand URL, scroll to load its products, and aggregate them by
 *    product href.
 * 5. Log totals, the 20 brands with the most products, and 10 sample products.
 *
 * A failure on one brand page is logged and the crawl continues. The browser
 * is closed even when the crawl throws.
 */
async function main(): Promise<void> {
  console.log('='.repeat(60));
  console.log('Full Treez Crawl - All Brands');
  console.log('='.repeat(60));

  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Abort image, font and media requests; only the DOM is read by this script.
    await page.setRequestInterception(true);
    page.on('request', (req) => {
      if (['image', 'font', 'media'].includes(req.resourceType())) {
        req.abort();
      } else {
        req.continue();
      }
    });

    // Step 1: Go to brands page and extract all brand links
    const brandsUrl = `https://${STORE_ID}.treez.io/onlinemenu/brands?customerType=ADULT`;
    console.log(`\n[1] Getting brand list...`);

    await page.goto(brandsUrl, { waitUntil: 'networkidle2', timeout: 60000 });
    await sleep(3000);
    await bypassAgeGate(page);
    await sleep(2000);

    // Get all brand links from the page
    const brandLinks = await page.evaluate(() => {
      const links: string[] = [];
      const seen = new Set<string>();

      document.querySelectorAll('a[href*="/brand/"]').forEach((a) => {
        const href = a.getAttribute('href') || '';
        if (href && !seen.has(href)) {
          seen.add(href);
          links.push(href);
        }
      });

      return links;
    });

    console.log(`Found ${brandLinks.length} brand links: ${brandLinks.join(', ')}`);

    // Step 2: Also extract unique brands from product URLs
    const productBrands = await page.evaluate(() => {
      const brands = new Set<string>();

      document.querySelectorAll('a[href*="/product/"]').forEach((a) => {
        const href = a.getAttribute('href') || '';
        // Pattern: /product/{brand}-{product}-...
        // Captures the first one or two hyphen-separated tokens as the brand guess.
        const match = href.match(/\/product\/([a-z0-9]+(?:-[a-z0-9]+)?)-/i);
        if (match) {
          brands.add(match[1].toLowerCase());
        }
      });

      return Array.from(brands);
    });

    console.log(`Found ${productBrands.length} brands from product URLs`);

    // Step 3: Build full brand URL list. URLs are normalised before insertion so
    // the same page reached via different href spellings is only visited once.
    const allBrandUrls = new Set<string>();
    const candidateHrefs = [
      ...brandLinks,
      // NOTE(review): assumes brand pages live at /brand/{slug} — confirm against the site.
      ...productBrands.map((brand) => `https://${STORE_ID}.treez.io/brand/${encodeURIComponent(brand)}`),
    ];

    for (const href of candidateHrefs) {
      const normalized = toBrandPageUrl(href, brandsUrl);
      if (normalized === null) {
        console.log(`Skipping unparseable brand link: ${href}`);
        continue;
      }
      allBrandUrls.add(normalized);
    }

    console.log(`Total brand URLs to visit: ${allBrandUrls.size}`);

    // Step 4: Visit each brand page and collect products
    const allProducts = new Map<string, { name: string; brand: string; price: string; href: string }>();
    let visitedBrands = 0;

    for (const brandUrl of allBrandUrls) {
      try {
        console.log(`\n[${++visitedBrands}/${allBrandUrls.size}] Visiting: ${brandUrl}`);

        await page.goto(brandUrl, { waitUntil: 'networkidle2', timeout: 30000 });
        await sleep(1500);

        // Scroll to load all
        await scrollToLoadAll(page);

        const products = await extractProducts(page);
        console.log(` Found ${products.length} products`);

        // First occurrence of an href wins; later duplicates are ignored.
        for (const product of products) {
          if (!allProducts.has(product.href)) {
            allProducts.set(product.href, product);
          }
        }

        console.log(` Total unique so far: ${allProducts.size}`);
      } catch (error: unknown) {
        // A thrown value is not guaranteed to be an Error, so narrow before reading .message.
        const message = error instanceof Error ? error.message : String(error);
        console.log(` Error: ${message.slice(0, 50)}`);
      }

      // Small delay between requests
      await sleep(500);
    }

    // Summary
    console.log('\n' + '='.repeat(60));
    console.log('SUMMARY');
    console.log('='.repeat(60));
    console.log(`Brands visited: ${visitedBrands}`);
    console.log(`Total unique products: ${allProducts.size}`);

    // Count by brand. A Map is used so brand names can never collide with
    // inherited object keys such as "constructor".
    const brandCounts = new Map<string, number>();
    for (const product of allProducts.values()) {
      brandCounts.set(product.brand, (brandCounts.get(product.brand) ?? 0) + 1);
    }

    console.log('\nProducts by brand:');
    Array.from(brandCounts.entries())
      .sort((a, b) => b[1] - a[1])
      .slice(0, 20)
      .forEach(([brand, count]) => {
        console.log(` ${brand}: ${count}`);
      });

    // Sample products
    console.log('\nSample products:');
    Array.from(allProducts.values())
      .slice(0, 10)
      .forEach((p) => {
        console.log(` - ${p.name} | ${p.brand} | $${p.price || 'N/A'}`);
      });
  } finally {
    // Always release the browser process, including when the crawl throws.
    await browser.close();
  }
}
|
|
|
|
// Script entry point: run the crawl and print any rejection to stderr.
main().catch((err) => {
  console.error(err);
});
|