chore: bump task worker version comment
Force a new git SHA to avoid the CI scientific-notation bug.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
221
backend/scripts/test-treez-full-crawl.ts
Normal file
221
backend/scripts/test-treez-full-crawl.ts
Normal file
@@ -0,0 +1,221 @@
|
||||
/**
 * Full crawl: visit each brand page and aggregate all products.
 */
|
||||
|
||||
import puppeteer, { Page } from 'puppeteer';
|
||||
|
||||
// Store identifier; used as the subdomain when building the `https://<id>.treez.io/...` URLs in this script.
const STORE_ID = 'best';
|
||||
|
||||
/**
 * Pause for the given number of milliseconds.
 *
 * @param ms - Delay in milliseconds before the returned promise resolves.
 * @returns A promise that resolves with no value once the timer fires.
 */
async function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
|
||||
|
||||
/**
 * Dismiss the age-verification modal if it is present on the current page.
 *
 * Looks for the `age-gate-modal` test id; when found, clicks the
 * `age-gate-submit-button` (if it exists) and then waits two seconds.
 * Does nothing when no modal is found.
 *
 * @param page - Puppeteer page to inspect.
 */
async function bypassAgeGate(page: Page): Promise<void> {
  const ageGate = await page.$('[data-testid="age-gate-modal"]');
  if (!ageGate) return;

  const submitButton = await page.$('[data-testid="age-gate-submit-button"]');
  if (submitButton) await submitButton.click();

  // Fixed pause after the click attempt, presumably to let the modal close — TODO confirm.
  await sleep(2000);
}
|
||||
|
||||
/**
 * Repeatedly scroll to the bottom of the page until its height stops changing.
 *
 * Each round reads `document.body.scrollHeight`, scrolls to that offset, and
 * waits before the next round. The loop ends once the height has been
 * unchanged for `stableChecks` consecutive reads, or after `maxScrolls` rounds.
 *
 * @param page - Puppeteer page to scroll.
 * @param options - Optional tuning knobs; the defaults reproduce the previous
 *   hard-coded behaviour.
 * @param options.maxScrolls - Upper bound on scroll rounds (default 30).
 * @param options.stableChecks - Consecutive unchanged height reads that end
 *   the loop (default 3).
 * @param options.settleMs - Pause after each scroll, in milliseconds
 *   (default 1000).
 */
async function scrollToLoadAll(
  page: Page,
  options: { maxScrolls?: number; stableChecks?: number; settleMs?: number } = {},
): Promise<void> {
  const { maxScrolls = 30, stableChecks = 3, settleMs = 1000 } = options;

  let previousHeight = 0;
  let unchangedChecks = 0;

  for (let round = 0; round < maxScrolls; round++) {
    const currentHeight = await page.evaluate(() => document.body.scrollHeight);

    if (currentHeight === previousHeight) {
      unchangedChecks++;
      if (unchangedChecks >= stableChecks) break;
    } else {
      unchangedChecks = 0;
    }

    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    await sleep(settleMs);
    previousHeight = currentHeight;
  }
}
|
||||
|
||||
/** Shape of one product entry scraped from a menu page. */
type ScrapedProduct = { name: string; brand: string; price: string; href: string };

/**
 * Collect the product links currently present in the page's DOM.
 *
 * Every `<a>` whose href contains `/product/` is considered. Entries without
 * a name, or whose href was already seen, are skipped.
 *
 * - `name`: the nested `<img>` alt text, falling back to the nested `<h5>` text.
 * - `brand`: the first hyphen-separated token of the product slug. This is a
 *   heuristic, so multi-word brands are truncated to their first word.
 * - `price`: the first `$` amount inside an element whose class contains
 *   `price`, without the `$` and without thousands separators; `''` if none.
 *
 * @param page - Puppeteer page whose DOM is inspected.
 * @returns The de-duplicated products, in document order.
 */
async function extractProducts(page: Page): Promise<ScrapedProduct[]> {
  // The callback is handed to page.evaluate, so it only uses names defined
  // inside itself (plus the page's own `document`).
  return page.evaluate(() => {
    const products: { name: string; brand: string; price: string; href: string }[] = [];
    const seen = new Set<string>();

    document.querySelectorAll('a[href*="/product/"]').forEach((anchor) => {
      const href = anchor.getAttribute('href') || '';
      const altText = anchor.querySelector('img')?.getAttribute('alt')?.trim();
      const headingText = anchor.querySelector('h5')?.textContent?.trim();
      const name = altText || headingText || '';

      if (!name || seen.has(href)) return;
      seen.add(href);

      // Extract brand from href pattern: /product/{brand}-{product}
      const slug = href.match(/\/product\/([^\/]+)/)?.[1] ?? '';
      const brand = slug.split('-')[0] || '';

      // Accept thousands separators ("$1,200.00") and strip them from the result.
      const priceText = anchor.querySelector('[class*="price"]')?.textContent ?? '';
      const priceMatch = priceText.match(/\$(\d[\d,]*(?:\.\d{2})?)/);
      const price = priceMatch ? priceMatch[1].replace(/,/g, '') : '';

      products.push({ name, brand, price, href });
    });

    return products;
  });
}
|
||||
|
||||
/**
 * Resolve a scraped brand href against the page it was found on.
 *
 * @returns The absolute URL, or `null` when the href cannot be parsed.
 */
function resolveBrandUrl(href: string, baseUrl: string): string | null {
  try {
    return new URL(href, baseUrl).toString();
  } catch {
    return null;
  }
}

/** Return `url` with `customerType=ADULT` added unless a `customerType` query parameter is already present. */
function withAdultCustomerType(url: string): string {
  const parsed = new URL(url);
  if (!parsed.searchParams.has('customerType')) {
    parsed.searchParams.append('customerType', 'ADULT');
  }
  return parsed.toString();
}

/**
 * Crawl every brand page of the configured store and print a summary.
 *
 * Steps:
 *  1. Open the brands listing and collect all `/brand/` links.
 *  2. Derive additional candidate brand slugs from `/product/` links on that page.
 *  3. Build a de-duplicated set of absolute brand URLs.
 *  4. Visit each brand URL, scroll to load everything, and collect products
 *     keyed by href.
 *
 * A failure on a single brand page is logged and the crawl continues. The
 * browser is always closed, including when an error aborts the crawl.
 */
async function main(): Promise<void> {
  console.log('='.repeat(60));
  console.log('Full Treez Crawl - All Brands');
  console.log('='.repeat(60));

  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Abort image, font and media requests; only the DOM is needed.
    await page.setRequestInterception(true);
    page.on('request', (req) => {
      if (['image', 'font', 'media'].includes(req.resourceType())) {
        req.abort();
      } else {
        req.continue();
      }
    });

    // Step 1: Go to brands page and extract all brand links
    const brandsUrl = `https://${STORE_ID}.treez.io/onlinemenu/brands?customerType=ADULT`;
    console.log(`\n[1] Getting brand list...`);

    await page.goto(brandsUrl, { waitUntil: 'networkidle2', timeout: 60000 });
    await sleep(3000);
    await bypassAgeGate(page);
    await sleep(2000);

    // Get all brand links from the page
    const brandLinks = await page.evaluate(() => {
      const links: string[] = [];
      const seen = new Set<string>();

      // Get all /brand/ links
      document.querySelectorAll('a[href*="/brand/"]').forEach((a) => {
        const href = a.getAttribute('href') || '';
        if (href && !seen.has(href)) {
          seen.add(href);
          links.push(href);
        }
      });

      return links;
    });

    console.log(`Found ${brandLinks.length} brand links: ${brandLinks.join(', ')}`);

    // Step 2: Also extract unique brands from product URLs
    const productBrands = await page.evaluate(() => {
      const brands = new Set<string>();

      document.querySelectorAll('a[href*="/product/"]').forEach((a) => {
        const href = a.getAttribute('href') || '';
        // Pattern: /product/{brand}-{product}-...
        // Heuristic: take the first one or two hyphen-separated tokens as a brand candidate.
        const match = href.match(/\/product\/([a-z0-9]+(?:-[a-z0-9]+)?)-/i);
        if (match) {
          brands.add(match[1].toLowerCase());
        }
      });

      return Array.from(brands);
    });

    console.log(`Found ${productBrands.length} brands from product URLs`);

    // Step 3: Build full brand URL list. URLs are made absolute and given the
    // customerType parameter before insertion so the Set de-duplicates them.
    const allBrandUrls = new Set<string>();

    // Add direct brand links
    for (const link of brandLinks) {
      const resolved = resolveBrandUrl(link, brandsUrl);
      if (resolved === null) {
        console.log(`Skipping unparseable brand link: ${link}`);
        continue;
      }
      allBrandUrls.add(withAdultCustomerType(resolved));
    }

    // Add brand URLs from product slugs
    for (const brand of productBrands) {
      allBrandUrls.add(
        withAdultCustomerType(`https://${STORE_ID}.treez.io/brand/${encodeURIComponent(brand)}`),
      );
    }

    console.log(`Total brand URLs to visit: ${allBrandUrls.size}`);

    // Step 4: Visit each brand page and collect products
    const allProducts = new Map<string, { name: string; brand: string; price: string; href: string }>();
    let visitedBrands = 0;

    for (const fullUrl of allBrandUrls) {
      try {
        console.log(`\n[${++visitedBrands}/${allBrandUrls.size}] Visiting: ${fullUrl}`);

        await page.goto(fullUrl, { waitUntil: 'networkidle2', timeout: 30000 });
        await sleep(1500);

        // Scroll to load all
        await scrollToLoadAll(page);

        const products = await extractProducts(page);
        console.log(` Found ${products.length} products`);

        for (const product of products) {
          if (!allProducts.has(product.href)) {
            allProducts.set(product.href, product);
          }
        }

        console.log(` Total unique so far: ${allProducts.size}`);
      } catch (error: unknown) {
        // Best effort: one failing brand page must not abort the whole crawl.
        const message = error instanceof Error ? error.message : String(error);
        console.log(` Error: ${message.slice(0, 50)}`);
      }

      // Small delay between requests
      await sleep(500);
    }

    // Summary
    console.log('\n' + '='.repeat(60));
    console.log('SUMMARY');
    console.log('='.repeat(60));
    console.log(`Brands visited: ${visitedBrands}`);
    console.log(`Total unique products: ${allProducts.size}`);

    // Count by brand
    const brandCounts: Record<string, number> = {};
    allProducts.forEach((p) => {
      brandCounts[p.brand] = (brandCounts[p.brand] || 0) + 1;
    });

    console.log('\nProducts by brand:');
    Object.entries(brandCounts)
      .sort((a, b) => b[1] - a[1])
      .slice(0, 20)
      .forEach(([brand, count]) => {
        console.log(` ${brand}: ${count}`);
      });

    // Sample products
    console.log('\nSample products:');
    Array.from(allProducts.values())
      .slice(0, 10)
      .forEach((p) => {
        console.log(` - ${p.name} | ${p.brand} | $${p.price || 'N/A'}`);
      });
  } finally {
    // Always release the browser process, even when the crawl throws.
    await browser.close();
  }
}
|
||||
|
||||
// Entry point: run the crawl; on failure, log the error and make the process exit non-zero.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user