Files
cannaiq/backend/scripts/test-treez-full-crawl.ts
Kelly 698995e46f chore: bump task worker version comment
Force new git SHA to avoid CI scientific notation bug.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-14 02:02:30 -07:00

222 lines
6.5 KiB
TypeScript

/**
* Full crawl: Visit each brand page and aggregate all products
*/
import puppeteer, { Page } from 'puppeteer';
// Store identifier; main() interpolates it as the subdomain of the
// `https://${STORE_ID}.treez.io/...` URLs it visits.
const STORE_ID = 'best';
/**
 * Pause the calling async flow for a fixed amount of time.
 *
 * @param ms - Delay in milliseconds, handed straight to `setTimeout`.
 * @returns A promise that settles (with no value) once the timer fires.
 */
async function sleep(ms: number): Promise<void> {
  await new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Dismiss the age-verification modal when the page is showing one.
 *
 * Looks up `[data-testid="age-gate-modal"]`; if it is absent the function
 * returns immediately. Otherwise it clicks the submit button (when one is
 * found) and then waits two seconds.
 *
 * @param page - Puppeteer page to inspect.
 */
async function bypassAgeGate(page: Page): Promise<void> {
  const modal = await page.$('[data-testid="age-gate-modal"]');
  if (!modal) {
    return;
  }

  const submitButton = await page.$('[data-testid="age-gate-submit-button"]');
  if (submitButton) {
    await submitButton.click();
  }

  // The pause runs whenever the modal was present, even if no button was clicked.
  await sleep(2000);
}
/**
 * Repeatedly scroll to the bottom of the page until its height stops changing.
 *
 * Each pass reads `document.body.scrollHeight`, scrolls to that offset and
 * waits one second. The loop ends after 30 passes, or earlier once the height
 * has matched the previous pass three times in a row.
 *
 * @param page - Puppeteer page to scroll.
 */
async function scrollToLoadAll(page: Page): Promise<void> {
  const maxPasses = 30;
  const stablePassesToStop = 3;

  let lastHeight = 0;
  let stablePasses = 0;

  for (let pass = 0; pass < maxPasses; pass += 1) {
    const height = await page.evaluate(() => document.body.scrollHeight);

    // Count consecutive passes with an unchanged height; any change resets the streak.
    stablePasses = height === lastHeight ? stablePasses + 1 : 0;
    if (stablePasses >= stablePassesToStop) {
      return;
    }

    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    await sleep(1000);
    lastHeight = height;
  }
}
/**
 * Collect the product cards currently present in the page's DOM.
 *
 * Runs inside the browser context. Every `<a>` whose href contains
 * `/product/` is considered once per distinct href, in document order.
 * Anchors without a name (image `alt`, falling back to the `<h5>` text) are
 * skipped.
 *
 * @param page - Puppeteer page whose DOM is scanned.
 * @returns One entry per distinct product href:
 *   - `name`: image alt text or trimmed `<h5>` text;
 *   - `brand`: first hyphen-separated token of the path segment after `/product/`;
 *   - `price`: first `$`-prefixed number found in a descendant whose class
 *     contains "price" (without the `$`), or `''` when none is found;
 *   - `href`: the anchor's raw `href` attribute.
 */
async function extractProducts(page: Page): Promise<{ name: string; brand: string; price: string; href: string }[]> {
  return page.evaluate(() => {
    // Map keeps insertion order, so it serves as both the de-dup set and the result list.
    const byHref = new Map<string, { name: string; brand: string; price: string; href: string }>();

    document.querySelectorAll('a[href*="/product/"]').forEach((anchor) => {
      const href = anchor.getAttribute('href') || '';
      if (byHref.has(href)) {
        return;
      }

      const name =
        anchor.querySelector('img')?.getAttribute('alt') ||
        anchor.querySelector('h5')?.textContent?.trim() ||
        '';
      if (!name) {
        return;
      }

      // Slug is the path segment right after /product/; its first token is used as the brand.
      const slugHit = href.match(/\/product\/([^\/]+)/);
      const slug = slugHit ? slugHit[1] : '';
      const brand = slug.split('-')[0] || '';

      const priceText = anchor.querySelector('[class*="price"]')?.textContent;
      const priceHit = priceText?.match(/\$(\d+(?:\.\d{2})?)/);
      const price = priceHit ? priceHit[1] : '';

      byHref.set(href, { name, brand, price, href });
    });

    return Array.from(byHref.values());
  });
}
/**
 * Crawl every brand page of the configured store and print a summary.
 *
 * Flow:
 *  1. Launch headless Chromium with image/font/media requests aborted.
 *  2. Open the brands listing, dismiss the age gate, and collect every
 *     `/brand/` link on the page.
 *  3. Derive additional brand slugs from any `/product/` links on that page.
 *  4. Visit each brand URL, scroll until the page stops growing, and merge the
 *     products found (de-duplicated by href).
 *  5. Log totals, the top 20 brands by product count, and 10 sample products.
 *
 * A failure on an individual brand page is logged and the crawl continues.
 * Any other failure propagates to the caller; the browser is closed either way.
 */
async function main(): Promise<void> {
  console.log('='.repeat(60));
  console.log('Full Treez Crawl - All Brands');
  console.log('='.repeat(60));

  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  // try/finally guarantees the Chromium process is shut down even when the
  // initial navigation (or anything else outside the per-brand catch) throws.
  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Skip heavy assets: only the DOM is needed for extraction.
    await page.setRequestInterception(true);
    page.on('request', (req) => {
      if (['image', 'font', 'media'].includes(req.resourceType())) {
        req.abort();
      } else {
        req.continue();
      }
    });

    // Step 1: Go to brands page and extract all brand links
    const brandsUrl = `https://${STORE_ID}.treez.io/onlinemenu/brands?customerType=ADULT`;
    console.log(`\n[1] Getting brand list...`);
    await page.goto(brandsUrl, { waitUntil: 'networkidle2', timeout: 60000 });
    await sleep(3000);
    await bypassAgeGate(page);
    await sleep(2000);

    // Distinct hrefs of every /brand/ link on the page, in document order.
    const brandLinks = await page.evaluate(() => {
      const links: string[] = [];
      const seen = new Set<string>();
      document.querySelectorAll('a[href*="/brand/"]').forEach(a => {
        const href = a.getAttribute('href') || '';
        if (href && !seen.has(href)) {
          seen.add(href);
          links.push(href);
        }
      });
      return links;
    });
    console.log(`Found ${brandLinks.length} brand links: ${brandLinks.join(', ')}`);

    // Step 2: Also extract unique brands from product URLs
    const productBrands = await page.evaluate(() => {
      const brands = new Set<string>();
      document.querySelectorAll('a[href*="/product/"]').forEach(a => {
        const href = a.getAttribute('href') || '';
        // Pattern: /product/{brand}-{product}-...
        // Takes the first one or two hyphen-separated tokens as the brand guess.
        const match = href.match(/\/product\/([a-z0-9]+(?:-[a-z0-9]+)?)-/i);
        if (match) {
          brands.add(match[1].toLowerCase());
        }
      });
      return Array.from(brands);
    });
    console.log(`Found ${productBrands.length} brands from product URLs`);

    // Step 3: Build full brand URL list
    const allBrandUrls = new Set<string>();
    // Direct brand links: root-relative hrefs are prefixed with the store origin.
    brandLinks.forEach(link => {
      if (link.startsWith('/')) {
        allBrandUrls.add(`https://${STORE_ID}.treez.io${link}`);
      } else {
        allBrandUrls.add(link);
      }
    });
    // Brand URLs guessed from product slugs.
    productBrands.forEach(brand => {
      allBrandUrls.add(`https://${STORE_ID}.treez.io/brand/${encodeURIComponent(brand)}`);
    });
    console.log(`Total brand URLs to visit: ${allBrandUrls.size}`);

    // Step 4: Visit each brand page and collect products
    const allProducts = new Map<string, { name: string; brand: string; price: string; href: string }>();
    let visitedBrands = 0;
    for (const brandUrl of allBrandUrls) {
      try {
        let fullUrl = brandUrl;
        if (!brandUrl.includes('customerType')) {
          // Use '&' when the link already carries a query string so the URL stays well-formed.
          const separator = brandUrl.includes('?') ? '&' : '?';
          fullUrl = `${brandUrl}${separator}customerType=ADULT`;
        }
        console.log(`\n[${++visitedBrands}/${allBrandUrls.size}] Visiting: ${fullUrl}`);
        await page.goto(fullUrl, { waitUntil: 'networkidle2', timeout: 30000 });
        await sleep(1500);

        // Scroll to load all
        await scrollToLoadAll(page);
        const products = await extractProducts(page);
        console.log(` Found ${products.length} products`);
        products.forEach(p => {
          if (!allProducts.has(p.href)) {
            allProducts.set(p.href, p);
          }
        });
        console.log(` Total unique so far: ${allProducts.size}`);
      } catch (error: unknown) {
        // Rejections are not guaranteed to be Error instances; narrow before reading .message.
        const message = error instanceof Error ? error.message : String(error);
        console.log(` Error: ${message.slice(0, 50)}`);
      }
      // Small delay between requests
      await sleep(500);
    }

    // Summary
    console.log('\n' + '='.repeat(60));
    console.log('SUMMARY');
    console.log('='.repeat(60));
    console.log(`Brands visited: ${visitedBrands}`);
    console.log(`Total unique products: ${allProducts.size}`);

    // Count by brand
    const brandCounts: Record<string, number> = {};
    allProducts.forEach(p => {
      brandCounts[p.brand] = (brandCounts[p.brand] || 0) + 1;
    });
    console.log('\nProducts by brand:');
    Object.entries(brandCounts)
      .sort((a, b) => b[1] - a[1])
      .slice(0, 20)
      .forEach(([brand, count]) => {
        console.log(` ${brand}: ${count}`);
      });

    // Sample products
    console.log('\nSample products:');
    Array.from(allProducts.values()).slice(0, 10).forEach(p => {
      console.log(` - ${p.name} | ${p.brand} | $${p.price || 'N/A'}`);
    });
  } finally {
    await browser.close();
  }
}
// Run the crawl. On failure, log the error and flag the process as failed so
// shells/CI see a non-zero exit status instead of a silent success.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});