// Source viewer metadata: 165 lines · 5.3 KiB · TypeScript
import { createStealthBrowser, createStealthContext, waitForPageLoad, isCloudflareChallenge, waitForCloudflareChallenge } from './src/utils/stealthBrowser';
|
|
import { getRandomProxy } from './src/utils/proxyManager';
|
|
import { pool } from './src/db/migrate';
|
|
|
|
/**
 * A brand as extracted from a store's brands page.
 */
interface Brand {
  /** Display name of the brand (trimmed text taken from the brand link). */
  name: string;
  /** `src` of the image found inside the brand link; omitted when there is none. */
  logo_url?: string;
}
|
|
|
|
/**
 * Upserts every brand into `brands` and links it to the store in
 * `store_brands`, refreshing the last-seen timestamps on conflict.
 *
 * Queries run one at a time on the shared pool and are not wrapped in a
 * transaction, so a failure part-way through leaves earlier brands saved.
 *
 * @param storeId - ID of the store the brands were seen at.
 * @param brands - Brands to persist.
 */
async function persistStoreBrands(storeId: number, brands: readonly Brand[]): Promise<void> {
  for (const brand of brands) {
    // Insert or update brand. A null logo keeps whatever logo is already stored.
    const brandResult = await pool.query(
      `
      INSERT INTO brands (name, logo_url, first_seen_at, last_seen_at)
      VALUES ($1, $2, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
      ON CONFLICT (name)
      DO UPDATE SET
        logo_url = COALESCE($2, brands.logo_url),
        last_seen_at = CURRENT_TIMESTAMP,
        updated_at = CURRENT_TIMESTAMP
      RETURNING id
      `,
      [brand.name, brand.logo_url ?? null]
    );

    const brandId = brandResult.rows[0].id;

    // Link brand to store
    await pool.query(
      `
      INSERT INTO store_brands (store_id, brand_id, first_seen_at, last_seen_at, active)
      VALUES ($1, $2, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP, true)
      ON CONFLICT (store_id, brand_id)
      DO UPDATE SET
        last_seen_at = CURRENT_TIMESTAMP,
        active = true,
        updated_at = CURRENT_TIMESTAMP
      `,
      [storeId, brandId]
    );
  }
}

/**
 * Scrapes the `/brands` page under a store's `dutchie_url` and saves the
 * brands it finds.
 *
 * Steps: look up the store, open the brands page in a stealth browser
 * (through a random proxy when one is available), wait out a Cloudflare
 * challenge if one is shown, collect brand names/logos from the brand links,
 * then upsert them. When no brands are found, a screenshot and the page HTML
 * are written to `/tmp` for debugging.
 *
 * Errors are logged rather than rethrown. The browser is closed and the
 * database pool is ended exactly once on every exit path, so this function
 * is intended to be the last user of `pool` in the process.
 *
 * @param storeId - Primary key of the row in `stores` to scrape.
 */
async function scrapeBrands(storeId: number): Promise<void> {
  const NAVIGATION_TIMEOUT_MS = 60000;
  const CLOUDFLARE_TIMEOUT_MS = 60000;
  const SETTLE_DELAY_MS = 3000;
  const LOGO_PREVIEW_LENGTH = 80;

  console.log(`🏷️ Scraping brands for store ID: ${storeId}\n`);

  try {
    // Get store info
    const storeResult = await pool.query(
      `
      SELECT id, name, dutchie_url
      FROM stores
      WHERE id = $1
      `,
      [storeId]
    );

    if (storeResult.rows.length === 0) {
      throw new Error('Store not found');
    }

    const store = storeResult.rows[0];
    console.log(`Store: ${store.name}`);

    // Without a base URL the template below would produce "null/brands".
    if (typeof store.dutchie_url !== 'string' || store.dutchie_url.trim() === '') {
      throw new Error('Store has no dutchie_url');
    }

    // Build brands page URL; strip trailing slashes so we never emit "//brands".
    const baseUrl = store.dutchie_url.trim().replace(/\/+$/, '');
    const brandsUrl = `${baseUrl}/brands`;
    console.log(`Brands URL: ${brandsUrl}\n`);

    // Get proxy
    const proxy = await getRandomProxy();
    if (proxy) {
      console.log(`🔍 Using proxy: ${proxy.server}\n`);
    }

    const browser = await createStealthBrowser({ proxy: proxy || undefined, headless: true });

    try {
      const context = await createStealthContext(browser, { state: 'Arizona' });
      const page = await context.newPage();

      console.log('🌐 Loading brands page...');
      await page.goto(brandsUrl, { waitUntil: 'domcontentloaded', timeout: NAVIGATION_TIMEOUT_MS });

      // Check for Cloudflare
      if (await isCloudflareChallenge(page)) {
        console.log('🛡️ Cloudflare detected, waiting...');
        const passed = await waitForCloudflareChallenge(page, CLOUDFLARE_TIMEOUT_MS);
        if (!passed) {
          console.log('❌ Failed to pass Cloudflare');
          // Cleanup happens in the finally blocks; ending the pool here as
          // well would make the outer finally end it a second time.
          return;
        }
      }

      await waitForPageLoad(page);
      await page.waitForTimeout(SETTLE_DELAY_MS);

      console.log('\n📦 Extracting brands...\n');

      // Extract brands. This callback is evaluated inside the page, so it must
      // be self-contained and cannot reference anything from this module.
      const brands: Brand[] = await page.evaluate(() => {
        // Keyed by name: a page can contain several links to the same brand.
        const brandsByName = new Map<string, Brand>();

        // Look for brand cards/links
        const brandLinks = document.querySelectorAll('a[href*="/brands/"], a[href*="/brand/"]');

        brandLinks.forEach((link) => {
          const img = link.querySelector('img');
          const logoUrl = img ? img.getAttribute('src') || '' : '';

          // Try to get brand name: heading text, then image alt, then the
          // first line of the link's own text.
          let name = '';
          const heading = link.querySelector('h1, h2, h3, h4, h5, h6');
          if (heading?.textContent) {
            name = heading.textContent.trim();
          } else if (img?.alt) {
            name = img.alt.trim();
          } else {
            const text = link.textContent?.trim() || '';
            name = (text.split('\n')[0] ?? '').trim();
          }

          if (name.length <= 1 || name.length >= 100) {
            return;
          }

          const existing = brandsByName.get(name);
          if (!existing) {
            brandsByName.set(name, { name, logo_url: logoUrl || undefined });
          } else if (logoUrl) {
            // A later link with a logo supersedes an earlier one.
            existing.logo_url = logoUrl;
          }
        });

        return Array.from(brandsByName.values());
      });

      console.log(`✅ Found ${brands.length} brands!\n`);

      if (brands.length > 0) {
        console.log('Brands found:');
        brands.forEach((brand, i) => {
          console.log(`\n${i + 1}. ${brand.name}`);
          if (brand.logo_url) {
            // Only mark the preview as truncated when it actually was.
            const preview =
              brand.logo_url.length > LOGO_PREVIEW_LENGTH
                ? `${brand.logo_url.substring(0, LOGO_PREVIEW_LENGTH)}...`
                : brand.logo_url;
            console.log(` Logo: ${preview}`);
          }
        });

        // Save brands to database with timestamps
        console.log(`\n💾 Saving brands to database...`);
        await persistStoreBrands(storeId, brands);
        console.log(`✅ Saved ${brands.length} brands with timestamps!`);
      } else {
        console.log('⚠️ No brands found - page structure may be different');

        // Save page for debugging
        console.log('\n📸 Saving screenshot and HTML for debugging...');
        await page.screenshot({ path: '/tmp/brands-page.png', fullPage: true });
        const html = await page.content();
        const fs = await import('fs/promises');
        await fs.writeFile('/tmp/brands-page.html', html);
        console.log('Saved to /tmp/brands-page.png and /tmp/brands-page.html');
      }
    } finally {
      // Single cleanup point for the browser: runs on success, early return, and error.
      await browser.close();
    }
  } catch (error: unknown) {
    console.error('❌ Error:', error);
  } finally {
    await pool.end();
  }
}
|
|
|
|
// Scrape Sol Flower Deer Valley (ID 23).
// scrapeBrands returns a promise; attach a rejection handler so a failure that
// escapes it is logged instead of surfacing as an unhandled rejection.
scrapeBrands(23).catch((error: unknown) => {
  console.error('❌ Error:', error);
});