Files
cannaiq/backend/scrape-brands.ts
2025-11-28 19:45:44 -07:00

165 lines
5.3 KiB
TypeScript

import { createStealthBrowser, createStealthContext, waitForPageLoad, isCloudflareChallenge, waitForCloudflareChallenge } from './src/utils/stealthBrowser';
import { getRandomProxy } from './src/utils/proxyManager';
import { pool } from './src/db/migrate';
/**
 * A brand as scraped from a store's brands page.
 */
interface Brand {
  /** Display name of the brand, taken from a heading, image alt text or link text. */
  name: string;

  /** `src` of the brand's logo image; omitted when the link contains no usable image. */
  logo_url?: string;
}
/**
 * Scrapes the `/brands` page of one store and records the brands it finds.
 *
 * Flow:
 *  1. Load the store row (`id`, `name`, `dutchie_url`) from `stores`.
 *  2. Open `<dutchie_url>/brands` in a stealth browser, optionally through a proxy,
 *     waiting out a Cloudflare challenge if one is detected.
 *  3. Collect brand names / logo URLs from links pointing at `/brands/` or `/brand/`.
 *  4. Upsert every brand into `brands` and link it to the store in `store_brands`.
 *     When nothing is found, a screenshot and the page HTML are written to `/tmp`
 *     for debugging instead.
 *
 * Errors are logged and swallowed; the returned promise only rejects if ending the
 * pool itself fails. The shared `pool` is always ended before returning, so this is
 * intended to be called once per process.
 *
 * @param storeId - Primary key of the `stores` row to scrape.
 * @returns A promise that settles once scraping, persistence and cleanup are done.
 */
async function scrapeBrands(storeId: number): Promise<void> {
  console.log(`🏷️ Scraping brands for store ID: ${storeId}\n`);
  try {
    // Get store info
    const storeResult = await pool.query(`
      SELECT id, name, dutchie_url
      FROM stores
      WHERE id = $1
    `, [storeId]);
    const store = storeResult.rows[0];
    if (!store) {
      throw new Error('Store not found');
    }
    console.log(`Store: ${store.name}`);

    // Build brands page URL. Fail early on a missing base URL (it would otherwise
    // become "null/brands") and strip trailing slashes so we never request "//brands".
    if (!store.dutchie_url) {
      throw new Error(`Store ${storeId} has no dutchie_url`);
    }
    const baseUrl = String(store.dutchie_url).replace(/\/+$/, '');
    const brandsUrl = `${baseUrl}/brands`;
    console.log(`Brands URL: ${brandsUrl}\n`);

    // Get proxy
    const proxy = await getRandomProxy();
    if (proxy) {
      console.log(`🔍 Using proxy: ${proxy.server}\n`);
    }

    const browser = await createStealthBrowser({ proxy: proxy || undefined, headless: true });
    try {
      const context = await createStealthContext(browser, { state: 'Arizona' });
      const page = await context.newPage();

      console.log('🌐 Loading brands page...');
      await page.goto(brandsUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });

      // Check for Cloudflare
      if (await isCloudflareChallenge(page)) {
        console.log('🛡️ Cloudflare detected, waiting...');
        // NOTE(review): 60000 is presumably a timeout in milliseconds — confirm in stealthBrowser.
        const passed = await waitForCloudflareChallenge(page, 60000);
        if (!passed) {
          console.log('❌ Failed to pass Cloudflare');
          // Browser and pool are released by the `finally` blocks below; ending the
          // pool here as well would make the outer `finally` end it a second time.
          return;
        }
      }

      await waitForPageLoad(page);
      // Fixed extra delay after the load wait, kept from the original implementation.
      await page.waitForTimeout(3000);

      console.log('\n📦 Extracting brands...\n');

      // Extract brands. This callback is serialized and executed inside the page,
      // so it must not reference anything from the surrounding Node scope.
      const scraped: Brand[] = await page.evaluate(() => {
        const foundBrands: Brand[] = [];
        // Look for brand cards/links
        const brandLinks = document.querySelectorAll('a[href*="/brands/"], a[href*="/brand/"]');
        brandLinks.forEach(link => {
          const img = link.querySelector('img');
          const logoUrl = img ? img.getAttribute('src') || '' : '';

          // Name priority: heading inside the link, then image alt text, then the
          // first line of the link's own text.
          let name = '';
          const heading = link.querySelector('h1, h2, h3, h4, h5, h6');
          if (heading?.textContent) {
            name = heading.textContent.trim();
          } else if (img?.alt) {
            name = img.alt.trim();
          } else {
            const text = link.textContent?.trim() || '';
            name = (text.split('\n')[0] ?? '').trim();
          }

          // Skip empty/one-character labels and implausibly long text blobs.
          if (name && name.length > 1 && name.length < 100) {
            foundBrands.push({
              name,
              logo_url: logoUrl || undefined
            });
          }
        });
        return foundBrands;
      });

      // A brand is often matched by several anchors (e.g. logo link + name link).
      // Collapse duplicates by name, keeping first-seen order and the most recent
      // non-empty logo, which is what the sequential COALESCE upserts used to yield.
      const brandsByName = new Map<string, Brand>();
      for (const brand of scraped) {
        const logoUrl = brand.logo_url ?? brandsByName.get(brand.name)?.logo_url;
        brandsByName.set(brand.name, { name: brand.name, logo_url: logoUrl });
      }
      const brands = [...brandsByName.values()];

      console.log(`✅ Found ${brands.length} brands!\n`);

      if (brands.length > 0) {
        console.log('Brands found:');
        brands.forEach((brand, i) => {
          console.log(`\n${i + 1}. ${brand.name}`);
          if (brand.logo_url) console.log(` Logo: ${brand.logo_url.substring(0, 80)}...`);
        });

        // Save brands to database with timestamps
        console.log(`\n💾 Saving brands to database...`);
        for (const brand of brands) {
          // Insert or update brand; an existing logo is kept when none was scraped.
          const brandResult = await pool.query(`
            INSERT INTO brands (name, logo_url, first_seen_at, last_seen_at)
            VALUES ($1, $2, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
            ON CONFLICT (name)
            DO UPDATE SET
              logo_url = COALESCE($2, brands.logo_url),
              last_seen_at = CURRENT_TIMESTAMP,
              updated_at = CURRENT_TIMESTAMP
            RETURNING id
          `, [brand.name, brand.logo_url ?? null]);
          const brandId = brandResult.rows[0].id;

          // Link brand to store (re-activating an existing link)
          await pool.query(`
            INSERT INTO store_brands (store_id, brand_id, first_seen_at, last_seen_at, active)
            VALUES ($1, $2, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP, true)
            ON CONFLICT (store_id, brand_id)
            DO UPDATE SET
              last_seen_at = CURRENT_TIMESTAMP,
              active = true,
              updated_at = CURRENT_TIMESTAMP
          `, [storeId, brandId]);
        }
        console.log(`✅ Saved ${brands.length} brands with timestamps!`);
      } else {
        console.log('⚠️ No brands found - page structure may be different');
        // Save page for debugging
        console.log('\n📸 Saving screenshot and HTML for debugging...');
        await page.screenshot({ path: '/tmp/brands-page.png', fullPage: true });
        const html = await page.content();
        const fs = await import('fs/promises');
        await fs.writeFile('/tmp/brands-page.html', html);
        console.log('Saved to /tmp/brands-page.png and /tmp/brands-page.html');
      }
    } finally {
      // Single cleanup point for success, early return and failure. A failing close
      // is logged separately so it cannot mask the error that got us here.
      try {
        await browser.close();
      } catch (closeError: unknown) {
        console.error('❌ Failed to close browser:', closeError);
      }
    }
  } catch (error: unknown) {
    console.error('❌ Error:', error);
  } finally {
    // The pool is ended exactly once, here, regardless of how we exit.
    await pool.end();
  }
}
// Scrape Sol Flower Deer Valley (ID 23).
// scrapeBrands logs its own scraping errors, but cleanup in its `finally` can still
// reject; handle that here so it is reported instead of surfacing as an unhandled
// promise rejection.
scrapeBrands(23).catch((error: unknown) => {
  console.error('❌ Unhandled error while scraping brands:', error);
});