Files
cannaiq/backend/scrape-48th-brands.ts
2025-11-28 19:45:44 -07:00

148 lines
4.4 KiB
TypeScript
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import { Pool } from 'pg';
// Register the stealth plugin with puppeteer-extra before any browser is launched.
puppeteer.use(StealthPlugin());

// Shared PostgreSQL connection pool used for both the proxy lookup and the
// product inserts. DATABASE_URL, when set, overrides the local-development
// default so real credentials never need to be committed to the repository.
const pool = new Pool({
  connectionString:
    process.env.DATABASE_URL ?? 'postgresql://sail:password@localhost:5432/dutchie_menus'
});
/**
 * Scrapes brand names from the Curaleaf Phoenix Airport brands page and stores
 * one placeholder product row per brand in the `products` table.
 *
 * Flow: pick a random proxy from the `proxies` table, launch a headless
 * browser through it with a Googlebot user-agent and spoofed
 * timezone/geolocation/webdriver values, load the brands page, collect short
 * text snippets from brand-looking elements, then insert them.
 *
 * Errors are logged rather than rethrown; the browser and the connection pool
 * are always closed before the function resolves.
 */
async function main(): Promise<void> {
  let browser: Awaited<ReturnType<typeof puppeteer.launch>> | undefined;

  try {
    console.log('STEP 2: Getting random proxy from pool...');
    const proxyResult = await pool.query(`
      SELECT host, port, protocol FROM proxies
      ORDER BY RANDOM() LIMIT 1
    `);
    const proxy = proxyResult.rows[0];
    // An empty proxies table would otherwise surface as an opaque
    // "Cannot read properties of undefined" error on the next line.
    if (!proxy) {
      throw new Error('No proxies available in the proxies table');
    }
    console.log(`✅ Selected proxy: ${proxy.host}:${proxy.port}\n`);

    console.log('STEP 3: Launching browser with proxy + anti-fingerprint...');
    const proxyUrl = `${proxy.protocol}://${proxy.host}:${proxy.port}`;
    browser = await puppeteer.launch({
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        `--proxy-server=${proxyUrl}`,
        '--disable-blink-features=AutomationControlled'
      ]
    });
    const page = await browser.newPage();

    // Set Googlebot user-agent
    await page.setUserAgent('Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');
    console.log('✅ Set UA to Googlebot\n');

    // Anti-fingerprint: spoof timezone, geolocation, remove webdriver.
    // This callback is executed inside the page, before any page script runs.
    await page.evaluateOnNewDocument(() => {
      // Timezone (Arizona). Wrap the native implementation so locale, calendar,
      // numberingSystem, etc. are still reported; only timeZone is replaced.
      const originalResolvedOptions = Intl.DateTimeFormat.prototype.resolvedOptions;
      Object.defineProperty(Intl.DateTimeFormat.prototype, 'resolvedOptions', {
        value: function (this: Intl.DateTimeFormat) {
          return { ...originalResolvedOptions.call(this), timeZone: 'America/Phoenix' };
        }
      });

      // Geolocation (Phoenix)
      Object.defineProperty(navigator, 'geolocation', {
        get: () => ({
          getCurrentPosition: (success: any) => {
            setTimeout(() => success({
              coords: { latitude: 33.4484, longitude: -112.0740, accuracy: 100 }
            }), 100);
          }
        })
      });

      // Remove webdriver
      Object.defineProperty(navigator, 'webdriver', { get: () => false });
    });
    console.log('✅ Fingerprint spoofed (timezone=Arizona, geo=Phoenix, webdriver=hidden)\n');

    console.log('STEP 4: Navigating to Curaleaf Phoenix Airport brands page...');
    const url = 'https://curaleaf.com/stores/curaleaf-dispensary-phoenix-airport/brands';
    console.log(`URL: ${url}\n`);
    await page.goto(url, { waitUntil: 'networkidle2', timeout: 60000 });
    // Fixed 5s grace period after network idle. page.waitForTimeout() is
    // deprecated/removed in current Puppeteer releases, so use a plain timer.
    await new Promise<void>((resolve) => setTimeout(resolve, 5000));

    console.log('STEP 5: Scraping brand data from page...');
    // Get page info for debugging
    const pageInfo = await page.evaluate(() => ({
      title: document.title,
      url: window.location.href,
      bodyLength: document.body.innerHTML.length
    }));
    console.log(`Page title: "${pageInfo.title}"`);
    console.log(`Current URL: ${pageInfo.url}`);
    console.log(`Body HTML length: ${pageInfo.bodyLength} chars\n`);

    // Scrape brands: union of the trimmed text of every element matched by any
    // candidate selector, de-duplicated, keeping only non-empty text under 50 chars.
    const brands = await page.evaluate(() => {
      // Try multiple selectors
      const selectors = [
        '[data-testid*="brand"]',
        '[class*="Brand"]',
        '[class*="brand"]',
        'a[href*="/brand/"]',
        '.brand-card',
        '.brand-item'
      ];
      const found = new Set<string>();
      selectors.forEach(selector => {
        document.querySelectorAll(selector).forEach(el => {
          const text = el.textContent?.trim();
          if (text && text.length < 50) {
            found.add(text);
          }
        });
      });
      return Array.from(found);
    });

    console.log(`✅ Found ${brands.length} brands:\n`);
    brands.forEach((b, i) => console.log(` ${i + 1}. ${b}`));

    if (brands.length === 0) {
      console.log('\n⚠ No brands found. Possible reasons:');
      console.log(' - IP/proxy is blocked');
      console.log(' - Page requires different selectors');
      console.log(' - Brands load asynchronously');
      return;
    }

    console.log('\n\nSTEP 6: Saving brands to database...');
    let saved = 0;
    let failed = 0;
    for (const brand of brands) {
      try {
        const result = await pool.query(`
          INSERT INTO products (store_id, name, brand, dutchie_url, in_stock)
          VALUES (1, $1, $2, $3, true)
          ON CONFLICT (store_id, name, brand) DO NOTHING
        `, [`${brand} Product`, brand, url]);
        // rowCount is 0 when ON CONFLICT skipped the row, so only rows that
        // were actually inserted are counted as saved.
        saved += result.rowCount ?? 0;
      } catch (e: unknown) {
        // Keep going on a per-brand failure, but do not hide it.
        failed++;
        console.error(`   Failed to save brand "${brand}":`, e instanceof Error ? e.message : e);
      }
    }
    console.log(`✅ Saved ${saved} brands to database\n`);
    if (failed > 0) {
      console.log(`⚠ ${failed} brand insert(s) failed; see errors above\n`);
    }
  } catch (error: unknown) {
    console.error('❌ ERROR:', error instanceof Error ? error.message : error);
  } finally {
    // Close the pool even if closing the browser throws.
    try {
      if (browser) await browser.close();
    } finally {
      await pool.end();
    }
  }
}
// Script entry point. main() logs its own scraping errors, but its cleanup
// (closing the browser / pool) can still reject; catch that here so it is
// reported and reflected in the exit code instead of becoming an unhandled rejection.
main().catch((err: unknown) => {
  console.error('❌ FATAL:', err instanceof Error ? err.message : err);
  process.exitCode = 1;
});