148 lines
4.4 KiB
TypeScript
148 lines
4.4 KiB
TypeScript
import puppeteer from 'puppeteer-extra';
|
||
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||
import { Pool } from 'pg';
|
||
|
||
// Register the stealth plugin before any browser is launched so that every
// page created by this script gets its evasions applied.
puppeteer.use(StealthPlugin());

// Shared PostgreSQL connection pool for the proxy lookup and the brand inserts.
// The connection string is taken from DATABASE_URL when set, so credentials do
// not have to live in source control; the original local development URL is
// kept as the fallback to stay backward-compatible.
const pool = new Pool({
  connectionString:
    process.env.DATABASE_URL ??
    'postgresql://sail:password@localhost:5432/dutchie_menus'
});
|
||
|
||
/** Returns a printable message for any thrown value, not just `Error` instances. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/** Resolves after `ms` milliseconds. */
function delay(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Scrapes brand names from the Curaleaf Phoenix Airport brands page and stores
 * one placeholder product row per brand.
 *
 * Flow:
 *  - picks one random row from the `proxies` table,
 *  - launches a headless browser routed through that proxy,
 *  - sets a Googlebot user agent and overrides timezone, geolocation and
 *    `navigator.webdriver` before any page script runs,
 *  - collects short text snippets from elements matching brand-like selectors,
 *  - inserts each snippet into `products` (duplicates are skipped by the
 *    `ON CONFLICT ... DO NOTHING` clause).
 *
 * Errors raised inside the flow are logged, not rethrown. The browser (if it
 * was launched) and the connection pool are released in every case.
 */
async function main(): Promise<void> {
  // Assigned once the launch succeeds; stays undefined if we fail earlier.
  let browser;

  try {
    console.log('STEP 2: Getting random proxy from pool...');
    const proxyResult = await pool.query(`
      SELECT host, port, protocol FROM proxies
      ORDER BY RANDOM() LIMIT 1
    `);

    const proxy = proxyResult.rows[0];
    // An empty proxies table would otherwise surface as an opaque
    // "cannot read properties of undefined" error on the next line.
    if (!proxy) {
      throw new Error('No proxies available in the proxies table');
    }
    console.log(`✅ Selected proxy: ${proxy.host}:${proxy.port}\n`);

    console.log('STEP 3: Launching browser with proxy + anti-fingerprint...');
    const proxyUrl = `${proxy.protocol}://${proxy.host}:${proxy.port}`;

    browser = await puppeteer.launch({
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        `--proxy-server=${proxyUrl}`,
        '--disable-blink-features=AutomationControlled'
      ]
    });

    const page = await browser.newPage();

    // Set Googlebot user-agent
    await page.setUserAgent('Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');
    console.log('✅ Set UA to Googlebot\n');

    // Anti-fingerprint: spoof timezone, geolocation, remove webdriver.
    // This callback is serialized and executed inside the page, so it must not
    // reference anything from the surrounding Node.js scope.
    await page.evaluateOnNewDocument(() => {
      // Timezone (Arizona). Delegate to the native implementation and override
      // only `timeZone`, so the remaining fields (locale, calendar, ...) keep
      // their real values instead of disappearing from the result.
      const nativeResolvedOptions = Intl.DateTimeFormat.prototype.resolvedOptions;
      Object.defineProperty(Intl.DateTimeFormat.prototype, 'resolvedOptions', {
        value: function (this: Intl.DateTimeFormat) {
          const options = nativeResolvedOptions.call(this);
          options.timeZone = 'America/Phoenix';
          return options;
        }
      });

      // Geolocation (Phoenix): always report fixed coordinates after 100 ms.
      Object.defineProperty(navigator, 'geolocation', {
        get: () => ({
          getCurrentPosition: (success: (position: unknown) => void) => {
            setTimeout(() => success({
              coords: { latitude: 33.4484, longitude: -112.0740, accuracy: 100 }
            }), 100);
          }
        })
      });

      // Remove webdriver
      Object.defineProperty(navigator, 'webdriver', { get: () => false });
    });
    console.log('✅ Fingerprint spoofed (timezone=Arizona, geo=Phoenix, webdriver=hidden)\n');

    console.log('STEP 4: Navigating to Curaleaf Phoenix Airport brands page...');
    const url = 'https://curaleaf.com/stores/curaleaf-dispensary-phoenix-airport/brands';
    console.log(`URL: ${url}\n`);

    await page.goto(url, { waitUntil: 'networkidle2', timeout: 60000 });
    // Extra settle time for client-side rendering. `page.waitForTimeout` is
    // deprecated and removed in recent Puppeteer releases, so use a plain timer.
    await delay(5000);

    console.log('STEP 5: Scraping brand data from page...');

    // Get page info for debugging
    const pageInfo = await page.evaluate(() => ({
      title: document.title,
      url: window.location.href,
      bodyLength: document.body.innerHTML.length
    }));

    console.log(`Page title: "${pageInfo.title}"`);
    console.log(`Current URL: ${pageInfo.url}`);
    console.log(`Body HTML length: ${pageInfo.bodyLength} chars\n`);

    // Scrape brands: gather the trimmed text of every element matching any of
    // the candidate selectors, de-duplicated through a Set.
    const brands = await page.evaluate(() => {
      // Try multiple selectors
      const selectors = [
        '[data-testid*="brand"]',
        '[class*="Brand"]',
        '[class*="brand"]',
        'a[href*="/brand/"]',
        '.brand-card',
        '.brand-item'
      ];

      const found = new Set<string>();

      selectors.forEach(selector => {
        document.querySelectorAll(selector).forEach(el => {
          const text = el.textContent?.trim();
          // Keep only non-empty text shorter than 50 characters.
          if (text && text.length > 0 && text.length < 50) {
            found.add(text);
          }
        });
      });

      return Array.from(found);
    });

    console.log(`✅ Found ${brands.length} brands:\n`);
    brands.forEach((b, i) => console.log(` ${i + 1}. ${b}`));

    if (brands.length === 0) {
      console.log('\n⚠️ No brands found. Possible reasons:');
      console.log(' - IP/proxy is blocked');
      console.log(' - Page requires different selectors');
      console.log(' - Brands load asynchronously');
      return;
    }

    console.log('\n\nSTEP 6: Saving brands to database...');

    let saved = 0;
    for (const brand of brands) {
      try {
        const result = await pool.query(`
          INSERT INTO products (store_id, name, brand, dutchie_url, in_stock)
          VALUES (1, $1, $2, $3, true)
          ON CONFLICT (store_id, name, brand) DO NOTHING
        `, [`${brand} Product`, brand, url]);
        // With ON CONFLICT DO NOTHING a skipped duplicate reports zero
        // affected rows, so count only rows that were actually inserted.
        if ((result.rowCount ?? 0) > 0) {
          saved++;
        }
      } catch (e: unknown) {
        // Best effort: one failing insert must not abort the remaining brands,
        // but the failure should still be visible.
        console.error(`❌ Failed to save brand "${brand}":`, describeError(e));
      }
    }

    console.log(`✅ Saved ${saved} brands to database\n`);

  } catch (error: unknown) {
    console.error('❌ ERROR:', describeError(error));
  } finally {
    if (browser) {
      // A failing close must not prevent the pool from being shut down.
      try {
        await browser.close();
      } catch (closeError: unknown) {
        console.error('❌ ERROR closing browser:', describeError(closeError));
      }
    }
    await pool.end();
  }
}
|
||
|
||
// Script entry point. `main` logs its own errors, but its cleanup (closing the
// browser or the pool) can still reject; handle that explicitly instead of
// leaving a floating promise, and signal the failure through the exit code.
main().catch((error: unknown) => {
  console.error('❌ FATAL:', error instanceof Error ? error.message : String(error));
  process.exitCode = 1;
});
|