- Add backend stale process monitoring API (/api/stale-processes) - Add users management route - Add frontend landing page and stale process monitor UI on /scraper-tools - Move old development scripts to backend/archive/ - Update frontend build with new features 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
216 lines
6.2 KiB
TypeScript
216 lines
6.2 KiB
TypeScript
import puppeteer from 'puppeteer-extra';
|
|
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
|
import { Pool } from 'pg';
|
|
|
|
// Register the stealth plugin with puppeteer-extra before any browser is launched.
puppeteer.use(StealthPlugin());

// Shared PostgreSQL pool for this script.
// The connection string is taken from DATABASE_URL when it is set, so credentials
// do not have to live in source control; the local development database remains
// the fallback so existing invocations keep working unchanged.
const pool = new Pool({
  connectionString:
    process.env.DATABASE_URL ??
    'postgresql://sail:password@localhost:5432/dutchie_menus',
});
|
|
|
|
/**
 * Scrapes brand names from the Curaleaf 48th Street store page and saves them.
 *
 * Steps:
 *  1. Reads one row from the `proxies` table (if any) and routes the browser
 *     through it, authenticating when the row carries a username.
 *  2. Opens the store page with a Googlebot user agent, an `age_verified`
 *     cookie and a `navigator.webdriver` override.
 *  3. Collects the trimmed text of every element whose class or `data-testid`
 *     contains "brand" (texts of 100+ characters are ignored).
 *  4. If nothing was found, prints a short dump of the page for debugging.
 *     Otherwise updates the store row (URL + slug) and inserts each brand,
 *     skipping rows that already exist.
 *
 * Scrape and database errors are logged, not rethrown. The browser and the
 * module-level `pool` are always closed on exit, so this function can only be
 * run once per process.
 *
 * @returns A promise that resolves once the browser and the pool are closed.
 */
async function scrapeBrands(): Promise<void> {
  const storeUrl = 'https://curaleaf.com/stores/curaleaf-dispensary-48th-street';
  const storeSlug = 'curaleaf-az-48th-street';
  const previousStoreSlug = 'curaleaf-az-48th-street-med';
  const divider = '─'.repeat(80);

  let browser;

  try {
    console.log('\n🔍 Scraping brands from Curaleaf 48th Street...\n');

    // Get proxy (the table may be empty, in which case we connect directly).
    const proxyResult = await pool.query(`
      SELECT host, port, protocol, username, password
      FROM proxies
      LIMIT 1
    `);
    const proxy = proxyResult.rows[0];

    const browserArgs = [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-blink-features=AutomationControlled',
    ];

    if (proxy) {
      const proxyUrl = `${proxy.protocol}://${proxy.host}:${proxy.port}`;
      browserArgs.push(`--proxy-server=${proxyUrl}`);
      console.log(`Using proxy: ${proxy.host}:${proxy.port}`);
    }

    browser = await puppeteer.launch({
      headless: true,
      args: browserArgs,
    });

    const page = await browser.newPage();

    // --proxy-server carries no credentials, so an authenticated proxy has to
    // be answered at page level before the first request is made.
    if (proxy?.username) {
      await page.authenticate({
        username: proxy.username,
        password: proxy.password ?? '',
      });
    }

    // Set Googlebot user agent
    await page.setUserAgent(
      'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
    );

    // Set age verification cookie
    await page.setCookie({
      name: 'age_verified',
      value: 'true',
      domain: '.curaleaf.com',
      path: '/',
    });

    // Anti-detection: report navigator.webdriver as false on every new document.
    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, 'webdriver', {
        get: () => false,
      });
    });

    console.log(`Navigating to: ${storeUrl}`);
    console.log(divider);

    await page.goto(storeUrl, { waitUntil: 'networkidle2', timeout: 60000 });

    console.log(`Current URL after navigation: ${page.url()}\n`);
    console.log('Page loaded, waiting for content to render...\n');

    // Wait a bit for JavaScript to render content. A plain timer is used
    // because page.waitForTimeout() is not available in current Puppeteer.
    await new Promise<void>((resolve) => setTimeout(resolve, 3000));

    // Check for iframes (logged only, to help diagnose embedded menus).
    const iframeInfo: { src: string; id: string; className: string }[] =
      await page.evaluate(() =>
        Array.from(document.querySelectorAll('iframe')).map((iframe) => ({
          src: iframe.src,
          id: iframe.id,
          className: iframe.className,
        }))
      );

    if (iframeInfo.length > 0) {
      console.log('Found iframes:');
      iframeInfo.forEach((iframe, i) => {
        console.log(`  ${i + 1}. ${iframe.src}`);
      });
      console.log('');
    }

    // Extract brands from the page. The callback runs inside the browser, so
    // it must be self-contained and cannot reference anything from this scope.
    const brands: string[] = await page.evaluate(() => {
      const brandSet = new Set<string>();

      // Keep non-empty texts shorter than 100 characters.
      const collect = (el: Element | null): void => {
        const text = el?.textContent?.trim();
        if (text && text.length < 100) {
          brandSet.add(text);
        }
      };

      // Try multiple selectors for brands
      const selectors = [
        '[class*="brand"]',
        '[class*="Brand"]',
        '[data-testid*="brand"]',
        '[class*="product"] [class*="brand"]',
        '[class*="Product"] [class*="Brand"]',
      ];

      for (const selector of selectors) {
        document.querySelectorAll(selector).forEach((el) => collect(el));
      }

      // Also look for product cards and extract any brand info
      document
        .querySelectorAll('[class*="product"], [class*="Product"], [class*="card"]')
        .forEach((card) => {
          collect(card.querySelector('[class*="brand"], [class*="Brand"]'));
        });

      return Array.from(brandSet);
    });

    console.log('BRANDS FOUND:');
    console.log(divider);

    if (brands.length === 0) {
      console.log('No brands found!');
      console.log('\nDumping page structure for debugging...\n');

      const pageInfo = await page.evaluate(() => ({
        title: document.title,
        productElements: document.querySelectorAll(
          '[class*="product"], [class*="Product"]'
        ).length,
        hasIframe: document.querySelectorAll('iframe').length,
        allText: document.body.innerText?.substring(0, 500),
      }));

      console.log('Page Title:', pageInfo.title);
      console.log('Product Elements:', pageInfo.productElements);
      console.log('Iframes:', pageInfo.hasIframe);
      console.log('\nFirst 500 chars of visible text:');
      console.log(pageInfo.allText);
      return;
    }

    // Sort a copy so the scraped array is not mutated; the sorted order is
    // used both for the listing and for the inserts below.
    const sortedBrands = [...brands].sort();

    sortedBrands.forEach((brand, i) => {
      console.log(`${i + 1}. ${brand}`);
    });
    console.log(divider);
    console.log(`Total unique brands: ${sortedBrands.length}\n`);

    // Update store URL in database
    console.log('Updating store URL in database...');
    await pool.query(
      `UPDATE stores
       SET dutchie_url = $1,
           slug = $2,
           updated_at = NOW()
       WHERE slug = $3`,
      [
        storeUrl,
        storeSlug, // Update to match the local URL pattern
        previousStoreSlug,
      ]
    );
    console.log('✓ Store URL updated\n');

    // Insert brands into database. The store is looked up by its new slug so
    // this also works when the rename above already happened on an earlier run.
    console.log('Inserting brands into database...');
    const storeResult = await pool.query(
      'SELECT id FROM stores WHERE slug = $1',
      [storeSlug]
    );

    if (storeResult.rows.length === 0) {
      console.warn(`No store found with slug "${storeSlug}"; brands were not saved\n`);
      return;
    }

    const storeId = storeResult.rows[0].id;
    let insertedCount = 0;

    for (const brandName of sortedBrands) {
      const insertResult = await pool.query(
        `INSERT INTO brands (store_id, name, created_at, updated_at)
         VALUES ($1, $2, NOW(), NOW())
         ON CONFLICT (store_id, name) DO NOTHING`,
        [storeId, brandName]
      );
      // rowCount is 0 when the conflict clause skipped the row.
      insertedCount += insertResult.rowCount ?? 0;
    }

    console.log(
      `✓ Inserted ${insertedCount} new brands (${sortedBrands.length - insertedCount} already present)\n`
    );
  } catch (error: unknown) {
    if (error instanceof Error) {
      console.error('Error:', error.message);
      if (error.stack) {
        console.error(error.stack);
      }
    } else {
      console.error('Error:', error);
    }
  } finally {
    // Nested so the pool is still closed when closing the browser fails.
    try {
      if (browser) {
        await browser.close();
      }
    } finally {
      await pool.end();
    }
  }
}
|
|
|
|
// Script entry point. scrapeBrands() logs scrape errors itself, so this handler
// only sees what escapes it (for example a failure while closing the browser or
// the database pool). Report it and exit non-zero instead of leaving an
// unhandled promise rejection.
scrapeBrands().catch((error: unknown) => {
  console.error('Fatal error:', error);
  process.exitCode = 1;
});
|