Files
cannaiq/backend/archive/scrape-brands-48th.ts
Kelly d91c55a344 feat: Add stale process monitor, users route, landing page, archive old scripts
- Add backend stale process monitoring API (/api/stale-processes)
- Add users management route
- Add frontend landing page and stale process monitor UI on /scraper-tools
- Move old development scripts to backend/archive/
- Update frontend build with new features

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-05 04:07:31 -07:00

216 lines
6.2 KiB
TypeScript

import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import { Pool } from 'pg';
// Register the stealth plugin on the puppeteer-extra instance before any browser is launched.
const stealthPlugin = StealthPlugin();
puppeteer.use(stealthPlugin);

// Shared Postgres connection pool (localhost, database "dutchie_menus") used by the whole script.
const connectionString = 'postgresql://sail:password@localhost:5432/dutchie_menus';
const pool = new Pool({ connectionString });
/**
 * Scrapes brand names from the Curaleaf 48th Street store page and stores them
 * in the database.
 *
 * Steps:
 *  1. Reads one row from the `proxies` table and, if present, routes the
 *     browser through it (authenticating when the row carries a username).
 *  2. Launches headless Chromium, sets a Googlebot user agent and an
 *     `age_verified` cookie, and navigates to the store URL.
 *  3. Collects the trimmed text of every element whose class or data-testid
 *     contains "brand" (text shorter than 100 characters), de-duplicated.
 *  4. If nothing was found, prints a short debug dump of the page. Otherwise
 *     prints the sorted brand list, updates the store row's URL/slug, and
 *     inserts each brand for that store (existing rows are left untouched).
 *
 * Errors raised inside the main body are logged, not rethrown. The browser and
 * the connection pool are always released; an error thrown by that cleanup
 * itself propagates to the caller.
 *
 * @returns A promise that resolves once scraping and cleanup have finished.
 */
async function scrapeBrands(): Promise<void> {
  let browser: Awaited<ReturnType<typeof puppeteer.launch>> | undefined;

  try {
    console.log('\n🔍 Scraping brands from Curaleaf 48th Street...\n');

    // Get proxy
    const proxyResult = await pool.query(`
      SELECT host, port, protocol, username, password
      FROM proxies
      LIMIT 1
    `);
    const proxy = proxyResult.rows[0];

    const browserArgs = [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-blink-features=AutomationControlled'
    ];
    if (proxy) {
      const proxyUrl = `${proxy.protocol}://${proxy.host}:${proxy.port}`;
      browserArgs.push(`--proxy-server=${proxyUrl}`);
      console.log(`Using proxy: ${proxy.host}:${proxy.port}`);
    }

    browser = await puppeteer.launch({
      headless: true,
      args: browserArgs
    });
    const page = await browser.newPage();

    // The --proxy-server argument above only carries protocol/host/port, so the
    // credentials selected from the proxies table are supplied here instead.
    if (proxy?.username) {
      await page.authenticate({
        username: proxy.username,
        password: proxy.password ?? ''
      });
    }

    // Set Googlebot user agent
    await page.setUserAgent('Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');

    // Set age verification cookie
    await page.setCookie({
      name: 'age_verified',
      value: 'true',
      domain: '.curaleaf.com',
      path: '/'
    });

    // Anti-detection: make navigator.webdriver report false on every new document
    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, 'webdriver', {
        get: () => false,
      });
    });

    const url = 'https://curaleaf.com/stores/curaleaf-dispensary-48th-street';
    console.log(`Navigating to: ${url}`);
    console.log('─'.repeat(80));
    await page.goto(url, { waitUntil: 'networkidle2', timeout: 60000 });

    const currentUrl = page.url();
    console.log(`Current URL after navigation: ${currentUrl}\n`);
    console.log('Page loaded, waiting for content to render...\n');

    // Wait a bit for JavaScript to render content. A plain timer is used because
    // page.waitForTimeout is deprecated and no longer exists in newer Puppeteer.
    await new Promise<void>((resolve) => setTimeout(resolve, 3000));

    // Check for iframes
    const iframeInfo = await page.evaluate(() =>
      Array.from(document.querySelectorAll('iframe')).map((iframe) => ({
        src: iframe.src,
        id: iframe.id,
        className: iframe.className
      }))
    );
    if (iframeInfo.length > 0) {
      console.log('Found iframes:');
      iframeInfo.forEach((iframe, i) => {
        console.log(`  ${i + 1}. ${iframe.src}`);
      });
      console.log('');
    }

    // Extract brands from the page. This callback runs in the browser context,
    // so it must be self-contained and cannot reference outer variables.
    const brands = await page.evaluate(() => {
      const brandSet = new Set<string>();
      // One selector list is enough: descendant variants such as
      // '[class*="product"] [class*="brand"]' and a per-product-card lookup can
      // only match elements that these three selectors already match.
      const selector = '[class*="brand"], [class*="Brand"], [data-testid*="brand"]';
      document.querySelectorAll(selector).forEach((el) => {
        const text = el.textContent?.trim();
        // Skip empty text and long blobs that are unlikely to be a brand name.
        if (text && text.length < 100) {
          brandSet.add(text);
        }
      });
      return Array.from(brandSet);
    });

    console.log('BRANDS FOUND:');
    console.log('─'.repeat(80));

    if (brands.length === 0) {
      console.log('No brands found!');
      console.log('\nDumping page structure for debugging...\n');
      const pageInfo = await page.evaluate(() => ({
        title: document.title,
        productElements: document.querySelectorAll('[class*="product"], [class*="Product"]').length,
        hasIframe: document.querySelectorAll('iframe').length,
        allText: document.body.innerText?.substring(0, 500)
      }));
      console.log('Page Title:', pageInfo.title);
      console.log('Product Elements:', pageInfo.productElements);
      console.log('Iframes:', pageInfo.hasIframe);
      console.log('\nFirst 500 chars of visible text:');
      console.log(pageInfo.allText);
      return;
    }

    // Sort a copy so the array returned by the page is not mutated in place.
    const sortedBrands = [...brands].sort();
    sortedBrands.forEach((brand, i) => {
      console.log(`${i + 1}. ${brand}`);
    });
    console.log('─'.repeat(80));
    console.log(`Total unique brands: ${sortedBrands.length}\n`);

    const storeSlug = 'curaleaf-az-48th-street'; // Update to match the local URL pattern
    const previousStoreSlug = 'curaleaf-az-48th-street-med';

    // Update store URL in database
    console.log('Updating store URL in database...');
    await pool.query(
      `UPDATE stores
       SET dutchie_url = $1,
           slug = $2,
           updated_at = NOW()
       WHERE slug = $3`,
      [url, storeSlug, previousStoreSlug]
    );
    console.log('✓ Store URL updated\n');

    // Insert brands into database
    console.log('Inserting brands into database...');
    const storeResult = await pool.query(
      'SELECT id FROM stores WHERE slug = $1',
      [storeSlug]
    );
    if (storeResult.rows.length === 0) {
      // Previously this case was skipped silently, which hid a missing store row.
      console.warn(`Store with slug "${storeSlug}" not found; no brands were inserted\n`);
      return;
    }

    const storeId = storeResult.rows[0].id;
    let insertedCount = 0;
    for (const brandName of sortedBrands) {
      const insertResult = await pool.query(
        `INSERT INTO brands (store_id, name, created_at, updated_at)
         VALUES ($1, $2, NOW(), NOW())
         ON CONFLICT (store_id, name) DO NOTHING`,
        [storeId, brandName]
      );
      // rowCount is 0 when ON CONFLICT skipped the row, so only real inserts are counted.
      insertedCount += insertResult.rowCount ?? 0;
    }
    const skippedCount = sortedBrands.length - insertedCount;
    console.log(`✓ Inserted ${insertedCount} brands (${skippedCount} already present)\n`);
  } catch (error: unknown) {
    if (error instanceof Error) {
      console.error('Error:', error.message);
      if (error.stack) {
        console.error(error.stack);
      }
    } else {
      console.error('Error:', error);
    }
  } finally {
    // Nested try/finally so the pool is closed even if closing the browser throws.
    try {
      if (browser) {
        await browser.close();
      }
    } finally {
      await pool.end();
    }
  }
}
// Script entry point. scrapeBrands logs errors from its own try block, but a
// failure during the cleanup in its finally block still rejects the returned
// promise; handle that here instead of leaving the promise floating.
scrapeBrands().catch((error: unknown) => {
  console.error('Fatal error:', error);
  process.exitCode = 1;
});