feat: Add stale process monitor, users route, landing page, archive old scripts
- Add backend stale process monitoring API (/api/stale-processes)
- Add users management route
- Add frontend landing page and stale process monitor UI on /scraper-tools
- Move old development scripts to backend/archive/
- Update frontend build with new features

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
153
backend/archive/test-brand-scrape.ts
Normal file
153
backend/archive/test-brand-scrape.ts
Normal file
@@ -0,0 +1,153 @@
|
||||
import puppeteer from 'puppeteer-extra';
|
||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||
import { Pool } from 'pg';
|
||||
|
||||
// Register the stealth plugin on the shared puppeteer-extra instance; this runs at
// module load, before testBrandScrape() launches a browser.
puppeteer.use(StealthPlugin());
|
||||
|
||||
// PostgreSQL connection pool used by this script.
// DATABASE_URL, when set, overrides the local development default so that
// credentials do not have to be edited in source to point at another database.
const pool = new Pool({
  connectionString:
    process.env.DATABASE_URL ??
    'postgresql://sail:password@localhost:5432/dutchie_menus'
});
|
||||
|
||||
/**
 * Manual smoke test: loads one hard-coded store's Dutchie menu page in a
 * headless browser and prints the brand names found in the DOM.
 *
 * Steps:
 *  1. Look up the store row by slug.
 *  2. Pick a proxy row (optional; the run continues without one).
 *  3. Launch a browser, routed through the proxy when one was found.
 *  4. Navigate to the store's `dutchie_url`.
 *  5. Collect the text of elements matching brand-like selectors and print them,
 *     or print page diagnostics when nothing matched.
 *
 * All errors are logged rather than rethrown. The browser (if launched) and the
 * connection pool are always closed before the function resolves, so the pool
 * cannot be reused afterwards.
 *
 * @returns A promise that resolves once the run has finished and resources are released.
 */
async function testBrandScrape(): Promise<void> {
  let browser: Awaited<ReturnType<typeof puppeteer.launch>> | undefined;
  const divider = '─'.repeat(60);

  try {
    // 1. Get the store
    const storeResult = await pool.query(
      "SELECT id, name, slug, dutchie_url FROM stores WHERE slug = $1",
      ['curaleaf-az-48th-street-med']
    );

    if (storeResult.rows.length === 0) {
      console.log('Store not found');
      return;
    }

    const store = storeResult.rows[0];
    console.log(`\nTesting brand scrape for: ${store.name}`);
    console.log(`URL: ${store.dutchie_url}\n`);

    // 2. Get a proxy
    // NOTE(review): this query has no WHERE or ORDER BY clause, so it returns an
    // arbitrary row from `proxies` rather than specifically an "active" one, even
    // though the log message below says "active" — confirm the intended filter.
    const proxyResult = await pool.query(`
      SELECT host, port, protocol, username, password
      FROM proxies
      LIMIT 1
    `);

    if (proxyResult.rows.length === 0) {
      console.log('No active proxies available - will try without proxy');
    }

    // Undefined when the table returned no rows; every later use is guarded.
    const proxy = proxyResult.rows[0];

    if (proxy) {
      console.log(`Using proxy: ${proxy.host}:${proxy.port}`);
    }
    console.log(`User-Agent: Googlebot`);
    console.log(divider);

    // 3. Launch browser with proxy
    const browserArgs = [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-blink-features=AutomationControlled'
    ];

    if (proxy) {
      const proxyUrl = `${proxy.protocol}://${proxy.host}:${proxy.port}`;
      browserArgs.push(`--proxy-server=${proxyUrl}`);
    }

    browser = await puppeteer.launch({
      headless: true,
      args: browserArgs
    });

    const page = await browser.newPage();

    // The proxy URL above carries only protocol/host/port, so any stored
    // credentials must be supplied separately, before the first navigation.
    if (proxy?.username) {
      await page.authenticate({
        username: proxy.username,
        password: proxy.password ?? ''
      });
    }

    // Set Googlebot user agent
    await page.setUserAgent('Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');

    // Anti-detection: make navigator.webdriver report false in every new document.
    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, 'webdriver', {
        get: () => false,
      });
    });

    // 4. Navigate and extract brands
    console.log('\nNavigating to store page...');
    await page.goto(store.dutchie_url, { waitUntil: 'networkidle2', timeout: 60000 });

    console.log('Page loaded, extracting brands...\n');

    // 5. Extract brands from product cards
    // The callback runs inside the page, so it must not reference outer variables.
    const brands = await page.evaluate(() => {
      const brandSet = new Set<string>();

      // Try multiple selectors
      const selectors = [
        '[class*="brand"]',
        '[class*="Brand"]',
        '[data-testid*="brand"]',
        '[class*="product"] [class*="brand"]'
      ];

      for (const selector of selectors) {
        const elements = document.querySelectorAll(selector);
        elements.forEach(el => {
          const text = el.textContent?.trim();
          // Skip empty matches and long text (100+ characters).
          if (text && text.length > 0 && text.length < 100) {
            brandSet.add(text);
          }
        });
      }

      return Array.from(brandSet);
    });

    console.log('BRANDS FOUND:');
    console.log(divider);

    if (brands.length === 0) {
      console.log('No brands found!');
      console.log('\nLet me also check what the page structure looks like...\n');

      // Debug: show page structure
      const pageInfo = await page.evaluate(() => {
        return {
          title: document.title,
          productCards: document.querySelectorAll('[class*="product"], [class*="Product"]').length,
          hasImages: document.querySelectorAll('img[src*="dutchie"]').length,
          bodyText: document.body.textContent?.substring(0, 500)
        };
      });

      console.log('Page Title:', pageInfo.title);
      console.log('Product Cards Found:', pageInfo.productCards);
      console.log('Dutchie Images:', pageInfo.hasImages);
      console.log('\nFirst 500 chars of page:');
      console.log(pageInfo.bodyText);
    } else {
      // Sort a copy so the extracted array is not mutated in place.
      [...brands].sort().forEach((brand, i) => {
        console.log(`${i + 1}. ${brand}`);
      });
      console.log(divider);
      console.log(`Total unique brands: ${brands.length}`);
    }
  } catch (error: unknown) {
    if (error instanceof Error) {
      console.error('Error:', error.message);
      if (error.stack) {
        console.error(error.stack);
      }
    } else {
      // Non-Error throwables have no message/stack; log the raw value instead.
      console.error('Error:', error);
    }
  } finally {
    if (browser) {
      try {
        await browser.close();
      } catch (closeError: unknown) {
        // A failed close must not prevent the pool from being shut down below.
        console.error('Failed to close browser:', closeError);
      }
    }
    await pool.end();
  }
}
|
||||
|
||||
// Script entry point. testBrandScrape() logs its own errors, but cleanup in its
// `finally` block can still reject; catch that here so the rejection is reported
// and the process exits with a failure code instead of an unhandled rejection.
testBrandScrape().catch((err: unknown) => {
  console.error('Fatal error:', err);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user