feat: Add stale process monitor, users route, landing page, archive old scripts
- Add backend stale process monitoring API (/api/stale-processes)
- Add users management route
- Add frontend landing page and stale process monitor UI on /scraper-tools
- Move old development scripts to backend/archive/
- Update frontend build with new features

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
147
backend/archive/scrape-48th-brands.ts
Normal file
147
backend/archive/scrape-48th-brands.ts
Normal file
@@ -0,0 +1,147 @@
|
||||
import puppeteer from 'puppeteer-extra';
|
||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||
import { Pool } from 'pg';
|
||||
|
||||
// Register the stealth plugin on the shared puppeteer-extra instance before
// any browser is launched.
puppeteer.use(StealthPlugin());

// Shared Postgres connection pool for this script.
// Prefer DATABASE_URL so credentials do not have to live in source control;
// the original local-development connection string is kept as a fallback so
// existing invocations keep working unchanged.
const pool = new Pool({
  connectionString:
    process.env.DATABASE_URL ??
    'postgresql://sail:password@localhost:5432/dutchie_menus'
});
|
||||
|
||||
/**
 * Scrapes brand names from the Curaleaf Phoenix Airport brands page and saves
 * each one as a placeholder product row.
 *
 * Flow:
 *  1. Pick one random row from the `proxies` table.
 *  2. Launch headless Chromium through that proxy, with a Googlebot user-agent
 *     and in-page overrides for timezone, geolocation and `navigator.webdriver`.
 *  3. Load the brands page and collect short text snippets from elements that
 *     match a list of brand-looking selectors.
 *  4. Insert one `products` row per brand (store_id 1), skipping duplicates.
 *
 * Failures are logged rather than rethrown. The browser and the connection
 * pool are always released, even when one of the cleanup steps itself fails.
 */
async function main(): Promise<void> {
  let browser: Awaited<ReturnType<typeof puppeteer.launch>> | undefined;

  // Render any thrown value as a printable message.
  const describeError = (err: unknown): string =>
    err instanceof Error ? err.message : String(err);

  try {
    console.log('STEP 2: Getting random proxy from pool...');
    const proxyResult = await pool.query(`
      SELECT host, port, protocol FROM proxies
      ORDER BY RANDOM() LIMIT 1
    `);

    const proxy = proxyResult.rows[0];
    if (!proxy) {
      // An empty proxies table would otherwise surface as an opaque
      // "Cannot read properties of undefined" TypeError.
      throw new Error('No proxies available in the proxies table');
    }
    console.log(`✅ Selected proxy: ${proxy.host}:${proxy.port}\n`);

    console.log('STEP 3: Launching browser with proxy + anti-fingerprint...');
    const proxyUrl = `${proxy.protocol}://${proxy.host}:${proxy.port}`;

    browser = await puppeteer.launch({
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        `--proxy-server=${proxyUrl}`,
        '--disable-blink-features=AutomationControlled'
      ]
    });

    const page = await browser.newPage();

    // Set Googlebot user-agent
    await page.setUserAgent('Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');
    console.log('✅ Set UA to Googlebot\n');

    // Anti-fingerprint: spoof timezone, geolocation, remove webdriver.
    // This callback runs inside the page before any site script, so it must
    // not reference anything from the Node-side scope.
    await page.evaluateOnNewDocument(() => {
      // Timezone (Arizona)
      Object.defineProperty(Intl.DateTimeFormat.prototype, 'resolvedOptions', {
        value: function() { return { timeZone: 'America/Phoenix' }; }
      });

      // Geolocation (Phoenix)
      Object.defineProperty(navigator, 'geolocation', {
        get: () => ({
          getCurrentPosition: (success: (position: unknown) => void) => {
            setTimeout(() => success({
              coords: { latitude: 33.4484, longitude: -112.0740, accuracy: 100 }
            }), 100);
          }
        })
      });

      // Remove webdriver
      Object.defineProperty(navigator, 'webdriver', { get: () => false });
    });
    console.log('✅ Fingerprint spoofed (timezone=Arizona, geo=Phoenix, webdriver=hidden)\n');

    console.log('STEP 4: Navigating to Curaleaf Phoenix Airport brands page...');
    const url = 'https://curaleaf.com/stores/curaleaf-dispensary-phoenix-airport/brands';
    console.log(`URL: ${url}\n`);

    await page.goto(url, { waitUntil: 'networkidle2', timeout: 60000 });
    // Give client-side rendering extra time to settle. A plain timer is used
    // because page.waitForTimeout is not available in current Puppeteer releases.
    await new Promise<void>(resolve => setTimeout(resolve, 5000));

    console.log('STEP 5: Scraping brand data from page...');

    // Get page info for debugging
    const pageInfo = await page.evaluate(() => ({
      title: document.title,
      url: window.location.href,
      bodyLength: document.body.innerHTML.length
    }));

    console.log(`Page title: "${pageInfo.title}"`);
    console.log(`Current URL: ${pageInfo.url}`);
    console.log(`Body HTML length: ${pageInfo.bodyLength} chars\n`);

    // Scrape brands: union of the trimmed text of every element matching any
    // selector, keeping only non-empty strings shorter than 50 characters.
    const brands = await page.evaluate(() => {
      // Try multiple selectors
      const selectors = [
        '[data-testid*="brand"]',
        '[class*="Brand"]',
        '[class*="brand"]',
        'a[href*="/brand/"]',
        '.brand-card',
        '.brand-item'
      ];

      const found = new Set<string>();

      for (const selector of selectors) {
        document.querySelectorAll(selector).forEach(el => {
          const text = el.textContent?.trim();
          if (text && text.length > 0 && text.length < 50) {
            found.add(text);
          }
        });
      }

      return Array.from(found);
    });

    console.log(`✅ Found ${brands.length} brands:\n`);
    brands.forEach((b, i) => console.log(`  ${i + 1}. ${b}`));

    if (brands.length === 0) {
      console.log('\n⚠️  No brands found. Possible reasons:');
      console.log('   - IP/proxy is blocked');
      console.log('   - Page requires different selectors');
      console.log('   - Brands load asynchronously');
      return;
    }

    console.log('\n\nSTEP 6: Saving brands to database...');

    let saved = 0;
    for (const brand of brands) {
      try {
        const result = await pool.query(`
          INSERT INTO products (store_id, name, brand, dutchie_url, in_stock)
          VALUES (1, $1, $2, $3, true)
          ON CONFLICT (store_id, name, brand) DO NOTHING
        `, [`${brand} Product`, brand, url]);
        // ON CONFLICT DO NOTHING reports zero affected rows for duplicates,
        // so only count rows that were actually inserted.
        if ((result.rowCount ?? 0) > 0) {
          saved++;
        }
      } catch (e: unknown) {
        // Best-effort: keep going with the remaining brands, but do not hide
        // the failure.
        console.warn(`⚠️  Failed to save brand "${brand}": ${describeError(e)}`);
      }
    }

    console.log(`✅ Saved ${saved} brands to database\n`);

  } catch (error: unknown) {
    console.error('❌ ERROR:', describeError(error));
  } finally {
    // Nested finally: the pool must be closed even if closing the browser throws.
    try {
      if (browser) await browser.close();
    } finally {
      await pool.end();
    }
  }
}
|
||||
|
||||
// Script entry point. main() logs its own scrape errors, but cleanup can still
// reject; catch that here so it is reported and reflected in the exit status
// instead of becoming an unhandled promise rejection.
main().catch((error: unknown) => {
  console.error('❌ FATAL:', error instanceof Error ? error.message : error);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user