chore: Clean up deprecated code and docs
- Move deprecated directories to src/_deprecated/:
  - hydration/ (old pipeline approach)
  - scraper-v2/ (old Puppeteer scraper)
  - canonical-hydration/ (merged into tasks)
  - Unused services: availability, crawler-logger, geolocation, etc.
  - Unused utils: age-gate-playwright, HomepageValidator, stealthBrowser
- Archive outdated docs to docs/_archive/:
  - ANALYTICS_RUNBOOK.md
  - ANALYTICS_V2_EXAMPLES.md
  - BRAND_INTELLIGENCE_API.md
  - CRAWL_PIPELINE.md
  - TASK_WORKFLOW_2024-12-10.md
  - WORKER_TASK_ARCHITECTURE.md
  - ORGANIC_SCRAPING_GUIDE.md
- Add docs/CODEBASE_MAP.md as single source of truth
- Add warning files to deprecated/archived directories
- Slim down CLAUDE.md to essential rules only

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
92
backend/src/_deprecated/services/scraper-debug.ts
Normal file
92
backend/src/_deprecated/services/scraper-debug.ts
Normal file
@@ -0,0 +1,92 @@
|
||||
import puppeteer from 'puppeteer-extra';
|
||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||
import { logger } from './logger';
|
||||
|
||||
// Apply stealth plugin
|
||||
// Runs once when this module is loaded, before any browser is launched below.
// NOTE(review): presumably the plugin then applies to every browser started
// through this puppeteer-extra instance — confirm against the plugin docs.
puppeteer.use(StealthPlugin());
|
||||
|
||||
/**
 * Opens `url` in a headless browser and logs a diagnostic snapshot of the
 * rendered DOM, to help work out which selectors match product cards.
 *
 * On success the logged snapshot contains:
 *  - `selectors`: number of elements matching each candidate selector
 *  - `sampleHTML` / `sampleText`: first 1000 / 500 characters of the first
 *    product-like element, when one exists
 *  - `productClasses`: up to 20 class names taken from elements whose class
 *    attribute mentions "product"
 *  - `bodyTextSample`: first 500 characters of the page's visible text
 *
 * If navigation or inspection fails, the error is logged and a reduced
 * snapshot (URL, title, body length, first 500 characters of body HTML) is
 * attempted instead. Those failures are logged, not rethrown.
 *
 * The browser is always closed, including when page setup itself throws.
 *
 * @param url - Page to load and inspect.
 * @returns Resolves once the browser has been closed; results are only logged.
 */
export async function debugDutchiePage(url: string): Promise<void> {
  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
  });

  // Everything after launch sits in try/finally so a failure in page setup
  // cannot leave an orphaned browser process behind.
  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');

    logger.info('scraper', `Loading: ${url}`);

    try {
      await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });
      logger.info('scraper', 'Page loaded, waiting for content...');

      // Fixed delay to let client-side rendering finish. A plain timer is used
      // instead of page.waitForTimeout, which newer Puppeteer releases removed.
      await new Promise<void>(resolve => setTimeout(resolve, 8000));

      // NOTE: this callback is serialized and executed inside the page, so it
      // must not reference anything from the surrounding Node scope.
      const debug = await page.evaluate(() => {
        // Candidate selectors, from most to least specific.
        const productSelectors = [
          '[data-testid*="product"]',
          '[class*="Product"]',
          '[class*="product"]',
          'article',
          '[role="article"]',
          'li'
        ];

        const results: {
          selectors: Record<string, number>;
          sampleHTML?: string;
          sampleText?: string;
          productClasses?: string[];
          bodyTextSample?: string;
        } = {
          selectors: {}
        };

        for (const selector of productSelectors) {
          results.selectors[selector] = document.querySelectorAll(selector).length;
        }

        // Sample markup from the first product-like element, if any.
        const firstMatch = document.querySelector('[class*="product" i], article, [data-testid*="product"]');
        if (firstMatch) {
          results.sampleHTML = firstMatch.outerHTML.substring(0, 1000);
          results.sampleText = firstMatch.textContent?.substring(0, 500);
        }

        // Collect class names from every element whose class attribute
        // mentions "product" (case-insensitive).
        const classNames = new Set<string>();
        document.querySelectorAll('*').forEach(el => {
          const classes = el.className;
          // className is not a string on SVG elements, hence the typeof guard.
          if (typeof classes === 'string' && classes.toLowerCase().includes('product')) {
            // Split on any whitespace run and drop empty tokens so repeated
            // or leading spaces do not yield "" as a class name.
            classes.split(/\s+/).forEach(c => {
              if (c) {
                classNames.add(c);
              }
            });
          }
        });

        results.productClasses = Array.from(classNames).slice(0, 20);
        results.bodyTextSample = document.body.innerText.substring(0, 500);

        return results;
      });

      logger.info('scraper', `Debug results:\n${JSON.stringify(debug, null, 2)}`);
    } catch (error) {
      logger.error('scraper', `Debug navigation error: ${error}`);

      // Best effort: report whatever the page managed to load.
      try {
        const partialDebug = await page.evaluate(() => {
          return {
            url: window.location.href,
            title: document.title,
            bodyLength: document.body?.innerHTML?.length || 0,
            bodyStart: document.body?.innerHTML?.substring(0, 500) || ''
          };
        });
        logger.info('scraper', `Partial debug:\n${JSON.stringify(partialDebug, null, 2)}`);
      } catch (e) {
        logger.error('scraper', `Could not get partial debug: ${e}`);
      }
    }
  } finally {
    await browser.close();
  }
}
|
||||
Reference in New Issue
Block a user