Files
cannaiq/backend/dist/scripts/test-jane-scraper.js
Kelly d91c55a344 feat: Add stale process monitor, users route, landing page, archive old scripts
- Add backend stale process monitoring API (/api/stale-processes)
- Add users management route
- Add frontend landing page and stale process monitor UI on /scraper-tools
- Move old development scripts to backend/archive/
- Update frontend build with new features

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-05 04:07:31 -07:00

256 lines
11 KiB
JavaScript

"use strict";
/**
* Test script for iHeartJane menu scraping via Playwright
* Intercepts API/Algolia calls made by the browser
*/
Object.defineProperty(exports, "__esModule", { value: true });
const playwright_1 = require("playwright");
/**
 * Scrapes an iHeartJane menu with a headless Chromium browser.
 *
 * Product data is collected from three places: network responses the page
 * itself triggers (Algolia search results and Jane API "products" calls),
 * product cards found in the rendered DOM, and state objects embedded in the
 * page (`__NEXT_DATA__` or window-level globals). Algolia credentials seen in
 * request headers are captured as well.
 *
 * @param {string} urlOrStoreId - A full menu URL (anything starting with
 *   "http") or a bare store ID, which is expanded to the embed menu URL.
 * @returns {Promise<{apiResponses: Array, domProducts: Array, embeddedData: (Object|null), capturedCredentials: Object}>}
 *   Everything captured during the run.
 * @throws Propagates launch, context-setup, navigation, extraction and
 *   screenshot errors. The browser is closed before the error surfaces.
 */
async function scrapeJaneMenu(urlOrStoreId) {
    // Handle either a full URL or just a store ID
    const menuUrl = urlOrStoreId.startsWith('http')
        ? urlOrStoreId
        : `https://www.iheartjane.com/embed/stores/${urlOrStoreId}/menu`;
    console.log(`Starting Playwright scrape for iHeartJane: ${menuUrl}`);
    const browser = await playwright_1.chromium.launch({
        headless: true,
        args: [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-blink-features=AutomationControlled'
        ]
    });
    // Everything after launch runs inside try/finally so the browser process
    // is released even when context or page setup fails.
    try {
        const context = await browser.newContext({
            userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            viewport: { width: 1920, height: 1080 },
            locale: 'en-US',
            timezoneId: 'America/Chicago'
        });
        // Add stealth scripts to avoid detection
        await context.addInitScript(() => {
            Object.defineProperty(navigator, 'webdriver', { get: () => false });
            window.chrome = { runtime: {} };
        });
        const page = await context.newPage();
        const apiResponses = [];
        const capturedCredentials = {};
        attachNetworkCapture(page, apiResponses, capturedCredentials);
        console.log(`Navigating to: ${menuUrl}`);
        await page.goto(menuUrl, {
            waitUntil: 'domcontentloaded',
            timeout: 60000
        });
        // Wait for page to settle
        await page.waitForTimeout(2000);
        // The age gate is best-effort: a failure here must not abort the scrape
        console.log('Looking for age gate...');
        try {
            await dismissAgeGate(page);
        }
        catch (e) {
            console.log('Age gate handling error:', e);
        }
        // Wait for content to load after age gate
        await page.waitForTimeout(3000);
        // Try to scroll to trigger more product loads
        console.log('Scrolling to load more products...');
        for (let i = 0; i < 3; i++) {
            await page.evaluate(() => window.scrollBy(0, 1000));
            await page.waitForTimeout(1000);
        }
        // Extract products from the page DOM as backup
        const domProducts = await extractDomProducts(page);
        console.log(`Extracted ${domProducts.length} products from DOM`);
        // Check for __NEXT_DATA__ or similar embedded data
        const embeddedData = await extractEmbeddedData(page);
        if (embeddedData) {
            console.log(`Found embedded data: ${embeddedData.type}`);
            apiResponses.push(embeddedData);
        }
        // Take a screenshot for debugging
        const screenshotPath = `/tmp/jane-scrape-${Date.now()}.png`;
        await page.screenshot({ path: screenshotPath, fullPage: true });
        console.log(`Screenshot saved to ${screenshotPath}`);
        // Process captured API responses
        console.log('\n=== API Responses Summary ===');
        for (const resp of apiResponses) {
            console.log(`Type: ${resp.type}`);
            if (resp.type === 'algolia' && resp.data.hits) {
                console.log(` Hits: ${resp.data.hits.length}`);
                console.log(` Total: ${resp.data.nbHits}`);
                if (resp.data.hits[0]) {
                    console.log(` Sample product:`, JSON.stringify(resp.data.hits[0], null, 2).substring(0, 1000));
                }
            }
        }
        console.log('\n=== DOM Products Sample ===');
        console.log(JSON.stringify(domProducts.slice(0, 3), null, 2));
        console.log('\n=== Captured Credentials ===');
        console.log(JSON.stringify(capturedCredentials, null, 2));
        return {
            apiResponses,
            domProducts,
            embeddedData,
            capturedCredentials
        };
    }
    finally {
        await browser.close();
    }
}
/**
 * Registers request/response listeners on `page` that record Algolia
 * credentials and product payloads into the supplied collections.
 *
 * @param {Object} page - Playwright page to observe.
 * @param {Array} apiResponses - Receives `{ type, data }` entries (mutated).
 * @param {Object} capturedCredentials - Receives an `algolia` entry (mutated).
 */
function attachNetworkCapture(page, apiResponses, capturedCredentials) {
    page.on('request', (request) => {
        // Capture Algolia credentials from request headers
        if (!request.url().includes('algolia')) {
            return;
        }
        const headers = request.headers();
        const appId = headers['x-algolia-application-id'];
        const apiKey = headers['x-algolia-api-key'];
        if (appId && apiKey) {
            capturedCredentials.algolia = { appId, apiKey };
            console.log(`Captured Algolia credentials: App=${appId}, Key=${apiKey.substring(0, 10)}...`);
        }
    });
    page.on('response', async (response) => {
        const url = response.url();
        // Capture Algolia search results (only the first result set is kept)
        if (url.includes('algolia.net') || url.includes('algolianet.com')) {
            try {
                const data = await response.json();
                if (data.results && data.results[0] && data.results[0].hits) {
                    console.log(`Captured ${data.results[0].hits.length} products from Algolia`);
                    apiResponses.push({ type: 'algolia', data: data.results[0] });
                }
            }
            catch (e) {
                // Deliberately ignored: body was not JSON or could not be read
            }
        }
        // Capture Jane API responses
        if (url.includes('api.iheartjane.com') && url.includes('products')) {
            try {
                const data = await response.json();
                console.log(`Captured Jane API response: ${url}`);
                apiResponses.push({ type: 'jane-api', url, data });
            }
            catch (e) {
                // Deliberately ignored: body was not JSON or could not be read
            }
        }
    });
}
/**
 * Tries to click the age gate's "Yes" button, escalating through four
 * strategies until one reports success, then logs every button on the page
 * for debugging.
 *
 * @param {Object} page - Playwright page showing the (possible) age gate.
 * @returns {Promise<boolean>} Whether any strategy reported a click.
 */
async function dismissAgeGate(page) {
    const yesButton = page.locator('button:has-text("Yes")').first();
    let clicked = false;
    // Method 1: Use Playwright locator with exact text match
    try {
        await yesButton.waitFor({ state: 'visible', timeout: 5000 });
        await yesButton.click({ force: true });
        clicked = true;
        console.log('Clicked age gate via Playwright locator');
        await page.waitForTimeout(5000);
    }
    catch (e) {
        console.log('Playwright locator failed:', e.message);
    }
    // Method 2: Try clicking by visible bounding box
    if (!clicked) {
        try {
            // Method 1 already waited 5s for the button; bound this lookup so a
            // page without any matching button does not stall on the default timeout.
            const box = await yesButton.boundingBox({ timeout: 1000 });
            if (box) {
                const x = box.x + box.width / 2;
                const y = box.y + box.height / 2;
                await page.mouse.click(x, y);
                clicked = true;
                console.log(`Clicked age gate at coordinates: ${x}, ${y}`);
                await page.waitForTimeout(5000);
            }
        }
        catch (e) {
            console.log('Bounding box click failed');
        }
    }
    // Method 3: Try JavaScript click
    if (!clicked) {
        const jsClickResult = await page.evaluate(() => {
            const buttons = Array.from(document.querySelectorAll('button'));
            for (const btn of buttons) {
                if (btn.textContent?.includes('Yes')) {
                    btn.click();
                    return { success: true, buttonText: btn.textContent };
                }
            }
            return { success: false };
        });
        if (jsClickResult.success) {
            clicked = true;
            console.log(`Clicked via JS: ${jsClickResult.buttonText}`);
            await page.waitForTimeout(5000);
        }
    }
    // Method 4: Click element containing "Yes" with dispatchEvent
    if (!clicked) {
        const dispatchResult = await page.evaluate(() => {
            const buttons = Array.from(document.querySelectorAll('button'));
            for (const btn of buttons) {
                if (btn.textContent?.includes('Yes')) {
                    btn.dispatchEvent(new MouseEvent('click', { bubbles: true, cancelable: true }));
                    return true;
                }
            }
            return false;
        });
        if (dispatchResult) {
            clicked = true;
            console.log('Clicked via dispatchEvent');
            await page.waitForTimeout(5000);
        }
    }
    // Log button info for debugging
    const buttonInfo = await page.evaluate(() => {
        const buttons = Array.from(document.querySelectorAll('button'));
        return buttons.map(b => ({
            text: b.textContent?.trim(),
            visible: b.offsetParent !== null,
            rect: b.getBoundingClientRect()
        }));
    });
    console.log('Buttons found:', JSON.stringify(buttonInfo, null, 2));
    return clicked;
}
/**
 * Reads product cards out of the rendered DOM using a set of candidate
 * selectors. Cards without a resolvable name are skipped.
 *
 * @param {Object} page - Playwright page to query.
 * @returns {Promise<Array<{name: string, brand: (string|undefined), price: (string|undefined), image: (string|null|undefined), source: string}>>}
 */
async function extractDomProducts(page) {
    return page.evaluate(() => {
        const items = [];
        // Try various selectors that Jane might use
        const productCards = document.querySelectorAll('[data-testid*="product"], [class*="ProductCard"], [class*="product-card"], .product-tile');
        productCards.forEach((card) => {
            const name = card.querySelector('[class*="name"], [class*="title"], h3, h4')?.textContent?.trim();
            const brand = card.querySelector('[class*="brand"]')?.textContent?.trim();
            const price = card.querySelector('[class*="price"]')?.textContent?.trim();
            const image = card.querySelector('img')?.getAttribute('src');
            if (name) {
                items.push({ name, brand, price, image, source: 'dom' });
            }
        });
        return items;
    });
}
/**
 * Looks for state embedded in the page: the `__NEXT_DATA__` element first,
 * then `__INITIAL_STATE__`, `__PRELOADED_STATE__` and `products` on `window`.
 *
 * @param {Object} page - Playwright page to inspect.
 * @returns {Promise<({type: string, data: *}|null)>} The first source found,
 *   or null when none is present.
 */
async function extractEmbeddedData(page) {
    return page.evaluate(() => {
        // Check for Next.js data
        const nextData = document.getElementById('__NEXT_DATA__');
        if (nextData) {
            try {
                return { type: 'next', data: JSON.parse(nextData.textContent || '{}') };
            }
            catch (e) {
                // Malformed __NEXT_DATA__ must not abort the scrape; fall
                // through to the window-level sources below.
            }
        }
        // Check for any window-level product data
        const win = window;
        if (win.__INITIAL_STATE__)
            return { type: 'initial_state', data: win.__INITIAL_STATE__ };
        if (win.__PRELOADED_STATE__)
            return { type: 'preloaded', data: win.__PRELOADED_STATE__ };
        if (win.products)
            return { type: 'products', data: win.products };
        return null;
    });
}
// Script entry point: scrape the target named on the command line (falling
// back to a default store) and print the totals. Any failure is reported on
// stderr and the process exits with status 1.
const urlOrStoreId = process.argv[2] || 'https://iheartjane.com/aly2djS2yXoTGnR0/DBeqE6HSSwijog9l'; // Default to The Flower Shop Az
(async () => {
    try {
        const result = await scrapeJaneMenu(urlOrStoreId);
        console.log('\n=== Scrape Complete ===');
        console.log(`Total API responses captured: ${result.apiResponses.length}`);
        console.log(`Total DOM products: ${result.domProducts.length}`);
    }
    catch (err) {
        console.error('Scrape failed:', err);
        process.exit(1);
    }
})();