Add request interception to all Puppeteer handlers to block unnecessary resources (images, fonts, media, stylesheets). We only need HTML/JS for the session cookie, then the GraphQL JSON response. This was causing 2.4GB of bandwidth from assets2.dutchie.com - every page visit downloaded all product thumbnails, logos, etc. Files updated: - product-discovery-http.ts - entry-point-discovery.ts - store-discovery-http.ts - store-discovery-state.ts - puppeteer-preflight.ts Note: Product images from payload are still downloaded once to MinIO via image-storage.ts - this only blocks browser-rendered page images. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
302 lines
11 KiB
TypeScript
302 lines
11 KiB
TypeScript
/**
 * Puppeteer Preflight - Verify browser-based transport works through the proxy
 *
 * Uses Puppeteer + StealthPlugin to:
 * 1. Launch headless browser with stealth mode + PROXY
 * 2. Confirm the browser's outbound IP via api.ipify.org (proves the proxy works)
 * 3. Geolocate that IP via ip-api.com and set the browser timezone to match
 *
 * Not performed by this preflight (skipped as too heavy; the actual Dutchie
 * test happens at task time):
 * - Visiting fingerprint.com / amiunique.org to verify anti-detect
 * - Establishing a session by visiting the Dutchie embedded menu
 * - Making a GraphQL request from browser context and validating the response
 *
 * Use case: Anti-detect scraping that needs real browser fingerprint through proxy
 *
 * Based on test-intercept.js which successfully captures 1000+ products
 */
|
|
|
|
import { PreflightResult, CrawlRotator } from './crawl-rotator';
|
|
|
|
// GraphQL hash for FilteredProducts query - MUST match CLAUDE.md
|
|
const FILTERED_PRODUCTS_HASH = 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0';
|
|
|
|
// Test dispensary - AZ-Deeply-Rooted (known working)
|
|
const TEST_CNAME = 'AZ-Deeply-Rooted';
|
|
const TEST_PLATFORM_ID = '6405ef617056e8014d79101b';
|
|
|
|
// Anti-detect verification sites (primary + fallback)
|
|
const FINGERPRINT_DEMO_URL = 'https://demo.fingerprint.com/';
|
|
const AMIUNIQUE_URL = 'https://amiunique.org/fingerprint';
|
|
|
|
// IP geolocation API for timezone lookup (free, no key required)
|
|
const IP_API_URL = 'http://ip-api.com/json';
|
|
|
|
/**
|
|
* Look up timezone from IP address using ip-api.com
|
|
* Returns IANA timezone (e.g., 'America/New_York') or null on failure
|
|
*/
|
|
async function getTimezoneFromIp(ip: string): Promise<{ timezone: string; city?: string; region?: string } | null> {
|
|
try {
|
|
const axios = require('axios');
|
|
const response = await axios.get(`${IP_API_URL}/${ip}?fields=status,timezone,city,regionName`, {
|
|
timeout: 5000,
|
|
});
|
|
|
|
if (response.data?.status === 'success' && response.data?.timezone) {
|
|
return {
|
|
timezone: response.data.timezone,
|
|
city: response.data.city,
|
|
region: response.data.regionName,
|
|
};
|
|
}
|
|
return null;
|
|
} catch (err: any) {
|
|
console.log(`[PuppeteerPreflight] IP geolocation lookup failed: ${err.message}`);
|
|
return null;
|
|
}
|
|
}
|
|
|
|
/**
 * Result of a Puppeteer-based preflight run.
 *
 * Adds browser- and proxy-specific details on top of the shared
 * `PreflightResult` fields. Every field added here except `method` is optional.
 */
export interface PuppeteerPreflightResult extends PreflightResult {
  /** Always the literal `'http'` for results of this type. */
  method: 'http';

  /** Browser user agent used */
  browserUserAgent?: string;

  /** Number of products returned (proves API access) */
  productsReturned?: number;

  /** Bot detection result from fingerprint.com */
  botDetection?: { detected: boolean; probability?: number; type?: string };

  /** Expected proxy IP (from pool) */
  expectedProxyIp?: string;

  /** Whether IP verification passed (detected IP matches proxy) */
  ipVerified?: boolean;

  /** Detected timezone from IP geolocation */
  detectedTimezone?: string;

  /** Detected location from IP geolocation */
  detectedLocation?: { city?: string; region?: string };
}
|
|
|
|
/**
 * Run Puppeteer preflight check with proxy.
 *
 * What is actually verified:
 *  1. A proxy can be taken from `crawlRotator` (when no rotator is given the
 *     browser connects directly and a warning is logged).
 *  2. A stealth-enabled headless browser launches with that proxy.
 *  3. The browser can reach api.ipify.org, which reports the outbound IP.
 *  4. Best-effort: the IP is geolocated and the browser timezone is overridden
 *     to match; failures here are logged and do not fail the preflight.
 *
 * The fingerprint.com / amiunique.org checks are intentionally skipped, so
 * `antidetectReady` is assumed (stealth plugin loaded + connectivity proven),
 * not measured.
 *
 * Never throws: failures are reported via `error` with `passed: false`, and the
 * browser is always closed before returning.
 *
 * @param crawlRotator - CrawlRotator instance to get proxy from pool
 * @returns The preflight result; `passed` is true only when the outbound IP
 *          could be determined from inside the browser.
 */
export async function runPuppeteerPreflight(
  crawlRotator?: CrawlRotator
): Promise<PuppeteerPreflightResult> {
  // Upper bound for the in-page IP lookup so a dead proxy cannot stall the
  // preflight until the CDP protocol timeout.
  const IP_LOOKUP_TIMEOUT_MS = 15000;
  // Resource types the preflight never needs; aborting them saves proxy bandwidth.
  const BLOCKED_RESOURCE_TYPES = new Set(['image', 'font', 'media', 'stylesheet']);
  const describeError = (e: unknown): string => (e instanceof Error ? e.message : String(e));

  const result: PuppeteerPreflightResult = {
    method: 'http',
    passed: false,
    proxyAvailable: false,
    proxyConnected: false,
    antidetectReady: false,
    proxyIp: null,
    fingerprint: null,
    error: null,
    responseTimeMs: null,
    productsReturned: 0,
    ipVerified: false,
  };

  let browser: any = null;

  try {
    // Step 0: Get a proxy from the pool
    let proxyUrl: string | null = null;
    let expectedProxyHost: string | null = null;

    if (crawlRotator) {
      const currentProxy = crawlRotator.proxy.getCurrent();
      if (!currentProxy) {
        result.error = 'No proxy available from pool';
        console.log(`[PuppeteerPreflight] FAILED - No proxy available`);
        return result;
      }

      result.proxyAvailable = true;
      proxyUrl = crawlRotator.proxy.getProxyUrl(currentProxy);
      expectedProxyHost = currentProxy.host;
      result.expectedProxyIp = currentProxy.host;
      console.log(`[PuppeteerPreflight] Using proxy: ${currentProxy.host}:${currentProxy.port}`);
    } else {
      console.log(`[PuppeteerPreflight] WARNING: No CrawlRotator provided - using direct connection`);
      result.proxyAvailable = true; // No proxy needed for direct
    }

    // Lazy requires to avoid loading Puppeteer unless needed
    const puppeteer = require('puppeteer-extra');
    const StealthPlugin = require('puppeteer-extra-plugin-stealth');
    puppeteer.use(StealthPlugin());

    const startTime = Date.now();

    // Parse the proxy URL once; it feeds both the launch flag and the credentials.
    const proxyUrlParsed = proxyUrl ? new URL(proxyUrl) : null;

    // Build browser args
    const browserArgs = ['--no-sandbox', '--disable-setuid-sandbox'];
    if (proxyUrlParsed) {
      // Only host:port goes on the command line; credentials are supplied
      // separately through page.authenticate() below.
      browserArgs.push(`--proxy-server=${proxyUrlParsed.host}`);
    }

    // Launch browser with stealth + proxy
    browser = await puppeteer.launch({
      headless: 'new',
      args: browserArgs,
    });

    const page = await browser.newPage();

    // Block unnecessary resources to save bandwidth
    await page.setRequestInterception(true);
    page.on('request', (request: any) => {
      const decision = BLOCKED_RESOURCE_TYPES.has(request.resourceType())
        ? request.abort()
        : request.continue();
      // abort()/continue() return promises that reject if the request was
      // already handled or the page went away; that must not surface as an
      // unhandled rejection.
      Promise.resolve(decision).catch(() => {});
    });

    // If proxy has auth, set it up
    if (proxyUrlParsed && proxyUrlParsed.username && proxyUrlParsed.password) {
      await page.authenticate({
        username: decodeURIComponent(proxyUrlParsed.username),
        password: decodeURIComponent(proxyUrlParsed.password),
      });
    }

    // Get browser user agent
    const userAgent = await page.evaluate(() => navigator.userAgent);
    result.browserUserAgent = userAgent;
    result.fingerprint = {
      userAgent,
      browserName: 'Chrome (Puppeteer)',
      deviceCategory: 'desktop',
    };

    // =========================================================================
    // STEP 1: Get IP address directly via simple API (more reliable than scraping)
    // =========================================================================
    console.log(`[PuppeteerPreflight] Getting proxy IP address...`);
    try {
      // NOTE: this callback is serialized and executed inside the page, so it
      // must be self-contained (no references to outer variables).
      const ipApiResponse = await page.evaluate(async (timeoutMs: number) => {
        const controller = new AbortController();
        const timer = setTimeout(() => controller.abort(), timeoutMs);
        try {
          const response = await fetch('https://api.ipify.org?format=json', {
            signal: controller.signal,
          });
          if (!response.ok) {
            return { ip: null, error: `HTTP ${response.status}` };
          }
          const data = await response.json();
          if (!data || typeof data.ip !== 'string' || data.ip === '') {
            return { ip: null, error: 'Response did not contain an IP address' };
          }
          return { ip: data.ip, error: null };
        } catch (err: unknown) {
          return { ip: null, error: err instanceof Error ? err.message : String(err) };
        } finally {
          clearTimeout(timer);
        }
      }, IP_LOOKUP_TIMEOUT_MS);

      if (ipApiResponse.ip) {
        result.proxyIp = ipApiResponse.ip;
        result.proxyConnected = true;
        // Informational only (does not affect `passed`): true when the detected
        // IP equals the proxy host we connected to. It stays false for direct
        // connections and for proxies whose outbound IP differs from the host.
        result.ipVerified = expectedProxyHost !== null && expectedProxyHost === ipApiResponse.ip;
        console.log(`[PuppeteerPreflight] Detected proxy IP: ${ipApiResponse.ip}`);

        // Look up timezone from IP
        const geoData = await getTimezoneFromIp(ipApiResponse.ip);
        if (geoData) {
          result.detectedTimezone = geoData.timezone;
          result.detectedLocation = { city: geoData.city, region: geoData.region };
          console.log(`[PuppeteerPreflight] IP Geolocation: ${geoData.city}, ${geoData.region} (${geoData.timezone})`);

          // Set browser timezone to match proxy location via CDP
          try {
            const client = await page.target().createCDPSession();
            await client.send('Emulation.setTimezoneOverride', { timezoneId: geoData.timezone });
            console.log(`[PuppeteerPreflight] Browser timezone set to: ${geoData.timezone}`);
          } catch (tzErr: unknown) {
            console.log(`[PuppeteerPreflight] Failed to set browser timezone: ${describeError(tzErr)}`);
          }
        } else {
          console.log(`[PuppeteerPreflight] WARNING: Could not determine timezone from IP - timezone mismatch possible`);
        }
      } else {
        console.log(`[PuppeteerPreflight] IP lookup failed: ${ipApiResponse.error || 'unknown error'}`);
      }
    } catch (ipErr: unknown) {
      console.log(`[PuppeteerPreflight] IP API error: ${describeError(ipErr)}`);
    }

    // =========================================================================
    // STEP 2: Preflight complete - proxy verified via ipify.org
    // We skip heavy fingerprint.com/amiunique.org tests - just verify proxy works
    // The actual Dutchie test happens at task time.
    // =========================================================================
    result.responseTimeMs = Date.now() - startTime;

    if (result.proxyIp) {
      // Getting an IP back proves connectivity. Anti-detect readiness is not
      // measured here; it is assumed because the stealth plugin is loaded.
      result.proxyConnected = true;
      result.antidetectReady = true;
      result.passed = true;
      console.log(
        `[PuppeteerPreflight] PASSED - Proxy connected, anti-detect ready (${result.responseTimeMs}ms)`
      );
      console.log(`[PuppeteerPreflight] Browser IP via proxy: ${result.proxyIp}`);
    } else {
      result.error = result.error ?? 'Proxy connection failed';
      console.log(`[PuppeteerPreflight] FAILED - ${result.error}`);
    }
  } catch (err: unknown) {
    result.error = `Browser error: ${describeError(err) || 'Unknown error'}`;
    console.log(`[PuppeteerPreflight] FAILED - ${result.error}`);
  } finally {
    if (browser) {
      // Closing is best-effort: a failure here must not mask the real result.
      await browser.close().catch(() => {});
    }
  }

  return result;
}
|
|
|
|
/**
 * Run Puppeteer preflight with retry.
 *
 * Makes one attempt, then retries up to `maxRetries` more times (waiting 5s
 * before each retry) until an attempt passes. At least one attempt is always
 * made, so the returned result is never null even for a negative or NaN
 * `maxRetries`.
 *
 * @param crawlRotator - CrawlRotator instance to get proxy from pool
 * @param maxRetries - Number of retry attempts (default 1); negative or NaN
 *                     values are treated as 0
 * @returns The first passing result, or the result of the last attempt.
 */
export async function runPuppeteerPreflightWithRetry(
  crawlRotator?: CrawlRotator,
  maxRetries: number = 1
): Promise<PuppeteerPreflightResult> {
  const RETRY_DELAY_MS = 5000;
  // Math.max(0, NaN) is NaN, so NaN needs its own guard.
  const retries = Number.isNaN(maxRetries) ? 0 : Math.max(0, maxRetries);

  let lastResult = await runPuppeteerPreflight(crawlRotator);

  for (let attempt = 1; attempt <= retries && !lastResult.passed; attempt++) {
    console.log(`[PuppeteerPreflight] Retry attempt ${attempt}/${retries}...`);
    await new Promise((r) => setTimeout(r, RETRY_DELAY_MS)); // Wait 5s between retries
    lastResult = await runPuppeteerPreflight(crawlRotator);
  }

  return lastResult;
}
|