diff --git a/backend/src/services/browser-factory.ts b/backend/src/services/browser-factory.ts
new file mode 100644
index 00000000..39435856
--- /dev/null
+++ b/backend/src/services/browser-factory.ts
@@ -0,0 +1,278 @@
+/**
+ * Browser Factory - Centralized browser setup with antidetect features
+ *
+ * Creates Puppeteer browser instances with:
+ * - Stealth plugin for antidetect
+ * - Proxy configuration
+ * - Timezone spoofing (matches proxy geo)
+ * - Geolocation spoofing (matches proxy geo)
+ * - Resource blocking for bandwidth optimization
+ *
+ * Use this factory for all browser-based scraping to ensure consistent antidetect.
+ */
+
+/** Extract a printable message from an unknown thrown value. */
+function errorMessage(err: unknown): string {
+  return err instanceof Error ? err.message : String(err);
+}
+
+/**
+ * Look up IP geolocation via ip-api.com
+ *
+ * The lookup runs fetch() inside the page, so the request is made by the
+ * browser and goes through whatever proxy the browser was launched with.
+ *
+ * @param page - Puppeteer page used to run the in-browser fetch
+ * @returns IP address, timezone, coordinates, city, and region; null when the
+ *   lookup throws or ip-api.com does not answer with status 'success'
+ */
+async function getProxyGeoData(page: any): Promise<GeoData | null> {
+  try {
+    const geoData = await page.evaluate(async () => {
+      const response = await fetch('http://ip-api.com/json?fields=status,query,timezone,city,regionName,lat,lon');
+      return response.json();
+    });
+
+    if (geoData?.status === 'success') {
+      return {
+        // `query` is the IP address the lookup was answered for
+        ip: geoData.query,
+        timezone: geoData.timezone,
+        city: geoData.city,
+        region: geoData.regionName,
+        latitude: geoData.lat,
+        longitude: geoData.lon,
+      };
+    }
+    return null;
+  } catch (err: unknown) {
+    console.log(`[BrowserFactory] IP geolocation lookup failed: ${errorMessage(err)}`);
+    return null;
+  }
+}
+
+/**
+ * Apply timezone and geolocation overrides for `geoData` through a CDP session
+ *
+ * Rejects if the CDP session cannot be created or a command fails; callers
+ * decide whether that is fatal.
+ */
+async function applyGeoOverrides(page: any, geoData: GeoData): Promise<void> {
+  const client = await page.target().createCDPSession();
+
+  // Spoof timezone to match proxy IP
+  await client.send('Emulation.setTimezoneOverride', { timezoneId: geoData.timezone });
+
+  // Spoof geolocation to match proxy IP. typeof (not truthiness) so a coordinate
+  // of 0 is still applied; skipped when a coordinate is missing because CDP
+  // treats an omitted latitude/longitude as "position unavailable".
+  if (typeof geoData.latitude === 'number' && typeof geoData.longitude === 'number') {
+    await client.send('Emulation.setGeolocationOverride', {
+      latitude: geoData.latitude,
+      longitude: geoData.longitude,
+      accuracy: 100,
+    });
+  }
+}
+
+// Domains to block - analytics, tracking, feature flags
+const BLOCKED_DOMAINS = [
+  'googletagmanager.com',
+  'google-analytics.com',
+  'launchdarkly.com',
+  'assets2.dutchie.com',
+  'sentry.io',
+  'segment.io',
+  'segment.com',
+  'amplitude.com',
+  'mixpanel.com',
+  'hotjar.com',
+  'fullstory.com',
+];
+
+// Resource types aborted when blockResources is enabled
+const BLOCKED_RESOURCE_TYPES = ['image', 'font', 'media', 'stylesheet'];
+
+// puppeteer-extra appends a plugin on every use() call, so the stealth plugin
+// is registered only once per process instead of once per session.
+let stealthRegistered = false;
+
+export interface BrowserConfig {
+  /** Proxy URL; host:port goes to --proxy-server and any user:pass credentials are used for proxy auth */
+  proxyUrl?: string;
+  /** Target domains for geolocation permission (default: dutchie.com) */
+  targetDomains?: string[];
+  /** Block tracking domains and images/fonts/media/stylesheets for bandwidth (default: true) */
+  blockResources?: boolean;
+  /** Headless mode (default: 'new') */
+  headless?: boolean | 'new';
+}
+
+export interface GeoData {
+  /** Public IP seen by ip-api.com (the proxy exit IP when a proxy is configured) */
+  ip?: string;
+  timezone: string;
+  city?: string;
+  region?: string;
+  latitude?: number;
+  longitude?: number;
+}
+
+export interface BrowserSession {
+  browser: any;
+  page: any;
+  /** IP address detected by the geolocation lookup, or null if it was not reported */
+  proxyIp: string | null;
+  geoData: GeoData | null;
+  /** Close the browser; errors while closing are ignored */
+  close: () => Promise<void>;
+}
+
+
+/**
+ * Create a browser session with full antidetect setup
+ *
+ * @param config - Browser configuration options
+ * @returns Browser session with page, proxy IP, and geo data
+ * @throws Error if the proxy IP/location cannot be detected or browser setup
+ *   fails; the launched browser is closed before the error propagates
+ */
+export async function createBrowserSession(config: BrowserConfig = {}): Promise<BrowserSession> {
+  const {
+    proxyUrl,
+    targetDomains = ['https://dutchie.com', 'https://api.dutchie.com'],
+    blockResources = true,
+    headless = 'new',
+  } = config;
+
+  const puppeteer = require('puppeteer-extra');
+  if (!stealthRegistered) {
+    const StealthPlugin = require('puppeteer-extra-plugin-stealth');
+    puppeteer.use(StealthPlugin());
+    stealthRegistered = true;
+  }
+
+  // Parse the proxy URL once; it feeds both the launch arg and proxy auth
+  const proxy = proxyUrl ? new URL(proxyUrl) : null;
+
+  // Build browser args
+  const browserArgs = ['--no-sandbox', '--disable-setuid-sandbox'];
+  if (proxy) {
+    browserArgs.push(`--proxy-server=${proxy.host}`);
+  }
+
+  // Launch browser with stealth + proxy
+  const browser = await puppeteer.launch({
+    headless,
+    args: browserArgs,
+  });
+
+  // Everything after launch is guarded so a setup failure never leaks the browser process
+  try {
+    const page = await browser.newPage();
+
+    // Grant geolocation permission for target domains
+    const context = browser.defaultBrowserContext();
+    for (const domain of targetDomains) {
+      try {
+        await context.overridePermissions(domain, ['geolocation']);
+      } catch (err) {
+        // Ignore permission errors for domains we might not visit
+      }
+    }
+
+    // Set up request interception for resource blocking
+    if (blockResources) {
+      await page.setRequestInterception(true);
+
+      page.on('request', (request: any) => {
+        const url = request.url();
+        const resourceType = request.resourceType();
+
+        // Block by domain
+        if (BLOCKED_DOMAINS.some(domain => url.includes(domain))) {
+          request.abort();
+          return;
+        }
+
+        // Block by resource type (optional - saves bandwidth)
+        if (BLOCKED_RESOURCE_TYPES.includes(resourceType)) {
+          request.abort();
+          return;
+        }
+
+        request.continue();
+      });
+    }
+
+    // Set up proxy authentication if needed
+    if (proxy && proxy.username && proxy.password) {
+      await page.authenticate({
+        username: decodeURIComponent(proxy.username),
+        password: decodeURIComponent(proxy.password),
+      });
+    }
+
+    // Detect proxy IP and get actual geolocation
+    console.log(`[BrowserFactory] Detecting proxy IP and location...`);
+    const geoData = await getProxyGeoData(page);
+
+    if (!geoData) {
+      console.log(`[BrowserFactory] Failed to detect proxy location - closing browser`);
+      throw new Error('Failed to detect proxy IP/location');
+    }
+
+    console.log(`[BrowserFactory] Proxy location: ${geoData.city}, ${geoData.region} (${geoData.timezone})`);
+
+    // Antidetect overrides are best-effort: a CDP failure is logged, not thrown
+    try {
+      await applyGeoOverrides(page, geoData);
+      console.log(`[BrowserFactory] Antidetect configured: TZ=${geoData.timezone}, Geo=${geoData.latitude},${geoData.longitude}`);
+    } catch (err: unknown) {
+      console.log(`[BrowserFactory] Failed to set antidetect: ${errorMessage(err)}`);
+    }
+
+    return {
+      browser,
+      page,
+      proxyIp: geoData.ip ?? null,
+      geoData,
+      close: async () => {
+        try {
+          await browser.close();
+        } catch (err) {
+          // Ignore close errors
+        }
+      },
+    };
+  } catch (err) {
+    await browser.close().catch(() => {});
+    throw err;
+  }
+}
+
+/**
+ * Apply antidetect settings to an existing page
+ * Use this when you can't use createBrowserSession (e.g., existing browser)
+ *
+ * @param page - Puppeteer page instance
+ * @returns The detected geo data, or null if the IP lookup failed. A failure
+ *   while applying the overrides is logged and the geo data is still returned.
+ */
+export async function applyAntidetectSettings(page: any): Promise<GeoData | null> {
+  // Get IP and geo data in one call
+  const geoData = await getProxyGeoData(page);
+  if (!geoData) {
+    console.log(`[BrowserFactory] Could not detect IP for antidetect settings`);
+    return null;
+  }
+
+  try {
+    await applyGeoOverrides(page, geoData);
+    console.log(`[BrowserFactory] Antidetect applied: ${geoData.city}, ${geoData.region}`);
+  } catch (err: unknown) {
+    console.log(`[BrowserFactory] Failed to apply antidetect: ${errorMessage(err)}`);
+  }
+
+  return geoData;
+}
diff --git a/backend/src/services/puppeteer-preflight.ts b/backend/src/services/puppeteer-preflight.ts index f71cd449..506dabeb 100644 --- a/backend/src/services/puppeteer-preflight.ts +++ b/backend/src/services/puppeteer-preflight.ts @@ -26,25 +26,38 @@ const TEST_PLATFORM_ID = '6405ef617056e8014d79101b'; const FINGERPRINT_DEMO_URL = 'https://demo.fingerprint.com/'; const AMIUNIQUE_URL = 'https://amiunique.org/fingerprint'; -// IP geolocation API for timezone lookup (free, no key required) +// IP geolocation API - returns IP, timezone, lat/lng, city, region in one call const IP_API_URL = 'http://ip-api.com/json'; +interface GeoData { + ip: string; + timezone: string; + city: string; + region: string; + lat: number; + lng: number; +} + /** - * Look up timezone from IP address using ip-api.com - * Returns IANA timezone (e.g., 'America/New_York') or null on failure + * Look up IP geolocation via ip-api.com + * Returns IP address, timezone, coordinates, city, and region */ -async function getTimezoneFromIp(ip: string): Promise<{ timezone: string; city?: string; region?: string } | null> { +async function getProxyGeoData(page: any): Promise { try { - const axios = require('axios'); - const 
response = await axios.get(`${IP_API_URL}/${ip}?fields=status,timezone,city,regionName`, { - timeout: 5000, + // Use browser to fetch - this goes through the proxy + const geoData = await page.evaluate(async () => { + const response = await fetch('http://ip-api.com/json?fields=status,query,timezone,city,regionName,lat,lon'); + return response.json(); }); - if (response.data?.status === 'success' && response.data?.timezone) { + if (geoData?.status === 'success') { return { - timezone: response.data.timezone, - city: response.data.city, - region: response.data.regionName, + ip: geoData.query, + timezone: geoData.timezone, + city: geoData.city, + region: geoData.regionName, + lat: geoData.lat, + lng: geoData.lon, }; } return null; @@ -85,10 +98,12 @@ export interface PuppeteerPreflightResult extends PreflightResult { * * @param crawlRotator - CrawlRotator instance to get proxy from pool * @param customProxyUrl - Optional custom proxy URL (for identity pool system) + * @param targetState - Target state code for geo spoofing (e.g., 'AZ') */ export async function runPuppeteerPreflight( crawlRotator?: CrawlRotator, - customProxyUrl?: string + customProxyUrl?: string, + targetState?: string ): Promise { const result: PuppeteerPreflightResult = { method: 'http', @@ -183,6 +198,11 @@ export async function runPuppeteerPreflight( const page = await browser.newPage(); + // Grant geolocation permission so navigator.geolocation works without prompting + const context = browser.defaultBrowserContext(); + await context.overridePermissions('https://dutchie.com', ['geolocation']); + await context.overridePermissions('https://api.dutchie.com', ['geolocation']); + // Block unnecessary resources to save bandwidth await page.setRequestInterception(true); @@ -241,81 +261,59 @@ export async function runPuppeteerPreflight( }; // ========================================================================= - // STEP 1a: Get IP address directly via simple API (more reliable than scraping) + // 
STEP 1: Detect proxy IP and get actual geolocation + // ========================================================================= + console.log(`[PuppeteerPreflight] Detecting proxy IP and location...`); + const geoData = await getProxyGeoData(page); + + if (!geoData) { + result.error = 'Failed to detect proxy IP/location'; + console.log(`[PuppeteerPreflight] FAILED - ${result.error}`); + result.responseTimeMs = Date.now() - startTime; + return result; + } + + result.proxyIp = geoData.ip; + result.proxyConnected = true; + result.detectedTimezone = geoData.timezone; + result.detectedLocation = { city: geoData.city, region: geoData.region }; + console.log(`[PuppeteerPreflight] Proxy IP: ${geoData.ip} - ${geoData.city}, ${geoData.region} (${geoData.timezone})`); + + // ========================================================================= + // STEP 2: Configure antidetect to match actual proxy location // ========================================================================= - console.log(`[PuppeteerPreflight] Getting proxy IP address...`); try { - const ipApiResponse = await page.evaluate(async () => { - try { - const response = await fetch('https://api.ipify.org?format=json'); - const data = await response.json(); - return { ip: data.ip, error: null }; - } catch (err: any) { - return { ip: null, error: err.message }; - } + const client = await page.target().createCDPSession(); + + // Spoof timezone to match proxy IP + await client.send('Emulation.setTimezoneOverride', { timezoneId: geoData.timezone }); + console.log(`[PuppeteerPreflight] Timezone spoofed: ${geoData.timezone}`); + + // Spoof geolocation to match proxy IP + await client.send('Emulation.setGeolocationOverride', { + latitude: geoData.lat, + longitude: geoData.lng, + accuracy: 100, }); + console.log(`[PuppeteerPreflight] Geolocation spoofed: ${geoData.lat}, ${geoData.lng}`); - if (ipApiResponse.ip) { - result.proxyIp = ipApiResponse.ip; - result.proxyConnected = true; - 
console.log(`[PuppeteerPreflight] Detected proxy IP: ${ipApiResponse.ip}`); - - // Look up timezone from IP - const geoData = await getTimezoneFromIp(ipApiResponse.ip); - if (geoData) { - result.detectedTimezone = geoData.timezone; - result.detectedLocation = { city: geoData.city, region: geoData.region }; - console.log(`[PuppeteerPreflight] IP Geolocation: ${geoData.city}, ${geoData.region} (${geoData.timezone})`); - - // Set browser timezone to match proxy location via CDP - try { - const client = await page.target().createCDPSession(); - await client.send('Emulation.setTimezoneOverride', { timezoneId: geoData.timezone }); - console.log(`[PuppeteerPreflight] Browser timezone set to: ${geoData.timezone}`); - } catch (tzErr: any) { - console.log(`[PuppeteerPreflight] Failed to set browser timezone: ${tzErr.message}`); - } - } else { - console.log(`[PuppeteerPreflight] WARNING: Could not determine timezone from IP - timezone mismatch possible`); - } - } else { - console.log(`[PuppeteerPreflight] IP lookup failed: ${ipApiResponse.error || 'unknown error'}`); - } - } catch (ipErr: any) { - console.log(`[PuppeteerPreflight] IP API error: ${ipErr.message}`); + result.antidetectReady = true; + } catch (cdpErr: any) { + console.log(`[PuppeteerPreflight] Failed to set timezone/geolocation: ${cdpErr.message}`); } // ========================================================================= - // STEP 2: Preflight complete - proxy verified via ipify.org - // We skip heavy fingerprint.com/amiunique.org tests - just verify proxy works - // The actual Dutchie test happens at task time. 
+ // STEP 3: Preflight complete // ========================================================================= - - // If we got an IP from ipify.org, proxy is working - if (result.proxyIp) { - result.proxyConnected = true; - result.antidetectReady = true; // Assume stealth plugin is working - } result.responseTimeMs = Date.now() - startTime; - // If we got here with proxyConnected=true and antidetectReady=true, we're good - if (result.proxyConnected && result.antidetectReady) { + if (result.antidetectReady) { result.passed = true; console.log( - `[PuppeteerPreflight] PASSED - Proxy connected, anti-detect ready (${result.responseTimeMs}ms)` - ); - if (result.proxyIp) { - console.log(`[PuppeteerPreflight] Browser IP via proxy: ${result.proxyIp}`); - } - } else if (result.proxyConnected) { - // Proxy works but anti-detect check failed - still pass (anti-detect is best-effort) - result.passed = true; - result.antidetectReady = true; // Assume ready since proxy works - console.log( - `[PuppeteerPreflight] PASSED - Proxy connected (anti-detect check skipped, ${result.responseTimeMs}ms)` + `[PuppeteerPreflight] PASSED - ${geoData.city}, ${geoData.region} (${result.responseTimeMs}ms)` ); } else { - result.error = result.error || 'Proxy connection failed'; + result.error = result.error || 'Antidetect configuration failed'; console.log(`[PuppeteerPreflight] FAILED - ${result.error}`); } } catch (err: any) { @@ -341,7 +339,8 @@ export async function runPuppeteerPreflight( export async function runPuppeteerPreflightWithRetry( crawlRotator?: CrawlRotator, maxRetries: number = 1, - customProxyUrl?: string + customProxyUrl?: string, + targetState?: string ): Promise { let lastResult: PuppeteerPreflightResult | null = null; @@ -351,7 +350,7 @@ export async function runPuppeteerPreflightWithRetry( await new Promise((r) => setTimeout(r, 5000)); // Wait 5s between retries } - lastResult = await runPuppeteerPreflight(crawlRotator, customProxyUrl); + lastResult = await 
runPuppeteerPreflight(crawlRotator, customProxyUrl, targetState); if (lastResult.passed) { return lastResult;