Major changes: - Split crawl into payload_fetch (API → disk) and product_refresh (disk → DB) - Add task chaining: store_discovery → product_discovery → payload_fetch → product_refresh - Add payload storage utilities for gzipped JSON on filesystem - Add /api/payloads endpoints for payload access and diffing - Add DB-driven TaskScheduler with schedule persistence - Track newDispensaryIds through discovery promotion for chaining - Add stealth improvements: HTTP fingerprinting, proxy rotation enhancements - Add Workers dashboard K8s scaling controls New files: - src/tasks/handlers/payload-fetch.ts - Fetches from API, saves to disk - src/services/task-scheduler.ts - DB-driven schedule management - src/utils/payload-storage.ts - Payload save/load utilities - src/routes/payloads.ts - Payload API endpoints - src/services/http-fingerprint.ts - Browser fingerprint generation - docs/TASK_WORKFLOW_2024-12-10.md - Complete workflow documentation Migrations: - 078: Proxy consecutive 403 tracking - 079: task_schedules table - 080: raw_crawl_payloads table - 081: payload column and last_fetch_at 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
156 lines
5.3 KiB
TypeScript
156 lines
5.3 KiB
TypeScript
/**
 * Test script for stealth session management
 *
 * Per workflow-12102025.md:
 * - Tests HTTP fingerprinting (browser-specific headers + ordering)
 * - Tests UA generation (device distribution, browser filtering)
 * - Tests dynamic Referer per dispensary
 *
 * Usage:
 *   npx tsx src/scripts/test-stealth-session.ts
 */
|
|
|
|
import {
|
|
startSession,
|
|
endSession,
|
|
getCurrentSession,
|
|
buildHeaders,
|
|
setCrawlRotator,
|
|
} from '../platforms/dutchie';
|
|
|
|
import { CrawlRotator } from '../services/crawl-rotator';
|
|
import {
|
|
generateHTTPFingerprint,
|
|
buildRefererFromMenuUrl,
|
|
BrowserType,
|
|
} from '../services/http-fingerprint';
|
|
|
|
// Banner: a 60-column rule printed above and below the script title.
const bannerRule = '='.repeat(60);
for (const bannerLine of [
  bannerRule,
  'STEALTH SESSION TEST (per workflow-12102025.md)',
  bannerRule,
]) {
  console.log(bannerLine);
}

// Setup: construct a CrawlRotator and register it with the dutchie platform
// module before any session is started (the sessions below require it).
console.log('\n[Setup] Initializing CrawlRotator...');
const rotator = new CrawlRotator();
setCrawlRotator(rotator);
console.log(' CrawlRotator initialized');
|
|
|
|
// Test 1: HTTP Fingerprint Generation
// Generates one fingerprint per browser type and prints its TLS impersonation
// binary, its DNT flag, and the first five entries of its header order.
console.log('\n[Test 1] HTTP Fingerprint Generation:');
const browserFamilies: BrowserType[] = ['Chrome', 'Firefox', 'Safari', 'Edge'];

browserFamilies.forEach((browserFamily) => {
  const fp = generateHTTPFingerprint(browserFamily);
  console.log(` ${browserFamily}:`);
  console.log(` TLS binary: ${fp.curlImpersonateBinary}`);
  console.log(` DNT: ${fp.hasDNT ? 'enabled' : 'disabled'}`);
  // Only the leading headers are shown to keep the output to one line.
  console.log(` Header order: ${fp.headerOrder.slice(0, 5).join(', ')}...`);
});
|
|
|
|
// Test 2: Dynamic Referer from menu URLs
// Feeds absolute URLs, site-relative paths, and missing values (null and
// undefined) through buildRefererFromMenuUrl and prints each result.
console.log('\n[Test 2] Dynamic Referer from Menu URLs:');
const menuUrlCases = [
  'https://dutchie.com/embedded-menu/harvest-of-tempe',
  'https://dutchie.com/dispensary/zen-leaf-mesa',
  '/embedded-menu/deeply-rooted',
  '/dispensary/curaleaf-phoenix',
  null,
  undefined,
];

menuUrlCases.forEach((menuUrl) => {
  const derivedReferer = buildRefererFromMenuUrl(menuUrl);
  // Falsy inputs are shown with a placeholder label instead of "null"/"undefined".
  const shownInput = menuUrl ? menuUrl : '(null/undefined)';
  console.log(` ${shownInput}`);
  console.log(` → ${derivedReferer}`);
});
|
|
|
|
// Test 3: Session with Dynamic Referer
// Starts a session for a concrete menu URL and prints the identity it was
// given. The session is intentionally left open: Test 4 builds headers from it
// and is responsible for ending it.
console.log('\n[Test 3] Session with Dynamic Referer:');
const testMenuUrl = 'https://dutchie.com/dispensary/harvest-of-tempe';
console.log(` Starting session with menuUrl: ${testMenuUrl}`);

const session1 = startSession(testMenuUrl);
console.log(` Session ID: ${session1.sessionId}`);

const session1Fingerprint = session1.fingerprint;
console.log(` Browser: ${session1Fingerprint.browserName}`);
console.log(` Device: ${session1Fingerprint.deviceCategory}`);
console.log(` Referer: ${session1.referer}`);

const session1HttpFingerprint = session1Fingerprint.httpFingerprint;
console.log(` DNT: ${session1HttpFingerprint.hasDNT ? 'enabled' : 'disabled'}`);
console.log(` TLS: ${session1HttpFingerprint.curlImpersonateBinary}`);
|
|
|
|
// Test 4: Build Headers (browser-specific order)
// Builds request headers while the Test 3 session is still active, prints the
// header order plus a few sample values, then ends that session.
console.log('\n[Test 4] Build Headers (browser-specific order):');
const { headers: requestHeaders, orderedHeaders: headerSequence } = buildHeaders(true, 1000);
console.log(` Headers built for ${session1.fingerprint.browserName}:`);
console.log(` Order: ${headerSequence.join(' → ')}`);
console.log(` Sample headers:`);

// The User-Agent is truncated to 50 characters to keep the line readable.
console.log(` User-Agent: ${requestHeaders['User-Agent']?.slice(0, 50)}...`);

// Headers that are always printed, even when their value is missing.
for (const alwaysShown of ['Accept', 'Accept-Language', 'Referer'] as const) {
  console.log(` ${alwaysShown}: ${requestHeaders[alwaysShown]}`);
}

// Headers that are printed only when they carry a truthy value.
for (const shownIfSet of ['sec-ch-ua', 'DNT'] as const) {
  if (requestHeaders[shownIfSet]) {
    console.log(` ${shownIfSet}: ${requestHeaders[shownIfSet]}`);
  }
}

// Close the session opened in Test 3.
endSession();
|
|
|
|
// Test 5: Multiple Sessions (UA variety)
// Starts and immediately ends a handful of sessions, recording the browser,
// device category, and DNT flag assigned to each, then prints the tallies so
// the variety can be judged by eye.
console.log('\n[Test 5] Multiple Sessions (UA & fingerprint variety):');

/**
 * Number of throwaway sessions to sample. This is the single source of truth:
 * the loop bound and every label printed below are derived from it, so the
 * sample size can be changed without leaving stale numbers in the output.
 */
const VARIETY_SAMPLE_SIZE = 10;

const sessions: {
  browser: string;
  device: string;
  hasDNT: boolean;
}[] = [];

for (let i = 0; i < VARIETY_SAMPLE_SIZE; i++) {
  const session = startSession(`/dispensary/store-${i}`);
  sessions.push({
    browser: session.fingerprint.browserName,
    device: session.fingerprint.deviceCategory,
    hasDNT: session.fingerprint.httpFingerprint.hasDNT,
  });
  // End each session before starting the next one.
  endSession();
}

// Count distribution
const browserCounts: Record<string, number> = {};
const deviceCounts: Record<string, number> = {};
let dntCount = 0;

for (const s of sessions) {
  browserCounts[s.browser] = (browserCounts[s.browser] ?? 0) + 1;
  deviceCounts[s.device] = (deviceCounts[s.device] ?? 0) + 1;
  if (s.hasDNT) dntCount++;
}

// Labels use the number of sessions actually recorded, not a literal.
console.log(` ${sessions.length} sessions created:`);
console.log(` Browsers: ${JSON.stringify(browserCounts)}`);
console.log(` Devices: ${JSON.stringify(deviceCounts)}`);
console.log(` DNT enabled: ${dntCount}/${sessions.length} (expected ~30%)`);
|
|
|
|
// Test 6: Device distribution check (per workflow-12102025.md: 62/36/2)
// Samples many sessions and compares the observed share of each device
// category with the target split.
console.log('\n[Test 6] Device Distribution (larger sample):');

/** Number of sessions sampled for the distribution check. */
const DISTRIBUTION_SAMPLE_SIZE = 100;
/** Target share of each device category, in percent. */
const EXPECTED_DEVICE_PCT = { mobile: 62, desktop: 36, tablet: 2 } as const;
/** Maximum allowed deviation from the target, in percentage points (exclusive). */
const DISTRIBUTION_TOLERANCE_PCT = 15;

const deviceSamples: string[] = [];

for (let i = 0; i < DISTRIBUTION_SAMPLE_SIZE; i++) {
  const session = startSession();
  deviceSamples.push(session.fingerprint.deviceCategory);
  endSession();
}

/**
 * Share of samples in the given category, as a whole-number percentage.
 * Computed from the real sample count, so the printed "%" stays correct when
 * the sample size is not 100. Returns 0 for an empty sample.
 */
const devicePercent = (category: string): number => {
  if (deviceSamples.length === 0) return 0;
  const matches = deviceSamples.filter((d) => d === category).length;
  return Math.round((matches * 100) / deviceSamples.length);
};

const mobilePct = devicePercent('mobile');
const desktopPct = devicePercent('desktop');
const tabletPct = devicePercent('tablet');

console.log(
  ` ${deviceSamples.length} sessions (expected: ${EXPECTED_DEVICE_PCT.mobile}% mobile, ` +
    `${EXPECTED_DEVICE_PCT.desktop}% desktop, ${EXPECTED_DEVICE_PCT.tablet}% tablet):`
);
console.log(` Mobile: ${mobilePct}%`);
console.log(` Desktop: ${desktopPct}%`);
console.log(` Tablet: ${tabletPct}%`);

// Only mobile and desktop are checked against the tolerance; the tablet
// share is reported above but is not part of the verdict.
const withinTolerance =
  Math.abs(mobilePct - EXPECTED_DEVICE_PCT.mobile) < DISTRIBUTION_TOLERANCE_PCT &&
  Math.abs(desktopPct - EXPECTED_DEVICE_PCT.desktop) < DISTRIBUTION_TOLERANCE_PCT;
console.log(` Distribution: ${withinTolerance ? '✅ Reasonable' : '⚠️ Off target'}`);
|
|
|
|
// Closing banner: a blank line, then the completion notice between two rules.
const footerRule = '='.repeat(60);
for (const footerLine of ['\n' + footerRule, 'TEST COMPLETE', footerRule]) {
  console.log(footerLine);
}
|