feat(tasks): Refactor task workflow with payload/refresh separation

Major changes:
- Split crawl into payload_fetch (API → disk) and product_refresh (disk → DB)
- Add task chaining: store_discovery → product_discovery → payload_fetch → product_refresh
- Add payload storage utilities for gzipped JSON on filesystem
- Add /api/payloads endpoints for payload access and diffing
- Add DB-driven TaskScheduler with schedule persistence
- Track newDispensaryIds through discovery promotion for chaining
- Add stealth improvements: HTTP fingerprinting, proxy rotation enhancements
- Add Workers dashboard K8s scaling controls

New files:
- src/tasks/handlers/payload-fetch.ts - Fetches from API, saves to disk
- src/services/task-scheduler.ts - DB-driven schedule management
- src/utils/payload-storage.ts - Payload save/load utilities
- src/routes/payloads.ts - Payload API endpoints
- src/services/http-fingerprint.ts - Browser fingerprint generation
- docs/TASK_WORKFLOW_2024-12-10.md - Complete workflow documentation

Migrations:
- 078: Proxy consecutive 403 tracking
- 079: task_schedules table
- 080: raw_crawl_payloads table
- 081: payload column and last_fetch_at

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -16,10 +16,11 @@ import {
|
||||
executeGraphQL,
|
||||
startSession,
|
||||
endSession,
|
||||
getFingerprint,
|
||||
setCrawlRotator,
|
||||
GRAPHQL_HASHES,
|
||||
DUTCHIE_CONFIG,
|
||||
} from '../platforms/dutchie';
|
||||
import { CrawlRotator } from '../services/crawl-rotator';
|
||||
|
||||
dotenv.config();
|
||||
|
||||
@@ -108,19 +109,27 @@ async function main() {
|
||||
|
||||
// ============================================================
|
||||
// STEP 2: Start stealth session
|
||||
// Per workflow-12102025.md: Initialize CrawlRotator and start session with menuUrl
|
||||
// ============================================================
|
||||
console.log('┌─────────────────────────────────────────────────────────────┐');
|
||||
console.log('│ STEP 2: Start Stealth Session │');
|
||||
console.log('└─────────────────────────────────────────────────────────────┘');
|
||||
|
||||
// Use Arizona timezone for this store
|
||||
const session = startSession(disp.state || 'AZ', 'America/Phoenix');
|
||||
// Per workflow-12102025.md: Initialize CrawlRotator (required for sessions)
|
||||
const rotator = new CrawlRotator();
|
||||
setCrawlRotator(rotator);
|
||||
|
||||
const fp = getFingerprint();
|
||||
// Per workflow-12102025.md: startSession takes menuUrl for dynamic Referer
|
||||
const session = startSession(disp.menu_url);
|
||||
|
||||
const fp = session.fingerprint;
|
||||
console.log(` Session ID: ${session.sessionId}`);
|
||||
console.log(` Browser: ${fp.browserName} (${fp.deviceCategory})`);
|
||||
console.log(` User-Agent: ${fp.userAgent.slice(0, 60)}...`);
|
||||
console.log(` Accept-Language: ${fp.acceptLanguage}`);
|
||||
console.log(` Sec-CH-UA: ${fp.secChUa || '(not set)'}`);
|
||||
console.log(` Referer: ${session.referer}`);
|
||||
console.log(` DNT: ${fp.httpFingerprint.hasDNT ? 'enabled' : 'disabled'}`);
|
||||
console.log(` TLS: ${fp.httpFingerprint.curlImpersonateBinary}`);
|
||||
console.log('');
|
||||
|
||||
// ============================================================
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
/**
|
||||
* Test script for stealth session management
|
||||
*
|
||||
* Tests:
|
||||
* 1. Per-session fingerprint rotation
|
||||
* 2. Geographic consistency (timezone → Accept-Language)
|
||||
* 3. Proxy location loading from database
|
||||
* Per workflow-12102025.md:
|
||||
* - Tests HTTP fingerprinting (browser-specific headers + ordering)
|
||||
* - Tests UA generation (device distribution, browser filtering)
|
||||
* - Tests dynamic Referer per dispensary
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx src/scripts/test-stealth-session.ts
|
||||
@@ -14,104 +14,142 @@ import {
|
||||
startSession,
|
||||
endSession,
|
||||
getCurrentSession,
|
||||
getFingerprint,
|
||||
getRandomFingerprint,
|
||||
getLocaleForTimezone,
|
||||
buildHeaders,
|
||||
setCrawlRotator,
|
||||
} from '../platforms/dutchie';
|
||||
|
||||
import { CrawlRotator } from '../services/crawl-rotator';
|
||||
import {
|
||||
generateHTTPFingerprint,
|
||||
buildRefererFromMenuUrl,
|
||||
BrowserType,
|
||||
} from '../services/http-fingerprint';
|
||||
|
||||
console.log('='.repeat(60));
|
||||
console.log('STEALTH SESSION TEST');
|
||||
console.log('STEALTH SESSION TEST (per workflow-12102025.md)');
|
||||
console.log('='.repeat(60));
|
||||
|
||||
// Test 1: Timezone to Locale mapping
|
||||
console.log('\n[Test 1] Timezone to Locale Mapping:');
|
||||
const testTimezones = [
|
||||
'America/Phoenix',
|
||||
'America/Los_Angeles',
|
||||
'America/New_York',
|
||||
'America/Chicago',
|
||||
// Initialize CrawlRotator (required for sessions)
|
||||
console.log('\n[Setup] Initializing CrawlRotator...');
|
||||
const rotator = new CrawlRotator();
|
||||
setCrawlRotator(rotator);
|
||||
console.log(' CrawlRotator initialized');
|
||||
|
||||
// Test 1: HTTP Fingerprint Generation
|
||||
console.log('\n[Test 1] HTTP Fingerprint Generation:');
|
||||
const browsers: BrowserType[] = ['Chrome', 'Firefox', 'Safari', 'Edge'];
|
||||
|
||||
for (const browser of browsers) {
|
||||
const httpFp = generateHTTPFingerprint(browser);
|
||||
console.log(` ${browser}:`);
|
||||
console.log(` TLS binary: ${httpFp.curlImpersonateBinary}`);
|
||||
console.log(` DNT: ${httpFp.hasDNT ? 'enabled' : 'disabled'}`);
|
||||
console.log(` Header order: ${httpFp.headerOrder.slice(0, 5).join(', ')}...`);
|
||||
}
|
||||
|
||||
// Test 2: Dynamic Referer from menu URLs
|
||||
console.log('\n[Test 2] Dynamic Referer from Menu URLs:');
|
||||
const testUrls = [
|
||||
'https://dutchie.com/embedded-menu/harvest-of-tempe',
|
||||
'https://dutchie.com/dispensary/zen-leaf-mesa',
|
||||
'/embedded-menu/deeply-rooted',
|
||||
'/dispensary/curaleaf-phoenix',
|
||||
null,
|
||||
undefined,
|
||||
'Invalid/Timezone',
|
||||
];
|
||||
|
||||
for (const tz of testTimezones) {
|
||||
const locale = getLocaleForTimezone(tz);
|
||||
console.log(` ${tz || '(undefined)'} → ${locale}`);
|
||||
for (const url of testUrls) {
|
||||
const referer = buildRefererFromMenuUrl(url);
|
||||
console.log(` ${url || '(null/undefined)'}`);
|
||||
console.log(` → ${referer}`);
|
||||
}
|
||||
|
||||
// Test 2: Random fingerprint selection
|
||||
console.log('\n[Test 2] Random Fingerprint Selection (5 samples):');
|
||||
for (let i = 0; i < 5; i++) {
|
||||
const fp = getRandomFingerprint();
|
||||
console.log(` ${i + 1}. ${fp.userAgent.slice(0, 60)}...`);
|
||||
// Test 3: Session with Dynamic Referer
|
||||
console.log('\n[Test 3] Session with Dynamic Referer:');
|
||||
const testMenuUrl = 'https://dutchie.com/dispensary/harvest-of-tempe';
|
||||
console.log(` Starting session with menuUrl: ${testMenuUrl}`);
|
||||
|
||||
const session1 = startSession(testMenuUrl);
|
||||
console.log(` Session ID: ${session1.sessionId}`);
|
||||
console.log(` Browser: ${session1.fingerprint.browserName}`);
|
||||
console.log(` Device: ${session1.fingerprint.deviceCategory}`);
|
||||
console.log(` Referer: ${session1.referer}`);
|
||||
console.log(` DNT: ${session1.fingerprint.httpFingerprint.hasDNT ? 'enabled' : 'disabled'}`);
|
||||
console.log(` TLS: ${session1.fingerprint.httpFingerprint.curlImpersonateBinary}`);
|
||||
|
||||
// Test 4: Build Headers (browser-specific order)
|
||||
console.log('\n[Test 4] Build Headers (browser-specific order):');
|
||||
const { headers, orderedHeaders } = buildHeaders(true, 1000);
|
||||
console.log(` Headers built for ${session1.fingerprint.browserName}:`);
|
||||
console.log(` Order: ${orderedHeaders.join(' → ')}`);
|
||||
console.log(` Sample headers:`);
|
||||
console.log(` User-Agent: ${headers['User-Agent']?.slice(0, 50)}...`);
|
||||
console.log(` Accept: ${headers['Accept']}`);
|
||||
console.log(` Accept-Language: ${headers['Accept-Language']}`);
|
||||
console.log(` Referer: ${headers['Referer']}`);
|
||||
if (headers['sec-ch-ua']) {
|
||||
console.log(` sec-ch-ua: ${headers['sec-ch-ua']}`);
|
||||
}
|
||||
if (headers['DNT']) {
|
||||
console.log(` DNT: ${headers['DNT']}`);
|
||||
}
|
||||
|
||||
// Test 3: Session Management
|
||||
console.log('\n[Test 3] Session Management:');
|
||||
|
||||
// Before session - should use default fingerprint
|
||||
console.log(' Before session:');
|
||||
const beforeFp = getFingerprint();
|
||||
console.log(` getFingerprint(): ${beforeFp.userAgent.slice(0, 50)}...`);
|
||||
console.log(` getCurrentSession(): ${getCurrentSession()}`);
|
||||
|
||||
// Start session with Arizona timezone
|
||||
console.log('\n Starting session (AZ, America/Phoenix):');
|
||||
const session1 = startSession('AZ', 'America/Phoenix');
|
||||
console.log(` Session ID: ${session1.sessionId}`);
|
||||
console.log(` Fingerprint UA: ${session1.fingerprint.userAgent.slice(0, 50)}...`);
|
||||
console.log(` Accept-Language: ${session1.fingerprint.acceptLanguage}`);
|
||||
console.log(` Timezone: ${session1.timezone}`);
|
||||
|
||||
// During session - should use session fingerprint
|
||||
console.log('\n During session:');
|
||||
const duringFp = getFingerprint();
|
||||
console.log(` getFingerprint(): ${duringFp.userAgent.slice(0, 50)}...`);
|
||||
console.log(` Same as session? ${duringFp.userAgent === session1.fingerprint.userAgent}`);
|
||||
|
||||
// Test buildHeaders with session
|
||||
console.log('\n buildHeaders() during session:');
|
||||
const headers = buildHeaders('/embedded-menu/test-store');
|
||||
console.log(` User-Agent: ${headers['user-agent'].slice(0, 50)}...`);
|
||||
console.log(` Accept-Language: ${headers['accept-language']}`);
|
||||
console.log(` Origin: ${headers['origin']}`);
|
||||
console.log(` Referer: ${headers['referer']}`);
|
||||
|
||||
// End session
|
||||
console.log('\n Ending session:');
|
||||
endSession();
|
||||
console.log(` getCurrentSession(): ${getCurrentSession()}`);
|
||||
|
||||
// Test 4: Multiple sessions should have different fingerprints
|
||||
console.log('\n[Test 4] Multiple Sessions (fingerprint variety):');
|
||||
const fingerprints: string[] = [];
|
||||
// Test 5: Multiple Sessions (UA variety)
|
||||
console.log('\n[Test 5] Multiple Sessions (UA & fingerprint variety):');
|
||||
const sessions: {
|
||||
browser: string;
|
||||
device: string;
|
||||
hasDNT: boolean;
|
||||
}[] = [];
|
||||
|
||||
for (let i = 0; i < 10; i++) {
|
||||
const session = startSession('CA', 'America/Los_Angeles');
|
||||
fingerprints.push(session.fingerprint.userAgent);
|
||||
const session = startSession(`/dispensary/store-${i}`);
|
||||
sessions.push({
|
||||
browser: session.fingerprint.browserName,
|
||||
device: session.fingerprint.deviceCategory,
|
||||
hasDNT: session.fingerprint.httpFingerprint.hasDNT,
|
||||
});
|
||||
endSession();
|
||||
}
|
||||
|
||||
const uniqueCount = new Set(fingerprints).size;
|
||||
console.log(` 10 sessions created, ${uniqueCount} unique fingerprints`);
|
||||
console.log(` Variety: ${uniqueCount >= 3 ? '✅ Good' : '⚠️ Low - may need more fingerprint options'}`);
|
||||
// Count distribution
|
||||
const browserCounts: Record<string, number> = {};
|
||||
const deviceCounts: Record<string, number> = {};
|
||||
let dntCount = 0;
|
||||
|
||||
// Test 5: Geographic consistency check
|
||||
console.log('\n[Test 5] Geographic Consistency:');
|
||||
const geoTests = [
|
||||
{ state: 'AZ', tz: 'America/Phoenix' },
|
||||
{ state: 'CA', tz: 'America/Los_Angeles' },
|
||||
{ state: 'NY', tz: 'America/New_York' },
|
||||
{ state: 'IL', tz: 'America/Chicago' },
|
||||
];
|
||||
for (const s of sessions) {
|
||||
browserCounts[s.browser] = (browserCounts[s.browser] || 0) + 1;
|
||||
deviceCounts[s.device] = (deviceCounts[s.device] || 0) + 1;
|
||||
if (s.hasDNT) dntCount++;
|
||||
}
|
||||
|
||||
for (const { state, tz } of geoTests) {
|
||||
const session = startSession(state, tz);
|
||||
const consistent = session.fingerprint.acceptLanguage.includes('en-US');
|
||||
console.log(` ${state} (${tz}): Accept-Language=${session.fingerprint.acceptLanguage} ${consistent ? '✅' : '❌'}`);
|
||||
console.log(` 10 sessions created:`);
|
||||
console.log(` Browsers: ${JSON.stringify(browserCounts)}`);
|
||||
console.log(` Devices: ${JSON.stringify(deviceCounts)}`);
|
||||
console.log(` DNT enabled: ${dntCount}/10 (expected ~30%)`);
|
||||
|
||||
// Test 6: Device distribution check (per workflow-12102025.md: 62/36/2)
|
||||
console.log('\n[Test 6] Device Distribution (larger sample):');
|
||||
const deviceSamples: string[] = [];
|
||||
|
||||
for (let i = 0; i < 100; i++) {
|
||||
const session = startSession();
|
||||
deviceSamples.push(session.fingerprint.deviceCategory);
|
||||
endSession();
|
||||
}
|
||||
|
||||
const mobileCount = deviceSamples.filter(d => d === 'mobile').length;
|
||||
const desktopCount = deviceSamples.filter(d => d === 'desktop').length;
|
||||
const tabletCount = deviceSamples.filter(d => d === 'tablet').length;
|
||||
|
||||
console.log(` 100 sessions (expected: 62% mobile, 36% desktop, 2% tablet):`);
|
||||
console.log(` Mobile: ${mobileCount}%`);
|
||||
console.log(` Desktop: ${desktopCount}%`);
|
||||
console.log(` Tablet: ${tabletCount}%`);
|
||||
console.log(` Distribution: ${Math.abs(mobileCount - 62) < 15 && Math.abs(desktopCount - 36) < 15 ? '✅ Reasonable' : '⚠️ Off target'}`);
|
||||
|
||||
console.log('\n' + '='.repeat(60));
|
||||
console.log('TEST COMPLETE');
|
||||
console.log('='.repeat(60));
|
||||
|
||||
Reference in New Issue
Block a user