Files
cannaiq/backend/src/services/crawl-rotator.ts
Kelly 4949b22457 feat(tasks): Refactor task workflow with payload/refresh separation
Major changes:
- Split crawl into payload_fetch (API → disk) and product_refresh (disk → DB)
- Add task chaining: store_discovery → product_discovery → payload_fetch → product_refresh
- Add payload storage utilities for gzipped JSON on filesystem
- Add /api/payloads endpoints for payload access and diffing
- Add DB-driven TaskScheduler with schedule persistence
- Track newDispensaryIds through discovery promotion for chaining
- Add stealth improvements: HTTP fingerprinting, proxy rotation enhancements
- Add Workers dashboard K8s scaling controls

New files:
- src/tasks/handlers/payload-fetch.ts - Fetches from API, saves to disk
- src/services/task-scheduler.ts - DB-driven schedule management
- src/utils/payload-storage.ts - Payload save/load utilities
- src/routes/payloads.ts - Payload API endpoints
- src/services/http-fingerprint.ts - Browser fingerprint generation
- docs/TASK_WORKFLOW_2024-12-10.md - Complete workflow documentation

Migrations:
- 078: Proxy consecutive 403 tracking
- 079: task_schedules table
- 080: raw_crawl_payloads table
- 081: payload column and last_fetch_at

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-10 22:15:35 -07:00

695 lines
21 KiB
TypeScript

/**
* Crawl Rotator - Proxy & User Agent Rotation for Crawlers
*
* Updated: 2025-12-10 per workflow-12102025.md
*
* KEY BEHAVIORS (per workflow-12102025.md):
* 1. Task determines WHAT work to do, proxy determines SESSION IDENTITY
* 2. Proxy location (timezone) sets Accept-Language headers (always English)
* 3. On 403: immediately get new IP, new fingerprint, retry
* 4. After 3 consecutive 403s on same proxy with different fingerprints → disable proxy
*
* USER-AGENT GENERATION (per workflow-12102025.md):
* - Device distribution: Mobile 62%, Desktop 36%, Tablet 2%
* - Browser whitelist: Chrome, Safari, Edge, Firefox only
* - UA sticks until IP rotates
* - Failure = alert admin + stop crawl (no fallback)
*
* Uses intoli/user-agents for realistic UA generation with daily-updated data.
*
* Canonical location: src/services/crawl-rotator.ts
*/
import { Pool } from 'pg';
import UserAgent from 'user-agents';
import {
HTTPFingerprint,
generateHTTPFingerprint,
BrowserType,
} from './http-fingerprint';
// ============================================================
// UA CONSTANTS (per workflow-12102025.md)
// ============================================================
/**
* Per workflow-12102025.md: Device category distribution (hardcoded)
* Mobile: 62%, Desktop: 36%, Tablet: 2%
*
* The values are percentage points and add up to 100.
* UserAgentRotator.rollDeviceCategory() draws a number in [0, 100) and walks
* the cumulative totals (mobile, then mobile + desktop); whatever remains
* falls through to tablet, so the tablet weight is implied by the other two
* rather than read directly.
*/
const DEVICE_WEIGHTS = {
mobile: 62,
desktop: 36,
tablet: 2,
} as const;
/**
* Per workflow-12102025.md: Browser whitelist
* Only Chrome (67%), Safari (20%), Edge (6%), Firefox (3%)
* Samsung Internet, Opera, and other niche browsers are filtered out
*
* The percentages above are informational only: this list is used purely as a
* membership test (UserAgentRotator.isAllowedBrowser), and no per-browser
* weighting is applied in this file.
*/
const ALLOWED_BROWSERS = ['Chrome', 'Safari', 'Edge', 'Firefox'] as const;
// ============================================================
// PROXY TYPES
// ============================================================
/**
* In-memory view of one row of the `proxies` table, as selected by
* ProxyRotator.loadProxies().
*/
export interface Proxy {
// Primary key of the `proxies` row; used for all lookups and UPDATEs.
id: number;
host: string;
port: number;
// Credentials are only used by getProxyUrl() when BOTH are present.
username?: string;
password?: string;
// Becomes the URL scheme in getProxyUrl().
protocol: 'http' | 'https' | 'socks5';
// Loaded from the `active` column (only active rows are selected).
isActive: boolean;
// Loaded from `last_tested_at`; markSuccess() sets it to the current time.
lastUsedAt: Date | null;
// Loaded from `failure_count`; incremented by markFailed().
failureCount: number;
// Always loaded as 0 - successes are only counted in memory by markSuccess().
successCount: number;
// Loaded from `response_time_ms`; markSuccess() blends new samples in at 20%.
avgResponseTimeMs: number | null;
// Loaded as COALESCE(max_connections, 1).
maxConnections: number;
/**
* Per workflow-12102025.md: Track consecutive 403s with different fingerprints.
* After 3 consecutive 403s → disable proxy (it's burned).
* Reset to 0 by markSuccess().
*/
consecutive403Count: number;
// Location info - determines session headers per workflow-12102025.md
city?: string;
state?: string;
country?: string;
countryCode?: string;
timezone?: string;
}
/**
* Snapshot returned by ProxyRotator.getStats(), computed from the proxies
* currently held in memory.
*/
export interface ProxyStats {
// Number of proxies currently loaded in the rotator.
totalProxies: number;
// NOTE(review): getStats() fills this with the SUM of maxConnections across
// loaded proxies (i.e. connection capacity), not a count of proxies - confirm
// that consumers expect that.
activeProxies: number;
// Loaded proxies with failureCount >= 5 or consecutive403Count >= 3.
blockedProxies: number;
// Mean of successCount / (successCount + failureCount) over proxies that have
// at least one recorded outcome; 0 when none have.
avgSuccessRate: number;
}
// ============================================================
// FINGERPRINT TYPE
// Per workflow-12102025.md: Full browser fingerprint from user-agents
// ============================================================
/**
* Browser identity produced by UserAgentRotator.rotate(). The UA, platform,
* screen and viewport values are copied from the generated user-agents record;
* the remaining fields are derived from that UA string.
*/
export interface BrowserFingerprint {
userAgent: string;
platform: string;
screenWidth: number;
screenHeight: number;
viewportWidth: number;
viewportHeight: number;
deviceCategory: string;
// Name derived from the UA string by UserAgentRotator.extractBrowserName().
browserName: string; // Per workflow-12102025.md: for session logging
// Derived headers for anti-detect
// rotate() always sets this to 'en-US,en;q=0.9'.
acceptLanguage: string;
// The three sec-ch-ua fields are only populated when the UA string carries a
// Chrome/ or Edg/ version token; buildSecChUa() returns nothing otherwise.
secChUa?: string;
secChUaPlatform?: string;
secChUaMobile?: string;
// Per workflow-12102025.md: HTTP Fingerprinting section
// Result of generateHTTPFingerprint() for browserName.
httpFingerprint: HTTPFingerprint;
}
/**
* Per workflow-12102025.md: Session log entry for debugging blocked sessions
*
* Overwritten on every UserAgentRotator.rotate() call, so it always describes
* the most recently generated fingerprint.
*/
export interface UASessionLog {
// Device category rolled for this session (mobile / desktop / tablet).
deviceCategory: string;
browserName: string;
userAgent: string;
// Value passed to rotate(); null when the caller did not supply one.
// CrawlRotator.rotateBoth() passes the proxy's host here.
proxyIp: string | null;
// Time at which rotate() produced this fingerprint.
sessionStartedAt: Date;
}
// ============================================================
// PROXY ROTATOR CLASS
// ============================================================
/**
 * Round-robin rotation over the active rows of the `proxies` table, plus
 * health bookkeeping (403 blocks, general failures, successes).
 *
 * Health state is tracked twice: in memory (so a burned proxy leaves the
 * rotation immediately) and in the database (so the decision survives a
 * restart). Database errors during bookkeeping are logged and swallowed on
 * purpose - a failed UPDATE must not abort a crawl.
 */
export class ProxyRotator {
  /** Consecutive 403 responses after which a proxy is considered burned. */
  private static readonly MAX_CONSECUTIVE_403 = 3;
  /** General (non-403) failures after which a proxy is deactivated. */
  private static readonly MAX_GENERAL_FAILURES = 5;

  private pool: Pool | null = null;
  private proxies: Proxy[] = [];
  private currentIndex: number = 0;
  private lastRotation: Date = new Date();

  /**
   * @param pool Optional pg pool. Without one, proxies cannot be loaded and
   *             health updates stay in memory only.
   */
  constructor(pool?: Pool) {
    this.pool = pool || null;
  }

  /** Attach (or replace) the database pool after construction. */
  setPool(pool: Pool): void {
    this.pool = pool;
  }

  /**
   * Load active proxies from the database, least-failed and least-recently
   * tested first. On query failure the in-memory list is emptied and a
   * warning is logged; the error is not rethrown.
   */
  async loadProxies(): Promise<void> {
    if (!this.pool) {
      console.warn('[ProxyRotator] No database pool configured');
      return;
    }
    try {
      const result = await this.pool.query<Proxy>(`
        SELECT
          id,
          host,
          port,
          username,
          password,
          protocol,
          active as "isActive",
          last_tested_at as "lastUsedAt",
          failure_count as "failureCount",
          0 as "successCount",
          response_time_ms as "avgResponseTimeMs",
          COALESCE(max_connections, 1) as "maxConnections",
          COALESCE(consecutive_403_count, 0) as "consecutive403Count",
          city,
          state,
          country,
          country_code as "countryCode",
          timezone
        FROM proxies
        WHERE active = true
        ORDER BY failure_count ASC, last_tested_at ASC NULLS FIRST
      `);
      this.proxies = result.rows;
      const totalCapacity = this.proxies.reduce((sum, p) => sum + p.maxConnections, 0);
      console.log(`[ProxyRotator] Loaded ${this.proxies.length} active proxies (${totalCapacity} max concurrent connections)`);
    } catch (error) {
      console.warn(`[ProxyRotator] Could not load proxies: ${error}`);
      this.proxies = [];
    }
    // A reload may return fewer rows than before; never leave the cursor
    // pointing past the end of the new list.
    this.clampCurrentIndex();
  }

  /**
   * Advance the rotation and return the newly selected proxy.
   * @returns The next proxy, or null when no proxies are loaded.
   */
  getNext(): Proxy | null {
    if (this.proxies.length === 0) return null;
    this.currentIndex = (this.currentIndex + 1) % this.proxies.length;
    this.lastRotation = new Date();
    return this.proxies[this.currentIndex];
  }

  /**
   * Get the currently selected proxy without rotating.
   * @returns The current proxy, or null when no proxies are loaded.
   */
  getCurrent(): Proxy | null {
    if (this.proxies.length === 0) return null;
    // `?? null` guards against a stale cursor so callers never see undefined.
    return this.proxies[this.currentIndex] ?? null;
  }

  /** Look up a loaded proxy by its database id. */
  getById(id: number): Proxy | null {
    return this.proxies.find(p => p.id === id) || null;
  }

  /**
   * Make a specific proxy the current one.
   * @returns false when the id is not among the loaded proxies.
   */
  setProxy(id: number): boolean {
    const index = this.proxies.findIndex(p => p.id === id);
    if (index === -1) return false;
    this.currentIndex = index;
    this.lastRotation = new Date();
    return true;
  }

  /**
   * Mark proxy as blocked (403 received)
   * Per workflow-12102025.md:
   * - Increment consecutive_403_count
   * - After 3 consecutive 403s with different fingerprints → disable proxy
   * - This is separate from general failures (timeouts, etc.)
   *
   * @returns true when this call removed the proxy from the in-memory
   *          rotation; false otherwise (including when the id is not loaded).
   */
  async markBlocked(proxyId: number): Promise<boolean> {
    const proxy = this.proxies.find(p => p.id === proxyId);
    let shouldDisable = false;
    if (proxy) {
      proxy.consecutive403Count++;
      // Per workflow-12102025.md: 3 consecutive 403s → proxy is burned
      if (proxy.consecutive403Count >= ProxyRotator.MAX_CONSECUTIVE_403) {
        proxy.isActive = false;
        this.removeFromRotation(proxyId);
        console.log(`[ProxyRotator] Proxy ${proxyId} DISABLED after ${proxy.consecutive403Count} consecutive 403s (burned)`);
        shouldDisable = true;
      } else {
        console.log(`[ProxyRotator] Proxy ${proxyId} blocked (403 #${proxy.consecutive403Count}/3)`);
      }
    }
    // Update database
    if (this.pool) {
      try {
        // Column references on the right-hand side of SET see the row's OLD
        // values, hence the explicit "+ 1" when comparing to the threshold.
        await this.pool.query(`
          UPDATE proxies
          SET
            consecutive_403_count = COALESCE(consecutive_403_count, 0) + 1,
            last_failure_at = NOW(),
            test_result = '403 Forbidden',
            active = CASE WHEN COALESCE(consecutive_403_count, 0) + 1 >= $2 THEN false ELSE active END,
            updated_at = NOW()
          WHERE id = $1
        `, [proxyId, ProxyRotator.MAX_CONSECUTIVE_403]);
      } catch (err) {
        console.error(`[ProxyRotator] Failed to update proxy ${proxyId}:`, err);
      }
    }
    return shouldDisable;
  }

  /**
   * Mark proxy as failed (general error - timeout, connection error, etc.)
   * Separate from 403 blocking per workflow-12102025.md
   *
   * @param error Short description stored in `test_result` ('failed' if omitted).
   */
  async markFailed(proxyId: number, error?: string): Promise<void> {
    const proxy = this.proxies.find(p => p.id === proxyId);
    if (proxy) {
      proxy.failureCount++;
      // Deactivate if too many general failures
      if (proxy.failureCount >= ProxyRotator.MAX_GENERAL_FAILURES) {
        proxy.isActive = false;
        this.removeFromRotation(proxyId);
        console.log(`[ProxyRotator] Proxy ${proxyId} deactivated after ${proxy.failureCount} general failures`);
      }
    }
    if (this.pool) {
      try {
        await this.pool.query(`
          UPDATE proxies
          SET
            failure_count = failure_count + 1,
            updated_at = NOW(),
            test_result = $2,
            active = CASE WHEN failure_count + 1 >= $3 THEN false ELSE active END
          WHERE id = $1
        `, [proxyId, error || 'failed', ProxyRotator.MAX_GENERAL_FAILURES]);
      } catch (err) {
        console.error(`[ProxyRotator] Failed to update proxy ${proxyId}:`, err);
      }
    }
  }

  /**
   * Mark proxy as successful - resets consecutive 403 count
   * Per workflow-12102025.md: successful request clears the 403 counter
   *
   * @param responseTimeMs Optional latency sample. When omitted (or not a
   *        finite number) the stored average is left untouched; otherwise it
   *        is blended in as 80% old / 20% new.
   */
  async markSuccess(proxyId: number, responseTimeMs?: number): Promise<void> {
    // Normalise once: 0 is a valid sample, undefined/NaN/Infinity are not.
    const sample = responseTimeMs !== undefined && Number.isFinite(responseTimeMs)
      ? responseTimeMs
      : null;
    const proxy = this.proxies.find(p => p.id === proxyId);
    if (proxy) {
      proxy.successCount++;
      proxy.consecutive403Count = 0; // Reset on success per workflow-12102025.md
      proxy.lastUsedAt = new Date();
      if (sample !== null) {
        // Null check (not truthiness) to mirror the SQL's IS NULL branch.
        proxy.avgResponseTimeMs = proxy.avgResponseTimeMs != null
          ? (proxy.avgResponseTimeMs * 0.8) + (sample * 0.2)
          : sample;
      }
    }
    if (this.pool) {
      try {
        // The first WHEN keeps the stored average when no sample was given;
        // without it the blend would evaluate to NULL and erase the column.
        await this.pool.query(`
          UPDATE proxies
          SET
            last_tested_at = NOW(),
            test_result = 'success',
            consecutive_403_count = 0,
            response_time_ms = CASE
              WHEN $2::integer IS NULL THEN response_time_ms
              WHEN response_time_ms IS NULL THEN $2::integer
              ELSE (response_time_ms * 0.8 + $2::integer * 0.2)::integer
            END,
            updated_at = NOW()
          WHERE id = $1
        `, [proxyId, sample === null ? null : Math.round(sample)]);
      } catch (err) {
        console.error(`[ProxyRotator] Failed to update proxy ${proxyId}:`, err);
      }
    }
  }

  /**
   * Build the proxy URL for an HTTP client.
   * Credentials are included only when both username and password are set,
   * and are percent-encoded so characters such as `@`, `:` or `/` cannot
   * corrupt the URL structure.
   */
  getProxyUrl(proxy: Proxy): string {
    const auth = proxy.username && proxy.password
      ? `${encodeURIComponent(proxy.username)}:${encodeURIComponent(proxy.password)}@`
      : '';
    return `${proxy.protocol}://${auth}${proxy.host}:${proxy.port}`;
  }

  /**
   * Summarise the in-memory proxy pool.
   * NOTE(review): `activeProxies` is the sum of maxConnections (connection
   * capacity), not a proxy count - kept as-is for existing consumers.
   */
  getStats(): ProxyStats {
    const totalProxies = this.proxies.length;
    const activeProxies = this.proxies.reduce((sum, p) => sum + p.maxConnections, 0);
    const blockedProxies = this.proxies.filter(
      p => p.failureCount >= ProxyRotator.MAX_GENERAL_FAILURES
        || p.consecutive403Count >= ProxyRotator.MAX_CONSECUTIVE_403,
    ).length;
    const successRates = this.proxies
      .filter(p => p.successCount + p.failureCount > 0)
      .map(p => p.successCount / (p.successCount + p.failureCount));
    const avgSuccessRate = successRates.length > 0
      ? successRates.reduce((a, b) => a + b, 0) / successRates.length
      : 0;
    return {
      totalProxies,
      activeProxies,
      blockedProxies,
      avgSuccessRate,
    };
  }

  /** True when at least one proxy is loaded in the rotation. */
  hasAvailableProxies(): boolean {
    return this.proxies.length > 0;
  }

  /**
   * Drop a proxy from the in-memory rotation while keeping the cursor on the
   * same proxy it pointed at before (or wrapping to the start if that proxy
   * was the one removed from the tail).
   */
  private removeFromRotation(proxyId: number): void {
    const removedIndex = this.proxies.findIndex(p => p.id === proxyId);
    if (removedIndex === -1) return;
    this.proxies = this.proxies.filter(p => p.id !== proxyId);
    // Everything after the removed slot shifted left by one.
    if (removedIndex < this.currentIndex) {
      this.currentIndex--;
    }
    this.clampCurrentIndex();
  }

  /** Reset the cursor to 0 whenever it falls outside the current list. */
  private clampCurrentIndex(): void {
    if (this.currentIndex < 0 || this.currentIndex >= this.proxies.length) {
      this.currentIndex = 0;
    }
  }
}
// ============================================================
// USER AGENT ROTATOR CLASS
// Per workflow-12102025.md: Uses intoli/user-agents for realistic fingerprints
// ============================================================
/**
 * Generates and holds the current browser fingerprint.
 * Per workflow-12102025.md: Uses intoli/user-agents for realistic fingerprints,
 * restricted to the whitelisted browsers and the hardcoded device distribution.
 *
 * A fingerprint is generated eagerly in the constructor, so constructing this
 * class throws if no whitelisted UA can be produced.
 */
export class UserAgentRotator {
  /**
   * Tokens of browsers that embed a `Chrome/` (or `Safari/`) token in their UA
   * but are NOT on the whitelist. They must be recognised before the generic
   * Chrome/Safari checks, otherwise they would pass as Chrome.
   */
  private static readonly NICHE_BROWSER_PATTERN =
    /SamsungBrowser\/|OPR\/|OPT\/|OPiOS\/|Opera[\/ ]|UCBrowser\/|YaBrowser\/|Vivaldi\/|MiuiBrowser\/|HuaweiBrowser\/|Silk\/|Whale\/|QQBrowser\/|DuckDuckGo\//;

  private currentFingerprint: BrowserFingerprint | null = null;
  private sessionLog: UASessionLog | null = null;

  constructor() {
    // Per workflow-12102025.md: Initialize with first fingerprint
    this.rotate();
  }

  /**
   * Per workflow-12102025.md: Roll device category based on distribution
   * Mobile: 62%, Desktop: 36%, Tablet: 2%
   */
  private rollDeviceCategory(): 'mobile' | 'desktop' | 'tablet' {
    const roll = Math.random() * 100;
    if (roll < DEVICE_WEIGHTS.mobile) {
      return 'mobile';
    } else if (roll < DEVICE_WEIGHTS.mobile + DEVICE_WEIGHTS.desktop) {
      return 'desktop';
    } else {
      return 'tablet';
    }
  }

  /**
   * Per workflow-12102025.md: Extract browser name from UA string
   *
   * @returns 'Edge', 'Firefox', 'Safari' or 'Chrome'; 'Unknown' for anything
   *          else, including niche Chromium/WebKit browsers (Samsung Internet,
   *          Opera, ...) that would otherwise look like Chrome.
   */
  private extractBrowserName(userAgent: string): string {
    // Must come first: these UAs also contain "Chrome/" and/or "Safari/".
    if (UserAgentRotator.NICHE_BROWSER_PATTERN.test(userAgent)) return 'Unknown';
    if (userAgent.includes('Edg/')) return 'Edge';
    if (userAgent.includes('Firefox/')) return 'Firefox';
    if (userAgent.includes('Safari/') && !userAgent.includes('Chrome/')) return 'Safari';
    if (userAgent.includes('Chrome/')) return 'Chrome';
    return 'Unknown';
  }

  /**
   * Per workflow-12102025.md: Check if browser is in whitelist
   */
  private isAllowedBrowser(userAgent: string): boolean {
    const browserName = this.extractBrowserName(userAgent);
    return (ALLOWED_BROWSERS as readonly string[]).includes(browserName);
  }

  /**
   * Generate a new random fingerprint and make it the current one.
   * Per workflow-12102025.md:
   * - Roll device category (62/36/2)
   * - Filter to top 4 browsers only
   * - Failure = alert admin + stop (no fallback)
   *
   * @param proxyIp Recorded in the session log only; does not influence the
   *                generated fingerprint.
   * @throws Error when no whitelisted browser UA is produced within 50 draws.
   */
  rotate(proxyIp?: string): BrowserFingerprint {
    // Per workflow-12102025.md: Roll device category
    const deviceCategory = this.rollDeviceCategory();
    // Per workflow-12102025.md: Generate UA filtered to device category
    const generator = new UserAgent({ deviceCategory });
    // Per workflow-12102025.md: Try to get an allowed browser (max 50 attempts)
    let ua: ReturnType<typeof generator>;
    let attempts = 0;
    const maxAttempts = 50;
    do {
      ua = generator();
      attempts++;
    } while (!this.isAllowedBrowser(ua.data.userAgent) && attempts < maxAttempts);
    // Per workflow-12102025.md: If we can't get allowed browser, this is a failure
    if (!this.isAllowedBrowser(ua.data.userAgent)) {
      const errorMsg = `[UserAgentRotator] CRITICAL: Failed to generate allowed browser after ${maxAttempts} attempts. Device: ${deviceCategory}. Last UA: ${ua.data.userAgent}`;
      console.error(errorMsg);
      // Per workflow-12102025.md: Alert admin + stop crawl
      // TODO: Post alert to admin dashboard
      throw new Error(errorMsg);
    }
    const data = ua.data;
    const browserName = this.extractBrowserName(data.userAgent);
    // Build sec-ch-ua headers from user agent string
    const secChUa = this.buildSecChUa(data.userAgent, deviceCategory);
    // Per workflow-12102025.md: HTTP Fingerprinting - generate full HTTP fingerprint
    const httpFingerprint = generateHTTPFingerprint(browserName as BrowserType);
    this.currentFingerprint = {
      userAgent: data.userAgent,
      platform: data.platform,
      screenWidth: data.screenWidth,
      screenHeight: data.screenHeight,
      viewportWidth: data.viewportWidth,
      viewportHeight: data.viewportHeight,
      deviceCategory: data.deviceCategory,
      browserName, // Per workflow-12102025.md: for session logging
      // Per workflow-12102025.md: always English
      acceptLanguage: 'en-US,en;q=0.9',
      ...secChUa,
      // Per workflow-12102025.md: HTTP Fingerprinting section
      httpFingerprint,
    };
    // Per workflow-12102025.md: Log session data
    this.sessionLog = {
      deviceCategory,
      browserName,
      userAgent: data.userAgent,
      proxyIp: proxyIp || null,
      sessionStartedAt: new Date(),
    };
    console.log(`[UserAgentRotator] New fingerprint: device=${deviceCategory}, browser=${browserName}, UA=${data.userAgent.slice(0, 50)}...`);
    return this.currentFingerprint;
  }

  /**
   * Get current fingerprint without rotating (generates one if none exists).
   */
  getCurrent(): BrowserFingerprint {
    if (!this.currentFingerprint) {
      return this.rotate();
    }
    return this.currentFingerprint;
  }

  /**
   * Get a random fingerprint (rotates and returns)
   */
  getRandom(proxyIp?: string): BrowserFingerprint {
    return this.rotate(proxyIp);
  }

  /**
   * Per workflow-12102025.md: Get session log for debugging
   */
  getSessionLog(): UASessionLog | null {
    return this.sessionLog;
  }

  /**
   * Map a UA string to the quoted value used for the sec-ch-ua-platform header.
   * Order matters: Android UAs contain "Linux" and iOS UAs contain
   * "like Mac OS X", so the more specific tokens are tested first.
   */
  private detectClientHintPlatform(userAgent: string): string {
    if (userAgent.includes('Windows')) return '"Windows"';
    if (userAgent.includes('Android')) return '"Android"';
    if (userAgent.includes('iPhone') || userAgent.includes('iPad')) return '"iOS"';
    if (userAgent.includes('CrOS')) return '"Chrome OS"';
    if (userAgent.includes('Mac')) return '"macOS"';
    return '"Linux"';
  }

  /**
   * Build sec-ch-ua headers from user agent string
   * Per workflow-12102025.md: Include mobile indicator based on device category
   *
   * @returns The three client-hint header values for Edge/Chrome UAs, or an
   *          empty object for UAs without an `Edg/` or `Chrome/` version token
   *          (Firefox and Safari do not send sec-ch-ua).
   */
  private buildSecChUa(userAgent: string, deviceCategory: string): { secChUa?: string; secChUaPlatform?: string; secChUaMobile?: string } {
    const isMobile = deviceCategory === 'mobile' || deviceCategory === 'tablet';
    const secChUaMobile = isMobile ? '?1' : '?0';
    // Edge UAs also carry a Chrome/ token, so Edge must be checked first.
    const edgeMatch = userAgent.match(/Edg\/(\d+)/);
    if (edgeMatch) {
      const version = edgeMatch[1];
      return {
        secChUa: `"Microsoft Edge";v="${version}", "Chromium";v="${version}", "Not_A Brand";v="24"`,
        secChUaPlatform: this.detectClientHintPlatform(userAgent),
        secChUaMobile,
      };
    }
    const chromeMatch = userAgent.match(/Chrome\/(\d+)/);
    if (chromeMatch) {
      const version = chromeMatch[1];
      return {
        secChUa: `"Google Chrome";v="${version}", "Chromium";v="${version}", "Not_A Brand";v="24"`,
        secChUaPlatform: this.detectClientHintPlatform(userAgent),
        secChUaMobile,
      };
    }
    // Firefox/Safari don't send sec-ch-ua
    return {};
  }

  /** Always 1: user agents are generated dynamically rather than drawn from a fixed list. */
  getCount(): number {
    return 1; // user-agents generates dynamically
  }
}
// ============================================================
// COMBINED ROTATOR
// Per workflow-12102025.md: Coordinates proxy + fingerprint rotation
// ============================================================
/**
 * Facade that pairs one ProxyRotator with one UserAgentRotator so callers can
 * rotate session identity (IP and/or fingerprint) and report request outcomes
 * against whichever proxy is currently selected.
 */
export class CrawlRotator {
  public proxy: ProxyRotator;
  public userAgent: UserAgentRotator;

  /**
   * @param pool Optional pg pool, forwarded to the internal ProxyRotator.
   */
  constructor(pool?: Pool) {
    this.proxy = new ProxyRotator(pool);
    this.userAgent = new UserAgentRotator();
  }

  /** Loads the proxy list; must complete before rotation is useful. */
  async initialize(): Promise<void> {
    await this.proxy.loadProxies();
  }

  /**
   * Switch to the next IP, keeping the current fingerprint.
   */
  rotateProxy(): Proxy | null {
    const nextProxy = this.proxy.getNext();
    return nextProxy;
  }

  /**
   * Generate a fresh fingerprint (UA, screen size, ...), keeping the current IP.
   */
  rotateFingerprint(): BrowserFingerprint {
    const freshFingerprint = this.userAgent.rotate();
    return freshFingerprint;
  }

  /**
   * Switch to the next IP and generate a fresh fingerprint in one step.
   * Per workflow-12102025.md: called on 403 for fresh identity.
   * The new proxy's host is handed to the UA rotator for its session log.
   */
  rotateBoth(): { proxy: Proxy | null; fingerprint: BrowserFingerprint } {
    const nextProxy = this.proxy.getNext();
    let sessionIp: string | undefined;
    if (nextProxy) {
      sessionIp = nextProxy.host;
    }
    const fingerprint = this.userAgent.rotate(sessionIp);
    return { proxy: nextProxy, fingerprint };
  }

  /**
   * Read the active proxy and fingerprint; nothing is rotated.
   */
  getCurrent(): { proxy: Proxy | null; fingerprint: BrowserFingerprint } {
    const proxy = this.proxy.getCurrent();
    const fingerprint = this.userAgent.getCurrent();
    return { proxy, fingerprint };
  }

  /**
   * Report a successful request on the active proxy.
   * Per workflow-12102025.md: resets consecutive 403 count.
   * No-op when there is no active proxy.
   */
  async recordSuccess(responseTimeMs?: number): Promise<void> {
    const active = this.proxy.getCurrent();
    if (!active) return;
    await this.proxy.markSuccess(active.id, responseTimeMs);
  }

  /**
   * Report a 403 on the active proxy.
   * Per workflow-12102025.md: increments consecutive_403_count, disables after 3.
   * @returns true if the proxy was disabled; false otherwise or when there is
   *          no active proxy.
   */
  async recordBlock(): Promise<boolean> {
    const active = this.proxy.getCurrent();
    if (!active) return false;
    return await this.proxy.markBlocked(active.id);
  }

  /**
   * Report a non-403 failure (timeout, connection error, ...) on the active proxy.
   * No-op when there is no active proxy.
   */
  async recordFailure(error?: string): Promise<void> {
    const active = this.proxy.getCurrent();
    if (!active) return;
    await this.proxy.markFailed(active.id, error);
  }

  /**
   * Location of the active proxy.
   * Per workflow-12102025.md: proxy location determines session headers.
   * `isRotating` is true when the proxy allows more than one connection.
   * @returns null when there is no active proxy.
   */
  getProxyLocation(): { city?: string; state?: string; country?: string; timezone?: string; isRotating: boolean } | null {
    const active = this.proxy.getCurrent();
    if (!active) return null;
    const { city, state, country, timezone, maxConnections } = active;
    return { city, state, country, timezone, isRotating: maxConnections > 1 };
  }

  /**
   * Timezone of the active proxy, if any.
   * Per workflow-12102025.md: used for Accept-Language header.
   */
  getProxyTimezone(): string | undefined {
    const active = this.proxy.getCurrent();
    return active ? active.timezone : undefined;
  }
}
// ============================================================
// SINGLETON INSTANCES
// ============================================================
// Module-level shared instances, created as a side effect of importing this file.
// - None of them is given a database pool here; loadProxies() only warns and
//   returns until setPool() has been called on the relevant ProxyRotator.
// - Constructing a UserAgentRotator immediately generates a fingerprint, and
//   that generation throws when no whitelisted browser UA can be produced.
// - crawlRotator builds its OWN ProxyRotator and UserAgentRotator internally,
//   so crawlRotator.proxy / crawlRotator.userAgent are different objects from
//   the standalone proxyRotator / userAgentRotator exported below.
export const proxyRotator = new ProxyRotator();
export const userAgentRotator = new UserAgentRotator();
export const crawlRotator = new CrawlRotator();