/**
 * Worker Session Service
 *
 * Manages the worker session lifecycle:
 * 1. Claim up to 6 tasks for same geo
 * 2. Get Evomi proxy for that geo
 * 3. Check IP availability (not in use, not in cooldown)
 * 4. Lock IP/fingerprint to worker
 * 5. Track task completion
 * 6. Retire session after 6 tasks (8hr cooldown)
 */

import { pool } from '../db/pool';
import { buildEvomiProxyUrl, getEvomiConfig } from './crawl-rotator';

/** Row shape this module expects back from `claim_tasks_batch`. */
export interface ClaimedTask {
  task_id: number;
  role: string;
  dispensary_id: number;
  dispensary_name: string;
  city: string | null;
  state_code: string;
  platform: string;
  method: string | null;
}

/** Row shape this module expects from `lock_worker_session` / `get_worker_session`. */
export interface WorkerSession {
  id: number;
  ip_address: string;
  fingerprint_hash: string;
  // Opaque JSON payload; its schema is not defined in this module, so it is typed loosely.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  fingerprint_data: Record<string, any> | null;
  state_code: string;
  city: string | null;
  worker_id: string;
  status: 'active' | 'cooldown' | 'available';
  tasks_claimed: number;
  tasks_completed: number;
  tasks_failed: number;
  max_tasks: number;
  locked_at: Date;
}

/** Everything a worker needs to start processing: the locked session, its tasks, and the proxy URL. */
export interface SessionWithTasks {
  session: WorkerSession;
  tasks: ClaimedTask[];
  proxyUrl: string;
}

/** Batch size passed to `claim_tasks_batch`. */
const MAX_TASKS_PER_SESSION = 6;
/** How many IPs to try before giving up. */
const MAX_IP_ATTEMPTS = 10;
/**
 * Only used for log output in this module.
 * NOTE(review): the cooldown itself is presumably enforced by the database functions — confirm.
 */
const COOLDOWN_HOURS = 8;

/** Minimal client surface needed to roll a transaction back. */
interface RollbackCapable {
  query(sql: string): Promise<unknown>;
}

/** Render an unknown thrown value as a log-friendly string. */
function describeError(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Issue ROLLBACK without letting a rollback failure replace the error that
 * triggered it. A failing rollback is logged and otherwise ignored so the
 * caller can rethrow the original error.
 */
async function rollbackQuietly(client: RollbackCapable): Promise<void> {
  try {
    await client.query('ROLLBACK');
  } catch (rollbackErr: unknown) {
    console.error(`[WorkerSession] ROLLBACK failed: ${describeError(rollbackErr)}`);
  }
}

/**
 * Claim tasks and establish a session for a worker.
 * This is the main entry point for the new worker flow.
 *
 * Flow:
 * 1. Claim up to 6 tasks for same geo
 * 2. Get Evomi proxy for that geo
 * 3. Try IPs until we find one that's available
 * 4. Lock IP to this worker
 * 5. Return session + tasks + proxy URL
 *
 * All database calls run on one client inside a single transaction.
 *
 * @param workerId - Identifier of the worker claiming the session.
 * @param role - Optional role filter; a missing or empty value is sent as NULL.
 * @returns The locked session, its tasks and proxy URL, or `null` when no
 *   tasks were claimed or no IP could be locked within MAX_IP_ATTEMPTS tries.
 * @throws Error when the Evomi proxy is not enabled, or any database error
 *   (the transaction is rolled back before rethrowing).
 */
export async function claimSessionWithTasks(
  workerId: string,
  role?: string
): Promise<SessionWithTasks | null> {
  const client = await pool.connect();

  try {
    await client.query('BEGIN');

    // Step 1: Claim up to 6 tasks for same geo
    const { rows: tasks } = await client.query(
      `SELECT * FROM claim_tasks_batch($1, $2, $3)`,
      [workerId, MAX_TASKS_PER_SESSION, role || null]
    );

    if (tasks.length === 0) {
      await client.query('ROLLBACK');
      console.log(`[WorkerSession] No pending tasks available for ${workerId}`);
      return null;
    }

    // Get geo from first claimed task (all same geo)
    const { state_code, city } = tasks[0];
    console.log(`[WorkerSession] ${workerId} claimed ${tasks.length} tasks for ${city || 'any'}, ${state_code}`);

    // Step 2: Get Evomi proxy for this geo
    const evomiConfig = getEvomiConfig();
    if (!evomiConfig.enabled) {
      // The catch block below performs the single ROLLBACK for this path.
      throw new Error('Evomi proxy not configured');
    }

    // Step 3: Try to get an available IP
    let session: WorkerSession | null = null;
    let proxyUrl: string | null = null;

    for (let attempt = 0; attempt < MAX_IP_ATTEMPTS; attempt++) {
      // Build proxy URL with unique session ID for each attempt
      const sessionId = `${workerId}-${Date.now()}-${attempt}`;
      const proxyResult = buildEvomiProxyUrl(state_code, sessionId, city || undefined);

      if (!proxyResult) {
        console.warn(`[WorkerSession] Failed to build proxy URL for ${state_code}`);
        continue;
      }

      // Resolve the exit IP by making a real request through the proxy.
      const testIp = await getProxyIp(proxyResult.url);

      if (!testIp) {
        console.warn(`[WorkerSession] Failed to get IP from proxy attempt ${attempt + 1}`);
        continue;
      }

      // Step 4: Try to lock this IP
      const { rows } = await client.query(
        `SELECT * FROM lock_worker_session($1, $2, $3, $4)`,
        [workerId, testIp, state_code, city]
      );

      // A returned row with an id is treated as a successful lock.
      if (rows[0]?.id) {
        session = rows[0];
        proxyUrl = proxyResult.url;
        console.log(`[WorkerSession] ${workerId} locked IP ${testIp} for ${city || 'any'}, ${state_code}`);
        break;
      }

      console.log(`[WorkerSession] IP ${testIp} not available (in use or cooldown), trying next...`);
    }

    if (!session || !proxyUrl) {
      // Release claimed tasks back to pool, then discard the transaction.
      await client.query(`SELECT release_claimed_tasks($1)`, [workerId]);
      await client.query('ROLLBACK');
      console.error(`[WorkerSession] ${workerId} failed to get available IP after ${MAX_IP_ATTEMPTS} attempts`);
      return null;
    }

    // Update session with task count
    await client.query(
      `SELECT session_task_claimed($1, $2)`,
      [workerId, tasks.length]
    );

    await client.query('COMMIT');

    return {
      session,
      tasks,
      proxyUrl,
    };
  } catch (err) {
    await rollbackQuietly(client);
    throw err;
  } finally {
    client.release();
  }
}

/**
 * Get the real IP address from a proxy by making a test request.
 *
 * Sends a GET to api.ipify.org through the given proxy with a 10 second
 * timeout. Failures are logged and reported as `null` rather than thrown.
 *
 * @param proxyUrl - Proxy URL to route the request through.
 * @returns The IP reported by the service, or `null` if the request failed
 *   or the response carried no usable `ip` field.
 */
async function getProxyIp(proxyUrl: string): Promise<string | null> {
  try {
    // Loaded lazily so the modules are only required when an IP check runs.
    const { default: axios } = await import('axios');
    const { HttpsProxyAgent } = await import('https-proxy-agent');

    const agent = new HttpsProxyAgent(proxyUrl);
    const response = await axios.get('https://api.ipify.org?format=json', {
      httpAgent: agent,
      httpsAgent: agent,
      timeout: 10000,
    });

    const ip: unknown = response.data?.ip;
    return typeof ip === 'string' && ip.length > 0 ? ip : null;
  } catch (err: unknown) {
    console.warn(`[WorkerSession] IP check failed: ${describeError(err)}`);
    return null;
  }
}

/**
 * Mark a task as started (running).
 *
 * @returns Whether `start_task` reported success; `false` if no row came back.
 */
export async function startTask(taskId: number, workerId: string): Promise<boolean> {
  const { rows } = await pool.query(
    `SELECT start_task($1, $2) as success`,
    [taskId, workerId]
  );
  return Boolean(rows[0]?.success);
}

/**
 * Mark a task as completed.
 *
 * Runs `complete_task` and, only when it reports success, bumps the session
 * counter via `session_task_completed` in the same transaction.
 *
 * @param result - Optional result payload; serialized with JSON.stringify, or NULL when absent.
 * @returns Whether `complete_task` reported success.
 * @throws Any database error, after rolling the transaction back.
 */
export async function completeTask(
  taskId: number,
  workerId: string,
  // Arbitrary JSON-serializable payload; typed loosely so any object shape is accepted.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  result?: Record<string, any>
): Promise<boolean> {
  const client = await pool.connect();

  try {
    await client.query('BEGIN');

    // Complete the task
    const { rows } = await client.query(
      `SELECT complete_task($1, $2, $3) as success`,
      [taskId, workerId, result ? JSON.stringify(result) : null]
    );
    const succeeded = Boolean(rows[0]?.success);

    if (succeeded) {
      // Update session counter
      await client.query(`SELECT session_task_completed($1)`, [workerId]);
    }

    await client.query('COMMIT');
    return succeeded;
  } catch (err) {
    await rollbackQuietly(client);
    throw err;
  } finally {
    client.release();
  }
}

/**
 * Mark a task as failed (returns to pending for retry).
 *
 * Runs `fail_task` and, only when it reports success, bumps the session
 * counter via `session_task_failed` in the same transaction.
 *
 * @param error - Optional error text; a missing or empty value is sent as NULL.
 * @returns Whether `fail_task` reported success.
 * @throws Any database error, after rolling the transaction back.
 */
export async function failTask(
  taskId: number,
  workerId: string,
  error?: string
): Promise<boolean> {
  const client = await pool.connect();

  try {
    await client.query('BEGIN');

    // Fail the task (may return to pending or mark as permanently failed)
    const { rows } = await client.query(
      `SELECT fail_task($1, $2, $3) as success`,
      [taskId, workerId, error || null]
    );
    const succeeded = Boolean(rows[0]?.success);

    if (succeeded) {
      // Update session counter
      await client.query(`SELECT session_task_failed($1)`, [workerId]);
    }

    await client.query('COMMIT');
    return succeeded;
  } catch (err) {
    await rollbackQuietly(client);
    throw err;
  } finally {
    client.release();
  }
}

/**
 * Get current session for a worker.
 *
 * @returns The first row from `get_worker_session`, or `null` if there is none.
 */
export async function getWorkerSession(workerId: string): Promise<WorkerSession | null> {
  const { rows } = await pool.query(
    `SELECT * FROM get_worker_session($1)`,
    [workerId]
  );
  return (rows[0] as WorkerSession | undefined) || null;
}

/**
 * Check if worker session is complete, i.e. completed + failed tasks have
 * reached the number of tasks claimed. A worker with no session counts as complete.
 */
export async function isSessionComplete(workerId: string): Promise<boolean> {
  const session = await getWorkerSession(workerId);
  if (!session) return true; // No session = complete

  const totalDone = session.tasks_completed + session.tasks_failed;
  return totalDone >= session.tasks_claimed;
}

/**
 * Retire a worker's session (start 8hr cooldown).
 *
 * @returns Whether `retire_worker_session` reported success. The "retired"
 *   log line is only written in that case.
 */
export async function retireSession(workerId: string): Promise<boolean> {
  const { rows } = await pool.query(
    `SELECT retire_worker_session($1) as success`,
    [workerId]
  );
  const succeeded = Boolean(rows[0]?.success);
  if (succeeded) {
    console.log(`[WorkerSession] ${workerId} session retired, IP in ${COOLDOWN_HOURS}hr cooldown`);
  }
  return succeeded;
}

/**
 * Release any claimed tasks back to pool (for worker shutdown).
 *
 * @returns The count reported by `release_claimed_tasks`, normalized to a
 *   number (0 when nothing was reported).
 */
export async function releaseClaimedTasks(workerId: string): Promise<number> {
  const { rows } = await pool.query(
    `SELECT release_claimed_tasks($1) as count`,
    [workerId]
  );
  // The driver may hand back a numeric string depending on the SQL return type.
  const count = Number(rows[0]?.count) || 0;
  if (count > 0) {
    console.log(`[WorkerSession] Released ${count} claimed tasks for ${workerId}`);
  }
  return count;
}

/**
 * Cleanup: release expired sessions from cooldown.
 *
 * @returns The count reported by `release_expired_sessions`, normalized to a number.
 */
export async function releaseExpiredSessions(): Promise<number> {
  const { rows } = await pool.query(
    `SELECT release_expired_sessions() as count`
  );
  return Number(rows[0]?.count) || 0;
}

/**
 * Get session stats for monitoring.
 *
 * @returns Counts of `worker_sessions` rows per status plus the number of
 *   distinct IP addresses; every field is 0 when the query returns no row.
 */
export async function getSessionStats(): Promise<{
  active: number;
  cooldown: number;
  available: number;
  uniqueIps: number;
}> {
  const { rows } = await pool.query(`
    SELECT
      COUNT(*) FILTER (WHERE status = 'active') as active,
      COUNT(*) FILTER (WHERE status = 'cooldown') as cooldown,
      COUNT(*) FILTER (WHERE status = 'available') as available,
      COUNT(DISTINCT ip_address) as unique_ips
    FROM worker_sessions
  `);

  // Aggregate counts are parsed explicitly as base-10 integers.
  const toCount = (value: unknown): number => Number.parseInt(String(value || '0'), 10);
  const stats = rows[0];

  return {
    active: toCount(stats?.active),
    cooldown: toCount(stats?.cooldown),
    available: toCount(stats?.available),
    uniqueIps: toCount(stats?.unique_ips),
  };
}