Files
cannaiq/backend/src/services/worker-session.ts
Kelly 4cb4e1c502 feat(workers): Session pool system - claim tasks first, then get IP
New worker flow (enabled via USE_SESSION_POOL=true):
1. Worker claims up to 6 tasks for same geo (atomically marked claimed)
2. Gets Evomi proxy for that geo
3. Checks IP availability (not in use, not in 8hr cooldown)
4. Locks IP exclusively to this worker
5. Runs preflight with locked IP
6. Executes tasks (3 concurrent)
7. After 6 tasks, retires session (8hr IP cooldown)
8. Repeats with new IP

Key files:
- migrations/112_worker_session_pool.sql: Session table + atomic claiming
- services/worker-session.ts: Session lifecycle management
- tasks/task-worker.ts: sessionPoolMainLoop() with new flow
- services/crawl-rotator.ts: setFixedProxy() for session locking

Failed tasks return to pending for retry by another worker.
No two workers can share same IP simultaneously.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-13 22:54:45 -07:00

348 lines
9.4 KiB
TypeScript

/**
* Worker Session Service
*
* Manages the worker session lifecycle:
* 1. Claim up to 6 tasks for same geo
* 2. Get Evomi proxy for that geo
* 3. Check IP availability (not in use, not in cooldown)
* 4. Lock IP/fingerprint to worker
* 5. Track task completion
* 6. Retire session after 6 tasks (8hr cooldown)
*/
import { pool } from '../db/pool';
import { buildEvomiProxyUrl, getEvomiConfig } from './crawl-rotator';
/**
 * One task row as returned by the `claim_tasks_batch` query issued in
 * `claimSessionWithTasks`.
 */
export interface ClaimedTask {
/** Task identifier; presumably the id later passed to startTask/completeTask/failTask — confirm. */
task_id: number;
/** Task role. `claimSessionWithTasks` forwards an optional role to the claim query. */
role: string;
/** Dispensary the task targets (semantics defined by the SQL side, not visible here). */
dispensary_id: number;
/** Display name of that dispensary. */
dispensary_name: string;
/** City used for proxy geo-targeting; when null no city is passed to the proxy builder. */
city: string | null;
/** State code used for proxy geo-targeting and session locking. */
state_code: string;
/** Platform name for the task — NOTE(review): allowed values are not visible in this file. */
platform: string;
/** Method for the task, if any — NOTE(review): allowed values are not visible in this file. */
method: string | null;
}
/**
 * A worker session row as returned by `lock_worker_session` and
 * `get_worker_session`.
 */
export interface WorkerSession {
/** Session row identifier; a truthy id is what `claimSessionWithTasks` treats as a successful lock. */
id: number;
/** IP address tied to this session; presumably the probed proxy exit IP passed to `lock_worker_session` — confirm. */
ip_address: string;
/** Fingerprint hash — NOTE(review): how it is derived is not visible in this file. */
fingerprint_hash: string;
/** Fingerprint payload, if stored — NOTE(review): schema is not visible in this file. */
fingerprint_data: Record<string, unknown> | null;
/** State code the session was locked for. */
state_code: string;
/** City the session was locked for, or null. */
city: string | null;
/** Worker that holds the session. */
worker_id: string;
/** Lifecycle state; `getSessionStats` counts rows by these three values. */
status: 'active' | 'cooldown' | 'available';
/** Number of tasks claimed in this session; `isSessionComplete` compares done tasks against it. */
tasks_claimed: number;
/** Number of tasks completed; summed with `tasks_failed` by `isSessionComplete`. */
tasks_completed: number;
/** Number of tasks failed; summed with `tasks_completed` by `isSessionComplete`. */
tasks_failed: number;
/** Task cap for the session — presumably set by the SQL side; not read in this file. */
max_tasks: number;
/** When the session was locked — presumably set by `lock_worker_session`; not read in this file. */
locked_at: Date;
}
/**
 * Result of a successful `claimSessionWithTasks` call.
 */
export interface SessionWithTasks {
/** The session row returned by `lock_worker_session` for the locked IP. */
session: WorkerSession;
/** The tasks claimed for this worker by `claim_tasks_batch`. */
tasks: ClaimedTask[];
/** URL of the Evomi proxy session whose exit IP was locked. */
proxyUrl: string;
}
// Batch size passed to claim_tasks_batch: the most tasks one session may claim.
const MAX_TASKS_PER_SESSION = 6;
// Upper bound on proxy sessions probed per claim before the claim is abandoned.
const MAX_IP_ATTEMPTS = 10; // How many IPs to try before giving up
// Only used in the retireSession log message in this file.
// NOTE(review): the real cooldown is presumably enforced in SQL — confirm the two values agree.
const COOLDOWN_HOURS = 8;
/**
 * Claim a batch of tasks for a worker and lock an exclusive proxy IP for them.
 *
 * All database work happens inside one transaction on a dedicated client:
 * 1. `claim_tasks_batch` claims up to MAX_TASKS_PER_SESSION tasks
 *    (optionally filtered by `role`).
 * 2. The state/city of the first claimed task selects the proxy geo.
 * 3. Up to MAX_IP_ATTEMPTS Evomi proxy sessions are probed for their exit IP.
 * 4. `lock_worker_session` is tried for each probed IP until one returns a row
 *    with an id.
 * 5. `session_task_claimed` records how many tasks were claimed, then COMMIT.
 *
 * NOTE(review): the transaction stays open while the proxy probes run (each
 * probe can take up to its HTTP timeout). Confirm that holding the claim
 * open for that long is acceptable for other workers.
 *
 * @param workerId - Identifier of the worker claiming tasks.
 * @param role - Optional role filter; an empty or missing value is sent as NULL.
 * @returns The locked session, the claimed tasks and the proxy URL, or `null`
 *          when there were no tasks to claim or no IP could be locked.
 * @throws Error with message 'Evomi proxy not configured' when the Evomi
 *         config is disabled; any query error is rethrown after ROLLBACK.
 */
export async function claimSessionWithTasks(
  workerId: string,
  role?: string
): Promise<SessionWithTasks | null> {
  const client = await pool.connect();
  try {
    await client.query('BEGIN');

    // Step 1: Claim up to MAX_TASKS_PER_SESSION tasks for the same geo
    const { rows: tasks } = await client.query<ClaimedTask>(
      `SELECT * FROM claim_tasks_batch($1, $2, $3)`,
      [workerId, MAX_TASKS_PER_SESSION, role || null]
    );
    if (tasks.length === 0) {
      await client.query('ROLLBACK');
      console.log(`[WorkerSession] No pending tasks available for ${workerId}`);
      return null;
    }

    // Geo comes from the first claimed task (the batch is expected to share one geo)
    const { state_code, city } = tasks[0];
    console.log(`[WorkerSession] ${workerId} claimed ${tasks.length} tasks for ${city || 'any'}, ${state_code}`);

    // Step 2: Evomi must be configured before any proxy can be built
    const evomiConfig = getEvomiConfig();
    if (!evomiConfig.enabled) {
      // No explicit ROLLBACK here: the catch block below issues the single
      // ROLLBACK for this transaction. Rolling back twice makes PostgreSQL
      // warn that no transaction is in progress.
      throw new Error('Evomi proxy not configured');
    }

    // Step 3: Probe proxy sessions until one yields an IP we can lock
    let session: WorkerSession | null = null;
    let proxyUrl: string | null = null;
    for (let attempt = 0; attempt < MAX_IP_ATTEMPTS; attempt++) {
      // A fresh session ID per attempt asks the proxy for a different session
      const sessionId = `${workerId}-${Date.now()}-${attempt}`;
      const proxyResult = buildEvomiProxyUrl(state_code, sessionId, city || undefined);
      if (!proxyResult) {
        console.warn(`[WorkerSession] Failed to build proxy URL for ${state_code}`);
        continue;
      }

      // Resolve the real exit IP by making a request through the proxy
      const testIp = await getProxyIp(proxyResult.url);
      if (!testIp) {
        console.warn(`[WorkerSession] Failed to get IP from proxy attempt ${attempt + 1}`);
        continue;
      }

      // Step 4: Try to lock this IP; a returned row with an id means success
      const { rows } = await client.query<WorkerSession>(
        `SELECT * FROM lock_worker_session($1, $2, $3, $4)`,
        [workerId, testIp, state_code, city]
      );
      if (rows[0]?.id) {
        session = rows[0];
        proxyUrl = proxyResult.url;
        console.log(`[WorkerSession] ${workerId} locked IP ${testIp} for ${city || 'any'}, ${state_code}`);
        break;
      }
      console.log(`[WorkerSession] IP ${testIp} not available (in use or cooldown), trying next...`);
    }

    if (!session || !proxyUrl) {
      // Release claimed tasks back to pool.
      // NOTE(review): the ROLLBACK that follows reverts everything done in
      // this transaction, the claim included, so this explicit release looks
      // redundant. Confirm against the SQL functions before removing it.
      await client.query(`SELECT release_claimed_tasks($1)`, [workerId]);
      await client.query('ROLLBACK');
      console.error(`[WorkerSession] ${workerId} failed to get available IP after ${MAX_IP_ATTEMPTS} attempts`);
      return null;
    }

    // Step 5: Record the claimed task count on the session, then commit
    await client.query(
      `SELECT session_task_claimed($1, $2)`,
      [workerId, tasks.length]
    );
    await client.query('COMMIT');

    return {
      session,
      tasks,
      proxyUrl,
    };
  } catch (err) {
    await client.query('ROLLBACK');
    throw err;
  } finally {
    client.release();
  }
}
/**
 * Resolve the exit IP of a proxy by requesting an IP-echo service through it.
 *
 * `axios` and `https-proxy-agent` are imported lazily inside the `try`, so a
 * failed import is handled like any other failure.
 *
 * @param proxyUrl - Proxy URL to route the request through.
 * @returns The reported IP string, or `null` if the request fails, times out
 *          (10s), or the response carries no usable `ip` string.
 */
async function getProxyIp(proxyUrl: string): Promise<string | null> {
  try {
    // Use a simple IP check service
    const { default: axios } = await import('axios');
    const { HttpsProxyAgent } = await import('https-proxy-agent');
    const agent = new HttpsProxyAgent(proxyUrl);
    const response = await axios.get('https://api.ipify.org?format=json', {
      httpAgent: agent,
      httpsAgent: agent,
      timeout: 10000,
    });
    // Only a non-empty string satisfies the declared return type
    const ip: unknown = response.data?.ip;
    return typeof ip === 'string' && ip.length > 0 ? ip : null;
  } catch (err: unknown) {
    // Thrown values are not guaranteed to be Error instances
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(`[WorkerSession] IP check failed: ${reason}`);
    return null;
  }
}
/**
 * Flag a task as started by calling the `start_task` SQL function.
 *
 * @param taskId - Task to start.
 * @param workerId - Worker starting the task.
 * @returns The `success` value reported by `start_task`, or `false` when it
 *          is missing or falsy.
 */
export async function startTask(taskId: number, workerId: string): Promise<boolean> {
  const result = await pool.query(
    `SELECT start_task($1, $2) as success`,
    [taskId, workerId]
  );
  const firstRow = result.rows[0];
  return firstRow?.success || false;
}
/**
 * Record a task as completed and, if that succeeds, bump the worker's
 * session counter. Both steps run in one transaction.
 *
 * @param taskId - Task being completed.
 * @param workerId - Worker that ran the task.
 * @param result - Optional result payload; serialized to JSON when provided.
 * @returns The `success` value reported by `complete_task`, or `false` when
 *          it is missing or falsy.
 * @throws Rethrows any error raised inside the transaction after ROLLBACK.
 */
export async function completeTask(
  taskId: number,
  workerId: string,
  result?: Record<string, unknown>
): Promise<boolean> {
  const client = await pool.connect();
  try {
    await client.query('BEGIN');

    // Serialize the payload (or send NULL) and complete the task
    const payload = result ? JSON.stringify(result) : null;
    const outcome = await client.query(
      `SELECT complete_task($1, $2, $3) as success`,
      [taskId, workerId, payload]
    );
    const succeeded = outcome.rows[0]?.success;

    // The session counter only moves when the task update went through
    if (succeeded) {
      await client.query(`SELECT session_task_completed($1)`, [workerId]);
    }

    await client.query('COMMIT');
    return succeeded || false;
  } catch (txError) {
    await client.query('ROLLBACK');
    throw txError;
  } finally {
    client.release();
  }
}
/**
 * Record a task as failed and, if that succeeds, bump the worker's failed
 * counter on its session. Both steps run in one transaction.
 *
 * @param taskId - Task that failed.
 * @param workerId - Worker that ran the task.
 * @param error - Optional error text; an empty or missing value is sent as NULL.
 * @returns The `success` value reported by `fail_task`, or `false` when it is
 *          missing or falsy.
 * @throws Rethrows any error raised inside the transaction after ROLLBACK.
 */
export async function failTask(
  taskId: number,
  workerId: string,
  error?: string
): Promise<boolean> {
  const client = await pool.connect();
  try {
    await client.query('BEGIN');

    // fail_task decides whether the task is retried or failed for good
    const reason = error || null;
    const outcome = await client.query(
      `SELECT fail_task($1, $2, $3) as success`,
      [taskId, workerId, reason]
    );
    const succeeded = outcome.rows[0]?.success;

    // The session counter only moves when the task update went through
    if (succeeded) {
      await client.query(`SELECT session_task_failed($1)`, [workerId]);
    }

    await client.query('COMMIT');
    return succeeded || false;
  } catch (txError) {
    await client.query('ROLLBACK');
    throw txError;
  } finally {
    client.release();
  }
}
/**
 * Look up a worker's session through the `get_worker_session` SQL function.
 *
 * @param workerId - Worker whose session is requested.
 * @returns The first returned row, or `null` when there is none.
 */
export async function getWorkerSession(workerId: string): Promise<WorkerSession | null> {
  const result = await pool.query(
    `SELECT * FROM get_worker_session($1)`,
    [workerId]
  );
  const row = result.rows[0];
  return row ? (row as WorkerSession) : null;
}
/**
 * Report whether a worker has nothing left to do in its session: every
 * claimed task has either completed or failed.
 *
 * @param workerId - Worker whose session is inspected.
 * @returns `true` when the worker has no session, or when
 *          completed + failed tasks reach the claimed count.
 */
export async function isSessionComplete(workerId: string): Promise<boolean> {
  const current = await getWorkerSession(workerId);
  if (current) {
    const { tasks_completed, tasks_failed, tasks_claimed } = current;
    return tasks_completed + tasks_failed >= tasks_claimed;
  }
  // No session = complete
  return true;
}
/**
 * Retire a worker's session by calling the `retire_worker_session` SQL function.
 *
 * The "retired" message is logged only when the function reports success, so
 * the log no longer claims a retirement that did not happen.
 *
 * @param workerId - Worker whose session should be retired.
 * @returns The `success` value reported by `retire_worker_session`, or
 *          `false` when it is missing or falsy.
 */
export async function retireSession(workerId: string): Promise<boolean> {
  const { rows } = await pool.query(
    `SELECT retire_worker_session($1) as success`,
    [workerId]
  );
  const retired = rows[0]?.success || false;
  if (retired) {
    console.log(`[WorkerSession] ${workerId} session retired, IP in ${COOLDOWN_HOURS}hr cooldown`);
  } else {
    console.warn(`[WorkerSession] ${workerId} session was not retired (retire_worker_session returned false)`);
  }
  return retired;
}
/**
 * Release a worker's claimed tasks back to the pool (used on worker shutdown)
 * by calling the `release_claimed_tasks` SQL function.
 *
 * @param workerId - Worker whose claimed tasks are released.
 * @returns Number of tasks released as reported by the SQL function; `0`
 *          when no count is returned or it is not numeric.
 */
export async function releaseClaimedTasks(workerId: string): Promise<number> {
  const { rows } = await pool.query(
    `SELECT release_claimed_tasks($1) as count`,
    [workerId]
  );
  // node-postgres hands bigint values back as strings; coerce so the declared
  // number return type holds whatever integer type the SQL function returns.
  const parsed = Number(rows[0]?.count ?? 0);
  const count = Number.isFinite(parsed) ? parsed : 0;
  if (count > 0) {
    console.log(`[WorkerSession] Released ${count} claimed tasks for ${workerId}`);
  }
  return count;
}
/**
 * Cleanup hook: call the `release_expired_sessions` SQL function to release
 * sessions whose cooldown has expired.
 *
 * @returns Number of sessions released as reported by the SQL function; `0`
 *          when no count is returned or it is not numeric.
 */
export async function releaseExpiredSessions(): Promise<number> {
  const { rows } = await pool.query(
    `SELECT release_expired_sessions() as count`
  );
  // node-postgres hands bigint values back as strings; coerce so the declared
  // number return type holds whatever integer type the SQL function returns.
  const parsed = Number(rows[0]?.count ?? 0);
  return Number.isFinite(parsed) ? parsed : 0;
}
/**
 * Collect monitoring counters from the `worker_sessions` table: how many
 * sessions are in each status and how many distinct IPs are recorded.
 *
 * @returns Counts per status plus the distinct IP count; each defaults to 0
 *          when the query yields no value.
 */
export async function getSessionStats(): Promise<{
  active: number;
  cooldown: number;
  available: number;
  uniqueIps: number;
}> {
  const result = await pool.query(`
SELECT
COUNT(*) FILTER (WHERE status = 'active') as active,
COUNT(*) FILTER (WHERE status = 'cooldown') as cooldown,
COUNT(*) FILTER (WHERE status = 'available') as available,
COUNT(DISTINCT ip_address) as unique_ips
FROM worker_sessions
`);
  const summary = result.rows[0];
  // Counts arrive as text; fall back to '0' when a value is absent
  const toInt = (raw: string | undefined): number => parseInt(raw || '0');

  const active = toInt(summary?.active);
  const cooldown = toInt(summary?.cooldown);
  const available = toInt(summary?.available);
  const uniqueIps = toInt(summary?.unique_ips);
  return { active, cooldown, available, uniqueIps };
}