From f2864bd2ad7f048057c414eac2cabdd19afd59be Mon Sep 17 00:00:00 2001 From: Kelly Date: Thu, 11 Dec 2025 21:37:22 -0700 Subject: [PATCH] fix(ci): Remove buildx cache and add preflight enforcement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove cache_from/cache_to from CI (plugin bug splitting commas) - Add preflight() method to CrawlRotator - tests proxy + anti-detect - Add pre-task preflight check - workers MUST pass before executing - Add releaseTask() to release tasks back to pending on preflight fail - Rename proxy_test task to whoami for clarity 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .woodpecker.yml | 17 +--- backend/src/services/crawl-rotator.ts | 112 +++++++++++++++++++++++ backend/src/tasks/handlers/index.ts | 2 +- backend/src/tasks/handlers/proxy-test.ts | 51 ----------- backend/src/tasks/handlers/whoami.ts | 80 ++++++++++++++++ backend/src/tasks/task-service.ts | 20 +++- backend/src/tasks/task-worker.ts | 26 +++++- 7 files changed, 237 insertions(+), 71 deletions(-) delete mode 100644 backend/src/tasks/handlers/proxy-test.ts create mode 100644 backend/src/tasks/handlers/whoami.ts diff --git a/.woodpecker.yml b/.woodpecker.yml index c7233f95..1cd61159 100644 --- a/.woodpecker.yml +++ b/.woodpecker.yml @@ -69,6 +69,7 @@ steps: # =========================================== # MASTER DEPLOY: Parallel Docker builds + # NOTE: cache_from/cache_to removed due to plugin bug splitting on commas # =========================================== docker-backend: image: woodpeckerci/plugin-docker-buildx @@ -86,10 +87,6 @@ steps: from_secret: registry_password platforms: linux/amd64 provenance: false - cache_from: - - "type=registry,ref=code.cannabrands.app/creationshop/dispensary-scraper:cache" - cache_to: - - "type=registry,ref=code.cannabrands.app/creationshop/dispensary-scraper:cache,mode=max" build_args: APP_BUILD_VERSION: ${CI_COMMIT_SHA:0:8} APP_GIT_SHA: ${CI_COMMIT_SHA} @@ -116,10 +113,6 @@ steps: from_secret: registry_password platforms: linux/amd64 provenance: false - cache_from: - - "type=registry,ref=code.cannabrands.app/creationshop/cannaiq-frontend:cache" - cache_to: - - "type=registry,ref=code.cannabrands.app/creationshop/cannaiq-frontend:cache,mode=max" depends_on: [] when: branch: master @@ -141,10 +134,6 @@ steps: from_secret: registry_password platforms: linux/amd64 provenance: false - cache_from: - - "type=registry,ref=code.cannabrands.app/creationshop/findadispo-frontend:cache" - cache_to: - - "type=registry,ref=code.cannabrands.app/creationshop/findadispo-frontend:cache,mode=max" depends_on: [] when: branch: master @@ -166,10 +155,6 @@ steps: from_secret: registry_password platforms: linux/amd64 provenance: false - cache_from: - - "type=registry,ref=code.cannabrands.app/creationshop/findagram-frontend:cache" - cache_to: - - "type=registry,ref=code.cannabrands.app/creationshop/findagram-frontend:cache,mode=max" depends_on: [] when: branch: master diff --git a/backend/src/services/crawl-rotator.ts b/backend/src/services/crawl-rotator.ts index 6d0d9d6b..0a04dcc8 100644 --- a/backend/src/services/crawl-rotator.ts +++ b/backend/src/services/crawl-rotator.ts @@ -683,6 +683,118 @@ export class CrawlRotator { const current = this.proxy.getCurrent(); return current?.timezone; } + + /** + * Preflight check - verifies proxy and anti-detect are working + * MUST be called before any task execution to ensure anonymity. + * + * Tests: + * 1. Proxy available - a proxy must be loaded and active + * 2. Proxy connectivity - makes HTTP request through proxy to verify connection + * 3. Anti-detect headers - verifies fingerprint is set with required headers + * + * @returns Promise with pass/fail status and details + */ + async preflight(): Promise { + const result: PreflightResult = { + passed: false, + proxyAvailable: false, + proxyConnected: false, + antidetectReady: false, + proxyIp: null, + fingerprint: null, + error: null, + responseTimeMs: null, + }; + + // Step 1: Check proxy is available + const currentProxy = this.proxy.getCurrent(); + if (!currentProxy) { + result.error = 'No proxy available'; + console.log('[Preflight] FAILED - No proxy available'); + return result; + } + result.proxyAvailable = true; + result.proxyIp = currentProxy.host; + + // Step 2: Check fingerprint/anti-detect is ready + const fingerprint = this.userAgent.getCurrent(); + if (!fingerprint || !fingerprint.userAgent) { + result.error = 'Anti-detect fingerprint not initialized'; + console.log('[Preflight] FAILED - No fingerprint'); + return result; + } + result.antidetectReady = true; + result.fingerprint = { + userAgent: fingerprint.userAgent, + browserName: fingerprint.browserName, + deviceCategory: fingerprint.deviceCategory, + }; + + // Step 3: Test proxy connectivity with an actual HTTP request + // Use httpbin.org/ip to verify request goes through proxy + const proxyUrl = this.proxy.getProxyUrl(currentProxy); + const testUrl = 'https://httpbin.org/ip'; + + try { + const { default: axios } = await import('axios'); + const { HttpsProxyAgent } = await import('https-proxy-agent'); + + const agent = new HttpsProxyAgent(proxyUrl); + const startTime = Date.now(); + + const response = await axios.get(testUrl, { + httpsAgent: agent, + timeout: 15000, // 15 second timeout + headers: { + 'User-Agent': fingerprint.userAgent, + 'Accept-Language': fingerprint.acceptLanguage, + ...(fingerprint.secChUa && { 'sec-ch-ua': fingerprint.secChUa }), + ...(fingerprint.secChUaPlatform && { 'sec-ch-ua-platform': fingerprint.secChUaPlatform }), + ...(fingerprint.secChUaMobile && { 'sec-ch-ua-mobile': fingerprint.secChUaMobile }), + }, + }); + + result.responseTimeMs = Date.now() - startTime; + result.proxyConnected = true; + result.passed = true; + + // Mark success on proxy stats + await this.proxy.markSuccess(currentProxy.id, result.responseTimeMs); + + console.log(`[Preflight] PASSED - Proxy ${currentProxy.host} connected (${result.responseTimeMs}ms), UA: ${fingerprint.browserName}/${fingerprint.deviceCategory}`); + } catch (err: any) { + result.error = `Proxy connection failed: ${err.message || 'Unknown error'}`; + console.log(`[Preflight] FAILED - Proxy connection error: ${err.message}`); + + // Mark failure on proxy stats + await this.proxy.markFailed(currentProxy.id, err.message); + } + + return result; + } +} + +/** + * Result from preflight check + */ +export interface PreflightResult { + /** Overall pass/fail */ + passed: boolean; + /** Step 1: Is a proxy loaded? */ + proxyAvailable: boolean; + /** Step 2: Did HTTP request through proxy succeed? */ + proxyConnected: boolean; + /** Step 3: Is fingerprint/anti-detect ready? */ + antidetectReady: boolean; + /** Current proxy IP */ + proxyIp: string | null; + /** Fingerprint summary */ + fingerprint: { userAgent: string; browserName: string; deviceCategory: string } | null; + /** Error message if failed */ + error: string | null; + /** Proxy response time in ms */ + responseTimeMs: number | null; } // ============================================================ diff --git a/backend/src/tasks/handlers/index.ts b/backend/src/tasks/handlers/index.ts index 7ac17dcd..0f49d0f2 100644 --- a/backend/src/tasks/handlers/index.ts +++ b/backend/src/tasks/handlers/index.ts @@ -9,4 +9,4 @@ export { handleProductDiscovery } from './product-discovery'; export { handleStoreDiscovery } from './store-discovery'; export { handleEntryPointDiscovery } from './entry-point-discovery'; export { handleAnalyticsRefresh } from './analytics-refresh'; -export { handleProxyTest } from './proxy-test'; +export { handleWhoami } from './whoami'; diff --git a/backend/src/tasks/handlers/proxy-test.ts b/backend/src/tasks/handlers/proxy-test.ts deleted file mode 100644 index 8f54d490..00000000 --- a/backend/src/tasks/handlers/proxy-test.ts +++ /dev/null @@ -1,51 +0,0 @@ -/** - * Proxy Test Handler - * Tests proxy connectivity by fetching public IP via ipify - */ - -import { TaskContext, TaskResult } from '../task-worker'; -import { execSync } from 'child_process'; - -export async function handleProxyTest(ctx: TaskContext): Promise { - const { pool } = ctx; - - console.log('[ProxyTest] Testing proxy connection...'); - - try { - // Get active proxy from DB - const proxyResult = await pool.query(` - SELECT host, port, username, password - FROM proxies - WHERE is_active = true - LIMIT 1 - `); - - if (proxyResult.rows.length === 0) { - return { success: false, error: 'No active proxy configured' }; - } - - const p = proxyResult.rows[0]; - const proxyUrl = p.username - ? `http://${p.username}:${p.password}@${p.host}:${p.port}` - : `http://${p.host}:${p.port}`; - - console.log(`[ProxyTest] Using proxy: ${p.host}:${p.port}`); - - // Fetch IP via proxy - const cmd = `curl -s --proxy '${proxyUrl}' 'https://api.ipify.org?format=json'`; - const output = execSync(cmd, { timeout: 30000 }).toString().trim(); - const data = JSON.parse(output); - - console.log(`[ProxyTest] Proxy IP: ${data.ip}`); - - return { - success: true, - proxyIp: data.ip, - proxyHost: p.host, - proxyPort: p.port, - }; - } catch (error: any) { - console.error('[ProxyTest] Error:', error.message); - return { success: false, error: error.message }; - } -} diff --git a/backend/src/tasks/handlers/whoami.ts b/backend/src/tasks/handlers/whoami.ts new file mode 100644 index 00000000..a1df75cb --- /dev/null +++ b/backend/src/tasks/handlers/whoami.ts @@ -0,0 +1,80 @@ +/** + * WhoAmI Handler + * Tests proxy connectivity and anti-detect by fetching public IP + * Reports: proxy IP, fingerprint info, and connection status + */ + +import { TaskContext, TaskResult } from '../task-worker'; +import { execSync } from 'child_process'; + +export async function handleWhoami(ctx: TaskContext): Promise { + const { pool, crawlRotator } = ctx; + + console.log('[WhoAmI] Testing proxy and anti-detect...'); + + try { + // Use the preflight check which tests proxy + anti-detect + if (crawlRotator) { + const preflight = await crawlRotator.preflight(); + + if (!preflight.passed) { + return { + success: false, + error: preflight.error || 'Preflight check failed', + proxyAvailable: preflight.proxyAvailable, + proxyConnected: preflight.proxyConnected, + antidetectReady: preflight.antidetectReady, + }; + } + + console.log(`[WhoAmI] Proxy IP: ${preflight.proxyIp}, Response: ${preflight.responseTimeMs}ms`); + console.log(`[WhoAmI] Fingerprint: ${preflight.fingerprint?.browserName}/${preflight.fingerprint?.deviceCategory}`); + + return { + success: true, + proxyIp: preflight.proxyIp, + responseTimeMs: preflight.responseTimeMs, + fingerprint: preflight.fingerprint, + proxyAvailable: preflight.proxyAvailable, + proxyConnected: preflight.proxyConnected, + antidetectReady: preflight.antidetectReady, + }; + } + + // Fallback: Direct proxy test without CrawlRotator + const proxyResult = await pool.query(` + SELECT host, port, username, password + FROM proxies + WHERE is_active = true + LIMIT 1 + `); + + if (proxyResult.rows.length === 0) { + return { success: false, error: 'No active proxy configured' }; + } + + const p = proxyResult.rows[0]; + const proxyUrl = p.username + ? `http://${p.username}:${p.password}@${p.host}:${p.port}` + : `http://${p.host}:${p.port}`; + + console.log(`[WhoAmI] Using proxy: ${p.host}:${p.port}`); + + // Fetch IP via proxy + const cmd = `curl -s --proxy '${proxyUrl}' 'https://api.ipify.org?format=json'`; + const output = execSync(cmd, { timeout: 30000 }).toString().trim(); + const data = JSON.parse(output); + + console.log(`[WhoAmI] Proxy IP: ${data.ip}`); + + return { + success: true, + proxyIp: data.ip, + proxyHost: p.host, + proxyPort: p.port, + }; + } catch (error: any) { + console.error('[WhoAmI] Error:', error.message); + return { success: false, error: error.message }; + } +} diff --git a/backend/src/tasks/task-service.ts b/backend/src/tasks/task-service.ts index 322690d3..61948de2 100644 --- a/backend/src/tasks/task-service.ts +++ b/backend/src/tasks/task-service.ts @@ -32,7 +32,7 @@ export type TaskRole = | 'payload_fetch' // NEW: Fetches from API, saves to disk | 'product_refresh' // CHANGED: Now reads from local payload | 'analytics_refresh' - | 'proxy_test'; // Tests proxy connectivity via ipify + | 'whoami'; // Tests proxy + anti-detect connectivity export type TaskStatus = | 'pending' @@ -231,6 +231,24 @@ class TaskService { ); } + /** + * Release a claimed task back to pending (e.g., when preflight fails) + * This allows another worker to pick it up. + */ + async releaseTask(taskId: number): Promise { + await pool.query( + `UPDATE worker_tasks + SET status = 'pending', + worker_id = NULL, + claimed_at = NULL, + started_at = NULL, + updated_at = NOW() + WHERE id = $1 AND status IN ('claimed', 'running')`, + [taskId] + ); + console.log(`[TaskService] Task ${taskId} released back to pending`); + } + /** * Mark a task as failed, with auto-retry if under max_retries * Returns true if task was re-queued for retry, false if permanently failed diff --git a/backend/src/tasks/task-worker.ts b/backend/src/tasks/task-worker.ts index efadf027..9827eee6 100644 --- a/backend/src/tasks/task-worker.ts +++ b/backend/src/tasks/task-worker.ts @@ -59,7 +59,7 @@ import { handleProductDiscovery } from './handlers/product-discovery'; import { handleStoreDiscovery } from './handlers/store-discovery'; import { handleEntryPointDiscovery } from './handlers/entry-point-discovery'; import { handleAnalyticsRefresh } from './handlers/analytics-refresh'; -import { handleProxyTest } from './handlers/proxy-test'; +import { handleWhoami } from './handlers/whoami'; const POLL_INTERVAL_MS = parseInt(process.env.POLL_INTERVAL_MS || '5000'); const HEARTBEAT_INTERVAL_MS = parseInt(process.env.HEARTBEAT_INTERVAL_MS || '30000'); @@ -111,6 +111,7 @@ export interface TaskContext { workerId: string; task: WorkerTask; heartbeat: () => Promise; + crawlRotator?: CrawlRotator; } export interface TaskResult { @@ -134,7 +135,7 @@ const TASK_HANDLERS: Record = { store_discovery: handleStoreDiscovery, entry_point_discovery: handleEntryPointDiscovery, analytics_refresh: handleAnalyticsRefresh, - proxy_test: handleProxyTest, // Tests proxy via ipify + whoami: handleWhoami, // Tests proxy + anti-detect }; /** @@ -555,6 +556,26 @@ export class TaskWorker { if (task) { console.log(`[TaskWorker] ${this.friendlyName} claimed task ${task.id} (${task.role}) [${this.activeTasks.size + 1}/${this.maxConcurrentTasks}]`); + + // ================================================================= + // PREFLIGHT CHECK - CRITICAL: Worker MUST pass before task execution + // Verifies: 1) Proxy available 2) Proxy connected 3) Anti-detect ready + // ================================================================= + const preflight = await this.crawlRotator.preflight(); + if (!preflight.passed) { + console.log(`[TaskWorker] ${this.friendlyName} PREFLIGHT FAILED for task ${task.id}: ${preflight.error}`); + console.log(`[TaskWorker] Releasing task ${task.id} back to pending - worker cannot proceed without proxy/anti-detect`); + + // Release task back to pending so another worker can pick it up + await taskService.releaseTask(task.id); + + // Wait before trying again - give proxies time to recover + await this.sleep(30000); // 30 second wait on preflight failure + return; + } + + console.log(`[TaskWorker] ${this.friendlyName} preflight PASSED for task ${task.id} (proxy: ${preflight.proxyIp}, ${preflight.responseTimeMs}ms)`); + this.activeTasks.set(task.id, task); // Start task in background (don't await) @@ -611,6 +632,7 @@ export class TaskWorker { heartbeat: async () => { await taskService.heartbeat(task.id); }, + crawlRotator: this.crawlRotator, }; // Execute the task