Merge pull request 'fix(ci): Remove buildx cache and add preflight enforcement' (#48) from fix/ci-and-preflight-enforcement into master
Reviewed-on: https://code.cannabrands.app/Creationshop/dispensary-scraper/pulls/48
This commit is contained in:
@@ -69,6 +69,7 @@ steps:
|
||||
|
||||
# ===========================================
|
||||
# MASTER DEPLOY: Parallel Docker builds
|
||||
# NOTE: cache_from/cache_to removed due to plugin bug splitting on commas
|
||||
# ===========================================
|
||||
docker-backend:
|
||||
image: woodpeckerci/plugin-docker-buildx
|
||||
@@ -86,10 +87,6 @@ steps:
|
||||
from_secret: registry_password
|
||||
platforms: linux/amd64
|
||||
provenance: false
|
||||
cache_from:
|
||||
- "type=registry,ref=code.cannabrands.app/creationshop/dispensary-scraper:cache"
|
||||
cache_to:
|
||||
- "type=registry,ref=code.cannabrands.app/creationshop/dispensary-scraper:cache,mode=max"
|
||||
build_args:
|
||||
APP_BUILD_VERSION: ${CI_COMMIT_SHA:0:8}
|
||||
APP_GIT_SHA: ${CI_COMMIT_SHA}
|
||||
@@ -116,10 +113,6 @@ steps:
|
||||
from_secret: registry_password
|
||||
platforms: linux/amd64
|
||||
provenance: false
|
||||
cache_from:
|
||||
- "type=registry,ref=code.cannabrands.app/creationshop/cannaiq-frontend:cache"
|
||||
cache_to:
|
||||
- "type=registry,ref=code.cannabrands.app/creationshop/cannaiq-frontend:cache,mode=max"
|
||||
depends_on: []
|
||||
when:
|
||||
branch: master
|
||||
@@ -141,10 +134,6 @@ steps:
|
||||
from_secret: registry_password
|
||||
platforms: linux/amd64
|
||||
provenance: false
|
||||
cache_from:
|
||||
- "type=registry,ref=code.cannabrands.app/creationshop/findadispo-frontend:cache"
|
||||
cache_to:
|
||||
- "type=registry,ref=code.cannabrands.app/creationshop/findadispo-frontend:cache,mode=max"
|
||||
depends_on: []
|
||||
when:
|
||||
branch: master
|
||||
@@ -166,10 +155,6 @@ steps:
|
||||
from_secret: registry_password
|
||||
platforms: linux/amd64
|
||||
provenance: false
|
||||
cache_from:
|
||||
- "type=registry,ref=code.cannabrands.app/creationshop/findagram-frontend:cache"
|
||||
cache_to:
|
||||
- "type=registry,ref=code.cannabrands.app/creationshop/findagram-frontend:cache,mode=max"
|
||||
depends_on: []
|
||||
when:
|
||||
branch: master
|
||||
|
||||
@@ -683,6 +683,118 @@ export class CrawlRotator {
|
||||
const current = this.proxy.getCurrent();
|
||||
return current?.timezone;
|
||||
}
|
||||
|
||||
/**
 * Preflight check - verifies that the proxy and the anti-detect fingerprint are usable.
 * MUST be called before any task execution to ensure anonymity.
 *
 * Checks, in execution order (the first failing check returns immediately):
 *   1. Proxy available    - `this.proxy.getCurrent()` must return a proxy
 *   2. Anti-detect ready  - `this.userAgent.getCurrent()` must return a fingerprint with a user agent
 *   3. Proxy connectivity - an HTTPS request to httpbin.org/ip is sent through the proxy
 *                           using the fingerprint's headers
 *
 * The outcome of check 3 is reported to the proxy via `markSuccess` / `markFailed`.
 * That bookkeeping is best-effort: an error while recording stats is logged and never
 * changes the connectivity verdict.
 *
 * @returns Promise<PreflightResult> with pass/fail status and details. On success
 *          `proxyIp` is the address httpbin saw (`origin`), falling back to the
 *          configured proxy host when the response carries no usable value.
 */
async preflight(): Promise<PreflightResult> {
  const result: PreflightResult = {
    passed: false,
    proxyAvailable: false,
    proxyConnected: false,
    antidetectReady: false,
    proxyIp: null,
    fingerprint: null,
    error: null,
    responseTimeMs: null,
  };

  // Step 1: Check proxy is available
  const currentProxy = this.proxy.getCurrent();
  if (!currentProxy) {
    result.error = 'No proxy available';
    console.log('[Preflight] FAILED - No proxy available');
    return result;
  }
  result.proxyAvailable = true;
  // Configured host is the fallback; replaced by the observed exit address in step 3.
  result.proxyIp = currentProxy.host;

  // Step 2: Check fingerprint/anti-detect is ready
  const fingerprint = this.userAgent.getCurrent();
  if (!fingerprint || !fingerprint.userAgent) {
    result.error = 'Anti-detect fingerprint not initialized';
    console.log('[Preflight] FAILED - No fingerprint');
    return result;
  }
  result.antidetectReady = true;
  result.fingerprint = {
    userAgent: fingerprint.userAgent,
    browserName: fingerprint.browserName,
    deviceCategory: fingerprint.deviceCategory,
  };

  // Step 3: Test proxy connectivity with an actual HTTP request.
  // httpbin.org/ip echoes the address the request arrived from.
  const proxyUrl = this.proxy.getProxyUrl(currentProxy);
  const testUrl = 'https://httpbin.org/ip';

  let responseTimeMs = 0;
  let reportedOrigin: unknown;

  try {
    const { default: axios } = await import('axios');
    const { HttpsProxyAgent } = await import('https-proxy-agent');

    const agent = new HttpsProxyAgent(proxyUrl);
    const startTime = Date.now();

    const response = await axios.get(testUrl, {
      httpsAgent: agent,
      timeout: 15000, // 15 second timeout
      headers: {
        'User-Agent': fingerprint.userAgent,
        'Accept-Language': fingerprint.acceptLanguage,
        ...(fingerprint.secChUa && { 'sec-ch-ua': fingerprint.secChUa }),
        ...(fingerprint.secChUaPlatform && { 'sec-ch-ua-platform': fingerprint.secChUaPlatform }),
        ...(fingerprint.secChUaMobile && { 'sec-ch-ua-mobile': fingerprint.secChUaMobile }),
      },
    });

    responseTimeMs = Date.now() - startTime;
    reportedOrigin = response.data?.origin;
  } catch (err: unknown) {
    const message = err instanceof Error ? err.message : String(err);
    result.error = `Proxy connection failed: ${message || 'Unknown error'}`;
    console.log(`[Preflight] FAILED - Proxy connection error: ${message}`);

    // Mark failure on proxy stats. Kept in its own try so a bookkeeping error
    // cannot turn a clean "failed" result into a rejected promise.
    try {
      await this.proxy.markFailed(currentProxy.id, message);
    } catch (statsErr: unknown) {
      const statsMessage = statsErr instanceof Error ? statsErr.message : String(statsErr);
      console.log(`[Preflight] WARNING - Could not record proxy failure: ${statsMessage}`);
    }

    return result;
  }

  result.responseTimeMs = responseTimeMs;
  result.proxyConnected = true;
  result.passed = true;
  if (typeof reportedOrigin === 'string' && reportedOrigin.length > 0) {
    result.proxyIp = reportedOrigin;
  }

  // Mark success on proxy stats. Done outside the connectivity try/catch so a
  // stats error cannot mark a working proxy as failed or attach an error to a
  // passing result.
  try {
    await this.proxy.markSuccess(currentProxy.id, responseTimeMs);
  } catch (statsErr: unknown) {
    const statsMessage = statsErr instanceof Error ? statsErr.message : String(statsErr);
    console.log(`[Preflight] WARNING - Could not record proxy success: ${statsMessage}`);
  }

  console.log(`[Preflight] PASSED - Proxy ${currentProxy.host} connected (${responseTimeMs}ms), UA: ${fingerprint.browserName}/${fingerprint.deviceCategory}`);

  return result;
}
|
||||
}
|
||||
|
||||
/**
 * Outcome of a preflight check.
 *
 * The three boolean flags are listed in the order the checks are executed;
 * a check that is never reached keeps its initial `false`.
 */
export interface PreflightResult {
  /** Overall verdict: true only when every check succeeded. */
  passed: boolean;

  /** Check 1: a proxy is loaded. */
  proxyAvailable: boolean;

  /** Check 2: the anti-detect fingerprint is ready. */
  antidetectReady: boolean;

  /** Check 3: the HTTP request through the proxy succeeded. */
  proxyConnected: boolean;

  /** Address reported for the current proxy; null when no proxy was available. */
  proxyIp: string | null;

  /** Summary of the fingerprint in use; null when none was ready. */
  fingerprint: {
    userAgent: string;
    browserName: string;
    deviceCategory: string;
  } | null;

  /** Failure description; null when nothing failed. */
  error: string | null;

  /** Round-trip time of the proxy request in milliseconds; null when no request completed. */
  responseTimeMs: number | null;
}
|
||||
|
||||
// ============================================================
|
||||
|
||||
@@ -9,4 +9,4 @@ export { handleProductDiscovery } from './product-discovery';
|
||||
export { handleStoreDiscovery } from './store-discovery';
|
||||
export { handleEntryPointDiscovery } from './entry-point-discovery';
|
||||
export { handleAnalyticsRefresh } from './analytics-refresh';
|
||||
export { handleProxyTest } from './proxy-test';
|
||||
export { handleWhoami } from './whoami';
|
||||
|
||||
@@ -1,51 +0,0 @@
|
||||
/**
|
||||
* Proxy Test Handler
|
||||
* Tests proxy connectivity by fetching public IP via ipify
|
||||
*/
|
||||
|
||||
import { TaskContext, TaskResult } from '../task-worker';
|
||||
import { execSync } from 'child_process';
|
||||
|
||||
/**
 * Checks proxy connectivity by asking api.ipify.org for the public IP through
 * the first active proxy found in the `proxies` table.
 *
 * @param ctx - Task context; only `ctx.pool` is used.
 * @returns `success: true` with `proxyIp`, `proxyHost` and `proxyPort`, or
 *          `success: false` with an `error` message. Failures inside the check
 *          are reported in the result rather than thrown.
 */
export async function handleProxyTest(ctx: TaskContext): Promise<TaskResult> {
  const { pool } = ctx;

  console.log('[ProxyTest] Testing proxy connection...');

  // execSync embeds the full command (credentials included) in its error
  // message, so remember what to strip before that message is logged/returned.
  let redactFrom = '';
  let redactTo = '';

  try {
    // Get active proxy from DB
    const proxyResult = await pool.query(`
      SELECT host, port, username, password
      FROM proxies
      WHERE is_active = true
      LIMIT 1
    `);

    if (proxyResult.rows.length === 0) {
      return { success: false, error: 'No active proxy configured' };
    }

    const p = proxyResult.rows[0];
    const proxyUrl = p.username
      ? `http://${p.username}:${p.password}@${p.host}:${p.port}`
      : `http://${p.host}:${p.port}`;

    console.log(`[ProxyTest] Using proxy: ${p.host}:${p.port}`);

    // The URL goes inside single quotes on a shell command line. Escape any
    // embedded single quote so DB-supplied credentials cannot break out of the
    // quoting and inject shell syntax.
    const shellSafeProxyUrl = proxyUrl.split("'").join("'\\''");
    redactFrom = shellSafeProxyUrl;
    redactTo = `http://${p.host}:${p.port}`;

    // Fetch IP via proxy
    const cmd = `curl -s --proxy '${shellSafeProxyUrl}' 'https://api.ipify.org?format=json'`;
    const output = execSync(cmd, { timeout: 30000 }).toString().trim();
    const data: unknown = JSON.parse(output);

    // Do not report success unless the response really carries an IP.
    const ip = (data as { ip?: unknown } | null)?.ip;
    if (typeof ip !== 'string' || ip.length === 0) {
      return { success: false, error: 'Proxy test response did not contain an IP address' };
    }

    console.log(`[ProxyTest] Proxy IP: ${ip}`);

    return {
      success: true,
      proxyIp: ip,
      proxyHost: p.host,
      proxyPort: p.port,
    };
  } catch (error: unknown) {
    let message = error instanceof Error ? error.message : String(error);
    if (redactFrom) {
      message = message.split(redactFrom).join(redactTo);
    }
    console.error('[ProxyTest] Error:', message);
    return { success: false, error: message };
  }
}
|
||||
80
backend/src/tasks/handlers/whoami.ts
Normal file
80
backend/src/tasks/handlers/whoami.ts
Normal file
@@ -0,0 +1,80 @@
|
||||
/**
|
||||
* WhoAmI Handler
|
||||
* Tests proxy connectivity and anti-detect by fetching public IP
|
||||
* Reports: proxy IP, fingerprint info, and connection status
|
||||
*/
|
||||
|
||||
import { TaskContext, TaskResult } from '../task-worker';
|
||||
import { execSync } from 'child_process';
|
||||
|
||||
/**
 * Reports how the worker appears to the outside world.
 *
 * When `ctx.crawlRotator` is present the result comes from its `preflight()`
 * check (proxy + anti-detect). Otherwise the handler falls back to a direct
 * test: it asks api.ipify.org for the public IP through the first active proxy
 * found in the `proxies` table.
 *
 * @param ctx - Task context; `ctx.pool` and the optional `ctx.crawlRotator` are used.
 * @returns `success: true` with the collected details, or `success: false` with
 *          an `error` message. Failures inside the check are reported in the
 *          result rather than thrown.
 */
export async function handleWhoami(ctx: TaskContext): Promise<TaskResult> {
  const { pool, crawlRotator } = ctx;

  console.log('[WhoAmI] Testing proxy and anti-detect...');

  // execSync embeds the full command (credentials included) in its error
  // message, so remember what to strip before that message is logged/returned.
  let redactFrom = '';
  let redactTo = '';

  try {
    // Use the preflight check which tests proxy + anti-detect
    if (crawlRotator) {
      const preflight = await crawlRotator.preflight();

      if (!preflight.passed) {
        return {
          success: false,
          error: preflight.error || 'Preflight check failed',
          proxyAvailable: preflight.proxyAvailable,
          proxyConnected: preflight.proxyConnected,
          antidetectReady: preflight.antidetectReady,
        };
      }

      console.log(`[WhoAmI] Proxy IP: ${preflight.proxyIp}, Response: ${preflight.responseTimeMs}ms`);
      console.log(`[WhoAmI] Fingerprint: ${preflight.fingerprint?.browserName}/${preflight.fingerprint?.deviceCategory}`);

      return {
        success: true,
        proxyIp: preflight.proxyIp,
        responseTimeMs: preflight.responseTimeMs,
        fingerprint: preflight.fingerprint,
        proxyAvailable: preflight.proxyAvailable,
        proxyConnected: preflight.proxyConnected,
        antidetectReady: preflight.antidetectReady,
      };
    }

    // Fallback: Direct proxy test without CrawlRotator
    const proxyResult = await pool.query(`
      SELECT host, port, username, password
      FROM proxies
      WHERE is_active = true
      LIMIT 1
    `);

    if (proxyResult.rows.length === 0) {
      return { success: false, error: 'No active proxy configured' };
    }

    const p = proxyResult.rows[0];
    const proxyUrl = p.username
      ? `http://${p.username}:${p.password}@${p.host}:${p.port}`
      : `http://${p.host}:${p.port}`;

    console.log(`[WhoAmI] Using proxy: ${p.host}:${p.port}`);

    // The URL goes inside single quotes on a shell command line. Escape any
    // embedded single quote so DB-supplied credentials cannot break out of the
    // quoting and inject shell syntax.
    const shellSafeProxyUrl = proxyUrl.split("'").join("'\\''");
    redactFrom = shellSafeProxyUrl;
    redactTo = `http://${p.host}:${p.port}`;

    // Fetch IP via proxy
    const cmd = `curl -s --proxy '${shellSafeProxyUrl}' 'https://api.ipify.org?format=json'`;
    const output = execSync(cmd, { timeout: 30000 }).toString().trim();
    const data: unknown = JSON.parse(output);

    // Do not report success unless the response really carries an IP.
    const ip = (data as { ip?: unknown } | null)?.ip;
    if (typeof ip !== 'string' || ip.length === 0) {
      return { success: false, error: 'Proxy test response did not contain an IP address' };
    }

    console.log(`[WhoAmI] Proxy IP: ${ip}`);

    return {
      success: true,
      proxyIp: ip,
      proxyHost: p.host,
      proxyPort: p.port,
    };
  } catch (error: unknown) {
    let message = error instanceof Error ? error.message : String(error);
    if (redactFrom) {
      message = message.split(redactFrom).join(redactTo);
    }
    console.error('[WhoAmI] Error:', message);
    return { success: false, error: message };
  }
}
|
||||
@@ -32,7 +32,7 @@ export type TaskRole =
|
||||
| 'payload_fetch' // NEW: Fetches from API, saves to disk
|
||||
| 'product_refresh' // CHANGED: Now reads from local payload
|
||||
| 'analytics_refresh'
|
||||
| 'proxy_test'; // Tests proxy connectivity via ipify
|
||||
| 'whoami'; // Tests proxy + anti-detect connectivity
|
||||
|
||||
export type TaskStatus =
|
||||
| 'pending'
|
||||
@@ -231,6 +231,24 @@ class TaskService {
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Release a claimed task back to pending (e.g., when preflight fails).
 * This allows another worker to pick it up.
 *
 * Only a task whose status is 'claimed' or 'running' is touched; its worker
 * assignment and claim/start timestamps are cleared. When no row matches,
 * nothing is changed and the log line says so instead of claiming a release
 * that did not happen.
 *
 * @param taskId - id of the row in `worker_tasks` to release
 */
async releaseTask(taskId: number): Promise<void> {
  const result = await pool.query(
    `UPDATE worker_tasks
     SET status = 'pending',
         worker_id = NULL,
         claimed_at = NULL,
         started_at = NULL,
         updated_at = NOW()
     WHERE id = $1 AND status IN ('claimed', 'running')`,
    [taskId]
  );

  // NOTE(review): assumes `pool` is a node-postgres pool, whose result exposes
  // the number of updated rows as `rowCount` - confirm against the pool setup.
  const released = (result.rowCount ?? 0) > 0;
  if (released) {
    console.log(`[TaskService] Task ${taskId} released back to pending`);
  } else {
    console.log(`[TaskService] Task ${taskId} was not released (not found or not in claimed/running state)`);
  }
}
|
||||
|
||||
/**
|
||||
* Mark a task as failed, with auto-retry if under max_retries
|
||||
* Returns true if task was re-queued for retry, false if permanently failed
|
||||
|
||||
@@ -59,7 +59,7 @@ import { handleProductDiscovery } from './handlers/product-discovery';
|
||||
import { handleStoreDiscovery } from './handlers/store-discovery';
|
||||
import { handleEntryPointDiscovery } from './handlers/entry-point-discovery';
|
||||
import { handleAnalyticsRefresh } from './handlers/analytics-refresh';
|
||||
import { handleProxyTest } from './handlers/proxy-test';
|
||||
import { handleWhoami } from './handlers/whoami';
|
||||
|
||||
const POLL_INTERVAL_MS = parseInt(process.env.POLL_INTERVAL_MS || '5000');
|
||||
const HEARTBEAT_INTERVAL_MS = parseInt(process.env.HEARTBEAT_INTERVAL_MS || '30000');
|
||||
@@ -111,6 +111,7 @@ export interface TaskContext {
|
||||
workerId: string;
|
||||
task: WorkerTask;
|
||||
heartbeat: () => Promise<void>;
|
||||
crawlRotator?: CrawlRotator;
|
||||
}
|
||||
|
||||
export interface TaskResult {
|
||||
@@ -134,7 +135,7 @@ const TASK_HANDLERS: Record<TaskRole, TaskHandler> = {
|
||||
store_discovery: handleStoreDiscovery,
|
||||
entry_point_discovery: handleEntryPointDiscovery,
|
||||
analytics_refresh: handleAnalyticsRefresh,
|
||||
proxy_test: handleProxyTest, // Tests proxy via ipify
|
||||
whoami: handleWhoami, // Tests proxy + anti-detect
|
||||
};
|
||||
|
||||
/**
|
||||
@@ -555,6 +556,26 @@ export class TaskWorker {
|
||||
|
||||
if (task) {
|
||||
console.log(`[TaskWorker] ${this.friendlyName} claimed task ${task.id} (${task.role}) [${this.activeTasks.size + 1}/${this.maxConcurrentTasks}]`);
|
||||
|
||||
// =================================================================
|
||||
// PREFLIGHT CHECK - CRITICAL: Worker MUST pass before task execution
|
||||
// Verifies: 1) Proxy available 2) Proxy connected 3) Anti-detect ready
|
||||
// =================================================================
|
||||
const preflight = await this.crawlRotator.preflight();
|
||||
if (!preflight.passed) {
|
||||
console.log(`[TaskWorker] ${this.friendlyName} PREFLIGHT FAILED for task ${task.id}: ${preflight.error}`);
|
||||
console.log(`[TaskWorker] Releasing task ${task.id} back to pending - worker cannot proceed without proxy/anti-detect`);
|
||||
|
||||
// Release task back to pending so another worker can pick it up
|
||||
await taskService.releaseTask(task.id);
|
||||
|
||||
// Wait before trying again - give proxies time to recover
|
||||
await this.sleep(30000); // 30 second wait on preflight failure
|
||||
return;
|
||||
}
|
||||
|
||||
console.log(`[TaskWorker] ${this.friendlyName} preflight PASSED for task ${task.id} (proxy: ${preflight.proxyIp}, ${preflight.responseTimeMs}ms)`);
|
||||
|
||||
this.activeTasks.set(task.id, task);
|
||||
|
||||
// Start task in background (don't await)
|
||||
@@ -611,6 +632,7 @@ export class TaskWorker {
|
||||
heartbeat: async () => {
|
||||
await taskService.heartbeat(task.id);
|
||||
},
|
||||
crawlRotator: this.crawlRotator,
|
||||
};
|
||||
|
||||
// Execute the task
|
||||
|
||||
Reference in New Issue
Block a user