Merge pull request 'fix(ci): Remove buildx cache and add preflight enforcement' (#48) from fix/ci-and-preflight-enforcement into master

Reviewed-on: https://code.cannabrands.app/Creationshop/dispensary-scraper/pulls/48
This commit is contained in:
kelly
2025-12-12 04:53:20 +00:00
7 changed files with 237 additions and 71 deletions

View File

@@ -69,6 +69,7 @@ steps:
# ===========================================
# MASTER DEPLOY: Parallel Docker builds
# NOTE: cache_from/cache_to removed because the buildx plugin splits each entry on commas, which breaks comma-separated values such as "type=registry,ref=..."
# ===========================================
docker-backend:
image: woodpeckerci/plugin-docker-buildx
@@ -86,10 +87,6 @@ steps:
from_secret: registry_password
platforms: linux/amd64
provenance: false
cache_from:
- "type=registry,ref=code.cannabrands.app/creationshop/dispensary-scraper:cache"
cache_to:
- "type=registry,ref=code.cannabrands.app/creationshop/dispensary-scraper:cache,mode=max"
build_args:
APP_BUILD_VERSION: ${CI_COMMIT_SHA:0:8}
APP_GIT_SHA: ${CI_COMMIT_SHA}
@@ -116,10 +113,6 @@ steps:
from_secret: registry_password
platforms: linux/amd64
provenance: false
cache_from:
- "type=registry,ref=code.cannabrands.app/creationshop/cannaiq-frontend:cache"
cache_to:
- "type=registry,ref=code.cannabrands.app/creationshop/cannaiq-frontend:cache,mode=max"
depends_on: []
when:
branch: master
@@ -141,10 +134,6 @@ steps:
from_secret: registry_password
platforms: linux/amd64
provenance: false
cache_from:
- "type=registry,ref=code.cannabrands.app/creationshop/findadispo-frontend:cache"
cache_to:
- "type=registry,ref=code.cannabrands.app/creationshop/findadispo-frontend:cache,mode=max"
depends_on: []
when:
branch: master
@@ -166,10 +155,6 @@ steps:
from_secret: registry_password
platforms: linux/amd64
provenance: false
cache_from:
- "type=registry,ref=code.cannabrands.app/creationshop/findagram-frontend:cache"
cache_to:
- "type=registry,ref=code.cannabrands.app/creationshop/findagram-frontend:cache,mode=max"
depends_on: []
when:
branch: master

View File

@@ -683,6 +683,118 @@ export class CrawlRotator {
const current = this.proxy.getCurrent();
return current?.timezone;
}
/**
 * Preflight check - verifies that the proxy and the anti-detect fingerprint are usable.
 * MUST be called before any task execution to ensure anonymity.
 *
 * Checks, in the order they run (the method returns at the first failure):
 * 1. Proxy available - `this.proxy.getCurrent()` must return a proxy
 * 2. Anti-detect ready - `this.userAgent.getCurrent()` must return a fingerprint with a user agent
 * 3. Proxy connectivity - an HTTPS GET to httpbin.org/ip through the proxy must succeed within 15 s
 *
 * The outcome of check 3 is reported to the proxy stats via `markSuccess` / `markFailed`.
 * A failure of that bookkeeping is logged but never changes the returned result and is
 * never reported as a proxy connection failure.
 *
 * @returns the per-check flags, the fingerprint summary, the response time and, on failure,
 *          an error message. `passed` is true only when all three checks succeeded.
 */
async preflight(): Promise<PreflightResult> {
  const result: PreflightResult = {
    passed: false,
    proxyAvailable: false,
    proxyConnected: false,
    antidetectReady: false,
    proxyIp: null,
    fingerprint: null,
    error: null,
    responseTimeMs: null,
  };

  // Step 1: Check proxy is available
  const currentProxy = this.proxy.getCurrent();
  if (!currentProxy) {
    result.error = 'No proxy available';
    console.log('[Preflight] FAILED - No proxy available');
    return result;
  }
  result.proxyAvailable = true;
  // This is the configured proxy host, not the address reported by the test endpoint.
  result.proxyIp = currentProxy.host;

  // Step 2: Check fingerprint/anti-detect is ready
  const fingerprint = this.userAgent.getCurrent();
  if (!fingerprint || !fingerprint.userAgent) {
    result.error = 'Anti-detect fingerprint not initialized';
    console.log('[Preflight] FAILED - No fingerprint');
    return result;
  }
  result.antidetectReady = true;
  result.fingerprint = {
    userAgent: fingerprint.userAgent,
    browserName: fingerprint.browserName,
    deviceCategory: fingerprint.deviceCategory,
  };

  // Step 3: Test proxy connectivity with an actual HTTP request.
  // Only success/failure of the request is used; the response body is not inspected.
  const proxyUrl = this.proxy.getProxyUrl(currentProxy);
  const testUrl = 'https://httpbin.org/ip';
  let elapsedMs = 0;
  try {
    const { default: axios } = await import('axios');
    const { HttpsProxyAgent } = await import('https-proxy-agent');
    const agent = new HttpsProxyAgent(proxyUrl);
    const startTime = Date.now();
    await axios.get(testUrl, {
      httpsAgent: agent,
      // The agent already routes through the proxy; stop axios from also applying
      // proxy settings taken from the environment.
      proxy: false,
      timeout: 15000, // 15 second timeout
      headers: {
        'User-Agent': fingerprint.userAgent,
        'Accept-Language': fingerprint.acceptLanguage,
        ...(fingerprint.secChUa && { 'sec-ch-ua': fingerprint.secChUa }),
        ...(fingerprint.secChUaPlatform && { 'sec-ch-ua-platform': fingerprint.secChUaPlatform }),
        ...(fingerprint.secChUaMobile && { 'sec-ch-ua-mobile': fingerprint.secChUaMobile }),
      },
    });
    elapsedMs = Date.now() - startTime;
  } catch (err: unknown) {
    const message = err instanceof Error ? err.message : String(err);
    const reason = message || 'Unknown error';
    result.error = `Proxy connection failed: ${reason}`;
    console.log(`[Preflight] FAILED - Proxy connection error: ${reason}`);
    // Mark failure on proxy stats; a bookkeeping error must not reject the preflight.
    try {
      await this.proxy.markFailed(currentProxy.id, reason);
    } catch (statsErr: unknown) {
      const statsMessage = statsErr instanceof Error ? statsErr.message : String(statsErr);
      console.error(`[Preflight] Could not record proxy failure: ${statsMessage}`);
    }
    return result;
  }

  result.responseTimeMs = elapsedMs;
  result.proxyConnected = true;
  result.passed = true;

  // Mark success on proxy stats. Kept outside the connectivity try/catch so that a
  // bookkeeping error cannot be mistaken for a proxy connection failure.
  try {
    await this.proxy.markSuccess(currentProxy.id, elapsedMs);
  } catch (statsErr: unknown) {
    const statsMessage = statsErr instanceof Error ? statsErr.message : String(statsErr);
    console.error(`[Preflight] Could not record proxy success: ${statsMessage}`);
  }
  console.log(`[Preflight] PASSED - Proxy ${currentProxy.host} connected (${elapsedMs}ms), UA: ${fingerprint.browserName}/${fingerprint.deviceCategory}`);
  return result;
}
}
/**
 * Outcome of a preflight check.
 *
 * The three boolean flags report the individual checks; `passed` is the overall verdict.
 * Detail fields are `null` until the corresponding information is known.
 */
export interface PreflightResult {
  /** Overall verdict of the preflight */
  passed: boolean;
  /** Error message when the preflight failed, otherwise null */
  error: string | null;

  /** Whether a proxy is loaded */
  proxyAvailable: boolean;
  /** Whether the fingerprint/anti-detect data is ready */
  antidetectReady: boolean;
  /** Whether the HTTP request through the proxy succeeded */
  proxyConnected: boolean;

  /** Current proxy IP */
  proxyIp: string | null;
  /** Time the proxied request took, in milliseconds */
  responseTimeMs: number | null;
  /** Summary of the fingerprint in use */
  fingerprint: {
    userAgent: string;
    browserName: string;
    deviceCategory: string;
  } | null;
}
// ============================================================

View File

@@ -9,4 +9,4 @@ export { handleProductDiscovery } from './product-discovery';
export { handleStoreDiscovery } from './store-discovery';
export { handleEntryPointDiscovery } from './entry-point-discovery';
export { handleAnalyticsRefresh } from './analytics-refresh';
export { handleProxyTest } from './proxy-test';
export { handleWhoami } from './whoami';

View File

@@ -1,51 +0,0 @@
/**
* Proxy Test Handler
* Tests proxy connectivity by fetching public IP via ipify
*/
import { TaskContext, TaskResult } from '../task-worker';
import { execSync } from 'child_process';
/**
 * Tests proxy connectivity by fetching the public IP from api.ipify.org through the
 * first active proxy found in the `proxies` table, using the `curl` binary.
 *
 * @param ctx task context; only `ctx.pool` is used
 * @returns `{ success: true, proxyIp, proxyHost, proxyPort }` when an IP was obtained,
 *          otherwise `{ success: false, error }`. This function does not throw.
 */
export async function handleProxyTest(ctx: TaskContext): Promise<TaskResult> {
  const { pool } = ctx;
  console.log('[ProxyTest] Testing proxy connection...');
  try {
    // Get active proxy from DB
    const proxyResult = await pool.query(`
      SELECT host, port, username, password
      FROM proxies
      WHERE is_active = true
      LIMIT 1
    `);
    if (proxyResult.rows.length === 0) {
      return { success: false, error: 'No active proxy configured' };
    }
    const p = proxyResult.rows[0];
    // NOTE(review): credentials are placed in the URL as stored; confirm they are
    // already URL-safe (e.g. no raw '@' or ':' in the password).
    const proxyUrl = p.username
      ? `http://${p.username}:${p.password}@${p.host}:${p.port}`
      : `http://${p.host}:${p.port}`;
    console.log(`[ProxyTest] Using proxy: ${p.host}:${p.port}`);

    // The URL is built from database values, so quote it for the POSIX shell:
    // wrap in single quotes and rewrite each embedded ' as '\''.
    const quotedProxyUrl = `'${String(proxyUrl).replace(/'/g, `'\\''`)}'`;
    const cmd = `curl -s --proxy ${quotedProxyUrl} 'https://api.ipify.org?format=json'`;

    // Fetch IP via proxy
    let output: string;
    try {
      output = execSync(cmd, { timeout: 30000 }).toString().trim();
    } catch (execError: unknown) {
      // The message of an execSync failure contains the whole command line, including
      // the proxy credentials, so only the exit status is reported.
      const status = (execError as { status?: number | null } | null)?.status;
      const detail = status == null ? 'no exit status (timed out or killed)' : `exit status ${status}`;
      const message = `curl through proxy ${p.host}:${p.port} failed: ${detail}`;
      console.error('[ProxyTest] Error:', message);
      return { success: false, error: message };
    }

    // Validate the response instead of trusting its shape.
    const data: unknown = JSON.parse(output);
    const ip = typeof data === 'object' && data !== null ? (data as { ip?: unknown }).ip : undefined;
    if (typeof ip !== 'string' || ip === '') {
      const message = 'Proxy test response did not contain an IP address';
      console.error('[ProxyTest] Error:', message);
      return { success: false, error: message };
    }

    console.log(`[ProxyTest] Proxy IP: ${ip}`);
    return {
      success: true,
      proxyIp: ip,
      proxyHost: p.host,
      proxyPort: p.port,
    };
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    console.error('[ProxyTest] Error:', message);
    return { success: false, error: message };
  }
}

View File

@@ -0,0 +1,80 @@
/**
* WhoAmI Handler
* Tests proxy connectivity and anti-detect by fetching public IP
* Reports: proxy IP, fingerprint info, and connection status
*/
import { TaskContext, TaskResult } from '../task-worker';
import { execSync } from 'child_process';
/**
 * Tests proxy connectivity and anti-detect readiness.
 *
 * When `ctx.crawlRotator` is present, its `preflight()` result is reported. Otherwise the
 * handler falls back to fetching the public IP from api.ipify.org with `curl` through the
 * first active proxy found in the `proxies` table.
 *
 * @param ctx task context; `ctx.pool` and the optional `ctx.crawlRotator` are used
 * @returns `success: true` with proxy details (and, on the preflight path, fingerprint
 *          and check flags), or `success: false` with an `error`. This function does not throw.
 */
export async function handleWhoami(ctx: TaskContext): Promise<TaskResult> {
  const { pool, crawlRotator } = ctx;
  console.log('[WhoAmI] Testing proxy and anti-detect...');
  try {
    // Use the preflight check which tests proxy + anti-detect
    if (crawlRotator) {
      const preflight = await crawlRotator.preflight();
      if (!preflight.passed) {
        return {
          success: false,
          error: preflight.error || 'Preflight check failed',
          proxyAvailable: preflight.proxyAvailable,
          proxyConnected: preflight.proxyConnected,
          antidetectReady: preflight.antidetectReady,
        };
      }
      console.log(`[WhoAmI] Proxy IP: ${preflight.proxyIp}, Response: ${preflight.responseTimeMs}ms`);
      console.log(`[WhoAmI] Fingerprint: ${preflight.fingerprint?.browserName}/${preflight.fingerprint?.deviceCategory}`);
      return {
        success: true,
        proxyIp: preflight.proxyIp,
        responseTimeMs: preflight.responseTimeMs,
        fingerprint: preflight.fingerprint,
        proxyAvailable: preflight.proxyAvailable,
        proxyConnected: preflight.proxyConnected,
        antidetectReady: preflight.antidetectReady,
      };
    }

    // Fallback: Direct proxy test without CrawlRotator
    const proxyResult = await pool.query(`
      SELECT host, port, username, password
      FROM proxies
      WHERE is_active = true
      LIMIT 1
    `);
    if (proxyResult.rows.length === 0) {
      return { success: false, error: 'No active proxy configured' };
    }
    const p = proxyResult.rows[0];
    // NOTE(review): credentials are placed in the URL as stored; confirm they are
    // already URL-safe (e.g. no raw '@' or ':' in the password).
    const proxyUrl = p.username
      ? `http://${p.username}:${p.password}@${p.host}:${p.port}`
      : `http://${p.host}:${p.port}`;
    console.log(`[WhoAmI] Using proxy: ${p.host}:${p.port}`);

    // The URL is built from database values, so quote it for the POSIX shell:
    // wrap in single quotes and rewrite each embedded ' as '\''.
    const quotedProxyUrl = `'${String(proxyUrl).replace(/'/g, `'\\''`)}'`;
    const cmd = `curl -s --proxy ${quotedProxyUrl} 'https://api.ipify.org?format=json'`;

    // Fetch IP via proxy
    let output: string;
    try {
      output = execSync(cmd, { timeout: 30000 }).toString().trim();
    } catch (execError: unknown) {
      // The message of an execSync failure contains the whole command line, including
      // the proxy credentials, so only the exit status is reported.
      const status = (execError as { status?: number | null } | null)?.status;
      const detail = status == null ? 'no exit status (timed out or killed)' : `exit status ${status}`;
      const message = `curl through proxy ${p.host}:${p.port} failed: ${detail}`;
      console.error('[WhoAmI] Error:', message);
      return { success: false, error: message };
    }

    // Validate the response instead of trusting its shape.
    const data: unknown = JSON.parse(output);
    const ip = typeof data === 'object' && data !== null ? (data as { ip?: unknown }).ip : undefined;
    if (typeof ip !== 'string' || ip === '') {
      const message = 'Proxy test response did not contain an IP address';
      console.error('[WhoAmI] Error:', message);
      return { success: false, error: message };
    }

    console.log(`[WhoAmI] Proxy IP: ${ip}`);
    return {
      success: true,
      proxyIp: ip,
      proxyHost: p.host,
      proxyPort: p.port,
    };
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    console.error('[WhoAmI] Error:', message);
    return { success: false, error: message };
  }
}

View File

@@ -32,7 +32,7 @@ export type TaskRole =
| 'payload_fetch' // NEW: Fetches from API, saves to disk
| 'product_refresh' // CHANGED: Now reads from local payload
| 'analytics_refresh'
| 'proxy_test'; // Tests proxy connectivity via ipify
| 'whoami'; // Tests proxy + anti-detect connectivity
export type TaskStatus =
| 'pending'
@@ -231,6 +231,24 @@ class TaskService {
);
}
/**
 * Release a claimed task back to pending (e.g., when preflight fails).
 * This allows another worker to pick it up.
 *
 * Only a task whose status is 'claimed' or 'running' is changed; its worker and
 * claim/start timestamps are cleared. When no such row exists nothing is modified,
 * and the log line says so instead of reporting a release that did not happen.
 *
 * @param taskId id of the `worker_tasks` row to release
 */
async releaseTask(taskId: number): Promise<void> {
  const result = await pool.query(
    `UPDATE worker_tasks
     SET status = 'pending',
         worker_id = NULL,
         claimed_at = NULL,
         started_at = NULL,
         updated_at = NOW()
     WHERE id = $1 AND status IN ('claimed', 'running')`,
    [taskId]
  );
  // rowCount tells whether the guarded UPDATE matched the task.
  if ((result.rowCount ?? 0) > 0) {
    console.log(`[TaskService] Task ${taskId} released back to pending`);
  } else {
    console.log(`[TaskService] Task ${taskId} not released - no task with that id in 'claimed' or 'running' status`);
  }
}
/**
* Mark a task as failed, with auto-retry if under max_retries
* Returns true if task was re-queued for retry, false if permanently failed

View File

@@ -59,7 +59,7 @@ import { handleProductDiscovery } from './handlers/product-discovery';
import { handleStoreDiscovery } from './handlers/store-discovery';
import { handleEntryPointDiscovery } from './handlers/entry-point-discovery';
import { handleAnalyticsRefresh } from './handlers/analytics-refresh';
import { handleProxyTest } from './handlers/proxy-test';
import { handleWhoami } from './handlers/whoami';
const POLL_INTERVAL_MS = parseInt(process.env.POLL_INTERVAL_MS || '5000');
const HEARTBEAT_INTERVAL_MS = parseInt(process.env.HEARTBEAT_INTERVAL_MS || '30000');
@@ -111,6 +111,7 @@ export interface TaskContext {
workerId: string;
task: WorkerTask;
heartbeat: () => Promise<void>;
crawlRotator?: CrawlRotator;
}
export interface TaskResult {
@@ -134,7 +135,7 @@ const TASK_HANDLERS: Record<TaskRole, TaskHandler> = {
store_discovery: handleStoreDiscovery,
entry_point_discovery: handleEntryPointDiscovery,
analytics_refresh: handleAnalyticsRefresh,
proxy_test: handleProxyTest, // Tests proxy via ipify
whoami: handleWhoami, // Tests proxy + anti-detect
};
/**
@@ -555,6 +556,26 @@ export class TaskWorker {
if (task) {
console.log(`[TaskWorker] ${this.friendlyName} claimed task ${task.id} (${task.role}) [${this.activeTasks.size + 1}/${this.maxConcurrentTasks}]`);
// =================================================================
// PREFLIGHT CHECK - CRITICAL: Worker MUST pass before task execution
// Verifies: 1) Proxy available 2) Proxy connected 3) Anti-detect ready
// =================================================================
const preflight = await this.crawlRotator.preflight();
if (!preflight.passed) {
console.log(`[TaskWorker] ${this.friendlyName} PREFLIGHT FAILED for task ${task.id}: ${preflight.error}`);
console.log(`[TaskWorker] Releasing task ${task.id} back to pending - worker cannot proceed without proxy/anti-detect`);
// Release task back to pending so another worker can pick it up
await taskService.releaseTask(task.id);
// Wait before trying again - give proxies time to recover
await this.sleep(30000); // 30 second wait on preflight failure
return;
}
console.log(`[TaskWorker] ${this.friendlyName} preflight PASSED for task ${task.id} (proxy: ${preflight.proxyIp}, ${preflight.responseTimeMs}ms)`);
this.activeTasks.set(task.id, task);
// Start task in background (don't await)
@@ -611,6 +632,7 @@ export class TaskWorker {
heartbeat: async () => {
await taskService.heartbeat(task.id);
},
crawlRotator: this.crawlRotator,
};
// Execute the task