Merge pull request 'fix(ci): Remove buildx cache and add preflight enforcement' (#48) from fix/ci-and-preflight-enforcement into master
Reviewed-on: https://code.cannabrands.app/Creationshop/dispensary-scraper/pulls/48
This commit is contained in:
@@ -69,6 +69,7 @@ steps:
|
|||||||
|
|
||||||
# ===========================================
|
# ===========================================
|
||||||
# MASTER DEPLOY: Parallel Docker builds
|
# MASTER DEPLOY: Parallel Docker builds
|
||||||
|
# NOTE: cache_from/cache_to removed due to plugin bug splitting on commas
|
||||||
# ===========================================
|
# ===========================================
|
||||||
docker-backend:
|
docker-backend:
|
||||||
image: woodpeckerci/plugin-docker-buildx
|
image: woodpeckerci/plugin-docker-buildx
|
||||||
@@ -86,10 +87,6 @@ steps:
|
|||||||
from_secret: registry_password
|
from_secret: registry_password
|
||||||
platforms: linux/amd64
|
platforms: linux/amd64
|
||||||
provenance: false
|
provenance: false
|
||||||
cache_from:
|
|
||||||
- "type=registry,ref=code.cannabrands.app/creationshop/dispensary-scraper:cache"
|
|
||||||
cache_to:
|
|
||||||
- "type=registry,ref=code.cannabrands.app/creationshop/dispensary-scraper:cache,mode=max"
|
|
||||||
build_args:
|
build_args:
|
||||||
APP_BUILD_VERSION: ${CI_COMMIT_SHA:0:8}
|
APP_BUILD_VERSION: ${CI_COMMIT_SHA:0:8}
|
||||||
APP_GIT_SHA: ${CI_COMMIT_SHA}
|
APP_GIT_SHA: ${CI_COMMIT_SHA}
|
||||||
@@ -116,10 +113,6 @@ steps:
|
|||||||
from_secret: registry_password
|
from_secret: registry_password
|
||||||
platforms: linux/amd64
|
platforms: linux/amd64
|
||||||
provenance: false
|
provenance: false
|
||||||
cache_from:
|
|
||||||
- "type=registry,ref=code.cannabrands.app/creationshop/cannaiq-frontend:cache"
|
|
||||||
cache_to:
|
|
||||||
- "type=registry,ref=code.cannabrands.app/creationshop/cannaiq-frontend:cache,mode=max"
|
|
||||||
depends_on: []
|
depends_on: []
|
||||||
when:
|
when:
|
||||||
branch: master
|
branch: master
|
||||||
@@ -141,10 +134,6 @@ steps:
|
|||||||
from_secret: registry_password
|
from_secret: registry_password
|
||||||
platforms: linux/amd64
|
platforms: linux/amd64
|
||||||
provenance: false
|
provenance: false
|
||||||
cache_from:
|
|
||||||
- "type=registry,ref=code.cannabrands.app/creationshop/findadispo-frontend:cache"
|
|
||||||
cache_to:
|
|
||||||
- "type=registry,ref=code.cannabrands.app/creationshop/findadispo-frontend:cache,mode=max"
|
|
||||||
depends_on: []
|
depends_on: []
|
||||||
when:
|
when:
|
||||||
branch: master
|
branch: master
|
||||||
@@ -166,10 +155,6 @@ steps:
|
|||||||
from_secret: registry_password
|
from_secret: registry_password
|
||||||
platforms: linux/amd64
|
platforms: linux/amd64
|
||||||
provenance: false
|
provenance: false
|
||||||
cache_from:
|
|
||||||
- "type=registry,ref=code.cannabrands.app/creationshop/findagram-frontend:cache"
|
|
||||||
cache_to:
|
|
||||||
- "type=registry,ref=code.cannabrands.app/creationshop/findagram-frontend:cache,mode=max"
|
|
||||||
depends_on: []
|
depends_on: []
|
||||||
when:
|
when:
|
||||||
branch: master
|
branch: master
|
||||||
|
|||||||
@@ -683,6 +683,118 @@ export class CrawlRotator {
|
|||||||
const current = this.proxy.getCurrent();
|
const current = this.proxy.getCurrent();
|
||||||
return current?.timezone;
|
return current?.timezone;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Preflight check - verifies that the proxy and the anti-detect fingerprint are usable.
 * MUST be called before any task execution to ensure anonymity.
 *
 * Checks run in this order and stop at the first failure:
 *   1. Proxy available    - `this.proxy.getCurrent()` returns a proxy
 *   2. Anti-detect ready  - `this.userAgent.getCurrent()` returns a fingerprint with a user agent
 *   3. Proxy connectivity - an HTTPS GET to httpbin.org/ip succeeds through the proxy,
 *                           sent with the fingerprint's headers
 *
 * The outcome of check 3 is reported to the proxy statistics via `markSuccess` /
 * `markFailed`. A failure while recording those statistics is logged and does not
 * change the returned result.
 *
 * @returns Promise<PreflightResult> with pass/fail status and details
 */
async preflight(): Promise<PreflightResult> {
  const result: PreflightResult = {
    passed: false,
    proxyAvailable: false,
    proxyConnected: false,
    antidetectReady: false,
    proxyIp: null,
    fingerprint: null,
    error: null,
    responseTimeMs: null,
  };

  // Check 1: a proxy must be loaded
  const currentProxy = this.proxy.getCurrent();
  if (!currentProxy) {
    result.error = 'No proxy available';
    console.log('[Preflight] FAILED - No proxy available');
    return result;
  }
  result.proxyAvailable = true;
  // Start with the configured host; replaced below by the address the test endpoint saw.
  result.proxyIp = currentProxy.host;

  // Check 2: fingerprint/anti-detect must be initialized
  const fingerprint = this.userAgent.getCurrent();
  if (!fingerprint || !fingerprint.userAgent) {
    result.error = 'Anti-detect fingerprint not initialized';
    console.log('[Preflight] FAILED - No fingerprint');
    return result;
  }
  result.antidetectReady = true;
  result.fingerprint = {
    userAgent: fingerprint.userAgent,
    browserName: fingerprint.browserName,
    deviceCategory: fingerprint.deviceCategory,
  };

  // Check 3: real HTTP request through the proxy.
  // httpbin.org/ip echoes the caller's address in its `origin` field.
  const proxyUrl = this.proxy.getProxyUrl(currentProxy);
  const testUrl = 'https://httpbin.org/ip';

  let elapsedMs: number | null = null;
  let failureReason: string | null = null;
  let origin: unknown;

  // Only the request itself lives in this try block, so that a bookkeeping error
  // can never be mistaken for a proxy connection failure.
  try {
    const { default: axios } = await import('axios');
    const { HttpsProxyAgent } = await import('https-proxy-agent');

    const agent = new HttpsProxyAgent(proxyUrl);
    const startTime = Date.now();

    const response = await axios.get(testUrl, {
      httpsAgent: agent,
      timeout: 15000, // 15 second timeout
      headers: {
        'User-Agent': fingerprint.userAgent,
        'Accept-Language': fingerprint.acceptLanguage,
        ...(fingerprint.secChUa && { 'sec-ch-ua': fingerprint.secChUa }),
        ...(fingerprint.secChUaPlatform && { 'sec-ch-ua-platform': fingerprint.secChUaPlatform }),
        ...(fingerprint.secChUaMobile && { 'sec-ch-ua-mobile': fingerprint.secChUaMobile }),
      },
    });

    elapsedMs = Date.now() - startTime;
    origin = response.data?.origin;
  } catch (err: unknown) {
    failureReason = err instanceof Error && err.message ? err.message : 'Unknown error';
  }

  if (elapsedMs === null) {
    const reason = failureReason ?? 'Unknown error';
    result.error = `Proxy connection failed: ${reason}`;
    console.log(`[Preflight] FAILED - Proxy connection error: ${reason}`);
  } else {
    result.responseTimeMs = elapsedMs;
    result.proxyConnected = true;
    result.passed = true;

    // Prefer the address the endpoint actually observed over the configured host.
    if (typeof origin === 'string' && origin.trim() !== '') {
      result.proxyIp = origin.trim();
    }

    console.log(`[Preflight] PASSED - Proxy ${currentProxy.host} connected (${elapsedMs}ms), UA: ${fingerprint.browserName}/${fingerprint.deviceCategory}`);
  }

  // Record the outcome on the proxy stats. This is bookkeeping only: if it fails,
  // the connectivity verdict above still stands.
  try {
    if (elapsedMs === null) {
      await this.proxy.markFailed(currentProxy.id, failureReason ?? 'Unknown error');
    } else {
      await this.proxy.markSuccess(currentProxy.id, elapsedMs);
    }
  } catch (statsErr: unknown) {
    const detail = statsErr instanceof Error ? statsErr.message : String(statsErr);
    console.log(`[Preflight] WARNING - Could not record proxy stats: ${detail}`);
  }

  return result;
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Outcome of a preflight check.
 *
 * The three boolean check flags are listed in the order the checks are performed.
 */
export interface PreflightResult {
  /** Overall verdict: true only when every check succeeded. */
  passed: boolean;

  /** Proxy check: is a proxy loaded? */
  proxyAvailable: boolean;
  /** Anti-detect check: is a fingerprint with a user agent ready? */
  antidetectReady: boolean;
  /** Connectivity check: did the HTTP request through the proxy succeed? */
  proxyConnected: boolean;

  /** Address reported for the current proxy; null when no proxy is available. */
  proxyIp: string | null;
  /** Summary of the fingerprint in use; null until the anti-detect check succeeds. */
  fingerprint: {
    userAgent: string;
    browserName: string;
    deviceCategory: string;
  } | null;

  /** Human-readable failure reason; null when nothing failed. */
  error: string | null;
  /** Duration of the test request through the proxy, in milliseconds; null if it did not complete. */
  responseTimeMs: number | null;
}
|
||||||
|
|
||||||
// ============================================================
|
// ============================================================
|
||||||
|
|||||||
@@ -9,4 +9,4 @@ export { handleProductDiscovery } from './product-discovery';
|
|||||||
export { handleStoreDiscovery } from './store-discovery';
|
export { handleStoreDiscovery } from './store-discovery';
|
||||||
export { handleEntryPointDiscovery } from './entry-point-discovery';
|
export { handleEntryPointDiscovery } from './entry-point-discovery';
|
||||||
export { handleAnalyticsRefresh } from './analytics-refresh';
|
export { handleAnalyticsRefresh } from './analytics-refresh';
|
||||||
export { handleProxyTest } from './proxy-test';
|
export { handleWhoami } from './whoami';
|
||||||
|
|||||||
@@ -1,51 +0,0 @@
|
|||||||
/**
|
|
||||||
* Proxy Test Handler
|
|
||||||
* Tests proxy connectivity by fetching public IP via ipify
|
|
||||||
*/
|
|
||||||
|
|
||||||
import { TaskContext, TaskResult } from '../task-worker';
|
|
||||||
import { execSync } from 'child_process';
|
|
||||||
|
|
||||||
/**
 * Proxy test task handler.
 *
 * Loads one active proxy from the `proxies` table and fetches the public IP from
 * api.ipify.org through it using `curl`.
 *
 * @param ctx - task context; only `ctx.pool` is used
 * @returns `{ success: true, proxyIp, proxyHost, proxyPort }` on success,
 *          `{ success: false, error }` otherwise. Never rejects.
 */
export async function handleProxyTest(ctx: TaskContext): Promise<TaskResult> {
  const { pool } = ctx;

  console.log('[ProxyTest] Testing proxy connection...');

  // Kept outside the try block so the catch handler can scrub it from error text.
  let proxyUrl: string | null = null;

  try {
    // Get active proxy from DB
    const proxyResult = await pool.query(`
      SELECT host, port, username, password
      FROM proxies
      WHERE is_active = true
      LIMIT 1
    `);

    if (proxyResult.rows.length === 0) {
      return { success: false, error: 'No active proxy configured' };
    }

    const p = proxyResult.rows[0];
    // NOTE(review): credentials are inserted as stored; confirm whether they need URL-encoding.
    proxyUrl = p.username
      ? `http://${p.username}:${p.password}@${p.host}:${p.port}`
      : `http://${p.host}:${p.port}`;

    console.log(`[ProxyTest] Using proxy: ${p.host}:${p.port}`);

    // Fetch IP via proxy. Arguments are passed as an array (no shell), so quotes or
    // other metacharacters in the stored credentials cannot alter the command, and
    // the async form keeps the event loop free while curl runs.
    const { execFile } = await import('child_process');
    const { promisify } = await import('util');
    const { stdout } = await promisify(execFile)(
      'curl',
      ['-s', '--proxy', proxyUrl, 'https://api.ipify.org?format=json'],
      { timeout: 30000 },
    );

    const data: unknown = JSON.parse(stdout.trim());
    const ip = typeof data === 'object' && data !== null ? (data as { ip?: unknown }).ip : undefined;
    if (typeof ip !== 'string' || ip.length === 0) {
      return { success: false, error: 'IP lookup returned an unexpected response' };
    }

    console.log(`[ProxyTest] Proxy IP: ${ip}`);

    return {
      success: true,
      proxyIp: ip,
      proxyHost: p.host,
      proxyPort: p.port,
    };
  } catch (error: unknown) {
    const raw = error instanceof Error ? error.message : String(error);
    // child_process errors embed the full command line, which contains the proxy
    // credentials; strip them before the text reaches logs or the task result.
    const message = proxyUrl ? raw.split(proxyUrl).join('<proxy>') : raw;
    console.error('[ProxyTest] Error:', message);
    return { success: false, error: message };
  }
}
|
|
||||||
80
backend/src/tasks/handlers/whoami.ts
Normal file
80
backend/src/tasks/handlers/whoami.ts
Normal file
@@ -0,0 +1,80 @@
|
|||||||
|
/**
|
||||||
|
* WhoAmI Handler
|
||||||
|
* Tests proxy connectivity and anti-detect by fetching public IP
|
||||||
|
* Reports: proxy IP, fingerprint info, and connection status
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { TaskContext, TaskResult } from '../task-worker';
|
||||||
|
import { execSync } from 'child_process';
|
||||||
|
|
||||||
|
/**
 * WhoAmI task handler.
 *
 * When `ctx.crawlRotator` is present, runs its preflight check and reports the
 * proxy address, response time, fingerprint summary and the individual check flags.
 * Without a rotator, falls back to loading one active proxy from the `proxies`
 * table and fetching the public IP from api.ipify.org through it using `curl`.
 *
 * @param ctx - task context; `ctx.pool` and the optional `ctx.crawlRotator` are used
 * @returns a result with `success: true` plus details on success, or
 *          `{ success: false, error, ... }` otherwise. Errors thrown inside the
 *          handler are caught and reported in the result.
 */
export async function handleWhoami(ctx: TaskContext): Promise<TaskResult> {
  const { pool, crawlRotator } = ctx;

  console.log('[WhoAmI] Testing proxy and anti-detect...');

  // Kept outside the try block so the catch handler can scrub it from error text.
  let proxyUrl: string | null = null;

  try {
    // Use the preflight check which tests proxy + anti-detect
    if (crawlRotator) {
      const preflight = await crawlRotator.preflight();

      if (!preflight.passed) {
        return {
          success: false,
          error: preflight.error || 'Preflight check failed',
          proxyAvailable: preflight.proxyAvailable,
          proxyConnected: preflight.proxyConnected,
          antidetectReady: preflight.antidetectReady,
        };
      }

      console.log(`[WhoAmI] Proxy IP: ${preflight.proxyIp}, Response: ${preflight.responseTimeMs}ms`);
      console.log(`[WhoAmI] Fingerprint: ${preflight.fingerprint?.browserName}/${preflight.fingerprint?.deviceCategory}`);

      return {
        success: true,
        proxyIp: preflight.proxyIp,
        responseTimeMs: preflight.responseTimeMs,
        fingerprint: preflight.fingerprint,
        proxyAvailable: preflight.proxyAvailable,
        proxyConnected: preflight.proxyConnected,
        antidetectReady: preflight.antidetectReady,
      };
    }

    // Fallback: Direct proxy test without CrawlRotator
    const proxyResult = await pool.query(`
      SELECT host, port, username, password
      FROM proxies
      WHERE is_active = true
      LIMIT 1
    `);

    if (proxyResult.rows.length === 0) {
      return { success: false, error: 'No active proxy configured' };
    }

    const p = proxyResult.rows[0];
    // NOTE(review): credentials are inserted as stored; confirm whether they need URL-encoding.
    proxyUrl = p.username
      ? `http://${p.username}:${p.password}@${p.host}:${p.port}`
      : `http://${p.host}:${p.port}`;

    console.log(`[WhoAmI] Using proxy: ${p.host}:${p.port}`);

    // Fetch IP via proxy. Arguments are passed as an array (no shell), so quotes or
    // other metacharacters in the stored credentials cannot alter the command, and
    // the async form keeps the event loop free while curl runs.
    const { execFile } = await import('child_process');
    const { promisify } = await import('util');
    const { stdout } = await promisify(execFile)(
      'curl',
      ['-s', '--proxy', proxyUrl, 'https://api.ipify.org?format=json'],
      { timeout: 30000 },
    );

    const data: unknown = JSON.parse(stdout.trim());
    const ip = typeof data === 'object' && data !== null ? (data as { ip?: unknown }).ip : undefined;
    if (typeof ip !== 'string' || ip.length === 0) {
      return { success: false, error: 'IP lookup returned an unexpected response' };
    }

    console.log(`[WhoAmI] Proxy IP: ${ip}`);

    return {
      success: true,
      proxyIp: ip,
      proxyHost: p.host,
      proxyPort: p.port,
    };
  } catch (error: unknown) {
    const raw = error instanceof Error ? error.message : String(error);
    // child_process errors embed the full command line, which contains the proxy
    // credentials; strip them before the text reaches logs or the task result.
    const message = proxyUrl ? raw.split(proxyUrl).join('<proxy>') : raw;
    console.error('[WhoAmI] Error:', message);
    return { success: false, error: message };
  }
}
|
||||||
@@ -32,7 +32,7 @@ export type TaskRole =
|
|||||||
| 'payload_fetch' // NEW: Fetches from API, saves to disk
|
| 'payload_fetch' // NEW: Fetches from API, saves to disk
|
||||||
| 'product_refresh' // CHANGED: Now reads from local payload
|
| 'product_refresh' // CHANGED: Now reads from local payload
|
||||||
| 'analytics_refresh'
|
| 'analytics_refresh'
|
||||||
| 'proxy_test'; // Tests proxy connectivity via ipify
|
| 'whoami'; // Tests proxy + anti-detect connectivity
|
||||||
|
|
||||||
export type TaskStatus =
|
export type TaskStatus =
|
||||||
| 'pending'
|
| 'pending'
|
||||||
@@ -231,6 +231,24 @@ class TaskService {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Release a claimed task back to pending (e.g., when preflight fails)
 * so that another worker can pick it up.
 *
 * Only a task whose status is currently 'claimed' or 'running' is updated: its
 * status becomes 'pending' and its worker_id, claimed_at and started_at are cleared.
 * If no such row matches, nothing is changed and that fact is logged instead of a
 * misleading "released" message.
 *
 * @param taskId - id of the worker_tasks row to release
 */
async releaseTask(taskId: number): Promise<void> {
  // NOTE(review): relies on the query result exposing `rowCount`, as node-postgres
  // does - confirm against the `pool` imported by this file.
  const { rowCount } = await pool.query(
    `UPDATE worker_tasks
     SET status = 'pending',
         worker_id = NULL,
         claimed_at = NULL,
         started_at = NULL,
         updated_at = NOW()
     WHERE id = $1 AND status IN ('claimed', 'running')`,
    [taskId]
  );

  if (rowCount) {
    console.log(`[TaskService] Task ${taskId} released back to pending`);
  } else {
    console.log(`[TaskService] Task ${taskId} not released - no row in 'claimed' or 'running' status matched`);
  }
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Mark a task as failed, with auto-retry if under max_retries
|
* Mark a task as failed, with auto-retry if under max_retries
|
||||||
* Returns true if task was re-queued for retry, false if permanently failed
|
* Returns true if task was re-queued for retry, false if permanently failed
|
||||||
|
|||||||
@@ -59,7 +59,7 @@ import { handleProductDiscovery } from './handlers/product-discovery';
|
|||||||
import { handleStoreDiscovery } from './handlers/store-discovery';
|
import { handleStoreDiscovery } from './handlers/store-discovery';
|
||||||
import { handleEntryPointDiscovery } from './handlers/entry-point-discovery';
|
import { handleEntryPointDiscovery } from './handlers/entry-point-discovery';
|
||||||
import { handleAnalyticsRefresh } from './handlers/analytics-refresh';
|
import { handleAnalyticsRefresh } from './handlers/analytics-refresh';
|
||||||
import { handleProxyTest } from './handlers/proxy-test';
|
import { handleWhoami } from './handlers/whoami';
|
||||||
|
|
||||||
const POLL_INTERVAL_MS = parseInt(process.env.POLL_INTERVAL_MS || '5000');
|
const POLL_INTERVAL_MS = parseInt(process.env.POLL_INTERVAL_MS || '5000');
|
||||||
const HEARTBEAT_INTERVAL_MS = parseInt(process.env.HEARTBEAT_INTERVAL_MS || '30000');
|
const HEARTBEAT_INTERVAL_MS = parseInt(process.env.HEARTBEAT_INTERVAL_MS || '30000');
|
||||||
@@ -111,6 +111,7 @@ export interface TaskContext {
|
|||||||
workerId: string;
|
workerId: string;
|
||||||
task: WorkerTask;
|
task: WorkerTask;
|
||||||
heartbeat: () => Promise<void>;
|
heartbeat: () => Promise<void>;
|
||||||
|
crawlRotator?: CrawlRotator;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface TaskResult {
|
export interface TaskResult {
|
||||||
@@ -134,7 +135,7 @@ const TASK_HANDLERS: Record<TaskRole, TaskHandler> = {
|
|||||||
store_discovery: handleStoreDiscovery,
|
store_discovery: handleStoreDiscovery,
|
||||||
entry_point_discovery: handleEntryPointDiscovery,
|
entry_point_discovery: handleEntryPointDiscovery,
|
||||||
analytics_refresh: handleAnalyticsRefresh,
|
analytics_refresh: handleAnalyticsRefresh,
|
||||||
proxy_test: handleProxyTest, // Tests proxy via ipify
|
whoami: handleWhoami, // Tests proxy + anti-detect
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -555,6 +556,26 @@ export class TaskWorker {
|
|||||||
|
|
||||||
if (task) {
|
if (task) {
|
||||||
console.log(`[TaskWorker] ${this.friendlyName} claimed task ${task.id} (${task.role}) [${this.activeTasks.size + 1}/${this.maxConcurrentTasks}]`);
|
console.log(`[TaskWorker] ${this.friendlyName} claimed task ${task.id} (${task.role}) [${this.activeTasks.size + 1}/${this.maxConcurrentTasks}]`);
|
||||||
|
|
||||||
|
// =================================================================
|
||||||
|
// PREFLIGHT CHECK - CRITICAL: Worker MUST pass before task execution
|
||||||
|
// Verifies: 1) Proxy available 2) Proxy connected 3) Anti-detect ready
|
||||||
|
// =================================================================
|
||||||
|
const preflight = await this.crawlRotator.preflight();
|
||||||
|
if (!preflight.passed) {
|
||||||
|
console.log(`[TaskWorker] ${this.friendlyName} PREFLIGHT FAILED for task ${task.id}: ${preflight.error}`);
|
||||||
|
console.log(`[TaskWorker] Releasing task ${task.id} back to pending - worker cannot proceed without proxy/anti-detect`);
|
||||||
|
|
||||||
|
// Release task back to pending so another worker can pick it up
|
||||||
|
await taskService.releaseTask(task.id);
|
||||||
|
|
||||||
|
// Wait before trying again - give proxies time to recover
|
||||||
|
await this.sleep(30000); // 30 second wait on preflight failure
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log(`[TaskWorker] ${this.friendlyName} preflight PASSED for task ${task.id} (proxy: ${preflight.proxyIp}, ${preflight.responseTimeMs}ms)`);
|
||||||
|
|
||||||
this.activeTasks.set(task.id, task);
|
this.activeTasks.set(task.id, task);
|
||||||
|
|
||||||
// Start task in background (don't await)
|
// Start task in background (don't await)
|
||||||
@@ -611,6 +632,7 @@ export class TaskWorker {
|
|||||||
heartbeat: async () => {
|
heartbeat: async () => {
|
||||||
await taskService.heartbeat(task.id);
|
await taskService.heartbeat(task.id);
|
||||||
},
|
},
|
||||||
|
crawlRotator: this.crawlRotator,
|
||||||
};
|
};
|
||||||
|
|
||||||
// Execute the task
|
// Execute the task
|
||||||
|
|||||||
Reference in New Issue
Block a user