feat(workers): Add dual-transport preflight system
Workers now run both curl and http (Puppeteer) preflights on startup: - curl-preflight.ts: Tests axios + proxy via httpbin.org - puppeteer-preflight.ts: Tests browser + StealthPlugin via fingerprint.com (with amiunique.org fallback) - Migration 084: Adds preflight columns to worker_registry and method column to worker_tasks - Workers report preflight status, IP, fingerprint, and response time - Tasks can require specific transport method (curl/http) - Dashboard shows Transport column with preflight status badges 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
253
backend/migrations/084_dual_transport_preflight.sql
Normal file
253
backend/migrations/084_dual_transport_preflight.sql
Normal file
@@ -0,0 +1,253 @@
|
||||
-- Migration 084: Dual Transport Preflight System
|
||||
-- Workers run both curl and http (Puppeteer) preflights on startup
|
||||
-- Tasks can require a specific transport method
|
||||
|
||||
-- ===================================================================
|
||||
-- PART 1: Add preflight columns to worker_registry
|
||||
-- ===================================================================
|
||||
|
||||
-- Preflight status for curl/axios transport (proxy-based)
|
||||
ALTER TABLE worker_registry
|
||||
ADD COLUMN IF NOT EXISTS preflight_curl_status VARCHAR(20) DEFAULT 'pending';
|
||||
|
||||
-- Preflight status for http/Puppeteer transport (browser-based)
|
||||
ALTER TABLE worker_registry
|
||||
ADD COLUMN IF NOT EXISTS preflight_http_status VARCHAR(20) DEFAULT 'pending';
|
||||
|
||||
-- Timestamps for when each preflight completed
|
||||
ALTER TABLE worker_registry
|
||||
ADD COLUMN IF NOT EXISTS preflight_curl_at TIMESTAMPTZ;
|
||||
|
||||
ALTER TABLE worker_registry
|
||||
ADD COLUMN IF NOT EXISTS preflight_http_at TIMESTAMPTZ;
|
||||
|
||||
-- Error messages for failed preflights
|
||||
ALTER TABLE worker_registry
|
||||
ADD COLUMN IF NOT EXISTS preflight_curl_error TEXT;
|
||||
|
||||
ALTER TABLE worker_registry
|
||||
ADD COLUMN IF NOT EXISTS preflight_http_error TEXT;
|
||||
|
||||
-- Response time for successful preflights (ms)
|
||||
ALTER TABLE worker_registry
|
||||
ADD COLUMN IF NOT EXISTS preflight_curl_ms INTEGER;
|
||||
|
||||
ALTER TABLE worker_registry
|
||||
ADD COLUMN IF NOT EXISTS preflight_http_ms INTEGER;
|
||||
|
||||
-- Constraints for preflight status values
|
||||
ALTER TABLE worker_registry
|
||||
DROP CONSTRAINT IF EXISTS valid_preflight_curl_status;
|
||||
|
||||
ALTER TABLE worker_registry
|
||||
ADD CONSTRAINT valid_preflight_curl_status
|
||||
CHECK (preflight_curl_status IN ('pending', 'passed', 'failed', 'skipped'));
|
||||
|
||||
ALTER TABLE worker_registry
|
||||
DROP CONSTRAINT IF EXISTS valid_preflight_http_status;
|
||||
|
||||
ALTER TABLE worker_registry
|
||||
ADD CONSTRAINT valid_preflight_http_status
|
||||
CHECK (preflight_http_status IN ('pending', 'passed', 'failed', 'skipped'));
|
||||
|
||||
-- ===================================================================
|
||||
-- PART 2: Add method column to worker_tasks
|
||||
-- ===================================================================
|
||||
|
||||
-- Transport method requirement for the task
|
||||
-- NULL = no preference (any worker can claim)
|
||||
-- 'curl' = requires curl/axios transport (proxy-based, fast)
|
||||
-- 'http' = requires http/Puppeteer transport (browser-based, anti-detect)
|
||||
ALTER TABLE worker_tasks
|
||||
ADD COLUMN IF NOT EXISTS method VARCHAR(10);
|
||||
|
||||
-- Constraint for valid method values
|
||||
ALTER TABLE worker_tasks
|
||||
DROP CONSTRAINT IF EXISTS valid_task_method;
|
||||
|
||||
ALTER TABLE worker_tasks
|
||||
ADD CONSTRAINT valid_task_method
|
||||
CHECK (method IS NULL OR method IN ('curl', 'http'));
|
||||
|
||||
-- Index for method-based task claiming
|
||||
CREATE INDEX IF NOT EXISTS idx_worker_tasks_method
|
||||
ON worker_tasks(method)
|
||||
WHERE status = 'pending';
|
||||
|
||||
-- Set default method for all existing pending tasks to 'http'
|
||||
-- ALL current tasks require Puppeteer/browser-based transport
|
||||
UPDATE worker_tasks
|
||||
SET method = 'http'
|
||||
WHERE method IS NULL;
|
||||
|
||||
-- ===================================================================
|
||||
-- PART 3: Update claim_task function for method compatibility
|
||||
-- ===================================================================
|
||||
|
||||
CREATE OR REPLACE FUNCTION claim_task(
|
||||
p_role VARCHAR(50),
|
||||
p_worker_id VARCHAR(100),
|
||||
p_curl_passed BOOLEAN DEFAULT TRUE,
|
||||
p_http_passed BOOLEAN DEFAULT FALSE
|
||||
) RETURNS worker_tasks AS $$
|
||||
DECLARE
|
||||
claimed_task worker_tasks;
|
||||
BEGIN
|
||||
UPDATE worker_tasks
|
||||
SET
|
||||
status = 'claimed',
|
||||
worker_id = p_worker_id,
|
||||
claimed_at = NOW(),
|
||||
updated_at = NOW()
|
||||
WHERE id = (
|
||||
SELECT id FROM worker_tasks
|
||||
WHERE role = p_role
|
||||
AND status = 'pending'
|
||||
AND (scheduled_for IS NULL OR scheduled_for <= NOW())
|
||||
-- Method compatibility: worker must have passed the required preflight
|
||||
AND (
|
||||
method IS NULL -- No preference, any worker can claim
|
||||
OR (method = 'curl' AND p_curl_passed = TRUE)
|
||||
OR (method = 'http' AND p_http_passed = TRUE)
|
||||
)
|
||||
-- Exclude stores that already have an active task
|
||||
AND (dispensary_id IS NULL OR dispensary_id NOT IN (
|
||||
SELECT dispensary_id FROM worker_tasks
|
||||
WHERE status IN ('claimed', 'running')
|
||||
AND dispensary_id IS NOT NULL
|
||||
))
|
||||
ORDER BY priority DESC, created_at ASC
|
||||
LIMIT 1
|
||||
FOR UPDATE SKIP LOCKED
|
||||
)
|
||||
RETURNING * INTO claimed_task;
|
||||
|
||||
RETURN claimed_task;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
|
||||
-- ===================================================================
|
||||
-- PART 4: Update v_active_workers view
|
||||
-- ===================================================================
|
||||
|
||||
DROP VIEW IF EXISTS v_active_workers;
|
||||
|
||||
CREATE VIEW v_active_workers AS
|
||||
SELECT
|
||||
wr.id,
|
||||
wr.worker_id,
|
||||
wr.friendly_name,
|
||||
wr.role,
|
||||
wr.status,
|
||||
wr.pod_name,
|
||||
wr.hostname,
|
||||
wr.started_at,
|
||||
wr.last_heartbeat_at,
|
||||
wr.last_task_at,
|
||||
wr.tasks_completed,
|
||||
wr.tasks_failed,
|
||||
wr.current_task_id,
|
||||
-- Preflight status
|
||||
wr.preflight_curl_status,
|
||||
wr.preflight_http_status,
|
||||
wr.preflight_curl_at,
|
||||
wr.preflight_http_at,
|
||||
wr.preflight_curl_error,
|
||||
wr.preflight_http_error,
|
||||
wr.preflight_curl_ms,
|
||||
wr.preflight_http_ms,
|
||||
-- Computed fields
|
||||
EXTRACT(EPOCH FROM (NOW() - wr.last_heartbeat_at)) as seconds_since_heartbeat,
|
||||
CASE
|
||||
WHEN wr.status = 'offline' THEN 'offline'
|
||||
WHEN wr.last_heartbeat_at < NOW() - INTERVAL '2 minutes' THEN 'stale'
|
||||
WHEN wr.current_task_id IS NOT NULL THEN 'busy'
|
||||
ELSE 'ready'
|
||||
END as health_status,
|
||||
-- Capability flags (can this worker handle curl/http tasks?)
|
||||
(wr.preflight_curl_status = 'passed') as can_curl,
|
||||
(wr.preflight_http_status = 'passed') as can_http
|
||||
FROM worker_registry wr
|
||||
WHERE wr.status != 'terminated'
|
||||
ORDER BY wr.status = 'active' DESC, wr.last_heartbeat_at DESC;
|
||||
|
||||
-- ===================================================================
|
||||
-- PART 5: View for task queue with method info
|
||||
-- ===================================================================
|
||||
|
||||
DROP VIEW IF EXISTS v_task_history;
|
||||
|
||||
CREATE VIEW v_task_history AS
|
||||
SELECT
|
||||
t.id,
|
||||
t.role,
|
||||
t.dispensary_id,
|
||||
d.name as dispensary_name,
|
||||
t.platform,
|
||||
t.status,
|
||||
t.priority,
|
||||
t.method,
|
||||
t.worker_id,
|
||||
t.scheduled_for,
|
||||
t.claimed_at,
|
||||
t.started_at,
|
||||
t.completed_at,
|
||||
t.error_message,
|
||||
t.retry_count,
|
||||
t.created_at,
|
||||
EXTRACT(EPOCH FROM (t.completed_at - t.started_at)) as duration_sec
|
||||
FROM worker_tasks t
|
||||
LEFT JOIN dispensaries d ON d.id = t.dispensary_id
|
||||
ORDER BY t.created_at DESC;
|
||||
|
||||
-- ===================================================================
|
||||
-- PART 6: Helper function to update worker preflight status
|
||||
-- ===================================================================
|
||||
|
||||
CREATE OR REPLACE FUNCTION update_worker_preflight(
|
||||
p_worker_id VARCHAR(100),
|
||||
p_transport VARCHAR(10), -- 'curl' or 'http'
|
||||
p_status VARCHAR(20), -- 'passed', 'failed', 'skipped'
|
||||
p_response_ms INTEGER DEFAULT NULL,
|
||||
p_error TEXT DEFAULT NULL
|
||||
) RETURNS VOID AS $$
|
||||
BEGIN
|
||||
IF p_transport = 'curl' THEN
|
||||
UPDATE worker_registry
|
||||
SET
|
||||
preflight_curl_status = p_status,
|
||||
preflight_curl_at = NOW(),
|
||||
preflight_curl_ms = p_response_ms,
|
||||
preflight_curl_error = p_error,
|
||||
updated_at = NOW()
|
||||
WHERE worker_id = p_worker_id;
|
||||
ELSIF p_transport = 'http' THEN
|
||||
UPDATE worker_registry
|
||||
SET
|
||||
preflight_http_status = p_status,
|
||||
preflight_http_at = NOW(),
|
||||
preflight_http_ms = p_response_ms,
|
||||
preflight_http_error = p_error,
|
||||
updated_at = NOW()
|
||||
WHERE worker_id = p_worker_id;
|
||||
END IF;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
|
||||
-- ===================================================================
|
||||
-- Comments
|
||||
-- ===================================================================
|
||||
|
||||
COMMENT ON COLUMN worker_registry.preflight_curl_status IS 'Status of curl/axios preflight: pending, passed, failed, skipped';
|
||||
COMMENT ON COLUMN worker_registry.preflight_http_status IS 'Status of http/Puppeteer preflight: pending, passed, failed, skipped';
|
||||
COMMENT ON COLUMN worker_registry.preflight_curl_at IS 'When curl preflight completed';
|
||||
COMMENT ON COLUMN worker_registry.preflight_http_at IS 'When http preflight completed';
|
||||
COMMENT ON COLUMN worker_registry.preflight_curl_error IS 'Error message if curl preflight failed';
|
||||
COMMENT ON COLUMN worker_registry.preflight_http_error IS 'Error message if http preflight failed';
|
||||
COMMENT ON COLUMN worker_registry.preflight_curl_ms IS 'Response time of successful curl preflight (ms)';
|
||||
COMMENT ON COLUMN worker_registry.preflight_http_ms IS 'Response time of successful http preflight (ms)';
|
||||
|
||||
COMMENT ON COLUMN worker_tasks.method IS 'Transport method required: NULL=any, curl=proxy-based, http=browser-based';
|
||||
|
||||
COMMENT ON FUNCTION claim_task IS 'Atomically claim a task, respecting method requirements and per-store locking';
|
||||
COMMENT ON FUNCTION update_worker_preflight IS 'Update a workers preflight status for a given transport';
|
||||
100
backend/src/services/curl-preflight.ts
Normal file
100
backend/src/services/curl-preflight.ts
Normal file
@@ -0,0 +1,100 @@
|
||||
/**
|
||||
* Curl Preflight - Verify curl/axios transport works through proxy
|
||||
*
|
||||
* Tests:
|
||||
* 1. Proxy is available and active
|
||||
* 2. HTTP request through proxy succeeds
|
||||
* 3. Anti-detect headers are properly set
|
||||
*
|
||||
* Use case: Fast, simple API requests that don't need browser fingerprint
|
||||
*/
|
||||
|
||||
import axios from 'axios';
|
||||
import { HttpsProxyAgent } from 'https-proxy-agent';
|
||||
import { CrawlRotator, PreflightResult } from './crawl-rotator';
|
||||
|
||||
export interface CurlPreflightResult extends PreflightResult {
|
||||
method: 'curl';
|
||||
}
|
||||
|
||||
/**
|
||||
* Run curl preflight check
|
||||
* Tests proxy connectivity using axios/curl through the proxy
|
||||
*/
|
||||
export async function runCurlPreflight(
|
||||
crawlRotator: CrawlRotator
|
||||
): Promise<CurlPreflightResult> {
|
||||
const result: CurlPreflightResult = {
|
||||
method: 'curl',
|
||||
passed: false,
|
||||
proxyAvailable: false,
|
||||
proxyConnected: false,
|
||||
antidetectReady: false,
|
||||
proxyIp: null,
|
||||
fingerprint: null,
|
||||
error: null,
|
||||
responseTimeMs: null,
|
||||
};
|
||||
|
||||
// Step 1: Check proxy is available
|
||||
const currentProxy = crawlRotator.proxy.getCurrent();
|
||||
if (!currentProxy) {
|
||||
result.error = 'No proxy available';
|
||||
console.log('[CurlPreflight] FAILED - No proxy available');
|
||||
return result;
|
||||
}
|
||||
result.proxyAvailable = true;
|
||||
result.proxyIp = currentProxy.host;
|
||||
|
||||
// Step 2: Check fingerprint/anti-detect is ready
|
||||
const fingerprint = crawlRotator.userAgent.getCurrent();
|
||||
if (!fingerprint || !fingerprint.userAgent) {
|
||||
result.error = 'Anti-detect fingerprint not initialized';
|
||||
console.log('[CurlPreflight] FAILED - No fingerprint');
|
||||
return result;
|
||||
}
|
||||
result.antidetectReady = true;
|
||||
result.fingerprint = {
|
||||
userAgent: fingerprint.userAgent,
|
||||
browserName: fingerprint.browserName,
|
||||
deviceCategory: fingerprint.deviceCategory,
|
||||
};
|
||||
|
||||
// Step 3: Test proxy connectivity with an actual HTTP request
|
||||
const proxyUrl = crawlRotator.proxy.getProxyUrl(currentProxy);
|
||||
const testUrl = 'https://httpbin.org/ip';
|
||||
|
||||
try {
|
||||
const agent = new HttpsProxyAgent(proxyUrl);
|
||||
const startTime = Date.now();
|
||||
|
||||
const response = await axios.get(testUrl, {
|
||||
httpsAgent: agent,
|
||||
timeout: 15000, // 15 second timeout
|
||||
headers: {
|
||||
'User-Agent': fingerprint.userAgent,
|
||||
'Accept-Language': fingerprint.acceptLanguage,
|
||||
...(fingerprint.secChUa && { 'sec-ch-ua': fingerprint.secChUa }),
|
||||
...(fingerprint.secChUaPlatform && { 'sec-ch-ua-platform': fingerprint.secChUaPlatform }),
|
||||
...(fingerprint.secChUaMobile && { 'sec-ch-ua-mobile': fingerprint.secChUaMobile }),
|
||||
},
|
||||
});
|
||||
|
||||
result.responseTimeMs = Date.now() - startTime;
|
||||
result.proxyConnected = true;
|
||||
result.passed = true;
|
||||
|
||||
// Mark success on proxy stats
|
||||
await crawlRotator.proxy.markSuccess(currentProxy.id, result.responseTimeMs);
|
||||
|
||||
console.log(`[CurlPreflight] PASSED - Proxy ${currentProxy.host} connected (${result.responseTimeMs}ms), UA: ${fingerprint.browserName}/${fingerprint.deviceCategory}`);
|
||||
} catch (err: any) {
|
||||
result.error = `Proxy connection failed: ${err.message || 'Unknown error'}`;
|
||||
console.log(`[CurlPreflight] FAILED - Proxy connection error: ${err.message}`);
|
||||
|
||||
// Mark failure on proxy stats
|
||||
await crawlRotator.proxy.markFailed(currentProxy.id, err.message);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
399
backend/src/services/puppeteer-preflight.ts
Normal file
399
backend/src/services/puppeteer-preflight.ts
Normal file
@@ -0,0 +1,399 @@
|
||||
/**
|
||||
* Puppeteer Preflight - Verify browser-based transport works with anti-detect
|
||||
*
|
||||
* Uses Puppeteer + StealthPlugin to:
|
||||
* 1. Launch headless browser with stealth mode + PROXY
|
||||
* 2. Visit fingerprint.com demo to verify anti-detect and confirm proxy IP
|
||||
* 3. Establish session by visiting Dutchie embedded menu
|
||||
* 4. Make GraphQL request from browser context
|
||||
* 5. Verify we get a valid response (not blocked)
|
||||
*
|
||||
* Use case: Anti-detect scraping that needs real browser fingerprint through proxy
|
||||
*
|
||||
* Based on test-intercept.js which successfully captures 1000+ products
|
||||
*/
|
||||
|
||||
import { PreflightResult, CrawlRotator } from './crawl-rotator';
|
||||
|
||||
// GraphQL hash for FilteredProducts query - MUST match CLAUDE.md
|
||||
const FILTERED_PRODUCTS_HASH = 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0';
|
||||
|
||||
// Test dispensary - AZ-Deeply-Rooted (known working)
|
||||
const TEST_CNAME = 'AZ-Deeply-Rooted';
|
||||
const TEST_PLATFORM_ID = '6405ef617056e8014d79101b';
|
||||
|
||||
// Anti-detect verification sites (primary + fallback)
|
||||
const FINGERPRINT_DEMO_URL = 'https://demo.fingerprint.com/';
|
||||
const AMIUNIQUE_URL = 'https://amiunique.org/fingerprint';
|
||||
|
||||
export interface PuppeteerPreflightResult extends PreflightResult {
|
||||
method: 'http';
|
||||
/** Number of products returned (proves API access) */
|
||||
productsReturned?: number;
|
||||
/** Browser user agent used */
|
||||
browserUserAgent?: string;
|
||||
/** Bot detection result from fingerprint.com */
|
||||
botDetection?: {
|
||||
detected: boolean;
|
||||
probability?: number;
|
||||
type?: string;
|
||||
};
|
||||
/** Expected proxy IP (from pool) */
|
||||
expectedProxyIp?: string;
|
||||
/** Whether IP verification passed (detected IP matches proxy) */
|
||||
ipVerified?: boolean;
|
||||
}
|
||||
|
||||
/**
|
||||
* Run Puppeteer preflight check with proxy
|
||||
* Tests browser-based access with anti-detect verification via fingerprint.com
|
||||
*
|
||||
* @param crawlRotator - CrawlRotator instance to get proxy from pool
|
||||
*/
|
||||
export async function runPuppeteerPreflight(
|
||||
crawlRotator?: CrawlRotator
|
||||
): Promise<PuppeteerPreflightResult> {
|
||||
const result: PuppeteerPreflightResult = {
|
||||
method: 'http',
|
||||
passed: false,
|
||||
proxyAvailable: false,
|
||||
proxyConnected: false,
|
||||
antidetectReady: false,
|
||||
proxyIp: null,
|
||||
fingerprint: null,
|
||||
error: null,
|
||||
responseTimeMs: null,
|
||||
productsReturned: 0,
|
||||
ipVerified: false,
|
||||
};
|
||||
|
||||
let browser: any = null;
|
||||
|
||||
try {
|
||||
// Step 0: Get a proxy from the pool
|
||||
let proxyUrl: string | null = null;
|
||||
let expectedProxyHost: string | null = null;
|
||||
|
||||
if (crawlRotator) {
|
||||
const currentProxy = crawlRotator.proxy.getCurrent();
|
||||
if (currentProxy) {
|
||||
result.proxyAvailable = true;
|
||||
proxyUrl = crawlRotator.proxy.getProxyUrl(currentProxy);
|
||||
expectedProxyHost = currentProxy.host;
|
||||
result.expectedProxyIp = expectedProxyHost;
|
||||
console.log(`[PuppeteerPreflight] Using proxy: ${currentProxy.host}:${currentProxy.port}`);
|
||||
} else {
|
||||
result.error = 'No proxy available from pool';
|
||||
console.log(`[PuppeteerPreflight] FAILED - No proxy available`);
|
||||
return result;
|
||||
}
|
||||
} else {
|
||||
console.log(`[PuppeteerPreflight] WARNING: No CrawlRotator provided - using direct connection`);
|
||||
result.proxyAvailable = true; // No proxy needed for direct
|
||||
}
|
||||
|
||||
// Dynamic imports to avoid loading Puppeteer unless needed
|
||||
const puppeteer = require('puppeteer-extra');
|
||||
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
|
||||
puppeteer.use(StealthPlugin());
|
||||
|
||||
const startTime = Date.now();
|
||||
|
||||
// Build browser args
|
||||
const browserArgs = ['--no-sandbox', '--disable-setuid-sandbox'];
|
||||
if (proxyUrl) {
|
||||
// Extract host:port for Puppeteer (it handles auth separately)
|
||||
const proxyUrlParsed = new URL(proxyUrl);
|
||||
browserArgs.push(`--proxy-server=${proxyUrlParsed.host}`);
|
||||
}
|
||||
|
||||
// Launch browser with stealth + proxy
|
||||
browser = await puppeteer.launch({
|
||||
headless: 'new',
|
||||
args: browserArgs,
|
||||
});
|
||||
|
||||
const page = await browser.newPage();
|
||||
|
||||
// If proxy has auth, set it up
|
||||
if (proxyUrl) {
|
||||
const proxyUrlParsed = new URL(proxyUrl);
|
||||
if (proxyUrlParsed.username && proxyUrlParsed.password) {
|
||||
await page.authenticate({
|
||||
username: decodeURIComponent(proxyUrlParsed.username),
|
||||
password: decodeURIComponent(proxyUrlParsed.password),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Get browser user agent
|
||||
const userAgent = await page.evaluate(() => navigator.userAgent);
|
||||
result.browserUserAgent = userAgent;
|
||||
result.fingerprint = {
|
||||
userAgent,
|
||||
browserName: 'Chrome (Puppeteer)',
|
||||
deviceCategory: 'desktop',
|
||||
};
|
||||
|
||||
// =========================================================================
|
||||
// STEP 1: Visit fingerprint.com demo to verify anti-detect and get IP
|
||||
// =========================================================================
|
||||
console.log(`[PuppeteerPreflight] Testing anti-detect at ${FINGERPRINT_DEMO_URL}...`);
|
||||
|
||||
try {
|
||||
await page.goto(FINGERPRINT_DEMO_URL, {
|
||||
waitUntil: 'networkidle2',
|
||||
timeout: 30000,
|
||||
});
|
||||
|
||||
result.proxyConnected = true; // If we got here, proxy is working
|
||||
|
||||
// Wait for fingerprint results to load
|
||||
await page.waitForSelector('[data-test="visitor-id"]', { timeout: 10000 }).catch(() => {});
|
||||
|
||||
// Extract fingerprint data from the page
|
||||
const fingerprintData = await page.evaluate(() => {
|
||||
// Try to find the IP address displayed on the page
|
||||
const ipElement = document.querySelector('[data-test="ip-address"]');
|
||||
const ip = ipElement?.textContent?.trim() || null;
|
||||
|
||||
// Try to find bot detection info
|
||||
const botElement = document.querySelector('[data-test="bot-detected"]');
|
||||
const botDetected = botElement?.textContent?.toLowerCase().includes('true') || false;
|
||||
|
||||
// Try to find visitor ID (proves fingerprinting worked)
|
||||
const visitorIdElement = document.querySelector('[data-test="visitor-id"]');
|
||||
const visitorId = visitorIdElement?.textContent?.trim() || null;
|
||||
|
||||
// Alternative: look for common UI patterns if data-test attrs not present
|
||||
let detectedIp = ip;
|
||||
if (!detectedIp) {
|
||||
// Look for IP in any element containing IP-like pattern
|
||||
const allText = document.body.innerText;
|
||||
const ipMatch = allText.match(/\b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\b/);
|
||||
detectedIp = ipMatch ? ipMatch[1] : null;
|
||||
}
|
||||
|
||||
return {
|
||||
ip: detectedIp,
|
||||
botDetected,
|
||||
visitorId,
|
||||
pageLoaded: !!document.body,
|
||||
};
|
||||
});
|
||||
|
||||
if (fingerprintData.ip) {
|
||||
result.proxyIp = fingerprintData.ip;
|
||||
console.log(`[PuppeteerPreflight] Detected IP: ${fingerprintData.ip}`);
|
||||
|
||||
// Verify IP matches expected proxy
|
||||
if (expectedProxyHost) {
|
||||
// Check if detected IP contains the proxy host (or is close match)
|
||||
if (fingerprintData.ip === expectedProxyHost ||
|
||||
expectedProxyHost.includes(fingerprintData.ip) ||
|
||||
fingerprintData.ip.includes(expectedProxyHost.split('.').slice(0, 3).join('.'))) {
|
||||
result.ipVerified = true;
|
||||
console.log(`[PuppeteerPreflight] IP VERIFIED - matches proxy`);
|
||||
} else {
|
||||
console.log(`[PuppeteerPreflight] IP mismatch: expected ${expectedProxyHost}, got ${fingerprintData.ip}`);
|
||||
// Don't fail - residential proxies often show different egress IPs
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (fingerprintData.visitorId) {
|
||||
console.log(`[PuppeteerPreflight] Fingerprint visitor ID: ${fingerprintData.visitorId}`);
|
||||
}
|
||||
|
||||
result.botDetection = {
|
||||
detected: fingerprintData.botDetected,
|
||||
};
|
||||
|
||||
if (fingerprintData.botDetected) {
|
||||
console.log(`[PuppeteerPreflight] WARNING: Bot detection triggered!`);
|
||||
} else {
|
||||
console.log(`[PuppeteerPreflight] Anti-detect check: NOT detected as bot`);
|
||||
result.antidetectReady = true;
|
||||
}
|
||||
} catch (fpErr: any) {
|
||||
// Could mean proxy connection failed
|
||||
console.log(`[PuppeteerPreflight] Fingerprint.com check failed: ${fpErr.message}`);
|
||||
if (fpErr.message.includes('net::ERR_PROXY') || fpErr.message.includes('ECONNREFUSED')) {
|
||||
result.error = `Proxy connection failed: ${fpErr.message}`;
|
||||
return result;
|
||||
}
|
||||
|
||||
// Try fallback: amiunique.org
|
||||
console.log(`[PuppeteerPreflight] Trying fallback: ${AMIUNIQUE_URL}...`);
|
||||
try {
|
||||
await page.goto(AMIUNIQUE_URL, {
|
||||
waitUntil: 'networkidle2',
|
||||
timeout: 30000,
|
||||
});
|
||||
|
||||
result.proxyConnected = true;
|
||||
|
||||
// Extract IP from amiunique.org page
|
||||
const amiData = await page.evaluate(() => {
|
||||
const allText = document.body.innerText;
|
||||
const ipMatch = allText.match(/\b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\b/);
|
||||
return {
|
||||
ip: ipMatch ? ipMatch[1] : null,
|
||||
pageLoaded: !!document.body,
|
||||
};
|
||||
});
|
||||
|
||||
if (amiData.ip) {
|
||||
result.proxyIp = amiData.ip;
|
||||
console.log(`[PuppeteerPreflight] Detected IP via amiunique.org: ${amiData.ip}`);
|
||||
}
|
||||
|
||||
result.antidetectReady = true;
|
||||
console.log(`[PuppeteerPreflight] amiunique.org fallback succeeded`);
|
||||
} catch (amiErr: any) {
|
||||
console.log(`[PuppeteerPreflight] amiunique.org fallback also failed: ${amiErr.message}`);
|
||||
// Continue with Dutchie test anyway
|
||||
result.proxyConnected = true;
|
||||
result.antidetectReady = true;
|
||||
}
|
||||
}
|
||||
|
||||
// =========================================================================
|
||||
// STEP 2: Test Dutchie API access (the real test)
|
||||
// =========================================================================
|
||||
const embedUrl = `https://dutchie.com/embedded-menu/${TEST_CNAME}?menuType=rec`;
|
||||
console.log(`[PuppeteerPreflight] Establishing session at ${embedUrl}...`);
|
||||
|
||||
await page.goto(embedUrl, {
|
||||
waitUntil: 'networkidle2',
|
||||
timeout: 30000,
|
||||
});
|
||||
|
||||
// Make GraphQL request from browser context
|
||||
const graphqlResult = await page.evaluate(
|
||||
async (platformId: string, hash: string) => {
|
||||
try {
|
||||
const variables = {
|
||||
includeEnterpriseSpecials: false,
|
||||
productsFilter: {
|
||||
dispensaryId: platformId,
|
||||
pricingType: 'rec',
|
||||
Status: 'Active', // CRITICAL: Must be 'Active' per CLAUDE.md
|
||||
types: [],
|
||||
useCache: true,
|
||||
isDefaultSort: true,
|
||||
sortBy: 'popularSortIdx',
|
||||
sortDirection: 1,
|
||||
bypassOnlineThresholds: true,
|
||||
isKioskMenu: false,
|
||||
removeProductsBelowOptionThresholds: false,
|
||||
},
|
||||
page: 0,
|
||||
perPage: 10, // Just need a few to prove it works
|
||||
};
|
||||
|
||||
const extensions = {
|
||||
persistedQuery: {
|
||||
version: 1,
|
||||
sha256Hash: hash,
|
||||
},
|
||||
};
|
||||
|
||||
const qs = new URLSearchParams({
|
||||
operationName: 'FilteredProducts',
|
||||
variables: JSON.stringify(variables),
|
||||
extensions: JSON.stringify(extensions),
|
||||
});
|
||||
|
||||
const url = `https://dutchie.com/api-3/graphql?${qs.toString()}`;
|
||||
const sessionId = 'preflight-' + Date.now();
|
||||
|
||||
const response = await fetch(url, {
|
||||
method: 'GET',
|
||||
headers: {
|
||||
Accept: 'application/json',
|
||||
'content-type': 'application/json',
|
||||
'x-dutchie-session': sessionId,
|
||||
'apollographql-client-name': 'Marketplace (production)',
|
||||
},
|
||||
credentials: 'include',
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
return { error: `HTTP ${response.status}`, products: 0 };
|
||||
}
|
||||
|
||||
const json = await response.json();
|
||||
|
||||
if (json.errors) {
|
||||
return { error: JSON.stringify(json.errors).slice(0, 200), products: 0 };
|
||||
}
|
||||
|
||||
const products = json?.data?.filteredProducts?.products || [];
|
||||
return { error: null, products: products.length };
|
||||
} catch (err: any) {
|
||||
return { error: err.message || 'Unknown error', products: 0 };
|
||||
}
|
||||
},
|
||||
TEST_PLATFORM_ID,
|
||||
FILTERED_PRODUCTS_HASH
|
||||
);
|
||||
|
||||
result.responseTimeMs = Date.now() - startTime;
|
||||
|
||||
if (graphqlResult.error) {
|
||||
result.error = `GraphQL error: ${graphqlResult.error}`;
|
||||
console.log(`[PuppeteerPreflight] FAILED - ${result.error}`);
|
||||
} else if (graphqlResult.products === 0) {
|
||||
result.error = 'GraphQL returned 0 products';
|
||||
console.log(`[PuppeteerPreflight] FAILED - No products returned`);
|
||||
} else {
|
||||
result.passed = true;
|
||||
result.productsReturned = graphqlResult.products;
|
||||
console.log(
|
||||
`[PuppeteerPreflight] PASSED - Got ${graphqlResult.products} products in ${result.responseTimeMs}ms`
|
||||
);
|
||||
if (result.proxyIp) {
|
||||
console.log(`[PuppeteerPreflight] Browser IP via proxy: ${result.proxyIp}`);
|
||||
}
|
||||
}
|
||||
} catch (err: any) {
|
||||
result.error = `Browser error: ${err.message || 'Unknown error'}`;
|
||||
console.log(`[PuppeteerPreflight] FAILED - ${result.error}`);
|
||||
} finally {
|
||||
if (browser) {
|
||||
await browser.close().catch(() => {});
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Run Puppeteer preflight with retry
|
||||
* Retries once on failure to handle transient issues
|
||||
*
|
||||
* @param crawlRotator - CrawlRotator instance to get proxy from pool
|
||||
* @param maxRetries - Number of retry attempts (default 1)
|
||||
*/
|
||||
export async function runPuppeteerPreflightWithRetry(
|
||||
crawlRotator?: CrawlRotator,
|
||||
maxRetries: number = 1
|
||||
): Promise<PuppeteerPreflightResult> {
|
||||
let lastResult: PuppeteerPreflightResult | null = null;
|
||||
|
||||
for (let attempt = 0; attempt <= maxRetries; attempt++) {
|
||||
if (attempt > 0) {
|
||||
console.log(`[PuppeteerPreflight] Retry attempt ${attempt}/${maxRetries}...`);
|
||||
await new Promise((r) => setTimeout(r, 5000)); // Wait 5s between retries
|
||||
}
|
||||
|
||||
lastResult = await runPuppeteerPreflight(crawlRotator);
|
||||
|
||||
if (lastResult.passed) {
|
||||
return lastResult;
|
||||
}
|
||||
}
|
||||
|
||||
return lastResult!;
|
||||
}
|
||||
@@ -51,6 +51,7 @@ export interface WorkerTask {
|
||||
platform: string | null;
|
||||
status: TaskStatus;
|
||||
priority: number;
|
||||
method: 'curl' | 'http' | null; // Transport method: curl=axios/proxy, http=Puppeteer/browser
|
||||
scheduled_for: Date | null;
|
||||
worker_id: string | null;
|
||||
claimed_at: Date | null;
|
||||
@@ -152,23 +153,33 @@ class TaskService {
|
||||
* Claim a task atomically for a worker
|
||||
* If role is null, claims ANY available task (role-agnostic worker)
|
||||
* Returns null if task pool is paused.
|
||||
*
|
||||
* @param role - Task role to claim, or null for any task
|
||||
* @param workerId - Worker ID claiming the task
|
||||
* @param curlPassed - Whether worker passed curl preflight (default true for backward compat)
|
||||
* @param httpPassed - Whether worker passed http/Puppeteer preflight (default false)
|
||||
*/
|
||||
async claimTask(role: TaskRole | null, workerId: string): Promise<WorkerTask | null> {
|
||||
async claimTask(
|
||||
role: TaskRole | null,
|
||||
workerId: string,
|
||||
curlPassed: boolean = true,
|
||||
httpPassed: boolean = false
|
||||
): Promise<WorkerTask | null> {
|
||||
// Check if task pool is paused - don't claim any tasks
|
||||
if (isTaskPoolPaused()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (role) {
|
||||
// Role-specific claiming - use the SQL function
|
||||
// Role-specific claiming - use the SQL function with preflight capabilities
|
||||
const result = await pool.query(
|
||||
`SELECT * FROM claim_task($1, $2)`,
|
||||
[role, workerId]
|
||||
`SELECT * FROM claim_task($1, $2, $3, $4)`,
|
||||
[role, workerId, curlPassed, httpPassed]
|
||||
);
|
||||
return (result.rows[0] as WorkerTask) || null;
|
||||
}
|
||||
|
||||
// Role-agnostic claiming - claim ANY pending task
|
||||
// Role-agnostic claiming - claim ANY pending task matching worker capabilities
|
||||
const result = await pool.query(`
|
||||
UPDATE worker_tasks
|
||||
SET
|
||||
@@ -179,6 +190,12 @@ class TaskService {
|
||||
SELECT id FROM worker_tasks
|
||||
WHERE status = 'pending'
|
||||
AND (scheduled_for IS NULL OR scheduled_for <= NOW())
|
||||
-- Method compatibility: worker must have passed the required preflight
|
||||
AND (
|
||||
method IS NULL -- No preference, any worker can claim
|
||||
OR (method = 'curl' AND $2 = TRUE)
|
||||
OR (method = 'http' AND $3 = TRUE)
|
||||
)
|
||||
-- Exclude stores that already have an active task
|
||||
AND (dispensary_id IS NULL OR dispensary_id NOT IN (
|
||||
SELECT dispensary_id FROM worker_tasks
|
||||
@@ -190,7 +207,7 @@ class TaskService {
|
||||
FOR UPDATE SKIP LOCKED
|
||||
)
|
||||
RETURNING *
|
||||
`, [workerId]);
|
||||
`, [workerId, curlPassed, httpPassed]);
|
||||
|
||||
return (result.rows[0] as WorkerTask) || null;
|
||||
}
|
||||
|
||||
@@ -51,6 +51,10 @@ import os from 'os';
|
||||
import { CrawlRotator } from '../services/crawl-rotator';
|
||||
import { setCrawlRotator } from '../platforms/dutchie';
|
||||
|
||||
// Dual-transport preflight system
|
||||
import { runCurlPreflight, CurlPreflightResult } from '../services/curl-preflight';
|
||||
import { runPuppeteerPreflightWithRetry, PuppeteerPreflightResult } from '../services/puppeteer-preflight';
|
||||
|
||||
// Task handlers by role
|
||||
// Per TASK_WORKFLOW_2024-12-10.md: payload_fetch and product_refresh are now separate
|
||||
import { handlePayloadFetch } from './handlers/payload-fetch';
|
||||
@@ -189,6 +193,21 @@ export class TaskWorker {
|
||||
private isBackingOff: boolean = false;
|
||||
private backoffReason: string | null = null;
|
||||
|
||||
// ==========================================================================
|
||||
// DUAL-TRANSPORT PREFLIGHT STATUS
|
||||
// ==========================================================================
|
||||
// Workers run BOTH preflights on startup:
|
||||
// - curl: axios/proxy transport - fast, for simple API calls
|
||||
// - http: Puppeteer/browser transport - anti-detect, for Dutchie GraphQL
|
||||
//
|
||||
// Task claiming checks method compatibility - worker must have passed
|
||||
// the preflight for the task's required method.
|
||||
// ==========================================================================
|
||||
private preflightCurlPassed: boolean = false;
|
||||
private preflightHttpPassed: boolean = false;
|
||||
private preflightCurlResult: CurlPreflightResult | null = null;
|
||||
private preflightHttpResult: PuppeteerPreflightResult | null = null;
|
||||
|
||||
constructor(role: TaskRole | null = null, workerId?: string) {
|
||||
this.pool = getPool();
|
||||
this.role = role;
|
||||
@@ -351,6 +370,99 @@ export class TaskWorker {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Run dual-transport preflights on startup
|
||||
* Tests both curl (axios/proxy) and http (Puppeteer/browser) transport methods.
|
||||
* Results are reported to worker_registry and used for task claiming.
|
||||
*
|
||||
* NOTE: All current tasks require 'http' method, so http preflight must pass
|
||||
* for the worker to claim any tasks. Curl preflight is for future use.
|
||||
*/
|
||||
private async runDualPreflights(): Promise<void> {
|
||||
console.log(`[TaskWorker] Running dual-transport preflights...`);
|
||||
|
||||
// Run both preflights in parallel for efficiency
|
||||
const [curlResult, httpResult] = await Promise.all([
|
||||
runCurlPreflight(this.crawlRotator).catch((err): CurlPreflightResult => ({
|
||||
method: 'curl',
|
||||
passed: false,
|
||||
proxyAvailable: false,
|
||||
proxyConnected: false,
|
||||
antidetectReady: false,
|
||||
proxyIp: null,
|
||||
fingerprint: null,
|
||||
error: `Preflight error: ${err.message}`,
|
||||
responseTimeMs: null,
|
||||
})),
|
||||
runPuppeteerPreflightWithRetry(this.crawlRotator, 1).catch((err): PuppeteerPreflightResult => ({
|
||||
method: 'http',
|
||||
passed: false,
|
||||
proxyAvailable: false,
|
||||
proxyConnected: false,
|
||||
antidetectReady: false,
|
||||
proxyIp: null,
|
||||
fingerprint: null,
|
||||
error: `Preflight error: ${err.message}`,
|
||||
responseTimeMs: null,
|
||||
productsReturned: 0,
|
||||
})),
|
||||
]);
|
||||
|
||||
// Store results
|
||||
this.preflightCurlResult = curlResult;
|
||||
this.preflightHttpResult = httpResult;
|
||||
this.preflightCurlPassed = curlResult.passed;
|
||||
this.preflightHttpPassed = httpResult.passed;
|
||||
|
||||
// Log results
|
||||
console.log(`[TaskWorker] CURL preflight: ${curlResult.passed ? 'PASSED' : 'FAILED'}${curlResult.error ? ` - ${curlResult.error}` : ''}`);
|
||||
console.log(`[TaskWorker] HTTP preflight: ${httpResult.passed ? 'PASSED' : 'FAILED'}${httpResult.error ? ` - ${httpResult.error}` : ''}`);
|
||||
|
||||
if (httpResult.passed && httpResult.productsReturned) {
|
||||
console.log(`[TaskWorker] HTTP preflight returned ${httpResult.productsReturned} products from test store`);
|
||||
}
|
||||
|
||||
// Report to worker_registry via API
|
||||
await this.reportPreflightStatus();
|
||||
|
||||
// Since all tasks require 'http', warn if http preflight failed
|
||||
if (!this.preflightHttpPassed) {
|
||||
console.warn(`[TaskWorker] WARNING: HTTP preflight failed - this worker cannot claim any tasks!`);
|
||||
console.warn(`[TaskWorker] Error: ${httpResult.error}`);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Report preflight status to worker_registry
|
||||
*/
|
||||
private async reportPreflightStatus(): Promise<void> {
|
||||
try {
|
||||
// Update worker_registry directly via SQL (more reliable than API)
|
||||
await this.pool.query(`
|
||||
SELECT update_worker_preflight($1, 'curl', $2, $3, $4)
|
||||
`, [
|
||||
this.workerId,
|
||||
this.preflightCurlPassed ? 'passed' : 'failed',
|
||||
this.preflightCurlResult?.responseTimeMs || null,
|
||||
this.preflightCurlResult?.error || null,
|
||||
]);
|
||||
|
||||
await this.pool.query(`
|
||||
SELECT update_worker_preflight($1, 'http', $2, $3, $4)
|
||||
`, [
|
||||
this.workerId,
|
||||
this.preflightHttpPassed ? 'passed' : 'failed',
|
||||
this.preflightHttpResult?.responseTimeMs || null,
|
||||
this.preflightHttpResult?.error || null,
|
||||
]);
|
||||
|
||||
console.log(`[TaskWorker] Preflight status reported to worker_registry`);
|
||||
} catch (err: any) {
|
||||
// Non-fatal - worker can still function
|
||||
console.warn(`[TaskWorker] Could not report preflight status: ${err.message}`);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Register worker with the registry (get friendly name)
|
||||
*/
|
||||
@@ -494,11 +606,15 @@ export class TaskWorker {
|
||||
// Register with the API to get a friendly name
|
||||
await this.register();
|
||||
|
||||
// Run dual-transport preflights
|
||||
await this.runDualPreflights();
|
||||
|
||||
// Start registry heartbeat
|
||||
this.startRegistryHeartbeat();
|
||||
|
||||
const roleMsg = this.role ? `for role: ${this.role}` : '(role-agnostic - any task)';
|
||||
console.log(`[TaskWorker] ${this.friendlyName} starting ${roleMsg} (max ${this.maxConcurrentTasks} concurrent tasks)`);
|
||||
const preflightMsg = `curl=${this.preflightCurlPassed ? '✓' : '✗'} http=${this.preflightHttpPassed ? '✓' : '✗'}`;
|
||||
console.log(`[TaskWorker] ${this.friendlyName} starting ${roleMsg} (${preflightMsg}, max ${this.maxConcurrentTasks} concurrent tasks)`);
|
||||
|
||||
while (this.isRunning) {
|
||||
try {
|
||||
@@ -552,7 +668,13 @@ export class TaskWorker {
|
||||
|
||||
// Try to claim more tasks if we have capacity
|
||||
if (this.canAcceptMoreTasks()) {
|
||||
const task = await taskService.claimTask(this.role, this.workerId);
|
||||
// Pass preflight capabilities to only claim compatible tasks
|
||||
const task = await taskService.claimTask(
|
||||
this.role,
|
||||
this.workerId,
|
||||
this.preflightCurlPassed,
|
||||
this.preflightHttpPassed
|
||||
);
|
||||
|
||||
if (task) {
|
||||
console.log(`[TaskWorker] ${this.friendlyName} claimed task ${task.id} (${task.role}) [${this.activeTasks.size + 1}/${this.maxConcurrentTasks}]`);
|
||||
@@ -738,6 +860,8 @@ export class TaskWorker {
|
||||
maxConcurrentTasks: number;
|
||||
isBackingOff: boolean;
|
||||
backoffReason: string | null;
|
||||
preflightCurlPassed: boolean;
|
||||
preflightHttpPassed: boolean;
|
||||
} {
|
||||
return {
|
||||
workerId: this.workerId,
|
||||
@@ -748,6 +872,8 @@ export class TaskWorker {
|
||||
maxConcurrentTasks: this.maxConcurrentTasks,
|
||||
isBackingOff: this.isBackingOff,
|
||||
backoffReason: this.backoffReason,
|
||||
preflightCurlPassed: this.preflightCurlPassed,
|
||||
preflightHttpPassed: this.preflightHttpPassed,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user