feat: Platform isolation, Evomi geo-targeting, proxy management

Platform isolation:
- Rename handlers to {task}-{platform}.ts convention
- Deprecate the -curl handler variants (now named _deprecated-*)
- Platform-based routing in task-worker.ts
- Add Jane platform handlers and client

Evomi geo-targeting:
- Add dynamic proxy URL builder with state/city targeting
- Session stickiness per worker per state (30 min)
- Fallback to static proxy table when API unavailable
- Add proxy tracking columns to worker_tasks

Proxy management:
- New /proxies admin page for visibility
- Track proxy_ip, proxy_geo, proxy_source per task
- Show active sessions and task history

Validation filtering:
- Filter by validated stores (platform_dispensary_id + menu_url)
- Mark incomplete stores as deprecated
- Update all dashboard/stats queries

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-13 15:16:48 -07:00
parent 59e0e45f8f
commit c215d11a84
31 changed files with 2698 additions and 129 deletions

View File

@@ -68,19 +68,22 @@ import { runCurlPreflight, CurlPreflightResult } from '../services/curl-prefligh
import { runPuppeteerPreflightWithRetry, PuppeteerPreflightResult } from '../services/puppeteer-preflight';
// Task handlers by role
// Per TASK_WORKFLOW_2024-12-10.md: payload_fetch and product_refresh are now separate
// Dual-transport: curl vs http (browser-based) handlers
import { handlePayloadFetch } from './handlers/payload-fetch-curl';
// Platform-based handlers: {task}-{platform}.ts convention
import { handleProductRefresh } from './handlers/product-refresh';
import { handleProductDiscovery } from './handlers/product-discovery-curl';
import { handleProductDiscoveryHttp } from './handlers/product-discovery-http';
import { handleStoreDiscovery } from './handlers/store-discovery';
import { handleStoreDiscoveryHttp } from './handlers/store-discovery-http';
import { handleStoreDiscoveryState } from './handlers/store-discovery-state';
import { handleEntryPointDiscovery } from './handlers/entry-point-discovery';
import { handleAnalyticsRefresh } from './handlers/analytics-refresh';
import { handleWhoami } from './handlers/whoami';
// Dutchie Platform Handlers
import { handleProductDiscoveryDutchie } from './handlers/product-discovery-dutchie';
import { handleStoreDiscoveryDutchie } from './handlers/store-discovery-dutchie';
// Jane Platform Handlers
import { handleStoreDiscoveryJane } from './handlers/store-discovery-jane';
import { handleEntryPointDiscoveryJane } from './handlers/entry-point-discovery-jane';
import { handleProductDiscoveryJane } from './handlers/product-discovery-jane';
/**
 * Reads an integer setting from the environment.
 *
 * The value is parsed as base 10. When the variable is unset, empty, or does
 * not start with a number, `fallback` is returned instead of NaN so that the
 * interval constants below are always usable numbers.
 */
function readIntEnv(name: string, fallback: number): number {
  const parsed = Number.parseInt(process.env[name] ?? '', 10);
  return Number.isNaN(parsed) ? fallback : parsed;
}

// Interval settings in milliseconds, overridable through environment
// variables of the same name (defaults: 5000 and 30000).
const POLL_INTERVAL_MS = readIntEnv('POLL_INTERVAL_MS', 5000);
const HEARTBEAT_INTERVAL_MS = readIntEnv('HEARTBEAT_INTERVAL_MS', 30000);
// An empty API_BASE_URL falls back to the local default, same as an unset one.
const API_BASE_URL = process.env.API_BASE_URL || 'http://localhost:3010';
@@ -152,48 +155,66 @@ export interface TaskResult {
/** Signature shared by every task handler: receives a TaskContext and resolves to a TaskResult. */
type TaskHandler = (ctx: TaskContext) => Promise<TaskResult>;
// Per TASK_WORKFLOW_2024-12-10.md: Handler registry
// payload_fetch: Fetches from Dutchie API, saves to disk
// product_refresh: Reads local payload, normalizes, upserts to DB
// product_discovery: Main handler for product crawling (has curl and http variants)
// NOTE(review): this span appears to come from a rendered diff whose +/- markers
// were stripped: the TASK_HANDLERS literal is never closed before SHARED_HANDLERS
// begins, and `whoami` is listed twice. Presumably TASK_HANDLERS is the removed
// registry and SHARED_HANDLERS its replacement — confirm against the real file.
const TASK_HANDLERS: Record<TaskRole, TaskHandler> = {
payload_fetch: handlePayloadFetch, // API fetch -> disk (curl)
product_refresh: handleProductRefresh, // disk -> DB
product_discovery: handleProductDiscovery, // Default: curl (see getHandlerForTask for http override)
store_discovery: handleStoreDiscovery,
store_discovery_state: handleStoreDiscoveryState, // Per-state parallelized discovery
// Platform-agnostic handlers (shared across Dutchie and Jane)
// product_refresh: Reads local payload, uses platform-aware normalizer, upserts to DB
// Typed as Partial: not every TaskRole has an entry here, so a lookup by role
// can yield undefined.
const SHARED_HANDLERS: Partial<Record<TaskRole, TaskHandler>> = {
product_refresh: handleProductRefresh,
store_discovery_state: handleStoreDiscoveryState,
entry_point_discovery: handleEntryPointDiscovery,
analytics_refresh: handleAnalyticsRefresh,
whoami: handleWhoami, // Tests proxy + anti-detect
whoami: handleWhoami,
};
/**
* Get the appropriate handler for a task, considering both role and method.
* Get the appropriate handler for a task based on platform.
*
* Dual-transport handlers:
* - product_discovery: curl (axios) or http (Puppeteer)
* - store_discovery: curl (axios) or http (Puppeteer)
* Naming convention: {task}-{platform}.ts
* - product-discovery-dutchie.ts
* - product-discovery-jane.ts
* - store-discovery-dutchie.ts
* - store-discovery-jane.ts
*
* Default method is 'http' since all GraphQL queries should use browser transport
* for better TLS fingerprinting and session-based proxy compatibility.
* All handlers use HTTP/Puppeteer transport (curl transport is deprecated).
*/
function getHandlerForTask(task: WorkerTask): TaskHandler | undefined {
// NOTE(review): this body looks like a rendered diff with the +/- markers
// stripped. Method-based routing (`method === 'http'`, TASK_HANDLERS) and
// platform-based routing (`platform === 'jane'`, SHARED_HANDLERS) are
// interleaved, and the braces do not balance as shown. Confirm against the
// real file before relying on the exact statement order.
const role = task.role as TaskRole;
const method = task.method || 'http'; // Default to HTTP for all GraphQL tasks
// A missing (or empty) platform is treated as 'dutchie'.
const platform = task.platform || 'dutchie';
// product_discovery: dual-transport support
if (role === 'product_discovery' && method === 'http') {
console.log(`[TaskWorker] Using HTTP handler for product_discovery (method=${method})`);
return handleProductDiscoveryHttp;
// ==========================================================================
// JANE PLATFORM ROUTING
// ==========================================================================
if (platform === 'jane') {
// Both the plain and the per-state store discovery roles are routed to the
// same Jane handler; the log line reads "store_discovery" in either case.
if (role === 'store_discovery' || role === 'store_discovery_state') {
console.log(`[TaskWorker] Using Jane handler for store_discovery`);
return handleStoreDiscoveryJane;
}
if (role === 'entry_point_discovery') {
console.log(`[TaskWorker] Using Jane handler for entry_point_discovery`);
return handleEntryPointDiscoveryJane;
}
if (role === 'product_discovery') {
console.log(`[TaskWorker] Using Jane handler for product_discovery`);
return handleProductDiscoveryJane;
}
}
// Jane tasks with any other role are not returned above and continue to the
// checks below.
// store_discovery: dual-transport support
if (role === 'store_discovery' && method === 'http') {
console.log(`[TaskWorker] Using HTTP handler for store_discovery (method=${method})`);
return handleStoreDiscoveryHttp;
// ==========================================================================
// DUTCHIE PLATFORM ROUTING (default)
// ==========================================================================
if (role === 'product_discovery') {
console.log(`[TaskWorker] Using Dutchie handler for product_discovery`);
return handleProductDiscoveryDutchie;
}
// Default: use the static handler registry (curl-based)
return TASK_HANDLERS[role];
if (role === 'store_discovery') {
console.log(`[TaskWorker] Using Dutchie handler for store_discovery`);
return handleStoreDiscoveryDutchie;
}
// ==========================================================================
// SHARED HANDLERS (platform-agnostic)
// ==========================================================================
// SHARED_HANDLERS is a Partial record, so this lookup returns undefined for
// any role that has no entry in it.
return SHARED_HANDLERS[role];
}
/**