feat: Stealth worker system with mandatory proxy rotation
## Worker System
- Role-agnostic workers that can handle any task type
- Pod-based architecture with StatefulSet (5-15 pods, 5 workers each)
- Custom pod names (Aethelgard, Xylos, Kryll, etc.)
- Worker registry with friendly names and resource monitoring
- Hub-and-spoke visualization on JobQueue page

## Stealth & Anti-Detection (REQUIRED)
- Proxies are MANDATORY - workers fail to start without active proxies
- CrawlRotator initializes on worker startup
- Loads proxies from `proxies` table
- Auto-rotates proxy + fingerprint on 403 errors
- 12 browser fingerprints (Chrome, Firefox, Safari, Edge)
- Locale/timezone matching for geographic consistency

## Task System
- Renamed product_resync → product_refresh
- Task chaining: store_discovery → entry_point → product_discovery
- Priority-based claiming with FOR UPDATE SKIP LOCKED
- Heartbeat and stale task recovery

## UI Updates
- JobQueue: Pod visualization, resource monitoring on hover
- WorkersDashboard: Simplified worker list
- Removed unused filters from task list

## Other
- IP2Location service for visitor analytics
- Findagram consumer features scaffolding
- Documentation updates

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,13 +1,21 @@
|
||||
/**
|
||||
* Entry Point Discovery Handler
|
||||
*
|
||||
* Detects menu type and resolves platform IDs for a discovered store.
|
||||
* Resolves platform IDs for a discovered store using Dutchie GraphQL.
|
||||
* This is the step between store_discovery and product_discovery.
|
||||
*
|
||||
* TODO: Integrate with platform ID resolution when available
|
||||
* Flow:
|
||||
* 1. Load dispensary info from database
|
||||
* 2. Extract slug from menu_url
|
||||
* 3. Start stealth session (fingerprint + proxy; proxies are mandatory for workers)
|
||||
* 4. Query Dutchie GraphQL to resolve slug → platform_dispensary_id
|
||||
* 5. Update dispensary record with resolved ID
|
||||
* 6. Queue product_discovery task if successful
|
||||
*/
|
||||
|
||||
import { TaskContext, TaskResult } from '../task-worker';
|
||||
import { startSession, endSession } from '../../platforms/dutchie';
|
||||
import { resolveDispensaryIdWithDetails } from '../../platforms/dutchie/queries';
|
||||
|
||||
export async function handleEntryPointDiscovery(ctx: TaskContext): Promise<TaskResult> {
|
||||
const { pool, task } = ctx;
|
||||
@@ -18,9 +26,11 @@ export async function handleEntryPointDiscovery(ctx: TaskContext): Promise<TaskR
|
||||
}
|
||||
|
||||
try {
|
||||
// Get dispensary info
|
||||
// ============================================================
|
||||
// STEP 1: Load dispensary info
|
||||
// ============================================================
|
||||
const dispResult = await pool.query(`
|
||||
SELECT id, name, menu_url, platform_dispensary_id, menu_type
|
||||
SELECT id, name, menu_url, platform_dispensary_id, menu_type, state
|
||||
FROM dispensaries
|
||||
WHERE id = $1
|
||||
`, [dispensaryId]);
|
||||
@@ -33,7 +43,7 @@ export async function handleEntryPointDiscovery(ctx: TaskContext): Promise<TaskR
|
||||
|
||||
// If already has platform_dispensary_id, we're done
|
||||
if (dispensary.platform_dispensary_id) {
|
||||
console.log(`[EntryPointDiscovery] Dispensary ${dispensaryId} already has platform ID`);
|
||||
console.log(`[EntryPointDiscovery] Dispensary ${dispensaryId} already has platform ID: ${dispensary.platform_dispensary_id}`);
|
||||
return {
|
||||
success: true,
|
||||
alreadyResolved: true,
|
||||
@@ -46,9 +56,12 @@ export async function handleEntryPointDiscovery(ctx: TaskContext): Promise<TaskR
|
||||
return { success: false, error: `Dispensary ${dispensaryId} has no menu_url` };
|
||||
}
|
||||
|
||||
console.log(`[EntryPointDiscovery] Would resolve platform ID for ${dispensary.name} from ${menuUrl}`);
|
||||
console.log(`[EntryPointDiscovery] Resolving platform ID for ${dispensary.name}`);
|
||||
console.log(`[EntryPointDiscovery] Menu URL: ${menuUrl}`);
|
||||
|
||||
// Extract slug from menu URL
|
||||
// ============================================================
|
||||
// STEP 2: Extract slug from menu URL
|
||||
// ============================================================
|
||||
let slug: string | null = null;
|
||||
|
||||
const embeddedMatch = menuUrl.match(/\/embedded-menu\/([^/?]+)/);
|
||||
@@ -61,21 +74,109 @@ export async function handleEntryPointDiscovery(ctx: TaskContext): Promise<TaskR
|
||||
}
|
||||
|
||||
if (!slug) {
|
||||
// Mark as non-dutchie menu type
|
||||
await pool.query(`
|
||||
UPDATE dispensaries
|
||||
SET menu_type = 'unknown', updated_at = NOW()
|
||||
WHERE id = $1
|
||||
`, [dispensaryId]);
|
||||
|
||||
return {
|
||||
success: false,
|
||||
error: `Could not extract slug from menu_url: ${menuUrl}`,
|
||||
};
|
||||
}
|
||||
|
||||
// TODO: Integrate with actual platform ID resolution
|
||||
// For now, mark the task as needing manual resolution
|
||||
console.log(`[EntryPointDiscovery] Found slug: ${slug} - manual resolution needed`);
|
||||
console.log(`[EntryPointDiscovery] Extracted slug: ${slug}`);
|
||||
|
||||
await ctx.heartbeat();
|
||||
|
||||
// ============================================================
|
||||
// STEP 3: Start stealth session
|
||||
// ============================================================
|
||||
const session = startSession(dispensary.state || 'AZ', 'America/Phoenix');
|
||||
console.log(`[EntryPointDiscovery] Session started: ${session.sessionId}`);
|
||||
|
||||
try {
|
||||
// ============================================================
|
||||
// STEP 4: Resolve platform ID via GraphQL
|
||||
// ============================================================
|
||||
console.log(`[EntryPointDiscovery] Querying Dutchie GraphQL for slug: ${slug}`);
|
||||
|
||||
const result = await resolveDispensaryIdWithDetails(slug);
|
||||
|
||||
if (!result.dispensaryId) {
|
||||
// Resolution failed - could be 403, 404, or invalid response
|
||||
const reason = result.httpStatus
|
||||
? `HTTP ${result.httpStatus}`
|
||||
: result.error || 'Unknown error';
|
||||
|
||||
console.log(`[EntryPointDiscovery] Failed to resolve ${slug}: ${reason}`);
|
||||
|
||||
// Mark as failed resolution but keep menu_type as dutchie
|
||||
await pool.query(`
|
||||
UPDATE dispensaries
|
||||
SET
|
||||
menu_type = CASE
|
||||
WHEN $2 = 404 THEN 'removed'
|
||||
WHEN $2 = 403 THEN 'blocked'
|
||||
ELSE 'dutchie'
|
||||
END,
|
||||
updated_at = NOW()
|
||||
WHERE id = $1
|
||||
`, [dispensaryId, result.httpStatus || 0]);
|
||||
|
||||
return {
|
||||
success: false,
|
||||
error: `Could not resolve platform ID: ${reason}`,
|
||||
slug,
|
||||
httpStatus: result.httpStatus,
|
||||
};
|
||||
}
|
||||
|
||||
const platformId = result.dispensaryId;
|
||||
console.log(`[EntryPointDiscovery] Resolved ${slug} -> ${platformId}`);
|
||||
|
||||
await ctx.heartbeat();
|
||||
|
||||
// ============================================================
|
||||
// STEP 5: Update dispensary with resolved ID
|
||||
// ============================================================
|
||||
await pool.query(`
|
||||
UPDATE dispensaries
|
||||
SET
|
||||
platform_dispensary_id = $2,
|
||||
menu_type = 'dutchie',
|
||||
crawl_enabled = true,
|
||||
updated_at = NOW()
|
||||
WHERE id = $1
|
||||
`, [dispensaryId, platformId]);
|
||||
|
||||
console.log(`[EntryPointDiscovery] Updated dispensary ${dispensaryId} with platform ID`);
|
||||
|
||||
// ============================================================
|
||||
// STEP 6: Queue product_discovery task
|
||||
// ============================================================
|
||||
await pool.query(`
|
||||
INSERT INTO worker_tasks (role, dispensary_id, priority, scheduled_for)
|
||||
VALUES ('product_discovery', $1, 5, NOW())
|
||||
ON CONFLICT DO NOTHING
|
||||
`, [dispensaryId]);
|
||||
|
||||
console.log(`[EntryPointDiscovery] Queued product_discovery task for dispensary ${dispensaryId}`);
|
||||
|
||||
return {
|
||||
success: true,
|
||||
platformId,
|
||||
slug,
|
||||
queuedProductDiscovery: true,
|
||||
};
|
||||
|
||||
} finally {
|
||||
// Always end session
|
||||
endSession();
|
||||
}
|
||||
|
||||
return {
|
||||
success: true,
|
||||
message: 'Slug extracted, awaiting platform ID resolution',
|
||||
slug,
|
||||
};
|
||||
} catch (error: unknown) {
|
||||
const errorMessage = error instanceof Error ? error.message : 'Unknown error';
|
||||
console.error(`[EntryPointDiscovery] Error for dispensary ${dispensaryId}:`, errorMessage);
|
||||
|
||||
Reference in New Issue
Block a user