feat: Stealth worker system with mandatory proxy rotation

## Worker System
- Role-agnostic workers that can handle any task type
- Pod-based architecture with StatefulSet (5-15 pods, 5 workers each)
- Custom pod names (Aethelgard, Xylos, Kryll, etc.)
- Worker registry with friendly names and resource monitoring
- Hub-and-spoke visualization on JobQueue page

## Stealth & Anti-Detection (REQUIRED)
- Proxies are MANDATORY - workers fail to start without active proxies
- CrawlRotator initializes on worker startup
- Loads proxies from `proxies` table
- Auto-rotates proxy + fingerprint on 403 errors
- 12 browser fingerprints (Chrome, Firefox, Safari, Edge)
- Locale/timezone matching for geographic consistency

## Task System
- Renamed product_resync → product_refresh
- Task chaining: store_discovery → entry_point → product_discovery
- Priority-based claiming with FOR UPDATE SKIP LOCKED
- Heartbeat and stale task recovery

## UI Updates
- JobQueue: Pod visualization, resource monitoring on hover
- WorkersDashboard: Simplified worker list
- Removed unused filters from task list

## Other
- IP2Location service for visitor analytics
- Findagram consumer features scaffolding
- Documentation updates

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-10 00:44:59 -07:00
parent 0295637ed6
commit 56cc171287
61 changed files with 8591 additions and 2076 deletions

View File

@@ -1,13 +1,21 @@
/**
* Entry Point Discovery Handler
*
* Detects menu type and resolves platform IDs for a discovered store.
* Resolves platform IDs for a discovered store using Dutchie GraphQL.
* This is the step between store_discovery and product_discovery.
*
* TODO: Integrate with platform ID resolution when available
* Flow:
* 1. Load dispensary info from database
* 2. Extract slug from menu_url
* 3. Start stealth session (fingerprint + optional proxy)
* 4. Query Dutchie GraphQL to resolve slug → platform_dispensary_id
* 5. Update dispensary record with resolved ID
* 6. Queue product_discovery task if successful
*/
import { TaskContext, TaskResult } from '../task-worker';
import { startSession, endSession } from '../../platforms/dutchie';
import { resolveDispensaryIdWithDetails } from '../../platforms/dutchie/queries';
export async function handleEntryPointDiscovery(ctx: TaskContext): Promise<TaskResult> {
const { pool, task } = ctx;
@@ -18,9 +26,11 @@ export async function handleEntryPointDiscovery(ctx: TaskContext): Promise<TaskR
}
try {
// Get dispensary info
// ============================================================
// STEP 1: Load dispensary info
// ============================================================
const dispResult = await pool.query(`
SELECT id, name, menu_url, platform_dispensary_id, menu_type
SELECT id, name, menu_url, platform_dispensary_id, menu_type, state
FROM dispensaries
WHERE id = $1
`, [dispensaryId]);
@@ -33,7 +43,7 @@ export async function handleEntryPointDiscovery(ctx: TaskContext): Promise<TaskR
// If already has platform_dispensary_id, we're done
if (dispensary.platform_dispensary_id) {
console.log(`[EntryPointDiscovery] Dispensary ${dispensaryId} already has platform ID`);
console.log(`[EntryPointDiscovery] Dispensary ${dispensaryId} already has platform ID: ${dispensary.platform_dispensary_id}`);
return {
success: true,
alreadyResolved: true,
@@ -46,9 +56,12 @@ export async function handleEntryPointDiscovery(ctx: TaskContext): Promise<TaskR
return { success: false, error: `Dispensary ${dispensaryId} has no menu_url` };
}
console.log(`[EntryPointDiscovery] Would resolve platform ID for ${dispensary.name} from ${menuUrl}`);
console.log(`[EntryPointDiscovery] Resolving platform ID for ${dispensary.name}`);
console.log(`[EntryPointDiscovery] Menu URL: ${menuUrl}`);
// Extract slug from menu URL
// ============================================================
// STEP 2: Extract slug from menu URL
// ============================================================
let slug: string | null = null;
const embeddedMatch = menuUrl.match(/\/embedded-menu\/([^/?]+)/);
@@ -61,21 +74,109 @@ export async function handleEntryPointDiscovery(ctx: TaskContext): Promise<TaskR
}
if (!slug) {
// Mark as non-dutchie menu type
await pool.query(`
UPDATE dispensaries
SET menu_type = 'unknown', updated_at = NOW()
WHERE id = $1
`, [dispensaryId]);
return {
success: false,
error: `Could not extract slug from menu_url: ${menuUrl}`,
};
}
// TODO: Integrate with actual platform ID resolution
// For now, mark the task as needing manual resolution
console.log(`[EntryPointDiscovery] Found slug: ${slug} - manual resolution needed`);
console.log(`[EntryPointDiscovery] Extracted slug: ${slug}`);
await ctx.heartbeat();
// ============================================================
// STEP 3: Start stealth session
// ============================================================
const session = startSession(dispensary.state || 'AZ', 'America/Phoenix');
console.log(`[EntryPointDiscovery] Session started: ${session.sessionId}`);
try {
// ============================================================
// STEP 4: Resolve platform ID via GraphQL
// ============================================================
console.log(`[EntryPointDiscovery] Querying Dutchie GraphQL for slug: ${slug}`);
const result = await resolveDispensaryIdWithDetails(slug);
if (!result.dispensaryId) {
// Resolution failed - could be 403, 404, or invalid response
const reason = result.httpStatus
? `HTTP ${result.httpStatus}`
: result.error || 'Unknown error';
console.log(`[EntryPointDiscovery] Failed to resolve ${slug}: ${reason}`);
// Mark as failed resolution but keep menu_type as dutchie
await pool.query(`
UPDATE dispensaries
SET
menu_type = CASE
WHEN $2 = 404 THEN 'removed'
WHEN $2 = 403 THEN 'blocked'
ELSE 'dutchie'
END,
updated_at = NOW()
WHERE id = $1
`, [dispensaryId, result.httpStatus || 0]);
return {
success: false,
error: `Could not resolve platform ID: ${reason}`,
slug,
httpStatus: result.httpStatus,
};
}
const platformId = result.dispensaryId;
console.log(`[EntryPointDiscovery] Resolved ${slug} -> ${platformId}`);
await ctx.heartbeat();
// ============================================================
// STEP 5: Update dispensary with resolved ID
// ============================================================
await pool.query(`
UPDATE dispensaries
SET
platform_dispensary_id = $2,
menu_type = 'dutchie',
crawl_enabled = true,
updated_at = NOW()
WHERE id = $1
`, [dispensaryId, platformId]);
console.log(`[EntryPointDiscovery] Updated dispensary ${dispensaryId} with platform ID`);
// ============================================================
// STEP 6: Queue product_discovery task
// ============================================================
await pool.query(`
INSERT INTO worker_tasks (role, dispensary_id, priority, scheduled_for)
VALUES ('product_discovery', $1, 5, NOW())
ON CONFLICT DO NOTHING
`, [dispensaryId]);
console.log(`[EntryPointDiscovery] Queued product_discovery task for dispensary ${dispensaryId}`);
return {
success: true,
platformId,
slug,
queuedProductDiscovery: true,
};
} finally {
// Always end session
endSession();
}
return {
success: true,
message: 'Slug extracted, awaiting platform ID resolution',
slug,
};
} catch (error: unknown) {
const errorMessage = error instanceof Error ? error.message : 'Unknown error';
console.error(`[EntryPointDiscovery] Error for dispensary ${dispensaryId}:`, errorMessage);