## Worker System
- Role-agnostic workers that can handle any task type
- Pod-based architecture with StatefulSet (5-15 pods, 5 workers each)
- Custom pod names (Aethelgard, Xylos, Kryll, etc.)
- Worker registry with friendly names and resource monitoring
- Hub-and-spoke visualization on JobQueue page

## Stealth & Anti-Detection (REQUIRED)
- Proxies are MANDATORY - workers fail to start without active proxies
- CrawlRotator initializes on worker startup
- Loads proxies from `proxies` table
- Auto-rotates proxy + fingerprint on 403 errors
- 12 browser fingerprints (Chrome, Firefox, Safari, Edge)
- Locale/timezone matching for geographic consistency

## Task System
- Renamed product_resync → product_refresh
- Task chaining: store_discovery → entry_point → product_discovery
- Priority-based claiming with FOR UPDATE SKIP LOCKED
- Heartbeat and stale task recovery

## UI Updates
- JobQueue: Pod visualization, resource monitoring on hover
- WorkersDashboard: Simplified worker list
- Removed unused filters from task list

## Other
- IP2Location service for visitor analytics
- Findagram consumer features scaffolding
- Documentation updates

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
189 lines
6.2 KiB
TypeScript
189 lines
6.2 KiB
TypeScript
/**
 * Entry Point Discovery Handler
 *
 * Resolves platform IDs for a discovered store using Dutchie GraphQL.
 * This is the step between store_discovery and product_discovery.
 *
 * Flow:
 * 1. Load dispensary info from database
 * 2. Extract slug from menu_url
 * 3. Start stealth session (fingerprint + optional proxy)
 * 4. Query Dutchie GraphQL to resolve slug → platform_dispensary_id
 * 5. Update dispensary record with resolved ID
 * 6. Queue product_discovery task if successful
 */
|
|
|
|
import { TaskContext, TaskResult } from '../task-worker';
|
|
import { startSession, endSession } from '../../platforms/dutchie';
|
|
import { resolveDispensaryIdWithDetails } from '../../platforms/dutchie/queries';
|
|
|
|
/**
 * Extracts the Dutchie store slug from a menu URL.
 *
 * `/embedded-menu/<slug>` is tried first, then `/dispensary/<slug>`. The slug
 * ends at the next `/`, `?` or `#`, so neither a query string nor a URL
 * fragment (e.g. `.../dispensary/green-leaf#menu`) leaks into the slug.
 *
 * @param menuUrl - The dispensary's stored menu URL.
 * @returns The slug, or `null` when the URL matches neither pattern.
 */
function extractDutchieSlug(menuUrl: string): string | null {
  const match =
    menuUrl.match(/\/embedded-menu\/([^/?#]+)/) ||
    menuUrl.match(/\/dispensary\/([^/?#]+)/);
  return match?.[1] ?? null;
}

/**
 * Turns a caught value into a message for the task result.
 *
 * @param error - Whatever was thrown.
 * @returns The `Error` message, the thrown string itself when it is a
 *   non-empty string, otherwise `'Unknown error'`.
 */
function describeThrown(error: unknown): string {
  if (error instanceof Error) {
    return error.message;
  }
  if (typeof error === 'string' && error.length > 0) {
    return error;
  }
  return 'Unknown error';
}

/**
 * Task handler for `entry_point_discovery`: resolves the platform dispensary ID
 * for `task.dispensary_id` and, on success, queues a `product_discovery` task.
 *
 * Steps performed:
 * 1. Load the dispensary row; return early if it is missing, already has a
 *    `platform_dispensary_id`, or has no `menu_url`.
 * 2. Extract the slug from `menu_url`; if none is found the row's `menu_type`
 *    is set to `'unknown'` and the task fails.
 * 3. Start a session, then resolve the slug via `resolveDispensaryIdWithDetails`.
 * 4. On failure, set `menu_type` to `'removed'` (HTTP 404), `'blocked'`
 *    (HTTP 403) or `'dutchie'` (anything else) and fail the task.
 * 5. On success, store the platform ID, set `menu_type = 'dutchie'` and
 *    `crawl_enabled = true`, and insert a `product_discovery` row into
 *    `worker_tasks`.
 *
 * @param ctx - Task context; this handler reads `pool`, `task` and `heartbeat`.
 * @returns A result with `success: true` plus the platform ID, or
 *   `success: false` plus an `error` message. Errors thrown inside the handler
 *   are caught and reported through the result rather than rethrown.
 */
export async function handleEntryPointDiscovery(ctx: TaskContext): Promise<TaskResult> {
  const { pool, task } = ctx;
  const dispensaryId = task.dispensary_id;

  if (!dispensaryId) {
    return { success: false, error: 'No dispensary_id specified for entry_point_discovery task' };
  }

  try {
    // ============================================================
    // STEP 1: Load dispensary info
    // ============================================================
    const dispResult = await pool.query(`
      SELECT id, name, menu_url, platform_dispensary_id, menu_type, state
      FROM dispensaries
      WHERE id = $1
    `, [dispensaryId]);

    if (dispResult.rows.length === 0) {
      return { success: false, error: `Dispensary ${dispensaryId} not found` };
    }

    const dispensary = dispResult.rows[0];

    // If already has platform_dispensary_id, we're done
    if (dispensary.platform_dispensary_id) {
      console.log(`[EntryPointDiscovery] Dispensary ${dispensaryId} already has platform ID: ${dispensary.platform_dispensary_id}`);
      return {
        success: true,
        alreadyResolved: true,
        platformId: dispensary.platform_dispensary_id,
      };
    }

    const menuUrl = dispensary.menu_url;
    if (!menuUrl) {
      return { success: false, error: `Dispensary ${dispensaryId} has no menu_url` };
    }

    console.log(`[EntryPointDiscovery] Resolving platform ID for ${dispensary.name}`);
    console.log(`[EntryPointDiscovery] Menu URL: ${menuUrl}`);

    // ============================================================
    // STEP 2: Extract slug from menu URL
    // ============================================================
    const slug = extractDutchieSlug(menuUrl);

    if (!slug) {
      // No recognisable slug in the URL: record the menu type as unknown
      await pool.query(`
        UPDATE dispensaries
        SET menu_type = 'unknown', updated_at = NOW()
        WHERE id = $1
      `, [dispensaryId]);

      return {
        success: false,
        error: `Could not extract slug from menu_url: ${menuUrl}`,
      };
    }

    console.log(`[EntryPointDiscovery] Extracted slug: ${slug}`);

    await ctx.heartbeat();

    // ============================================================
    // STEP 3: Start stealth session
    // ============================================================
    // NOTE(review): the second argument is always 'America/Phoenix', even for
    // dispensaries outside AZ - confirm against startSession's contract.
    const session = startSession(dispensary.state || 'AZ', 'America/Phoenix');
    console.log(`[EntryPointDiscovery] Session started: ${session.sessionId}`);

    try {
      // ============================================================
      // STEP 4: Resolve platform ID via GraphQL
      // ============================================================
      console.log(`[EntryPointDiscovery] Querying Dutchie GraphQL for slug: ${slug}`);

      const result = await resolveDispensaryIdWithDetails(slug);

      if (!result.dispensaryId) {
        // Resolution failed - could be 403, 404, or invalid response
        const reason = result.httpStatus
          ? `HTTP ${result.httpStatus}`
          : result.error || 'Unknown error';

        console.log(`[EntryPointDiscovery] Failed to resolve ${slug}: ${reason}`);

        // Record why resolution failed; anything other than 404/403 keeps
        // menu_type as dutchie. A missing status is sent as 0 so it falls
        // through to the ELSE branch.
        await pool.query(`
          UPDATE dispensaries
          SET
            menu_type = CASE
              WHEN $2 = 404 THEN 'removed'
              WHEN $2 = 403 THEN 'blocked'
              ELSE 'dutchie'
            END,
            updated_at = NOW()
          WHERE id = $1
        `, [dispensaryId, result.httpStatus || 0]);

        return {
          success: false,
          error: `Could not resolve platform ID: ${reason}`,
          slug,
          httpStatus: result.httpStatus,
        };
      }

      const platformId = result.dispensaryId;
      console.log(`[EntryPointDiscovery] Resolved ${slug} -> ${platformId}`);

      await ctx.heartbeat();

      // ============================================================
      // STEP 5: Update dispensary with resolved ID
      // ============================================================
      await pool.query(`
        UPDATE dispensaries
        SET
          platform_dispensary_id = $2,
          menu_type = 'dutchie',
          crawl_enabled = true,
          updated_at = NOW()
        WHERE id = $1
      `, [dispensaryId, platformId]);

      console.log(`[EntryPointDiscovery] Updated dispensary ${dispensaryId} with platform ID`);

      // ============================================================
      // STEP 6: Queue product_discovery task
      // ============================================================
      // ON CONFLICT DO NOTHING: a conflicting existing row is left untouched.
      await pool.query(`
        INSERT INTO worker_tasks (role, dispensary_id, priority, scheduled_for)
        VALUES ('product_discovery', $1, 5, NOW())
        ON CONFLICT DO NOTHING
      `, [dispensaryId]);

      console.log(`[EntryPointDiscovery] Queued product_discovery task for dispensary ${dispensaryId}`);

      return {
        success: true,
        platformId,
        slug,
        queuedProductDiscovery: true,
      };
    } finally {
      // Always end session, whether resolution succeeded, failed or threw
      endSession();
    }
  } catch (error: unknown) {
    const errorMessage = describeThrown(error);
    console.error(`[EntryPointDiscovery] Error for dispensary ${dispensaryId}:`, errorMessage);
    return {
      success: false,
      error: errorMessage,
    };
  }
}
|