Major changes:
- Split crawl into payload_fetch (API → disk) and product_refresh (disk → DB)
- Add task chaining: store_discovery → product_discovery → payload_fetch → product_refresh
- Add payload storage utilities for gzipped JSON on filesystem
- Add /api/payloads endpoints for payload access and diffing
- Add DB-driven TaskScheduler with schedule persistence
- Track newDispensaryIds through discovery promotion for chaining
- Add stealth improvements: HTTP fingerprinting, proxy rotation enhancements
- Add Workers dashboard K8s scaling controls

New files:
- src/tasks/handlers/payload-fetch.ts - Fetches from API, saves to disk
- src/services/task-scheduler.ts - DB-driven schedule management
- src/utils/payload-storage.ts - Payload save/load utilities
- src/routes/payloads.ts - Payload API endpoints
- src/services/http-fingerprint.ts - Browser fingerprint generation
- docs/TASK_WORKFLOW_2024-12-10.md - Complete workflow documentation

Migrations:
- 078: Proxy consecutive 403 tracking
- 079: task_schedules table
- 080: raw_crawl_payloads table
- 081: payload column and last_fetch_at

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
190 lines
6.3 KiB
TypeScript
190 lines
6.3 KiB
TypeScript
/**
 * Entry Point Discovery Handler
 *
 * Resolves platform IDs for a discovered store using Dutchie GraphQL.
 * This is the step between store_discovery and product_discovery.
 *
 * Flow:
 *   1. Load dispensary info from database
 *   2. Extract slug from menu_url
 *   3. Start stealth session (fingerprint + optional proxy)
 *   4. Query Dutchie GraphQL to resolve slug → platform_dispensary_id
 *   5. Update dispensary record with resolved ID
 *   6. Queue product_discovery task if successful
 */
|
|
|
|
import { TaskContext, TaskResult } from '../task-worker';
|
|
import { startSession, endSession } from '../../platforms/dutchie';
|
|
import { resolveDispensaryIdWithDetails } from '../../platforms/dutchie/queries';
|
|
|
|
/** Captures the slug segment that follows `/embedded-menu/` in a menu URL. */
const EMBEDDED_MENU_SLUG_PATTERN = /\/embedded-menu\/([^/?#]+)/;

/** Captures the slug segment that follows `/dispensary/` in a menu URL. */
const DISPENSARY_SLUG_PATTERN = /\/dispensary\/([^/?#]+)/;

/**
 * Extracts the store slug from a menu URL.
 *
 * The `/embedded-menu/<slug>` form is tried first, then `/dispensary/<slug>`.
 * The slug ends at the next `/`, `?` or `#`, so neither a query string nor a
 * URL fragment leaks into it.
 *
 * @param menuUrl - Menu URL stored on the dispensary record.
 * @returns The slug, or `null` when the URL matches neither form.
 */
function extractMenuSlug(menuUrl: string): string | null {
  const match =
    EMBEDDED_MENU_SLUG_PATTERN.exec(menuUrl) ?? DISPENSARY_SLUG_PATTERN.exec(menuUrl);
  return match?.[1] ?? null;
}

/**
 * Handles an `entry_point_discovery` task: resolves the platform dispensary ID
 * for the task's dispensary and, on success, queues a `product_discovery` task.
 *
 * Side effects on the `dispensaries` row:
 * - no slug in `menu_url`      → `menu_type = 'unknown'`
 * - resolution fails with 404  → `menu_type = 'removed'`
 * - resolution fails with 403  → `menu_type = 'blocked'`
 * - any other failed resolution → `menu_type = 'dutchie'`
 * - resolution succeeds        → `platform_dispensary_id` set,
 *   `menu_type = 'dutchie'`, `crawl_enabled = true`
 *
 * @param ctx - Task context; `pool`, `task` and `heartbeat` are used.
 * @returns A result with `success: true` when the dispensary already has, or
 *   now has, a platform ID; otherwise `success: false` with an `error` message.
 *   Errors thrown while processing are caught, logged and returned as a failed
 *   result rather than rethrown.
 */
export async function handleEntryPointDiscovery(ctx: TaskContext): Promise<TaskResult> {
  const { pool, task } = ctx;
  const dispensaryId = task.dispensary_id;

  if (!dispensaryId) {
    return { success: false, error: 'No dispensary_id specified for entry_point_discovery task' };
  }

  try {
    // ============================================================
    // STEP 1: Load dispensary info
    // ============================================================
    const dispResult = await pool.query(
      `
      SELECT id, name, menu_url, platform_dispensary_id, menu_type, state
      FROM dispensaries
      WHERE id = $1
    `,
      [dispensaryId],
    );

    if (dispResult.rows.length === 0) {
      return { success: false, error: `Dispensary ${dispensaryId} not found` };
    }

    const dispensary = dispResult.rows[0];

    // Nothing to resolve when the platform ID is already stored.
    if (dispensary.platform_dispensary_id) {
      console.log(`[EntryPointDiscovery] Dispensary ${dispensaryId} already has platform ID: ${dispensary.platform_dispensary_id}`);
      return {
        success: true,
        alreadyResolved: true,
        platformId: dispensary.platform_dispensary_id,
      };
    }

    const menuUrl = dispensary.menu_url;
    if (!menuUrl) {
      return { success: false, error: `Dispensary ${dispensaryId} has no menu_url` };
    }

    console.log(`[EntryPointDiscovery] Resolving platform ID for ${dispensary.name}`);
    console.log(`[EntryPointDiscovery] Menu URL: ${menuUrl}`);

    // ============================================================
    // STEP 2: Extract slug from menu URL
    // ============================================================
    const slug = extractMenuSlug(menuUrl);

    if (!slug) {
      // The URL matches neither known menu URL form: record the menu type as unknown.
      await pool.query(
        `
        UPDATE dispensaries
        SET menu_type = 'unknown', updated_at = NOW()
        WHERE id = $1
      `,
        [dispensaryId],
      );

      return {
        success: false,
        error: `Could not extract slug from menu_url: ${menuUrl}`,
      };
    }

    console.log(`[EntryPointDiscovery] Extracted slug: ${slug}`);

    await ctx.heartbeat();

    // ============================================================
    // STEP 3: Start stealth session
    // ============================================================
    // Per workflow-12102025.md: session identity comes from proxy location, not task params
    const session = startSession();
    console.log(`[EntryPointDiscovery] Session started: ${session.sessionId}`);

    try {
      // ============================================================
      // STEP 4: Resolve platform ID via GraphQL
      // ============================================================
      console.log(`[EntryPointDiscovery] Querying Dutchie GraphQL for slug: ${slug}`);

      const result = await resolveDispensaryIdWithDetails(slug);

      if (!result.dispensaryId) {
        // Resolution failed - could be 403, 404, or invalid response
        const reason = result.httpStatus
          ? `HTTP ${result.httpStatus}`
          : result.error || 'Unknown error';

        console.log(`[EntryPointDiscovery] Failed to resolve ${slug}: ${reason}`);

        // Record why resolution failed: 404 marks the store as removed, 403 as
        // blocked; any other failure leaves menu_type as 'dutchie'.
        await pool.query(
          `
          UPDATE dispensaries
          SET
            menu_type = CASE
              WHEN $2 = 404 THEN 'removed'
              WHEN $2 = 403 THEN 'blocked'
              ELSE 'dutchie'
            END,
            updated_at = NOW()
          WHERE id = $1
        `,
          [dispensaryId, result.httpStatus ?? 0],
        );

        return {
          success: false,
          error: `Could not resolve platform ID: ${reason}`,
          slug,
          httpStatus: result.httpStatus,
        };
      }

      const platformId = result.dispensaryId;
      console.log(`[EntryPointDiscovery] Resolved ${slug} -> ${platformId}`);

      await ctx.heartbeat();

      // ============================================================
      // STEP 5: Update dispensary with resolved ID
      // ============================================================
      await pool.query(
        `
        UPDATE dispensaries
        SET
          platform_dispensary_id = $2,
          menu_type = 'dutchie',
          crawl_enabled = true,
          updated_at = NOW()
        WHERE id = $1
      `,
        [dispensaryId, platformId],
      );

      console.log(`[EntryPointDiscovery] Updated dispensary ${dispensaryId} with platform ID`);

      // ============================================================
      // STEP 6: Queue product_discovery task
      // ============================================================
      // NOTE(review): ON CONFLICT DO NOTHING skips the insert when it collides
      // with an existing row, so `queuedProductDiscovery` below reports that the
      // insert was attempted, not that a new row was created - confirm this is
      // what callers expect.
      await pool.query(
        `
        INSERT INTO worker_tasks (role, dispensary_id, priority, scheduled_for)
        VALUES ('product_discovery', $1, 5, NOW())
        ON CONFLICT DO NOTHING
      `,
        [dispensaryId],
      );

      console.log(`[EntryPointDiscovery] Queued product_discovery task for dispensary ${dispensaryId}`);

      return {
        success: true,
        platformId,
        slug,
        queuedProductDiscovery: true,
      };
    } finally {
      // Always end the session, whether resolution succeeded, failed or threw.
      endSession();
    }
  } catch (error: unknown) {
    const errorMessage = error instanceof Error ? error.message : 'Unknown error';
    console.error(`[EntryPointDiscovery] Error for dispensary ${dispensaryId}:`, errorMessage);
    return {
      success: false,
      error: errorMessage,
    };
  }
}
|