Major changes:
- Split crawl into payload_fetch (API → disk) and product_refresh (disk → DB)
- Add task chaining: store_discovery → product_discovery → payload_fetch → product_refresh
- Add payload storage utilities for gzipped JSON on filesystem (sketched below)
- Add /api/payloads endpoints for payload access and diffing
- Add DB-driven TaskScheduler with schedule persistence
- Track newDispensaryIds through discovery promotion for chaining
- Add stealth improvements: HTTP fingerprinting, proxy rotation enhancements
- Add Workers dashboard K8s scaling controls

New files:
- src/tasks/handlers/payload-fetch.ts - Fetches from API, saves to disk
- src/services/task-scheduler.ts - DB-driven schedule management
- src/utils/payload-storage.ts - Payload save/load utilities
- src/routes/payloads.ts - Payload API endpoints
- src/services/http-fingerprint.ts - Browser fingerprint generation
- docs/TASK_WORKFLOW_2024-12-10.md - Complete workflow documentation

Migrations:
- 078: Proxy consecutive 403 tracking
- 079: task_schedules table
- 080: raw_crawl_payloads table
- 081: payload column and last_fetch_at

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
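The payload-storage utility itself is not shown in this listing, so here is a minimal sketch of what `saveRawPayload` in src/utils/payload-storage.ts could look like, assuming gzipped JSON files under a configurable directory and a raw_crawl_payloads row that stores metadata only. The directory env var, file layout, and column names are assumptions (the real schema comes from migration 080 and may differ); the return shape `{ id, sizeBytes }` matches how the handler below consumes it.

```typescript
import { gzipSync } from 'zlib';
import { promises as fs } from 'fs';
import * as path from 'path';
import { Pool } from 'pg';

// Assumed storage root; the real utility may resolve this differently.
const PAYLOAD_DIR = process.env.PAYLOAD_DIR || '/data/payloads';

export interface SavedPayload {
  id: number;
  filePath: string;
  sizeBytes: number;
}

export async function saveRawPayload(
  pool: Pool,
  dispensaryId: number,
  payload: unknown,
  crawlRunId: number | null,
  productCount: number
): Promise<SavedPayload> {
  // Gzip the JSON body before it touches disk (sync is fine for a sketch).
  const gz = gzipSync(Buffer.from(JSON.stringify(payload), 'utf8'));

  // One file per fetch, namespaced by dispensary.
  const dir = path.join(PAYLOAD_DIR, String(dispensaryId));
  await fs.mkdir(dir, { recursive: true });
  const filePath = path.join(dir, `${Date.now()}.json.gz`);
  await fs.writeFile(filePath, gz);

  // Record metadata only; the payload itself stays on the filesystem.
  // Column names here are assumed, not taken from migration 080.
  const { rows } = await pool.query(
    `INSERT INTO raw_crawl_payloads
       (dispensary_id, crawl_run_id, file_path, product_count, size_bytes, fetched_at)
     VALUES ($1, $2, $3, $4, $5, NOW())
     RETURNING id`,
    [dispensaryId, crawlRunId, filePath, productCount, gz.length]
  );

  return { id: rows[0].id, filePath, sizeBytes: gz.length };
}
```

The handler below treats this as a black box: only the returned id is threaded into the queued product_refresh task.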
/**
 * Payload Fetch Handler
 *
 * Per TASK_WORKFLOW_2024-12-10.md: Separates API fetch from data processing.
 *
 * This handler ONLY:
 * 1. Hits Dutchie GraphQL API
 * 2. Saves raw payload to filesystem (gzipped)
 * 3. Records metadata in raw_crawl_payloads table
 * 4. Queues a product_refresh task to process the payload
 *
 * Benefits of separation:
 * - Retry-friendly: If normalize fails, re-run refresh without re-crawling
 * - Faster refreshes: Local file read vs network call
 * - Replay-able: Run refresh against any historical payload
 * - Less API pressure: Only this role hits Dutchie
 */
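// Illustrative flow (ids are made up): a payload_fetch task for dispensary 42 saves gzipped
// payload #1337 via saveRawPayload(), then queues product_refresh with
// payload: { payload_id: 1337 } so the refresh handler can process that file from disk.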

import { TaskContext, TaskResult } from '../task-worker';
import {
  executeGraphQL,
  startSession,
  endSession,
  GRAPHQL_HASHES,
  DUTCHIE_CONFIG,
} from '../../platforms/dutchie';
import { saveRawPayload } from '../../utils/payload-storage';
import { taskService } from '../task-service';

export async function handlePayloadFetch(ctx: TaskContext): Promise<TaskResult> {
  const { pool, task } = ctx;
  const dispensaryId = task.dispensary_id;

  if (!dispensaryId) {
    return { success: false, error: 'No dispensary_id specified for payload_fetch task' };
  }

  try {
    // ============================================================
    // STEP 1: Load dispensary info
    // ============================================================
    const dispResult = await pool.query(`
      SELECT
        id, name, platform_dispensary_id, menu_url, menu_type, city, state
      FROM dispensaries
      WHERE id = $1 AND crawl_enabled = true
    `, [dispensaryId]);

    if (dispResult.rows.length === 0) {
      return { success: false, error: `Dispensary ${dispensaryId} not found or not crawl_enabled` };
    }

    const dispensary = dispResult.rows[0];
    const platformId = dispensary.platform_dispensary_id;

    if (!platformId) {
      return { success: false, error: `Dispensary ${dispensaryId} has no platform_dispensary_id` };
    }

    // Extract cName from menu_url
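    // e.g. a menu_url like ".../embedded-menu/<slug>/..." or ".../dispensary/<slug>" yields cName "<slug>";
    // anything that doesn't match falls back to 'dispensary'.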
    const cNameMatch = dispensary.menu_url?.match(/\/(?:embedded-menu|dispensary)\/([^/?]+)/);
    const cName = cNameMatch ? cNameMatch[1] : 'dispensary';

    console.log(`[PayloadFetch] Starting fetch for ${dispensary.name} (ID: ${dispensaryId})`);
    console.log(`[PayloadFetch] Platform ID: ${platformId}, cName: ${cName}`);

    // ============================================================
    // STEP 2: Start stealth session
    // ============================================================
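    // One stealth session spans the entire paginated fetch; endSession() runs in the finally block below.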
    const session = startSession();
    console.log(`[PayloadFetch] Session started: ${session.sessionId}`);

    await ctx.heartbeat();

    // ============================================================
    // STEP 3: Fetch products via GraphQL (Status: 'All')
    // ============================================================
    const allProducts: any[] = [];
    let page = 0;
    let totalCount = 0;
    const perPage = DUTCHIE_CONFIG.perPage;
    const maxPages = DUTCHIE_CONFIG.maxPages;

    try {
      while (page < maxPages) {
        const variables = {
          includeEnterpriseSpecials: false,
          productsFilter: {
            dispensaryId: platformId,
            pricingType: 'rec',
            Status: 'All',
            types: [],
            useCache: false,
            isDefaultSort: true,
            sortBy: 'popularSortIdx',
            sortDirection: 1,
            bypassOnlineThresholds: true,
            isKioskMenu: false,
            removeProductsBelowOptionThresholds: false,
          },
          page,
          perPage,
        };

        console.log(`[PayloadFetch] Fetching page ${page + 1}...`);

        const result = await executeGraphQL(
          'FilteredProducts',
          variables,
          GRAPHQL_HASHES.FilteredProducts,
          { cName, maxRetries: 3 }
        );

        const data = result?.data?.filteredProducts;
        if (!data || !data.products) {
          if (page === 0) {
            throw new Error('No product data returned from GraphQL');
          }
          break;
        }

        const products = data.products;
        allProducts.push(...products);

        if (page === 0) {
          totalCount = data.queryInfo?.totalCount || products.length;
          console.log(`[PayloadFetch] Total products reported: ${totalCount}`);
        }
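
        // Stop paging once everything the API reported has been collected, or a short page marks the end.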
        if (allProducts.length >= totalCount || products.length < perPage) {
          break;
        }

        page++;

        if (page < maxPages) {
          await new Promise(r => setTimeout(r, DUTCHIE_CONFIG.pageDelayMs));
        }

        if (page % 5 === 0) {
          await ctx.heartbeat();
        }
      }

      console.log(`[PayloadFetch] Fetched ${allProducts.length} products in ${page + 1} pages`);

    } finally {
      endSession();
    }

    if (allProducts.length === 0) {
      return {
        success: false,
        error: 'No products returned from GraphQL',
        productsProcessed: 0,
      };
    }

    await ctx.heartbeat();

    // ============================================================
    // STEP 4: Save raw payload to filesystem
    // Per TASK_WORKFLOW_2024-12-10.md: Metadata/Payload separation
    // ============================================================
    const rawPayload = {
      dispensaryId,
      platformId,
      cName,
      fetchedAt: new Date().toISOString(),
      productCount: allProducts.length,
      products: allProducts,
    };

    const payloadResult = await saveRawPayload(
      pool,
      dispensaryId,
      rawPayload,
      null, // crawl_run_id - not using crawl_runs in new system
      allProducts.length
    );

    console.log(`[PayloadFetch] Saved payload #${payloadResult.id} (${(payloadResult.sizeBytes / 1024).toFixed(1)}KB)`);

    // ============================================================
    // STEP 5: Update dispensary last_fetch_at
    // ============================================================
    await pool.query(`
      UPDATE dispensaries
      SET last_fetch_at = NOW()
      WHERE id = $1
    `, [dispensaryId]);

    // ============================================================
    // STEP 6: Queue product_refresh task to process the payload
    // Per TASK_WORKFLOW_2024-12-10.md: Task chaining
    // ============================================================
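    // product_refresh handles the disk → DB leg of the chain: it loads the saved payload by id
    // and writes the products to the database.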
    await taskService.createTask({
      role: 'product_refresh',
      dispensary_id: dispensaryId,
      priority: task.priority || 0,
      payload: { payload_id: payloadResult.id },
    });

    console.log(`[PayloadFetch] Queued product_refresh task for payload #${payloadResult.id}`);

    return {
      success: true,
      payloadId: payloadResult.id,
      productCount: allProducts.length,
      sizeBytes: payloadResult.sizeBytes,
    };

  } catch (error: unknown) {
    const errorMessage = error instanceof Error ? error.message : 'Unknown error';
    console.error(`[PayloadFetch] Error for dispensary ${dispensaryId}:`, errorMessage);
    return {
      success: false,
      error: errorMessage,
    };
  }
}