feat(tasks): Refactor task workflow with payload/refresh separation

Major changes:
- Split crawl into payload_fetch (API → disk) and product_refresh (disk → DB)
- Add task chaining: store_discovery → product_discovery → payload_fetch → product_refresh
- Add payload storage utilities for gzipped JSON on filesystem
- Add /api/payloads endpoints for payload access and diffing
- Add DB-driven TaskScheduler with schedule persistence
- Track newDispensaryIds through discovery promotion for chaining
- Add stealth improvements: HTTP fingerprinting, proxy rotation enhancements
- Add Workers dashboard K8s scaling controls

New files:
- src/tasks/handlers/payload-fetch.ts - Fetches from API, saves to disk
- src/services/task-scheduler.ts - DB-driven schedule management
- src/utils/payload-storage.ts - Payload save/load utilities
- src/routes/payloads.ts - Payload API endpoints
- src/services/http-fingerprint.ts - Browser fingerprint generation
- docs/TASK_WORKFLOW_2024-12-10.md - Complete workflow documentation

Migrations:
- 078: Proxy consecutive 403 tracking
- 079: task_schedules table
- 080: raw_crawl_payloads table
- 081: payload column and last_fetch_at

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-10 22:15:35 -07:00
parent 1fb0eb94c2
commit 4949b22457
33 changed files with 4064 additions and 737 deletions

View File

@@ -0,0 +1,221 @@
/**
* Payload Fetch Handler
*
* Per TASK_WORKFLOW_2024-12-10.md: Separates API fetch from data processing.
*
* This handler ONLY:
* 1. Hits Dutchie GraphQL API
* 2. Saves raw payload to filesystem (gzipped)
* 3. Records metadata in raw_crawl_payloads table
* 4. Queues a product_refresh task to process the payload
*
* Benefits of separation:
* - Retry-friendly: If normalize fails, re-run refresh without re-crawling
* - Faster refreshes: Local file read vs network call
* - Replay-able: Run refresh against any historical payload
* - Less API pressure: Only this role hits Dutchie
*/
import { TaskContext, TaskResult } from '../task-worker';
import {
executeGraphQL,
startSession,
endSession,
GRAPHQL_HASHES,
DUTCHIE_CONFIG,
} from '../../platforms/dutchie';
import { saveRawPayload } from '../../utils/payload-storage';
import { taskService } from '../task-service';
export async function handlePayloadFetch(ctx: TaskContext): Promise<TaskResult> {
const { pool, task } = ctx;
const dispensaryId = task.dispensary_id;
if (!dispensaryId) {
return { success: false, error: 'No dispensary_id specified for payload_fetch task' };
}
try {
// ============================================================
// STEP 1: Load dispensary info
// ============================================================
const dispResult = await pool.query(`
SELECT
id, name, platform_dispensary_id, menu_url, menu_type, city, state
FROM dispensaries
WHERE id = $1 AND crawl_enabled = true
`, [dispensaryId]);
if (dispResult.rows.length === 0) {
return { success: false, error: `Dispensary ${dispensaryId} not found or not crawl_enabled` };
}
const dispensary = dispResult.rows[0];
const platformId = dispensary.platform_dispensary_id;
if (!platformId) {
return { success: false, error: `Dispensary ${dispensaryId} has no platform_dispensary_id` };
}
// Extract cName from menu_url
const cNameMatch = dispensary.menu_url?.match(/\/(?:embedded-menu|dispensary)\/([^/?]+)/);
const cName = cNameMatch ? cNameMatch[1] : 'dispensary';
console.log(`[PayloadFetch] Starting fetch for ${dispensary.name} (ID: ${dispensaryId})`);
console.log(`[PayloadFetch] Platform ID: ${platformId}, cName: ${cName}`);
// ============================================================
// STEP 2: Start stealth session
// ============================================================
const session = startSession();
console.log(`[PayloadFetch] Session started: ${session.sessionId}`);
await ctx.heartbeat();
// ============================================================
// STEP 3: Fetch products via GraphQL (Status: 'All')
// ============================================================
const allProducts: any[] = [];
let page = 0;
let totalCount = 0;
const perPage = DUTCHIE_CONFIG.perPage;
const maxPages = DUTCHIE_CONFIG.maxPages;
try {
while (page < maxPages) {
const variables = {
includeEnterpriseSpecials: false,
productsFilter: {
dispensaryId: platformId,
pricingType: 'rec',
Status: 'All',
types: [],
useCache: false,
isDefaultSort: true,
sortBy: 'popularSortIdx',
sortDirection: 1,
bypassOnlineThresholds: true,
isKioskMenu: false,
removeProductsBelowOptionThresholds: false,
},
page,
perPage,
};
console.log(`[PayloadFetch] Fetching page ${page + 1}...`);
const result = await executeGraphQL(
'FilteredProducts',
variables,
GRAPHQL_HASHES.FilteredProducts,
{ cName, maxRetries: 3 }
);
const data = result?.data?.filteredProducts;
if (!data || !data.products) {
if (page === 0) {
throw new Error('No product data returned from GraphQL');
}
break;
}
const products = data.products;
allProducts.push(...products);
if (page === 0) {
totalCount = data.queryInfo?.totalCount || products.length;
console.log(`[PayloadFetch] Total products reported: ${totalCount}`);
}
if (allProducts.length >= totalCount || products.length < perPage) {
break;
}
page++;
if (page < maxPages) {
await new Promise(r => setTimeout(r, DUTCHIE_CONFIG.pageDelayMs));
}
if (page % 5 === 0) {
await ctx.heartbeat();
}
}
console.log(`[PayloadFetch] Fetched ${allProducts.length} products in ${page + 1} pages`);
} finally {
endSession();
}
if (allProducts.length === 0) {
return {
success: false,
error: 'No products returned from GraphQL',
productsProcessed: 0,
};
}
await ctx.heartbeat();
// ============================================================
// STEP 4: Save raw payload to filesystem
// Per TASK_WORKFLOW_2024-12-10.md: Metadata/Payload separation
// ============================================================
const rawPayload = {
dispensaryId,
platformId,
cName,
fetchedAt: new Date().toISOString(),
productCount: allProducts.length,
products: allProducts,
};
const payloadResult = await saveRawPayload(
pool,
dispensaryId,
rawPayload,
null, // crawl_run_id - not using crawl_runs in new system
allProducts.length
);
console.log(`[PayloadFetch] Saved payload #${payloadResult.id} (${(payloadResult.sizeBytes / 1024).toFixed(1)}KB)`);
// ============================================================
// STEP 5: Update dispensary last_fetch_at
// ============================================================
await pool.query(`
UPDATE dispensaries
SET last_fetch_at = NOW()
WHERE id = $1
`, [dispensaryId]);
// ============================================================
// STEP 6: Queue product_refresh task to process the payload
// Per TASK_WORKFLOW_2024-12-10.md: Task chaining
// ============================================================
await taskService.createTask({
role: 'product_refresh',
dispensary_id: dispensaryId,
priority: task.priority || 0,
payload: { payload_id: payloadResult.id },
});
console.log(`[PayloadFetch] Queued product_refresh task for payload #${payloadResult.id}`);
return {
success: true,
payloadId: payloadResult.id,
productCount: allProducts.length,
sizeBytes: payloadResult.sizeBytes,
};
} catch (error: unknown) {
const errorMessage = error instanceof Error ? error.message : 'Unknown error';
console.error(`[PayloadFetch] Error for dispensary ${dispensaryId}:`, errorMessage);
return {
success: false,
error: errorMessage,
};
}
}