feat(tasks): Refactor task workflow with payload/refresh separation
Major changes:
- Split crawl into payload_fetch (API → disk) and product_refresh (disk → DB)
- Add task chaining: store_discovery → product_discovery → payload_fetch → product_refresh
- Add payload storage utilities for gzipped JSON on filesystem
- Add /api/payloads endpoints for payload access and diffing
- Add DB-driven TaskScheduler with schedule persistence
- Track newDispensaryIds through discovery promotion for chaining
- Add stealth improvements: HTTP fingerprinting, proxy rotation enhancements
- Add Workers dashboard K8s scaling controls

New files:
- src/tasks/handlers/payload-fetch.ts - Fetches from API, saves to disk
- src/services/task-scheduler.ts - DB-driven schedule management
- src/utils/payload-storage.ts - Payload save/load utilities
- src/routes/payloads.ts - Payload API endpoints
- src/services/http-fingerprint.ts - Browser fingerprint generation
- docs/TASK_WORKFLOW_2024-12-10.md - Complete workflow documentation

Migrations:
- 078: Proxy consecutive 403 tracking
- 079: task_schedules table
- 080: raw_crawl_payloads table
- 081: payload column and last_fetch_at

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
backend/src/utils/payload-storage.ts (new file, +406 lines)
@@ -0,0 +1,406 @@
/**
 * Payload Storage Utility
 *
 * Per TASK_WORKFLOW_2024-12-10.md: Store raw GraphQL payloads for historical analysis.
 *
 * Design Pattern: Metadata/Payload Separation
 * - Metadata in PostgreSQL (raw_crawl_payloads table): Small, indexed, queryable
 * - Payload on filesystem: Gzipped JSON at storage_path
 *
 * Storage structure:
 * /storage/payloads/{year}/{month}/{day}/store_{dispensary_id}_{timestamp}.json.gz
 *
 * Benefits:
 * - Compare any two crawls to see what changed
 * - Replay/re-normalize historical data if logic changes
 * - Debug issues by seeing exactly what the API returned
 * - DB stays small, backups stay fast
 * - ~90% compression (1.5MB -> 150KB per crawl)
 */

import * as fs from 'fs';
import * as path from 'path';
import * as zlib from 'zlib';
import { promisify } from 'util';
import { Pool } from 'pg';
import * as crypto from 'crypto';

const gzip = promisify(zlib.gzip);
const gunzip = promisify(zlib.gunzip);

// Base path for payload storage (matches image storage pattern)
const PAYLOAD_BASE_PATH = process.env.PAYLOAD_STORAGE_PATH || './storage/payloads';

/**
 * Result from saving a payload
 */
export interface SavePayloadResult {
  id: number;
  storagePath: string;
  sizeBytes: number;
  sizeBytesRaw: number;
  checksum: string;
}

/**
 * Result from loading a payload
 */
export interface LoadPayloadResult {
  payload: any;
  metadata: {
    id: number;
    dispensaryId: number;
    crawlRunId: number | null;
    productCount: number;
    fetchedAt: Date;
    storagePath: string;
  };
}

/**
 * Generate storage path for a payload
 *
 * Format: /storage/payloads/{year}/{month}/{day}/store_{dispensary_id}_{timestamp}.json.gz
 */
function generateStoragePath(dispensaryId: number, timestamp: Date): string {
  const year = timestamp.getFullYear();
  const month = String(timestamp.getMonth() + 1).padStart(2, '0');
  const day = String(timestamp.getDate()).padStart(2, '0');
  const ts = timestamp.getTime();

  return path.join(
    PAYLOAD_BASE_PATH,
    String(year),
    month,
    day,
    `store_${dispensaryId}_${ts}.json.gz`
  );
}
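
For a concrete sense of the layout, here is a sketch of what this yields (store ID and date are hypothetical):

// Illustration with hypothetical values:
//
//   generateStoragePath(42, new Date('2024-12-10T15:30:00Z'))
//   // => 'storage/payloads/2024/12/10/store_42_1733844600000.json.gz'
//
// Note: the year/month/day buckets come from getFullYear()/getMonth()/getDate(),
// which use server-local time, so the bucket date can differ from the UTC date.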

/**
 * Ensure directory exists for a file path
 */
async function ensureDir(filePath: string): Promise<void> {
  const dir = path.dirname(filePath);
  await fs.promises.mkdir(dir, { recursive: true });
}

/**
 * Calculate SHA256 checksum of data
 */
function calculateChecksum(data: Buffer): string {
  return crypto.createHash('sha256').update(data).digest('hex');
}
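
Because saveRawPayload (below) checksums the compressed bytes, an integrity check is short; a minimal sketch, assuming the recorded digest comes from the checksum_sha256 column added in migration 080:

// Hypothetical integrity check: re-hash the on-disk (compressed) bytes and
// compare against the digest recorded at save time.
async function verifyPayloadFile(storagePath: string, expectedSha256: string): Promise<boolean> {
  const data = await fs.promises.readFile(storagePath);
  return calculateChecksum(data) === expectedSha256;
}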

/**
 * Save a raw crawl payload to filesystem and record metadata in DB
 *
 * @param pool - Database connection pool
 * @param dispensaryId - ID of the dispensary
 * @param payload - Raw JSON payload from GraphQL
 * @param crawlRunId - Optional crawl_run ID for linking
 * @param productCount - Number of products in payload
 * @returns SavePayloadResult with file info and DB record ID
 */
export async function saveRawPayload(
  pool: Pool,
  dispensaryId: number,
  payload: any,
  crawlRunId: number | null = null,
  productCount: number = 0
): Promise<SavePayloadResult> {
  const timestamp = new Date();
  const storagePath = generateStoragePath(dispensaryId, timestamp);

  // Serialize and compress
  const jsonStr = JSON.stringify(payload);
  const rawSize = Buffer.byteLength(jsonStr, 'utf8');
  const compressed = await gzip(Buffer.from(jsonStr, 'utf8'));
  const compressedSize = compressed.length;
  const checksum = calculateChecksum(compressed);

  // Write to filesystem
  await ensureDir(storagePath);
  await fs.promises.writeFile(storagePath, compressed);

  // Record metadata in DB
  const result = await pool.query(`
    INSERT INTO raw_crawl_payloads (
      crawl_run_id,
      dispensary_id,
      storage_path,
      product_count,
      size_bytes,
      size_bytes_raw,
      fetched_at,
      checksum_sha256
    ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
    RETURNING id
  `, [
    crawlRunId,
    dispensaryId,
    storagePath,
    productCount,
    compressedSize,
    rawSize,
    timestamp,
    checksum
  ]);

  console.log(`[PayloadStorage] Saved payload for store ${dispensaryId}: ${storagePath} (${(compressedSize / 1024).toFixed(1)}KB compressed, ${(rawSize / 1024).toFixed(1)}KB raw)`);

  return {
    id: result.rows[0].id,
    storagePath,
    sizeBytes: compressedSize,
    sizeBytesRaw: rawSize,
    checksum
  };
}
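
A minimal sketch of the API → disk half of the split workflow; `fetchMenu` is a hypothetical stand-in for the GraphQL client, and the real wiring lives in src/tasks/handlers/payload-fetch.ts:

// Hypothetical caller; only saveRawPayload is real here.
declare function fetchMenu(dispensaryId: number): Promise<any>;

async function examplePayloadFetch(pool: Pool, dispensaryId: number, crawlRunId: number): Promise<number> {
  const menu = await fetchMenu(dispensaryId);
  const products: any[] = menu?.data?.products ?? []; // assumed payload shape
  const saved = await saveRawPayload(pool, dispensaryId, menu, crawlRunId, products.length);
  return saved.id; // the payload ID hands off to product_refresh (disk → DB)
}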

/**
 * Load a raw payload from filesystem by metadata ID
 *
 * @param pool - Database connection pool
 * @param payloadId - ID from raw_crawl_payloads table
 * @returns LoadPayloadResult with parsed payload and metadata
 */
export async function loadRawPayloadById(
  pool: Pool,
  payloadId: number
): Promise<LoadPayloadResult | null> {
  const result = await pool.query(`
    SELECT id, dispensary_id, crawl_run_id, storage_path, product_count, fetched_at
    FROM raw_crawl_payloads
    WHERE id = $1
  `, [payloadId]);

  if (result.rows.length === 0) {
    return null;
  }

  const row = result.rows[0];
  const payload = await loadPayloadFromPath(row.storage_path);

  return {
    payload,
    metadata: {
      id: row.id,
      dispensaryId: row.dispensary_id,
      crawlRunId: row.crawl_run_id,
      productCount: row.product_count,
      fetchedAt: row.fetched_at,
      storagePath: row.storage_path
    }
  };
}

/**
 * Load a raw payload directly from filesystem path
 *
 * @param storagePath - Path to gzipped JSON file
 * @returns Parsed JSON payload
 */
export async function loadPayloadFromPath(storagePath: string): Promise<any> {
  const compressed = await fs.promises.readFile(storagePath);
  const decompressed = await gunzip(compressed);
  return JSON.parse(decompressed.toString('utf8'));
}
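
These loaders enable the replay use case from the header comment; a sketch, where `renormalizeProducts` is a hypothetical normalizer standing in for the product_refresh logic:

declare function renormalizeProducts(payload: any, dispensaryId: number): Promise<void>; // hypothetical

// Re-run normalization against a stored payload without touching the API.
async function replayPayload(pool: Pool, payloadId: number): Promise<void> {
  const loaded = await loadRawPayloadById(pool, payloadId);
  if (!loaded) {
    throw new Error(`payload ${payloadId} not found`);
  }
  await renormalizeProducts(loaded.payload, loaded.metadata.dispensaryId);
}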

/**
 * Get the latest payload for a dispensary
 *
 * @param pool - Database connection pool
 * @param dispensaryId - ID of the dispensary
 * @returns LoadPayloadResult or null if none exists
 */
export async function getLatestPayload(
  pool: Pool,
  dispensaryId: number
): Promise<LoadPayloadResult | null> {
  const result = await pool.query(`
    SELECT id, dispensary_id, crawl_run_id, storage_path, product_count, fetched_at
    FROM raw_crawl_payloads
    WHERE dispensary_id = $1
    ORDER BY fetched_at DESC
    LIMIT 1
  `, [dispensaryId]);

  if (result.rows.length === 0) {
    return null;
  }

  const row = result.rows[0];
  const payload = await loadPayloadFromPath(row.storage_path);

  return {
    payload,
    metadata: {
      id: row.id,
      dispensaryId: row.dispensary_id,
      crawlRunId: row.crawl_run_id,
      productCount: row.product_count,
      fetchedAt: row.fetched_at,
      storagePath: row.storage_path
    }
  };
}

/**
 * Get recent payloads for comparison (latest and previous by default)
 *
 * @param pool - Database connection pool
 * @param dispensaryId - ID of the dispensary
 * @param limit - Number of recent payloads to retrieve (default 2)
 * @returns Array of LoadPayloadResult, most recent first
 */
export async function getRecentPayloads(
  pool: Pool,
  dispensaryId: number,
  limit: number = 2
): Promise<LoadPayloadResult[]> {
  const result = await pool.query(`
    SELECT id, dispensary_id, crawl_run_id, storage_path, product_count, fetched_at
    FROM raw_crawl_payloads
    WHERE dispensary_id = $1
    ORDER BY fetched_at DESC
    LIMIT $2
  `, [dispensaryId, limit]);

  const payloads: LoadPayloadResult[] = [];

  for (const row of result.rows) {
    const payload = await loadPayloadFromPath(row.storage_path);
    payloads.push({
      payload,
      metadata: {
        id: row.id,
        dispensaryId: row.dispensary_id,
        crawlRunId: row.crawl_run_id,
        productCount: row.product_count,
        fetchedAt: row.fetched_at,
        storagePath: row.storage_path
      }
    });
  }

  return payloads;
}
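
This is the building block for the /api/payloads diffing endpoints; a sketch of a set-based product diff, assuming the payload exposes products with stable `id` fields (shape not confirmed here):

// Hypothetical diff: which product IDs appeared or disappeared between the
// two most recent crawls for a store?
async function diffLatestCrawls(pool: Pool, dispensaryId: number) {
  const [latest, previous] = await getRecentPayloads(pool, dispensaryId, 2);
  if (!latest || !previous) return null; // need two crawls to compare

  const idsOf = (r: LoadPayloadResult): Set<string> =>
    new Set((r.payload?.data?.products ?? []).map((p: any) => String(p.id)));

  const current = idsOf(latest);
  const prior = idsOf(previous);
  return {
    added: [...current].filter(id => !prior.has(id)),
    removed: [...prior].filter(id => !current.has(id)),
  };
}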

/**
 * List payload metadata without loading files (for browsing/pagination)
 *
 * @param pool - Database connection pool
 * @param options - Query options
 * @returns Array of metadata rows
 */
export async function listPayloadMetadata(
  pool: Pool,
  options: {
    dispensaryId?: number;
    startDate?: Date;
    endDate?: Date;
    limit?: number;
    offset?: number;
  } = {}
): Promise<Array<{
  id: number;
  dispensaryId: number;
  crawlRunId: number | null;
  storagePath: string;
  productCount: number;
  sizeBytes: number;
  sizeBytesRaw: number;
  fetchedAt: Date;
}>> {
  const conditions: string[] = [];
  const params: any[] = [];
  let paramIndex = 1;

  if (options.dispensaryId) {
    conditions.push(`dispensary_id = $${paramIndex++}`);
    params.push(options.dispensaryId);
  }

  if (options.startDate) {
    conditions.push(`fetched_at >= $${paramIndex++}`);
    params.push(options.startDate);
  }

  if (options.endDate) {
    conditions.push(`fetched_at <= $${paramIndex++}`);
    params.push(options.endDate);
  }

  const whereClause = conditions.length > 0 ? `WHERE ${conditions.join(' AND ')}` : '';
  const limit = options.limit || 50;
  const offset = options.offset || 0;

  params.push(limit, offset);

  const result = await pool.query(`
    SELECT
      id,
      dispensary_id,
      crawl_run_id,
      storage_path,
      product_count,
      size_bytes,
      size_bytes_raw,
      fetched_at
    FROM raw_crawl_payloads
    ${whereClause}
    ORDER BY fetched_at DESC
    LIMIT $${paramIndex++} OFFSET $${paramIndex}
  `, params);

  return result.rows.map(row => ({
    id: row.id,
    dispensaryId: row.dispensary_id,
    crawlRunId: row.crawl_run_id,
    storagePath: row.storage_path,
    productCount: row.product_count,
    sizeBytes: row.size_bytes,
    sizeBytesRaw: row.size_bytes_raw,
    fetchedAt: row.fetched_at
  }));
}
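
A browsing sketch, e.g. behind a paginated /api/payloads list route (store ID and page size are hypothetical):

// Page through the last 7 days of payload metadata for one store.
async function listLastWeek(pool: Pool, page: number = 0) {
  return listPayloadMetadata(pool, {
    dispensaryId: 42, // hypothetical store ID
    startDate: new Date(Date.now() - 7 * 24 * 60 * 60 * 1000),
    limit: 20,
    offset: page * 20,
  });
}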

/**
 * Delete old payloads (for retention policy)
 *
 * @param pool - Database connection pool
 * @param olderThan - Delete payloads older than this date
 * @returns Number of payloads deleted
 */
export async function deleteOldPayloads(
  pool: Pool,
  olderThan: Date
): Promise<number> {
  // Get paths first
  const result = await pool.query(`
    SELECT id, storage_path FROM raw_crawl_payloads
    WHERE fetched_at < $1
  `, [olderThan]);

  // Delete files
  for (const row of result.rows) {
    try {
      await fs.promises.unlink(row.storage_path);
    } catch (err: any) {
      if (err.code !== 'ENOENT') {
        console.warn(`[PayloadStorage] Failed to delete ${row.storage_path}: ${err.message}`);
      }
    }
  }

  // Delete DB records
  await pool.query(`
    DELETE FROM raw_crawl_payloads
    WHERE fetched_at < $1
  `, [olderThan]);

  console.log(`[PayloadStorage] Deleted ${result.rows.length} payloads older than ${olderThan.toISOString()}`);

  return result.rows.length;
}
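
A retention sweep sketch; the 90-day window and how it gets scheduled (e.g. via the new TaskScheduler) are assumptions, not part of this file:

// Hypothetical retention job: keep 90 days of raw payloads.
async function retentionSweep(pool: Pool): Promise<number> {
  const cutoff = new Date(Date.now() - 90 * 24 * 60 * 60 * 1000);
  return deleteOldPayloads(pool, cutoff);
}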