/**
 * Payload Storage Utility
 *
 * Per TASK_WORKFLOW_2024-12-10.md: Store raw GraphQL payloads for historical analysis.
 *
 * Design Pattern: Metadata/Payload Separation
 * - Metadata in PostgreSQL (raw_crawl_payloads table): Small, indexed, queryable
 * - Payload on filesystem: Gzipped JSON at storage_path
 *
 * Storage structure:
 *   /storage/payloads/{year}/{month}/{day}/store_{dispensary_id}_{timestamp}.json.gz
 *
 * Benefits:
 * - Compare any two crawls to see what changed
 * - Replay/re-normalize historical data if logic changes
 * - Debug issues by seeing exactly what the API returned
 * - DB stays small, backups stay fast
 * - ~90% compression (1.5MB -> 150KB per crawl)
 */

import * as fs from 'fs';
import * as path from 'path';
import * as zlib from 'zlib';
import { promisify } from 'util';
import type { Pool } from 'pg';
import * as crypto from 'crypto';

const gzip = promisify(zlib.gzip);
const gunzip = promisify(zlib.gunzip);

// Base path for payload storage (matches image storage pattern)
const PAYLOAD_BASE_PATH = process.env.PAYLOAD_STORAGE_PATH || './storage/payloads';

/**
 * Result from saving a payload
 */
export interface SavePayloadResult {
  id: number;
  storagePath: string;
  sizeBytes: number;
  sizeBytesRaw: number;
  checksum: string;
}

/**
 * Result from loading a payload
 */
export interface LoadPayloadResult {
  payload: any;
  metadata: {
    id: number;
    dispensaryId: number;
    crawlRunId: number | null;
    productCount: number;
    fetchedAt: Date;
    storagePath: string;
  };
}

/**
 * Metadata row returned by listPayloadMetadata (no file contents).
 */
export interface PayloadMetadata {
  id: number;
  dispensaryId: number;
  crawlRunId: number | null;
  storagePath: string;
  productCount: number;
  sizeBytes: number;
  sizeBytesRaw: number;
  fetchedAt: Date;
}

/**
 * Columns selected from raw_crawl_payloads when hydrating a crawl payload.
 */
interface CrawlPayloadRow {
  id: number;
  dispensary_id: number;
  crawl_run_id: number | null;
  storage_path: string;
  product_count: number;
  fetched_at: Date;
}

/**
 * Size and checksum information about a payload file written to disk.
 */
interface StoredPayloadFile {
  sizeBytes: number;
  sizeBytesRaw: number;
  checksum: string;
}

/**
 * Build the {year}/{month}/{day} directory segments for a timestamp.
 *
 * Uses the local-time getters (getFullYear/getMonth/getDate), so the
 * directory a payload lands in follows the server's time zone.
 */
function datePathSegments(timestamp: Date): [string, string, string] {
  return [
    String(timestamp.getFullYear()),
    String(timestamp.getMonth() + 1).padStart(2, '0'),
    String(timestamp.getDate()).padStart(2, '0'),
  ];
}

/**
 * Generate storage path for a payload
 *
 * Format: /storage/payloads/{year}/{month}/{day}/store_{dispensary_id}_{timestamp}.json.gz
 */
function generateStoragePath(dispensaryId: number, timestamp: Date): string {
  const ts = timestamp.getTime();
  return path.join(
    PAYLOAD_BASE_PATH,
    ...datePathSegments(timestamp),
    `store_${dispensaryId}_${ts}.json.gz`
  );
}

/**
 * Ensure directory exists for a file path
 */
async function ensureDir(filePath: string): Promise<void> {
  const dir = path.dirname(filePath);
  await fs.promises.mkdir(dir, { recursive: true });
}

/**
 * Calculate SHA256 checksum of data
 */
function calculateChecksum(data: Buffer): string {
  return crypto.createHash('sha256').update(data).digest('hex');
}

/**
 * Remove a file, treating "already missing" as success.
 *
 * @param filePath - File to remove
 * @returns true when the file no longer exists afterwards (removed or ENOENT),
 *          false when removal failed for another reason (a warning is logged)
 */
async function unlinkIfPresent(filePath: string): Promise<boolean> {
  try {
    await fs.promises.unlink(filePath);
    return true;
  } catch (err: unknown) {
    const code = err instanceof Error ? (err as NodeJS.ErrnoException).code : undefined;
    if (code === 'ENOENT') {
      return true;
    }
    const message = err instanceof Error ? err.message : String(err);
    console.warn(`[PayloadStorage] Failed to delete ${filePath}: ${message}`);
    return false;
  }
}

/**
 * Serialize a payload to JSON, gzip it and write it to storagePath.
 *
 * @param storagePath - Destination file (parent directories are created)
 * @param payload - Any JSON-serializable value
 * @returns Compressed size, raw size and SHA256 checksum of the compressed bytes
 * @throws TypeError when the payload cannot be represented as JSON
 */
async function writeCompressedPayload(
  storagePath: string,
  payload: unknown
): Promise<StoredPayloadFile> {
  const jsonStr: string | undefined = JSON.stringify(payload);
  if (jsonStr === undefined) {
    // JSON.stringify yields undefined for undefined/functions/symbols.
    throw new TypeError('[PayloadStorage] Payload is not JSON-serializable');
  }

  const raw = Buffer.from(jsonStr, 'utf8');
  const compressed = await gzip(raw);

  await ensureDir(storagePath);
  await fs.promises.writeFile(storagePath, compressed);

  return {
    sizeBytes: compressed.length,
    sizeBytesRaw: raw.length,
    checksum: calculateChecksum(compressed),
  };
}

/**
 * Run a metadata INSERT ... RETURNING id for a file that is already on disk.
 *
 * If the query fails, the file is removed (best effort) before the error is
 * rethrown, so a failed save does not leave an untracked file behind.
 *
 * @returns The id of the inserted row
 */
async function insertMetadata(
  pool: Pool,
  storagePath: string,
  sql: string,
  params: unknown[]
): Promise<number> {
  let result;
  try {
    result = await pool.query(sql, params);
  } catch (err: unknown) {
    await unlinkIfPresent(storagePath);
    throw err;
  }
  return result.rows[0].id;
}

/**
 * Load the payload file referenced by a metadata row and combine both
 * into a LoadPayloadResult.
 */
async function hydratePayloadRow(row: CrawlPayloadRow): Promise<LoadPayloadResult> {
  const payload = await loadPayloadFromPath(row.storage_path);
  return {
    payload,
    metadata: {
      id: row.id,
      dispensaryId: row.dispensary_id,
      crawlRunId: row.crawl_run_id,
      productCount: row.product_count,
      fetchedAt: row.fetched_at,
      storagePath: row.storage_path,
    },
  };
}

/**
 * Save a raw crawl payload to filesystem and record metadata in DB
 *
 * The file is written first; if recording the metadata fails, the file is
 * removed again and the database error is rethrown.
 *
 * @param pool - Database connection pool
 * @param dispensaryId - ID of the dispensary
 * @param payload - Raw JSON payload from GraphQL
 * @param crawlRunId - Optional crawl_run ID for linking
 * @param productCount - Number of products in payload
 * @returns SavePayloadResult with file info and DB record ID
 */
export async function saveRawPayload(
  pool: Pool,
  dispensaryId: number,
  payload: any,
  crawlRunId: number | null = null,
  productCount: number = 0
): Promise<SavePayloadResult> {
  const timestamp = new Date();
  const storagePath = generateStoragePath(dispensaryId, timestamp);

  // Serialize, compress and write to filesystem
  const stored = await writeCompressedPayload(storagePath, payload);

  // Record metadata in DB
  const id = await insertMetadata(
    pool,
    storagePath,
    `
    INSERT INTO raw_crawl_payloads (
      crawl_run_id,
      dispensary_id,
      storage_path,
      product_count,
      size_bytes,
      size_bytes_raw,
      fetched_at,
      checksum_sha256
    ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
    RETURNING id
  `,
    [
      crawlRunId,
      dispensaryId,
      storagePath,
      productCount,
      stored.sizeBytes,
      stored.sizeBytesRaw,
      timestamp,
      stored.checksum,
    ]
  );

  console.log(`[PayloadStorage] Saved payload for store ${dispensaryId}: ${storagePath} (${(stored.sizeBytes / 1024).toFixed(1)}KB compressed, ${(stored.sizeBytesRaw / 1024).toFixed(1)}KB raw)`);

  return {
    id,
    storagePath,
    sizeBytes: stored.sizeBytes,
    sizeBytesRaw: stored.sizeBytesRaw,
    checksum: stored.checksum,
  };
}

/**
 * Load a raw payload from filesystem by metadata ID
 *
 * @param pool - Database connection pool
 * @param payloadId - ID from raw_crawl_payloads table
 * @returns LoadPayloadResult with parsed payload and metadata, or null if no
 *          row with that ID exists
 */
export async function loadRawPayloadById(
  pool: Pool,
  payloadId: number
): Promise<LoadPayloadResult | null> {
  const result = await pool.query(`
    SELECT id, dispensary_id, crawl_run_id, storage_path, product_count, fetched_at
    FROM raw_crawl_payloads
    WHERE id = $1
  `, [payloadId]);

  if (result.rows.length === 0) {
    return null;
  }

  return hydratePayloadRow(result.rows[0]);
}

/**
 * Load a raw payload directly from filesystem path
 *
 * @param storagePath - Path to gzipped JSON file
 * @returns Parsed JSON payload
 */
export async function loadPayloadFromPath(storagePath: string): Promise<any> {
  const compressed = await fs.promises.readFile(storagePath);
  const decompressed = await gunzip(compressed);
  return JSON.parse(decompressed.toString('utf8'));
}

/**
 * Get the latest payload for a dispensary
 *
 * @param pool - Database connection pool
 * @param dispensaryId - ID of the dispensary
 * @returns LoadPayloadResult or null if none exists
 */
export async function getLatestPayload(
  pool: Pool,
  dispensaryId: number
): Promise<LoadPayloadResult | null> {
  const result = await pool.query(`
    SELECT id, dispensary_id, crawl_run_id, storage_path, product_count, fetched_at
    FROM raw_crawl_payloads
    WHERE dispensary_id = $1
    ORDER BY fetched_at DESC
    LIMIT 1
  `, [dispensaryId]);

  if (result.rows.length === 0) {
    return null;
  }

  return hydratePayloadRow(result.rows[0]);
}

/**
 * Get two payloads for comparison (latest and previous, or by IDs)
 *
 * @param pool - Database connection pool
 * @param dispensaryId - ID of the dispensary
 * @param limit - Number of recent payloads to retrieve (default 2)
 * @returns Array of LoadPayloadResult, most recent first
 */
export async function getRecentPayloads(
  pool: Pool,
  dispensaryId: number,
  limit: number = 2
): Promise<LoadPayloadResult[]> {
  const result = await pool.query(`
    SELECT id, dispensary_id, crawl_run_id, storage_path, product_count, fetched_at
    FROM raw_crawl_payloads
    WHERE dispensary_id = $1
    ORDER BY fetched_at DESC
    LIMIT $2
  `, [dispensaryId, limit]);

  // Files are loaded one at a time so a large `limit` does not read and
  // decompress every payload concurrently.
  const payloads: LoadPayloadResult[] = [];
  for (const row of result.rows) {
    payloads.push(await hydratePayloadRow(row));
  }

  return payloads;
}

/**
 * List payload metadata without loading files (for browsing/pagination)
 *
 * Falsy option values (undefined, 0, NaN) are treated as "not provided":
 * no filter is applied for them, `limit` falls back to 50 and `offset` to 0.
 *
 * @param pool - Database connection pool
 * @param options - Query options
 * @returns Array of metadata rows, most recent first
 */
export async function listPayloadMetadata(
  pool: Pool,
  options: {
    dispensaryId?: number;
    startDate?: Date;
    endDate?: Date;
    limit?: number;
    offset?: number;
  } = {}
): Promise<PayloadMetadata[]> {
  const conditions: string[] = [];
  const params: unknown[] = [];

  // Push the value first so params.length is the 1-based placeholder index.
  const placeholderFor = (value: unknown): string => {
    params.push(value);
    return `$${params.length}`;
  };

  if (options.dispensaryId) {
    conditions.push(`dispensary_id = ${placeholderFor(options.dispensaryId)}`);
  }

  if (options.startDate) {
    conditions.push(`fetched_at >= ${placeholderFor(options.startDate)}`);
  }

  if (options.endDate) {
    conditions.push(`fetched_at <= ${placeholderFor(options.endDate)}`);
  }

  const whereClause = conditions.length > 0 ? `WHERE ${conditions.join(' AND ')}` : '';
  const limitPlaceholder = placeholderFor(options.limit || 50);
  const offsetPlaceholder = placeholderFor(options.offset || 0);

  const result = await pool.query(`
    SELECT
      id,
      dispensary_id,
      crawl_run_id,
      storage_path,
      product_count,
      size_bytes,
      size_bytes_raw,
      fetched_at
    FROM raw_crawl_payloads
    ${whereClause}
    ORDER BY fetched_at DESC
    LIMIT ${limitPlaceholder} OFFSET ${offsetPlaceholder}
  `, params);

  return result.rows.map((row): PayloadMetadata => ({
    id: row.id,
    dispensaryId: row.dispensary_id,
    crawlRunId: row.crawl_run_id,
    storagePath: row.storage_path,
    productCount: row.product_count,
    sizeBytes: row.size_bytes,
    sizeBytesRaw: row.size_bytes_raw,
    fetchedAt: row.fetched_at,
  }));
}

/**
 * Result from saving a discovery payload
 */
export interface SaveDiscoveryPayloadResult {
  id: number;
  storagePath: string;
  sizeBytes: number;
  sizeBytesRaw: number;
  checksum: string;
}

/**
 * Generate storage path for a discovery payload
 *
 * Format: /storage/payloads/discovery/{year}/{month}/{day}/state_{state_code}_{timestamp}.json.gz
 */
function generateDiscoveryStoragePath(stateCode: string, timestamp: Date): string {
  const ts = timestamp.getTime();
  return path.join(
    PAYLOAD_BASE_PATH,
    'discovery',
    ...datePathSegments(timestamp),
    `state_${stateCode.toLowerCase()}_${ts}.json.gz`
  );
}

/**
 * Save a raw store discovery payload to filesystem and record metadata in DB
 *
 * The file is written first; if recording the metadata fails, the file is
 * removed again and the database error is rethrown.
 *
 * @param pool - Database connection pool
 * @param stateCode - State code (e.g., 'AZ', 'MI')
 * @param payload - Raw JSON payload from discovery GraphQL
 * @param storeCount - Number of stores in payload
 * @returns SaveDiscoveryPayloadResult with file info and DB record ID
 */
export async function saveDiscoveryPayload(
  pool: Pool,
  stateCode: string,
  payload: any,
  storeCount: number = 0
): Promise<SaveDiscoveryPayloadResult> {
  const timestamp = new Date();
  const storagePath = generateDiscoveryStoragePath(stateCode, timestamp);

  // Serialize, compress and write to filesystem
  const stored = await writeCompressedPayload(storagePath, payload);

  // Record metadata in DB
  const id = await insertMetadata(
    pool,
    storagePath,
    `
    INSERT INTO raw_crawl_payloads (
      payload_type,
      state_code,
      storage_path,
      store_count,
      size_bytes,
      size_bytes_raw,
      fetched_at,
      checksum_sha256
    ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
    RETURNING id
  `,
    [
      'store_discovery',
      stateCode.toUpperCase(),
      storagePath,
      storeCount,
      stored.sizeBytes,
      stored.sizeBytesRaw,
      timestamp,
      stored.checksum,
    ]
  );

  console.log(`[PayloadStorage] Saved discovery payload for ${stateCode}: ${storagePath} (${storeCount} stores, ${(stored.sizeBytes / 1024).toFixed(1)}KB compressed)`);

  return {
    id,
    storagePath,
    sizeBytes: stored.sizeBytes,
    sizeBytesRaw: stored.sizeBytesRaw,
    checksum: stored.checksum,
  };
}

/**
 * Get the latest discovery payload for a state
 *
 * @param pool - Database connection pool
 * @param stateCode - State code (e.g., 'AZ', 'MI')
 * @returns Parsed payload and metadata, or null if none exists
 */
export async function getLatestDiscoveryPayload(
  pool: Pool,
  stateCode: string
): Promise<{ payload: any; metadata: any } | null> {
  const result = await pool.query(`
    SELECT id, state_code, storage_path, store_count, fetched_at
    FROM raw_crawl_payloads
    WHERE payload_type = 'store_discovery'
      AND state_code = $1
    ORDER BY fetched_at DESC
    LIMIT 1
  `, [stateCode.toUpperCase()]);

  if (result.rows.length === 0) {
    return null;
  }

  const row = result.rows[0];
  const payload = await loadPayloadFromPath(row.storage_path);

  return {
    payload,
    metadata: {
      id: row.id,
      stateCode: row.state_code,
      storeCount: row.store_count,
      fetchedAt: row.fetched_at,
      storagePath: row.storage_path,
    },
  };
}

/**
 * Delete old payloads (for retention policy)
 *
 * A metadata row is removed only when its file has been removed or was
 * already missing. Rows whose file could not be deleted are kept (a warning
 * is logged) so a later run can retry instead of orphaning the file.
 *
 * @param pool - Database connection pool
 * @param olderThan - Delete payloads older than this date
 * @returns Number of payloads deleted
 */
export async function deleteOldPayloads(
  pool: Pool,
  olderThan: Date
): Promise<number> {
  // Get paths first
  const result = await pool.query(`
    SELECT id, storage_path FROM raw_crawl_payloads
    WHERE fetched_at < $1
  `, [olderThan]);

  // Delete files, remembering which rows no longer have a file on disk
  const removableIds: number[] = [];
  for (const row of result.rows) {
    if (await unlinkIfPresent(row.storage_path)) {
      removableIds.push(row.id);
    }
  }

  // Delete exactly the DB records selected above whose files are gone
  if (removableIds.length > 0) {
    await pool.query(`
      DELETE FROM raw_crawl_payloads
      WHERE id = ANY($1)
    `, [removableIds]);
  }

  console.log(`[PayloadStorage] Deleted ${removableIds.length} payloads older than ${olderThan.toISOString()}`);

  return removableIds.length;
}