## Changes

- **Migration 089**: Add is_immutable and method columns to task_schedules
  - Per-state product_discovery schedules (4h default)
  - Store discovery weekly (168h)
  - All schedules use HTTP transport (Puppeteer/browser)
- **Task Scheduler**: HTTP-only product discovery with per-state scheduling
  - Each state has its own immutable schedule
  - Schedules can be edited (interval/priority) but not deleted
- **TasksDashboard UI**: Full immutability support
  - Lock icon for immutable schedules
  - State and Method columns in schedules table
  - Disabled delete for immutable, restricted edit fields
- **Store Discovery HTTP**: Auto-queue product_discovery for new stores
- **Migration 088**: Discovery payloads storage schema

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
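For context, the payload-storage utility below reads and writes three discovery-specific columns on `raw_crawl_payloads`. As a hedged sketch (not the actual migration file; column types, defaults, and the `pool.query` wrapper are assumptions), Migration 088 plausibly amounts to something like:

```ts
// Hypothetical sketch of Migration 088 (discovery payload columns), inferred from
// the INSERT/SELECT statements in the payload-storage utility below. The real
// migration may differ in types, defaults, and indexes.
await pool.query(`
  ALTER TABLE raw_crawl_payloads
    ADD COLUMN payload_type TEXT,    -- e.g. 'store_discovery'
    ADD COLUMN state_code   TEXT,    -- e.g. 'AZ', 'MI'
    ADD COLUMN store_count  INTEGER  -- stores found in a discovery crawl
`);
```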
/**
 * Payload Storage Utility
 *
 * Per TASK_WORKFLOW_2024-12-10.md: Store raw GraphQL payloads for historical analysis.
 *
 * Design Pattern: Metadata/Payload Separation
 * - Metadata in PostgreSQL (raw_crawl_payloads table): Small, indexed, queryable
 * - Payload on filesystem: Gzipped JSON at storage_path
 *
 * Storage structure:
 * /storage/payloads/{year}/{month}/{day}/store_{dispensary_id}_{timestamp}.json.gz
 *
 * Benefits:
 * - Compare any two crawls to see what changed
 * - Replay/re-normalize historical data if logic changes
 * - Debug issues by seeing exactly what the API returned
 * - DB stays small, backups stay fast
 * - ~90% compression (1.5MB -> 150KB per crawl)
 */
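
// Illustration of the metadata/payload split described above (representative
// values, not real data):
//
//   DB row in raw_crawl_payloads:
//     { id: 101, dispensary_id: 42, product_count: 350,
//       size_bytes: 153600, size_bytes_raw: 1572864,
//       storage_path: 'storage/payloads/2024/12/10/store_42_1733844600000.json.gz', ... }
//
//   File at storage_path: the full GraphQL response, gzipped.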

import * as fs from 'fs';
import * as path from 'path';
import * as zlib from 'zlib';
import { promisify } from 'util';
import { Pool } from 'pg';
import * as crypto from 'crypto';

const gzip = promisify(zlib.gzip);
const gunzip = promisify(zlib.gunzip);

// Base path for payload storage (matches image storage pattern)
const PAYLOAD_BASE_PATH = process.env.PAYLOAD_STORAGE_PATH || './storage/payloads';

/**
 * Result from saving a payload
 */
export interface SavePayloadResult {
  id: number;
  storagePath: string;
  sizeBytes: number;
  sizeBytesRaw: number;
  checksum: string;
}

/**
 * Result from loading a payload
 */
export interface LoadPayloadResult {
  payload: any;
  metadata: {
    id: number;
    dispensaryId: number;
    crawlRunId: number | null;
    productCount: number;
    fetchedAt: Date;
    storagePath: string;
  };
}

/**
 * Generate storage path for a payload
 *
 * Format: /storage/payloads/{year}/{month}/{day}/store_{dispensary_id}_{timestamp}.json.gz
 */
function generateStoragePath(dispensaryId: number, timestamp: Date): string {
  const year = timestamp.getFullYear();
  const month = String(timestamp.getMonth() + 1).padStart(2, '0');
  const day = String(timestamp.getDate()).padStart(2, '0');
  const ts = timestamp.getTime();

  return path.join(
    PAYLOAD_BASE_PATH,
    String(year),
    month,
    day,
    `store_${dispensaryId}_${ts}.json.gz`
  );
}
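
// Illustrative output (an assumption based on the format above, with the default
// PAYLOAD_BASE_PATH and a UTC environment, since the date parts use local time):
//
//   generateStoragePath(42, new Date('2024-12-10T15:30:00Z'))
//   // -> 'storage/payloads/2024/12/10/store_42_1733844600000.json.gz'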

/**
 * Ensure directory exists for a file path
 */
async function ensureDir(filePath: string): Promise<void> {
  const dir = path.dirname(filePath);
  await fs.promises.mkdir(dir, { recursive: true });
}

/**
 * Calculate SHA256 checksum of data
 */
function calculateChecksum(data: Buffer): string {
  return crypto.createHash('sha256').update(data).digest('hex');
}

/**
 * Save a raw crawl payload to filesystem and record metadata in DB
 *
 * @param pool - Database connection pool
 * @param dispensaryId - ID of the dispensary
 * @param payload - Raw JSON payload from GraphQL
 * @param crawlRunId - Optional crawl_run ID for linking
 * @param productCount - Number of products in payload
 * @returns SavePayloadResult with file info and DB record ID
 */
export async function saveRawPayload(
  pool: Pool,
  dispensaryId: number,
  payload: any,
  crawlRunId: number | null = null,
  productCount: number = 0
): Promise<SavePayloadResult> {
  const timestamp = new Date();
  const storagePath = generateStoragePath(dispensaryId, timestamp);

  // Serialize and compress
  const jsonStr = JSON.stringify(payload);
  const rawSize = Buffer.byteLength(jsonStr, 'utf8');
  const compressed = await gzip(Buffer.from(jsonStr, 'utf8'));
  const compressedSize = compressed.length;
  const checksum = calculateChecksum(compressed);

  // Write to filesystem
  await ensureDir(storagePath);
  await fs.promises.writeFile(storagePath, compressed);

  // Record metadata in DB
  const result = await pool.query(`
    INSERT INTO raw_crawl_payloads (
      crawl_run_id,
      dispensary_id,
      storage_path,
      product_count,
      size_bytes,
      size_bytes_raw,
      fetched_at,
      checksum_sha256
    ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
    RETURNING id
  `, [
    crawlRunId,
    dispensaryId,
    storagePath,
    productCount,
    compressedSize,
    rawSize,
    timestamp,
    checksum
  ]);

  console.log(`[PayloadStorage] Saved payload for store ${dispensaryId}: ${storagePath} (${(compressedSize / 1024).toFixed(1)}KB compressed, ${(rawSize / 1024).toFixed(1)}KB raw)`);

  return {
    id: result.rows[0].id,
    storagePath,
    sizeBytes: compressedSize,
    sizeBytesRaw: rawSize,
    checksum
  };
}
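
// Usage sketch (illustrative only; `pool` is an existing pg Pool, `data` a fetched
// GraphQL response, and the IDs are made up):
//
//   const saved = await saveRawPayload(pool, 42, data, 1001, 350);
//   const loaded = await loadRawPayloadById(pool, saved.id);
//   // loaded.payload deep-equals `data`; loaded.metadata.storagePath === saved.storagePath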

/**
 * Load a raw payload from filesystem by metadata ID
 *
 * @param pool - Database connection pool
 * @param payloadId - ID from raw_crawl_payloads table
 * @returns LoadPayloadResult with parsed payload and metadata
 */
export async function loadRawPayloadById(
  pool: Pool,
  payloadId: number
): Promise<LoadPayloadResult | null> {
  const result = await pool.query(`
    SELECT id, dispensary_id, crawl_run_id, storage_path, product_count, fetched_at
    FROM raw_crawl_payloads
    WHERE id = $1
  `, [payloadId]);

  if (result.rows.length === 0) {
    return null;
  }

  const row = result.rows[0];
  const payload = await loadPayloadFromPath(row.storage_path);

  return {
    payload,
    metadata: {
      id: row.id,
      dispensaryId: row.dispensary_id,
      crawlRunId: row.crawl_run_id,
      productCount: row.product_count,
      fetchedAt: row.fetched_at,
      storagePath: row.storage_path
    }
  };
}

/**
 * Load a raw payload directly from filesystem path
 *
 * @param storagePath - Path to gzipped JSON file
 * @returns Parsed JSON payload
 */
export async function loadPayloadFromPath(storagePath: string): Promise<any> {
  const compressed = await fs.promises.readFile(storagePath);
  const decompressed = await gunzip(compressed);
  return JSON.parse(decompressed.toString('utf8'));
}

/**
 * Get the latest payload for a dispensary
 *
 * @param pool - Database connection pool
 * @param dispensaryId - ID of the dispensary
 * @returns LoadPayloadResult or null if none exists
 */
export async function getLatestPayload(
  pool: Pool,
  dispensaryId: number
): Promise<LoadPayloadResult | null> {
  const result = await pool.query(`
    SELECT id, dispensary_id, crawl_run_id, storage_path, product_count, fetched_at
    FROM raw_crawl_payloads
    WHERE dispensary_id = $1
    ORDER BY fetched_at DESC
    LIMIT 1
  `, [dispensaryId]);

  if (result.rows.length === 0) {
    return null;
  }

  const row = result.rows[0];
  const payload = await loadPayloadFromPath(row.storage_path);

  return {
    payload,
    metadata: {
      id: row.id,
      dispensaryId: row.dispensary_id,
      crawlRunId: row.crawl_run_id,
      productCount: row.product_count,
      fetchedAt: row.fetched_at,
      storagePath: row.storage_path
    }
  };
}

/**
 * Get recent payloads for comparison (latest and previous by default)
 *
 * @param pool - Database connection pool
 * @param dispensaryId - ID of the dispensary
 * @param limit - Number of recent payloads to retrieve (default 2)
 * @returns Array of LoadPayloadResult, most recent first
 */
export async function getRecentPayloads(
  pool: Pool,
  dispensaryId: number,
  limit: number = 2
): Promise<LoadPayloadResult[]> {
  const result = await pool.query(`
    SELECT id, dispensary_id, crawl_run_id, storage_path, product_count, fetched_at
    FROM raw_crawl_payloads
    WHERE dispensary_id = $1
    ORDER BY fetched_at DESC
    LIMIT $2
  `, [dispensaryId, limit]);

  const payloads: LoadPayloadResult[] = [];

  for (const row of result.rows) {
    const payload = await loadPayloadFromPath(row.storage_path);
    payloads.push({
      payload,
      metadata: {
        id: row.id,
        dispensaryId: row.dispensary_id,
        crawlRunId: row.crawl_run_id,
        productCount: row.product_count,
        fetchedAt: row.fetched_at,
        storagePath: row.storage_path
      }
    });
  }

  return payloads;
}
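
// Sketch of the "compare any two crawls" benefit from the file header
// (illustrative; the actual diffing is left to the caller):
//
//   const [latest, previous] = await getRecentPayloads(pool, 42);
//   if (latest && previous) {
//     const delta = latest.metadata.productCount - previous.metadata.productCount;
//     console.log(`Product count changed by ${delta} since ${previous.metadata.fetchedAt}`);
//   }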

/**
 * List payload metadata without loading files (for browsing/pagination)
 *
 * @param pool - Database connection pool
 * @param options - Query options
 * @returns Array of metadata rows
 */
export async function listPayloadMetadata(
  pool: Pool,
  options: {
    dispensaryId?: number;
    startDate?: Date;
    endDate?: Date;
    limit?: number;
    offset?: number;
  } = {}
): Promise<Array<{
  id: number;
  dispensaryId: number;
  crawlRunId: number | null;
  storagePath: string;
  productCount: number;
  sizeBytes: number;
  sizeBytesRaw: number;
  fetchedAt: Date;
}>> {
  const conditions: string[] = [];
  const params: any[] = [];
  let paramIndex = 1;

  if (options.dispensaryId) {
    conditions.push(`dispensary_id = $${paramIndex++}`);
    params.push(options.dispensaryId);
  }

  if (options.startDate) {
    conditions.push(`fetched_at >= $${paramIndex++}`);
    params.push(options.startDate);
  }

  if (options.endDate) {
    conditions.push(`fetched_at <= $${paramIndex++}`);
    params.push(options.endDate);
  }

  const whereClause = conditions.length > 0 ? `WHERE ${conditions.join(' AND ')}` : '';
  const limit = options.limit || 50;
  const offset = options.offset || 0;

  params.push(limit, offset);

  const result = await pool.query(`
    SELECT
      id,
      dispensary_id,
      crawl_run_id,
      storage_path,
      product_count,
      size_bytes,
      size_bytes_raw,
      fetched_at
    FROM raw_crawl_payloads
    ${whereClause}
    ORDER BY fetched_at DESC
    LIMIT $${paramIndex++} OFFSET $${paramIndex}
  `, params);

  return result.rows.map(row => ({
    id: row.id,
    dispensaryId: row.dispensary_id,
    crawlRunId: row.crawl_run_id,
    storagePath: row.storage_path,
    productCount: row.product_count,
    sizeBytes: row.size_bytes,
    sizeBytesRaw: row.size_bytes_raw,
    fetchedAt: row.fetched_at
  }));
}
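
// Browsing sketch (illustrative): page through one store's crawl history without
// touching the payload files on disk.
//
//   const page2 = await listPayloadMetadata(pool, { dispensaryId: 42, limit: 50, offset: 50 });
//   for (const m of page2) {
//     console.log(m.fetchedAt, m.productCount, `${(m.sizeBytes / 1024).toFixed(1)}KB`);
//   }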

/**
 * Result from saving a discovery payload
 */
export interface SaveDiscoveryPayloadResult {
  id: number;
  storagePath: string;
  sizeBytes: number;
  sizeBytesRaw: number;
  checksum: string;
}

/**
 * Generate storage path for a discovery payload
 *
 * Format: /storage/payloads/discovery/{year}/{month}/{day}/state_{state_code}_{timestamp}.json.gz
 */
function generateDiscoveryStoragePath(stateCode: string, timestamp: Date): string {
  const year = timestamp.getFullYear();
  const month = String(timestamp.getMonth() + 1).padStart(2, '0');
  const day = String(timestamp.getDate()).padStart(2, '0');
  const ts = timestamp.getTime();

  return path.join(
    PAYLOAD_BASE_PATH,
    'discovery',
    String(year),
    month,
    day,
    `state_${stateCode.toLowerCase()}_${ts}.json.gz`
  );
}

/**
 * Save a raw store discovery payload to filesystem and record metadata in DB
 *
 * @param pool - Database connection pool
 * @param stateCode - State code (e.g., 'AZ', 'MI')
 * @param payload - Raw JSON payload from discovery GraphQL
 * @param storeCount - Number of stores in payload
 * @returns SaveDiscoveryPayloadResult with file info and DB record ID
 */
export async function saveDiscoveryPayload(
  pool: Pool,
  stateCode: string,
  payload: any,
  storeCount: number = 0
): Promise<SaveDiscoveryPayloadResult> {
  const timestamp = new Date();
  const storagePath = generateDiscoveryStoragePath(stateCode, timestamp);

  // Serialize and compress
  const jsonStr = JSON.stringify(payload);
  const rawSize = Buffer.byteLength(jsonStr, 'utf8');
  const compressed = await gzip(Buffer.from(jsonStr, 'utf8'));
  const compressedSize = compressed.length;
  const checksum = calculateChecksum(compressed);

  // Write to filesystem
  await ensureDir(storagePath);
  await fs.promises.writeFile(storagePath, compressed);

  // Record metadata in DB
  const result = await pool.query(`
    INSERT INTO raw_crawl_payloads (
      payload_type,
      state_code,
      storage_path,
      store_count,
      size_bytes,
      size_bytes_raw,
      fetched_at,
      checksum_sha256
    ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
    RETURNING id
  `, [
    'store_discovery',
    stateCode.toUpperCase(),
    storagePath,
    storeCount,
    compressedSize,
    rawSize,
    timestamp,
    checksum
  ]);

  console.log(`[PayloadStorage] Saved discovery payload for ${stateCode}: ${storagePath} (${storeCount} stores, ${(compressedSize / 1024).toFixed(1)}KB compressed)`);

  return {
    id: result.rows[0].id,
    storagePath,
    sizeBytes: compressedSize,
    sizeBytesRaw: rawSize,
    checksum
  };
}

/**
 * Get the latest discovery payload for a state
 *
 * @param pool - Database connection pool
 * @param stateCode - State code (e.g., 'AZ', 'MI')
 * @returns Parsed payload and metadata, or null if none exists
 */
export async function getLatestDiscoveryPayload(
  pool: Pool,
  stateCode: string
): Promise<{ payload: any; metadata: any } | null> {
  const result = await pool.query(`
    SELECT id, state_code, storage_path, store_count, fetched_at
    FROM raw_crawl_payloads
    WHERE payload_type = 'store_discovery'
      AND state_code = $1
    ORDER BY fetched_at DESC
    LIMIT 1
  `, [stateCode.toUpperCase()]);

  if (result.rows.length === 0) {
    return null;
  }

  const row = result.rows[0];
  const payload = await loadPayloadFromPath(row.storage_path);

  return {
    payload,
    metadata: {
      id: row.id,
      stateCode: row.state_code,
      storeCount: row.store_count,
      fetchedAt: row.fetched_at,
      storagePath: row.storage_path
    }
  };
}
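
// Discovery flow sketch (illustrative; `discoveryResponse` is an assumed GraphQL
// result with a `stores` array):
//
//   await saveDiscoveryPayload(pool, 'az', discoveryResponse, discoveryResponse.stores.length);
//   const latest = await getLatestDiscoveryPayload(pool, 'AZ'); // state codes are normalized to uppercase
//   console.log(latest?.metadata.storeCount);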

/**
 * Delete old payloads (for retention policy)
 *
 * @param pool - Database connection pool
 * @param olderThan - Delete payloads older than this date
 * @returns Number of payloads deleted
 */
export async function deleteOldPayloads(
  pool: Pool,
  olderThan: Date
): Promise<number> {
  // Get paths first
  const result = await pool.query(`
    SELECT id, storage_path FROM raw_crawl_payloads
    WHERE fetched_at < $1
  `, [olderThan]);

  // Delete files
  for (const row of result.rows) {
    try {
      await fs.promises.unlink(row.storage_path);
    } catch (err: any) {
      if (err.code !== 'ENOENT') {
        console.warn(`[PayloadStorage] Failed to delete ${row.storage_path}: ${err.message}`);
      }
    }
  }

  // Delete DB records
  await pool.query(`
    DELETE FROM raw_crawl_payloads
    WHERE fetched_at < $1
  `, [olderThan]);

  console.log(`[PayloadStorage] Deleted ${result.rows.length} payloads older than ${olderThan.toISOString()}`);

  return result.rows.length;
}
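
// Retention sketch (illustrative; the 90-day cutoff is an assumption, not a
// documented policy):
//
//   const cutoff = new Date(Date.now() - 90 * 24 * 60 * 60 * 1000);
//   const removed = await deleteOldPayloads(pool, cutoff);
//   console.log(`Retention pass removed ${removed} payloads`);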