-- Migration 080: Raw Crawl Payloads Metadata Table
-- Per TASK_WORKFLOW_2024-12-10.md: store full GraphQL payloads for historical analysis.
--
-- Design pattern: metadata/payload separation
--   * Metadata (this table): small, indexed, queryable
--   * Payload (filesystem):  gzipped JSON located at storage_path
--
-- Why keep the payloads:
--   * Compare any two crawls to see what changed
--   * Replay / re-normalize historical data if the logic changes
--   * Debug issues by seeing exactly what the API returned
--   * The database stays small and backups stay fast
--
-- Storage location: /storage/payloads/{year}/{month}/{day}/store_{id}_{timestamp}.json.gz
-- Compression:      ~90% reduction (1.5MB -> 150KB per crawl)
--
-- Every statement below uses IF NOT EXISTS or is a COMMENT, so re-running the
-- migration does not fail on objects that already exist.

CREATE TABLE IF NOT EXISTS raw_crawl_payloads (
    id SERIAL PRIMARY KEY,

    -- Crawl tracking links.
    -- Deleting a crawl run keeps the payload row (the reference becomes NULL);
    -- deleting a dispensary removes its payload rows.
    crawl_run_id INTEGER REFERENCES crawl_runs(id) ON DELETE SET NULL,
    dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,

    -- Location of the gzipped JSON file
    storage_path TEXT NOT NULL,

    -- Summary values so common queries never have to open the file
    product_count INTEGER NOT NULL DEFAULT 0,
    size_bytes INTEGER,      -- compressed size, in bytes
    size_bytes_raw INTEGER,  -- uncompressed size, in bytes

    -- Timestamps: both default to the insert time when not supplied
    fetched_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    -- Optional checksum for integrity verification
    -- (64 characters fits a hex-encoded SHA-256 digest)
    checksum_sha256 VARCHAR(64)
);

-- Indexes for the common access paths.

-- All payloads of one dispensary.
-- NOTE(review): the composite index below also leads with dispensary_id, so this
-- one looks redundant; consider dropping it in a follow-up migration - confirm
-- with query plans first.
CREATE INDEX IF NOT EXISTS idx_raw_crawl_payloads_dispensary
    ON raw_crawl_payloads(dispensary_id);

-- Payloads of one dispensary, newest first.
CREATE INDEX IF NOT EXISTS idx_raw_crawl_payloads_dispensary_fetched
    ON raw_crawl_payloads(dispensary_id, fetched_at DESC);

-- Payloads across all dispensaries, newest first.
CREATE INDEX IF NOT EXISTS idx_raw_crawl_payloads_fetched
    ON raw_crawl_payloads(fetched_at DESC);

-- Lookup by crawl run; partial, so rows without a crawl run are not indexed.
CREATE INDEX IF NOT EXISTS idx_raw_crawl_payloads_crawl_run
    ON raw_crawl_payloads(crawl_run_id)
    WHERE crawl_run_id IS NOT NULL;

-- Catalog comments. The table comment is deliberately a two-line literal:
-- the line break is part of the stored text.
COMMENT ON TABLE raw_crawl_payloads IS 'Metadata for raw GraphQL payloads stored on filesystem. 
Per TASK_WORKFLOW_2024-12-10.md: Full payloads enable historical diffs and replay.';

COMMENT ON COLUMN raw_crawl_payloads.storage_path
    IS 'Path to gzipped JSON file, e.g. /storage/payloads/2024/12/10/store_123_1702234567.json.gz';

COMMENT ON COLUMN raw_crawl_payloads.size_bytes
    IS 'Compressed file size in bytes';

COMMENT ON COLUMN raw_crawl_payloads.size_bytes_raw
    IS 'Uncompressed payload size in bytes';