-- Context (from the introducing commit):
-- Major changes:
-- - Split crawl into payload_fetch (API -> disk) and product_refresh (disk -> DB)
-- - Add task chaining: store_discovery -> product_discovery -> payload_fetch -> product_refresh
-- - Add payload storage utilities for gzipped JSON on filesystem
-- - Add /api/payloads endpoints for payload access and diffing
-- - Add DB-driven TaskScheduler with schedule persistence
-- - Track newDispensaryIds through discovery promotion for chaining
-- - Add stealth improvements: HTTP fingerprinting, proxy rotation enhancements
-- - Add Workers dashboard K8s scaling controls
--
-- New files:
-- - src/tasks/handlers/payload-fetch.ts - Fetches from API, saves to disk
-- - src/services/task-scheduler.ts - DB-driven schedule management
-- - src/utils/payload-storage.ts - Payload save/load utilities
-- - src/routes/payloads.ts - Payload API endpoints
-- - src/services/http-fingerprint.ts - Browser fingerprint generation
-- - docs/TASK_WORKFLOW_2024-12-10.md - Complete workflow documentation
--
-- Migrations:
-- - 078: Proxy consecutive 403 tracking
-- - 079: task_schedules table
-- - 080: raw_crawl_payloads table (this file)
-- - 081: payload column and last_fetch_at
-- Migration 080: Raw Crawl Payloads Metadata Table
-- Per TASK_WORKFLOW_2024-12-10.md: Store full GraphQL payloads for historical analysis
--
-- Design Pattern: Metadata/Payload Separation
-- - Metadata (this table): Small, indexed, queryable
-- - Payload (filesystem): Gzipped JSON at storage_path
--
-- Benefits:
-- - Compare any two crawls to see what changed
-- - Replay/re-normalize historical data if logic changes
-- - Debug issues by seeing exactly what the API returned
-- - DB stays small, backups stay fast
--
-- Storage location: /storage/payloads/{year}/{month}/{day}/store_{id}_{timestamp}.json.gz
-- Compression: ~90% reduction (1.5MB -> 150KB per crawl)

-- One row per fetched payload file; the payload itself lives on disk at storage_path.
CREATE TABLE IF NOT EXISTS raw_crawl_payloads (
    id SERIAL PRIMARY KEY,

    -- Links to crawl tracking.
    -- crawl_run_id is nullable and SET NULL on delete so payload metadata
    -- outlives crawl_runs housekeeping; dispensary rows own their payloads
    -- (CASCADE removes metadata when a dispensary is deleted).
    crawl_run_id INTEGER REFERENCES crawl_runs(id) ON DELETE SET NULL,
    dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,

    -- File location (gzipped JSON).
    -- NOTE(review): consider UNIQUE (storage_path) in a follow-up migration to
    -- prevent duplicate metadata rows pointing at the same file — confirm the
    -- writer in src/utils/payload-storage.ts never legitimately re-registers a path.
    storage_path TEXT NOT NULL,

    -- Metadata for quick queries without loading the file.
    product_count INTEGER NOT NULL DEFAULT 0,
    size_bytes INTEGER,      -- Compressed size in bytes
    size_bytes_raw INTEGER,  -- Uncompressed size in bytes

    -- Timestamps.
    fetched_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),  -- when the API response was fetched
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),  -- when this metadata row was inserted

    -- Optional: SHA-256 hex digest (64 hex chars) for file integrity verification.
    checksum_sha256 VARCHAR(64)
);
-- Indexes for common queries

-- Lookups by dispensary alone.
-- NOTE(review): redundant with idx_raw_crawl_payloads_dispensary_fetched below
-- (same leading column); kept to match the original migration, but a follow-up
-- could drop it to save write overhead.
CREATE INDEX IF NOT EXISTS idx_raw_crawl_payloads_dispensary
    ON raw_crawl_payloads(dispensary_id);

-- "Most recent payloads for one dispensary" (diff/replay queries).
CREATE INDEX IF NOT EXISTS idx_raw_crawl_payloads_dispensary_fetched
    ON raw_crawl_payloads(dispensary_id, fetched_at DESC);

-- Global "most recent payloads" listing.
CREATE INDEX IF NOT EXISTS idx_raw_crawl_payloads_fetched
    ON raw_crawl_payloads(fetched_at DESC);

-- Partial index: only rows still linked to a crawl run, since crawl_run_id
-- becomes NULL after crawl_runs cleanup and those rows are never looked up by run.
CREATE INDEX IF NOT EXISTS idx_raw_crawl_payloads_crawl_run
    ON raw_crawl_payloads(crawl_run_id)
    WHERE crawl_run_id IS NOT NULL;
-- Comments: schema documentation surfaced via \d+ / information_schema.
COMMENT ON TABLE raw_crawl_payloads IS 'Metadata for raw GraphQL payloads stored on filesystem. Per TASK_WORKFLOW_2024-12-10.md: Full payloads enable historical diffs and replay.';
COMMENT ON COLUMN raw_crawl_payloads.storage_path IS 'Path to gzipped JSON file, e.g. /storage/payloads/2024/12/10/store_123_1702234567.json.gz';
COMMENT ON COLUMN raw_crawl_payloads.size_bytes IS 'Compressed file size in bytes';
COMMENT ON COLUMN raw_crawl_payloads.size_bytes_raw IS 'Uncompressed payload size in bytes';