-- Migration 028: Snapshot Architecture -- Implements append-only snapshots for full history tracking -- Following the principle: "Never delete, only append observations" -- ===================================================== -- LAYER 1: Raw Append-Only Snapshots (NEVER DELETE) -- ===================================================== -- Product snapshots - one row per product per crawl CREATE TABLE IF NOT EXISTS product_snapshots ( id SERIAL PRIMARY KEY, -- Source identification crawl_id UUID NOT NULL, -- Groups all products from same crawl run dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id), -- Product identification external_product_id VARCHAR(255), -- Dutchie/provider product ID product_slug VARCHAR(500), -- URL slug for matching -- Product details (as seen at crawl time) name VARCHAR(500) NOT NULL, brand VARCHAR(255), category VARCHAR(100), subcategory VARCHAR(100), -- Pricing snapshot price NUMERIC(10,2), original_price NUMERIC(10,2), sale_price NUMERIC(10,2), discount_type VARCHAR(50), discount_value VARCHAR(100), -- Availability snapshot availability_status VARCHAR(30) NOT NULL DEFAULT 'unknown', -- 'in_stock', 'out_of_stock', 'limited', 'removed_from_menu', 'unknown' stock_quantity INTEGER, -- Potency snapshot thc_percentage NUMERIC(5,2), cbd_percentage NUMERIC(5,2), -- Product attributes strain_type VARCHAR(100), weight VARCHAR(100), variant VARCHAR(255), -- Rich data description TEXT, image_url TEXT, effects TEXT[], terpenes TEXT[], -- Timestamp captured_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), -- Raw metadata from provider raw_data JSONB ); -- Indexes for efficient querying CREATE INDEX IF NOT EXISTS idx_snapshots_dispensary_time ON product_snapshots(dispensary_id, captured_at DESC); CREATE INDEX IF NOT EXISTS idx_snapshots_crawl ON product_snapshots(crawl_id); CREATE INDEX IF NOT EXISTS idx_snapshots_brand ON product_snapshots(brand, captured_at DESC); CREATE INDEX IF NOT EXISTS idx_snapshots_product_slug ON product_snapshots(product_slug, captured_at DESC); CREATE INDEX IF NOT EXISTS idx_snapshots_external_id ON product_snapshots(external_product_id, captured_at DESC); CREATE INDEX IF NOT EXISTS idx_snapshots_availability ON product_snapshots(availability_status, captured_at DESC); CREATE INDEX IF NOT EXISTS idx_snapshots_category ON product_snapshots(category, captured_at DESC); -- Brand snapshots - summary of brands seen per crawl CREATE TABLE IF NOT EXISTS brand_snapshots ( id SERIAL PRIMARY KEY, crawl_id UUID NOT NULL, dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id), brand_name VARCHAR(255) NOT NULL, product_count INTEGER NOT NULL DEFAULT 0, in_stock_count INTEGER NOT NULL DEFAULT 0, -- Price range for this brand at this store at this time min_price NUMERIC(10,2), max_price NUMERIC(10,2), avg_price NUMERIC(10,2), -- Categories this brand has products in categories TEXT[], captured_at TIMESTAMPTZ NOT NULL DEFAULT NOW() ); CREATE INDEX IF NOT EXISTS idx_brand_snapshots_dispensary ON brand_snapshots(dispensary_id, captured_at DESC); CREATE INDEX IF NOT EXISTS idx_brand_snapshots_brand ON brand_snapshots(brand_name, captured_at DESC); CREATE INDEX IF NOT EXISTS idx_brand_snapshots_crawl ON brand_snapshots(crawl_id); -- Crawl runs table - metadata about each crawl CREATE TABLE IF NOT EXISTS crawl_runs ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id), started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), completed_at TIMESTAMPTZ, status VARCHAR(20) NOT NULL DEFAULT 'running', -- 'running', 'completed', 'failed' -- Results products_found INTEGER DEFAULT 0, brands_found INTEGER DEFAULT 0, categories_found INTEGER DEFAULT 0, -- Errors if any error_message TEXT, -- Provider info provider VARCHAR(50), menu_url TEXT ); CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary ON crawl_runs(dispensary_id, started_at DESC); CREATE INDEX IF NOT EXISTS idx_crawl_runs_status ON crawl_runs(status); -- ===================================================== -- LAYER 2: Summary/Rollup Tables (can be recalculated) -- ===================================================== -- Daily brand summary per store CREATE TABLE IF NOT EXISTS brand_store_day_summary ( id SERIAL PRIMARY KEY, dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id), brand_name VARCHAR(255) NOT NULL, summary_date DATE NOT NULL, -- Presence first_seen_at TIMESTAMPTZ, last_seen_at TIMESTAMPTZ, crawl_count INTEGER DEFAULT 0, -- how many times we saw this brand today -- Product counts total_skus INTEGER DEFAULT 0, in_stock_skus INTEGER DEFAULT 0, out_of_stock_events INTEGER DEFAULT 0, -- Price stats min_price NUMERIC(10,2), max_price NUMERIC(10,2), avg_price NUMERIC(10,2), -- Categories categories TEXT[], created_at TIMESTAMPTZ DEFAULT NOW(), updated_at TIMESTAMPTZ DEFAULT NOW(), UNIQUE(dispensary_id, brand_name, summary_date) ); CREATE INDEX IF NOT EXISTS idx_brand_store_day_dispensary ON brand_store_day_summary(dispensary_id, summary_date DESC); CREATE INDEX IF NOT EXISTS idx_brand_store_day_brand ON brand_store_day_summary(brand_name, summary_date DESC); -- Product SKU daily summary CREATE TABLE IF NOT EXISTS product_sku_day_summary ( id SERIAL PRIMARY KEY, dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id), product_slug VARCHAR(500) NOT NULL, summary_date DATE NOT NULL, -- Latest values name VARCHAR(500), brand VARCHAR(255), category VARCHAR(100), -- Price tracking opening_price NUMERIC(10,2), -- first price of day closing_price NUMERIC(10,2), -- last price of day min_price NUMERIC(10,2), max_price NUMERIC(10,2), price_changes INTEGER DEFAULT 0, -- Availability times_in_stock INTEGER DEFAULT 0, times_out_of_stock INTEGER DEFAULT 0, first_seen_at TIMESTAMPTZ, last_seen_at TIMESTAMPTZ, created_at TIMESTAMPTZ DEFAULT NOW(), updated_at TIMESTAMPTZ DEFAULT NOW(), UNIQUE(dispensary_id, product_slug, summary_date) ); CREATE INDEX IF NOT EXISTS idx_sku_day_dispensary ON product_sku_day_summary(dispensary_id, summary_date DESC); CREATE INDEX IF NOT EXISTS idx_sku_day_slug ON product_sku_day_summary(product_slug, summary_date DESC); -- ===================================================== -- VIEWS for common queries -- ===================================================== -- Current products view (latest snapshot per product) CREATE OR REPLACE VIEW current_products AS SELECT DISTINCT ON (ps.dispensary_id, ps.product_slug) ps.*, d.name AS dispensary_name, COALESCE(d.dba_name, d.name) AS store_name FROM product_snapshots ps JOIN dispensaries d ON d.id = ps.dispensary_id ORDER BY ps.dispensary_id, ps.product_slug, ps.captured_at DESC; -- Current brands per store view CREATE OR REPLACE VIEW current_brands AS SELECT DISTINCT ON (bs.dispensary_id, bs.brand_name) bs.*, d.name AS dispensary_name, COALESCE(d.dba_name, d.name) AS store_name FROM brand_snapshots bs JOIN dispensaries d ON d.id = bs.dispensary_id WHERE bs.captured_at >= NOW() - INTERVAL '7 days' ORDER BY bs.dispensary_id, bs.brand_name, bs.captured_at DESC; -- Brand coverage across stores CREATE OR REPLACE VIEW brand_store_coverage AS SELECT brand_name, COUNT(DISTINCT dispensary_id) AS store_count, SUM(product_count) AS total_skus, MIN(min_price) AS market_min_price, MAX(max_price) AS market_max_price, AVG(avg_price) AS market_avg_price, MAX(captured_at) AS last_seen_at FROM brand_snapshots WHERE captured_at >= NOW() - INTERVAL '7 days' GROUP BY brand_name; -- Grant permissions GRANT SELECT, INSERT ON product_snapshots TO scraper; GRANT SELECT, INSERT ON brand_snapshots TO scraper; GRANT SELECT, INSERT, UPDATE ON crawl_runs TO scraper; GRANT SELECT, INSERT, UPDATE ON brand_store_day_summary TO scraper; GRANT SELECT, INSERT, UPDATE ON product_sku_day_summary TO scraper; GRANT SELECT ON current_products TO scraper; GRANT SELECT ON current_brands TO scraper; GRANT SELECT ON brand_store_coverage TO scraper;