Files
cannaiq/backend/migrations/028_snapshot_architecture.sql
Kelly 199b6a8a23 Remove incorrect migration 029, add snapshot architecture, improve scraper
- Delete migration 029 that was incorrectly creating duplicate dispensaries
- Add migration 028 for snapshot architecture
- Improve downloader with proxy/UA rotation
- Update scraper monitor and tools pages
- Various scraper improvements

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-01 08:52:54 -07:00

241 lines
7.9 KiB
SQL

-- Migration 028: Snapshot Architecture
-- Implements append-only snapshots for full history tracking
-- Following the principle: "Never delete, only append observations"
-- =====================================================
-- LAYER 1: Raw Append-Only Snapshots (NEVER DELETE)
-- =====================================================
-- Product snapshots - one row per product per crawl
CREATE TABLE IF NOT EXISTS product_snapshots (
id SERIAL PRIMARY KEY,
-- Source identification
crawl_id UUID NOT NULL, -- Groups all products from same crawl run
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id),
-- Product identification
external_product_id VARCHAR(255), -- Dutchie/provider product ID
product_slug VARCHAR(500), -- URL slug for matching
-- Product details (as seen at crawl time)
name VARCHAR(500) NOT NULL,
brand VARCHAR(255),
category VARCHAR(100),
subcategory VARCHAR(100),
-- Pricing snapshot
price NUMERIC(10,2),
original_price NUMERIC(10,2),
sale_price NUMERIC(10,2),
discount_type VARCHAR(50),
discount_value VARCHAR(100),
-- Availability snapshot
availability_status VARCHAR(30) NOT NULL DEFAULT 'unknown',
-- 'in_stock', 'out_of_stock', 'limited', 'removed_from_menu', 'unknown'
stock_quantity INTEGER,
-- Potency snapshot
thc_percentage NUMERIC(5,2),
cbd_percentage NUMERIC(5,2),
-- Product attributes
strain_type VARCHAR(100),
weight VARCHAR(100),
variant VARCHAR(255),
-- Rich data
description TEXT,
image_url TEXT,
effects TEXT[],
terpenes TEXT[],
-- Timestamp
captured_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
-- Raw metadata from provider
raw_data JSONB
);
-- Indexes for efficient querying
CREATE INDEX IF NOT EXISTS idx_snapshots_dispensary_time ON product_snapshots(dispensary_id, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_crawl ON product_snapshots(crawl_id);
CREATE INDEX IF NOT EXISTS idx_snapshots_brand ON product_snapshots(brand, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_product_slug ON product_snapshots(product_slug, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_external_id ON product_snapshots(external_product_id, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_availability ON product_snapshots(availability_status, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_category ON product_snapshots(category, captured_at DESC);
-- Brand snapshots - summary of brands seen per crawl
CREATE TABLE IF NOT EXISTS brand_snapshots (
id SERIAL PRIMARY KEY,
crawl_id UUID NOT NULL,
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id),
brand_name VARCHAR(255) NOT NULL,
product_count INTEGER NOT NULL DEFAULT 0,
in_stock_count INTEGER NOT NULL DEFAULT 0,
-- Price range for this brand at this store at this time
min_price NUMERIC(10,2),
max_price NUMERIC(10,2),
avg_price NUMERIC(10,2),
-- Categories this brand has products in
categories TEXT[],
captured_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_brand_snapshots_dispensary ON brand_snapshots(dispensary_id, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_brand_snapshots_brand ON brand_snapshots(brand_name, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_brand_snapshots_crawl ON brand_snapshots(crawl_id);
-- Crawl runs table - metadata about each crawl
CREATE TABLE IF NOT EXISTS crawl_runs (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id),
started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
completed_at TIMESTAMPTZ,
status VARCHAR(20) NOT NULL DEFAULT 'running', -- 'running', 'completed', 'failed'
-- Results
products_found INTEGER DEFAULT 0,
brands_found INTEGER DEFAULT 0,
categories_found INTEGER DEFAULT 0,
-- Errors if any
error_message TEXT,
-- Provider info
provider VARCHAR(50),
menu_url TEXT
);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary ON crawl_runs(dispensary_id, started_at DESC);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_status ON crawl_runs(status);
-- =====================================================
-- LAYER 2: Summary/Rollup Tables (can be recalculated)
-- =====================================================
-- Daily brand summary per store
CREATE TABLE IF NOT EXISTS brand_store_day_summary (
id SERIAL PRIMARY KEY,
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id),
brand_name VARCHAR(255) NOT NULL,
summary_date DATE NOT NULL,
-- Presence
first_seen_at TIMESTAMPTZ,
last_seen_at TIMESTAMPTZ,
crawl_count INTEGER DEFAULT 0, -- how many times we saw this brand today
-- Product counts
total_skus INTEGER DEFAULT 0,
in_stock_skus INTEGER DEFAULT 0,
out_of_stock_events INTEGER DEFAULT 0,
-- Price stats
min_price NUMERIC(10,2),
max_price NUMERIC(10,2),
avg_price NUMERIC(10,2),
-- Categories
categories TEXT[],
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW(),
UNIQUE(dispensary_id, brand_name, summary_date)
);
CREATE INDEX IF NOT EXISTS idx_brand_store_day_dispensary ON brand_store_day_summary(dispensary_id, summary_date DESC);
CREATE INDEX IF NOT EXISTS idx_brand_store_day_brand ON brand_store_day_summary(brand_name, summary_date DESC);
-- Product SKU daily summary
CREATE TABLE IF NOT EXISTS product_sku_day_summary (
id SERIAL PRIMARY KEY,
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id),
product_slug VARCHAR(500) NOT NULL,
summary_date DATE NOT NULL,
-- Latest values
name VARCHAR(500),
brand VARCHAR(255),
category VARCHAR(100),
-- Price tracking
opening_price NUMERIC(10,2), -- first price of day
closing_price NUMERIC(10,2), -- last price of day
min_price NUMERIC(10,2),
max_price NUMERIC(10,2),
price_changes INTEGER DEFAULT 0,
-- Availability
times_in_stock INTEGER DEFAULT 0,
times_out_of_stock INTEGER DEFAULT 0,
first_seen_at TIMESTAMPTZ,
last_seen_at TIMESTAMPTZ,
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW(),
UNIQUE(dispensary_id, product_slug, summary_date)
);
CREATE INDEX IF NOT EXISTS idx_sku_day_dispensary ON product_sku_day_summary(dispensary_id, summary_date DESC);
CREATE INDEX IF NOT EXISTS idx_sku_day_slug ON product_sku_day_summary(product_slug, summary_date DESC);
-- =====================================================
-- VIEWS for common queries
-- =====================================================
-- Current products view (latest snapshot per product)
CREATE OR REPLACE VIEW current_products AS
SELECT DISTINCT ON (ps.dispensary_id, ps.product_slug)
ps.*,
d.name AS dispensary_name,
COALESCE(d.dba_name, d.name) AS store_name
FROM product_snapshots ps
JOIN dispensaries d ON d.id = ps.dispensary_id
ORDER BY ps.dispensary_id, ps.product_slug, ps.captured_at DESC;
-- Current brands per store view
CREATE OR REPLACE VIEW current_brands AS
SELECT DISTINCT ON (bs.dispensary_id, bs.brand_name)
bs.*,
d.name AS dispensary_name,
COALESCE(d.dba_name, d.name) AS store_name
FROM brand_snapshots bs
JOIN dispensaries d ON d.id = bs.dispensary_id
WHERE bs.captured_at >= NOW() - INTERVAL '7 days'
ORDER BY bs.dispensary_id, bs.brand_name, bs.captured_at DESC;
-- Brand coverage across stores
CREATE OR REPLACE VIEW brand_store_coverage AS
SELECT
brand_name,
COUNT(DISTINCT dispensary_id) AS store_count,
SUM(product_count) AS total_skus,
MIN(min_price) AS market_min_price,
MAX(max_price) AS market_max_price,
AVG(avg_price) AS market_avg_price,
MAX(captured_at) AS last_seen_at
FROM brand_snapshots
WHERE captured_at >= NOW() - INTERVAL '7 days'
GROUP BY brand_name;
-- Grant permissions
GRANT SELECT, INSERT ON product_snapshots TO scraper;
GRANT SELECT, INSERT ON brand_snapshots TO scraper;
GRANT SELECT, INSERT, UPDATE ON crawl_runs TO scraper;
GRANT SELECT, INSERT, UPDATE ON brand_store_day_summary TO scraper;
GRANT SELECT, INSERT, UPDATE ON product_sku_day_summary TO scraper;
GRANT SELECT ON current_products TO scraper;
GRANT SELECT ON current_brands TO scraper;
GRANT SELECT ON brand_store_coverage TO scraper;