- Delete migration 029 that was incorrectly creating duplicate dispensaries - Add migration 028 for snapshot architecture - Improve downloader with proxy/UA rotation - Update scraper monitor and tools pages - Various scraper improvements 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
241 lines
7.9 KiB
SQL
241 lines
7.9 KiB
SQL
-- Migration 028: Snapshot Architecture
|
|
-- Implements append-only snapshots for full history tracking
|
|
-- Following the principle: "Never delete, only append observations"
|
|
|
|
-- =====================================================
|
|
-- LAYER 1: Raw Append-Only Snapshots (NEVER DELETE)
|
|
-- =====================================================
|
|
|
|
-- Product snapshots - one row per product per crawl
|
|
CREATE TABLE IF NOT EXISTS product_snapshots (
|
|
id SERIAL PRIMARY KEY,
|
|
|
|
-- Source identification
|
|
crawl_id UUID NOT NULL, -- Groups all products from same crawl run
|
|
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id),
|
|
|
|
-- Product identification
|
|
external_product_id VARCHAR(255), -- Dutchie/provider product ID
|
|
product_slug VARCHAR(500), -- URL slug for matching
|
|
|
|
-- Product details (as seen at crawl time)
|
|
name VARCHAR(500) NOT NULL,
|
|
brand VARCHAR(255),
|
|
category VARCHAR(100),
|
|
subcategory VARCHAR(100),
|
|
|
|
-- Pricing snapshot
|
|
price NUMERIC(10,2),
|
|
original_price NUMERIC(10,2),
|
|
sale_price NUMERIC(10,2),
|
|
discount_type VARCHAR(50),
|
|
discount_value VARCHAR(100),
|
|
|
|
-- Availability snapshot
|
|
availability_status VARCHAR(30) NOT NULL DEFAULT 'unknown',
|
|
-- 'in_stock', 'out_of_stock', 'limited', 'removed_from_menu', 'unknown'
|
|
stock_quantity INTEGER,
|
|
|
|
-- Potency snapshot
|
|
thc_percentage NUMERIC(5,2),
|
|
cbd_percentage NUMERIC(5,2),
|
|
|
|
-- Product attributes
|
|
strain_type VARCHAR(100),
|
|
weight VARCHAR(100),
|
|
variant VARCHAR(255),
|
|
|
|
-- Rich data
|
|
description TEXT,
|
|
image_url TEXT,
|
|
effects TEXT[],
|
|
terpenes TEXT[],
|
|
|
|
-- Timestamp
|
|
captured_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
|
|
|
-- Raw metadata from provider
|
|
raw_data JSONB
|
|
);
|
|
|
|
-- Indexes for efficient querying
|
|
CREATE INDEX IF NOT EXISTS idx_snapshots_dispensary_time ON product_snapshots(dispensary_id, captured_at DESC);
|
|
CREATE INDEX IF NOT EXISTS idx_snapshots_crawl ON product_snapshots(crawl_id);
|
|
CREATE INDEX IF NOT EXISTS idx_snapshots_brand ON product_snapshots(brand, captured_at DESC);
|
|
CREATE INDEX IF NOT EXISTS idx_snapshots_product_slug ON product_snapshots(product_slug, captured_at DESC);
|
|
CREATE INDEX IF NOT EXISTS idx_snapshots_external_id ON product_snapshots(external_product_id, captured_at DESC);
|
|
CREATE INDEX IF NOT EXISTS idx_snapshots_availability ON product_snapshots(availability_status, captured_at DESC);
|
|
CREATE INDEX IF NOT EXISTS idx_snapshots_category ON product_snapshots(category, captured_at DESC);
|
|
|
|
-- Brand snapshots - summary of brands seen per crawl
|
|
CREATE TABLE IF NOT EXISTS brand_snapshots (
|
|
id SERIAL PRIMARY KEY,
|
|
crawl_id UUID NOT NULL,
|
|
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id),
|
|
|
|
brand_name VARCHAR(255) NOT NULL,
|
|
product_count INTEGER NOT NULL DEFAULT 0,
|
|
in_stock_count INTEGER NOT NULL DEFAULT 0,
|
|
|
|
-- Price range for this brand at this store at this time
|
|
min_price NUMERIC(10,2),
|
|
max_price NUMERIC(10,2),
|
|
avg_price NUMERIC(10,2),
|
|
|
|
-- Categories this brand has products in
|
|
categories TEXT[],
|
|
|
|
captured_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
|
);
|
|
|
|
CREATE INDEX IF NOT EXISTS idx_brand_snapshots_dispensary ON brand_snapshots(dispensary_id, captured_at DESC);
|
|
CREATE INDEX IF NOT EXISTS idx_brand_snapshots_brand ON brand_snapshots(brand_name, captured_at DESC);
|
|
CREATE INDEX IF NOT EXISTS idx_brand_snapshots_crawl ON brand_snapshots(crawl_id);
|
|
|
|
-- Crawl runs table - metadata about each crawl
|
|
CREATE TABLE IF NOT EXISTS crawl_runs (
|
|
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
|
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id),
|
|
|
|
started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
|
completed_at TIMESTAMPTZ,
|
|
|
|
status VARCHAR(20) NOT NULL DEFAULT 'running', -- 'running', 'completed', 'failed'
|
|
|
|
-- Results
|
|
products_found INTEGER DEFAULT 0,
|
|
brands_found INTEGER DEFAULT 0,
|
|
categories_found INTEGER DEFAULT 0,
|
|
|
|
-- Errors if any
|
|
error_message TEXT,
|
|
|
|
-- Provider info
|
|
provider VARCHAR(50),
|
|
menu_url TEXT
|
|
);
|
|
|
|
CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary ON crawl_runs(dispensary_id, started_at DESC);
|
|
CREATE INDEX IF NOT EXISTS idx_crawl_runs_status ON crawl_runs(status);
|
|
|
|
-- =====================================================
|
|
-- LAYER 2: Summary/Rollup Tables (can be recalculated)
|
|
-- =====================================================
|
|
|
|
-- Daily brand summary per store
|
|
CREATE TABLE IF NOT EXISTS brand_store_day_summary (
|
|
id SERIAL PRIMARY KEY,
|
|
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id),
|
|
brand_name VARCHAR(255) NOT NULL,
|
|
summary_date DATE NOT NULL,
|
|
|
|
-- Presence
|
|
first_seen_at TIMESTAMPTZ,
|
|
last_seen_at TIMESTAMPTZ,
|
|
crawl_count INTEGER DEFAULT 0, -- how many times we saw this brand today
|
|
|
|
-- Product counts
|
|
total_skus INTEGER DEFAULT 0,
|
|
in_stock_skus INTEGER DEFAULT 0,
|
|
out_of_stock_events INTEGER DEFAULT 0,
|
|
|
|
-- Price stats
|
|
min_price NUMERIC(10,2),
|
|
max_price NUMERIC(10,2),
|
|
avg_price NUMERIC(10,2),
|
|
|
|
-- Categories
|
|
categories TEXT[],
|
|
|
|
created_at TIMESTAMPTZ DEFAULT NOW(),
|
|
updated_at TIMESTAMPTZ DEFAULT NOW(),
|
|
|
|
UNIQUE(dispensary_id, brand_name, summary_date)
|
|
);
|
|
|
|
CREATE INDEX IF NOT EXISTS idx_brand_store_day_dispensary ON brand_store_day_summary(dispensary_id, summary_date DESC);
|
|
CREATE INDEX IF NOT EXISTS idx_brand_store_day_brand ON brand_store_day_summary(brand_name, summary_date DESC);
|
|
|
|
-- Product SKU daily summary
|
|
CREATE TABLE IF NOT EXISTS product_sku_day_summary (
|
|
id SERIAL PRIMARY KEY,
|
|
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id),
|
|
product_slug VARCHAR(500) NOT NULL,
|
|
summary_date DATE NOT NULL,
|
|
|
|
-- Latest values
|
|
name VARCHAR(500),
|
|
brand VARCHAR(255),
|
|
category VARCHAR(100),
|
|
|
|
-- Price tracking
|
|
opening_price NUMERIC(10,2), -- first price of day
|
|
closing_price NUMERIC(10,2), -- last price of day
|
|
min_price NUMERIC(10,2),
|
|
max_price NUMERIC(10,2),
|
|
price_changes INTEGER DEFAULT 0,
|
|
|
|
-- Availability
|
|
times_in_stock INTEGER DEFAULT 0,
|
|
times_out_of_stock INTEGER DEFAULT 0,
|
|
first_seen_at TIMESTAMPTZ,
|
|
last_seen_at TIMESTAMPTZ,
|
|
|
|
created_at TIMESTAMPTZ DEFAULT NOW(),
|
|
updated_at TIMESTAMPTZ DEFAULT NOW(),
|
|
|
|
UNIQUE(dispensary_id, product_slug, summary_date)
|
|
);
|
|
|
|
CREATE INDEX IF NOT EXISTS idx_sku_day_dispensary ON product_sku_day_summary(dispensary_id, summary_date DESC);
|
|
CREATE INDEX IF NOT EXISTS idx_sku_day_slug ON product_sku_day_summary(product_slug, summary_date DESC);
|
|
|
|
-- =====================================================
|
|
-- VIEWS for common queries
|
|
-- =====================================================
|
|
|
|
-- Current products view (latest snapshot per product)
|
|
CREATE OR REPLACE VIEW current_products AS
|
|
SELECT DISTINCT ON (ps.dispensary_id, ps.product_slug)
|
|
ps.*,
|
|
d.name AS dispensary_name,
|
|
COALESCE(d.dba_name, d.name) AS store_name
|
|
FROM product_snapshots ps
|
|
JOIN dispensaries d ON d.id = ps.dispensary_id
|
|
ORDER BY ps.dispensary_id, ps.product_slug, ps.captured_at DESC;
|
|
|
|
-- Current brands per store view
|
|
CREATE OR REPLACE VIEW current_brands AS
|
|
SELECT DISTINCT ON (bs.dispensary_id, bs.brand_name)
|
|
bs.*,
|
|
d.name AS dispensary_name,
|
|
COALESCE(d.dba_name, d.name) AS store_name
|
|
FROM brand_snapshots bs
|
|
JOIN dispensaries d ON d.id = bs.dispensary_id
|
|
WHERE bs.captured_at >= NOW() - INTERVAL '7 days'
|
|
ORDER BY bs.dispensary_id, bs.brand_name, bs.captured_at DESC;
|
|
|
|
-- Brand coverage across stores
|
|
CREATE OR REPLACE VIEW brand_store_coverage AS
|
|
SELECT
|
|
brand_name,
|
|
COUNT(DISTINCT dispensary_id) AS store_count,
|
|
SUM(product_count) AS total_skus,
|
|
MIN(min_price) AS market_min_price,
|
|
MAX(max_price) AS market_max_price,
|
|
AVG(avg_price) AS market_avg_price,
|
|
MAX(captured_at) AS last_seen_at
|
|
FROM brand_snapshots
|
|
WHERE captured_at >= NOW() - INTERVAL '7 days'
|
|
GROUP BY brand_name;
|
|
|
|
-- Grant permissions
|
|
GRANT SELECT, INSERT ON product_snapshots TO scraper;
|
|
GRANT SELECT, INSERT ON brand_snapshots TO scraper;
|
|
GRANT SELECT, INSERT, UPDATE ON crawl_runs TO scraper;
|
|
GRANT SELECT, INSERT, UPDATE ON brand_store_day_summary TO scraper;
|
|
GRANT SELECT, INSERT, UPDATE ON product_sku_day_summary TO scraper;
|
|
GRANT SELECT ON current_products TO scraper;
|
|
GRANT SELECT ON current_brands TO scraper;
|
|
GRANT SELECT ON brand_store_coverage TO scraper;
|