Remove incorrect migration 029, add snapshot architecture, improve scraper
- Delete migration 029 that was incorrectly creating duplicate dispensaries - Add migration 028 for snapshot architecture - Improve downloader with proxy/UA rotation - Update scraper monitor and tools pages - Various scraper improvements 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
240
backend/migrations/028_snapshot_architecture.sql
Normal file
240
backend/migrations/028_snapshot_architecture.sql
Normal file
@@ -0,0 +1,240 @@
|
||||
-- Migration 028: Snapshot Architecture
|
||||
-- Implements append-only snapshots for full history tracking
|
||||
-- Following the principle: "Never delete, only append observations"
|
||||
|
||||
-- =====================================================
|
||||
-- LAYER 1: Raw Append-Only Snapshots (NEVER DELETE)
|
||||
-- =====================================================
|
||||
|
||||
-- Product snapshots - one row per product per crawl
|
||||
CREATE TABLE IF NOT EXISTS product_snapshots (
|
||||
id SERIAL PRIMARY KEY,
|
||||
|
||||
-- Source identification
|
||||
crawl_id UUID NOT NULL, -- Groups all products from same crawl run
|
||||
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id),
|
||||
|
||||
-- Product identification
|
||||
external_product_id VARCHAR(255), -- Dutchie/provider product ID
|
||||
product_slug VARCHAR(500), -- URL slug for matching
|
||||
|
||||
-- Product details (as seen at crawl time)
|
||||
name VARCHAR(500) NOT NULL,
|
||||
brand VARCHAR(255),
|
||||
category VARCHAR(100),
|
||||
subcategory VARCHAR(100),
|
||||
|
||||
-- Pricing snapshot
|
||||
price NUMERIC(10,2),
|
||||
original_price NUMERIC(10,2),
|
||||
sale_price NUMERIC(10,2),
|
||||
discount_type VARCHAR(50),
|
||||
discount_value VARCHAR(100),
|
||||
|
||||
-- Availability snapshot
|
||||
availability_status VARCHAR(30) NOT NULL DEFAULT 'unknown',
|
||||
-- 'in_stock', 'out_of_stock', 'limited', 'removed_from_menu', 'unknown'
|
||||
stock_quantity INTEGER,
|
||||
|
||||
-- Potency snapshot
|
||||
thc_percentage NUMERIC(5,2),
|
||||
cbd_percentage NUMERIC(5,2),
|
||||
|
||||
-- Product attributes
|
||||
strain_type VARCHAR(100),
|
||||
weight VARCHAR(100),
|
||||
variant VARCHAR(255),
|
||||
|
||||
-- Rich data
|
||||
description TEXT,
|
||||
image_url TEXT,
|
||||
effects TEXT[],
|
||||
terpenes TEXT[],
|
||||
|
||||
-- Timestamp
|
||||
captured_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
|
||||
-- Raw metadata from provider
|
||||
raw_data JSONB
|
||||
);
|
||||
|
||||
-- Indexes for efficient querying
|
||||
CREATE INDEX IF NOT EXISTS idx_snapshots_dispensary_time ON product_snapshots(dispensary_id, captured_at DESC);
|
||||
CREATE INDEX IF NOT EXISTS idx_snapshots_crawl ON product_snapshots(crawl_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_snapshots_brand ON product_snapshots(brand, captured_at DESC);
|
||||
CREATE INDEX IF NOT EXISTS idx_snapshots_product_slug ON product_snapshots(product_slug, captured_at DESC);
|
||||
CREATE INDEX IF NOT EXISTS idx_snapshots_external_id ON product_snapshots(external_product_id, captured_at DESC);
|
||||
CREATE INDEX IF NOT EXISTS idx_snapshots_availability ON product_snapshots(availability_status, captured_at DESC);
|
||||
CREATE INDEX IF NOT EXISTS idx_snapshots_category ON product_snapshots(category, captured_at DESC);
|
||||
|
||||
-- Brand snapshots - summary of brands seen per crawl
|
||||
CREATE TABLE IF NOT EXISTS brand_snapshots (
|
||||
id SERIAL PRIMARY KEY,
|
||||
crawl_id UUID NOT NULL,
|
||||
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id),
|
||||
|
||||
brand_name VARCHAR(255) NOT NULL,
|
||||
product_count INTEGER NOT NULL DEFAULT 0,
|
||||
in_stock_count INTEGER NOT NULL DEFAULT 0,
|
||||
|
||||
-- Price range for this brand at this store at this time
|
||||
min_price NUMERIC(10,2),
|
||||
max_price NUMERIC(10,2),
|
||||
avg_price NUMERIC(10,2),
|
||||
|
||||
-- Categories this brand has products in
|
||||
categories TEXT[],
|
||||
|
||||
captured_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_brand_snapshots_dispensary ON brand_snapshots(dispensary_id, captured_at DESC);
|
||||
CREATE INDEX IF NOT EXISTS idx_brand_snapshots_brand ON brand_snapshots(brand_name, captured_at DESC);
|
||||
CREATE INDEX IF NOT EXISTS idx_brand_snapshots_crawl ON brand_snapshots(crawl_id);
|
||||
|
||||
-- Crawl runs table - metadata about each crawl
|
||||
CREATE TABLE IF NOT EXISTS crawl_runs (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id),
|
||||
|
||||
started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
completed_at TIMESTAMPTZ,
|
||||
|
||||
status VARCHAR(20) NOT NULL DEFAULT 'running', -- 'running', 'completed', 'failed'
|
||||
|
||||
-- Results
|
||||
products_found INTEGER DEFAULT 0,
|
||||
brands_found INTEGER DEFAULT 0,
|
||||
categories_found INTEGER DEFAULT 0,
|
||||
|
||||
-- Errors if any
|
||||
error_message TEXT,
|
||||
|
||||
-- Provider info
|
||||
provider VARCHAR(50),
|
||||
menu_url TEXT
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary ON crawl_runs(dispensary_id, started_at DESC);
|
||||
CREATE INDEX IF NOT EXISTS idx_crawl_runs_status ON crawl_runs(status);
|
||||
|
||||
-- =====================================================
|
||||
-- LAYER 2: Summary/Rollup Tables (can be recalculated)
|
||||
-- =====================================================
|
||||
|
||||
-- Daily brand summary per store
|
||||
CREATE TABLE IF NOT EXISTS brand_store_day_summary (
|
||||
id SERIAL PRIMARY KEY,
|
||||
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id),
|
||||
brand_name VARCHAR(255) NOT NULL,
|
||||
summary_date DATE NOT NULL,
|
||||
|
||||
-- Presence
|
||||
first_seen_at TIMESTAMPTZ,
|
||||
last_seen_at TIMESTAMPTZ,
|
||||
crawl_count INTEGER DEFAULT 0, -- how many times we saw this brand today
|
||||
|
||||
-- Product counts
|
||||
total_skus INTEGER DEFAULT 0,
|
||||
in_stock_skus INTEGER DEFAULT 0,
|
||||
out_of_stock_events INTEGER DEFAULT 0,
|
||||
|
||||
-- Price stats
|
||||
min_price NUMERIC(10,2),
|
||||
max_price NUMERIC(10,2),
|
||||
avg_price NUMERIC(10,2),
|
||||
|
||||
-- Categories
|
||||
categories TEXT[],
|
||||
|
||||
created_at TIMESTAMPTZ DEFAULT NOW(),
|
||||
updated_at TIMESTAMPTZ DEFAULT NOW(),
|
||||
|
||||
UNIQUE(dispensary_id, brand_name, summary_date)
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_brand_store_day_dispensary ON brand_store_day_summary(dispensary_id, summary_date DESC);
|
||||
CREATE INDEX IF NOT EXISTS idx_brand_store_day_brand ON brand_store_day_summary(brand_name, summary_date DESC);
|
||||
|
||||
-- Product SKU daily summary
|
||||
CREATE TABLE IF NOT EXISTS product_sku_day_summary (
|
||||
id SERIAL PRIMARY KEY,
|
||||
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id),
|
||||
product_slug VARCHAR(500) NOT NULL,
|
||||
summary_date DATE NOT NULL,
|
||||
|
||||
-- Latest values
|
||||
name VARCHAR(500),
|
||||
brand VARCHAR(255),
|
||||
category VARCHAR(100),
|
||||
|
||||
-- Price tracking
|
||||
opening_price NUMERIC(10,2), -- first price of day
|
||||
closing_price NUMERIC(10,2), -- last price of day
|
||||
min_price NUMERIC(10,2),
|
||||
max_price NUMERIC(10,2),
|
||||
price_changes INTEGER DEFAULT 0,
|
||||
|
||||
-- Availability
|
||||
times_in_stock INTEGER DEFAULT 0,
|
||||
times_out_of_stock INTEGER DEFAULT 0,
|
||||
first_seen_at TIMESTAMPTZ,
|
||||
last_seen_at TIMESTAMPTZ,
|
||||
|
||||
created_at TIMESTAMPTZ DEFAULT NOW(),
|
||||
updated_at TIMESTAMPTZ DEFAULT NOW(),
|
||||
|
||||
UNIQUE(dispensary_id, product_slug, summary_date)
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_sku_day_dispensary ON product_sku_day_summary(dispensary_id, summary_date DESC);
|
||||
CREATE INDEX IF NOT EXISTS idx_sku_day_slug ON product_sku_day_summary(product_slug, summary_date DESC);
|
||||
|
||||
-- =====================================================
|
||||
-- VIEWS for common queries
|
||||
-- =====================================================
|
||||
|
||||
-- Current products view (latest snapshot per product)
|
||||
CREATE OR REPLACE VIEW current_products AS
|
||||
SELECT DISTINCT ON (ps.dispensary_id, ps.product_slug)
|
||||
ps.*,
|
||||
d.name AS dispensary_name,
|
||||
COALESCE(d.dba_name, d.name) AS store_name
|
||||
FROM product_snapshots ps
|
||||
JOIN dispensaries d ON d.id = ps.dispensary_id
|
||||
ORDER BY ps.dispensary_id, ps.product_slug, ps.captured_at DESC;
|
||||
|
||||
-- Current brands per store view
|
||||
CREATE OR REPLACE VIEW current_brands AS
|
||||
SELECT DISTINCT ON (bs.dispensary_id, bs.brand_name)
|
||||
bs.*,
|
||||
d.name AS dispensary_name,
|
||||
COALESCE(d.dba_name, d.name) AS store_name
|
||||
FROM brand_snapshots bs
|
||||
JOIN dispensaries d ON d.id = bs.dispensary_id
|
||||
WHERE bs.captured_at >= NOW() - INTERVAL '7 days'
|
||||
ORDER BY bs.dispensary_id, bs.brand_name, bs.captured_at DESC;
|
||||
|
||||
-- Brand coverage across stores
|
||||
CREATE OR REPLACE VIEW brand_store_coverage AS
|
||||
SELECT
|
||||
brand_name,
|
||||
COUNT(DISTINCT dispensary_id) AS store_count,
|
||||
SUM(product_count) AS total_skus,
|
||||
MIN(min_price) AS market_min_price,
|
||||
MAX(max_price) AS market_max_price,
|
||||
AVG(avg_price) AS market_avg_price,
|
||||
MAX(captured_at) AS last_seen_at
|
||||
FROM brand_snapshots
|
||||
WHERE captured_at >= NOW() - INTERVAL '7 days'
|
||||
GROUP BY brand_name;
|
||||
|
||||
-- Grant permissions
|
||||
GRANT SELECT, INSERT ON product_snapshots TO scraper;
|
||||
GRANT SELECT, INSERT ON brand_snapshots TO scraper;
|
||||
GRANT SELECT, INSERT, UPDATE ON crawl_runs TO scraper;
|
||||
GRANT SELECT, INSERT, UPDATE ON brand_store_day_summary TO scraper;
|
||||
GRANT SELECT, INSERT, UPDATE ON product_sku_day_summary TO scraper;
|
||||
GRANT SELECT ON current_products TO scraper;
|
||||
GRANT SELECT ON current_brands TO scraper;
|
||||
GRANT SELECT ON brand_store_coverage TO scraper;
|
||||
@@ -1,56 +0,0 @@
|
||||
-- =====================================================
|
||||
-- Link Dutchie Stores to Dispensaries
|
||||
-- =====================================================
|
||||
-- Creates dispensary records for stores with dutchie_url that
|
||||
-- don't yet have a dispensary_id, then links them.
|
||||
|
||||
-- Create dispensaries for unlinked stores with dutchie_url
|
||||
DO $$
|
||||
DECLARE
|
||||
store_rec RECORD;
|
||||
new_slug TEXT;
|
||||
new_disp_id INTEGER;
|
||||
BEGIN
|
||||
FOR store_rec IN
|
||||
SELECT id, name, dutchie_url
|
||||
FROM stores
|
||||
WHERE dutchie_url IS NOT NULL AND dispensary_id IS NULL
|
||||
LOOP
|
||||
-- Extract slug from dutchie_url
|
||||
new_slug := regexp_replace(
|
||||
regexp_replace(store_rec.dutchie_url, '^https://dutchie\.com/(embedded-menu|dispensary)/', ''),
|
||||
'/.*$', ''
|
||||
);
|
||||
|
||||
-- Insert or update dispensary
|
||||
INSERT INTO dispensaries (name, slug, address, city, state, provider_type, menu_url, created_at, updated_at)
|
||||
VALUES (
|
||||
store_rec.name,
|
||||
new_slug,
|
||||
'TBD', -- Address to be filled in later
|
||||
'TBD', -- City to be filled in later
|
||||
'AZ', -- Default state
|
||||
'dutchie',
|
||||
store_rec.dutchie_url,
|
||||
NOW(),
|
||||
NOW()
|
||||
)
|
||||
ON CONFLICT (slug) DO UPDATE SET
|
||||
provider_type = 'dutchie',
|
||||
menu_url = EXCLUDED.menu_url,
|
||||
updated_at = NOW()
|
||||
RETURNING id INTO new_disp_id;
|
||||
|
||||
-- Link store to dispensary
|
||||
UPDATE stores SET dispensary_id = new_disp_id WHERE id = store_rec.id;
|
||||
|
||||
RAISE NOTICE 'Linked store % (%) to dispensary %', store_rec.id, store_rec.name, new_disp_id;
|
||||
END LOOP;
|
||||
END $$;
|
||||
|
||||
-- Report on linked stores
|
||||
SELECT s.id as store_id, s.name as store_name, s.dispensary_id, d.slug as disp_slug
|
||||
FROM stores s
|
||||
JOIN dispensaries d ON d.id = s.dispensary_id
|
||||
WHERE s.dutchie_url IS NOT NULL
|
||||
ORDER BY s.id;
|
||||
Reference in New Issue
Block a user