-- ============================================================================ -- Migration 051: CannaiQ Canonical Schema - Safe Bootstrap -- ============================================================================ -- -- Purpose: Create the canonical CannaiQ schema tables from scratch. -- This migration is FULLY IDEMPOTENT and safe to run multiple times. -- -- SAFETY RULES FOLLOWED: -- 1. ALL tables use CREATE TABLE IF NOT EXISTS -- 2. ALL columns use ALTER TABLE ADD COLUMN IF NOT EXISTS -- 3. ALL indexes use CREATE INDEX IF NOT EXISTS -- 4. NO DROP, DELETE, TRUNCATE, or destructive operations -- 5. NO assumptions about existing data or column existence -- 6. NO dependencies on migrations 041, 043, or 050 -- 7. Compatible with dutchie_menus database as it exists today -- 8. Safe handling of pre-existing states table with missing columns -- -- Tables Created: -- - states (US state reference table) -- - chains (retail chain/group table) -- - crawl_runs (crawl execution records) -- - store_products (current menu state) -- - store_product_snapshots (historical price/stock data) -- -- Columns Added: -- - dispensaries.state_id (FK to states) -- - dispensaries.chain_id (FK to chains) -- -- Run with: -- psql "postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus" \ -- -f migrations/051_cannaiq_canonical_safe_bootstrap.sql -- -- ============================================================================ -- ============================================================================ -- SECTION 1: STATES TABLE -- ============================================================================ -- Reference table for US states where CannaiQ operates. -- This section handles the case where the table exists but is missing columns. -- First, create the table if it doesn't exist (minimal definition) CREATE TABLE IF NOT EXISTS states ( id SERIAL PRIMARY KEY, code VARCHAR(2) NOT NULL, name VARCHAR(100) NOT NULL, created_at TIMESTAMPTZ DEFAULT NOW(), updated_at TIMESTAMPTZ DEFAULT NOW() ); -- Now safely add any missing columns (each is independent, won't fail if exists) ALTER TABLE states ADD COLUMN IF NOT EXISTS timezone TEXT; ALTER TABLE states ADD COLUMN IF NOT EXISTS is_active BOOLEAN DEFAULT TRUE; ALTER TABLE states ADD COLUMN IF NOT EXISTS crawl_enabled BOOLEAN DEFAULT TRUE; -- Add unique constraint on code if not exists DO $$ BEGIN IF NOT EXISTS ( SELECT 1 FROM pg_constraint WHERE conname = 'states_code_key' AND conrelid = 'states'::regclass ) THEN -- Check if there's already a unique constraint with a different name IF NOT EXISTS ( SELECT 1 FROM pg_indexes WHERE tablename = 'states' AND indexdef LIKE '%UNIQUE%code%' ) THEN ALTER TABLE states ADD CONSTRAINT states_code_key UNIQUE (code); END IF; END IF; EXCEPTION WHEN duplicate_object THEN NULL; -- Constraint already exists WHEN OTHERS THEN NULL; -- Handle any other errors gracefully END $$; -- Set default timezone values for existing rows that have NULL UPDATE states SET timezone = 'America/Phoenix' WHERE timezone IS NULL AND code = 'AZ'; UPDATE states SET timezone = 'America/Los_Angeles' WHERE timezone IS NULL AND code IN ('CA', 'NV', 'OR', 'WA'); UPDATE states SET timezone = 'America/Denver' WHERE timezone IS NULL AND code = 'CO'; UPDATE states SET timezone = 'America/New_York' WHERE timezone IS NULL AND code IN ('FL', 'MA', 'MD', 'NJ', 'NY', 'OH', 'PA'); UPDATE states SET timezone = 'America/Chicago' WHERE timezone IS NULL AND code IN ('IL', 'MO', 'OK'); UPDATE states SET timezone = 'America/Detroit' WHERE timezone IS NULL AND code = 'MI'; -- Set default is_active for existing rows UPDATE states SET is_active = TRUE WHERE is_active IS NULL; UPDATE states SET crawl_enabled = TRUE WHERE crawl_enabled IS NULL; -- Insert known states (idempotent - ON CONFLICT DO UPDATE to fill missing values) INSERT INTO states (code, name, timezone, is_active, crawl_enabled) VALUES ('AZ', 'Arizona', 'America/Phoenix', TRUE, TRUE), ('CA', 'California', 'America/Los_Angeles', TRUE, TRUE), ('CO', 'Colorado', 'America/Denver', TRUE, TRUE), ('FL', 'Florida', 'America/New_York', TRUE, TRUE), ('IL', 'Illinois', 'America/Chicago', TRUE, TRUE), ('MA', 'Massachusetts', 'America/New_York', TRUE, TRUE), ('MD', 'Maryland', 'America/New_York', TRUE, TRUE), ('MI', 'Michigan', 'America/Detroit', TRUE, TRUE), ('MO', 'Missouri', 'America/Chicago', TRUE, TRUE), ('NV', 'Nevada', 'America/Los_Angeles', TRUE, TRUE), ('NJ', 'New Jersey', 'America/New_York', TRUE, TRUE), ('NY', 'New York', 'America/New_York', TRUE, TRUE), ('OH', 'Ohio', 'America/New_York', TRUE, TRUE), ('OK', 'Oklahoma', 'America/Chicago', TRUE, TRUE), ('OR', 'Oregon', 'America/Los_Angeles', TRUE, TRUE), ('PA', 'Pennsylvania', 'America/New_York', TRUE, TRUE), ('WA', 'Washington', 'America/Los_Angeles', TRUE, TRUE) ON CONFLICT (code) DO UPDATE SET timezone = COALESCE(states.timezone, EXCLUDED.timezone), is_active = COALESCE(states.is_active, EXCLUDED.is_active), crawl_enabled = COALESCE(states.crawl_enabled, EXCLUDED.crawl_enabled), updated_at = NOW(); CREATE INDEX IF NOT EXISTS idx_states_code ON states(code); CREATE INDEX IF NOT EXISTS idx_states_active ON states(is_active) WHERE is_active = TRUE; COMMENT ON TABLE states IS 'US states where CannaiQ operates. Single source of truth for state configuration.'; -- ============================================================================ -- SECTION 2: CHAINS TABLE -- ============================================================================ -- Retail chains/groups that own multiple dispensary locations. -- Examples: Curaleaf, Trulieve, Harvest, Columbia Care CREATE TABLE IF NOT EXISTS chains ( id SERIAL PRIMARY KEY, name VARCHAR(255) NOT NULL, slug VARCHAR(255) NOT NULL, website_url TEXT, logo_url TEXT, description TEXT, headquarters_city VARCHAR(100), headquarters_state_id INTEGER, founded_year INTEGER, is_active BOOLEAN DEFAULT TRUE, is_public BOOLEAN DEFAULT FALSE, stock_ticker VARCHAR(10), created_at TIMESTAMPTZ DEFAULT NOW(), updated_at TIMESTAMPTZ DEFAULT NOW() ); -- Add unique constraint on slug if not exists DO $$ BEGIN IF NOT EXISTS ( SELECT 1 FROM pg_constraint WHERE conname = 'chains_slug_key' AND conrelid = 'chains'::regclass ) THEN ALTER TABLE chains ADD CONSTRAINT chains_slug_key UNIQUE (slug); END IF; EXCEPTION WHEN duplicate_object THEN NULL; WHEN OTHERS THEN NULL; END $$; -- Add FK to states if not exists DO $$ BEGIN IF NOT EXISTS ( SELECT 1 FROM pg_constraint WHERE conname = 'chains_headquarters_state_id_fkey' ) THEN ALTER TABLE chains ADD CONSTRAINT chains_headquarters_state_id_fkey FOREIGN KEY (headquarters_state_id) REFERENCES states(id) ON DELETE SET NULL; END IF; EXCEPTION WHEN duplicate_object THEN NULL; WHEN OTHERS THEN NULL; END $$; CREATE INDEX IF NOT EXISTS idx_chains_slug ON chains(slug); CREATE INDEX IF NOT EXISTS idx_chains_active ON chains(is_active) WHERE is_active = TRUE; COMMENT ON TABLE chains IS 'Retail chains/groups that own multiple dispensary locations.'; -- ============================================================================ -- SECTION 3: ADD state_id AND chain_id TO DISPENSARIES -- ============================================================================ -- Link existing dispensaries table to states and chains. ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS state_id INTEGER; ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS chain_id INTEGER; -- Add FK constraints if not exist DO $$ BEGIN IF NOT EXISTS ( SELECT 1 FROM pg_constraint WHERE conname = 'dispensaries_state_id_fkey' ) THEN ALTER TABLE dispensaries ADD CONSTRAINT dispensaries_state_id_fkey FOREIGN KEY (state_id) REFERENCES states(id) ON DELETE SET NULL; END IF; EXCEPTION WHEN duplicate_object THEN NULL; WHEN OTHERS THEN NULL; END $$; DO $$ BEGIN IF NOT EXISTS ( SELECT 1 FROM pg_constraint WHERE conname = 'dispensaries_chain_id_fkey' ) THEN ALTER TABLE dispensaries ADD CONSTRAINT dispensaries_chain_id_fkey FOREIGN KEY (chain_id) REFERENCES chains(id) ON DELETE SET NULL; END IF; EXCEPTION WHEN duplicate_object THEN NULL; WHEN OTHERS THEN NULL; END $$; CREATE INDEX IF NOT EXISTS idx_dispensaries_state_id ON dispensaries(state_id); CREATE INDEX IF NOT EXISTS idx_dispensaries_chain_id ON dispensaries(chain_id) WHERE chain_id IS NOT NULL; -- Backfill state_id from existing state column (safe - only updates NULL values) UPDATE dispensaries d SET state_id = s.id FROM states s WHERE d.state = s.code AND d.state_id IS NULL; COMMENT ON COLUMN dispensaries.state_id IS 'FK to states table. Canonical state reference.'; COMMENT ON COLUMN dispensaries.chain_id IS 'FK to chains table. NULL if independent dispensary.'; -- ============================================================================ -- SECTION 4: CRAWL_RUNS TABLE -- ============================================================================ -- One record per crawl execution. Links to snapshots. CREATE TABLE IF NOT EXISTS crawl_runs ( id SERIAL PRIMARY KEY, dispensary_id INTEGER NOT NULL, state_id INTEGER, -- Provider info provider VARCHAR(50) NOT NULL DEFAULT 'dutchie', -- Timing started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), finished_at TIMESTAMPTZ, duration_ms INTEGER, -- Status status VARCHAR(20) NOT NULL DEFAULT 'running', error_code VARCHAR(50), error_message TEXT, http_status INTEGER, -- Results products_found INTEGER DEFAULT 0, products_new INTEGER DEFAULT 0, products_updated INTEGER DEFAULT 0, products_missing INTEGER DEFAULT 0, snapshots_written INTEGER DEFAULT 0, -- Infrastructure worker_id VARCHAR(100), worker_hostname VARCHAR(100), proxy_used TEXT, trigger_type VARCHAR(50) DEFAULT 'scheduled', -- Metadata metadata JSONB DEFAULT '{}', created_at TIMESTAMPTZ DEFAULT NOW() ); -- Add FK constraints if not exist DO $$ BEGIN IF NOT EXISTS ( SELECT 1 FROM pg_constraint WHERE conname = 'crawl_runs_dispensary_id_fkey' ) THEN ALTER TABLE crawl_runs ADD CONSTRAINT crawl_runs_dispensary_id_fkey FOREIGN KEY (dispensary_id) REFERENCES dispensaries(id) ON DELETE CASCADE; END IF; EXCEPTION WHEN duplicate_object THEN NULL; WHEN OTHERS THEN NULL; END $$; DO $$ BEGIN IF NOT EXISTS ( SELECT 1 FROM pg_constraint WHERE conname = 'crawl_runs_state_id_fkey' ) THEN ALTER TABLE crawl_runs ADD CONSTRAINT crawl_runs_state_id_fkey FOREIGN KEY (state_id) REFERENCES states(id) ON DELETE SET NULL; END IF; EXCEPTION WHEN duplicate_object THEN NULL; WHEN OTHERS THEN NULL; END $$; CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary ON crawl_runs(dispensary_id); CREATE INDEX IF NOT EXISTS idx_crawl_runs_state ON crawl_runs(state_id) WHERE state_id IS NOT NULL; CREATE INDEX IF NOT EXISTS idx_crawl_runs_status ON crawl_runs(status); CREATE INDEX IF NOT EXISTS idx_crawl_runs_started ON crawl_runs(started_at DESC); CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary_started ON crawl_runs(dispensary_id, started_at DESC); COMMENT ON TABLE crawl_runs IS 'Each crawl execution. Links to snapshots and traces.'; -- ============================================================================ -- SECTION 5: STORE_PRODUCTS TABLE -- ============================================================================ -- Current state of products on each dispensary menu. -- Provider-agnostic structure for analytics. CREATE TABLE IF NOT EXISTS store_products ( id SERIAL PRIMARY KEY, dispensary_id INTEGER NOT NULL, state_id INTEGER, -- Provider-specific identifiers provider VARCHAR(50) NOT NULL DEFAULT 'dutchie', provider_product_id VARCHAR(100) NOT NULL, provider_brand_id VARCHAR(100), enterprise_product_id VARCHAR(100), -- Raw data from platform (not normalized) name VARCHAR(500) NOT NULL, brand_name VARCHAR(255), category VARCHAR(100), subcategory VARCHAR(100), strain_type VARCHAR(50), description TEXT, -- Pricing (current) price_rec NUMERIC(10,2), price_med NUMERIC(10,2), price_rec_special NUMERIC(10,2), price_med_special NUMERIC(10,2), is_on_special BOOLEAN DEFAULT FALSE, special_name TEXT, discount_percent NUMERIC(5,2), price_unit VARCHAR(20) DEFAULT 'each', -- Inventory is_in_stock BOOLEAN DEFAULT TRUE, stock_quantity INTEGER, stock_status VARCHAR(50) DEFAULT 'in_stock', -- Potency thc_percent NUMERIC(5,2), cbd_percent NUMERIC(5,2), thc_mg NUMERIC(10,2), cbd_mg NUMERIC(10,2), -- Weight/Size weight_value NUMERIC(10,2), weight_unit VARCHAR(20), -- Images image_url TEXT, local_image_path TEXT, thumbnail_url TEXT, -- Flags is_featured BOOLEAN DEFAULT FALSE, medical_only BOOLEAN DEFAULT FALSE, rec_only BOOLEAN DEFAULT FALSE, -- Menu position (for tracking prominence) menu_position INTEGER, -- Timestamps first_seen_at TIMESTAMPTZ DEFAULT NOW(), last_seen_at TIMESTAMPTZ DEFAULT NOW(), last_price_change_at TIMESTAMPTZ, last_stock_change_at TIMESTAMPTZ, created_at TIMESTAMPTZ DEFAULT NOW(), updated_at TIMESTAMPTZ DEFAULT NOW() ); -- Add unique constraint if not exists DO $$ BEGIN IF NOT EXISTS ( SELECT 1 FROM pg_constraint WHERE conname = 'store_products_dispensary_provider_product_key' ) THEN ALTER TABLE store_products ADD CONSTRAINT store_products_dispensary_provider_product_key UNIQUE (dispensary_id, provider, provider_product_id); END IF; EXCEPTION WHEN duplicate_object THEN NULL; WHEN OTHERS THEN NULL; END $$; -- Add FK constraints if not exist DO $$ BEGIN IF NOT EXISTS ( SELECT 1 FROM pg_constraint WHERE conname = 'store_products_dispensary_id_fkey' ) THEN ALTER TABLE store_products ADD CONSTRAINT store_products_dispensary_id_fkey FOREIGN KEY (dispensary_id) REFERENCES dispensaries(id) ON DELETE CASCADE; END IF; EXCEPTION WHEN duplicate_object THEN NULL; WHEN OTHERS THEN NULL; END $$; DO $$ BEGIN IF NOT EXISTS ( SELECT 1 FROM pg_constraint WHERE conname = 'store_products_state_id_fkey' ) THEN ALTER TABLE store_products ADD CONSTRAINT store_products_state_id_fkey FOREIGN KEY (state_id) REFERENCES states(id) ON DELETE SET NULL; END IF; EXCEPTION WHEN duplicate_object THEN NULL; WHEN OTHERS THEN NULL; END $$; CREATE INDEX IF NOT EXISTS idx_store_products_dispensary ON store_products(dispensary_id); CREATE INDEX IF NOT EXISTS idx_store_products_state ON store_products(state_id) WHERE state_id IS NOT NULL; CREATE INDEX IF NOT EXISTS idx_store_products_category ON store_products(category) WHERE category IS NOT NULL; CREATE INDEX IF NOT EXISTS idx_store_products_brand_name ON store_products(brand_name) WHERE brand_name IS NOT NULL; CREATE INDEX IF NOT EXISTS idx_store_products_in_stock ON store_products(dispensary_id, is_in_stock); CREATE INDEX IF NOT EXISTS idx_store_products_special ON store_products(dispensary_id, is_on_special) WHERE is_on_special = TRUE; CREATE INDEX IF NOT EXISTS idx_store_products_last_seen ON store_products(last_seen_at DESC); CREATE INDEX IF NOT EXISTS idx_store_products_provider ON store_products(provider); CREATE INDEX IF NOT EXISTS idx_store_products_enterprise ON store_products(enterprise_product_id) WHERE enterprise_product_id IS NOT NULL; COMMENT ON TABLE store_products IS 'Current state of products on each dispensary menu. Provider-agnostic.'; -- ============================================================================ -- SECTION 6: STORE_PRODUCT_SNAPSHOTS TABLE -- ============================================================================ -- Historical price/stock data. One row per product per crawl. -- CRITICAL: NEVER DELETE from this table. CREATE TABLE IF NOT EXISTS store_product_snapshots ( id SERIAL PRIMARY KEY, dispensary_id INTEGER NOT NULL, store_product_id INTEGER, state_id INTEGER, -- Provider info provider VARCHAR(50) NOT NULL DEFAULT 'dutchie', provider_product_id VARCHAR(100), -- Link to crawl run crawl_run_id INTEGER, -- Capture timestamp captured_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), -- Raw data from platform name VARCHAR(500), brand_name VARCHAR(255), category VARCHAR(100), subcategory VARCHAR(100), -- Pricing at time of capture price_rec NUMERIC(10,2), price_med NUMERIC(10,2), price_rec_special NUMERIC(10,2), price_med_special NUMERIC(10,2), is_on_special BOOLEAN DEFAULT FALSE, discount_percent NUMERIC(5,2), -- Inventory at time of capture is_in_stock BOOLEAN DEFAULT TRUE, stock_quantity INTEGER, stock_status VARCHAR(50) DEFAULT 'in_stock', is_present_in_feed BOOLEAN DEFAULT TRUE, -- Potency at time of capture thc_percent NUMERIC(5,2), cbd_percent NUMERIC(5,2), -- Menu position (for tracking prominence changes) menu_position INTEGER, -- Image URL at time of capture image_url TEXT, -- Full raw response for debugging raw_data JSONB, created_at TIMESTAMPTZ DEFAULT NOW() ); -- Add FK constraints if not exist DO $$ BEGIN IF NOT EXISTS ( SELECT 1 FROM pg_constraint WHERE conname = 'store_product_snapshots_dispensary_id_fkey' ) THEN ALTER TABLE store_product_snapshots ADD CONSTRAINT store_product_snapshots_dispensary_id_fkey FOREIGN KEY (dispensary_id) REFERENCES dispensaries(id) ON DELETE CASCADE; END IF; EXCEPTION WHEN duplicate_object THEN NULL; WHEN OTHERS THEN NULL; END $$; DO $$ BEGIN IF NOT EXISTS ( SELECT 1 FROM pg_constraint WHERE conname = 'store_product_snapshots_store_product_id_fkey' ) THEN ALTER TABLE store_product_snapshots ADD CONSTRAINT store_product_snapshots_store_product_id_fkey FOREIGN KEY (store_product_id) REFERENCES store_products(id) ON DELETE SET NULL; END IF; EXCEPTION WHEN duplicate_object THEN NULL; WHEN OTHERS THEN NULL; END $$; DO $$ BEGIN IF NOT EXISTS ( SELECT 1 FROM pg_constraint WHERE conname = 'store_product_snapshots_state_id_fkey' ) THEN ALTER TABLE store_product_snapshots ADD CONSTRAINT store_product_snapshots_state_id_fkey FOREIGN KEY (state_id) REFERENCES states(id) ON DELETE SET NULL; END IF; EXCEPTION WHEN duplicate_object THEN NULL; WHEN OTHERS THEN NULL; END $$; DO $$ BEGIN IF NOT EXISTS ( SELECT 1 FROM pg_constraint WHERE conname = 'store_product_snapshots_crawl_run_id_fkey' ) THEN ALTER TABLE store_product_snapshots ADD CONSTRAINT store_product_snapshots_crawl_run_id_fkey FOREIGN KEY (crawl_run_id) REFERENCES crawl_runs(id) ON DELETE SET NULL; END IF; EXCEPTION WHEN duplicate_object THEN NULL; WHEN OTHERS THEN NULL; END $$; -- Indexes optimized for analytics queries CREATE INDEX IF NOT EXISTS idx_snapshots_dispensary_captured ON store_product_snapshots(dispensary_id, captured_at DESC); CREATE INDEX IF NOT EXISTS idx_snapshots_state_captured ON store_product_snapshots(state_id, captured_at DESC) WHERE state_id IS NOT NULL; CREATE INDEX IF NOT EXISTS idx_snapshots_product_captured ON store_product_snapshots(store_product_id, captured_at DESC) WHERE store_product_id IS NOT NULL; CREATE INDEX IF NOT EXISTS idx_snapshots_crawl_run ON store_product_snapshots(crawl_run_id) WHERE crawl_run_id IS NOT NULL; CREATE INDEX IF NOT EXISTS idx_snapshots_captured_at ON store_product_snapshots(captured_at DESC); CREATE INDEX IF NOT EXISTS idx_snapshots_brand ON store_product_snapshots(brand_name) WHERE brand_name IS NOT NULL; CREATE INDEX IF NOT EXISTS idx_snapshots_provider_product ON store_product_snapshots(provider_product_id) WHERE provider_product_id IS NOT NULL; COMMENT ON TABLE store_product_snapshots IS 'Historical crawl data. One row per product per crawl. NEVER DELETE.'; -- ============================================================================ -- SECTION 7: VIEWS FOR BACKWARD COMPATIBILITY -- ============================================================================ -- View: Latest snapshot per store product CREATE OR REPLACE VIEW v_latest_store_snapshots AS SELECT DISTINCT ON (dispensary_id, provider_product_id) sps.* FROM store_product_snapshots sps ORDER BY dispensary_id, provider_product_id, captured_at DESC; -- View: Crawl run summary per dispensary CREATE OR REPLACE VIEW v_dispensary_crawl_summary AS SELECT d.id AS dispensary_id, COALESCE(d.dba_name, d.name) AS dispensary_name, d.city, d.state, d.state_id, s.name AS state_name, COUNT(DISTINCT sp.id) AS current_product_count, COUNT(DISTINCT sp.id) FILTER (WHERE sp.is_in_stock) AS in_stock_count, COUNT(DISTINCT sp.id) FILTER (WHERE sp.is_on_special) AS on_special_count, MAX(cr.finished_at) AS last_crawl_at, (SELECT status FROM crawl_runs WHERE dispensary_id = d.id ORDER BY started_at DESC LIMIT 1) AS last_crawl_status FROM dispensaries d LEFT JOIN states s ON s.id = d.state_id LEFT JOIN store_products sp ON sp.dispensary_id = d.id LEFT JOIN crawl_runs cr ON cr.dispensary_id = d.id GROUP BY d.id, d.dba_name, d.name, d.city, d.state, d.state_id, s.name; -- ============================================================================ -- MIGRATION 051 COMPLETE -- ============================================================================ SELECT 'Migration 051 completed successfully. Canonical schema is ready.' AS status;