feat: Add v2 architecture with multi-state support and orchestrator services

Major additions:
- Multi-state expansion: states table, StateSelector, NationalDashboard, StateHeatmap, CrossStateCompare
- Orchestrator services: trace service, error taxonomy, retry manager, proxy rotator
- Discovery system: dutchie discovery service, geo validation, city seeding scripts
- Analytics infrastructure: analytics v2 routes, brand/pricing/stores intelligence pages
- Local development: setup-local.sh starts all 5 services (postgres, backend, cannaiq, findadispo, findagram)
- Migrations 037-056: crawler profiles, states, analytics indexes, worker metadata

Frontend pages added:
- Discovery, ChainsDashboard, IntelligenceBrands, IntelligencePricing, IntelligenceStores
- StateHeatmap, CrossStateCompare, SyncInfoPanel

Components added:
- StateSelector, OrchestratorTraceModal, WorkflowStepper

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-07 11:30:57 -07:00
parent 8ac64ba077
commit b4a2fb7d03
248 changed files with 60714 additions and 666 deletions

View File

@@ -0,0 +1,90 @@
-- Migration 037: per-store crawler profiles for Dutchie dispensaries.
-- Gives each store its own crawler configuration without changing shared logic.
-- Phase 1 is schema only: nothing here changes crawl behavior automatically.

-- Crawler profiles table. Rows are removed together with their dispensary
-- (ON DELETE CASCADE on dispensary_id).
CREATE TABLE IF NOT EXISTS dispensary_crawler_profiles (
    id              SERIAL PRIMARY KEY,
    dispensary_id   INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,

    -- Human readable name for this profile
    profile_name    VARCHAR(255) NOT NULL,

    -- High-level type, e.g. 'dutchie', 'treez', 'jane'
    crawler_type    VARCHAR(50) NOT NULL,

    -- Optional key for mapping to a per-store crawler module later,
    -- e.g. 'curaleaf-dispensary-gilbert'
    profile_key     VARCHAR(255),

    -- Generic configuration bucket; will hold selectors, URLs, flags, etc.
    config          JSONB NOT NULL DEFAULT '{}'::jsonb,

    -- Execution hints (safe defaults; can be overridden in config if needed)
    timeout_ms      INTEGER DEFAULT 30000,
    download_images BOOLEAN DEFAULT TRUE,
    track_stock     BOOLEAN DEFAULT TRUE,

    version         INTEGER DEFAULT 1,
    enabled         BOOLEAN DEFAULT TRUE,
    created_at      TIMESTAMPTZ DEFAULT NOW(),
    updated_at      TIMESTAMPTZ DEFAULT NOW()
);

-- A profile name may be used only once per dispensary.
CREATE UNIQUE INDEX IF NOT EXISTS dispensary_crawler_profiles_unique_name
    ON dispensary_crawler_profiles (dispensary_id, profile_name);

-- Supports "enabled profiles of a given crawler type" lookups.
CREATE INDEX IF NOT EXISTS idx_crawler_profiles_type_enabled
    ON dispensary_crawler_profiles (crawler_type, enabled);

-- Supports lookup by dispensary.
-- NOTE(review): the unique index above already leads with dispensary_id, so
-- this index may be redundant — confirm before relying on or removing it.
CREATE INDEX IF NOT EXISTS idx_crawler_profiles_dispensary
    ON dispensary_crawler_profiles (dispensary_id);
-- Add dispensaries.active_crawler_profile_id: a nullable pointer to the profile
-- currently in use for the store. Deleting the referenced profile clears the
-- pointer (ON DELETE SET NULL) instead of blocking the delete.
--
-- ADD COLUMN IF NOT EXISTS is used rather than a DO block that probes
-- information_schema.columns by table_name alone: that probe is not
-- schema-scoped, so a same-named table in another schema could make the check
-- report "already exists" and skip the ALTER. IF NOT EXISTS checks the exact
-- relation being altered; when the column is already present the whole clause
-- (including the REFERENCES constraint) is skipped with a NOTICE.
ALTER TABLE dispensaries
    ADD COLUMN IF NOT EXISTS active_crawler_profile_id INTEGER NULL
        REFERENCES dispensary_crawler_profiles(id) ON DELETE SET NULL;

-- Partial index on the FK for faster joins; rows without an active profile
-- are left out of the index.
CREATE INDEX IF NOT EXISTS idx_dispensaries_active_profile
    ON dispensaries (active_crawler_profile_id)
    WHERE active_crawler_profile_id IS NOT NULL;
-- Trigger function: overwrite updated_at on the incoming row with the current
-- time and hand the row back so the write proceeds.
-- NOTE(review): CREATE OR REPLACE will overwrite any existing function with
-- this name and signature — confirm no other migration defines it differently.
CREATE OR REPLACE FUNCTION set_updated_at_timestamp()
RETURNS TRIGGER
LANGUAGE plpgsql
AS $$
BEGIN
    NEW.updated_at := NOW();
    RETURN NEW;
END;
$$;

-- Attach the function to dispensary_crawler_profiles. The trigger is dropped
-- first so re-running this migration does not fail on an existing trigger.
DROP TRIGGER IF EXISTS dispensary_crawler_profiles_set_timestamp
    ON dispensary_crawler_profiles;

CREATE TRIGGER dispensary_crawler_profiles_set_timestamp
    BEFORE UPDATE
    ON dispensary_crawler_profiles
    FOR EACH ROW
    EXECUTE PROCEDURE set_updated_at_timestamp();
-- Catalog documentation for the profiles table, its columns, and the pointer
-- column on dispensaries.
COMMENT ON TABLE dispensary_crawler_profiles
    IS 'Per-store crawler configuration profiles. Each dispensary can have multiple profiles but only one active at a time.';

COMMENT ON COLUMN dispensary_crawler_profiles.profile_name
    IS 'Human readable name for the profile, e.g. "Curaleaf Gilbert - Dutchie v1"';

COMMENT ON COLUMN dispensary_crawler_profiles.crawler_type
    IS 'The crawler implementation type: dutchie, treez, jane, sandbox, custom';

COMMENT ON COLUMN dispensary_crawler_profiles.profile_key
    IS 'Optional identifier for per-store crawler module mapping';

COMMENT ON COLUMN dispensary_crawler_profiles.config
    IS 'JSONB configuration for the crawler. Schema depends on crawler_type.';

COMMENT ON COLUMN dispensary_crawler_profiles.timeout_ms
    IS 'Request timeout in milliseconds (default 30000)';

COMMENT ON COLUMN dispensary_crawler_profiles.download_images
    IS 'Whether to download product images locally';

COMMENT ON COLUMN dispensary_crawler_profiles.track_stock
    IS 'Whether to track inventory/stock levels';

COMMENT ON COLUMN dispensary_crawler_profiles.version
    IS 'Profile version number for A/B testing or upgrades';

COMMENT ON COLUMN dispensary_crawler_profiles.enabled
    IS 'Whether this profile can be used (soft delete)';

COMMENT ON COLUMN dispensaries.active_crawler_profile_id
    IS 'FK to the currently active crawler profile for this dispensary';

View File

@@ -0,0 +1,84 @@
-- Migration: add a status field to dispensary_crawler_profiles.
-- Gives the crawler state machine a proper column to work with.
-- Status values: 'production', 'sandbox', 'needs_manual', 'disabled'

-- All four columns are added in one statement; each is skipped individually
-- if it already exists.
ALTER TABLE dispensary_crawler_profiles
    -- Defaults to 'production' so existing profiles keep running as before
    ADD COLUMN IF NOT EXISTS status VARCHAR(50) DEFAULT 'production',
    -- When a sandbox profile should next be retried
    ADD COLUMN IF NOT EXISTS next_retry_at TIMESTAMPTZ,
    -- Attempt counter kept as a column for quick lookup
    ADD COLUMN IF NOT EXISTS sandbox_attempt_count INTEGER DEFAULT 0,
    -- Tracking timestamp for sandbox activity
    ADD COLUMN IF NOT EXISTS last_sandbox_at TIMESTAMPTZ;

-- Find enabled profiles by status.
CREATE INDEX IF NOT EXISTS idx_crawler_profiles_status
    ON dispensary_crawler_profiles (status)
    WHERE enabled = true;

-- Find enabled sandbox profiles by their next retry time.
CREATE INDEX IF NOT EXISTS idx_crawler_profiles_next_retry
    ON dispensary_crawler_profiles (next_retry_at)
    WHERE enabled = true AND status = 'sandbox';

-- Document the allowed status values in the catalog.
COMMENT ON COLUMN dispensary_crawler_profiles.status
    IS 'Crawler status: production (ready for regular crawls), sandbox (discovery mode), needs_manual (max retries exceeded), disabled (turned off)';
-- Backfill the new columns from values previously kept inside config (JSONB).

-- Status: fill rows whose status is missing or blank from config->>'status',
-- falling back to 'production'.
-- NOTE(review): when the status column has just been added by this migration
-- with DEFAULT 'production', every existing row already holds 'production', so
-- this predicate appears to match nothing and config->>'status' is never
-- copied. Confirm whether a config-driven backfill was intended.
UPDATE dispensary_crawler_profiles
SET status = COALESCE(config->>'status', 'production')
WHERE status IS NULL OR status = '';

-- Attempt count: number of entries in config.sandboxAttempts.
-- The jsonb_typeof guard restricts the update to rows where the key really is
-- a JSON array. `config->'sandboxAttempts' IS NOT NULL` is also true for a
-- JSON null or any scalar/object, and jsonb_array_length raises an error on
-- those, which would abort the whole migration.
UPDATE dispensary_crawler_profiles
SET sandbox_attempt_count = jsonb_array_length(config->'sandboxAttempts')
WHERE jsonb_typeof(config->'sandboxAttempts') = 'array';

-- Next retry time: copied from config.nextRetryAt when present.
-- ->> yields SQL NULL for a missing key or a JSON null, so those rows are
-- skipped; the value must be text that casts to timestamptz.
UPDATE dispensary_crawler_profiles
SET next_retry_at = (config->>'nextRetryAt')::timestamptz
WHERE config->>'nextRetryAt' IS NOT NULL;
-- Summary view over enabled crawler profiles, joined to their dispensary.
-- Adds two derived columns:
--   crawler_mode   : 'per-store' when a profile_key is set, otherwise 'legacy'
--   status_display : human-readable label for status; sandbox profiles show
--                    'Retry Due' once next_retry_at has passed, else 'Waiting'
CREATE OR REPLACE VIEW v_crawler_profile_summary AS
SELECT
    dcp.id,
    dcp.dispensary_id,
    d.name AS dispensary_name,
    d.city,
    d.menu_type,
    dcp.profile_name,
    dcp.profile_key,
    dcp.crawler_type,
    dcp.status,
    dcp.enabled,
    dcp.sandbox_attempt_count,
    dcp.next_retry_at,
    dcp.last_sandbox_at,
    dcp.created_at,
    dcp.updated_at,
    CASE
        WHEN dcp.profile_key IS NULL THEN 'legacy'
        ELSE 'per-store'
    END AS crawler_mode,
    CASE dcp.status
        WHEN 'production' THEN 'Ready'
        WHEN 'sandbox' THEN
            -- A NULL next_retry_at makes the comparison unknown, so it falls
            -- through to 'Waiting'.
            CASE
                WHEN dcp.next_retry_at <= NOW() THEN 'Retry Due'
                ELSE 'Waiting'
            END
        WHEN 'needs_manual' THEN 'Needs Manual'
        WHEN 'disabled' THEN 'Disabled'
        ELSE 'Unknown'
    END AS status_display
FROM dispensary_crawler_profiles AS dcp
INNER JOIN dispensaries AS d
    ON d.id = dcp.dispensary_id
WHERE dcp.enabled = true
ORDER BY
    dcp.status,
    dcp.updated_at DESC;

View File

@@ -0,0 +1,73 @@
-- Migration: create crawl_orchestration_traces.
-- Stores a detailed step-by-step trace for every crawl orchestration run,
-- giving full visibility into per-store crawler behavior.
CREATE TABLE IF NOT EXISTS crawl_orchestration_traces (
    id              SERIAL PRIMARY KEY,
    dispensary_id   INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,

    -- UUID or job ID for this crawl run
    run_id          VARCHAR(255),

    -- Profile used for the run; cleared if the profile is later deleted
    profile_id      INTEGER REFERENCES dispensary_crawler_profiles(id) ON DELETE SET NULL,

    -- e.g. "trulieve-scottsdale"
    profile_key     VARCHAR(255),

    -- Full path to .ts file loaded
    crawler_module  VARCHAR(255),

    -- sandbox, production, legacy, disabled
    state_at_start  VARCHAR(50),

    -- sandbox, production, needs_manual, etc.
    state_at_end    VARCHAR(50),

    -- The trace: ordered array of step objects
    trace           JSONB NOT NULL DEFAULT '[]'::jsonb,

    -- Summary metrics for quick querying
    total_steps     INTEGER DEFAULT 0,
    duration_ms     INTEGER,
    success         BOOLEAN,
    error_message   TEXT,
    products_found  INTEGER,

    -- Timestamps
    started_at      TIMESTAMPTZ DEFAULT NOW(),
    completed_at    TIMESTAMPTZ,
    created_at      TIMESTAMPTZ DEFAULT NOW()
);

-- Quick lookup by dispensary.
CREATE INDEX IF NOT EXISTS idx_traces_dispensary_id
    ON crawl_orchestration_traces (dispensary_id);

-- Latest trace per dispensary.
CREATE INDEX IF NOT EXISTS idx_traces_dispensary_created
    ON crawl_orchestration_traces (dispensary_id, created_at DESC);

-- Lookup by run_id; rows without one are not indexed.
CREATE INDEX IF NOT EXISTS idx_traces_run_id
    ON crawl_orchestration_traces (run_id)
    WHERE run_id IS NOT NULL;

-- Lookup by profile; rows without one are not indexed.
CREATE INDEX IF NOT EXISTS idx_traces_profile_id
    ON crawl_orchestration_traces (profile_id)
    WHERE profile_id IS NOT NULL;

-- Catalog comment describing the shape of one trace step.
COMMENT ON COLUMN crawl_orchestration_traces.trace IS
'Ordered array of step objects. Each step has:
{
"step": 1,
"action": "load_profile",
"description": "Loading crawler profile for dispensary",
"timestamp": 1701234567890,
"duration_ms": 45,
"input": { ... },
"output": { ... },
"what": "Description of what happened",
"why": "Reason this step was taken",
"where": "Code location / module",
"how": "Method or approach used",
"when": "ISO timestamp"
}';
-- View: the most recent trace per dispensary (by created_at), with the
-- dispensary's name and city attached.
--
-- The trace columns are listed explicitly instead of using cot.*. A view's
-- "*" is expanded when the view is created, and CREATE OR REPLACE VIEW only
-- allows new columns to be appended at the END of the existing column list.
-- With cot.* followed by the two dispensary columns, any column later added to
-- crawl_orchestration_traces lands in the middle of the list on re-run, and
-- the statement fails — so this migration would stop being re-runnable.
-- The list below matches the columns defined by this migration's CREATE TABLE,
-- in the same order.
--
-- cot.id DESC breaks ties between traces that share a created_at, so the
-- chosen row is deterministic.
CREATE OR REPLACE VIEW v_latest_crawl_traces AS
SELECT DISTINCT ON (cot.dispensary_id)
    cot.id,
    cot.dispensary_id,
    cot.run_id,
    cot.profile_id,
    cot.profile_key,
    cot.crawler_module,
    cot.state_at_start,
    cot.state_at_end,
    cot.trace,
    cot.total_steps,
    cot.duration_ms,
    cot.success,
    cot.error_message,
    cot.products_found,
    cot.started_at,
    cot.completed_at,
    cot.created_at,
    d.name AS dispensary_name,
    d.city AS dispensary_city
FROM crawl_orchestration_traces AS cot
INNER JOIN dispensaries AS d
    ON d.id = cot.dispensary_id
ORDER BY
    cot.dispensary_id,
    cot.created_at DESC,
    cot.id DESC;

View File

@@ -0,0 +1,73 @@
-- Migration 040: Add dba_name and related business/contact columns to dispensaries
-- DBA (Doing Business As) name - the name the dispensary operates under,
-- which may differ from the legal entity name
-- This migration is idempotent - safe to run multiple times

-- All columns are nullable with no default. ADD COLUMN IF NOT EXISTS replaces
-- per-column DO blocks that probed information_schema.columns by table_name
-- only: that probe is not schema-scoped, so a "dispensaries" table in another
-- schema could make it skip the ALTER and leave the COMMENT statements below
-- failing on a missing column. IF NOT EXISTS checks the exact relation being
-- altered and skips each existing column with a NOTICE.
ALTER TABLE dispensaries
    -- DBA (Doing Business As) name
    ADD COLUMN IF NOT EXISTS dba_name TEXT,
    -- Legal entity name
    ADD COLUMN IF NOT EXISTS company_name TEXT,
    -- Arizona Department of Health Services license number
    -- NOTE(review): stored as INTEGER; confirm license numbers are purely
    -- numeric with no leading zeros or letters.
    ADD COLUMN IF NOT EXISTS azdhs_id INTEGER,
    -- Contact details
    ADD COLUMN IF NOT EXISTS phone TEXT,
    ADD COLUMN IF NOT EXISTS email TEXT,
    -- Google rating; NUMERIC(2,1) holds one digit before and one after the point
    ADD COLUMN IF NOT EXISTS google_rating NUMERIC(2,1),
    ADD COLUMN IF NOT EXISTS google_review_count INTEGER;

-- Add comments for documentation
COMMENT ON COLUMN dispensaries.dba_name IS 'DBA (Doing Business As) name - the public-facing name the dispensary operates under';
COMMENT ON COLUMN dispensaries.company_name IS 'Legal entity/company name that owns the dispensary';
COMMENT ON COLUMN dispensaries.azdhs_id IS 'Arizona Department of Health Services license number';
COMMENT ON COLUMN dispensaries.phone IS 'Contact phone number';
COMMENT ON COLUMN dispensaries.email IS 'Contact email address';
COMMENT ON COLUMN dispensaries.google_rating IS 'Google Maps rating (1.0 to 5.0)';
COMMENT ON COLUMN dispensaries.google_review_count IS 'Number of Google reviews';

-- Indexes for searching by dba_name and by AZDHS license number
CREATE INDEX IF NOT EXISTS idx_dispensaries_dba_name ON dispensaries (dba_name);
CREATE INDEX IF NOT EXISTS idx_dispensaries_azdhs_id ON dispensaries (azdhs_id);

View File

@@ -0,0 +1,376 @@
-- Migration 041: CannaiQ Canonical Schema
--
-- This migration adds the canonical CannaiQ schema tables and columns.
-- ALL CHANGES ARE ADDITIVE - NO DROPS, NO DELETES, NO TRUNCATES.
--
-- Run with: psql $CANNAIQ_DB_URL -f migrations/041_cannaiq_canonical_schema.sql
--
-- Tables created:
-- - states (new)
-- - chains (new)
-- - brands (new)
-- - store_products (new - normalized view of current menu)
-- - store_product_snapshots (new - historical crawl data)
-- - crawl_runs (new - replaces/supplements dispensary_crawl_jobs)
--
-- Tables modified:
-- - dispensaries (add state_id, chain_id FKs)
-- - dispensary_crawler_profiles (add status, allow_autopromote, validated_at)
-- - crawl_orchestration_traces (add crawl_run_id FK)
--
-- =====================================================
-- 1) STATES TABLE
-- =====================================================
-- Lookup table of US states keyed by a unique two-letter code.
CREATE TABLE IF NOT EXISTS states (
    id         SERIAL PRIMARY KEY,
    code       VARCHAR(2) NOT NULL UNIQUE,
    name       VARCHAR(100) NOT NULL,
    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW()
);

-- Seed the known states. Codes that are already present are left untouched,
-- so re-running the migration does not fail or overwrite existing rows.
INSERT INTO states (code, name)
VALUES
    ('AZ', 'Arizona'),
    ('CA', 'California'),
    ('CO', 'Colorado'),
    ('FL', 'Florida'),
    ('IL', 'Illinois'),
    ('MA', 'Massachusetts'),
    ('MD', 'Maryland'),
    ('MI', 'Michigan'),
    ('MO', 'Missouri'),
    ('NV', 'Nevada'),
    ('NJ', 'New Jersey'),
    ('NY', 'New York'),
    ('OH', 'Ohio'),
    ('OK', 'Oklahoma'),
    ('OR', 'Oregon'),
    ('PA', 'Pennsylvania'),
    ('WA', 'Washington')
ON CONFLICT (code) DO NOTHING;

COMMENT ON TABLE states
    IS 'US states where CannaiQ operates. Single source of truth for state codes.';
-- =====================================================
-- 2) CHAINS TABLE (retail groups)
-- =====================================================
-- One row per retail chain, identified by a unique slug.
CREATE TABLE IF NOT EXISTS chains (
    id          SERIAL PRIMARY KEY,
    name        VARCHAR(255) NOT NULL,
    slug        VARCHAR(255) NOT NULL UNIQUE,
    website_url TEXT,
    logo_url    TEXT,
    description TEXT,
    is_active   BOOLEAN DEFAULT TRUE,
    created_at  TIMESTAMPTZ DEFAULT NOW(),
    updated_at  TIMESTAMPTZ DEFAULT NOW()
);

-- NOTE(review): the UNIQUE constraint on slug already creates an index, so
-- idx_chains_slug looks redundant — confirm before relying on or removing it.
CREATE INDEX IF NOT EXISTS idx_chains_slug
    ON chains (slug);

-- Partial index covering only active chains.
CREATE INDEX IF NOT EXISTS idx_chains_active
    ON chains (is_active)
    WHERE is_active = TRUE;

COMMENT ON TABLE chains
    IS 'Retail chains/groups that own multiple dispensary locations (e.g., Curaleaf, Trulieve).';
-- =====================================================
-- 3) BRANDS TABLE (canonical brand catalog)
-- =====================================================
-- One row per brand, identified by a unique slug.
CREATE TABLE IF NOT EXISTS brands (
    id                 SERIAL PRIMARY KEY,
    name               VARCHAR(255) NOT NULL,
    slug               VARCHAR(255) NOT NULL UNIQUE,

    -- Provider-specific brand ID
    external_id        VARCHAR(100),

    website_url        TEXT,
    instagram_handle   VARCHAR(100),
    logo_url           TEXT,
    description        TEXT,

    -- TRUE if brand we represent
    is_portfolio_brand BOOLEAN DEFAULT FALSE,

    is_active          BOOLEAN DEFAULT TRUE,
    created_at         TIMESTAMPTZ DEFAULT NOW(),
    updated_at         TIMESTAMPTZ DEFAULT NOW()
);

-- NOTE(review): the UNIQUE constraint on slug already creates an index, so
-- idx_brands_slug looks redundant — confirm before relying on or removing it.
CREATE INDEX IF NOT EXISTS idx_brands_slug
    ON brands (slug);

-- Lookup by provider brand ID; brands without one are not indexed.
CREATE INDEX IF NOT EXISTS idx_brands_external_id
    ON brands (external_id)
    WHERE external_id IS NOT NULL;

-- Partial index covering only portfolio brands.
CREATE INDEX IF NOT EXISTS idx_brands_portfolio
    ON brands (is_portfolio_brand)
    WHERE is_portfolio_brand = TRUE;

COMMENT ON TABLE brands
    IS 'Canonical brand catalog. Brands may appear across multiple dispensaries.';

COMMENT ON COLUMN brands.is_portfolio_brand
    IS 'TRUE if this is a brand we represent/manage (vs third-party brand)';
-- =====================================================
-- 4) ADD state_id AND chain_id TO dispensaries
-- =====================================================
-- Both columns are nullable foreign keys; each is skipped if already present.
-- No ON DELETE action is declared, so the default (NO ACTION) applies.
ALTER TABLE dispensaries
    ADD COLUMN IF NOT EXISTS state_id INTEGER REFERENCES states(id),
    ADD COLUMN IF NOT EXISTS chain_id INTEGER REFERENCES chains(id);

-- NOTE: state_id backfill is done by ETL script (042_legacy_import.ts), not this migration.

CREATE INDEX IF NOT EXISTS idx_dispensaries_state_id
    ON dispensaries (state_id);

-- Dispensaries without a chain are left out of this index.
CREATE INDEX IF NOT EXISTS idx_dispensaries_chain_id
    ON dispensaries (chain_id)
    WHERE chain_id IS NOT NULL;

COMMENT ON COLUMN dispensaries.state_id
    IS 'FK to states table. Canonical state reference.';

COMMENT ON COLUMN dispensaries.chain_id
    IS 'FK to chains table. NULL if independent dispensary.';
-- =====================================================
-- 5) STORE_PRODUCTS TABLE (current menu state)
-- =====================================================
-- The normalized "what is currently on the menu" table.
-- Supplements dutchie_products with a provider-agnostic structure.
CREATE TABLE IF NOT EXISTS store_products (
    id                  SERIAL PRIMARY KEY,
    dispensary_id       INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,

    -- Link to canonical product
    product_id          INTEGER REFERENCES products(id) ON DELETE SET NULL,

    -- Link to canonical brand
    brand_id            INTEGER REFERENCES brands(id) ON DELETE SET NULL,

    -- Provider-specific identifiers
    -- provider: dutchie, treez, jane, etc.
    provider            VARCHAR(50) NOT NULL DEFAULT 'dutchie',
    -- Platform-specific product ID
    provider_product_id VARCHAR(100),
    -- Platform-specific brand ID
    provider_brand_id   VARCHAR(100),

    -- Raw data from platform (not normalized)
    name_raw            VARCHAR(500) NOT NULL,
    brand_name_raw      VARCHAR(255),
    category_raw        VARCHAR(100),
    subcategory_raw     VARCHAR(100),

    -- Pricing
    price_rec           NUMERIC(10,2),
    price_med           NUMERIC(10,2),
    price_rec_special   NUMERIC(10,2),
    price_med_special   NUMERIC(10,2),
    is_on_special       BOOLEAN DEFAULT FALSE,
    special_name        TEXT,
    discount_percent    NUMERIC(5,2),

    -- Inventory
    is_in_stock         BOOLEAN DEFAULT TRUE,
    stock_quantity      INTEGER,
    stock_status        VARCHAR(50) DEFAULT 'in_stock',

    -- Potency
    thc_percent         NUMERIC(5,2),
    cbd_percent         NUMERIC(5,2),

    -- Images
    image_url           TEXT,
    local_image_path    TEXT,

    -- Timestamps
    first_seen_at       TIMESTAMPTZ DEFAULT NOW(),
    last_seen_at        TIMESTAMPTZ DEFAULT NOW(),
    created_at          TIMESTAMPTZ DEFAULT NOW(),
    updated_at          TIMESTAMPTZ DEFAULT NOW(),

    -- One row per provider product per dispensary.
    -- NOTE(review): provider_product_id is nullable and NULLs are treated as
    -- distinct by UNIQUE, so rows without an ID are not deduplicated — confirm
    -- that is acceptable.
    UNIQUE (dispensary_id, provider, provider_product_id)
);

CREATE INDEX IF NOT EXISTS idx_store_products_dispensary
    ON store_products (dispensary_id);

CREATE INDEX IF NOT EXISTS idx_store_products_product
    ON store_products (product_id)
    WHERE product_id IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_store_products_brand
    ON store_products (brand_id)
    WHERE brand_id IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_store_products_provider
    ON store_products (provider);

CREATE INDEX IF NOT EXISTS idx_store_products_in_stock
    ON store_products (dispensary_id, is_in_stock);

CREATE INDEX IF NOT EXISTS idx_store_products_special
    ON store_products (dispensary_id, is_on_special)
    WHERE is_on_special = TRUE;

CREATE INDEX IF NOT EXISTS idx_store_products_last_seen
    ON store_products (last_seen_at DESC);

COMMENT ON TABLE store_products
    IS 'Current state of products on each dispensary menu. Provider-agnostic.';

COMMENT ON COLUMN store_products.product_id
    IS 'FK to canonical products table. NULL if not yet mapped.';

COMMENT ON COLUMN store_products.brand_id
    IS 'FK to canonical brands table. NULL if not yet mapped.';
-- =====================================================
-- 6) STORE_PRODUCT_SNAPSHOTS TABLE (historical data)
-- =====================================================
-- The critical time-series table for analytics: one row per product per crawl.
CREATE TABLE IF NOT EXISTS store_product_snapshots (
    id                  SERIAL PRIMARY KEY,
    dispensary_id       INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,

    -- Links are cleared (not cascaded) when the referenced row is deleted,
    -- so the snapshot itself is kept.
    store_product_id    INTEGER REFERENCES store_products(id) ON DELETE SET NULL,
    product_id          INTEGER REFERENCES products(id) ON DELETE SET NULL,

    -- Provider info
    provider            VARCHAR(50) NOT NULL DEFAULT 'dutchie',
    provider_product_id VARCHAR(100),

    -- Link to crawl run (FK added after crawl_runs table created)
    crawl_run_id        INTEGER,

    -- Capture timestamp
    captured_at         TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    -- Raw data from platform
    name_raw            VARCHAR(500),
    brand_name_raw      VARCHAR(255),
    category_raw        VARCHAR(100),
    subcategory_raw     VARCHAR(100),

    -- Pricing at time of capture
    price_rec           NUMERIC(10,2),
    price_med           NUMERIC(10,2),
    price_rec_special   NUMERIC(10,2),
    price_med_special   NUMERIC(10,2),
    is_on_special       BOOLEAN DEFAULT FALSE,
    discount_percent    NUMERIC(5,2),

    -- Inventory at time of capture
    is_in_stock         BOOLEAN DEFAULT TRUE,
    stock_quantity      INTEGER,
    stock_status        VARCHAR(50) DEFAULT 'in_stock',

    -- Potency at time of capture
    thc_percent         NUMERIC(5,2),
    cbd_percent         NUMERIC(5,2),

    -- Image URL at time of capture
    image_url           TEXT,

    -- Full raw response for debugging
    raw_data            JSONB,

    created_at          TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_snapshots_dispensary_captured
    ON store_product_snapshots (dispensary_id, captured_at DESC);

CREATE INDEX IF NOT EXISTS idx_snapshots_product_captured
    ON store_product_snapshots (product_id, captured_at DESC)
    WHERE product_id IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_snapshots_store_product
    ON store_product_snapshots (store_product_id)
    WHERE store_product_id IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_snapshots_crawl_run
    ON store_product_snapshots (crawl_run_id)
    WHERE crawl_run_id IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_snapshots_captured_at
    ON store_product_snapshots (captured_at DESC);

COMMENT ON TABLE store_product_snapshots
    IS 'Historical crawl data. One row per product per crawl. NEVER DELETE.';

COMMENT ON COLUMN store_product_snapshots.captured_at
    IS 'When this snapshot was captured (crawl time).';
-- =====================================================
-- 7) CRAWL_RUNS TABLE (job execution records)
-- =====================================================
-- One row per crawl execution for a dispensary.
CREATE TABLE IF NOT EXISTS crawl_runs (
    id                SERIAL PRIMARY KEY,
    dispensary_id     INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,

    -- Provider
    provider          VARCHAR(50) NOT NULL DEFAULT 'dutchie',

    -- Execution times
    started_at        TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    finished_at       TIMESTAMPTZ,
    duration_ms       INTEGER,

    -- Status: running, success, failed, partial
    status            VARCHAR(20) NOT NULL DEFAULT 'running',
    error_message     TEXT,

    -- Results
    products_found    INTEGER DEFAULT 0,
    products_new      INTEGER DEFAULT 0,
    products_updated  INTEGER DEFAULT 0,
    snapshots_written INTEGER DEFAULT 0,

    -- Metadata
    worker_id         VARCHAR(100),
    -- trigger_type: scheduled, manual, api
    trigger_type      VARCHAR(50) DEFAULT 'scheduled',
    metadata          JSONB DEFAULT '{}',

    created_at        TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary
    ON crawl_runs (dispensary_id);

CREATE INDEX IF NOT EXISTS idx_crawl_runs_status
    ON crawl_runs (status);

CREATE INDEX IF NOT EXISTS idx_crawl_runs_started
    ON crawl_runs (started_at DESC);

-- Supports "most recent run for a dispensary" lookups.
CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary_started
    ON crawl_runs (dispensary_id, started_at DESC);

COMMENT ON TABLE crawl_runs
    IS 'Each crawl execution. Links to snapshots and traces.';
-- Add the FK from store_product_snapshots.crawl_run_id to crawl_runs(id).
-- ADD CONSTRAINT has no IF NOT EXISTS form, so existence is checked first to
-- keep the migration re-runnable.
--
-- The check reads pg_constraint scoped to this exact table (conrelid), rather
-- than matching information_schema.table_constraints on constraint_name alone:
-- constraint names are not unique across tables or schemas, so a same-named
-- constraint elsewhere would wrongly suppress the ALTER. The ::regclass cast
-- resolves the table through the search path, the same way ALTER TABLE does.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'store_product_snapshots_crawl_run_id_fkey'
          AND conrelid = 'store_product_snapshots'::regclass
    ) THEN
        -- Deleting a crawl run clears the link but keeps the snapshot row.
        ALTER TABLE store_product_snapshots
            ADD CONSTRAINT store_product_snapshots_crawl_run_id_fkey
            FOREIGN KEY (crawl_run_id) REFERENCES crawl_runs(id) ON DELETE SET NULL;
    END IF;
END $$;
-- =====================================================
-- 8) UPDATE crawl_orchestration_traces
-- =====================================================
-- Add crawl_run_id, a nullable FK to crawl_runs, if it is not already there.
-- Deleting the run clears the link (ON DELETE SET NULL) and keeps the trace.
ALTER TABLE crawl_orchestration_traces
    ADD COLUMN IF NOT EXISTS crawl_run_id INTEGER
        REFERENCES crawl_runs(id) ON DELETE SET NULL;

-- Lookup traces by crawl run; traces without one are not indexed.
CREATE INDEX IF NOT EXISTS idx_traces_crawl_run
    ON crawl_orchestration_traces (crawl_run_id)
    WHERE crawl_run_id IS NOT NULL;
-- =====================================================
-- 9) UPDATE dispensary_crawler_profiles
-- =====================================================
-- Add columns from the canonical schema that may be missing. Each column is
-- skipped if it already exists.
-- NOTE(review): the earlier "Add status field" migration adds status with
-- DEFAULT 'production'; if that ran first, the status clause here is a no-op
-- and the 'sandbox' default never takes effect — confirm which default is
-- intended.
ALTER TABLE dispensary_crawler_profiles
    ADD COLUMN IF NOT EXISTS status VARCHAR(50) DEFAULT 'sandbox',
    ADD COLUMN IF NOT EXISTS allow_autopromote BOOLEAN DEFAULT FALSE,
    ADD COLUMN IF NOT EXISTS validated_at TIMESTAMPTZ;

-- NOTE(review): idx_crawler_profiles_status (partial, enabled rows only) is
-- also defined on status by the earlier migration — confirm both are needed.
CREATE INDEX IF NOT EXISTS idx_profiles_status
    ON dispensary_crawler_profiles (status);

COMMENT ON COLUMN dispensary_crawler_profiles.status
    IS 'Profile status: sandbox, production, needs_manual, disabled';

COMMENT ON COLUMN dispensary_crawler_profiles.allow_autopromote
    IS 'Whether this profile can be auto-promoted from sandbox to production';

COMMENT ON COLUMN dispensary_crawler_profiles.validated_at
    IS 'When this profile was last validated as working';
-- =====================================================
-- 10) VIEWS FOR BACKWARD COMPATIBILITY
-- =====================================================
-- View: the latest snapshot per store product.
--
-- A store product is identified by (dispensary_id, provider,
-- provider_product_id) — the same key store_products declares UNIQUE — so
-- provider is part of the DISTINCT ON key. Without it, two providers that
-- reuse the same product ID at one dispensary would collapse into one row.
-- sps.id DESC breaks ties between snapshots sharing a captured_at, so the
-- chosen row is deterministic.
--
-- sps.* is kept deliberately: it is the only item in the select list, so
-- columns added to the table later are appended at the end of the view on
-- re-run, which CREATE OR REPLACE VIEW permits.
--
-- NOTE(review): DISTINCT ON treats NULLs as equal, so all snapshots with a
-- NULL provider_product_id for a dispensary/provider reduce to one row —
-- confirm that is acceptable.
CREATE OR REPLACE VIEW v_latest_store_snapshots AS
SELECT DISTINCT ON (sps.dispensary_id, sps.provider, sps.provider_product_id)
    sps.*
FROM store_product_snapshots AS sps
ORDER BY
    sps.dispensary_id,
    sps.provider,
    sps.provider_product_id,
    sps.captured_at DESC,
    sps.id DESC;
-- View: per-dispensary crawl summary — current product counts, time of the
-- last finished crawl, and status of the most recently started crawl.
--
-- Products and crawl runs are aggregated in separate subqueries BEFORE being
-- joined to dispensaries. Joining both tables directly on dispensary_id
-- produces (products x runs) rows per dispensary; COUNT(DISTINCT ...) hides
-- the duplication but the intermediate result grows with every crawl.
-- store_products.id is the primary key, so COUNT(*) per dispensary equals the
-- former COUNT(DISTINCT sp.id). COALESCE keeps the counts at 0 (not NULL) for
-- dispensaries with no products, as before.
CREATE OR REPLACE VIEW v_dispensary_crawl_summary AS
SELECT
    d.id AS dispensary_id,
    d.name AS dispensary_name,
    d.city,
    d.state,
    COALESCE(sp.current_product_count, 0) AS current_product_count,
    COALESCE(sp.in_stock_count, 0) AS in_stock_count,
    COALESCE(sp.on_special_count, 0) AS on_special_count,
    cr.last_crawl_at,
    -- Status of the run with the latest started_at for this dispensary
    (
        SELECT r.status
        FROM crawl_runs AS r
        WHERE r.dispensary_id = d.id
        ORDER BY r.started_at DESC
        LIMIT 1
    ) AS last_crawl_status
FROM dispensaries AS d
LEFT JOIN (
    SELECT
        dispensary_id,
        COUNT(*) AS current_product_count,
        COUNT(*) FILTER (WHERE is_in_stock) AS in_stock_count,
        COUNT(*) FILTER (WHERE is_on_special) AS on_special_count
    FROM store_products
    GROUP BY dispensary_id
) AS sp
    ON sp.dispensary_id = d.id
LEFT JOIN (
    SELECT
        dispensary_id,
        MAX(finished_at) AS last_crawl_at
    FROM crawl_runs
    GROUP BY dispensary_id
) AS cr
    ON cr.dispensary_id = d.id;
-- =====================================================
-- 11) COMMENTS
-- =====================================================
-- Final table comments. COMMENT ON replaces any comment already stored for the
-- object, so these statements supersede the per-section table comments set
-- earlier in this migration. Each text therefore carries the detail from both
-- places (e.g. the NEVER DELETE rule for snapshots) instead of dropping it.
COMMENT ON TABLE states IS 'Canonical list of US states where CannaiQ operates. Single source of truth for state codes. Use state_id FK in dispensaries.';
COMMENT ON TABLE chains IS 'Retail chains (multi-location operators) that own multiple dispensary locations (e.g., Curaleaf, Trulieve).';
COMMENT ON TABLE brands IS 'Canonical brand catalog across all providers. Brands may appear across multiple dispensaries.';
COMMENT ON TABLE store_products IS 'Current menu state per dispensary. Provider-agnostic.';
COMMENT ON TABLE store_product_snapshots IS 'Historical price/stock data. One row per product per crawl. NEVER DELETE.';
COMMENT ON TABLE crawl_runs IS 'Crawl execution records. Links snapshots and traces to runs.';
-- =====================================================
-- MIGRATION COMPLETE
-- =====================================================
--
-- Next steps (manual - not in this migration):
-- 1. Populate chains table from known retail groups
-- 2. Populate brands table from existing dutchie_products.brand_name
-- 3. Migrate data from dutchie_products → store_products
-- 4. Migrate data from dutchie_product_snapshots → store_product_snapshots
-- 5. Link dispensaries.chain_id to chains where applicable
--

View File

@@ -0,0 +1,50 @@
-- Migration 043: Add States Table
--
-- Creates the states table if it does not exist.
-- Safe to run multiple times (idempotent).
--
-- Run with:
-- CANNAIQ_DB_URL="postgresql://..." psql $CANNAIQ_DB_URL -f migrations/043_add_states_table.sql

-- =====================================================
-- 1) CREATE STATES TABLE
-- =====================================================
-- Migration 041 also creates this table. The column types here match that
-- definition (VARCHAR(2) / VARCHAR(100)) so the schema is the same whichever
-- migration ends up creating the table; when it already exists this statement
-- is a no-op.
CREATE TABLE IF NOT EXISTS states (
    id         SERIAL PRIMARY KEY,
    code       VARCHAR(2) NOT NULL UNIQUE,
    name       VARCHAR(100) NOT NULL,
    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW()
);

-- =====================================================
-- 2) INSERT CORE US STATES
-- =====================================================
-- Codes that are already present are left untouched.
INSERT INTO states (code, name)
VALUES
    ('AZ', 'Arizona'),
    ('CA', 'California'),
    ('CO', 'Colorado'),
    ('FL', 'Florida'),
    ('IL', 'Illinois'),
    ('MA', 'Massachusetts'),
    ('MD', 'Maryland'),
    ('MI', 'Michigan'),
    ('MO', 'Missouri'),
    ('NV', 'Nevada'),
    ('NJ', 'New Jersey'),
    ('NY', 'New York'),
    ('OH', 'Ohio'),
    ('OK', 'Oklahoma'),
    ('OR', 'Oregon'),
    ('PA', 'Pennsylvania'),
    ('WA', 'Washington')
ON CONFLICT (code) DO NOTHING;

-- =====================================================
-- 3) ADD INDEX
-- =====================================================
-- NOTE(review): the UNIQUE constraint on code already creates an index, so
-- idx_states_code looks redundant. It is kept here so the set of named
-- objects is unchanged — confirm before removing it.
CREATE INDEX IF NOT EXISTS idx_states_code
    ON states (code);

-- =====================================================
-- DONE
-- =====================================================

View File

@@ -0,0 +1,45 @@
-- Migration 044: Add provider_detection_data column to dispensaries
--
-- This column stores detection metadata for menu provider discovery.
-- Used by menu-detection.ts and discovery.ts to track:
-- - Detected provider type
-- - Resolution attempts
-- - Error messages
-- - not_crawlable flag
--
-- Run with: psql $CANNAIQ_DB_URL -f migrations/044_add_provider_detection_data.sql
--
-- ALL CHANGES ARE ADDITIVE - NO DROPS, NO DELETES, NO TRUNCATES.

-- Add provider_detection_data to dispensaries, reporting via NOTICE whether
-- the column was added or was already there.
--
-- The existence check reads pg_attribute for the exact relation that
-- ALTER TABLE will touch ('dispensaries'::regclass resolves through the search
-- path the same way). Matching information_schema.columns on table_name alone
-- is not schema-scoped, so a same-named table in another schema could make
-- the check report "already exists" and skip the ALTER. attisdropped filters
-- out columns that were dropped but still have a catalog row.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_attribute
        WHERE attrelid = 'dispensaries'::regclass
          AND attname = 'provider_detection_data'
          AND NOT attisdropped
    ) THEN
        ALTER TABLE dispensaries
            ADD COLUMN provider_detection_data JSONB DEFAULT NULL;
        RAISE NOTICE 'Added provider_detection_data column to dispensaries table';
    ELSE
        RAISE NOTICE 'provider_detection_data column already exists on dispensaries table';
    END IF;
END;
$$ LANGUAGE plpgsql;

-- Expression index for querying by the not_crawlable flag (compared as text,
-- since ->> returns text). Rows with no detection data are not indexed.
CREATE INDEX IF NOT EXISTS idx_dispensaries_provider_detection_not_crawlable
    ON dispensaries ((provider_detection_data->>'not_crawlable'))
    WHERE provider_detection_data IS NOT NULL;

-- Expression index for querying by detected provider.
CREATE INDEX IF NOT EXISTS idx_dispensaries_provider_detection_provider
    ON dispensaries ((provider_detection_data->>'detected_provider'))
    WHERE provider_detection_data IS NOT NULL;

COMMENT ON COLUMN dispensaries.provider_detection_data IS 'JSONB metadata from menu provider detection. Keys: detected_provider, resolution_error, not_crawlable, detection_timestamp';

-- =====================================================
-- MIGRATION COMPLETE
-- =====================================================

View File

@@ -0,0 +1,27 @@
-- Migration 045: Add thumbnail_url columns to canonical tables
--
-- NOTE: image_url already exists in both tables from migration 041.
-- This migration adds thumbnail_url for cached thumbnail images.
-- Add thumbnail_url to both canonical tables.
-- ADD COLUMN IF NOT EXISTS is idempotent and checks the actual target table,
-- unlike an information_schema.columns probe that is not schema-qualified.
ALTER TABLE store_products
    ADD COLUMN IF NOT EXISTS thumbnail_url TEXT NULL;

ALTER TABLE store_product_snapshots
    ADD COLUMN IF NOT EXISTS thumbnail_url TEXT NULL;

COMMENT ON COLUMN store_products.thumbnail_url IS 'URL to cached thumbnail image';
COMMENT ON COLUMN store_product_snapshots.thumbnail_url IS 'URL to cached thumbnail image at time of snapshot';

View File

@@ -0,0 +1,351 @@
-- Migration 046: Crawler Reliability & Stabilization
-- Phase 1: Add fields for error taxonomy, retry management, and self-healing
-- ============================================================
-- PART 1: Error Taxonomy - Standardized error codes
-- ============================================================
-- Create enum for standardized error codes
DO $$
BEGIN
    -- CREATE TYPE has no IF NOT EXISTS form, so probe the catalog first.
    -- Columns added later in this migration store error codes as TEXT.
    PERFORM 1 FROM pg_type WHERE typname = 'crawl_error_code';

    IF NOT FOUND THEN
        CREATE TYPE crawl_error_code AS ENUM (
            'SUCCESS',
            'RATE_LIMITED',
            'BLOCKED_PROXY',
            'HTML_CHANGED',
            'TIMEOUT',
            'AUTH_FAILED',
            'NETWORK_ERROR',
            'PARSE_ERROR',
            'NO_PRODUCTS',
            'UNKNOWN_ERROR'
        );
    END IF;
END;
$$ LANGUAGE plpgsql;
-- ============================================================
-- PART 2: Dispensary Crawl Configuration
-- ============================================================
-- Add crawl config columns to dispensaries
-- Crawl configuration and reliability counters on dispensaries.
-- One ALTER TABLE with ADD COLUMN IF NOT EXISTS replaces eleven separate
-- information_schema probes. It is idempotent and checks the real target table
-- rather than any table named "dispensaries" in any schema.
ALTER TABLE dispensaries
    -- Minutes between crawls
    ADD COLUMN IF NOT EXISTS crawl_frequency_minutes INTEGER DEFAULT 240,
    -- Max retries per crawl
    ADD COLUMN IF NOT EXISTS max_retries INTEGER DEFAULT 3,
    -- Current proxy ID
    ADD COLUMN IF NOT EXISTS current_proxy_id INTEGER NULL,
    -- Current user agent
    ADD COLUMN IF NOT EXISTS current_user_agent TEXT NULL,
    -- Next scheduled run
    ADD COLUMN IF NOT EXISTS next_crawl_at TIMESTAMPTZ NULL,
    -- Last successful crawl
    ADD COLUMN IF NOT EXISTS last_success_at TIMESTAMPTZ NULL,
    -- Last error code (TEXT for flexibility, validated in the application)
    ADD COLUMN IF NOT EXISTS last_error_code TEXT NULL,
    -- Crawl status: active, degraded, paused, failed
    ADD COLUMN IF NOT EXISTS crawl_status TEXT DEFAULT 'active',
    -- Backoff multiplier (increases with failures)
    ADD COLUMN IF NOT EXISTS backoff_multiplier NUMERIC(4,2) DEFAULT 1.0,
    -- Lifetime attempt count
    ADD COLUMN IF NOT EXISTS total_attempts INTEGER DEFAULT 0,
    -- Lifetime success count
    ADD COLUMN IF NOT EXISTS total_successes INTEGER DEFAULT 0;
-- ============================================================
-- PART 3: Enhanced Job Tracking
-- ============================================================
-- Add columns to dispensary_crawl_jobs
-- Per-job tracking columns on dispensary_crawl_jobs.
-- ADD COLUMN IF NOT EXISTS is idempotent and checks the real target table,
-- avoiding the schema-unqualified information_schema.columns probes.
ALTER TABLE dispensary_crawl_jobs
    -- Error code (TEXT; see crawl_attempts.error_code for the same convention)
    ADD COLUMN IF NOT EXISTS error_code TEXT NULL,
    -- Proxy used for this job
    ADD COLUMN IF NOT EXISTS proxy_used TEXT NULL,
    -- User agent used for this job
    ADD COLUMN IF NOT EXISTS user_agent_used TEXT NULL,
    -- Attempt number for this job
    ADD COLUMN IF NOT EXISTS attempt_number INTEGER DEFAULT 1,
    -- Backoff delay applied, in milliseconds
    ADD COLUMN IF NOT EXISTS backoff_delay_ms INTEGER DEFAULT 0,
    -- HTTP status code received
    ADD COLUMN IF NOT EXISTS http_status INTEGER NULL,
    -- Response time, in milliseconds
    ADD COLUMN IF NOT EXISTS response_time_ms INTEGER NULL;
-- ============================================================
-- PART 4: Crawl History Table (for detailed tracking)
-- ============================================================
-- One row per crawl attempt, linked to its dispensary and (optionally) its job.
CREATE TABLE IF NOT EXISTS crawl_attempts (
    id                 SERIAL PRIMARY KEY,
    dispensary_id      INTEGER NOT NULL REFERENCES dispensaries(id),
    job_id             INTEGER REFERENCES dispensary_crawl_jobs(id),

    -- Timing (duration in milliseconds)
    started_at         TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    finished_at        TIMESTAMPTZ,
    duration_ms        INTEGER,

    -- Outcome
    error_code         TEXT NOT NULL DEFAULT 'UNKNOWN_ERROR',
    error_message      TEXT,
    http_status        INTEGER,

    -- Request context
    attempt_number     INTEGER NOT NULL DEFAULT 1,
    proxy_used         TEXT,
    user_agent_used    TEXT,

    -- Result counters
    products_found     INTEGER DEFAULT 0,
    products_upserted  INTEGER DEFAULT 0,
    snapshots_created  INTEGER DEFAULT 0,

    -- Free-form extras
    metadata           JSONB,
    created_at         TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Lookup indexes: by store, by error code, and newest-first by start time.
CREATE INDEX IF NOT EXISTS idx_crawl_attempts_dispensary_id
    ON crawl_attempts (dispensary_id);
CREATE INDEX IF NOT EXISTS idx_crawl_attempts_error_code
    ON crawl_attempts (error_code);
CREATE INDEX IF NOT EXISTS idx_crawl_attempts_started_at
    ON crawl_attempts (started_at DESC);
-- ============================================================
-- PART 5: Views for Monitoring
-- ============================================================
-- Recreate from scratch so the column list can change between versions.
DROP VIEW IF EXISTS v_crawler_status;

-- Per-dispensary crawler status with reliability fields.
-- NOTE: rows are limited to dispensaries whose state is 'AZ'.
CREATE VIEW v_crawler_status AS
SELECT
    d.id,
    d.name,
    d.slug,
    d.menu_type,
    d.platform_dispensary_id,
    d.crawl_status,
    d.consecutive_failures,
    d.last_crawl_at,
    d.last_success_at,
    d.last_failure_at,
    d.last_error_code,
    d.next_crawl_at,
    d.crawl_frequency_minutes,
    d.max_retries,
    d.current_proxy_id,
    d.current_user_agent,
    d.backoff_multiplier,
    d.total_attempts,
    d.total_successes,
    d.product_count,
    -- Lifetime success percentage, one decimal; 0 when nothing was attempted.
    CASE
        WHEN d.total_attempts > 0
            THEN ROUND(CAST(d.total_successes AS NUMERIC) / d.total_attempts * 100, 1)
        ELSE 0
    END AS success_rate,
    -- First matching rule wins: explicit status, then missing setup, then schedule.
    CASE
        WHEN d.crawl_status = 'failed'                       THEN 'FAILED'
        WHEN d.crawl_status = 'paused'                       THEN 'PAUSED'
        WHEN d.crawl_status = 'degraded'                     THEN 'DEGRADED'
        WHEN d.menu_type IS NULL OR d.menu_type = 'unknown'  THEN 'NEEDS_DETECTION'
        WHEN d.platform_dispensary_id IS NULL                THEN 'NEEDS_PLATFORM_ID'
        WHEN d.next_crawl_at IS NULL                         THEN 'NOT_SCHEDULED'
        WHEN d.next_crawl_at <= NOW()                        THEN 'DUE'
        ELSE 'SCHEDULED'
    END AS schedule_status,
    d.failed_at,
    d.failure_notes
FROM dispensaries AS d
WHERE d.state = 'AZ';
-- Recreate from scratch so the column list can change between versions.
DROP VIEW IF EXISTS v_crawl_error_summary;

-- Crawl attempts from the trailing 7 days, rolled up per error code.
CREATE VIEW v_crawl_error_summary AS
SELECT
    ca.error_code,
    COUNT(*)                              AS total_occurrences,
    COUNT(DISTINCT ca.dispensary_id)      AS affected_stores,
    MAX(ca.started_at)                    AS last_occurrence,
    CAST(AVG(ca.duration_ms) AS INTEGER)  AS avg_duration_ms
FROM crawl_attempts AS ca
WHERE ca.started_at > NOW() - INTERVAL '7 days'
GROUP BY ca.error_code
ORDER BY total_occurrences DESC;
-- Recreate from scratch so the column list can change between versions.
DROP VIEW IF EXISTS v_crawl_health;

-- Single-row health rollup.
-- NOTE: only dispensaries with state = 'AZ' and menu_type = 'dutchie' are counted.
CREATE VIEW v_crawl_health AS
SELECT
    COUNT(*) FILTER (WHERE d.crawl_status = 'active')                        AS active_crawlers,
    COUNT(*) FILTER (WHERE d.crawl_status = 'degraded')                      AS degraded_crawlers,
    COUNT(*) FILTER (WHERE d.crawl_status = 'paused')                        AS paused_crawlers,
    COUNT(*) FILTER (WHERE d.crawl_status = 'failed')                        AS failed_crawlers,
    COUNT(*) FILTER (WHERE d.next_crawl_at <= NOW())                         AS due_now,
    COUNT(*) FILTER (WHERE d.consecutive_failures > 0)                       AS stores_with_failures,
    CAST(AVG(d.consecutive_failures) AS NUMERIC(4,2))                        AS avg_consecutive_failures,
    COUNT(*) FILTER (WHERE d.last_success_at > NOW() - INTERVAL '24 hours')  AS successful_last_24h
FROM dispensaries AS d
WHERE d.state = 'AZ'
  AND d.menu_type = 'dutchie';
-- ============================================================
-- PART 6: Constraint for minimum crawl gap
-- ============================================================
-- Trigger function: reject a new 'pending' job when the same dispensary already
-- has another pending/running job created within the last 2 minutes.
CREATE OR REPLACE FUNCTION check_minimum_crawl_gap()
RETURNS TRIGGER AS $$
BEGIN
    -- Guard clause: only new pending jobs tied to a dispensary are checked.
    IF NEW.status IS DISTINCT FROM 'pending' OR NEW.dispensary_id IS NULL THEN
        RETURN NEW;
    END IF;

    -- Look for a sibling job that is still queued or running and was created recently.
    PERFORM 1
    FROM dispensary_crawl_jobs AS j
    WHERE j.dispensary_id = NEW.dispensary_id
      AND j.id <> NEW.id
      AND j.status IN ('pending', 'running')
      AND j.created_at > NOW() - INTERVAL '2 minutes';

    IF FOUND THEN
        RAISE EXCEPTION 'Minimum 2-minute gap required between crawls for same dispensary';
    END IF;

    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

-- Re-create the trigger so the script can be re-run safely.
DROP TRIGGER IF EXISTS enforce_minimum_crawl_gap ON dispensary_crawl_jobs;

CREATE TRIGGER enforce_minimum_crawl_gap
    BEFORE INSERT ON dispensary_crawl_jobs
    FOR EACH ROW
    EXECUTE FUNCTION check_minimum_crawl_gap();
-- ============================================================
-- PART 7: Comments
-- ============================================================
-- Catalog comments. The view descriptions state the row filters the views
-- actually apply (state = 'AZ'; v_crawl_health also requires menu_type = 'dutchie'),
-- so readers do not mistake them for system-wide figures.
COMMENT ON TABLE crawl_attempts IS 'Detailed history of every crawl attempt for analytics and debugging';
COMMENT ON VIEW v_crawler_status IS 'Current status of Arizona (state = AZ) crawlers with reliability metrics';
COMMENT ON VIEW v_crawl_error_summary IS 'Summary of errors by type over last 7 days';
COMMENT ON VIEW v_crawl_health IS 'Overall health metrics for Arizona (state = AZ) Dutchie crawlers';

View File

@@ -0,0 +1,130 @@
-- Migration 046: Raw Payloads Table
--
-- Immutable event stream for raw crawler responses.
-- NEVER delete or overwrite historical payloads.
--
-- Run with:
-- DATABASE_URL="postgresql://..." psql $DATABASE_URL -f migrations/046_raw_payloads_table.sql
-- =====================================================
-- 1) RAW_PAYLOADS TABLE
-- =====================================================
-- Raw crawler responses plus their hydration bookkeeping.
CREATE TABLE IF NOT EXISTS raw_payloads (
    id                  UUID PRIMARY KEY DEFAULT gen_random_uuid(),

    -- Owning store; payloads are removed together with the dispensary
    dispensary_id       INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,

    -- Originating crawl run (nullable for backfilled data)
    crawl_run_id        INTEGER REFERENCES crawl_runs(id) ON DELETE SET NULL,

    -- Source platform
    platform            VARCHAR(50) NOT NULL DEFAULT 'dutchie',

    -- Payload schema version, for normalization compatibility
    payload_version     INTEGER NOT NULL DEFAULT 1,

    -- Raw JSON response from the crawler (treated as immutable)
    raw_json            JSONB NOT NULL,

    -- Descriptive metadata
    product_count       INTEGER,        -- number of products in the payload
    pricing_type        VARCHAR(20),    -- 'rec', 'med', or 'both'
    crawl_mode          VARCHAR(20),    -- 'mode_a', 'mode_b', 'dual'

    -- When the payload was fetched
    fetched_at          TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    -- Hydration bookkeeping
    processed           BOOLEAN NOT NULL DEFAULT FALSE,
    normalized_at       TIMESTAMPTZ,
    hydration_error     TEXT,
    hydration_attempts  INTEGER DEFAULT 0,

    -- Audit
    created_at          TIMESTAMPTZ DEFAULT NOW()
);

-- Work queue: unprocessed payloads, oldest first.
CREATE INDEX IF NOT EXISTS idx_raw_payloads_unprocessed
    ON raw_payloads (fetched_at ASC)
    WHERE processed = FALSE;

-- Per-store history, newest first.
CREATE INDEX IF NOT EXISTS idx_raw_payloads_dispensary
    ON raw_payloads (dispensary_id, fetched_at DESC);

-- Filter by platform.
CREATE INDEX IF NOT EXISTS idx_raw_payloads_platform
    ON raw_payloads (platform);

-- Link back to a crawl run; rows without a run are not indexed.
CREATE INDEX IF NOT EXISTS idx_raw_payloads_crawl_run
    ON raw_payloads (crawl_run_id)
    WHERE crawl_run_id IS NOT NULL;

-- Payloads that recorded a hydration error.
CREATE INDEX IF NOT EXISTS idx_raw_payloads_errors
    ON raw_payloads (hydration_attempts, processed)
    WHERE hydration_error IS NOT NULL;
-- =====================================================
-- 3) HYDRATION LOCKS TABLE (distributed locking)
-- =====================================================
-- Named locks held by hydration workers; lock_name is unique, so each name
-- has at most one holder row.
CREATE TABLE IF NOT EXISTS hydration_locks (
    id            SERIAL PRIMARY KEY,
    lock_name     VARCHAR(100) NOT NULL UNIQUE,
    worker_id     VARCHAR(100) NOT NULL,
    acquired_at   TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    expires_at    TIMESTAMPTZ NOT NULL,
    heartbeat_at  TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Lookups by expiry time.
CREATE INDEX IF NOT EXISTS idx_hydration_locks_expires
    ON hydration_locks (expires_at);
-- =====================================================
-- 4) HYDRATION_RUNS TABLE (audit trail)
-- =====================================================
-- One row per hydration job execution.
CREATE TABLE IF NOT EXISTS hydration_runs (
    id                  SERIAL PRIMARY KEY,
    worker_id           VARCHAR(100) NOT NULL,
    started_at          TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    finished_at         TIMESTAMPTZ,
    status              VARCHAR(20) NOT NULL DEFAULT 'running',  -- running, completed, failed

    -- Counters
    payloads_processed  INTEGER DEFAULT 0,
    products_upserted   INTEGER DEFAULT 0,
    snapshots_created   INTEGER DEFAULT 0,
    brands_created      INTEGER DEFAULT 0,
    errors_count        INTEGER DEFAULT 0,

    -- Failure details
    error_message       TEXT,
    created_at          TIMESTAMPTZ DEFAULT NOW()
);

-- Runs by status, newest first.
CREATE INDEX IF NOT EXISTS idx_hydration_runs_status
    ON hydration_runs (status, started_at DESC);
-- =====================================================
-- 5) COMMENTS
-- =====================================================
-- Catalog comments (visible via \d+ in psql) for the raw payload tables
-- and the hydration bookkeeping tables created above.
COMMENT ON TABLE raw_payloads IS 'Immutable event stream of raw crawler responses. NEVER DELETE.';
COMMENT ON COLUMN raw_payloads.raw_json IS 'Complete raw JSON from GraphQL/API response. Immutable.';
COMMENT ON COLUMN raw_payloads.payload_version IS 'Schema version for normalization compatibility.';
COMMENT ON COLUMN raw_payloads.processed IS 'TRUE when payload has been hydrated to canonical tables.';
COMMENT ON COLUMN raw_payloads.normalized_at IS 'When the payload was successfully hydrated.';
COMMENT ON TABLE hydration_locks IS 'Distributed locks for hydration workers to prevent double-processing.';
COMMENT ON TABLE hydration_runs IS 'Audit trail of hydration job executions.';
-- =====================================================
-- MIGRATION COMPLETE
-- =====================================================

View File

@@ -0,0 +1,473 @@
-- Migration 047: Analytics Infrastructure
-- Phase 3: Analytics Dashboards for CannaiQ
-- Creates views, functions, and tables for price trends, brand penetration, category growth, etc.
-- ============================================================
-- ANALYTICS CACHE TABLE (for expensive query results)
-- ============================================================
-- Key/value cache for expensive analytics query results.
CREATE TABLE IF NOT EXISTS analytics_cache (
    id             SERIAL PRIMARY KEY,
    cache_key      VARCHAR(255) NOT NULL UNIQUE,  -- UNIQUE already creates an index on cache_key
    cache_data     JSONB NOT NULL,
    computed_at    TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    expires_at     TIMESTAMPTZ NOT NULL,
    query_time_ms  INTEGER,                       -- query time in milliseconds
    created_at     TIMESTAMPTZ DEFAULT NOW()
);

-- No separate index on cache_key: the UNIQUE constraint above is backed by a
-- unique B-tree index, so a second index on the same column would only add
-- write and storage overhead.
-- NOTE(review): databases that already ran the earlier version still carry
-- idx_analytics_cache_key; it is harmless and can be dropped separately.

-- Lookups by expiry time.
CREATE INDEX IF NOT EXISTS idx_analytics_cache_expires
    ON analytics_cache (expires_at);
-- ============================================================
-- PRICE EXTRACTION HELPER FUNCTION
-- Extracts pricing from JSONB latest_raw_payload
-- ============================================================
-- Returns the lowest numeric price found in payload->'recPrices', falling back
-- to payload->'Prices'; NULL when neither holds a usable number.
--
-- Hardening over a plain jsonb_array_length() check:
--   * jsonb_typeof() = 'array' guards the array functions, which raise an error
--     when the key holds a scalar, an object, or JSON null.
--   * The pattern accepts only digits with at most one decimal point, so
--     strings such as '.' or '1.2.3' are skipped instead of failing the cast.
CREATE OR REPLACE FUNCTION extract_min_price(payload JSONB)
RETURNS NUMERIC AS $$
DECLARE
    price_key TEXT;
    prices    JSONB;
    min_val   NUMERIC;
BEGIN
    -- Keys are tried in priority order: recPrices first, then Prices.
    FOREACH price_key IN ARRAY ARRAY['recPrices', 'Prices'] LOOP
        prices := payload -> price_key;

        IF jsonb_typeof(prices) = 'array' THEN
            SELECT MIN(t.elem::NUMERIC)
            INTO min_val
            FROM jsonb_array_elements_text(prices) AS t(elem)
            WHERE t.elem ~ '^([0-9]+[.]?[0-9]*|[.][0-9]+)$';

            IF min_val IS NOT NULL THEN
                RETURN min_val;
            END IF;
        END IF;
    END LOOP;

    RETURN NULL;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
-- Returns the highest numeric price found in payload->'recPrices', falling back
-- to payload->'Prices'; NULL when neither holds a usable number.
--
-- jsonb_typeof() = 'array' guards the array functions (they raise on scalars,
-- objects and JSON null), and the pattern admits only well-formed decimals so
-- values such as '.' or '1.2.3' are skipped instead of failing the cast.
CREATE OR REPLACE FUNCTION extract_max_price(payload JSONB)
RETURNS NUMERIC AS $$
DECLARE
    price_key TEXT;
    prices    JSONB;
    max_val   NUMERIC;
BEGIN
    -- Keys are tried in priority order: recPrices first, then Prices.
    FOREACH price_key IN ARRAY ARRAY['recPrices', 'Prices'] LOOP
        prices := payload -> price_key;

        IF jsonb_typeof(prices) = 'array' THEN
            SELECT MAX(t.elem::NUMERIC)
            INTO max_val
            FROM jsonb_array_elements_text(prices) AS t(elem)
            WHERE t.elem ~ '^([0-9]+[.]?[0-9]*|[.][0-9]+)$';

            IF max_val IS NOT NULL THEN
                RETURN max_val;
            END IF;
        END IF;
    END LOOP;

    RETURN NULL;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
-- Returns the lowest numeric price in payload->'wholesalePrices', or NULL when
-- the key is absent, is not an array, or holds no usable number.
--
-- jsonb_typeof() = 'array' guards the array function (it raises on scalars,
-- objects and JSON null), and the pattern admits only well-formed decimals so
-- values such as '.' or '1.2.3' are skipped instead of failing the cast.
CREATE OR REPLACE FUNCTION extract_wholesale_price(payload JSONB)
RETURNS NUMERIC AS $$
DECLARE
    prices  JSONB;
    min_val NUMERIC;
BEGIN
    prices := payload -> 'wholesalePrices';

    IF jsonb_typeof(prices) = 'array' THEN
        SELECT MIN(t.elem::NUMERIC)
        INTO min_val
        FROM jsonb_array_elements_text(prices) AS t(elem)
        WHERE t.elem ~ '^([0-9]+[.]?[0-9]*|[.][0-9]+)$';

        RETURN min_val;
    END IF;

    RETURN NULL;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
-- ============================================================
-- VIEW: v_product_pricing
-- Flattened view of products with extracted pricing
-- ============================================================
-- One row per product, joined to its store, with prices pulled out of the
-- product's latest raw payload by the extract_* helper functions.
CREATE OR REPLACE VIEW v_product_pricing AS
SELECT
    dp.id,
    dp.dispensary_id,
    dp.name,
    dp.brand_name,
    dp.brand_id,
    dp.type                                          AS category,
    dp.subcategory,
    dp.strain_type,
    dp.stock_status,
    dp.status,
    d.name                                           AS store_name,
    d.city,
    d.state,
    extract_min_price(dp.latest_raw_payload)         AS min_price,
    extract_max_price(dp.latest_raw_payload)         AS max_price,
    extract_wholesale_price(dp.latest_raw_payload)   AS wholesale_price,
    dp.thc,
    dp.cbd,
    dp.updated_at,
    dp.created_at
FROM dutchie_products AS dp
INNER JOIN dispensaries AS d
    ON d.id = dp.dispensary_id;
-- ============================================================
-- VIEW: v_brand_store_presence
-- Which brands are in which stores
-- ============================================================
-- One row per (brand, store, category): SKU count, average minimum price,
-- in-stock SKU count and the most recent product update.
CREATE OR REPLACE VIEW v_brand_store_presence AS
SELECT
    dp.brand_name,
    dp.brand_id,
    dp.dispensary_id,
    d.name                                                AS store_name,
    d.city,
    d.state,
    dp.type                                               AS category,
    COUNT(*)                                              AS sku_count,
    AVG(extract_min_price(dp.latest_raw_payload))         AS avg_price,
    COUNT(*) FILTER (WHERE dp.stock_status = 'in_stock')  AS in_stock_count,
    MAX(dp.updated_at)                                    AS last_updated
FROM dutchie_products AS dp
INNER JOIN dispensaries AS d
    ON d.id = dp.dispensary_id
WHERE dp.brand_name IS NOT NULL
GROUP BY
    dp.brand_name,
    dp.brand_id,
    dp.dispensary_id,
    d.name,
    d.city,
    d.state,
    dp.type;
-- ============================================================
-- VIEW: v_category_store_summary
-- Category breakdown per store
-- ============================================================
-- One row per (store, category): SKU and brand counts, price range and
-- in-stock SKU count. Products without a type are excluded.
CREATE OR REPLACE VIEW v_category_store_summary AS
SELECT
    dp.dispensary_id,
    d.name                                                AS store_name,
    d.city,
    d.state,
    dp.type                                               AS category,
    COUNT(*)                                              AS sku_count,
    COUNT(DISTINCT dp.brand_name)                         AS brand_count,
    AVG(extract_min_price(dp.latest_raw_payload))         AS avg_price,
    MIN(extract_min_price(dp.latest_raw_payload))         AS min_price,
    MAX(extract_max_price(dp.latest_raw_payload))         AS max_price,
    COUNT(*) FILTER (WHERE dp.stock_status = 'in_stock')  AS in_stock_count
FROM dutchie_products AS dp
INNER JOIN dispensaries AS d
    ON d.id = dp.dispensary_id
WHERE dp.type IS NOT NULL
GROUP BY
    dp.dispensary_id,
    d.name,
    d.city,
    d.state,
    dp.type;
-- ============================================================
-- VIEW: v_brand_summary
-- Global brand statistics
-- ============================================================
-- One row per (brand_name, brand_id) across all stores, largest SKU count first.
CREATE OR REPLACE VIEW v_brand_summary AS
SELECT
    dp.brand_name,
    dp.brand_id,
    COUNT(*)                                                        AS total_skus,
    COUNT(DISTINCT dp.dispensary_id)                                AS store_count,
    COUNT(DISTINCT dp.type)                                         AS category_count,
    AVG(extract_min_price(dp.latest_raw_payload))                   AS avg_price,
    MIN(extract_min_price(dp.latest_raw_payload))                   AS min_price,
    MAX(extract_max_price(dp.latest_raw_payload))                   AS max_price,
    COUNT(*) FILTER (WHERE dp.stock_status = 'in_stock')            AS in_stock_skus,
    ARRAY_AGG(DISTINCT dp.type) FILTER (WHERE dp.type IS NOT NULL)  AS categories,
    MAX(dp.updated_at)                                              AS last_updated
FROM dutchie_products AS dp
WHERE dp.brand_name IS NOT NULL
GROUP BY
    dp.brand_name,
    dp.brand_id
ORDER BY total_skus DESC;
-- ============================================================
-- VIEW: v_category_summary
-- Global category statistics
-- ============================================================
-- One row per product type across all stores, largest SKU count first.
CREATE OR REPLACE VIEW v_category_summary AS
SELECT
    dp.type                                               AS category,
    COUNT(*)                                              AS total_skus,
    COUNT(DISTINCT dp.brand_name)                         AS brand_count,
    COUNT(DISTINCT dp.dispensary_id)                      AS store_count,
    AVG(extract_min_price(dp.latest_raw_payload))         AS avg_price,
    MIN(extract_min_price(dp.latest_raw_payload))         AS min_price,
    MAX(extract_max_price(dp.latest_raw_payload))         AS max_price,
    COUNT(*) FILTER (WHERE dp.stock_status = 'in_stock')  AS in_stock_skus
FROM dutchie_products AS dp
WHERE dp.type IS NOT NULL
GROUP BY dp.type
ORDER BY total_skus DESC;
-- ============================================================
-- VIEW: v_store_summary
-- Store-level statistics
-- ============================================================
-- One row per dispensary. The LEFT JOIN keeps stores that have no products;
-- their counts come out as 0 and avg_price as NULL.
CREATE OR REPLACE VIEW v_store_summary AS
SELECT
    d.id                                                  AS store_id,
    d.name                                                AS store_name,
    d.city,
    d.state,
    d.chain_id,
    COUNT(dp.id)                                          AS total_skus,
    COUNT(DISTINCT dp.brand_name)                         AS brand_count,
    COUNT(DISTINCT dp.type)                               AS category_count,
    AVG(extract_min_price(dp.latest_raw_payload))         AS avg_price,
    COUNT(*) FILTER (WHERE dp.stock_status = 'in_stock')  AS in_stock_skus,
    d.last_crawl_at,
    d.product_count
FROM dispensaries AS d
LEFT JOIN dutchie_products AS dp
    ON dp.dispensary_id = d.id
GROUP BY
    d.id,
    d.name,
    d.city,
    d.state,
    d.chain_id,
    d.last_crawl_at,
    d.product_count;
-- ============================================================
-- TABLE: brand_snapshots (for historical brand tracking)
-- ============================================================
-- Daily brand metrics; at most one row per brand_name per snapshot_date.
CREATE TABLE IF NOT EXISTS brand_snapshots (
    id             SERIAL PRIMARY KEY,
    brand_name     VARCHAR(255) NOT NULL,
    brand_id       VARCHAR(255),
    snapshot_date  DATE NOT NULL,
    store_count    INTEGER NOT NULL DEFAULT 0,
    total_skus     INTEGER NOT NULL DEFAULT 0,
    avg_price      NUMERIC(10,2),
    in_stock_skus  INTEGER NOT NULL DEFAULT 0,
    categories     TEXT[],
    created_at     TIMESTAMPTZ DEFAULT NOW(),
    UNIQUE (brand_name, snapshot_date)
);

-- Lookups by brand and by date.
CREATE INDEX IF NOT EXISTS idx_brand_snapshots_brand
    ON brand_snapshots (brand_name);
CREATE INDEX IF NOT EXISTS idx_brand_snapshots_date
    ON brand_snapshots (snapshot_date);
-- ============================================================
-- TABLE: category_snapshots (for historical category tracking)
-- ============================================================
-- Daily category metrics; at most one row per category per snapshot_date.
CREATE TABLE IF NOT EXISTS category_snapshots (
    id             SERIAL PRIMARY KEY,
    category       VARCHAR(255) NOT NULL,
    snapshot_date  DATE NOT NULL,
    store_count    INTEGER NOT NULL DEFAULT 0,
    brand_count    INTEGER NOT NULL DEFAULT 0,
    total_skus     INTEGER NOT NULL DEFAULT 0,
    avg_price      NUMERIC(10,2),
    in_stock_skus  INTEGER NOT NULL DEFAULT 0,
    created_at     TIMESTAMPTZ DEFAULT NOW(),
    UNIQUE (category, snapshot_date)
);

-- Lookups by category and by date.
CREATE INDEX IF NOT EXISTS idx_category_snapshots_cat
    ON category_snapshots (category);
CREATE INDEX IF NOT EXISTS idx_category_snapshots_date
    ON category_snapshots (snapshot_date);
-- ============================================================
-- TABLE: store_change_events (for tracking store changes)
-- ============================================================
-- Log of per-store change events.
CREATE TABLE IF NOT EXISTS store_change_events (
    id            SERIAL PRIMARY KEY,
    store_id      INTEGER NOT NULL REFERENCES dispensaries(id),
    -- brand_added, brand_removed, product_added, product_removed, price_change, stock_change
    event_type    VARCHAR(50) NOT NULL,
    event_date    DATE NOT NULL,
    brand_name    VARCHAR(255),
    product_id    INTEGER,
    product_name  VARCHAR(500),
    category      VARCHAR(255),
    old_value     TEXT,
    new_value     TEXT,
    metadata      JSONB,
    created_at    TIMESTAMPTZ DEFAULT NOW()
);

-- Lookups by store, event type, date and brand.
CREATE INDEX IF NOT EXISTS idx_store_events_store
    ON store_change_events (store_id);
CREATE INDEX IF NOT EXISTS idx_store_events_type
    ON store_change_events (event_type);
CREATE INDEX IF NOT EXISTS idx_store_events_date
    ON store_change_events (event_date);
CREATE INDEX IF NOT EXISTS idx_store_events_brand
    ON store_change_events (brand_name);
-- ============================================================
-- TABLE: analytics_alerts
-- ============================================================
-- Alerts raised by analytics, optionally tied to a store.
CREATE TABLE IF NOT EXISTS analytics_alerts (
    id           SERIAL PRIMARY KEY,
    -- price_warning, brand_dropped, competitive_intrusion, restock_event
    alert_type   VARCHAR(50) NOT NULL,
    -- info, warning, critical
    severity     VARCHAR(20) NOT NULL DEFAULT 'info',
    title        VARCHAR(255) NOT NULL,
    description  TEXT,
    store_id     INTEGER REFERENCES dispensaries(id),
    brand_name   VARCHAR(255),
    product_id   INTEGER,
    category     VARCHAR(255),
    metadata     JSONB,
    is_read      BOOLEAN DEFAULT FALSE,
    created_at   TIMESTAMPTZ DEFAULT NOW()
);

-- Lookups by type, read flag, and newest-first by creation time.
CREATE INDEX IF NOT EXISTS idx_analytics_alerts_type
    ON analytics_alerts (alert_type);
CREATE INDEX IF NOT EXISTS idx_analytics_alerts_read
    ON analytics_alerts (is_read);
CREATE INDEX IF NOT EXISTS idx_analytics_alerts_created
    ON analytics_alerts (created_at DESC);
-- ============================================================
-- FUNCTION: Capture daily brand snapshots
-- ============================================================
-- Upserts today's per-brand metrics into brand_snapshots and returns the number
-- of rows inserted or updated.
--
-- Rows are grouped by brand_name only, matching the UNIQUE (brand_name,
-- snapshot_date) conflict target. Grouping by (brand_name, brand_id) produced
-- two proposed rows for a brand name that appears with more than one brand_id
-- (or with both NULL and non-NULL ids), which makes ON CONFLICT DO UPDATE abort
-- with "command cannot affect row a second time".
CREATE OR REPLACE FUNCTION capture_brand_snapshots()
RETURNS INTEGER AS $$
DECLARE
    inserted_count INTEGER;
BEGIN
    INSERT INTO brand_snapshots (
        brand_name,
        brand_id,
        snapshot_date,
        store_count,
        total_skus,
        avg_price,
        in_stock_skus,
        categories
    )
    SELECT
        dp.brand_name,
        -- Deterministic representative id when a name maps to several ids;
        -- the TEXT cast keeps MIN() usable whatever the source column type is.
        MIN(dp.brand_id::TEXT),
        CURRENT_DATE,
        COUNT(DISTINCT dp.dispensary_id),
        COUNT(*),
        AVG(extract_min_price(dp.latest_raw_payload)),
        SUM(CASE WHEN dp.stock_status = 'in_stock' THEN 1 ELSE 0 END),
        ARRAY_AGG(DISTINCT dp.type) FILTER (WHERE dp.type IS NOT NULL)
    FROM dutchie_products AS dp
    WHERE dp.brand_name IS NOT NULL
    GROUP BY dp.brand_name
    ON CONFLICT (brand_name, snapshot_date)
    DO UPDATE SET
        store_count   = EXCLUDED.store_count,
        total_skus    = EXCLUDED.total_skus,
        avg_price     = EXCLUDED.avg_price,
        in_stock_skus = EXCLUDED.in_stock_skus,
        categories    = EXCLUDED.categories;

    GET DIAGNOSTICS inserted_count = ROW_COUNT;
    RETURN inserted_count;
END;
$$ LANGUAGE plpgsql;
-- ============================================================
-- FUNCTION: Capture daily category snapshots
-- ============================================================
-- Upserts today's per-category metrics into category_snapshots and returns the
-- number of rows inserted or updated.
CREATE OR REPLACE FUNCTION capture_category_snapshots()
RETURNS INTEGER AS $$
DECLARE
    affected_rows INTEGER;
BEGIN
    INSERT INTO category_snapshots (
        category,
        snapshot_date,
        store_count,
        brand_count,
        total_skus,
        avg_price,
        in_stock_skus
    )
    SELECT
        dp.type,
        CURRENT_DATE,
        COUNT(DISTINCT dp.dispensary_id),
        COUNT(DISTINCT dp.brand_name),
        COUNT(*),
        AVG(extract_min_price(dp.latest_raw_payload)),
        COUNT(*) FILTER (WHERE dp.stock_status = 'in_stock')
    FROM dutchie_products AS dp
    WHERE dp.type IS NOT NULL
    GROUP BY dp.type
    -- Re-running on the same day refreshes that day's row instead of failing.
    ON CONFLICT (category, snapshot_date)
    DO UPDATE SET
        store_count   = EXCLUDED.store_count,
        brand_count   = EXCLUDED.brand_count,
        total_skus    = EXCLUDED.total_skus,
        avg_price     = EXCLUDED.avg_price,
        in_stock_skus = EXCLUDED.in_stock_skus;

    GET DIAGNOSTICS affected_rows = ROW_COUNT;
    RETURN affected_rows;
END;
$$ LANGUAGE plpgsql;
-- ============================================================
-- FUNCTION: Calculate price volatility for a product
-- ============================================================
-- Coefficient of variation (stddev / mean, as a percentage rounded to 2 places)
-- of a product's rec_min_price over the last p_days days of snapshots.
-- Returns NULL when there is no priced snapshot in the window or the mean is 0.
CREATE OR REPLACE FUNCTION calculate_price_volatility(
    p_product_id INTEGER,
    p_days INTEGER DEFAULT 30
)
RETURNS NUMERIC AS $$
DECLARE
    v_stddev NUMERIC;
    v_mean   NUMERIC;
BEGIN
    -- Prices are stored in cents; convert to currency units before aggregating.
    SELECT
        STDDEV(s.rec_min_price_cents / 100.0),
        AVG(s.rec_min_price_cents / 100.0)
    INTO v_stddev, v_mean
    FROM dutchie_product_snapshots AS s
    WHERE s.dutchie_product_id = p_product_id
      AND s.rec_min_price_cents IS NOT NULL
      AND s.crawled_at >= NOW() - (p_days || ' days')::INTERVAL;

    -- A missing or zero mean would make the ratio undefined.
    IF COALESCE(v_mean, 0) = 0 THEN
        RETURN NULL;
    END IF;

    RETURN ROUND((v_stddev / v_mean) * 100, 2);
END;
$$ LANGUAGE plpgsql;
-- ============================================================
-- FUNCTION: Get brand penetration stats
-- ============================================================
-- Brand penetration statistics, optionally restricted to one state.
--
-- Parameters:
--   p_brand_name  brand to measure (exact match on dutchie_products.brand_name)
--   p_state       dispensaries.state to restrict to; NULL means all states
--
-- Returns a single row:
--   total_stores        dispensaries in scope
--   stores_carrying     dispensaries in scope with at least one product of the brand
--   penetration_pct     stores_carrying / total_stores * 100
--   total_skus          products of the brand in scope
--   avg_skus_per_store  total_skus / stores carrying the brand
--   shelf_share_pct     total_skus / all products in scope * 100
--
-- The brand SKU counts apply the same p_state filter as the totals. Without it,
-- a state-scoped call divided all-state brand SKUs by state-only totals, which
-- inflated total_skus, avg_skus_per_store and shelf_share_pct.
CREATE OR REPLACE FUNCTION get_brand_penetration(
    p_brand_name VARCHAR,
    p_state VARCHAR DEFAULT NULL
)
RETURNS TABLE (
    total_stores BIGINT,
    stores_carrying BIGINT,
    penetration_pct NUMERIC,
    total_skus BIGINT,
    avg_skus_per_store NUMERIC,
    shelf_share_pct NUMERIC
) AS $$
BEGIN
    RETURN QUERY
    WITH store_counts AS (
        -- LEFT JOIN keeps dispensaries that have no products in the store total.
        SELECT
            COUNT(DISTINCT d.id) AS total,
            COUNT(DISTINCT CASE WHEN dp.brand_name = p_brand_name THEN dp.dispensary_id END) AS carrying
        FROM dispensaries AS d
        LEFT JOIN dutchie_products AS dp
            ON dp.dispensary_id = d.id
        WHERE (p_state IS NULL OR d.state = p_state)
    ),
    sku_counts AS (
        -- Brand SKUs within the requested scope.
        SELECT
            COUNT(*) AS brand_skus,
            COUNT(DISTINCT dp.dispensary_id) AS stores_with_brand
        FROM dutchie_products AS dp
        INNER JOIN dispensaries AS d
            ON d.id = dp.dispensary_id
        WHERE dp.brand_name = p_brand_name
          AND (p_state IS NULL OR d.state = p_state)
    ),
    all_skus AS (
        -- Every product within the requested scope (denominator for shelf share).
        SELECT COUNT(*) AS total
        FROM dutchie_products AS dp
        INNER JOIN dispensaries AS d
            ON d.id = dp.dispensary_id
        WHERE (p_state IS NULL OR d.state = p_state)
    )
    SELECT
        sc.total,
        sc.carrying,
        -- NULLIF turns a zero denominator into NULL instead of a division error.
        ROUND((sc.carrying::NUMERIC / NULLIF(sc.total, 0)) * 100, 2),
        skc.brand_skus,
        ROUND(skc.brand_skus::NUMERIC / NULLIF(skc.stores_with_brand, 0), 2),
        ROUND((skc.brand_skus::NUMERIC / NULLIF(ask.total, 0)) * 100, 2)
    FROM store_counts AS sc
    CROSS JOIN sku_counts AS skc
    CROSS JOIN all_skus AS ask;
END;
$$ LANGUAGE plpgsql;
-- ============================================================
-- Initial snapshot capture (run manually if needed)
-- ============================================================
-- Note: Run these after migration to capture initial snapshots:
-- SELECT capture_brand_snapshots();
-- SELECT capture_category_snapshots();
-- ============================================================
-- Grant permissions
-- ============================================================
-- Views are accessible to all roles by default
-- Catalog comments for the analytics views and tables created in this migration.
-- v_category_store_summary is included so every view defined here is documented.
COMMENT ON VIEW v_product_pricing IS 'Flattened product view with extracted pricing from JSONB';
COMMENT ON VIEW v_brand_store_presence IS 'Brand presence across stores with SKU counts';
COMMENT ON VIEW v_category_store_summary IS 'Category breakdown per store';
COMMENT ON VIEW v_brand_summary IS 'Global brand statistics';
COMMENT ON VIEW v_category_summary IS 'Global category statistics';
COMMENT ON VIEW v_store_summary IS 'Store-level statistics';
COMMENT ON TABLE analytics_cache IS 'Cache for expensive analytics queries';
COMMENT ON TABLE brand_snapshots IS 'Historical daily snapshots of brand metrics';
COMMENT ON TABLE category_snapshots IS 'Historical daily snapshots of category metrics';
COMMENT ON TABLE store_change_events IS 'Log of brand/product changes at stores';
COMMENT ON TABLE analytics_alerts IS 'Analytics-generated alerts and notifications';

View File

@@ -0,0 +1,598 @@
-- Migration 048: Production Sync + Monitoring Infrastructure
-- Phase 5: Full Production Sync + Monitoring
--
-- Creates:
-- 1. Sync orchestrator tables
-- 2. Dead-letter queue (DLQ)
-- 3. System metrics tracking
-- 4. Integrity check results
-- 5. Auto-fix audit log
-- ============================================================
-- SYNC ORCHESTRATOR TABLES
-- ============================================================
-- Orchestrator state and control.
-- Single-row table: the primary key combined with CHECK (id = 1) allows at most
-- one row, which is seeded by the INSERT directly after the table definition.
CREATE TABLE IF NOT EXISTS sync_orchestrator_state (
id INTEGER PRIMARY KEY DEFAULT 1 CHECK (id = 1), -- Singleton row
status VARCHAR(20) NOT NULL DEFAULT 'SLEEPING', -- RUNNING, SLEEPING, LOCKED, PAUSED
current_worker_id VARCHAR(100),
last_heartbeat_at TIMESTAMPTZ,
-- Summary of the most recent run
last_run_started_at TIMESTAMPTZ,
last_run_completed_at TIMESTAMPTZ,
last_run_duration_ms INTEGER,
last_run_payloads_processed INTEGER DEFAULT 0,
last_run_errors INTEGER DEFAULT 0,
consecutive_failures INTEGER DEFAULT 0,
-- NOTE(review): is_paused overlaps with the PAUSED status value above; nothing in
-- this migration keeps the two in sync, so presumably the application does -- confirm.
is_paused BOOLEAN DEFAULT FALSE,
pause_reason TEXT,
-- Default tuning values. The keys are interpreted by the application code that
-- reads this column (not part of this migration).
config JSONB DEFAULT '{
"batchSize": 50,
"pollIntervalMs": 5000,
"maxRetries": 3,
"lockTimeoutMs": 300000,
"enableAnalyticsPrecompute": true,
"enableIntegrityChecks": true
}'::jsonb,
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW()
);
-- Insert singleton row if not exists (all other columns take their defaults;
-- re-running the migration leaves an existing row untouched)
INSERT INTO sync_orchestrator_state (id) VALUES (1) ON CONFLICT (id) DO NOTHING;
-- Sync run history: one row per orchestrator run, with counters, collected
-- errors and timing.
CREATE TABLE IF NOT EXISTS sync_runs (
    id SERIAL PRIMARY KEY,
    -- Public identifier of the run; the UNIQUE constraint creates its own index.
    run_id UUID DEFAULT gen_random_uuid() UNIQUE NOT NULL,
    worker_id VARCHAR(100) NOT NULL,
    status VARCHAR(20) NOT NULL DEFAULT 'running', -- running, completed, failed, cancelled
    started_at TIMESTAMPTZ DEFAULT NOW(),
    finished_at TIMESTAMPTZ,
    duration_ms INTEGER,
    -- Metrics
    payloads_queued INTEGER DEFAULT 0,
    payloads_processed INTEGER DEFAULT 0,
    payloads_skipped INTEGER DEFAULT 0,
    payloads_failed INTEGER DEFAULT 0,
    payloads_dlq INTEGER DEFAULT 0,
    products_upserted INTEGER DEFAULT 0,
    products_inserted INTEGER DEFAULT 0,
    products_updated INTEGER DEFAULT 0,
    products_discontinued INTEGER DEFAULT 0,
    snapshots_created INTEGER DEFAULT 0,
    -- Error tracking
    errors JSONB DEFAULT '[]'::jsonb,
    error_summary TEXT,
    -- Diff stats (before/after)
    diff_stats JSONB DEFAULT '{}'::jsonb,
    -- Analytics precompute triggered
    analytics_updated BOOLEAN DEFAULT FALSE,
    analytics_duration_ms INTEGER,
    created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_sync_runs_status ON sync_runs(status);
CREATE INDEX IF NOT EXISTS idx_sync_runs_started_at ON sync_runs(started_at DESC);
-- No separate index on run_id: the UNIQUE constraint above is already backed by
-- a unique index, so a second plain index would only add write overhead.
-- ============================================================
-- DEAD-LETTER QUEUE (DLQ)
-- ============================================================
-- DLQ for failed payloads.
-- Rows are written by move_to_dlq(), which copies the payload out of raw_payloads.
CREATE TABLE IF NOT EXISTS raw_payloads_dlq (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-- Id of the source row in raw_payloads (move_to_dlq stores the payload id here).
-- No foreign key is declared for it.
original_payload_id UUID NOT NULL,
-- NOTE(review): no ON DELETE action is given, so the default (NO ACTION) applies
-- and a dispensary cannot be deleted while DLQ rows still reference it.
dispensary_id INTEGER REFERENCES dispensaries(id),
-- Filled by move_to_dlq from dispensaries.state
state_code VARCHAR(2),
platform VARCHAR(50) DEFAULT 'dutchie',
-- Original payload data (preserved)
raw_json JSONB NOT NULL,
product_count INTEGER,
pricing_type VARCHAR(10),
crawl_mode VARCHAR(20),
-- DLQ metadata
moved_to_dlq_at TIMESTAMPTZ DEFAULT NOW(),
-- Filled by move_to_dlq from raw_payloads.hydration_attempts
failure_count INTEGER DEFAULT 0,
-- Error history (array of error objects)
error_history JSONB DEFAULT '[]'::jsonb,
last_error_type VARCHAR(50),
last_error_message TEXT,
last_error_at TIMESTAMPTZ,
-- Retry tracking
retry_count INTEGER DEFAULT 0,
last_retry_at TIMESTAMPTZ,
next_retry_at TIMESTAMPTZ,
-- Resolution
status VARCHAR(20) DEFAULT 'pending', -- pending, retrying, resolved, abandoned
resolved_at TIMESTAMPTZ,
resolved_by VARCHAR(100),
resolution_notes TEXT,
created_at TIMESTAMPTZ DEFAULT NOW()
);
-- Indexes for the columns the DLQ is filtered/sorted by (status, dispensary,
-- error type, arrival time). There is no index on original_payload_id.
CREATE INDEX IF NOT EXISTS idx_dlq_status ON raw_payloads_dlq(status);
CREATE INDEX IF NOT EXISTS idx_dlq_dispensary ON raw_payloads_dlq(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_dlq_error_type ON raw_payloads_dlq(last_error_type);
CREATE INDEX IF NOT EXISTS idx_dlq_moved_at ON raw_payloads_dlq(moved_to_dlq_at DESC);
-- ============================================================
-- SYSTEM METRICS
-- ============================================================
-- System metrics time series.
-- Append-only history: record_metric() adds one row per call, and
-- cleanup_old_metrics() removes rows older than 7 days.
CREATE TABLE IF NOT EXISTS system_metrics (
id SERIAL PRIMARY KEY,
metric_name VARCHAR(100) NOT NULL,
metric_value NUMERIC NOT NULL,
labels JSONB DEFAULT '{}',
recorded_at TIMESTAMPTZ DEFAULT NOW()
);
-- Per-metric history lookups (newest first) and time-based scans
CREATE INDEX IF NOT EXISTS idx_metrics_name_time ON system_metrics(metric_name, recorded_at DESC);
CREATE INDEX IF NOT EXISTS idx_metrics_recorded_at ON system_metrics(recorded_at DESC);
-- Metrics snapshot (current state, updated continuously).
-- Exactly one row per metric name (metric_name is the primary key);
-- record_metric() upserts the latest value and labels here.
CREATE TABLE IF NOT EXISTS system_metrics_current (
metric_name VARCHAR(100) PRIMARY KEY,
metric_value NUMERIC NOT NULL,
labels JSONB DEFAULT '{}',
updated_at TIMESTAMPTZ DEFAULT NOW()
);
-- Error buckets for classification.
-- One row per recorded error; rows are inserted by record_error().
CREATE TABLE IF NOT EXISTS error_buckets (
id SERIAL PRIMARY KEY,
error_type VARCHAR(50) NOT NULL,
error_message TEXT,
-- Free-form pointer to the row the error relates to (table name + id as text)
source_table VARCHAR(50),
source_id TEXT,
-- Plain integer: no foreign key to dispensaries is declared here
dispensary_id INTEGER,
-- NOTE(review): record_error() does not populate this column -- confirm who does
state_code VARCHAR(2),
context JSONB DEFAULT '{}',
occurred_at TIMESTAMPTZ DEFAULT NOW(),
-- Acknowledgement tracking
acknowledged BOOLEAN DEFAULT FALSE,
acknowledged_at TIMESTAMPTZ,
acknowledged_by VARCHAR(100)
);
CREATE INDEX IF NOT EXISTS idx_error_buckets_type ON error_buckets(error_type);
CREATE INDEX IF NOT EXISTS idx_error_buckets_occurred ON error_buckets(occurred_at DESC);
-- Partial index: only rows that have not been acknowledged yet
CREATE INDEX IF NOT EXISTS idx_error_buckets_unacked ON error_buckets(acknowledged) WHERE acknowledged = FALSE;
-- ============================================================
-- INTEGRITY CHECK RESULTS
-- ============================================================
-- One row per integrity-check execution. The per-check rows in
-- integrity_check_results reference this table through run_id.
CREATE TABLE IF NOT EXISTS integrity_check_runs (
id SERIAL PRIMARY KEY,
-- Unique run identifier; this is the column integrity_check_results.run_id references
run_id UUID DEFAULT gen_random_uuid() UNIQUE NOT NULL,
check_type VARCHAR(50) NOT NULL, -- daily, on_demand, scheduled
triggered_by VARCHAR(100),
started_at TIMESTAMPTZ DEFAULT NOW(),
finished_at TIMESTAMPTZ,
status VARCHAR(20) DEFAULT 'running', -- running, completed, failed
-- Results summary
total_checks INTEGER DEFAULT 0,
passed_checks INTEGER DEFAULT 0,
failed_checks INTEGER DEFAULT 0,
warning_checks INTEGER DEFAULT 0,
-- Detailed results
results JSONB DEFAULT '[]'::jsonb,
created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_integrity_runs_status ON integrity_check_runs(status);
CREATE INDEX IF NOT EXISTS idx_integrity_runs_started ON integrity_check_runs(started_at DESC);
-- Individual integrity check results: one row per check executed within a run.
CREATE TABLE IF NOT EXISTS integrity_check_results (
id SERIAL PRIMARY KEY,
-- Parent run; results are removed together with their run (ON DELETE CASCADE).
-- The column is nullable, so a result row without a run is permitted.
run_id UUID REFERENCES integrity_check_runs(run_id) ON DELETE CASCADE,
check_name VARCHAR(100) NOT NULL,
check_category VARCHAR(50) NOT NULL,
status VARCHAR(20) NOT NULL, -- passed, failed, warning, skipped
-- Check details
expected_value TEXT,
actual_value TEXT,
difference TEXT,
affected_count INTEGER DEFAULT 0,
-- Context
details JSONB DEFAULT '{}',
affected_ids JSONB DEFAULT '[]'::jsonb,
-- Remediation
can_auto_fix BOOLEAN DEFAULT FALSE,
-- NOTE(review): presumably matches auto_fix_runs.routine_name -- confirm
fix_routine VARCHAR(100),
checked_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_integrity_results_run ON integrity_check_results(run_id);
CREATE INDEX IF NOT EXISTS idx_integrity_results_status ON integrity_check_results(status);
-- ============================================================
-- AUTO-FIX AUDIT LOG
-- ============================================================
-- Audit log of auto-fix routine executions: one row per run, recording what
-- was changed (or would have been changed, for dry runs).
CREATE TABLE IF NOT EXISTS auto_fix_runs (
id SERIAL PRIMARY KEY,
run_id UUID DEFAULT gen_random_uuid() UNIQUE NOT NULL,
routine_name VARCHAR(100) NOT NULL,
triggered_by VARCHAR(100) NOT NULL,
trigger_type VARCHAR(20) NOT NULL, -- manual, auto, scheduled
started_at TIMESTAMPTZ DEFAULT NOW(),
finished_at TIMESTAMPTZ,
status VARCHAR(20) DEFAULT 'running', -- running, completed, failed, rolled_back
-- What was changed
rows_affected INTEGER DEFAULT 0,
changes JSONB DEFAULT '[]'::jsonb,
-- Dry run support
is_dry_run BOOLEAN DEFAULT FALSE,
dry_run_preview JSONB,
-- Error handling
error_message TEXT,
created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_fix_runs_routine ON auto_fix_runs(routine_name);
CREATE INDEX IF NOT EXISTS idx_fix_runs_started ON auto_fix_runs(started_at DESC);
-- ============================================================
-- ALERTS TABLE
-- ============================================================
-- System alerts. Rows are created/updated by upsert_alert(), which collapses
-- repeated occurrences of the same alert into a single active row.
CREATE TABLE IF NOT EXISTS system_alerts (
id SERIAL PRIMARY KEY,
alert_type VARCHAR(50) NOT NULL,
severity VARCHAR(20) NOT NULL, -- info, warning, error, critical
title VARCHAR(255) NOT NULL,
message TEXT,
source VARCHAR(100),
-- Context
context JSONB DEFAULT '{}',
-- State
status VARCHAR(20) DEFAULT 'active', -- active, acknowledged, resolved, muted
acknowledged_at TIMESTAMPTZ,
acknowledged_by VARCHAR(100),
resolved_at TIMESTAMPTZ,
resolved_by VARCHAR(100),
-- Deduplication
-- upsert_alert() stores md5(type || title || source) here. The column has no
-- unique constraint; deduplication relies on the lookup done in that function.
fingerprint VARCHAR(64), -- Hash for dedup
occurrence_count INTEGER DEFAULT 1,
first_occurred_at TIMESTAMPTZ DEFAULT NOW(),
last_occurred_at TIMESTAMPTZ DEFAULT NOW(),
created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_alerts_status ON system_alerts(status);
CREATE INDEX IF NOT EXISTS idx_alerts_severity ON system_alerts(severity);
CREATE INDEX IF NOT EXISTS idx_alerts_type ON system_alerts(alert_type);
CREATE INDEX IF NOT EXISTS idx_alerts_fingerprint ON system_alerts(fingerprint);
-- Partial index covering only active alerts, newest first
CREATE INDEX IF NOT EXISTS idx_alerts_active ON system_alerts(status, created_at DESC) WHERE status = 'active';
-- ============================================================
-- HELPER VIEWS
-- ============================================================
-- Current sync status view.
-- Returns the singleton orchestrator row (id = 1) together with live backlog
-- counters (unprocessed payloads, pending DLQ entries, active alerts) and a
-- JSON summary of the sync runs started in the last 24 hours.
CREATE OR REPLACE VIEW v_sync_status AS
SELECT
    orch.status AS orchestrator_status,
    orch.current_worker_id,
    orch.last_heartbeat_at,
    orch.is_paused,
    orch.pause_reason,
    orch.consecutive_failures,
    orch.last_run_started_at,
    orch.last_run_completed_at,
    orch.last_run_duration_ms,
    orch.last_run_payloads_processed,
    orch.last_run_errors,
    orch.config,
    (
        SELECT COUNT(*)
        FROM raw_payloads AS rp
        WHERE rp.processed = FALSE
    ) AS unprocessed_payloads,
    (
        SELECT COUNT(*)
        FROM raw_payloads_dlq AS dlq
        WHERE dlq.status = 'pending'
    ) AS dlq_pending,
    (
        SELECT COUNT(*)
        FROM system_alerts AS sa
        WHERE sa.status = 'active'
    ) AS active_alerts,
    (
        SELECT json_build_object(
            'total', COUNT(*),
            'completed', COUNT(*) FILTER (WHERE sr.status = 'completed'),
            'failed', COUNT(*) FILTER (WHERE sr.status = 'failed')
        )
        FROM sync_runs AS sr
        WHERE sr.started_at >= NOW() - INTERVAL '24 hours'
    ) AS runs_24h
FROM sync_orchestrator_state AS orch
WHERE orch.id = 1;
-- DLQ summary view.
-- Groups dead-letter entries by (status, last_error_type) and reports how many
-- there are plus the oldest and newest arrival time, largest group first.
CREATE OR REPLACE VIEW v_dlq_summary AS
SELECT
    dlq.status,
    dlq.last_error_type,
    COUNT(*) AS count,
    MIN(dlq.moved_to_dlq_at) AS oldest,
    MAX(dlq.moved_to_dlq_at) AS newest
FROM raw_payloads_dlq AS dlq
GROUP BY
    dlq.status,
    dlq.last_error_type
ORDER BY COUNT(*) DESC;
-- Error bucket summary (last 24h).
-- Per error type: total count, how many are still unacknowledged, and the
-- first/last occurrence within the window. Most frequent type first.
CREATE OR REPLACE VIEW v_error_summary AS
SELECT
    eb.error_type,
    COUNT(*) AS count,
    COUNT(*) FILTER (WHERE eb.acknowledged = FALSE) AS unacknowledged,
    MIN(eb.occurred_at) AS first_occurred,
    MAX(eb.occurred_at) AS last_occurred
FROM error_buckets AS eb
WHERE eb.occurred_at >= NOW() - INTERVAL '24 hours'
GROUP BY eb.error_type
ORDER BY COUNT(*) DESC;
-- Metrics summary view.
-- Lists every current metric with its labels and how long ago it was last
-- updated (age = now - updated_at), ordered by metric name.
CREATE OR REPLACE VIEW v_metrics_summary AS
SELECT
    cur.metric_name,
    cur.metric_value,
    cur.labels,
    cur.updated_at,
    NOW() - cur.updated_at AS age
FROM system_metrics_current AS cur
ORDER BY cur.metric_name;
-- ============================================================
-- HELPER FUNCTIONS
-- ============================================================
-- Record a metric.
-- Appends (p_name, p_value, p_labels) to the system_metrics time series and
-- makes the same value the current one in system_metrics_current (insert, or
-- overwrite value/labels/updated_at when the metric already exists).
CREATE OR REPLACE FUNCTION record_metric(
    p_name VARCHAR(100),
    p_value NUMERIC,
    p_labels JSONB DEFAULT '{}'
) RETURNS VOID AS $$
BEGIN
    -- Both writes happen in one statement: the data-modifying CTE adds the
    -- history row, the main INSERT upserts the current value.
    WITH history_row AS (
        INSERT INTO system_metrics (metric_name, metric_value, labels)
        VALUES (p_name, p_value, p_labels)
    )
    INSERT INTO system_metrics_current (metric_name, metric_value, labels, updated_at)
    VALUES (p_name, p_value, p_labels, NOW())
    ON CONFLICT (metric_name) DO UPDATE
        SET metric_value = EXCLUDED.metric_value,
            labels = EXCLUDED.labels,
            updated_at = NOW();
END;
$$ LANGUAGE plpgsql;
-- Record an error.
-- Inserts one row into error_buckets and bumps the per-type counter metric
-- 'error_count_<p_type>' by one.
--
-- Parameters:
--   p_type          error classification (stored in error_buckets.error_type)
--   p_message       error text
--   p_source_table  optional name of the table the error relates to
--   p_source_id     optional id (as text) of the related row
--   p_dispensary_id optional dispensary the error relates to
--   p_context       optional JSON context (defaults to an empty object)
-- Returns: id of the new error_buckets row.
CREATE OR REPLACE FUNCTION record_error(
    p_type VARCHAR(50),
    p_message TEXT,
    p_source_table VARCHAR(50) DEFAULT NULL,
    p_source_id TEXT DEFAULT NULL,
    p_dispensary_id INTEGER DEFAULT NULL,
    p_context JSONB DEFAULT '{}'
) RETURNS INTEGER AS $$
DECLARE
    v_id INTEGER;
    v_metric_name TEXT := 'error_count_' || p_type;
    v_total NUMERIC;
BEGIN
    INSERT INTO error_buckets (
        error_type, error_message, source_table, source_id,
        dispensary_id, context
    )
    VALUES (
        p_type, p_message, p_source_table, p_source_id,
        p_dispensary_id, p_context
    )
    RETURNING id INTO v_id;

    -- Update error count metric.
    -- The increment is done inside the upsert itself so that concurrent callers
    -- cannot both read the same old value and lose an increment (the previous
    -- "SELECT current value, then write value + 1" sequence was not atomic).
    INSERT INTO system_metrics_current AS cur (metric_name, metric_value, labels, updated_at)
    VALUES (v_metric_name, 1, '{}'::jsonb, NOW())
    ON CONFLICT (metric_name) DO UPDATE
        SET metric_value = cur.metric_value + 1,
            labels = EXCLUDED.labels,
            updated_at = NOW()
    RETURNING cur.metric_value INTO v_total;

    -- Keep the time series in step with the current value, exactly as
    -- record_metric() would have done for this counter.
    INSERT INTO system_metrics (metric_name, metric_value, labels)
    VALUES (v_metric_name, v_total, '{}'::jsonb);

    RETURN v_id;
END;
$$ LANGUAGE plpgsql;
-- Create or update alert (with deduplication).
-- Alerts are identified by a fingerprint of (type, title, source). If an
-- active alert with the same fingerprint exists, its occurrence counter,
-- last_occurred_at and context are refreshed; otherwise a new alert is created.
--
-- Parameters:
--   p_type     alert type
--   p_severity severity, used only when a new alert is created
--   p_title    alert title
--   p_message  optional message, used only when a new alert is created
--   p_source   optional source name (part of the fingerprint)
--   p_context  optional JSON context; replaces the stored context on update
-- Returns: id of the alert that was updated or created.
CREATE OR REPLACE FUNCTION upsert_alert(
    p_type VARCHAR(50),
    p_severity VARCHAR(20),
    p_title VARCHAR(255),
    p_message TEXT DEFAULT NULL,
    p_source VARCHAR(100) DEFAULT NULL,
    p_context JSONB DEFAULT '{}'
) RETURNS INTEGER AS $$
DECLARE
    v_fingerprint VARCHAR(64);
    v_id INTEGER;
BEGIN
    -- Generate fingerprint for dedup
    v_fingerprint := md5(p_type || p_title || COALESCE(p_source, ''));

    -- Serialize callers that work on the same fingerprint until the end of the
    -- transaction. system_alerts has no unique constraint on fingerprint, so
    -- without this two concurrent calls could both miss the lookup below and
    -- insert duplicate active alerts.
    PERFORM pg_advisory_xact_lock(hashtext(v_fingerprint));

    -- Try to find existing active alert (oldest one if several already exist,
    -- so the choice is deterministic)
    SELECT id INTO v_id
    FROM system_alerts
    WHERE fingerprint = v_fingerprint
      AND status = 'active'
    ORDER BY id
    LIMIT 1;

    IF FOUND THEN
        -- Update existing alert
        UPDATE system_alerts
        SET occurrence_count = occurrence_count + 1,
            last_occurred_at = NOW(),
            context = p_context
        WHERE id = v_id;
    ELSE
        -- Create new alert
        INSERT INTO system_alerts (
            alert_type, severity, title, message, source, context, fingerprint
        )
        VALUES (
            p_type, p_severity, p_title, p_message, p_source, p_context, v_fingerprint
        )
        RETURNING id INTO v_id;
    END IF;

    RETURN v_id;
END;
$$ LANGUAGE plpgsql;
-- Move payload to DLQ.
-- Copies the raw_payloads row identified by p_payload_id into raw_payloads_dlq,
-- marks the original as processed, bumps the 'payloads_dlq_total' metric and
-- raises (or refreshes) a DLQ_ARRIVAL alert.
--
-- Parameters:
--   p_payload_id    id of the raw_payloads row to move
--   p_error_type    classification of the failure
--   p_error_message failure text
-- Returns: id of the new raw_payloads_dlq row.
-- Raises: an exception ('Payload not found: <id>') when no such payload exists.
CREATE OR REPLACE FUNCTION move_to_dlq(
    p_payload_id UUID,
    p_error_type VARCHAR(50),
    p_error_message TEXT
) RETURNS UUID AS $$
DECLARE
    v_dlq_id UUID;
    v_payload RECORD;
    v_history JSONB := '[]'::jsonb;
BEGIN
    -- Get the original payload
    SELECT * INTO v_payload
    FROM raw_payloads
    WHERE id = p_payload_id;

    IF NOT FOUND THEN
        RAISE EXCEPTION 'Payload not found: %', p_payload_id;
    END IF;

    -- Build the error history as a JSON array.
    -- hydration_error may hold plain text (this function itself writes
    -- 'Moved to DLQ: ...' into it below), and casting such text to jsonb raises
    -- an error. Parse it when it is valid JSON, otherwise keep the text as a
    -- message entry, so a payload with an earlier error can still be moved.
    IF v_payload.hydration_error IS NOT NULL THEN
        BEGIN
            v_history := v_payload.hydration_error::jsonb;
            -- A single JSON value (e.g. one error object) becomes a one-element
            -- array so the new entry is appended rather than merged into it.
            IF jsonb_typeof(v_history) <> 'array' THEN
                v_history := jsonb_build_array(v_history);
            END IF;
        EXCEPTION
            WHEN data_exception THEN
                v_history := jsonb_build_array(
                    jsonb_build_object('message', v_payload.hydration_error)
                );
        END;
    END IF;

    v_history := v_history || jsonb_build_object(
        'type', p_error_type,
        'message', p_error_message,
        'at', NOW()
    );

    -- Insert into DLQ
    INSERT INTO raw_payloads_dlq (
        original_payload_id, dispensary_id, state_code, platform,
        raw_json, product_count, pricing_type, crawl_mode,
        failure_count, last_error_type, last_error_message, last_error_at,
        error_history
    )
    VALUES (
        p_payload_id, v_payload.dispensary_id,
        (SELECT state FROM dispensaries WHERE id = v_payload.dispensary_id),
        v_payload.platform,
        v_payload.raw_json, v_payload.product_count, v_payload.pricing_type, v_payload.crawl_mode,
        v_payload.hydration_attempts,
        p_error_type, p_error_message, NOW(),
        v_history
    )
    RETURNING id INTO v_dlq_id;

    -- Mark original as processed (moved to DLQ)
    UPDATE raw_payloads
    SET processed = TRUE,
        hydration_error = 'Moved to DLQ: ' || p_error_message
    WHERE id = p_payload_id;

    -- Record metric
    PERFORM record_metric('payloads_dlq_total',
        COALESCE((SELECT metric_value FROM system_metrics_current WHERE metric_name = 'payloads_dlq_total'), 0) + 1
    );

    -- Create alert for DLQ
    PERFORM upsert_alert(
        'DLQ_ARRIVAL',
        'warning',
        'Payload moved to Dead-Letter Queue',
        p_error_message,
        'hydration',
        jsonb_build_object('payload_id', p_payload_id, 'dlq_id', v_dlq_id, 'error_type', p_error_type)
    );

    RETURN v_dlq_id;
END;
$$ LANGUAGE plpgsql;
-- Cleanup old metrics (keep 7 days of time series).
-- Deletes system_metrics rows recorded more than 7 days ago and returns the
-- number of rows removed. system_metrics_current is not touched.
CREATE OR REPLACE FUNCTION cleanup_old_metrics() RETURNS INTEGER AS $$
DECLARE
    v_removed INTEGER;
BEGIN
    WITH purged AS (
        DELETE FROM system_metrics
        WHERE recorded_at < NOW() - INTERVAL '7 days'
        RETURNING 1
    )
    SELECT COUNT(*) INTO v_removed
    FROM purged;

    RETURN v_removed;
END;
$$ LANGUAGE plpgsql;
-- ============================================================
-- ENSURE RAW_PAYLOADS HAS REQUIRED COLUMNS
-- ============================================================
-- Add state_code column to raw_payloads if it does not exist yet.
-- ADD COLUMN IF NOT EXISTS checks the table actually being altered. The previous
-- information_schema lookup did not filter on table_schema, so a raw_payloads
-- table with a state_code column in any other schema would have caused the
-- column to be skipped here.
ALTER TABLE raw_payloads ADD COLUMN IF NOT EXISTS state_code VARCHAR(2);
-- ============================================================
-- INITIAL METRICS
-- ============================================================
-- Initialize core metrics.
-- Seeds the current-value table with a starting value for each core metric
-- (empty labels). Metrics that already exist are left untouched, so the
-- statement is safe to re-run.
INSERT INTO system_metrics_current (metric_name, metric_value, labels)
SELECT
    seed.metric_name,
    seed.metric_value,
    '{}'::jsonb
FROM (
    VALUES
        ('payloads_unprocessed', 0),
        ('payloads_processed_today', 0),
        ('hydration_errors', 0),
        ('hydration_success_rate', 100),
        ('canonical_rows_inserted', 0),
        ('canonical_rows_updated', 0),
        ('canonical_rows_discontinued', 0),
        ('snapshot_volume', 0),
        ('ingestion_latency_avg_ms', 0),
        ('payloads_dlq_total', 0)
) AS seed (metric_name, metric_value)
ON CONFLICT (metric_name) DO NOTHING;
-- ============================================================
-- COMMENTS
-- ============================================================
-- Catalog descriptions for the tables created by this migration.
-- The views and functions defined above are not given COMMENTs here.
COMMENT ON TABLE sync_orchestrator_state IS 'Singleton table tracking orchestrator status and config';
COMMENT ON TABLE sync_runs IS 'History of sync runs with metrics';
COMMENT ON TABLE raw_payloads_dlq IS 'Dead-letter queue for failed payloads';
COMMENT ON TABLE system_metrics IS 'Time-series metrics storage';
COMMENT ON TABLE system_metrics_current IS 'Current metric values (fast lookup)';
COMMENT ON TABLE error_buckets IS 'Classified errors for monitoring';
COMMENT ON TABLE integrity_check_runs IS 'Integrity check execution history';
COMMENT ON TABLE integrity_check_results IS 'Individual check results';
COMMENT ON TABLE auto_fix_runs IS 'Audit log for auto-fix routines';
COMMENT ON TABLE system_alerts IS 'System alerts with deduplication';

View File

@@ -0,0 +1,750 @@
-- ============================================================================
-- Migration 050: CannaiQ Canonical Schema v2
-- ============================================================================
--
-- Purpose: Add canonical tables for multi-state analytics, pricing engine,
-- promotions, intelligence, and brand/buyer portals.
--
-- RULES:
-- - STRICTLY ADDITIVE (no DROP, DELETE, TRUNCATE, or ALTER column type)
-- - All new tables use IF NOT EXISTS
-- - All new columns use ADD COLUMN IF NOT EXISTS
-- - All indexes use IF NOT EXISTS
-- - Compatible with existing dutchie_products, dispensaries, etc.
--
-- Run with:
-- psql $CANNAIQ_DB_URL -f migrations/050_cannaiq_canonical_v2.sql
--
-- ============================================================================
-- ============================================================================
-- SECTION 1: STATES TABLE
-- ============================================================================
-- Reference table for US states. It may already exist from 041/043, in which
-- case CREATE TABLE IF NOT EXISTS leaves the existing definition unchanged.
-- This section is idempotent.
CREATE TABLE IF NOT EXISTS states (
id SERIAL PRIMARY KEY,
code VARCHAR(2) NOT NULL UNIQUE,
name VARCHAR(100) NOT NULL,
timezone VARCHAR(50) DEFAULT 'America/Phoenix',
is_active BOOLEAN DEFAULT TRUE,
crawl_enabled BOOLEAN DEFAULT TRUE,
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW()
);
-- Insert states that are missing. For codes that already exist, only timezone
-- and updated_at are overwritten; name, is_active and crawl_enabled are kept.
-- NOTE(review): ON CONFLICT (code) needs a unique index on code; if the table
-- came from an earlier migration, confirm that constraint exists there too.
INSERT INTO states (code, name, timezone) VALUES
('AZ', 'Arizona', 'America/Phoenix'),
('CA', 'California', 'America/Los_Angeles'),
('CO', 'Colorado', 'America/Denver'),
('FL', 'Florida', 'America/New_York'),
('IL', 'Illinois', 'America/Chicago'),
('MA', 'Massachusetts', 'America/New_York'),
('MD', 'Maryland', 'America/New_York'),
('MI', 'Michigan', 'America/Detroit'),
('MO', 'Missouri', 'America/Chicago'),
('NV', 'Nevada', 'America/Los_Angeles'),
('NJ', 'New Jersey', 'America/New_York'),
('NY', 'New York', 'America/New_York'),
('OH', 'Ohio', 'America/New_York'),
('OK', 'Oklahoma', 'America/Chicago'),
('OR', 'Oregon', 'America/Los_Angeles'),
('PA', 'Pennsylvania', 'America/New_York'),
('WA', 'Washington', 'America/Los_Angeles')
ON CONFLICT (code) DO UPDATE SET
timezone = EXCLUDED.timezone,
updated_at = NOW();
-- NOTE(review): when code carries the UNIQUE constraint declared above, that
-- constraint already provides an index on code and idx_states_code duplicates it.
CREATE INDEX IF NOT EXISTS idx_states_code ON states(code);
CREATE INDEX IF NOT EXISTS idx_states_active ON states(is_active) WHERE is_active = TRUE;
COMMENT ON TABLE states IS 'US states where CannaiQ operates. Single source of truth for state configuration.';
-- ============================================================================
-- SECTION 2: CHAINS TABLE (Retail Groups)
-- ============================================================================
-- Chains are multi-location operators like Curaleaf, Trulieve, Harvest, etc.
-- dispensaries.chain_id points at this table.
CREATE TABLE IF NOT EXISTS chains (
    id SERIAL PRIMARY KEY,
    name VARCHAR(255) NOT NULL,
    -- Unique identifier string; the UNIQUE constraint creates its own index.
    slug VARCHAR(255) NOT NULL UNIQUE,
    -- Branding
    website_url TEXT,
    logo_url TEXT,
    description TEXT,
    -- Business info
    headquarters_city VARCHAR(100),
    headquarters_state_id INTEGER REFERENCES states(id),
    founded_year INTEGER,
    -- Status
    is_active BOOLEAN DEFAULT TRUE,
    is_public BOOLEAN DEFAULT FALSE, -- Publicly traded?
    stock_ticker VARCHAR(10),
    -- Metadata
    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW()
);
-- No separate index on slug: the UNIQUE constraint above is already backed by a
-- unique index, so an additional plain index would only add write overhead.
CREATE INDEX IF NOT EXISTS idx_chains_active ON chains(is_active) WHERE is_active = TRUE;
COMMENT ON TABLE chains IS 'Retail chains/groups that own multiple dispensary locations.';
-- ============================================================================
-- SECTION 3: CANONICAL BRANDS TABLE
-- ============================================================================
-- This is the master brand catalog across all providers and states.
-- Distinct from the per-store `brands` table which tracks store-level brand presence.
CREATE TABLE IF NOT EXISTS canonical_brands (
    id SERIAL PRIMARY KEY,
    name VARCHAR(255) NOT NULL,
    -- Unique identifier string; the UNIQUE constraint creates its own index.
    slug VARCHAR(255) NOT NULL UNIQUE,
    -- External IDs from various platforms
    dutchie_brand_id VARCHAR(100),
    jane_brand_id VARCHAR(100),
    treez_brand_id VARCHAR(100),
    weedmaps_brand_id VARCHAR(100),
    -- Branding
    logo_url TEXT,
    local_logo_path TEXT, -- Local storage path
    website_url TEXT,
    instagram_handle VARCHAR(100),
    description TEXT,
    -- Classification
    is_portfolio_brand BOOLEAN DEFAULT FALSE, -- TRUE if brand we represent
    is_house_brand BOOLEAN DEFAULT FALSE, -- TRUE if dispensary house brand
    parent_company VARCHAR(255), -- Parent company name if subsidiary
    -- State presence
    states_available TEXT[], -- Array of state codes where brand is present
    -- Status
    is_active BOOLEAN DEFAULT TRUE,
    is_verified BOOLEAN DEFAULT FALSE, -- Manually verified brand info
    verified_at TIMESTAMPTZ,
    -- Metadata
    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW()
);
-- No separate index on slug: the UNIQUE constraint above is already backed by a
-- unique index, so an additional plain index would only add write overhead.
CREATE INDEX IF NOT EXISTS idx_canonical_brands_dutchie ON canonical_brands(dutchie_brand_id) WHERE dutchie_brand_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_canonical_brands_portfolio ON canonical_brands(is_portfolio_brand) WHERE is_portfolio_brand = TRUE;
-- GIN index on the state-code array (supports array containment/overlap lookups)
CREATE INDEX IF NOT EXISTS idx_canonical_brands_states ON canonical_brands USING GIN(states_available);
COMMENT ON TABLE canonical_brands IS 'Canonical brand catalog across all providers. Master brand reference.';
COMMENT ON COLUMN canonical_brands.is_portfolio_brand IS 'TRUE if this is a brand CannaiQ represents/manages.';
-- ============================================================================
-- SECTION 4: CRAWL_RUNS TABLE
-- ============================================================================
-- One record per crawl execution. Links to snapshots:
-- store_product_snapshots.crawl_run_id references this table.
CREATE TABLE IF NOT EXISTS crawl_runs (
id SERIAL PRIMARY KEY,
-- Crawl runs are removed together with their dispensary (ON DELETE CASCADE)
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
state_id INTEGER REFERENCES states(id),
-- Provider info
provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',
-- Timing
started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
finished_at TIMESTAMPTZ,
duration_ms INTEGER,
-- Status
status VARCHAR(20) NOT NULL DEFAULT 'running', -- running, success, failed, partial
error_code VARCHAR(50),
error_message TEXT,
http_status INTEGER,
-- Results
products_found INTEGER DEFAULT 0,
products_new INTEGER DEFAULT 0,
products_updated INTEGER DEFAULT 0,
products_missing INTEGER DEFAULT 0, -- Products gone from feed
snapshots_written INTEGER DEFAULT 0,
-- Infrastructure
worker_id VARCHAR(100),
worker_hostname VARCHAR(100),
proxy_used TEXT,
trigger_type VARCHAR(50) DEFAULT 'scheduled', -- scheduled, manual, api
-- Metadata
metadata JSONB DEFAULT '{}',
created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary ON crawl_runs(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_state ON crawl_runs(state_id) WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_crawl_runs_status ON crawl_runs(status);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_started ON crawl_runs(started_at DESC);
-- Composite index for "runs of one dispensary, newest first"
CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary_started ON crawl_runs(dispensary_id, started_at DESC);
COMMENT ON TABLE crawl_runs IS 'Each crawl execution. Links to snapshots and traces.';
-- ============================================================================
-- SECTION 5: STORE_PRODUCTS TABLE (Current Menu State)
-- ============================================================================
-- Canonical representation of what's currently on the menu.
-- Provider-agnostic structure for analytics.
-- A product is identified by (dispensary_id, provider, provider_product_id);
-- see the UNIQUE constraint at the end of the column list.
CREATE TABLE IF NOT EXISTS store_products (
id SERIAL PRIMARY KEY,
-- Products are removed together with their dispensary (ON DELETE CASCADE)
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
state_id INTEGER REFERENCES states(id),
-- Links to canonical entities (set to NULL if the referenced row is deleted).
-- The categories table is not created in this section and must already exist.
canonical_brand_id INTEGER REFERENCES canonical_brands(id) ON DELETE SET NULL,
category_id INTEGER REFERENCES categories(id) ON DELETE SET NULL,
-- Provider-specific identifiers
provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',
provider_product_id VARCHAR(100) NOT NULL, -- Platform product ID
provider_brand_id VARCHAR(100), -- Platform brand ID
enterprise_product_id VARCHAR(100), -- Cross-store product ID
-- Raw data from platform (not normalized)
name VARCHAR(500) NOT NULL,
brand_name VARCHAR(255),
category VARCHAR(100),
subcategory VARCHAR(100),
strain_type VARCHAR(50),
description TEXT,
-- Pricing (current)
price_rec NUMERIC(10,2),
price_med NUMERIC(10,2),
price_rec_special NUMERIC(10,2),
price_med_special NUMERIC(10,2),
is_on_special BOOLEAN DEFAULT FALSE,
special_name TEXT,
discount_percent NUMERIC(5,2),
price_unit VARCHAR(20) DEFAULT 'each', -- gram, ounce, each, mg
-- Inventory
is_in_stock BOOLEAN DEFAULT TRUE,
stock_quantity INTEGER,
stock_status VARCHAR(50) DEFAULT 'in_stock', -- in_stock, out_of_stock, low_stock, missing_from_feed
-- Potency
thc_percent NUMERIC(5,2),
cbd_percent NUMERIC(5,2),
thc_mg NUMERIC(10,2),
cbd_mg NUMERIC(10,2),
-- Weight/Size
weight_value NUMERIC(10,2),
weight_unit VARCHAR(20), -- g, oz, mg
-- Images
image_url TEXT,
local_image_path TEXT,
thumbnail_url TEXT,
-- Flags
is_featured BOOLEAN DEFAULT FALSE,
medical_only BOOLEAN DEFAULT FALSE,
rec_only BOOLEAN DEFAULT FALSE,
-- Menu position (for tracking prominence)
menu_position INTEGER,
-- Timestamps
first_seen_at TIMESTAMPTZ DEFAULT NOW(),
last_seen_at TIMESTAMPTZ DEFAULT NOW(),
last_price_change_at TIMESTAMPTZ,
last_stock_change_at TIMESTAMPTZ,
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW(),
-- One row per product per provider per dispensary
UNIQUE(dispensary_id, provider, provider_product_id)
);
CREATE INDEX IF NOT EXISTS idx_store_products_dispensary ON store_products(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_store_products_state ON store_products(state_id) WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_store_products_brand ON store_products(canonical_brand_id) WHERE canonical_brand_id IS NOT NULL;
-- Indexes the raw category text column (not category_id)
CREATE INDEX IF NOT EXISTS idx_store_products_category ON store_products(category) WHERE category IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_store_products_in_stock ON store_products(dispensary_id, is_in_stock);
CREATE INDEX IF NOT EXISTS idx_store_products_special ON store_products(dispensary_id, is_on_special) WHERE is_on_special = TRUE;
CREATE INDEX IF NOT EXISTS idx_store_products_last_seen ON store_products(last_seen_at DESC);
CREATE INDEX IF NOT EXISTS idx_store_products_provider ON store_products(provider);
CREATE INDEX IF NOT EXISTS idx_store_products_enterprise ON store_products(enterprise_product_id) WHERE enterprise_product_id IS NOT NULL;
COMMENT ON TABLE store_products IS 'Current state of products on each dispensary menu. Provider-agnostic.';
-- ============================================================================
-- SECTION 6: STORE_PRODUCT_SNAPSHOTS TABLE (Historical Data)
-- ============================================================================
-- Time-series data for analytics. One row per product per crawl.
-- CRITICAL: NEVER DELETE from this table.
-- NOTE(review): dispensary_id below is declared ON DELETE CASCADE, so deleting a
-- dispensary row deletes all of its snapshots, which conflicts with the rule
-- above. Confirm whether this is intended or whether the FK action should change.
-- NOTE(review): id is SERIAL (32-bit integer); with one row per product per crawl
-- and no deletes, confirm the expected volume stays within that range.
CREATE TABLE IF NOT EXISTS store_product_snapshots (
id SERIAL PRIMARY KEY,
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
-- Snapshot rows survive deletion of the product or crawl run they point to
-- (the reference is set to NULL instead)
store_product_id INTEGER REFERENCES store_products(id) ON DELETE SET NULL,
state_id INTEGER REFERENCES states(id),
-- Provider info
provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',
provider_product_id VARCHAR(100),
-- Link to crawl run
crawl_run_id INTEGER REFERENCES crawl_runs(id) ON DELETE SET NULL,
-- Capture timestamp
captured_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
-- Raw data from platform
name VARCHAR(500),
brand_name VARCHAR(255),
category VARCHAR(100),
subcategory VARCHAR(100),
-- Pricing at time of capture
price_rec NUMERIC(10,2),
price_med NUMERIC(10,2),
price_rec_special NUMERIC(10,2),
price_med_special NUMERIC(10,2),
is_on_special BOOLEAN DEFAULT FALSE,
discount_percent NUMERIC(5,2),
-- Inventory at time of capture
is_in_stock BOOLEAN DEFAULT TRUE,
stock_quantity INTEGER,
stock_status VARCHAR(50) DEFAULT 'in_stock',
is_present_in_feed BOOLEAN DEFAULT TRUE, -- FALSE = missing from feed
-- Potency at time of capture
thc_percent NUMERIC(5,2),
cbd_percent NUMERIC(5,2),
-- Menu position (for tracking prominence changes)
menu_position INTEGER,
-- Image URL at time of capture
image_url TEXT,
-- Full raw response for debugging
raw_data JSONB,
created_at TIMESTAMPTZ DEFAULT NOW()
);
-- Partitioning-ready indexes (for future table partitioning by month).
-- The table itself is not partitioned by this migration.
CREATE INDEX IF NOT EXISTS idx_snapshots_dispensary_captured ON store_product_snapshots(dispensary_id, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_state_captured ON store_product_snapshots(state_id, captured_at DESC) WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_product_captured ON store_product_snapshots(store_product_id, captured_at DESC) WHERE store_product_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_crawl_run ON store_product_snapshots(crawl_run_id) WHERE crawl_run_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_captured_at ON store_product_snapshots(captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_brand ON store_product_snapshots(brand_name) WHERE brand_name IS NOT NULL;
COMMENT ON TABLE store_product_snapshots IS 'Historical crawl data. One row per product per crawl. NEVER DELETE.';
-- ============================================================================
-- SECTION 7: ADD state_id AND chain_id TO DISPENSARIES
-- ============================================================================
-- Link dispensaries to states and chains tables.
-- Both columns are nullable foreign keys, added only if they do not exist yet.
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS state_id INTEGER REFERENCES states(id);
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS chain_id INTEGER REFERENCES chains(id);
CREATE INDEX IF NOT EXISTS idx_dispensaries_state_id ON dispensaries(state_id);
CREATE INDEX IF NOT EXISTS idx_dispensaries_chain_id ON dispensaries(chain_id) WHERE chain_id IS NOT NULL;
-- Backfill state_id from existing state column.
-- Only rows whose state_id is still NULL are touched, so re-running is harmless.
-- The match is an exact comparison of dispensaries.state with states.code; rows
-- whose state value does not equal a seeded code keep state_id = NULL.
UPDATE dispensaries d
SET state_id = s.id
FROM states s
WHERE d.state = s.code
AND d.state_id IS NULL;
COMMENT ON COLUMN dispensaries.state_id IS 'FK to states table. Canonical state reference.';
COMMENT ON COLUMN dispensaries.chain_id IS 'FK to chains table. NULL if independent dispensary.';
-- ============================================================================
-- SECTION 8: BRAND PENETRATION TABLE
-- ============================================================================
-- Pre-computed brand presence across stores for analytics dashboards.
-- brand_penetration: one row per (brand, state, calculated_at) holding
-- pre-aggregated presence, product and price figures. Rows are removed
-- automatically when the referenced brand or state is deleted (ON DELETE CASCADE).
CREATE TABLE IF NOT EXISTS brand_penetration (
    id                  SERIAL PRIMARY KEY,
    canonical_brand_id  INTEGER NOT NULL REFERENCES canonical_brands(id) ON DELETE CASCADE,
    state_id            INTEGER NOT NULL REFERENCES states(id) ON DELETE CASCADE,

    -- Store coverage
    stores_carrying     INTEGER DEFAULT 0,
    stores_total        INTEGER DEFAULT 0,
    penetration_pct     NUMERIC(5,2) DEFAULT 0,  -- presumably stores_carrying / stores_total * 100; confirm with the writer

    -- Product breakdown
    products_count      INTEGER DEFAULT 0,
    products_in_stock   INTEGER DEFAULT 0,
    products_on_special INTEGER DEFAULT 0,

    -- Price range
    avg_price           NUMERIC(10,2),
    min_price           NUMERIC(10,2),
    max_price           NUMERIC(10,2),

    -- When the row was computed and the window it covers
    calculated_at       TIMESTAMPTZ DEFAULT NOW(),
    period_start        TIMESTAMPTZ,
    period_end          TIMESTAMPTZ,

    UNIQUE (canonical_brand_id, state_id, calculated_at)
);

CREATE INDEX IF NOT EXISTS idx_brand_penetration_brand
    ON brand_penetration (canonical_brand_id);
CREATE INDEX IF NOT EXISTS idx_brand_penetration_state
    ON brand_penetration (state_id);
CREATE INDEX IF NOT EXISTS idx_brand_penetration_calculated
    ON brand_penetration (calculated_at DESC);

COMMENT ON TABLE brand_penetration
    IS 'Pre-computed brand penetration metrics by state.';
-- ============================================================================
-- SECTION 9: PRICE_ALERTS TABLE
-- ============================================================================
-- Track significant price changes for intelligence/alerts.
-- price_alerts: one row per detected price/special change on a store product.
-- Alerts are removed with their dispensary or store product (ON DELETE CASCADE).
CREATE TABLE IF NOT EXISTS price_alerts (
    id               SERIAL PRIMARY KEY,
    store_product_id INTEGER REFERENCES store_products(id) ON DELETE CASCADE,
    dispensary_id    INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
    state_id         INTEGER REFERENCES states(id),

    -- What changed
    alert_type       VARCHAR(50) NOT NULL, -- price_drop, price_increase, new_special, special_ended

    -- Values
    old_price        NUMERIC(10,2),
    new_price        NUMERIC(10,2),
    change_amount    NUMERIC(10,2),
    -- NUMERIC(5,2) tops out at 999.99 and raises "numeric field overflow" for
    -- larger values, e.g. a price moving from 1.00 to 15.00 (+1400%). Use the
    -- same precision as the price columns so any recorded change fits.
    change_percent   NUMERIC(10,2),

    -- Context (denormalized copy of the product at alert time)
    product_name     VARCHAR(500),
    brand_name       VARCHAR(255),
    category         VARCHAR(100),

    -- Processing status
    is_processed     BOOLEAN DEFAULT FALSE,
    processed_at     TIMESTAMPTZ,
    created_at       TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_price_alerts_dispensary
    ON price_alerts (dispensary_id);
CREATE INDEX IF NOT EXISTS idx_price_alerts_state
    ON price_alerts (state_id)
    WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_price_alerts_type
    ON price_alerts (alert_type);
-- Partial index: only rows still waiting to be processed are indexed.
CREATE INDEX IF NOT EXISTS idx_price_alerts_unprocessed
    ON price_alerts (is_processed)
    WHERE is_processed = FALSE;
CREATE INDEX IF NOT EXISTS idx_price_alerts_created
    ON price_alerts (created_at DESC);

COMMENT ON TABLE price_alerts
    IS 'Significant price changes for intelligence/alerting.';
-- ============================================================================
-- SECTION 10: RAW_PAYLOADS TABLE
-- ============================================================================
-- Store raw API responses for replay/debugging. Separate from snapshots.
-- raw_payloads: the untouched provider response for a dispensary, optionally
-- tied to the crawl run that fetched it. Deleting the crawl run only clears the
-- link (ON DELETE SET NULL); deleting the dispensary removes the payloads.
CREATE TABLE IF NOT EXISTS raw_payloads (
    id                 SERIAL PRIMARY KEY,
    dispensary_id      INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
    crawl_run_id       INTEGER REFERENCES crawl_runs(id) ON DELETE SET NULL,

    -- Where the payload came from and what it contains
    provider           VARCHAR(50) NOT NULL DEFAULT 'dutchie',
    payload_type       VARCHAR(50) NOT NULL DEFAULT 'products', -- products, brands, specials

    -- The raw response body
    payload            JSONB NOT NULL,
    payload_size_bytes INTEGER,

    -- Content hash used to spot duplicate payloads
    payload_hash       VARCHAR(64), -- SHA256 for deduplication

    -- Processing status
    is_processed       BOOLEAN DEFAULT FALSE,
    processed_at       TIMESTAMPTZ,
    captured_at        TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_raw_payloads_dispensary
    ON raw_payloads (dispensary_id, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_raw_payloads_crawl_run
    ON raw_payloads (crawl_run_id)
    WHERE crawl_run_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_raw_payloads_unprocessed
    ON raw_payloads (is_processed)
    WHERE is_processed = FALSE;
CREATE INDEX IF NOT EXISTS idx_raw_payloads_hash
    ON raw_payloads (payload_hash)
    WHERE payload_hash IS NOT NULL;

COMMENT ON TABLE raw_payloads
    IS 'Raw API responses for replay/debugging. Enables re-hydration.';
-- ============================================================================
-- SECTION 11: ANALYTICS CACHE TABLES
-- ============================================================================
-- Pre-computed analytics for dashboard performance.
-- Daily store metrics
-- analytics_store_daily: at most one row per dispensary per calendar day
-- (enforced by the UNIQUE constraint), holding that day's aggregated figures.
CREATE TABLE IF NOT EXISTS analytics_store_daily (
    id                    SERIAL PRIMARY KEY,
    dispensary_id         INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
    state_id              INTEGER REFERENCES states(id),
    date                  DATE NOT NULL,

    -- Product counts
    total_products        INTEGER DEFAULT 0,
    in_stock_products     INTEGER DEFAULT 0,
    out_of_stock_products INTEGER DEFAULT 0,
    on_special_products   INTEGER DEFAULT 0,

    -- Assortment breadth
    unique_brands         INTEGER DEFAULT 0,
    unique_categories     INTEGER DEFAULT 0,

    -- Pricing
    avg_price             NUMERIC(10,2),
    median_price          NUMERIC(10,2),

    -- Crawl health
    crawl_count           INTEGER DEFAULT 0,
    successful_crawls     INTEGER DEFAULT 0,

    created_at            TIMESTAMPTZ DEFAULT NOW(),

    UNIQUE (dispensary_id, date)
);

CREATE INDEX IF NOT EXISTS idx_analytics_store_daily_dispensary
    ON analytics_store_daily (dispensary_id, date DESC);
CREATE INDEX IF NOT EXISTS idx_analytics_store_daily_state
    ON analytics_store_daily (state_id, date DESC)
    WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_analytics_store_daily_date
    ON analytics_store_daily (date DESC);
-- Daily brand metrics
-- analytics_brand_daily: daily aggregated figures per brand and state.
-- NOTE(review): state_id is nullable and PostgreSQL treats NULLs as distinct in
-- a UNIQUE constraint, so the constraint below does not prevent several rows
-- with state_id IS NULL for the same brand and date. Confirm whether NULL-state
-- rows are ever written and, if so, whether they must be unique.
CREATE TABLE IF NOT EXISTS analytics_brand_daily (
    id                 SERIAL PRIMARY KEY,
    canonical_brand_id INTEGER NOT NULL REFERENCES canonical_brands(id) ON DELETE CASCADE,
    state_id           INTEGER REFERENCES states(id),
    date               DATE NOT NULL,

    -- Presence
    stores_carrying    INTEGER DEFAULT 0,
    products_count     INTEGER DEFAULT 0,

    -- Stock
    in_stock_count     INTEGER DEFAULT 0,
    out_of_stock_count INTEGER DEFAULT 0,

    -- Pricing
    avg_price          NUMERIC(10,2),
    min_price          NUMERIC(10,2),
    max_price          NUMERIC(10,2),
    on_special_count   INTEGER DEFAULT 0,

    created_at         TIMESTAMPTZ DEFAULT NOW(),

    UNIQUE (canonical_brand_id, state_id, date)
);

CREATE INDEX IF NOT EXISTS idx_analytics_brand_daily_brand
    ON analytics_brand_daily (canonical_brand_id, date DESC);
CREATE INDEX IF NOT EXISTS idx_analytics_brand_daily_state
    ON analytics_brand_daily (state_id, date DESC)
    WHERE state_id IS NOT NULL;
-- ============================================================================
-- SECTION 12: VIEWS FOR COMPATIBILITY
-- ============================================================================
-- View: Latest snapshot per store product
-- For every (dispensary_id, provider_product_id) pair keep the snapshot row with
-- the greatest captured_at. DISTINCT ON keeps the first row of each group in
-- ORDER BY order, hence the descending timestamp.
CREATE OR REPLACE VIEW v_latest_store_snapshots AS
SELECT DISTINCT ON (sps.dispensary_id, sps.provider_product_id)
    sps.*
FROM store_product_snapshots AS sps
ORDER BY
    sps.dispensary_id,
    sps.provider_product_id,
    sps.captured_at DESC;
-- View: Crawl run summary per dispensary
-- One row per dispensary with its current product counts and latest crawl info.
--
-- Products and crawl runs are aggregated in separate LATERAL subqueries rather
-- than being joined side by side. Joining store_products and crawl_runs on the
-- same dispensary builds (products x crawl runs) intermediate rows per
-- dispensary, which then need COUNT(DISTINCT ...) to give correct numbers and
-- grow with crawl history. An aggregate without GROUP BY always yields exactly
-- one row, so dispensaries with no products still appear with counts of 0 and
-- dispensaries with no crawl runs get NULL for last_crawl_at.
-- Output column names, order and types are unchanged.
CREATE OR REPLACE VIEW v_dispensary_crawl_summary AS
SELECT
    d.id AS dispensary_id,
    COALESCE(d.dba_name, d.name) AS dispensary_name,
    d.city,
    d.state,
    d.state_id,
    s.name AS state_name,
    sp.current_product_count,
    sp.in_stock_count,
    sp.on_special_count,
    cr.last_crawl_at,
    -- Status of the most recently started run (NULL when there are no runs).
    (
        SELECT r.status
        FROM crawl_runs AS r
        WHERE r.dispensary_id = d.id
        ORDER BY r.started_at DESC
        LIMIT 1
    ) AS last_crawl_status
FROM dispensaries AS d
LEFT JOIN states AS s
    ON s.id = d.state_id
CROSS JOIN LATERAL (
    SELECT
        COUNT(*) AS current_product_count,
        COUNT(*) FILTER (WHERE p.is_in_stock) AS in_stock_count,
        COUNT(*) FILTER (WHERE p.is_on_special) AS on_special_count
    FROM store_products AS p
    WHERE p.dispensary_id = d.id
) AS sp
CROSS JOIN LATERAL (
    SELECT MAX(r.finished_at) AS last_crawl_at
    FROM crawl_runs AS r
    WHERE r.dispensary_id = d.id
) AS cr;
-- View: Brand presence across stores
-- One row per (canonical brand, state) summarising where the brand is stocked.
-- Brands with no store_products rows are not listed (inner join); products
-- whose state_id is NULL or unmatched are grouped under a NULL state.
CREATE OR REPLACE VIEW v_brand_store_presence AS
SELECT
    brand.id   AS brand_id,
    brand.name AS brand_name,
    brand.slug AS brand_slug,
    st.id      AS state_id,
    st.code    AS state_code,
    COUNT(DISTINCT prod.dispensary_id)               AS store_count,
    COUNT(prod.id)                                   AS product_count,
    COUNT(prod.id) FILTER (WHERE prod.is_in_stock)   AS in_stock_count,
    AVG(prod.price_rec)                              AS avg_price,
    MIN(prod.price_rec)                              AS min_price,
    MAX(prod.price_rec)                              AS max_price
FROM canonical_brands AS brand
INNER JOIN store_products AS prod
    ON prod.canonical_brand_id = brand.id
LEFT JOIN states AS st
    ON st.id = prod.state_id
GROUP BY
    brand.id,
    brand.name,
    brand.slug,
    st.id,
    st.code;
-- ============================================================================
-- SECTION 13: ADD FK FROM store_product_snapshots TO crawl_runs
-- ============================================================================
-- Add the snapshots -> crawl_runs foreign key exactly once.
-- Constraint names are only unique per table, so the existence check is scoped
-- to store_product_snapshots via conrelid. A name-only lookup could be satisfied
-- by a same-named constraint on another table or schema and skip the ALTER.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'store_product_snapshots_crawl_run_id_fkey'
          AND conrelid = 'store_product_snapshots'::regclass
    ) THEN
        ALTER TABLE store_product_snapshots
            ADD CONSTRAINT store_product_snapshots_crawl_run_id_fkey
            FOREIGN KEY (crawl_run_id) REFERENCES crawl_runs(id) ON DELETE SET NULL;
    END IF;
END $$;
-- ============================================================================
-- SECTION 14: ADD crawl_run_id TO crawl_orchestration_traces
-- ============================================================================
-- Let a trace point at the crawl run it belongs to. The link is optional and is
-- cleared (not cascaded) when the crawl run row is deleted.
ALTER TABLE crawl_orchestration_traces
    ADD COLUMN IF NOT EXISTS crawl_run_id INTEGER
        REFERENCES crawl_runs(id) ON DELETE SET NULL;

-- Partial index: traces without a crawl run are not indexed.
CREATE INDEX IF NOT EXISTS idx_traces_crawl_run
    ON crawl_orchestration_traces (crawl_run_id)
    WHERE crawl_run_id IS NOT NULL;
-- ============================================================================
-- SECTION 15: UPDATE dispensary_crawler_profiles
-- ============================================================================
-- Add status columns for profile lifecycle.
-- Lifecycle columns, added in one ALTER; each ADD COLUMN is a no-op when the
-- column is already present.
ALTER TABLE dispensary_crawler_profiles
    ADD COLUMN IF NOT EXISTS status            VARCHAR(50) DEFAULT 'sandbox',
    ADD COLUMN IF NOT EXISTS allow_autopromote BOOLEAN     DEFAULT FALSE,
    ADD COLUMN IF NOT EXISTS validated_at      TIMESTAMPTZ;

CREATE INDEX IF NOT EXISTS idx_profiles_status
    ON dispensary_crawler_profiles (status);

COMMENT ON COLUMN dispensary_crawler_profiles.status
    IS 'Profile status: sandbox, production, needs_manual, disabled';
-- ============================================================================
-- SECTION 16: UPDATE dispensary_crawl_jobs WITH ADDITIONAL COLUMNS
-- ============================================================================
-- Add columns needed for enhanced job tracking.
-- Worker/claim bookkeeping and progress counters for crawl jobs, added in one
-- ALTER; each ADD COLUMN is a no-op when the column is already present.
ALTER TABLE dispensary_crawl_jobs
    -- Which worker holds the job
    ADD COLUMN IF NOT EXISTS worker_id         VARCHAR(100),
    ADD COLUMN IF NOT EXISTS worker_hostname   VARCHAR(100),
    ADD COLUMN IF NOT EXISTS claimed_by        VARCHAR(100),
    ADD COLUMN IF NOT EXISTS claimed_at        TIMESTAMPTZ,
    ADD COLUMN IF NOT EXISTS locked_until      TIMESTAMPTZ,
    ADD COLUMN IF NOT EXISTS last_heartbeat_at TIMESTAMPTZ,
    -- Retry budget
    ADD COLUMN IF NOT EXISTS max_retries       INTEGER DEFAULT 3,
    -- Progress counters
    ADD COLUMN IF NOT EXISTS products_upserted INTEGER DEFAULT 0,
    ADD COLUMN IF NOT EXISTS snapshots_created INTEGER DEFAULT 0,
    ADD COLUMN IF NOT EXISTS current_page      INTEGER DEFAULT 0,
    ADD COLUMN IF NOT EXISTS total_pages       INTEGER;

-- Partial indexes: only pending jobs / only claimed jobs are indexed.
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_status_pending
    ON dispensary_crawl_jobs (status)
    WHERE status = 'pending';
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_claimed_by
    ON dispensary_crawl_jobs (claimed_by)
    WHERE claimed_by IS NOT NULL;
-- ============================================================================
-- SECTION 17: QUEUE MONITORING VIEWS
-- ============================================================================
-- Single-row snapshot of the crawl job queue. Each figure is an independent
-- scalar subquery over dispensary_crawl_jobs; the "_1h" figures and the average
-- duration only consider jobs whose completed_at falls within the last hour.
CREATE OR REPLACE VIEW v_queue_stats AS
SELECT
    (
        SELECT COUNT(*)
        FROM dispensary_crawl_jobs
        WHERE status = 'pending'
    ) AS pending_jobs,
    (
        SELECT COUNT(*)
        FROM dispensary_crawl_jobs
        WHERE status = 'running'
    ) AS running_jobs,
    (
        SELECT COUNT(*)
        FROM dispensary_crawl_jobs
        WHERE completed_at > NOW() - INTERVAL '1 hour'
          AND status = 'completed'
    ) AS completed_1h,
    (
        SELECT COUNT(*)
        FROM dispensary_crawl_jobs
        WHERE completed_at > NOW() - INTERVAL '1 hour'
          AND status = 'failed'
    ) AS failed_1h,
    (
        -- Distinct workers that currently hold at least one running job.
        SELECT COUNT(DISTINCT worker_id)
        FROM dispensary_crawl_jobs
        WHERE worker_id IS NOT NULL
          AND status = 'running'
    ) AS active_workers,
    (
        -- Mean wall-clock duration in seconds; NULL when nothing completed.
        SELECT AVG(EXTRACT(EPOCH FROM (completed_at - started_at)))
        FROM dispensary_crawl_jobs
        WHERE completed_at > NOW() - INTERVAL '1 hour'
          AND status = 'completed'
    ) AS avg_duration_seconds;
-- One row per (worker_id, worker_hostname) that currently owns running jobs,
-- with totals summed across those jobs and the span of claim/heartbeat times.
CREATE OR REPLACE VIEW v_active_workers AS
SELECT
    job.worker_id,
    job.worker_hostname,
    COUNT(*)                   AS current_jobs,
    SUM(job.products_found)    AS total_products_found,
    SUM(job.products_upserted) AS total_products_upserted,
    SUM(job.snapshots_created) AS total_snapshots,
    MIN(job.claimed_at)        AS first_claimed_at,
    MAX(job.last_heartbeat_at) AS last_heartbeat
FROM dispensary_crawl_jobs AS job
WHERE job.worker_id IS NOT NULL
  AND job.status = 'running'
GROUP BY
    job.worker_id,
    job.worker_hostname;
-- ============================================================================
-- DONE
-- ============================================================================
-- Emit a one-row marker so the end of the script is visible in psql output.
SELECT
    'Migration 050 completed successfully. Canonical schema v2 is ready.' AS status;

View File

@@ -0,0 +1,642 @@
-- ============================================================================
-- Migration 051: CannaiQ Canonical Schema - Safe Bootstrap
-- ============================================================================
--
-- Purpose: Create the canonical CannaiQ schema tables from scratch.
-- This migration is FULLY IDEMPOTENT and safe to run multiple times.
--
-- SAFETY RULES FOLLOWED:
-- 1. ALL tables use CREATE TABLE IF NOT EXISTS
-- 2. ALL columns use ALTER TABLE ADD COLUMN IF NOT EXISTS
-- 3. ALL indexes use CREATE INDEX IF NOT EXISTS
-- 4. NO DROP, DELETE, TRUNCATE, or destructive operations
-- 5. NO assumptions about existing data or column existence
-- 6. NO dependencies on migrations 041, 043, or 050
-- 7. Compatible with dutchie_menus database as it exists today
-- 8. Safe handling of pre-existing states table with missing columns
--
-- Tables Created:
-- - states (US state reference table)
-- - chains (retail chain/group table)
-- - crawl_runs (crawl execution records)
-- - store_products (current menu state)
-- - store_product_snapshots (historical price/stock data)
--
-- Columns Added:
-- - dispensaries.state_id (FK to states)
-- - dispensaries.chain_id (FK to chains)
--
-- Run with:
-- psql "postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus" \
-- -f migrations/051_cannaiq_canonical_safe_bootstrap.sql
--
-- ============================================================================
-- ============================================================================
-- SECTION 1: STATES TABLE
-- ============================================================================
-- Reference table for US states where CannaiQ operates.
-- This section handles the case where the table exists but is missing columns.
-- Create the states table when it is missing (minimal definition).
CREATE TABLE IF NOT EXISTS states (
    id         SERIAL PRIMARY KEY,
    code       VARCHAR(2) NOT NULL,
    name       VARCHAR(100) NOT NULL,
    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW()
);

-- Bring a pre-existing states table up to the shape this migration relies on.
-- Each statement is independent and does nothing when the column already exists.
ALTER TABLE states ADD COLUMN IF NOT EXISTS timezone TEXT;
ALTER TABLE states ADD COLUMN IF NOT EXISTS is_active BOOLEAN DEFAULT TRUE;
ALTER TABLE states ADD COLUMN IF NOT EXISTS crawl_enabled BOOLEAN DEFAULT TRUE;
-- The seed upsert further down in this section writes updated_at, so a table
-- created elsewhere without the timestamp columns must gain them too;
-- otherwise that statement fails with "column updated_at does not exist".
ALTER TABLE states ADD COLUMN IF NOT EXISTS created_at TIMESTAMPTZ DEFAULT NOW();
ALTER TABLE states ADD COLUMN IF NOT EXISTS updated_at TIMESTAMPTZ DEFAULT NOW();
-- Ensure states.code is backed by a unique index that ON CONFLICT (code) can use.
--
-- The constraint is added unless either
--   (a) a constraint named states_code_key already exists on states, or
--   (b) some other unique index covers exactly the single column "code".
-- Check (b) reads the catalog instead of pattern-matching index definitions: a
-- text match on '%UNIQUE%code%' is also satisfied by composite indexes, partial
-- indexes, or columns whose name merely contains "code", none of which can
-- serve ON CONFLICT (code).
--
-- Failures are reported as warnings rather than silently discarded, so that
-- e.g. duplicate codes in a pre-existing table are visible in the migration
-- output; the script itself still continues.
DO $$
BEGIN
    IF EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'states_code_key'
          AND conrelid = 'states'::regclass
    ) THEN
        NULL;  -- (a) constraint already present under the expected name
    ELSIF EXISTS (
        SELECT 1
        FROM pg_index AS i
        INNER JOIN pg_attribute AS a
            ON a.attrelid = i.indrelid
           AND a.attnum = i.indkey[0]      -- indkey is 0-based
        WHERE i.indrelid = 'states'::regclass
          AND i.indisunique
          AND i.indnatts = 1               -- single-column index
          AND i.indpred IS NULL            -- not a partial index
          AND i.indexprs IS NULL           -- plain column, not an expression
          AND a.attname = 'code'
    ) THEN
        NULL;  -- (b) an equivalent unique index exists under another name
    ELSE
        ALTER TABLE states ADD CONSTRAINT states_code_key UNIQUE (code);
    END IF;
EXCEPTION
    WHEN duplicate_object OR duplicate_table THEN
        NULL;  -- constraint or its index already exists
    WHEN OTHERS THEN
        RAISE WARNING 'Could not add states_code_key on states(code): % (SQLSTATE %)',
            SQLERRM, SQLSTATE;
END $$;
-- Fill in a timezone for known states that do not have one yet. Rows that
-- already carry a timezone, and codes not listed here, are left untouched.
UPDATE states
SET timezone = CASE
        WHEN code = 'AZ' THEN 'America/Phoenix'
        WHEN code IN ('CA', 'NV', 'OR', 'WA') THEN 'America/Los_Angeles'
        WHEN code = 'CO' THEN 'America/Denver'
        WHEN code IN ('FL', 'MA', 'MD', 'NJ', 'NY', 'OH', 'PA') THEN 'America/New_York'
        WHEN code IN ('IL', 'MO', 'OK') THEN 'America/Chicago'
        WHEN code = 'MI' THEN 'America/Detroit'
    END
WHERE timezone IS NULL
  AND code IN (
        'AZ',
        'CA', 'NV', 'OR', 'WA',
        'CO',
        'FL', 'MA', 'MD', 'NJ', 'NY', 'OH', 'PA',
        'IL', 'MO', 'OK',
        'MI'
      );

-- Rows that predate the flag columns may hold NULL; treat them as enabled.
UPDATE states
SET is_active = TRUE
WHERE is_active IS NULL;

UPDATE states
SET crawl_enabled = TRUE
WHERE crawl_enabled IS NULL;
-- Seed the known states. Idempotent: when a code already exists, only columns
-- that are currently NULL are filled from the seed values (name is never
-- overwritten) and updated_at is refreshed.
INSERT INTO states AS st (code, name, timezone, is_active, crawl_enabled)
VALUES
    ('AZ', 'Arizona',       'America/Phoenix',     TRUE, TRUE),
    ('CA', 'California',    'America/Los_Angeles', TRUE, TRUE),
    ('CO', 'Colorado',      'America/Denver',      TRUE, TRUE),
    ('FL', 'Florida',       'America/New_York',    TRUE, TRUE),
    ('IL', 'Illinois',      'America/Chicago',     TRUE, TRUE),
    ('MA', 'Massachusetts', 'America/New_York',    TRUE, TRUE),
    ('MD', 'Maryland',      'America/New_York',    TRUE, TRUE),
    ('MI', 'Michigan',      'America/Detroit',     TRUE, TRUE),
    ('MO', 'Missouri',      'America/Chicago',     TRUE, TRUE),
    ('NV', 'Nevada',        'America/Los_Angeles', TRUE, TRUE),
    ('NJ', 'New Jersey',    'America/New_York',    TRUE, TRUE),
    ('NY', 'New York',      'America/New_York',    TRUE, TRUE),
    ('OH', 'Ohio',          'America/New_York',    TRUE, TRUE),
    ('OK', 'Oklahoma',      'America/Chicago',     TRUE, TRUE),
    ('OR', 'Oregon',        'America/Los_Angeles', TRUE, TRUE),
    ('PA', 'Pennsylvania',  'America/New_York',    TRUE, TRUE),
    ('WA', 'Washington',    'America/Los_Angeles', TRUE, TRUE)
ON CONFLICT (code) DO UPDATE
SET
    timezone      = COALESCE(st.timezone, EXCLUDED.timezone),
    is_active     = COALESCE(st.is_active, EXCLUDED.is_active),
    crawl_enabled = COALESCE(st.crawl_enabled, EXCLUDED.crawl_enabled),
    updated_at    = NOW();

CREATE INDEX IF NOT EXISTS idx_states_code
    ON states (code);
-- Partial index: only active states are indexed.
CREATE INDEX IF NOT EXISTS idx_states_active
    ON states (is_active)
    WHERE is_active = TRUE;

COMMENT ON TABLE states
    IS 'US states where CannaiQ operates. Single source of truth for state configuration.';
-- ============================================================================
-- SECTION 2: CHAINS TABLE
-- ============================================================================
-- Retail chains/groups that own multiple dispensary locations.
-- Examples: Curaleaf, Trulieve, Harvest, Columbia Care
-- chains: one row per retail chain/group. Uniqueness of slug and the FK on
-- headquarters_state_id are attached separately after this statement, so the
-- CREATE itself has no dependency on other tables.
CREATE TABLE IF NOT EXISTS chains (
    id                    SERIAL PRIMARY KEY,

    -- Identity
    name                  VARCHAR(255) NOT NULL,
    slug                  VARCHAR(255) NOT NULL,

    -- Presentation
    website_url           TEXT,
    logo_url              TEXT,
    description           TEXT,

    -- Company facts
    headquarters_city     VARCHAR(100),
    headquarters_state_id INTEGER,
    founded_year          INTEGER,

    -- Flags
    is_active             BOOLEAN DEFAULT TRUE,
    is_public             BOOLEAN DEFAULT FALSE,
    stock_ticker          VARCHAR(10),

    created_at            TIMESTAMPTZ DEFAULT NOW(),
    updated_at            TIMESTAMPTZ DEFAULT NOW()
);
-- Constraints on chains, each added only when missing.
-- Existence checks are scoped to the chains table (conrelid) because constraint
-- names are only unique per table; a name-only lookup could be satisfied by a
-- same-named constraint elsewhere and skip the ALTER. Unexpected failures are
-- surfaced as warnings instead of being silently discarded.

-- Unique slug.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'chains_slug_key'
          AND conrelid = 'chains'::regclass
    ) THEN
        ALTER TABLE chains ADD CONSTRAINT chains_slug_key UNIQUE (slug);
    END IF;
EXCEPTION
    WHEN duplicate_object OR duplicate_table THEN
        NULL;  -- constraint or its index already exists
    WHEN OTHERS THEN
        RAISE WARNING 'Could not add chains_slug_key: % (SQLSTATE %)', SQLERRM, SQLSTATE;
END $$;

-- Headquarters state -> states. Deleting the state clears the reference.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'chains_headquarters_state_id_fkey'
          AND conrelid = 'chains'::regclass
    ) THEN
        ALTER TABLE chains
            ADD CONSTRAINT chains_headquarters_state_id_fkey
            FOREIGN KEY (headquarters_state_id) REFERENCES states(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;  -- constraint already exists
    WHEN OTHERS THEN
        RAISE WARNING 'Could not add chains_headquarters_state_id_fkey: % (SQLSTATE %)', SQLERRM, SQLSTATE;
END $$;

CREATE INDEX IF NOT EXISTS idx_chains_slug
    ON chains (slug);
-- Partial index: only active chains are indexed.
CREATE INDEX IF NOT EXISTS idx_chains_active
    ON chains (is_active)
    WHERE is_active = TRUE;

COMMENT ON TABLE chains
    IS 'Retail chains/groups that own multiple dispensary locations.';
-- ============================================================================
-- SECTION 3: ADD state_id AND chain_id TO DISPENSARIES
-- ============================================================================
-- Link existing dispensaries table to states and chains.
-- Plain nullable columns first; the foreign keys are attached separately below
-- so each step can be skipped independently when it is already in place.
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS state_id INTEGER;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS chain_id INTEGER;

-- Foreign keys, each added only when missing.
-- Existence checks are scoped to the dispensaries table (conrelid) because
-- constraint names are only unique per table. Unexpected failures (for example
-- orphaned ids that violate the new FK) are surfaced as warnings instead of
-- being silently discarded.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'dispensaries_state_id_fkey'
          AND conrelid = 'dispensaries'::regclass
    ) THEN
        ALTER TABLE dispensaries
            ADD CONSTRAINT dispensaries_state_id_fkey
            FOREIGN KEY (state_id) REFERENCES states(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;  -- constraint already exists
    WHEN OTHERS THEN
        RAISE WARNING 'Could not add dispensaries_state_id_fkey: % (SQLSTATE %)', SQLERRM, SQLSTATE;
END $$;

DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'dispensaries_chain_id_fkey'
          AND conrelid = 'dispensaries'::regclass
    ) THEN
        ALTER TABLE dispensaries
            ADD CONSTRAINT dispensaries_chain_id_fkey
            FOREIGN KEY (chain_id) REFERENCES chains(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;  -- constraint already exists
    WHEN OTHERS THEN
        RAISE WARNING 'Could not add dispensaries_chain_id_fkey: % (SQLSTATE %)', SQLERRM, SQLSTATE;
END $$;
CREATE INDEX IF NOT EXISTS idx_dispensaries_state_id
    ON dispensaries (state_id);
-- Partial index: dispensaries without a chain are not indexed.
CREATE INDEX IF NOT EXISTS idx_dispensaries_chain_id
    ON dispensaries (chain_id)
    WHERE chain_id IS NOT NULL;

-- Backfill: resolve the legacy text column dispensaries.state against
-- states.code. Only rows with no state_id yet are updated, so re-running the
-- migration never overwrites an existing link.
UPDATE dispensaries AS d
SET state_id = s.id
FROM states AS s
WHERE d.state_id IS NULL
  AND s.code = d.state;

COMMENT ON COLUMN dispensaries.state_id
    IS 'FK to states table. Canonical state reference.';
COMMENT ON COLUMN dispensaries.chain_id
    IS 'FK to chains table. NULL if independent dispensary.';
-- ============================================================================
-- SECTION 4: CRAWL_RUNS TABLE
-- ============================================================================
-- One record per crawl execution. Links to snapshots.
-- crawl_runs: one row per crawl execution against a dispensary. Foreign keys
-- are attached separately after this statement, so the CREATE itself has no
-- dependency on other tables.
CREATE TABLE IF NOT EXISTS crawl_runs (
    id                SERIAL PRIMARY KEY,
    dispensary_id     INTEGER NOT NULL,
    state_id          INTEGER,

    -- Source platform
    provider          VARCHAR(50) NOT NULL DEFAULT 'dutchie',

    -- Timing
    started_at        TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    finished_at       TIMESTAMPTZ,
    duration_ms       INTEGER,

    -- Outcome
    status            VARCHAR(20) NOT NULL DEFAULT 'running',
    error_code        VARCHAR(50),
    error_message     TEXT,
    http_status       INTEGER,

    -- Result counters
    products_found    INTEGER DEFAULT 0,
    products_new      INTEGER DEFAULT 0,
    products_updated  INTEGER DEFAULT 0,
    products_missing  INTEGER DEFAULT 0,
    snapshots_written INTEGER DEFAULT 0,

    -- Where and how the run executed
    worker_id         VARCHAR(100),
    worker_hostname   VARCHAR(100),
    proxy_used        TEXT,
    trigger_type      VARCHAR(50) DEFAULT 'scheduled',

    -- Free-form extras
    metadata          JSONB DEFAULT '{}',

    created_at        TIMESTAMPTZ DEFAULT NOW()
);
-- Foreign keys on crawl_runs, each added only when missing.
-- Existence checks are scoped to the crawl_runs table (conrelid) because
-- constraint names are only unique per table. Unexpected failures are surfaced
-- as warnings instead of being silently discarded.

-- Runs are removed together with their dispensary.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'crawl_runs_dispensary_id_fkey'
          AND conrelid = 'crawl_runs'::regclass
    ) THEN
        ALTER TABLE crawl_runs
            ADD CONSTRAINT crawl_runs_dispensary_id_fkey
            FOREIGN KEY (dispensary_id) REFERENCES dispensaries(id) ON DELETE CASCADE;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;  -- constraint already exists
    WHEN OTHERS THEN
        RAISE WARNING 'Could not add crawl_runs_dispensary_id_fkey: % (SQLSTATE %)', SQLERRM, SQLSTATE;
END $$;

-- Deleting a state only clears the reference.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'crawl_runs_state_id_fkey'
          AND conrelid = 'crawl_runs'::regclass
    ) THEN
        ALTER TABLE crawl_runs
            ADD CONSTRAINT crawl_runs_state_id_fkey
            FOREIGN KEY (state_id) REFERENCES states(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;  -- constraint already exists
    WHEN OTHERS THEN
        RAISE WARNING 'Could not add crawl_runs_state_id_fkey: % (SQLSTATE %)', SQLERRM, SQLSTATE;
END $$;

CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary
    ON crawl_runs (dispensary_id);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_state
    ON crawl_runs (state_id)
    WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_crawl_runs_status
    ON crawl_runs (status);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_started
    ON crawl_runs (started_at DESC);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary_started
    ON crawl_runs (dispensary_id, started_at DESC);

COMMENT ON TABLE crawl_runs
    IS 'Each crawl execution. Links to snapshots and traces.';
-- ============================================================================
-- SECTION 5: STORE_PRODUCTS TABLE
-- ============================================================================
-- Current state of products on each dispensary menu.
-- Provider-agnostic structure for analytics.
-- store_products: the current menu entry for a product at a dispensary.
-- Uniqueness and foreign keys are attached separately after this statement, so
-- the CREATE itself has no dependency on other tables.
CREATE TABLE IF NOT EXISTS store_products (
    id                    SERIAL PRIMARY KEY,
    dispensary_id         INTEGER NOT NULL,
    state_id              INTEGER,

    -- Identifiers assigned by the source platform
    provider              VARCHAR(50) NOT NULL DEFAULT 'dutchie',
    provider_product_id   VARCHAR(100) NOT NULL,
    provider_brand_id     VARCHAR(100),
    enterprise_product_id VARCHAR(100),

    -- Descriptive fields as delivered by the platform (not normalized)
    name                  VARCHAR(500) NOT NULL,
    brand_name            VARCHAR(255),
    category              VARCHAR(100),
    subcategory           VARCHAR(100),
    strain_type           VARCHAR(50),
    description           TEXT,

    -- Current pricing
    price_rec             NUMERIC(10,2),
    price_med             NUMERIC(10,2),
    price_rec_special     NUMERIC(10,2),
    price_med_special     NUMERIC(10,2),
    is_on_special         BOOLEAN DEFAULT FALSE,
    special_name          TEXT,
    discount_percent      NUMERIC(5,2),
    price_unit            VARCHAR(20) DEFAULT 'each',

    -- Current inventory
    is_in_stock           BOOLEAN DEFAULT TRUE,
    stock_quantity        INTEGER,
    stock_status          VARCHAR(50) DEFAULT 'in_stock',

    -- Potency
    thc_percent           NUMERIC(5,2),
    cbd_percent           NUMERIC(5,2),
    thc_mg                NUMERIC(10,2),
    cbd_mg                NUMERIC(10,2),

    -- Weight / size
    weight_value          NUMERIC(10,2),
    weight_unit           VARCHAR(20),

    -- Images
    image_url             TEXT,
    local_image_path      TEXT,
    thumbnail_url         TEXT,

    -- Flags
    is_featured           BOOLEAN DEFAULT FALSE,
    medical_only          BOOLEAN DEFAULT FALSE,
    rec_only              BOOLEAN DEFAULT FALSE,

    -- Position on the menu (for tracking prominence)
    menu_position         INTEGER,

    -- Lifecycle timestamps
    first_seen_at         TIMESTAMPTZ DEFAULT NOW(),
    last_seen_at          TIMESTAMPTZ DEFAULT NOW(),
    last_price_change_at  TIMESTAMPTZ,
    last_stock_change_at  TIMESTAMPTZ,
    created_at            TIMESTAMPTZ DEFAULT NOW(),
    updated_at            TIMESTAMPTZ DEFAULT NOW()
);
-- Constraints on store_products, each added only when missing.
-- Existence checks are scoped to the store_products table (conrelid) because
-- constraint names are only unique per table. Unexpected failures (duplicate
-- keys or orphaned ids in pre-existing data, for example) are surfaced as
-- warnings instead of being silently discarded.

-- A provider's product id may appear once per dispensary.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'store_products_dispensary_provider_product_key'
          AND conrelid = 'store_products'::regclass
    ) THEN
        ALTER TABLE store_products
            ADD CONSTRAINT store_products_dispensary_provider_product_key
            UNIQUE (dispensary_id, provider, provider_product_id);
    END IF;
EXCEPTION
    WHEN duplicate_object OR duplicate_table THEN
        NULL;  -- constraint or its index already exists
    WHEN OTHERS THEN
        RAISE WARNING 'Could not add store_products_dispensary_provider_product_key: % (SQLSTATE %)', SQLERRM, SQLSTATE;
END $$;

-- Products are removed together with their dispensary.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'store_products_dispensary_id_fkey'
          AND conrelid = 'store_products'::regclass
    ) THEN
        ALTER TABLE store_products
            ADD CONSTRAINT store_products_dispensary_id_fkey
            FOREIGN KEY (dispensary_id) REFERENCES dispensaries(id) ON DELETE CASCADE;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;  -- constraint already exists
    WHEN OTHERS THEN
        RAISE WARNING 'Could not add store_products_dispensary_id_fkey: % (SQLSTATE %)', SQLERRM, SQLSTATE;
END $$;

-- Deleting a state only clears the reference.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'store_products_state_id_fkey'
          AND conrelid = 'store_products'::regclass
    ) THEN
        ALTER TABLE store_products
            ADD CONSTRAINT store_products_state_id_fkey
            FOREIGN KEY (state_id) REFERENCES states(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;  -- constraint already exists
    WHEN OTHERS THEN
        RAISE WARNING 'Could not add store_products_state_id_fkey: % (SQLSTATE %)', SQLERRM, SQLSTATE;
END $$;

CREATE INDEX IF NOT EXISTS idx_store_products_dispensary
    ON store_products (dispensary_id);
CREATE INDEX IF NOT EXISTS idx_store_products_state
    ON store_products (state_id)
    WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_store_products_category
    ON store_products (category)
    WHERE category IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_store_products_brand_name
    ON store_products (brand_name)
    WHERE brand_name IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_store_products_in_stock
    ON store_products (dispensary_id, is_in_stock);
CREATE INDEX IF NOT EXISTS idx_store_products_special
    ON store_products (dispensary_id, is_on_special)
    WHERE is_on_special = TRUE;
CREATE INDEX IF NOT EXISTS idx_store_products_last_seen
    ON store_products (last_seen_at DESC);
CREATE INDEX IF NOT EXISTS idx_store_products_provider
    ON store_products (provider);
CREATE INDEX IF NOT EXISTS idx_store_products_enterprise
    ON store_products (enterprise_product_id)
    WHERE enterprise_product_id IS NOT NULL;

COMMENT ON TABLE store_products
    IS 'Current state of products on each dispensary menu. Provider-agnostic.';
-- ============================================================================
-- SECTION 6: STORE_PRODUCT_SNAPSHOTS TABLE
-- ============================================================================
-- Historical price/stock data. One row per product per crawl.
-- CRITICAL: NEVER DELETE from this table.
-- store_product_snapshots: what a product looked like at capture time.
-- Foreign keys are attached separately after this statement, so the CREATE
-- itself has no dependency on other tables.
CREATE TABLE IF NOT EXISTS store_product_snapshots (
    id                  SERIAL PRIMARY KEY,
    dispensary_id       INTEGER NOT NULL,
    store_product_id    INTEGER,
    state_id            INTEGER,

    -- Source platform
    provider            VARCHAR(50) NOT NULL DEFAULT 'dutchie',
    provider_product_id VARCHAR(100),

    -- Crawl run that produced this row
    crawl_run_id        INTEGER,

    -- When the snapshot was taken
    captured_at         TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    -- Descriptive fields as delivered by the platform
    name                VARCHAR(500),
    brand_name          VARCHAR(255),
    category            VARCHAR(100),
    subcategory         VARCHAR(100),

    -- Pricing at capture time
    price_rec           NUMERIC(10,2),
    price_med           NUMERIC(10,2),
    price_rec_special   NUMERIC(10,2),
    price_med_special   NUMERIC(10,2),
    is_on_special       BOOLEAN DEFAULT FALSE,
    discount_percent    NUMERIC(5,2),

    -- Inventory at capture time
    is_in_stock         BOOLEAN DEFAULT TRUE,
    stock_quantity      INTEGER,
    stock_status        VARCHAR(50) DEFAULT 'in_stock',
    is_present_in_feed  BOOLEAN DEFAULT TRUE,

    -- Potency at capture time
    thc_percent         NUMERIC(5,2),
    cbd_percent         NUMERIC(5,2),

    -- Position on the menu (for tracking prominence changes)
    menu_position       INTEGER,

    -- Image URL at capture time
    image_url           TEXT,

    -- Complete raw response, kept for debugging
    raw_data            JSONB,

    created_at          TIMESTAMPTZ DEFAULT NOW()
);
-- Foreign keys on store_product_snapshots, each added only when missing.
-- Existence checks are scoped to the store_product_snapshots table (conrelid)
-- because constraint names are only unique per table. Unexpected failures are
-- surfaced as warnings instead of being silently discarded.

-- dispensary_id -> dispensaries (ON DELETE CASCADE).
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'store_product_snapshots_dispensary_id_fkey'
          AND conrelid = 'store_product_snapshots'::regclass
    ) THEN
        ALTER TABLE store_product_snapshots
            ADD CONSTRAINT store_product_snapshots_dispensary_id_fkey
            FOREIGN KEY (dispensary_id) REFERENCES dispensaries(id) ON DELETE CASCADE;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;  -- constraint already exists
    WHEN OTHERS THEN
        RAISE WARNING 'Could not add store_product_snapshots_dispensary_id_fkey: % (SQLSTATE %)', SQLERRM, SQLSTATE;
END $$;

-- store_product_id -> store_products (reference cleared on delete).
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'store_product_snapshots_store_product_id_fkey'
          AND conrelid = 'store_product_snapshots'::regclass
    ) THEN
        ALTER TABLE store_product_snapshots
            ADD CONSTRAINT store_product_snapshots_store_product_id_fkey
            FOREIGN KEY (store_product_id) REFERENCES store_products(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;  -- constraint already exists
    WHEN OTHERS THEN
        RAISE WARNING 'Could not add store_product_snapshots_store_product_id_fkey: % (SQLSTATE %)', SQLERRM, SQLSTATE;
END $$;

-- state_id -> states (reference cleared on delete).
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'store_product_snapshots_state_id_fkey'
          AND conrelid = 'store_product_snapshots'::regclass
    ) THEN
        ALTER TABLE store_product_snapshots
            ADD CONSTRAINT store_product_snapshots_state_id_fkey
            FOREIGN KEY (state_id) REFERENCES states(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;  -- constraint already exists
    WHEN OTHERS THEN
        RAISE WARNING 'Could not add store_product_snapshots_state_id_fkey: % (SQLSTATE %)', SQLERRM, SQLSTATE;
END $$;

-- crawl_run_id -> crawl_runs (reference cleared on delete).
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'store_product_snapshots_crawl_run_id_fkey'
          AND conrelid = 'store_product_snapshots'::regclass
    ) THEN
        ALTER TABLE store_product_snapshots
            ADD CONSTRAINT store_product_snapshots_crawl_run_id_fkey
            FOREIGN KEY (crawl_run_id) REFERENCES crawl_runs(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;  -- constraint already exists
    WHEN OTHERS THEN
        RAISE WARNING 'Could not add store_product_snapshots_crawl_run_id_fkey: % (SQLSTATE %)', SQLERRM, SQLSTATE;
END $$;

-- Indexes optimized for analytics queries. Nullable columns get partial
-- indexes so rows holding NULL in that column are left out of the index.
CREATE INDEX IF NOT EXISTS idx_snapshots_dispensary_captured
    ON store_product_snapshots (dispensary_id, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_state_captured
    ON store_product_snapshots (state_id, captured_at DESC)
    WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_product_captured
    ON store_product_snapshots (store_product_id, captured_at DESC)
    WHERE store_product_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_crawl_run
    ON store_product_snapshots (crawl_run_id)
    WHERE crawl_run_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_captured_at
    ON store_product_snapshots (captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_brand
    ON store_product_snapshots (brand_name)
    WHERE brand_name IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_provider_product
    ON store_product_snapshots (provider_product_id)
    WHERE provider_product_id IS NOT NULL;

COMMENT ON TABLE store_product_snapshots
    IS 'Historical crawl data. One row per product per crawl. NEVER DELETE.';
-- ============================================================================
-- SECTION 7: VIEWS FOR BACKWARD COMPATIBILITY
-- ============================================================================
-- View: Latest snapshot per store product
-- For every (dispensary_id, provider_product_id) pair keep the most recent
-- snapshot row. DISTINCT ON keeps the first row of each group in ORDER BY order.
-- captured_at defaults to NOW(), which is constant within a transaction, so two
-- snapshots of the same product can share a timestamp; the trailing id DESC
-- makes the pick deterministic (highest id wins) instead of arbitrary.
CREATE OR REPLACE VIEW v_latest_store_snapshots AS
SELECT DISTINCT ON (sps.dispensary_id, sps.provider_product_id)
    sps.*
FROM store_product_snapshots AS sps
ORDER BY
    sps.dispensary_id,
    sps.provider_product_id,
    sps.captured_at DESC,
    sps.id DESC;
-- View: Crawl run summary per dispensary
-- One row per dispensary with its current product counts and latest crawl info.
--
-- Products and crawl runs are aggregated in separate LATERAL subqueries rather
-- than being joined side by side. Joining store_products and crawl_runs on the
-- same dispensary builds (products x crawl runs) intermediate rows per
-- dispensary, which then need COUNT(DISTINCT ...) to give correct numbers and
-- grow with crawl history. An aggregate without GROUP BY always yields exactly
-- one row, so dispensaries with no products still appear with counts of 0 and
-- dispensaries with no crawl runs get NULL for last_crawl_at.
-- Output column names, order and types are unchanged.
CREATE OR REPLACE VIEW v_dispensary_crawl_summary AS
SELECT
    d.id AS dispensary_id,
    COALESCE(d.dba_name, d.name) AS dispensary_name,
    d.city,
    d.state,
    d.state_id,
    s.name AS state_name,
    sp.current_product_count,
    sp.in_stock_count,
    sp.on_special_count,
    cr.last_crawl_at,
    -- Status of the most recently started run (NULL when there are no runs).
    (
        SELECT r.status
        FROM crawl_runs AS r
        WHERE r.dispensary_id = d.id
        ORDER BY r.started_at DESC
        LIMIT 1
    ) AS last_crawl_status
FROM dispensaries AS d
LEFT JOIN states AS s
    ON s.id = d.state_id
CROSS JOIN LATERAL (
    SELECT
        COUNT(*) AS current_product_count,
        COUNT(*) FILTER (WHERE p.is_in_stock) AS in_stock_count,
        COUNT(*) FILTER (WHERE p.is_on_special) AS on_special_count
    FROM store_products AS p
    WHERE p.dispensary_id = d.id
) AS sp
CROSS JOIN LATERAL (
    SELECT MAX(r.finished_at) AS last_crawl_at
    FROM crawl_runs AS r
    WHERE r.dispensary_id = d.id
) AS cr;
-- ============================================================================
-- MIGRATION 051 COMPLETE
-- ============================================================================
-- Emit a one-row marker so the end of the script is visible in psql output.
SELECT
    'Migration 051 completed successfully. Canonical schema is ready.' AS status;

View File

@@ -0,0 +1,98 @@
-- Migration 051: Create materialized view for state metrics
-- Used by Analytics V2 state endpoints for fast aggregated queries
-- Canonical tables: states, dispensaries, store_products, store_product_snapshots, brands
-- Drop existing view if it exists (for clean recreation)
DROP MATERIALIZED VIEW IF EXISTS mv_state_metrics;

-- Materialized view: one row per state with dispensary, product, brand,
-- category, price and snapshot metrics.
--
-- Schema verified via information_schema on 2025-12-06
-- Real columns used:
--   states: id, code, name, recreational_legal, medical_legal, rec_year, med_year
--   dispensaries: id, state_id (NO is_active column)
--   store_products: id, dispensary_id, brand_id, category_raw, price_rec, price_med, is_in_stock
--   store_product_snapshots: id, store_product_id, captured_at
--   brands: id (joined via sp.brand_id)
--
-- The three grains (dispensary, product, snapshot) are aggregated in separate
-- CTEs. Aggregating prices after joining snapshots repeats each product row
-- once per snapshot, which weights AVG and the median by how many snapshots a
-- product has; here every product contributes exactly once to the price
-- statistics.
--
-- NOTE(review): the legalization columns on states and store_products.brand_id
-- are added with ADD COLUMN IF NOT EXISTS by higher-numbered migrations in this
-- change set - confirm they already exist when this file runs.
CREATE MATERIALIZED VIEW mv_state_metrics AS
WITH dispensary_metrics AS (
    -- Dispensaries per state, including those without any products.
    SELECT
        d.state_id,
        COUNT(DISTINCT d.id) AS dispensary_count
    FROM dispensaries AS d
    WHERE d.state_id IS NOT NULL
    GROUP BY d.state_id
),
product_metrics AS (
    -- One input row per store product, so price aggregates are unweighted.
    SELECT
        d.state_id,
        COUNT(DISTINCT sp.id) AS total_products,
        COUNT(DISTINCT sp.id) FILTER (WHERE sp.is_in_stock = TRUE) AS in_stock_products,
        COUNT(DISTINCT sp.id) FILTER (WHERE sp.is_in_stock = FALSE) AS out_of_stock_products,
        -- Brand metrics (using brand_id FK, not brand_name)
        COUNT(DISTINCT sp.brand_id) FILTER (WHERE sp.brand_id IS NOT NULL) AS unique_brands,
        -- Category metrics (using category_raw, not category)
        COUNT(DISTINCT sp.category_raw) FILTER (WHERE sp.category_raw IS NOT NULL) AS unique_categories,
        -- Pricing metrics (recreational), in-stock products only
        AVG(sp.price_rec)
            FILTER (WHERE sp.price_rec IS NOT NULL AND sp.is_in_stock = TRUE) AS avg_price_rec,
        PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY sp.price_rec)
            FILTER (WHERE sp.price_rec IS NOT NULL AND sp.is_in_stock = TRUE) AS median_price_rec,
        MIN(sp.price_rec)
            FILTER (WHERE sp.price_rec IS NOT NULL AND sp.is_in_stock = TRUE) AS min_price_rec,
        MAX(sp.price_rec)
            FILTER (WHERE sp.price_rec IS NOT NULL AND sp.is_in_stock = TRUE) AS max_price_rec,
        -- Pricing metrics (medical), in-stock products only
        AVG(sp.price_med)
            FILTER (WHERE sp.price_med IS NOT NULL AND sp.is_in_stock = TRUE) AS avg_price_med,
        PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY sp.price_med)
            FILTER (WHERE sp.price_med IS NOT NULL AND sp.is_in_stock = TRUE) AS median_price_med
    FROM dispensaries AS d
    INNER JOIN store_products AS sp
        ON sp.dispensary_id = d.id
    WHERE d.state_id IS NOT NULL
    GROUP BY d.state_id
),
snapshot_metrics AS (
    -- Snapshot volume and first/last capture time per state.
    SELECT
        d.state_id,
        COUNT(sps.id) AS total_snapshots,
        MAX(sps.captured_at) AS last_crawl_at,
        MIN(sps.captured_at) AS first_crawl_at
    FROM dispensaries AS d
    INNER JOIN store_products AS sp
        ON sp.dispensary_id = d.id
    INNER JOIN store_product_snapshots AS sps
        ON sps.store_product_id = sp.id
    WHERE d.state_id IS NOT NULL
    GROUP BY d.state_id
)
SELECT
    s.id AS state_id,
    s.code AS state,
    s.name AS state_name,
    COALESCE(s.recreational_legal, FALSE) AS recreational_legal,
    COALESCE(s.medical_legal, FALSE) AS medical_legal,
    s.rec_year,
    s.med_year,
    -- Dispensary metrics
    COALESCE(dm.dispensary_count, 0) AS dispensary_count,
    -- Product metrics
    COALESCE(pm.total_products, 0) AS total_products,
    COALESCE(pm.in_stock_products, 0) AS in_stock_products,
    COALESCE(pm.out_of_stock_products, 0) AS out_of_stock_products,
    COALESCE(pm.unique_brands, 0) AS unique_brands,
    COALESCE(pm.unique_categories, 0) AS unique_categories,
    -- Pricing metrics stay NULL when a state has no priced in-stock products
    pm.avg_price_rec,
    pm.median_price_rec,
    pm.min_price_rec,
    pm.max_price_rec,
    pm.avg_price_med,
    pm.median_price_med,
    -- Snapshot/crawl metrics
    COALESCE(sm.total_snapshots, 0) AS total_snapshots,
    sm.last_crawl_at,
    sm.first_crawl_at,
    -- Data freshness, evaluated at refresh time
    CASE
        WHEN sm.last_crawl_at > NOW() - INTERVAL '24 hours' THEN 'fresh'
        WHEN sm.last_crawl_at > NOW() - INTERVAL '7 days' THEN 'recent'
        WHEN sm.last_crawl_at IS NOT NULL THEN 'stale'
        ELSE 'no_data'
    END AS data_freshness,
    -- Metadata
    NOW() AS refreshed_at
FROM states AS s
LEFT JOIN dispensary_metrics AS dm
    ON dm.state_id = s.id
LEFT JOIN product_metrics AS pm
    ON pm.state_id = s.id
LEFT JOIN snapshot_metrics AS sm
    ON sm.state_id = s.id;
-- Unique lookup by state code. The REFRESH ... CONCURRENTLY command named in
-- the view comment below requires a unique index on the materialized view.
CREATE UNIQUE INDEX IF NOT EXISTS mv_state_metrics_state_idx ON mv_state_metrics (state);

-- Join support on the numeric state id.
CREATE INDEX IF NOT EXISTS mv_state_metrics_state_id_idx ON mv_state_metrics (state_id);

-- Filtering by legal status.
CREATE INDEX IF NOT EXISTS mv_state_metrics_legal_idx ON mv_state_metrics (recreational_legal, medical_legal);

-- Filtering by data freshness bucket.
CREATE INDEX IF NOT EXISTS mv_state_metrics_freshness_idx ON mv_state_metrics (data_freshness);

-- Describe the view in the catalog.
COMMENT ON MATERIALIZED VIEW mv_state_metrics IS
    'Aggregated state-level metrics for Analytics V2 endpoints. Refresh periodically with: REFRESH MATERIALIZED VIEW CONCURRENTLY mv_state_metrics;';

-- Record this migration; an already-present version row is left untouched.
INSERT INTO schema_migrations (version, name, applied_at)
VALUES ('051', 'create_mv_state_metrics', NOW())
ON CONFLICT (version) DO NOTHING;

View File

@@ -0,0 +1,96 @@
-- Migration 052: Add provider_data JSONB and frequently-queried columns
--
-- Adds hybrid storage for legacy data:
-- 1. provider_data JSONB on both tables for all extra fields
-- 2. Specific columns for frequently-queried fields
-- ============================================================================
-- store_products: Add provider_data and queryable columns
-- ============================================================================
-- store_products: provider_data JSONB for all extra provider-specific data,
-- plus the frequently-queried columns kept as real columns. Issued as a single
-- ALTER; every clause is guarded by IF NOT EXISTS so it can be re-run.
ALTER TABLE store_products
    ADD COLUMN IF NOT EXISTS provider_data JSONB,
    ADD COLUMN IF NOT EXISTS strain_type TEXT,
    ADD COLUMN IF NOT EXISTS medical_only BOOLEAN DEFAULT FALSE,
    ADD COLUMN IF NOT EXISTS rec_only BOOLEAN DEFAULT FALSE,
    ADD COLUMN IF NOT EXISTS brand_logo_url TEXT,
    ADD COLUMN IF NOT EXISTS platform_dispensary_id TEXT;

-- strain_type lookups; rows without a strain type are left out of the index.
CREATE INDEX IF NOT EXISTS idx_store_products_strain_type
    ON store_products (strain_type)
    WHERE strain_type IS NOT NULL;

-- Medical / recreational filtering.
CREATE INDEX IF NOT EXISTS idx_store_products_medical_rec
    ON store_products (medical_only, rec_only);

-- GIN index for queries into the provider_data JSONB.
CREATE INDEX IF NOT EXISTS idx_store_products_provider_data
    ON store_products USING GIN (provider_data);
-- ============================================================================
-- store_product_snapshots: Add provider_data and queryable columns
-- ============================================================================
-- store_product_snapshots: provider_data JSONB for all extra provider-specific
-- data, plus the frequently-queried flags. Single ALTER, all clauses guarded
-- by IF NOT EXISTS.
ALTER TABLE store_product_snapshots
    ADD COLUMN IF NOT EXISTS provider_data JSONB,
    ADD COLUMN IF NOT EXISTS featured BOOLEAN DEFAULT FALSE,
    ADD COLUMN IF NOT EXISTS is_below_threshold BOOLEAN DEFAULT FALSE,
    ADD COLUMN IF NOT EXISTS is_below_kiosk_threshold BOOLEAN DEFAULT FALSE;

-- Featured products per dispensary; only featured rows are indexed.
CREATE INDEX IF NOT EXISTS idx_snapshots_featured
    ON store_product_snapshots (dispensary_id, featured)
    WHERE featured = TRUE;

-- Low stock alerts per dispensary; only below-threshold rows are indexed.
CREATE INDEX IF NOT EXISTS idx_snapshots_below_threshold
    ON store_product_snapshots (dispensary_id, is_below_threshold)
    WHERE is_below_threshold = TRUE;

-- GIN index for queries into the provider_data JSONB.
CREATE INDEX IF NOT EXISTS idx_snapshots_provider_data
    ON store_product_snapshots USING GIN (provider_data);
-- ============================================================================
-- Comments for documentation
-- ============================================================================
-- Catalog descriptions for the store_products columns added above.
COMMENT ON COLUMN store_products.provider_data IS
    'JSONB blob containing all provider-specific fields not in canonical columns (effects, terpenes, cannabinoids_v2, etc.)';

COMMENT ON COLUMN store_products.strain_type IS
    'Cannabis strain type: Indica, Sativa, Hybrid, Indica-Hybrid, Sativa-Hybrid';

COMMENT ON COLUMN store_products.platform_dispensary_id IS
    'Provider platform dispensary ID (e.g., Dutchie MongoDB ObjectId)';

-- Catalog descriptions for the store_product_snapshots columns added above.
COMMENT ON COLUMN store_product_snapshots.provider_data IS
    'JSONB blob containing all provider-specific snapshot fields (options, kiosk data, etc.)';

COMMENT ON COLUMN store_product_snapshots.featured IS
    'Whether product was featured/highlighted at capture time';

COMMENT ON COLUMN store_product_snapshots.is_below_threshold IS
    'Whether product was below inventory threshold at capture time';

View File

@@ -0,0 +1,127 @@
-- ============================================================================
-- Migration 052: Add Cannabis Legalization Flags to States
-- ============================================================================
--
-- Purpose: Add recreational/medical cannabis legalization status and years
-- to the existing states table, then seed all 50 states + DC.
--
-- SAFETY RULES:
-- - Uses ADD COLUMN IF NOT EXISTS (idempotent)
-- - Uses INSERT ... ON CONFLICT (code) DO UPDATE (idempotent)
-- - NO DROP, DELETE, TRUNCATE, or destructive operations
-- - Safe to run multiple times
--
-- Run with:
-- psql "$DATABASE_URL" -f migrations/052_add_state_cannabis_flags.sql
--
-- ============================================================================
-- ============================================================================
-- SECTION 1: Add cannabis legalization columns
-- ============================================================================
-- Legalization status and year, recreational and medical. Added in one ALTER;
-- each clause is guarded by IF NOT EXISTS so the statement is re-runnable.
ALTER TABLE states
    ADD COLUMN IF NOT EXISTS recreational_legal BOOLEAN,
    ADD COLUMN IF NOT EXISTS rec_year INTEGER,
    ADD COLUMN IF NOT EXISTS medical_legal BOOLEAN,
    ADD COLUMN IF NOT EXISTS med_year INTEGER;

COMMENT ON COLUMN states.recreational_legal IS
    'Whether recreational cannabis is legal in this state';
COMMENT ON COLUMN states.rec_year IS
    'Year recreational cannabis was legalized (NULL if not legal)';
COMMENT ON COLUMN states.medical_legal IS
    'Whether medical cannabis is legal in this state';
COMMENT ON COLUMN states.med_year IS
    'Year medical cannabis was legalized (NULL if not legal)';
-- ============================================================================
-- SECTION 2: Seed all 50 states + DC with cannabis legalization data
-- ============================================================================
-- Data sourced from state legalization records as of 2024
-- Upserts one row per state code: 25 recreational + medical, 18 medical only,
-- 8 with no program (51 rows = 50 states + DC).
-- Rows are grouped by program type; within a group they are ordered by the
-- relevant legalization year (rec year for the first group, med year for the
-- second).
-- Value order: code, name, IANA timezone, recreational_legal, rec_year,
-- medical_legal, med_year.
INSERT INTO states (code, name, timezone, recreational_legal, rec_year, medical_legal, med_year)
VALUES
-- Recreational + Medical States (ordered by rec year)
('WA', 'Washington', 'America/Los_Angeles', TRUE, 2012, TRUE, 1998),
('CO', 'Colorado', 'America/Denver', TRUE, 2012, TRUE, 2000),
('AK', 'Alaska', 'America/Anchorage', TRUE, 2014, TRUE, 1998),
('OR', 'Oregon', 'America/Los_Angeles', TRUE, 2014, TRUE, 1998),
('DC', 'District of Columbia', 'America/New_York', TRUE, 2015, TRUE, 2011),
('CA', 'California', 'America/Los_Angeles', TRUE, 2016, TRUE, 1996),
('NV', 'Nevada', 'America/Los_Angeles', TRUE, 2016, TRUE, 1998),
('ME', 'Maine', 'America/New_York', TRUE, 2016, TRUE, 1999),
('MA', 'Massachusetts', 'America/New_York', TRUE, 2016, TRUE, 2012),
('MI', 'Michigan', 'America/Detroit', TRUE, 2018, TRUE, 2008),
('IL', 'Illinois', 'America/Chicago', TRUE, 2019, TRUE, 2013),
('AZ', 'Arizona', 'America/Phoenix', TRUE, 2020, TRUE, 2010),
('MT', 'Montana', 'America/Denver', TRUE, 2020, TRUE, 2004),
('NJ', 'New Jersey', 'America/New_York', TRUE, 2020, TRUE, 2010),
('VT', 'Vermont', 'America/New_York', TRUE, 2020, TRUE, 2004),
('CT', 'Connecticut', 'America/New_York', TRUE, 2021, TRUE, 2012),
('NM', 'New Mexico', 'America/Denver', TRUE, 2021, TRUE, 2007),
('NY', 'New York', 'America/New_York', TRUE, 2021, TRUE, 2014),
('VA', 'Virginia', 'America/New_York', TRUE, 2021, TRUE, 2020),
('MD', 'Maryland', 'America/New_York', TRUE, 2022, TRUE, 2013),
('MO', 'Missouri', 'America/Chicago', TRUE, 2022, TRUE, 2018),
('RI', 'Rhode Island', 'America/New_York', TRUE, 2022, TRUE, 2006),
('DE', 'Delaware', 'America/New_York', TRUE, 2023, TRUE, 2011),
('MN', 'Minnesota', 'America/Chicago', TRUE, 2023, TRUE, 2014),
('OH', 'Ohio', 'America/New_York', TRUE, 2023, TRUE, 2016),
-- Medical Only States (no recreational)
('HI', 'Hawaii', 'Pacific/Honolulu', FALSE, NULL, TRUE, 2000),
('NH', 'New Hampshire', 'America/New_York', FALSE, NULL, TRUE, 2013),
('GA', 'Georgia', 'America/New_York', FALSE, NULL, TRUE, 2015),
('LA', 'Louisiana', 'America/Chicago', FALSE, NULL, TRUE, 2015),
('TX', 'Texas', 'America/Chicago', FALSE, NULL, TRUE, 2015),
('AR', 'Arkansas', 'America/Chicago', FALSE, NULL, TRUE, 2016),
('FL', 'Florida', 'America/New_York', FALSE, NULL, TRUE, 2016),
('ND', 'North Dakota', 'America/Chicago', FALSE, NULL, TRUE, 2016),
('PA', 'Pennsylvania', 'America/New_York', FALSE, NULL, TRUE, 2016),
('IA', 'Iowa', 'America/Chicago', FALSE, NULL, TRUE, 2017),
('WV', 'West Virginia', 'America/New_York', FALSE, NULL, TRUE, 2017),
('OK', 'Oklahoma', 'America/Chicago', FALSE, NULL, TRUE, 2018),
('UT', 'Utah', 'America/Denver', FALSE, NULL, TRUE, 2018),
('SD', 'South Dakota', 'America/Chicago', FALSE, NULL, TRUE, 2020),
('AL', 'Alabama', 'America/Chicago', FALSE, NULL, TRUE, 2021),
('MS', 'Mississippi', 'America/Chicago', FALSE, NULL, TRUE, 2022),
('KY', 'Kentucky', 'America/New_York', FALSE, NULL, TRUE, 2023),
('NE', 'Nebraska', 'America/Chicago', FALSE, NULL, TRUE, 2024),
-- No Cannabis Programs (neither rec nor medical)
('ID', 'Idaho', 'America/Boise', FALSE, NULL, FALSE, NULL),
('IN', 'Indiana', 'America/Indiana/Indianapolis', FALSE, NULL, FALSE, NULL),
('KS', 'Kansas', 'America/Chicago', FALSE, NULL, FALSE, NULL),
('NC', 'North Carolina', 'America/New_York', FALSE, NULL, FALSE, NULL),
('SC', 'South Carolina', 'America/New_York', FALSE, NULL, FALSE, NULL),
('TN', 'Tennessee', 'America/Chicago', FALSE, NULL, FALSE, NULL),
('WI', 'Wisconsin', 'America/Chicago', FALSE, NULL, FALSE, NULL),
('WY', 'Wyoming', 'America/Denver', FALSE, NULL, FALSE, NULL)
-- Existing rows (matched on code) are updated in place: name and the four
-- legalization fields are overwritten with the seeded values.
ON CONFLICT (code) DO UPDATE SET
name = EXCLUDED.name,
-- Keep a timezone that is already stored; only fill it in when it is NULL.
timezone = COALESCE(states.timezone, EXCLUDED.timezone),
recreational_legal = EXCLUDED.recreational_legal,
rec_year = EXCLUDED.rec_year,
medical_legal = EXCLUDED.medical_legal,
med_year = EXCLUDED.med_year,
-- NOTE(review): relies on states.updated_at existing; it is not added by this migration.
updated_at = NOW();
-- ============================================================================
-- SECTION 3: Add indexes for common queries
-- ============================================================================
-- Partial indexes holding only the states where each flag is TRUE.
CREATE INDEX IF NOT EXISTS idx_states_recreational
    ON states (recreational_legal)
    WHERE recreational_legal = TRUE;

CREATE INDEX IF NOT EXISTS idx_states_medical
    ON states (medical_legal)
    WHERE medical_legal = TRUE;
-- ============================================================================
-- SECTION 4: Verification query (informational only)
-- ============================================================================
-- Single pass over states; each FILTER clause yields one of the bucket counts.
SELECT
    'Migration 052 completed successfully.' AS status,
    COUNT(*) FILTER (WHERE recreational_legal = TRUE) AS rec_states,
    COUNT(*) FILTER (WHERE medical_legal = TRUE AND recreational_legal = FALSE) AS med_only_states,
    COUNT(*) FILTER (WHERE medical_legal = FALSE OR medical_legal IS NULL) AS no_program_states,
    COUNT(*) AS total_states
FROM states;

View File

@@ -0,0 +1,249 @@
-- ============================================================================
-- Migration 052: Hydration Schema Alignment
-- ============================================================================
--
-- Purpose: Add columns to canonical tables needed for hydration from
-- dutchie_products and dutchie_product_snapshots.
--
-- This migration ensures store_products and store_product_snapshots can
-- receive all data from the legacy dutchie_* tables.
--
-- SAFETY RULES:
-- - ALL columns use ADD COLUMN IF NOT EXISTS
-- - NO DROP, DELETE, TRUNCATE, or destructive operations
-- - Fully idempotent - safe to run multiple times
--
-- Run with:
-- psql "postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus" \
-- -f migrations/052_hydration_schema_alignment.sql
--
-- ============================================================================
-- ============================================================================
-- SECTION 1: store_products - Additional columns from dutchie_products
-- ============================================================================
-- Columns store_products needs in order to receive data from dutchie_products.
-- One ALTER statement; every clause uses IF NOT EXISTS, so columns that are
-- already present are skipped. Column order matches the original migration.
ALTER TABLE store_products
    -- Brand ID from Dutchie GraphQL (brandId field)
    ADD COLUMN IF NOT EXISTS provider_brand_id VARCHAR(100),
    -- Legacy dutchie_products.id for cross-reference during migration
    ADD COLUMN IF NOT EXISTS legacy_dutchie_product_id INTEGER,
    -- THC/CBD content as text (from dutchie_products.thc_content/cbd_content)
    ADD COLUMN IF NOT EXISTS thc_content_text VARCHAR(50),
    ADD COLUMN IF NOT EXISTS cbd_content_text VARCHAR(50),
    -- Full cannabinoid data
    ADD COLUMN IF NOT EXISTS cannabinoids JSONB,
    -- Effects array
    ADD COLUMN IF NOT EXISTS effects TEXT[],
    -- Type (Flower, Edible, etc.) - maps to category in legacy; kept separate
    -- from the existing category column because the two may differ
    ADD COLUMN IF NOT EXISTS product_type VARCHAR(100),
    -- Additional images array
    ADD COLUMN IF NOT EXISTS additional_images TEXT[],
    -- Local image paths (from 032 migration)
    ADD COLUMN IF NOT EXISTS local_image_url TEXT,
    ADD COLUMN IF NOT EXISTS local_image_thumb_url TEXT,
    ADD COLUMN IF NOT EXISTS local_image_medium_url TEXT,
    ADD COLUMN IF NOT EXISTS original_image_url TEXT,
    -- Status from Dutchie (Active/Inactive)
    ADD COLUMN IF NOT EXISTS platform_status VARCHAR(20),
    -- Threshold flags
    ADD COLUMN IF NOT EXISTS is_below_threshold BOOLEAN DEFAULT FALSE,
    ADD COLUMN IF NOT EXISTS is_below_kiosk_threshold BOOLEAN DEFAULT FALSE,
    -- cName / slug from Dutchie
    ADD COLUMN IF NOT EXISTS c_name VARCHAR(255),
    -- Coming soon flag
    ADD COLUMN IF NOT EXISTS is_coming_soon BOOLEAN DEFAULT FALSE,
    -- Provider column already exists; make sure provider_dispensary_id does too
    ADD COLUMN IF NOT EXISTS provider_dispensary_id VARCHAR(100),
    -- (Enterprise product ID for cross-store linking already exists from migration 051)
    -- Total quantity available (from POSMetaData.children)
    ADD COLUMN IF NOT EXISTS total_quantity_available INTEGER,
    ADD COLUMN IF NOT EXISTS total_kiosk_quantity_available INTEGER,
    -- Weight
    ADD COLUMN IF NOT EXISTS weight VARCHAR(50),
    -- Options array (size/weight options)
    ADD COLUMN IF NOT EXISTS options TEXT[],
    -- Measurements
    ADD COLUMN IF NOT EXISTS measurements JSONB,
    -- Raw data from last crawl
    ADD COLUMN IF NOT EXISTS raw_data JSONB,
    -- Source timestamps from Dutchie
    ADD COLUMN IF NOT EXISTS source_created_at TIMESTAMPTZ,
    ADD COLUMN IF NOT EXISTS source_updated_at TIMESTAMPTZ;
-- ============================================================================
-- SECTION 2: store_product_snapshots - Additional columns for hydration
-- ============================================================================
-- Columns store_product_snapshots needs for hydration from
-- dutchie_product_snapshots. One ALTER; all clauses guarded by IF NOT EXISTS.
ALTER TABLE store_product_snapshots
    -- Legacy dutchie_product_snapshot.id for cross-reference
    ADD COLUMN IF NOT EXISTS legacy_snapshot_id INTEGER,
    -- Legacy dutchie_product_id reference
    ADD COLUMN IF NOT EXISTS legacy_dutchie_product_id INTEGER,
    -- Options JSONB from dutchie_product_snapshots
    ADD COLUMN IF NOT EXISTS options JSONB,
    -- Provider dispensary ID
    ADD COLUMN IF NOT EXISTS provider_dispensary_id VARCHAR(100),
    -- Inventory details
    ADD COLUMN IF NOT EXISTS total_quantity_available INTEGER,
    ADD COLUMN IF NOT EXISTS total_kiosk_quantity_available INTEGER,
    -- Platform status at time of snapshot
    ADD COLUMN IF NOT EXISTS platform_status VARCHAR(20),
    -- Threshold flags at time of snapshot
    ADD COLUMN IF NOT EXISTS is_below_threshold BOOLEAN DEFAULT FALSE,
    ADD COLUMN IF NOT EXISTS is_below_kiosk_threshold BOOLEAN DEFAULT FALSE,
    -- Special data
    ADD COLUMN IF NOT EXISTS special_data JSONB,
    ADD COLUMN IF NOT EXISTS special_name TEXT,
    -- Pricing mode (rec/med)
    ADD COLUMN IF NOT EXISTS pricing_type VARCHAR(10),
    -- Crawl mode (mode_a/mode_b)
    ADD COLUMN IF NOT EXISTS crawl_mode VARCHAR(20);
-- ============================================================================
-- SECTION 3: crawl_runs - Additional columns for hydration
-- ============================================================================
-- Columns crawl_runs needs for hydration. One ALTER; all clauses guarded by
-- IF NOT EXISTS.
ALTER TABLE crawl_runs
    -- Legacy job ID references
    ADD COLUMN IF NOT EXISTS legacy_dispensary_crawl_job_id INTEGER,
    ADD COLUMN IF NOT EXISTS legacy_job_run_log_id INTEGER,
    -- Schedule reference
    ADD COLUMN IF NOT EXISTS schedule_id INTEGER,
    -- Job type
    ADD COLUMN IF NOT EXISTS job_type VARCHAR(50),
    -- Brands found count
    ADD COLUMN IF NOT EXISTS brands_found INTEGER DEFAULT 0,
    -- Retry count
    ADD COLUMN IF NOT EXISTS retry_count INTEGER DEFAULT 0;
-- ============================================================================
-- SECTION 4: INDEXES for hydration queries
-- ============================================================================
-- Legacy-id lookups used during migration. Each index is partial and leaves
-- out rows whose legacy id is NULL.
CREATE INDEX IF NOT EXISTS idx_store_products_legacy_id
    ON store_products (legacy_dutchie_product_id)
    WHERE legacy_dutchie_product_id IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_snapshots_legacy_id
    ON store_product_snapshots (legacy_snapshot_id)
    WHERE legacy_snapshot_id IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_snapshots_legacy_product_id
    ON store_product_snapshots (legacy_dutchie_product_id)
    WHERE legacy_dutchie_product_id IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_crawl_runs_legacy_job_id
    ON crawl_runs (legacy_dispensary_crawl_job_id)
    WHERE legacy_dispensary_crawl_job_id IS NOT NULL;

-- provider_product_id lookups for upserts.
CREATE INDEX IF NOT EXISTS idx_store_products_provider_id
    ON store_products (provider_product_id);

-- Canonical key lookup: (dispensary_id, provider, provider_product_id).
CREATE INDEX IF NOT EXISTS idx_store_products_canonical_key
    ON store_products (dispensary_id, provider, provider_product_id);
-- ============================================================================
-- SECTION 5: Unique constraint for idempotent hydration
-- ============================================================================
-- Ensure unique snapshots per product per crawl.
-- This prevents duplicate snapshots during re-runs of hydration.
--
-- Implemented as a partial unique index limited to rows where both
-- store_product_id and crawl_run_id are populated, so rows missing either
-- reference are not covered by the rule.
--
-- The step is best-effort: a failure does not abort the migration. It is
-- reported with RAISE WARNING rather than discarded, so an index that could
-- not be built (for example because duplicate rows already exist) is visible
-- in the migration output.
DO $$
BEGIN
    -- Skip when a constraint with this name is already present.
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'store_product_snapshots_unique_per_crawl'
    ) THEN
        CREATE UNIQUE INDEX IF NOT EXISTS idx_snapshots_unique_per_crawl
            ON store_product_snapshots (store_product_id, crawl_run_id)
            WHERE store_product_id IS NOT NULL AND crawl_run_id IS NOT NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        -- Already present: nothing to do.
        NULL;
    WHEN OTHERS THEN
        RAISE WARNING 'idx_snapshots_unique_per_crawl was not created: % (SQLSTATE %)',
            SQLERRM, SQLSTATE;
END $$;
-- ============================================================================
-- SECTION 6: View for hydration status monitoring
-- ============================================================================
-- View: hydration progress, one row per legacy source table.
--   source_count   rows in the legacy table (completed jobs only for
--                  dispensary_crawl_jobs)
--   hydrated_count canonical rows that carry the matching legacy id
--   hydration_pct  hydrated_count as a percentage of source_count, rounded to
--                  2 decimals; NULL when source_count is 0
-- The counts are written once in the inner query and the percentage is derived
-- from them in the outer SELECT.
CREATE OR REPLACE VIEW v_hydration_status AS
SELECT
    c.source_table,
    c.source_count,
    c.hydrated_count,
    ROUND(100.0 * c.hydrated_count / NULLIF(c.source_count, 0), 2) AS hydration_pct
FROM (
    SELECT
        'dutchie_products' AS source_table,
        (SELECT COUNT(*) FROM dutchie_products) AS source_count,
        (SELECT COUNT(*) FROM store_products WHERE legacy_dutchie_product_id IS NOT NULL) AS hydrated_count
    UNION ALL
    SELECT
        'dutchie_product_snapshots' AS source_table,
        (SELECT COUNT(*) FROM dutchie_product_snapshots) AS source_count,
        (SELECT COUNT(*) FROM store_product_snapshots WHERE legacy_snapshot_id IS NOT NULL) AS hydrated_count
    UNION ALL
    SELECT
        'dispensary_crawl_jobs' AS source_table,
        (SELECT COUNT(*) FROM dispensary_crawl_jobs WHERE status = 'completed') AS source_count,
        (SELECT COUNT(*) FROM crawl_runs WHERE legacy_dispensary_crawl_job_id IS NOT NULL) AS hydrated_count
) AS c;
-- ============================================================================
-- DONE
-- ============================================================================
SELECT 'Migration 052 completed successfully. Hydration schema aligned.' AS status;

View File

@@ -0,0 +1,157 @@
-- ============================================================================
-- Migration 053: Analytics Engine Indexes
-- ============================================================================
--
-- Purpose: Add indexes optimized for analytics queries on canonical tables.
-- These indexes support price trends, brand penetration, category
-- growth, and state-level analytics.
--
-- SAFETY RULES:
-- - Uses CREATE INDEX IF NOT EXISTS (idempotent)
-- - Uses ADD COLUMN IF NOT EXISTS for helper columns
-- - NO DROP, DELETE, TRUNCATE, or destructive operations
-- - Safe to run multiple times
--
-- Run with:
-- psql "$DATABASE_URL" -f migrations/053_analytics_indexes.sql
--
-- ============================================================================
-- ============================================================================
-- SECTION 1: Helper columns for analytics (if missing)
-- ============================================================================
-- Helper column: normalized brand_id on store_products for brand analytics
-- joins (brand_name already exists).
ALTER TABLE store_products
    ADD COLUMN IF NOT EXISTS brand_id INTEGER;

-- Helper column: category on snapshots for time-series category analytics.
ALTER TABLE store_product_snapshots
    ADD COLUMN IF NOT EXISTS category VARCHAR(100);
-- ============================================================================
-- SECTION 2: Price Analytics Indexes
-- ============================================================================
-- Per-product price history, newest first; both price columns are carried in
-- the index key.
CREATE INDEX IF NOT EXISTS idx_snapshots_product_price_time
    ON store_product_snapshots (store_product_id, captured_at DESC, price_rec, price_med)
    WHERE store_product_id IS NOT NULL;

-- Per-category recreational price history (category price trends).
CREATE INDEX IF NOT EXISTS idx_snapshots_category_price_time
    ON store_product_snapshots (category, captured_at DESC, price_rec)
    WHERE category IS NOT NULL;

-- Most recent price changes first (volatility analysis).
CREATE INDEX IF NOT EXISTS idx_products_price_change
    ON store_products (last_price_change_at DESC)
    WHERE last_price_change_at IS NOT NULL;
-- ============================================================================
-- SECTION 3: Brand Penetration Indexes
-- ============================================================================
-- Brand x dispensary (penetration counts).
CREATE INDEX IF NOT EXISTS idx_products_brand_dispensary
    ON store_products (brand_name, dispensary_id)
    WHERE brand_name IS NOT NULL;

-- Brand x state (state-level brand analytics); both columns must be set.
CREATE INDEX IF NOT EXISTS idx_products_brand_state
    ON store_products (brand_name, state_id)
    WHERE brand_name IS NOT NULL AND state_id IS NOT NULL;

-- Brand x first_seen_at (penetration trends).
CREATE INDEX IF NOT EXISTS idx_products_brand_first_seen
    ON store_products (brand_name, first_seen_at)
    WHERE brand_name IS NOT NULL;
-- ============================================================================
-- SECTION 4: Category Analytics Indexes
-- ============================================================================
-- Category x state (state-level category analytics).
CREATE INDEX IF NOT EXISTS idx_products_category_state
    ON store_products (category, state_id)
    WHERE category IS NOT NULL;

-- Category x dispensary.
CREATE INDEX IF NOT EXISTS idx_products_category_dispensary
    ON store_products (category, dispensary_id)
    WHERE category IS NOT NULL;

-- Category x first_seen_at (growth tracking).
CREATE INDEX IF NOT EXISTS idx_products_category_first_seen
    ON store_products (category, first_seen_at)
    WHERE category IS NOT NULL;
-- ============================================================================
-- SECTION 5: Store Analytics Indexes
-- ============================================================================
-- Products added / removed per dispensary, newest first.
CREATE INDEX IF NOT EXISTS idx_products_dispensary_first_seen
    ON store_products (dispensary_id, first_seen_at DESC);

CREATE INDEX IF NOT EXISTS idx_products_dispensary_last_seen
    ON store_products (dispensary_id, last_seen_at DESC);

-- Stock status changes per dispensary, newest first; rows that never changed
-- stock status are left out.
CREATE INDEX IF NOT EXISTS idx_products_stock_change
    ON store_products (dispensary_id, last_stock_change_at DESC)
    WHERE last_stock_change_at IS NOT NULL;
-- ============================================================================
-- SECTION 6: State Analytics Indexes
-- ============================================================================
-- Dispensaries per state. Despite the "_active" suffix in the name, the only
-- predicate is state_id IS NOT NULL.
CREATE INDEX IF NOT EXISTS idx_dispensaries_state_active
    ON dispensaries (state_id)
    WHERE state_id IS NOT NULL;

-- Products per state, split by stock flag.
CREATE INDEX IF NOT EXISTS idx_products_state_active
    ON store_products (state_id, is_in_stock)
    WHERE state_id IS NOT NULL;

-- Snapshots per state for time-series, newest first.
CREATE INDEX IF NOT EXISTS idx_snapshots_state_time
    ON store_product_snapshots (state_id, captured_at DESC)
    WHERE state_id IS NOT NULL;
-- ============================================================================
-- SECTION 7: Composite indexes for common analytics queries
-- ============================================================================
-- Brand, category, state (market share calculations).
CREATE INDEX IF NOT EXISTS idx_products_brand_category_state
    ON store_products (brand_name, category, state_id)
    WHERE brand_name IS NOT NULL AND category IS NOT NULL;

-- Dispensary, category, brand (store-level brand analysis).
CREATE INDEX IF NOT EXISTS idx_products_disp_cat_brand
    ON store_products (dispensary_id, category, brand_name)
    WHERE category IS NOT NULL;

-- Products currently on special, by category (promo analysis).
CREATE INDEX IF NOT EXISTS idx_products_special_category
    ON store_products (category, is_on_special)
    WHERE is_on_special = TRUE;
-- ============================================================================
-- SECTION 8: Verification
-- ============================================================================
-- Informational summary: how many indexes carry each naming prefix.
-- "_" is a single-character wildcard in LIKE, so the underscores in the
-- prefixes are escaped (with "!") to be matched literally; only the trailing
-- "%" acts as a wildcard.
SELECT
    'Migration 053 completed successfully.' AS status,
    (
        SELECT COUNT(*)
        FROM pg_indexes
        WHERE indexname LIKE 'idx!_products!_%' ESCAPE '!'
    ) AS product_indexes,
    (
        SELECT COUNT(*)
        FROM pg_indexes
        WHERE indexname LIKE 'idx!_snapshots!_%' ESCAPE '!'
    ) AS snapshot_indexes;

View File

@@ -0,0 +1,346 @@
-- ============================================================================
-- Migration 053: Dutchie Discovery Schema
-- ============================================================================
--
-- Purpose: Create tables for Dutchie store discovery workflow.
-- Stores are discovered and held in staging tables until verified,
-- then promoted to the canonical dispensaries table.
--
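-- Objects Created:
-- - dutchie_discovery_cities: City pages from Dutchie
-- - dutchie_discovery_locations: Individual store locations
-- - v_discovery_status, v_discovery_pending, v_discovery_cities_status: monitoring views
-- Also upserts the Canadian provinces and territories into the existing states table.
--
-- SAFETY RULES:
-- - ALL tables use CREATE TABLE IF NOT EXISTS
-- - NO DROP, DELETE, TRUNCATE, or destructive operations
-- - Does NOT change rows or columns of the canonical dispensaries table
--   (it only adds a foreign key that references dispensaries.id)
-- - Safe to run multiple times (a re-run only refreshes name and updated_at
--   on the province rows in states)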
--
-- Run with:
-- psql "postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus" \
-- -f migrations/053_dutchie_discovery_schema.sql
--
-- ============================================================================
-- ============================================================================
-- SECTION 1: DUTCHIE_DISCOVERY_CITIES
-- ============================================================================
-- One row per Dutchie city page; the crawler walks these systematically and a
-- single city may yield many dispensary locations.
CREATE TABLE IF NOT EXISTS dutchie_discovery_cities (
    id              BIGSERIAL   PRIMARY KEY,

    -- Source platform (kept generic so other platforms can be added later)
    platform        TEXT        NOT NULL DEFAULT 'dutchie',

    -- Which city this row describes
    city_name       TEXT        NOT NULL,
    city_slug       TEXT        NOT NULL,
    state_code      TEXT,                           -- e.g. 'AZ', 'CA', 'ON'
    country_code    TEXT        NOT NULL DEFAULT 'US',

    -- Crawl bookkeeping
    last_crawled_at TIMESTAMPTZ,
    crawl_enabled   BOOLEAN     NOT NULL DEFAULT TRUE,
    location_count  INTEGER,                        -- locations found in this city

    -- Free-form extras
    notes           TEXT,
    metadata        JSONB,

    created_at      TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at      TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Add the natural-key unique constraint unless this table already has it.
-- NOTE: state_code is nullable and NULLs never compare equal in a UNIQUE
-- constraint, so rows without a state_code are not deduplicated by it.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'dutchie_discovery_cities_unique'
          -- Scope the lookup to this table; constraint names are not global.
          AND conrelid = 'dutchie_discovery_cities'::regclass
    ) THEN
        ALTER TABLE dutchie_discovery_cities
            ADD CONSTRAINT dutchie_discovery_cities_unique
            UNIQUE (platform, country_code, state_code, city_slug);
    END IF;
EXCEPTION
    -- Name already taken (constraint, or the index that backs it): treat as done.
    WHEN duplicate_object OR duplicate_table THEN
        NULL;
    -- Stay best-effort, but report the failure (e.g. existing duplicate rows)
    -- instead of hiding it.
    WHEN OTHERS THEN
        RAISE WARNING 'Could not add constraint dutchie_discovery_cities_unique: % (SQLSTATE %)',
            SQLERRM, SQLSTATE;
END $$;
-- Secondary indexes for dutchie_discovery_cities

-- Lookup by platform
CREATE INDEX IF NOT EXISTS idx_discovery_cities_platform
    ON dutchie_discovery_cities (platform);

-- Lookup by country, then state
CREATE INDEX IF NOT EXISTS idx_discovery_cities_state
    ON dutchie_discovery_cities (country_code, state_code);

-- Partial: only cities whose crawl is switched on
CREATE INDEX IF NOT EXISTS idx_discovery_cities_crawl_enabled
    ON dutchie_discovery_cities (crawl_enabled)
    WHERE crawl_enabled = TRUE;

-- Ordering / filtering by last crawl time
CREATE INDEX IF NOT EXISTS idx_discovery_cities_last_crawled
    ON dutchie_discovery_cities (last_crawled_at);

COMMENT ON TABLE dutchie_discovery_cities
    IS 'City pages from Dutchie for systematic store discovery.';
-- ============================================================================
-- SECTION 2: DUTCHIE_DISCOVERY_LOCATIONS
-- ============================================================================
-- Staging table for individual stores found on Dutchie. A row is NOT promoted
-- to the canonical dispensaries table until it has been verified.
CREATE TABLE IF NOT EXISTS dutchie_discovery_locations (
    id                   BIGSERIAL        PRIMARY KEY,

    -- Source platform and its identifiers
    platform             TEXT             NOT NULL DEFAULT 'dutchie',
    platform_location_id TEXT             NOT NULL,  -- Dutchie's internal Location ID
    platform_slug        TEXT             NOT NULL,  -- URL slug for the store
    platform_menu_url    TEXT             NOT NULL,  -- full menu URL

    -- Display name of the store
    name                 TEXT             NOT NULL,

    -- Address, both raw and split into parts
    raw_address          TEXT,
    address_line1        TEXT,
    address_line2        TEXT,
    city                 TEXT,
    state_code           TEXT,                       -- e.g. 'AZ', 'CA', 'ON'
    postal_code          TEXT,
    country_code         TEXT,                       -- 'US' or 'CA'

    -- Geolocation
    latitude             DOUBLE PRECISION,
    longitude            DOUBLE PRECISION,
    timezone             TEXT,

    -- Workflow status. Values used by this schema:
    --   discovered : just found, not yet verified
    --   verified   : verified and promoted to canonical dispensaries
    --   rejected   : manually rejected (e.g. duplicate, test store)
    --   merged     : linked to an existing canonical dispensary
    status               TEXT             NOT NULL DEFAULT 'discovered',

    -- Canonical dispensary this row maps to (filled only after verification)
    dispensary_id        INTEGER,

    -- City page this location was discovered through
    discovery_city_id    BIGINT,

    -- Raw payload from Dutchie plus free-form notes
    metadata             JSONB,
    notes                TEXT,

    -- Capabilities as reported by Dutchie
    offers_delivery      BOOLEAN,
    offers_pickup        BOOLEAN,
    is_recreational      BOOLEAN,
    is_medical           BOOLEAN,

    -- Lifecycle tracking
    first_seen_at        TIMESTAMPTZ      NOT NULL DEFAULT NOW(),
    last_seen_at         TIMESTAMPTZ      NOT NULL DEFAULT NOW(),
    last_checked_at      TIMESTAMPTZ,
    verified_at          TIMESTAMPTZ,
    verified_by          TEXT,                       -- user who verified
    active               BOOLEAN          NOT NULL DEFAULT TRUE,

    created_at           TIMESTAMPTZ      NOT NULL DEFAULT NOW(),
    updated_at           TIMESTAMPTZ      NOT NULL DEFAULT NOW()
);
-- Add the (platform, platform_location_id) unique constraint unless this
-- table already has it.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'dutchie_discovery_locations_platform_id_unique'
          -- Scope the lookup to this table; constraint names are not global.
          AND conrelid = 'dutchie_discovery_locations'::regclass
    ) THEN
        ALTER TABLE dutchie_discovery_locations
            ADD CONSTRAINT dutchie_discovery_locations_platform_id_unique
            UNIQUE (platform, platform_location_id);
    END IF;
EXCEPTION
    -- Name already taken (constraint, or the index that backs it): treat as done.
    WHEN duplicate_object OR duplicate_table THEN
        NULL;
    -- Stay best-effort, but report the failure (e.g. existing duplicate rows)
    -- instead of hiding it.
    WHEN OTHERS THEN
        RAISE WARNING 'Could not add constraint dutchie_discovery_locations_platform_id_unique: % (SQLSTATE %)',
            SQLERRM, SQLSTATE;
END $$;
-- Add the slug-per-place unique constraint unless this table already has it.
-- NOTE: country_code, state_code and city are nullable and NULLs never compare
-- equal in a UNIQUE constraint, so rows missing any of them are not
-- deduplicated by it.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'dutchie_discovery_locations_slug_unique'
          -- Scope the lookup to this table; constraint names are not global.
          AND conrelid = 'dutchie_discovery_locations'::regclass
    ) THEN
        ALTER TABLE dutchie_discovery_locations
            ADD CONSTRAINT dutchie_discovery_locations_slug_unique
            UNIQUE (platform, platform_slug, country_code, state_code, city);
    END IF;
EXCEPTION
    -- Name already taken (constraint, or the index that backs it): treat as done.
    WHEN duplicate_object OR duplicate_table THEN
        NULL;
    -- Stay best-effort, but report the failure (e.g. existing duplicate rows)
    -- instead of hiding it.
    WHEN OTHERS THEN
        RAISE WARNING 'Could not add constraint dutchie_discovery_locations_slug_unique: % (SQLSTATE %)',
            SQLERRM, SQLSTATE;
END $$;
-- Add the FK to dispensaries unless this table already has it.
-- dispensary_id stays nullable; deleting a dispensary only clears the link.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'dutchie_discovery_locations_dispensary_fk'
          -- Scope the lookup to this table; constraint names are not global.
          AND conrelid = 'dutchie_discovery_locations'::regclass
    ) THEN
        ALTER TABLE dutchie_discovery_locations
            ADD CONSTRAINT dutchie_discovery_locations_dispensary_fk
            FOREIGN KEY (dispensary_id) REFERENCES dispensaries(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    -- Constraint name already taken on this table: treat as done.
    WHEN duplicate_object OR duplicate_table THEN
        NULL;
    -- Stay best-effort, but report the failure (e.g. dispensaries missing or
    -- orphaned dispensary_id values) instead of hiding it.
    WHEN OTHERS THEN
        RAISE WARNING 'Could not add constraint dutchie_discovery_locations_dispensary_fk: % (SQLSTATE %)',
            SQLERRM, SQLSTATE;
END $$;
-- Add the FK to dutchie_discovery_cities unless this table already has it.
-- Deleting a city row only clears discovery_city_id on its locations.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'dutchie_discovery_locations_city_fk'
          -- Scope the lookup to this table; constraint names are not global.
          AND conrelid = 'dutchie_discovery_locations'::regclass
    ) THEN
        ALTER TABLE dutchie_discovery_locations
            ADD CONSTRAINT dutchie_discovery_locations_city_fk
            FOREIGN KEY (discovery_city_id) REFERENCES dutchie_discovery_cities(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    -- Constraint name already taken on this table: treat as done.
    WHEN duplicate_object OR duplicate_table THEN
        NULL;
    -- Stay best-effort, but report the failure (e.g. orphaned
    -- discovery_city_id values) instead of hiding it.
    WHEN OTHERS THEN
        RAISE WARNING 'Could not add constraint dutchie_discovery_locations_city_fk: % (SQLSTATE %)',
            SQLERRM, SQLSTATE;
END $$;
-- Secondary indexes for dutchie_discovery_locations

-- Plain lookups
CREATE INDEX IF NOT EXISTS idx_discovery_locations_platform
    ON dutchie_discovery_locations (platform);

CREATE INDEX IF NOT EXISTS idx_discovery_locations_status
    ON dutchie_discovery_locations (status);

CREATE INDEX IF NOT EXISTS idx_discovery_locations_state
    ON dutchie_discovery_locations (country_code, state_code);

CREATE INDEX IF NOT EXISTS idx_discovery_locations_city
    ON dutchie_discovery_locations (city, state_code);

-- Partial: only rows already linked to a canonical dispensary
CREATE INDEX IF NOT EXISTS idx_discovery_locations_dispensary
    ON dutchie_discovery_locations (dispensary_id)
    WHERE dispensary_id IS NOT NULL;

-- Partial: the review queue (status 'discovered'), newest first
CREATE INDEX IF NOT EXISTS idx_discovery_locations_discovered
    ON dutchie_discovery_locations (status, first_seen_at DESC)
    WHERE status = 'discovered';

-- Partial: only active rows
CREATE INDEX IF NOT EXISTS idx_discovery_locations_active
    ON dutchie_discovery_locations (active)
    WHERE active = TRUE;

-- Partial: only rows that have both coordinates
CREATE INDEX IF NOT EXISTS idx_discovery_locations_coords
    ON dutchie_discovery_locations (latitude, longitude)
    WHERE latitude IS NOT NULL
      AND longitude IS NOT NULL;

COMMENT ON TABLE dutchie_discovery_locations
    IS 'Discovered store locations from Dutchie. Held in staging until verified.';
-- ============================================================================
-- SECTION 3: ADD CANADIAN PROVINCES TO STATES TABLE
-- ============================================================================
-- Upserts the Canadian provinces and territories, keyed on code.
-- On a code collision the name is overwritten, an existing non-NULL timezone
-- is kept, and updated_at is bumped; is_active and crawl_enabled are left as
-- they were.
INSERT INTO states AS s (code, name, timezone, is_active, crawl_enabled)
VALUES
    ('AB', 'Alberta',                   'America/Edmonton',    TRUE, TRUE),
    ('BC', 'British Columbia',          'America/Vancouver',   TRUE, TRUE),
    ('MB', 'Manitoba',                  'America/Winnipeg',    TRUE, TRUE),
    ('NB', 'New Brunswick',             'America/Moncton',     TRUE, TRUE),
    ('NL', 'Newfoundland and Labrador', 'America/St_Johns',    TRUE, TRUE),
    ('NS', 'Nova Scotia',               'America/Halifax',     TRUE, TRUE),
    ('NT', 'Northwest Territories',     'America/Yellowknife', TRUE, TRUE),
    ('NU', 'Nunavut',                   'America/Iqaluit',     TRUE, TRUE),
    ('ON', 'Ontario',                   'America/Toronto',     TRUE, TRUE),
    ('PE', 'Prince Edward Island',      'America/Halifax',     TRUE, TRUE),
    ('QC', 'Quebec',                    'America/Montreal',    TRUE, TRUE),
    ('SK', 'Saskatchewan',              'America/Regina',      TRUE, TRUE),
    ('YT', 'Yukon',                     'America/Whitehorse',  TRUE, TRUE)
ON CONFLICT (code) DO UPDATE
SET
    name       = EXCLUDED.name,
    timezone   = COALESCE(s.timezone, EXCLUDED.timezone),
    updated_at = NOW();
-- ============================================================================
-- SECTION 4: VIEWS FOR DISCOVERY MONITORING
-- ============================================================================

-- v_discovery_status: one row per (platform, country, state, status) with the
-- number of locations, how many are linked to a dispensary, and the first/last
-- time anything in the group was seen.
CREATE OR REPLACE VIEW v_discovery_status AS
SELECT
    loc.platform,
    loc.country_code,
    loc.state_code,
    loc.status,
    COUNT(*)                                              AS location_count,
    COUNT(*) FILTER (WHERE loc.dispensary_id IS NOT NULL) AS linked_count,
    MIN(loc.first_seen_at)                                AS earliest_discovery,
    MAX(loc.last_seen_at)                                 AS latest_activity
FROM dutchie_discovery_locations AS loc
GROUP BY
    loc.platform,
    loc.country_code,
    loc.state_code,
    loc.status
ORDER BY
    loc.country_code,
    loc.state_code,
    loc.status;
-- v_discovery_pending: active locations still in status 'discovered', i.e. the
-- queue awaiting a verify/reject decision. The discovery city name is attached
-- when the location references one (NULL otherwise).
CREATE OR REPLACE VIEW v_discovery_pending AS
SELECT
    loc.id,
    loc.platform,
    loc.name,
    loc.city,
    loc.state_code,
    loc.country_code,
    loc.platform_menu_url,
    loc.first_seen_at,
    loc.last_seen_at,
    loc.offers_delivery,
    loc.offers_pickup,
    loc.is_recreational,
    loc.is_medical,
    cty.city_name AS discovery_city_name
FROM dutchie_discovery_locations AS loc
LEFT JOIN dutchie_discovery_cities AS cty
    ON cty.id = loc.discovery_city_id
WHERE loc.active = TRUE
  AND loc.status = 'discovered'
ORDER BY
    loc.state_code,
    loc.city,
    loc.name;
-- v_discovery_cities_status: one row per discovery city with its crawl
-- settings, the stored location_count, and live counts of attached locations
-- (total, pending, verified, rejected). Cities with no locations report zeros.
CREATE OR REPLACE VIEW v_discovery_cities_status AS
SELECT
    cty.id,
    cty.platform,
    cty.city_name,
    cty.state_code,
    cty.country_code,
    cty.crawl_enabled,
    cty.last_crawled_at,
    cty.location_count,
    COUNT(loc.id)                                          AS actual_locations,
    COUNT(loc.id) FILTER (WHERE loc.status = 'discovered') AS pending_count,
    COUNT(loc.id) FILTER (WHERE loc.status = 'verified')   AS verified_count,
    COUNT(loc.id) FILTER (WHERE loc.status = 'rejected')   AS rejected_count
FROM dutchie_discovery_cities AS cty
LEFT JOIN dutchie_discovery_locations AS loc
    ON loc.discovery_city_id = cty.id
GROUP BY
    cty.id,
    cty.platform,
    cty.city_name,
    cty.state_code,
    cty.country_code,
    cty.crawl_enabled,
    cty.last_crawled_at,
    cty.location_count
ORDER BY
    cty.country_code,
    cty.state_code,
    cty.city_name;
-- ============================================================================
-- DONE
-- ============================================================================
-- Emit a one-row status message so the psql output shows the script reached the end.
SELECT
    'Migration 053 completed successfully. Discovery schema created.' AS status;

View File

@@ -0,0 +1,49 @@
-- Migration 054: Worker Metadata for Named Workforce
-- Adds worker_name and worker_role to job tables for displaying friendly worker identities
--
-- job_run_logs and dispensary_crawl_jobs are not guaranteed to exist on every
-- database (migration 056 creates / guards them), so every statement touching
-- them is wrapped in an existence check. Without the guards this script aborts
-- on such databases and later migrations never get a chance to repair them.

-- Add worker metadata columns to job_schedules
ALTER TABLE job_schedules
    ADD COLUMN IF NOT EXISTS worker_name VARCHAR(50),
    ADD COLUMN IF NOT EXISTS worker_role VARCHAR(100);

COMMENT ON COLUMN job_schedules.worker_name IS 'Friendly name for the worker (e.g., Alice, Henry, Bella, Oscar)';
COMMENT ON COLUMN job_schedules.worker_role IS 'Description of worker role (e.g., Store Discovery Worker, GraphQL Product Sync)';

-- Add worker metadata columns (and the worker-name index) to job_run_logs
DO $$
BEGIN
    -- to_regclass resolves the name through search_path, exactly like the
    -- ALTER TABLE below, and returns NULL when the table is absent.
    IF to_regclass('job_run_logs') IS NOT NULL THEN
        ALTER TABLE job_run_logs
            ADD COLUMN IF NOT EXISTS worker_name VARCHAR(50),
            ADD COLUMN IF NOT EXISTS run_role VARCHAR(100);

        COMMENT ON COLUMN job_run_logs.worker_name IS 'Name of the worker that executed this run (copied from schedule)';
        COMMENT ON COLUMN job_run_logs.run_role IS 'Role description for this specific run';

        CREATE INDEX IF NOT EXISTS idx_job_run_logs_worker_name
            ON job_run_logs(worker_name);
    ELSE
        RAISE NOTICE 'job_run_logs does not exist; skipping its worker metadata columns';
    END IF;
END $$;

-- Add worker_name to dispensary_crawl_jobs (for tracking which named worker enqueued it)
DO $$
BEGIN
    IF to_regclass('dispensary_crawl_jobs') IS NOT NULL THEN
        ALTER TABLE dispensary_crawl_jobs
            ADD COLUMN IF NOT EXISTS enqueued_by_worker VARCHAR(50);

        COMMENT ON COLUMN dispensary_crawl_jobs.enqueued_by_worker IS 'Name of the worker that enqueued this job';

        CREATE INDEX IF NOT EXISTS idx_dispensary_crawl_jobs_enqueued_by
            ON dispensary_crawl_jobs(enqueued_by_worker);
    ELSE
        RAISE NOTICE 'dispensary_crawl_jobs does not exist; skipping enqueued_by_worker';
    END IF;
END $$;

-- Update existing schedules with worker names.
-- Only schedules that have no worker_name yet are touched, so re-runs never
-- overwrite a name assigned later.
UPDATE job_schedules SET
    worker_name = 'Bella',
    worker_role = 'GraphQL Product Sync'
WHERE job_name = 'dutchie_az_product_crawl' AND worker_name IS NULL;

UPDATE job_schedules SET
    worker_name = 'Henry',
    worker_role = 'Entry Point Finder'
WHERE job_name = 'dutchie_az_menu_detection' AND worker_name IS NULL;

UPDATE job_schedules SET
    worker_name = 'Alice',
    worker_role = 'Store Discovery'
WHERE job_name = 'dutchie_store_discovery' AND worker_name IS NULL;

UPDATE job_schedules SET
    worker_name = 'Oscar',
    worker_role = 'Analytics Refresh'
WHERE job_name = 'analytics_refresh' AND worker_name IS NULL;

View File

@@ -0,0 +1,123 @@
-- Migration 055: Workforce System Enhancements
-- Adds visibility tracking, slug change tracking, and scope support for workers
-- ============================================================
-- 1. VISIBILITY TRACKING FOR BELLA (Product Sync)
-- ============================================================

-- Three columns on dutchie_products: a lost flag plus the timestamps of the
-- latest loss and the latest restoration.
ALTER TABLE dutchie_products
    ADD COLUMN IF NOT EXISTS visibility_lost        BOOLEAN DEFAULT FALSE,
    ADD COLUMN IF NOT EXISTS visibility_lost_at     TIMESTAMPTZ,
    ADD COLUMN IF NOT EXISTS visibility_restored_at TIMESTAMPTZ;

COMMENT ON COLUMN dutchie_products.visibility_lost
    IS 'True if product disappeared from GraphQL results';
COMMENT ON COLUMN dutchie_products.visibility_lost_at
    IS 'When product was last marked as visibility lost';
COMMENT ON COLUMN dutchie_products.visibility_restored_at
    IS 'When product reappeared after being lost';

-- Partial index covering only the products currently flagged as lost,
-- keyed by dispensary.
CREATE INDEX IF NOT EXISTS idx_dutchie_products_visibility_lost
    ON dutchie_products (dispensary_id, visibility_lost)
    WHERE visibility_lost = TRUE;
-- ============================================================
-- 2. SLUG CHANGE TRACKING FOR ALICE (Store Discovery)
-- ============================================================

-- Slug-history and retirement columns on the discovery staging table.
-- NOTE(review): previous_slug is capped at 255 characters while platform_slug
-- is unbounded TEXT; confirm slugs can never exceed the cap.
ALTER TABLE dutchie_discovery_locations
    ADD COLUMN IF NOT EXISTS slug_changed_at   TIMESTAMPTZ,
    ADD COLUMN IF NOT EXISTS previous_slug     VARCHAR(255),
    ADD COLUMN IF NOT EXISTS retired_at        TIMESTAMPTZ,
    ADD COLUMN IF NOT EXISTS retirement_reason VARCHAR(100);

COMMENT ON COLUMN dutchie_discovery_locations.slug_changed_at
    IS 'When the platform slug was last changed';
COMMENT ON COLUMN dutchie_discovery_locations.previous_slug
    IS 'Previous slug before the last change';
COMMENT ON COLUMN dutchie_discovery_locations.retired_at
    IS 'When store was marked as retired/removed';
COMMENT ON COLUMN dutchie_discovery_locations.retirement_reason
    IS 'Reason for retirement (removed_from_source, closed, etc.)';

-- Partial index: only rows that have a retirement timestamp.
CREATE INDEX IF NOT EXISTS idx_dutchie_discovery_locations_retired
    ON dutchie_discovery_locations (retired_at)
    WHERE retired_at IS NOT NULL;
-- ============================================================
-- 3. ID RESOLUTION TRACKING FOR HENRY (Entry Point Finder)
-- ============================================================

-- Bookkeeping for platform_dispensary_id resolution: last attempt time,
-- attempt counter (starts at 0) and the most recent error text.
ALTER TABLE dispensaries
    ADD COLUMN IF NOT EXISTS last_id_resolution_at  TIMESTAMPTZ,
    ADD COLUMN IF NOT EXISTS id_resolution_attempts INTEGER DEFAULT 0,
    ADD COLUMN IF NOT EXISTS id_resolution_error    TEXT;

COMMENT ON COLUMN dispensaries.last_id_resolution_at
    IS 'When platform_dispensary_id was last resolved/attempted';
COMMENT ON COLUMN dispensaries.id_resolution_attempts
    IS 'Number of resolution attempts';
COMMENT ON COLUMN dispensaries.id_resolution_error
    IS 'Last error message from resolution attempt';

-- Partial index over Dutchie stores that still have no platform_dispensary_id.
-- NOTE(review): menu_type is fixed to 'dutchie' by the predicate, so it adds
-- no selectivity as a key column.
CREATE INDEX IF NOT EXISTS idx_dispensaries_needs_resolution
    ON dispensaries (state, menu_type)
    WHERE platform_dispensary_id IS NULL
      AND menu_type = 'dutchie';
-- ============================================================
-- 4. ENHANCED CITIES TABLE FOR ALICE
-- ============================================================

-- Extra tracking columns on dutchie_discovery_cities.
-- discovered_at defaults to the time the row is inserted.
ALTER TABLE dutchie_discovery_cities
    ADD COLUMN IF NOT EXISTS state_name           VARCHAR(100),
    ADD COLUMN IF NOT EXISTS discovered_at        TIMESTAMPTZ DEFAULT NOW(),
    ADD COLUMN IF NOT EXISTS last_verified_at     TIMESTAMPTZ,
    ADD COLUMN IF NOT EXISTS store_count_reported INTEGER,
    ADD COLUMN IF NOT EXISTS store_count_actual   INTEGER;

COMMENT ON COLUMN dutchie_discovery_cities.state_name
    IS 'Full state name from source';
COMMENT ON COLUMN dutchie_discovery_cities.discovered_at
    IS 'When city was first discovered';
COMMENT ON COLUMN dutchie_discovery_cities.last_verified_at
    IS 'When city was last verified to exist';
COMMENT ON COLUMN dutchie_discovery_cities.store_count_reported
    IS 'Store count reported by source';
COMMENT ON COLUMN dutchie_discovery_cities.store_count_actual
    IS 'Actual store count from discovery';
-- ============================================================
-- 5. UPDATE WORKER ROLES (Standardize naming)
-- ============================================================

-- Rewrite the human-readable role labels to snake_case identifiers.
-- A row changes only when both its worker_name and its current worker_role
-- match one of the pairs below, so rows already converted are left alone.
UPDATE job_schedules AS js
SET worker_role = mapping.new_role
FROM (
    VALUES
        ('Alice', 'Store Discovery',      'store_discovery'),
        ('Henry', 'Entry Point Finder',   'entry_point_finder'),
        ('Bella', 'GraphQL Product Sync', 'product_sync'),
        ('Oscar', 'Analytics Refresh',    'analytics_refresh')
) AS mapping (worker_name, old_role, new_role)
WHERE js.worker_name = mapping.worker_name
  AND js.worker_role = mapping.old_role;
-- ============================================================
-- 6. VISIBILITY EVENTS IN SNAPSHOTS (JSONB approach)
-- ============================================================
-- Add visibility_events array to product snapshots metadata
-- This will store: [{event_type, timestamp, worker_name}]
-- No schema change needed - we use existing metadata JSONB column
-- ============================================================
-- 7. INDEXES FOR WORKER QUERIES
-- ============================================================

-- Recently added stores (for Henry): active discovery locations, newest first.
CREATE INDEX IF NOT EXISTS idx_dutchie_discovery_locations_created
    ON dutchie_discovery_locations (created_at DESC)
    WHERE active = TRUE;

-- Scope-based (per-state) queries: dispensaries that have a menu_type set.
CREATE INDEX IF NOT EXISTS idx_dispensaries_state_menu
    ON dispensaries (state, menu_type)
    WHERE menu_type IS NOT NULL;
-- Record migration (only when a schema_migrations table is available).
-- The guard keeps a missing bookkeeping table from failing the whole migration
-- after all schema changes above have already been made; ON CONFLICT keeps
-- re-runs from inserting a second row for version 55.
DO $$
BEGIN
    -- to_regclass resolves through search_path like the INSERT does and
    -- returns NULL when the table is absent.
    IF to_regclass('schema_migrations') IS NOT NULL THEN
        INSERT INTO schema_migrations (version, name, applied_at)
        VALUES (55, '055_workforce_enhancements', NOW())
        ON CONFLICT (version) DO NOTHING;
    ELSE
        RAISE NOTICE 'schema_migrations does not exist; migration 055 not recorded';
    END IF;
END $$;

View File

@@ -0,0 +1,110 @@
-- Migration 056: Fix Worker Metadata and Job Run Logs
--
-- This migration safely ensures all expected schema exists for:
-- 1. job_schedules - worker_name, worker_role columns
-- 2. job_run_logs - entire table creation if missing
--
-- Uses IF NOT EXISTS / ADD COLUMN IF NOT EXISTS for idempotency.
-- Safe to run on databases that already have some or all of these changes.
-- ============================================================
-- 1. ADD MISSING COLUMNS TO job_schedules
-- ============================================================

-- Both columns are added only when absent, so this is a no-op on databases
-- that already have them.
ALTER TABLE job_schedules
    ADD COLUMN IF NOT EXISTS worker_name VARCHAR(50),
    ADD COLUMN IF NOT EXISTS worker_role VARCHAR(100);

COMMENT ON COLUMN job_schedules.worker_name
    IS 'Friendly name for the worker (e.g., Alice, Henry, Bella, Oscar)';
COMMENT ON COLUMN job_schedules.worker_role
    IS 'Description of worker role (e.g., store_discovery, product_sync)';
-- ============================================================
-- 2. CREATE job_run_logs TABLE IF NOT EXISTS
-- ============================================================

-- One row per execution of a scheduled job. Rows are removed together with
-- their schedule (ON DELETE CASCADE).
CREATE TABLE IF NOT EXISTS job_run_logs (
    id              SERIAL PRIMARY KEY,
    schedule_id     INTEGER NOT NULL REFERENCES job_schedules(id) ON DELETE CASCADE,
    job_name        VARCHAR(100) NOT NULL,
    status          VARCHAR(20) NOT NULL, -- 'pending', 'running', 'success', 'error', 'partial'
    started_at      TIMESTAMPTZ,
    completed_at    TIMESTAMPTZ,
    duration_ms     INTEGER,
    error_message   TEXT,
    -- Results summary
    items_processed INTEGER DEFAULT 0,
    items_succeeded INTEGER DEFAULT 0,
    items_failed    INTEGER DEFAULT 0,
    -- Worker metadata (from scheduler.ts createRunLog function)
    worker_name     VARCHAR(50),
    run_role        VARCHAR(100),
    -- Additional run details
    metadata        JSONB,
    created_at      TIMESTAMPTZ DEFAULT NOW()
);

-- CREATE TABLE IF NOT EXISTS does nothing when job_run_logs already exists, so
-- a pre-existing table may still lack the worker columns. Add them explicitly;
-- otherwise the worker_name index below fails on such databases.
ALTER TABLE job_run_logs
    ADD COLUMN IF NOT EXISTS worker_name VARCHAR(50),
    ADD COLUMN IF NOT EXISTS run_role VARCHAR(100);

COMMENT ON COLUMN job_run_logs.worker_name IS 'Name of the worker that executed this run (copied from schedule)';
COMMENT ON COLUMN job_run_logs.run_role IS 'Role description for this specific run';

-- Create indexes if they don't exist
CREATE INDEX IF NOT EXISTS idx_job_run_logs_schedule ON job_run_logs(schedule_id);
CREATE INDEX IF NOT EXISTS idx_job_run_logs_job_name ON job_run_logs(job_name);
CREATE INDEX IF NOT EXISTS idx_job_run_logs_status ON job_run_logs(status);
CREATE INDEX IF NOT EXISTS idx_job_run_logs_created ON job_run_logs(created_at);
CREATE INDEX IF NOT EXISTS idx_job_run_logs_worker_name ON job_run_logs(worker_name);
-- ============================================================
-- 3. ADD enqueued_by_worker TO dispensary_crawl_jobs IF EXISTS
-- ============================================================

-- Adds the column, its comment and its index, but only when the table exists.
DO $$
BEGIN
    -- to_regclass resolves the name through search_path, exactly like the
    -- ALTER TABLE below. An information_schema.tables lookup by table_name
    -- alone would also match a same-named table in an unrelated schema and
    -- then let the ALTER fail.
    IF to_regclass('dispensary_crawl_jobs') IS NOT NULL THEN
        ALTER TABLE dispensary_crawl_jobs
            ADD COLUMN IF NOT EXISTS enqueued_by_worker VARCHAR(50);

        COMMENT ON COLUMN dispensary_crawl_jobs.enqueued_by_worker IS 'Name of the worker that enqueued this job';

        CREATE INDEX IF NOT EXISTS idx_dispensary_crawl_jobs_enqueued_by
            ON dispensary_crawl_jobs(enqueued_by_worker);
    END IF;
END $$;
-- ============================================================
-- 4. SEED DEFAULT WORKER NAMES FOR EXISTING SCHEDULES
-- ============================================================

-- Assign a default worker identity to the four known jobs. Only schedules that
-- have no worker_name yet are updated, so existing assignments are preserved.
UPDATE job_schedules
SET
    worker_name = CASE job_name
        WHEN 'dutchie_az_product_crawl'  THEN 'Bella'
        WHEN 'dutchie_az_menu_detection' THEN 'Henry'
        WHEN 'dutchie_store_discovery'   THEN 'Alice'
        WHEN 'analytics_refresh'         THEN 'Oscar'
    END,
    worker_role = CASE job_name
        WHEN 'dutchie_az_product_crawl'  THEN 'product_sync'
        WHEN 'dutchie_az_menu_detection' THEN 'entry_point_finder'
        WHEN 'dutchie_store_discovery'   THEN 'store_discovery'
        WHEN 'analytics_refresh'         THEN 'analytics_refresh'
    END
WHERE worker_name IS NULL
  AND job_name IN (
      'dutchie_az_product_crawl',
      'dutchie_az_menu_detection',
      'dutchie_store_discovery',
      'analytics_refresh'
  );
-- ============================================================
-- 5. RECORD MIGRATION (if schema_migrations table exists)
-- ============================================================

-- Inserts the bookkeeping row for version 56 when the table is available;
-- ON CONFLICT keeps re-runs from inserting it twice.
DO $$
BEGIN
    -- to_regclass resolves the name through search_path, exactly like the
    -- INSERT below. An information_schema.tables lookup by table_name alone
    -- would also match a same-named table in an unrelated schema and then let
    -- the INSERT fail.
    IF to_regclass('schema_migrations') IS NOT NULL THEN
        INSERT INTO schema_migrations (version, name, applied_at)
        VALUES (56, '056_fix_worker_and_run_logs', NOW())
        ON CONFLICT (version) DO NOTHING;
    END IF;
END $$;