feat: Add v2 architecture with multi-state support and orchestrator services

Major additions:
- Multi-state expansion: states table, StateSelector, NationalDashboard, StateHeatmap, CrossStateCompare
- Orchestrator services: trace service, error taxonomy, retry manager, proxy rotator
- Discovery system: dutchie discovery service, geo validation, city seeding scripts
- Analytics infrastructure: analytics v2 routes, brand/pricing/stores intelligence pages
- Local development: setup-local.sh starts all 5 services (postgres, backend, cannaiq, findadispo, findagram)
- Migrations 037-056: crawler profiles, states, analytics indexes, worker metadata

Frontend pages added:
- Discovery, ChainsDashboard, IntelligenceBrands, IntelligencePricing, IntelligenceStores
- StateHeatmap, CrossStateCompare, SyncInfoPanel

Components added:
- StateSelector, OrchestratorTraceModal, WorkflowStepper

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-07 11:30:57 -07:00
parent 8ac64ba077
commit b4a2fb7d03
248 changed files with 60714 additions and 666 deletions

View File

@@ -0,0 +1,376 @@
-- Migration 041: CannaiQ Canonical Schema
--
-- This migration adds the canonical CannaiQ schema tables and columns.
-- ALL CHANGES ARE ADDITIVE - NO DROPS, NO DELETES, NO TRUNCATES.
--
-- Run with: psql $CANNAIQ_DB_URL -f migrations/041_cannaiq_canonical_schema.sql
--
-- Tables created:
-- - states (new)
-- - chains (new)
-- - brands (new)
-- - store_products (new - normalized view of current menu)
-- - store_product_snapshots (new - historical crawl data)
-- - crawl_runs (new - replaces/supplements dispensary_crawl_jobs)
--
-- Tables modified:
-- - dispensaries (add state_id, chain_id FKs)
-- - dispensary_crawler_profiles (add status, allow_autopromote, validated_at)
-- - crawl_orchestration_traces (add run_id FK)
--
-- =====================================================
-- 1) STATES TABLE
-- =====================================================
CREATE TABLE IF NOT EXISTS states (
id SERIAL PRIMARY KEY,
code VARCHAR(2) NOT NULL UNIQUE,
name VARCHAR(100) NOT NULL,
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW()
);
-- Insert known states
INSERT INTO states (code, name) VALUES
('AZ', 'Arizona'),
('CA', 'California'),
('CO', 'Colorado'),
('FL', 'Florida'),
('IL', 'Illinois'),
('MA', 'Massachusetts'),
('MD', 'Maryland'),
('MI', 'Michigan'),
('MO', 'Missouri'),
('NV', 'Nevada'),
('NJ', 'New Jersey'),
('NY', 'New York'),
('OH', 'Ohio'),
('OK', 'Oklahoma'),
('OR', 'Oregon'),
('PA', 'Pennsylvania'),
('WA', 'Washington')
ON CONFLICT (code) DO NOTHING;
COMMENT ON TABLE states IS 'US states where CannaiQ operates. Single source of truth for state codes.';
-- =====================================================
-- 2) CHAINS TABLE (retail groups)
-- =====================================================
CREATE TABLE IF NOT EXISTS chains (
id SERIAL PRIMARY KEY,
name VARCHAR(255) NOT NULL,
slug VARCHAR(255) NOT NULL UNIQUE,
website_url TEXT,
logo_url TEXT,
description TEXT,
is_active BOOLEAN DEFAULT TRUE,
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_chains_slug ON chains(slug);
CREATE INDEX IF NOT EXISTS idx_chains_active ON chains(is_active) WHERE is_active = TRUE;
COMMENT ON TABLE chains IS 'Retail chains/groups that own multiple dispensary locations (e.g., Curaleaf, Trulieve).';
-- =====================================================
-- 3) BRANDS TABLE (canonical brand catalog)
-- =====================================================
CREATE TABLE IF NOT EXISTS brands (
id SERIAL PRIMARY KEY,
name VARCHAR(255) NOT NULL,
slug VARCHAR(255) NOT NULL UNIQUE,
external_id VARCHAR(100), -- Provider-specific brand ID
website_url TEXT,
instagram_handle VARCHAR(100),
logo_url TEXT,
description TEXT,
is_portfolio_brand BOOLEAN DEFAULT FALSE, -- TRUE if brand we represent
is_active BOOLEAN DEFAULT TRUE,
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_brands_slug ON brands(slug);
CREATE INDEX IF NOT EXISTS idx_brands_external_id ON brands(external_id) WHERE external_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_brands_portfolio ON brands(is_portfolio_brand) WHERE is_portfolio_brand = TRUE;
COMMENT ON TABLE brands IS 'Canonical brand catalog. Brands may appear across multiple dispensaries.';
COMMENT ON COLUMN brands.is_portfolio_brand IS 'TRUE if this is a brand we represent/manage (vs third-party brand)';
-- =====================================================
-- 4) ADD state_id AND chain_id TO dispensaries
-- =====================================================
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS state_id INTEGER REFERENCES states(id);
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS chain_id INTEGER REFERENCES chains(id);
-- NOTE: state_id backfill is done by ETL script (042_legacy_import.ts), not this migration.
CREATE INDEX IF NOT EXISTS idx_dispensaries_state_id ON dispensaries(state_id);
CREATE INDEX IF NOT EXISTS idx_dispensaries_chain_id ON dispensaries(chain_id) WHERE chain_id IS NOT NULL;
COMMENT ON COLUMN dispensaries.state_id IS 'FK to states table. Canonical state reference.';
COMMENT ON COLUMN dispensaries.chain_id IS 'FK to chains table. NULL if independent dispensary.';
-- =====================================================
-- 5) STORE_PRODUCTS TABLE (current menu state)
-- =====================================================
-- This is the normalized "what is currently on the menu" table.
-- It supplements dutchie_products with a provider-agnostic structure.
CREATE TABLE IF NOT EXISTS store_products (
id SERIAL PRIMARY KEY,
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
product_id INTEGER REFERENCES products(id) ON DELETE SET NULL, -- Link to canonical product
brand_id INTEGER REFERENCES brands(id) ON DELETE SET NULL, -- Link to canonical brand
-- Provider-specific identifiers
provider VARCHAR(50) NOT NULL DEFAULT 'dutchie', -- dutchie, treez, jane, etc.
provider_product_id VARCHAR(100), -- Platform-specific product ID
provider_brand_id VARCHAR(100), -- Platform-specific brand ID
-- Raw data from platform (not normalized)
name_raw VARCHAR(500) NOT NULL,
brand_name_raw VARCHAR(255),
category_raw VARCHAR(100),
subcategory_raw VARCHAR(100),
-- Pricing
price_rec NUMERIC(10,2),
price_med NUMERIC(10,2),
price_rec_special NUMERIC(10,2),
price_med_special NUMERIC(10,2),
is_on_special BOOLEAN DEFAULT FALSE,
special_name TEXT,
discount_percent NUMERIC(5,2),
-- Inventory
is_in_stock BOOLEAN DEFAULT TRUE,
stock_quantity INTEGER,
stock_status VARCHAR(50) DEFAULT 'in_stock',
-- Potency
thc_percent NUMERIC(5,2),
cbd_percent NUMERIC(5,2),
-- Images
image_url TEXT,
local_image_path TEXT,
-- Timestamps
first_seen_at TIMESTAMPTZ DEFAULT NOW(),
last_seen_at TIMESTAMPTZ DEFAULT NOW(),
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW(),
UNIQUE(dispensary_id, provider, provider_product_id)
);
CREATE INDEX IF NOT EXISTS idx_store_products_dispensary ON store_products(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_store_products_product ON store_products(product_id) WHERE product_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_store_products_brand ON store_products(brand_id) WHERE brand_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_store_products_provider ON store_products(provider);
CREATE INDEX IF NOT EXISTS idx_store_products_in_stock ON store_products(dispensary_id, is_in_stock);
CREATE INDEX IF NOT EXISTS idx_store_products_special ON store_products(dispensary_id, is_on_special) WHERE is_on_special = TRUE;
CREATE INDEX IF NOT EXISTS idx_store_products_last_seen ON store_products(last_seen_at DESC);
COMMENT ON TABLE store_products IS 'Current state of products on each dispensary menu. Provider-agnostic.';
COMMENT ON COLUMN store_products.product_id IS 'FK to canonical products table. NULL if not yet mapped.';
COMMENT ON COLUMN store_products.brand_id IS 'FK to canonical brands table. NULL if not yet mapped.';
-- =====================================================
-- 6) STORE_PRODUCT_SNAPSHOTS TABLE (historical data)
-- =====================================================
-- This is the critical time-series table for analytics.
-- One row per product per crawl.
CREATE TABLE IF NOT EXISTS store_product_snapshots (
id SERIAL PRIMARY KEY,
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
store_product_id INTEGER REFERENCES store_products(id) ON DELETE SET NULL,
product_id INTEGER REFERENCES products(id) ON DELETE SET NULL,
-- Provider info
provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',
provider_product_id VARCHAR(100),
-- Link to crawl run
crawl_run_id INTEGER, -- FK added after crawl_runs table created
-- Capture timestamp
captured_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
-- Raw data from platform
name_raw VARCHAR(500),
brand_name_raw VARCHAR(255),
category_raw VARCHAR(100),
subcategory_raw VARCHAR(100),
-- Pricing at time of capture
price_rec NUMERIC(10,2),
price_med NUMERIC(10,2),
price_rec_special NUMERIC(10,2),
price_med_special NUMERIC(10,2),
is_on_special BOOLEAN DEFAULT FALSE,
discount_percent NUMERIC(5,2),
-- Inventory at time of capture
is_in_stock BOOLEAN DEFAULT TRUE,
stock_quantity INTEGER,
stock_status VARCHAR(50) DEFAULT 'in_stock',
-- Potency at time of capture
thc_percent NUMERIC(5,2),
cbd_percent NUMERIC(5,2),
-- Image URL at time of capture
image_url TEXT,
-- Full raw response for debugging
raw_data JSONB,
created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_snapshots_dispensary_captured ON store_product_snapshots(dispensary_id, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_product_captured ON store_product_snapshots(product_id, captured_at DESC) WHERE product_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_store_product ON store_product_snapshots(store_product_id) WHERE store_product_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_crawl_run ON store_product_snapshots(crawl_run_id) WHERE crawl_run_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_captured_at ON store_product_snapshots(captured_at DESC);
COMMENT ON TABLE store_product_snapshots IS 'Historical crawl data. One row per product per crawl. NEVER DELETE.';
COMMENT ON COLUMN store_product_snapshots.captured_at IS 'When this snapshot was captured (crawl time).';
-- =====================================================
-- 7) CRAWL_RUNS TABLE (job execution records)
-- =====================================================
CREATE TABLE IF NOT EXISTS crawl_runs (
id SERIAL PRIMARY KEY,
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
-- Provider
provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',
-- Execution times
started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
finished_at TIMESTAMPTZ,
duration_ms INTEGER,
-- Status
status VARCHAR(20) NOT NULL DEFAULT 'running', -- running, success, failed, partial
error_message TEXT,
-- Results
products_found INTEGER DEFAULT 0,
products_new INTEGER DEFAULT 0,
products_updated INTEGER DEFAULT 0,
snapshots_written INTEGER DEFAULT 0,
-- Metadata
worker_id VARCHAR(100),
trigger_type VARCHAR(50) DEFAULT 'scheduled', -- scheduled, manual, api
metadata JSONB DEFAULT '{}',
created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary ON crawl_runs(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_status ON crawl_runs(status);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_started ON crawl_runs(started_at DESC);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary_started ON crawl_runs(dispensary_id, started_at DESC);
COMMENT ON TABLE crawl_runs IS 'Each crawl execution. Links to snapshots and traces.';
-- Add FK from store_product_snapshots to crawl_runs
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM information_schema.table_constraints
WHERE constraint_name = 'store_product_snapshots_crawl_run_id_fkey'
) THEN
ALTER TABLE store_product_snapshots
ADD CONSTRAINT store_product_snapshots_crawl_run_id_fkey
FOREIGN KEY (crawl_run_id) REFERENCES crawl_runs(id) ON DELETE SET NULL;
END IF;
END $$;
-- =====================================================
-- 8) UPDATE crawl_orchestration_traces
-- =====================================================
-- Add run_id FK if not exists
ALTER TABLE crawl_orchestration_traces
ADD COLUMN IF NOT EXISTS crawl_run_id INTEGER REFERENCES crawl_runs(id) ON DELETE SET NULL;
CREATE INDEX IF NOT EXISTS idx_traces_crawl_run
ON crawl_orchestration_traces(crawl_run_id)
WHERE crawl_run_id IS NOT NULL;
-- =====================================================
-- 9) UPDATE dispensary_crawler_profiles
-- =====================================================
-- Add missing columns from canonical schema
ALTER TABLE dispensary_crawler_profiles
ADD COLUMN IF NOT EXISTS status VARCHAR(50) DEFAULT 'sandbox';
ALTER TABLE dispensary_crawler_profiles
ADD COLUMN IF NOT EXISTS allow_autopromote BOOLEAN DEFAULT FALSE;
ALTER TABLE dispensary_crawler_profiles
ADD COLUMN IF NOT EXISTS validated_at TIMESTAMPTZ;
CREATE INDEX IF NOT EXISTS idx_profiles_status
ON dispensary_crawler_profiles(status);
COMMENT ON COLUMN dispensary_crawler_profiles.status IS 'Profile status: sandbox, production, needs_manual, disabled';
COMMENT ON COLUMN dispensary_crawler_profiles.allow_autopromote IS 'Whether this profile can be auto-promoted from sandbox to production';
COMMENT ON COLUMN dispensary_crawler_profiles.validated_at IS 'When this profile was last validated as working';
-- =====================================================
-- 10) VIEWS FOR BACKWARD COMPATIBILITY
-- =====================================================
-- View to get latest snapshot per store product
CREATE OR REPLACE VIEW v_latest_store_snapshots AS
SELECT DISTINCT ON (dispensary_id, provider_product_id)
sps.*
FROM store_product_snapshots sps
ORDER BY dispensary_id, provider_product_id, captured_at DESC;
-- View to get crawl run summary per dispensary
CREATE OR REPLACE VIEW v_dispensary_crawl_summary AS
SELECT
d.id AS dispensary_id,
d.name AS dispensary_name,
d.city,
d.state,
COUNT(DISTINCT sp.id) AS current_product_count,
COUNT(DISTINCT sp.id) FILTER (WHERE sp.is_in_stock) AS in_stock_count,
COUNT(DISTINCT sp.id) FILTER (WHERE sp.is_on_special) AS on_special_count,
MAX(cr.finished_at) AS last_crawl_at,
(SELECT status FROM crawl_runs WHERE dispensary_id = d.id ORDER BY started_at DESC LIMIT 1) AS last_crawl_status
FROM dispensaries d
LEFT JOIN store_products sp ON sp.dispensary_id = d.id
LEFT JOIN crawl_runs cr ON cr.dispensary_id = d.id
GROUP BY d.id, d.name, d.city, d.state;
-- =====================================================
-- 11) COMMENTS
-- =====================================================
COMMENT ON TABLE states IS 'Canonical list of US states. Use state_id FK in dispensaries.';
COMMENT ON TABLE chains IS 'Retail chains (multi-location operators).';
COMMENT ON TABLE brands IS 'Canonical brand catalog across all providers.';
COMMENT ON TABLE store_products IS 'Current menu state per dispensary. Provider-agnostic.';
COMMENT ON TABLE store_product_snapshots IS 'Historical price/stock data. One row per product per crawl.';
COMMENT ON TABLE crawl_runs IS 'Crawl execution records. Links snapshots to runs.';
-- =====================================================
-- MIGRATION COMPLETE
-- =====================================================
--
-- Next steps (manual - not in this migration):
-- 1. Populate chains table from known retail groups
-- 2. Populate brands table from existing dutchie_products.brand_name
-- 3. Migrate data from dutchie_products → store_products
-- 4. Migrate data from dutchie_product_snapshots → store_product_snapshots
-- 5. Link dispensaries.chain_id to chains where applicable
--