Files
cannaiq/backend/migrations/046_crawler_reliability.sql
Kelly b4a2fb7d03 feat: Add v2 architecture with multi-state support and orchestrator services
Major additions:
- Multi-state expansion: states table, StateSelector, NationalDashboard, StateHeatmap, CrossStateCompare
- Orchestrator services: trace service, error taxonomy, retry manager, proxy rotator
- Discovery system: dutchie discovery service, geo validation, city seeding scripts
- Analytics infrastructure: analytics v2 routes, brand/pricing/stores intelligence pages
- Local development: setup-local.sh starts all 5 services (postgres, backend, cannaiq, findadispo, findagram)
- Migrations 037-056: crawler profiles, states, analytics indexes, worker metadata

Frontend pages added:
- Discovery, ChainsDashboard, IntelligenceBrands, IntelligencePricing, IntelligenceStores
- StateHeatmap, CrossStateCompare, SyncInfoPanel

Components added:
- StateSelector, OrchestratorTraceModal, WorkflowStepper

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-07 11:30:57 -07:00

352 lines
11 KiB
PL/PgSQL

-- Migration 046: Crawler Reliability & Stabilization
-- Phase 1: Add fields for error taxonomy, retry management, and self-healing
-- ============================================================
-- PART 1: Error Taxonomy - Standardized error codes
-- ============================================================
-- Create enum for standardized error codes
DO $$
BEGIN
IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'crawl_error_code') THEN
CREATE TYPE crawl_error_code AS ENUM (
'SUCCESS',
'RATE_LIMITED',
'BLOCKED_PROXY',
'HTML_CHANGED',
'TIMEOUT',
'AUTH_FAILED',
'NETWORK_ERROR',
'PARSE_ERROR',
'NO_PRODUCTS',
'UNKNOWN_ERROR'
);
END IF;
END;
$$ LANGUAGE plpgsql;
-- ============================================================
-- PART 2: Dispensary Crawl Configuration
-- ============================================================
-- Add crawl config columns to dispensaries
DO $$
BEGIN
-- Crawl frequency (minutes between crawls)
IF NOT EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name = 'dispensaries' AND column_name = 'crawl_frequency_minutes'
) THEN
ALTER TABLE dispensaries ADD COLUMN crawl_frequency_minutes INTEGER DEFAULT 240;
END IF;
-- Max retries per crawl
IF NOT EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name = 'dispensaries' AND column_name = 'max_retries'
) THEN
ALTER TABLE dispensaries ADD COLUMN max_retries INTEGER DEFAULT 3;
END IF;
-- Current proxy ID
IF NOT EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name = 'dispensaries' AND column_name = 'current_proxy_id'
) THEN
ALTER TABLE dispensaries ADD COLUMN current_proxy_id INTEGER NULL;
END IF;
-- Current user agent
IF NOT EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name = 'dispensaries' AND column_name = 'current_user_agent'
) THEN
ALTER TABLE dispensaries ADD COLUMN current_user_agent TEXT NULL;
END IF;
-- Next scheduled run
IF NOT EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name = 'dispensaries' AND column_name = 'next_crawl_at'
) THEN
ALTER TABLE dispensaries ADD COLUMN next_crawl_at TIMESTAMPTZ NULL;
END IF;
-- Last successful crawl
IF NOT EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name = 'dispensaries' AND column_name = 'last_success_at'
) THEN
ALTER TABLE dispensaries ADD COLUMN last_success_at TIMESTAMPTZ NULL;
END IF;
-- Last error code (using text for flexibility, validated in app)
IF NOT EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name = 'dispensaries' AND column_name = 'last_error_code'
) THEN
ALTER TABLE dispensaries ADD COLUMN last_error_code TEXT NULL;
END IF;
-- Crawl status: active, degraded, paused, failed
IF NOT EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name = 'dispensaries' AND column_name = 'crawl_status'
) THEN
ALTER TABLE dispensaries ADD COLUMN crawl_status TEXT DEFAULT 'active';
END IF;
-- Backoff multiplier (increases with failures)
IF NOT EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name = 'dispensaries' AND column_name = 'backoff_multiplier'
) THEN
ALTER TABLE dispensaries ADD COLUMN backoff_multiplier NUMERIC(4,2) DEFAULT 1.0;
END IF;
-- Total attempt count (lifetime)
IF NOT EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name = 'dispensaries' AND column_name = 'total_attempts'
) THEN
ALTER TABLE dispensaries ADD COLUMN total_attempts INTEGER DEFAULT 0;
END IF;
-- Total success count (lifetime)
IF NOT EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name = 'dispensaries' AND column_name = 'total_successes'
) THEN
ALTER TABLE dispensaries ADD COLUMN total_successes INTEGER DEFAULT 0;
END IF;
END;
$$ LANGUAGE plpgsql;
-- ============================================================
-- PART 3: Enhanced Job Tracking
-- ============================================================
-- Add columns to dispensary_crawl_jobs
DO $$
BEGIN
-- Error code
IF NOT EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'error_code'
) THEN
ALTER TABLE dispensary_crawl_jobs ADD COLUMN error_code TEXT NULL;
END IF;
-- Proxy used for this job
IF NOT EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'proxy_used'
) THEN
ALTER TABLE dispensary_crawl_jobs ADD COLUMN proxy_used TEXT NULL;
END IF;
-- User agent used for this job
IF NOT EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'user_agent_used'
) THEN
ALTER TABLE dispensary_crawl_jobs ADD COLUMN user_agent_used TEXT NULL;
END IF;
-- Attempt number for this job
IF NOT EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'attempt_number'
) THEN
ALTER TABLE dispensary_crawl_jobs ADD COLUMN attempt_number INTEGER DEFAULT 1;
END IF;
-- Backoff delay applied (ms)
IF NOT EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'backoff_delay_ms'
) THEN
ALTER TABLE dispensary_crawl_jobs ADD COLUMN backoff_delay_ms INTEGER DEFAULT 0;
END IF;
-- HTTP status code received
IF NOT EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'http_status'
) THEN
ALTER TABLE dispensary_crawl_jobs ADD COLUMN http_status INTEGER NULL;
END IF;
-- Response time (ms)
IF NOT EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'response_time_ms'
) THEN
ALTER TABLE dispensary_crawl_jobs ADD COLUMN response_time_ms INTEGER NULL;
END IF;
END;
$$ LANGUAGE plpgsql;
-- ============================================================
-- PART 4: Crawl History Table (for detailed tracking)
-- ============================================================
CREATE TABLE IF NOT EXISTS crawl_attempts (
id SERIAL PRIMARY KEY,
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id),
job_id INTEGER REFERENCES dispensary_crawl_jobs(id),
-- Timing
started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
finished_at TIMESTAMPTZ,
duration_ms INTEGER,
-- Result
error_code TEXT NOT NULL DEFAULT 'UNKNOWN_ERROR',
error_message TEXT,
http_status INTEGER,
-- Context
attempt_number INTEGER NOT NULL DEFAULT 1,
proxy_used TEXT,
user_agent_used TEXT,
-- Metrics
products_found INTEGER DEFAULT 0,
products_upserted INTEGER DEFAULT 0,
snapshots_created INTEGER DEFAULT 0,
-- Metadata
metadata JSONB,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Index for quick lookups
CREATE INDEX IF NOT EXISTS idx_crawl_attempts_dispensary_id ON crawl_attempts(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_crawl_attempts_error_code ON crawl_attempts(error_code);
CREATE INDEX IF NOT EXISTS idx_crawl_attempts_started_at ON crawl_attempts(started_at DESC);
-- ============================================================
-- PART 5: Views for Monitoring
-- ============================================================
-- Drop existing view if exists
DROP VIEW IF EXISTS v_crawler_status;
-- Crawler status view with all reliability fields
CREATE VIEW v_crawler_status AS
SELECT
d.id,
d.name,
d.slug,
d.menu_type,
d.platform_dispensary_id,
d.crawl_status,
d.consecutive_failures,
d.last_crawl_at,
d.last_success_at,
d.last_failure_at,
d.last_error_code,
d.next_crawl_at,
d.crawl_frequency_minutes,
d.max_retries,
d.current_proxy_id,
d.current_user_agent,
d.backoff_multiplier,
d.total_attempts,
d.total_successes,
d.product_count,
CASE
WHEN d.total_attempts > 0
THEN ROUND(d.total_successes::NUMERIC / d.total_attempts * 100, 1)
ELSE 0
END AS success_rate,
CASE
WHEN d.crawl_status = 'failed' THEN 'FAILED'
WHEN d.crawl_status = 'paused' THEN 'PAUSED'
WHEN d.crawl_status = 'degraded' THEN 'DEGRADED'
WHEN d.menu_type IS NULL OR d.menu_type = 'unknown' THEN 'NEEDS_DETECTION'
WHEN d.platform_dispensary_id IS NULL THEN 'NEEDS_PLATFORM_ID'
WHEN d.next_crawl_at IS NULL THEN 'NOT_SCHEDULED'
WHEN d.next_crawl_at <= NOW() THEN 'DUE'
ELSE 'SCHEDULED'
END AS schedule_status,
d.failed_at,
d.failure_notes
FROM dispensaries d
WHERE d.state = 'AZ';
-- Drop existing view if exists
DROP VIEW IF EXISTS v_crawl_error_summary;
-- Error summary view
CREATE VIEW v_crawl_error_summary AS
SELECT
error_code,
COUNT(*) as total_occurrences,
COUNT(DISTINCT dispensary_id) as affected_stores,
MAX(started_at) as last_occurrence,
AVG(duration_ms)::INTEGER as avg_duration_ms
FROM crawl_attempts
WHERE started_at > NOW() - INTERVAL '7 days'
GROUP BY error_code
ORDER BY total_occurrences DESC;
-- Drop existing view if exists
DROP VIEW IF EXISTS v_crawl_health;
-- Overall crawl health view
CREATE VIEW v_crawl_health AS
SELECT
COUNT(*) FILTER (WHERE crawl_status = 'active') as active_crawlers,
COUNT(*) FILTER (WHERE crawl_status = 'degraded') as degraded_crawlers,
COUNT(*) FILTER (WHERE crawl_status = 'paused') as paused_crawlers,
COUNT(*) FILTER (WHERE crawl_status = 'failed') as failed_crawlers,
COUNT(*) FILTER (WHERE next_crawl_at <= NOW()) as due_now,
COUNT(*) FILTER (WHERE consecutive_failures > 0) as stores_with_failures,
AVG(consecutive_failures)::NUMERIC(4,2) as avg_consecutive_failures,
COUNT(*) FILTER (WHERE last_success_at > NOW() - INTERVAL '24 hours') as successful_last_24h
FROM dispensaries
WHERE state = 'AZ' AND menu_type = 'dutchie';
-- ============================================================
-- PART 6: Constraint for minimum crawl gap
-- ============================================================
-- Function to check minimum crawl gap (2 minutes)
CREATE OR REPLACE FUNCTION check_minimum_crawl_gap()
RETURNS TRIGGER AS $$
BEGIN
-- Only check for new pending jobs
IF NEW.status = 'pending' AND NEW.dispensary_id IS NOT NULL THEN
-- Check if there's a recent job for same dispensary
IF EXISTS (
SELECT 1 FROM dispensary_crawl_jobs
WHERE dispensary_id = NEW.dispensary_id
AND id != NEW.id
AND status IN ('pending', 'running')
AND created_at > NOW() - INTERVAL '2 minutes'
) THEN
RAISE EXCEPTION 'Minimum 2-minute gap required between crawls for same dispensary';
END IF;
END IF;
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
-- Create trigger (drop first if exists)
DROP TRIGGER IF EXISTS enforce_minimum_crawl_gap ON dispensary_crawl_jobs;
CREATE TRIGGER enforce_minimum_crawl_gap
BEFORE INSERT ON dispensary_crawl_jobs
FOR EACH ROW
EXECUTE FUNCTION check_minimum_crawl_gap();
-- ============================================================
-- PART 7: Comments
-- ============================================================
COMMENT ON TABLE crawl_attempts IS 'Detailed history of every crawl attempt for analytics and debugging';
COMMENT ON VIEW v_crawler_status IS 'Current status of all crawlers with reliability metrics';
COMMENT ON VIEW v_crawl_error_summary IS 'Summary of errors by type over last 7 days';
COMMENT ON VIEW v_crawl_health IS 'Overall health metrics for the crawling system';