-- Migration 046: Crawler Reliability & Stabilization -- Phase 1: Add fields for error taxonomy, retry management, and self-healing -- ============================================================ -- PART 1: Error Taxonomy - Standardized error codes -- ============================================================ -- Create enum for standardized error codes DO $$ BEGIN IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'crawl_error_code') THEN CREATE TYPE crawl_error_code AS ENUM ( 'SUCCESS', 'RATE_LIMITED', 'BLOCKED_PROXY', 'HTML_CHANGED', 'TIMEOUT', 'AUTH_FAILED', 'NETWORK_ERROR', 'PARSE_ERROR', 'NO_PRODUCTS', 'UNKNOWN_ERROR' ); END IF; END; $$ LANGUAGE plpgsql; -- ============================================================ -- PART 2: Dispensary Crawl Configuration -- ============================================================ -- Add crawl config columns to dispensaries DO $$ BEGIN -- Crawl frequency (minutes between crawls) IF NOT EXISTS ( SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensaries' AND column_name = 'crawl_frequency_minutes' ) THEN ALTER TABLE dispensaries ADD COLUMN crawl_frequency_minutes INTEGER DEFAULT 240; END IF; -- Max retries per crawl IF NOT EXISTS ( SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensaries' AND column_name = 'max_retries' ) THEN ALTER TABLE dispensaries ADD COLUMN max_retries INTEGER DEFAULT 3; END IF; -- Current proxy ID IF NOT EXISTS ( SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensaries' AND column_name = 'current_proxy_id' ) THEN ALTER TABLE dispensaries ADD COLUMN current_proxy_id INTEGER NULL; END IF; -- Current user agent IF NOT EXISTS ( SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensaries' AND column_name = 'current_user_agent' ) THEN ALTER TABLE dispensaries ADD COLUMN current_user_agent TEXT NULL; END IF; -- Next scheduled run IF NOT EXISTS ( SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensaries' AND column_name = 'next_crawl_at' ) THEN ALTER TABLE dispensaries ADD COLUMN next_crawl_at TIMESTAMPTZ NULL; END IF; -- Last successful crawl IF NOT EXISTS ( SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensaries' AND column_name = 'last_success_at' ) THEN ALTER TABLE dispensaries ADD COLUMN last_success_at TIMESTAMPTZ NULL; END IF; -- Last error code (using text for flexibility, validated in app) IF NOT EXISTS ( SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensaries' AND column_name = 'last_error_code' ) THEN ALTER TABLE dispensaries ADD COLUMN last_error_code TEXT NULL; END IF; -- Crawl status: active, degraded, paused, failed IF NOT EXISTS ( SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensaries' AND column_name = 'crawl_status' ) THEN ALTER TABLE dispensaries ADD COLUMN crawl_status TEXT DEFAULT 'active'; END IF; -- Backoff multiplier (increases with failures) IF NOT EXISTS ( SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensaries' AND column_name = 'backoff_multiplier' ) THEN ALTER TABLE dispensaries ADD COLUMN backoff_multiplier NUMERIC(4,2) DEFAULT 1.0; END IF; -- Total attempt count (lifetime) IF NOT EXISTS ( SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensaries' AND column_name = 'total_attempts' ) THEN ALTER TABLE dispensaries ADD COLUMN total_attempts INTEGER DEFAULT 0; END IF; -- Total success count (lifetime) IF NOT EXISTS ( SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensaries' AND column_name = 'total_successes' ) THEN ALTER TABLE dispensaries ADD COLUMN total_successes INTEGER DEFAULT 0; END IF; END; $$ LANGUAGE plpgsql; -- ============================================================ -- PART 3: Enhanced Job Tracking -- ============================================================ -- Add columns to dispensary_crawl_jobs DO $$ BEGIN -- Error code IF NOT EXISTS ( SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'error_code' ) THEN ALTER TABLE dispensary_crawl_jobs ADD COLUMN error_code TEXT NULL; END IF; -- Proxy used for this job IF NOT EXISTS ( SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'proxy_used' ) THEN ALTER TABLE dispensary_crawl_jobs ADD COLUMN proxy_used TEXT NULL; END IF; -- User agent used for this job IF NOT EXISTS ( SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'user_agent_used' ) THEN ALTER TABLE dispensary_crawl_jobs ADD COLUMN user_agent_used TEXT NULL; END IF; -- Attempt number for this job IF NOT EXISTS ( SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'attempt_number' ) THEN ALTER TABLE dispensary_crawl_jobs ADD COLUMN attempt_number INTEGER DEFAULT 1; END IF; -- Backoff delay applied (ms) IF NOT EXISTS ( SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'backoff_delay_ms' ) THEN ALTER TABLE dispensary_crawl_jobs ADD COLUMN backoff_delay_ms INTEGER DEFAULT 0; END IF; -- HTTP status code received IF NOT EXISTS ( SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'http_status' ) THEN ALTER TABLE dispensary_crawl_jobs ADD COLUMN http_status INTEGER NULL; END IF; -- Response time (ms) IF NOT EXISTS ( SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'response_time_ms' ) THEN ALTER TABLE dispensary_crawl_jobs ADD COLUMN response_time_ms INTEGER NULL; END IF; END; $$ LANGUAGE plpgsql; -- ============================================================ -- PART 4: Crawl History Table (for detailed tracking) -- ============================================================ CREATE TABLE IF NOT EXISTS crawl_attempts ( id SERIAL PRIMARY KEY, dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id), job_id INTEGER REFERENCES dispensary_crawl_jobs(id), -- Timing started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), finished_at TIMESTAMPTZ, duration_ms INTEGER, -- Result error_code TEXT NOT NULL DEFAULT 'UNKNOWN_ERROR', error_message TEXT, http_status INTEGER, -- Context attempt_number INTEGER NOT NULL DEFAULT 1, proxy_used TEXT, user_agent_used TEXT, -- Metrics products_found INTEGER DEFAULT 0, products_upserted INTEGER DEFAULT 0, snapshots_created INTEGER DEFAULT 0, -- Metadata metadata JSONB, created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() ); -- Index for quick lookups CREATE INDEX IF NOT EXISTS idx_crawl_attempts_dispensary_id ON crawl_attempts(dispensary_id); CREATE INDEX IF NOT EXISTS idx_crawl_attempts_error_code ON crawl_attempts(error_code); CREATE INDEX IF NOT EXISTS idx_crawl_attempts_started_at ON crawl_attempts(started_at DESC); -- ============================================================ -- PART 5: Views for Monitoring -- ============================================================ -- Drop existing view if exists DROP VIEW IF EXISTS v_crawler_status; -- Crawler status view with all reliability fields CREATE VIEW v_crawler_status AS SELECT d.id, d.name, d.slug, d.menu_type, d.platform_dispensary_id, d.crawl_status, d.consecutive_failures, d.last_crawl_at, d.last_success_at, d.last_failure_at, d.last_error_code, d.next_crawl_at, d.crawl_frequency_minutes, d.max_retries, d.current_proxy_id, d.current_user_agent, d.backoff_multiplier, d.total_attempts, d.total_successes, d.product_count, CASE WHEN d.total_attempts > 0 THEN ROUND(d.total_successes::NUMERIC / d.total_attempts * 100, 1) ELSE 0 END AS success_rate, CASE WHEN d.crawl_status = 'failed' THEN 'FAILED' WHEN d.crawl_status = 'paused' THEN 'PAUSED' WHEN d.crawl_status = 'degraded' THEN 'DEGRADED' WHEN d.menu_type IS NULL OR d.menu_type = 'unknown' THEN 'NEEDS_DETECTION' WHEN d.platform_dispensary_id IS NULL THEN 'NEEDS_PLATFORM_ID' WHEN d.next_crawl_at IS NULL THEN 'NOT_SCHEDULED' WHEN d.next_crawl_at <= NOW() THEN 'DUE' ELSE 'SCHEDULED' END AS schedule_status, d.failed_at, d.failure_notes FROM dispensaries d WHERE d.state = 'AZ'; -- Drop existing view if exists DROP VIEW IF EXISTS v_crawl_error_summary; -- Error summary view CREATE VIEW v_crawl_error_summary AS SELECT error_code, COUNT(*) as total_occurrences, COUNT(DISTINCT dispensary_id) as affected_stores, MAX(started_at) as last_occurrence, AVG(duration_ms)::INTEGER as avg_duration_ms FROM crawl_attempts WHERE started_at > NOW() - INTERVAL '7 days' GROUP BY error_code ORDER BY total_occurrences DESC; -- Drop existing view if exists DROP VIEW IF EXISTS v_crawl_health; -- Overall crawl health view CREATE VIEW v_crawl_health AS SELECT COUNT(*) FILTER (WHERE crawl_status = 'active') as active_crawlers, COUNT(*) FILTER (WHERE crawl_status = 'degraded') as degraded_crawlers, COUNT(*) FILTER (WHERE crawl_status = 'paused') as paused_crawlers, COUNT(*) FILTER (WHERE crawl_status = 'failed') as failed_crawlers, COUNT(*) FILTER (WHERE next_crawl_at <= NOW()) as due_now, COUNT(*) FILTER (WHERE consecutive_failures > 0) as stores_with_failures, AVG(consecutive_failures)::NUMERIC(4,2) as avg_consecutive_failures, COUNT(*) FILTER (WHERE last_success_at > NOW() - INTERVAL '24 hours') as successful_last_24h FROM dispensaries WHERE state = 'AZ' AND menu_type = 'dutchie'; -- ============================================================ -- PART 6: Constraint for minimum crawl gap -- ============================================================ -- Function to check minimum crawl gap (2 minutes) CREATE OR REPLACE FUNCTION check_minimum_crawl_gap() RETURNS TRIGGER AS $$ BEGIN -- Only check for new pending jobs IF NEW.status = 'pending' AND NEW.dispensary_id IS NOT NULL THEN -- Check if there's a recent job for same dispensary IF EXISTS ( SELECT 1 FROM dispensary_crawl_jobs WHERE dispensary_id = NEW.dispensary_id AND id != NEW.id AND status IN ('pending', 'running') AND created_at > NOW() - INTERVAL '2 minutes' ) THEN RAISE EXCEPTION 'Minimum 2-minute gap required between crawls for same dispensary'; END IF; END IF; RETURN NEW; END; $$ LANGUAGE plpgsql; -- Create trigger (drop first if exists) DROP TRIGGER IF EXISTS enforce_minimum_crawl_gap ON dispensary_crawl_jobs; CREATE TRIGGER enforce_minimum_crawl_gap BEFORE INSERT ON dispensary_crawl_jobs FOR EACH ROW EXECUTE FUNCTION check_minimum_crawl_gap(); -- ============================================================ -- PART 7: Comments -- ============================================================ COMMENT ON TABLE crawl_attempts IS 'Detailed history of every crawl attempt for analytics and debugging'; COMMENT ON VIEW v_crawler_status IS 'Current status of all crawlers with reliability metrics'; COMMENT ON VIEW v_crawl_error_summary IS 'Summary of errors by type over last 7 days'; COMMENT ON VIEW v_crawl_health IS 'Overall health metrics for the crawling system';