Major additions: - Multi-state expansion: states table, StateSelector, NationalDashboard, StateHeatmap, CrossStateCompare - Orchestrator services: trace service, error taxonomy, retry manager, proxy rotator - Discovery system: dutchie discovery service, geo validation, city seeding scripts - Analytics infrastructure: analytics v2 routes, brand/pricing/stores intelligence pages - Local development: setup-local.sh starts all 5 services (postgres, backend, cannaiq, findadispo, findagram) - Migrations 037-056: crawler profiles, states, analytics indexes, worker metadata Frontend pages added: - Discovery, ChainsDashboard, IntelligenceBrands, IntelligencePricing, IntelligenceStores - StateHeatmap, CrossStateCompare, SyncInfoPanel Components added: - StateSelector, OrchestratorTraceModal, WorkflowStepper 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
352 lines
11 KiB
PL/PgSQL
352 lines
11 KiB
PL/PgSQL
-- Migration 046: Crawler Reliability & Stabilization
|
|
-- Phase 1: Add fields for error taxonomy, retry management, and self-healing
|
|
|
|
-- ============================================================
|
|
-- PART 1: Error Taxonomy - Standardized error codes
|
|
-- ============================================================
|
|
|
|
-- Create enum for standardized error codes
|
|
DO $$
|
|
BEGIN
|
|
IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'crawl_error_code') THEN
|
|
CREATE TYPE crawl_error_code AS ENUM (
|
|
'SUCCESS',
|
|
'RATE_LIMITED',
|
|
'BLOCKED_PROXY',
|
|
'HTML_CHANGED',
|
|
'TIMEOUT',
|
|
'AUTH_FAILED',
|
|
'NETWORK_ERROR',
|
|
'PARSE_ERROR',
|
|
'NO_PRODUCTS',
|
|
'UNKNOWN_ERROR'
|
|
);
|
|
END IF;
|
|
END;
|
|
$$ LANGUAGE plpgsql;
|
|
|
|
-- ============================================================
|
|
-- PART 2: Dispensary Crawl Configuration
|
|
-- ============================================================
|
|
|
|
-- Add crawl config columns to dispensaries
|
|
DO $$
|
|
BEGIN
|
|
-- Crawl frequency (minutes between crawls)
|
|
IF NOT EXISTS (
|
|
SELECT 1 FROM information_schema.columns
|
|
WHERE table_name = 'dispensaries' AND column_name = 'crawl_frequency_minutes'
|
|
) THEN
|
|
ALTER TABLE dispensaries ADD COLUMN crawl_frequency_minutes INTEGER DEFAULT 240;
|
|
END IF;
|
|
|
|
-- Max retries per crawl
|
|
IF NOT EXISTS (
|
|
SELECT 1 FROM information_schema.columns
|
|
WHERE table_name = 'dispensaries' AND column_name = 'max_retries'
|
|
) THEN
|
|
ALTER TABLE dispensaries ADD COLUMN max_retries INTEGER DEFAULT 3;
|
|
END IF;
|
|
|
|
-- Current proxy ID
|
|
IF NOT EXISTS (
|
|
SELECT 1 FROM information_schema.columns
|
|
WHERE table_name = 'dispensaries' AND column_name = 'current_proxy_id'
|
|
) THEN
|
|
ALTER TABLE dispensaries ADD COLUMN current_proxy_id INTEGER NULL;
|
|
END IF;
|
|
|
|
-- Current user agent
|
|
IF NOT EXISTS (
|
|
SELECT 1 FROM information_schema.columns
|
|
WHERE table_name = 'dispensaries' AND column_name = 'current_user_agent'
|
|
) THEN
|
|
ALTER TABLE dispensaries ADD COLUMN current_user_agent TEXT NULL;
|
|
END IF;
|
|
|
|
-- Next scheduled run
|
|
IF NOT EXISTS (
|
|
SELECT 1 FROM information_schema.columns
|
|
WHERE table_name = 'dispensaries' AND column_name = 'next_crawl_at'
|
|
) THEN
|
|
ALTER TABLE dispensaries ADD COLUMN next_crawl_at TIMESTAMPTZ NULL;
|
|
END IF;
|
|
|
|
-- Last successful crawl
|
|
IF NOT EXISTS (
|
|
SELECT 1 FROM information_schema.columns
|
|
WHERE table_name = 'dispensaries' AND column_name = 'last_success_at'
|
|
) THEN
|
|
ALTER TABLE dispensaries ADD COLUMN last_success_at TIMESTAMPTZ NULL;
|
|
END IF;
|
|
|
|
-- Last error code (using text for flexibility, validated in app)
|
|
IF NOT EXISTS (
|
|
SELECT 1 FROM information_schema.columns
|
|
WHERE table_name = 'dispensaries' AND column_name = 'last_error_code'
|
|
) THEN
|
|
ALTER TABLE dispensaries ADD COLUMN last_error_code TEXT NULL;
|
|
END IF;
|
|
|
|
-- Crawl status: active, degraded, paused, failed
|
|
IF NOT EXISTS (
|
|
SELECT 1 FROM information_schema.columns
|
|
WHERE table_name = 'dispensaries' AND column_name = 'crawl_status'
|
|
) THEN
|
|
ALTER TABLE dispensaries ADD COLUMN crawl_status TEXT DEFAULT 'active';
|
|
END IF;
|
|
|
|
-- Backoff multiplier (increases with failures)
|
|
IF NOT EXISTS (
|
|
SELECT 1 FROM information_schema.columns
|
|
WHERE table_name = 'dispensaries' AND column_name = 'backoff_multiplier'
|
|
) THEN
|
|
ALTER TABLE dispensaries ADD COLUMN backoff_multiplier NUMERIC(4,2) DEFAULT 1.0;
|
|
END IF;
|
|
|
|
-- Total attempt count (lifetime)
|
|
IF NOT EXISTS (
|
|
SELECT 1 FROM information_schema.columns
|
|
WHERE table_name = 'dispensaries' AND column_name = 'total_attempts'
|
|
) THEN
|
|
ALTER TABLE dispensaries ADD COLUMN total_attempts INTEGER DEFAULT 0;
|
|
END IF;
|
|
|
|
-- Total success count (lifetime)
|
|
IF NOT EXISTS (
|
|
SELECT 1 FROM information_schema.columns
|
|
WHERE table_name = 'dispensaries' AND column_name = 'total_successes'
|
|
) THEN
|
|
ALTER TABLE dispensaries ADD COLUMN total_successes INTEGER DEFAULT 0;
|
|
END IF;
|
|
END;
|
|
$$ LANGUAGE plpgsql;
|
|
|
|
-- ============================================================
|
|
-- PART 3: Enhanced Job Tracking
|
|
-- ============================================================
|
|
|
|
-- Add columns to dispensary_crawl_jobs
|
|
DO $$
|
|
BEGIN
|
|
-- Error code
|
|
IF NOT EXISTS (
|
|
SELECT 1 FROM information_schema.columns
|
|
WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'error_code'
|
|
) THEN
|
|
ALTER TABLE dispensary_crawl_jobs ADD COLUMN error_code TEXT NULL;
|
|
END IF;
|
|
|
|
-- Proxy used for this job
|
|
IF NOT EXISTS (
|
|
SELECT 1 FROM information_schema.columns
|
|
WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'proxy_used'
|
|
) THEN
|
|
ALTER TABLE dispensary_crawl_jobs ADD COLUMN proxy_used TEXT NULL;
|
|
END IF;
|
|
|
|
-- User agent used for this job
|
|
IF NOT EXISTS (
|
|
SELECT 1 FROM information_schema.columns
|
|
WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'user_agent_used'
|
|
) THEN
|
|
ALTER TABLE dispensary_crawl_jobs ADD COLUMN user_agent_used TEXT NULL;
|
|
END IF;
|
|
|
|
-- Attempt number for this job
|
|
IF NOT EXISTS (
|
|
SELECT 1 FROM information_schema.columns
|
|
WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'attempt_number'
|
|
) THEN
|
|
ALTER TABLE dispensary_crawl_jobs ADD COLUMN attempt_number INTEGER DEFAULT 1;
|
|
END IF;
|
|
|
|
-- Backoff delay applied (ms)
|
|
IF NOT EXISTS (
|
|
SELECT 1 FROM information_schema.columns
|
|
WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'backoff_delay_ms'
|
|
) THEN
|
|
ALTER TABLE dispensary_crawl_jobs ADD COLUMN backoff_delay_ms INTEGER DEFAULT 0;
|
|
END IF;
|
|
|
|
-- HTTP status code received
|
|
IF NOT EXISTS (
|
|
SELECT 1 FROM information_schema.columns
|
|
WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'http_status'
|
|
) THEN
|
|
ALTER TABLE dispensary_crawl_jobs ADD COLUMN http_status INTEGER NULL;
|
|
END IF;
|
|
|
|
-- Response time (ms)
|
|
IF NOT EXISTS (
|
|
SELECT 1 FROM information_schema.columns
|
|
WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'response_time_ms'
|
|
) THEN
|
|
ALTER TABLE dispensary_crawl_jobs ADD COLUMN response_time_ms INTEGER NULL;
|
|
END IF;
|
|
END;
|
|
$$ LANGUAGE plpgsql;
|
|
|
|
-- ============================================================
|
|
-- PART 4: Crawl History Table (for detailed tracking)
|
|
-- ============================================================
|
|
|
|
CREATE TABLE IF NOT EXISTS crawl_attempts (
|
|
id SERIAL PRIMARY KEY,
|
|
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id),
|
|
job_id INTEGER REFERENCES dispensary_crawl_jobs(id),
|
|
|
|
-- Timing
|
|
started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
|
finished_at TIMESTAMPTZ,
|
|
duration_ms INTEGER,
|
|
|
|
-- Result
|
|
error_code TEXT NOT NULL DEFAULT 'UNKNOWN_ERROR',
|
|
error_message TEXT,
|
|
http_status INTEGER,
|
|
|
|
-- Context
|
|
attempt_number INTEGER NOT NULL DEFAULT 1,
|
|
proxy_used TEXT,
|
|
user_agent_used TEXT,
|
|
|
|
-- Metrics
|
|
products_found INTEGER DEFAULT 0,
|
|
products_upserted INTEGER DEFAULT 0,
|
|
snapshots_created INTEGER DEFAULT 0,
|
|
|
|
-- Metadata
|
|
metadata JSONB,
|
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
|
);
|
|
|
|
-- Index for quick lookups
|
|
CREATE INDEX IF NOT EXISTS idx_crawl_attempts_dispensary_id ON crawl_attempts(dispensary_id);
|
|
CREATE INDEX IF NOT EXISTS idx_crawl_attempts_error_code ON crawl_attempts(error_code);
|
|
CREATE INDEX IF NOT EXISTS idx_crawl_attempts_started_at ON crawl_attempts(started_at DESC);
|
|
|
|
-- ============================================================
|
|
-- PART 5: Views for Monitoring
|
|
-- ============================================================
|
|
|
|
-- Drop existing view if exists
|
|
DROP VIEW IF EXISTS v_crawler_status;
|
|
|
|
-- Crawler status view with all reliability fields
|
|
CREATE VIEW v_crawler_status AS
|
|
SELECT
|
|
d.id,
|
|
d.name,
|
|
d.slug,
|
|
d.menu_type,
|
|
d.platform_dispensary_id,
|
|
d.crawl_status,
|
|
d.consecutive_failures,
|
|
d.last_crawl_at,
|
|
d.last_success_at,
|
|
d.last_failure_at,
|
|
d.last_error_code,
|
|
d.next_crawl_at,
|
|
d.crawl_frequency_minutes,
|
|
d.max_retries,
|
|
d.current_proxy_id,
|
|
d.current_user_agent,
|
|
d.backoff_multiplier,
|
|
d.total_attempts,
|
|
d.total_successes,
|
|
d.product_count,
|
|
CASE
|
|
WHEN d.total_attempts > 0
|
|
THEN ROUND(d.total_successes::NUMERIC / d.total_attempts * 100, 1)
|
|
ELSE 0
|
|
END AS success_rate,
|
|
CASE
|
|
WHEN d.crawl_status = 'failed' THEN 'FAILED'
|
|
WHEN d.crawl_status = 'paused' THEN 'PAUSED'
|
|
WHEN d.crawl_status = 'degraded' THEN 'DEGRADED'
|
|
WHEN d.menu_type IS NULL OR d.menu_type = 'unknown' THEN 'NEEDS_DETECTION'
|
|
WHEN d.platform_dispensary_id IS NULL THEN 'NEEDS_PLATFORM_ID'
|
|
WHEN d.next_crawl_at IS NULL THEN 'NOT_SCHEDULED'
|
|
WHEN d.next_crawl_at <= NOW() THEN 'DUE'
|
|
ELSE 'SCHEDULED'
|
|
END AS schedule_status,
|
|
d.failed_at,
|
|
d.failure_notes
|
|
FROM dispensaries d
|
|
WHERE d.state = 'AZ';
|
|
|
|
-- Drop existing view if exists
|
|
DROP VIEW IF EXISTS v_crawl_error_summary;
|
|
|
|
-- Error summary view
|
|
CREATE VIEW v_crawl_error_summary AS
|
|
SELECT
|
|
error_code,
|
|
COUNT(*) as total_occurrences,
|
|
COUNT(DISTINCT dispensary_id) as affected_stores,
|
|
MAX(started_at) as last_occurrence,
|
|
AVG(duration_ms)::INTEGER as avg_duration_ms
|
|
FROM crawl_attempts
|
|
WHERE started_at > NOW() - INTERVAL '7 days'
|
|
GROUP BY error_code
|
|
ORDER BY total_occurrences DESC;
|
|
|
|
-- Drop existing view if exists
|
|
DROP VIEW IF EXISTS v_crawl_health;
|
|
|
|
-- Overall crawl health view
|
|
CREATE VIEW v_crawl_health AS
|
|
SELECT
|
|
COUNT(*) FILTER (WHERE crawl_status = 'active') as active_crawlers,
|
|
COUNT(*) FILTER (WHERE crawl_status = 'degraded') as degraded_crawlers,
|
|
COUNT(*) FILTER (WHERE crawl_status = 'paused') as paused_crawlers,
|
|
COUNT(*) FILTER (WHERE crawl_status = 'failed') as failed_crawlers,
|
|
COUNT(*) FILTER (WHERE next_crawl_at <= NOW()) as due_now,
|
|
COUNT(*) FILTER (WHERE consecutive_failures > 0) as stores_with_failures,
|
|
AVG(consecutive_failures)::NUMERIC(4,2) as avg_consecutive_failures,
|
|
COUNT(*) FILTER (WHERE last_success_at > NOW() - INTERVAL '24 hours') as successful_last_24h
|
|
FROM dispensaries
|
|
WHERE state = 'AZ' AND menu_type = 'dutchie';
|
|
|
|
-- ============================================================
|
|
-- PART 6: Constraint for minimum crawl gap
|
|
-- ============================================================
|
|
|
|
-- Function to check minimum crawl gap (2 minutes)
|
|
CREATE OR REPLACE FUNCTION check_minimum_crawl_gap()
|
|
RETURNS TRIGGER AS $$
|
|
BEGIN
|
|
-- Only check for new pending jobs
|
|
IF NEW.status = 'pending' AND NEW.dispensary_id IS NOT NULL THEN
|
|
-- Check if there's a recent job for same dispensary
|
|
IF EXISTS (
|
|
SELECT 1 FROM dispensary_crawl_jobs
|
|
WHERE dispensary_id = NEW.dispensary_id
|
|
AND id != NEW.id
|
|
AND status IN ('pending', 'running')
|
|
AND created_at > NOW() - INTERVAL '2 minutes'
|
|
) THEN
|
|
RAISE EXCEPTION 'Minimum 2-minute gap required between crawls for same dispensary';
|
|
END IF;
|
|
END IF;
|
|
RETURN NEW;
|
|
END;
|
|
$$ LANGUAGE plpgsql;
|
|
|
|
-- Create trigger (drop first if exists)
|
|
DROP TRIGGER IF EXISTS enforce_minimum_crawl_gap ON dispensary_crawl_jobs;
|
|
CREATE TRIGGER enforce_minimum_crawl_gap
|
|
BEFORE INSERT ON dispensary_crawl_jobs
|
|
FOR EACH ROW
|
|
EXECUTE FUNCTION check_minimum_crawl_gap();
|
|
|
|
-- ============================================================
|
|
-- PART 7: Comments
|
|
-- ============================================================
|
|
|
|
COMMENT ON TABLE crawl_attempts IS 'Detailed history of every crawl attempt for analytics and debugging';
|
|
COMMENT ON VIEW v_crawler_status IS 'Current status of all crawlers with reliability metrics';
|
|
COMMENT ON VIEW v_crawl_error_summary IS 'Summary of errors by type over last 7 days';
|
|
COMMENT ON VIEW v_crawl_health IS 'Overall health metrics for the crawling system';
|