Add crawler scheduler, orchestrator, and multi-category intelligence
- Add scheduler UI with store schedules, job queue, and global settings
- Add store crawl orchestrator for intelligent crawl workflow
- Add multi-category intelligence detection (product, specials, brands, metadata)
- Add CrawlerLogger for structured JSON logging
- Add migrations for scheduler tables and dispensary linking
- Add dispensary → scheduler navigation link
- Support production/sandbox crawler modes per provider

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
backend/migrations/015_crawler_schedule.sql (new file, 163 lines)
@@ -0,0 +1,163 @@
-- =====================================================
-- Crawler Schedule Tables
-- =====================================================

-- Add timezone column to stores table
ALTER TABLE stores ADD COLUMN IF NOT EXISTS timezone VARCHAR(50) DEFAULT 'America/Phoenix';
-- 1. Global crawler schedule settings
CREATE TABLE IF NOT EXISTS crawler_schedule (
    id SERIAL PRIMARY KEY,
    schedule_type VARCHAR(50) NOT NULL,               -- 'global_interval', 'daily_special'
    enabled BOOLEAN NOT NULL DEFAULT TRUE,
    interval_hours INTEGER,                           -- For global_interval: every N hours
    run_time TIME,                                    -- For daily_special: time to run (e.g., 00:01)
    description TEXT,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    CONSTRAINT uq_crawler_schedule_type UNIQUE (schedule_type)
);

-- Insert default schedules
INSERT INTO crawler_schedule (schedule_type, enabled, interval_hours, description)
VALUES ('global_interval', TRUE, 4, 'Crawl all stores every N hours')
ON CONFLICT (schedule_type) DO NOTHING;

INSERT INTO crawler_schedule (schedule_type, enabled, run_time, description)
VALUES ('daily_special', TRUE, '00:01', 'Daily specials run at store local midnight')
ON CONFLICT (schedule_type) DO NOTHING;
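/*
   Illustrative sketch only, not part of this migration: how the global settings
   UI might adjust the defaults seeded above at runtime. The interval and run
   time values are examples, not project defaults; updated_at is maintained by
   the trigger defined later in this file.

   UPDATE crawler_schedule
   SET interval_hours = 2
   WHERE schedule_type = 'global_interval';

   UPDATE crawler_schedule
   SET run_time = '00:05'
   WHERE schedule_type = 'daily_special';
*/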
-- 2. Per-store schedule overrides
CREATE TABLE IF NOT EXISTS store_crawl_schedule (
    id SERIAL PRIMARY KEY,
    store_id INTEGER NOT NULL REFERENCES stores(id) ON DELETE CASCADE,
    enabled BOOLEAN NOT NULL DEFAULT TRUE,
    interval_hours INTEGER,                           -- NULL = use global setting
    daily_special_enabled BOOLEAN DEFAULT TRUE,
    daily_special_time TIME,                          -- NULL = use store's 00:01 local time
    priority INTEGER DEFAULT 0,                       -- Higher priority = scheduled first
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    CONSTRAINT uq_store_crawl_schedule_store UNIQUE (store_id)
);
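/*
   Illustrative sketch only, not part of this migration: upserting a per-store
   override, relying on the uq_store_crawl_schedule_store constraint so repeated
   saves from the scheduler UI update the existing row. The store id and values
   are placeholders.

   INSERT INTO store_crawl_schedule (store_id, interval_hours, daily_special_time, priority)
   VALUES (42, 2, '00:15', 10)
   ON CONFLICT (store_id) DO UPDATE
   SET interval_hours     = EXCLUDED.interval_hours,
       daily_special_time = EXCLUDED.daily_special_time,
       priority           = EXCLUDED.priority;
*/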
-- 3. Crawl job queue
CREATE TABLE IF NOT EXISTS crawl_jobs (
    id SERIAL PRIMARY KEY,
    store_id INTEGER NOT NULL REFERENCES stores(id) ON DELETE CASCADE,

    -- Job identification
    job_type VARCHAR(50) NOT NULL DEFAULT 'full_crawl',      -- 'full_crawl', 'specials_only', 'category'
    trigger_type VARCHAR(50) NOT NULL DEFAULT 'scheduled',   -- 'scheduled', 'manual', 'daily_special'

    -- Status
    status VARCHAR(20) NOT NULL DEFAULT 'pending',           -- 'pending', 'running', 'completed', 'failed', 'cancelled'
    priority INTEGER DEFAULT 0,

    -- Timing
    scheduled_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),         -- When job should run
    started_at TIMESTAMPTZ,
    completed_at TIMESTAMPTZ,

    -- Results
    products_found INTEGER,
    products_new INTEGER,
    products_updated INTEGER,
    error_message TEXT,

    -- Metadata
    worker_id VARCHAR(100),
    metadata JSONB DEFAULT '{}',

    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    CONSTRAINT chk_crawl_job_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
);

-- Indexes for efficient job lookup
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_status ON crawl_jobs(status);
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_store_status ON crawl_jobs(store_id, status);
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_pending ON crawl_jobs(scheduled_at) WHERE status = 'pending';
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_store_time ON crawl_jobs(store_id, created_at DESC);
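/*
   Illustrative sketch only, not part of this migration: how a crawler worker
   could atomically claim the next due job from this queue. FOR UPDATE SKIP LOCKED
   keeps concurrent workers from claiming the same row, and the partial index
   idx_crawl_jobs_pending covers the status = 'pending' filter. The worker id is
   a placeholder.

   UPDATE crawl_jobs
   SET status = 'running', started_at = NOW(), worker_id = 'worker-1'
   WHERE id = (
       SELECT id
       FROM crawl_jobs
       WHERE status = 'pending' AND scheduled_at <= NOW()
       ORDER BY priority DESC, scheduled_at
       LIMIT 1
       FOR UPDATE SKIP LOCKED
   )
   RETURNING id, store_id, job_type, trigger_type;
*/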
-- 4. Crawl history summary (for UI display)
CREATE OR REPLACE VIEW crawl_schedule_status AS
SELECT
    s.id AS store_id,
    s.name AS store_name,
    s.slug AS store_slug,
    s.timezone,
    s.active,
    s.scrape_enabled,
    s.last_scraped_at,

    -- Schedule settings (use store override or global)
    COALESCE(scs.enabled, TRUE) AS schedule_enabled,
    COALESCE(scs.interval_hours, cs_global.interval_hours, 4) AS interval_hours,
    COALESCE(scs.daily_special_enabled, TRUE) AS daily_special_enabled,
    COALESCE(scs.daily_special_time, '00:01'::TIME) AS daily_special_time,
    COALESCE(scs.priority, 0) AS priority,

    -- Next scheduled run calculation
    CASE
        WHEN s.last_scraped_at IS NULL THEN NOW()
        ELSE s.last_scraped_at + (COALESCE(scs.interval_hours, cs_global.interval_hours, 4) || ' hours')::INTERVAL
    END AS next_scheduled_run,

    -- Latest job info
    cj.id AS latest_job_id,
    cj.status AS latest_job_status,
    cj.started_at AS latest_job_started,
    cj.completed_at AS latest_job_completed,
    cj.products_found AS latest_products_found

FROM stores s
LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
LEFT JOIN crawler_schedule cs_global ON cs_global.schedule_type = 'global_interval'
LEFT JOIN LATERAL (
    SELECT * FROM crawl_jobs cj2
    WHERE cj2.store_id = s.id
    ORDER BY cj2.created_at DESC
    LIMIT 1
) cj ON TRUE
WHERE s.active = TRUE;
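/*
   Illustrative sketch only, not part of this migration: how a scheduler pass
   might use this view to find stores that are due and enqueue interval jobs,
   skipping stores that already have a job in flight.

   INSERT INTO crawl_jobs (store_id, job_type, trigger_type, priority)
   SELECT store_id, 'full_crawl', 'scheduled', priority
   FROM crawl_schedule_status
   WHERE schedule_enabled
     AND scrape_enabled
     AND next_scheduled_run <= NOW()
     AND (latest_job_status IS NULL OR latest_job_status NOT IN ('pending', 'running'));
*/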
-- Function to update updated_at timestamps
CREATE OR REPLACE FUNCTION update_schedule_updated_at()
RETURNS TRIGGER AS $$
BEGIN
    NEW.updated_at = NOW();
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

-- Triggers
DROP TRIGGER IF EXISTS trigger_crawler_schedule_updated_at ON crawler_schedule;
CREATE TRIGGER trigger_crawler_schedule_updated_at
    BEFORE UPDATE ON crawler_schedule
    FOR EACH ROW
    EXECUTE FUNCTION update_schedule_updated_at();

DROP TRIGGER IF EXISTS trigger_store_crawl_schedule_updated_at ON store_crawl_schedule;
CREATE TRIGGER trigger_store_crawl_schedule_updated_at
    BEFORE UPDATE ON store_crawl_schedule
    FOR EACH ROW
    EXECUTE FUNCTION update_schedule_updated_at();

DROP TRIGGER IF EXISTS trigger_crawl_jobs_updated_at ON crawl_jobs;
CREATE TRIGGER trigger_crawl_jobs_updated_at
    BEFORE UPDATE ON crawl_jobs
    FOR EACH ROW
    EXECUTE FUNCTION update_schedule_updated_at();

-- Grant permissions
GRANT SELECT, INSERT, UPDATE, DELETE ON crawler_schedule TO dutchie;
GRANT SELECT, INSERT, UPDATE, DELETE ON store_crawl_schedule TO dutchie;
GRANT SELECT, INSERT, UPDATE, DELETE ON crawl_jobs TO dutchie;
GRANT USAGE, SELECT ON SEQUENCE crawler_schedule_id_seq TO dutchie;
GRANT USAGE, SELECT ON SEQUENCE store_crawl_schedule_id_seq TO dutchie;
GRANT USAGE, SELECT ON SEQUENCE crawl_jobs_id_seq TO dutchie;
GRANT SELECT ON crawl_schedule_status TO dutchie;