-- =====================================================
-- Crawler Schedule Tables
-- =====================================================

-- Add timezone column to stores table
ALTER TABLE stores ADD COLUMN IF NOT EXISTS timezone VARCHAR(50) DEFAULT 'America/Phoenix';

-- 1. Global crawler schedule settings
CREATE TABLE IF NOT EXISTS crawler_schedule (
    id SERIAL PRIMARY KEY,
    schedule_type VARCHAR(50) NOT NULL,              -- 'global_interval', 'daily_special'
    enabled BOOLEAN NOT NULL DEFAULT TRUE,
    interval_hours INTEGER,                          -- For global_interval: every N hours
    run_time TIME,                                   -- For daily_special: time to run (e.g., 00:01)
    description TEXT,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    CONSTRAINT uq_crawler_schedule_type UNIQUE (schedule_type)
);

-- Insert default schedules
INSERT INTO crawler_schedule (schedule_type, enabled, interval_hours, description)
VALUES ('global_interval', TRUE, 4, 'Crawl all stores every N hours')
ON CONFLICT (schedule_type) DO NOTHING;

INSERT INTO crawler_schedule (schedule_type, enabled, run_time, description)
VALUES ('daily_special', TRUE, '00:01', 'Daily specials run at store local midnight')
ON CONFLICT (schedule_type) DO NOTHING;

-- 2. Per-store schedule overrides
CREATE TABLE IF NOT EXISTS store_crawl_schedule (
    id SERIAL PRIMARY KEY,
    store_id INTEGER NOT NULL REFERENCES stores(id) ON DELETE CASCADE,
    enabled BOOLEAN NOT NULL DEFAULT TRUE,
    interval_hours INTEGER,                          -- NULL = use global setting
    daily_special_enabled BOOLEAN DEFAULT TRUE,
    daily_special_time TIME,                         -- NULL = use store's 00:01 local time
    priority INTEGER DEFAULT 0,                      -- Higher priority = scheduled first
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    CONSTRAINT uq_store_crawl_schedule_store UNIQUE (store_id)
);

-- 3. Crawl job queue
CREATE TABLE IF NOT EXISTS crawl_jobs (
    id SERIAL PRIMARY KEY,
    store_id INTEGER NOT NULL REFERENCES stores(id) ON DELETE CASCADE,

    -- Job identification
    job_type VARCHAR(50) NOT NULL DEFAULT 'full_crawl',     -- 'full_crawl', 'specials_only', 'category'
    trigger_type VARCHAR(50) NOT NULL DEFAULT 'scheduled',   -- 'scheduled', 'manual', 'daily_special'

    -- Status
    status VARCHAR(20) NOT NULL DEFAULT 'pending',           -- 'pending', 'running', 'completed', 'failed', 'cancelled'
    priority INTEGER DEFAULT 0,

    -- Timing
    scheduled_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),         -- When job should run
    started_at TIMESTAMPTZ,
    completed_at TIMESTAMPTZ,

    -- Results
    products_found INTEGER,
    products_new INTEGER,
    products_updated INTEGER,
    error_message TEXT,

    -- Metadata
    worker_id VARCHAR(100),
    metadata JSONB DEFAULT '{}',
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    CONSTRAINT chk_crawl_job_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
);

-- Indexes for efficient job lookup
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_status ON crawl_jobs(status);
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_store_status ON crawl_jobs(store_id, status);
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_pending ON crawl_jobs(scheduled_at) WHERE status = 'pending';
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_store_time ON crawl_jobs(store_id, created_at DESC);
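
-- Example usage (illustrative only, not executed by this migration): a sketch
-- of how a worker process might claim the oldest due pending job. The worker
-- identifier 'worker-1' is hypothetical. FOR UPDATE SKIP LOCKED lets several
-- workers poll concurrently without claiming the same row.
--
-- UPDATE crawl_jobs
-- SET status = 'running',
--     started_at = NOW(),
--     worker_id = 'worker-1'
-- WHERE id = (
--     SELECT id
--     FROM crawl_jobs
--     WHERE status = 'pending'
--       AND scheduled_at <= NOW()
--     ORDER BY priority DESC, scheduled_at
--     LIMIT 1
--     FOR UPDATE SKIP LOCKED
-- )
-- RETURNING id, store_id, job_type;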

-- 4. Crawl history summary (for UI display)
CREATE OR REPLACE VIEW crawl_schedule_status AS
SELECT
    s.id AS store_id,
    s.name AS store_name,
    s.slug AS store_slug,
    s.timezone,
    s.active,
    s.scrape_enabled,
    s.last_scraped_at,
    -- Schedule settings (use store override or global)
    COALESCE(scs.enabled, TRUE) AS schedule_enabled,
    COALESCE(scs.interval_hours, cs_global.interval_hours, 4) AS interval_hours,
    COALESCE(scs.daily_special_enabled, TRUE) AS daily_special_enabled,
    COALESCE(scs.daily_special_time, '00:01'::TIME) AS daily_special_time,
    COALESCE(scs.priority, 0) AS priority,
    -- Next scheduled run calculation
    CASE
        WHEN s.last_scraped_at IS NULL THEN NOW()
        ELSE s.last_scraped_at + (COALESCE(scs.interval_hours, cs_global.interval_hours, 4) || ' hours')::INTERVAL
    END AS next_scheduled_run,
    -- Latest job info
    cj.id AS latest_job_id,
    cj.status AS latest_job_status,
    cj.started_at AS latest_job_started,
    cj.completed_at AS latest_job_completed,
    cj.products_found AS latest_products_found
FROM stores s
LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
LEFT JOIN crawler_schedule cs_global ON cs_global.schedule_type = 'global_interval'
LEFT JOIN LATERAL (
    SELECT *
    FROM crawl_jobs cj2
    WHERE cj2.store_id = s.id
    ORDER BY cj2.created_at DESC
    LIMIT 1
) cj ON TRUE
WHERE s.active = TRUE;

-- Function to update updated_at timestamps
CREATE OR REPLACE FUNCTION update_schedule_updated_at()
RETURNS TRIGGER AS $$
BEGIN
    NEW.updated_at = NOW();
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

-- Triggers
DROP TRIGGER IF EXISTS trigger_crawler_schedule_updated_at ON crawler_schedule;
CREATE TRIGGER trigger_crawler_schedule_updated_at
    BEFORE UPDATE ON crawler_schedule
    FOR EACH ROW EXECUTE FUNCTION update_schedule_updated_at();

DROP TRIGGER IF EXISTS trigger_store_crawl_schedule_updated_at ON store_crawl_schedule;
CREATE TRIGGER trigger_store_crawl_schedule_updated_at
    BEFORE UPDATE ON store_crawl_schedule
    FOR EACH ROW EXECUTE FUNCTION update_schedule_updated_at();

DROP TRIGGER IF EXISTS trigger_crawl_jobs_updated_at ON crawl_jobs;
CREATE TRIGGER trigger_crawl_jobs_updated_at
    BEFORE UPDATE ON crawl_jobs
    FOR EACH ROW EXECUTE FUNCTION update_schedule_updated_at();

-- Grant permissions
GRANT SELECT, INSERT, UPDATE, DELETE ON crawler_schedule TO dutchie;
GRANT SELECT, INSERT, UPDATE, DELETE ON store_crawl_schedule TO dutchie;
GRANT SELECT, INSERT, UPDATE, DELETE ON crawl_jobs TO dutchie;
GRANT USAGE, SELECT ON SEQUENCE crawler_schedule_id_seq TO dutchie;
GRANT USAGE, SELECT ON SEQUENCE store_crawl_schedule_id_seq TO dutchie;
GRANT USAGE, SELECT ON SEQUENCE crawl_jobs_id_seq TO dutchie;
GRANT SELECT ON crawl_schedule_status TO dutchie;
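
-- Example usage (illustrative only, not executed by this migration): a sketch
-- of how a scheduler might read the view to find stores whose next crawl is
-- due, and how a per-store override could be upserted. The store id 42, the
-- 2-hour interval, and the priority of 10 are hypothetical values.
--
-- SELECT store_id, store_name, interval_hours, next_scheduled_run
-- FROM crawl_schedule_status
-- WHERE schedule_enabled
--   AND next_scheduled_run <= NOW()
-- ORDER BY priority DESC, next_scheduled_run;
--
-- INSERT INTO store_crawl_schedule (store_id, interval_hours, priority)
-- VALUES (42, 2, 10)
-- ON CONFLICT (store_id) DO UPDATE
-- SET interval_hours = EXCLUDED.interval_hours,
--     priority = EXCLUDED.priority;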