Add crawler scheduler, orchestrator, and multi-category intelligence
- Add scheduler UI with store schedules, job queue, and global settings
- Add store crawl orchestrator for intelligent crawl workflow
- Add multi-category intelligence detection (product, specials, brands, metadata)
- Add CrawlerLogger for structured JSON logging
- Add migrations for scheduler tables and dispensary linking
- Add dispensary → scheduler navigation link
- Support production/sandbox crawler modes per provider

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
163
backend/migrations/015_crawler_schedule.sql
Normal file
163
backend/migrations/015_crawler_schedule.sql
Normal file
@@ -0,0 +1,163 @@
|
||||
-- =====================================================
-- Crawler Schedule Tables
-- =====================================================

-- Store-local timezone; used to compute local midnight for daily-specials runs.
ALTER TABLE stores ADD COLUMN IF NOT EXISTS timezone VARCHAR(50) DEFAULT 'America/Phoenix';

-- 1. Global crawler schedule settings (one row per schedule_type)
CREATE TABLE IF NOT EXISTS crawler_schedule (
    id SERIAL PRIMARY KEY,
    schedule_type VARCHAR(50) NOT NULL,              -- 'global_interval', 'daily_special'
    enabled BOOLEAN NOT NULL DEFAULT TRUE,
    interval_hours INTEGER,                          -- global_interval: crawl every N hours
    run_time TIME,                                   -- daily_special: local time to run (e.g. 00:01)
    description TEXT,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    CONSTRAINT uq_crawler_schedule_type UNIQUE (schedule_type)
);

-- Seed the two default schedules; idempotent thanks to the UNIQUE schedule_type.
INSERT INTO crawler_schedule (schedule_type, enabled, interval_hours, description)
VALUES ('global_interval', TRUE, 4, 'Crawl all stores every N hours')
ON CONFLICT (schedule_type) DO NOTHING;

INSERT INTO crawler_schedule (schedule_type, enabled, run_time, description)
VALUES ('daily_special', TRUE, '00:01', 'Daily specials run at store local midnight')
ON CONFLICT (schedule_type) DO NOTHING;
-- 2. Per-store schedule overrides; a NULL column means "fall back to global".
CREATE TABLE IF NOT EXISTS store_crawl_schedule (
    id SERIAL PRIMARY KEY,
    store_id INTEGER NOT NULL REFERENCES stores(id) ON DELETE CASCADE,
    enabled BOOLEAN NOT NULL DEFAULT TRUE,
    interval_hours INTEGER,                  -- NULL = use global setting
    daily_special_enabled BOOLEAN DEFAULT TRUE,
    daily_special_time TIME,                 -- NULL = use store's 00:01 local time
    priority INTEGER DEFAULT 0,              -- higher priority = scheduled first
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    -- At most one override row per store.
    CONSTRAINT uq_store_crawl_schedule_store UNIQUE (store_id)
);
-- 3. Crawl job queue
CREATE TABLE IF NOT EXISTS crawl_jobs (
    id SERIAL PRIMARY KEY,
    store_id INTEGER NOT NULL REFERENCES stores(id) ON DELETE CASCADE,

    -- Job identification
    job_type VARCHAR(50) NOT NULL DEFAULT 'full_crawl',     -- 'full_crawl', 'specials_only', 'category'
    trigger_type VARCHAR(50) NOT NULL DEFAULT 'scheduled',  -- 'scheduled', 'manual', 'daily_special'

    -- Status (value set enforced by chk_crawl_job_status below)
    status VARCHAR(20) NOT NULL DEFAULT 'pending',
    priority INTEGER DEFAULT 0,

    -- Timing
    scheduled_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),        -- when the job should run
    started_at TIMESTAMPTZ,
    completed_at TIMESTAMPTZ,

    -- Results
    products_found INTEGER,
    products_new INTEGER,
    products_updated INTEGER,
    error_message TEXT,

    -- Metadata
    worker_id VARCHAR(100),
    metadata JSONB DEFAULT '{}',

    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    CONSTRAINT chk_crawl_job_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
);

-- Indexes for efficient job lookup
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_status ON crawl_jobs(status);
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_store_status ON crawl_jobs(store_id, status);
-- Partial index: the scheduler's hot path scans only pending jobs ordered by due time.
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_pending ON crawl_jobs(scheduled_at) WHERE status = 'pending';
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_store_time ON crawl_jobs(store_id, created_at DESC);
-- 4. Crawl history summary (for UI display)
CREATE OR REPLACE VIEW crawl_schedule_status AS
SELECT
    s.id AS store_id,
    s.name AS store_name,
    s.slug AS store_slug,
    s.timezone,
    s.active,
    s.scrape_enabled,
    s.last_scraped_at,

    -- Effective schedule: store override wins, then the global row, then a hard default.
    COALESCE(scs.enabled, TRUE) AS schedule_enabled,
    COALESCE(scs.interval_hours, cs_global.interval_hours, 4) AS interval_hours,
    COALESCE(scs.daily_special_enabled, TRUE) AS daily_special_enabled,
    COALESCE(scs.daily_special_time, '00:01'::TIME) AS daily_special_time,
    COALESCE(scs.priority, 0) AS priority,

    -- Never-crawled stores are due immediately; otherwise last run + effective interval.
    CASE
        WHEN s.last_scraped_at IS NULL THEN NOW()
        ELSE s.last_scraped_at + (COALESCE(scs.interval_hours, cs_global.interval_hours, 4) || ' hours')::INTERVAL
    END AS next_scheduled_run,

    -- Most recent job per store, supplied by the LATERAL subquery below.
    cj.id AS latest_job_id,
    cj.status AS latest_job_status,
    cj.started_at AS latest_job_started,
    cj.completed_at AS latest_job_completed,
    cj.products_found AS latest_products_found

FROM stores s
LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
LEFT JOIN crawler_schedule cs_global ON cs_global.schedule_type = 'global_interval'
LEFT JOIN LATERAL (
    SELECT * FROM crawl_jobs cj2
    WHERE cj2.store_id = s.id
    ORDER BY cj2.created_at DESC
    LIMIT 1
) cj ON TRUE
WHERE s.active = TRUE;
-- Shared trigger function: stamp updated_at on every UPDATE.
CREATE OR REPLACE FUNCTION update_schedule_updated_at()
RETURNS TRIGGER AS $$
BEGIN
    NEW.updated_at = NOW();
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

-- Triggers (dropped first so the migration can be re-run safely)
DROP TRIGGER IF EXISTS trigger_crawler_schedule_updated_at ON crawler_schedule;
CREATE TRIGGER trigger_crawler_schedule_updated_at
    BEFORE UPDATE ON crawler_schedule
    FOR EACH ROW
    EXECUTE FUNCTION update_schedule_updated_at();

DROP TRIGGER IF EXISTS trigger_store_crawl_schedule_updated_at ON store_crawl_schedule;
CREATE TRIGGER trigger_store_crawl_schedule_updated_at
    BEFORE UPDATE ON store_crawl_schedule
    FOR EACH ROW
    EXECUTE FUNCTION update_schedule_updated_at();

DROP TRIGGER IF EXISTS trigger_crawl_jobs_updated_at ON crawl_jobs;
CREATE TRIGGER trigger_crawl_jobs_updated_at
    BEFORE UPDATE ON crawl_jobs
    FOR EACH ROW
    EXECUTE FUNCTION update_schedule_updated_at();

-- Grant permissions to the application role
GRANT SELECT, INSERT, UPDATE, DELETE ON crawler_schedule TO dutchie;
GRANT SELECT, INSERT, UPDATE, DELETE ON store_crawl_schedule TO dutchie;
GRANT SELECT, INSERT, UPDATE, DELETE ON crawl_jobs TO dutchie;
GRANT USAGE, SELECT ON SEQUENCE crawler_schedule_id_seq TO dutchie;
GRANT USAGE, SELECT ON SEQUENCE store_crawl_schedule_id_seq TO dutchie;
GRANT USAGE, SELECT ON SEQUENCE crawl_jobs_id_seq TO dutchie;
GRANT SELECT ON crawl_schedule_status TO dutchie;
@@ -0,0 +1,57 @@
|
||||
-- =====================================================
-- Extend dispensaries table for multi-provider crawler
-- =====================================================

-- Menu provider detection
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS menu_provider VARCHAR(50);
-- Values: 'dutchie', 'treez', 'jane', 'weedmaps', 'iheartjane', 'leafly', 'meadow', 'greenlight', 'other', 'unknown'

-- Confidence score for provider detection (0-100).
-- NOTE: ADD CONSTRAINT has no IF NOT EXISTS form in PostgreSQL, so each
-- constraint is dropped first to keep this migration re-runnable, matching
-- the IF NOT EXISTS style used everywhere else in this file.
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS menu_provider_confidence SMALLINT DEFAULT 0;
ALTER TABLE dispensaries DROP CONSTRAINT IF EXISTS chk_provider_confidence;
ALTER TABLE dispensaries ADD CONSTRAINT chk_provider_confidence
    CHECK (menu_provider_confidence >= 0 AND menu_provider_confidence <= 100);

-- Crawler mode: production (stable templates) vs sandbox (learning/unstable)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS crawler_mode VARCHAR(20) DEFAULT 'production';
ALTER TABLE dispensaries DROP CONSTRAINT IF EXISTS chk_crawler_mode;
ALTER TABLE dispensaries ADD CONSTRAINT chk_crawler_mode
    CHECK (crawler_mode IN ('production', 'sandbox'));

-- Crawler status for job orchestration
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS crawler_status VARCHAR(30) DEFAULT 'idle';
ALTER TABLE dispensaries DROP CONSTRAINT IF EXISTS chk_crawler_status;
ALTER TABLE dispensaries ADD CONSTRAINT chk_crawler_status
    CHECK (crawler_status IN ('idle', 'queued_detection', 'queued_crawl', 'running', 'ok', 'error_needs_review'));

-- Error tracking
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS last_menu_error_at TIMESTAMPTZ;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS last_error_message TEXT;

-- Provider detection metadata (raw signals, detection history)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS provider_detection_data JSONB DEFAULT '{}';

-- Indexes for efficient queue queries
CREATE INDEX IF NOT EXISTS idx_dispensaries_provider ON dispensaries(menu_provider);
CREATE INDEX IF NOT EXISTS idx_dispensaries_crawler_mode ON dispensaries(crawler_mode);
CREATE INDEX IF NOT EXISTS idx_dispensaries_crawler_status ON dispensaries(crawler_status);
CREATE INDEX IF NOT EXISTS idx_dispensaries_provider_confidence ON dispensaries(menu_provider_confidence);

-- Partial index for the production Dutchie crawl queue
CREATE INDEX IF NOT EXISTS idx_dispensaries_dutchie_production
    ON dispensaries(id)
    WHERE menu_provider = 'dutchie' AND crawler_mode = 'production';

-- Partial index for the sandbox queue
CREATE INDEX IF NOT EXISTS idx_dispensaries_sandbox
    ON dispensaries(id)
    WHERE crawler_mode = 'sandbox';

-- Partial index for the detection queue (unknown or low-confidence providers)
CREATE INDEX IF NOT EXISTS idx_dispensaries_needs_detection
    ON dispensaries(id)
    WHERE menu_provider IS NULL OR menu_provider_confidence < 70;

-- Comment on columns for documentation
COMMENT ON COLUMN dispensaries.menu_provider IS 'Detected menu platform: dutchie, treez, jane, weedmaps, etc.';
COMMENT ON COLUMN dispensaries.menu_provider_confidence IS 'Confidence score 0-100 for provider detection';
COMMENT ON COLUMN dispensaries.crawler_mode IS 'production = stable templates, sandbox = learning mode';
COMMENT ON COLUMN dispensaries.crawler_status IS 'Current state in crawl pipeline';
COMMENT ON COLUMN dispensaries.provider_detection_data IS 'JSON blob with detection signals and history';
@@ -0,0 +1,237 @@
|
||||
-- =====================================================
-- Crawler Sandboxes and Templates Tables
-- =====================================================

-- 1. Crawler sandboxes - learning/testing workspace for new providers/templates
CREATE TABLE IF NOT EXISTS crawler_sandboxes (
    id SERIAL PRIMARY KEY,
    dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,

    -- Detection info
    suspected_menu_provider VARCHAR(50),             -- best guess at the provider
    mode VARCHAR(30) NOT NULL DEFAULT 'detection',   -- see chk_sandbox_mode

    -- Captured artifacts
    raw_html_location TEXT,          -- S3 key or local file path to captured HTML
    screenshot_location TEXT,        -- S3 key for screenshot
    analysis_json JSONB DEFAULT '{}',  -- extracted patterns, selectors, candidate templates

    -- URLs discovered/tested
    urls_tested JSONB DEFAULT '[]',        -- array of URLs we fetched
    menu_entry_points JSONB DEFAULT '[]',  -- discovered menu URLs

    -- Detection signals found, e.g. {"dutchie_embed": false, "treez_script": true, ...}
    detection_signals JSONB DEFAULT '{}',

    -- Status tracking (value set enforced by chk_sandbox_status)
    status VARCHAR(30) NOT NULL DEFAULT 'pending',

    -- Results
    confidence_score SMALLINT DEFAULT 0,
    failure_reason TEXT,
    human_review_notes TEXT,

    -- Timestamps
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    analyzed_at TIMESTAMPTZ,
    reviewed_at TIMESTAMPTZ,

    CONSTRAINT chk_sandbox_mode CHECK (mode IN ('detection', 'template_learning', 'validation')),
    CONSTRAINT chk_sandbox_status CHECK (status IN (
        'pending', 'analyzing', 'template_ready', 'needs_human_review', 'moved_to_production', 'failed'
    ))
);

-- Indexes for sandbox queries
CREATE INDEX IF NOT EXISTS idx_sandbox_dispensary ON crawler_sandboxes(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_sandbox_status ON crawler_sandboxes(status);
CREATE INDEX IF NOT EXISTS idx_sandbox_mode ON crawler_sandboxes(mode);
CREATE INDEX IF NOT EXISTS idx_sandbox_suspected_provider ON crawler_sandboxes(suspected_menu_provider);

-- Partial unique index: at most one in-flight sandbox per dispensary,
-- while allowing any number of historical (terminal-state) rows.
CREATE UNIQUE INDEX IF NOT EXISTS idx_sandbox_active_per_dispensary
    ON crawler_sandboxes(dispensary_id)
    WHERE status NOT IN ('moved_to_production', 'failed');
-- 2. Crawler templates - reusable scraping configurations
CREATE TABLE IF NOT EXISTS crawler_templates (
    id SERIAL PRIMARY KEY,

    -- Template identification
    provider VARCHAR(50) NOT NULL,   -- 'dutchie', 'treez', 'jane', etc.
    name VARCHAR(100) NOT NULL,      -- 'dutchie_v1', 'treez_standard', 'jane_embedded'
    version INTEGER DEFAULT 1,

    -- Status
    is_active BOOLEAN NOT NULL DEFAULT TRUE,
    is_default_for_provider BOOLEAN DEFAULT FALSE,

    -- Selector configuration. Expected shape:
    -- {
    --   "product_list": "css:.product-card",
    --   "product_name": "css:.product-name",
    --   "product_price": "css:.price",
    --   "product_brand": "css:.brand",
    --   "product_image": "css:img.product-image@src",
    --   "pagination_next": "css:.next-page",
    --   "category_links": "css:.category-nav a",
    --   ...
    -- }
    selector_config JSONB NOT NULL DEFAULT '{}',

    -- Navigation patterns. Expected shape:
    -- {
    --   "entry_paths": ["/menu", "/shop", "/order"],
    --   "age_gate": {"type": "click", "selector": ".age-confirm-btn"},
    --   "location_modal": {"dismiss_selector": ".modal-close"},
    --   "infinite_scroll": true,
    --   "wait_for": ".products-loaded"
    -- }
    navigation_config JSONB DEFAULT '{}',

    -- Data transformation rules. Expected shape:
    -- {
    --   "price_regex": "\\$([\\d.]+)",
    --   "weight_normalizer": "g_to_oz",
    --   "thc_format": "percentage"
    -- }
    transform_config JSONB DEFAULT '{}',

    -- Validation rules. Expected shape:
    -- {
    --   "min_products": 5,
    --   "required_fields": ["name", "price"],
    --   "price_range": [0.01, 10000]
    -- }
    validation_rules JSONB DEFAULT '{}',

    -- Test data for validation
    test_urls JSONB DEFAULT '[]',           -- URLs to validate the template against
    expected_structure JSONB DEFAULT '{}',  -- what we expect to extract

    -- Stats
    dispensaries_using INTEGER DEFAULT 0,
    success_rate DECIMAL(5,2) DEFAULT 0,    -- 0-100%
    last_successful_crawl TIMESTAMPTZ,
    last_failed_crawl TIMESTAMPTZ,

    -- Metadata
    notes TEXT,
    created_by VARCHAR(100),
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    CONSTRAINT uq_template_name UNIQUE (provider, name, version)
);

-- Indexes for templates
CREATE INDEX IF NOT EXISTS idx_template_provider ON crawler_templates(provider);
CREATE INDEX IF NOT EXISTS idx_template_active ON crawler_templates(is_active);
-- Partial index: fast lookup of the (at most few) defaults per provider.
CREATE INDEX IF NOT EXISTS idx_template_default ON crawler_templates(provider, is_default_for_provider)
    WHERE is_default_for_provider = TRUE;
-- 3. Sandbox crawl jobs - separate queue for sandbox operations
CREATE TABLE IF NOT EXISTS sandbox_crawl_jobs (
    id SERIAL PRIMARY KEY,
    dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
    -- Sandbox link is optional and survives sandbox deletion.
    sandbox_id INTEGER REFERENCES crawler_sandboxes(id) ON DELETE SET NULL,

    -- Job type (value set enforced by chk_sandbox_job_type)
    job_type VARCHAR(30) NOT NULL DEFAULT 'detection',

    -- Status (value set enforced by chk_sandbox_job_status)
    status VARCHAR(20) NOT NULL DEFAULT 'pending',
    priority INTEGER DEFAULT 0,

    -- Timing
    scheduled_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    started_at TIMESTAMPTZ,
    completed_at TIMESTAMPTZ,

    -- Worker tracking
    worker_id VARCHAR(100),

    -- Results
    result_summary JSONB DEFAULT '{}',
    error_message TEXT,

    -- Timestamps
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    CONSTRAINT chk_sandbox_job_type CHECK (job_type IN ('detection', 'template_test', 'deep_crawl')),
    CONSTRAINT chk_sandbox_job_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
);

-- Indexes for sandbox jobs
CREATE INDEX IF NOT EXISTS idx_sandbox_job_status ON sandbox_crawl_jobs(status);
CREATE INDEX IF NOT EXISTS idx_sandbox_job_dispensary ON sandbox_crawl_jobs(dispensary_id);
-- Partial index for the dispatcher's pending-job scan.
CREATE INDEX IF NOT EXISTS idx_sandbox_job_pending ON sandbox_crawl_jobs(scheduled_at) WHERE status = 'pending';
-- 4. Seed the default Dutchie template (our only stable one for now).
-- Idempotent via the (provider, name, version) unique constraint.
INSERT INTO crawler_templates (provider, name, version, is_active, is_default_for_provider, selector_config, navigation_config, notes)
VALUES (
    'dutchie',
    'dutchie_standard',
    1,
    TRUE,
    TRUE,
    '{
      "type": "api_based",
      "graphql_endpoint": "/graphql",
      "product_container": "data.menu.products",
      "uses_puppeteer": true,
      "notes": "Dutchie uses GraphQL API, scraped via puppeteer interception"
    }'::jsonb,
    '{
      "entry_paths": ["/menu", "/order", "/embedded-menu", "/products"],
      "age_gate": {"type": "auto_detected", "handled_by_stealth": true},
      "wait_strategy": "networkidle2",
      "requires_javascript": true
    }'::jsonb,
    'Default Dutchie template - uses existing scraper-v2 pipeline'
)
ON CONFLICT (provider, name, version) DO NOTHING;

-- 5. updated_at maintenance for the three sandbox/template tables
CREATE OR REPLACE FUNCTION update_sandbox_timestamp()
RETURNS TRIGGER AS $$
BEGIN
    NEW.updated_at = NOW();
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

DROP TRIGGER IF EXISTS trigger_sandbox_updated_at ON crawler_sandboxes;
CREATE TRIGGER trigger_sandbox_updated_at
    BEFORE UPDATE ON crawler_sandboxes
    FOR EACH ROW
    EXECUTE FUNCTION update_sandbox_timestamp();

DROP TRIGGER IF EXISTS trigger_template_updated_at ON crawler_templates;
CREATE TRIGGER trigger_template_updated_at
    BEFORE UPDATE ON crawler_templates
    FOR EACH ROW
    EXECUTE FUNCTION update_sandbox_timestamp();

DROP TRIGGER IF EXISTS trigger_sandbox_job_updated_at ON sandbox_crawl_jobs;
CREATE TRIGGER trigger_sandbox_job_updated_at
    BEFORE UPDATE ON sandbox_crawl_jobs
    FOR EACH ROW
    EXECUTE FUNCTION update_sandbox_timestamp();

-- Comments for documentation
COMMENT ON TABLE crawler_sandboxes IS 'Learning/testing environment for unknown menu providers';
COMMENT ON TABLE crawler_templates IS 'Reusable scraping configurations per menu provider';
COMMENT ON TABLE sandbox_crawl_jobs IS 'Job queue for sandbox crawl operations (separate from production)';
118
backend/migrations/018_multi_category_intelligence.sql
Normal file
118
backend/migrations/018_multi_category_intelligence.sql
Normal file
@@ -0,0 +1,118 @@
|
||||
-- =====================================================
-- Multi-Category Intelligence Support
-- =====================================================
-- A dispensary may use a different provider per intelligence
-- category (products, specials, brands, metadata), so each
-- category gets its own provider/confidence/mode/scan columns.

-- 1. Product Intelligence Columns
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS product_provider VARCHAR(50);
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS product_confidence SMALLINT DEFAULT 0;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS product_crawler_mode VARCHAR(20) DEFAULT 'sandbox';
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS last_product_scan_at TIMESTAMPTZ;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS product_detection_data JSONB DEFAULT '{}';

-- 2. Specials Intelligence Columns
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS specials_provider VARCHAR(50);
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS specials_confidence SMALLINT DEFAULT 0;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS specials_crawler_mode VARCHAR(20) DEFAULT 'sandbox';
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS last_specials_scan_at TIMESTAMPTZ;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS specials_detection_data JSONB DEFAULT '{}';

-- 3. Brand Intelligence Columns
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS brand_provider VARCHAR(50);
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS brand_confidence SMALLINT DEFAULT 0;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS brand_crawler_mode VARCHAR(20) DEFAULT 'sandbox';
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS last_brand_scan_at TIMESTAMPTZ;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS brand_detection_data JSONB DEFAULT '{}';

-- 4. Metadata Intelligence Columns (categories, taxonomy, etc.)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS metadata_provider VARCHAR(50);
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS metadata_confidence SMALLINT DEFAULT 0;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS metadata_crawler_mode VARCHAR(20) DEFAULT 'sandbox';
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS last_metadata_scan_at TIMESTAMPTZ;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS metadata_detection_data JSONB DEFAULT '{}';
-- 5. Add category column to crawler_sandboxes
-- Valid categories: 'product', 'specials', 'brand', 'metadata'
ALTER TABLE crawler_sandboxes ADD COLUMN IF NOT EXISTS category VARCHAR(30) DEFAULT 'product';

ALTER TABLE crawler_sandboxes ADD COLUMN IF NOT EXISTS template_name VARCHAR(100);
ALTER TABLE crawler_sandboxes ADD COLUMN IF NOT EXISTS quality_score SMALLINT DEFAULT 0;
ALTER TABLE crawler_sandboxes ADD COLUMN IF NOT EXISTS products_extracted INTEGER DEFAULT 0;
ALTER TABLE crawler_sandboxes ADD COLUMN IF NOT EXISTS fields_missing INTEGER DEFAULT 0;
ALTER TABLE crawler_sandboxes ADD COLUMN IF NOT EXISTS error_count INTEGER DEFAULT 0;

-- 6. Add category column to sandbox_crawl_jobs
ALTER TABLE sandbox_crawl_jobs ADD COLUMN IF NOT EXISTS category VARCHAR(30) DEFAULT 'product';
ALTER TABLE sandbox_crawl_jobs ADD COLUMN IF NOT EXISTS template_name VARCHAR(100);

-- 7. Indexes for per-category queries
CREATE INDEX IF NOT EXISTS idx_disp_product_provider ON dispensaries(product_provider);
CREATE INDEX IF NOT EXISTS idx_disp_product_mode ON dispensaries(product_crawler_mode);
CREATE INDEX IF NOT EXISTS idx_disp_specials_provider ON dispensaries(specials_provider);
CREATE INDEX IF NOT EXISTS idx_disp_specials_mode ON dispensaries(specials_crawler_mode);
CREATE INDEX IF NOT EXISTS idx_disp_brand_provider ON dispensaries(brand_provider);
CREATE INDEX IF NOT EXISTS idx_disp_brand_mode ON dispensaries(brand_crawler_mode);
CREATE INDEX IF NOT EXISTS idx_disp_metadata_provider ON dispensaries(metadata_provider);
CREATE INDEX IF NOT EXISTS idx_disp_metadata_mode ON dispensaries(metadata_crawler_mode);

CREATE INDEX IF NOT EXISTS idx_sandbox_category ON crawler_sandboxes(category);
CREATE INDEX IF NOT EXISTS idx_sandbox_template ON crawler_sandboxes(template_name);
CREATE INDEX IF NOT EXISTS idx_sandbox_job_category ON sandbox_crawl_jobs(category);
-- 8. Migrate existing provider detection into the per-category columns.
-- Copies menu_provider / menu_provider_confidence for EVERY dispensary that
-- has a detected provider and no product_provider yet (the WHERE clause is
-- intentionally not limited to 'dutchie'); only Dutchie stores detected with
-- confidence >= 70 are promoted to production mode, all others start in sandbox.
UPDATE dispensaries
SET
    product_provider = menu_provider,
    product_confidence = menu_provider_confidence,
    product_crawler_mode = CASE
        WHEN menu_provider = 'dutchie' AND menu_provider_confidence >= 70 THEN 'production'
        ELSE 'sandbox'
    END
WHERE menu_provider IS NOT NULL
  AND product_provider IS NULL;
-- 9. Add environment column to crawler_templates if not exists
ALTER TABLE crawler_templates ADD COLUMN IF NOT EXISTS environment VARCHAR(20) DEFAULT 'production';
-- Valid: 'production', 'sandbox'. Enforce it like the other enum-style columns
-- in this schema (DROP first: ADD CONSTRAINT has no IF NOT EXISTS form).
ALTER TABLE crawler_templates DROP CONSTRAINT IF EXISTS chk_template_environment;
ALTER TABLE crawler_templates ADD CONSTRAINT chk_template_environment
    CHECK (environment IN ('production', 'sandbox'));

-- 10. Insert Treez sandbox template (existing code to be linked here);
-- idempotent via the (provider, name, version) unique constraint.
INSERT INTO crawler_templates (provider, name, version, is_active, is_default_for_provider, environment, selector_config, navigation_config, notes)
VALUES (
    'treez',
    'treez_products_v0',
    1,
    FALSE,  -- not active for production
    FALSE,  -- not the default for its provider
    'sandbox',
    '{
      "type": "api_based",
      "notes": "Treez API-based scraper - unreliable, sandbox only",
      "product_container": "products",
      "requires_api_key": false,
      "uses_puppeteer": true
    }'::jsonb,
    '{
      "entry_paths": ["/menu", "/shop"],
      "wait_strategy": "networkidle2",
      "requires_javascript": true
    }'::jsonb,
    'Treez sandbox template - v0 implementation, needs quality improvement'
)
ON CONFLICT (provider, name, version) DO NOTHING;

-- 11. Update existing Dutchie template to specify environment
UPDATE crawler_templates
SET environment = 'production'
WHERE provider = 'dutchie' AND name = 'dutchie_standard';

-- 12. Comments
COMMENT ON COLUMN dispensaries.product_provider IS 'Provider for product intelligence (dutchie, treez, jane, etc.)';
COMMENT ON COLUMN dispensaries.product_crawler_mode IS 'production or sandbox mode for product crawling';
COMMENT ON COLUMN dispensaries.specials_provider IS 'Provider for specials/deals intelligence';
COMMENT ON COLUMN dispensaries.brand_provider IS 'Provider for brand intelligence';
COMMENT ON COLUMN dispensaries.metadata_provider IS 'Provider for metadata/taxonomy intelligence';
COMMENT ON COLUMN crawler_sandboxes.category IS 'Intelligence category: product, specials, brand, metadata';
COMMENT ON COLUMN crawler_sandboxes.quality_score IS 'Quality score 0-100 for sandbox run results';
COMMENT ON COLUMN crawler_templates.environment IS 'Template environment: production or sandbox';
69
backend/migrations/019_link_stores_to_dispensaries.sql
Normal file
69
backend/migrations/019_link_stores_to_dispensaries.sql
Normal file
@@ -0,0 +1,69 @@
|
||||
-- =====================================================
-- Link Stores to Dispensaries (Master AZDHS Directory)
-- =====================================================
-- Adds a foreign key from stores to dispensaries so the
-- scheduler can reference the master dispensary records.

-- 1. Add dispensary_id column to stores table
ALTER TABLE stores ADD COLUMN IF NOT EXISTS dispensary_id INTEGER REFERENCES dispensaries(id) ON DELETE SET NULL;

-- 2. Create index for efficient lookups
CREATE INDEX IF NOT EXISTS idx_stores_dispensary_id ON stores(dispensary_id);

-- 3. Rebuild crawl_schedule_status with dispensary info.
-- DROP first: the view's column list changes, which CREATE OR REPLACE alone
-- cannot do in PostgreSQL.
DROP VIEW IF EXISTS crawl_schedule_status;
CREATE OR REPLACE VIEW crawl_schedule_status AS
SELECT
    s.id AS store_id,
    s.name AS store_name,
    s.slug AS store_slug,
    s.timezone,
    s.active,
    s.scrape_enabled,
    s.last_scraped_at,

    -- Dispensary info (master record)
    s.dispensary_id,
    d.name AS dispensary_name,
    d.company_name AS dispensary_company,
    d.city AS dispensary_city,
    d.address AS dispensary_address,
    d.menu_url AS dispensary_menu_url,

    -- Effective schedule: store override, then global row, then hard default.
    COALESCE(scs.enabled, TRUE) AS schedule_enabled,
    COALESCE(scs.interval_hours, cs_global.interval_hours, 4) AS interval_hours,
    COALESCE(scs.daily_special_enabled, TRUE) AS daily_special_enabled,
    COALESCE(scs.daily_special_time, '00:01'::TIME) AS daily_special_time,
    COALESCE(scs.priority, 0) AS priority,

    -- Never-crawled stores are due immediately; otherwise last run + interval.
    CASE
        WHEN s.last_scraped_at IS NULL THEN NOW()
        ELSE s.last_scraped_at + (COALESCE(scs.interval_hours, cs_global.interval_hours, 4) || ' hours')::INTERVAL
    END AS next_scheduled_run,

    -- Most recent job per store (LATERAL below)
    cj.id AS latest_job_id,
    cj.status AS latest_job_status,
    cj.started_at AS latest_job_started,
    cj.completed_at AS latest_job_completed,
    cj.products_found AS latest_products_found

FROM stores s
LEFT JOIN dispensaries d ON d.id = s.dispensary_id
LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
LEFT JOIN crawler_schedule cs_global ON cs_global.schedule_type = 'global_interval'
LEFT JOIN LATERAL (
    SELECT * FROM crawl_jobs cj2
    WHERE cj2.store_id = s.id
    ORDER BY cj2.created_at DESC
    LIMIT 1
) cj ON TRUE
WHERE s.active = TRUE;

-- Re-grant: DROP VIEW discarded the previous grant.
GRANT SELECT ON crawl_schedule_status TO dutchie;

-- 4. Comments
COMMENT ON COLUMN stores.dispensary_id IS 'FK to dispensaries table (master AZDHS directory)';
99
backend/migrations/020_scheduler_orchestrator_fields.sql
Normal file
99
backend/migrations/020_scheduler_orchestrator_fields.sql
Normal file
@@ -0,0 +1,99 @@
|
||||
-- =====================================================
-- Scheduler Orchestrator Fields
-- =====================================================
-- Add last_status and last_summary to store_crawl_schedule
-- for meaningful job result tracking.

-- 1. Add new columns to store_crawl_schedule
-- (IF NOT EXISTS keeps the migration idempotent on re-run)
ALTER TABLE store_crawl_schedule ADD COLUMN IF NOT EXISTS last_status VARCHAR(50);
-- Valid values: 'success', 'error', 'sandbox_only', 'detection_only', 'pending'

ALTER TABLE store_crawl_schedule ADD COLUMN IF NOT EXISTS last_summary TEXT;
-- Human-readable summary like "Detection + Dutchie products crawl (187 items)"

ALTER TABLE store_crawl_schedule ADD COLUMN IF NOT EXISTS last_run_at TIMESTAMPTZ;

ALTER TABLE store_crawl_schedule ADD COLUMN IF NOT EXISTS last_error TEXT;

-- 2. Add job_type tracking to crawl_jobs for better job categorization
ALTER TABLE crawl_jobs ADD COLUMN IF NOT EXISTS orchestrator_run_id UUID;
ALTER TABLE crawl_jobs ADD COLUMN IF NOT EXISTS detection_result JSONB;
ALTER TABLE crawl_jobs ADD COLUMN IF NOT EXISTS products_new INTEGER;
ALTER TABLE crawl_jobs ADD COLUMN IF NOT EXISTS products_updated INTEGER;

-- 3. Update the crawl_schedule_status view to include new fields.
-- DROP first: in Postgres, CREATE OR REPLACE VIEW cannot add, remove, or
-- reorder a view's output columns, and this revision adds several new ones.
DROP VIEW IF EXISTS crawl_schedule_status;
CREATE VIEW crawl_schedule_status AS
SELECT
    s.id AS store_id,
    s.name AS store_name,
    s.slug AS store_slug,
    s.timezone,
    s.active,
    s.scrape_enabled,
    s.last_scraped_at,

    -- Dispensary info (master record)
    s.dispensary_id,
    d.name AS dispensary_name,
    d.company_name AS dispensary_company,
    d.city AS dispensary_city,
    d.address AS dispensary_address,
    d.menu_url AS dispensary_menu_url,

    -- Provider intelligence from dispensary (if linked)
    d.product_provider,
    d.product_confidence,
    d.product_crawler_mode,

    -- Schedule settings (use store override or global)
    COALESCE(scs.enabled, TRUE) AS schedule_enabled,
    COALESCE(scs.interval_hours, cs_global.interval_hours, 4) AS interval_hours,
    COALESCE(scs.daily_special_enabled, TRUE) AS daily_special_enabled,
    COALESCE(scs.daily_special_time, '00:01'::TIME) AS daily_special_time,
    COALESCE(scs.priority, 0) AS priority,

    -- Orchestrator status
    scs.last_status,
    scs.last_summary,
    scs.last_run_at AS schedule_last_run,
    scs.last_error,

    -- Next scheduled run calculation:
    -- never-crawled stores are due immediately; otherwise last run + interval.
    CASE
        WHEN s.last_scraped_at IS NULL THEN NOW()
        ELSE s.last_scraped_at + (COALESCE(scs.interval_hours, cs_global.interval_hours, 4) || ' hours')::INTERVAL
    END AS next_scheduled_run,

    -- Latest job info
    cj.id AS latest_job_id,
    cj.status AS latest_job_status,
    cj.job_type AS latest_job_type,
    cj.trigger_type AS latest_job_trigger,
    cj.started_at AS latest_job_started,
    cj.completed_at AS latest_job_completed,
    cj.products_found AS latest_products_found,
    cj.products_new AS latest_products_new,
    cj.products_updated AS latest_products_updated,
    cj.error_message AS latest_job_error

FROM stores s
LEFT JOIN dispensaries d ON d.id = s.dispensary_id
LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
LEFT JOIN crawler_schedule cs_global ON cs_global.schedule_type = 'global_interval'
-- Most recent crawl job per store; explicit column list (not SELECT *) so the
-- view's plan and output stay stable when crawl_jobs grows new columns.
LEFT JOIN LATERAL (
    SELECT
        cj2.id,
        cj2.status,
        cj2.job_type,
        cj2.trigger_type,
        cj2.started_at,
        cj2.completed_at,
        cj2.products_found,
        cj2.products_new,
        cj2.products_updated,
        cj2.error_message
    FROM crawl_jobs cj2
    WHERE cj2.store_id = s.id
    ORDER BY cj2.created_at DESC
    LIMIT 1
) cj ON TRUE
WHERE s.active = TRUE;

-- 4. Grant permissions
GRANT SELECT ON crawl_schedule_status TO dutchie;

-- 5. Comments
COMMENT ON COLUMN store_crawl_schedule.last_status IS 'Orchestrator result status: success, error, sandbox_only, detection_only';
COMMENT ON COLUMN store_crawl_schedule.last_summary IS 'Human-readable summary of last orchestrator run';
COMMENT ON COLUMN store_crawl_schedule.last_run_at IS 'When orchestrator last ran for this store';
COMMENT ON COLUMN crawl_jobs.orchestrator_run_id IS 'Groups related jobs from same orchestrator run';
|
||||
Reference in New Issue
Block a user