Add crawler scheduler, orchestrator, and multi-category intelligence

- Add scheduler UI with store schedules, job queue, and global settings
- Add store crawl orchestrator for intelligent crawl workflow
- Add multi-category intelligence detection (product, specials, brands, metadata)
- Add CrawlerLogger for structured JSON logging
- Add migrations for scheduler tables and dispensary linking
- Add dispensary → scheduler navigation link
- Support production/sandbox crawler modes per provider

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-11-30 09:29:15 -07:00
parent 8b4292fbb2
commit 3861a31a3b
25 changed files with 8874 additions and 13 deletions

View File

@@ -0,0 +1,163 @@
-- =====================================================
-- Crawler Schedule Tables
-- =====================================================
-- Add timezone column to stores table (IANA zone name; used to compute
-- store-local midnight for daily-special runs)
ALTER TABLE stores ADD COLUMN IF NOT EXISTS timezone VARCHAR(50) DEFAULT 'America/Phoenix';
-- 1. Global crawler schedule settings (one row per schedule_type)
CREATE TABLE IF NOT EXISTS crawler_schedule (
id SERIAL PRIMARY KEY,
schedule_type VARCHAR(50) NOT NULL, -- 'global_interval', 'daily_special'
enabled BOOLEAN NOT NULL DEFAULT TRUE,
interval_hours INTEGER, -- For global_interval: every N hours
run_time TIME, -- For daily_special: time to run (e.g., 00:01)
description TEXT,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
-- Restrict to the known schedule types (same style as chk_crawl_job_status)
CONSTRAINT chk_crawler_schedule_type CHECK (schedule_type IN ('global_interval', 'daily_special')),
CONSTRAINT uq_crawler_schedule_type UNIQUE (schedule_type)
);
-- Insert default schedules (idempotent: ON CONFLICT on the unique schedule_type)
INSERT INTO crawler_schedule (schedule_type, enabled, interval_hours, description)
VALUES ('global_interval', TRUE, 4, 'Crawl all stores every N hours')
ON CONFLICT (schedule_type) DO NOTHING;
INSERT INTO crawler_schedule (schedule_type, enabled, run_time, description)
VALUES ('daily_special', TRUE, '00:01', 'Daily specials run at store local midnight')
ON CONFLICT (schedule_type) DO NOTHING;
-- 2. Per-store schedule overrides
-- At most one row per store (uq_store_crawl_schedule_store). NULL-valued
-- columns fall back to the global crawler_schedule settings (see the
-- COALESCE logic in the crawl_schedule_status view).
CREATE TABLE IF NOT EXISTS store_crawl_schedule (
id SERIAL PRIMARY KEY,
store_id INTEGER NOT NULL REFERENCES stores(id) ON DELETE CASCADE,
enabled BOOLEAN NOT NULL DEFAULT TRUE,
interval_hours INTEGER, -- NULL = use global setting
daily_special_enabled BOOLEAN DEFAULT TRUE,
daily_special_time TIME, -- NULL = use store's 00:01 local time
priority INTEGER DEFAULT 0, -- Higher priority = scheduled first
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
CONSTRAINT uq_store_crawl_schedule_store UNIQUE (store_id)
);
-- 3. Crawl job queue
-- Work queue consumed by crawler workers; one row per requested crawl.
CREATE TABLE IF NOT EXISTS crawl_jobs (
id SERIAL PRIMARY KEY,
store_id INTEGER NOT NULL REFERENCES stores(id) ON DELETE CASCADE,
-- Job identification
job_type VARCHAR(50) NOT NULL DEFAULT 'full_crawl', -- 'full_crawl', 'specials_only', 'category'
trigger_type VARCHAR(50) NOT NULL DEFAULT 'scheduled', -- 'scheduled', 'manual', 'daily_special'
-- Status
status VARCHAR(20) NOT NULL DEFAULT 'pending', -- 'pending', 'running', 'completed', 'failed', 'cancelled'
priority INTEGER DEFAULT 0,
-- Timing
scheduled_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), -- When job should run
started_at TIMESTAMPTZ,
completed_at TIMESTAMPTZ,
-- Results (NULL until the job completes)
products_found INTEGER,
products_new INTEGER,
products_updated INTEGER,
error_message TEXT,
-- Metadata
worker_id VARCHAR(100), -- identifier of the worker that claimed the job
metadata JSONB DEFAULT '{}',
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
CONSTRAINT chk_crawl_job_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
);
-- Indexes for efficient job lookup
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_status ON crawl_jobs(status);
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_store_status ON crawl_jobs(store_id, status);
-- Partial index: covers only the pending rows a queue poll scans, ordered by due time
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_pending ON crawl_jobs(scheduled_at) WHERE status = 'pending';
-- Latest-job-per-store lookup (matches the LATERAL in crawl_schedule_status)
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_store_time ON crawl_jobs(store_id, created_at DESC);
-- 4. Crawl history summary (for UI display)
-- One row per active store with its effective schedule settings (store
-- override, then the global crawler_schedule row, then hard-coded defaults)
-- plus the most recent crawl job, if any.
CREATE OR REPLACE VIEW crawl_schedule_status AS
SELECT
    s.id AS store_id,
    s.name AS store_name,
    s.slug AS store_slug,
    s.timezone,
    s.active,
    s.scrape_enabled,
    s.last_scraped_at,
    -- Schedule settings (use store override or global)
    COALESCE(scs.enabled, TRUE) AS schedule_enabled,
    COALESCE(scs.interval_hours, cs_global.interval_hours, 4) AS interval_hours,
    COALESCE(scs.daily_special_enabled, TRUE) AS daily_special_enabled,
    COALESCE(scs.daily_special_time, '00:01'::TIME) AS daily_special_time,
    COALESCE(scs.priority, 0) AS priority,
    -- Next scheduled run: never-crawled stores are due immediately
    CASE
        WHEN s.last_scraped_at IS NULL THEN NOW()
        ELSE s.last_scraped_at + (COALESCE(scs.interval_hours, cs_global.interval_hours, 4) || ' hours')::INTERVAL
    END AS next_scheduled_run,
    -- Latest job info
    cj.id AS latest_job_id,
    cj.status AS latest_job_status,
    cj.started_at AS latest_job_started,
    cj.completed_at AS latest_job_completed,
    cj.products_found AS latest_products_found
FROM stores s
LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
-- NOTE(review): cs_global.enabled is never consulted, so disabling the
-- global schedule row is not reflected here -- confirm that is intended.
LEFT JOIN crawler_schedule cs_global ON cs_global.schedule_type = 'global_interval'
LEFT JOIN LATERAL (
    -- Explicit column list (was SELECT *) so later crawl_jobs schema
    -- changes cannot silently alter what this view captures
    SELECT cj2.id, cj2.status, cj2.started_at, cj2.completed_at, cj2.products_found
    FROM crawl_jobs cj2
    WHERE cj2.store_id = s.id
    ORDER BY cj2.created_at DESC
    LIMIT 1
) cj ON TRUE
WHERE s.active = TRUE;
-- Trigger helper: stamps updated_at on every row modification so the
-- column reflects the last change without application involvement.
CREATE OR REPLACE FUNCTION update_schedule_updated_at()
RETURNS TRIGGER AS $$
BEGIN
    -- NOW() is the current transaction's start timestamp
    NEW.updated_at := NOW();
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;
-- Triggers
-- DROP-then-CREATE keeps this migration re-runnable (plain CREATE TRIGGER
-- fails if the trigger already exists).
DROP TRIGGER IF EXISTS trigger_crawler_schedule_updated_at ON crawler_schedule;
CREATE TRIGGER trigger_crawler_schedule_updated_at
BEFORE UPDATE ON crawler_schedule
FOR EACH ROW
EXECUTE FUNCTION update_schedule_updated_at();
DROP TRIGGER IF EXISTS trigger_store_crawl_schedule_updated_at ON store_crawl_schedule;
CREATE TRIGGER trigger_store_crawl_schedule_updated_at
BEFORE UPDATE ON store_crawl_schedule
FOR EACH ROW
EXECUTE FUNCTION update_schedule_updated_at();
DROP TRIGGER IF EXISTS trigger_crawl_jobs_updated_at ON crawl_jobs;
CREATE TRIGGER trigger_crawl_jobs_updated_at
BEFORE UPDATE ON crawl_jobs
FOR EACH ROW
EXECUTE FUNCTION update_schedule_updated_at();
-- Grant permissions
-- 'dutchie' is the application database role. SERIAL columns require
-- USAGE on their backing sequences for nextval() during INSERT.
GRANT SELECT, INSERT, UPDATE, DELETE ON crawler_schedule TO dutchie;
GRANT SELECT, INSERT, UPDATE, DELETE ON store_crawl_schedule TO dutchie;
GRANT SELECT, INSERT, UPDATE, DELETE ON crawl_jobs TO dutchie;
GRANT USAGE, SELECT ON SEQUENCE crawler_schedule_id_seq TO dutchie;
GRANT USAGE, SELECT ON SEQUENCE store_crawl_schedule_id_seq TO dutchie;
GRANT USAGE, SELECT ON SEQUENCE crawl_jobs_id_seq TO dutchie;
GRANT SELECT ON crawl_schedule_status TO dutchie;

View File

@@ -0,0 +1,57 @@
-- =====================================================
-- Extend dispensaries table for multi-provider crawler
-- =====================================================
-- Menu provider detection
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS menu_provider VARCHAR(50);
-- Values: 'dutchie', 'treez', 'jane', 'weedmaps', 'iheartjane', 'leafly', 'meadow', 'greenlight', 'other', 'unknown'
-- Confidence score for provider detection (0-100)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS menu_provider_confidence SMALLINT DEFAULT 0;
-- PostgreSQL has no ADD CONSTRAINT IF NOT EXISTS, so each CHECK is
-- drop-then-add: the columns above use IF NOT EXISTS for idempotency,
-- and a second run of this file would otherwise fail with
-- "constraint ... already exists".
ALTER TABLE dispensaries DROP CONSTRAINT IF EXISTS chk_provider_confidence;
ALTER TABLE dispensaries ADD CONSTRAINT chk_provider_confidence
CHECK (menu_provider_confidence >= 0 AND menu_provider_confidence <= 100);
-- Crawler mode: production (stable templates) vs sandbox (learning/unstable)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS crawler_mode VARCHAR(20) DEFAULT 'production';
ALTER TABLE dispensaries DROP CONSTRAINT IF EXISTS chk_crawler_mode;
ALTER TABLE dispensaries ADD CONSTRAINT chk_crawler_mode
CHECK (crawler_mode IN ('production', 'sandbox'));
-- Crawler status for job orchestration
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS crawler_status VARCHAR(30) DEFAULT 'idle';
ALTER TABLE dispensaries DROP CONSTRAINT IF EXISTS chk_crawler_status;
ALTER TABLE dispensaries ADD CONSTRAINT chk_crawler_status
CHECK (crawler_status IN ('idle', 'queued_detection', 'queued_crawl', 'running', 'ok', 'error_needs_review'));
-- Error tracking
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS last_menu_error_at TIMESTAMPTZ;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS last_error_message TEXT;
-- Provider detection metadata (raw signals, detection history)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS provider_detection_data JSONB DEFAULT '{}';
-- Indexes for efficient queue queries
CREATE INDEX IF NOT EXISTS idx_dispensaries_provider ON dispensaries(menu_provider);
CREATE INDEX IF NOT EXISTS idx_dispensaries_crawler_mode ON dispensaries(crawler_mode);
CREATE INDEX IF NOT EXISTS idx_dispensaries_crawler_status ON dispensaries(crawler_status);
CREATE INDEX IF NOT EXISTS idx_dispensaries_provider_confidence ON dispensaries(menu_provider_confidence);
-- Partial indexes below: the planner only uses them when the query's WHERE
-- clause implies the index predicate, so keep queue queries in sync.
-- Composite index for production Dutchie crawl queue
CREATE INDEX IF NOT EXISTS idx_dispensaries_dutchie_production
ON dispensaries(id)
WHERE menu_provider = 'dutchie' AND crawler_mode = 'production';
-- Composite index for sandbox queue
CREATE INDEX IF NOT EXISTS idx_dispensaries_sandbox
ON dispensaries(id)
WHERE crawler_mode = 'sandbox';
-- Composite index for detection queue
CREATE INDEX IF NOT EXISTS idx_dispensaries_needs_detection
ON dispensaries(id)
WHERE menu_provider IS NULL OR menu_provider_confidence < 70;
-- Comment on columns for documentation
COMMENT ON COLUMN dispensaries.menu_provider IS 'Detected menu platform: dutchie, treez, jane, weedmaps, etc.';
COMMENT ON COLUMN dispensaries.menu_provider_confidence IS 'Confidence score 0-100 for provider detection';
COMMENT ON COLUMN dispensaries.crawler_mode IS 'production = stable templates, sandbox = learning mode';
COMMENT ON COLUMN dispensaries.crawler_status IS 'Current state in crawl pipeline';
COMMENT ON COLUMN dispensaries.provider_detection_data IS 'JSON blob with detection signals and history';

View File

@@ -0,0 +1,237 @@
-- =====================================================
-- Crawler Sandboxes and Templates Tables
-- =====================================================
-- 1. Crawler sandboxes - for learning new providers/templates
-- One "active" sandbox per dispensary at a time (enforced by the partial
-- unique index idx_sandbox_active_per_dispensary below); completed/failed
-- rows remain as history.
CREATE TABLE IF NOT EXISTS crawler_sandboxes (
id SERIAL PRIMARY KEY,
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
-- Detection info
suspected_menu_provider VARCHAR(50), -- What we think the provider is
mode VARCHAR(30) NOT NULL DEFAULT 'detection', -- 'detection', 'template_learning', 'validation'
-- Captured data
raw_html_location TEXT, -- S3 key or local file path to captured HTML
screenshot_location TEXT, -- S3 key for screenshot
analysis_json JSONB DEFAULT '{}', -- Extracted patterns, selectors, candidate templates
-- URLs discovered/tested
urls_tested JSONB DEFAULT '[]', -- Array of URLs we fetched
menu_entry_points JSONB DEFAULT '[]', -- Discovered menu URLs
-- Detection signals found
detection_signals JSONB DEFAULT '{}', -- e.g., {"dutchie_embed": false, "treez_script": true, ...}
-- Status tracking
status VARCHAR(30) NOT NULL DEFAULT 'pending',
-- 'pending', 'analyzing', 'template_ready', 'needs_human_review', 'moved_to_production', 'failed'
-- Results
confidence_score SMALLINT DEFAULT 0,
failure_reason TEXT,
human_review_notes TEXT,
-- Timestamps
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
analyzed_at TIMESTAMPTZ,
reviewed_at TIMESTAMPTZ,
CONSTRAINT chk_sandbox_mode CHECK (mode IN ('detection', 'template_learning', 'validation')),
CONSTRAINT chk_sandbox_status CHECK (status IN (
'pending', 'analyzing', 'template_ready', 'needs_human_review', 'moved_to_production', 'failed'
))
);
-- Indexes for sandbox queries
CREATE INDEX IF NOT EXISTS idx_sandbox_dispensary ON crawler_sandboxes(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_sandbox_status ON crawler_sandboxes(status);
CREATE INDEX IF NOT EXISTS idx_sandbox_mode ON crawler_sandboxes(mode);
CREATE INDEX IF NOT EXISTS idx_sandbox_suspected_provider ON crawler_sandboxes(suspected_menu_provider);
-- Unique constraint: one active sandbox per dispensary (can have historical completed ones)
CREATE UNIQUE INDEX IF NOT EXISTS idx_sandbox_active_per_dispensary
ON crawler_sandboxes(dispensary_id)
WHERE status NOT IN ('moved_to_production', 'failed');
-- 2. Crawler templates - reusable scraping configurations
-- A template is identified by (provider, name, version); uq_template_name
-- makes the triple unique so template inserts can be idempotent.
CREATE TABLE IF NOT EXISTS crawler_templates (
id SERIAL PRIMARY KEY,
-- Template identification
provider VARCHAR(50) NOT NULL, -- 'dutchie', 'treez', 'jane', etc.
name VARCHAR(100) NOT NULL, -- 'dutchie_v1', 'treez_standard', 'jane_embedded'
version INTEGER DEFAULT 1,
-- Status
is_active BOOLEAN NOT NULL DEFAULT TRUE,
-- NOTE(review): nothing enforces a single default per provider (the index
-- below is non-unique) -- confirm whether multiple defaults are allowed.
is_default_for_provider BOOLEAN DEFAULT FALSE,
-- Selector configuration
selector_config JSONB NOT NULL DEFAULT '{}',
-- Structure:
-- {
--   "product_list": "css:.product-card",
--   "product_name": "css:.product-name",
--   "product_price": "css:.price",
--   "product_brand": "css:.brand",
--   "product_image": "css:img.product-image@src",
--   "pagination_next": "css:.next-page",
--   "category_links": "css:.category-nav a",
--   ...
-- }
-- Navigation patterns
navigation_config JSONB DEFAULT '{}',
-- Structure:
-- {
--   "entry_paths": ["/menu", "/shop", "/order"],
--   "age_gate": {"type": "click", "selector": ".age-confirm-btn"},
--   "location_modal": {"dismiss_selector": ".modal-close"},
--   "infinite_scroll": true,
--   "wait_for": ".products-loaded"
-- }
-- Data transformation rules
transform_config JSONB DEFAULT '{}',
-- Structure:
-- {
--   "price_regex": "\\$([\\d.]+)",
--   "weight_normalizer": "g_to_oz",
--   "thc_format": "percentage"
-- }
-- Validation rules
validation_rules JSONB DEFAULT '{}',
-- Structure:
-- {
--   "min_products": 5,
--   "required_fields": ["name", "price"],
--   "price_range": [0.01, 10000]
-- }
-- Test data for validation
test_urls JSONB DEFAULT '[]', -- URLs to validate template against
expected_structure JSONB DEFAULT '{}', -- What we expect to extract
-- Stats (denormalized counters maintained by the application)
dispensaries_using INTEGER DEFAULT 0,
success_rate DECIMAL(5,2) DEFAULT 0, -- 0-100%
last_successful_crawl TIMESTAMPTZ,
last_failed_crawl TIMESTAMPTZ,
-- Metadata
notes TEXT,
created_by VARCHAR(100),
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
CONSTRAINT uq_template_name UNIQUE (provider, name, version)
);
-- Indexes for templates
CREATE INDEX IF NOT EXISTS idx_template_provider ON crawler_templates(provider);
CREATE INDEX IF NOT EXISTS idx_template_active ON crawler_templates(is_active);
-- Partial index for looking up a provider's default template
CREATE INDEX IF NOT EXISTS idx_template_default ON crawler_templates(provider, is_default_for_provider)
WHERE is_default_for_provider = TRUE;
-- 3. Sandbox crawl jobs - separate queue for sandbox operations
-- Mirrors crawl_jobs but targets dispensaries rather than stores, and is
-- consumed by sandbox workers only.
CREATE TABLE IF NOT EXISTS sandbox_crawl_jobs (
id SERIAL PRIMARY KEY,
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
-- SET NULL: keep the job record even if its sandbox row is removed
sandbox_id INTEGER REFERENCES crawler_sandboxes(id) ON DELETE SET NULL,
-- Job type
job_type VARCHAR(30) NOT NULL DEFAULT 'detection', -- 'detection', 'template_test', 'deep_crawl'
-- Status
status VARCHAR(20) NOT NULL DEFAULT 'pending', -- 'pending', 'running', 'completed', 'failed', 'cancelled'
priority INTEGER DEFAULT 0,
-- Timing
scheduled_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
started_at TIMESTAMPTZ,
completed_at TIMESTAMPTZ,
-- Worker tracking
worker_id VARCHAR(100),
-- Results
result_summary JSONB DEFAULT '{}',
error_message TEXT,
-- Timestamps
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
CONSTRAINT chk_sandbox_job_type CHECK (job_type IN ('detection', 'template_test', 'deep_crawl')),
CONSTRAINT chk_sandbox_job_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
);
-- Indexes for sandbox jobs
CREATE INDEX IF NOT EXISTS idx_sandbox_job_status ON sandbox_crawl_jobs(status);
CREATE INDEX IF NOT EXISTS idx_sandbox_job_dispensary ON sandbox_crawl_jobs(dispensary_id);
-- Partial index for the pending-jobs queue poll
CREATE INDEX IF NOT EXISTS idx_sandbox_job_pending ON sandbox_crawl_jobs(scheduled_at) WHERE status = 'pending';
-- 4. Insert default Dutchie template (our only stable one for now)
-- Idempotent: ON CONFLICT targets uq_template_name (provider, name, version).
INSERT INTO crawler_templates (provider, name, version, is_active, is_default_for_provider, selector_config, navigation_config, notes)
VALUES (
'dutchie',
'dutchie_standard',
1,
TRUE,
TRUE,
'{
"type": "api_based",
"graphql_endpoint": "/graphql",
"product_container": "data.menu.products",
"uses_puppeteer": true,
"notes": "Dutchie uses GraphQL API, scraped via puppeteer interception"
}'::jsonb,
'{
"entry_paths": ["/menu", "/order", "/embedded-menu", "/products"],
"age_gate": {"type": "auto_detected", "handled_by_stealth": true},
"wait_strategy": "networkidle2",
"requires_javascript": true
}'::jsonb,
'Default Dutchie template - uses existing scraper-v2 pipeline'
)
ON CONFLICT (provider, name, version) DO NOTHING;
-- 5. Triggers for updated_at
-- Trigger helper: stamps updated_at on every row update.
-- NOTE(review): duplicates update_schedule_updated_at() from the scheduler
-- migration; kept separate so this migration file stands on its own.
CREATE OR REPLACE FUNCTION update_sandbox_timestamp()
RETURNS TRIGGER AS $$
BEGIN
NEW.updated_at = NOW();
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
-- DROP-then-CREATE keeps the migration re-runnable (plain CREATE TRIGGER
-- fails if the trigger already exists).
DROP TRIGGER IF EXISTS trigger_sandbox_updated_at ON crawler_sandboxes;
CREATE TRIGGER trigger_sandbox_updated_at
BEFORE UPDATE ON crawler_sandboxes
FOR EACH ROW
EXECUTE FUNCTION update_sandbox_timestamp();
DROP TRIGGER IF EXISTS trigger_template_updated_at ON crawler_templates;
CREATE TRIGGER trigger_template_updated_at
BEFORE UPDATE ON crawler_templates
FOR EACH ROW
EXECUTE FUNCTION update_sandbox_timestamp();
DROP TRIGGER IF EXISTS trigger_sandbox_job_updated_at ON sandbox_crawl_jobs;
CREATE TRIGGER trigger_sandbox_job_updated_at
BEFORE UPDATE ON sandbox_crawl_jobs
FOR EACH ROW
EXECUTE FUNCTION update_sandbox_timestamp();
-- Comments for documentation
COMMENT ON TABLE crawler_sandboxes IS 'Learning/testing environment for unknown menu providers';
COMMENT ON TABLE crawler_templates IS 'Reusable scraping configurations per menu provider';
COMMENT ON TABLE sandbox_crawl_jobs IS 'Job queue for sandbox crawl operations (separate from production)';

View File

@@ -0,0 +1,118 @@
-- =====================================================
-- Multi-Category Intelligence Support
-- =====================================================
-- Each dispensary can have different providers for different
-- intelligence categories (products, specials, brands, metadata).
-- Every category gets the same five columns: provider, confidence (0-100),
-- crawler mode ('production'/'sandbox'), last scan time, and raw detection
-- data. CHECK constraints mirror chk_provider_confidence/chk_crawler_mode
-- on the legacy columns; drop-then-add keeps the migration re-runnable
-- since PostgreSQL has no ADD CONSTRAINT IF NOT EXISTS.
-- 1. Product Intelligence Columns
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS product_provider VARCHAR(50);
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS product_confidence SMALLINT DEFAULT 0;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS product_crawler_mode VARCHAR(20) DEFAULT 'sandbox';
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS last_product_scan_at TIMESTAMPTZ;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS product_detection_data JSONB DEFAULT '{}';
ALTER TABLE dispensaries DROP CONSTRAINT IF EXISTS chk_product_confidence;
ALTER TABLE dispensaries ADD CONSTRAINT chk_product_confidence
CHECK (product_confidence >= 0 AND product_confidence <= 100);
ALTER TABLE dispensaries DROP CONSTRAINT IF EXISTS chk_product_crawler_mode;
ALTER TABLE dispensaries ADD CONSTRAINT chk_product_crawler_mode
CHECK (product_crawler_mode IN ('production', 'sandbox'));
-- 2. Specials Intelligence Columns
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS specials_provider VARCHAR(50);
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS specials_confidence SMALLINT DEFAULT 0;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS specials_crawler_mode VARCHAR(20) DEFAULT 'sandbox';
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS last_specials_scan_at TIMESTAMPTZ;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS specials_detection_data JSONB DEFAULT '{}';
ALTER TABLE dispensaries DROP CONSTRAINT IF EXISTS chk_specials_confidence;
ALTER TABLE dispensaries ADD CONSTRAINT chk_specials_confidence
CHECK (specials_confidence >= 0 AND specials_confidence <= 100);
ALTER TABLE dispensaries DROP CONSTRAINT IF EXISTS chk_specials_crawler_mode;
ALTER TABLE dispensaries ADD CONSTRAINT chk_specials_crawler_mode
CHECK (specials_crawler_mode IN ('production', 'sandbox'));
-- 3. Brand Intelligence Columns
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS brand_provider VARCHAR(50);
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS brand_confidence SMALLINT DEFAULT 0;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS brand_crawler_mode VARCHAR(20) DEFAULT 'sandbox';
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS last_brand_scan_at TIMESTAMPTZ;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS brand_detection_data JSONB DEFAULT '{}';
ALTER TABLE dispensaries DROP CONSTRAINT IF EXISTS chk_brand_confidence;
ALTER TABLE dispensaries ADD CONSTRAINT chk_brand_confidence
CHECK (brand_confidence >= 0 AND brand_confidence <= 100);
ALTER TABLE dispensaries DROP CONSTRAINT IF EXISTS chk_brand_crawler_mode;
ALTER TABLE dispensaries ADD CONSTRAINT chk_brand_crawler_mode
CHECK (brand_crawler_mode IN ('production', 'sandbox'));
-- 4. Metadata Intelligence Columns (categories, taxonomy, etc.)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS metadata_provider VARCHAR(50);
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS metadata_confidence SMALLINT DEFAULT 0;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS metadata_crawler_mode VARCHAR(20) DEFAULT 'sandbox';
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS last_metadata_scan_at TIMESTAMPTZ;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS metadata_detection_data JSONB DEFAULT '{}';
ALTER TABLE dispensaries DROP CONSTRAINT IF EXISTS chk_metadata_confidence;
ALTER TABLE dispensaries ADD CONSTRAINT chk_metadata_confidence
CHECK (metadata_confidence >= 0 AND metadata_confidence <= 100);
ALTER TABLE dispensaries DROP CONSTRAINT IF EXISTS chk_metadata_crawler_mode;
ALTER TABLE dispensaries ADD CONSTRAINT chk_metadata_crawler_mode
CHECK (metadata_crawler_mode IN ('production', 'sandbox'));
-- 5. Add category column to crawler_sandboxes
-- Which intelligence category a sandbox run targets.
ALTER TABLE crawler_sandboxes ADD COLUMN IF NOT EXISTS category VARCHAR(30) DEFAULT 'product';
-- Valid categories: 'product', 'specials', 'brand', 'metadata'
ALTER TABLE crawler_sandboxes ADD COLUMN IF NOT EXISTS template_name VARCHAR(100);
-- Run-quality metrics recorded per sandbox crawl
ALTER TABLE crawler_sandboxes ADD COLUMN IF NOT EXISTS quality_score SMALLINT DEFAULT 0;
ALTER TABLE crawler_sandboxes ADD COLUMN IF NOT EXISTS products_extracted INTEGER DEFAULT 0;
ALTER TABLE crawler_sandboxes ADD COLUMN IF NOT EXISTS fields_missing INTEGER DEFAULT 0;
ALTER TABLE crawler_sandboxes ADD COLUMN IF NOT EXISTS error_count INTEGER DEFAULT 0;
-- 6. Add category column to sandbox_crawl_jobs
ALTER TABLE sandbox_crawl_jobs ADD COLUMN IF NOT EXISTS category VARCHAR(30) DEFAULT 'product';
ALTER TABLE sandbox_crawl_jobs ADD COLUMN IF NOT EXISTS template_name VARCHAR(100);
-- 7. Indexes for per-category queries
CREATE INDEX IF NOT EXISTS idx_disp_product_provider ON dispensaries(product_provider);
CREATE INDEX IF NOT EXISTS idx_disp_product_mode ON dispensaries(product_crawler_mode);
CREATE INDEX IF NOT EXISTS idx_disp_specials_provider ON dispensaries(specials_provider);
CREATE INDEX IF NOT EXISTS idx_disp_specials_mode ON dispensaries(specials_crawler_mode);
CREATE INDEX IF NOT EXISTS idx_disp_brand_provider ON dispensaries(brand_provider);
CREATE INDEX IF NOT EXISTS idx_disp_brand_mode ON dispensaries(brand_crawler_mode);
CREATE INDEX IF NOT EXISTS idx_disp_metadata_provider ON dispensaries(metadata_provider);
CREATE INDEX IF NOT EXISTS idx_disp_metadata_mode ON dispensaries(metadata_crawler_mode);
CREATE INDEX IF NOT EXISTS idx_sandbox_category ON crawler_sandboxes(category);
CREATE INDEX IF NOT EXISTS idx_sandbox_template ON crawler_sandboxes(template_name);
CREATE INDEX IF NOT EXISTS idx_sandbox_job_category ON sandbox_crawl_jobs(category);
-- 8. Backfill product_* columns from the legacy menu_provider columns.
-- Copies ANY non-null provider (not just Dutchie); only high-confidence
-- (>= 70) Dutchie rows are promoted to 'production' mode, everything else
-- lands in 'sandbox'. Runs once per row: skips rows where product_provider
-- is already set, so re-running this migration is safe.
UPDATE dispensaries
SET
product_provider = menu_provider,
product_confidence = menu_provider_confidence,
product_crawler_mode = CASE
WHEN menu_provider = 'dutchie' AND menu_provider_confidence >= 70 THEN 'production'
ELSE 'sandbox'
END
WHERE menu_provider IS NOT NULL
AND product_provider IS NULL;
-- 9. Add environment column to crawler_templates if not exists
ALTER TABLE crawler_templates ADD COLUMN IF NOT EXISTS environment VARCHAR(20) DEFAULT 'production';
-- Valid: 'production', 'sandbox'
-- 10. Insert Treez sandbox template (existing code to be linked here)
-- Idempotent via ON CONFLICT on uq_template_name.
INSERT INTO crawler_templates (provider, name, version, is_active, is_default_for_provider, environment, selector_config, navigation_config, notes)
VALUES (
'treez',
'treez_products_v0',
1,
FALSE, -- Not active for production
FALSE, -- Not default
'sandbox',
'{
"type": "api_based",
"notes": "Treez API-based scraper - unreliable, sandbox only",
"product_container": "products",
"requires_api_key": false,
"uses_puppeteer": true
}'::jsonb,
'{
"entry_paths": ["/menu", "/shop"],
"wait_strategy": "networkidle2",
"requires_javascript": true
}'::jsonb,
'Treez sandbox template - v0 implementation, needs quality improvement'
)
ON CONFLICT (provider, name, version) DO NOTHING;
-- 11. Update existing Dutchie template to specify environment
-- (no version filter: affects every version of dutchie_standard)
UPDATE crawler_templates
SET environment = 'production'
WHERE provider = 'dutchie' AND name = 'dutchie_standard';
-- 12. Comments
COMMENT ON COLUMN dispensaries.product_provider IS 'Provider for product intelligence (dutchie, treez, jane, etc.)';
COMMENT ON COLUMN dispensaries.product_crawler_mode IS 'production or sandbox mode for product crawling';
COMMENT ON COLUMN dispensaries.specials_provider IS 'Provider for specials/deals intelligence';
COMMENT ON COLUMN dispensaries.brand_provider IS 'Provider for brand intelligence';
COMMENT ON COLUMN dispensaries.metadata_provider IS 'Provider for metadata/taxonomy intelligence';
COMMENT ON COLUMN crawler_sandboxes.category IS 'Intelligence category: product, specials, brand, metadata';
COMMENT ON COLUMN crawler_sandboxes.quality_score IS 'Quality score 0-100 for sandbox run results';
COMMENT ON COLUMN crawler_templates.environment IS 'Template environment: production or sandbox';

View File

@@ -0,0 +1,69 @@
-- =====================================================
-- Link Stores to Dispensaries (Master AZDHS Directory)
-- =====================================================
-- This migration adds a foreign key from stores to dispensaries,
-- allowing the scheduler to reference the master dispensary records.
-- 1. Add dispensary_id column to stores table
-- ON DELETE SET NULL: removing a dispensary unlinks, never deletes, a store
ALTER TABLE stores ADD COLUMN IF NOT EXISTS dispensary_id INTEGER REFERENCES dispensaries(id) ON DELETE SET NULL;
-- 2. Create index for efficient lookups
CREATE INDEX IF NOT EXISTS idx_stores_dispensary_id ON stores(dispensary_id);
-- 3. Update the crawl_schedule_status view to include dispensary info
-- DROP first: CREATE OR REPLACE VIEW cannot insert columns into the middle
-- of an existing view's column list.
DROP VIEW IF EXISTS crawl_schedule_status;
CREATE VIEW crawl_schedule_status AS
SELECT
    s.id AS store_id,
    s.name AS store_name,
    s.slug AS store_slug,
    s.timezone,
    s.active,
    s.scrape_enabled,
    s.last_scraped_at,
    -- Dispensary info (master record)
    s.dispensary_id,
    d.name AS dispensary_name,
    d.company_name AS dispensary_company,
    d.city AS dispensary_city,
    d.address AS dispensary_address,
    d.menu_url AS dispensary_menu_url,
    -- Schedule settings (use store override or global)
    COALESCE(scs.enabled, TRUE) AS schedule_enabled,
    COALESCE(scs.interval_hours, cs_global.interval_hours, 4) AS interval_hours,
    COALESCE(scs.daily_special_enabled, TRUE) AS daily_special_enabled,
    COALESCE(scs.daily_special_time, '00:01'::TIME) AS daily_special_time,
    COALESCE(scs.priority, 0) AS priority,
    -- Next scheduled run: never-crawled stores are due immediately
    CASE
        WHEN s.last_scraped_at IS NULL THEN NOW()
        ELSE s.last_scraped_at + (COALESCE(scs.interval_hours, cs_global.interval_hours, 4) || ' hours')::INTERVAL
    END AS next_scheduled_run,
    -- Latest job info
    cj.id AS latest_job_id,
    cj.status AS latest_job_status,
    cj.started_at AS latest_job_started,
    cj.completed_at AS latest_job_completed,
    cj.products_found AS latest_products_found
FROM stores s
LEFT JOIN dispensaries d ON d.id = s.dispensary_id
LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
LEFT JOIN crawler_schedule cs_global ON cs_global.schedule_type = 'global_interval'
LEFT JOIN LATERAL (
    -- Explicit column list (was SELECT *) so later crawl_jobs schema
    -- changes cannot silently alter what this view captures
    SELECT cj2.id, cj2.status, cj2.started_at, cj2.completed_at, cj2.products_found
    FROM crawl_jobs cj2
    WHERE cj2.store_id = s.id
    ORDER BY cj2.created_at DESC
    LIMIT 1
) cj ON TRUE
WHERE s.active = TRUE;
-- Grant permissions (re-grant required: DROP VIEW discards prior grants)
GRANT SELECT ON crawl_schedule_status TO dutchie;
-- 4. Comments
COMMENT ON COLUMN stores.dispensary_id IS 'FK to dispensaries table (master AZDHS directory)';

View File

@@ -0,0 +1,99 @@
-- =====================================================
-- Scheduler Orchestrator Fields
-- =====================================================
-- Add last_status and last_summary to store_crawl_schedule
-- for meaningful job result tracking
-- 1. Add new columns to store_crawl_schedule
ALTER TABLE store_crawl_schedule ADD COLUMN IF NOT EXISTS last_status VARCHAR(50);
-- Valid values: 'success', 'error', 'sandbox_only', 'detection_only', 'pending'
ALTER TABLE store_crawl_schedule ADD COLUMN IF NOT EXISTS last_summary TEXT;
-- Human-readable summary like "Detection + Dutchie products crawl (187 items)"
ALTER TABLE store_crawl_schedule ADD COLUMN IF NOT EXISTS last_run_at TIMESTAMPTZ;
ALTER TABLE store_crawl_schedule ADD COLUMN IF NOT EXISTS last_error TEXT;
-- 2. Add job_type tracking to crawl_jobs for better job categorization
-- orchestrator_run_id groups the multiple jobs spawned by one orchestrator pass
ALTER TABLE crawl_jobs ADD COLUMN IF NOT EXISTS orchestrator_run_id UUID;
ALTER TABLE crawl_jobs ADD COLUMN IF NOT EXISTS detection_result JSONB;
-- NOTE(review): products_new / products_updated already exist on crawl_jobs
-- from the scheduler migration; IF NOT EXISTS makes these two no-ops there.
ALTER TABLE crawl_jobs ADD COLUMN IF NOT EXISTS products_new INTEGER;
ALTER TABLE crawl_jobs ADD COLUMN IF NOT EXISTS products_updated INTEGER;
-- 3. Update the crawl_schedule_status view to include new fields
-- DROP first: CREATE OR REPLACE VIEW cannot insert columns into the middle
-- of an existing view's column list.
DROP VIEW IF EXISTS crawl_schedule_status;
CREATE VIEW crawl_schedule_status AS
SELECT
    s.id AS store_id,
    s.name AS store_name,
    s.slug AS store_slug,
    s.timezone,
    s.active,
    s.scrape_enabled,
    s.last_scraped_at,
    -- Dispensary info (master record)
    s.dispensary_id,
    d.name AS dispensary_name,
    d.company_name AS dispensary_company,
    d.city AS dispensary_city,
    d.address AS dispensary_address,
    d.menu_url AS dispensary_menu_url,
    -- Provider intelligence from dispensary (if linked)
    d.product_provider,
    d.product_confidence,
    d.product_crawler_mode,
    -- Schedule settings (use store override or global)
    COALESCE(scs.enabled, TRUE) AS schedule_enabled,
    COALESCE(scs.interval_hours, cs_global.interval_hours, 4) AS interval_hours,
    COALESCE(scs.daily_special_enabled, TRUE) AS daily_special_enabled,
    COALESCE(scs.daily_special_time, '00:01'::TIME) AS daily_special_time,
    COALESCE(scs.priority, 0) AS priority,
    -- Orchestrator status
    scs.last_status,
    scs.last_summary,
    scs.last_run_at AS schedule_last_run,
    scs.last_error,
    -- Next scheduled run: never-crawled stores are due immediately
    CASE
        WHEN s.last_scraped_at IS NULL THEN NOW()
        ELSE s.last_scraped_at + (COALESCE(scs.interval_hours, cs_global.interval_hours, 4) || ' hours')::INTERVAL
    END AS next_scheduled_run,
    -- Latest job info
    cj.id AS latest_job_id,
    cj.status AS latest_job_status,
    cj.job_type AS latest_job_type,
    cj.trigger_type AS latest_job_trigger,
    cj.started_at AS latest_job_started,
    cj.completed_at AS latest_job_completed,
    cj.products_found AS latest_products_found,
    cj.products_new AS latest_products_new,
    cj.products_updated AS latest_products_updated,
    cj.error_message AS latest_job_error
FROM stores s
LEFT JOIN dispensaries d ON d.id = s.dispensary_id
LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
LEFT JOIN crawler_schedule cs_global ON cs_global.schedule_type = 'global_interval'
LEFT JOIN LATERAL (
    -- Explicit column list (was SELECT *) so later crawl_jobs schema
    -- changes cannot silently alter what this view captures
    SELECT cj2.id, cj2.status, cj2.job_type, cj2.trigger_type,
           cj2.started_at, cj2.completed_at, cj2.products_found,
           cj2.products_new, cj2.products_updated, cj2.error_message
    FROM crawl_jobs cj2
    WHERE cj2.store_id = s.id
    ORDER BY cj2.created_at DESC
    LIMIT 1
) cj ON TRUE
WHERE s.active = TRUE;
-- 4. Grant permissions (re-grant required: DROP VIEW discards prior grants)
GRANT SELECT ON crawl_schedule_status TO dutchie;
-- 5. Comments
COMMENT ON COLUMN store_crawl_schedule.last_status IS 'Orchestrator result status: success, error, sandbox_only, detection_only';
COMMENT ON COLUMN store_crawl_schedule.last_summary IS 'Human-readable summary of last orchestrator run';
COMMENT ON COLUMN store_crawl_schedule.last_run_at IS 'When orchestrator last ran for this store';
COMMENT ON COLUMN crawl_jobs.orchestrator_run_id IS 'Groups related jobs from same orchestrator run';