Add crawler scheduler, orchestrator, and multi-category intelligence

- Add scheduler UI with store schedules, job queue, and global settings
- Add store crawl orchestrator for intelligent crawl workflow
- Add multi-category intelligence detection (product, specials, brands, metadata)
- Add CrawlerLogger for structured JSON logging
- Add migrations for scheduler tables and dispensary linking
- Add dispensary → scheduler navigation link
- Support production/sandbox crawler modes per provider

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Author: Kelly
Date: 2025-11-30 09:29:15 -07:00
Parent: 8b4292fbb2
Commit: 3861a31a3b
25 changed files with 8874 additions and 13 deletions


@@ -0,0 +1,163 @@
-- =====================================================
-- Crawler Schedule Tables
-- =====================================================
-- Add timezone column to stores table
ALTER TABLE stores ADD COLUMN IF NOT EXISTS timezone VARCHAR(50) DEFAULT 'America/Phoenix';
-- 1. Global crawler schedule settings
CREATE TABLE IF NOT EXISTS crawler_schedule (
id SERIAL PRIMARY KEY,
schedule_type VARCHAR(50) NOT NULL, -- 'global_interval', 'daily_special'
enabled BOOLEAN NOT NULL DEFAULT TRUE,
interval_hours INTEGER, -- For global_interval: every N hours
run_time TIME, -- For daily_special: time to run (e.g., 00:01)
description TEXT,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
CONSTRAINT uq_crawler_schedule_type UNIQUE (schedule_type)
);
-- Insert default schedules
INSERT INTO crawler_schedule (schedule_type, enabled, interval_hours, description)
VALUES ('global_interval', TRUE, 4, 'Crawl all stores every N hours')
ON CONFLICT (schedule_type) DO NOTHING;
INSERT INTO crawler_schedule (schedule_type, enabled, run_time, description)
VALUES ('daily_special', TRUE, '00:01', 'Daily specials run at store local midnight')
ON CONFLICT (schedule_type) DO NOTHING;
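
As a usage sketch (not part of the migration), the seeded rows can be tuned later with plain UPDATEs; the values below are hypothetical, and updated_at is left to the trigger defined further down:

-- Illustrative only: tighten the global cadence and move the specials run.
UPDATE crawler_schedule
SET interval_hours = 2
WHERE schedule_type = 'global_interval';

UPDATE crawler_schedule
SET run_time = '06:00'
WHERE schedule_type = 'daily_special';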
-- 2. Per-store schedule overrides
CREATE TABLE IF NOT EXISTS store_crawl_schedule (
id SERIAL PRIMARY KEY,
store_id INTEGER NOT NULL REFERENCES stores(id) ON DELETE CASCADE,
enabled BOOLEAN NOT NULL DEFAULT TRUE,
interval_hours INTEGER, -- NULL = use global setting
daily_special_enabled BOOLEAN DEFAULT TRUE,
daily_special_time TIME, -- NULL = default to 00:01 in the store's local timezone
priority INTEGER DEFAULT 0, -- Higher priority = scheduled first
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
CONSTRAINT uq_store_crawl_schedule_store UNIQUE (store_id)
);
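
A per-store override is then a one-row upsert against the uq_store_crawl_schedule_store constraint; the store id below is hypothetical:

-- Illustrative only: crawl store 42 hourly, ahead of default-priority stores.
INSERT INTO store_crawl_schedule (store_id, interval_hours, priority)
VALUES (42, 1, 10)
ON CONFLICT (store_id) DO UPDATE
SET interval_hours = EXCLUDED.interval_hours,
    priority = EXCLUDED.priority;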
-- 3. Crawl job queue
CREATE TABLE IF NOT EXISTS crawl_jobs (
id SERIAL PRIMARY KEY,
store_id INTEGER NOT NULL REFERENCES stores(id) ON DELETE CASCADE,
-- Job identification
job_type VARCHAR(50) NOT NULL DEFAULT 'full_crawl', -- 'full_crawl', 'specials_only', 'category'
trigger_type VARCHAR(50) NOT NULL DEFAULT 'scheduled', -- 'scheduled', 'manual', 'daily_special'
-- Status
status VARCHAR(20) NOT NULL DEFAULT 'pending', -- 'pending', 'running', 'completed', 'failed', 'cancelled'
priority INTEGER DEFAULT 0,
-- Timing
scheduled_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), -- When job should run
started_at TIMESTAMPTZ,
completed_at TIMESTAMPTZ,
-- Results
products_found INTEGER,
products_new INTEGER,
products_updated INTEGER,
error_message TEXT,
-- Metadata
worker_id VARCHAR(100),
metadata JSONB DEFAULT '{}',
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
CONSTRAINT chk_crawl_job_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
);
-- Indexes for efficient job lookup
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_status ON crawl_jobs(status);
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_store_status ON crawl_jobs(store_id, status);
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_pending ON crawl_jobs(scheduled_at) WHERE status = 'pending';
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_store_time ON crawl_jobs(store_id, created_at DESC);
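
The partial index on pending jobs supports an atomic claim pattern; a worker might take the next due job with a sketch like this (worker id hypothetical):

-- Illustrative only: claim one due job; SKIP LOCKED lets concurrent workers
-- pass over rows another worker has already locked.
UPDATE crawl_jobs
SET status = 'running', started_at = NOW(), worker_id = 'worker-01'
WHERE id = (
    SELECT id FROM crawl_jobs
    WHERE status = 'pending' AND scheduled_at <= NOW()
    ORDER BY priority DESC, scheduled_at
    LIMIT 1
    FOR UPDATE SKIP LOCKED
)
RETURNING id, store_id, job_type;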
-- 4. Crawl history summary (for UI display)
CREATE OR REPLACE VIEW crawl_schedule_status AS
SELECT
s.id AS store_id,
s.name AS store_name,
s.slug AS store_slug,
s.timezone,
s.active,
s.scrape_enabled,
s.last_scraped_at,
-- Schedule settings (use store override or global)
COALESCE(scs.enabled, TRUE) AS schedule_enabled,
COALESCE(scs.interval_hours, cs_global.interval_hours, 4) AS interval_hours,
COALESCE(scs.daily_special_enabled, TRUE) AS daily_special_enabled,
COALESCE(scs.daily_special_time, '00:01'::TIME) AS daily_special_time,
COALESCE(scs.priority, 0) AS priority,
-- Next scheduled run calculation
CASE
WHEN s.last_scraped_at IS NULL THEN NOW()
ELSE s.last_scraped_at + (COALESCE(scs.interval_hours, cs_global.interval_hours, 4) || ' hours')::INTERVAL
END AS next_scheduled_run,
-- Latest job info
cj.id AS latest_job_id,
cj.status AS latest_job_status,
cj.started_at AS latest_job_started,
cj.completed_at AS latest_job_completed,
cj.products_found AS latest_products_found
FROM stores s
LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
LEFT JOIN crawler_schedule cs_global ON cs_global.schedule_type = 'global_interval'
LEFT JOIN LATERAL (
SELECT * FROM crawl_jobs cj2
WHERE cj2.store_id = s.id
ORDER BY cj2.created_at DESC
LIMIT 1
) cj ON TRUE
WHERE s.active = TRUE;
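
A scheduler tick can then read due stores straight from the view; a minimal sketch:

-- Illustrative only: stores whose computed next run is already in the past.
SELECT store_id, store_name, next_scheduled_run
FROM crawl_schedule_status
WHERE schedule_enabled
  AND scrape_enabled
  AND next_scheduled_run <= NOW()
ORDER BY priority DESC, next_scheduled_run
LIMIT 10;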
-- Function to update updated_at timestamps
CREATE OR REPLACE FUNCTION update_schedule_updated_at()
RETURNS TRIGGER AS $$
BEGIN
NEW.updated_at = NOW();
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
-- Triggers
DROP TRIGGER IF EXISTS trigger_crawler_schedule_updated_at ON crawler_schedule;
CREATE TRIGGER trigger_crawler_schedule_updated_at
BEFORE UPDATE ON crawler_schedule
FOR EACH ROW
EXECUTE FUNCTION update_schedule_updated_at();
DROP TRIGGER IF EXISTS trigger_store_crawl_schedule_updated_at ON store_crawl_schedule;
CREATE TRIGGER trigger_store_crawl_schedule_updated_at
BEFORE UPDATE ON store_crawl_schedule
FOR EACH ROW
EXECUTE FUNCTION update_schedule_updated_at();
DROP TRIGGER IF EXISTS trigger_crawl_jobs_updated_at ON crawl_jobs;
CREATE TRIGGER trigger_crawl_jobs_updated_at
BEFORE UPDATE ON crawl_jobs
FOR EACH ROW
EXECUTE FUNCTION update_schedule_updated_at();
-- Grant permissions
GRANT SELECT, INSERT, UPDATE, DELETE ON crawler_schedule TO dutchie;
GRANT SELECT, INSERT, UPDATE, DELETE ON store_crawl_schedule TO dutchie;
GRANT SELECT, INSERT, UPDATE, DELETE ON crawl_jobs TO dutchie;
GRANT USAGE, SELECT ON SEQUENCE crawler_schedule_id_seq TO dutchie;
GRANT USAGE, SELECT ON SEQUENCE store_crawl_schedule_id_seq TO dutchie;
GRANT USAGE, SELECT ON SEQUENCE crawl_jobs_id_seq TO dutchie;
GRANT SELECT ON crawl_schedule_status TO dutchie;


@@ -0,0 +1,57 @@
-- =====================================================
-- Extend dispensaries table for multi-provider crawler
-- =====================================================
-- Menu provider detection
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS menu_provider VARCHAR(50);
-- Values: 'dutchie', 'treez', 'jane', 'weedmaps', 'iheartjane', 'leafly', 'meadow', 'greenlight', 'other', 'unknown'
-- Confidence score for provider detection (0-100)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS menu_provider_confidence SMALLINT DEFAULT 0;
ALTER TABLE dispensaries ADD CONSTRAINT chk_provider_confidence
CHECK (menu_provider_confidence >= 0 AND menu_provider_confidence <= 100);
-- Crawler mode: production (stable templates) vs sandbox (learning/unstable)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS crawler_mode VARCHAR(20) DEFAULT 'production';
ALTER TABLE dispensaries ADD CONSTRAINT chk_crawler_mode
CHECK (crawler_mode IN ('production', 'sandbox'));
-- Crawler status for job orchestration
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS crawler_status VARCHAR(30) DEFAULT 'idle';
ALTER TABLE dispensaries ADD CONSTRAINT chk_crawler_status
CHECK (crawler_status IN ('idle', 'queued_detection', 'queued_crawl', 'running', 'ok', 'error_needs_review'));
-- Error tracking
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS last_menu_error_at TIMESTAMPTZ;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS last_error_message TEXT;
-- Provider detection metadata (raw signals, detection history)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS provider_detection_data JSONB DEFAULT '{}';
-- Indexes for efficient queue queries
CREATE INDEX IF NOT EXISTS idx_dispensaries_provider ON dispensaries(menu_provider);
CREATE INDEX IF NOT EXISTS idx_dispensaries_crawler_mode ON dispensaries(crawler_mode);
CREATE INDEX IF NOT EXISTS idx_dispensaries_crawler_status ON dispensaries(crawler_status);
CREATE INDEX IF NOT EXISTS idx_dispensaries_provider_confidence ON dispensaries(menu_provider_confidence);
-- Composite index for production Dutchie crawl queue
CREATE INDEX IF NOT EXISTS idx_dispensaries_dutchie_production
ON dispensaries(id)
WHERE menu_provider = 'dutchie' AND crawler_mode = 'production';
-- Composite index for sandbox queue
CREATE INDEX IF NOT EXISTS idx_dispensaries_sandbox
ON dispensaries(id)
WHERE crawler_mode = 'sandbox';
-- Composite index for detection queue
CREATE INDEX IF NOT EXISTS idx_dispensaries_needs_detection
ON dispensaries(id)
WHERE menu_provider IS NULL OR menu_provider_confidence < 70;
-- Comment on columns for documentation
COMMENT ON COLUMN dispensaries.menu_provider IS 'Detected menu platform: dutchie, treez, jane, weedmaps, etc.';
COMMENT ON COLUMN dispensaries.menu_provider_confidence IS 'Confidence score 0-100 for provider detection';
COMMENT ON COLUMN dispensaries.crawler_mode IS 'production = stable templates, sandbox = learning mode';
COMMENT ON COLUMN dispensaries.crawler_status IS 'Current state in crawl pipeline';
COMMENT ON COLUMN dispensaries.provider_detection_data IS 'JSON blob with detection signals and history';
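
The detection-queue index mirrors the query a detection worker would run; for example:

-- Illustrative only: dispensaries still needing provider detection.
SELECT id, name, website
FROM dispensaries
WHERE menu_provider IS NULL OR menu_provider_confidence < 70
ORDER BY id
LIMIT 25;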


@@ -0,0 +1,237 @@
-- =====================================================
-- Crawler Sandboxes and Templates Tables
-- =====================================================
-- 1. Crawler sandboxes - for learning new providers/templates
CREATE TABLE IF NOT EXISTS crawler_sandboxes (
id SERIAL PRIMARY KEY,
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
-- Detection info
suspected_menu_provider VARCHAR(50), -- What we think the provider is
mode VARCHAR(30) NOT NULL DEFAULT 'detection', -- 'detection', 'template_learning', 'validation'
-- Captured data
raw_html_location TEXT, -- S3 key or local file path to captured HTML
screenshot_location TEXT, -- S3 key for screenshot
analysis_json JSONB DEFAULT '{}', -- Extracted patterns, selectors, candidate templates
-- URLs discovered/tested
urls_tested JSONB DEFAULT '[]', -- Array of URLs we fetched
menu_entry_points JSONB DEFAULT '[]', -- Discovered menu URLs
-- Detection signals found
detection_signals JSONB DEFAULT '{}', -- e.g., {"dutchie_embed": false, "treez_script": true, ...}
-- Status tracking
status VARCHAR(30) NOT NULL DEFAULT 'pending',
-- 'pending', 'analyzing', 'template_ready', 'needs_human_review', 'moved_to_production', 'failed'
-- Results
confidence_score SMALLINT DEFAULT 0,
failure_reason TEXT,
human_review_notes TEXT,
-- Timestamps
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
analyzed_at TIMESTAMPTZ,
reviewed_at TIMESTAMPTZ,
CONSTRAINT chk_sandbox_mode CHECK (mode IN ('detection', 'template_learning', 'validation')),
CONSTRAINT chk_sandbox_status CHECK (status IN (
'pending', 'analyzing', 'template_ready', 'needs_human_review', 'moved_to_production', 'failed'
))
);
-- Indexes for sandbox queries
CREATE INDEX IF NOT EXISTS idx_sandbox_dispensary ON crawler_sandboxes(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_sandbox_status ON crawler_sandboxes(status);
CREATE INDEX IF NOT EXISTS idx_sandbox_mode ON crawler_sandboxes(mode);
CREATE INDEX IF NOT EXISTS idx_sandbox_suspected_provider ON crawler_sandboxes(suspected_menu_provider);
-- Unique constraint: one active sandbox per dispensary (can have historical completed ones)
CREATE UNIQUE INDEX IF NOT EXISTS idx_sandbox_active_per_dispensary
ON crawler_sandboxes(dispensary_id)
WHERE status NOT IN ('moved_to_production', 'failed');
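
Because uniqueness here is enforced by a partial index, inserts can target it with ON CONFLICT plus the index predicate; the dispensary id and provider below are hypothetical:

-- Illustrative only: no-op if an active sandbox already exists for this dispensary.
INSERT INTO crawler_sandboxes (dispensary_id, suspected_menu_provider, mode)
VALUES (42, 'treez', 'detection')
ON CONFLICT (dispensary_id)
    WHERE status NOT IN ('moved_to_production', 'failed')
    DO NOTHING;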
-- 2. Crawler templates - reusable scraping configurations
CREATE TABLE IF NOT EXISTS crawler_templates (
id SERIAL PRIMARY KEY,
-- Template identification
provider VARCHAR(50) NOT NULL, -- 'dutchie', 'treez', 'jane', etc.
name VARCHAR(100) NOT NULL, -- 'dutchie_v1', 'treez_standard', 'jane_embedded'
version INTEGER DEFAULT 1,
-- Status
is_active BOOLEAN NOT NULL DEFAULT TRUE,
is_default_for_provider BOOLEAN DEFAULT FALSE,
-- Selector configuration
selector_config JSONB NOT NULL DEFAULT '{}',
-- Structure:
-- {
-- "product_list": "css:.product-card",
-- "product_name": "css:.product-name",
-- "product_price": "css:.price",
-- "product_brand": "css:.brand",
-- "product_image": "css:img.product-image@src",
-- "pagination_next": "css:.next-page",
-- "category_links": "css:.category-nav a",
-- ...
-- }
-- Navigation patterns
navigation_config JSONB DEFAULT '{}',
-- Structure:
-- {
-- "entry_paths": ["/menu", "/shop", "/order"],
-- "age_gate": {"type": "click", "selector": ".age-confirm-btn"},
-- "location_modal": {"dismiss_selector": ".modal-close"},
-- "infinite_scroll": true,
-- "wait_for": ".products-loaded"
-- }
-- Data transformation rules
transform_config JSONB DEFAULT '{}',
-- Structure:
-- {
-- "price_regex": "\\$([\\d.]+)",
-- "weight_normalizer": "g_to_oz",
-- "thc_format": "percentage"
-- }
-- Validation rules
validation_rules JSONB DEFAULT '{}',
-- Structure:
-- {
-- "min_products": 5,
-- "required_fields": ["name", "price"],
-- "price_range": [0.01, 10000]
-- }
-- Test data for validation
test_urls JSONB DEFAULT '[]', -- URLs to validate template against
expected_structure JSONB DEFAULT '{}', -- What we expect to extract
-- Stats
dispensaries_using INTEGER DEFAULT 0,
success_rate DECIMAL(5,2) DEFAULT 0, -- 0-100%
last_successful_crawl TIMESTAMPTZ,
last_failed_crawl TIMESTAMPTZ,
-- Metadata
notes TEXT,
created_by VARCHAR(100),
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
CONSTRAINT uq_template_name UNIQUE (provider, name, version)
);
-- Indexes for templates
CREATE INDEX IF NOT EXISTS idx_template_provider ON crawler_templates(provider);
CREATE INDEX IF NOT EXISTS idx_template_active ON crawler_templates(is_active);
CREATE INDEX IF NOT EXISTS idx_template_default ON crawler_templates(provider, is_default_for_provider)
WHERE is_default_for_provider = TRUE;
-- 3. Sandbox crawl jobs - separate queue for sandbox operations
CREATE TABLE IF NOT EXISTS sandbox_crawl_jobs (
id SERIAL PRIMARY KEY,
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
sandbox_id INTEGER REFERENCES crawler_sandboxes(id) ON DELETE SET NULL,
-- Job type
job_type VARCHAR(30) NOT NULL DEFAULT 'detection', -- 'detection', 'template_test', 'deep_crawl'
-- Status
status VARCHAR(20) NOT NULL DEFAULT 'pending', -- 'pending', 'running', 'completed', 'failed', 'cancelled'
priority INTEGER DEFAULT 0,
-- Timing
scheduled_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
started_at TIMESTAMPTZ,
completed_at TIMESTAMPTZ,
-- Worker tracking
worker_id VARCHAR(100),
-- Results
result_summary JSONB DEFAULT '{}',
error_message TEXT,
-- Timestamps
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
CONSTRAINT chk_sandbox_job_type CHECK (job_type IN ('detection', 'template_test', 'deep_crawl')),
CONSTRAINT chk_sandbox_job_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
);
-- Indexes for sandbox jobs
CREATE INDEX IF NOT EXISTS idx_sandbox_job_status ON sandbox_crawl_jobs(status);
CREATE INDEX IF NOT EXISTS idx_sandbox_job_dispensary ON sandbox_crawl_jobs(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_sandbox_job_pending ON sandbox_crawl_jobs(scheduled_at) WHERE status = 'pending';
-- 4. Insert default Dutchie template (our only stable one for now)
INSERT INTO crawler_templates (provider, name, version, is_active, is_default_for_provider, selector_config, navigation_config, notes)
VALUES (
'dutchie',
'dutchie_standard',
1,
TRUE,
TRUE,
'{
"type": "api_based",
"graphql_endpoint": "/graphql",
"product_container": "data.menu.products",
"uses_puppeteer": true,
"notes": "Dutchie uses GraphQL API, scraped via puppeteer interception"
}'::jsonb,
'{
"entry_paths": ["/menu", "/order", "/embedded-menu", "/products"],
"age_gate": {"type": "auto_detected", "handled_by_stealth": true},
"wait_strategy": "networkidle2",
"requires_javascript": true
}'::jsonb,
'Default Dutchie template - uses existing scraper-v2 pipeline'
)
ON CONFLICT (provider, name, version) DO NOTHING;
-- 5. Triggers for updated_at
CREATE OR REPLACE FUNCTION update_sandbox_timestamp()
RETURNS TRIGGER AS $$
BEGIN
NEW.updated_at = NOW();
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
DROP TRIGGER IF EXISTS trigger_sandbox_updated_at ON crawler_sandboxes;
CREATE TRIGGER trigger_sandbox_updated_at
BEFORE UPDATE ON crawler_sandboxes
FOR EACH ROW
EXECUTE FUNCTION update_sandbox_timestamp();
DROP TRIGGER IF EXISTS trigger_template_updated_at ON crawler_templates;
CREATE TRIGGER trigger_template_updated_at
BEFORE UPDATE ON crawler_templates
FOR EACH ROW
EXECUTE FUNCTION update_sandbox_timestamp();
DROP TRIGGER IF EXISTS trigger_sandbox_job_updated_at ON sandbox_crawl_jobs;
CREATE TRIGGER trigger_sandbox_job_updated_at
BEFORE UPDATE ON sandbox_crawl_jobs
FOR EACH ROW
EXECUTE FUNCTION update_sandbox_timestamp();
-- Comments for documentation
COMMENT ON TABLE crawler_sandboxes IS 'Learning/testing environment for unknown menu providers';
COMMENT ON TABLE crawler_templates IS 'Reusable scraping configurations per menu provider';
COMMENT ON TABLE sandbox_crawl_jobs IS 'Job queue for sandbox crawl operations (separate from production)';


@@ -0,0 +1,118 @@
-- =====================================================
-- Multi-Category Intelligence Support
-- =====================================================
-- Each dispensary can have different providers for different
-- intelligence categories (products, specials, brands, metadata)
-- 1. Product Intelligence Columns
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS product_provider VARCHAR(50);
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS product_confidence SMALLINT DEFAULT 0;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS product_crawler_mode VARCHAR(20) DEFAULT 'sandbox';
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS last_product_scan_at TIMESTAMPTZ;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS product_detection_data JSONB DEFAULT '{}';
-- 2. Specials Intelligence Columns
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS specials_provider VARCHAR(50);
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS specials_confidence SMALLINT DEFAULT 0;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS specials_crawler_mode VARCHAR(20) DEFAULT 'sandbox';
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS last_specials_scan_at TIMESTAMPTZ;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS specials_detection_data JSONB DEFAULT '{}';
-- 3. Brand Intelligence Columns
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS brand_provider VARCHAR(50);
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS brand_confidence SMALLINT DEFAULT 0;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS brand_crawler_mode VARCHAR(20) DEFAULT 'sandbox';
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS last_brand_scan_at TIMESTAMPTZ;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS brand_detection_data JSONB DEFAULT '{}';
-- 4. Metadata Intelligence Columns (categories, taxonomy, etc.)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS metadata_provider VARCHAR(50);
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS metadata_confidence SMALLINT DEFAULT 0;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS metadata_crawler_mode VARCHAR(20) DEFAULT 'sandbox';
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS last_metadata_scan_at TIMESTAMPTZ;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS metadata_detection_data JSONB DEFAULT '{}';
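
Since each category carries its own provider, confidence, and mode, mixed states are expected; for example, dispensaries whose products are in production while specials are still sandboxed:

-- Illustrative only.
SELECT id, name, product_provider, specials_provider
FROM dispensaries
WHERE product_crawler_mode = 'production'
  AND specials_crawler_mode = 'sandbox';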
-- 5. Add category column to crawler_sandboxes
ALTER TABLE crawler_sandboxes ADD COLUMN IF NOT EXISTS category VARCHAR(30) DEFAULT 'product';
-- Valid categories: 'product', 'specials', 'brand', 'metadata'
ALTER TABLE crawler_sandboxes ADD COLUMN IF NOT EXISTS template_name VARCHAR(100);
ALTER TABLE crawler_sandboxes ADD COLUMN IF NOT EXISTS quality_score SMALLINT DEFAULT 0;
ALTER TABLE crawler_sandboxes ADD COLUMN IF NOT EXISTS products_extracted INTEGER DEFAULT 0;
ALTER TABLE crawler_sandboxes ADD COLUMN IF NOT EXISTS fields_missing INTEGER DEFAULT 0;
ALTER TABLE crawler_sandboxes ADD COLUMN IF NOT EXISTS error_count INTEGER DEFAULT 0;
-- 6. Add category column to sandbox_crawl_jobs
ALTER TABLE sandbox_crawl_jobs ADD COLUMN IF NOT EXISTS category VARCHAR(30) DEFAULT 'product';
ALTER TABLE sandbox_crawl_jobs ADD COLUMN IF NOT EXISTS template_name VARCHAR(100);
-- 7. Indexes for per-category queries
CREATE INDEX IF NOT EXISTS idx_disp_product_provider ON dispensaries(product_provider);
CREATE INDEX IF NOT EXISTS idx_disp_product_mode ON dispensaries(product_crawler_mode);
CREATE INDEX IF NOT EXISTS idx_disp_specials_provider ON dispensaries(specials_provider);
CREATE INDEX IF NOT EXISTS idx_disp_specials_mode ON dispensaries(specials_crawler_mode);
CREATE INDEX IF NOT EXISTS idx_disp_brand_provider ON dispensaries(brand_provider);
CREATE INDEX IF NOT EXISTS idx_disp_brand_mode ON dispensaries(brand_crawler_mode);
CREATE INDEX IF NOT EXISTS idx_disp_metadata_provider ON dispensaries(metadata_provider);
CREATE INDEX IF NOT EXISTS idx_disp_metadata_mode ON dispensaries(metadata_crawler_mode);
CREATE INDEX IF NOT EXISTS idx_sandbox_category ON crawler_sandboxes(category);
CREATE INDEX IF NOT EXISTS idx_sandbox_template ON crawler_sandboxes(template_name);
CREATE INDEX IF NOT EXISTS idx_sandbox_job_category ON sandbox_crawl_jobs(category);
-- 8. Migrate existing menu_provider to product_provider for Dutchie stores
-- (Only if menu_provider = 'dutchie' and product_provider is null)
UPDATE dispensaries
SET
product_provider = menu_provider,
product_confidence = menu_provider_confidence,
product_crawler_mode = CASE
WHEN menu_provider = 'dutchie' AND menu_provider_confidence >= 70 THEN 'production'
ELSE 'sandbox'
END
WHERE menu_provider IS NOT NULL
AND product_provider IS NULL;
-- 9. Add environment column to crawler_templates if not exists
ALTER TABLE crawler_templates ADD COLUMN IF NOT EXISTS environment VARCHAR(20) DEFAULT 'production';
-- Valid: 'production', 'sandbox'
-- 10. Insert Treez sandbox template (existing code to be linked here)
INSERT INTO crawler_templates (provider, name, version, is_active, is_default_for_provider, environment, selector_config, navigation_config, notes)
VALUES (
'treez',
'treez_products_v0',
1,
FALSE, -- Not active for production
FALSE, -- Not default
'sandbox',
'{
"type": "api_based",
"notes": "Treez API-based scraper - unreliable, sandbox only",
"product_container": "products",
"requires_api_key": false,
"uses_puppeteer": true
}'::jsonb,
'{
"entry_paths": ["/menu", "/shop"],
"wait_strategy": "networkidle2",
"requires_javascript": true
}'::jsonb,
'Treez sandbox template - v0 implementation, needs quality improvement'
)
ON CONFLICT (provider, name, version) DO NOTHING;
-- 11. Update existing Dutchie template to specify environment
UPDATE crawler_templates
SET environment = 'production'
WHERE provider = 'dutchie' AND name = 'dutchie_standard';
-- 12. Comments
COMMENT ON COLUMN dispensaries.product_provider IS 'Provider for product intelligence (dutchie, treez, jane, etc.)';
COMMENT ON COLUMN dispensaries.product_crawler_mode IS 'production or sandbox mode for product crawling';
COMMENT ON COLUMN dispensaries.specials_provider IS 'Provider for specials/deals intelligence';
COMMENT ON COLUMN dispensaries.brand_provider IS 'Provider for brand intelligence';
COMMENT ON COLUMN dispensaries.metadata_provider IS 'Provider for metadata/taxonomy intelligence';
COMMENT ON COLUMN crawler_sandboxes.category IS 'Intelligence category: product, specials, brand, metadata';
COMMENT ON COLUMN crawler_sandboxes.quality_score IS 'Quality score 0-100 for sandbox run results';
COMMENT ON COLUMN crawler_templates.environment IS 'Template environment: production or sandbox';


@@ -0,0 +1,69 @@
-- =====================================================
-- Link Stores to Dispensaries (Master AZDHS Directory)
-- =====================================================
-- This migration adds a foreign key from stores to dispensaries,
-- allowing the scheduler to reference the master dispensary records.
-- 1. Add dispensary_id column to stores table
ALTER TABLE stores ADD COLUMN IF NOT EXISTS dispensary_id INTEGER REFERENCES dispensaries(id) ON DELETE SET NULL;
-- 2. Create index for efficient lookups
CREATE INDEX IF NOT EXISTS idx_stores_dispensary_id ON stores(dispensary_id);
-- 3. Update the crawl_schedule_status view to include dispensary info
DROP VIEW IF EXISTS crawl_schedule_status;
CREATE OR REPLACE VIEW crawl_schedule_status AS
SELECT
s.id AS store_id,
s.name AS store_name,
s.slug AS store_slug,
s.timezone,
s.active,
s.scrape_enabled,
s.last_scraped_at,
-- Dispensary info (master record)
s.dispensary_id,
d.name AS dispensary_name,
d.company_name AS dispensary_company,
d.city AS dispensary_city,
d.address AS dispensary_address,
d.menu_url AS dispensary_menu_url,
-- Schedule settings (use store override or global)
COALESCE(scs.enabled, TRUE) AS schedule_enabled,
COALESCE(scs.interval_hours, cs_global.interval_hours, 4) AS interval_hours,
COALESCE(scs.daily_special_enabled, TRUE) AS daily_special_enabled,
COALESCE(scs.daily_special_time, '00:01'::TIME) AS daily_special_time,
COALESCE(scs.priority, 0) AS priority,
-- Next scheduled run calculation
CASE
WHEN s.last_scraped_at IS NULL THEN NOW()
ELSE s.last_scraped_at + (COALESCE(scs.interval_hours, cs_global.interval_hours, 4) || ' hours')::INTERVAL
END AS next_scheduled_run,
-- Latest job info
cj.id AS latest_job_id,
cj.status AS latest_job_status,
cj.started_at AS latest_job_started,
cj.completed_at AS latest_job_completed,
cj.products_found AS latest_products_found
FROM stores s
LEFT JOIN dispensaries d ON d.id = s.dispensary_id
LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
LEFT JOIN crawler_schedule cs_global ON cs_global.schedule_type = 'global_interval'
LEFT JOIN LATERAL (
SELECT * FROM crawl_jobs cj2
WHERE cj2.store_id = s.id
ORDER BY cj2.created_at DESC
LIMIT 1
) cj ON TRUE
WHERE s.active = TRUE;
-- Grant permissions
GRANT SELECT ON crawl_schedule_status TO dutchie;
-- 4. Comments
COMMENT ON COLUMN stores.dispensary_id IS 'FK to dispensaries table (master AZDHS directory)';
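
Until the fuzzy-matching backfill script later in this commit runs, links can also be made by hand; a conservative exact-name sketch:

-- Illustrative only: link unlinked stores on an exact case-insensitive name match.
UPDATE stores s
SET dispensary_id = d.id
FROM dispensaries d
WHERE s.dispensary_id IS NULL
  AND lower(s.name) = lower(d.name);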


@@ -0,0 +1,99 @@
-- =====================================================
-- Scheduler Orchestrator Fields
-- =====================================================
-- Add last_status and last_summary to store_crawl_schedule
-- for meaningful job result tracking
-- 1. Add new columns to store_crawl_schedule
ALTER TABLE store_crawl_schedule ADD COLUMN IF NOT EXISTS last_status VARCHAR(50);
-- Valid values: 'success', 'error', 'sandbox_only', 'detection_only', 'pending'
ALTER TABLE store_crawl_schedule ADD COLUMN IF NOT EXISTS last_summary TEXT;
-- Human-readable summary like "Detection + Dutchie products crawl (187 items)"
ALTER TABLE store_crawl_schedule ADD COLUMN IF NOT EXISTS last_run_at TIMESTAMPTZ;
ALTER TABLE store_crawl_schedule ADD COLUMN IF NOT EXISTS last_error TEXT;
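
At the end of a run, the orchestrator would write its result back into these columns; a hedged sketch using the summary wording from the comment above (store id hypothetical):

-- Illustrative only.
UPDATE store_crawl_schedule
SET last_status  = 'success',
    last_summary = 'Detection + Dutchie products crawl (187 items)',
    last_run_at  = NOW(),
    last_error   = NULL
WHERE store_id = 42;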
-- 2. Add job_type tracking to crawl_jobs for better job categorization
ALTER TABLE crawl_jobs ADD COLUMN IF NOT EXISTS orchestrator_run_id UUID;
ALTER TABLE crawl_jobs ADD COLUMN IF NOT EXISTS detection_result JSONB;
ALTER TABLE crawl_jobs ADD COLUMN IF NOT EXISTS products_new INTEGER;
ALTER TABLE crawl_jobs ADD COLUMN IF NOT EXISTS products_updated INTEGER;
-- 3. Update the crawl_schedule_status view to include new fields
DROP VIEW IF EXISTS crawl_schedule_status;
CREATE OR REPLACE VIEW crawl_schedule_status AS
SELECT
s.id AS store_id,
s.name AS store_name,
s.slug AS store_slug,
s.timezone,
s.active,
s.scrape_enabled,
s.last_scraped_at,
-- Dispensary info (master record)
s.dispensary_id,
d.name AS dispensary_name,
d.company_name AS dispensary_company,
d.city AS dispensary_city,
d.address AS dispensary_address,
d.menu_url AS dispensary_menu_url,
-- Provider intelligence from dispensary (if linked)
d.product_provider,
d.product_confidence,
d.product_crawler_mode,
-- Schedule settings (use store override or global)
COALESCE(scs.enabled, TRUE) AS schedule_enabled,
COALESCE(scs.interval_hours, cs_global.interval_hours, 4) AS interval_hours,
COALESCE(scs.daily_special_enabled, TRUE) AS daily_special_enabled,
COALESCE(scs.daily_special_time, '00:01'::TIME) AS daily_special_time,
COALESCE(scs.priority, 0) AS priority,
-- Orchestrator status
scs.last_status,
scs.last_summary,
scs.last_run_at AS schedule_last_run,
scs.last_error,
-- Next scheduled run calculation
CASE
WHEN s.last_scraped_at IS NULL THEN NOW()
ELSE s.last_scraped_at + (COALESCE(scs.interval_hours, cs_global.interval_hours, 4) || ' hours')::INTERVAL
END AS next_scheduled_run,
-- Latest job info
cj.id AS latest_job_id,
cj.status AS latest_job_status,
cj.job_type AS latest_job_type,
cj.trigger_type AS latest_job_trigger,
cj.started_at AS latest_job_started,
cj.completed_at AS latest_job_completed,
cj.products_found AS latest_products_found,
cj.products_new AS latest_products_new,
cj.products_updated AS latest_products_updated,
cj.error_message AS latest_job_error
FROM stores s
LEFT JOIN dispensaries d ON d.id = s.dispensary_id
LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
LEFT JOIN crawler_schedule cs_global ON cs_global.schedule_type = 'global_interval'
LEFT JOIN LATERAL (
SELECT * FROM crawl_jobs cj2
WHERE cj2.store_id = s.id
ORDER BY cj2.created_at DESC
LIMIT 1
) cj ON TRUE
WHERE s.active = TRUE;
-- 4. Grant permissions
GRANT SELECT ON crawl_schedule_status TO dutchie;
-- 5. Comments
COMMENT ON COLUMN store_crawl_schedule.last_status IS 'Orchestrator result status: success, error, sandbox_only, detection_only';
COMMENT ON COLUMN store_crawl_schedule.last_summary IS 'Human-readable summary of last orchestrator run';
COMMENT ON COLUMN store_crawl_schedule.last_run_at IS 'When orchestrator last ran for this store';
COMMENT ON COLUMN crawl_jobs.orchestrator_run_id IS 'Groups related jobs from same orchestrator run';
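
Grouping by orchestrator_run_id makes per-run rollups straightforward; for example:

-- Illustrative only: one summary row per orchestrator run.
SELECT orchestrator_run_id,
       COUNT(*)            AS jobs,
       SUM(products_found) AS products_found,
       MIN(started_at)     AS run_started,
       MAX(completed_at)   AS run_completed
FROM crawl_jobs
WHERE orchestrator_run_id IS NOT NULL
GROUP BY orchestrator_run_id
ORDER BY run_started DESC
LIMIT 20;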


@@ -49,7 +49,10 @@ import scraperMonitorRoutes from './routes/scraper-monitor';
import apiTokensRoutes from './routes/api-tokens';
import apiPermissionsRoutes from './routes/api-permissions';
import parallelScrapeRoutes from './routes/parallel-scrape';
import scheduleRoutes from './routes/schedule';
import crawlerSandboxRoutes from './routes/crawler-sandbox';
import { trackApiUsage, checkRateLimit } from './middleware/apiTokenTracker';
import { startCrawlScheduler } from './services/crawl-scheduler';
import { validateWordPressPermissions } from './middleware/wordpressPermissions';
// Apply WordPress permissions validation first (sets req.apiToken)
@@ -75,6 +78,8 @@ app.use('/api/scraper-monitor', scraperMonitorRoutes);
app.use('/api/api-tokens', apiTokensRoutes);
app.use('/api/api-permissions', apiPermissionsRoutes);
app.use('/api/parallel-scrape', parallelScrapeRoutes);
app.use('/api/schedule', scheduleRoutes);
app.use('/api/crawler-sandbox', crawlerSandboxRoutes);
async function startServer() {
  try {
@@ -86,6 +91,10 @@ async function startServer() {
    // Clean up any orphaned proxy test jobs from previous server runs
    await cleanupOrphanedJobs();
    // Start the crawl scheduler (checks every minute for jobs to run)
    startCrawlScheduler();
    logger.info('system', 'Crawl scheduler started');
    app.listen(PORT, () => {
      logger.info('system', `Server running on port ${PORT}`);
      console.log(`🚀 Server running on port ${PORT}`);


@@ -0,0 +1,628 @@
/**
* Crawler Sandbox API Routes
*
* Endpoints for managing sandbox crawls, templates, and provider detection
*/
import express from 'express';
import { pool } from '../db/migrate';
import { authMiddleware, requireRole } from '../auth/middleware';
import { logger } from '../services/logger';
import {
runDetectMenuProviderJob,
runDutchieMenuCrawlJob,
runSandboxCrawlJob,
} from '../services/crawler-jobs';
const router = express.Router();
// Apply auth middleware to all routes
router.use(authMiddleware);
// ========================================
// Sandbox Entries
// ========================================
/**
* GET /api/crawler-sandbox
* List sandbox entries with optional filters
*/
router.get('/', async (req, res) => {
try {
const { status, dispensaryId, limit = 50, offset = 0 } = req.query;
let query = `
SELECT cs.*, d.name as dispensary_name, d.website, d.menu_provider, d.crawler_status
FROM crawler_sandboxes cs
JOIN dispensaries d ON d.id = cs.dispensary_id
WHERE 1=1
`;
const params: any[] = [];
let paramIndex = 1;
if (status) {
query += ` AND cs.status = $${paramIndex}`;
params.push(status);
paramIndex++;
}
if (dispensaryId) {
query += ` AND cs.dispensary_id = $${paramIndex}`;
params.push(Number(dispensaryId));
paramIndex++;
}
query += ` ORDER BY cs.created_at DESC LIMIT $${paramIndex} OFFSET $${paramIndex + 1}`;
params.push(Number(limit), Number(offset));
const result = await pool.query(query, params);
// Get total count
const countResult = await pool.query(
`SELECT COUNT(*) FROM crawler_sandboxes cs WHERE 1=1
${status ? 'AND cs.status = $1' : ''}
${dispensaryId ? `AND cs.dispensary_id = $${status ? 2 : 1}` : ''}`,
status && dispensaryId ? [status, dispensaryId] : status ? [status] : dispensaryId ? [dispensaryId] : []
);
res.json({
sandboxes: result.rows,
total: parseInt(countResult.rows[0].count),
limit: Number(limit),
offset: Number(offset),
});
} catch (error: any) {
logger.error('api', `Get sandboxes error: ${error.message}`);
res.status(500).json({ error: error.message });
}
});
/**
* GET /api/crawler-sandbox/:id
* Get a single sandbox entry with full details
*/
router.get('/:id', async (req, res) => {
try {
const { id } = req.params;
const result = await pool.query(
`SELECT cs.*, d.name as dispensary_name, d.website, d.menu_url,
d.menu_provider, d.menu_provider_confidence, d.crawler_mode, d.crawler_status
FROM crawler_sandboxes cs
JOIN dispensaries d ON d.id = cs.dispensary_id
WHERE cs.id = $1`,
[id]
);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Sandbox entry not found' });
}
// Get related jobs
const jobs = await pool.query(
`SELECT * FROM sandbox_crawl_jobs
WHERE sandbox_id = $1 OR dispensary_id = $2
ORDER BY created_at DESC
LIMIT 10`,
[id, result.rows[0].dispensary_id]
);
res.json({
sandbox: result.rows[0],
jobs: jobs.rows,
});
} catch (error: any) {
logger.error('api', `Get sandbox error: ${error.message}`);
res.status(500).json({ error: error.message });
}
});
/**
* POST /api/crawler-sandbox/:id/analyze
* Trigger re-analysis of a sandbox entry
*/
router.post('/:id/analyze', requireRole('admin'), async (req, res) => {
try {
const { id } = req.params;
const sandbox = await pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [id]);
if (sandbox.rows.length === 0) {
return res.status(404).json({ error: 'Sandbox entry not found' });
}
// Queue a new sandbox job
const job = await pool.query(
`INSERT INTO sandbox_crawl_jobs (dispensary_id, sandbox_id, job_type, status, priority)
VALUES ($1, $2, 'deep_crawl', 'pending', 20)
RETURNING id`,
[sandbox.rows[0].dispensary_id, id]
);
// Update sandbox status
await pool.query(
`UPDATE crawler_sandboxes SET status = 'pending', updated_at = NOW() WHERE id = $1`,
[id]
);
res.json({
message: 'Analysis job queued',
jobId: job.rows[0].id,
});
} catch (error: any) {
logger.error('api', `Analyze sandbox error: ${error.message}`);
res.status(500).json({ error: error.message });
}
});
/**
* POST /api/crawler-sandbox/:id/move-to-production
* Move a sandbox entry to production (for Dutchie dispensaries)
*/
router.post('/:id/move-to-production', requireRole('admin'), async (req, res) => {
try {
const { id } = req.params;
const sandbox = await pool.query(
`SELECT cs.*, d.menu_provider
FROM crawler_sandboxes cs
JOIN dispensaries d ON d.id = cs.dispensary_id
WHERE cs.id = $1`,
[id]
);
if (sandbox.rows.length === 0) {
return res.status(404).json({ error: 'Sandbox entry not found' });
}
// Can only move to production if provider is dutchie
if (sandbox.rows[0].menu_provider !== 'dutchie') {
return res.status(400).json({
error: 'Only Dutchie dispensaries can be moved to production currently',
});
}
// Update dispensary to production mode
await pool.query(
`UPDATE dispensaries
SET crawler_mode = 'production', crawler_status = 'idle', updated_at = NOW()
WHERE id = $1`,
[sandbox.rows[0].dispensary_id]
);
// Mark sandbox as moved
await pool.query(
`UPDATE crawler_sandboxes
SET status = 'moved_to_production', updated_at = NOW()
WHERE id = $1`,
[id]
);
res.json({ message: 'Dispensary moved to production' });
} catch (error: any) {
logger.error('api', `Move to production error: ${error.message}`);
res.status(500).json({ error: error.message });
}
});
/**
* PATCH /api/crawler-sandbox/:id
* Update sandbox entry (e.g., add human review notes)
*/
router.patch('/:id', requireRole('admin'), async (req, res) => {
try {
const { id } = req.params;
const { human_review_notes, status, suspected_menu_provider } = req.body;
const updates: string[] = [];
const params: any[] = [];
let paramIndex = 1;
if (human_review_notes !== undefined) {
updates.push(`human_review_notes = $${paramIndex}`);
params.push(human_review_notes);
paramIndex++;
}
if (status) {
updates.push(`status = $${paramIndex}`);
params.push(status);
paramIndex++;
}
if (suspected_menu_provider !== undefined) {
updates.push(`suspected_menu_provider = $${paramIndex}`);
params.push(suspected_menu_provider);
paramIndex++;
}
if (updates.length === 0) {
return res.status(400).json({ error: 'No updates provided' });
}
updates.push('updated_at = NOW()');
if (human_review_notes !== undefined) {
updates.push('reviewed_at = NOW()');
}
params.push(id);
await pool.query(
`UPDATE crawler_sandboxes SET ${updates.join(', ')} WHERE id = $${paramIndex}`,
params
);
res.json({ message: 'Sandbox updated' });
} catch (error: any) {
logger.error('api', `Update sandbox error: ${error.message}`);
res.status(500).json({ error: error.message });
}
});
// ========================================
// Templates
// ========================================
/**
* GET /api/crawler-sandbox/templates/list
* List all crawler templates
*/
router.get('/templates/list', async (req, res) => {
try {
const result = await pool.query(
`SELECT * FROM crawler_templates ORDER BY provider, is_default_for_provider DESC, name`
);
res.json({ templates: result.rows });
} catch (error: any) {
logger.error('api', `Get templates error: ${error.message}`);
res.status(500).json({ error: error.message });
}
});
/**
* GET /api/crawler-sandbox/templates/:id
* Get a single template
*/
router.get('/templates/:id', async (req, res) => {
try {
const { id } = req.params;
const result = await pool.query('SELECT * FROM crawler_templates WHERE id = $1', [id]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Template not found' });
}
res.json({ template: result.rows[0] });
} catch (error: any) {
logger.error('api', `Get template error: ${error.message}`);
res.status(500).json({ error: error.message });
}
});
/**
* POST /api/crawler-sandbox/templates
* Create a new template
*/
router.post('/templates', requireRole('admin'), async (req, res) => {
try {
const {
provider,
name,
selector_config,
navigation_config,
transform_config,
validation_rules,
notes,
} = req.body;
if (!provider || !name) {
return res.status(400).json({ error: 'provider and name are required' });
}
const result = await pool.query(
`INSERT INTO crawler_templates
(provider, name, selector_config, navigation_config, transform_config, validation_rules, notes, created_by)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
RETURNING *`,
[
provider,
name,
JSON.stringify(selector_config || {}),
JSON.stringify(navigation_config || {}),
JSON.stringify(transform_config || {}),
JSON.stringify(validation_rules || {}),
notes,
(req as any).user?.email || 'system',
]
);
res.status(201).json({ template: result.rows[0] });
} catch (error: any) {
logger.error('api', `Create template error: ${error.message}`);
res.status(500).json({ error: error.message });
}
});
/**
* PUT /api/crawler-sandbox/templates/:id
* Update a template
*/
router.put('/templates/:id', requireRole('admin'), async (req, res) => {
try {
const { id } = req.params;
const {
is_active,
is_default_for_provider,
selector_config,
navigation_config,
transform_config,
validation_rules,
notes,
} = req.body;
const updates: string[] = [];
const params: any[] = [];
let paramIndex = 1;
if (is_active !== undefined) {
updates.push(`is_active = $${paramIndex}`);
params.push(is_active);
paramIndex++;
}
if (is_default_for_provider !== undefined) {
updates.push(`is_default_for_provider = $${paramIndex}`);
params.push(is_default_for_provider);
paramIndex++;
}
if (selector_config !== undefined) {
updates.push(`selector_config = $${paramIndex}`);
params.push(JSON.stringify(selector_config));
paramIndex++;
}
if (navigation_config !== undefined) {
updates.push(`navigation_config = $${paramIndex}`);
params.push(JSON.stringify(navigation_config));
paramIndex++;
}
if (transform_config !== undefined) {
updates.push(`transform_config = $${paramIndex}`);
params.push(JSON.stringify(transform_config));
paramIndex++;
}
if (validation_rules !== undefined) {
updates.push(`validation_rules = $${paramIndex}`);
params.push(JSON.stringify(validation_rules));
paramIndex++;
}
if (notes !== undefined) {
updates.push(`notes = $${paramIndex}`);
params.push(notes);
paramIndex++;
}
if (updates.length === 0) {
return res.status(400).json({ error: 'No updates provided' });
}
updates.push('updated_at = NOW()');
params.push(id);
await pool.query(
`UPDATE crawler_templates SET ${updates.join(', ')} WHERE id = $${paramIndex}`,
params
);
const result = await pool.query('SELECT * FROM crawler_templates WHERE id = $1', [id]);
res.json({ template: result.rows[0] });
} catch (error: any) {
logger.error('api', `Update template error: ${error.message}`);
res.status(500).json({ error: error.message });
}
});
// ========================================
// Jobs
// ========================================
/**
* GET /api/crawler-sandbox/jobs/list
* List sandbox crawl jobs
*/
router.get('/jobs/list', async (req, res) => {
try {
const { status, dispensaryId, limit = 50 } = req.query;
let query = `
SELECT sj.*, d.name as dispensary_name
FROM sandbox_crawl_jobs sj
JOIN dispensaries d ON d.id = sj.dispensary_id
WHERE 1=1
`;
const params: any[] = [];
let paramIndex = 1;
if (status) {
query += ` AND sj.status = $${paramIndex}`;
params.push(status);
paramIndex++;
}
if (dispensaryId) {
query += ` AND sj.dispensary_id = $${paramIndex}`;
params.push(Number(dispensaryId));
paramIndex++;
}
query += ` ORDER BY sj.created_at DESC LIMIT $${paramIndex}`;
params.push(Number(limit));
const result = await pool.query(query, params);
res.json({ jobs: result.rows });
} catch (error: any) {
logger.error('api', `Get jobs error: ${error.message}`);
res.status(500).json({ error: error.message });
}
});
/**
* POST /api/crawler-sandbox/jobs/detect/:dispensaryId
* Trigger provider detection for a dispensary
*/
router.post('/jobs/detect/:dispensaryId', requireRole('admin'), async (req, res) => {
try {
const { dispensaryId } = req.params;
// Create detection job
const job = await pool.query(
`INSERT INTO sandbox_crawl_jobs (dispensary_id, job_type, status, priority)
VALUES ($1, 'detection', 'pending', 30)
RETURNING id`,
[dispensaryId]
);
// Update dispensary status
await pool.query(
`UPDATE dispensaries SET crawler_status = 'queued_detection', updated_at = NOW() WHERE id = $1`,
[dispensaryId]
);
res.json({
message: 'Detection job queued',
jobId: job.rows[0].id,
});
} catch (error: any) {
logger.error('api', `Queue detection error: ${error.message}`);
res.status(500).json({ error: error.message });
}
});
/**
* POST /api/crawler-sandbox/jobs/run/:id
* Immediately run a sandbox job
*/
router.post('/jobs/run/:id', requireRole('admin'), async (req, res) => {
try {
const { id } = req.params;
const job = await pool.query('SELECT * FROM sandbox_crawl_jobs WHERE id = $1', [id]);
if (job.rows.length === 0) {
return res.status(404).json({ error: 'Job not found' });
}
const jobData = job.rows[0];
// Run the job immediately
let result;
if (jobData.job_type === 'detection') {
result = await runDetectMenuProviderJob(jobData.dispensary_id);
} else {
result = await runSandboxCrawlJob(jobData.dispensary_id, jobData.sandbox_id);
}
// Update job status
await pool.query(
`UPDATE sandbox_crawl_jobs
SET status = $1, completed_at = NOW(), result_summary = $2, error_message = $3
WHERE id = $4`,
[
result.success ? 'completed' : 'failed',
JSON.stringify(result.data || {}),
result.success ? null : result.message,
id,
]
);
res.json(result);
} catch (error: any) {
logger.error('api', `Run job error: ${error.message}`);
res.status(500).json({ error: error.message });
}
});
// ========================================
// Stats
// ========================================
/**
* GET /api/crawler-sandbox/stats/overview
* Get sandbox/crawler statistics
*/
router.get('/stats/overview', async (req, res) => {
try {
// Dispensary provider stats
const providerStats = await pool.query(`
SELECT
menu_provider,
COUNT(*) as count,
AVG(menu_provider_confidence)::integer as avg_confidence
FROM dispensaries
WHERE menu_provider IS NOT NULL
GROUP BY menu_provider
ORDER BY count DESC
`);
// Mode stats
const modeStats = await pool.query(`
SELECT
crawler_mode,
COUNT(*) as count
FROM dispensaries
GROUP BY crawler_mode
`);
// Status stats
const statusStats = await pool.query(`
SELECT
crawler_status,
COUNT(*) as count
FROM dispensaries
GROUP BY crawler_status
ORDER BY count DESC
`);
// Sandbox stats
const sandboxStats = await pool.query(`
SELECT
status,
COUNT(*) as count
FROM crawler_sandboxes
GROUP BY status
`);
// Job stats
const jobStats = await pool.query(`
SELECT
status,
job_type,
COUNT(*) as count
FROM sandbox_crawl_jobs
GROUP BY status, job_type
`);
// Recent activity
const recentActivity = await pool.query(`
SELECT 'sandbox' as type, id, dispensary_id, status, created_at
FROM crawler_sandboxes
ORDER BY created_at DESC
LIMIT 5
`);
res.json({
providers: providerStats.rows,
modes: modeStats.rows,
statuses: statusStats.rows,
sandbox: sandboxStats.rows,
jobs: jobStats.rows,
recentActivity: recentActivity.rows,
});
} catch (error: any) {
logger.error('api', `Get stats error: ${error.message}`);
res.status(500).json({ error: error.message });
}
});
export default router;


@@ -0,0 +1,344 @@
import { Router, Request, Response } from 'express';
import { authMiddleware, requireRole } from '../auth/middleware';
import {
getGlobalSchedule,
updateGlobalSchedule,
getStoreScheduleStatuses,
getStoreSchedule,
updateStoreSchedule,
getAllRecentJobs,
getRecentJobs,
triggerManualCrawl,
triggerAllStoresCrawl,
cancelJob,
restartCrawlScheduler,
setSchedulerMode,
getSchedulerMode,
} from '../services/crawl-scheduler';
import {
runStoreCrawlOrchestrator,
runBatchOrchestrator,
getStoresDueForOrchestration,
} from '../services/store-crawl-orchestrator';
const router = Router();
router.use(authMiddleware);
// ============================================
// Global Schedule Endpoints
// ============================================
/**
* GET /api/schedule/global
* Get global schedule settings
*/
router.get('/global', async (req: Request, res: Response) => {
try {
const schedules = await getGlobalSchedule();
res.json({ schedules });
} catch (error: any) {
console.error('Error fetching global schedule:', error);
res.status(500).json({ error: 'Failed to fetch global schedule' });
}
});
/**
* PUT /api/schedule/global/:type
* Update global schedule setting
*/
router.put('/global/:type', requireRole('superadmin', 'admin'), async (req: Request, res: Response) => {
try {
const { type } = req.params;
const { enabled, interval_hours, run_time } = req.body;
if (type !== 'global_interval' && type !== 'daily_special') {
return res.status(400).json({ error: 'Invalid schedule type' });
}
const schedule = await updateGlobalSchedule(type, {
enabled,
interval_hours,
run_time
});
// Restart scheduler to apply changes
await restartCrawlScheduler();
res.json({ schedule, message: 'Schedule updated and scheduler restarted' });
} catch (error: any) {
console.error('Error updating global schedule:', error);
res.status(500).json({ error: 'Failed to update global schedule' });
}
});
// ============================================
// Store Schedule Endpoints
// ============================================
/**
* GET /api/schedule/stores
* Get all store schedule statuses
*/
router.get('/stores', async (req: Request, res: Response) => {
try {
const stores = await getStoreScheduleStatuses();
res.json({ stores });
} catch (error: any) {
console.error('Error fetching store schedules:', error);
res.status(500).json({ error: 'Failed to fetch store schedules' });
}
});
/**
* GET /api/schedule/stores/:storeId
* Get schedule for a specific store
*/
router.get('/stores/:storeId', async (req: Request, res: Response) => {
try {
const storeId = parseInt(req.params.storeId);
if (isNaN(storeId)) {
return res.status(400).json({ error: 'Invalid store ID' });
}
const schedule = await getStoreSchedule(storeId);
res.json({ schedule });
} catch (error: any) {
console.error('Error fetching store schedule:', error);
res.status(500).json({ error: 'Failed to fetch store schedule' });
}
});
/**
* PUT /api/schedule/stores/:storeId
* Update schedule for a specific store
*/
router.put('/stores/:storeId', requireRole('superadmin', 'admin'), async (req: Request, res: Response) => {
try {
const storeId = parseInt(req.params.storeId);
if (isNaN(storeId)) {
return res.status(400).json({ error: 'Invalid store ID' });
}
const {
enabled,
interval_hours,
daily_special_enabled,
daily_special_time,
priority
} = req.body;
const schedule = await updateStoreSchedule(storeId, {
enabled,
interval_hours,
daily_special_enabled,
daily_special_time,
priority
});
res.json({ schedule });
} catch (error: any) {
console.error('Error updating store schedule:', error);
res.status(500).json({ error: 'Failed to update store schedule' });
}
});
// ============================================
// Job Queue Endpoints
// ============================================
/**
* GET /api/schedule/jobs
* Get recent jobs
*/
router.get('/jobs', async (req: Request, res: Response) => {
try {
const limit = parseInt(req.query.limit as string) || 50;
const jobs = await getAllRecentJobs(Math.min(limit, 200));
res.json({ jobs });
} catch (error: any) {
console.error('Error fetching jobs:', error);
res.status(500).json({ error: 'Failed to fetch jobs' });
}
});
/**
* GET /api/schedule/jobs/store/:storeId
* Get recent jobs for a specific store
*/
router.get('/jobs/store/:storeId', async (req: Request, res: Response) => {
try {
const storeId = parseInt(req.params.storeId);
if (isNaN(storeId)) {
return res.status(400).json({ error: 'Invalid store ID' });
}
const limit = parseInt(req.query.limit as string) || 10;
const jobs = await getRecentJobs(storeId, Math.min(limit, 100));
res.json({ jobs });
} catch (error: any) {
console.error('Error fetching store jobs:', error);
res.status(500).json({ error: 'Failed to fetch store jobs' });
}
});
/**
* POST /api/schedule/jobs/:jobId/cancel
* Cancel a pending job
*/
router.post('/jobs/:jobId/cancel', requireRole('superadmin', 'admin'), async (req: Request, res: Response) => {
try {
const jobId = parseInt(req.params.jobId);
if (isNaN(jobId)) {
return res.status(400).json({ error: 'Invalid job ID' });
}
const cancelled = await cancelJob(jobId);
if (cancelled) {
res.json({ success: true, message: 'Job cancelled' });
} else {
res.status(400).json({ error: 'Job could not be cancelled (may not be pending)' });
}
} catch (error: any) {
console.error('Error cancelling job:', error);
res.status(500).json({ error: 'Failed to cancel job' });
}
});
// ============================================
// Manual Trigger Endpoints
// ============================================
/**
* POST /api/schedule/trigger/store/:storeId
* Manually trigger orchestrated crawl for a specific store
* Uses the intelligent orchestrator which:
* - Checks provider detection status
* - Runs detection if needed
* - Queues appropriate crawl type (production/sandbox)
*/
router.post('/trigger/store/:storeId', requireRole('superadmin', 'admin'), async (req: Request, res: Response) => {
try {
const storeId = parseInt(req.params.storeId);
if (isNaN(storeId)) {
return res.status(400).json({ error: 'Invalid store ID' });
}
// Use the orchestrator instead of simple triggerManualCrawl
const result = await runStoreCrawlOrchestrator(storeId);
res.json({
result,
message: result.summary,
success: result.status === 'success' || result.status === 'sandbox_only',
});
} catch (error: any) {
console.error('Error triggering orchestrated crawl:', error);
res.status(500).json({ error: 'Failed to trigger crawl' });
}
});
/**
* POST /api/schedule/trigger/store/:storeId/legacy
* Legacy: Simple job queue trigger (no orchestration)
*/
router.post('/trigger/store/:storeId/legacy', requireRole('superadmin', 'admin'), async (req: Request, res: Response) => {
try {
const storeId = parseInt(req.params.storeId);
if (isNaN(storeId)) {
return res.status(400).json({ error: 'Invalid store ID' });
}
const job = await triggerManualCrawl(storeId);
res.json({ job, message: 'Crawl job created' });
} catch (error: any) {
console.error('Error triggering manual crawl:', error);
res.status(500).json({ error: 'Failed to trigger crawl' });
}
});
/**
* POST /api/schedule/trigger/all
* Manually trigger crawls for all stores
*/
router.post('/trigger/all', requireRole('superadmin', 'admin'), async (req: Request, res: Response) => {
try {
const jobsCreated = await triggerAllStoresCrawl();
res.json({ jobs_created: jobsCreated, message: `Created ${jobsCreated} crawl jobs` });
} catch (error: any) {
console.error('Error triggering all crawls:', error);
res.status(500).json({ error: 'Failed to trigger crawls' });
}
});
/**
* POST /api/schedule/restart
* Restart the scheduler
*/
router.post('/restart', requireRole('superadmin', 'admin'), async (req: Request, res: Response) => {
try {
await restartCrawlScheduler();
res.json({ message: 'Scheduler restarted', mode: getSchedulerMode() });
} catch (error: any) {
console.error('Error restarting scheduler:', error);
res.status(500).json({ error: 'Failed to restart scheduler' });
}
});
// ============================================
// Scheduler Mode Endpoints
// ============================================
/**
* GET /api/schedule/mode
* Get current scheduler mode
*/
router.get('/mode', async (req: Request, res: Response) => {
try {
const mode = getSchedulerMode();
res.json({ mode });
} catch (error: any) {
console.error('Error getting scheduler mode:', error);
res.status(500).json({ error: 'Failed to get scheduler mode' });
}
});
/**
* PUT /api/schedule/mode
* Set scheduler mode (legacy or orchestrator)
*/
router.put('/mode', requireRole('superadmin', 'admin'), async (req: Request, res: Response) => {
try {
const { mode } = req.body;
if (mode !== 'legacy' && mode !== 'orchestrator') {
return res.status(400).json({ error: 'Invalid mode. Must be "legacy" or "orchestrator"' });
}
setSchedulerMode(mode);
// Restart scheduler with new mode
await restartCrawlScheduler();
res.json({ mode, message: `Scheduler mode set to ${mode} and restarted` });
} catch (error: any) {
console.error('Error setting scheduler mode:', error);
res.status(500).json({ error: 'Failed to set scheduler mode' });
}
});
/**
* GET /api/schedule/due
* Get stores that are due for orchestration
*/
router.get('/due', async (req: Request, res: Response) => {
try {
const limit = parseInt(req.query.limit as string) || 10;
const storeIds = await getStoresDueForOrchestration(Math.min(limit, 50));
res.json({ stores_due: storeIds, count: storeIds.length });
} catch (error: any) {
console.error('Error getting stores due for orchestration:', error);
res.status(500).json({ error: 'Failed to get stores due' });
}
});
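/**
 * Example (sketch): flipping the scheduler into orchestrator mode from a
 * client. The /api/schedule mount path is assumed from the route comments;
 * fetch is native on Node 18+ and in browsers. Illustration only — not wired
 * up anywhere.
 */
export async function exampleSetOrchestratorMode(baseUrl: string): Promise<void> {
  const res = await fetch(`${baseUrl}/api/schedule/mode`, {
    method: 'PUT',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ mode: 'orchestrator' }),
  });
  if (!res.ok) throw new Error(`Failed to set mode: ${res.status}`);
  console.log(await res.json()); // { mode: 'orchestrator', message: '...' }
}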
export default router;

View File

@@ -0,0 +1,345 @@
#!/usr/bin/env npx tsx
/**
* Backfill Store-Dispensary Mapping
*
* Links existing stores (scheduler) to dispensaries (master AZDHS directory)
* by matching on name, city, and zip code.
*
* Usage:
* npx tsx src/scripts/backfill-store-dispensary.ts # Preview matches
* npx tsx src/scripts/backfill-store-dispensary.ts --apply # Apply matches
* npx tsx src/scripts/backfill-store-dispensary.ts --verbose # Show all match details
*/
import { pool } from '../db/migrate';
import { logger } from '../services/logger';
const args = process.argv.slice(2);
const flags = {
apply: args.includes('--apply'),
verbose: args.includes('--verbose'),
help: args.includes('--help') || args.includes('-h'),
};
interface Store {
id: number;
name: string;
slug: string;
dispensary_id: number | null;
}
interface Dispensary {
id: number;
name: string;
company_name: string | null;
city: string;
address: string;
slug: string;
}
interface MatchResult {
store: Store;
dispensary: Dispensary | null;
matchType: 'exact_name' | 'normalized_name' | 'company_name' | 'slug' | 'fuzzy' | 'none';
score: number;
}
/**
* Normalize a store/dispensary name for comparison
* Removes common suffixes, punctuation, and extra whitespace
*/
function normalizeName(name: string): string {
return name
.toLowerCase()
.replace(/\s*[-–—]\s*/g, ' ') // Normalize dashes to spaces
.replace(/\s*\b(dispensary|cannabis|marijuana|weed|shop|store|llc|inc)\b\s*/gi, ' ') // Strip whole-word suffixes only (avoids mangling names like "Grasshopper")
.replace(/[\u2018\u2019]/g, "'") // Normalize curly apostrophes to straight
.replace(/[^\w\s']/g, '') // Remove other punctuation
.replace(/\s+/g, ' ') // Collapse whitespace
.trim();
}
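// Worked example (with the whole-word suffix stripping above):
//   normalizeName("Nature's Medicines Dispensary – Phoenix, LLC")
//   => "nature's medicines phoenix"
// (dash collapsed, "Dispensary"/"LLC" stripped, comma removed, whitespace collapsed)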
/**
* Simple Levenshtein distance for fuzzy matching
*/
function levenshteinDistance(a: string, b: string): number {
const matrix: number[][] = [];
for (let i = 0; i <= b.length; i++) {
matrix[i] = [i];
}
for (let j = 0; j <= a.length; j++) {
matrix[0][j] = j;
}
for (let i = 1; i <= b.length; i++) {
for (let j = 1; j <= a.length; j++) {
if (b.charAt(i - 1) === a.charAt(j - 1)) {
matrix[i][j] = matrix[i - 1][j - 1];
} else {
matrix[i][j] = Math.min(
matrix[i - 1][j - 1] + 1, // substitution
matrix[i][j - 1] + 1, // insertion
matrix[i - 1][j] + 1 // deletion
);
}
}
}
return matrix[b.length][a.length];
}
/**
* Calculate similarity score (0-100)
*/
function similarityScore(a: string, b: string): number {
const maxLen = Math.max(a.length, b.length);
if (maxLen === 0) return 100;
const distance = levenshteinDistance(a, b);
return Math.round((1 - distance / maxLen) * 100);
}
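// Worked example: similarityScore('sunday goods', 'sundaygoods')
//   maxLen = 12, levenshteinDistance = 1 (one missing space)
//   => Math.round((1 - 1/12) * 100) = 92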
/**
* Find the best dispensary match for a store
*/
function findBestMatch(store: Store, dispensaries: Dispensary[]): MatchResult {
const normalizedStoreName = normalizeName(store.name);
const storeSlug = store.slug.toLowerCase();
let bestMatch: MatchResult = {
store,
dispensary: null,
matchType: 'none',
score: 0,
};
for (const disp of dispensaries) {
const normalizedDispName = normalizeName(disp.name);
const normalizedCompanyName = disp.company_name ? normalizeName(disp.company_name) : '';
const dispSlug = disp.slug.toLowerCase();
// 1. Exact name match (case-insensitive)
if (store.name.toLowerCase() === disp.name.toLowerCase()) {
return {
store,
dispensary: disp,
matchType: 'exact_name',
score: 100,
};
}
// 2. Normalized name match
if (normalizedStoreName === normalizedDispName) {
return {
store,
dispensary: disp,
matchType: 'normalized_name',
score: 95,
};
}
// 3. Store name matches company name
if (normalizedCompanyName && normalizedStoreName === normalizedCompanyName) {
return {
store,
dispensary: disp,
matchType: 'company_name',
score: 90,
};
}
// 4. Slug match
if (storeSlug === dispSlug) {
return {
store,
dispensary: disp,
matchType: 'slug',
score: 85,
};
}
// 5. Fuzzy matching (only if score > 70)
const nameScore = similarityScore(normalizedStoreName, normalizedDispName);
const companyScore = normalizedCompanyName
? similarityScore(normalizedStoreName, normalizedCompanyName)
: 0;
const fuzzyScore = Math.max(nameScore, companyScore);
if (fuzzyScore > bestMatch.score && fuzzyScore >= 70) {
bestMatch = {
store,
dispensary: disp,
matchType: 'fuzzy',
score: fuzzyScore,
};
}
}
return bestMatch;
}
async function main() {
if (flags.help) {
console.log(`
Backfill Store-Dispensary Mapping
Links existing stores (scheduler) to dispensaries (master AZDHS directory)
by matching on name, company name, or slug similarity.
USAGE:
npx tsx src/scripts/backfill-store-dispensary.ts [OPTIONS]
OPTIONS:
--apply Apply the mappings to the database (default: preview only)
--verbose Show detailed match information for all stores
--help, -h Show this help message
EXAMPLES:
# Preview what would be matched
npx tsx src/scripts/backfill-store-dispensary.ts
# Apply the mappings
npx tsx src/scripts/backfill-store-dispensary.ts --apply
# Show verbose output
npx tsx src/scripts/backfill-store-dispensary.ts --verbose
`);
process.exit(0);
}
console.log('\n📦 Backfill Store-Dispensary Mapping');
console.log('=====================================\n');
try {
// Fetch all stores without a dispensary_id
const storesResult = await pool.query<Store>(`
SELECT id, name, slug, dispensary_id
FROM stores
WHERE dispensary_id IS NULL
ORDER BY name
`);
const unmappedStores = storesResult.rows;
// Fetch all already-mapped stores for context
const mappedResult = await pool.query<Store>(`
SELECT id, name, slug, dispensary_id
FROM stores
WHERE dispensary_id IS NOT NULL
ORDER BY name
`);
const mappedStores = mappedResult.rows;
// Fetch all dispensaries
const dispResult = await pool.query<Dispensary>(`
SELECT id, name, company_name, city, address, slug
FROM dispensaries
ORDER BY name
`);
const dispensaries = dispResult.rows;
console.log(`📊 Current Status:`);
console.log(` Stores without dispensary_id: ${unmappedStores.length}`);
console.log(` Stores already mapped: ${mappedStores.length}`);
console.log(` Total dispensaries: ${dispensaries.length}\n`);
if (unmappedStores.length === 0) {
console.log('✅ All stores are already mapped to dispensaries!\n');
await pool.end();
process.exit(0);
}
// Find matches for each unmapped store
const matches: MatchResult[] = [];
const noMatches: Store[] = [];
for (const store of unmappedStores) {
const match = findBestMatch(store, dispensaries);
if (match.dispensary) {
matches.push(match);
} else {
noMatches.push(store);
}
}
// Sort matches by score (highest first)
matches.sort((a, b) => b.score - a.score);
// Display results
console.log(`\n🔗 Matches Found: ${matches.length}`);
console.log('----------------------------------\n');
if (matches.length > 0) {
// Group by match type
const byType: Record<string, MatchResult[]> = {};
for (const m of matches) {
if (!byType[m.matchType]) byType[m.matchType] = [];
byType[m.matchType].push(m);
}
const typeLabels: Record<string, string> = {
exact_name: '✅ Exact Name Match',
normalized_name: '✅ Normalized Name Match',
company_name: '🏢 Company Name Match',
slug: '🔗 Slug Match',
fuzzy: '🔍 Fuzzy Match',
};
for (const [type, results] of Object.entries(byType)) {
console.log(`${typeLabels[type]} (${results.length}):`);
for (const r of results) {
const dispInfo = r.dispensary!;
console.log(` • "${r.store.name}" → "${dispInfo.name}" (${dispInfo.city}) [${r.score}%]`);
}
console.log('');
}
}
if (noMatches.length > 0) {
console.log(`\n❌ No Match Found: ${noMatches.length}`);
console.log('----------------------------------\n');
for (const store of noMatches) {
console.log(` • "${store.name}" (slug: ${store.slug})`);
}
console.log('');
}
// Apply if requested
if (flags.apply && matches.length > 0) {
console.log('\n🔧 Applying mappings...\n');
let updated = 0;
for (const match of matches) {
if (!match.dispensary) continue;
await pool.query(
'UPDATE stores SET dispensary_id = $1 WHERE id = $2',
[match.dispensary.id, match.store.id]
);
updated++;
if (flags.verbose) {
console.log(` ✓ Linked store ${match.store.id} to dispensary ${match.dispensary.id}`);
}
}
console.log(`\n✅ Updated ${updated} stores with dispensary mappings\n`);
logger.info('system', `Backfill complete: linked ${updated} stores to dispensaries`);
} else if (matches.length > 0 && !flags.apply) {
console.log('\n💡 Run with --apply to update the database\n');
}
// Summary
console.log('📈 Summary:');
console.log(` Would match: ${matches.length} stores`);
console.log(` No match: ${noMatches.length} stores`);
console.log(` Match rate: ${Math.round((matches.length / unmappedStores.length) * 100)}%\n`);
} catch (error) {
console.error('Error:', error);
process.exit(1);
} finally {
await pool.end();
}
}
main().catch(console.error);

View File

@@ -0,0 +1,424 @@
#!/usr/bin/env npx tsx
/**
* Queue Dispensaries Script
*
* Orchestrates the multi-provider crawler system:
* 1. Queue dispensaries that need provider detection
* 2. Queue Dutchie dispensaries for production crawl
* 3. Queue sandbox dispensaries for learning crawls
*
* Usage:
* npx tsx src/scripts/queue-dispensaries.ts [--detection] [--production] [--sandbox] [--all]
* npx tsx src/scripts/queue-dispensaries.ts --dry-run
* npx tsx src/scripts/queue-dispensaries.ts --process # Process queued jobs
*/
import { pool } from '../db/migrate';
import {
runDetectMenuProviderJob,
runSandboxCrawlJob,
} from '../services/crawler-jobs';
// Parse command line args
const args = process.argv.slice(2);
const flags = {
detection: args.includes('--detection') || args.includes('--all'),
production: args.includes('--production') || args.includes('--all'),
sandbox: args.includes('--sandbox') || args.includes('--all'),
dryRun: args.includes('--dry-run'),
process: args.includes('--process'),
help: args.includes('--help') || args.includes('-h'),
limit: parseInt(args.find(a => a.startsWith('--limit='))?.split('=')[1] || '10'),
};
// If no specific flags, default to all
if (!flags.detection && !flags.production && !flags.sandbox && !flags.process) {
flags.detection = true;
flags.production = true;
flags.sandbox = true;
}
async function showHelp() {
console.log(`
Queue Dispensaries - Multi-Provider Crawler Orchestration
USAGE:
npx tsx src/scripts/queue-dispensaries.ts [OPTIONS]
OPTIONS:
--detection Queue dispensaries that need provider detection
--production Queue Dutchie production crawls
--sandbox Queue sandbox/learning crawls
--all Queue all job types (default if no specific flag)
--process Process queued jobs instead of just queuing
--dry-run Show what would be queued without making changes
--limit=N Maximum dispensaries to queue per type (default: 10)
--help, -h Show this help message
EXAMPLES:
# Queue all dispensaries for appropriate jobs
npx tsx src/scripts/queue-dispensaries.ts
# Only queue detection jobs
npx tsx src/scripts/queue-dispensaries.ts --detection --limit=20
# Dry run to see what would be queued
npx tsx src/scripts/queue-dispensaries.ts --dry-run
# Process sandbox jobs
npx tsx src/scripts/queue-dispensaries.ts --process
`);
}
async function queueDetectionJobs(): Promise<number> {
console.log('\n📡 Queueing Detection Jobs...');
// Find dispensaries that need provider detection:
// - has a website or menu URL, AND
// - crawler_status is idle (not already queued/running), AND
// - menu_provider is null OR menu_provider_confidence < 70
const query = `
SELECT id, name, website, menu_url, menu_provider, menu_provider_confidence
FROM dispensaries
WHERE (website IS NOT NULL OR menu_url IS NOT NULL)
AND crawler_status = 'idle'
AND (menu_provider IS NULL OR menu_provider_confidence < 70)
ORDER BY
CASE WHEN menu_provider IS NULL THEN 0 ELSE 1 END,
menu_provider_confidence ASC
LIMIT $1
`;
const result = await pool.query(query, [flags.limit]);
if (flags.dryRun) {
console.log(` Would queue ${result.rows.length} dispensaries for detection:`);
for (const row of result.rows) {
console.log(` - [${row.id}] ${row.name} (current: ${row.menu_provider || 'unknown'}, confidence: ${row.menu_provider_confidence}%)`);
}
return result.rows.length;
}
let queued = 0;
for (const dispensary of result.rows) {
try {
// Update status to queued
await pool.query(
`UPDATE dispensaries SET crawler_status = 'queued_detection', updated_at = NOW() WHERE id = $1`,
[dispensary.id]
);
// Create sandbox job for detection
await pool.query(
`INSERT INTO sandbox_crawl_jobs (dispensary_id, job_type, status, priority)
VALUES ($1, 'detection', 'pending', 10)`,
[dispensary.id]
);
console.log(` ✓ Queued detection: [${dispensary.id}] ${dispensary.name}`);
queued++;
} catch (error: any) {
console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
}
}
return queued;
}
async function queueProductionCrawls(): Promise<number> {
console.log('\n🏭 Queueing Production Dutchie Crawls...');
// Find Dutchie dispensaries ready for production crawl:
// - menu_provider = 'dutchie'
// - crawler_mode = 'production'
// - crawler_status is idle
// - last_menu_scrape is old or null
const query = `
SELECT d.id, d.name, d.last_menu_scrape, d.menu_url
FROM dispensaries d
WHERE d.menu_provider = 'dutchie'
AND d.crawler_mode = 'production'
AND d.crawler_status = 'idle'
AND (d.last_menu_scrape IS NULL OR d.last_menu_scrape < NOW() - INTERVAL '4 hours')
ORDER BY
CASE WHEN d.last_menu_scrape IS NULL THEN 0 ELSE 1 END,
d.last_menu_scrape ASC
LIMIT $1
`;
const result = await pool.query(query, [flags.limit]);
if (flags.dryRun) {
console.log(` Would queue ${result.rows.length} Dutchie dispensaries for production crawl:`);
for (const row of result.rows) {
const lastScrape = row.last_menu_scrape ? new Date(row.last_menu_scrape).toISOString() : 'never';
console.log(` - [${row.id}] ${row.name} (last scrape: ${lastScrape})`);
}
return result.rows.length;
}
let queued = 0;
for (const dispensary of result.rows) {
try {
// Update status to queued
await pool.query(
`UPDATE dispensaries SET crawler_status = 'queued_crawl', updated_at = NOW() WHERE id = $1`,
[dispensary.id]
);
// Create crawl job in the main crawl_jobs table (production queue)
await pool.query(
`INSERT INTO crawl_jobs (store_id, job_type, trigger_type, status, priority, metadata)
SELECT s.id, 'full_crawl', 'scheduled', 'pending', 50,
jsonb_build_object('dispensary_id', $1, 'source', 'queue-dispensaries')
FROM stores s
JOIN dispensaries d ON (d.menu_url = s.dutchie_plus_url OR d.name ILIKE '%' || s.name || '%')
WHERE d.id = $1
LIMIT 1`,
[dispensary.id]
);
console.log(` ✓ Queued production crawl: [${dispensary.id}] ${dispensary.name}`);
queued++;
} catch (error: any) {
console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
}
}
return queued;
}
async function queueSandboxCrawls(): Promise<number> {
console.log('\n🧪 Queueing Sandbox Crawls...');
// Find sandbox dispensaries needing crawls:
// - crawler_mode = 'sandbox'
// - crawler_status in (idle, error_needs_review)
// - No recent sandbox job
const query = `
SELECT d.id, d.name, d.menu_provider, d.crawler_status, d.website
FROM dispensaries d
WHERE d.crawler_mode = 'sandbox'
AND d.crawler_status IN ('idle', 'error_needs_review')
AND (d.website IS NOT NULL OR d.menu_url IS NOT NULL)
AND NOT EXISTS (
SELECT 1 FROM sandbox_crawl_jobs sj
WHERE sj.dispensary_id = d.id
AND sj.status IN ('pending', 'running')
)
ORDER BY d.updated_at ASC
LIMIT $1
`;
const result = await pool.query(query, [flags.limit]);
if (flags.dryRun) {
console.log(` Would queue ${result.rows.length} dispensaries for sandbox crawl:`);
for (const row of result.rows) {
console.log(` - [${row.id}] ${row.name} (provider: ${row.menu_provider || 'unknown'}, status: ${row.crawler_status})`);
}
return result.rows.length;
}
let queued = 0;
for (const dispensary of result.rows) {
try {
// Update status
await pool.query(
`UPDATE dispensaries SET crawler_status = 'queued_crawl', updated_at = NOW() WHERE id = $1`,
[dispensary.id]
);
// Create sandbox job
await pool.query(
`INSERT INTO sandbox_crawl_jobs (dispensary_id, job_type, status, priority)
VALUES ($1, 'deep_crawl', 'pending', 5)`,
[dispensary.id]
);
console.log(` ✓ Queued sandbox crawl: [${dispensary.id}] ${dispensary.name}`);
queued++;
} catch (error: any) {
console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
}
}
return queued;
}
async function processJobs(): Promise<void> {
console.log('\n⚙ Processing Queued Jobs...\n');
// Process sandbox jobs (detection + sandbox crawls)
const sandboxJobs = await pool.query(
`SELECT * FROM sandbox_crawl_jobs
WHERE status = 'pending'
ORDER BY priority DESC, scheduled_at ASC
LIMIT $1`,
[flags.limit]
);
console.log(`Found ${sandboxJobs.rows.length} pending sandbox jobs\n`);
for (const job of sandboxJobs.rows) {
console.log(`Processing job ${job.id} (${job.job_type}) for dispensary ${job.dispensary_id}...`);
try {
// Mark as running
await pool.query(
`UPDATE sandbox_crawl_jobs SET status = 'running', started_at = NOW() WHERE id = $1`,
[job.id]
);
let result;
if (job.job_type === 'detection') {
result = await runDetectMenuProviderJob(job.dispensary_id);
} else {
result = await runSandboxCrawlJob(job.dispensary_id, job.sandbox_id);
}
// Update job status
await pool.query(
`UPDATE sandbox_crawl_jobs
SET status = $1, completed_at = NOW(), result_summary = $2, error_message = $3
WHERE id = $4`,
[
result.success ? 'completed' : 'failed',
JSON.stringify(result.data || {}),
result.success ? null : result.message,
job.id,
]
);
console.log(` ${result.success ? '✓' : '✗'} ${result.message}\n`);
} catch (error: any) {
await pool.query(
`UPDATE sandbox_crawl_jobs SET status = 'failed', error_message = $1 WHERE id = $2`,
[error.message, job.id]
);
console.log(` ✗ Error: ${error.message}\n`);
}
}
}
async function showStats(): Promise<void> {
console.log('\n📊 Current Stats:');
// Dispensary stats
const stats = await pool.query(`
SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE menu_provider IS NULL) as no_provider,
COUNT(*) FILTER (WHERE menu_provider = 'dutchie') as dutchie,
COUNT(*) FILTER (WHERE menu_provider NOT IN ('dutchie', 'unknown') AND menu_provider IS NOT NULL) as other_providers,
COUNT(*) FILTER (WHERE menu_provider = 'unknown') as unknown,
COUNT(*) FILTER (WHERE crawler_mode = 'production') as production_mode,
COUNT(*) FILTER (WHERE crawler_mode = 'sandbox') as sandbox_mode,
COUNT(*) FILTER (WHERE crawler_status = 'idle') as idle,
COUNT(*) FILTER (WHERE crawler_status LIKE 'queued%') as queued,
COUNT(*) FILTER (WHERE crawler_status = 'running') as running,
COUNT(*) FILTER (WHERE crawler_status = 'ok') as ok,
COUNT(*) FILTER (WHERE crawler_status = 'error_needs_review') as needs_review
FROM dispensaries
`);
const s = stats.rows[0];
console.log(`
Dispensaries: ${s.total}
- No provider detected: ${s.no_provider}
- Dutchie: ${s.dutchie}
- Other providers: ${s.other_providers}
- Unknown: ${s.unknown}
Crawler Mode:
- Production: ${s.production_mode}
- Sandbox: ${s.sandbox_mode}
Status:
- Idle: ${s.idle}
- Queued: ${s.queued}
- Running: ${s.running}
- OK: ${s.ok}
- Needs Review: ${s.needs_review}
`);
// Job stats
const jobStats = await pool.query(`
SELECT
COUNT(*) FILTER (WHERE status = 'pending') as pending,
COUNT(*) FILTER (WHERE status = 'running') as running,
COUNT(*) FILTER (WHERE status = 'completed') as completed,
COUNT(*) FILTER (WHERE status = 'failed') as failed
FROM sandbox_crawl_jobs
`);
const j = jobStats.rows[0];
console.log(` Sandbox Jobs:
- Pending: ${j.pending}
- Running: ${j.running}
- Completed: ${j.completed}
- Failed: ${j.failed}
`);
}
async function main() {
if (flags.help) {
await showHelp();
process.exit(0);
}
console.log('═══════════════════════════════════════════════════════');
console.log(' Multi-Provider Crawler Queue Manager');
console.log('═══════════════════════════════════════════════════════');
if (flags.dryRun) {
console.log('\n🔍 DRY RUN MODE - No changes will be made\n');
}
try {
// Show current stats first
await showStats();
if (flags.process) {
// Process mode - run jobs instead of queuing
await processJobs();
} else {
// Queuing mode
let totalQueued = 0;
if (flags.detection) {
totalQueued += await queueDetectionJobs();
}
if (flags.production) {
totalQueued += await queueProductionCrawls();
}
if (flags.sandbox) {
totalQueued += await queueSandboxCrawls();
}
console.log('\n═══════════════════════════════════════════════════════');
console.log(` Total dispensaries queued: ${totalQueued}`);
console.log('═══════════════════════════════════════════════════════\n');
}
// Show updated stats
if (!flags.dryRun) {
await showStats();
}
} catch (error) {
console.error('Fatal error:', error);
process.exit(1);
} finally {
await pool.end();
}
}
main();

View File

@@ -0,0 +1,583 @@
#!/usr/bin/env npx tsx
/**
* Queue Intelligence Script
*
* Orchestrates the multi-category intelligence crawler system:
* 1. Queue dispensaries that need provider detection (all 4 categories)
* 2. Queue per-category production crawls (Dutchie products only for now)
* 3. Queue per-category sandbox crawls (all providers)
*
* Each category (product, specials, brand, metadata) is handled independently.
* A failure in one category does NOT affect other categories.
*
* Usage:
* npx tsx src/scripts/queue-intelligence.ts [--detection] [--production] [--sandbox] [--all]
* npx tsx src/scripts/queue-intelligence.ts --category=product --sandbox
* npx tsx src/scripts/queue-intelligence.ts --process --category=product
* npx tsx src/scripts/queue-intelligence.ts --dry-run
*/
import { pool } from '../db/migrate';
import {
detectMultiCategoryProviders,
updateAllCategoryProviders,
IntelligenceCategory,
} from '../services/intelligence-detector';
import {
runCrawlProductsJob,
runAllCategoryProductionCrawls,
runAllCategorySandboxCrawls,
processCategorySandboxJobs,
} from '../services/category-crawler-jobs';
// Parse command line args
const args = process.argv.slice(2);
const flags = {
detection: args.includes('--detection') || args.includes('--all'),
production: args.includes('--production') || args.includes('--all'),
sandbox: args.includes('--sandbox') || args.includes('--all'),
dryRun: args.includes('--dry-run'),
process: args.includes('--process'),
help: args.includes('--help') || args.includes('-h'),
limit: parseInt(args.find(a => a.startsWith('--limit='))?.split('=')[1] || '10'),
category: args.find(a => a.startsWith('--category='))?.split('=')[1] as IntelligenceCategory | undefined,
dispensary: parseInt(args.find(a => a.startsWith('--dispensary='))?.split('=')[1] || '0'),
};
// If no specific flags, default to all
if (!flags.detection && !flags.production && !flags.sandbox && !flags.process) {
flags.detection = true;
flags.production = true;
flags.sandbox = true;
}
const CATEGORIES: IntelligenceCategory[] = ['product', 'specials', 'brand', 'metadata'];
async function showHelp() {
console.log(`
Queue Intelligence - Multi-Category Crawler Orchestration
USAGE:
npx tsx src/scripts/queue-intelligence.ts [OPTIONS]
OPTIONS:
--detection Queue dispensaries that need multi-category detection
--production Queue per-category production crawls
--sandbox Queue per-category sandbox crawls
--all Queue all job types (default if no specific flag)
--process Process queued jobs instead of just queuing
--category=CATEGORY Filter to specific category (product|specials|brand|metadata)
--dispensary=ID Process only a specific dispensary
--dry-run Show what would be queued without making changes
--limit=N Maximum dispensaries to queue per type (default: 10)
--help, -h Show this help message
CATEGORIES:
product - Product/menu data (Dutchie=production, others=sandbox)
specials - Deals and specials (all sandbox for now)
brand - Brand intelligence (all sandbox for now)
metadata - Categories/taxonomy (all sandbox for now)
EXAMPLES:
# Queue all dispensaries for appropriate jobs
npx tsx src/scripts/queue-intelligence.ts
# Only queue product detection jobs
npx tsx src/scripts/queue-intelligence.ts --detection --category=product
# Process sandbox jobs for specials category
npx tsx src/scripts/queue-intelligence.ts --process --category=specials --limit=5
# Run full detection for a specific dispensary
npx tsx src/scripts/queue-intelligence.ts --process --detection --dispensary=123
# Dry run to see what would be queued
npx tsx src/scripts/queue-intelligence.ts --dry-run
`);
}
async function queueMultiCategoryDetection(): Promise<number> {
console.log('\n📡 Queueing Multi-Category Detection Jobs...');
// Find dispensaries that need provider detection for any category:
// - has a website or menu URL, AND
// - any *_provider is null OR its *_confidence < 70
const query = `
SELECT id, name, website, menu_url,
product_provider, product_confidence, product_crawler_mode,
specials_provider, specials_confidence, specials_crawler_mode,
brand_provider, brand_confidence, brand_crawler_mode,
metadata_provider, metadata_confidence, metadata_crawler_mode
FROM dispensaries
WHERE (website IS NOT NULL OR menu_url IS NOT NULL)
AND (
product_provider IS NULL OR product_confidence < 70 OR
specials_provider IS NULL OR specials_confidence < 70 OR
brand_provider IS NULL OR brand_confidence < 70 OR
metadata_provider IS NULL OR metadata_confidence < 70
)
ORDER BY
CASE WHEN product_provider IS NULL THEN 0 ELSE 1 END,
product_confidence ASC
LIMIT $1
`;
const result = await pool.query(query, [flags.limit]);
if (flags.dryRun) {
console.log(` Would queue ${result.rows.length} dispensaries for multi-category detection:`);
for (const row of result.rows) {
const needsDetection: string[] = [];
if (!row.product_provider || row.product_confidence < 70) needsDetection.push('product');
if (!row.specials_provider || row.specials_confidence < 70) needsDetection.push('specials');
if (!row.brand_provider || row.brand_confidence < 70) needsDetection.push('brand');
if (!row.metadata_provider || row.metadata_confidence < 70) needsDetection.push('metadata');
console.log(` - [${row.id}] ${row.name} (needs: ${needsDetection.join(', ')})`);
}
return result.rows.length;
}
let queued = 0;
for (const dispensary of result.rows) {
try {
// Create detection jobs for each category that needs it
for (const category of CATEGORIES) {
const provider = dispensary[`${category}_provider`];
const confidence = dispensary[`${category}_confidence`];
if (!provider || confidence < 70) {
await pool.query(
`INSERT INTO sandbox_crawl_jobs (dispensary_id, category, job_type, status, priority)
VALUES ($1, $2, 'detection', 'pending', 10)
ON CONFLICT DO NOTHING`,
[dispensary.id, category]
);
}
}
console.log(` ✓ Queued detection: [${dispensary.id}] ${dispensary.name}`);
queued++;
} catch (error: any) {
console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
}
}
return queued;
}
async function queueCategoryProductionCrawls(category?: IntelligenceCategory): Promise<number> {
const categories = category ? [category] : CATEGORIES;
let totalQueued = 0;
for (const cat of categories) {
console.log(`\n🏭 Queueing Production ${cat.toUpperCase()} Crawls...`);
// For now, only products have production-ready crawlers (Dutchie only)
if (cat !== 'product') {
console.log(` ⏭️ No production crawler for ${cat} yet - skipping`);
continue;
}
// Find dispensaries ready for production crawl
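// Note: `cat` is interpolated directly into column names below. This is safe
// only because cat comes from the fixed CATEGORIES list above, never from
// user input.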
const query = `
SELECT id, name, ${cat}_provider as provider, last_${cat}_scan_at as last_scan
FROM dispensaries
WHERE ${cat}_provider = 'dutchie'
AND ${cat}_crawler_mode = 'production'
AND ${cat}_confidence >= 70
AND (last_${cat}_scan_at IS NULL OR last_${cat}_scan_at < NOW() - INTERVAL '4 hours')
ORDER BY
CASE WHEN last_${cat}_scan_at IS NULL THEN 0 ELSE 1 END,
last_${cat}_scan_at ASC
LIMIT $1
`;
const result = await pool.query(query, [flags.limit]);
if (flags.dryRun) {
console.log(` Would queue ${result.rows.length} dispensaries for ${cat} production crawl:`);
for (const row of result.rows) {
const lastScan = row.last_scan ? new Date(row.last_scan).toISOString() : 'never';
console.log(` - [${row.id}] ${row.name} (provider: ${row.provider}, last: ${lastScan})`);
}
totalQueued += result.rows.length;
continue;
}
for (const dispensary of result.rows) {
try {
// For products, use the existing crawl_jobs table for production
await pool.query(
`INSERT INTO crawl_jobs (store_id, job_type, trigger_type, status, priority, metadata)
SELECT s.id, 'full_crawl', 'scheduled', 'pending', 50,
jsonb_build_object('dispensary_id', $1, 'category', $2, 'source', 'queue-intelligence')
FROM stores s
JOIN dispensaries d ON (d.menu_url = s.dutchie_plus_url OR d.name ILIKE '%' || s.name || '%')
WHERE d.id = $1
LIMIT 1`,
[dispensary.id, cat]
);
console.log(` ✓ Queued ${cat} production: [${dispensary.id}] ${dispensary.name}`);
totalQueued++;
} catch (error: any) {
console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
}
}
}
return totalQueued;
}
async function queueCategorySandboxCrawls(category?: IntelligenceCategory): Promise<number> {
const categories = category ? [category] : CATEGORIES;
let totalQueued = 0;
for (const cat of categories) {
console.log(`\n🧪 Queueing Sandbox ${cat.toUpperCase()} Crawls...`);
// Find dispensaries in sandbox mode for this category
const query = `
SELECT d.id, d.name, d.${cat}_provider as provider, d.${cat}_confidence as confidence,
d.website, d.menu_url
FROM dispensaries d
WHERE d.${cat}_crawler_mode = 'sandbox'
AND d.${cat}_provider IS NOT NULL
AND (d.website IS NOT NULL OR d.menu_url IS NOT NULL)
AND NOT EXISTS (
SELECT 1 FROM sandbox_crawl_jobs sj
WHERE sj.dispensary_id = d.id
AND sj.category = $1
AND sj.status IN ('pending', 'running')
)
ORDER BY d.${cat}_confidence DESC, d.updated_at ASC
LIMIT $2
`;
const result = await pool.query(query, [cat, flags.limit]);
if (flags.dryRun) {
console.log(` Would queue ${result.rows.length} dispensaries for ${cat} sandbox crawl:`);
for (const row of result.rows) {
console.log(` - [${row.id}] ${row.name} (provider: ${row.provider}, confidence: ${row.confidence}%)`);
}
totalQueued += result.rows.length;
continue;
}
for (const dispensary of result.rows) {
try {
// Create sandbox entry if needed
const sandboxResult = await pool.query(
`INSERT INTO crawler_sandboxes (dispensary_id, category, suspected_menu_provider, mode, status)
VALUES ($1, $2, $3, 'template_learning', 'pending')
ON CONFLICT (dispensary_id, category) WHERE status NOT IN ('moved_to_production', 'failed')
DO UPDATE SET updated_at = NOW()
RETURNING id`,
[dispensary.id, cat, dispensary.provider]
);
const sandboxId = sandboxResult.rows[0]?.id;
// Create sandbox job
await pool.query(
`INSERT INTO sandbox_crawl_jobs (dispensary_id, sandbox_id, category, job_type, status, priority)
VALUES ($1, $2, $3, 'crawl', 'pending', 5)`,
[dispensary.id, sandboxId, cat]
);
console.log(` ✓ Queued ${cat} sandbox: [${dispensary.id}] ${dispensary.name} (${dispensary.provider})`);
totalQueued++;
} catch (error: any) {
console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
}
}
}
return totalQueued;
}
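// Note (assumption): the ON CONFLICT (dispensary_id, category) ... WHERE
// clause above relies on a matching partial unique index on crawler_sandboxes
// with the same status predicate; without that index, Postgres rejects the
// upsert at runtime.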
async function processDetectionJobs(): Promise<void> {
console.log('\n🔍 Processing Detection Jobs...');
// Get pending detection jobs
const jobs = await pool.query(
`SELECT DISTINCT dispensary_id
FROM sandbox_crawl_jobs
WHERE job_type = 'detection' AND status = 'pending'
${flags.category ? `AND category = $2` : ''}
${flags.dispensary ? `AND dispensary_id = $${flags.category ? '3' : '2'}` : ''}
LIMIT $1`,
flags.category
? (flags.dispensary ? [flags.limit, flags.category, flags.dispensary] : [flags.limit, flags.category])
: (flags.dispensary ? [flags.limit, flags.dispensary] : [flags.limit])
);
for (const job of jobs.rows) {
console.log(`\nProcessing detection for dispensary ${job.dispensary_id}...`);
try {
// Get dispensary info
const dispResult = await pool.query(
'SELECT id, name, website, menu_url FROM dispensaries WHERE id = $1',
[job.dispensary_id]
);
const dispensary = dispResult.rows[0];
if (!dispensary) {
console.log(` ✗ Dispensary not found`);
continue;
}
const websiteUrl = dispensary.website || dispensary.menu_url;
if (!websiteUrl) {
console.log(` ✗ No website URL`);
continue;
}
// Mark jobs as running
await pool.query(
`UPDATE sandbox_crawl_jobs SET status = 'running', started_at = NOW()
WHERE dispensary_id = $1 AND job_type = 'detection' AND status = 'pending'`,
[job.dispensary_id]
);
// Run multi-category detection
console.log(` Detecting providers for ${dispensary.name}...`);
const detection = await detectMultiCategoryProviders(websiteUrl, { timeout: 45000 });
// Update all categories
await updateAllCategoryProviders(job.dispensary_id, detection);
// Mark jobs as completed
await pool.query(
`UPDATE sandbox_crawl_jobs SET status = 'completed', completed_at = NOW(),
result_summary = $1
WHERE dispensary_id = $2 AND job_type = 'detection' AND status = 'running'`,
[JSON.stringify({
product: { provider: detection.product.provider, confidence: detection.product.confidence },
specials: { provider: detection.specials.provider, confidence: detection.specials.confidence },
brand: { provider: detection.brand.provider, confidence: detection.brand.confidence },
metadata: { provider: detection.metadata.provider, confidence: detection.metadata.confidence },
}), job.dispensary_id]
);
console.log(` ✓ Detection complete:`);
console.log(` Product: ${detection.product.provider} (${detection.product.confidence}%) -> ${detection.product.mode}`);
console.log(` Specials: ${detection.specials.provider} (${detection.specials.confidence}%) -> ${detection.specials.mode}`);
console.log(` Brand: ${detection.brand.provider} (${detection.brand.confidence}%) -> ${detection.brand.mode}`);
console.log(` Metadata: ${detection.metadata.provider} (${detection.metadata.confidence}%) -> ${detection.metadata.mode}`);
} catch (error: any) {
console.log(` ✗ Error: ${error.message}`);
await pool.query(
`UPDATE sandbox_crawl_jobs SET status = 'failed', error_message = $1
WHERE dispensary_id = $2 AND job_type = 'detection' AND status = 'running'`,
[error.message, job.dispensary_id]
);
}
}
}
async function processCrawlJobs(): Promise<void> {
const categories = flags.category ? [flags.category] : CATEGORIES;
for (const cat of categories) {
console.log(`\n⚙ Processing ${cat.toUpperCase()} Crawl Jobs...\n`);
// Process sandbox jobs for this category
if (flags.sandbox || !flags.production) {
await processCategorySandboxJobs(cat, flags.limit);
}
// Process production jobs for this category
if (flags.production && cat === 'product') {
// Get pending production crawls
const prodJobs = await pool.query(
`SELECT d.id
FROM dispensaries d
WHERE d.product_provider = 'dutchie'
AND d.product_crawler_mode = 'production'
AND d.product_confidence >= 70
${flags.dispensary ? 'AND d.id = $2' : ''}
LIMIT $1`,
flags.dispensary ? [flags.limit, flags.dispensary] : [flags.limit]
);
for (const job of prodJobs.rows) {
console.log(`Processing production ${cat} crawl for dispensary ${job.id}...`);
const result = await runCrawlProductsJob(job.id);
console.log(` ${result.success ? '✓' : '✗'} ${result.message}`);
}
}
}
}
async function processSpecificDispensary(): Promise<void> {
if (!flags.dispensary) return;
console.log(`\n🎯 Processing Dispensary ${flags.dispensary}...\n`);
const dispResult = await pool.query(
'SELECT * FROM dispensaries WHERE id = $1',
[flags.dispensary]
);
if (dispResult.rows.length === 0) {
console.log('Dispensary not found');
return;
}
const dispensary = dispResult.rows[0];
console.log(`Name: ${dispensary.name}`);
console.log(`Website: ${dispensary.website || dispensary.menu_url || 'none'}`);
console.log('');
if (flags.detection) {
console.log('Running multi-category detection...');
const websiteUrl = dispensary.website || dispensary.menu_url;
if (websiteUrl) {
const detection = await detectMultiCategoryProviders(websiteUrl);
await updateAllCategoryProviders(flags.dispensary, detection);
console.log('Detection results:');
console.log(` Product: ${detection.product.provider} (${detection.product.confidence}%) -> ${detection.product.mode}`);
console.log(` Specials: ${detection.specials.provider} (${detection.specials.confidence}%) -> ${detection.specials.mode}`);
console.log(` Brand: ${detection.brand.provider} (${detection.brand.confidence}%) -> ${detection.brand.mode}`);
console.log(` Metadata: ${detection.metadata.provider} (${detection.metadata.confidence}%) -> ${detection.metadata.mode}`);
}
}
if (flags.production) {
console.log('\nRunning production crawls...');
const results = await runAllCategoryProductionCrawls(flags.dispensary);
console.log(` ${results.summary}`);
}
if (flags.sandbox) {
console.log('\nRunning sandbox crawls...');
const results = await runAllCategorySandboxCrawls(flags.dispensary);
console.log(` ${results.summary}`);
}
}
async function showStats(): Promise<void> {
console.log('\n📊 Multi-Category Intelligence Stats:');
// Per-category stats
for (const cat of CATEGORIES) {
const stats = await pool.query(`
SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE ${cat}_provider IS NULL) as no_provider,
COUNT(*) FILTER (WHERE ${cat}_provider = 'dutchie') as dutchie,
COUNT(*) FILTER (WHERE ${cat}_provider = 'treez') as treez,
COUNT(*) FILTER (WHERE ${cat}_provider NOT IN ('dutchie', 'treez', 'unknown') AND ${cat}_provider IS NOT NULL) as other,
COUNT(*) FILTER (WHERE ${cat}_provider = 'unknown') as unknown,
COUNT(*) FILTER (WHERE ${cat}_crawler_mode = 'production') as production,
COUNT(*) FILTER (WHERE ${cat}_crawler_mode = 'sandbox') as sandbox,
AVG(${cat}_confidence) as avg_confidence
FROM dispensaries
`);
const s = stats.rows[0];
console.log(`
${cat.toUpperCase()}:
Providers: Dutchie=${s.dutchie}, Treez=${s.treez}, Other=${s.other}, Unknown=${s.unknown}, None=${s.no_provider}
Modes: Production=${s.production}, Sandbox=${s.sandbox}
Avg Confidence: ${Math.round(s.avg_confidence || 0)}%`);
}
// Job stats per category
console.log('\n Sandbox Jobs by Category:');
const jobStats = await pool.query(`
SELECT
category,
COUNT(*) FILTER (WHERE status = 'pending') as pending,
COUNT(*) FILTER (WHERE status = 'running') as running,
COUNT(*) FILTER (WHERE status = 'completed') as completed,
COUNT(*) FILTER (WHERE status = 'failed') as failed
FROM sandbox_crawl_jobs
GROUP BY category
ORDER BY category
`);
for (const row of jobStats.rows) {
console.log(` ${row.category}: pending=${row.pending}, running=${row.running}, completed=${row.completed}, failed=${row.failed}`);
}
}
async function main() {
if (flags.help) {
await showHelp();
process.exit(0);
}
console.log('═══════════════════════════════════════════════════════');
console.log(' Multi-Category Intelligence Queue Manager');
console.log('═══════════════════════════════════════════════════════');
if (flags.dryRun) {
console.log('\n🔍 DRY RUN MODE - No changes will be made\n');
}
if (flags.category) {
console.log(`\n📌 Filtering to category: ${flags.category}\n`);
}
try {
// Show current stats first
await showStats();
// If specific dispensary specified, process it directly
if (flags.dispensary && flags.process) {
await processSpecificDispensary();
} else if (flags.process) {
// Process mode - run jobs
if (flags.detection) {
await processDetectionJobs();
}
await processCrawlJobs();
} else {
// Queuing mode
let totalQueued = 0;
if (flags.detection) {
totalQueued += await queueMultiCategoryDetection();
}
if (flags.production) {
totalQueued += await queueCategoryProductionCrawls(flags.category);
}
if (flags.sandbox) {
totalQueued += await queueCategorySandboxCrawls(flags.category);
}
console.log('\n═══════════════════════════════════════════════════════');
console.log(` Total queued: ${totalQueued}`);
console.log('═══════════════════════════════════════════════════════\n');
}
// Show updated stats
if (!flags.dryRun) {
await showStats();
}
} catch (error) {
console.error('Fatal error:', error);
process.exit(1);
} finally {
await pool.end();
}
}
main();

File diff suppressed because it is too large

View File

@@ -0,0 +1,651 @@
/**
* Crawl Scheduler Service
*
* This service manages crawl scheduling using a job queue approach.
* It does NOT modify the crawler - it only TRIGGERS the existing crawler.
*
* Features:
* - Global schedule: crawl all stores every N hours
* - Daily special run: 12:01 AM local store time
* - Per-store schedule overrides
* - Job queue for tracking pending/running crawls
*/
import cron from 'node-cron';
import { pool } from '../db/migrate';
import { scrapeStore } from '../scraper-v2';
import {
runStoreCrawlOrchestrator,
getStoresDueForOrchestration,
} from './store-crawl-orchestrator';
// Worker identification
const WORKER_ID = `worker-${process.pid}-${Date.now()}`;
let schedulerCronJob: cron.ScheduledTask | null = null;
let jobProcessorRunning = false;
let orchestratorProcessorRunning = false;
// Scheduler mode: 'legacy' uses job queue, 'orchestrator' uses intelligent orchestration
let schedulerMode: 'legacy' | 'orchestrator' = 'orchestrator';
// ============================================
// Types
// ============================================
interface GlobalSchedule {
id: number;
schedule_type: string;
enabled: boolean;
interval_hours: number | null;
run_time: string | null;
}
interface StoreScheduleStatus {
store_id: number;
store_name: string;
store_slug: string;
timezone: string;
active: boolean;
scrape_enabled: boolean;
last_scraped_at: Date | null;
schedule_enabled: boolean;
interval_hours: number;
daily_special_enabled: boolean;
daily_special_time: string;
priority: number;
next_scheduled_run: Date;
latest_job_id: number | null;
latest_job_status: string | null;
}
interface CrawlJob {
id: number;
store_id: number;
job_type: string;
trigger_type: string;
status: string;
priority: number;
scheduled_at: Date;
started_at: Date | null;
completed_at: Date | null;
products_found: number | null;
error_message: string | null;
}
// ============================================
// Schedule Management
// ============================================
/**
* Get global schedule settings
*/
export async function getGlobalSchedule(): Promise<GlobalSchedule[]> {
const result = await pool.query(`
SELECT * FROM crawler_schedule ORDER BY id
`);
return result.rows;
}
/**
* Update global schedule setting
*/
export async function updateGlobalSchedule(
scheduleType: string,
updates: { enabled?: boolean; interval_hours?: number; run_time?: string }
): Promise<GlobalSchedule> {
const setClauses: string[] = [];
const values: any[] = [];
let paramIndex = 1;
if (updates.enabled !== undefined) {
setClauses.push(`enabled = $${paramIndex++}`);
values.push(updates.enabled);
}
if (updates.interval_hours !== undefined) {
setClauses.push(`interval_hours = $${paramIndex++}`);
values.push(updates.interval_hours);
}
if (updates.run_time !== undefined) {
setClauses.push(`run_time = $${paramIndex++}`);
values.push(updates.run_time);
}
values.push(scheduleType);
const result = await pool.query(`
UPDATE crawler_schedule
SET ${setClauses.join(', ')}
WHERE schedule_type = $${paramIndex}
RETURNING *
`, values);
return result.rows[0];
}
/**
* Get all store schedule statuses
*/
export async function getStoreScheduleStatuses(): Promise<StoreScheduleStatus[]> {
const result = await pool.query(`SELECT * FROM crawl_schedule_status ORDER BY priority DESC, store_name`);
return result.rows;
}
/**
* Get or create per-store schedule override
*/
export async function getStoreSchedule(storeId: number): Promise<any> {
const result = await pool.query(`
SELECT * FROM store_crawl_schedule WHERE store_id = $1
`, [storeId]);
if (result.rows.length > 0) {
return result.rows[0];
}
// Return default (use global)
return {
store_id: storeId,
enabled: true,
interval_hours: null,
daily_special_enabled: true,
daily_special_time: null,
priority: 0
};
}
/**
* Update per-store schedule override
*/
export async function updateStoreSchedule(
storeId: number,
updates: {
enabled?: boolean;
interval_hours?: number | null;
daily_special_enabled?: boolean;
daily_special_time?: string | null;
priority?: number;
}
): Promise<any> {
const result = await pool.query(`
INSERT INTO store_crawl_schedule (store_id, enabled, interval_hours, daily_special_enabled, daily_special_time, priority)
VALUES ($1, $2, $3, $4, $5, $6)
ON CONFLICT (store_id) DO UPDATE SET
enabled = COALESCE(EXCLUDED.enabled, store_crawl_schedule.enabled),
interval_hours = EXCLUDED.interval_hours,
daily_special_enabled = COALESCE(EXCLUDED.daily_special_enabled, store_crawl_schedule.daily_special_enabled),
daily_special_time = EXCLUDED.daily_special_time,
priority = COALESCE(EXCLUDED.priority, store_crawl_schedule.priority),
updated_at = NOW()
RETURNING *
`, [
storeId,
updates.enabled ?? true,
updates.interval_hours ?? null,
updates.daily_special_enabled ?? true,
updates.daily_special_time ?? null,
updates.priority ?? 0
]);
return result.rows[0];
}
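// Example (sketch): pin store 42 to a 2-hour interval at elevated priority,
// leaving daily-special settings on their defaults (NULL time = store-local
// 00:01 via the global schedule):
//
//   await updateStoreSchedule(42, { interval_hours: 2, priority: 10 });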
// ============================================
// Job Queue Management
// ============================================
/**
* Create a new crawl job
*/
export async function createCrawlJob(
storeId: number,
jobType: 'full_crawl' | 'specials_only' | 'category' = 'full_crawl',
triggerType: 'scheduled' | 'manual' | 'daily_special' = 'scheduled',
scheduledAt: Date = new Date(),
priority: number = 0
): Promise<CrawlJob> {
// Check if there's already a pending or running job for this store;
// select the full row so callers always receive a complete CrawlJob
const existing = await pool.query(`
SELECT * FROM crawl_jobs
WHERE store_id = $1 AND status IN ('pending', 'running')
LIMIT 1
`, [storeId]);
if (existing.rows.length > 0) {
console.log(`Skipping job creation for store ${storeId} - already has pending/running job`);
return existing.rows[0];
}
const result = await pool.query(`
INSERT INTO crawl_jobs (store_id, job_type, trigger_type, scheduled_at, priority, status)
VALUES ($1, $2, $3, $4, $5, 'pending')
RETURNING *
`, [storeId, jobType, triggerType, scheduledAt, priority]);
console.log(`Created crawl job ${result.rows[0].id} for store ${storeId} (${triggerType})`);
return result.rows[0];
}
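// Example (sketch): enqueue a high-priority manual job for store 42; if that
// store already has a pending/running job, the existing row is returned
// rather than creating a duplicate:
//
//   const job = await createCrawlJob(42, 'full_crawl', 'manual', new Date(), 100);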
/**
* Get pending jobs ready to run
*/
export async function getPendingJobs(limit: number = 5): Promise<CrawlJob[]> {
const result = await pool.query(`
SELECT cj.*, s.name as store_name
FROM crawl_jobs cj
JOIN stores s ON s.id = cj.store_id
WHERE cj.status = 'pending'
AND cj.scheduled_at <= NOW()
ORDER BY cj.priority DESC, cj.scheduled_at ASC
LIMIT $1
`, [limit]);
return result.rows;
}
/**
* Claim a job for processing
*/
export async function claimJob(jobId: number): Promise<boolean> {
const result = await pool.query(`
UPDATE crawl_jobs
SET status = 'running', started_at = NOW(), worker_id = $2
WHERE id = $1 AND status = 'pending'
RETURNING id
`, [jobId, WORKER_ID]);
return result.rows.length > 0;
}
/**
* Complete a job
*/
export async function completeJob(
jobId: number,
success: boolean,
results?: { products_found?: number; products_new?: number; products_updated?: number; error_message?: string }
): Promise<void> {
await pool.query(`
UPDATE crawl_jobs
SET
status = $2,
completed_at = NOW(),
products_found = $3,
error_message = $4
WHERE id = $1
`, [
jobId,
success ? 'completed' : 'failed',
results?.products_found ?? null,
results?.error_message ?? null
]);
}
/**
* Get recent jobs for a store
*/
export async function getRecentJobs(storeId: number, limit: number = 10): Promise<CrawlJob[]> {
const result = await pool.query(`
SELECT * FROM crawl_jobs
WHERE store_id = $1
ORDER BY created_at DESC
LIMIT $2
`, [storeId, limit]);
return result.rows;
}
/**
* Get all recent jobs
*/
export async function getAllRecentJobs(limit: number = 50): Promise<any[]> {
const result = await pool.query(`
SELECT cj.*, s.name as store_name, s.slug as store_slug
FROM crawl_jobs cj
JOIN stores s ON s.id = cj.store_id
ORDER BY cj.created_at DESC
LIMIT $1
`, [limit]);
return result.rows;
}
// ============================================
// Scheduler Logic
// ============================================
/**
* Check which stores are due for a crawl and create jobs
*/
export async function checkAndCreateScheduledJobs(): Promise<number> {
console.log('Checking for stores due for crawl...');
// Get global schedule settings
const globalSchedule = await pool.query(`
SELECT * FROM crawler_schedule WHERE schedule_type = 'global_interval'
`);
if (globalSchedule.rows.length === 0 || !globalSchedule.rows[0].enabled) {
console.log('Global scheduler is disabled');
return 0;
}
const intervalHours = globalSchedule.rows[0].interval_hours || 4;
// Find stores due for crawl
const result = await pool.query(`
SELECT
s.id,
s.name,
s.timezone,
s.last_scraped_at,
COALESCE(scs.enabled, TRUE) as schedule_enabled,
COALESCE(scs.interval_hours, $1) as interval_hours,
COALESCE(scs.priority, 0) as priority
FROM stores s
LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
WHERE s.active = TRUE
AND s.scrape_enabled = TRUE
AND COALESCE(scs.enabled, TRUE) = TRUE
AND (
s.last_scraped_at IS NULL
OR s.last_scraped_at < NOW() - (COALESCE(scs.interval_hours, $1) || ' hours')::INTERVAL
)
AND NOT EXISTS (
SELECT 1 FROM crawl_jobs cj
WHERE cj.store_id = s.id AND cj.status IN ('pending', 'running')
)
ORDER BY COALESCE(scs.priority, 0) DESC, s.last_scraped_at ASC NULLS FIRST
`, [intervalHours]);
let jobsCreated = 0;
for (const store of result.rows) {
try {
await createCrawlJob(store.id, 'full_crawl', 'scheduled', new Date(), store.priority);
jobsCreated++;
console.log(`Scheduled crawl job for: ${store.name}`);
} catch (error) {
console.error(`Failed to create job for store ${store.name}:`, error);
}
}
console.log(`Created ${jobsCreated} scheduled crawl jobs`);
return jobsCreated;
}
/**
* Check for daily special runs (12:01 AM local time)
*/
export async function checkAndCreateDailySpecialJobs(): Promise<number> {
console.log('Checking for daily special runs...');
// Get daily special schedule
const dailySchedule = await pool.query(`
SELECT * FROM crawler_schedule WHERE schedule_type = 'daily_special'
`);
if (dailySchedule.rows.length === 0 || !dailySchedule.rows[0].enabled) {
console.log('Daily special scheduler is disabled');
return 0;
}
const targetTime = dailySchedule.rows[0].run_time || '00:01';
// Find stores where it's currently the target time in their local timezone
// and they haven't had a daily special run today
const result = await pool.query(`
SELECT
s.id,
s.name,
s.timezone,
COALESCE(scs.daily_special_enabled, TRUE) as daily_special_enabled,
COALESCE(scs.daily_special_time, $1::TIME) as daily_special_time,
COALESCE(scs.priority, 0) as priority
FROM stores s
LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
WHERE s.active = TRUE
AND s.scrape_enabled = TRUE
AND COALESCE(scs.daily_special_enabled, TRUE) = TRUE
-- Check if current time in store timezone matches the target time (within 2 minutes)
AND ABS(
EXTRACT(EPOCH FROM (
(NOW() AT TIME ZONE COALESCE(s.timezone, 'America/Phoenix'))::TIME
- COALESCE(scs.daily_special_time, $1::TIME)
))
) < 120 -- within 2 minutes
-- Ensure we haven't already created a daily_special job today for this store
AND NOT EXISTS (
SELECT 1 FROM crawl_jobs cj
WHERE cj.store_id = s.id
AND cj.trigger_type = 'daily_special'
AND cj.created_at > (NOW() AT TIME ZONE COALESCE(s.timezone, 'America/Phoenix'))::DATE
)
AND NOT EXISTS (
SELECT 1 FROM crawl_jobs cj
WHERE cj.store_id = s.id AND cj.status IN ('pending', 'running')
)
ORDER BY COALESCE(scs.priority, 0) DESC
`, [targetTime]);
let jobsCreated = 0;
for (const store of result.rows) {
try {
await createCrawlJob(store.id, 'full_crawl', 'daily_special', new Date(), store.priority + 10);
jobsCreated++;
console.log(`Created daily special job for: ${store.name} (${store.timezone})`);
} catch (error) {
console.error(`Failed to create daily special job for store ${store.name}:`, error);
}
}
if (jobsCreated > 0) {
console.log(`Created ${jobsCreated} daily special crawl jobs`);
}
return jobsCreated;
}
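/**
 * A minimal TypeScript sketch (an illustration, not used by this service) of
 * the same 2-minute window check the SQL in checkAndCreateDailySpecialJobs
 * performs, for readers less fluent in Postgres time arithmetic. Assumes
 * targetTime is 'HH:MM'; like the SQL, it ignores midnight wraparound.
 */
function isDailySpecialDue(timezone: string, targetTime: string, now = new Date()): boolean {
  // Render "now" as HH:MM in the store's local timezone.
  const local = new Intl.DateTimeFormat('en-US', {
    timeZone: timezone,
    hourCycle: 'h23',
    hour: '2-digit',
    minute: '2-digit',
  }).format(now); // e.g. "00:02"
  const [h, m] = local.split(':').map(Number);
  const [th, tm] = targetTime.split(':').map(Number);
  // Same tolerance as the SQL: strictly within 120 seconds of the target.
  return Math.abs(h * 60 + m - (th * 60 + tm)) * 60 < 120;
}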
/**
* Process pending jobs
*/
export async function processJobs(): Promise<void> {
if (jobProcessorRunning) {
console.log('Job processor already running, skipping...');
return;
}
jobProcessorRunning = true;
try {
const jobs = await getPendingJobs(1); // Process one at a time for safety
for (const job of jobs) {
console.log(`Processing job ${job.id} for store: ${(job as any).store_name}`);
const claimed = await claimJob(job.id);
if (!claimed) {
console.log(`Job ${job.id} already claimed by another worker`);
continue;
}
try {
// Call the existing scraper - DO NOT MODIFY SCRAPER LOGIC
await scrapeStore(job.store_id);
// Update store's last_scraped_at
await pool.query(`
UPDATE stores SET last_scraped_at = NOW() WHERE id = $1
`, [job.store_id]);
await completeJob(job.id, true, {});
console.log(`Job ${job.id} completed successfully`);
} catch (error: any) {
console.error(`Job ${job.id} failed:`, error);
await completeJob(job.id, false, { error_message: error.message });
}
}
} finally {
jobProcessorRunning = false;
}
}
/**
* Process stores using the intelligent orchestrator
* This replaces the simple job queue approach with intelligent provider detection
*/
export async function processOrchestrator(): Promise<void> {
if (orchestratorProcessorRunning) {
console.log('Orchestrator processor already running, skipping...');
return;
}
orchestratorProcessorRunning = true;
try {
// Get stores due for orchestration (respects schedule, intervals, etc.)
const storeIds = await getStoresDueForOrchestration(3); // Process up to 3 at a time
if (storeIds.length === 0) {
return;
}
console.log(`Orchestrator: Processing ${storeIds.length} stores due for crawl`);
// Process each store through the orchestrator
for (const storeId of storeIds) {
try {
console.log(`Orchestrator: Starting crawl for store ${storeId}`);
const result = await runStoreCrawlOrchestrator(storeId);
console.log(`Orchestrator: Store ${storeId} completed - ${result.summary}`);
} catch (error: any) {
console.error(`Orchestrator: Store ${storeId} failed - ${error.message}`);
}
}
console.log(`Orchestrator: Finished processing ${storeIds.length} stores`);
} finally {
orchestratorProcessorRunning = false;
}
}
// ============================================
// Scheduler Control
// ============================================
/**
* Set scheduler mode
*/
export function setSchedulerMode(mode: 'legacy' | 'orchestrator'): void {
schedulerMode = mode;
console.log(`Scheduler mode set to: ${mode}`);
}
/**
* Get current scheduler mode
*/
export function getSchedulerMode(): 'legacy' | 'orchestrator' {
return schedulerMode;
}
/**
* Start the scheduler (runs every minute to check for due jobs)
*/
export async function startCrawlScheduler(): Promise<void> {
stopCrawlScheduler();
console.log(`Starting crawl scheduler in ${schedulerMode} mode...`);
// Run every minute
schedulerCronJob = cron.schedule('* * * * *', async () => {
try {
if (schedulerMode === 'orchestrator') {
// Use intelligent orchestrator (handles detection + crawl)
await processOrchestrator();
} else {
// Legacy mode: job queue approach
// Check for interval-based scheduled jobs
await checkAndCreateScheduledJobs();
// Check for daily special runs
await checkAndCreateDailySpecialJobs();
// Process any pending jobs
await processJobs();
}
} catch (error) {
console.error('Scheduler tick error:', error);
}
});
console.log(`Crawl scheduler started in ${schedulerMode} mode (checking every minute)`);
}
/**
* Stop the scheduler
*/
export function stopCrawlScheduler(): void {
if (schedulerCronJob) {
schedulerCronJob.stop();
schedulerCronJob = null;
console.log('Crawl scheduler stopped');
}
}
/**
* Restart the scheduler
*/
export async function restartCrawlScheduler(): Promise<void> {
await startCrawlScheduler();
}
// ============================================
// Manual Triggers
// ============================================
/**
* Manually trigger a crawl for a specific store (creates a job immediately)
*/
export async function triggerManualCrawl(storeId: number): Promise<CrawlJob> {
console.log(`Manual crawl triggered for store ID: ${storeId}`);
return await createCrawlJob(storeId, 'full_crawl', 'manual', new Date(), 100); // High priority
}
/**
* Manually trigger crawls for all stores
*/
export async function triggerAllStoresCrawl(): Promise<number> {
console.log('Manual crawl triggered for all stores');
const result = await pool.query(`
SELECT id, name FROM stores
WHERE active = TRUE AND scrape_enabled = TRUE
AND NOT EXISTS (
SELECT 1 FROM crawl_jobs cj
WHERE cj.store_id = stores.id AND cj.status IN ('pending', 'running')
)
`);
let jobsCreated = 0;
for (const store of result.rows) {
await createCrawlJob(store.id, 'full_crawl', 'manual', new Date(), 50);
jobsCreated++;
}
console.log(`Created ${jobsCreated} manual crawl jobs`);
return jobsCreated;
}
/**
* Cancel a pending job
*/
export async function cancelJob(jobId: number): Promise<boolean> {
const result = await pool.query(`
UPDATE crawl_jobs
SET status = 'cancelled'
WHERE id = $1 AND status = 'pending'
RETURNING id
`, [jobId]);
return result.rows.length > 0;
}
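// Example (sketch): wiring the scheduler into server startup. The
// SCHEDULER_MODE env var and the bootstrap location are assumptions, not part
// of this service:
//
//   setSchedulerMode(process.env.SCHEDULER_MODE === 'legacy' ? 'legacy' : 'orchestrator');
//   await startCrawlScheduler();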

View File

@@ -0,0 +1,645 @@
/**
* Crawler Jobs Service
*
* Handles three types of jobs:
* 1. DetectMenuProviderJob - Detect menu provider for a dispensary
* 2. DutchieMenuCrawlJob - Production Dutchie crawl
* 3. SandboxCrawlJob - Learning/testing crawl for unknown providers
*/
import { pool } from '../db/migrate';
import { logger } from './logger';
import { detectMenuProvider, detectProviderChange, MenuProvider } from './menu-provider-detector';
import { scrapeStore } from '../scraper-v2';
import puppeteer, { Browser, Page } from 'puppeteer';
import { promises as fs } from 'fs';
import path from 'path';
const WORKER_ID = `crawler-${process.pid}-${Date.now()}`;
// ========================================
// Types
// ========================================
interface Dispensary {
id: number;
name: string;
website: string | null;
menu_url: string | null;
menu_provider: MenuProvider | null;
menu_provider_confidence: number;
crawler_mode: 'production' | 'sandbox';
crawler_status: string;
scraper_template: string | null;
}
interface JobResult {
success: boolean;
message: string;
data?: Record<string, any>;
}
// ========================================
// Helper Functions
// ========================================
async function getDispensary(dispensaryId: number): Promise<Dispensary | null> {
const result = await pool.query(
`SELECT id, name, website, menu_url, menu_provider, menu_provider_confidence,
crawler_mode, crawler_status, scraper_template
FROM dispensaries WHERE id = $1`,
[dispensaryId]
);
return result.rows[0] || null;
}
async function updateDispensary(
dispensaryId: number,
updates: Partial<Dispensary> & { last_menu_error_at?: Date; last_error_message?: string; provider_detection_data?: any; last_menu_scrape?: Date; menu_scrape_status?: string }
): Promise<void> {
const setClauses: string[] = [];
const values: any[] = [];
let paramIndex = 1;
for (const [key, value] of Object.entries(updates)) {
setClauses.push(`${key} = $${paramIndex}`);
values.push(value);
paramIndex++;
}
setClauses.push(`updated_at = NOW()`);
values.push(dispensaryId);
await pool.query(
`UPDATE dispensaries SET ${setClauses.join(', ')} WHERE id = $${paramIndex}`,
values
);
}
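// For reference, updateDispensary(42, { crawler_status: 'ok' }) issues:
//   UPDATE dispensaries SET crawler_status = $1, updated_at = NOW() WHERE id = $2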
async function createSandboxEntry(
dispensaryId: number,
suspectedProvider: string | null,
mode: string,
detectionSignals?: any
): Promise<number> {
// First, check if there's an existing active sandbox
const existing = await pool.query(
`SELECT id FROM crawler_sandboxes
WHERE dispensary_id = $1 AND status NOT IN ('moved_to_production', 'failed')`,
[dispensaryId]
);
if (existing.rows.length > 0) {
// Update existing
await pool.query(
`UPDATE crawler_sandboxes
SET suspected_menu_provider = $2, mode = $3, detection_signals = COALESCE($4, detection_signals), updated_at = NOW()
WHERE id = $1`,
[existing.rows[0].id, suspectedProvider, mode, detectionSignals ? JSON.stringify(detectionSignals) : null]
);
return existing.rows[0].id;
}
// Create new
const result = await pool.query(
`INSERT INTO crawler_sandboxes (dispensary_id, suspected_menu_provider, mode, detection_signals, status)
VALUES ($1, $2, $3, $4, 'pending')
RETURNING id`,
[dispensaryId, suspectedProvider, mode, detectionSignals ? JSON.stringify(detectionSignals) : '{}']
);
return result.rows[0].id;
}
async function createSandboxJob(
dispensaryId: number,
sandboxId: number | null,
jobType: string,
priority: number = 0
): Promise<number> {
const result = await pool.query(
`INSERT INTO sandbox_crawl_jobs (dispensary_id, sandbox_id, job_type, status, priority)
VALUES ($1, $2, $3, 'pending', $4)
RETURNING id`,
[dispensaryId, sandboxId, jobType, priority]
);
return result.rows[0].id;
}
// Get linked store ID for a dispensary (for using existing scraper)
async function getStoreIdForDispensary(dispensaryId: number): Promise<number | null> {
  // Try to match a stores row by menu URL or fuzzy name
const result = await pool.query(
`SELECT s.id FROM stores s
JOIN dispensaries d ON d.menu_url = s.dutchie_plus_url OR d.name ILIKE '%' || s.name || '%'
WHERE d.id = $1
LIMIT 1`,
[dispensaryId]
);
if (result.rows.length > 0) {
return result.rows[0].id;
}
// Try to find by website
const result2 = await pool.query(
`SELECT s.id FROM stores s
JOIN dispensaries d ON d.website ILIKE '%' || s.slug || '%'
WHERE d.id = $1
LIMIT 1`,
[dispensaryId]
);
return result2.rows[0]?.id || null;
}
// ========================================
// Job 1: Detect Menu Provider
// ========================================
export async function runDetectMenuProviderJob(dispensaryId: number): Promise<JobResult> {
logger.info('crawler-jobs', `Starting menu provider detection for dispensary ${dispensaryId}`);
const dispensary = await getDispensary(dispensaryId);
if (!dispensary) {
return { success: false, message: `Dispensary ${dispensaryId} not found` };
}
// Check for website URL
const websiteUrl = dispensary.website || dispensary.menu_url;
if (!websiteUrl) {
await updateDispensary(dispensaryId, {
crawler_status: 'error_needs_review',
last_menu_error_at: new Date(),
last_error_message: 'No website URL available for detection',
});
return { success: false, message: 'No website URL available' };
}
try {
// Run detection
const detection = await detectMenuProvider(websiteUrl, {
checkMenuPaths: true,
timeout: 30000,
});
// Update dispensary with results
const updates: any = {
menu_provider: detection.provider,
menu_provider_confidence: detection.confidence,
provider_detection_data: JSON.stringify({
signals: detection.signals,
urlsTested: detection.urlsTested,
menuEntryPoints: detection.menuEntryPoints,
rawSignals: detection.rawSignals,
detectedAt: new Date().toISOString(),
}),
crawler_status: 'idle',
};
// Decide crawler mode based on provider
if (detection.provider === 'dutchie' && detection.confidence >= 70) {
// Dutchie with high confidence -> production
updates.crawler_mode = 'production';
logger.info('crawler-jobs', `Dispensary ${dispensaryId} detected as Dutchie (${detection.confidence}%), setting to production`);
} else {
// Unknown or non-Dutchie -> sandbox
updates.crawler_mode = 'sandbox';
// Create sandbox entry for further analysis
const sandboxId = await createSandboxEntry(
dispensaryId,
detection.provider,
'detection',
{
signals: detection.signals,
rawSignals: detection.rawSignals,
}
);
// Queue sandbox crawl job
await createSandboxJob(dispensaryId, sandboxId, 'detection');
logger.info('crawler-jobs', `Dispensary ${dispensaryId} detected as ${detection.provider} (${detection.confidence}%), setting to sandbox`);
}
// Update menu entry points if found
if (detection.menuEntryPoints.length > 0 && !dispensary.menu_url) {
updates.menu_url = detection.menuEntryPoints[0];
}
await updateDispensary(dispensaryId, updates);
return {
success: true,
message: `Detected provider: ${detection.provider} (${detection.confidence}%)`,
data: {
provider: detection.provider,
confidence: detection.confidence,
mode: updates.crawler_mode,
menuEntryPoints: detection.menuEntryPoints,
},
};
} catch (error: any) {
logger.error('crawler-jobs', `Detection failed for dispensary ${dispensaryId}: ${error.message}`);
await updateDispensary(dispensaryId, {
crawler_status: 'error_needs_review',
last_menu_error_at: new Date(),
last_error_message: `Detection failed: ${error.message}`,
});
return { success: false, message: error.message };
}
}
// ========================================
// Job 2: Dutchie Menu Crawl (Production)
// ========================================
export async function runDutchieMenuCrawlJob(dispensaryId: number): Promise<JobResult> {
logger.info('crawler-jobs', `Starting Dutchie production crawl for dispensary ${dispensaryId}`);
const dispensary = await getDispensary(dispensaryId);
if (!dispensary) {
return { success: false, message: `Dispensary ${dispensaryId} not found` };
}
// Verify it's a Dutchie production dispensary
if (dispensary.menu_provider !== 'dutchie') {
logger.warn('crawler-jobs', `Dispensary ${dispensaryId} is not Dutchie, skipping production crawl`);
return { success: false, message: 'Not a Dutchie dispensary' };
}
if (dispensary.crawler_mode !== 'production') {
logger.warn('crawler-jobs', `Dispensary ${dispensaryId} is not in production mode, skipping`);
return { success: false, message: 'Not in production mode' };
}
// Find linked store ID
const storeId = await getStoreIdForDispensary(dispensaryId);
if (!storeId) {
    // Production crawls run against a stores row; without a linked store there is nothing to scrape
logger.warn('crawler-jobs', `No linked store found for dispensary ${dispensaryId}`);
return { success: false, message: 'No linked store found - needs setup' };
}
try {
// Update status to running
await updateDispensary(dispensaryId, { crawler_status: 'running' });
// Run the existing Dutchie scraper
await scrapeStore(storeId, 3); // 3 parallel workers
// Update success status
await updateDispensary(dispensaryId, {
crawler_status: 'ok',
      last_menu_scrape: new Date(),
      menu_scrape_status: 'active',
});
logger.info('crawler-jobs', `Dutchie crawl completed for dispensary ${dispensaryId}`);
return {
success: true,
message: 'Dutchie crawl completed successfully',
data: { storeId },
};
} catch (error: any) {
logger.error('crawler-jobs', `Dutchie crawl failed for dispensary ${dispensaryId}: ${error.message}`);
// Check if this might be a provider change
let providerChanged = false;
try {
const browser = await puppeteer.launch({ headless: true, args: ['--no-sandbox'] });
const page = await browser.newPage();
const url = dispensary.menu_url || dispensary.website;
if (url) {
await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 });
const changeResult = await detectProviderChange(page, 'dutchie');
providerChanged = changeResult.changed;
if (providerChanged) {
// Provider changed - move to sandbox
await updateDispensary(dispensaryId, {
crawler_mode: 'sandbox',
crawler_status: 'error_needs_review',
last_menu_error_at: new Date(),
last_error_message: `Provider appears to have changed from Dutchie to ${changeResult.newProvider}`,
});
const sandboxId = await createSandboxEntry(
dispensaryId,
changeResult.newProvider || 'unknown',
'detection',
{ providerChangeDetected: true, previousProvider: 'dutchie' }
);
await createSandboxJob(dispensaryId, sandboxId, 'detection');
logger.warn('crawler-jobs', `Provider change detected for dispensary ${dispensaryId}: Dutchie -> ${changeResult.newProvider}`);
}
}
await browser.close();
} catch {
// Ignore detection errors during failure handling
}
if (!providerChanged) {
await updateDispensary(dispensaryId, {
crawler_status: 'error_needs_review',
last_menu_error_at: new Date(),
last_error_message: error.message,
});
}
return { success: false, message: error.message };
}
}
// ========================================
// Job 3: Sandbox Crawl (Learning Mode)
// ========================================
export async function runSandboxCrawlJob(dispensaryId: number, sandboxId?: number): Promise<JobResult> {
logger.info('crawler-jobs', `Starting sandbox crawl for dispensary ${dispensaryId}`);
const dispensary = await getDispensary(dispensaryId);
if (!dispensary) {
return { success: false, message: `Dispensary ${dispensaryId} not found` };
}
// Get or create sandbox entry
let sandbox: any;
if (sandboxId) {
const result = await pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [sandboxId]);
sandbox = result.rows[0];
} else {
const result = await pool.query(
`SELECT * FROM crawler_sandboxes
WHERE dispensary_id = $1 AND status NOT IN ('moved_to_production', 'failed')
ORDER BY created_at DESC LIMIT 1`,
[dispensaryId]
);
sandbox = result.rows[0];
if (!sandbox) {
const newSandboxId = await createSandboxEntry(dispensaryId, dispensary.menu_provider, 'template_learning');
const result = await pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [newSandboxId]);
sandbox = result.rows[0];
}
}
const websiteUrl = dispensary.menu_url || dispensary.website;
if (!websiteUrl) {
await pool.query(
`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = 'No website URL' WHERE id = $1`,
[sandbox.id]
);
return { success: false, message: 'No website URL available' };
}
let browser: Browser | null = null;
try {
// Update status
await pool.query(
`UPDATE crawler_sandboxes SET status = 'analyzing', updated_at = NOW() WHERE id = $1`,
[sandbox.id]
);
await updateDispensary(dispensaryId, { crawler_status: 'running' });
// Launch browser
browser = await puppeteer.launch({
headless: true,
args: ['--no-sandbox', '--disable-setuid-sandbox'],
});
const page = await browser.newPage();
await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
);
// URLs to crawl (limited depth for sandbox)
const urlsToVisit = [websiteUrl];
const menuPaths = ['/menu', '/shop', '/products', '/order'];
    const baseUrl = new URL(websiteUrl).origin;
    // `menuPath`, not `path`, to avoid shadowing the path module imported above
    for (const menuPath of menuPaths) {
      urlsToVisit.push(`${baseUrl}${menuPath}`);
    }
const urlsTested: string[] = [];
const menuEntryPoints: string[] = [];
const capturedHtml: { url: string; html: string }[] = [];
const analysisData: any = {
provider_signals: {},
selector_candidates: [],
page_structures: [],
};
// Crawl each URL
for (const url of urlsToVisit) {
try {
urlsTested.push(url);
await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 });
await new Promise(r => setTimeout(r, 2000)); // Wait for dynamic content
// Get page HTML
const html = await page.content();
// Check if this looks like a menu page
const hasMenuContent = await page.evaluate(() => {
const text = document.body.innerText.toLowerCase();
return (
text.includes('add to cart') ||
text.includes('thc') ||
text.includes('indica') ||
text.includes('sativa')
);
});
if (hasMenuContent) {
menuEntryPoints.push(url);
capturedHtml.push({ url, html });
// Analyze page structure for selector candidates
const structure = await page.evaluate(() => {
const candidates: any[] = [];
// Look for product-like containers
const productSelectors = [
'.product', '.product-card', '.menu-item', '.item-card',
'[data-product]', '[data-item]', '.strain', '.listing',
];
for (const selector of productSelectors) {
const els = document.querySelectorAll(selector);
if (els.length > 3) { // Likely a list
candidates.push({
selector,
count: els.length,
type: 'product_container',
});
}
}
// Look for price patterns
          const pricePattern = /\$\d+(\.\d{2})?/g; // global flag so match() returns every hit
          const bodyText = document.body.innerText;
          const priceMatches = bodyText.match(pricePattern);
          return {
            candidates,
            priceCount: priceMatches?.length || 0,
            hasAddToCart: bodyText.toLowerCase().includes('add to cart'),
          };
});
analysisData.page_structures.push({
url,
...structure,
});
}
} catch (pageError: any) {
if (!pageError.message.includes('404')) {
logger.warn('crawler-jobs', `Sandbox crawl error for ${url}: ${pageError.message}`);
}
}
}
// Save HTML to storage (local for now, S3 later)
let rawHtmlLocation: string | null = null;
if (capturedHtml.length > 0) {
const htmlDir = path.join(process.cwd(), 'sandbox-data', `dispensary-${dispensaryId}`);
await fs.mkdir(htmlDir, { recursive: true });
for (const { url, html } of capturedHtml) {
const filename = `${Date.now()}-${url.replace(/[^a-z0-9]/gi, '_')}.html`;
await fs.writeFile(path.join(htmlDir, filename), html);
}
rawHtmlLocation = htmlDir;
}
// Update sandbox with results
await pool.query(
`UPDATE crawler_sandboxes SET
status = $1,
urls_tested = $2,
menu_entry_points = $3,
raw_html_location = $4,
analysis_json = $5,
confidence_score = $6,
analyzed_at = NOW(),
updated_at = NOW()
WHERE id = $7`,
[
menuEntryPoints.length > 0 ? 'needs_human_review' : 'pending',
JSON.stringify(urlsTested),
JSON.stringify(menuEntryPoints),
rawHtmlLocation,
JSON.stringify(analysisData),
menuEntryPoints.length > 0 ? 50 : 20,
sandbox.id,
]
);
// Update dispensary status
await updateDispensary(dispensaryId, {
crawler_status: 'error_needs_review', // Sandbox results need review
});
logger.info('crawler-jobs', `Sandbox crawl completed for dispensary ${dispensaryId}: ${menuEntryPoints.length} menu pages found`);
return {
success: true,
message: `Sandbox crawl completed. Found ${menuEntryPoints.length} menu entry points.`,
data: {
sandboxId: sandbox.id,
urlsTested: urlsTested.length,
menuEntryPoints,
analysisData,
},
};
} catch (error: any) {
logger.error('crawler-jobs', `Sandbox crawl failed for dispensary ${dispensaryId}: ${error.message}`);
await pool.query(
`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = $1 WHERE id = $2`,
[error.message, sandbox.id]
);
await updateDispensary(dispensaryId, {
crawler_status: 'error_needs_review',
last_menu_error_at: new Date(),
last_error_message: `Sandbox crawl failed: ${error.message}`,
});
return { success: false, message: error.message };
} finally {
if (browser) {
await browser.close();
}
}
}
// ========================================
// Queue Processing Functions
// ========================================
/**
* Process pending sandbox jobs
*/
export async function processSandboxJobs(limit: number = 5): Promise<void> {
// Claim pending jobs
const jobs = await pool.query(
`UPDATE sandbox_crawl_jobs
SET status = 'running', worker_id = $1, started_at = NOW()
WHERE id IN (
SELECT id FROM sandbox_crawl_jobs
WHERE status = 'pending' AND scheduled_at <= NOW()
ORDER BY priority DESC, scheduled_at ASC
LIMIT $2
FOR UPDATE SKIP LOCKED
)
RETURNING *`,
[WORKER_ID, limit]
);
for (const job of jobs.rows) {
try {
let result: JobResult;
if (job.job_type === 'detection') {
result = await runDetectMenuProviderJob(job.dispensary_id);
} else {
result = await runSandboxCrawlJob(job.dispensary_id, job.sandbox_id);
}
await pool.query(
`UPDATE sandbox_crawl_jobs
SET status = $1, completed_at = NOW(), result_summary = $2, error_message = $3
WHERE id = $4`,
[
result.success ? 'completed' : 'failed',
JSON.stringify(result.data || {}),
result.success ? null : result.message,
job.id,
]
);
} catch (error: any) {
await pool.query(
`UPDATE sandbox_crawl_jobs SET status = 'failed', error_message = $1 WHERE id = $2`,
[error.message, job.id]
);
}
}
}
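// Worker-loop sketch (hypothetical wiring; the real scheduler drives this elsewhere):
//   setInterval(() => {
//     processSandboxJobs(5).catch((err) =>
//       logger.error('crawler-jobs', `Sandbox queue tick failed: ${err.message}`)
//     );
//   }, 60_000);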

View File

@@ -0,0 +1,414 @@
/**
* CrawlerLogger - Structured logging for crawler operations
*
* High-signal, low-noise logging with JSON output for:
* - Job lifecycle (one summary per job)
* - Provider/mode changes
* - Sandbox events
* - Queue failures
*
* NO per-product logging - that's too noisy.
*/
export type LogLevel = 'info' | 'warn' | 'error' | 'debug';
export type LogEvent =
| 'job_started'
| 'job_completed'
| 'job_failed'
| 'job_cancelled'
| 'provider_detected'
| 'provider_changed'
| 'mode_changed'
| 'sandbox_started'
| 'sandbox_completed'
| 'sandbox_failed'
| 'queue_failure'
| 'detection_scan'
| 'crawl_batch'
| 'intelligence_run';
interface BaseLogPayload {
timestamp: string;
level: LogLevel;
event: LogEvent;
dispensary_id?: number;
store_id?: number;
job_id?: number;
provider?: string;
category?: 'product' | 'specials' | 'brand' | 'metadata';
}
interface JobStartedPayload extends BaseLogPayload {
event: 'job_started';
job_type: string;
trigger_type: string;
store_name: string;
}
interface JobCompletedPayload extends BaseLogPayload {
event: 'job_completed';
store_name: string;
duration_ms: number;
products_found: number;
products_new: number;
products_updated: number;
products_marked_oos?: number;
}
interface JobFailedPayload extends BaseLogPayload {
event: 'job_failed';
store_name: string;
duration_ms: number;
error_message: string;
error_code?: string;
}
interface ProviderDetectedPayload extends BaseLogPayload {
event: 'provider_detected';
dispensary_name: string;
detected_provider: string;
confidence: number;
detection_method: string;
menu_url?: string;
}
interface ProviderChangedPayload extends BaseLogPayload {
event: 'provider_changed';
dispensary_name: string;
old_provider: string | null;
new_provider: string;
old_confidence: number;
new_confidence: number;
}
interface ModeChangedPayload extends BaseLogPayload {
event: 'mode_changed';
dispensary_name: string;
old_mode: string;
new_mode: string;
reason: string;
}
interface SandboxEventPayload extends BaseLogPayload {
event: 'sandbox_started' | 'sandbox_completed' | 'sandbox_failed';
dispensary_name: string;
template_name: string;
quality_score?: number;
products_extracted?: number;
fields_missing?: number;
error_message?: string;
}
interface QueueFailurePayload extends BaseLogPayload {
event: 'queue_failure';
queue_type: string;
error_message: string;
affected_items?: number;
}
interface DetectionScanPayload extends BaseLogPayload {
event: 'detection_scan';
total_scanned: number;
detected: number;
failed: number;
skipped: number;
duration_ms: number;
}
interface IntelligenceRunPayload extends BaseLogPayload {
event: 'intelligence_run';
run_type: 'detection' | 'production' | 'sandbox' | 'full';
dispensaries_processed: number;
jobs_queued: number;
duration_ms: number;
}
type LogPayload =
| JobStartedPayload
| JobCompletedPayload
| JobFailedPayload
| ProviderDetectedPayload
| ProviderChangedPayload
| ModeChangedPayload
| SandboxEventPayload
| QueueFailurePayload
| DetectionScanPayload
| IntelligenceRunPayload;
class CrawlerLoggerService {
private formatLog(payload: LogPayload): string {
return JSON.stringify(payload);
}
private log(payload: LogPayload): void {
const formatted = this.formatLog(payload);
switch (payload.level) {
case 'error':
console.error(`[CRAWLER] ${formatted}`);
break;
case 'warn':
console.warn(`[CRAWLER] ${formatted}`);
break;
case 'debug':
console.debug(`[CRAWLER] ${formatted}`);
break;
default:
console.log(`[CRAWLER] ${formatted}`);
}
}
/**
* Log when a crawl job starts
*/
jobStarted(params: {
job_id: number;
store_id: number;
store_name: string;
job_type: string;
trigger_type: string;
provider?: string;
}): void {
this.log({
timestamp: new Date().toISOString(),
level: 'info',
event: 'job_started',
job_id: params.job_id,
store_id: params.store_id,
store_name: params.store_name,
job_type: params.job_type,
trigger_type: params.trigger_type,
provider: params.provider,
});
}
/**
* Log when a crawl job completes successfully
*/
jobCompleted(params: {
job_id: number;
store_id: number;
store_name: string;
duration_ms: number;
products_found: number;
products_new: number;
products_updated: number;
products_marked_oos?: number;
provider?: string;
}): void {
this.log({
timestamp: new Date().toISOString(),
level: 'info',
event: 'job_completed',
job_id: params.job_id,
store_id: params.store_id,
store_name: params.store_name,
duration_ms: params.duration_ms,
products_found: params.products_found,
products_new: params.products_new,
products_updated: params.products_updated,
products_marked_oos: params.products_marked_oos,
provider: params.provider,
});
}
/**
* Log when a crawl job fails
*/
jobFailed(params: {
job_id: number;
store_id: number;
store_name: string;
duration_ms: number;
error_message: string;
error_code?: string;
provider?: string;
}): void {
this.log({
timestamp: new Date().toISOString(),
level: 'error',
event: 'job_failed',
job_id: params.job_id,
store_id: params.store_id,
store_name: params.store_name,
duration_ms: params.duration_ms,
error_message: params.error_message,
error_code: params.error_code,
provider: params.provider,
});
}
/**
* Log when a provider is detected for a dispensary
*/
providerDetected(params: {
dispensary_id: number;
dispensary_name: string;
detected_provider: string;
confidence: number;
detection_method: string;
menu_url?: string;
category?: 'product' | 'specials' | 'brand' | 'metadata';
}): void {
this.log({
timestamp: new Date().toISOString(),
level: 'info',
event: 'provider_detected',
dispensary_id: params.dispensary_id,
dispensary_name: params.dispensary_name,
detected_provider: params.detected_provider,
confidence: params.confidence,
detection_method: params.detection_method,
menu_url: params.menu_url,
category: params.category,
});
}
/**
* Log when a dispensary's provider changes
*/
providerChanged(params: {
dispensary_id: number;
dispensary_name: string;
old_provider: string | null;
new_provider: string;
old_confidence: number;
new_confidence: number;
category?: 'product' | 'specials' | 'brand' | 'metadata';
}): void {
this.log({
timestamp: new Date().toISOString(),
level: 'info',
event: 'provider_changed',
dispensary_id: params.dispensary_id,
dispensary_name: params.dispensary_name,
old_provider: params.old_provider,
new_provider: params.new_provider,
old_confidence: params.old_confidence,
new_confidence: params.new_confidence,
category: params.category,
});
}
/**
* Log when a dispensary's crawler mode changes (sandbox -> production, etc.)
*/
modeChanged(params: {
dispensary_id: number;
dispensary_name: string;
old_mode: string;
new_mode: string;
reason: string;
category?: 'product' | 'specials' | 'brand' | 'metadata';
provider?: string;
}): void {
this.log({
timestamp: new Date().toISOString(),
level: 'info',
event: 'mode_changed',
dispensary_id: params.dispensary_id,
dispensary_name: params.dispensary_name,
old_mode: params.old_mode,
new_mode: params.new_mode,
reason: params.reason,
category: params.category,
provider: params.provider,
});
}
/**
* Log sandbox crawl events
*/
sandboxEvent(params: {
event: 'sandbox_started' | 'sandbox_completed' | 'sandbox_failed';
dispensary_id: number;
dispensary_name: string;
template_name: string;
category?: 'product' | 'specials' | 'brand' | 'metadata';
quality_score?: number;
products_extracted?: number;
fields_missing?: number;
error_message?: string;
provider?: string;
}): void {
const level: LogLevel = params.event === 'sandbox_failed' ? 'error' : 'info';
this.log({
timestamp: new Date().toISOString(),
level,
event: params.event,
dispensary_id: params.dispensary_id,
dispensary_name: params.dispensary_name,
template_name: params.template_name,
category: params.category,
quality_score: params.quality_score,
products_extracted: params.products_extracted,
fields_missing: params.fields_missing,
error_message: params.error_message,
provider: params.provider,
});
}
/**
* Log queue processing failures
*/
queueFailure(params: {
queue_type: string;
error_message: string;
affected_items?: number;
}): void {
this.log({
timestamp: new Date().toISOString(),
level: 'error',
event: 'queue_failure',
queue_type: params.queue_type,
error_message: params.error_message,
affected_items: params.affected_items,
});
}
/**
* Log detection scan summary
*/
detectionScan(params: {
total_scanned: number;
detected: number;
failed: number;
skipped: number;
duration_ms: number;
}): void {
this.log({
timestamp: new Date().toISOString(),
level: 'info',
event: 'detection_scan',
total_scanned: params.total_scanned,
detected: params.detected,
failed: params.failed,
skipped: params.skipped,
duration_ms: params.duration_ms,
});
}
/**
* Log intelligence run summary
*/
intelligenceRun(params: {
run_type: 'detection' | 'production' | 'sandbox' | 'full';
dispensaries_processed: number;
jobs_queued: number;
duration_ms: number;
}): void {
this.log({
timestamp: new Date().toISOString(),
level: 'info',
event: 'intelligence_run',
run_type: params.run_type,
dispensaries_processed: params.dispensaries_processed,
jobs_queued: params.jobs_queued,
duration_ms: params.duration_ms,
});
}
}
// Export singleton instance
export const crawlerLogger = new CrawlerLoggerService();
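// Example call (values hypothetical) - emits one JSON line prefixed with [CRAWLER]:
//   crawlerLogger.jobCompleted({
//     job_id: 101, store_id: 7, store_name: 'Example Store', duration_ms: 42_000,
//     products_found: 350, products_new: 4, products_updated: 12, provider: 'dutchie',
//   });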

View File

@@ -0,0 +1,620 @@
/**
* Multi-Category Intelligence Detector
*
* Detects providers for each intelligence category independently:
* - Products: Which provider serves product data
* - Specials: Which provider serves deals/specials
* - Brand: Which provider serves brand information
* - Metadata: Which provider serves taxonomy/category data
*/
import { pool } from '../db/migrate';
import { logger } from './logger';
import puppeteer, { Browser, Page } from 'puppeteer';
// ========================================
// Types
// ========================================
export type IntelligenceCategory = 'product' | 'specials' | 'brand' | 'metadata';
export type MenuProvider =
| 'dutchie'
| 'treez'
| 'jane'
| 'iheartjane'
| 'weedmaps'
| 'leafly'
| 'meadow'
| 'greenlight'
| 'blaze'
| 'flowhub'
| 'dispense'
| 'cova'
| 'custom_html'
| 'custom_json'
| 'dutchie_json'
| 'other'
| 'unknown';
export interface CategoryDetectionResult {
provider: MenuProvider;
confidence: number;
mode: 'production' | 'sandbox';
signals: Record<string, any>;
templateName?: string;
}
export interface MultiCategoryDetectionResult {
product: CategoryDetectionResult;
specials: CategoryDetectionResult;
brand: CategoryDetectionResult;
metadata: CategoryDetectionResult;
urlsTested: string[];
rawSignals: Record<string, any>;
}
// Production-ready providers per category
// Only these combinations can be set to production mode
const PRODUCTION_READY: Record<IntelligenceCategory, MenuProvider[]> = {
product: ['dutchie'], // Only Dutchie products are production-ready
specials: [], // None yet
brand: [], // None yet
metadata: [], // None yet
};
// Provider detection patterns
const PROVIDER_PATTERNS: Record<string, {
scripts: RegExp[];
iframes: RegExp[];
html: RegExp[];
apiEndpoints: RegExp[];
metaTags: RegExp[];
}> = {
dutchie: {
scripts: [
/dutchie\.com/i,
/dutchie-plus/i,
/dutchie\.js/i,
/__DUTCHIE__/i,
/dutchie-embed/i,
],
iframes: [
/dutchie\.com/i,
/dutchie-plus\.com/i,
/embed\.dutchie/i,
],
html: [
/class="dutchie/i,
/id="dutchie/i,
/data-dutchie/i,
/"menuType":\s*"dutchie"/i,
],
apiEndpoints: [
/dutchie\.com\/graphql/i,
/plus\.dutchie\.com/i,
],
metaTags: [
/dutchie/i,
],
},
treez: {
scripts: [
/treez\.io/i,
/treez-ecommerce/i,
/treez\.js/i,
],
iframes: [
/treez\.io/i,
/shop\.treez/i,
],
html: [
/class="treez/i,
/data-treez/i,
/treez-menu/i,
],
apiEndpoints: [
/api\.treez\.io/i,
/treez\.io\/api/i,
],
metaTags: [],
},
jane: {
scripts: [
/jane\.co/i,
/iheartjane\.com/i,
/jane-frame/i,
/jane\.js/i,
],
iframes: [
/jane\.co/i,
/iheartjane\.com/i,
/embed\.iheartjane/i,
],
html: [
/class="jane/i,
/data-jane/i,
/jane-embed/i,
],
apiEndpoints: [
/api\.iheartjane/i,
/jane\.co\/api/i,
],
metaTags: [],
},
weedmaps: {
scripts: [
/weedmaps\.com/i,
/wm-menu/i,
],
iframes: [
/weedmaps\.com/i,
/menu\.weedmaps/i,
],
html: [
/data-weedmaps/i,
/wm-menu/i,
],
apiEndpoints: [
/api-g\.weedmaps/i,
/weedmaps\.com\/api/i,
],
metaTags: [],
},
leafly: {
scripts: [
/leafly\.com/i,
/leafly-menu/i,
],
iframes: [
/leafly\.com/i,
/order\.leafly/i,
],
html: [
/data-leafly/i,
/leafly-embed/i,
],
apiEndpoints: [
/api\.leafly/i,
],
metaTags: [],
},
};
// Category-specific detection signals
const CATEGORY_SIGNALS: Record<IntelligenceCategory, {
urlPatterns: RegExp[];
htmlPatterns: RegExp[];
jsonKeys: string[];
}> = {
product: {
urlPatterns: [/\/menu/i, /\/products/i, /\/shop/i, /\/order/i],
htmlPatterns: [/product-card/i, /menu-item/i, /product-list/i, /product-grid/i],
jsonKeys: ['products', 'menuItems', 'items', 'inventory'],
},
specials: {
urlPatterns: [/\/specials/i, /\/deals/i, /\/promotions/i, /\/offers/i],
htmlPatterns: [/special/i, /deal/i, /promotion/i, /discount/i, /sale/i],
jsonKeys: ['specials', 'deals', 'promotions', 'offers'],
},
brand: {
urlPatterns: [/\/brands/i, /\/vendors/i, /\/producers/i],
htmlPatterns: [/brand-list/i, /vendor/i, /producer/i, /manufacturer/i],
jsonKeys: ['brands', 'vendors', 'producers', 'manufacturers'],
},
metadata: {
urlPatterns: [/\/categories/i, /\/taxonomy/i],
htmlPatterns: [/category-nav/i, /menu-categories/i, /filter-category/i],
jsonKeys: ['categories', 'taxonomy', 'filters', 'types'],
},
};
// ========================================
// Main Detection Function
// ========================================
export async function detectMultiCategoryProviders(
websiteUrl: string,
options: {
timeout?: number;
headless?: boolean;
existingBrowser?: Browser;
} = {}
): Promise<MultiCategoryDetectionResult> {
const { timeout = 30000, headless = true, existingBrowser } = options;
let browser: Browser | null = null;
let page: Page | null = null;
const urlsTested: string[] = [];
const rawSignals: Record<string, any> = {};
try {
browser = existingBrowser || await puppeteer.launch({
headless,
args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
});
page = await browser.newPage();
await page.setViewport({ width: 1920, height: 1080 });
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');
// Navigate to main site
const baseUrl = normalizeUrl(websiteUrl);
urlsTested.push(baseUrl);
await page.goto(baseUrl, { waitUntil: 'networkidle2', timeout });
// Collect signals from main page
const mainPageSignals = await collectPageSignals(page);
rawSignals.mainPage = mainPageSignals;
// Try common menu URLs
const menuUrls = ['/menu', '/shop', '/products', '/order', '/specials', '/deals', '/brands'];
for (const path of menuUrls) {
try {
const fullUrl = new URL(path, baseUrl).toString();
urlsTested.push(fullUrl);
await page.goto(fullUrl, { waitUntil: 'networkidle2', timeout: 15000 });
const signals = await collectPageSignals(page);
rawSignals[path] = signals;
} catch {
// URL doesn't exist or timed out
}
}
// Analyze signals for each category
const result: MultiCategoryDetectionResult = {
product: analyzeCategorySignals('product', rawSignals),
specials: analyzeCategorySignals('specials', rawSignals),
brand: analyzeCategorySignals('brand', rawSignals),
metadata: analyzeCategorySignals('metadata', rawSignals),
urlsTested,
rawSignals,
};
logger.info('provider-detection', `Multi-category detection complete for ${websiteUrl}`);
return result;
} catch (error: any) {
logger.error('provider-detection', `Detection failed for ${websiteUrl}: ${error.message}`);
// Return unknown results for all categories
return {
product: createUnknownResult(),
specials: createUnknownResult(),
brand: createUnknownResult(),
metadata: createUnknownResult(),
urlsTested,
rawSignals: { error: error.message },
};
} finally {
if (page) await page.close().catch(() => {});
if (browser && !existingBrowser) await browser.close().catch(() => {});
}
}
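// Usage sketch (hypothetical URL):
//   const detection = await detectMultiCategoryProviders('https://example-dispensary.com');
//   console.log(detection.product.provider, detection.product.confidence, detection.product.mode);
//   // e.g. "dutchie 85 production" if strong Dutchie signals were found on a product page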
// ========================================
// Helper Functions
// ========================================
function normalizeUrl(url: string): string {
if (!url.startsWith('http')) {
url = 'https://' + url;
}
return url.replace(/\/$/, '');
}
async function collectPageSignals(page: Page): Promise<Record<string, any>> {
return page.evaluate(() => {
const signals: Record<string, any> = {
scripts: [] as string[],
iframes: [] as string[],
links: [] as string[],
metaTags: [] as string[],
bodyClasses: document.body?.className || '',
bodyId: document.body?.id || '',
htmlSnippet: document.documentElement.outerHTML.slice(0, 10000),
};
// Collect script sources
document.querySelectorAll('script[src]').forEach((el) => {
signals.scripts.push((el as HTMLScriptElement).src);
});
// Collect inline scripts
document.querySelectorAll('script:not([src])').forEach((el) => {
const content = el.textContent || '';
if (content.length < 5000) {
signals.scripts.push(`inline:${content.slice(0, 500)}`);
}
});
// Collect iframes
document.querySelectorAll('iframe').forEach((el) => {
signals.iframes.push(el.src);
});
// Collect links
document.querySelectorAll('a[href]').forEach((el) => {
signals.links.push((el as HTMLAnchorElement).href);
});
// Collect meta tags
document.querySelectorAll('meta').forEach((el) => {
const content = el.getAttribute('content') || '';
const name = el.getAttribute('name') || el.getAttribute('property') || '';
if (content || name) {
signals.metaTags.push(`${name}:${content}`);
}
});
// Look for JSON data
const jsonBlocks: string[] = [];
document.querySelectorAll('script[type="application/json"]').forEach((el) => {
jsonBlocks.push(el.textContent?.slice(0, 2000) || '');
});
signals.jsonBlocks = jsonBlocks;
return signals;
});
}
function analyzeCategorySignals(
category: IntelligenceCategory,
allSignals: Record<string, any>
): CategoryDetectionResult {
const providerScores: Record<MenuProvider, number> = {} as any;
const detectedSignals: Record<string, any> = {};
// Initialize scores
for (const provider of Object.keys(PROVIDER_PATTERNS)) {
providerScores[provider as MenuProvider] = 0;
}
// Analyze each page's signals
for (const [pagePath, signals] of Object.entries(allSignals)) {
if (!signals || typeof signals !== 'object') continue;
// Check for provider-specific patterns
for (const [provider, patterns] of Object.entries(PROVIDER_PATTERNS)) {
let score = 0;
// Check scripts
if (signals.scripts) {
for (const script of signals.scripts) {
for (const pattern of patterns.scripts) {
if (pattern.test(script)) {
score += 20;
detectedSignals[`${provider}_script_${pagePath}`] = script;
}
}
}
}
// Check iframes
if (signals.iframes) {
for (const iframe of signals.iframes) {
for (const pattern of patterns.iframes) {
if (pattern.test(iframe)) {
score += 25;
detectedSignals[`${provider}_iframe_${pagePath}`] = iframe;
}
}
}
}
// Check HTML content
if (signals.htmlSnippet) {
for (const pattern of patterns.html) {
if (pattern.test(signals.htmlSnippet)) {
score += 15;
detectedSignals[`${provider}_html_${pagePath}`] = true;
}
}
}
providerScores[provider as MenuProvider] += score;
}
// Check for category-specific signals on relevant pages
const categorySignals = CATEGORY_SIGNALS[category];
const isRelevantPage = categorySignals.urlPatterns.some((p) => p.test(pagePath));
if (isRelevantPage && signals.htmlSnippet) {
for (const pattern of categorySignals.htmlPatterns) {
if (pattern.test(signals.htmlSnippet)) {
detectedSignals[`${category}_html_pattern`] = true;
}
}
}
// Check JSON blocks for category data
if (signals.jsonBlocks) {
for (const json of signals.jsonBlocks) {
for (const key of categorySignals.jsonKeys) {
if (json.toLowerCase().includes(`"${key}"`)) {
detectedSignals[`${category}_json_key_${key}`] = true;
}
}
}
}
}
// Determine winning provider
let bestProvider: MenuProvider = 'unknown';
let bestScore = 0;
for (const [provider, score] of Object.entries(providerScores)) {
if (score > bestScore) {
bestScore = score;
bestProvider = provider as MenuProvider;
}
}
// Calculate confidence (0-100)
const confidence = Math.min(100, bestScore);
// Determine mode based on provider and confidence
const isProductionReady = PRODUCTION_READY[category].includes(bestProvider);
const mode: 'production' | 'sandbox' = isProductionReady && confidence >= 70
? 'production'
: 'sandbox';
// Get template name if available
let templateName: string | undefined;
if (bestProvider === 'dutchie' && category === 'product') {
templateName = 'dutchie_standard';
} else if (bestProvider === 'treez') {
templateName = 'treez_products_v0';
}
return {
provider: bestProvider,
confidence,
mode,
signals: detectedSignals,
templateName,
};
}
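// Scoring example: a Dutchie iframe (+25) and a Dutchie script tag (+20) on one page
// give Dutchie a score of 45 -> confidence 45 -> sandbox mode, since production requires
// a production-ready provider for the category at confidence >= 70.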
function createUnknownResult(): CategoryDetectionResult {
return {
provider: 'unknown',
confidence: 0,
mode: 'sandbox',
signals: {},
};
}
// ========================================
// Lightweight Per-Category Change Detection
// ========================================
export async function detectCategoryProviderChange(
page: Page,
category: IntelligenceCategory,
expectedProvider: MenuProvider
): Promise<{ changed: boolean; newProvider?: MenuProvider; confidence?: number }> {
try {
const signals = await collectPageSignals(page);
const result = analyzeCategorySignals(category, { currentPage: signals });
if (result.provider !== expectedProvider && result.confidence > 50) {
logger.warn(
'provider-detection',
`Provider change detected for ${category}: ${expectedProvider} -> ${result.provider}`
);
return {
changed: true,
newProvider: result.provider,
confidence: result.confidence,
};
}
return { changed: false };
} catch (error: any) {
logger.error('provider-detection', `Change detection failed: ${error.message}`);
return { changed: false };
}
}
// ========================================
// Database Operations
// ========================================
export async function updateDispensaryCategoryProvider(
dispensaryId: number,
category: IntelligenceCategory,
result: CategoryDetectionResult
): Promise<void> {
  // Category names map 1:1 to the dispensaries column prefixes
  const columnPrefix = category;
await pool.query(
`UPDATE dispensaries SET
${columnPrefix}_provider = $1,
${columnPrefix}_confidence = $2,
${columnPrefix}_crawler_mode = $3,
${columnPrefix}_detection_data = $4,
updated_at = NOW()
WHERE id = $5`,
[
result.provider,
result.confidence,
result.mode,
JSON.stringify(result.signals),
dispensaryId,
]
);
}
export async function updateAllCategoryProviders(
dispensaryId: number,
result: MultiCategoryDetectionResult
): Promise<void> {
await pool.query(
`UPDATE dispensaries SET
product_provider = $1,
product_confidence = $2,
product_crawler_mode = $3,
product_detection_data = $4,
specials_provider = $5,
specials_confidence = $6,
specials_crawler_mode = $7,
specials_detection_data = $8,
brand_provider = $9,
brand_confidence = $10,
brand_crawler_mode = $11,
brand_detection_data = $12,
metadata_provider = $13,
metadata_confidence = $14,
metadata_crawler_mode = $15,
metadata_detection_data = $16,
updated_at = NOW()
WHERE id = $17`,
[
result.product.provider,
result.product.confidence,
result.product.mode,
JSON.stringify(result.product.signals),
result.specials.provider,
result.specials.confidence,
result.specials.mode,
JSON.stringify(result.specials.signals),
result.brand.provider,
result.brand.confidence,
result.brand.mode,
JSON.stringify(result.brand.signals),
result.metadata.provider,
result.metadata.confidence,
result.metadata.mode,
JSON.stringify(result.metadata.signals),
dispensaryId,
]
);
}
export async function moveCategoryToSandbox(
dispensaryId: number,
category: IntelligenceCategory,
reason: string
): Promise<void> {
  // Category names map 1:1 to the dispensaries column prefixes
  const columnPrefix = category;
await pool.query(
`UPDATE dispensaries SET
${columnPrefix}_crawler_mode = 'sandbox',
${columnPrefix}_detection_data = ${columnPrefix}_detection_data || $1::jsonb,
updated_at = NOW()
WHERE id = $2`,
[
JSON.stringify({ sandbox_reason: reason, sandbox_at: new Date().toISOString() }),
dispensaryId,
]
);
logger.info('provider-detection', `Moved dispensary ${dispensaryId} ${category} to sandbox: ${reason}`);
}
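// Usage sketch (hypothetical id/reason):
//   await moveCategoryToSandbox(42, 'product', 'Provider changed from dutchie to treez');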

View File

@@ -1,7 +1,7 @@
 interface LogEntry {
   timestamp: Date;
   level: 'info' | 'error' | 'warn' | 'debug';
-  category: 'scraper' | 'images' | 'categories' | 'system' | 'api' | 'pipeline' | 'age-gate' | 'proxy';
+  category: 'scraper' | 'images' | 'categories' | 'system' | 'api' | 'pipeline' | 'age-gate' | 'proxy' | 'crawler-jobs' | 'provider-detection' | 'sandbox' | 'intelligence';
   message: string;
 }

View File

@@ -0,0 +1,726 @@
/**
* Menu Provider Detection Service
*
* Detects which menu platform a dispensary is using by analyzing:
* - HTML content patterns (scripts, iframes, classes)
* - URL patterns (embedded menu paths)
* - API endpoint signatures
* - Meta tags and headers
*/
import puppeteer, { Browser, Page } from 'puppeteer';
import { logger } from './logger';
// Known menu provider signatures
export type MenuProvider =
| 'dutchie'
| 'treez'
| 'jane'
| 'iheartjane'
| 'weedmaps'
| 'leafly'
| 'meadow'
| 'greenlight'
| 'blaze'
| 'flowhub'
| 'dispense'
| 'cova'
| 'other'
| 'unknown';
export interface DetectionSignal {
provider: MenuProvider;
confidence: number; // 0-100
source: string; // What triggered this detection
details?: string; // Additional context
}
export interface DetectionResult {
provider: MenuProvider;
confidence: number;
signals: DetectionSignal[];
urlsTested: string[];
menuEntryPoints: string[];
rawSignals: Record<string, boolean | string | number>;
error?: string;
}
// Provider detection patterns
const PROVIDER_PATTERNS: Record<string, {
scripts: RegExp[];
iframes: RegExp[];
classes: RegExp[];
urls: RegExp[];
meta: RegExp[];
apiEndpoints: RegExp[];
htmlPatterns: RegExp[];
}> = {
dutchie: {
scripts: [
/dutchie/i,
/dutchie-plus/i,
/dutchie\.com/i,
/dutchie-embed/i,
],
iframes: [
/dutchie\.com/i,
/embed\.dutchie/i,
/iframe\.dutchie/i,
],
classes: [
/dutchie-/i,
/DutchieEmbed/i,
],
urls: [
/dutchie\.com/i,
/\.dutchie\./i,
],
meta: [
/dutchie/i,
],
apiEndpoints: [
/graphql.*dutchie/i,
/api\.dutchie/i,
],
htmlPatterns: [
/data-dutchie/i,
/__DUTCHIE__/i,
/dutchie-plus-iframe/i,
],
},
treez: {
scripts: [
/treez/i,
/treez\.io/i,
/treezpay/i,
],
iframes: [
/treez\.io/i,
/menu\.treez/i,
],
classes: [
/treez-/i,
],
urls: [
/treez\.io/i,
/\.treez\./i,
],
meta: [
/treez/i,
],
apiEndpoints: [
/api\.treez/i,
],
htmlPatterns: [
/data-treez/i,
/treez-embed/i,
],
},
jane: {
scripts: [
/jane\.co/i,
/iheartjane/i,
/jane-embed/i,
/janetechnologies/i,
],
iframes: [
/jane\.co/i,
/iheartjane\.com/i,
/menu\.jane/i,
],
classes: [
/jane-/i,
/iheartjane/i,
],
urls: [
/jane\.co/i,
/iheartjane\.com/i,
],
meta: [
/jane/i,
/iheartjane/i,
],
apiEndpoints: [
/api\.iheartjane/i,
/api\.jane\.co/i,
],
htmlPatterns: [
/data-jane/i,
/jane-root/i,
/jane-embed/i,
],
},
weedmaps: {
scripts: [
/weedmaps/i,
/wm\.com/i,
],
iframes: [
/weedmaps\.com/i,
/menu\.weedmaps/i,
],
classes: [
/weedmaps-/i,
/wm-/i,
],
urls: [
/weedmaps\.com/i,
],
meta: [
/weedmaps/i,
],
apiEndpoints: [
/api.*weedmaps/i,
],
htmlPatterns: [
/data-weedmaps/i,
],
},
leafly: {
scripts: [
/leafly/i,
/leafly\.com/i,
],
iframes: [
/leafly\.com/i,
/menu\.leafly/i,
],
classes: [
/leafly-/i,
],
urls: [
/leafly\.com/i,
],
meta: [
/leafly/i,
],
apiEndpoints: [
/api\.leafly/i,
],
htmlPatterns: [
/data-leafly/i,
],
},
meadow: {
scripts: [
/meadow/i,
/getmeadow/i,
],
iframes: [
/getmeadow\.com/i,
],
classes: [
/meadow-/i,
],
urls: [
/getmeadow\.com/i,
],
meta: [],
apiEndpoints: [
/api\.getmeadow/i,
],
htmlPatterns: [],
},
greenlight: {
scripts: [
/greenlight/i,
/greenlightmenu/i,
],
iframes: [
/greenlight/i,
],
classes: [
/greenlight-/i,
],
urls: [
/greenlight/i,
],
meta: [],
apiEndpoints: [],
htmlPatterns: [],
},
blaze: {
scripts: [
/blaze\.me/i,
/blazepos/i,
],
iframes: [
/blaze\.me/i,
],
classes: [
/blaze-/i,
],
urls: [
/blaze\.me/i,
],
meta: [],
apiEndpoints: [
/api\.blaze/i,
],
htmlPatterns: [],
},
flowhub: {
scripts: [
/flowhub/i,
],
iframes: [
/flowhub\.com/i,
],
classes: [
/flowhub-/i,
],
urls: [
/flowhub\.com/i,
],
meta: [],
apiEndpoints: [],
htmlPatterns: [],
},
dispense: {
scripts: [
/dispenseapp/i,
],
iframes: [
/dispenseapp\.com/i,
],
classes: [
/dispense-/i,
],
urls: [
/dispenseapp\.com/i,
],
meta: [],
apiEndpoints: [],
htmlPatterns: [],
},
cova: {
scripts: [
/covasoftware/i,
/cova\.software/i,
],
iframes: [
/cova/i,
],
classes: [
/cova-/i,
],
urls: [
/cova/i,
],
meta: [],
apiEndpoints: [],
htmlPatterns: [],
},
};
// Common menu URL paths to check
const MENU_PATHS = [
'/menu',
'/shop',
'/products',
'/order',
'/store',
'/dispensary-menu',
'/online-menu',
'/shop-all',
'/browse',
'/catalog',
];
/**
* Analyze a single page for provider signals
*/
async function analyzePageForProviders(
page: Page,
url: string
): Promise<DetectionSignal[]> {
const signals: DetectionSignal[] = [];
  try {
    // Get page HTML and query the DOM once, rather than once per provider
    const html = await page.content();
    const scripts = await page.$$eval('script[src]', els =>
      els.map(el => el.getAttribute('src') || '')
    );
    const inlineScripts = await page.$$eval('script:not([src])', els =>
      els.map(el => el.textContent || '')
    );
    const iframes = await page.$$eval('iframe', els =>
      els.map(el => el.getAttribute('src') || '')
    );
    const metaTags = await page.$$eval('meta', els =>
      els.map(el => `${el.getAttribute('name')} ${el.getAttribute('content')}`)
    );
    // Check each provider's patterns
    for (const [provider, patterns] of Object.entries(PROVIDER_PATTERNS)) {
      // Check script sources
for (const script of scripts) {
for (const pattern of patterns.scripts) {
if (pattern.test(script)) {
signals.push({
provider: provider as MenuProvider,
confidence: 90,
source: 'script_src',
details: script,
});
}
}
}
      // Check inline scripts
for (const scriptContent of inlineScripts) {
for (const pattern of patterns.scripts) {
if (pattern.test(scriptContent)) {
signals.push({
provider: provider as MenuProvider,
confidence: 70,
source: 'inline_script',
details: `Pattern: ${pattern}`,
});
}
}
}
      // Check iframes
for (const iframe of iframes) {
for (const pattern of patterns.iframes) {
if (pattern.test(iframe)) {
signals.push({
provider: provider as MenuProvider,
confidence: 95,
source: 'iframe_src',
details: iframe,
});
}
}
}
// Check HTML patterns
for (const pattern of patterns.htmlPatterns) {
if (pattern.test(html)) {
signals.push({
provider: provider as MenuProvider,
confidence: 85,
source: 'html_pattern',
details: `Pattern: ${pattern}`,
});
}
}
// Check CSS classes
for (const pattern of patterns.classes) {
if (pattern.test(html)) {
signals.push({
provider: provider as MenuProvider,
confidence: 60,
source: 'css_class',
details: `Pattern: ${pattern}`,
});
}
}
      // Check meta tags
for (const meta of metaTags) {
for (const pattern of patterns.meta) {
if (pattern.test(meta)) {
signals.push({
provider: provider as MenuProvider,
confidence: 80,
source: 'meta_tag',
details: meta,
});
}
}
}
}
    // API/network request signals are gathered by the caller via request
    // interception (see detectMenuProvider below)
} catch (error) {
logger.error('provider-detection', `Error analyzing page ${url}: ${error}`);
}
return signals;
}
/**
* Aggregate signals into a final detection result
*/
function aggregateSignals(signals: DetectionSignal[]): { provider: MenuProvider; confidence: number } {
if (signals.length === 0) {
return { provider: 'unknown', confidence: 0 };
}
// Group signals by provider
const providerScores: Record<string, number[]> = {};
for (const signal of signals) {
if (!providerScores[signal.provider]) {
providerScores[signal.provider] = [];
}
providerScores[signal.provider].push(signal.confidence);
}
// Calculate weighted score for each provider
const scores: { provider: MenuProvider; score: number }[] = [];
for (const [provider, confidences] of Object.entries(providerScores)) {
// Use max confidence + bonus for multiple signals
const maxConf = Math.max(...confidences);
const multiSignalBonus = Math.min(10, (confidences.length - 1) * 3);
const score = Math.min(100, maxConf + multiSignalBonus);
scores.push({ provider: provider as MenuProvider, score });
}
// Sort by score descending
scores.sort((a, b) => b.score - a.score);
const best = scores[0];
// If there's a clear winner (20+ point lead), use it
if (scores.length === 1 || best.score - scores[1].score >= 20) {
return { provider: best.provider, confidence: best.score };
}
// Multiple contenders - reduce confidence
return { provider: best.provider, confidence: Math.max(50, best.score - 20) };
}
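// Worked example: signals [{dutchie, 95}, {dutchie, 90}, {jane, 60}] score dutchie at
// min(100, 95 + min(10, 1 * 3)) = 98 and jane at 60; the 38-point lead is a clear
// winner, so the result is { provider: 'dutchie', confidence: 98 }.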
/**
* Detect the menu provider for a dispensary
*/
export async function detectMenuProvider(
websiteUrl: string,
options: {
checkMenuPaths?: boolean;
timeout?: number;
} = {}
): Promise<DetectionResult> {
const { checkMenuPaths = true, timeout = 30000 } = options;
const result: DetectionResult = {
provider: 'unknown',
confidence: 0,
signals: [],
urlsTested: [],
menuEntryPoints: [],
rawSignals: {},
};
let browser: Browser | null = null;
try {
// Normalize URL
let baseUrl = websiteUrl.trim();
if (!baseUrl.startsWith('http')) {
baseUrl = `https://${baseUrl}`;
}
baseUrl = baseUrl.replace(/\/$/, ''); // Remove trailing slash
// Launch browser
browser = await puppeteer.launch({
headless: true,
args: [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-gpu',
],
});
const page = await browser.newPage();
await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
);
// Track network requests for API detection
const apiRequests: string[] = [];
await page.setRequestInterception(true);
page.on('request', (request) => {
const url = request.url();
if (url.includes('api') || url.includes('graphql')) {
apiRequests.push(url);
}
request.continue();
});
// URLs to check
const urlsToCheck = [baseUrl];
if (checkMenuPaths) {
for (const path of MENU_PATHS) {
urlsToCheck.push(`${baseUrl}${path}`);
}
}
// Check each URL
for (const url of urlsToCheck) {
try {
result.urlsTested.push(url);
await page.goto(url, {
waitUntil: 'networkidle2',
timeout,
});
// Wait a bit for dynamic content
await new Promise(r => setTimeout(r, 2000));
// Analyze page
const pageSignals = await analyzePageForProviders(page, url);
result.signals.push(...pageSignals);
// Track if this URL has menu content
const hasMenuContent = await page.evaluate(() => {
const text = document.body.innerText.toLowerCase();
return (
text.includes('add to cart') ||
text.includes('add to bag') ||
text.includes('product') ||
text.includes('indica') ||
text.includes('sativa') ||
text.includes('hybrid') ||
text.includes('thc') ||
text.includes('cbd')
);
});
if (hasMenuContent && url !== baseUrl) {
result.menuEntryPoints.push(url);
}
} catch (pageError: any) {
// 404s are fine, just skip
if (!pageError.message?.includes('404')) {
logger.warn('provider-detection', `Could not load ${url}: ${pageError.message}`);
}
}
}
// Check API requests for provider hints
for (const apiUrl of apiRequests) {
for (const [provider, patterns] of Object.entries(PROVIDER_PATTERNS)) {
for (const pattern of patterns.apiEndpoints) {
if (pattern.test(apiUrl)) {
result.signals.push({
provider: provider as MenuProvider,
confidence: 95,
source: 'api_request',
details: apiUrl,
});
}
}
}
}
// Record raw signals
result.rawSignals = {
apiRequestsFound: apiRequests.length,
menuEntryPointsFound: result.menuEntryPoints.length,
totalSignals: result.signals.length,
uniqueProviders: [...new Set(result.signals.map(s => s.provider))].length,
};
// Aggregate signals into final result
const aggregated = aggregateSignals(result.signals);
result.provider = aggregated.provider;
result.confidence = aggregated.confidence;
} catch (error: any) {
result.error = error.message;
logger.error('provider-detection', `Detection failed for ${websiteUrl}: ${error.message}`);
} finally {
if (browser) {
await browser.close();
}
}
return result;
}
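// Usage sketch (hypothetical URL):
//   const result = await detectMenuProvider('https://example-dispensary.com', { checkMenuPaths: true });
//   if (result.provider === 'dutchie' && result.confidence >= 70) {
//     // production candidate (see runDetectMenuProviderJob for the real decision logic)
//   }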
/**
* Quick check if a site has Dutchie - used during production crawls
*/
export async function quickDutchieCheck(page: Page): Promise<boolean> {
try {
const html = await page.content();
// Check for Dutchie-specific patterns
const dutchiePatterns = [
/dutchie/i,
/dutchie-plus/i,
/__DUTCHIE__/i,
/data-dutchie/i,
/embed\.dutchie/i,
];
for (const pattern of dutchiePatterns) {
if (pattern.test(html)) {
return true;
}
}
// Check iframes
const iframes = await page.$$eval('iframe', els =>
els.map(el => el.getAttribute('src') || '')
);
for (const iframe of iframes) {
if (/dutchie/i.test(iframe)) {
return true;
}
}
return false;
} catch {
return false;
}
}
/**
* Check if provider has changed from expected
*/
export async function detectProviderChange(
page: Page,
expectedProvider: MenuProvider
): Promise<{ changed: boolean; newProvider?: MenuProvider; confidence?: number }> {
try {
const signals = await analyzePageForProviders(page, page.url());
const aggregated = aggregateSignals(signals);
// If we expected Dutchie but found something else with high confidence
if (expectedProvider === 'dutchie' && aggregated.provider !== 'dutchie' && aggregated.confidence >= 70) {
return {
changed: true,
newProvider: aggregated.provider,
confidence: aggregated.confidence,
};
}
// If we expected Dutchie and found nothing/low confidence, might have switched
if (expectedProvider === 'dutchie' && aggregated.confidence < 30) {
// Check if Dutchie is definitely NOT present
const hasDutchie = await quickDutchieCheck(page);
if (!hasDutchie) {
return {
changed: true,
newProvider: aggregated.provider !== 'unknown' ? aggregated.provider : 'other',
confidence: Math.max(30, aggregated.confidence),
};
}
}
return { changed: false };
} catch {
return { changed: false };
}
}
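// Usage sketch during a production crawl (page already navigated to the store menu):
//   const change = await detectProviderChange(page, 'dutchie');
//   if (change.changed) {
//     // move the dispensary to sandbox and queue re-detection (see runDutchieMenuCrawlJob)
//   }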

View File

@@ -0,0 +1,441 @@
/**
* Store Crawl Orchestrator
*
* Orchestrates the complete crawl workflow for a store:
* 1. Load store and its linked dispensary
* 2. Check if provider detection is needed
* 3. Run provider detection if needed
* 4. Queue appropriate crawl jobs based on provider/mode
* 5. Update store_crawl_schedule with meaningful status
*
* This replaces the simple "triggerManualCrawl" with intelligent orchestration.
*/
import { v4 as uuidv4 } from 'uuid';
import { pool } from '../db/migrate';
import { crawlerLogger } from './crawler-logger';
import {
detectMultiCategoryProviders,
updateAllCategoryProviders,
MultiCategoryDetectionResult,
} from './intelligence-detector';
import { runCrawlProductsJob, runSandboxProductsJob } from './category-crawler-jobs';
import { scrapeStore } from '../scraper-v2';
// ========================================
// Types
// ========================================
export type OrchestratorStatus = 'success' | 'error' | 'sandbox_only' | 'detection_only' | 'pending' | 'running';
export interface OrchestratorResult {
status: OrchestratorStatus;
summary: string;
runId: string;
storeId: number;
dispensaryId: number | null;
detectionRan: boolean;
detectionResult?: MultiCategoryDetectionResult;
crawlRan: boolean;
crawlType?: 'production' | 'sandbox' | 'none';
productsFound?: number;
productsNew?: number;
productsUpdated?: number;
error?: string;
durationMs: number;
}
interface StoreWithDispensary {
id: number;
name: string;
slug: string;
timezone: string;
dispensary_id: number | null;
dispensary_name: string | null;
dispensary_menu_url: string | null;
dispensary_website: string | null;
product_provider: string | null;
product_confidence: number | null;
product_crawler_mode: string | null;
last_product_scan_at: Date | null;
}
// ========================================
// Main Orchestrator Function
// ========================================
/**
* Run the complete crawl orchestration for a store
*
* Behavior:
* 1. Load the store and its linked dispensary
* 2. If no dispensary is linked, report error
 * 3. If product_provider is missing, unknown, low-confidence (<50%), or stale (>7 days), run detection
* 4. After detection:
* - If product_provider = 'dutchie' and product_crawler_mode = 'production': Run production crawl
* - Otherwise: Run sandbox crawl
* 5. Update store_crawl_schedule with status/summary
*/
export async function runStoreCrawlOrchestrator(storeId: number): Promise<OrchestratorResult> {
const startTime = Date.now();
const runId = uuidv4();
let result: OrchestratorResult = {
status: 'pending',
summary: '',
runId,
storeId,
dispensaryId: null,
detectionRan: false,
crawlRan: false,
durationMs: 0,
};
try {
// Mark schedule as running
await updateScheduleStatus(storeId, 'running', 'Starting orchestrator...', runId);
// 1. Load store with dispensary info
const store = await getStoreWithDispensary(storeId);
if (!store) {
throw new Error(`Store ${storeId} not found`);
}
result.dispensaryId = store.dispensary_id;
// 2. Check if dispensary is linked
if (!store.dispensary_id) {
result.status = 'error';
result.summary = 'No dispensary linked - cannot determine provider';
result.error = 'Store is not linked to a dispensary. Link it in the Dispensaries page.';
await updateScheduleStatus(storeId, 'error', result.summary, runId, result.error);
result.durationMs = Date.now() - startTime;
return result;
}
// 3. Check if provider detection is needed
const needsDetection = await checkNeedsDetection(store);
if (needsDetection) {
// Run provider detection
const websiteUrl = store.dispensary_menu_url || store.dispensary_website;
if (!websiteUrl) {
result.status = 'error';
result.summary = 'No website URL available for detection';
result.error = 'Dispensary has no menu_url or website configured';
await updateScheduleStatus(storeId, 'error', result.summary, runId, result.error);
result.durationMs = Date.now() - startTime;
return result;
}
await updateScheduleStatus(storeId, 'running', 'Running provider detection...', runId);
const detectionResult = await detectMultiCategoryProviders(websiteUrl);
result.detectionRan = true;
result.detectionResult = detectionResult;
// Save detection results to dispensary
await updateAllCategoryProviders(store.dispensary_id, detectionResult);
crawlerLogger.providerDetected({
dispensary_id: store.dispensary_id,
dispensary_name: store.dispensary_name || store.name,
detected_provider: detectionResult.product.provider,
confidence: detectionResult.product.confidence,
detection_method: 'orchestrator_run',
menu_url: websiteUrl,
category: 'product',
});
// Refresh store info after detection
const updatedStore = await getStoreWithDispensary(storeId);
if (updatedStore) {
Object.assign(store, updatedStore);
}
}
// 4. Determine crawl type and run
const provider = store.product_provider;
const mode = store.product_crawler_mode;
if (provider === 'dutchie' && mode === 'production') {
// Production Dutchie crawl
await updateScheduleStatus(storeId, 'running', 'Running Dutchie production crawl...', runId);
try {
// Run the actual scraper
await scrapeStore(storeId);
// Get crawl stats from the latest job
const stats = await getLatestCrawlStats(storeId);
result.crawlRan = true;
result.crawlType = 'production';
result.productsFound = stats.products_found ?? undefined;
result.productsNew = stats.products_new ?? undefined;
result.productsUpdated = stats.products_updated ?? undefined;
const detectionPart = result.detectionRan ? 'Detection + ' : '';
result.summary = `${detectionPart}Dutchie products crawl (${stats.products_found || 0} items, ${stats.products_new || 0} new, ${stats.products_updated || 0} updated)`;
result.status = 'success';
// Update store's last_scraped_at
await pool.query('UPDATE stores SET last_scraped_at = NOW() WHERE id = $1', [storeId]);
crawlerLogger.jobCompleted({
job_id: 0, // Orchestrator doesn't create traditional jobs
store_id: storeId,
store_name: store.name,
duration_ms: Date.now() - startTime,
products_found: stats.products_found || 0,
products_new: stats.products_new || 0,
products_updated: stats.products_updated || 0,
provider: 'dutchie',
});
} catch (crawlError: any) {
result.status = 'error';
result.error = crawlError.message;
result.summary = `Dutchie crawl failed: ${crawlError.message.slice(0, 100)}`;
result.crawlRan = true;
result.crawlType = 'production';
crawlerLogger.jobFailed({
job_id: 0,
store_id: storeId,
store_name: store.name,
duration_ms: Date.now() - startTime,
error_message: crawlError.message,
provider: 'dutchie',
});
}
} else if (provider && provider !== 'unknown') {
// Sandbox crawl for non-Dutchie or sandbox mode
await updateScheduleStatus(storeId, 'running', `Running ${provider} sandbox crawl...`, runId);
try {
const sandboxResult = await runSandboxProductsJob(store.dispensary_id);
result.crawlRan = true;
result.crawlType = 'sandbox';
result.productsFound = sandboxResult.data?.productsExtracted || 0;
const detectionPart = result.detectionRan ? 'Detection + ' : '';
if (sandboxResult.success) {
result.summary = `${detectionPart}${provider} sandbox crawl (${result.productsFound} items, quality ${sandboxResult.data?.qualityScore || 0}%)`;
result.status = 'sandbox_only';
} else {
result.summary = `${detectionPart}${provider} sandbox failed: ${sandboxResult.message}`;
result.status = 'error';
result.error = sandboxResult.message;
}
} catch (sandboxError: any) {
result.status = 'error';
result.error = sandboxError.message;
result.summary = `Sandbox crawl failed: ${sandboxError.message.slice(0, 100)}`;
result.crawlRan = true;
result.crawlType = 'sandbox';
}
} else {
// No provider detected - detection only
if (result.detectionRan) {
result.summary = `Detection complete: provider=${store.product_provider || 'unknown'}, confidence=${store.product_confidence || 0}%`;
result.status = 'detection_only';
} else {
result.summary = 'No provider detected and no crawl possible';
result.status = 'error';
result.error = 'Could not determine menu provider';
}
}
} catch (error: any) {
result.status = 'error';
result.error = error.message;
result.summary = `Orchestrator error: ${error.message.slice(0, 100)}`;
crawlerLogger.queueFailure({
queue_type: 'orchestrator',
error_message: error.message,
});
}
result.durationMs = Date.now() - startTime;
// Update final schedule status
await updateScheduleStatus(storeId, result.status, result.summary, runId, result.error);
// Create a crawl_job record for tracking
await createOrchestratorJobRecord(storeId, result);
return result;
}
// ========================================
// Helper Functions
// ========================================
async function getStoreWithDispensary(storeId: number): Promise<StoreWithDispensary | null> {
const result = await pool.query(
`SELECT
s.id, s.name, s.slug, s.timezone, s.dispensary_id,
d.name as dispensary_name,
d.menu_url as dispensary_menu_url,
d.website as dispensary_website,
d.product_provider,
d.product_confidence,
d.product_crawler_mode,
d.last_product_scan_at
FROM stores s
LEFT JOIN dispensaries d ON d.id = s.dispensary_id
WHERE s.id = $1`,
[storeId]
);
return result.rows[0] || null;
}
async function checkNeedsDetection(store: StoreWithDispensary): Promise<boolean> {
// No dispensary = can't detect
if (!store.dispensary_id) return false;
// No provider = definitely needs detection
if (!store.product_provider) return true;
// Unknown provider = needs detection
if (store.product_provider === 'unknown') return true;
// Low confidence = needs re-detection
if (store.product_confidence !== null && store.product_confidence < 50) return true;
// Stale detection (> 7 days) = needs refresh
if (store.last_product_scan_at) {
const daysSince = (Date.now() - new Date(store.last_product_scan_at).getTime()) / (1000 * 60 * 60 * 24);
if (daysSince > 7) return true;
}
return false;
}
async function updateScheduleStatus(
storeId: number,
status: OrchestratorStatus,
summary: string,
runId: string,
error?: string
): Promise<void> {
await pool.query(
`INSERT INTO store_crawl_schedule (store_id, last_status, last_summary, last_run_at, last_error)
VALUES ($1, $2, $3, NOW(), $4)
ON CONFLICT (store_id) DO UPDATE SET
last_status = $2,
last_summary = $3,
last_run_at = NOW(),
last_error = $4,
updated_at = NOW()`,
[storeId, status, summary, error || null]
);
}
async function getLatestCrawlStats(storeId: number): Promise<{
products_found: number | null;
products_new: number | null;
products_updated: number | null;
}> {
  // Approximate stats for the crawl that just ran: products created within the
  // last hour count as "new"; products updated (but not created) in that window
  // count as "updated". This is a heuristic, not an exact per-job tally.
const result = await pool.query(
`SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE created_at > NOW() - INTERVAL '1 hour') as recent_new,
COUNT(*) FILTER (WHERE updated_at > NOW() - INTERVAL '1 hour' AND created_at < NOW() - INTERVAL '1 hour') as recent_updated
FROM products
WHERE store_id = $1`,
[storeId]
);
return {
products_found: parseInt(result.rows[0]?.total || '0'),
products_new: parseInt(result.rows[0]?.recent_new || '0'),
products_updated: parseInt(result.rows[0]?.recent_updated || '0'),
};
}
async function createOrchestratorJobRecord(storeId: number, result: OrchestratorResult): Promise<void> {
await pool.query(
`INSERT INTO crawl_jobs (
store_id, job_type, trigger_type, status, priority,
scheduled_at, started_at, completed_at,
products_found, products_new, products_updated,
error_message, orchestrator_run_id, detection_result
) VALUES (
$1, 'orchestrator', 'manual', $2, 100,
NOW(), NOW(), NOW(),
$3, $4, $5,
$6, $7, $8
)`,
[
storeId,
result.status === 'success' ? 'completed' : result.status === 'error' ? 'failed' : 'completed',
result.productsFound || null,
result.productsNew || null,
result.productsUpdated || null,
result.error || null,
result.runId,
result.detectionResult ? JSON.stringify({
product_provider: result.detectionResult.product.provider,
product_confidence: result.detectionResult.product.confidence,
product_mode: result.detectionResult.product.mode,
}) : null,
]
);
}
// ========================================
// Batch Orchestration
// ========================================
/**
* Run orchestrator for multiple stores
*/
export async function runBatchOrchestrator(
storeIds: number[],
concurrency: number = 3
): Promise<OrchestratorResult[]> {
const results: OrchestratorResult[] = [];
// Process in batches
for (let i = 0; i < storeIds.length; i += concurrency) {
const batch = storeIds.slice(i, i + concurrency);
const batchResults = await Promise.all(
batch.map(storeId => runStoreCrawlOrchestrator(storeId))
);
results.push(...batchResults);
}
return results;
}
/**
* Get stores that are due for orchestration
*/
export async function getStoresDueForOrchestration(limit: number = 10): Promise<number[]> {
const result = await pool.query(
`SELECT s.id
FROM stores s
LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
WHERE s.active = TRUE
AND s.scrape_enabled = TRUE
AND COALESCE(scs.enabled, TRUE) = TRUE
AND (
scs.last_run_at IS NULL
OR scs.last_run_at < NOW() - (COALESCE(scs.interval_hours, 4) || ' hours')::INTERVAL
)
AND (scs.last_status IS NULL OR scs.last_status NOT IN ('running', 'pending'))
ORDER BY COALESCE(scs.priority, 0) DESC, scs.last_run_at ASC NULLS FIRST
LIMIT $1`,
[limit]
);
return result.rows.map(row => row.id);
}
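A minimal sketch of a polling scheduler built on these exports; the tick interval, batch limit, and concurrency here are assumptions for illustration, not values this module defines:

// Hypothetical scheduler tick: orchestrate up to 10 due stores, 3 at a time.
async function schedulerTick(): Promise<void> {
  const dueStoreIds = await getStoresDueForOrchestration(10);
  if (dueStoreIds.length === 0) return;
  const results = await runBatchOrchestrator(dueStoreIds, 3);
  for (const r of results) {
    console.log(`[orchestrator] store=${r.storeId} status=${r.status} - ${r.summary}`);
  }
}

// e.g. setInterval(() => schedulerTick().catch(console.error), 5 * 60 * 1000);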

View File

@@ -16,6 +16,7 @@ import { Settings } from './pages/Settings';
import { Proxies } from './pages/Proxies';
import { Logs } from './pages/Logs';
import { ScraperMonitor } from './pages/ScraperMonitor';
import { ScraperSchedule } from './pages/ScraperSchedule';
import { ScraperTools } from './pages/ScraperTools';
import { ChangeApproval } from './pages/ChangeApproval';
import { ApiPermissions } from './pages/ApiPermissions';
@@ -44,6 +45,7 @@ export default function App() {
<Route path="/logs" element={<PrivateRoute><Logs /></PrivateRoute>} /> <Route path="/logs" element={<PrivateRoute><Logs /></PrivateRoute>} />
<Route path="/scraper-tools" element={<PrivateRoute><ScraperTools /></PrivateRoute>} /> <Route path="/scraper-tools" element={<PrivateRoute><ScraperTools /></PrivateRoute>} />
<Route path="/scraper-monitor" element={<PrivateRoute><ScraperMonitor /></PrivateRoute>} /> <Route path="/scraper-monitor" element={<PrivateRoute><ScraperMonitor /></PrivateRoute>} />
<Route path="/scraper-schedule" element={<PrivateRoute><ScraperSchedule /></PrivateRoute>} />
<Route path="/api-permissions" element={<PrivateRoute><ApiPermissions /></PrivateRoute>} /> <Route path="/api-permissions" element={<PrivateRoute><ApiPermissions /></PrivateRoute>} />
<Route path="*" element={<Navigate to="/" replace />} /> <Route path="*" element={<Navigate to="/" replace />} />
</Routes> </Routes>

View File

@@ -11,6 +11,7 @@ import {
TrendingUp,
Wrench,
Activity,
Clock,
Shield,
FileText,
Settings,
@@ -147,6 +148,12 @@ export function Layout({ children }: LayoutProps) {
label="Tools" label="Tools"
isActive={isActive('/scraper-tools')} isActive={isActive('/scraper-tools')}
/> />
<NavLink
to="/scraper-schedule"
icon={<Clock className="w-4 h-4" />}
label="Schedule"
isActive={isActive('/scraper-schedule')}
/>
<NavLink
to="/scraper-monitor"
icon={<Activity className="w-4 h-4" />}

View File

@@ -423,6 +423,67 @@ class ApiClient {
method: 'DELETE',
});
}
// Crawler Schedule
async getGlobalSchedule() {
return this.request<{ schedules: any[] }>('/api/schedule/global');
}
async updateGlobalSchedule(type: string, data: { enabled?: boolean; interval_hours?: number; run_time?: string }) {
return this.request<{ schedule: any; message: string }>(`/api/schedule/global/${type}`, {
method: 'PUT',
body: JSON.stringify(data),
});
}
async getStoreSchedules() {
return this.request<{ stores: any[] }>('/api/schedule/stores');
}
async getStoreSchedule(storeId: number) {
return this.request<{ schedule: any }>(`/api/schedule/stores/${storeId}`);
}
async updateStoreSchedule(storeId: number, data: any) {
return this.request<{ schedule: any }>(`/api/schedule/stores/${storeId}`, {
method: 'PUT',
body: JSON.stringify(data),
});
}
async getCrawlJobs(limit?: number) {
const params = limit ? `?limit=${limit}` : '';
return this.request<{ jobs: any[] }>(`/api/schedule/jobs${params}`);
}
async getStoreCrawlJobs(storeId: number, limit?: number) {
const params = limit ? `?limit=${limit}` : '';
return this.request<{ jobs: any[] }>(`/api/schedule/jobs/store/${storeId}${params}`);
}
async cancelCrawlJob(jobId: number) {
return this.request<{ success: boolean; message: string }>(`/api/schedule/jobs/${jobId}/cancel`, {
method: 'POST',
});
}
async triggerStoreCrawl(storeId: number) {
return this.request<{ job: any; message: string }>(`/api/schedule/trigger/store/${storeId}`, {
method: 'POST',
});
}
async triggerAllCrawls() {
return this.request<{ jobs_created: number; message: string }>('/api/schedule/trigger/all', {
method: 'POST',
});
}
async restartScheduler() {
return this.request<{ message: string }>('/api/schedule/restart', {
method: 'POST',
});
}
}
export const api = new ApiClient(API_URL);
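A hedged sketch of calling the new schedule methods from the admin UI; the store id, interval, and wrapper function are illustrative assumptions, not part of this client:

// Example: switch global crawls to every 6 hours, queue one store manually,
// then read back the job queue.
async function exampleScheduleUsage(): Promise<void> {
  await api.updateGlobalSchedule('global_interval', { interval_hours: 6 });
  await api.triggerStoreCrawl(42); // 42 is a placeholder store id
  const { jobs } = await api.getCrawlJobs(25);
  console.log(`queue depth: ${jobs.length}`);
}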

View File

@@ -1,5 +1,5 @@
import { useEffect, useState } from 'react';
import { useParams, useNavigate, Link } from 'react-router-dom';
import { Layout } from '../components/Layout';
import { api } from '../lib/api';
import {
@@ -15,7 +15,8 @@ import {
DollarSign,
Calendar,
RefreshCw,
ChevronDown,
Clock
} from 'lucide-react';
export function DispensaryDetail() {
@@ -33,6 +34,19 @@ export function DispensaryDetail() {
const [currentPage, setCurrentPage] = useState(1);
const [itemsPerPage] = useState(25);
const formatDate = (dateStr: string) => {
if (!dateStr) return 'Never';
const date = new Date(dateStr);
const now = new Date();
const diffMs = now.getTime() - date.getTime();
const diffDays = Math.floor(diffMs / (1000 * 60 * 60 * 24));
if (diffDays === 0) return 'Today';
if (diffDays === 1) return 'Yesterday';
if (diffDays < 7) return `${diffDays} days ago`;
return date.toLocaleDateString();
};
useEffect(() => {
loadDispensary();
}, [slug]);
@@ -274,6 +288,13 @@ export function DispensaryDetail() {
<span>AZDHS Profile</span>
</a>
)}
<Link
to="/schedule"
className="flex items-center gap-2 text-sm text-blue-600 hover:text-blue-800"
>
<Clock className="w-4 h-4" />
<span>View Schedule</span>
</Link>
</div>
</div>
@@ -424,7 +445,8 @@ export function DispensaryDetail() {
<th className="text-center">CBD %</th> <th className="text-center">CBD %</th>
<th className="text-center">Strain Type</th> <th className="text-center">Strain Type</th>
<th className="text-center">In Stock</th> <th className="text-center">In Stock</th>
<th>Link</th> <th>Last Updated</th>
<th>Actions</th>
</tr> </tr>
</thead> </thead>
<tbody> <tbody>
@@ -490,17 +512,28 @@ export function DispensaryDetail() {
<span className="badge badge-error badge-sm">No</span> <span className="badge badge-error badge-sm">No</span>
) : '-'} ) : '-'}
</td> </td>
<td className="whitespace-nowrap text-xs text-gray-500">
{product.updated_at ? formatDate(product.updated_at) : '-'}
</td>
<td>
<div className="flex gap-1">
{product.dutchie_url && (
<a
href={product.dutchie_url}
target="_blank"
rel="noopener noreferrer"
className="btn btn-xs btn-outline"
>
Dutchie
</a>
)}
<button
onClick={() => navigate(`/products/${product.id}`)}
className="btn btn-xs btn-primary"
>
Details
</button>
</div>
</td>
</tr>
))}

View File

@@ -0,0 +1,723 @@
import { useEffect, useState } from 'react';
import { Link } from 'react-router-dom';
import { Layout } from '../components/Layout';
import { api } from '../lib/api';
interface GlobalSchedule {
id: number;
schedule_type: string;
enabled: boolean;
interval_hours?: number;
run_time?: string;
description?: string;
}
interface StoreSchedule {
store_id: number;
store_name: string;
store_slug: string;
timezone: string;
active: boolean;
scrape_enabled: boolean;
last_scraped_at: string | null;
schedule_enabled: boolean;
interval_hours: number;
daily_special_enabled: boolean;
daily_special_time: string;
priority: number;
next_scheduled_run: string;
latest_job_id: number | null;
latest_job_status: string | null;
latest_job_type: string | null;
latest_job_trigger: string | null;
latest_job_started: string | null;
latest_job_completed: string | null;
latest_products_found: number | null;
latest_products_new: number | null;
latest_products_updated: number | null;
latest_job_error: string | null;
// Dispensary info (from master AZDHS directory)
dispensary_id: number | null;
dispensary_name: string | null;
dispensary_company: string | null;
dispensary_city: string | null;
// Provider intelligence (from dispensary)
product_provider: string | null;
product_confidence: number | null;
product_crawler_mode: string | null;
// Orchestrator status
last_status: string | null;
last_summary: string | null;
schedule_last_run: string | null;
last_error: string | null;
}
interface CrawlJob {
id: number;
store_id: number;
store_name: string;
job_type: string;
trigger_type: string;
status: string;
priority: number;
scheduled_at: string;
started_at: string | null;
completed_at: string | null;
products_found: number | null;
products_new: number | null;
products_updated: number | null;
error_message: string | null;
}
export function ScraperSchedule() {
const [globalSchedules, setGlobalSchedules] = useState<GlobalSchedule[]>([]);
const [storeSchedules, setStoreSchedules] = useState<StoreSchedule[]>([]);
const [jobs, setJobs] = useState<CrawlJob[]>([]);
const [loading, setLoading] = useState(true);
const [autoRefresh, setAutoRefresh] = useState(true);
const [activeTab, setActiveTab] = useState<'stores' | 'jobs' | 'global'>('stores');
const [triggeringStore, setTriggeringStore] = useState<number | null>(null);
useEffect(() => {
loadData();
if (autoRefresh) {
const interval = setInterval(loadData, 5000);
return () => clearInterval(interval);
}
}, [autoRefresh]);
const loadData = async () => {
try {
const [globalData, storesData, jobsData] = await Promise.all([
api.getGlobalSchedule(),
api.getStoreSchedules(),
api.getCrawlJobs(100)
]);
setGlobalSchedules(globalData.schedules || []);
setStoreSchedules(storesData.stores || []);
setJobs(jobsData.jobs || []);
} catch (error) {
console.error('Failed to load schedule data:', error);
} finally {
setLoading(false);
}
};
const handleTriggerCrawl = async (storeId: number) => {
setTriggeringStore(storeId);
try {
await api.triggerStoreCrawl(storeId);
await loadData();
} catch (error) {
console.error('Failed to trigger crawl:', error);
} finally {
setTriggeringStore(null);
}
};
const handleTriggerAll = async () => {
if (!confirm('This will create crawl jobs for ALL active stores. Continue?')) return;
try {
const result = await api.triggerAllCrawls();
alert(`Created ${result.jobs_created} crawl jobs`);
await loadData();
} catch (error) {
console.error('Failed to trigger all crawls:', error);
}
};
const handleCancelJob = async (jobId: number) => {
try {
await api.cancelCrawlJob(jobId);
await loadData();
} catch (error) {
console.error('Failed to cancel job:', error);
}
};
const handleUpdateGlobalSchedule = async (type: string, data: any) => {
try {
await api.updateGlobalSchedule(type, data);
await loadData();
} catch (error) {
console.error('Failed to update global schedule:', error);
}
};
const formatTimeAgo = (dateString: string | null) => {
if (!dateString) return 'Never';
const date = new Date(dateString);
const now = new Date();
const diffMs = now.getTime() - date.getTime();
const diffMins = Math.floor(diffMs / 60000);
const diffHours = Math.floor(diffMins / 60);
const diffDays = Math.floor(diffHours / 24);
if (diffMins < 1) return 'Just now';
if (diffMins < 60) return `${diffMins}m ago`;
if (diffHours < 24) return `${diffHours}h ago`;
return `${diffDays}d ago`;
};
const formatTimeUntil = (dateString: string) => {
const date = new Date(dateString);
const now = new Date();
const diffMs = date.getTime() - now.getTime();
if (diffMs < 0) return 'Overdue';
const diffMins = Math.floor(diffMs / 60000);
const diffHours = Math.floor(diffMins / 60);
if (diffMins < 60) return `${diffMins}m`;
return `${diffHours}h ${diffMins % 60}m`;
};
const getStatusColor = (status: string) => {
switch (status) {
case 'completed':
case 'success': return { bg: '#d1fae5', color: '#065f46' };
case 'running': return { bg: '#dbeafe', color: '#1e40af' };
case 'failed':
case 'error': return { bg: '#fee2e2', color: '#991b1b' };
case 'cancelled': return { bg: '#f3f4f6', color: '#374151' };
case 'pending': return { bg: '#fef3c7', color: '#92400e' };
case 'sandbox_only': return { bg: '#e0e7ff', color: '#3730a3' };
case 'detection_only': return { bg: '#fce7f3', color: '#9d174d' };
default: return { bg: '#f3f4f6', color: '#374151' };
}
};
const getProviderBadge = (provider: string | null, mode: string | null) => {
if (!provider) return null;
const isProduction = mode === 'production';
return {
label: provider,
bg: isProduction ? '#d1fae5' : '#fef3c7',
color: isProduction ? '#065f46' : '#92400e',
suffix: isProduction ? '' : ' (sandbox)'
};
};
const globalIntervalSchedule = globalSchedules.find(s => s.schedule_type === 'global_interval');
const dailySpecialSchedule = globalSchedules.find(s => s.schedule_type === 'daily_special');
return (
<Layout>
<div>
<div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: '30px' }}>
<h1 style={{ fontSize: '32px', margin: 0 }}>Crawler Schedule</h1>
<div style={{ display: 'flex', gap: '15px', alignItems: 'center' }}>
<label style={{ display: 'flex', alignItems: 'center', gap: '10px', cursor: 'pointer' }}>
<input
type="checkbox"
checked={autoRefresh}
onChange={(e) => setAutoRefresh(e.target.checked)}
style={{ width: '18px', height: '18px', cursor: 'pointer' }}
/>
<span>Auto-refresh (5s)</span>
</label>
<button
onClick={handleTriggerAll}
style={{
padding: '10px 20px',
background: '#2563eb',
color: 'white',
border: 'none',
borderRadius: '6px',
cursor: 'pointer',
fontWeight: '600'
}}
>
Crawl All Stores
</button>
</div>
</div>
{/* Tabs */}
<div style={{ marginBottom: '30px', display: 'flex', gap: '10px', borderBottom: '2px solid #eee' }}>
<button
onClick={() => setActiveTab('stores')}
style={{
padding: '12px 24px',
background: activeTab === 'stores' ? 'white' : 'transparent',
border: 'none',
borderBottom: activeTab === 'stores' ? '3px solid #2563eb' : '3px solid transparent',
cursor: 'pointer',
fontSize: '16px',
fontWeight: activeTab === 'stores' ? '600' : '400',
color: activeTab === 'stores' ? '#2563eb' : '#666',
marginBottom: '-2px'
}}
>
Store Schedules
</button>
<button
onClick={() => setActiveTab('jobs')}
style={{
padding: '12px 24px',
background: activeTab === 'jobs' ? 'white' : 'transparent',
border: 'none',
borderBottom: activeTab === 'jobs' ? '3px solid #2563eb' : '3px solid transparent',
cursor: 'pointer',
fontSize: '16px',
fontWeight: activeTab === 'jobs' ? '600' : '400',
color: activeTab === 'jobs' ? '#2563eb' : '#666',
marginBottom: '-2px'
}}
>
Job Queue ({jobs.filter(j => j.status === 'pending' || j.status === 'running').length})
</button>
<button
onClick={() => setActiveTab('global')}
style={{
padding: '12px 24px',
background: activeTab === 'global' ? 'white' : 'transparent',
border: 'none',
borderBottom: activeTab === 'global' ? '3px solid #2563eb' : '3px solid transparent',
cursor: 'pointer',
fontSize: '16px',
fontWeight: activeTab === 'global' ? '600' : '400',
color: activeTab === 'global' ? '#2563eb' : '#666',
marginBottom: '-2px'
}}
>
Global Settings
</button>
</div>
{activeTab === 'global' && (
<div style={{ display: 'grid', gap: '20px' }}>
{/* Global Interval Schedule */}
<div style={{
background: 'white',
padding: '24px',
borderRadius: '8px',
boxShadow: '0 2px 8px rgba(0,0,0,0.1)'
}}>
<div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'start', marginBottom: '20px' }}>
<div>
<h2 style={{ fontSize: '20px', margin: 0, marginBottom: '8px' }}>Interval Crawl Schedule</h2>
<p style={{ color: '#666', margin: 0 }}>Crawl all stores periodically</p>
</div>
<label style={{ display: 'flex', alignItems: 'center', gap: '10px', cursor: 'pointer' }}>
<span style={{ color: '#666' }}>Enabled</span>
<input
type="checkbox"
checked={globalIntervalSchedule?.enabled ?? true}
onChange={(e) => handleUpdateGlobalSchedule('global_interval', { enabled: e.target.checked })}
style={{ width: '20px', height: '20px', cursor: 'pointer' }}
/>
</label>
</div>
<div style={{ display: 'flex', alignItems: 'center', gap: '15px' }}>
<label style={{ display: 'flex', alignItems: 'center', gap: '10px' }}>
<span>Crawl every</span>
<select
value={globalIntervalSchedule?.interval_hours ?? 4}
onChange={(e) => handleUpdateGlobalSchedule('global_interval', { interval_hours: parseInt(e.target.value) })}
style={{
padding: '8px 12px',
borderRadius: '6px',
border: '1px solid #ddd',
fontSize: '16px'
}}
>
<option value={1}>1 hour</option>
<option value={2}>2 hours</option>
<option value={4}>4 hours</option>
<option value={6}>6 hours</option>
<option value={8}>8 hours</option>
<option value={12}>12 hours</option>
<option value={24}>24 hours</option>
</select>
</label>
</div>
</div>
{/* Daily Special Schedule */}
<div style={{
background: 'white',
padding: '24px',
borderRadius: '8px',
boxShadow: '0 2px 8px rgba(0,0,0,0.1)'
}}>
<div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'start', marginBottom: '20px' }}>
<div>
<h2 style={{ fontSize: '20px', margin: 0, marginBottom: '8px' }}>Daily Special Crawl</h2>
<p style={{ color: '#666', margin: 0 }}>Crawl stores at local midnight to capture daily specials</p>
</div>
<label style={{ display: 'flex', alignItems: 'center', gap: '10px', cursor: 'pointer' }}>
<span style={{ color: '#666' }}>Enabled</span>
<input
type="checkbox"
checked={dailySpecialSchedule?.enabled ?? true}
onChange={(e) => handleUpdateGlobalSchedule('daily_special', { enabled: e.target.checked })}
style={{ width: '20px', height: '20px', cursor: 'pointer' }}
/>
</label>
</div>
<div style={{ display: 'flex', alignItems: 'center', gap: '15px' }}>
<label style={{ display: 'flex', alignItems: 'center', gap: '10px' }}>
<span>Run at</span>
<input
type="time"
value={dailySpecialSchedule?.run_time?.slice(0, 5) ?? '00:01'}
onChange={(e) => handleUpdateGlobalSchedule('daily_special', { run_time: e.target.value })}
style={{
padding: '8px 12px',
borderRadius: '6px',
border: '1px solid #ddd',
fontSize: '16px'
}}
/>
<span style={{ color: '#666' }}>(store local time)</span>
</label>
</div>
</div>
</div>
)}
{activeTab === 'stores' && (
<div style={{
background: 'white',
borderRadius: '8px',
boxShadow: '0 2px 8px rgba(0,0,0,0.1)',
overflow: 'hidden'
}}>
<table style={{ width: '100%', borderCollapse: 'collapse' }}>
<thead>
<tr style={{ background: '#f8f8f8', borderBottom: '2px solid #eee' }}>
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Dispensary / Store</th>
<th style={{ padding: '15px', textAlign: 'center', fontWeight: '600' }}>Provider</th>
<th style={{ padding: '15px', textAlign: 'center', fontWeight: '600' }}>Schedule</th>
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Last Run</th>
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Next Run</th>
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Last Result</th>
<th style={{ padding: '15px', textAlign: 'center', fontWeight: '600' }}>Actions</th>
</tr>
</thead>
<tbody>
{storeSchedules.map((store) => (
<tr key={store.store_id} style={{ borderBottom: '1px solid #eee' }}>
<td style={{ padding: '15px' }}>
<div style={{ display: 'flex', alignItems: 'center', gap: '8px' }}>
{store.dispensary_id ? (
<Link
to={`/dispensaries/${store.dispensary_id}`}
style={{
fontWeight: '600',
color: '#2563eb',
textDecoration: 'none'
}}
>
{store.dispensary_name || store.store_name}
</Link>
) : (
<span style={{ fontWeight: '600' }}>{store.store_name}</span>
)}
{!store.dispensary_id && (
<span style={{
padding: '2px 6px',
borderRadius: '4px',
fontSize: '10px',
fontWeight: '600',
background: '#fef3c7',
color: '#92400e'
}}>
Unmapped
</span>
)}
</div>
<div style={{ fontSize: '13px', color: '#666' }}>
{store.dispensary_city ? `${store.dispensary_city} | ${store.timezone}` : store.timezone}
</div>
</td>
<td style={{ padding: '15px', textAlign: 'center' }}>
{store.product_provider ? (
<div>
<span style={{
padding: '4px 10px',
borderRadius: '12px',
fontSize: '12px',
fontWeight: '600',
background: store.product_crawler_mode === 'production' ? '#d1fae5' : '#fef3c7',
color: store.product_crawler_mode === 'production' ? '#065f46' : '#92400e'
}}>
{store.product_provider}
</span>
{store.product_crawler_mode !== 'production' && (
<div style={{ fontSize: '10px', color: '#92400e', marginTop: '2px' }}>sandbox</div>
)}
</div>
) : (
<span style={{
padding: '4px 10px',
borderRadius: '12px',
fontSize: '12px',
fontWeight: '600',
background: '#f3f4f6',
color: '#666'
}}>
Unknown
</span>
)}
</td>
<td style={{ padding: '15px', textAlign: 'center' }}>
<div style={{ display: 'flex', flexDirection: 'column', alignItems: 'center', gap: '4px' }}>
<span style={{
padding: '4px 10px',
borderRadius: '12px',
fontSize: '12px',
fontWeight: '600',
background: store.schedule_enabled && store.scrape_enabled ? '#d1fae5' : '#fee2e2',
color: store.schedule_enabled && store.scrape_enabled ? '#065f46' : '#991b1b'
}}>
{store.schedule_enabled && store.scrape_enabled ? 'Active' : 'Disabled'}
</span>
<span style={{ fontSize: '12px', color: '#666' }}>
Every {store.interval_hours}h
</span>
</div>
</td>
<td style={{ padding: '15px' }}>
<div>{formatTimeAgo(store.last_scraped_at)}</div>
{store.last_scraped_at && (
<div style={{ fontSize: '12px', color: '#999' }}>
{new Date(store.last_scraped_at).toLocaleString()}
</div>
)}
</td>
<td style={{ padding: '15px' }}>
<div style={{ fontWeight: '600', color: '#2563eb' }}>
{formatTimeUntil(store.next_scheduled_run)}
</div>
</td>
<td style={{ padding: '15px' }}>
{store.last_status || store.latest_job_status ? (
<div>
<div style={{ display: 'flex', alignItems: 'center', gap: '8px', marginBottom: '4px' }}>
<span style={{
padding: '4px 10px',
borderRadius: '12px',
fontSize: '12px',
fontWeight: '600',
...getStatusColor(store.last_status || store.latest_job_status || 'pending')
}}>
{store.last_status || store.latest_job_status}
</span>
{store.last_error && (
<button
onClick={() => alert(store.last_error)}
style={{
padding: '2px 6px',
background: '#fee2e2',
color: '#991b1b',
border: 'none',
borderRadius: '4px',
cursor: 'pointer',
fontSize: '10px'
}}
>
Error
</button>
)}
</div>
{store.last_summary ? (
<div style={{ fontSize: '12px', color: '#666', maxWidth: '250px' }}>
{store.last_summary}
</div>
) : store.latest_products_found !== null ? (
<div style={{ fontSize: '12px', color: '#666' }}>
{store.latest_products_found} products
{store.latest_products_new !== null && ` (+${store.latest_products_new} new)`}
</div>
) : null}
</div>
) : (
<span style={{ color: '#999', fontSize: '13px' }}>No runs yet</span>
)}
</td>
<td style={{ padding: '15px', textAlign: 'center' }}>
<button
onClick={() => handleTriggerCrawl(store.store_id)}
disabled={triggeringStore === store.store_id}
style={{
padding: '6px 12px',
background: triggeringStore === store.store_id ? '#94a3b8' : '#2563eb',
color: 'white',
border: 'none',
borderRadius: '4px',
cursor: triggeringStore === store.store_id ? 'wait' : 'pointer',
fontSize: '13px'
}}
>
{triggeringStore === store.store_id ? 'Starting...' : 'Run Now'}
</button>
</td>
</tr>
))}
</tbody>
</table>
</div>
)}
{activeTab === 'jobs' && (
<>
{/* Job Stats */}
<div style={{ marginBottom: '30px' }}>
<div style={{ display: 'grid', gridTemplateColumns: 'repeat(auto-fit, minmax(150px, 1fr))', gap: '15px' }}>
<div style={{ background: 'white', padding: '20px', borderRadius: '8px', boxShadow: '0 2px 8px rgba(0,0,0,0.1)' }}>
<div style={{ fontSize: '14px', color: '#999', marginBottom: '8px' }}>Pending</div>
<div style={{ fontSize: '32px', fontWeight: '600', color: '#f59e0b' }}>
{jobs.filter(j => j.status === 'pending').length}
</div>
</div>
<div style={{ background: 'white', padding: '20px', borderRadius: '8px', boxShadow: '0 2px 8px rgba(0,0,0,0.1)' }}>
<div style={{ fontSize: '14px', color: '#999', marginBottom: '8px' }}>Running</div>
<div style={{ fontSize: '32px', fontWeight: '600', color: '#3b82f6' }}>
{jobs.filter(j => j.status === 'running').length}
</div>
</div>
<div style={{ background: 'white', padding: '20px', borderRadius: '8px', boxShadow: '0 2px 8px rgba(0,0,0,0.1)' }}>
<div style={{ fontSize: '14px', color: '#999', marginBottom: '8px' }}>Completed</div>
<div style={{ fontSize: '32px', fontWeight: '600', color: '#10b981' }}>
{jobs.filter(j => j.status === 'completed').length}
</div>
</div>
<div style={{ background: 'white', padding: '20px', borderRadius: '8px', boxShadow: '0 2px 8px rgba(0,0,0,0.1)' }}>
<div style={{ fontSize: '14px', color: '#999', marginBottom: '8px' }}>Failed</div>
<div style={{ fontSize: '32px', fontWeight: '600', color: '#ef4444' }}>
{jobs.filter(j => j.status === 'failed').length}
</div>
</div>
</div>
</div>
{/* Jobs Table */}
<div style={{
background: 'white',
borderRadius: '8px',
boxShadow: '0 2px 8px rgba(0,0,0,0.1)',
overflow: 'hidden'
}}>
<table style={{ width: '100%', borderCollapse: 'collapse' }}>
<thead>
<tr style={{ background: '#f8f8f8', borderBottom: '2px solid #eee' }}>
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Store</th>
<th style={{ padding: '15px', textAlign: 'center', fontWeight: '600' }}>Type</th>
<th style={{ padding: '15px', textAlign: 'center', fontWeight: '600' }}>Trigger</th>
<th style={{ padding: '15px', textAlign: 'center', fontWeight: '600' }}>Status</th>
<th style={{ padding: '15px', textAlign: 'right', fontWeight: '600' }}>Products</th>
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Started</th>
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Completed</th>
<th style={{ padding: '15px', textAlign: 'center', fontWeight: '600' }}>Actions</th>
</tr>
</thead>
<tbody>
{jobs.length === 0 ? (
<tr>
<td colSpan={8} style={{ padding: '40px', textAlign: 'center', color: '#666' }}>
No crawl jobs found
</td>
</tr>
) : (
jobs.map((job) => (
<tr key={job.id} style={{ borderBottom: '1px solid #eee' }}>
<td style={{ padding: '15px' }}>
<div style={{ fontWeight: '600' }}>{job.store_name}</div>
<div style={{ fontSize: '12px', color: '#999' }}>Job #{job.id}</div>
</td>
<td style={{ padding: '15px', textAlign: 'center', fontSize: '13px' }}>
{job.job_type}
</td>
<td style={{ padding: '15px', textAlign: 'center' }}>
<span style={{
padding: '3px 8px',
borderRadius: '4px',
fontSize: '12px',
background: job.trigger_type === 'manual' ? '#e0e7ff' :
job.trigger_type === 'daily_special' ? '#fce7f3' : '#f3f4f6',
color: job.trigger_type === 'manual' ? '#3730a3' :
job.trigger_type === 'daily_special' ? '#9d174d' : '#374151'
}}>
{job.trigger_type}
</span>
</td>
<td style={{ padding: '15px', textAlign: 'center' }}>
<span style={{
padding: '4px 10px',
borderRadius: '12px',
fontSize: '12px',
fontWeight: '600',
...getStatusColor(job.status)
}}>
{job.status}
</span>
</td>
<td style={{ padding: '15px', textAlign: 'right' }}>
{job.products_found !== null ? (
<div>
<div style={{ fontWeight: '600' }}>{job.products_found}</div>
{job.products_new !== null && job.products_updated !== null && (
<div style={{ fontSize: '12px', color: '#666' }}>
+{job.products_new} / ~{job.products_updated}
</div>
)}
</div>
) : '-'}
</td>
<td style={{ padding: '15px', fontSize: '13px' }}>
{job.started_at ? new Date(job.started_at).toLocaleString() : '-'}
</td>
<td style={{ padding: '15px', fontSize: '13px' }}>
{job.completed_at ? new Date(job.completed_at).toLocaleString() : '-'}
</td>
<td style={{ padding: '15px', textAlign: 'center' }}>
{job.status === 'pending' && (
<button
onClick={() => handleCancelJob(job.id)}
style={{
padding: '4px 10px',
background: '#fee2e2',
color: '#991b1b',
border: 'none',
borderRadius: '4px',
cursor: 'pointer',
fontSize: '12px'
}}
>
Cancel
</button>
)}
{job.error_message && (
<button
onClick={() => alert(job.error_message)}
style={{
padding: '4px 10px',
background: '#fee2e2',
color: '#991b1b',
border: 'none',
borderRadius: '4px',
cursor: 'pointer',
fontSize: '12px'
}}
>
View Error
</button>
)}
</td>
</tr>
))
)}
</tbody>
</table>
</div>
</>
)}
</div>
</Layout>
);
}