Add crawler scheduler, orchestrator, and multi-category intelligence
- Add scheduler UI with store schedules, job queue, and global settings
- Add store crawl orchestrator for intelligent crawl workflow
- Add multi-category intelligence detection (product, specials, brands, metadata)
- Add CrawlerLogger for structured JSON logging
- Add migrations for scheduler tables and dispensary linking
- Add dispensary → scheduler navigation link
- Support production/sandbox crawler modes per provider

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
backend/migrations/015_crawler_schedule.sql — new file, 163 lines
@@ -0,0 +1,163 @@
-- =====================================================
-- Crawler Schedule Tables
-- =====================================================

-- Add timezone column to stores table
ALTER TABLE stores ADD COLUMN IF NOT EXISTS timezone VARCHAR(50) DEFAULT 'America/Phoenix';

-- 1. Global crawler schedule settings
CREATE TABLE IF NOT EXISTS crawler_schedule (
  id SERIAL PRIMARY KEY,
  schedule_type VARCHAR(50) NOT NULL, -- 'global_interval', 'daily_special'
  enabled BOOLEAN NOT NULL DEFAULT TRUE,
  interval_hours INTEGER, -- For global_interval: every N hours
  run_time TIME, -- For daily_special: time to run (e.g., 00:01)
  description TEXT,
  created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

  CONSTRAINT uq_crawler_schedule_type UNIQUE (schedule_type)
);

-- Insert default schedules
INSERT INTO crawler_schedule (schedule_type, enabled, interval_hours, description)
VALUES ('global_interval', TRUE, 4, 'Crawl all stores every N hours')
ON CONFLICT (schedule_type) DO NOTHING;

INSERT INTO crawler_schedule (schedule_type, enabled, run_time, description)
VALUES ('daily_special', TRUE, '00:01', 'Daily specials run at store local midnight')
ON CONFLICT (schedule_type) DO NOTHING;

-- 2. Per-store schedule overrides
CREATE TABLE IF NOT EXISTS store_crawl_schedule (
  id SERIAL PRIMARY KEY,
  store_id INTEGER NOT NULL REFERENCES stores(id) ON DELETE CASCADE,
  enabled BOOLEAN NOT NULL DEFAULT TRUE,
  interval_hours INTEGER, -- NULL = use global setting
  daily_special_enabled BOOLEAN DEFAULT TRUE,
  daily_special_time TIME, -- NULL = use store's 00:01 local time
  priority INTEGER DEFAULT 0, -- Higher priority = scheduled first
  created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

  CONSTRAINT uq_store_crawl_schedule_store UNIQUE (store_id)
);

-- 3. Crawl job queue
CREATE TABLE IF NOT EXISTS crawl_jobs (
  id SERIAL PRIMARY KEY,
  store_id INTEGER NOT NULL REFERENCES stores(id) ON DELETE CASCADE,

  -- Job identification
  job_type VARCHAR(50) NOT NULL DEFAULT 'full_crawl', -- 'full_crawl', 'specials_only', 'category'
  trigger_type VARCHAR(50) NOT NULL DEFAULT 'scheduled', -- 'scheduled', 'manual', 'daily_special'

  -- Status
  status VARCHAR(20) NOT NULL DEFAULT 'pending', -- 'pending', 'running', 'completed', 'failed', 'cancelled'
  priority INTEGER DEFAULT 0,

  -- Timing
  scheduled_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), -- When job should run
  started_at TIMESTAMPTZ,
  completed_at TIMESTAMPTZ,

  -- Results
  products_found INTEGER,
  products_new INTEGER,
  products_updated INTEGER,
  error_message TEXT,

  -- Metadata
  worker_id VARCHAR(100),
  metadata JSONB DEFAULT '{}',

  created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

  CONSTRAINT chk_crawl_job_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
);

-- Indexes for efficient job lookup
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_status ON crawl_jobs(status);
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_store_status ON crawl_jobs(store_id, status);
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_pending ON crawl_jobs(scheduled_at) WHERE status = 'pending';
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_store_time ON crawl_jobs(store_id, created_at DESC);
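The pending-jobs partial index is what a polling worker would hit. A minimal claim-query sketch, not part of this migration (the worker id is a placeholder):

```sql
-- Hypothetical worker claim query (illustration only, not part of the migration).
-- Atomically claims the oldest due pending job; SKIP LOCKED lets concurrent
-- workers pass over rows another worker has already locked.
UPDATE crawl_jobs
SET status = 'running', started_at = NOW(), worker_id = 'worker-1'
WHERE id = (
  SELECT id
  FROM crawl_jobs
  WHERE status = 'pending' AND scheduled_at <= NOW()
  ORDER BY priority DESC, scheduled_at
  LIMIT 1
  FOR UPDATE SKIP LOCKED
)
RETURNING id, store_id, job_type;
```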
-- 4. Crawl history summary (for UI display)
CREATE OR REPLACE VIEW crawl_schedule_status AS
SELECT
  s.id AS store_id,
  s.name AS store_name,
  s.slug AS store_slug,
  s.timezone,
  s.active,
  s.scrape_enabled,
  s.last_scraped_at,

  -- Schedule settings (use store override or global)
  COALESCE(scs.enabled, TRUE) AS schedule_enabled,
  COALESCE(scs.interval_hours, cs_global.interval_hours, 4) AS interval_hours,
  COALESCE(scs.daily_special_enabled, TRUE) AS daily_special_enabled,
  COALESCE(scs.daily_special_time, '00:01'::TIME) AS daily_special_time,
  COALESCE(scs.priority, 0) AS priority,

  -- Next scheduled run calculation
  CASE
    WHEN s.last_scraped_at IS NULL THEN NOW()
    ELSE s.last_scraped_at + (COALESCE(scs.interval_hours, cs_global.interval_hours, 4) || ' hours')::INTERVAL
  END AS next_scheduled_run,

  -- Latest job info
  cj.id AS latest_job_id,
  cj.status AS latest_job_status,
  cj.started_at AS latest_job_started,
  cj.completed_at AS latest_job_completed,
  cj.products_found AS latest_products_found

FROM stores s
LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
LEFT JOIN crawler_schedule cs_global ON cs_global.schedule_type = 'global_interval'
LEFT JOIN LATERAL (
  SELECT * FROM crawl_jobs cj2
  WHERE cj2.store_id = s.id
  ORDER BY cj2.created_at DESC
  LIMIT 1
) cj ON TRUE
WHERE s.active = TRUE;

-- Function to update updated_at timestamps
CREATE OR REPLACE FUNCTION update_schedule_updated_at()
RETURNS TRIGGER AS $$
BEGIN
  NEW.updated_at = NOW();
  RETURN NEW;
END;
$$ LANGUAGE plpgsql;

-- Triggers
DROP TRIGGER IF EXISTS trigger_crawler_schedule_updated_at ON crawler_schedule;
CREATE TRIGGER trigger_crawler_schedule_updated_at
  BEFORE UPDATE ON crawler_schedule
  FOR EACH ROW
  EXECUTE FUNCTION update_schedule_updated_at();

DROP TRIGGER IF EXISTS trigger_store_crawl_schedule_updated_at ON store_crawl_schedule;
CREATE TRIGGER trigger_store_crawl_schedule_updated_at
  BEFORE UPDATE ON store_crawl_schedule
  FOR EACH ROW
  EXECUTE FUNCTION update_schedule_updated_at();

DROP TRIGGER IF EXISTS trigger_crawl_jobs_updated_at ON crawl_jobs;
CREATE TRIGGER trigger_crawl_jobs_updated_at
  BEFORE UPDATE ON crawl_jobs
  FOR EACH ROW
  EXECUTE FUNCTION update_schedule_updated_at();

-- Grant permissions
GRANT SELECT, INSERT, UPDATE, DELETE ON crawler_schedule TO dutchie;
GRANT SELECT, INSERT, UPDATE, DELETE ON store_crawl_schedule TO dutchie;
GRANT SELECT, INSERT, UPDATE, DELETE ON crawl_jobs TO dutchie;
GRANT USAGE, SELECT ON SEQUENCE crawler_schedule_id_seq TO dutchie;
GRANT USAGE, SELECT ON SEQUENCE store_crawl_schedule_id_seq TO dutchie;
GRANT USAGE, SELECT ON SEQUENCE crawl_jobs_id_seq TO dutchie;
GRANT SELECT ON crawl_schedule_status TO dutchie;
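Since next_scheduled_run is computed inside the view, a scheduler's due-check reduces to a single select. A sketch, filtering on the view's own columns:

```sql
-- Hypothetical scheduler poll (illustration only, not part of the migration):
SELECT store_id, store_name, interval_hours, next_scheduled_run
FROM crawl_schedule_status
WHERE schedule_enabled
  AND scrape_enabled
  AND next_scheduled_run <= NOW()
ORDER BY priority DESC, next_scheduled_run;
```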
@@ -0,0 +1,57 @@
-- =====================================================
-- Extend dispensaries table for multi-provider crawler
-- =====================================================

-- Menu provider detection
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS menu_provider VARCHAR(50);
-- Values: 'dutchie', 'treez', 'jane', 'weedmaps', 'iheartjane', 'leafly', 'meadow', 'greenlight', 'other', 'unknown'

-- Confidence score for provider detection (0-100)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS menu_provider_confidence SMALLINT DEFAULT 0;
ALTER TABLE dispensaries ADD CONSTRAINT chk_provider_confidence
  CHECK (menu_provider_confidence >= 0 AND menu_provider_confidence <= 100);

-- Crawler mode: production (stable templates) vs sandbox (learning/unstable)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS crawler_mode VARCHAR(20) DEFAULT 'production';
ALTER TABLE dispensaries ADD CONSTRAINT chk_crawler_mode
  CHECK (crawler_mode IN ('production', 'sandbox'));

-- Crawler status for job orchestration
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS crawler_status VARCHAR(30) DEFAULT 'idle';
ALTER TABLE dispensaries ADD CONSTRAINT chk_crawler_status
  CHECK (crawler_status IN ('idle', 'queued_detection', 'queued_crawl', 'running', 'ok', 'error_needs_review'));

-- Error tracking
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS last_menu_error_at TIMESTAMPTZ;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS last_error_message TEXT;

-- Provider detection metadata (raw signals, detection history)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS provider_detection_data JSONB DEFAULT '{}';

-- Indexes for efficient queue queries
CREATE INDEX IF NOT EXISTS idx_dispensaries_provider ON dispensaries(menu_provider);
CREATE INDEX IF NOT EXISTS idx_dispensaries_crawler_mode ON dispensaries(crawler_mode);
CREATE INDEX IF NOT EXISTS idx_dispensaries_crawler_status ON dispensaries(crawler_status);
CREATE INDEX IF NOT EXISTS idx_dispensaries_provider_confidence ON dispensaries(menu_provider_confidence);

-- Partial index for production Dutchie crawl queue
CREATE INDEX IF NOT EXISTS idx_dispensaries_dutchie_production
  ON dispensaries(id)
  WHERE menu_provider = 'dutchie' AND crawler_mode = 'production';

-- Partial index for sandbox queue
CREATE INDEX IF NOT EXISTS idx_dispensaries_sandbox
  ON dispensaries(id)
  WHERE crawler_mode = 'sandbox';

-- Partial index for detection queue
CREATE INDEX IF NOT EXISTS idx_dispensaries_needs_detection
  ON dispensaries(id)
  WHERE menu_provider IS NULL OR menu_provider_confidence < 70;

-- Comment on columns for documentation
COMMENT ON COLUMN dispensaries.menu_provider IS 'Detected menu platform: dutchie, treez, jane, weedmaps, etc.';
COMMENT ON COLUMN dispensaries.menu_provider_confidence IS 'Confidence score 0-100 for provider detection';
COMMENT ON COLUMN dispensaries.crawler_mode IS 'production = stable templates, sandbox = learning mode';
COMMENT ON COLUMN dispensaries.crawler_status IS 'Current state in crawl pipeline';
COMMENT ON COLUMN dispensaries.provider_detection_data IS 'JSON blob with detection signals and history';
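The detection-queue partial index mirrors exactly the predicate a detection worker would poll with. A sketch of that poll (the LIMIT is an arbitrary batch size):

```sql
-- Hypothetical detection-queue poll (illustration only, not part of the migration):
SELECT id, name, website
FROM dispensaries
WHERE menu_provider IS NULL OR menu_provider_confidence < 70
ORDER BY id
LIMIT 50;
```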
@@ -0,0 +1,237 @@
-- =====================================================
-- Crawler Sandboxes and Templates Tables
-- =====================================================

-- 1. Crawler sandboxes - for learning new providers/templates
CREATE TABLE IF NOT EXISTS crawler_sandboxes (
  id SERIAL PRIMARY KEY,
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,

  -- Detection info
  suspected_menu_provider VARCHAR(50), -- What we think the provider is
  mode VARCHAR(30) NOT NULL DEFAULT 'detection', -- 'detection', 'template_learning', 'validation'

  -- Captured data
  raw_html_location TEXT, -- S3 key or local file path to captured HTML
  screenshot_location TEXT, -- S3 key for screenshot
  analysis_json JSONB DEFAULT '{}', -- Extracted patterns, selectors, candidate templates

  -- URLs discovered/tested
  urls_tested JSONB DEFAULT '[]', -- Array of URLs we fetched
  menu_entry_points JSONB DEFAULT '[]', -- Discovered menu URLs

  -- Detection signals found
  detection_signals JSONB DEFAULT '{}', -- e.g., {"dutchie_embed": false, "treez_script": true, ...}

  -- Status tracking
  status VARCHAR(30) NOT NULL DEFAULT 'pending',
  -- 'pending', 'analyzing', 'template_ready', 'needs_human_review', 'moved_to_production', 'failed'

  -- Results
  confidence_score SMALLINT DEFAULT 0,
  failure_reason TEXT,
  human_review_notes TEXT,

  -- Timestamps
  created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  analyzed_at TIMESTAMPTZ,
  reviewed_at TIMESTAMPTZ,

  CONSTRAINT chk_sandbox_mode CHECK (mode IN ('detection', 'template_learning', 'validation')),
  CONSTRAINT chk_sandbox_status CHECK (status IN (
    'pending', 'analyzing', 'template_ready', 'needs_human_review', 'moved_to_production', 'failed'
  ))
);

-- Indexes for sandbox queries
CREATE INDEX IF NOT EXISTS idx_sandbox_dispensary ON crawler_sandboxes(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_sandbox_status ON crawler_sandboxes(status);
CREATE INDEX IF NOT EXISTS idx_sandbox_mode ON crawler_sandboxes(mode);
CREATE INDEX IF NOT EXISTS idx_sandbox_suspected_provider ON crawler_sandboxes(suspected_menu_provider);

-- Unique constraint: one active sandbox per dispensary (historical completed ones are allowed)
CREATE UNIQUE INDEX IF NOT EXISTS idx_sandbox_active_per_dispensary
  ON crawler_sandboxes(dispensary_id)
  WHERE status NOT IN ('moved_to_production', 'failed');
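Because uniqueness is enforced by a partial index, ON CONFLICT can target it by repeating the index predicate. A sketch of an idempotent "open sandbox" insert (the dispensary id is a placeholder):

```sql
-- Hypothetical idempotent sandbox creation (illustration only, not part of the migration):
INSERT INTO crawler_sandboxes (dispensary_id, mode, status)
VALUES (42, 'detection', 'pending')
ON CONFLICT (dispensary_id)
  WHERE status NOT IN ('moved_to_production', 'failed')
  DO NOTHING;
```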
-- 2. Crawler templates - reusable scraping configurations
CREATE TABLE IF NOT EXISTS crawler_templates (
  id SERIAL PRIMARY KEY,

  -- Template identification
  provider VARCHAR(50) NOT NULL, -- 'dutchie', 'treez', 'jane', etc.
  name VARCHAR(100) NOT NULL, -- 'dutchie_v1', 'treez_standard', 'jane_embedded'
  version INTEGER DEFAULT 1,

  -- Status
  is_active BOOLEAN NOT NULL DEFAULT TRUE,
  is_default_for_provider BOOLEAN DEFAULT FALSE,

  -- Selector configuration
  selector_config JSONB NOT NULL DEFAULT '{}',
  -- Structure:
  -- {
  --   "product_list": "css:.product-card",
  --   "product_name": "css:.product-name",
  --   "product_price": "css:.price",
  --   "product_brand": "css:.brand",
  --   "product_image": "css:img.product-image@src",
  --   "pagination_next": "css:.next-page",
  --   "category_links": "css:.category-nav a",
  --   ...
  -- }

  -- Navigation patterns
  navigation_config JSONB DEFAULT '{}',
  -- Structure:
  -- {
  --   "entry_paths": ["/menu", "/shop", "/order"],
  --   "age_gate": {"type": "click", "selector": ".age-confirm-btn"},
  --   "location_modal": {"dismiss_selector": ".modal-close"},
  --   "infinite_scroll": true,
  --   "wait_for": ".products-loaded"
  -- }

  -- Data transformation rules
  transform_config JSONB DEFAULT '{}',
  -- Structure:
  -- {
  --   "price_regex": "\\$([\\d.]+)",
  --   "weight_normalizer": "g_to_oz",
  --   "thc_format": "percentage"
  -- }

  -- Validation rules
  validation_rules JSONB DEFAULT '{}',
  -- Structure:
  -- {
  --   "min_products": 5,
  --   "required_fields": ["name", "price"],
  --   "price_range": [0.01, 10000]
  -- }

  -- Test data for validation
  test_urls JSONB DEFAULT '[]', -- URLs to validate template against
  expected_structure JSONB DEFAULT '{}', -- What we expect to extract

  -- Stats
  dispensaries_using INTEGER DEFAULT 0,
  success_rate DECIMAL(5,2) DEFAULT 0, -- 0-100%
  last_successful_crawl TIMESTAMPTZ,
  last_failed_crawl TIMESTAMPTZ,

  -- Metadata
  notes TEXT,
  created_by VARCHAR(100),
  created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

  CONSTRAINT uq_template_name UNIQUE (provider, name, version)
);

-- Indexes for templates
CREATE INDEX IF NOT EXISTS idx_template_provider ON crawler_templates(provider);
CREATE INDEX IF NOT EXISTS idx_template_active ON crawler_templates(is_active);
CREATE INDEX IF NOT EXISTS idx_template_default ON crawler_templates(provider, is_default_for_provider)
  WHERE is_default_for_provider = TRUE;
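A crawler resolving its configuration would typically take the provider's default active template and fall back to the newest active version. A lookup sketch:

```sql
-- Hypothetical template lookup (illustration only, not part of the migration):
SELECT selector_config, navigation_config, transform_config, validation_rules
FROM crawler_templates
WHERE provider = 'dutchie'
  AND is_active
ORDER BY is_default_for_provider DESC, version DESC
LIMIT 1;
```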
-- 3. Sandbox crawl jobs - separate queue for sandbox operations
CREATE TABLE IF NOT EXISTS sandbox_crawl_jobs (
  id SERIAL PRIMARY KEY,
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
  sandbox_id INTEGER REFERENCES crawler_sandboxes(id) ON DELETE SET NULL,

  -- Job type
  job_type VARCHAR(30) NOT NULL DEFAULT 'detection', -- 'detection', 'template_test', 'deep_crawl'

  -- Status
  status VARCHAR(20) NOT NULL DEFAULT 'pending', -- 'pending', 'running', 'completed', 'failed', 'cancelled'
  priority INTEGER DEFAULT 0,

  -- Timing
  scheduled_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  started_at TIMESTAMPTZ,
  completed_at TIMESTAMPTZ,

  -- Worker tracking
  worker_id VARCHAR(100),

  -- Results
  result_summary JSONB DEFAULT '{}',
  error_message TEXT,

  -- Timestamps
  created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

  CONSTRAINT chk_sandbox_job_type CHECK (job_type IN ('detection', 'template_test', 'deep_crawl')),
  CONSTRAINT chk_sandbox_job_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
);

-- Indexes for sandbox jobs
CREATE INDEX IF NOT EXISTS idx_sandbox_job_status ON sandbox_crawl_jobs(status);
CREATE INDEX IF NOT EXISTS idx_sandbox_job_dispensary ON sandbox_crawl_jobs(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_sandbox_job_pending ON sandbox_crawl_jobs(scheduled_at) WHERE status = 'pending';

-- 4. Insert default Dutchie template (our only stable one for now)
INSERT INTO crawler_templates (provider, name, version, is_active, is_default_for_provider, selector_config, navigation_config, notes)
VALUES (
  'dutchie',
  'dutchie_standard',
  1,
  TRUE,
  TRUE,
  '{
    "type": "api_based",
    "graphql_endpoint": "/graphql",
    "product_container": "data.menu.products",
    "uses_puppeteer": true,
    "notes": "Dutchie uses GraphQL API, scraped via puppeteer interception"
  }'::jsonb,
  '{
    "entry_paths": ["/menu", "/order", "/embedded-menu", "/products"],
    "age_gate": {"type": "auto_detected", "handled_by_stealth": true},
    "wait_strategy": "networkidle2",
    "requires_javascript": true
  }'::jsonb,
  'Default Dutchie template - uses existing scraper-v2 pipeline'
)
ON CONFLICT (provider, name, version) DO NOTHING;

-- 5. Triggers for updated_at
CREATE OR REPLACE FUNCTION update_sandbox_timestamp()
RETURNS TRIGGER AS $$
BEGIN
  NEW.updated_at = NOW();
  RETURN NEW;
END;
$$ LANGUAGE plpgsql;

DROP TRIGGER IF EXISTS trigger_sandbox_updated_at ON crawler_sandboxes;
CREATE TRIGGER trigger_sandbox_updated_at
  BEFORE UPDATE ON crawler_sandboxes
  FOR EACH ROW
  EXECUTE FUNCTION update_sandbox_timestamp();

DROP TRIGGER IF EXISTS trigger_template_updated_at ON crawler_templates;
CREATE TRIGGER trigger_template_updated_at
  BEFORE UPDATE ON crawler_templates
  FOR EACH ROW
  EXECUTE FUNCTION update_sandbox_timestamp();

DROP TRIGGER IF EXISTS trigger_sandbox_job_updated_at ON sandbox_crawl_jobs;
CREATE TRIGGER trigger_sandbox_job_updated_at
  BEFORE UPDATE ON sandbox_crawl_jobs
  FOR EACH ROW
  EXECUTE FUNCTION update_sandbox_timestamp();

-- Comments for documentation
COMMENT ON TABLE crawler_sandboxes IS 'Learning/testing environment for unknown menu providers';
COMMENT ON TABLE crawler_templates IS 'Reusable scraping configurations per menu provider';
COMMENT ON TABLE sandbox_crawl_jobs IS 'Job queue for sandbox crawl operations (separate from production)';
backend/migrations/018_multi_category_intelligence.sql — new file, 118 lines
@@ -0,0 +1,118 @@
-- =====================================================
-- Multi-Category Intelligence Support
-- =====================================================
-- Each dispensary can have different providers for different
-- intelligence categories (products, specials, brands, metadata).

-- 1. Product Intelligence Columns
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS product_provider VARCHAR(50);
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS product_confidence SMALLINT DEFAULT 0;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS product_crawler_mode VARCHAR(20) DEFAULT 'sandbox';
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS last_product_scan_at TIMESTAMPTZ;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS product_detection_data JSONB DEFAULT '{}';

-- 2. Specials Intelligence Columns
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS specials_provider VARCHAR(50);
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS specials_confidence SMALLINT DEFAULT 0;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS specials_crawler_mode VARCHAR(20) DEFAULT 'sandbox';
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS last_specials_scan_at TIMESTAMPTZ;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS specials_detection_data JSONB DEFAULT '{}';

-- 3. Brand Intelligence Columns
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS brand_provider VARCHAR(50);
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS brand_confidence SMALLINT DEFAULT 0;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS brand_crawler_mode VARCHAR(20) DEFAULT 'sandbox';
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS last_brand_scan_at TIMESTAMPTZ;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS brand_detection_data JSONB DEFAULT '{}';

-- 4. Metadata Intelligence Columns (categories, taxonomy, etc.)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS metadata_provider VARCHAR(50);
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS metadata_confidence SMALLINT DEFAULT 0;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS metadata_crawler_mode VARCHAR(20) DEFAULT 'sandbox';
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS last_metadata_scan_at TIMESTAMPTZ;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS metadata_detection_data JSONB DEFAULT '{}';

-- 5. Add category column to crawler_sandboxes
ALTER TABLE crawler_sandboxes ADD COLUMN IF NOT EXISTS category VARCHAR(30) DEFAULT 'product';
-- Valid categories: 'product', 'specials', 'brand', 'metadata'

ALTER TABLE crawler_sandboxes ADD COLUMN IF NOT EXISTS template_name VARCHAR(100);
ALTER TABLE crawler_sandboxes ADD COLUMN IF NOT EXISTS quality_score SMALLINT DEFAULT 0;
ALTER TABLE crawler_sandboxes ADD COLUMN IF NOT EXISTS products_extracted INTEGER DEFAULT 0;
ALTER TABLE crawler_sandboxes ADD COLUMN IF NOT EXISTS fields_missing INTEGER DEFAULT 0;
ALTER TABLE crawler_sandboxes ADD COLUMN IF NOT EXISTS error_count INTEGER DEFAULT 0;

-- 6. Add category column to sandbox_crawl_jobs
ALTER TABLE sandbox_crawl_jobs ADD COLUMN IF NOT EXISTS category VARCHAR(30) DEFAULT 'product';
ALTER TABLE sandbox_crawl_jobs ADD COLUMN IF NOT EXISTS template_name VARCHAR(100);

-- 7. Indexes for per-category queries
CREATE INDEX IF NOT EXISTS idx_disp_product_provider ON dispensaries(product_provider);
CREATE INDEX IF NOT EXISTS idx_disp_product_mode ON dispensaries(product_crawler_mode);
CREATE INDEX IF NOT EXISTS idx_disp_specials_provider ON dispensaries(specials_provider);
CREATE INDEX IF NOT EXISTS idx_disp_specials_mode ON dispensaries(specials_crawler_mode);
CREATE INDEX IF NOT EXISTS idx_disp_brand_provider ON dispensaries(brand_provider);
CREATE INDEX IF NOT EXISTS idx_disp_brand_mode ON dispensaries(brand_crawler_mode);
CREATE INDEX IF NOT EXISTS idx_disp_metadata_provider ON dispensaries(metadata_provider);
CREATE INDEX IF NOT EXISTS idx_disp_metadata_mode ON dispensaries(metadata_crawler_mode);

CREATE INDEX IF NOT EXISTS idx_sandbox_category ON crawler_sandboxes(category);
CREATE INDEX IF NOT EXISTS idx_sandbox_template ON crawler_sandboxes(template_name);
CREATE INDEX IF NOT EXISTS idx_sandbox_job_category ON sandbox_crawl_jobs(category);

-- 8. Migrate existing menu_provider to product_provider
-- (only rows where menu_provider is set and product_provider is still NULL;
-- Dutchie rows detected with confidence >= 70 go straight to production mode,
-- everything else lands in sandbox)
UPDATE dispensaries
SET
  product_provider = menu_provider,
  product_confidence = menu_provider_confidence,
  product_crawler_mode = CASE
    WHEN menu_provider = 'dutchie' AND menu_provider_confidence >= 70 THEN 'production'
    ELSE 'sandbox'
  END
WHERE menu_provider IS NOT NULL
  AND product_provider IS NULL;
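A quick rollup makes the backfill easy to verify, e.g. confirming that only high-confidence Dutchie rows landed in production mode:

```sql
-- Hypothetical post-migration check (illustration only, not part of the migration):
SELECT product_provider, product_crawler_mode, COUNT(*) AS dispensaries
FROM dispensaries
WHERE product_provider IS NOT NULL
GROUP BY product_provider, product_crawler_mode
ORDER BY dispensaries DESC;
```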
-- 9. Add environment column to crawler_templates if not exists
ALTER TABLE crawler_templates ADD COLUMN IF NOT EXISTS environment VARCHAR(20) DEFAULT 'production';
-- Valid: 'production', 'sandbox'

-- 10. Insert Treez sandbox template (existing code to be linked here)
INSERT INTO crawler_templates (provider, name, version, is_active, is_default_for_provider, environment, selector_config, navigation_config, notes)
VALUES (
  'treez',
  'treez_products_v0',
  1,
  FALSE, -- Not active for production
  FALSE, -- Not default
  'sandbox',
  '{
    "type": "api_based",
    "notes": "Treez API-based scraper - unreliable, sandbox only",
    "product_container": "products",
    "requires_api_key": false,
    "uses_puppeteer": true
  }'::jsonb,
  '{
    "entry_paths": ["/menu", "/shop"],
    "wait_strategy": "networkidle2",
    "requires_javascript": true
  }'::jsonb,
  'Treez sandbox template - v0 implementation, needs quality improvement'
)
ON CONFLICT (provider, name, version) DO NOTHING;

-- 11. Update existing Dutchie template to specify environment
UPDATE crawler_templates
SET environment = 'production'
WHERE provider = 'dutchie' AND name = 'dutchie_standard';

-- 12. Comments
COMMENT ON COLUMN dispensaries.product_provider IS 'Provider for product intelligence (dutchie, treez, jane, etc.)';
COMMENT ON COLUMN dispensaries.product_crawler_mode IS 'production or sandbox mode for product crawling';
COMMENT ON COLUMN dispensaries.specials_provider IS 'Provider for specials/deals intelligence';
COMMENT ON COLUMN dispensaries.brand_provider IS 'Provider for brand intelligence';
COMMENT ON COLUMN dispensaries.metadata_provider IS 'Provider for metadata/taxonomy intelligence';
COMMENT ON COLUMN crawler_sandboxes.category IS 'Intelligence category: product, specials, brand, metadata';
COMMENT ON COLUMN crawler_sandboxes.quality_score IS 'Quality score 0-100 for sandbox run results';
COMMENT ON COLUMN crawler_templates.environment IS 'Template environment: production or sandbox';
backend/migrations/019_link_stores_to_dispensaries.sql — new file, 69 lines
@@ -0,0 +1,69 @@
-- =====================================================
-- Link Stores to Dispensaries (Master AZDHS Directory)
-- =====================================================
-- This migration adds a foreign key from stores to dispensaries,
-- allowing the scheduler to reference the master dispensary records.

-- 1. Add dispensary_id column to stores table
ALTER TABLE stores ADD COLUMN IF NOT EXISTS dispensary_id INTEGER REFERENCES dispensaries(id) ON DELETE SET NULL;

-- 2. Create index for efficient lookups
CREATE INDEX IF NOT EXISTS idx_stores_dispensary_id ON stores(dispensary_id);
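The migration only adds the column; populating it is left to application code. One conceivable backfill, shown purely as a sketch since the real matching rule isn't defined here, would join on normalized names:

```sql
-- Hypothetical backfill (illustration only, NOT part of the migration):
-- assumes store and dispensary names match exactly after case-folding,
-- which may not hold for real data.
UPDATE stores s
SET dispensary_id = d.id
FROM dispensaries d
WHERE s.dispensary_id IS NULL
  AND LOWER(s.name) = LOWER(d.name);
```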
-- 3. Update the crawl_schedule_status view to include dispensary info
DROP VIEW IF EXISTS crawl_schedule_status;
CREATE OR REPLACE VIEW crawl_schedule_status AS
SELECT
  s.id AS store_id,
  s.name AS store_name,
  s.slug AS store_slug,
  s.timezone,
  s.active,
  s.scrape_enabled,
  s.last_scraped_at,

  -- Dispensary info (master record)
  s.dispensary_id,
  d.name AS dispensary_name,
  d.company_name AS dispensary_company,
  d.city AS dispensary_city,
  d.address AS dispensary_address,
  d.menu_url AS dispensary_menu_url,

  -- Schedule settings (use store override or global)
  COALESCE(scs.enabled, TRUE) AS schedule_enabled,
  COALESCE(scs.interval_hours, cs_global.interval_hours, 4) AS interval_hours,
  COALESCE(scs.daily_special_enabled, TRUE) AS daily_special_enabled,
  COALESCE(scs.daily_special_time, '00:01'::TIME) AS daily_special_time,
  COALESCE(scs.priority, 0) AS priority,

  -- Next scheduled run calculation
  CASE
    WHEN s.last_scraped_at IS NULL THEN NOW()
    ELSE s.last_scraped_at + (COALESCE(scs.interval_hours, cs_global.interval_hours, 4) || ' hours')::INTERVAL
  END AS next_scheduled_run,

  -- Latest job info
  cj.id AS latest_job_id,
  cj.status AS latest_job_status,
  cj.started_at AS latest_job_started,
  cj.completed_at AS latest_job_completed,
  cj.products_found AS latest_products_found

FROM stores s
LEFT JOIN dispensaries d ON d.id = s.dispensary_id
LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
LEFT JOIN crawler_schedule cs_global ON cs_global.schedule_type = 'global_interval'
LEFT JOIN LATERAL (
  SELECT * FROM crawl_jobs cj2
  WHERE cj2.store_id = s.id
  ORDER BY cj2.created_at DESC
  LIMIT 1
) cj ON TRUE
WHERE s.active = TRUE;

-- Grant permissions
GRANT SELECT ON crawl_schedule_status TO dutchie;

-- 4. Comments
COMMENT ON COLUMN stores.dispensary_id IS 'FK to dispensaries table (master AZDHS directory)';
backend/migrations/020_scheduler_orchestrator_fields.sql — new file, 99 lines
@@ -0,0 +1,99 @@
-- =====================================================
-- Scheduler Orchestrator Fields
-- =====================================================
-- Add last_status and last_summary to store_crawl_schedule
-- for meaningful job result tracking.

-- 1. Add new columns to store_crawl_schedule
ALTER TABLE store_crawl_schedule ADD COLUMN IF NOT EXISTS last_status VARCHAR(50);
-- Valid values: 'success', 'error', 'sandbox_only', 'detection_only', 'pending'

ALTER TABLE store_crawl_schedule ADD COLUMN IF NOT EXISTS last_summary TEXT;
-- Human-readable summary like "Detection + Dutchie products crawl (187 items)"

ALTER TABLE store_crawl_schedule ADD COLUMN IF NOT EXISTS last_run_at TIMESTAMPTZ;

ALTER TABLE store_crawl_schedule ADD COLUMN IF NOT EXISTS last_error TEXT;

-- 2. Add orchestrator tracking fields to crawl_jobs
-- (products_new / products_updated are no-ops where migration 015 already added them)
ALTER TABLE crawl_jobs ADD COLUMN IF NOT EXISTS orchestrator_run_id UUID;
ALTER TABLE crawl_jobs ADD COLUMN IF NOT EXISTS detection_result JSONB;
ALTER TABLE crawl_jobs ADD COLUMN IF NOT EXISTS products_new INTEGER;
ALTER TABLE crawl_jobs ADD COLUMN IF NOT EXISTS products_updated INTEGER;

-- 3. Update the crawl_schedule_status view to include new fields
DROP VIEW IF EXISTS crawl_schedule_status;
CREATE OR REPLACE VIEW crawl_schedule_status AS
SELECT
  s.id AS store_id,
  s.name AS store_name,
  s.slug AS store_slug,
  s.timezone,
  s.active,
  s.scrape_enabled,
  s.last_scraped_at,

  -- Dispensary info (master record)
  s.dispensary_id,
  d.name AS dispensary_name,
  d.company_name AS dispensary_company,
  d.city AS dispensary_city,
  d.address AS dispensary_address,
  d.menu_url AS dispensary_menu_url,

  -- Provider intelligence from dispensary (if linked)
  d.product_provider,
  d.product_confidence,
  d.product_crawler_mode,

  -- Schedule settings (use store override or global)
  COALESCE(scs.enabled, TRUE) AS schedule_enabled,
  COALESCE(scs.interval_hours, cs_global.interval_hours, 4) AS interval_hours,
  COALESCE(scs.daily_special_enabled, TRUE) AS daily_special_enabled,
  COALESCE(scs.daily_special_time, '00:01'::TIME) AS daily_special_time,
  COALESCE(scs.priority, 0) AS priority,

  -- Orchestrator status
  scs.last_status,
  scs.last_summary,
  scs.last_run_at AS schedule_last_run,
  scs.last_error,

  -- Next scheduled run calculation
  CASE
    WHEN s.last_scraped_at IS NULL THEN NOW()
    ELSE s.last_scraped_at + (COALESCE(scs.interval_hours, cs_global.interval_hours, 4) || ' hours')::INTERVAL
  END AS next_scheduled_run,

  -- Latest job info
  cj.id AS latest_job_id,
  cj.status AS latest_job_status,
  cj.job_type AS latest_job_type,
  cj.trigger_type AS latest_job_trigger,
  cj.started_at AS latest_job_started,
  cj.completed_at AS latest_job_completed,
  cj.products_found AS latest_products_found,
  cj.products_new AS latest_products_new,
  cj.products_updated AS latest_products_updated,
  cj.error_message AS latest_job_error

FROM stores s
LEFT JOIN dispensaries d ON d.id = s.dispensary_id
LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
LEFT JOIN crawler_schedule cs_global ON cs_global.schedule_type = 'global_interval'
LEFT JOIN LATERAL (
  SELECT * FROM crawl_jobs cj2
  WHERE cj2.store_id = s.id
  ORDER BY cj2.created_at DESC
  LIMIT 1
) cj ON TRUE
WHERE s.active = TRUE;

-- 4. Grant permissions
GRANT SELECT ON crawl_schedule_status TO dutchie;

-- 5. Comments
COMMENT ON COLUMN store_crawl_schedule.last_status IS 'Orchestrator result status: success, error, sandbox_only, detection_only';
COMMENT ON COLUMN store_crawl_schedule.last_summary IS 'Human-readable summary of last orchestrator run';
COMMENT ON COLUMN store_crawl_schedule.last_run_at IS 'When orchestrator last ran for this store';
COMMENT ON COLUMN crawl_jobs.orchestrator_run_id IS 'Groups related jobs from same orchestrator run';
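With orchestrator_run_id on crawl_jobs, every job spawned by one orchestrator pass can be rolled up together. A sketch:

```sql
-- Hypothetical per-run rollup (illustration only, not part of the migration):
SELECT orchestrator_run_id,
       COUNT(*) AS jobs,
       COUNT(*) FILTER (WHERE status = 'failed') AS failed_jobs,
       SUM(products_found) AS products_found
FROM crawl_jobs
WHERE orchestrator_run_id IS NOT NULL
GROUP BY orchestrator_run_id
ORDER BY MIN(created_at) DESC;
```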
@@ -49,7 +49,10 @@ import scraperMonitorRoutes from './routes/scraper-monitor';
 import apiTokensRoutes from './routes/api-tokens';
 import apiPermissionsRoutes from './routes/api-permissions';
 import parallelScrapeRoutes from './routes/parallel-scrape';
+import scheduleRoutes from './routes/schedule';
+import crawlerSandboxRoutes from './routes/crawler-sandbox';
 import { trackApiUsage, checkRateLimit } from './middleware/apiTokenTracker';
+import { startCrawlScheduler } from './services/crawl-scheduler';
 import { validateWordPressPermissions } from './middleware/wordpressPermissions';

 // Apply WordPress permissions validation first (sets req.apiToken)
@@ -75,6 +78,8 @@ app.use('/api/scraper-monitor', scraperMonitorRoutes);
 app.use('/api/api-tokens', apiTokensRoutes);
 app.use('/api/api-permissions', apiPermissionsRoutes);
 app.use('/api/parallel-scrape', parallelScrapeRoutes);
+app.use('/api/schedule', scheduleRoutes);
+app.use('/api/crawler-sandbox', crawlerSandboxRoutes);

 async function startServer() {
   try {
@@ -86,6 +91,10 @@ async function startServer() {
     // Clean up any orphaned proxy test jobs from previous server runs
     await cleanupOrphanedJobs();
+
+    // Start the crawl scheduler (checks every minute for jobs to run)
+    startCrawlScheduler();
+    logger.info('system', 'Crawl scheduler started');

     app.listen(PORT, () => {
       logger.info('system', `Server running on port ${PORT}`);
       console.log(`🚀 Server running on port ${PORT}`);
backend/src/routes/crawler-sandbox.ts — new file, 628 lines
@@ -0,0 +1,628 @@
/**
 * Crawler Sandbox API Routes
 *
 * Endpoints for managing sandbox crawls, templates, and provider detection
 */

import express from 'express';
import { pool } from '../db/migrate';
import { authMiddleware, requireRole } from '../auth/middleware';
import { logger } from '../services/logger';
import {
  runDetectMenuProviderJob,
  runDutchieMenuCrawlJob,
  runSandboxCrawlJob,
} from '../services/crawler-jobs';

const router = express.Router();

// Apply auth middleware to all routes
router.use(authMiddleware);

// ========================================
// Sandbox Entries
// ========================================

/**
 * GET /api/crawler-sandbox
 * List sandbox entries with optional filters
 */
router.get('/', async (req, res) => {
  try {
    const { status, dispensaryId, limit = 50, offset = 0 } = req.query;

    let query = `
      SELECT cs.*, d.name as dispensary_name, d.website, d.menu_provider, d.crawler_status
      FROM crawler_sandboxes cs
      JOIN dispensaries d ON d.id = cs.dispensary_id
      WHERE 1=1
    `;
    const params: any[] = [];
    let paramIndex = 1;

    if (status) {
      query += ` AND cs.status = $${paramIndex}`;
      params.push(status);
      paramIndex++;
    }

    if (dispensaryId) {
      query += ` AND cs.dispensary_id = $${paramIndex}`;
      params.push(Number(dispensaryId));
      paramIndex++;
    }

    query += ` ORDER BY cs.created_at DESC LIMIT $${paramIndex} OFFSET $${paramIndex + 1}`;
    params.push(Number(limit), Number(offset));

    const result = await pool.query(query, params);

    // Get total count (same filters, parameters numbered to match)
    const countResult = await pool.query(
      `SELECT COUNT(*) FROM crawler_sandboxes cs WHERE 1=1
       ${status ? 'AND cs.status = $1' : ''}
       ${dispensaryId ? `AND cs.dispensary_id = $${status ? 2 : 1}` : ''}`,
      status && dispensaryId ? [status, dispensaryId] : status ? [status] : dispensaryId ? [dispensaryId] : []
    );

    res.json({
      sandboxes: result.rows,
      total: parseInt(countResult.rows[0].count),
      limit: Number(limit),
      offset: Number(offset),
    });
  } catch (error: any) {
    logger.error('api', `Get sandboxes error: ${error.message}`);
    res.status(500).json({ error: error.message });
  }
});
/**
 * GET /api/crawler-sandbox/:id
 * Get a single sandbox entry with full details
 */
router.get('/:id', async (req, res) => {
  try {
    const { id } = req.params;

    const result = await pool.query(
      `SELECT cs.*, d.name as dispensary_name, d.website, d.menu_url,
              d.menu_provider, d.menu_provider_confidence, d.crawler_mode, d.crawler_status
       FROM crawler_sandboxes cs
       JOIN dispensaries d ON d.id = cs.dispensary_id
       WHERE cs.id = $1`,
      [id]
    );

    if (result.rows.length === 0) {
      return res.status(404).json({ error: 'Sandbox entry not found' });
    }

    // Get related jobs
    const jobs = await pool.query(
      `SELECT * FROM sandbox_crawl_jobs
       WHERE sandbox_id = $1 OR dispensary_id = $2
       ORDER BY created_at DESC
       LIMIT 10`,
      [id, result.rows[0].dispensary_id]
    );

    res.json({
      sandbox: result.rows[0],
      jobs: jobs.rows,
    });
  } catch (error: any) {
    logger.error('api', `Get sandbox error: ${error.message}`);
    res.status(500).json({ error: error.message });
  }
});
/**
 * POST /api/crawler-sandbox/:id/analyze
 * Trigger re-analysis of a sandbox entry
 */
router.post('/:id/analyze', requireRole('admin'), async (req, res) => {
  try {
    const { id } = req.params;

    const sandbox = await pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [id]);
    if (sandbox.rows.length === 0) {
      return res.status(404).json({ error: 'Sandbox entry not found' });
    }

    // Queue a new sandbox job
    const job = await pool.query(
      `INSERT INTO sandbox_crawl_jobs (dispensary_id, sandbox_id, job_type, status, priority)
       VALUES ($1, $2, 'deep_crawl', 'pending', 20)
       RETURNING id`,
      [sandbox.rows[0].dispensary_id, id]
    );

    // Update sandbox status
    await pool.query(
      `UPDATE crawler_sandboxes SET status = 'pending', updated_at = NOW() WHERE id = $1`,
      [id]
    );

    res.json({
      message: 'Analysis job queued',
      jobId: job.rows[0].id,
    });
  } catch (error: any) {
    logger.error('api', `Analyze sandbox error: ${error.message}`);
    res.status(500).json({ error: error.message });
  }
});
/**
 * POST /api/crawler-sandbox/:id/move-to-production
 * Move a sandbox entry to production (for Dutchie dispensaries)
 */
router.post('/:id/move-to-production', requireRole('admin'), async (req, res) => {
  try {
    const { id } = req.params;

    const sandbox = await pool.query(
      `SELECT cs.*, d.menu_provider
       FROM crawler_sandboxes cs
       JOIN dispensaries d ON d.id = cs.dispensary_id
       WHERE cs.id = $1`,
      [id]
    );

    if (sandbox.rows.length === 0) {
      return res.status(404).json({ error: 'Sandbox entry not found' });
    }

    // Can only move to production if provider is dutchie
    if (sandbox.rows[0].menu_provider !== 'dutchie') {
      return res.status(400).json({
        error: 'Only Dutchie dispensaries can be moved to production currently',
      });
    }

    // Update dispensary to production mode
    await pool.query(
      `UPDATE dispensaries
       SET crawler_mode = 'production', crawler_status = 'idle', updated_at = NOW()
       WHERE id = $1`,
      [sandbox.rows[0].dispensary_id]
    );

    // Mark sandbox as moved
    await pool.query(
      `UPDATE crawler_sandboxes
       SET status = 'moved_to_production', updated_at = NOW()
       WHERE id = $1`,
      [id]
    );

    res.json({ message: 'Dispensary moved to production' });
  } catch (error: any) {
    logger.error('api', `Move to production error: ${error.message}`);
    res.status(500).json({ error: error.message });
  }
});
/**
 * PATCH /api/crawler-sandbox/:id
 * Update sandbox entry (e.g., add human review notes)
 */
router.patch('/:id', requireRole('admin'), async (req, res) => {
  try {
    const { id } = req.params;
    const { human_review_notes, status, suspected_menu_provider } = req.body;

    const updates: string[] = [];
    const params: any[] = [];
    let paramIndex = 1;

    if (human_review_notes !== undefined) {
      updates.push(`human_review_notes = $${paramIndex}`);
      params.push(human_review_notes);
      paramIndex++;
    }

    if (status) {
      updates.push(`status = $${paramIndex}`);
      params.push(status);
      paramIndex++;
    }

    if (suspected_menu_provider !== undefined) {
      updates.push(`suspected_menu_provider = $${paramIndex}`);
      params.push(suspected_menu_provider);
      paramIndex++;
    }

    if (updates.length === 0) {
      return res.status(400).json({ error: 'No updates provided' });
    }

    updates.push('updated_at = NOW()');
    if (human_review_notes !== undefined) {
      updates.push('reviewed_at = NOW()');
    }

    params.push(id);
    await pool.query(
      `UPDATE crawler_sandboxes SET ${updates.join(', ')} WHERE id = $${paramIndex}`,
      params
    );

    res.json({ message: 'Sandbox updated' });
  } catch (error: any) {
    logger.error('api', `Update sandbox error: ${error.message}`);
    res.status(500).json({ error: error.message });
  }
});
// ========================================
// Templates
// ========================================

/**
 * GET /api/crawler-sandbox/templates/list
 * List all crawler templates
 * (registered under /templates/list so the path cannot collide with GET /:id)
 */
router.get('/templates/list', async (req, res) => {
  try {
    const result = await pool.query(
      `SELECT * FROM crawler_templates ORDER BY provider, is_default_for_provider DESC, name`
    );
    res.json({ templates: result.rows });
  } catch (error: any) {
    logger.error('api', `Get templates error: ${error.message}`);
    res.status(500).json({ error: error.message });
  }
});

/**
 * GET /api/crawler-sandbox/templates/:id
 * Get a single template
 */
router.get('/templates/:id', async (req, res) => {
  try {
    const { id } = req.params;
    const result = await pool.query('SELECT * FROM crawler_templates WHERE id = $1', [id]);

    if (result.rows.length === 0) {
      return res.status(404).json({ error: 'Template not found' });
    }

    res.json({ template: result.rows[0] });
  } catch (error: any) {
    logger.error('api', `Get template error: ${error.message}`);
    res.status(500).json({ error: error.message });
  }
});
/**
 * POST /api/crawler-sandbox/templates
 * Create a new template
 */
router.post('/templates', requireRole('admin'), async (req, res) => {
  try {
    const {
      provider,
      name,
      selector_config,
      navigation_config,
      transform_config,
      validation_rules,
      notes,
    } = req.body;

    if (!provider || !name) {
      return res.status(400).json({ error: 'provider and name are required' });
    }

    const result = await pool.query(
      `INSERT INTO crawler_templates
       (provider, name, selector_config, navigation_config, transform_config, validation_rules, notes, created_by)
       VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
       RETURNING *`,
      [
        provider,
        name,
        JSON.stringify(selector_config || {}),
        JSON.stringify(navigation_config || {}),
        JSON.stringify(transform_config || {}),
        JSON.stringify(validation_rules || {}),
        notes,
        (req as any).user?.email || 'system',
      ]
    );

    res.status(201).json({ template: result.rows[0] });
  } catch (error: any) {
    logger.error('api', `Create template error: ${error.message}`);
    res.status(500).json({ error: error.message });
  }
});
/**
|
||||||
|
* PUT /api/crawler-sandbox/templates/:id
|
||||||
|
* Update a template
|
||||||
|
*/
|
||||||
|
router.put('/templates/:id', requireRole('admin'), async (req, res) => {
|
||||||
|
try {
|
||||||
|
const { id } = req.params;
|
||||||
|
const {
|
||||||
|
is_active,
|
||||||
|
is_default_for_provider,
|
||||||
|
selector_config,
|
||||||
|
navigation_config,
|
||||||
|
transform_config,
|
||||||
|
validation_rules,
|
||||||
|
notes,
|
||||||
|
} = req.body;
|
||||||
|
|
||||||
|
const updates: string[] = [];
|
||||||
|
const params: any[] = [];
|
||||||
|
let paramIndex = 1;
|
||||||
|
|
||||||
|
if (is_active !== undefined) {
|
||||||
|
updates.push(`is_active = $${paramIndex}`);
|
||||||
|
params.push(is_active);
|
||||||
|
paramIndex++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (is_default_for_provider !== undefined) {
|
||||||
|
updates.push(`is_default_for_provider = $${paramIndex}`);
|
||||||
|
params.push(is_default_for_provider);
|
||||||
|
paramIndex++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (selector_config !== undefined) {
|
||||||
|
updates.push(`selector_config = $${paramIndex}`);
|
||||||
|
params.push(JSON.stringify(selector_config));
|
||||||
|
paramIndex++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (navigation_config !== undefined) {
|
||||||
|
updates.push(`navigation_config = $${paramIndex}`);
|
||||||
|
params.push(JSON.stringify(navigation_config));
|
||||||
|
paramIndex++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (transform_config !== undefined) {
|
||||||
|
updates.push(`transform_config = $${paramIndex}`);
|
||||||
|
params.push(JSON.stringify(transform_config));
|
||||||
|
paramIndex++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (validation_rules !== undefined) {
|
||||||
|
updates.push(`validation_rules = $${paramIndex}`);
|
||||||
|
params.push(JSON.stringify(validation_rules));
|
||||||
|
paramIndex++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (notes !== undefined) {
|
||||||
|
updates.push(`notes = $${paramIndex}`);
|
||||||
|
params.push(notes);
|
||||||
|
paramIndex++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (updates.length === 0) {
|
||||||
|
return res.status(400).json({ error: 'No updates provided' });
|
||||||
|
}
|
||||||
|
|
||||||
|
updates.push('updated_at = NOW()');
|
||||||
|
params.push(id);
|
||||||
|
|
||||||
|
await pool.query(
|
||||||
|
`UPDATE crawler_templates SET ${updates.join(', ')} WHERE id = $${paramIndex}`,
|
||||||
|
params
|
||||||
|
);
|
||||||
|
|
||||||
|
const result = await pool.query('SELECT * FROM crawler_templates WHERE id = $1', [id]);
|
||||||
|
res.json({ template: result.rows[0] });
|
||||||
|
} catch (error: any) {
|
||||||
|
logger.error('api', `Update template error: ${error.message}`);
|
||||||
|
res.status(500).json({ error: error.message });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
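// Illustrative example (not part of the route logic above): a request body of
// { is_active: true, notes: 'tuned selectors' } builds
//   UPDATE crawler_templates SET is_active = $1, notes = $2, updated_at = NOW() WHERE id = $3
// with params [true, 'tuned selectors', id]; paramIndex ends at 3, matching the WHERE placeholder.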

// ========================================
// Jobs
// ========================================

/**
 * GET /api/crawler-sandbox/jobs/list
 * List sandbox crawl jobs
 */
router.get('/jobs/list', async (req, res) => {
  try {
    const { status, dispensaryId, limit = 50 } = req.query;

    let query = `
      SELECT sj.*, d.name as dispensary_name
      FROM sandbox_crawl_jobs sj
      JOIN dispensaries d ON d.id = sj.dispensary_id
      WHERE 1=1
    `;
    const params: any[] = [];
    let paramIndex = 1;

    if (status) {
      query += ` AND sj.status = $${paramIndex}`;
      params.push(status);
      paramIndex++;
    }

    if (dispensaryId) {
      query += ` AND sj.dispensary_id = $${paramIndex}`;
      params.push(Number(dispensaryId));
      paramIndex++;
    }

    query += ` ORDER BY sj.created_at DESC LIMIT $${paramIndex}`;
    params.push(Number(limit));

    const result = await pool.query(query, params);
    res.json({ jobs: result.rows });
  } catch (error: any) {
    logger.error('api', `Get jobs error: ${error.message}`);
    res.status(500).json({ error: error.message });
  }
});

/**
 * POST /api/crawler-sandbox/jobs/detect/:dispensaryId
 * Trigger provider detection for a dispensary
 */
router.post('/jobs/detect/:dispensaryId', requireRole('admin'), async (req, res) => {
  try {
    const { dispensaryId } = req.params;

    // Create detection job
    const job = await pool.query(
      `INSERT INTO sandbox_crawl_jobs (dispensary_id, job_type, status, priority)
       VALUES ($1, 'detection', 'pending', 30)
       RETURNING id`,
      [dispensaryId]
    );

    // Update dispensary status
    await pool.query(
      `UPDATE dispensaries SET crawler_status = 'queued_detection', updated_at = NOW() WHERE id = $1`,
      [dispensaryId]
    );

    res.json({
      message: 'Detection job queued',
      jobId: job.rows[0].id,
    });
  } catch (error: any) {
    logger.error('api', `Queue detection error: ${error.message}`);
    res.status(500).json({ error: error.message });
  }
});

/**
 * POST /api/crawler-sandbox/jobs/run/:id
 * Immediately run a sandbox job
 */
router.post('/jobs/run/:id', requireRole('admin'), async (req, res) => {
  try {
    const { id } = req.params;

    const job = await pool.query('SELECT * FROM sandbox_crawl_jobs WHERE id = $1', [id]);
    if (job.rows.length === 0) {
      return res.status(404).json({ error: 'Job not found' });
    }

    const jobData = job.rows[0];

    // Run the job immediately
    let result;
    if (jobData.job_type === 'detection') {
      result = await runDetectMenuProviderJob(jobData.dispensary_id);
    } else {
      result = await runSandboxCrawlJob(jobData.dispensary_id, jobData.sandbox_id);
    }

    // Update job status
    await pool.query(
      `UPDATE sandbox_crawl_jobs
       SET status = $1, completed_at = NOW(), result_summary = $2, error_message = $3
       WHERE id = $4`,
      [
        result.success ? 'completed' : 'failed',
        JSON.stringify(result.data || {}),
        result.success ? null : result.message,
        id,
      ]
    );

    res.json(result);
  } catch (error: any) {
    logger.error('api', `Run job error: ${error.message}`);
    res.status(500).json({ error: error.message });
  }
});

// ========================================
// Stats
// ========================================

/**
 * GET /api/crawler-sandbox/stats/overview
 * Get sandbox/crawler statistics
 */
router.get('/stats/overview', async (req, res) => {
  try {
    // Dispensary provider stats
    const providerStats = await pool.query(`
      SELECT
        menu_provider,
        COUNT(*) as count,
        AVG(menu_provider_confidence)::integer as avg_confidence
      FROM dispensaries
      WHERE menu_provider IS NOT NULL
      GROUP BY menu_provider
      ORDER BY count DESC
    `);

    // Mode stats
    const modeStats = await pool.query(`
      SELECT
        crawler_mode,
        COUNT(*) as count
      FROM dispensaries
      GROUP BY crawler_mode
    `);

    // Status stats
    const statusStats = await pool.query(`
      SELECT
        crawler_status,
        COUNT(*) as count
      FROM dispensaries
      GROUP BY crawler_status
      ORDER BY count DESC
    `);

    // Sandbox stats
    const sandboxStats = await pool.query(`
      SELECT
        status,
        COUNT(*) as count
      FROM crawler_sandboxes
      GROUP BY status
    `);

    // Job stats
    const jobStats = await pool.query(`
      SELECT
        status,
        job_type,
        COUNT(*) as count
      FROM sandbox_crawl_jobs
      GROUP BY status, job_type
    `);

    // Recent activity
    const recentActivity = await pool.query(`
      SELECT 'sandbox' as type, id, dispensary_id, status, created_at
      FROM crawler_sandboxes
      ORDER BY created_at DESC
      LIMIT 5
    `);

    res.json({
      providers: providerStats.rows,
      modes: modeStats.rows,
      statuses: statusStats.rows,
      sandbox: sandboxStats.rows,
      jobs: jobStats.rows,
      recentActivity: recentActivity.rows,
    });
  } catch (error: any) {
    logger.error('api', `Get stats error: ${error.message}`);
    res.status(500).json({ error: error.message });
  }
});

export default router;
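A minimal client sketch for the sandbox endpoints above, assuming the router is
mounted at /api/crawler-sandbox (as the JSDoc paths suggest); `baseUrl` and
`authHeader` are hypothetical caller-supplied values:

    // sketch: queue detection for one dispensary, then fetch its recent jobs
    async function queueAndWatch(baseUrl: string, authHeader: string, dispensaryId: number) {
      const headers = { Authorization: authHeader, 'Content-Type': 'application/json' };
      const queued = await fetch(`${baseUrl}/api/crawler-sandbox/jobs/detect/${dispensaryId}`, {
        method: 'POST',
        headers,
      }).then(r => r.json());
      console.log(queued.message, queued.jobId);

      const { jobs } = await fetch(
        `${baseUrl}/api/crawler-sandbox/jobs/list?dispensaryId=${dispensaryId}&limit=5`,
        { headers }
      ).then(r => r.json());
      return jobs;
    }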
344
backend/src/routes/schedule.ts
Normal file
@@ -0,0 +1,344 @@
import { Router, Request, Response } from 'express';
import { authMiddleware, requireRole } from '../auth/middleware';
import {
  getGlobalSchedule,
  updateGlobalSchedule,
  getStoreScheduleStatuses,
  getStoreSchedule,
  updateStoreSchedule,
  getAllRecentJobs,
  getRecentJobs,
  triggerManualCrawl,
  triggerAllStoresCrawl,
  cancelJob,
  restartCrawlScheduler,
  setSchedulerMode,
  getSchedulerMode,
} from '../services/crawl-scheduler';
import {
  runStoreCrawlOrchestrator,
  runBatchOrchestrator,
  getStoresDueForOrchestration,
} from '../services/store-crawl-orchestrator';

const router = Router();
router.use(authMiddleware);

// ============================================
// Global Schedule Endpoints
// ============================================

/**
 * GET /api/schedule/global
 * Get global schedule settings
 */
router.get('/global', async (req: Request, res: Response) => {
  try {
    const schedules = await getGlobalSchedule();
    res.json({ schedules });
  } catch (error: any) {
    console.error('Error fetching global schedule:', error);
    res.status(500).json({ error: 'Failed to fetch global schedule' });
  }
});

/**
 * PUT /api/schedule/global/:type
 * Update a global schedule setting
 */
router.put('/global/:type', requireRole('superadmin', 'admin'), async (req: Request, res: Response) => {
  try {
    const { type } = req.params;
    const { enabled, interval_hours, run_time } = req.body;

    if (type !== 'global_interval' && type !== 'daily_special') {
      return res.status(400).json({ error: 'Invalid schedule type' });
    }

    const schedule = await updateGlobalSchedule(type, {
      enabled,
      interval_hours,
      run_time
    });

    // Restart scheduler to apply changes
    await restartCrawlScheduler();

    res.json({ schedule, message: 'Schedule updated and scheduler restarted' });
  } catch (error: any) {
    console.error('Error updating global schedule:', error);
    res.status(500).json({ error: 'Failed to update global schedule' });
  }
});

// ============================================
// Store Schedule Endpoints
// ============================================

/**
 * GET /api/schedule/stores
 * Get all store schedule statuses
 */
router.get('/stores', async (req: Request, res: Response) => {
  try {
    const stores = await getStoreScheduleStatuses();
    res.json({ stores });
  } catch (error: any) {
    console.error('Error fetching store schedules:', error);
    res.status(500).json({ error: 'Failed to fetch store schedules' });
  }
});

/**
 * GET /api/schedule/stores/:storeId
 * Get schedule for a specific store
 */
router.get('/stores/:storeId', async (req: Request, res: Response) => {
  try {
    const storeId = parseInt(req.params.storeId);
    if (isNaN(storeId)) {
      return res.status(400).json({ error: 'Invalid store ID' });
    }

    const schedule = await getStoreSchedule(storeId);
    res.json({ schedule });
  } catch (error: any) {
    console.error('Error fetching store schedule:', error);
    res.status(500).json({ error: 'Failed to fetch store schedule' });
  }
});

/**
 * PUT /api/schedule/stores/:storeId
 * Update schedule for a specific store
 */
router.put('/stores/:storeId', requireRole('superadmin', 'admin'), async (req: Request, res: Response) => {
  try {
    const storeId = parseInt(req.params.storeId);
    if (isNaN(storeId)) {
      return res.status(400).json({ error: 'Invalid store ID' });
    }

    const {
      enabled,
      interval_hours,
      daily_special_enabled,
      daily_special_time,
      priority
    } = req.body;

    const schedule = await updateStoreSchedule(storeId, {
      enabled,
      interval_hours,
      daily_special_enabled,
      daily_special_time,
      priority
    });

    res.json({ schedule });
  } catch (error: any) {
    console.error('Error updating store schedule:', error);
    res.status(500).json({ error: 'Failed to update store schedule' });
  }
});

// ============================================
// Job Queue Endpoints
// ============================================

/**
 * GET /api/schedule/jobs
 * Get recent jobs
 */
router.get('/jobs', async (req: Request, res: Response) => {
  try {
    const limit = parseInt(req.query.limit as string) || 50;
    const jobs = await getAllRecentJobs(Math.min(limit, 200));
    res.json({ jobs });
  } catch (error: any) {
    console.error('Error fetching jobs:', error);
    res.status(500).json({ error: 'Failed to fetch jobs' });
  }
});

/**
 * GET /api/schedule/jobs/store/:storeId
 * Get recent jobs for a specific store
 */
router.get('/jobs/store/:storeId', async (req: Request, res: Response) => {
  try {
    const storeId = parseInt(req.params.storeId);
    if (isNaN(storeId)) {
      return res.status(400).json({ error: 'Invalid store ID' });
    }

    const limit = parseInt(req.query.limit as string) || 10;
    const jobs = await getRecentJobs(storeId, Math.min(limit, 100));
    res.json({ jobs });
  } catch (error: any) {
    console.error('Error fetching store jobs:', error);
    res.status(500).json({ error: 'Failed to fetch store jobs' });
  }
});

/**
 * POST /api/schedule/jobs/:jobId/cancel
 * Cancel a pending job
 */
router.post('/jobs/:jobId/cancel', requireRole('superadmin', 'admin'), async (req: Request, res: Response) => {
  try {
    const jobId = parseInt(req.params.jobId);
    if (isNaN(jobId)) {
      return res.status(400).json({ error: 'Invalid job ID' });
    }

    const cancelled = await cancelJob(jobId);
    if (cancelled) {
      res.json({ success: true, message: 'Job cancelled' });
    } else {
      res.status(400).json({ error: 'Job could not be cancelled (may not be pending)' });
    }
  } catch (error: any) {
    console.error('Error cancelling job:', error);
    res.status(500).json({ error: 'Failed to cancel job' });
  }
});

// ============================================
// Manual Trigger Endpoints
// ============================================

/**
 * POST /api/schedule/trigger/store/:storeId
 * Manually trigger orchestrated crawl for a specific store.
 * Uses the intelligent orchestrator which:
 * - Checks provider detection status
 * - Runs detection if needed
 * - Queues appropriate crawl type (production/sandbox)
 */
router.post('/trigger/store/:storeId', requireRole('superadmin', 'admin'), async (req: Request, res: Response) => {
  try {
    const storeId = parseInt(req.params.storeId);
    if (isNaN(storeId)) {
      return res.status(400).json({ error: 'Invalid store ID' });
    }

    // Use the orchestrator instead of simple triggerManualCrawl
    const result = await runStoreCrawlOrchestrator(storeId);

    res.json({
      result,
      message: result.summary,
      success: result.status === 'success' || result.status === 'sandbox_only',
    });
  } catch (error: any) {
    console.error('Error triggering orchestrated crawl:', error);
    res.status(500).json({ error: 'Failed to trigger crawl' });
  }
});

/**
 * POST /api/schedule/trigger/store/:storeId/legacy
 * Legacy: simple job queue trigger (no orchestration)
 */
router.post('/trigger/store/:storeId/legacy', requireRole('superadmin', 'admin'), async (req: Request, res: Response) => {
  try {
    const storeId = parseInt(req.params.storeId);
    if (isNaN(storeId)) {
      return res.status(400).json({ error: 'Invalid store ID' });
    }

    const job = await triggerManualCrawl(storeId);
    res.json({ job, message: 'Crawl job created' });
  } catch (error: any) {
    console.error('Error triggering manual crawl:', error);
    res.status(500).json({ error: 'Failed to trigger crawl' });
  }
});

/**
 * POST /api/schedule/trigger/all
 * Manually trigger crawls for all stores
 */
router.post('/trigger/all', requireRole('superadmin', 'admin'), async (req: Request, res: Response) => {
  try {
    const jobsCreated = await triggerAllStoresCrawl();
    res.json({ jobs_created: jobsCreated, message: `Created ${jobsCreated} crawl jobs` });
  } catch (error: any) {
    console.error('Error triggering all crawls:', error);
    res.status(500).json({ error: 'Failed to trigger crawls' });
  }
});

/**
 * POST /api/schedule/restart
 * Restart the scheduler
 */
router.post('/restart', requireRole('superadmin', 'admin'), async (req: Request, res: Response) => {
  try {
    await restartCrawlScheduler();
    res.json({ message: 'Scheduler restarted', mode: getSchedulerMode() });
  } catch (error: any) {
    console.error('Error restarting scheduler:', error);
    res.status(500).json({ error: 'Failed to restart scheduler' });
  }
});

// ============================================
// Scheduler Mode Endpoints
// ============================================

/**
 * GET /api/schedule/mode
 * Get current scheduler mode
 */
router.get('/mode', async (req: Request, res: Response) => {
  try {
    const mode = getSchedulerMode();
    res.json({ mode });
  } catch (error: any) {
    console.error('Error getting scheduler mode:', error);
    res.status(500).json({ error: 'Failed to get scheduler mode' });
  }
});

/**
 * PUT /api/schedule/mode
 * Set scheduler mode (legacy or orchestrator)
 */
router.put('/mode', requireRole('superadmin', 'admin'), async (req: Request, res: Response) => {
  try {
    const { mode } = req.body;

    if (mode !== 'legacy' && mode !== 'orchestrator') {
      return res.status(400).json({ error: 'Invalid mode. Must be "legacy" or "orchestrator"' });
    }

    setSchedulerMode(mode);

    // Restart scheduler with new mode
    await restartCrawlScheduler();

    res.json({ mode, message: `Scheduler mode set to ${mode} and restarted` });
  } catch (error: any) {
    console.error('Error setting scheduler mode:', error);
    res.status(500).json({ error: 'Failed to set scheduler mode' });
  }
});

/**
 * GET /api/schedule/due
 * Get stores that are due for orchestration
 */
router.get('/due', async (req: Request, res: Response) => {
  try {
    const limit = parseInt(req.query.limit as string) || 10;
    const storeIds = await getStoresDueForOrchestration(Math.min(limit, 50));
    res.json({ stores_due: storeIds, count: storeIds.length });
  } catch (error: any) {
    console.error('Error getting stores due for orchestration:', error);
    res.status(500).json({ error: 'Failed to get stores due' });
  }
});

export default router;
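A minimal wiring sketch for the router above; the mount point follows the JSDoc
route paths, while the app entry point, import path, and port are assumptions:

    import express from 'express';
    import scheduleRouter from './routes/schedule';

    const app = express();
    app.use(express.json());
    // All routes become /api/schedule/global, /api/schedule/stores, /api/schedule/due, etc.
    app.use('/api/schedule', scheduleRouter);
    app.listen(3000);

An admin sweep could then GET /api/schedule/due and POST /api/schedule/trigger/store/:id
for each returned store ID, which is exactly what the orchestrator-mode scheduler automates.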
345
backend/src/scripts/backfill-store-dispensary.ts
Normal file
@@ -0,0 +1,345 @@
#!/usr/bin/env npx tsx
/**
 * Backfill Store-Dispensary Mapping
 *
 * Links existing stores (scheduler) to dispensaries (master AZDHS directory)
 * by matching on name, company name, or slug similarity.
 *
 * Usage:
 *   npx tsx src/scripts/backfill-store-dispensary.ts             # Preview matches
 *   npx tsx src/scripts/backfill-store-dispensary.ts --apply     # Apply matches
 *   npx tsx src/scripts/backfill-store-dispensary.ts --verbose   # Show all match details
 */

import { pool } from '../db/migrate';
import { logger } from '../services/logger';

const args = process.argv.slice(2);
const flags = {
  apply: args.includes('--apply'),
  verbose: args.includes('--verbose'),
  help: args.includes('--help') || args.includes('-h'),
};

interface Store {
  id: number;
  name: string;
  slug: string;
  dispensary_id: number | null;
}

interface Dispensary {
  id: number;
  name: string;
  company_name: string | null;
  city: string;
  address: string;
  slug: string;
}

interface MatchResult {
  store: Store;
  dispensary: Dispensary | null;
  matchType: 'exact_name' | 'normalized_name' | 'company_name' | 'slug' | 'fuzzy' | 'none';
  score: number;
}

/**
 * Normalize a store/dispensary name for comparison.
 * Removes common suffixes, punctuation, and extra whitespace.
 */
function normalizeName(name: string): string {
  return name
    .toLowerCase()
    .replace(/\s*[-–—]\s*/g, ' ') // Normalize dashes to spaces
    .replace(/\s*(dispensary|cannabis|marijuana|weed|shop|store|llc|inc)\s*/gi, ' ')
    .replace(/[’']/g, "'") // Normalize apostrophes
    .replace(/[^\w\s']/g, '') // Remove other punctuation
    .replace(/\s+/g, ' ') // Collapse whitespace
    .trim();
}
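// Illustrative example: normalizeName('Desert Bloom Dispensary, LLC')
//   → 'desert bloom' (lowercased, the "dispensary" and "llc" suffixes stripped,
//     the comma removed, and whitespace collapsed).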

/**
 * Simple Levenshtein distance for fuzzy matching
 */
function levenshteinDistance(a: string, b: string): number {
  const matrix: number[][] = [];

  for (let i = 0; i <= b.length; i++) {
    matrix[i] = [i];
  }
  for (let j = 0; j <= a.length; j++) {
    matrix[0][j] = j;
  }

  for (let i = 1; i <= b.length; i++) {
    for (let j = 1; j <= a.length; j++) {
      if (b.charAt(i - 1) === a.charAt(j - 1)) {
        matrix[i][j] = matrix[i - 1][j - 1];
      } else {
        matrix[i][j] = Math.min(
          matrix[i - 1][j - 1] + 1, // substitution
          matrix[i][j - 1] + 1,     // insertion
          matrix[i - 1][j] + 1      // deletion
        );
      }
    }
  }

  return matrix[b.length][a.length];
}

/**
 * Calculate similarity score (0-100)
 */
function similarityScore(a: string, b: string): number {
  const maxLen = Math.max(a.length, b.length);
  if (maxLen === 0) return 100;
  const distance = levenshteinDistance(a, b);
  return Math.round((1 - distance / maxLen) * 100);
}
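// Illustrative example: similarityScore('harvest hoc', 'harvest')
//   maxLen = 11, levenshteinDistance = 4 (four deletions),
//   score = Math.round((1 - 4 / 11) * 100) = 64.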

/**
 * Find the best dispensary match for a store
 */
function findBestMatch(store: Store, dispensaries: Dispensary[]): MatchResult {
  const normalizedStoreName = normalizeName(store.name);
  const storeSlug = store.slug.toLowerCase();

  let bestMatch: MatchResult = {
    store,
    dispensary: null,
    matchType: 'none',
    score: 0,
  };

  for (const disp of dispensaries) {
    const normalizedDispName = normalizeName(disp.name);
    const normalizedCompanyName = disp.company_name ? normalizeName(disp.company_name) : '';
    const dispSlug = disp.slug.toLowerCase();

    // 1. Exact name match (case-insensitive)
    if (store.name.toLowerCase() === disp.name.toLowerCase()) {
      return {
        store,
        dispensary: disp,
        matchType: 'exact_name',
        score: 100,
      };
    }

    // 2. Normalized name match
    if (normalizedStoreName === normalizedDispName) {
      return {
        store,
        dispensary: disp,
        matchType: 'normalized_name',
        score: 95,
      };
    }

    // 3. Store name matches company name
    if (normalizedCompanyName && normalizedStoreName === normalizedCompanyName) {
      return {
        store,
        dispensary: disp,
        matchType: 'company_name',
        score: 90,
      };
    }

    // 4. Slug match
    if (storeSlug === dispSlug) {
      return {
        store,
        dispensary: disp,
        matchType: 'slug',
        score: 85,
      };
    }

    // 5. Fuzzy matching (only kept if score >= 70)
    const nameScore = similarityScore(normalizedStoreName, normalizedDispName);
    const companyScore = normalizedCompanyName
      ? similarityScore(normalizedStoreName, normalizedCompanyName)
      : 0;
    const fuzzyScore = Math.max(nameScore, companyScore);

    if (fuzzyScore > bestMatch.score && fuzzyScore >= 70) {
      bestMatch = {
        store,
        dispensary: disp,
        matchType: 'fuzzy',
        score: fuzzyScore,
      };
    }
  }

  return bestMatch;
}

async function main() {
  if (flags.help) {
    console.log(`
Backfill Store-Dispensary Mapping

Links existing stores (scheduler) to dispensaries (master AZDHS directory)
by matching on name, company name, or slug similarity.

USAGE:
  npx tsx src/scripts/backfill-store-dispensary.ts [OPTIONS]

OPTIONS:
  --apply      Apply the mappings to the database (default: preview only)
  --verbose    Show detailed match information for all stores
  --help, -h   Show this help message

EXAMPLES:
  # Preview what would be matched
  npx tsx src/scripts/backfill-store-dispensary.ts

  # Apply the mappings
  npx tsx src/scripts/backfill-store-dispensary.ts --apply

  # Show verbose output
  npx tsx src/scripts/backfill-store-dispensary.ts --verbose
`);
    process.exit(0);
  }

  console.log('\n📦 Backfill Store-Dispensary Mapping');
  console.log('=====================================\n');

  try {
    // Fetch all stores without a dispensary_id
    const storesResult = await pool.query<Store>(`
      SELECT id, name, slug, dispensary_id
      FROM stores
      WHERE dispensary_id IS NULL
      ORDER BY name
    `);
    const unmappedStores = storesResult.rows;

    // Fetch all already-mapped stores for context
    const mappedResult = await pool.query<Store>(`
      SELECT id, name, slug, dispensary_id
      FROM stores
      WHERE dispensary_id IS NOT NULL
      ORDER BY name
    `);
    const mappedStores = mappedResult.rows;

    // Fetch all dispensaries
    const dispResult = await pool.query<Dispensary>(`
      SELECT id, name, company_name, city, address, slug
      FROM dispensaries
      ORDER BY name
    `);
    const dispensaries = dispResult.rows;

    console.log(`📊 Current Status:`);
    console.log(`   Stores without dispensary_id: ${unmappedStores.length}`);
    console.log(`   Stores already mapped: ${mappedStores.length}`);
    console.log(`   Total dispensaries: ${dispensaries.length}\n`);

    if (unmappedStores.length === 0) {
      console.log('✅ All stores are already mapped to dispensaries!\n');
      await pool.end();
      process.exit(0);
    }

    // Find matches for each unmapped store
    const matches: MatchResult[] = [];
    const noMatches: Store[] = [];

    for (const store of unmappedStores) {
      const match = findBestMatch(store, dispensaries);
      if (match.dispensary) {
        matches.push(match);
      } else {
        noMatches.push(store);
      }
    }

    // Sort matches by score (highest first)
    matches.sort((a, b) => b.score - a.score);

    // Display results
    console.log(`\n🔗 Matches Found: ${matches.length}`);
    console.log('----------------------------------\n');

    if (matches.length > 0) {
      // Group by match type
      const byType: Record<string, MatchResult[]> = {};
      for (const m of matches) {
        if (!byType[m.matchType]) byType[m.matchType] = [];
        byType[m.matchType].push(m);
      }

      const typeLabels: Record<string, string> = {
        exact_name: '✅ Exact Name Match',
        normalized_name: '✅ Normalized Name Match',
        company_name: '🏢 Company Name Match',
        slug: '🔗 Slug Match',
        fuzzy: '🔍 Fuzzy Match',
      };

      for (const [type, results] of Object.entries(byType)) {
        console.log(`${typeLabels[type]} (${results.length}):`);
        for (const r of results) {
          const dispInfo = r.dispensary!;
          console.log(`   • "${r.store.name}" → "${dispInfo.name}" (${dispInfo.city}) [${r.score}%]`);
        }
        console.log('');
      }
    }

    if (noMatches.length > 0) {
      console.log(`\n❌ No Match Found: ${noMatches.length}`);
      console.log('----------------------------------\n');
      for (const store of noMatches) {
        console.log(`   • "${store.name}" (slug: ${store.slug})`);
      }
      console.log('');
    }

    // Apply if requested
    if (flags.apply && matches.length > 0) {
      console.log('\n🔧 Applying mappings...\n');

      let updated = 0;
      for (const match of matches) {
        if (!match.dispensary) continue;

        await pool.query(
          'UPDATE stores SET dispensary_id = $1 WHERE id = $2',
          [match.dispensary.id, match.store.id]
        );
        updated++;

        if (flags.verbose) {
          console.log(`   ✓ Linked store ${match.store.id} to dispensary ${match.dispensary.id}`);
        }
      }

      console.log(`\n✅ Updated ${updated} stores with dispensary mappings\n`);
      logger.info('system', `Backfill complete: linked ${updated} stores to dispensaries`);
    } else if (matches.length > 0 && !flags.apply) {
      console.log('\n💡 Run with --apply to update the database\n');
    }

    // Summary
    console.log('📈 Summary:');
    console.log(`   Would match: ${matches.length} stores`);
    console.log(`   No match: ${noMatches.length} stores`);
    console.log(`   Match rate: ${Math.round((matches.length / unmappedStores.length) * 100)}%\n`);

  } catch (error) {
    console.error('Error:', error);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main().catch(console.error);
424
backend/src/scripts/queue-dispensaries.ts
Normal file
@@ -0,0 +1,424 @@
#!/usr/bin/env npx tsx
/**
 * Queue Dispensaries Script
 *
 * Orchestrates the multi-provider crawler system:
 *   1. Queue dispensaries that need provider detection
 *   2. Queue Dutchie dispensaries for production crawls
 *   3. Queue sandbox dispensaries for learning crawls
 *
 * Usage:
 *   npx tsx src/scripts/queue-dispensaries.ts [--detection] [--production] [--sandbox] [--all]
 *   npx tsx src/scripts/queue-dispensaries.ts --dry-run
 *   npx tsx src/scripts/queue-dispensaries.ts --process   # Process queued jobs
 */

import { pool } from '../db/migrate';
import { logger } from '../services/logger';
import {
  runDetectMenuProviderJob,
  runDutchieMenuCrawlJob,
  runSandboxCrawlJob,
  processSandboxJobs,
} from '../services/crawler-jobs';

// Parse command-line args
const args = process.argv.slice(2);
const flags = {
  detection: args.includes('--detection') || args.includes('--all'),
  production: args.includes('--production') || args.includes('--all'),
  sandbox: args.includes('--sandbox') || args.includes('--all'),
  dryRun: args.includes('--dry-run'),
  process: args.includes('--process'),
  help: args.includes('--help') || args.includes('-h'),
  limit: parseInt(args.find(a => a.startsWith('--limit='))?.split('=')[1] || '10'),
};

// If no specific flags, default to all
if (!flags.detection && !flags.production && !flags.sandbox && !flags.process) {
  flags.detection = true;
  flags.production = true;
  flags.sandbox = true;
}
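// Illustrative example: `--detection --limit=25` yields
//   flags = { detection: true, production: false, sandbox: false,
//             dryRun: false, process: false, help: false, limit: 25 }
// and the default-to-all block above is skipped because --detection was given.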

async function showHelp() {
  console.log(`
Queue Dispensaries - Multi-Provider Crawler Orchestration

USAGE:
  npx tsx src/scripts/queue-dispensaries.ts [OPTIONS]

OPTIONS:
  --detection    Queue dispensaries that need provider detection
  --production   Queue Dutchie production crawls
  --sandbox      Queue sandbox/learning crawls
  --all          Queue all job types (default if no specific flag)
  --process      Process queued jobs instead of just queuing
  --dry-run      Show what would be queued without making changes
  --limit=N      Maximum dispensaries to queue per type (default: 10)
  --help, -h     Show this help message

EXAMPLES:
  # Queue all dispensaries for appropriate jobs
  npx tsx src/scripts/queue-dispensaries.ts

  # Only queue detection jobs
  npx tsx src/scripts/queue-dispensaries.ts --detection --limit=20

  # Dry run to see what would be queued
  npx tsx src/scripts/queue-dispensaries.ts --dry-run

  # Process sandbox jobs
  npx tsx src/scripts/queue-dispensaries.ts --process
`);
}

async function queueDetectionJobs(): Promise<number> {
  console.log('\n📡 Queueing Detection Jobs...');

  // Find dispensaries that need provider detection:
  // - menu_provider is null OR menu_provider_confidence < 70
  // - crawler_status is idle (not already queued/running)
  // - has a website or menu URL
  const query = `
    SELECT id, name, website, menu_url, menu_provider, menu_provider_confidence
    FROM dispensaries
    WHERE (website IS NOT NULL OR menu_url IS NOT NULL)
      AND crawler_status = 'idle'
      AND (menu_provider IS NULL OR menu_provider_confidence < 70)
    ORDER BY
      CASE WHEN menu_provider IS NULL THEN 0 ELSE 1 END,
      menu_provider_confidence ASC
    LIMIT $1
  `;

  const result = await pool.query(query, [flags.limit]);

  if (flags.dryRun) {
    console.log(`  Would queue ${result.rows.length} dispensaries for detection:`);
    for (const row of result.rows) {
      console.log(`  - [${row.id}] ${row.name} (current: ${row.menu_provider || 'unknown'}, confidence: ${row.menu_provider_confidence}%)`);
    }
    return result.rows.length;
  }

  let queued = 0;
  for (const dispensary of result.rows) {
    try {
      // Update status to queued
      await pool.query(
        `UPDATE dispensaries SET crawler_status = 'queued_detection', updated_at = NOW() WHERE id = $1`,
        [dispensary.id]
      );

      // Create sandbox job for detection
      await pool.query(
        `INSERT INTO sandbox_crawl_jobs (dispensary_id, job_type, status, priority)
         VALUES ($1, 'detection', 'pending', 10)`,
        [dispensary.id]
      );

      console.log(`  ✓ Queued detection: [${dispensary.id}] ${dispensary.name}`);
      queued++;
    } catch (error: any) {
      console.error(`  ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
    }
  }

  return queued;
}

async function queueProductionCrawls(): Promise<number> {
  console.log('\n🏭 Queueing Production Dutchie Crawls...');

  // Find Dutchie dispensaries ready for production crawl:
  // - menu_provider = 'dutchie'
  // - crawler_mode = 'production'
  // - crawler_status is idle
  // - last_menu_scrape is old or null
  const query = `
    SELECT d.id, d.name, d.last_menu_scrape, d.menu_url
    FROM dispensaries d
    WHERE d.menu_provider = 'dutchie'
      AND d.crawler_mode = 'production'
      AND d.crawler_status = 'idle'
      AND (d.last_menu_scrape IS NULL OR d.last_menu_scrape < NOW() - INTERVAL '4 hours')
    ORDER BY
      CASE WHEN d.last_menu_scrape IS NULL THEN 0 ELSE 1 END,
      d.last_menu_scrape ASC
    LIMIT $1
  `;

  const result = await pool.query(query, [flags.limit]);

  if (flags.dryRun) {
    console.log(`  Would queue ${result.rows.length} Dutchie dispensaries for production crawl:`);
    for (const row of result.rows) {
      const lastScrape = row.last_menu_scrape ? new Date(row.last_menu_scrape).toISOString() : 'never';
      console.log(`  - [${row.id}] ${row.name} (last scrape: ${lastScrape})`);
    }
    return result.rows.length;
  }

  let queued = 0;
  for (const dispensary of result.rows) {
    try {
      // Update status to queued
      await pool.query(
        `UPDATE dispensaries SET crawler_status = 'queued_crawl', updated_at = NOW() WHERE id = $1`,
        [dispensary.id]
      );

      // Create crawl job in the main crawl_jobs table (production queue)
      await pool.query(
        `INSERT INTO crawl_jobs (store_id, job_type, trigger_type, status, priority, metadata)
         SELECT s.id, 'full_crawl', 'scheduled', 'pending', 50,
                jsonb_build_object('dispensary_id', $1, 'source', 'queue-dispensaries')
         FROM stores s
         JOIN dispensaries d ON (d.menu_url = s.dutchie_plus_url OR d.name ILIKE '%' || s.name || '%')
         WHERE d.id = $1
         LIMIT 1`,
        [dispensary.id]
      );
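      // Note on the INSERT above: the dispensary is linked to its scheduler store
      // either by an exact menu URL match (d.menu_url = s.dutchie_plus_url) or by a
      // name ILIKE containment, and LIMIT 1 guards against the ILIKE pattern
      // matching more than one store.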

      console.log(`  ✓ Queued production crawl: [${dispensary.id}] ${dispensary.name}`);
      queued++;
    } catch (error: any) {
      console.error(`  ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
    }
  }

  return queued;
}

async function queueSandboxCrawls(): Promise<number> {
  console.log('\n🧪 Queueing Sandbox Crawls...');

  // Find sandbox dispensaries needing crawls:
  // - crawler_mode = 'sandbox'
  // - crawler_status in (idle, error_needs_review)
  // - no sandbox job already pending or running
  const query = `
    SELECT d.id, d.name, d.menu_provider, d.crawler_status, d.website
    FROM dispensaries d
    WHERE d.crawler_mode = 'sandbox'
      AND d.crawler_status IN ('idle', 'error_needs_review')
      AND (d.website IS NOT NULL OR d.menu_url IS NOT NULL)
      AND NOT EXISTS (
        SELECT 1 FROM sandbox_crawl_jobs sj
        WHERE sj.dispensary_id = d.id
          AND sj.status IN ('pending', 'running')
      )
    ORDER BY d.updated_at ASC
    LIMIT $1
  `;

  const result = await pool.query(query, [flags.limit]);

  if (flags.dryRun) {
    console.log(`  Would queue ${result.rows.length} dispensaries for sandbox crawl:`);
    for (const row of result.rows) {
      console.log(`  - [${row.id}] ${row.name} (provider: ${row.menu_provider || 'unknown'}, status: ${row.crawler_status})`);
    }
    return result.rows.length;
  }

  let queued = 0;
  for (const dispensary of result.rows) {
    try {
      // Update status
      await pool.query(
        `UPDATE dispensaries SET crawler_status = 'queued_crawl', updated_at = NOW() WHERE id = $1`,
        [dispensary.id]
      );

      // Create sandbox job
      await pool.query(
        `INSERT INTO sandbox_crawl_jobs (dispensary_id, job_type, status, priority)
         VALUES ($1, 'deep_crawl', 'pending', 5)`,
        [dispensary.id]
      );

      console.log(`  ✓ Queued sandbox crawl: [${dispensary.id}] ${dispensary.name}`);
      queued++;
    } catch (error: any) {
      console.error(`  ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
    }
  }

  return queued;
}

async function processJobs(): Promise<void> {
  console.log('\n⚙️ Processing Queued Jobs...\n');

  // Process sandbox jobs (detection + sandbox crawls)
  const sandboxJobs = await pool.query(
    `SELECT * FROM sandbox_crawl_jobs
     WHERE status = 'pending'
     ORDER BY priority DESC, scheduled_at ASC
     LIMIT $1`,
    [flags.limit]
  );

  console.log(`Found ${sandboxJobs.rows.length} pending sandbox jobs\n`);

  for (const job of sandboxJobs.rows) {
    console.log(`Processing job ${job.id} (${job.job_type}) for dispensary ${job.dispensary_id}...`);

    try {
      // Mark as running
      await pool.query(
        `UPDATE sandbox_crawl_jobs SET status = 'running', started_at = NOW() WHERE id = $1`,
        [job.id]
      );

      let result;
      if (job.job_type === 'detection') {
        result = await runDetectMenuProviderJob(job.dispensary_id);
      } else {
        result = await runSandboxCrawlJob(job.dispensary_id, job.sandbox_id);
      }

      // Update job status
      await pool.query(
        `UPDATE sandbox_crawl_jobs
         SET status = $1, completed_at = NOW(), result_summary = $2, error_message = $3
         WHERE id = $4`,
        [
          result.success ? 'completed' : 'failed',
          JSON.stringify(result.data || {}),
          result.success ? null : result.message,
          job.id,
        ]
      );

      console.log(`  ${result.success ? '✓' : '✗'} ${result.message}\n`);

    } catch (error: any) {
      await pool.query(
        `UPDATE sandbox_crawl_jobs SET status = 'failed', error_message = $1 WHERE id = $2`,
        [error.message, job.id]
      );
      console.log(`  ✗ Error: ${error.message}\n`);
    }
  }
}

async function showStats(): Promise<void> {
  console.log('\n📊 Current Stats:');

  // Dispensary stats
  const stats = await pool.query(`
    SELECT
      COUNT(*) as total,
      COUNT(*) FILTER (WHERE menu_provider IS NULL) as no_provider,
      COUNT(*) FILTER (WHERE menu_provider = 'dutchie') as dutchie,
      COUNT(*) FILTER (WHERE menu_provider NOT IN ('dutchie', 'unknown') AND menu_provider IS NOT NULL) as other_providers,
      COUNT(*) FILTER (WHERE menu_provider = 'unknown') as unknown,
      COUNT(*) FILTER (WHERE crawler_mode = 'production') as production_mode,
      COUNT(*) FILTER (WHERE crawler_mode = 'sandbox') as sandbox_mode,
      COUNT(*) FILTER (WHERE crawler_status = 'idle') as idle,
      COUNT(*) FILTER (WHERE crawler_status LIKE 'queued%') as queued,
      COUNT(*) FILTER (WHERE crawler_status = 'running') as running,
      COUNT(*) FILTER (WHERE crawler_status = 'ok') as ok,
      COUNT(*) FILTER (WHERE crawler_status = 'error_needs_review') as needs_review
    FROM dispensaries
  `);

  const s = stats.rows[0];
  console.log(`
  Dispensaries: ${s.total}
    - No provider detected: ${s.no_provider}
    - Dutchie: ${s.dutchie}
    - Other providers: ${s.other_providers}
    - Unknown: ${s.unknown}

  Crawler Mode:
    - Production: ${s.production_mode}
    - Sandbox: ${s.sandbox_mode}

  Status:
    - Idle: ${s.idle}
    - Queued: ${s.queued}
    - Running: ${s.running}
    - OK: ${s.ok}
    - Needs Review: ${s.needs_review}
  `);

  // Job stats
  const jobStats = await pool.query(`
    SELECT
      COUNT(*) FILTER (WHERE status = 'pending') as pending,
      COUNT(*) FILTER (WHERE status = 'running') as running,
      COUNT(*) FILTER (WHERE status = 'completed') as completed,
      COUNT(*) FILTER (WHERE status = 'failed') as failed
    FROM sandbox_crawl_jobs
  `);

  const j = jobStats.rows[0];
  console.log(`  Sandbox Jobs:
    - Pending: ${j.pending}
    - Running: ${j.running}
    - Completed: ${j.completed}
    - Failed: ${j.failed}
  `);
}

async function main() {
  if (flags.help) {
    await showHelp();
    process.exit(0);
  }

  console.log('═══════════════════════════════════════════════════════');
  console.log('  Multi-Provider Crawler Queue Manager');
  console.log('═══════════════════════════════════════════════════════');

  if (flags.dryRun) {
    console.log('\n🔍 DRY RUN MODE - No changes will be made\n');
  }

  try {
    // Show current stats first
    await showStats();

    if (flags.process) {
      // Process mode - run jobs instead of queuing
      await processJobs();
    } else {
      // Queuing mode
      let totalQueued = 0;

      if (flags.detection) {
        totalQueued += await queueDetectionJobs();
      }

      if (flags.production) {
        totalQueued += await queueProductionCrawls();
      }

      if (flags.sandbox) {
        totalQueued += await queueSandboxCrawls();
      }

      console.log('\n═══════════════════════════════════════════════════════');
      console.log(`  Total dispensaries queued: ${totalQueued}`);
      console.log('═══════════════════════════════════════════════════════\n');
    }

    // Show updated stats
    if (!flags.dryRun) {
      await showStats();
    }

  } catch (error) {
    console.error('Fatal error:', error);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();
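One way to drive the script end to end (queue first, then work the queue); a
minimal sketch using Node's built-in child_process, with the working directory
assumed to be backend/:

    import { execSync } from 'node:child_process';

    // Queue detection/production/sandbox jobs, then process the sandbox queue.
    execSync('npx tsx src/scripts/queue-dispensaries.ts --all --limit=20', { stdio: 'inherit' });
    execSync('npx tsx src/scripts/queue-dispensaries.ts --process --limit=20', { stdio: 'inherit' });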
583
backend/src/scripts/queue-intelligence.ts
Normal file
@@ -0,0 +1,583 @@
#!/usr/bin/env npx tsx
/**
 * Queue Intelligence Script
 *
 * Orchestrates the multi-category intelligence crawler system:
 *   1. Queue dispensaries that need provider detection (all 4 categories)
 *   2. Queue per-category production crawls (Dutchie products only for now)
 *   3. Queue per-category sandbox crawls (all providers)
 *
 * Each category (product, specials, brand, metadata) is handled independently.
 * A failure in one category does NOT affect the other categories.
 *
 * Usage:
 *   npx tsx src/scripts/queue-intelligence.ts [--detection] [--production] [--sandbox] [--all]
 *   npx tsx src/scripts/queue-intelligence.ts --category=product --sandbox
 *   npx tsx src/scripts/queue-intelligence.ts --process --category=product
 *   npx tsx src/scripts/queue-intelligence.ts --dry-run
 */

import { pool } from '../db/migrate';
import { logger } from '../services/logger';
import {
  detectMultiCategoryProviders,
  updateAllCategoryProviders,
  IntelligenceCategory,
} from '../services/intelligence-detector';
import {
  runCrawlProductsJob,
  runCrawlSpecialsJob,
  runCrawlBrandIntelligenceJob,
  runCrawlMetadataJob,
  runSandboxProductsJob,
  runSandboxSpecialsJob,
  runSandboxBrandJob,
  runSandboxMetadataJob,
  runAllCategoryProductionCrawls,
  runAllCategorySandboxCrawls,
  processCategorySandboxJobs,
} from '../services/category-crawler-jobs';

// Parse command-line args
const args = process.argv.slice(2);
const flags = {
  detection: args.includes('--detection') || args.includes('--all'),
  production: args.includes('--production') || args.includes('--all'),
  sandbox: args.includes('--sandbox') || args.includes('--all'),
  dryRun: args.includes('--dry-run'),
  process: args.includes('--process'),
  help: args.includes('--help') || args.includes('-h'),
  limit: parseInt(args.find(a => a.startsWith('--limit='))?.split('=')[1] || '10'),
  category: args.find(a => a.startsWith('--category='))?.split('=')[1] as IntelligenceCategory | undefined,
  dispensary: parseInt(args.find(a => a.startsWith('--dispensary='))?.split('=')[1] || '0'),
};

// If no specific flags, default to all
if (!flags.detection && !flags.production && !flags.sandbox && !flags.process) {
  flags.detection = true;
  flags.production = true;
  flags.sandbox = true;
}
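// Illustrative example: `--process --category=specials --dispensary=42` yields
//   flags.process = true, flags.category = 'specials', flags.dispensary = 42,
// and the default-to-all block above is skipped because --process was given.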
|
||||||
|
const CATEGORIES: IntelligenceCategory[] = ['product', 'specials', 'brand', 'metadata'];
|
||||||
|
|
||||||
|
async function showHelp() {
|
||||||
|
console.log(`
|
||||||
|
Queue Intelligence - Multi-Category Crawler Orchestration
|
||||||
|
|
||||||
|
USAGE:
|
||||||
|
npx tsx src/scripts/queue-intelligence.ts [OPTIONS]
|
||||||
|
|
||||||
|
OPTIONS:
|
||||||
|
--detection Queue dispensaries that need multi-category detection
|
||||||
|
--production Queue per-category production crawls
|
||||||
|
--sandbox Queue per-category sandbox crawls
|
||||||
|
--all Queue all job types (default if no specific flag)
|
||||||
|
--process Process queued jobs instead of just queuing
|
||||||
|
--category=CATEGORY Filter to specific category (product|specials|brand|metadata)
|
||||||
|
--dispensary=ID Process only a specific dispensary
|
||||||
|
--dry-run Show what would be queued without making changes
|
||||||
|
--limit=N Maximum dispensaries to queue per type (default: 10)
|
||||||
|
--help, -h Show this help message
|
||||||
|
|
||||||
|
CATEGORIES:
|
||||||
|
product - Product/menu data (Dutchie=production, others=sandbox)
|
||||||
|
specials - Deals and specials (all sandbox for now)
|
||||||
|
brand - Brand intelligence (all sandbox for now)
|
||||||
|
metadata - Categories/taxonomy (all sandbox for now)
|
||||||
|
|
||||||
|
EXAMPLES:
|
||||||
|
# Queue all dispensaries for appropriate jobs
|
||||||
|
npx tsx src/scripts/queue-intelligence.ts
|
||||||
|
|
||||||
|
# Only queue product detection jobs
|
||||||
|
npx tsx src/scripts/queue-intelligence.ts --detection --category=product
|
||||||
|
|
||||||
|
# Process sandbox jobs for specials category
|
||||||
|
npx tsx src/scripts/queue-intelligence.ts --process --category=specials --limit=5
|
||||||
|
|
||||||
|
# Run full detection for a specific dispensary
|
||||||
|
npx tsx src/scripts/queue-intelligence.ts --process --detection --dispensary=123
|
||||||
|
|
||||||
|
# Dry run to see what would be queued
|
||||||
|
npx tsx src/scripts/queue-intelligence.ts --dry-run
|
||||||
|
`);
|
||||||
|
}
|
||||||
|
|
||||||
|
async function queueMultiCategoryDetection(): Promise<number> {
  console.log('\n📡 Queueing Multi-Category Detection Jobs...');

  // Find dispensaries that need provider detection for any category:
  // - Any *_provider is null OR
  // - Any *_confidence < 70
  // - has a website URL
  const query = `
    SELECT id, name, website, menu_url,
           product_provider, product_confidence, product_crawler_mode,
           specials_provider, specials_confidence, specials_crawler_mode,
           brand_provider, brand_confidence, brand_crawler_mode,
           metadata_provider, metadata_confidence, metadata_crawler_mode
    FROM dispensaries
    WHERE (website IS NOT NULL OR menu_url IS NOT NULL)
      AND (
        product_provider IS NULL OR product_confidence < 70 OR
        specials_provider IS NULL OR specials_confidence < 70 OR
        brand_provider IS NULL OR brand_confidence < 70 OR
        metadata_provider IS NULL OR metadata_confidence < 70
      )
    ORDER BY
      CASE WHEN product_provider IS NULL THEN 0 ELSE 1 END,
      product_confidence ASC
    LIMIT $1
  `;

  const result = await pool.query(query, [flags.limit]);

  if (flags.dryRun) {
    console.log(`  Would queue ${result.rows.length} dispensaries for multi-category detection:`);
    for (const row of result.rows) {
      const needsDetection: string[] = [];
      if (!row.product_provider || row.product_confidence < 70) needsDetection.push('product');
      if (!row.specials_provider || row.specials_confidence < 70) needsDetection.push('specials');
      if (!row.brand_provider || row.brand_confidence < 70) needsDetection.push('brand');
      if (!row.metadata_provider || row.metadata_confidence < 70) needsDetection.push('metadata');
      console.log(`    - [${row.id}] ${row.name} (needs: ${needsDetection.join(', ')})`);
    }
    return result.rows.length;
  }

  let queued = 0;
  for (const dispensary of result.rows) {
    try {
      // Create detection jobs for each category that needs it
      for (const category of CATEGORIES) {
        const provider = dispensary[`${category}_provider`];
        const confidence = dispensary[`${category}_confidence`];

        if (!provider || confidence < 70) {
          await pool.query(
            `INSERT INTO sandbox_crawl_jobs (dispensary_id, category, job_type, status, priority)
             VALUES ($1, $2, 'detection', 'pending', 10)
             ON CONFLICT DO NOTHING`,
            [dispensary.id, category]
          );
        }
      }

      console.log(`  ✓ Queued detection: [${dispensary.id}] ${dispensary.name}`);
      queued++;
    } catch (error: any) {
      console.error(`  ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
    }
  }

  return queued;
}

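/*
 * Note on the bare `ON CONFLICT DO NOTHING` above - this is an assumption
 * about the schema, not something visible in this diff: the insert only
 * dedupes detection jobs if sandbox_crawl_jobs carries a unique constraint
 * covering the inserted columns, e.g. a (hypothetical) partial unique index:
 *
 *   CREATE UNIQUE INDEX uq_sandbox_jobs_open_detection
 *     ON sandbox_crawl_jobs (dispensary_id, category, job_type)
 *     WHERE status IN ('pending', 'running');
 *
 * Without any matching constraint, `ON CONFLICT DO NOTHING` never fires and
 * repeated queue runs will insert duplicate detection jobs.
 */
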
async function queueCategoryProductionCrawls(category?: IntelligenceCategory): Promise<number> {
  const categories = category ? [category] : CATEGORIES;
  let totalQueued = 0;

  for (const cat of categories) {
    console.log(`\n🏭 Queueing Production ${cat.toUpperCase()} Crawls...`);

    // For now, only products have production-ready crawlers (Dutchie only)
    if (cat !== 'product') {
      console.log(`  ⏭️ No production crawler for ${cat} yet - skipping`);
      continue;
    }

    // Find dispensaries ready for production crawl
    const query = `
      SELECT id, name, ${cat}_provider as provider, last_${cat}_scan_at as last_scan
      FROM dispensaries
      WHERE ${cat}_provider = 'dutchie'
        AND ${cat}_crawler_mode = 'production'
        AND ${cat}_confidence >= 70
        AND (last_${cat}_scan_at IS NULL OR last_${cat}_scan_at < NOW() - INTERVAL '4 hours')
      ORDER BY
        CASE WHEN last_${cat}_scan_at IS NULL THEN 0 ELSE 1 END,
        last_${cat}_scan_at ASC
      LIMIT $1
    `;

    const result = await pool.query(query, [flags.limit]);

    if (flags.dryRun) {
      console.log(`  Would queue ${result.rows.length} dispensaries for ${cat} production crawl:`);
      for (const row of result.rows) {
        const lastScan = row.last_scan ? new Date(row.last_scan).toISOString() : 'never';
        console.log(`    - [${row.id}] ${row.name} (provider: ${row.provider}, last: ${lastScan})`);
      }
      totalQueued += result.rows.length;
      continue;
    }

    for (const dispensary of result.rows) {
      try {
        // For products, use the existing crawl_jobs table for production
        await pool.query(
          `INSERT INTO crawl_jobs (store_id, job_type, trigger_type, status, priority, metadata)
           SELECT s.id, 'full_crawl', 'scheduled', 'pending', 50,
                  jsonb_build_object('dispensary_id', $1, 'category', $2, 'source', 'queue-intelligence')
           FROM stores s
           JOIN dispensaries d ON (d.menu_url = s.dutchie_plus_url OR d.name ILIKE '%' || s.name || '%')
           WHERE d.id = $1
           LIMIT 1`,
          [dispensary.id, cat]
        );

        console.log(`  ✓ Queued ${cat} production: [${dispensary.id}] ${dispensary.name}`);
        totalQueued++;
      } catch (error: any) {
        console.error(`  ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
      }
    }
  }

  return totalQueued;
}

async function queueCategorySandboxCrawls(category?: IntelligenceCategory): Promise<number> {
  const categories = category ? [category] : CATEGORIES;
  let totalQueued = 0;

  for (const cat of categories) {
    console.log(`\n🧪 Queueing Sandbox ${cat.toUpperCase()} Crawls...`);

    // Find dispensaries in sandbox mode for this category
    const query = `
      SELECT d.id, d.name, d.${cat}_provider as provider, d.${cat}_confidence as confidence,
             d.website, d.menu_url
      FROM dispensaries d
      WHERE d.${cat}_crawler_mode = 'sandbox'
        AND d.${cat}_provider IS NOT NULL
        AND (d.website IS NOT NULL OR d.menu_url IS NOT NULL)
        AND NOT EXISTS (
          SELECT 1 FROM sandbox_crawl_jobs sj
          WHERE sj.dispensary_id = d.id
            AND sj.category = $1
            AND sj.status IN ('pending', 'running')
        )
      ORDER BY d.${cat}_confidence DESC, d.updated_at ASC
      LIMIT $2
    `;

    const result = await pool.query(query, [cat, flags.limit]);

    if (flags.dryRun) {
      console.log(`  Would queue ${result.rows.length} dispensaries for ${cat} sandbox crawl:`);
      for (const row of result.rows) {
        console.log(`    - [${row.id}] ${row.name} (provider: ${row.provider}, confidence: ${row.confidence}%)`);
      }
      totalQueued += result.rows.length;
      continue;
    }

    for (const dispensary of result.rows) {
      try {
        // Create sandbox entry if needed
        const sandboxResult = await pool.query(
          `INSERT INTO crawler_sandboxes (dispensary_id, category, suspected_menu_provider, mode, status)
           VALUES ($1, $2, $3, 'template_learning', 'pending')
           ON CONFLICT (dispensary_id, category) WHERE status NOT IN ('moved_to_production', 'failed')
           DO UPDATE SET updated_at = NOW()
           RETURNING id`,
          [dispensary.id, cat, dispensary.provider]
        );

        const sandboxId = sandboxResult.rows[0]?.id;

        // Create sandbox job
        await pool.query(
          `INSERT INTO sandbox_crawl_jobs (dispensary_id, sandbox_id, category, job_type, status, priority)
           VALUES ($1, $2, $3, 'crawl', 'pending', 5)`,
          [dispensary.id, sandboxId, cat]
        );

        console.log(`  ✓ Queued ${cat} sandbox: [${dispensary.id}] ${dispensary.name} (${dispensary.provider})`);
        totalQueued++;
      } catch (error: any) {
        console.error(`  ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
      }
    }
  }

  return totalQueued;
}

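/*
 * Postgres note on the sandbox upsert above (flagged as an assumption about
 * the schema): an `ON CONFLICT (dispensary_id, category) WHERE status NOT IN
 * (...)` arbiter requires a matching *partial* unique index on
 * crawler_sandboxes, along the lines of:
 *
 *   CREATE UNIQUE INDEX uq_crawler_sandboxes_active
 *     ON crawler_sandboxes (dispensary_id, category)
 *     WHERE status NOT IN ('moved_to_production', 'failed');
 *
 * If no such index exists, the INSERT fails with "there is no unique or
 * exclusion constraint matching the ON CONFLICT specification".
 */
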
async function processDetectionJobs(): Promise<void> {
  console.log('\n🔍 Processing Detection Jobs...');

  // Get pending detection jobs
  const jobs = await pool.query(
    `SELECT DISTINCT dispensary_id
     FROM sandbox_crawl_jobs
     WHERE job_type = 'detection' AND status = 'pending'
     ${flags.category ? `AND category = $2` : ''}
     ${flags.dispensary ? `AND dispensary_id = $${flags.category ? '3' : '2'}` : ''}
     LIMIT $1`,
    flags.category
      ? (flags.dispensary ? [flags.limit, flags.category, flags.dispensary] : [flags.limit, flags.category])
      : (flags.dispensary ? [flags.limit, flags.dispensary] : [flags.limit])
  );

  for (const job of jobs.rows) {
    console.log(`\nProcessing detection for dispensary ${job.dispensary_id}...`);

    try {
      // Get dispensary info
      const dispResult = await pool.query(
        'SELECT id, name, website, menu_url FROM dispensaries WHERE id = $1',
        [job.dispensary_id]
      );
      const dispensary = dispResult.rows[0];

      if (!dispensary) {
        console.log(`  ✗ Dispensary not found`);
        continue;
      }

      const websiteUrl = dispensary.website || dispensary.menu_url;
      if (!websiteUrl) {
        console.log(`  ✗ No website URL`);
        continue;
      }

      // Mark jobs as running
      await pool.query(
        `UPDATE sandbox_crawl_jobs SET status = 'running', started_at = NOW()
         WHERE dispensary_id = $1 AND job_type = 'detection' AND status = 'pending'`,
        [job.dispensary_id]
      );

      // Run multi-category detection
      console.log(`  Detecting providers for ${dispensary.name}...`);
      const detection = await detectMultiCategoryProviders(websiteUrl, { timeout: 45000 });

      // Update all categories
      await updateAllCategoryProviders(job.dispensary_id, detection);

      // Mark jobs as completed
      await pool.query(
        `UPDATE sandbox_crawl_jobs SET status = 'completed', completed_at = NOW(),
                result_summary = $1
         WHERE dispensary_id = $2 AND job_type = 'detection' AND status = 'running'`,
        [JSON.stringify({
          product: { provider: detection.product.provider, confidence: detection.product.confidence },
          specials: { provider: detection.specials.provider, confidence: detection.specials.confidence },
          brand: { provider: detection.brand.provider, confidence: detection.brand.confidence },
          metadata: { provider: detection.metadata.provider, confidence: detection.metadata.confidence },
        }), job.dispensary_id]
      );

      console.log(`  ✓ Detection complete:`);
      console.log(`    Product: ${detection.product.provider} (${detection.product.confidence}%) -> ${detection.product.mode}`);
      console.log(`    Specials: ${detection.specials.provider} (${detection.specials.confidence}%) -> ${detection.specials.mode}`);
      console.log(`    Brand: ${detection.brand.provider} (${detection.brand.confidence}%) -> ${detection.brand.mode}`);
      console.log(`    Metadata: ${detection.metadata.provider} (${detection.metadata.confidence}%) -> ${detection.metadata.mode}`);

    } catch (error: any) {
      console.log(`  ✗ Error: ${error.message}`);
      await pool.query(
        `UPDATE sandbox_crawl_jobs SET status = 'failed', error_message = $1
         WHERE dispensary_id = $2 AND job_type = 'detection' AND status = 'running'`,
        [error.message, job.dispensary_id]
      );
    }
  }
}

async function processCrawlJobs(): Promise<void> {
  const categories = flags.category ? [flags.category] : CATEGORIES;

  for (const cat of categories) {
    console.log(`\n⚙️ Processing ${cat.toUpperCase()} Crawl Jobs...\n`);

    // Process sandbox jobs for this category
    if (flags.sandbox || !flags.production) {
      await processCategorySandboxJobs(cat, flags.limit);
    }

    // Process production jobs for this category
    if (flags.production && cat === 'product') {
      // Get pending production crawls
      const prodJobs = await pool.query(
        `SELECT d.id
         FROM dispensaries d
         WHERE d.product_provider = 'dutchie'
           AND d.product_crawler_mode = 'production'
           AND d.product_confidence >= 70
           ${flags.dispensary ? 'AND d.id = $2' : ''}
         LIMIT $1`,
        flags.dispensary ? [flags.limit, flags.dispensary] : [flags.limit]
      );

      for (const job of prodJobs.rows) {
        console.log(`Processing production ${cat} crawl for dispensary ${job.id}...`);
        const result = await runCrawlProductsJob(job.id);
        console.log(`  ${result.success ? '✓' : '✗'} ${result.message}`);
      }
    }
  }
}

async function processSpecificDispensary(): Promise<void> {
  if (!flags.dispensary) return;

  console.log(`\n🎯 Processing Dispensary ${flags.dispensary}...\n`);

  const dispResult = await pool.query(
    'SELECT * FROM dispensaries WHERE id = $1',
    [flags.dispensary]
  );

  if (dispResult.rows.length === 0) {
    console.log('Dispensary not found');
    return;
  }

  const dispensary = dispResult.rows[0];
  console.log(`Name: ${dispensary.name}`);
  console.log(`Website: ${dispensary.website || dispensary.menu_url || 'none'}`);
  console.log('');

  if (flags.detection) {
    console.log('Running multi-category detection...');
    const websiteUrl = dispensary.website || dispensary.menu_url;
    if (websiteUrl) {
      const detection = await detectMultiCategoryProviders(websiteUrl);
      await updateAllCategoryProviders(flags.dispensary, detection);
      console.log('Detection results:');
      console.log(`  Product: ${detection.product.provider} (${detection.product.confidence}%) -> ${detection.product.mode}`);
      console.log(`  Specials: ${detection.specials.provider} (${detection.specials.confidence}%) -> ${detection.specials.mode}`);
      console.log(`  Brand: ${detection.brand.provider} (${detection.brand.confidence}%) -> ${detection.brand.mode}`);
      console.log(`  Metadata: ${detection.metadata.provider} (${detection.metadata.confidence}%) -> ${detection.metadata.mode}`);
    }
  }

  if (flags.production) {
    console.log('\nRunning production crawls...');
    const results = await runAllCategoryProductionCrawls(flags.dispensary);
    console.log(`  ${results.summary}`);
  }

  if (flags.sandbox) {
    console.log('\nRunning sandbox crawls...');
    const results = await runAllCategorySandboxCrawls(flags.dispensary);
    console.log(`  ${results.summary}`);
  }
}

async function showStats(): Promise<void> {
  console.log('\n📊 Multi-Category Intelligence Stats:');

  // Per-category stats
  for (const cat of CATEGORIES) {
    const stats = await pool.query(`
      SELECT
        COUNT(*) as total,
        COUNT(*) FILTER (WHERE ${cat}_provider IS NULL) as no_provider,
        COUNT(*) FILTER (WHERE ${cat}_provider = 'dutchie') as dutchie,
        COUNT(*) FILTER (WHERE ${cat}_provider = 'treez') as treez,
        COUNT(*) FILTER (WHERE ${cat}_provider NOT IN ('dutchie', 'treez', 'unknown') AND ${cat}_provider IS NOT NULL) as other,
        COUNT(*) FILTER (WHERE ${cat}_provider = 'unknown') as unknown,
        COUNT(*) FILTER (WHERE ${cat}_crawler_mode = 'production') as production,
        COUNT(*) FILTER (WHERE ${cat}_crawler_mode = 'sandbox') as sandbox,
        AVG(${cat}_confidence) as avg_confidence
      FROM dispensaries
    `);

    const s = stats.rows[0];
    console.log(`
  ${cat.toUpperCase()}:
    Providers: Dutchie=${s.dutchie}, Treez=${s.treez}, Other=${s.other}, Unknown=${s.unknown}, None=${s.no_provider}
    Modes: Production=${s.production}, Sandbox=${s.sandbox}
    Avg Confidence: ${Math.round(s.avg_confidence || 0)}%`);
  }

  // Job stats per category
  console.log('\n  Sandbox Jobs by Category:');
  const jobStats = await pool.query(`
    SELECT
      category,
      COUNT(*) FILTER (WHERE status = 'pending') as pending,
      COUNT(*) FILTER (WHERE status = 'running') as running,
      COUNT(*) FILTER (WHERE status = 'completed') as completed,
      COUNT(*) FILTER (WHERE status = 'failed') as failed
    FROM sandbox_crawl_jobs
    GROUP BY category
    ORDER BY category
  `);

  for (const row of jobStats.rows) {
    console.log(`  ${row.category}: pending=${row.pending}, running=${row.running}, completed=${row.completed}, failed=${row.failed}`);
  }
}

async function main() {
  if (flags.help) {
    await showHelp();
    process.exit(0);
  }

  console.log('═══════════════════════════════════════════════════════');
  console.log('  Multi-Category Intelligence Queue Manager');
  console.log('═══════════════════════════════════════════════════════');

  if (flags.dryRun) {
    console.log('\n🔍 DRY RUN MODE - No changes will be made\n');
  }

  if (flags.category) {
    console.log(`\n📌 Filtering to category: ${flags.category}\n`);
  }

  try {
    // Show current stats first
    await showStats();

    // If a specific dispensary is specified, process it directly
    if (flags.dispensary && flags.process) {
      await processSpecificDispensary();
    } else if (flags.process) {
      // Process mode - run jobs
      if (flags.detection) {
        await processDetectionJobs();
      }
      await processCrawlJobs();
    } else {
      // Queuing mode
      let totalQueued = 0;

      if (flags.detection) {
        totalQueued += await queueMultiCategoryDetection();
      }

      if (flags.production) {
        totalQueued += await queueCategoryProductionCrawls(flags.category);
      }

      if (flags.sandbox) {
        totalQueued += await queueCategorySandboxCrawls(flags.category);
      }

      console.log('\n═══════════════════════════════════════════════════════');
      console.log(`  Total queued: ${totalQueued}`);
      console.log('═══════════════════════════════════════════════════════\n');
    }

    // Show updated stats
    if (!flags.dryRun) {
      await showStats();
    }

  } catch (error) {
    console.error('Fatal error:', error);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();

1462
backend/src/services/category-crawler-jobs.ts
Normal file
File diff suppressed because it is too large
651
backend/src/services/crawl-scheduler.ts
Normal file
@@ -0,0 +1,651 @@
/**
 * Crawl Scheduler Service
 *
 * This service manages crawl scheduling using a job queue approach.
 * It does NOT modify the crawler - it only TRIGGERS the existing crawler.
 *
 * Features:
 * - Global schedule: crawl all stores every N hours
 * - Daily special run: 12:01 AM local store time
 * - Per-store schedule overrides
 * - Job queue for tracking pending/running crawls
 */

import cron from 'node-cron';
import { pool } from '../db/migrate';
import { scrapeStore } from '../scraper-v2';
import {
  runStoreCrawlOrchestrator,
  getStoresDueForOrchestration,
} from './store-crawl-orchestrator';

// Worker identification
const WORKER_ID = `worker-${process.pid}-${Date.now()}`;

let schedulerCronJob: cron.ScheduledTask | null = null;
let jobProcessorRunning = false;
let orchestratorProcessorRunning = false;

// Scheduler mode: 'legacy' uses the job queue, 'orchestrator' uses intelligent orchestration
let schedulerMode: 'legacy' | 'orchestrator' = 'orchestrator';

// ============================================
// Types
// ============================================

interface GlobalSchedule {
  id: number;
  schedule_type: string;
  enabled: boolean;
  interval_hours: number | null;
  run_time: string | null;
}

interface StoreScheduleStatus {
  store_id: number;
  store_name: string;
  store_slug: string;
  timezone: string;
  active: boolean;
  scrape_enabled: boolean;
  last_scraped_at: Date | null;
  schedule_enabled: boolean;
  interval_hours: number;
  daily_special_enabled: boolean;
  daily_special_time: string;
  priority: number;
  next_scheduled_run: Date;
  latest_job_id: number | null;
  latest_job_status: string | null;
}

interface CrawlJob {
  id: number;
  store_id: number;
  job_type: string;
  trigger_type: string;
  status: string;
  priority: number;
  scheduled_at: Date;
  started_at: Date | null;
  completed_at: Date | null;
  products_found: number | null;
  error_message: string | null;
}

// ============================================
// Schedule Management
// ============================================

/**
 * Get global schedule settings
 */
export async function getGlobalSchedule(): Promise<GlobalSchedule[]> {
  const result = await pool.query(`
    SELECT * FROM crawler_schedule ORDER BY id
  `);
  return result.rows;
}

/**
 * Update a global schedule setting
 */
export async function updateGlobalSchedule(
  scheduleType: string,
  updates: { enabled?: boolean; interval_hours?: number; run_time?: string }
): Promise<GlobalSchedule> {
  const setClauses: string[] = [];
  const values: any[] = [];
  let paramIndex = 1;

  if (updates.enabled !== undefined) {
    setClauses.push(`enabled = $${paramIndex++}`);
    values.push(updates.enabled);
  }
  if (updates.interval_hours !== undefined) {
    setClauses.push(`interval_hours = $${paramIndex++}`);
    values.push(updates.interval_hours);
  }
  if (updates.run_time !== undefined) {
    setClauses.push(`run_time = $${paramIndex++}`);
    values.push(updates.run_time);
  }

  // Guard against an empty SET list, which would produce invalid SQL
  if (setClauses.length === 0) {
    throw new Error(`No fields provided to update for schedule '${scheduleType}'`);
  }

  values.push(scheduleType);

  const result = await pool.query(`
    UPDATE crawler_schedule
    SET ${setClauses.join(', ')}
    WHERE schedule_type = $${paramIndex}
    RETURNING *
  `, values);

  return result.rows[0];
}

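/*
 * Usage sketch for updateGlobalSchedule (illustrative only; the admin routes
 * that would call this are not part of this file):
 *
 *   // Crawl all stores every 6 hours instead of the default 4
 *   await updateGlobalSchedule('global_interval', { interval_hours: 6 });
 *
 *   // Move the daily specials run to 00:05 store-local time
 *   await updateGlobalSchedule('daily_special', { run_time: '00:05' });
 */
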
/**
 * Get all store schedule statuses
 */
export async function getStoreScheduleStatuses(): Promise<StoreScheduleStatus[]> {
  const result = await pool.query(`SELECT * FROM crawl_schedule_status ORDER BY priority DESC, store_name`);
  return result.rows;
}

/**
 * Get the per-store schedule override, or the defaults if none exists
 */
export async function getStoreSchedule(storeId: number): Promise<any> {
  const result = await pool.query(`
    SELECT * FROM store_crawl_schedule WHERE store_id = $1
  `, [storeId]);

  if (result.rows.length > 0) {
    return result.rows[0];
  }

  // Return default (use global)
  return {
    store_id: storeId,
    enabled: true,
    interval_hours: null,
    daily_special_enabled: true,
    daily_special_time: null,
    priority: 0
  };
}

/**
 * Update per-store schedule override
 */
export async function updateStoreSchedule(
  storeId: number,
  updates: {
    enabled?: boolean;
    interval_hours?: number | null;
    daily_special_enabled?: boolean;
    daily_special_time?: string | null;
    priority?: number;
  }
): Promise<any> {
  const result = await pool.query(`
    INSERT INTO store_crawl_schedule (store_id, enabled, interval_hours, daily_special_enabled, daily_special_time, priority)
    VALUES ($1, $2, $3, $4, $5, $6)
    ON CONFLICT (store_id) DO UPDATE SET
      enabled = COALESCE(EXCLUDED.enabled, store_crawl_schedule.enabled),
      interval_hours = EXCLUDED.interval_hours,
      daily_special_enabled = COALESCE(EXCLUDED.daily_special_enabled, store_crawl_schedule.daily_special_enabled),
      daily_special_time = EXCLUDED.daily_special_time,
      priority = COALESCE(EXCLUDED.priority, store_crawl_schedule.priority),
      updated_at = NOW()
    RETURNING *
  `, [
    storeId,
    updates.enabled ?? true,
    updates.interval_hours ?? null,
    updates.daily_special_enabled ?? true,
    updates.daily_special_time ?? null,
    updates.priority ?? 0
  ]);

  return result.rows[0];
}

// ============================================
// Job Queue Management
// ============================================

/**
 * Create a new crawl job
 */
export async function createCrawlJob(
  storeId: number,
  jobType: 'full_crawl' | 'specials_only' | 'category' = 'full_crawl',
  triggerType: 'scheduled' | 'manual' | 'daily_special' = 'scheduled',
  scheduledAt: Date = new Date(),
  priority: number = 0
): Promise<CrawlJob> {
  // Check if there's already a pending or running job for this store
  const existing = await pool.query(`
    SELECT id FROM crawl_jobs
    WHERE store_id = $1 AND status IN ('pending', 'running')
    LIMIT 1
  `, [storeId]);

  if (existing.rows.length > 0) {
    console.log(`Skipping job creation for store ${storeId} - already has pending/running job`);
    return existing.rows[0];
  }

  const result = await pool.query(`
    INSERT INTO crawl_jobs (store_id, job_type, trigger_type, scheduled_at, priority, status)
    VALUES ($1, $2, $3, $4, $5, 'pending')
    RETURNING *
  `, [storeId, jobType, triggerType, scheduledAt, priority]);

  console.log(`Created crawl job ${result.rows[0].id} for store ${storeId} (${triggerType})`);
  return result.rows[0];
}

/**
 * Get pending jobs ready to run
 */
export async function getPendingJobs(limit: number = 5): Promise<CrawlJob[]> {
  const result = await pool.query(`
    SELECT cj.*, s.name as store_name
    FROM crawl_jobs cj
    JOIN stores s ON s.id = cj.store_id
    WHERE cj.status = 'pending'
      AND cj.scheduled_at <= NOW()
    ORDER BY cj.priority DESC, cj.scheduled_at ASC
    LIMIT $1
  `, [limit]);

  return result.rows;
}

/**
 * Claim a job for processing
 */
export async function claimJob(jobId: number): Promise<boolean> {
  const result = await pool.query(`
    UPDATE crawl_jobs
    SET status = 'running', started_at = NOW(), worker_id = $2
    WHERE id = $1 AND status = 'pending'
    RETURNING id
  `, [jobId, WORKER_ID]);

  return result.rows.length > 0;
}

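/*
 * Design note: claimJob doubles as an optimistic lock. The `AND status =
 * 'pending'` guard means that when two workers race on the same job, only the
 * first UPDATE matches a row; the loser gets zero rows back and claimJob
 * returns false. Expected calling pattern (as used by processJobs below):
 *
 *   const claimed = await claimJob(job.id);
 *   if (!claimed) continue; // another worker got there first
 */
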
/**
 * Complete a job
 */
export async function completeJob(
  jobId: number,
  success: boolean,
  results?: { products_found?: number; products_new?: number; products_updated?: number; error_message?: string }
): Promise<void> {
  await pool.query(`
    UPDATE crawl_jobs
    SET
      status = $2,
      completed_at = NOW(),
      products_found = $3,
      error_message = $4
    WHERE id = $1
  `, [
    jobId,
    success ? 'completed' : 'failed',
    results?.products_found ?? null,
    results?.error_message ?? null
  ]);
}

/**
 * Get recent jobs for a store
 */
export async function getRecentJobs(storeId: number, limit: number = 10): Promise<CrawlJob[]> {
  const result = await pool.query(`
    SELECT * FROM crawl_jobs
    WHERE store_id = $1
    ORDER BY created_at DESC
    LIMIT $2
  `, [storeId, limit]);

  return result.rows;
}

/**
 * Get all recent jobs
 */
export async function getAllRecentJobs(limit: number = 50): Promise<any[]> {
  const result = await pool.query(`
    SELECT cj.*, s.name as store_name, s.slug as store_slug
    FROM crawl_jobs cj
    JOIN stores s ON s.id = cj.store_id
    ORDER BY cj.created_at DESC
    LIMIT $1
  `, [limit]);

  return result.rows;
}

// ============================================
// Scheduler Logic
// ============================================

/**
 * Check which stores are due for a crawl and create jobs
 */
export async function checkAndCreateScheduledJobs(): Promise<number> {
  console.log('Checking for stores due for crawl...');

  // Get global schedule settings
  const globalSchedule = await pool.query(`
    SELECT * FROM crawler_schedule WHERE schedule_type = 'global_interval'
  `);

  if (globalSchedule.rows.length === 0 || !globalSchedule.rows[0].enabled) {
    console.log('Global scheduler is disabled');
    return 0;
  }

  const intervalHours = globalSchedule.rows[0].interval_hours || 4;

  // Find stores due for crawl
  const result = await pool.query(`
    SELECT
      s.id,
      s.name,
      s.timezone,
      s.last_scraped_at,
      COALESCE(scs.enabled, TRUE) as schedule_enabled,
      COALESCE(scs.interval_hours, $1) as interval_hours,
      COALESCE(scs.priority, 0) as priority
    FROM stores s
    LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
    WHERE s.active = TRUE
      AND s.scrape_enabled = TRUE
      AND COALESCE(scs.enabled, TRUE) = TRUE
      AND (
        s.last_scraped_at IS NULL
        OR s.last_scraped_at < NOW() - (COALESCE(scs.interval_hours, $1) || ' hours')::INTERVAL
      )
      AND NOT EXISTS (
        SELECT 1 FROM crawl_jobs cj
        WHERE cj.store_id = s.id AND cj.status IN ('pending', 'running')
      )
    ORDER BY COALESCE(scs.priority, 0) DESC, s.last_scraped_at ASC NULLS FIRST
  `, [intervalHours]);

  let jobsCreated = 0;

  for (const store of result.rows) {
    try {
      await createCrawlJob(store.id, 'full_crawl', 'scheduled', new Date(), store.priority);
      jobsCreated++;
      console.log(`Scheduled crawl job for: ${store.name}`);
    } catch (error) {
      console.error(`Failed to create job for store ${store.name}:`, error);
    }
  }

  console.log(`Created ${jobsCreated} scheduled crawl jobs`);
  return jobsCreated;
}

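/*
 * Note on the due-check above: `(COALESCE(scs.interval_hours, $1) || ' hours')::INTERVAL`
 * concatenates the per-store override (or the global default) with ' hours'
 * and casts the result to a Postgres interval, so for a 4-hour interval the
 * predicate is equivalent to:
 *
 *   s.last_scraped_at < NOW() - INTERVAL '4 hours'
 */
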
/**
 * Check for daily special runs (12:01 AM local time)
 */
export async function checkAndCreateDailySpecialJobs(): Promise<number> {
  console.log('Checking for daily special runs...');

  // Get daily special schedule
  const dailySchedule = await pool.query(`
    SELECT * FROM crawler_schedule WHERE schedule_type = 'daily_special'
  `);

  if (dailySchedule.rows.length === 0 || !dailySchedule.rows[0].enabled) {
    console.log('Daily special scheduler is disabled');
    return 0;
  }

  const targetTime = dailySchedule.rows[0].run_time || '00:01';

  // Find stores where it's currently the target time in their local timezone
  // and they haven't had a daily special run today
  const result = await pool.query(`
    SELECT
      s.id,
      s.name,
      s.timezone,
      COALESCE(scs.daily_special_enabled, TRUE) as daily_special_enabled,
      COALESCE(scs.daily_special_time, $1::TIME) as daily_special_time,
      COALESCE(scs.priority, 0) as priority
    FROM stores s
    LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
    WHERE s.active = TRUE
      AND s.scrape_enabled = TRUE
      AND COALESCE(scs.daily_special_enabled, TRUE) = TRUE
      -- Check if current time in store timezone matches the target time (within 2 minutes)
      AND ABS(
        EXTRACT(EPOCH FROM (
          (NOW() AT TIME ZONE COALESCE(s.timezone, 'America/Phoenix'))::TIME
          - COALESCE(scs.daily_special_time, $1::TIME)
        ))
      ) < 120 -- within 2 minutes
      -- Ensure we haven't already created a daily_special job today for this store
      AND NOT EXISTS (
        SELECT 1 FROM crawl_jobs cj
        WHERE cj.store_id = s.id
          AND cj.trigger_type = 'daily_special'
          AND cj.created_at > (NOW() AT TIME ZONE COALESCE(s.timezone, 'America/Phoenix'))::DATE
      )
      AND NOT EXISTS (
        SELECT 1 FROM crawl_jobs cj
        WHERE cj.store_id = s.id AND cj.status IN ('pending', 'running')
      )
    ORDER BY COALESCE(scs.priority, 0) DESC
  `, [targetTime]);

  let jobsCreated = 0;

  for (const store of result.rows) {
    try {
      await createCrawlJob(store.id, 'full_crawl', 'daily_special', new Date(), store.priority + 10);
      jobsCreated++;
      console.log(`Created daily special job for: ${store.name} (${store.timezone})`);
    } catch (error) {
      console.error(`Failed to create daily special job for store ${store.name}:`, error);
    }
  }

  if (jobsCreated > 0) {
    console.log(`Created ${jobsCreated} daily special crawl jobs`);
  }
  return jobsCreated;
}

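/*
 * How the time window works: the scheduler ticks once a minute, and the
 * ABS(EXTRACT(EPOCH FROM ...)) < 120 guard accepts any tick that lands within
 * two minutes of the store-local target time. Several consecutive ticks can
 * therefore match; the per-day NOT EXISTS check on trigger_type =
 * 'daily_special' is what keeps a store from being queued more than once.
 */
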
/**
 * Process pending jobs
 */
export async function processJobs(): Promise<void> {
  if (jobProcessorRunning) {
    console.log('Job processor already running, skipping...');
    return;
  }

  jobProcessorRunning = true;

  try {
    const jobs = await getPendingJobs(1); // Process one at a time for safety

    for (const job of jobs) {
      console.log(`Processing job ${job.id} for store: ${(job as any).store_name}`);

      const claimed = await claimJob(job.id);
      if (!claimed) {
        console.log(`Job ${job.id} already claimed by another worker`);
        continue;
      }

      try {
        // Call the existing scraper - DO NOT MODIFY SCRAPER LOGIC
        await scrapeStore(job.store_id);

        // Update store's last_scraped_at
        await pool.query(`
          UPDATE stores SET last_scraped_at = NOW() WHERE id = $1
        `, [job.store_id]);

        await completeJob(job.id, true, {});
        console.log(`Job ${job.id} completed successfully`);
      } catch (error: any) {
        console.error(`Job ${job.id} failed:`, error);
        await completeJob(job.id, false, { error_message: error.message });
      }
    }
  } finally {
    jobProcessorRunning = false;
  }
}

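/*
 * Concurrency note: jobProcessorRunning is an in-process guard only - it
 * prevents overlapping cron ticks within a single Node process. Safety across
 * multiple worker processes comes from claimJob()'s conditional UPDATE, not
 * from this flag.
 */
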
/**
 * Process stores using the intelligent orchestrator.
 * This replaces the simple job queue approach with intelligent provider detection.
 */
export async function processOrchestrator(): Promise<void> {
  if (orchestratorProcessorRunning) {
    console.log('Orchestrator processor already running, skipping...');
    return;
  }

  orchestratorProcessorRunning = true;

  try {
    // Get stores due for orchestration (respects schedule, intervals, etc.)
    const storeIds = await getStoresDueForOrchestration(3); // Process up to 3 at a time

    if (storeIds.length === 0) {
      return;
    }

    console.log(`Orchestrator: Processing ${storeIds.length} stores due for crawl`);

    // Process each store through the orchestrator
    for (const storeId of storeIds) {
      try {
        console.log(`Orchestrator: Starting crawl for store ${storeId}`);
        const result = await runStoreCrawlOrchestrator(storeId);
        console.log(`Orchestrator: Store ${storeId} completed - ${result.summary}`);
      } catch (error: any) {
        console.error(`Orchestrator: Store ${storeId} failed - ${error.message}`);
      }
    }

    console.log(`Orchestrator: Finished processing ${storeIds.length} stores`);
  } finally {
    orchestratorProcessorRunning = false;
  }
}

// ============================================
// Scheduler Control
// ============================================

/**
 * Set scheduler mode
 */
export function setSchedulerMode(mode: 'legacy' | 'orchestrator'): void {
  schedulerMode = mode;
  console.log(`Scheduler mode set to: ${mode}`);
}

/**
 * Get current scheduler mode
 */
export function getSchedulerMode(): 'legacy' | 'orchestrator' {
  return schedulerMode;
}

/**
 * Start the scheduler (runs every minute to check for due jobs)
 */
export async function startCrawlScheduler(): Promise<void> {
  stopCrawlScheduler();

  console.log(`Starting crawl scheduler in ${schedulerMode} mode...`);

  // Run every minute
  schedulerCronJob = cron.schedule('* * * * *', async () => {
    try {
      if (schedulerMode === 'orchestrator') {
        // Use the intelligent orchestrator (handles detection + crawl)
        await processOrchestrator();
      } else {
        // Legacy mode: job queue approach
        // Check for interval-based scheduled jobs
        await checkAndCreateScheduledJobs();

        // Check for daily special runs
        await checkAndCreateDailySpecialJobs();

        // Process any pending jobs
        await processJobs();
      }
    } catch (error) {
      console.error('Scheduler tick error:', error);
    }
  });

  console.log(`Crawl scheduler started in ${schedulerMode} mode (checking every minute)`);
}

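/*
 * Usage sketch (illustrative; the server entrypoint that boots the scheduler
 * is assumed, not shown in this diff):
 *
 *   import { setSchedulerMode, startCrawlScheduler } from './services/crawl-scheduler';
 *
 *   setSchedulerMode(process.env.SCHEDULER_MODE === 'legacy' ? 'legacy' : 'orchestrator');
 *   await startCrawlScheduler(); // node-cron tick every minute
 *
 * The SCHEDULER_MODE environment variable is a hypothetical example here, not
 * a documented setting.
 */
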
/**
 * Stop the scheduler
 */
export function stopCrawlScheduler(): void {
  if (schedulerCronJob) {
    schedulerCronJob.stop();
    schedulerCronJob = null;
    console.log('Crawl scheduler stopped');
  }
}

/**
 * Restart the scheduler
 */
export async function restartCrawlScheduler(): Promise<void> {
  await startCrawlScheduler();
}

// ============================================
// Manual Triggers
// ============================================

/**
 * Manually trigger a crawl for a specific store (creates a job immediately)
 */
export async function triggerManualCrawl(storeId: number): Promise<CrawlJob> {
  console.log(`Manual crawl triggered for store ID: ${storeId}`);
  return await createCrawlJob(storeId, 'full_crawl', 'manual', new Date(), 100); // High priority
}

/**
 * Manually trigger crawls for all stores
 */
export async function triggerAllStoresCrawl(): Promise<number> {
  console.log('Manual crawl triggered for all stores');

  const result = await pool.query(`
    SELECT id, name FROM stores
    WHERE active = TRUE AND scrape_enabled = TRUE
      AND NOT EXISTS (
        SELECT 1 FROM crawl_jobs cj
        WHERE cj.store_id = stores.id AND cj.status IN ('pending', 'running')
      )
  `);

  let jobsCreated = 0;
  for (const store of result.rows) {
    await createCrawlJob(store.id, 'full_crawl', 'manual', new Date(), 50);
    jobsCreated++;
  }

  console.log(`Created ${jobsCreated} manual crawl jobs`);
  return jobsCreated;
}

/**
 * Cancel a pending job
 */
export async function cancelJob(jobId: number): Promise<boolean> {
  const result = await pool.query(`
    UPDATE crawl_jobs
    SET status = 'cancelled'
    WHERE id = $1 AND status = 'pending'
    RETURNING id
  `, [jobId]);

  return result.rows.length > 0;
}
645
backend/src/services/crawler-jobs.ts
Normal file
@@ -0,0 +1,645 @@
/**
 * Crawler Jobs Service
 *
 * Handles three types of jobs:
 * 1. DetectMenuProviderJob - Detect menu provider for a dispensary
 * 2. DutchieMenuCrawlJob - Production Dutchie crawl
 * 3. SandboxCrawlJob - Learning/testing crawl for unknown providers
 */

import { pool } from '../db/migrate';
import { logger } from './logger';
import { detectMenuProvider, detectProviderChange, MenuProvider } from './menu-provider-detector';
import { scrapeStore } from '../scraper-v2';
import puppeteer, { Browser, Page } from 'puppeteer';
import { promises as fs } from 'fs';
import path from 'path';

const WORKER_ID = `crawler-${process.pid}-${Date.now()}`;

// ========================================
// Types
// ========================================

interface Dispensary {
  id: number;
  name: string;
  website: string | null;
  menu_url: string | null;
  menu_provider: MenuProvider | null;
  menu_provider_confidence: number;
  crawler_mode: 'production' | 'sandbox';
  crawler_status: string;
  scraper_template: string | null;
}

interface JobResult {
  success: boolean;
  message: string;
  data?: Record<string, any>;
}

// ========================================
// Helper Functions
// ========================================

async function getDispensary(dispensaryId: number): Promise<Dispensary | null> {
  const result = await pool.query(
    `SELECT id, name, website, menu_url, menu_provider, menu_provider_confidence,
            crawler_mode, crawler_status, scraper_template
     FROM dispensaries WHERE id = $1`,
    [dispensaryId]
  );
  return result.rows[0] || null;
}

async function updateDispensary(
  dispensaryId: number,
  updates: Partial<Dispensary> & { last_menu_error_at?: Date; last_error_message?: string; provider_detection_data?: any; last_menu_scrape?: Date; menu_scrape_status?: string }
): Promise<void> {
  const setClauses: string[] = [];
  const values: any[] = [];
  let paramIndex = 1;

  for (const [key, value] of Object.entries(updates)) {
    setClauses.push(`${key} = $${paramIndex}`);
    values.push(value);
    paramIndex++;
  }

  setClauses.push(`updated_at = NOW()`);
  values.push(dispensaryId);

  await pool.query(
    `UPDATE dispensaries SET ${setClauses.join(', ')} WHERE id = $${paramIndex}`,
    values
  );
}

async function createSandboxEntry(
  dispensaryId: number,
  suspectedProvider: string | null,
  mode: string,
  detectionSignals?: any
): Promise<number> {
  // First, check if there's an existing active sandbox
  const existing = await pool.query(
    `SELECT id FROM crawler_sandboxes
     WHERE dispensary_id = $1 AND status NOT IN ('moved_to_production', 'failed')`,
    [dispensaryId]
  );

  if (existing.rows.length > 0) {
    // Update existing
    await pool.query(
      `UPDATE crawler_sandboxes
       SET suspected_menu_provider = $2, mode = $3, detection_signals = COALESCE($4, detection_signals), updated_at = NOW()
       WHERE id = $1`,
      [existing.rows[0].id, suspectedProvider, mode, detectionSignals ? JSON.stringify(detectionSignals) : null]
    );
    return existing.rows[0].id;
  }

  // Create new
  const result = await pool.query(
    `INSERT INTO crawler_sandboxes (dispensary_id, suspected_menu_provider, mode, detection_signals, status)
     VALUES ($1, $2, $3, $4, 'pending')
     RETURNING id`,
    [dispensaryId, suspectedProvider, mode, detectionSignals ? JSON.stringify(detectionSignals) : '{}']
  );
  return result.rows[0].id;
}

async function createSandboxJob(
  dispensaryId: number,
  sandboxId: number | null,
  jobType: string,
  priority: number = 0
): Promise<number> {
  const result = await pool.query(
    `INSERT INTO sandbox_crawl_jobs (dispensary_id, sandbox_id, job_type, status, priority)
     VALUES ($1, $2, $3, 'pending', $4)
     RETURNING id`,
    [dispensaryId, sandboxId, jobType, priority]
  );
  return result.rows[0].id;
}

// Get linked store ID for a dispensary (for using existing scraper)
async function getStoreIdForDispensary(dispensaryId: number): Promise<number | null> {
  // Check if there's a stores entry linked to this dispensary
  const result = await pool.query(
    `SELECT s.id FROM stores s
     JOIN dispensaries d ON d.menu_url = s.dutchie_plus_url OR d.name ILIKE '%' || s.name || '%'
     WHERE d.id = $1
     LIMIT 1`,
    [dispensaryId]
  );

  if (result.rows.length > 0) {
    return result.rows[0].id;
  }

  // Try to find by website
  const result2 = await pool.query(
    `SELECT s.id FROM stores s
     JOIN dispensaries d ON d.website ILIKE '%' || s.slug || '%'
     WHERE d.id = $1
     LIMIT 1`,
    [dispensaryId]
  );

  return result2.rows[0]?.id || null;
}

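/*
 * Caveat on the lookup above: both joins are fuzzy - `d.name ILIKE '%' ||
 * s.name || '%'` and the website/slug match can link the wrong store for
 * dispensaries with generic or overlapping names. A dedicated foreign key
 * from stores to dispensaries would make this lookup exact; the fuzzy joins
 * read as a stopgap until such a link exists.
 */
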
// ========================================
// Job 1: Detect Menu Provider
// ========================================

export async function runDetectMenuProviderJob(dispensaryId: number): Promise<JobResult> {
  logger.info('crawler-jobs', `Starting menu provider detection for dispensary ${dispensaryId}`);

  const dispensary = await getDispensary(dispensaryId);

  if (!dispensary) {
    return { success: false, message: `Dispensary ${dispensaryId} not found` };
  }

  // Check for website URL
  const websiteUrl = dispensary.website || dispensary.menu_url;
  if (!websiteUrl) {
    await updateDispensary(dispensaryId, {
      crawler_status: 'error_needs_review',
      last_menu_error_at: new Date(),
      last_error_message: 'No website URL available for detection',
    });
    return { success: false, message: 'No website URL available' };
  }

  try {
    // Run detection
    const detection = await detectMenuProvider(websiteUrl, {
      checkMenuPaths: true,
      timeout: 30000,
    });

    // Update dispensary with results
    const updates: any = {
      menu_provider: detection.provider,
      menu_provider_confidence: detection.confidence,
      provider_detection_data: JSON.stringify({
        signals: detection.signals,
        urlsTested: detection.urlsTested,
        menuEntryPoints: detection.menuEntryPoints,
        rawSignals: detection.rawSignals,
        detectedAt: new Date().toISOString(),
      }),
      crawler_status: 'idle',
    };

    // Decide crawler mode based on provider
    if (detection.provider === 'dutchie' && detection.confidence >= 70) {
      // Dutchie with high confidence -> production
      updates.crawler_mode = 'production';
      logger.info('crawler-jobs', `Dispensary ${dispensaryId} detected as Dutchie (${detection.confidence}%), setting to production`);
    } else {
      // Unknown or non-Dutchie -> sandbox
      updates.crawler_mode = 'sandbox';

      // Create sandbox entry for further analysis
      const sandboxId = await createSandboxEntry(
        dispensaryId,
        detection.provider,
        'detection',
        {
          signals: detection.signals,
          rawSignals: detection.rawSignals,
        }
      );

      // Queue sandbox crawl job
      await createSandboxJob(dispensaryId, sandboxId, 'detection');

      logger.info('crawler-jobs', `Dispensary ${dispensaryId} detected as ${detection.provider} (${detection.confidence}%), setting to sandbox`);
    }

    // Update menu entry points if found
    if (detection.menuEntryPoints.length > 0 && !dispensary.menu_url) {
      updates.menu_url = detection.menuEntryPoints[0];
    }

    await updateDispensary(dispensaryId, updates);

    return {
      success: true,
      message: `Detected provider: ${detection.provider} (${detection.confidence}%)`,
      data: {
        provider: detection.provider,
        confidence: detection.confidence,
        mode: updates.crawler_mode,
        menuEntryPoints: detection.menuEntryPoints,
      },
    };

  } catch (error: any) {
    logger.error('crawler-jobs', `Detection failed for dispensary ${dispensaryId}: ${error.message}`);

    await updateDispensary(dispensaryId, {
      crawler_status: 'error_needs_review',
      last_menu_error_at: new Date(),
      last_error_message: `Detection failed: ${error.message}`,
    });

    return { success: false, message: error.message };
  }
}

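/*
 * Decision summary (restating the branch above):
 *
 *   provider        confidence   -> crawler_mode
 *   dutchie         >= 70        -> production (crawled by the existing scraper)
 *   dutchie         <  70        -> sandbox (queued for re-detection)
 *   anything else   any          -> sandbox (queued for re-detection)
 */
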
// ========================================
// Job 2: Dutchie Menu Crawl (Production)
// ========================================

export async function runDutchieMenuCrawlJob(dispensaryId: number): Promise<JobResult> {
  logger.info('crawler-jobs', `Starting Dutchie production crawl for dispensary ${dispensaryId}`);

  const dispensary = await getDispensary(dispensaryId);

  if (!dispensary) {
    return { success: false, message: `Dispensary ${dispensaryId} not found` };
  }

  // Verify it's a Dutchie production dispensary
  if (dispensary.menu_provider !== 'dutchie') {
    logger.warn('crawler-jobs', `Dispensary ${dispensaryId} is not Dutchie, skipping production crawl`);
    return { success: false, message: 'Not a Dutchie dispensary' };
  }

  if (dispensary.crawler_mode !== 'production') {
    logger.warn('crawler-jobs', `Dispensary ${dispensaryId} is not in production mode, skipping`);
    return { success: false, message: 'Not in production mode' };
  }

  // Find linked store ID
  const storeId = await getStoreIdForDispensary(dispensaryId);

  if (!storeId) {
    // Need to create a store entry or handle differently
    logger.warn('crawler-jobs', `No linked store found for dispensary ${dispensaryId}`);
    return { success: false, message: 'No linked store found - needs setup' };
  }

  try {
    // Update status to running
    await updateDispensary(dispensaryId, { crawler_status: 'running' });

    // Run the existing Dutchie scraper
    await scrapeStore(storeId, 3); // 3 parallel workers

    // Update success status
    await updateDispensary(dispensaryId, {
      crawler_status: 'ok',
      last_menu_scrape: new Date() as any,
      menu_scrape_status: 'active' as any,
    });

    logger.info('crawler-jobs', `Dutchie crawl completed for dispensary ${dispensaryId}`);

    return {
      success: true,
      message: 'Dutchie crawl completed successfully',
      data: { storeId },
    };

  } catch (error: any) {
    logger.error('crawler-jobs', `Dutchie crawl failed for dispensary ${dispensaryId}: ${error.message}`);

    // Check if this might be a provider change
    let providerChanged = false;
    let browser: Browser | null = null;
    try {
      browser = await puppeteer.launch({ headless: true, args: ['--no-sandbox'] });
      const page = await browser.newPage();
      const url = dispensary.menu_url || dispensary.website;
      if (url) {
        await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 });
        const changeResult = await detectProviderChange(page, 'dutchie');
        providerChanged = changeResult.changed;

        if (providerChanged) {
          // Provider changed - move to sandbox
          await updateDispensary(dispensaryId, {
            crawler_mode: 'sandbox',
            crawler_status: 'error_needs_review',
            last_menu_error_at: new Date(),
            last_error_message: `Provider appears to have changed from Dutchie to ${changeResult.newProvider}`,
          });

          const sandboxId = await createSandboxEntry(
            dispensaryId,
            changeResult.newProvider || 'unknown',
            'detection',
            { providerChangeDetected: true, previousProvider: 'dutchie' }
          );

          await createSandboxJob(dispensaryId, sandboxId, 'detection');

          logger.warn('crawler-jobs', `Provider change detected for dispensary ${dispensaryId}: Dutchie -> ${changeResult.newProvider}`);
        }
      }
    } catch {
      // Ignore detection errors during failure handling
    } finally {
      // Always close the probe browser, even if goto/detection threw
      if (browser) {
        await browser.close().catch(() => {});
      }
    }

    if (!providerChanged) {
      await updateDispensary(dispensaryId, {
        crawler_status: 'error_needs_review',
        last_menu_error_at: new Date(),
        last_error_message: error.message,
      });
    }

    return { success: false, message: error.message };
  }
}

// ========================================
|
||||||
|
// Job 3: Sandbox Crawl (Learning Mode)
|
||||||
|
// ========================================
|
||||||
|
|
||||||
|
export async function runSandboxCrawlJob(dispensaryId: number, sandboxId?: number): Promise<JobResult> {
|
||||||
|
logger.info('crawler-jobs', `Starting sandbox crawl for dispensary ${dispensaryId}`);
|
||||||
|
|
||||||
|
const dispensary = await getDispensary(dispensaryId);
|
||||||
|
|
||||||
|
if (!dispensary) {
|
||||||
|
return { success: false, message: `Dispensary ${dispensaryId} not found` };
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get or create sandbox entry
|
||||||
|
let sandbox: any;
|
||||||
|
if (sandboxId) {
|
||||||
|
const result = await pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [sandboxId]);
|
||||||
|
sandbox = result.rows[0];
|
||||||
|
} else {
|
||||||
|
const result = await pool.query(
|
||||||
|
`SELECT * FROM crawler_sandboxes
|
||||||
|
WHERE dispensary_id = $1 AND status NOT IN ('moved_to_production', 'failed')
|
||||||
|
ORDER BY created_at DESC LIMIT 1`,
|
||||||
|
[dispensaryId]
|
||||||
|
);
|
||||||
|
sandbox = result.rows[0];
|
||||||
|
|
||||||
|
if (!sandbox) {
|
||||||
|
const newSandboxId = await createSandboxEntry(dispensaryId, dispensary.menu_provider, 'template_learning');
|
||||||
|
const result = await pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [newSandboxId]);
|
||||||
|
sandbox = result.rows[0];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const websiteUrl = dispensary.menu_url || dispensary.website;
|
||||||
|
if (!websiteUrl) {
|
||||||
|
await pool.query(
|
||||||
|
`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = 'No website URL' WHERE id = $1`,
|
||||||
|
[sandbox.id]
|
||||||
|
);
|
||||||
|
return { success: false, message: 'No website URL available' };
|
||||||
|
}
|
||||||
|
|
||||||
|
let browser: Browser | null = null;
|
||||||
|
|
||||||
|
try {
|
||||||
|
// Update status
|
||||||
|
await pool.query(
|
||||||
|
`UPDATE crawler_sandboxes SET status = 'analyzing', updated_at = NOW() WHERE id = $1`,
|
||||||
|
[sandbox.id]
|
||||||
|
);
|
||||||
|
await updateDispensary(dispensaryId, { crawler_status: 'running' });
|
||||||
|
|
||||||
|
// Launch browser
|
||||||
|
browser = await puppeteer.launch({
|
||||||
|
headless: true,
|
||||||
|
args: ['--no-sandbox', '--disable-setuid-sandbox'],
|
||||||
|
});
|
||||||
|
|
||||||
|
const page = await browser.newPage();
|
||||||
|
await page.setUserAgent(
|
||||||
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
|
||||||
|
);
|
||||||
|
|
||||||
|
// URLs to crawl (limited depth for sandbox)
|
||||||
|
const urlsToVisit = [websiteUrl];
|
||||||
|
const menuPaths = ['/menu', '/shop', '/products', '/order'];
|
||||||
|
for (const path of menuPaths) {
|
||||||
|
const baseUrl = new URL(websiteUrl).origin;
|
||||||
|
urlsToVisit.push(`${baseUrl}${path}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
const urlsTested: string[] = [];
|
||||||
|
const menuEntryPoints: string[] = [];
|
||||||
|
const capturedHtml: { url: string; html: string }[] = [];
|
||||||
|
const analysisData: any = {
|
||||||
|
provider_signals: {},
|
||||||
|
selector_candidates: [],
|
||||||
|
page_structures: [],
|
||||||
|
};
|
||||||
|
|
||||||
|
// Crawl each URL
|
||||||
|
for (const url of urlsToVisit) {
|
||||||
|
try {
|
||||||
|
urlsTested.push(url);
|
||||||
|
await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 });
|
||||||
|
await new Promise(r => setTimeout(r, 2000)); // Wait for dynamic content
|
||||||
|
|
||||||
|
// Get page HTML
|
||||||
|
const html = await page.content();
|
||||||
|
|
||||||
|
// Check if this looks like a menu page
|
||||||
|
const hasMenuContent = await page.evaluate(() => {
|
||||||
|
const text = document.body.innerText.toLowerCase();
|
||||||
|
return (
|
||||||
|
text.includes('add to cart') ||
|
||||||
|
text.includes('thc') ||
|
||||||
|
text.includes('indica') ||
|
||||||
|
text.includes('sativa')
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
|
if (hasMenuContent) {
|
||||||
|
menuEntryPoints.push(url);
|
||||||
|
capturedHtml.push({ url, html });
|
||||||
|
|
||||||
|
// Analyze page structure for selector candidates
|
||||||
|
const structure = await page.evaluate(() => {
|
||||||
|
const candidates: any[] = [];
|
||||||
|
|
||||||
|
// Look for product-like containers
|
||||||
|
const productSelectors = [
|
||||||
|
'.product', '.product-card', '.menu-item', '.item-card',
|
||||||
|
'[data-product]', '[data-item]', '.strain', '.listing',
|
||||||
|
];
|
||||||
|
|
||||||
|
for (const selector of productSelectors) {
|
||||||
|
const els = document.querySelectorAll(selector);
|
||||||
|
if (els.length > 3) { // Likely a list
|
||||||
|
candidates.push({
|
||||||
|
selector,
|
||||||
|
count: els.length,
|
||||||
|
type: 'product_container',
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Look for price patterns
|
||||||
|
const pricePattern = /\$\d+(\.\d{2})?/;
|
||||||
|
const textNodes = document.body.innerText;
|
||||||
|
const priceMatches = textNodes.match(/\$\d+(\.\d{2})?/g);
|
||||||
|
|
||||||
|
return {
|
||||||
|
candidates,
|
||||||
|
priceCount: priceMatches?.length || 0,
|
||||||
|
hasAddToCart: textNodes.toLowerCase().includes('add to cart'),
|
||||||
|
};
|
||||||
|
});
|
||||||
|
|
||||||
|
analysisData.page_structures.push({
|
||||||
|
url,
|
||||||
|
...structure,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
} catch (pageError: any) {
|
||||||
|
if (!pageError.message.includes('404')) {
|
||||||
|
logger.warn('crawler-jobs', `Sandbox crawl error for ${url}: ${pageError.message}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Save HTML to storage (local for now, S3 later)
|
||||||
|
let rawHtmlLocation: string | null = null;
|
||||||
|
if (capturedHtml.length > 0) {
|
||||||
|
const htmlDir = path.join(process.cwd(), 'sandbox-data', `dispensary-${dispensaryId}`);
|
||||||
|
await fs.mkdir(htmlDir, { recursive: true });
|
||||||
|
|
||||||
|
for (const { url, html } of capturedHtml) {
|
||||||
|
const filename = `${Date.now()}-${url.replace(/[^a-z0-9]/gi, '_')}.html`;
|
||||||
|
await fs.writeFile(path.join(htmlDir, filename), html);
|
||||||
|
}
|
||||||
|
rawHtmlLocation = htmlDir;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update sandbox with results
|
||||||
|
await pool.query(
|
||||||
|
`UPDATE crawler_sandboxes SET
|
||||||
|
status = $1,
|
||||||
|
urls_tested = $2,
|
||||||
|
menu_entry_points = $3,
|
||||||
|
raw_html_location = $4,
|
||||||
|
analysis_json = $5,
|
||||||
|
confidence_score = $6,
|
||||||
|
analyzed_at = NOW(),
|
||||||
|
updated_at = NOW()
|
||||||
|
WHERE id = $7`,
|
||||||
|
[
|
||||||
|
menuEntryPoints.length > 0 ? 'needs_human_review' : 'pending',
|
||||||
|
JSON.stringify(urlsTested),
|
||||||
|
JSON.stringify(menuEntryPoints),
|
||||||
|
rawHtmlLocation,
|
||||||
|
JSON.stringify(analysisData),
|
||||||
|
menuEntryPoints.length > 0 ? 50 : 20,
|
||||||
|
sandbox.id,
|
||||||
|
]
|
||||||
|
);
|
||||||
|
|
||||||
|
// Update dispensary status
|
||||||
|
await updateDispensary(dispensaryId, {
|
||||||
|
crawler_status: 'error_needs_review', // Sandbox results need review
|
||||||
|
});
|
||||||
|
|
||||||
|
logger.info('crawler-jobs', `Sandbox crawl completed for dispensary ${dispensaryId}: ${menuEntryPoints.length} menu pages found`);
|
||||||
|
|
||||||
|
return {
|
||||||
|
success: true,
|
||||||
|
message: `Sandbox crawl completed. Found ${menuEntryPoints.length} menu entry points.`,
|
||||||
|
data: {
|
||||||
|
sandboxId: sandbox.id,
|
||||||
|
urlsTested: urlsTested.length,
|
||||||
|
menuEntryPoints,
|
||||||
|
analysisData,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
} catch (error: any) {
|
||||||
|
logger.error('crawler-jobs', `Sandbox crawl failed for dispensary ${dispensaryId}: ${error.message}`);
|
||||||
|
|
||||||
|
await pool.query(
|
||||||
|
`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = $1 WHERE id = $2`,
|
||||||
|
[error.message, sandbox.id]
|
||||||
|
);
|
||||||
|
|
||||||
|
await updateDispensary(dispensaryId, {
|
||||||
|
crawler_status: 'error_needs_review',
|
||||||
|
last_menu_error_at: new Date(),
|
||||||
|
last_error_message: `Sandbox crawl failed: ${error.message}`,
|
||||||
|
});
|
||||||
|
|
||||||
|
return { success: false, message: error.message };
|
||||||
|
|
||||||
|
} finally {
|
||||||
|
if (browser) {
|
||||||
|
await browser.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ========================================
|
||||||
|
// Queue Processing Functions
|
||||||
|
// ========================================
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Process pending sandbox jobs
|
||||||
|
*/
|
||||||
|
export async function processSandboxJobs(limit: number = 5): Promise<void> {
|
||||||
|
// Claim pending jobs
|
||||||
|
const jobs = await pool.query(
|
||||||
|
`UPDATE sandbox_crawl_jobs
|
||||||
|
SET status = 'running', worker_id = $1, started_at = NOW()
|
||||||
|
WHERE id IN (
|
||||||
|
SELECT id FROM sandbox_crawl_jobs
|
||||||
|
WHERE status = 'pending' AND scheduled_at <= NOW()
|
||||||
|
ORDER BY priority DESC, scheduled_at ASC
|
||||||
|
LIMIT $2
|
||||||
|
FOR UPDATE SKIP LOCKED
|
||||||
|
)
|
||||||
|
RETURNING *`,
|
||||||
|
[WORKER_ID, limit]
|
||||||
|
);
|
||||||
|
|
||||||
|
for (const job of jobs.rows) {
|
||||||
|
try {
|
||||||
|
let result: JobResult;
|
||||||
|
|
||||||
|
if (job.job_type === 'detection') {
|
||||||
|
result = await runDetectMenuProviderJob(job.dispensary_id);
|
||||||
|
} else {
|
||||||
|
result = await runSandboxCrawlJob(job.dispensary_id, job.sandbox_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
await pool.query(
|
||||||
|
`UPDATE sandbox_crawl_jobs
|
||||||
|
SET status = $1, completed_at = NOW(), result_summary = $2, error_message = $3
|
||||||
|
WHERE id = $4`,
|
||||||
|
[
|
||||||
|
result.success ? 'completed' : 'failed',
|
||||||
|
JSON.stringify(result.data || {}),
|
||||||
|
result.success ? null : result.message,
|
||||||
|
job.id,
|
||||||
|
]
|
||||||
|
);
|
||||||
|
|
||||||
|
} catch (error: any) {
|
||||||
|
await pool.query(
|
||||||
|
`UPDATE sandbox_crawl_jobs SET status = 'failed', error_message = $1 WHERE id = $2`,
|
||||||
|
[error.message, job.id]
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
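A minimal sketch of how processSandboxJobs might be driven. The worker loop, the 30-second interval, and the batch size of 5 are assumptions for illustration, not part of this commit; only processSandboxJobs and logger come from the file above.

// Assumed wiring (not in this diff): poll the sandbox queue every 30 seconds.
// FOR UPDATE SKIP LOCKED in the claim query keeps concurrent workers safe.
setInterval(() => {
  processSandboxJobs(5).catch((err: any) => {
    logger.error('crawler-jobs', `Sandbox queue tick failed: ${err.message}`);
  });
}, 30_000);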
414
backend/src/services/crawler-logger.ts
Normal file
@@ -0,0 +1,414 @@
/**
 * CrawlerLogger - Structured logging for crawler operations
 *
 * High-signal, low-noise logging with JSON output for:
 * - Job lifecycle (one summary per job)
 * - Provider/mode changes
 * - Sandbox events
 * - Queue failures
 *
 * NO per-product logging - that's too noisy.
 */

export type LogLevel = 'info' | 'warn' | 'error' | 'debug';

export type LogEvent =
  | 'job_started'
  | 'job_completed'
  | 'job_failed'
  | 'job_cancelled'
  | 'provider_detected'
  | 'provider_changed'
  | 'mode_changed'
  | 'sandbox_started'
  | 'sandbox_completed'
  | 'sandbox_failed'
  | 'queue_failure'
  | 'detection_scan'
  | 'crawl_batch'
  | 'intelligence_run';

interface BaseLogPayload {
  timestamp: string;
  level: LogLevel;
  event: LogEvent;
  dispensary_id?: number;
  store_id?: number;
  job_id?: number;
  provider?: string;
  category?: 'product' | 'specials' | 'brand' | 'metadata';
}

interface JobStartedPayload extends BaseLogPayload {
  event: 'job_started';
  job_type: string;
  trigger_type: string;
  store_name: string;
}

interface JobCompletedPayload extends BaseLogPayload {
  event: 'job_completed';
  store_name: string;
  duration_ms: number;
  products_found: number;
  products_new: number;
  products_updated: number;
  products_marked_oos?: number;
}

interface JobFailedPayload extends BaseLogPayload {
  event: 'job_failed';
  store_name: string;
  duration_ms: number;
  error_message: string;
  error_code?: string;
}

interface ProviderDetectedPayload extends BaseLogPayload {
  event: 'provider_detected';
  dispensary_name: string;
  detected_provider: string;
  confidence: number;
  detection_method: string;
  menu_url?: string;
}

interface ProviderChangedPayload extends BaseLogPayload {
  event: 'provider_changed';
  dispensary_name: string;
  old_provider: string | null;
  new_provider: string;
  old_confidence: number;
  new_confidence: number;
}

interface ModeChangedPayload extends BaseLogPayload {
  event: 'mode_changed';
  dispensary_name: string;
  old_mode: string;
  new_mode: string;
  reason: string;
}

interface SandboxEventPayload extends BaseLogPayload {
  event: 'sandbox_started' | 'sandbox_completed' | 'sandbox_failed';
  dispensary_name: string;
  template_name: string;
  quality_score?: number;
  products_extracted?: number;
  fields_missing?: number;
  error_message?: string;
}

interface QueueFailurePayload extends BaseLogPayload {
  event: 'queue_failure';
  queue_type: string;
  error_message: string;
  affected_items?: number;
}

interface DetectionScanPayload extends BaseLogPayload {
  event: 'detection_scan';
  total_scanned: number;
  detected: number;
  failed: number;
  skipped: number;
  duration_ms: number;
}

interface IntelligenceRunPayload extends BaseLogPayload {
  event: 'intelligence_run';
  run_type: 'detection' | 'production' | 'sandbox' | 'full';
  dispensaries_processed: number;
  jobs_queued: number;
  duration_ms: number;
}

type LogPayload =
  | JobStartedPayload
  | JobCompletedPayload
  | JobFailedPayload
  | ProviderDetectedPayload
  | ProviderChangedPayload
  | ModeChangedPayload
  | SandboxEventPayload
  | QueueFailurePayload
  | DetectionScanPayload
  | IntelligenceRunPayload;

class CrawlerLoggerService {
  private formatLog(payload: LogPayload): string {
    return JSON.stringify(payload);
  }

  private log(payload: LogPayload): void {
    const formatted = this.formatLog(payload);

    switch (payload.level) {
      case 'error':
        console.error(`[CRAWLER] ${formatted}`);
        break;
      case 'warn':
        console.warn(`[CRAWLER] ${formatted}`);
        break;
      case 'debug':
        console.debug(`[CRAWLER] ${formatted}`);
        break;
      default:
        console.log(`[CRAWLER] ${formatted}`);
    }
  }

  /**
   * Log when a crawl job starts
   */
  jobStarted(params: {
    job_id: number;
    store_id: number;
    store_name: string;
    job_type: string;
    trigger_type: string;
    provider?: string;
  }): void {
    this.log({
      timestamp: new Date().toISOString(),
      level: 'info',
      event: 'job_started',
      job_id: params.job_id,
      store_id: params.store_id,
      store_name: params.store_name,
      job_type: params.job_type,
      trigger_type: params.trigger_type,
      provider: params.provider,
    });
  }

  /**
   * Log when a crawl job completes successfully
   */
  jobCompleted(params: {
    job_id: number;
    store_id: number;
    store_name: string;
    duration_ms: number;
    products_found: number;
    products_new: number;
    products_updated: number;
    products_marked_oos?: number;
    provider?: string;
  }): void {
    this.log({
      timestamp: new Date().toISOString(),
      level: 'info',
      event: 'job_completed',
      job_id: params.job_id,
      store_id: params.store_id,
      store_name: params.store_name,
      duration_ms: params.duration_ms,
      products_found: params.products_found,
      products_new: params.products_new,
      products_updated: params.products_updated,
      products_marked_oos: params.products_marked_oos,
      provider: params.provider,
    });
  }

  /**
   * Log when a crawl job fails
   */
  jobFailed(params: {
    job_id: number;
    store_id: number;
    store_name: string;
    duration_ms: number;
    error_message: string;
    error_code?: string;
    provider?: string;
  }): void {
    this.log({
      timestamp: new Date().toISOString(),
      level: 'error',
      event: 'job_failed',
      job_id: params.job_id,
      store_id: params.store_id,
      store_name: params.store_name,
      duration_ms: params.duration_ms,
      error_message: params.error_message,
      error_code: params.error_code,
      provider: params.provider,
    });
  }

  /**
   * Log when a provider is detected for a dispensary
   */
  providerDetected(params: {
    dispensary_id: number;
    dispensary_name: string;
    detected_provider: string;
    confidence: number;
    detection_method: string;
    menu_url?: string;
    category?: 'product' | 'specials' | 'brand' | 'metadata';
  }): void {
    this.log({
      timestamp: new Date().toISOString(),
      level: 'info',
      event: 'provider_detected',
      dispensary_id: params.dispensary_id,
      dispensary_name: params.dispensary_name,
      detected_provider: params.detected_provider,
      confidence: params.confidence,
      detection_method: params.detection_method,
      menu_url: params.menu_url,
      category: params.category,
    });
  }

  /**
   * Log when a dispensary's provider changes
   */
  providerChanged(params: {
    dispensary_id: number;
    dispensary_name: string;
    old_provider: string | null;
    new_provider: string;
    old_confidence: number;
    new_confidence: number;
    category?: 'product' | 'specials' | 'brand' | 'metadata';
  }): void {
    this.log({
      timestamp: new Date().toISOString(),
      level: 'info',
      event: 'provider_changed',
      dispensary_id: params.dispensary_id,
      dispensary_name: params.dispensary_name,
      old_provider: params.old_provider,
      new_provider: params.new_provider,
      old_confidence: params.old_confidence,
      new_confidence: params.new_confidence,
      category: params.category,
    });
  }

  /**
   * Log when a dispensary's crawler mode changes (sandbox -> production, etc.)
   */
  modeChanged(params: {
    dispensary_id: number;
    dispensary_name: string;
    old_mode: string;
    new_mode: string;
    reason: string;
    category?: 'product' | 'specials' | 'brand' | 'metadata';
    provider?: string;
  }): void {
    this.log({
      timestamp: new Date().toISOString(),
      level: 'info',
      event: 'mode_changed',
      dispensary_id: params.dispensary_id,
      dispensary_name: params.dispensary_name,
      old_mode: params.old_mode,
      new_mode: params.new_mode,
      reason: params.reason,
      category: params.category,
      provider: params.provider,
    });
  }

  /**
   * Log sandbox crawl events
   */
  sandboxEvent(params: {
    event: 'sandbox_started' | 'sandbox_completed' | 'sandbox_failed';
    dispensary_id: number;
    dispensary_name: string;
    template_name: string;
    category?: 'product' | 'specials' | 'brand' | 'metadata';
    quality_score?: number;
    products_extracted?: number;
    fields_missing?: number;
    error_message?: string;
    provider?: string;
  }): void {
    const level: LogLevel = params.event === 'sandbox_failed' ? 'error' : 'info';
    this.log({
      timestamp: new Date().toISOString(),
      level,
      event: params.event,
      dispensary_id: params.dispensary_id,
      dispensary_name: params.dispensary_name,
      template_name: params.template_name,
      category: params.category,
      quality_score: params.quality_score,
      products_extracted: params.products_extracted,
      fields_missing: params.fields_missing,
      error_message: params.error_message,
      provider: params.provider,
    });
  }

  /**
   * Log queue processing failures
   */
  queueFailure(params: {
    queue_type: string;
    error_message: string;
    affected_items?: number;
  }): void {
    this.log({
      timestamp: new Date().toISOString(),
      level: 'error',
      event: 'queue_failure',
      queue_type: params.queue_type,
      error_message: params.error_message,
      affected_items: params.affected_items,
    });
  }

  /**
   * Log detection scan summary
   */
  detectionScan(params: {
    total_scanned: number;
    detected: number;
    failed: number;
    skipped: number;
    duration_ms: number;
  }): void {
    this.log({
      timestamp: new Date().toISOString(),
      level: 'info',
      event: 'detection_scan',
      total_scanned: params.total_scanned,
      detected: params.detected,
      failed: params.failed,
      skipped: params.skipped,
      duration_ms: params.duration_ms,
    });
  }

  /**
   * Log intelligence run summary
   */
  intelligenceRun(params: {
    run_type: 'detection' | 'production' | 'sandbox' | 'full';
    dispensaries_processed: number;
    jobs_queued: number;
    duration_ms: number;
  }): void {
    this.log({
      timestamp: new Date().toISOString(),
      level: 'info',
      event: 'intelligence_run',
      run_type: params.run_type,
      dispensaries_processed: params.dispensaries_processed,
      jobs_queued: params.jobs_queued,
      duration_ms: params.duration_ms,
    });
  }
}

// Export singleton instance
export const crawlerLogger = new CrawlerLoggerService();
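A minimal usage sketch for the singleton above. The IDs, store name, and counts are placeholder values; every method and payload field shown is defined in this file.

// Placeholder values throughout; methods and fields come from CrawlerLoggerService.
const startedAt = Date.now();
crawlerLogger.jobStarted({
  job_id: 101,
  store_id: 7,
  store_name: 'Example Store',
  job_type: 'full_crawl',
  trigger_type: 'scheduled',
  provider: 'dutchie',
});
// ...crawl runs...
crawlerLogger.jobCompleted({
  job_id: 101,
  store_id: 7,
  store_name: 'Example Store',
  duration_ms: Date.now() - startedAt,
  products_found: 250,
  products_new: 12,
  products_updated: 238,
  provider: 'dutchie',
});
// Each call emits exactly one JSON line, e.g.:
// [CRAWLER] {"timestamp":"...","level":"info","event":"job_completed","job_id":101,...}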
620
backend/src/services/intelligence-detector.ts
Normal file
@@ -0,0 +1,620 @@
/**
 * Multi-Category Intelligence Detector
 *
 * Detects providers for each intelligence category independently:
 * - Products: Which provider serves product data
 * - Specials: Which provider serves deals/specials
 * - Brand: Which provider serves brand information
 * - Metadata: Which provider serves taxonomy/category data
 */

import { pool } from '../db/migrate';
import { logger } from './logger';
import puppeteer, { Browser, Page } from 'puppeteer';

// ========================================
// Types
// ========================================

export type IntelligenceCategory = 'product' | 'specials' | 'brand' | 'metadata';

export type MenuProvider =
  | 'dutchie'
  | 'treez'
  | 'jane'
  | 'iheartjane'
  | 'weedmaps'
  | 'leafly'
  | 'meadow'
  | 'greenlight'
  | 'blaze'
  | 'flowhub'
  | 'dispense'
  | 'cova'
  | 'custom_html'
  | 'custom_json'
  | 'dutchie_json'
  | 'other'
  | 'unknown';

export interface CategoryDetectionResult {
  provider: MenuProvider;
  confidence: number;
  mode: 'production' | 'sandbox';
  signals: Record<string, any>;
  templateName?: string;
}

export interface MultiCategoryDetectionResult {
  product: CategoryDetectionResult;
  specials: CategoryDetectionResult;
  brand: CategoryDetectionResult;
  metadata: CategoryDetectionResult;
  urlsTested: string[];
  rawSignals: Record<string, any>;
}

// Production-ready providers per category
// Only these combinations can be set to production mode
const PRODUCTION_READY: Record<IntelligenceCategory, MenuProvider[]> = {
  product: ['dutchie'], // Only Dutchie products are production-ready
  specials: [], // None yet
  brand: [], // None yet
  metadata: [], // None yet
};

// Provider detection patterns
const PROVIDER_PATTERNS: Record<string, {
  scripts: RegExp[];
  iframes: RegExp[];
  html: RegExp[];
  apiEndpoints: RegExp[];
  metaTags: RegExp[];
}> = {
  dutchie: {
    scripts: [
      /dutchie\.com/i,
      /dutchie-plus/i,
      /dutchie\.js/i,
      /__DUTCHIE__/i,
      /dutchie-embed/i,
    ],
    iframes: [
      /dutchie\.com/i,
      /dutchie-plus\.com/i,
      /embed\.dutchie/i,
    ],
    html: [
      /class="dutchie/i,
      /id="dutchie/i,
      /data-dutchie/i,
      /"menuType":\s*"dutchie"/i,
    ],
    apiEndpoints: [
      /dutchie\.com\/graphql/i,
      /plus\.dutchie\.com/i,
    ],
    metaTags: [
      /dutchie/i,
    ],
  },
  treez: {
    scripts: [
      /treez\.io/i,
      /treez-ecommerce/i,
      /treez\.js/i,
    ],
    iframes: [
      /treez\.io/i,
      /shop\.treez/i,
    ],
    html: [
      /class="treez/i,
      /data-treez/i,
      /treez-menu/i,
    ],
    apiEndpoints: [
      /api\.treez\.io/i,
      /treez\.io\/api/i,
    ],
    metaTags: [],
  },
  jane: {
    scripts: [
      /jane\.co/i,
      /iheartjane\.com/i,
      /jane-frame/i,
      /jane\.js/i,
    ],
    iframes: [
      /jane\.co/i,
      /iheartjane\.com/i,
      /embed\.iheartjane/i,
    ],
    html: [
      /class="jane/i,
      /data-jane/i,
      /jane-embed/i,
    ],
    apiEndpoints: [
      /api\.iheartjane/i,
      /jane\.co\/api/i,
    ],
    metaTags: [],
  },
  weedmaps: {
    scripts: [
      /weedmaps\.com/i,
      /wm-menu/i,
    ],
    iframes: [
      /weedmaps\.com/i,
      /menu\.weedmaps/i,
    ],
    html: [
      /data-weedmaps/i,
      /wm-menu/i,
    ],
    apiEndpoints: [
      /api-g\.weedmaps/i,
      /weedmaps\.com\/api/i,
    ],
    metaTags: [],
  },
  leafly: {
    scripts: [
      /leafly\.com/i,
      /leafly-menu/i,
    ],
    iframes: [
      /leafly\.com/i,
      /order\.leafly/i,
    ],
    html: [
      /data-leafly/i,
      /leafly-embed/i,
    ],
    apiEndpoints: [
      /api\.leafly/i,
    ],
    metaTags: [],
  },
};

// Category-specific detection signals
const CATEGORY_SIGNALS: Record<IntelligenceCategory, {
  urlPatterns: RegExp[];
  htmlPatterns: RegExp[];
  jsonKeys: string[];
}> = {
  product: {
    urlPatterns: [/\/menu/i, /\/products/i, /\/shop/i, /\/order/i],
    htmlPatterns: [/product-card/i, /menu-item/i, /product-list/i, /product-grid/i],
    jsonKeys: ['products', 'menuItems', 'items', 'inventory'],
  },
  specials: {
    urlPatterns: [/\/specials/i, /\/deals/i, /\/promotions/i, /\/offers/i],
    htmlPatterns: [/special/i, /deal/i, /promotion/i, /discount/i, /sale/i],
    jsonKeys: ['specials', 'deals', 'promotions', 'offers'],
  },
  brand: {
    urlPatterns: [/\/brands/i, /\/vendors/i, /\/producers/i],
    htmlPatterns: [/brand-list/i, /vendor/i, /producer/i, /manufacturer/i],
    jsonKeys: ['brands', 'vendors', 'producers', 'manufacturers'],
  },
  metadata: {
    urlPatterns: [/\/categories/i, /\/taxonomy/i],
    htmlPatterns: [/category-nav/i, /menu-categories/i, /filter-category/i],
    jsonKeys: ['categories', 'taxonomy', 'filters', 'types'],
  },
};

// ========================================
// Main Detection Function
// ========================================

export async function detectMultiCategoryProviders(
  websiteUrl: string,
  options: {
    timeout?: number;
    headless?: boolean;
    existingBrowser?: Browser;
  } = {}
): Promise<MultiCategoryDetectionResult> {
  const { timeout = 30000, headless = true, existingBrowser } = options;

  let browser: Browser | null = null;
  let page: Page | null = null;
  const urlsTested: string[] = [];
  const rawSignals: Record<string, any> = {};

  try {
    browser = existingBrowser || await puppeteer.launch({
      headless,
      args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
    });

    page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');

    // Navigate to main site
    const baseUrl = normalizeUrl(websiteUrl);
    urlsTested.push(baseUrl);

    await page.goto(baseUrl, { waitUntil: 'networkidle2', timeout });

    // Collect signals from main page
    const mainPageSignals = await collectPageSignals(page);
    rawSignals.mainPage = mainPageSignals;

    // Try common menu URLs
    const menuUrls = ['/menu', '/shop', '/products', '/order', '/specials', '/deals', '/brands'];
    for (const path of menuUrls) {
      try {
        const fullUrl = new URL(path, baseUrl).toString();
        urlsTested.push(fullUrl);
        await page.goto(fullUrl, { waitUntil: 'networkidle2', timeout: 15000 });
        const signals = await collectPageSignals(page);
        rawSignals[path] = signals;
      } catch {
        // URL doesn't exist or timed out
      }
    }

    // Analyze signals for each category
    const result: MultiCategoryDetectionResult = {
      product: analyzeCategorySignals('product', rawSignals),
      specials: analyzeCategorySignals('specials', rawSignals),
      brand: analyzeCategorySignals('brand', rawSignals),
      metadata: analyzeCategorySignals('metadata', rawSignals),
      urlsTested,
      rawSignals,
    };

    logger.info('provider-detection', `Multi-category detection complete for ${websiteUrl}`);
    return result;

  } catch (error: any) {
    logger.error('provider-detection', `Detection failed for ${websiteUrl}: ${error.message}`);

    // Return unknown results for all categories
    return {
      product: createUnknownResult(),
      specials: createUnknownResult(),
      brand: createUnknownResult(),
      metadata: createUnknownResult(),
      urlsTested,
      rawSignals: { error: error.message },
    };
  } finally {
    if (page) await page.close().catch(() => {});
    if (browser && !existingBrowser) await browser.close().catch(() => {});
  }
}

// ========================================
// Helper Functions
// ========================================

function normalizeUrl(url: string): string {
  if (!url.startsWith('http')) {
    url = 'https://' + url;
  }
  return url.replace(/\/$/, '');
}

async function collectPageSignals(page: Page): Promise<Record<string, any>> {
  return page.evaluate(() => {
    const signals: Record<string, any> = {
      scripts: [] as string[],
      iframes: [] as string[],
      links: [] as string[],
      metaTags: [] as string[],
      bodyClasses: document.body?.className || '',
      bodyId: document.body?.id || '',
      htmlSnippet: document.documentElement.outerHTML.slice(0, 10000),
    };

    // Collect script sources
    document.querySelectorAll('script[src]').forEach((el) => {
      signals.scripts.push((el as HTMLScriptElement).src);
    });

    // Collect inline scripts
    document.querySelectorAll('script:not([src])').forEach((el) => {
      const content = el.textContent || '';
      if (content.length < 5000) {
        signals.scripts.push(`inline:${content.slice(0, 500)}`);
      }
    });

    // Collect iframes
    document.querySelectorAll('iframe').forEach((el) => {
      signals.iframes.push(el.src);
    });

    // Collect links
    document.querySelectorAll('a[href]').forEach((el) => {
      signals.links.push((el as HTMLAnchorElement).href);
    });

    // Collect meta tags
    document.querySelectorAll('meta').forEach((el) => {
      const content = el.getAttribute('content') || '';
      const name = el.getAttribute('name') || el.getAttribute('property') || '';
      if (content || name) {
        signals.metaTags.push(`${name}:${content}`);
      }
    });

    // Look for JSON data
    const jsonBlocks: string[] = [];
    document.querySelectorAll('script[type="application/json"]').forEach((el) => {
      jsonBlocks.push(el.textContent?.slice(0, 2000) || '');
    });
    signals.jsonBlocks = jsonBlocks;

    return signals;
  });
}

function analyzeCategorySignals(
  category: IntelligenceCategory,
  allSignals: Record<string, any>
): CategoryDetectionResult {
  const providerScores: Record<MenuProvider, number> = {} as any;
  const detectedSignals: Record<string, any> = {};

  // Initialize scores
  for (const provider of Object.keys(PROVIDER_PATTERNS)) {
    providerScores[provider as MenuProvider] = 0;
  }

  // Analyze each page's signals
  for (const [pagePath, signals] of Object.entries(allSignals)) {
    if (!signals || typeof signals !== 'object') continue;

    // Check for provider-specific patterns
    for (const [provider, patterns] of Object.entries(PROVIDER_PATTERNS)) {
      let score = 0;

      // Check scripts
      if (signals.scripts) {
        for (const script of signals.scripts) {
          for (const pattern of patterns.scripts) {
            if (pattern.test(script)) {
              score += 20;
              detectedSignals[`${provider}_script_${pagePath}`] = script;
            }
          }
        }
      }

      // Check iframes
      if (signals.iframes) {
        for (const iframe of signals.iframes) {
          for (const pattern of patterns.iframes) {
            if (pattern.test(iframe)) {
              score += 25;
              detectedSignals[`${provider}_iframe_${pagePath}`] = iframe;
            }
          }
        }
      }

      // Check HTML content
      if (signals.htmlSnippet) {
        for (const pattern of patterns.html) {
          if (pattern.test(signals.htmlSnippet)) {
            score += 15;
            detectedSignals[`${provider}_html_${pagePath}`] = true;
          }
        }
      }

      providerScores[provider as MenuProvider] += score;
    }

    // Check for category-specific signals on relevant pages
    const categorySignals = CATEGORY_SIGNALS[category];
    const isRelevantPage = categorySignals.urlPatterns.some((p) => p.test(pagePath));

    if (isRelevantPage && signals.htmlSnippet) {
      for (const pattern of categorySignals.htmlPatterns) {
        if (pattern.test(signals.htmlSnippet)) {
          detectedSignals[`${category}_html_pattern`] = true;
        }
      }
    }

    // Check JSON blocks for category data
    if (signals.jsonBlocks) {
      for (const json of signals.jsonBlocks) {
        for (const key of categorySignals.jsonKeys) {
          if (json.toLowerCase().includes(`"${key}"`)) {
            detectedSignals[`${category}_json_key_${key}`] = true;
          }
        }
      }
    }
  }

  // Determine winning provider
  let bestProvider: MenuProvider = 'unknown';
  let bestScore = 0;

  for (const [provider, score] of Object.entries(providerScores)) {
    if (score > bestScore) {
      bestScore = score;
      bestProvider = provider as MenuProvider;
    }
  }

  // Calculate confidence (0-100)
  const confidence = Math.min(100, bestScore);

  // Determine mode based on provider and confidence
  const isProductionReady = PRODUCTION_READY[category].includes(bestProvider);
  const mode: 'production' | 'sandbox' = isProductionReady && confidence >= 70
    ? 'production'
    : 'sandbox';

  // Get template name if available
  let templateName: string | undefined;
  if (bestProvider === 'dutchie' && category === 'product') {
    templateName = 'dutchie_standard';
  } else if (bestProvider === 'treez') {
    templateName = 'treez_products_v0';
  }

  return {
    provider: bestProvider,
    confidence,
    mode,
    signals: detectedSignals,
    templateName,
  };
}

function createUnknownResult(): CategoryDetectionResult {
  return {
    provider: 'unknown',
    confidence: 0,
    mode: 'sandbox',
    signals: {},
  };
}

// ========================================
// Lightweight Per-Category Change Detection
// ========================================

export async function detectCategoryProviderChange(
  page: Page,
  category: IntelligenceCategory,
  expectedProvider: MenuProvider
): Promise<{ changed: boolean; newProvider?: MenuProvider; confidence?: number }> {
  try {
    const signals = await collectPageSignals(page);
    const result = analyzeCategorySignals(category, { currentPage: signals });

    if (result.provider !== expectedProvider && result.confidence > 50) {
      logger.warn(
        'provider-detection',
        `Provider change detected for ${category}: ${expectedProvider} -> ${result.provider}`
      );
      return {
        changed: true,
        newProvider: result.provider,
        confidence: result.confidence,
      };
    }

    return { changed: false };
  } catch (error: any) {
    logger.error('provider-detection', `Change detection failed: ${error.message}`);
    return { changed: false };
  }
}

// ========================================
// Database Operations
// ========================================

export async function updateDispensaryCategoryProvider(
  dispensaryId: number,
  category: IntelligenceCategory,
  result: CategoryDetectionResult
): Promise<void> {
  const columnPrefix = category === 'product' ? 'product' :
                       category === 'specials' ? 'specials' :
                       category === 'brand' ? 'brand' : 'metadata';

  await pool.query(
    `UPDATE dispensaries SET
      ${columnPrefix}_provider = $1,
      ${columnPrefix}_confidence = $2,
      ${columnPrefix}_crawler_mode = $3,
      ${columnPrefix}_detection_data = $4,
      updated_at = NOW()
    WHERE id = $5`,
    [
      result.provider,
      result.confidence,
      result.mode,
      JSON.stringify(result.signals),
      dispensaryId,
    ]
  );
}

export async function updateAllCategoryProviders(
  dispensaryId: number,
  result: MultiCategoryDetectionResult
): Promise<void> {
  await pool.query(
    `UPDATE dispensaries SET
      product_provider = $1,
      product_confidence = $2,
      product_crawler_mode = $3,
      product_detection_data = $4,
      specials_provider = $5,
      specials_confidence = $6,
      specials_crawler_mode = $7,
      specials_detection_data = $8,
      brand_provider = $9,
      brand_confidence = $10,
      brand_crawler_mode = $11,
      brand_detection_data = $12,
      metadata_provider = $13,
      metadata_confidence = $14,
      metadata_crawler_mode = $15,
      metadata_detection_data = $16,
      updated_at = NOW()
    WHERE id = $17`,
    [
      result.product.provider,
      result.product.confidence,
      result.product.mode,
      JSON.stringify(result.product.signals),
      result.specials.provider,
      result.specials.confidence,
      result.specials.mode,
      JSON.stringify(result.specials.signals),
      result.brand.provider,
      result.brand.confidence,
      result.brand.mode,
      JSON.stringify(result.brand.signals),
      result.metadata.provider,
      result.metadata.confidence,
      result.metadata.mode,
      JSON.stringify(result.metadata.signals),
      dispensaryId,
    ]
  );
}

export async function moveCategoryToSandbox(
  dispensaryId: number,
  category: IntelligenceCategory,
  reason: string
): Promise<void> {
  const columnPrefix = category === 'product' ? 'product' :
                       category === 'specials' ? 'specials' :
                       category === 'brand' ? 'brand' : 'metadata';

  await pool.query(
    `UPDATE dispensaries SET
      ${columnPrefix}_crawler_mode = 'sandbox',
      ${columnPrefix}_detection_data = ${columnPrefix}_detection_data || $1::jsonb,
      updated_at = NOW()
    WHERE id = $2`,
    [
      JSON.stringify({ sandbox_reason: reason, sandbox_at: new Date().toISOString() }),
      dispensaryId,
    ]
  );

  logger.info('provider-detection', `Moved dispensary ${dispensaryId} ${category} to sandbox: ${reason}`);
}
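A minimal sketch of the intended call flow for the detector above, assuming an async context and an existing dispensaryId; the URL is a placeholder. Both functions are exported from this file.

// Placeholder URL and dispensaryId; detect all four categories in one pass, then persist.
const detection = await detectMultiCategoryProviders('https://example-dispensary.com');
await updateAllCategoryProviders(dispensaryId, detection);
// detection.product might come back as { provider: 'dutchie', confidence: 85, mode: 'production', ... },
// while detection.specials stays in 'sandbox' until a specials provider is production-ready.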
@@ -1,7 +1,7 @@
 interface LogEntry {
   timestamp: Date;
   level: 'info' | 'error' | 'warn' | 'debug';
-  category: 'scraper' | 'images' | 'categories' | 'system' | 'api' | 'pipeline' | 'age-gate' | 'proxy';
+  category: 'scraper' | 'images' | 'categories' | 'system' | 'api' | 'pipeline' | 'age-gate' | 'proxy' | 'crawler-jobs' | 'provider-detection' | 'sandbox' | 'intelligence';
   message: string;
 }
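The widened category union is what lets the new crawler services log under their own namespaces; the calls in the files above rely on it, for example (provider names in the second line are illustrative):

logger.info('crawler-jobs', `Starting sandbox crawl for dispensary ${dispensaryId}`);
logger.warn('provider-detection', `Provider change detected for product: dutchie -> treez`);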
726
backend/src/services/menu-provider-detector.ts
Normal file
@@ -0,0 +1,726 @@
/**
|
||||||
|
* Menu Provider Detection Service
|
||||||
|
*
|
||||||
|
* Detects which menu platform a dispensary is using by analyzing:
|
||||||
|
* - HTML content patterns (scripts, iframes, classes)
|
||||||
|
* - URL patterns (embedded menu paths)
|
||||||
|
* - API endpoint signatures
|
||||||
|
* - Meta tags and headers
|
||||||
|
*/
|
||||||
|
|
||||||
|
import puppeteer, { Browser, Page } from 'puppeteer';
|
||||||
|
import { logger } from './logger';
|
||||||
|
|
||||||
|
// Known menu provider signatures
|
||||||
|
export type MenuProvider =
|
||||||
|
| 'dutchie'
|
||||||
|
| 'treez'
|
||||||
|
| 'jane'
|
||||||
|
| 'iheartjane'
|
||||||
|
| 'weedmaps'
|
||||||
|
| 'leafly'
|
||||||
|
| 'meadow'
|
||||||
|
| 'greenlight'
|
||||||
|
| 'blaze'
|
||||||
|
| 'flowhub'
|
||||||
|
| 'dispense'
|
||||||
|
| 'cova'
|
||||||
|
| 'other'
|
||||||
|
| 'unknown';
|
||||||
|
|
||||||
|
export interface DetectionSignal {
|
||||||
|
provider: MenuProvider;
|
||||||
|
confidence: number; // 0-100
|
||||||
|
source: string; // What triggered this detection
|
||||||
|
details?: string; // Additional context
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface DetectionResult {
|
||||||
|
provider: MenuProvider;
|
||||||
|
confidence: number;
|
||||||
|
signals: DetectionSignal[];
|
||||||
|
urlsTested: string[];
|
||||||
|
menuEntryPoints: string[];
|
||||||
|
rawSignals: Record<string, boolean | string | number>;
|
||||||
|
error?: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Provider detection patterns
|
||||||
|
const PROVIDER_PATTERNS: Record<string, {
|
||||||
|
scripts: RegExp[];
|
||||||
|
iframes: RegExp[];
|
||||||
|
classes: RegExp[];
|
||||||
|
urls: RegExp[];
|
||||||
|
meta: RegExp[];
|
||||||
|
apiEndpoints: RegExp[];
|
||||||
|
htmlPatterns: RegExp[];
|
||||||
|
}> = {
|
||||||
|
dutchie: {
|
||||||
|
scripts: [
|
||||||
|
/dutchie/i,
|
||||||
|
/dutchie-plus/i,
|
||||||
|
/dutchie\.com/i,
|
||||||
|
/dutchie-embed/i,
|
||||||
|
],
|
||||||
|
iframes: [
|
||||||
|
/dutchie\.com/i,
|
||||||
|
/embed\.dutchie/i,
|
||||||
|
/iframe\.dutchie/i,
|
||||||
|
],
|
||||||
|
classes: [
|
||||||
|
/dutchie-/i,
|
||||||
|
/DutchieEmbed/i,
|
||||||
|
],
|
||||||
|
urls: [
|
||||||
|
/dutchie\.com/i,
|
||||||
|
/\.dutchie\./i,
|
||||||
|
],
|
||||||
|
meta: [
|
||||||
|
/dutchie/i,
|
||||||
|
],
|
||||||
|
apiEndpoints: [
|
||||||
|
/graphql.*dutchie/i,
|
||||||
|
/api\.dutchie/i,
|
||||||
|
],
|
||||||
|
htmlPatterns: [
|
||||||
|
/data-dutchie/i,
|
||||||
|
/__DUTCHIE__/i,
|
||||||
|
/dutchie-plus-iframe/i,
|
||||||
|
],
|
||||||
|
},
|
||||||
|
|
||||||
|
treez: {
|
||||||
|
scripts: [
|
||||||
|
/treez/i,
|
||||||
|
/treez\.io/i,
|
||||||
|
/treezpay/i,
|
||||||
|
],
|
||||||
|
iframes: [
|
||||||
|
/treez\.io/i,
|
||||||
|
/menu\.treez/i,
|
||||||
|
],
|
||||||
|
classes: [
|
||||||
|
/treez-/i,
|
||||||
|
],
|
||||||
|
urls: [
|
||||||
|
/treez\.io/i,
|
||||||
|
/\.treez\./i,
|
||||||
|
],
|
||||||
|
meta: [
|
||||||
|
/treez/i,
|
||||||
|
],
|
||||||
|
apiEndpoints: [
|
||||||
|
/api\.treez/i,
|
||||||
|
],
|
||||||
|
htmlPatterns: [
|
||||||
|
/data-treez/i,
|
||||||
|
/treez-embed/i,
|
||||||
|
],
|
||||||
|
},
|
||||||
|
|
||||||
|
jane: {
|
||||||
|
scripts: [
|
||||||
|
/jane\.co/i,
|
||||||
|
/iheartjane/i,
|
||||||
|
/jane-embed/i,
|
||||||
|
/janetechnologies/i,
|
||||||
|
],
|
||||||
|
iframes: [
|
||||||
|
/jane\.co/i,
|
||||||
|
/iheartjane\.com/i,
|
||||||
|
/menu\.jane/i,
|
||||||
|
],
|
||||||
|
classes: [
|
||||||
|
/jane-/i,
|
||||||
|
/iheartjane/i,
|
||||||
|
],
|
||||||
|
urls: [
|
||||||
|
/jane\.co/i,
|
||||||
|
/iheartjane\.com/i,
|
||||||
|
],
|
||||||
|
meta: [
|
||||||
|
/jane/i,
|
||||||
|
/iheartjane/i,
|
||||||
|
],
|
||||||
|
apiEndpoints: [
|
||||||
|
/api\.iheartjane/i,
|
||||||
|
/api\.jane\.co/i,
|
||||||
|
],
|
||||||
|
htmlPatterns: [
|
||||||
|
/data-jane/i,
|
||||||
|
/jane-root/i,
|
||||||
|
/jane-embed/i,
|
||||||
|
],
|
||||||
|
},
|
||||||
|
|
||||||
|
weedmaps: {
|
||||||
|
scripts: [
|
||||||
|
/weedmaps/i,
|
||||||
|
/wm\.com/i,
|
||||||
|
],
|
||||||
|
iframes: [
|
||||||
|
/weedmaps\.com/i,
|
||||||
|
/menu\.weedmaps/i,
|
||||||
|
],
|
||||||
|
classes: [
|
||||||
|
/weedmaps-/i,
|
||||||
|
/wm-/i,
|
||||||
|
],
|
||||||
|
urls: [
|
||||||
|
/weedmaps\.com/i,
|
||||||
|
],
|
||||||
|
meta: [
|
||||||
|
/weedmaps/i,
|
||||||
|
],
|
||||||
|
apiEndpoints: [
|
||||||
|
/api.*weedmaps/i,
|
||||||
|
],
|
||||||
|
htmlPatterns: [
|
||||||
|
/data-weedmaps/i,
|
||||||
|
],
|
||||||
|
},
|
||||||
|
|
||||||
|
leafly: {
|
||||||
|
scripts: [
|
||||||
|
/leafly/i,
|
||||||
|
/leafly\.com/i,
|
||||||
|
],
|
||||||
|
iframes: [
|
||||||
|
/leafly\.com/i,
|
||||||
|
/menu\.leafly/i,
|
||||||
|
],
|
||||||
|
classes: [
|
||||||
|
/leafly-/i,
|
||||||
|
],
|
||||||
|
urls: [
|
||||||
|
/leafly\.com/i,
|
||||||
|
],
|
||||||
|
meta: [
|
||||||
|
/leafly/i,
|
||||||
|
],
|
||||||
|
apiEndpoints: [
|
||||||
|
/api\.leafly/i,
|
||||||
|
],
|
||||||
|
htmlPatterns: [
|
||||||
|
/data-leafly/i,
|
||||||
|
],
|
||||||
|
},
|
||||||
|
|
||||||
|
meadow: {
|
||||||
|
scripts: [
|
||||||
|
/meadow/i,
|
||||||
|
/getmeadow/i,
|
||||||
|
],
|
||||||
|
iframes: [
|
||||||
|
/getmeadow\.com/i,
|
||||||
|
],
|
||||||
|
classes: [
|
||||||
|
/meadow-/i,
|
||||||
|
],
|
||||||
|
urls: [
|
||||||
|
/getmeadow\.com/i,
|
||||||
|
],
|
||||||
|
meta: [],
|
||||||
|
apiEndpoints: [
|
||||||
|
/api\.getmeadow/i,
|
||||||
|
],
|
||||||
|
htmlPatterns: [],
|
||||||
|
},
|
||||||
|
|
||||||
|
greenlight: {
|
||||||
|
scripts: [
|
||||||
|
/greenlight/i,
|
||||||
|
/greenlightmenu/i,
|
||||||
|
],
|
||||||
|
iframes: [
|
||||||
|
/greenlight/i,
|
||||||
|
],
|
||||||
|
classes: [
|
||||||
|
/greenlight-/i,
|
||||||
|
],
|
||||||
|
urls: [
|
||||||
|
/greenlight/i,
|
||||||
|
],
|
||||||
|
meta: [],
|
||||||
|
apiEndpoints: [],
|
||||||
|
htmlPatterns: [],
|
||||||
|
},
|
||||||
|
|
||||||
|
blaze: {
|
||||||
|
scripts: [
|
||||||
|
/blaze\.me/i,
|
||||||
|
/blazepos/i,
|
||||||
|
],
|
||||||
|
iframes: [
|
||||||
|
/blaze\.me/i,
|
||||||
|
],
|
||||||
|
classes: [
|
||||||
|
/blaze-/i,
|
||||||
|
],
|
||||||
|
urls: [
|
||||||
|
/blaze\.me/i,
|
||||||
|
],
|
||||||
|
meta: [],
|
||||||
|
apiEndpoints: [
|
||||||
|
/api\.blaze/i,
|
||||||
|
],
|
||||||
|
htmlPatterns: [],
|
||||||
|
},
|
||||||
|
|
||||||
|
flowhub: {
|
||||||
|
scripts: [
|
||||||
|
/flowhub/i,
|
||||||
|
],
|
||||||
|
iframes: [
|
||||||
|
/flowhub\.com/i,
|
||||||
|
],
|
||||||
|
classes: [
|
||||||
|
/flowhub-/i,
|
||||||
|
],
|
||||||
|
urls: [
|
||||||
|
/flowhub\.com/i,
|
||||||
|
],
|
||||||
|
meta: [],
|
||||||
|
apiEndpoints: [],
|
||||||
|
htmlPatterns: [],
|
||||||
|
},
|
||||||
|
|
||||||
|
dispense: {
|
||||||
|
scripts: [
|
||||||
|
/dispenseapp/i,
|
||||||
|
],
|
||||||
|
iframes: [
|
||||||
|
/dispenseapp\.com/i,
|
||||||
|
],
|
||||||
|
classes: [
|
||||||
|
/dispense-/i,
|
||||||
|
],
|
||||||
|
urls: [
|
||||||
|
/dispenseapp\.com/i,
|
||||||
|
],
|
||||||
|
meta: [],
|
||||||
|
apiEndpoints: [],
|
||||||
|
htmlPatterns: [],
|
||||||
|
},
|
||||||
|
|
||||||
|
cova: {
|
||||||
|
scripts: [
|
||||||
|
/covasoftware/i,
|
||||||
|
/cova\.software/i,
|
||||||
|
],
|
||||||
|
iframes: [
|
||||||
|
/cova/i,
|
||||||
|
],
|
||||||
|
classes: [
|
||||||
|
/cova-/i,
|
||||||
|
],
|
||||||
|
urls: [
|
||||||
|
/cova/i,
|
||||||
|
],
|
||||||
|
meta: [],
|
||||||
|
apiEndpoints: [],
|
||||||
|
htmlPatterns: [],
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
// Common menu URL paths to check
|
||||||
|
const MENU_PATHS = [
|
||||||
|
'/menu',
|
||||||
|
'/shop',
|
||||||
|
'/products',
|
||||||
|
'/order',
|
||||||
|
'/store',
|
||||||
|
'/dispensary-menu',
|
||||||
|
'/online-menu',
|
||||||
|
'/shop-all',
|
||||||
|
'/browse',
|
||||||
|
'/catalog',
|
||||||
|
];
|
||||||
|
|
||||||
|
/**
 * Analyze a single page for provider signals
 */
async function analyzePageForProviders(
  page: Page,
  url: string
): Promise<DetectionSignal[]> {
  const signals: DetectionSignal[] = [];

  try {
    // Get page HTML
    const html = await page.content();
    const lowerHtml = html.toLowerCase();

    // Check each provider's patterns
    for (const [provider, patterns] of Object.entries(PROVIDER_PATTERNS)) {
      // Check script sources
      const scripts = await page.$$eval('script[src]', els =>
        els.map(el => el.getAttribute('src') || '')
      );
      for (const script of scripts) {
        for (const pattern of patterns.scripts) {
          if (pattern.test(script)) {
            signals.push({
              provider: provider as MenuProvider,
              confidence: 90,
              source: 'script_src',
              details: script,
            });
          }
        }
      }

      // Check inline scripts
      const inlineScripts = await page.$$eval('script:not([src])', els =>
        els.map(el => el.textContent || '')
      );
      for (const scriptContent of inlineScripts) {
        for (const pattern of patterns.scripts) {
          if (pattern.test(scriptContent)) {
            signals.push({
              provider: provider as MenuProvider,
              confidence: 70,
              source: 'inline_script',
              details: `Pattern: ${pattern}`,
            });
          }
        }
      }

      // Check iframes
      const iframes = await page.$$eval('iframe', els =>
        els.map(el => el.getAttribute('src') || '')
      );
      for (const iframe of iframes) {
        for (const pattern of patterns.iframes) {
          if (pattern.test(iframe)) {
            signals.push({
              provider: provider as MenuProvider,
              confidence: 95,
              source: 'iframe_src',
              details: iframe,
            });
          }
        }
      }

      // Check HTML patterns
      for (const pattern of patterns.htmlPatterns) {
        if (pattern.test(html)) {
          signals.push({
            provider: provider as MenuProvider,
            confidence: 85,
            source: 'html_pattern',
            details: `Pattern: ${pattern}`,
          });
        }
      }

      // Check CSS classes
      for (const pattern of patterns.classes) {
        if (pattern.test(html)) {
          signals.push({
            provider: provider as MenuProvider,
            confidence: 60,
            source: 'css_class',
            details: `Pattern: ${pattern}`,
          });
        }
      }

      // Check meta tags
      const metaTags = await page.$$eval('meta', els =>
        els.map(el => `${el.getAttribute('name')} ${el.getAttribute('content')}`)
      );
      for (const meta of metaTags) {
        for (const pattern of patterns.meta) {
          if (pattern.test(meta)) {
            signals.push({
              provider: provider as MenuProvider,
              confidence: 80,
              source: 'meta_tag',
              details: meta,
            });
          }
        }
      }
    }

    // Check for network requests (if we intercepted them)
    // This would be enhanced with request interception

  } catch (error) {
    logger.error('provider-detection', `Error analyzing page ${url}: ${error}`);
  }

  return signals;
}
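// For reference (an illustrative summary of the constants above, not new
// behavior): signal sources are weighted iframe_src 95 > script_src 90 >
// html_pattern 85 > meta_tag 80 > inline_script 70 > css_class 60, so
// harder-to-fake embed evidence outranks loose markup matches.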
/**
 * Aggregate signals into a final detection result
 */
function aggregateSignals(signals: DetectionSignal[]): { provider: MenuProvider; confidence: number } {
  if (signals.length === 0) {
    return { provider: 'unknown', confidence: 0 };
  }

  // Group signals by provider
  const providerScores: Record<string, number[]> = {};
  for (const signal of signals) {
    if (!providerScores[signal.provider]) {
      providerScores[signal.provider] = [];
    }
    providerScores[signal.provider].push(signal.confidence);
  }

  // Calculate weighted score for each provider
  const scores: { provider: MenuProvider; score: number }[] = [];
  for (const [provider, confidences] of Object.entries(providerScores)) {
    // Use max confidence + bonus for multiple signals
    const maxConf = Math.max(...confidences);
    const multiSignalBonus = Math.min(10, (confidences.length - 1) * 3);
    const score = Math.min(100, maxConf + multiSignalBonus);
    scores.push({ provider: provider as MenuProvider, score });
  }

  // Sort by score descending
  scores.sort((a, b) => b.score - a.score);

  const best = scores[0];

  // If there's a clear winner (20+ point lead), use it
  if (scores.length === 1 || best.score - scores[1].score >= 20) {
    return { provider: best.provider, confidence: best.score };
  }

  // Multiple contenders - reduce confidence
  return { provider: best.provider, confidence: Math.max(50, best.score - 20) };
}
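// Worked example (illustrative, not part of this commit): dutchie signals
// [95, 90, 70] score min(100, 95 + min(10, 2 * 3)) = 100, while a lone cova
// signal [60] scores 60; the 40-point lead (>= 20) yields
// { provider: 'dutchie', confidence: 100 }. Were a second contender to score
// 85, the lead would drop below 20 and confidence would fall to
// max(50, 100 - 20) = 80.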
/**
 * Detect the menu provider for a dispensary
 */
export async function detectMenuProvider(
  websiteUrl: string,
  options: {
    checkMenuPaths?: boolean;
    timeout?: number;
  } = {}
): Promise<DetectionResult> {
  const { checkMenuPaths = true, timeout = 30000 } = options;

  const result: DetectionResult = {
    provider: 'unknown',
    confidence: 0,
    signals: [],
    urlsTested: [],
    menuEntryPoints: [],
    rawSignals: {},
  };

  let browser: Browser | null = null;

  try {
    // Normalize URL
    let baseUrl = websiteUrl.trim();
    if (!baseUrl.startsWith('http')) {
      baseUrl = `https://${baseUrl}`;
    }
    baseUrl = baseUrl.replace(/\/$/, ''); // Remove trailing slash

    // Launch browser
    browser = await puppeteer.launch({
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
      ],
    });

    const page = await browser.newPage();
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    );

    // Track network requests for API detection
    const apiRequests: string[] = [];
    await page.setRequestInterception(true);
    page.on('request', (request) => {
      const url = request.url();
      if (url.includes('api') || url.includes('graphql')) {
        apiRequests.push(url);
      }
      request.continue();
    });

    // URLs to check
    const urlsToCheck = [baseUrl];
    if (checkMenuPaths) {
      for (const path of MENU_PATHS) {
        urlsToCheck.push(`${baseUrl}${path}`);
      }
    }

    // Check each URL
    for (const url of urlsToCheck) {
      try {
        result.urlsTested.push(url);

        await page.goto(url, {
          waitUntil: 'networkidle2',
          timeout,
        });

        // Wait a bit for dynamic content
        await new Promise(r => setTimeout(r, 2000));

        // Analyze page
        const pageSignals = await analyzePageForProviders(page, url);
        result.signals.push(...pageSignals);

        // Track if this URL has menu content
        const hasMenuContent = await page.evaluate(() => {
          const text = document.body.innerText.toLowerCase();
          return (
            text.includes('add to cart') ||
            text.includes('add to bag') ||
            text.includes('product') ||
            text.includes('indica') ||
            text.includes('sativa') ||
            text.includes('hybrid') ||
            text.includes('thc') ||
            text.includes('cbd')
          );
        });

        if (hasMenuContent && url !== baseUrl) {
          result.menuEntryPoints.push(url);
        }

      } catch (pageError: any) {
        // 404s are fine, just skip
        if (!pageError.message?.includes('404')) {
          logger.warn('provider-detection', `Could not load ${url}: ${pageError.message}`);
        }
      }
    }

    // Check API requests for provider hints
    for (const apiUrl of apiRequests) {
      for (const [provider, patterns] of Object.entries(PROVIDER_PATTERNS)) {
        for (const pattern of patterns.apiEndpoints) {
          if (pattern.test(apiUrl)) {
            result.signals.push({
              provider: provider as MenuProvider,
              confidence: 95,
              source: 'api_request',
              details: apiUrl,
            });
          }
        }
      }
    }

    // Record raw signals
    result.rawSignals = {
      apiRequestsFound: apiRequests.length,
      menuEntryPointsFound: result.menuEntryPoints.length,
      totalSignals: result.signals.length,
      uniqueProviders: [...new Set(result.signals.map(s => s.provider))].length,
    };

    // Aggregate signals into final result
    const aggregated = aggregateSignals(result.signals);
    result.provider = aggregated.provider;
    result.confidence = aggregated.confidence;

  } catch (error: any) {
    result.error = error.message;
    logger.error('provider-detection', `Detection failed for ${websiteUrl}: ${error.message}`);
  } finally {
    if (browser) {
      await browser.close();
    }
  }

  return result;
}
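// --- Illustrative usage (not part of this commit) ---
// A minimal sketch of a one-off detection run; the URL is hypothetical, and
// the missing scheme is tolerated by the normalization shown above.
async function exampleDetectOnce(): Promise<void> {
  const result = await detectMenuProvider('example-dispensary.test', { checkMenuPaths: true });
  console.log(`provider=${result.provider} confidence=${result.confidence}%`);
  console.log(`tested ${result.urlsTested.length} URLs, menu entry points:`, result.menuEntryPoints);
}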
/**
 * Quick check if a site has Dutchie - used during production crawls
 */
export async function quickDutchieCheck(page: Page): Promise<boolean> {
  try {
    const html = await page.content();

    // Check for Dutchie-specific patterns
    const dutchiePatterns = [
      /dutchie/i,
      /dutchie-plus/i,
      /__DUTCHIE__/i,
      /data-dutchie/i,
      /embed\.dutchie/i,
    ];

    for (const pattern of dutchiePatterns) {
      if (pattern.test(html)) {
        return true;
      }
    }

    // Check iframes
    const iframes = await page.$$eval('iframe', els =>
      els.map(el => el.getAttribute('src') || '')
    );
    for (const iframe of iframes) {
      if (/dutchie/i.test(iframe)) {
        return true;
      }
    }

    return false;
  } catch {
    return false;
  }
}

/**
 * Check if provider has changed from expected
 */
export async function detectProviderChange(
  page: Page,
  expectedProvider: MenuProvider
): Promise<{ changed: boolean; newProvider?: MenuProvider; confidence?: number }> {
  try {
    const signals = await analyzePageForProviders(page, page.url());
    const aggregated = aggregateSignals(signals);

    // If we expected Dutchie but found something else with high confidence
    if (expectedProvider === 'dutchie' && aggregated.provider !== 'dutchie' && aggregated.confidence >= 70) {
      return {
        changed: true,
        newProvider: aggregated.provider,
        confidence: aggregated.confidence,
      };
    }

    // If we expected Dutchie and found nothing/low confidence, might have switched
    if (expectedProvider === 'dutchie' && aggregated.confidence < 30) {
      // Check if Dutchie is definitely NOT present
      const hasDutchie = await quickDutchieCheck(page);
      if (!hasDutchie) {
        return {
          changed: true,
          newProvider: aggregated.provider !== 'unknown' ? aggregated.provider : 'other',
          confidence: Math.max(30, aggregated.confidence),
        };
      }
    }

    return { changed: false };
  } catch {
    return { changed: false };
  }
}
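// --- Illustrative usage (not part of this commit) ---
// A sketch of guarding a production Dutchie crawl with the change detector;
// assumes `page` is already navigated to the store's menu.
async function exampleCrawlGuard(page: Page): Promise<boolean> {
  const check = await detectProviderChange(page, 'dutchie');
  if (check.changed) {
    logger.warn(
      'provider-detection',
      `Provider appears to have changed to ${check.newProvider} (${check.confidence}% confidence); skipping Dutchie crawl`
    );
    return false;
  }
  return true;
}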
441
backend/src/services/store-crawl-orchestrator.ts
Normal file
@@ -0,0 +1,441 @@
/**
 * Store Crawl Orchestrator
 *
 * Orchestrates the complete crawl workflow for a store:
 * 1. Load store and its linked dispensary
 * 2. Check if provider detection is needed
 * 3. Run provider detection if needed
 * 4. Queue appropriate crawl jobs based on provider/mode
 * 5. Update store_crawl_schedule with meaningful status
 *
 * This replaces the simple "triggerManualCrawl" with intelligent orchestration.
 */

import { v4 as uuidv4 } from 'uuid';
import { pool } from '../db/migrate';
import { crawlerLogger } from './crawler-logger';
import {
  detectMultiCategoryProviders,
  updateAllCategoryProviders,
  MultiCategoryDetectionResult,
} from './intelligence-detector';
import { runCrawlProductsJob, runSandboxProductsJob } from './category-crawler-jobs';
import { scrapeStore } from '../scraper-v2';

// ========================================
// Types
// ========================================

export type OrchestratorStatus = 'success' | 'error' | 'sandbox_only' | 'detection_only' | 'pending' | 'running';

export interface OrchestratorResult {
  status: OrchestratorStatus;
  summary: string;
  runId: string;
  storeId: number;
  dispensaryId: number | null;
  detectionRan: boolean;
  detectionResult?: MultiCategoryDetectionResult;
  crawlRan: boolean;
  crawlType?: 'production' | 'sandbox' | 'none';
  productsFound?: number;
  productsNew?: number;
  productsUpdated?: number;
  error?: string;
  durationMs: number;
}

interface StoreWithDispensary {
  id: number;
  name: string;
  slug: string;
  timezone: string;
  dispensary_id: number | null;
  dispensary_name: string | null;
  dispensary_menu_url: string | null;
  dispensary_website: string | null;
  product_provider: string | null;
  product_confidence: number | null;
  product_crawler_mode: string | null;
  last_product_scan_at: Date | null;
}

// ========================================
// Main Orchestrator Function
// ========================================

/**
 * Run the complete crawl orchestration for a store
 *
 * Behavior:
 * 1. Load the store and its linked dispensary
 * 2. If no dispensary is linked, report error
 * 3. If product_provider is missing or stale (>7 days), run detection
 * 4. After detection:
 *    - If product_provider = 'dutchie' and product_crawler_mode = 'production': Run production crawl
 *    - Otherwise: Run sandbox crawl
 * 5. Update store_crawl_schedule with status/summary
 */
export async function runStoreCrawlOrchestrator(storeId: number): Promise<OrchestratorResult> {
  const startTime = Date.now();
  const runId = uuidv4();

  let result: OrchestratorResult = {
    status: 'pending',
    summary: '',
    runId,
    storeId,
    dispensaryId: null,
    detectionRan: false,
    crawlRan: false,
    durationMs: 0,
  };

  try {
    // Mark schedule as running
    await updateScheduleStatus(storeId, 'running', 'Starting orchestrator...', runId);

    // 1. Load store with dispensary info
    const store = await getStoreWithDispensary(storeId);
    if (!store) {
      throw new Error(`Store ${storeId} not found`);
    }

    result.dispensaryId = store.dispensary_id;

    // 2. Check if dispensary is linked
    if (!store.dispensary_id) {
      result.status = 'error';
      result.summary = 'No dispensary linked - cannot determine provider';
      result.error = 'Store is not linked to a dispensary. Link it in the Dispensaries page.';
      await updateScheduleStatus(storeId, 'error', result.summary, runId, result.error);
      result.durationMs = Date.now() - startTime;
      return result;
    }

    // 3. Check if provider detection is needed
    const needsDetection = await checkNeedsDetection(store);

    if (needsDetection) {
      // Run provider detection
      const websiteUrl = store.dispensary_menu_url || store.dispensary_website;
      if (!websiteUrl) {
        result.status = 'error';
        result.summary = 'No website URL available for detection';
        result.error = 'Dispensary has no menu_url or website configured';
        await updateScheduleStatus(storeId, 'error', result.summary, runId, result.error);
        result.durationMs = Date.now() - startTime;
        return result;
      }

      await updateScheduleStatus(storeId, 'running', 'Running provider detection...', runId);

      const detectionResult = await detectMultiCategoryProviders(websiteUrl);
      result.detectionRan = true;
      result.detectionResult = detectionResult;

      // Save detection results to dispensary
      await updateAllCategoryProviders(store.dispensary_id, detectionResult);

      crawlerLogger.providerDetected({
        dispensary_id: store.dispensary_id,
        dispensary_name: store.dispensary_name || store.name,
        detected_provider: detectionResult.product.provider,
        confidence: detectionResult.product.confidence,
        detection_method: 'orchestrator_run',
        menu_url: websiteUrl,
        category: 'product',
      });

      // Refresh store info after detection
      const updatedStore = await getStoreWithDispensary(storeId);
      if (updatedStore) {
        Object.assign(store, updatedStore);
      }
    }

    // 4. Determine crawl type and run
    const provider = store.product_provider;
    const mode = store.product_crawler_mode;

    if (provider === 'dutchie' && mode === 'production') {
      // Production Dutchie crawl
      await updateScheduleStatus(storeId, 'running', 'Running Dutchie production crawl...', runId);

      try {
        // Run the actual scraper
        await scrapeStore(storeId);

        // Get crawl stats from the latest job
        const stats = await getLatestCrawlStats(storeId);

        result.crawlRan = true;
        result.crawlType = 'production';
        result.productsFound = stats.products_found ?? undefined;
        result.productsNew = stats.products_new ?? undefined;
        result.productsUpdated = stats.products_updated ?? undefined;

        const detectionPart = result.detectionRan ? 'Detection + ' : '';
        result.summary = `${detectionPart}Dutchie products crawl (${stats.products_found || 0} items, ${stats.products_new || 0} new, ${stats.products_updated || 0} updated)`;
        result.status = 'success';

        // Update store's last_scraped_at
        await pool.query('UPDATE stores SET last_scraped_at = NOW() WHERE id = $1', [storeId]);

        crawlerLogger.jobCompleted({
          job_id: 0, // Orchestrator doesn't create traditional jobs
          store_id: storeId,
          store_name: store.name,
          duration_ms: Date.now() - startTime,
          products_found: stats.products_found || 0,
          products_new: stats.products_new || 0,
          products_updated: stats.products_updated || 0,
          provider: 'dutchie',
        });

      } catch (crawlError: any) {
        result.status = 'error';
        result.error = crawlError.message;
        result.summary = `Dutchie crawl failed: ${crawlError.message.slice(0, 100)}`;
        result.crawlRan = true;
        result.crawlType = 'production';

        crawlerLogger.jobFailed({
          job_id: 0,
          store_id: storeId,
          store_name: store.name,
          duration_ms: Date.now() - startTime,
          error_message: crawlError.message,
          provider: 'dutchie',
        });
      }

    } else if (provider && provider !== 'unknown') {
      // Sandbox crawl for non-Dutchie or sandbox mode
      await updateScheduleStatus(storeId, 'running', `Running ${provider} sandbox crawl...`, runId);

      try {
        const sandboxResult = await runSandboxProductsJob(store.dispensary_id);

        result.crawlRan = true;
        result.crawlType = 'sandbox';
        result.productsFound = sandboxResult.data?.productsExtracted || 0;

        const detectionPart = result.detectionRan ? 'Detection + ' : '';
        if (sandboxResult.success) {
          result.summary = `${detectionPart}${provider} sandbox crawl (${result.productsFound} items, quality ${sandboxResult.data?.qualityScore || 0}%)`;
          result.status = 'sandbox_only';
        } else {
          result.summary = `${detectionPart}${provider} sandbox failed: ${sandboxResult.message}`;
          result.status = 'error';
          result.error = sandboxResult.message;
        }

      } catch (sandboxError: any) {
        result.status = 'error';
        result.error = sandboxError.message;
        result.summary = `Sandbox crawl failed: ${sandboxError.message.slice(0, 100)}`;
        result.crawlRan = true;
        result.crawlType = 'sandbox';
      }

    } else {
      // No provider detected - detection only
      if (result.detectionRan) {
        result.summary = `Detection complete: provider=${store.product_provider || 'unknown'}, confidence=${store.product_confidence || 0}%`;
        result.status = 'detection_only';
      } else {
        result.summary = 'No provider detected and no crawl possible';
        result.status = 'error';
        result.error = 'Could not determine menu provider';
      }
    }

  } catch (error: any) {
    result.status = 'error';
    result.error = error.message;
    result.summary = `Orchestrator error: ${error.message.slice(0, 100)}`;

    crawlerLogger.queueFailure({
      queue_type: 'orchestrator',
      error_message: error.message,
    });
  }

  result.durationMs = Date.now() - startTime;

  // Update final schedule status
  await updateScheduleStatus(storeId, result.status, result.summary, runId, result.error);

  // Create a crawl_job record for tracking
  await createOrchestratorJobRecord(storeId, result);

  return result;
}
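// --- Illustrative usage (not part of this commit) ---
// A minimal sketch of a manual one-store run; the store id is hypothetical.
async function exampleManualRun(): Promise<void> {
  const result = await runStoreCrawlOrchestrator(42);
  // e.g. "[success] Detection + Dutchie products crawl (312 items, 5 new, 12 updated)"
  console.log(`[${result.status}] ${result.summary} (${result.durationMs}ms)`);
}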
// ========================================
// Helper Functions
// ========================================

async function getStoreWithDispensary(storeId: number): Promise<StoreWithDispensary | null> {
  const result = await pool.query(
    `SELECT
       s.id, s.name, s.slug, s.timezone, s.dispensary_id,
       d.name as dispensary_name,
       d.menu_url as dispensary_menu_url,
       d.website as dispensary_website,
       d.product_provider,
       d.product_confidence,
       d.product_crawler_mode,
       d.last_product_scan_at
     FROM stores s
     LEFT JOIN dispensaries d ON d.id = s.dispensary_id
     WHERE s.id = $1`,
    [storeId]
  );
  return result.rows[0] || null;
}

async function checkNeedsDetection(store: StoreWithDispensary): Promise<boolean> {
  // No dispensary = can't detect
  if (!store.dispensary_id) return false;

  // No provider = definitely needs detection
  if (!store.product_provider) return true;

  // Unknown provider = needs detection
  if (store.product_provider === 'unknown') return true;

  // Low confidence = needs re-detection
  if (store.product_confidence !== null && store.product_confidence < 50) return true;

  // Stale detection (> 7 days) = needs refresh
  if (store.last_product_scan_at) {
    const daysSince = (Date.now() - new Date(store.last_product_scan_at).getTime()) / (1000 * 60 * 60 * 24);
    if (daysSince > 7) return true;
  }

  return false;
}

async function updateScheduleStatus(
  storeId: number,
  status: OrchestratorStatus,
  summary: string,
  runId: string,
  error?: string
): Promise<void> {
  await pool.query(
    `INSERT INTO store_crawl_schedule (store_id, last_status, last_summary, last_run_at, last_error)
     VALUES ($1, $2, $3, NOW(), $4)
     ON CONFLICT (store_id) DO UPDATE SET
       last_status = $2,
       last_summary = $3,
       last_run_at = NOW(),
       last_error = $4,
       updated_at = NOW()`,
    [storeId, status, summary, error || null]
  );
}

async function getLatestCrawlStats(storeId: number): Promise<{
  products_found: number | null;
  products_new: number | null;
  products_updated: number | null;
}> {
  // Get count of products for this store
  const result = await pool.query(
    `SELECT
       COUNT(*) as total,
       COUNT(*) FILTER (WHERE created_at > NOW() - INTERVAL '1 hour') as recent_new,
       COUNT(*) FILTER (WHERE updated_at > NOW() - INTERVAL '1 hour' AND created_at < NOW() - INTERVAL '1 hour') as recent_updated
     FROM products
     WHERE store_id = $1`,
    [storeId]
  );

  return {
    products_found: parseInt(result.rows[0]?.total || '0'),
    products_new: parseInt(result.rows[0]?.recent_new || '0'),
    products_updated: parseInt(result.rows[0]?.recent_updated || '0'),
  };
}

async function createOrchestratorJobRecord(storeId: number, result: OrchestratorResult): Promise<void> {
  await pool.query(
    `INSERT INTO crawl_jobs (
       store_id, job_type, trigger_type, status, priority,
       scheduled_at, started_at, completed_at,
       products_found, products_new, products_updated,
       error_message, orchestrator_run_id, detection_result
     ) VALUES (
       $1, 'orchestrator', 'manual', $2, 100,
       NOW(), NOW(), NOW(),
       $3, $4, $5,
       $6, $7, $8
     )`,
    [
      storeId,
      result.status === 'success' ? 'completed' : result.status === 'error' ? 'failed' : 'completed',
      result.productsFound || null,
      result.productsNew || null,
      result.productsUpdated || null,
      result.error || null,
      result.runId,
      result.detectionResult ? JSON.stringify({
        product_provider: result.detectionResult.product.provider,
        product_confidence: result.detectionResult.product.confidence,
        product_mode: result.detectionResult.product.mode,
      }) : null,
    ]
  );
}

// ========================================
// Batch Orchestration
// ========================================

/**
 * Run orchestrator for multiple stores
 */
export async function runBatchOrchestrator(
  storeIds: number[],
  concurrency: number = 3
): Promise<OrchestratorResult[]> {
  const results: OrchestratorResult[] = [];

  // Process in batches
  for (let i = 0; i < storeIds.length; i += concurrency) {
    const batch = storeIds.slice(i, i + concurrency);
    const batchResults = await Promise.all(
      batch.map(storeId => runStoreCrawlOrchestrator(storeId))
    );
    results.push(...batchResults);
  }

  return results;
}

/**
 * Get stores that are due for orchestration
 */
export async function getStoresDueForOrchestration(limit: number = 10): Promise<number[]> {
  const result = await pool.query(
    `SELECT s.id
     FROM stores s
     LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
     WHERE s.active = TRUE
       AND s.scrape_enabled = TRUE
       AND COALESCE(scs.enabled, TRUE) = TRUE
       AND (
         scs.last_run_at IS NULL
         OR scs.last_run_at < NOW() - (COALESCE(scs.interval_hours, 4) || ' hours')::INTERVAL
       )
       AND (scs.last_status IS NULL OR scs.last_status NOT IN ('running', 'pending'))
     ORDER BY COALESCE(scs.priority, 0) DESC, scs.last_run_at ASC NULLS FIRST
     LIMIT $1`,
    [limit]
  );

  return result.rows.map(row => row.id);
}
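// --- Illustrative usage (not part of this commit) ---
// A sketch of a scheduler tick built from the two exports above; the batch
// size and concurrency values are hypothetical.
async function exampleSchedulerTick(): Promise<void> {
  const due = await getStoresDueForOrchestration(10);
  if (due.length === 0) return;
  const results = await runBatchOrchestrator(due, 3);
  const errors = results.filter(r => r.status === 'error').length;
  console.log(`Orchestrated ${results.length} stores (${errors} errors)`);
}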
@@ -16,6 +16,7 @@ import { Settings } from './pages/Settings';
import { Proxies } from './pages/Proxies';
import { Logs } from './pages/Logs';
import { ScraperMonitor } from './pages/ScraperMonitor';
import { ScraperSchedule } from './pages/ScraperSchedule';
import { ScraperTools } from './pages/ScraperTools';
import { ChangeApproval } from './pages/ChangeApproval';
import { ApiPermissions } from './pages/ApiPermissions';
@@ -44,6 +45,7 @@ export default function App() {
        <Route path="/logs" element={<PrivateRoute><Logs /></PrivateRoute>} />
        <Route path="/scraper-tools" element={<PrivateRoute><ScraperTools /></PrivateRoute>} />
        <Route path="/scraper-monitor" element={<PrivateRoute><ScraperMonitor /></PrivateRoute>} />
        <Route path="/scraper-schedule" element={<PrivateRoute><ScraperSchedule /></PrivateRoute>} />
        <Route path="/api-permissions" element={<PrivateRoute><ApiPermissions /></PrivateRoute>} />
        <Route path="*" element={<Navigate to="/" replace />} />
      </Routes>
@@ -11,6 +11,7 @@ import {
  TrendingUp,
  Wrench,
  Activity,
  Clock,
  Shield,
  FileText,
  Settings,
@@ -147,6 +148,12 @@ export function Layout({ children }: LayoutProps) {
          label="Tools"
          isActive={isActive('/scraper-tools')}
        />
        <NavLink
          to="/scraper-schedule"
          icon={<Clock className="w-4 h-4" />}
          label="Schedule"
          isActive={isActive('/scraper-schedule')}
        />
        <NavLink
          to="/scraper-monitor"
          icon={<Activity className="w-4 h-4" />}
@@ -423,6 +423,67 @@ class ApiClient {
      method: 'DELETE',
    });
  }

  // Crawler Schedule
  async getGlobalSchedule() {
    return this.request<{ schedules: any[] }>('/api/schedule/global');
  }

  async updateGlobalSchedule(type: string, data: { enabled?: boolean; interval_hours?: number; run_time?: string }) {
    return this.request<{ schedule: any; message: string }>(`/api/schedule/global/${type}`, {
      method: 'PUT',
      body: JSON.stringify(data),
    });
  }

  async getStoreSchedules() {
    return this.request<{ stores: any[] }>('/api/schedule/stores');
  }

  async getStoreSchedule(storeId: number) {
    return this.request<{ schedule: any }>(`/api/schedule/stores/${storeId}`);
  }

  async updateStoreSchedule(storeId: number, data: any) {
    return this.request<{ schedule: any }>(`/api/schedule/stores/${storeId}`, {
      method: 'PUT',
      body: JSON.stringify(data),
    });
  }

  async getCrawlJobs(limit?: number) {
    const params = limit ? `?limit=${limit}` : '';
    return this.request<{ jobs: any[] }>(`/api/schedule/jobs${params}`);
  }

  async getStoreCrawlJobs(storeId: number, limit?: number) {
    const params = limit ? `?limit=${limit}` : '';
    return this.request<{ jobs: any[] }>(`/api/schedule/jobs/store/${storeId}${params}`);
  }

  async cancelCrawlJob(jobId: number) {
    return this.request<{ success: boolean; message: string }>(`/api/schedule/jobs/${jobId}/cancel`, {
      method: 'POST',
    });
  }

  async triggerStoreCrawl(storeId: number) {
    return this.request<{ job: any; message: string }>(`/api/schedule/trigger/store/${storeId}`, {
      method: 'POST',
    });
  }

  async triggerAllCrawls() {
    return this.request<{ jobs_created: number; message: string }>('/api/schedule/trigger/all', {
      method: 'POST',
    });
  }

  async restartScheduler() {
    return this.request<{ message: string }>('/api/schedule/restart', {
      method: 'POST',
    });
  }
}

export const api = new ApiClient(API_URL);
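// --- Illustrative usage (not part of this commit) ---
// A sketch of driving the new schedule endpoints through the `api` singleton
// exported above; the limit and interval values are hypothetical.
async function exampleScheduleCalls(): Promise<void> {
  await api.updateGlobalSchedule('global_interval', { enabled: true, interval_hours: 4 });
  const { jobs } = await api.getCrawlJobs(50);
  const active = jobs.filter(j => j.status === 'pending' || j.status === 'running');
  console.log(`${active.length} of ${jobs.length} recent jobs still queued or running`);
}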
@@ -1,5 +1,5 @@
import { useEffect, useState } from 'react';
import { useParams, useNavigate, Link } from 'react-router-dom';
import { Layout } from '../components/Layout';
import { api } from '../lib/api';
import {
@@ -15,7 +15,8 @@ import {
  DollarSign,
  Calendar,
  RefreshCw,
  ChevronDown,
  Clock
} from 'lucide-react';

export function DispensaryDetail() {
@@ -33,6 +34,19 @@ export function DispensaryDetail() {
  const [currentPage, setCurrentPage] = useState(1);
  const [itemsPerPage] = useState(25);

  const formatDate = (dateStr: string) => {
    if (!dateStr) return 'Never';
    const date = new Date(dateStr);
    const now = new Date();
    const diffMs = now.getTime() - date.getTime();
    const diffDays = Math.floor(diffMs / (1000 * 60 * 60 * 24));

    if (diffDays === 0) return 'Today';
    if (diffDays === 1) return 'Yesterday';
    if (diffDays < 7) return `${diffDays} days ago`;
    return date.toLocaleDateString();
  };

  useEffect(() => {
    loadDispensary();
  }, [slug]);
@@ -274,6 +288,13 @@ export function DispensaryDetail() {
            <span>AZDHS Profile</span>
          </a>
        )}
        <Link
          to="/schedule"
          className="flex items-center gap-2 text-sm text-blue-600 hover:text-blue-800"
        >
          <Clock className="w-4 h-4" />
          <span>View Schedule</span>
        </Link>
      </div>
    </div>
@@ -424,7 +445,8 @@ export function DispensaryDetail() {
            <th className="text-center">CBD %</th>
            <th className="text-center">Strain Type</th>
            <th className="text-center">In Stock</th>
            <th>Last Updated</th>
            <th>Actions</th>
          </tr>
        </thead>
        <tbody>
@@ -490,17 +512,28 @@ export function DispensaryDetail() {
                <span className="badge badge-error badge-sm">No</span>
              ) : '-'}
            </td>
            <td className="whitespace-nowrap text-xs text-gray-500">
              {product.updated_at ? formatDate(product.updated_at) : '-'}
            </td>
            <td>
              <div className="flex gap-1">
                {product.dutchie_url && (
                  <a
                    href={product.dutchie_url}
                    target="_blank"
                    rel="noopener noreferrer"
                    className="btn btn-xs btn-outline"
                  >
                    Dutchie
                  </a>
                )}
                <button
                  onClick={() => navigate(`/products/${product.id}`)}
                  className="btn btn-xs btn-primary"
                >
                  Details
                </button>
              </div>
            </td>
          </tr>
        ))}
723
frontend/src/pages/ScraperSchedule.tsx
Normal file
@@ -0,0 +1,723 @@
|
|||||||
|
import { useEffect, useState } from 'react';
|
||||||
|
import { Link } from 'react-router-dom';
|
||||||
|
import { Layout } from '../components/Layout';
|
||||||
|
import { api } from '../lib/api';
|
||||||
|
|
||||||
|
interface GlobalSchedule {
|
||||||
|
id: number;
|
||||||
|
schedule_type: string;
|
||||||
|
enabled: boolean;
|
||||||
|
interval_hours?: number;
|
||||||
|
run_time?: string;
|
||||||
|
description?: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
interface StoreSchedule {
|
||||||
|
store_id: number;
|
||||||
|
store_name: string;
|
||||||
|
store_slug: string;
|
||||||
|
timezone: string;
|
||||||
|
active: boolean;
|
||||||
|
scrape_enabled: boolean;
|
||||||
|
last_scraped_at: string | null;
|
||||||
|
schedule_enabled: boolean;
|
||||||
|
interval_hours: number;
|
||||||
|
daily_special_enabled: boolean;
|
||||||
|
daily_special_time: string;
|
||||||
|
priority: number;
|
||||||
|
next_scheduled_run: string;
|
||||||
|
latest_job_id: number | null;
|
||||||
|
latest_job_status: string | null;
|
||||||
|
latest_job_type: string | null;
|
||||||
|
latest_job_trigger: string | null;
|
||||||
|
latest_job_started: string | null;
|
||||||
|
latest_job_completed: string | null;
|
||||||
|
latest_products_found: number | null;
|
||||||
|
latest_products_new: number | null;
|
||||||
|
latest_products_updated: number | null;
|
||||||
|
latest_job_error: string | null;
|
||||||
|
// Dispensary info (from master AZDHS directory)
|
||||||
|
dispensary_id: number | null;
|
||||||
|
dispensary_name: string | null;
|
||||||
|
dispensary_company: string | null;
|
||||||
|
dispensary_city: string | null;
|
||||||
|
// Provider intelligence (from dispensary)
|
||||||
|
product_provider: string | null;
|
||||||
|
product_confidence: number | null;
|
||||||
|
product_crawler_mode: string | null;
|
||||||
|
// Orchestrator status
|
||||||
|
last_status: string | null;
|
||||||
|
last_summary: string | null;
|
||||||
|
schedule_last_run: string | null;
|
||||||
|
last_error: string | null;
|
||||||
|
}
|
||||||
|
|
||||||
|
interface CrawlJob {
|
||||||
|
id: number;
|
||||||
|
store_id: number;
|
||||||
|
store_name: string;
|
||||||
|
job_type: string;
|
||||||
|
trigger_type: string;
|
||||||
|
status: string;
|
||||||
|
priority: number;
|
||||||
|
scheduled_at: string;
|
||||||
|
started_at: string | null;
|
||||||
|
completed_at: string | null;
|
||||||
|
products_found: number | null;
|
||||||
|
products_new: number | null;
|
||||||
|
products_updated: number | null;
|
||||||
|
error_message: string | null;
|
||||||
|
}
|
||||||
|
|
||||||
|
export function ScraperSchedule() {
|
||||||
|
const [globalSchedules, setGlobalSchedules] = useState<GlobalSchedule[]>([]);
|
||||||
|
const [storeSchedules, setStoreSchedules] = useState<StoreSchedule[]>([]);
|
||||||
|
const [jobs, setJobs] = useState<CrawlJob[]>([]);
|
||||||
|
const [loading, setLoading] = useState(true);
|
||||||
|
const [autoRefresh, setAutoRefresh] = useState(true);
|
||||||
|
const [activeTab, setActiveTab] = useState<'stores' | 'jobs' | 'global'>('stores');
|
||||||
|
const [triggeringStore, setTriggeringStore] = useState<number | null>(null);
|
||||||
|
|
||||||
|
useEffect(() => {
|
||||||
|
loadData();
|
||||||
|
|
||||||
|
if (autoRefresh) {
|
||||||
|
const interval = setInterval(loadData, 5000);
|
||||||
|
return () => clearInterval(interval);
|
||||||
|
}
|
||||||
|
}, [autoRefresh]);
|
||||||
|
|
||||||
|
const loadData = async () => {
|
||||||
|
try {
|
||||||
|
const [globalData, storesData, jobsData] = await Promise.all([
|
||||||
|
api.getGlobalSchedule(),
|
||||||
|
api.getStoreSchedules(),
|
||||||
|
api.getCrawlJobs(100)
|
||||||
|
]);
|
||||||
|
|
||||||
|
setGlobalSchedules(globalData.schedules || []);
|
||||||
|
setStoreSchedules(storesData.stores || []);
|
||||||
|
setJobs(jobsData.jobs || []);
|
||||||
|
} catch (error) {
|
||||||
|
console.error('Failed to load schedule data:', error);
|
||||||
|
} finally {
|
||||||
|
setLoading(false);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
const handleTriggerCrawl = async (storeId: number) => {
|
||||||
|
setTriggeringStore(storeId);
|
||||||
|
try {
|
||||||
|
await api.triggerStoreCrawl(storeId);
|
||||||
|
await loadData();
|
||||||
|
} catch (error) {
|
||||||
|
console.error('Failed to trigger crawl:', error);
|
||||||
|
} finally {
|
||||||
|
setTriggeringStore(null);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
const handleTriggerAll = async () => {
|
||||||
|
if (!confirm('This will create crawl jobs for ALL active stores. Continue?')) return;
|
||||||
|
try {
|
||||||
|
const result = await api.triggerAllCrawls();
|
||||||
|
alert(`Created ${result.jobs_created} crawl jobs`);
|
||||||
|
await loadData();
|
||||||
|
} catch (error) {
|
||||||
|
console.error('Failed to trigger all crawls:', error);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
const handleCancelJob = async (jobId: number) => {
|
||||||
|
try {
|
||||||
|
await api.cancelCrawlJob(jobId);
|
||||||
|
await loadData();
|
||||||
|
} catch (error) {
|
||||||
|
console.error('Failed to cancel job:', error);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
const handleUpdateGlobalSchedule = async (type: string, data: any) => {
|
||||||
|
try {
|
||||||
|
await api.updateGlobalSchedule(type, data);
|
||||||
|
await loadData();
|
||||||
|
} catch (error) {
|
||||||
|
console.error('Failed to update global schedule:', error);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
const formatTimeAgo = (dateString: string | null) => {
|
||||||
|
if (!dateString) return 'Never';
|
||||||
|
const date = new Date(dateString);
|
||||||
|
const now = new Date();
|
||||||
|
const diffMs = now.getTime() - date.getTime();
|
||||||
|
const diffMins = Math.floor(diffMs / 60000);
|
||||||
|
const diffHours = Math.floor(diffMins / 60);
|
||||||
|
const diffDays = Math.floor(diffHours / 24);
|
||||||
|
|
||||||
|
if (diffMins < 1) return 'Just now';
|
||||||
|
if (diffMins < 60) return `${diffMins}m ago`;
|
||||||
|
if (diffHours < 24) return `${diffHours}h ago`;
|
||||||
|
return `${diffDays}d ago`;
|
||||||
|
};
|
||||||
|
|
||||||
|
const formatTimeUntil = (dateString: string) => {
|
||||||
|
const date = new Date(dateString);
|
||||||
|
const now = new Date();
|
||||||
|
const diffMs = date.getTime() - now.getTime();
|
||||||
|
|
||||||
|
if (diffMs < 0) return 'Overdue';
|
||||||
|
|
||||||
|
const diffMins = Math.floor(diffMs / 60000);
|
||||||
|
const diffHours = Math.floor(diffMins / 60);
|
||||||
|
|
||||||
|
if (diffMins < 60) return `${diffMins}m`;
|
||||||
|
return `${diffHours}h ${diffMins % 60}m`;
|
||||||
|
};
|
||||||
|
|
||||||
|
const getStatusColor = (status: string) => {
|
||||||
|
switch (status) {
|
||||||
|
case 'completed':
|
||||||
|
case 'success': return { bg: '#d1fae5', color: '#065f46' };
|
||||||
|
case 'running': return { bg: '#dbeafe', color: '#1e40af' };
|
||||||
|
case 'failed':
|
||||||
|
case 'error': return { bg: '#fee2e2', color: '#991b1b' };
|
||||||
|
case 'cancelled': return { bg: '#f3f4f6', color: '#374151' };
|
||||||
|
case 'pending': return { bg: '#fef3c7', color: '#92400e' };
|
||||||
|
case 'sandbox_only': return { bg: '#e0e7ff', color: '#3730a3' };
|
||||||
|
case 'detection_only': return { bg: '#fce7f3', color: '#9d174d' };
|
||||||
|
default: return { bg: '#f3f4f6', color: '#374151' };
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
const getProviderBadge = (provider: string | null, mode: string | null) => {
|
||||||
|
if (!provider) return null;
|
||||||
|
const isProduction = mode === 'production';
|
||||||
|
return {
|
||||||
|
label: provider,
|
||||||
|
bg: isProduction ? '#d1fae5' : '#fef3c7',
|
||||||
|
color: isProduction ? '#065f46' : '#92400e',
|
||||||
|
suffix: isProduction ? '' : ' (sandbox)'
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
const globalIntervalSchedule = globalSchedules.find(s => s.schedule_type === 'global_interval');
|
||||||
|
const dailySpecialSchedule = globalSchedules.find(s => s.schedule_type === 'daily_special');
|
||||||
|
|
||||||
|
return (
|
||||||
|
<Layout>
|
||||||
|
<div>
|
||||||
|
<div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: '30px' }}>
|
||||||
|
<h1 style={{ fontSize: '32px', margin: 0 }}>Crawler Schedule</h1>
|
||||||
|
<div style={{ display: 'flex', gap: '15px', alignItems: 'center' }}>
|
||||||
|
<label style={{ display: 'flex', alignItems: 'center', gap: '10px', cursor: 'pointer' }}>
|
||||||
|
<input
|
||||||
|
type="checkbox"
|
||||||
|
checked={autoRefresh}
|
||||||
|
onChange={(e) => setAutoRefresh(e.target.checked)}
|
||||||
|
style={{ width: '18px', height: '18px', cursor: 'pointer' }}
|
||||||
|
/>
|
||||||
|
<span>Auto-refresh (5s)</span>
|
||||||
|
</label>
|
||||||
|
<button
|
||||||
|
onClick={handleTriggerAll}
|
||||||
|
style={{
|
||||||
|
padding: '10px 20px',
|
||||||
|
background: '#2563eb',
|
||||||
|
color: 'white',
|
||||||
|
border: 'none',
|
||||||
|
borderRadius: '6px',
|
||||||
|
cursor: 'pointer',
|
||||||
|
fontWeight: '600'
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
Crawl All Stores
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{/* Tabs */}
|
||||||
|
<div style={{ marginBottom: '30px', display: 'flex', gap: '10px', borderBottom: '2px solid #eee' }}>
|
||||||
|
<button
|
||||||
|
onClick={() => setActiveTab('stores')}
|
||||||
|
style={{
|
||||||
|
padding: '12px 24px',
|
||||||
|
background: activeTab === 'stores' ? 'white' : 'transparent',
|
||||||
|
border: 'none',
|
||||||
|
borderBottom: activeTab === 'stores' ? '3px solid #2563eb' : '3px solid transparent',
|
||||||
|
cursor: 'pointer',
|
||||||
|
fontSize: '16px',
|
||||||
|
fontWeight: activeTab === 'stores' ? '600' : '400',
|
||||||
|
color: activeTab === 'stores' ? '#2563eb' : '#666',
|
||||||
|
marginBottom: '-2px'
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
Store Schedules
|
||||||
|
</button>
|
||||||
|
<button
|
||||||
|
onClick={() => setActiveTab('jobs')}
|
||||||
|
style={{
|
||||||
|
padding: '12px 24px',
|
||||||
|
background: activeTab === 'jobs' ? 'white' : 'transparent',
|
||||||
|
border: 'none',
|
||||||
|
borderBottom: activeTab === 'jobs' ? '3px solid #2563eb' : '3px solid transparent',
|
||||||
|
cursor: 'pointer',
|
||||||
|
fontSize: '16px',
|
||||||
|
fontWeight: activeTab === 'jobs' ? '600' : '400',
|
||||||
|
color: activeTab === 'jobs' ? '#2563eb' : '#666',
|
||||||
|
marginBottom: '-2px'
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
Job Queue ({jobs.filter(j => j.status === 'pending' || j.status === 'running').length})
|
||||||
|
</button>
|
||||||
|
<button
|
||||||
|
onClick={() => setActiveTab('global')}
|
||||||
|
style={{
|
||||||
|
padding: '12px 24px',
|
||||||
|
background: activeTab === 'global' ? 'white' : 'transparent',
|
||||||
|
border: 'none',
|
||||||
|
borderBottom: activeTab === 'global' ? '3px solid #2563eb' : '3px solid transparent',
|
||||||
|
cursor: 'pointer',
|
||||||
|
fontSize: '16px',
|
||||||
|
fontWeight: activeTab === 'global' ? '600' : '400',
|
||||||
|
color: activeTab === 'global' ? '#2563eb' : '#666',
|
||||||
|
marginBottom: '-2px'
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
Global Settings
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{activeTab === 'global' && (
|
||||||
|
<div style={{ display: 'grid', gap: '20px' }}>
|
||||||
|
{/* Global Interval Schedule */}
|
||||||
|
<div style={{
|
||||||
|
background: 'white',
|
||||||
|
padding: '24px',
|
||||||
|
borderRadius: '8px',
|
||||||
|
boxShadow: '0 2px 8px rgba(0,0,0,0.1)'
|
||||||
|
}}>
|
||||||
|
<div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'start', marginBottom: '20px' }}>
|
||||||
|
<div>
|
||||||
|
<h2 style={{ fontSize: '20px', margin: 0, marginBottom: '8px' }}>Interval Crawl Schedule</h2>
|
||||||
|
<p style={{ color: '#666', margin: 0 }}>Crawl all stores periodically</p>
|
||||||
|
</div>
|
||||||
|
<label style={{ display: 'flex', alignItems: 'center', gap: '10px', cursor: 'pointer' }}>
|
||||||
|
<span style={{ color: '#666' }}>Enabled</span>
|
||||||
|
<input
|
||||||
|
type="checkbox"
|
||||||
|
checked={globalIntervalSchedule?.enabled ?? true}
|
||||||
|
onChange={(e) => handleUpdateGlobalSchedule('global_interval', { enabled: e.target.checked })}
|
||||||
|
style={{ width: '20px', height: '20px', cursor: 'pointer' }}
|
||||||
|
/>
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
|
<div style={{ display: 'flex', alignItems: 'center', gap: '15px' }}>
|
||||||
|
<label style={{ display: 'flex', alignItems: 'center', gap: '10px' }}>
|
||||||
|
<span>Crawl every</span>
|
||||||
|
<select
|
||||||
|
value={globalIntervalSchedule?.interval_hours ?? 4}
|
||||||
|
onChange={(e) => handleUpdateGlobalSchedule('global_interval', { interval_hours: parseInt(e.target.value) })}
|
||||||
|
style={{
|
||||||
|
padding: '8px 12px',
|
||||||
|
borderRadius: '6px',
|
||||||
|
border: '1px solid #ddd',
|
||||||
|
fontSize: '16px'
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
<option value={1}>1 hour</option>
|
||||||
|
<option value={2}>2 hours</option>
|
||||||
|
<option value={4}>4 hours</option>
|
||||||
|
<option value={6}>6 hours</option>
|
||||||
|
<option value={8}>8 hours</option>
|
||||||
|
<option value={12}>12 hours</option>
|
||||||
|
<option value={24}>24 hours</option>
|
||||||
|
</select>
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{/* Daily Special Schedule */}
|
||||||
|
<div style={{
|
||||||
|
background: 'white',
|
||||||
|
padding: '24px',
|
||||||
|
borderRadius: '8px',
|
||||||
|
boxShadow: '0 2px 8px rgba(0,0,0,0.1)'
|
||||||
|
}}>
|
||||||
|
<div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'start', marginBottom: '20px' }}>
|
||||||
|
<div>
|
||||||
|
<h2 style={{ fontSize: '20px', margin: 0, marginBottom: '8px' }}>Daily Special Crawl</h2>
|
||||||
|
<p style={{ color: '#666', margin: 0 }}>Crawl stores at local midnight to capture daily specials</p>
|
||||||
|
</div>
|
||||||
|
<label style={{ display: 'flex', alignItems: 'center', gap: '10px', cursor: 'pointer' }}>
|
||||||
|
<span style={{ color: '#666' }}>Enabled</span>
|
||||||
|
<input
|
||||||
|
type="checkbox"
|
||||||
|
checked={dailySpecialSchedule?.enabled ?? true}
|
||||||
|
onChange={(e) => handleUpdateGlobalSchedule('daily_special', { enabled: e.target.checked })}
|
||||||
|
style={{ width: '20px', height: '20px', cursor: 'pointer' }}
|
||||||
|
/>
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
|
<div style={{ display: 'flex', alignItems: 'center', gap: '15px' }}>
|
||||||
|
<label style={{ display: 'flex', alignItems: 'center', gap: '10px' }}>
|
||||||
|
<span>Run at</span>
|
||||||
|
<input
|
||||||
|
type="time"
|
||||||
|
value={dailySpecialSchedule?.run_time?.slice(0, 5) ?? '00:01'}
|
||||||
|
onChange={(e) => handleUpdateGlobalSchedule('daily_special', { run_time: e.target.value })}
|
||||||
|
style={{
|
||||||
|
padding: '8px 12px',
|
||||||
|
borderRadius: '6px',
|
||||||
|
border: '1px solid #ddd',
|
||||||
|
fontSize: '16px'
|
||||||
|
}}
|
||||||
|
/>
|
||||||
|
<span style={{ color: '#666' }}>(store local time)</span>
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
|
        {activeTab === 'stores' && (
          <div style={{
            background: 'white',
            borderRadius: '8px',
            boxShadow: '0 2px 8px rgba(0,0,0,0.1)',
            overflow: 'hidden'
          }}>
            <table style={{ width: '100%', borderCollapse: 'collapse' }}>
              <thead>
                <tr style={{ background: '#f8f8f8', borderBottom: '2px solid #eee' }}>
                  <th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Dispensary / Store</th>
                  <th style={{ padding: '15px', textAlign: 'center', fontWeight: '600' }}>Provider</th>
                  <th style={{ padding: '15px', textAlign: 'center', fontWeight: '600' }}>Schedule</th>
                  <th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Last Run</th>
                  <th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Next Run</th>
                  <th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Last Result</th>
                  <th style={{ padding: '15px', textAlign: 'center', fontWeight: '600' }}>Actions</th>
                </tr>
              </thead>
              <tbody>
                {storeSchedules.map((store) => (
                  <tr key={store.store_id} style={{ borderBottom: '1px solid #eee' }}>
                    <td style={{ padding: '15px' }}>
                      <div style={{ display: 'flex', alignItems: 'center', gap: '8px' }}>
                        {store.dispensary_id ? (
                          <Link
                            to={`/dispensaries/${store.dispensary_id}`}
                            style={{
                              fontWeight: '600',
                              color: '#2563eb',
                              textDecoration: 'none'
                            }}
                          >
                            {store.dispensary_name || store.store_name}
                          </Link>
                        ) : (
                          <span style={{ fontWeight: '600' }}>{store.store_name}</span>
                        )}
                        {!store.dispensary_id && (
                          <span style={{
                            padding: '2px 6px',
                            borderRadius: '4px',
                            fontSize: '10px',
                            fontWeight: '600',
                            background: '#fef3c7',
                            color: '#92400e'
                          }}>
                            Unmapped
                          </span>
                        )}
                      </div>
                      <div style={{ fontSize: '13px', color: '#666' }}>
                        {store.dispensary_city ? `${store.dispensary_city} | ${store.timezone}` : store.timezone}
                      </div>
                    </td>
                    <td style={{ padding: '15px', textAlign: 'center' }}>
                      {store.product_provider ? (
                        <div>
                          <span style={{
                            padding: '4px 10px',
                            borderRadius: '12px',
                            fontSize: '12px',
                            fontWeight: '600',
                            background: store.product_crawler_mode === 'production' ? '#d1fae5' : '#fef3c7',
                            color: store.product_crawler_mode === 'production' ? '#065f46' : '#92400e'
                          }}>
                            {store.product_provider}
                          </span>
                          {store.product_crawler_mode !== 'production' && (
                            <div style={{ fontSize: '10px', color: '#92400e', marginTop: '2px' }}>sandbox</div>
                          )}
                        </div>
                      ) : (
                        <span style={{
                          padding: '4px 10px',
                          borderRadius: '12px',
                          fontSize: '12px',
                          fontWeight: '600',
                          background: '#f3f4f6',
                          color: '#666'
                        }}>
                          Unknown
                        </span>
                      )}
                    </td>
                    <td style={{ padding: '15px', textAlign: 'center' }}>
                      <div style={{ display: 'flex', flexDirection: 'column', alignItems: 'center', gap: '4px' }}>
                        <span style={{
                          padding: '4px 10px',
                          borderRadius: '12px',
                          fontSize: '12px',
                          fontWeight: '600',
                          background: store.schedule_enabled && store.scrape_enabled ? '#d1fae5' : '#fee2e2',
                          color: store.schedule_enabled && store.scrape_enabled ? '#065f46' : '#991b1b'
                        }}>
                          {store.schedule_enabled && store.scrape_enabled ? 'Active' : 'Disabled'}
                        </span>
                        <span style={{ fontSize: '12px', color: '#666' }}>
                          Every {store.interval_hours}h
                        </span>
                      </div>
                    </td>
                    <td style={{ padding: '15px' }}>
                      <div>{formatTimeAgo(store.last_scraped_at)}</div>
                      {store.last_scraped_at && (
                        <div style={{ fontSize: '12px', color: '#999' }}>
                          {new Date(store.last_scraped_at).toLocaleString()}
                        </div>
                      )}
                    </td>
                    <td style={{ padding: '15px' }}>
                      <div style={{ fontWeight: '600', color: '#2563eb' }}>
                        {formatTimeUntil(store.next_scheduled_run)}
                      </div>
                    </td>
                    <td style={{ padding: '15px' }}>
                      {store.last_status || store.latest_job_status ? (
                        <div>
                          <div style={{ display: 'flex', alignItems: 'center', gap: '8px', marginBottom: '4px' }}>
                            <span style={{
                              padding: '4px 10px',
                              borderRadius: '12px',
                              fontSize: '12px',
                              fontWeight: '600',
                              ...getStatusColor(store.last_status || store.latest_job_status || 'pending')
                            }}>
                              {store.last_status || store.latest_job_status}
                            </span>
                            {store.last_error && (
                              <button
                                onClick={() => alert(store.last_error)}
                                style={{
                                  padding: '2px 6px',
                                  background: '#fee2e2',
                                  color: '#991b1b',
                                  border: 'none',
                                  borderRadius: '4px',
                                  cursor: 'pointer',
                                  fontSize: '10px'
                                }}
                              >
                                Error
                              </button>
                            )}
                          </div>
                          {store.last_summary ? (
                            <div style={{ fontSize: '12px', color: '#666', maxWidth: '250px' }}>
                              {store.last_summary}
                            </div>
                          ) : store.latest_products_found !== null ? (
                            <div style={{ fontSize: '12px', color: '#666' }}>
                              {store.latest_products_found} products
                              {store.latest_products_new !== null && ` (+${store.latest_products_new} new)`}
                            </div>
                          ) : null}
                        </div>
                      ) : (
                        <span style={{ color: '#999', fontSize: '13px' }}>No runs yet</span>
                      )}
                    </td>
                    <td style={{ padding: '15px', textAlign: 'center' }}>
                      <button
                        onClick={() => handleTriggerCrawl(store.store_id)}
                        disabled={triggeringStore === store.store_id}
                        style={{
                          padding: '6px 12px',
                          background: triggeringStore === store.store_id ? '#94a3b8' : '#2563eb',
                          color: 'white',
                          border: 'none',
                          borderRadius: '4px',
                          cursor: triggeringStore === store.store_id ? 'wait' : 'pointer',
                          fontSize: '13px'
                        }}
                      >
                        {triggeringStore === store.store_id ? 'Starting...' : 'Run Now'}
                      </button>
                    </td>
                  </tr>
                ))}
              </tbody>
            </table>
          </div>
        )}

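        {/* Jobs tab: status counters over the queue, followed by the crawl job table with cancel and error actions. */}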
        {activeTab === 'jobs' && (
          <>
            {/* Job Stats */}
            <div style={{ marginBottom: '30px' }}>
              <div style={{ display: 'grid', gridTemplateColumns: 'repeat(auto-fit, minmax(150px, 1fr))', gap: '15px' }}>
                <div style={{ background: 'white', padding: '20px', borderRadius: '8px', boxShadow: '0 2px 8px rgba(0,0,0,0.1)' }}>
                  <div style={{ fontSize: '14px', color: '#999', marginBottom: '8px' }}>Pending</div>
                  <div style={{ fontSize: '32px', fontWeight: '600', color: '#f59e0b' }}>
                    {jobs.filter(j => j.status === 'pending').length}
                  </div>
                </div>
                <div style={{ background: 'white', padding: '20px', borderRadius: '8px', boxShadow: '0 2px 8px rgba(0,0,0,0.1)' }}>
                  <div style={{ fontSize: '14px', color: '#999', marginBottom: '8px' }}>Running</div>
                  <div style={{ fontSize: '32px', fontWeight: '600', color: '#3b82f6' }}>
                    {jobs.filter(j => j.status === 'running').length}
                  </div>
                </div>
                <div style={{ background: 'white', padding: '20px', borderRadius: '8px', boxShadow: '0 2px 8px rgba(0,0,0,0.1)' }}>
                  <div style={{ fontSize: '14px', color: '#999', marginBottom: '8px' }}>Completed</div>
                  <div style={{ fontSize: '32px', fontWeight: '600', color: '#10b981' }}>
                    {jobs.filter(j => j.status === 'completed').length}
                  </div>
                </div>
                <div style={{ background: 'white', padding: '20px', borderRadius: '8px', boxShadow: '0 2px 8px rgba(0,0,0,0.1)' }}>
                  <div style={{ fontSize: '14px', color: '#999', marginBottom: '8px' }}>Failed</div>
                  <div style={{ fontSize: '32px', fontWeight: '600', color: '#ef4444' }}>
                    {jobs.filter(j => j.status === 'failed').length}
                  </div>
                </div>
              </div>
            </div>

            {/* Jobs Table */}
            <div style={{
              background: 'white',
              borderRadius: '8px',
              boxShadow: '0 2px 8px rgba(0,0,0,0.1)',
              overflow: 'hidden'
            }}>
              <table style={{ width: '100%', borderCollapse: 'collapse' }}>
                <thead>
                  <tr style={{ background: '#f8f8f8', borderBottom: '2px solid #eee' }}>
                    <th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Store</th>
                    <th style={{ padding: '15px', textAlign: 'center', fontWeight: '600' }}>Type</th>
                    <th style={{ padding: '15px', textAlign: 'center', fontWeight: '600' }}>Trigger</th>
                    <th style={{ padding: '15px', textAlign: 'center', fontWeight: '600' }}>Status</th>
                    <th style={{ padding: '15px', textAlign: 'right', fontWeight: '600' }}>Products</th>
                    <th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Started</th>
                    <th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Completed</th>
                    <th style={{ padding: '15px', textAlign: 'center', fontWeight: '600' }}>Actions</th>
                  </tr>
                </thead>
                <tbody>
                  {jobs.length === 0 ? (
                    <tr>
                      <td colSpan={8} style={{ padding: '40px', textAlign: 'center', color: '#666' }}>
                        No crawl jobs found
                      </td>
                    </tr>
                  ) : (
                    jobs.map((job) => (
                      <tr key={job.id} style={{ borderBottom: '1px solid #eee' }}>
                        <td style={{ padding: '15px' }}>
                          <div style={{ fontWeight: '600' }}>{job.store_name}</div>
                          <div style={{ fontSize: '12px', color: '#999' }}>Job #{job.id}</div>
                        </td>
                        <td style={{ padding: '15px', textAlign: 'center', fontSize: '13px' }}>
                          {job.job_type}
                        </td>
                        <td style={{ padding: '15px', textAlign: 'center' }}>
                          <span style={{
                            padding: '3px 8px',
                            borderRadius: '4px',
                            fontSize: '12px',
                            background: job.trigger_type === 'manual' ? '#e0e7ff' :
                              job.trigger_type === 'daily_special' ? '#fce7f3' : '#f3f4f6',
                            color: job.trigger_type === 'manual' ? '#3730a3' :
                              job.trigger_type === 'daily_special' ? '#9d174d' : '#374151'
                          }}>
                            {job.trigger_type}
                          </span>
                        </td>
                        <td style={{ padding: '15px', textAlign: 'center' }}>
                          <span style={{
                            padding: '4px 10px',
                            borderRadius: '12px',
                            fontSize: '12px',
                            fontWeight: '600',
                            ...getStatusColor(job.status)
                          }}>
                            {job.status}
                          </span>
                        </td>
                        <td style={{ padding: '15px', textAlign: 'right' }}>
                          {job.products_found !== null ? (
                            <div>
                              <div style={{ fontWeight: '600' }}>{job.products_found}</div>
                              {job.products_new !== null && job.products_updated !== null && (
                                <div style={{ fontSize: '12px', color: '#666' }}>
                                  +{job.products_new} / ~{job.products_updated}
                                </div>
                              )}
                            </div>
                          ) : '-'}
                        </td>
                        <td style={{ padding: '15px', fontSize: '13px' }}>
                          {job.started_at ? new Date(job.started_at).toLocaleString() : '-'}
                        </td>
                        <td style={{ padding: '15px', fontSize: '13px' }}>
                          {job.completed_at ? new Date(job.completed_at).toLocaleString() : '-'}
                        </td>
                        <td style={{ padding: '15px', textAlign: 'center' }}>
                          {job.status === 'pending' && (
                            <button
                              onClick={() => handleCancelJob(job.id)}
                              style={{
                                padding: '4px 10px',
                                background: '#fee2e2',
                                color: '#991b1b',
                                border: 'none',
                                borderRadius: '4px',
                                cursor: 'pointer',
                                fontSize: '12px'
                              }}
                            >
                              Cancel
                            </button>
                          )}
                          {job.error_message && (
                            <button
                              onClick={() => alert(job.error_message)}
                              style={{
                                padding: '4px 10px',
                                background: '#fee2e2',
                                color: '#991b1b',
                                border: 'none',
                                borderRadius: '4px',
                                cursor: 'pointer',
                                fontSize: '12px'
                              }}
                            >
                              View Error
                            </button>
                          )}
                        </td>
                      </tr>
                    ))
                  )}
                </tbody>
              </table>
            </div>
          </>
        )}
      </div>
    </Layout>
  );
}
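For reference, the tables above lean on three formatting helpers defined earlier in this file. The sketch below is a hypothetical reconstruction inferred from the call sites (`formatTimeAgo(store.last_scraped_at)`, `formatTimeUntil(store.next_scheduled_run)`, `...getStatusColor(job.status)`), not the commit's actual implementations:

// Hypothetical sketch: relative "time ago" label; call sites pass a nullable ISO timestamp.
function formatTimeAgo(iso: string | null): string {
  if (!iso) return 'Never';
  const mins = Math.floor((Date.now() - new Date(iso).getTime()) / 60_000);
  if (mins < 1) return 'Just now';
  if (mins < 60) return `${mins}m ago`;
  const hours = Math.floor(mins / 60);
  return hours < 24 ? `${hours}h ago` : `${Math.floor(hours / 24)}d ago`;
}

// Hypothetical sketch: countdown label for the next scheduled run.
function formatTimeUntil(iso: string | null): string {
  if (!iso) return 'Not scheduled';
  const mins = Math.ceil((new Date(iso).getTime() - Date.now()) / 60_000);
  if (mins <= 0) return 'Due now';
  return mins < 60 ? `in ${mins}m` : `in ${Math.floor(mins / 60)}h ${mins % 60}m`;
}

// Hypothetical sketch: badge colors spread into the status <span> styles above;
// the palette mirrors the greens/ambers/reds used elsewhere on this page.
function getStatusColor(status: string): { background: string; color: string } {
  switch (status) {
    case 'completed': return { background: '#d1fae5', color: '#065f46' };
    case 'running':   return { background: '#dbeafe', color: '#1e40af' };
    case 'failed':    return { background: '#fee2e2', color: '#991b1b' };
    default:          return { background: '#fef3c7', color: '#92400e' }; // pending etc.
  }
}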
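The row shapes the component consumes can likewise be read off the JSX. A hypothetical typing, with field names taken from the markup above and types/nullability assumed:

// Inferred from the stores table above (types and optionality assumed).
interface StoreScheduleRow {
  store_id: number;
  store_name: string;
  dispensary_id: number | null;
  dispensary_name: string | null;
  dispensary_city: string | null;
  timezone: string;
  product_provider: string | null;
  product_crawler_mode: 'production' | 'sandbox' | null;
  schedule_enabled: boolean;
  scrape_enabled: boolean;
  interval_hours: number;
  last_scraped_at: string | null;    // ISO timestamp
  next_scheduled_run: string | null;
  last_status: string | null;
  latest_job_status: string | null;
  last_error: string | null;
  last_summary: string | null;
  latest_products_found: number | null;
  latest_products_new: number | null;
}

// Inferred from the jobs table above; status values match the filters in the stats cards.
interface CrawlJobRow {
  id: number;
  store_name: string;
  job_type: string;      // e.g. 'full_crawl'
  trigger_type: string;  // 'manual' and 'daily_special' get distinct badge colors above
  status: 'pending' | 'running' | 'completed' | 'failed';
  products_found: number | null;
  products_new: number | null;
  products_updated: number | null;
  started_at: string | null;
  completed_at: string | null;
  error_message: string | null;
}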