Add CLAUDE guidelines for consolidated pipeline

This commit is contained in:
Kelly
2025-12-02 13:28:23 -07:00
parent 9219d8a77a
commit 04b5c3bd09
32 changed files with 4485 additions and 169 deletions

View File

@@ -0,0 +1,13 @@
-- Migration: Add dispensary_id to wp_dutchie_api_permissions
-- This allows API tokens to be associated with a specific dispensary
-- Add dispensary_id column to wp_dutchie_api_permissions
ALTER TABLE wp_dutchie_api_permissions
ADD COLUMN IF NOT EXISTS dispensary_id INTEGER REFERENCES dispensaries(id);
-- Add index for faster lookups
CREATE INDEX IF NOT EXISTS idx_wp_api_permissions_dispensary_id ON wp_dutchie_api_permissions(dispensary_id);
-- Add dispensary_name column to return dispensary info without join
ALTER TABLE wp_dutchie_api_permissions
ADD COLUMN IF NOT EXISTS dispensary_name VARCHAR(255);

View File

@@ -896,6 +896,7 @@ SELECT
subcategory,
COUNT(*) as product_count,
COUNT(DISTINCT dispensary_id) as dispensary_count,
COUNT(DISTINCT brand_name) as brand_count,
AVG(thc) as avg_thc,
MIN(thc) as min_thc,
MAX(thc) as max_thc

View File

@@ -0,0 +1,36 @@
-- Migration 031: Add Normalized Fields to Products
-- For improved product matching and deduplication
-- Add normalized columns to products table
ALTER TABLE products ADD COLUMN IF NOT EXISTS name_normalized VARCHAR(500);
ALTER TABLE products ADD COLUMN IF NOT EXISTS brand_normalized VARCHAR(255);
ALTER TABLE products ADD COLUMN IF NOT EXISTS external_id VARCHAR(255); -- Platform-specific ID (Dutchie, Treez, etc)
ALTER TABLE products ADD COLUMN IF NOT EXISTS source_platform VARCHAR(50); -- 'dutchie', 'treez', 'jane', 'wp'
-- Create indexes for efficient matching
CREATE INDEX IF NOT EXISTS idx_products_external_id ON products(external_id) WHERE external_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_products_name_normalized ON products(store_id, name_normalized);
CREATE INDEX IF NOT EXISTS idx_products_matching ON products(store_id, name_normalized, brand_normalized, category_id);
-- Backfill normalized names for existing products
UPDATE products SET
name_normalized = LOWER(TRIM(REGEXP_REPLACE(name, '[^a-zA-Z0-9 ]', ' ', 'g'))),
brand_normalized = LOWER(TRIM(COALESCE(brand, ''))),
external_id = COALESCE(external_id, dutchie_product_id),
source_platform = COALESCE(source_platform, 'dutchie')
WHERE name_normalized IS NULL;
-- Add constraint to prevent true duplicates going forward
-- Note: We use a partial unique index to allow multiple NULLs
CREATE UNIQUE INDEX IF NOT EXISTS idx_products_no_duplicate_external_id
ON products(store_id, external_id)
WHERE external_id IS NOT NULL;
-- Comments
COMMENT ON COLUMN products.name_normalized IS 'Lowercase, trimmed product name with punctuation removed for matching';
COMMENT ON COLUMN products.brand_normalized IS 'Lowercase, trimmed brand name for matching';
COMMENT ON COLUMN products.external_id IS 'Platform-specific product ID (Dutchie ID, Treez SKU, etc)';
COMMENT ON COLUMN products.source_platform IS 'Source platform: dutchie, treez, jane, wp';
-- Grant permissions
GRANT SELECT, INSERT, UPDATE ON products TO scraper;

View File

@@ -0,0 +1,61 @@
-- Migration 032: Add menu_type column and local image storage columns
-- Run with: psql $DATABASE_URL -f migrations/032_menu_type_and_local_images.sql
-- ============================================
-- 1. Add menu_type column to dispensaries
-- ============================================
-- menu_type: canonical, admin-editable field for menu provider type
-- Separate from menu_provider (auto-detected) to allow manual override
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS menu_type VARCHAR(50);
-- Index for filtering by menu_type
CREATE INDEX IF NOT EXISTS idx_dispensaries_menu_type ON dispensaries(menu_type) WHERE menu_type IS NOT NULL;
-- Backfill menu_type from existing detection data:
-- Priority: menu_provider (if set) > menu_url pattern matching
UPDATE dispensaries
SET menu_type = CASE
-- Use existing menu_provider if set
WHEN menu_provider IS NOT NULL AND menu_provider != '' THEN menu_provider
-- Detect from menu_url patterns
WHEN menu_url ILIKE '%dutchie%' THEN 'dutchie'
WHEN menu_url ILIKE '%treez%' THEN 'treez'
WHEN menu_url ILIKE '%jane%' OR menu_url ILIKE '%iheartjane%' THEN 'jane'
WHEN menu_url ILIKE '%weedmaps%' THEN 'weedmaps'
WHEN menu_url ILIKE '%leafly%' THEN 'leafly'
WHEN menu_url ILIKE '%meadow%' OR menu_url ILIKE '%getmeadow%' THEN 'meadow'
WHEN menu_url ILIKE '%blaze%' THEN 'blaze'
WHEN menu_url ILIKE '%flowhub%' THEN 'flowhub'
WHEN menu_url ILIKE '%dispenseapp%' THEN 'dispense'
WHEN menu_url ILIKE '%cova%' THEN 'cova'
ELSE NULL
END
WHERE menu_type IS NULL;
-- ============================================
-- 2. Add local image columns to dutchie_products
-- ============================================
-- local_image_url: the URL path for serving the downloaded image (e.g., /images/products/123/456.webp)
ALTER TABLE dutchie_products ADD COLUMN IF NOT EXISTS local_image_url TEXT;
-- local_image_thumb_url: thumbnail version
ALTER TABLE dutchie_products ADD COLUMN IF NOT EXISTS local_image_thumb_url TEXT;
-- local_image_medium_url: medium version
ALTER TABLE dutchie_products ADD COLUMN IF NOT EXISTS local_image_medium_url TEXT;
-- original_image_url: preserved third-party URL for fallback/reference
-- (primary_image_url will be updated to local path when downloaded)
ALTER TABLE dutchie_products ADD COLUMN IF NOT EXISTS original_image_url TEXT;
-- Backfill original_image_url from primary_image_url (preserve third-party URLs)
UPDATE dutchie_products
SET original_image_url = primary_image_url
WHERE original_image_url IS NULL AND primary_image_url IS NOT NULL;
-- ============================================
-- Done
-- ============================================
SELECT 'Migration 032 completed: menu_type column added, local image columns added' as status;

View File

@@ -0,0 +1,63 @@
-- Migration 033: Add platform_dispensary_id to dispensary_crawl_status view
-- This exposes platform ID status for scheduling transparency
-- Works with both local (interval_minutes) and K8s (cron_expression) schema variants
-- Recreate the dispensary_crawl_status view with platform_dispensary_id
DROP VIEW IF EXISTS public.dispensary_crawl_status CASCADE;
CREATE OR REPLACE VIEW public.dispensary_crawl_status AS
SELECT
d.id AS dispensary_id,
COALESCE(d.dba_name, d.name) AS dispensary_name,
d.slug AS dispensary_slug,
d.city,
d.state,
d.menu_url,
d.menu_type,
d.platform_dispensary_id,
d.scrape_enabled,
d.last_crawl_at,
d.crawl_status,
d.product_crawler_mode,
d.product_provider,
cs.interval_minutes,
cs.is_active,
cs.priority,
cs.last_run_at,
cs.next_run_at,
cs.last_status AS schedule_last_status,
cs.last_error AS schedule_last_error,
cs.consecutive_failures,
j.id AS latest_job_id,
j.status AS latest_job_status,
j.job_type AS latest_job_type,
j.started_at AS latest_job_started,
j.completed_at AS latest_job_completed,
j.products_found AS latest_products_found,
j.products_new AS latest_products_created,
j.products_updated AS latest_products_updated,
j.error_message AS latest_job_error,
-- Computed scheduling eligibility
CASE
WHEN d.menu_type = 'dutchie' AND d.platform_dispensary_id IS NOT NULL THEN true
ELSE false
END AS can_crawl,
CASE
WHEN d.menu_type IS NULL OR d.menu_type = 'unknown' THEN 'menu_type not detected'
WHEN d.menu_type != 'dutchie' THEN 'not dutchie platform'
WHEN d.platform_dispensary_id IS NULL THEN 'platform ID not resolved'
WHEN d.scrape_enabled = false THEN 'scraping disabled'
ELSE 'ready'
END AS schedule_status_reason
FROM public.dispensaries d
LEFT JOIN public.dispensary_crawl_schedule cs ON cs.dispensary_id = d.id
LEFT JOIN LATERAL (
SELECT *
FROM public.dispensary_crawl_jobs dj
WHERE dj.dispensary_id = d.id
ORDER BY dj.created_at DESC
LIMIT 1
) j ON true
WHERE d.state = 'AZ';
-- Done!
SELECT 'Migration 033 completed successfully' as status;