diff --git a/backend/migrations/098_dispensary_stage_tracking.sql b/backend/migrations/098_dispensary_stage_tracking.sql index e4a725d5..7b0b302e 100644 --- a/backend/migrations/098_dispensary_stage_tracking.sql +++ b/backend/migrations/098_dispensary_stage_tracking.sql @@ -2,8 +2,12 @@ -- Required for stage checkpoint feature in task handlers ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS consecutive_successes INTEGER DEFAULT 0; +ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS consecutive_failures INTEGER DEFAULT 0; ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS last_successful_crawl_at TIMESTAMPTZ; --- Index for finding failing stores +-- Indexes for finding stores by status CREATE INDEX IF NOT EXISTS idx_dispensaries_consecutive_successes ON dispensaries(consecutive_successes) WHERE consecutive_successes > 0; + +CREATE INDEX IF NOT EXISTS idx_dispensaries_consecutive_failures +ON dispensaries(consecutive_failures) WHERE consecutive_failures > 0; diff --git a/backend/migrations/099_brand_aliases.sql b/backend/migrations/099_brand_aliases.sql new file mode 100644 index 00000000..227e10b9 --- /dev/null +++ b/backend/migrations/099_brand_aliases.sql @@ -0,0 +1,97 @@ +-- Migration: Add brand aliases table and brand normalization +-- Enables case-insensitive brand matching and alias mapping + +-- Create brand_aliases table to map variations to canonical brands +CREATE TABLE IF NOT EXISTS brand_aliases ( + id SERIAL PRIMARY KEY, + brand_id INTEGER NOT NULL REFERENCES brands(id) ON DELETE CASCADE, + alias VARCHAR(255) NOT NULL, -- The raw brand name variation + alias_normalized VARCHAR(255) NOT NULL, -- Lowercase, trimmed for matching + source VARCHAR(50) DEFAULT 'auto', -- 'auto' = auto-detected, 'manual' = user-added + created_at TIMESTAMPTZ DEFAULT NOW(), + + UNIQUE(alias_normalized) +); + +-- Index for fast lookups +CREATE INDEX idx_brand_aliases_normalized ON brand_aliases(alias_normalized); +CREATE INDEX idx_brand_aliases_brand_id ON brand_aliases(brand_id); + +-- Create function to normalize brand names for matching +CREATE OR REPLACE FUNCTION normalize_brand_name(name TEXT) +RETURNS TEXT AS $$ +BEGIN + RETURN LOWER(TRIM( + -- Remove extra spaces + REGEXP_REPLACE( + -- Remove non-alphanumeric except spaces + REGEXP_REPLACE(name, '[^a-zA-Z0-9\s]', '', 'g'), + '\s+', ' ', 'g' + ) + )); +END; +$$ LANGUAGE plpgsql IMMUTABLE; + +-- Create function to find or create brand and return brand_id +CREATE OR REPLACE FUNCTION resolve_brand_id(raw_brand_name TEXT) +RETURNS INTEGER AS $$ +DECLARE + normalized_name TEXT; + found_brand_id INTEGER; +BEGIN + IF raw_brand_name IS NULL OR TRIM(raw_brand_name) = '' THEN + RETURN NULL; + END IF; + + normalized_name := normalize_brand_name(raw_brand_name); + + -- First check aliases + SELECT brand_id INTO found_brand_id + FROM brand_aliases + WHERE alias_normalized = normalized_name; + + IF found_brand_id IS NOT NULL THEN + RETURN found_brand_id; + END IF; + + -- Check brands table directly (case-insensitive) + SELECT id INTO found_brand_id + FROM brands + WHERE normalize_brand_name(name) = normalized_name; + + RETURN found_brand_id; +END; +$$ LANGUAGE plpgsql; + +-- Seed brand_aliases with existing brand names +INSERT INTO brand_aliases (brand_id, alias, alias_normalized, source) +SELECT + id, + name, + normalize_brand_name(name), + 'canonical' +FROM brands +ON CONFLICT (alias_normalized) DO NOTHING; + +-- Auto-generate aliases from store_products brand variations +INSERT INTO brand_aliases (brand_id, alias, alias_normalized, source) +SELECT DISTINCT ON (normalize_brand_name(sp.brand_name_raw)) + b.id, + sp.brand_name_raw, + normalize_brand_name(sp.brand_name_raw), + 'auto' +FROM store_products sp +CROSS JOIN LATERAL ( + SELECT id, name + FROM brands + WHERE normalize_brand_name(brands.name) = normalize_brand_name(sp.brand_name_raw) + LIMIT 1 +) b +WHERE sp.brand_name_raw IS NOT NULL + AND sp.brand_name_raw != '' +ON CONFLICT (alias_normalized) DO NOTHING; + +-- Add comment +COMMENT ON TABLE brand_aliases IS 'Maps brand name variations to canonical brand IDs'; +COMMENT ON FUNCTION normalize_brand_name(TEXT) IS 'Normalizes brand name for case-insensitive matching'; +COMMENT ON FUNCTION resolve_brand_id(TEXT) IS 'Returns brand_id for a raw brand name, using aliases'; diff --git a/backend/migrations/100_backfill_brand_ids.sql b/backend/migrations/100_backfill_brand_ids.sql new file mode 100644 index 00000000..874b639c --- /dev/null +++ b/backend/migrations/100_backfill_brand_ids.sql @@ -0,0 +1,41 @@ +-- Migration: Backfill brand_id in store_products using brand aliases +-- Links products to canonical brands via case-insensitive matching + +-- Update store_products.brand_id using the resolve_brand_id function +UPDATE store_products sp +SET brand_id = resolve_brand_id(sp.brand_name_raw) +WHERE sp.brand_name_raw IS NOT NULL + AND sp.brand_name_raw != '' + AND (sp.brand_id IS NULL OR sp.brand_id != resolve_brand_id(sp.brand_name_raw)); + +-- Create trigger to auto-link brand_id on insert/update +CREATE OR REPLACE FUNCTION auto_link_brand_id() +RETURNS TRIGGER AS $$ +BEGIN + IF NEW.brand_name_raw IS NOT NULL AND NEW.brand_name_raw != '' THEN + NEW.brand_id := COALESCE(resolve_brand_id(NEW.brand_name_raw), NEW.brand_id); + END IF; + RETURN NEW; +END; +$$ LANGUAGE plpgsql; + +-- Drop trigger if exists and recreate +DROP TRIGGER IF EXISTS trg_auto_link_brand_id ON store_products; +CREATE TRIGGER trg_auto_link_brand_id + BEFORE INSERT OR UPDATE OF brand_name_raw ON store_products + FOR EACH ROW + EXECUTE FUNCTION auto_link_brand_id(); + +-- Log the results +DO $$ +DECLARE + linked_count INTEGER; + unlinked_count INTEGER; +BEGIN + SELECT COUNT(*) INTO linked_count FROM store_products WHERE brand_id IS NOT NULL; + SELECT COUNT(*) INTO unlinked_count FROM store_products WHERE brand_id IS NULL AND brand_name_raw IS NOT NULL; + RAISE NOTICE 'Brand linking complete: % products linked, % products unlinked', linked_count, unlinked_count; +END $$; + +COMMENT ON FUNCTION auto_link_brand_id() IS 'Auto-links brand_id when brand_name_raw is set'; +COMMENT ON TRIGGER trg_auto_link_brand_id ON store_products IS 'Auto-links products to canonical brands on insert/update';