Merge pull request 'feature/workers-dashboard' (#1) from feature/workers-dashboard into master

Reviewed-on: https://code.cannabrands.app/Creationshop/dispensary-scraper/pulls/1
This commit is contained in:
kelly
2025-12-08 02:54:26 +00:00
289 changed files with 87754 additions and 4063 deletions

53
.gitignore vendored Normal file
View File

@@ -0,0 +1,53 @@
# Dependencies
node_modules/
# Build outputs (compiled JS, not source)
backend/dist/
cannaiq/dist/
findadispo/build/
findagram/build/
frontend/dist/
# Environment files (local secrets)
.env
.env.local
.env.*.local
backend/.env
backend/.env.local
# Database dumps and backups (large files)
*.dump
*.sql.backup
backup_*.sql
# IDE
.idea/
.vscode/
*.swp
*.swo
# OS
.DS_Store
Thumbs.db
# Logs
*.log
npm-debug.log*
# Local storage (runtime data, not source)
backend/storage/
# Product images (crawled data, not source)
backend/public/images/products/
backend/public/images/brands/
# Vite cache
**/node_modules/.vite/
# Test coverage
coverage/
# Temporary files
*.tmp
*.temp
llm-scraper/

140
.woodpecker/.ci.yml Normal file
View File

@@ -0,0 +1,140 @@
when:
- event: [push, pull_request]
steps:
# Build checks
typecheck-backend:
image: node:20
commands:
- cd backend
- npm ci
# NOTE(review): '|| true' masks type errors, so this step can never fail — confirm this is intentional
- npx tsc --noEmit || true
build-cannaiq:
image: node:20
commands:
- cd cannaiq
- npm ci
- npx tsc --noEmit
- npm run build
build-findadispo:
image: node:20
commands:
- cd findadispo/frontend
- npm ci
- npm run build
build-findagram:
image: node:20
commands:
- cd findagram/frontend
- npm ci
- npm run build
# Docker builds - only on master
docker-backend:
image: woodpeckerci/plugin-docker-buildx
settings:
registry: code.cannabrands.app
repo: code.cannabrands.app/creationshop/dispensary-scraper
tags:
- latest
- ${CI_COMMIT_SHA:0:8}
dockerfile: backend/Dockerfile
context: backend
username:
from_secret: registry_username
password:
from_secret: registry_password
platforms: linux/amd64
provenance: false
when:
branch: master
event: push
docker-cannaiq:
image: woodpeckerci/plugin-docker-buildx
settings:
registry: code.cannabrands.app
repo: code.cannabrands.app/creationshop/cannaiq-frontend
tags:
- latest
- ${CI_COMMIT_SHA:0:8}
dockerfile: cannaiq/Dockerfile
context: cannaiq
username:
from_secret: registry_username
password:
from_secret: registry_password
platforms: linux/amd64
provenance: false
when:
branch: master
event: push
docker-findadispo:
image: woodpeckerci/plugin-docker-buildx
settings:
registry: code.cannabrands.app
repo: code.cannabrands.app/creationshop/findadispo-frontend
tags:
- latest
- ${CI_COMMIT_SHA:0:8}
dockerfile: findadispo/frontend/Dockerfile
context: findadispo/frontend
username:
from_secret: registry_username
password:
from_secret: registry_password
platforms: linux/amd64
provenance: false
when:
branch: master
event: push
docker-findagram:
image: woodpeckerci/plugin-docker-buildx
settings:
registry: code.cannabrands.app
repo: code.cannabrands.app/creationshop/findagram-frontend
tags:
- latest
- ${CI_COMMIT_SHA:0:8}
dockerfile: findagram/frontend/Dockerfile
context: findagram/frontend
username:
from_secret: registry_username
password:
from_secret: registry_password
platforms: linux/amd64
provenance: false
when:
branch: master
event: push
# Deploy to Kubernetes
deploy:
image: bitnami/kubectl:latest
environment:
KUBECONFIG_CONTENT:
from_secret: kubeconfig_data
commands:
- echo "Deploying to Kubernetes..."
- mkdir -p ~/.kube
- echo "$KUBECONFIG_CONTENT" | tr -d '[:space:]' | base64 -d > ~/.kube/config
- chmod 600 ~/.kube/config
- kubectl set image deployment/scraper scraper=code.cannabrands.app/creationshop/dispensary-scraper:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
- kubectl set image deployment/scraper-worker scraper-worker=code.cannabrands.app/creationshop/dispensary-scraper:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
- kubectl set image deployment/cannaiq-frontend cannaiq-frontend=code.cannabrands.app/creationshop/cannaiq-frontend:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
- kubectl set image deployment/findadispo-frontend findadispo-frontend=code.cannabrands.app/creationshop/findadispo-frontend:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
- kubectl set image deployment/findagram-frontend findagram-frontend=code.cannabrands.app/creationshop/findagram-frontend:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
- kubectl rollout status deployment/scraper -n dispensary-scraper --timeout=300s
- kubectl rollout status deployment/scraper-worker -n dispensary-scraper --timeout=300s
- kubectl rollout status deployment/cannaiq-frontend -n dispensary-scraper --timeout=120s
- kubectl rollout status deployment/findadispo-frontend -n dispensary-scraper --timeout=120s
- kubectl rollout status deployment/findagram-frontend -n dispensary-scraper --timeout=120s
- echo "All deployments complete!"
when:
branch: master
event: push

1095
CLAUDE.md

File diff suppressed because it is too large Load Diff

4
README.md Normal file
View File

@@ -0,0 +1,4 @@
# CI/CD enabled
test trigger
# CI trigger

View File

@@ -1,17 +1,30 @@
PORT=3010
NODE_ENV=development
# Database
DATABASE_URL=postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus
# =============================================================================
# CannaiQ Database (dutchie_menus) - PRIMARY DATABASE
# =============================================================================
# This is where all schema migrations run and where canonical tables live.
# All CANNAIQ_DB_* variables are REQUIRED - connection will fail if missing.
CANNAIQ_DB_HOST=localhost
CANNAIQ_DB_PORT=54320
CANNAIQ_DB_NAME=dutchie_menus
CANNAIQ_DB_USER=dutchie
CANNAIQ_DB_PASS=dutchie_local_pass
# MinIO (connecting to Docker from host)
MINIO_ENDPOINT=localhost
MINIO_PORT=9020
MINIO_USE_SSL=false
MINIO_ACCESS_KEY=minioadmin
MINIO_SECRET_KEY=minioadmin
MINIO_BUCKET=dutchie
MINIO_PUBLIC_ENDPOINT=http://localhost:9020
# =============================================================================
# Legacy Database (dutchie_legacy) - READ-ONLY SOURCE
# =============================================================================
# Used ONLY by ETL scripts to read historical data.
# NEVER run migrations against this database.
LEGACY_DB_HOST=localhost
LEGACY_DB_PORT=54320
LEGACY_DB_NAME=dutchie_legacy
LEGACY_DB_USER=dutchie
LEGACY_DB_PASS=dutchie_local_pass
# Local image storage (NOTE(review): comment says "no MinIO per CLAUDE.md", but MinIO settings are configured above in this same file — confirm which storage backend is authoritative)
LOCAL_IMAGES_PATH=./public/images
# JWT
JWT_SECRET=your-secret-key-change-in-production

50
backend/.env.example Normal file
View File

@@ -0,0 +1,50 @@
# CannaiQ Backend Environment Configuration
# Copy this file to .env and fill in the values
# Server
PORT=3010
NODE_ENV=development
# =============================================================================
# CANNAIQ DATABASE (dutchie_menus) - PRIMARY DATABASE
# =============================================================================
# This is where ALL schema migrations run and where canonical tables live.
# All CANNAIQ_DB_* variables are REQUIRED - no defaults.
# The application will fail to start if any are missing.
CANNAIQ_DB_HOST=localhost
CANNAIQ_DB_PORT=54320
# MUST be dutchie_menus - NOT dutchie_legacy
CANNAIQ_DB_NAME=dutchie_menus
CANNAIQ_DB_USER=dutchie
CANNAIQ_DB_PASS=
# Alternative: Use a full connection URL instead of individual vars
# If set, this takes priority over individual vars above
# CANNAIQ_DB_URL=postgresql://user:pass@host:port/dutchie_menus
# =============================================================================
# LEGACY DATABASE (dutchie_legacy) - READ-ONLY FOR ETL
# =============================================================================
# Used ONLY by ETL scripts to read historical data.
# NEVER run migrations against this database.
# These are only needed when running 042_legacy_import.ts
LEGACY_DB_HOST=localhost
LEGACY_DB_PORT=54320
# READ-ONLY - never migrated
LEGACY_DB_NAME=dutchie_legacy
LEGACY_DB_USER=dutchie
LEGACY_DB_PASS=
# Alternative: Use a full connection URL instead of individual vars
# LEGACY_DB_URL=postgresql://user:pass@host:port/dutchie_legacy
# =============================================================================
# LOCAL STORAGE
# =============================================================================
# Local image storage path (no MinIO)
LOCAL_IMAGES_PATH=./public/images
# =============================================================================
# AUTHENTICATION
# =============================================================================
JWT_SECRET=your-secret-key-change-in-production

View File

@@ -0,0 +1,30 @@
# CannaiQ Local Development Environment
# Run: docker-compose -f docker-compose.local.yml up -d
#
# Services:
#   - cannaiq-postgres: PostgreSQL at localhost:54320
#
# NOTE(review): this compose file provisions user/database 'cannaiq', but
# backend/.env.example expects user 'dutchie' and database 'dutchie_menus' on
# the same port (54320) — confirm which credentials are canonical.
#
# Note: Backend and frontend run outside Docker for faster dev iteration
version: '3.8'
services:
cannaiq-postgres:
image: postgres:15-alpine
container_name: cannaiq-postgres
environment:
POSTGRES_USER: cannaiq
POSTGRES_PASSWORD: cannaiq_local_pass
POSTGRES_DB: cannaiq
ports:
- "54320:5432"
volumes:
- cannaiq-postgres-data:/var/lib/postgresql/data
healthcheck:
test: ["CMD-SHELL", "pg_isready -U cannaiq"]
interval: 10s
timeout: 5s
retries: 5
volumes:
cannaiq-postgres-data:

View File

@@ -0,0 +1,712 @@
# CannaiQ Analytics Runbook
Phase 3: Analytics Engine - Complete Implementation Guide
## Overview
The CannaiQ Analytics Engine provides real-time insights into cannabis market data across price trends, brand penetration, category performance, store changes, and competitive positioning.
## Architecture
```
┌─────────────────────────────────────────────────────────────────┐
│ API Layer │
│ /api/az/analytics/* │
└─────────────────────────────────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────────┐
│ Analytics Services │
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────────────┐ │
│ │PriceTrend │ │Penetration │ │CategoryAnalytics │ │
│ │Service │ │Service │ │Service │ │
│ └──────────────┘ └──────────────┘ └──────────────────────┘ │
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────────────┐ │
│ │StoreChange │ │BrandOpportunity│ │AnalyticsCache │ │
│ │Service │ │Service │ │(15-min TTL) │ │
│ └──────────────┘ └──────────────┘ └──────────────────────┘ │
└─────────────────────────────────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────────┐
│ Canonical Tables │
│ store_products │ store_product_snapshots │ brands │ categories │
│ dispensaries │ brand_snapshots │ category_snapshots │
└─────────────────────────────────────────────────────────────────┘
```
## Services
### 1. PriceTrendService
Provides time-series price analytics.
**Key Methods:**
| Method | Description |
|--------|-------------|
| `getProductPriceTrend(productId, storeId?, days)` | Price history for a product |
| `getBrandPriceTrend(brandName, filters)` | Average prices for a brand |
| `getCategoryPriceTrend(category, filters)` | Category-level price trends |
| `getPriceSummary(filters)` | 7d/30d/90d price averages |
| `detectPriceCompression(category, state?)` | Price war detection |
| `getGlobalPriceStats()` | Market-wide pricing overview |
**Filters:**
```typescript
interface PriceFilters {
storeId?: number;
brandName?: string;
category?: string;
state?: string;
days?: number; // default: 30
}
```
**Price Compression Detection:**
- Calculates standard deviation of prices within category
- Returns compression score 0-100 (higher = more compressed)
- Identifies brands converging toward mean price
---
### 2. PenetrationService
Tracks brand market presence across stores and states.
**Key Methods:**
| Method | Description |
|--------|-------------|
| `getBrandPenetration(brandName, filters)` | Store count, SKU count, coverage |
| `getTopBrandsByPenetration(limit, filters)` | Leaderboard of dominant brands |
| `getPenetrationTrend(brandName, days)` | Historical penetration growth |
| `getShelfShareByCategory(brandName)` | % of shelf per category |
| `getBrandPresenceByState(brandName)` | Multi-state presence map |
| `getStoresCarryingBrand(brandName)` | List of stores carrying brand |
| `getPenetrationHeatmap(brandName?)` | Geographic distribution |
**Penetration Calculation:**
```
Penetration % = (Stores with Brand / Total Stores in Market) × 100
```
---
### 3. CategoryAnalyticsService
Analyzes category performance and trends.
**Key Methods:**
| Method | Description |
|--------|-------------|
| `getCategorySummary(category?, filters)` | SKU count, avg price, stores |
| `getCategoryGrowth(days, filters)` | 7d/30d/90d growth rates |
| `getCategoryGrowthTrend(category, days)` | Time-series category growth |
| `getCategoryHeatmap(metric, periods)` | Visual heatmap data |
| `getTopMovers(limit, days)` | Fastest growing/declining categories |
| `getSubcategoryBreakdown(category)` | Drill-down into subcategories |
**Time Windows:**
- 7 days: Short-term volatility
- 30 days: Monthly trends
- 90 days: Seasonal patterns
---
### 4. StoreChangeService
Tracks product adds/drops, brand changes, and price movements per store.
**Key Methods:**
| Method | Description |
|--------|-------------|
| `getStoreChangeSummary(storeId)` | Overview of recent changes |
| `getStoreChangeEvents(storeId, filters)` | Event log (add, drop, price, OOS) |
| `getNewBrands(storeId, days)` | Brands added to store |
| `getLostBrands(storeId, days)` | Brands dropped from store |
| `getProductChanges(storeId, type, days)` | Filtered product changes |
| `getCategoryLeaderboard(category, limit)` | Top stores for category |
| `getMostActiveStores(days, limit)` | Stores with most changes |
| `compareStores(store1, store2)` | Side-by-side store comparison |
**Event Types:**
- `added` - New product appeared
- `discontinued` - Product removed
- `price_drop` - Price decreased
- `price_increase` - Price increased
- `restocked` - OOS → In Stock
- `out_of_stock` - In Stock → OOS
---
### 5. BrandOpportunityService
Competitive intelligence and opportunity identification.
**Key Methods:**
| Method | Description |
|--------|-------------|
| `getBrandOpportunity(brandName)` | Full opportunity analysis |
| `getMarketPositionSummary(brandName)` | Market position vs competitors |
| `getAlerts(filters)` | Analytics-generated alerts |
| `markAlertsRead(alertIds)` | Mark alerts as read |
**Opportunity Analysis Includes:**
- White space stores (potential targets)
- Competitive threats (brands gaining share)
- Pricing opportunities (underpriced vs market)
- Missing SKU recommendations
---
### 6. AnalyticsCache
In-memory caching with database fallback.
**Configuration:**
```typescript
const cache = new AnalyticsCache(pool, {
defaultTtlMinutes: 15,
});
```
**Usage Pattern:**
```typescript
const data = await cache.getOrCompute(cacheKey, async () => {
// Expensive query here
return result;
});
```
**Cache Management:**
- `GET /api/az/analytics/cache/stats` - View cache stats
- `POST /api/az/analytics/cache/clear?pattern=price*` - Clear by pattern
- Auto-cleanup of expired entries every 5 minutes
---
## API Endpoints Reference
### Price Endpoints
```bash
# Product price trend (last 30 days)
GET /api/az/analytics/price/product/12345?days=30
# Brand price trend with filters
GET /api/az/analytics/price/brand/Cookies?storeId=101&category=Flower&days=90
# Category median price
GET /api/az/analytics/price/category/Vaporizers?state=AZ
# Price summary (7d/30d/90d)
GET /api/az/analytics/price/summary?brand=Stiiizy&state=AZ
# Detect price wars
GET /api/az/analytics/price/compression/Flower?state=AZ
# Global stats
GET /api/az/analytics/price/global
```
### Penetration Endpoints
```bash
# Brand penetration
GET /api/az/analytics/penetration/brand/Cookies
# Top brands leaderboard
GET /api/az/analytics/penetration/top?limit=20&state=AZ&category=Flower
# Penetration trend
GET /api/az/analytics/penetration/trend/Cookies?days=90
# Shelf share by category
GET /api/az/analytics/penetration/shelf-share/Cookies
# Multi-state presence
GET /api/az/analytics/penetration/by-state/Cookies
# Stores carrying brand
GET /api/az/analytics/penetration/stores/Cookies
# Heatmap data
GET /api/az/analytics/penetration/heatmap?brand=Cookies
```
### Category Endpoints
```bash
# Category summary
GET /api/az/analytics/category/summary?category=Flower&state=AZ
# Category growth (7d/30d/90d)
GET /api/az/analytics/category/growth?days=30&state=AZ
# Category trend
GET /api/az/analytics/category/trend/Concentrates?days=90
# Heatmap
GET /api/az/analytics/category/heatmap?metric=growth&periods=12
# Top movers (growing/declining)
GET /api/az/analytics/category/top-movers?limit=5&days=30
# Subcategory breakdown
GET /api/az/analytics/category/Edibles/subcategories
```
### Store Endpoints
```bash
# Store change summary
GET /api/az/analytics/store/101/summary
# Event log
GET /api/az/analytics/store/101/events?type=price_drop&days=7&limit=50
# New brands
GET /api/az/analytics/store/101/brands/new?days=30
# Lost brands
GET /api/az/analytics/store/101/brands/lost?days=30
# Product changes by type
GET /api/az/analytics/store/101/products/changes?type=added&days=7
# Category leaderboard
GET /api/az/analytics/store/leaderboard/Flower?limit=20
# Most active stores
GET /api/az/analytics/store/most-active?days=7&limit=10
# Compare two stores
GET /api/az/analytics/store/compare?store1=101&store2=102
```
### Brand Opportunity Endpoints
```bash
# Full opportunity analysis
GET /api/az/analytics/brand/Cookies/opportunity
# Market position summary
GET /api/az/analytics/brand/Cookies/position
# Get alerts
GET /api/az/analytics/alerts?brand=Cookies&type=competitive&unreadOnly=true
# Mark alerts read
POST /api/az/analytics/alerts/mark-read
Body: { "alertIds": [1, 2, 3] }
```
### Maintenance Endpoints
```bash
# Capture daily snapshots (run by scheduler)
POST /api/az/analytics/snapshots/capture
# Cache statistics
GET /api/az/analytics/cache/stats
# Clear cache (admin)
POST /api/az/analytics/cache/clear?pattern=price*
```
---
## Incremental Computation
Analytics are designed for real-time queries without full recomputation:
### Snapshot Strategy
1. **Raw Data**: `store_products` (current state)
2. **Historical**: `store_product_snapshots` (time-series)
3. **Aggregated**: `brand_snapshots`, `category_snapshots` (daily rollups)
### Window Calculations
```sql
-- NOTE(review): these examples filter on crawled_at, while the materialized
-- view below uses sps.captured_at — confirm which timestamp column applies
-- to each table.
-- 7-day window
WHERE crawled_at >= NOW() - INTERVAL '7 days'
-- 30-day window
WHERE crawled_at >= NOW() - INTERVAL '30 days'
-- 90-day window
WHERE crawled_at >= NOW() - INTERVAL '90 days'
```
### Materialized Views (Optional)
For heavy queries, create materialized views:
```sql
CREATE MATERIALIZED VIEW mv_brand_daily_metrics AS
SELECT
DATE(sps.captured_at) as date,
sp.brand_id,
COUNT(DISTINCT sp.dispensary_id) as store_count,
COUNT(*) as sku_count,
AVG(sp.price_rec) as avg_price
FROM store_product_snapshots sps
JOIN store_products sp ON sps.store_product_id = sp.id
WHERE sps.captured_at >= NOW() - INTERVAL '90 days'
GROUP BY DATE(sps.captured_at), sp.brand_id;
-- Refresh daily
REFRESH MATERIALIZED VIEW CONCURRENTLY mv_brand_daily_metrics;
```
---
## Scheduled Jobs
### Daily Snapshot Capture
Trigger via cron or scheduler:
```bash
curl -X POST http://localhost:3010/api/az/analytics/snapshots/capture
```
This calls:
- `capture_brand_snapshots()` - Captures brand metrics
- `capture_category_snapshots()` - Captures category metrics
### Cache Cleanup
Automatic cleanup every 5 minutes via in-memory timer.
For manual cleanup:
```bash
curl -X POST http://localhost:3010/api/az/analytics/cache/clear
```
---
## Extending Analytics (Future Phases)
### Phase 6: Intelligence Engine
- Automated alert generation
- Recommendation engine
- Price prediction
### Phase 7: Orders Integration
- Sales velocity analytics
- Reorder predictions
- Inventory turnover
### Phase 8: Advanced ML
- Demand forecasting
- Price elasticity modeling
- Customer segmentation
---
## Troubleshooting
### Common Issues
**1. Slow queries**
- Check cache stats: `GET /api/az/analytics/cache/stats`
- Increase cache TTL if data doesn't need real-time freshness
- Add indexes on frequently filtered columns
**2. Empty results**
- Verify data exists in source tables
- Check filter parameters (case-sensitive brand names)
- Verify state codes are valid
**3. Stale data**
- Run snapshot capture: `POST /api/az/analytics/snapshots/capture`
- Clear cache: `POST /api/az/analytics/cache/clear`
### Debugging
Enable query logging:
```typescript
// In service constructor
this.debug = process.env.ANALYTICS_DEBUG === 'true';
```
---
## Data Contracts
### Price Trend Response
```typescript
interface PriceTrend {
productId?: number;
storeId?: number;
brandName?: string;
category?: string;
dataPoints: Array<{
date: string;
minPrice: number | null;
maxPrice: number | null;
avgPrice: number | null;
wholesalePrice: number | null;
sampleSize: number;
}>;
summary: {
currentAvg: number | null;
previousAvg: number | null;
changePercent: number | null;
trend: 'up' | 'down' | 'stable';
volatilityScore: number | null;
};
}
```
### Brand Penetration Response
```typescript
interface BrandPenetration {
brandName: string;
totalStores: number;
storesWithBrand: number;
penetrationPercent: number;
skuCount: number;
avgPrice: number | null;
priceRange: { min: number; max: number } | null;
topCategories: Array<{ category: string; count: number }>;
stateBreakdown?: Array<{ state: string; storeCount: number }>;
}
```
### Category Growth Response
```typescript
interface CategoryGrowth {
category: string;
currentCount: number;
previousCount: number;
growthPercent: number;
growthTrend: 'up' | 'down' | 'stable';
avgPrice: number | null;
priceChange: number | null;
topBrands: Array<{ brandName: string; count: number }>;
}
```
---
## Files Reference
| File | Purpose |
|------|---------|
| `src/dutchie-az/services/analytics/price-trends.ts` | Price analytics |
| `src/dutchie-az/services/analytics/penetration.ts` | Brand penetration |
| `src/dutchie-az/services/analytics/category-analytics.ts` | Category metrics |
| `src/dutchie-az/services/analytics/store-changes.ts` | Store event tracking |
| `src/dutchie-az/services/analytics/brand-opportunity.ts` | Competitive intel |
| `src/dutchie-az/services/analytics/cache.ts` | Caching layer |
| `src/dutchie-az/services/analytics/index.ts` | Module exports |
| `src/dutchie-az/routes/analytics.ts` | API routes (680 LOC) |
| `src/multi-state/state-query-service.ts` | Cross-state analytics |
---
## Analytics V2: Rec/Med State Segmentation
Phase 3 Enhancement: Enhanced analytics with recreational vs medical-only state analysis.
### V2 API Endpoints
All V2 endpoints are prefixed with `/api/analytics/v2`
#### V2 Price Analytics
```bash
# Price trends for a specific product
GET /api/analytics/v2/price/product/12345?window=30d
# Price by category and state (with rec/med segmentation)
GET /api/analytics/v2/price/category/Flower?state=AZ
# Price by brand and state
GET /api/analytics/v2/price/brand/Cookies?state=AZ
# Most volatile products
GET /api/analytics/v2/price/volatile?window=30d&limit=50&state=AZ
# Rec vs Med price comparison by category
GET /api/analytics/v2/price/rec-vs-med?category=Flower
```
#### V2 Brand Penetration
```bash
# Brand penetration metrics with state breakdown
GET /api/analytics/v2/brand/Cookies/penetration?window=30d
# Brand market position within categories
GET /api/analytics/v2/brand/Cookies/market-position?category=Flower&state=AZ
# Brand presence in rec vs med-only states
GET /api/analytics/v2/brand/Cookies/rec-vs-med
# Top brands by penetration
GET /api/analytics/v2/brand/top?limit=25&state=AZ
# Brands expanding or contracting
GET /api/analytics/v2/brand/expansion-contraction?window=30d&limit=25
```
#### V2 Category Analytics
```bash
# Category growth metrics
GET /api/analytics/v2/category/Flower/growth?window=30d
# Category growth trend over time
GET /api/analytics/v2/category/Flower/trend?window=30d
# Top brands in category
GET /api/analytics/v2/category/Flower/top-brands?limit=25&state=AZ
# All categories with metrics
GET /api/analytics/v2/category/all?state=AZ&limit=50
# Rec vs Med category comparison
GET /api/analytics/v2/category/rec-vs-med?category=Flower
# Fastest growing categories
GET /api/analytics/v2/category/fastest-growing?window=30d&limit=25
```
#### V2 Store Analytics
```bash
# Store change summary
GET /api/analytics/v2/store/101/summary?window=30d
# Product change events
GET /api/analytics/v2/store/101/events?window=7d&limit=100
# Store inventory composition
GET /api/analytics/v2/store/101/inventory
# Store price positioning vs market
GET /api/analytics/v2/store/101/price-position
# Most active stores by changes
GET /api/analytics/v2/store/most-active?window=7d&limit=25&state=AZ
```
#### V2 State Analytics
```bash
# State market summary
GET /api/analytics/v2/state/AZ/summary
# All states with coverage metrics
GET /api/analytics/v2/state/all
# Legal state breakdown (rec, med-only, no program)
GET /api/analytics/v2/state/legal-breakdown
# Rec vs Med pricing by category
GET /api/analytics/v2/state/rec-vs-med-pricing?category=Flower
# States with coverage gaps
GET /api/analytics/v2/state/coverage-gaps
# Cross-state pricing comparison
GET /api/analytics/v2/state/price-comparison
```
### V2 Services Architecture
```
src/services/analytics/
├── index.ts # Exports all V2 services
├── types.ts # Shared type definitions
├── PriceAnalyticsService.ts # Price trends and volatility
├── BrandPenetrationService.ts # Brand market presence
├── CategoryAnalyticsService.ts # Category growth analysis
├── StoreAnalyticsService.ts # Store change tracking
└── StateAnalyticsService.ts # State-level analytics
src/routes/analytics-v2.ts # V2 API route handlers
```
### Key V2 Features
1. **Rec/Med State Segmentation**: All analytics can be filtered and compared by legal status
2. **State Coverage Gaps**: Identify legal states with missing or stale data
3. **Cross-State Pricing**: Compare prices across recreational and medical-only markets
4. **Brand Footprint Analysis**: Track brand presence in rec vs med states
5. **Category Comparison**: Compare category performance by legal status
### V2 Migration Path
1. Run migration 052 for state cannabis flags:
```bash
psql "$DATABASE_URL" -f migrations/052_add_state_cannabis_flags.sql
```
2. Run migration 053 for analytics indexes:
```bash
psql "$DATABASE_URL" -f migrations/053_analytics_indexes.sql
```
3. Restart backend to pick up new routes
### V2 Response Examples
**Rec vs Med Price Comparison:**
```json
{
"category": "Flower",
"recreational": {
"state_count": 15,
"product_count": 12500,
"avg_price": 35.50,
"median_price": 32.00
},
"medical_only": {
"state_count": 8,
"product_count": 5200,
"avg_price": 42.00,
"median_price": 40.00
},
"price_diff_percent": -15.48
}
```
**Legal State Breakdown:**
```json
{
"recreational_states": {
"count": 24,
"dispensary_count": 850,
"product_count": 125000,
"states": [
{ "code": "CA", "name": "California", "dispensary_count": 250 },
{ "code": "CO", "name": "Colorado", "dispensary_count": 150 }
]
},
"medical_only_states": {
"count": 18,
"dispensary_count": 320,
"product_count": 45000,
"states": [
{ "code": "FL", "name": "Florida", "dispensary_count": 120 }
]
},
"no_program_states": {
"count": 9,
"states": [
{ "code": "ID", "name": "Idaho" }
]
}
}
```
---
*Phase 3 Analytics Engine - Fully Implemented*
*V2 Rec/Med State Analytics - Added December 2024*

View File

@@ -0,0 +1,594 @@
# Analytics V2 API Examples
## Overview
All endpoints are prefixed with `/api/analytics/v2`
### Filtering Options
**Time Windows:**
- `?window=7d` - Last 7 days
- `?window=30d` - Last 30 days (default)
- `?window=90d` - Last 90 days
**Legal Type Filtering:**
- `?legalType=recreational` - Recreational states only
- `?legalType=medical_only` - Medical-only states (not recreational)
- `?legalType=no_program` - States with no cannabis program
---
## 1. Price Analytics
### GET /price/product/:id
Get price trends for a specific store product.
**Request:**
```bash
GET /api/analytics/v2/price/product/12345?window=30d
```
**Response:**
```json
{
"store_product_id": 12345,
"product_name": "Blue Dream 3.5g",
"brand_name": "Cookies",
"category": "Flower",
"dispensary_id": 101,
"dispensary_name": "Green Leaf Dispensary",
"state_code": "AZ",
"data_points": [
{
"date": "2024-11-06",
"price_rec": 45.00,
"price_med": 40.00,
"price_rec_special": null,
"price_med_special": null,
"is_on_special": false
},
{
"date": "2024-11-07",
"price_rec": 42.00,
"price_med": 38.00,
"price_rec_special": null,
"price_med_special": null,
"is_on_special": false
}
],
"summary": {
"current_price": 42.00,
"min_price": 40.00,
"max_price": 48.00,
"avg_price": 43.50,
"price_change_count": 3,
"volatility_percent": 8.2
}
}
```
### GET /price/rec-vs-med
Get recreational vs medical-only price comparison by category.
**Request:**
```bash
GET /api/analytics/v2/price/rec-vs-med?category=Flower
```
**Response:**
```json
[
{
"category": "Flower",
"rec_avg": 38.50,
"rec_median": 35.00,
"med_avg": 42.00,
"med_median": 40.00
},
{
"category": "Concentrates",
"rec_avg": 45.00,
"rec_median": 42.00,
"med_avg": 48.00,
"med_median": 45.00
}
]
```
---
## 2. Brand Analytics
### GET /brand/:name/penetration
Get brand penetration metrics with state breakdown.
**Request:**
```bash
GET /api/analytics/v2/brand/Cookies/penetration?window=30d
```
**Response:**
```json
{
"brand_name": "Cookies",
"total_dispensaries": 125,
"total_skus": 450,
"avg_skus_per_dispensary": 3.6,
"states_present": ["AZ", "CA", "CO", "NV", "MI"],
"state_breakdown": [
{
"state_code": "CA",
"state_name": "California",
"legal_type": "recreational",
"dispensary_count": 45,
"sku_count": 180,
"avg_skus_per_dispensary": 4.0,
"market_share_percent": 12.5
},
{
"state_code": "AZ",
"state_name": "Arizona",
"legal_type": "recreational",
"dispensary_count": 32,
"sku_count": 128,
"avg_skus_per_dispensary": 4.0,
"market_share_percent": 15.2
}
],
"penetration_trend": [
{
"date": "2024-11-01",
"dispensary_count": 120,
"new_dispensaries": 0,
"dropped_dispensaries": 0
},
{
"date": "2024-11-08",
"dispensary_count": 123,
"new_dispensaries": 3,
"dropped_dispensaries": 0
},
{
"date": "2024-11-15",
"dispensary_count": 125,
"new_dispensaries": 2,
"dropped_dispensaries": 0
}
]
}
```
### GET /brand/:name/rec-vs-med
Get brand presence in recreational vs medical-only states.
**Request:**
```bash
GET /api/analytics/v2/brand/Cookies/rec-vs-med
```
**Response:**
```json
{
"brand_name": "Cookies",
"rec_states_count": 4,
"rec_states": ["AZ", "CA", "CO", "NV"],
"rec_dispensary_count": 110,
"rec_avg_skus": 3.8,
"med_only_states_count": 2,
"med_only_states": ["FL", "OH"],
"med_only_dispensary_count": 15,
"med_only_avg_skus": 2.5
}
```
---
## 3. Category Analytics
### GET /category/:name/growth
Get category growth metrics with state breakdown.
**Request:**
```bash
GET /api/analytics/v2/category/Flower/growth?window=30d
```
**Response:**
```json
{
"category": "Flower",
"current_sku_count": 5200,
"current_dispensary_count": 320,
"avg_price": 38.50,
"growth_data": [
{
"date": "2024-11-01",
"sku_count": 4800,
"dispensary_count": 310,
"avg_price": 39.00
},
{
"date": "2024-11-15",
"sku_count": 5000,
"dispensary_count": 315,
"avg_price": 38.75
},
{
"date": "2024-12-01",
"sku_count": 5200,
"dispensary_count": 320,
"avg_price": 38.50
}
],
"state_breakdown": [
{
"state_code": "CA",
"state_name": "California",
"legal_type": "recreational",
"sku_count": 2100,
"dispensary_count": 145,
"avg_price": 36.00
},
{
"state_code": "AZ",
"state_name": "Arizona",
"legal_type": "recreational",
"sku_count": 950,
"dispensary_count": 85,
"avg_price": 40.00
}
]
}
```
### GET /category/rec-vs-med
Get category comparison between recreational and medical-only states.
**Request:**
```bash
GET /api/analytics/v2/category/rec-vs-med
```
**Response:**
```json
[
{
"category": "Flower",
"recreational": {
"state_count": 15,
"dispensary_count": 650,
"sku_count": 12500,
"avg_price": 35.50,
"median_price": 32.00
},
"medical_only": {
"state_count": 8,
"dispensary_count": 220,
"sku_count": 4200,
"avg_price": 42.00,
"median_price": 40.00
},
"price_diff_percent": -15.48
},
{
"category": "Concentrates",
"recreational": {
"state_count": 15,
"dispensary_count": 600,
"sku_count": 8500,
"avg_price": 42.00,
"median_price": 40.00
},
"medical_only": {
"state_count": 8,
"dispensary_count": 200,
"sku_count": 3100,
"avg_price": 48.00,
"median_price": 45.00
},
"price_diff_percent": -12.50
}
]
```
---
## 4. Store Analytics
### GET /store/:id/summary
Get change summary for a store over a time window.
**Request:**
```bash
GET /api/analytics/v2/store/101/summary?window=30d
```
**Response:**
```json
{
"dispensary_id": 101,
"dispensary_name": "Green Leaf Dispensary",
"state_code": "AZ",
"window": "30d",
"products_added": 45,
"products_dropped": 12,
"brands_added": ["Alien Labs", "Connected"],
"brands_dropped": ["House Brand"],
"price_changes": 156,
"avg_price_change_percent": 3.2,
"stock_in_events": 89,
"stock_out_events": 34,
"current_product_count": 512,
"current_in_stock_count": 478
}
```
### GET /store/:id/events
Get recent product change events for a store.
**Request:**
```bash
GET /api/analytics/v2/store/101/events?window=7d&limit=50
```
**Response:**
```json
[
{
"store_product_id": 12345,
"product_name": "Blue Dream 3.5g",
"brand_name": "Cookies",
"category": "Flower",
"event_type": "price_change",
"event_date": "2024-12-05T14:30:00.000Z",
"old_value": "45.00",
"new_value": "42.00"
},
{
"store_product_id": 12346,
"product_name": "OG Kush 1g",
"brand_name": "Alien Labs",
"category": "Flower",
"event_type": "added",
"event_date": "2024-12-04T10:00:00.000Z",
"old_value": null,
"new_value": null
},
{
"store_product_id": 12300,
"product_name": "Sour Diesel Cart",
"brand_name": "Select",
"category": "Vaporizers",
"event_type": "stock_out",
"event_date": "2024-12-03T16:45:00.000Z",
"old_value": "true",
"new_value": "false"
}
]
```
---
## 5. State Analytics
### GET /state/:code/summary
Get market summary for a specific state with rec/med breakdown.
**Request:**
```bash
GET /api/analytics/v2/state/AZ/summary
```
**Response:**
```json
{
"state_code": "AZ",
"state_name": "Arizona",
"legal_status": {
"recreational_legal": true,
"rec_year": 2020,
"medical_legal": true,
"med_year": 2010
},
"coverage": {
"dispensary_count": 145,
"product_count": 18500,
"brand_count": 320,
"category_count": 12,
"snapshot_count": 2450000,
"last_crawl_at": "2024-12-06T02:30:00.000Z"
},
"pricing": {
"avg_price": 42.50,
"median_price": 38.00,
"min_price": 5.00,
"max_price": 250.00
},
"top_categories": [
{ "category": "Flower", "count": 5200 },
{ "category": "Concentrates", "count": 3800 },
{ "category": "Vaporizers", "count": 2950 },
{ "category": "Edibles", "count": 2400 },
{ "category": "Pre-Rolls", "count": 1850 }
],
"top_brands": [
{ "brand": "Cookies", "count": 450 },
{ "brand": "Alien Labs", "count": 380 },
{ "brand": "Connected", "count": 320 },
{ "brand": "Stiiizy", "count": 290 },
{ "brand": "Raw Garden", "count": 275 }
]
}
```
### GET /state/legal-breakdown
Get breakdown by legal status (recreational, medical-only, no program).
**Request:**
```bash
GET /api/analytics/v2/state/legal-breakdown
```
**Response:**
```json
{
"recreational_states": {
"count": 24,
"dispensary_count": 850,
"product_count": 125000,
"snapshot_count": 15000000,
"states": [
{ "code": "CA", "name": "California", "dispensary_count": 250 },
{ "code": "CO", "name": "Colorado", "dispensary_count": 150 },
{ "code": "AZ", "name": "Arizona", "dispensary_count": 145 },
{ "code": "MI", "name": "Michigan", "dispensary_count": 120 }
]
},
"medical_only_states": {
"count": 18,
"dispensary_count": 320,
"product_count": 45000,
"snapshot_count": 5000000,
"states": [
{ "code": "FL", "name": "Florida", "dispensary_count": 120 },
{ "code": "OH", "name": "Ohio", "dispensary_count": 85 },
{ "code": "PA", "name": "Pennsylvania", "dispensary_count": 75 }
]
},
"no_program_states": {
"count": 9,
"states": [
{ "code": "ID", "name": "Idaho" },
{ "code": "WY", "name": "Wyoming" },
{ "code": "KS", "name": "Kansas" }
]
}
}
```
### GET /state/recreational
Get list of recreational state codes.
**Request:**
```bash
GET /api/analytics/v2/state/recreational
```
**Response:**
```json
{
"legal_type": "recreational",
"states": ["AK", "AZ", "CA", "CO", "CT", "DE", "IL", "MA", "MD", "ME", "MI", "MN", "MO", "MT", "NJ", "NM", "NV", "NY", "OH", "OR", "RI", "VA", "VT", "WA"],
"count": 24
}
```
### GET /state/medical-only
Get list of medical-only state codes (not recreational).
**Request:**
```bash
GET /api/analytics/v2/state/medical-only
```
**Response:**
```json
{
"legal_type": "medical_only",
"states": ["AR", "FL", "HI", "LA", "MS", "ND", "NH", "OK", "PA", "SD", "UT", "WV"],
"count": 12
}
```
### GET /state/rec-vs-med-pricing
Get rec vs med price comparison by category.
**Request:**
```bash
GET /api/analytics/v2/state/rec-vs-med-pricing?category=Flower
```
**Response:**
```json
[
{
"category": "Flower",
"recreational": {
"state_count": 15,
"product_count": 12500,
"avg_price": 35.50,
"median_price": 32.00
},
"medical_only": {
"state_count": 8,
"product_count": 5200,
"avg_price": 42.00,
"median_price": 40.00
},
"price_diff_percent": -15.48
}
]
```
---
## How These Endpoints Support Portals
### Brand Portal Use Cases
1. **Track brand penetration**: Use `/brand/:name/penetration` to see how many stores carry the brand
2. **Compare rec vs med markets**: Use `/brand/:name/rec-vs-med` to understand footprint by legal status
3. **Identify expansion opportunities**: Use `/state/coverage-gaps` to find underserved markets
4. **Monitor pricing**: Use `/price/brand/:brand` to track pricing by state
### Buyer Portal Use Cases
1. **Compare stores**: Use `/store/:id/summary` to see activity levels
2. **Track price changes**: Use `/store/:id/events` to monitor competitor pricing
3. **Analyze categories**: Use `/category/:name/growth` to identify trending products
4. **State-level insights**: Use `/state/:code/summary` for market overview
---
## Time Window Filtering
All time-based endpoints support the `window` query parameter:
| Value | Description |
|-------|-------------|
| `7d` | Last 7 days |
| `30d` | Last 30 days (default) |
| `90d` | Last 90 days |
The window affects:
- `store_product_snapshots.captured_at` for historical data
- `store_products.first_seen_at` / `last_seen_at` for product lifecycle
- `crawl_runs.started_at` for crawl-based metrics
---
## Rec/Med Segmentation
All state-level endpoints automatically segment by:
- **Recreational**: `states.recreational_legal = TRUE`
- **Medical-only**: `states.medical_legal = TRUE AND states.recreational_legal = FALSE`
- **No program**: Both flags are FALSE or NULL
This segmentation appears in:
- `legal_type` field in responses
- State breakdown arrays
- Price comparison endpoints

View File

@@ -0,0 +1,90 @@
-- Migration 037: Add per-store crawler profiles for Dutchie dispensaries
-- This enables per-store crawler configuration without changing shared logic
-- Phase 1: Schema only - no automatic behavior changes
-- Idempotent: uses IF NOT EXISTS / DROP TRIGGER IF EXISTS throughout.
-- Create the crawler profiles table
CREATE TABLE IF NOT EXISTS dispensary_crawler_profiles (
id SERIAL PRIMARY KEY,
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
-- Human readable name for this profile
profile_name VARCHAR(255) NOT NULL,
-- High-level type, e.g. 'dutchie', 'treez', 'jane'
crawler_type VARCHAR(50) NOT NULL,
-- Optional key for mapping to a per-store crawler module later,
-- e.g. 'curaleaf-dispensary-gilbert'
profile_key VARCHAR(255),
-- Generic configuration bucket; will hold selectors, URLs, flags, etc.
config JSONB NOT NULL DEFAULT '{}'::jsonb,
-- Execution hints (safe defaults; can be overridden in config if needed)
timeout_ms INTEGER DEFAULT 30000,
download_images BOOLEAN DEFAULT TRUE,
track_stock BOOLEAN DEFAULT TRUE,
version INTEGER DEFAULT 1,
enabled BOOLEAN DEFAULT TRUE,
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW()
);
-- Unique index on dispensary_id + profile_name
-- (a dispensary may have many profiles, but names must be unique per store)
CREATE UNIQUE INDEX IF NOT EXISTS dispensary_crawler_profiles_unique_name
ON dispensary_crawler_profiles (dispensary_id, profile_name);
-- Index for finding enabled profiles by type
CREATE INDEX IF NOT EXISTS idx_crawler_profiles_type_enabled
ON dispensary_crawler_profiles (crawler_type, enabled);
-- Index for dispensary lookup
CREATE INDEX IF NOT EXISTS idx_crawler_profiles_dispensary
ON dispensary_crawler_profiles (dispensary_id);
-- Add FK from dispensaries to active profile
-- NOTE(review): this creates a circular FK pair
-- (dispensaries.active_crawler_profile_id -> profiles -> dispensaries).
-- Fine for normal operation; bulk restores may need the column deferred.
DO $$
BEGIN
IF NOT EXISTS (SELECT 1 FROM information_schema.columns
WHERE table_name = 'dispensaries'
AND column_name = 'active_crawler_profile_id') THEN
ALTER TABLE dispensaries
ADD COLUMN active_crawler_profile_id INTEGER NULL
REFERENCES dispensary_crawler_profiles(id) ON DELETE SET NULL;
END IF;
END $$;
-- Create index on the FK for faster joins
CREATE INDEX IF NOT EXISTS idx_dispensaries_active_profile
ON dispensaries (active_crawler_profile_id)
WHERE active_crawler_profile_id IS NOT NULL;
-- Create or replace trigger function for updated_at
-- NOTE(review): this is a globally named function created with OR REPLACE;
-- if another migration redefines set_updated_at_timestamp, the last
-- definition wins for EVERY table whose trigger uses it. Keep the body
-- identical everywhere it is (re)defined.
CREATE OR REPLACE FUNCTION set_updated_at_timestamp()
RETURNS TRIGGER AS $$
BEGIN
NEW.updated_at = NOW();
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
-- Add trigger to keep updated_at fresh (drop first if exists to avoid duplicates)
DROP TRIGGER IF EXISTS dispensary_crawler_profiles_set_timestamp ON dispensary_crawler_profiles;
CREATE TRIGGER dispensary_crawler_profiles_set_timestamp
BEFORE UPDATE ON dispensary_crawler_profiles
FOR EACH ROW EXECUTE PROCEDURE set_updated_at_timestamp();
-- Add comments for documentation
COMMENT ON TABLE dispensary_crawler_profiles IS 'Per-store crawler configuration profiles. Each dispensary can have multiple profiles but only one active at a time.';
COMMENT ON COLUMN dispensary_crawler_profiles.profile_name IS 'Human readable name for the profile, e.g. "Curaleaf Gilbert - Dutchie v1"';
COMMENT ON COLUMN dispensary_crawler_profiles.crawler_type IS 'The crawler implementation type: dutchie, treez, jane, sandbox, custom';
COMMENT ON COLUMN dispensary_crawler_profiles.profile_key IS 'Optional identifier for per-store crawler module mapping';
COMMENT ON COLUMN dispensary_crawler_profiles.config IS 'JSONB configuration for the crawler. Schema depends on crawler_type.';
COMMENT ON COLUMN dispensary_crawler_profiles.timeout_ms IS 'Request timeout in milliseconds (default 30000)';
COMMENT ON COLUMN dispensary_crawler_profiles.download_images IS 'Whether to download product images locally';
COMMENT ON COLUMN dispensary_crawler_profiles.track_stock IS 'Whether to track inventory/stock levels';
COMMENT ON COLUMN dispensary_crawler_profiles.version IS 'Profile version number for A/B testing or upgrades';
COMMENT ON COLUMN dispensary_crawler_profiles.enabled IS 'Whether this profile can be used (soft delete)';
COMMENT ON COLUMN dispensaries.active_crawler_profile_id IS 'FK to the currently active crawler profile for this dispensary';

View File

@@ -0,0 +1,84 @@
-- Migration: Add status field to dispensary_crawler_profiles
-- This adds a proper status column for crawler state machine
-- Status values: 'production', 'sandbox', 'needs_manual', 'disabled'
-- Add status column with default 'production' for existing profiles
-- NOTE(review): migration 041 adds the same column with DEFAULT 'sandbox'
-- under IF NOT EXISTS. Whichever migration runs first fixes the default;
-- confirm the intended order of these migrations.
ALTER TABLE dispensary_crawler_profiles
ADD COLUMN IF NOT EXISTS status VARCHAR(50) DEFAULT 'production';
-- Add next_retry_at column for sandbox retry scheduling
ALTER TABLE dispensary_crawler_profiles
ADD COLUMN IF NOT EXISTS next_retry_at TIMESTAMPTZ;
-- Add sandbox_attempt_count for quick lookup
ALTER TABLE dispensary_crawler_profiles
ADD COLUMN IF NOT EXISTS sandbox_attempt_count INTEGER DEFAULT 0;
-- Add last_sandbox_at for tracking
ALTER TABLE dispensary_crawler_profiles
ADD COLUMN IF NOT EXISTS last_sandbox_at TIMESTAMPTZ;
-- Create index for finding profiles by status
-- (partial index: only enabled profiles are ever scheduled)
CREATE INDEX IF NOT EXISTS idx_crawler_profiles_status
ON dispensary_crawler_profiles(status) WHERE enabled = true;
-- Create index for finding profiles needing retry
CREATE INDEX IF NOT EXISTS idx_crawler_profiles_next_retry
ON dispensary_crawler_profiles(next_retry_at) WHERE enabled = true AND status = 'sandbox';
-- Add comment explaining status values
COMMENT ON COLUMN dispensary_crawler_profiles.status IS
'Crawler status: production (ready for regular crawls), sandbox (discovery mode), needs_manual (max retries exceeded), disabled (turned off)';
-- Backfill the new columns from values previously stashed in config.
--
-- BUG FIX: the original first UPDATE used
--   WHERE status IS NULL OR status = ''
-- which can never match: ADD COLUMN ... DEFAULT 'production' backfills
-- every existing row with 'production', so the config-based status was
-- never applied. Match on the config value instead; the predicate also
-- keeps the statement idempotent on re-runs.
UPDATE dispensary_crawler_profiles
SET status = config->>'status'
WHERE config->>'status' IS NOT NULL
AND config->>'status' <> ''
AND status IS DISTINCT FROM config->>'status';
-- Backfill sandbox_attempt_count from config
-- (guard on jsonb_typeof: jsonb_array_length raises an error if
-- config->'sandboxAttempts' is present but not an array)
UPDATE dispensary_crawler_profiles
SET sandbox_attempt_count = jsonb_array_length(config->'sandboxAttempts')
WHERE jsonb_typeof(config->'sandboxAttempts') = 'array';
-- Backfill next_retry_at from config
-- NOTE(review): the cast will abort the migration if any stored
-- nextRetryAt is not a valid timestamp — presumed always ISO-8601; verify.
UPDATE dispensary_crawler_profiles
SET next_retry_at = (config->>'nextRetryAt')::timestamptz
WHERE config->>'nextRetryAt' IS NOT NULL;
-- Create view for crawler profile summary
-- One row per ENABLED profile (disabled profiles are filtered out),
-- joined to its dispensary, with derived display fields for dashboards.
CREATE OR REPLACE VIEW v_crawler_profile_summary AS
SELECT
dcp.id,
dcp.dispensary_id,
d.name AS dispensary_name,
d.city,
d.menu_type,
dcp.profile_name,
dcp.profile_key,
dcp.crawler_type,
dcp.status,
dcp.enabled,
dcp.sandbox_attempt_count,
dcp.next_retry_at,
dcp.last_sandbox_at,
dcp.created_at,
dcp.updated_at,
-- Profiles with a profile_key map to a dedicated per-store module;
-- those without fall back to the shared legacy crawler path.
CASE
WHEN dcp.profile_key IS NOT NULL THEN 'per-store'
ELSE 'legacy'
END AS crawler_mode,
-- Human-readable status. Sandbox rows with a NULL next_retry_at fall
-- through to 'Waiting' (the <= NOW() comparison is NULL, i.e. not true).
CASE
WHEN dcp.status = 'production' THEN 'Ready'
WHEN dcp.status = 'sandbox' AND dcp.next_retry_at <= NOW() THEN 'Retry Due'
WHEN dcp.status = 'sandbox' THEN 'Waiting'
WHEN dcp.status = 'needs_manual' THEN 'Needs Manual'
WHEN dcp.status = 'disabled' THEN 'Disabled'
ELSE 'Unknown'
END AS status_display
FROM dispensary_crawler_profiles dcp
JOIN dispensaries d ON d.id = dcp.dispensary_id
WHERE dcp.enabled = true
ORDER BY dcp.status, dcp.updated_at DESC;

View File

@@ -0,0 +1,73 @@
-- Migration: Create crawl_orchestration_traces table
-- Purpose: Store detailed step-by-step traces for every crawl orchestration run
-- This enables full visibility into per-store crawler behavior
CREATE TABLE IF NOT EXISTS crawl_orchestration_traces (
id SERIAL PRIMARY KEY,
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
run_id VARCHAR(255), -- UUID or job ID for this crawl run
profile_id INTEGER REFERENCES dispensary_crawler_profiles(id) ON DELETE SET NULL,
profile_key VARCHAR(255), -- e.g. "trulieve-scottsdale"
crawler_module VARCHAR(255), -- Full path to .ts file loaded
state_at_start VARCHAR(50), -- sandbox, production, legacy, disabled
state_at_end VARCHAR(50), -- sandbox, production, needs_manual, etc.
-- The trace: ordered array of step objects
trace JSONB NOT NULL DEFAULT '[]'::jsonb,
-- Summary metrics for quick querying
total_steps INTEGER DEFAULT 0,
duration_ms INTEGER,
success BOOLEAN,
error_message TEXT,
products_found INTEGER,
-- Timestamps
started_at TIMESTAMPTZ DEFAULT NOW(),
completed_at TIMESTAMPTZ,
created_at TIMESTAMPTZ DEFAULT NOW()
);
-- Index for quick lookup by dispensary
CREATE INDEX IF NOT EXISTS idx_traces_dispensary_id
ON crawl_orchestration_traces(dispensary_id);
-- Index for finding latest trace per dispensary
-- (also serves the DISTINCT ON query in v_latest_crawl_traces below)
CREATE INDEX IF NOT EXISTS idx_traces_dispensary_created
ON crawl_orchestration_traces(dispensary_id, created_at DESC);
-- Index for finding traces by run_id
CREATE INDEX IF NOT EXISTS idx_traces_run_id
ON crawl_orchestration_traces(run_id) WHERE run_id IS NOT NULL;
-- Index for finding traces by profile
CREATE INDEX IF NOT EXISTS idx_traces_profile_id
ON crawl_orchestration_traces(profile_id) WHERE profile_id IS NOT NULL;
-- Comment explaining trace structure
COMMENT ON COLUMN crawl_orchestration_traces.trace IS
'Ordered array of step objects. Each step has:
{
"step": 1,
"action": "load_profile",
"description": "Loading crawler profile for dispensary",
"timestamp": 1701234567890,
"duration_ms": 45,
"input": { ... },
"output": { ... },
"what": "Description of what happened",
"why": "Reason this step was taken",
"where": "Code location / module",
"how": "Method or approach used",
"when": "ISO timestamp"
}';
-- View for easy access to latest traces
-- DISTINCT ON (dispensary_id) + ORDER BY created_at DESC keeps exactly
-- the newest trace per dispensary.
CREATE OR REPLACE VIEW v_latest_crawl_traces AS
SELECT DISTINCT ON (dispensary_id)
cot.*,
d.name AS dispensary_name,
d.city AS dispensary_city
FROM crawl_orchestration_traces cot
JOIN dispensaries d ON d.id = cot.dispensary_id
ORDER BY dispensary_id, cot.created_at DESC;

View File

@@ -0,0 +1,73 @@
-- Migration 040: Add dba_name column to dispensaries table
-- DBA (Doing Business As) name - the name the dispensary operates under,
-- which may differ from the legal entity name
-- This migration is idempotent - safe to run multiple times
--
-- Rewritten to use ADD COLUMN IF NOT EXISTS (Postgres 9.6+) instead of
-- seven information_schema DO-blocks; this matches the idiom the other
-- migrations in this repo already use and is behaviorally identical.
-- Public-facing DBA name
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS dba_name TEXT DEFAULT NULL;
-- Legal entity name
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS company_name TEXT DEFAULT NULL;
-- Arizona Department of Health Services license number
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS azdhs_id INTEGER DEFAULT NULL;
-- Contact details
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS phone TEXT DEFAULT NULL;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS email TEXT DEFAULT NULL;
-- Google Maps data. NUMERIC(2,1) holds 0.0-9.9, enough for the 1.0-5.0 scale.
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS google_rating NUMERIC(2,1) DEFAULT NULL;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS google_review_count INTEGER DEFAULT NULL;
-- Add comments for documentation
COMMENT ON COLUMN dispensaries.dba_name IS 'DBA (Doing Business As) name - the public-facing name the dispensary operates under';
COMMENT ON COLUMN dispensaries.company_name IS 'Legal entity/company name that owns the dispensary';
COMMENT ON COLUMN dispensaries.azdhs_id IS 'Arizona Department of Health Services license number';
COMMENT ON COLUMN dispensaries.phone IS 'Contact phone number';
COMMENT ON COLUMN dispensaries.email IS 'Contact email address';
COMMENT ON COLUMN dispensaries.google_rating IS 'Google Maps rating (1.0 to 5.0)';
COMMENT ON COLUMN dispensaries.google_review_count IS 'Number of Google reviews';
-- Create index for searching by dba_name
CREATE INDEX IF NOT EXISTS idx_dispensaries_dba_name ON dispensaries (dba_name);
CREATE INDEX IF NOT EXISTS idx_dispensaries_azdhs_id ON dispensaries (azdhs_id);

View File

@@ -0,0 +1,376 @@
-- Migration 041: CannaiQ Canonical Schema
--
-- This migration adds the canonical CannaiQ schema tables and columns.
-- ALL CHANGES ARE ADDITIVE - NO DROPS, NO DELETES, NO TRUNCATES.
--
-- Run with: psql $CANNAIQ_DB_URL -f migrations/041_cannaiq_canonical_schema.sql
--
-- Tables created:
--   - states (new)
--   - chains (new)
--   - brands (new)
--   - store_products (new - normalized view of current menu)
--   - store_product_snapshots (new - historical crawl data)
--   - crawl_runs (new - replaces/supplements dispensary_crawl_jobs)
--
-- Tables modified:
--   - dispensaries (add state_id, chain_id FKs)
--   - dispensary_crawler_profiles (add status, allow_autopromote, validated_at)
--   - crawl_orchestration_traces (add run_id FK)
--
-- =====================================================
-- 1) STATES TABLE
-- =====================================================
-- NOTE(review): migration 043 also creates a states table (TEXT columns)
-- under IF NOT EXISTS; whichever runs first defines the column types.
CREATE TABLE IF NOT EXISTS states (
id SERIAL PRIMARY KEY,
code VARCHAR(2) NOT NULL UNIQUE,
name VARCHAR(100) NOT NULL,
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW()
);
-- Insert known states (partial seed list; ON CONFLICT keeps this idempotent)
INSERT INTO states (code, name) VALUES
('AZ', 'Arizona'),
('CA', 'California'),
('CO', 'Colorado'),
('FL', 'Florida'),
('IL', 'Illinois'),
('MA', 'Massachusetts'),
('MD', 'Maryland'),
('MI', 'Michigan'),
('MO', 'Missouri'),
('NV', 'Nevada'),
('NJ', 'New Jersey'),
('NY', 'New York'),
('OH', 'Ohio'),
('OK', 'Oklahoma'),
('OR', 'Oregon'),
('PA', 'Pennsylvania'),
('WA', 'Washington')
ON CONFLICT (code) DO NOTHING;
COMMENT ON TABLE states IS 'US states where CannaiQ operates. Single source of truth for state codes.';
-- =====================================================
-- 2) CHAINS TABLE (retail groups)
-- =====================================================
CREATE TABLE IF NOT EXISTS chains (
id SERIAL PRIMARY KEY,
name VARCHAR(255) NOT NULL,
slug VARCHAR(255) NOT NULL UNIQUE,
website_url TEXT,
logo_url TEXT,
description TEXT,
is_active BOOLEAN DEFAULT TRUE,
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW()
);
-- idx_chains_slug is redundant with the UNIQUE constraint on slug, but
-- harmless; kept as written.
CREATE INDEX IF NOT EXISTS idx_chains_slug ON chains(slug);
CREATE INDEX IF NOT EXISTS idx_chains_active ON chains(is_active) WHERE is_active = TRUE;
COMMENT ON TABLE chains IS 'Retail chains/groups that own multiple dispensary locations (e.g., Curaleaf, Trulieve).';
-- =====================================================
-- 3) BRANDS TABLE (canonical brand catalog)
-- =====================================================
CREATE TABLE IF NOT EXISTS brands (
id SERIAL PRIMARY KEY,
name VARCHAR(255) NOT NULL,
slug VARCHAR(255) NOT NULL UNIQUE,
external_id VARCHAR(100), -- Provider-specific brand ID
website_url TEXT,
instagram_handle VARCHAR(100),
logo_url TEXT,
description TEXT,
is_portfolio_brand BOOLEAN DEFAULT FALSE, -- TRUE if brand we represent
is_active BOOLEAN DEFAULT TRUE,
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_brands_slug ON brands(slug);
CREATE INDEX IF NOT EXISTS idx_brands_external_id ON brands(external_id) WHERE external_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_brands_portfolio ON brands(is_portfolio_brand) WHERE is_portfolio_brand = TRUE;
COMMENT ON TABLE brands IS 'Canonical brand catalog. Brands may appear across multiple dispensaries.';
COMMENT ON COLUMN brands.is_portfolio_brand IS 'TRUE if this is a brand we represent/manage (vs third-party brand)';
-- =====================================================
-- 4) ADD state_id AND chain_id TO dispensaries
-- =====================================================
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS state_id INTEGER REFERENCES states(id);
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS chain_id INTEGER REFERENCES chains(id);
-- NOTE: state_id backfill is done by ETL script (042_legacy_import.ts), not this migration.
CREATE INDEX IF NOT EXISTS idx_dispensaries_state_id ON dispensaries(state_id);
CREATE INDEX IF NOT EXISTS idx_dispensaries_chain_id ON dispensaries(chain_id) WHERE chain_id IS NOT NULL;
COMMENT ON COLUMN dispensaries.state_id IS 'FK to states table. Canonical state reference.';
COMMENT ON COLUMN dispensaries.chain_id IS 'FK to chains table. NULL if independent dispensary.';
-- =====================================================
-- 5) STORE_PRODUCTS TABLE (current menu state)
-- =====================================================
-- This is the normalized "what is currently on the menu" table.
-- It supplements dutchie_products with a provider-agnostic structure.
-- Identity of a menu item is (dispensary_id, provider, provider_product_id)
-- per the UNIQUE constraint below.
CREATE TABLE IF NOT EXISTS store_products (
id SERIAL PRIMARY KEY,
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
product_id INTEGER REFERENCES products(id) ON DELETE SET NULL, -- Link to canonical product
brand_id INTEGER REFERENCES brands(id) ON DELETE SET NULL, -- Link to canonical brand
-- Provider-specific identifiers
provider VARCHAR(50) NOT NULL DEFAULT 'dutchie', -- dutchie, treez, jane, etc.
provider_product_id VARCHAR(100), -- Platform-specific product ID
provider_brand_id VARCHAR(100), -- Platform-specific brand ID
-- Raw data from platform (not normalized)
name_raw VARCHAR(500) NOT NULL,
brand_name_raw VARCHAR(255),
category_raw VARCHAR(100),
subcategory_raw VARCHAR(100),
-- Pricing
price_rec NUMERIC(10,2),
price_med NUMERIC(10,2),
price_rec_special NUMERIC(10,2),
price_med_special NUMERIC(10,2),
is_on_special BOOLEAN DEFAULT FALSE,
special_name TEXT,
discount_percent NUMERIC(5,2),
-- Inventory
is_in_stock BOOLEAN DEFAULT TRUE,
stock_quantity INTEGER,
stock_status VARCHAR(50) DEFAULT 'in_stock',
-- Potency
thc_percent NUMERIC(5,2),
cbd_percent NUMERIC(5,2),
-- Images
image_url TEXT,
local_image_path TEXT,
-- Timestamps
first_seen_at TIMESTAMPTZ DEFAULT NOW(),
last_seen_at TIMESTAMPTZ DEFAULT NOW(),
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW(),
-- NOTE(review): rows with NULL provider_product_id are NOT deduplicated
-- by this constraint (NULLs compare distinct) — confirm crawlers always
-- supply a provider_product_id.
UNIQUE(dispensary_id, provider, provider_product_id)
);
CREATE INDEX IF NOT EXISTS idx_store_products_dispensary ON store_products(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_store_products_product ON store_products(product_id) WHERE product_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_store_products_brand ON store_products(brand_id) WHERE brand_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_store_products_provider ON store_products(provider);
CREATE INDEX IF NOT EXISTS idx_store_products_in_stock ON store_products(dispensary_id, is_in_stock);
CREATE INDEX IF NOT EXISTS idx_store_products_special ON store_products(dispensary_id, is_on_special) WHERE is_on_special = TRUE;
CREATE INDEX IF NOT EXISTS idx_store_products_last_seen ON store_products(last_seen_at DESC);
COMMENT ON TABLE store_products IS 'Current state of products on each dispensary menu. Provider-agnostic.';
COMMENT ON COLUMN store_products.product_id IS 'FK to canonical products table. NULL if not yet mapped.';
COMMENT ON COLUMN store_products.brand_id IS 'FK to canonical brands table. NULL if not yet mapped.';
-- =====================================================
-- 6) STORE_PRODUCT_SNAPSHOTS TABLE (historical data)
-- =====================================================
-- This is the critical time-series table for analytics.
-- One row per product per crawl. Append-only by convention.
CREATE TABLE IF NOT EXISTS store_product_snapshots (
id SERIAL PRIMARY KEY,
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
store_product_id INTEGER REFERENCES store_products(id) ON DELETE SET NULL,
product_id INTEGER REFERENCES products(id) ON DELETE SET NULL,
-- Provider info
provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',
provider_product_id VARCHAR(100),
-- Link to crawl run
crawl_run_id INTEGER, -- FK added after crawl_runs table created (section 7)
-- Capture timestamp
captured_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
-- Raw data from platform
name_raw VARCHAR(500),
brand_name_raw VARCHAR(255),
category_raw VARCHAR(100),
subcategory_raw VARCHAR(100),
-- Pricing at time of capture
price_rec NUMERIC(10,2),
price_med NUMERIC(10,2),
price_rec_special NUMERIC(10,2),
price_med_special NUMERIC(10,2),
is_on_special BOOLEAN DEFAULT FALSE,
discount_percent NUMERIC(5,2),
-- Inventory at time of capture
is_in_stock BOOLEAN DEFAULT TRUE,
stock_quantity INTEGER,
stock_status VARCHAR(50) DEFAULT 'in_stock',
-- Potency at time of capture
thc_percent NUMERIC(5,2),
cbd_percent NUMERIC(5,2),
-- Image URL at time of capture
image_url TEXT,
-- Full raw response for debugging
raw_data JSONB,
created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_snapshots_dispensary_captured ON store_product_snapshots(dispensary_id, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_product_captured ON store_product_snapshots(product_id, captured_at DESC) WHERE product_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_store_product ON store_product_snapshots(store_product_id) WHERE store_product_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_crawl_run ON store_product_snapshots(crawl_run_id) WHERE crawl_run_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_captured_at ON store_product_snapshots(captured_at DESC);
COMMENT ON TABLE store_product_snapshots IS 'Historical crawl data. One row per product per crawl. NEVER DELETE.';
COMMENT ON COLUMN store_product_snapshots.captured_at IS 'When this snapshot was captured (crawl time).';
-- =====================================================
-- 7) CRAWL_RUNS TABLE (job execution records)
-- =====================================================
CREATE TABLE IF NOT EXISTS crawl_runs (
id SERIAL PRIMARY KEY,
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
-- Provider
provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',
-- Execution times
started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
finished_at TIMESTAMPTZ,
duration_ms INTEGER,
-- Status
status VARCHAR(20) NOT NULL DEFAULT 'running', -- running, success, failed, partial
error_message TEXT,
-- Results
products_found INTEGER DEFAULT 0,
products_new INTEGER DEFAULT 0,
products_updated INTEGER DEFAULT 0,
snapshots_written INTEGER DEFAULT 0,
-- Metadata
worker_id VARCHAR(100),
trigger_type VARCHAR(50) DEFAULT 'scheduled', -- scheduled, manual, api
metadata JSONB DEFAULT '{}',
created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary ON crawl_runs(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_status ON crawl_runs(status);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_started ON crawl_runs(started_at DESC);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary_started ON crawl_runs(dispensary_id, started_at DESC);
COMMENT ON TABLE crawl_runs IS 'Each crawl execution. Links to snapshots and traces.';
-- Add FK from store_product_snapshots to crawl_runs
-- (deferred from section 6 because crawl_runs did not exist yet;
-- guarded so re-running the migration does not fail on a duplicate
-- constraint)
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM information_schema.table_constraints
WHERE constraint_name = 'store_product_snapshots_crawl_run_id_fkey'
) THEN
ALTER TABLE store_product_snapshots
ADD CONSTRAINT store_product_snapshots_crawl_run_id_fkey
FOREIGN KEY (crawl_run_id) REFERENCES crawl_runs(id) ON DELETE SET NULL;
END IF;
END $$;
-- =====================================================
-- 8) UPDATE crawl_orchestration_traces
-- =====================================================
-- Add run_id FK if not exists (column is named crawl_run_id)
ALTER TABLE crawl_orchestration_traces
ADD COLUMN IF NOT EXISTS crawl_run_id INTEGER REFERENCES crawl_runs(id) ON DELETE SET NULL;
CREATE INDEX IF NOT EXISTS idx_traces_crawl_run
ON crawl_orchestration_traces(crawl_run_id)
WHERE crawl_run_id IS NOT NULL;
-- =====================================================
-- 9) UPDATE dispensary_crawler_profiles
-- =====================================================
-- Add missing columns from canonical schema
-- NOTE(review): the earlier status migration adds this same column with
-- DEFAULT 'production'. Because of IF NOT EXISTS, whichever migration
-- runs first wins the default ('sandbox' here) — confirm intended order.
ALTER TABLE dispensary_crawler_profiles
ADD COLUMN IF NOT EXISTS status VARCHAR(50) DEFAULT 'sandbox';
ALTER TABLE dispensary_crawler_profiles
ADD COLUMN IF NOT EXISTS allow_autopromote BOOLEAN DEFAULT FALSE;
ALTER TABLE dispensary_crawler_profiles
ADD COLUMN IF NOT EXISTS validated_at TIMESTAMPTZ;
-- (largely overlaps the partial idx_crawler_profiles_status index created
-- in the status migration; kept as written)
CREATE INDEX IF NOT EXISTS idx_profiles_status
ON dispensary_crawler_profiles(status);
COMMENT ON COLUMN dispensary_crawler_profiles.status IS 'Profile status: sandbox, production, needs_manual, disabled';
COMMENT ON COLUMN dispensary_crawler_profiles.allow_autopromote IS 'Whether this profile can be auto-promoted from sandbox to production';
COMMENT ON COLUMN dispensary_crawler_profiles.validated_at IS 'When this profile was last validated as working';
-- =====================================================
-- 10) VIEWS FOR BACKWARD COMPATIBILITY
-- =====================================================
-- View to get latest snapshot per store product.
--
-- BUG FIX: product identity is (dispensary_id, provider,
-- provider_product_id) — see the UNIQUE constraint on store_products.
-- The original DISTINCT ON omitted provider, so snapshots from different
-- providers that happen to share a provider_product_id were collapsed
-- into a single (arbitrary) row. Include provider in both the DISTINCT ON
-- key and the ORDER BY prefix.
CREATE OR REPLACE VIEW v_latest_store_snapshots AS
SELECT DISTINCT ON (dispensary_id, provider, provider_product_id)
sps.*
FROM store_product_snapshots sps
ORDER BY dispensary_id, provider, provider_product_id, captured_at DESC;
-- View to get crawl run summary per dispensary.
--
-- PERF FIX: the original LEFT JOINed both store_products and crawl_runs,
-- producing a cross product (every product row x every run row) per
-- dispensary before aggregating. COUNT(DISTINCT ...) masked the fan-out
-- but the intermediate result grew quadratically. Moving the crawl_runs
-- aggregates into scalar subqueries (matching the existing
-- last_crawl_status subquery) yields identical output without the blow-up.
CREATE OR REPLACE VIEW v_dispensary_crawl_summary AS
SELECT
d.id AS dispensary_id,
d.name AS dispensary_name,
d.city,
d.state,
COUNT(DISTINCT sp.id) AS current_product_count,
COUNT(DISTINCT sp.id) FILTER (WHERE sp.is_in_stock) AS in_stock_count,
COUNT(DISTINCT sp.id) FILTER (WHERE sp.is_on_special) AS on_special_count,
(SELECT MAX(finished_at) FROM crawl_runs WHERE dispensary_id = d.id) AS last_crawl_at,
(SELECT status FROM crawl_runs WHERE dispensary_id = d.id ORDER BY started_at DESC LIMIT 1) AS last_crawl_status
FROM dispensaries d
LEFT JOIN store_products sp ON sp.dispensary_id = d.id
GROUP BY d.id, d.name, d.city, d.state;
-- =====================================================
-- 11) COMMENTS
-- =====================================================
-- NOTE(review): COMMENT ON stores a single comment per object (last write
-- wins), so these statements OVERWRITE the more detailed per-table
-- comments set earlier in this file (e.g. the 'NEVER DELETE' warning on
-- store_product_snapshots). Consider dropping this section or the
-- earlier duplicates.
COMMENT ON TABLE states IS 'Canonical list of US states. Use state_id FK in dispensaries.';
COMMENT ON TABLE chains IS 'Retail chains (multi-location operators).';
COMMENT ON TABLE brands IS 'Canonical brand catalog across all providers.';
COMMENT ON TABLE store_products IS 'Current menu state per dispensary. Provider-agnostic.';
COMMENT ON TABLE store_product_snapshots IS 'Historical price/stock data. One row per product per crawl.';
COMMENT ON TABLE crawl_runs IS 'Crawl execution records. Links snapshots to runs.';
-- =====================================================
-- MIGRATION COMPLETE
-- =====================================================
--
-- Next steps (manual - not in this migration):
-- 1. Populate chains table from known retail groups
-- 2. Populate brands table from existing dutchie_products.brand_name
-- 3. Migrate data from dutchie_products → store_products
-- 4. Migrate data from dutchie_product_snapshots → store_product_snapshots
-- 5. Link dispensaries.chain_id to chains where applicable
--

View File

@@ -0,0 +1,50 @@
-- Migration 043: Add States Table
--
-- Creates the states table if it does not exist.
-- Safe to run multiple times (idempotent).
--
-- Run with:
-- CANNAIQ_DB_URL="postgresql://..." psql $CANNAIQ_DB_URL -f migrations/043_add_states_table.sql
-- =====================================================
-- 1) CREATE STATES TABLE
-- =====================================================
CREATE TABLE IF NOT EXISTS states (
  id SERIAL PRIMARY KEY,
  code TEXT NOT NULL UNIQUE,            -- two-letter state code, e.g. 'AZ'
  name TEXT NOT NULL,
  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW()  -- NOTE(review): no trigger maintains this; set it from application code
);
-- =====================================================
-- 2) INSERT CORE US STATES
-- =====================================================
INSERT INTO states (code, name) VALUES
  ('AZ', 'Arizona'),
  ('CA', 'California'),
  ('CO', 'Colorado'),
  ('FL', 'Florida'),
  ('IL', 'Illinois'),
  ('MA', 'Massachusetts'),
  ('MD', 'Maryland'),
  ('MI', 'Michigan'),
  ('MO', 'Missouri'),
  ('NV', 'Nevada'),
  ('NJ', 'New Jersey'),
  ('NY', 'New York'),
  ('OH', 'Ohio'),
  ('OK', 'Oklahoma'),
  ('OR', 'Oregon'),
  ('PA', 'Pennsylvania'),
  ('WA', 'Washington')
ON CONFLICT (code) DO NOTHING;
-- =====================================================
-- 3) INDEX
-- =====================================================
-- FIX: removed the redundant "CREATE INDEX idx_states_code ON states(code)".
-- The UNIQUE constraint on states.code already creates an implicit btree
-- index on that column; a second index only adds write overhead.
-- If the old index exists from a prior run, drop it manually:
--   DROP INDEX IF EXISTS idx_states_code;
-- =====================================================
-- DONE
-- =====================================================

View File

@@ -0,0 +1,45 @@
-- Migration 044: Add provider_detection_data column to dispensaries
--
-- This column stores detection metadata for menu provider discovery.
-- Used by menu-detection.ts and discovery.ts to track:
-- - Detected provider type
-- - Resolution attempts
-- - Error messages
-- - not_crawlable flag
--
-- Run with: psql $CANNAIQ_DB_URL -f migrations/044_add_provider_detection_data.sql
--
-- ALL CHANGES ARE ADDITIVE - NO DROPS, NO DELETES, NO TRUNCATES.
-- Add provider_detection_data to dispensaries table
-- (guarded via information_schema because plain ADD COLUMN is not idempotent;
-- RAISE NOTICE reports which branch was taken when run under psql)
DO $$
BEGIN
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensaries' AND column_name = 'provider_detection_data'
  ) THEN
    ALTER TABLE dispensaries
    ADD COLUMN provider_detection_data JSONB DEFAULT NULL;
    RAISE NOTICE 'Added provider_detection_data column to dispensaries table';
  ELSE
    RAISE NOTICE 'provider_detection_data column already exists on dispensaries table';
  END IF;
END;
$$ LANGUAGE plpgsql;
-- Add index for querying by not_crawlable flag
-- NOTE: ->> yields text, so queries must compare against the strings
-- 'true'/'false', not booleans.
CREATE INDEX IF NOT EXISTS idx_dispensaries_provider_detection_not_crawlable
ON dispensaries ((provider_detection_data->>'not_crawlable'))
WHERE provider_detection_data IS NOT NULL;
-- Add index for querying by detected provider
CREATE INDEX IF NOT EXISTS idx_dispensaries_provider_detection_provider
ON dispensaries ((provider_detection_data->>'detected_provider'))
WHERE provider_detection_data IS NOT NULL;
COMMENT ON COLUMN dispensaries.provider_detection_data IS 'JSONB metadata from menu provider detection. Keys: detected_provider, resolution_error, not_crawlable, detection_timestamp';
-- =====================================================
-- MIGRATION COMPLETE
-- =====================================================
View File

@@ -0,0 +1,27 @@
-- Migration 045: Add thumbnail_url columns to canonical tables
--
-- NOTE: image_url already exists in both tables from migration 041.
-- This migration adds thumbnail_url for cached thumbnail images.
DO $$
DECLARE
  tbl TEXT;
BEGIN
  -- Identical nullable TEXT column on both canonical tables; skipped when
  -- the column is already present, so the migration stays idempotent.
  FOREACH tbl IN ARRAY ARRAY['store_products', 'store_product_snapshots'] LOOP
    IF NOT EXISTS (
      SELECT 1 FROM information_schema.columns
      WHERE table_name = tbl AND column_name = 'thumbnail_url'
    ) THEN
      EXECUTE format('ALTER TABLE %I ADD COLUMN thumbnail_url TEXT NULL', tbl);
    END IF;
  END LOOP;
END;
$$ LANGUAGE plpgsql;
COMMENT ON COLUMN store_products.thumbnail_url IS 'URL to cached thumbnail image';
COMMENT ON COLUMN store_product_snapshots.thumbnail_url IS 'URL to cached thumbnail image at time of snapshot';

View File

@@ -0,0 +1,351 @@
-- Migration 046: Crawler Reliability & Stabilization
-- Phase 1: Add fields for error taxonomy, retry management, and self-healing
-- ============================================================
-- PART 1: Error Taxonomy - Standardized error codes
-- ============================================================
-- Create enum for standardized error codes.
-- Wrapped in DO because CREATE TYPE has no IF NOT EXISTS form; checking
-- pg_type keeps the migration re-runnable.
-- NOTE: the error-code columns added later in this migration use TEXT
-- ("for flexibility, validated in app"); this enum documents the canonical
-- value set.
DO $$
BEGIN
  IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'crawl_error_code') THEN
    CREATE TYPE crawl_error_code AS ENUM (
      'SUCCESS',
      'RATE_LIMITED',
      'BLOCKED_PROXY',
      'HTML_CHANGED',
      'TIMEOUT',
      'AUTH_FAILED',
      'NETWORK_ERROR',
      'PARSE_ERROR',
      'NO_PRODUCTS',
      'UNKNOWN_ERROR'
    );
  END IF;
END;
$$ LANGUAGE plpgsql;
-- ============================================================
-- PART 2: Dispensary Crawl Configuration
-- ============================================================
-- Add crawl config columns to dispensaries.
-- Data-driven form: each (column, definition) pair is added only when
-- information_schema shows the column is missing, keeping the migration
-- idempotent — exactly like the original one-IF-per-column version.
DO $$
DECLARE
  col RECORD;
BEGIN
  FOR col IN
    SELECT * FROM (VALUES
      ('crawl_frequency_minutes', 'INTEGER DEFAULT 240'),      -- minutes between crawls
      ('max_retries',             'INTEGER DEFAULT 3'),        -- max retries per crawl
      ('current_proxy_id',        'INTEGER NULL'),             -- current proxy ID
      ('current_user_agent',      'TEXT NULL'),                -- current user agent
      ('next_crawl_at',           'TIMESTAMPTZ NULL'),         -- next scheduled run
      ('last_success_at',         'TIMESTAMPTZ NULL'),         -- last successful crawl
      ('last_error_code',         'TEXT NULL'),                -- last error code (text for flexibility, validated in app)
      ('crawl_status',            'TEXT DEFAULT ''active'''),  -- active, degraded, paused, failed
      ('backoff_multiplier',      'NUMERIC(4,2) DEFAULT 1.0'), -- increases with failures
      ('total_attempts',          'INTEGER DEFAULT 0'),        -- total attempt count (lifetime)
      ('total_successes',         'INTEGER DEFAULT 0')         -- total success count (lifetime)
    ) AS cols(col_name, col_def)
  LOOP
    IF NOT EXISTS (
      SELECT 1 FROM information_schema.columns
      WHERE table_name = 'dispensaries' AND column_name = col.col_name
    ) THEN
      EXECUTE format('ALTER TABLE dispensaries ADD COLUMN %I %s', col.col_name, col.col_def);
    END IF;
  END LOOP;
END;
$$ LANGUAGE plpgsql;
-- ============================================================
-- PART 3: Enhanced Job Tracking
-- ============================================================
-- Add columns to dispensary_crawl_jobs.
-- Data-driven form: each (column, definition) pair is added only when
-- information_schema shows the column is missing — behaviorally identical
-- to the original one-IF-per-column version.
DO $$
DECLARE
  col RECORD;
BEGIN
  FOR col IN
    SELECT * FROM (VALUES
      ('error_code',       'TEXT NULL'),         -- error code for this job
      ('proxy_used',       'TEXT NULL'),         -- proxy used for this job
      ('user_agent_used',  'TEXT NULL'),         -- user agent used for this job
      ('attempt_number',   'INTEGER DEFAULT 1'), -- attempt number for this job
      ('backoff_delay_ms', 'INTEGER DEFAULT 0'), -- backoff delay applied (ms)
      ('http_status',      'INTEGER NULL'),      -- HTTP status code received
      ('response_time_ms', 'INTEGER NULL')       -- response time (ms)
    ) AS cols(col_name, col_def)
  LOOP
    IF NOT EXISTS (
      SELECT 1 FROM information_schema.columns
      WHERE table_name = 'dispensary_crawl_jobs' AND column_name = col.col_name
    ) THEN
      EXECUTE format('ALTER TABLE dispensary_crawl_jobs ADD COLUMN %I %s', col.col_name, col.col_def);
    END IF;
  END LOOP;
END;
$$ LANGUAGE plpgsql;
-- ============================================================
-- PART 4: Crawl History Table (for detailed tracking)
-- ============================================================
-- One row per crawl attempt; finer-grained than dispensary_crawl_jobs.
-- job_id links an attempt back to its job when one exists (nullable).
CREATE TABLE IF NOT EXISTS crawl_attempts (
  id SERIAL PRIMARY KEY,
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id),
  job_id INTEGER REFERENCES dispensary_crawl_jobs(id),
  -- Timing
  started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  finished_at TIMESTAMPTZ,
  duration_ms INTEGER,
  -- Result
  -- error_code values mirror the crawl_error_code enum from Part 1
  -- (kept TEXT here, same convention as dispensaries.last_error_code).
  error_code TEXT NOT NULL DEFAULT 'UNKNOWN_ERROR',
  error_message TEXT,
  http_status INTEGER,
  -- Context
  attempt_number INTEGER NOT NULL DEFAULT 1,
  proxy_used TEXT,
  user_agent_used TEXT,
  -- Metrics
  products_found INTEGER DEFAULT 0,
  products_upserted INTEGER DEFAULT 0,
  snapshots_created INTEGER DEFAULT 0,
  -- Metadata
  metadata JSONB,
  created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Index for quick lookups (per-store history, error drill-down, recency scans)
CREATE INDEX IF NOT EXISTS idx_crawl_attempts_dispensary_id ON crawl_attempts(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_crawl_attempts_error_code ON crawl_attempts(error_code);
CREATE INDEX IF NOT EXISTS idx_crawl_attempts_started_at ON crawl_attempts(started_at DESC);
-- ============================================================
-- PART 5: Views for Monitoring
-- ============================================================
-- Drop existing view if exists
-- (DROP + CREATE rather than CREATE OR REPLACE so the column list may change
-- between migration versions)
DROP VIEW IF EXISTS v_crawler_status;
-- Crawler status view with all reliability fields
-- NOTE(review): scope is hardcoded to state = 'AZ'; widen the WHERE clause
-- when other states come online.
-- NOTE(review): consecutive_failures, last_crawl_at, last_failure_at,
-- product_count, failed_at and failure_notes are assumed to exist from
-- earlier migrations — confirm before running this file standalone.
CREATE VIEW v_crawler_status AS
SELECT
  d.id,
  d.name,
  d.slug,
  d.menu_type,
  d.platform_dispensary_id,
  d.crawl_status,
  d.consecutive_failures,
  d.last_crawl_at,
  d.last_success_at,
  d.last_failure_at,
  d.last_error_code,
  d.next_crawl_at,
  d.crawl_frequency_minutes,
  d.max_retries,
  d.current_proxy_id,
  d.current_user_agent,
  d.backoff_multiplier,
  d.total_attempts,
  d.total_successes,
  d.product_count,
  -- Lifetime success percentage; 0 when the store has never been attempted.
  CASE
    WHEN d.total_attempts > 0
    THEN ROUND(d.total_successes::NUMERIC / d.total_attempts * 100, 1)
    ELSE 0
  END AS success_rate,
  -- Derived scheduling state; explicit failure states take precedence over
  -- missing-configuration states, which take precedence over timing states.
  CASE
    WHEN d.crawl_status = 'failed' THEN 'FAILED'
    WHEN d.crawl_status = 'paused' THEN 'PAUSED'
    WHEN d.crawl_status = 'degraded' THEN 'DEGRADED'
    WHEN d.menu_type IS NULL OR d.menu_type = 'unknown' THEN 'NEEDS_DETECTION'
    WHEN d.platform_dispensary_id IS NULL THEN 'NEEDS_PLATFORM_ID'
    WHEN d.next_crawl_at IS NULL THEN 'NOT_SCHEDULED'
    WHEN d.next_crawl_at <= NOW() THEN 'DUE'
    ELSE 'SCHEDULED'
  END AS schedule_status,
  d.failed_at,
  d.failure_notes
FROM dispensaries d
WHERE d.state = 'AZ';
-- Drop existing view if exists
DROP VIEW IF EXISTS v_crawl_error_summary;
-- Error summary view (rolling 7-day window over crawl_attempts)
CREATE VIEW v_crawl_error_summary AS
SELECT
  error_code,
  COUNT(*) as total_occurrences,
  COUNT(DISTINCT dispensary_id) as affected_stores,
  MAX(started_at) as last_occurrence,
  AVG(duration_ms)::INTEGER as avg_duration_ms
FROM crawl_attempts
WHERE started_at > NOW() - INTERVAL '7 days'
GROUP BY error_code
ORDER BY total_occurrences DESC;
-- Drop existing view if exists
DROP VIEW IF EXISTS v_crawl_health;
-- Overall crawl health view
-- NOTE(review): also hardcoded to AZ dutchie stores; keep in sync with the
-- scope of v_crawler_status above.
CREATE VIEW v_crawl_health AS
SELECT
  COUNT(*) FILTER (WHERE crawl_status = 'active') as active_crawlers,
  COUNT(*) FILTER (WHERE crawl_status = 'degraded') as degraded_crawlers,
  COUNT(*) FILTER (WHERE crawl_status = 'paused') as paused_crawlers,
  COUNT(*) FILTER (WHERE crawl_status = 'failed') as failed_crawlers,
  COUNT(*) FILTER (WHERE next_crawl_at <= NOW()) as due_now,
  COUNT(*) FILTER (WHERE consecutive_failures > 0) as stores_with_failures,
  AVG(consecutive_failures)::NUMERIC(4,2) as avg_consecutive_failures,
  COUNT(*) FILTER (WHERE last_success_at > NOW() - INTERVAL '24 hours') as successful_last_24h
FROM dispensaries
WHERE state = 'AZ' AND menu_type = 'dutchie';
-- ============================================================
-- PART 6: Constraint for minimum crawl gap
-- ============================================================
-- Function to check minimum crawl gap (2 minutes).
-- Rejects a new 'pending' job when another pending/running job for the same
-- dispensary was created within the last 2 minutes; the RAISE aborts the
-- offending INSERT.
-- NOTE(review): the EXISTS check is not concurrency-safe — two simultaneous
-- INSERTs can both pass it before either row is visible; treat this as
-- best-effort throttling, not a hard guarantee.
CREATE OR REPLACE FUNCTION check_minimum_crawl_gap()
RETURNS TRIGGER AS $$
BEGIN
  -- Only check for new pending jobs
  IF NEW.status = 'pending' AND NEW.dispensary_id IS NOT NULL THEN
    -- Check if there's a recent job for same dispensary
    IF EXISTS (
      SELECT 1 FROM dispensary_crawl_jobs
      WHERE dispensary_id = NEW.dispensary_id
      AND id != NEW.id
      AND status IN ('pending', 'running')
      AND created_at > NOW() - INTERVAL '2 minutes'
    ) THEN
      RAISE EXCEPTION 'Minimum 2-minute gap required between crawls for same dispensary';
    END IF;
  END IF;
  RETURN NEW;
END;
$$ LANGUAGE plpgsql;
-- Create trigger (drop first if exists)
DROP TRIGGER IF EXISTS enforce_minimum_crawl_gap ON dispensary_crawl_jobs;
CREATE TRIGGER enforce_minimum_crawl_gap
  BEFORE INSERT ON dispensary_crawl_jobs
  FOR EACH ROW
  EXECUTE FUNCTION check_minimum_crawl_gap();
-- ============================================================
-- PART 7: Comments
-- ============================================================
COMMENT ON TABLE crawl_attempts IS 'Detailed history of every crawl attempt for analytics and debugging';
COMMENT ON VIEW v_crawler_status IS 'Current status of all crawlers with reliability metrics';
COMMENT ON VIEW v_crawl_error_summary IS 'Summary of errors by type over last 7 days';
COMMENT ON VIEW v_crawl_health IS 'Overall health metrics for the crawling system';

View File

@@ -0,0 +1,130 @@
-- Migration 046: Raw Payloads Table (NOTE: this number collides with the crawler-reliability migration, which is also numbered 046 — renumber one of the two files so migration ordering stays unambiguous)
--
-- Immutable event stream for raw crawler responses.
-- NEVER delete or overwrite historical payloads.
--
-- Run with:
-- DATABASE_URL="postgresql://..." psql $DATABASE_URL -f migrations/046_raw_payloads_table.sql
-- =====================================================
-- 1) RAW_PAYLOADS TABLE
-- =====================================================
CREATE TABLE IF NOT EXISTS raw_payloads (
  -- gen_random_uuid() is built in on PostgreSQL 13+; older servers need the
  -- pgcrypto extension — TODO confirm target server version.
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  -- Store reference
  -- NOTE(review): ON DELETE CASCADE removes payloads when a dispensary row
  -- is deleted, which is in tension with the "NEVER delete" policy in the
  -- header — confirm this is intended.
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
  -- Crawl run reference (nullable for backfilled data)
  crawl_run_id INTEGER REFERENCES crawl_runs(id) ON DELETE SET NULL,
  -- Platform identification
  platform VARCHAR(50) NOT NULL DEFAULT 'dutchie',
  -- Versioning for schema evolution
  payload_version INTEGER NOT NULL DEFAULT 1,
  -- The raw JSON response from the crawler (immutable)
  raw_json JSONB NOT NULL,
  -- Metadata
  product_count INTEGER, -- Number of products in payload
  pricing_type VARCHAR(20), -- 'rec', 'med', or 'both'
  crawl_mode VARCHAR(20), -- 'mode_a', 'mode_b', 'dual'
  -- Timestamps
  fetched_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  -- Hydration status
  processed BOOLEAN NOT NULL DEFAULT FALSE,
  normalized_at TIMESTAMPTZ,
  hydration_error TEXT,
  hydration_attempts INTEGER DEFAULT 0,
  -- Audit
  created_at TIMESTAMPTZ DEFAULT NOW()
);
-- =====================================================
-- 2) INDEXES FOR EFFICIENT QUERYING
-- =====================================================
-- Primary lookup: unprocessed payloads in FIFO order
-- (partial index stays small because processed rows drop out of it)
CREATE INDEX IF NOT EXISTS idx_raw_payloads_unprocessed
ON raw_payloads(fetched_at ASC)
WHERE processed = FALSE;
-- Store-based lookups
CREATE INDEX IF NOT EXISTS idx_raw_payloads_dispensary
ON raw_payloads(dispensary_id, fetched_at DESC);
-- Platform filtering
CREATE INDEX IF NOT EXISTS idx_raw_payloads_platform
ON raw_payloads(platform);
-- Crawl run linkage
CREATE INDEX IF NOT EXISTS idx_raw_payloads_crawl_run
ON raw_payloads(crawl_run_id)
WHERE crawl_run_id IS NOT NULL;
-- Error tracking
CREATE INDEX IF NOT EXISTS idx_raw_payloads_errors
ON raw_payloads(hydration_attempts, processed)
WHERE hydration_error IS NOT NULL;
-- =====================================================
-- 3) HYDRATION LOCKS TABLE (distributed locking)
-- =====================================================
-- One row per named lock; lock_name is UNIQUE so acquisition can be an
-- INSERT race, and expires_at/heartbeat_at let stale locks be reclaimed.
CREATE TABLE IF NOT EXISTS hydration_locks (
  id SERIAL PRIMARY KEY,
  lock_name VARCHAR(100) NOT NULL UNIQUE,
  worker_id VARCHAR(100) NOT NULL,
  acquired_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  expires_at TIMESTAMPTZ NOT NULL,
  heartbeat_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_hydration_locks_expires
ON hydration_locks(expires_at);
-- =====================================================
-- 4) HYDRATION_RUNS TABLE (audit trail)
-- =====================================================
-- One row per hydration job execution; metrics are cumulative for the run.
CREATE TABLE IF NOT EXISTS hydration_runs (
  id SERIAL PRIMARY KEY,
  worker_id VARCHAR(100) NOT NULL,
  started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  finished_at TIMESTAMPTZ,
  status VARCHAR(20) NOT NULL DEFAULT 'running', -- running, completed, failed
  -- Metrics
  payloads_processed INTEGER DEFAULT 0,
  products_upserted INTEGER DEFAULT 0,
  snapshots_created INTEGER DEFAULT 0,
  brands_created INTEGER DEFAULT 0,
  errors_count INTEGER DEFAULT 0,
  -- Error details
  error_message TEXT,
  created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_hydration_runs_status
ON hydration_runs(status, started_at DESC);
-- =====================================================
-- 5) COMMENTS
-- =====================================================
COMMENT ON TABLE raw_payloads IS 'Immutable event stream of raw crawler responses. NEVER DELETE.';
COMMENT ON COLUMN raw_payloads.raw_json IS 'Complete raw JSON from GraphQL/API response. Immutable.';
COMMENT ON COLUMN raw_payloads.payload_version IS 'Schema version for normalization compatibility.';
COMMENT ON COLUMN raw_payloads.processed IS 'TRUE when payload has been hydrated to canonical tables.';
COMMENT ON COLUMN raw_payloads.normalized_at IS 'When the payload was successfully hydrated.';
COMMENT ON TABLE hydration_locks IS 'Distributed locks for hydration workers to prevent double-processing.';
COMMENT ON TABLE hydration_runs IS 'Audit trail of hydration job executions.';
-- =====================================================
-- MIGRATION COMPLETE
-- =====================================================

View File

@@ -0,0 +1,473 @@
-- Migration 047: Analytics Infrastructure
-- Phase 3: Analytics Dashboards for CannaiQ
-- Creates views, functions, and tables for price trends, brand penetration, category growth, etc.
-- ============================================================
-- ANALYTICS CACHE TABLE (for expensive query results)
-- ============================================================
CREATE TABLE IF NOT EXISTS analytics_cache (
  id SERIAL PRIMARY KEY,
  cache_key VARCHAR(255) NOT NULL UNIQUE, -- lookup key; UNIQUE already creates a btree index
  cache_data JSONB NOT NULL,
  computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  expires_at TIMESTAMPTZ NOT NULL,
  query_time_ms INTEGER,                  -- how long the cached query took to compute
  created_at TIMESTAMPTZ DEFAULT NOW()
);
-- FIX: removed the redundant "CREATE INDEX idx_analytics_cache_key" — the
-- UNIQUE constraint on cache_key already provides an implicit btree index on
-- that column. If the old index exists from a prior run, drop it manually:
--   DROP INDEX IF EXISTS idx_analytics_cache_key;
CREATE INDEX IF NOT EXISTS idx_analytics_cache_expires ON analytics_cache(expires_at);
-- ============================================================
-- PRICE EXTRACTION HELPER FUNCTION
-- Extracts pricing from JSONB latest_raw_payload
-- ============================================================
-- Returns the lowest numeric value in payload->'recPrices', falling back to
-- payload->'Prices'; NULL when neither array holds a usable number.
--
-- FIX 1: the old pattern '^[0-9.]+$' also matched strings like '1.2.3' or
-- '...', which then raised at the ::NUMERIC cast. The pattern now only
-- matches well-formed decimals ('5', '5.', '.5', '5.25').
-- FIX 2: jsonb_array_length() raises when the key holds a scalar/object;
-- guarding with jsonb_typeof() makes malformed payloads yield NULL instead
-- of aborting the whole query.
CREATE OR REPLACE FUNCTION extract_min_price(payload JSONB)
RETURNS NUMERIC AS $$
DECLARE
  prices JSONB;
  min_val NUMERIC;
BEGIN
  -- Try recPrices first (retail prices)
  prices := payload->'recPrices';
  IF prices IS NOT NULL AND jsonb_typeof(prices) = 'array' AND jsonb_array_length(prices) > 0 THEN
    SELECT MIN(value::NUMERIC) INTO min_val
    FROM jsonb_array_elements_text(prices) AS value
    WHERE value ~ '^([0-9]+(\.[0-9]*)?|\.[0-9]+)$';
    IF min_val IS NOT NULL THEN RETURN min_val; END IF;
  END IF;
  -- Fall back to Prices array
  prices := payload->'Prices';
  IF prices IS NOT NULL AND jsonb_typeof(prices) = 'array' AND jsonb_array_length(prices) > 0 THEN
    SELECT MIN(value::NUMERIC) INTO min_val
    FROM jsonb_array_elements_text(prices) AS value
    WHERE value ~ '^([0-9]+(\.[0-9]*)?|\.[0-9]+)$';
    IF min_val IS NOT NULL THEN RETURN min_val; END IF;
  END IF;
  RETURN NULL;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
-- Returns the highest numeric value in payload->'recPrices', falling back to
-- payload->'Prices'; NULL when neither array holds a usable number.
-- Same fixes as extract_min_price: strict decimal pattern (the old
-- '^[0-9.]+$' matched '1.2.3'/'...' and raised at the ::NUMERIC cast) and a
-- jsonb_typeof() guard so non-array values yield NULL instead of erroring.
CREATE OR REPLACE FUNCTION extract_max_price(payload JSONB)
RETURNS NUMERIC AS $$
DECLARE
  prices JSONB;
  max_val NUMERIC;
BEGIN
  prices := payload->'recPrices';
  IF prices IS NOT NULL AND jsonb_typeof(prices) = 'array' AND jsonb_array_length(prices) > 0 THEN
    SELECT MAX(value::NUMERIC) INTO max_val
    FROM jsonb_array_elements_text(prices) AS value
    WHERE value ~ '^([0-9]+(\.[0-9]*)?|\.[0-9]+)$';
    IF max_val IS NOT NULL THEN RETURN max_val; END IF;
  END IF;
  prices := payload->'Prices';
  IF prices IS NOT NULL AND jsonb_typeof(prices) = 'array' AND jsonb_array_length(prices) > 0 THEN
    SELECT MAX(value::NUMERIC) INTO max_val
    FROM jsonb_array_elements_text(prices) AS value
    WHERE value ~ '^([0-9]+(\.[0-9]*)?|\.[0-9]+)$';
    IF max_val IS NOT NULL THEN RETURN max_val; END IF;
  END IF;
  RETURN NULL;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
-- Returns the lowest numeric value in payload->'wholesalePrices', or NULL.
-- Same fixes as extract_min_price: strict decimal pattern (the old
-- '^[0-9.]+$' matched '1.2.3'/'...' and raised at the ::NUMERIC cast) and a
-- jsonb_typeof() guard so non-array values yield NULL instead of erroring.
CREATE OR REPLACE FUNCTION extract_wholesale_price(payload JSONB)
RETURNS NUMERIC AS $$
DECLARE
  prices JSONB;
  min_val NUMERIC;
BEGIN
  prices := payload->'wholesalePrices';
  IF prices IS NOT NULL AND jsonb_typeof(prices) = 'array' AND jsonb_array_length(prices) > 0 THEN
    SELECT MIN(value::NUMERIC) INTO min_val
    FROM jsonb_array_elements_text(prices) AS value
    WHERE value ~ '^([0-9]+(\.[0-9]*)?|\.[0-9]+)$';
    RETURN min_val;
  END IF;
  RETURN NULL;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
-- ============================================================
-- VIEW: v_product_pricing
-- Flattened view of products with extracted pricing
-- ============================================================
-- NOTE(review): these analytics views read the provider-specific
-- dutchie_products table (not the canonical store_products); update them
-- once data is migrated to the canonical tables.
-- NOTE: the extract_*_price() functions run per row, per column — for large
-- dashboards consider materializing these views.
CREATE OR REPLACE VIEW v_product_pricing AS
SELECT
  dp.id,
  dp.dispensary_id,
  dp.name,
  dp.brand_name,
  dp.brand_id,
  dp.type as category,
  dp.subcategory,
  dp.strain_type,
  dp.stock_status,
  dp.status,
  d.name as store_name,
  d.city,
  d.state,
  extract_min_price(dp.latest_raw_payload) as min_price,
  extract_max_price(dp.latest_raw_payload) as max_price,
  extract_wholesale_price(dp.latest_raw_payload) as wholesale_price,
  dp.thc,
  dp.cbd,
  dp.updated_at,
  dp.created_at
FROM dutchie_products dp
JOIN dispensaries d ON dp.dispensary_id = d.id;
-- ============================================================
-- VIEW: v_brand_store_presence
-- Which brands are in which stores
-- ============================================================
-- One row per (brand, store, category); brandless products are excluded.
CREATE OR REPLACE VIEW v_brand_store_presence AS
SELECT
  dp.brand_name,
  dp.brand_id,
  dp.dispensary_id,
  d.name as store_name,
  d.city,
  d.state,
  dp.type as category,
  COUNT(*) as sku_count,
  AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
  SUM(CASE WHEN dp.stock_status = 'in_stock' THEN 1 ELSE 0 END) as in_stock_count,
  MAX(dp.updated_at) as last_updated
FROM dutchie_products dp
JOIN dispensaries d ON dp.dispensary_id = d.id
WHERE dp.brand_name IS NOT NULL
GROUP BY dp.brand_name, dp.brand_id, dp.dispensary_id, d.name, d.city, d.state, dp.type;
-- ============================================================
-- VIEW: v_category_store_summary
-- Category breakdown per store
-- ============================================================
-- One row per (store, category); uncategorized products are excluded.
CREATE OR REPLACE VIEW v_category_store_summary AS
SELECT
  dp.dispensary_id,
  d.name as store_name,
  d.city,
  d.state,
  dp.type as category,
  COUNT(*) as sku_count,
  COUNT(DISTINCT dp.brand_name) as brand_count,
  AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
  MIN(extract_min_price(dp.latest_raw_payload)) as min_price,
  MAX(extract_max_price(dp.latest_raw_payload)) as max_price,
  SUM(CASE WHEN dp.stock_status = 'in_stock' THEN 1 ELSE 0 END) as in_stock_count
FROM dutchie_products dp
JOIN dispensaries d ON dp.dispensary_id = d.id
WHERE dp.type IS NOT NULL
GROUP BY dp.dispensary_id, d.name, d.city, d.state, dp.type;
-- ============================================================
-- VIEW: v_brand_summary
-- Global brand statistics
-- ============================================================
-- NOTE: ORDER BY inside a view adds sort cost and is not guaranteed to
-- survive an outer query's own ordering; callers should ORDER BY themselves.
CREATE OR REPLACE VIEW v_brand_summary AS
SELECT
  dp.brand_name,
  dp.brand_id,
  COUNT(*) as total_skus,
  COUNT(DISTINCT dp.dispensary_id) as store_count,
  COUNT(DISTINCT dp.type) as category_count,
  AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
  MIN(extract_min_price(dp.latest_raw_payload)) as min_price,
  MAX(extract_max_price(dp.latest_raw_payload)) as max_price,
  SUM(CASE WHEN dp.stock_status = 'in_stock' THEN 1 ELSE 0 END) as in_stock_skus,
  ARRAY_AGG(DISTINCT dp.type) FILTER (WHERE dp.type IS NOT NULL) as categories,
  MAX(dp.updated_at) as last_updated
FROM dutchie_products dp
WHERE dp.brand_name IS NOT NULL
GROUP BY dp.brand_name, dp.brand_id
ORDER BY total_skus DESC;
-- ============================================================
-- VIEW: v_category_summary
-- Global category statistics
-- ============================================================
CREATE OR REPLACE VIEW v_category_summary AS
SELECT
  dp.type as category,
  COUNT(*) as total_skus,
  COUNT(DISTINCT dp.brand_name) as brand_count,
  COUNT(DISTINCT dp.dispensary_id) as store_count,
  AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
  MIN(extract_min_price(dp.latest_raw_payload)) as min_price,
  MAX(extract_max_price(dp.latest_raw_payload)) as max_price,
  SUM(CASE WHEN dp.stock_status = 'in_stock' THEN 1 ELSE 0 END) as in_stock_skus
FROM dutchie_products dp
WHERE dp.type IS NOT NULL
GROUP BY dp.type
ORDER BY total_skus DESC;
-- ============================================================
-- VIEW: v_store_summary
-- Store-level statistics
-- ============================================================
-- LEFT JOIN keeps stores with no products (counts come back as 0/NULL).
CREATE OR REPLACE VIEW v_store_summary AS
SELECT
  d.id as store_id,
  d.name as store_name,
  d.city,
  d.state,
  d.chain_id,
  COUNT(dp.id) as total_skus,
  COUNT(DISTINCT dp.brand_name) as brand_count,
  COUNT(DISTINCT dp.type) as category_count,
  AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
  SUM(CASE WHEN dp.stock_status = 'in_stock' THEN 1 ELSE 0 END) as in_stock_skus,
  d.last_crawl_at,
  d.product_count
FROM dispensaries d
LEFT JOIN dutchie_products dp ON d.id = dp.dispensary_id
GROUP BY d.id, d.name, d.city, d.state, d.chain_id, d.last_crawl_at, d.product_count;
-- ============================================================
-- TABLE: brand_snapshots (for historical brand tracking)
-- ============================================================
-- One row per brand per day; written by capture_brand_snapshots().
CREATE TABLE IF NOT EXISTS brand_snapshots (
  id SERIAL PRIMARY KEY,
  brand_name VARCHAR(255) NOT NULL,
  brand_id VARCHAR(255),
  snapshot_date DATE NOT NULL,
  store_count INTEGER NOT NULL DEFAULT 0,
  total_skus INTEGER NOT NULL DEFAULT 0,
  avg_price NUMERIC(10,2),
  in_stock_skus INTEGER NOT NULL DEFAULT 0,
  categories TEXT[],
  created_at TIMESTAMPTZ DEFAULT NOW(),
  UNIQUE(brand_name, snapshot_date)
);
-- NOTE(review): idx_brand_snapshots_brand is redundant — the UNIQUE
-- (brand_name, snapshot_date) constraint's index already serves
-- brand_name-leading lookups. Consider dropping it.
CREATE INDEX IF NOT EXISTS idx_brand_snapshots_brand ON brand_snapshots(brand_name);
CREATE INDEX IF NOT EXISTS idx_brand_snapshots_date ON brand_snapshots(snapshot_date);
-- ============================================================
-- TABLE: category_snapshots (for historical category tracking)
-- ============================================================
-- One row per category per day; written by capture_category_snapshots().
CREATE TABLE IF NOT EXISTS category_snapshots (
  id SERIAL PRIMARY KEY,
  category VARCHAR(255) NOT NULL,
  snapshot_date DATE NOT NULL,
  store_count INTEGER NOT NULL DEFAULT 0,
  brand_count INTEGER NOT NULL DEFAULT 0,
  total_skus INTEGER NOT NULL DEFAULT 0,
  avg_price NUMERIC(10,2),
  in_stock_skus INTEGER NOT NULL DEFAULT 0,
  created_at TIMESTAMPTZ DEFAULT NOW(),
  UNIQUE(category, snapshot_date)
);
-- NOTE(review): idx_category_snapshots_cat is likewise redundant with the
-- UNIQUE (category, snapshot_date) constraint's index.
CREATE INDEX IF NOT EXISTS idx_category_snapshots_cat ON category_snapshots(category);
CREATE INDEX IF NOT EXISTS idx_category_snapshots_date ON category_snapshots(snapshot_date);
-- ============================================================
-- TABLE: store_change_events (for tracking store changes)
-- ============================================================
-- Append-only log of menu changes detected between crawls; old_value and
-- new_value are free-form text, extra detail goes in metadata.
CREATE TABLE IF NOT EXISTS store_change_events (
  id SERIAL PRIMARY KEY,
  store_id INTEGER NOT NULL REFERENCES dispensaries(id),
  event_type VARCHAR(50) NOT NULL, -- brand_added, brand_removed, product_added, product_removed, price_change, stock_change
  event_date DATE NOT NULL,
  brand_name VARCHAR(255),
  product_id INTEGER,
  product_name VARCHAR(500),
  category VARCHAR(255),
  old_value TEXT,
  new_value TEXT,
  metadata JSONB,
  created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_store_events_store ON store_change_events(store_id);
CREATE INDEX IF NOT EXISTS idx_store_events_type ON store_change_events(event_type);
CREATE INDEX IF NOT EXISTS idx_store_events_date ON store_change_events(event_date);
CREATE INDEX IF NOT EXISTS idx_store_events_brand ON store_change_events(brand_name);
-- ============================================================
-- TABLE: analytics_alerts
-- ============================================================
-- User-facing alert inbox; is_read drives the unread badge in the dashboard.
CREATE TABLE IF NOT EXISTS analytics_alerts (
  id SERIAL PRIMARY KEY,
  alert_type VARCHAR(50) NOT NULL, -- price_warning, brand_dropped, competitive_intrusion, restock_event
  severity VARCHAR(20) NOT NULL DEFAULT 'info', -- info, warning, critical
  title VARCHAR(255) NOT NULL,
  description TEXT,
  store_id INTEGER REFERENCES dispensaries(id),
  brand_name VARCHAR(255),
  product_id INTEGER,
  category VARCHAR(255),
  metadata JSONB,
  is_read BOOLEAN DEFAULT FALSE,
  created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_analytics_alerts_type ON analytics_alerts(alert_type);
CREATE INDEX IF NOT EXISTS idx_analytics_alerts_read ON analytics_alerts(is_read);
CREATE INDEX IF NOT EXISTS idx_analytics_alerts_created ON analytics_alerts(created_at DESC);
-- ============================================================
-- FUNCTION: Capture daily brand snapshots
-- ============================================================
-- Upserts one brand_snapshots row per brand for CURRENT_DATE and returns
-- how many rows were inserted or updated.
--
-- FIX: previously grouped by (brand_name, brand_id). When one brand_name
-- appears with more than one brand_id, that produced multiple candidate
-- rows hitting the same (brand_name, snapshot_date) unique key, and
-- ON CONFLICT DO UPDATE raises "command cannot affect row a second time".
-- Grouping by brand_name alone (with MIN(brand_id) as a deterministic
-- representative id) guarantees exactly one candidate row per conflict
-- target.
CREATE OR REPLACE FUNCTION capture_brand_snapshots()
RETURNS INTEGER AS $$
DECLARE
  inserted_count INTEGER;
BEGIN
  INSERT INTO brand_snapshots (brand_name, brand_id, snapshot_date, store_count, total_skus, avg_price, in_stock_skus, categories)
  SELECT
    brand_name,
    MIN(brand_id),
    CURRENT_DATE,
    COUNT(DISTINCT dispensary_id),
    COUNT(*),
    AVG(extract_min_price(latest_raw_payload)),
    SUM(CASE WHEN stock_status = 'in_stock' THEN 1 ELSE 0 END),
    ARRAY_AGG(DISTINCT type) FILTER (WHERE type IS NOT NULL)
  FROM dutchie_products
  WHERE brand_name IS NOT NULL
  GROUP BY brand_name
  ON CONFLICT (brand_name, snapshot_date)
  DO UPDATE SET
    store_count = EXCLUDED.store_count,
    total_skus = EXCLUDED.total_skus,
    avg_price = EXCLUDED.avg_price,
    in_stock_skus = EXCLUDED.in_stock_skus,
    categories = EXCLUDED.categories;
  GET DIAGNOSTICS inserted_count = ROW_COUNT;
  RETURN inserted_count;
END;
$$ LANGUAGE plpgsql;
-- ============================================================
-- FUNCTION: Capture daily category snapshots
-- ============================================================
-- Upserts one category_snapshots row per product category for CURRENT_DATE
-- and returns how many rows were inserted or updated.
CREATE OR REPLACE FUNCTION capture_category_snapshots()
RETURNS INTEGER AS $$
DECLARE
  affected_rows INTEGER;
BEGIN
  INSERT INTO category_snapshots (category, snapshot_date, store_count, brand_count, total_skus, avg_price, in_stock_skus)
  SELECT
    type,
    CURRENT_DATE,
    COUNT(DISTINCT dispensary_id),
    COUNT(DISTINCT brand_name),
    COUNT(*),
    AVG(extract_min_price(latest_raw_payload)),
    SUM(CASE WHEN stock_status = 'in_stock' THEN 1 ELSE 0 END)
  FROM dutchie_products
  WHERE type IS NOT NULL
  GROUP BY type
  ON CONFLICT (category, snapshot_date) DO UPDATE
    SET store_count   = EXCLUDED.store_count,
        brand_count   = EXCLUDED.brand_count,
        total_skus    = EXCLUDED.total_skus,
        avg_price     = EXCLUDED.avg_price,
        in_stock_skus = EXCLUDED.in_stock_skus;
  GET DIAGNOSTICS affected_rows = ROW_COUNT;
  RETURN affected_rows;
END;
$$ LANGUAGE plpgsql;
-- ============================================================
-- FUNCTION: Calculate price volatility for a product
-- ============================================================
-- Coefficient of variation (CV, in percent) of the recreational
-- minimum price over the last p_days days of snapshots.
-- Returns NULL when there is no usable history: no priced rows,
-- a zero average, or a single sample (STDDEV undefined).
-- Marked STABLE: reads tables, performs no writes.
CREATE OR REPLACE FUNCTION calculate_price_volatility(
    p_product_id INTEGER,
    p_days INTEGER DEFAULT 30
)
RETURNS NUMERIC AS $$
DECLARE
    std_dev NUMERIC;
    avg_price NUMERIC;
BEGIN
    -- Using dutchie_product_snapshots if available
    SELECT
        STDDEV(rec_min_price_cents / 100.0),
        AVG(rec_min_price_cents / 100.0)
    INTO std_dev, avg_price
    FROM dutchie_product_snapshots
    WHERE dutchie_product_id = p_product_id
      AND crawled_at >= NOW() - (p_days || ' days')::INTERVAL
      AND rec_min_price_cents IS NOT NULL;
    -- No history or zero average: CV is undefined.
    IF avg_price IS NULL OR avg_price = 0 THEN
        RETURN NULL;
    END IF;
    -- STDDEV is NULL with exactly one sample; make that explicit
    -- instead of relying on NULL propagating through ROUND().
    IF std_dev IS NULL THEN
        RETURN NULL;
    END IF;
    -- Return coefficient of variation (CV)
    RETURN ROUND((std_dev / avg_price) * 100, 2);
END;
$$ LANGUAGE plpgsql STABLE;
-- ============================================================
-- FUNCTION: Get brand penetration stats
-- ============================================================
-- For a brand (optionally restricted to one state):
--   total_stores / stores_carrying / penetration_pct — store coverage
--   total_skus / avg_skus_per_store — brand SKU depth
--   shelf_share_pct — brand SKUs as % of all SKUs in scope
-- FIX: the SKU counts previously ignored p_state while the store and
-- total-SKU counts honored it, which skewed avg_skus_per_store and
-- shelf_share_pct for state-filtered calls. All three CTEs now apply
-- the same state scope.
-- Marked STABLE: reads tables, performs no writes.
CREATE OR REPLACE FUNCTION get_brand_penetration(
    p_brand_name VARCHAR,
    p_state VARCHAR DEFAULT NULL
)
RETURNS TABLE (
    total_stores BIGINT,
    stores_carrying BIGINT,
    penetration_pct NUMERIC,
    total_skus BIGINT,
    avg_skus_per_store NUMERIC,
    shelf_share_pct NUMERIC
) AS $$
BEGIN
    RETURN QUERY
    WITH store_counts AS (
        -- Stores in scope, and how many of them carry the brand.
        SELECT
            COUNT(DISTINCT d.id) as total,
            COUNT(DISTINCT CASE WHEN dp.brand_name = p_brand_name THEN dp.dispensary_id END) as carrying
        FROM dispensaries d
        LEFT JOIN dutchie_products dp ON d.id = dp.dispensary_id
        WHERE (p_state IS NULL OR d.state = p_state)
    ),
    sku_counts AS (
        -- Brand SKUs within the same state scope.
        SELECT
            COUNT(*) as brand_skus,
            COUNT(DISTINCT dp.dispensary_id) as stores_with_brand
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        WHERE dp.brand_name = p_brand_name
          AND (p_state IS NULL OR d.state = p_state)
    ),
    all_skus AS (
        -- Every SKU in scope (denominator for shelf share).
        SELECT COUNT(*) as total FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        WHERE (p_state IS NULL OR d.state = p_state)
    )
    SELECT
        sc.total,
        sc.carrying,
        ROUND((sc.carrying::NUMERIC / NULLIF(sc.total, 0)) * 100, 2),
        skc.brand_skus,
        ROUND(skc.brand_skus::NUMERIC / NULLIF(skc.stores_with_brand, 0), 2),
        ROUND((skc.brand_skus::NUMERIC / NULLIF(a.total, 0)) * 100, 2)
    FROM store_counts sc, sku_counts skc, all_skus a;
END;
$$ LANGUAGE plpgsql STABLE;
-- ============================================================
-- Initial snapshot capture (run manually if needed)
-- ============================================================
-- Note: Run these after migration to capture initial snapshots:
-- SELECT capture_brand_snapshots();
-- SELECT capture_category_snapshots();
-- ============================================================
-- Grant permissions
-- ============================================================
-- Views are accessible to all roles by default
-- Catalog documentation for objects created by this migration.
-- (The view definitions themselves live earlier in this file.)
COMMENT ON VIEW v_product_pricing IS 'Flattened product view with extracted pricing from JSONB';
COMMENT ON VIEW v_brand_store_presence IS 'Brand presence across stores with SKU counts';
COMMENT ON VIEW v_brand_summary IS 'Global brand statistics';
COMMENT ON VIEW v_category_summary IS 'Global category statistics';
COMMENT ON VIEW v_store_summary IS 'Store-level statistics';
COMMENT ON TABLE analytics_cache IS 'Cache for expensive analytics queries';
COMMENT ON TABLE brand_snapshots IS 'Historical daily snapshots of brand metrics';
COMMENT ON TABLE category_snapshots IS 'Historical daily snapshots of category metrics';
COMMENT ON TABLE store_change_events IS 'Log of brand/product changes at stores';
COMMENT ON TABLE analytics_alerts IS 'Analytics-generated alerts and notifications';

View File

@@ -0,0 +1,598 @@
-- Migration 048: Production Sync + Monitoring Infrastructure
-- Phase 5: Full Production Sync + Monitoring
--
-- Creates:
-- 1. Sync orchestrator tables
-- 2. Dead-letter queue (DLQ)
-- 3. System metrics tracking
-- 4. Integrity check results
-- 5. Auto-fix audit log
-- ============================================================
-- SYNC ORCHESTRATOR TABLES
-- ============================================================
-- Orchestrator state and control.
-- Single-row table: the CHECK (id = 1) plus the seed insert below
-- guarantee exactly one row, which all workers read/update.
CREATE TABLE IF NOT EXISTS sync_orchestrator_state (
    id INTEGER PRIMARY KEY DEFAULT 1 CHECK (id = 1), -- Singleton row
    status VARCHAR(20) NOT NULL DEFAULT 'SLEEPING', -- RUNNING, SLEEPING, LOCKED, PAUSED
    -- Worker lease fields; presumably a stale last_heartbeat_at is how
    -- a dead worker's lock is detected — confirm against orchestrator code.
    current_worker_id VARCHAR(100),
    last_heartbeat_at TIMESTAMPTZ,
    last_run_started_at TIMESTAMPTZ,
    last_run_completed_at TIMESTAMPTZ,
    last_run_duration_ms INTEGER,
    last_run_payloads_processed INTEGER DEFAULT 0,
    last_run_errors INTEGER DEFAULT 0,
    consecutive_failures INTEGER DEFAULT 0,
    is_paused BOOLEAN DEFAULT FALSE,
    pause_reason TEXT,
    -- Runtime-tunable config consumed by the orchestrator.
    config JSONB DEFAULT '{
      "batchSize": 50,
      "pollIntervalMs": 5000,
      "maxRetries": 3,
      "lockTimeoutMs": 300000,
      "enableAnalyticsPrecompute": true,
      "enableIntegrityChecks": true
    }'::jsonb,
    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW()
);
-- Insert singleton row if not exists
INSERT INTO sync_orchestrator_state (id) VALUES (1) ON CONFLICT (id) DO NOTHING;
-- Sync run history: one row per orchestrator execution, with
-- per-run throughput counters and error details.
CREATE TABLE IF NOT EXISTS sync_runs (
    id SERIAL PRIMARY KEY,
    run_id UUID DEFAULT gen_random_uuid() UNIQUE NOT NULL,
    worker_id VARCHAR(100) NOT NULL,
    status VARCHAR(20) NOT NULL DEFAULT 'running', -- running, completed, failed, cancelled
    started_at TIMESTAMPTZ DEFAULT NOW(),
    finished_at TIMESTAMPTZ,
    duration_ms INTEGER,
    -- Metrics
    payloads_queued INTEGER DEFAULT 0,
    payloads_processed INTEGER DEFAULT 0,
    payloads_skipped INTEGER DEFAULT 0,
    payloads_failed INTEGER DEFAULT 0,
    payloads_dlq INTEGER DEFAULT 0,
    products_upserted INTEGER DEFAULT 0,
    products_inserted INTEGER DEFAULT 0,
    products_updated INTEGER DEFAULT 0,
    products_discontinued INTEGER DEFAULT 0,
    snapshots_created INTEGER DEFAULT 0,
    -- Error tracking
    errors JSONB DEFAULT '[]'::jsonb,
    error_summary TEXT,
    -- Diff stats (before/after)
    diff_stats JSONB DEFAULT '{}'::jsonb,
    -- Analytics precompute triggered
    analytics_updated BOOLEAN DEFAULT FALSE,
    analytics_duration_ms INTEGER,
    created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_sync_runs_status ON sync_runs(status);
CREATE INDEX IF NOT EXISTS idx_sync_runs_started_at ON sync_runs(started_at DESC);
CREATE INDEX IF NOT EXISTS idx_sync_runs_run_id ON sync_runs(run_id);
-- ============================================================
-- DEAD-LETTER QUEUE (DLQ)
-- ============================================================
-- DLQ for failed payloads. Rows are written by move_to_dlq(), which
-- copies the original raw_payloads row here and appends to
-- error_history; the raw JSON is preserved so the payload can be
-- replayed after the underlying problem is fixed.
CREATE TABLE IF NOT EXISTS raw_payloads_dlq (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    original_payload_id UUID NOT NULL,
    dispensary_id INTEGER REFERENCES dispensaries(id),
    state_code VARCHAR(2),
    platform VARCHAR(50) DEFAULT 'dutchie',
    -- Original payload data (preserved)
    raw_json JSONB NOT NULL,
    product_count INTEGER,
    pricing_type VARCHAR(10),
    crawl_mode VARCHAR(20),
    -- DLQ metadata
    moved_to_dlq_at TIMESTAMPTZ DEFAULT NOW(),
    failure_count INTEGER DEFAULT 0,
    -- Error history (array of error objects)
    error_history JSONB DEFAULT '[]'::jsonb,
    last_error_type VARCHAR(50),
    last_error_message TEXT,
    last_error_at TIMESTAMPTZ,
    -- Retry tracking
    retry_count INTEGER DEFAULT 0,
    last_retry_at TIMESTAMPTZ,
    next_retry_at TIMESTAMPTZ,
    -- Resolution
    status VARCHAR(20) DEFAULT 'pending', -- pending, retrying, resolved, abandoned
    resolved_at TIMESTAMPTZ,
    resolved_by VARCHAR(100),
    resolution_notes TEXT,
    created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_dlq_status ON raw_payloads_dlq(status);
CREATE INDEX IF NOT EXISTS idx_dlq_dispensary ON raw_payloads_dlq(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_dlq_error_type ON raw_payloads_dlq(last_error_type);
CREATE INDEX IF NOT EXISTS idx_dlq_moved_at ON raw_payloads_dlq(moved_to_dlq_at DESC);
-- ============================================================
-- SYSTEM METRICS
-- ============================================================
-- System metrics time series (append-only; record_metric() writes
-- here, cleanup_old_metrics() prunes rows older than 7 days).
CREATE TABLE IF NOT EXISTS system_metrics (
    id SERIAL PRIMARY KEY,
    metric_name VARCHAR(100) NOT NULL,
    metric_value NUMERIC NOT NULL,
    labels JSONB DEFAULT '{}',
    recorded_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_metrics_name_time ON system_metrics(metric_name, recorded_at DESC);
CREATE INDEX IF NOT EXISTS idx_metrics_recorded_at ON system_metrics(recorded_at DESC);
-- Metrics snapshot (current state, updated continuously):
-- one row per metric name, upserted by record_metric() for O(1) reads.
CREATE TABLE IF NOT EXISTS system_metrics_current (
    metric_name VARCHAR(100) PRIMARY KEY,
    metric_value NUMERIC NOT NULL,
    labels JSONB DEFAULT '{}',
    updated_at TIMESTAMPTZ DEFAULT NOW()
);
-- Error buckets for classification (populated by record_error()).
CREATE TABLE IF NOT EXISTS error_buckets (
    id SERIAL PRIMARY KEY,
    error_type VARCHAR(50) NOT NULL,
    error_message TEXT,
    source_table VARCHAR(50),
    source_id TEXT,
    dispensary_id INTEGER,
    state_code VARCHAR(2),
    context JSONB DEFAULT '{}',
    occurred_at TIMESTAMPTZ DEFAULT NOW(),
    acknowledged BOOLEAN DEFAULT FALSE,
    acknowledged_at TIMESTAMPTZ,
    acknowledged_by VARCHAR(100)
);
CREATE INDEX IF NOT EXISTS idx_error_buckets_type ON error_buckets(error_type);
CREATE INDEX IF NOT EXISTS idx_error_buckets_occurred ON error_buckets(occurred_at DESC);
-- Partial index keeps the unacknowledged-errors scan small.
CREATE INDEX IF NOT EXISTS idx_error_buckets_unacked ON error_buckets(acknowledged) WHERE acknowledged = FALSE;
-- ============================================================
-- INTEGRITY CHECK RESULTS
-- ============================================================
-- One row per integrity-check execution; summary counters here,
-- per-check detail in integrity_check_results (FK on run_id).
CREATE TABLE IF NOT EXISTS integrity_check_runs (
    id SERIAL PRIMARY KEY,
    run_id UUID DEFAULT gen_random_uuid() UNIQUE NOT NULL,
    check_type VARCHAR(50) NOT NULL, -- daily, on_demand, scheduled
    triggered_by VARCHAR(100),
    started_at TIMESTAMPTZ DEFAULT NOW(),
    finished_at TIMESTAMPTZ,
    status VARCHAR(20) DEFAULT 'running', -- running, completed, failed
    -- Results summary
    total_checks INTEGER DEFAULT 0,
    passed_checks INTEGER DEFAULT 0,
    failed_checks INTEGER DEFAULT 0,
    warning_checks INTEGER DEFAULT 0,
    -- Detailed results
    results JSONB DEFAULT '[]'::jsonb,
    created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_integrity_runs_status ON integrity_check_runs(status);
CREATE INDEX IF NOT EXISTS idx_integrity_runs_started ON integrity_check_runs(started_at DESC);
-- Individual integrity check results.
-- ON DELETE CASCADE: deleting a run removes its detail rows.
CREATE TABLE IF NOT EXISTS integrity_check_results (
    id SERIAL PRIMARY KEY,
    run_id UUID REFERENCES integrity_check_runs(run_id) ON DELETE CASCADE,
    check_name VARCHAR(100) NOT NULL,
    check_category VARCHAR(50) NOT NULL,
    status VARCHAR(20) NOT NULL, -- passed, failed, warning, skipped
    -- Check details
    expected_value TEXT,
    actual_value TEXT,
    difference TEXT,
    affected_count INTEGER DEFAULT 0,
    -- Context
    details JSONB DEFAULT '{}',
    affected_ids JSONB DEFAULT '[]'::jsonb,
    -- Remediation
    can_auto_fix BOOLEAN DEFAULT FALSE,
    fix_routine VARCHAR(100),
    checked_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_integrity_results_run ON integrity_check_results(run_id);
CREATE INDEX IF NOT EXISTS idx_integrity_results_status ON integrity_check_results(status);
-- ============================================================
-- AUTO-FIX AUDIT LOG
-- ============================================================
-- Audit trail for automated remediation routines: who/what triggered
-- each run, what it changed, and dry-run previews.
CREATE TABLE IF NOT EXISTS auto_fix_runs (
    id SERIAL PRIMARY KEY,
    run_id UUID DEFAULT gen_random_uuid() UNIQUE NOT NULL,
    routine_name VARCHAR(100) NOT NULL,
    triggered_by VARCHAR(100) NOT NULL,
    trigger_type VARCHAR(20) NOT NULL, -- manual, auto, scheduled
    started_at TIMESTAMPTZ DEFAULT NOW(),
    finished_at TIMESTAMPTZ,
    status VARCHAR(20) DEFAULT 'running', -- running, completed, failed, rolled_back
    -- What was changed
    rows_affected INTEGER DEFAULT 0,
    changes JSONB DEFAULT '[]'::jsonb,
    -- Dry run support
    is_dry_run BOOLEAN DEFAULT FALSE,
    dry_run_preview JSONB,
    -- Error handling
    error_message TEXT,
    created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_fix_runs_routine ON auto_fix_runs(routine_name);
CREATE INDEX IF NOT EXISTS idx_fix_runs_started ON auto_fix_runs(started_at DESC);
-- ============================================================
-- ALERTS TABLE
-- ============================================================
-- System alerts, deduplicated by fingerprint via upsert_alert().
CREATE TABLE IF NOT EXISTS system_alerts (
    id SERIAL PRIMARY KEY,
    alert_type VARCHAR(50) NOT NULL,
    severity VARCHAR(20) NOT NULL, -- info, warning, error, critical
    title VARCHAR(255) NOT NULL,
    message TEXT,
    source VARCHAR(100),
    -- Context
    context JSONB DEFAULT '{}',
    -- State
    status VARCHAR(20) DEFAULT 'active', -- active, acknowledged, resolved, muted
    acknowledged_at TIMESTAMPTZ,
    acknowledged_by VARCHAR(100),
    resolved_at TIMESTAMPTZ,
    resolved_by VARCHAR(100),
    -- Deduplication
    -- NOTE(review): fingerprint is indexed but NOT unique, so dedup
    -- relies entirely on upsert_alert()'s check-then-insert; concurrent
    -- callers can still create duplicate active alerts — consider a
    -- partial unique index on (fingerprint) WHERE status = 'active'.
    fingerprint VARCHAR(64), -- Hash for dedup
    occurrence_count INTEGER DEFAULT 1,
    first_occurred_at TIMESTAMPTZ DEFAULT NOW(),
    last_occurred_at TIMESTAMPTZ DEFAULT NOW(),
    created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_alerts_status ON system_alerts(status);
CREATE INDEX IF NOT EXISTS idx_alerts_severity ON system_alerts(severity);
CREATE INDEX IF NOT EXISTS idx_alerts_type ON system_alerts(alert_type);
CREATE INDEX IF NOT EXISTS idx_alerts_fingerprint ON system_alerts(fingerprint);
CREATE INDEX IF NOT EXISTS idx_alerts_active ON system_alerts(status, created_at DESC) WHERE status = 'active';
-- ============================================================
-- HELPER VIEWS
-- ============================================================
-- Current sync status view: the orchestrator singleton row plus live
-- backlog counts and a 24h run summary (correlated scalar subqueries,
-- recomputed on every read).
-- NOTE(review): assumes raw_payloads has a boolean `processed` column
-- (it is also written by move_to_dlq below) — confirm in earlier migrations.
CREATE OR REPLACE VIEW v_sync_status AS
SELECT
    sos.status as orchestrator_status,
    sos.current_worker_id,
    sos.last_heartbeat_at,
    sos.is_paused,
    sos.pause_reason,
    sos.consecutive_failures,
    sos.last_run_started_at,
    sos.last_run_completed_at,
    sos.last_run_duration_ms,
    sos.last_run_payloads_processed,
    sos.last_run_errors,
    sos.config,
    (SELECT COUNT(*) FROM raw_payloads WHERE processed = FALSE) as unprocessed_payloads,
    (SELECT COUNT(*) FROM raw_payloads_dlq WHERE status = 'pending') as dlq_pending,
    (SELECT COUNT(*) FROM system_alerts WHERE status = 'active') as active_alerts,
    (
        SELECT json_build_object(
            'total', COUNT(*),
            'completed', COUNT(*) FILTER (WHERE status = 'completed'),
            'failed', COUNT(*) FILTER (WHERE status = 'failed')
        )
        FROM sync_runs
        WHERE started_at >= NOW() - INTERVAL '24 hours'
    ) as runs_24h
FROM sync_orchestrator_state sos
WHERE sos.id = 1;
-- DLQ summary view: counts per (status, error type) with age bounds.
CREATE OR REPLACE VIEW v_dlq_summary AS
SELECT
    status,
    last_error_type,
    COUNT(*) as count,
    MIN(moved_to_dlq_at) as oldest,
    MAX(moved_to_dlq_at) as newest
FROM raw_payloads_dlq
GROUP BY status, last_error_type
ORDER BY count DESC;
-- Error bucket summary (last 24h)
CREATE OR REPLACE VIEW v_error_summary AS
SELECT
    error_type,
    COUNT(*) as count,
    COUNT(*) FILTER (WHERE acknowledged = FALSE) as unacknowledged,
    MIN(occurred_at) as first_occurred,
    MAX(occurred_at) as last_occurred
FROM error_buckets
WHERE occurred_at >= NOW() - INTERVAL '24 hours'
GROUP BY error_type
ORDER BY count DESC;
-- Metrics summary view: current gauge values plus staleness (age).
CREATE OR REPLACE VIEW v_metrics_summary AS
SELECT
    metric_name,
    metric_value,
    labels,
    updated_at,
    NOW() - updated_at as age
FROM system_metrics_current
ORDER BY metric_name;
-- ============================================================
-- HELPER FUNCTIONS
-- ============================================================
-- Record a metric: appends to the system_metrics time series AND
-- upserts the current value in system_metrics_current. The two writes
-- are not independent — history first, then current — so both commit
-- or neither (single transaction).
CREATE OR REPLACE FUNCTION record_metric(
    p_name VARCHAR(100),
    p_value NUMERIC,
    p_labels JSONB DEFAULT '{}'
) RETURNS VOID AS $$
BEGIN
    -- Insert into time series
    INSERT INTO system_metrics (metric_name, metric_value, labels)
    VALUES (p_name, p_value, p_labels);
    -- Upsert current value
    INSERT INTO system_metrics_current (metric_name, metric_value, labels, updated_at)
    VALUES (p_name, p_value, p_labels, NOW())
    ON CONFLICT (metric_name) DO UPDATE SET
        metric_value = EXCLUDED.metric_value,
        labels = EXCLUDED.labels,
        updated_at = NOW();
END;
$$ LANGUAGE plpgsql;
-- Record an error: inserts a classified row into error_buckets and
-- bumps the matching error_count_<type> gauge via record_metric().
-- Returns the id of the new error_buckets row.
-- NOTE: the counter bump is read-then-write, so concurrent callers
-- may lose increments (same as the original inline form).
CREATE OR REPLACE FUNCTION record_error(
    p_type VARCHAR(50),
    p_message TEXT,
    p_source_table VARCHAR(50) DEFAULT NULL,
    p_source_id TEXT DEFAULT NULL,
    p_dispensary_id INTEGER DEFAULT NULL,
    p_context JSONB DEFAULT '{}'
) RETURNS INTEGER AS $$
DECLARE
    v_error_id INTEGER;
    v_metric_name VARCHAR(150);
    v_current_value NUMERIC;
BEGIN
    -- Persist the classified error.
    INSERT INTO error_buckets (
        error_type, error_message, source_table, source_id,
        dispensary_id, context
    )
    VALUES (
        p_type, p_message, p_source_table, p_source_id,
        p_dispensary_id, p_context
    )
    RETURNING id INTO v_error_id;
    -- Increment the per-type error counter metric.
    v_metric_name := 'error_count_' || p_type;
    SELECT metric_value INTO v_current_value
    FROM system_metrics_current
    WHERE metric_name = v_metric_name;
    PERFORM record_metric(v_metric_name, COALESCE(v_current_value, 0) + 1);
    RETURN v_error_id;
END;
$$ LANGUAGE plpgsql;
-- Create or update alert (with deduplication).
-- Fingerprint = md5(type || title || source); an existing ACTIVE alert
-- with the same fingerprint gets its occurrence_count bumped and its
-- context replaced, otherwise a new alert row is created.
-- Returns the alert id in either case.
-- NOTE(review): the SELECT-then-INSERT is not atomic and system_alerts
-- has no unique constraint on fingerprint, so two concurrent callers
-- can both miss the SELECT and insert duplicates — TODO confirm whether
-- callers serialize, or add a partial unique index.
CREATE OR REPLACE FUNCTION upsert_alert(
    p_type VARCHAR(50),
    p_severity VARCHAR(20),
    p_title VARCHAR(255),
    p_message TEXT DEFAULT NULL,
    p_source VARCHAR(100) DEFAULT NULL,
    p_context JSONB DEFAULT '{}'
) RETURNS INTEGER AS $$
DECLARE
    v_fingerprint VARCHAR(64);
    v_id INTEGER;
BEGIN
    -- Generate fingerprint for dedup
    v_fingerprint := md5(p_type || p_title || COALESCE(p_source, ''));
    -- Try to find existing active alert
    SELECT id INTO v_id
    FROM system_alerts
    WHERE fingerprint = v_fingerprint AND status = 'active';
    IF v_id IS NOT NULL THEN
        -- Update existing alert
        UPDATE system_alerts
        SET occurrence_count = occurrence_count + 1,
            last_occurred_at = NOW(),
            context = p_context
        WHERE id = v_id;
    ELSE
        -- Create new alert
        INSERT INTO system_alerts (
            alert_type, severity, title, message, source, context, fingerprint
        )
        VALUES (
            p_type, p_severity, p_title, p_message, p_source, p_context, v_fingerprint
        )
        RETURNING id INTO v_id;
    END IF;
    RETURN v_id;
END;
$$ LANGUAGE plpgsql;
-- Move payload to DLQ.
-- Copies a failed raw_payloads row into raw_payloads_dlq (preserving
-- the raw JSON and appending to the error-history array), marks the
-- original payload processed, bumps the DLQ counter metric, and raises
-- a deduplicated alert. Returns the new DLQ row id.
-- Raises an exception if the payload id does not exist.
CREATE OR REPLACE FUNCTION move_to_dlq(
    p_payload_id UUID,
    p_error_type VARCHAR(50),
    p_error_message TEXT
) RETURNS UUID AS $$
DECLARE
    v_dlq_id UUID;
    v_payload RECORD;
    v_history JSONB;
BEGIN
    -- Get the original payload
    SELECT * INTO v_payload
    FROM raw_payloads
    WHERE id = p_payload_id;
    -- FOUND is the reliable miss-check after SELECT INTO a RECORD;
    -- "v_payload IS NULL" is only true when every column is NULL.
    IF NOT FOUND THEN
        RAISE EXCEPTION 'Payload not found: %', p_payload_id;
    END IF;
    -- Build the error history safely. hydration_error is free-form
    -- text (this function itself writes 'Moved to DLQ: ...' into it
    -- below), so the ::jsonb cast can fail at runtime; fall back to an
    -- empty array instead of aborting the whole move.
    BEGIN
        v_history := COALESCE(v_payload.hydration_error::jsonb, '[]'::jsonb);
    EXCEPTION WHEN others THEN
        v_history := '[]'::jsonb;
    END;
    -- Insert into DLQ
    INSERT INTO raw_payloads_dlq (
        original_payload_id, dispensary_id, state_code, platform,
        raw_json, product_count, pricing_type, crawl_mode,
        failure_count, last_error_type, last_error_message, last_error_at,
        error_history
    )
    VALUES (
        p_payload_id, v_payload.dispensary_id,
        (SELECT state FROM dispensaries WHERE id = v_payload.dispensary_id),
        v_payload.platform,
        v_payload.raw_json, v_payload.product_count, v_payload.pricing_type, v_payload.crawl_mode,
        v_payload.hydration_attempts,
        p_error_type, p_error_message, NOW(),
        v_history || jsonb_build_object(
            'type', p_error_type,
            'message', p_error_message,
            'at', NOW()
        )
    )
    RETURNING id INTO v_dlq_id;
    -- Mark original as processed (moved to DLQ)
    UPDATE raw_payloads
    SET processed = TRUE,
        hydration_error = 'Moved to DLQ: ' || p_error_message
    WHERE id = p_payload_id;
    -- Record metric
    PERFORM record_metric('payloads_dlq_total',
        COALESCE((SELECT metric_value FROM system_metrics_current WHERE metric_name = 'payloads_dlq_total'), 0) + 1
    );
    -- Create alert for DLQ
    PERFORM upsert_alert(
        'DLQ_ARRIVAL',
        'warning',
        'Payload moved to Dead-Letter Queue',
        p_error_message,
        'hydration',
        jsonb_build_object('payload_id', p_payload_id, 'dlq_id', v_dlq_id, 'error_type', p_error_type)
    );
    RETURN v_dlq_id;
END;
$$ LANGUAGE plpgsql;
-- Cleanup old metrics (keep 7 days of time series).
-- Prunes system_metrics rows older than 7 days; system_metrics_current
-- is untouched. Returns the number of rows removed.
CREATE OR REPLACE FUNCTION cleanup_old_metrics() RETURNS INTEGER AS $$
DECLARE
    rows_removed INTEGER;
BEGIN
    DELETE FROM system_metrics
    WHERE recorded_at < NOW() - INTERVAL '7 days';
    GET DIAGNOSTICS rows_removed = ROW_COUNT;
    RETURN rows_removed;
END;
$$ LANGUAGE plpgsql;
-- ============================================================
-- ENSURE RAW_PAYLOADS HAS REQUIRED COLUMNS
-- ============================================================
-- Add state column to raw_payloads if not exists.
-- ADD COLUMN IF NOT EXISTS replaces the previous information_schema
-- probe inside a DO block: same effect, no dynamic SQL, and it is the
-- idiom this repo already mandates (see migration 050's rules).
ALTER TABLE raw_payloads ADD COLUMN IF NOT EXISTS state_code VARCHAR(2);
-- ============================================================
-- INITIAL METRICS
-- ============================================================
-- Initialize core metrics so dashboards and the increment helpers
-- (record_error, move_to_dlq) have rows to read from day one.
-- DO NOTHING keeps re-runs from resetting live values.
INSERT INTO system_metrics_current (metric_name, metric_value, labels)
VALUES
    ('payloads_unprocessed', 0, '{}'),
    ('payloads_processed_today', 0, '{}'),
    ('hydration_errors', 0, '{}'),
    ('hydration_success_rate', 100, '{}'),
    ('canonical_rows_inserted', 0, '{}'),
    ('canonical_rows_updated', 0, '{}'),
    ('canonical_rows_discontinued', 0, '{}'),
    ('snapshot_volume', 0, '{}'),
    ('ingestion_latency_avg_ms', 0, '{}'),
    ('payloads_dlq_total', 0, '{}')
ON CONFLICT (metric_name) DO NOTHING;
-- ============================================================
-- COMMENTS
-- ============================================================
-- Catalog documentation for every table created by migration 048.
COMMENT ON TABLE sync_orchestrator_state IS 'Singleton table tracking orchestrator status and config';
COMMENT ON TABLE sync_runs IS 'History of sync runs with metrics';
COMMENT ON TABLE raw_payloads_dlq IS 'Dead-letter queue for failed payloads';
COMMENT ON TABLE system_metrics IS 'Time-series metrics storage';
COMMENT ON TABLE system_metrics_current IS 'Current metric values (fast lookup)';
COMMENT ON TABLE error_buckets IS 'Classified errors for monitoring';
COMMENT ON TABLE integrity_check_runs IS 'Integrity check execution history';
COMMENT ON TABLE integrity_check_results IS 'Individual check results';
COMMENT ON TABLE auto_fix_runs IS 'Audit log for auto-fix routines';
COMMENT ON TABLE system_alerts IS 'System alerts with deduplication';

View File

@@ -0,0 +1,750 @@
-- ============================================================================
-- Migration 050: CannaiQ Canonical Schema v2
-- ============================================================================
--
-- Purpose: Add canonical tables for multi-state analytics, pricing engine,
--          promotions, intelligence, and brand/buyer portals.
--
-- RULES:
--   - STRICTLY ADDITIVE (no DROP, DELETE, TRUNCATE, or ALTER column type)
--   - All new tables use IF NOT EXISTS
--   - All new columns use ADD COLUMN IF NOT EXISTS
--   - All indexes use IF NOT EXISTS
--   - Compatible with existing dutchie_products, dispensaries, etc.
--
-- Run with:
--   psql $CANNAIQ_DB_URL -f migrations/050_cannaiq_canonical_v2.sql
--
-- ============================================================================
-- ============================================================================
-- SECTION 1: STATES TABLE
-- ============================================================================
-- Reference table for US states. Already may exist from 041/043.
-- This is idempotent.
CREATE TABLE IF NOT EXISTS states (
    id SERIAL PRIMARY KEY,
    code VARCHAR(2) NOT NULL UNIQUE,
    name VARCHAR(100) NOT NULL,
    timezone VARCHAR(50) DEFAULT 'America/Phoenix',
    is_active BOOLEAN DEFAULT TRUE,
    crawl_enabled BOOLEAN DEFAULT TRUE,
    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW()
);
-- Insert states if not present.
-- On conflict only the timezone is refreshed (names are left as-is).
INSERT INTO states (code, name, timezone) VALUES
    ('AZ', 'Arizona', 'America/Phoenix'),
    ('CA', 'California', 'America/Los_Angeles'),
    ('CO', 'Colorado', 'America/Denver'),
    ('FL', 'Florida', 'America/New_York'),
    ('IL', 'Illinois', 'America/Chicago'),
    ('MA', 'Massachusetts', 'America/New_York'),
    ('MD', 'Maryland', 'America/New_York'),
    ('MI', 'Michigan', 'America/Detroit'),
    ('MO', 'Missouri', 'America/Chicago'),
    ('NV', 'Nevada', 'America/Los_Angeles'),
    ('NJ', 'New Jersey', 'America/New_York'),
    ('NY', 'New York', 'America/New_York'),
    ('OH', 'Ohio', 'America/New_York'),
    ('OK', 'Oklahoma', 'America/Chicago'),
    ('OR', 'Oregon', 'America/Los_Angeles'),
    ('PA', 'Pennsylvania', 'America/New_York'),
    ('WA', 'Washington', 'America/Los_Angeles')
ON CONFLICT (code) DO UPDATE SET
    timezone = EXCLUDED.timezone,
    updated_at = NOW();
CREATE INDEX IF NOT EXISTS idx_states_code ON states(code);
CREATE INDEX IF NOT EXISTS idx_states_active ON states(is_active) WHERE is_active = TRUE;
COMMENT ON TABLE states IS 'US states where CannaiQ operates. Single source of truth for state configuration.';
-- ============================================================================
-- SECTION 2: CHAINS TABLE (Retail Groups)
-- ============================================================================
-- Chains are multi-location operators like Curaleaf, Trulieve, Harvest, etc.
CREATE TABLE IF NOT EXISTS chains (
    id SERIAL PRIMARY KEY,
    name VARCHAR(255) NOT NULL,
    slug VARCHAR(255) NOT NULL UNIQUE,
    -- Branding
    website_url TEXT,
    logo_url TEXT,
    description TEXT,
    -- Business info
    headquarters_city VARCHAR(100),
    headquarters_state_id INTEGER REFERENCES states(id),
    founded_year INTEGER,
    -- Status
    is_active BOOLEAN DEFAULT TRUE,
    is_public BOOLEAN DEFAULT FALSE, -- Publicly traded?
    stock_ticker VARCHAR(10),
    -- Metadata
    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_chains_slug ON chains(slug);
CREATE INDEX IF NOT EXISTS idx_chains_active ON chains(is_active) WHERE is_active = TRUE;
COMMENT ON TABLE chains IS 'Retail chains/groups that own multiple dispensary locations.';
-- ============================================================================
-- SECTION 3: CANONICAL BRANDS TABLE
-- ============================================================================
-- This is the master brand catalog across all providers and states.
-- Distinct from the per-store `brands` table which tracks store-level brand presence.
CREATE TABLE IF NOT EXISTS canonical_brands (
    id SERIAL PRIMARY KEY,
    name VARCHAR(255) NOT NULL,
    slug VARCHAR(255) NOT NULL UNIQUE,
    -- External IDs from various platforms
    dutchie_brand_id VARCHAR(100),
    jane_brand_id VARCHAR(100),
    treez_brand_id VARCHAR(100),
    weedmaps_brand_id VARCHAR(100),
    -- Branding
    logo_url TEXT,
    local_logo_path TEXT, -- Local storage path
    website_url TEXT,
    instagram_handle VARCHAR(100),
    description TEXT,
    -- Classification
    is_portfolio_brand BOOLEAN DEFAULT FALSE, -- TRUE if brand we represent
    is_house_brand BOOLEAN DEFAULT FALSE, -- TRUE if dispensary house brand
    parent_company VARCHAR(255), -- Parent company name if subsidiary
    -- State presence
    states_available TEXT[], -- Array of state codes where brand is present
    -- Status
    is_active BOOLEAN DEFAULT TRUE,
    is_verified BOOLEAN DEFAULT FALSE, -- Manually verified brand info
    verified_at TIMESTAMPTZ,
    -- Metadata
    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_canonical_brands_slug ON canonical_brands(slug);
CREATE INDEX IF NOT EXISTS idx_canonical_brands_dutchie ON canonical_brands(dutchie_brand_id) WHERE dutchie_brand_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_canonical_brands_portfolio ON canonical_brands(is_portfolio_brand) WHERE is_portfolio_brand = TRUE;
-- GIN index supports array-membership queries on states_available.
CREATE INDEX IF NOT EXISTS idx_canonical_brands_states ON canonical_brands USING GIN(states_available);
COMMENT ON TABLE canonical_brands IS 'Canonical brand catalog across all providers. Master brand reference.';
COMMENT ON COLUMN canonical_brands.is_portfolio_brand IS 'TRUE if this is a brand CannaiQ represents/manages.';
-- ============================================================================
-- SECTION 4: CRAWL_RUNS TABLE
-- ============================================================================
-- One record per crawl execution. Links to snapshots.
-- ON DELETE CASCADE: removing a dispensary removes its crawl history.
CREATE TABLE IF NOT EXISTS crawl_runs (
    id SERIAL PRIMARY KEY,
    dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
    state_id INTEGER REFERENCES states(id),
    -- Provider info
    provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',
    -- Timing
    started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    finished_at TIMESTAMPTZ,
    duration_ms INTEGER,
    -- Status
    status VARCHAR(20) NOT NULL DEFAULT 'running', -- running, success, failed, partial
    error_code VARCHAR(50),
    error_message TEXT,
    http_status INTEGER,
    -- Results
    products_found INTEGER DEFAULT 0,
    products_new INTEGER DEFAULT 0,
    products_updated INTEGER DEFAULT 0,
    products_missing INTEGER DEFAULT 0, -- Products gone from feed
    snapshots_written INTEGER DEFAULT 0,
    -- Infrastructure
    worker_id VARCHAR(100),
    worker_hostname VARCHAR(100),
    proxy_used TEXT,
    trigger_type VARCHAR(50) DEFAULT 'scheduled', -- scheduled, manual, api
    -- Metadata
    metadata JSONB DEFAULT '{}',
    created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary ON crawl_runs(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_state ON crawl_runs(state_id) WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_crawl_runs_status ON crawl_runs(status);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_started ON crawl_runs(started_at DESC);
-- Composite index serves "latest runs for store X" queries.
CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary_started ON crawl_runs(dispensary_id, started_at DESC);
COMMENT ON TABLE crawl_runs IS 'Each crawl execution. Links to snapshots and traces.';
-- ============================================================================
-- SECTION 5: STORE_PRODUCTS TABLE (Current Menu State)
-- ============================================================================
-- Canonical representation of what's currently on the menu.
-- Provider-agnostic structure for analytics.
-- Natural key: (dispensary_id, provider, provider_product_id) — see the
-- UNIQUE constraint at the bottom of the table definition.
CREATE TABLE IF NOT EXISTS store_products (
    id SERIAL PRIMARY KEY,
    dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
    state_id INTEGER REFERENCES states(id),
    -- Links to canonical entities
    canonical_brand_id INTEGER REFERENCES canonical_brands(id) ON DELETE SET NULL,
    category_id INTEGER REFERENCES categories(id) ON DELETE SET NULL,
    -- Provider-specific identifiers
    provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',
    provider_product_id VARCHAR(100) NOT NULL, -- Platform product ID
    provider_brand_id VARCHAR(100), -- Platform brand ID
    enterprise_product_id VARCHAR(100), -- Cross-store product ID
    -- Raw data from platform (not normalized)
    name VARCHAR(500) NOT NULL,
    brand_name VARCHAR(255),
    category VARCHAR(100),
    subcategory VARCHAR(100),
    strain_type VARCHAR(50),
    description TEXT,
    -- Pricing (current)
    price_rec NUMERIC(10,2),
    price_med NUMERIC(10,2),
    price_rec_special NUMERIC(10,2),
    price_med_special NUMERIC(10,2),
    is_on_special BOOLEAN DEFAULT FALSE,
    special_name TEXT,
    discount_percent NUMERIC(5,2),
    price_unit VARCHAR(20) DEFAULT 'each', -- gram, ounce, each, mg
    -- Inventory
    is_in_stock BOOLEAN DEFAULT TRUE,
    stock_quantity INTEGER,
    stock_status VARCHAR(50) DEFAULT 'in_stock', -- in_stock, out_of_stock, low_stock, missing_from_feed
    -- Potency
    thc_percent NUMERIC(5,2),
    cbd_percent NUMERIC(5,2),
    thc_mg NUMERIC(10,2),
    cbd_mg NUMERIC(10,2),
    -- Weight/Size
    weight_value NUMERIC(10,2),
    weight_unit VARCHAR(20), -- g, oz, mg
    -- Images
    image_url TEXT,
    local_image_path TEXT,
    thumbnail_url TEXT,
    -- Flags
    is_featured BOOLEAN DEFAULT FALSE,
    medical_only BOOLEAN DEFAULT FALSE,
    rec_only BOOLEAN DEFAULT FALSE,
    -- Menu position (for tracking prominence)
    menu_position INTEGER,
    -- Timestamps
    first_seen_at TIMESTAMPTZ DEFAULT NOW(),
    last_seen_at TIMESTAMPTZ DEFAULT NOW(),
    last_price_change_at TIMESTAMPTZ,
    last_stock_change_at TIMESTAMPTZ,
    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW(),
    UNIQUE(dispensary_id, provider, provider_product_id)
);
CREATE INDEX IF NOT EXISTS idx_store_products_dispensary ON store_products(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_store_products_state ON store_products(state_id) WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_store_products_brand ON store_products(canonical_brand_id) WHERE canonical_brand_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_store_products_category ON store_products(category) WHERE category IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_store_products_in_stock ON store_products(dispensary_id, is_in_stock);
CREATE INDEX IF NOT EXISTS idx_store_products_special ON store_products(dispensary_id, is_on_special) WHERE is_on_special = TRUE;
CREATE INDEX IF NOT EXISTS idx_store_products_last_seen ON store_products(last_seen_at DESC);
CREATE INDEX IF NOT EXISTS idx_store_products_provider ON store_products(provider);
CREATE INDEX IF NOT EXISTS idx_store_products_enterprise ON store_products(enterprise_product_id) WHERE enterprise_product_id IS NOT NULL;
COMMENT ON TABLE store_products IS 'Current state of products on each dispensary menu. Provider-agnostic.';
-- ============================================================================
-- SECTION 6: STORE_PRODUCT_SNAPSHOTS TABLE (Historical Data)
-- ============================================================================
-- Time-series data for analytics. One row per product per crawl.
-- CRITICAL: NEVER DELETE from this table.
CREATE TABLE IF NOT EXISTS store_product_snapshots (
id SERIAL PRIMARY KEY,
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
-- SET NULL (not CASCADE): history must survive deletion of the current-state row.
store_product_id INTEGER REFERENCES store_products(id) ON DELETE SET NULL,
state_id INTEGER REFERENCES states(id),
-- Provider info
provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',
provider_product_id VARCHAR(100),
-- Link to crawl run
crawl_run_id INTEGER REFERENCES crawl_runs(id) ON DELETE SET NULL,
-- Capture timestamp
captured_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
-- Raw data from platform (denormalized copy so each snapshot is self-contained)
name VARCHAR(500),
brand_name VARCHAR(255),
category VARCHAR(100),
subcategory VARCHAR(100),
-- Pricing at time of capture
price_rec NUMERIC(10,2),
price_med NUMERIC(10,2),
price_rec_special NUMERIC(10,2),
price_med_special NUMERIC(10,2),
is_on_special BOOLEAN DEFAULT FALSE,
discount_percent NUMERIC(5,2),
-- Inventory at time of capture
is_in_stock BOOLEAN DEFAULT TRUE,
stock_quantity INTEGER,
stock_status VARCHAR(50) DEFAULT 'in_stock',
is_present_in_feed BOOLEAN DEFAULT TRUE, -- FALSE = missing from feed
-- Potency at time of capture
thc_percent NUMERIC(5,2),
cbd_percent NUMERIC(5,2),
-- Menu position (for tracking prominence changes)
menu_position INTEGER,
-- Image URL at time of capture
image_url TEXT,
-- Full raw response for debugging
raw_data JSONB,
created_at TIMESTAMPTZ DEFAULT NOW()
);
-- Partitioning-ready indexes (for future table partitioning by month)
CREATE INDEX IF NOT EXISTS idx_snapshots_dispensary_captured ON store_product_snapshots(dispensary_id, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_state_captured ON store_product_snapshots(state_id, captured_at DESC) WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_product_captured ON store_product_snapshots(store_product_id, captured_at DESC) WHERE store_product_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_crawl_run ON store_product_snapshots(crawl_run_id) WHERE crawl_run_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_captured_at ON store_product_snapshots(captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_brand ON store_product_snapshots(brand_name) WHERE brand_name IS NOT NULL;
COMMENT ON TABLE store_product_snapshots IS 'Historical crawl data. One row per product per crawl. NEVER DELETE.';
-- ============================================================================
-- SECTION 7: ADD state_id AND chain_id TO DISPENSARIES
-- ============================================================================
-- Link dispensaries to states and chains tables.
-- ADD COLUMN ... REFERENCES creates the FK inline, but only when the column is
-- actually added; re-runs are no-ops thanks to IF NOT EXISTS.
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS state_id INTEGER REFERENCES states(id);
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS chain_id INTEGER REFERENCES chains(id);
CREATE INDEX IF NOT EXISTS idx_dispensaries_state_id ON dispensaries(state_id);
CREATE INDEX IF NOT EXISTS idx_dispensaries_chain_id ON dispensaries(chain_id) WHERE chain_id IS NOT NULL;
-- Backfill state_id from existing state column
-- Only touches rows where state_id IS NULL, so repeated runs are safe.
UPDATE dispensaries d
SET state_id = s.id
FROM states s
WHERE d.state = s.code
AND d.state_id IS NULL;
COMMENT ON COLUMN dispensaries.state_id IS 'FK to states table. Canonical state reference.';
COMMENT ON COLUMN dispensaries.chain_id IS 'FK to chains table. NULL if independent dispensary.';
-- ============================================================================
-- SECTION 8: BRAND PENETRATION TABLE
-- ============================================================================
-- Pre-computed brand presence across stores for analytics dashboards.
CREATE TABLE IF NOT EXISTS brand_penetration (
id SERIAL PRIMARY KEY,
canonical_brand_id INTEGER NOT NULL REFERENCES canonical_brands(id) ON DELETE CASCADE,
state_id INTEGER NOT NULL REFERENCES states(id) ON DELETE CASCADE,
-- Metrics
stores_carrying INTEGER DEFAULT 0,
stores_total INTEGER DEFAULT 0,
penetration_pct NUMERIC(5,2) DEFAULT 0,
-- Product breakdown
products_count INTEGER DEFAULT 0,
products_in_stock INTEGER DEFAULT 0,
products_on_special INTEGER DEFAULT 0,
-- Pricing
avg_price NUMERIC(10,2),
min_price NUMERIC(10,2),
max_price NUMERIC(10,2),
-- Time range
calculated_at TIMESTAMPTZ DEFAULT NOW(),
period_start TIMESTAMPTZ,
period_end TIMESTAMPTZ,
-- calculated_at in the key makes this a time series: each recomputation run
-- inserts a new row per (brand, state) rather than replacing the old one.
UNIQUE(canonical_brand_id, state_id, calculated_at)
);
CREATE INDEX IF NOT EXISTS idx_brand_penetration_brand ON brand_penetration(canonical_brand_id);
CREATE INDEX IF NOT EXISTS idx_brand_penetration_state ON brand_penetration(state_id);
CREATE INDEX IF NOT EXISTS idx_brand_penetration_calculated ON brand_penetration(calculated_at DESC);
COMMENT ON TABLE brand_penetration IS 'Pre-computed brand penetration metrics by state.';
-- ============================================================================
-- SECTION 9: PRICE_ALERTS TABLE
-- ============================================================================
-- Track significant price changes for intelligence/alerts.
CREATE TABLE IF NOT EXISTS price_alerts (
id SERIAL PRIMARY KEY,
store_product_id INTEGER REFERENCES store_products(id) ON DELETE CASCADE,
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
state_id INTEGER REFERENCES states(id),
-- What changed
alert_type VARCHAR(50) NOT NULL, -- price_drop, price_increase, new_special, special_ended
-- Values
old_price NUMERIC(10,2),
new_price NUMERIC(10,2),
change_amount NUMERIC(10,2),
change_percent NUMERIC(5,2),
-- Context: denormalized product info so alerts can be rendered without a join.
product_name VARCHAR(500),
brand_name VARCHAR(255),
category VARCHAR(100),
-- Status
is_processed BOOLEAN DEFAULT FALSE,
processed_at TIMESTAMPTZ,
created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_price_alerts_dispensary ON price_alerts(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_price_alerts_state ON price_alerts(state_id) WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_price_alerts_type ON price_alerts(alert_type);
-- Worker queue scan: partial index only over unprocessed alerts.
CREATE INDEX IF NOT EXISTS idx_price_alerts_unprocessed ON price_alerts(is_processed) WHERE is_processed = FALSE;
CREATE INDEX IF NOT EXISTS idx_price_alerts_created ON price_alerts(created_at DESC);
COMMENT ON TABLE price_alerts IS 'Significant price changes for intelligence/alerting.';
-- ============================================================================
-- SECTION 10: RAW_PAYLOADS TABLE
-- ============================================================================
-- Store raw API responses for replay/debugging. Separate from snapshots.
CREATE TABLE IF NOT EXISTS raw_payloads (
id SERIAL PRIMARY KEY,
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
crawl_run_id INTEGER REFERENCES crawl_runs(id) ON DELETE SET NULL,
-- Payload info
provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',
payload_type VARCHAR(50) NOT NULL DEFAULT 'products', -- products, brands, specials
-- The raw data
payload JSONB NOT NULL,
payload_size_bytes INTEGER,
-- Deduplication
-- NOTE: hash is indexed but not UNIQUE, so dedup is the writer's job;
-- the index only makes the "have we seen this payload?" lookup cheap.
payload_hash VARCHAR(64), -- SHA256 for deduplication
-- Processing status
is_processed BOOLEAN DEFAULT FALSE,
processed_at TIMESTAMPTZ,
captured_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_raw_payloads_dispensary ON raw_payloads(dispensary_id, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_raw_payloads_crawl_run ON raw_payloads(crawl_run_id) WHERE crawl_run_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_raw_payloads_unprocessed ON raw_payloads(is_processed) WHERE is_processed = FALSE;
CREATE INDEX IF NOT EXISTS idx_raw_payloads_hash ON raw_payloads(payload_hash) WHERE payload_hash IS NOT NULL;
COMMENT ON TABLE raw_payloads IS 'Raw API responses for replay/debugging. Enables re-hydration.';
-- ============================================================================
-- SECTION 11: ANALYTICS CACHE TABLES
-- ============================================================================
-- Pre-computed analytics for dashboard performance.
-- Daily store metrics: one row per dispensary per calendar day.
CREATE TABLE IF NOT EXISTS analytics_store_daily (
id SERIAL PRIMARY KEY,
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
state_id INTEGER REFERENCES states(id),
date DATE NOT NULL,
-- Product counts
total_products INTEGER DEFAULT 0,
in_stock_products INTEGER DEFAULT 0,
out_of_stock_products INTEGER DEFAULT 0,
on_special_products INTEGER DEFAULT 0,
-- Brand/category diversity
unique_brands INTEGER DEFAULT 0,
unique_categories INTEGER DEFAULT 0,
-- Pricing
avg_price NUMERIC(10,2),
median_price NUMERIC(10,2),
-- Crawl health
crawl_count INTEGER DEFAULT 0,
successful_crawls INTEGER DEFAULT 0,
created_at TIMESTAMPTZ DEFAULT NOW(),
-- One row per store per day; dispensary_id is NOT NULL so this is airtight.
UNIQUE(dispensary_id, date)
);
CREATE INDEX IF NOT EXISTS idx_analytics_store_daily_dispensary ON analytics_store_daily(dispensary_id, date DESC);
CREATE INDEX IF NOT EXISTS idx_analytics_store_daily_state ON analytics_store_daily(state_id, date DESC) WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_analytics_store_daily_date ON analytics_store_daily(date DESC);
-- Daily brand metrics: one row per brand per state per calendar day.
CREATE TABLE IF NOT EXISTS analytics_brand_daily (
id SERIAL PRIMARY KEY,
canonical_brand_id INTEGER NOT NULL REFERENCES canonical_brands(id) ON DELETE CASCADE,
state_id INTEGER REFERENCES states(id),
date DATE NOT NULL,
-- Presence
stores_carrying INTEGER DEFAULT 0,
products_count INTEGER DEFAULT 0,
-- Stock
in_stock_count INTEGER DEFAULT 0,
out_of_stock_count INTEGER DEFAULT 0,
-- Pricing
avg_price NUMERIC(10,2),
min_price NUMERIC(10,2),
max_price NUMERIC(10,2),
on_special_count INTEGER DEFAULT 0,
created_at TIMESTAMPTZ DEFAULT NOW(),
-- NOTE(review): state_id is nullable and Postgres treats NULLs as distinct in
-- UNIQUE constraints, so multiple "all states" (state_id IS NULL) rows can
-- exist for the same brand/date. If national rollups are stored here, confirm
-- the writer dedupes them, or add a partial unique index for state_id IS NULL.
UNIQUE(canonical_brand_id, state_id, date)
);
CREATE INDEX IF NOT EXISTS idx_analytics_brand_daily_brand ON analytics_brand_daily(canonical_brand_id, date DESC);
CREATE INDEX IF NOT EXISTS idx_analytics_brand_daily_state ON analytics_brand_daily(state_id, date DESC) WHERE state_id IS NOT NULL;
-- ============================================================================
-- SECTION 12: VIEWS FOR COMPATIBILITY
-- ============================================================================
-- View: Latest snapshot per store product.
-- FIX: the DISTINCT ON key must include provider. store_products is unique on
-- (dispensary_id, provider, provider_product_id), so two providers that reuse
-- the same provider_product_id at one store are distinct products; keying only
-- on (dispensary_id, provider_product_id) would collapse them into a single
-- arbitrary row. ORDER BY must lead with the DISTINCT ON expressions; the
-- trailing captured_at DESC picks the newest snapshot within each group.
CREATE OR REPLACE VIEW v_latest_store_snapshots AS
SELECT DISTINCT ON (dispensary_id, provider, provider_product_id)
sps.*
FROM store_product_snapshots sps
ORDER BY dispensary_id, provider, provider_product_id, captured_at DESC;
-- View: Crawl run summary per dispensary
-- NOTE(review): joining both store_products and crawl_runs to dispensaries
-- produces a row for every (product x crawl_run) pair per store. COUNT(DISTINCT)
-- keeps the counts correct, but the intermediate result grows multiplicatively
-- with crawl history — consider pre-aggregating crawl_runs in a lateral/subquery
-- if this view gets slow.
CREATE OR REPLACE VIEW v_dispensary_crawl_summary AS
SELECT
d.id AS dispensary_id,
COALESCE(d.dba_name, d.name) AS dispensary_name,
d.city,
d.state,
d.state_id,
s.name AS state_name,
COUNT(DISTINCT sp.id) AS current_product_count,
COUNT(DISTINCT sp.id) FILTER (WHERE sp.is_in_stock) AS in_stock_count,
COUNT(DISTINCT sp.id) FILTER (WHERE sp.is_on_special) AS on_special_count,
MAX(cr.finished_at) AS last_crawl_at,
-- Correlated subquery: status of the most recently started run, regardless of joins.
(SELECT status FROM crawl_runs WHERE dispensary_id = d.id ORDER BY started_at DESC LIMIT 1) AS last_crawl_status
FROM dispensaries d
LEFT JOIN states s ON s.id = d.state_id
LEFT JOIN store_products sp ON sp.dispensary_id = d.id
LEFT JOIN crawl_runs cr ON cr.dispensary_id = d.id
GROUP BY d.id, d.dba_name, d.name, d.city, d.state, d.state_id, s.name;
-- View: Brand presence across stores
-- Groups by (brand, state); INNER JOIN means brands with no matched
-- store_products rows do not appear at all. Prices are based on price_rec only.
CREATE OR REPLACE VIEW v_brand_store_presence AS
SELECT
cb.id AS brand_id,
cb.name AS brand_name,
cb.slug AS brand_slug,
s.id AS state_id,
s.code AS state_code,
COUNT(DISTINCT sp.dispensary_id) AS store_count,
COUNT(sp.id) AS product_count,
COUNT(sp.id) FILTER (WHERE sp.is_in_stock) AS in_stock_count,
AVG(sp.price_rec) AS avg_price,
MIN(sp.price_rec) AS min_price,
MAX(sp.price_rec) AS max_price
FROM canonical_brands cb
JOIN store_products sp ON sp.canonical_brand_id = cb.id
-- LEFT JOIN: products with NULL state_id still contribute, under a NULL state.
LEFT JOIN states s ON s.id = sp.state_id
GROUP BY cb.id, cb.name, cb.slug, s.id, s.code;
-- ============================================================================
-- SECTION 13: ADD FK FROM store_product_snapshots TO crawl_runs
-- ============================================================================
-- Idempotent: only add the FK if it is not already present.
-- FIX: information_schema.table_constraints names are only unique per table,
-- not database-wide, so qualify the check with table_name; otherwise a
-- same-named constraint on any other table would skip this ALTER.
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM information_schema.table_constraints
WHERE constraint_name = 'store_product_snapshots_crawl_run_id_fkey'
AND table_name = 'store_product_snapshots'
) THEN
ALTER TABLE store_product_snapshots
ADD CONSTRAINT store_product_snapshots_crawl_run_id_fkey
FOREIGN KEY (crawl_run_id) REFERENCES crawl_runs(id) ON DELETE SET NULL;
END IF;
END $$;
-- ============================================================================
-- SECTION 14: ADD crawl_run_id TO crawl_orchestration_traces
-- ============================================================================
-- Lets orchestration traces be correlated with the crawl run they belong to.
ALTER TABLE crawl_orchestration_traces
ADD COLUMN IF NOT EXISTS crawl_run_id INTEGER REFERENCES crawl_runs(id) ON DELETE SET NULL;
CREATE INDEX IF NOT EXISTS idx_traces_crawl_run
ON crawl_orchestration_traces(crawl_run_id)
WHERE crawl_run_id IS NOT NULL;
-- ============================================================================
-- SECTION 15: UPDATE dispensary_crawler_profiles
-- ============================================================================
-- Add status columns for profile lifecycle.
-- New profiles start in 'sandbox' and must be promoted to production.
ALTER TABLE dispensary_crawler_profiles
ADD COLUMN IF NOT EXISTS status VARCHAR(50) DEFAULT 'sandbox';
-- Gate for automatic sandbox -> production promotion; off by default.
ALTER TABLE dispensary_crawler_profiles
ADD COLUMN IF NOT EXISTS allow_autopromote BOOLEAN DEFAULT FALSE;
ALTER TABLE dispensary_crawler_profiles
ADD COLUMN IF NOT EXISTS validated_at TIMESTAMPTZ;
CREATE INDEX IF NOT EXISTS idx_profiles_status
ON dispensary_crawler_profiles(status);
COMMENT ON COLUMN dispensary_crawler_profiles.status IS 'Profile status: sandbox, production, needs_manual, disabled';
-- ============================================================================
-- SECTION 16: UPDATE dispensary_crawl_jobs WITH ADDITIONAL COLUMNS
-- ============================================================================
-- Add columns needed for enhanced job tracking.
-- Worker identity: which worker process/host is executing the job.
ALTER TABLE dispensary_crawl_jobs
ADD COLUMN IF NOT EXISTS worker_id VARCHAR(100);
ALTER TABLE dispensary_crawl_jobs
ADD COLUMN IF NOT EXISTS worker_hostname VARCHAR(100);
-- Claim/lease fields for cooperative job locking across workers.
ALTER TABLE dispensary_crawl_jobs
ADD COLUMN IF NOT EXISTS claimed_by VARCHAR(100);
ALTER TABLE dispensary_crawl_jobs
ADD COLUMN IF NOT EXISTS claimed_at TIMESTAMPTZ;
ALTER TABLE dispensary_crawl_jobs
ADD COLUMN IF NOT EXISTS locked_until TIMESTAMPTZ;
ALTER TABLE dispensary_crawl_jobs
ADD COLUMN IF NOT EXISTS last_heartbeat_at TIMESTAMPTZ;
ALTER TABLE dispensary_crawl_jobs
ADD COLUMN IF NOT EXISTS max_retries INTEGER DEFAULT 3;
-- Progress counters reported by the worker during the crawl.
ALTER TABLE dispensary_crawl_jobs
ADD COLUMN IF NOT EXISTS products_upserted INTEGER DEFAULT 0;
ALTER TABLE dispensary_crawl_jobs
ADD COLUMN IF NOT EXISTS snapshots_created INTEGER DEFAULT 0;
ALTER TABLE dispensary_crawl_jobs
ADD COLUMN IF NOT EXISTS current_page INTEGER DEFAULT 0;
ALTER TABLE dispensary_crawl_jobs
ADD COLUMN IF NOT EXISTS total_pages INTEGER;
-- Partial indexes for the hot queue queries (claim next pending, list claims).
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_status_pending ON dispensary_crawl_jobs(status) WHERE status = 'pending';
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_claimed_by ON dispensary_crawl_jobs(claimed_by) WHERE claimed_by IS NOT NULL;
-- ============================================================================
-- SECTION 17: QUEUE MONITORING VIEWS
-- ============================================================================
-- Single-row dashboard snapshot of queue health over the last hour.
-- NOTE(review): the failed_1h/avg_duration windows assume failed and completed
-- jobs both populate completed_at — confirm the worker sets it on failure too.
CREATE OR REPLACE VIEW v_queue_stats AS
SELECT
(SELECT COUNT(*) FROM dispensary_crawl_jobs WHERE status = 'pending') AS pending_jobs,
(SELECT COUNT(*) FROM dispensary_crawl_jobs WHERE status = 'running') AS running_jobs,
(SELECT COUNT(*) FROM dispensary_crawl_jobs WHERE status = 'completed' AND completed_at > NOW() - INTERVAL '1 hour') AS completed_1h,
(SELECT COUNT(*) FROM dispensary_crawl_jobs WHERE status = 'failed' AND completed_at > NOW() - INTERVAL '1 hour') AS failed_1h,
(SELECT COUNT(DISTINCT worker_id) FROM dispensary_crawl_jobs WHERE status = 'running' AND worker_id IS NOT NULL) AS active_workers,
(SELECT AVG(EXTRACT(EPOCH FROM (completed_at - started_at))) FROM dispensary_crawl_jobs WHERE status = 'completed' AND completed_at > NOW() - INTERVAL '1 hour') AS avg_duration_seconds;
-- Per-worker rollup of currently running jobs.
-- NOTE(review): products_found is not added in Section 16, so it is assumed to
-- already exist on dispensary_crawl_jobs — verify before running standalone.
CREATE OR REPLACE VIEW v_active_workers AS
SELECT
worker_id,
worker_hostname,
COUNT(*) AS current_jobs,
SUM(products_found) AS total_products_found,
SUM(products_upserted) AS total_products_upserted,
SUM(snapshots_created) AS total_snapshots,
MIN(claimed_at) AS first_claimed_at,
MAX(last_heartbeat_at) AS last_heartbeat
FROM dispensary_crawl_jobs
WHERE status = 'running' AND worker_id IS NOT NULL
GROUP BY worker_id, worker_hostname;
-- ============================================================================
-- DONE
-- ============================================================================
SELECT 'Migration 050 completed successfully. Canonical schema v2 is ready.' AS status;

View File

@@ -0,0 +1,642 @@
-- ============================================================================
-- Migration 051: CannaiQ Canonical Schema - Safe Bootstrap
-- ============================================================================
--
-- Purpose: Create the canonical CannaiQ schema tables from scratch.
-- This migration is FULLY IDEMPOTENT and safe to run multiple times.
--
-- SAFETY RULES FOLLOWED:
-- 1. ALL tables use CREATE TABLE IF NOT EXISTS
-- 2. ALL columns use ALTER TABLE ADD COLUMN IF NOT EXISTS
-- 3. ALL indexes use CREATE INDEX IF NOT EXISTS
-- 4. NO DROP, DELETE, TRUNCATE, or destructive operations
-- 5. NO assumptions about existing data or column existence
-- 6. NO dependencies on migrations 041, 043, or 050
-- 7. Compatible with dutchie_menus database as it exists today
-- 8. Safe handling of pre-existing states table with missing columns
--
-- Tables Created:
-- - states (US state reference table)
-- - chains (retail chain/group table)
-- - crawl_runs (crawl execution records)
-- - store_products (current menu state)
-- - store_product_snapshots (historical price/stock data)
--
-- Columns Added:
-- - dispensaries.state_id (FK to states)
-- - dispensaries.chain_id (FK to chains)
--
-- Run with:
-- psql "postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus" \
-- -f migrations/051_cannaiq_canonical_safe_bootstrap.sql
--
-- ============================================================================
-- ============================================================================
-- SECTION 1: STATES TABLE
-- ============================================================================
-- Reference table for US states where CannaiQ operates.
-- This section handles the case where the table exists but is missing columns.
-- First, create the table if it doesn't exist (minimal definition)
CREATE TABLE IF NOT EXISTS states (
id SERIAL PRIMARY KEY,
code VARCHAR(2) NOT NULL,
name VARCHAR(100) NOT NULL,
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW()
);
-- Now safely add any missing columns (each is independent, won't fail if exists)
ALTER TABLE states ADD COLUMN IF NOT EXISTS timezone TEXT;
ALTER TABLE states ADD COLUMN IF NOT EXISTS is_active BOOLEAN DEFAULT TRUE;
ALTER TABLE states ADD COLUMN IF NOT EXISTS crawl_enabled BOOLEAN DEFAULT TRUE;
-- Add unique constraint on code if not exists
-- NOTE(review): the ON CONFLICT (code) insert below requires SOME unique
-- constraint/index on code to exist; if this block is skipped or swallowed by
-- the WHEN OTHERS handler (e.g. duplicate codes in existing data), the insert
-- will fail. The '%UNIQUE%code%' LIKE check is also a loose pattern match —
-- it can false-positive on any unique index whose definition contains "code".
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_constraint
WHERE conname = 'states_code_key' AND conrelid = 'states'::regclass
) THEN
-- Check if there's already a unique constraint with a different name
IF NOT EXISTS (
SELECT 1 FROM pg_indexes
WHERE tablename = 'states' AND indexdef LIKE '%UNIQUE%code%'
) THEN
ALTER TABLE states ADD CONSTRAINT states_code_key UNIQUE (code);
END IF;
END IF;
EXCEPTION
WHEN duplicate_object THEN
NULL; -- Constraint already exists
WHEN OTHERS THEN
NULL; -- Handle any other errors gracefully
END $$;
-- Set default timezone values for existing rows that have NULL
UPDATE states SET timezone = 'America/Phoenix' WHERE timezone IS NULL AND code = 'AZ';
UPDATE states SET timezone = 'America/Los_Angeles' WHERE timezone IS NULL AND code IN ('CA', 'NV', 'OR', 'WA');
UPDATE states SET timezone = 'America/Denver' WHERE timezone IS NULL AND code = 'CO';
UPDATE states SET timezone = 'America/New_York' WHERE timezone IS NULL AND code IN ('FL', 'MA', 'MD', 'NJ', 'NY', 'OH', 'PA');
UPDATE states SET timezone = 'America/Chicago' WHERE timezone IS NULL AND code IN ('IL', 'MO', 'OK');
UPDATE states SET timezone = 'America/Detroit' WHERE timezone IS NULL AND code = 'MI';
-- Set default is_active for existing rows
UPDATE states SET is_active = TRUE WHERE is_active IS NULL;
UPDATE states SET crawl_enabled = TRUE WHERE crawl_enabled IS NULL;
-- Insert known states (idempotent - ON CONFLICT DO UPDATE to fill missing values)
-- COALESCE keeps any value already in the row; EXCLUDED only fills NULLs.
INSERT INTO states (code, name, timezone, is_active, crawl_enabled) VALUES
('AZ', 'Arizona', 'America/Phoenix', TRUE, TRUE),
('CA', 'California', 'America/Los_Angeles', TRUE, TRUE),
('CO', 'Colorado', 'America/Denver', TRUE, TRUE),
('FL', 'Florida', 'America/New_York', TRUE, TRUE),
('IL', 'Illinois', 'America/Chicago', TRUE, TRUE),
('MA', 'Massachusetts', 'America/New_York', TRUE, TRUE),
('MD', 'Maryland', 'America/New_York', TRUE, TRUE),
('MI', 'Michigan', 'America/Detroit', TRUE, TRUE),
('MO', 'Missouri', 'America/Chicago', TRUE, TRUE),
('NV', 'Nevada', 'America/Los_Angeles', TRUE, TRUE),
('NJ', 'New Jersey', 'America/New_York', TRUE, TRUE),
('NY', 'New York', 'America/New_York', TRUE, TRUE),
('OH', 'Ohio', 'America/New_York', TRUE, TRUE),
('OK', 'Oklahoma', 'America/Chicago', TRUE, TRUE),
('OR', 'Oregon', 'America/Los_Angeles', TRUE, TRUE),
('PA', 'Pennsylvania', 'America/New_York', TRUE, TRUE),
('WA', 'Washington', 'America/Los_Angeles', TRUE, TRUE)
ON CONFLICT (code) DO UPDATE SET
timezone = COALESCE(states.timezone, EXCLUDED.timezone),
is_active = COALESCE(states.is_active, EXCLUDED.is_active),
crawl_enabled = COALESCE(states.crawl_enabled, EXCLUDED.crawl_enabled),
updated_at = NOW();
CREATE INDEX IF NOT EXISTS idx_states_code ON states(code);
CREATE INDEX IF NOT EXISTS idx_states_active ON states(is_active) WHERE is_active = TRUE;
COMMENT ON TABLE states IS 'US states where CannaiQ operates. Single source of truth for state configuration.';
-- ============================================================================
-- SECTION 2: CHAINS TABLE
-- ============================================================================
-- Retail chains/groups that own multiple dispensary locations.
-- Examples: Curaleaf, Trulieve, Harvest, Columbia Care
-- No inline FK on headquarters_state_id: the constraint is added separately
-- below so this CREATE cannot fail if states is in an unexpected shape.
CREATE TABLE IF NOT EXISTS chains (
id SERIAL PRIMARY KEY,
name VARCHAR(255) NOT NULL,
slug VARCHAR(255) NOT NULL,
website_url TEXT,
logo_url TEXT,
description TEXT,
headquarters_city VARCHAR(100),
headquarters_state_id INTEGER,
founded_year INTEGER,
is_active BOOLEAN DEFAULT TRUE,
is_public BOOLEAN DEFAULT FALSE, -- publicly traded company
stock_ticker VARCHAR(10),
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW()
);
-- Add unique constraint on slug if not exists
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_constraint
WHERE conname = 'chains_slug_key' AND conrelid = 'chains'::regclass
) THEN
ALTER TABLE chains ADD CONSTRAINT chains_slug_key UNIQUE (slug);
END IF;
EXCEPTION
WHEN duplicate_object THEN
NULL;
WHEN OTHERS THEN
NULL; -- best-effort: never abort the migration on this constraint
END $$;
-- Add FK to states if not exists.
-- FIX: qualify the pg_constraint lookup with conrelid — constraint names are
-- only unique per table, so a same-named constraint elsewhere would wrongly
-- skip this ALTER. (Matches the conrelid-qualified checks used for
-- states_code_key and chains_slug_key earlier in this migration.)
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_constraint
WHERE conname = 'chains_headquarters_state_id_fkey'
AND conrelid = 'chains'::regclass
) THEN
ALTER TABLE chains
ADD CONSTRAINT chains_headquarters_state_id_fkey
FOREIGN KEY (headquarters_state_id) REFERENCES states(id) ON DELETE SET NULL;
END IF;
EXCEPTION
WHEN duplicate_object THEN
NULL;
WHEN OTHERS THEN
NULL; -- best-effort: never abort the migration on this constraint
END $$;
CREATE INDEX IF NOT EXISTS idx_chains_slug ON chains(slug);
CREATE INDEX IF NOT EXISTS idx_chains_active ON chains(is_active) WHERE is_active = TRUE;
COMMENT ON TABLE chains IS 'Retail chains/groups that own multiple dispensary locations.';
-- ============================================================================
-- SECTION 3: ADD state_id AND chain_id TO DISPENSARIES
-- ============================================================================
-- Link existing dispensaries table to states and chains.
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS state_id INTEGER;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS chain_id INTEGER;
-- Add FK constraints if not exist.
-- FIX: both existence checks are qualified with conrelid — pg_constraint
-- names are only unique per table, so checking conname alone could match a
-- same-named constraint on another table and wrongly skip the ALTER.
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_constraint
WHERE conname = 'dispensaries_state_id_fkey'
AND conrelid = 'dispensaries'::regclass
) THEN
ALTER TABLE dispensaries
ADD CONSTRAINT dispensaries_state_id_fkey
FOREIGN KEY (state_id) REFERENCES states(id) ON DELETE SET NULL;
END IF;
EXCEPTION
WHEN duplicate_object THEN
NULL;
WHEN OTHERS THEN
NULL; -- best-effort: never abort the migration on this constraint
END $$;
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_constraint
WHERE conname = 'dispensaries_chain_id_fkey'
AND conrelid = 'dispensaries'::regclass
) THEN
ALTER TABLE dispensaries
ADD CONSTRAINT dispensaries_chain_id_fkey
FOREIGN KEY (chain_id) REFERENCES chains(id) ON DELETE SET NULL;
END IF;
EXCEPTION
WHEN duplicate_object THEN
NULL;
WHEN OTHERS THEN
NULL; -- best-effort: never abort the migration on this constraint
END $$;
CREATE INDEX IF NOT EXISTS idx_dispensaries_state_id ON dispensaries(state_id);
CREATE INDEX IF NOT EXISTS idx_dispensaries_chain_id ON dispensaries(chain_id) WHERE chain_id IS NOT NULL;
-- Backfill state_id from existing state column (safe - only updates NULL values)
UPDATE dispensaries d
SET state_id = s.id
FROM states s
WHERE d.state = s.code
AND d.state_id IS NULL;
COMMENT ON COLUMN dispensaries.state_id IS 'FK to states table. Canonical state reference.';
COMMENT ON COLUMN dispensaries.chain_id IS 'FK to chains table. NULL if independent dispensary.';
-- ============================================================================
-- SECTION 4: CRAWL_RUNS TABLE
-- ============================================================================
-- One record per crawl execution. Links to snapshots.
-- FKs are deliberately NOT inline; they are added separately below so the
-- CREATE succeeds even if the referenced tables are in an unexpected shape.
CREATE TABLE IF NOT EXISTS crawl_runs (
id SERIAL PRIMARY KEY,
dispensary_id INTEGER NOT NULL,
state_id INTEGER,
-- Provider info
provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',
-- Timing
started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
finished_at TIMESTAMPTZ,
duration_ms INTEGER,
-- Status
status VARCHAR(20) NOT NULL DEFAULT 'running',
error_code VARCHAR(50),
error_message TEXT,
http_status INTEGER,
-- Results
products_found INTEGER DEFAULT 0,
products_new INTEGER DEFAULT 0,
products_updated INTEGER DEFAULT 0,
products_missing INTEGER DEFAULT 0,
snapshots_written INTEGER DEFAULT 0,
-- Infrastructure
worker_id VARCHAR(100),
worker_hostname VARCHAR(100),
proxy_used TEXT,
trigger_type VARCHAR(50) DEFAULT 'scheduled',
-- Metadata
metadata JSONB DEFAULT '{}',
created_at TIMESTAMPTZ DEFAULT NOW()
);
-- Add FK constraints if not exist.
-- FIX: qualify the pg_constraint lookups with conrelid — constraint names are
-- only unique per table, so checking conname alone could match a same-named
-- constraint on another table and wrongly skip the ALTER.
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_constraint
WHERE conname = 'crawl_runs_dispensary_id_fkey'
AND conrelid = 'crawl_runs'::regclass
) THEN
ALTER TABLE crawl_runs
ADD CONSTRAINT crawl_runs_dispensary_id_fkey
FOREIGN KEY (dispensary_id) REFERENCES dispensaries(id) ON DELETE CASCADE;
END IF;
EXCEPTION
WHEN duplicate_object THEN
NULL;
WHEN OTHERS THEN
NULL; -- best-effort: never abort the migration on this constraint
END $$;
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_constraint
WHERE conname = 'crawl_runs_state_id_fkey'
AND conrelid = 'crawl_runs'::regclass
) THEN
ALTER TABLE crawl_runs
ADD CONSTRAINT crawl_runs_state_id_fkey
FOREIGN KEY (state_id) REFERENCES states(id) ON DELETE SET NULL;
END IF;
EXCEPTION
WHEN duplicate_object THEN
NULL;
WHEN OTHERS THEN
NULL; -- best-effort: never abort the migration on this constraint
END $$;
-- Lookup indexes: per-store history, status filtering, and recency ordering.
CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary ON crawl_runs(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_state ON crawl_runs(state_id) WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_crawl_runs_status ON crawl_runs(status);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_started ON crawl_runs(started_at DESC);
-- Composite index serves the "latest run for store X" query pattern directly.
CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary_started ON crawl_runs(dispensary_id, started_at DESC);
COMMENT ON TABLE crawl_runs IS 'Each crawl execution. Links to snapshots and traces.';
-- ============================================================================
-- SECTION 5: STORE_PRODUCTS TABLE
-- ============================================================================
-- Current state of products on each dispensary menu.
-- Provider-agnostic structure for analytics.
-- Constraints (unique key, FKs) are added separately below so this CREATE
-- cannot fail against a pre-existing partial schema.
CREATE TABLE IF NOT EXISTS store_products (
id SERIAL PRIMARY KEY,
dispensary_id INTEGER NOT NULL,
state_id INTEGER,
-- Provider-specific identifiers
provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',
provider_product_id VARCHAR(100) NOT NULL,
provider_brand_id VARCHAR(100),
enterprise_product_id VARCHAR(100), -- cross-store product ID
-- Raw data from platform (not normalized)
name VARCHAR(500) NOT NULL,
brand_name VARCHAR(255),
category VARCHAR(100),
subcategory VARCHAR(100),
strain_type VARCHAR(50),
description TEXT,
-- Pricing (current)
price_rec NUMERIC(10,2),
price_med NUMERIC(10,2),
price_rec_special NUMERIC(10,2),
price_med_special NUMERIC(10,2),
is_on_special BOOLEAN DEFAULT FALSE,
special_name TEXT,
discount_percent NUMERIC(5,2),
price_unit VARCHAR(20) DEFAULT 'each',
-- Inventory
is_in_stock BOOLEAN DEFAULT TRUE,
stock_quantity INTEGER,
stock_status VARCHAR(50) DEFAULT 'in_stock',
-- Potency
thc_percent NUMERIC(5,2),
cbd_percent NUMERIC(5,2),
thc_mg NUMERIC(10,2),
cbd_mg NUMERIC(10,2),
-- Weight/Size
weight_value NUMERIC(10,2),
weight_unit VARCHAR(20),
-- Images
image_url TEXT,
local_image_path TEXT,
thumbnail_url TEXT,
-- Flags
is_featured BOOLEAN DEFAULT FALSE,
medical_only BOOLEAN DEFAULT FALSE,
rec_only BOOLEAN DEFAULT FALSE,
-- Menu position (for tracking prominence)
menu_position INTEGER,
-- Timestamps
first_seen_at TIMESTAMPTZ DEFAULT NOW(),
last_seen_at TIMESTAMPTZ DEFAULT NOW(),
last_price_change_at TIMESTAMPTZ,
last_stock_change_at TIMESTAMPTZ,
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW()
);
-- Add unique constraint if not exists.
-- FIX: all three existence checks below are qualified with conrelid —
-- pg_constraint names are only unique per table, so checking conname alone
-- could match a same-named constraint on another table and wrongly skip the
-- ALTER (leaving the upsert key or FK missing).
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_constraint
WHERE conname = 'store_products_dispensary_provider_product_key'
AND conrelid = 'store_products'::regclass
) THEN
ALTER TABLE store_products
ADD CONSTRAINT store_products_dispensary_provider_product_key
UNIQUE (dispensary_id, provider, provider_product_id);
END IF;
EXCEPTION
WHEN duplicate_object THEN
NULL;
WHEN OTHERS THEN
NULL; -- best-effort: never abort the migration on this constraint
END $$;
-- Add FK constraints if not exist
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_constraint
WHERE conname = 'store_products_dispensary_id_fkey'
AND conrelid = 'store_products'::regclass
) THEN
ALTER TABLE store_products
ADD CONSTRAINT store_products_dispensary_id_fkey
FOREIGN KEY (dispensary_id) REFERENCES dispensaries(id) ON DELETE CASCADE;
END IF;
EXCEPTION
WHEN duplicate_object THEN
NULL;
WHEN OTHERS THEN
NULL; -- best-effort: never abort the migration on this constraint
END $$;
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_constraint
WHERE conname = 'store_products_state_id_fkey'
AND conrelid = 'store_products'::regclass
) THEN
ALTER TABLE store_products
ADD CONSTRAINT store_products_state_id_fkey
FOREIGN KEY (state_id) REFERENCES states(id) ON DELETE SET NULL;
END IF;
EXCEPTION
WHEN duplicate_object THEN
NULL;
WHEN OTHERS THEN
NULL; -- best-effort: never abort the migration on this constraint
END $$;
-- Lookup indexes for the current-menu table. Partial indexes (WHERE ...)
-- stay small by only covering rows where the filtered column is populated.
CREATE INDEX IF NOT EXISTS idx_store_products_dispensary ON store_products(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_store_products_state ON store_products(state_id) WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_store_products_category ON store_products(category) WHERE category IS NOT NULL;
-- This bootstrap migration indexes brand_name (raw text); canonical brand IDs
-- belong to the fuller schema migrations.
CREATE INDEX IF NOT EXISTS idx_store_products_brand_name ON store_products(brand_name) WHERE brand_name IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_store_products_in_stock ON store_products(dispensary_id, is_in_stock);
CREATE INDEX IF NOT EXISTS idx_store_products_special ON store_products(dispensary_id, is_on_special) WHERE is_on_special = TRUE;
CREATE INDEX IF NOT EXISTS idx_store_products_last_seen ON store_products(last_seen_at DESC);
CREATE INDEX IF NOT EXISTS idx_store_products_provider ON store_products(provider);
CREATE INDEX IF NOT EXISTS idx_store_products_enterprise ON store_products(enterprise_product_id) WHERE enterprise_product_id IS NOT NULL;
COMMENT ON TABLE store_products IS 'Current state of products on each dispensary menu. Provider-agnostic.';
-- ============================================================================
-- SECTION 6: STORE_PRODUCT_SNAPSHOTS TABLE
-- ============================================================================
-- Historical price/stock data. One row per product per crawl.
-- CRITICAL: NEVER DELETE from this table.
-- Append-only time-series table; FK constraints are attached separately in the
-- guarded DO blocks below so the CREATE stays idempotent on partial schemas.
CREATE TABLE IF NOT EXISTS store_product_snapshots (
    id SERIAL PRIMARY KEY,
    dispensary_id INTEGER NOT NULL,
    -- Nullable: a snapshot can outlive (or precede) its store_products row.
    store_product_id INTEGER,
    state_id INTEGER,
    -- Provider info
    provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',
    provider_product_id VARCHAR(100),
    -- Link to crawl run
    crawl_run_id INTEGER,
    -- Capture timestamp
    captured_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    -- Raw data from platform (denormalized so history survives product renames)
    name VARCHAR(500),
    brand_name VARCHAR(255),
    category VARCHAR(100),
    subcategory VARCHAR(100),
    -- Pricing at time of capture
    price_rec NUMERIC(10,2),
    price_med NUMERIC(10,2),
    price_rec_special NUMERIC(10,2),
    price_med_special NUMERIC(10,2),
    is_on_special BOOLEAN DEFAULT FALSE,
    discount_percent NUMERIC(5,2),
    -- Inventory at time of capture
    is_in_stock BOOLEAN DEFAULT TRUE,
    stock_quantity INTEGER,
    stock_status VARCHAR(50) DEFAULT 'in_stock',
    -- FALSE when the product disappeared from the provider feed this crawl.
    is_present_in_feed BOOLEAN DEFAULT TRUE,
    -- Potency at time of capture
    thc_percent NUMERIC(5,2),
    cbd_percent NUMERIC(5,2),
    -- Menu position (for tracking prominence changes)
    menu_position INTEGER,
    -- Image URL at time of capture
    image_url TEXT,
    -- Full raw response for debugging
    raw_data JSONB,
    created_at TIMESTAMPTZ DEFAULT NOW()
);
-- Add FK constraints if not exist
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'store_product_snapshots_dispensary_id_fkey'
          AND conrelid = 'store_product_snapshots'::regclass  -- scope to this table
    ) THEN
        ALTER TABLE store_product_snapshots
        ADD CONSTRAINT store_product_snapshots_dispensary_id_fkey
        FOREIGN KEY (dispensary_id) REFERENCES dispensaries(id) ON DELETE CASCADE;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;  -- added concurrently: fine
    WHEN OTHERS THEN
        -- Report (e.g. orphaned dispensary_id values) rather than silently
        -- leaving the history table unconstrained.
        RAISE WARNING 'could not add store_product_snapshots_dispensary_id_fkey: %', SQLERRM;
END $$;
-- FK to store_products; SET NULL so snapshot history survives product deletion.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'store_product_snapshots_store_product_id_fkey'
          AND conrelid = 'store_product_snapshots'::regclass  -- scope to this table
    ) THEN
        ALTER TABLE store_product_snapshots
        ADD CONSTRAINT store_product_snapshots_store_product_id_fkey
        FOREIGN KEY (store_product_id) REFERENCES store_products(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;  -- added concurrently: fine
    WHEN OTHERS THEN
        RAISE WARNING 'could not add store_product_snapshots_store_product_id_fkey: %', SQLERRM;
END $$;
-- Optional FK to states; SET NULL keeps history if a state row is removed.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'store_product_snapshots_state_id_fkey'
          AND conrelid = 'store_product_snapshots'::regclass  -- scope to this table
    ) THEN
        ALTER TABLE store_product_snapshots
        ADD CONSTRAINT store_product_snapshots_state_id_fkey
        FOREIGN KEY (state_id) REFERENCES states(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;  -- added concurrently: fine
    WHEN OTHERS THEN
        RAISE WARNING 'could not add store_product_snapshots_state_id_fkey: %', SQLERRM;
END $$;
-- FK to crawl_runs; SET NULL so snapshots persist if a run record is pruned.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'store_product_snapshots_crawl_run_id_fkey'
          AND conrelid = 'store_product_snapshots'::regclass  -- scope to this table
    ) THEN
        ALTER TABLE store_product_snapshots
        ADD CONSTRAINT store_product_snapshots_crawl_run_id_fkey
        FOREIGN KEY (crawl_run_id) REFERENCES crawl_runs(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;  -- added concurrently: fine
    WHEN OTHERS THEN
        RAISE WARNING 'could not add store_product_snapshots_crawl_run_id_fkey: %', SQLERRM;
END $$;
-- Indexes optimized for analytics queries
-- All time-series indexes lead with an entity column and end with
-- captured_at DESC so "latest N per entity" scans are index-only-friendly.
CREATE INDEX IF NOT EXISTS idx_snapshots_dispensary_captured ON store_product_snapshots(dispensary_id, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_state_captured ON store_product_snapshots(state_id, captured_at DESC) WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_product_captured ON store_product_snapshots(store_product_id, captured_at DESC) WHERE store_product_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_crawl_run ON store_product_snapshots(crawl_run_id) WHERE crawl_run_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_captured_at ON store_product_snapshots(captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_brand ON store_product_snapshots(brand_name) WHERE brand_name IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_provider_product ON store_product_snapshots(provider_product_id) WHERE provider_product_id IS NOT NULL;
COMMENT ON TABLE store_product_snapshots IS 'Historical crawl data. One row per product per crawl. NEVER DELETE.';
-- ============================================================================
-- SECTION 7: VIEWS FOR BACKWARD COMPATIBILITY
-- ============================================================================
-- View: Latest snapshot per store product.
-- The canonical product identity is (dispensary_id, provider, provider_product_id)
-- (see the unique key on store_products above), so provider must be part of the
-- DISTINCT ON key: without it, two providers that happen to reuse the same
-- provider_product_id at one dispensary would collapse into a single row.
CREATE OR REPLACE VIEW v_latest_store_snapshots AS
SELECT DISTINCT ON (dispensary_id, provider, provider_product_id)
    sps.*
FROM store_product_snapshots sps
ORDER BY dispensary_id, provider, provider_product_id, captured_at DESC;
-- View: Crawl run summary per dispensary
-- NOTE: the two LEFT JOINs (store_products x crawl_runs) fan out rows, which
-- is safe here only because every aggregate is COUNT(DISTINCT ...) or MAX(...).
-- Do not add plain COUNT()/AVG() columns to this view without restructuring.
CREATE OR REPLACE VIEW v_dispensary_crawl_summary AS
SELECT
    d.id AS dispensary_id,
    COALESCE(d.dba_name, d.name) AS dispensary_name,
    d.city,
    d.state,
    d.state_id,
    s.name AS state_name,
    COUNT(DISTINCT sp.id) AS current_product_count,
    COUNT(DISTINCT sp.id) FILTER (WHERE sp.is_in_stock) AS in_stock_count,
    COUNT(DISTINCT sp.id) FILTER (WHERE sp.is_on_special) AS on_special_count,
    MAX(cr.finished_at) AS last_crawl_at,
    -- Correlated subquery: status of the most recently *started* run (which
    -- may still be in progress, unlike MAX(finished_at) above).
    (SELECT status FROM crawl_runs WHERE dispensary_id = d.id ORDER BY started_at DESC LIMIT 1) AS last_crawl_status
FROM dispensaries d
LEFT JOIN states s ON s.id = d.state_id
LEFT JOIN store_products sp ON sp.dispensary_id = d.id
LEFT JOIN crawl_runs cr ON cr.dispensary_id = d.id
GROUP BY d.id, d.dba_name, d.name, d.city, d.state, d.state_id, s.name;
-- ============================================================================
-- MIGRATION 051 COMPLETE
-- ============================================================================
SELECT 'Migration 051 completed successfully. Canonical schema is ready.' AS status;

View File

@@ -0,0 +1,98 @@
-- Migration 051: Create materialized view for state metrics
-- Used by Analytics V2 state endpoints for fast aggregated queries
-- Canonical tables: states, dispensaries, store_products, store_product_snapshots, brands
-- Drop existing view if it exists (for clean recreation)
DROP MATERIALIZED VIEW IF EXISTS mv_state_metrics;
-- Create materialized view with comprehensive state metrics
-- Schema verified via information_schema on 2025-12-06
-- Real columns used:
--   states: id, code, name, recreational_legal, medical_legal, rec_year, med_year
--   dispensaries: id, state_id (NO is_active column)
--   store_products: id, dispensary_id, brand_id, category_raw, price_rec, price_med, is_in_stock
--   store_product_snapshots: id, store_product_id, captured_at
--   brands: id (joined via sp.brand_id)
--
-- Snapshot metrics are pre-aggregated in a CTE rather than joined row-by-row:
-- joining store_product_snapshots directly repeats each store_products row once
-- per snapshot, which silently weights AVG / PERCENTILE / MIN / MAX price
-- metrics by crawl frequency. COUNT(DISTINCT ...) metrics were immune to that
-- fan-out; the price metrics were not.
CREATE MATERIALIZED VIEW mv_state_metrics AS
WITH snapshot_stats AS (
    -- One row per state: snapshot volume and crawl recency.
    SELECT
        d.state_id,
        COUNT(sps.id) AS total_snapshots,
        MAX(sps.captured_at) AS last_crawl_at,
        MIN(sps.captured_at) AS first_crawl_at
    FROM dispensaries d
    JOIN store_products sp ON sp.dispensary_id = d.id
    JOIN store_product_snapshots sps ON sps.store_product_id = sp.id
    GROUP BY d.state_id
)
SELECT
    s.id AS state_id,
    s.code AS state,
    s.name AS state_name,
    COALESCE(s.recreational_legal, FALSE) AS recreational_legal,
    COALESCE(s.medical_legal, FALSE) AS medical_legal,
    s.rec_year,
    s.med_year,
    -- Dispensary metrics
    COUNT(DISTINCT d.id) AS dispensary_count,
    -- Product metrics
    COUNT(DISTINCT sp.id) AS total_products,
    COUNT(DISTINCT sp.id) FILTER (WHERE sp.is_in_stock = TRUE) AS in_stock_products,
    COUNT(DISTINCT sp.id) FILTER (WHERE sp.is_in_stock = FALSE) AS out_of_stock_products,
    -- Brand metrics (using brand_id FK, not brand_name)
    COUNT(DISTINCT sp.brand_id) FILTER (WHERE sp.brand_id IS NOT NULL) AS unique_brands,
    -- Category metrics (using category_raw, not category)
    COUNT(DISTINCT sp.category_raw) FILTER (WHERE sp.category_raw IS NOT NULL) AS unique_categories,
    -- Pricing metrics (recreational) -- each product now counted exactly once
    AVG(sp.price_rec) FILTER (WHERE sp.price_rec IS NOT NULL AND sp.is_in_stock = TRUE) AS avg_price_rec,
    PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY sp.price_rec)
        FILTER (WHERE sp.price_rec IS NOT NULL AND sp.is_in_stock = TRUE) AS median_price_rec,
    MIN(sp.price_rec) FILTER (WHERE sp.price_rec IS NOT NULL AND sp.is_in_stock = TRUE) AS min_price_rec,
    MAX(sp.price_rec) FILTER (WHERE sp.price_rec IS NOT NULL AND sp.is_in_stock = TRUE) AS max_price_rec,
    -- Pricing metrics (medical)
    AVG(sp.price_med) FILTER (WHERE sp.price_med IS NOT NULL AND sp.is_in_stock = TRUE) AS avg_price_med,
    PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY sp.price_med)
        FILTER (WHERE sp.price_med IS NOT NULL AND sp.is_in_stock = TRUE) AS median_price_med,
    -- Snapshot/crawl metrics (from the pre-aggregated CTE; 0/NULL when no data)
    COALESCE(ss.total_snapshots, 0) AS total_snapshots,
    ss.last_crawl_at AS last_crawl_at,
    ss.first_crawl_at AS first_crawl_at,
    -- Data freshness
    CASE
        WHEN ss.last_crawl_at > NOW() - INTERVAL '24 hours' THEN 'fresh'
        WHEN ss.last_crawl_at > NOW() - INTERVAL '7 days' THEN 'recent'
        WHEN ss.last_crawl_at IS NOT NULL THEN 'stale'
        ELSE 'no_data'
    END AS data_freshness,
    -- Metadata
    NOW() AS refreshed_at
FROM states s
LEFT JOIN dispensaries d ON d.state_id = s.id
LEFT JOIN store_products sp ON sp.dispensary_id = d.id
LEFT JOIN snapshot_stats ss ON ss.state_id = s.id
GROUP BY s.id, s.code, s.name, s.recreational_legal, s.medical_legal, s.rec_year, s.med_year,
         ss.total_snapshots, ss.last_crawl_at, ss.first_crawl_at;
-- Create unique index on state code for fast lookups
-- (also required for REFRESH MATERIALIZED VIEW CONCURRENTLY)
CREATE UNIQUE INDEX IF NOT EXISTS mv_state_metrics_state_idx
ON mv_state_metrics (state);
-- Create index on state_id for joins
CREATE INDEX IF NOT EXISTS mv_state_metrics_state_id_idx
ON mv_state_metrics (state_id);
-- Create index for legal status filtering
CREATE INDEX IF NOT EXISTS mv_state_metrics_legal_idx
ON mv_state_metrics (recreational_legal, medical_legal);
-- Create index for data freshness queries
CREATE INDEX IF NOT EXISTS mv_state_metrics_freshness_idx
ON mv_state_metrics (data_freshness);
-- Comment on the view
COMMENT ON MATERIALIZED VIEW mv_state_metrics IS
'Aggregated state-level metrics for Analytics V2 endpoints. Refresh periodically with: REFRESH MATERIALIZED VIEW CONCURRENTLY mv_state_metrics;';
-- Record migration
-- NOTE(review): version '051' is also used by the canonical-schema migration;
-- with ON CONFLICT DO NOTHING this row is silently skipped if that one ran
-- first, leaving this migration unrecorded. Confirm the numbering / renumber.
INSERT INTO schema_migrations (version, name, applied_at)
VALUES ('051', 'create_mv_state_metrics', NOW())
ON CONFLICT (version) DO NOTHING;

View File

@@ -0,0 +1,96 @@
-- Migration 052: Add provider_data JSONB and frequently-queried columns
--
-- Adds hybrid storage for legacy data:
--   1. provider_data JSONB on both tables for all extra fields
--   2. Specific columns for frequently-queried fields
--
-- NOTE(review): several migration files in this series share the number 052;
-- confirm the intended apply order / renumber to keep versions unique.
-- ============================================================================
-- store_products: Add provider_data and queryable columns
-- ============================================================================
-- JSONB for all extra provider-specific data
ALTER TABLE store_products
ADD COLUMN IF NOT EXISTS provider_data JSONB;
-- Frequently-queried columns
ALTER TABLE store_products
ADD COLUMN IF NOT EXISTS strain_type TEXT;
ALTER TABLE store_products
ADD COLUMN IF NOT EXISTS medical_only BOOLEAN DEFAULT FALSE;
ALTER TABLE store_products
ADD COLUMN IF NOT EXISTS rec_only BOOLEAN DEFAULT FALSE;
ALTER TABLE store_products
ADD COLUMN IF NOT EXISTS brand_logo_url TEXT;
ALTER TABLE store_products
ADD COLUMN IF NOT EXISTS platform_dispensary_id TEXT;
-- Index for strain_type queries
CREATE INDEX IF NOT EXISTS idx_store_products_strain_type
ON store_products(strain_type)
WHERE strain_type IS NOT NULL;
-- Index for medical/rec filtering
CREATE INDEX IF NOT EXISTS idx_store_products_medical_rec
ON store_products(medical_only, rec_only);
-- GIN index for provider_data JSONB queries
CREATE INDEX IF NOT EXISTS idx_store_products_provider_data
ON store_products USING GIN (provider_data);
-- ============================================================================
-- store_product_snapshots: Add provider_data and queryable columns
-- ============================================================================
-- JSONB for all extra provider-specific data
ALTER TABLE store_product_snapshots
ADD COLUMN IF NOT EXISTS provider_data JSONB;
-- Frequently-queried columns
ALTER TABLE store_product_snapshots
ADD COLUMN IF NOT EXISTS featured BOOLEAN DEFAULT FALSE;
ALTER TABLE store_product_snapshots
ADD COLUMN IF NOT EXISTS is_below_threshold BOOLEAN DEFAULT FALSE;
ALTER TABLE store_product_snapshots
ADD COLUMN IF NOT EXISTS is_below_kiosk_threshold BOOLEAN DEFAULT FALSE;
-- Index for featured products
CREATE INDEX IF NOT EXISTS idx_snapshots_featured
ON store_product_snapshots(dispensary_id, featured)
WHERE featured = TRUE;
-- Index for low stock alerts
CREATE INDEX IF NOT EXISTS idx_snapshots_below_threshold
ON store_product_snapshots(dispensary_id, is_below_threshold)
WHERE is_below_threshold = TRUE;
-- GIN index for provider_data JSONB queries
CREATE INDEX IF NOT EXISTS idx_snapshots_provider_data
ON store_product_snapshots USING GIN (provider_data);
-- ============================================================================
-- Comments for documentation
-- ============================================================================
COMMENT ON COLUMN store_products.provider_data IS
'JSONB blob containing all provider-specific fields not in canonical columns (effects, terpenes, cannabinoids_v2, etc.)';
COMMENT ON COLUMN store_products.strain_type IS
'Cannabis strain type: Indica, Sativa, Hybrid, Indica-Hybrid, Sativa-Hybrid';
COMMENT ON COLUMN store_products.platform_dispensary_id IS
'Provider platform dispensary ID (e.g., Dutchie MongoDB ObjectId)';
COMMENT ON COLUMN store_product_snapshots.provider_data IS
'JSONB blob containing all provider-specific snapshot fields (options, kiosk data, etc.)';
COMMENT ON COLUMN store_product_snapshots.featured IS
'Whether product was featured/highlighted at capture time';
COMMENT ON COLUMN store_product_snapshots.is_below_threshold IS
'Whether product was below inventory threshold at capture time';

View File

@@ -0,0 +1,127 @@
-- ============================================================================
-- Migration 052: Add Cannabis Legalization Flags to States
-- ============================================================================
--
-- Purpose: Add recreational/medical cannabis legalization status and years
--          to the existing states table, then seed all 50 states + DC.
--
-- SAFETY RULES:
--   - Uses ADD COLUMN IF NOT EXISTS (idempotent)
--   - Uses INSERT ... ON CONFLICT (code) DO UPDATE (idempotent)
--   - NO DROP, DELETE, TRUNCATE, or destructive operations
--   - Safe to run multiple times
--
-- Run with:
--   psql "$DATABASE_URL" -f migrations/052_add_state_cannabis_flags.sql
--
-- ============================================================================
-- ============================================================================
-- SECTION 1: Add cannabis legalization columns
-- ============================================================================
ALTER TABLE states ADD COLUMN IF NOT EXISTS recreational_legal BOOLEAN;
ALTER TABLE states ADD COLUMN IF NOT EXISTS rec_year INTEGER;
ALTER TABLE states ADD COLUMN IF NOT EXISTS medical_legal BOOLEAN;
ALTER TABLE states ADD COLUMN IF NOT EXISTS med_year INTEGER;
COMMENT ON COLUMN states.recreational_legal IS 'Whether recreational cannabis is legal in this state';
COMMENT ON COLUMN states.rec_year IS 'Year recreational cannabis was legalized (NULL if not legal)';
COMMENT ON COLUMN states.medical_legal IS 'Whether medical cannabis is legal in this state';
COMMENT ON COLUMN states.med_year IS 'Year medical cannabis was legalized (NULL if not legal)';
-- ============================================================================
-- SECTION 2: Seed all 50 states + DC with cannabis legalization data
-- ============================================================================
-- Data sourced from state legalization records as of 2024
-- States ordered by medical legalization year, then alphabetically
-- NOTE(review): the INSERT relies on a unique constraint on states(code) for
-- ON CONFLICT (code) -- confirm it exists in this schema before running.
-- NOTE(review): some "medical" entries (e.g. TX, GA, IA) are low-THC/CBD-only
-- programs treated here as medical_legal = TRUE -- verify this simplification
-- matches how analytics consumers interpret the flag.
INSERT INTO states (code, name, timezone, recreational_legal, rec_year, medical_legal, med_year)
VALUES
    -- Recreational + Medical States (ordered by rec year)
    ('WA', 'Washington', 'America/Los_Angeles', TRUE, 2012, TRUE, 1998),
    ('CO', 'Colorado', 'America/Denver', TRUE, 2012, TRUE, 2000),
    ('AK', 'Alaska', 'America/Anchorage', TRUE, 2014, TRUE, 1998),
    ('OR', 'Oregon', 'America/Los_Angeles', TRUE, 2014, TRUE, 1998),
    ('DC', 'District of Columbia', 'America/New_York', TRUE, 2015, TRUE, 2011),
    ('CA', 'California', 'America/Los_Angeles', TRUE, 2016, TRUE, 1996),
    ('NV', 'Nevada', 'America/Los_Angeles', TRUE, 2016, TRUE, 1998),
    ('ME', 'Maine', 'America/New_York', TRUE, 2016, TRUE, 1999),
    ('MA', 'Massachusetts', 'America/New_York', TRUE, 2016, TRUE, 2012),
    ('MI', 'Michigan', 'America/Detroit', TRUE, 2018, TRUE, 2008),
    ('IL', 'Illinois', 'America/Chicago', TRUE, 2019, TRUE, 2013),
    ('AZ', 'Arizona', 'America/Phoenix', TRUE, 2020, TRUE, 2010),
    ('MT', 'Montana', 'America/Denver', TRUE, 2020, TRUE, 2004),
    ('NJ', 'New Jersey', 'America/New_York', TRUE, 2020, TRUE, 2010),
    ('VT', 'Vermont', 'America/New_York', TRUE, 2020, TRUE, 2004),
    ('CT', 'Connecticut', 'America/New_York', TRUE, 2021, TRUE, 2012),
    ('NM', 'New Mexico', 'America/Denver', TRUE, 2021, TRUE, 2007),
    ('NY', 'New York', 'America/New_York', TRUE, 2021, TRUE, 2014),
    ('VA', 'Virginia', 'America/New_York', TRUE, 2021, TRUE, 2020),
    ('MD', 'Maryland', 'America/New_York', TRUE, 2022, TRUE, 2013),
    ('MO', 'Missouri', 'America/Chicago', TRUE, 2022, TRUE, 2018),
    ('RI', 'Rhode Island', 'America/New_York', TRUE, 2022, TRUE, 2006),
    ('DE', 'Delaware', 'America/New_York', TRUE, 2023, TRUE, 2011),
    ('MN', 'Minnesota', 'America/Chicago', TRUE, 2023, TRUE, 2014),
    ('OH', 'Ohio', 'America/New_York', TRUE, 2023, TRUE, 2016),
    -- Medical Only States (no recreational)
    ('HI', 'Hawaii', 'Pacific/Honolulu', FALSE, NULL, TRUE, 2000),
    ('NH', 'New Hampshire', 'America/New_York', FALSE, NULL, TRUE, 2013),
    ('GA', 'Georgia', 'America/New_York', FALSE, NULL, TRUE, 2015),
    ('LA', 'Louisiana', 'America/Chicago', FALSE, NULL, TRUE, 2015),
    ('TX', 'Texas', 'America/Chicago', FALSE, NULL, TRUE, 2015),
    ('AR', 'Arkansas', 'America/Chicago', FALSE, NULL, TRUE, 2016),
    ('FL', 'Florida', 'America/New_York', FALSE, NULL, TRUE, 2016),
    ('ND', 'North Dakota', 'America/Chicago', FALSE, NULL, TRUE, 2016),
    ('PA', 'Pennsylvania', 'America/New_York', FALSE, NULL, TRUE, 2016),
    ('IA', 'Iowa', 'America/Chicago', FALSE, NULL, TRUE, 2017),
    ('WV', 'West Virginia', 'America/New_York', FALSE, NULL, TRUE, 2017),
    ('OK', 'Oklahoma', 'America/Chicago', FALSE, NULL, TRUE, 2018),
    ('UT', 'Utah', 'America/Denver', FALSE, NULL, TRUE, 2018),
    ('SD', 'South Dakota', 'America/Chicago', FALSE, NULL, TRUE, 2020),
    ('AL', 'Alabama', 'America/Chicago', FALSE, NULL, TRUE, 2021),
    ('MS', 'Mississippi', 'America/Chicago', FALSE, NULL, TRUE, 2022),
    ('KY', 'Kentucky', 'America/New_York', FALSE, NULL, TRUE, 2023),
    ('NE', 'Nebraska', 'America/Chicago', FALSE, NULL, TRUE, 2024),
    -- No Cannabis Programs (neither rec nor medical)
    ('ID', 'Idaho', 'America/Boise', FALSE, NULL, FALSE, NULL),
    ('IN', 'Indiana', 'America/Indiana/Indianapolis', FALSE, NULL, FALSE, NULL),
    ('KS', 'Kansas', 'America/Chicago', FALSE, NULL, FALSE, NULL),
    ('NC', 'North Carolina', 'America/New_York', FALSE, NULL, FALSE, NULL),
    ('SC', 'South Carolina', 'America/New_York', FALSE, NULL, FALSE, NULL),
    ('TN', 'Tennessee', 'America/Chicago', FALSE, NULL, FALSE, NULL),
    ('WI', 'Wisconsin', 'America/Chicago', FALSE, NULL, FALSE, NULL),
    ('WY', 'Wyoming', 'America/Denver', FALSE, NULL, FALSE, NULL)
ON CONFLICT (code) DO UPDATE SET
    name = EXCLUDED.name,
    -- Keep any timezone already configured; only fill when missing.
    timezone = COALESCE(states.timezone, EXCLUDED.timezone),
    recreational_legal = EXCLUDED.recreational_legal,
    rec_year = EXCLUDED.rec_year,
    medical_legal = EXCLUDED.medical_legal,
    med_year = EXCLUDED.med_year,
    updated_at = NOW();
-- ============================================================================
-- SECTION 3: Add indexes for common queries
-- ============================================================================
CREATE INDEX IF NOT EXISTS idx_states_recreational ON states(recreational_legal) WHERE recreational_legal = TRUE;
CREATE INDEX IF NOT EXISTS idx_states_medical ON states(medical_legal) WHERE medical_legal = TRUE;
-- ============================================================================
-- SECTION 4: Verification query (informational only)
-- ============================================================================
SELECT
    'Migration 052 completed successfully.' AS status,
    (SELECT COUNT(*) FROM states WHERE recreational_legal = TRUE) AS rec_states,
    (SELECT COUNT(*) FROM states WHERE medical_legal = TRUE AND recreational_legal = FALSE) AS med_only_states,
    (SELECT COUNT(*) FROM states WHERE medical_legal = FALSE OR medical_legal IS NULL) AS no_program_states,
    (SELECT COUNT(*) FROM states) AS total_states;

View File

@@ -0,0 +1,249 @@
-- ============================================================================
-- Migration 052: Hydration Schema Alignment
-- ============================================================================
--
-- Purpose: Add columns to canonical tables needed for hydration from
--          dutchie_products and dutchie_product_snapshots.
--
-- This migration ensures store_products and store_product_snapshots can
-- receive all data from the legacy dutchie_* tables.
--
-- SAFETY RULES:
--   - ALL columns use ADD COLUMN IF NOT EXISTS
--   - NO DROP, DELETE, TRUNCATE, or destructive operations
--   - Fully idempotent - safe to run multiple times
--
-- Run with:
--   psql "postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus" \
--     -f migrations/052_hydration_schema_alignment.sql
--
-- ============================================================================
-- ============================================================================
-- SECTION 1: store_products - Additional columns from dutchie_products
-- ============================================================================
-- Brand ID from Dutchie GraphQL (brandId field)
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS provider_brand_id VARCHAR(100);
-- Legacy dutchie_products.id for cross-reference during migration
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS legacy_dutchie_product_id INTEGER;
-- THC/CBD content as text (from dutchie_products.thc_content/cbd_content)
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS thc_content_text VARCHAR(50);
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS cbd_content_text VARCHAR(50);
-- Full cannabinoid data
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS cannabinoids JSONB;
-- Effects array
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS effects TEXT[];
-- Type (Flower, Edible, etc.) - maps to category in legacy
-- Already have category VARCHAR(100), but type may differ
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS product_type VARCHAR(100);
-- Additional images array
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS additional_images TEXT[];
-- Local image paths (from 032 migration)
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS local_image_url TEXT;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS local_image_thumb_url TEXT;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS local_image_medium_url TEXT;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS original_image_url TEXT;
-- Status from Dutchie (Active/Inactive)
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS platform_status VARCHAR(20);
-- Threshold flags
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS is_below_threshold BOOLEAN DEFAULT FALSE;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS is_below_kiosk_threshold BOOLEAN DEFAULT FALSE;
-- cName / slug from Dutchie
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS c_name VARCHAR(255);
-- Coming soon flag
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS is_coming_soon BOOLEAN DEFAULT FALSE;
-- Provider column already exists, ensure we have provider_dispensary_id
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS provider_dispensary_id VARCHAR(100);
-- Enterprise product ID (cross-store product linking)
-- Already exists from migration 051
-- Total quantity available (from POSMetaData.children)
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS total_quantity_available INTEGER;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS total_kiosk_quantity_available INTEGER;
-- Weight
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS weight VARCHAR(50);
-- Options array (size/weight options)
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS options TEXT[];
-- Measurements
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS measurements JSONB;
-- Raw data from last crawl
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS raw_data JSONB;
-- Source timestamps from Dutchie
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS source_created_at TIMESTAMPTZ;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS source_updated_at TIMESTAMPTZ;
-- ============================================================================
-- SECTION 2: store_product_snapshots - Additional columns for hydration
-- ============================================================================
-- Legacy dutchie_product_snapshot.id for cross-reference
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS legacy_snapshot_id INTEGER;
-- Legacy dutchie_product_id reference
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS legacy_dutchie_product_id INTEGER;
-- Options JSONB from dutchie_product_snapshots
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS options JSONB;
-- Provider dispensary ID
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS provider_dispensary_id VARCHAR(100);
-- Inventory details
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS total_quantity_available INTEGER;
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS total_kiosk_quantity_available INTEGER;
-- Platform status at time of snapshot
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS platform_status VARCHAR(20);
-- Threshold flags at time of snapshot
-- (no-ops if already added by the provider_data migration, also numbered 052)
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS is_below_threshold BOOLEAN DEFAULT FALSE;
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS is_below_kiosk_threshold BOOLEAN DEFAULT FALSE;
-- Special data
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS special_data JSONB;
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS special_name TEXT;
-- Pricing mode (rec/med)
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS pricing_type VARCHAR(10);
-- Crawl mode (mode_a/mode_b)
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS crawl_mode VARCHAR(20);
-- ============================================================================
-- SECTION 3: crawl_runs - Additional columns for hydration
-- ============================================================================
-- Legacy job ID references
ALTER TABLE crawl_runs ADD COLUMN IF NOT EXISTS legacy_dispensary_crawl_job_id INTEGER;
ALTER TABLE crawl_runs ADD COLUMN IF NOT EXISTS legacy_job_run_log_id INTEGER;
-- Schedule reference
ALTER TABLE crawl_runs ADD COLUMN IF NOT EXISTS schedule_id INTEGER;
-- Job type
ALTER TABLE crawl_runs ADD COLUMN IF NOT EXISTS job_type VARCHAR(50);
-- Brands found count
ALTER TABLE crawl_runs ADD COLUMN IF NOT EXISTS brands_found INTEGER DEFAULT 0;
-- Retry count
ALTER TABLE crawl_runs ADD COLUMN IF NOT EXISTS retry_count INTEGER DEFAULT 0;
-- ============================================================================
-- SECTION 4: INDEXES for hydration queries
-- ============================================================================
-- Index on legacy IDs for migration lookups
-- Partial (IS NOT NULL) so rows created after hydration cost nothing.
CREATE INDEX IF NOT EXISTS idx_store_products_legacy_id
ON store_products(legacy_dutchie_product_id)
WHERE legacy_dutchie_product_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_legacy_id
ON store_product_snapshots(legacy_snapshot_id)
WHERE legacy_snapshot_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_legacy_product_id
ON store_product_snapshots(legacy_dutchie_product_id)
WHERE legacy_dutchie_product_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_crawl_runs_legacy_job_id
ON crawl_runs(legacy_dispensary_crawl_job_id)
WHERE legacy_dispensary_crawl_job_id IS NOT NULL;
-- Index on provider_product_id for upserts
CREATE INDEX IF NOT EXISTS idx_store_products_provider_id
ON store_products(provider_product_id);
-- Composite index for canonical key lookup
-- NOTE(review): likely redundant with the unique constraint on
-- (dispensary_id, provider, provider_product_id) from migration 051, which
-- already backs these lookups with its own index -- confirm before keeping.
CREATE INDEX IF NOT EXISTS idx_store_products_canonical_key
ON store_products(dispensary_id, provider, provider_product_id);
-- ============================================================================
-- SECTION 5: Unique constraint for idempotent hydration
-- ============================================================================
-- Ensure unique snapshots per product per crawl
-- This prevents duplicate snapshots during re-runs
DO $$
BEGIN
    -- A UNIQUE constraint cannot carry a WHERE filter over nullable columns,
    -- so a partial unique index is used instead. CREATE ... IF NOT EXISTS
    -- already makes this idempotent; the previous pg_constraint pre-check was
    -- dead code (it looked up a constraint name that is never created, and
    -- indexes live in pg_class/pg_index, not pg_constraint).
    CREATE UNIQUE INDEX IF NOT EXISTS idx_snapshots_unique_per_crawl
    ON store_product_snapshots(store_product_id, crawl_run_id)
    WHERE store_product_id IS NOT NULL AND crawl_run_id IS NOT NULL;
EXCEPTION
    WHEN duplicate_table THEN NULL;  -- created concurrently by another session
    WHEN OTHERS THEN
        -- e.g. pre-existing duplicate (store_product_id, crawl_run_id) rows:
        -- report the missing uniqueness guarantee instead of hiding it, since
        -- hydration re-runs depend on this index to stay idempotent.
        RAISE WARNING 'could not create idx_snapshots_unique_per_crawl: %', SQLERRM;
END $$;
-- ============================================================================
-- SECTION 6: View for hydration status monitoring
-- ============================================================================
-- Percentage of each legacy table migrated into the canonical tables, keyed
-- off the legacy_* cross-reference columns added above.
-- NOTE(review): creating this view fails if the legacy dutchie_* tables do
-- not exist in the target database -- confirm they are present (or guard the
-- view creation) before running this migration elsewhere.
CREATE OR REPLACE VIEW v_hydration_status AS
SELECT
    'dutchie_products' AS source_table,
    (SELECT COUNT(*) FROM dutchie_products) AS source_count,
    (SELECT COUNT(*) FROM store_products WHERE legacy_dutchie_product_id IS NOT NULL) AS hydrated_count,
    ROUND(
        100.0 * (SELECT COUNT(*) FROM store_products WHERE legacy_dutchie_product_id IS NOT NULL) /
        NULLIF((SELECT COUNT(*) FROM dutchie_products), 0),  -- NULLIF avoids divide-by-zero on empty source
        2
    ) AS hydration_pct
UNION ALL
SELECT
    'dutchie_product_snapshots' AS source_table,
    (SELECT COUNT(*) FROM dutchie_product_snapshots) AS source_count,
    (SELECT COUNT(*) FROM store_product_snapshots WHERE legacy_snapshot_id IS NOT NULL) AS hydrated_count,
    ROUND(
        100.0 * (SELECT COUNT(*) FROM store_product_snapshots WHERE legacy_snapshot_id IS NOT NULL) /
        NULLIF((SELECT COUNT(*) FROM dutchie_product_snapshots), 0),
        2
    ) AS hydration_pct
UNION ALL
SELECT
    'dispensary_crawl_jobs' AS source_table,
    -- Only completed legacy jobs are expected to hydrate into crawl_runs.
    (SELECT COUNT(*) FROM dispensary_crawl_jobs WHERE status = 'completed') AS source_count,
    (SELECT COUNT(*) FROM crawl_runs WHERE legacy_dispensary_crawl_job_id IS NOT NULL) AS hydrated_count,
    ROUND(
        100.0 * (SELECT COUNT(*) FROM crawl_runs WHERE legacy_dispensary_crawl_job_id IS NOT NULL) /
        NULLIF((SELECT COUNT(*) FROM dispensary_crawl_jobs WHERE status = 'completed'), 0),
        2
    ) AS hydration_pct;
-- ============================================================================
-- DONE
-- ============================================================================
SELECT 'Migration 052 completed successfully. Hydration schema aligned.' AS status;

View File

@@ -0,0 +1,157 @@
-- ============================================================================
-- Migration 053: Analytics Engine Indexes
-- ============================================================================
--
-- Purpose: Add indexes optimized for analytics queries on canonical tables.
--          These indexes support price trends, brand penetration, category
--          growth, and state-level analytics.
--
-- NOTE(review): two migration files in this changeset are numbered 053 (this
-- one and 053_dutchie_discovery_schema.sql). If the migration runner tracks
-- applied migrations by version number, renumber one of them.
--
-- SAFETY RULES:
--   - Uses CREATE INDEX IF NOT EXISTS (idempotent)
--   - Uses ADD COLUMN IF NOT EXISTS for helper columns
--   - NO DROP, DELETE, TRUNCATE, or destructive operations
--   - Safe to run multiple times
--
-- Run with:
--   psql "$DATABASE_URL" -f migrations/053_analytics_indexes.sql
--
-- ============================================================================
-- ============================================================================
-- SECTION 1: Helper columns for analytics (if missing)
-- ============================================================================
-- Ensure store_products has brand_id for faster brand analytics joins
-- (brand_name exists, but a normalized brand_id helps)
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS brand_id INTEGER;
-- Index the helper column so the joins it exists for can actually use it.
-- Partial: newly-added column is all NULL until backfilled.
CREATE INDEX IF NOT EXISTS idx_products_brand_id
  ON store_products(brand_id)
  WHERE brand_id IS NOT NULL;
-- Ensure snapshots have category for time-series category analytics
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS category VARCHAR(100);
-- ============================================================================
-- SECTION 2: Price Analytics Indexes
-- ============================================================================
-- Price trends by store_product over time
CREATE INDEX IF NOT EXISTS idx_snapshots_product_price_time
  ON store_product_snapshots(store_product_id, captured_at DESC, price_rec, price_med)
  WHERE store_product_id IS NOT NULL;
-- Price by category over time (for category price trends)
CREATE INDEX IF NOT EXISTS idx_snapshots_category_price_time
  ON store_product_snapshots(category, captured_at DESC, price_rec)
  WHERE category IS NOT NULL;
-- Price changes detection (for volatility analysis)
CREATE INDEX IF NOT EXISTS idx_products_price_change
  ON store_products(last_price_change_at DESC)
  WHERE last_price_change_at IS NOT NULL;
-- ============================================================================
-- SECTION 3: Brand Penetration Indexes
-- ============================================================================
-- Brand by dispensary (for penetration counts)
CREATE INDEX IF NOT EXISTS idx_products_brand_dispensary
  ON store_products(brand_name, dispensary_id)
  WHERE brand_name IS NOT NULL;
-- Brand by state (for state-level brand analytics)
CREATE INDEX IF NOT EXISTS idx_products_brand_state
  ON store_products(brand_name, state_id)
  WHERE brand_name IS NOT NULL AND state_id IS NOT NULL;
-- Brand first/last seen (for penetration trends)
CREATE INDEX IF NOT EXISTS idx_products_brand_first_seen
  ON store_products(brand_name, first_seen_at)
  WHERE brand_name IS NOT NULL;
-- ============================================================================
-- SECTION 4: Category Analytics Indexes
-- ============================================================================
-- Category by state (for state-level category analytics)
CREATE INDEX IF NOT EXISTS idx_products_category_state
  ON store_products(category, state_id)
  WHERE category IS NOT NULL;
-- Category by dispensary
CREATE INDEX IF NOT EXISTS idx_products_category_dispensary
  ON store_products(category, dispensary_id)
  WHERE category IS NOT NULL;
-- Category first seen (for growth tracking)
CREATE INDEX IF NOT EXISTS idx_products_category_first_seen
  ON store_products(category, first_seen_at)
  WHERE category IS NOT NULL;
-- ============================================================================
-- SECTION 5: Store Analytics Indexes
-- ============================================================================
-- Products added/removed by dispensary
CREATE INDEX IF NOT EXISTS idx_products_dispensary_first_seen
  ON store_products(dispensary_id, first_seen_at DESC);
CREATE INDEX IF NOT EXISTS idx_products_dispensary_last_seen
  ON store_products(dispensary_id, last_seen_at DESC);
-- Stock status changes
CREATE INDEX IF NOT EXISTS idx_products_stock_change
  ON store_products(dispensary_id, last_stock_change_at DESC)
  WHERE last_stock_change_at IS NOT NULL;
-- ============================================================================
-- SECTION 6: State Analytics Indexes
-- ============================================================================
-- Dispensary count by state
CREATE INDEX IF NOT EXISTS idx_dispensaries_state_active
  ON dispensaries(state_id)
  WHERE state_id IS NOT NULL;
-- Products by state
CREATE INDEX IF NOT EXISTS idx_products_state_active
  ON store_products(state_id, is_in_stock)
  WHERE state_id IS NOT NULL;
-- Snapshots by state for time-series
CREATE INDEX IF NOT EXISTS idx_snapshots_state_time
  ON store_product_snapshots(state_id, captured_at DESC)
  WHERE state_id IS NOT NULL;
-- ============================================================================
-- SECTION 7: Composite indexes for common analytics queries
-- ============================================================================
-- Brand + Category + State (for market share calculations)
CREATE INDEX IF NOT EXISTS idx_products_brand_category_state
  ON store_products(brand_name, category, state_id)
  WHERE brand_name IS NOT NULL AND category IS NOT NULL;
-- Dispensary + Category + Brand (for store-level brand analysis)
CREATE INDEX IF NOT EXISTS idx_products_disp_cat_brand
  ON store_products(dispensary_id, category, brand_name)
  WHERE category IS NOT NULL;
-- Special pricing by category (for promo analysis)
CREATE INDEX IF NOT EXISTS idx_products_special_category
  ON store_products(category, is_on_special)
  WHERE is_on_special = TRUE;
-- ============================================================================
-- SECTION 8: Verification
-- ============================================================================
-- Counts are coarse sanity checks, not exact expectations: the LIKE patterns
-- also match indexes created by other migrations.
SELECT
  'Migration 053 completed successfully.' AS status,
  (SELECT COUNT(*) FROM pg_indexes WHERE indexname LIKE 'idx_products_%') AS product_indexes,
  (SELECT COUNT(*) FROM pg_indexes WHERE indexname LIKE 'idx_snapshots_%') AS snapshot_indexes;

View File

@@ -0,0 +1,346 @@
-- ============================================================================
-- Migration 053: Dutchie Discovery Schema
-- ============================================================================
--
-- Purpose: Create tables for Dutchie store discovery workflow.
--          Stores are discovered and held in staging tables until verified,
--          then promoted to the canonical dispensaries table.
--
-- Tables Created:
--   - dutchie_discovery_cities: City pages from Dutchie
--   - dutchie_discovery_locations: Individual store locations
--
-- SAFETY RULES:
--   - ALL tables use CREATE TABLE IF NOT EXISTS
--   - NO DROP, DELETE, TRUNCATE, or destructive operations
--   - Does NOT touch canonical dispensaries table
--   - Fully idempotent - safe to run multiple times
--
-- Run with:
--   psql "postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus" \
--     -f migrations/053_dutchie_discovery_schema.sql
--
-- ============================================================================
-- ============================================================================
-- SECTION 1: DUTCHIE_DISCOVERY_CITIES
-- ============================================================================
-- One row per Dutchie city page; each city can yield many store locations.
CREATE TABLE IF NOT EXISTS dutchie_discovery_cities (
    id              BIGSERIAL PRIMARY KEY,
    -- Platform identification (future-proof for other platforms)
    platform        TEXT NOT NULL DEFAULT 'dutchie',
    -- City identification
    city_name       TEXT NOT NULL,
    city_slug       TEXT NOT NULL,
    state_code      TEXT,                          -- 'AZ', 'CA', 'ON', etc.
    country_code    TEXT NOT NULL DEFAULT 'US',
    -- Crawl management
    last_crawled_at TIMESTAMPTZ,
    crawl_enabled   BOOLEAN NOT NULL DEFAULT TRUE,
    location_count  INTEGER,                       -- locations found in this city
    -- Metadata
    notes           TEXT,
    metadata        JSONB,
    created_at      TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at      TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Natural key for a city page. NOTE(review): state_code is nullable and
-- Postgres treats NULLs as distinct in UNIQUE constraints, so two rows with
-- NULL state_code and the same slug would not be deduplicated.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'dutchie_discovery_cities_unique'
    ) THEN
        ALTER TABLE dutchie_discovery_cities
            ADD CONSTRAINT dutchie_discovery_cities_unique
            UNIQUE (platform, country_code, state_code, city_slug);
    END IF;
EXCEPTION
    WHEN duplicate_object THEN NULL;  -- lost a race with a concurrent run
    WHEN OTHERS THEN NULL;            -- best-effort: never abort the migration here
END $$;
-- Lookup indexes for crawl scheduling and reporting.
CREATE INDEX IF NOT EXISTS idx_discovery_cities_platform
    ON dutchie_discovery_cities(platform);
CREATE INDEX IF NOT EXISTS idx_discovery_cities_state
    ON dutchie_discovery_cities(country_code, state_code);
CREATE INDEX IF NOT EXISTS idx_discovery_cities_crawl_enabled
    ON dutchie_discovery_cities(crawl_enabled)
    WHERE crawl_enabled = TRUE;
CREATE INDEX IF NOT EXISTS idx_discovery_cities_last_crawled
    ON dutchie_discovery_cities(last_crawled_at);
COMMENT ON TABLE dutchie_discovery_cities IS 'City pages from Dutchie for systematic store discovery.';
-- ============================================================================
-- SECTION 2: DUTCHIE_DISCOVERY_LOCATIONS
-- ============================================================================
-- Individual store locations discovered from Dutchie.
-- These are NOT promoted to canonical dispensaries until verified.
CREATE TABLE IF NOT EXISTS dutchie_discovery_locations (
id BIGSERIAL PRIMARY KEY,
-- Platform identification
platform TEXT NOT NULL DEFAULT 'dutchie',
platform_location_id TEXT NOT NULL, -- Dutchie's internal Location ID
platform_slug TEXT NOT NULL, -- URL slug for the store
platform_menu_url TEXT NOT NULL, -- Full menu URL
-- Store name
name TEXT NOT NULL,
-- Address components
raw_address TEXT,
address_line1 TEXT,
address_line2 TEXT,
city TEXT,
state_code TEXT, -- 'AZ', 'CA', 'ON', etc.
postal_code TEXT,
country_code TEXT, -- 'US' or 'CA'
-- Coordinates
latitude DOUBLE PRECISION,
longitude DOUBLE PRECISION,
timezone TEXT,
-- Discovery status
status TEXT NOT NULL DEFAULT 'discovered',
-- discovered: Just found, not yet verified
-- verified: Verified and promoted to canonical dispensaries
-- rejected: Manually rejected (e.g., duplicate, test store)
-- merged: Linked to existing canonical dispensary
-- Link to canonical dispensaries (only after verification)
dispensary_id INTEGER,
-- Reference to discovery city
discovery_city_id BIGINT,
-- Raw data from Dutchie
metadata JSONB,
notes TEXT,
-- Store capabilities (from Dutchie)
offers_delivery BOOLEAN,
offers_pickup BOOLEAN,
is_recreational BOOLEAN,
is_medical BOOLEAN,
-- Tracking
first_seen_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
last_seen_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
last_checked_at TIMESTAMPTZ,
verified_at TIMESTAMPTZ,
verified_by TEXT, -- User who verified
active BOOLEAN NOT NULL DEFAULT TRUE,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Add unique constraints if not exist
-- Primary dedupe key: one row per platform location id.
-- NOTE(review): the EXCEPTION handlers below (WHEN OTHERS THEN NULL) swallow
-- every error, not just concurrent-creation races; a genuinely failed ALTER
-- would pass silently. Consider logging via RAISE NOTICE instead.
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_constraint
WHERE conname = 'dutchie_discovery_locations_platform_id_unique'
) THEN
ALTER TABLE dutchie_discovery_locations
ADD CONSTRAINT dutchie_discovery_locations_platform_id_unique
UNIQUE (platform, platform_location_id);
END IF;
EXCEPTION
WHEN duplicate_object THEN NULL;
WHEN OTHERS THEN NULL;
END $$;
-- Secondary dedupe key on slug + geography.
-- NOTE(review): city, state_code and country_code are all nullable, and
-- Postgres treats NULLs as distinct in UNIQUE constraints, so rows with
-- unparsed addresses (NULL city/state) are not deduplicated by this key.
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_constraint
WHERE conname = 'dutchie_discovery_locations_slug_unique'
) THEN
ALTER TABLE dutchie_discovery_locations
ADD CONSTRAINT dutchie_discovery_locations_slug_unique
UNIQUE (platform, platform_slug, country_code, state_code, city);
END IF;
EXCEPTION
WHEN duplicate_object THEN NULL;
WHEN OTHERS THEN NULL;
END $$;
-- Add FK to dispensaries if not exists (allows NULL)
-- ON DELETE SET NULL: deleting a canonical dispensary un-links, not deletes,
-- its staging row.
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_constraint
WHERE conname = 'dutchie_discovery_locations_dispensary_fk'
) THEN
ALTER TABLE dutchie_discovery_locations
ADD CONSTRAINT dutchie_discovery_locations_dispensary_fk
FOREIGN KEY (dispensary_id) REFERENCES dispensaries(id) ON DELETE SET NULL;
END IF;
EXCEPTION
WHEN duplicate_object THEN NULL;
WHEN OTHERS THEN NULL;
END $$;
-- Add FK to discovery cities if not exists
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_constraint
WHERE conname = 'dutchie_discovery_locations_city_fk'
) THEN
ALTER TABLE dutchie_discovery_locations
ADD CONSTRAINT dutchie_discovery_locations_city_fk
FOREIGN KEY (discovery_city_id) REFERENCES dutchie_discovery_cities(id) ON DELETE SET NULL;
END IF;
EXCEPTION
WHEN duplicate_object THEN NULL;
WHEN OTHERS THEN NULL;
END $$;
-- Indexes
CREATE INDEX IF NOT EXISTS idx_discovery_locations_platform
ON dutchie_discovery_locations(platform);
CREATE INDEX IF NOT EXISTS idx_discovery_locations_status
ON dutchie_discovery_locations(status);
CREATE INDEX IF NOT EXISTS idx_discovery_locations_state
ON dutchie_discovery_locations(country_code, state_code);
CREATE INDEX IF NOT EXISTS idx_discovery_locations_city
ON dutchie_discovery_locations(city, state_code);
CREATE INDEX IF NOT EXISTS idx_discovery_locations_dispensary
ON dutchie_discovery_locations(dispensary_id)
WHERE dispensary_id IS NOT NULL;
-- Partial index serving the "pending review" queue (newest first).
CREATE INDEX IF NOT EXISTS idx_discovery_locations_discovered
ON dutchie_discovery_locations(status, first_seen_at DESC)
WHERE status = 'discovered';
CREATE INDEX IF NOT EXISTS idx_discovery_locations_active
ON dutchie_discovery_locations(active)
WHERE active = TRUE;
CREATE INDEX IF NOT EXISTS idx_discovery_locations_coords
ON dutchie_discovery_locations(latitude, longitude)
WHERE latitude IS NOT NULL AND longitude IS NOT NULL;
COMMENT ON TABLE dutchie_discovery_locations IS 'Discovered store locations from Dutchie. Held in staging until verified.';
-- ============================================================================
-- SECTION 3: ADD CANADIAN PROVINCES TO STATES TABLE
-- ============================================================================
-- Support for Canadian provinces (Ontario, BC, Alberta, etc.)
-- Province postal codes do not collide with any US state code, so reusing the
-- states table is safe. NOTE(review): assumes states has timezone, is_active,
-- crawl_enabled and updated_at columns — confirm against the base schema.
INSERT INTO states (code, name, timezone, is_active, crawl_enabled) VALUES
('AB', 'Alberta', 'America/Edmonton', TRUE, TRUE),
('BC', 'British Columbia', 'America/Vancouver', TRUE, TRUE),
('MB', 'Manitoba', 'America/Winnipeg', TRUE, TRUE),
('NB', 'New Brunswick', 'America/Moncton', TRUE, TRUE),
('NL', 'Newfoundland and Labrador', 'America/St_Johns', TRUE, TRUE),
('NS', 'Nova Scotia', 'America/Halifax', TRUE, TRUE),
('NT', 'Northwest Territories', 'America/Yellowknife', TRUE, TRUE),
('NU', 'Nunavut', 'America/Iqaluit', TRUE, TRUE),
('ON', 'Ontario', 'America/Toronto', TRUE, TRUE),
('PE', 'Prince Edward Island', 'America/Halifax', TRUE, TRUE),
('QC', 'Quebec', 'America/Montreal', TRUE, TRUE),
('SK', 'Saskatchewan', 'America/Regina', TRUE, TRUE),
('YT', 'Yukon', 'America/Whitehorse', TRUE, TRUE)
-- On re-run: refresh the display name, keep any operator-set timezone
-- (COALESCE prefers the existing value), and deliberately leave
-- is_active/crawl_enabled untouched so operator toggles survive.
ON CONFLICT (code) DO UPDATE SET
name = EXCLUDED.name,
timezone = COALESCE(states.timezone, EXCLUDED.timezone),
updated_at = NOW();
-- ============================================================================
-- SECTION 4: VIEWS FOR DISCOVERY MONITORING
-- ============================================================================
-- View: per-(platform, country, state, status) rollup of discovered locations,
-- including how many are already linked to a canonical dispensary.
CREATE OR REPLACE VIEW v_discovery_status AS
SELECT
    platform,
    country_code,
    state_code,
    status,
    COUNT(*)                                            AS location_count,
    COUNT(*) FILTER (WHERE dispensary_id IS NOT NULL)   AS linked_count,
    MIN(first_seen_at)                                  AS earliest_discovery,
    MAX(last_seen_at)                                   AS latest_activity
FROM dutchie_discovery_locations
GROUP BY platform, country_code, state_code, status
ORDER BY country_code, state_code, status;
-- View: the review queue — active locations still in 'discovered' status,
-- with the source city name joined in for context.
CREATE OR REPLACE VIEW v_discovery_pending AS
SELECT
    dl.id,
    dl.platform,
    dl.name,
    dl.city,
    dl.state_code,
    dl.country_code,
    dl.platform_menu_url,
    dl.first_seen_at,
    dl.last_seen_at,
    dl.offers_delivery,
    dl.offers_pickup,
    dl.is_recreational,
    dl.is_medical,
    dc.city_name AS discovery_city_name
FROM dutchie_discovery_locations dl
LEFT JOIN dutchie_discovery_cities dc ON dc.id = dl.discovery_city_id
WHERE dl.status = 'discovered'
  AND dl.active = TRUE
ORDER BY dl.state_code, dl.city, dl.name;
-- View: per-city crawl health — compares the location_count the crawler
-- reported against the rows actually present, broken down by status.
CREATE OR REPLACE VIEW v_discovery_cities_status AS
SELECT
    dc.id,
    dc.platform,
    dc.city_name,
    dc.state_code,
    dc.country_code,
    dc.crawl_enabled,
    dc.last_crawled_at,
    dc.location_count,
    COUNT(dl.id)                                        AS actual_locations,
    COUNT(dl.id) FILTER (WHERE dl.status = 'discovered') AS pending_count,
    COUNT(dl.id) FILTER (WHERE dl.status = 'verified')   AS verified_count,
    COUNT(dl.id) FILTER (WHERE dl.status = 'rejected')   AS rejected_count
FROM dutchie_discovery_cities dc
LEFT JOIN dutchie_discovery_locations dl ON dl.discovery_city_id = dc.id
GROUP BY dc.id, dc.platform, dc.city_name, dc.state_code, dc.country_code,
         dc.crawl_enabled, dc.last_crawled_at, dc.location_count
ORDER BY dc.country_code, dc.state_code, dc.city_name;
-- ============================================================================
-- DONE
-- ============================================================================
SELECT 'Migration 053 completed successfully. Discovery schema created.' AS status;

View File

@@ -0,0 +1,49 @@
-- Migration 054: Worker Metadata for Named Workforce
-- Adds worker_name and worker_role to job tables for displaying friendly worker identities
-- job_schedules: which named worker owns each schedule.
ALTER TABLE job_schedules
    ADD COLUMN IF NOT EXISTS worker_name VARCHAR(50),
    ADD COLUMN IF NOT EXISTS worker_role VARCHAR(100);
COMMENT ON COLUMN job_schedules.worker_name IS 'Friendly name for the worker (e.g., Alice, Henry, Bella, Oscar)';
COMMENT ON COLUMN job_schedules.worker_role IS 'Description of worker role (e.g., Store Discovery Worker, GraphQL Product Sync)';
-- job_run_logs: denormalized copy of the worker identity per run.
ALTER TABLE job_run_logs
    ADD COLUMN IF NOT EXISTS worker_name VARCHAR(50),
    ADD COLUMN IF NOT EXISTS run_role VARCHAR(100);
COMMENT ON COLUMN job_run_logs.worker_name IS 'Name of the worker that executed this run (copied from schedule)';
COMMENT ON COLUMN job_run_logs.run_role IS 'Role description for this specific run';
-- dispensary_crawl_jobs: track which named worker enqueued each job.
ALTER TABLE dispensary_crawl_jobs
    ADD COLUMN IF NOT EXISTS enqueued_by_worker VARCHAR(50);
COMMENT ON COLUMN dispensary_crawl_jobs.enqueued_by_worker IS 'Name of the worker that enqueued this job';
-- Seed worker identities on existing schedules. A single UPDATE ... FROM a
-- VALUES list; the worker_name IS NULL guard keeps this idempotent and
-- preserves any manual assignments.
UPDATE job_schedules AS js
SET worker_name = seed.wname,
    worker_role = seed.wrole
FROM (VALUES
    ('dutchie_az_product_crawl',  'Bella', 'GraphQL Product Sync'),
    ('dutchie_az_menu_detection', 'Henry', 'Entry Point Finder'),
    ('dutchie_store_discovery',   'Alice', 'Store Discovery'),
    ('analytics_refresh',         'Oscar', 'Analytics Refresh')
) AS seed(job, wname, wrole)
WHERE js.job_name = seed.job
  AND js.worker_name IS NULL;
-- Lookup indexes for filtering runs/jobs by worker.
CREATE INDEX IF NOT EXISTS idx_job_run_logs_worker_name ON job_run_logs(worker_name);
CREATE INDEX IF NOT EXISTS idx_dispensary_crawl_jobs_enqueued_by ON dispensary_crawl_jobs(enqueued_by_worker);

View File

@@ -0,0 +1,123 @@
-- Migration 055: Workforce System Enhancements
-- Adds visibility tracking, slug change tracking, and scope support for workers
-- ============================================================
-- 1. VISIBILITY TRACKING FOR BELLA (Product Sync)
-- ============================================================
-- Add visibility tracking to dutchie_products
ALTER TABLE dutchie_products
ADD COLUMN IF NOT EXISTS visibility_lost BOOLEAN DEFAULT FALSE,
ADD COLUMN IF NOT EXISTS visibility_lost_at TIMESTAMPTZ,
ADD COLUMN IF NOT EXISTS visibility_restored_at TIMESTAMPTZ;
COMMENT ON COLUMN dutchie_products.visibility_lost IS 'True if product disappeared from GraphQL results';
COMMENT ON COLUMN dutchie_products.visibility_lost_at IS 'When product was last marked as visibility lost';
COMMENT ON COLUMN dutchie_products.visibility_restored_at IS 'When product reappeared after being lost';
-- Index for visibility queries
-- NOTE(review): migration 057 creates an index with this exact name but a
-- different definition (on visibility_lost only). CREATE INDEX IF NOT EXISTS
-- matches by name, so whichever migration runs first wins and the other
-- definition is silently skipped. Give one of them a distinct name.
CREATE INDEX IF NOT EXISTS idx_dutchie_products_visibility_lost
ON dutchie_products(dispensary_id, visibility_lost)
WHERE visibility_lost = TRUE;
-- ============================================================
-- 2. SLUG CHANGE TRACKING FOR ALICE (Store Discovery)
-- ============================================================
-- Add slug change and retirement tracking to discovery locations
ALTER TABLE dutchie_discovery_locations
ADD COLUMN IF NOT EXISTS slug_changed_at TIMESTAMPTZ,
ADD COLUMN IF NOT EXISTS previous_slug VARCHAR(255),
ADD COLUMN IF NOT EXISTS retired_at TIMESTAMPTZ,
ADD COLUMN IF NOT EXISTS retirement_reason VARCHAR(100);
COMMENT ON COLUMN dutchie_discovery_locations.slug_changed_at IS 'When the platform slug was last changed';
COMMENT ON COLUMN dutchie_discovery_locations.previous_slug IS 'Previous slug before the last change';
COMMENT ON COLUMN dutchie_discovery_locations.retired_at IS 'When store was marked as retired/removed';
COMMENT ON COLUMN dutchie_discovery_locations.retirement_reason IS 'Reason for retirement (removed_from_source, closed, etc.)';
-- Index for finding retired stores
CREATE INDEX IF NOT EXISTS idx_dutchie_discovery_locations_retired
ON dutchie_discovery_locations(retired_at)
WHERE retired_at IS NOT NULL;
-- ============================================================
-- 3. ID RESOLUTION TRACKING FOR HENRY (Entry Point Finder)
-- ============================================================
-- Add resolution tracking to dispensaries
ALTER TABLE dispensaries
ADD COLUMN IF NOT EXISTS last_id_resolution_at TIMESTAMPTZ,
ADD COLUMN IF NOT EXISTS id_resolution_attempts INT DEFAULT 0,
ADD COLUMN IF NOT EXISTS id_resolution_error TEXT;
COMMENT ON COLUMN dispensaries.last_id_resolution_at IS 'When platform_dispensary_id was last resolved/attempted';
COMMENT ON COLUMN dispensaries.id_resolution_attempts IS 'Number of resolution attempts';
COMMENT ON COLUMN dispensaries.id_resolution_error IS 'Last error message from resolution attempt';
-- Index for finding stores needing resolution
-- Partial index matching Henry's work queue: dutchie stores with no platform id.
CREATE INDEX IF NOT EXISTS idx_dispensaries_needs_resolution
ON dispensaries(state, menu_type)
WHERE platform_dispensary_id IS NULL AND menu_type = 'dutchie';
-- ============================================================
-- 4. ENHANCED CITIES TABLE FOR ALICE
-- ============================================================
-- Add tracking columns to cities table
ALTER TABLE dutchie_discovery_cities
    ADD COLUMN IF NOT EXISTS state_name VARCHAR(100),
    ADD COLUMN IF NOT EXISTS discovered_at TIMESTAMPTZ DEFAULT NOW(),
    ADD COLUMN IF NOT EXISTS last_verified_at TIMESTAMPTZ,
    ADD COLUMN IF NOT EXISTS store_count_reported INT,
    ADD COLUMN IF NOT EXISTS store_count_actual INT;
COMMENT ON COLUMN dutchie_discovery_cities.state_name IS 'Full state name from source';
COMMENT ON COLUMN dutchie_discovery_cities.discovered_at IS 'When city was first discovered';
COMMENT ON COLUMN dutchie_discovery_cities.last_verified_at IS 'When city was last verified to exist';
COMMENT ON COLUMN dutchie_discovery_cities.store_count_reported IS 'Store count reported by source';
COMMENT ON COLUMN dutchie_discovery_cities.store_count_actual IS 'Actual store count from discovery';
-- ============================================================
-- 5. UPDATE WORKER ROLES (Standardize naming)
-- ============================================================
-- Migrate the human-readable roles seeded by migration 054 to the
-- standardized snake_case identifiers; guarded so manual edits survive.
UPDATE job_schedules SET worker_role = 'store_discovery'
WHERE worker_name = 'Alice' AND worker_role = 'Store Discovery';
UPDATE job_schedules SET worker_role = 'entry_point_finder'
WHERE worker_name = 'Henry' AND worker_role = 'Entry Point Finder';
UPDATE job_schedules SET worker_role = 'product_sync'
WHERE worker_name = 'Bella' AND worker_role = 'GraphQL Product Sync';
UPDATE job_schedules SET worker_role = 'analytics_refresh'
WHERE worker_name = 'Oscar' AND worker_role = 'Analytics Refresh';
-- ============================================================
-- 6. VISIBILITY EVENTS IN SNAPSHOTS (JSONB approach)
-- ============================================================
-- Add visibility_events array to product snapshots metadata
-- This will store: [{event_type, timestamp, worker_name}]
-- No schema change needed - we use existing metadata JSONB column
-- ============================================================
-- 7. INDEXES FOR WORKER QUERIES
-- ============================================================
-- Index for finding recently added stores (for Henry)
CREATE INDEX IF NOT EXISTS idx_dutchie_discovery_locations_created
    ON dutchie_discovery_locations(created_at DESC)
    WHERE active = TRUE;
-- Index for scope-based queries (by state)
CREATE INDEX IF NOT EXISTS idx_dispensaries_state_menu
    ON dispensaries(state, menu_type)
    WHERE menu_type IS NOT NULL;
-- Record migration.
-- Guarded with a table-existence check (same pattern as migration 056) so the
-- migration does not abort on databases without a schema_migrations table.
DO $$
BEGIN
    IF EXISTS (SELECT FROM information_schema.tables WHERE table_name = 'schema_migrations') THEN
        INSERT INTO schema_migrations (version, name, applied_at)
        VALUES (55, '055_workforce_enhancements', NOW())
        ON CONFLICT (version) DO NOTHING;
    END IF;
END $$;

View File

@@ -0,0 +1,110 @@
-- Migration 056: Fix Worker Metadata and Job Run Logs
--
-- This migration safely ensures all expected schema exists for:
-- 1. job_schedules - worker_name, worker_role columns
-- 2. job_run_logs - entire table creation if missing
--
-- Uses IF NOT EXISTS / ADD COLUMN IF NOT EXISTS for idempotency.
-- Safe to run on databases that already have some or all of these changes.
-- Defensively re-applies the changes from migrations 054/055 for databases
-- where those did not (fully) run.
-- ============================================================
-- 1. ADD MISSING COLUMNS TO job_schedules
-- ============================================================
ALTER TABLE job_schedules
ADD COLUMN IF NOT EXISTS worker_name VARCHAR(50),
ADD COLUMN IF NOT EXISTS worker_role VARCHAR(100);
COMMENT ON COLUMN job_schedules.worker_name IS 'Friendly name for the worker (e.g., Alice, Henry, Bella, Oscar)';
COMMENT ON COLUMN job_schedules.worker_role IS 'Description of worker role (e.g., store_discovery, product_sync)';
-- ============================================================
-- 2. CREATE job_run_logs TABLE IF NOT EXISTS
-- ============================================================
-- One row per scheduled-job execution. ON DELETE CASCADE: deleting a
-- schedule removes its run history.
CREATE TABLE IF NOT EXISTS job_run_logs (
id SERIAL PRIMARY KEY,
schedule_id INTEGER NOT NULL REFERENCES job_schedules(id) ON DELETE CASCADE,
job_name VARCHAR(100) NOT NULL,
status VARCHAR(20) NOT NULL, -- 'pending', 'running', 'success', 'error', 'partial'
started_at TIMESTAMPTZ,
completed_at TIMESTAMPTZ,
duration_ms INTEGER,
error_message TEXT,
-- Results summary
items_processed INTEGER DEFAULT 0,
items_succeeded INTEGER DEFAULT 0,
items_failed INTEGER DEFAULT 0,
-- Worker metadata (from scheduler.ts createRunLog function)
worker_name VARCHAR(50),
run_role VARCHAR(100),
-- Additional run details
metadata JSONB,
created_at TIMESTAMPTZ DEFAULT NOW()
);
-- Create indexes if they don't exist
CREATE INDEX IF NOT EXISTS idx_job_run_logs_schedule ON job_run_logs(schedule_id);
CREATE INDEX IF NOT EXISTS idx_job_run_logs_job_name ON job_run_logs(job_name);
CREATE INDEX IF NOT EXISTS idx_job_run_logs_status ON job_run_logs(status);
CREATE INDEX IF NOT EXISTS idx_job_run_logs_created ON job_run_logs(created_at);
CREATE INDEX IF NOT EXISTS idx_job_run_logs_worker_name ON job_run_logs(worker_name);
-- ============================================================
-- 3. ADD enqueued_by_worker TO dispensary_crawl_jobs IF EXISTS
-- ============================================================
DO $$
BEGIN
-- Only add column if dispensary_crawl_jobs table exists
IF EXISTS (SELECT FROM information_schema.tables WHERE table_name = 'dispensary_crawl_jobs') THEN
ALTER TABLE dispensary_crawl_jobs
ADD COLUMN IF NOT EXISTS enqueued_by_worker VARCHAR(50);
COMMENT ON COLUMN dispensary_crawl_jobs.enqueued_by_worker IS 'Name of the worker that enqueued this job';
CREATE INDEX IF NOT EXISTS idx_dispensary_crawl_jobs_enqueued_by
ON dispensary_crawl_jobs(enqueued_by_worker);
END IF;
END $$;
-- ============================================================
-- 4. SEED DEFAULT WORKER NAMES FOR EXISTING SCHEDULES
-- ============================================================
-- Uses the standardized snake_case roles from migration 055 section 5;
-- worker_name IS NULL guard keeps re-runs from clobbering assignments.
UPDATE job_schedules SET
worker_name = 'Bella',
worker_role = 'product_sync'
WHERE job_name = 'dutchie_az_product_crawl' AND worker_name IS NULL;
UPDATE job_schedules SET
worker_name = 'Henry',
worker_role = 'entry_point_finder'
WHERE job_name = 'dutchie_az_menu_detection' AND worker_name IS NULL;
UPDATE job_schedules SET
worker_name = 'Alice',
worker_role = 'store_discovery'
WHERE job_name = 'dutchie_store_discovery' AND worker_name IS NULL;
UPDATE job_schedules SET
worker_name = 'Oscar',
worker_role = 'analytics_refresh'
WHERE job_name = 'analytics_refresh' AND worker_name IS NULL;
-- ============================================================
-- 5. RECORD MIGRATION (if schema_migrations table exists)
-- ============================================================
DO $$
BEGIN
IF EXISTS (SELECT FROM information_schema.tables WHERE table_name = 'schema_migrations') THEN
INSERT INTO schema_migrations (version, name, applied_at)
VALUES (56, '056_fix_worker_and_run_logs', NOW())
ON CONFLICT (version) DO NOTHING;
END IF;
END $$;

View File

@@ -0,0 +1,64 @@
-- Migration 057: Add visibility tracking columns to dutchie_products
--
-- Supports Bella (Product Sync) worker visibility-loss tracking:
--   - visibility_lost: TRUE when product disappears from GraphQL feed
--   - visibility_lost_at: Timestamp when product first went missing
--   - visibility_restored_at: Timestamp when product reappeared
--
-- These columns enable tracking of products that temporarily or permanently
-- disappear from Dutchie GraphQL API responses.
-- ============================================================
-- 1. ADD VISIBILITY TRACKING COLUMNS TO dutchie_products
-- ============================================================
ALTER TABLE dutchie_products
    ADD COLUMN IF NOT EXISTS visibility_lost BOOLEAN DEFAULT FALSE,
    ADD COLUMN IF NOT EXISTS visibility_lost_at TIMESTAMPTZ,
    ADD COLUMN IF NOT EXISTS visibility_restored_at TIMESTAMPTZ;
COMMENT ON COLUMN dutchie_products.visibility_lost IS 'TRUE when product is missing from GraphQL feed';
COMMENT ON COLUMN dutchie_products.visibility_lost_at IS 'Timestamp when product first went missing from feed';
COMMENT ON COLUMN dutchie_products.visibility_restored_at IS 'Timestamp when product reappeared after being missing';
-- ============================================================
-- 2. CREATE INDEX FOR VISIBILITY QUERIES
-- ============================================================
-- Distinct name: migration 055 already uses idx_dutchie_products_visibility_lost
-- for an index on (dispensary_id, visibility_lost). Reusing that name here
-- would make CREATE INDEX IF NOT EXISTS silently skip this single-column
-- index whenever 055 ran first.
CREATE INDEX IF NOT EXISTS idx_dutchie_products_visibility_flag
    ON dutchie_products(visibility_lost)
    WHERE visibility_lost = TRUE;
CREATE INDEX IF NOT EXISTS idx_dutchie_products_visibility_lost_at
    ON dutchie_products(visibility_lost_at)
    WHERE visibility_lost_at IS NOT NULL;
-- ============================================================
-- 3. CREATE VIEW FOR VISIBILITY ANALYTICS
-- ============================================================
-- Per-dispensary rollup of visibility state; NULL visibility_lost (rows
-- predating the column backfill) counts as visible.
CREATE OR REPLACE VIEW v_visibility_summary AS
SELECT
    d.id AS dispensary_id,
    d.name AS dispensary_name,
    d.state,
    COUNT(dp.id) AS total_products,
    COUNT(dp.id) FILTER (WHERE dp.visibility_lost = TRUE) AS visibility_lost_count,
    COUNT(dp.id) FILTER (WHERE dp.visibility_lost = FALSE OR dp.visibility_lost IS NULL) AS visible_count,
    COUNT(dp.id) FILTER (WHERE dp.visibility_restored_at IS NOT NULL) AS restored_count,
    MAX(dp.visibility_lost_at) AS latest_loss_at,
    MAX(dp.visibility_restored_at) AS latest_restore_at
FROM dispensaries d
LEFT JOIN dutchie_products dp ON d.id = dp.dispensary_id
WHERE d.menu_type = 'dutchie'
GROUP BY d.id, d.name, d.state;
COMMENT ON VIEW v_visibility_summary IS 'Aggregated visibility metrics per dispensary for dashboard analytics';
-- ============================================================
-- 4. RECORD MIGRATION
-- ============================================================
-- Guarded with a table-existence check (same pattern as migration 056) so the
-- migration does not abort on databases without a schema_migrations table.
DO $$
BEGIN
    IF EXISTS (SELECT FROM information_schema.tables WHERE table_name = 'schema_migrations') THEN
        INSERT INTO schema_migrations (version, name, applied_at)
        VALUES (57, '057_visibility_tracking_columns', NOW())
        ON CONFLICT (version) DO NOTHING;
    END IF;
END $$;

View File

@@ -0,0 +1,46 @@
-- Migration 058: Add ID resolution tracking columns to dispensaries
--
-- Supports Henry (Entry Point Finder) worker tracking:
-- - id_resolution_attempts: Count of how many times we've tried to resolve platform ID
-- - last_id_resolution_at: When we last tried (matches code expectation)
-- - id_resolution_status: Current status (pending, resolved, failed)
-- - id_resolution_error: Last error message from resolution attempt
-- ============================================================
-- 1. ADD ID RESOLUTION COLUMNS TO dispensaries
-- ============================================================
-- IF NOT EXISTS keeps the migration idempotent / safe to re-run.
ALTER TABLE dispensaries
ADD COLUMN IF NOT EXISTS id_resolution_attempts INTEGER DEFAULT 0,
ADD COLUMN IF NOT EXISTS last_id_resolution_at TIMESTAMPTZ,
ADD COLUMN IF NOT EXISTS id_resolution_status VARCHAR(20) DEFAULT 'pending',
ADD COLUMN IF NOT EXISTS id_resolution_error TEXT;
COMMENT ON COLUMN dispensaries.id_resolution_attempts IS 'Number of attempts to resolve platform_dispensary_id';
COMMENT ON COLUMN dispensaries.last_id_resolution_at IS 'Timestamp of last ID resolution attempt';
COMMENT ON COLUMN dispensaries.id_resolution_status IS 'Status: pending, resolved, failed';
COMMENT ON COLUMN dispensaries.id_resolution_error IS 'Last error message from ID resolution attempt';
-- Additional columns needed by worker/scheduler
ALTER TABLE dispensaries
ADD COLUMN IF NOT EXISTS failed_at TIMESTAMPTZ,
ADD COLUMN IF NOT EXISTS failure_notes TEXT;
COMMENT ON COLUMN dispensaries.failed_at IS 'Timestamp when dispensary was marked as permanently failed';
COMMENT ON COLUMN dispensaries.failure_notes IS 'Notes about why dispensary was marked as failed';
-- ============================================================
-- 2. CREATE INDEX FOR RESOLUTION QUERIES
-- ============================================================
-- Partial index: workers only scan for unresolved ('pending') dispensaries,
-- so only those rows need indexing.
CREATE INDEX IF NOT EXISTS idx_dispensaries_id_resolution_status
ON dispensaries(id_resolution_status)
WHERE id_resolution_status = 'pending';
-- ============================================================
-- 3. RECORD MIGRATION
-- ============================================================
-- ON CONFLICT makes re-running this migration a no-op.
INSERT INTO schema_migrations (version, name, applied_at)
VALUES (58, '058_add_id_resolution_columns', NOW())
ON CONFLICT (version) DO NOTHING;

View File

@@ -0,0 +1,67 @@
-- Migration 059: Add missing columns to dispensary_crawl_jobs
--
-- Required for worker job processing:
-- - max_retries: Maximum retry attempts for a job
-- - retry_count: Current retry count
-- - worker_id: ID of worker processing the job
-- - locked_at: When the job was locked by a worker
-- - locked_by: Hostname of worker that locked the job
-- - updated_at: Last-modified timestamp maintained by the queue code
-- ============================================================
-- 1. ADD JOB QUEUE COLUMNS
-- ============================================================
-- IF NOT EXISTS keeps the migration idempotent / safe to re-run.
ALTER TABLE dispensary_crawl_jobs
ADD COLUMN IF NOT EXISTS max_retries INTEGER DEFAULT 3,
ADD COLUMN IF NOT EXISTS retry_count INTEGER DEFAULT 0,
ADD COLUMN IF NOT EXISTS worker_id VARCHAR(100),
ADD COLUMN IF NOT EXISTS locked_at TIMESTAMPTZ,
ADD COLUMN IF NOT EXISTS locked_by VARCHAR(100),
ADD COLUMN IF NOT EXISTS updated_at TIMESTAMPTZ DEFAULT NOW();
COMMENT ON COLUMN dispensary_crawl_jobs.max_retries IS 'Maximum number of retry attempts';
COMMENT ON COLUMN dispensary_crawl_jobs.retry_count IS 'Current retry count';
COMMENT ON COLUMN dispensary_crawl_jobs.worker_id IS 'ID of worker processing this job';
COMMENT ON COLUMN dispensary_crawl_jobs.locked_at IS 'When job was locked by worker';
COMMENT ON COLUMN dispensary_crawl_jobs.locked_by IS 'Hostname of worker that locked job';
-- Fix: updated_at was the only added column without a comment; document it
-- like its siblings so \d+ output stays self-describing.
COMMENT ON COLUMN dispensary_crawl_jobs.updated_at IS 'Timestamp of last update to this job row';
-- ============================================================
-- 2. CREATE INDEXES FOR JOB QUEUE QUERIES
-- ============================================================
-- Partial index for the dequeue query: pending jobs ordered by priority.
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_status_priority
ON dispensary_crawl_jobs(status, priority DESC)
WHERE status = 'pending';
-- Partial index for "what is this worker doing" lookups.
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_worker_id
ON dispensary_crawl_jobs(worker_id)
WHERE worker_id IS NOT NULL;
-- Partial index for stale-lock reaping (find jobs locked too long ago).
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_locked_at
ON dispensary_crawl_jobs(locked_at)
WHERE locked_at IS NOT NULL;
-- ============================================================
-- 3. CREATE QUEUE STATS VIEW
-- ============================================================
-- Single-row snapshot of queue health for the monitoring dashboard.
-- NOTE(review): failed_1h assumes completed_at is also set when a job is
-- marked 'failed' — confirm the worker code writes it on failure.
CREATE OR REPLACE VIEW v_queue_stats AS
SELECT
COUNT(*) FILTER (WHERE status = 'pending') AS pending_jobs,
COUNT(*) FILTER (WHERE status = 'running') AS running_jobs,
COUNT(*) FILTER (WHERE status = 'completed' AND completed_at > NOW() - INTERVAL '1 hour') AS completed_1h,
COUNT(*) FILTER (WHERE status = 'failed' AND completed_at > NOW() - INTERVAL '1 hour') AS failed_1h,
COUNT(DISTINCT worker_id) FILTER (WHERE status = 'running') AS active_workers,
ROUND((AVG(EXTRACT(EPOCH FROM (completed_at - started_at)))
FILTER (WHERE status = 'completed' AND completed_at > NOW() - INTERVAL '1 hour'))::numeric, 2) AS avg_duration_seconds
FROM dispensary_crawl_jobs;
COMMENT ON VIEW v_queue_stats IS 'Real-time queue statistics for monitoring dashboard';
-- ============================================================
-- 4. RECORD MIGRATION
-- ============================================================
-- ON CONFLICT makes re-running this migration a no-op.
INSERT INTO schema_migrations (version, name, applied_at)
VALUES (59, '059_job_queue_columns', NOW())
ON CONFLICT (version) DO NOTHING;

View File

@@ -1,6 +1,6 @@
{
"name": "dutchie-menus-backend",
"version": "1.0.0",
"version": "1.5.1",
"lockfileVersion": 3,
"requires": true,
"packages": {
@@ -575,6 +575,11 @@
"npm": "1.2.8000 || >= 1.4.16"
}
},
"node_modules/boolbase": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz",
"integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww=="
},
"node_modules/brace-expansion": {
"version": "1.1.12",
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.12.tgz",
@@ -685,6 +690,46 @@
"node": ">=6"
}
},
"node_modules/cheerio": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.1.2.tgz",
"integrity": "sha512-IkxPpb5rS/d1IiLbHMgfPuS0FgiWTtFIm/Nj+2woXDLTZ7fOT2eqzgYbdMlLweqlHbsZjxEChoVK+7iph7jyQg==",
"dependencies": {
"cheerio-select": "^2.1.0",
"dom-serializer": "^2.0.0",
"domhandler": "^5.0.3",
"domutils": "^3.2.2",
"encoding-sniffer": "^0.2.1",
"htmlparser2": "^10.0.0",
"parse5": "^7.3.0",
"parse5-htmlparser2-tree-adapter": "^7.1.0",
"parse5-parser-stream": "^7.1.2",
"undici": "^7.12.0",
"whatwg-mimetype": "^4.0.0"
},
"engines": {
"node": ">=20.18.1"
},
"funding": {
"url": "https://github.com/cheeriojs/cheerio?sponsor=1"
}
},
"node_modules/cheerio-select": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/cheerio-select/-/cheerio-select-2.1.0.tgz",
"integrity": "sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g==",
"dependencies": {
"boolbase": "^1.0.0",
"css-select": "^5.1.0",
"css-what": "^6.1.0",
"domelementtype": "^2.3.0",
"domhandler": "^5.0.3",
"domutils": "^3.0.1"
},
"funding": {
"url": "https://github.com/sponsors/fb55"
}
},
"node_modules/chownr": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/chownr/-/chownr-2.0.0.tgz",
@@ -876,6 +921,32 @@
"node-fetch": "^2.6.12"
}
},
"node_modules/css-select": {
"version": "5.2.2",
"resolved": "https://registry.npmjs.org/css-select/-/css-select-5.2.2.tgz",
"integrity": "sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw==",
"dependencies": {
"boolbase": "^1.0.0",
"css-what": "^6.1.0",
"domhandler": "^5.0.2",
"domutils": "^3.0.1",
"nth-check": "^2.0.1"
},
"funding": {
"url": "https://github.com/sponsors/fb55"
}
},
"node_modules/css-what": {
"version": "6.2.2",
"resolved": "https://registry.npmjs.org/css-what/-/css-what-6.2.2.tgz",
"integrity": "sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA==",
"engines": {
"node": ">= 6"
},
"funding": {
"url": "https://github.com/sponsors/fb55"
}
},
"node_modules/data-uri-to-buffer": {
"version": "6.0.2",
"resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz",
@@ -1002,6 +1073,57 @@
"resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.1232444.tgz",
"integrity": "sha512-pM27vqEfxSxRkTMnF+XCmxSEb6duO5R+t8A9DEEJgy4Wz2RVanje2mmj99B6A3zv2r/qGfYlOvYznUhuokizmg=="
},
"node_modules/dom-serializer": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz",
"integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==",
"dependencies": {
"domelementtype": "^2.3.0",
"domhandler": "^5.0.2",
"entities": "^4.2.0"
},
"funding": {
"url": "https://github.com/cheeriojs/dom-serializer?sponsor=1"
}
},
"node_modules/domelementtype": {
"version": "2.3.0",
"resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz",
"integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/fb55"
}
]
},
"node_modules/domhandler": {
"version": "5.0.3",
"resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz",
"integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==",
"dependencies": {
"domelementtype": "^2.3.0"
},
"engines": {
"node": ">= 4"
},
"funding": {
"url": "https://github.com/fb55/domhandler?sponsor=1"
}
},
"node_modules/domutils": {
"version": "3.2.2",
"resolved": "https://registry.npmjs.org/domutils/-/domutils-3.2.2.tgz",
"integrity": "sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw==",
"dependencies": {
"dom-serializer": "^2.0.0",
"domelementtype": "^2.3.0",
"domhandler": "^5.0.3"
},
"funding": {
"url": "https://github.com/fb55/domutils?sponsor=1"
}
},
"node_modules/dotenv": {
"version": "16.6.1",
"resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.6.1.tgz",
@@ -1052,6 +1174,29 @@
"node": ">= 0.8"
}
},
"node_modules/encoding-sniffer": {
"version": "0.2.1",
"resolved": "https://registry.npmjs.org/encoding-sniffer/-/encoding-sniffer-0.2.1.tgz",
"integrity": "sha512-5gvq20T6vfpekVtqrYQsSCFZ1wEg5+wW0/QaZMWkFr6BqD3NfKs0rLCx4rrVlSWJeZb5NBJgVLswK/w2MWU+Gw==",
"dependencies": {
"iconv-lite": "^0.6.3",
"whatwg-encoding": "^3.1.1"
},
"funding": {
"url": "https://github.com/fb55/encoding-sniffer?sponsor=1"
}
},
"node_modules/encoding-sniffer/node_modules/iconv-lite": {
"version": "0.6.3",
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz",
"integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==",
"dependencies": {
"safer-buffer": ">= 2.1.2 < 3.0.0"
},
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/end-of-stream": {
"version": "1.4.5",
"resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.5.tgz",
@@ -1060,6 +1205,17 @@
"once": "^1.4.0"
}
},
"node_modules/entities": {
"version": "4.5.0",
"resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz",
"integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==",
"engines": {
"node": ">=0.12"
},
"funding": {
"url": "https://github.com/fb55/entities?sponsor=1"
}
},
"node_modules/env-paths": {
"version": "2.2.1",
"resolved": "https://registry.npmjs.org/env-paths/-/env-paths-2.2.1.tgz",
@@ -1765,6 +1921,35 @@
"node": ">=16.0.0"
}
},
"node_modules/htmlparser2": {
"version": "10.0.0",
"resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-10.0.0.tgz",
"integrity": "sha512-TwAZM+zE5Tq3lrEHvOlvwgj1XLWQCtaaibSN11Q+gGBAS7Y1uZSWwXXRe4iF6OXnaq1riyQAPFOBtYc77Mxq0g==",
"funding": [
"https://github.com/fb55/htmlparser2?sponsor=1",
{
"type": "github",
"url": "https://github.com/sponsors/fb55"
}
],
"dependencies": {
"domelementtype": "^2.3.0",
"domhandler": "^5.0.3",
"domutils": "^3.2.1",
"entities": "^6.0.0"
}
},
"node_modules/htmlparser2/node_modules/entities": {
"version": "6.0.1",
"resolved": "https://registry.npmjs.org/entities/-/entities-6.0.1.tgz",
"integrity": "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==",
"engines": {
"node": ">=0.12"
},
"funding": {
"url": "https://github.com/fb55/entities?sponsor=1"
}
},
"node_modules/http-errors": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.0.tgz",
@@ -2530,6 +2715,17 @@
"set-blocking": "^2.0.0"
}
},
"node_modules/nth-check": {
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz",
"integrity": "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==",
"dependencies": {
"boolbase": "^1.0.0"
},
"funding": {
"url": "https://github.com/fb55/nth-check?sponsor=1"
}
},
"node_modules/object-assign": {
"version": "4.1.1",
"resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz",
@@ -2647,6 +2843,51 @@
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/parse5": {
"version": "7.3.0",
"resolved": "https://registry.npmjs.org/parse5/-/parse5-7.3.0.tgz",
"integrity": "sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw==",
"dependencies": {
"entities": "^6.0.0"
},
"funding": {
"url": "https://github.com/inikulin/parse5?sponsor=1"
}
},
"node_modules/parse5-htmlparser2-tree-adapter": {
"version": "7.1.0",
"resolved": "https://registry.npmjs.org/parse5-htmlparser2-tree-adapter/-/parse5-htmlparser2-tree-adapter-7.1.0.tgz",
"integrity": "sha512-ruw5xyKs6lrpo9x9rCZqZZnIUntICjQAd0Wsmp396Ul9lN/h+ifgVV1x1gZHi8euej6wTfpqX8j+BFQxF0NS/g==",
"dependencies": {
"domhandler": "^5.0.3",
"parse5": "^7.0.0"
},
"funding": {
"url": "https://github.com/inikulin/parse5?sponsor=1"
}
},
"node_modules/parse5-parser-stream": {
"version": "7.1.2",
"resolved": "https://registry.npmjs.org/parse5-parser-stream/-/parse5-parser-stream-7.1.2.tgz",
"integrity": "sha512-JyeQc9iwFLn5TbvvqACIF/VXG6abODeB3Fwmv/TGdLk2LfbWkaySGY72at4+Ty7EkPZj854u4CrICqNk2qIbow==",
"dependencies": {
"parse5": "^7.0.0"
},
"funding": {
"url": "https://github.com/inikulin/parse5?sponsor=1"
}
},
"node_modules/parse5/node_modules/entities": {
"version": "6.0.1",
"resolved": "https://registry.npmjs.org/entities/-/entities-6.0.1.tgz",
"integrity": "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==",
"engines": {
"node": ">=0.12"
},
"funding": {
"url": "https://github.com/fb55/entities?sponsor=1"
}
},
"node_modules/parseurl": {
"version": "1.3.3",
"resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz",
@@ -4040,6 +4281,14 @@
"through": "^2.3.8"
}
},
"node_modules/undici": {
"version": "7.16.0",
"resolved": "https://registry.npmjs.org/undici/-/undici-7.16.0.tgz",
"integrity": "sha512-QEg3HPMll0o3t2ourKwOeUAZ159Kn9mx5pnzHRQO8+Wixmh88YdZRiIwat0iNzNNXn0yoEtXJqFpyW7eM8BV7g==",
"engines": {
"node": ">=20.18.1"
}
},
"node_modules/undici-types": {
"version": "6.21.0",
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz",
@@ -4128,6 +4377,36 @@
"resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz",
"integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ=="
},
"node_modules/whatwg-encoding": {
"version": "3.1.1",
"resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz",
"integrity": "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==",
"dependencies": {
"iconv-lite": "0.6.3"
},
"engines": {
"node": ">=18"
}
},
"node_modules/whatwg-encoding/node_modules/iconv-lite": {
"version": "0.6.3",
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz",
"integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==",
"dependencies": {
"safer-buffer": ">= 2.1.2 < 3.0.0"
},
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/whatwg-mimetype": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-4.0.0.tgz",
"integrity": "sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==",
"engines": {
"node": ">=18"
}
},
"node_modules/whatwg-url": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz",

View File

@@ -1,15 +1,16 @@
{
"name": "dutchie-menus-backend",
"version": "1.0.0",
"version": "1.5.1",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "dutchie-menus-backend",
"version": "1.0.0",
"version": "1.5.1",
"dependencies": {
"axios": "^1.6.2",
"bcrypt": "^5.1.1",
"cheerio": "^1.1.2",
"cors": "^2.8.5",
"dotenv": "^16.3.1",
"express": "^4.18.2",
@@ -1015,6 +1016,11 @@
"npm": "1.2.8000 || >= 1.4.16"
}
},
"node_modules/boolbase": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz",
"integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww=="
},
"node_modules/brace-expansion": {
"version": "1.1.12",
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.12.tgz",
@@ -1125,6 +1131,46 @@
"node": ">=6"
}
},
"node_modules/cheerio": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.1.2.tgz",
"integrity": "sha512-IkxPpb5rS/d1IiLbHMgfPuS0FgiWTtFIm/Nj+2woXDLTZ7fOT2eqzgYbdMlLweqlHbsZjxEChoVK+7iph7jyQg==",
"dependencies": {
"cheerio-select": "^2.1.0",
"dom-serializer": "^2.0.0",
"domhandler": "^5.0.3",
"domutils": "^3.2.2",
"encoding-sniffer": "^0.2.1",
"htmlparser2": "^10.0.0",
"parse5": "^7.3.0",
"parse5-htmlparser2-tree-adapter": "^7.1.0",
"parse5-parser-stream": "^7.1.2",
"undici": "^7.12.0",
"whatwg-mimetype": "^4.0.0"
},
"engines": {
"node": ">=20.18.1"
},
"funding": {
"url": "https://github.com/cheeriojs/cheerio?sponsor=1"
}
},
"node_modules/cheerio-select": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/cheerio-select/-/cheerio-select-2.1.0.tgz",
"integrity": "sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g==",
"dependencies": {
"boolbase": "^1.0.0",
"css-select": "^5.1.0",
"css-what": "^6.1.0",
"domelementtype": "^2.3.0",
"domhandler": "^5.0.3",
"domutils": "^3.0.1"
},
"funding": {
"url": "https://github.com/sponsors/fb55"
}
},
"node_modules/chownr": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/chownr/-/chownr-2.0.0.tgz",
@@ -1316,6 +1362,32 @@
"node-fetch": "^2.6.12"
}
},
"node_modules/css-select": {
"version": "5.2.2",
"resolved": "https://registry.npmjs.org/css-select/-/css-select-5.2.2.tgz",
"integrity": "sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw==",
"dependencies": {
"boolbase": "^1.0.0",
"css-what": "^6.1.0",
"domhandler": "^5.0.2",
"domutils": "^3.0.1",
"nth-check": "^2.0.1"
},
"funding": {
"url": "https://github.com/sponsors/fb55"
}
},
"node_modules/css-what": {
"version": "6.2.2",
"resolved": "https://registry.npmjs.org/css-what/-/css-what-6.2.2.tgz",
"integrity": "sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA==",
"engines": {
"node": ">= 6"
},
"funding": {
"url": "https://github.com/sponsors/fb55"
}
},
"node_modules/data-uri-to-buffer": {
"version": "6.0.2",
"resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz",
@@ -1442,6 +1514,57 @@
"resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.1232444.tgz",
"integrity": "sha512-pM27vqEfxSxRkTMnF+XCmxSEb6duO5R+t8A9DEEJgy4Wz2RVanje2mmj99B6A3zv2r/qGfYlOvYznUhuokizmg=="
},
"node_modules/dom-serializer": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz",
"integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==",
"dependencies": {
"domelementtype": "^2.3.0",
"domhandler": "^5.0.2",
"entities": "^4.2.0"
},
"funding": {
"url": "https://github.com/cheeriojs/dom-serializer?sponsor=1"
}
},
"node_modules/domelementtype": {
"version": "2.3.0",
"resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz",
"integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/fb55"
}
]
},
"node_modules/domhandler": {
"version": "5.0.3",
"resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz",
"integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==",
"dependencies": {
"domelementtype": "^2.3.0"
},
"engines": {
"node": ">= 4"
},
"funding": {
"url": "https://github.com/fb55/domhandler?sponsor=1"
}
},
"node_modules/domutils": {
"version": "3.2.2",
"resolved": "https://registry.npmjs.org/domutils/-/domutils-3.2.2.tgz",
"integrity": "sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw==",
"dependencies": {
"dom-serializer": "^2.0.0",
"domelementtype": "^2.3.0",
"domhandler": "^5.0.3"
},
"funding": {
"url": "https://github.com/fb55/domutils?sponsor=1"
}
},
"node_modules/dotenv": {
"version": "16.6.1",
"resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.6.1.tgz",
@@ -1492,6 +1615,29 @@
"node": ">= 0.8"
}
},
"node_modules/encoding-sniffer": {
"version": "0.2.1",
"resolved": "https://registry.npmjs.org/encoding-sniffer/-/encoding-sniffer-0.2.1.tgz",
"integrity": "sha512-5gvq20T6vfpekVtqrYQsSCFZ1wEg5+wW0/QaZMWkFr6BqD3NfKs0rLCx4rrVlSWJeZb5NBJgVLswK/w2MWU+Gw==",
"dependencies": {
"iconv-lite": "^0.6.3",
"whatwg-encoding": "^3.1.1"
},
"funding": {
"url": "https://github.com/fb55/encoding-sniffer?sponsor=1"
}
},
"node_modules/encoding-sniffer/node_modules/iconv-lite": {
"version": "0.6.3",
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz",
"integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==",
"dependencies": {
"safer-buffer": ">= 2.1.2 < 3.0.0"
},
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/end-of-stream": {
"version": "1.4.5",
"resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.5.tgz",
@@ -1500,6 +1646,17 @@
"once": "^1.4.0"
}
},
"node_modules/entities": {
"version": "4.5.0",
"resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz",
"integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==",
"engines": {
"node": ">=0.12"
},
"funding": {
"url": "https://github.com/fb55/entities?sponsor=1"
}
},
"node_modules/env-paths": {
"version": "2.2.1",
"resolved": "https://registry.npmjs.org/env-paths/-/env-paths-2.2.1.tgz",
@@ -2219,6 +2376,35 @@
"node": ">=16.0.0"
}
},
"node_modules/htmlparser2": {
"version": "10.0.0",
"resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-10.0.0.tgz",
"integrity": "sha512-TwAZM+zE5Tq3lrEHvOlvwgj1XLWQCtaaibSN11Q+gGBAS7Y1uZSWwXXRe4iF6OXnaq1riyQAPFOBtYc77Mxq0g==",
"funding": [
"https://github.com/fb55/htmlparser2?sponsor=1",
{
"type": "github",
"url": "https://github.com/sponsors/fb55"
}
],
"dependencies": {
"domelementtype": "^2.3.0",
"domhandler": "^5.0.3",
"domutils": "^3.2.1",
"entities": "^6.0.0"
}
},
"node_modules/htmlparser2/node_modules/entities": {
"version": "6.0.1",
"resolved": "https://registry.npmjs.org/entities/-/entities-6.0.1.tgz",
"integrity": "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==",
"engines": {
"node": ">=0.12"
},
"funding": {
"url": "https://github.com/fb55/entities?sponsor=1"
}
},
"node_modules/http-errors": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.0.tgz",
@@ -2984,6 +3170,17 @@
"set-blocking": "^2.0.0"
}
},
"node_modules/nth-check": {
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz",
"integrity": "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==",
"dependencies": {
"boolbase": "^1.0.0"
},
"funding": {
"url": "https://github.com/fb55/nth-check?sponsor=1"
}
},
"node_modules/object-assign": {
"version": "4.1.1",
"resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz",
@@ -3101,6 +3298,51 @@
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/parse5": {
"version": "7.3.0",
"resolved": "https://registry.npmjs.org/parse5/-/parse5-7.3.0.tgz",
"integrity": "sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw==",
"dependencies": {
"entities": "^6.0.0"
},
"funding": {
"url": "https://github.com/inikulin/parse5?sponsor=1"
}
},
"node_modules/parse5-htmlparser2-tree-adapter": {
"version": "7.1.0",
"resolved": "https://registry.npmjs.org/parse5-htmlparser2-tree-adapter/-/parse5-htmlparser2-tree-adapter-7.1.0.tgz",
"integrity": "sha512-ruw5xyKs6lrpo9x9rCZqZZnIUntICjQAd0Wsmp396Ul9lN/h+ifgVV1x1gZHi8euej6wTfpqX8j+BFQxF0NS/g==",
"dependencies": {
"domhandler": "^5.0.3",
"parse5": "^7.0.0"
},
"funding": {
"url": "https://github.com/inikulin/parse5?sponsor=1"
}
},
"node_modules/parse5-parser-stream": {
"version": "7.1.2",
"resolved": "https://registry.npmjs.org/parse5-parser-stream/-/parse5-parser-stream-7.1.2.tgz",
"integrity": "sha512-JyeQc9iwFLn5TbvvqACIF/VXG6abODeB3Fwmv/TGdLk2LfbWkaySGY72at4+Ty7EkPZj854u4CrICqNk2qIbow==",
"dependencies": {
"parse5": "^7.0.0"
},
"funding": {
"url": "https://github.com/inikulin/parse5?sponsor=1"
}
},
"node_modules/parse5/node_modules/entities": {
"version": "6.0.1",
"resolved": "https://registry.npmjs.org/entities/-/entities-6.0.1.tgz",
"integrity": "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==",
"engines": {
"node": ">=0.12"
},
"funding": {
"url": "https://github.com/fb55/entities?sponsor=1"
}
},
"node_modules/parseurl": {
"version": "1.3.3",
"resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz",
@@ -4507,6 +4749,14 @@
"through": "^2.3.8"
}
},
"node_modules/undici": {
"version": "7.16.0",
"resolved": "https://registry.npmjs.org/undici/-/undici-7.16.0.tgz",
"integrity": "sha512-QEg3HPMll0o3t2ourKwOeUAZ159Kn9mx5pnzHRQO8+Wixmh88YdZRiIwat0iNzNNXn0yoEtXJqFpyW7eM8BV7g==",
"engines": {
"node": ">=20.18.1"
}
},
"node_modules/undici-types": {
"version": "6.21.0",
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz",
@@ -4595,6 +4845,36 @@
"resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz",
"integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ=="
},
"node_modules/whatwg-encoding": {
"version": "3.1.1",
"resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz",
"integrity": "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==",
"dependencies": {
"iconv-lite": "0.6.3"
},
"engines": {
"node": ">=18"
}
},
"node_modules/whatwg-encoding/node_modules/iconv-lite": {
"version": "0.6.3",
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz",
"integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==",
"dependencies": {
"safer-buffer": ">= 2.1.2 < 3.0.0"
},
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/whatwg-mimetype": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-4.0.0.tgz",
"integrity": "sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==",
"engines": {
"node": ">=18"
}
},
"node_modules/whatwg-url": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz",

View File

@@ -10,11 +10,18 @@
"migrate": "tsx src/db/migrate.ts",
"seed": "tsx src/db/seed.ts",
"migrate:az": "tsx src/dutchie-az/db/migrate.ts",
"health:az": "tsx -e \"import { healthCheck } from './src/dutchie-az/db/connection'; (async()=>{ const ok=await healthCheck(); console.log(ok?'AZ DB healthy':'AZ DB NOT reachable'); process.exit(ok?0:1); })();\""
"health:az": "tsx -e \"import { healthCheck } from './src/dutchie-az/db/connection'; (async()=>{ const ok=await healthCheck(); console.log(ok?'AZ DB healthy':'AZ DB NOT reachable'); process.exit(ok?0:1); })();\"",
"system:smoke-test": "tsx src/scripts/system-smoke-test.ts",
"discovery:dt:cities:auto": "tsx src/dutchie-az/discovery/discovery-dt-cities-auto.ts",
"discovery:dt:cities:manual": "tsx src/dutchie-az/discovery/discovery-dt-cities-manual-seed.ts",
"discovery:dt:locations": "tsx src/dutchie-az/discovery/discovery-dt-locations-from-cities.ts",
"backfill:legacy:canonical": "tsx src/scripts/backfill-legacy-to-canonical.ts",
"seed:dt:cities:bulk": "tsx src/scripts/seed-dt-cities-bulk.ts"
},
"dependencies": {
"axios": "^1.6.2",
"bcrypt": "^5.1.1",
"cheerio": "^1.1.2",
"cors": "^2.8.5",
"dotenv": "^16.3.1",
"express": "^4.18.2",

224
backend/setup-local.sh Executable file
View File

@@ -0,0 +1,224 @@
#!/bin/bash
# CannaiQ Local Development Setup (Idempotent)
#
# This script starts the complete local development environment:
# - PostgreSQL (cannaiq-postgres) on port 54320
# - Backend API on port 3010
# - CannaiQ Admin UI on port 8080
# - FindADispo Consumer UI on port 3001
# - Findagram Consumer UI on port 3002
#
# Usage: ./setup-local.sh
#
# URLs:
# Admin: http://localhost:8080/admin
# FindADispo: http://localhost:3001
# Findagram: http://localhost:3002
# Backend: http://localhost:3010
#
# Idempotent: Safe to run multiple times. Already-running services are left alone.
set -e

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# True if something is LISTENING on the given TCP port.
# A bare "lsof -i:PORT" also matches outbound *client* connections to that
# port, which could falsely report a service as "already running".
port_in_use() {
    lsof -iTCP:"$1" -sTCP:LISTEN >/dev/null 2>&1
}

echo -e "${BLUE}================================${NC}"
echo -e "${BLUE} CannaiQ Local Dev Setup${NC}"
echo -e "${BLUE}================================${NC}"
echo ""

# Check for required tools
command -v docker >/dev/null 2>&1 || { echo -e "${RED}Error: docker is required but not installed.${NC}" >&2; exit 1; }
command -v npm >/dev/null 2>&1 || { echo -e "${RED}Error: npm is required but not installed.${NC}" >&2; exit 1; }

# Get the script directory (this script lives in backend/; ROOT_DIR is the repo root)
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
ROOT_DIR="$SCRIPT_DIR/.."
cd "$SCRIPT_DIR"

# Step 1: PostgreSQL (via docker compose; skipped when container already up)
PG_RUNNING=$(docker ps --filter "name=cannaiq-postgres" --filter "status=running" -q)
if [ -n "$PG_RUNNING" ]; then
    echo -e "${GREEN}[1/6] PostgreSQL already running (cannaiq-postgres)${NC}"
else
    echo -e "${YELLOW}[1/6] Starting PostgreSQL (cannaiq-postgres)...${NC}"
    docker compose -f docker-compose.local.yml up -d cannaiq-postgres

    # Wait for PostgreSQL to accept connections before proceeding
    echo -e "${YELLOW} Waiting for PostgreSQL to be ready...${NC}"
    until docker exec cannaiq-postgres pg_isready -U cannaiq >/dev/null 2>&1; do
        sleep 1
    done
    echo -e "${GREEN} PostgreSQL ready on port 54320${NC}"
fi

# Step 2: Create storage directories (always safe to run)
mkdir -p storage/images/products
mkdir -p storage/images/brands
mkdir -p public/images

# Step 3: Backend API (local storage driver, port 3010)
if port_in_use 3010; then
    echo -e "${GREEN}[2/6] Backend already running on port 3010${NC}"
else
    echo -e "${YELLOW}[2/6] Starting Backend API...${NC}"

    # Install dependencies if needed
    if [ ! -d "node_modules" ]; then
        echo -e "${YELLOW} Installing backend dependencies...${NC}"
        npm install
    fi

    # Set environment for local mode
    export STORAGE_DRIVER=local
    export STORAGE_BASE_PATH=./storage
    export PORT=3010

    # Start backend in background; PID recorded for stop-local.sh
    npm run dev > /tmp/cannaiq-backend.log 2>&1 &
    BACKEND_PID=$!
    echo $BACKEND_PID > /tmp/cannaiq-backend.pid
    echo -e "${GREEN} Backend starting (PID: $BACKEND_PID)${NC}"

    # Wait briefly for backend to start
    sleep 3
fi

# Step 4: CannaiQ Admin UI (port 8080)
if port_in_use 8080; then
    echo -e "${GREEN}[3/6] CannaiQ Admin already running on port 8080${NC}"
else
    echo -e "${YELLOW}[3/6] Starting CannaiQ Admin UI...${NC}"
    cd "$ROOT_DIR/cannaiq"

    # Install dependencies if needed
    if [ ! -d "node_modules" ]; then
        echo -e "${YELLOW} Installing cannaiq dependencies...${NC}"
        npm install
    fi

    # Start frontend in background; PID recorded for stop-local.sh
    npm run dev:admin > /tmp/cannaiq-frontend.log 2>&1 &
    FRONTEND_PID=$!
    echo $FRONTEND_PID > /tmp/cannaiq-frontend.pid
    echo -e "${GREEN} CannaiQ Admin starting (PID: $FRONTEND_PID)${NC}"
    cd "$SCRIPT_DIR"
fi

# Step 5: FindADispo Consumer UI (port 3001)
if port_in_use 3001; then
    echo -e "${GREEN}[4/6] FindADispo already running on port 3001${NC}"
else
    echo -e "${YELLOW}[4/6] Starting FindADispo Consumer UI...${NC}"
    cd "$ROOT_DIR/findadispo/frontend"

    # Install dependencies if needed
    if [ ! -d "node_modules" ]; then
        echo -e "${YELLOW} Installing findadispo dependencies...${NC}"
        npm install
    fi

    # Start in background on port 3001
    PORT=3001 npm run dev > /tmp/findadispo-frontend.log 2>&1 &
    FINDADISPO_PID=$!
    echo $FINDADISPO_PID > /tmp/findadispo-frontend.pid
    echo -e "${GREEN} FindADispo starting (PID: $FINDADISPO_PID)${NC}"
    cd "$SCRIPT_DIR"
fi

# Step 6: Findagram Consumer UI (port 3002)
if port_in_use 3002; then
    echo -e "${GREEN}[5/6] Findagram already running on port 3002${NC}"
else
    echo -e "${YELLOW}[5/6] Starting Findagram Consumer UI...${NC}"
    cd "$ROOT_DIR/findagram/frontend"

    # Install dependencies if needed
    if [ ! -d "node_modules" ]; then
        echo -e "${YELLOW} Installing findagram dependencies...${NC}"
        npm install
    fi

    # Start in background on port 3002
    PORT=3002 npm run dev > /tmp/findagram-frontend.log 2>&1 &
    FINDAGRAM_PID=$!
    echo $FINDAGRAM_PID > /tmp/findagram-frontend.pid
    echo -e "${GREEN} Findagram starting (PID: $FINDAGRAM_PID)${NC}"
    cd "$SCRIPT_DIR"
fi

# Step 7: Health checks for newly started services
echo ""
echo -e "${YELLOW}[6/6] Checking service health...${NC}"

# If the backend port is not listening yet (i.e. it was just started and is
# still booting), poll the health endpoint for up to 15s before reporting.
if ! port_in_use 3010; then
    for i in {1..15}; do
        if curl -s http://localhost:3010/health > /dev/null 2>&1; then
            break
        fi
        sleep 1
    done
fi

if curl -s http://localhost:3010/health > /dev/null 2>&1; then
    echo -e "${GREEN} Backend API: OK (port 3010)${NC}"
else
    echo -e "${YELLOW} Backend API: Starting (check: tail -f /tmp/cannaiq-backend.log)${NC}"
fi

# Check CannaiQ Admin
if curl -s http://localhost:8080 > /dev/null 2>&1; then
    echo -e "${GREEN} CannaiQ Admin: OK (port 8080)${NC}"
else
    echo -e "${YELLOW} CannaiQ Admin: Starting (check: tail -f /tmp/cannaiq-frontend.log)${NC}"
fi

# Check FindADispo (brief grace period for Vite to bind)
sleep 2
if curl -s http://localhost:3001 > /dev/null 2>&1; then
    echo -e "${GREEN} FindADispo: OK (port 3001)${NC}"
else
    echo -e "${YELLOW} FindADispo: Starting (check: tail -f /tmp/findadispo-frontend.log)${NC}"
fi

# Check Findagram
if curl -s http://localhost:3002 > /dev/null 2>&1; then
    echo -e "${GREEN} Findagram: OK (port 3002)${NC}"
else
    echo -e "${YELLOW} Findagram: Starting (check: tail -f /tmp/findagram-frontend.log)${NC}"
fi

# Print final status
echo ""
echo -e "${BLUE}================================${NC}"
echo -e "${GREEN} Local Environment Ready${NC}"
echo -e "${BLUE}================================${NC}"
echo ""
echo -e " ${BLUE}Services:${NC}"
echo -e " Postgres: localhost:54320"
echo -e " Backend API: http://localhost:3010"
echo ""
echo -e " ${BLUE}Frontends:${NC}"
echo -e " CannaiQ Admin: http://localhost:8080/admin"
echo -e " FindADispo: http://localhost:3001"
echo -e " Findagram: http://localhost:3002"
echo ""
echo -e "${YELLOW}To stop services:${NC} ./stop-local.sh"
echo -e "${YELLOW}View logs:${NC}"
echo " Backend: tail -f /tmp/cannaiq-backend.log"
echo " CannaiQ: tail -f /tmp/cannaiq-frontend.log"
echo " FindADispo: tail -f /tmp/findadispo-frontend.log"
echo " Findagram: tail -f /tmp/findagram-frontend.log"
echo ""

View File

@@ -1,7 +1,7 @@
import { Request, Response, NextFunction } from 'express';
import jwt from 'jsonwebtoken';
import bcrypt from 'bcrypt';
import { pool } from '../db/migrate';
import { pool } from '../db/pool';
const JWT_SECRET = process.env.JWT_SECRET || 'change_this_in_production';

View File

@@ -0,0 +1,204 @@
# Canonical Hydration Pipeline - Runbook
## Overview
The Canonical Hydration Pipeline transforms data from the `dutchie_*` source tables into the provider-agnostic canonical tables (`store_products`, `store_product_snapshots`, `crawl_runs`). This enables:
- Unified analytics across multiple data providers
- Historical price/inventory tracking
- Provider-agnostic API endpoints
## Architecture
```
Source Tables (read-only):
dutchie_products → StoreProductNormalizer → store_products
dutchie_product_snapshots → SnapshotWriter → store_product_snapshots
dispensary_crawl_jobs → CrawlRunRecorder → crawl_runs
Orchestration:
CanonicalHydrationService coordinates all transformations
```
## Table Mappings
### dutchie_products → store_products
| Source Column | Target Column | Notes |
|---------------|---------------|-------|
| dispensary_id | dispensary_id | Direct mapping |
| external_product_id | provider_product_id | Canonical key |
| platform | provider | 'dutchie' |
| name | name_raw | Raw product name |
| brand_name | brand_name_raw | Raw brand name |
| type/subcategory | category_raw | Category info |
| price_rec (JSONB) | price_rec (DECIMAL) | Extracted from JSONB |
| price_med (JSONB) | price_med (DECIMAL) | Extracted from JSONB |
| thc | thc_percent | Parsed percentage |
| cbd | cbd_percent | Parsed percentage |
| stock_status | is_in_stock | Boolean conversion |
| total_quantity_available | stock_quantity | Direct mapping |
| primary_image_url | image_url | Direct mapping |
| created_at | first_seen_at | First seen timestamp |
| updated_at | last_seen_at | Last seen timestamp |
### Canonical Keys
- **store_products**: `(dispensary_id, provider, provider_product_id)`
- **store_product_snapshots**: `(store_product_id, crawl_run_id)`
- **crawl_runs**: `(source_job_type, source_job_id)`
## CLI Commands
### Check Hydration Status
```bash
# Overall status
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/backfill.ts --status
# Single dispensary status
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/backfill.ts --status --dispensary-id 112
```
### Products-Only Hydration
Use when source data has products but no historical snapshots/job records.
```bash
# Dry run (see what would be done)
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/products-only.ts --dry-run
# Hydrate single dispensary
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/products-only.ts --dispensary-id 112
# Hydrate all dispensaries
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/products-only.ts
```
### Backfill Hydration
Use when source data has historical job records in `dispensary_crawl_jobs`.
```bash
# Dry run
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/backfill.ts --dry-run
# Backfill with date range
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/backfill.ts --start-date 2024-01-01 --end-date 2024-12-31
# Backfill single dispensary
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/backfill.ts --dispensary-id 112
```
### Incremental Hydration
Use for ongoing hydration of new data.
```bash
# Single run
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/incremental.ts
# Continuous loop (runs every 60 seconds)
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/incremental.ts --loop
# Continuous loop with custom interval
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/incremental.ts --loop --interval 300
```
## Migration
Apply the schema migration before first use:
```bash
# Apply migration 050
psql "$DATABASE_URL" -f src/migrations/050_canonical_hydration_schema.sql
```
This migration adds:
- `source_job_type` and `source_job_id` columns to `crawl_runs`
- Unique index on `crawl_runs (source_job_type, source_job_id)`
- Unique index on `store_product_snapshots (store_product_id, crawl_run_id)`
- Performance indexes for hydration queries
## Idempotency
All hydration operations are idempotent:
- **crawl_runs**: ON CONFLICT updates existing records
- **store_products**: ON CONFLICT updates mutable fields
- **store_product_snapshots**: ON CONFLICT DO NOTHING
Re-running hydration is safe and will not create duplicates.
## Monitoring
### Check Canonical Data
```sql
-- Count canonical records
SELECT
(SELECT COUNT(*) FROM crawl_runs WHERE provider = 'dutchie') as crawl_runs,
(SELECT COUNT(*) FROM store_products WHERE provider = 'dutchie') as products,
(SELECT COUNT(*) FROM store_product_snapshots) as snapshots;
-- Products by dispensary
SELECT dispensary_id, COUNT(*) as products
FROM store_products
WHERE provider = 'dutchie'
GROUP BY dispensary_id
ORDER BY products DESC;
-- Recent crawl runs
SELECT id, dispensary_id, started_at, products_found, snapshots_written
FROM crawl_runs
ORDER BY started_at DESC
LIMIT 10;
```
### Verify Hydration Completeness
```sql
-- Compare source vs canonical product counts
SELECT
dp.dispensary_id,
COUNT(DISTINCT dp.id) as source_products,
COUNT(DISTINCT sp.id) as canonical_products
FROM dutchie_products dp
LEFT JOIN store_products sp
ON sp.dispensary_id = dp.dispensary_id
AND sp.provider = 'dutchie'
AND sp.provider_product_id = dp.external_product_id
GROUP BY dp.dispensary_id
ORDER BY dp.dispensary_id;
```
## Troubleshooting
### "invalid input syntax for type integer"
This usually means a type mismatch between source and target columns. The most common case is `brand_id` - the source has UUID strings but the target expects integers. The normalizer sets `brand_id = null` to handle this.
### "could not determine data type of parameter $1"
This indicates a batch insert issue with parameter indexing. Ensure each batch has its own parameter indexing starting from $1.
### Empty Snapshots
If `snapshotsWritten` is 0 but products were upserted:
1. Check if snapshots already exist for the crawl run (ON CONFLICT DO NOTHING)
2. Verify store_products exist with the correct dispensary_id and provider
## Performance
Typical performance metrics:
- ~1000 products/second for upsert
- ~2000 snapshots/second for insert
- 39 dispensaries with 37K products: ~17 seconds
For large backfills, use `--batch-size` to control memory usage.
## Known Limitations
1. **brand_id not mapped**: Source brand_id is UUID, target expects integer. Currently set to null.
2. **No historical snapshots**: If source has no `dutchie_product_snapshots`, use products-only mode which creates initial snapshots from current product state.
3. **Source jobs empty**: If `dispensary_crawl_jobs` is empty, use products-only mode.

View File

@@ -0,0 +1,170 @@
#!/usr/bin/env npx tsx
/**
* Backfill CLI - Historical data hydration
*
* Usage:
* npx tsx src/canonical-hydration/cli/backfill.ts [options]
*
* Options:
* --dispensary-id <id> Hydrate only a specific dispensary
* --start-date <date> Start date for backfill (ISO format)
* --end-date <date> End date for backfill (ISO format)
* --batch-size <n> Number of jobs to process per batch (default: 50)
* --dry-run Show what would be done without making changes
* --status Show hydration status and exit
*
* Examples:
* npx tsx src/canonical-hydration/cli/backfill.ts --status
* npx tsx src/canonical-hydration/cli/backfill.ts --dispensary-id 112
* npx tsx src/canonical-hydration/cli/backfill.ts --start-date 2024-01-01 --end-date 2024-12-31
* npx tsx src/canonical-hydration/cli/backfill.ts --dry-run
*/
import { Pool } from 'pg';
import { CanonicalHydrationService } from '../hydration-service';
import { HydrationOptions } from '../types';
/**
 * CLI entry point for backfill hydration.
 *
 * Parses command-line flags, then either prints hydration status
 * (--status) or runs a historical backfill via CanonicalHydrationService.
 * Exit code is 0 on success, 1 when the run produced errors or failed.
 *
 * We set process.exitCode instead of calling process.exit() inside the
 * try block: process.exit() terminates immediately and would skip the
 * finally clause, leaking the pg pool's open connections.
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2);

  // Parse command line arguments
  const options: HydrationOptions = {
    mode: 'backfill',
  };
  let showStatus = false;

  // Parse a numeric flag value; fail fast on malformed input so NaN
  // never reaches the database layer.
  const intArg = (flag: string, value: string | undefined): number => {
    const n = parseInt(value ?? '', 10);
    if (Number.isNaN(n)) {
      console.error(`Invalid value for ${flag}: ${value}`);
      process.exit(1);
    }
    return n;
  };

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    switch (arg) {
      case '--dispensary-id':
        options.dispensaryId = intArg(arg, args[++i]);
        break;
      case '--start-date':
        options.startDate = new Date(args[++i]);
        break;
      case '--end-date':
        options.endDate = new Date(args[++i]);
        break;
      case '--batch-size':
        options.batchSize = intArg(arg, args[++i]);
        break;
      case '--dry-run':
        options.dryRun = true;
        break;
      case '--status':
        showStatus = true;
        break;
      case '--help':
        console.log(`
Backfill CLI - Historical data hydration
Usage:
  npx tsx src/canonical-hydration/cli/backfill.ts [options]
Options:
  --dispensary-id <id>   Hydrate only a specific dispensary
  --start-date <date>    Start date for backfill (ISO format)
  --end-date <date>      End date for backfill (ISO format)
  --batch-size <n>       Number of jobs to process per batch (default: 50)
  --dry-run              Show what would be done without making changes
  --status               Show hydration status and exit
Examples:
  npx tsx src/canonical-hydration/cli/backfill.ts --status
  npx tsx src/canonical-hydration/cli/backfill.ts --dispensary-id 112
  npx tsx src/canonical-hydration/cli/backfill.ts --start-date 2024-01-01 --end-date 2024-12-31
  npx tsx src/canonical-hydration/cli/backfill.ts --dry-run
`);
        process.exit(0);
    }
  }

  // Connect to database
  const pool = new Pool({
    connectionString: process.env.DATABASE_URL,
  });

  const service = new CanonicalHydrationService({
    pool,
    logger: (msg) => console.log(`[${new Date().toISOString()}] ${msg}`),
  });

  try {
    if (showStatus) {
      // Show status; returning (not process.exit) lets finally close the pool.
      await printStatus(service, options.dispensaryId);
      return;
    }

    // Run backfill
    console.log('\n' + '═'.repeat(60));
    console.log('  CANONICAL HYDRATION - BACKFILL MODE');
    console.log('═'.repeat(60));
    console.log(`  Dispensary ID: ${options.dispensaryId || 'ALL'}`);
    console.log(`  Start Date: ${options.startDate?.toISOString() || 'N/A'}`);
    console.log(`  End Date: ${options.endDate?.toISOString() || 'N/A'}`);
    console.log(`  Batch Size: ${options.batchSize || 50}`);
    console.log(`  Dry Run: ${options.dryRun ? 'YES' : 'NO'}`);
    console.log('═'.repeat(60) + '\n');

    const result = await service.hydrate(options);

    console.log('\n' + '═'.repeat(60));
    console.log('  HYDRATION COMPLETE');
    console.log('═'.repeat(60));
    console.log(`  Crawl Runs Created: ${result.crawlRunsCreated}`);
    console.log(`  Crawl Runs Skipped: ${result.crawlRunsSkipped}`);
    console.log(`  Products Upserted: ${result.productsUpserted}`);
    console.log(`  Snapshots Written: ${result.snapshotsWritten}`);
    console.log(`  Duration: ${result.durationMs}ms`);
    console.log(`  Errors: ${result.errors.length}`);

    if (result.errors.length > 0) {
      console.log('\nErrors:');
      // Cap output at 10 errors to keep logs readable on large backfills.
      for (const error of result.errors.slice(0, 10)) {
        console.log(`  - ${error}`);
      }
      if (result.errors.length > 10) {
        console.log(`  ... and ${result.errors.length - 10} more`);
      }
    }
    console.log('═'.repeat(60) + '\n');

    process.exitCode = result.errors.length > 0 ? 1 : 0;
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    console.error('Fatal error:', message);
    process.exitCode = 1;
  } finally {
    // Always drain the pool so the process can exit cleanly.
    await pool.end();
  }
}

/**
 * Print hydration status, either for a single dispensary or overall.
 */
async function printStatus(
  service: CanonicalHydrationService,
  dispensaryId?: number
): Promise<void> {
  if (dispensaryId) {
    const status = await service.getHydrationStatus(dispensaryId);
    console.log(`\nHydration Status for Dispensary ${dispensaryId}:`);
    console.log('═'.repeat(50));
    console.log(`  Source Jobs (completed): ${status.sourceJobs}`);
    console.log(`  Hydrated Jobs: ${status.hydratedJobs}`);
    console.log(`  Unhydrated Jobs: ${status.unhydratedJobs}`);
    console.log('');
    console.log(`  Source Products: ${status.sourceProducts}`);
    console.log(`  Store Products: ${status.storeProducts}`);
    console.log('');
    console.log(`  Source Snapshots: ${status.sourceSnapshots}`);
    console.log(`  Store Snapshots: ${status.storeSnapshots}`);
  } else {
    const status = await service.getOverallStatus();
    console.log('\nOverall Hydration Status:');
    console.log('═'.repeat(50));
    console.log(`  Dispensaries with Data: ${status.dispensariesWithData}`);
    console.log('');
    console.log(`  Source Jobs (completed): ${status.totalSourceJobs}`);
    console.log(`  Hydrated Jobs: ${status.totalHydratedJobs}`);
    console.log(`  Unhydrated Jobs: ${status.totalSourceJobs - status.totalHydratedJobs}`);
    console.log('');
    console.log(`  Source Products: ${status.totalSourceProducts}`);
    console.log(`  Store Products: ${status.totalStoreProducts}`);
    console.log('');
    console.log(`  Source Snapshots: ${status.totalSourceSnapshots}`);
    console.log(`  Store Snapshots: ${status.totalStoreSnapshots}`);
  }
}

main();

View File

@@ -0,0 +1,142 @@
#!/usr/bin/env npx tsx
/**
* Incremental CLI - Ongoing data hydration
*
* Usage:
* npx tsx src/canonical-hydration/cli/incremental.ts [options]
*
* Options:
* --dispensary-id <id> Hydrate only a specific dispensary
* --batch-size <n> Number of jobs to process per batch (default: 100)
* --loop Run continuously in a loop
* --interval <seconds> Interval between loops (default: 60)
* --dry-run Show what would be done without making changes
*
* Examples:
* npx tsx src/canonical-hydration/cli/incremental.ts
* npx tsx src/canonical-hydration/cli/incremental.ts --dispensary-id 112
* npx tsx src/canonical-hydration/cli/incremental.ts --loop --interval 300
* npx tsx src/canonical-hydration/cli/incremental.ts --dry-run
*/
import { Pool } from 'pg';
import { CanonicalHydrationService } from '../hydration-service';
import { HydrationOptions } from '../types';
/**
 * CLI entry point for incremental hydration.
 *
 * Runs once by default, or continuously with --loop, hydrating any
 * unhydrated source jobs into the canonical tables.
 *
 * Fixes over the naive version:
 * - process.exitCode is set instead of process.exit() inside try, so the
 *   finally clause can close the pg pool (process.exit() skips finally).
 * - The inter-iteration sleep is interruptible, so SIGINT/SIGTERM does
 *   not have to wait out a long --interval before shutting down.
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2);

  // Parse command line arguments
  const options: HydrationOptions = {
    mode: 'incremental',
  };
  let loop = false;
  let intervalSeconds = 60;

  // Parse a numeric flag value; fail fast on malformed input.
  const intArg = (flag: string, value: string | undefined): number => {
    const n = parseInt(value ?? '', 10);
    if (Number.isNaN(n)) {
      console.error(`Invalid value for ${flag}: ${value}`);
      process.exit(1);
    }
    return n;
  };

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    switch (arg) {
      case '--dispensary-id':
        options.dispensaryId = intArg(arg, args[++i]);
        break;
      case '--batch-size':
        options.batchSize = intArg(arg, args[++i]);
        break;
      case '--loop':
        loop = true;
        break;
      case '--interval':
        intervalSeconds = intArg(arg, args[++i]);
        break;
      case '--dry-run':
        options.dryRun = true;
        break;
      case '--help':
        console.log(`
Incremental CLI - Ongoing data hydration
Usage:
  npx tsx src/canonical-hydration/cli/incremental.ts [options]
Options:
  --dispensary-id <id>   Hydrate only a specific dispensary
  --batch-size <n>       Number of jobs to process per batch (default: 100)
  --loop                 Run continuously in a loop
  --interval <seconds>   Interval between loops (default: 60)
  --dry-run              Show what would be done without making changes
Examples:
  npx tsx src/canonical-hydration/cli/incremental.ts
  npx tsx src/canonical-hydration/cli/incremental.ts --dispensary-id 112
  npx tsx src/canonical-hydration/cli/incremental.ts --loop --interval 300
  npx tsx src/canonical-hydration/cli/incremental.ts --dry-run
`);
        process.exit(0);
    }
  }

  // Connect to database
  const pool = new Pool({
    connectionString: process.env.DATABASE_URL,
  });

  const service = new CanonicalHydrationService({
    pool,
    logger: (msg) => console.log(`[${new Date().toISOString()}] ${msg}`),
  });

  const log = (msg: string) => console.log(`[${new Date().toISOString()}] ${msg}`);

  // Graceful shutdown: flip the flag AND cut any in-progress sleep short.
  let running = true;
  let wakeFromSleep: (() => void) | null = null;

  // Sleep that can be resolved early by a shutdown signal; the pending
  // timer is cleared so it does not keep the event loop alive.
  const interruptibleSleep = (ms: number): Promise<void> =>
    new Promise((resolve) => {
      const timer = setTimeout(() => {
        wakeFromSleep = null;
        resolve();
      }, ms);
      wakeFromSleep = () => {
        clearTimeout(timer);
        wakeFromSleep = null;
        resolve();
      };
    });

  const shutdown = (signal: string) => {
    log(`Received ${signal}, shutting down...`);
    running = false;
    wakeFromSleep?.();
  };
  process.on('SIGINT', () => shutdown('SIGINT'));
  process.on('SIGTERM', () => shutdown('SIGTERM'));

  try {
    console.log('\n' + '═'.repeat(60));
    console.log('  CANONICAL HYDRATION - INCREMENTAL MODE');
    console.log('═'.repeat(60));
    console.log(`  Dispensary ID: ${options.dispensaryId || 'ALL'}`);
    console.log(`  Batch Size: ${options.batchSize || 100}`);
    console.log(`  Loop Mode: ${loop ? 'YES' : 'NO'}`);
    if (loop) {
      console.log(`  Interval: ${intervalSeconds}s`);
    }
    console.log(`  Dry Run: ${options.dryRun ? 'YES' : 'NO'}`);
    console.log('═'.repeat(60) + '\n');

    do {
      const result = await service.hydrate(options);
      log(`Hydration complete: ${result.crawlRunsCreated} runs, ${result.productsUpserted} products, ${result.snapshotsWritten} snapshots (${result.durationMs}ms)`);

      if (result.errors.length > 0) {
        log(`Errors: ${result.errors.length}`);
        for (const error of result.errors.slice(0, 5)) {
          log(`  - ${error}`);
        }
      }

      if (loop && running) {
        log(`Sleeping for ${intervalSeconds}s...`);
        await interruptibleSleep(intervalSeconds * 1000);
      }
    } while (loop && running);

    log('Incremental hydration completed');
    // exitCode defaults to 0; finally below drains the pool before exit.
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    console.error('Fatal error:', message);
    process.exitCode = 1;
  } finally {
    await pool.end();
  }
}

main();

View File

@@ -0,0 +1,113 @@
#!/usr/bin/env npx tsx
/**
* Products-Only Hydration CLI
*
* Used when there are no historical job records - creates synthetic crawl runs
* from current product data.
*
* Usage:
* npx tsx src/canonical-hydration/cli/products-only.ts [options]
*
* Options:
* --dispensary-id <id> Hydrate only a specific dispensary
* --dry-run Show what would be done without making changes
*
* Examples:
* npx tsx src/canonical-hydration/cli/products-only.ts
* npx tsx src/canonical-hydration/cli/products-only.ts --dispensary-id 112
* npx tsx src/canonical-hydration/cli/products-only.ts --dry-run
*/
import { Pool } from 'pg';
import { CanonicalHydrationService } from '../hydration-service';
/**
 * CLI entry point for products-only hydration.
 *
 * Used when there are no historical job records: creates synthetic crawl
 * runs from current product data. Exit code is 0 on success, 1 when the
 * run produced errors or failed fatally.
 *
 * process.exitCode is used instead of process.exit() inside the try
 * block so the finally clause can close the pg pool before exit
 * (process.exit() terminates immediately and skips finally).
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2);

  // Parse command line arguments
  let dispensaryId: number | undefined;
  let dryRun = false;

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    switch (arg) {
      case '--dispensary-id': {
        // Fail fast on malformed input so NaN never reaches the DB layer.
        const n = parseInt(args[++i] ?? '', 10);
        if (Number.isNaN(n)) {
          console.error(`Invalid value for --dispensary-id: ${args[i]}`);
          process.exit(1);
        }
        dispensaryId = n;
        break;
      }
      case '--dry-run':
        dryRun = true;
        break;
      case '--help':
        console.log(`
Products-Only Hydration CLI
Used when there are no historical job records - creates synthetic crawl runs
from current product data.
Usage:
  npx tsx src/canonical-hydration/cli/products-only.ts [options]
Options:
  --dispensary-id <id>   Hydrate only a specific dispensary
  --dry-run              Show what would be done without making changes
Examples:
  npx tsx src/canonical-hydration/cli/products-only.ts
  npx tsx src/canonical-hydration/cli/products-only.ts --dispensary-id 112
  npx tsx src/canonical-hydration/cli/products-only.ts --dry-run
`);
        process.exit(0);
    }
  }

  // Connect to database
  const pool = new Pool({
    connectionString: process.env.DATABASE_URL,
  });

  const service = new CanonicalHydrationService({
    pool,
    logger: (msg) => console.log(`[${new Date().toISOString()}] ${msg}`),
  });

  try {
    console.log('\n' + '═'.repeat(60));
    console.log('  CANONICAL HYDRATION - PRODUCTS-ONLY MODE');
    console.log('═'.repeat(60));
    console.log(`  Dispensary ID: ${dispensaryId || 'ALL'}`);
    console.log(`  Dry Run: ${dryRun ? 'YES' : 'NO'}`);
    console.log('═'.repeat(60) + '\n');

    const result = await service.hydrateProductsOnly({ dispensaryId, dryRun });

    console.log('\n' + '═'.repeat(60));
    console.log('  HYDRATION COMPLETE');
    console.log('═'.repeat(60));
    console.log(`  Crawl Runs Created: ${result.crawlRunsCreated}`);
    console.log(`  Crawl Runs Skipped: ${result.crawlRunsSkipped}`);
    console.log(`  Products Upserted: ${result.productsUpserted}`);
    console.log(`  Snapshots Written: ${result.snapshotsWritten}`);
    console.log(`  Duration: ${result.durationMs}ms`);
    console.log(`  Errors: ${result.errors.length}`);

    if (result.errors.length > 0) {
      console.log('\nErrors:');
      // Cap output at 10 errors to keep logs readable.
      for (const error of result.errors.slice(0, 10)) {
        console.log(`  - ${error}`);
      }
      if (result.errors.length > 10) {
        console.log(`  ... and ${result.errors.length - 10} more`);
      }
    }
    console.log('═'.repeat(60) + '\n');

    process.exitCode = result.errors.length > 0 ? 1 : 0;
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    console.error('Fatal error:', message);
    process.exitCode = 1;
  } finally {
    // Always drain the pool so the process can exit cleanly.
    await pool.end();
  }
}

main();

View File

@@ -0,0 +1,226 @@
/**
* CrawlRunRecorder
* Records crawl runs from source job tables (dispensary_crawl_jobs) to canonical crawl_runs table
*/
import { Pool, PoolClient } from 'pg';
import { SourceJob, CrawlRun, ServiceContext, SourceJobType } from './types';
export class CrawlRunRecorder {
  // Shared pg pool, owned by the caller; never closed here.
  private pool: Pool;
  // Injected logger (defaults to console.log).
  // NOTE(review): currently unused inside this class — kept for parity with
  // the other hydration services; confirm before removing.
  private log: (message: string) => void;

  constructor(ctx: ServiceContext) {
    this.pool = ctx.pool;
    this.log = ctx.logger || console.log;
  }

  /**
   * Record a single crawl run from a source job
   * Uses ON CONFLICT to ensure idempotency
   *
   * Only jobs with status 'completed' are recorded; anything else returns
   * null so callers can count it as skipped. Re-running is safe: the
   * partial unique index on (source_job_type, source_job_id) routes a
   * duplicate into DO UPDATE instead of creating a second row.
   *
   * @param sourceJob     Completed job row from the source job table.
   * @param sourceJobType Which source table the job came from.
   * @returns crawl_runs.id of the inserted/updated row, or null if skipped.
   */
  async recordCrawlRun(
    sourceJob: SourceJob,
    sourceJobType: SourceJobType = 'dispensary_crawl_jobs'
  ): Promise<number | null> {
    // Skip jobs that aren't completed successfully
    if (sourceJob.status !== 'completed') {
      return null;
    }

    const crawlRun: Partial<CrawlRun> = {
      dispensary_id: sourceJob.dispensary_id,
      provider: 'dutchie', // Source is always dutchie for now
      // Fall back to "now" when the source job never recorded a start time.
      started_at: sourceJob.started_at || new Date(),
      finished_at: sourceJob.completed_at,
      duration_ms: sourceJob.duration_ms,
      status: this.mapStatus(sourceJob.status),
      error_message: sourceJob.error_message,
      products_found: sourceJob.products_found,
      products_new: sourceJob.products_new,
      products_updated: sourceJob.products_updated,
      snapshots_written: null, // Will be updated after snapshot insertion
      worker_id: null,
      trigger_type: sourceJob.job_type === 'dutchie_product_crawl' ? 'scheduled' : 'manual',
      // Preserve provenance so a canonical row can be traced back to its source.
      metadata: { sourceJobType, originalJobType: sourceJob.job_type },
      source_job_type: sourceJobType,
      source_job_id: sourceJob.id,
    };

    // ON CONFLICT targets the partial unique index; only the mutable
    // completion fields are refreshed on re-hydration.
    const result = await this.pool.query(
      `INSERT INTO crawl_runs (
        dispensary_id, provider, started_at, finished_at, duration_ms,
        status, error_message, products_found, products_new, products_updated,
        snapshots_written, worker_id, trigger_type, metadata,
        source_job_type, source_job_id
      ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16)
      ON CONFLICT (source_job_type, source_job_id) WHERE source_job_id IS NOT NULL
      DO UPDATE SET
        finished_at = EXCLUDED.finished_at,
        duration_ms = EXCLUDED.duration_ms,
        status = EXCLUDED.status,
        error_message = EXCLUDED.error_message,
        products_found = EXCLUDED.products_found,
        products_new = EXCLUDED.products_new,
        products_updated = EXCLUDED.products_updated
      RETURNING id`,
      [
        crawlRun.dispensary_id,
        crawlRun.provider,
        crawlRun.started_at,
        crawlRun.finished_at,
        crawlRun.duration_ms,
        crawlRun.status,
        crawlRun.error_message,
        crawlRun.products_found,
        crawlRun.products_new,
        crawlRun.products_updated,
        crawlRun.snapshots_written,
        crawlRun.worker_id,
        crawlRun.trigger_type,
        JSON.stringify(crawlRun.metadata),
        crawlRun.source_job_type,
        crawlRun.source_job_id,
      ]
    );

    return result.rows[0]?.id || null;
  }

  /**
   * Record multiple crawl runs in a batch
   *
   * Runs sequentially (one INSERT per job) so the sourceJobId -> crawlRunId
   * map can be built as results come back.
   *
   * @returns created/skipped counts plus a map of sourceJobId -> crawlRunId
   *          for the jobs that were recorded.
   */
  async recordCrawlRunsBatch(
    sourceJobs: SourceJob[],
    sourceJobType: SourceJobType = 'dispensary_crawl_jobs'
  ): Promise<{ created: number; skipped: number; crawlRunIds: Map<number, number> }> {
    let created = 0;
    let skipped = 0;
    const crawlRunIds = new Map<number, number>(); // sourceJobId -> crawlRunId

    for (const job of sourceJobs) {
      const crawlRunId = await this.recordCrawlRun(job, sourceJobType);
      if (crawlRunId) {
        created++;
        crawlRunIds.set(job.id, crawlRunId);
      } else {
        // Null means the job was not 'completed' and was skipped.
        skipped++;
      }
    }

    return { created, skipped, crawlRunIds };
  }

  /**
   * Update snapshots_written count for a crawl run
   *
   * Called after snapshot insertion, since the count is unknown at the
   * time the crawl run row is first recorded.
   */
  async updateSnapshotsWritten(crawlRunId: number, snapshotsWritten: number): Promise<void> {
    await this.pool.query(
      'UPDATE crawl_runs SET snapshots_written = $1 WHERE id = $2',
      [snapshotsWritten, crawlRunId]
    );
  }

  /**
   * Get crawl run ID by source job
   *
   * @returns the canonical crawl_runs.id, or null when the job has not
   *          been hydrated yet.
   */
  async getCrawlRunIdBySourceJob(
    sourceJobType: SourceJobType,
    sourceJobId: number
  ): Promise<number | null> {
    const result = await this.pool.query(
      'SELECT id FROM crawl_runs WHERE source_job_type = $1 AND source_job_id = $2',
      [sourceJobType, sourceJobId]
    );
    return result.rows[0]?.id || null;
  }

  /**
   * Get unhydrated source jobs (jobs not yet recorded in crawl_runs)
   *
   * Anti-join against crawl_runs (cr.id IS NULL) finds completed
   * dutchie_product_crawl jobs with no canonical record yet. Results are
   * oldest-first so incremental hydration catches up chronologically.
   */
  async getUnhydratedJobs(
    dispensaryId?: number,
    startDate?: Date,
    limit: number = 100
  ): Promise<SourceJob[]> {
    let query = `
      SELECT j.*
      FROM dispensary_crawl_jobs j
      LEFT JOIN crawl_runs cr ON cr.source_job_type = 'dispensary_crawl_jobs' AND cr.source_job_id = j.id
      WHERE cr.id IS NULL
        AND j.status = 'completed'
        AND j.job_type = 'dutchie_product_crawl'
    `;
    // Parameters are appended in the same order the placeholders are
    // emitted; paramIndex keeps the $n numbering consistent.
    const params: any[] = [];
    let paramIndex = 1;

    if (dispensaryId) {
      query += ` AND j.dispensary_id = $${paramIndex++}`;
      params.push(dispensaryId);
    }

    if (startDate) {
      query += ` AND j.completed_at >= $${paramIndex++}`;
      params.push(startDate);
    }

    query += ` ORDER BY j.completed_at ASC LIMIT $${paramIndex}`;
    params.push(limit);

    const result = await this.pool.query(query, params);
    return result.rows;
  }

  /**
   * Get all source jobs for backfill (within date range)
   *
   * Unlike getUnhydratedJobs, this does NOT exclude already-hydrated jobs:
   * backfill relies on recordCrawlRun's ON CONFLICT for idempotency.
   */
  async getSourceJobsForBackfill(
    startDate?: Date,
    endDate?: Date,
    dispensaryId?: number,
    limit: number = 1000
  ): Promise<SourceJob[]> {
    let query = `
      SELECT *
      FROM dispensary_crawl_jobs
      WHERE status = 'completed'
        AND job_type = 'dutchie_product_crawl'
    `;
    const params: any[] = [];
    let paramIndex = 1;

    if (startDate) {
      query += ` AND completed_at >= $${paramIndex++}`;
      params.push(startDate);
    }

    if (endDate) {
      query += ` AND completed_at <= $${paramIndex++}`;
      params.push(endDate);
    }

    if (dispensaryId) {
      query += ` AND dispensary_id = $${paramIndex++}`;
      params.push(dispensaryId);
    }

    query += ` ORDER BY completed_at ASC LIMIT $${paramIndex}`;
    params.push(limit);

    const result = await this.pool.query(query, params);
    return result.rows;
  }

  // Map source job statuses onto canonical crawl_runs statuses.
  // 'completed' -> 'success'; unknown statuses pass through unchanged.
  private mapStatus(sourceStatus: string): string {
    switch (sourceStatus) {
      case 'completed':
        return 'success';
      case 'failed':
        return 'failed';
      case 'running':
        return 'running';
      default:
        return sourceStatus;
    }
  }
}

View File

@@ -0,0 +1,560 @@
/**
* CanonicalHydrationService
* Orchestrates the full hydration pipeline from dutchie_* to canonical tables
*/
import { Pool } from 'pg';
import { CrawlRunRecorder } from './crawl-run-recorder';
import { StoreProductNormalizer } from './store-product-normalizer';
import { SnapshotWriter } from './snapshot-writer';
import { HydrationOptions, HydrationResult, ServiceContext, SourceJob } from './types';
export class CanonicalHydrationService {
private pool: Pool;
private log: (message: string) => void;
private crawlRunRecorder: CrawlRunRecorder;
private productNormalizer: StoreProductNormalizer;
private snapshotWriter: SnapshotWriter;
constructor(ctx: ServiceContext) {
  // Fall back to console logging when the caller supplies no logger.
  this.log = ctx.logger || console.log;
  this.pool = ctx.pool;
  // Each pipeline stage gets its own collaborator, all sharing the same
  // context (pool + logger).
  this.crawlRunRecorder = new CrawlRunRecorder(ctx);
  this.productNormalizer = new StoreProductNormalizer(ctx);
  this.snapshotWriter = new SnapshotWriter(ctx);
}
/**
* Run the full hydration pipeline
* Supports both backfill (historical) and incremental (ongoing) modes
*/
/**
 * Run the full hydration pipeline.
 *
 * Dispatches to backfill (historical) or incremental (ongoing) mode based
 * on options.mode, accumulating all counters and errors into a single
 * HydrationResult. A fatal error is captured into the result rather than
 * thrown, so callers always receive a result with timing attached.
 */
async hydrate(options: HydrationOptions): Promise<HydrationResult> {
  const startedAt = Date.now();
  const outcome: HydrationResult = {
    crawlRunsCreated: 0,
    crawlRunsSkipped: 0,
    productsUpserted: 0,
    snapshotsWritten: 0,
    errors: [],
    durationMs: 0,
  };

  this.log(`Starting hydration in ${options.mode} mode`);

  try {
    await (options.mode === 'backfill'
      ? this.runBackfill(options, outcome)
      : this.runIncremental(options, outcome));
  } catch (err: any) {
    outcome.errors.push(`Fatal error: ${err.message}`);
    this.log(`Hydration failed: ${err.message}`);
  }

  outcome.durationMs = Date.now() - startedAt;

  const summary = JSON.stringify({
    crawlRunsCreated: outcome.crawlRunsCreated,
    crawlRunsSkipped: outcome.crawlRunsSkipped,
    productsUpserted: outcome.productsUpserted,
    snapshotsWritten: outcome.snapshotsWritten,
    errors: outcome.errors.length,
  });
  this.log(`Hydration completed in ${outcome.durationMs}ms: ${summary}`);

  return outcome;
}
/**
* Backfill mode: Process historical data from source tables
*/
/**
 * Backfill mode: Process historical data from source tables
 *
 * Per dispensary: (1) upsert current products into store_products,
 * (2) fetch the provider_product_id -> store_product_id map, then
 * (3) record a crawl run and snapshots for each source job via processJob.
 * Errors are accumulated into result.errors per dispensary/job rather than
 * aborting the whole run.
 */
private async runBackfill(options: HydrationOptions, result: HydrationResult): Promise<void> {
  // NOTE(review): batchSize is computed here but never used below — the
  // effective cap is the hard-coded 1000 passed to
  // getSourceJobsForBackfill. Confirm whether --batch-size was meant to
  // control that limit.
  const batchSize = options.batchSize || 50;

  // Get source jobs to process
  const sourceJobs = await this.crawlRunRecorder.getSourceJobsForBackfill(
    options.startDate,
    options.endDate,
    options.dispensaryId,
    1000 // Max jobs to process
  );

  this.log(`Found ${sourceJobs.length} source jobs to backfill`);

  // Group jobs by dispensary for efficient processing
  const jobsByDispensary = this.groupJobsByDispensary(sourceJobs);

  for (const [dispensaryId, jobs] of jobsByDispensary) {
    this.log(`Processing dispensary ${dispensaryId} (${jobs.length} jobs)`);

    try {
      // Step 1: Upsert products for this dispensary (skipped on dry run)
      if (!options.dryRun) {
        const productResult = await this.productNormalizer.upsertProductsForDispensary(dispensaryId);
        result.productsUpserted += productResult.upserted;
        if (productResult.errors.length > 0) {
          // Prefix errors with dispensary context before accumulating.
          result.errors.push(...productResult.errors.map(e => `Dispensary ${dispensaryId}: ${e}`));
        }
      }

      // Get store_product_id map for snapshot writing
      // NOTE(review): on a dry run this map reflects pre-existing rows only,
      // since the upsert above was skipped.
      const storeProductIdMap = await this.productNormalizer.getStoreProductIdMap(dispensaryId);

      // Step 2: Record crawl runs and write snapshots for each job.
      // A failing job is logged and skipped; the rest still process.
      for (const job of jobs) {
        try {
          await this.processJob(job, storeProductIdMap, result, options.dryRun);
        } catch (err: any) {
          result.errors.push(`Job ${job.id}: ${err.message}`);
        }
      }
    } catch (err: any) {
      result.errors.push(`Dispensary ${dispensaryId}: ${err.message}`);
    }
  }
}
/**
 * Incremental mode: process only jobs that have not yet been hydrated.
 *
 * Fetches up to options.batchSize (default 100) unhydrated jobs, groups
 * them per dispensary, upserts that dispensary's products once, then
 * processes each job individually. Failures are collected in
 * result.errors without aborting the run.
 */
private async runIncremental(options: HydrationOptions, result: HydrationResult): Promise<void> {
  const fetchLimit = options.batchSize || 100;
  // Get unhydrated jobs
  const unhydratedJobs = await this.crawlRunRecorder.getUnhydratedJobs(
    options.dispensaryId,
    options.startDate,
    fetchLimit
  );
  this.log(`Found ${unhydratedJobs.length} unhydrated jobs`);
  // Group by dispensary
  const grouped = this.groupJobsByDispensary(unhydratedJobs);
  for (const [dispensaryId, jobs] of grouped) {
    this.log(`Processing dispensary ${dispensaryId} (${jobs.length} jobs)`);
    try {
      // Upsert products first so snapshots can resolve store_product ids.
      if (!options.dryRun) {
        const productResult = await this.productNormalizer.upsertProductsForDispensary(dispensaryId);
        result.productsUpserted += productResult.upserted;
        for (const e of productResult.errors) {
          result.errors.push(`Dispensary ${dispensaryId}: ${e}`);
        }
      }
      // Resolve provider_product_id -> store_product_id for snapshot writing.
      const storeProductIdMap = await this.productNormalizer.getStoreProductIdMap(dispensaryId);
      for (const job of jobs) {
        try {
          await this.processJob(job, storeProductIdMap, result, options.dryRun);
        } catch (err: any) {
          result.errors.push(`Job ${job.id}: ${err.message}`);
        }
      }
    } catch (err: any) {
      result.errors.push(`Dispensary ${dispensaryId}: ${err.message}`);
    }
  }
}
/**
 * Process a single job: record a crawl run, then write snapshots for it.
 *
 * Order matters: the crawl_runs row must exist before snapshots can
 * reference it. If recordCrawlRun returns null (run already recorded),
 * the job is counted as skipped and no snapshots are written.
 *
 * In dry-run mode only the created/skipped counters are simulated;
 * snapshot writing is never attempted.
 */
private async processJob(
  job: SourceJob,
  storeProductIdMap: Map<string, number>,
  result: HydrationResult,
  dryRun?: boolean
): Promise<void> {
  // Step 1: Record the crawl run
  let crawlRunId: number | null = null;
  if (!dryRun) {
    crawlRunId = await this.crawlRunRecorder.recordCrawlRun(job);
    if (crawlRunId) {
      result.crawlRunsCreated++;
    } else {
      result.crawlRunsSkipped++;
      return; // Skip snapshot writing if crawl run wasn't created
    }
  } else {
    // In dry run, check if it would be created
    const existingId = await this.crawlRunRecorder.getCrawlRunIdBySourceJob(
      'dispensary_crawl_jobs',
      job.id
    );
    if (existingId) {
      result.crawlRunsSkipped++;
      return;
    }
    result.crawlRunsCreated++;
    return; // Skip snapshot writing in dry run
  }
  // Step 2: Write snapshots for this crawl run.
  // job.completed_at anchors the snapshot lookup window; a job without it
  // produces a crawl run but no snapshots.
  if (crawlRunId && job.completed_at) {
    const snapshotResult = await this.snapshotWriter.writeSnapshotsForCrawlRun(
      crawlRunId,
      job.dispensary_id,
      storeProductIdMap,
      job.completed_at
    );
    result.snapshotsWritten += snapshotResult.written;
    if (snapshotResult.errors.length > 0) {
      result.errors.push(...snapshotResult.errors);
    }
    // Update crawl_run with snapshots_written count
    await this.crawlRunRecorder.updateSnapshotsWritten(crawlRunId, snapshotResult.written);
  }
}
/**
 * Convenience wrapper: hydrate a single dispensary.
 *
 * @param dispensaryId Target dispensary.
 * @param mode Pipeline mode; defaults to 'incremental'.
 */
async hydrateDispensary(
  dispensaryId: number,
  mode: 'backfill' | 'incremental' = 'incremental'
): Promise<HydrationResult> {
  const options: HydrationOptions = { mode, dispensaryId };
  return this.hydrate(options);
}
/**
 * Get hydration status for a dispensary.
 *
 * Returns side-by-side counts of source rows (dutchie_* tables, completed
 * 'dutchie_product_crawl' jobs) and canonical rows (crawl_runs /
 * store_products / store_product_snapshots) so callers can see how far
 * hydration has progressed.
 *
 * NOTE(review): unhydratedJobs is derived as sourceJobs - hydratedJobs;
 * it can go negative if crawl_runs contains rows for jobs outside the
 * completed/'dutchie_product_crawl' filter — confirm that is acceptable.
 */
async getHydrationStatus(dispensaryId: number): Promise<{
  sourceJobs: number;
  hydratedJobs: number;
  unhydratedJobs: number;
  sourceProducts: number;
  storeProducts: number;
  sourceSnapshots: number;
  storeSnapshots: number;
}> {
  // All six counts are independent, so run them in parallel.
  const [sourceJobs, hydratedJobs, sourceProducts, storeProducts, sourceSnapshots, storeSnapshots] =
    await Promise.all([
      this.pool.query(
        `SELECT COUNT(*) FROM dispensary_crawl_jobs
         WHERE dispensary_id = $1 AND status = 'completed' AND job_type = 'dutchie_product_crawl'`,
        [dispensaryId]
      ),
      this.pool.query(
        `SELECT COUNT(*) FROM crawl_runs
         WHERE dispensary_id = $1 AND source_job_type = 'dispensary_crawl_jobs'`,
        [dispensaryId]
      ),
      this.pool.query(
        `SELECT COUNT(*) FROM dutchie_products WHERE dispensary_id = $1`,
        [dispensaryId]
      ),
      this.pool.query(
        `SELECT COUNT(*) FROM store_products WHERE dispensary_id = $1 AND provider = 'dutchie'`,
        [dispensaryId]
      ),
      this.pool.query(
        `SELECT COUNT(*) FROM dutchie_product_snapshots WHERE dispensary_id = $1`,
        [dispensaryId]
      ),
      this.pool.query(
        `SELECT COUNT(*) FROM store_product_snapshots WHERE dispensary_id = $1`,
        [dispensaryId]
      ),
    ]);
  // COUNT(*) comes back from pg as a string; convert to number.
  const sourceJobCount = parseInt(sourceJobs.rows[0].count);
  const hydratedJobCount = parseInt(hydratedJobs.rows[0].count);
  return {
    sourceJobs: sourceJobCount,
    hydratedJobs: hydratedJobCount,
    unhydratedJobs: sourceJobCount - hydratedJobCount,
    sourceProducts: parseInt(sourceProducts.rows[0].count),
    storeProducts: parseInt(storeProducts.rows[0].count),
    sourceSnapshots: parseInt(sourceSnapshots.rows[0].count),
    storeSnapshots: parseInt(storeSnapshots.rows[0].count),
  };
}
/**
 * Get overall hydration status across all dispensaries.
 *
 * Same source-vs-canonical comparison as getHydrationStatus, but with no
 * dispensary filter, plus the number of distinct dispensaries that have
 * source products.
 */
async getOverallStatus(): Promise<{
  totalSourceJobs: number;
  totalHydratedJobs: number;
  totalSourceProducts: number;
  totalStoreProducts: number;
  totalSourceSnapshots: number;
  totalStoreSnapshots: number;
  dispensariesWithData: number;
}> {
  // Seven independent counts, run in parallel.
  const [sourceJobs, hydratedJobs, sourceProducts, storeProducts, sourceSnapshots, storeSnapshots, dispensaries] =
    await Promise.all([
      this.pool.query(
        `SELECT COUNT(*) FROM dispensary_crawl_jobs
         WHERE status = 'completed' AND job_type = 'dutchie_product_crawl'`
      ),
      this.pool.query(
        `SELECT COUNT(*) FROM crawl_runs WHERE source_job_type = 'dispensary_crawl_jobs'`
      ),
      this.pool.query(`SELECT COUNT(*) FROM dutchie_products`),
      this.pool.query(`SELECT COUNT(*) FROM store_products WHERE provider = 'dutchie'`),
      this.pool.query(`SELECT COUNT(*) FROM dutchie_product_snapshots`),
      this.pool.query(`SELECT COUNT(*) FROM store_product_snapshots`),
      this.pool.query(
        `SELECT COUNT(DISTINCT dispensary_id) FROM dutchie_products`
      ),
    ]);
  // COUNT(*) comes back from pg as a string; convert to number.
  return {
    totalSourceJobs: parseInt(sourceJobs.rows[0].count),
    totalHydratedJobs: parseInt(hydratedJobs.rows[0].count),
    totalSourceProducts: parseInt(sourceProducts.rows[0].count),
    totalStoreProducts: parseInt(storeProducts.rows[0].count),
    totalSourceSnapshots: parseInt(sourceSnapshots.rows[0].count),
    totalStoreSnapshots: parseInt(storeSnapshots.rows[0].count),
    dispensariesWithData: parseInt(dispensaries.rows[0].count),
  };
}
/**
 * Group jobs by their dispensary_id, preserving input order within each
 * group.
 */
private groupJobsByDispensary(jobs: SourceJob[]): Map<number, SourceJob[]> {
  const grouped = new Map<number, SourceJob[]>();
  for (const job of jobs) {
    const bucket = grouped.get(job.dispensary_id);
    if (bucket) {
      bucket.push(job);
    } else {
      grouped.set(job.dispensary_id, [job]);
    }
  }
  return grouped;
}
/**
 * Products-only hydration mode.
 *
 * Used when there are no historical job records: for each dispensary with
 * rows in dutchie_products, creates (or reuses) one synthetic crawl run,
 * upserts products, and writes initial snapshots from the current product
 * state. Per-dispensary failures are collected in result.errors.
 *
 * Fix: the completion log now includes crawlRunsSkipped, for parity with
 * hydrate()'s summary (the counter was already tracked but not reported).
 */
async hydrateProductsOnly(options: {
  dispensaryId?: number;
  dryRun?: boolean;
} = {}): Promise<HydrationResult> {
  const startTime = Date.now();
  const result: HydrationResult = {
    crawlRunsCreated: 0,
    crawlRunsSkipped: 0,
    productsUpserted: 0,
    snapshotsWritten: 0,
    errors: [],
    durationMs: 0,
  };
  this.log('Starting products-only hydration mode');
  try {
    // Determine the target dispensaries: an explicit one, or every
    // dispensary that has source products.
    let dispensaryIds: number[];
    if (options.dispensaryId) {
      dispensaryIds = [options.dispensaryId];
    } else {
      const dispResult = await this.pool.query(
        'SELECT DISTINCT dispensary_id FROM dutchie_products ORDER BY dispensary_id'
      );
      dispensaryIds = dispResult.rows.map(r => r.dispensary_id);
    }
    this.log(`Processing ${dispensaryIds.length} dispensaries`);
    for (const dispensaryId of dispensaryIds) {
      try {
        await this.hydrateDispensaryProductsOnly(dispensaryId, result, options.dryRun);
      } catch (err: any) {
        result.errors.push(`Dispensary ${dispensaryId}: ${err.message}`);
      }
    }
  } catch (err: any) {
    result.errors.push(`Fatal error: ${err.message}`);
  }
  result.durationMs = Date.now() - startTime;
  this.log(`Products-only hydration completed in ${result.durationMs}ms: ${JSON.stringify({
    crawlRunsCreated: result.crawlRunsCreated,
    crawlRunsSkipped: result.crawlRunsSkipped,
    productsUpserted: result.productsUpserted,
    snapshotsWritten: result.snapshotsWritten,
    errors: result.errors.length,
  })}`);
  return result;
}
/**
 * Hydrate a single dispensary in products-only mode.
 *
 * Creates (or reuses) one synthetic crawl run per dispensary, upserts the
 * dispensary's products, and writes initial snapshots referencing that
 * run. Re-running is safe: the synthetic run is looked up before
 * inserting, and createInitialSnapshots conflict-skips rows that already
 * exist for (store_product_id, crawl_run_id).
 */
private async hydrateDispensaryProductsOnly(
  dispensaryId: number,
  result: HydrationResult,
  dryRun?: boolean
): Promise<void> {
  // Get product count and timestamps for this dispensary; the min/max
  // timestamps double as the synthetic run's started_at/finished_at.
  const statsResult = await this.pool.query(
    `SELECT COUNT(*) as cnt, MIN(created_at) as min_date, MAX(updated_at) as max_date
     FROM dutchie_products WHERE dispensary_id = $1`,
    [dispensaryId]
  );
  const stats = statsResult.rows[0];
  const productCount = parseInt(stats.cnt);
  if (productCount === 0) {
    this.log(`Dispensary ${dispensaryId}: No products, skipping`);
    return;
  }
  this.log(`Dispensary ${dispensaryId}: ${productCount} products`);
  // Step 1: Create synthetic crawl run
  let crawlRunId: number | null = null;
  const now = new Date();
  if (!dryRun) {
    // Check if we already have a synthetic run for this dispensary
    const existingRun = await this.pool.query(
      `SELECT id FROM crawl_runs
       WHERE dispensary_id = $1
       AND source_job_type = 'products_only_hydration'
       LIMIT 1`,
      [dispensaryId]
    );
    if (existingRun.rows.length > 0) {
      // Reuse the existing run; steps 2-3 below still execute so the
      // product upsert and snapshot backfill stay idempotent.
      crawlRunId = existingRun.rows[0].id;
      this.log(`Dispensary ${dispensaryId}: Using existing synthetic crawl run ${crawlRunId}`);
      result.crawlRunsSkipped++;
    } else {
      // Create new synthetic crawl run
      const insertResult = await this.pool.query(
        `INSERT INTO crawl_runs (
          dispensary_id, provider, started_at, finished_at, duration_ms,
          status, products_found, trigger_type, metadata,
          source_job_type, source_job_id
        ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)
        RETURNING id`,
        [
          dispensaryId,
          'dutchie',
          stats.min_date || now,
          stats.max_date || now,
          0,
          'success',
          productCount,
          'hydration',
          JSON.stringify({ mode: 'products_only', hydratedAt: now.toISOString() }),
          'products_only_hydration',
          dispensaryId, // Use dispensary_id as synthetic job_id
        ]
      );
      crawlRunId = insertResult.rows[0].id;
      result.crawlRunsCreated++;
      this.log(`Dispensary ${dispensaryId}: Created synthetic crawl run ${crawlRunId}`);
    }
    // Step 2: Upsert products
    const productResult = await this.productNormalizer.upsertProductsForDispensary(dispensaryId);
    result.productsUpserted += productResult.upserted;
    if (productResult.errors.length > 0) {
      result.errors.push(...productResult.errors.map(e => `Dispensary ${dispensaryId}: ${e}`));
    }
    // Step 3: Create initial snapshots from current product state
    const snapshotsWritten = await this.createInitialSnapshots(dispensaryId, crawlRunId);
    result.snapshotsWritten += snapshotsWritten;
    // Update crawl run with snapshot count
    await this.pool.query(
      'UPDATE crawl_runs SET snapshots_written = $1 WHERE id = $2',
      [snapshotsWritten, crawlRunId]
    );
  } else {
    // Dry run - just count what would be done (assumes one run, one
    // upsert and one snapshot per product).
    result.crawlRunsCreated++;
    result.productsUpserted += productCount;
    result.snapshotsWritten += productCount;
  }
}
/**
 * Create initial snapshots from the current store_products state for a
 * synthetic crawl run.
 *
 * Builds batched multi-row INSERTs (12 bind parameters per row plus a
 * NOW() literal for created_at) and conflict-skips rows that already
 * exist for (store_product_id, crawl_run_id), so re-running is a no-op
 * for already-snapshotted products.
 *
 * @returns number of snapshot rows actually inserted.
 */
private async createInitialSnapshots(
  dispensaryId: number,
  crawlRunId: number
): Promise<number> {
  // Get all store products for this dispensary
  const products = await this.pool.query(
    `SELECT sp.id, sp.price_rec, sp.price_med, sp.is_on_special, sp.is_in_stock,
            sp.stock_quantity, sp.thc_percent, sp.cbd_percent
     FROM store_products sp
     WHERE sp.dispensary_id = $1 AND sp.provider = 'dutchie'`,
    [dispensaryId]
  );
  if (products.rows.length === 0) return 0;
  const now = new Date();
  const batchSize = 100;
  let totalInserted = 0;
  // Process in batches
  for (let i = 0; i < products.rows.length; i += batchSize) {
    const batch = products.rows.slice(i, i + batchSize);
    const values: any[] = [];
    const placeholders: string[] = [];
    let paramIndex = 1;
    for (const product of batch) {
      // Push the 12 bound column values for this row, in the same order
      // as the INSERT column list below (created_at is NOW(), not bound).
      values.push(
        dispensaryId,
        product.id,
        crawlRunId,
        now,
        product.price_rec,
        product.price_med,
        product.is_on_special || false,
        product.is_in_stock || false,
        product.stock_quantity,
        product.thc_percent,
        product.cbd_percent,
        JSON.stringify({ source: 'initial_hydration' })
      );
      const rowPlaceholders = [];
      for (let j = 0; j < 12; j++) {
        rowPlaceholders.push(`$${paramIndex++}`);
      }
      placeholders.push(`(${rowPlaceholders.join(', ')}, NOW())`);
    }
    // The WHERE clause targets a partial unique index on
    // (store_product_id, crawl_run_id); duplicates are silently skipped.
    const query = `
      INSERT INTO store_product_snapshots (
        dispensary_id, store_product_id, crawl_run_id, captured_at,
        price_rec, price_med, is_on_special, is_in_stock, stock_quantity,
        thc_percent, cbd_percent, raw_data, created_at
      ) VALUES ${placeholders.join(', ')}
      ON CONFLICT (store_product_id, crawl_run_id)
      WHERE store_product_id IS NOT NULL AND crawl_run_id IS NOT NULL
      DO NOTHING
    `;
    const result = await this.pool.query(query, values);
    totalInserted += result.rowCount || 0;
  }
  return totalInserted;
}
}

View File

@@ -0,0 +1,13 @@
/**
 * Canonical Hydration Module — barrel file.
 * Phase 2: Hydration Pipeline from dutchie_* to store_products/store_product_snapshots/crawl_runs
 */
// Types
export * from './types';
// Services (re-exported so consumers can import from the module root)
export { CrawlRunRecorder } from './crawl-run-recorder';
export { StoreProductNormalizer } from './store-product-normalizer';
export { SnapshotWriter } from './snapshot-writer';
export { CanonicalHydrationService } from './hydration-service';

View File

@@ -0,0 +1,303 @@
/**
* SnapshotWriter
* Inserts store_product_snapshots from dutchie_product_snapshots source table
*/
import { Pool } from 'pg';
import { SourceSnapshot, StoreProductSnapshot, ServiceContext } from './types';
export class SnapshotWriter {
  private pool: Pool;
  private log: (message: string) => void;
  private batchSize: number;

  constructor(ctx: ServiceContext, batchSize: number = 100) {
    this.pool = ctx.pool;
    this.log = ctx.logger || console.log;
    this.batchSize = batchSize;
  }

  /**
   * Write snapshots for a crawl run.
   *
   * Reads source rows from dutchie_product_snapshots (within a 5-minute
   * window of crawledAt) and upserts them into store_product_snapshots in
   * batches. Rows whose external_product_id has no entry in
   * storeProductIdMap are counted as skipped. A failing batch is recorded
   * in errors and does not abort the remaining batches.
   */
  async writeSnapshotsForCrawlRun(
    crawlRunId: number,
    dispensaryId: number,
    storeProductIdMap: Map<string, number>,
    crawledAt: Date
  ): Promise<{ written: number; skipped: number; errors: string[] }> {
    const errors: string[] = [];
    let written = 0;
    let skipped = 0;
    // Get source snapshots for this dispensary at this crawl time
    const sourceSnapshots = await this.getSourceSnapshots(dispensaryId, crawledAt);
    this.log(`Found ${sourceSnapshots.length} source snapshots for dispensary ${dispensaryId} at ${crawledAt.toISOString()}`);
    // Process in batches
    for (let i = 0; i < sourceSnapshots.length; i += this.batchSize) {
      const batch = sourceSnapshots.slice(i, i + this.batchSize);
      try {
        const { batchWritten, batchSkipped } = await this.writeBatch(
          batch,
          crawlRunId,
          storeProductIdMap
        );
        written += batchWritten;
        skipped += batchSkipped;
      } catch (err: any) {
        errors.push(`Batch ${i / this.batchSize}: ${err.message}`);
      }
    }
    return { written, skipped, errors };
  }

  /**
   * Write a single snapshot, upserting on (store_product_id, crawl_run_id).
   * @returns the snapshot row id, or null if none was returned.
   */
  async writeSnapshot(
    source: SourceSnapshot,
    crawlRunId: number,
    storeProductId: number
  ): Promise<number | null> {
    const normalized = this.normalizeSnapshot(source, crawlRunId, storeProductId);
    const result = await this.pool.query(
      `INSERT INTO store_product_snapshots (
        dispensary_id, store_product_id, crawl_run_id, captured_at,
        price_rec, price_med, is_on_special, is_in_stock, stock_quantity,
        thc_percent, cbd_percent, raw_data, created_at
      ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, NOW())
      ON CONFLICT (store_product_id, crawl_run_id)
      WHERE store_product_id IS NOT NULL AND crawl_run_id IS NOT NULL
      DO UPDATE SET
        price_rec = EXCLUDED.price_rec,
        price_med = EXCLUDED.price_med,
        is_on_special = EXCLUDED.is_on_special,
        is_in_stock = EXCLUDED.is_in_stock,
        stock_quantity = EXCLUDED.stock_quantity,
        thc_percent = EXCLUDED.thc_percent,
        cbd_percent = EXCLUDED.cbd_percent,
        raw_data = EXCLUDED.raw_data
      RETURNING id`,
      [
        normalized.dispensary_id,
        normalized.store_product_id,
        normalized.crawl_run_id,
        normalized.captured_at,
        normalized.price_rec,
        normalized.price_med,
        normalized.is_on_special,
        normalized.is_in_stock,
        normalized.stock_quantity,
        normalized.thc_percent,
        normalized.cbd_percent,
        JSON.stringify(normalized.raw_data),
      ]
    );
    return result.rows[0]?.id || null;
  }

  /**
   * Write a batch of snapshots with a single multi-row upsert.
   *
   * Rows whose external_product_id is missing from storeProductIdMap are
   * skipped (counted in batchSkipped). Each row binds 12 parameters in
   * the same order as the INSERT column list; created_at is NOW().
   */
  async writeBatch(
    sourceSnapshots: SourceSnapshot[],
    crawlRunId: number,
    storeProductIdMap: Map<string, number>
  ): Promise<{ batchWritten: number; batchSkipped: number }> {
    if (sourceSnapshots.length === 0) return { batchWritten: 0, batchSkipped: 0 };
    const values: any[] = [];
    const placeholders: string[] = [];
    let paramIndex = 1;
    let skipped = 0;
    for (const source of sourceSnapshots) {
      // Look up store_product_id
      const storeProductId = storeProductIdMap.get(source.external_product_id);
      if (!storeProductId) {
        skipped++;
        continue;
      }
      const normalized = this.normalizeSnapshot(source, crawlRunId, storeProductId);
      values.push(
        normalized.dispensary_id,
        normalized.store_product_id,
        normalized.crawl_run_id,
        normalized.captured_at,
        normalized.price_rec,
        normalized.price_med,
        normalized.is_on_special,
        normalized.is_in_stock,
        normalized.stock_quantity,
        normalized.thc_percent,
        normalized.cbd_percent,
        JSON.stringify(normalized.raw_data)
      );
      const rowPlaceholders = [];
      for (let j = 0; j < 12; j++) {
        rowPlaceholders.push(`$${paramIndex++}`);
      }
      placeholders.push(`(${rowPlaceholders.join(', ')}, NOW())`);
    }
    if (placeholders.length === 0) {
      return { batchWritten: 0, batchSkipped: skipped };
    }
    const query = `
      INSERT INTO store_product_snapshots (
        dispensary_id, store_product_id, crawl_run_id, captured_at,
        price_rec, price_med, is_on_special, is_in_stock, stock_quantity,
        thc_percent, cbd_percent, raw_data, created_at
      ) VALUES ${placeholders.join(', ')}
      ON CONFLICT (store_product_id, crawl_run_id)
      WHERE store_product_id IS NOT NULL AND crawl_run_id IS NOT NULL
      DO UPDATE SET
        price_rec = EXCLUDED.price_rec,
        price_med = EXCLUDED.price_med,
        is_on_special = EXCLUDED.is_on_special,
        is_in_stock = EXCLUDED.is_in_stock,
        stock_quantity = EXCLUDED.stock_quantity,
        thc_percent = EXCLUDED.thc_percent,
        cbd_percent = EXCLUDED.cbd_percent,
        raw_data = EXCLUDED.raw_data
    `;
    const result = await this.pool.query(query, values);
    return { batchWritten: result.rowCount || 0, batchSkipped: skipped };
  }

  /**
   * Get source snapshots from dutchie_product_snapshots near a crawl time.
   * Matches rows within +/- 5 minutes of the target timestamp.
   *
   * Fix: the window bounds are now computed in JS and passed as bind
   * parameters instead of string-interpolating an INTERVAL literal into
   * the SQL text — same inclusive window, but no SQL built from template
   * variables.
   */
  async getSourceSnapshots(
    dispensaryId: number,
    crawledAt: Date
  ): Promise<SourceSnapshot[]> {
    const windowMs = 5 * 60 * 1000; // 5-minute window on each side
    const windowStart = new Date(crawledAt.getTime() - windowMs);
    const windowEnd = new Date(crawledAt.getTime() + windowMs);
    const result = await this.pool.query(
      `SELECT * FROM dutchie_product_snapshots
       WHERE dispensary_id = $1
       AND crawled_at >= $2
       AND crawled_at <= $3
       ORDER BY crawled_at ASC`,
      [dispensaryId, windowStart, windowEnd]
    );
    return result.rows;
  }

  /**
   * Get distinct crawl times (minute precision) from
   * dutchie_product_snapshots for a dispensary. Used during backfill to
   * identify individual crawl runs.
   */
  async getDistinctCrawlTimes(
    dispensaryId: number,
    startDate?: Date,
    endDate?: Date
  ): Promise<Date[]> {
    let query = `
      SELECT DISTINCT date_trunc('minute', crawled_at) as crawl_time
      FROM dutchie_product_snapshots
      WHERE dispensary_id = $1
    `;
    const params: any[] = [dispensaryId];
    let paramIndex = 2;
    if (startDate) {
      query += ` AND crawled_at >= $${paramIndex++}`;
      params.push(startDate);
    }
    if (endDate) {
      query += ` AND crawled_at <= $${paramIndex++}`;
      params.push(endDate);
    }
    query += ' ORDER BY crawl_time ASC';
    const result = await this.pool.query(query, params);
    return result.rows.map(row => new Date(row.crawl_time));
  }

  /**
   * Check whether any snapshots exist for a crawl run.
   */
  async snapshotsExistForCrawlRun(crawlRunId: number): Promise<boolean> {
    const result = await this.pool.query(
      'SELECT 1 FROM store_product_snapshots WHERE crawl_run_id = $1 LIMIT 1',
      [crawlRunId]
    );
    return result.rows.length > 0;
  }

  /**
   * Normalize a source snapshot to store_product_snapshot format.
   *
   * Converts cents to dollars, derives the stock flag, and preserves the
   * raw price fields in raw_data for auditing. is_on_special is always
   * false (the source table has no special flag), and thc/cbd percent are
   * null (not present on snapshots; would require a product join).
   */
  private normalizeSnapshot(
    source: SourceSnapshot,
    crawlRunId: number,
    storeProductId: number
  ): StoreProductSnapshot {
    // Convert cents to dollars. `!= null` also guards undefined, which
    // would otherwise yield NaN from `undefined / 100`.
    const priceRec = source.rec_min_price_cents != null
      ? source.rec_min_price_cents / 100
      : null;
    const priceMed = source.med_min_price_cents != null
      ? source.med_min_price_cents / 100
      : null;
    // Determine stock status
    const isInStock = this.isSnapshotInStock(source.stock_status, source.total_quantity_available);
    return {
      dispensary_id: source.dispensary_id,
      store_product_id: storeProductId,
      crawl_run_id: crawlRunId,
      captured_at: source.crawled_at,
      price_rec: priceRec,
      price_med: priceMed,
      is_on_special: false, // Source doesn't have special flag
      is_in_stock: isInStock,
      stock_quantity: source.total_quantity_available,
      thc_percent: null, // Not in snapshot, would need to join with product
      cbd_percent: null, // Not in snapshot, would need to join with product
      raw_data: {
        source_id: source.id,
        status: source.status,
        rec_min_price_cents: source.rec_min_price_cents,
        rec_max_price_cents: source.rec_max_price_cents,
        med_min_price_cents: source.med_min_price_cents,
        med_max_price_cents: source.med_max_price_cents,
      },
    };
  }

  /**
   * Determine whether a snapshot represents an in-stock product.
   * A positive quantity wins; otherwise known status strings are mapped;
   * unknown/missing values default to false.
   */
  private isSnapshotInStock(stockStatus: string | null, quantity: number | null): boolean {
    if (quantity !== null && quantity > 0) return true;
    if (stockStatus) {
      const status = stockStatus.toLowerCase();
      if (status === 'in_stock' || status === 'instock' || status === 'available') {
        return true;
      }
      if (status === 'out_of_stock' || status === 'outofstock' || status === 'unavailable') {
        return false;
      }
    }
    return false;
  }
}

View File

@@ -0,0 +1,322 @@
/**
* StoreProductNormalizer
* Upserts store_products from dutchie_products source table
*/
import { Pool } from 'pg';
import { SourceProduct, StoreProduct, ServiceContext } from './types';
export class StoreProductNormalizer {
  private pool: Pool;
  private log: (message: string) => void;
  private batchSize: number;

  constructor(ctx: ServiceContext, batchSize: number = 100) {
    this.pool = ctx.pool;
    this.log = ctx.logger || console.log;
    this.batchSize = batchSize;
  }

  /**
   * Upsert products for a specific dispensary.
   * Reads from dutchie_products and upserts to store_products in batches;
   * a failing batch is recorded in errors without aborting the rest.
   */
  async upsertProductsForDispensary(dispensaryId: number): Promise<{ upserted: number; errors: string[] }> {
    const errors: string[] = [];
    let upserted = 0;
    // Get all products for this dispensary from source
    const sourceProducts = await this.getSourceProducts(dispensaryId);
    this.log(`Found ${sourceProducts.length} source products for dispensary ${dispensaryId}`);
    // Process in batches to avoid memory issues
    for (let i = 0; i < sourceProducts.length; i += this.batchSize) {
      const batch = sourceProducts.slice(i, i + this.batchSize);
      try {
        const batchUpserted = await this.upsertBatch(batch);
        upserted += batchUpserted;
      } catch (err: any) {
        errors.push(`Batch ${i / this.batchSize}: ${err.message}`);
      }
    }
    return { upserted, errors };
  }

  /**
   * Upsert a single product, keyed on
   * (dispensary_id, provider, provider_product_id).
   * @returns the store_product id, or null if none was returned.
   */
  async upsertProduct(source: SourceProduct): Promise<number | null> {
    const normalized = this.normalizeProduct(source);
    const result = await this.pool.query(
      `INSERT INTO store_products (
        dispensary_id, brand_id, provider, provider_product_id,
        name_raw, brand_name_raw, category_raw,
        price_rec, price_med, is_on_special, is_in_stock, stock_quantity,
        thc_percent, cbd_percent, image_url,
        first_seen_at, last_seen_at, created_at, updated_at
      ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, NOW(), NOW())
      ON CONFLICT (dispensary_id, provider, provider_product_id)
      DO UPDATE SET
        name_raw = EXCLUDED.name_raw,
        brand_name_raw = EXCLUDED.brand_name_raw,
        category_raw = EXCLUDED.category_raw,
        price_rec = EXCLUDED.price_rec,
        price_med = EXCLUDED.price_med,
        is_on_special = EXCLUDED.is_on_special,
        is_in_stock = EXCLUDED.is_in_stock,
        stock_quantity = EXCLUDED.stock_quantity,
        thc_percent = EXCLUDED.thc_percent,
        cbd_percent = EXCLUDED.cbd_percent,
        image_url = COALESCE(EXCLUDED.image_url, store_products.image_url),
        last_seen_at = EXCLUDED.last_seen_at,
        updated_at = NOW()
      RETURNING id`,
      [
        normalized.dispensary_id,
        normalized.brand_id,
        normalized.provider,
        normalized.provider_product_id,
        normalized.name_raw,
        normalized.brand_name_raw,
        normalized.category_raw,
        normalized.price_rec,
        normalized.price_med,
        normalized.is_on_special,
        normalized.is_in_stock,
        normalized.stock_quantity,
        normalized.thc_percent,
        normalized.cbd_percent,
        normalized.image_url,
        normalized.first_seen_at,
        normalized.last_seen_at,
      ]
    );
    return result.rows[0]?.id || null;
  }

  /**
   * Upsert a batch of products with a single multi-row INSERT ... ON
   * CONFLICT. Each row binds 17 parameters in the same order as the
   * column list; created_at/updated_at are NOW().
   */
  async upsertBatch(sourceProducts: SourceProduct[]): Promise<number> {
    if (sourceProducts.length === 0) return 0;
    // Build multi-row INSERT with ON CONFLICT
    const values: any[] = [];
    const placeholders: string[] = [];
    let paramIndex = 1;
    for (const source of sourceProducts) {
      const normalized = this.normalizeProduct(source);
      values.push(
        normalized.dispensary_id,
        normalized.brand_id,
        normalized.provider,
        normalized.provider_product_id,
        normalized.name_raw,
        normalized.brand_name_raw,
        normalized.category_raw,
        normalized.price_rec,
        normalized.price_med,
        normalized.is_on_special,
        normalized.is_in_stock,
        normalized.stock_quantity,
        normalized.thc_percent,
        normalized.cbd_percent,
        normalized.image_url,
        normalized.first_seen_at,
        normalized.last_seen_at
      );
      const rowPlaceholders = [];
      for (let j = 0; j < 17; j++) {
        rowPlaceholders.push(`$${paramIndex++}`);
      }
      placeholders.push(`(${rowPlaceholders.join(', ')}, NOW(), NOW())`);
    }
    const query = `
      INSERT INTO store_products (
        dispensary_id, brand_id, provider, provider_product_id,
        name_raw, brand_name_raw, category_raw,
        price_rec, price_med, is_on_special, is_in_stock, stock_quantity,
        thc_percent, cbd_percent, image_url,
        first_seen_at, last_seen_at, created_at, updated_at
      ) VALUES ${placeholders.join(', ')}
      ON CONFLICT (dispensary_id, provider, provider_product_id)
      DO UPDATE SET
        name_raw = EXCLUDED.name_raw,
        brand_name_raw = EXCLUDED.brand_name_raw,
        category_raw = EXCLUDED.category_raw,
        price_rec = EXCLUDED.price_rec,
        price_med = EXCLUDED.price_med,
        is_on_special = EXCLUDED.is_on_special,
        is_in_stock = EXCLUDED.is_in_stock,
        stock_quantity = EXCLUDED.stock_quantity,
        thc_percent = EXCLUDED.thc_percent,
        cbd_percent = EXCLUDED.cbd_percent,
        image_url = COALESCE(EXCLUDED.image_url, store_products.image_url),
        last_seen_at = EXCLUDED.last_seen_at,
        updated_at = NOW()
    `;
    const result = await this.pool.query(query, values);
    return result.rowCount || 0;
  }

  /**
   * Get a store_product id by its canonical key.
   */
  async getStoreProductId(
    dispensaryId: number,
    provider: string,
    providerProductId: string
  ): Promise<number | null> {
    const result = await this.pool.query(
      'SELECT id FROM store_products WHERE dispensary_id = $1 AND provider = $2 AND provider_product_id = $3',
      [dispensaryId, provider, providerProductId]
    );
    return result.rows[0]?.id || null;
  }

  /**
   * Map provider_product_id -> store_product id for a dispensary
   * (used by the snapshot writer to resolve foreign keys).
   */
  async getStoreProductIdMap(dispensaryId: number): Promise<Map<string, number>> {
    const result = await this.pool.query(
      'SELECT id, provider_product_id FROM store_products WHERE dispensary_id = $1',
      [dispensaryId]
    );
    const map = new Map<string, number>();
    for (const row of result.rows) {
      map.set(row.provider_product_id, row.id);
    }
    return map;
  }

  /**
   * Get source products from dutchie_products.
   */
  private async getSourceProducts(dispensaryId: number): Promise<SourceProduct[]> {
    const result = await this.pool.query(
      `SELECT * FROM dutchie_products WHERE dispensary_id = $1`,
      [dispensaryId]
    );
    return result.rows;
  }

  /**
   * Normalize a source product to store_product format.
   */
  private normalizeProduct(source: SourceProduct): StoreProduct {
    // Extract price from JSONB if present
    const priceRec = this.extractPrice(source.price_rec);
    const priceMed = this.extractPrice(source.price_med);
    // Parse THC/CBD percentages
    const thcPercent = this.parsePercentage(source.thc);
    const cbdPercent = this.parsePercentage(source.cbd);
    // Determine stock status
    const isInStock = this.isProductInStock(source.stock_status, source.total_quantity_available);
    return {
      dispensary_id: source.dispensary_id,
      brand_id: null, // Source has UUID strings, target expects integer - set to null for now
      provider: source.platform || 'dutchie',
      provider_product_id: source.external_product_id,
      name_raw: source.name,
      brand_name_raw: source.brand_name,
      category_raw: source.type || source.subcategory,
      price_rec: priceRec,
      price_med: priceMed,
      is_on_special: false, // Dutchie doesn't have a direct special flag, would need to check specials table
      is_in_stock: isInStock,
      stock_quantity: source.total_quantity_available,
      thc_percent: thcPercent,
      cbd_percent: cbdPercent,
      image_url: source.primary_image_url,
      first_seen_at: source.created_at,
      last_seen_at: source.updated_at,
    };
  }

  /**
   * Coerce an arbitrary value (number or numeric string) to a finite
   * number, or null. Guards against NaN/Infinity leaking into numeric
   * columns.
   */
  private toFiniteNumber(value: any): number | null {
    if (value === null || value === undefined) return null;
    const parsed = typeof value === 'number' ? value : parseFloat(value);
    return Number.isFinite(parsed) ? parsed : null;
  }

  /**
   * Extract a price from a JSONB price field.
   * Handles formats like: {min: 10, max: 20}, {value: 15}, {price: 15},
   * a bare number/string, or an array of variants (first variant's price).
   *
   * Fix: every parsed value is now routed through toFiniteNumber, so
   * unparseable strings or non-finite numbers yield null instead of NaN
   * (the original returned raw parseFloat results unguarded in the
   * object/array branches).
   */
  private extractPrice(priceData: any): number | null {
    if (priceData === null || priceData === undefined) return null;
    // Bare number or numeric string
    if (typeof priceData === 'number' || typeof priceData === 'string') {
      return this.toFiniteNumber(priceData);
    }
    if (typeof priceData === 'object') {
      // Array of variants: take the first variant's price
      if (Array.isArray(priceData)) {
        if (priceData.length > 0 && priceData[0].price !== undefined) {
          return this.toFiniteNumber(priceData[0].price);
        }
        return null;
      }
      // Try common price formats, in priority order
      for (const key of ['min', 'value', 'price']) {
        if (priceData[key] !== undefined && priceData[key] !== null) {
          return this.toFiniteNumber(priceData[key]);
        }
      }
    }
    return null;
  }

  /**
   * Parse a percentage string to a number.
   * Handles formats like: "25.5%", "25.5", "25.5 %", etc.
   */
  private parsePercentage(value: string | null | undefined): number | null {
    if (value === null || value === undefined) return null;
    // Remove percentage sign and whitespace before parsing
    const cleaned = value.toString().replace(/%/g, '').trim();
    return this.toFiniteNumber(cleaned);
  }

  /**
   * Determine if a product is in stock based on status and quantity.
   * A positive quantity wins; otherwise known status strings are mapped;
   * unknown/missing values default to false.
   */
  private isProductInStock(stockStatus: string | null, quantity: number | null): boolean {
    // Check quantity first
    if (quantity !== null && quantity > 0) return true;
    // Check status string
    if (stockStatus) {
      const status = stockStatus.toLowerCase();
      if (status === 'in_stock' || status === 'instock' || status === 'available') {
        return true;
      }
      if (status === 'out_of_stock' || status === 'outofstock' || status === 'unavailable') {
        return false;
      }
    }
    // Default to false if unknown
    return false;
  }
}

View File

@@ -0,0 +1,150 @@
/**
* Canonical Hydration Types
* Phase 2: Hydration Pipeline from dutchie_* to store_products/store_product_snapshots/crawl_runs
*/
import { Pool } from 'pg';
// Source job types for hydration — identifies which legacy table a
// crawl_runs row was hydrated from.
export type SourceJobType = 'dispensary_crawl_jobs' | 'crawl_jobs' | 'job_run_logs';

/** Source job record (row shape of dispensary_crawl_jobs). */
export interface SourceJob {
  id: number;
  dispensary_id: number;
  job_type: string;
  status: string;
  started_at: Date | null;
  // completed_at anchors the snapshot lookup window during hydration;
  // jobs without it get a crawl run but no snapshots.
  completed_at: Date | null;
  duration_ms: number | null;
  products_found: number | null;
  products_new: number | null;
  products_updated: number | null;
  error_message: string | null;
}

/** Source product record (row shape of dutchie_products). */
export interface SourceProduct {
  id: number;
  dispensary_id: number;
  platform: string;
  external_product_id: string;
  name: string;
  brand_name: string | null;
  brand_id: number | null;
  type: string | null;
  subcategory: string | null;
  strain_type: string | null;
  // THC/CBD arrive as strings (e.g. "25.5%") and are parsed downstream.
  thc: string | null;
  cbd: string | null;
  price_rec: any; // JSONB
  price_med: any; // JSONB
  stock_status: string | null;
  total_quantity_available: number | null;
  primary_image_url: string | null;
  created_at: Date;
  updated_at: Date;
}

/** Source snapshot record (row shape of dutchie_product_snapshots). */
export interface SourceSnapshot {
  id: number;
  dutchie_product_id: number;
  dispensary_id: number;
  external_product_id: string;
  status: string | null;
  // Prices are stored in cents; the hydration pipeline converts to dollars.
  rec_min_price_cents: number | null;
  rec_max_price_cents: number | null;
  med_min_price_cents: number | null;
  med_max_price_cents: number | null;
  stock_status: string | null;
  total_quantity_available: number | null;
  crawled_at: Date;
  created_at: Date;
}

/** Crawl run record for the canonical crawl_runs table. */
export interface CrawlRun {
  id?: number;
  dispensary_id: number;
  provider: string;
  started_at: Date;
  finished_at: Date | null;
  duration_ms: number | null;
  status: string;
  error_message: string | null;
  products_found: number | null;
  products_new: number | null;
  products_updated: number | null;
  snapshots_written: number | null;
  worker_id: string | null;
  trigger_type: string | null;
  metadata: any;
  // Provenance: which legacy table/row this run was hydrated from.
  source_job_type: SourceJobType;
  source_job_id: number;
}
// Store product record for canonical table
export interface StoreProduct {
id?: number;
dispensary_id: number;
brand_id: number | null;
provider: string;
provider_product_id: string;
name_raw: string;
brand_name_raw: string | null;
category_raw: string | null;
price_rec: number | null;
price_med: number | null;
is_on_special: boolean;
is_in_stock: boolean;
stock_quantity: number | null;
thc_percent: number | null;
cbd_percent: number | null;
image_url: string | null;
first_seen_at: Date;
last_seen_at: Date;
}
// Store product snapshot record for canonical table
export interface StoreProductSnapshot {
id?: number;
dispensary_id: number;
store_product_id: number;
crawl_run_id: number;
captured_at: Date;
price_rec: number | null;
price_med: number | null;
is_on_special: boolean;
is_in_stock: boolean;
stock_quantity: number | null;
thc_percent: number | null;
cbd_percent: number | null;
raw_data: any;
}
// Hydration options
export interface HydrationOptions {
mode: 'backfill' | 'incremental';
dispensaryId?: number;
startDate?: Date;
endDate?: Date;
batchSize?: number;
dryRun?: boolean;
}
// Hydration result
export interface HydrationResult {
crawlRunsCreated: number;
crawlRunsSkipped: number;
productsUpserted: number;
snapshotsWritten: number;
errors: string[];
durationMs: number;
}
// Service context
export interface ServiceContext {
pool: Pool;
logger?: (message: string) => void;
}

View File

@@ -0,0 +1,657 @@
/**
* Base Dutchie Crawler Template
*
* This is the base template for all Dutchie store crawlers.
* Per-store crawlers extend this by overriding specific methods.
*
* Exports:
* - crawlProducts(dispensary, options) - Main crawl entry point
* - detectStructure(page) - Detect page structure for sandbox mode
* - extractProducts(document) - Extract product data
* - extractImages(document) - Extract product images
* - extractStock(document) - Extract stock status
* - extractPagination(document) - Extract pagination info
*/
import {
crawlDispensaryProducts as baseCrawlDispensaryProducts,
CrawlResult,
} from '../../dutchie-az/services/product-crawler';
import { Dispensary, CrawlerProfileOptions } from '../../dutchie-az/types';
// Re-export CrawlResult for convenience
export { CrawlResult };
// ============================================================
// TYPES
// ============================================================
/**
 * Options passed to the per-store crawler
 */
export interface StoreCrawlOptions {
  pricingType?: 'rec' | 'med'; // menu pricing mode (constructor default: 'rec')
  useBothModes?: boolean; // crawl rec and med pricing in one run
  downloadImages?: boolean;
  trackStock?: boolean;
  timeoutMs?: number;
  config?: Record<string, any>; // free-form per-store configuration
}
/**
 * Progress callback for reporting crawl progress
 *
 * NOTE(review): despite the name this is the progress *payload* shape,
 * not a function type — consider renaming if it is still unused.
 */
export interface CrawlProgressCallback {
  phase: 'fetching' | 'processing' | 'saving' | 'images' | 'complete';
  current: number;
  total: number;
  message?: string;
}
/**
 * Structure detection result for sandbox mode
 */
export interface StructureDetectionResult {
  success: boolean;
  menuType: 'dutchie' | 'treez' | 'jane' | 'unknown';
  iframeUrl?: string;
  graphqlEndpoint?: string;
  dispensaryId?: string;
  // CSS selectors the detector would crawl with, when known.
  selectors: {
    productContainer?: string;
    productName?: string;
    productPrice?: string;
    productImage?: string;
    productCategory?: string;
    pagination?: string;
    loadMore?: string;
  };
  // How the menu pages through products.
  pagination: {
    type: 'scroll' | 'click' | 'graphql' | 'none';
    hasMore?: boolean;
    pageSize?: number;
  };
  errors: string[];
  metadata: Record<string, any>;
}
/**
 * Product extraction result
 */
export interface ExtractedProduct {
  externalId: string;
  name: string;
  brand?: string;
  category?: string;
  subcategory?: string;
  price?: number;
  priceRec?: number;
  priceMed?: number;
  weight?: string;
  thcContent?: string;
  cbdContent?: string;
  description?: string;
  imageUrl?: string;
  stockStatus?: 'in_stock' | 'out_of_stock' | 'low_stock' | 'unknown';
  quantity?: number;
  raw?: Record<string, any>; // original provider payload, kept for debugging
}
/**
 * Image extraction result
 */
export interface ExtractedImage {
  productId: string;
  imageUrl: string;
  isPrimary: boolean;
  position: number;
}
/**
 * Stock extraction result
 */
export interface ExtractedStock {
  productId: string;
  status: 'in_stock' | 'out_of_stock' | 'low_stock' | 'unknown';
  quantity?: number;
  lastChecked: Date;
}
/**
 * Pagination extraction result
 */
export interface ExtractedPagination {
  hasNextPage: boolean;
  currentPage?: number;
  totalPages?: number;
  totalProducts?: number;
  nextCursor?: string;
  loadMoreSelector?: string;
}
/**
 * Hook points that per-store crawlers can override
 */
export interface DutchieCrawlerHooks {
  /**
   * Called before fetching products
   * Can be used to set up custom headers, cookies, etc.
   */
  beforeFetch?: (dispensary: Dispensary) => Promise<void>;
  /**
   * Called after fetching products, before processing
   * Can be used to filter or transform raw products
   * NOTE(review): not currently invoked by BaseDutchieCrawler.crawlProducts —
   * confirm whether it should be wired into the shared crawl.
   */
  afterFetch?: (products: any[], dispensary: Dispensary) => Promise<any[]>;
  /**
   * Called after all processing is complete
   * Can be used for cleanup or post-processing
   */
  afterComplete?: (result: CrawlResult, dispensary: Dispensary) => Promise<void>;
  /**
   * Custom selector resolver for iframe detection
   */
  resolveIframe?: (page: any) => Promise<string | null>;
  /**
   * Custom product container selector
   */
  getProductContainerSelector?: () => string;
  /**
   * Custom product extraction from container element
   */
  extractProductFromElement?: (element: any) => Promise<ExtractedProduct | null>;
}
/**
 * Selectors configuration for per-store overrides
 */
export interface DutchieSelectors {
  iframe?: string;
  productContainer?: string;
  productName?: string;
  productPrice?: string;
  productPriceRec?: string;
  productPriceMed?: string;
  productImage?: string;
  productCategory?: string;
  productBrand?: string;
  productWeight?: string;
  productThc?: string;
  productCbd?: string;
  productDescription?: string;
  productStock?: string;
  loadMore?: string;
  pagination?: string;
}
// ============================================================
// DEFAULT SELECTORS
// ============================================================
/**
 * Fallback CSS selectors for Dutchie embedded menus. Each entry lists
 * several alternatives (data-testid, plain class, hashed-class substring)
 * because Dutchie markup varies across menu versions.
 */
export const DEFAULT_DUTCHIE_SELECTORS: DutchieSelectors = {
  iframe: 'iframe[src*="dutchie.com"]',
  productContainer: '[data-testid="product-card"], .product-card, [class*="ProductCard"]',
  productName: '[data-testid="product-title"], .product-title, [class*="ProductTitle"]',
  productPrice: '[data-testid="product-price"], .product-price, [class*="ProductPrice"]',
  productImage: 'img[src*="dutchie"], img[src*="product"], .product-image img',
  productCategory: '[data-testid="category-name"], .category-name',
  productBrand: '[data-testid="brand-name"], .brand-name, [class*="BrandName"]',
  loadMore: 'button[data-testid="load-more"], .load-more-button',
  pagination: '.pagination, [class*="Pagination"]',
};
// ============================================================
// BASE CRAWLER CLASS
// ============================================================
/**
 * BaseDutchieCrawler - Base class for all Dutchie store crawlers.
 *
 * Per-store crawlers extend this class and override methods as needed.
 * The default implementation delegates to the existing shared Dutchie logic.
 */
export class BaseDutchieCrawler {
  protected dispensary: Dispensary;
  protected options: StoreCrawlOptions;
  protected hooks: DutchieCrawlerHooks;
  protected selectors: DutchieSelectors;

  constructor(
    dispensary: Dispensary,
    options: StoreCrawlOptions = {},
    hooks: DutchieCrawlerHooks = {},
    selectors: DutchieSelectors = {}
  ) {
    this.dispensary = dispensary;
    // Caller-supplied options/selectors win over these defaults.
    this.options = {
      pricingType: 'rec',
      useBothModes: true,
      downloadImages: true,
      trackStock: true,
      timeoutMs: 30000,
      ...options,
    };
    this.hooks = hooks;
    this.selectors = { ...DEFAULT_DUTCHIE_SELECTORS, ...selectors };
  }

  /**
   * Main entry point - crawl products for this dispensary.
   * Override this in per-store crawlers to customize behavior.
   *
   * NOTE(review): hooks.afterFetch is declared but never invoked here
   * because the shared crawl owns the fetch/process loop internally —
   * confirm whether it should be wired up.
   *
   * @returns The shared crawler's CrawlResult
   */
  async crawlProducts(): Promise<CrawlResult> {
    // Call beforeFetch hook if defined
    if (this.hooks.beforeFetch) {
      await this.hooks.beforeFetch(this.dispensary);
    }
    // Use the existing shared Dutchie crawl logic
    const result = await baseCrawlDispensaryProducts(
      this.dispensary,
      this.options.pricingType || 'rec',
      {
        useBothModes: this.options.useBothModes,
        downloadImages: this.options.downloadImages,
      }
    );
    // Call afterComplete hook if defined
    if (this.hooks.afterComplete) {
      await this.hooks.afterComplete(result, this.dispensary);
    }
    return result;
  }

  /**
   * Detect page structure for sandbox discovery mode.
   * Override in per-store crawlers if needed.
   *
   * @param page - Puppeteer page object or HTML string
   * @returns Structure detection result
   */
  async detectStructure(page: any): Promise<StructureDetectionResult> {
    const result: StructureDetectionResult = {
      success: false,
      menuType: 'unknown',
      selectors: {},
      pagination: { type: 'none' },
      errors: [],
      metadata: {},
    };
    try {
      if (typeof page === 'string') {
        // HTML string mode: a dutchie.com reference is enough to classify.
        if (page.includes('dutchie.com')) {
          result.menuType = 'dutchie';
          result.success = true;
        }
      } else if (page && typeof page.evaluate === 'function') {
        // Puppeteer page mode: probe the live DOM.
        const detection = await page.evaluate((selectorConfig: DutchieSelectors) => {
          // document.querySelector('') throws a SyntaxError, so fall back to
          // the stock selectors when an override blanked an entry out.
          const iframeSelector = selectorConfig.iframe || 'iframe[src*="dutchie.com"]';
          const containerSelector =
            selectorConfig.productContainer || '[data-testid="product-card"]';
          const iframe = document.querySelector(iframeSelector) as HTMLIFrameElement;
          const iframeUrl = iframe?.src || null;
          // Check for product containers
          const containers = document.querySelectorAll(containerSelector);
          return {
            hasIframe: !!iframe,
            iframeUrl,
            productCount: containers.length,
            isDutchie: !!iframeUrl?.includes('dutchie.com'),
          };
        }, this.selectors);
        if (detection.isDutchie) {
          result.menuType = 'dutchie';
          // detection.iframeUrl is string | null at runtime; normalize null
          // to undefined to honor the optional iframeUrl?: string field.
          result.iframeUrl = detection.iframeUrl ?? undefined;
          result.success = true;
        }
        result.metadata = detection;
      }
      // Report the selector set / pagination style we would crawl with.
      if (result.menuType === 'dutchie') {
        result.selectors = {
          productContainer: this.selectors.productContainer,
          productName: this.selectors.productName,
          productPrice: this.selectors.productPrice,
          productImage: this.selectors.productImage,
          productCategory: this.selectors.productCategory,
        };
        result.pagination = { type: 'graphql' };
      }
    } catch (error: any) {
      result.errors.push(`Detection error: ${error.message}`);
    }
    return result;
  }

  /**
   * Extract products from page/document.
   * Override in per-store crawlers for custom extraction.
   *
   * @param document - DOM document, Puppeteer page, or raw products array
   * @returns Array of extracted products
   */
  async extractProducts(document: any): Promise<ExtractedProduct[]> {
    // Default implementation: assume document is already an array of products
    // from the GraphQL response
    if (Array.isArray(document)) {
      return document.map((product) => this.mapRawProduct(product));
    }
    // If document is a Puppeteer page, extract from DOM
    if (document && typeof document.evaluate === 'function') {
      return this.extractProductsFromPage(document);
    }
    return [];
  }

  /**
   * Extract products from Puppeteer page.
   * Override for custom DOM extraction.
   */
  protected async extractProductsFromPage(page: any): Promise<ExtractedProduct[]> {
    const products = await page.evaluate((selectors: DutchieSelectors) => {
      // querySelector('') throws a SyntaxError; skip lookups with no selector.
      const pick = (root: ParentNode, sel?: string) => (sel ? root.querySelector(sel) : null);
      const containers = selectors.productContainer
        ? document.querySelectorAll(selectors.productContainer)
        : [];
      return Array.from(containers).map((container) => {
        const nameEl = pick(container, selectors.productName);
        const priceEl = pick(container, selectors.productPrice);
        const imageEl = pick(container, selectors.productImage) as HTMLImageElement | null;
        const brandEl = pick(container, selectors.productBrand);
        return {
          name: nameEl?.textContent?.trim() || '',
          price: priceEl?.textContent?.trim() || '',
          imageUrl: imageEl?.src || '',
          brand: brandEl?.textContent?.trim() || '',
        };
      });
    }, this.selectors);
    // DOM extraction has no stable provider id; synthesize positional ids.
    return products.map((p: any, i: number) => ({
      externalId: `dom-product-${i}`,
      name: p.name,
      brand: p.brand,
      price: this.parsePrice(p.price),
      imageUrl: p.imageUrl,
      stockStatus: 'unknown' as const,
    }));
  }

  /**
   * Map raw product from GraphQL to ExtractedProduct.
   * Override for custom mapping. Each field probes the known spellings
   * used by different Dutchie payload versions.
   */
  protected mapRawProduct(raw: any): ExtractedProduct {
    return {
      externalId: raw.id || raw._id || raw.externalId,
      name: raw.name || raw.Name,
      brand: raw.brand?.name || raw.brandName || raw.brand,
      category: raw.type || raw.category || raw.Category,
      subcategory: raw.subcategory || raw.Subcategory,
      price: raw.recPrice || raw.price || raw.Price,
      priceRec: raw.recPrice || raw.Prices?.rec,
      priceMed: raw.medPrice || raw.Prices?.med,
      weight: raw.weight || raw.Weight,
      thcContent: raw.potencyThc?.formatted || raw.THCContent?.formatted,
      cbdContent: raw.potencyCbd?.formatted || raw.CBDContent?.formatted,
      description: raw.description || raw.Description,
      imageUrl: raw.image || raw.Image,
      stockStatus: this.mapStockStatus(raw),
      quantity: raw.quantity || raw.Quantity,
      raw, // keep the original payload for debugging
    };
  }

  /**
   * Map raw stock status to standardized value.
   * Unrecognized statuses deliberately map to 'unknown', not 'out_of_stock'.
   */
  protected mapStockStatus(raw: any): 'in_stock' | 'out_of_stock' | 'low_stock' | 'unknown' {
    const status = raw.Status || raw.status || raw.stockStatus;
    if (status === 'Active' || status === 'active' || status === 'in_stock') {
      return 'in_stock';
    }
    if (status === 'Inactive' || status === 'inactive' || status === 'out_of_stock') {
      return 'out_of_stock';
    }
    if (status === 'low_stock') {
      return 'low_stock';
    }
    return 'unknown';
  }

  /**
   * Parse price string (e.g. "$12.50") to number.
   * Returns undefined for empty or non-numeric input.
   */
  protected parsePrice(priceStr: string): number | undefined {
    if (!priceStr) return undefined;
    const cleaned = priceStr.replace(/[^0-9.]/g, '');
    const num = parseFloat(cleaned);
    return isNaN(num) ? undefined : num;
  }

  /**
   * Extract images from document.
   * Override for custom image extraction.
   *
   * @param document - DOM document, Puppeteer page, or products array
   * @returns Array of extracted images
   */
  async extractImages(document: any): Promise<ExtractedImage[]> {
    if (Array.isArray(document)) {
      return document
        .filter((p) => p.image || p.Image || p.imageUrl)
        .map((p, i) => ({
          productId: p.id || p._id || `product-${i}`,
          imageUrl: p.image || p.Image || p.imageUrl,
          isPrimary: true,
          position: 0,
        }));
    }
    // Puppeteer page extraction
    if (document && typeof document.evaluate === 'function') {
      return this.extractImagesFromPage(document);
    }
    return [];
  }

  /**
   * Extract images from Puppeteer page.
   */
  protected async extractImagesFromPage(page: any): Promise<ExtractedImage[]> {
    const images = await page.evaluate((selector: string) => {
      const imgs = document.querySelectorAll(selector);
      return Array.from(imgs).map((img, i) => ({
        src: (img as HTMLImageElement).src,
        position: i,
      }));
    }, this.selectors.productImage || 'img');
    return images.map((img: any, i: number) => ({
      productId: `dom-product-${i}`,
      imageUrl: img.src,
      isPrimary: i === 0, // first image on the page is treated as primary
      position: img.position,
    }));
  }

  /**
   * Extract stock information from document.
   * Override for custom stock extraction.
   *
   * @param document - DOM document, Puppeteer page, or products array
   * @returns Array of extracted stock statuses
   */
  async extractStock(document: any): Promise<ExtractedStock[]> {
    if (Array.isArray(document)) {
      return document.map((p) => ({
        productId: p.id || p._id || p.externalId,
        status: this.mapStockStatus(p),
        quantity: p.quantity || p.Quantity,
        lastChecked: new Date(),
      }));
    }
    return [];
  }

  /**
   * Extract pagination information from document.
   * Override for custom pagination handling.
   *
   * @param document - DOM document, Puppeteer page, or GraphQL response
   * @returns Pagination info
   */
  async extractPagination(document: any): Promise<ExtractedPagination> {
    // Default: check for page info in GraphQL response
    if (document && document.pageInfo) {
      return {
        hasNextPage: document.pageInfo.hasNextPage || false,
        currentPage: document.pageInfo.currentPage,
        totalPages: document.pageInfo.totalPages,
        totalProducts: document.pageInfo.totalCount || document.totalCount,
        nextCursor: document.pageInfo.endCursor,
      };
    }
    // Default: no pagination
    return {
      hasNextPage: false,
    };
  }

  /**
   * Get the cName (Dutchie slug) for this dispensary.
   * Uses the last path segment of menuUrl when it has at least two
   * segments; falls back to the dispensary slug. Override to customize.
   */
  getCName(): string {
    if (this.dispensary.menuUrl) {
      try {
        const url = new URL(this.dispensary.menuUrl);
        const segments = url.pathname.split('/').filter(Boolean);
        if (segments.length >= 2) {
          return segments[segments.length - 1];
        }
      } catch {
        // Malformed menuUrl — fall through to the slug default.
      }
    }
    return this.dispensary.slug || '';
  }

  /**
   * Get custom headers for API requests (browser-like UA plus
   * dutchie.com Origin/Referer). Override for store-specific headers.
   */
  getCustomHeaders(): Record<string, string> {
    const cName = this.getCName();
    return {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      Origin: 'https://dutchie.com',
      Referer: `https://dutchie.com/embedded-menu/${cName}`,
    };
  }
}
// ============================================================
// FACTORY FUNCTION
// ============================================================
/**
 * Build a base Dutchie crawler for the given dispensary.
 * This is the default export used when no per-store override exists.
 */
export function createCrawler(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {},
  hooks: DutchieCrawlerHooks = {},
  selectors: DutchieSelectors = {}
): BaseDutchieCrawler {
  return new BaseDutchieCrawler(dispensary, options, hooks, selectors);
}
// ============================================================
// STANDALONE FUNCTIONS (required exports for orchestrator)
// ============================================================
/**
 * Crawl products using the base Dutchie logic.
 * Per-store files can call this or override it completely.
 */
export async function crawlProducts(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {}
): Promise<CrawlResult> {
  return createCrawler(dispensary, options).crawlProducts();
}
/** Detect structure using the base Dutchie logic. */
export async function detectStructure(
  page: any,
  dispensary?: Dispensary
): Promise<StructureDetectionResult> {
  return createCrawler(dispensary ?? ({} as Dispensary)).detectStructure(page);
}
/** Extract products using the base Dutchie logic. */
export async function extractProducts(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedProduct[]> {
  return createCrawler(dispensary ?? ({} as Dispensary)).extractProducts(document);
}
/** Extract images using the base Dutchie logic. */
export async function extractImages(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedImage[]> {
  return createCrawler(dispensary ?? ({} as Dispensary)).extractImages(document);
}
/** Extract stock using the base Dutchie logic. */
export async function extractStock(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedStock[]> {
  return createCrawler(dispensary ?? ({} as Dispensary)).extractStock(document);
}
/** Extract pagination using the base Dutchie logic. */
export async function extractPagination(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedPagination> {
  return createCrawler(dispensary ?? ({} as Dispensary)).extractPagination(document);
}

View File

@@ -0,0 +1,330 @@
/**
* Base Jane Crawler Template (PLACEHOLDER)
*
* This is the base template for all Jane (iheartjane) store crawlers.
* Per-store crawlers extend this by overriding specific methods.
*
* TODO: Implement Jane-specific crawling logic (Algolia-based)
*/
import { Dispensary } from '../../dutchie-az/types';
import {
StoreCrawlOptions,
CrawlResult,
StructureDetectionResult,
ExtractedProduct,
ExtractedImage,
ExtractedStock,
ExtractedPagination,
} from './base-dutchie';
// Re-export types
export {
StoreCrawlOptions,
CrawlResult,
StructureDetectionResult,
ExtractedProduct,
ExtractedImage,
ExtractedStock,
ExtractedPagination,
};
// ============================================================
// JANE-SPECIFIC TYPES
// ============================================================
/** Algolia connection details for a Jane (iheartjane) backed menu. */
export interface JaneConfig {
  algoliaAppId?: string;
  algoliaApiKey?: string;
  algoliaIndex?: string;
  storeId?: string;
}
/** CSS selector overrides for DOM-based extraction on Jane menus. */
export interface JaneSelectors {
  productContainer?: string;
  productName?: string;
  productPrice?: string;
  productImage?: string;
  productCategory?: string;
  productBrand?: string;
  pagination?: string;
  loadMore?: string;
}
/**
 * Fallback selectors for Jane menus; each entry lists a data-testid
 * alternative plus a plain-class alternative.
 */
export const DEFAULT_JANE_SELECTORS: JaneSelectors = {
  productContainer: '[data-testid="product-card"], .product-card',
  productName: '[data-testid="product-name"], .product-name',
  productPrice: '[data-testid="product-price"], .product-price',
  productImage: '.product-image img, [data-testid="product-image"] img',
  productCategory: '.product-category',
  productBrand: '.product-brand, [data-testid="brand-name"]',
  loadMore: '[data-testid="load-more"], .load-more-btn',
};
// ============================================================
// BASE JANE CRAWLER CLASS
// ============================================================
/**
 * Placeholder base class for Jane (iheartjane) store crawlers.
 * crawlProducts is a stub; detection and the extract* helpers already
 * understand Algolia-shaped responses.
 */
export class BaseJaneCrawler {
  protected dispensary: Dispensary;
  protected options: StoreCrawlOptions;
  protected selectors: JaneSelectors;
  protected janeConfig: JaneConfig;
  constructor(
    dispensary: Dispensary,
    options: StoreCrawlOptions = {},
    selectors: JaneSelectors = {},
    janeConfig: JaneConfig = {}
  ) {
    this.dispensary = dispensary;
    // Caller-supplied options/selectors win over these defaults.
    this.options = {
      pricingType: 'rec',
      useBothModes: false,
      downloadImages: true,
      trackStock: true,
      timeoutMs: 30000,
      ...options,
    };
    this.selectors = { ...DEFAULT_JANE_SELECTORS, ...selectors };
    this.janeConfig = janeConfig;
  }
  /**
   * Main entry point - crawl products for this dispensary
   * TODO: Implement Jane/Algolia-specific crawling
   * Currently logs a warning and returns a failed, zero-count CrawlResult.
   */
  async crawlProducts(): Promise<CrawlResult> {
    const startTime = Date.now();
    console.warn(`[BaseJaneCrawler] Jane crawling not yet implemented for ${this.dispensary.name}`);
    return {
      success: false,
      dispensaryId: this.dispensary.id || 0,
      productsFound: 0,
      productsFetched: 0,
      productsUpserted: 0,
      snapshotsCreated: 0,
      imagesDownloaded: 0,
      errorMessage: 'Jane crawler not yet implemented',
      durationMs: Date.now() - startTime,
    };
  }
  /**
   * Detect page structure for sandbox discovery mode
   * Jane uses Algolia, so we look for Algolia config
   */
  async detectStructure(page: any): Promise<StructureDetectionResult> {
    const result: StructureDetectionResult = {
      success: false,
      menuType: 'unknown',
      selectors: {},
      pagination: { type: 'none' },
      errors: [],
      metadata: {},
    };
    try {
      // Only Puppeteer-like pages are supported; anything else returns 'unknown'.
      if (page && typeof page.evaluate === 'function') {
        // Look for Jane/Algolia indicators
        const detection = await page.evaluate(() => {
          // Check for iheartjane in page
          const hasJane = document.documentElement.innerHTML.includes('iheartjane') ||
            document.documentElement.innerHTML.includes('jane-menu');
          // Look for Algolia config embedded in inline scripts.
          const scripts = Array.from(document.querySelectorAll('script'));
          let algoliaConfig: any = null;
          // NOTE: if several scripts embed a config, the last match wins.
          for (const script of scripts) {
            const content = script.textContent || '';
            if (content.includes('algolia') || content.includes('ALGOLIA')) {
              // Try to extract config
              const appIdMatch = content.match(/applicationId['":\s]+['"]([^'"]+)['"]/);
              const apiKeyMatch = content.match(/apiKey['":\s]+['"]([^'"]+)['"]/);
              if (appIdMatch && apiKeyMatch) {
                algoliaConfig = {
                  appId: appIdMatch[1],
                  apiKey: apiKeyMatch[1],
                };
              }
            }
          }
          return {
            hasJane,
            algoliaConfig,
          };
        });
        if (detection.hasJane) {
          result.menuType = 'jane';
          result.success = true;
          result.metadata = detection;
          // Surface extracted Algolia credentials for downstream crawling.
          if (detection.algoliaConfig) {
            result.metadata.algoliaAppId = detection.algoliaConfig.appId;
            result.metadata.algoliaApiKey = detection.algoliaConfig.apiKey;
          }
        }
      }
    } catch (error: any) {
      result.errors.push(`Detection error: ${error.message}`);
    }
    return result;
  }
  /**
   * Extract products from Algolia response or page
   * Only an Algolia hits array is handled today; other inputs warn and
   * return an empty list.
   */
  async extractProducts(document: any): Promise<ExtractedProduct[]> {
    // If document is Algolia hits array
    if (Array.isArray(document)) {
      return document.map((hit) => this.mapAlgoliaHit(hit));
    }
    console.warn('[BaseJaneCrawler] extractProducts not yet fully implemented');
    return [];
  }
  /**
   * Map Algolia hit to ExtractedProduct.
   * Each field probes the alternate key spellings seen in Jane indices.
   */
  protected mapAlgoliaHit(hit: any): ExtractedProduct {
    return {
      externalId: hit.objectID || hit.id || hit.product_id,
      name: hit.name || hit.product_name,
      brand: hit.brand || hit.brand_name,
      category: hit.category || hit.kind,
      subcategory: hit.subcategory,
      price: hit.price || hit.bucket_price,
      priceRec: hit.prices?.rec || hit.price_rec,
      priceMed: hit.prices?.med || hit.price_med,
      weight: hit.weight || hit.amount,
      // Numeric potency fields are reformatted as percentage strings.
      thcContent: hit.percent_thc ? `${hit.percent_thc}%` : undefined,
      cbdContent: hit.percent_cbd ? `${hit.percent_cbd}%` : undefined,
      description: hit.description,
      imageUrl: hit.image_url || hit.product_image_url,
      stockStatus: hit.available ? 'in_stock' : 'out_of_stock',
      quantity: hit.quantity_available,
      raw: hit, // keep the original hit for debugging
    };
  }
  /**
   * Extract images from document (Algolia hits array only).
   */
  async extractImages(document: any): Promise<ExtractedImage[]> {
    if (Array.isArray(document)) {
      return document
        .filter((hit) => hit.image_url || hit.product_image_url)
        .map((hit, i) => ({
          productId: hit.objectID || hit.id || `jane-product-${i}`,
          imageUrl: hit.image_url || hit.product_image_url,
          isPrimary: true,
          position: 0,
        }));
    }
    return [];
  }
  /**
   * Extract stock information from document (Algolia hits array only).
   */
  async extractStock(document: any): Promise<ExtractedStock[]> {
    if (Array.isArray(document)) {
      return document.map((hit) => ({
        productId: hit.objectID || hit.id,
        status: hit.available ? 'in_stock' as const : 'out_of_stock' as const,
        quantity: hit.quantity_available,
        lastChecked: new Date(),
      }));
    }
    return [];
  }
  /**
   * Extract pagination information
   * Algolia uses cursor-based pagination
   * Assumes Algolia-style 0-indexed `page` with `nbPages` as a count;
   * missing fields make the comparison false, i.e. "no next page".
   */
  async extractPagination(document: any): Promise<ExtractedPagination> {
    if (document && typeof document === 'object' && !Array.isArray(document)) {
      return {
        hasNextPage: document.page < document.nbPages - 1,
        currentPage: document.page,
        totalPages: document.nbPages,
        totalProducts: document.nbHits,
      };
    }
    return { hasNextPage: false };
  }
}
// ============================================================
// FACTORY FUNCTION
// ============================================================
/** Build a base Jane crawler for the given dispensary. */
export function createCrawler(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {},
  selectors: JaneSelectors = {},
  janeConfig: JaneConfig = {}
): BaseJaneCrawler {
  return new BaseJaneCrawler(dispensary, options, selectors, janeConfig);
}
// ============================================================
// STANDALONE FUNCTIONS
// ============================================================
/** Crawl products using the base Jane logic. */
export async function crawlProducts(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {}
): Promise<CrawlResult> {
  return createCrawler(dispensary, options).crawlProducts();
}
/** Detect structure using the base Jane logic. */
export async function detectStructure(
  page: any,
  dispensary?: Dispensary
): Promise<StructureDetectionResult> {
  return createCrawler(dispensary ?? ({} as Dispensary)).detectStructure(page);
}
/** Extract products using the base Jane logic. */
export async function extractProducts(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedProduct[]> {
  return createCrawler(dispensary ?? ({} as Dispensary)).extractProducts(document);
}
/** Extract images using the base Jane logic. */
export async function extractImages(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedImage[]> {
  return createCrawler(dispensary ?? ({} as Dispensary)).extractImages(document);
}
/** Extract stock using the base Jane logic. */
export async function extractStock(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedStock[]> {
  return createCrawler(dispensary ?? ({} as Dispensary)).extractStock(document);
}
/** Extract pagination using the base Jane logic. */
export async function extractPagination(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedPagination> {
  return createCrawler(dispensary ?? ({} as Dispensary)).extractPagination(document);
}

View File

@@ -0,0 +1,212 @@
/**
* Base Treez Crawler Template (PLACEHOLDER)
*
* This is the base template for all Treez store crawlers.
* Per-store crawlers extend this by overriding specific methods.
*
* TODO: Implement Treez-specific crawling logic
*/
import { Dispensary } from '../../dutchie-az/types';
import {
StoreCrawlOptions,
CrawlResult,
StructureDetectionResult,
ExtractedProduct,
ExtractedImage,
ExtractedStock,
ExtractedPagination,
} from './base-dutchie';
// Re-export types
export {
StoreCrawlOptions,
CrawlResult,
StructureDetectionResult,
ExtractedProduct,
ExtractedImage,
ExtractedStock,
ExtractedPagination,
};
// ============================================================
// TREEZ-SPECIFIC TYPES
// ============================================================
/** CSS selector overrides for DOM-based extraction on Treez menus. */
export interface TreezSelectors {
  productContainer?: string;
  productName?: string;
  productPrice?: string;
  productImage?: string;
  productCategory?: string;
  productBrand?: string;
  addToCart?: string;
  pagination?: string;
}
/** Fallback selectors for Treez menus (plain-class and hashed-class variants). */
export const DEFAULT_TREEZ_SELECTORS: TreezSelectors = {
  productContainer: '.product-tile, [class*="ProductCard"]',
  productName: '.product-name, [class*="ProductName"]',
  productPrice: '.product-price, [class*="ProductPrice"]',
  productImage: '.product-image img',
  productCategory: '.product-category',
  productBrand: '.product-brand',
  addToCart: '.add-to-cart-btn',
  pagination: '.pagination',
};
// ============================================================
// BASE TREEZ CRAWLER CLASS
// ============================================================
/**
 * Placeholder base class for Treez store crawlers. Every crawl/extract
 * method is currently a stub that reports "not implemented"; per-store
 * crawlers (and the eventual real implementation) override these methods.
 */
export class BaseTreezCrawler {
  protected dispensary: Dispensary;
  protected options: StoreCrawlOptions;
  protected selectors: TreezSelectors;

  constructor(
    dispensary: Dispensary,
    options: StoreCrawlOptions = {},
    selectors: TreezSelectors = {}
  ) {
    this.dispensary = dispensary;
    // Caller-supplied values override the stock defaults.
    const defaults: StoreCrawlOptions = {
      pricingType: 'rec',
      useBothModes: false,
      downloadImages: true,
      trackStock: true,
      timeoutMs: 30000,
    };
    this.options = { ...defaults, ...options };
    this.selectors = { ...DEFAULT_TREEZ_SELECTORS, ...selectors };
  }

  /**
   * Main entry point. Currently a stub: logs a warning and returns a
   * failed, zero-count CrawlResult without touching the network.
   * TODO: Implement Treez-specific crawling.
   */
  async crawlProducts(): Promise<CrawlResult> {
    const startedAt = Date.now();
    console.warn(`[BaseTreezCrawler] Treez crawling not yet implemented for ${this.dispensary.name}`);
    return {
      success: false,
      dispensaryId: this.dispensary.id || 0,
      productsFound: 0,
      productsFetched: 0,
      productsUpserted: 0,
      snapshotsCreated: 0,
      imagesDownloaded: 0,
      errorMessage: 'Treez crawler not yet implemented',
      durationMs: Date.now() - startedAt,
    };
  }

  /** Structure-detection stub: always fails with an explanatory error. */
  async detectStructure(page: any): Promise<StructureDetectionResult> {
    const outcome: StructureDetectionResult = {
      success: false,
      menuType: 'unknown',
      selectors: {},
      pagination: { type: 'none' },
      errors: ['Treez structure detection not yet implemented'],
      metadata: {},
    };
    return outcome;
  }

  /** Product-extraction stub: warns and yields an empty list. */
  async extractProducts(document: any): Promise<ExtractedProduct[]> {
    console.warn('[BaseTreezCrawler] extractProducts not yet implemented');
    return [];
  }

  /** Image-extraction stub: warns and yields an empty list. */
  async extractImages(document: any): Promise<ExtractedImage[]> {
    console.warn('[BaseTreezCrawler] extractImages not yet implemented');
    return [];
  }

  /** Stock-extraction stub: warns and yields an empty list. */
  async extractStock(document: any): Promise<ExtractedStock[]> {
    console.warn('[BaseTreezCrawler] extractStock not yet implemented');
    return [];
  }

  /** Pagination stub: reports that there are no further pages. */
  async extractPagination(document: any): Promise<ExtractedPagination> {
    return { hasNextPage: false };
  }
}
// ============================================================
// FACTORY FUNCTION
// ============================================================
/**
 * Build a BaseTreezCrawler for the given dispensary.
 * Options and selector overrides are passed straight through.
 */
export function createCrawler(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {},
  selectors: TreezSelectors = {}
): BaseTreezCrawler {
  const crawler = new BaseTreezCrawler(dispensary, options, selectors);
  return crawler;
}
// ============================================================
// STANDALONE FUNCTIONS
// ============================================================
export async function crawlProducts(
dispensary: Dispensary,
options: StoreCrawlOptions = {}
): Promise<CrawlResult> {
const crawler = createCrawler(dispensary, options);
return crawler.crawlProducts();
}
/**
 * Module-level convenience wrapper around BaseTreezCrawler.detectStructure.
 * Uses an empty dispensary stub when none is supplied.
 */
export async function detectStructure(
  page: any,
  dispensary?: Dispensary
): Promise<StructureDetectionResult> {
  return createCrawler(dispensary || ({} as Dispensary)).detectStructure(page);
}
/**
 * Module-level convenience wrapper around BaseTreezCrawler.extractProducts.
 * Uses an empty dispensary stub when none is supplied.
 */
export async function extractProducts(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedProduct[]> {
  return createCrawler(dispensary || ({} as Dispensary)).extractProducts(document);
}
/**
 * Module-level convenience wrapper around BaseTreezCrawler.extractImages.
 * Uses an empty dispensary stub when none is supplied.
 */
export async function extractImages(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedImage[]> {
  return createCrawler(dispensary || ({} as Dispensary)).extractImages(document);
}
/**
 * Module-level convenience wrapper around BaseTreezCrawler.extractStock.
 * Uses an empty dispensary stub when none is supplied.
 */
export async function extractStock(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedStock[]> {
  return createCrawler(dispensary || ({} as Dispensary)).extractStock(document);
}
/**
 * Module-level convenience wrapper around BaseTreezCrawler.extractPagination.
 * Uses an empty dispensary stub when none is supplied.
 */
export async function extractPagination(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedPagination> {
  return createCrawler(dispensary || ({} as Dispensary)).extractPagination(document);
}

View File

@@ -0,0 +1,27 @@
/**
 * Base Crawler Templates Index
 *
 * Exports all base crawler templates for easy importing.
 */
// Dutchie base (primary implementation) - flattened, so its symbols are
// importable directly from this index.
export * from './base-dutchie';
// Treez base (placeholder) - namespaced to avoid colliding with the flat
// Dutchie exports above (both modules declare e.g. crawlProducts).
export * as Treez from './base-treez';
// Jane base (placeholder) - namespaced for the same reason.
export * as Jane from './base-jane';
// Re-export common types from dutchie for convenience
export type {
  StoreCrawlOptions,
  CrawlResult,
  StructureDetectionResult,
  ExtractedProduct,
  ExtractedImage,
  ExtractedStock,
  ExtractedPagination,
  DutchieCrawlerHooks,
  DutchieSelectors,
} from './base-dutchie';

View File

@@ -0,0 +1,9 @@
/**
 * Base Dutchie Crawler Template (Re-export for backward compatibility)
 *
 * DEPRECATED: Import from '../base/base-dutchie' instead.
 * This file re-exports everything from the new location so existing
 * import paths keep working; it adds no symbols of its own.
 */
// Re-export everything from the new base location
export * from '../base/base-dutchie';

View File

@@ -0,0 +1,118 @@
/**
* Trulieve Scottsdale - Per-Store Dutchie Crawler
*
* Store ID: 101
* Profile Key: trulieve-scottsdale
* Platform Dispensary ID: 5eaf489fa8a61801212577cc
*
* Phase 1: Identity implementation - no overrides, just uses base Dutchie logic.
* Future: Add store-specific selectors, timing, or custom logic as needed.
*/
import {
BaseDutchieCrawler,
StoreCrawlOptions,
CrawlResult,
DutchieSelectors,
crawlProducts as baseCrawlProducts,
} from '../../base/base-dutchie';
import { Dispensary } from '../../../dutchie-az/types';
// Re-export CrawlResult for the orchestrator.
// 'export type' because CrawlResult is a type-only symbol (the base index
// exports it under 'export type'); a plain value re-export fails under
// isolatedModules and would emit a dangling runtime binding.
export type { CrawlResult };
// ============================================================
// STORE CONFIGURATION
// ============================================================
/**
 * Store-specific configuration
 * These can be used to customize crawler behavior for this store
 */
export const STORE_CONFIG = {
  storeId: 101,                      // internal store ID (see file header)
  profileKey: 'trulieve-scottsdale', // per-store profile key
  name: 'Trulieve of Scottsdale Dispensary',
  // Dutchie's platform-side identifier for this location
  platformDispensaryId: '5eaf489fa8a61801212577cc',
  // Store-specific overrides (none for Phase 1)
  customOptions: {
    // Example future overrides:
    // pricingType: 'rec',
    // useBothModes: true,
    // customHeaders: {},
    // maxRetries: 3,
  },
};
// ============================================================
// STORE CRAWLER CLASS
// ============================================================
/**
 * TrulieveScottsdaleCrawler - Per-store crawler for Trulieve Scottsdale.
 *
 * Phase 1 is an identity implementation: all behavior comes from
 * BaseDutchieCrawler; only the option merge in the constructor is
 * store-specific. Later phases may override getCName(), crawlProducts(),
 * or add pre/post-processing hooks.
 */
export class TrulieveScottsdaleCrawler extends BaseDutchieCrawler {
  constructor(dispensary: Dispensary, options: StoreCrawlOptions = {}) {
    // Store defaults are spread first so explicit caller options win.
    super(dispensary, { ...STORE_CONFIG.customOptions, ...options });
  }

  // Phase 1 intentionally adds no overrides. Example for a future phase:
  //
  //   async crawlProducts(): Promise<CrawlResult> {
  //     // custom pre-processing ...
  //     const result = await super.crawlProducts();
  //     // custom post-processing ...
  //     return result;
  //   }
}
// ============================================================
// EXPORTED CRAWL FUNCTION
// ============================================================
/**
 * Main entry point for the orchestrator.
 *
 * The orchestrator calls: mod.crawlProducts(dispensary, options).
 * Instantiates a TrulieveScottsdaleCrawler and delegates to it.
 */
export async function crawlProducts(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {}
): Promise<CrawlResult> {
  console.log(`[TrulieveScottsdale] Using per-store crawler for ${dispensary.name}`);
  return new TrulieveScottsdaleCrawler(dispensary, options).crawlProducts();
}
// ============================================================
// FACTORY FUNCTION (alternative API)
// ============================================================
/**
 * Create a crawler instance without running it.
 * Useful for testing or when you need to configure before running.
 */
export function createCrawler(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {}
): TrulieveScottsdaleCrawler {
  const crawler = new TrulieveScottsdaleCrawler(dispensary, options);
  return crawler;
}

View File

@@ -1,4 +1,4 @@
import { pool } from './migrate';
import { pool } from './pool';
async function addJobsTable() {
const client = await pool.connect();

View File

@@ -1,18 +1,58 @@
/**
* Database Migration Script (CLI-ONLY)
*
* This file is for running migrations via CLI only:
* npx tsx src/db/migrate.ts
*
* DO NOT import this file from runtime code.
* Runtime code should import from src/db/pool.ts instead.
*/
import { Pool } from 'pg';
import dotenv from 'dotenv';
// Consolidated DB connection:
// - Prefer CRAWLSY_DATABASE_URL (e.g., crawlsy_local, crawlsy_prod)
// - Then DATABASE_URL (default)
const DATABASE_URL =
process.env.CRAWLSY_DATABASE_URL ||
process.env.DATABASE_URL ||
'postgresql://dutchie:dutchie_local_pass@localhost:54320/crawlsy_local';
// Load .env BEFORE any env var access
dotenv.config();
const pool = new Pool({
connectionString: DATABASE_URL,
});
/**
 * Get the database connection string from environment variables.
 * Strict validation - will throw if required vars are missing.
 *
 * User and password are percent-encoded so credentials containing URL
 * reserved characters (@ : / etc.) still yield a valid connection string.
 */
function getConnectionString(): string {
  // Priority 1: Full connection URL
  if (process.env.CANNAIQ_DB_URL) {
    return process.env.CANNAIQ_DB_URL;
  }
  // Priority 2: Build from individual env vars (all required)
  const required = ['CANNAIQ_DB_HOST', 'CANNAIQ_DB_PORT', 'CANNAIQ_DB_NAME', 'CANNAIQ_DB_USER', 'CANNAIQ_DB_PASS'];
  const missing = required.filter((key) => !process.env[key]);
  if (missing.length > 0) {
    throw new Error(
      `[Migrate] Missing required environment variables: ${missing.join(', ')}\n` +
      `Either set CANNAIQ_DB_URL or all of: CANNAIQ_DB_HOST, CANNAIQ_DB_PORT, CANNAIQ_DB_NAME, CANNAIQ_DB_USER, CANNAIQ_DB_PASS`
    );
  }
  const host = process.env.CANNAIQ_DB_HOST!;
  const port = process.env.CANNAIQ_DB_PORT!;
  const name = process.env.CANNAIQ_DB_NAME!;
  // Encode credentials: a raw '@' or ':' in the password would otherwise
  // corrupt the URL's authority section.
  const user = encodeURIComponent(process.env.CANNAIQ_DB_USER!);
  const pass = encodeURIComponent(process.env.CANNAIQ_DB_PASS!);
  return `postgresql://${user}:${pass}@${host}:${port}/${name}`;
}
/**
* Run all database migrations
*/
async function runMigrations() {
// Create pool only when migrations are actually run
const pool = new Pool({
connectionString: getConnectionString(),
});
export async function runMigrations() {
const client = await pool.connect();
try {
@@ -340,12 +380,12 @@ export async function runMigrations() {
throw error;
} finally {
client.release();
await pool.end();
}
}
export { pool };
// Run migrations if this file is executed directly
// Only run when executed directly (CLI mode)
// DO NOT export pool - runtime code must use src/db/pool.ts
if (require.main === module) {
runMigrations()
.then(() => process.exit(0))

94
backend/src/db/pool.ts Normal file
View File

@@ -0,0 +1,94 @@
/**
* Runtime Database Pool
*
* This is the canonical database pool for all runtime services.
* Import pool from here, NOT from migrate.ts.
*
* migrate.ts is for CLI migrations only and must NOT be imported at runtime.
*/
import dotenv from 'dotenv';
import { Pool } from 'pg';
// Load .env before any env var access
dotenv.config();
/**
 * Get the database connection string from environment variables.
 * Supports both CANNAIQ_DB_URL and individual CANNAIQ_DB_* vars.
 *
 * User and password are percent-encoded so credentials containing URL
 * reserved characters (@ : / etc.) still yield a valid connection string.
 */
function getConnectionString(): string {
  // Priority 1: Full connection URL
  if (process.env.CANNAIQ_DB_URL) {
    return process.env.CANNAIQ_DB_URL;
  }
  // Priority 2: Build from individual env vars
  const host = process.env.CANNAIQ_DB_HOST;
  const port = process.env.CANNAIQ_DB_PORT;
  const name = process.env.CANNAIQ_DB_NAME;
  const user = process.env.CANNAIQ_DB_USER;
  const pass = process.env.CANNAIQ_DB_PASS;
  // Check if all individual vars are present
  if (host && port && name && user && pass) {
    // Encode credentials: a raw '@' or ':' in the password would otherwise
    // corrupt the URL's authority section.
    return `postgresql://${encodeURIComponent(user)}:${encodeURIComponent(pass)}@${host}:${port}/${name}`;
  }
  // Fallback: Try DATABASE_URL for legacy compatibility
  if (process.env.DATABASE_URL) {
    return process.env.DATABASE_URL;
  }
  // Report what's missing
  const required = ['CANNAIQ_DB_HOST', 'CANNAIQ_DB_PORT', 'CANNAIQ_DB_NAME', 'CANNAIQ_DB_USER', 'CANNAIQ_DB_PASS'];
  const missing = required.filter((key) => !process.env[key]);
  throw new Error(
    `[DB Pool] Missing database configuration.\n` +
    `Set CANNAIQ_DB_URL, or all of: ${missing.join(', ')}`
  );
}
// Lazy-initialized pool singleton
let _pool: Pool | null = null;
/**
 * Get the database pool (lazy singleton).
 * The pool is created on first call; every later call returns the same
 * instance until closePool() resets it.
 */
export function getPool(): Pool {
  if (_pool) {
    return _pool;
  }
  _pool = new Pool({
    connectionString: getConnectionString(),
    max: 10,
    idleTimeoutMillis: 30000,
    connectionTimeoutMillis: 5000,
  });
  // Log (rather than crash the process) when an idle client errors out.
  _pool.on('error', (err) => {
    console.error('[DB Pool] Unexpected error on idle client:', err);
  });
  return _pool;
}
/**
 * The database pool for runtime use.
 *
 * A thin facade over the lazy singleton: each property delegates to
 * getPool() at call time, so merely importing this module never opens
 * a connection.
 */
export const pool = {
  // Forward query() with the Pool's own parameter types preserved.
  query: (...args: Parameters<Pool['query']>) => getPool().query(...args),
  connect: () => getPool().connect(),
  end: () => getPool().end(),
  // NOTE(review): listeners attach to whichever pool instance exists at call
  // time; if closePool() later recreates the pool they are not re-attached.
  on: (event: 'error' | 'connect' | 'acquire' | 'remove' | 'release', listener: (...args: any[]) => void) => getPool().on(event as any, listener),
};
/**
 * Close the pool connection and clear the singleton so a subsequent
 * getPool() call creates a fresh pool.
 */
export async function closePool(): Promise<void> {
  if (!_pool) {
    return;
  }
  await _pool.end();
  _pool = null;
}

View File

@@ -1,4 +1,4 @@
import { pool } from './migrate';
import { pool } from './pool';
import * as fs from 'fs';
import * as path from 'path';

View File

@@ -1,4 +1,4 @@
import { pool } from './migrate';
import { pool } from './pool';
import bcrypt from 'bcrypt';
export async function seedDatabase() {

View File

@@ -1,4 +1,4 @@
import { pool } from './migrate';
import { pool } from './pool';
async function updateCategoriesHierarchy() {
const client = await pool.connect();

View File

@@ -0,0 +1,474 @@
/**
* Dutchie City Discovery Service
*
* Discovers cities from the Dutchie cities page.
* Each city can contain multiple dispensary locations.
*
* Source: https://dutchie.com/cities
*
* This module ONLY handles city discovery and upserts to dutchie_discovery_cities.
* It does NOT create any dispensary records.
*/
import { Pool } from 'pg';
import axios from 'axios';
import * as cheerio from 'cheerio';
import {
DiscoveryCity,
DiscoveryCityRow,
DutchieCityResponse,
CityDiscoveryResult,
mapCityRowToCity,
} from './types';
const CITIES_PAGE_URL = 'https://dutchie.com/cities';
const PLATFORM = 'dutchie';
// ============================================================
// CITY PAGE SCRAPING
// ============================================================
/**
 * Fetch and parse the Dutchie cities page.
 * Returns a list of cities with their slugs and states.
 */
export async function fetchCitiesFromPage(): Promise<DutchieCityResponse[]> {
  console.log(`[CityDiscovery] Fetching cities from ${CITIES_PAGE_URL}...`);
  const response = await axios.get(CITIES_PAGE_URL, {
    headers: {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
      'Accept-Language': 'en-US,en;q=0.9',
    },
    timeout: 30000,
  });
  const $ = cheerio.load(response.data);
  const collected: DutchieCityResponse[] = [];

  // Structure 1: links in /dispensaries/{state}/{city} format.
  $('a[href*="/dispensaries/"]').each((_, el) => {
    const href = $(el).attr('href') || '';
    const label = $(el).text().trim();
    const m = href.match(/\/dispensaries\/([a-z]{2,3})\/([a-z0-9-]+)/i);
    if (!m) return;
    const [, stateCode, citySlug] = m;
    collected.push({
      slug: citySlug,
      name: label || citySlug.replace(/-/g, ' '),
      stateCode: stateCode.toUpperCase(),
      // 2-letter code = US state; 3+ letters = Canadian province.
      countryCode: stateCode.length === 2 ? 'US' : 'CA',
    });
  });

  // Structure 2: links in /city/{slug} format (no state information).
  $('a[href*="/city/"]').each((_, el) => {
    const href = $(el).attr('href') || '';
    const label = $(el).text().trim();
    const m = href.match(/\/city\/([a-z0-9-]+)/i);
    if (!m) return;
    collected.push({
      slug: m[1],
      name: label || m[1].replace(/-/g, ' '),
    });
  });

  // Dedupe on (country, state, slug); first occurrence wins.
  const seen = new Map<string, DutchieCityResponse>();
  for (const city of collected) {
    const key = `${city.countryCode || 'unknown'}-${city.stateCode || 'unknown'}-${city.slug}`;
    if (!seen.has(key)) {
      seen.set(key, city);
    }
  }
  const unique = Array.from(seen.values());
  console.log(`[CityDiscovery] Found ${unique.length} unique cities`);
  return unique;
}
/**
 * Alternative: Fetch cities from Dutchie's internal API/GraphQL
 * This is a fallback if the HTML scraping doesn't work.
 */
export async function fetchCitiesFromApi(): Promise<DutchieCityResponse[]> {
  console.log('[CityDiscovery] Attempting to fetch cities from API...');
  // Exploratory: Dutchie may expose cities on one of these endpoints.
  const possibleEndpoints = [
    'https://dutchie.com/api/cities',
    'https://dutchie.com/api-3/cities',
    'https://api.dutchie.com/v1/cities',
  ];
  for (const endpoint of possibleEndpoints) {
    try {
      const response = await axios.get(endpoint, {
        headers: {
          'Accept': 'application/json',
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        },
        timeout: 10000,
        validateStatus: () => true, // treat non-2xx as data, not exceptions
      });
      if (response.status !== 200 || !Array.isArray(response.data)) {
        continue;
      }
      console.log(`[CityDiscovery] Found cities at ${endpoint}`);
      // Normalize whichever field-name variant the endpoint returns.
      return response.data.map((raw: any) => ({
        slug: raw.slug || raw.city_slug,
        name: raw.name || raw.city_name,
        stateCode: raw.stateCode || raw.state_code || raw.state,
        countryCode: raw.countryCode || raw.country_code || raw.country || 'US',
      }));
    } catch {
      // Endpoint unreachable - try the next one.
    }
  }
  console.log('[CityDiscovery] No API endpoint found, falling back to page scraping');
  return [];
}
// ============================================================
// DATABASE OPERATIONS
// ============================================================
/**
 * Upsert a city into dutchie_discovery_cities.
 * Returns the city ID and whether the row was newly inserted.
 */
export async function upsertCity(
  pool: Pool,
  city: DutchieCityResponse
): Promise<{ id: number; isNew: boolean }> {
  // Conflict target is the natural key (platform, country, state, slug);
  // on conflict only the display name and updated_at are refreshed.
  // (xmax = 0) is a Postgres idiom: xmax is zero for a freshly inserted
  // row and non-zero when the ON CONFLICT UPDATE path ran, so it reports
  // insert-vs-update without a second query.
  const result = await pool.query(
    `INSERT INTO dutchie_discovery_cities (
      platform,
      city_name,
      city_slug,
      state_code,
      country_code,
      updated_at
    ) VALUES ($1, $2, $3, $4, $5, NOW())
    ON CONFLICT (platform, country_code, state_code, city_slug)
    DO UPDATE SET
      city_name = EXCLUDED.city_name,
      updated_at = NOW()
    RETURNING id, (xmax = 0) as is_new`,
    [
      PLATFORM,
      city.name,
      city.slug,
      city.stateCode || null,
      city.countryCode || 'US',
    ]
  );
  return {
    id: result.rows[0].id,
    isNew: result.rows[0].is_new,
  };
}
/**
 * Mark a city as crawled and update location count.
 *
 * @param cityId        Row id in dutchie_discovery_cities
 * @param locationCount Number of locations found in this crawl pass
 */
export async function markCityCrawled(
  pool: Pool,
  cityId: number,
  locationCount: number
): Promise<void> {
  await pool.query(
    `UPDATE dutchie_discovery_cities
    SET last_crawled_at = NOW(),
    location_count = $2,
    updated_at = NOW()
    WHERE id = $1`,
    [cityId, locationCount]
  );
}
/**
 * Get all cities that need to be crawled.
 *
 * @param options.stateCode   Restrict to one state
 * @param options.countryCode Restrict to one country
 * @param options.limit       Max rows returned (default 100)
 * @param options.onlyStale   Only cities never crawled or crawled more than
 *                            `staleDays` days ago
 * @param options.staleDays   Staleness threshold in days (default 7)
 */
export async function getCitiesToCrawl(
  pool: Pool,
  options: {
    stateCode?: string;
    countryCode?: string;
    limit?: number;
    onlyStale?: boolean;
    staleDays?: number;
  } = {}
): Promise<DiscoveryCity[]> {
  const {
    stateCode,
    countryCode,
    limit = 100,
    onlyStale = false,
    staleDays = 7,
  } = options;
  let query = `
    SELECT *
    FROM dutchie_discovery_cities
    WHERE crawl_enabled = TRUE
  `;
  const params: any[] = [];
  let paramIdx = 1;
  if (stateCode) {
    query += ` AND state_code = $${paramIdx}`;
    params.push(stateCode);
    paramIdx++;
  }
  if (countryCode) {
    query += ` AND country_code = $${paramIdx}`;
    params.push(countryCode);
    paramIdx++;
  }
  if (onlyStale) {
    // Pass staleDays as a bound parameter via make_interval() instead of
    // interpolating it into the SQL text - removes any injection risk and
    // keeps the statement text stable.
    query += ` AND (last_crawled_at IS NULL OR last_crawled_at < NOW() - make_interval(days => $${paramIdx}))`;
    params.push(staleDays);
    paramIdx++;
  }
  // Never-crawled cities sort first, then oldest crawls.
  query += ` ORDER BY last_crawled_at ASC NULLS FIRST LIMIT $${paramIdx}`;
  params.push(limit);
  const result = await pool.query<DiscoveryCityRow>(query, params);
  return result.rows.map(mapCityRowToCity);
}
/**
 * Get a city by ID. Returns null when no row matches.
 */
export async function getCityById(
  pool: Pool,
  id: number
): Promise<DiscoveryCity | null> {
  const { rows } = await pool.query<DiscoveryCityRow>(
    `SELECT * FROM dutchie_discovery_cities WHERE id = $1`,
    [id]
  );
  return rows.length > 0 ? mapCityRowToCity(rows[0]) : null;
}
/**
 * Get a city by slug, optionally narrowed by state.
 * Returns null when no row matches.
 */
export async function getCityBySlug(
  pool: Pool,
  slug: string,
  stateCode?: string,
  countryCode: string = 'US'
): Promise<DiscoveryCity | null> {
  const params: any[] = [PLATFORM, slug, countryCode];
  let query = `
    SELECT * FROM dutchie_discovery_cities
    WHERE platform = $1 AND city_slug = $2 AND country_code = $3
  `;
  if (stateCode) {
    query += ` AND state_code = $4`;
    params.push(stateCode);
  }
  const { rows } = await pool.query<DiscoveryCityRow>(query, params);
  return rows.length === 0 ? null : mapCityRowToCity(rows[0]);
}
// ============================================================
// MAIN DISCOVERY FUNCTION
// ============================================================
/**
 * Run the full city discovery process.
 * Fetches cities from Dutchie and upserts them into the database.
 */
export async function discoverCities(
  pool: Pool,
  options: {
    dryRun?: boolean;
    verbose?: boolean;
  } = {}
): Promise<CityDiscoveryResult> {
  const startTime = Date.now();
  const { dryRun = false, verbose = false } = options;
  const errors: string[] = [];
  console.log('[CityDiscovery] Starting city discovery...');
  console.log(`[CityDiscovery] Mode: ${dryRun ? 'DRY RUN' : 'LIVE'}`);

  // Prefer the JSON API; fall back to scraping the HTML page.
  let cityList = await fetchCitiesFromApi();
  if (cityList.length === 0) {
    cityList = await fetchCitiesFromPage();
  }
  if (cityList.length === 0) {
    console.log('[CityDiscovery] No cities found');
    return {
      citiesFound: 0,
      citiesUpserted: 0,
      citiesSkipped: 0,
      errors: ['No cities found from page or API'],
      durationMs: Date.now() - startTime,
    };
  }

  let upsertCount = 0;
  let skipCount = 0;
  for (const city of cityList) {
    try {
      if (dryRun) {
        // Dry run still counts as "upserted" so totals match a live run.
        if (verbose) {
          console.log(`[CityDiscovery][DryRun] Would upsert: ${city.name} (${city.stateCode}, ${city.countryCode})`);
        }
        upsertCount++;
        continue;
      }
      const result = await upsertCity(pool, city);
      upsertCount++;
      if (verbose) {
        const action = result.isNew ? 'Created' : 'Updated';
        console.log(`[CityDiscovery] ${action}: ${city.name} (${city.stateCode}, ${city.countryCode}) -> ID ${result.id}`);
      }
    } catch (error: any) {
      // A failed city is skipped; the error is recorded, not rethrown.
      errors.push(`City ${city.slug}: ${error.message}`);
      skipCount++;
    }
  }

  const durationMs = Date.now() - startTime;
  console.log(`[CityDiscovery] Complete: ${upsertCount} upserted, ${skipCount} skipped, ${errors.length} errors in ${durationMs}ms`);
  return {
    citiesFound: cityList.length,
    citiesUpserted: upsertCount,
    citiesSkipped: skipCount,
    errors,
    durationMs,
  };
}
// ============================================================
// MANUAL CITY SEEDING
// ============================================================
/**
 * Seed known cities manually.
 * Use this when the cities page doesn't expose all cities.
 */
export async function seedKnownCities(
  pool: Pool,
  cities: Array<{
    name: string;
    slug: string;
    stateCode: string;
    countryCode?: string;
  }>
): Promise<{ created: number; updated: number }> {
  const tally = { created: 0, updated: 0 };
  for (const entry of cities) {
    const { isNew } = await upsertCity(pool, {
      name: entry.name,
      slug: entry.slug,
      stateCode: entry.stateCode,
      countryCode: entry.countryCode || 'US',
    });
    if (isNew) {
      tally.created++;
    } else {
      tally.updated++;
    }
  }
  return tally;
}
/**
 * Pre-defined Arizona cities for seeding.
 *
 * Hand-curated list used by discoverState('AZ') to pre-populate
 * dutchie_discovery_cities when the cities page is incomplete.
 * Slugs follow Dutchie's lowercase-hyphenated URL convention.
 */
export const ARIZONA_CITIES = [
  { name: 'Phoenix', slug: 'phoenix', stateCode: 'AZ' },
  { name: 'Tucson', slug: 'tucson', stateCode: 'AZ' },
  { name: 'Mesa', slug: 'mesa', stateCode: 'AZ' },
  { name: 'Chandler', slug: 'chandler', stateCode: 'AZ' },
  { name: 'Scottsdale', slug: 'scottsdale', stateCode: 'AZ' },
  { name: 'Glendale', slug: 'glendale', stateCode: 'AZ' },
  { name: 'Gilbert', slug: 'gilbert', stateCode: 'AZ' },
  { name: 'Tempe', slug: 'tempe', stateCode: 'AZ' },
  { name: 'Peoria', slug: 'peoria', stateCode: 'AZ' },
  { name: 'Surprise', slug: 'surprise', stateCode: 'AZ' },
  { name: 'Yuma', slug: 'yuma', stateCode: 'AZ' },
  { name: 'Avondale', slug: 'avondale', stateCode: 'AZ' },
  { name: 'Flagstaff', slug: 'flagstaff', stateCode: 'AZ' },
  { name: 'Goodyear', slug: 'goodyear', stateCode: 'AZ' },
  { name: 'Lake Havasu City', slug: 'lake-havasu-city', stateCode: 'AZ' },
  { name: 'Buckeye', slug: 'buckeye', stateCode: 'AZ' },
  { name: 'Casa Grande', slug: 'casa-grande', stateCode: 'AZ' },
  { name: 'Sierra Vista', slug: 'sierra-vista', stateCode: 'AZ' },
  { name: 'Maricopa', slug: 'maricopa', stateCode: 'AZ' },
  { name: 'Oro Valley', slug: 'oro-valley', stateCode: 'AZ' },
  { name: 'Prescott', slug: 'prescott', stateCode: 'AZ' },
  { name: 'Bullhead City', slug: 'bullhead-city', stateCode: 'AZ' },
  { name: 'Prescott Valley', slug: 'prescott-valley', stateCode: 'AZ' },
  { name: 'Apache Junction', slug: 'apache-junction', stateCode: 'AZ' },
  { name: 'Marana', slug: 'marana', stateCode: 'AZ' },
  { name: 'El Mirage', slug: 'el-mirage', stateCode: 'AZ' },
  { name: 'Kingman', slug: 'kingman', stateCode: 'AZ' },
  { name: 'Queen Creek', slug: 'queen-creek', stateCode: 'AZ' },
  { name: 'San Luis', slug: 'san-luis', stateCode: 'AZ' },
  { name: 'Sahuarita', slug: 'sahuarita', stateCode: 'AZ' },
  { name: 'Fountain Hills', slug: 'fountain-hills', stateCode: 'AZ' },
  { name: 'Nogales', slug: 'nogales', stateCode: 'AZ' },
  { name: 'Douglas', slug: 'douglas', stateCode: 'AZ' },
  { name: 'Eloy', slug: 'eloy', stateCode: 'AZ' },
  { name: 'Somerton', slug: 'somerton', stateCode: 'AZ' },
  { name: 'Paradise Valley', slug: 'paradise-valley', stateCode: 'AZ' },
  { name: 'Coolidge', slug: 'coolidge', stateCode: 'AZ' },
  { name: 'Cottonwood', slug: 'cottonwood', stateCode: 'AZ' },
  { name: 'Camp Verde', slug: 'camp-verde', stateCode: 'AZ' },
  { name: 'Show Low', slug: 'show-low', stateCode: 'AZ' },
  { name: 'Payson', slug: 'payson', stateCode: 'AZ' },
  { name: 'Sedona', slug: 'sedona', stateCode: 'AZ' },
  { name: 'Winslow', slug: 'winslow', stateCode: 'AZ' },
  { name: 'Globe', slug: 'globe', stateCode: 'AZ' },
  { name: 'Safford', slug: 'safford', stateCode: 'AZ' },
  { name: 'Bisbee', slug: 'bisbee', stateCode: 'AZ' },
  { name: 'Wickenburg', slug: 'wickenburg', stateCode: 'AZ' },
  { name: 'Page', slug: 'page', stateCode: 'AZ' },
  { name: 'Holbrook', slug: 'holbrook', stateCode: 'AZ' },
  { name: 'Willcox', slug: 'willcox', stateCode: 'AZ' },
];

View File

@@ -0,0 +1,327 @@
/**
* Dutchie Discovery Crawler
*
* Main orchestrator for the Dutchie store discovery pipeline.
*
* Flow:
* 1. Discover cities from Dutchie (or use seeded cities)
* 2. For each city, discover store locations
* 3. Upsert all data to discovery tables
* 4. Admin verifies locations manually
* 5. Verified locations are promoted to canonical dispensaries
*
* This module does NOT create canonical dispensaries automatically.
*/
import { Pool } from 'pg';
import {
FullDiscoveryResult,
LocationDiscoveryResult,
DiscoveryCity,
} from './types';
import {
discoverCities,
getCitiesToCrawl,
getCityBySlug,
seedKnownCities,
ARIZONA_CITIES,
} from './city-discovery';
import {
discoverLocationsForCity,
} from './location-discovery';
// ============================================================
// FULL DISCOVERY
// ============================================================
export interface DiscoveryCrawlerOptions {
  dryRun?: boolean;            // log actions without writing to the DB
  verbose?: boolean;           // per-city progress logging
  stateCode?: string;          // restrict crawl to one state (e.g. 'AZ')
  countryCode?: string;        // restrict crawl to one country (default 'US')
  cityLimit?: number;          // max cities to crawl in this run
  skipCityDiscovery?: boolean; // reuse existing city rows; skip step 1
  onlyStale?: boolean;         // only crawl cities not crawled recently
  staleDays?: number;          // staleness threshold in days for onlyStale
}
/**
 * Run the full discovery pipeline:
 * 1. Discover/refresh cities
 * 2. For each city, discover locations
 *
 * Writes only to the discovery tables; canonical dispensaries are never
 * created here (promotion happens after manual verification).
 *
 * @returns Aggregate city-step result plus one location result per city.
 */
export async function runFullDiscovery(
  pool: Pool,
  options: DiscoveryCrawlerOptions = {}
): Promise<FullDiscoveryResult> {
  const startTime = Date.now();
  const {
    dryRun = false,
    verbose = false,
    stateCode,
    countryCode = 'US',
    cityLimit = 50,
    skipCityDiscovery = false,
    onlyStale = true,
    staleDays = 7,
  } = options;
  console.log('='.repeat(60));
  console.log('DUTCHIE DISCOVERY CRAWLER');
  console.log('='.repeat(60));
  console.log(`Mode: ${dryRun ? 'DRY RUN' : 'LIVE'}`);
  if (stateCode) console.log(`State: ${stateCode}`);
  console.log(`Country: ${countryCode}`);
  console.log(`City limit: ${cityLimit}`);
  console.log('');
  // Step 1: Discover/refresh cities (skippable when cities are pre-seeded)
  let cityResult = {
    citiesFound: 0,
    citiesUpserted: 0,
    citiesSkipped: 0,
    errors: [] as string[],
    durationMs: 0,
  };
  if (!skipCityDiscovery) {
    console.log('[Discovery] Step 1: Discovering cities...');
    cityResult = await discoverCities(pool, { dryRun, verbose });
  } else {
    console.log('[Discovery] Step 1: Skipping city discovery (using existing cities)');
  }
  // Step 2: Select which cities to crawl (optionally only stale ones)
  console.log('[Discovery] Step 2: Getting cities to crawl...');
  const cities = await getCitiesToCrawl(pool, {
    stateCode,
    countryCode,
    limit: cityLimit,
    onlyStale,
    staleDays,
  });
  console.log(`[Discovery] Found ${cities.length} cities to crawl`);
  // Step 3: Discover locations for each city, sequentially with a delay
  console.log('[Discovery] Step 3: Discovering locations...');
  const locationResults: LocationDiscoveryResult[] = [];
  let totalLocationsFound = 0;
  let totalLocationsUpserted = 0;
  for (let i = 0; i < cities.length; i++) {
    const city = cities[i];
    console.log(`\n[Discovery] City ${i + 1}/${cities.length}: ${city.cityName}, ${city.stateCode}`);
    try {
      const result = await discoverLocationsForCity(pool, city, { dryRun, verbose });
      locationResults.push(result);
      totalLocationsFound += result.locationsFound;
      totalLocationsUpserted += result.locationsUpserted;
      // Rate limiting between cities (2s pause, skipped after the last one)
      if (i < cities.length - 1) {
        await new Promise((r) => setTimeout(r, 2000));
      }
    } catch (error: any) {
      // A failed city is recorded as an empty result; the run continues.
      console.error(`[Discovery] Error crawling ${city.cityName}: ${error.message}`);
      locationResults.push({
        cityId: city.id,
        citySlug: city.citySlug,
        locationsFound: 0,
        locationsUpserted: 0,
        locationsNew: 0,
        locationsUpdated: 0,
        errors: [error.message],
        durationMs: 0,
      });
    }
  }
  const durationMs = Date.now() - startTime;
  // Summary
  console.log('\n' + '='.repeat(60));
  console.log('DISCOVERY COMPLETE');
  console.log('='.repeat(60));
  console.log(`Duration: ${(durationMs / 1000).toFixed(1)}s`);
  console.log('');
  console.log('Cities:');
  console.log(` Discovered: ${cityResult.citiesFound}`);
  console.log(` Upserted: ${cityResult.citiesUpserted}`);
  console.log(` Crawled: ${cities.length}`);
  console.log('');
  console.log('Locations:');
  console.log(` Found: ${totalLocationsFound}`);
  console.log(` Upserted: ${totalLocationsUpserted}`);
  console.log('');
  const totalErrors = cityResult.errors.length +
    locationResults.reduce((sum, r) => sum + r.errors.length, 0);
  if (totalErrors > 0) {
    console.log(`Errors: ${totalErrors}`);
  }
  return {
    cities: cityResult,
    locations: locationResults,
    totalLocationsFound,
    totalLocationsUpserted,
    durationMs,
  };
}
// ============================================================
// SINGLE CITY DISCOVERY
// ============================================================
/**
 * Discover locations for a single city by slug.
 * Seeds the city row first when it is missing and a stateCode was
 * provided; returns null when the city cannot be found or created.
 */
export async function discoverCity(
  pool: Pool,
  citySlug: string,
  options: {
    stateCode?: string;
    countryCode?: string;
    dryRun?: boolean;
    verbose?: boolean;
  } = {}
): Promise<LocationDiscoveryResult | null> {
  const { stateCode, countryCode = 'US', dryRun = false, verbose = false } = options;
  let city = await getCityBySlug(pool, citySlug, stateCode, countryCode);
  if (!city && stateCode) {
    // Unknown city, but there is enough info to seed it and re-fetch.
    console.log(`[Discovery] City ${citySlug} not found, creating...`);
    await seedKnownCities(pool, [{
      name: citySlug.replace(/-/g, ' ').replace(/\b\w/g, c => c.toUpperCase()),
      slug: citySlug,
      stateCode,
      countryCode,
    }]);
    city = await getCityBySlug(pool, citySlug, stateCode, countryCode);
  }
  if (!city) {
    console.log(`[Discovery] City ${citySlug} not found and could not be created`);
    return null;
  }
  return await discoverLocationsForCity(pool, city, { dryRun, verbose });
}
// ============================================================
// STATE-WIDE DISCOVERY
// ============================================================
/**
 * Seed and discover all cities for a state.
 */
export async function discoverState(
  pool: Pool,
  stateCode: string,
  options: {
    dryRun?: boolean;
    verbose?: boolean;
    cityLimit?: number;
  } = {}
): Promise<FullDiscoveryResult> {
  const { dryRun = false, verbose = false, cityLimit = 100 } = options;
  console.log(`[Discovery] Discovering state: ${stateCode}`);
  // Arizona has a hand-curated seed list; other states rely on
  // whatever cities already exist in the table.
  if (stateCode === 'AZ') {
    console.log('[Discovery] Seeding Arizona cities...');
    const seeded = await seedKnownCities(pool, ARIZONA_CITIES);
    console.log(`[Discovery] Seeded ${seeded.created} new cities, ${seeded.updated} updated`);
  }
  return await runFullDiscovery(pool, {
    dryRun,
    verbose,
    stateCode,
    countryCode: 'US',
    cityLimit,
    skipCityDiscovery: true, // use the seeded/existing cities
    onlyStale: false, // crawl every city, not just stale ones
  });
}
// ============================================================
// STATISTICS
// ============================================================
/** Aggregate counters describing the discovery tables' current state. */
export interface DiscoveryStats {
  cities: {
    total: number;          // all rows in dutchie_discovery_cities
    crawledLast24h: number; // cities crawled within the last 24 hours
    neverCrawled: number;   // cities with last_crawled_at IS NULL
  };
  locations: {
    total: number;      // active locations
    discovered: number; // active locations with status = 'discovered'
    verified: number;   // active locations with status = 'verified'
    rejected: number;   // active locations with status = 'rejected'
    merged: number;     // active locations with status = 'merged'
    byState: Array<{ stateCode: string; count: number }>; // active locations per state
  };
}
/**
 * Get discovery statistics.
 *
 * City counts and location counts are fetched in two sequential
 * Promise.all batches (queries within each batch run concurrently).
 */
export async function getDiscoveryStats(pool: Pool): Promise<DiscoveryStats> {
  const [citiesTotal, citiesRecent, citiesNever] = await Promise.all([
    pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_cities'),
    pool.query(`SELECT COUNT(*) as cnt FROM dutchie_discovery_cities WHERE last_crawled_at > NOW() - INTERVAL '24 hours'`),
    pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_cities WHERE last_crawled_at IS NULL'),
  ]);
  const [locsTotal, locsByStatus, locsByState] = await Promise.all([
    pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_locations WHERE active = TRUE'),
    pool.query(`
      SELECT status, COUNT(*) as cnt
      FROM dutchie_discovery_locations
      WHERE active = TRUE
      GROUP BY status
    `),
    pool.query(`
      SELECT state_code, COUNT(*) as cnt
      FROM dutchie_discovery_locations
      WHERE active = TRUE AND state_code IS NOT NULL
      GROUP BY state_code
      ORDER BY cnt DESC
    `),
  ]);
  // Fold the per-status rows into a {status: count} map. COUNT(*) comes
  // back from pg as a string, hence parseInt everywhere below.
  const statusCounts = locsByStatus.rows.reduce((acc, row) => {
    acc[row.status] = parseInt(row.cnt, 10);
    return acc;
  }, {} as Record<string, number>);
  return {
    cities: {
      total: parseInt(citiesTotal.rows[0].cnt, 10),
      crawledLast24h: parseInt(citiesRecent.rows[0].cnt, 10),
      neverCrawled: parseInt(citiesNever.rows[0].cnt, 10),
    },
    locations: {
      total: parseInt(locsTotal.rows[0].cnt, 10),
      discovered: statusCounts.discovered || 0,
      verified: statusCounts.verified || 0,
      rejected: statusCounts.rejected || 0,
      merged: statusCounts.merged || 0,
      byState: locsByState.rows.map(row => ({
        stateCode: row.state_code,
        count: parseInt(row.cnt, 10),
      })),
    },
  };
}

View File

@@ -0,0 +1,37 @@
/**
* Dutchie Discovery Module
*
* Exports all discovery-related functionality for use in the main application.
*/
// Types
export * from './types';
// City Discovery
export {
discoverCities,
getCitiesToCrawl,
getCityBySlug,
seedKnownCities,
ARIZONA_CITIES,
} from './city-discovery';
// Location Discovery
export {
discoverLocationsForCity,
fetchLocationsForCity,
upsertLocation,
} from './location-discovery';
// Discovery Crawler (Orchestrator)
export {
  runFullDiscovery,
  discoverCity,
  discoverState,
  getDiscoveryStats,
} from './discovery-crawler';
// Type-only re-exports: use `export type` so transpilers running with
// isolatedModules (Vite/esbuild/swc) don't look for runtime bindings.
export type {
  DiscoveryCrawlerOptions,
  DiscoveryStats,
} from './discovery-crawler';
// Routes
export { createDiscoveryRoutes } from './routes';

View File

@@ -0,0 +1,686 @@
/**
* Dutchie Location Discovery Service
*
* Discovers store locations from Dutchie city pages.
* Each city can contain multiple dispensary locations.
*
* This module:
* 1. Fetches location listings for a given city
* 2. Upserts locations into dutchie_discovery_locations
* 3. Does NOT create any canonical dispensary records
*
* Locations remain in "discovered" status until manually verified.
*/
import { Pool } from 'pg';
import axios from 'axios';
import puppeteer from 'puppeteer-extra';
import type { Browser, Page, Protocol } from 'puppeteer';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import {
DiscoveryLocation,
DiscoveryLocationRow,
DutchieLocationResponse,
LocationDiscoveryResult,
DiscoveryStatus,
mapLocationRowToLocation,
} from './types';
import { DiscoveryCity } from './types';
puppeteer.use(StealthPlugin());
const PLATFORM = 'dutchie';
// ============================================================
// GRAPHQL / API FETCHING
// ============================================================
// Credentials captured from a live browser page, reused for direct API calls.
interface SessionCredentials {
  cookies: string;    // serialized "name=value; ..." cookie header from the page
  userAgent: string;  // UA the page was configured with (sent on API requests)
  browser: Browser;   // owning Puppeteer browser; released via closeSession()
  page: Page;         // page that established the session
}
/**
 * Create a browser session for fetching location data.
 *
 * Launches a stealth-configured headless browser and loads the city's
 * dispensaries listing page so Dutchie issues the session cookies we can
 * reuse for direct API calls.
 *
 * @param citySlug  Dutchie city slug (e.g. "phoenix").
 * @param stateSlug Lowercase state path segment for the listing URL.
 *                  Previously hard-coded to "az", which produced wrong
 *                  URLs for non-Arizona cities; kept as the default for
 *                  backward compatibility with existing callers.
 */
async function createSession(citySlug: string, stateSlug: string = 'az'): Promise<SessionCredentials> {
  const browser = await puppeteer.launch({
    headless: 'new',
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-blink-features=AutomationControlled',
    ],
  });
  const page = await browser.newPage();
  const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
  await page.setUserAgent(userAgent);
  await page.setViewport({ width: 1920, height: 1080 });
  // Mask the most common headless-detection signals before any page script runs.
  await page.evaluateOnNewDocument(() => {
    Object.defineProperty(navigator, 'webdriver', { get: () => false });
    (window as any).chrome = { runtime: {} };
  });
  // Navigate to a dispensaries page to get cookies
  const url = `https://dutchie.com/dispensaries/${stateSlug}/${citySlug}`;
  console.log(`[LocationDiscovery] Loading ${url} to establish session...`);
  try {
    await page.goto(url, {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });
    // Give client-side scripts a moment to finish setting cookies.
    await new Promise((r) => setTimeout(r, 2000));
  } catch (error: any) {
    // Navigation timeouts are non-fatal: cookies may still have been set.
    console.warn(`[LocationDiscovery] Navigation warning: ${error.message}`);
  }
  const cookies = await page.cookies();
  const cookieString = cookies.map((c: Protocol.Network.Cookie) => `${c.name}=${c.value}`).join('; ');
  return { cookies: cookieString, userAgent, browser, page };
}
/** Tear down a session by closing its owning browser instance. */
async function closeSession(session: SessionCredentials): Promise<void> {
  const { browser } = session;
  await browser.close();
}
/**
* Fetch locations for a city using Dutchie's internal search API.
*/
export async function fetchLocationsForCity(
city: DiscoveryCity,
options: {
session?: SessionCredentials;
verbose?: boolean;
} = {}
): Promise<DutchieLocationResponse[]> {
const { verbose = false } = options;
let session = options.session;
let shouldCloseSession = false;
if (!session) {
session = await createSession(city.citySlug);
shouldCloseSession = true;
}
try {
console.log(`[LocationDiscovery] Fetching locations for ${city.cityName}, ${city.stateCode}...`);
// Try multiple approaches to get location data
// Approach 1: Extract from page __NEXT_DATA__ or similar
const locations = await extractLocationsFromPage(session.page, verbose);
if (locations.length > 0) {
console.log(`[LocationDiscovery] Found ${locations.length} locations from page data`);
return locations;
}
// Approach 2: Try the geo-based GraphQL query
const geoLocations = await fetchLocationsViaGraphQL(session, city, verbose);
if (geoLocations.length > 0) {
console.log(`[LocationDiscovery] Found ${geoLocations.length} locations from GraphQL`);
return geoLocations;
}
// Approach 3: Scrape visible location cards
const scrapedLocations = await scrapeLocationCards(session.page, verbose);
if (scrapedLocations.length > 0) {
console.log(`[LocationDiscovery] Found ${scrapedLocations.length} locations from scraping`);
return scrapedLocations;
}
console.log(`[LocationDiscovery] No locations found for ${city.cityName}`);
return [];
} finally {
if (shouldCloseSession) {
await closeSession(session);
}
}
}
/**
 * Extract locations from the page's embedded data (__NEXT_DATA__ or
 * window.__APOLLO_STATE__). Returns [] when neither source has data or
 * the evaluation fails.
 */
async function extractLocationsFromPage(
  page: Page,
  verbose: boolean
): Promise<DutchieLocationResponse[]> {
  try {
    const extracted = await page.evaluate(() => {
      // Prefer Next.js server-rendered props.
      const nextDataEl = document.querySelector('#__NEXT_DATA__');
      if (nextDataEl?.textContent) {
        try {
          const nextData = JSON.parse(nextDataEl.textContent);
          const pageProps = nextData?.props?.pageProps;
          // Dispensaries have been observed under several prop paths.
          const dispensaries =
            pageProps?.dispensaries ||
            pageProps?.initialDispensaries ||
            pageProps?.data?.dispensaries ||
            [];
          if (Array.isArray(dispensaries) && dispensaries.length > 0) {
            return { source: '__NEXT_DATA__', dispensaries };
          }
        } catch {
          // Ignore parse errors
        }
      }
      // Fall back to the Apollo client cache, if present.
      const win = window as any;
      if (win.__APOLLO_STATE__) {
        const entries = Object.entries(win.__APOLLO_STATE__).filter(
          ([key]) => key.startsWith('Dispensary:')
        );
        if (entries.length > 0) {
          return { source: 'APOLLO_STATE', dispensaries: entries.map(([, v]) => v) };
        }
      }
      return { source: 'none', dispensaries: [] };
    });
    if (verbose) {
      console.log(`[LocationDiscovery] Page data source: ${extracted.source}, count: ${extracted.dispensaries.length}`);
    }
    return extracted.dispensaries.map((d: any) => normalizeLocationResponse(d));
  } catch (error: any) {
    if (verbose) {
      console.log(`[LocationDiscovery] Could not extract from page data: ${error.message}`);
    }
    return [];
  }
}
/**
 * Fetch locations via Dutchie's geo-based persisted GraphQL query.
 *
 * City-center coordinates come from a small hard-coded table; unknown
 * slugs fall back to Phoenix. Any non-200 response or transport error
 * yields an empty list rather than throwing.
 */
async function fetchLocationsViaGraphQL(
  session: SessionCredentials,
  city: DiscoveryCity,
  verbose: boolean
): Promise<DutchieLocationResponse[]> {
  // Use a known center point for the city or default to a central US location
  const CITY_COORDS: Record<string, { lat: number; lng: number }> = {
    'phoenix': { lat: 33.4484, lng: -112.074 },
    'tucson': { lat: 32.2226, lng: -110.9747 },
    'scottsdale': { lat: 33.4942, lng: -111.9261 },
    'mesa': { lat: 33.4152, lng: -111.8315 },
    'tempe': { lat: 33.4255, lng: -111.94 },
    'flagstaff': { lat: 35.1983, lng: -111.6513 },
    // Add more as needed
  };
  const { lat, lng } = CITY_COORDS[city.citySlug] ?? { lat: 33.4484, lng: -112.074 };
  const requestBody = {
    operationName: 'ConsumerDispensaries',
    variables: {
      dispensariesFilter: {
        latitude: lat,
        longitude: lng,
        distance: 50, // miles
        state: city.stateCode,
        city: city.cityName,
      },
    },
    extensions: {
      persistedQuery: {
        version: 1,
        sha256Hash: '0a5bfa6ca1d64ae47bcccb7c8077c87147cbc4e6982c17ceec97a2a4948b311b',
      },
    },
  };
  try {
    const response = await axios.post('https://dutchie.com/api-3/graphql', requestBody, {
      headers: {
        'content-type': 'application/json',
        'origin': 'https://dutchie.com',
        'referer': `https://dutchie.com/dispensaries/${city.stateCode?.toLowerCase()}/${city.citySlug}`,
        'user-agent': session.userAgent,
        'cookie': session.cookies,
      },
      timeout: 30000,
      validateStatus: () => true, // handle non-200 ourselves
    });
    if (response.status !== 200) {
      if (verbose) {
        console.log(`[LocationDiscovery] GraphQL returned ${response.status}`);
      }
      return [];
    }
    const dispensaries = response.data?.data?.consumerDispensaries || [];
    return dispensaries.map((d: any) => normalizeLocationResponse(d));
  } catch (error: any) {
    if (verbose) {
      console.log(`[LocationDiscovery] GraphQL error: ${error.message}`);
    }
    return [];
  }
}
/**
 * Scrape dispensary cards from the rendered page DOM.
 * Last-resort strategy when no structured data is available.
 */
async function scrapeLocationCards(
  page: Page,
  verbose: boolean
): Promise<DutchieLocationResponse[]> {
  try {
    const rawCards = await page.evaluate(() => {
      // Card selectors seen across Dutchie page variants; first hit wins.
      const selectors = [
        '[data-testid="dispensary-card"]',
        '.dispensary-card',
        'a[href*="/dispensary/"]',
        '[class*="DispensaryCard"]',
      ];
      const found: any[] = [];
      for (const selector of selectors) {
        const elements = document.querySelectorAll(selector);
        if (elements.length === 0) {
          continue;
        }
        elements.forEach((el) => {
          const link = el.querySelector('a')?.href || (el as HTMLAnchorElement).href || '';
          const name = el.querySelector('h2, h3, [class*="name"]')?.textContent?.trim() || '';
          const address = el.querySelector('[class*="address"], address')?.textContent?.trim() || '';
          // The slug is the path segment following /dispensary/.
          const slugMatch = link.match(/\/dispensary\/([^/?]+)/);
          const slug = slugMatch ? slugMatch[1] : '';
          if (slug && name) {
            found.push({ slug, name, address, menuUrl: link });
          }
        });
        break; // Stop after first successful selector
      }
      return found;
    });
    return rawCards.map((card: any) => ({
      id: '',
      name: card.name,
      slug: card.slug,
      address: card.address,
      menuUrl: card.menuUrl,
    }));
  } catch (error: any) {
    if (verbose) {
      console.log(`[LocationDiscovery] Scraping error: ${error.message}`);
    }
    return [];
  }
}
/**
 * Normalize a raw location response to a consistent format.
 *
 * Raw payloads arrive from three sources (page data, GraphQL, DOM
 * scraping) with varying field names. All raw fields are preserved by
 * spreading FIRST; the canonical fields are then written on top so that
 * a present-but-empty raw value (e.g. slug: '') can never clobber the
 * normalized value. (Previously `...raw` was spread last, which undid
 * the normalization whenever raw carried the key.) Numeric coordinates
 * use `??` so a legitimate 0 is not treated as missing.
 */
function normalizeLocationResponse(raw: any): DutchieLocationResponse {
  const slug = raw.slug || raw.cName || raw.urlSlug || '';
  const id = raw.id || raw._id || raw.dispensaryId || '';
  return {
    // Preserve raw data first; normalized fields below take precedence.
    ...raw,
    id,
    name: raw.name || raw.dispensaryName || '',
    slug,
    address: raw.address || raw.fullAddress || '',
    address1: raw.address1 || raw.addressLine1 || raw.streetAddress || '',
    address2: raw.address2 || raw.addressLine2 || '',
    city: raw.city || '',
    state: raw.state || raw.stateCode || '',
    zip: raw.zip || raw.zipCode || raw.postalCode || '',
    country: raw.country || raw.countryCode || 'US',
    latitude: raw.latitude ?? raw.lat ?? raw.location?.latitude,
    longitude: raw.longitude ?? raw.lng ?? raw.location?.longitude,
    timezone: raw.timezone || raw.tz || '',
    menuUrl: raw.menuUrl || (slug ? `https://dutchie.com/dispensary/${slug}` : ''),
    retailType: raw.retailType || raw.type || '',
    offerPickup: raw.offerPickup ?? raw.storeSettings?.offerPickup ?? true,
    offerDelivery: raw.offerDelivery ?? raw.storeSettings?.offerDelivery ?? false,
    isRecreational: raw.isRecreational ?? raw.retailType?.includes('Recreational') ?? true,
    isMedical: raw.isMedical ?? raw.retailType?.includes('Medical') ?? true,
  };
}
// ============================================================
// DATABASE OPERATIONS
// ============================================================
/**
 * Upsert a location into dutchie_discovery_locations.
 *
 * Rows are keyed on (platform, platform_location_id); on conflict the
 * row is refreshed, using COALESCE so a sparser re-crawl never wipes out
 * data already captured. `(xmax = 0)` distinguishes insert from update.
 *
 * Fix: latitude/longitude now use `?? null` instead of `|| null` so a
 * legitimate zero coordinate is stored rather than nulled out.
 *
 * @param location raw platform response (also stored verbatim in metadata)
 * @param cityId   owning dutchie_discovery_cities id, or null
 * @returns the row id and whether the row was newly inserted
 */
export async function upsertLocation(
  pool: Pool,
  location: DutchieLocationResponse,
  cityId: number | null
): Promise<{ id: number; isNew: boolean }> {
  // Fall back to the slug when the platform did not supply an id.
  const platformLocationId = location.id || location.slug;
  const menuUrl = location.menuUrl || `https://dutchie.com/dispensary/${location.slug}`;
  const result = await pool.query(
    `INSERT INTO dutchie_discovery_locations (
      platform,
      platform_location_id,
      platform_slug,
      platform_menu_url,
      name,
      raw_address,
      address_line1,
      address_line2,
      city,
      state_code,
      postal_code,
      country_code,
      latitude,
      longitude,
      timezone,
      discovery_city_id,
      metadata,
      offers_delivery,
      offers_pickup,
      is_recreational,
      is_medical,
      last_seen_at,
      updated_at
    ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, NOW(), NOW())
    ON CONFLICT (platform, platform_location_id)
    DO UPDATE SET
      name = EXCLUDED.name,
      platform_menu_url = EXCLUDED.platform_menu_url,
      raw_address = COALESCE(EXCLUDED.raw_address, dutchie_discovery_locations.raw_address),
      address_line1 = COALESCE(EXCLUDED.address_line1, dutchie_discovery_locations.address_line1),
      city = COALESCE(EXCLUDED.city, dutchie_discovery_locations.city),
      state_code = COALESCE(EXCLUDED.state_code, dutchie_discovery_locations.state_code),
      postal_code = COALESCE(EXCLUDED.postal_code, dutchie_discovery_locations.postal_code),
      latitude = COALESCE(EXCLUDED.latitude, dutchie_discovery_locations.latitude),
      longitude = COALESCE(EXCLUDED.longitude, dutchie_discovery_locations.longitude),
      timezone = COALESCE(EXCLUDED.timezone, dutchie_discovery_locations.timezone),
      metadata = EXCLUDED.metadata,
      offers_delivery = COALESCE(EXCLUDED.offers_delivery, dutchie_discovery_locations.offers_delivery),
      offers_pickup = COALESCE(EXCLUDED.offers_pickup, dutchie_discovery_locations.offers_pickup),
      is_recreational = COALESCE(EXCLUDED.is_recreational, dutchie_discovery_locations.is_recreational),
      is_medical = COALESCE(EXCLUDED.is_medical, dutchie_discovery_locations.is_medical),
      last_seen_at = NOW(),
      updated_at = NOW()
    RETURNING id, (xmax = 0) as is_new`,
    [
      PLATFORM,
      platformLocationId,
      location.slug,
      menuUrl,
      location.name,
      location.address || null,
      location.address1 || null,
      location.address2 || null,
      location.city || null,
      location.state || null,
      location.zip || null,
      location.country || 'US',
      // ?? (not ||) so zero coordinates survive.
      location.latitude ?? null,
      location.longitude ?? null,
      location.timezone || null,
      cityId,
      JSON.stringify(location),
      location.offerDelivery ?? null,
      location.offerPickup ?? null,
      location.isRecreational ?? null,
      location.isMedical ?? null,
    ]
  );
  return {
    id: result.rows[0].id,
    isNew: result.rows[0].is_new,
  };
}
/**
 * Get locations by status.
 *
 * Optional state/country filters narrow the result; rows are returned
 * newest-first (by first_seen_at) with limit/offset paging.
 */
export async function getLocationsByStatus(
  pool: Pool,
  status: DiscoveryStatus,
  options: {
    stateCode?: string;
    countryCode?: string;
    limit?: number;
    offset?: number;
  } = {}
): Promise<DiscoveryLocation[]> {
  const limit = options.limit ?? 100;
  const offset = options.offset ?? 0;
  const params: any[] = [status];
  let query = `
    SELECT * FROM dutchie_discovery_locations
    WHERE status = $1 AND active = TRUE
  `;
  if (options.stateCode) {
    params.push(options.stateCode);
    query += ` AND state_code = $${params.length}`;
  }
  if (options.countryCode) {
    params.push(options.countryCode);
    query += ` AND country_code = $${params.length}`;
  }
  query += ` ORDER BY first_seen_at DESC LIMIT $${params.length + 1} OFFSET $${params.length + 2}`;
  params.push(limit, offset);
  const result = await pool.query<DiscoveryLocationRow>(query, params);
  return result.rows.map(mapLocationRowToLocation);
}
/**
 * Get a location by ID.
 *
 * @returns the mapped location, or null when no row matches.
 */
export async function getLocationById(
  pool: Pool,
  id: number
): Promise<DiscoveryLocation | null> {
  const result = await pool.query<DiscoveryLocationRow>(
    `SELECT * FROM dutchie_discovery_locations WHERE id = $1`,
    [id]
  );
  const [row] = result.rows;
  return row ? mapLocationRowToLocation(row) : null;
}
/**
 * Update location status.
 *
 * Optionally links a dispensary, records who verified, and stores notes
 * (each only when provided, via COALESCE). verified_at is stamped only
 * when the new status is 'verified' or 'merged'.
 */
export async function updateLocationStatus(
  pool: Pool,
  locationId: number,
  status: DiscoveryStatus,
  options: {
    dispensaryId?: number;
    verifiedBy?: string;
    notes?: string;
  } = {}
): Promise<void> {
  const params = [
    locationId,
    status,
    options.dispensaryId || null,
    options.verifiedBy || null,
    options.notes || null,
  ];
  await pool.query(
    `UPDATE dutchie_discovery_locations
     SET status = $2,
         dispensary_id = COALESCE($3, dispensary_id),
         verified_at = CASE WHEN $2 IN ('verified', 'merged') THEN NOW() ELSE verified_at END,
         verified_by = COALESCE($4, verified_by),
         notes = COALESCE($5, notes),
         updated_at = NOW()
     WHERE id = $1`,
    params
  );
}
/**
 * Search locations by name or address.
 *
 * Case-insensitive substring match against name, city, raw address and
 * platform slug, with optional status/state filters; results ordered by
 * name.
 */
export async function searchLocations(
  pool: Pool,
  query: string,
  options: {
    status?: DiscoveryStatus;
    stateCode?: string;
    limit?: number;
  } = {}
): Promise<DiscoveryLocation[]> {
  const limit = options.limit ?? 50;
  const params: any[] = [`%${query}%`];
  let sql = `
    SELECT * FROM dutchie_discovery_locations
    WHERE active = TRUE
      AND (name ILIKE $1 OR city ILIKE $1 OR raw_address ILIKE $1 OR platform_slug ILIKE $1)
  `;
  if (options.status) {
    params.push(options.status);
    sql += ` AND status = $${params.length}`;
  }
  if (options.stateCode) {
    params.push(options.stateCode);
    sql += ` AND state_code = $${params.length}`;
  }
  sql += ` ORDER BY name LIMIT $${params.length + 1}`;
  params.push(limit);
  const result = await pool.query<DiscoveryLocationRow>(sql, params);
  return result.rows.map(mapLocationRowToLocation);
}
// ============================================================
// MAIN DISCOVERY FUNCTION
// ============================================================
/**
 * Discover locations for a specific city.
 *
 * Fetches the city's locations, upserts each into
 * dutchie_discovery_locations (skipped in dryRun mode, where every hit is
 * counted as "new"), then stamps the crawl on the city row. Per-location
 * failures are collected into the result rather than thrown.
 */
export async function discoverLocationsForCity(
  pool: Pool,
  city: DiscoveryCity,
  options: {
    dryRun?: boolean;
    verbose?: boolean;
  } = {}
): Promise<LocationDiscoveryResult> {
  const startedAt = Date.now();
  const dryRun = options.dryRun ?? false;
  const verbose = options.verbose ?? false;
  const errors: string[] = [];
  console.log(`[LocationDiscovery] Discovering locations for ${city.cityName}, ${city.stateCode}...`);
  console.log(`[LocationDiscovery] Mode: ${dryRun ? 'DRY RUN' : 'LIVE'}`);
  const locations = await fetchLocationsForCity(city, { verbose });
  if (locations.length === 0) {
    console.log(`[LocationDiscovery] No locations found for ${city.cityName}`);
    return {
      cityId: city.id,
      citySlug: city.citySlug,
      locationsFound: 0,
      locationsUpserted: 0,
      locationsNew: 0,
      locationsUpdated: 0,
      errors: [],
      durationMs: Date.now() - startedAt,
    };
  }
  let created = 0;
  let refreshed = 0;
  for (const location of locations) {
    try {
      if (dryRun) {
        if (verbose) {
          console.log(`[LocationDiscovery][DryRun] Would upsert: ${location.name} (${location.slug})`);
        }
        created++;
        continue;
      }
      const upserted = await upsertLocation(pool, location, city.id);
      if (upserted.isNew) {
        created++;
      } else {
        refreshed++;
      }
      if (verbose) {
        console.log(`[LocationDiscovery] ${upserted.isNew ? 'Created' : 'Updated'}: ${location.name} -> ID ${upserted.id}`);
      }
    } catch (error: any) {
      errors.push(`Location ${location.slug}: ${error.message}`);
    }
  }
  // Record the crawl (and the current location count) on the city row.
  if (!dryRun) {
    await pool.query(
      `UPDATE dutchie_discovery_cities
       SET last_crawled_at = NOW(),
           location_count = $2,
           updated_at = NOW()
       WHERE id = $1`,
      [city.id, locations.length]
    );
  }
  const durationMs = Date.now() - startedAt;
  console.log(`[LocationDiscovery] Complete for ${city.cityName}: ${created} new, ${refreshed} updated, ${errors.length} errors in ${durationMs}ms`);
  return {
    cityId: city.id,
    citySlug: city.citySlug,
    locationsFound: locations.length,
    locationsUpserted: created + refreshed,
    locationsNew: created,
    locationsUpdated: refreshed,
    errors,
    durationMs,
  };
}

View File

@@ -0,0 +1,840 @@
/**
* Dutchie Discovery API Routes
*
* Express routes for the Dutchie store discovery pipeline.
* Provides endpoints for discovering, listing, and verifying locations.
*/
import { Router, Request, Response } from 'express';
import { Pool } from 'pg';
import {
runFullDiscovery,
discoverCity,
discoverState,
getDiscoveryStats,
} from './discovery-crawler';
import {
discoverCities,
getCitiesToCrawl,
getCityBySlug,
seedKnownCities,
ARIZONA_CITIES,
} from './city-discovery';
import {
DiscoveryLocation,
DiscoveryCity,
DiscoveryStatus,
mapLocationRowToLocation,
mapCityRowToCity,
} from './types';
export function createDiscoveryRoutes(pool: Pool): Router {
const router = Router();
// ============================================================
// DISCOVERY LOCATIONS
// ============================================================
/**
* GET /api/discovery/locations
* List discovered locations with filtering
*/
router.get('/locations', async (req: Request, res: Response) => {
try {
const {
status,
stateCode,
countryCode,
city,
platform = 'dutchie',
search,
hasDispensary,
limit = '50',
offset = '0',
} = req.query;
let whereClause = 'WHERE platform = $1 AND active = TRUE';
const params: any[] = [platform];
let paramIndex = 2;
if (status) {
whereClause += ` AND status = $${paramIndex}`;
params.push(status);
paramIndex++;
}
if (stateCode) {
whereClause += ` AND state_code = $${paramIndex}`;
params.push(stateCode);
paramIndex++;
}
if (countryCode) {
whereClause += ` AND country_code = $${paramIndex}`;
params.push(countryCode);
paramIndex++;
}
if (city) {
whereClause += ` AND city ILIKE $${paramIndex}`;
params.push(`%${city}%`);
paramIndex++;
}
if (search) {
whereClause += ` AND (name ILIKE $${paramIndex} OR platform_slug ILIKE $${paramIndex})`;
params.push(`%${search}%`);
paramIndex++;
}
if (hasDispensary === 'true') {
whereClause += ' AND dispensary_id IS NOT NULL';
} else if (hasDispensary === 'false') {
whereClause += ' AND dispensary_id IS NULL';
}
params.push(parseInt(limit as string, 10), parseInt(offset as string, 10));
const { rows } = await pool.query(
`
SELECT
dl.*,
d.name as dispensary_name,
dc.city_name as discovery_city_name
FROM dutchie_discovery_locations dl
LEFT JOIN dispensaries d ON dl.dispensary_id = d.id
LEFT JOIN dutchie_discovery_cities dc ON dl.discovery_city_id = dc.id
${whereClause}
ORDER BY dl.first_seen_at DESC
LIMIT $${paramIndex} OFFSET $${paramIndex + 1}
`,
params
);
const { rows: countRows } = await pool.query(
`SELECT COUNT(*) as total FROM dutchie_discovery_locations dl ${whereClause}`,
params.slice(0, -2)
);
const locations = rows.map((row: any) => ({
...mapLocationRowToLocation(row),
dispensaryName: row.dispensary_name,
discoveryCityName: row.discovery_city_name,
}));
res.json({
locations,
total: parseInt(countRows[0]?.total || '0', 10),
limit: parseInt(limit as string, 10),
offset: parseInt(offset as string, 10),
});
} catch (error: any) {
res.status(500).json({ error: error.message });
}
});
/**
* GET /api/discovery/locations/:id
* Get a single discovery location
*/
router.get('/locations/:id', async (req: Request, res: Response) => {
try {
const { id } = req.params;
const { rows } = await pool.query(
`
SELECT
dl.*,
d.name as dispensary_name,
d.menu_url as dispensary_menu_url,
dc.city_name as discovery_city_name
FROM dutchie_discovery_locations dl
LEFT JOIN dispensaries d ON dl.dispensary_id = d.id
LEFT JOIN dutchie_discovery_cities dc ON dl.discovery_city_id = dc.id
WHERE dl.id = $1
`,
[parseInt(id, 10)]
);
if (rows.length === 0) {
return res.status(404).json({ error: 'Location not found' });
}
res.json({
...mapLocationRowToLocation(rows[0]),
dispensaryName: rows[0].dispensary_name,
dispensaryMenuUrl: rows[0].dispensary_menu_url,
discoveryCityName: rows[0].discovery_city_name,
});
} catch (error: any) {
res.status(500).json({ error: error.message });
}
});
/**
* GET /api/discovery/locations/pending
* Get locations awaiting verification
*/
router.get('/locations/pending', async (req: Request, res: Response) => {
try {
const { stateCode, countryCode, limit = '100' } = req.query;
let whereClause = `WHERE status = 'discovered' AND active = TRUE`;
const params: any[] = [];
let paramIndex = 1;
if (stateCode) {
whereClause += ` AND state_code = $${paramIndex}`;
params.push(stateCode);
paramIndex++;
}
if (countryCode) {
whereClause += ` AND country_code = $${paramIndex}`;
params.push(countryCode);
paramIndex++;
}
params.push(parseInt(limit as string, 10));
const { rows } = await pool.query(
`
SELECT * FROM dutchie_discovery_locations
${whereClause}
ORDER BY state_code, city, name
LIMIT $${paramIndex}
`,
params
);
res.json({
locations: rows.map(mapLocationRowToLocation),
total: rows.length,
});
} catch (error: any) {
res.status(500).json({ error: error.message });
}
});
// ============================================================
// DISCOVERY CITIES
// ============================================================
/**
* GET /api/discovery/cities
* List discovery cities
*/
router.get('/cities', async (req: Request, res: Response) => {
try {
const {
stateCode,
countryCode,
crawlEnabled,
platform = 'dutchie',
limit = '100',
offset = '0',
} = req.query;
let whereClause = 'WHERE platform = $1';
const params: any[] = [platform];
let paramIndex = 2;
if (stateCode) {
whereClause += ` AND state_code = $${paramIndex}`;
params.push(stateCode);
paramIndex++;
}
if (countryCode) {
whereClause += ` AND country_code = $${paramIndex}`;
params.push(countryCode);
paramIndex++;
}
if (crawlEnabled === 'true') {
whereClause += ' AND crawl_enabled = TRUE';
} else if (crawlEnabled === 'false') {
whereClause += ' AND crawl_enabled = FALSE';
}
params.push(parseInt(limit as string, 10), parseInt(offset as string, 10));
const { rows } = await pool.query(
`
SELECT
dc.*,
(SELECT COUNT(*) FROM dutchie_discovery_locations dl WHERE dl.discovery_city_id = dc.id) as actual_location_count
FROM dutchie_discovery_cities dc
${whereClause}
ORDER BY dc.country_code, dc.state_code, dc.city_name
LIMIT $${paramIndex} OFFSET $${paramIndex + 1}
`,
params
);
const { rows: countRows } = await pool.query(
`SELECT COUNT(*) as total FROM dutchie_discovery_cities dc ${whereClause}`,
params.slice(0, -2)
);
const cities = rows.map((row: any) => ({
...mapCityRowToCity(row),
actualLocationCount: parseInt(row.actual_location_count || '0', 10),
}));
res.json({
cities,
total: parseInt(countRows[0]?.total || '0', 10),
limit: parseInt(limit as string, 10),
offset: parseInt(offset as string, 10),
});
} catch (error: any) {
res.status(500).json({ error: error.message });
}
});
// ============================================================
// STATISTICS
// ============================================================
/**
* GET /api/discovery/stats
* Get discovery statistics
*/
router.get('/stats', async (_req: Request, res: Response) => {
try {
const stats = await getDiscoveryStats(pool);
res.json(stats);
} catch (error: any) {
res.status(500).json({ error: error.message });
}
});
// ============================================================
// VERIFICATION ACTIONS
// ============================================================
/**
* POST /api/discovery/locations/:id/verify
* Verify a discovered location and create a new canonical dispensary
*/
router.post('/locations/:id/verify', async (req: Request, res: Response) => {
try {
const { id } = req.params;
const { verifiedBy = 'admin' } = req.body;
// Get the discovery location
const { rows: locRows } = await pool.query(
`SELECT * FROM dutchie_discovery_locations WHERE id = $1`,
[parseInt(id, 10)]
);
if (locRows.length === 0) {
return res.status(404).json({ error: 'Location not found' });
}
const location = locRows[0];
if (location.status !== 'discovered') {
return res.status(400).json({
error: `Location already has status: ${location.status}`,
});
}
// Create the canonical dispensary
const { rows: dispRows } = await pool.query(
`
INSERT INTO dispensaries (
name,
slug,
address,
city,
state,
zip,
latitude,
longitude,
timezone,
menu_type,
menu_url,
platform_dispensary_id,
active,
created_at,
updated_at
) VALUES (
$1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, TRUE, NOW(), NOW()
)
RETURNING id
`,
[
location.name,
location.platform_slug,
location.address_line1,
location.city,
location.state_code,
location.postal_code,
location.latitude,
location.longitude,
location.timezone,
location.platform,
location.platform_menu_url,
location.platform_location_id,
]
);
const dispensaryId = dispRows[0].id;
// Update the discovery location
await pool.query(
`
UPDATE dutchie_discovery_locations
SET status = 'verified',
dispensary_id = $1,
verified_at = NOW(),
verified_by = $2,
updated_at = NOW()
WHERE id = $3
`,
[dispensaryId, verifiedBy, id]
);
res.json({
success: true,
action: 'created',
discoveryId: parseInt(id, 10),
dispensaryId,
message: `Created new dispensary (ID: ${dispensaryId})`,
});
} catch (error: any) {
res.status(500).json({ error: error.message });
}
});
/**
* POST /api/discovery/locations/:id/link
* Link a discovered location to an existing dispensary
*/
router.post('/locations/:id/link', async (req: Request, res: Response) => {
try {
const { id } = req.params;
const { dispensaryId, verifiedBy = 'admin' } = req.body;
if (!dispensaryId) {
return res.status(400).json({ error: 'dispensaryId is required' });
}
// Verify dispensary exists
const { rows: dispRows } = await pool.query(
`SELECT id, name FROM dispensaries WHERE id = $1`,
[dispensaryId]
);
if (dispRows.length === 0) {
return res.status(404).json({ error: 'Dispensary not found' });
}
// Get the discovery location
const { rows: locRows } = await pool.query(
`SELECT * FROM dutchie_discovery_locations WHERE id = $1`,
[parseInt(id, 10)]
);
if (locRows.length === 0) {
return res.status(404).json({ error: 'Location not found' });
}
const location = locRows[0];
if (location.status !== 'discovered') {
return res.status(400).json({
error: `Location already has status: ${location.status}`,
});
}
// Update dispensary with platform info if missing
await pool.query(
`
UPDATE dispensaries
SET platform_dispensary_id = COALESCE(platform_dispensary_id, $1),
menu_url = COALESCE(menu_url, $2),
menu_type = COALESCE(menu_type, $3),
updated_at = NOW()
WHERE id = $4
`,
[
location.platform_location_id,
location.platform_menu_url,
location.platform,
dispensaryId,
]
);
// Update the discovery location
await pool.query(
`
UPDATE dutchie_discovery_locations
SET status = 'merged',
dispensary_id = $1,
verified_at = NOW(),
verified_by = $2,
updated_at = NOW()
WHERE id = $3
`,
[dispensaryId, verifiedBy, id]
);
res.json({
success: true,
action: 'linked',
discoveryId: parseInt(id, 10),
dispensaryId,
dispensaryName: dispRows[0].name,
message: `Linked to existing dispensary: ${dispRows[0].name}`,
});
} catch (error: any) {
res.status(500).json({ error: error.message });
}
});
/**
* POST /api/discovery/locations/:id/reject
* Reject a discovered location
*/
router.post('/locations/:id/reject', async (req: Request, res: Response) => {
try {
const { id } = req.params;
const { reason, verifiedBy = 'admin' } = req.body;
const { rows } = await pool.query(
`SELECT status FROM dutchie_discovery_locations WHERE id = $1`,
[parseInt(id, 10)]
);
if (rows.length === 0) {
return res.status(404).json({ error: 'Location not found' });
}
if (rows[0].status !== 'discovered') {
return res.status(400).json({
error: `Location already has status: ${rows[0].status}`,
});
}
await pool.query(
`
UPDATE dutchie_discovery_locations
SET status = 'rejected',
verified_at = NOW(),
verified_by = $1,
notes = $2,
updated_at = NOW()
WHERE id = $3
`,
[verifiedBy, reason || 'Rejected by admin', id]
);
res.json({
success: true,
action: 'rejected',
discoveryId: parseInt(id, 10),
message: 'Location rejected',
});
} catch (error: any) {
res.status(500).json({ error: error.message });
}
});
/**
 * POST /api/discovery/locations/:id/unreject
 * Restore a rejected location back to discovered status
 *
 * Responses:
 *   200 { success, action: 'unrejected', discoveryId, message }
 *   400 invalid id, or location is not currently rejected
 *   404 location not found
 */
router.post('/locations/:id/unreject', async (req: Request, res: Response) => {
  try {
    // Validate the id up front so a non-numeric param yields a 400
    // instead of a NaN query parameter surfacing as a 500.
    const discoveryId = parseInt(req.params.id, 10);
    if (Number.isNaN(discoveryId)) {
      return res.status(400).json({ error: 'Invalid location id' });
    }
    const { rows } = await pool.query(
      `SELECT status FROM dutchie_discovery_locations WHERE id = $1`,
      [discoveryId]
    );
    if (rows.length === 0) {
      return res.status(404).json({ error: 'Location not found' });
    }
    if (rows[0].status !== 'rejected') {
      return res.status(400).json({
        error: `Location is not rejected. Current status: ${rows[0].status}`,
      });
    }
    // Clear the verification audit fields so the location returns to a
    // clean 'discovered' state.
    await pool.query(
      `
      UPDATE dutchie_discovery_locations
      SET status = 'discovered',
          verified_at = NULL,
          verified_by = NULL,
          updated_at = NOW()
      WHERE id = $1
      `,
      [discoveryId]
    );
    res.json({
      success: true,
      action: 'unrejected',
      discoveryId,
      message: 'Location restored to discovered status',
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});
// ============================================================
// DISCOVERY ADMIN ACTIONS
// ============================================================
/**
 * POST /api/discovery/admin/discover-state
 * Run discovery for an entire state
 *
 * Body: { stateCode: string; dryRun?: boolean; cityLimit?: number }
 */
router.post('/admin/discover-state', async (req: Request, res: Response) => {
  try {
    const { stateCode, dryRun = false, cityLimit = 100 } = req.body;
    if (!stateCode) {
      return res.status(400).json({ error: 'stateCode is required' });
    }
    console.log(`[Discovery API] Starting state discovery for ${stateCode}`);
    // Delegate the heavy lifting to the discovery pipeline.
    const discoveryOptions = { dryRun, cityLimit, verbose: true };
    const result = await discoverState(pool, stateCode, discoveryOptions);
    res.json({ success: true, stateCode, result });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});
/**
 * POST /api/discovery/admin/discover-city
 * Run discovery for a single city
 *
 * Body: { citySlug: string; stateCode?: string; countryCode?: string; dryRun?: boolean }
 */
router.post('/admin/discover-city', async (req: Request, res: Response) => {
  try {
    const { citySlug, stateCode, countryCode = 'US', dryRun = false } = req.body;
    if (!citySlug) {
      return res.status(400).json({ error: 'citySlug is required' });
    }
    console.log(`[Discovery API] Starting city discovery for ${citySlug}`);
    const cityOptions = { stateCode, countryCode, dryRun, verbose: true };
    // discoverCity resolves to a falsy value when the slug is unknown.
    const result = await discoverCity(pool, citySlug, cityOptions);
    if (!result) {
      return res.status(404).json({ error: `City not found: ${citySlug}` });
    }
    res.json({ success: true, citySlug, result });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});
/**
 * POST /api/discovery/admin/run-full
 * Run full discovery pipeline
 *
 * All body fields are optional; defaults match a routine refresh run.
 */
router.post('/admin/run-full', async (req: Request, res: Response) => {
  try {
    const body = req.body;
    // Mirror destructuring-default semantics: only `undefined` falls
    // back to the default value.
    const orDefault = <T>(value: T | undefined, fallback: T): T =>
      value === undefined ? fallback : value;
    const pipelineOptions = {
      stateCode: body.stateCode,
      countryCode: orDefault(body.countryCode, 'US'),
      cityLimit: orDefault(body.cityLimit, 50),
      skipCityDiscovery: orDefault(body.skipCityDiscovery, false),
      onlyStale: orDefault(body.onlyStale, true),
      staleDays: orDefault(body.staleDays, 7),
      dryRun: orDefault(body.dryRun, false),
      verbose: true,
    };
    console.log(`[Discovery API] Starting full discovery`);
    const result = await runFullDiscovery(pool, pipelineOptions);
    res.json({ success: true, result });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});
/**
 * POST /api/discovery/admin/seed-cities
 * Seed known cities for a state
 *
 * Body: { stateCode: string } — only states with a predefined city
 * list are accepted.
 */
router.post('/admin/seed-cities', async (req: Request, res: Response) => {
  try {
    const stateCode = req.body.stateCode;
    if (!stateCode) {
      return res.status(400).json({ error: 'stateCode is required' });
    }
    // Lookup table of predefined city lists per state.
    const knownCityLists: Record<string, any[]> = { AZ: ARIZONA_CITIES };
    const cities = knownCityLists[stateCode];
    if (!cities) {
      return res.status(400).json({
        error: `No predefined cities for state: ${stateCode}. Add cities to city-discovery.ts`,
      });
    }
    const result = await seedKnownCities(pool, cities);
    res.json({ success: true, stateCode, ...result });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});
/**
 * GET /api/discovery/admin/match-candidates/:id
 * Find potential dispensary matches for a discovery location
 *
 * Returns up to 10 candidate dispensaries in the same state, matched by
 * exact name, partial (first-word) name, same city, or proximity
 * (< 5 miles when both sides have coordinates).
 */
router.get('/admin/match-candidates/:id', async (req: Request, res: Response) => {
  try {
    const { id } = req.params;
    // Get the discovery location
    const { rows: locRows } = await pool.query(
      `SELECT * FROM dutchie_discovery_locations WHERE id = $1`,
      [parseInt(id, 10)]
    );
    if (locRows.length === 0) {
      return res.status(404).json({ error: 'Location not found' });
    }
    const location = locRows[0];
    // Find potential matches by name similarity and location.
    // Params: $1 full name, $2 first-word ILIKE pattern, $3 city,
    // $4 state, $5/$6 latitude/longitude. Distance uses the spherical
    // law of cosines with 3959 ≈ Earth's radius in miles.
    const { rows: candidates } = await pool.query(
      `
      SELECT
        d.id,
        d.name,
        d.city,
        d.state,
        d.address,
        d.menu_type,
        d.platform_dispensary_id,
        d.menu_url,
        d.latitude,
        d.longitude,
        CASE
          WHEN d.name ILIKE $1 THEN 'exact_name'
          WHEN d.name ILIKE $2 THEN 'partial_name'
          WHEN d.city ILIKE $3 AND d.state = $4 THEN 'same_city'
          ELSE 'location_match'
        END as match_type,
        -- Distance in miles if coordinates available
        CASE
          WHEN d.latitude IS NOT NULL AND d.longitude IS NOT NULL
            AND $5::float IS NOT NULL AND $6::float IS NOT NULL
          THEN (3959 * acos(
            cos(radians($5::float)) * cos(radians(d.latitude)) *
            cos(radians(d.longitude) - radians($6::float)) +
            sin(radians($5::float)) * sin(radians(d.latitude))
          ))
          ELSE NULL
        END as distance_miles
      FROM dispensaries d
      WHERE d.state = $4
        AND (
          d.name ILIKE $1
          OR d.name ILIKE $2
          OR d.city ILIKE $3
          OR (
            d.latitude IS NOT NULL
            AND d.longitude IS NOT NULL
            AND $5::float IS NOT NULL
            AND $6::float IS NOT NULL
            AND (3959 * acos(
              cos(radians($5::float)) * cos(radians(d.latitude)) *
              cos(radians(d.longitude) - radians($6::float)) +
              sin(radians($5::float)) * sin(radians(d.latitude))
            )) < 5
          )
        )
      ORDER BY
        CASE
          WHEN d.name ILIKE $1 THEN 1
          WHEN d.name ILIKE $2 THEN 2
          ELSE 3
        END,
        distance_miles NULLS LAST
      LIMIT 10
      `,
      [
        location.name,
        // NOTE(review): the first word of the name is embedded in an
        // ILIKE pattern without escaping '%'/'_' — names containing
        // those wildcards broaden the match. Confirm this is acceptable.
        `%${location.name.split(' ')[0]}%`,
        location.city,
        location.state_code,
        location.latitude,
        location.longitude,
      ]
    );
    res.json({
      location: mapLocationRowToLocation(location),
      candidates: candidates.map((c: any) => ({
        id: c.id,
        name: c.name,
        city: c.city,
        state: c.state,
        address: c.address,
        menuType: c.menu_type,
        platformDispensaryId: c.platform_dispensary_id,
        menuUrl: c.menu_url,
        matchType: c.match_type,
        // Rounded to one decimal place; a 0 distance maps to null here
        // because of the truthiness check.
        distanceMiles: c.distance_miles ? Math.round(c.distance_miles * 10) / 10 : null,
      })),
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});
return router;
}
export default createDiscoveryRoutes;

View File

@@ -0,0 +1,269 @@
/**
 * Dutchie Discovery Types
 *
 * Type definitions for the Dutchie store discovery pipeline.
 */
// ============================================================
// DISCOVERY CITY
// ============================================================
/**
 * Camel-case view of a row in dutchie_discovery_cities, as exposed to
 * the API layer. See mapCityRowToCity for the conversion.
 */
export interface DiscoveryCity {
  id: number;
  platform: string;           // e.g. 'dutchie'
  cityName: string;
  citySlug: string;           // URL slug used on dutchie.com city pages
  stateCode: string | null;   // 2-letter state/province code, when known
  countryCode: string;        // e.g. 'US'
  lastCrawledAt: Date | null; // null = never crawled
  crawlEnabled: boolean;
  locationCount: number | null;
  notes: string | null;
  metadata: Record<string, any> | null;
  createdAt: Date;
  updatedAt: Date;
}
/**
 * Raw snake_case database row shape for dutchie_discovery_cities.
 * Field-for-field mirror of DiscoveryCity.
 */
export interface DiscoveryCityRow {
  id: number;
  platform: string;
  city_name: string;
  city_slug: string;
  state_code: string | null;
  country_code: string;
  last_crawled_at: Date | null;
  crawl_enabled: boolean;
  location_count: number | null;
  notes: string | null;
  metadata: Record<string, any> | null;
  created_at: Date;
  updated_at: Date;
}
// ============================================================
// DISCOVERY LOCATION
// ============================================================
/**
 * Lifecycle status of a discovered location:
 *  - 'discovered': found by a crawl, awaiting review
 *  - 'verified':   confirmed by an admin
 *  - 'rejected':   dismissed by an admin (restorable via unreject)
 *  - 'merged':     linked to an existing dispensaries row
 */
export type DiscoveryStatus = 'discovered' | 'verified' | 'rejected' | 'merged';
/**
 * Camel-case view of a row in dutchie_discovery_locations, as exposed
 * to the API layer. See mapLocationRowToLocation for the conversion.
 */
export interface DiscoveryLocation {
  id: number;
  platform: string;                 // e.g. 'dutchie'
  platformLocationId: string;       // the platform's own id for the store
  platformSlug: string;
  platformMenuUrl: string;
  name: string;
  rawAddress: string | null;        // unparsed address string, when present
  addressLine1: string | null;
  addressLine2: string | null;
  city: string | null;
  stateCode: string | null;
  postalCode: string | null;
  countryCode: string | null;
  latitude: number | null;
  longitude: number | null;
  timezone: string | null;
  status: DiscoveryStatus;
  dispensaryId: number | null;      // set when merged/linked to a dispensary
  discoveryCityId: number | null;   // FK to dutchie_discovery_cities
  metadata: Record<string, any> | null;
  notes: string | null;
  offersDelivery: boolean | null;
  offersPickup: boolean | null;
  isRecreational: boolean | null;
  isMedical: boolean | null;
  firstSeenAt: Date;
  lastSeenAt: Date;
  lastCheckedAt: Date | null;
  verifiedAt: Date | null;          // when an admin made a decision
  verifiedBy: string | null;        // who made the decision
  active: boolean;
  createdAt: Date;
  updatedAt: Date;
}
/**
 * Raw snake_case database row shape for dutchie_discovery_locations.
 * Field-for-field mirror of DiscoveryLocation.
 */
export interface DiscoveryLocationRow {
  id: number;
  platform: string;
  platform_location_id: string;
  platform_slug: string;
  platform_menu_url: string;
  name: string;
  raw_address: string | null;
  address_line1: string | null;
  address_line2: string | null;
  city: string | null;
  state_code: string | null;
  postal_code: string | null;
  country_code: string | null;
  latitude: number | null;
  longitude: number | null;
  timezone: string | null;
  status: DiscoveryStatus;
  dispensary_id: number | null;
  discovery_city_id: number | null;
  metadata: Record<string, any> | null;
  notes: string | null;
  offers_delivery: boolean | null;
  offers_pickup: boolean | null;
  is_recreational: boolean | null;
  is_medical: boolean | null;
  first_seen_at: Date;
  last_seen_at: Date;
  last_checked_at: Date | null;
  verified_at: Date | null;
  verified_by: string | null;
  active: boolean;
  created_at: Date;
  updated_at: Date;
}
// ============================================================
// RAW API RESPONSES
// ============================================================
/**
 * City entry as returned by Dutchie endpoints.
 * State/country fields are optional because payload shapes vary.
 */
export interface DutchieCityResponse {
  slug: string;
  name: string;
  state?: string;
  stateCode?: string;
  country?: string;
  countryCode?: string;
}
/**
 * Location entry as returned by Dutchie endpoints.
 * Most fields are optional because payloads differ between endpoints;
 * the index signature preserves anything not explicitly modeled.
 */
export interface DutchieLocationResponse {
  id: string;
  name: string;
  slug: string;
  address?: string;
  address1?: string;
  address2?: string;
  city?: string;
  state?: string;
  zip?: string;
  zipCode?: string;
  country?: string;
  latitude?: number;
  longitude?: number;
  timezone?: string;
  menuUrl?: string;
  retailType?: string;
  offerPickup?: boolean;
  offerDelivery?: boolean;
  isRecreational?: boolean;
  isMedical?: boolean;
  // Raw response preserved
  [key: string]: any;
}
// ============================================================
// DISCOVERY RESULTS
// ============================================================
/** Outcome of a city-discovery pass. */
export interface CityDiscoveryResult {
  citiesFound: number;
  citiesUpserted: number;
  citiesSkipped: number;
  errors: string[];
  durationMs: number;
}
/** Outcome of discovering locations within a single city. */
export interface LocationDiscoveryResult {
  cityId: number;
  citySlug: string;
  locationsFound: number;
  locationsUpserted: number;
  locationsNew: number;       // rows inserted for the first time
  locationsUpdated: number;   // rows that already existed
  errors: string[];
  durationMs: number;
}
/** Combined result of the full discovery pipeline (cities + locations). */
export interface FullDiscoveryResult {
  cities: CityDiscoveryResult;
  locations: LocationDiscoveryResult[];
  totalLocationsFound: number;
  totalLocationsUpserted: number;
  durationMs: number;
}
// ============================================================
// VERIFICATION
// ============================================================
/** Result of verifying a discovered location (create/link/reject). */
export interface VerificationResult {
  success: boolean;
  discoveryId: number;
  dispensaryId: number | null;   // null when no dispensary was involved
  action: 'created' | 'linked' | 'rejected';
  error?: string;
}
/** Result of promoting a verified location into a crawlable dispensary. */
export interface PromotionResult {
  success: boolean;
  discoveryId: number;
  dispensaryId: number;
  crawlProfileId?: number;
  scheduleId?: number;
  error?: string;
}
// ============================================================
// MAPPER FUNCTIONS
// ============================================================
/**
 * Convert a snake_case dutchie_discovery_cities row into the camelCase
 * DiscoveryCity shape used by the API layer.
 */
export function mapCityRowToCity(row: DiscoveryCityRow): DiscoveryCity {
  const {
    id,
    platform,
    city_name: cityName,
    city_slug: citySlug,
    state_code: stateCode,
    country_code: countryCode,
    last_crawled_at: lastCrawledAt,
    crawl_enabled: crawlEnabled,
    location_count: locationCount,
    notes,
    metadata,
    created_at: createdAt,
    updated_at: updatedAt,
  } = row;
  return {
    id,
    platform,
    cityName,
    citySlug,
    stateCode,
    countryCode,
    lastCrawledAt,
    crawlEnabled,
    locationCount,
    notes,
    metadata,
    createdAt,
    updatedAt,
  };
}
/**
 * Convert a snake_case dutchie_discovery_locations row into the
 * camelCase DiscoveryLocation shape used by the API layer.
 */
export function mapLocationRowToLocation(row: DiscoveryLocationRow): DiscoveryLocation {
  const {
    id,
    platform,
    platform_location_id: platformLocationId,
    platform_slug: platformSlug,
    platform_menu_url: platformMenuUrl,
    name,
    raw_address: rawAddress,
    address_line1: addressLine1,
    address_line2: addressLine2,
    city,
    state_code: stateCode,
    postal_code: postalCode,
    country_code: countryCode,
    latitude,
    longitude,
    timezone,
    status,
    dispensary_id: dispensaryId,
    discovery_city_id: discoveryCityId,
    metadata,
    notes,
    offers_delivery: offersDelivery,
    offers_pickup: offersPickup,
    is_recreational: isRecreational,
    is_medical: isMedical,
    first_seen_at: firstSeenAt,
    last_seen_at: lastSeenAt,
    last_checked_at: lastCheckedAt,
    verified_at: verifiedAt,
    verified_by: verifiedBy,
    active,
    created_at: createdAt,
    updated_at: updatedAt,
  } = row;
  return {
    id,
    platform,
    platformLocationId,
    platformSlug,
    platformMenuUrl,
    name,
    rawAddress,
    addressLine1,
    addressLine2,
    city,
    stateCode,
    postalCode,
    countryCode,
    latitude,
    longitude,
    timezone,
    status,
    dispensaryId,
    discoveryCityId,
    metadata,
    notes,
    offersDelivery,
    offersPickup,
    isRecreational,
    isMedical,
    firstSeenAt,
    lastSeenAt,
    lastCheckedAt,
    verifiedAt,
    verifiedBy,
    active,
    createdAt,
    updatedAt,
  };
}

View File

@@ -1,50 +1,99 @@
/**
* Dutchie AZ Database Connection
* CannaiQ Database Connection
*
* Isolated database connection for Dutchie Arizona data.
* Uses a separate database/schema to prevent cross-contamination with main app data.
* All database access for the CannaiQ platform goes through this module.
*
* SINGLE DATABASE ARCHITECTURE:
* - All services (auth, orchestrator, crawlers, admin) use this ONE database
* - States are modeled via states table + state_id on dispensaries (not separate DBs)
*
* CONFIGURATION (in priority order):
* 1. CANNAIQ_DB_URL - Full connection string (preferred)
* 2. Individual vars: CANNAIQ_DB_HOST, CANNAIQ_DB_PORT, CANNAIQ_DB_NAME, CANNAIQ_DB_USER, CANNAIQ_DB_PASS
* 3. DATABASE_URL - Legacy fallback for K8s compatibility
*
* IMPORTANT:
* - Do NOT create separate pools elsewhere
* - All services should import from this module
*/
import { Pool, PoolClient } from 'pg';
// Consolidated DB naming:
// - Prefer CRAWLSY_DATABASE_URL (e.g., crawlsy_local, crawlsy_prod)
// - Then DUTCHIE_AZ_DATABASE_URL (legacy)
// - Finally DATABASE_URL (legacy main DB)
const DUTCHIE_AZ_DATABASE_URL =
process.env.CRAWLSY_DATABASE_URL ||
process.env.DUTCHIE_AZ_DATABASE_URL ||
process.env.DATABASE_URL ||
'postgresql://dutchie:dutchie_local_pass@localhost:54320/crawlsy_local';
/**
 * Resolve the database connection string from the environment.
 *
 * Priority: CANNAIQ_DB_URL, then the five individual CANNAIQ_DB_* vars,
 * then legacy DATABASE_URL. Throws with a list of missing variables
 * when nothing usable is configured.
 */
function getConnectionString(): string {
  const env = process.env;
  // Priority 1: full connection URL
  if (env.CANNAIQ_DB_URL) {
    return env.CANNAIQ_DB_URL;
  }
  // Priority 2: assemble from individual parts (all five required)
  const parts = {
    host: env.CANNAIQ_DB_HOST,
    port: env.CANNAIQ_DB_PORT,
    name: env.CANNAIQ_DB_NAME,
    user: env.CANNAIQ_DB_USER,
    pass: env.CANNAIQ_DB_PASS,
  };
  if (parts.host && parts.port && parts.name && parts.user && parts.pass) {
    return `postgresql://${parts.user}:${parts.pass}@${parts.host}:${parts.port}/${parts.name}`;
  }
  // Priority 3: legacy/K8s fallback
  if (env.DATABASE_URL) {
    return env.DATABASE_URL;
  }
  // Nothing usable — report exactly which individual vars are absent.
  const required = ['CANNAIQ_DB_HOST', 'CANNAIQ_DB_PORT', 'CANNAIQ_DB_NAME', 'CANNAIQ_DB_USER', 'CANNAIQ_DB_PASS'];
  const missing = required.filter((key) => !env[key]);
  throw new Error(
    `[CannaiQ DB] Missing database configuration.\n` +
    `Set CANNAIQ_DB_URL, DATABASE_URL, or all of: ${missing.join(', ')}`
  );
}
let pool: Pool | null = null;
/**
* Get the Dutchie AZ database pool (singleton)
* Get the CannaiQ database pool (singleton)
*
* This is the canonical pool for all CannaiQ services.
* Do NOT create separate pools elsewhere.
*/
export function getDutchieAZPool(): Pool {
export function getPool(): Pool {
if (!pool) {
pool = new Pool({
connectionString: DUTCHIE_AZ_DATABASE_URL,
connectionString: getConnectionString(),
max: 10,
idleTimeoutMillis: 30000,
connectionTimeoutMillis: 5000,
});
pool.on('error', (err) => {
console.error('[DutchieAZ DB] Unexpected error on idle client:', err);
console.error('[CannaiQ DB] Unexpected error on idle client:', err);
});
console.log('[DutchieAZ DB] Pool initialized');
console.log('[CannaiQ DB] Pool initialized');
}
return pool;
}
/**
* Execute a query on the Dutchie AZ database
* @deprecated Use getPool() instead
*/
/** @deprecated Legacy alias for getPool(); warns on every call. */
export function getDutchieAZPool(): Pool {
  const deprecationNotice = '[CannaiQ DB] getDutchieAZPool() is deprecated. Use getPool() instead.';
  console.warn(deprecationNotice);
  return getPool();
}
/**
* Execute a query on the CannaiQ database
*/
export async function query<T = any>(text: string, params?: any[]): Promise<{ rows: T[]; rowCount: number }> {
const p = getDutchieAZPool();
const p = getPool();
const result = await p.query(text, params);
return { rows: result.rows as T[], rowCount: result.rowCount || 0 };
}
@@ -53,7 +102,7 @@ export async function query<T = any>(text: string, params?: any[]): Promise<{ ro
* Get a client from the pool for transaction use
*/
export async function getClient(): Promise<PoolClient> {
const p = getDutchieAZPool();
const p = getPool();
return p.connect();
}
@@ -64,7 +113,7 @@ export async function closePool(): Promise<void> {
if (pool) {
await pool.end();
pool = null;
console.log('[DutchieAZ DB] Pool closed');
console.log('[CannaiQ DB] Pool closed');
}
}
@@ -76,7 +125,7 @@ export async function healthCheck(): Promise<boolean> {
const result = await query('SELECT 1 as ok');
return result.rows.length > 0 && result.rows[0].ok === 1;
} catch (error) {
console.error('[DutchieAZ DB] Health check failed:', error);
console.error('[CannaiQ DB] Health check failed:', error);
return false;
}
}

View File

@@ -0,0 +1,137 @@
/**
* Dispensary Column Definitions
*
* Centralized column list for dispensaries table queries.
* Handles optional columns that may not exist in all environments.
*
* USAGE:
* import { DISPENSARY_COLUMNS, DISPENSARY_COLUMNS_WITH_FAILED } from '../db/dispensary-columns';
* const result = await query(`SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE ...`);
*/
/**
 * Core dispensary columns that always exist.
 * These are guaranteed to be present in all environments.
 */
const CORE_COLUMNS = `
  id, name, slug, city, state, zip, address, latitude, longitude,
  menu_type, menu_url, platform_dispensary_id, website,
  created_at, updated_at
`;
/**
 * Optional columns with NULL fallback.
 *
 * provider_detection_data: Added in migration 044
 * active_crawler_profile_id: Added in migration 041
 *
 * Using COALESCE ensures the query works whether or not the column exists:
 * - If column exists: returns the actual value
 * - If column doesn't exist: query fails (but migration should be run)
 *
 * For pre-migration compatibility, we select NULL::jsonb which always works.
 * After migration 044 is applied, this can be changed to the real column.
 */
// TEMPORARY: Use NULL fallback until migration 044 is applied
// After running 044, change this to: provider_detection_data
// NOTE(review): while this fallback is in place, queries built from
// these constants will always see NULL here — even after migration 044
// is applied — until this constant is switched to the real column.
const PROVIDER_DETECTION_COLUMN = `NULL::jsonb AS provider_detection_data`;
// After migration 044 is applied, uncomment this line and remove the above:
// const PROVIDER_DETECTION_COLUMN = `provider_detection_data`;
/**
 * Standard dispensary columns for most queries.
 * Includes provider_detection_data with NULL fallback for pre-migration compatibility.
 */
export const DISPENSARY_COLUMNS = `${CORE_COLUMNS.trim()},
  ${PROVIDER_DETECTION_COLUMN}`;
/**
 * Dispensary columns including active_crawler_profile_id.
 * Used by routes that need profile information.
 */
export const DISPENSARY_COLUMNS_WITH_PROFILE = `${CORE_COLUMNS.trim()},
  ${PROVIDER_DETECTION_COLUMN},
  active_crawler_profile_id`;
/**
 * Dispensary columns including failed_at.
 * Used by worker for compatibility checks.
 */
export const DISPENSARY_COLUMNS_WITH_FAILED = `${CORE_COLUMNS.trim()},
  ${PROVIDER_DETECTION_COLUMN},
  failed_at`;
/**
* NOTE: After migration 044 is applied, update PROVIDER_DETECTION_COLUMN above
* to use the real column instead of NULL fallback.
*
* To verify migration status:
* SELECT column_name FROM information_schema.columns
* WHERE table_name = 'dispensaries' AND column_name = 'provider_detection_data';
*/
// Cache for column existence check. Only a successful probe is cached;
// a failed probe (e.g. transient DB outage) is retried on the next call
// so one early error does not permanently disable the feature.
let _providerDetectionColumnExists: boolean | null = null;
/**
 * Check if provider_detection_data column exists in dispensaries table.
 * A successful check is cached after the first call.
 *
 * @param pool - Anything with a query(sql) method (pg Pool or client)
 * @returns true if the column exists; false if it is missing or the
 *          probe query failed (the failure is not cached)
 */
export async function hasProviderDetectionColumn(pool: { query: (sql: string) => Promise<{ rows: any[] }> }): Promise<boolean> {
  if (_providerDetectionColumnExists !== null) {
    return _providerDetectionColumnExists;
  }
  try {
    const result = await pool.query(`
      SELECT 1 FROM information_schema.columns
      WHERE table_name = 'dispensaries' AND column_name = 'provider_detection_data'
    `);
    _providerDetectionColumnExists = result.rows.length > 0;
    return _providerDetectionColumnExists;
  } catch {
    // Do NOT cache on error: the query may have failed for transient
    // reasons (connection drop) rather than a missing column.
    return false;
  }
}
/**
 * Safely update provider_detection_data column.
 * If the column is missing (migration 044 not applied) this logs a
 * warning and returns false instead of crashing the caller.
 *
 * @param pool - Database pool with query method
 * @param dispensaryId - ID of dispensary to update
 * @param data - JSONB data to merge into provider_detection_data
 * @returns true if update succeeded, false if column doesn't exist
 */
export async function safeUpdateProviderDetectionData(
  pool: { query: (sql: string, params?: any[]) => Promise<any> },
  dispensaryId: number,
  data: Record<string, any>
): Promise<boolean> {
  // Bail out early when the column is known to be absent.
  const columnPresent = await hasProviderDetectionColumn(pool);
  if (!columnPresent) {
    console.warn(`[DispensaryColumns] provider_detection_data column not found. Run migration 044 to add it.`);
    return false;
  }
  try {
    const params = [JSON.stringify(data), dispensaryId];
    // Merge (||) the new JSON into the existing value, treating NULL as {}.
    await pool.query(
      `UPDATE dispensaries
       SET provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) || $1::jsonb,
           updated_at = NOW()
       WHERE id = $2`,
      params
    );
    return true;
  } catch (error: any) {
    const mentionsColumn = error.message?.includes('provider_detection_data');
    if (!mentionsColumn) {
      throw error;
    }
    console.warn(`[DispensaryColumns] Failed to update provider_detection_data: ${error.message}`);
    return false;
  }
}

View File

@@ -0,0 +1,403 @@
/**
* DtCityDiscoveryService
*
* Core service for Dutchie city discovery.
* Contains shared logic used by multiple entrypoints.
*
* Responsibilities:
* - Browser/API-based city fetching
* - Manual city seeding
* - City upsert operations
*/
import { Pool } from 'pg';
import axios from 'axios';
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
puppeteer.use(StealthPlugin());
// ============================================================
// TYPES
// ============================================================
/** A city as discovered from Dutchie, normalized for storage. */
export interface DutchieCity {
  name: string;
  slug: string;
  stateCode: string | null;   // 2-letter code; null when it can't be derived
  countryCode: string;        // e.g. 'US' or 'CA'
  url?: string;               // source link on dutchie.com, when known
}
/** Aggregate outcome of one auto-discovery run. */
export interface CityDiscoveryResult {
  citiesFound: number;
  citiesInserted: number;
  citiesUpdated: number;
  errors: string[];
  durationMs: number;
}
/** Outcome of manually seeding a single city. */
export interface ManualSeedResult {
  city: DutchieCity;
  id: number;             // dutchie_discovery_cities.id
  wasInserted: boolean;   // false when the row already existed (updated)
}
// ============================================================
// US STATE CODE MAPPING
// ============================================================
// Keys are the state slugs used in dutchie.com /city/<state>/<city> URLs.
export const US_STATE_MAP: Record<string, string> = {
  'alabama': 'AL', 'alaska': 'AK', 'arizona': 'AZ', 'arkansas': 'AR',
  'california': 'CA', 'colorado': 'CO', 'connecticut': 'CT', 'delaware': 'DE',
  'florida': 'FL', 'georgia': 'GA', 'hawaii': 'HI', 'idaho': 'ID',
  'illinois': 'IL', 'indiana': 'IN', 'iowa': 'IA', 'kansas': 'KS',
  'kentucky': 'KY', 'louisiana': 'LA', 'maine': 'ME', 'maryland': 'MD',
  'massachusetts': 'MA', 'michigan': 'MI', 'minnesota': 'MN', 'mississippi': 'MS',
  'missouri': 'MO', 'montana': 'MT', 'nebraska': 'NE', 'nevada': 'NV',
  'new-hampshire': 'NH', 'new-jersey': 'NJ', 'new-mexico': 'NM', 'new-york': 'NY',
  'north-carolina': 'NC', 'north-dakota': 'ND', 'ohio': 'OH', 'oklahoma': 'OK',
  'oregon': 'OR', 'pennsylvania': 'PA', 'rhode-island': 'RI', 'south-carolina': 'SC',
  'south-dakota': 'SD', 'tennessee': 'TN', 'texas': 'TX', 'utah': 'UT',
  'vermont': 'VT', 'virginia': 'VA', 'washington': 'WA', 'west-virginia': 'WV',
  'wisconsin': 'WI', 'wyoming': 'WY', 'district-of-columbia': 'DC',
};
// Canadian province mapping (same slug convention as US_STATE_MAP)
export const CA_PROVINCE_MAP: Record<string, string> = {
  'alberta': 'AB', 'british-columbia': 'BC', 'manitoba': 'MB',
  'new-brunswick': 'NB', 'newfoundland-and-labrador': 'NL',
  'northwest-territories': 'NT', 'nova-scotia': 'NS', 'nunavut': 'NU',
  'ontario': 'ON', 'prince-edward-island': 'PE', 'quebec': 'QC',
  'saskatchewan': 'SK', 'yukon': 'YT',
};
// ============================================================
// CITY FETCHING (AUTO DISCOVERY)
// ============================================================
/**
 * Fetch cities from Dutchie's /cities page using Puppeteer.
 *
 * Scrapes every anchor matching /city/<state-slug>/<city-slug>, then
 * maps the state slug to a 2-letter state/province code and a country
 * code. The browser is always closed, even on error.
 */
export async function fetchCitiesFromBrowser(): Promise<DutchieCity[]> {
  console.log('[DtCityDiscoveryService] Launching browser to fetch cities...');
  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
  });
  try {
    const page = await browser.newPage();
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    );
    console.log('[DtCityDiscoveryService] Navigating to https://dutchie.com/cities...');
    await page.goto('https://dutchie.com/cities', {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });
    // Give client-side rendering a moment to populate the city links.
    await new Promise((r) => setTimeout(r, 3000));
    // Runs in the page context: collect every /city/<state>/<city> link.
    const cities = await page.evaluate(() => {
      const cityLinks: Array<{
        name: string;
        slug: string;
        url: string;
        stateSlug: string | null;
      }> = [];
      const links = document.querySelectorAll('a[href*="/city/"]');
      links.forEach((link) => {
        const href = (link as HTMLAnchorElement).href;
        const text = (link as HTMLElement).innerText?.trim();
        const match = href.match(/\/city\/([^/]+)\/([^/?]+)/);
        if (match && text) {
          cityLinks.push({
            name: text,
            slug: match[2],
            url: href,
            stateSlug: match[1],
          });
        }
      });
      return cityLinks;
    });
    console.log(`[DtCityDiscoveryService] Extracted ${cities.length} city links from page`);
    return cities.map((city) => {
      let countryCode = 'US';
      let stateCode: string | null = null;
      if (city.stateSlug) {
        if (US_STATE_MAP[city.stateSlug]) {
          stateCode = US_STATE_MAP[city.stateSlug];
          countryCode = 'US';
        } else if (CA_PROVINCE_MAP[city.stateSlug]) {
          stateCode = CA_PROVINCE_MAP[city.stateSlug];
          countryCode = 'CA';
        } else if (city.stateSlug.length === 2) {
          // Slug is already a 2-letter code; detect Canadian provinces
          // by checking the code against the province map's values.
          stateCode = city.stateSlug.toUpperCase();
          if (Object.values(CA_PROVINCE_MAP).includes(stateCode)) {
            countryCode = 'CA';
          }
        }
      }
      return {
        name: city.name,
        slug: city.slug,
        stateCode,
        countryCode,
        url: city.url,
      };
    });
  } finally {
    await browser.close();
  }
}
/**
 * Fetch cities via API endpoints (fallback).
 *
 * Tries each known endpoint in order and returns the first array
 * payload, normalized to DutchieCity. Returns [] when every endpoint
 * fails or returns a non-array payload.
 */
export async function fetchCitiesFromAPI(): Promise<DutchieCity[]> {
  console.log('[DtCityDiscoveryService] Attempting API-based city discovery...');
  const apiEndpoints = [
    'https://dutchie.com/api/cities',
    'https://api.dutchie.com/v1/cities',
  ];
  for (const endpoint of apiEndpoints) {
    try {
      const response = await axios.get(endpoint, {
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0',
          Accept: 'application/json',
        },
        timeout: 15000,
      });
      const payload = response.data;
      if (!payload || !Array.isArray(payload)) {
        continue;
      }
      console.log(`[DtCityDiscoveryService] API returned ${payload.length} cities`);
      return payload.map((c: any) => ({
        name: c.name || c.city,
        slug: c.slug || c.citySlug,
        stateCode: c.stateCode || c.state,
        countryCode: c.countryCode || c.country || 'US',
      }));
    } catch (error: any) {
      console.log(`[DtCityDiscoveryService] API ${endpoint} failed: ${error.message}`);
    }
  }
  return [];
}
// ============================================================
// DATABASE OPERATIONS
// ============================================================
/**
 * Upsert a city into dutchie_discovery_cities
 *
 * Conflict target is (platform, country_code, state_code, city_slug);
 * on conflict the display name is refreshed and crawling re-enabled.
 * `(xmax = 0)` is the Postgres idiom for "this row was inserted rather
 * than updated" in an ON CONFLICT upsert.
 *
 * NOTE(review): rows with NULL state_code will not dedupe through this
 * constraint unless the unique index treats NULLs as equal — confirm
 * against the schema.
 */
export async function upsertCity(
  pool: Pool,
  city: DutchieCity
): Promise<{ id: number; inserted: boolean; updated: boolean }> {
  const result = await pool.query(
    `
    INSERT INTO dutchie_discovery_cities (
      platform,
      city_name,
      city_slug,
      state_code,
      country_code,
      crawl_enabled,
      created_at,
      updated_at
    ) VALUES (
      'dutchie',
      $1,
      $2,
      $3,
      $4,
      TRUE,
      NOW(),
      NOW()
    )
    ON CONFLICT (platform, country_code, state_code, city_slug)
    DO UPDATE SET
      city_name = EXCLUDED.city_name,
      crawl_enabled = TRUE,
      updated_at = NOW()
    RETURNING id, (xmax = 0) AS inserted
    `,
    [city.name, city.slug, city.stateCode, city.countryCode]
  );
  const inserted = result.rows[0]?.inserted === true;
  return {
    id: result.rows[0]?.id,
    inserted,
    updated: !inserted,
  };
}
// ============================================================
// MAIN SERVICE CLASS
// ============================================================
/**
 * Facade over city discovery: auto-discovery (browser with API
 * fallback), manual seeding, and aggregate statistics. All DB access
 * goes through the injected pg Pool.
 */
export class DtCityDiscoveryService {
  constructor(private pool: Pool) {}
  /**
   * Run auto-discovery (browser + API fallback)
   *
   * Never throws: per-city and top-level failures are collected into
   * the returned `errors` array.
   */
  async runAutoDiscovery(): Promise<CityDiscoveryResult> {
    const startTime = Date.now();
    const errors: string[] = [];
    let citiesFound = 0;
    let citiesInserted = 0;
    let citiesUpdated = 0;
    console.log('[DtCityDiscoveryService] Starting auto city discovery...');
    try {
      let cities = await fetchCitiesFromBrowser();
      // Fall back to the API endpoints when scraping finds nothing.
      if (cities.length === 0) {
        console.log('[DtCityDiscoveryService] Browser returned 0 cities, trying API...');
        cities = await fetchCitiesFromAPI();
      }
      citiesFound = cities.length;
      console.log(`[DtCityDiscoveryService] Found ${citiesFound} cities`);
      for (const city of cities) {
        try {
          const result = await upsertCity(this.pool, city);
          if (result.inserted) citiesInserted++;
          else if (result.updated) citiesUpdated++;
        } catch (error: any) {
          const msg = `Failed to upsert city ${city.slug}: ${error.message}`;
          console.error(`[DtCityDiscoveryService] ${msg}`);
          errors.push(msg);
        }
      }
    } catch (error: any) {
      const msg = `Auto discovery failed: ${error.message}`;
      console.error(`[DtCityDiscoveryService] ${msg}`);
      errors.push(msg);
    }
    const durationMs = Date.now() - startTime;
    return {
      citiesFound,
      citiesInserted,
      citiesUpdated,
      errors,
      durationMs,
    };
  }
  /**
   * Seed a single city manually
   */
  async seedCity(city: DutchieCity): Promise<ManualSeedResult> {
    console.log(`[DtCityDiscoveryService] Seeding city: ${city.name} (${city.slug}), ${city.stateCode}, ${city.countryCode}`);
    const result = await upsertCity(this.pool, city);
    return {
      city,
      id: result.id,
      wasInserted: result.inserted,
    };
  }
  /**
   * Seed multiple cities from a list
   *
   * Failures are collected per city; one bad city does not stop the rest.
   */
  async seedCities(cities: DutchieCity[]): Promise<{
    results: ManualSeedResult[];
    errors: string[];
  }> {
    const results: ManualSeedResult[] = [];
    const errors: string[] = [];
    for (const city of cities) {
      try {
        const result = await this.seedCity(city);
        results.push(result);
      } catch (error: any) {
        errors.push(`${city.slug}: ${error.message}`);
      }
    }
    return { results, errors };
  }
  /**
   * Get statistics about discovered cities
   *
   * Runs five independent COUNT queries in parallel via Promise.all.
   */
  async getStats(): Promise<{
    total: number;
    byCountry: Array<{ countryCode: string; count: number }>;
    byState: Array<{ stateCode: string; countryCode: string; count: number }>;
    crawlEnabled: number;
    neverCrawled: number;
  }> {
    const [totalRes, byCountryRes, byStateRes, enabledRes, neverRes] = await Promise.all([
      this.pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_cities WHERE platform = \'dutchie\''),
      this.pool.query(`
        SELECT country_code, COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE platform = 'dutchie'
        GROUP BY country_code
        ORDER BY cnt DESC
      `),
      this.pool.query(`
        SELECT state_code, country_code, COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE platform = 'dutchie' AND state_code IS NOT NULL
        GROUP BY state_code, country_code
        ORDER BY cnt DESC
      `),
      this.pool.query(`
        SELECT COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE platform = 'dutchie' AND crawl_enabled = TRUE
      `),
      this.pool.query(`
        SELECT COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE platform = 'dutchie' AND last_crawled_at IS NULL
      `),
    ]);
    // COUNT(*) comes back as a string from pg — parse to numbers.
    return {
      total: parseInt(totalRes.rows[0]?.cnt || '0', 10),
      byCountry: byCountryRes.rows.map((r) => ({
        countryCode: r.country_code,
        count: parseInt(r.cnt, 10),
      })),
      byState: byStateRes.rows.map((r) => ({
        stateCode: r.state_code,
        countryCode: r.country_code,
        count: parseInt(r.cnt, 10),
      })),
      crawlEnabled: parseInt(enabledRes.rows[0]?.cnt || '0', 10),
      neverCrawled: parseInt(neverRes.rows[0]?.cnt || '0', 10),
    };
  }
}
export default DtCityDiscoveryService;

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,390 @@
/**
* DutchieCityDiscovery
*
* Discovers cities from Dutchie's /cities page and upserts to dutchie_discovery_cities.
*
* Responsibilities:
* - Fetch all cities available on Dutchie
* - For each city derive: city_name, city_slug, state_code, country_code
* - Upsert into dutchie_discovery_cities
*/
import { Pool } from 'pg';
import axios from 'axios';
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import type { Browser, Page } from 'puppeteer';
puppeteer.use(StealthPlugin());
// ============================================================
// TYPES
// ============================================================

/** A city as discovered from Dutchie (via scrape or API fallback). */
export interface DutchieCity {
  name: string; // display name, e.g. "Hudson"
  slug: string; // URL slug used in dutchie.com paths
  stateCode: string | null; // 2-letter state/province code, null when unknown
  countryCode: string; // 'US' or 'CA'
  url?: string; // source link the city was extracted from, when available
}

/** Summary of a single discovery run (see DutchieCityDiscovery.run). */
export interface CityDiscoveryResult {
  citiesFound: number;
  citiesInserted: number;
  citiesUpdated: number;
  errors: string[];
  durationMs: number;
}

// ============================================================
// US STATE CODE MAPPING
// ============================================================
// Keys are the lowercase, hyphenated slugs Dutchie uses in /city/{state}/{city} URLs.
const US_STATE_MAP: Record<string, string> = {
  'alabama': 'AL', 'alaska': 'AK', 'arizona': 'AZ', 'arkansas': 'AR',
  'california': 'CA', 'colorado': 'CO', 'connecticut': 'CT', 'delaware': 'DE',
  'florida': 'FL', 'georgia': 'GA', 'hawaii': 'HI', 'idaho': 'ID',
  'illinois': 'IL', 'indiana': 'IN', 'iowa': 'IA', 'kansas': 'KS',
  'kentucky': 'KY', 'louisiana': 'LA', 'maine': 'ME', 'maryland': 'MD',
  'massachusetts': 'MA', 'michigan': 'MI', 'minnesota': 'MN', 'mississippi': 'MS',
  'missouri': 'MO', 'montana': 'MT', 'nebraska': 'NE', 'nevada': 'NV',
  'new-hampshire': 'NH', 'new-jersey': 'NJ', 'new-mexico': 'NM', 'new-york': 'NY',
  'north-carolina': 'NC', 'north-dakota': 'ND', 'ohio': 'OH', 'oklahoma': 'OK',
  'oregon': 'OR', 'pennsylvania': 'PA', 'rhode-island': 'RI', 'south-carolina': 'SC',
  'south-dakota': 'SD', 'tennessee': 'TN', 'texas': 'TX', 'utah': 'UT',
  'vermont': 'VT', 'virginia': 'VA', 'washington': 'WA', 'west-virginia': 'WV',
  'wisconsin': 'WI', 'wyoming': 'WY', 'district-of-columbia': 'DC',
};

// Canadian province mapping (same slug convention as US_STATE_MAP).
const CA_PROVINCE_MAP: Record<string, string> = {
  'alberta': 'AB', 'british-columbia': 'BC', 'manitoba': 'MB',
  'new-brunswick': 'NB', 'newfoundland-and-labrador': 'NL',
  'northwest-territories': 'NT', 'nova-scotia': 'NS', 'nunavut': 'NU',
  'ontario': 'ON', 'prince-edward-island': 'PE', 'quebec': 'QC',
  'saskatchewan': 'SK', 'yukon': 'YT',
};
// ============================================================
// CITY FETCHING
// ============================================================
/**
 * Fetch cities from Dutchie's /cities page using Puppeteer to extract data.
 *
 * Launches a stealth headless Chrome, loads https://dutchie.com/cities, and
 * collects every anchor matching /city/{state}/{city}. The state slug is then
 * mapped to a 2-letter code via US_STATE_MAP / CA_PROVINCE_MAP to derive the
 * country. The browser is always closed, even on failure.
 */
async function fetchCitiesFromDutchie(): Promise<DutchieCity[]> {
  console.log('[DutchieCityDiscovery] Launching browser to fetch cities...');
  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
  });
  try {
    const page = await browser.newPage();
    // Plain desktop-Chrome UA to reduce the chance of bot blocking.
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    );
    // Navigate to cities page
    console.log('[DutchieCityDiscovery] Navigating to https://dutchie.com/cities...');
    await page.goto('https://dutchie.com/cities', {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });
    // Fixed 3s settle time for client-side rendering after network idle.
    await new Promise((r) => setTimeout(r, 3000));
    // Extract city links from the page (this callback runs in the browser context,
    // so it can only use DOM APIs — no access to Node scope).
    const cities = await page.evaluate(() => {
      const cityLinks: Array<{
        name: string;
        slug: string;
        url: string;
        stateSlug: string | null;
      }> = [];
      // Find all city links - they typically follow pattern /city/{state}/{city}
      const links = document.querySelectorAll('a[href*="/city/"]');
      links.forEach((link) => {
        const href = (link as HTMLAnchorElement).href;
        const text = (link as HTMLElement).innerText?.trim();
        // Parse URL: https://dutchie.com/city/{state}/{city}
        const match = href.match(/\/city\/([^/]+)\/([^/?]+)/);
        if (match && text) {
          cityLinks.push({
            name: text,
            slug: match[2],
            url: href,
            stateSlug: match[1],
          });
        }
      });
      return cityLinks;
    });
    console.log(`[DutchieCityDiscovery] Extracted ${cities.length} city links from page`);
    // Convert to DutchieCity format
    const result: DutchieCity[] = [];
    for (const city of cities) {
      // Determine country and state code; defaults to US with unknown state.
      let countryCode = 'US';
      let stateCode: string | null = null;
      if (city.stateSlug) {
        // Check if it's a US state
        if (US_STATE_MAP[city.stateSlug]) {
          stateCode = US_STATE_MAP[city.stateSlug];
          countryCode = 'US';
        }
        // Check if it's a Canadian province
        else if (CA_PROVINCE_MAP[city.stateSlug]) {
          stateCode = CA_PROVINCE_MAP[city.stateSlug];
          countryCode = 'CA';
        }
        // Check if it's already a 2-letter code
        else if (city.stateSlug.length === 2) {
          stateCode = city.stateSlug.toUpperCase();
          // Determine country: a 2-letter code matching a Canadian province is CA.
          if (Object.values(CA_PROVINCE_MAP).includes(stateCode)) {
            countryCode = 'CA';
          }
        }
      }
      result.push({
        name: city.name,
        slug: city.slug,
        stateCode,
        countryCode,
        url: city.url,
      });
    }
    return result;
  } finally {
    await browser.close();
  }
}
/**
 * Alternative city discovery via plain HTTP: probe a few likely JSON
 * endpoints and normalize whatever array comes back. Used as a fallback
 * when browser scraping yields nothing. Returns [] when every probe fails.
 */
async function fetchCitiesFromAPI(): Promise<DutchieCity[]> {
  console.log('[DutchieCityDiscovery] Attempting API-based city discovery...');
  // Dutchie may have an API endpoint for cities
  // Try common patterns
  const candidateEndpoints = [
    'https://dutchie.com/api/cities',
    'https://api.dutchie.com/v1/cities',
  ];
  const requestConfig = {
    headers: {
      'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0',
      Accept: 'application/json',
    },
    timeout: 15000,
  };
  for (const endpoint of candidateEndpoints) {
    try {
      const response = await axios.get(endpoint, requestConfig);
      const payload = response.data;
      if (payload && Array.isArray(payload)) {
        console.log(`[DutchieCityDiscovery] API returned ${payload.length} cities`);
        // Field names vary by endpoint; accept the common aliases.
        return payload.map((c: any) => ({
          name: c.name || c.city,
          slug: c.slug || c.citySlug,
          stateCode: c.stateCode || c.state,
          countryCode: c.countryCode || c.country || 'US',
        }));
      }
    } catch (error: any) {
      console.log(`[DutchieCityDiscovery] API ${endpoint} failed: ${error.message}`);
    }
  }
  return [];
}
// ============================================================
// DATABASE OPERATIONS
// ============================================================
/**
 * Upsert a city into dutchie_discovery_cities.
 *
 * Deliberately does NOT touch last_crawled_at: that column is owned by the
 * location-discovery pass, which stamps it only after a city's stores have
 * actually been crawled. Stamping NOW() here (at discovery time) would make
 * every city look crawled — breaking the neverCrawled stat and the
 * "least-recently-crawled first" ordering used by DutchieLocationDiscovery.
 *
 * @returns inserted=true for a brand-new row, updated=true when an existing
 *          row was refreshed (exactly one of the two is true)
 */
async function upsertCity(
  pool: Pool,
  city: DutchieCity
): Promise<{ inserted: boolean; updated: boolean }> {
  // NOTE(review): ON CONFLICT infers a unique index over
  // (platform, country_code, state_code, city_slug). If state_code can be
  // NULL, Postgres treats NULLs as distinct unless the index was created
  // with NULLS NOT DISTINCT — confirm the schema, otherwise cities with an
  // unknown state can accumulate duplicates.
  const result = await pool.query(
    `
    INSERT INTO dutchie_discovery_cities (
      platform,
      city_name,
      city_slug,
      state_code,
      country_code,
      updated_at
    ) VALUES (
      'dutchie',
      $1,
      $2,
      $3,
      $4,
      NOW()
    )
    ON CONFLICT (platform, country_code, state_code, city_slug)
    DO UPDATE SET
      city_name = EXCLUDED.city_name,
      updated_at = NOW()
    RETURNING (xmax = 0) AS inserted
    `,
    [city.name, city.slug, city.stateCode, city.countryCode]
  );
  // xmax = 0 only holds for freshly inserted tuples; a non-zero xmax means
  // the ON CONFLICT UPDATE path fired.
  const inserted = result.rows[0]?.inserted === true;
  return { inserted, updated: !inserted };
}
// ============================================================
// MAIN DISCOVERY FUNCTION
// ============================================================
export class DutchieCityDiscovery {
  private pool: Pool;

  constructor(pool: Pool) {
    this.pool = pool;
  }

  /**
   * Run the city discovery process.
   *
   * Scrapes dutchie.com/cities first; if that yields nothing (e.g. the page
   * is bot-blocked), falls back to probing JSON API endpoints. Each city is
   * upserted individually so a single failure doesn't abort the whole run —
   * per-city errors are collected and returned instead.
   *
   * @returns counts of found/inserted/updated cities, errors, and duration
   */
  async run(): Promise<CityDiscoveryResult> {
    const startTime = Date.now();
    const errors: string[] = [];
    let citiesFound = 0;
    let citiesInserted = 0;
    let citiesUpdated = 0;
    console.log('[DutchieCityDiscovery] Starting city discovery...');
    try {
      // Try scraping first, fall back to API
      let cities = await fetchCitiesFromDutchie();
      if (cities.length === 0) {
        console.log('[DutchieCityDiscovery] Scraping returned 0 cities, trying API...');
        cities = await fetchCitiesFromAPI();
      }
      citiesFound = cities.length;
      console.log(`[DutchieCityDiscovery] Found ${citiesFound} cities`);
      // Upsert each city; a failed upsert is logged and recorded, not fatal.
      for (const city of cities) {
        try {
          const result = await upsertCity(this.pool, city);
          if (result.inserted) {
            citiesInserted++;
          } else if (result.updated) {
            citiesUpdated++;
          }
        } catch (error: any) {
          const msg = `Failed to upsert city ${city.slug}: ${error.message}`;
          console.error(`[DutchieCityDiscovery] ${msg}`);
          errors.push(msg);
        }
      }
    } catch (error: any) {
      const msg = `City discovery failed: ${error.message}`;
      console.error(`[DutchieCityDiscovery] ${msg}`);
      errors.push(msg);
    }
    const durationMs = Date.now() - startTime;
    console.log('[DutchieCityDiscovery] Discovery complete:');
    console.log(`  Cities found: ${citiesFound}`);
    console.log(`  Inserted: ${citiesInserted}`);
    console.log(`  Updated: ${citiesUpdated}`);
    console.log(`  Errors: ${errors.length}`);
    console.log(`  Duration: ${(durationMs / 1000).toFixed(1)}s`);
    return {
      citiesFound,
      citiesInserted,
      citiesUpdated,
      errors,
      durationMs,
    };
  }

  /**
   * Get statistics about discovered cities.
   *
   * All queries are scoped to platform = 'dutchie', matching
   * DtCityDiscoveryService.getStats — without the filter, rows from other
   * platforms sharing the table would skew every count.
   */
  async getStats(): Promise<{
    total: number;
    byCountry: Array<{ countryCode: string; count: number }>;
    byState: Array<{ stateCode: string; countryCode: string; count: number }>;
    crawlEnabled: number;
    neverCrawled: number;
  }> {
    const [totalRes, byCountryRes, byStateRes, enabledRes, neverRes] = await Promise.all([
      this.pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_cities WHERE platform = \'dutchie\''),
      this.pool.query(`
        SELECT country_code, COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE platform = 'dutchie'
        GROUP BY country_code
        ORDER BY cnt DESC
      `),
      this.pool.query(`
        SELECT state_code, country_code, COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE platform = 'dutchie' AND state_code IS NOT NULL
        GROUP BY state_code, country_code
        ORDER BY cnt DESC
      `),
      this.pool.query(`
        SELECT COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE platform = 'dutchie' AND crawl_enabled = TRUE
      `),
      this.pool.query(`
        SELECT COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE platform = 'dutchie' AND last_crawled_at IS NULL
      `),
    ]);
    // node-postgres returns COUNT(*) as a string, hence the parseInt calls.
    return {
      total: parseInt(totalRes.rows[0]?.cnt || '0', 10),
      byCountry: byCountryRes.rows.map((r) => ({
        countryCode: r.country_code,
        count: parseInt(r.cnt, 10),
      })),
      byState: byStateRes.rows.map((r) => ({
        stateCode: r.state_code,
        countryCode: r.country_code,
        count: parseInt(r.cnt, 10),
      })),
      crawlEnabled: parseInt(enabledRes.rows[0]?.cnt || '0', 10),
      neverCrawled: parseInt(neverRes.rows[0]?.cnt || '0', 10),
    };
  }
}
export default DutchieCityDiscovery;

View File

@@ -0,0 +1,639 @@
/**
* DutchieLocationDiscovery
*
* Discovers store locations for each city from Dutchie and upserts to dutchie_discovery_locations.
*
* Responsibilities:
* - Given a dutchie_discovery_cities row, call Dutchie's location/search endpoint
* - For each store: extract platform_location_id, platform_slug, platform_menu_url, name, address, coords
* - Upsert into dutchie_discovery_locations
* - DO NOT overwrite status if already verified/merged/rejected
* - DO NOT overwrite dispensary_id if already set
*/
import { Pool } from 'pg';
import axios from 'axios';
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
puppeteer.use(StealthPlugin());
// ============================================================
// TYPES
// ============================================================

/** A dutchie_discovery_cities row, camelCased for use in this module. */
export interface DiscoveryCity {
  id: number;
  platform: string;
  cityName: string;
  citySlug: string;
  stateCode: string | null;
  countryCode: string;
  crawlEnabled: boolean;
}

/**
 * A store location as discovered from Dutchie.
 * Nullable fields mean "unknown from this source"; the upsert uses COALESCE
 * so a null here never clobbers a previously-known value in the database.
 */
export interface DutchieLocation {
  platformLocationId: string; // Dutchie's ID; may temporarily hold the slug (DOM-scrape path)
  platformSlug: string;
  platformMenuUrl: string;
  name: string;
  rawAddress: string | null; // single-line display address, assembled or scraped
  addressLine1: string | null;
  addressLine2: string | null;
  city: string | null;
  stateCode: string | null;
  postalCode: string | null;
  countryCode: string | null;
  latitude: number | null;
  longitude: number | null;
  timezone: string | null;
  offersDelivery: boolean | null;
  offersPickup: boolean | null;
  isRecreational: boolean | null;
  isMedical: boolean | null;
  metadata: Record<string, any>; // provenance + raw payload, stored as JSONB
}

/** Per-city summary returned by DutchieLocationDiscovery.discoverForCity. */
export interface LocationDiscoveryResult {
  cityId: number;
  citySlug: string;
  locationsFound: number;
  locationsInserted: number;
  locationsUpdated: number;
  locationsSkipped: number; // rows left alone because status is verified/merged/rejected
  errors: string[];
  durationMs: number;
}
// ============================================================
// LOCATION FETCHING
// ============================================================
/**
 * Fetch locations for a city using Puppeteer to scrape the city page.
 *
 * Strategy: load https://dutchie.com/us/dispensaries/{city_slug}, prefer the
 * structured __NEXT_DATA__ payload when present, and fall back to scraping
 * dispensary cards out of the DOM. The browser is always closed, even on
 * failure.
 */
async function fetchLocationsForCity(city: DiscoveryCity): Promise<DutchieLocation[]> {
  console.log(`[DutchieLocationDiscovery] Fetching locations for ${city.cityName}, ${city.stateCode}...`);
  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
  });
  try {
    const page = await browser.newPage();
    // Plain desktop-Chrome UA to reduce the chance of bot blocking.
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    );
    // Navigate to city page - use /us/dispensaries/{city_slug} pattern
    // NOTE(review): the /us/ segment is hard-coded — presumably wrong for
    // countryCode === 'CA' cities; confirm against Dutchie's CA URL scheme.
    const cityUrl = `https://dutchie.com/us/dispensaries/${city.citySlug}`;
    console.log(`[DutchieLocationDiscovery] Navigating to ${cityUrl}...`);
    await page.goto(cityUrl, {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });
    // Fixed 3s settle time for client-side rendering after network idle.
    await new Promise((r) => setTimeout(r, 3000));
    // Try to extract __NEXT_DATA__ which often contains store data
    const nextData = await page.evaluate(() => {
      const script = document.querySelector('script#__NEXT_DATA__');
      if (script) {
        try {
          return JSON.parse(script.textContent || '{}');
        } catch {
          return null;
        }
      }
      return null;
    });
    let locations: DutchieLocation[] = [];
    if (nextData?.props?.pageProps?.dispensaries) {
      // Preferred path: structured data straight from Next.js.
      const dispensaries = nextData.props.pageProps.dispensaries;
      console.log(`[DutchieLocationDiscovery] Found ${dispensaries.length} dispensaries in __NEXT_DATA__`);
      locations = dispensaries.map((d: any) => parseDispensaryData(d, city));
    } else {
      // Fall back to DOM scraping (runs in the browser context).
      console.log('[DutchieLocationDiscovery] No __NEXT_DATA__, trying DOM scraping...');
      const scrapedData = await page.evaluate(() => {
        const stores: Array<{
          name: string;
          href: string;
          address: string | null;
        }> = [];
        // Look for dispensary cards/links
        const cards = document.querySelectorAll('[data-testid="dispensary-card"], .dispensary-card, a[href*="/dispensary/"]');
        cards.forEach((card) => {
          // The card may itself be the anchor, or contain one.
          const link = card.querySelector('a[href*="/dispensary/"]') || (card as HTMLAnchorElement);
          const href = (link as HTMLAnchorElement).href || '';
          const name =
            card.querySelector('[data-testid="dispensary-name"]')?.textContent ||
            card.querySelector('h2, h3, .name')?.textContent ||
            link.textContent ||
            '';
          const address = card.querySelector('[data-testid="dispensary-address"], .address')?.textContent || null;
          if (href && name) {
            stores.push({
              name: name.trim(),
              href,
              address: address?.trim() || null,
            });
          }
        });
        return stores;
      });
      console.log(`[DutchieLocationDiscovery] DOM scraping found ${scrapedData.length} stores`);
      locations = scrapedData.map((s) => {
        // Parse slug from URL; fall back to a slugified store name.
        const match = s.href.match(/\/dispensary\/([^/?]+)/);
        const slug = match ? match[1] : s.name.toLowerCase().replace(/\s+/g, '-');
        return {
          platformLocationId: slug, // Will be resolved later
          platformSlug: slug,
          platformMenuUrl: `https://dutchie.com/dispensary/${slug}`,
          name: s.name,
          rawAddress: s.address,
          addressLine1: null,
          addressLine2: null,
          city: city.cityName,
          stateCode: city.stateCode,
          postalCode: null,
          countryCode: city.countryCode,
          latitude: null,
          longitude: null,
          timezone: null,
          offersDelivery: null,
          offersPickup: null,
          isRecreational: null,
          isMedical: null,
          metadata: { source: 'dom_scrape', originalUrl: s.href },
        };
      });
    }
    return locations;
  } finally {
    await browser.close();
  }
}
/**
 * Parse a single dispensary record from Dutchie's __NEXT_DATA__ / API JSON
 * into a DutchieLocation, falling back to the discovery city for missing
 * address fields. Field names vary across Dutchie payload versions, hence
 * the chains of alternatives.
 *
 * @param d raw dispensary object from the page payload (shape varies)
 * @param city the discovery city providing fallback city/state/country
 */
function parseDispensaryData(d: any, city: DiscoveryCity): DutchieLocation {
  const id = d.id || d._id || d.dispensaryId || '';
  const slug = d.slug || d.cName || d.name?.toLowerCase().replace(/\s+/g, '-') || '';
  // Build menu URL — prefer an explicit menu URL from the payload.
  let menuUrl = `https://dutchie.com/dispensary/${slug}`;
  if (d.menuUrl) {
    menuUrl = d.menuUrl;
  } else if (d.embeddedMenuUrl) {
    menuUrl = d.embeddedMenuUrl;
  }
  // Parse address: line1, line2, then "city state zip" joined with spaces.
  const address = d.address || d.location?.address || {};
  const rawAddress = [
    address.line1 || address.street1 || d.address1,
    address.line2 || address.street2 || d.address2,
    [
      address.city || d.city,
      address.state || address.stateCode || d.state,
      address.zip || address.zipCode || address.postalCode || d.zip,
    ]
      .filter(Boolean)
      .join(' '),
  ]
    .filter(Boolean)
    .join(', ');
  // retailType is only trusted when present. When it's absent the flags stay
  // null ("unknown") so the upsert's COALESCE keeps any existing value —
  // previously an absent retailType produced a spurious `false` that would
  // overwrite known rec/med flags in the database.
  const retailType = d.retailType ?? null;
  return {
    platformLocationId: id,
    platformSlug: slug,
    platformMenuUrl: menuUrl,
    name: d.name || d.dispensaryName || '',
    rawAddress: rawAddress || null,
    addressLine1: address.line1 || address.street1 || d.address1 || null,
    addressLine2: address.line2 || address.street2 || d.address2 || null,
    city: address.city || d.city || city.cityName,
    stateCode: address.state || address.stateCode || d.state || city.stateCode,
    postalCode: address.zip || address.zipCode || address.postalCode || d.zip || null,
    countryCode: address.country || address.countryCode || d.country || city.countryCode,
    latitude: d.latitude ?? d.location?.latitude ?? d.location?.lat ?? null,
    longitude: d.longitude ?? d.location?.longitude ?? d.location?.lng ?? null,
    timezone: d.timezone || d.timeZone || null,
    offersDelivery: d.offerDelivery ?? d.offersDelivery ?? d.delivery ?? null,
    offersPickup: d.offerPickup ?? d.offersPickup ?? d.pickup ?? null,
    isRecreational:
      d.isRecreational ?? d.recreational ??
      (retailType !== null ? retailType === 'recreational' || retailType === 'both' : null),
    isMedical:
      d.isMedical ?? d.medical ??
      (retailType !== null ? retailType === 'medical' || retailType === 'both' : null),
    metadata: {
      source: 'next_data',
      retailType: d.retailType,
      brand: d.brand,
      logo: d.logo || d.logoUrl,
      raw: d,
    },
  };
}
/**
 * Placeholder for a GraphQL-based location lookup.
 *
 * A geo search would need the city's coordinates, which are not stored yet,
 * so this currently logs the attempt and resolves to an empty list; callers
 * treat that as "fall back to page scraping".
 */
async function fetchLocationsViaGraphQL(city: DiscoveryCity): Promise<DutchieLocation[]> {
  console.log(`[DutchieLocationDiscovery] Trying GraphQL for ${city.cityName}...`);
  return [];
}
// ============================================================
// DATABASE OPERATIONS
// ============================================================
/**
 * Upsert a location into dutchie_discovery_locations.
 *
 * Three outcomes:
 *  - skipped: the row exists with a protected status (verified/merged/rejected);
 *    only last_seen_at is refreshed so human decisions are never overwritten.
 *  - updated: the row exists in a non-protected status; fields are refreshed,
 *    with COALESCE so a null from this crawl never clobbers known data, and
 *    dispensary_id is left untouched.
 *  - inserted: no row existed; a new one is created with status 'discovered'.
 *
 * Note: the check-then-write is not atomic; concurrent discovery runs could
 * race on the same platform_location_id.
 */
async function upsertLocation(
  pool: Pool,
  location: DutchieLocation,
  cityId: number
): Promise<{ inserted: boolean; updated: boolean; skipped: boolean }> {
  // First check if this location exists and has a protected status
  const existing = await pool.query(
    `
    SELECT id, status, dispensary_id
    FROM dutchie_discovery_locations
    WHERE platform = 'dutchie' AND platform_location_id = $1
    `,
    [location.platformLocationId]
  );
  if (existing.rows.length > 0) {
    const row = existing.rows[0];
    const protectedStatuses = ['verified', 'merged', 'rejected'];
    if (protectedStatuses.includes(row.status)) {
      // Only update last_seen_at for protected statuses
      await pool.query(
        `
        UPDATE dutchie_discovery_locations
        SET last_seen_at = NOW(), updated_at = NOW()
        WHERE id = $1
        `,
        [row.id]
      );
      return { inserted: false, updated: false, skipped: true };
    }
    // Update existing discovered location (but preserve dispensary_id if set).
    // COALESCE keeps the stored value whenever this crawl produced null.
    await pool.query(
      `
      UPDATE dutchie_discovery_locations
      SET
        platform_slug = $2,
        platform_menu_url = $3,
        name = $4,
        raw_address = COALESCE($5, raw_address),
        address_line1 = COALESCE($6, address_line1),
        address_line2 = COALESCE($7, address_line2),
        city = COALESCE($8, city),
        state_code = COALESCE($9, state_code),
        postal_code = COALESCE($10, postal_code),
        country_code = COALESCE($11, country_code),
        latitude = COALESCE($12, latitude),
        longitude = COALESCE($13, longitude),
        timezone = COALESCE($14, timezone),
        offers_delivery = COALESCE($15, offers_delivery),
        offers_pickup = COALESCE($16, offers_pickup),
        is_recreational = COALESCE($17, is_recreational),
        is_medical = COALESCE($18, is_medical),
        metadata = COALESCE($19, metadata),
        discovery_city_id = $20,
        last_seen_at = NOW(),
        updated_at = NOW()
      WHERE id = $1
      `,
      [
        row.id,
        location.platformSlug,
        location.platformMenuUrl,
        location.name,
        location.rawAddress,
        location.addressLine1,
        location.addressLine2,
        location.city,
        location.stateCode,
        location.postalCode,
        location.countryCode,
        location.latitude,
        location.longitude,
        location.timezone,
        location.offersDelivery,
        location.offersPickup,
        location.isRecreational,
        location.isMedical,
        JSON.stringify(location.metadata),
        cityId,
      ]
    );
    return { inserted: false, updated: true, skipped: false };
  }
  // Insert new location with status 'discovered'
  await pool.query(
    `
    INSERT INTO dutchie_discovery_locations (
      platform,
      platform_location_id,
      platform_slug,
      platform_menu_url,
      name,
      raw_address,
      address_line1,
      address_line2,
      city,
      state_code,
      postal_code,
      country_code,
      latitude,
      longitude,
      timezone,
      status,
      offers_delivery,
      offers_pickup,
      is_recreational,
      is_medical,
      metadata,
      discovery_city_id,
      first_seen_at,
      last_seen_at,
      active,
      created_at,
      updated_at
    ) VALUES (
      'dutchie',
      $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14,
      'discovered',
      $15, $16, $17, $18, $19, $20,
      NOW(), NOW(), TRUE, NOW(), NOW()
    )
    `,
    [
      location.platformLocationId,
      location.platformSlug,
      location.platformMenuUrl,
      location.name,
      location.rawAddress,
      location.addressLine1,
      location.addressLine2,
      location.city,
      location.stateCode,
      location.postalCode,
      location.countryCode,
      location.latitude,
      location.longitude,
      location.timezone,
      location.offersDelivery,
      location.offersPickup,
      location.isRecreational,
      location.isMedical,
      JSON.stringify(location.metadata),
      cityId,
    ]
  );
  return { inserted: true, updated: false, skipped: false };
}
// ============================================================
// MAIN DISCOVERY CLASS
// ============================================================
export class DutchieLocationDiscovery {
  private pool: Pool;

  constructor(pool: Pool) {
    this.pool = pool;
  }

  /**
   * Look up a single discovery city by its slug.
   * @returns the city row (camelCased) or null when not found
   */
  async getCityBySlug(citySlug: string): Promise<DiscoveryCity | null> {
    const { rows } = await this.pool.query(
      `
      SELECT id, platform, city_name, city_slug, state_code, country_code, crawl_enabled
      FROM dutchie_discovery_cities
      WHERE platform = 'dutchie' AND city_slug = $1
      LIMIT 1
      `,
      [citySlug]
    );
    if (rows.length === 0) return null;
    const r = rows[0];
    return {
      id: r.id,
      platform: r.platform,
      cityName: r.city_name,
      citySlug: r.city_slug,
      stateCode: r.state_code,
      countryCode: r.country_code,
      crawlEnabled: r.crawl_enabled,
    };
  }

  /**
   * Get all crawl-enabled cities, least-recently-crawled first
   * (never-crawled cities sort to the front via NULLS FIRST).
   *
   * @param limit optional cap on the number of cities returned
   */
  async getEnabledCities(limit?: number): Promise<DiscoveryCity[]> {
    // LIMIT is a bind parameter instead of string interpolation: in Postgres
    // LIMIT NULL means "no limit", and `limit || null` preserves the old
    // behavior where a falsy limit returned every enabled city.
    const { rows } = await this.pool.query(
      `
      SELECT id, platform, city_name, city_slug, state_code, country_code, crawl_enabled
      FROM dutchie_discovery_cities
      WHERE platform = 'dutchie' AND crawl_enabled = TRUE
      ORDER BY last_crawled_at ASC NULLS FIRST, city_name ASC
      LIMIT $1
      `,
      [limit || null]
    );
    return rows.map((r) => ({
      id: r.id,
      platform: r.platform,
      cityName: r.city_name,
      citySlug: r.city_slug,
      stateCode: r.state_code,
      countryCode: r.country_code,
      crawlEnabled: r.crawl_enabled,
    }));
  }

  /**
   * Discover locations for a single city.
   *
   * Fetches via page scraping (GraphQL fallback), upserts each location
   * (collecting per-location errors instead of aborting), then stamps the
   * city's last_crawled_at / location_count.
   */
  async discoverForCity(city: DiscoveryCity): Promise<LocationDiscoveryResult> {
    const startTime = Date.now();
    const errors: string[] = [];
    let locationsFound = 0;
    let locationsInserted = 0;
    let locationsUpdated = 0;
    let locationsSkipped = 0;
    console.log(`[DutchieLocationDiscovery] Discovering locations for ${city.cityName}, ${city.stateCode}...`);
    try {
      // Fetch locations
      let locations = await fetchLocationsForCity(city);
      // If scraping fails, try GraphQL
      if (locations.length === 0) {
        locations = await fetchLocationsViaGraphQL(city);
      }
      locationsFound = locations.length;
      console.log(`[DutchieLocationDiscovery] Found ${locationsFound} locations`);
      // Upsert each location; a failed upsert is recorded, not fatal.
      for (const location of locations) {
        try {
          const result = await upsertLocation(this.pool, location, city.id);
          if (result.inserted) locationsInserted++;
          else if (result.updated) locationsUpdated++;
          else if (result.skipped) locationsSkipped++;
        } catch (error: any) {
          const msg = `Failed to upsert location ${location.platformSlug}: ${error.message}`;
          console.error(`[DutchieLocationDiscovery] ${msg}`);
          errors.push(msg);
        }
      }
      // Update city's last_crawled_at and location_count
      await this.pool.query(
        `
        UPDATE dutchie_discovery_cities
        SET last_crawled_at = NOW(),
            location_count = $1,
            updated_at = NOW()
        WHERE id = $2
        `,
        [locationsFound, city.id]
      );
    } catch (error: any) {
      const msg = `Location discovery failed for ${city.citySlug}: ${error.message}`;
      console.error(`[DutchieLocationDiscovery] ${msg}`);
      errors.push(msg);
    }
    const durationMs = Date.now() - startTime;
    console.log(`[DutchieLocationDiscovery] City ${city.citySlug} complete:`);
    console.log(`  Locations found: ${locationsFound}`);
    console.log(`  Inserted: ${locationsInserted}`);
    console.log(`  Updated: ${locationsUpdated}`);
    console.log(`  Skipped (protected): ${locationsSkipped}`);
    console.log(`  Errors: ${errors.length}`);
    console.log(`  Duration: ${(durationMs / 1000).toFixed(1)}s`);
    return {
      cityId: city.id,
      citySlug: city.citySlug,
      locationsFound,
      locationsInserted,
      locationsUpdated,
      locationsSkipped,
      errors,
      durationMs,
    };
  }

  /**
   * Discover locations for all enabled cities, sequentially, with a polite
   * delay between cities.
   *
   * @param options.limit   cap the number of cities processed
   * @param options.delayMs pause between cities (default 2000ms)
   */
  async discoverAllEnabled(options: {
    limit?: number;
    delayMs?: number;
  } = {}): Promise<{
    totalCities: number;
    totalLocationsFound: number;
    totalInserted: number;
    totalUpdated: number;
    totalSkipped: number;
    errors: string[];
    durationMs: number;
  }> {
    const { limit, delayMs = 2000 } = options;
    const startTime = Date.now();
    let totalLocationsFound = 0;
    let totalInserted = 0;
    let totalUpdated = 0;
    let totalSkipped = 0;
    const allErrors: string[] = [];
    const cities = await this.getEnabledCities(limit);
    console.log(`[DutchieLocationDiscovery] Discovering locations for ${cities.length} cities...`);
    for (let i = 0; i < cities.length; i++) {
      const city = cities[i];
      console.log(`\n[DutchieLocationDiscovery] City ${i + 1}/${cities.length}: ${city.cityName}, ${city.stateCode}`);
      try {
        const result = await this.discoverForCity(city);
        totalLocationsFound += result.locationsFound;
        totalInserted += result.locationsInserted;
        totalUpdated += result.locationsUpdated;
        totalSkipped += result.locationsSkipped;
        allErrors.push(...result.errors);
      } catch (error: any) {
        allErrors.push(`City ${city.citySlug} failed: ${error.message}`);
      }
      // Delay between cities (skipped after the last one)
      if (i < cities.length - 1 && delayMs > 0) {
        await new Promise((r) => setTimeout(r, delayMs));
      }
    }
    const durationMs = Date.now() - startTime;
    console.log('\n[DutchieLocationDiscovery] All cities complete:');
    console.log(`  Total cities: ${cities.length}`);
    console.log(`  Total locations found: ${totalLocationsFound}`);
    console.log(`  Total inserted: ${totalInserted}`);
    console.log(`  Total updated: ${totalUpdated}`);
    console.log(`  Total skipped: ${totalSkipped}`);
    console.log(`  Total errors: ${allErrors.length}`);
    console.log(`  Duration: ${(durationMs / 1000).toFixed(1)}s`);
    return {
      totalCities: cities.length,
      totalLocationsFound,
      totalInserted,
      totalUpdated,
      totalSkipped,
      errors: allErrors,
      durationMs,
    };
  }
}
export default DutchieLocationDiscovery;

View File

@@ -0,0 +1,73 @@
#!/usr/bin/env npx tsx
/**
* Discovery Entrypoint: Dutchie Cities (Auto)
*
* Attempts browser/API-based /cities discovery.
* Even if currently blocked (403), this runner preserves the auto-discovery path.
*
* Usage:
* npm run discovery:dt:cities:auto
* DATABASE_URL="..." npx tsx src/dutchie-az/discovery/discovery-dt-cities-auto.ts
*/
import { Pool } from 'pg';
import { DtCityDiscoveryService } from './DtCityDiscoveryService';
// Connection string resolution: DATABASE_URL wins, then CANNAIQ_DB_URL,
// then a local-dev default.
// NOTE(review): the fallback embeds local-dev credentials; acceptable for a
// dev default, but confirm production deployments always set DATABASE_URL.
const DB_URL = process.env.DATABASE_URL || process.env.CANNAIQ_DB_URL ||
  'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';
/**
 * Entry point: connects to Postgres, runs auto city discovery, prints a
 * summary plus current DB stats, and sets the process exit code
 * (1 when discovery fails or finds nothing, 0 otherwise).
 *
 * Uses process.exitCode instead of process.exit(): process.exit() terminates
 * immediately and would skip the `finally` block, leaving the pg pool open.
 */
async function main() {
  console.log('╔══════════════════════════════════════════════════╗');
  console.log('║ Dutchie City Discovery (AUTO) ║');
  console.log('║ Browser + API fallback ║');
  console.log('╚══════════════════════════════════════════════════╝');
  // Mask the password when echoing the connection string.
  console.log(`\nDatabase: ${DB_URL.replace(/:[^:@]+@/, ':****@')}`);
  const pool = new Pool({ connectionString: DB_URL });
  try {
    // Cheap connectivity check before kicking off discovery.
    const { rows } = await pool.query('SELECT NOW() as time');
    console.log(`Connected at: ${rows[0].time}\n`);
    const service = new DtCityDiscoveryService(pool);
    const result = await service.runAutoDiscovery();
    console.log('\n' + '═'.repeat(50));
    console.log('SUMMARY');
    console.log('═'.repeat(50));
    console.log(`Cities found: ${result.citiesFound}`);
    console.log(`Cities inserted: ${result.citiesInserted}`);
    console.log(`Cities updated: ${result.citiesUpdated}`);
    console.log(`Errors: ${result.errors.length}`);
    console.log(`Duration: ${(result.durationMs / 1000).toFixed(1)}s`);
    if (result.errors.length > 0) {
      console.log('\nErrors:');
      result.errors.forEach((e, i) => console.log(` ${i + 1}. ${e}`));
    }
    const stats = await service.getStats();
    console.log('\nCurrent Database Stats:');
    console.log(` Total cities: ${stats.total}`);
    console.log(` Crawl enabled: ${stats.crawlEnabled}`);
    console.log(` Never crawled: ${stats.neverCrawled}`);
    if (result.citiesFound === 0) {
      console.log('\n⚠ No cities found via auto-discovery.');
      console.log(' This may be due to Dutchie blocking scraping/API access.');
      console.log(' Use manual seeding instead:');
      console.log(' npm run discovery:dt:cities:manual -- --city-slug=ny-hudson --city-name=Hudson --state-code=NY');
      process.exitCode = 1;
      return;
    }
    console.log('\n✅ Auto city discovery completed');
  } catch (error: any) {
    console.error('\n❌ Auto city discovery failed:', error.message);
    process.exitCode = 1;
  } finally {
    // Runs even on the early-return / error paths, so connections are
    // always released and the process can exit naturally.
    await pool.end();
  }
}
main();

View File

@@ -0,0 +1,137 @@
#!/usr/bin/env npx tsx
/**
* Discovery Entrypoint: Dutchie Cities (Manual Seed)
*
* Manually seeds cities into dutchie_discovery_cities via CLI args.
* Use this when auto-discovery is blocked (403).
*
* Usage:
* npm run discovery:dt:cities:manual -- --city-slug=ny-hudson --city-name=Hudson --state-code=NY
* npm run discovery:dt:cities:manual -- --city-slug=ma-boston --city-name=Boston --state-code=MA --country-code=US
*
* Options:
* --city-slug Required. URL slug (e.g., "ny-hudson")
* --city-name Required. Display name (e.g., "Hudson")
* --state-code Required. State/province code (e.g., "NY", "CA", "ON")
* --country-code Optional. Country code (default: "US")
*
* After seeding, run location discovery:
* npm run discovery:dt:locations
*/
import { Pool } from 'pg';
import { DtCityDiscoveryService, DutchieCity } from './DtCityDiscoveryService';
// Connection string resolution: DATABASE_URL wins, then CANNAIQ_DB_URL,
// then a local-dev default (dev credentials only).
const DB_URL = process.env.DATABASE_URL || process.env.CANNAIQ_DB_URL ||
  'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';

// Parsed CLI arguments. countryCode always has a value (defaults to "US");
// the other three are required and validated in main().
interface Args {
  citySlug?: string;
  cityName?: string;
  stateCode?: string; // normalized to upper case by parseArgs
  countryCode: string; // normalized to upper case; default "US"
}
/**
 * Parse CLI flags of the form --name=value into an Args object.
 * Every pattern is tried against every argument (last occurrence wins);
 * state and country codes are upper-cased, country defaults to "US".
 */
function parseArgs(): Args {
  const parsed: Args = { countryCode: 'US' };
  const flagHandlers: Array<[RegExp, (value: string) => void]> = [
    [/--city-slug=(.+)/, (value) => { parsed.citySlug = value; }],
    [/--city-name=(.+)/, (value) => { parsed.cityName = value; }],
    [/--state-code=(.+)/, (value) => { parsed.stateCode = value.toUpperCase(); }],
    [/--country-code=(.+)/, (value) => { parsed.countryCode = value.toUpperCase(); }],
  ];
  for (const arg of process.argv.slice(2)) {
    for (const [pattern, apply] of flagHandlers) {
      const match = arg.match(pattern);
      if (match) apply(match[1]);
    }
  }
  return parsed;
}
// Prints CLI usage help. Kept as a single template literal so the emitted
// text is controlled exactly (printed when required flags are missing).
function printUsage() {
  console.log(`
Usage:
  npm run discovery:dt:cities:manual -- --city-slug=<slug> --city-name=<name> --state-code=<state>
Required arguments:
  --city-slug URL slug for the city (e.g., "ny-hudson", "ma-boston")
  --city-name Display name (e.g., "Hudson", "Boston")
  --state-code State/province code (e.g., "NY", "CA", "ON")
Optional arguments:
  --country-code Country code (default: "US")
Examples:
  npm run discovery:dt:cities:manual -- --city-slug=ny-hudson --city-name=Hudson --state-code=NY
  npm run discovery:dt:cities:manual -- --city-slug=ca-los-angeles --city-name="Los Angeles" --state-code=CA
  npm run discovery:dt:cities:manual -- --city-slug=on-toronto --city-name=Toronto --state-code=ON --country-code=CA
After seeding, run location discovery:
  npm run discovery:dt:locations
`);
}
/**
 * Entry point: validates CLI args, seeds a single city row via
 * DtCityDiscoveryService.seedCity, prints the result and current stats, and
 * sets the process exit code (1 on missing args or failure, 0 on success).
 *
 * Uses process.exitCode instead of process.exit(): process.exit() terminates
 * immediately and would skip the `finally` block, leaving the pg pool open.
 */
async function main() {
  const args = parseArgs();
  console.log('╔══════════════════════════════════════════════════╗');
  console.log('║ Dutchie City Discovery (MANUAL SEED) ║');
  console.log('╚══════════════════════════════════════════════════╝');
  // All three of slug/name/state are required; bail out with usage help.
  if (!args.citySlug || !args.cityName || !args.stateCode) {
    console.error('\n❌ Error: Missing required arguments\n');
    printUsage();
    process.exitCode = 1;
    return;
  }
  console.log(`\nCity Slug: ${args.citySlug}`);
  console.log(`City Name: ${args.cityName}`);
  console.log(`State Code: ${args.stateCode}`);
  console.log(`Country Code: ${args.countryCode}`);
  // Mask the password when echoing the connection string.
  console.log(`Database: ${DB_URL.replace(/:[^:@]+@/, ':****@')}`);
  const pool = new Pool({ connectionString: DB_URL });
  try {
    // Cheap connectivity check before writing anything.
    const { rows } = await pool.query('SELECT NOW() as time');
    console.log(`\nConnected at: ${rows[0].time}`);
    const service = new DtCityDiscoveryService(pool);
    const city: DutchieCity = {
      slug: args.citySlug,
      name: args.cityName,
      stateCode: args.stateCode,
      countryCode: args.countryCode,
    };
    const result = await service.seedCity(city);
    const action = result.wasInserted ? 'INSERTED' : 'UPDATED';
    console.log(`\n✅ City ${action}:`);
    console.log(` ID: ${result.id}`);
    console.log(` City Slug: ${result.city.slug}`);
    console.log(` City Name: ${result.city.name}`);
    console.log(` State Code: ${result.city.stateCode}`);
    console.log(` Country Code: ${result.city.countryCode}`);
    const stats = await service.getStats();
    console.log(`\nTotal Dutchie cities: ${stats.total} (${stats.crawlEnabled} enabled)`);
    console.log('\n📍 Next step: Run location discovery');
    console.log(' npm run discovery:dt:locations');
  } catch (error: any) {
    console.error('\n❌ Failed to seed city:', error.message);
    process.exitCode = 1;
  } finally {
    // Runs even on the error path, so connections are always released and
    // the process can exit naturally.
    await pool.end();
  }
}
main();

View File

@@ -0,0 +1,73 @@
#!/usr/bin/env npx tsx
/**
* Discovery Runner: Dutchie Cities
*
* Discovers cities from Dutchie's /cities page and upserts to dutchie_discovery_cities.
*
* Usage:
* npm run discovery:platforms:dt:cities
* DATABASE_URL="..." npx tsx src/dutchie-az/discovery/discovery-dt-cities.ts
*/
import { Pool } from 'pg';
import { DutchieCityDiscovery } from './DutchieCityDiscovery';
// Database connection string. Resolution order: DATABASE_URL, then
// CANNAIQ_DB_URL, then a local-development fallback (dev-only credentials).
const DB_URL = process.env.DATABASE_URL || process.env.CANNAIQ_DB_URL ||
  'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';
/**
 * Entry point: run Dutchie city discovery and print a summary.
 *
 * Returns the process exit code (0 = success, 1 = completed with errors or
 * failed) instead of calling process.exit() inside try/catch. The previous
 * in-try process.exit() terminated the process before
 * `finally { await pool.end() }` could run, so the pool was never closed.
 */
async function main(): Promise<number> {
  console.log('╔══════════════════════════════════════════════════╗');
  console.log('║ Dutchie City Discovery Runner ║');
  console.log('╚══════════════════════════════════════════════════╝');
  // Mask the password portion of the connection string before logging.
  console.log(`\nDatabase: ${DB_URL.replace(/:[^:@]+@/, ':****@')}`);
  const pool = new Pool({ connectionString: DB_URL });
  try {
    // Test DB connection (fail fast if unreachable)
    const { rows } = await pool.query('SELECT NOW() as time');
    console.log(`Connected at: ${rows[0].time}\n`);
    // Run city discovery
    const discovery = new DutchieCityDiscovery(pool);
    const result = await discovery.run();
    // Print summary
    console.log('\n' + '═'.repeat(50));
    console.log('SUMMARY');
    console.log('═'.repeat(50));
    console.log(`Cities found: ${result.citiesFound}`);
    console.log(`Cities inserted: ${result.citiesInserted}`);
    console.log(`Cities updated: ${result.citiesUpdated}`);
    console.log(`Errors: ${result.errors.length}`);
    console.log(`Duration: ${(result.durationMs / 1000).toFixed(1)}s`);
    if (result.errors.length > 0) {
      console.log('\nErrors:');
      result.errors.forEach((e, i) => console.log(` ${i + 1}. ${e}`));
    }
    // Get final stats
    const stats = await discovery.getStats();
    console.log('\nCurrent Database Stats:');
    console.log(` Total cities: ${stats.total}`);
    console.log(` Crawl enabled: ${stats.crawlEnabled}`);
    console.log(` Never crawled: ${stats.neverCrawled}`);
    console.log(` By country: ${stats.byCountry.map(c => `${c.countryCode}=${c.count}`).join(', ')}`);
    if (result.errors.length > 0) {
      console.log('\n⚠ Completed with errors');
      return 1;
    }
    console.log('\n✅ City discovery completed successfully');
    return 0;
  } catch (error: unknown) {
    // Narrow the unknown error rather than assuming an Error instance.
    const message = error instanceof Error ? error.message : String(error);
    console.error('\n❌ City discovery failed:', message);
    return 1;
  } finally {
    // Always runs before the process exits (unlike the previous code path).
    await pool.end();
  }
}
// Exit with the computed code after cleanup; handle unexpected rejections so
// the top-level promise is never left floating.
main().then(
  (code) => process.exit(code),
  (err) => {
    console.error('Fatal:', err);
    process.exit(1);
  }
);

View File

@@ -0,0 +1,113 @@
#!/usr/bin/env npx tsx
/**
* Discovery Entrypoint: Dutchie Locations (From Cities)
*
* Reads from dutchie_discovery_cities (crawl_enabled = true)
* and discovers store locations for each city.
*
* Geo coordinates are captured when available from Dutchie's payloads.
*
* Usage:
* npm run discovery:dt:locations
* npm run discovery:dt:locations -- --limit=10
* npm run discovery:dt:locations -- --delay=3000
* DATABASE_URL="..." npx tsx src/dutchie-az/discovery/discovery-dt-locations-from-cities.ts
*
* Options:
* --limit=N Only process N cities (default: all)
* --delay=N Delay between cities in ms (default: 2000)
*/
import { Pool } from 'pg';
import { DtLocationDiscoveryService } from './DtLocationDiscoveryService';
// Database connection string. Resolution order: DATABASE_URL, then
// CANNAIQ_DB_URL, then a local-development fallback (dev-only credentials).
const DB_URL = process.env.DATABASE_URL || process.env.CANNAIQ_DB_URL ||
  'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';
/**
 * Parse optional --limit=N and --delay=N CLI flags (digits only).
 * Flags that are absent or non-numeric are left undefined.
 */
function parseArgs(): { limit?: number; delay?: number } {
  const options: { limit?: number; delay?: number } = {};
  for (const token of process.argv.slice(2)) {
    const limit = /--limit=(\d+)/.exec(token);
    if (limit) options.limit = Number.parseInt(limit[1], 10);
    const delay = /--delay=(\d+)/.exec(token);
    if (delay) options.delay = Number.parseInt(delay[1], 10);
  }
  return options;
}
/**
 * Entry point: discover store locations for all crawl-enabled cities.
 *
 * Returns the process exit code (0 = success; 1 = no enabled cities,
 * completed with errors, or failed) instead of calling process.exit()
 * inside try/catch. The previous in-try process.exit() terminated the
 * process before `finally { await pool.end() }` could run, so the pool
 * was never closed.
 */
async function main(): Promise<number> {
  const args = parseArgs();
  console.log('╔══════════════════════════════════════════════════╗');
  console.log('║ Dutchie Location Discovery (From Cities) ║');
  console.log('║ Reads crawl_enabled cities, discovers stores ║');
  console.log('╚══════════════════════════════════════════════════╝');
  // Mask the password portion of the connection string before logging.
  console.log(`\nDatabase: ${DB_URL.replace(/:[^:@]+@/, ':****@')}`);
  if (args.limit) console.log(`City limit: ${args.limit}`);
  if (args.delay) console.log(`Delay: ${args.delay}ms`);
  const pool = new Pool({ connectionString: DB_URL });
  try {
    // Fail fast if the database is unreachable.
    const { rows } = await pool.query('SELECT NOW() as time');
    console.log(`Connected at: ${rows[0].time}\n`);
    const service = new DtLocationDiscoveryService(pool);
    const result = await service.discoverAllEnabled({
      limit: args.limit,
      delayMs: args.delay ?? 2000, // polite default between cities
    });
    console.log('\n' + '═'.repeat(50));
    console.log('SUMMARY');
    console.log('═'.repeat(50));
    console.log(`Cities processed: ${result.totalCities}`);
    console.log(`Locations found: ${result.totalLocationsFound}`);
    console.log(`Locations inserted: ${result.totalInserted}`);
    console.log(`Locations updated: ${result.totalUpdated}`);
    console.log(`Locations skipped: ${result.totalSkipped} (protected status)`);
    console.log(`Errors: ${result.errors.length}`);
    console.log(`Duration: ${(result.durationMs / 1000).toFixed(1)}s`);
    if (result.errors.length > 0) {
      console.log('\nErrors (first 10):');
      result.errors.slice(0, 10).forEach((e, i) => console.log(` ${i + 1}. ${e}`));
      if (result.errors.length > 10) {
        console.log(` ... and ${result.errors.length - 10} more`);
      }
    }
    // Get location stats including coordinates
    const stats = await service.getStats();
    console.log('\nCurrent Database Stats:');
    console.log(` Total locations: ${stats.total}`);
    console.log(` With coordinates: ${stats.withCoordinates}`);
    console.log(` By status:`);
    stats.byStatus.forEach(s => console.log(` ${s.status}: ${s.count}`));
    if (result.totalCities === 0) {
      console.log('\n⚠ No crawl-enabled cities found.');
      console.log(' Seed cities first:');
      console.log(' npm run discovery:dt:cities:manual -- --city-slug=ny-hudson --city-name=Hudson --state-code=NY');
      return 1;
    }
    if (result.errors.length > 0) {
      console.log('\n⚠ Completed with errors');
      return 1;
    }
    console.log('\n✅ Location discovery completed successfully');
    return 0;
  } catch (error: unknown) {
    // Narrow the unknown error rather than assuming an Error instance.
    const message = error instanceof Error ? error.message : String(error);
    console.error('\n❌ Location discovery failed:', message);
    return 1;
  } finally {
    // Always runs before the process exits (unlike the previous code path).
    await pool.end();
  }
}
// Exit with the computed code after cleanup; handle unexpected rejections so
// the top-level promise is never left floating.
main().then(
  (code) => process.exit(code),
  (err) => {
    console.error('Fatal:', err);
    process.exit(1);
  }
);

View File

@@ -0,0 +1,117 @@
#!/usr/bin/env npx tsx
/**
* Discovery Runner: Dutchie Locations
*
* Discovers store locations for all crawl-enabled cities and upserts to dutchie_discovery_locations.
*
* Usage:
* npm run discovery:platforms:dt:locations
* npm run discovery:platforms:dt:locations -- --limit=10
* DATABASE_URL="..." npx tsx src/dutchie-az/discovery/discovery-dt-locations.ts
*
* Options (via args):
* --limit=N Only process N cities (default: all)
* --delay=N Delay between cities in ms (default: 2000)
*/
import { Pool } from 'pg';
import { DutchieLocationDiscovery } from './DutchieLocationDiscovery';
// Database connection string. Resolution order: DATABASE_URL, then
// CANNAIQ_DB_URL, then a local-development fallback (dev-only credentials).
const DB_URL = process.env.DATABASE_URL || process.env.CANNAIQ_DB_URL ||
  'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';
// Parse CLI args
// Parse optional --limit=N and --delay=N CLI flags (digits only); flags that
// are absent or non-numeric stay undefined.
function parseArgs(): { limit?: number; delay?: number } {
  const parsed: { limit?: number; delay?: number } = {};
  process.argv.slice(2).forEach((token) => {
    const limit = /--limit=(\d+)/.exec(token);
    if (limit) parsed.limit = Number.parseInt(limit[1], 10);
    const delay = /--delay=(\d+)/.exec(token);
    if (delay) parsed.delay = Number.parseInt(delay[1], 10);
  });
  return parsed;
}
/**
 * Entry point: discover store locations for all crawl-enabled cities and
 * print a summary plus DB status counts.
 *
 * Returns the process exit code (0 = success, 1 = completed with errors or
 * failed) instead of calling process.exit() inside try/catch. The previous
 * in-try process.exit() terminated the process before
 * `finally { await pool.end() }` could run, so the pool was never closed.
 */
async function main(): Promise<number> {
  const args = parseArgs();
  console.log('╔══════════════════════════════════════════════════╗');
  console.log('║ Dutchie Location Discovery Runner ║');
  console.log('╚══════════════════════════════════════════════════╝');
  // Mask the password portion of the connection string before logging.
  console.log(`\nDatabase: ${DB_URL.replace(/:[^:@]+@/, ':****@')}`);
  if (args.limit) console.log(`City limit: ${args.limit}`);
  if (args.delay) console.log(`Delay: ${args.delay}ms`);
  const pool = new Pool({ connectionString: DB_URL });
  try {
    // Test DB connection (fail fast if unreachable)
    const { rows } = await pool.query('SELECT NOW() as time');
    console.log(`Connected at: ${rows[0].time}\n`);
    // Run location discovery
    const discovery = new DutchieLocationDiscovery(pool);
    const result = await discovery.discoverAllEnabled({
      limit: args.limit,
      delayMs: args.delay ?? 2000, // polite default between cities
    });
    // Print summary
    console.log('\n' + '═'.repeat(50));
    console.log('SUMMARY');
    console.log('═'.repeat(50));
    console.log(`Cities processed: ${result.totalCities}`);
    console.log(`Locations found: ${result.totalLocationsFound}`);
    console.log(`Locations inserted: ${result.totalInserted}`);
    console.log(`Locations updated: ${result.totalUpdated}`);
    console.log(`Locations skipped: ${result.totalSkipped} (protected status)`);
    console.log(`Errors: ${result.errors.length}`);
    console.log(`Duration: ${(result.durationMs / 1000).toFixed(1)}s`);
    if (result.errors.length > 0) {
      console.log('\nErrors (first 10):');
      result.errors.slice(0, 10).forEach((e, i) => console.log(` ${i + 1}. ${e}`));
      if (result.errors.length > 10) {
        console.log(` ... and ${result.errors.length - 10} more`);
      }
    }
    // Get DB counts broken down by review status
    const { rows: countRows } = await pool.query(`
      SELECT
        COUNT(*) as total,
        COUNT(*) FILTER (WHERE status = 'discovered') as discovered,
        COUNT(*) FILTER (WHERE status = 'verified') as verified,
        COUNT(*) FILTER (WHERE status = 'merged') as merged,
        COUNT(*) FILTER (WHERE status = 'rejected') as rejected
      FROM dutchie_discovery_locations
      WHERE platform = 'dutchie' AND active = TRUE
    `);
    const counts = countRows[0];
    console.log('\nCurrent Database Stats:');
    console.log(` Total locations: ${counts.total}`);
    console.log(` Status discovered: ${counts.discovered}`);
    console.log(` Status verified: ${counts.verified}`);
    console.log(` Status merged: ${counts.merged}`);
    console.log(` Status rejected: ${counts.rejected}`);
    if (result.errors.length > 0) {
      console.log('\n⚠ Completed with errors');
      return 1;
    }
    console.log('\n✅ Location discovery completed successfully');
    return 0;
  } catch (error: unknown) {
    // Narrow the unknown error rather than assuming an Error instance.
    const message = error instanceof Error ? error.message : String(error);
    console.error('\n❌ Location discovery failed:', message);
    return 1;
  } finally {
    // Always runs before the process exits (unlike the previous code path).
    await pool.end();
  }
}
// Exit with the computed code after cleanup; handle unexpected rejections so
// the top-level promise is never left floating.
main().then(
  (code) => process.exit(code),
  (err) => {
    console.error('Fatal:', err);
    process.exit(1);
  }
);

View File

@@ -0,0 +1,10 @@
/**
* Dutchie Discovery Module
*
* Store discovery pipeline for Dutchie platform.
*/
// Discovery crawlers: cities page scrape, then per-city store locations.
export { DutchieCityDiscovery } from './DutchieCityDiscovery';
export { DutchieLocationDiscovery } from './DutchieLocationDiscovery';
// Express router exposing the discovery review/verification API.
export { createDutchieDiscoveryRoutes } from './routes';
// Promotes a verified/merged discovery location into a crawlable dispensary.
export { promoteDiscoveryLocation } from './promoteDiscoveryLocation';

View File

@@ -0,0 +1,248 @@
/**
* Promote Discovery Location to Crawlable Dispensary
*
* When a discovery location is verified or merged:
* 1. Ensure a crawl profile exists for the dispensary
* 2. Seed/update crawl schedule
* 3. Create initial crawl job
*/
import { Pool } from 'pg';
/** Outcome of promoting a discovery location to a crawlable dispensary. */
export interface PromotionResult {
  success: boolean;          // false only for not-found / wrong-status preconditions
  discoveryId: number;       // dutchie_discovery_locations.id that was promoted
  dispensaryId: number;      // linked dispensaries.id (0 when the lookup failed)
  crawlProfileId?: number;   // profile id when dispensary_crawler_profiles exists
  scheduleUpdated?: boolean; // NOTE(review): currently never set true (see function body)
  crawlJobCreated?: boolean; // true when a new initial crawl job was queued
  error?: string;            // precondition failure message when success is false
}
/**
 * Promote a verified/merged discovery location to a crawlable dispensary.
 *
 * This function:
 * 1. Verifies the discovery location is verified/merged and has a dispensary_id
 * 2. Ensures the dispensary has platform info (menu_type, platform_dispensary_id)
 * 3. Creates/updates a crawler profile if the profile table exists
 * 4. Queues an initial crawl job
 *
 * NOTE(review): queries run outside a transaction, so a mid-way failure can
 * leave partial state (e.g. dispensary updated but no crawl job queued).
 * Query errors are not caught here; callers must handle rejections.
 * NOTE(review): `scheduleUpdated` is declared but never set to true — the
 * "seed/update crawl schedule" step from the file header is not implemented.
 *
 * @param pool - shared Postgres connection pool
 * @param discoveryLocationId - dutchie_discovery_locations.id to promote
 * @returns PromotionResult describing what was created/updated
 */
export async function promoteDiscoveryLocation(
  pool: Pool,
  discoveryLocationId: number
): Promise<PromotionResult> {
  console.log(`[Promote] Starting promotion for discovery location ${discoveryLocationId}...`);
  // Get the discovery location. INNER JOIN means a location with a NULL
  // dispensary_id produces zero rows, which is reported as the same error
  // as "not found" below.
  const { rows: locRows } = await pool.query(
    `
    SELECT
      dl.*,
      d.id as disp_id,
      d.name as disp_name,
      d.menu_type as disp_menu_type,
      d.platform_dispensary_id as disp_platform_id
    FROM dutchie_discovery_locations dl
    JOIN dispensaries d ON dl.dispensary_id = d.id
    WHERE dl.id = $1
    `,
    [discoveryLocationId]
  );
  if (locRows.length === 0) {
    return {
      success: false,
      discoveryId: discoveryLocationId,
      dispensaryId: 0,
      error: 'Discovery location not found or not linked to a dispensary',
    };
  }
  const location = locRows[0];
  // Verify status: only human-reviewed locations may be promoted.
  if (!['verified', 'merged'].includes(location.status)) {
    return {
      success: false,
      discoveryId: discoveryLocationId,
      dispensaryId: location.dispensary_id || 0,
      error: `Cannot promote: location status is '${location.status}', must be 'verified' or 'merged'`,
    };
  }
  const dispensaryId = location.dispensary_id;
  console.log(`[Promote] Location ${discoveryLocationId} -> Dispensary ${dispensaryId} (${location.disp_name})`);
  // Ensure dispensary has platform info. COALESCE keeps any values already
  // present on the dispensary; only missing fields are filled in.
  if (!location.disp_platform_id) {
    console.log(`[Promote] Updating dispensary with platform info...`);
    await pool.query(
      `
      UPDATE dispensaries
      SET platform_dispensary_id = COALESCE(platform_dispensary_id, $1),
          menu_url = COALESCE(menu_url, $2),
          menu_type = COALESCE(menu_type, 'dutchie'),
          updated_at = NOW()
      WHERE id = $3
      `,
      [location.platform_location_id, location.platform_menu_url, dispensaryId]
    );
  }
  let crawlProfileId: number | undefined;
  let scheduleUpdated = false; // NOTE(review): never flipped to true below
  let crawlJobCreated = false;
  // Check if dispensary_crawler_profiles table exists — the profile step is
  // skipped entirely on schemas that don't have it yet.
  const { rows: tableCheck } = await pool.query(`
    SELECT EXISTS (
      SELECT FROM information_schema.tables
      WHERE table_name = 'dispensary_crawler_profiles'
    ) as exists
  `);
  if (tableCheck[0]?.exists) {
    // Create or get crawler profile
    console.log(`[Promote] Checking crawler profile...`);
    const { rows: profileRows } = await pool.query(
      `
      SELECT id FROM dispensary_crawler_profiles
      WHERE dispensary_id = $1 AND platform = 'dutchie'
      `,
      [dispensaryId]
    );
    if (profileRows.length > 0) {
      crawlProfileId = profileRows[0].id;
      console.log(`[Promote] Using existing profile ${crawlProfileId}`);
    } else {
      // Create new profile. ON CONFLICT guards against a concurrent insert
      // between the SELECT above and this INSERT; RETURNING id works for
      // both the insert and the conflict-update paths.
      const profileKey = `dutchie-${location.platform_slug}`;
      const { rows: newProfile } = await pool.query(
        `
        INSERT INTO dispensary_crawler_profiles (
          dispensary_id,
          profile_key,
          profile_name,
          platform,
          config,
          status,
          enabled,
          created_at,
          updated_at
        ) VALUES (
          $1, $2, $3, 'dutchie', $4, 'sandbox', TRUE, NOW(), NOW()
        )
        ON CONFLICT (dispensary_id, platform) DO UPDATE SET
          enabled = TRUE,
          updated_at = NOW()
        RETURNING id
        `,
        [
          dispensaryId,
          profileKey,
          `${location.name} (Dutchie)`,
          // Stored as JSONB config consumed by the crawler; pricingType and
          // useBothModes defaults presumably match the crawler's expectations
          // — TODO confirm against the crawler config schema.
          JSON.stringify({
            platformDispensaryId: location.platform_location_id,
            platformSlug: location.platform_slug,
            menuUrl: location.platform_menu_url,
            pricingType: 'rec',
            useBothModes: true,
          }),
        ]
      );
      crawlProfileId = newProfile[0]?.id;
      console.log(`[Promote] Created new profile ${crawlProfileId}`);
    }
    // Link profile to dispensary if not already linked (COALESCE preserves
    // an existing active profile assignment).
    await pool.query(
      `
      UPDATE dispensaries
      SET active_crawler_profile_id = COALESCE(active_crawler_profile_id, $1),
          updated_at = NOW()
      WHERE id = $2
      `,
      [crawlProfileId, dispensaryId]
    );
  }
  // Check if crawl_jobs table exists and create initial job
  const { rows: jobsTableCheck } = await pool.query(`
    SELECT EXISTS (
      SELECT FROM information_schema.tables
      WHERE table_name = 'crawl_jobs'
    ) as exists
  `);
  if (jobsTableCheck[0]?.exists) {
    // Check if there's already a pending job — avoid duplicate queue entries.
    const { rows: existingJobs } = await pool.query(
      `
      SELECT id FROM crawl_jobs
      WHERE dispensary_id = $1 AND status IN ('pending', 'running')
      LIMIT 1
      `,
      [dispensaryId]
    );
    if (existingJobs.length === 0) {
      // Create initial crawl job
      console.log(`[Promote] Creating initial crawl job...`);
      await pool.query(
        `
        INSERT INTO crawl_jobs (
          dispensary_id,
          job_type,
          status,
          priority,
          config,
          created_at,
          updated_at
        ) VALUES (
          $1, 'dutchie_product_crawl', 'pending', 1, $2, NOW(), NOW()
        )
        `,
        [
          dispensaryId,
          // 'source' tags the job so operators can trace it back to this flow.
          JSON.stringify({
            source: 'discovery_promotion',
            discoveryLocationId,
            pricingType: 'rec',
            useBothModes: true,
          }),
        ]
      );
      crawlJobCreated = true;
    } else {
      console.log(`[Promote] Crawl job already exists for dispensary`);
    }
  }
  // Update discovery location notes: append an audit line (newline-separated
  // when notes already exist).
  await pool.query(
    `
    UPDATE dutchie_discovery_locations
    SET notes = COALESCE(notes || E'\n', '') || $1,
        updated_at = NOW()
    WHERE id = $2
    `,
    [`Promoted to crawlable at ${new Date().toISOString()}`, discoveryLocationId]
  );
  console.log(`[Promote] Promotion complete for discovery location ${discoveryLocationId}`);
  return {
    success: true,
    discoveryId: discoveryLocationId,
    dispensaryId,
    crawlProfileId,
    scheduleUpdated,
    crawlJobCreated,
  };
}
export default promoteDiscoveryLocation;

View File

@@ -0,0 +1,973 @@
/**
* Platform Discovery API Routes (DT = Dutchie)
*
* Routes for the platform-specific store discovery pipeline.
* Mount at /api/discovery/platforms/dt
*
* Platform Slug Mapping (for trademark-safe URLs):
* dt = Dutchie
* jn = Jane (future)
* wm = Weedmaps (future)
* lf = Leafly (future)
* tz = Treez (future)
*
* Note: The actual platform value stored in the DB remains 'dutchie'.
* Only the URL paths use neutral slugs.
*/
import { Router, Request, Response } from 'express';
import { Pool } from 'pg';
import { DutchieCityDiscovery } from './DutchieCityDiscovery';
import { DutchieLocationDiscovery } from './DutchieLocationDiscovery';
import { DiscoveryGeoService } from '../../services/DiscoveryGeoService';
import { GeoValidationService } from '../../services/GeoValidationService';
export function createDutchieDiscoveryRoutes(pool: Pool): Router {
const router = Router();
// ============================================================
// LOCATIONS
// ============================================================
/**
* GET /api/discovery/platforms/dt/locations
*
* List discovered locations with filtering.
*
* Query params:
* - status: 'discovered' | 'verified' | 'rejected' | 'merged'
* - state_code: e.g., 'AZ', 'CA'
* - country_code: 'US' | 'CA'
* - unlinked_only: 'true' to show only locations without dispensary_id
* - search: search by name
* - limit: number (default 50)
* - offset: number (default 0)
*/
router.get('/locations', async (req: Request, res: Response) => {
try {
const {
status,
state_code,
country_code,
unlinked_only,
search,
limit = '50',
offset = '0',
} = req.query;
let whereClause = "WHERE platform = 'dutchie' AND active = TRUE";
const params: any[] = [];
let paramIndex = 1;
if (status) {
whereClause += ` AND status = $${paramIndex}`;
params.push(status);
paramIndex++;
}
if (state_code) {
whereClause += ` AND state_code = $${paramIndex}`;
params.push(state_code);
paramIndex++;
}
if (country_code) {
whereClause += ` AND country_code = $${paramIndex}`;
params.push(country_code);
paramIndex++;
}
if (unlinked_only === 'true') {
whereClause += ' AND dispensary_id IS NULL';
}
if (search) {
whereClause += ` AND (name ILIKE $${paramIndex} OR platform_slug ILIKE $${paramIndex})`;
params.push(`%${search}%`);
paramIndex++;
}
const limitVal = parseInt(limit as string, 10);
const offsetVal = parseInt(offset as string, 10);
params.push(limitVal, offsetVal);
const { rows } = await pool.query(
`
SELECT
dl.id,
dl.platform,
dl.platform_location_id,
dl.platform_slug,
dl.platform_menu_url,
dl.name,
dl.raw_address,
dl.address_line1,
dl.city,
dl.state_code,
dl.postal_code,
dl.country_code,
dl.latitude,
dl.longitude,
dl.status,
dl.dispensary_id,
dl.offers_delivery,
dl.offers_pickup,
dl.is_recreational,
dl.is_medical,
dl.first_seen_at,
dl.last_seen_at,
dl.verified_at,
dl.verified_by,
dl.notes,
d.name as dispensary_name
FROM dutchie_discovery_locations dl
LEFT JOIN dispensaries d ON dl.dispensary_id = d.id
${whereClause}
ORDER BY dl.first_seen_at DESC
LIMIT $${paramIndex} OFFSET $${paramIndex + 1}
`,
params
);
// Get total count
const countParams = params.slice(0, -2);
const { rows: countRows } = await pool.query(
`SELECT COUNT(*) as total FROM dutchie_discovery_locations dl ${whereClause}`,
countParams
);
res.json({
success: true,
locations: rows.map((r) => ({
id: r.id,
platform: r.platform,
platformLocationId: r.platform_location_id,
platformSlug: r.platform_slug,
platformMenuUrl: r.platform_menu_url,
name: r.name,
rawAddress: r.raw_address,
addressLine1: r.address_line1,
city: r.city,
stateCode: r.state_code,
postalCode: r.postal_code,
countryCode: r.country_code,
latitude: r.latitude,
longitude: r.longitude,
status: r.status,
dispensaryId: r.dispensary_id,
dispensaryName: r.dispensary_name,
offersDelivery: r.offers_delivery,
offersPickup: r.offers_pickup,
isRecreational: r.is_recreational,
isMedical: r.is_medical,
firstSeenAt: r.first_seen_at,
lastSeenAt: r.last_seen_at,
verifiedAt: r.verified_at,
verifiedBy: r.verified_by,
notes: r.notes,
})),
total: parseInt(countRows[0]?.total || '0', 10),
limit: limitVal,
offset: offsetVal,
});
} catch (error: any) {
console.error('[Discovery Routes] Error fetching locations:', error);
res.status(500).json({ success: false, error: error.message });
}
});
/**
* GET /api/discovery/platforms/dt/locations/:id
*
* Get a single location by ID.
*/
router.get('/locations/:id', async (req: Request, res: Response) => {
try {
const { id } = req.params;
const { rows } = await pool.query(
`
SELECT
dl.*,
d.name as dispensary_name,
d.menu_url as dispensary_menu_url
FROM dutchie_discovery_locations dl
LEFT JOIN dispensaries d ON dl.dispensary_id = d.id
WHERE dl.id = $1
`,
[parseInt(id, 10)]
);
if (rows.length === 0) {
return res.status(404).json({ success: false, error: 'Location not found' });
}
const r = rows[0];
res.json({
success: true,
location: {
id: r.id,
platform: r.platform,
platformLocationId: r.platform_location_id,
platformSlug: r.platform_slug,
platformMenuUrl: r.platform_menu_url,
name: r.name,
rawAddress: r.raw_address,
addressLine1: r.address_line1,
addressLine2: r.address_line2,
city: r.city,
stateCode: r.state_code,
postalCode: r.postal_code,
countryCode: r.country_code,
latitude: r.latitude,
longitude: r.longitude,
timezone: r.timezone,
status: r.status,
dispensaryId: r.dispensary_id,
dispensaryName: r.dispensary_name,
dispensaryMenuUrl: r.dispensary_menu_url,
offersDelivery: r.offers_delivery,
offersPickup: r.offers_pickup,
isRecreational: r.is_recreational,
isMedical: r.is_medical,
firstSeenAt: r.first_seen_at,
lastSeenAt: r.last_seen_at,
verifiedAt: r.verified_at,
verifiedBy: r.verified_by,
notes: r.notes,
metadata: r.metadata,
},
});
} catch (error: any) {
console.error('[Discovery Routes] Error fetching location:', error);
res.status(500).json({ success: false, error: error.message });
}
});
// ============================================================
// VERIFICATION ACTIONS
// ============================================================
/**
* POST /api/discovery/platforms/dt/locations/:id/verify-create
*
* Verify a discovered location and create a new canonical dispensary.
*/
router.post('/locations/:id/verify-create', async (req: Request, res: Response) => {
const client = await pool.connect();
try {
const { id } = req.params;
const { verifiedBy = 'admin' } = req.body;
await client.query('BEGIN');
// Get the discovery location
const { rows: locRows } = await client.query(
`SELECT * FROM dutchie_discovery_locations WHERE id = $1 FOR UPDATE`,
[parseInt(id, 10)]
);
if (locRows.length === 0) {
await client.query('ROLLBACK');
return res.status(404).json({ success: false, error: 'Location not found' });
}
const location = locRows[0];
if (location.status !== 'discovered') {
await client.query('ROLLBACK');
return res.status(400).json({
success: false,
error: `Cannot verify: location status is '${location.status}'`,
});
}
// Look up state_id if we have a state_code
let stateId: number | null = null;
if (location.state_code) {
const { rows: stateRows } = await client.query(
`SELECT id FROM states WHERE code = $1`,
[location.state_code]
);
if (stateRows.length > 0) {
stateId = stateRows[0].id;
}
}
// Create the canonical dispensary
const { rows: dispRows } = await client.query(
`
INSERT INTO dispensaries (
name,
slug,
address,
city,
state,
zip,
latitude,
longitude,
timezone,
menu_type,
menu_url,
platform_dispensary_id,
state_id,
active,
created_at,
updated_at
) VALUES (
$1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, TRUE, NOW(), NOW()
)
RETURNING id
`,
[
location.name,
location.platform_slug,
location.address_line1,
location.city,
location.state_code,
location.postal_code,
location.latitude,
location.longitude,
location.timezone,
'dutchie',
location.platform_menu_url,
location.platform_location_id,
stateId,
]
);
const dispensaryId = dispRows[0].id;
// Update the discovery location
await client.query(
`
UPDATE dutchie_discovery_locations
SET status = 'verified',
dispensary_id = $1,
verified_at = NOW(),
verified_by = $2,
updated_at = NOW()
WHERE id = $3
`,
[dispensaryId, verifiedBy, id]
);
await client.query('COMMIT');
res.json({
success: true,
action: 'created',
discoveryId: parseInt(id, 10),
dispensaryId,
message: `Created new dispensary (ID: ${dispensaryId})`,
});
} catch (error: any) {
await client.query('ROLLBACK');
console.error('[Discovery Routes] Error in verify-create:', error);
res.status(500).json({ success: false, error: error.message });
} finally {
client.release();
}
});
/**
* POST /api/discovery/platforms/dt/locations/:id/verify-link
*
* Link a discovered location to an existing dispensary.
*
* Body:
* - dispensaryId: number (required)
* - verifiedBy: string (optional)
*/
router.post('/locations/:id/verify-link', async (req: Request, res: Response) => {
const client = await pool.connect();
try {
const { id } = req.params;
const { dispensaryId, verifiedBy = 'admin' } = req.body;
if (!dispensaryId) {
return res.status(400).json({ success: false, error: 'dispensaryId is required' });
}
await client.query('BEGIN');
// Verify dispensary exists
const { rows: dispRows } = await client.query(
`SELECT id, name FROM dispensaries WHERE id = $1`,
[dispensaryId]
);
if (dispRows.length === 0) {
await client.query('ROLLBACK');
return res.status(404).json({ success: false, error: 'Dispensary not found' });
}
// Get the discovery location
const { rows: locRows } = await client.query(
`SELECT * FROM dutchie_discovery_locations WHERE id = $1 FOR UPDATE`,
[parseInt(id, 10)]
);
if (locRows.length === 0) {
await client.query('ROLLBACK');
return res.status(404).json({ success: false, error: 'Location not found' });
}
const location = locRows[0];
if (location.status !== 'discovered') {
await client.query('ROLLBACK');
return res.status(400).json({
success: false,
error: `Cannot link: location status is '${location.status}'`,
});
}
// Update dispensary with platform info if missing
await client.query(
`
UPDATE dispensaries
SET platform_dispensary_id = COALESCE(platform_dispensary_id, $1),
menu_url = COALESCE(menu_url, $2),
menu_type = COALESCE(menu_type, 'dutchie'),
updated_at = NOW()
WHERE id = $3
`,
[location.platform_location_id, location.platform_menu_url, dispensaryId]
);
// Update the discovery location
await client.query(
`
UPDATE dutchie_discovery_locations
SET status = 'merged',
dispensary_id = $1,
verified_at = NOW(),
verified_by = $2,
updated_at = NOW()
WHERE id = $3
`,
[dispensaryId, verifiedBy, id]
);
await client.query('COMMIT');
res.json({
success: true,
action: 'linked',
discoveryId: parseInt(id, 10),
dispensaryId,
dispensaryName: dispRows[0].name,
message: `Linked to existing dispensary: ${dispRows[0].name}`,
});
} catch (error: any) {
await client.query('ROLLBACK');
console.error('[Discovery Routes] Error in verify-link:', error);
res.status(500).json({ success: false, error: error.message });
} finally {
client.release();
}
});
/**
* POST /api/discovery/platforms/dt/locations/:id/reject
*
* Reject a discovered location.
*
* Body:
* - reason: string (optional)
* - verifiedBy: string (optional)
*/
router.post('/locations/:id/reject', async (req: Request, res: Response) => {
try {
const { id } = req.params;
const { reason, verifiedBy = 'admin' } = req.body;
// Get current status
const { rows } = await pool.query(
`SELECT status FROM dutchie_discovery_locations WHERE id = $1`,
[parseInt(id, 10)]
);
if (rows.length === 0) {
return res.status(404).json({ success: false, error: 'Location not found' });
}
if (rows[0].status !== 'discovered') {
return res.status(400).json({
success: false,
error: `Cannot reject: location status is '${rows[0].status}'`,
});
}
await pool.query(
`
UPDATE dutchie_discovery_locations
SET status = 'rejected',
verified_at = NOW(),
verified_by = $1,
notes = COALESCE($2, notes),
updated_at = NOW()
WHERE id = $3
`,
[verifiedBy, reason, id]
);
res.json({
success: true,
action: 'rejected',
discoveryId: parseInt(id, 10),
message: 'Location rejected',
});
} catch (error: any) {
console.error('[Discovery Routes] Error in reject:', error);
res.status(500).json({ success: false, error: error.message });
}
});
/**
* POST /api/discovery/platforms/dt/locations/:id/unreject
*
* Restore a rejected location to discovered status.
*/
router.post('/locations/:id/unreject', async (req: Request, res: Response) => {
try {
const { id } = req.params;
// Get current status
const { rows } = await pool.query(
`SELECT status FROM dutchie_discovery_locations WHERE id = $1`,
[parseInt(id, 10)]
);
if (rows.length === 0) {
return res.status(404).json({ success: false, error: 'Location not found' });
}
if (rows[0].status !== 'rejected') {
return res.status(400).json({
success: false,
error: `Cannot unreject: location status is '${rows[0].status}'`,
});
}
await pool.query(
`
UPDATE dutchie_discovery_locations
SET status = 'discovered',
verified_at = NULL,
verified_by = NULL,
updated_at = NOW()
WHERE id = $1
`,
[id]
);
res.json({
success: true,
action: 'unrejected',
discoveryId: parseInt(id, 10),
message: 'Location restored to discovered status',
});
} catch (error: any) {
console.error('[Discovery Routes] Error in unreject:', error);
res.status(500).json({ success: false, error: error.message });
}
});
// ============================================================
// SUMMARY / REPORTING
// ============================================================
/**
 * GET /api/discovery/platforms/dt/summary
 *
 * Get discovery summary statistics: totals by status plus a per-state
 * breakdown (verified / unlinked counts). Only active dutchie rows count.
 */
router.get('/summary', async (_req: Request, res: Response) => {
  try {
    // The two aggregations are independent; run them concurrently instead
    // of sequentially (was two awaited queries back-to-back).
    const [statusResult, stateResult] = await Promise.all([
      // Total counts by status
      pool.query(`
        SELECT status, COUNT(*) as cnt
        FROM dutchie_discovery_locations
        WHERE platform = 'dutchie' AND active = TRUE
        GROUP BY status
      `),
      // By state
      pool.query(`
        SELECT
          state_code,
          COUNT(*) as total,
          COUNT(*) FILTER (WHERE status = 'verified') as verified,
          COUNT(*) FILTER (WHERE dispensary_id IS NULL AND status = 'discovered') as unlinked
        FROM dutchie_discovery_locations
        WHERE platform = 'dutchie' AND active = TRUE AND state_code IS NOT NULL
        GROUP BY state_code
        ORDER BY total DESC
      `),
    ]);
    // COUNT(*) comes back as a string from pg; normalize to numbers.
    const statusCounts: Record<string, number> = {};
    let totalLocations = 0;
    for (const row of statusResult.rows) {
      statusCounts[row.status] = parseInt(row.cnt, 10);
      totalLocations += parseInt(row.cnt, 10);
    }
    res.json({
      success: true,
      summary: {
        total_locations: totalLocations,
        discovered: statusCounts['discovered'] || 0,
        verified: statusCounts['verified'] || 0,
        merged: statusCounts['merged'] || 0,
        rejected: statusCounts['rejected'] || 0,
      },
      by_state: stateResult.rows.map((r) => ({
        state_code: r.state_code,
        total: parseInt(r.total, 10),
        verified: parseInt(r.verified, 10),
        unlinked: parseInt(r.unlinked, 10),
      })),
    });
  } catch (error: any) {
    console.error('[Discovery Routes] Error in summary:', error);
    res.status(500).json({ success: false, error: error.message });
  }
});
// ============================================================
// CITIES
// ============================================================
/**
 * GET /api/discovery/platforms/dt/cities
 *
 * List discovery cities.
 *
 * Query params:
 * - state_code, country_code: optional equality filters
 * - crawl_enabled: 'true' | 'false' (any other value means no filter)
 * - limit / offset: pagination, defaults 100 / 0. Malformed values fall
 *   back to the defaults instead of producing a NaN SQL parameter.
 */
router.get('/cities', async (req: Request, res: Response) => {
  try {
    const { state_code, country_code, crawl_enabled, limit = '100', offset = '0' } = req.query;
    // Build the WHERE clause dynamically; only values go through bind
    // parameters, the clause text itself is assembled from fixed fragments.
    let whereClause = "WHERE platform = 'dutchie'";
    const params: any[] = [];
    let paramIndex = 1;
    if (state_code) {
      whereClause += ` AND state_code = $${paramIndex}`;
      params.push(state_code);
      paramIndex++;
    }
    if (country_code) {
      whereClause += ` AND country_code = $${paramIndex}`;
      params.push(country_code);
      paramIndex++;
    }
    // Tri-state filter: 'true', 'false', or absent.
    if (crawl_enabled === 'true') {
      whereClause += ' AND crawl_enabled = TRUE';
    } else if (crawl_enabled === 'false') {
      whereClause += ' AND crawl_enabled = FALSE';
    }
    // Previously a non-numeric limit/offset became NaN and failed the query.
    const limitNum = parseInt(limit as string, 10);
    const offsetNum = parseInt(offset as string, 10);
    params.push(
      Number.isNaN(limitNum) ? 100 : limitNum,
      Number.isNaN(offsetNum) ? 0 : offsetNum
    );
    // Page query and total-count query are independent; run them
    // concurrently. The count query reuses the filters but not limit/offset.
    const [pageResult, countResult] = await Promise.all([
      pool.query(
        `
        SELECT
          id,
          platform,
          city_name,
          city_slug,
          state_code,
          country_code,
          last_crawled_at,
          crawl_enabled,
          location_count
        FROM dutchie_discovery_cities
        ${whereClause}
        ORDER BY country_code, state_code, city_name
        LIMIT $${paramIndex} OFFSET $${paramIndex + 1}
        `,
        params
      ),
      pool.query(
        `SELECT COUNT(*) as total FROM dutchie_discovery_cities ${whereClause}`,
        params.slice(0, -2)
      ),
    ]);
    res.json({
      success: true,
      cities: pageResult.rows.map((r) => ({
        id: r.id,
        platform: r.platform,
        cityName: r.city_name,
        citySlug: r.city_slug,
        stateCode: r.state_code,
        countryCode: r.country_code,
        lastCrawledAt: r.last_crawled_at,
        crawlEnabled: r.crawl_enabled,
        locationCount: r.location_count,
      })),
      total: parseInt(countResult.rows[0]?.total || '0', 10),
    });
  } catch (error: any) {
    console.error('[Discovery Routes] Error fetching cities:', error);
    res.status(500).json({ success: false, error: error.message });
  }
});
// ============================================================
// MATCH CANDIDATES
// ============================================================
/**
 * GET /api/discovery/platforms/dt/locations/:id/match-candidates
 *
 * Find potential dispensary matches for a discovery location.
 *
 * Candidates come from the same state and are tiered by match quality:
 * exact name, partial match on the first word of the name, then everything
 * else, with nearest distance breaking ties (NULLS LAST). At most 10 rows
 * are returned. Distance uses the spherical law of cosines with 3959
 * (Earth's radius in miles), clamped into acos's valid [-1, 1] domain.
 */
router.get('/locations/:id/match-candidates', async (req: Request, res: Response) => {
try {
const { id } = req.params;
// Get the discovery location
const { rows: locRows } = await pool.query(
`SELECT * FROM dutchie_discovery_locations WHERE id = $1`,
[parseInt(id, 10)]
);
if (locRows.length === 0) {
return res.status(404).json({ success: false, error: 'Location not found' });
}
const location = locRows[0];
// Find potential matches
// NOTE(review): when either side lacks coordinates, distance_miles is NULL
// and a row can only qualify via a name/city match. The proximity branch of
// the WHERE clause does not itself restrict by radius — it only requires
// that both coordinate pairs exist; ranking then sorts by distance.
const { rows: candidates } = await pool.query(
`
SELECT
d.id,
d.name,
d.city,
d.state,
d.address,
d.menu_type,
d.platform_dispensary_id,
d.menu_url,
d.latitude,
d.longitude,
CASE
WHEN d.name ILIKE $1 THEN 'exact_name'
WHEN d.name ILIKE $2 THEN 'partial_name'
WHEN d.city ILIKE $3 AND d.state = $4 THEN 'same_city'
ELSE 'location_match'
END as match_type,
CASE
WHEN d.latitude IS NOT NULL AND d.longitude IS NOT NULL
AND $5::float IS NOT NULL AND $6::float IS NOT NULL
THEN (3959 * acos(
LEAST(1.0, GREATEST(-1.0,
cos(radians($5::float)) * cos(radians(d.latitude)) *
cos(radians(d.longitude) - radians($6::float)) +
sin(radians($5::float)) * sin(radians(d.latitude))
))
))
ELSE NULL
END as distance_miles
FROM dispensaries d
WHERE d.state = $4
AND (
d.name ILIKE $1
OR d.name ILIKE $2
OR d.city ILIKE $3
OR (
d.latitude IS NOT NULL
AND d.longitude IS NOT NULL
AND $5::float IS NOT NULL
AND $6::float IS NOT NULL
)
)
ORDER BY
CASE
WHEN d.name ILIKE $1 THEN 1
WHEN d.name ILIKE $2 THEN 2
ELSE 3
END,
distance_miles NULLS LAST
LIMIT 10
`,
[
location.name, // $1: exact (case-insensitive) name match
`%${location.name.split(' ')[0]}%`, // $2: wildcard on the name's first word
location.city, // $3: same-city comparison
location.state_code, // $4: hard state filter
location.latitude, // $5: may be null
location.longitude, // $6: may be null
]
);
res.json({
success: true,
location: {
id: location.id,
name: location.name,
city: location.city,
stateCode: location.state_code,
},
candidates: candidates.map((c) => ({
id: c.id,
name: c.name,
city: c.city,
state: c.state,
address: c.address,
menuType: c.menu_type,
platformDispensaryId: c.platform_dispensary_id,
menuUrl: c.menu_url,
matchType: c.match_type,
// Round to one decimal for display; null when no distance was computed.
distanceMiles: c.distance_miles ? Math.round(c.distance_miles * 10) / 10 : null,
})),
});
} catch (error: any) {
console.error('[Discovery Routes] Error fetching match candidates:', error);
res.status(500).json({ success: false, error: error.message });
}
});
// ============================================================
// GEO / NEARBY (Admin/Debug Only)
// ============================================================
/**
 * GET /api/discovery/platforms/dt/nearby
 *
 * Find discovery locations near a given coordinate.
 * This is an internal/debug endpoint for admin use.
 *
 * Query params:
 * - lat: number (required)
 * - lon: number (required)
 * - radiusKm: number (optional, default 50; malformed values fall back to 50)
 * - limit: number (optional, default 20; malformed values fall back to 20)
 * - status: string (optional, filter by status)
 */
router.get('/nearby', async (req: Request, res: Response) => {
  try {
    const { lat, lon, radiusKm = '50', limit = '20', status } = req.query;
    // Validate required params
    if (!lat || !lon) {
      return res.status(400).json({
        success: false,
        error: 'lat and lon are required query parameters',
      });
    }
    const latNum = parseFloat(lat as string);
    const lonNum = parseFloat(lon as string);
    if (isNaN(latNum) || isNaN(lonNum)) {
      return res.status(400).json({
        success: false,
        error: 'lat and lon must be valid numbers',
      });
    }
    // Optional numeric params: fall back to the documented defaults rather
    // than passing NaN into the geo service (the previous behaviour).
    const parsedRadius = parseFloat(radiusKm as string);
    const radiusNum = Number.isNaN(parsedRadius) ? 50 : parsedRadius;
    const parsedLimit = parseInt(limit as string, 10);
    const limitNum = Number.isNaN(parsedLimit) ? 20 : parsedLimit;
    const geoService = new DiscoveryGeoService(pool);
    const locations = await geoService.findNearbyDiscoveryLocations(latNum, lonNum, {
      radiusKm: radiusNum,
      limit: limitNum,
      platform: 'dutchie',
      status: status as string | undefined,
    });
    res.json({
      success: true,
      center: { lat: latNum, lon: lonNum },
      radiusKm: radiusNum,
      count: locations.length,
      locations,
    });
  } catch (error: any) {
    console.error('[Discovery Routes] Error in nearby:', error);
    res.status(500).json({ success: false, error: error.message });
  }
});
/**
 * GET /api/discovery/platforms/dt/geo-stats
 *
 * Report coordinate-coverage statistics for discovery locations.
 * Internal/debug endpoint intended for admin use.
 */
router.get('/geo-stats', async (_req: Request, res: Response) => {
  try {
    // Service is stateless per-request; instantiate and query in one step.
    const stats = await new DiscoveryGeoService(pool).getCoordinateCoverageStats();
    res.json({ success: true, stats });
  } catch (error: any) {
    console.error('[Discovery Routes] Error in geo-stats:', error);
    res.status(500).json({ success: false, error: error.message });
  }
});
/**
 * GET /api/discovery/platforms/dt/locations/:id/validate-geo
 *
 * Validate the geographic data for a discovery location.
 * This is an internal/debug endpoint for admin use.
 *
 * Responses:
 * - 400 for a non-numeric :id
 * - 404 when the location does not exist
 */
router.get('/locations/:id/validate-geo', async (req: Request, res: Response) => {
  try {
    // Parse once and validate; previously a non-numeric id produced NaN,
    // failed inside the query, and surfaced as a 500. The parsed value is
    // also reused in the response instead of re-parsing.
    const locationId = parseInt(req.params.id, 10);
    if (Number.isNaN(locationId)) {
      return res.status(400).json({ success: false, error: 'Invalid location id' });
    }
    // Get the location
    const { rows } = await pool.query(
      `SELECT latitude, longitude, state_code, country_code, name
       FROM dutchie_discovery_locations WHERE id = $1`,
      [locationId]
    );
    if (rows.length === 0) {
      return res.status(404).json({ success: false, error: 'Location not found' });
    }
    const location = rows[0];
    // Delegate the actual lat/lon-vs-state check to the validation service.
    const geoValidation = new GeoValidationService();
    const result = geoValidation.validateLocationState({
      latitude: location.latitude,
      longitude: location.longitude,
      state_code: location.state_code,
      country_code: location.country_code,
    });
    res.json({
      success: true,
      location: {
        id: locationId,
        name: location.name,
        latitude: location.latitude,
        longitude: location.longitude,
        stateCode: location.state_code,
        countryCode: location.country_code,
      },
      validation: result,
    });
  } catch (error: any) {
    console.error('[Discovery Routes] Error in validate-geo:', error);
    res.status(500).json({ success: false, error: error.message });
  }
});
return router;
}
export default createDutchieDiscoveryRoutes;

View File

@@ -0,0 +1,682 @@
/**
* Analytics API Routes
*
* Provides REST API endpoints for all analytics services.
* All routes are prefixed with /api/analytics
*
* Phase 3: Analytics Dashboards
*/
import { Router, Request, Response } from 'express';
import { Pool } from 'pg';
import {
AnalyticsCache,
PriceTrendService,
PenetrationService,
CategoryAnalyticsService,
StoreChangeService,
BrandOpportunityService,
} from '../services/analytics';
export function createAnalyticsRouter(pool: Pool): Router {
const router = Router();
// Initialize services
// A single shared cache (15-minute default TTL) backs every analytics
// service below, so repeated dashboard queries can be served without
// re-hitting Postgres. All services take the same pool + cache pair.
const cache = new AnalyticsCache(pool, { defaultTtlMinutes: 15 });
const priceService = new PriceTrendService(pool, cache);
const penetrationService = new PenetrationService(pool, cache);
const categoryService = new CategoryAnalyticsService(pool, cache);
const storeService = new StoreChangeService(pool, cache);
const brandOpportunityService = new BrandOpportunityService(pool, cache);
// ============================================================
// PRICE ANALYTICS
// ============================================================
/**
 * GET /api/analytics/price/product/:id
 * Get price trend for a specific product.
 * Query: storeId (optional int), days (int, default 30).
 */
router.get('/price/product/:id', async (req: Request, res: Response) => {
  try {
    // Explicit radix on every parseInt (ESLint `radix`); behaviour is
    // otherwise unchanged.
    const productId = parseInt(req.params.id, 10);
    const storeId = req.query.storeId ? parseInt(req.query.storeId as string, 10) : undefined;
    const days = req.query.days ? parseInt(req.query.days as string, 10) : 30;
    const result = await priceService.getProductPriceTrend(productId, storeId, days);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Price product error:', error);
    res.status(500).json({ error: 'Failed to fetch product price trend' });
  }
});
/**
 * GET /api/analytics/price/brand/:name
 * Get price trend for a brand.
 * Query: storeId, category, state, days (default 30).
 */
router.get('/price/brand/:name', async (req: Request, res: Response) => {
  try {
    const brandName = decodeURIComponent(req.params.name);
    const filters = {
      storeId: req.query.storeId ? parseInt(req.query.storeId as string, 10) : undefined,
      category: req.query.category as string | undefined,
      state: req.query.state as string | undefined,
      days: req.query.days ? parseInt(req.query.days as string, 10) : 30,
    };
    const result = await priceService.getBrandPriceTrend(brandName, filters);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Price brand error:', error);
    res.status(500).json({ error: 'Failed to fetch brand price trend' });
  }
});
/**
 * GET /api/analytics/price/category/:name
 * Get price trend for a category.
 * Query: storeId, brand, state, days (default 30).
 */
router.get('/price/category/:name', async (req: Request, res: Response) => {
  try {
    const category = decodeURIComponent(req.params.name);
    const filters = {
      storeId: req.query.storeId ? parseInt(req.query.storeId as string, 10) : undefined,
      brandName: req.query.brand as string | undefined,
      state: req.query.state as string | undefined,
      days: req.query.days ? parseInt(req.query.days as string, 10) : 30,
    };
    const result = await priceService.getCategoryPriceTrend(category, filters);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Price category error:', error);
    res.status(500).json({ error: 'Failed to fetch category price trend' });
  }
});
/**
 * GET /api/analytics/price/summary
 * Get price summary statistics, optionally filtered by store/brand/category/state.
 */
router.get('/price/summary', async (req: Request, res: Response) => {
  try {
    const filters = {
      storeId: req.query.storeId ? parseInt(req.query.storeId as string, 10) : undefined,
      brandName: req.query.brand as string | undefined,
      category: req.query.category as string | undefined,
      state: req.query.state as string | undefined,
    };
    const result = await priceService.getPriceSummary(filters);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Price summary error:', error);
    res.status(500).json({ error: 'Failed to fetch price summary' });
  }
});
/**
 * GET /api/analytics/price/compression/:category
 * Get price compression analysis for a category (optionally scoped by ?state=).
 */
router.get('/price/compression/:category', async (req: Request, res: Response) => {
  try {
    const category = decodeURIComponent(req.params.category);
    const state = req.query.state as string | undefined;
    const result = await priceService.detectPriceCompression(category, state);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Price compression error:', error);
    res.status(500).json({ error: 'Failed to analyze price compression' });
  }
});
/**
 * GET /api/analytics/price/global
 * Get global price statistics (no filters).
 */
router.get('/price/global', async (_req: Request, res: Response) => {
  try {
    const result = await priceService.getGlobalPriceStats();
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Global price error:', error);
    res.status(500).json({ error: 'Failed to fetch global price stats' });
  }
});
// ============================================================
// PENETRATION ANALYTICS
// ============================================================
/**
 * GET /api/analytics/penetration/brand/:name
 * Get penetration data for a brand, optionally filtered by state/category.
 */
router.get('/penetration/brand/:name', async (req: Request, res: Response) => {
  try {
    const brandName = decodeURIComponent(req.params.name);
    const filters = {
      state: req.query.state as string | undefined,
      category: req.query.category as string | undefined,
    };
    const result = await penetrationService.getBrandPenetration(brandName, filters);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Brand penetration error:', error);
    res.status(500).json({ error: 'Failed to fetch brand penetration' });
  }
});
/**
 * GET /api/analytics/penetration/top
 * Get top brands by penetration.
 * Query: limit (default 20), state, category, minStores (default 2),
 * minSkus (default 5).
 */
router.get('/penetration/top', async (req: Request, res: Response) => {
  try {
    // Explicit radix on every parseInt (ESLint `radix`); behaviour is
    // otherwise unchanged.
    const limit = req.query.limit ? parseInt(req.query.limit as string, 10) : 20;
    const filters = {
      state: req.query.state as string | undefined,
      category: req.query.category as string | undefined,
      minStores: req.query.minStores ? parseInt(req.query.minStores as string, 10) : 2,
      minSkus: req.query.minSkus ? parseInt(req.query.minSkus as string, 10) : 5,
    };
    const result = await penetrationService.getTopBrandsByPenetration(limit, filters);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Top penetration error:', error);
    res.status(500).json({ error: 'Failed to fetch top brands' });
  }
});
/**
 * GET /api/analytics/penetration/trend/:brand
 * Get penetration trend for a brand over `days` days (default 30).
 */
router.get('/penetration/trend/:brand', async (req: Request, res: Response) => {
  try {
    const brandName = decodeURIComponent(req.params.brand);
    const days = req.query.days ? parseInt(req.query.days as string, 10) : 30;
    const result = await penetrationService.getPenetrationTrend(brandName, days);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Penetration trend error:', error);
    res.status(500).json({ error: 'Failed to fetch penetration trend' });
  }
});
/**
 * GET /api/analytics/penetration/shelf-share/:brand
 * Get shelf share by category for a brand.
 */
router.get('/penetration/shelf-share/:brand', async (req: Request, res: Response) => {
  try {
    const brandName = decodeURIComponent(req.params.brand);
    const result = await penetrationService.getShelfShareByCategory(brandName);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Shelf share error:', error);
    res.status(500).json({ error: 'Failed to fetch shelf share' });
  }
});
/**
 * GET /api/analytics/penetration/by-state/:brand
 * Get brand presence by state.
 */
router.get('/penetration/by-state/:brand', async (req: Request, res: Response) => {
  try {
    const brandName = decodeURIComponent(req.params.brand);
    const result = await penetrationService.getBrandPresenceByState(brandName);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Brand by state error:', error);
    res.status(500).json({ error: 'Failed to fetch brand presence by state' });
  }
});
/**
 * GET /api/analytics/penetration/stores/:brand
 * Get stores carrying a brand.
 */
router.get('/penetration/stores/:brand', async (req: Request, res: Response) => {
  try {
    const brandName = decodeURIComponent(req.params.brand);
    const result = await penetrationService.getStoresCarryingBrand(brandName);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Stores carrying brand error:', error);
    res.status(500).json({ error: 'Failed to fetch stores' });
  }
});
/**
 * GET /api/analytics/penetration/heatmap
 * Get penetration heatmap data, optionally scoped to one brand (?brand=).
 */
router.get('/penetration/heatmap', async (req: Request, res: Response) => {
  try {
    const brandName = req.query.brand as string | undefined;
    const result = await penetrationService.getPenetrationHeatmap(brandName);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Heatmap error:', error);
    res.status(500).json({ error: 'Failed to fetch heatmap data' });
  }
});
// ============================================================
// CATEGORY ANALYTICS
// ============================================================
/**
 * GET /api/analytics/category/summary
 * Get category summary (all categories when ?category is omitted).
 */
router.get('/category/summary', async (req: Request, res: Response) => {
  try {
    const category = req.query.category as string | undefined;
    const filters = {
      state: req.query.state as string | undefined,
      storeId: req.query.storeId ? parseInt(req.query.storeId as string, 10) : undefined,
    };
    const result = await categoryService.getCategorySummary(category, filters);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Category summary error:', error);
    res.status(500).json({ error: 'Failed to fetch category summary' });
  }
});
/**
 * GET /api/analytics/category/growth
 * Get category growth over `days` days (default 7).
 * Query: state, storeId, minSkus (default 10).
 */
router.get('/category/growth', async (req: Request, res: Response) => {
  try {
    // Explicit radix on every parseInt (ESLint `radix`); behaviour is
    // otherwise unchanged.
    const days = req.query.days ? parseInt(req.query.days as string, 10) : 7;
    const filters = {
      state: req.query.state as string | undefined,
      storeId: req.query.storeId ? parseInt(req.query.storeId as string, 10) : undefined,
      minSkus: req.query.minSkus ? parseInt(req.query.minSkus as string, 10) : 10,
    };
    const result = await categoryService.getCategoryGrowth(days, filters);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Category growth error:', error);
    res.status(500).json({ error: 'Failed to fetch category growth' });
  }
});
/**
 * GET /api/analytics/category/trend/:category
 * Get category growth trend over time (default 90 days).
 */
router.get('/category/trend/:category', async (req: Request, res: Response) => {
  try {
    const category = decodeURIComponent(req.params.category);
    const days = req.query.days ? parseInt(req.query.days as string, 10) : 90;
    const result = await categoryService.getCategoryGrowthTrend(category, days);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Category trend error:', error);
    res.status(500).json({ error: 'Failed to fetch category trend' });
  }
});
/**
 * GET /api/analytics/category/heatmap
 * Get category heatmap data.
 * Query: metric (skus|growth|price, default skus), periods (default 12).
 */
router.get('/category/heatmap', async (req: Request, res: Response) => {
  try {
    const metric = (req.query.metric as 'skus' | 'growth' | 'price') || 'skus';
    const periods = req.query.periods ? parseInt(req.query.periods as string, 10) : 12;
    const result = await categoryService.getCategoryHeatmap(metric, periods);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Category heatmap error:', error);
    res.status(500).json({ error: 'Failed to fetch heatmap' });
  }
});
/**
 * GET /api/analytics/category/top-movers
 * Get top growing and declining categories (default: top 5 over 30 days).
 */
router.get('/category/top-movers', async (req: Request, res: Response) => {
  try {
    const limit = req.query.limit ? parseInt(req.query.limit as string, 10) : 5;
    const days = req.query.days ? parseInt(req.query.days as string, 10) : 30;
    const result = await categoryService.getTopMovers(limit, days);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Top movers error:', error);
    res.status(500).json({ error: 'Failed to fetch top movers' });
  }
});
/**
 * GET /api/analytics/category/:category/subcategories
 * Get subcategory breakdown for one category.
 */
router.get('/category/:category/subcategories', async (req: Request, res: Response) => {
  try {
    const category = decodeURIComponent(req.params.category);
    const result = await categoryService.getSubcategoryBreakdown(category);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Subcategory error:', error);
    res.status(500).json({ error: 'Failed to fetch subcategories' });
  }
});
// ============================================================
// STORE CHANGE TRACKING
// ============================================================
/**
 * GET /api/analytics/store/:id/summary
 * Get change summary for a store; 404 when the store is unknown.
 */
router.get('/store/:id/summary', async (req: Request, res: Response) => {
  try {
    // Explicit radix on every parseInt (ESLint `radix`); behaviour is
    // otherwise unchanged.
    const storeId = parseInt(req.params.id, 10);
    const result = await storeService.getStoreChangeSummary(storeId);
    if (!result) {
      return res.status(404).json({ error: 'Store not found' });
    }
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Store summary error:', error);
    res.status(500).json({ error: 'Failed to fetch store summary' });
  }
});
/**
 * GET /api/analytics/store/:id/events
 * Get recent change events for a store.
 * Query: type (event type), days (default 30), limit (default 100).
 */
router.get('/store/:id/events', async (req: Request, res: Response) => {
  try {
    const storeId = parseInt(req.params.id, 10);
    const filters = {
      eventType: req.query.type as string | undefined,
      days: req.query.days ? parseInt(req.query.days as string, 10) : 30,
      limit: req.query.limit ? parseInt(req.query.limit as string, 10) : 100,
    };
    const result = await storeService.getStoreChangeEvents(storeId, filters);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Store events error:', error);
    res.status(500).json({ error: 'Failed to fetch store events' });
  }
});
/**
 * GET /api/analytics/store/:id/brands/new
 * Get new brands added to a store in the last `days` days (default 30).
 */
router.get('/store/:id/brands/new', async (req: Request, res: Response) => {
  try {
    const storeId = parseInt(req.params.id, 10);
    const days = req.query.days ? parseInt(req.query.days as string, 10) : 30;
    const result = await storeService.getNewBrands(storeId, days);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] New brands error:', error);
    res.status(500).json({ error: 'Failed to fetch new brands' });
  }
});
/**
 * GET /api/analytics/store/:id/brands/lost
 * Get brands lost from a store in the last `days` days (default 30).
 */
router.get('/store/:id/brands/lost', async (req: Request, res: Response) => {
  try {
    const storeId = parseInt(req.params.id, 10);
    const days = req.query.days ? parseInt(req.query.days as string, 10) : 30;
    const result = await storeService.getLostBrands(storeId, days);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Lost brands error:', error);
    res.status(500).json({ error: 'Failed to fetch lost brands' });
  }
});
/**
 * GET /api/analytics/store/:id/products/changes
 * Get product changes for a store.
 * Query: type (added|discontinued|price_drop|price_increase|restocked|out_of_stock),
 * days (default 7).
 */
router.get('/store/:id/products/changes', async (req: Request, res: Response) => {
  try {
    const storeId = parseInt(req.params.id, 10);
    const changeType = req.query.type as 'added' | 'discontinued' | 'price_drop' | 'price_increase' | 'restocked' | 'out_of_stock' | undefined;
    const days = req.query.days ? parseInt(req.query.days as string, 10) : 7;
    const result = await storeService.getProductChanges(storeId, changeType, days);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Product changes error:', error);
    res.status(500).json({ error: 'Failed to fetch product changes' });
  }
});
/**
 * GET /api/analytics/store/leaderboard/:category
 * Get category leaderboard across stores (default top 20).
 */
router.get('/store/leaderboard/:category', async (req: Request, res: Response) => {
  try {
    const category = decodeURIComponent(req.params.category);
    const limit = req.query.limit ? parseInt(req.query.limit as string, 10) : 20;
    const result = await storeService.getCategoryLeaderboard(category, limit);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Leaderboard error:', error);
    res.status(500).json({ error: 'Failed to fetch leaderboard' });
  }
});
/**
 * GET /api/analytics/store/most-active
 * Get most active stores by change volume (default: last 7 days, top 10).
 */
router.get('/store/most-active', async (req: Request, res: Response) => {
  try {
    const days = req.query.days ? parseInt(req.query.days as string, 10) : 7;
    const limit = req.query.limit ? parseInt(req.query.limit as string, 10) : 10;
    const result = await storeService.getMostActiveStores(days, limit);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Most active error:', error);
    res.status(500).json({ error: 'Failed to fetch active stores' });
  }
});
/**
 * GET /api/analytics/store/compare
 * Compare two stores. Both store1 and store2 query params are required ints.
 */
router.get('/store/compare', async (req: Request, res: Response) => {
  try {
    const store1 = parseInt(req.query.store1 as string, 10);
    const store2 = parseInt(req.query.store2 as string, 10);
    // Use Number.isNaN instead of truthiness so a store id of 0 is not
    // mistaken for a missing parameter (the old `!store1` check rejected 0).
    if (Number.isNaN(store1) || Number.isNaN(store2)) {
      return res.status(400).json({ error: 'Both store1 and store2 are required' });
    }
    const result = await storeService.compareStores(store1, store2);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Compare stores error:', error);
    res.status(500).json({ error: 'Failed to compare stores' });
  }
});
// ============================================================
// BRAND OPPORTUNITY / RISK
// ============================================================
/**
 * GET /api/analytics/brand/:name/opportunity
 * Full opportunity analysis for a brand (URL-encoded name in the path).
 */
router.get('/brand/:name/opportunity', async (req: Request, res: Response) => {
  try {
    const brand = decodeURIComponent(req.params.name);
    const analysis = await brandOpportunityService.getBrandOpportunity(brand);
    res.json(analysis);
  } catch (error) {
    console.error('[Analytics] Brand opportunity error:', error);
    res.status(500).json({ error: 'Failed to fetch brand opportunity' });
  }
});
/**
 * GET /api/analytics/brand/:name/position
 * Market position summary for a brand (URL-encoded name in the path).
 */
router.get('/brand/:name/position', async (req: Request, res: Response) => {
  try {
    const brand = decodeURIComponent(req.params.name);
    const summary = await brandOpportunityService.getMarketPositionSummary(brand);
    res.json(summary);
  } catch (error) {
    console.error('[Analytics] Brand position error:', error);
    res.status(500).json({ error: 'Failed to fetch brand position' });
  }
});
// ============================================================
// ALERTS
// ============================================================
/**
 * GET /api/analytics/alerts
 * Get analytics alerts.
 * Query: brand, storeId, type, unreadOnly ('true'), limit (default 50).
 */
router.get('/alerts', async (req: Request, res: Response) => {
  try {
    // Explicit radix on parseInt (ESLint `radix`); behaviour otherwise unchanged.
    const filters = {
      brandName: req.query.brand as string | undefined,
      storeId: req.query.storeId ? parseInt(req.query.storeId as string, 10) : undefined,
      alertType: req.query.type as string | undefined,
      unreadOnly: req.query.unreadOnly === 'true',
      limit: req.query.limit ? parseInt(req.query.limit as string, 10) : 50,
    };
    const result = await brandOpportunityService.getAlerts(filters);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Alerts error:', error);
    res.status(500).json({ error: 'Failed to fetch alerts' });
  }
});
/**
 * POST /api/analytics/alerts/mark-read
 * Mark alerts as read. Body: { alertIds: number[] }.
 * Responds 400 when alertIds is not an array.
 */
router.post('/alerts/mark-read', async (req: Request, res: Response) => {
  try {
    const { alertIds } = req.body;
    if (!Array.isArray(alertIds)) {
      return res.status(400).json({ error: 'alertIds must be an array' });
    }
    await brandOpportunityService.markAlertsRead(alertIds);
    res.json({ success: true });
  } catch (error) {
    console.error('[Analytics] Mark read error:', error);
    res.status(500).json({ error: 'Failed to mark alerts as read' });
  }
});
// ============================================================
// CACHE MANAGEMENT
// ============================================================
/**
 * GET /api/analytics/cache/stats
 * Report analytics-cache statistics.
 */
router.get('/cache/stats', async (_req: Request, res: Response) => {
  try {
    res.json(await cache.getStats());
  } catch (error) {
    console.error('[Analytics] Cache stats error:', error);
    res.status(500).json({ error: 'Failed to get cache stats' });
  }
});
/**
 * POST /api/analytics/cache/clear
 * Clear cache (admin only). With ?pattern=... matching entries are
 * invalidated; without a pattern only expired entries are purged.
 */
router.post('/cache/clear', async (req: Request, res: Response) => {
  try {
    const pattern = req.query.pattern as string | undefined;
    if (!pattern) {
      await cache.cleanExpired();
      res.json({ success: true, message: 'Expired entries cleaned' });
      return;
    }
    const cleared = await cache.invalidatePattern(pattern);
    res.json({ success: true, clearedCount: cleared });
  } catch (error) {
    console.error('[Analytics] Cache clear error:', error);
    res.status(500).json({ error: 'Failed to clear cache' });
  }
});
// ============================================================
// SNAPSHOT CAPTURE (for cron/scheduled jobs)
// ============================================================
/**
 * POST /api/analytics/snapshots/capture
 * Capture daily brand/category snapshots (run by the scheduler).
 * Both SQL snapshot functions run concurrently; the response reports how
 * many rows each one captured.
 */
router.post('/snapshots/capture', async (_req: Request, res: Response) => {
  try {
    const [brandResult, categoryResult] = await Promise.all([
      pool.query('SELECT capture_brand_snapshots() as count'),
      pool.query('SELECT capture_category_snapshots() as count'),
    ]);
    res.json({
      success: true,
      // Explicit radix (ESLint `radix`); counts may arrive as strings from pg.
      brandSnapshots: parseInt(brandResult.rows[0]?.count || '0', 10),
      categorySnapshots: parseInt(categoryResult.rows[0]?.count || '0', 10),
    });
  } catch (error) {
    console.error('[Analytics] Snapshot capture error:', error);
    res.status(500).json({ error: 'Failed to capture snapshots' });
  }
});
return router;
}

View File

@@ -21,12 +21,8 @@ import {
} from '../services/discovery';
import { crawlDispensaryProducts } from '../services/product-crawler';
// Explicit column list for dispensaries table (avoids SELECT * issues with schema differences)
const DISPENSARY_COLUMNS = `
id, name, dba_name, slug, city, state, zip, address, latitude, longitude,
menu_type, menu_url, platform_dispensary_id, website,
provider_detection_data, created_at, updated_at
`;
// Use shared dispensary columns (handles optional columns like provider_detection_data)
import { DISPENSARY_COLUMNS_WITH_PROFILE as DISPENSARY_COLUMNS } from '../db/dispensary-columns';
import {
startScheduler,
stopScheduler,
@@ -43,6 +39,7 @@ import {
getRunLogs,
} from '../services/scheduler';
import { StockStatus } from '../types';
import { getProviderDisplayName } from '../../utils/provider-display';
const router = Router();
@@ -113,9 +110,17 @@ router.get('/stores', async (req: Request, res: Response) => {
const { rows, rowCount } = await query(
`
SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
SELECT ${DISPENSARY_COLUMNS},
(SELECT COUNT(*) FROM dutchie_products WHERE dispensary_id = dispensaries.id) as product_count,
dcp.status as crawler_status,
dcp.profile_key as crawler_profile_key,
dcp.next_retry_at,
dcp.sandbox_attempt_count
FROM dispensaries
LEFT JOIN dispensary_crawler_profiles dcp
ON dcp.dispensary_id = dispensaries.id AND dcp.enabled = true
${whereClause}
ORDER BY name
ORDER BY dispensaries.name
LIMIT $${paramIndex} OFFSET $${paramIndex + 1}
`,
params
@@ -127,8 +132,15 @@ router.get('/stores', async (req: Request, res: Response) => {
params.slice(0, -2)
);
// Transform stores to include provider_display
const transformedStores = rows.map((store: any) => ({
...store,
provider_raw: store.menu_type,
provider_display: getProviderDisplayName(store.menu_type),
}));
res.json({
stores: rows,
stores: transformedStores,
total: parseInt(countRows[0]?.total || '0', 10),
limit: parseInt(limit as string, 10),
offset: parseInt(offset as string, 10),
@@ -780,7 +792,7 @@ router.get('/products/:id/availability', async (req: Request, res: Response) =>
)
SELECT
d.id as dispensary_id,
COALESCE(d.dba_name, d.name) as dispensary_name,
d.name as dispensary_name,
d.city,
d.state,
d.address,
@@ -1042,8 +1054,12 @@ router.post('/admin/scheduler/trigger', async (_req: Request, res: Response) =>
});
/**
* POST /api/dutchie-az/admin/crawl/:id
* POST /api/az/admin/crawl/:id
* Crawl a single dispensary with job tracking
*
* @deprecated Use POST /api/admin/crawl/:dispensaryId instead.
* This route is kept for backward compatibility only.
* The canonical crawl endpoint is now /api/admin/crawl/:dispensaryId
*/
router.post('/admin/crawl/:id', async (req: Request, res: Response) => {
try {
@@ -1075,7 +1091,6 @@ router.get('/admin/dutchie-stores', async (_req: Request, res: Response) => {
SELECT
d.id,
d.name,
d.dba_name,
d.city,
d.state,
d.menu_type,
@@ -1113,7 +1128,7 @@ router.get('/admin/dutchie-stores', async (_req: Request, res: Response) => {
failed: failed.length,
stores: rows.map((r: any) => ({
id: r.id,
name: r.dba_name || r.name,
name: r.name,
city: r.city,
state: r.state,
menuType: r.menu_type,
@@ -1688,6 +1703,7 @@ import {
router.get('/monitor/active-jobs', async (_req: Request, res: Response) => {
try {
// Get running jobs from job_run_logs (scheduled jobs like "enqueue all")
// Includes worker_name and run_role for named workforce display
const { rows: runningScheduledJobs } = await query<any>(`
SELECT
jrl.id,
@@ -1699,7 +1715,11 @@ router.get('/monitor/active-jobs', async (_req: Request, res: Response) => {
jrl.items_succeeded,
jrl.items_failed,
jrl.metadata,
jrl.worker_name,
jrl.run_role,
js.description as job_description,
js.worker_name as schedule_worker_name,
js.worker_role as schedule_worker_role,
EXTRACT(EPOCH FROM (NOW() - jrl.started_at)) as duration_seconds
FROM job_run_logs jrl
LEFT JOIN job_schedules js ON jrl.schedule_id = js.id
@@ -1708,7 +1728,7 @@ router.get('/monitor/active-jobs', async (_req: Request, res: Response) => {
`);
// Get running crawl jobs (individual store crawls with worker info)
// Note: Use COALESCE for optional columns that may not exist in older schemas
// Includes enqueued_by_worker for tracking which named worker enqueued the job
const { rows: runningCrawlJobs } = await query<any>(`
SELECT
cj.id,
@@ -1722,6 +1742,7 @@ router.get('/monitor/active-jobs', async (_req: Request, res: Response) => {
cj.claimed_by as worker_id,
cj.worker_hostname,
cj.claimed_at,
cj.enqueued_by_worker,
cj.products_found,
cj.products_upserted,
cj.snapshots_created,
@@ -1792,14 +1813,18 @@ router.get('/monitor/recent-jobs', async (req: Request, res: Response) => {
jrl.items_succeeded,
jrl.items_failed,
jrl.metadata,
js.description as job_description
jrl.worker_name,
jrl.run_role,
js.description as job_description,
js.worker_name as schedule_worker_name,
js.worker_role as schedule_worker_role
FROM job_run_logs jrl
LEFT JOIN job_schedules js ON jrl.schedule_id = js.id
ORDER BY jrl.created_at DESC
LIMIT $1
`, [limitNum]);
// Recent crawl jobs
// Recent crawl jobs (includes enqueued_by_worker for named workforce tracking)
const { rows: recentCrawlJobs } = await query<any>(`
SELECT
cj.id,
@@ -1814,6 +1839,7 @@ router.get('/monitor/recent-jobs', async (req: Request, res: Response) => {
cj.products_found,
cj.snapshots_created,
cj.metadata,
cj.enqueued_by_worker,
EXTRACT(EPOCH FROM (COALESCE(cj.completed_at, NOW()) - cj.started_at)) * 1000 as duration_ms
FROM dispensary_crawl_jobs cj
LEFT JOIN dispensaries d ON cj.dispensary_id = d.id
@@ -1912,12 +1938,14 @@ router.get('/monitor/summary', async (_req: Request, res: Response) => {
(SELECT MAX(completed_at) FROM job_run_logs WHERE status = 'success') as last_job_completed
`);
// Get next scheduled runs
// Get next scheduled runs (with worker names)
const { rows: nextRuns } = await query<any>(`
SELECT
id,
job_name,
description,
worker_name,
worker_role,
enabled,
next_run_at,
last_status,
@@ -2034,6 +2062,189 @@ router.post('/admin/detection/trigger', async (_req: Request, res: Response) =>
}
});
// ============================================================
// CRAWLER RELIABILITY / HEALTH ENDPOINTS (Phase 1)
// ============================================================
/**
 * GET /api/dutchie-az/admin/crawler/health
 * Get overall crawler health metrics.
 *
 * Reads the aggregate view `v_crawl_health` (created by migration 046).
 * Returns a single row of counters; if the view has no rows, or does not
 * exist yet, a zeroed payload is returned so the dashboard renders cleanly.
 */
router.get('/admin/crawler/health', async (_req: Request, res: Response) => {
  try {
    const { rows } = await query<any>(`SELECT * FROM v_crawl_health`);
    // Empty view (no data yet) -> zeroed defaults with the full field set.
    res.json(rows[0] || {
      active_crawlers: 0,
      degraded_crawlers: 0,
      paused_crawlers: 0,
      failed_crawlers: 0,
      due_now: 0,
      stores_with_failures: 0,
      avg_consecutive_failures: 0,
      successful_last_24h: 0,
    });
  } catch (error: any) {
    // View might not exist yet (pre-migration deployment): respond 200 with
    // zeroes plus a hint, rather than erroring the whole dashboard.
    res.json({
      active_crawlers: 0,
      degraded_crawlers: 0,
      paused_crawlers: 0,
      failed_crawlers: 0,
      due_now: 0,
      error: 'View not available - run migration 046',
    });
  }
});
/**
 * GET /api/dutchie-az/admin/crawler/error-summary
 * Error counts grouped by error code over the last 7 days.
 *
 * Backed by the `v_crawl_error_summary` view (migration 046); degrades to an
 * empty list with a hint when the view is missing.
 */
router.get('/admin/crawler/error-summary', async (_req: Request, res: Response) => {
  try {
    const result = await query<any>(`SELECT * FROM v_crawl_error_summary`);
    res.json({ errors: result.rows });
  } catch {
    res.json({ errors: [], error: 'View not available - run migration 046' });
  }
});
/**
 * GET /api/dutchie-az/admin/crawler/status
 * Get detailed status for all crawlers.
 *
 * Query params:
 *   - status: optional filter on `crawl_status` (e.g. active/degraded/failed)
 *   - limit / offset: pagination (defaults 100 / 0)
 *
 * Results come from the `v_crawler_status` view and are ordered with the
 * most-failing stores first so problems surface at the top of the list.
 */
router.get('/admin/crawler/status', async (req: Request, res: Response) => {
  try {
    const { status, limit = '100', offset = '0' } = req.query;
    // Build the WHERE clause and positional params together so the
    // placeholder numbers stay in sync.
    let whereClause = '';
    const params: any[] = [];
    let paramIndex = 1;
    if (status) {
      whereClause = `WHERE crawl_status = $${paramIndex}`;
      params.push(status);
      paramIndex++;
    }
    params.push(parseInt(limit as string, 10), parseInt(offset as string, 10));
    const { rows } = await query<any>(
      `SELECT * FROM v_crawler_status
       ${whereClause}
       ORDER BY consecutive_failures DESC, name ASC
       LIMIT $${paramIndex} OFFSET $${paramIndex + 1}`,
      params
    );
    // Total count reuses the filter params but drops the trailing
    // limit/offset pair (hence slice(0, -2)).
    const { rows: countRows } = await query<any>(
      `SELECT COUNT(*) as total FROM v_crawler_status ${whereClause}`,
      params.slice(0, -2)
    );
    res.json({
      stores: rows,
      total: parseInt(countRows[0]?.total || '0', 10),
      limit: parseInt(limit as string, 10),
      offset: parseInt(offset as string, 10),
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});
/**
 * GET /api/dutchie-az/admin/crawler/attempts
 * Get recent crawl attempts (for debugging).
 *
 * Query params:
 *   - dispensaryId: optional numeric filter on the attempting store
 *   - errorCode: optional filter on the classified error code
 *   - limit / offset: pagination (defaults 50 / 0)
 *
 * Joins the dispensary name/city for display; ordered newest-first.
 */
router.get('/admin/crawler/attempts', async (req: Request, res: Response) => {
  try {
    const { dispensaryId, errorCode, limit = '50', offset = '0' } = req.query;
    // Start with an always-true predicate so optional filters can be
    // appended uniformly with AND.
    let whereClause = 'WHERE 1=1';
    const params: any[] = [];
    let paramIndex = 1;
    if (dispensaryId) {
      whereClause += ` AND ca.dispensary_id = $${paramIndex}`;
      params.push(parseInt(dispensaryId as string, 10));
      paramIndex++;
    }
    if (errorCode) {
      whereClause += ` AND ca.error_code = $${paramIndex}`;
      params.push(errorCode);
      paramIndex++;
    }
    params.push(parseInt(limit as string, 10), parseInt(offset as string, 10));
    const { rows } = await query<any>(
      `SELECT
        ca.*,
        d.name as dispensary_name,
        d.city
      FROM crawl_attempts ca
      LEFT JOIN dispensaries d ON ca.dispensary_id = d.id
      ${whereClause}
      ORDER BY ca.started_at DESC
      LIMIT $${paramIndex} OFFSET $${paramIndex + 1}`,
      params
    );
    res.json({ attempts: rows });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});
/**
 * POST /api/dutchie-az/admin/dispensaries/:id/pause
 * Pause crawling for a dispensary.
 *
 * Clearing `next_crawl_at` removes the store from the scheduler's due queue
 * until an explicit resume.
 */
router.post('/admin/dispensaries/:id/pause', async (req: Request, res: Response) => {
  const { id } = req.params;
  try {
    await query(
      `
      UPDATE dispensaries
      SET crawl_status = 'paused',
          next_crawl_at = NULL,
          updated_at = NOW()
      WHERE id = $1
    `,
      [id]
    );
    res.json({ success: true, message: `Crawling paused for dispensary ${id}` });
  } catch (err: any) {
    res.status(500).json({ error: err.message });
  }
});
/**
 * POST /api/dutchie-az/admin/dispensaries/:id/resume
 * Resume crawling for a paused/degraded dispensary.
 *
 * Resets the failure counters and backoff multiplier, and schedules the next
 * crawl a few minutes out so the store re-enters the queue promptly.
 */
router.post('/admin/dispensaries/:id/resume', async (req: Request, res: Response) => {
  const { id } = req.params;
  try {
    // Reset to active and schedule next crawl
    await query(
      `
      UPDATE dispensaries
      SET crawl_status = 'active',
          consecutive_failures = 0,
          backoff_multiplier = 1.0,
          next_crawl_at = NOW() + INTERVAL '5 minutes',
          updated_at = NOW()
      WHERE id = $1
    `,
      [id]
    );
    res.json({ success: true, message: `Crawling resumed for dispensary ${id}` });
  } catch (err: any) {
    res.status(500).json({ error: err.message });
  }
});
// ============================================================
// FAILED DISPENSARIES ROUTES
// ============================================================
@@ -2183,4 +2394,251 @@ router.get('/admin/dispensaries/health-summary', async (_req: Request, res: Resp
}
});
// ============================================================
// ORCHESTRATOR TRACE ROUTES
// ============================================================
import {
getLatestTrace,
getTraceById,
getTracesForDispensary,
getTraceByRunId,
} from '../../services/orchestrator-trace';
/**
 * GET /api/dutchie-az/admin/dispensaries/:id/crawl-trace/latest
 * Get the latest orchestrator trace for a dispensary.
 *
 * Responds 400 for a non-numeric id, 404 when the store has no traces.
 */
router.get('/admin/dispensaries/:id/crawl-trace/latest', async (req: Request, res: Response) => {
  try {
    const dispensaryId = parseInt(req.params.id, 10);
    // Guard: a non-numeric :id would otherwise propagate NaN into the data
    // layer and surface as an opaque 500.
    if (Number.isNaN(dispensaryId)) {
      return res.status(400).json({ error: 'Invalid dispensary id' });
    }
    const trace = await getLatestTrace(dispensaryId);
    if (!trace) {
      return res.status(404).json({ error: 'No trace found for this dispensary' });
    }
    res.json(trace);
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});
/**
 * GET /api/dutchie-az/admin/dispensaries/:id/crawl-traces
 * Get paginated list of orchestrator traces for a dispensary.
 *
 * Query params:
 *   - limit / offset: pagination (defaults 20 / 0)
 *
 * Pagination and shaping are delegated to getTracesForDispensary.
 */
router.get('/admin/dispensaries/:id/crawl-traces', async (req: Request, res: Response) => {
  try {
    const { id } = req.params;
    const { limit = '20', offset = '0' } = req.query;
    const result = await getTracesForDispensary(
      parseInt(id, 10),
      parseInt(limit as string, 10),
      parseInt(offset as string, 10)
    );
    res.json(result);
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});
/**
 * GET /api/dutchie-az/admin/crawl-traces/:traceId
 * Get a specific orchestrator trace by its numeric ID.
 *
 * Responds 404 when no trace with that ID exists.
 */
router.get('/admin/crawl-traces/:traceId', async (req: Request, res: Response) => {
  try {
    const { traceId } = req.params;
    const trace = await getTraceById(parseInt(traceId, 10));
    if (!trace) {
      return res.status(404).json({ error: 'Trace not found' });
    }
    res.json(trace);
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});
/**
 * GET /api/dutchie-az/admin/crawl-traces/run/:runId
 * Get a specific orchestrator trace by its opaque run identifier.
 *
 * Responds 404 when no trace exists for the run ID.
 */
router.get('/admin/crawl-traces/run/:runId', async (req: Request, res: Response) => {
  try {
    const trace = await getTraceByRunId(req.params.runId);
    if (trace) {
      res.json(trace);
    } else {
      res.status(404).json({ error: 'Trace not found for this run ID' });
    }
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});
// ============================================================
// SCRAPER OVERVIEW DASHBOARD ENDPOINTS
// ============================================================
/**
 * GET /api/dutchie-az/scraper/overview
 * Comprehensive scraper overview for the new dashboard.
 *
 * Aggregates six independent queries into a single payload:
 *   1. KPI counters (products, dispensaries, visibility, jobs, workers)
 *   2. Enabled worker schedules
 *   3. Hourly scrape activity over the last 24h
 *   4. New-product growth over the last 7 days
 *   5. The 20 most recent worker runs
 *   6. Stores with recent visibility gains/losses
 *
 * NOTE(review): queries run sequentially; they are independent and could be
 * parallelized with Promise.all if this endpoint becomes hot.
 */
router.get('/scraper/overview', async (_req: Request, res: Response) => {
  try {
    // 1. Core KPI metrics (single row of scalar subqueries)
    const { rows: kpiRows } = await query<any>(`
      SELECT
        -- Total products
        (SELECT COUNT(*) FROM dutchie_products) AS total_products,
        (SELECT COUNT(*) FROM dutchie_products WHERE stock_status = 'in_stock') AS in_stock_products,
        -- Total dispensaries
        (SELECT COUNT(*) FROM dispensaries WHERE menu_type = 'dutchie' AND state = 'AZ') AS total_dispensaries,
        (SELECT COUNT(*) FROM dispensaries WHERE menu_type = 'dutchie' AND state = 'AZ' AND platform_dispensary_id IS NOT NULL) AS crawlable_dispensaries,
        -- Visibility stats (24h)
        (SELECT COUNT(*) FROM dutchie_products WHERE visibility_lost = true AND visibility_lost_at > NOW() - INTERVAL '24 hours') AS visibility_lost_24h,
        (SELECT COUNT(*) FROM dutchie_products WHERE visibility_restored_at > NOW() - INTERVAL '24 hours') AS visibility_restored_24h,
        (SELECT COUNT(*) FROM dutchie_products WHERE visibility_lost = true) AS total_visibility_lost,
        -- Job stats (24h)
        (SELECT COUNT(*) FROM job_run_logs WHERE status IN ('error', 'partial') AND created_at > NOW() - INTERVAL '24 hours') AS errors_24h,
        (SELECT COUNT(*) FROM job_run_logs WHERE status = 'success' AND created_at > NOW() - INTERVAL '24 hours') AS successful_jobs_24h,
        -- Active workers
        (SELECT COUNT(*) FROM job_schedules WHERE enabled = true) AS active_workers
    `);
    // 2. Get active worker names (soonest next run first)
    const { rows: workerRows } = await query<any>(`
      SELECT worker_name, worker_role, enabled, last_status, last_run_at, next_run_at
      FROM job_schedules
      WHERE enabled = true
      ORDER BY next_run_at ASC NULLS LAST
    `);
    // 3. Scrape activity by hour (last 24h)
    const { rows: activityRows } = await query<any>(`
      SELECT
        date_trunc('hour', started_at) AS hour,
        COUNT(*) FILTER (WHERE status = 'success') AS successful,
        COUNT(*) FILTER (WHERE status IN ('error', 'partial')) AS failed,
        COUNT(*) AS total
      FROM job_run_logs
      WHERE started_at > NOW() - INTERVAL '24 hours'
      GROUP BY date_trunc('hour', started_at)
      ORDER BY hour ASC
    `);
    // 4. Product growth / coverage (last 7 days)
    const { rows: growthRows } = await query<any>(`
      SELECT
        date_trunc('day', created_at) AS day,
        COUNT(*) AS new_products
      FROM dutchie_products
      WHERE created_at > NOW() - INTERVAL '7 days'
      GROUP BY date_trunc('day', created_at)
      ORDER BY day ASC
    `);
    // 5. Recent worker runs (last 20), with worker identity from the schedule
    const { rows: recentRuns } = await query<any>(`
      SELECT
        jrl.id,
        jrl.job_name,
        jrl.status,
        jrl.started_at,
        jrl.completed_at,
        jrl.items_processed,
        jrl.items_succeeded,
        jrl.items_failed,
        jrl.metadata,
        js.worker_name,
        js.worker_role
      FROM job_run_logs jrl
      LEFT JOIN job_schedules js ON jrl.schedule_id = js.id
      ORDER BY jrl.started_at DESC
      LIMIT 20
    `);
    // 6. Recent visibility changes by store (only stores with any change in 24h)
    const { rows: visibilityChanges } = await query<any>(`
      SELECT
        d.id AS dispensary_id,
        d.name AS dispensary_name,
        d.state,
        COUNT(dp.id) FILTER (WHERE dp.visibility_lost = true AND dp.visibility_lost_at > NOW() - INTERVAL '24 hours') AS lost_24h,
        COUNT(dp.id) FILTER (WHERE dp.visibility_restored_at > NOW() - INTERVAL '24 hours') AS restored_24h,
        MAX(dp.visibility_lost_at) AS latest_loss,
        MAX(dp.visibility_restored_at) AS latest_restore
      FROM dispensaries d
      LEFT JOIN dutchie_products dp ON d.id = dp.dispensary_id
      WHERE d.menu_type = 'dutchie'
      GROUP BY d.id, d.name, d.state
      HAVING COUNT(dp.id) FILTER (WHERE dp.visibility_lost = true AND dp.visibility_lost_at > NOW() - INTERVAL '24 hours') > 0
          OR COUNT(dp.id) FILTER (WHERE dp.visibility_restored_at > NOW() - INTERVAL '24 hours') > 0
      ORDER BY lost_24h DESC, restored_24h DESC
      LIMIT 15
    `);
    const kpi = kpiRows[0] || {};
    // COUNT(*) comes back from pg as a string, hence the parseInt coercion.
    res.json({
      kpi: {
        totalProducts: parseInt(kpi.total_products || '0'),
        inStockProducts: parseInt(kpi.in_stock_products || '0'),
        totalDispensaries: parseInt(kpi.total_dispensaries || '0'),
        crawlableDispensaries: parseInt(kpi.crawlable_dispensaries || '0'),
        visibilityLost24h: parseInt(kpi.visibility_lost_24h || '0'),
        visibilityRestored24h: parseInt(kpi.visibility_restored_24h || '0'),
        totalVisibilityLost: parseInt(kpi.total_visibility_lost || '0'),
        errors24h: parseInt(kpi.errors_24h || '0'),
        successfulJobs24h: parseInt(kpi.successful_jobs_24h || '0'),
        activeWorkers: parseInt(kpi.active_workers || '0'),
      },
      workers: workerRows,
      activityByHour: activityRows.map((row: any) => ({
        hour: row.hour,
        successful: parseInt(row.successful || '0'),
        failed: parseInt(row.failed || '0'),
        total: parseInt(row.total || '0'),
      })),
      productGrowth: growthRows.map((row: any) => ({
        day: row.day,
        newProducts: parseInt(row.new_products || '0'),
      })),
      recentRuns: recentRuns.map((row: any) => ({
        id: row.id,
        jobName: row.job_name,
        status: row.status,
        startedAt: row.started_at,
        completedAt: row.completed_at,
        itemsProcessed: row.items_processed,
        itemsSucceeded: row.items_succeeded,
        itemsFailed: row.items_failed,
        workerName: row.worker_name,
        workerRole: row.worker_role,
        // Visibility deltas are recorded by the crawler in run metadata.
        visibilityLost: row.metadata?.visibilityLostCount || 0,
        visibilityRestored: row.metadata?.visibilityRestoredCount || 0,
      })),
      visibilityChanges: visibilityChanges.map((row: any) => ({
        dispensaryId: row.dispensary_id,
        dispensaryName: row.dispensary_name,
        state: row.state,
        lost24h: parseInt(row.lost_24h || '0'),
        restored24h: parseInt(row.restored_24h || '0'),
        latestLoss: row.latest_loss,
        latestRestore: row.latest_restore,
      })),
    });
  } catch (error: any) {
    console.error('Error fetching scraper overview:', error);
    res.status(500).json({ error: error.message });
  }
});
export default router;

View File

@@ -0,0 +1,486 @@
#!/usr/bin/env npx tsx
/**
* Crawler Reliability Stress Test
*
* Simulates various failure scenarios to test:
* - Retry logic with exponential backoff
* - Error taxonomy classification
* - Self-healing (proxy/UA rotation)
* - Status transitions (active -> degraded -> failed)
* - Minimum crawl gap enforcement
*
* Phase 1: Crawler Reliability & Stabilization
*
* Usage:
* DATABASE_URL="postgresql://..." npx tsx src/dutchie-az/scripts/stress-test.ts [test-name]
*
* Available tests:
* retry - Test retry manager with various error types
* backoff - Test exponential backoff calculation
* status - Test status transitions
* gap - Test minimum crawl gap enforcement
* rotation - Test proxy/UA rotation
* all - Run all tests
*/
import {
CrawlErrorCode,
classifyError,
isRetryable,
shouldRotateProxy,
shouldRotateUserAgent,
getBackoffMultiplier,
getErrorMetadata,
} from '../services/error-taxonomy';
import {
RetryManager,
withRetry,
calculateNextCrawlDelay,
calculateNextCrawlAt,
determineCrawlStatus,
shouldAttemptRecovery,
sleep,
} from '../services/retry-manager';
import {
UserAgentRotator,
USER_AGENTS,
} from '../services/proxy-rotator';
import {
validateStoreConfig,
isCrawlable,
DEFAULT_CONFIG,
RawStoreConfig,
} from '../services/store-validator';
// ============================================================
// TEST UTILITIES
// ============================================================

// Running tallies across all test sections; reported in the final summary.
let testsPassed = 0;
let testsFailed = 0;

/**
 * Record a single check result.
 *
 * Fix: both branches previously logged the identical `${message}` string,
 * making passes and failures indistinguishable in console output. Emit a
 * distinct marker per branch so failures can be spotted while scanning.
 */
function assert(condition: boolean, message: string): void {
  if (condition) {
    console.log(`  ✅ ${message}`);
    testsPassed++;
  } else {
    console.log(`  ❌ ${message}`);
    testsFailed++;
  }
}
/** Print a visual divider announcing the next group of checks. */
function section(name: string): void {
  const rule = '='.repeat(60);
  console.log(`\n${rule}`);
  console.log(`TEST: ${name}`);
  console.log(rule);
}
// ============================================================
// TEST: Error Classification
// ============================================================

/**
 * Verifies the error taxonomy: HTTP status -> code mapping, message-substring
 * classification, retryability flags, and proxy/UA rotation decisions.
 */
function testErrorClassification(): void {
  section('Error Classification');
  // HTTP status codes map directly to taxonomy codes.
  assert(classifyError(null, 429) === CrawlErrorCode.RATE_LIMITED, '429 -> RATE_LIMITED');
  assert(classifyError(null, 407) === CrawlErrorCode.BLOCKED_PROXY, '407 -> BLOCKED_PROXY');
  assert(classifyError(null, 401) === CrawlErrorCode.AUTH_FAILED, '401 -> AUTH_FAILED');
  assert(classifyError(null, 403) === CrawlErrorCode.AUTH_FAILED, '403 -> AUTH_FAILED');
  assert(classifyError(null, 503) === CrawlErrorCode.SERVICE_UNAVAILABLE, '503 -> SERVICE_UNAVAILABLE');
  assert(classifyError(null, 500) === CrawlErrorCode.SERVER_ERROR, '500 -> SERVER_ERROR');
  // Error messages classified by substring when no status code is given.
  assert(classifyError('rate limit exceeded') === CrawlErrorCode.RATE_LIMITED, 'rate limit message -> RATE_LIMITED');
  assert(classifyError('request timed out') === CrawlErrorCode.TIMEOUT, 'timeout message -> TIMEOUT');
  assert(classifyError('proxy blocked') === CrawlErrorCode.BLOCKED_PROXY, 'proxy blocked -> BLOCKED_PROXY');
  assert(classifyError('ECONNREFUSED') === CrawlErrorCode.NETWORK_ERROR, 'ECONNREFUSED -> NETWORK_ERROR');
  assert(classifyError('ENOTFOUND') === CrawlErrorCode.DNS_ERROR, 'ENOTFOUND -> DNS_ERROR');
  assert(classifyError('selector not found') === CrawlErrorCode.HTML_CHANGED, 'selector error -> HTML_CHANGED');
  assert(classifyError('JSON parse error') === CrawlErrorCode.PARSE_ERROR, 'parse error -> PARSE_ERROR');
  assert(classifyError('0 products found') === CrawlErrorCode.NO_PRODUCTS, 'no products -> NO_PRODUCTS');
  // Retryability: transient codes retry, structural ones do not.
  assert(isRetryable(CrawlErrorCode.RATE_LIMITED) === true, 'RATE_LIMITED is retryable');
  assert(isRetryable(CrawlErrorCode.TIMEOUT) === true, 'TIMEOUT is retryable');
  assert(isRetryable(CrawlErrorCode.HTML_CHANGED) === false, 'HTML_CHANGED is NOT retryable');
  assert(isRetryable(CrawlErrorCode.INVALID_CONFIG) === false, 'INVALID_CONFIG is NOT retryable');
  // Rotation decisions derived from the error code.
  assert(shouldRotateProxy(CrawlErrorCode.BLOCKED_PROXY) === true, 'BLOCKED_PROXY -> rotate proxy');
  assert(shouldRotateProxy(CrawlErrorCode.RATE_LIMITED) === true, 'RATE_LIMITED -> rotate proxy');
  assert(shouldRotateUserAgent(CrawlErrorCode.AUTH_FAILED) === true, 'AUTH_FAILED -> rotate UA');
}
// ============================================================
// TEST: Retry Manager
// ============================================================

/**
 * Exercises RetryManager lifecycle: attempt counting, retry decisions for
 * retryable vs non-retryable errors, exhaustion at maxRetries, and reset.
 */
function testRetryManager(): void {
  section('Retry Manager');
  const manager = new RetryManager({ maxRetries: 3, baseBackoffMs: 100 });
  // Initial state
  assert(manager.shouldAttempt() === true, 'Should attempt initially');
  assert(manager.getAttemptNumber() === 1, 'Attempt number starts at 1');
  // First attempt
  manager.recordAttempt();
  assert(manager.getAttemptNumber() === 2, 'Attempt number increments');
  // Evaluate retryable error: should retry, classify, and request rotation.
  const decision1 = manager.evaluateError(new Error('rate limit exceeded'), 429);
  assert(decision1.shouldRetry === true, 'Should retry on rate limit');
  assert(decision1.errorCode === CrawlErrorCode.RATE_LIMITED, 'Error code is RATE_LIMITED');
  assert(decision1.rotateProxy === true, 'Should rotate proxy');
  assert(decision1.backoffMs > 0, 'Backoff is positive');
  // More attempts
  manager.recordAttempt();
  manager.recordAttempt();
  // Now at max retries: the boundary attempt is still allowed to retry.
  const decision2 = manager.evaluateError(new Error('timeout'), 504);
  assert(decision2.shouldRetry === true, 'Should still retry (at limit but not exceeded)');
  manager.recordAttempt();
  const decision3 = manager.evaluateError(new Error('timeout'));
  assert(decision3.shouldRetry === false, 'Should NOT retry after max');
  assert(decision3.reason.includes('exhausted'), 'Reason mentions exhausted');
  // Reset restores the initial state.
  manager.reset();
  assert(manager.shouldAttempt() === true, 'Should attempt after reset');
  assert(manager.getAttemptNumber() === 1, 'Attempt number resets');
  // Non-retryable error stops immediately regardless of remaining budget.
  const manager2 = new RetryManager({ maxRetries: 3 });
  manager2.recordAttempt();
  const nonRetryable = manager2.evaluateError(new Error('HTML structure changed'));
  assert(nonRetryable.shouldRetry === false, 'Non-retryable error stops immediately');
  assert(nonRetryable.errorCode === CrawlErrorCode.HTML_CHANGED, 'Error code is HTML_CHANGED');
}
// ============================================================
// TEST: Exponential Backoff
// ============================================================

/**
 * Checks that crawl delay grows with consecutive failures, is capped at the
 * max multiplier, and that the computed next-crawl timestamp is in the
 * future and includes the backoff. Values include jitter, so assertions are
 * on ordering/bounds rather than exact numbers.
 */
function testExponentialBackoff(): void {
  section('Exponential Backoff');
  // Calculate next crawl delay (base frequency: 240 minutes)
  const delay0 = calculateNextCrawlDelay(0, 240); // No failures
  const delay1 = calculateNextCrawlDelay(1, 240); // 1 failure
  const delay2 = calculateNextCrawlDelay(2, 240); // 2 failures
  const delay3 = calculateNextCrawlDelay(3, 240); // 3 failures
  const delay5 = calculateNextCrawlDelay(5, 240); // 5 failures (should cap)
  console.log(` Delay with 0 failures: ${delay0} minutes`);
  console.log(` Delay with 1 failure: ${delay1} minutes`);
  console.log(` Delay with 2 failures: ${delay2} minutes`);
  console.log(` Delay with 3 failures: ${delay3} minutes`);
  console.log(` Delay with 5 failures: ${delay5} minutes`);
  assert(delay1 > delay0, 'Delay increases with failures');
  assert(delay2 > delay1, 'Delay keeps increasing');
  assert(delay3 > delay2, 'More delay with more failures');
  // With jitter, exact values vary but ratio should be close to 2x
  assert(delay5 <= 240 * 4 * 1.2, 'Delay is capped at max multiplier');
  // Next crawl time calculation
  const now = new Date();
  const nextAt = calculateNextCrawlAt(2, 240);
  assert(nextAt > now, 'Next crawl is in future');
  assert(nextAt.getTime() - now.getTime() > 240 * 60 * 1000, 'Includes backoff');
}
// ============================================================
// TEST: Status Transitions
// ============================================================

/**
 * Verifies the active -> degraded -> failed thresholds (default: degraded at
 * 3 consecutive failures, failed at 10), custom threshold support, and the
 * time-based recovery gate.
 */
function testStatusTransitions(): void {
  section('Status Transitions');
  // Active status (below the degraded threshold)
  assert(determineCrawlStatus(0) === 'active', '0 failures -> active');
  assert(determineCrawlStatus(1) === 'active', '1 failure -> active');
  assert(determineCrawlStatus(2) === 'active', '2 failures -> active');
  // Degraded status (3..9 failures by default)
  assert(determineCrawlStatus(3) === 'degraded', '3 failures -> degraded');
  assert(determineCrawlStatus(5) === 'degraded', '5 failures -> degraded');
  assert(determineCrawlStatus(9) === 'degraded', '9 failures -> degraded');
  // Failed status (10+ failures by default)
  assert(determineCrawlStatus(10) === 'failed', '10 failures -> failed');
  assert(determineCrawlStatus(15) === 'failed', '15 failures -> failed');
  // Custom thresholds override the defaults.
  const customStatus = determineCrawlStatus(5, { degraded: 5, failed: 8 });
  assert(customStatus === 'degraded', 'Custom threshold: 5 -> degraded');
  // Recovery check: only old failures are eligible for a recovery attempt.
  const recentFailure = new Date(Date.now() - 1 * 60 * 60 * 1000); // 1 hour ago
  const oldFailure = new Date(Date.now() - 48 * 60 * 60 * 1000); // 48 hours ago
  assert(shouldAttemptRecovery(recentFailure, 1) === false, 'No recovery for recent failure');
  assert(shouldAttemptRecovery(oldFailure, 1) === true, 'Recovery allowed for old failure');
  assert(shouldAttemptRecovery(null, 0) === true, 'Recovery allowed if no previous failure');
}
// ============================================================
// TEST: Store Validation
// ============================================================

/**
 * Verifies store config validation (required fields, menu type, slug
 * generation) and the isCrawlable gate (platform id present, status not
 * failed/paused).
 */
function testStoreValidation(): void {
  section('Store Validation');
  // Valid config passes and gets a generated slug.
  const validConfig: RawStoreConfig = {
    id: 1,
    name: 'Test Store',
    platformDispensaryId: '123abc',
    menuType: 'dutchie',
  };
  const validResult = validateStoreConfig(validConfig);
  assert(validResult.isValid === true, 'Valid config passes');
  assert(validResult.config !== null, 'Valid config returns config');
  assert(validResult.config?.slug === 'test-store', 'Slug is generated');
  // Missing required fields: id of 0 is treated as missing.
  const missingId: RawStoreConfig = {
    id: 0,
    name: 'Test',
    platformDispensaryId: '123',
    menuType: 'dutchie',
  };
  const missingIdResult = validateStoreConfig(missingId);
  assert(missingIdResult.isValid === false, 'Missing ID fails');
  // Missing platform ID
  const missingPlatform: RawStoreConfig = {
    id: 1,
    name: 'Test',
    menuType: 'dutchie',
  };
  const missingPlatformResult = validateStoreConfig(missingPlatform);
  assert(missingPlatformResult.isValid === false, 'Missing platform ID fails');
  // Unknown menu type
  const unknownMenu: RawStoreConfig = {
    id: 1,
    name: 'Test',
    platformDispensaryId: '123',
    menuType: 'unknown',
  };
  const unknownMenuResult = validateStoreConfig(unknownMenu);
  assert(unknownMenuResult.isValid === false, 'Unknown menu type fails');
  // Crawlable check combines config validity with crawl status.
  assert(isCrawlable(validConfig) === true, 'Valid config is crawlable');
  assert(isCrawlable(missingPlatform) === false, 'Missing platform not crawlable');
  assert(isCrawlable({ ...validConfig, crawlStatus: 'failed' }) === false, 'Failed status not crawlable');
  assert(isCrawlable({ ...validConfig, crawlStatus: 'paused' }) === false, 'Paused status not crawlable');
}
// ============================================================
// TEST: User Agent Rotation
// ============================================================

/**
 * Verifies UserAgentRotator cycles through distinct, valid user agents,
 * random selection stays within the pool, and the reported pool size
 * matches USER_AGENTS.
 */
function testUserAgentRotation(): void {
  section('User Agent Rotation');
  const rotator = new UserAgentRotator();
  const first = rotator.getCurrent();
  const second = rotator.getNext();
  const third = rotator.getNext();
  // Sequential rotation yields different agents from the known pool.
  assert(first !== second, 'User agents rotate');
  assert(second !== third, 'User agents keep rotating');
  assert(USER_AGENTS.includes(first), 'Returns valid UA');
  assert(USER_AGENTS.includes(second), 'Returns valid UA');
  // Random UA still comes from the pool.
  const random = rotator.getRandom();
  assert(USER_AGENTS.includes(random), 'Random returns valid UA');
  // Pool size is reported accurately.
  assert(rotator.getCount() === USER_AGENTS.length, 'Reports correct count');
}
// ============================================================
// TEST: WithRetry Helper
// ============================================================

/**
 * Exercises the withRetry wrapper end-to-end: immediate success, eventual
 * success after transient failures, RetryExhaustedError when the budget is
 * spent, and immediate abort on a non-retryable error.
 */
async function testWithRetryHelper(): Promise<void> {
  section('WithRetry Helper');
  // Successful on first try: exactly one invocation.
  let attempts = 0;
  const successResult = await withRetry(async () => {
    attempts++;
    return 'success';
  }, { maxRetries: 3 });
  assert(attempts === 1, 'Succeeds on first try');
  assert(successResult.result === 'success', 'Returns result');
  // Fails twice, then succeeds on the third invocation.
  let failThenSucceedAttempts = 0;
  const failThenSuccessResult = await withRetry(async () => {
    failThenSucceedAttempts++;
    if (failThenSucceedAttempts < 3) {
      throw new Error('temporary error');
    }
    return 'finally succeeded';
  }, { maxRetries: 5, baseBackoffMs: 10 });
  assert(failThenSucceedAttempts === 3, 'Retries until success');
  assert(failThenSuccessResult.result === 'finally succeeded', 'Returns final result');
  assert(failThenSuccessResult.summary.attemptsMade === 3, 'Summary tracks attempts');
  // Exhausts retries: every attempt fails, a typed error is thrown.
  let alwaysFailAttempts = 0;
  try {
    await withRetry(async () => {
      alwaysFailAttempts++;
      throw new Error('always fails');
    }, { maxRetries: 2, baseBackoffMs: 10 });
    assert(false, 'Should have thrown');
  } catch (error: any) {
    assert(alwaysFailAttempts === 3, 'Attempts all retries'); // 1 initial + 2 retries
    assert(error.name === 'RetryExhaustedError', 'Throws RetryExhaustedError');
  }
  // Non-retryable error (classified as HTML_CHANGED) stops immediately.
  let nonRetryableAttempts = 0;
  try {
    await withRetry(async () => {
      nonRetryableAttempts++;
      const err = new Error('HTML structure changed - selector not found');
      throw err;
    }, { maxRetries: 3, baseBackoffMs: 10 });
    assert(false, 'Should have thrown');
  } catch {
    assert(nonRetryableAttempts === 1, 'Non-retryable stops immediately');
  }
}
// ============================================================
// TEST: Minimum Crawl Gap
// ============================================================

/** Sanity-checks the default scheduling constants that drive gap enforcement. */
function testMinimumCrawlGap(): void {
  section('Minimum Crawl Gap');
  const { minCrawlGapMinutes, crawlFrequencyMinutes } = DEFAULT_CONFIG;
  // Default config values
  assert(minCrawlGapMinutes === 2, 'Default gap is 2 minutes');
  assert(crawlFrequencyMinutes === 240, 'Default frequency is 4 hours');
  // Gap expressed in milliseconds
  const millisecondsPerMinute = 60 * 1000;
  assert(minCrawlGapMinutes * millisecondsPerMinute === 120000, 'Gap is 2 minutes in ms');
  console.log(' Note: Gap enforcement is tested at DB level (trigger) and application level');
}
// ============================================================
// TEST: Error Metadata
// ============================================================

/**
 * Spot-checks getErrorMetadata for three representative codes: retryability,
 * proxy rotation, backoff multiplier, and severity.
 */
function testErrorMetadata(): void {
  section('Error Metadata');
  // RATE_LIMITED: retryable, rotates proxy, doubled backoff, medium severity.
  const rateLimited = getErrorMetadata(CrawlErrorCode.RATE_LIMITED);
  assert(rateLimited.retryable === true, 'RATE_LIMITED is retryable');
  assert(rateLimited.rotateProxy === true, 'RATE_LIMITED rotates proxy');
  assert(rateLimited.backoffMultiplier === 2.0, 'RATE_LIMITED has 2x backoff');
  assert(rateLimited.severity === 'medium', 'RATE_LIMITED is medium severity');
  // HTML_CHANGED: structural breakage, not retryable, high severity.
  const htmlChanged = getErrorMetadata(CrawlErrorCode.HTML_CHANGED);
  assert(htmlChanged.retryable === false, 'HTML_CHANGED is NOT retryable');
  assert(htmlChanged.severity === 'high', 'HTML_CHANGED is high severity');
  // INVALID_CONFIG: operator error, not retryable, critical severity.
  const invalidConfig = getErrorMetadata(CrawlErrorCode.INVALID_CONFIG);
  assert(invalidConfig.retryable === false, 'INVALID_CONFIG is NOT retryable');
  assert(invalidConfig.severity === 'critical', 'INVALID_CONFIG is critical');
}
// ============================================================
// MAIN
// ============================================================

/**
 * Dispatches to the selected test suite (or all suites when no name / "all"
 * is given), prints a pass/fail summary, and exits non-zero on any failure.
 *
 * Fix: an unrecognized test name previously ran zero suites and still
 * reported "ALL TESTS PASSED" with exit code 0; it now exits 1 with a hint.
 *
 * @param testName optional suite selector from the CLI (see file header)
 */
async function runTests(testName?: string): Promise<void> {
  console.log('\n');
  console.log('╔══════════════════════════════════════════════════════════╗');
  console.log('║ CRAWLER RELIABILITY STRESS TEST - PHASE 1 ║');
  console.log('╚══════════════════════════════════════════════════════════╝');
  const allTests = !testName || testName === 'all';
  if (allTests || testName === 'error' || testName === 'classification') {
    testErrorClassification();
  }
  if (allTests || testName === 'retry') {
    testRetryManager();
  }
  if (allTests || testName === 'backoff') {
    testExponentialBackoff();
  }
  if (allTests || testName === 'status') {
    testStatusTransitions();
  }
  if (allTests || testName === 'validation' || testName === 'store') {
    testStoreValidation();
  }
  if (allTests || testName === 'rotation' || testName === 'ua') {
    testUserAgentRotation();
  }
  if (allTests || testName === 'withRetry' || testName === 'helper') {
    await testWithRetryHelper();
  }
  if (allTests || testName === 'gap') {
    testMinimumCrawlGap();
  }
  if (allTests || testName === 'metadata') {
    testErrorMetadata();
  }
  // Guard against typos in the suite selector: zero recorded checks means
  // nothing ran, which must not count as success.
  if (testsPassed + testsFailed === 0) {
    console.error(`\n❌ Unknown test name: "${testName}" — see the file header for available tests.\n`);
    process.exit(1);
  }
  // Summary
  console.log('\n');
  console.log('═'.repeat(60));
  console.log('SUMMARY');
  console.log('═'.repeat(60));
  console.log(` Passed: ${testsPassed}`);
  console.log(` Failed: ${testsFailed}`);
  console.log(` Total: ${testsPassed + testsFailed}`);
  if (testsFailed > 0) {
    console.log('\n❌ SOME TESTS FAILED\n');
    process.exit(1);
  } else {
    console.log('\n✅ ALL TESTS PASSED\n');
    process.exit(0);
  }
}
// CLI entry: optional suite name is the first positional argument.
const testName = process.argv[2];
runTests(testName).catch((error) => {
  // Unexpected (non-assertion) failure: report and exit non-zero.
  console.error('Fatal error:', error);
  process.exit(1);
});

View File

@@ -0,0 +1,659 @@
/**
* Brand Opportunity / Risk Analytics Service
*
* Provides brand-level opportunity and risk analysis including:
* - Under/overpriced vs market
* - Missing SKU opportunities
* - Stores with declining/growing shelf share
* - Competitor intrusion alerts
*
* Phase 3: Analytics Dashboards
*/
import { Pool } from 'pg';
import { AnalyticsCache, cacheKey } from './cache';
/** Full opportunity/risk report for a single brand (see BrandOpportunityService.getBrandOpportunity). */
export interface BrandOpportunity {
  brandName: string;
  /** Categories priced ≥10% below market — room to raise prices. */
  underpricedVsMarket: PricePosition[];
  /** Categories priced ≥15% above market — price-sensitivity risk. */
  overpricedVsMarket: PricePosition[];
  /** Categories where the brand holds <5% of the market's SKU count. */
  missingSkuOpportunities: MissingSkuOpportunity[];
  storesWithDecliningShelfShare: StoreShelfShareChange[];
  storesWithGrowingShelfShare: StoreShelfShareChange[];
  competitorIntrusionAlerts: CompetitorAlert[];
  overallScore: number; // 0-100, higher = more opportunity
  riskScore: number; // 0-100, higher = more risk
}
/** Brand-vs-market average price comparison within one category. */
export interface PricePosition {
  category: string;
  brandAvgPrice: number;
  marketAvgPrice: number;
  /** Signed: negative = below market, positive = above. */
  priceDifferencePercent: number;
  skuCount: number;
  /** Human-readable pricing recommendation. */
  suggestion: string;
}
/** A category/subcategory where the brand is under-represented. */
export interface MissingSkuOpportunity {
  category: string;
  subcategory: string | null;
  marketSkuCount: number;
  brandSkuCount: number;
  gapPercent: number;
  topCompetitors: string[];
  opportunityScore: number; // 0-100
}
/** Per-store shelf-share snapshot (previous/change are placeholders until historical data lands). */
export interface StoreShelfShareChange {
  storeId: number;
  storeName: string;
  city: string;
  state: string;
  currentShelfShare: number;
  previousShelfShare: number;
  changePercent: number;
  currentSkus: number;
  competitors: string[];
}
/** A competitor-activity alert in a store where this brand is present. */
export interface CompetitorAlert {
  competitorBrand: string;
  storeId: number;
  storeName: string;
  alertType: 'new_entry' | 'expanding' | 'price_undercut';
  details: string;
  severity: 'low' | 'medium' | 'high';
  /** ISO date (YYYY-MM-DD) of the latest competitor activity. */
  date: string;
}
/** High-level market standing for a brand (see getMarketPositionSummary). */
export interface MarketPositionSummary {
  brandName: string;
  marketSharePercent: number;
  avgPriceVsMarket: number; // -X% to +X%
  categoryStrengths: Array<{ category: string; shelfSharePercent: number }>;
  categoryWeaknesses: Array<{ category: string; shelfSharePercent: number; marketLeader: string }>;
  growthTrend: 'growing' | 'stable' | 'declining';
  competitorThreats: string[];
}
/**
 * Brand-level opportunity / risk analytics over the dutchie_products table.
 *
 * The composite endpoints (getBrandOpportunity, getMarketPositionSummary)
 * are cached for 30 minutes via AnalyticsCache; the per-facet queries hit
 * the database directly. NOTE(review): "previous" shelf-share values and
 * the growth trend are placeholders until historical snapshots are wired
 * in — see the inline comments below.
 */
export class BrandOpportunityService {
  private pool: Pool;            // pg connection pool for analytics queries
  private cache: AnalyticsCache; // shared two-tier (memory + DB) cache

  constructor(pool: Pool, cache: AnalyticsCache) {
    this.pool = pool;
    this.cache = cache;
  }

  /**
   * Get full opportunity analysis for a brand.
   *
   * Fans out to the six facet queries in parallel, then folds them into
   * heuristic 0-100 opportunity and risk scores. Cached for 30 minutes.
   */
  async getBrandOpportunity(brandName: string): Promise<BrandOpportunity> {
    const key = cacheKey('brand_opportunity', { brandName });
    return (await this.cache.getOrCompute(key, async () => {
      const [
        underpriced,
        overpriced,
        missingSkus,
        decliningStores,
        growingStores,
        alerts,
      ] = await Promise.all([
        this.getUnderpricedPositions(brandName),
        this.getOverpricedPositions(brandName),
        this.getMissingSkuOpportunities(brandName),
        this.getStoresWithDecliningShare(brandName),
        this.getStoresWithGrowingShare(brandName),
        this.getCompetitorAlerts(brandName),
      ]);
      // Calculate opportunity score (higher = more opportunity).
      // Heuristic weights: flat bonuses for any SKU gap / underpricing,
      // up to 20 pts for store growth, plus 30% of the mean gap score.
      const opportunityFactors = [
        missingSkus.length > 0 ? 20 : 0,
        underpriced.length > 0 ? 15 : 0,
        growingStores.length > 5 ? 20 : growingStores.length * 3,
        missingSkus.reduce((sum, m) => sum + m.opportunityScore, 0) / Math.max(1, missingSkus.length) * 0.3,
      ];
      const opportunityScore = Math.min(100, opportunityFactors.reduce((a, b) => a + b, 0));
      // Calculate risk score (higher = more risk); capped at 100.
      const riskFactors = [
        decliningStores.length > 5 ? 30 : decliningStores.length * 5,
        alerts.filter(a => a.severity === 'high').length * 15,
        alerts.filter(a => a.severity === 'medium').length * 8,
        overpriced.length > 3 ? 15 : overpriced.length * 3,
      ];
      const riskScore = Math.min(100, riskFactors.reduce((a, b) => a + b, 0));
      return {
        brandName,
        underpricedVsMarket: underpriced,
        overpricedVsMarket: overpriced,
        missingSkuOpportunities: missingSkus,
        storesWithDecliningShelfShare: decliningStores,
        storesWithGrowingShelfShare: growingStores,
        competitorIntrusionAlerts: alerts,
        overallScore: Math.round(opportunityScore),
        riskScore: Math.round(riskScore),
      };
    }, 30)).data; // 30-minute TTL
  }

  /**
   * Get categories where brand is underpriced vs market.
   *
   * Only considers categories with ≥3 brand SKUs, and flags those where
   * the brand average is at least 10% below the rest of the market.
   */
  async getUnderpricedPositions(brandName: string): Promise<PricePosition[]> {
    const result = await this.pool.query(`
      WITH brand_prices AS (
        SELECT
          type as category,
          AVG(extract_min_price(latest_raw_payload)) as brand_avg,
          COUNT(*) as sku_count
        FROM dutchie_products
        WHERE brand_name = $1 AND type IS NOT NULL
        GROUP BY type
        HAVING COUNT(*) >= 3
      ),
      market_prices AS (
        SELECT
          type as category,
          AVG(extract_min_price(latest_raw_payload)) as market_avg
        FROM dutchie_products
        WHERE type IS NOT NULL AND brand_name != $1
        GROUP BY type
      )
      SELECT
        bp.category,
        bp.brand_avg,
        mp.market_avg,
        bp.sku_count,
        ((bp.brand_avg - mp.market_avg) / NULLIF(mp.market_avg, 0)) * 100 as diff_pct
      FROM brand_prices bp
      JOIN market_prices mp ON bp.category = mp.category
      WHERE bp.brand_avg < mp.market_avg * 0.9 -- 10% or more below market
      AND bp.brand_avg IS NOT NULL
      AND mp.market_avg IS NOT NULL
      ORDER BY diff_pct
    `, [brandName]);
    return result.rows.map(row => ({
      category: row.category,
      // Prices rounded to cents, diff to one decimal place.
      brandAvgPrice: Math.round(parseFloat(row.brand_avg) * 100) / 100,
      marketAvgPrice: Math.round(parseFloat(row.market_avg) * 100) / 100,
      priceDifferencePercent: Math.round(parseFloat(row.diff_pct) * 10) / 10,
      skuCount: parseInt(row.sku_count) || 0,
      suggestion: `Consider price increase - ${Math.abs(Math.round(parseFloat(row.diff_pct)))}% below market average`,
    }));
  }

  /**
   * Get categories where brand is overpriced vs market.
   *
   * Mirror of getUnderpricedPositions, but flags categories where the
   * brand average is at least 15% above market (sorted worst-first).
   */
  async getOverpricedPositions(brandName: string): Promise<PricePosition[]> {
    const result = await this.pool.query(`
      WITH brand_prices AS (
        SELECT
          type as category,
          AVG(extract_min_price(latest_raw_payload)) as brand_avg,
          COUNT(*) as sku_count
        FROM dutchie_products
        WHERE brand_name = $1 AND type IS NOT NULL
        GROUP BY type
        HAVING COUNT(*) >= 3
      ),
      market_prices AS (
        SELECT
          type as category,
          AVG(extract_min_price(latest_raw_payload)) as market_avg
        FROM dutchie_products
        WHERE type IS NOT NULL AND brand_name != $1
        GROUP BY type
      )
      SELECT
        bp.category,
        bp.brand_avg,
        mp.market_avg,
        bp.sku_count,
        ((bp.brand_avg - mp.market_avg) / NULLIF(mp.market_avg, 0)) * 100 as diff_pct
      FROM brand_prices bp
      JOIN market_prices mp ON bp.category = mp.category
      WHERE bp.brand_avg > mp.market_avg * 1.15 -- 15% or more above market
      AND bp.brand_avg IS NOT NULL
      AND mp.market_avg IS NOT NULL
      ORDER BY diff_pct DESC
    `, [brandName]);
    return result.rows.map(row => ({
      category: row.category,
      brandAvgPrice: Math.round(parseFloat(row.brand_avg) * 100) / 100,
      marketAvgPrice: Math.round(parseFloat(row.market_avg) * 100) / 100,
      priceDifferencePercent: Math.round(parseFloat(row.diff_pct) * 10) / 10,
      skuCount: parseInt(row.sku_count) || 0,
      suggestion: `Price sensitivity risk - ${Math.round(parseFloat(row.diff_pct))}% above market average`,
    }));
  }

  /**
   * Get missing SKU opportunities (category gaps).
   *
   * Finds category/subcategory cells with ≥20 market SKUs where the brand
   * holds under 5% of that count; returns the 10 largest cells. The score
   * scales with market size (100 SKUs ≈ full weight) times the gap ratio.
   */
  async getMissingSkuOpportunities(brandName: string): Promise<MissingSkuOpportunity[]> {
    const result = await this.pool.query(`
      WITH market_categories AS (
        SELECT
          type as category,
          subcategory,
          COUNT(*) as market_skus,
          ARRAY_AGG(DISTINCT brand_name ORDER BY brand_name) FILTER (WHERE brand_name IS NOT NULL) as top_brands
        FROM dutchie_products
        WHERE type IS NOT NULL
        GROUP BY type, subcategory
        HAVING COUNT(*) >= 20
      ),
      brand_presence AS (
        SELECT
          type as category,
          subcategory,
          COUNT(*) as brand_skus
        FROM dutchie_products
        WHERE brand_name = $1 AND type IS NOT NULL
        GROUP BY type, subcategory
      )
      SELECT
        mc.category,
        mc.subcategory,
        mc.market_skus,
        COALESCE(bp.brand_skus, 0) as brand_skus,
        mc.top_brands[1:5] as competitors
      FROM market_categories mc
      LEFT JOIN brand_presence bp ON mc.category = bp.category
      AND (mc.subcategory = bp.subcategory OR (mc.subcategory IS NULL AND bp.subcategory IS NULL))
      WHERE COALESCE(bp.brand_skus, 0) < mc.market_skus * 0.05 -- Brand has <5% of market presence
      ORDER BY mc.market_skus DESC
      LIMIT 10
    `, [brandName]);
    return result.rows.map(row => {
      const marketSkus = parseInt(row.market_skus) || 0;
      const brandSkus = parseInt(row.brand_skus) || 0;
      const gapPercent = marketSkus > 0 ? ((marketSkus - brandSkus) / marketSkus) * 100 : 100;
      // Market size (per 100 SKUs) x gap ratio, capped at 100.
      const opportunityScore = Math.min(100, Math.round((marketSkus / 100) * (gapPercent / 100) * 100));
      return {
        category: row.category,
        subcategory: row.subcategory,
        marketSkuCount: marketSkus,
        brandSkuCount: brandSkus,
        gapPercent: Math.round(gapPercent),
        // top_brands is alphabetical, so these are not ranked competitors.
        topCompetitors: (row.competitors || []).filter((c: string) => c !== brandName).slice(0, 5),
        opportunityScore,
      };
    });
  }

  /**
   * Get stores where brand's shelf share is declining.
   *
   * NOTE(review): despite the name, this currently proxies "declining" with
   * "low presence" (<10 SKUs) — previousShelfShare/changePercent are
   * placeholders until snapshot history is available.
   */
  async getStoresWithDecliningShare(brandName: string): Promise<StoreShelfShareChange[]> {
    // Use brand_snapshots for historical comparison
    const result = await this.pool.query(`
      WITH current_share AS (
        SELECT
          dp.dispensary_id as store_id,
          d.name as store_name,
          d.city,
          d.state,
          COUNT(*) FILTER (WHERE dp.brand_name = $1) as brand_skus,
          COUNT(*) as total_skus,
          ARRAY_AGG(DISTINCT dp.brand_name) FILTER (WHERE dp.brand_name != $1 AND dp.brand_name IS NOT NULL) as competitors
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        GROUP BY dp.dispensary_id, d.name, d.city, d.state
        HAVING COUNT(*) FILTER (WHERE dp.brand_name = $1) > 0
      )
      SELECT
        cs.store_id,
        cs.store_name,
        cs.city,
        cs.state,
        cs.brand_skus as current_skus,
        cs.total_skus,
        ROUND((cs.brand_skus::NUMERIC / cs.total_skus) * 100, 2) as current_share,
        cs.competitors[1:5] as top_competitors
      FROM current_share cs
      WHERE cs.brand_skus < 10 -- Low presence
      ORDER BY cs.brand_skus
      LIMIT 10
    `, [brandName]);
    return result.rows.map(row => ({
      storeId: row.store_id,
      storeName: row.store_name,
      city: row.city,
      state: row.state,
      currentShelfShare: parseFloat(row.current_share) || 0,
      previousShelfShare: parseFloat(row.current_share) || 0, // Would need historical data
      changePercent: 0,
      currentSkus: parseInt(row.current_skus) || 0,
      competitors: row.top_competitors || [],
    }));
  }

  /**
   * Get stores where brand's shelf share is growing.
   *
   * NOTE(review): currently returns the 10 stores with the highest current
   * shelf share — "growing" needs historical snapshots to be meaningful;
   * previousShelfShare/changePercent are placeholders.
   */
  async getStoresWithGrowingShare(brandName: string): Promise<StoreShelfShareChange[]> {
    const result = await this.pool.query(`
      WITH store_share AS (
        SELECT
          dp.dispensary_id as store_id,
          d.name as store_name,
          d.city,
          d.state,
          COUNT(*) FILTER (WHERE dp.brand_name = $1) as brand_skus,
          COUNT(*) as total_skus,
          ARRAY_AGG(DISTINCT dp.brand_name) FILTER (WHERE dp.brand_name != $1 AND dp.brand_name IS NOT NULL) as competitors
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        GROUP BY dp.dispensary_id, d.name, d.city, d.state
        HAVING COUNT(*) FILTER (WHERE dp.brand_name = $1) > 0
      )
      SELECT
        ss.store_id,
        ss.store_name,
        ss.city,
        ss.state,
        ss.brand_skus as current_skus,
        ss.total_skus,
        ROUND((ss.brand_skus::NUMERIC / ss.total_skus) * 100, 2) as current_share,
        ss.competitors[1:5] as top_competitors
      FROM store_share ss
      ORDER BY current_share DESC
      LIMIT 10
    `, [brandName]);
    return result.rows.map(row => ({
      storeId: row.store_id,
      storeName: row.store_name,
      city: row.city,
      state: row.state,
      currentShelfShare: parseFloat(row.current_share) || 0,
      previousShelfShare: parseFloat(row.current_share) || 0,
      changePercent: 0,
      currentSkus: parseInt(row.current_skus) || 0,
      competitors: row.top_competitors || [],
    }));
  }

  /**
   * Get competitor intrusion alerts.
   *
   * Flags competitors that added ≥5 SKUs in the last 30 days in any store
   * where this brand is stocked. Severity scales with SKU count
   * (≥10 = medium, ≥20 = high).
   */
  async getCompetitorAlerts(brandName: string): Promise<CompetitorAlert[]> {
    // Check for competitor entries in stores where this brand has presence
    const result = await this.pool.query(`
      WITH brand_stores AS (
        SELECT DISTINCT dispensary_id
        FROM dutchie_products
        WHERE brand_name = $1
      ),
      competitor_presence AS (
        SELECT
          dp.brand_name as competitor,
          dp.dispensary_id as store_id,
          d.name as store_name,
          COUNT(*) as sku_count,
          MAX(dp.created_at) as latest_add
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        WHERE dp.dispensary_id IN (SELECT dispensary_id FROM brand_stores)
        AND dp.brand_name != $1
        AND dp.brand_name IS NOT NULL
        AND dp.created_at >= NOW() - INTERVAL '30 days'
        GROUP BY dp.brand_name, dp.dispensary_id, d.name
        HAVING COUNT(*) >= 5
      )
      SELECT
        competitor,
        store_id,
        store_name,
        sku_count,
        latest_add
      FROM competitor_presence
      ORDER BY sku_count DESC
      LIMIT 10
    `, [brandName]);
    return result.rows.map(row => {
      const skuCount = parseInt(row.sku_count) || 0;
      let severity: 'low' | 'medium' | 'high' = 'low';
      if (skuCount >= 20) severity = 'high';
      else if (skuCount >= 10) severity = 'medium';
      return {
        competitorBrand: row.competitor,
        storeId: row.store_id,
        storeName: row.store_name,
        alertType: 'expanding' as const,
        details: `${row.competitor} has ${skuCount} SKUs in ${row.store_name}`,
        severity,
        // Keep only the YYYY-MM-DD portion of the timestamp.
        date: new Date(row.latest_add).toISOString().split('T')[0],
      };
    });
  }

  /**
   * Get market position summary for a brand.
   *
   * Runs four independent queries in parallel (share, price, per-category
   * position, top competitors) and derives strengths (>5% category share)
   * and weaknesses (<2% share where another brand leads). Cached 30 min.
   */
  async getMarketPositionSummary(brandName: string): Promise<MarketPositionSummary> {
    const key = cacheKey('market_position', { brandName });
    return (await this.cache.getOrCompute(key, async () => {
      const [shareResult, priceResult, categoryResult, threatResult] = await Promise.all([
        // Market share
        this.pool.query(`
          SELECT
            (SELECT COUNT(*) FROM dutchie_products WHERE brand_name = $1) as brand_count,
            (SELECT COUNT(*) FROM dutchie_products) as total_count
        `, [brandName]),
        // Price vs market
        this.pool.query(`
          SELECT
            (SELECT AVG(extract_min_price(latest_raw_payload)) FROM dutchie_products WHERE brand_name = $1) as brand_avg,
            (SELECT AVG(extract_min_price(latest_raw_payload)) FROM dutchie_products WHERE brand_name != $1) as market_avg
        `, [brandName]),
        // Category strengths/weaknesses
        this.pool.query(`
          WITH brand_by_cat AS (
            SELECT type as category, COUNT(*) as brand_count
            FROM dutchie_products
            WHERE brand_name = $1 AND type IS NOT NULL
            GROUP BY type
          ),
          market_by_cat AS (
            SELECT type as category, COUNT(*) as total_count
            FROM dutchie_products WHERE type IS NOT NULL
            GROUP BY type
          ),
          leaders AS (
            SELECT type as category, brand_name, COUNT(*) as cnt,
            RANK() OVER (PARTITION BY type ORDER BY COUNT(*) DESC) as rnk
            FROM dutchie_products WHERE type IS NOT NULL AND brand_name IS NOT NULL
            GROUP BY type, brand_name
          )
          SELECT
            mc.category,
            COALESCE(bc.brand_count, 0) as brand_count,
            mc.total_count,
            ROUND((COALESCE(bc.brand_count, 0)::NUMERIC / mc.total_count) * 100, 2) as share_pct,
            (SELECT brand_name FROM leaders WHERE category = mc.category AND rnk = 1) as leader
          FROM market_by_cat mc
          LEFT JOIN brand_by_cat bc ON mc.category = bc.category
          ORDER BY share_pct DESC
        `, [brandName]),
        // Top competitors
        this.pool.query(`
          SELECT brand_name, COUNT(*) as cnt
          FROM dutchie_products
          WHERE brand_name IS NOT NULL AND brand_name != $1
          GROUP BY brand_name
          ORDER BY cnt DESC
          LIMIT 5
        `, [brandName]),
      ]);
      const brandCount = parseInt(shareResult.rows[0]?.brand_count) || 0;
      const totalCount = parseInt(shareResult.rows[0]?.total_count) || 1; // avoid div-by-zero
      const marketSharePercent = Math.round((brandCount / totalCount) * 1000) / 10; // one decimal
      const brandAvg = parseFloat(priceResult.rows[0]?.brand_avg) || 0;
      const marketAvg = parseFloat(priceResult.rows[0]?.market_avg) || 1; // avoid div-by-zero
      const avgPriceVsMarket = Math.round(((brandAvg - marketAvg) / marketAvg) * 1000) / 10;
      const categories = categoryResult.rows;
      const strengths = categories
        .filter(c => parseFloat(c.share_pct) > 5)
        .map(c => ({ category: c.category, shelfSharePercent: parseFloat(c.share_pct) }));
      const weaknesses = categories
        .filter(c => parseFloat(c.share_pct) < 2 && c.leader !== brandName)
        .map(c => ({
          category: c.category,
          shelfSharePercent: parseFloat(c.share_pct),
          marketLeader: c.leader || 'Unknown',
        }));
      return {
        brandName,
        marketSharePercent,
        avgPriceVsMarket,
        categoryStrengths: strengths.slice(0, 5),
        categoryWeaknesses: weaknesses.slice(0, 5),
        growthTrend: 'stable' as const, // Would need historical data
        competitorThreats: threatResult.rows.map(r => r.brand_name),
      };
    }, 30)).data; // 30-minute TTL
  }

  /**
   * Create an analytics alert.
   *
   * Inserts one row into analytics_alerts; optional fields are stored as
   * NULL and metadata is serialized to JSON.
   */
  async createAlert(alert: {
    alertType: string;
    severity: 'info' | 'warning' | 'critical';
    title: string;
    description?: string;
    storeId?: number;
    brandName?: string;
    productId?: number;
    category?: string;
    metadata?: Record<string, unknown>;
  }): Promise<void> {
    await this.pool.query(`
      INSERT INTO analytics_alerts
      (alert_type, severity, title, description, store_id, brand_name, product_id, category, metadata)
      VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
    `, [
      alert.alertType,
      alert.severity,
      alert.title,
      alert.description || null,
      alert.storeId || null,
      alert.brandName || null,
      alert.productId || null,
      alert.category || null,
      alert.metadata ? JSON.stringify(alert.metadata) : null,
    ]);
  }

  /**
   * Get recent alerts.
   *
   * All filters are optional and AND-ed together; newest first, capped at
   * `limit` (default 50). $1 is reserved for LIMIT, so filter placeholders
   * start at $2 — keep the two in sync when editing.
   */
  async getAlerts(filters: {
    brandName?: string;
    storeId?: number;
    alertType?: string;
    unreadOnly?: boolean;
    limit?: number;
  } = {}): Promise<Array<{
    id: number;
    alertType: string;
    severity: string;
    title: string;
    description: string | null;
    storeName: string | null;
    brandName: string | null;
    createdAt: string;
    isRead: boolean;
  }>> {
    const { brandName, storeId, alertType, unreadOnly = false, limit = 50 } = filters;
    const params: (string | number | boolean)[] = [limit];
    const conditions: string[] = [];
    let paramIndex = 2; // $1 is the LIMIT
    if (brandName) {
      conditions.push(`a.brand_name = $${paramIndex++}`);
      params.push(brandName);
    }
    if (storeId) {
      conditions.push(`a.store_id = $${paramIndex++}`);
      params.push(storeId);
    }
    if (alertType) {
      conditions.push(`a.alert_type = $${paramIndex++}`);
      params.push(alertType);
    }
    if (unreadOnly) {
      conditions.push('a.is_read = false');
    }
    const whereClause = conditions.length > 0
      ? 'WHERE ' + conditions.join(' AND ')
      : '';
    const result = await this.pool.query(`
      SELECT
        a.id,
        a.alert_type,
        a.severity,
        a.title,
        a.description,
        d.name as store_name,
        a.brand_name,
        a.created_at,
        a.is_read
      FROM analytics_alerts a
      LEFT JOIN dispensaries d ON a.store_id = d.id
      ${whereClause}
      ORDER BY a.created_at DESC
      LIMIT $1
    `, params);
    return result.rows.map(row => ({
      id: row.id,
      alertType: row.alert_type,
      severity: row.severity,
      title: row.title,
      description: row.description,
      storeName: row.store_name,
      brandName: row.brand_name,
      createdAt: row.created_at.toISOString(), // assumes pg returns a Date here — TODO confirm column type
      isRead: row.is_read,
    }));
  }

  /**
   * Mark alerts as read.
   *
   * No-op for an empty list; otherwise flips is_read in one statement.
   */
  async markAlertsRead(alertIds: number[]): Promise<void> {
    if (alertIds.length === 0) return;
    await this.pool.query(`
      UPDATE analytics_alerts
      SET is_read = true
      WHERE id = ANY($1)
    `, [alertIds]);
  }
}

View File

@@ -0,0 +1,227 @@
/**
* Analytics Cache Service
*
* Provides caching layer for expensive analytics queries.
* Uses PostgreSQL for persistence with configurable TTLs.
*
* Phase 3: Analytics Dashboards
*/
import { Pool } from 'pg';
/** A single cached analytics payload plus its freshness metadata. */
export interface CacheEntry<T = unknown> {
  key: string;
  data: T;
  /** When the payload was computed. */
  computedAt: Date;
  /** Entry is served only while now < expiresAt. */
  expiresAt: Date;
  /** How long the original computation took, for monitoring. */
  queryTimeMs?: number;
}
/** Tunables for AnalyticsCache. */
export interface CacheConfig {
  defaultTtlMinutes: number;
}
// Fallback TTL when getOrCompute() is called without an explicit ttlMinutes.
const DEFAULT_CONFIG: CacheConfig = {
  defaultTtlMinutes: 15,
};
/**
 * Two-tier cache for expensive analytics queries: a per-process Map in
 * front of the PostgreSQL analytics_cache table, so results survive
 * restarts while hot keys are served without a database round trip.
 * Database failures never propagate — they are logged and treated as misses.
 */
export class AnalyticsCache {
  private pool: Pool;
  private config: CacheConfig;
  private memoryCache: Map<string, CacheEntry> = new Map();

  constructor(pool: Pool, config: Partial<CacheConfig> = {}) {
    this.pool = pool;
    this.config = { ...DEFAULT_CONFIG, ...config };
  }

  /**
   * Return the cached value for `key`, computing and storing it on a miss.
   * Lookup order: memory -> database -> computeFn.
   */
  async getOrCompute<T>(
    key: string,
    computeFn: () => Promise<T>,
    ttlMinutes?: number
  ): Promise<{ data: T; fromCache: boolean; queryTimeMs: number }> {
    const ttl = ttlMinutes ?? this.config.defaultTtlMinutes;

    // Tier 1: process-local memory.
    const inMemory = this.memoryCache.get(key);
    if (inMemory && new Date() < inMemory.expiresAt) {
      return { data: inMemory.data as T, fromCache: true, queryTimeMs: inMemory.queryTimeMs || 0 };
    }

    // Tier 2: shared Postgres table; hits get promoted into memory.
    const persisted = await this.loadPersisted<T>(key);
    if (persisted && new Date() < persisted.expiresAt) {
      this.memoryCache.set(key, persisted);
      return { data: persisted.data, fromCache: true, queryTimeMs: persisted.queryTimeMs || 0 };
    }

    // Miss: compute, time the work, then populate both tiers.
    const startedAt = Date.now();
    const freshData = await computeFn();
    const elapsedMs = Date.now() - startedAt;
    const freshEntry: CacheEntry<T> = {
      key,
      data: freshData,
      computedAt: new Date(),
      expiresAt: new Date(Date.now() + ttl * 60 * 1000),
      queryTimeMs: elapsedMs,
    };
    await this.persist(freshEntry);
    this.memoryCache.set(key, freshEntry);
    return { data: freshData, fromCache: false, queryTimeMs: elapsedMs };
  }

  /** Fetch a still-valid entry from the analytics_cache table, or null. */
  private async loadPersisted<T>(key: string): Promise<CacheEntry<T> | null> {
    try {
      const result = await this.pool.query(`
        SELECT cache_data, computed_at, expires_at, query_time_ms
        FROM analytics_cache
        WHERE cache_key = $1
        AND expires_at > NOW()
      `, [key]);
      const row = result.rows[0];
      if (!row) return null;
      return {
        key,
        data: row.cache_data as T,
        computedAt: row.computed_at,
        expiresAt: row.expires_at,
        queryTimeMs: row.query_time_ms,
      };
    } catch (error) {
      console.warn(`[AnalyticsCache] Failed to get from DB: ${error}`);
      return null;
    }
  }

  /** Upsert an entry into analytics_cache; failures are logged and ignored. */
  private async persist<T>(entry: CacheEntry<T>): Promise<void> {
    try {
      await this.pool.query(`
        INSERT INTO analytics_cache (cache_key, cache_data, computed_at, expires_at, query_time_ms)
        VALUES ($1, $2, $3, $4, $5)
        ON CONFLICT (cache_key)
        DO UPDATE SET
        cache_data = EXCLUDED.cache_data,
        computed_at = EXCLUDED.computed_at,
        expires_at = EXCLUDED.expires_at,
        query_time_ms = EXCLUDED.query_time_ms
      `, [entry.key, JSON.stringify(entry.data), entry.computedAt, entry.expiresAt, entry.queryTimeMs]);
    } catch (error) {
      console.warn(`[AnalyticsCache] Failed to save to DB: ${error}`);
    }
  }

  /** Drop one key from both tiers. */
  async invalidate(key: string): Promise<void> {
    this.memoryCache.delete(key);
    try {
      await this.pool.query('DELETE FROM analytics_cache WHERE cache_key = $1', [key]);
    } catch (error) {
      console.warn(`[AnalyticsCache] Failed to invalidate: ${error}`);
    }
  }

  /**
   * Drop every key containing `pattern` (substring match in memory,
   * LIKE %pattern% in the database).
   * @returns number of database rows removed.
   */
  async invalidatePattern(pattern: string): Promise<number> {
    for (const candidate of this.memoryCache.keys()) {
      if (candidate.includes(pattern)) {
        this.memoryCache.delete(candidate);
      }
    }
    try {
      const outcome = await this.pool.query(
        'DELETE FROM analytics_cache WHERE cache_key LIKE $1',
        [`%${pattern}%`]
      );
      return outcome.rowCount || 0;
    } catch (error) {
      console.warn(`[AnalyticsCache] Failed to invalidate pattern: ${error}`);
      return 0;
    }
  }

  /**
   * Evict expired entries from both tiers.
   * @returns number of database rows removed.
   */
  async cleanExpired(): Promise<number> {
    const cutoff = new Date();
    for (const [candidate, entry] of this.memoryCache.entries()) {
      if (cutoff >= entry.expiresAt) {
        this.memoryCache.delete(candidate);
      }
    }
    try {
      const outcome = await this.pool.query('DELETE FROM analytics_cache WHERE expires_at < NOW()');
      return outcome.rowCount || 0;
    } catch (error) {
      console.warn(`[AnalyticsCache] Failed to clean expired: ${error}`);
      return 0;
    }
  }

  /** Cache size metrics; database errors degrade to zeros rather than throwing. */
  async getStats(): Promise<{
    memoryCacheSize: number;
    dbCacheSize: number;
    expiredCount: number;
  }> {
    try {
      const outcome = await this.pool.query(`
        SELECT
        COUNT(*) FILTER (WHERE expires_at > NOW()) as active,
        COUNT(*) FILTER (WHERE expires_at <= NOW()) as expired
        FROM analytics_cache
      `);
      return {
        memoryCacheSize: this.memoryCache.size,
        dbCacheSize: parseInt(outcome.rows[0]?.active || '0'),
        expiredCount: parseInt(outcome.rows[0]?.expired || '0'),
      };
    } catch {
      return {
        memoryCacheSize: this.memoryCache.size,
        dbCacheSize: 0,
        expiredCount: 0,
      };
    }
  }
}
/**
 * Build a deterministic cache key from a prefix and a parameter bag.
 *
 * Keys are sorted so property order never changes the key, and
 * null/undefined parameters are dropped. Non-primitive values are
 * JSON-serialized: previously they were coerced via template-string
 * interpolation, so every object rendered as "[object Object]" and
 * distinct parameter sets could collide on the same cache key.
 * Primitive values render exactly as before (backward compatible).
 *
 * @param prefix Logical namespace, e.g. 'brand_opportunity'.
 * @param params Query parameters that distinguish cached variants.
 * @returns `prefix` alone, or `prefix:k1=v1&k2=v2…`.
 */
export function cacheKey(prefix: string, params: Record<string, unknown> = {}): string {
  const sortedParams = Object.keys(params)
    .sort()
    .filter(k => params[k] !== undefined && params[k] !== null)
    .map(k => {
      const value = params[k];
      // null is already filtered out, so typeof 'object' means a real object/array.
      const rendered = typeof value === 'object' ? JSON.stringify(value) : String(value);
      return `${k}=${rendered}`;
    })
    .join('&');
  return sortedParams ? `${prefix}:${sortedParams}` : prefix;
}

View File

@@ -0,0 +1,530 @@
/**
* Category Growth Analytics Service
*
* Provides category-level analytics including:
* - SKU count growth
* - Price growth trends
* - New product additions
* - Category shrinkage
* - Seasonality patterns
*
* Phase 3: Analytics Dashboards
*/
import { Pool } from 'pg';
import { AnalyticsCache, cacheKey } from './cache';
/** Period-over-period growth for one category (see getCategoryGrowth). */
export interface CategoryGrowth {
  category: string;
  currentSkuCount: number;
  previousSkuCount: number;
  skuGrowthPercent: number;
  currentBrandCount: number;
  previousBrandCount: number;
  brandGrowthPercent: number;
  currentAvgPrice: number | null;
  previousAvgPrice: number | null;
  /** Null when either period lacks price data. */
  priceChangePercent: number | null;
  newProducts: number;
  discontinuedProducts: number;
  /** growing/declining when SKU growth exceeds ±5%. */
  trend: 'growing' | 'declining' | 'stable';
}
/** Current-state rollup for one category (see getCategorySummary). */
export interface CategorySummary {
  category: string;
  totalSkus: number;
  brandCount: number;
  storeCount: number;
  avgPrice: number | null;
  minPrice: number | null;
  maxPrice: number | null;
  inStockSkus: number;
  outOfStockSkus: number;
  /** inStockSkus / totalSkus as a whole-number percent. */
  stockHealthPercent: number;
}
/** Time series of category metrics from category_snapshots. */
export interface CategoryGrowthTrend {
  category: string;
  dataPoints: Array<{
    date: string;
    skuCount: number;
    brandCount: number;
    avgPrice: number | null;
    storeCount: number;
  }>;
  growth7d: number | null;
  growth30d: number | null;
  growth90d: number | null;
}
/** Grid data for a category x period heatmap visualization. */
export interface CategoryHeatmapData {
  categories: string[];
  periods: string[];
  data: Array<{
    category: string;
    period: string;
    value: number; // SKU count, growth %, or price
    changeFromPrevious: number | null;
  }>;
}
/** Month-by-month seasonality profile for a category. */
export interface SeasonalityPattern {
  category: string;
  monthlyPattern: Array<{
    month: number;
    monthName: string;
    avgSkuCount: number;
    avgPrice: number | null;
    seasonalityIndex: number; // 100 = average, >100 = above, <100 = below
  }>;
  peakMonth: number;
  troughMonth: number;
}
/** Optional narrowing filters accepted by the category queries. */
export interface CategoryFilters {
  state?: string;
  storeId?: number;
  /** Minimum SKU count for a category to be included (growth queries). */
  minSkus?: number;
}
/**
 * Category-level analytics over dutchie_products and category_snapshots.
 * All public queries are routed through AnalyticsCache.
 */
export class CategoryAnalyticsService {
  private pool: Pool;            // pg connection pool for analytics queries
  private cache: AnalyticsCache; // shared two-tier (memory + DB) cache

  constructor(pool: Pool, cache: AnalyticsCache) {
    this.pool = pool;
    this.cache = cache;
  }
  /**
   * Get current category summary.
   *
   * One row per product category (dutchie_products.type), optionally
   * narrowed to a single category, state and/or store. Cached 15 minutes.
   *
   * @param category Restrict to one category; omit for all categories.
   * @param filters  Optional state / storeId narrowing (minSkus is unused here).
   */
  async getCategorySummary(
    category?: string,
    filters: CategoryFilters = {}
  ): Promise<CategorySummary[]> {
    const { state, storeId } = filters;
    const key = cacheKey('category_summary', { category, state, storeId });
    return (await this.cache.getOrCompute(key, async () => {
      // Build the WHERE clause dynamically; $-placeholders are numbered
      // in the same order the params array is filled — keep them in sync.
      const params: (string | number)[] = [];
      const conditions: string[] = [];
      let paramIndex = 1;
      if (category) {
        conditions.push(`dp.type = $${paramIndex++}`);
        params.push(category);
      }
      if (state) {
        conditions.push(`d.state = $${paramIndex++}`);
        params.push(state);
      }
      if (storeId) {
        conditions.push(`dp.dispensary_id = $${paramIndex++}`);
        params.push(storeId);
      }
      const whereClause = conditions.length > 0
        ? 'WHERE dp.type IS NOT NULL AND ' + conditions.join(' AND ')
        : 'WHERE dp.type IS NOT NULL';
      const result = await this.pool.query(`
        SELECT
          dp.type as category,
          COUNT(*) as total_skus,
          COUNT(DISTINCT dp.brand_name) as brand_count,
          COUNT(DISTINCT dp.dispensary_id) as store_count,
          AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
          MIN(extract_min_price(dp.latest_raw_payload)) as min_price,
          MAX(extract_max_price(dp.latest_raw_payload)) as max_price,
          SUM(CASE WHEN dp.stock_status = 'in_stock' THEN 1 ELSE 0 END) as in_stock,
          SUM(CASE WHEN dp.stock_status != 'in_stock' OR dp.stock_status IS NULL THEN 1 ELSE 0 END) as out_of_stock
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        ${whereClause}
        GROUP BY dp.type
        ORDER BY total_skus DESC
      `, params);
      return result.rows.map(row => {
        const totalSkus = parseInt(row.total_skus) || 0;
        const inStock = parseInt(row.in_stock) || 0;
        return {
          category: row.category,
          totalSkus,
          brandCount: parseInt(row.brand_count) || 0,
          storeCount: parseInt(row.store_count) || 0,
          // Prices rounded to cents; null when no price data.
          avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
          minPrice: row.min_price ? Math.round(parseFloat(row.min_price) * 100) / 100 : null,
          maxPrice: row.max_price ? Math.round(parseFloat(row.max_price) * 100) / 100 : null,
          inStockSkus: inStock,
          outOfStockSkus: parseInt(row.out_of_stock) || 0,
          stockHealthPercent: totalSkus > 0
            ? Math.round((inStock / totalSkus) * 100)
            : 0,
        };
      });
    }, 15)).data; // 15-minute TTL
  }
  /**
   * Get category growth (comparing periods).
   *
   * Compares the latest category_snapshots row against the latest snapshot
   * at least `days` days older. When no snapshots exist yet, falls back to
   * live dutchie_products data with zeroed growth figures. Cached 15 min.
   *
   * NOTE(review): the state/storeId filters are accepted and included in
   * the cache key but are not applied by either query here.
   *
   * @param days    Comparison window in days (default 7).
   * @param filters minSkus gates which categories are returned (default 10).
   */
  async getCategoryGrowth(
    days: number = 7,
    filters: CategoryFilters = {}
  ): Promise<CategoryGrowth[]> {
    const { state, storeId, minSkus = 10 } = filters;
    const key = cacheKey('category_growth', { days, state, storeId, minSkus });
    return (await this.cache.getOrCompute(key, async () => {
      // Use category_snapshots for historical comparison
      const result = await this.pool.query(`
        WITH current_data AS (
          SELECT
            category,
            total_skus,
            brand_count,
            avg_price,
            store_count
          FROM category_snapshots
          WHERE snapshot_date = (SELECT MAX(snapshot_date) FROM category_snapshots)
        ),
        previous_data AS (
          SELECT
            category,
            total_skus,
            brand_count,
            avg_price,
            store_count
          FROM category_snapshots
          WHERE snapshot_date = (
            SELECT MAX(snapshot_date)
            FROM category_snapshots
            WHERE snapshot_date < (SELECT MAX(snapshot_date) FROM category_snapshots) - ($1 || ' days')::INTERVAL
          )
        )
        SELECT
          c.category,
          c.total_skus as current_skus,
          COALESCE(p.total_skus, c.total_skus) as previous_skus,
          c.brand_count as current_brands,
          COALESCE(p.brand_count, c.brand_count) as previous_brands,
          c.avg_price as current_price,
          p.avg_price as previous_price
        FROM current_data c
        LEFT JOIN previous_data p ON c.category = p.category
        WHERE c.total_skus >= $2
        ORDER BY c.total_skus DESC
      `, [days, minSkus]);
      // If no snapshots exist, use current data (all growth fields zeroed).
      if (result.rows.length === 0) {
        const fallbackResult = await this.pool.query(`
          SELECT
            type as category,
            COUNT(*) as total_skus,
            COUNT(DISTINCT brand_name) as brand_count,
            AVG(extract_min_price(latest_raw_payload)) as avg_price
          FROM dutchie_products
          WHERE type IS NOT NULL
          GROUP BY type
          HAVING COUNT(*) >= $1
          ORDER BY total_skus DESC
        `, [minSkus]);
        return fallbackResult.rows.map(row => ({
          category: row.category,
          currentSkuCount: parseInt(row.total_skus) || 0,
          previousSkuCount: parseInt(row.total_skus) || 0,
          skuGrowthPercent: 0,
          currentBrandCount: parseInt(row.brand_count) || 0,
          previousBrandCount: parseInt(row.brand_count) || 0,
          brandGrowthPercent: 0,
          currentAvgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
          previousAvgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
          priceChangePercent: null,
          newProducts: 0,
          discontinuedProducts: 0,
          trend: 'stable' as const,
        }));
      }
      return result.rows.map(row => {
        // Missing previous values default to current (i.e. 0% growth).
        const currentSkus = parseInt(row.current_skus) || 0;
        const previousSkus = parseInt(row.previous_skus) || currentSkus;
        const currentBrands = parseInt(row.current_brands) || 0;
        const previousBrands = parseInt(row.previous_brands) || currentBrands;
        const currentPrice = row.current_price ? parseFloat(row.current_price) : null;
        const previousPrice = row.previous_price ? parseFloat(row.previous_price) : null;
        const skuGrowth = previousSkus > 0
          ? ((currentSkus - previousSkus) / previousSkus) * 100
          : 0;
        const brandGrowth = previousBrands > 0
          ? ((currentBrands - previousBrands) / previousBrands) * 100
          : 0;
        const priceChange = previousPrice && currentPrice
          ? ((currentPrice - previousPrice) / previousPrice) * 100
          : null;
        // ±5% SKU growth is the threshold for a non-stable trend.
        let trend: 'growing' | 'declining' | 'stable' = 'stable';
        if (skuGrowth > 5) trend = 'growing';
        else if (skuGrowth < -5) trend = 'declining';
        return {
          category: row.category,
          currentSkuCount: currentSkus,
          previousSkuCount: previousSkus,
          skuGrowthPercent: Math.round(skuGrowth * 10) / 10,
          currentBrandCount: currentBrands,
          previousBrandCount: previousBrands,
          brandGrowthPercent: Math.round(brandGrowth * 10) / 10,
          currentAvgPrice: currentPrice ? Math.round(currentPrice * 100) / 100 : null,
          previousAvgPrice: previousPrice ? Math.round(previousPrice * 100) / 100 : null,
          priceChangePercent: priceChange !== null ? Math.round(priceChange * 10) / 10 : null,
          // Approximation: net SKU delta, not tracked per-product.
          newProducts: Math.max(0, currentSkus - previousSkus),
          discontinuedProducts: Math.max(0, previousSkus - currentSkus),
          trend,
        };
      });
    }, 15)).data; // 15-minute TTL
  }
/**
 * Get category growth trend over time.
 *
 * Builds a daily time series for one category from `category_snapshots`
 * and derives 7/30/90-day growth rates from that series.
 *
 * @param category - Category name as stored in `category_snapshots.category`.
 * @param days - Lookback window for the time series (default 90).
 * @returns Data points plus growth percentages (1 decimal), cached for 15 min.
 */
async getCategoryGrowthTrend(
  category: string,
  days: number = 90
): Promise<CategoryGrowthTrend> {
  const key = cacheKey('category_growth_trend', { category, days });
  return (await this.cache.getOrCompute(key, async () => {
    const result = await this.pool.query(`
      SELECT
        snapshot_date as date,
        total_skus as sku_count,
        brand_count,
        avg_price,
        store_count
      FROM category_snapshots
      WHERE category = $1
        AND snapshot_date >= CURRENT_DATE - ($2 || ' days')::INTERVAL
      ORDER BY snapshot_date
    `, [category, days]);
    const dataPoints = result.rows.map(row => ({
      date: row.date.toISOString().split('T')[0],
      skuCount: parseInt(row.sku_count) || 0,
      brandCount: parseInt(row.brand_count) || 0,
      avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
      storeCount: parseInt(row.store_count) || 0,
    }));
    // SKU growth (percent, 1 decimal) over the trailing `daysBack` days.
    const calculateGrowth = (daysBack: number): number | null => {
      if (dataPoints.length < 2) return null;
      const targetDate = new Date();
      targetDate.setDate(targetDate.getDate() - daysBack);
      const targetDateStr = targetDate.toISOString().split('T')[0];
      const recent = dataPoints[dataPoints.length - 1];
      // Baseline = first snapshot on/after the target date (points are sorted
      // ascending). The previous predicate `d.date <= targetDateStr` matched
      // the OLDEST snapshot in the window, which made growth7d, growth30d and
      // growth90d all report growth over the entire lookback period.
      const older = dataPoints.find(d => d.date >= targetDateStr) ?? dataPoints[0];
      if (older.skuCount === 0) return null;
      return Math.round(((recent.skuCount - older.skuCount) / older.skuCount) * 1000) / 10;
    };
    return {
      category,
      dataPoints,
      growth7d: calculateGrowth(7),
      growth30d: calculateGrowth(30),
      growth90d: calculateGrowth(90),
    };
  }, 15)).data;
}
/**
 * Get category heatmap data: a category x week matrix.
 *
 * Aggregates `category_snapshots` into weekly buckets (weeks start on the
 * Sunday of the snapshot's week) and emits one cell per category/period.
 *
 * @param metric - 'skus' (peak weekly SKU count), 'price' (last non-null
 *   weekly avg price), or 'growth' (week-over-week SKU growth %).
 * @param periods - Number of weeks of history to cover (default 12).
 * @returns Sorted category and period axes plus flat cell data, cached 30 min.
 */
async getCategoryHeatmap(
  metric: 'skus' | 'growth' | 'price' = 'skus',
  periods: number = 12 // weeks
): Promise<CategoryHeatmapData> {
  const key = cacheKey('category_heatmap', { metric, periods });
  return (await this.cache.getOrCompute(key, async () => {
    const result = await this.pool.query(`
      SELECT
        category,
        snapshot_date,
        total_skus,
        avg_price
      FROM category_snapshots
      WHERE snapshot_date >= CURRENT_DATE - ($1 * 7 || ' days')::INTERVAL
      ORDER BY category, snapshot_date
    `, [periods]);
    // Collect distinct categories and distinct week buckets present in the data.
    const categoriesSet = new Set<string>();
    const periodsSet = new Set<string>();
    result.rows.forEach(row => {
      categoriesSet.add(row.category);
      // Bucket by week: weekStart = the Sunday of the snapshot's week.
      const date = new Date(row.snapshot_date);
      const weekStart = new Date(date);
      weekStart.setDate(date.getDate() - date.getDay());
      periodsSet.add(weekStart.toISOString().split('T')[0]);
    });
    const categories = Array.from(categoriesSet).sort();
    const periodsList = Array.from(periodsSet).sort();
    // Aggregate rows into (category -> week -> { skus, price }).
    const dataMap = new Map<string, Map<string, { skus: number; price: number | null }>>();
    result.rows.forEach(row => {
      const date = new Date(row.snapshot_date);
      const weekStart = new Date(date);
      weekStart.setDate(date.getDate() - date.getDay());
      const period = weekStart.toISOString().split('T')[0];
      if (!dataMap.has(row.category)) {
        dataMap.set(row.category, new Map());
      }
      const categoryData = dataMap.get(row.category)!;
      if (!categoryData.has(period)) {
        categoryData.set(period, { skus: 0, price: null });
      }
      const existing = categoryData.get(period)!;
      // Peak SKU count within the week; price keeps the last non-null value.
      existing.skus = Math.max(existing.skus, parseInt(row.total_skus) || 0);
      if (row.avg_price) {
        existing.price = parseFloat(row.avg_price);
      }
    });
    // Build heatmap cells, walking each category across the period axis.
    const data: CategoryHeatmapData['data'] = [];
    categories.forEach(category => {
      // previousDisplayed: last pushed `value` (same units as the current
      // metric) — basis for changeFromPrevious.
      // previousSkus: last observed SKU count — baseline for metric='growth'.
      // The old code used one variable for both, so for 'growth' it divided
      // a growth percentage by a raw SKU count (unit mismatch).
      let previousDisplayed: number | null = null;
      let previousSkus: number | null = null;
      periodsList.forEach(period => {
        const categoryData = dataMap.get(category)?.get(period);
        let value = 0;
        if (categoryData) {
          switch (metric) {
            case 'skus':
              value = categoryData.skus;
              break;
            case 'price':
              value = categoryData.price || 0;
              break;
            case 'growth':
              value = previousSkus !== null && previousSkus > 0
                ? ((categoryData.skus - previousSkus) / previousSkus) * 100
                : 0;
              break;
          }
        }
        // Compare like with like: previous displayed value in the same units.
        const changeFromPrevious = previousDisplayed !== null && previousDisplayed > 0
          ? ((value - previousDisplayed) / previousDisplayed) * 100
          : null;
        data.push({
          category,
          period,
          value: Math.round(value * 100) / 100,
          changeFromPrevious: changeFromPrevious !== null
            ? Math.round(changeFromPrevious * 10) / 10
            : null,
        });
        previousDisplayed = value;
        if (categoryData) {
          previousSkus = categoryData.skus;
        }
      });
    });
    return {
      categories,
      periods: periodsList,
      data,
    };
  }, 30)).data;
}
/**
 * Get the top growing and top declining categories over a window.
 *
 * @param limit - Max entries per list (default 5).
 * @param days - Growth window passed through to getCategoryGrowth (default 30).
 * @returns `growing` sorted fastest-growing first, `declining` sorted
 *   fastest-declining first; cached for 15 minutes.
 */
async getTopMovers(
  limit: number = 5,
  days: number = 30
): Promise<{
  growing: CategoryGrowth[];
  declining: CategoryGrowth[];
}> {
  const key = cacheKey('top_movers', { limit, days });
  return (await this.cache.getOrCompute(key, async () => {
    const growthByCategory = await this.getCategoryGrowth(days);
    // Rank all categories by SKU growth, highest first (copy, don't mutate).
    const ranked = [...growthByCategory].sort(
      (a, b) => b.skuGrowthPercent - a.skuGrowthPercent
    );
    const gainers = ranked.filter(c => c.skuGrowthPercent > 0);
    const losers = ranked.filter(c => c.skuGrowthPercent < 0);
    return {
      growing: gainers.slice(0, limit),
      // Most negative entries sit at the tail of the descending ranking;
      // take them and reverse so the steepest decline comes first.
      declining: losers.slice(-limit).reverse(),
    };
  }, 15)).data;
}
/**
 * Break one category down by subcategory.
 *
 * Products without a subcategory are grouped under 'Other'. Percentages are
 * relative to the category's total SKU count.
 *
 * @param category - Category name (matches `dutchie_products.type`).
 * @returns Rows sorted by SKU count descending; cached for 15 minutes.
 */
async getSubcategoryBreakdown(category: string): Promise<Array<{
  subcategory: string;
  skuCount: number;
  brandCount: number;
  avgPrice: number | null;
  percentOfCategory: number;
}>> {
  const key = cacheKey('subcategory_breakdown', { category });
  return (await this.cache.getOrCompute(key, async () => {
    const result = await this.pool.query(`
      WITH category_total AS (
        SELECT COUNT(*) as total FROM dutchie_products WHERE type = $1
      )
      SELECT
        COALESCE(dp.subcategory, 'Other') as subcategory,
        COUNT(*) as sku_count,
        COUNT(DISTINCT dp.brand_name) as brand_count,
        AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
        ct.total as category_total
      FROM dutchie_products dp, category_total ct
      WHERE dp.type = $1
      GROUP BY dp.subcategory, ct.total
      ORDER BY sku_count DESC
    `, [category]);
    return result.rows.map(row => {
      const skus = parseInt(row.sku_count);
      const totalInCategory = parseInt(row.category_total);
      return {
        subcategory: row.subcategory,
        skuCount: skus || 0,
        brandCount: parseInt(row.brand_count) || 0,
        avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
        // Share of the category, rounded to one decimal place.
        percentOfCategory: totalInCategory > 0
          ? Math.round((skus / totalInCategory) * 1000) / 10
          : 0,
      };
    });
  }, 15)).data;
}
}

View File

@@ -0,0 +1,57 @@
/**
* Analytics Module Index
*
* Exports all analytics services for CannaiQ dashboards.
*
* Phase 3: Analytics Dashboards
*/
export { AnalyticsCache, cacheKey, type CacheEntry, type CacheConfig } from './cache';
export {
PriceTrendService,
type PricePoint,
type PriceTrend,
type PriceSummary,
type PriceCompressionResult,
type PriceFilters,
} from './price-trends';
export {
PenetrationService,
type BrandPenetration,
type PenetrationTrend,
type ShelfShare,
type BrandPresenceByState,
type PenetrationFilters,
} from './penetration';
export {
CategoryAnalyticsService,
type CategoryGrowth,
type CategorySummary,
type CategoryGrowthTrend,
type CategoryHeatmapData,
type SeasonalityPattern,
type CategoryFilters,
} from './category-analytics';
export {
StoreChangeService,
type StoreChangeSummary,
type StoreChangeEvent,
type BrandChange,
type ProductChange,
type CategoryLeaderboard,
type StoreFilters,
} from './store-changes';
export {
BrandOpportunityService,
type BrandOpportunity,
type PricePosition,
type MissingSkuOpportunity,
type StoreShelfShareChange,
type CompetitorAlert,
type MarketPositionSummary,
} from './brand-opportunity';

View File

@@ -0,0 +1,556 @@
/**
* Brand Penetration Analytics Service
*
* Provides analytics for brand market penetration including:
* - Stores carrying brand
* - SKU counts per brand
* - Percentage of stores carrying
* - Shelf share calculations
* - Penetration trends and momentum
*
* Phase 3: Analytics Dashboards
*/
import { Pool } from 'pg';
import { AnalyticsCache, cacheKey } from './cache';
/** Point-in-time market penetration snapshot for a single brand. */
export interface BrandPenetration {
  brandName: string;
  /** Dutchie brand id; null when the brand has no id or was not found. */
  brandId: string | null;
  /** Dispensaries in scope (after any state filter). */
  totalStores: number;
  /** Dispensaries carrying at least one SKU of this brand. */
  storesCarrying: number;
  /** storesCarrying / totalStores, percent rounded to 1 decimal. */
  penetrationPercent: number;
  totalSkus: number;
  /** totalSkus / storesCarrying, rounded to 1 decimal (0 when no stores). */
  avgSkusPerStore: number;
  /** Brand SKUs as a share of all SKUs in scope, percent (1 decimal). */
  shelfSharePercent: number;
  /** Distinct product types the brand appears in. */
  categories: string[];
  avgPrice: number | null;
  inStockSkus: number;
}
/** Historical penetration series for a brand, with derived scores. */
export interface PenetrationTrend {
  brandName: string;
  /** One entry per snapshot date, ascending. */
  dataPoints: Array<{
    date: string;
    storeCount: number;
    skuCount: number;
    penetrationPercent: number;
  }>;
  momentumScore: number; // -100 to +100
  riskScore: number; // 0 to 100, higher = more risk
  trend: 'growing' | 'declining' | 'stable';
}
/** A brand's share of one category's shelf (SKU-count based). */
export interface ShelfShare {
  brandName: string;
  category: string;
  skuCount: number;
  categoryTotalSkus: number;
  shelfSharePercent: number;
  /** 1-based rank of the brand within the category by SKU count. */
  rank: number;
}
/** Per-state footprint of a brand. */
export interface BrandPresenceByState {
  state: string;
  storeCount: number;
  skuCount: number;
  avgPrice: number | null;
}
/** Optional filters accepted by PenetrationService queries. */
export interface PenetrationFilters {
  state?: string;
  category?: string;
  minStores?: number;
  minSkus?: number;
}
/**
 * Brand penetration analytics over `dutchie_products` / `dispensaries`,
 * with historical trends sourced from `brand_snapshots`. All query results
 * go through AnalyticsCache (TTLs in minutes are the second argument to
 * getOrCompute).
 */
export class PenetrationService {
  private pool: Pool;
  private cache: AnalyticsCache;
  constructor(pool: Pool, cache: AnalyticsCache) {
    this.pool = pool;
    this.cache = cache;
  }
  /**
   * Get penetration data for a specific brand.
   *
   * Computes store reach, SKU counts, shelf share and price against the
   * total market in scope (optionally narrowed by state and/or category).
   * Returns an all-zero record when the brand has no products in scope.
   */
  async getBrandPenetration(
    brandName: string,
    filters: PenetrationFilters = {}
  ): Promise<BrandPenetration> {
    const { state, category } = filters;
    const key = cacheKey('brand_penetration', { brandName, state, category });
    return (await this.cache.getOrCompute(key, async () => {
      // Build where clauses.
      // NOTE(review): `conditions` is populated but never used below — the
      // actual SQL fragments are rebuilt via params.indexOf(...). Looks like
      // leftover scaffolding; confirm before removing.
      const conditions: string[] = [];
      const params: (string | number)[] = [brandName];
      let paramIndex = 2;
      if (state) {
        conditions.push(`d.state = $${paramIndex++}`);
        params.push(state);
      }
      if (category) {
        conditions.push(`dp.type = $${paramIndex++}`);
        params.push(category);
      }
      // NOTE(review): placeholder numbers are recovered with indexOf on the
      // VALUES, not the positions. If state or category ever equals brandName
      // (duplicate value in `params`), the wrong placeholder is emitted.
      const stateCondition = state ? `AND d.state = $${params.indexOf(state) + 1}` : '';
      const categoryCondition = category ? `AND dp.type = $${params.indexOf(category) + 1}` : '';
      // total_stores: all dispensaries in scope (state only — category does
      // not restrict which stores exist). brand_data: this brand's footprint.
      // total_skus: the whole market in scope, for shelf share.
      // total_stores hard-codes $2 for state, which holds because state is
      // always the first value pushed after brandName.
      const result = await this.pool.query(`
        WITH total_stores AS (
          SELECT COUNT(DISTINCT id) as total
          FROM dispensaries
          WHERE 1=1 ${state ? `AND state = $2` : ''}
        ),
        brand_data AS (
          SELECT
            dp.brand_name,
            dp.brand_id,
            COUNT(DISTINCT dp.dispensary_id) as stores_carrying,
            COUNT(*) as total_skus,
            AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
            SUM(CASE WHEN dp.stock_status = 'in_stock' THEN 1 ELSE 0 END) as in_stock,
            ARRAY_AGG(DISTINCT dp.type) FILTER (WHERE dp.type IS NOT NULL) as categories
          FROM dutchie_products dp
          JOIN dispensaries d ON dp.dispensary_id = d.id
          WHERE dp.brand_name = $1
            ${stateCondition}
            ${categoryCondition}
          GROUP BY dp.brand_name, dp.brand_id
        ),
        total_skus AS (
          SELECT COUNT(*) as total
          FROM dutchie_products dp
          JOIN dispensaries d ON dp.dispensary_id = d.id
          WHERE 1=1 ${stateCondition} ${categoryCondition}
        )
        SELECT
          bd.brand_name,
          bd.brand_id,
          ts.total as total_stores,
          bd.stores_carrying,
          bd.total_skus,
          bd.avg_price,
          bd.in_stock,
          bd.categories,
          tsk.total as market_total_skus
        FROM brand_data bd, total_stores ts, total_skus tsk
      `, params);
      // Brand not found in scope: return an empty-but-well-formed record.
      if (result.rows.length === 0) {
        return {
          brandName,
          brandId: null,
          totalStores: 0,
          storesCarrying: 0,
          penetrationPercent: 0,
          totalSkus: 0,
          avgSkusPerStore: 0,
          shelfSharePercent: 0,
          categories: [],
          avgPrice: null,
          inStockSkus: 0,
        };
      }
      const row = result.rows[0];
      // Denominators default to 1 to avoid division by zero.
      const totalStores = parseInt(row.total_stores) || 1;
      const storesCarrying = parseInt(row.stores_carrying) || 0;
      const totalSkus = parseInt(row.total_skus) || 0;
      const marketTotalSkus = parseInt(row.market_total_skus) || 1;
      return {
        brandName: row.brand_name,
        brandId: row.brand_id,
        totalStores,
        storesCarrying,
        // * 1000 / 10 => percent with one decimal place.
        penetrationPercent: Math.round((storesCarrying / totalStores) * 1000) / 10,
        totalSkus,
        avgSkusPerStore: storesCarrying > 0
          ? Math.round((totalSkus / storesCarrying) * 10) / 10
          : 0,
        shelfSharePercent: Math.round((totalSkus / marketTotalSkus) * 1000) / 10,
        categories: row.categories || [],
        avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
        inStockSkus: parseInt(row.in_stock) || 0,
      };
    }, 15)).data;
  }
  /**
   * Get top brands by penetration.
   *
   * Ranks brands by distinct stores carrying them (ties broken by SKU count),
   * after applying minimum store/SKU thresholds and optional state/category
   * filters. Same metric definitions as getBrandPenetration.
   */
  async getTopBrandsByPenetration(
    limit: number = 20,
    filters: PenetrationFilters = {}
  ): Promise<BrandPenetration[]> {
    const { state, category, minStores = 2, minSkus = 5 } = filters;
    const key = cacheKey('top_brands_penetration', { limit, state, category, minStores, minSkus });
    return (await this.cache.getOrCompute(key, async () => {
      // $1 = limit, $2 = minStores, $3 = minSkus; optional filters follow.
      const params: (string | number)[] = [limit, minStores, minSkus];
      let paramIndex = 4;
      let stateCondition = '';
      let categoryCondition = '';
      if (state) {
        stateCondition = `AND d.state = $${paramIndex++}`;
        params.push(state);
      }
      if (category) {
        categoryCondition = `AND dp.type = $${paramIndex++}`;
        params.push(category);
      }
      // NOTE(review): the total_stores CTE re-derives the state placeholder
      // with params.indexOf(state) — fragile if `state` equals another param
      // value (e.g. a numeric-looking state never occurs, so low risk).
      const result = await this.pool.query(`
        WITH total_stores AS (
          SELECT COUNT(DISTINCT id) as total
          FROM dispensaries
          WHERE 1=1 ${state ? `AND state = $${params.indexOf(state) + 1}` : ''}
        ),
        total_skus AS (
          SELECT COUNT(*) as total
          FROM dutchie_products dp
          JOIN dispensaries d ON dp.dispensary_id = d.id
          WHERE 1=1 ${stateCondition} ${categoryCondition}
        ),
        brand_data AS (
          SELECT
            dp.brand_name,
            dp.brand_id,
            COUNT(DISTINCT dp.dispensary_id) as stores_carrying,
            COUNT(*) as total_skus,
            AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
            SUM(CASE WHEN dp.stock_status = 'in_stock' THEN 1 ELSE 0 END) as in_stock,
            ARRAY_AGG(DISTINCT dp.type) FILTER (WHERE dp.type IS NOT NULL) as categories
          FROM dutchie_products dp
          JOIN dispensaries d ON dp.dispensary_id = d.id
          WHERE dp.brand_name IS NOT NULL
            ${stateCondition}
            ${categoryCondition}
          GROUP BY dp.brand_name, dp.brand_id
          HAVING COUNT(DISTINCT dp.dispensary_id) >= $2
            AND COUNT(*) >= $3
        )
        SELECT
          bd.*,
          ts.total as total_stores,
          tsk.total as market_total_skus
        FROM brand_data bd, total_stores ts, total_skus tsk
        ORDER BY bd.stores_carrying DESC, bd.total_skus DESC
        LIMIT $1
      `, params);
      return result.rows.map(row => {
        const totalStores = parseInt(row.total_stores) || 1;
        const storesCarrying = parseInt(row.stores_carrying) || 0;
        const totalSkus = parseInt(row.total_skus) || 0;
        const marketTotalSkus = parseInt(row.market_total_skus) || 1;
        return {
          brandName: row.brand_name,
          brandId: row.brand_id,
          totalStores,
          storesCarrying,
          penetrationPercent: Math.round((storesCarrying / totalStores) * 1000) / 10,
          totalSkus,
          avgSkusPerStore: storesCarrying > 0
            ? Math.round((totalSkus / storesCarrying) * 10) / 10
            : 0,
          shelfSharePercent: Math.round((totalSkus / marketTotalSkus) * 1000) / 10,
          categories: row.categories || [],
          avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
          inStockSkus: parseInt(row.in_stock) || 0,
        };
      });
    }, 15)).data;
  }
  /**
   * Get penetration trend for a brand (requires historical snapshots).
   *
   * Reads `brand_snapshots` for the window, computes per-day penetration
   * against TODAY's total dispensary count (historical totals are not
   * stored), and derives momentum/risk scores from first-vs-last change.
   */
  async getPenetrationTrend(
    brandName: string,
    days: number = 30
  ): Promise<PenetrationTrend> {
    const key = cacheKey('penetration_trend', { brandName, days });
    return (await this.cache.getOrCompute(key, async () => {
      // Use brand_snapshots table for historical data
      const result = await this.pool.query(`
        SELECT
          snapshot_date as date,
          store_count,
          total_skus
        FROM brand_snapshots
        WHERE brand_name = $1
          AND snapshot_date >= CURRENT_DATE - ($2 || ' days')::INTERVAL
        ORDER BY snapshot_date
      `, [brandName, days]);
      // Get total stores for penetration calculation (current count, used
      // as the denominator for every historical point).
      const totalResult = await this.pool.query(
        'SELECT COUNT(*) as total FROM dispensaries'
      );
      const totalStores = parseInt(totalResult.rows[0]?.total) || 1;
      const dataPoints = result.rows.map(row => ({
        date: row.date.toISOString().split('T')[0],
        storeCount: parseInt(row.store_count) || 0,
        skuCount: parseInt(row.total_skus) || 0,
        penetrationPercent: Math.round((parseInt(row.store_count) / totalStores) * 1000) / 10,
      }));
      // Calculate momentum and risk scores from the endpoints of the series.
      let momentumScore = 0;
      let riskScore = 0;
      let trend: 'growing' | 'declining' | 'stable' = 'stable';
      if (dataPoints.length >= 2) {
        const first = dataPoints[0];
        const last = dataPoints[dataPoints.length - 1];
        // Momentum: change in store count
        const storeChange = last.storeCount - first.storeCount;
        const storeChangePercent = first.storeCount > 0
          ? (storeChange / first.storeCount) * 100
          : 0;
        // Momentum score: -100 to +100 (10x amplification, clamped — a 10%
        // store change saturates the scale).
        momentumScore = Math.max(-100, Math.min(100, storeChangePercent * 10));
        // Risk score: higher if losing stores (5x amplification, capped at 100).
        if (storeChange < 0) {
          riskScore = Math.min(100, Math.abs(storeChangePercent) * 5);
        }
        // Determine trend: +/-5% store change is the stability band.
        if (storeChangePercent > 5) trend = 'growing';
        else if (storeChangePercent < -5) trend = 'declining';
      }
      return {
        brandName,
        dataPoints,
        momentumScore: Math.round(momentumScore),
        riskScore: Math.round(riskScore),
        trend,
      };
    }, 15)).data;
  }
  /**
   * Get shelf share by category for a brand.
   *
   * For every category the brand appears in: the brand's SKU count, the
   * category's total SKUs, the resulting share, and the brand's rank among
   * all brands in that category.
   */
  async getShelfShareByCategory(brandName: string): Promise<ShelfShare[]> {
    const key = cacheKey('shelf_share_category', { brandName });
    return (await this.cache.getOrCompute(key, async () => {
      // NOTE(review): the RANK() in the `ranked` CTE is dead — each category
      // appears exactly once there, so it is always 1. The rank actually
      // returned comes from the correlated COUNT(*)+1 subquery below
      // (number of brands with strictly more SKUs, plus one).
      const result = await this.pool.query(`
        WITH category_totals AS (
          SELECT
            type as category,
            COUNT(*) as total_skus
          FROM dutchie_products
          WHERE type IS NOT NULL
          GROUP BY type
        ),
        brand_by_category AS (
          SELECT
            type as category,
            COUNT(*) as sku_count
          FROM dutchie_products
          WHERE brand_name = $1
            AND type IS NOT NULL
          GROUP BY type
        ),
        ranked AS (
          SELECT
            ct.category,
            COALESCE(bc.sku_count, 0) as sku_count,
            ct.total_skus,
            RANK() OVER (PARTITION BY ct.category ORDER BY bc.sku_count DESC NULLS LAST) as rank
          FROM category_totals ct
          LEFT JOIN brand_by_category bc ON ct.category = bc.category
        )
        SELECT
          r.category,
          r.sku_count,
          r.total_skus as category_total_skus,
          ROUND((r.sku_count::NUMERIC / r.total_skus) * 100, 2) as shelf_share_pct,
          (SELECT COUNT(*) + 1 FROM (
            SELECT brand_name, COUNT(*) as cnt
            FROM dutchie_products
            WHERE type = r.category AND brand_name IS NOT NULL
            GROUP BY brand_name
            HAVING COUNT(*) > r.sku_count
          ) t) as rank
        FROM ranked r
        WHERE r.sku_count > 0
        ORDER BY r.shelf_share_pct DESC
      `, [brandName]);
      return result.rows.map(row => ({
        brandName,
        category: row.category,
        skuCount: parseInt(row.sku_count) || 0,
        categoryTotalSkus: parseInt(row.category_total_skus) || 0,
        shelfSharePercent: parseFloat(row.shelf_share_pct) || 0,
        rank: parseInt(row.rank) || 0,
      }));
    }, 15)).data;
  }
  /**
   * Get brand presence by state/region: stores, SKUs and average price for
   * each state where the brand has products, ordered by store count.
   */
  async getBrandPresenceByState(brandName: string): Promise<BrandPresenceByState[]> {
    const key = cacheKey('brand_presence_state', { brandName });
    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          d.state,
          COUNT(DISTINCT dp.dispensary_id) as store_count,
          COUNT(*) as sku_count,
          AVG(extract_min_price(dp.latest_raw_payload)) as avg_price
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        WHERE dp.brand_name = $1
        GROUP BY d.state
        ORDER BY store_count DESC
      `, [brandName]);
      return result.rows.map(row => ({
        state: row.state,
        storeCount: parseInt(row.store_count) || 0,
        skuCount: parseInt(row.sku_count) || 0,
        avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
      }));
    }, 15)).data;
  }
  /**
   * Get stores carrying a brand, with per-store SKU count, average price and
   * the categories stocked, ordered by SKU count descending.
   */
  async getStoresCarryingBrand(brandName: string): Promise<Array<{
    storeId: number;
    storeName: string;
    city: string;
    state: string;
    skuCount: number;
    avgPrice: number | null;
    categories: string[];
  }>> {
    const key = cacheKey('stores_carrying_brand', { brandName });
    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          d.id as store_id,
          d.name as store_name,
          d.city,
          d.state,
          COUNT(*) as sku_count,
          AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
          ARRAY_AGG(DISTINCT dp.type) FILTER (WHERE dp.type IS NOT NULL) as categories
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        WHERE dp.brand_name = $1
        GROUP BY d.id, d.name, d.city, d.state
        ORDER BY sku_count DESC
      `, [brandName]);
      return result.rows.map(row => ({
        storeId: row.store_id,
        storeName: row.store_name,
        city: row.city,
        state: row.state,
        skuCount: parseInt(row.sku_count) || 0,
        avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
        categories: row.categories || [],
      }));
    }, 15)).data;
  }
  /**
   * Get penetration heatmap data (state-based).
   *
   * With a brandName: per-state store totals, stores carrying the brand,
   * and penetration percent. Without one: overall market stats per state,
   * where `storesWithBrand` is repurposed as the distinct brand count and
   * penetration is fixed at 100.
   */
  async getPenetrationHeatmap(
    brandName?: string
  ): Promise<Array<{
    state: string;
    totalStores: number;
    storesWithBrand: number;
    penetrationPercent: number;
    totalSkus: number;
  }>> {
    const key = cacheKey('penetration_heatmap', { brandName });
    return (await this.cache.getOrCompute(key, async () => {
      if (brandName) {
        const result = await this.pool.query(`
          WITH state_totals AS (
            SELECT state, COUNT(*) as total_stores
            FROM dispensaries
            GROUP BY state
          ),
          brand_by_state AS (
            SELECT
              d.state,
              COUNT(DISTINCT dp.dispensary_id) as stores_with_brand,
              COUNT(*) as total_skus
            FROM dutchie_products dp
            JOIN dispensaries d ON dp.dispensary_id = d.id
            WHERE dp.brand_name = $1
            GROUP BY d.state
          )
          SELECT
            st.state,
            st.total_stores,
            COALESCE(bs.stores_with_brand, 0) as stores_with_brand,
            ROUND(COALESCE(bs.stores_with_brand, 0)::NUMERIC / st.total_stores * 100, 1) as penetration_pct,
            COALESCE(bs.total_skus, 0) as total_skus
          FROM state_totals st
          LEFT JOIN brand_by_state bs ON st.state = bs.state
          ORDER BY penetration_pct DESC
        `, [brandName]);
        return result.rows.map(row => ({
          state: row.state,
          totalStores: parseInt(row.total_stores) || 0,
          storesWithBrand: parseInt(row.stores_with_brand) || 0,
          penetrationPercent: parseFloat(row.penetration_pct) || 0,
          totalSkus: parseInt(row.total_skus) || 0,
        }));
      } else {
        // Overall market data by state.
        // NOTE(review): COUNT(*) over a LEFT JOIN counts one row per
        // product-less store, so total_skus is inflated by 1 for each store
        // with no products — consider COUNT(dp.id) if exactness matters.
        const result = await this.pool.query(`
          SELECT
            d.state,
            COUNT(DISTINCT d.id) as total_stores,
            COUNT(DISTINCT dp.brand_name) as brand_count,
            COUNT(*) as total_skus
          FROM dispensaries d
          LEFT JOIN dutchie_products dp ON d.id = dp.dispensary_id
          GROUP BY d.state
          ORDER BY total_stores DESC
        `);
        return result.rows.map(row => ({
          state: row.state,
          totalStores: parseInt(row.total_stores) || 0,
          storesWithBrand: parseInt(row.brand_count) || 0, // Using brand count here
          penetrationPercent: 100, // Full penetration for overall view
          totalSkus: parseInt(row.total_skus) || 0,
        }));
      }
    }, 30)).data;
  }
}

View File

@@ -0,0 +1,534 @@
/**
* Price Trend Analytics Service
*
* Provides time-series price analytics including:
* - Price over time for products
* - Average MSRP/Wholesale by period
* - Price volatility scoring
* - Price compression detection
*
* Phase 3: Analytics Dashboards
*/
import { Pool } from 'pg';
import { AnalyticsCache, cacheKey } from './cache';
/** One aggregated day of price data. Prices are dollars; null = no data. */
export interface PricePoint {
  date: string;
  minPrice: number | null;
  maxPrice: number | null;
  avgPrice: number | null;
  wholesalePrice: number | null;
  /** Number of underlying rows aggregated into this point. */
  sampleSize: number;
}
/** Time series plus summary stats for a product/brand/category scope. */
export interface PriceTrend {
  productId?: number;
  storeId?: number;
  brandName?: string;
  category?: string;
  /** Daily points, ascending by date. */
  dataPoints: PricePoint[];
  summary: {
    currentAvg: number | null;
    previousAvg: number | null;
    changePercent: number | null;
    trend: 'up' | 'down' | 'stable';
    volatilityScore: number | null;
  };
}
/** Rolling-window price aggregates (retail and wholesale). */
export interface PriceSummary {
  avg7d: number | null;
  avg30d: number | null;
  avg90d: number | null;
  wholesaleAvg7d: number | null;
  wholesaleAvg30d: number | null;
  wholesaleAvg90d: number | null;
  minPrice: number | null;
  maxPrice: number | null;
  priceRange: number | null;
  volatilityScore: number | null;
}
/** Per-category result of price compression detection. */
export interface PriceCompressionResult {
  category: string;
  /** Brands ordered by average price, with distance from the category mean. */
  brands: Array<{
    brandName: string;
    avgPrice: number;
    priceDistance: number; // distance from category mean
  }>;
  compressionScore: number; // 0-100, higher = more compressed
  standardDeviation: number;
}
/** Optional filters accepted by PriceTrendService queries. */
export interface PriceFilters {
  storeId?: number;
  brandName?: string;
  category?: string;
  state?: string;
  days?: number;
}
export class PriceTrendService {
private pool: Pool;
private cache: AnalyticsCache;
// Shared PG pool and analytics result cache are injected by the caller.
constructor(pool: Pool, cache: AnalyticsCache) {
  this.pool = pool;
  this.cache = cache;
}
/**
 * Price trend for a single product, optionally scoped to one store.
 *
 * Primary source is `dutchie_product_snapshots`, aggregated per day. When a
 * product has no snapshots yet, a single point is synthesized from the
 * product's current payload so callers always get a chart-able series.
 * Results are cached for 15 minutes.
 */
async getProductPriceTrend(
  productId: number,
  storeId?: number,
  days: number = 30
): Promise<PriceTrend> {
  const key = cacheKey('price_trend_product', { productId, storeId, days });
  return (await this.cache.getOrCompute(key, async () => {
    // $3 is only appended (and only referenced in the SQL) when storeId is set.
    const queryParams: number[] = [productId, days];
    if (storeId) {
      queryParams.push(storeId);
    }
    const snapshotResult = await this.pool.query(`
      SELECT
        DATE(crawled_at) as date,
        MIN(rec_min_price_cents) / 100.0 as min_price,
        MAX(rec_max_price_cents) / 100.0 as max_price,
        AVG(rec_min_price_cents) / 100.0 as avg_price,
        AVG(wholesale_min_price_cents) / 100.0 as wholesale_price,
        COUNT(*) as sample_size
      FROM dutchie_product_snapshots
      WHERE dutchie_product_id = $1
        AND crawled_at >= NOW() - ($2 || ' days')::INTERVAL
        ${storeId ? 'AND dispensary_id = $3' : ''}
      GROUP BY DATE(crawled_at)
      ORDER BY date
    `, queryParams);
    let dataPoints: PricePoint[] = snapshotResult.rows.map(r => ({
      date: r.date.toISOString().split('T')[0],
      minPrice: parseFloat(r.min_price) || null,
      maxPrice: parseFloat(r.max_price) || null,
      avgPrice: parseFloat(r.avg_price) || null,
      wholesalePrice: parseFloat(r.wholesale_price) || null,
      sampleSize: parseInt(r.sample_size),
    }));
    // No history yet: fall back to the product's current prices as today's point.
    if (dataPoints.length === 0) {
      const productResult = await this.pool.query(`
        SELECT
          extract_min_price(latest_raw_payload) as min_price,
          extract_max_price(latest_raw_payload) as max_price,
          extract_wholesale_price(latest_raw_payload) as wholesale_price
        FROM dutchie_products
        WHERE id = $1
      `, [productId]);
      const current = productResult.rows[0];
      if (current) {
        dataPoints = [{
          date: new Date().toISOString().split('T')[0],
          minPrice: parseFloat(current.min_price) || null,
          maxPrice: parseFloat(current.max_price) || null,
          // Only a min price is extracted, so it doubles as the average.
          avgPrice: parseFloat(current.min_price) || null,
          wholesalePrice: parseFloat(current.wholesale_price) || null,
          sampleSize: 1,
        }];
      }
    }
    return {
      productId,
      storeId,
      dataPoints,
      summary: this.calculatePriceSummary(dataPoints),
    };
  }, 15)).data;
}
/**
 * Get price trends by brand.
 *
 * Aggregates CURRENT `dutchie_products` rows by their updated_at date (a
 * proxy for history — not true snapshots), optionally narrowed by store,
 * category and state. Cached for 15 minutes.
 */
async getBrandPriceTrend(
  brandName: string,
  filters: PriceFilters = {}
): Promise<PriceTrend> {
  const { storeId, category, state, days = 30 } = filters;
  const key = cacheKey('price_trend_brand', { brandName, storeId, category, state, days });
  return (await this.cache.getOrCompute(key, async () => {
    // Use current product data aggregated by date.
    // Placeholder numbers below are hand-computed from which optional
    // filters are present ($1 = brandName, $2 = days, then storeId,
    // category, state in that order). NOTE(review): this assumes
    // buildParams appends values in exactly that order — confirm against
    // its definition, since a mismatch silently binds the wrong values.
    const result = await this.pool.query(`
      SELECT
        DATE(dp.updated_at) as date,
        MIN(extract_min_price(dp.latest_raw_payload)) as min_price,
        MAX(extract_max_price(dp.latest_raw_payload)) as max_price,
        AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
        AVG(extract_wholesale_price(dp.latest_raw_payload)) as wholesale_price,
        COUNT(*) as sample_size
      FROM dutchie_products dp
      JOIN dispensaries d ON dp.dispensary_id = d.id
      WHERE dp.brand_name = $1
        AND dp.updated_at >= NOW() - ($2 || ' days')::INTERVAL
        ${storeId ? 'AND dp.dispensary_id = $3' : ''}
        ${category ? `AND dp.type = $${storeId ? 4 : 3}` : ''}
        ${state ? `AND d.state = $${storeId ? (category ? 5 : 4) : (category ? 4 : 3)}` : ''}
      GROUP BY DATE(dp.updated_at)
      ORDER BY date
    `, this.buildParams([brandName, days], { storeId, category, state }));
    // Map rows to PricePoints; `|| null` converts NaN (and a 0 price) to null.
    const dataPoints: PricePoint[] = result.rows.map(row => ({
      date: row.date.toISOString().split('T')[0],
      minPrice: parseFloat(row.min_price) || null,
      maxPrice: parseFloat(row.max_price) || null,
      avgPrice: parseFloat(row.avg_price) || null,
      wholesalePrice: parseFloat(row.wholesale_price) || null,
      sampleSize: parseInt(row.sample_size),
    }));
    return {
      brandName,
      storeId,
      category,
      dataPoints,
      summary: this.calculatePriceSummary(dataPoints),
    };
  }, 15)).data;
}
/**
 * Get price trends by category.
 *
 * Mirrors getBrandPriceTrend with the roles of category and brand swapped:
 * aggregates current `dutchie_products` rows by updated_at date for one
 * category, optionally narrowed by store, brand and state. Cached 15 min.
 */
async getCategoryPriceTrend(
  category: string,
  filters: PriceFilters = {}
): Promise<PriceTrend> {
  const { storeId, brandName, state, days = 30 } = filters;
  const key = cacheKey('price_trend_category', { category, storeId, brandName, state, days });
  return (await this.cache.getOrCompute(key, async () => {
    // Placeholder arithmetic: $1 = category, $2 = days, then storeId,
    // brandName, state in that order. NOTE(review): assumes buildParams
    // appends in the same order — verify against its definition.
    const result = await this.pool.query(`
      SELECT
        DATE(dp.updated_at) as date,
        MIN(extract_min_price(dp.latest_raw_payload)) as min_price,
        MAX(extract_max_price(dp.latest_raw_payload)) as max_price,
        AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
        AVG(extract_wholesale_price(dp.latest_raw_payload)) as wholesale_price,
        COUNT(*) as sample_size
      FROM dutchie_products dp
      JOIN dispensaries d ON dp.dispensary_id = d.id
      WHERE dp.type = $1
        AND dp.updated_at >= NOW() - ($2 || ' days')::INTERVAL
        ${storeId ? 'AND dp.dispensary_id = $3' : ''}
        ${brandName ? `AND dp.brand_name = $${storeId ? 4 : 3}` : ''}
        ${state ? `AND d.state = $${storeId ? (brandName ? 5 : 4) : (brandName ? 4 : 3)}` : ''}
      GROUP BY DATE(dp.updated_at)
      ORDER BY date
    `, this.buildParams([category, days], { storeId, brandName, state }));
    // `|| null` converts NaN (and a 0 price) to null.
    const dataPoints: PricePoint[] = result.rows.map(row => ({
      date: row.date.toISOString().split('T')[0],
      minPrice: parseFloat(row.min_price) || null,
      maxPrice: parseFloat(row.max_price) || null,
      avgPrice: parseFloat(row.avg_price) || null,
      wholesalePrice: parseFloat(row.wholesale_price) || null,
      sampleSize: parseInt(row.sample_size),
    }));
    return {
      category,
      storeId,
      brandName,
      dataPoints,
      summary: this.calculatePriceSummary(dataPoints),
    };
  }, 15)).data;
}
/**
 * Aggregate price statistics for an arbitrary filter combination.
 *
 * Computes avg/min/max/stddev over current product min-prices. The 7/30/90
 * day fields all carry the same current-data value (no history here), as
 * the original comments note. Cached for 30 minutes.
 */
async getPriceSummary(filters: PriceFilters = {}): Promise<PriceSummary> {
  const { storeId, brandName, category, state } = filters;
  const key = cacheKey('price_summary', filters as Record<string, unknown>);
  return (await this.cache.getOrCompute(key, async () => {
    // Accumulate "column = $n" clauses; the placeholder number is simply
    // the value's 1-based position in `params`.
    const clauses: string[] = [];
    const params: (string | number)[] = [];
    const addFilter = (column: string, value: string | number): void => {
      params.push(value);
      clauses.push(`${column} = $${params.length}`);
    };
    if (storeId) {
      addFilter('dp.dispensary_id', storeId);
    }
    if (brandName) {
      addFilter('dp.brand_name', brandName);
    }
    if (category) {
      addFilter('dp.type', category);
    }
    if (state) {
      addFilter('d.state', state);
    }
    const whereClause = clauses.length > 0
      ? 'WHERE ' + clauses.join(' AND ')
      : '';
    const result = await this.pool.query(`
      WITH prices AS (
        SELECT
          extract_min_price(dp.latest_raw_payload) as min_price,
          extract_max_price(dp.latest_raw_payload) as max_price,
          extract_wholesale_price(dp.latest_raw_payload) as wholesale_price
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        ${whereClause}
      )
      SELECT
        AVG(min_price) as avg_price,
        AVG(wholesale_price) as avg_wholesale,
        MIN(min_price) as min_price,
        MAX(max_price) as max_price,
        STDDEV(min_price) as std_dev
      FROM prices
      WHERE min_price IS NOT NULL
    `, params);
    const row = result.rows[0];
    const avgPrice = parseFloat(row.avg_price) || null;
    const avgWholesale = parseFloat(row.avg_wholesale) || null;
    const stdDev = parseFloat(row.std_dev) || null;
    // Coefficient of variation expressed as a percentage.
    const volatility = avgPrice && stdDev ? (stdDev / avgPrice) * 100 : null;
    return {
      avg7d: avgPrice, // Using current data as proxy
      avg30d: avgPrice,
      avg90d: avgPrice,
      wholesaleAvg7d: avgWholesale,
      wholesaleAvg30d: avgWholesale,
      wholesaleAvg90d: avgWholesale,
      minPrice: parseFloat(row.min_price) || null,
      maxPrice: parseFloat(row.max_price) || null,
      priceRange: row.max_price && row.min_price
        ? parseFloat(row.max_price) - parseFloat(row.min_price)
        : null,
      volatilityScore: volatility ? Math.round(volatility * 10) / 10 : null,
    };
  }, 30)).data;
}
/**
 * Detect price compression in a category.
 *
 * Looks at per-brand average prices (brands with >= 3 SKUs) and scores how
 * tightly they cluster around the category mean: 100 - coefficient of
 * variation, clamped to [0, 100], where higher means more compressed.
 * Cached for 30 minutes.
 */
async detectPriceCompression(
  category: string,
  state?: string
): Promise<PriceCompressionResult> {
  const key = cacheKey('price_compression', { category, state });
  return (await this.cache.getOrCompute(key, async () => {
    const queryParams: string[] = [category];
    if (state) {
      queryParams.push(state);
    }
    const result = await this.pool.query(`
      WITH brand_prices AS (
        SELECT
          dp.brand_name,
          AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
          COUNT(*) as sku_count
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        WHERE dp.type = $1
          AND dp.brand_name IS NOT NULL
          ${state ? 'AND d.state = $2' : ''}
        GROUP BY dp.brand_name
        HAVING COUNT(*) >= 3
      ),
      stats AS (
        SELECT
          AVG(avg_price) as category_avg,
          STDDEV(avg_price) as std_dev
        FROM brand_prices
        WHERE avg_price IS NOT NULL
      )
      SELECT
        bp.brand_name,
        bp.avg_price,
        ABS(bp.avg_price - s.category_avg) as price_distance,
        s.category_avg,
        s.std_dev
      FROM brand_prices bp, stats s
      WHERE bp.avg_price IS NOT NULL
      ORDER BY bp.avg_price
    `, queryParams);
    const rows = result.rows;
    // No qualifying brands: report zero compression rather than failing.
    if (rows.length === 0) {
      return {
        category,
        brands: [],
        compressionScore: 0,
        standardDeviation: 0,
      };
    }
    // Every row carries the same stats columns; read them off the first.
    const categoryAvg = parseFloat(rows[0].category_avg) || 0;
    const stdDev = parseFloat(rows[0].std_dev) || 0;
    // Compression = inverse of the coefficient of variation, on a 0-100 scale.
    const coefficientOfVariation = categoryAvg > 0 ? (stdDev / categoryAvg) * 100 : 0;
    const compressionScore = Math.max(0, Math.min(100, 100 - coefficientOfVariation));
    return {
      category,
      brands: rows.map(r => ({
        brandName: r.brand_name,
        avgPrice: parseFloat(r.avg_price) || 0,
        priceDistance: parseFloat(r.price_distance) || 0,
      })),
      compressionScore: Math.round(compressionScore),
      standardDeviation: Math.round(stdDev * 100) / 100,
    };
  }, 30)).data;
}
/**
* Get global price statistics
*/
/**
 * Global price statistics across all products: overall avg/median,
 * plus average price broken down by category and by state.
 * Cached for 30 minutes under a fixed key.
 */
async getGlobalPriceStats(): Promise<{
  totalProductsWithPrice: number;
  avgPrice: number | null;
  medianPrice: number | null;
  priceByCategory: Array<{ category: string; avgPrice: number; count: number }>;
  priceByState: Array<{ state: string; avgPrice: number; count: number }>;
}> {
  const key = 'global_price_stats';
  return (await this.cache.getOrCompute(key, async () => {
    // The three aggregations are independent, so run them concurrently.
    const [overall, byCategory, byState] = await Promise.all([
      this.pool.query(`
        SELECT
          COUNT(*) FILTER (WHERE extract_min_price(latest_raw_payload) IS NOT NULL) as with_price,
          AVG(extract_min_price(latest_raw_payload)) as avg_price,
          PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY extract_min_price(latest_raw_payload)) as median
        FROM dutchie_products
      `),
      this.pool.query(`
        SELECT
          type as category,
          AVG(extract_min_price(latest_raw_payload)) as avg_price,
          COUNT(*) as count
        FROM dutchie_products
        WHERE type IS NOT NULL
        AND extract_min_price(latest_raw_payload) IS NOT NULL
        GROUP BY type
        ORDER BY avg_price DESC
      `),
      this.pool.query(`
        SELECT
          d.state,
          AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
          COUNT(*) as count
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        WHERE extract_min_price(dp.latest_raw_payload) IS NOT NULL
        GROUP BY d.state
        ORDER BY avg_price DESC
      `),
    ]);
    const summary = overall.rows[0];
    return {
      totalProductsWithPrice: parseInt(summary?.with_price || '0'),
      avgPrice: parseFloat(summary?.avg_price) || null,
      medianPrice: parseFloat(summary?.median) || null,
      priceByCategory: byCategory.rows.map(r => ({
        category: r.category,
        avgPrice: parseFloat(r.avg_price) || 0,
        count: parseInt(r.count),
      })),
      priceByState: byState.rows.map(r => ({
        state: r.state,
        avgPrice: parseFloat(r.avg_price) || 0,
        count: parseInt(r.count),
      })),
    };
  }, 30)).data;
}
// ============================================================
// HELPER METHODS
// ============================================================
/**
 * Summarize a price time series: current vs midpoint average,
 * percent change, trend direction, and volatility (coefficient of
 * variation, as a percentage).
 */
private calculatePriceSummary(dataPoints: PricePoint[]): PriceTrend['summary'] {
  const emptySummary: PriceTrend['summary'] = {
    currentAvg: null,
    previousAvg: null,
    changePercent: null,
    trend: 'stable',
    volatilityScore: null,
  };
  if (dataPoints.length === 0) return emptySummary;
  // Keep only points that actually have a price.
  const observed = dataPoints
    .map(d => d.avgPrice)
    .filter((p): p is number => p !== null);
  if (observed.length === 0) return emptySummary;
  // Latest price vs the series midpoint as the "previous" reference.
  const latest = observed[observed.length - 1];
  const mid = Math.floor(observed.length / 2);
  const baseline = observed.length > 1 ? observed[mid] : latest;
  const changePercent = baseline > 0
    ? ((latest - baseline) / baseline) * 100
    : null;
  // Volatility = stddev / mean, expressed as a percentage.
  const mean = observed.reduce((a, b) => a + b, 0) / observed.length;
  const variance = observed.reduce((acc, p) => acc + (p - mean) ** 2, 0) / observed.length;
  const volatility = mean > 0 ? (Math.sqrt(variance) / mean) * 100 : null;
  // A move of more than ±5% counts as a directional trend.
  let trend: 'up' | 'down' | 'stable' = 'stable';
  if (changePercent !== null && changePercent > 5) trend = 'up';
  else if (changePercent !== null && changePercent < -5) trend = 'down';
  return {
    currentAvg: Math.round(latest * 100) / 100,
    previousAvg: Math.round(baseline * 100) / 100,
    changePercent: changePercent === null ? null : Math.round(changePercent * 10) / 10,
    trend,
    volatilityScore: volatility === null ? null : Math.round(volatility * 10) / 10,
  };
}
/**
 * Append every defined optional value to a copy of the base parameter
 * list, preserving the optional object's key order. Undefined values
 * are skipped so SQL placeholder numbering stays contiguous.
 */
private buildParams(
  baseParams: (string | number)[],
  optionalParams: Record<string, string | number | undefined>
): (string | number)[] {
  const defined = Object.values(optionalParams).filter(
    (v): v is string | number => v !== undefined
  );
  return [...baseParams, ...defined];
}
}

View File

@@ -0,0 +1,587 @@
/**
* Store Change Tracking Service
*
* Tracks changes at the store level including:
* - New/lost brands
* - New/discontinued products
* - Stock status transitions
* - Price changes
* - Category movement leaderboards
*
* Phase 3: Analytics Dashboards
*/
import { Pool } from 'pg';
import { AnalyticsCache, cacheKey } from './cache';
/** Rolling 7/30-day change counters for one store, keyed off store_change_events. */
export interface StoreChangeSummary {
  storeId: number;
  storeName: string;
  city: string;
  state: string;
  brandsAdded7d: number;
  brandsAdded30d: number;
  brandsLost7d: number;
  brandsLost30d: number;
  productsAdded7d: number;
  productsAdded30d: number;
  productsDiscontinued7d: number;
  productsDiscontinued30d: number;
  priceDrops7d: number;
  priceIncreases7d: number;
  restocks7d: number;
  stockOuts7d: number;
}
/** One row from store_change_events, joined with the store name. */
export interface StoreChangeEvent {
  id: number;
  storeId: number;
  storeName: string;
  // Raw event_type string as stored in the DB (e.g. 'brand_added').
  eventType: string;
  // Formatted as YYYY-MM-DD.
  eventDate: string;
  brandName: string | null;
  productName: string | null;
  category: string | null;
  oldValue: string | null;
  newValue: string | null;
  metadata: Record<string, unknown> | null;
}
/** A brand appearing at or disappearing from a store. */
export interface BrandChange {
  brandName: string;
  changeType: 'added' | 'removed';
  // Formatted as YYYY-MM-DD.
  date: string;
  // Pulled from event metadata; 0 when metadata is absent.
  skuCount: number;
  categories: string[];
}
/** A product-level change event, mapped to a friendly changeType. */
export interface ProductChange {
  productId: number;
  productName: string;
  brandName: string | null;
  category: string | null;
  changeType: 'added' | 'discontinued' | 'price_drop' | 'price_increase' | 'restocked' | 'out_of_stock';
  date: string;
  oldValue?: string;
  newValue?: string;
}
/** Per-store ranking entry for a single category. */
export interface CategoryLeaderboard {
  category: string;
  storeId: number;
  storeName: string;
  skuCount: number;
  brandCount: number;
  avgPrice: number | null;
  // Currently always 0 (no historical comparison yet).
  changePercent7d: number;
  rank: number;
}
/** Common optional filters for store-change queries. */
export interface StoreFilters {
  storeId?: number;
  state?: string;
  days?: number;
  eventType?: string;
}
/**
 * Service for querying and recording store-level change events
 * (brand/product additions and removals, price and stock transitions).
 * All read paths are cached via AnalyticsCache with per-method TTLs.
 */
export class StoreChangeService {
  private pool: Pool;
  private cache: AnalyticsCache;
  constructor(pool: Pool, cache: AnalyticsCache) {
    this.pool = pool;
    this.cache = cache;
  }
  /**
   * Get the 7/30-day change summary for a store.
   *
   * @param storeId - Dispensary id
   * @returns Summary counters, or null when the store does not exist.
   *   Cached for 15 minutes (a null result is cached too).
   */
  async getStoreChangeSummary(
    storeId: number
  ): Promise<StoreChangeSummary | null> {
    const key = cacheKey('store_change_summary', { storeId });
    return (await this.cache.getOrCompute(key, async () => {
      // Store identity for the summary header.
      const storeResult = await this.pool.query(`
        SELECT id, name, city, state FROM dispensaries WHERE id = $1
      `, [storeId]);
      if (storeResult.rows.length === 0) return null;
      const store = storeResult.rows[0];
      // One row per event type, with 7d/30d windows computed via FILTER.
      const eventsResult = await this.pool.query(`
        SELECT
          event_type,
          COUNT(*) FILTER (WHERE event_date >= CURRENT_DATE - INTERVAL '7 days') as count_7d,
          COUNT(*) FILTER (WHERE event_date >= CURRENT_DATE - INTERVAL '30 days') as count_30d
        FROM store_change_events
        WHERE store_id = $1
        GROUP BY event_type
      `, [storeId]);
      // Index counts by event type; missing types default to 0 below.
      const counts: Record<string, { count_7d: number; count_30d: number }> = {};
      eventsResult.rows.forEach(row => {
        counts[row.event_type] = {
          count_7d: parseInt(row.count_7d) || 0,
          count_30d: parseInt(row.count_30d) || 0,
        };
      });
      return {
        storeId: store.id,
        storeName: store.name,
        city: store.city,
        state: store.state,
        brandsAdded7d: counts['brand_added']?.count_7d || 0,
        brandsAdded30d: counts['brand_added']?.count_30d || 0,
        brandsLost7d: counts['brand_removed']?.count_7d || 0,
        brandsLost30d: counts['brand_removed']?.count_30d || 0,
        productsAdded7d: counts['product_added']?.count_7d || 0,
        productsAdded30d: counts['product_added']?.count_30d || 0,
        productsDiscontinued7d: counts['product_removed']?.count_7d || 0,
        productsDiscontinued30d: counts['product_removed']?.count_30d || 0,
        priceDrops7d: counts['price_drop']?.count_7d || 0,
        priceIncreases7d: counts['price_increase']?.count_7d || 0,
        restocks7d: counts['restocked']?.count_7d || 0,
        stockOuts7d: counts['out_of_stock']?.count_7d || 0,
      };
    }, 15)).data;
  }
  /**
   * Get recent change events for a store, newest first.
   *
   * @param storeId - Dispensary id
   * @param filters - Optional eventType filter, lookback window (default
   *   30 days), and row limit (default 100). Cached for 5 minutes.
   */
  async getStoreChangeEvents(
    storeId: number,
    filters: { eventType?: string; days?: number; limit?: number } = {}
  ): Promise<StoreChangeEvent[]> {
    const { eventType, days = 30, limit = 100 } = filters;
    const key = cacheKey('store_change_events', { storeId, eventType, days, limit });
    return (await this.cache.getOrCompute(key, async () => {
      // $1=storeId, $2=days, $3=limit; $4 is appended only when filtering by type.
      const params: (string | number)[] = [storeId, days, limit];
      let eventTypeCondition = '';
      if (eventType) {
        eventTypeCondition = 'AND event_type = $4';
        params.push(eventType);
      }
      const result = await this.pool.query(`
        SELECT
          sce.id,
          sce.store_id,
          d.name as store_name,
          sce.event_type,
          sce.event_date,
          sce.brand_name,
          sce.product_name,
          sce.category,
          sce.old_value,
          sce.new_value,
          sce.metadata
        FROM store_change_events sce
        JOIN dispensaries d ON sce.store_id = d.id
        WHERE sce.store_id = $1
        AND sce.event_date >= CURRENT_DATE - ($2 || ' days')::INTERVAL
        ${eventTypeCondition}
        ORDER BY sce.event_date DESC, sce.id DESC
        LIMIT $3
      `, params);
      return result.rows.map(row => ({
        id: row.id,
        storeId: row.store_id,
        storeName: row.store_name,
        eventType: row.event_type,
        // NOTE(review): pg returns DATE columns as local-midnight Date objects;
        // toISOString() can shift the calendar day in negative-UTC-offset
        // timezones — confirm server timezone config.
        eventDate: row.event_date.toISOString().split('T')[0],
        brandName: row.brand_name,
        productName: row.product_name,
        category: row.category,
        oldValue: row.old_value,
        newValue: row.new_value,
        metadata: row.metadata,
      }));
    }, 5)).data;
  }
  /**
   * Get brands newly added to a store within the lookback window.
   * Cached for 15 minutes.
   */
  async getNewBrands(
    storeId: number,
    days: number = 30
  ): Promise<BrandChange[]> {
    const key = cacheKey('new_brands', { storeId, days });
    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          brand_name,
          event_date,
          metadata
        FROM store_change_events
        WHERE store_id = $1
        AND event_type = 'brand_added'
        AND event_date >= CURRENT_DATE - ($2 || ' days')::INTERVAL
        ORDER BY event_date DESC
      `, [storeId, days]);
      return result.rows.map(row => ({
        brandName: row.brand_name,
        changeType: 'added' as const,
        date: row.event_date.toISOString().split('T')[0],
        // sku_count/categories are optional metadata written at event time.
        skuCount: row.metadata?.sku_count || 0,
        categories: row.metadata?.categories || [],
      }));
    }, 15)).data;
  }
  /**
   * Get brands that disappeared from a store within the lookback window.
   * Mirrors getNewBrands but for 'brand_removed' events. Cached 15 minutes.
   */
  async getLostBrands(
    storeId: number,
    days: number = 30
  ): Promise<BrandChange[]> {
    const key = cacheKey('lost_brands', { storeId, days });
    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          brand_name,
          event_date,
          metadata
        FROM store_change_events
        WHERE store_id = $1
        AND event_type = 'brand_removed'
        AND event_date >= CURRENT_DATE - ($2 || ' days')::INTERVAL
        ORDER BY event_date DESC
      `, [storeId, days]);
      return result.rows.map(row => ({
        brandName: row.brand_name,
        changeType: 'removed' as const,
        date: row.event_date.toISOString().split('T')[0],
        skuCount: row.metadata?.sku_count || 0,
        categories: row.metadata?.categories || [],
      }));
    }, 15)).data;
  }
  /**
   * Get product-level change events for a store (max 100, newest first).
   *
   * @param changeType - Optional friendly type; mapped to the stored
   *   event_type value before querying.
   * @param days - Lookback window, default 7. Cached for 5 minutes.
   */
  async getProductChanges(
    storeId: number,
    changeType?: 'added' | 'discontinued' | 'price_drop' | 'price_increase' | 'restocked' | 'out_of_stock',
    days: number = 7
  ): Promise<ProductChange[]> {
    const key = cacheKey('product_changes', { storeId, changeType, days });
    return (await this.cache.getOrCompute(key, async () => {
      // Friendly changeType -> stored event_type.
      const eventTypeMap: Record<string, string> = {
        'added': 'product_added',
        'discontinued': 'product_removed',
        'price_drop': 'price_drop',
        'price_increase': 'price_increase',
        'restocked': 'restocked',
        'out_of_stock': 'out_of_stock',
      };
      const params: (string | number)[] = [storeId, days];
      let eventCondition = '';
      if (changeType) {
        eventCondition = 'AND event_type = $3';
        params.push(eventTypeMap[changeType]);
      }
      const result = await this.pool.query(`
        SELECT
          product_id,
          product_name,
          brand_name,
          category,
          event_type,
          event_date,
          old_value,
          new_value
        FROM store_change_events
        WHERE store_id = $1
        AND event_date >= CURRENT_DATE - ($2 || ' days')::INTERVAL
        AND product_id IS NOT NULL
        ${eventCondition}
        ORDER BY event_date DESC
        LIMIT 100
      `, params);
      // Stored event_type -> friendly changeType for the response shape.
      const reverseMap: Record<string, ProductChange['changeType']> = {
        'product_added': 'added',
        'product_removed': 'discontinued',
        'price_drop': 'price_drop',
        'price_increase': 'price_increase',
        'restocked': 'restocked',
        'out_of_stock': 'out_of_stock',
      };
      return result.rows.map(row => ({
        productId: row.product_id,
        productName: row.product_name,
        brandName: row.brand_name,
        category: row.category,
        changeType: reverseMap[row.event_type] || 'added',
        date: row.event_date.toISOString().split('T')[0],
        oldValue: row.old_value,
        newValue: row.new_value,
      }));
    }, 5)).data;
  }
  /**
   * Rank stores by SKU count within one category.
   * Cached for 15 minutes.
   */
  async getCategoryLeaderboard(
    category: string,
    limit: number = 20
  ): Promise<CategoryLeaderboard[]> {
    const key = cacheKey('category_leaderboard', { category, limit });
    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        WITH store_category_stats AS (
          SELECT
            dp.dispensary_id as store_id,
            d.name as store_name,
            COUNT(*) as sku_count,
            COUNT(DISTINCT dp.brand_name) as brand_count,
            AVG(extract_min_price(dp.latest_raw_payload)) as avg_price
          FROM dutchie_products dp
          JOIN dispensaries d ON dp.dispensary_id = d.id
          WHERE dp.type = $1
          GROUP BY dp.dispensary_id, d.name
        )
        SELECT
          scs.*,
          RANK() OVER (ORDER BY scs.sku_count DESC) as rank
        FROM store_category_stats scs
        ORDER BY scs.sku_count DESC
        LIMIT $2
      `, [category, limit]);
      return result.rows.map(row => ({
        category,
        storeId: row.store_id,
        storeName: row.store_name,
        skuCount: parseInt(row.sku_count) || 0,
        brandCount: parseInt(row.brand_count) || 0,
        avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
        changePercent7d: 0, // Placeholder until historical snapshots are wired in.
        rank: parseInt(row.rank) || 0,
      }));
    }, 15)).data;
  }
  /**
   * Rank stores by total change-event volume in the lookback window,
   * with per-kind breakdowns. Cached for 15 minutes.
   */
  async getMostActiveStores(
    days: number = 7,
    limit: number = 10
  ): Promise<Array<{
    storeId: number;
    storeName: string;
    city: string;
    state: string;
    totalChanges: number;
    brandsChanged: number;
    productsChanged: number;
    priceChanges: number;
    stockChanges: number;
  }>> {
    const key = cacheKey('most_active_stores', { days, limit });
    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          d.id as store_id,
          d.name as store_name,
          d.city,
          d.state,
          COUNT(*) as total_changes,
          COUNT(*) FILTER (WHERE sce.event_type IN ('brand_added', 'brand_removed')) as brands_changed,
          COUNT(*) FILTER (WHERE sce.event_type IN ('product_added', 'product_removed')) as products_changed,
          COUNT(*) FILTER (WHERE sce.event_type IN ('price_drop', 'price_increase')) as price_changes,
          COUNT(*) FILTER (WHERE sce.event_type IN ('restocked', 'out_of_stock')) as stock_changes
        FROM store_change_events sce
        JOIN dispensaries d ON sce.store_id = d.id
        WHERE sce.event_date >= CURRENT_DATE - ($1 || ' days')::INTERVAL
        GROUP BY d.id, d.name, d.city, d.state
        ORDER BY total_changes DESC
        LIMIT $2
      `, [days, limit]);
      return result.rows.map(row => ({
        storeId: row.store_id,
        storeName: row.store_name,
        city: row.city,
        state: row.state,
        totalChanges: parseInt(row.total_changes) || 0,
        brandsChanged: parseInt(row.brands_changed) || 0,
        productsChanged: parseInt(row.products_changed) || 0,
        priceChanges: parseInt(row.price_changes) || 0,
        stockChanges: parseInt(row.stock_changes) || 0,
      }));
    }, 15)).data;
  }
  /**
   * Compare two stores: brand overlap, brands unique to each, and
   * per-category SKU counts side by side. Cached for 15 minutes.
   */
  async compareStores(
    storeId1: number,
    storeId2: number
  ): Promise<{
    store1: { id: number; name: string; brands: string[]; categories: string[]; skuCount: number };
    store2: { id: number; name: string; brands: string[]; categories: string[]; skuCount: number };
    sharedBrands: string[];
    uniqueToStore1: string[];
    uniqueToStore2: string[];
    categoryComparison: Array<{
      category: string;
      store1Skus: number;
      store2Skus: number;
      difference: number;
    }>;
  }> {
    const key = cacheKey('compare_stores', { storeId1, storeId2 });
    return (await this.cache.getOrCompute(key, async () => {
      const [store1Data, store2Data] = await Promise.all([
        // COUNT(dp.id) (not COUNT(*)): with the LEFT JOIN, a store with no
        // products would otherwise report sku_count = 1 for its bare row.
        this.pool.query(`
          SELECT
            d.id, d.name,
            ARRAY_AGG(DISTINCT dp.brand_name) FILTER (WHERE dp.brand_name IS NOT NULL) as brands,
            ARRAY_AGG(DISTINCT dp.type) FILTER (WHERE dp.type IS NOT NULL) as categories,
            COUNT(dp.id) as sku_count
          FROM dispensaries d
          LEFT JOIN dutchie_products dp ON d.id = dp.dispensary_id
          WHERE d.id = $1
          GROUP BY d.id, d.name
        `, [storeId1]),
        this.pool.query(`
          SELECT
            d.id, d.name,
            ARRAY_AGG(DISTINCT dp.brand_name) FILTER (WHERE dp.brand_name IS NOT NULL) as brands,
            ARRAY_AGG(DISTINCT dp.type) FILTER (WHERE dp.type IS NOT NULL) as categories,
            COUNT(dp.id) as sku_count
          FROM dispensaries d
          LEFT JOIN dutchie_products dp ON d.id = dp.dispensary_id
          WHERE d.id = $1
          GROUP BY d.id, d.name
        `, [storeId2]),
      ]);
      const s1 = store1Data.rows[0];
      const s2 = store2Data.rows[0];
      // The SQL FILTER already excludes NULL brand names; the runtime filter
      // is a defensive double-check.
      const brands1Array: string[] = (s1?.brands || []).filter((b: string | null): b is string => b !== null);
      const brands2Array: string[] = (s2?.brands || []).filter((b: string | null): b is string => b !== null);
      const brands1 = new Set(brands1Array);
      const brands2 = new Set(brands2Array);
      const sharedBrands: string[] = brands1Array.filter(b => brands2.has(b));
      const uniqueToStore1: string[] = brands1Array.filter(b => !brands2.has(b));
      const uniqueToStore2: string[] = brands2Array.filter(b => !brands1.has(b));
      // Category-by-category SKU counts, ordered by combined volume.
      const categoryResult = await this.pool.query(`
        WITH store1_cats AS (
          SELECT type as category, COUNT(*) as sku_count
          FROM dutchie_products WHERE dispensary_id = $1 AND type IS NOT NULL
          GROUP BY type
        ),
        store2_cats AS (
          SELECT type as category, COUNT(*) as sku_count
          FROM dutchie_products WHERE dispensary_id = $2 AND type IS NOT NULL
          GROUP BY type
        ),
        all_cats AS (
          SELECT category FROM store1_cats
          UNION
          SELECT category FROM store2_cats
        )
        SELECT
          ac.category,
          COALESCE(s1.sku_count, 0) as store1_skus,
          COALESCE(s2.sku_count, 0) as store2_skus
        FROM all_cats ac
        LEFT JOIN store1_cats s1 ON ac.category = s1.category
        LEFT JOIN store2_cats s2 ON ac.category = s2.category
        ORDER BY (COALESCE(s1.sku_count, 0) + COALESCE(s2.sku_count, 0)) DESC
      `, [storeId1, storeId2]);
      return {
        store1: {
          id: s1?.id || storeId1,
          name: s1?.name || 'Unknown',
          brands: s1?.brands || [],
          categories: s1?.categories || [],
          skuCount: parseInt(s1?.sku_count) || 0,
        },
        store2: {
          id: s2?.id || storeId2,
          name: s2?.name || 'Unknown',
          brands: s2?.brands || [],
          categories: s2?.categories || [],
          skuCount: parseInt(s2?.sku_count) || 0,
        },
        sharedBrands,
        uniqueToStore1,
        uniqueToStore2,
        categoryComparison: categoryResult.rows.map(row => ({
          category: row.category,
          store1Skus: parseInt(row.store1_skus) || 0,
          store2Skus: parseInt(row.store2_skus) || 0,
          difference: (parseInt(row.store1_skus) || 0) - (parseInt(row.store2_skus) || 0),
        })),
      };
    }, 15)).data;
  }
  /**
   * Record a change event (used by crawler/worker). Writes one row to
   * store_change_events dated CURRENT_DATE and invalidates the store's
   * cached summary.
   */
  async recordChangeEvent(event: {
    storeId: number;
    eventType: string;
    brandName?: string;
    productId?: number;
    productName?: string;
    category?: string;
    oldValue?: string;
    newValue?: string;
    metadata?: Record<string, unknown>;
  }): Promise<void> {
    await this.pool.query(`
      INSERT INTO store_change_events
      (store_id, event_type, event_date, brand_name, product_id, product_name, category, old_value, new_value, metadata)
      VALUES ($1, $2, CURRENT_DATE, $3, $4, $5, $6, $7, $8, $9)
    `, [
      event.storeId,
      event.eventType,
      event.brandName || null,
      event.productId || null,
      event.productName || null,
      event.category || null,
      event.oldValue || null,
      event.newValue || null,
      event.metadata ? JSON.stringify(event.metadata) : null,
    ]);
    // Invalidate only the summary; event-list/brand caches expire via TTL.
    // NOTE(review): pattern must match the format produced by cacheKey() —
    // verify against the cache module.
    await this.cache.invalidatePattern(`store_change_summary:storeId=${event.storeId}`);
  }
}

View File

@@ -1,20 +1,27 @@
/**
* AZDHS Import Service
* LEGACY SERVICE - AZDHS Import
*
* DEPRECATED: This service creates its own database pool.
* Future implementations should use the canonical CannaiQ connection.
*
* Imports Arizona dispensaries from the main database's dispensaries table
* (which was populated from AZDHS data) into the isolated Dutchie AZ database.
*
* This establishes the canonical list of AZ dispensaries to match against Dutchie.
*
* DO NOT:
* - Run this in automated jobs
* - Use DATABASE_URL directly
*/
import { Pool } from 'pg';
import { query as dutchieQuery } from '../db/connection';
import { Dispensary } from '../types';
// Main database connection (source of AZDHS data)
const MAIN_DATABASE_URL =
process.env.DATABASE_URL ||
'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';
// Single database connection (cannaiq in cannaiq-postgres container)
// Use CANNAIQ_DB_* env vars or defaults
const MAIN_DB_CONNECTION = process.env.CANNAIQ_DB_URL ||
`postgresql://${process.env.CANNAIQ_DB_USER || 'dutchie'}:${process.env.CANNAIQ_DB_PASS || 'dutchie_local_pass'}@${process.env.CANNAIQ_DB_HOST || 'localhost'}:${process.env.CANNAIQ_DB_PORT || '54320'}/${process.env.CANNAIQ_DB_NAME || 'cannaiq'}`;
/**
* AZDHS dispensary record from the main database
@@ -57,8 +64,9 @@ interface ImportResult {
* Create a temporary connection to the main database
*/
function getMainDBPool(): Pool {
console.warn('[AZDHS Import] LEGACY: Using separate pool. Should use canonical CannaiQ connection.');
return new Pool({
connectionString: MAIN_DATABASE_URL,
connectionString: MAIN_DB_CONNECTION,
max: 5,
idleTimeoutMillis: 30000,
connectionTimeoutMillis: 5000,

View File

@@ -344,15 +344,12 @@ export async function resolvePlatformDispensaryIds(): Promise<{ resolved: number
return { resolved, failed, skipped, notCrawlable };
}
// Use shared dispensary columns (handles optional columns like provider_detection_data)
import { DISPENSARY_COLUMNS } from '../db/dispensary-columns';
/**
* Get all dispensaries
*/
// Explicit column list for dispensaries table (avoids SELECT * issues with schema differences)
const DISPENSARY_COLUMNS = `
id, name, slug, city, state, zip, address, latitude, longitude,
menu_type, menu_url, platform_dispensary_id, website,
provider_detection_data, created_at, updated_at
`;
export async function getAllDispensaries(): Promise<Dispensary[]> {
const { rows } = await query(
@@ -386,7 +383,7 @@ export function mapDbRowToDispensary(row: any): Dispensary {
id: row.id,
platform: row.platform || 'dutchie', // keep platform as-is, default to 'dutchie'
name: row.name,
dbaName: row.dbaName || row.dba_name,
dbaName: row.dbaName || row.dba_name || undefined, // dba_name column is optional
slug: row.slug,
city: row.city,
state: row.state,
@@ -421,7 +418,6 @@ export async function getDispensaryById(id: number): Promise<Dispensary | null>
SELECT
id,
name,
dba_name AS "dbaName",
slug,
city,
state,

View File

@@ -0,0 +1,491 @@
/**
* Error Taxonomy Module
*
* Standardized error codes and classification for crawler reliability.
* All crawl results must use these codes for consistent error handling.
*
* Phase 1: Crawler Reliability & Stabilization
*/
// ============================================================
// ERROR CODES
// ============================================================
/**
* Standardized error codes for all crawl operations.
* These codes are stored in the database for analytics and debugging.
*/
export const CrawlErrorCode = {
  // Success states
  SUCCESS: 'SUCCESS',
  // Rate limiting
  RATE_LIMITED: 'RATE_LIMITED', // 429 responses
  // Proxy issues
  BLOCKED_PROXY: 'BLOCKED_PROXY', // 407 or proxy-related blocks
  PROXY_TIMEOUT: 'PROXY_TIMEOUT', // Proxy connection timeout
  // Content issues
  HTML_CHANGED: 'HTML_CHANGED', // Page structure changed
  NO_PRODUCTS: 'NO_PRODUCTS', // Empty response (valid but no data)
  PARSE_ERROR: 'PARSE_ERROR', // Failed to parse response
  // Network issues
  TIMEOUT: 'TIMEOUT', // Request timeout
  NETWORK_ERROR: 'NETWORK_ERROR', // Connection failed
  DNS_ERROR: 'DNS_ERROR', // DNS resolution failed
  // Authentication
  AUTH_FAILED: 'AUTH_FAILED', // Authentication/session issues
  // Server errors
  SERVER_ERROR: 'SERVER_ERROR', // 5xx responses
  SERVICE_UNAVAILABLE: 'SERVICE_UNAVAILABLE', // 503
  // Configuration issues
  INVALID_CONFIG: 'INVALID_CONFIG', // Bad store configuration
  MISSING_PLATFORM_ID: 'MISSING_PLATFORM_ID', // No platform_dispensary_id
  // Unknown
  UNKNOWN_ERROR: 'UNKNOWN_ERROR', // Catch-all for unclassified errors
} as const;
/** Union of all error-code string literals (e.g. 'RATE_LIMITED'). */
export type CrawlErrorCodeType = typeof CrawlErrorCode[keyof typeof CrawlErrorCode];
// ============================================================
// ERROR CLASSIFICATION
// ============================================================
/**
* Error metadata for each error code
*/
interface ErrorMetadata {
  code: CrawlErrorCodeType;
  // Whether a retry may succeed without operator intervention.
  retryable: boolean;
  // Whether to switch proxies before the next attempt.
  rotateProxy: boolean;
  // Whether to switch user agents before the next attempt.
  rotateUserAgent: boolean;
  // Factor applied to the base retry delay (0 = no retry delay applies).
  backoffMultiplier: number;
  severity: 'low' | 'medium' | 'high' | 'critical';
  // Human-readable explanation for dashboards/logs.
  description: string;
}
/**
* Metadata for each error code - defines retry behavior
*/
export const ERROR_METADATA: Record<CrawlErrorCodeType, ErrorMetadata> = {
  [CrawlErrorCode.SUCCESS]: {
    code: CrawlErrorCode.SUCCESS,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 0,
    severity: 'low',
    description: 'Crawl completed successfully',
  },
  // Transient blocks: retry with rotation and elevated backoff.
  [CrawlErrorCode.RATE_LIMITED]: {
    code: CrawlErrorCode.RATE_LIMITED,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: true,
    backoffMultiplier: 2.0,
    severity: 'medium',
    description: 'Rate limited by target (429)',
  },
  [CrawlErrorCode.BLOCKED_PROXY]: {
    code: CrawlErrorCode.BLOCKED_PROXY,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: true,
    backoffMultiplier: 1.5,
    severity: 'medium',
    description: 'Proxy blocked or rejected (407)',
  },
  [CrawlErrorCode.PROXY_TIMEOUT]: {
    code: CrawlErrorCode.PROXY_TIMEOUT,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'low',
    description: 'Proxy connection timed out',
  },
  // Content issues: HTML_CHANGED is the only non-retryable one here,
  // since it requires a selector/code update.
  [CrawlErrorCode.HTML_CHANGED]: {
    code: CrawlErrorCode.HTML_CHANGED,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'high',
    description: 'Page structure changed - needs selector update',
  },
  [CrawlErrorCode.NO_PRODUCTS]: {
    code: CrawlErrorCode.NO_PRODUCTS,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'low',
    description: 'No products returned (may be temporary)',
  },
  [CrawlErrorCode.PARSE_ERROR]: {
    code: CrawlErrorCode.PARSE_ERROR,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'medium',
    description: 'Failed to parse response data',
  },
  // Network-level failures: retry through a different proxy.
  [CrawlErrorCode.TIMEOUT]: {
    code: CrawlErrorCode.TIMEOUT,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1.5,
    severity: 'medium',
    description: 'Request timed out',
  },
  [CrawlErrorCode.NETWORK_ERROR]: {
    code: CrawlErrorCode.NETWORK_ERROR,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'medium',
    description: 'Network connection failed',
  },
  [CrawlErrorCode.DNS_ERROR]: {
    code: CrawlErrorCode.DNS_ERROR,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'medium',
    description: 'DNS resolution failed',
  },
  [CrawlErrorCode.AUTH_FAILED]: {
    code: CrawlErrorCode.AUTH_FAILED,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: true,
    backoffMultiplier: 2.0,
    severity: 'high',
    description: 'Authentication or session failed',
  },
  // Server-side errors: retry with backoff, no rotation.
  [CrawlErrorCode.SERVER_ERROR]: {
    code: CrawlErrorCode.SERVER_ERROR,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.5,
    severity: 'medium',
    description: 'Server error (5xx)',
  },
  [CrawlErrorCode.SERVICE_UNAVAILABLE]: {
    code: CrawlErrorCode.SERVICE_UNAVAILABLE,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 2.0,
    severity: 'high',
    description: 'Service temporarily unavailable (503)',
  },
  // Configuration errors: never retried; require operator action.
  [CrawlErrorCode.INVALID_CONFIG]: {
    code: CrawlErrorCode.INVALID_CONFIG,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 0,
    severity: 'critical',
    description: 'Invalid store configuration',
  },
  [CrawlErrorCode.MISSING_PLATFORM_ID]: {
    code: CrawlErrorCode.MISSING_PLATFORM_ID,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 0,
    severity: 'critical',
    description: 'Missing platform_dispensary_id',
  },
  [CrawlErrorCode.UNKNOWN_ERROR]: {
    code: CrawlErrorCode.UNKNOWN_ERROR,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'high',
    description: 'Unknown/unclassified error',
  },
};
// ============================================================
// ERROR CLASSIFICATION FUNCTIONS
// ============================================================
/**
* Classify an error into a standardized error code.
*
* @param error - The error to classify (Error object, string, or HTTP status)
* @param httpStatus - Optional HTTP status code
* @returns Standardized error code
*/
export function classifyError(
  error: Error | string | null,
  httpStatus?: number
): CrawlErrorCodeType {
  // HTTP status takes precedence over message pattern matching.
  if (httpStatus) {
    if (httpStatus === 429) return CrawlErrorCode.RATE_LIMITED;
    if (httpStatus === 407) return CrawlErrorCode.BLOCKED_PROXY;
    if (httpStatus === 401 || httpStatus === 403) return CrawlErrorCode.AUTH_FAILED;
    if (httpStatus === 503) return CrawlErrorCode.SERVICE_UNAVAILABLE;
    if (httpStatus >= 500) return CrawlErrorCode.SERVER_ERROR;
  }
  if (!error) return CrawlErrorCode.UNKNOWN_ERROR;
  const text = (typeof error === 'string' ? error : error.message).toLowerCase();
  // True when the lowercased message contains any of the given substrings.
  const has = (...needles: string[]): boolean => needles.some(n => text.includes(n));
  // Pattern checks run in priority order; first match wins.
  if (has('rate limit', 'too many requests', '429')) {
    return CrawlErrorCode.RATE_LIMITED;
  }
  if (has('proxy') && has('block', 'reject', '407')) {
    return CrawlErrorCode.BLOCKED_PROXY;
  }
  if (has('timeout', 'timed out', 'etimedout')) {
    return has('proxy') ? CrawlErrorCode.PROXY_TIMEOUT : CrawlErrorCode.TIMEOUT;
  }
  if (has('econnrefused', 'econnreset', 'network')) {
    return CrawlErrorCode.NETWORK_ERROR;
  }
  if (has('enotfound', 'dns', 'getaddrinfo')) {
    return CrawlErrorCode.DNS_ERROR;
  }
  if (has('auth', 'unauthorized', 'forbidden', '401', '403')) {
    return CrawlErrorCode.AUTH_FAILED;
  }
  if (has('selector', 'element not found', 'structure changed')) {
    return CrawlErrorCode.HTML_CHANGED;
  }
  if (has('parse', 'json', 'syntax')) {
    return CrawlErrorCode.PARSE_ERROR;
  }
  if (has('no products', 'empty', '0 products')) {
    return CrawlErrorCode.NO_PRODUCTS;
  }
  if (has('500', '502', '503', '504')) {
    return CrawlErrorCode.SERVER_ERROR;
  }
  if (has('config', 'invalid', 'missing')) {
    return has('platform', 'dispensary_id')
      ? CrawlErrorCode.MISSING_PLATFORM_ID
      : CrawlErrorCode.INVALID_CONFIG;
  }
  return CrawlErrorCode.UNKNOWN_ERROR;
}
/**
 * Look up the metadata record for a crawl error code.
 *
 * Always returns a valid record: unknown codes fall back to the
 * UNKNOWN_ERROR entry so callers never have to null-check.
 */
export function getErrorMetadata(code: CrawlErrorCodeType): ErrorMetadata {
  const entry = ERROR_METADATA[code];
  return entry !== undefined ? entry : ERROR_METADATA[CrawlErrorCode.UNKNOWN_ERROR];
}
/**
 * Whether a crawl that failed with this error code may be retried.
 */
export function isRetryable(code: CrawlErrorCodeType): boolean {
  const { retryable } = getErrorMetadata(code);
  return retryable;
}
/**
 * Whether the proxy should be swapped before retrying after this error.
 */
export function shouldRotateProxy(code: CrawlErrorCodeType): boolean {
  const { rotateProxy } = getErrorMetadata(code);
  return rotateProxy;
}
/**
 * Whether the user agent should be swapped before retrying after this error.
 */
export function shouldRotateUserAgent(code: CrawlErrorCodeType): boolean {
  const { rotateUserAgent } = getErrorMetadata(code);
  return rotateUserAgent;
}
/**
 * Backoff multiplier to apply to the retry delay for this error code.
 */
export function getBackoffMultiplier(code: CrawlErrorCodeType): number {
  const { backoffMultiplier } = getErrorMetadata(code);
  return backoffMultiplier;
}
// ============================================================
// CRAWL RESULT TYPE
// ============================================================
/**
 * Standardized crawl result with error taxonomy
 *
 * Produced by createSuccessResult()/createFailureResult(); consumed by
 * formatErrorForLog() and retry logic via the errorCode field.
 */
export interface CrawlResult {
  /** True when the crawl completed without a classified error. */
  success: boolean;
  /** Internal dispensary row ID this crawl targeted. */
  dispensaryId: number;
  // Error info
  /** Taxonomy code; CrawlErrorCode.SUCCESS on a successful crawl. */
  errorCode: CrawlErrorCodeType;
  /** Raw error message (absent on success). */
  errorMessage?: string;
  /** Upstream HTTP status, when a response was received. */
  httpStatus?: number;
  // Timing
  startedAt: Date;
  finishedAt: Date;
  /** finishedAt - startedAt, in milliseconds. */
  durationMs: number;
  // Context
  /** 1-based attempt counter (defaults to 1 when not supplied). */
  attemptNumber: number;
  proxyUsed?: string;
  userAgentUsed?: string;
  // Metrics (on success)
  productsFound?: number;
  productsUpserted?: number;
  snapshotsCreated?: number;
  imagesDownloaded?: number;
  // Metadata
  /** Free-form extra context attached by the caller. */
  metadata?: Record<string, any>;
}
/**
 * Build a CrawlResult describing a successful crawl.
 *
 * durationMs is computed from `startedAt` and the moment this function runs.
 *
 * @param dispensaryId Internal dispensary row ID.
 * @param startedAt    When the crawl began.
 * @param metrics      Counters gathered during the crawl; spread onto the result.
 * @param context      Optional attempt/proxy/user-agent details.
 */
export function createSuccessResult(
  dispensaryId: number,
  startedAt: Date,
  metrics: {
    productsFound: number;
    productsUpserted: number;
    snapshotsCreated: number;
    imagesDownloaded?: number;
  },
  context?: {
    attemptNumber?: number;
    proxyUsed?: string;
    userAgentUsed?: string;
  }
): CrawlResult {
  const now = new Date();
  const base: CrawlResult = {
    success: true,
    dispensaryId,
    errorCode: CrawlErrorCode.SUCCESS,
    startedAt,
    finishedAt: now,
    durationMs: now.getTime() - startedAt.getTime(),
    attemptNumber: context?.attemptNumber || 1,
    proxyUsed: context?.proxyUsed,
    userAgentUsed: context?.userAgentUsed,
  };
  // Metrics last, matching the original spread order.
  return { ...base, ...metrics };
}
/**
 * Build a CrawlResult describing a failed crawl.
 *
 * The error is classified into the taxonomy via classifyError(), using the
 * HTTP status (when provided) ahead of message-pattern matching.
 *
 * @param dispensaryId Internal dispensary row ID.
 * @param startedAt    When the crawl began.
 * @param error        The thrown Error or a plain message string.
 * @param httpStatus   Upstream HTTP status, if a response was received.
 * @param context      Optional attempt/proxy/user-agent details.
 */
export function createFailureResult(
  dispensaryId: number,
  startedAt: Date,
  error: Error | string,
  httpStatus?: number,
  context?: {
    attemptNumber?: number;
    proxyUsed?: string;
    userAgentUsed?: string;
  }
): CrawlResult {
  const now = new Date();
  const message = typeof error === 'string' ? error : error.message;
  return {
    success: false,
    dispensaryId,
    errorCode: classifyError(error, httpStatus),
    errorMessage: message,
    httpStatus,
    startedAt,
    finishedAt: now,
    durationMs: now.getTime() - startedAt.getTime(),
    attemptNumber: context?.attemptNumber || 1,
    proxyUsed: context?.proxyUsed,
    userAgentUsed: context?.userAgentUsed,
  };
}
// ============================================================
// LOGGING HELPERS
// ============================================================
/**
 * Render a CrawlResult as a one-line log message.
 *
 * Success:  "[CODE] Crawl successful: N products via proxy"
 * Failure:  "[CODE] message via proxy (retryable|non-retryable)"
 */
export function formatErrorForLog(result: CrawlResult): string {
  const proxySuffix = result.proxyUsed ? ` via ${result.proxyUsed}` : '';
  if (result.success) {
    return `[${result.errorCode}] Crawl successful: ${result.productsFound} products${proxySuffix}`;
  }
  const retryTag = getErrorMetadata(result.errorCode).retryable
    ? '(retryable)'
    : '(non-retryable)';
  return `[${result.errorCode}] ${result.errorMessage}${proxySuffix} ${retryTag}`;
}
/**
 * Human-readable description of an error code, for UI/API responses.
 */
export function getErrorDescription(code: CrawlErrorCodeType): string {
  const { description } = getErrorMetadata(code);
  return description;
}

View File

@@ -8,6 +8,10 @@
import { query, getClient } from '../db/connection';
import { v4 as uuidv4 } from 'uuid';
import * as os from 'os';
import { DEFAULT_CONFIG } from './store-validator';
// Minimum gap between crawls for the same dispensary (in minutes)
const MIN_CRAWL_GAP_MINUTES = DEFAULT_CONFIG.minCrawlGapMinutes; // 2 minutes
// ============================================================
// TYPES
@@ -97,11 +101,30 @@ export function getWorkerHostname(): string {
// JOB ENQUEUEING
// ============================================================
/** Outcome of a single enqueue attempt, including why it was skipped (if it was). */
export interface EnqueueResult {
  /** Newly created job ID, or null when the enqueue was skipped or rejected. */
  jobId: number | null;
  /** True when no job row was inserted. */
  skipped: boolean;
  /** Why the job was skipped; absent when a job was created. */
  reason?: 'already_queued' | 'too_soon' | 'error';
  /** Human-readable detail suitable for logs or API responses. */
  message?: string;
}
/**
 * Enqueue a new job for processing.
 *
 * Thin wrapper around enqueueJobWithReason() that discards the skip
 * details and returns only the job ID — null when a pending/running job
 * already exists for this dispensary or a job ran within the minimum gap.
 */
export async function enqueueJob(options: EnqueueJobOptions): Promise<number | null> {
  return (await enqueueJobWithReason(options)).jobId;
}
/**
* Enqueue a new job with detailed result info
* Enforces:
* 1. No duplicate pending/running jobs for same dispensary
* 2. Minimum 2-minute gap between crawls for same dispensary
*/
export async function enqueueJobWithReason(options: EnqueueJobOptions): Promise<EnqueueResult> {
const {
jobType,
dispensaryId,
@@ -121,10 +144,43 @@ export async function enqueueJob(options: EnqueueJobOptions): Promise<number | n
if (existing.length > 0) {
console.log(`[JobQueue] Skipping enqueue - job already exists for dispensary ${dispensaryId}`);
return null;
return {
jobId: null,
skipped: true,
reason: 'already_queued',
message: `Job already pending/running for dispensary ${dispensaryId}`,
};
}
// Check minimum gap since last job (2 minutes)
const { rows: recent } = await query<any>(
`SELECT id, created_at, status
FROM dispensary_crawl_jobs
WHERE dispensary_id = $1
ORDER BY created_at DESC
LIMIT 1`,
[dispensaryId]
);
if (recent.length > 0) {
const lastJobTime = new Date(recent[0].created_at);
const minGapMs = MIN_CRAWL_GAP_MINUTES * 60 * 1000;
const timeSinceLastJob = Date.now() - lastJobTime.getTime();
if (timeSinceLastJob < minGapMs) {
const waitSeconds = Math.ceil((minGapMs - timeSinceLastJob) / 1000);
console.log(`[JobQueue] Skipping enqueue - minimum ${MIN_CRAWL_GAP_MINUTES}min gap not met for dispensary ${dispensaryId}. Wait ${waitSeconds}s`);
return {
jobId: null,
skipped: true,
reason: 'too_soon',
message: `Minimum ${MIN_CRAWL_GAP_MINUTES}-minute gap required. Try again in ${waitSeconds} seconds.`,
};
}
}
}
try {
const { rows } = await query<any>(
`INSERT INTO dispensary_crawl_jobs (job_type, dispensary_id, status, priority, max_retries, metadata, created_at)
VALUES ($1, $2, 'pending', $3, $4, $5, NOW())
@@ -134,18 +190,41 @@ export async function enqueueJob(options: EnqueueJobOptions): Promise<number | n
const jobId = rows[0].id;
console.log(`[JobQueue] Enqueued job ${jobId} (type=${jobType}, dispensary=${dispensaryId})`);
return jobId;
return { jobId, skipped: false };
} catch (error: any) {
// Handle database trigger rejection for minimum gap
if (error.message?.includes('Minimum') && error.message?.includes('gap')) {
console.log(`[JobQueue] DB rejected - minimum gap not met for dispensary ${dispensaryId}`);
return {
jobId: null,
skipped: true,
reason: 'too_soon',
message: error.message,
};
}
throw error;
}
}
/** Summary of a bulk enqueue run across many dispensaries. */
export interface BulkEnqueueResult {
  /** Number of job rows actually inserted. */
  enqueued: number;
  /** Total dispensaries skipped (alreadyQueued + tooSoon). */
  skipped: number;
  /** Breakdown of why dispensaries were skipped. */
  skippedReasons: {
    /** A pending/running job already existed for the dispensary. */
    alreadyQueued: number;
    /** A job was created within the minimum crawl gap window. */
    tooSoon: number;
  };
}
/**
* Bulk enqueue jobs for multiple dispensaries
* Skips dispensaries that already have pending/running jobs
* or have jobs within the minimum gap period
*/
export async function bulkEnqueueJobs(
jobType: string,
dispensaryIds: number[],
options: { priority?: number; metadata?: Record<string, any> } = {}
): Promise<{ enqueued: number; skipped: number }> {
): Promise<BulkEnqueueResult> {
const { priority = 0, metadata } = options;
// Get dispensaries that already have pending/running jobs
@@ -156,11 +235,31 @@ export async function bulkEnqueueJobs(
);
const existingSet = new Set(existing.map((r: any) => r.dispensary_id));
// Filter out dispensaries with existing jobs
const toEnqueue = dispensaryIds.filter(id => !existingSet.has(id));
// Get dispensaries that have recent jobs within minimum gap
const { rows: recent } = await query<any>(
`SELECT DISTINCT dispensary_id FROM dispensary_crawl_jobs
WHERE dispensary_id = ANY($1)
AND created_at > NOW() - ($2 || ' minutes')::INTERVAL
AND dispensary_id NOT IN (
SELECT dispensary_id FROM dispensary_crawl_jobs
WHERE dispensary_id = ANY($1) AND status IN ('pending', 'running')
)`,
[dispensaryIds, MIN_CRAWL_GAP_MINUTES]
);
const recentSet = new Set(recent.map((r: any) => r.dispensary_id));
// Filter out dispensaries with existing or recent jobs
const toEnqueue = dispensaryIds.filter(id => !existingSet.has(id) && !recentSet.has(id));
if (toEnqueue.length === 0) {
return { enqueued: 0, skipped: dispensaryIds.length };
return {
enqueued: 0,
skipped: dispensaryIds.length,
skippedReasons: {
alreadyQueued: existingSet.size,
tooSoon: recentSet.size,
},
};
}
// Bulk insert - each row needs 4 params: job_type, dispensary_id, priority, metadata
@@ -181,8 +280,15 @@ export async function bulkEnqueueJobs(
params
);
console.log(`[JobQueue] Bulk enqueued ${toEnqueue.length} jobs, skipped ${existingSet.size}`);
return { enqueued: toEnqueue.length, skipped: existingSet.size };
console.log(`[JobQueue] Bulk enqueued ${toEnqueue.length} jobs, skipped ${existingSet.size} (queued) + ${recentSet.size} (recent)`);
return {
enqueued: toEnqueue.length,
skipped: existingSet.size + recentSet.size,
skippedReasons: {
alreadyQueued: existingSet.size,
tooSoon: recentSet.size,
},
};
}
// ============================================================
@@ -311,22 +417,48 @@ export async function heartbeat(jobId: number): Promise<void> {
/**
* Mark job as completed
*
* Stores visibility tracking stats (visibilityLostCount, visibilityRestoredCount)
* in the metadata JSONB column for dashboard analytics.
*/
export async function completeJob(
jobId: number,
result: { productsFound?: number; productsUpserted?: number; snapshotsCreated?: number }
result: {
productsFound?: number;
productsUpserted?: number;
snapshotsCreated?: number;
visibilityLostCount?: number;
visibilityRestoredCount?: number;
}
): Promise<void> {
// Build metadata with visibility stats if provided
const metadata: Record<string, any> = {};
if (result.visibilityLostCount !== undefined) {
metadata.visibilityLostCount = result.visibilityLostCount;
}
if (result.visibilityRestoredCount !== undefined) {
metadata.visibilityRestoredCount = result.visibilityRestoredCount;
}
if (result.snapshotsCreated !== undefined) {
metadata.snapshotsCreated = result.snapshotsCreated;
}
await query(
`UPDATE dispensary_crawl_jobs
SET
status = 'completed',
completed_at = NOW(),
products_found = COALESCE($2, products_found),
products_upserted = COALESCE($3, products_upserted),
snapshots_created = COALESCE($4, snapshots_created),
products_updated = COALESCE($3, products_updated),
metadata = COALESCE(metadata, '{}'::jsonb) || $4::jsonb,
updated_at = NOW()
WHERE id = $1`,
[jobId, result.productsFound, result.productsUpserted, result.snapshotsCreated]
[
jobId,
result.productsFound,
result.productsUpserted,
JSON.stringify(metadata),
]
);
console.log(`[JobQueue] Job ${jobId} completed`);
}

View File

@@ -16,12 +16,8 @@ import { extractCNameFromMenuUrl, extractFromMenuUrl, mapDbRowToDispensary } fro
import { resolveDispensaryId } from './graphql-client';
import { Dispensary, JobStatus } from '../types';
// Explicit column list for dispensaries table (avoids SELECT * issues with schema differences)
const DISPENSARY_COLUMNS = `
id, name, slug, city, state, zip, address, latitude, longitude,
menu_type, menu_url, platform_dispensary_id, website,
provider_detection_data, created_at, updated_at
`;
// Use shared dispensary columns (handles optional columns like provider_detection_data)
import { DISPENSARY_COLUMNS } from '../db/dispensary-columns';
// ============================================================
// TYPES
@@ -647,6 +643,9 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
`
UPDATE dispensaries SET
menu_type = 'dutchie',
last_id_resolution_at = NOW(),
id_resolution_attempts = COALESCE(id_resolution_attempts, 0) + 1,
id_resolution_error = $1,
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', 'dutchie'::text,
@@ -660,7 +659,7 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
`,
[result.error, dispensaryId]
);
console.log(`[MenuDetection] ${dispensary.name}: ${result.error}`);
console.log(`[Henry - Entry Point Finder] ${dispensary.name}: ${result.error}`);
return result;
}
@@ -675,6 +674,9 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
UPDATE dispensaries SET
menu_type = 'dutchie',
platform_dispensary_id = $1,
last_id_resolution_at = NOW(),
id_resolution_attempts = COALESCE(id_resolution_attempts, 0) + 1,
id_resolution_error = NULL,
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', 'dutchie'::text,
@@ -691,7 +693,7 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
`,
[platformId, dispensaryId]
);
console.log(`[MenuDetection] ${dispensary.name}: Platform ID extracted directly from URL = ${platformId}`);
console.log(`[Henry - Entry Point Finder] ${dispensary.name}: Platform ID extracted directly from URL = ${platformId}`);
return result;
}
@@ -714,6 +716,9 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
UPDATE dispensaries SET
menu_type = 'dutchie',
platform_dispensary_id = $1,
last_id_resolution_at = NOW(),
id_resolution_attempts = COALESCE(id_resolution_attempts, 0) + 1,
id_resolution_error = NULL,
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', 'dutchie'::text,
@@ -730,10 +735,10 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
`,
[platformId, cName, dispensaryId]
);
console.log(`[MenuDetection] ${dispensary.name}: Resolved platform ID = ${platformId}`);
console.log(`[Henry - Entry Point Finder] ${dispensary.name}: Resolved platform ID = ${platformId}`);
} else {
// cName resolution failed - try crawling website as fallback
console.log(`[MenuDetection] ${dispensary.name}: cName "${cName}" not found on Dutchie, trying website crawl fallback...`);
console.log(`[Henry - Entry Point Finder] ${dispensary.name}: cName "${cName}" not found on Dutchie, trying website crawl fallback...`);
if (website && website.trim() !== '') {
const fallbackCrawl = await crawlWebsiteForMenuLinks(website);
@@ -796,6 +801,9 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
UPDATE dispensaries SET
menu_type = 'dutchie',
platform_dispensary_id = NULL,
last_id_resolution_at = NOW(),
id_resolution_attempts = COALESCE(id_resolution_attempts, 0) + 1,
id_resolution_error = $2,
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', 'dutchie'::text,
@@ -812,7 +820,7 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
`,
[cName, result.error, dispensaryId]
);
console.log(`[MenuDetection] ${dispensary.name}: ${result.error}`);
console.log(`[Henry - Entry Point Finder] ${dispensary.name}: ${result.error}`);
}
} catch (error: any) {
result.error = `Resolution failed: ${error.message}`;
@@ -820,6 +828,9 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
`
UPDATE dispensaries SET
menu_type = 'dutchie',
last_id_resolution_at = NOW(),
id_resolution_attempts = COALESCE(id_resolution_attempts, 0) + 1,
id_resolution_error = $2,
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', 'dutchie'::text,
@@ -835,7 +846,7 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
`,
[cName, result.error, dispensaryId]
);
console.error(`[MenuDetection] ${dispensary.name}: ${result.error}`);
console.error(`[Henry - Entry Point Finder] ${dispensary.name}: ${result.error}`);
}
return result;
@@ -844,6 +855,11 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
/**
* Run bulk detection on all dispensaries with unknown/missing menu_type or platform_dispensary_id
* Also includes dispensaries with no menu_url but with a website (for website crawl discovery)
*
* Enhanced for Henry (Entry Point Finder) to also process:
* - Stores with slug changes that need re-resolution
* - Recently added stores from Alice's discovery
* - Stores that failed resolution and need retry
*/
export async function runBulkDetection(options: {
state?: string;
@@ -851,6 +867,9 @@ export async function runBulkDetection(options: {
onlyMissingPlatformId?: boolean;
includeWebsiteCrawl?: boolean; // Include dispensaries with website but no menu_url
includeDutchieMissingPlatformId?: boolean; // include menu_type='dutchie' with null platform_id
includeSlugChanges?: boolean; // Include stores where Alice detected slug changes
includeRecentlyAdded?: boolean; // Include stores recently added by Alice
scope?: { states?: string[]; storeIds?: number[] }; // Scope filtering for sharding
limit?: number;
} = {}): Promise<BulkDetectionResult> {
const {
@@ -859,14 +878,23 @@ export async function runBulkDetection(options: {
onlyMissingPlatformId = false,
includeWebsiteCrawl = true,
includeDutchieMissingPlatformId = true,
includeSlugChanges = true,
includeRecentlyAdded = true,
scope,
limit,
} = options;
console.log('[MenuDetection] Starting bulk detection...');
const scopeDesc = scope?.states?.length
? ` (states: ${scope.states.join(', ')})`
: scope?.storeIds?.length
? ` (${scope.storeIds.length} specific stores)`
: state ? ` (state: ${state})` : '';
console.log(`[Henry - Entry Point Finder] Starting bulk detection${scopeDesc}...`);
// Build query to find dispensaries needing detection
// Includes: dispensaries with menu_url OR (no menu_url but has website and not already marked not_crawlable)
// Optionally includes dutchie stores missing platform ID
// Optionally includes dutchie stores missing platform ID, slug changes, and recently added stores
let whereClause = `WHERE (
menu_url IS NOT NULL
${includeWebsiteCrawl ? `OR (
@@ -882,7 +910,14 @@ export async function runBulkDetection(options: {
const params: any[] = [];
let paramIndex = 1;
if (state) {
// Apply scope filtering (takes precedence over single state filter)
if (scope?.storeIds?.length) {
whereClause += ` AND id = ANY($${paramIndex++})`;
params.push(scope.storeIds);
} else if (scope?.states?.length) {
whereClause += ` AND state = ANY($${paramIndex++})`;
params.push(scope.states);
} else if (state) {
whereClause += ` AND state = $${paramIndex++}`;
params.push(state);
}
@@ -962,6 +997,19 @@ export async function runBulkDetection(options: {
/**
* Execute the menu detection job (called by scheduler)
*
* Worker: Henry (Entry Point Finder)
* Uses METHOD 1 (reactEnv extraction) as primary method per user requirements.
*
* Scope filtering:
* - config.scope.states: Array of state codes to limit detection (e.g., ["AZ", "CA"])
* - config.scope.storeIds: Array of specific store IDs to process
*
* Processes:
* - Stores with unknown/missing menu_type
* - Stores with missing platform_dispensary_id
* - Stores with slug changes that need re-resolution (from Alice)
* - Recently added stores (discovered by Alice)
*/
export async function executeMenuDetectionJob(config: Record<string, any> = {}): Promise<{
status: JobStatus;
@@ -972,19 +1020,31 @@ export async function executeMenuDetectionJob(config: Record<string, any> = {}):
metadata?: any;
}> {
const state = config.state || 'AZ';
const scope = config.scope as { states?: string[]; storeIds?: number[] } | undefined;
const onlyUnknown = config.onlyUnknown !== false;
// Default to true - always try to resolve platform IDs for dutchie stores
const onlyMissingPlatformId = config.onlyMissingPlatformId !== false;
const includeDutchieMissingPlatformId = config.includeDutchieMissingPlatformId !== false;
const includeSlugChanges = config.includeSlugChanges !== false;
const includeRecentlyAdded = config.includeRecentlyAdded !== false;
console.log(`[MenuDetection] Executing scheduled job for state=${state}...`);
const scopeDesc = scope?.states?.length
? ` (states: ${scope.states.join(', ')})`
: scope?.storeIds?.length
? ` (${scope.storeIds.length} specific stores)`
: ` (state: ${state})`;
console.log(`[Henry - Entry Point Finder] Executing scheduled job${scopeDesc}...`);
try {
const result = await runBulkDetection({
state,
state: scope ? undefined : state, // Use scope if provided, otherwise fall back to state
scope,
onlyUnknown,
onlyMissingPlatformId,
includeDutchieMissingPlatformId,
includeSlugChanges,
includeRecentlyAdded,
});
const status: JobStatus =
@@ -998,9 +1058,11 @@ export async function executeMenuDetectionJob(config: Record<string, any> = {}):
itemsFailed: result.totalFailed,
errorMessage: result.errors.length > 0 ? result.errors.slice(0, 5).join('; ') : undefined,
metadata: {
state,
scope: scope || { states: [state] },
onlyUnknown,
onlyMissingPlatformId,
includeSlugChanges,
includeRecentlyAdded,
providerCounts: countByProvider(result.results),
},
};
@@ -1011,6 +1073,7 @@ export async function executeMenuDetectionJob(config: Record<string, any> = {}):
itemsSucceeded: 0,
itemsFailed: 0,
errorMessage: error.message,
metadata: { scope: scope || { states: [state] } },
};
}
}

View File

@@ -24,12 +24,8 @@ import {
} from '../types';
import { downloadProductImage, imageExists } from '../../utils/image-storage';
// Explicit column list for dispensaries table (avoids SELECT * issues with schema differences)
const DISPENSARY_COLUMNS = `
id, name, slug, city, state, zip, address, latitude, longitude,
menu_type, menu_url, platform_dispensary_id, website,
provider_detection_data, created_at, updated_at
`;
// Use shared dispensary columns (handles optional columns like provider_detection_data)
import { DISPENSARY_COLUMNS } from '../db/dispensary-columns';
// ============================================================
// BATCH PROCESSING CONFIGURATION
@@ -648,10 +644,15 @@ async function updateDispensaryCrawlStats(
}
/**
* Mark products as missing from feed
* Mark products as missing from feed (visibility-loss detection)
* Creates a snapshot with isPresentInFeed=false and stockStatus='missing_from_feed'
* for products that were NOT in the UNION of Mode A and Mode B product lists
*
* Bella (Product Sync) visibility tracking:
* - Sets visibility_lost=TRUE and visibility_lost_at=NOW() for disappearing products
* - Records visibility event in snapshot metadata JSONB
* - NEVER deletes products, just marks them as visibility-lost
*
* IMPORTANT: Uses UNION of both modes to avoid false positives
* If the union is empty (possible outage), we skip marking to avoid data corruption
*/
@@ -660,25 +661,28 @@ async function markMissingProducts(
platformDispensaryId: string,
modeAProductIds: Set<string>,
modeBProductIds: Set<string>,
pricingType: 'rec' | 'med'
): Promise<number> {
pricingType: 'rec' | 'med',
workerName: string = 'Bella'
): Promise<{ markedMissing: number; newlyLost: number }> {
// Build UNION of Mode A + Mode B product IDs
const unionProductIds = new Set<string>([...Array.from(modeAProductIds), ...Array.from(modeBProductIds)]);
// OUTAGE DETECTION: If union is empty, something went wrong - don't mark anything as missing
if (unionProductIds.size === 0) {
console.warn('[ProductCrawler] OUTAGE DETECTED: Both Mode A and Mode B returned 0 products. Skipping missing product marking.');
return 0;
console.warn(`[${workerName} - Product Sync] OUTAGE DETECTED: Both Mode A and Mode B returned 0 products. Skipping visibility-loss marking.`);
return { markedMissing: 0, newlyLost: 0 };
}
// Get all existing products for this dispensary that were not in the UNION
// Also check if they were already marked as visibility_lost to track new losses
const { rows: missingProducts } = await query<{
id: number;
external_product_id: string;
name: string;
visibility_lost: boolean;
}>(
`
SELECT id, external_product_id, name
SELECT id, external_product_id, name, COALESCE(visibility_lost, FALSE) as visibility_lost
FROM dutchie_products
WHERE dispensary_id = $1
AND external_product_id NOT IN (SELECT unnest($2::text[]))
@@ -687,21 +691,27 @@ async function markMissingProducts(
);
if (missingProducts.length === 0) {
return 0;
return { markedMissing: 0, newlyLost: 0 };
}
console.log(`[ProductCrawler] Marking ${missingProducts.length} products as missing from feed (union of ${modeAProductIds.size} Mode A + ${modeBProductIds.size} Mode B = ${unionProductIds.size} unique)...`);
// Separate newly lost products from already-lost products
const newlyLostProducts = missingProducts.filter(p => !p.visibility_lost);
const alreadyLostProducts = missingProducts.filter(p => p.visibility_lost);
console.log(`[${workerName} - Product Sync] Visibility check: ${missingProducts.length} products missing (${newlyLostProducts.length} newly lost, ${alreadyLostProducts.length} already lost)`);
const crawledAt = new Date();
// Build all missing snapshots first (per CLAUDE.md Rule #15 - batch writes)
const missingSnapshots: Partial<DutchieProductSnapshot>[] = missingProducts.map(product => ({
// Build all missing snapshots with visibility_events metadata
const missingSnapshots: Partial<DutchieProductSnapshot>[] = missingProducts.map(product => {
const isNewlyLost = !product.visibility_lost;
return {
dutchieProductId: product.id,
dispensaryId,
platformDispensaryId,
externalProductId: product.external_product_id,
pricingType,
crawlMode: 'mode_a' as CrawlMode, // Use mode_a for missing snapshots (convention)
crawlMode: 'mode_a' as CrawlMode,
status: undefined,
featured: false,
special: false,
@@ -709,37 +719,113 @@ async function markMissingProducts(
recOnly: false,
isPresentInFeed: false,
stockStatus: 'missing_from_feed' as StockStatus,
totalQuantityAvailable: undefined, // null = unknown, not 0
totalQuantityAvailable: undefined,
manualInventory: false,
isBelowThreshold: false,
isBelowKioskThreshold: false,
options: [],
rawPayload: { _missingFromFeed: true, lastKnownName: product.name },
rawPayload: {
_missingFromFeed: true,
lastKnownName: product.name,
visibility_events: isNewlyLost ? [{
event_type: 'visibility_lost',
timestamp: crawledAt.toISOString(),
worker_name: workerName,
}] : [],
},
crawledAt,
}));
};
});
// Batch insert missing snapshots
const snapshotsInserted = await batchInsertSnapshots(missingSnapshots);
// Batch update product stock status in chunks
// Batch update product visibility status in chunks
const productIds = missingProducts.map(p => p.id);
const productChunks = chunkArray(productIds, BATCH_CHUNK_SIZE);
console.log(`[ProductCrawler] Updating ${productIds.length} product statuses in ${productChunks.length} chunks...`);
console.log(`[${workerName} - Product Sync] Updating ${productIds.length} product visibility in ${productChunks.length} chunks...`);
for (const chunk of productChunks) {
// Update all products: set stock_status to missing
// Only set visibility_lost_at for NEWLY lost products (not already lost)
await query(
`
UPDATE dutchie_products
SET stock_status = 'missing_from_feed', total_quantity_available = NULL, updated_at = NOW()
SET
stock_status = 'missing_from_feed',
total_quantity_available = NULL,
visibility_lost = TRUE,
visibility_lost_at = CASE
WHEN visibility_lost IS NULL OR visibility_lost = FALSE THEN NOW()
ELSE visibility_lost_at -- Keep existing timestamp for already-lost products
END,
updated_at = NOW()
WHERE id = ANY($1::int[])
`,
[chunk]
);
}
console.log(`[ProductCrawler] Marked ${snapshotsInserted} products as missing from feed`);
return snapshotsInserted;
console.log(`[${workerName} - Product Sync] Marked ${snapshotsInserted} products as missing, ${newlyLostProducts.length} newly visibility-lost`);
return { markedMissing: snapshotsInserted, newlyLost: newlyLostProducts.length };
}
/**
 * Restore visibility for products that reappeared in the feed
 * Called when products that were previously visibility_lost=TRUE are now found in the feed
 *
 * Bella (Product Sync) visibility tracking:
 * - Sets visibility_lost=FALSE and visibility_restored_at=NOW()
 * - Logs the restoration event
 *
 * @returns Number of products whose visibility was restored.
 */
async function restoreVisibilityForProducts(
  dispensaryId: number,
  productIds: Set<string>,
  workerName: string = 'Bella'
): Promise<number> {
  // An empty feed means there is nothing to restore.
  if (productIds.size === 0) {
    return 0;
  }

  // Locate previously-lost products that are present in this crawl's feed.
  const { rows: reappeared } = await query<{ id: number; external_product_id: string }>(
    `
    SELECT id, external_product_id
    FROM dutchie_products
    WHERE dispensary_id = $1
    AND visibility_lost = TRUE
    AND external_product_id = ANY($2::text[])
    `,
    [dispensaryId, Array.from(productIds)]
  );
  if (reappeared.length === 0) {
    return 0;
  }

  console.log(`[${workerName} - Product Sync] Restoring visibility for ${reappeared.length} products that reappeared`);

  // Clear the lost flag in bounded chunks to keep each UPDATE statement small.
  const idChunks = chunkArray(reappeared.map(p => p.id), BATCH_CHUNK_SIZE);
  for (const idChunk of idChunks) {
    await query(
      `
      UPDATE dutchie_products
      SET
        visibility_lost = FALSE,
        visibility_restored_at = NOW(),
        updated_at = NOW()
      WHERE id = ANY($1::int[])
      `,
      [idChunk]
    );
  }

  console.log(`[${workerName} - Product Sync] Restored visibility for ${reappeared.length} products`);
  return reappeared.length;
}
// ============================================================
@@ -756,9 +842,12 @@ export interface CrawlResult {
modeAProducts?: number;
modeBProducts?: number;
missingProductsMarked?: number;
visibilityLostCount?: number; // Products newly marked as visibility_lost
visibilityRestoredCount?: number; // Products restored from visibility_lost
imagesDownloaded?: number;
imageErrors?: number;
errorMessage?: string;
httpStatus?: number; // HTTP status code for error classification
durationMs: number;
}
@@ -1005,21 +1094,38 @@ export async function crawlDispensaryProducts(
}
}
// Build union of all product IDs found in both modes
const allFoundProductIds = new Set<string>([
...Array.from(modeAProductIds),
...Array.from(modeBProductIds),
]);
// VISIBILITY RESTORATION: Check if any previously-lost products have reappeared
const visibilityRestored = await restoreVisibilityForProducts(
dispensary.id,
allFoundProductIds,
'Bella'
);
// Mark products as missing using UNION of Mode A + Mode B
// The function handles outage detection (empty union = skip marking)
missingMarked = await markMissingProducts(
// Now also tracks newly lost products vs already-lost products
const missingResult = await markMissingProducts(
dispensary.id,
dispensary.platformDispensaryId,
modeAProductIds,
modeBProductIds,
pricingType
pricingType,
'Bella'
);
missingMarked = missingResult.markedMissing;
const newlyLostCount = missingResult.newlyLost;
totalSnapshots += missingMarked;
// Update dispensary stats
await updateDispensaryCrawlStats(dispensary.id, totalUpserted);
console.log(`[ProductCrawler] Completed: ${totalUpserted} products, ${totalSnapshots} snapshots, ${missingMarked} marked missing, ${totalImagesDownloaded} images downloaded`);
console.log(`[Bella - Product Sync] Completed: ${totalUpserted} products, ${totalSnapshots} snapshots, ${missingMarked} missing, ${newlyLostCount} newly lost, ${visibilityRestored} restored, ${totalImagesDownloaded} images`);
const totalProductsFound = modeAProducts + modeBProducts;
return {
@@ -1032,6 +1138,8 @@ export async function crawlDispensaryProducts(
modeAProducts,
modeBProducts,
missingProductsMarked: missingMarked,
visibilityLostCount: newlyLostCount,
visibilityRestoredCount: visibilityRestored,
imagesDownloaded: totalImagesDownloaded,
imageErrors: totalImageErrors,
durationMs: Date.now() - startTime,

View File

@@ -0,0 +1,455 @@
/**
* Proxy & User Agent Rotator
*
* Manages rotation of proxies and user agents to avoid blocks.
* Integrates with error taxonomy for intelligent rotation decisions.
*
* Phase 1: Crawler Reliability & Stabilization
*/
import { Pool } from 'pg';
// ============================================================
// USER AGENT CONFIGURATION
// ============================================================
/**
 * Modern browser user agents (Chrome, Firefox, Safari, Edge on various platforms)
 * Updated: 2024
 *
 * NOTE(review): these strings are pinned to specific browser builds; refresh
 * them periodically so rotated agents remain plausible to fingerprinting.
 */
export const USER_AGENTS = [
  // Chrome on Windows
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
  // Chrome on macOS
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
  // Firefox on Windows
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0',
  // Firefox on macOS
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0',
  // Safari on macOS
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
  // Edge on Windows
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
  // Chrome on Linux
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
];
// ============================================================
// PROXY TYPES
// ============================================================
/**
 * A proxy endpoint as loaded from the `proxies` table.
 */
export interface Proxy {
  id: number;
  host: string;
  port: number;
  username?: string; // optional credentials for authenticated proxies
  password?: string;
  protocol: 'http' | 'https' | 'socks5';
  isActive: boolean; // cleared once the failure threshold is crossed (see ProxyRotator.markFailed)
  lastUsedAt: Date | null; // null until the proxy records its first success
  failureCount: number;
  successCount: number;
  avgResponseTimeMs: number | null; // rolling average; null before the first timed success
}

/**
 * Aggregate health numbers for a loaded proxy pool (see ProxyRotator.getStats).
 */
export interface ProxyStats {
  totalProxies: number;
  activeProxies: number;
  blockedProxies: number;
  avgSuccessRate: number; // 0..1, averaged over proxies with any recorded traffic
}
// ============================================================
// PROXY ROTATOR CLASS
// ============================================================
/**
 * Round-robin proxy rotator backed by the `proxies` DB table.
 *
 * The rotator degrades gracefully: with no pool configured or no proxies
 * loaded it simply returns null and all DB writes become no-ops.
 */
export class ProxyRotator {
  // Optional DB pool; without one the rotator is an in-memory no-op.
  private pool: Pool | null = null;
  // Working set of usable proxies (active, below the failure threshold).
  private proxies: Proxy[] = [];
  // Cursor into `proxies` for round-robin rotation.
  private currentIndex: number = 0;
  // When rotation last happened (diagnostics only).
  private lastRotation: Date = new Date();

  constructor(pool?: Pool) {
    this.pool = pool || null;
  }

  /**
   * Initialize with database pool (deferred wiring after construction).
   */
  setPool(pool: Pool): void {
    this.pool = pool;
  }

  /**
   * Load active proxies from the database, healthiest (fewest failures,
   * least recently used) first. A missing `proxies` table is tolerated:
   * the rotator just runs with an empty pool.
   */
  async loadProxies(): Promise<void> {
    if (!this.pool) {
      console.warn('[ProxyRotator] No database pool configured');
      return;
    }
    try {
      const result = await this.pool.query<Proxy>(`
        SELECT
          id,
          host,
          port,
          username,
          password,
          protocol,
          is_active as "isActive",
          last_used_at as "lastUsedAt",
          failure_count as "failureCount",
          success_count as "successCount",
          avg_response_time_ms as "avgResponseTimeMs"
        FROM proxies
        WHERE is_active = true
        ORDER BY failure_count ASC, last_used_at ASC NULLS FIRST
      `);
      this.proxies = result.rows;
      // FIX: reset the cursor — an index from a previous load may be stale
      // (pointing past the end of a smaller, freshly-loaded list).
      this.currentIndex = 0;
      console.log(`[ProxyRotator] Loaded ${this.proxies.length} active proxies`);
    } catch (error) {
      // Table might not exist - that's okay
      console.warn(`[ProxyRotator] Could not load proxies: ${error}`);
      this.proxies = [];
      this.currentIndex = 0;
    }
  }

  /**
   * Get next proxy in round-robin rotation. Returns null when the pool is empty.
   */
  getNext(): Proxy | null {
    if (this.proxies.length === 0) return null;
    this.currentIndex = (this.currentIndex + 1) % this.proxies.length;
    this.lastRotation = new Date();
    return this.proxies[this.currentIndex];
  }

  /**
   * Get current proxy without rotating. Returns null when the pool is empty.
   *
   * FIX: clamp the cursor before reading — removing a failed proxy shrinks
   * the array and could leave `currentIndex` out of range, which previously
   * made this return `undefined` despite the `Proxy | null` signature.
   */
  getCurrent(): Proxy | null {
    if (this.proxies.length === 0) return null;
    if (this.currentIndex >= this.proxies.length) {
      this.currentIndex = 0;
    }
    return this.proxies[this.currentIndex];
  }

  /**
   * Get proxy by ID (null if not in the in-memory pool).
   */
  getById(id: number): Proxy | null {
    return this.proxies.find(p => p.id === id) || null;
  }

  /**
   * Rotate to a specific proxy; returns false if it is not in the pool.
   */
  setProxy(id: number): boolean {
    const index = this.proxies.findIndex(p => p.id === id);
    if (index === -1) return false;
    this.currentIndex = index;
    this.lastRotation = new Date();
    return true;
  }

  /**
   * Mark proxy as failed. After 5 in-memory failures the proxy is dropped
   * from rotation; the DB row is deactivated at the same effective
   * threshold (`failure_count >= 4` pre-increment == 5 post-increment).
   */
  async markFailed(proxyId: number, error?: string): Promise<void> {
    // Update in-memory
    const proxy = this.proxies.find(p => p.id === proxyId);
    if (proxy) {
      proxy.failureCount++;
      // Deactivate if too many failures
      if (proxy.failureCount >= 5) {
        proxy.isActive = false;
        this.proxies = this.proxies.filter(p => p.id !== proxyId);
        // FIX: keep the cursor valid after shrinking the pool (see getCurrent).
        if (this.currentIndex >= this.proxies.length) {
          this.currentIndex = 0;
        }
        console.log(`[ProxyRotator] Proxy ${proxyId} deactivated after ${proxy.failureCount} failures`);
      }
    }
    // Update database (best-effort; rotation keeps working on DB errors).
    if (this.pool) {
      try {
        await this.pool.query(`
          UPDATE proxies
          SET
            failure_count = failure_count + 1,
            last_failure_at = NOW(),
            last_error = $2,
            is_active = CASE WHEN failure_count >= 4 THEN false ELSE is_active END
          WHERE id = $1
        `, [proxyId, error || null]);
      } catch (err) {
        console.error(`[ProxyRotator] Failed to update proxy ${proxyId}:`, err);
      }
    }
  }

  /**
   * Mark proxy as successful; keeps an exponentially-weighted (0.8/0.2)
   * rolling average of response time both in memory and in the DB.
   */
  async markSuccess(proxyId: number, responseTimeMs?: number): Promise<void> {
    // Update in-memory
    const proxy = this.proxies.find(p => p.id === proxyId);
    if (proxy) {
      proxy.successCount++;
      proxy.lastUsedAt = new Date();
      if (responseTimeMs !== undefined) {
        // Rolling average
        proxy.avgResponseTimeMs = proxy.avgResponseTimeMs
          ? (proxy.avgResponseTimeMs * 0.8) + (responseTimeMs * 0.2)
          : responseTimeMs;
      }
    }
    // Update database (best-effort).
    if (this.pool) {
      try {
        await this.pool.query(`
          UPDATE proxies
          SET
            success_count = success_count + 1,
            last_used_at = NOW(),
            avg_response_time_ms = CASE
              WHEN avg_response_time_ms IS NULL THEN $2
              ELSE (avg_response_time_ms * 0.8) + ($2 * 0.2)
            END
          WHERE id = $1
        `, [proxyId, responseTimeMs || null]);
      } catch (err) {
        console.error(`[ProxyRotator] Failed to update proxy ${proxyId}:`, err);
      }
    }
  }

  /**
   * Build a proxy URL (protocol://[user:pass@]host:port) for an HTTP client.
   */
  getProxyUrl(proxy: Proxy): string {
    const auth = proxy.username && proxy.password
      ? `${proxy.username}:${proxy.password}@`
      : '';
    return `${proxy.protocol}://${auth}${proxy.host}:${proxy.port}`;
  }

  /**
   * Aggregate stats for the in-memory pool.
   * Note: proxies that cross the failure threshold are removed from memory
   * by markFailed, so `blockedProxies` is usually 0 here — it only counts
   * rows that were already at the threshold when loaded.
   */
  getStats(): ProxyStats {
    const totalProxies = this.proxies.length;
    const activeProxies = this.proxies.filter(p => p.isActive).length;
    const blockedProxies = this.proxies.filter(p => p.failureCount >= 5).length;
    const successRates = this.proxies
      .filter(p => p.successCount + p.failureCount > 0)
      .map(p => p.successCount / (p.successCount + p.failureCount));
    const avgSuccessRate = successRates.length > 0
      ? successRates.reduce((a, b) => a + b, 0) / successRates.length
      : 0;
    return {
      totalProxies,
      activeProxies,
      blockedProxies,
      avgSuccessRate,
    };
  }

  /**
   * Check if proxy pool has available proxies
   */
  hasAvailableProxies(): boolean {
    return this.proxies.length > 0;
  }
}
// ============================================================
// USER AGENT ROTATOR CLASS
// ============================================================
/**
 * Round-robin user-agent rotator.
 *
 * Starts at a random position so independent instances don't all emit the
 * same sequence, then cycles through the pool deterministically.
 */
export class UserAgentRotator {
  private agents: string[];
  private cursor = 0;
  private lastRotation = new Date();

  constructor(userAgents: string[] = USER_AGENTS) {
    this.agents = userAgents;
    // Random starting offset to avoid predictable rotation patterns.
    this.cursor = Math.floor(Math.random() * userAgents.length);
  }

  /** Advance the cursor and return the next user agent. */
  getNext(): string {
    this.cursor = (this.cursor + 1) % this.agents.length;
    this.lastRotation = new Date();
    return this.agents[this.cursor];
  }

  /** Return the user agent at the current cursor without advancing. */
  getCurrent(): string {
    return this.agents[this.cursor];
  }

  /** Pick a user agent uniformly at random, independent of the cursor. */
  getRandom(): string {
    return this.agents[Math.floor(Math.random() * this.agents.length)];
  }

  /** Number of user agents in the pool. */
  getCount(): number {
    return this.agents.length;
  }
}
// ============================================================
// COMBINED ROTATOR (for convenience)
// ============================================================
/**
 * Convenience facade bundling a ProxyRotator and a UserAgentRotator behind
 * one object, so crawl code can rotate either or both with a single call.
 */
export class CrawlRotator {
  public proxy: ProxyRotator;
  public userAgent: UserAgentRotator;

  constructor(pool?: Pool) {
    this.proxy = new ProxyRotator(pool);
    this.userAgent = new UserAgentRotator();
  }

  /** Load proxies from the database so rotation has something to cycle. */
  async initialize(): Promise<void> {
    await this.proxy.loadProxies();
  }

  /** Advance to the next proxy (null when the proxy pool is empty). */
  rotateProxy(): Proxy | null {
    return this.proxy.getNext();
  }

  /** Advance to the next user agent. */
  rotateUserAgent(): string {
    return this.userAgent.getNext();
  }

  /** Advance both rotations at once. */
  rotateBoth(): { proxy: Proxy | null; userAgent: string } {
    const proxy = this.proxy.getNext();
    const userAgent = this.userAgent.getNext();
    return { proxy, userAgent };
  }

  /** Peek at the active proxy/user-agent pair without rotating. */
  getCurrent(): { proxy: Proxy | null; userAgent: string } {
    return {
      proxy: this.proxy.getCurrent(),
      userAgent: this.userAgent.getCurrent(),
    };
  }

  /** Credit the active proxy with a success (no-op when none is active). */
  async recordSuccess(responseTimeMs?: number): Promise<void> {
    const active = this.proxy.getCurrent();
    if (active) {
      await this.proxy.markSuccess(active.id, responseTimeMs);
    }
  }

  /** Charge the active proxy with a failure (no-op when none is active). */
  async recordFailure(error?: string): Promise<void> {
    const active = this.proxy.getCurrent();
    if (active) {
      await this.proxy.markFailed(active.id, error);
    }
  }
}
// ============================================================
// DATABASE OPERATIONS
// ============================================================
/**
 * Persist the proxy/user-agent pair a dispensary is currently crawled with.
 *
 * @param pool - pg connection pool
 * @param dispensaryId - dispensary row to update
 * @param proxyId - proxy in use, or null to clear
 * @param userAgent - user-agent string in use, or null to clear
 */
export async function updateDispensaryRotation(
  pool: Pool,
  dispensaryId: number,
  proxyId: number | null,
  userAgent: string | null
): Promise<void> {
  const params = [dispensaryId, proxyId, userAgent];
  await pool.query(`
    UPDATE dispensaries
    SET
      current_proxy_id = $2,
      current_user_agent = $3
    WHERE id = $1
  `, params);
}
/**
 * Fetch the proxy/user-agent pair last assigned to a dispensary.
 * Returns { proxyId: null, userAgent: null } when the row does not exist.
 */
export async function getDispensaryRotation(
  pool: Pool,
  dispensaryId: number
): Promise<{ proxyId: number | null; userAgent: string | null }> {
  const result = await pool.query(`
    SELECT current_proxy_id as "proxyId", current_user_agent as "userAgent"
    FROM dispensaries
    WHERE id = $1
  `, [dispensaryId]);
  // rows[0] is undefined for a missing dispensary; fall back to nulls.
  return result.rows[0] ?? { proxyId: null, userAgent: null };
}
// ============================================================
// SINGLETON INSTANCES
// ============================================================
// Module-level shared instances. proxyRotator/crawlRotator start without a DB
// pool — wire one via setPool()/initialize() before relying on proxy rotation;
// user-agent rotation works immediately.
export const proxyRotator = new ProxyRotator();
export const userAgentRotator = new UserAgentRotator();
export const crawlRotator = new CrawlRotator();

View File

@@ -0,0 +1,435 @@
/**
* Unified Retry Manager
*
* Handles retry logic with exponential backoff, jitter, and
* intelligent error-based decisions (rotate proxy, rotate UA, etc.)
*
* Phase 1: Crawler Reliability & Stabilization
*/
import {
CrawlErrorCodeType,
CrawlErrorCode,
classifyError,
getErrorMetadata,
isRetryable,
shouldRotateProxy,
shouldRotateUserAgent,
getBackoffMultiplier,
} from './error-taxonomy';
import { DEFAULT_CONFIG } from './store-validator';
// ============================================================
// RETRY CONFIGURATION
// ============================================================
/** Tunables for retry and backoff behavior. */
export interface RetryConfig {
  maxRetries: number; // retries allowed after the initial attempt
  baseBackoffMs: number; // backoff before the first retry
  maxBackoffMs: number; // hard cap on any single backoff
  backoffMultiplier: number; // exponential growth factor per attempt
  jitterFactor: number; // 0.0 - 1.0 (percentage of backoff to randomize)
}
// Defaults mirror the store-validator config so both layers back off in step.
export const DEFAULT_RETRY_CONFIG: RetryConfig = {
  maxRetries: DEFAULT_CONFIG.maxRetries,
  baseBackoffMs: DEFAULT_CONFIG.baseBackoffMs,
  maxBackoffMs: DEFAULT_CONFIG.maxBackoffMs,
  backoffMultiplier: DEFAULT_CONFIG.backoffMultiplier,
  jitterFactor: 0.25, // +/- 25% jitter
};
// ============================================================
// RETRY CONTEXT
// ============================================================
/**
 * Mutable state tracked across the attempts of a single retried operation.
 */
export interface RetryContext {
  attemptNumber: number; // attempts made so far (0 before the first)
  maxAttempts: number; // maxRetries + 1 (the initial attempt counts)
  lastErrorCode: CrawlErrorCodeType | null; // taxonomy code of the most recent error
  lastHttpStatus: number | null; // HTTP status of the most recent error, if any
  totalBackoffMs: number; // cumulative backoff decided across retries
  proxyRotated: boolean; // a proxy rotation was requested at least once
  userAgentRotated: boolean; // a UA rotation was requested at least once
  startedAt: Date; // when the operation began (for elapsed-time reporting)
}
/**
 * Decision about what to do after an error
 */
export interface RetryDecision {
  shouldRetry: boolean; // false = give up (non-retryable or budget spent)
  reason: string; // human-readable explanation for logs
  backoffMs: number; // jittered wait before the next attempt (0 if not retrying)
  rotateProxy: boolean; // caller should switch proxies before retrying
  rotateUserAgent: boolean; // caller should switch user agents before retrying
  errorCode: CrawlErrorCodeType; // classified error that drove the decision
  attemptNumber: number; // attempt on which the error occurred
}
// ============================================================
// RETRY MANAGER CLASS
// ============================================================
/**
 * Tracks retry attempts for a single operation and decides, per error,
 * whether to retry, how long to back off, and what (proxy/UA) to rotate.
 */
export class RetryManager {
  private config: RetryConfig;
  private state: RetryContext;

  constructor(config: Partial<RetryConfig> = {}) {
    this.config = { ...DEFAULT_RETRY_CONFIG, ...config };
    this.state = this.freshContext();
  }

  /** Build a zeroed context for a new operation. */
  private freshContext(): RetryContext {
    return {
      attemptNumber: 0,
      maxAttempts: this.config.maxRetries + 1, // +1: the initial attempt counts
      lastErrorCode: null,
      lastHttpStatus: null,
      totalBackoffMs: 0,
      proxyRotated: false,
      userAgentRotated: false,
      startedAt: new Date(),
    };
  }

  /** Discard all attempt history and start over. */
  reset(): void {
    this.state = this.freshContext();
  }

  /** 1-based number of the attempt about to run. */
  getAttemptNumber(): number {
    return this.state.attemptNumber + 1;
  }

  /** True while attempts remain within the budget. */
  shouldAttempt(): boolean {
    return this.state.attemptNumber < this.state.maxAttempts;
  }

  /** Count an attempt; call at the start of each try. */
  recordAttempt(): void {
    this.state.attemptNumber++;
  }

  /**
   * Classify an error and produce a retry decision (backoff + rotations).
   * The error is always recorded in the context, even when not retrying.
   */
  evaluateError(
    error: Error | string | null,
    httpStatus?: number
  ): RetryDecision {
    const errorCode = classifyError(error, httpStatus);
    const metadata = getErrorMetadata(errorCode);
    const attemptNumber = this.state.attemptNumber;

    // Remember what happened regardless of the decision below.
    this.state.lastErrorCode = errorCode;
    this.state.lastHttpStatus = httpStatus || null;

    // Shared shape for both "give up" branches.
    const noRetry = (reason: string): RetryDecision => ({
      shouldRetry: false,
      reason,
      backoffMs: 0,
      rotateProxy: false,
      rotateUserAgent: false,
      errorCode,
      attemptNumber,
    });

    if (!isRetryable(errorCode)) {
      return noRetry(`Error ${errorCode} is not retryable: ${metadata.description}`);
    }
    if (!this.shouldAttempt()) {
      return noRetry(`Max retries (${this.config.maxRetries}) exhausted`);
    }

    // Exponential backoff scaled by the error class, then jittered.
    const backoffMs = this.jittered(this.backoffFor(attemptNumber, errorCode));
    this.state.totalBackoffMs += backoffMs;

    const rotateProxy = shouldRotateProxy(errorCode);
    const rotateUserAgent = shouldRotateUserAgent(errorCode);
    if (rotateProxy) this.state.proxyRotated = true;
    if (rotateUserAgent) this.state.userAgentRotated = true;

    const actions: string[] = [];
    if (rotateProxy) actions.push('rotate proxy');
    if (rotateUserAgent) actions.push('rotate UA');
    const rotationStr = actions.length > 0 ? ` (${actions.join(', ')})` : '';

    return {
      shouldRetry: true,
      reason: `Retrying after ${errorCode}${rotationStr}, backoff ${backoffMs}ms`,
      backoffMs,
      rotateProxy,
      rotateUserAgent,
      errorCode,
      attemptNumber,
    };
  }

  /** Exponential backoff for an attempt, scaled per error class, capped. */
  private backoffFor(attemptNumber: number, errorCode: CrawlErrorCodeType): number {
    const exponential = this.config.baseBackoffMs *
      Math.pow(this.config.backoffMultiplier, attemptNumber - 1);
    const adjusted = exponential * getBackoffMultiplier(errorCode);
    return Math.min(adjusted, this.config.maxBackoffMs);
  }

  /** Randomize +/- jitterFactor around the backoff (never below zero). */
  private jittered(backoffMs: number): number {
    const range = backoffMs * this.config.jitterFactor;
    const jitter = (Math.random() * 2 - 1) * range;
    return Math.max(0, Math.round(backoffMs + jitter));
  }

  /** Snapshot of the retry history for logging / error reporting. */
  getSummary(): RetryContextSummary {
    return {
      attemptsMade: this.state.attemptNumber,
      maxAttempts: this.state.maxAttempts,
      lastErrorCode: this.state.lastErrorCode,
      lastHttpStatus: this.state.lastHttpStatus,
      totalBackoffMs: this.state.totalBackoffMs,
      totalElapsedMs: Date.now() - this.state.startedAt.getTime(),
      proxyWasRotated: this.state.proxyRotated,
      userAgentWasRotated: this.state.userAgentRotated,
    };
  }
}
/** Immutable snapshot of a RetryManager's context (see getSummary). */
export interface RetryContextSummary {
  attemptsMade: number; // attempts actually executed
  maxAttempts: number; // budget (maxRetries + 1)
  lastErrorCode: CrawlErrorCodeType | null; // null if no error was ever classified
  lastHttpStatus: number | null;
  totalBackoffMs: number; // cumulative backoff decided across retries
  totalElapsedMs: number; // wall-clock time since the operation started
  proxyWasRotated: boolean;
  userAgentWasRotated: boolean;
}
// ============================================================
// CONVENIENCE FUNCTIONS
// ============================================================
/** Resolve after `ms` milliseconds. */
export async function sleep(ms: number): Promise<void> {
  await new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Run `fn` with automatic retries, exponential backoff, and optional
 * proxy/user-agent rotation hooks driven by the error taxonomy.
 *
 * Resolves with the successful result plus a retry summary. Throws
 * RetryExhaustedError once the error is non-retryable or retries run out.
 */
export async function withRetry<T>(
  fn: (attemptNumber: number) => Promise<T>,
  config: Partial<RetryConfig> = {},
  callbacks?: {
    onRetry?: (decision: RetryDecision) => void | Promise<void>;
    onRotateProxy?: () => void | Promise<void>;
    onRotateUserAgent?: () => void | Promise<void>;
  }
): Promise<{ result: T; summary: RetryContextSummary }> {
  const manager = new RetryManager(config);

  while (manager.shouldAttempt()) {
    manager.recordAttempt();
    const attemptNumber = manager.getAttemptNumber();
    try {
      const result = await fn(attemptNumber);
      return { result, summary: manager.getSummary() };
    } catch (error) {
      const err = error instanceof Error ? error : new Error(String(error));
      // Common places HTTP clients stash the status code.
      const httpStatus = (error as any)?.status || (error as any)?.statusCode;
      const decision = manager.evaluateError(err, httpStatus);

      if (!decision.shouldRetry) {
        // Surface the final failure with full retry context attached.
        throw new RetryExhaustedError(
          `${err.message} (${decision.reason})`,
          err,
          manager.getSummary()
        );
      }

      // Give callers a chance to react (log, rotate proxy/UA) before waiting.
      if (callbacks?.onRetry) {
        await callbacks.onRetry(decision);
      }
      if (decision.rotateProxy && callbacks?.onRotateProxy) {
        await callbacks.onRotateProxy();
      }
      if (decision.rotateUserAgent && callbacks?.onRotateUserAgent) {
        await callbacks.onRotateUserAgent();
      }

      console.log(
        `[RetryManager] Attempt ${attemptNumber} failed: ${decision.errorCode}. ` +
        `${decision.reason}. Waiting ${decision.backoffMs}ms before retry.`
      );
      await sleep(decision.backoffMs);
    }
  }

  // Defensive: shouldAttempt() was false before the first try ever ran.
  throw new RetryExhaustedError(
    'Max retries exhausted',
    null,
    manager.getSummary()
  );
}
// ============================================================
// CUSTOM ERROR CLASS
// ============================================================
/**
 * Thrown when an operation fails permanently: either the error class is
 * non-retryable or the retry budget is spent. Carries the original error
 * and a summary of every attempt made.
 */
export class RetryExhaustedError extends Error {
  public readonly errorCode: CrawlErrorCodeType;

  constructor(
    message: string,
    public readonly originalError: Error | null,
    public readonly summary: RetryContextSummary
  ) {
    super(message);
    this.name = 'RetryExhaustedError';
    // Fall back to UNKNOWN when no error was ever classified.
    this.errorCode = summary.lastErrorCode || CrawlErrorCode.UNKNOWN_ERROR;
  }
}
// ============================================================
// BACKOFF CALCULATOR (for external use)
// ============================================================
/**
 * Minutes to wait before the next crawl: the base frequency doubled per
 * consecutive failure (capped at `maxBackoffMultiplier`), plus up to 10%
 * random jitter so failed stores don't all retry simultaneously.
 */
export function calculateNextCrawlDelay(
  consecutiveFailures: number,
  baseFrequencyMinutes: number,
  maxBackoffMultiplier: number = 4.0
): number {
  const penalty = Math.min(Math.pow(2, consecutiveFailures), maxBackoffMultiplier);
  const delayMinutes = baseFrequencyMinutes * penalty;
  // delay + delay * rand * 0.1 == delay * (1 + rand * 0.1)
  return Math.round(delayMinutes * (1 + Math.random() * 0.1));
}
/** Absolute timestamp of the next crawl, derived from calculateNextCrawlDelay. */
export function calculateNextCrawlAt(
  consecutiveFailures: number,
  baseFrequencyMinutes: number
): Date {
  const minutes = calculateNextCrawlDelay(consecutiveFailures, baseFrequencyMinutes);
  return new Date(Date.now() + minutes * 60 * 1000);
}
// ============================================================
// STATUS DETERMINATION
// ============================================================
/**
 * Map a consecutive-failure count onto a store health status.
 * Thresholds are inclusive, and `failed` takes precedence over `degraded`.
 */
export function determineCrawlStatus(
  consecutiveFailures: number,
  thresholds: { degraded: number; failed: number } = { degraded: 3, failed: 10 }
): 'active' | 'degraded' | 'failed' {
  if (consecutiveFailures >= thresholds.failed) return 'failed';
  return consecutiveFailures >= thresholds.degraded ? 'degraded' : 'active';
}
/**
 * Decide whether a failed store is due for a recovery attempt.
 * The wait grows linearly with failures (capped at 5x the base interval);
 * a store with no recorded failure is always eligible.
 */
export function shouldAttemptRecovery(
  lastFailureAt: Date | null,
  consecutiveFailures: number,
  recoveryIntervalHours: number = 24
): boolean {
  if (!lastFailureAt) return true;
  const waitHours = recoveryIntervalHours * Math.min(consecutiveFailures, 5);
  const eligibleAt = lastFailureAt.getTime() + waitHours * 60 * 60 * 1000;
  return Date.now() >= eligibleAt;
}
// ============================================================
// SINGLETON INSTANCE
// ============================================================
// Shared default-config manager. Its context is stateful — call reset()
// before reusing it for a new operation, or prefer withRetry()/new instances.
export const retryManager = new RetryManager();

Some files were not shown because too many files have changed in this diff Show More