Merge pull request 'feature/workers-dashboard' (#1) from feature/workers-dashboard into master
Reviewed-on: https://code.cannabrands.app/Creationshop/dispensary-scraper/pulls/1
.gitignore (new file, vendored, 53 lines)
@@ -0,0 +1,53 @@
# Dependencies
node_modules/

# Build outputs (compiled JS, not source)
backend/dist/
cannaiq/dist/
findadispo/build/
findagram/build/
frontend/dist/

# Environment files (local secrets)
.env
.env.local
.env.*.local
backend/.env
backend/.env.local

# Database dumps and backups (large files)
*.dump
*.sql.backup
backup_*.sql

# IDE
.idea/
.vscode/
*.swp
*.swo

# OS
.DS_Store
Thumbs.db

# Logs
*.log
npm-debug.log*

# Local storage (runtime data, not source)
backend/storage/

# Product images (crawled data, not source)
backend/public/images/products/
backend/public/images/brands/

# Vite cache
**/node_modules/.vite/

# Test coverage
coverage/

# Temporary files
*.tmp
*.temp
llm-scraper/
.woodpecker/.ci.yml (new file, 140 lines)
@@ -0,0 +1,140 @@
when:
  - event: [push, pull_request]

steps:
  # Build checks
  typecheck-backend:
    image: node:20
    commands:
      - cd backend
      - npm ci
      - npx tsc --noEmit || true

  build-cannaiq:
    image: node:20
    commands:
      - cd cannaiq
      - npm ci
      - npx tsc --noEmit
      - npm run build

  build-findadispo:
    image: node:20
    commands:
      - cd findadispo/frontend
      - npm ci
      - npm run build

  build-findagram:
    image: node:20
    commands:
      - cd findagram/frontend
      - npm ci
      - npm run build

  # Docker builds - only on master
  docker-backend:
    image: woodpeckerci/plugin-docker-buildx
    settings:
      registry: code.cannabrands.app
      repo: code.cannabrands.app/creationshop/dispensary-scraper
      tags:
        - latest
        - ${CI_COMMIT_SHA:0:8}
      dockerfile: backend/Dockerfile
      context: backend
      username:
        from_secret: registry_username
      password:
        from_secret: registry_password
      platforms: linux/amd64
      provenance: false
    when:
      branch: master
      event: push

  docker-cannaiq:
    image: woodpeckerci/plugin-docker-buildx
    settings:
      registry: code.cannabrands.app
      repo: code.cannabrands.app/creationshop/cannaiq-frontend
      tags:
        - latest
        - ${CI_COMMIT_SHA:0:8}
      dockerfile: cannaiq/Dockerfile
      context: cannaiq
      username:
        from_secret: registry_username
      password:
        from_secret: registry_password
      platforms: linux/amd64
      provenance: false
    when:
      branch: master
      event: push

  docker-findadispo:
    image: woodpeckerci/plugin-docker-buildx
    settings:
      registry: code.cannabrands.app
      repo: code.cannabrands.app/creationshop/findadispo-frontend
      tags:
        - latest
        - ${CI_COMMIT_SHA:0:8}
      dockerfile: findadispo/frontend/Dockerfile
      context: findadispo/frontend
      username:
        from_secret: registry_username
      password:
        from_secret: registry_password
      platforms: linux/amd64
      provenance: false
    when:
      branch: master
      event: push

  docker-findagram:
    image: woodpeckerci/plugin-docker-buildx
    settings:
      registry: code.cannabrands.app
      repo: code.cannabrands.app/creationshop/findagram-frontend
      tags:
        - latest
        - ${CI_COMMIT_SHA:0:8}
      dockerfile: findagram/frontend/Dockerfile
      context: findagram/frontend
      username:
        from_secret: registry_username
      password:
        from_secret: registry_password
      platforms: linux/amd64
      provenance: false
    when:
      branch: master
      event: push

  # Deploy to Kubernetes
  deploy:
    image: bitnami/kubectl:latest
    environment:
      KUBECONFIG_CONTENT:
        from_secret: kubeconfig_data
    commands:
      - echo "Deploying to Kubernetes..."
      - mkdir -p ~/.kube
      - echo "$KUBECONFIG_CONTENT" | tr -d '[:space:]' | base64 -d > ~/.kube/config
      - chmod 600 ~/.kube/config
      - kubectl set image deployment/scraper scraper=code.cannabrands.app/creationshop/dispensary-scraper:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
      - kubectl set image deployment/scraper-worker scraper-worker=code.cannabrands.app/creationshop/dispensary-scraper:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
      - kubectl set image deployment/cannaiq-frontend cannaiq-frontend=code.cannabrands.app/creationshop/cannaiq-frontend:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
      - kubectl set image deployment/findadispo-frontend findadispo-frontend=code.cannabrands.app/creationshop/findadispo-frontend:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
      - kubectl set image deployment/findagram-frontend findagram-frontend=code.cannabrands.app/creationshop/findagram-frontend:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
      - kubectl rollout status deployment/scraper -n dispensary-scraper --timeout=300s
      - kubectl rollout status deployment/scraper-worker -n dispensary-scraper --timeout=300s
      - kubectl rollout status deployment/cannaiq-frontend -n dispensary-scraper --timeout=120s
      - kubectl rollout status deployment/findadispo-frontend -n dispensary-scraper --timeout=120s
      - kubectl rollout status deployment/findagram-frontend -n dispensary-scraper --timeout=120s
      - echo "All deployments complete!"
    when:
      branch: master
      event: push
backend/.env (modified, 33 lines)
@@ -1,17 +1,30 @@
PORT=3010
NODE_ENV=development

# Database
DATABASE_URL=postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus
# =============================================================================
# CannaiQ Database (dutchie_menus) - PRIMARY DATABASE
# =============================================================================
# This is where all schema migrations run and where canonical tables live.
# All CANNAIQ_DB_* variables are REQUIRED - connection will fail if missing.
CANNAIQ_DB_HOST=localhost
CANNAIQ_DB_PORT=54320
CANNAIQ_DB_NAME=dutchie_menus
CANNAIQ_DB_USER=dutchie
CANNAIQ_DB_PASS=dutchie_local_pass

# MinIO (connecting to Docker from host)
MINIO_ENDPOINT=localhost
MINIO_PORT=9020
MINIO_USE_SSL=false
MINIO_ACCESS_KEY=minioadmin
MINIO_SECRET_KEY=minioadmin
MINIO_BUCKET=dutchie
MINIO_PUBLIC_ENDPOINT=http://localhost:9020
# =============================================================================
# Legacy Database (dutchie_legacy) - READ-ONLY SOURCE
# =============================================================================
# Used ONLY by ETL scripts to read historical data.
# NEVER run migrations against this database.
LEGACY_DB_HOST=localhost
LEGACY_DB_PORT=54320
LEGACY_DB_NAME=dutchie_legacy
LEGACY_DB_USER=dutchie
LEGACY_DB_PASS=dutchie_local_pass

# Local image storage (no MinIO per CLAUDE.md)
LOCAL_IMAGES_PATH=./public/images

# JWT
JWT_SECRET=your-secret-key-change-in-production
backend/.env.example (new file, 50 lines)
@@ -0,0 +1,50 @@
# CannaiQ Backend Environment Configuration
# Copy this file to .env and fill in the values

# Server
PORT=3010
NODE_ENV=development

# =============================================================================
# CANNAIQ DATABASE (dutchie_menus) - PRIMARY DATABASE
# =============================================================================
# This is where ALL schema migrations run and where canonical tables live.
# All CANNAIQ_DB_* variables are REQUIRED - no defaults.
# The application will fail to start if any are missing.

CANNAIQ_DB_HOST=localhost
CANNAIQ_DB_PORT=54320
CANNAIQ_DB_NAME=dutchie_menus  # MUST be dutchie_menus - NOT dutchie_legacy
CANNAIQ_DB_USER=dutchie
CANNAIQ_DB_PASS=

# Alternative: Use a full connection URL instead of individual vars
# If set, this takes priority over individual vars above
# CANNAIQ_DB_URL=postgresql://user:pass@host:port/dutchie_menus

# =============================================================================
# LEGACY DATABASE (dutchie_legacy) - READ-ONLY FOR ETL
# =============================================================================
# Used ONLY by ETL scripts to read historical data.
# NEVER run migrations against this database.
# These are only needed when running 042_legacy_import.ts

LEGACY_DB_HOST=localhost
LEGACY_DB_PORT=54320
LEGACY_DB_NAME=dutchie_legacy  # READ-ONLY - never migrated
LEGACY_DB_USER=dutchie
LEGACY_DB_PASS=

# Alternative: Use a full connection URL instead of individual vars
# LEGACY_DB_URL=postgresql://user:pass@host:port/dutchie_legacy

# =============================================================================
# LOCAL STORAGE
# =============================================================================
# Local image storage path (no MinIO)
LOCAL_IMAGES_PATH=./public/images

# =============================================================================
# AUTHENTICATION
# =============================================================================
JWT_SECRET=your-secret-key-change-in-production
backend/docker-compose.local.yml (new file, 30 lines)
@@ -0,0 +1,30 @@
# CannaiQ Local Development Environment
# Run: docker-compose -f docker-compose.local.yml up -d
#
# Services:
#   - cannaiq-postgres: PostgreSQL at localhost:54320
#
# Note: Backend and frontend run outside Docker for faster dev iteration

version: '3.8'

services:
  cannaiq-postgres:
    image: postgres:15-alpine
    container_name: cannaiq-postgres
    environment:
      POSTGRES_USER: cannaiq
      POSTGRES_PASSWORD: cannaiq_local_pass
      POSTGRES_DB: cannaiq
    ports:
      - "54320:5432"
    volumes:
      - cannaiq-postgres-data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U cannaiq"]
      interval: 10s
      timeout: 5s
      retries: 5

volumes:
  cannaiq-postgres-data:
backend/docs/ANALYTICS_RUNBOOK.md (new file, 712 lines)
@@ -0,0 +1,712 @@
# CannaiQ Analytics Runbook

Phase 3: Analytics Engine - Complete Implementation Guide

## Overview

The CannaiQ Analytics Engine provides real-time insights into cannabis market data across price trends, brand penetration, category performance, store changes, and competitive positioning.

## Architecture

```
┌─────────────────────────────────────────────────────────────────┐
│                            API Layer                             │
│                       /api/az/analytics/*                        │
└─────────────────────────────────────────────────────────────────┘
                                │
                                ▼
┌─────────────────────────────────────────────────────────────────┐
│                       Analytics Services                         │
│  ┌──────────────┐ ┌──────────────┐ ┌──────────────────────┐      │
│  │PriceTrend    │ │Penetration   │ │CategoryAnalytics     │      │
│  │Service       │ │Service       │ │Service               │      │
│  └──────────────┘ └──────────────┘ └──────────────────────┘      │
│  ┌──────────────┐ ┌────────────────┐ ┌──────────────────────┐    │
│  │StoreChange   │ │BrandOpportunity│ │AnalyticsCache        │    │
│  │Service       │ │Service         │ │(15-min TTL)          │    │
│  └──────────────┘ └────────────────┘ └──────────────────────┘    │
└─────────────────────────────────────────────────────────────────┘
                                │
                                ▼
┌─────────────────────────────────────────────────────────────────┐
│                        Canonical Tables                          │
│  store_products │ store_product_snapshots │ brands │ categories  │
│  dispensaries │ brand_snapshots │ category_snapshots             │
└─────────────────────────────────────────────────────────────────┘
```

## Services

### 1. PriceTrendService

Provides time-series price analytics.

**Key Methods:**

| Method | Description |
|--------|-------------|
| `getProductPriceTrend(productId, storeId?, days)` | Price history for a product |
| `getBrandPriceTrend(brandName, filters)` | Average prices for a brand |
| `getCategoryPriceTrend(category, filters)` | Category-level price trends |
| `getPriceSummary(filters)` | 7d/30d/90d price averages |
| `detectPriceCompression(category, state?)` | Price war detection |
| `getGlobalPriceStats()` | Market-wide pricing overview |

**Filters:**
```typescript
interface PriceFilters {
  storeId?: number;
  brandName?: string;
  category?: string;
  state?: string;
  days?: number; // default: 30
}
```

**Price Compression Detection:**
- Calculates standard deviation of prices within category
- Returns compression score 0-100 (higher = more compressed)
- Identifies brands converging toward mean price
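As a rough TypeScript sketch of that scoring (the mapping from relative spread to a 0-100 score is an assumption; the service may weight it differently):

```typescript
// Illustrative only - not the PriceTrendService internals.
// Scores a set of prices: tighter clustering => higher compression.
function compressionScore(prices: number[]): number {
  if (prices.length < 2) return 0;
  const mean = prices.reduce((a, b) => a + b, 0) / prices.length;
  if (mean <= 0) return 0;
  const variance = prices.reduce((s, p) => s + (p - mean) ** 2, 0) / prices.length;
  const cv = Math.sqrt(variance) / mean; // coefficient of variation
  return Math.max(0, Math.min(100, Math.round((1 - cv) * 100)));
}
```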
---

### 2. PenetrationService

Tracks brand market presence across stores and states.

**Key Methods:**

| Method | Description |
|--------|-------------|
| `getBrandPenetration(brandName, filters)` | Store count, SKU count, coverage |
| `getTopBrandsByPenetration(limit, filters)` | Leaderboard of dominant brands |
| `getPenetrationTrend(brandName, days)` | Historical penetration growth |
| `getShelfShareByCategory(brandName)` | % of shelf per category |
| `getBrandPresenceByState(brandName)` | Multi-state presence map |
| `getStoresCarryingBrand(brandName)` | List of stores carrying brand |
| `getPenetrationHeatmap(brandName?)` | Geographic distribution |

**Penetration Calculation:**
```
Penetration % = (Stores with Brand / Total Stores in Market) × 100
```
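The same formula as a minimal TypeScript helper (rounding to two decimals is an illustrative choice, not the service's documented behavior):

```typescript
// Minimal sketch of the penetration formula above.
function penetrationPercent(storesWithBrand: number, totalStores: number): number {
  if (totalStores === 0) return 0;
  return Math.round((storesWithBrand / totalStores) * 10000) / 100;
}

// e.g. penetrationPercent(125, 320) => 39.06
```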

---

### 3. CategoryAnalyticsService

Analyzes category performance and trends.

**Key Methods:**

| Method | Description |
|--------|-------------|
| `getCategorySummary(category?, filters)` | SKU count, avg price, stores |
| `getCategoryGrowth(days, filters)` | 7d/30d/90d growth rates |
| `getCategoryGrowthTrend(category, days)` | Time-series category growth |
| `getCategoryHeatmap(metric, periods)` | Visual heatmap data |
| `getTopMovers(limit, days)` | Fastest growing/declining categories |
| `getSubcategoryBreakdown(category)` | Drill-down into subcategories |

**Time Windows:**
- 7 days: Short-term volatility
- 30 days: Monthly trends
- 90 days: Seasonal patterns

---

### 4. StoreChangeService

Tracks product adds/drops, brand changes, and price movements per store.

**Key Methods:**

| Method | Description |
|--------|-------------|
| `getStoreChangeSummary(storeId)` | Overview of recent changes |
| `getStoreChangeEvents(storeId, filters)` | Event log (add, drop, price, OOS) |
| `getNewBrands(storeId, days)` | Brands added to store |
| `getLostBrands(storeId, days)` | Brands dropped from store |
| `getProductChanges(storeId, type, days)` | Filtered product changes |
| `getCategoryLeaderboard(category, limit)` | Top stores for category |
| `getMostActiveStores(days, limit)` | Stores with most changes |
| `compareStores(store1, store2)` | Side-by-side store comparison |

**Event Types:**
- `added` - New product appeared
- `discontinued` - Product removed
- `price_drop` - Price decreased
- `price_increase` - Price increased
- `restocked` - OOS → In Stock
- `out_of_stock` - In Stock → OOS
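A sketch of how these events could be typed on the backend (the type name is illustrative, not the service's actual export):

```typescript
// Mirrors the event types listed above.
type StoreChangeEventType =
  | 'added'          // new product appeared
  | 'discontinued'   // product removed
  | 'price_drop'     // price decreased
  | 'price_increase' // price increased
  | 'restocked'      // OOS -> in stock
  | 'out_of_stock';  // in stock -> OOS
```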

---

### 5. BrandOpportunityService

Competitive intelligence and opportunity identification.

**Key Methods:**

| Method | Description |
|--------|-------------|
| `getBrandOpportunity(brandName)` | Full opportunity analysis |
| `getMarketPositionSummary(brandName)` | Market position vs competitors |
| `getAlerts(filters)` | Analytics-generated alerts |
| `markAlertsRead(alertIds)` | Mark alerts as read |

**Opportunity Analysis Includes:**
- White space stores (potential targets)
- Competitive threats (brands gaining share)
- Pricing opportunities (underpriced vs market)
- Missing SKU recommendations

---

### 6. AnalyticsCache

In-memory caching with database fallback.

**Configuration:**
```typescript
const cache = new AnalyticsCache(pool, {
  defaultTtlMinutes: 15,
});
```

**Usage Pattern:**
```typescript
const data = await cache.getOrCompute(cacheKey, async () => {
  // Expensive query here
  return result;
});
```

**Cache Management:**
- `GET /api/az/analytics/cache/stats` - View cache stats
- `POST /api/az/analytics/cache/clear?pattern=price*` - Clear by pattern
- Auto-cleanup of expired entries every 5 minutes

---

## API Endpoints Reference

### Price Endpoints

```bash
# Product price trend (last 30 days)
GET /api/az/analytics/price/product/12345?days=30

# Brand price trend with filters
GET /api/az/analytics/price/brand/Cookies?storeId=101&category=Flower&days=90

# Category median price
GET /api/az/analytics/price/category/Vaporizers?state=AZ

# Price summary (7d/30d/90d)
GET /api/az/analytics/price/summary?brand=Stiiizy&state=AZ

# Detect price wars
GET /api/az/analytics/price/compression/Flower?state=AZ

# Global stats
GET /api/az/analytics/price/global
```

### Penetration Endpoints

```bash
# Brand penetration
GET /api/az/analytics/penetration/brand/Cookies

# Top brands leaderboard
GET /api/az/analytics/penetration/top?limit=20&state=AZ&category=Flower

# Penetration trend
GET /api/az/analytics/penetration/trend/Cookies?days=90

# Shelf share by category
GET /api/az/analytics/penetration/shelf-share/Cookies

# Multi-state presence
GET /api/az/analytics/penetration/by-state/Cookies

# Stores carrying brand
GET /api/az/analytics/penetration/stores/Cookies

# Heatmap data
GET /api/az/analytics/penetration/heatmap?brand=Cookies
```

### Category Endpoints

```bash
# Category summary
GET /api/az/analytics/category/summary?category=Flower&state=AZ

# Category growth (7d/30d/90d)
GET /api/az/analytics/category/growth?days=30&state=AZ

# Category trend
GET /api/az/analytics/category/trend/Concentrates?days=90

# Heatmap
GET /api/az/analytics/category/heatmap?metric=growth&periods=12

# Top movers (growing/declining)
GET /api/az/analytics/category/top-movers?limit=5&days=30

# Subcategory breakdown
GET /api/az/analytics/category/Edibles/subcategories
```

### Store Endpoints

```bash
# Store change summary
GET /api/az/analytics/store/101/summary

# Event log
GET /api/az/analytics/store/101/events?type=price_drop&days=7&limit=50

# New brands
GET /api/az/analytics/store/101/brands/new?days=30

# Lost brands
GET /api/az/analytics/store/101/brands/lost?days=30

# Product changes by type
GET /api/az/analytics/store/101/products/changes?type=added&days=7

# Category leaderboard
GET /api/az/analytics/store/leaderboard/Flower?limit=20

# Most active stores
GET /api/az/analytics/store/most-active?days=7&limit=10

# Compare two stores
GET /api/az/analytics/store/compare?store1=101&store2=102
```

### Brand Opportunity Endpoints

```bash
# Full opportunity analysis
GET /api/az/analytics/brand/Cookies/opportunity

# Market position summary
GET /api/az/analytics/brand/Cookies/position

# Get alerts
GET /api/az/analytics/alerts?brand=Cookies&type=competitive&unreadOnly=true

# Mark alerts read
POST /api/az/analytics/alerts/mark-read
Body: { "alertIds": [1, 2, 3] }
```

### Maintenance Endpoints

```bash
# Capture daily snapshots (run by scheduler)
POST /api/az/analytics/snapshots/capture

# Cache statistics
GET /api/az/analytics/cache/stats

# Clear cache (admin)
POST /api/az/analytics/cache/clear?pattern=price*
```

---

## Incremental Computation

Analytics are designed for real-time queries without full recomputation:

### Snapshot Strategy

1. **Raw Data**: `store_products` (current state)
2. **Historical**: `store_product_snapshots` (time-series)
3. **Aggregated**: `brand_snapshots`, `category_snapshots` (daily rollups)

### Window Calculations

```sql
-- 7-day window
WHERE crawled_at >= NOW() - INTERVAL '7 days'

-- 30-day window
WHERE crawled_at >= NOW() - INTERVAL '30 days'

-- 90-day window
WHERE crawled_at >= NOW() - INTERVAL '90 days'
```

### Materialized Views (Optional)

For heavy queries, create materialized views:

```sql
CREATE MATERIALIZED VIEW mv_brand_daily_metrics AS
SELECT
  DATE(sps.captured_at) as date,
  sp.brand_id,
  COUNT(DISTINCT sp.dispensary_id) as store_count,
  COUNT(*) as sku_count,
  AVG(sp.price_rec) as avg_price
FROM store_product_snapshots sps
JOIN store_products sp ON sps.store_product_id = sp.id
WHERE sps.captured_at >= NOW() - INTERVAL '90 days'
GROUP BY DATE(sps.captured_at), sp.brand_id;

-- Refresh daily
REFRESH MATERIALIZED VIEW CONCURRENTLY mv_brand_daily_metrics;
```

---

## Scheduled Jobs

### Daily Snapshot Capture

Trigger via cron or scheduler:

```bash
curl -X POST http://localhost:3010/api/az/analytics/snapshots/capture
```

This calls:
- `capture_brand_snapshots()` - Captures brand metrics
- `capture_category_snapshots()` - Captures category metrics
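If an in-process scheduler is preferred over system cron, a minimal sketch using the `node-cron` package (assuming it is installed; the 3:00 AM schedule is an example, not a prescribed time):

```typescript
import cron from 'node-cron';

// Run the snapshot capture every day at 3:00 AM server time.
cron.schedule('0 3 * * *', async () => {
  const res = await fetch(
    'http://localhost:3010/api/az/analytics/snapshots/capture',
    { method: 'POST' }
  );
  console.log('Snapshot capture:', res.status);
});
```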

### Cache Cleanup

Automatic cleanup every 5 minutes via in-memory timer.

For manual cleanup:
```bash
curl -X POST http://localhost:3010/api/az/analytics/cache/clear
```

---

## Extending Analytics (Future Phases)

### Phase 6: Intelligence Engine
- Automated alert generation
- Recommendation engine
- Price prediction

### Phase 7: Orders Integration
- Sales velocity analytics
- Reorder predictions
- Inventory turnover

### Phase 8: Advanced ML
- Demand forecasting
- Price elasticity modeling
- Customer segmentation

---

## Troubleshooting

### Common Issues

**1. Slow queries**
- Check cache stats: `GET /api/az/analytics/cache/stats`
- Increase cache TTL if data doesn't need real-time freshness
- Add indexes on frequently filtered columns

**2. Empty results**
- Verify data exists in source tables
- Check filter parameters (case-sensitive brand names)
- Verify state codes are valid

**3. Stale data**
- Run snapshot capture: `POST /api/az/analytics/snapshots/capture`
- Clear cache: `POST /api/az/analytics/cache/clear`

### Debugging

Enable query logging:
```typescript
// In service constructor
this.debug = process.env.ANALYTICS_DEBUG === 'true';
```
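A hedged sketch of what that flag might gate in practice (illustrative; the services' real logging may differ):

```typescript
import { Pool, QueryResult } from 'pg';

// Illustrative debug-gated query wrapper, not the services' actual code.
class DebuggablePool {
  private debug = process.env.ANALYTICS_DEBUG === 'true';
  constructor(private pool: Pool) {}

  async query(sql: string, params: unknown[] = []): Promise<QueryResult> {
    const start = Date.now();
    const result = await this.pool.query(sql, params);
    if (this.debug) {
      console.log(`[analytics] ${Date.now() - start}ms rows=${result.rowCount}`, sql);
    }
    return result;
  }
}
```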

---

## Data Contracts

### Price Trend Response
```typescript
interface PriceTrend {
  productId?: number;
  storeId?: number;
  brandName?: string;
  category?: string;
  dataPoints: Array<{
    date: string;
    minPrice: number | null;
    maxPrice: number | null;
    avgPrice: number | null;
    wholesalePrice: number | null;
    sampleSize: number;
  }>;
  summary: {
    currentAvg: number | null;
    previousAvg: number | null;
    changePercent: number | null;
    trend: 'up' | 'down' | 'stable';
    volatilityScore: number | null;
  };
}
```

### Brand Penetration Response
```typescript
interface BrandPenetration {
  brandName: string;
  totalStores: number;
  storesWithBrand: number;
  penetrationPercent: number;
  skuCount: number;
  avgPrice: number | null;
  priceRange: { min: number; max: number } | null;
  topCategories: Array<{ category: string; count: number }>;
  stateBreakdown?: Array<{ state: string; storeCount: number }>;
}
```

### Category Growth Response
```typescript
interface CategoryGrowth {
  category: string;
  currentCount: number;
  previousCount: number;
  growthPercent: number;
  growthTrend: 'up' | 'down' | 'stable';
  avgPrice: number | null;
  priceChange: number | null;
  topBrands: Array<{ brandName: string; count: number }>;
}
```

---

## Files Reference

| File | Purpose |
|------|---------|
| `src/dutchie-az/services/analytics/price-trends.ts` | Price analytics |
| `src/dutchie-az/services/analytics/penetration.ts` | Brand penetration |
| `src/dutchie-az/services/analytics/category-analytics.ts` | Category metrics |
| `src/dutchie-az/services/analytics/store-changes.ts` | Store event tracking |
| `src/dutchie-az/services/analytics/brand-opportunity.ts` | Competitive intel |
| `src/dutchie-az/services/analytics/cache.ts` | Caching layer |
| `src/dutchie-az/services/analytics/index.ts` | Module exports |
| `src/dutchie-az/routes/analytics.ts` | API routes (680 LOC) |
| `src/multi-state/state-query-service.ts` | Cross-state analytics |

---

## Analytics V2: Rec/Med State Segmentation

Phase 3 Enhancement: Enhanced analytics with recreational vs medical-only state analysis.

### V2 API Endpoints

All V2 endpoints are prefixed with `/api/analytics/v2`

#### V2 Price Analytics

```bash
# Price trends for a specific product
GET /api/analytics/v2/price/product/12345?window=30d

# Price by category and state (with rec/med segmentation)
GET /api/analytics/v2/price/category/Flower?state=AZ

# Price by brand and state
GET /api/analytics/v2/price/brand/Cookies?state=AZ

# Most volatile products
GET /api/analytics/v2/price/volatile?window=30d&limit=50&state=AZ

# Rec vs Med price comparison by category
GET /api/analytics/v2/price/rec-vs-med?category=Flower
```

#### V2 Brand Penetration

```bash
# Brand penetration metrics with state breakdown
GET /api/analytics/v2/brand/Cookies/penetration?window=30d

# Brand market position within categories
GET /api/analytics/v2/brand/Cookies/market-position?category=Flower&state=AZ

# Brand presence in rec vs med-only states
GET /api/analytics/v2/brand/Cookies/rec-vs-med

# Top brands by penetration
GET /api/analytics/v2/brand/top?limit=25&state=AZ

# Brands expanding or contracting
GET /api/analytics/v2/brand/expansion-contraction?window=30d&limit=25
```

#### V2 Category Analytics

```bash
# Category growth metrics
GET /api/analytics/v2/category/Flower/growth?window=30d

# Category growth trend over time
GET /api/analytics/v2/category/Flower/trend?window=30d

# Top brands in category
GET /api/analytics/v2/category/Flower/top-brands?limit=25&state=AZ

# All categories with metrics
GET /api/analytics/v2/category/all?state=AZ&limit=50

# Rec vs Med category comparison
GET /api/analytics/v2/category/rec-vs-med?category=Flower

# Fastest growing categories
GET /api/analytics/v2/category/fastest-growing?window=30d&limit=25
```

#### V2 Store Analytics

```bash
# Store change summary
GET /api/analytics/v2/store/101/summary?window=30d

# Product change events
GET /api/analytics/v2/store/101/events?window=7d&limit=100

# Store inventory composition
GET /api/analytics/v2/store/101/inventory

# Store price positioning vs market
GET /api/analytics/v2/store/101/price-position

# Most active stores by changes
GET /api/analytics/v2/store/most-active?window=7d&limit=25&state=AZ
```

#### V2 State Analytics

```bash
# State market summary
GET /api/analytics/v2/state/AZ/summary

# All states with coverage metrics
GET /api/analytics/v2/state/all

# Legal state breakdown (rec, med-only, no program)
GET /api/analytics/v2/state/legal-breakdown

# Rec vs Med pricing by category
GET /api/analytics/v2/state/rec-vs-med-pricing?category=Flower

# States with coverage gaps
GET /api/analytics/v2/state/coverage-gaps

# Cross-state pricing comparison
GET /api/analytics/v2/state/price-comparison
```

### V2 Services Architecture

```
src/services/analytics/
├── index.ts                     # Exports all V2 services
├── types.ts                     # Shared type definitions
├── PriceAnalyticsService.ts     # Price trends and volatility
├── BrandPenetrationService.ts   # Brand market presence
├── CategoryAnalyticsService.ts  # Category growth analysis
├── StoreAnalyticsService.ts     # Store change tracking
└── StateAnalyticsService.ts     # State-level analytics

src/routes/analytics-v2.ts       # V2 API route handlers
```

### Key V2 Features

1. **Rec/Med State Segmentation**: All analytics can be filtered and compared by legal status
2. **State Coverage Gaps**: Identify legal states with missing or stale data
3. **Cross-State Pricing**: Compare prices across recreational and medical-only markets
4. **Brand Footprint Analysis**: Track brand presence in rec vs med states
5. **Category Comparison**: Compare category performance by legal status

### V2 Migration Path

1. Run migration 052 for state cannabis flags:
```bash
psql "$DATABASE_URL" -f migrations/052_add_state_cannabis_flags.sql
```

2. Run migration 053 for analytics indexes:
```bash
psql "$DATABASE_URL" -f migrations/053_analytics_indexes.sql
```

3. Restart backend to pick up new routes

### V2 Response Examples

**Rec vs Med Price Comparison:**
```json
{
  "category": "Flower",
  "recreational": {
    "state_count": 15,
    "product_count": 12500,
    "avg_price": 35.50,
    "median_price": 32.00
  },
  "medical_only": {
    "state_count": 8,
    "product_count": 5200,
    "avg_price": 42.00,
    "median_price": 40.00
  },
  "price_diff_percent": -15.48
}
```

**Legal State Breakdown:**
```json
{
  "recreational_states": {
    "count": 24,
    "dispensary_count": 850,
    "product_count": 125000,
    "states": [
      { "code": "CA", "name": "California", "dispensary_count": 250 },
      { "code": "CO", "name": "Colorado", "dispensary_count": 150 }
    ]
  },
  "medical_only_states": {
    "count": 18,
    "dispensary_count": 320,
    "product_count": 45000,
    "states": [
      { "code": "FL", "name": "Florida", "dispensary_count": 120 }
    ]
  },
  "no_program_states": {
    "count": 9,
    "states": [
      { "code": "ID", "name": "Idaho" }
    ]
  }
}
```

---

*Phase 3 Analytics Engine - Fully Implemented*
*V2 Rec/Med State Analytics - Added December 2024*
backend/docs/ANALYTICS_V2_EXAMPLES.md (new file, 594 lines)
@@ -0,0 +1,594 @@
# Analytics V2 API Examples

## Overview

All endpoints are prefixed with `/api/analytics/v2`

### Filtering Options

**Time Windows:**
- `?window=7d` - Last 7 days
- `?window=30d` - Last 30 days (default)
- `?window=90d` - Last 90 days

**Legal Type Filtering:**
- `?legalType=recreational` - Recreational states only
- `?legalType=medical_only` - Medical-only states (not recreational)
- `?legalType=no_program` - States with no cannabis program

---

## 1. Price Analytics

### GET /price/product/:id

Get price trends for a specific store product.

**Request:**
```bash
GET /api/analytics/v2/price/product/12345?window=30d
```

**Response:**
```json
{
  "store_product_id": 12345,
  "product_name": "Blue Dream 3.5g",
  "brand_name": "Cookies",
  "category": "Flower",
  "dispensary_id": 101,
  "dispensary_name": "Green Leaf Dispensary",
  "state_code": "AZ",
  "data_points": [
    {
      "date": "2024-11-06",
      "price_rec": 45.00,
      "price_med": 40.00,
      "price_rec_special": null,
      "price_med_special": null,
      "is_on_special": false
    },
    {
      "date": "2024-11-07",
      "price_rec": 42.00,
      "price_med": 38.00,
      "price_rec_special": null,
      "price_med_special": null,
      "is_on_special": false
    }
  ],
  "summary": {
    "current_price": 42.00,
    "min_price": 40.00,
    "max_price": 48.00,
    "avg_price": 43.50,
    "price_change_count": 3,
    "volatility_percent": 8.2
  }
}
```

### GET /price/rec-vs-med

Get recreational vs medical-only price comparison by category.

**Request:**
```bash
GET /api/analytics/v2/price/rec-vs-med?category=Flower
```

**Response:**
```json
[
  {
    "category": "Flower",
    "rec_avg": 38.50,
    "rec_median": 35.00,
    "med_avg": 42.00,
    "med_median": 40.00
  },
  {
    "category": "Concentrates",
    "rec_avg": 45.00,
    "rec_median": 42.00,
    "med_avg": 48.00,
    "med_median": 45.00
  }
]
```

---

## 2. Brand Analytics

### GET /brand/:name/penetration

Get brand penetration metrics with state breakdown.

**Request:**
```bash
GET /api/analytics/v2/brand/Cookies/penetration?window=30d
```

**Response:**
```json
{
  "brand_name": "Cookies",
  "total_dispensaries": 125,
  "total_skus": 450,
  "avg_skus_per_dispensary": 3.6,
  "states_present": ["AZ", "CA", "CO", "NV", "MI"],
  "state_breakdown": [
    {
      "state_code": "CA",
      "state_name": "California",
      "legal_type": "recreational",
      "dispensary_count": 45,
      "sku_count": 180,
      "avg_skus_per_dispensary": 4.0,
      "market_share_percent": 12.5
    },
    {
      "state_code": "AZ",
      "state_name": "Arizona",
      "legal_type": "recreational",
      "dispensary_count": 32,
      "sku_count": 128,
      "avg_skus_per_dispensary": 4.0,
      "market_share_percent": 15.2
    }
  ],
  "penetration_trend": [
    {
      "date": "2024-11-01",
      "dispensary_count": 120,
      "new_dispensaries": 0,
      "dropped_dispensaries": 0
    },
    {
      "date": "2024-11-08",
      "dispensary_count": 123,
      "new_dispensaries": 3,
      "dropped_dispensaries": 0
    },
    {
      "date": "2024-11-15",
      "dispensary_count": 125,
      "new_dispensaries": 2,
      "dropped_dispensaries": 0
    }
  ]
}
```

### GET /brand/:name/rec-vs-med

Get brand presence in recreational vs medical-only states.

**Request:**
```bash
GET /api/analytics/v2/brand/Cookies/rec-vs-med
```

**Response:**
```json
{
  "brand_name": "Cookies",
  "rec_states_count": 4,
  "rec_states": ["AZ", "CA", "CO", "NV"],
  "rec_dispensary_count": 110,
  "rec_avg_skus": 3.8,
  "med_only_states_count": 2,
  "med_only_states": ["FL", "OH"],
  "med_only_dispensary_count": 15,
  "med_only_avg_skus": 2.5
}
```

---

## 3. Category Analytics

### GET /category/:name/growth

Get category growth metrics with state breakdown.

**Request:**
```bash
GET /api/analytics/v2/category/Flower/growth?window=30d
```

**Response:**
```json
{
  "category": "Flower",
  "current_sku_count": 5200,
  "current_dispensary_count": 320,
  "avg_price": 38.50,
  "growth_data": [
    {
      "date": "2024-11-01",
      "sku_count": 4800,
      "dispensary_count": 310,
      "avg_price": 39.00
    },
    {
      "date": "2024-11-15",
      "sku_count": 5000,
      "dispensary_count": 315,
      "avg_price": 38.75
    },
    {
      "date": "2024-12-01",
      "sku_count": 5200,
      "dispensary_count": 320,
      "avg_price": 38.50
    }
  ],
  "state_breakdown": [
    {
      "state_code": "CA",
      "state_name": "California",
      "legal_type": "recreational",
      "sku_count": 2100,
      "dispensary_count": 145,
      "avg_price": 36.00
    },
    {
      "state_code": "AZ",
      "state_name": "Arizona",
      "legal_type": "recreational",
      "sku_count": 950,
      "dispensary_count": 85,
      "avg_price": 40.00
    }
  ]
}
```

### GET /category/rec-vs-med

Get category comparison between recreational and medical-only states.

**Request:**
```bash
GET /api/analytics/v2/category/rec-vs-med
```

**Response:**
```json
[
  {
    "category": "Flower",
    "recreational": {
      "state_count": 15,
      "dispensary_count": 650,
      "sku_count": 12500,
      "avg_price": 35.50,
      "median_price": 32.00
    },
    "medical_only": {
      "state_count": 8,
      "dispensary_count": 220,
      "sku_count": 4200,
      "avg_price": 42.00,
      "median_price": 40.00
    },
    "price_diff_percent": -15.48
  },
  {
    "category": "Concentrates",
    "recreational": {
      "state_count": 15,
      "dispensary_count": 600,
      "sku_count": 8500,
      "avg_price": 42.00,
      "median_price": 40.00
    },
    "medical_only": {
      "state_count": 8,
      "dispensary_count": 200,
      "sku_count": 3100,
      "avg_price": 48.00,
      "median_price": 45.00
    },
    "price_diff_percent": -12.50
  }
]
```

---

## 4. Store Analytics

### GET /store/:id/summary

Get change summary for a store over a time window.

**Request:**
```bash
GET /api/analytics/v2/store/101/summary?window=30d
```

**Response:**
```json
{
  "dispensary_id": 101,
  "dispensary_name": "Green Leaf Dispensary",
  "state_code": "AZ",
  "window": "30d",
  "products_added": 45,
  "products_dropped": 12,
  "brands_added": ["Alien Labs", "Connected"],
  "brands_dropped": ["House Brand"],
  "price_changes": 156,
  "avg_price_change_percent": 3.2,
  "stock_in_events": 89,
  "stock_out_events": 34,
  "current_product_count": 512,
  "current_in_stock_count": 478
}
```

### GET /store/:id/events

Get recent product change events for a store.

**Request:**
```bash
GET /api/analytics/v2/store/101/events?window=7d&limit=50
```

**Response:**
```json
[
  {
    "store_product_id": 12345,
    "product_name": "Blue Dream 3.5g",
    "brand_name": "Cookies",
    "category": "Flower",
    "event_type": "price_change",
    "event_date": "2024-12-05T14:30:00.000Z",
    "old_value": "45.00",
    "new_value": "42.00"
  },
  {
    "store_product_id": 12346,
    "product_name": "OG Kush 1g",
    "brand_name": "Alien Labs",
    "category": "Flower",
    "event_type": "added",
    "event_date": "2024-12-04T10:00:00.000Z",
    "old_value": null,
    "new_value": null
  },
  {
    "store_product_id": 12300,
    "product_name": "Sour Diesel Cart",
    "brand_name": "Select",
    "category": "Vaporizers",
    "event_type": "stock_out",
    "event_date": "2024-12-03T16:45:00.000Z",
    "old_value": "true",
    "new_value": "false"
  }
]
```

---

## 5. State Analytics

### GET /state/:code/summary

Get market summary for a specific state with rec/med breakdown.

**Request:**
```bash
GET /api/analytics/v2/state/AZ/summary
```

**Response:**
```json
{
  "state_code": "AZ",
  "state_name": "Arizona",
  "legal_status": {
    "recreational_legal": true,
    "rec_year": 2020,
    "medical_legal": true,
    "med_year": 2010
  },
  "coverage": {
    "dispensary_count": 145,
    "product_count": 18500,
    "brand_count": 320,
    "category_count": 12,
    "snapshot_count": 2450000,
    "last_crawl_at": "2024-12-06T02:30:00.000Z"
  },
  "pricing": {
    "avg_price": 42.50,
    "median_price": 38.00,
    "min_price": 5.00,
    "max_price": 250.00
  },
  "top_categories": [
    { "category": "Flower", "count": 5200 },
    { "category": "Concentrates", "count": 3800 },
    { "category": "Vaporizers", "count": 2950 },
    { "category": "Edibles", "count": 2400 },
    { "category": "Pre-Rolls", "count": 1850 }
  ],
  "top_brands": [
    { "brand": "Cookies", "count": 450 },
    { "brand": "Alien Labs", "count": 380 },
    { "brand": "Connected", "count": 320 },
    { "brand": "Stiiizy", "count": 290 },
    { "brand": "Raw Garden", "count": 275 }
  ]
}
```

### GET /state/legal-breakdown

Get breakdown by legal status (recreational, medical-only, no program).

**Request:**
```bash
GET /api/analytics/v2/state/legal-breakdown
```

**Response:**
```json
{
  "recreational_states": {
    "count": 24,
    "dispensary_count": 850,
    "product_count": 125000,
    "snapshot_count": 15000000,
    "states": [
      { "code": "CA", "name": "California", "dispensary_count": 250 },
      { "code": "CO", "name": "Colorado", "dispensary_count": 150 },
      { "code": "AZ", "name": "Arizona", "dispensary_count": 145 },
      { "code": "MI", "name": "Michigan", "dispensary_count": 120 }
    ]
  },
  "medical_only_states": {
    "count": 18,
    "dispensary_count": 320,
    "product_count": 45000,
    "snapshot_count": 5000000,
    "states": [
      { "code": "FL", "name": "Florida", "dispensary_count": 120 },
      { "code": "OH", "name": "Ohio", "dispensary_count": 85 },
      { "code": "PA", "name": "Pennsylvania", "dispensary_count": 75 }
    ]
  },
  "no_program_states": {
    "count": 9,
    "states": [
      { "code": "ID", "name": "Idaho" },
      { "code": "WY", "name": "Wyoming" },
      { "code": "KS", "name": "Kansas" }
    ]
  }
}
```

### GET /state/recreational

Get list of recreational state codes.

**Request:**
```bash
GET /api/analytics/v2/state/recreational
```

**Response:**
```json
{
  "legal_type": "recreational",
  "states": ["AK", "AZ", "CA", "CO", "CT", "DE", "IL", "MA", "MD", "ME", "MI", "MN", "MO", "MT", "NJ", "NM", "NV", "NY", "OH", "OR", "RI", "VA", "VT", "WA"],
  "count": 24
}
```

### GET /state/medical-only

Get list of medical-only state codes (not recreational).

**Request:**
```bash
GET /api/analytics/v2/state/medical-only
```

**Response:**
```json
{
  "legal_type": "medical_only",
  "states": ["AR", "FL", "HI", "LA", "MS", "ND", "NH", "OK", "PA", "SD", "UT", "WV"],
  "count": 12
}
```

### GET /state/rec-vs-med-pricing

Get rec vs med price comparison by category.

**Request:**
```bash
GET /api/analytics/v2/state/rec-vs-med-pricing?category=Flower
```

**Response:**
```json
[
  {
    "category": "Flower",
    "recreational": {
      "state_count": 15,
      "product_count": 12500,
      "avg_price": 35.50,
      "median_price": 32.00
    },
    "medical_only": {
      "state_count": 8,
      "product_count": 5200,
      "avg_price": 42.00,
      "median_price": 40.00
    },
    "price_diff_percent": -15.48
  }
]
```

---

## How These Endpoints Support Portals

### Brand Portal Use Cases

1. **Track brand penetration**: Use `/brand/:name/penetration` to see how many stores carry the brand
2. **Compare rec vs med markets**: Use `/brand/:name/rec-vs-med` to understand footprint by legal status
3. **Identify expansion opportunities**: Use `/state/coverage-gaps` to find underserved markets
4. **Monitor pricing**: Use `/price/brand/:brand` to track pricing by state
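For example, a brand portal view might fetch penetration data like this (a minimal sketch; error handling and auth are assumptions):

```typescript
// Minimal client-side sketch for the brand portal; the relative base URL
// and lack of auth headers are assumptions about the deployment.
async function loadBrandPenetration(brand: string) {
  const res = await fetch(
    `/api/analytics/v2/brand/${encodeURIComponent(brand)}/penetration?window=30d`
  );
  if (!res.ok) throw new Error(`penetration request failed: ${res.status}`);
  return res.json(); // shape shown in the /brand/:name/penetration example above
}
```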

### Buyer Portal Use Cases

1. **Compare stores**: Use `/store/:id/summary` to see activity levels
2. **Track price changes**: Use `/store/:id/events` to monitor competitor pricing
3. **Analyze categories**: Use `/category/:name/growth` to identify trending products
4. **State-level insights**: Use `/state/:code/summary` for market overview

---

## Time Window Filtering

All time-based endpoints support the `window` query parameter:

| Value | Description |
|-------|-------------|
| `7d` | Last 7 days |
| `30d` | Last 30 days (default) |
| `90d` | Last 90 days |

The window affects:
- `store_product_snapshots.captured_at` for historical data
- `store_products.first_seen_at` / `last_seen_at` for product lifecycle
- `crawl_runs.started_at` for crawl-based metrics
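Server-side, the `window` parameter has to become a day count before it reaches SQL; a minimal sketch of that mapping (the helper name is illustrative):

```typescript
// Translate a window query param into a day count for SQL intervals.
// Unknown values fall back to the documented 30-day default.
function windowToDays(window?: string): number {
  switch (window) {
    case '7d':  return 7;
    case '90d': return 90;
    case '30d':
    default:    return 30;
  }
}

// e.g. pass windowToDays(req.query.window) as a parameter to
// `WHERE captured_at >= NOW() - ($1 || ' days')::interval`
```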

---

## Rec/Med Segmentation

All state-level endpoints automatically segment by:

- **Recreational**: `states.recreational_legal = TRUE`
- **Medical-only**: `states.medical_legal = TRUE AND states.recreational_legal = FALSE`
- **No program**: Both flags are FALSE or NULL

This segmentation appears in:
- `legal_type` field in responses
- State breakdown arrays
- Price comparison endpoints
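A sketch of how that segmentation could be expressed as a SQL predicate builder (illustrative; not the V2 services' actual code):

```typescript
// Map a legalType filter onto the state flags described above.
function legalTypeCondition(legalType?: string): string {
  switch (legalType) {
    case 'recreational':
      return 'states.recreational_legal = TRUE';
    case 'medical_only':
      return 'states.medical_legal = TRUE AND states.recreational_legal = FALSE';
    case 'no_program':
      return 'COALESCE(states.recreational_legal, FALSE) = FALSE ' +
             'AND COALESCE(states.medical_legal, FALSE) = FALSE';
    default:
      return 'TRUE'; // no filter applied
  }
}
```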
backend/migrations/037_dispensary_crawler_profiles.sql (new file, 90 lines)
@@ -0,0 +1,90 @@
-- Migration 037: Add per-store crawler profiles for Dutchie dispensaries
-- This enables per-store crawler configuration without changing shared logic
-- Phase 1: Schema only - no automatic behavior changes

-- Create the crawler profiles table
CREATE TABLE IF NOT EXISTS dispensary_crawler_profiles (
  id SERIAL PRIMARY KEY,
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,

  -- Human readable name for this profile
  profile_name VARCHAR(255) NOT NULL,

  -- High-level type, e.g. 'dutchie', 'treez', 'jane'
  crawler_type VARCHAR(50) NOT NULL,

  -- Optional key for mapping to a per-store crawler module later,
  -- e.g. 'curaleaf-dispensary-gilbert'
  profile_key VARCHAR(255),

  -- Generic configuration bucket; will hold selectors, URLs, flags, etc.
  config JSONB NOT NULL DEFAULT '{}'::jsonb,

  -- Execution hints (safe defaults; can be overridden in config if needed)
  timeout_ms INTEGER DEFAULT 30000,
  download_images BOOLEAN DEFAULT TRUE,
  track_stock BOOLEAN DEFAULT TRUE,

  version INTEGER DEFAULT 1,
  enabled BOOLEAN DEFAULT TRUE,

  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

-- Unique index on dispensary_id + profile_name
CREATE UNIQUE INDEX IF NOT EXISTS dispensary_crawler_profiles_unique_name
  ON dispensary_crawler_profiles (dispensary_id, profile_name);

-- Index for finding enabled profiles by type
CREATE INDEX IF NOT EXISTS idx_crawler_profiles_type_enabled
  ON dispensary_crawler_profiles (crawler_type, enabled);

-- Index for dispensary lookup
CREATE INDEX IF NOT EXISTS idx_crawler_profiles_dispensary
  ON dispensary_crawler_profiles (dispensary_id);

-- Add FK from dispensaries to active profile
DO $$
BEGIN
  IF NOT EXISTS (SELECT 1 FROM information_schema.columns
                 WHERE table_name = 'dispensaries'
                 AND column_name = 'active_crawler_profile_id') THEN
    ALTER TABLE dispensaries
      ADD COLUMN active_crawler_profile_id INTEGER NULL
      REFERENCES dispensary_crawler_profiles(id) ON DELETE SET NULL;
  END IF;
END $$;

-- Create index on the FK for faster joins
CREATE INDEX IF NOT EXISTS idx_dispensaries_active_profile
  ON dispensaries (active_crawler_profile_id)
  WHERE active_crawler_profile_id IS NOT NULL;

-- Create or replace trigger function for updated_at
CREATE OR REPLACE FUNCTION set_updated_at_timestamp()
RETURNS TRIGGER AS $$
BEGIN
  NEW.updated_at = NOW();
  RETURN NEW;
END;
$$ LANGUAGE plpgsql;

-- Add trigger to keep updated_at fresh (drop first if exists to avoid duplicates)
DROP TRIGGER IF EXISTS dispensary_crawler_profiles_set_timestamp ON dispensary_crawler_profiles;
CREATE TRIGGER dispensary_crawler_profiles_set_timestamp
  BEFORE UPDATE ON dispensary_crawler_profiles
  FOR EACH ROW EXECUTE PROCEDURE set_updated_at_timestamp();

-- Add comments for documentation
COMMENT ON TABLE dispensary_crawler_profiles IS 'Per-store crawler configuration profiles. Each dispensary can have multiple profiles but only one active at a time.';
COMMENT ON COLUMN dispensary_crawler_profiles.profile_name IS 'Human readable name for the profile, e.g. "Curaleaf Gilbert - Dutchie v1"';
COMMENT ON COLUMN dispensary_crawler_profiles.crawler_type IS 'The crawler implementation type: dutchie, treez, jane, sandbox, custom';
COMMENT ON COLUMN dispensary_crawler_profiles.profile_key IS 'Optional identifier for per-store crawler module mapping';
COMMENT ON COLUMN dispensary_crawler_profiles.config IS 'JSONB configuration for the crawler. Schema depends on crawler_type.';
COMMENT ON COLUMN dispensary_crawler_profiles.timeout_ms IS 'Request timeout in milliseconds (default 30000)';
COMMENT ON COLUMN dispensary_crawler_profiles.download_images IS 'Whether to download product images locally';
COMMENT ON COLUMN dispensary_crawler_profiles.track_stock IS 'Whether to track inventory/stock levels';
COMMENT ON COLUMN dispensary_crawler_profiles.version IS 'Profile version number for A/B testing or upgrades';
COMMENT ON COLUMN dispensary_crawler_profiles.enabled IS 'Whether this profile can be used (soft delete)';
COMMENT ON COLUMN dispensaries.active_crawler_profile_id IS 'FK to the currently active crawler profile for this dispensary';
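As a usage illustration only (not part of this migration), a backend insert into the new table might look like this with `pg`; the config shape and profile values are examples, since the schema deliberately depends on `crawler_type`:

```typescript
import { Pool } from 'pg';

// Assumes standard PG* env vars; the app's own pool wiring may differ.
const pool = new Pool();

// Hypothetical helper: register a dutchie profile for one dispensary.
async function createDutchieProfile(dispensaryId: number): Promise<number> {
  const { rows } = await pool.query(
    `INSERT INTO dispensary_crawler_profiles
       (dispensary_id, profile_name, crawler_type, config)
     VALUES ($1, $2, $3, $4)
     RETURNING id`,
    // pg serializes the object to JSON for the JSONB column.
    [dispensaryId, 'Dutchie v1', 'dutchie', { menuUrl: 'https://example.com/menu' }]
  );
  return rows[0].id;
}
```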
backend/migrations/038_profile_status_field.sql (new file, 84 lines)
@@ -0,0 +1,84 @@
-- Migration: Add status field to dispensary_crawler_profiles
-- This adds a proper status column for the crawler state machine
-- Status values: 'production', 'sandbox', 'needs_manual', 'disabled'

-- Add status column with default 'production' for existing profiles
ALTER TABLE dispensary_crawler_profiles
ADD COLUMN IF NOT EXISTS status VARCHAR(50) DEFAULT 'production';

-- Add next_retry_at column for sandbox retry scheduling
ALTER TABLE dispensary_crawler_profiles
ADD COLUMN IF NOT EXISTS next_retry_at TIMESTAMPTZ;

-- Add sandbox_attempt_count for quick lookup
ALTER TABLE dispensary_crawler_profiles
ADD COLUMN IF NOT EXISTS sandbox_attempt_count INTEGER DEFAULT 0;

-- Add last_sandbox_at for tracking
ALTER TABLE dispensary_crawler_profiles
ADD COLUMN IF NOT EXISTS last_sandbox_at TIMESTAMPTZ;

-- Create index for finding profiles by status
CREATE INDEX IF NOT EXISTS idx_crawler_profiles_status
ON dispensary_crawler_profiles(status) WHERE enabled = true;

-- Create index for finding profiles needing retry
CREATE INDEX IF NOT EXISTS idx_crawler_profiles_next_retry
ON dispensary_crawler_profiles(next_retry_at) WHERE enabled = true AND status = 'sandbox';

-- Add comment explaining status values
COMMENT ON COLUMN dispensary_crawler_profiles.status IS
'Crawler status: production (ready for regular crawls), sandbox (discovery mode), needs_manual (max retries exceeded), disabled (turned off)';

-- Update existing profiles to have a status based on config if present
UPDATE dispensary_crawler_profiles
SET status = COALESCE(config->>'status', 'production')
WHERE status IS NULL OR status = '';

-- Backfill sandbox_attempt_count from config
UPDATE dispensary_crawler_profiles
SET sandbox_attempt_count = COALESCE(
  jsonb_array_length(config->'sandboxAttempts'),
  0
)
WHERE config->'sandboxAttempts' IS NOT NULL;

-- Backfill next_retry_at from config
UPDATE dispensary_crawler_profiles
SET next_retry_at = (config->>'nextRetryAt')::timestamptz
WHERE config->>'nextRetryAt' IS NOT NULL;

-- Create view for crawler profile summary
CREATE OR REPLACE VIEW v_crawler_profile_summary AS
SELECT
  dcp.id,
  dcp.dispensary_id,
  d.name AS dispensary_name,
  d.city,
  d.menu_type,
  dcp.profile_name,
  dcp.profile_key,
  dcp.crawler_type,
  dcp.status,
  dcp.enabled,
  dcp.sandbox_attempt_count,
  dcp.next_retry_at,
  dcp.last_sandbox_at,
  dcp.created_at,
  dcp.updated_at,
  CASE
    WHEN dcp.profile_key IS NOT NULL THEN 'per-store'
    ELSE 'legacy'
  END AS crawler_mode,
  CASE
    WHEN dcp.status = 'production' THEN 'Ready'
    WHEN dcp.status = 'sandbox' AND dcp.next_retry_at <= NOW() THEN 'Retry Due'
    WHEN dcp.status = 'sandbox' THEN 'Waiting'
    WHEN dcp.status = 'needs_manual' THEN 'Needs Manual'
    WHEN dcp.status = 'disabled' THEN 'Disabled'
    ELSE 'Unknown'
  END AS status_display
FROM dispensary_crawler_profiles dcp
JOIN dispensaries d ON d.id = dcp.dispensary_id
WHERE dcp.enabled = true
ORDER BY dcp.status, dcp.updated_at DESC;
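
-- Illustrative usage (not part of this migration): the view above turns the
-- retry queue into a one-liner. A scheduler could poll for 'Retry Due'
-- profiles and enqueue sandbox runs for them.
SELECT id, dispensary_name, profile_name, next_retry_at
FROM v_crawler_profile_summary
WHERE status_display = 'Retry Due'
ORDER BY next_retry_at;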
73
backend/migrations/039_crawl_orchestration_traces.sql
Normal file
@@ -0,0 +1,73 @@
-- Migration: Create crawl_orchestration_traces table
-- Purpose: Store detailed step-by-step traces for every crawl orchestration run
-- This enables full visibility into per-store crawler behavior

CREATE TABLE IF NOT EXISTS crawl_orchestration_traces (
  id SERIAL PRIMARY KEY,
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
  run_id VARCHAR(255),          -- UUID or job ID for this crawl run
  profile_id INTEGER REFERENCES dispensary_crawler_profiles(id) ON DELETE SET NULL,
  profile_key VARCHAR(255),     -- e.g. "trulieve-scottsdale"
  crawler_module VARCHAR(255),  -- Full path to the .ts file loaded
  state_at_start VARCHAR(50),   -- sandbox, production, legacy, disabled
  state_at_end VARCHAR(50),     -- sandbox, production, needs_manual, etc.

  -- The trace: ordered array of step objects
  trace JSONB NOT NULL DEFAULT '[]'::jsonb,

  -- Summary metrics for quick querying
  total_steps INTEGER DEFAULT 0,
  duration_ms INTEGER,
  success BOOLEAN,
  error_message TEXT,
  products_found INTEGER,

  -- Timestamps
  started_at TIMESTAMPTZ DEFAULT NOW(),
  completed_at TIMESTAMPTZ,
  created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Index for quick lookup by dispensary
CREATE INDEX IF NOT EXISTS idx_traces_dispensary_id
ON crawl_orchestration_traces(dispensary_id);

-- Index for finding the latest trace per dispensary
CREATE INDEX IF NOT EXISTS idx_traces_dispensary_created
ON crawl_orchestration_traces(dispensary_id, created_at DESC);

-- Index for finding traces by run_id
CREATE INDEX IF NOT EXISTS idx_traces_run_id
ON crawl_orchestration_traces(run_id) WHERE run_id IS NOT NULL;

-- Index for finding traces by profile
CREATE INDEX IF NOT EXISTS idx_traces_profile_id
ON crawl_orchestration_traces(profile_id) WHERE profile_id IS NOT NULL;

-- Comment explaining the trace structure
COMMENT ON COLUMN crawl_orchestration_traces.trace IS
'Ordered array of step objects. Each step has:
{
  "step": 1,
  "action": "load_profile",
  "description": "Loading crawler profile for dispensary",
  "timestamp": 1701234567890,
  "duration_ms": 45,
  "input": { ... },
  "output": { ... },
  "what": "Description of what happened",
  "why": "Reason this step was taken",
  "where": "Code location / module",
  "how": "Method or approach used",
  "when": "ISO timestamp"
}';

-- View for easy access to the latest traces
CREATE OR REPLACE VIEW v_latest_crawl_traces AS
SELECT DISTINCT ON (dispensary_id)
  cot.*,
  d.name AS dispensary_name,
  d.city AS dispensary_city
FROM crawl_orchestration_traces cot
JOIN dispensaries d ON d.id = cot.dispensary_id
ORDER BY dispensary_id, cot.created_at DESC;
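
-- Illustrative usage (not part of this migration): because trace is a JSONB
-- array, individual steps can be unnested for debugging. This sketch expands
-- the latest trace for one dispensary into one row per step; the ID is a
-- placeholder.
SELECT t.dispensary_name,
       step->>'action'             AS action,
       step->>'what'               AS what,
       (step->>'duration_ms')::int AS duration_ms
FROM v_latest_crawl_traces t,
     jsonb_array_elements(t.trace) AS step
WHERE t.dispensary_id = 42  -- hypothetical ID
ORDER BY (step->>'step')::int;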
73
backend/migrations/040_dispensary_dba_name.sql
Normal file
@@ -0,0 +1,73 @@
-- Migration 040: Add dba_name column to dispensaries table
-- DBA (Doing Business As) name - the name the dispensary operates under,
-- which may differ from the legal entity name
-- This migration is idempotent - safe to run multiple times

-- Add dba_name column
DO $$
BEGIN
  IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensaries' AND column_name = 'dba_name') THEN
    ALTER TABLE dispensaries ADD COLUMN dba_name TEXT DEFAULT NULL;
  END IF;
END $$;

-- Add company_name column (legal entity name)
DO $$
BEGIN
  IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensaries' AND column_name = 'company_name') THEN
    ALTER TABLE dispensaries ADD COLUMN company_name TEXT DEFAULT NULL;
  END IF;
END $$;

-- Add azdhs_id for the Arizona Department of Health Services license number
DO $$
BEGIN
  IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensaries' AND column_name = 'azdhs_id') THEN
    ALTER TABLE dispensaries ADD COLUMN azdhs_id INTEGER DEFAULT NULL;
  END IF;
END $$;

-- Add phone column
DO $$
BEGIN
  IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensaries' AND column_name = 'phone') THEN
    ALTER TABLE dispensaries ADD COLUMN phone TEXT DEFAULT NULL;
  END IF;
END $$;

-- Add email column
DO $$
BEGIN
  IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensaries' AND column_name = 'email') THEN
    ALTER TABLE dispensaries ADD COLUMN email TEXT DEFAULT NULL;
  END IF;
END $$;

-- Add google_rating column
DO $$
BEGIN
  IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensaries' AND column_name = 'google_rating') THEN
    ALTER TABLE dispensaries ADD COLUMN google_rating NUMERIC(2,1) DEFAULT NULL;
  END IF;
END $$;

-- Add google_review_count column
DO $$
BEGIN
  IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensaries' AND column_name = 'google_review_count') THEN
    ALTER TABLE dispensaries ADD COLUMN google_review_count INTEGER DEFAULT NULL;
  END IF;
END $$;

-- Add comments for documentation
COMMENT ON COLUMN dispensaries.dba_name IS 'DBA (Doing Business As) name - the public-facing name the dispensary operates under';
COMMENT ON COLUMN dispensaries.company_name IS 'Legal entity/company name that owns the dispensary';
COMMENT ON COLUMN dispensaries.azdhs_id IS 'Arizona Department of Health Services license number';
COMMENT ON COLUMN dispensaries.phone IS 'Contact phone number';
COMMENT ON COLUMN dispensaries.email IS 'Contact email address';
COMMENT ON COLUMN dispensaries.google_rating IS 'Google Maps rating (1.0 to 5.0)';
COMMENT ON COLUMN dispensaries.google_review_count IS 'Number of Google reviews';

-- Create indexes for searching by dba_name and azdhs_id
CREATE INDEX IF NOT EXISTS idx_dispensaries_dba_name ON dispensaries (dba_name);
CREATE INDEX IF NOT EXISTS idx_dispensaries_azdhs_id ON dispensaries (azdhs_id);
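
-- Illustrative pattern note (not part of this migration): every DO block above
-- is the same idempotency guard. A future column can reuse the shape below;
-- "some_new_column" is a hypothetical name, not a planned column.
DO $$
BEGIN
  IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensaries' AND column_name = 'some_new_column') THEN
    ALTER TABLE dispensaries ADD COLUMN some_new_column TEXT DEFAULT NULL;
  END IF;
END $$;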
376
backend/migrations/041_cannaiq_canonical_schema.sql
Normal file
@@ -0,0 +1,376 @@
-- Migration 041: CannaiQ Canonical Schema
--
-- This migration adds the canonical CannaiQ schema tables and columns.
-- ALL CHANGES ARE ADDITIVE - NO DROPS, NO DELETES, NO TRUNCATES.
--
-- Run with: psql $CANNAIQ_DB_URL -f migrations/041_cannaiq_canonical_schema.sql
--
-- Tables created:
--   - states (new)
--   - chains (new)
--   - brands (new)
--   - store_products (new - normalized view of the current menu)
--   - store_product_snapshots (new - historical crawl data)
--   - crawl_runs (new - replaces/supplements dispensary_crawl_jobs)
--
-- Tables modified:
--   - dispensaries (add state_id, chain_id FKs)
--   - dispensary_crawler_profiles (add status, allow_autopromote, validated_at)
--   - crawl_orchestration_traces (add run_id FK)
--

-- =====================================================
-- 1) STATES TABLE
-- =====================================================
CREATE TABLE IF NOT EXISTS states (
  id SERIAL PRIMARY KEY,
  code VARCHAR(2) NOT NULL UNIQUE,
  name VARCHAR(100) NOT NULL,
  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

-- Insert known states
INSERT INTO states (code, name) VALUES
  ('AZ', 'Arizona'),
  ('CA', 'California'),
  ('CO', 'Colorado'),
  ('FL', 'Florida'),
  ('IL', 'Illinois'),
  ('MA', 'Massachusetts'),
  ('MD', 'Maryland'),
  ('MI', 'Michigan'),
  ('MO', 'Missouri'),
  ('NV', 'Nevada'),
  ('NJ', 'New Jersey'),
  ('NY', 'New York'),
  ('OH', 'Ohio'),
  ('OK', 'Oklahoma'),
  ('OR', 'Oregon'),
  ('PA', 'Pennsylvania'),
  ('WA', 'Washington')
ON CONFLICT (code) DO NOTHING;

COMMENT ON TABLE states IS 'US states where CannaiQ operates. Single source of truth for state codes.';
-- =====================================================
-- 2) CHAINS TABLE (retail groups)
-- =====================================================
CREATE TABLE IF NOT EXISTS chains (
  id SERIAL PRIMARY KEY,
  name VARCHAR(255) NOT NULL,
  slug VARCHAR(255) NOT NULL UNIQUE,
  website_url TEXT,
  logo_url TEXT,
  description TEXT,
  is_active BOOLEAN DEFAULT TRUE,
  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_chains_slug ON chains(slug);
CREATE INDEX IF NOT EXISTS idx_chains_active ON chains(is_active) WHERE is_active = TRUE;

COMMENT ON TABLE chains IS 'Retail chains/groups that own multiple dispensary locations (e.g., Curaleaf, Trulieve).';

-- =====================================================
-- 3) BRANDS TABLE (canonical brand catalog)
-- =====================================================
CREATE TABLE IF NOT EXISTS brands (
  id SERIAL PRIMARY KEY,
  name VARCHAR(255) NOT NULL,
  slug VARCHAR(255) NOT NULL UNIQUE,
  external_id VARCHAR(100),                 -- Provider-specific brand ID
  website_url TEXT,
  instagram_handle VARCHAR(100),
  logo_url TEXT,
  description TEXT,
  is_portfolio_brand BOOLEAN DEFAULT FALSE, -- TRUE if a brand we represent
  is_active BOOLEAN DEFAULT TRUE,
  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_brands_slug ON brands(slug);
CREATE INDEX IF NOT EXISTS idx_brands_external_id ON brands(external_id) WHERE external_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_brands_portfolio ON brands(is_portfolio_brand) WHERE is_portfolio_brand = TRUE;

COMMENT ON TABLE brands IS 'Canonical brand catalog. Brands may appear across multiple dispensaries.';
COMMENT ON COLUMN brands.is_portfolio_brand IS 'TRUE if this is a brand we represent/manage (vs third-party brand)';

-- =====================================================
-- 4) ADD state_id AND chain_id TO dispensaries
-- =====================================================
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS state_id INTEGER REFERENCES states(id);
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS chain_id INTEGER REFERENCES chains(id);

-- NOTE: state_id backfill is done by the ETL script (042_legacy_import.ts), not this migration.

CREATE INDEX IF NOT EXISTS idx_dispensaries_state_id ON dispensaries(state_id);
CREATE INDEX IF NOT EXISTS idx_dispensaries_chain_id ON dispensaries(chain_id) WHERE chain_id IS NOT NULL;

COMMENT ON COLUMN dispensaries.state_id IS 'FK to states table. Canonical state reference.';
COMMENT ON COLUMN dispensaries.chain_id IS 'FK to chains table. NULL if independent dispensary.';
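
-- Illustrative sketch (not part of this migration): the ETL script referenced
-- above performs the backfill; in plain SQL it would look roughly like this,
-- assuming dispensaries.state still holds the two-letter code.
UPDATE dispensaries d
SET state_id = s.id
FROM states s
WHERE d.state_id IS NULL
  AND UPPER(d.state) = s.code;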
-- =====================================================
-- 5) STORE_PRODUCTS TABLE (current menu state)
-- =====================================================
-- This is the normalized "what is currently on the menu" table.
-- It supplements dutchie_products with a provider-agnostic structure.

CREATE TABLE IF NOT EXISTS store_products (
  id SERIAL PRIMARY KEY,
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
  product_id INTEGER REFERENCES products(id) ON DELETE SET NULL, -- Link to canonical product
  brand_id INTEGER REFERENCES brands(id) ON DELETE SET NULL,     -- Link to canonical brand

  -- Provider-specific identifiers
  provider VARCHAR(50) NOT NULL DEFAULT 'dutchie', -- dutchie, treez, jane, etc.
  provider_product_id VARCHAR(100),                -- Platform-specific product ID
  provider_brand_id VARCHAR(100),                  -- Platform-specific brand ID

  -- Raw data from the platform (not normalized)
  name_raw VARCHAR(500) NOT NULL,
  brand_name_raw VARCHAR(255),
  category_raw VARCHAR(100),
  subcategory_raw VARCHAR(100),

  -- Pricing
  price_rec NUMERIC(10,2),
  price_med NUMERIC(10,2),
  price_rec_special NUMERIC(10,2),
  price_med_special NUMERIC(10,2),
  is_on_special BOOLEAN DEFAULT FALSE,
  special_name TEXT,
  discount_percent NUMERIC(5,2),

  -- Inventory
  is_in_stock BOOLEAN DEFAULT TRUE,
  stock_quantity INTEGER,
  stock_status VARCHAR(50) DEFAULT 'in_stock',

  -- Potency
  thc_percent NUMERIC(5,2),
  cbd_percent NUMERIC(5,2),

  -- Images
  image_url TEXT,
  local_image_path TEXT,

  -- Timestamps
  first_seen_at TIMESTAMPTZ DEFAULT NOW(),
  last_seen_at TIMESTAMPTZ DEFAULT NOW(),
  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW(),

  UNIQUE(dispensary_id, provider, provider_product_id)
);

CREATE INDEX IF NOT EXISTS idx_store_products_dispensary ON store_products(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_store_products_product ON store_products(product_id) WHERE product_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_store_products_brand ON store_products(brand_id) WHERE brand_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_store_products_provider ON store_products(provider);
CREATE INDEX IF NOT EXISTS idx_store_products_in_stock ON store_products(dispensary_id, is_in_stock);
CREATE INDEX IF NOT EXISTS idx_store_products_special ON store_products(dispensary_id, is_on_special) WHERE is_on_special = TRUE;
CREATE INDEX IF NOT EXISTS idx_store_products_last_seen ON store_products(last_seen_at DESC);

COMMENT ON TABLE store_products IS 'Current state of products on each dispensary menu. Provider-agnostic.';
COMMENT ON COLUMN store_products.product_id IS 'FK to canonical products table. NULL if not yet mapped.';
COMMENT ON COLUMN store_products.brand_id IS 'FK to canonical brands table. NULL if not yet mapped.';
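
-- Illustrative usage (not part of this migration): the UNIQUE constraint on
-- (dispensary_id, provider, provider_product_id) is what makes crawler upserts
-- possible. A hydration step could write current menu state like this; the
-- literal values are placeholders, and provider_product_id must be non-null
-- for the conflict clause to fire.
INSERT INTO store_products (dispensary_id, provider, provider_product_id, name_raw, price_rec, is_in_stock)
VALUES (42, 'dutchie', 'abc123', 'Example Flower 3.5g', 35.00, TRUE)
ON CONFLICT (dispensary_id, provider, provider_product_id)
DO UPDATE SET
  name_raw     = EXCLUDED.name_raw,
  price_rec    = EXCLUDED.price_rec,
  is_in_stock  = EXCLUDED.is_in_stock,
  last_seen_at = NOW(),
  updated_at   = NOW();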
-- =====================================================
-- 6) STORE_PRODUCT_SNAPSHOTS TABLE (historical data)
-- =====================================================
-- This is the critical time-series table for analytics.
-- One row per product per crawl.

CREATE TABLE IF NOT EXISTS store_product_snapshots (
  id SERIAL PRIMARY KEY,
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
  store_product_id INTEGER REFERENCES store_products(id) ON DELETE SET NULL,
  product_id INTEGER REFERENCES products(id) ON DELETE SET NULL,

  -- Provider info
  provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',
  provider_product_id VARCHAR(100),

  -- Link to the crawl run
  crawl_run_id INTEGER, -- FK added after the crawl_runs table is created

  -- Capture timestamp
  captured_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

  -- Raw data from the platform
  name_raw VARCHAR(500),
  brand_name_raw VARCHAR(255),
  category_raw VARCHAR(100),
  subcategory_raw VARCHAR(100),

  -- Pricing at time of capture
  price_rec NUMERIC(10,2),
  price_med NUMERIC(10,2),
  price_rec_special NUMERIC(10,2),
  price_med_special NUMERIC(10,2),
  is_on_special BOOLEAN DEFAULT FALSE,
  discount_percent NUMERIC(5,2),

  -- Inventory at time of capture
  is_in_stock BOOLEAN DEFAULT TRUE,
  stock_quantity INTEGER,
  stock_status VARCHAR(50) DEFAULT 'in_stock',

  -- Potency at time of capture
  thc_percent NUMERIC(5,2),
  cbd_percent NUMERIC(5,2),

  -- Image URL at time of capture
  image_url TEXT,

  -- Full raw response for debugging
  raw_data JSONB,

  created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_snapshots_dispensary_captured ON store_product_snapshots(dispensary_id, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_product_captured ON store_product_snapshots(product_id, captured_at DESC) WHERE product_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_store_product ON store_product_snapshots(store_product_id) WHERE store_product_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_crawl_run ON store_product_snapshots(crawl_run_id) WHERE crawl_run_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_captured_at ON store_product_snapshots(captured_at DESC);

COMMENT ON TABLE store_product_snapshots IS 'Historical crawl data. One row per product per crawl. NEVER DELETE.';
COMMENT ON COLUMN store_product_snapshots.captured_at IS 'When this snapshot was captured (crawl time).';
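
-- Illustrative usage (not part of this migration): because snapshots are
-- append-only, price history per product is a straight time-series query.
-- The ID is a placeholder.
SELECT captured_at::date AS day,
       MIN(price_rec)    AS low,
       MAX(price_rec)    AS high
FROM store_product_snapshots
WHERE store_product_id = 42  -- hypothetical ID
  AND captured_at > NOW() - INTERVAL '90 days'
GROUP BY 1
ORDER BY 1;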
-- =====================================================
-- 7) CRAWL_RUNS TABLE (job execution records)
-- =====================================================
CREATE TABLE IF NOT EXISTS crawl_runs (
  id SERIAL PRIMARY KEY,
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,

  -- Provider
  provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',

  -- Execution times
  started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  finished_at TIMESTAMPTZ,
  duration_ms INTEGER,

  -- Status
  status VARCHAR(20) NOT NULL DEFAULT 'running', -- running, success, failed, partial
  error_message TEXT,

  -- Results
  products_found INTEGER DEFAULT 0,
  products_new INTEGER DEFAULT 0,
  products_updated INTEGER DEFAULT 0,
  snapshots_written INTEGER DEFAULT 0,

  -- Metadata
  worker_id VARCHAR(100),
  trigger_type VARCHAR(50) DEFAULT 'scheduled', -- scheduled, manual, api
  metadata JSONB DEFAULT '{}',

  created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary ON crawl_runs(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_status ON crawl_runs(status);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_started ON crawl_runs(started_at DESC);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary_started ON crawl_runs(dispensary_id, started_at DESC);

COMMENT ON TABLE crawl_runs IS 'Each crawl execution. Links to snapshots and traces.';

-- Add the FK from store_product_snapshots to crawl_runs
DO $$
BEGIN
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.table_constraints
    WHERE constraint_name = 'store_product_snapshots_crawl_run_id_fkey'
  ) THEN
    ALTER TABLE store_product_snapshots
    ADD CONSTRAINT store_product_snapshots_crawl_run_id_fkey
    FOREIGN KEY (crawl_run_id) REFERENCES crawl_runs(id) ON DELETE SET NULL;
  END IF;
END $$;
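
-- Illustrative lifecycle (not part of this migration): a worker would open a
-- run, write snapshots that reference it, then close it out. All literal
-- values here are placeholders.
INSERT INTO crawl_runs (dispensary_id, provider, trigger_type, worker_id)
VALUES (42, 'dutchie', 'scheduled', 'worker-1')
RETURNING id;  -- use this id as crawl_run_id on the snapshots

UPDATE crawl_runs
SET status = 'success',
    finished_at = NOW(),
    duration_ms = (EXTRACT(EPOCH FROM (NOW() - started_at)) * 1000)::int,
    products_found = 123,
    snapshots_written = 123
WHERE id = 1;  -- hypothetical run id from the insert above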
-- =====================================================
-- 8) UPDATE crawl_orchestration_traces
-- =====================================================
-- Add the crawl_run_id FK if it does not exist
ALTER TABLE crawl_orchestration_traces
ADD COLUMN IF NOT EXISTS crawl_run_id INTEGER REFERENCES crawl_runs(id) ON DELETE SET NULL;

CREATE INDEX IF NOT EXISTS idx_traces_crawl_run
ON crawl_orchestration_traces(crawl_run_id)
WHERE crawl_run_id IS NOT NULL;

-- =====================================================
-- 9) UPDATE dispensary_crawler_profiles
-- =====================================================
-- Add missing columns from the canonical schema
ALTER TABLE dispensary_crawler_profiles
ADD COLUMN IF NOT EXISTS status VARCHAR(50) DEFAULT 'sandbox';

ALTER TABLE dispensary_crawler_profiles
ADD COLUMN IF NOT EXISTS allow_autopromote BOOLEAN DEFAULT FALSE;

ALTER TABLE dispensary_crawler_profiles
ADD COLUMN IF NOT EXISTS validated_at TIMESTAMPTZ;

CREATE INDEX IF NOT EXISTS idx_profiles_status
ON dispensary_crawler_profiles(status);

COMMENT ON COLUMN dispensary_crawler_profiles.status IS 'Profile status: sandbox, production, needs_manual, disabled';
COMMENT ON COLUMN dispensary_crawler_profiles.allow_autopromote IS 'Whether this profile can be auto-promoted from sandbox to production';
COMMENT ON COLUMN dispensary_crawler_profiles.validated_at IS 'When this profile was last validated as working';

-- =====================================================
-- 10) VIEWS FOR BACKWARD COMPATIBILITY
-- =====================================================

-- View to get the latest snapshot per store product
CREATE OR REPLACE VIEW v_latest_store_snapshots AS
SELECT DISTINCT ON (dispensary_id, provider_product_id)
  sps.*
FROM store_product_snapshots sps
ORDER BY dispensary_id, provider_product_id, captured_at DESC;

-- View to get a crawl run summary per dispensary
CREATE OR REPLACE VIEW v_dispensary_crawl_summary AS
SELECT
  d.id AS dispensary_id,
  d.name AS dispensary_name,
  d.city,
  d.state,
  COUNT(DISTINCT sp.id) AS current_product_count,
  COUNT(DISTINCT sp.id) FILTER (WHERE sp.is_in_stock) AS in_stock_count,
  COUNT(DISTINCT sp.id) FILTER (WHERE sp.is_on_special) AS on_special_count,
  MAX(cr.finished_at) AS last_crawl_at,
  (SELECT status FROM crawl_runs WHERE dispensary_id = d.id ORDER BY started_at DESC LIMIT 1) AS last_crawl_status
FROM dispensaries d
LEFT JOIN store_products sp ON sp.dispensary_id = d.id
LEFT JOIN crawl_runs cr ON cr.dispensary_id = d.id
GROUP BY d.id, d.name, d.city, d.state;

-- =====================================================
-- 11) COMMENTS
-- =====================================================
COMMENT ON TABLE states IS 'Canonical list of US states. Use state_id FK in dispensaries.';
COMMENT ON TABLE chains IS 'Retail chains (multi-location operators).';
COMMENT ON TABLE brands IS 'Canonical brand catalog across all providers.';
COMMENT ON TABLE store_products IS 'Current menu state per dispensary. Provider-agnostic.';
COMMENT ON TABLE store_product_snapshots IS 'Historical price/stock data. One row per product per crawl.';
COMMENT ON TABLE crawl_runs IS 'Crawl execution records. Links snapshots to runs.';

-- =====================================================
-- MIGRATION COMPLETE
-- =====================================================
--
-- Next steps (manual - not in this migration):
-- 1. Populate chains table from known retail groups
-- 2. Populate brands table from existing dutchie_products.brand_name
-- 3. Migrate data from dutchie_products → store_products
-- 4. Migrate data from dutchie_product_snapshots → store_product_snapshots
-- 5. Link dispensaries.chain_id to chains where applicable
--
50
backend/migrations/043_add_states_table.sql
Normal file
@@ -0,0 +1,50 @@
-- Migration 043: Add States Table
--
-- Creates the states table if it does not exist.
-- Safe to run multiple times (idempotent).
--
-- Run with:
--   CANNAIQ_DB_URL="postgresql://..." psql $CANNAIQ_DB_URL -f migrations/043_add_states_table.sql

-- =====================================================
-- 1) CREATE STATES TABLE
-- =====================================================
CREATE TABLE IF NOT EXISTS states (
  id SERIAL PRIMARY KEY,
  code TEXT NOT NULL UNIQUE,
  name TEXT NOT NULL,
  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

-- =====================================================
-- 2) INSERT CORE US STATES
-- =====================================================
INSERT INTO states (code, name) VALUES
  ('AZ', 'Arizona'),
  ('CA', 'California'),
  ('CO', 'Colorado'),
  ('FL', 'Florida'),
  ('IL', 'Illinois'),
  ('MA', 'Massachusetts'),
  ('MD', 'Maryland'),
  ('MI', 'Michigan'),
  ('MO', 'Missouri'),
  ('NV', 'Nevada'),
  ('NJ', 'New Jersey'),
  ('NY', 'New York'),
  ('OH', 'Ohio'),
  ('OK', 'Oklahoma'),
  ('OR', 'Oregon'),
  ('PA', 'Pennsylvania'),
  ('WA', 'Washington')
ON CONFLICT (code) DO NOTHING;

-- =====================================================
-- 3) ADD INDEX
-- =====================================================
-- NOTE: code is already UNIQUE above, so this index is redundant but harmless.
CREATE INDEX IF NOT EXISTS idx_states_code ON states(code);

-- =====================================================
-- DONE
-- =====================================================
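
-- Illustrative usage (not part of this migration): since code is UNIQUE,
-- state ids can be resolved inline wherever a two-letter code is on hand.
SELECT id FROM states WHERE code = 'AZ';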
45
backend/migrations/044_add_provider_detection_data.sql
Normal file
@@ -0,0 +1,45 @@
-- Migration 044: Add provider_detection_data column to dispensaries
--
-- This column stores detection metadata for menu provider discovery.
-- Used by menu-detection.ts and discovery.ts to track:
--   - Detected provider type
--   - Resolution attempts
--   - Error messages
--   - not_crawlable flag
--
-- Run with: psql $CANNAIQ_DB_URL -f migrations/044_add_provider_detection_data.sql
--
-- ALL CHANGES ARE ADDITIVE - NO DROPS, NO DELETES, NO TRUNCATES.

-- Add provider_detection_data to the dispensaries table
DO $$
BEGIN
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensaries' AND column_name = 'provider_detection_data'
  ) THEN
    ALTER TABLE dispensaries
    ADD COLUMN provider_detection_data JSONB DEFAULT NULL;

    RAISE NOTICE 'Added provider_detection_data column to dispensaries table';
  ELSE
    RAISE NOTICE 'provider_detection_data column already exists on dispensaries table';
  END IF;
END;
$$ LANGUAGE plpgsql;

-- Add index for querying by the not_crawlable flag
CREATE INDEX IF NOT EXISTS idx_dispensaries_provider_detection_not_crawlable
ON dispensaries ((provider_detection_data->>'not_crawlable'))
WHERE provider_detection_data IS NOT NULL;

-- Add index for querying by the detected provider
CREATE INDEX IF NOT EXISTS idx_dispensaries_provider_detection_provider
ON dispensaries ((provider_detection_data->>'detected_provider'))
WHERE provider_detection_data IS NOT NULL;

COMMENT ON COLUMN dispensaries.provider_detection_data IS 'JSONB metadata from menu provider detection. Keys: detected_provider, resolution_error, not_crawlable, detection_timestamp';

-- =====================================================
-- MIGRATION COMPLETE
-- =====================================================
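
-- Illustrative usage (not part of this migration): the expression indexes
-- above only help queries that use the same expression, e.g.:
SELECT id, name
FROM dispensaries
WHERE provider_detection_data->>'detected_provider' = 'dutchie';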
27
backend/migrations/045_add_image_columns.sql
Normal file
@@ -0,0 +1,27 @@
-- Migration 045: Add thumbnail_url columns to canonical tables
--
-- NOTE: image_url already exists in both tables from migration 041.
-- This migration adds thumbnail_url for cached thumbnail images.

DO $$
BEGIN
  -- Add thumbnail_url to store_products if it does not exist
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'store_products' AND column_name = 'thumbnail_url'
  ) THEN
    ALTER TABLE store_products ADD COLUMN thumbnail_url TEXT NULL;
  END IF;

  -- Add thumbnail_url to store_product_snapshots if it does not exist
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'store_product_snapshots' AND column_name = 'thumbnail_url'
  ) THEN
    ALTER TABLE store_product_snapshots ADD COLUMN thumbnail_url TEXT NULL;
  END IF;
END;
$$ LANGUAGE plpgsql;

COMMENT ON COLUMN store_products.thumbnail_url IS 'URL to cached thumbnail image';
COMMENT ON COLUMN store_product_snapshots.thumbnail_url IS 'URL to cached thumbnail image at time of snapshot';
351
backend/migrations/046_crawler_reliability.sql
Normal file
@@ -0,0 +1,351 @@
-- Migration 046: Crawler Reliability & Stabilization
-- Phase 1: Add fields for error taxonomy, retry management, and self-healing

-- ============================================================
-- PART 1: Error Taxonomy - Standardized error codes
-- ============================================================

-- Create enum for standardized error codes
DO $$
BEGIN
  IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'crawl_error_code') THEN
    CREATE TYPE crawl_error_code AS ENUM (
      'SUCCESS',
      'RATE_LIMITED',
      'BLOCKED_PROXY',
      'HTML_CHANGED',
      'TIMEOUT',
      'AUTH_FAILED',
      'NETWORK_ERROR',
      'PARSE_ERROR',
      'NO_PRODUCTS',
      'UNKNOWN_ERROR'
    );
  END IF;
END;
$$ LANGUAGE plpgsql;
-- ============================================================
-- PART 2: Dispensary Crawl Configuration
-- ============================================================

-- Add crawl config columns to dispensaries
DO $$
BEGIN
  -- Crawl frequency (minutes between crawls)
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensaries' AND column_name = 'crawl_frequency_minutes'
  ) THEN
    ALTER TABLE dispensaries ADD COLUMN crawl_frequency_minutes INTEGER DEFAULT 240;
  END IF;

  -- Max retries per crawl
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensaries' AND column_name = 'max_retries'
  ) THEN
    ALTER TABLE dispensaries ADD COLUMN max_retries INTEGER DEFAULT 3;
  END IF;

  -- Current proxy ID
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensaries' AND column_name = 'current_proxy_id'
  ) THEN
    ALTER TABLE dispensaries ADD COLUMN current_proxy_id INTEGER NULL;
  END IF;

  -- Current user agent
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensaries' AND column_name = 'current_user_agent'
  ) THEN
    ALTER TABLE dispensaries ADD COLUMN current_user_agent TEXT NULL;
  END IF;

  -- Next scheduled run
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensaries' AND column_name = 'next_crawl_at'
  ) THEN
    ALTER TABLE dispensaries ADD COLUMN next_crawl_at TIMESTAMPTZ NULL;
  END IF;

  -- Last successful crawl
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensaries' AND column_name = 'last_success_at'
  ) THEN
    ALTER TABLE dispensaries ADD COLUMN last_success_at TIMESTAMPTZ NULL;
  END IF;

  -- Last error code (using text for flexibility, validated in the app)
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensaries' AND column_name = 'last_error_code'
  ) THEN
    ALTER TABLE dispensaries ADD COLUMN last_error_code TEXT NULL;
  END IF;

  -- Crawl status: active, degraded, paused, failed
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensaries' AND column_name = 'crawl_status'
  ) THEN
    ALTER TABLE dispensaries ADD COLUMN crawl_status TEXT DEFAULT 'active';
  END IF;

  -- Backoff multiplier (increases with failures)
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensaries' AND column_name = 'backoff_multiplier'
  ) THEN
    ALTER TABLE dispensaries ADD COLUMN backoff_multiplier NUMERIC(4,2) DEFAULT 1.0;
  END IF;

  -- Total attempt count (lifetime)
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensaries' AND column_name = 'total_attempts'
  ) THEN
    ALTER TABLE dispensaries ADD COLUMN total_attempts INTEGER DEFAULT 0;
  END IF;

  -- Total success count (lifetime)
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensaries' AND column_name = 'total_successes'
  ) THEN
    ALTER TABLE dispensaries ADD COLUMN total_successes INTEGER DEFAULT 0;
  END IF;
END;
$$ LANGUAGE plpgsql;
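
-- Illustrative scheduling sketch (not part of this migration): after a failed
-- attempt, the next crawl can be pushed out by the backoff multiplier. The
-- cap of 8.0 is an assumed policy, not defined here; note that both SET
-- expressions read the pre-update multiplier, which is usually what you want.
UPDATE dispensaries
SET backoff_multiplier = LEAST(backoff_multiplier * 2, 8.0),
    next_crawl_at = NOW() + (crawl_frequency_minutes * backoff_multiplier) * INTERVAL '1 minute'
WHERE id = 42;  -- hypothetical dispensary id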
-- ============================================================
-- PART 3: Enhanced Job Tracking
-- ============================================================

-- Add columns to dispensary_crawl_jobs
DO $$
BEGIN
  -- Error code
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'error_code'
  ) THEN
    ALTER TABLE dispensary_crawl_jobs ADD COLUMN error_code TEXT NULL;
  END IF;

  -- Proxy used for this job
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'proxy_used'
  ) THEN
    ALTER TABLE dispensary_crawl_jobs ADD COLUMN proxy_used TEXT NULL;
  END IF;

  -- User agent used for this job
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'user_agent_used'
  ) THEN
    ALTER TABLE dispensary_crawl_jobs ADD COLUMN user_agent_used TEXT NULL;
  END IF;

  -- Attempt number for this job
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'attempt_number'
  ) THEN
    ALTER TABLE dispensary_crawl_jobs ADD COLUMN attempt_number INTEGER DEFAULT 1;
  END IF;

  -- Backoff delay applied (ms)
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'backoff_delay_ms'
  ) THEN
    ALTER TABLE dispensary_crawl_jobs ADD COLUMN backoff_delay_ms INTEGER DEFAULT 0;
  END IF;

  -- HTTP status code received
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'http_status'
  ) THEN
    ALTER TABLE dispensary_crawl_jobs ADD COLUMN http_status INTEGER NULL;
  END IF;

  -- Response time (ms)
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'response_time_ms'
  ) THEN
    ALTER TABLE dispensary_crawl_jobs ADD COLUMN response_time_ms INTEGER NULL;
  END IF;
END;
$$ LANGUAGE plpgsql;

-- ============================================================
-- PART 4: Crawl History Table (for detailed tracking)
-- ============================================================

CREATE TABLE IF NOT EXISTS crawl_attempts (
  id SERIAL PRIMARY KEY,
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id),
  job_id INTEGER REFERENCES dispensary_crawl_jobs(id),

  -- Timing
  started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  finished_at TIMESTAMPTZ,
  duration_ms INTEGER,

  -- Result
  error_code TEXT NOT NULL DEFAULT 'UNKNOWN_ERROR',
  error_message TEXT,
  http_status INTEGER,

  -- Context
  attempt_number INTEGER NOT NULL DEFAULT 1,
  proxy_used TEXT,
  user_agent_used TEXT,

  -- Metrics
  products_found INTEGER DEFAULT 0,
  products_upserted INTEGER DEFAULT 0,
  snapshots_created INTEGER DEFAULT 0,

  -- Metadata
  metadata JSONB,
  created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Indexes for quick lookups
CREATE INDEX IF NOT EXISTS idx_crawl_attempts_dispensary_id ON crawl_attempts(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_crawl_attempts_error_code ON crawl_attempts(error_code);
CREATE INDEX IF NOT EXISTS idx_crawl_attempts_started_at ON crawl_attempts(started_at DESC);

-- ============================================================
-- PART 5: Views for Monitoring
-- ============================================================

-- Drop the existing view if it exists
DROP VIEW IF EXISTS v_crawler_status;

-- Crawler status view with all reliability fields
CREATE VIEW v_crawler_status AS
SELECT
  d.id,
  d.name,
  d.slug,
  d.menu_type,
  d.platform_dispensary_id,
  d.crawl_status,
  d.consecutive_failures,
  d.last_crawl_at,
  d.last_success_at,
  d.last_failure_at,
  d.last_error_code,
  d.next_crawl_at,
  d.crawl_frequency_minutes,
  d.max_retries,
  d.current_proxy_id,
  d.current_user_agent,
  d.backoff_multiplier,
  d.total_attempts,
  d.total_successes,
  d.product_count,
  CASE
    WHEN d.total_attempts > 0
    THEN ROUND(d.total_successes::NUMERIC / d.total_attempts * 100, 1)
    ELSE 0
  END AS success_rate,
  CASE
    WHEN d.crawl_status = 'failed' THEN 'FAILED'
    WHEN d.crawl_status = 'paused' THEN 'PAUSED'
    WHEN d.crawl_status = 'degraded' THEN 'DEGRADED'
    WHEN d.menu_type IS NULL OR d.menu_type = 'unknown' THEN 'NEEDS_DETECTION'
    WHEN d.platform_dispensary_id IS NULL THEN 'NEEDS_PLATFORM_ID'
    WHEN d.next_crawl_at IS NULL THEN 'NOT_SCHEDULED'
    WHEN d.next_crawl_at <= NOW() THEN 'DUE'
    ELSE 'SCHEDULED'
  END AS schedule_status,
  d.failed_at,
  d.failure_notes
FROM dispensaries d
WHERE d.state = 'AZ';

-- Drop the existing view if it exists
DROP VIEW IF EXISTS v_crawl_error_summary;

-- Error summary view
CREATE VIEW v_crawl_error_summary AS
SELECT
  error_code,
  COUNT(*) as total_occurrences,
  COUNT(DISTINCT dispensary_id) as affected_stores,
  MAX(started_at) as last_occurrence,
  AVG(duration_ms)::INTEGER as avg_duration_ms
FROM crawl_attempts
WHERE started_at > NOW() - INTERVAL '7 days'
GROUP BY error_code
ORDER BY total_occurrences DESC;

-- Drop the existing view if it exists
DROP VIEW IF EXISTS v_crawl_health;

-- Overall crawl health view
CREATE VIEW v_crawl_health AS
SELECT
  COUNT(*) FILTER (WHERE crawl_status = 'active') as active_crawlers,
  COUNT(*) FILTER (WHERE crawl_status = 'degraded') as degraded_crawlers,
  COUNT(*) FILTER (WHERE crawl_status = 'paused') as paused_crawlers,
  COUNT(*) FILTER (WHERE crawl_status = 'failed') as failed_crawlers,
  COUNT(*) FILTER (WHERE next_crawl_at <= NOW()) as due_now,
  COUNT(*) FILTER (WHERE consecutive_failures > 0) as stores_with_failures,
  AVG(consecutive_failures)::NUMERIC(4,2) as avg_consecutive_failures,
  COUNT(*) FILTER (WHERE last_success_at > NOW() - INTERVAL '24 hours') as successful_last_24h
FROM dispensaries
WHERE state = 'AZ' AND menu_type = 'dutchie';

-- ============================================================
-- PART 6: Constraint for minimum crawl gap
-- ============================================================

-- Function to check the minimum crawl gap (2 minutes)
CREATE OR REPLACE FUNCTION check_minimum_crawl_gap()
RETURNS TRIGGER AS $$
BEGIN
  -- Only check new pending jobs
  IF NEW.status = 'pending' AND NEW.dispensary_id IS NOT NULL THEN
    -- Check if there is a recent job for the same dispensary
    IF EXISTS (
      SELECT 1 FROM dispensary_crawl_jobs
      WHERE dispensary_id = NEW.dispensary_id
        AND id != NEW.id
        AND status IN ('pending', 'running')
        AND created_at > NOW() - INTERVAL '2 minutes'
    ) THEN
      RAISE EXCEPTION 'Minimum 2-minute gap required between crawls for same dispensary';
    END IF;
  END IF;
  RETURN NEW;
END;
$$ LANGUAGE plpgsql;

-- Create the trigger (drop first if it exists)
DROP TRIGGER IF EXISTS enforce_minimum_crawl_gap ON dispensary_crawl_jobs;
CREATE TRIGGER enforce_minimum_crawl_gap
BEFORE INSERT ON dispensary_crawl_jobs
FOR EACH ROW
EXECUTE FUNCTION check_minimum_crawl_gap();
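
-- Illustrative effect (not part of this migration): inserting two pending
-- jobs for the same dispensary within two minutes now fails. The hypothetical
-- inserts below assume the table's other columns have defaults; the second
-- one raises the exception from the trigger above.
INSERT INTO dispensary_crawl_jobs (dispensary_id, status) VALUES (42, 'pending');
INSERT INTO dispensary_crawl_jobs (dispensary_id, status) VALUES (42, 'pending');
-- ERROR: Minimum 2-minute gap required between crawls for same dispensary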
-- ============================================================
-- PART 7: Comments
-- ============================================================

COMMENT ON TABLE crawl_attempts IS 'Detailed history of every crawl attempt for analytics and debugging';
COMMENT ON VIEW v_crawler_status IS 'Current status of all crawlers with reliability metrics';
COMMENT ON VIEW v_crawl_error_summary IS 'Summary of errors by type over the last 7 days';
COMMENT ON VIEW v_crawl_health IS 'Overall health metrics for the crawling system';
130
backend/migrations/046_raw_payloads_table.sql
Normal file
@@ -0,0 +1,130 @@
-- Migration 046: Raw Payloads Table
--
-- Immutable event stream for raw crawler responses.
-- NEVER delete or overwrite historical payloads.
--
-- Run with:
--   DATABASE_URL="postgresql://..." psql $DATABASE_URL -f migrations/046_raw_payloads_table.sql

-- =====================================================
-- 1) RAW_PAYLOADS TABLE
-- =====================================================
CREATE TABLE IF NOT EXISTS raw_payloads (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),

  -- Store reference
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,

  -- Crawl run reference (nullable for backfilled data)
  crawl_run_id INTEGER REFERENCES crawl_runs(id) ON DELETE SET NULL,

  -- Platform identification
  platform VARCHAR(50) NOT NULL DEFAULT 'dutchie',

  -- Versioning for schema evolution
  payload_version INTEGER NOT NULL DEFAULT 1,

  -- The raw JSON response from the crawler (immutable)
  raw_json JSONB NOT NULL,

  -- Metadata
  product_count INTEGER,    -- Number of products in the payload
  pricing_type VARCHAR(20), -- 'rec', 'med', or 'both'
  crawl_mode VARCHAR(20),   -- 'mode_a', 'mode_b', 'dual'

  -- Timestamps
  fetched_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

  -- Hydration status
  processed BOOLEAN NOT NULL DEFAULT FALSE,
  normalized_at TIMESTAMPTZ,
  hydration_error TEXT,
  hydration_attempts INTEGER DEFAULT 0,

  -- Audit
  created_at TIMESTAMPTZ DEFAULT NOW()
);

-- =====================================================
-- 2) INDEXES FOR EFFICIENT QUERYING
-- =====================================================

-- Primary lookup: unprocessed payloads in FIFO order
CREATE INDEX IF NOT EXISTS idx_raw_payloads_unprocessed
ON raw_payloads(fetched_at ASC)
WHERE processed = FALSE;

-- Store-based lookups
CREATE INDEX IF NOT EXISTS idx_raw_payloads_dispensary
ON raw_payloads(dispensary_id, fetched_at DESC);

-- Platform filtering
CREATE INDEX IF NOT EXISTS idx_raw_payloads_platform
ON raw_payloads(platform);

-- Crawl run linkage
CREATE INDEX IF NOT EXISTS idx_raw_payloads_crawl_run
ON raw_payloads(crawl_run_id)
WHERE crawl_run_id IS NOT NULL;

-- Error tracking
CREATE INDEX IF NOT EXISTS idx_raw_payloads_errors
ON raw_payloads(hydration_attempts, processed)
WHERE hydration_error IS NOT NULL;

-- =====================================================
-- 3) HYDRATION LOCKS TABLE (distributed locking)
-- =====================================================
CREATE TABLE IF NOT EXISTS hydration_locks (
  id SERIAL PRIMARY KEY,
  lock_name VARCHAR(100) NOT NULL UNIQUE,
  worker_id VARCHAR(100) NOT NULL,
  acquired_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  expires_at TIMESTAMPTZ NOT NULL,
  heartbeat_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_hydration_locks_expires
ON hydration_locks(expires_at);

-- =====================================================
-- 4) HYDRATION_RUNS TABLE (audit trail)
-- =====================================================
CREATE TABLE IF NOT EXISTS hydration_runs (
  id SERIAL PRIMARY KEY,
  worker_id VARCHAR(100) NOT NULL,
  started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  finished_at TIMESTAMPTZ,
  status VARCHAR(20) NOT NULL DEFAULT 'running', -- running, completed, failed

  -- Metrics
  payloads_processed INTEGER DEFAULT 0,
  products_upserted INTEGER DEFAULT 0,
  snapshots_created INTEGER DEFAULT 0,
  brands_created INTEGER DEFAULT 0,
  errors_count INTEGER DEFAULT 0,

  -- Error details
  error_message TEXT,

  created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_hydration_runs_status
ON hydration_runs(status, started_at DESC);

-- =====================================================
-- 5) COMMENTS
-- =====================================================
COMMENT ON TABLE raw_payloads IS 'Immutable event stream of raw crawler responses. NEVER DELETE.';
COMMENT ON COLUMN raw_payloads.raw_json IS 'Complete raw JSON from GraphQL/API response. Immutable.';
COMMENT ON COLUMN raw_payloads.payload_version IS 'Schema version for normalization compatibility.';
COMMENT ON COLUMN raw_payloads.processed IS 'TRUE when payload has been hydrated to canonical tables.';
COMMENT ON COLUMN raw_payloads.normalized_at IS 'When the payload was successfully hydrated.';

COMMENT ON TABLE hydration_locks IS 'Distributed locks for hydration workers to prevent double-processing.';
COMMENT ON TABLE hydration_runs IS 'Audit trail of hydration job executions.';

-- =====================================================
-- MIGRATION COMPLETE
-- =====================================================
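
-- Illustrative worker sketch (not part of this migration): a hydration worker
-- can claim unprocessed payloads in FIFO order without double-processing by
-- pairing the partial index above with FOR UPDATE SKIP LOCKED inside its
-- transaction.
SELECT id, dispensary_id, raw_json
FROM raw_payloads
WHERE processed = FALSE
ORDER BY fetched_at ASC
LIMIT 100
FOR UPDATE SKIP LOCKED;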
473
backend/migrations/047_analytics_infrastructure.sql
Normal file
@@ -0,0 +1,473 @@
-- Migration 047: Analytics Infrastructure
-- Phase 3: Analytics Dashboards for CannaiQ
-- Creates views, functions, and tables for price trends, brand penetration, category growth, etc.

-- ============================================================
-- ANALYTICS CACHE TABLE (for expensive query results)
-- ============================================================
CREATE TABLE IF NOT EXISTS analytics_cache (
  id SERIAL PRIMARY KEY,
  cache_key VARCHAR(255) NOT NULL UNIQUE,
  cache_data JSONB NOT NULL,
  computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  expires_at TIMESTAMPTZ NOT NULL,
  query_time_ms INTEGER,
  created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_analytics_cache_key ON analytics_cache(cache_key);
CREATE INDEX IF NOT EXISTS idx_analytics_cache_expires ON analytics_cache(expires_at);
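
-- Illustrative usage (not part of this migration): a read-through cache keyed
-- on cache_key. The writer upserts a computed result with a TTL; readers take
-- only unexpired rows. The key, payload, and TTL below are placeholders.
INSERT INTO analytics_cache (cache_key, cache_data, expires_at, query_time_ms)
VALUES ('brand_penetration:AZ', '{"rows": []}'::jsonb, NOW() + INTERVAL '1 hour', 1234)
ON CONFLICT (cache_key)
DO UPDATE SET cache_data = EXCLUDED.cache_data,
              computed_at = NOW(),
              expires_at = EXCLUDED.expires_at,
              query_time_ms = EXCLUDED.query_time_ms;

SELECT cache_data FROM analytics_cache
WHERE cache_key = 'brand_penetration:AZ' AND expires_at > NOW();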
-- ============================================================
-- PRICE EXTRACTION HELPER FUNCTIONS
-- Extract pricing from the JSONB latest_raw_payload
-- ============================================================
CREATE OR REPLACE FUNCTION extract_min_price(payload JSONB)
RETURNS NUMERIC AS $$
DECLARE
  prices JSONB;
  min_val NUMERIC;
BEGIN
  -- Try recPrices first (retail prices)
  prices := payload->'recPrices';
  IF prices IS NOT NULL AND jsonb_array_length(prices) > 0 THEN
    SELECT MIN(value::NUMERIC) INTO min_val FROM jsonb_array_elements_text(prices) AS value WHERE value ~ '^[0-9.]+$';
    IF min_val IS NOT NULL THEN RETURN min_val; END IF;
  END IF;

  -- Fall back to the Prices array
  prices := payload->'Prices';
  IF prices IS NOT NULL AND jsonb_array_length(prices) > 0 THEN
    SELECT MIN(value::NUMERIC) INTO min_val FROM jsonb_array_elements_text(prices) AS value WHERE value ~ '^[0-9.]+$';
    IF min_val IS NOT NULL THEN RETURN min_val; END IF;
  END IF;

  RETURN NULL;
END;
$$ LANGUAGE plpgsql IMMUTABLE;

CREATE OR REPLACE FUNCTION extract_max_price(payload JSONB)
RETURNS NUMERIC AS $$
DECLARE
  prices JSONB;
  max_val NUMERIC;
BEGIN
  prices := payload->'recPrices';
  IF prices IS NOT NULL AND jsonb_array_length(prices) > 0 THEN
    SELECT MAX(value::NUMERIC) INTO max_val FROM jsonb_array_elements_text(prices) AS value WHERE value ~ '^[0-9.]+$';
    IF max_val IS NOT NULL THEN RETURN max_val; END IF;
  END IF;

  prices := payload->'Prices';
  IF prices IS NOT NULL AND jsonb_array_length(prices) > 0 THEN
    SELECT MAX(value::NUMERIC) INTO max_val FROM jsonb_array_elements_text(prices) AS value WHERE value ~ '^[0-9.]+$';
    IF max_val IS NOT NULL THEN RETURN max_val; END IF;
  END IF;

  RETURN NULL;
END;
$$ LANGUAGE plpgsql IMMUTABLE;

CREATE OR REPLACE FUNCTION extract_wholesale_price(payload JSONB)
RETURNS NUMERIC AS $$
DECLARE
  prices JSONB;
  min_val NUMERIC;
BEGIN
  prices := payload->'wholesalePrices';
  IF prices IS NOT NULL AND jsonb_array_length(prices) > 0 THEN
    SELECT MIN(value::NUMERIC) INTO min_val FROM jsonb_array_elements_text(prices) AS value WHERE value ~ '^[0-9.]+$';
    RETURN min_val;
  END IF;
  RETURN NULL;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
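
-- Illustrative usage (not part of this migration): the helpers are IMMUTABLE,
-- so they can be called inline (as the views below do) or, if extraction
-- becomes a hot path, used in an expression index.
SELECT name, extract_min_price(latest_raw_payload) AS min_price
FROM dutchie_products
WHERE extract_min_price(latest_raw_payload) IS NOT NULL
LIMIT 10;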
-- ============================================================
|
||||
-- VIEW: v_product_pricing
|
||||
-- Flattened view of products with extracted pricing
|
||||
-- ============================================================
|
||||
CREATE OR REPLACE VIEW v_product_pricing AS
|
||||
SELECT
|
||||
dp.id,
|
||||
dp.dispensary_id,
|
||||
dp.name,
|
||||
dp.brand_name,
|
||||
dp.brand_id,
|
||||
dp.type as category,
|
||||
dp.subcategory,
|
||||
dp.strain_type,
|
||||
dp.stock_status,
|
||||
dp.status,
|
||||
d.name as store_name,
|
||||
d.city,
|
||||
d.state,
|
||||
extract_min_price(dp.latest_raw_payload) as min_price,
|
||||
extract_max_price(dp.latest_raw_payload) as max_price,
|
||||
extract_wholesale_price(dp.latest_raw_payload) as wholesale_price,
|
||||
dp.thc,
|
||||
dp.cbd,
|
||||
dp.updated_at,
|
||||
dp.created_at
|
||||
FROM dutchie_products dp
|
||||
JOIN dispensaries d ON dp.dispensary_id = d.id;

-- ============================================================
-- VIEW: v_brand_store_presence
-- Which brands are in which stores
-- ============================================================
CREATE OR REPLACE VIEW v_brand_store_presence AS
SELECT
  dp.brand_name,
  dp.brand_id,
  dp.dispensary_id,
  d.name as store_name,
  d.city,
  d.state,
  dp.type as category,
  COUNT(*) as sku_count,
  AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
  SUM(CASE WHEN dp.stock_status = 'in_stock' THEN 1 ELSE 0 END) as in_stock_count,
  MAX(dp.updated_at) as last_updated
FROM dutchie_products dp
JOIN dispensaries d ON dp.dispensary_id = d.id
WHERE dp.brand_name IS NOT NULL
GROUP BY dp.brand_name, dp.brand_id, dp.dispensary_id, d.name, d.city, d.state, dp.type;

-- ============================================================
-- VIEW: v_category_store_summary
-- Category breakdown per store
-- ============================================================
CREATE OR REPLACE VIEW v_category_store_summary AS
SELECT
  dp.dispensary_id,
  d.name as store_name,
  d.city,
  d.state,
  dp.type as category,
  COUNT(*) as sku_count,
  COUNT(DISTINCT dp.brand_name) as brand_count,
  AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
  MIN(extract_min_price(dp.latest_raw_payload)) as min_price,
  MAX(extract_max_price(dp.latest_raw_payload)) as max_price,
  SUM(CASE WHEN dp.stock_status = 'in_stock' THEN 1 ELSE 0 END) as in_stock_count
FROM dutchie_products dp
JOIN dispensaries d ON dp.dispensary_id = d.id
WHERE dp.type IS NOT NULL
GROUP BY dp.dispensary_id, d.name, d.city, d.state, dp.type;

-- ============================================================
-- VIEW: v_brand_summary
-- Global brand statistics
-- ============================================================
CREATE OR REPLACE VIEW v_brand_summary AS
SELECT
  dp.brand_name,
  dp.brand_id,
  COUNT(*) as total_skus,
  COUNT(DISTINCT dp.dispensary_id) as store_count,
  COUNT(DISTINCT dp.type) as category_count,
  AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
  MIN(extract_min_price(dp.latest_raw_payload)) as min_price,
  MAX(extract_max_price(dp.latest_raw_payload)) as max_price,
  SUM(CASE WHEN dp.stock_status = 'in_stock' THEN 1 ELSE 0 END) as in_stock_skus,
  ARRAY_AGG(DISTINCT dp.type) FILTER (WHERE dp.type IS NOT NULL) as categories,
  MAX(dp.updated_at) as last_updated
FROM dutchie_products dp
WHERE dp.brand_name IS NOT NULL
GROUP BY dp.brand_name, dp.brand_id
ORDER BY total_skus DESC;

-- ============================================================
-- VIEW: v_category_summary
-- Global category statistics
-- ============================================================
CREATE OR REPLACE VIEW v_category_summary AS
SELECT
  dp.type as category,
  COUNT(*) as total_skus,
  COUNT(DISTINCT dp.brand_name) as brand_count,
  COUNT(DISTINCT dp.dispensary_id) as store_count,
  AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
  MIN(extract_min_price(dp.latest_raw_payload)) as min_price,
  MAX(extract_max_price(dp.latest_raw_payload)) as max_price,
  SUM(CASE WHEN dp.stock_status = 'in_stock' THEN 1 ELSE 0 END) as in_stock_skus
FROM dutchie_products dp
WHERE dp.type IS NOT NULL
GROUP BY dp.type
ORDER BY total_skus DESC;

-- ============================================================
-- VIEW: v_store_summary
-- Store-level statistics
-- ============================================================
CREATE OR REPLACE VIEW v_store_summary AS
SELECT
  d.id as store_id,
  d.name as store_name,
  d.city,
  d.state,
  d.chain_id,
  COUNT(dp.id) as total_skus,
  COUNT(DISTINCT dp.brand_name) as brand_count,
  COUNT(DISTINCT dp.type) as category_count,
  AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
  SUM(CASE WHEN dp.stock_status = 'in_stock' THEN 1 ELSE 0 END) as in_stock_skus,
  d.last_crawl_at,
  d.product_count
FROM dispensaries d
LEFT JOIN dutchie_products dp ON d.id = dp.dispensary_id
GROUP BY d.id, d.name, d.city, d.state, d.chain_id, d.last_crawl_at, d.product_count;

-- ============================================================
-- TABLE: brand_snapshots (for historical brand tracking)
-- ============================================================
CREATE TABLE IF NOT EXISTS brand_snapshots (
  id SERIAL PRIMARY KEY,
  brand_name VARCHAR(255) NOT NULL,
  brand_id VARCHAR(255),
  snapshot_date DATE NOT NULL,
  store_count INTEGER NOT NULL DEFAULT 0,
  total_skus INTEGER NOT NULL DEFAULT 0,
  avg_price NUMERIC(10,2),
  in_stock_skus INTEGER NOT NULL DEFAULT 0,
  categories TEXT[],
  created_at TIMESTAMPTZ DEFAULT NOW(),
  UNIQUE(brand_name, snapshot_date)
);

CREATE INDEX IF NOT EXISTS idx_brand_snapshots_brand ON brand_snapshots(brand_name);
CREATE INDEX IF NOT EXISTS idx_brand_snapshots_date ON brand_snapshots(snapshot_date);

-- ============================================================
-- TABLE: category_snapshots (for historical category tracking)
-- ============================================================
CREATE TABLE IF NOT EXISTS category_snapshots (
  id SERIAL PRIMARY KEY,
  category VARCHAR(255) NOT NULL,
  snapshot_date DATE NOT NULL,
  store_count INTEGER NOT NULL DEFAULT 0,
  brand_count INTEGER NOT NULL DEFAULT 0,
  total_skus INTEGER NOT NULL DEFAULT 0,
  avg_price NUMERIC(10,2),
  in_stock_skus INTEGER NOT NULL DEFAULT 0,
  created_at TIMESTAMPTZ DEFAULT NOW(),
  UNIQUE(category, snapshot_date)
);

CREATE INDEX IF NOT EXISTS idx_category_snapshots_cat ON category_snapshots(category);
CREATE INDEX IF NOT EXISTS idx_category_snapshots_date ON category_snapshots(snapshot_date);

-- ============================================================
-- TABLE: store_change_events (for tracking store changes)
-- ============================================================
CREATE TABLE IF NOT EXISTS store_change_events (
  id SERIAL PRIMARY KEY,
  store_id INTEGER NOT NULL REFERENCES dispensaries(id),
  event_type VARCHAR(50) NOT NULL, -- brand_added, brand_removed, product_added, product_removed, price_change, stock_change
  event_date DATE NOT NULL,
  brand_name VARCHAR(255),
  product_id INTEGER,
  product_name VARCHAR(500),
  category VARCHAR(255),
  old_value TEXT,
  new_value TEXT,
  metadata JSONB,
  created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_store_events_store ON store_change_events(store_id);
CREATE INDEX IF NOT EXISTS idx_store_events_type ON store_change_events(event_type);
CREATE INDEX IF NOT EXISTS idx_store_events_date ON store_change_events(event_date);
CREATE INDEX IF NOT EXISTS idx_store_events_brand ON store_change_events(brand_name);

-- ============================================================
-- TABLE: analytics_alerts
-- ============================================================
CREATE TABLE IF NOT EXISTS analytics_alerts (
  id SERIAL PRIMARY KEY,
  alert_type VARCHAR(50) NOT NULL, -- price_warning, brand_dropped, competitive_intrusion, restock_event
  severity VARCHAR(20) NOT NULL DEFAULT 'info', -- info, warning, critical
  title VARCHAR(255) NOT NULL,
  description TEXT,
  store_id INTEGER REFERENCES dispensaries(id),
  brand_name VARCHAR(255),
  product_id INTEGER,
  category VARCHAR(255),
  metadata JSONB,
  is_read BOOLEAN DEFAULT FALSE,
  created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_analytics_alerts_type ON analytics_alerts(alert_type);
CREATE INDEX IF NOT EXISTS idx_analytics_alerts_read ON analytics_alerts(is_read);
CREATE INDEX IF NOT EXISTS idx_analytics_alerts_created ON analytics_alerts(created_at DESC);

-- ============================================================
-- FUNCTION: Capture daily brand snapshots
-- ============================================================
CREATE OR REPLACE FUNCTION capture_brand_snapshots()
RETURNS INTEGER AS $$
DECLARE
  inserted_count INTEGER;
BEGIN
  INSERT INTO brand_snapshots (brand_name, brand_id, snapshot_date, store_count, total_skus, avg_price, in_stock_skus, categories)
  SELECT
    brand_name,
    brand_id,
    CURRENT_DATE,
    COUNT(DISTINCT dispensary_id),
    COUNT(*),
    AVG(extract_min_price(latest_raw_payload)),
    SUM(CASE WHEN stock_status = 'in_stock' THEN 1 ELSE 0 END),
    ARRAY_AGG(DISTINCT type) FILTER (WHERE type IS NOT NULL)
  FROM dutchie_products
  WHERE brand_name IS NOT NULL
  GROUP BY brand_name, brand_id
  ON CONFLICT (brand_name, snapshot_date)
  DO UPDATE SET
    store_count = EXCLUDED.store_count,
    total_skus = EXCLUDED.total_skus,
    avg_price = EXCLUDED.avg_price,
    in_stock_skus = EXCLUDED.in_stock_skus,
    categories = EXCLUDED.categories;

  GET DIAGNOSTICS inserted_count = ROW_COUNT;
  RETURN inserted_count;
END;
$$ LANGUAGE plpgsql;
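
-- Example (illustrative; brand name is a placeholder, and the 7-row LAG
-- assumes one snapshot per day):
--   SELECT snapshot_date, store_count,
--          store_count - LAG(store_count, 7) OVER (ORDER BY snapshot_date) AS change_7d
--   FROM brand_snapshots
--   WHERE brand_name = 'Example Brand'
--   ORDER BY snapshot_date DESC;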

-- ============================================================
-- FUNCTION: Capture daily category snapshots
-- ============================================================
CREATE OR REPLACE FUNCTION capture_category_snapshots()
RETURNS INTEGER AS $$
DECLARE
  inserted_count INTEGER;
BEGIN
  INSERT INTO category_snapshots (category, snapshot_date, store_count, brand_count, total_skus, avg_price, in_stock_skus)
  SELECT
    type,
    CURRENT_DATE,
    COUNT(DISTINCT dispensary_id),
    COUNT(DISTINCT brand_name),
    COUNT(*),
    AVG(extract_min_price(latest_raw_payload)),
    SUM(CASE WHEN stock_status = 'in_stock' THEN 1 ELSE 0 END)
  FROM dutchie_products
  WHERE type IS NOT NULL
  GROUP BY type
  ON CONFLICT (category, snapshot_date)
  DO UPDATE SET
    store_count = EXCLUDED.store_count,
    brand_count = EXCLUDED.brand_count,
    total_skus = EXCLUDED.total_skus,
    avg_price = EXCLUDED.avg_price,
    in_stock_skus = EXCLUDED.in_stock_skus;

  GET DIAGNOSTICS inserted_count = ROW_COUNT;
  RETURN inserted_count;
END;
$$ LANGUAGE plpgsql;

-- ============================================================
-- FUNCTION: Calculate price volatility for a product
-- ============================================================
CREATE OR REPLACE FUNCTION calculate_price_volatility(
  p_product_id INTEGER,
  p_days INTEGER DEFAULT 30
)
RETURNS NUMERIC AS $$
DECLARE
  std_dev NUMERIC;
  avg_price NUMERIC;
BEGIN
  -- Using dutchie_product_snapshots if available
  SELECT
    STDDEV(rec_min_price_cents / 100.0),
    AVG(rec_min_price_cents / 100.0)
  INTO std_dev, avg_price
  FROM dutchie_product_snapshots
  WHERE dutchie_product_id = p_product_id
    AND crawled_at >= NOW() - (p_days || ' days')::INTERVAL
    AND rec_min_price_cents IS NOT NULL;

  IF avg_price IS NULL OR avg_price = 0 THEN
    RETURN NULL;
  END IF;

  -- Return coefficient of variation (CV)
  RETURN ROUND((std_dev / avg_price) * 100, 2);
END;
$$ LANGUAGE plpgsql;
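
-- Example (illustrative; product id 123 is a placeholder):
--   SELECT calculate_price_volatility(123);       -- CV % over the default 30 days
--   SELECT calculate_price_volatility(123, 90);   -- CV % over a 90-day window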

-- ============================================================
-- FUNCTION: Get brand penetration stats
-- ============================================================
CREATE OR REPLACE FUNCTION get_brand_penetration(
  p_brand_name VARCHAR,
  p_state VARCHAR DEFAULT NULL
)
RETURNS TABLE (
  total_stores BIGINT,
  stores_carrying BIGINT,
  penetration_pct NUMERIC,
  total_skus BIGINT,
  avg_skus_per_store NUMERIC,
  shelf_share_pct NUMERIC
) AS $$
BEGIN
  RETURN QUERY
  WITH store_counts AS (
    SELECT
      COUNT(DISTINCT d.id) as total,
      COUNT(DISTINCT CASE WHEN dp.brand_name = p_brand_name THEN dp.dispensary_id END) as carrying
    FROM dispensaries d
    LEFT JOIN dutchie_products dp ON d.id = dp.dispensary_id
    WHERE (p_state IS NULL OR d.state = p_state)
  ),
  sku_counts AS (
    -- Apply the same optional state filter here so brand SKU counts stay
    -- consistent with the store counts and shelf-share denominator
    SELECT
      COUNT(*) as brand_skus,
      COUNT(DISTINCT dp.dispensary_id) as stores_with_brand
    FROM dutchie_products dp
    JOIN dispensaries d ON dp.dispensary_id = d.id
    WHERE dp.brand_name = p_brand_name
      AND (p_state IS NULL OR d.state = p_state)
  ),
  total_skus AS (
    SELECT COUNT(*) as total FROM dutchie_products dp
    JOIN dispensaries d ON dp.dispensary_id = d.id
    WHERE (p_state IS NULL OR d.state = p_state)
  )
  SELECT
    sc.total,
    sc.carrying,
    ROUND((sc.carrying::NUMERIC / NULLIF(sc.total, 0)) * 100, 2),
    skc.brand_skus,
    ROUND(skc.brand_skus::NUMERIC / NULLIF(skc.stores_with_brand, 0), 2),
    ROUND((skc.brand_skus::NUMERIC / NULLIF(ts.total, 0)) * 100, 2)
  FROM store_counts sc, sku_counts skc, total_skus ts;
END;
$$ LANGUAGE plpgsql;
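
-- Example (illustrative; brand name is a placeholder):
--   SELECT * FROM get_brand_penetration('Example Brand');        -- all states
--   SELECT * FROM get_brand_penetration('Example Brand', 'AZ');  -- Arizona only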

-- ============================================================
-- Initial snapshot capture (run manually if needed)
-- ============================================================
-- Note: Run these after migration to capture initial snapshots:
-- SELECT capture_brand_snapshots();
-- SELECT capture_category_snapshots();

-- ============================================================
-- Grant permissions
-- ============================================================
-- Views are accessible to all roles by default

COMMENT ON VIEW v_product_pricing IS 'Flattened product view with extracted pricing from JSONB';
COMMENT ON VIEW v_brand_store_presence IS 'Brand presence across stores with SKU counts';
COMMENT ON VIEW v_brand_summary IS 'Global brand statistics';
COMMENT ON VIEW v_category_summary IS 'Global category statistics';
COMMENT ON VIEW v_store_summary IS 'Store-level statistics';
COMMENT ON TABLE analytics_cache IS 'Cache for expensive analytics queries';
COMMENT ON TABLE brand_snapshots IS 'Historical daily snapshots of brand metrics';
COMMENT ON TABLE category_snapshots IS 'Historical daily snapshots of category metrics';
COMMENT ON TABLE store_change_events IS 'Log of brand/product changes at stores';
COMMENT ON TABLE analytics_alerts IS 'Analytics-generated alerts and notifications';
598
backend/migrations/048_production_sync_monitoring.sql
Normal file
@@ -0,0 +1,598 @@
-- Migration 048: Production Sync + Monitoring Infrastructure
-- Phase 5: Full Production Sync + Monitoring
--
-- Creates:
-- 1. Sync orchestrator tables
-- 2. Dead-letter queue (DLQ)
-- 3. System metrics tracking
-- 4. Integrity check results
-- 5. Auto-fix audit log

-- ============================================================
-- SYNC ORCHESTRATOR TABLES
-- ============================================================

-- Orchestrator state and control
CREATE TABLE IF NOT EXISTS sync_orchestrator_state (
  id INTEGER PRIMARY KEY DEFAULT 1 CHECK (id = 1), -- Singleton row
  status VARCHAR(20) NOT NULL DEFAULT 'SLEEPING', -- RUNNING, SLEEPING, LOCKED, PAUSED
  current_worker_id VARCHAR(100),
  last_heartbeat_at TIMESTAMPTZ,
  last_run_started_at TIMESTAMPTZ,
  last_run_completed_at TIMESTAMPTZ,
  last_run_duration_ms INTEGER,
  last_run_payloads_processed INTEGER DEFAULT 0,
  last_run_errors INTEGER DEFAULT 0,
  consecutive_failures INTEGER DEFAULT 0,
  is_paused BOOLEAN DEFAULT FALSE,
  pause_reason TEXT,
  config JSONB DEFAULT '{
    "batchSize": 50,
    "pollIntervalMs": 5000,
    "maxRetries": 3,
    "lockTimeoutMs": 300000,
    "enableAnalyticsPrecompute": true,
    "enableIntegrityChecks": true
  }'::jsonb,
  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

-- Insert singleton row if not exists
INSERT INTO sync_orchestrator_state (id) VALUES (1) ON CONFLICT (id) DO NOTHING;

-- Sync run history
CREATE TABLE IF NOT EXISTS sync_runs (
  id SERIAL PRIMARY KEY,
  run_id UUID DEFAULT gen_random_uuid() UNIQUE NOT NULL,
  worker_id VARCHAR(100) NOT NULL,
  status VARCHAR(20) NOT NULL DEFAULT 'running', -- running, completed, failed, cancelled
  started_at TIMESTAMPTZ DEFAULT NOW(),
  finished_at TIMESTAMPTZ,
  duration_ms INTEGER,

  -- Metrics
  payloads_queued INTEGER DEFAULT 0,
  payloads_processed INTEGER DEFAULT 0,
  payloads_skipped INTEGER DEFAULT 0,
  payloads_failed INTEGER DEFAULT 0,
  payloads_dlq INTEGER DEFAULT 0,

  products_upserted INTEGER DEFAULT 0,
  products_inserted INTEGER DEFAULT 0,
  products_updated INTEGER DEFAULT 0,
  products_discontinued INTEGER DEFAULT 0,

  snapshots_created INTEGER DEFAULT 0,

  -- Error tracking
  errors JSONB DEFAULT '[]'::jsonb,
  error_summary TEXT,

  -- Diff stats (before/after)
  diff_stats JSONB DEFAULT '{}'::jsonb,

  -- Analytics precompute triggered
  analytics_updated BOOLEAN DEFAULT FALSE,
  analytics_duration_ms INTEGER,

  created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_sync_runs_status ON sync_runs(status);
CREATE INDEX IF NOT EXISTS idx_sync_runs_started_at ON sync_runs(started_at DESC);
CREATE INDEX IF NOT EXISTS idx_sync_runs_run_id ON sync_runs(run_id);

-- ============================================================
-- DEAD-LETTER QUEUE (DLQ)
-- ============================================================

-- DLQ for failed payloads
CREATE TABLE IF NOT EXISTS raw_payloads_dlq (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  original_payload_id UUID NOT NULL,
  dispensary_id INTEGER REFERENCES dispensaries(id),
  state_code VARCHAR(2),
  platform VARCHAR(50) DEFAULT 'dutchie',

  -- Original payload data (preserved)
  raw_json JSONB NOT NULL,
  product_count INTEGER,
  pricing_type VARCHAR(10),
  crawl_mode VARCHAR(20),

  -- DLQ metadata
  moved_to_dlq_at TIMESTAMPTZ DEFAULT NOW(),
  failure_count INTEGER DEFAULT 0,

  -- Error history (array of error objects)
  error_history JSONB DEFAULT '[]'::jsonb,
  last_error_type VARCHAR(50),
  last_error_message TEXT,
  last_error_at TIMESTAMPTZ,

  -- Retry tracking
  retry_count INTEGER DEFAULT 0,
  last_retry_at TIMESTAMPTZ,
  next_retry_at TIMESTAMPTZ,

  -- Resolution
  status VARCHAR(20) DEFAULT 'pending', -- pending, retrying, resolved, abandoned
  resolved_at TIMESTAMPTZ,
  resolved_by VARCHAR(100),
  resolution_notes TEXT,

  created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_dlq_status ON raw_payloads_dlq(status);
CREATE INDEX IF NOT EXISTS idx_dlq_dispensary ON raw_payloads_dlq(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_dlq_error_type ON raw_payloads_dlq(last_error_type);
CREATE INDEX IF NOT EXISTS idx_dlq_moved_at ON raw_payloads_dlq(moved_to_dlq_at DESC);

-- ============================================================
-- SYSTEM METRICS
-- ============================================================

-- System metrics time series
CREATE TABLE IF NOT EXISTS system_metrics (
  id SERIAL PRIMARY KEY,
  metric_name VARCHAR(100) NOT NULL,
  metric_value NUMERIC NOT NULL,
  labels JSONB DEFAULT '{}',
  recorded_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_metrics_name_time ON system_metrics(metric_name, recorded_at DESC);
CREATE INDEX IF NOT EXISTS idx_metrics_recorded_at ON system_metrics(recorded_at DESC);

-- Metrics snapshot (current state, updated continuously)
CREATE TABLE IF NOT EXISTS system_metrics_current (
  metric_name VARCHAR(100) PRIMARY KEY,
  metric_value NUMERIC NOT NULL,
  labels JSONB DEFAULT '{}',
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

-- Error buckets for classification
CREATE TABLE IF NOT EXISTS error_buckets (
  id SERIAL PRIMARY KEY,
  error_type VARCHAR(50) NOT NULL,
  error_message TEXT,
  source_table VARCHAR(50),
  source_id TEXT,
  dispensary_id INTEGER,
  state_code VARCHAR(2),
  context JSONB DEFAULT '{}',
  occurred_at TIMESTAMPTZ DEFAULT NOW(),
  acknowledged BOOLEAN DEFAULT FALSE,
  acknowledged_at TIMESTAMPTZ,
  acknowledged_by VARCHAR(100)
);

CREATE INDEX IF NOT EXISTS idx_error_buckets_type ON error_buckets(error_type);
CREATE INDEX IF NOT EXISTS idx_error_buckets_occurred ON error_buckets(occurred_at DESC);
CREATE INDEX IF NOT EXISTS idx_error_buckets_unacked ON error_buckets(acknowledged) WHERE acknowledged = FALSE;

-- ============================================================
-- INTEGRITY CHECK RESULTS
-- ============================================================

CREATE TABLE IF NOT EXISTS integrity_check_runs (
  id SERIAL PRIMARY KEY,
  run_id UUID DEFAULT gen_random_uuid() UNIQUE NOT NULL,
  check_type VARCHAR(50) NOT NULL, -- daily, on_demand, scheduled
  triggered_by VARCHAR(100),
  started_at TIMESTAMPTZ DEFAULT NOW(),
  finished_at TIMESTAMPTZ,
  status VARCHAR(20) DEFAULT 'running', -- running, completed, failed

  -- Results summary
  total_checks INTEGER DEFAULT 0,
  passed_checks INTEGER DEFAULT 0,
  failed_checks INTEGER DEFAULT 0,
  warning_checks INTEGER DEFAULT 0,

  -- Detailed results
  results JSONB DEFAULT '[]'::jsonb,

  created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_integrity_runs_status ON integrity_check_runs(status);
CREATE INDEX IF NOT EXISTS idx_integrity_runs_started ON integrity_check_runs(started_at DESC);

-- Individual integrity check results
CREATE TABLE IF NOT EXISTS integrity_check_results (
  id SERIAL PRIMARY KEY,
  run_id UUID REFERENCES integrity_check_runs(run_id) ON DELETE CASCADE,
  check_name VARCHAR(100) NOT NULL,
  check_category VARCHAR(50) NOT NULL,
  status VARCHAR(20) NOT NULL, -- passed, failed, warning, skipped

  -- Check details
  expected_value TEXT,
  actual_value TEXT,
  difference TEXT,
  affected_count INTEGER DEFAULT 0,

  -- Context
  details JSONB DEFAULT '{}',
  affected_ids JSONB DEFAULT '[]'::jsonb,

  -- Remediation
  can_auto_fix BOOLEAN DEFAULT FALSE,
  fix_routine VARCHAR(100),

  checked_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_integrity_results_run ON integrity_check_results(run_id);
CREATE INDEX IF NOT EXISTS idx_integrity_results_status ON integrity_check_results(status);

-- ============================================================
-- AUTO-FIX AUDIT LOG
-- ============================================================

CREATE TABLE IF NOT EXISTS auto_fix_runs (
  id SERIAL PRIMARY KEY,
  run_id UUID DEFAULT gen_random_uuid() UNIQUE NOT NULL,
  routine_name VARCHAR(100) NOT NULL,
  triggered_by VARCHAR(100) NOT NULL,
  trigger_type VARCHAR(20) NOT NULL, -- manual, auto, scheduled

  started_at TIMESTAMPTZ DEFAULT NOW(),
  finished_at TIMESTAMPTZ,
  status VARCHAR(20) DEFAULT 'running', -- running, completed, failed, rolled_back

  -- What was changed
  rows_affected INTEGER DEFAULT 0,
  changes JSONB DEFAULT '[]'::jsonb,

  -- Dry run support
  is_dry_run BOOLEAN DEFAULT FALSE,
  dry_run_preview JSONB,

  -- Error handling
  error_message TEXT,

  created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_fix_runs_routine ON auto_fix_runs(routine_name);
CREATE INDEX IF NOT EXISTS idx_fix_runs_started ON auto_fix_runs(started_at DESC);

-- ============================================================
-- ALERTS TABLE
-- ============================================================

CREATE TABLE IF NOT EXISTS system_alerts (
  id SERIAL PRIMARY KEY,
  alert_type VARCHAR(50) NOT NULL,
  severity VARCHAR(20) NOT NULL, -- info, warning, error, critical
  title VARCHAR(255) NOT NULL,
  message TEXT,
  source VARCHAR(100),

  -- Context
  context JSONB DEFAULT '{}',

  -- State
  status VARCHAR(20) DEFAULT 'active', -- active, acknowledged, resolved, muted
  acknowledged_at TIMESTAMPTZ,
  acknowledged_by VARCHAR(100),
  resolved_at TIMESTAMPTZ,
  resolved_by VARCHAR(100),

  -- Deduplication
  fingerprint VARCHAR(64), -- Hash for dedup
  occurrence_count INTEGER DEFAULT 1,
  first_occurred_at TIMESTAMPTZ DEFAULT NOW(),
  last_occurred_at TIMESTAMPTZ DEFAULT NOW(),

  created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_alerts_status ON system_alerts(status);
CREATE INDEX IF NOT EXISTS idx_alerts_severity ON system_alerts(severity);
CREATE INDEX IF NOT EXISTS idx_alerts_type ON system_alerts(alert_type);
CREATE INDEX IF NOT EXISTS idx_alerts_fingerprint ON system_alerts(fingerprint);
CREATE INDEX IF NOT EXISTS idx_alerts_active ON system_alerts(status, created_at DESC) WHERE status = 'active';

-- ============================================================
-- HELPER VIEWS
-- ============================================================

-- Current sync status view
CREATE OR REPLACE VIEW v_sync_status AS
SELECT
  sos.status as orchestrator_status,
  sos.current_worker_id,
  sos.last_heartbeat_at,
  sos.is_paused,
  sos.pause_reason,
  sos.consecutive_failures,
  sos.last_run_started_at,
  sos.last_run_completed_at,
  sos.last_run_duration_ms,
  sos.last_run_payloads_processed,
  sos.last_run_errors,
  sos.config,
  (SELECT COUNT(*) FROM raw_payloads WHERE processed = FALSE) as unprocessed_payloads,
  (SELECT COUNT(*) FROM raw_payloads_dlq WHERE status = 'pending') as dlq_pending,
  (SELECT COUNT(*) FROM system_alerts WHERE status = 'active') as active_alerts,
  (
    SELECT json_build_object(
      'total', COUNT(*),
      'completed', COUNT(*) FILTER (WHERE status = 'completed'),
      'failed', COUNT(*) FILTER (WHERE status = 'failed')
    )
    FROM sync_runs
    WHERE started_at >= NOW() - INTERVAL '24 hours'
  ) as runs_24h
FROM sync_orchestrator_state sos
WHERE sos.id = 1;
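
-- Example (illustrative): one-row health check for dashboards or psql:
--   SELECT orchestrator_status, unprocessed_payloads, dlq_pending, active_alerts
--   FROM v_sync_status;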

-- DLQ summary view
CREATE OR REPLACE VIEW v_dlq_summary AS
SELECT
  status,
  last_error_type,
  COUNT(*) as count,
  MIN(moved_to_dlq_at) as oldest,
  MAX(moved_to_dlq_at) as newest
FROM raw_payloads_dlq
GROUP BY status, last_error_type
ORDER BY count DESC;

-- Error bucket summary (last 24h)
CREATE OR REPLACE VIEW v_error_summary AS
SELECT
  error_type,
  COUNT(*) as count,
  COUNT(*) FILTER (WHERE acknowledged = FALSE) as unacknowledged,
  MIN(occurred_at) as first_occurred,
  MAX(occurred_at) as last_occurred
FROM error_buckets
WHERE occurred_at >= NOW() - INTERVAL '24 hours'
GROUP BY error_type
ORDER BY count DESC;

-- Metrics summary view
CREATE OR REPLACE VIEW v_metrics_summary AS
SELECT
  metric_name,
  metric_value,
  labels,
  updated_at,
  NOW() - updated_at as age
FROM system_metrics_current
ORDER BY metric_name;

-- ============================================================
-- HELPER FUNCTIONS
-- ============================================================

-- Record a metric
CREATE OR REPLACE FUNCTION record_metric(
  p_name VARCHAR(100),
  p_value NUMERIC,
  p_labels JSONB DEFAULT '{}'
) RETURNS VOID AS $$
BEGIN
  -- Insert into time series
  INSERT INTO system_metrics (metric_name, metric_value, labels)
  VALUES (p_name, p_value, p_labels);

  -- Upsert current value
  INSERT INTO system_metrics_current (metric_name, metric_value, labels, updated_at)
  VALUES (p_name, p_value, p_labels, NOW())
  ON CONFLICT (metric_name) DO UPDATE SET
    metric_value = EXCLUDED.metric_value,
    labels = EXCLUDED.labels,
    updated_at = NOW();
END;
$$ LANGUAGE plpgsql;
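
-- Example (illustrative label; metric names are application-defined):
--   SELECT record_metric('payloads_processed_today', 1250, '{"worker": "w1"}'::jsonb);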

-- Record an error
CREATE OR REPLACE FUNCTION record_error(
  p_type VARCHAR(50),
  p_message TEXT,
  p_source_table VARCHAR(50) DEFAULT NULL,
  p_source_id TEXT DEFAULT NULL,
  p_dispensary_id INTEGER DEFAULT NULL,
  p_context JSONB DEFAULT '{}'
) RETURNS INTEGER AS $$
DECLARE
  v_id INTEGER;
BEGIN
  INSERT INTO error_buckets (
    error_type, error_message, source_table, source_id,
    dispensary_id, context
  )
  VALUES (
    p_type, p_message, p_source_table, p_source_id,
    p_dispensary_id, p_context
  )
  RETURNING id INTO v_id;

  -- Update error count metric
  PERFORM record_metric(
    'error_count_' || p_type,
    COALESCE((SELECT metric_value FROM system_metrics_current WHERE metric_name = 'error_count_' || p_type), 0) + 1
  );

  RETURN v_id;
END;
$$ LANGUAGE plpgsql;
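
-- Example (illustrative values; error type strings are application-defined):
--   SELECT record_error('HYDRATION_FAILURE', 'Missing brand field', 'raw_payloads', 'abc-123', 42);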

-- Create or update alert (with deduplication)
CREATE OR REPLACE FUNCTION upsert_alert(
  p_type VARCHAR(50),
  p_severity VARCHAR(20),
  p_title VARCHAR(255),
  p_message TEXT DEFAULT NULL,
  p_source VARCHAR(100) DEFAULT NULL,
  p_context JSONB DEFAULT '{}'
) RETURNS INTEGER AS $$
DECLARE
  v_fingerprint VARCHAR(64);
  v_id INTEGER;
BEGIN
  -- Generate fingerprint for dedup
  v_fingerprint := md5(p_type || p_title || COALESCE(p_source, ''));

  -- Try to find existing active alert
  SELECT id INTO v_id
  FROM system_alerts
  WHERE fingerprint = v_fingerprint AND status = 'active';

  IF v_id IS NOT NULL THEN
    -- Update existing alert
    UPDATE system_alerts
    SET occurrence_count = occurrence_count + 1,
        last_occurred_at = NOW(),
        context = p_context
    WHERE id = v_id;
  ELSE
    -- Create new alert
    INSERT INTO system_alerts (
      alert_type, severity, title, message, source, context, fingerprint
    )
    VALUES (
      p_type, p_severity, p_title, p_message, p_source, p_context, v_fingerprint
    )
    RETURNING id INTO v_id;
  END IF;

  RETURN v_id;
END;
$$ LANGUAGE plpgsql;
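
-- Example (illustrative): calling twice with the same type/title/source bumps
-- occurrence_count on the existing active alert instead of inserting a duplicate:
--   SELECT upsert_alert('SYNC_STALLED', 'warning', 'Orchestrator heartbeat stale',
--                       'No heartbeat for 10 minutes', 'orchestrator');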

-- Move payload to DLQ
CREATE OR REPLACE FUNCTION move_to_dlq(
  p_payload_id UUID,
  p_error_type VARCHAR(50),
  p_error_message TEXT
) RETURNS UUID AS $$
DECLARE
  v_dlq_id UUID;
  v_payload RECORD;
BEGIN
  -- Get the original payload
  SELECT * INTO v_payload
  FROM raw_payloads
  WHERE id = p_payload_id;

  IF NOT FOUND THEN
    RAISE EXCEPTION 'Payload not found: %', p_payload_id;
  END IF;

  -- Insert into DLQ
  INSERT INTO raw_payloads_dlq (
    original_payload_id, dispensary_id, state_code, platform,
    raw_json, product_count, pricing_type, crawl_mode,
    failure_count, last_error_type, last_error_message, last_error_at,
    error_history
  )
  VALUES (
    p_payload_id, v_payload.dispensary_id,
    (SELECT state FROM dispensaries WHERE id = v_payload.dispensary_id),
    v_payload.platform,
    v_payload.raw_json, v_payload.product_count, v_payload.pricing_type, v_payload.crawl_mode,
    v_payload.hydration_attempts,
    p_error_type, p_error_message, NOW(),
    COALESCE(v_payload.hydration_error::jsonb, '[]'::jsonb) || jsonb_build_object(
      'type', p_error_type,
      'message', p_error_message,
      'at', NOW()
    )
  )
  RETURNING id INTO v_dlq_id;

  -- Mark original as processed (moved to DLQ)
  UPDATE raw_payloads
  SET processed = TRUE,
      hydration_error = 'Moved to DLQ: ' || p_error_message
  WHERE id = p_payload_id;

  -- Record metric
  PERFORM record_metric('payloads_dlq_total',
    COALESCE((SELECT metric_value FROM system_metrics_current WHERE metric_name = 'payloads_dlq_total'), 0) + 1
  );

  -- Create alert for DLQ
  PERFORM upsert_alert(
    'DLQ_ARRIVAL',
    'warning',
    'Payload moved to Dead-Letter Queue',
    p_error_message,
    'hydration',
    jsonb_build_object('payload_id', p_payload_id, 'dlq_id', v_dlq_id, 'error_type', p_error_type)
  );

  RETURN v_dlq_id;
END;
$$ LANGUAGE plpgsql;
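
-- Example (illustrative; the UUID is a placeholder for a real raw_payloads id):
--   SELECT move_to_dlq('00000000-0000-0000-0000-000000000000'::uuid,
--                      'HYDRATION_FAILURE', 'Malformed product list');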

-- Cleanup old metrics (keep 7 days of time series)
CREATE OR REPLACE FUNCTION cleanup_old_metrics() RETURNS INTEGER AS $$
DECLARE
  v_deleted INTEGER;
BEGIN
  DELETE FROM system_metrics
  WHERE recorded_at < NOW() - INTERVAL '7 days';

  GET DIAGNOSTICS v_deleted = ROW_COUNT;
  RETURN v_deleted;
END;
$$ LANGUAGE plpgsql;
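
-- Example (illustrative): call periodically from the app, or schedule it with
-- pg_cron if that extension is installed (an assumption, not part of this migration):
--   SELECT cron.schedule('cleanup-metrics', '0 3 * * *', 'SELECT cleanup_old_metrics()');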

-- ============================================================
-- ENSURE RAW_PAYLOADS HAS REQUIRED COLUMNS
-- ============================================================

-- Add state column to raw_payloads if not exists
DO $$
BEGIN
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'raw_payloads' AND column_name = 'state_code'
  ) THEN
    ALTER TABLE raw_payloads ADD COLUMN state_code VARCHAR(2);
  END IF;
END $$;

-- ============================================================
-- INITIAL METRICS
-- ============================================================

-- Initialize core metrics
INSERT INTO system_metrics_current (metric_name, metric_value, labels)
VALUES
  ('payloads_unprocessed', 0, '{}'),
  ('payloads_processed_today', 0, '{}'),
  ('hydration_errors', 0, '{}'),
  ('hydration_success_rate', 100, '{}'),
  ('canonical_rows_inserted', 0, '{}'),
  ('canonical_rows_updated', 0, '{}'),
  ('canonical_rows_discontinued', 0, '{}'),
  ('snapshot_volume', 0, '{}'),
  ('ingestion_latency_avg_ms', 0, '{}'),
  ('payloads_dlq_total', 0, '{}')
ON CONFLICT (metric_name) DO NOTHING;

-- ============================================================
-- COMMENTS
-- ============================================================

COMMENT ON TABLE sync_orchestrator_state IS 'Singleton table tracking orchestrator status and config';
COMMENT ON TABLE sync_runs IS 'History of sync runs with metrics';
COMMENT ON TABLE raw_payloads_dlq IS 'Dead-letter queue for failed payloads';
COMMENT ON TABLE system_metrics IS 'Time-series metrics storage';
COMMENT ON TABLE system_metrics_current IS 'Current metric values (fast lookup)';
COMMENT ON TABLE error_buckets IS 'Classified errors for monitoring';
COMMENT ON TABLE integrity_check_runs IS 'Integrity check execution history';
COMMENT ON TABLE integrity_check_results IS 'Individual check results';
COMMENT ON TABLE auto_fix_runs IS 'Audit log for auto-fix routines';
COMMENT ON TABLE system_alerts IS 'System alerts with deduplication';
750
backend/migrations/050_cannaiq_canonical_v2.sql
Normal file
@@ -0,0 +1,750 @@
-- ============================================================================
-- Migration 050: CannaiQ Canonical Schema v2
-- ============================================================================
--
-- Purpose: Add canonical tables for multi-state analytics, pricing engine,
--          promotions, intelligence, and brand/buyer portals.
--
-- RULES:
--   - STRICTLY ADDITIVE (no DROP, DELETE, TRUNCATE, or ALTER column type)
--   - All new tables use IF NOT EXISTS
--   - All new columns use ADD COLUMN IF NOT EXISTS
--   - All indexes use IF NOT EXISTS
--   - Compatible with existing dutchie_products, dispensaries, etc.
--
-- Run with:
--   psql $CANNAIQ_DB_URL -f migrations/050_cannaiq_canonical_v2.sql
--
-- ============================================================================


-- ============================================================================
-- SECTION 1: STATES TABLE
-- ============================================================================
-- Reference table for US states. May already exist from 041/043.
-- This is idempotent.

CREATE TABLE IF NOT EXISTS states (
  id SERIAL PRIMARY KEY,
  code VARCHAR(2) NOT NULL UNIQUE,
  name VARCHAR(100) NOT NULL,
  timezone VARCHAR(50) DEFAULT 'America/Phoenix',
  is_active BOOLEAN DEFAULT TRUE,
  crawl_enabled BOOLEAN DEFAULT TRUE,
  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

-- Insert states if not present
INSERT INTO states (code, name, timezone) VALUES
  ('AZ', 'Arizona', 'America/Phoenix'),
  ('CA', 'California', 'America/Los_Angeles'),
  ('CO', 'Colorado', 'America/Denver'),
  ('FL', 'Florida', 'America/New_York'),
  ('IL', 'Illinois', 'America/Chicago'),
  ('MA', 'Massachusetts', 'America/New_York'),
  ('MD', 'Maryland', 'America/New_York'),
  ('MI', 'Michigan', 'America/Detroit'),
  ('MO', 'Missouri', 'America/Chicago'),
  ('NV', 'Nevada', 'America/Los_Angeles'),
  ('NJ', 'New Jersey', 'America/New_York'),
  ('NY', 'New York', 'America/New_York'),
  ('OH', 'Ohio', 'America/New_York'),
  ('OK', 'Oklahoma', 'America/Chicago'),
  ('OR', 'Oregon', 'America/Los_Angeles'),
  ('PA', 'Pennsylvania', 'America/New_York'),
  ('WA', 'Washington', 'America/Los_Angeles')
ON CONFLICT (code) DO UPDATE SET
  timezone = EXCLUDED.timezone,
  updated_at = NOW();

CREATE INDEX IF NOT EXISTS idx_states_code ON states(code);
CREATE INDEX IF NOT EXISTS idx_states_active ON states(is_active) WHERE is_active = TRUE;

COMMENT ON TABLE states IS 'US states where CannaiQ operates. Single source of truth for state configuration.';


-- ============================================================================
-- SECTION 2: CHAINS TABLE (Retail Groups)
-- ============================================================================
-- Chains are multi-location operators like Curaleaf, Trulieve, Harvest, etc.

CREATE TABLE IF NOT EXISTS chains (
  id SERIAL PRIMARY KEY,
  name VARCHAR(255) NOT NULL,
  slug VARCHAR(255) NOT NULL UNIQUE,

  -- Branding
  website_url TEXT,
  logo_url TEXT,
  description TEXT,

  -- Business info
  headquarters_city VARCHAR(100),
  headquarters_state_id INTEGER REFERENCES states(id),
  founded_year INTEGER,

  -- Status
  is_active BOOLEAN DEFAULT TRUE,
  is_public BOOLEAN DEFAULT FALSE, -- Publicly traded?
  stock_ticker VARCHAR(10),

  -- Metadata
  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_chains_slug ON chains(slug);
CREATE INDEX IF NOT EXISTS idx_chains_active ON chains(is_active) WHERE is_active = TRUE;

COMMENT ON TABLE chains IS 'Retail chains/groups that own multiple dispensary locations.';


-- ============================================================================
-- SECTION 3: CANONICAL BRANDS TABLE
-- ============================================================================
-- This is the master brand catalog across all providers and states.
-- Distinct from the per-store `brands` table which tracks store-level brand presence.

CREATE TABLE IF NOT EXISTS canonical_brands (
  id SERIAL PRIMARY KEY,
  name VARCHAR(255) NOT NULL,
  slug VARCHAR(255) NOT NULL UNIQUE,

  -- External IDs from various platforms
  dutchie_brand_id VARCHAR(100),
  jane_brand_id VARCHAR(100),
  treez_brand_id VARCHAR(100),
  weedmaps_brand_id VARCHAR(100),

  -- Branding
  logo_url TEXT,
  local_logo_path TEXT, -- Local storage path
  website_url TEXT,
  instagram_handle VARCHAR(100),
  description TEXT,

  -- Classification
  is_portfolio_brand BOOLEAN DEFAULT FALSE, -- TRUE if a brand we represent
  is_house_brand BOOLEAN DEFAULT FALSE, -- TRUE if dispensary house brand
  parent_company VARCHAR(255), -- Parent company name if subsidiary

  -- State presence
  states_available TEXT[], -- Array of state codes where brand is present

  -- Status
  is_active BOOLEAN DEFAULT TRUE,
  is_verified BOOLEAN DEFAULT FALSE, -- Manually verified brand info
  verified_at TIMESTAMPTZ,

  -- Metadata
  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_canonical_brands_slug ON canonical_brands(slug);
CREATE INDEX IF NOT EXISTS idx_canonical_brands_dutchie ON canonical_brands(dutchie_brand_id) WHERE dutchie_brand_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_canonical_brands_portfolio ON canonical_brands(is_portfolio_brand) WHERE is_portfolio_brand = TRUE;
CREATE INDEX IF NOT EXISTS idx_canonical_brands_states ON canonical_brands USING GIN(states_available);

COMMENT ON TABLE canonical_brands IS 'Canonical brand catalog across all providers. Master brand reference.';
COMMENT ON COLUMN canonical_brands.is_portfolio_brand IS 'TRUE if this is a brand CannaiQ represents/manages.';


-- ============================================================================
-- SECTION 4: CRAWL_RUNS TABLE
-- ============================================================================
-- One record per crawl execution. Links to snapshots.

CREATE TABLE IF NOT EXISTS crawl_runs (
  id SERIAL PRIMARY KEY,
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
  state_id INTEGER REFERENCES states(id),

  -- Provider info
  provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',

  -- Timing
  started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  finished_at TIMESTAMPTZ,
  duration_ms INTEGER,

  -- Status
  status VARCHAR(20) NOT NULL DEFAULT 'running', -- running, success, failed, partial
  error_code VARCHAR(50),
  error_message TEXT,
  http_status INTEGER,

  -- Results
  products_found INTEGER DEFAULT 0,
  products_new INTEGER DEFAULT 0,
  products_updated INTEGER DEFAULT 0,
  products_missing INTEGER DEFAULT 0, -- Products gone from feed
  snapshots_written INTEGER DEFAULT 0,

  -- Infrastructure
  worker_id VARCHAR(100),
  worker_hostname VARCHAR(100),
  proxy_used TEXT,
  trigger_type VARCHAR(50) DEFAULT 'scheduled', -- scheduled, manual, api

  -- Metadata
  metadata JSONB DEFAULT '{}',

  created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary ON crawl_runs(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_state ON crawl_runs(state_id) WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_crawl_runs_status ON crawl_runs(status);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_started ON crawl_runs(started_at DESC);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary_started ON crawl_runs(dispensary_id, started_at DESC);

COMMENT ON TABLE crawl_runs IS 'Each crawl execution. Links to snapshots and traces.';


-- ============================================================================
-- SECTION 5: STORE_PRODUCTS TABLE (Current Menu State)
-- ============================================================================
-- Canonical representation of what's currently on the menu.
-- Provider-agnostic structure for analytics.

CREATE TABLE IF NOT EXISTS store_products (
  id SERIAL PRIMARY KEY,
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
  state_id INTEGER REFERENCES states(id),

  -- Links to canonical entities
  canonical_brand_id INTEGER REFERENCES canonical_brands(id) ON DELETE SET NULL,
  category_id INTEGER REFERENCES categories(id) ON DELETE SET NULL,

  -- Provider-specific identifiers
  provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',
  provider_product_id VARCHAR(100) NOT NULL, -- Platform product ID
  provider_brand_id VARCHAR(100), -- Platform brand ID
  enterprise_product_id VARCHAR(100), -- Cross-store product ID

  -- Raw data from platform (not normalized)
  name VARCHAR(500) NOT NULL,
  brand_name VARCHAR(255),
  category VARCHAR(100),
  subcategory VARCHAR(100),
  strain_type VARCHAR(50),
  description TEXT,

  -- Pricing (current)
  price_rec NUMERIC(10,2),
  price_med NUMERIC(10,2),
  price_rec_special NUMERIC(10,2),
  price_med_special NUMERIC(10,2),
  is_on_special BOOLEAN DEFAULT FALSE,
  special_name TEXT,
  discount_percent NUMERIC(5,2),
  price_unit VARCHAR(20) DEFAULT 'each', -- gram, ounce, each, mg

  -- Inventory
  is_in_stock BOOLEAN DEFAULT TRUE,
  stock_quantity INTEGER,
  stock_status VARCHAR(50) DEFAULT 'in_stock', -- in_stock, out_of_stock, low_stock, missing_from_feed

  -- Potency
  thc_percent NUMERIC(5,2),
  cbd_percent NUMERIC(5,2),
  thc_mg NUMERIC(10,2),
  cbd_mg NUMERIC(10,2),

  -- Weight/Size
  weight_value NUMERIC(10,2),
  weight_unit VARCHAR(20), -- g, oz, mg

  -- Images
  image_url TEXT,
  local_image_path TEXT,
  thumbnail_url TEXT,

  -- Flags
  is_featured BOOLEAN DEFAULT FALSE,
  medical_only BOOLEAN DEFAULT FALSE,
  rec_only BOOLEAN DEFAULT FALSE,

  -- Menu position (for tracking prominence)
  menu_position INTEGER,

  -- Timestamps
  first_seen_at TIMESTAMPTZ DEFAULT NOW(),
  last_seen_at TIMESTAMPTZ DEFAULT NOW(),
  last_price_change_at TIMESTAMPTZ,
  last_stock_change_at TIMESTAMPTZ,

  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW(),

  UNIQUE(dispensary_id, provider, provider_product_id)
);

CREATE INDEX IF NOT EXISTS idx_store_products_dispensary ON store_products(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_store_products_state ON store_products(state_id) WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_store_products_brand ON store_products(canonical_brand_id) WHERE canonical_brand_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_store_products_category ON store_products(category) WHERE category IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_store_products_in_stock ON store_products(dispensary_id, is_in_stock);
CREATE INDEX IF NOT EXISTS idx_store_products_special ON store_products(dispensary_id, is_on_special) WHERE is_on_special = TRUE;
CREATE INDEX IF NOT EXISTS idx_store_products_last_seen ON store_products(last_seen_at DESC);
CREATE INDEX IF NOT EXISTS idx_store_products_provider ON store_products(provider);
CREATE INDEX IF NOT EXISTS idx_store_products_enterprise ON store_products(enterprise_product_id) WHERE enterprise_product_id IS NOT NULL;

COMMENT ON TABLE store_products IS 'Current state of products on each dispensary menu. Provider-agnostic.';
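
-- Example (illustrative sketch of how a hydrator might upsert a menu row,
-- keyed on the UNIQUE constraint above; all values are placeholders):
--   INSERT INTO store_products (dispensary_id, provider, provider_product_id,
--                               name, brand_name, price_rec, is_in_stock)
--   VALUES (42, 'dutchie', 'abc123', 'Example Gummies 100mg', 'Example Brand', 25.00, TRUE)
--   ON CONFLICT (dispensary_id, provider, provider_product_id)
--   DO UPDATE SET price_rec = EXCLUDED.price_rec,
--                 is_in_stock = EXCLUDED.is_in_stock,
--                 last_seen_at = NOW(),
--                 updated_at = NOW();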


-- ============================================================================
-- SECTION 6: STORE_PRODUCT_SNAPSHOTS TABLE (Historical Data)
-- ============================================================================
-- Time-series data for analytics. One row per product per crawl.
-- CRITICAL: NEVER DELETE from this table.

CREATE TABLE IF NOT EXISTS store_product_snapshots (
  id SERIAL PRIMARY KEY,
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
  store_product_id INTEGER REFERENCES store_products(id) ON DELETE SET NULL,
  state_id INTEGER REFERENCES states(id),

  -- Provider info
  provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',
  provider_product_id VARCHAR(100),

  -- Link to crawl run
  crawl_run_id INTEGER REFERENCES crawl_runs(id) ON DELETE SET NULL,

  -- Capture timestamp
  captured_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

  -- Raw data from platform
  name VARCHAR(500),
  brand_name VARCHAR(255),
  category VARCHAR(100),
  subcategory VARCHAR(100),

  -- Pricing at time of capture
  price_rec NUMERIC(10,2),
  price_med NUMERIC(10,2),
  price_rec_special NUMERIC(10,2),
  price_med_special NUMERIC(10,2),
  is_on_special BOOLEAN DEFAULT FALSE,
  discount_percent NUMERIC(5,2),

  -- Inventory at time of capture
  is_in_stock BOOLEAN DEFAULT TRUE,
  stock_quantity INTEGER,
  stock_status VARCHAR(50) DEFAULT 'in_stock',
  is_present_in_feed BOOLEAN DEFAULT TRUE, -- FALSE = missing from feed

  -- Potency at time of capture
  thc_percent NUMERIC(5,2),
  cbd_percent NUMERIC(5,2),

  -- Menu position (for tracking prominence changes)
  menu_position INTEGER,

  -- Image URL at time of capture
  image_url TEXT,

  -- Full raw response for debugging
  raw_data JSONB,

  created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Partitioning-ready indexes (for future table partitioning by month)
CREATE INDEX IF NOT EXISTS idx_snapshots_dispensary_captured ON store_product_snapshots(dispensary_id, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_state_captured ON store_product_snapshots(state_id, captured_at DESC) WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_product_captured ON store_product_snapshots(store_product_id, captured_at DESC) WHERE store_product_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_crawl_run ON store_product_snapshots(crawl_run_id) WHERE crawl_run_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_captured_at ON store_product_snapshots(captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_brand ON store_product_snapshots(brand_name) WHERE brand_name IS NOT NULL;

COMMENT ON TABLE store_product_snapshots IS 'Historical crawl data. One row per product per crawl. NEVER DELETE.';
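
-- Example (illustrative; store_product_id 123 is a placeholder): daily rec-price
-- range for one menu item over the last 30 days:
--   SELECT captured_at::date AS day, MIN(price_rec) AS low, MAX(price_rec) AS high
--   FROM store_product_snapshots
--   WHERE store_product_id = 123
--     AND captured_at >= NOW() - INTERVAL '30 days'
--   GROUP BY 1
--   ORDER BY 1;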


-- ============================================================================
-- SECTION 7: ADD state_id AND chain_id TO DISPENSARIES
-- ============================================================================
-- Link dispensaries to the states and chains tables.

ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS state_id INTEGER REFERENCES states(id);
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS chain_id INTEGER REFERENCES chains(id);

CREATE INDEX IF NOT EXISTS idx_dispensaries_state_id ON dispensaries(state_id);
CREATE INDEX IF NOT EXISTS idx_dispensaries_chain_id ON dispensaries(chain_id) WHERE chain_id IS NOT NULL;

-- Backfill state_id from the existing state column
UPDATE dispensaries d
SET state_id = s.id
FROM states s
WHERE d.state = s.code
  AND d.state_id IS NULL;

COMMENT ON COLUMN dispensaries.state_id IS 'FK to states table. Canonical state reference.';
COMMENT ON COLUMN dispensaries.chain_id IS 'FK to chains table. NULL if independent dispensary.';


-- ============================================================================
-- SECTION 8: BRAND PENETRATION TABLE
-- ============================================================================
-- Pre-computed brand presence across stores for analytics dashboards.

CREATE TABLE IF NOT EXISTS brand_penetration (
  id SERIAL PRIMARY KEY,
  canonical_brand_id INTEGER NOT NULL REFERENCES canonical_brands(id) ON DELETE CASCADE,
  state_id INTEGER NOT NULL REFERENCES states(id) ON DELETE CASCADE,

  -- Metrics
  stores_carrying INTEGER DEFAULT 0,
  stores_total INTEGER DEFAULT 0,
  penetration_pct NUMERIC(5,2) DEFAULT 0,

  -- Product breakdown
  products_count INTEGER DEFAULT 0,
  products_in_stock INTEGER DEFAULT 0,
  products_on_special INTEGER DEFAULT 0,

  -- Pricing
  avg_price NUMERIC(10,2),
  min_price NUMERIC(10,2),
  max_price NUMERIC(10,2),

  -- Time range
  calculated_at TIMESTAMPTZ DEFAULT NOW(),
  period_start TIMESTAMPTZ,
  period_end TIMESTAMPTZ,

  UNIQUE(canonical_brand_id, state_id, calculated_at)
);

CREATE INDEX IF NOT EXISTS idx_brand_penetration_brand ON brand_penetration(canonical_brand_id);
CREATE INDEX IF NOT EXISTS idx_brand_penetration_state ON brand_penetration(state_id);
CREATE INDEX IF NOT EXISTS idx_brand_penetration_calculated ON brand_penetration(calculated_at DESC);

COMMENT ON TABLE brand_penetration IS 'Pre-computed brand penetration metrics by state.';


-- ============================================================================
-- SECTION 9: PRICE_ALERTS TABLE
-- ============================================================================
-- Track significant price changes for intelligence/alerts.

CREATE TABLE IF NOT EXISTS price_alerts (
  id SERIAL PRIMARY KEY,
  store_product_id INTEGER REFERENCES store_products(id) ON DELETE CASCADE,
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
  state_id INTEGER REFERENCES states(id),

  -- What changed
  alert_type VARCHAR(50) NOT NULL, -- price_drop, price_increase, new_special, special_ended

  -- Values
  old_price NUMERIC(10,2),
  new_price NUMERIC(10,2),
  change_amount NUMERIC(10,2),
  change_percent NUMERIC(5,2),

  -- Context
  product_name VARCHAR(500),
  brand_name VARCHAR(255),
  category VARCHAR(100),

  -- Status
  is_processed BOOLEAN DEFAULT FALSE,
  processed_at TIMESTAMPTZ,

  created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_price_alerts_dispensary ON price_alerts(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_price_alerts_state ON price_alerts(state_id) WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_price_alerts_type ON price_alerts(alert_type);
CREATE INDEX IF NOT EXISTS idx_price_alerts_unprocessed ON price_alerts(is_processed) WHERE is_processed = FALSE;
CREATE INDEX IF NOT EXISTS idx_price_alerts_created ON price_alerts(created_at DESC);

COMMENT ON TABLE price_alerts IS 'Significant price changes for intelligence/alerting.';
-- ============================================================================
-- SECTION 10: RAW_PAYLOADS TABLE
-- ============================================================================
-- Store raw API responses for replay/debugging. Separate from snapshots.

CREATE TABLE IF NOT EXISTS raw_payloads (
    id SERIAL PRIMARY KEY,
    dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
    crawl_run_id INTEGER REFERENCES crawl_runs(id) ON DELETE SET NULL,

    -- Payload info
    provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',
    payload_type VARCHAR(50) NOT NULL DEFAULT 'products', -- products, brands, specials

    -- The raw data
    payload JSONB NOT NULL,
    payload_size_bytes INTEGER,

    -- Deduplication
    payload_hash VARCHAR(64), -- SHA256 for deduplication

    -- Processing status
    is_processed BOOLEAN DEFAULT FALSE,
    processed_at TIMESTAMPTZ,

    captured_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_raw_payloads_dispensary ON raw_payloads(dispensary_id, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_raw_payloads_crawl_run ON raw_payloads(crawl_run_id) WHERE crawl_run_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_raw_payloads_unprocessed ON raw_payloads(is_processed) WHERE is_processed = FALSE;
CREATE INDEX IF NOT EXISTS idx_raw_payloads_hash ON raw_payloads(payload_hash) WHERE payload_hash IS NOT NULL;

COMMENT ON TABLE raw_payloads IS 'Raw API responses for replay/debugging. Enables re-hydration.';

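-- Example (illustrative sketch): skip storing a payload already captured for a
-- dispensary by comparing payload_hash. Assumes the pgcrypto extension for
-- digest(); the dispensary id and payload are placeholders.
INSERT INTO raw_payloads (dispensary_id, payload, payload_size_bytes, payload_hash)
SELECT 123,
       p.doc,
       octet_length(p.doc::text),
       encode(digest(p.doc::text, 'sha256'), 'hex')
FROM (SELECT '{"products": []}'::jsonb AS doc) p
WHERE NOT EXISTS (
    SELECT 1 FROM raw_payloads r
    WHERE r.dispensary_id = 123
      AND r.payload_hash = encode(digest(p.doc::text, 'sha256'), 'hex')
);
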
-- ============================================================================
-- SECTION 11: ANALYTICS CACHE TABLES
-- ============================================================================
-- Pre-computed analytics for dashboard performance.

-- Daily store metrics
CREATE TABLE IF NOT EXISTS analytics_store_daily (
    id SERIAL PRIMARY KEY,
    dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
    state_id INTEGER REFERENCES states(id),
    date DATE NOT NULL,

    -- Product counts
    total_products INTEGER DEFAULT 0,
    in_stock_products INTEGER DEFAULT 0,
    out_of_stock_products INTEGER DEFAULT 0,
    on_special_products INTEGER DEFAULT 0,

    -- Brand/category diversity
    unique_brands INTEGER DEFAULT 0,
    unique_categories INTEGER DEFAULT 0,

    -- Pricing
    avg_price NUMERIC(10,2),
    median_price NUMERIC(10,2),

    -- Crawl health
    crawl_count INTEGER DEFAULT 0,
    successful_crawls INTEGER DEFAULT 0,

    created_at TIMESTAMPTZ DEFAULT NOW(),

    UNIQUE(dispensary_id, date)
);

CREATE INDEX IF NOT EXISTS idx_analytics_store_daily_dispensary ON analytics_store_daily(dispensary_id, date DESC);
CREATE INDEX IF NOT EXISTS idx_analytics_store_daily_state ON analytics_store_daily(state_id, date DESC) WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_analytics_store_daily_date ON analytics_store_daily(date DESC);


-- Daily brand metrics
CREATE TABLE IF NOT EXISTS analytics_brand_daily (
    id SERIAL PRIMARY KEY,
    canonical_brand_id INTEGER NOT NULL REFERENCES canonical_brands(id) ON DELETE CASCADE,
    state_id INTEGER REFERENCES states(id),
    date DATE NOT NULL,

    -- Presence
    stores_carrying INTEGER DEFAULT 0,
    products_count INTEGER DEFAULT 0,

    -- Stock
    in_stock_count INTEGER DEFAULT 0,
    out_of_stock_count INTEGER DEFAULT 0,

    -- Pricing
    avg_price NUMERIC(10,2),
    min_price NUMERIC(10,2),
    max_price NUMERIC(10,2),
    on_special_count INTEGER DEFAULT 0,

    created_at TIMESTAMPTZ DEFAULT NOW(),

    UNIQUE(canonical_brand_id, state_id, date)
);

CREATE INDEX IF NOT EXISTS idx_analytics_brand_daily_brand ON analytics_brand_daily(canonical_brand_id, date DESC);
CREATE INDEX IF NOT EXISTS idx_analytics_brand_daily_state ON analytics_brand_daily(state_id, date DESC) WHERE state_id IS NOT NULL;

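-- Example (illustrative sketch): a nightly job could roll up today's row per
-- store from store_products, leaning on UNIQUE(dispensary_id, date). MIN(state_id)
-- is used only because state_id is uniform within a dispensary.
INSERT INTO analytics_store_daily (dispensary_id, state_id, date, total_products,
                                   in_stock_products, on_special_products, unique_brands, avg_price)
SELECT sp.dispensary_id,
       MIN(sp.state_id),
       CURRENT_DATE,
       COUNT(*),
       COUNT(*) FILTER (WHERE sp.is_in_stock),
       COUNT(*) FILTER (WHERE sp.is_on_special),
       COUNT(DISTINCT sp.brand_name),
       AVG(sp.price_rec)
FROM store_products sp
GROUP BY sp.dispensary_id
ON CONFLICT (dispensary_id, date) DO UPDATE SET
    total_products = EXCLUDED.total_products,
    in_stock_products = EXCLUDED.in_stock_products,
    on_special_products = EXCLUDED.on_special_products,
    unique_brands = EXCLUDED.unique_brands,
    avg_price = EXCLUDED.avg_price;
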
-- ============================================================================
-- SECTION 12: VIEWS FOR COMPATIBILITY
-- ============================================================================

-- View: Latest snapshot per store product
CREATE OR REPLACE VIEW v_latest_store_snapshots AS
SELECT DISTINCT ON (dispensary_id, provider_product_id)
    sps.*
FROM store_product_snapshots sps
ORDER BY dispensary_id, provider_product_id, captured_at DESC;

-- View: Crawl run summary per dispensary
CREATE OR REPLACE VIEW v_dispensary_crawl_summary AS
SELECT
    d.id AS dispensary_id,
    COALESCE(d.dba_name, d.name) AS dispensary_name,
    d.city,
    d.state,
    d.state_id,
    s.name AS state_name,
    COUNT(DISTINCT sp.id) AS current_product_count,
    COUNT(DISTINCT sp.id) FILTER (WHERE sp.is_in_stock) AS in_stock_count,
    COUNT(DISTINCT sp.id) FILTER (WHERE sp.is_on_special) AS on_special_count,
    MAX(cr.finished_at) AS last_crawl_at,
    (SELECT status FROM crawl_runs WHERE dispensary_id = d.id ORDER BY started_at DESC LIMIT 1) AS last_crawl_status
FROM dispensaries d
LEFT JOIN states s ON s.id = d.state_id
LEFT JOIN store_products sp ON sp.dispensary_id = d.id
LEFT JOIN crawl_runs cr ON cr.dispensary_id = d.id
GROUP BY d.id, d.dba_name, d.name, d.city, d.state, d.state_id, s.name;

-- View: Brand presence across stores
CREATE OR REPLACE VIEW v_brand_store_presence AS
SELECT
    cb.id AS brand_id,
    cb.name AS brand_name,
    cb.slug AS brand_slug,
    s.id AS state_id,
    s.code AS state_code,
    COUNT(DISTINCT sp.dispensary_id) AS store_count,
    COUNT(sp.id) AS product_count,
    COUNT(sp.id) FILTER (WHERE sp.is_in_stock) AS in_stock_count,
    AVG(sp.price_rec) AS avg_price,
    MIN(sp.price_rec) AS min_price,
    MAX(sp.price_rec) AS max_price
FROM canonical_brands cb
JOIN store_products sp ON sp.canonical_brand_id = cb.id
LEFT JOIN states s ON s.id = sp.state_id
GROUP BY cb.id, cb.name, cb.slug, s.id, s.code;

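-- Example usage (illustrative only): surface stores whose menus look stale.
SELECT dispensary_name, state, current_product_count, last_crawl_at
FROM v_dispensary_crawl_summary
WHERE last_crawl_at IS NULL
   OR last_crawl_at < NOW() - INTERVAL '24 hours'
ORDER BY last_crawl_at NULLS FIRST;
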
-- ============================================================================
-- SECTION 13: ADD FK FROM store_product_snapshots TO crawl_runs
-- ============================================================================

DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM information_schema.table_constraints
        WHERE constraint_name = 'store_product_snapshots_crawl_run_id_fkey'
    ) THEN
        ALTER TABLE store_product_snapshots
            ADD CONSTRAINT store_product_snapshots_crawl_run_id_fkey
            FOREIGN KEY (crawl_run_id) REFERENCES crawl_runs(id) ON DELETE SET NULL;
    END IF;
END $$;


-- ============================================================================
-- SECTION 14: ADD crawl_run_id TO crawl_orchestration_traces
-- ============================================================================

ALTER TABLE crawl_orchestration_traces
    ADD COLUMN IF NOT EXISTS crawl_run_id INTEGER REFERENCES crawl_runs(id) ON DELETE SET NULL;

CREATE INDEX IF NOT EXISTS idx_traces_crawl_run
    ON crawl_orchestration_traces(crawl_run_id)
    WHERE crawl_run_id IS NOT NULL;


-- ============================================================================
-- SECTION 15: UPDATE dispensary_crawler_profiles
-- ============================================================================
-- Add status columns for profile lifecycle.

ALTER TABLE dispensary_crawler_profiles
    ADD COLUMN IF NOT EXISTS status VARCHAR(50) DEFAULT 'sandbox';

ALTER TABLE dispensary_crawler_profiles
    ADD COLUMN IF NOT EXISTS allow_autopromote BOOLEAN DEFAULT FALSE;

ALTER TABLE dispensary_crawler_profiles
    ADD COLUMN IF NOT EXISTS validated_at TIMESTAMPTZ;

CREATE INDEX IF NOT EXISTS idx_profiles_status
    ON dispensary_crawler_profiles(status);

COMMENT ON COLUMN dispensary_crawler_profiles.status IS 'Profile status: sandbox, production, needs_manual, disabled';

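-- Example (illustrative only): an operator or automated check could promote a
-- validated sandbox profile using the columns added above.
UPDATE dispensary_crawler_profiles
SET status = 'production',
    validated_at = NOW()
WHERE status = 'sandbox'
  AND allow_autopromote = TRUE;
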
-- ============================================================================
-- SECTION 16: UPDATE dispensary_crawl_jobs WITH ADDITIONAL COLUMNS
-- ============================================================================
-- Add columns needed for enhanced job tracking.

ALTER TABLE dispensary_crawl_jobs
    ADD COLUMN IF NOT EXISTS worker_id VARCHAR(100);

ALTER TABLE dispensary_crawl_jobs
    ADD COLUMN IF NOT EXISTS worker_hostname VARCHAR(100);

ALTER TABLE dispensary_crawl_jobs
    ADD COLUMN IF NOT EXISTS claimed_by VARCHAR(100);

ALTER TABLE dispensary_crawl_jobs
    ADD COLUMN IF NOT EXISTS claimed_at TIMESTAMPTZ;

ALTER TABLE dispensary_crawl_jobs
    ADD COLUMN IF NOT EXISTS locked_until TIMESTAMPTZ;

ALTER TABLE dispensary_crawl_jobs
    ADD COLUMN IF NOT EXISTS last_heartbeat_at TIMESTAMPTZ;

ALTER TABLE dispensary_crawl_jobs
    ADD COLUMN IF NOT EXISTS max_retries INTEGER DEFAULT 3;

ALTER TABLE dispensary_crawl_jobs
    ADD COLUMN IF NOT EXISTS products_upserted INTEGER DEFAULT 0;

ALTER TABLE dispensary_crawl_jobs
    ADD COLUMN IF NOT EXISTS snapshots_created INTEGER DEFAULT 0;

ALTER TABLE dispensary_crawl_jobs
    ADD COLUMN IF NOT EXISTS current_page INTEGER DEFAULT 0;

ALTER TABLE dispensary_crawl_jobs
    ADD COLUMN IF NOT EXISTS total_pages INTEGER;

CREATE INDEX IF NOT EXISTS idx_crawl_jobs_status_pending ON dispensary_crawl_jobs(status) WHERE status = 'pending';
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_claimed_by ON dispensary_crawl_jobs(claimed_by) WHERE claimed_by IS NOT NULL;

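-- Example (illustrative sketch, not the actual worker code): atomically claim
-- one pending job using the lease columns above. 'worker-1' is a placeholder.
UPDATE dispensary_crawl_jobs
SET status = 'running',
    claimed_by = 'worker-1',
    claimed_at = NOW(),
    locked_until = NOW() + INTERVAL '10 minutes',
    last_heartbeat_at = NOW()
WHERE id = (
    SELECT id FROM dispensary_crawl_jobs
    WHERE status = 'pending'
    ORDER BY id
    LIMIT 1
    FOR UPDATE SKIP LOCKED
)
RETURNING id;
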
-- ============================================================================
-- SECTION 17: QUEUE MONITORING VIEWS
-- ============================================================================

CREATE OR REPLACE VIEW v_queue_stats AS
SELECT
    (SELECT COUNT(*) FROM dispensary_crawl_jobs WHERE status = 'pending') AS pending_jobs,
    (SELECT COUNT(*) FROM dispensary_crawl_jobs WHERE status = 'running') AS running_jobs,
    (SELECT COUNT(*) FROM dispensary_crawl_jobs WHERE status = 'completed' AND completed_at > NOW() - INTERVAL '1 hour') AS completed_1h,
    (SELECT COUNT(*) FROM dispensary_crawl_jobs WHERE status = 'failed' AND completed_at > NOW() - INTERVAL '1 hour') AS failed_1h,
    (SELECT COUNT(DISTINCT worker_id) FROM dispensary_crawl_jobs WHERE status = 'running' AND worker_id IS NOT NULL) AS active_workers,
    (SELECT AVG(EXTRACT(EPOCH FROM (completed_at - started_at))) FROM dispensary_crawl_jobs WHERE status = 'completed' AND completed_at > NOW() - INTERVAL '1 hour') AS avg_duration_seconds;

CREATE OR REPLACE VIEW v_active_workers AS
SELECT
    worker_id,
    worker_hostname,
    COUNT(*) AS current_jobs,
    SUM(products_found) AS total_products_found,
    SUM(products_upserted) AS total_products_upserted,
    SUM(snapshots_created) AS total_snapshots,
    MIN(claimed_at) AS first_claimed_at,
    MAX(last_heartbeat_at) AS last_heartbeat
FROM dispensary_crawl_jobs
WHERE status = 'running' AND worker_id IS NOT NULL
GROUP BY worker_id, worker_hostname;

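-- Example usage (illustrative only): a dashboard can poll queue health cheaply.
SELECT pending_jobs, running_jobs, failed_1h, active_workers,
       ROUND(avg_duration_seconds::numeric, 1) AS avg_duration_s
FROM v_queue_stats;
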
-- ============================================================================
-- DONE
-- ============================================================================

SELECT 'Migration 050 completed successfully. Canonical schema v2 is ready.' AS status;

642
backend/migrations/051_cannaiq_canonical_safe_bootstrap.sql
Normal file
@@ -0,0 +1,642 @@

-- ============================================================================
-- Migration 051: CannaiQ Canonical Schema - Safe Bootstrap
-- ============================================================================
--
-- Purpose: Create the canonical CannaiQ schema tables from scratch.
-- This migration is FULLY IDEMPOTENT and safe to run multiple times.
--
-- SAFETY RULES FOLLOWED:
-- 1. ALL tables use CREATE TABLE IF NOT EXISTS
-- 2. ALL columns use ALTER TABLE ADD COLUMN IF NOT EXISTS
-- 3. ALL indexes use CREATE INDEX IF NOT EXISTS
-- 4. NO DROP, DELETE, TRUNCATE, or destructive operations
-- 5. NO assumptions about existing data or column existence
-- 6. NO dependencies on migrations 041, 043, or 050
-- 7. Compatible with dutchie_menus database as it exists today
-- 8. Safe handling of pre-existing states table with missing columns
--
-- Tables Created:
-- - states (US state reference table)
-- - chains (retail chain/group table)
-- - crawl_runs (crawl execution records)
-- - store_products (current menu state)
-- - store_product_snapshots (historical price/stock data)
--
-- Columns Added:
-- - dispensaries.state_id (FK to states)
-- - dispensaries.chain_id (FK to chains)
--
-- Run with:
--   psql "postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus" \
--     -f migrations/051_cannaiq_canonical_safe_bootstrap.sql
--
-- ============================================================================


-- ============================================================================
-- SECTION 1: STATES TABLE
-- ============================================================================
-- Reference table for US states where CannaiQ operates.
-- This section handles the case where the table exists but is missing columns.

-- First, create the table if it doesn't exist (minimal definition)
CREATE TABLE IF NOT EXISTS states (
    id SERIAL PRIMARY KEY,
    code VARCHAR(2) NOT NULL,
    name VARCHAR(100) NOT NULL,
    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW()
);

-- Now safely add any missing columns (each is independent, won't fail if exists)
ALTER TABLE states ADD COLUMN IF NOT EXISTS timezone TEXT;
ALTER TABLE states ADD COLUMN IF NOT EXISTS is_active BOOLEAN DEFAULT TRUE;
ALTER TABLE states ADD COLUMN IF NOT EXISTS crawl_enabled BOOLEAN DEFAULT TRUE;

-- Add unique constraint on code if not exists
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'states_code_key' AND conrelid = 'states'::regclass
    ) THEN
        -- Check if there's already a unique constraint with a different name
        IF NOT EXISTS (
            SELECT 1 FROM pg_indexes
            WHERE tablename = 'states' AND indexdef LIKE '%UNIQUE%code%'
        ) THEN
            ALTER TABLE states ADD CONSTRAINT states_code_key UNIQUE (code);
        END IF;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL; -- Constraint already exists
    WHEN OTHERS THEN
        NULL; -- Handle any other errors gracefully
END $$;

-- Set default timezone values for existing rows that have NULL
UPDATE states SET timezone = 'America/Phoenix' WHERE timezone IS NULL AND code = 'AZ';
UPDATE states SET timezone = 'America/Los_Angeles' WHERE timezone IS NULL AND code IN ('CA', 'NV', 'OR', 'WA');
UPDATE states SET timezone = 'America/Denver' WHERE timezone IS NULL AND code = 'CO';
UPDATE states SET timezone = 'America/New_York' WHERE timezone IS NULL AND code IN ('FL', 'MA', 'MD', 'NJ', 'NY', 'OH', 'PA');
UPDATE states SET timezone = 'America/Chicago' WHERE timezone IS NULL AND code IN ('IL', 'MO', 'OK');
UPDATE states SET timezone = 'America/Detroit' WHERE timezone IS NULL AND code = 'MI';

-- Set default is_active for existing rows
UPDATE states SET is_active = TRUE WHERE is_active IS NULL;
UPDATE states SET crawl_enabled = TRUE WHERE crawl_enabled IS NULL;

-- Insert known states (idempotent - ON CONFLICT DO UPDATE to fill missing values)
INSERT INTO states (code, name, timezone, is_active, crawl_enabled) VALUES
    ('AZ', 'Arizona', 'America/Phoenix', TRUE, TRUE),
    ('CA', 'California', 'America/Los_Angeles', TRUE, TRUE),
    ('CO', 'Colorado', 'America/Denver', TRUE, TRUE),
    ('FL', 'Florida', 'America/New_York', TRUE, TRUE),
    ('IL', 'Illinois', 'America/Chicago', TRUE, TRUE),
    ('MA', 'Massachusetts', 'America/New_York', TRUE, TRUE),
    ('MD', 'Maryland', 'America/New_York', TRUE, TRUE),
    ('MI', 'Michigan', 'America/Detroit', TRUE, TRUE),
    ('MO', 'Missouri', 'America/Chicago', TRUE, TRUE),
    ('NV', 'Nevada', 'America/Los_Angeles', TRUE, TRUE),
    ('NJ', 'New Jersey', 'America/New_York', TRUE, TRUE),
    ('NY', 'New York', 'America/New_York', TRUE, TRUE),
    ('OH', 'Ohio', 'America/New_York', TRUE, TRUE),
    ('OK', 'Oklahoma', 'America/Chicago', TRUE, TRUE),
    ('OR', 'Oregon', 'America/Los_Angeles', TRUE, TRUE),
    ('PA', 'Pennsylvania', 'America/New_York', TRUE, TRUE),
    ('WA', 'Washington', 'America/Los_Angeles', TRUE, TRUE)
ON CONFLICT (code) DO UPDATE SET
    timezone = COALESCE(states.timezone, EXCLUDED.timezone),
    is_active = COALESCE(states.is_active, EXCLUDED.is_active),
    crawl_enabled = COALESCE(states.crawl_enabled, EXCLUDED.crawl_enabled),
    updated_at = NOW();

CREATE INDEX IF NOT EXISTS idx_states_code ON states(code);
CREATE INDEX IF NOT EXISTS idx_states_active ON states(is_active) WHERE is_active = TRUE;

COMMENT ON TABLE states IS 'US states where CannaiQ operates. Single source of truth for state configuration.';

-- ============================================================================
-- SECTION 2: CHAINS TABLE
-- ============================================================================
-- Retail chains/groups that own multiple dispensary locations.
-- Examples: Curaleaf, Trulieve, Harvest, Columbia Care

CREATE TABLE IF NOT EXISTS chains (
    id SERIAL PRIMARY KEY,
    name VARCHAR(255) NOT NULL,
    slug VARCHAR(255) NOT NULL,
    website_url TEXT,
    logo_url TEXT,
    description TEXT,
    headquarters_city VARCHAR(100),
    headquarters_state_id INTEGER,
    founded_year INTEGER,
    is_active BOOLEAN DEFAULT TRUE,
    is_public BOOLEAN DEFAULT FALSE,
    stock_ticker VARCHAR(10),
    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW()
);

-- Add unique constraint on slug if not exists
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'chains_slug_key' AND conrelid = 'chains'::regclass
    ) THEN
        ALTER TABLE chains ADD CONSTRAINT chains_slug_key UNIQUE (slug);
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;
    WHEN OTHERS THEN
        NULL;
END $$;

-- Add FK to states if not exists
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'chains_headquarters_state_id_fkey'
    ) THEN
        ALTER TABLE chains
            ADD CONSTRAINT chains_headquarters_state_id_fkey
            FOREIGN KEY (headquarters_state_id) REFERENCES states(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;
    WHEN OTHERS THEN
        NULL;
END $$;

CREATE INDEX IF NOT EXISTS idx_chains_slug ON chains(slug);
CREATE INDEX IF NOT EXISTS idx_chains_active ON chains(is_active) WHERE is_active = TRUE;

COMMENT ON TABLE chains IS 'Retail chains/groups that own multiple dispensary locations.';

-- ============================================================================
-- SECTION 3: ADD state_id AND chain_id TO DISPENSARIES
-- ============================================================================
-- Link existing dispensaries table to states and chains.

ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS state_id INTEGER;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS chain_id INTEGER;

-- Add FK constraints if not exist
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'dispensaries_state_id_fkey'
    ) THEN
        ALTER TABLE dispensaries
            ADD CONSTRAINT dispensaries_state_id_fkey
            FOREIGN KEY (state_id) REFERENCES states(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;
    WHEN OTHERS THEN
        NULL;
END $$;

DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'dispensaries_chain_id_fkey'
    ) THEN
        ALTER TABLE dispensaries
            ADD CONSTRAINT dispensaries_chain_id_fkey
            FOREIGN KEY (chain_id) REFERENCES chains(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;
    WHEN OTHERS THEN
        NULL;
END $$;

CREATE INDEX IF NOT EXISTS idx_dispensaries_state_id ON dispensaries(state_id);
CREATE INDEX IF NOT EXISTS idx_dispensaries_chain_id ON dispensaries(chain_id) WHERE chain_id IS NOT NULL;

-- Backfill state_id from existing state column (safe - only updates NULL values)
UPDATE dispensaries d
SET state_id = s.id
FROM states s
WHERE d.state = s.code
  AND d.state_id IS NULL;

COMMENT ON COLUMN dispensaries.state_id IS 'FK to states table. Canonical state reference.';
COMMENT ON COLUMN dispensaries.chain_id IS 'FK to chains table. NULL if independent dispensary.';

-- ============================================================================
-- SECTION 4: CRAWL_RUNS TABLE
-- ============================================================================
-- One record per crawl execution. Links to snapshots.

CREATE TABLE IF NOT EXISTS crawl_runs (
    id SERIAL PRIMARY KEY,
    dispensary_id INTEGER NOT NULL,
    state_id INTEGER,

    -- Provider info
    provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',

    -- Timing
    started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    finished_at TIMESTAMPTZ,
    duration_ms INTEGER,

    -- Status
    status VARCHAR(20) NOT NULL DEFAULT 'running',
    error_code VARCHAR(50),
    error_message TEXT,
    http_status INTEGER,

    -- Results
    products_found INTEGER DEFAULT 0,
    products_new INTEGER DEFAULT 0,
    products_updated INTEGER DEFAULT 0,
    products_missing INTEGER DEFAULT 0,
    snapshots_written INTEGER DEFAULT 0,

    -- Infrastructure
    worker_id VARCHAR(100),
    worker_hostname VARCHAR(100),
    proxy_used TEXT,
    trigger_type VARCHAR(50) DEFAULT 'scheduled',

    -- Metadata
    metadata JSONB DEFAULT '{}',

    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Add FK constraints if not exist
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'crawl_runs_dispensary_id_fkey'
    ) THEN
        ALTER TABLE crawl_runs
            ADD CONSTRAINT crawl_runs_dispensary_id_fkey
            FOREIGN KEY (dispensary_id) REFERENCES dispensaries(id) ON DELETE CASCADE;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;
    WHEN OTHERS THEN
        NULL;
END $$;

DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'crawl_runs_state_id_fkey'
    ) THEN
        ALTER TABLE crawl_runs
            ADD CONSTRAINT crawl_runs_state_id_fkey
            FOREIGN KEY (state_id) REFERENCES states(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;
    WHEN OTHERS THEN
        NULL;
END $$;

CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary ON crawl_runs(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_state ON crawl_runs(state_id) WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_crawl_runs_status ON crawl_runs(status);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_started ON crawl_runs(started_at DESC);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary_started ON crawl_runs(dispensary_id, started_at DESC);

COMMENT ON TABLE crawl_runs IS 'Each crawl execution. Links to snapshots and traces.';

-- ============================================================================
-- SECTION 5: STORE_PRODUCTS TABLE
-- ============================================================================
-- Current state of products on each dispensary menu.
-- Provider-agnostic structure for analytics.

CREATE TABLE IF NOT EXISTS store_products (
    id SERIAL PRIMARY KEY,
    dispensary_id INTEGER NOT NULL,
    state_id INTEGER,

    -- Provider-specific identifiers
    provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',
    provider_product_id VARCHAR(100) NOT NULL,
    provider_brand_id VARCHAR(100),
    enterprise_product_id VARCHAR(100),

    -- Raw data from platform (not normalized)
    name VARCHAR(500) NOT NULL,
    brand_name VARCHAR(255),
    category VARCHAR(100),
    subcategory VARCHAR(100),
    strain_type VARCHAR(50),
    description TEXT,

    -- Pricing (current)
    price_rec NUMERIC(10,2),
    price_med NUMERIC(10,2),
    price_rec_special NUMERIC(10,2),
    price_med_special NUMERIC(10,2),
    is_on_special BOOLEAN DEFAULT FALSE,
    special_name TEXT,
    discount_percent NUMERIC(5,2),
    price_unit VARCHAR(20) DEFAULT 'each',

    -- Inventory
    is_in_stock BOOLEAN DEFAULT TRUE,
    stock_quantity INTEGER,
    stock_status VARCHAR(50) DEFAULT 'in_stock',

    -- Potency
    thc_percent NUMERIC(5,2),
    cbd_percent NUMERIC(5,2),
    thc_mg NUMERIC(10,2),
    cbd_mg NUMERIC(10,2),

    -- Weight/Size
    weight_value NUMERIC(10,2),
    weight_unit VARCHAR(20),

    -- Images
    image_url TEXT,
    local_image_path TEXT,
    thumbnail_url TEXT,

    -- Flags
    is_featured BOOLEAN DEFAULT FALSE,
    medical_only BOOLEAN DEFAULT FALSE,
    rec_only BOOLEAN DEFAULT FALSE,

    -- Menu position (for tracking prominence)
    menu_position INTEGER,

    -- Timestamps
    first_seen_at TIMESTAMPTZ DEFAULT NOW(),
    last_seen_at TIMESTAMPTZ DEFAULT NOW(),
    last_price_change_at TIMESTAMPTZ,
    last_stock_change_at TIMESTAMPTZ,

    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW()
);

-- Add unique constraint if not exists
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'store_products_dispensary_provider_product_key'
    ) THEN
        ALTER TABLE store_products
            ADD CONSTRAINT store_products_dispensary_provider_product_key
            UNIQUE (dispensary_id, provider, provider_product_id);
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;
    WHEN OTHERS THEN
        NULL;
END $$;

-- Add FK constraints if not exist
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'store_products_dispensary_id_fkey'
    ) THEN
        ALTER TABLE store_products
            ADD CONSTRAINT store_products_dispensary_id_fkey
            FOREIGN KEY (dispensary_id) REFERENCES dispensaries(id) ON DELETE CASCADE;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;
    WHEN OTHERS THEN
        NULL;
END $$;

DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'store_products_state_id_fkey'
    ) THEN
        ALTER TABLE store_products
            ADD CONSTRAINT store_products_state_id_fkey
            FOREIGN KEY (state_id) REFERENCES states(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;
    WHEN OTHERS THEN
        NULL;
END $$;

CREATE INDEX IF NOT EXISTS idx_store_products_dispensary ON store_products(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_store_products_state ON store_products(state_id) WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_store_products_category ON store_products(category) WHERE category IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_store_products_brand_name ON store_products(brand_name) WHERE brand_name IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_store_products_in_stock ON store_products(dispensary_id, is_in_stock);
CREATE INDEX IF NOT EXISTS idx_store_products_special ON store_products(dispensary_id, is_on_special) WHERE is_on_special = TRUE;
CREATE INDEX IF NOT EXISTS idx_store_products_last_seen ON store_products(last_seen_at DESC);
CREATE INDEX IF NOT EXISTS idx_store_products_provider ON store_products(provider);
CREATE INDEX IF NOT EXISTS idx_store_products_enterprise ON store_products(enterprise_product_id) WHERE enterprise_product_id IS NOT NULL;

COMMENT ON TABLE store_products IS 'Current state of products on each dispensary menu. Provider-agnostic.';

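-- Example (illustrative sketch): a crawler's per-product upsert would target
-- the (dispensary_id, provider, provider_product_id) key defined above. All
-- values here are placeholders.
INSERT INTO store_products (dispensary_id, provider, provider_product_id, name,
                            brand_name, price_rec, is_in_stock)
VALUES (123, 'dutchie', 'abc123', 'Example Flower 3.5g', 'Example Brand', 35.00, TRUE)
ON CONFLICT (dispensary_id, provider, provider_product_id) DO UPDATE SET
    name = EXCLUDED.name,
    brand_name = EXCLUDED.brand_name,
    price_rec = EXCLUDED.price_rec,
    is_in_stock = EXCLUDED.is_in_stock,
    last_seen_at = NOW(),
    updated_at = NOW();
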
-- ============================================================================
-- SECTION 6: STORE_PRODUCT_SNAPSHOTS TABLE
-- ============================================================================
-- Historical price/stock data. One row per product per crawl.
-- CRITICAL: NEVER DELETE from this table.

CREATE TABLE IF NOT EXISTS store_product_snapshots (
    id SERIAL PRIMARY KEY,
    dispensary_id INTEGER NOT NULL,
    store_product_id INTEGER,
    state_id INTEGER,

    -- Provider info
    provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',
    provider_product_id VARCHAR(100),

    -- Link to crawl run
    crawl_run_id INTEGER,

    -- Capture timestamp
    captured_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    -- Raw data from platform
    name VARCHAR(500),
    brand_name VARCHAR(255),
    category VARCHAR(100),
    subcategory VARCHAR(100),

    -- Pricing at time of capture
    price_rec NUMERIC(10,2),
    price_med NUMERIC(10,2),
    price_rec_special NUMERIC(10,2),
    price_med_special NUMERIC(10,2),
    is_on_special BOOLEAN DEFAULT FALSE,
    discount_percent NUMERIC(5,2),

    -- Inventory at time of capture
    is_in_stock BOOLEAN DEFAULT TRUE,
    stock_quantity INTEGER,
    stock_status VARCHAR(50) DEFAULT 'in_stock',
    is_present_in_feed BOOLEAN DEFAULT TRUE,

    -- Potency at time of capture
    thc_percent NUMERIC(5,2),
    cbd_percent NUMERIC(5,2),

    -- Menu position (for tracking prominence changes)
    menu_position INTEGER,

    -- Image URL at time of capture
    image_url TEXT,

    -- Full raw response for debugging
    raw_data JSONB,

    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Add FK constraints if not exist
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'store_product_snapshots_dispensary_id_fkey'
    ) THEN
        ALTER TABLE store_product_snapshots
            ADD CONSTRAINT store_product_snapshots_dispensary_id_fkey
            FOREIGN KEY (dispensary_id) REFERENCES dispensaries(id) ON DELETE CASCADE;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;
    WHEN OTHERS THEN
        NULL;
END $$;

DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'store_product_snapshots_store_product_id_fkey'
    ) THEN
        ALTER TABLE store_product_snapshots
            ADD CONSTRAINT store_product_snapshots_store_product_id_fkey
            FOREIGN KEY (store_product_id) REFERENCES store_products(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;
    WHEN OTHERS THEN
        NULL;
END $$;

DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'store_product_snapshots_state_id_fkey'
    ) THEN
        ALTER TABLE store_product_snapshots
            ADD CONSTRAINT store_product_snapshots_state_id_fkey
            FOREIGN KEY (state_id) REFERENCES states(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;
    WHEN OTHERS THEN
        NULL;
END $$;

DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'store_product_snapshots_crawl_run_id_fkey'
    ) THEN
        ALTER TABLE store_product_snapshots
            ADD CONSTRAINT store_product_snapshots_crawl_run_id_fkey
            FOREIGN KEY (crawl_run_id) REFERENCES crawl_runs(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;
    WHEN OTHERS THEN
        NULL;
END $$;

-- Indexes optimized for analytics queries
CREATE INDEX IF NOT EXISTS idx_snapshots_dispensary_captured ON store_product_snapshots(dispensary_id, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_state_captured ON store_product_snapshots(state_id, captured_at DESC) WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_product_captured ON store_product_snapshots(store_product_id, captured_at DESC) WHERE store_product_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_crawl_run ON store_product_snapshots(crawl_run_id) WHERE crawl_run_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_captured_at ON store_product_snapshots(captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_brand ON store_product_snapshots(brand_name) WHERE brand_name IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_provider_product ON store_product_snapshots(provider_product_id) WHERE provider_product_id IS NOT NULL;

COMMENT ON TABLE store_product_snapshots IS 'Historical crawl data. One row per product per crawl. NEVER DELETE.';

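-- Example (illustrative only): price history for one product, served by
-- idx_snapshots_product_captured. The id is a placeholder.
SELECT captured_at, price_rec, price_rec_special, is_in_stock
FROM store_product_snapshots
WHERE store_product_id = 456
ORDER BY captured_at DESC
LIMIT 90;
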
-- ============================================================================
-- SECTION 7: VIEWS FOR BACKWARD COMPATIBILITY
-- ============================================================================

-- View: Latest snapshot per store product
CREATE OR REPLACE VIEW v_latest_store_snapshots AS
SELECT DISTINCT ON (dispensary_id, provider_product_id)
    sps.*
FROM store_product_snapshots sps
ORDER BY dispensary_id, provider_product_id, captured_at DESC;

-- View: Crawl run summary per dispensary
CREATE OR REPLACE VIEW v_dispensary_crawl_summary AS
SELECT
    d.id AS dispensary_id,
    COALESCE(d.dba_name, d.name) AS dispensary_name,
    d.city,
    d.state,
    d.state_id,
    s.name AS state_name,
    COUNT(DISTINCT sp.id) AS current_product_count,
    COUNT(DISTINCT sp.id) FILTER (WHERE sp.is_in_stock) AS in_stock_count,
    COUNT(DISTINCT sp.id) FILTER (WHERE sp.is_on_special) AS on_special_count,
    MAX(cr.finished_at) AS last_crawl_at,
    (SELECT status FROM crawl_runs WHERE dispensary_id = d.id ORDER BY started_at DESC LIMIT 1) AS last_crawl_status
FROM dispensaries d
LEFT JOIN states s ON s.id = d.state_id
LEFT JOIN store_products sp ON sp.dispensary_id = d.id
LEFT JOIN crawl_runs cr ON cr.dispensary_id = d.id
GROUP BY d.id, d.dba_name, d.name, d.city, d.state, d.state_id, s.name;


-- ============================================================================
-- MIGRATION 051 COMPLETE
-- ============================================================================

SELECT 'Migration 051 completed successfully. Canonical schema is ready.' AS status;

98
backend/migrations/051_create_mv_state_metrics.sql
Normal file
@@ -0,0 +1,98 @@

-- Migration 051: Create materialized view for state metrics
-- Used by Analytics V2 state endpoints for fast aggregated queries
-- Canonical tables: states, dispensaries, store_products, store_product_snapshots, brands

-- Drop existing view if it exists (for clean recreation)
DROP MATERIALIZED VIEW IF EXISTS mv_state_metrics;

-- Create materialized view with comprehensive state metrics
-- Schema verified via information_schema on 2025-12-06
-- Real columns used:
--   states: id, code, name, recreational_legal, medical_legal, rec_year, med_year
--   dispensaries: id, state_id (NO is_active column)
--   store_products: id, dispensary_id, brand_id, category_raw, price_rec, price_med, is_in_stock
--   store_product_snapshots: id, store_product_id, captured_at
--   brands: id (joined via sp.brand_id)

CREATE MATERIALIZED VIEW mv_state_metrics AS
SELECT
    s.id AS state_id,
    s.code AS state,
    s.name AS state_name,
    COALESCE(s.recreational_legal, FALSE) AS recreational_legal,
    COALESCE(s.medical_legal, FALSE) AS medical_legal,
    s.rec_year,
    s.med_year,

    -- Dispensary metrics
    COUNT(DISTINCT d.id) AS dispensary_count,

    -- Product metrics
    COUNT(DISTINCT sp.id) AS total_products,
    COUNT(DISTINCT sp.id) FILTER (WHERE sp.is_in_stock = TRUE) AS in_stock_products,
    COUNT(DISTINCT sp.id) FILTER (WHERE sp.is_in_stock = FALSE) AS out_of_stock_products,

    -- Brand metrics (using brand_id FK, not brand_name)
    COUNT(DISTINCT sp.brand_id) FILTER (WHERE sp.brand_id IS NOT NULL) AS unique_brands,

    -- Category metrics (using category_raw, not category)
    COUNT(DISTINCT sp.category_raw) FILTER (WHERE sp.category_raw IS NOT NULL) AS unique_categories,

    -- Pricing metrics (recreational)
    AVG(sp.price_rec) FILTER (WHERE sp.price_rec IS NOT NULL AND sp.is_in_stock = TRUE) AS avg_price_rec,
    PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY sp.price_rec)
        FILTER (WHERE sp.price_rec IS NOT NULL AND sp.is_in_stock = TRUE) AS median_price_rec,
    MIN(sp.price_rec) FILTER (WHERE sp.price_rec IS NOT NULL AND sp.is_in_stock = TRUE) AS min_price_rec,
    MAX(sp.price_rec) FILTER (WHERE sp.price_rec IS NOT NULL AND sp.is_in_stock = TRUE) AS max_price_rec,

    -- Pricing metrics (medical)
    AVG(sp.price_med) FILTER (WHERE sp.price_med IS NOT NULL AND sp.is_in_stock = TRUE) AS avg_price_med,
    PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY sp.price_med)
        FILTER (WHERE sp.price_med IS NOT NULL AND sp.is_in_stock = TRUE) AS median_price_med,

    -- Snapshot/crawl metrics
    COUNT(sps.id) AS total_snapshots,
    MAX(sps.captured_at) AS last_crawl_at,
    MIN(sps.captured_at) AS first_crawl_at,

    -- Data freshness
    CASE
        WHEN MAX(sps.captured_at) > NOW() - INTERVAL '24 hours' THEN 'fresh'
        WHEN MAX(sps.captured_at) > NOW() - INTERVAL '7 days' THEN 'recent'
        WHEN MAX(sps.captured_at) IS NOT NULL THEN 'stale'
        ELSE 'no_data'
    END AS data_freshness,

    -- Metadata
    NOW() AS refreshed_at

FROM states s
LEFT JOIN dispensaries d ON d.state_id = s.id
LEFT JOIN store_products sp ON sp.dispensary_id = d.id
LEFT JOIN store_product_snapshots sps ON sps.store_product_id = sp.id
GROUP BY s.id, s.code, s.name, s.recreational_legal, s.medical_legal, s.rec_year, s.med_year;

-- Create unique index on state code for fast lookups
CREATE UNIQUE INDEX IF NOT EXISTS mv_state_metrics_state_idx
    ON mv_state_metrics (state);

-- Create index on state_id for joins
CREATE INDEX IF NOT EXISTS mv_state_metrics_state_id_idx
    ON mv_state_metrics (state_id);

-- Create index for legal status filtering
CREATE INDEX IF NOT EXISTS mv_state_metrics_legal_idx
    ON mv_state_metrics (recreational_legal, medical_legal);

-- Create index for data freshness queries
CREATE INDEX IF NOT EXISTS mv_state_metrics_freshness_idx
    ON mv_state_metrics (data_freshness);

-- Comment on the view
COMMENT ON MATERIALIZED VIEW mv_state_metrics IS
    'Aggregated state-level metrics for Analytics V2 endpoints. Refresh periodically with: REFRESH MATERIALIZED VIEW CONCURRENTLY mv_state_metrics;';

-- Record migration
INSERT INTO schema_migrations (version, name, applied_at)
VALUES ('051', 'create_mv_state_metrics', NOW())
ON CONFLICT (version) DO NOTHING;

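-- Example (illustrative only): periodic refresh, e.g. from cron. CONCURRENTLY
-- avoids blocking readers and is possible because of the unique index on (state).
REFRESH MATERIALIZED VIEW CONCURRENTLY mv_state_metrics;
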
96
backend/migrations/052_add_provider_data_columns.sql
Normal file
@@ -0,0 +1,96 @@

-- Migration 052: Add provider_data JSONB and frequently-queried columns
--
-- Adds hybrid storage for legacy data:
-- 1. provider_data JSONB on both tables for all extra fields
-- 2. Specific columns for frequently-queried fields

-- ============================================================================
-- store_products: Add provider_data and queryable columns
-- ============================================================================

-- JSONB for all extra provider-specific data
ALTER TABLE store_products
    ADD COLUMN IF NOT EXISTS provider_data JSONB;

-- Frequently-queried columns
ALTER TABLE store_products
    ADD COLUMN IF NOT EXISTS strain_type TEXT;

ALTER TABLE store_products
    ADD COLUMN IF NOT EXISTS medical_only BOOLEAN DEFAULT FALSE;

ALTER TABLE store_products
    ADD COLUMN IF NOT EXISTS rec_only BOOLEAN DEFAULT FALSE;

ALTER TABLE store_products
    ADD COLUMN IF NOT EXISTS brand_logo_url TEXT;

ALTER TABLE store_products
    ADD COLUMN IF NOT EXISTS platform_dispensary_id TEXT;

-- Index for strain_type queries
CREATE INDEX IF NOT EXISTS idx_store_products_strain_type
    ON store_products(strain_type)
    WHERE strain_type IS NOT NULL;

-- Index for medical/rec filtering
CREATE INDEX IF NOT EXISTS idx_store_products_medical_rec
    ON store_products(medical_only, rec_only);

-- GIN index for provider_data JSONB queries
CREATE INDEX IF NOT EXISTS idx_store_products_provider_data
    ON store_products USING GIN (provider_data);

-- ============================================================================
-- store_product_snapshots: Add provider_data and queryable columns
-- ============================================================================

-- JSONB for all extra provider-specific data
ALTER TABLE store_product_snapshots
    ADD COLUMN IF NOT EXISTS provider_data JSONB;

-- Frequently-queried columns
ALTER TABLE store_product_snapshots
    ADD COLUMN IF NOT EXISTS featured BOOLEAN DEFAULT FALSE;

ALTER TABLE store_product_snapshots
    ADD COLUMN IF NOT EXISTS is_below_threshold BOOLEAN DEFAULT FALSE;

ALTER TABLE store_product_snapshots
    ADD COLUMN IF NOT EXISTS is_below_kiosk_threshold BOOLEAN DEFAULT FALSE;

-- Index for featured products
CREATE INDEX IF NOT EXISTS idx_snapshots_featured
    ON store_product_snapshots(dispensary_id, featured)
    WHERE featured = TRUE;

-- Index for low stock alerts
CREATE INDEX IF NOT EXISTS idx_snapshots_below_threshold
    ON store_product_snapshots(dispensary_id, is_below_threshold)
    WHERE is_below_threshold = TRUE;

-- GIN index for provider_data JSONB queries
CREATE INDEX IF NOT EXISTS idx_snapshots_provider_data
    ON store_product_snapshots USING GIN (provider_data);

-- ============================================================================
-- Comments for documentation
-- ============================================================================

COMMENT ON COLUMN store_products.provider_data IS
    'JSONB blob containing all provider-specific fields not in canonical columns (effects, terpenes, cannabinoids_v2, etc.)';

COMMENT ON COLUMN store_products.strain_type IS
    'Cannabis strain type: Indica, Sativa, Hybrid, Indica-Hybrid, Sativa-Hybrid';

COMMENT ON COLUMN store_products.platform_dispensary_id IS
    'Provider platform dispensary ID (e.g., Dutchie MongoDB ObjectId)';

COMMENT ON COLUMN store_product_snapshots.provider_data IS
    'JSONB blob containing all provider-specific snapshot fields (options, kiosk data, etc.)';

COMMENT ON COLUMN store_product_snapshots.featured IS
    'Whether product was featured/highlighted at capture time';

COMMENT ON COLUMN store_product_snapshots.is_below_threshold IS
    'Whether product was below inventory threshold at capture time';

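-- Example (illustrative only): the GIN indexes support JSONB containment
-- queries. The "effects" key is hypothetical; actual keys depend on what the
-- crawler stores in provider_data.
SELECT id, name
FROM store_products
WHERE provider_data @> '{"effects": ["Relaxed"]}';
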
127
backend/migrations/052_add_state_cannabis_flags.sql
Normal file
@@ -0,0 +1,127 @@

-- ============================================================================
-- Migration 052: Add Cannabis Legalization Flags to States
-- ============================================================================
--
-- Purpose: Add recreational/medical cannabis legalization status and years
-- to the existing states table, then seed all 50 states + DC.
--
-- SAFETY RULES:
-- - Uses ADD COLUMN IF NOT EXISTS (idempotent)
-- - Uses INSERT ... ON CONFLICT (code) DO UPDATE (idempotent)
-- - NO DROP, DELETE, TRUNCATE, or destructive operations
-- - Safe to run multiple times
--
-- Run with:
--   psql "$DATABASE_URL" -f migrations/052_add_state_cannabis_flags.sql
--
-- ============================================================================


-- ============================================================================
-- SECTION 1: Add cannabis legalization columns
-- ============================================================================

ALTER TABLE states ADD COLUMN IF NOT EXISTS recreational_legal BOOLEAN;
ALTER TABLE states ADD COLUMN IF NOT EXISTS rec_year INTEGER;
ALTER TABLE states ADD COLUMN IF NOT EXISTS medical_legal BOOLEAN;
ALTER TABLE states ADD COLUMN IF NOT EXISTS med_year INTEGER;

COMMENT ON COLUMN states.recreational_legal IS 'Whether recreational cannabis is legal in this state';
COMMENT ON COLUMN states.rec_year IS 'Year recreational cannabis was legalized (NULL if not legal)';
COMMENT ON COLUMN states.medical_legal IS 'Whether medical cannabis is legal in this state';
COMMENT ON COLUMN states.med_year IS 'Year medical cannabis was legalized (NULL if not legal)';


-- ============================================================================
-- SECTION 2: Seed all 50 states + DC with cannabis legalization data
-- ============================================================================
-- Data sourced from state legalization records as of 2024
-- States grouped by program type; each group is ordered by legalization year,
-- then alphabetically

INSERT INTO states (code, name, timezone, recreational_legal, rec_year, medical_legal, med_year)
VALUES
    -- Recreational + Medical States (ordered by rec year)
    ('WA', 'Washington', 'America/Los_Angeles', TRUE, 2012, TRUE, 1998),
    ('CO', 'Colorado', 'America/Denver', TRUE, 2012, TRUE, 2000),
    ('AK', 'Alaska', 'America/Anchorage', TRUE, 2014, TRUE, 1998),
    ('OR', 'Oregon', 'America/Los_Angeles', TRUE, 2014, TRUE, 1998),
    ('DC', 'District of Columbia', 'America/New_York', TRUE, 2015, TRUE, 2011),
    ('CA', 'California', 'America/Los_Angeles', TRUE, 2016, TRUE, 1996),
    ('NV', 'Nevada', 'America/Los_Angeles', TRUE, 2016, TRUE, 1998),
    ('ME', 'Maine', 'America/New_York', TRUE, 2016, TRUE, 1999),
    ('MA', 'Massachusetts', 'America/New_York', TRUE, 2016, TRUE, 2012),
    ('MI', 'Michigan', 'America/Detroit', TRUE, 2018, TRUE, 2008),
    ('IL', 'Illinois', 'America/Chicago', TRUE, 2019, TRUE, 2013),
    ('AZ', 'Arizona', 'America/Phoenix', TRUE, 2020, TRUE, 2010),
    ('MT', 'Montana', 'America/Denver', TRUE, 2020, TRUE, 2004),
    ('NJ', 'New Jersey', 'America/New_York', TRUE, 2020, TRUE, 2010),
    ('VT', 'Vermont', 'America/New_York', TRUE, 2020, TRUE, 2004),
    ('CT', 'Connecticut', 'America/New_York', TRUE, 2021, TRUE, 2012),
    ('NM', 'New Mexico', 'America/Denver', TRUE, 2021, TRUE, 2007),
    ('NY', 'New York', 'America/New_York', TRUE, 2021, TRUE, 2014),
    ('VA', 'Virginia', 'America/New_York', TRUE, 2021, TRUE, 2020),
    ('MD', 'Maryland', 'America/New_York', TRUE, 2022, TRUE, 2013),
    ('MO', 'Missouri', 'America/Chicago', TRUE, 2022, TRUE, 2018),
    ('RI', 'Rhode Island', 'America/New_York', TRUE, 2022, TRUE, 2006),
    ('DE', 'Delaware', 'America/New_York', TRUE, 2023, TRUE, 2011),
    ('MN', 'Minnesota', 'America/Chicago', TRUE, 2023, TRUE, 2014),
    ('OH', 'Ohio', 'America/New_York', TRUE, 2023, TRUE, 2016),

    -- Medical Only States (no recreational)
    ('HI', 'Hawaii', 'Pacific/Honolulu', FALSE, NULL, TRUE, 2000),
    ('NH', 'New Hampshire', 'America/New_York', FALSE, NULL, TRUE, 2013),
    ('GA', 'Georgia', 'America/New_York', FALSE, NULL, TRUE, 2015),
    ('LA', 'Louisiana', 'America/Chicago', FALSE, NULL, TRUE, 2015),
    ('TX', 'Texas', 'America/Chicago', FALSE, NULL, TRUE, 2015),
    ('AR', 'Arkansas', 'America/Chicago', FALSE, NULL, TRUE, 2016),
    ('FL', 'Florida', 'America/New_York', FALSE, NULL, TRUE, 2016),
    ('ND', 'North Dakota', 'America/Chicago', FALSE, NULL, TRUE, 2016),
    ('PA', 'Pennsylvania', 'America/New_York', FALSE, NULL, TRUE, 2016),
    ('IA', 'Iowa', 'America/Chicago', FALSE, NULL, TRUE, 2017),
    ('WV', 'West Virginia', 'America/New_York', FALSE, NULL, TRUE, 2017),
    ('OK', 'Oklahoma', 'America/Chicago', FALSE, NULL, TRUE, 2018),
    ('UT', 'Utah', 'America/Denver', FALSE, NULL, TRUE, 2018),
    ('SD', 'South Dakota', 'America/Chicago', FALSE, NULL, TRUE, 2020),
    ('AL', 'Alabama', 'America/Chicago', FALSE, NULL, TRUE, 2021),
    ('MS', 'Mississippi', 'America/Chicago', FALSE, NULL, TRUE, 2022),
    ('KY', 'Kentucky', 'America/New_York', FALSE, NULL, TRUE, 2023),
    ('NE', 'Nebraska', 'America/Chicago', FALSE, NULL, TRUE, 2024),

    -- No Cannabis Programs (neither rec nor medical)
    ('ID', 'Idaho', 'America/Boise', FALSE, NULL, FALSE, NULL),
    ('IN', 'Indiana', 'America/Indiana/Indianapolis', FALSE, NULL, FALSE, NULL),
    ('KS', 'Kansas', 'America/Chicago', FALSE, NULL, FALSE, NULL),
    ('NC', 'North Carolina', 'America/New_York', FALSE, NULL, FALSE, NULL),
    ('SC', 'South Carolina', 'America/New_York', FALSE, NULL, FALSE, NULL),
    ('TN', 'Tennessee', 'America/Chicago', FALSE, NULL, FALSE, NULL),
    ('WI', 'Wisconsin', 'America/Chicago', FALSE, NULL, FALSE, NULL),
    ('WY', 'Wyoming', 'America/Denver', FALSE, NULL, FALSE, NULL)

ON CONFLICT (code) DO UPDATE SET
    name = EXCLUDED.name,
    timezone = COALESCE(states.timezone, EXCLUDED.timezone),
    recreational_legal = EXCLUDED.recreational_legal,
    rec_year = EXCLUDED.rec_year,
    medical_legal = EXCLUDED.medical_legal,
    med_year = EXCLUDED.med_year,
    updated_at = NOW();


-- ============================================================================
-- SECTION 3: Add indexes for common queries
-- ============================================================================

CREATE INDEX IF NOT EXISTS idx_states_recreational ON states(recreational_legal) WHERE recreational_legal = TRUE;
CREATE INDEX IF NOT EXISTS idx_states_medical ON states(medical_legal) WHERE medical_legal = TRUE;


-- ============================================================================
-- SECTION 4: Verification query (informational only)
-- ============================================================================

SELECT
    'Migration 052 completed successfully.' AS status,
    (SELECT COUNT(*) FROM states WHERE recreational_legal = TRUE) AS rec_states,
    (SELECT COUNT(*) FROM states WHERE medical_legal = TRUE AND recreational_legal = FALSE) AS med_only_states,
    (SELECT COUNT(*) FROM states WHERE medical_legal = FALSE OR medical_legal IS NULL) AS no_program_states,
    (SELECT COUNT(*) FROM states) AS total_states;

249
backend/migrations/052_hydration_schema_alignment.sql
Normal file
@@ -0,0 +1,249 @@

-- ============================================================================
-- Migration 052: Hydration Schema Alignment
-- ============================================================================
--
-- Purpose: Add columns to canonical tables needed for hydration from
--          dutchie_products and dutchie_product_snapshots.
--
-- This migration ensures store_products and store_product_snapshots can
-- receive all data from the legacy dutchie_* tables.
--
-- SAFETY RULES:
--   - ALL columns use ADD COLUMN IF NOT EXISTS
--   - NO DROP, DELETE, TRUNCATE, or destructive operations
--   - Fully idempotent - safe to run multiple times
--
-- Run with:
--   psql "postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus" \
--     -f migrations/052_hydration_schema_alignment.sql
--
-- ============================================================================


-- ============================================================================
-- SECTION 1: store_products - Additional columns from dutchie_products
-- ============================================================================

-- Brand ID from Dutchie GraphQL (brandId field)
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS provider_brand_id VARCHAR(100);

-- Legacy dutchie_products.id for cross-reference during migration
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS legacy_dutchie_product_id INTEGER;

-- THC/CBD content as text (from dutchie_products.thc_content/cbd_content)
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS thc_content_text VARCHAR(50);
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS cbd_content_text VARCHAR(50);

-- Full cannabinoid data
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS cannabinoids JSONB;

-- Effects array
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS effects TEXT[];

-- Type (Flower, Edible, etc.) - maps to category in legacy
-- Already have category VARCHAR(100), but type may differ
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS product_type VARCHAR(100);

-- Additional images array
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS additional_images TEXT[];

-- Local image paths (from 032 migration)
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS local_image_url TEXT;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS local_image_thumb_url TEXT;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS local_image_medium_url TEXT;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS original_image_url TEXT;

-- Status from Dutchie (Active/Inactive)
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS platform_status VARCHAR(20);

-- Threshold flags
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS is_below_threshold BOOLEAN DEFAULT FALSE;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS is_below_kiosk_threshold BOOLEAN DEFAULT FALSE;

-- cName / slug from Dutchie
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS c_name VARCHAR(255);

-- Coming soon flag
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS is_coming_soon BOOLEAN DEFAULT FALSE;

-- Provider column already exists, ensure we have provider_dispensary_id
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS provider_dispensary_id VARCHAR(100);

-- Enterprise product ID (cross-store product linking)
-- Already exists from migration 051

-- Total quantity available (from POSMetaData.children)
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS total_quantity_available INTEGER;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS total_kiosk_quantity_available INTEGER;

-- Weight
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS weight VARCHAR(50);

-- Options array (size/weight options)
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS options TEXT[];

-- Measurements
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS measurements JSONB;

-- Raw data from last crawl
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS raw_data JSONB;

-- Source timestamps from Dutchie
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS source_created_at TIMESTAMPTZ;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS source_updated_at TIMESTAMPTZ;


-- ============================================================================
-- SECTION 2: store_product_snapshots - Additional columns for hydration
-- ============================================================================

-- Legacy dutchie_product_snapshot.id for cross-reference
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS legacy_snapshot_id INTEGER;

-- Legacy dutchie_product_id reference
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS legacy_dutchie_product_id INTEGER;

-- Options JSONB from dutchie_product_snapshots
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS options JSONB;

-- Provider dispensary ID
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS provider_dispensary_id VARCHAR(100);

-- Inventory details
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS total_quantity_available INTEGER;
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS total_kiosk_quantity_available INTEGER;

-- Platform status at time of snapshot
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS platform_status VARCHAR(20);

-- Threshold flags at time of snapshot
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS is_below_threshold BOOLEAN DEFAULT FALSE;
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS is_below_kiosk_threshold BOOLEAN DEFAULT FALSE;

-- Special data
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS special_data JSONB;
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS special_name TEXT;

-- Pricing mode (rec/med)
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS pricing_type VARCHAR(10);

-- Crawl mode (mode_a/mode_b)
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS crawl_mode VARCHAR(20);


-- ============================================================================
-- SECTION 3: crawl_runs - Additional columns for hydration
-- ============================================================================

-- Legacy job ID references
ALTER TABLE crawl_runs ADD COLUMN IF NOT EXISTS legacy_dispensary_crawl_job_id INTEGER;
ALTER TABLE crawl_runs ADD COLUMN IF NOT EXISTS legacy_job_run_log_id INTEGER;

-- Schedule reference
ALTER TABLE crawl_runs ADD COLUMN IF NOT EXISTS schedule_id INTEGER;

-- Job type
ALTER TABLE crawl_runs ADD COLUMN IF NOT EXISTS job_type VARCHAR(50);

-- Brands found count
ALTER TABLE crawl_runs ADD COLUMN IF NOT EXISTS brands_found INTEGER DEFAULT 0;

-- Retry count
ALTER TABLE crawl_runs ADD COLUMN IF NOT EXISTS retry_count INTEGER DEFAULT 0;


-- ============================================================================
-- SECTION 4: INDEXES for hydration queries
-- ============================================================================

-- Index on legacy IDs for migration lookups
CREATE INDEX IF NOT EXISTS idx_store_products_legacy_id
    ON store_products(legacy_dutchie_product_id)
    WHERE legacy_dutchie_product_id IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_snapshots_legacy_id
    ON store_product_snapshots(legacy_snapshot_id)
    WHERE legacy_snapshot_id IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_snapshots_legacy_product_id
    ON store_product_snapshots(legacy_dutchie_product_id)
    WHERE legacy_dutchie_product_id IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_crawl_runs_legacy_job_id
    ON crawl_runs(legacy_dispensary_crawl_job_id)
    WHERE legacy_dispensary_crawl_job_id IS NOT NULL;

-- Index on provider_product_id for upserts
CREATE INDEX IF NOT EXISTS idx_store_products_provider_id
    ON store_products(provider_product_id);

-- Composite index for canonical key lookup
CREATE INDEX IF NOT EXISTS idx_store_products_canonical_key
    ON store_products(dispensary_id, provider, provider_product_id);


-- ============================================================================
-- SECTION 5: Unique constraint for idempotent hydration
-- ============================================================================

-- Ensure unique snapshots per product per crawl
-- This prevents duplicate snapshots during re-runs
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'store_product_snapshots_unique_per_crawl'
    ) THEN
        -- Can't add unique constraint on nullable columns directly,
        -- so we use a partial unique index instead
        CREATE UNIQUE INDEX IF NOT EXISTS idx_snapshots_unique_per_crawl
            ON store_product_snapshots(store_product_id, crawl_run_id)
            WHERE store_product_id IS NOT NULL AND crawl_run_id IS NOT NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN NULL;
    WHEN OTHERS THEN NULL;
END $$;
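-- Illustrative use of the partial index above (not executed by this
-- migration): a hydration upsert can name the index's columns and repeat
-- its predicate as the conflict target. The IDs below are placeholders.
--
--   INSERT INTO store_product_snapshots (store_product_id, crawl_run_id, captured_at)
--   VALUES (123, 456, NOW())
--   ON CONFLICT (store_product_id, crawl_run_id)
--       WHERE store_product_id IS NOT NULL AND crawl_run_id IS NOT NULL
--       DO NOTHING;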


-- ============================================================================
-- SECTION 6: View for hydration status monitoring
-- ============================================================================

CREATE OR REPLACE VIEW v_hydration_status AS
SELECT
    'dutchie_products' AS source_table,
    (SELECT COUNT(*) FROM dutchie_products) AS source_count,
    (SELECT COUNT(*) FROM store_products WHERE legacy_dutchie_product_id IS NOT NULL) AS hydrated_count,
    ROUND(
        100.0 * (SELECT COUNT(*) FROM store_products WHERE legacy_dutchie_product_id IS NOT NULL) /
        NULLIF((SELECT COUNT(*) FROM dutchie_products), 0),
        2
    ) AS hydration_pct
UNION ALL
SELECT
    'dutchie_product_snapshots' AS source_table,
    (SELECT COUNT(*) FROM dutchie_product_snapshots) AS source_count,
    (SELECT COUNT(*) FROM store_product_snapshots WHERE legacy_snapshot_id IS NOT NULL) AS hydrated_count,
    ROUND(
        100.0 * (SELECT COUNT(*) FROM store_product_snapshots WHERE legacy_snapshot_id IS NOT NULL) /
        NULLIF((SELECT COUNT(*) FROM dutchie_product_snapshots), 0),
        2
    ) AS hydration_pct
UNION ALL
SELECT
    'dispensary_crawl_jobs' AS source_table,
    (SELECT COUNT(*) FROM dispensary_crawl_jobs WHERE status = 'completed') AS source_count,
    (SELECT COUNT(*) FROM crawl_runs WHERE legacy_dispensary_crawl_job_id IS NOT NULL) AS hydrated_count,
    ROUND(
        100.0 * (SELECT COUNT(*) FROM crawl_runs WHERE legacy_dispensary_crawl_job_id IS NOT NULL) /
        NULLIF((SELECT COUNT(*) FROM dispensary_crawl_jobs WHERE status = 'completed'), 0),
        2
    ) AS hydration_pct;
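-- Example usage (read-only; not part of the migration): check hydration
-- progress for each legacy source table.
--
--   SELECT source_table, source_count, hydrated_count, hydration_pct
--   FROM v_hydration_status
--   ORDER BY source_table;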


-- ============================================================================
-- DONE
-- ============================================================================

SELECT 'Migration 052 completed successfully. Hydration schema aligned.' AS status;
157
backend/migrations/053_analytics_indexes.sql
Normal file
@@ -0,0 +1,157 @@
-- ============================================================================
-- Migration 053: Analytics Engine Indexes
-- ============================================================================
--
-- Purpose: Add indexes optimized for analytics queries on canonical tables.
--          These indexes support price trends, brand penetration, category
--          growth, and state-level analytics.
--
-- SAFETY RULES:
--   - Uses CREATE INDEX IF NOT EXISTS (idempotent)
--   - Uses ADD COLUMN IF NOT EXISTS for helper columns
--   - NO DROP, DELETE, TRUNCATE, or destructive operations
--   - Safe to run multiple times
--
-- Run with:
--   psql "$DATABASE_URL" -f migrations/053_analytics_indexes.sql
--
-- ============================================================================


-- ============================================================================
-- SECTION 1: Helper columns for analytics (if missing)
-- ============================================================================

-- Ensure store_products has brand_id for faster brand analytics joins
-- (brand_name exists, but a normalized brand_id helps)
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS brand_id INTEGER;

-- Ensure snapshots have category for time-series category analytics
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS category VARCHAR(100);


-- ============================================================================
-- SECTION 2: Price Analytics Indexes
-- ============================================================================

-- Price trends by store_product over time
CREATE INDEX IF NOT EXISTS idx_snapshots_product_price_time
    ON store_product_snapshots(store_product_id, captured_at DESC, price_rec, price_med)
    WHERE store_product_id IS NOT NULL;

-- Price by category over time (for category price trends)
CREATE INDEX IF NOT EXISTS idx_snapshots_category_price_time
    ON store_product_snapshots(category, captured_at DESC, price_rec)
    WHERE category IS NOT NULL;

-- Price changes detection (for volatility analysis)
CREATE INDEX IF NOT EXISTS idx_products_price_change
    ON store_products(last_price_change_at DESC)
    WHERE last_price_change_at IS NOT NULL;


-- ============================================================================
-- SECTION 3: Brand Penetration Indexes
-- ============================================================================

-- Brand by dispensary (for penetration counts)
CREATE INDEX IF NOT EXISTS idx_products_brand_dispensary
    ON store_products(brand_name, dispensary_id)
    WHERE brand_name IS NOT NULL;

-- Brand by state (for state-level brand analytics)
CREATE INDEX IF NOT EXISTS idx_products_brand_state
    ON store_products(brand_name, state_id)
    WHERE brand_name IS NOT NULL AND state_id IS NOT NULL;

-- Brand first/last seen (for penetration trends)
CREATE INDEX IF NOT EXISTS idx_products_brand_first_seen
    ON store_products(brand_name, first_seen_at)
    WHERE brand_name IS NOT NULL;


-- ============================================================================
-- SECTION 4: Category Analytics Indexes
-- ============================================================================

-- Category by state (for state-level category analytics)
CREATE INDEX IF NOT EXISTS idx_products_category_state
    ON store_products(category, state_id)
    WHERE category IS NOT NULL;

-- Category by dispensary
CREATE INDEX IF NOT EXISTS idx_products_category_dispensary
    ON store_products(category, dispensary_id)
    WHERE category IS NOT NULL;

-- Category first seen (for growth tracking)
CREATE INDEX IF NOT EXISTS idx_products_category_first_seen
    ON store_products(category, first_seen_at)
    WHERE category IS NOT NULL;


-- ============================================================================
-- SECTION 5: Store Analytics Indexes
-- ============================================================================

-- Products added/removed by dispensary
CREATE INDEX IF NOT EXISTS idx_products_dispensary_first_seen
    ON store_products(dispensary_id, first_seen_at DESC);

CREATE INDEX IF NOT EXISTS idx_products_dispensary_last_seen
    ON store_products(dispensary_id, last_seen_at DESC);

-- Stock status changes
CREATE INDEX IF NOT EXISTS idx_products_stock_change
    ON store_products(dispensary_id, last_stock_change_at DESC)
    WHERE last_stock_change_at IS NOT NULL;


-- ============================================================================
-- SECTION 6: State Analytics Indexes
-- ============================================================================

-- Dispensary count by state
CREATE INDEX IF NOT EXISTS idx_dispensaries_state_active
    ON dispensaries(state_id)
    WHERE state_id IS NOT NULL;

-- Products by state
CREATE INDEX IF NOT EXISTS idx_products_state_active
    ON store_products(state_id, is_in_stock)
    WHERE state_id IS NOT NULL;

-- Snapshots by state for time-series
CREATE INDEX IF NOT EXISTS idx_snapshots_state_time
    ON store_product_snapshots(state_id, captured_at DESC)
    WHERE state_id IS NOT NULL;


-- ============================================================================
-- SECTION 7: Composite indexes for common analytics queries
-- ============================================================================

-- Brand + Category + State (for market share calculations)
CREATE INDEX IF NOT EXISTS idx_products_brand_category_state
    ON store_products(brand_name, category, state_id)
    WHERE brand_name IS NOT NULL AND category IS NOT NULL;

-- Dispensary + Category + Brand (for store-level brand analysis)
CREATE INDEX IF NOT EXISTS idx_products_disp_cat_brand
    ON store_products(dispensary_id, category, brand_name)
    WHERE category IS NOT NULL;

-- Special pricing by category (for promo analysis)
CREATE INDEX IF NOT EXISTS idx_products_special_category
    ON store_products(category, is_on_special)
    WHERE is_on_special = TRUE;

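-- Example of the kind of query these composite indexes serve (illustrative;
-- the state_id value is a placeholder): brand penetration as the share of
-- dispensaries in a state that carry each brand.
--
--   SELECT brand_name,
--          COUNT(DISTINCT dispensary_id) AS stores_carrying,
--          ROUND(100.0 * COUNT(DISTINCT dispensary_id)
--                / NULLIF((SELECT COUNT(*) FROM dispensaries WHERE state_id = 3), 0), 2) AS penetration_pct
--   FROM store_products
--   WHERE state_id = 3 AND brand_name IS NOT NULL
--   GROUP BY brand_name
--   ORDER BY penetration_pct DESC;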

-- ============================================================================
-- SECTION 8: Verification
-- ============================================================================

SELECT
    'Migration 053 completed successfully.' AS status,
    (SELECT COUNT(*) FROM pg_indexes WHERE indexname LIKE 'idx_products_%') AS product_indexes,
    (SELECT COUNT(*) FROM pg_indexes WHERE indexname LIKE 'idx_snapshots_%') AS snapshot_indexes;
346
backend/migrations/053_dutchie_discovery_schema.sql
Normal file
@@ -0,0 +1,346 @@
-- ============================================================================
-- Migration 053: Dutchie Discovery Schema
-- ============================================================================
--
-- Purpose: Create tables for Dutchie store discovery workflow.
--          Stores are discovered and held in staging tables until verified,
--          then promoted to the canonical dispensaries table.
--
-- Tables Created:
--   - dutchie_discovery_cities: City pages from Dutchie
--   - dutchie_discovery_locations: Individual store locations
--
-- SAFETY RULES:
--   - ALL tables use CREATE TABLE IF NOT EXISTS
--   - NO DROP, DELETE, TRUNCATE, or destructive operations
--   - Does NOT touch canonical dispensaries table
--   - Fully idempotent - safe to run multiple times
--
-- Run with:
--   psql "postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus" \
--     -f migrations/053_dutchie_discovery_schema.sql
--
-- ============================================================================


-- ============================================================================
-- SECTION 1: DUTCHIE_DISCOVERY_CITIES
-- ============================================================================
-- Stores Dutchie city pages for systematic crawling.
-- Each city can contain multiple dispensary locations.

CREATE TABLE IF NOT EXISTS dutchie_discovery_cities (
    id BIGSERIAL PRIMARY KEY,

    -- Platform identification (future-proof for other platforms)
    platform TEXT NOT NULL DEFAULT 'dutchie',

    -- City identification
    city_name TEXT NOT NULL,
    city_slug TEXT NOT NULL,
    state_code TEXT,                      -- 'AZ', 'CA', 'ON', etc.
    country_code TEXT NOT NULL DEFAULT 'US',

    -- Crawl management
    last_crawled_at TIMESTAMPTZ,
    crawl_enabled BOOLEAN NOT NULL DEFAULT TRUE,
    location_count INTEGER,               -- Number of locations found in this city

    -- Metadata
    notes TEXT,
    metadata JSONB,

    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Add the unique constraint if it does not exist
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'dutchie_discovery_cities_unique'
    ) THEN
        ALTER TABLE dutchie_discovery_cities
            ADD CONSTRAINT dutchie_discovery_cities_unique
            UNIQUE (platform, country_code, state_code, city_slug);
    END IF;
EXCEPTION
    WHEN duplicate_object THEN NULL;
    WHEN OTHERS THEN NULL;
END $$;

-- Indexes
CREATE INDEX IF NOT EXISTS idx_discovery_cities_platform
    ON dutchie_discovery_cities(platform);

CREATE INDEX IF NOT EXISTS idx_discovery_cities_state
    ON dutchie_discovery_cities(country_code, state_code);

CREATE INDEX IF NOT EXISTS idx_discovery_cities_crawl_enabled
    ON dutchie_discovery_cities(crawl_enabled)
    WHERE crawl_enabled = TRUE;

CREATE INDEX IF NOT EXISTS idx_discovery_cities_last_crawled
    ON dutchie_discovery_cities(last_crawled_at);

COMMENT ON TABLE dutchie_discovery_cities IS 'City pages from Dutchie for systematic store discovery.';


-- ============================================================================
-- SECTION 2: DUTCHIE_DISCOVERY_LOCATIONS
-- ============================================================================
-- Individual store locations discovered from Dutchie.
-- These are NOT promoted to canonical dispensaries until verified.

CREATE TABLE IF NOT EXISTS dutchie_discovery_locations (
    id BIGSERIAL PRIMARY KEY,

    -- Platform identification
    platform TEXT NOT NULL DEFAULT 'dutchie',
    platform_location_id TEXT NOT NULL,   -- Dutchie's internal Location ID
    platform_slug TEXT NOT NULL,          -- URL slug for the store
    platform_menu_url TEXT NOT NULL,      -- Full menu URL

    -- Store name
    name TEXT NOT NULL,

    -- Address components
    raw_address TEXT,
    address_line1 TEXT,
    address_line2 TEXT,
    city TEXT,
    state_code TEXT,                      -- 'AZ', 'CA', 'ON', etc.
    postal_code TEXT,
    country_code TEXT,                    -- 'US' or 'CA'

    -- Coordinates
    latitude DOUBLE PRECISION,
    longitude DOUBLE PRECISION,
    timezone TEXT,

    -- Discovery status
    status TEXT NOT NULL DEFAULT 'discovered',
    -- discovered: Just found, not yet verified
    -- verified:   Verified and promoted to canonical dispensaries
    -- rejected:   Manually rejected (e.g., duplicate, test store)
    -- merged:     Linked to existing canonical dispensary

    -- Link to canonical dispensaries (only after verification)
    dispensary_id INTEGER,

    -- Reference to discovery city
    discovery_city_id BIGINT,

    -- Raw data from Dutchie
    metadata JSONB,
    notes TEXT,

    -- Store capabilities (from Dutchie)
    offers_delivery BOOLEAN,
    offers_pickup BOOLEAN,
    is_recreational BOOLEAN,
    is_medical BOOLEAN,

    -- Tracking
    first_seen_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    last_seen_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    last_checked_at TIMESTAMPTZ,
    verified_at TIMESTAMPTZ,
    verified_by TEXT,                     -- User who verified

    active BOOLEAN NOT NULL DEFAULT TRUE,

    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Add the unique constraints if they do not exist
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'dutchie_discovery_locations_platform_id_unique'
    ) THEN
        ALTER TABLE dutchie_discovery_locations
            ADD CONSTRAINT dutchie_discovery_locations_platform_id_unique
            UNIQUE (platform, platform_location_id);
    END IF;
EXCEPTION
    WHEN duplicate_object THEN NULL;
    WHEN OTHERS THEN NULL;
END $$;

DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'dutchie_discovery_locations_slug_unique'
    ) THEN
        ALTER TABLE dutchie_discovery_locations
            ADD CONSTRAINT dutchie_discovery_locations_slug_unique
            UNIQUE (platform, platform_slug, country_code, state_code, city);
    END IF;
EXCEPTION
    WHEN duplicate_object THEN NULL;
    WHEN OTHERS THEN NULL;
END $$;

-- Add the FK to dispensaries if it does not exist (allows NULL)
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'dutchie_discovery_locations_dispensary_fk'
    ) THEN
        ALTER TABLE dutchie_discovery_locations
            ADD CONSTRAINT dutchie_discovery_locations_dispensary_fk
            FOREIGN KEY (dispensary_id) REFERENCES dispensaries(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN NULL;
    WHEN OTHERS THEN NULL;
END $$;

-- Add the FK to discovery cities if it does not exist
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'dutchie_discovery_locations_city_fk'
    ) THEN
        ALTER TABLE dutchie_discovery_locations
            ADD CONSTRAINT dutchie_discovery_locations_city_fk
            FOREIGN KEY (discovery_city_id) REFERENCES dutchie_discovery_cities(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN NULL;
    WHEN OTHERS THEN NULL;
END $$;

-- Indexes
CREATE INDEX IF NOT EXISTS idx_discovery_locations_platform
    ON dutchie_discovery_locations(platform);

CREATE INDEX IF NOT EXISTS idx_discovery_locations_status
    ON dutchie_discovery_locations(status);

CREATE INDEX IF NOT EXISTS idx_discovery_locations_state
    ON dutchie_discovery_locations(country_code, state_code);

CREATE INDEX IF NOT EXISTS idx_discovery_locations_city
    ON dutchie_discovery_locations(city, state_code);

CREATE INDEX IF NOT EXISTS idx_discovery_locations_dispensary
    ON dutchie_discovery_locations(dispensary_id)
    WHERE dispensary_id IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_discovery_locations_discovered
    ON dutchie_discovery_locations(status, first_seen_at DESC)
    WHERE status = 'discovered';

CREATE INDEX IF NOT EXISTS idx_discovery_locations_active
    ON dutchie_discovery_locations(active)
    WHERE active = TRUE;

CREATE INDEX IF NOT EXISTS idx_discovery_locations_coords
    ON dutchie_discovery_locations(latitude, longitude)
    WHERE latitude IS NOT NULL AND longitude IS NOT NULL;

COMMENT ON TABLE dutchie_discovery_locations IS 'Discovered store locations from Dutchie. Held in staging until verified.';


-- ============================================================================
-- SECTION 3: ADD CANADIAN PROVINCES TO STATES TABLE
-- ============================================================================
-- Support for Canadian provinces (Ontario, BC, Alberta, etc.)

INSERT INTO states (code, name, timezone, is_active, crawl_enabled) VALUES
    ('AB', 'Alberta', 'America/Edmonton', TRUE, TRUE),
    ('BC', 'British Columbia', 'America/Vancouver', TRUE, TRUE),
    ('MB', 'Manitoba', 'America/Winnipeg', TRUE, TRUE),
    ('NB', 'New Brunswick', 'America/Moncton', TRUE, TRUE),
    ('NL', 'Newfoundland and Labrador', 'America/St_Johns', TRUE, TRUE),
    ('NS', 'Nova Scotia', 'America/Halifax', TRUE, TRUE),
    ('NT', 'Northwest Territories', 'America/Yellowknife', TRUE, TRUE),
    ('NU', 'Nunavut', 'America/Iqaluit', TRUE, TRUE),
    ('ON', 'Ontario', 'America/Toronto', TRUE, TRUE),
    ('PE', 'Prince Edward Island', 'America/Halifax', TRUE, TRUE),
    ('QC', 'Quebec', 'America/Montreal', TRUE, TRUE),
    ('SK', 'Saskatchewan', 'America/Regina', TRUE, TRUE),
    ('YT', 'Yukon', 'America/Whitehorse', TRUE, TRUE)
ON CONFLICT (code) DO UPDATE SET
    name = EXCLUDED.name,
    timezone = COALESCE(states.timezone, EXCLUDED.timezone),
    updated_at = NOW();


-- ============================================================================
-- SECTION 4: VIEWS FOR DISCOVERY MONITORING
-- ============================================================================

-- View: Discovery status summary
CREATE OR REPLACE VIEW v_discovery_status AS
SELECT
    platform,
    country_code,
    state_code,
    status,
    COUNT(*) AS location_count,
    COUNT(*) FILTER (WHERE dispensary_id IS NOT NULL) AS linked_count,
    MIN(first_seen_at) AS earliest_discovery,
    MAX(last_seen_at) AS latest_activity
FROM dutchie_discovery_locations
GROUP BY platform, country_code, state_code, status
ORDER BY country_code, state_code, status;

-- View: Unverified discoveries awaiting action
CREATE OR REPLACE VIEW v_discovery_pending AS
SELECT
    dl.id,
    dl.platform,
    dl.name,
    dl.city,
    dl.state_code,
    dl.country_code,
    dl.platform_menu_url,
    dl.first_seen_at,
    dl.last_seen_at,
    dl.offers_delivery,
    dl.offers_pickup,
    dl.is_recreational,
    dl.is_medical,
    dc.city_name AS discovery_city_name
FROM dutchie_discovery_locations dl
LEFT JOIN dutchie_discovery_cities dc ON dc.id = dl.discovery_city_id
WHERE dl.status = 'discovered'
  AND dl.active = TRUE
ORDER BY dl.state_code, dl.city, dl.name;
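-- Illustrative promotion flow (performed by application code after manual
-- review, not by this migration; the ids and reviewer are placeholders):
--
--   UPDATE dutchie_discovery_locations
--   SET status        = 'verified',
--       dispensary_id = 42,
--       verified_at   = NOW(),
--       verified_by   = 'reviewer'
--   WHERE id = 1001 AND status = 'discovered';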

-- View: City crawl status
CREATE OR REPLACE VIEW v_discovery_cities_status AS
SELECT
    dc.id,
    dc.platform,
    dc.city_name,
    dc.state_code,
    dc.country_code,
    dc.crawl_enabled,
    dc.last_crawled_at,
    dc.location_count,
    COUNT(dl.id) AS actual_locations,
    COUNT(dl.id) FILTER (WHERE dl.status = 'discovered') AS pending_count,
    COUNT(dl.id) FILTER (WHERE dl.status = 'verified') AS verified_count,
    COUNT(dl.id) FILTER (WHERE dl.status = 'rejected') AS rejected_count
FROM dutchie_discovery_cities dc
LEFT JOIN dutchie_discovery_locations dl ON dl.discovery_city_id = dc.id
GROUP BY dc.id, dc.platform, dc.city_name, dc.state_code, dc.country_code,
         dc.crawl_enabled, dc.last_crawled_at, dc.location_count
ORDER BY dc.country_code, dc.state_code, dc.city_name;


-- ============================================================================
-- DONE
-- ============================================================================

SELECT 'Migration 053 completed successfully. Discovery schema created.' AS status;
49
backend/migrations/054_worker_metadata.sql
Normal file
@@ -0,0 +1,49 @@
-- Migration 054: Worker Metadata for Named Workforce
-- Adds worker_name and worker_role to job tables for displaying friendly worker identities

-- Add worker metadata columns to job_schedules
ALTER TABLE job_schedules
    ADD COLUMN IF NOT EXISTS worker_name VARCHAR(50),
    ADD COLUMN IF NOT EXISTS worker_role VARCHAR(100);

COMMENT ON COLUMN job_schedules.worker_name IS 'Friendly name for the worker (e.g., Alice, Henry, Bella, Oscar)';
COMMENT ON COLUMN job_schedules.worker_role IS 'Description of worker role (e.g., Store Discovery Worker, GraphQL Product Sync)';

-- Add worker metadata columns to job_run_logs
ALTER TABLE job_run_logs
    ADD COLUMN IF NOT EXISTS worker_name VARCHAR(50),
    ADD COLUMN IF NOT EXISTS run_role VARCHAR(100);

COMMENT ON COLUMN job_run_logs.worker_name IS 'Name of the worker that executed this run (copied from schedule)';
COMMENT ON COLUMN job_run_logs.run_role IS 'Role description for this specific run';

-- Add worker_name to dispensary_crawl_jobs (for tracking which named worker enqueued it)
ALTER TABLE dispensary_crawl_jobs
    ADD COLUMN IF NOT EXISTS enqueued_by_worker VARCHAR(50);

COMMENT ON COLUMN dispensary_crawl_jobs.enqueued_by_worker IS 'Name of the worker that enqueued this job';

-- Update existing schedules with worker names
UPDATE job_schedules SET
    worker_name = 'Bella',
    worker_role = 'GraphQL Product Sync'
WHERE job_name = 'dutchie_az_product_crawl' AND worker_name IS NULL;

UPDATE job_schedules SET
    worker_name = 'Henry',
    worker_role = 'Entry Point Finder'
WHERE job_name = 'dutchie_az_menu_detection' AND worker_name IS NULL;

UPDATE job_schedules SET
    worker_name = 'Alice',
    worker_role = 'Store Discovery'
WHERE job_name = 'dutchie_store_discovery' AND worker_name IS NULL;

UPDATE job_schedules SET
    worker_name = 'Oscar',
    worker_role = 'Analytics Refresh'
WHERE job_name = 'analytics_refresh' AND worker_name IS NULL;

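-- Example dashboard query these columns enable (illustrative, read-only):
-- the most recent runs attributed to each named worker.
--
--   SELECT worker_name, run_role, status, started_at, duration_ms
--   FROM job_run_logs
--   WHERE worker_name IS NOT NULL
--   ORDER BY started_at DESC
--   LIMIT 20;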
-- Create indexes for worker name lookups
CREATE INDEX IF NOT EXISTS idx_job_run_logs_worker_name ON job_run_logs(worker_name);
CREATE INDEX IF NOT EXISTS idx_dispensary_crawl_jobs_enqueued_by ON dispensary_crawl_jobs(enqueued_by_worker);
123
backend/migrations/055_workforce_enhancements.sql
Normal file
@@ -0,0 +1,123 @@
-- Migration 055: Workforce System Enhancements
-- Adds visibility tracking, slug change tracking, and scope support for workers

-- ============================================================
-- 1. VISIBILITY TRACKING FOR BELLA (Product Sync)
-- ============================================================

-- Add visibility tracking to dutchie_products
ALTER TABLE dutchie_products
    ADD COLUMN IF NOT EXISTS visibility_lost BOOLEAN DEFAULT FALSE,
    ADD COLUMN IF NOT EXISTS visibility_lost_at TIMESTAMPTZ,
    ADD COLUMN IF NOT EXISTS visibility_restored_at TIMESTAMPTZ;

COMMENT ON COLUMN dutchie_products.visibility_lost IS 'True if product disappeared from GraphQL results';
COMMENT ON COLUMN dutchie_products.visibility_lost_at IS 'When product was last marked as visibility lost';
COMMENT ON COLUMN dutchie_products.visibility_restored_at IS 'When product reappeared after being lost';

-- Index for visibility queries
CREATE INDEX IF NOT EXISTS idx_dutchie_products_visibility_lost
    ON dutchie_products(dispensary_id, visibility_lost)
    WHERE visibility_lost = TRUE;

-- ============================================================
-- 2. SLUG CHANGE TRACKING FOR ALICE (Store Discovery)
-- ============================================================

-- Add slug change and retirement tracking to discovery locations
ALTER TABLE dutchie_discovery_locations
    ADD COLUMN IF NOT EXISTS slug_changed_at TIMESTAMPTZ,
    ADD COLUMN IF NOT EXISTS previous_slug VARCHAR(255),
    ADD COLUMN IF NOT EXISTS retired_at TIMESTAMPTZ,
    ADD COLUMN IF NOT EXISTS retirement_reason VARCHAR(100);

COMMENT ON COLUMN dutchie_discovery_locations.slug_changed_at IS 'When the platform slug was last changed';
COMMENT ON COLUMN dutchie_discovery_locations.previous_slug IS 'Previous slug before the last change';
COMMENT ON COLUMN dutchie_discovery_locations.retired_at IS 'When store was marked as retired/removed';
COMMENT ON COLUMN dutchie_discovery_locations.retirement_reason IS 'Reason for retirement (removed_from_source, closed, etc.)';

-- Index for finding retired stores
CREATE INDEX IF NOT EXISTS idx_dutchie_discovery_locations_retired
    ON dutchie_discovery_locations(retired_at)
    WHERE retired_at IS NOT NULL;

-- ============================================================
-- 3. ID RESOLUTION TRACKING FOR HENRY (Entry Point Finder)
-- ============================================================

-- Add resolution tracking to dispensaries
ALTER TABLE dispensaries
    ADD COLUMN IF NOT EXISTS last_id_resolution_at TIMESTAMPTZ,
    ADD COLUMN IF NOT EXISTS id_resolution_attempts INT DEFAULT 0,
    ADD COLUMN IF NOT EXISTS id_resolution_error TEXT;

COMMENT ON COLUMN dispensaries.last_id_resolution_at IS 'When platform_dispensary_id was last resolved/attempted';
COMMENT ON COLUMN dispensaries.id_resolution_attempts IS 'Number of resolution attempts';
COMMENT ON COLUMN dispensaries.id_resolution_error IS 'Last error message from resolution attempt';

-- Index for finding stores needing resolution
CREATE INDEX IF NOT EXISTS idx_dispensaries_needs_resolution
    ON dispensaries(state, menu_type)
    WHERE platform_dispensary_id IS NULL AND menu_type = 'dutchie';

-- ============================================================
-- 4. ENHANCED CITIES TABLE FOR ALICE
-- ============================================================

-- Add tracking columns to cities table
ALTER TABLE dutchie_discovery_cities
    ADD COLUMN IF NOT EXISTS state_name VARCHAR(100),
    ADD COLUMN IF NOT EXISTS discovered_at TIMESTAMPTZ DEFAULT NOW(),
    ADD COLUMN IF NOT EXISTS last_verified_at TIMESTAMPTZ,
    ADD COLUMN IF NOT EXISTS store_count_reported INT,
    ADD COLUMN IF NOT EXISTS store_count_actual INT;

COMMENT ON COLUMN dutchie_discovery_cities.state_name IS 'Full state name from source';
COMMENT ON COLUMN dutchie_discovery_cities.discovered_at IS 'When city was first discovered';
COMMENT ON COLUMN dutchie_discovery_cities.last_verified_at IS 'When city was last verified to exist';
COMMENT ON COLUMN dutchie_discovery_cities.store_count_reported IS 'Store count reported by source';
COMMENT ON COLUMN dutchie_discovery_cities.store_count_actual IS 'Actual store count from discovery';

-- ============================================================
-- 5. UPDATE WORKER ROLES (Standardize naming)
-- ============================================================

-- Update existing workers to use standardized role names
UPDATE job_schedules SET worker_role = 'store_discovery'
WHERE worker_name = 'Alice' AND worker_role = 'Store Discovery';

UPDATE job_schedules SET worker_role = 'entry_point_finder'
WHERE worker_name = 'Henry' AND worker_role = 'Entry Point Finder';

UPDATE job_schedules SET worker_role = 'product_sync'
WHERE worker_name = 'Bella' AND worker_role = 'GraphQL Product Sync';

UPDATE job_schedules SET worker_role = 'analytics_refresh'
WHERE worker_name = 'Oscar' AND worker_role = 'Analytics Refresh';

-- ============================================================
-- 6. VISIBILITY EVENTS IN SNAPSHOTS (JSONB approach)
-- ============================================================

-- Add a visibility_events array to product snapshot metadata.
-- Each entry stores: {event_type, timestamp, worker_name}
-- No schema change is needed - we use the existing metadata JSONB column.
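-- Illustrative sketch of the JSONB approach (application-side; not run by
-- this migration; the snapshot id is a placeholder): append a visibility
-- event to a snapshot's metadata array.
--
--   UPDATE store_product_snapshots
--   SET metadata = jsonb_set(
--           COALESCE(metadata, '{}'::jsonb),
--           '{visibility_events}',
--           COALESCE(metadata->'visibility_events', '[]'::jsonb)
--               || jsonb_build_object(
--                      'event_type', 'visibility_lost',
--                      'timestamp', NOW(),
--                      'worker_name', 'Bella'))
--   WHERE id = 9001;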

-- ============================================================
-- 7. INDEXES FOR WORKER QUERIES
-- ============================================================

-- Index for finding recently added stores (for Henry)
CREATE INDEX IF NOT EXISTS idx_dutchie_discovery_locations_created
    ON dutchie_discovery_locations(created_at DESC)
    WHERE active = TRUE;

-- Index for scope-based queries (by state)
CREATE INDEX IF NOT EXISTS idx_dispensaries_state_menu
    ON dispensaries(state, menu_type)
    WHERE menu_type IS NOT NULL;

-- Record migration
INSERT INTO schema_migrations (version, name, applied_at)
VALUES (55, '055_workforce_enhancements', NOW())
ON CONFLICT (version) DO NOTHING;
110
backend/migrations/056_fix_worker_and_run_logs.sql
Normal file
@@ -0,0 +1,110 @@
-- Migration 056: Fix Worker Metadata and Job Run Logs
--
-- This migration safely ensures all expected schema exists for:
--   1. job_schedules - worker_name, worker_role columns
--   2. job_run_logs - entire table creation if missing
--
-- Uses IF NOT EXISTS / ADD COLUMN IF NOT EXISTS for idempotency.
-- Safe to run on databases that already have some or all of these changes.

-- ============================================================
-- 1. ADD MISSING COLUMNS TO job_schedules
-- ============================================================

ALTER TABLE job_schedules
    ADD COLUMN IF NOT EXISTS worker_name VARCHAR(50),
    ADD COLUMN IF NOT EXISTS worker_role VARCHAR(100);

COMMENT ON COLUMN job_schedules.worker_name IS 'Friendly name for the worker (e.g., Alice, Henry, Bella, Oscar)';
COMMENT ON COLUMN job_schedules.worker_role IS 'Description of worker role (e.g., store_discovery, product_sync)';

-- ============================================================
-- 2. CREATE job_run_logs TABLE IF NOT EXISTS
-- ============================================================

CREATE TABLE IF NOT EXISTS job_run_logs (
    id SERIAL PRIMARY KEY,
    schedule_id INTEGER NOT NULL REFERENCES job_schedules(id) ON DELETE CASCADE,
    job_name VARCHAR(100) NOT NULL,
    status VARCHAR(20) NOT NULL,          -- 'pending', 'running', 'success', 'error', 'partial'
    started_at TIMESTAMPTZ,
    completed_at TIMESTAMPTZ,
    duration_ms INTEGER,
    error_message TEXT,

    -- Results summary
    items_processed INTEGER DEFAULT 0,
    items_succeeded INTEGER DEFAULT 0,
    items_failed INTEGER DEFAULT 0,

    -- Worker metadata (from scheduler.ts createRunLog function)
    worker_name VARCHAR(50),
    run_role VARCHAR(100),

    -- Additional run details
    metadata JSONB,

    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Create indexes if they don't exist
CREATE INDEX IF NOT EXISTS idx_job_run_logs_schedule ON job_run_logs(schedule_id);
CREATE INDEX IF NOT EXISTS idx_job_run_logs_job_name ON job_run_logs(job_name);
CREATE INDEX IF NOT EXISTS idx_job_run_logs_status ON job_run_logs(status);
CREATE INDEX IF NOT EXISTS idx_job_run_logs_created ON job_run_logs(created_at);
CREATE INDEX IF NOT EXISTS idx_job_run_logs_worker_name ON job_run_logs(worker_name);
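-- Illustrative sketch of the row the scheduler's createRunLog writes (values
-- are placeholders; not executed by this migration):
--
--   INSERT INTO job_run_logs
--       (schedule_id, job_name, status, started_at, worker_name, run_role)
--   VALUES
--       (1, 'dutchie_az_product_crawl', 'running', NOW(), 'Bella', 'product_sync');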

-- ============================================================
-- 3. ADD enqueued_by_worker TO dispensary_crawl_jobs IF EXISTS
-- ============================================================

DO $$
BEGIN
    -- Only add the column if the dispensary_crawl_jobs table exists
    IF EXISTS (SELECT FROM information_schema.tables WHERE table_name = 'dispensary_crawl_jobs') THEN
        ALTER TABLE dispensary_crawl_jobs
            ADD COLUMN IF NOT EXISTS enqueued_by_worker VARCHAR(50);

        COMMENT ON COLUMN dispensary_crawl_jobs.enqueued_by_worker IS 'Name of the worker that enqueued this job';

        CREATE INDEX IF NOT EXISTS idx_dispensary_crawl_jobs_enqueued_by
            ON dispensary_crawl_jobs(enqueued_by_worker);
    END IF;
END $$;

-- ============================================================
-- 4. SEED DEFAULT WORKER NAMES FOR EXISTING SCHEDULES
-- ============================================================

UPDATE job_schedules SET
    worker_name = 'Bella',
    worker_role = 'product_sync'
WHERE job_name = 'dutchie_az_product_crawl' AND worker_name IS NULL;

UPDATE job_schedules SET
    worker_name = 'Henry',
    worker_role = 'entry_point_finder'
WHERE job_name = 'dutchie_az_menu_detection' AND worker_name IS NULL;

UPDATE job_schedules SET
    worker_name = 'Alice',
    worker_role = 'store_discovery'
WHERE job_name = 'dutchie_store_discovery' AND worker_name IS NULL;

UPDATE job_schedules SET
    worker_name = 'Oscar',
    worker_role = 'analytics_refresh'
WHERE job_name = 'analytics_refresh' AND worker_name IS NULL;

-- ============================================================
-- 5. RECORD MIGRATION (if schema_migrations table exists)
-- ============================================================

DO $$
BEGIN
    IF EXISTS (SELECT FROM information_schema.tables WHERE table_name = 'schema_migrations') THEN
        INSERT INTO schema_migrations (version, name, applied_at)
        VALUES (56, '056_fix_worker_and_run_logs', NOW())
        ON CONFLICT (version) DO NOTHING;
    END IF;
END $$;
64
backend/migrations/057_visibility_tracking_columns.sql
Normal file
@@ -0,0 +1,64 @@
-- Migration 057: Add visibility tracking columns to dutchie_products
--
-- Supports Bella (Product Sync) worker visibility-loss tracking:
--   - visibility_lost: TRUE when product disappears from GraphQL feed
--   - visibility_lost_at: Timestamp when product first went missing
--   - visibility_restored_at: Timestamp when product reappeared
--
-- These columns enable tracking of products that temporarily or permanently
-- disappear from Dutchie GraphQL API responses.

-- ============================================================
-- 1. ADD VISIBILITY TRACKING COLUMNS TO dutchie_products
-- ============================================================

ALTER TABLE dutchie_products
    ADD COLUMN IF NOT EXISTS visibility_lost BOOLEAN DEFAULT FALSE,
    ADD COLUMN IF NOT EXISTS visibility_lost_at TIMESTAMPTZ,
    ADD COLUMN IF NOT EXISTS visibility_restored_at TIMESTAMPTZ;

COMMENT ON COLUMN dutchie_products.visibility_lost IS 'TRUE when product is missing from GraphQL feed';
COMMENT ON COLUMN dutchie_products.visibility_lost_at IS 'Timestamp when product first went missing from feed';
COMMENT ON COLUMN dutchie_products.visibility_restored_at IS 'Timestamp when product reappeared after being missing';

-- ============================================================
-- 2. CREATE INDEXES FOR VISIBILITY QUERIES
-- ============================================================

CREATE INDEX IF NOT EXISTS idx_dutchie_products_visibility_lost
    ON dutchie_products(visibility_lost)
    WHERE visibility_lost = TRUE;

CREATE INDEX IF NOT EXISTS idx_dutchie_products_visibility_lost_at
    ON dutchie_products(visibility_lost_at)
    WHERE visibility_lost_at IS NOT NULL;

-- ============================================================
-- 3. CREATE VIEW FOR VISIBILITY ANALYTICS
-- ============================================================

CREATE OR REPLACE VIEW v_visibility_summary AS
SELECT
    d.id AS dispensary_id,
    d.name AS dispensary_name,
    d.state,
    COUNT(dp.id) AS total_products,
    COUNT(dp.id) FILTER (WHERE dp.visibility_lost = TRUE) AS visibility_lost_count,
    COUNT(dp.id) FILTER (WHERE dp.visibility_lost = FALSE OR dp.visibility_lost IS NULL) AS visible_count,
    COUNT(dp.id) FILTER (WHERE dp.visibility_restored_at IS NOT NULL) AS restored_count,
    MAX(dp.visibility_lost_at) AS latest_loss_at,
    MAX(dp.visibility_restored_at) AS latest_restore_at
FROM dispensaries d
LEFT JOIN dutchie_products dp ON d.id = dp.dispensary_id
WHERE d.menu_type = 'dutchie'
GROUP BY d.id, d.name, d.state;

COMMENT ON VIEW v_visibility_summary IS 'Aggregated visibility metrics per dispensary for dashboard analytics';
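-- Example usage (read-only; not part of the migration): dispensaries with
-- the most products currently missing from the feed.
--
--   SELECT dispensary_name, state, total_products, visibility_lost_count
--   FROM v_visibility_summary
--   ORDER BY visibility_lost_count DESC
--   LIMIT 10;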

-- ============================================================
-- 4. RECORD MIGRATION
-- ============================================================

INSERT INTO schema_migrations (version, name, applied_at)
VALUES (57, '057_visibility_tracking_columns', NOW())
ON CONFLICT (version) DO NOTHING;
46
backend/migrations/058_add_id_resolution_columns.sql
Normal file
@@ -0,0 +1,46 @@
-- Migration 058: Add ID resolution tracking columns to dispensaries
--
-- Supports Henry (Entry Point Finder) worker tracking:
--   - id_resolution_attempts: Count of how many times we've tried to resolve platform ID
--   - last_id_resolution_at: When we last tried (matches code expectation)
--   - id_resolution_status: Current status (pending, resolved, failed)
--   - id_resolution_error: Last error message from resolution attempt

-- ============================================================
-- 1. ADD ID RESOLUTION COLUMNS TO dispensaries
-- ============================================================

ALTER TABLE dispensaries
    ADD COLUMN IF NOT EXISTS id_resolution_attempts INTEGER DEFAULT 0,
    ADD COLUMN IF NOT EXISTS last_id_resolution_at TIMESTAMPTZ,
    ADD COLUMN IF NOT EXISTS id_resolution_status VARCHAR(20) DEFAULT 'pending',
    ADD COLUMN IF NOT EXISTS id_resolution_error TEXT;

COMMENT ON COLUMN dispensaries.id_resolution_attempts IS 'Number of attempts to resolve platform_dispensary_id';
COMMENT ON COLUMN dispensaries.last_id_resolution_at IS 'Timestamp of last ID resolution attempt';
COMMENT ON COLUMN dispensaries.id_resolution_status IS 'Status: pending, resolved, failed';
COMMENT ON COLUMN dispensaries.id_resolution_error IS 'Last error message from ID resolution attempt';

-- Additional columns needed by worker/scheduler
ALTER TABLE dispensaries
    ADD COLUMN IF NOT EXISTS failed_at TIMESTAMPTZ,
    ADD COLUMN IF NOT EXISTS failure_notes TEXT;

COMMENT ON COLUMN dispensaries.failed_at IS 'Timestamp when dispensary was marked as permanently failed';
COMMENT ON COLUMN dispensaries.failure_notes IS 'Notes about why dispensary was marked as failed';

-- ============================================================
-- 2. CREATE INDEX FOR RESOLUTION QUERIES
-- ============================================================

CREATE INDEX IF NOT EXISTS idx_dispensaries_id_resolution_status
    ON dispensaries(id_resolution_status)
    WHERE id_resolution_status = 'pending';
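-- Illustrative sketch of how a failed attempt might be recorded
-- (application-side; the id and error text are placeholders):
--
--   UPDATE dispensaries
--   SET id_resolution_attempts = id_resolution_attempts + 1,
--       last_id_resolution_at  = NOW(),
--       id_resolution_status   = 'failed',
--       id_resolution_error    = 'slug not found on platform'
--   WHERE id = 42;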

-- ============================================================
-- 3. RECORD MIGRATION
-- ============================================================

INSERT INTO schema_migrations (version, name, applied_at)
VALUES (58, '058_add_id_resolution_columns', NOW())
ON CONFLICT (version) DO NOTHING;
67
backend/migrations/059_job_queue_columns.sql
Normal file
@@ -0,0 +1,67 @@
-- Migration 059: Add missing columns to dispensary_crawl_jobs
--
-- Required for worker job processing:
--   - max_retries: Maximum retry attempts for a job
--   - retry_count: Current retry count
--   - worker_id: ID of worker processing the job
--   - locked_at: When the job was locked by a worker
--   - locked_by: Hostname of worker that locked the job

-- ============================================================
-- 1. ADD JOB QUEUE COLUMNS
-- ============================================================

ALTER TABLE dispensary_crawl_jobs
    ADD COLUMN IF NOT EXISTS max_retries INTEGER DEFAULT 3,
    ADD COLUMN IF NOT EXISTS retry_count INTEGER DEFAULT 0,
    ADD COLUMN IF NOT EXISTS worker_id VARCHAR(100),
    ADD COLUMN IF NOT EXISTS locked_at TIMESTAMPTZ,
    ADD COLUMN IF NOT EXISTS locked_by VARCHAR(100),
    ADD COLUMN IF NOT EXISTS updated_at TIMESTAMPTZ DEFAULT NOW();

COMMENT ON COLUMN dispensary_crawl_jobs.max_retries IS 'Maximum number of retry attempts';
COMMENT ON COLUMN dispensary_crawl_jobs.retry_count IS 'Current retry count';
COMMENT ON COLUMN dispensary_crawl_jobs.worker_id IS 'ID of worker processing this job';
COMMENT ON COLUMN dispensary_crawl_jobs.locked_at IS 'When job was locked by worker';
COMMENT ON COLUMN dispensary_crawl_jobs.locked_by IS 'Hostname of worker that locked job';

-- ============================================================
-- 2. CREATE INDEXES FOR JOB QUEUE QUERIES
-- ============================================================

CREATE INDEX IF NOT EXISTS idx_crawl_jobs_status_priority
    ON dispensary_crawl_jobs(status, priority DESC)
    WHERE status = 'pending';

CREATE INDEX IF NOT EXISTS idx_crawl_jobs_worker_id
    ON dispensary_crawl_jobs(worker_id)
    WHERE worker_id IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_crawl_jobs_locked_at
    ON dispensary_crawl_jobs(locked_at)
    WHERE locked_at IS NOT NULL;

-- ============================================================
-- 3. CREATE QUEUE STATS VIEW
-- ============================================================

CREATE OR REPLACE VIEW v_queue_stats AS
SELECT
    COUNT(*) FILTER (WHERE status = 'pending') AS pending_jobs,
    COUNT(*) FILTER (WHERE status = 'running') AS running_jobs,
    COUNT(*) FILTER (WHERE status = 'completed' AND completed_at > NOW() - INTERVAL '1 hour') AS completed_1h,
    COUNT(*) FILTER (WHERE status = 'failed' AND completed_at > NOW() - INTERVAL '1 hour') AS failed_1h,
    COUNT(DISTINCT worker_id) FILTER (WHERE status = 'running') AS active_workers,
    ROUND((AVG(EXTRACT(EPOCH FROM (completed_at - started_at)))
        FILTER (WHERE status = 'completed' AND completed_at > NOW() - INTERVAL '1 hour'))::numeric, 2) AS avg_duration_seconds
FROM dispensary_crawl_jobs;

COMMENT ON VIEW v_queue_stats IS 'Real-time queue statistics for monitoring dashboard';
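-- Illustrative claim pattern these columns support (application-side; the
-- worker and host names are placeholders): atomically take one pending job,
-- skipping rows already locked by other workers.
--
--   WITH next_job AS (
--       SELECT id
--       FROM dispensary_crawl_jobs
--       WHERE status = 'pending'
--       ORDER BY priority DESC
--       FOR UPDATE SKIP LOCKED
--       LIMIT 1
--   )
--   UPDATE dispensary_crawl_jobs j
--   SET status = 'running',
--       worker_id = 'worker-1',
--       locked_at = NOW(),
--       locked_by = 'crawler-host-a',
--       updated_at = NOW()
--   FROM next_job
--   WHERE j.id = next_job.id
--   RETURNING j.id;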

-- ============================================================
-- 4. RECORD MIGRATION
-- ============================================================

INSERT INTO schema_migrations (version, name, applied_at)
VALUES (59, '059_job_queue_columns', NOW())
ON CONFLICT (version) DO NOTHING;
281
backend/node_modules/.package-lock.json
generated
vendored
@@ -1,6 +1,6 @@
{
  "name": "dutchie-menus-backend",
  "version": "1.0.0",
  "version": "1.5.1",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {
@@ -575,6 +575,11 @@
        "npm": "1.2.8000 || >= 1.4.16"
      }
    },
    "node_modules/boolbase": {
      "version": "1.0.0",
      "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz",
      "integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww=="
    },
    "node_modules/brace-expansion": {
      "version": "1.1.12",
      "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.12.tgz",
@@ -685,6 +690,46 @@
        "node": ">=6"
      }
    },
    "node_modules/cheerio": {
      "version": "1.1.2",
      "resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.1.2.tgz",
      "integrity": "sha512-IkxPpb5rS/d1IiLbHMgfPuS0FgiWTtFIm/Nj+2woXDLTZ7fOT2eqzgYbdMlLweqlHbsZjxEChoVK+7iph7jyQg==",
      "dependencies": {
        "cheerio-select": "^2.1.0",
        "dom-serializer": "^2.0.0",
        "domhandler": "^5.0.3",
        "domutils": "^3.2.2",
        "encoding-sniffer": "^0.2.1",
        "htmlparser2": "^10.0.0",
        "parse5": "^7.3.0",
        "parse5-htmlparser2-tree-adapter": "^7.1.0",
        "parse5-parser-stream": "^7.1.2",
        "undici": "^7.12.0",
        "whatwg-mimetype": "^4.0.0"
      },
      "engines": {
        "node": ">=20.18.1"
      },
      "funding": {
        "url": "https://github.com/cheeriojs/cheerio?sponsor=1"
      }
    },
    "node_modules/cheerio-select": {
      "version": "2.1.0",
      "resolved": "https://registry.npmjs.org/cheerio-select/-/cheerio-select-2.1.0.tgz",
      "integrity": "sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g==",
      "dependencies": {
        "boolbase": "^1.0.0",
        "css-select": "^5.1.0",
        "css-what": "^6.1.0",
        "domelementtype": "^2.3.0",
        "domhandler": "^5.0.3",
        "domutils": "^3.0.1"
      },
      "funding": {
        "url": "https://github.com/sponsors/fb55"
      }
    },
    "node_modules/chownr": {
      "version": "2.0.0",
      "resolved": "https://registry.npmjs.org/chownr/-/chownr-2.0.0.tgz",
@@ -876,6 +921,32 @@
        "node-fetch": "^2.6.12"
      }
    },
    "node_modules/css-select": {
      "version": "5.2.2",
      "resolved": "https://registry.npmjs.org/css-select/-/css-select-5.2.2.tgz",
      "integrity": "sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw==",
      "dependencies": {
        "boolbase": "^1.0.0",
        "css-what": "^6.1.0",
        "domhandler": "^5.0.2",
        "domutils": "^3.0.1",
        "nth-check": "^2.0.1"
      },
      "funding": {
        "url": "https://github.com/sponsors/fb55"
      }
    },
    "node_modules/css-what": {
      "version": "6.2.2",
      "resolved": "https://registry.npmjs.org/css-what/-/css-what-6.2.2.tgz",
      "integrity": "sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA==",
      "engines": {
        "node": ">= 6"
      },
      "funding": {
        "url": "https://github.com/sponsors/fb55"
      }
    },
    "node_modules/data-uri-to-buffer": {
      "version": "6.0.2",
      "resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz",
@@ -1002,6 +1073,57 @@
      "resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.1232444.tgz",
      "integrity": "sha512-pM27vqEfxSxRkTMnF+XCmxSEb6duO5R+t8A9DEEJgy4Wz2RVanje2mmj99B6A3zv2r/qGfYlOvYznUhuokizmg=="
    },
    "node_modules/dom-serializer": {
      "version": "2.0.0",
      "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz",
      "integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==",
      "dependencies": {
        "domelementtype": "^2.3.0",
        "domhandler": "^5.0.2",
        "entities": "^4.2.0"
      },
      "funding": {
        "url": "https://github.com/cheeriojs/dom-serializer?sponsor=1"
      }
    },
    "node_modules/domelementtype": {
      "version": "2.3.0",
      "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz",
      "integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==",
      "funding": [
        {
          "type": "github",
          "url": "https://github.com/sponsors/fb55"
        }
      ]
    },
    "node_modules/domhandler": {
      "version": "5.0.3",
      "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz",
      "integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==",
      "dependencies": {
        "domelementtype": "^2.3.0"
      },
      "engines": {
        "node": ">= 4"
      },
      "funding": {
        "url": "https://github.com/fb55/domhandler?sponsor=1"
      }
    },
    "node_modules/domutils": {
      "version": "3.2.2",
      "resolved": "https://registry.npmjs.org/domutils/-/domutils-3.2.2.tgz",
      "integrity": "sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw==",
      "dependencies": {
        "dom-serializer": "^2.0.0",
|
||||
"domelementtype": "^2.3.0",
|
||||
"domhandler": "^5.0.3"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/fb55/domutils?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/dotenv": {
|
||||
"version": "16.6.1",
|
||||
"resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.6.1.tgz",
|
||||
@@ -1052,6 +1174,29 @@
|
||||
"node": ">= 0.8"
|
||||
}
|
||||
},
|
||||
"node_modules/encoding-sniffer": {
|
||||
"version": "0.2.1",
|
||||
"resolved": "https://registry.npmjs.org/encoding-sniffer/-/encoding-sniffer-0.2.1.tgz",
|
||||
"integrity": "sha512-5gvq20T6vfpekVtqrYQsSCFZ1wEg5+wW0/QaZMWkFr6BqD3NfKs0rLCx4rrVlSWJeZb5NBJgVLswK/w2MWU+Gw==",
|
||||
"dependencies": {
|
||||
"iconv-lite": "^0.6.3",
|
||||
"whatwg-encoding": "^3.1.1"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/fb55/encoding-sniffer?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/encoding-sniffer/node_modules/iconv-lite": {
|
||||
"version": "0.6.3",
|
||||
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz",
|
||||
"integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==",
|
||||
"dependencies": {
|
||||
"safer-buffer": ">= 2.1.2 < 3.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=0.10.0"
|
||||
}
|
||||
},
|
||||
"node_modules/end-of-stream": {
|
||||
"version": "1.4.5",
|
||||
"resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.5.tgz",
|
||||
@@ -1060,6 +1205,17 @@
|
||||
"once": "^1.4.0"
|
||||
}
|
||||
},
|
||||
"node_modules/entities": {
|
||||
"version": "4.5.0",
|
||||
"resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz",
|
||||
"integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==",
|
||||
"engines": {
|
||||
"node": ">=0.12"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/fb55/entities?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/env-paths": {
|
||||
"version": "2.2.1",
|
||||
"resolved": "https://registry.npmjs.org/env-paths/-/env-paths-2.2.1.tgz",
|
||||
@@ -1765,6 +1921,35 @@
|
||||
"node": ">=16.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/htmlparser2": {
|
||||
"version": "10.0.0",
|
||||
"resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-10.0.0.tgz",
|
||||
"integrity": "sha512-TwAZM+zE5Tq3lrEHvOlvwgj1XLWQCtaaibSN11Q+gGBAS7Y1uZSWwXXRe4iF6OXnaq1riyQAPFOBtYc77Mxq0g==",
|
||||
"funding": [
|
||||
"https://github.com/fb55/htmlparser2?sponsor=1",
|
||||
{
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/fb55"
|
||||
}
|
||||
],
|
||||
"dependencies": {
|
||||
"domelementtype": "^2.3.0",
|
||||
"domhandler": "^5.0.3",
|
||||
"domutils": "^3.2.1",
|
||||
"entities": "^6.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/htmlparser2/node_modules/entities": {
|
||||
"version": "6.0.1",
|
||||
"resolved": "https://registry.npmjs.org/entities/-/entities-6.0.1.tgz",
|
||||
"integrity": "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==",
|
||||
"engines": {
|
||||
"node": ">=0.12"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/fb55/entities?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/http-errors": {
|
||||
"version": "2.0.0",
|
||||
"resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.0.tgz",
|
||||
@@ -2530,6 +2715,17 @@
|
||||
"set-blocking": "^2.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/nth-check": {
|
||||
"version": "2.1.1",
|
||||
"resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz",
|
||||
"integrity": "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==",
|
||||
"dependencies": {
|
||||
"boolbase": "^1.0.0"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/fb55/nth-check?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/object-assign": {
|
||||
"version": "4.1.1",
|
||||
"resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz",
|
||||
@@ -2647,6 +2843,51 @@
|
||||
"url": "https://github.com/sponsors/sindresorhus"
|
||||
}
|
||||
},
|
||||
"node_modules/parse5": {
|
||||
"version": "7.3.0",
|
||||
"resolved": "https://registry.npmjs.org/parse5/-/parse5-7.3.0.tgz",
|
||||
"integrity": "sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw==",
|
||||
"dependencies": {
|
||||
"entities": "^6.0.0"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/inikulin/parse5?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/parse5-htmlparser2-tree-adapter": {
|
||||
"version": "7.1.0",
|
||||
"resolved": "https://registry.npmjs.org/parse5-htmlparser2-tree-adapter/-/parse5-htmlparser2-tree-adapter-7.1.0.tgz",
|
||||
"integrity": "sha512-ruw5xyKs6lrpo9x9rCZqZZnIUntICjQAd0Wsmp396Ul9lN/h+ifgVV1x1gZHi8euej6wTfpqX8j+BFQxF0NS/g==",
|
||||
"dependencies": {
|
||||
"domhandler": "^5.0.3",
|
||||
"parse5": "^7.0.0"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/inikulin/parse5?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/parse5-parser-stream": {
|
||||
"version": "7.1.2",
|
||||
"resolved": "https://registry.npmjs.org/parse5-parser-stream/-/parse5-parser-stream-7.1.2.tgz",
|
||||
"integrity": "sha512-JyeQc9iwFLn5TbvvqACIF/VXG6abODeB3Fwmv/TGdLk2LfbWkaySGY72at4+Ty7EkPZj854u4CrICqNk2qIbow==",
|
||||
"dependencies": {
|
||||
"parse5": "^7.0.0"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/inikulin/parse5?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/parse5/node_modules/entities": {
|
||||
"version": "6.0.1",
|
||||
"resolved": "https://registry.npmjs.org/entities/-/entities-6.0.1.tgz",
|
||||
"integrity": "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==",
|
||||
"engines": {
|
||||
"node": ">=0.12"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/fb55/entities?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/parseurl": {
|
||||
"version": "1.3.3",
|
||||
"resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz",
|
||||
@@ -4040,6 +4281,14 @@
|
||||
"through": "^2.3.8"
|
||||
}
|
||||
},
|
||||
"node_modules/undici": {
|
||||
"version": "7.16.0",
|
||||
"resolved": "https://registry.npmjs.org/undici/-/undici-7.16.0.tgz",
|
||||
"integrity": "sha512-QEg3HPMll0o3t2ourKwOeUAZ159Kn9mx5pnzHRQO8+Wixmh88YdZRiIwat0iNzNNXn0yoEtXJqFpyW7eM8BV7g==",
|
||||
"engines": {
|
||||
"node": ">=20.18.1"
|
||||
}
|
||||
},
|
||||
"node_modules/undici-types": {
|
||||
"version": "6.21.0",
|
||||
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz",
|
||||
@@ -4128,6 +4377,36 @@
|
||||
"resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz",
|
||||
"integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ=="
|
||||
},
|
||||
"node_modules/whatwg-encoding": {
|
||||
"version": "3.1.1",
|
||||
"resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz",
|
||||
"integrity": "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==",
|
||||
"dependencies": {
|
||||
"iconv-lite": "0.6.3"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
}
|
||||
},
|
||||
"node_modules/whatwg-encoding/node_modules/iconv-lite": {
|
||||
"version": "0.6.3",
|
||||
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz",
|
||||
"integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==",
|
||||
"dependencies": {
|
||||
"safer-buffer": ">= 2.1.2 < 3.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=0.10.0"
|
||||
}
|
||||
},
|
||||
"node_modules/whatwg-mimetype": {
|
||||
"version": "4.0.0",
|
||||
"resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-4.0.0.tgz",
|
||||
"integrity": "sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==",
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
}
|
||||
},
|
||||
"node_modules/whatwg-url": {
|
||||
"version": "5.0.0",
|
||||
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz",
|
||||
|
||||
284
backend/package-lock.json
generated
284
backend/package-lock.json
generated
@@ -1,15 +1,16 @@
{
  "name": "dutchie-menus-backend",
-  "version": "1.0.0",
+  "version": "1.5.1",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {
    "": {
      "name": "dutchie-menus-backend",
-      "version": "1.0.0",
+      "version": "1.5.1",
      "dependencies": {
        "axios": "^1.6.2",
        "bcrypt": "^5.1.1",
+        "cheerio": "^1.1.2",
        "cors": "^2.8.5",
        "dotenv": "^16.3.1",
        "express": "^4.18.2",
@@ -1015,6 +1016,11 @@
        "npm": "1.2.8000 || >= 1.4.16"
      }
    },
    "node_modules/boolbase": {
      "version": "1.0.0",
      "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz",
      "integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww=="
    },
    "node_modules/brace-expansion": {
      "version": "1.1.12",
      "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.12.tgz",
@@ -1125,6 +1131,46 @@
        "node": ">=6"
      }
    },
    "node_modules/cheerio": {
      "version": "1.1.2",
      "resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.1.2.tgz",
      "integrity": "sha512-IkxPpb5rS/d1IiLbHMgfPuS0FgiWTtFIm/Nj+2woXDLTZ7fOT2eqzgYbdMlLweqlHbsZjxEChoVK+7iph7jyQg==",
      "dependencies": {
        "cheerio-select": "^2.1.0",
        "dom-serializer": "^2.0.0",
        "domhandler": "^5.0.3",
        "domutils": "^3.2.2",
        "encoding-sniffer": "^0.2.1",
        "htmlparser2": "^10.0.0",
        "parse5": "^7.3.0",
        "parse5-htmlparser2-tree-adapter": "^7.1.0",
        "parse5-parser-stream": "^7.1.2",
        "undici": "^7.12.0",
        "whatwg-mimetype": "^4.0.0"
      },
      "engines": {
        "node": ">=20.18.1"
      },
      "funding": {
        "url": "https://github.com/cheeriojs/cheerio?sponsor=1"
      }
    },
    "node_modules/cheerio-select": {
      "version": "2.1.0",
      "resolved": "https://registry.npmjs.org/cheerio-select/-/cheerio-select-2.1.0.tgz",
      "integrity": "sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g==",
      "dependencies": {
        "boolbase": "^1.0.0",
        "css-select": "^5.1.0",
        "css-what": "^6.1.0",
        "domelementtype": "^2.3.0",
        "domhandler": "^5.0.3",
        "domutils": "^3.0.1"
      },
      "funding": {
        "url": "https://github.com/sponsors/fb55"
      }
    },
    "node_modules/chownr": {
      "version": "2.0.0",
      "resolved": "https://registry.npmjs.org/chownr/-/chownr-2.0.0.tgz",
@@ -1316,6 +1362,32 @@
        "node-fetch": "^2.6.12"
      }
    },
    "node_modules/css-select": {
      "version": "5.2.2",
      "resolved": "https://registry.npmjs.org/css-select/-/css-select-5.2.2.tgz",
      "integrity": "sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw==",
      "dependencies": {
        "boolbase": "^1.0.0",
        "css-what": "^6.1.0",
        "domhandler": "^5.0.2",
        "domutils": "^3.0.1",
        "nth-check": "^2.0.1"
      },
      "funding": {
        "url": "https://github.com/sponsors/fb55"
      }
    },
    "node_modules/css-what": {
      "version": "6.2.2",
      "resolved": "https://registry.npmjs.org/css-what/-/css-what-6.2.2.tgz",
      "integrity": "sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA==",
      "engines": {
        "node": ">= 6"
      },
      "funding": {
        "url": "https://github.com/sponsors/fb55"
      }
    },
    "node_modules/data-uri-to-buffer": {
      "version": "6.0.2",
      "resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz",
@@ -1442,6 +1514,57 @@
      "resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.1232444.tgz",
      "integrity": "sha512-pM27vqEfxSxRkTMnF+XCmxSEb6duO5R+t8A9DEEJgy4Wz2RVanje2mmj99B6A3zv2r/qGfYlOvYznUhuokizmg=="
    },
    "node_modules/dom-serializer": {
      "version": "2.0.0",
      "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz",
      "integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==",
      "dependencies": {
        "domelementtype": "^2.3.0",
        "domhandler": "^5.0.2",
        "entities": "^4.2.0"
      },
      "funding": {
        "url": "https://github.com/cheeriojs/dom-serializer?sponsor=1"
      }
    },
    "node_modules/domelementtype": {
      "version": "2.3.0",
      "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz",
      "integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==",
      "funding": [
        {
          "type": "github",
          "url": "https://github.com/sponsors/fb55"
        }
      ]
    },
    "node_modules/domhandler": {
      "version": "5.0.3",
      "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz",
      "integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==",
      "dependencies": {
        "domelementtype": "^2.3.0"
      },
      "engines": {
        "node": ">= 4"
      },
      "funding": {
        "url": "https://github.com/fb55/domhandler?sponsor=1"
      }
    },
    "node_modules/domutils": {
      "version": "3.2.2",
      "resolved": "https://registry.npmjs.org/domutils/-/domutils-3.2.2.tgz",
      "integrity": "sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw==",
      "dependencies": {
        "dom-serializer": "^2.0.0",
        "domelementtype": "^2.3.0",
        "domhandler": "^5.0.3"
      },
      "funding": {
        "url": "https://github.com/fb55/domutils?sponsor=1"
      }
    },
    "node_modules/dotenv": {
      "version": "16.6.1",
      "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.6.1.tgz",
@@ -1492,6 +1615,29 @@
        "node": ">= 0.8"
      }
    },
    "node_modules/encoding-sniffer": {
      "version": "0.2.1",
      "resolved": "https://registry.npmjs.org/encoding-sniffer/-/encoding-sniffer-0.2.1.tgz",
      "integrity": "sha512-5gvq20T6vfpekVtqrYQsSCFZ1wEg5+wW0/QaZMWkFr6BqD3NfKs0rLCx4rrVlSWJeZb5NBJgVLswK/w2MWU+Gw==",
      "dependencies": {
        "iconv-lite": "^0.6.3",
        "whatwg-encoding": "^3.1.1"
      },
      "funding": {
        "url": "https://github.com/fb55/encoding-sniffer?sponsor=1"
      }
    },
    "node_modules/encoding-sniffer/node_modules/iconv-lite": {
      "version": "0.6.3",
      "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz",
      "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==",
      "dependencies": {
        "safer-buffer": ">= 2.1.2 < 3.0.0"
      },
      "engines": {
        "node": ">=0.10.0"
      }
    },
    "node_modules/end-of-stream": {
      "version": "1.4.5",
      "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.5.tgz",
@@ -1500,6 +1646,17 @@
        "once": "^1.4.0"
      }
    },
    "node_modules/entities": {
      "version": "4.5.0",
      "resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz",
      "integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==",
      "engines": {
        "node": ">=0.12"
      },
      "funding": {
        "url": "https://github.com/fb55/entities?sponsor=1"
      }
    },
    "node_modules/env-paths": {
      "version": "2.2.1",
      "resolved": "https://registry.npmjs.org/env-paths/-/env-paths-2.2.1.tgz",
@@ -2219,6 +2376,35 @@
        "node": ">=16.0.0"
      }
    },
    "node_modules/htmlparser2": {
      "version": "10.0.0",
      "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-10.0.0.tgz",
      "integrity": "sha512-TwAZM+zE5Tq3lrEHvOlvwgj1XLWQCtaaibSN11Q+gGBAS7Y1uZSWwXXRe4iF6OXnaq1riyQAPFOBtYc77Mxq0g==",
      "funding": [
        "https://github.com/fb55/htmlparser2?sponsor=1",
        {
          "type": "github",
          "url": "https://github.com/sponsors/fb55"
        }
      ],
      "dependencies": {
        "domelementtype": "^2.3.0",
        "domhandler": "^5.0.3",
        "domutils": "^3.2.1",
        "entities": "^6.0.0"
      }
    },
    "node_modules/htmlparser2/node_modules/entities": {
      "version": "6.0.1",
      "resolved": "https://registry.npmjs.org/entities/-/entities-6.0.1.tgz",
      "integrity": "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==",
      "engines": {
        "node": ">=0.12"
      },
      "funding": {
        "url": "https://github.com/fb55/entities?sponsor=1"
      }
    },
    "node_modules/http-errors": {
      "version": "2.0.0",
      "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.0.tgz",
@@ -2984,6 +3170,17 @@
        "set-blocking": "^2.0.0"
      }
    },
    "node_modules/nth-check": {
      "version": "2.1.1",
      "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz",
      "integrity": "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==",
      "dependencies": {
        "boolbase": "^1.0.0"
      },
      "funding": {
        "url": "https://github.com/fb55/nth-check?sponsor=1"
      }
    },
    "node_modules/object-assign": {
      "version": "4.1.1",
      "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz",
@@ -3101,6 +3298,51 @@
        "url": "https://github.com/sponsors/sindresorhus"
      }
    },
    "node_modules/parse5": {
      "version": "7.3.0",
      "resolved": "https://registry.npmjs.org/parse5/-/parse5-7.3.0.tgz",
      "integrity": "sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw==",
      "dependencies": {
        "entities": "^6.0.0"
      },
      "funding": {
        "url": "https://github.com/inikulin/parse5?sponsor=1"
      }
    },
    "node_modules/parse5-htmlparser2-tree-adapter": {
      "version": "7.1.0",
      "resolved": "https://registry.npmjs.org/parse5-htmlparser2-tree-adapter/-/parse5-htmlparser2-tree-adapter-7.1.0.tgz",
      "integrity": "sha512-ruw5xyKs6lrpo9x9rCZqZZnIUntICjQAd0Wsmp396Ul9lN/h+ifgVV1x1gZHi8euej6wTfpqX8j+BFQxF0NS/g==",
      "dependencies": {
        "domhandler": "^5.0.3",
        "parse5": "^7.0.0"
      },
      "funding": {
        "url": "https://github.com/inikulin/parse5?sponsor=1"
      }
    },
    "node_modules/parse5-parser-stream": {
      "version": "7.1.2",
      "resolved": "https://registry.npmjs.org/parse5-parser-stream/-/parse5-parser-stream-7.1.2.tgz",
      "integrity": "sha512-JyeQc9iwFLn5TbvvqACIF/VXG6abODeB3Fwmv/TGdLk2LfbWkaySGY72at4+Ty7EkPZj854u4CrICqNk2qIbow==",
      "dependencies": {
        "parse5": "^7.0.0"
      },
      "funding": {
        "url": "https://github.com/inikulin/parse5?sponsor=1"
      }
    },
    "node_modules/parse5/node_modules/entities": {
      "version": "6.0.1",
      "resolved": "https://registry.npmjs.org/entities/-/entities-6.0.1.tgz",
      "integrity": "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==",
      "engines": {
        "node": ">=0.12"
      },
      "funding": {
        "url": "https://github.com/fb55/entities?sponsor=1"
      }
    },
    "node_modules/parseurl": {
      "version": "1.3.3",
      "resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz",
@@ -4507,6 +4749,14 @@
        "through": "^2.3.8"
      }
    },
    "node_modules/undici": {
      "version": "7.16.0",
      "resolved": "https://registry.npmjs.org/undici/-/undici-7.16.0.tgz",
      "integrity": "sha512-QEg3HPMll0o3t2ourKwOeUAZ159Kn9mx5pnzHRQO8+Wixmh88YdZRiIwat0iNzNNXn0yoEtXJqFpyW7eM8BV7g==",
      "engines": {
        "node": ">=20.18.1"
      }
    },
    "node_modules/undici-types": {
      "version": "6.21.0",
      "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz",
@@ -4595,6 +4845,36 @@
      "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz",
      "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ=="
    },
    "node_modules/whatwg-encoding": {
      "version": "3.1.1",
      "resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz",
      "integrity": "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==",
      "dependencies": {
        "iconv-lite": "0.6.3"
      },
      "engines": {
        "node": ">=18"
      }
    },
    "node_modules/whatwg-encoding/node_modules/iconv-lite": {
      "version": "0.6.3",
      "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz",
      "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==",
      "dependencies": {
        "safer-buffer": ">= 2.1.2 < 3.0.0"
      },
      "engines": {
        "node": ">=0.10.0"
      }
    },
    "node_modules/whatwg-mimetype": {
      "version": "4.0.0",
      "resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-4.0.0.tgz",
      "integrity": "sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==",
      "engines": {
        "node": ">=18"
      }
    },
    "node_modules/whatwg-url": {
      "version": "5.0.0",
      "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz",
backend/package.json
@@ -10,11 +10,18 @@
    "migrate": "tsx src/db/migrate.ts",
    "seed": "tsx src/db/seed.ts",
    "migrate:az": "tsx src/dutchie-az/db/migrate.ts",
-    "health:az": "tsx -e \"import { healthCheck } from './src/dutchie-az/db/connection'; (async()=>{ const ok=await healthCheck(); console.log(ok?'AZ DB healthy':'AZ DB NOT reachable'); process.exit(ok?0:1); })();\""
+    "health:az": "tsx -e \"import { healthCheck } from './src/dutchie-az/db/connection'; (async()=>{ const ok=await healthCheck(); console.log(ok?'AZ DB healthy':'AZ DB NOT reachable'); process.exit(ok?0:1); })();\"",
+    "system:smoke-test": "tsx src/scripts/system-smoke-test.ts",
+    "discovery:dt:cities:auto": "tsx src/dutchie-az/discovery/discovery-dt-cities-auto.ts",
+    "discovery:dt:cities:manual": "tsx src/dutchie-az/discovery/discovery-dt-cities-manual-seed.ts",
+    "discovery:dt:locations": "tsx src/dutchie-az/discovery/discovery-dt-locations-from-cities.ts",
+    "backfill:legacy:canonical": "tsx src/scripts/backfill-legacy-to-canonical.ts",
+    "seed:dt:cities:bulk": "tsx src/scripts/seed-dt-cities-bulk.ts"
  },
  "dependencies": {
    "axios": "^1.6.2",
    "bcrypt": "^5.1.1",
+    "cheerio": "^1.1.2",
    "cors": "^2.8.5",
    "dotenv": "^16.3.1",
    "express": "^4.18.2",
224
backend/setup-local.sh
Executable file
224
backend/setup-local.sh
Executable file
@@ -0,0 +1,224 @@
#!/bin/bash
# CannaiQ Local Development Setup (Idempotent)
#
# This script starts the complete local development environment:
#   - PostgreSQL (cannaiq-postgres) on port 54320
#   - Backend API on port 3010
#   - CannaiQ Admin UI on port 8080
#   - FindADispo Consumer UI on port 3001
#   - Findagram Consumer UI on port 3002
#
# Usage: ./setup-local.sh
#
# URLs:
#   Admin:      http://localhost:8080/admin
#   FindADispo: http://localhost:3001
#   Findagram:  http://localhost:3002
#   Backend:    http://localhost:3010
#
# Idempotent: Safe to run multiple times. Already-running services are left alone.

set -e

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

echo -e "${BLUE}================================${NC}"
echo -e "${BLUE} CannaiQ Local Dev Setup${NC}"
echo -e "${BLUE}================================${NC}"
echo ""

# Check for required tools
command -v docker >/dev/null 2>&1 || { echo -e "${RED}Error: docker is required but not installed.${NC}" >&2; exit 1; }
command -v npm >/dev/null 2>&1 || { echo -e "${RED}Error: npm is required but not installed.${NC}" >&2; exit 1; }

# Get the script directory
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
ROOT_DIR="$SCRIPT_DIR/.."
cd "$SCRIPT_DIR"

# Step 1: PostgreSQL
PG_RUNNING=$(docker ps --filter "name=cannaiq-postgres" --filter "status=running" -q)
if [ -n "$PG_RUNNING" ]; then
  echo -e "${GREEN}[1/6] PostgreSQL already running (cannaiq-postgres)${NC}"
else
  echo -e "${YELLOW}[1/6] Starting PostgreSQL (cannaiq-postgres)...${NC}"
  docker compose -f docker-compose.local.yml up -d cannaiq-postgres

  # Wait for PostgreSQL to be ready
  echo -e "${YELLOW} Waiting for PostgreSQL to be ready...${NC}"
  until docker exec cannaiq-postgres pg_isready -U cannaiq >/dev/null 2>&1; do
    sleep 1
  done
  echo -e "${GREEN} PostgreSQL ready on port 54320${NC}"
fi

# Step 2: Create storage directories (always safe to run)
mkdir -p storage/images/products
mkdir -p storage/images/brands
mkdir -p public/images

# Step 3: Backend
if lsof -i:3010 >/dev/null 2>&1; then
  echo -e "${GREEN}[2/6] Backend already running on port 3010${NC}"
else
  echo -e "${YELLOW}[2/6] Starting Backend API...${NC}"

  # Install dependencies if needed
  if [ ! -d "node_modules" ]; then
    echo -e "${YELLOW} Installing backend dependencies...${NC}"
    npm install
  fi

  # Set environment for local mode
  export STORAGE_DRIVER=local
  export STORAGE_BASE_PATH=./storage
  export PORT=3010

  # Start backend in background
  npm run dev > /tmp/cannaiq-backend.log 2>&1 &
  BACKEND_PID=$!
  echo $BACKEND_PID > /tmp/cannaiq-backend.pid
  echo -e "${GREEN} Backend starting (PID: $BACKEND_PID)${NC}"

  # Wait briefly for backend to start
  sleep 3
fi

# Step 4: CannaiQ Admin UI
if lsof -i:8080 >/dev/null 2>&1; then
  echo -e "${GREEN}[3/6] CannaiQ Admin already running on port 8080${NC}"
else
  echo -e "${YELLOW}[3/6] Starting CannaiQ Admin UI...${NC}"

  cd "$ROOT_DIR/cannaiq"

  # Install dependencies if needed
  if [ ! -d "node_modules" ]; then
    echo -e "${YELLOW} Installing cannaiq dependencies...${NC}"
    npm install
  fi

  # Start frontend in background
  npm run dev:admin > /tmp/cannaiq-frontend.log 2>&1 &
  FRONTEND_PID=$!
  echo $FRONTEND_PID > /tmp/cannaiq-frontend.pid
  echo -e "${GREEN} CannaiQ Admin starting (PID: $FRONTEND_PID)${NC}"

  cd "$SCRIPT_DIR"
fi

# Step 5: FindADispo Consumer UI
if lsof -i:3001 >/dev/null 2>&1; then
  echo -e "${GREEN}[4/6] FindADispo already running on port 3001${NC}"
else
  echo -e "${YELLOW}[4/6] Starting FindADispo Consumer UI...${NC}"

  cd "$ROOT_DIR/findadispo/frontend"

  # Install dependencies if needed
  if [ ! -d "node_modules" ]; then
    echo -e "${YELLOW} Installing findadispo dependencies...${NC}"
    npm install
  fi

  # Start in background on port 3001
  PORT=3001 npm run dev > /tmp/findadispo-frontend.log 2>&1 &
  FINDADISPO_PID=$!
  echo $FINDADISPO_PID > /tmp/findadispo-frontend.pid
  echo -e "${GREEN} FindADispo starting (PID: $FINDADISPO_PID)${NC}"

  cd "$SCRIPT_DIR"
fi

# Step 6: Findagram Consumer UI
if lsof -i:3002 >/dev/null 2>&1; then
  echo -e "${GREEN}[5/6] Findagram already running on port 3002${NC}"
else
  echo -e "${YELLOW}[5/6] Starting Findagram Consumer UI...${NC}"

  cd "$ROOT_DIR/findagram/frontend"

  # Install dependencies if needed
  if [ ! -d "node_modules" ]; then
    echo -e "${YELLOW} Installing findagram dependencies...${NC}"
    npm install
  fi

  # Start in background on port 3002
  PORT=3002 npm run dev > /tmp/findagram-frontend.log 2>&1 &
  FINDAGRAM_PID=$!
  echo $FINDAGRAM_PID > /tmp/findagram-frontend.pid
  echo -e "${GREEN} Findagram starting (PID: $FINDAGRAM_PID)${NC}"

  cd "$SCRIPT_DIR"
fi

# Step 7: Health checks for newly started services
echo ""
echo -e "${YELLOW}[6/6] Checking service health...${NC}"

# Check backend if it was just started
if ! lsof -i:3010 >/dev/null 2>&1; then
  for i in {1..15}; do
    if curl -s http://localhost:3010/health > /dev/null 2>&1; then
      break
    fi
    sleep 1
  done
fi

if curl -s http://localhost:3010/health > /dev/null 2>&1; then
  echo -e "${GREEN} Backend API: OK (port 3010)${NC}"
else
  echo -e "${YELLOW} Backend API: Starting (check: tail -f /tmp/cannaiq-backend.log)${NC}"
fi

# Check CannaiQ Admin
if curl -s http://localhost:8080 > /dev/null 2>&1; then
  echo -e "${GREEN} CannaiQ Admin: OK (port 8080)${NC}"
else
  echo -e "${YELLOW} CannaiQ Admin: Starting (check: tail -f /tmp/cannaiq-frontend.log)${NC}"
fi

# Check FindADispo
sleep 2
if curl -s http://localhost:3001 > /dev/null 2>&1; then
  echo -e "${GREEN} FindADispo: OK (port 3001)${NC}"
else
  echo -e "${YELLOW} FindADispo: Starting (check: tail -f /tmp/findadispo-frontend.log)${NC}"
fi

# Check Findagram
if curl -s http://localhost:3002 > /dev/null 2>&1; then
  echo -e "${GREEN} Findagram: OK (port 3002)${NC}"
else
  echo -e "${YELLOW} Findagram: Starting (check: tail -f /tmp/findagram-frontend.log)${NC}"
fi

# Print final status
echo ""
echo -e "${BLUE}================================${NC}"
echo -e "${GREEN} Local Environment Ready${NC}"
echo -e "${BLUE}================================${NC}"
echo ""
echo -e " ${BLUE}Services:${NC}"
echo -e " Postgres: localhost:54320"
echo -e " Backend API: http://localhost:3010"
echo ""
echo -e " ${BLUE}Frontends:${NC}"
echo -e " CannaiQ Admin: http://localhost:8080/admin"
echo -e " FindADispo: http://localhost:3001"
echo -e " Findagram: http://localhost:3002"
echo ""
echo -e "${YELLOW}To stop services:${NC} ./stop-local.sh"
echo -e "${YELLOW}View logs:${NC}"
echo " Backend: tail -f /tmp/cannaiq-backend.log"
echo " CannaiQ: tail -f /tmp/cannaiq-frontend.log"
echo " FindADispo: tail -f /tmp/findadispo-frontend.log"
echo " Findagram: tail -f /tmp/findagram-frontend.log"
echo ""
@@ -1,7 +1,7 @@
import { Request, Response, NextFunction } from 'express';
import jwt from 'jsonwebtoken';
import bcrypt from 'bcrypt';
-import { pool } from '../db/migrate';
+import { pool } from '../db/pool';

const JWT_SECRET = process.env.JWT_SECRET || 'change_this_in_production';
204
backend/src/canonical-hydration/RUNBOOK.md
Normal file
204
backend/src/canonical-hydration/RUNBOOK.md
Normal file
@@ -0,0 +1,204 @@
# Canonical Hydration Pipeline - Runbook

## Overview

The Canonical Hydration Pipeline transforms data from the `dutchie_*` source tables into the provider-agnostic canonical tables (`store_products`, `store_product_snapshots`, `crawl_runs`). This enables:

- Unified analytics across multiple data providers
- Historical price/inventory tracking
- Provider-agnostic API endpoints

## Architecture

```
Source Tables (read-only):
  dutchie_products          → StoreProductNormalizer → store_products
  dutchie_product_snapshots → SnapshotWriter         → store_product_snapshots
  dispensary_crawl_jobs     → CrawlRunRecorder       → crawl_runs

Orchestration:
  CanonicalHydrationService coordinates all transformations
```

## Table Mappings

### dutchie_products → store_products

| Source Column | Target Column | Notes |
|---------------|---------------|-------|
| dispensary_id | dispensary_id | Direct mapping |
| external_product_id | provider_product_id | Canonical key |
| platform | provider | 'dutchie' |
| name | name_raw | Raw product name |
| brand_name | brand_name_raw | Raw brand name |
| type/subcategory | category_raw | Category info |
| price_rec (JSONB) | price_rec (DECIMAL) | Extracted from JSONB |
| price_med (JSONB) | price_med (DECIMAL) | Extracted from JSONB |
| thc | thc_percent | Parsed percentage |
| cbd | cbd_percent | Parsed percentage |
| stock_status | is_in_stock | Boolean conversion |
| total_quantity_available | stock_quantity | Direct mapping |
| primary_image_url | image_url | Direct mapping |
| created_at | first_seen_at | First seen timestamp |
| updated_at | last_seen_at | Last seen timestamp |

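As a rough TypeScript sketch of this mapping (the row shapes, helper names, and JSONB/status formats here are assumptions drawn from the table above, not the pipeline's real types; the actual logic lives in `StoreProductNormalizer`):

```typescript
// Hypothetical row shape, inferred from the mapping table above.
interface DutchieProductRow {
  dispensary_id: number;
  external_product_id: string;
  name: string;
  brand_name: string | null;
  type: string | null;
  subcategory: string | null;
  price_rec: unknown;                      // JSONB in the source
  price_med: unknown;                      // JSONB in the source
  thc: string | null;                      // e.g. "24.5%"
  cbd: string | null;
  stock_status: string | null;
  total_quantity_available: number | null;
  primary_image_url: string | null;
  created_at: Date;
  updated_at: Date;
}

// Pull a numeric price out of a JSONB value; the JSONB shape is an assumption.
function extractPrice(jsonb: unknown): number | null {
  if (typeof jsonb === 'number') return jsonb;
  if (jsonb !== null && typeof jsonb === 'object' && 'price' in jsonb) {
    const p = (jsonb as { price?: unknown }).price;
    return typeof p === 'number' ? p : null;
  }
  return null;
}

// Parse "24.5%" (or "24.5") into a number.
function parsePercent(raw: string | null): number | null {
  if (!raw) return null;
  const n = parseFloat(raw.replace('%', ''));
  return Number.isFinite(n) ? n : null;
}

function normalize(row: DutchieProductRow) {
  return {
    dispensary_id: row.dispensary_id,
    provider: 'dutchie',
    provider_product_id: row.external_product_id,  // canonical key
    name_raw: row.name,
    brand_name_raw: row.brand_name,
    brand_id: null,                                // UUID→integer mismatch; see Known Limitations
    category_raw: row.subcategory ?? row.type,
    price_rec: extractPrice(row.price_rec),
    price_med: extractPrice(row.price_med),
    thc_percent: parsePercent(row.thc),
    cbd_percent: parsePercent(row.cbd),
    is_in_stock: row.stock_status === 'in_stock',  // exact status values assumed
    stock_quantity: row.total_quantity_available,
    image_url: row.primary_image_url,
    first_seen_at: row.created_at,
    last_seen_at: row.updated_at,
  };
}
```
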
### Canonical Keys

- **store_products**: `(dispensary_id, provider, provider_product_id)`
- **store_product_snapshots**: `(store_product_id, crawl_run_id)`
- **crawl_runs**: `(source_job_type, source_job_id)`

## CLI Commands

### Check Hydration Status

```bash
# Overall status
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/backfill.ts --status

# Single dispensary status
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/backfill.ts --status --dispensary-id 112
```

### Products-Only Hydration

Use when source data has products but no historical snapshots/job records.

```bash
# Dry run (see what would be done)
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/products-only.ts --dry-run

# Hydrate single dispensary
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/products-only.ts --dispensary-id 112

# Hydrate all dispensaries
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/products-only.ts
```

### Backfill Hydration

Use when source data has historical job records in `dispensary_crawl_jobs`.

```bash
# Dry run
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/backfill.ts --dry-run

# Backfill with date range
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/backfill.ts --start-date 2024-01-01 --end-date 2024-12-31

# Backfill single dispensary
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/backfill.ts --dispensary-id 112
```

### Incremental Hydration

Use for ongoing hydration of new data.

```bash
# Single run
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/incremental.ts

# Continuous loop (runs every 60 seconds)
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/incremental.ts --loop

# Continuous loop with custom interval
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/incremental.ts --loop --interval 300
```

## Migration

Apply the schema migration before first use:

```bash
# Apply migration 050
DATABASE_URL="..." psql -f src/migrations/050_canonical_hydration_schema.sql
```

This migration adds:
- `source_job_type` and `source_job_id` columns to `crawl_runs`
- Unique index on `crawl_runs (source_job_type, source_job_id)`
- Unique index on `store_product_snapshots (store_product_id, crawl_run_id)`
- Performance indexes for hydration queries

## Idempotency

All hydration operations are idempotent:

- **crawl_runs**: ON CONFLICT updates existing records
- **store_products**: ON CONFLICT updates mutable fields
- **store_product_snapshots**: ON CONFLICT DO NOTHING

Re-running hydration is safe and will not create duplicates.
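A minimal sketch of that conflict handling, assuming the canonical keys listed above (column lists are abbreviated and the exact SET lists are assumptions, not the real writer code):

```typescript
import { Pool } from 'pg';

// Upsert: the conflict target is the canonical key, so re-running only
// refreshes mutable fields on the existing row.
async function upsertStoreProduct(
  pool: Pool,
  p: { dispensaryId: number; providerProductId: string; nameRaw: string; priceRec: number | null }
): Promise<void> {
  await pool.query(
    `INSERT INTO store_products
       (dispensary_id, provider, provider_product_id, name_raw, price_rec, last_seen_at)
     VALUES ($1, 'dutchie', $2, $3, $4, NOW())
     ON CONFLICT (dispensary_id, provider, provider_product_id)
     DO UPDATE SET
       name_raw = EXCLUDED.name_raw,
       price_rec = EXCLUDED.price_rec,
       last_seen_at = NOW()`,
    [p.dispensaryId, p.providerProductId, p.nameRaw, p.priceRec]
  );
}

// Snapshot: DO NOTHING, so a (store_product_id, crawl_run_id) pair is
// written at most once no matter how often hydration re-runs.
async function insertSnapshot(pool: Pool, storeProductId: number, crawlRunId: number): Promise<void> {
  await pool.query(
    `INSERT INTO store_product_snapshots (store_product_id, crawl_run_id)
     VALUES ($1, $2)
     ON CONFLICT (store_product_id, crawl_run_id) DO NOTHING`,
    [storeProductId, crawlRunId]
  );
}
```
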
## Monitoring

### Check Canonical Data

```sql
-- Count canonical records
SELECT
  (SELECT COUNT(*) FROM crawl_runs WHERE provider = 'dutchie') as crawl_runs,
  (SELECT COUNT(*) FROM store_products WHERE provider = 'dutchie') as products,
  (SELECT COUNT(*) FROM store_product_snapshots) as snapshots;

-- Products by dispensary
SELECT dispensary_id, COUNT(*) as products
FROM store_products
WHERE provider = 'dutchie'
GROUP BY dispensary_id
ORDER BY products DESC;

-- Recent crawl runs
SELECT id, dispensary_id, started_at, products_found, snapshots_written
FROM crawl_runs
ORDER BY started_at DESC
LIMIT 10;
```

### Verify Hydration Completeness

```sql
-- Compare source vs canonical product counts
SELECT
  dp.dispensary_id,
  COUNT(DISTINCT dp.id) as source_products,
  COUNT(DISTINCT sp.id) as canonical_products
FROM dutchie_products dp
LEFT JOIN store_products sp
  ON sp.dispensary_id = dp.dispensary_id
  AND sp.provider = 'dutchie'
  AND sp.provider_product_id = dp.external_product_id
GROUP BY dp.dispensary_id
ORDER BY dp.dispensary_id;
```

## Troubleshooting

### "invalid input syntax for type integer"

This usually means a type mismatch between source and target columns. The most common case is `brand_id`: the source has UUID strings but the target expects integers. The normalizer sets `brand_id = null` to handle this.

### "could not determine data type of parameter $1"

This indicates a batch insert issue with parameter indexing. Ensure each batch has its own parameter indexing starting from $1.
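A sketch of per-batch parameter indexing (column names illustrative). The failure mode is carrying placeholder numbers or the params array over from a previous batch, leaving a `$n` in the SQL with no bound value:

```typescript
// Build one multi-row INSERT per batch; placeholders are numbered from $1
// within that batch only.
function buildBatchInsert(rows: Array<[number, number, number | null]>) {
  const params: unknown[] = [];          // fresh array for every batch
  const tuples = rows.map((row) => {
    const placeholders = row.map((value) => {
      params.push(value);
      return `$${params.length}`;        // numbering restarts per batch
    });
    return `(${placeholders.join(', ')})`;
  });
  const sql =
    `INSERT INTO store_product_snapshots (store_product_id, crawl_run_id, stock_quantity)
     VALUES ${tuples.join(', ')}
     ON CONFLICT (store_product_id, crawl_run_id) DO NOTHING`;
  return { sql, params };
}

// Usage: rebuild sql + params for every batch; never reuse them across batches.
const { sql, params } = buildBatchInsert([[1, 10, 5], [2, 10, null]]);
```
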
### Empty Snapshots

If `snapshotsWritten` is 0 but products were upserted:
1. Check if snapshots already exist for the crawl run (ON CONFLICT DO NOTHING)
2. Verify store_products exist with the correct dispensary_id and provider

## Performance

Typical performance metrics:
- ~1000 products/second for upsert
- ~2000 snapshots/second for insert
- 39 dispensaries with 37K products: ~17 seconds

For large backfills, use `--batch-size` to control memory usage.

## Known Limitations

1. **brand_id not mapped**: Source brand_id is UUID, target expects integer. Currently set to null.
2. **No historical snapshots**: If source has no `dutchie_product_snapshots`, use products-only mode which creates initial snapshots from current product state.
3. **Source jobs empty**: If `dispensary_crawl_jobs` is empty, use products-only mode.
170
backend/src/canonical-hydration/cli/backfill.ts
Normal file
170
backend/src/canonical-hydration/cli/backfill.ts
Normal file
@@ -0,0 +1,170 @@
#!/usr/bin/env npx tsx
/**
 * Backfill CLI - Historical data hydration
 *
 * Usage:
 *   npx tsx src/canonical-hydration/cli/backfill.ts [options]
 *
 * Options:
 *   --dispensary-id <id>  Hydrate only a specific dispensary
 *   --start-date <date>   Start date for backfill (ISO format)
 *   --end-date <date>     End date for backfill (ISO format)
 *   --batch-size <n>      Number of jobs to process per batch (default: 50)
 *   --dry-run             Show what would be done without making changes
 *   --status              Show hydration status and exit
 *
 * Examples:
 *   npx tsx src/canonical-hydration/cli/backfill.ts --status
 *   npx tsx src/canonical-hydration/cli/backfill.ts --dispensary-id 112
 *   npx tsx src/canonical-hydration/cli/backfill.ts --start-date 2024-01-01 --end-date 2024-12-31
 *   npx tsx src/canonical-hydration/cli/backfill.ts --dry-run
 */

import { Pool } from 'pg';
import { CanonicalHydrationService } from '../hydration-service';
import { HydrationOptions } from '../types';

async function main() {
  const args = process.argv.slice(2);

  // Parse command line arguments
  const options: HydrationOptions = {
    mode: 'backfill',
  };
  let showStatus = false;

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    switch (arg) {
      case '--dispensary-id':
        options.dispensaryId = parseInt(args[++i]);
        break;
      case '--start-date':
        options.startDate = new Date(args[++i]);
        break;
      case '--end-date':
        options.endDate = new Date(args[++i]);
        break;
      case '--batch-size':
        options.batchSize = parseInt(args[++i]);
        break;
      case '--dry-run':
        options.dryRun = true;
        break;
      case '--status':
        showStatus = true;
        break;
      case '--help':
        console.log(`
Backfill CLI - Historical data hydration

Usage:
  npx tsx src/canonical-hydration/cli/backfill.ts [options]

Options:
  --dispensary-id <id>  Hydrate only a specific dispensary
  --start-date <date>   Start date for backfill (ISO format)
  --end-date <date>     End date for backfill (ISO format)
  --batch-size <n>      Number of jobs to process per batch (default: 50)
  --dry-run             Show what would be done without making changes
  --status              Show hydration status and exit

Examples:
  npx tsx src/canonical-hydration/cli/backfill.ts --status
  npx tsx src/canonical-hydration/cli/backfill.ts --dispensary-id 112
  npx tsx src/canonical-hydration/cli/backfill.ts --start-date 2024-01-01 --end-date 2024-12-31
  npx tsx src/canonical-hydration/cli/backfill.ts --dry-run
`);
        process.exit(0);
    }
  }

  // Connect to database
  const pool = new Pool({
    connectionString: process.env.DATABASE_URL,
  });

  const service = new CanonicalHydrationService({
    pool,
    logger: (msg) => console.log(`[${new Date().toISOString()}] ${msg}`),
  });

  try {
    if (showStatus) {
      // Show status and exit
      if (options.dispensaryId) {
        const status = await service.getHydrationStatus(options.dispensaryId);
        console.log(`\nHydration Status for Dispensary ${options.dispensaryId}:`);
        console.log('═'.repeat(50));
        console.log(` Source Jobs (completed): ${status.sourceJobs}`);
        console.log(` Hydrated Jobs: ${status.hydratedJobs}`);
        console.log(` Unhydrated Jobs: ${status.unhydratedJobs}`);
        console.log('');
        console.log(` Source Products: ${status.sourceProducts}`);
        console.log(` Store Products: ${status.storeProducts}`);
        console.log('');
        console.log(` Source Snapshots: ${status.sourceSnapshots}`);
        console.log(` Store Snapshots: ${status.storeSnapshots}`);
      } else {
        const status = await service.getOverallStatus();
        console.log('\nOverall Hydration Status:');
        console.log('═'.repeat(50));
        console.log(` Dispensaries with Data: ${status.dispensariesWithData}`);
        console.log('');
        console.log(` Source Jobs (completed): ${status.totalSourceJobs}`);
        console.log(` Hydrated Jobs: ${status.totalHydratedJobs}`);
        console.log(` Unhydrated Jobs: ${status.totalSourceJobs - status.totalHydratedJobs}`);
        console.log('');
        console.log(` Source Products: ${status.totalSourceProducts}`);
        console.log(` Store Products: ${status.totalStoreProducts}`);
        console.log('');
        console.log(` Source Snapshots: ${status.totalSourceSnapshots}`);
        console.log(` Store Snapshots: ${status.totalStoreSnapshots}`);
      }
      process.exit(0);
    }

    // Run backfill
    console.log('\n' + '═'.repeat(60));
    console.log(' CANONICAL HYDRATION - BACKFILL MODE');
    console.log('═'.repeat(60));
    console.log(` Dispensary ID: ${options.dispensaryId || 'ALL'}`);
    console.log(` Start Date: ${options.startDate?.toISOString() || 'N/A'}`);
    console.log(` End Date: ${options.endDate?.toISOString() || 'N/A'}`);
    console.log(` Batch Size: ${options.batchSize || 50}`);
    console.log(` Dry Run: ${options.dryRun ? 'YES' : 'NO'}`);
    console.log('═'.repeat(60) + '\n');

    const result = await service.hydrate(options);

    console.log('\n' + '═'.repeat(60));
    console.log(' HYDRATION COMPLETE');
    console.log('═'.repeat(60));
    console.log(` Crawl Runs Created: ${result.crawlRunsCreated}`);
    console.log(` Crawl Runs Skipped: ${result.crawlRunsSkipped}`);
    console.log(` Products Upserted: ${result.productsUpserted}`);
    console.log(` Snapshots Written: ${result.snapshotsWritten}`);
    console.log(` Duration: ${result.durationMs}ms`);
    console.log(` Errors: ${result.errors.length}`);

    if (result.errors.length > 0) {
      console.log('\nErrors:');
      for (const error of result.errors.slice(0, 10)) {
        console.log(`  - ${error}`);
      }
      if (result.errors.length > 10) {
        console.log(`  ... and ${result.errors.length - 10} more`);
      }
    }
    console.log('═'.repeat(60) + '\n');

    process.exit(result.errors.length > 0 ? 1 : 0);
  } catch (error: any) {
    console.error('Fatal error:', error.message);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();
142
backend/src/canonical-hydration/cli/incremental.ts
Normal file
142
backend/src/canonical-hydration/cli/incremental.ts
Normal file
@@ -0,0 +1,142 @@
#!/usr/bin/env npx tsx
/**
 * Incremental CLI - Ongoing data hydration
 *
 * Usage:
 *   npx tsx src/canonical-hydration/cli/incremental.ts [options]
 *
 * Options:
 *   --dispensary-id <id>  Hydrate only a specific dispensary
 *   --batch-size <n>      Number of jobs to process per batch (default: 100)
 *   --loop                Run continuously in a loop
 *   --interval <seconds>  Interval between loops (default: 60)
 *   --dry-run             Show what would be done without making changes
 *
 * Examples:
 *   npx tsx src/canonical-hydration/cli/incremental.ts
 *   npx tsx src/canonical-hydration/cli/incremental.ts --dispensary-id 112
 *   npx tsx src/canonical-hydration/cli/incremental.ts --loop --interval 300
 *   npx tsx src/canonical-hydration/cli/incremental.ts --dry-run
 */

import { Pool } from 'pg';
import { CanonicalHydrationService } from '../hydration-service';
import { HydrationOptions } from '../types';

async function main() {
  const args = process.argv.slice(2);

  // Parse command line arguments
  const options: HydrationOptions = {
    mode: 'incremental',
  };
  let loop = false;
  let intervalSeconds = 60;

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    switch (arg) {
      case '--dispensary-id':
        options.dispensaryId = parseInt(args[++i]);
        break;
      case '--batch-size':
        options.batchSize = parseInt(args[++i]);
        break;
      case '--loop':
        loop = true;
        break;
      case '--interval':
        intervalSeconds = parseInt(args[++i]);
        break;
      case '--dry-run':
        options.dryRun = true;
        break;
      case '--help':
        console.log(`
Incremental CLI - Ongoing data hydration

Usage:
  npx tsx src/canonical-hydration/cli/incremental.ts [options]

Options:
  --dispensary-id <id>  Hydrate only a specific dispensary
  --batch-size <n>      Number of jobs to process per batch (default: 100)
  --loop                Run continuously in a loop
  --interval <seconds>  Interval between loops (default: 60)
  --dry-run             Show what would be done without making changes

Examples:
  npx tsx src/canonical-hydration/cli/incremental.ts
  npx tsx src/canonical-hydration/cli/incremental.ts --dispensary-id 112
  npx tsx src/canonical-hydration/cli/incremental.ts --loop --interval 300
  npx tsx src/canonical-hydration/cli/incremental.ts --dry-run
`);
        process.exit(0);
    }
  }

  // Connect to database
  const pool = new Pool({
    connectionString: process.env.DATABASE_URL,
  });

  const service = new CanonicalHydrationService({
    pool,
    logger: (msg) => console.log(`[${new Date().toISOString()}] ${msg}`),
  });

  const log = (msg: string) => console.log(`[${new Date().toISOString()}] ${msg}`);

  // Graceful shutdown
  let running = true;
  process.on('SIGINT', () => {
    log('Received SIGINT, shutting down...');
    running = false;
  });
  process.on('SIGTERM', () => {
    log('Received SIGTERM, shutting down...');
    running = false;
  });

  try {
    console.log('\n' + '═'.repeat(60));
    console.log(' CANONICAL HYDRATION - INCREMENTAL MODE');
    console.log('═'.repeat(60));
    console.log(` Dispensary ID: ${options.dispensaryId || 'ALL'}`);
    console.log(` Batch Size: ${options.batchSize || 100}`);
    console.log(` Loop Mode: ${loop ? 'YES' : 'NO'}`);
    if (loop) {
      console.log(` Interval: ${intervalSeconds}s`);
    }
    console.log(` Dry Run: ${options.dryRun ? 'YES' : 'NO'}`);
    console.log('═'.repeat(60) + '\n');

    do {
      const result = await service.hydrate(options);

      log(`Hydration complete: ${result.crawlRunsCreated} runs, ${result.productsUpserted} products, ${result.snapshotsWritten} snapshots (${result.durationMs}ms)`);

      if (result.errors.length > 0) {
        log(`Errors: ${result.errors.length}`);
        for (const error of result.errors.slice(0, 5)) {
          log(`  - ${error}`);
        }
      }

      if (loop && running) {
        log(`Sleeping for ${intervalSeconds}s...`);
        await new Promise(resolve => setTimeout(resolve, intervalSeconds * 1000));
      }
    } while (loop && running);

    log('Incremental hydration completed');
    process.exit(0);
  } catch (error: any) {
    console.error('Fatal error:', error.message);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();
113
backend/src/canonical-hydration/cli/products-only.ts
Normal file
113
backend/src/canonical-hydration/cli/products-only.ts
Normal file
@@ -0,0 +1,113 @@
#!/usr/bin/env npx tsx
/**
 * Products-Only Hydration CLI
 *
 * Used when there are no historical job records - creates synthetic crawl runs
 * from current product data.
 *
 * Usage:
 *   npx tsx src/canonical-hydration/cli/products-only.ts [options]
 *
 * Options:
 *   --dispensary-id <id>  Hydrate only a specific dispensary
 *   --dry-run             Show what would be done without making changes
 *
 * Examples:
 *   npx tsx src/canonical-hydration/cli/products-only.ts
 *   npx tsx src/canonical-hydration/cli/products-only.ts --dispensary-id 112
 *   npx tsx src/canonical-hydration/cli/products-only.ts --dry-run
 */

import { Pool } from 'pg';
import { CanonicalHydrationService } from '../hydration-service';

async function main() {
  const args = process.argv.slice(2);

  // Parse command line arguments
  let dispensaryId: number | undefined;
  let dryRun = false;

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    switch (arg) {
      case '--dispensary-id':
        dispensaryId = parseInt(args[++i]);
        break;
      case '--dry-run':
        dryRun = true;
        break;
      case '--help':
        console.log(`
Products-Only Hydration CLI

Used when there are no historical job records - creates synthetic crawl runs
from current product data.

Usage:
  npx tsx src/canonical-hydration/cli/products-only.ts [options]

Options:
  --dispensary-id <id>  Hydrate only a specific dispensary
  --dry-run             Show what would be done without making changes

Examples:
  npx tsx src/canonical-hydration/cli/products-only.ts
  npx tsx src/canonical-hydration/cli/products-only.ts --dispensary-id 112
  npx tsx src/canonical-hydration/cli/products-only.ts --dry-run
`);
        process.exit(0);
    }
  }

  // Connect to database
  const pool = new Pool({
    connectionString: process.env.DATABASE_URL,
  });

  const service = new CanonicalHydrationService({
    pool,
    logger: (msg) => console.log(`[${new Date().toISOString()}] ${msg}`),
  });

  try {
    console.log('\n' + '═'.repeat(60));
    console.log(' CANONICAL HYDRATION - PRODUCTS-ONLY MODE');
    console.log('═'.repeat(60));
    console.log(` Dispensary ID: ${dispensaryId || 'ALL'}`);
    console.log(` Dry Run: ${dryRun ? 'YES' : 'NO'}`);
    console.log('═'.repeat(60) + '\n');

    const result = await service.hydrateProductsOnly({ dispensaryId, dryRun });

    console.log('\n' + '═'.repeat(60));
    console.log(' HYDRATION COMPLETE');
    console.log('═'.repeat(60));
    console.log(` Crawl Runs Created: ${result.crawlRunsCreated}`);
    console.log(` Crawl Runs Skipped: ${result.crawlRunsSkipped}`);
    console.log(` Products Upserted: ${result.productsUpserted}`);
    console.log(` Snapshots Written: ${result.snapshotsWritten}`);
    console.log(` Duration: ${result.durationMs}ms`);
    console.log(` Errors: ${result.errors.length}`);

    if (result.errors.length > 0) {
      console.log('\nErrors:');
      for (const error of result.errors.slice(0, 10)) {
        console.log(`  - ${error}`);
      }
      if (result.errors.length > 10) {
        console.log(`  ... and ${result.errors.length - 10} more`);
      }
    }
    console.log('═'.repeat(60) + '\n');

    process.exit(result.errors.length > 0 ? 1 : 0);
  } catch (error: any) {
    console.error('Fatal error:', error.message);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();
226
backend/src/canonical-hydration/crawl-run-recorder.ts
Normal file
@@ -0,0 +1,226 @@
/**
 * CrawlRunRecorder
 * Records crawl runs from source job tables (dispensary_crawl_jobs) to the canonical crawl_runs table
 */

import { Pool } from 'pg';
import { SourceJob, CrawlRun, ServiceContext, SourceJobType } from './types';

export class CrawlRunRecorder {
  private pool: Pool;
  private log: (message: string) => void;

  constructor(ctx: ServiceContext) {
    this.pool = ctx.pool;
    this.log = ctx.logger || console.log;
  }

  /**
   * Record a single crawl run from a source job.
   * Uses ON CONFLICT to ensure idempotency.
   */
  async recordCrawlRun(
    sourceJob: SourceJob,
    sourceJobType: SourceJobType = 'dispensary_crawl_jobs'
  ): Promise<number | null> {
    // Skip jobs that aren't completed successfully
    if (sourceJob.status !== 'completed') {
      return null;
    }

    const crawlRun: Partial<CrawlRun> = {
      dispensary_id: sourceJob.dispensary_id,
      provider: 'dutchie', // Source is always dutchie for now
      started_at: sourceJob.started_at || new Date(),
      finished_at: sourceJob.completed_at,
      duration_ms: sourceJob.duration_ms,
      status: this.mapStatus(sourceJob.status),
      error_message: sourceJob.error_message,
      products_found: sourceJob.products_found,
      products_new: sourceJob.products_new,
      products_updated: sourceJob.products_updated,
      snapshots_written: null, // Will be updated after snapshot insertion
      worker_id: null,
      trigger_type: sourceJob.job_type === 'dutchie_product_crawl' ? 'scheduled' : 'manual',
      metadata: { sourceJobType, originalJobType: sourceJob.job_type },
      source_job_type: sourceJobType,
      source_job_id: sourceJob.id,
    };

    const result = await this.pool.query(
      `INSERT INTO crawl_runs (
        dispensary_id, provider, started_at, finished_at, duration_ms,
        status, error_message, products_found, products_new, products_updated,
        snapshots_written, worker_id, trigger_type, metadata,
        source_job_type, source_job_id
      ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16)
      ON CONFLICT (source_job_type, source_job_id) WHERE source_job_id IS NOT NULL
      DO UPDATE SET
        finished_at = EXCLUDED.finished_at,
        duration_ms = EXCLUDED.duration_ms,
        status = EXCLUDED.status,
        error_message = EXCLUDED.error_message,
        products_found = EXCLUDED.products_found,
        products_new = EXCLUDED.products_new,
        products_updated = EXCLUDED.products_updated
      RETURNING id`,
      [
        crawlRun.dispensary_id,
        crawlRun.provider,
        crawlRun.started_at,
        crawlRun.finished_at,
        crawlRun.duration_ms,
        crawlRun.status,
        crawlRun.error_message,
        crawlRun.products_found,
        crawlRun.products_new,
        crawlRun.products_updated,
        crawlRun.snapshots_written,
        crawlRun.worker_id,
        crawlRun.trigger_type,
        JSON.stringify(crawlRun.metadata),
        crawlRun.source_job_type,
        crawlRun.source_job_id,
      ]
    );

    return result.rows[0]?.id || null;
  }

  /**
   * Record multiple crawl runs in a batch
   */
  async recordCrawlRunsBatch(
    sourceJobs: SourceJob[],
    sourceJobType: SourceJobType = 'dispensary_crawl_jobs'
  ): Promise<{ created: number; skipped: number; crawlRunIds: Map<number, number> }> {
    let created = 0;
    let skipped = 0;
    const crawlRunIds = new Map<number, number>(); // sourceJobId -> crawlRunId

    for (const job of sourceJobs) {
      const crawlRunId = await this.recordCrawlRun(job, sourceJobType);
      if (crawlRunId) {
        created++;
        crawlRunIds.set(job.id, crawlRunId);
      } else {
        skipped++;
      }
    }

    return { created, skipped, crawlRunIds };
  }

  /**
   * Update snapshots_written count for a crawl run
   */
  async updateSnapshotsWritten(crawlRunId: number, snapshotsWritten: number): Promise<void> {
    await this.pool.query(
      'UPDATE crawl_runs SET snapshots_written = $1 WHERE id = $2',
      [snapshotsWritten, crawlRunId]
    );
  }

  /**
   * Get crawl run ID by source job
   */
  async getCrawlRunIdBySourceJob(
    sourceJobType: SourceJobType,
    sourceJobId: number
  ): Promise<number | null> {
    const result = await this.pool.query(
      'SELECT id FROM crawl_runs WHERE source_job_type = $1 AND source_job_id = $2',
      [sourceJobType, sourceJobId]
    );
    return result.rows[0]?.id || null;
  }

  /**
   * Get unhydrated source jobs (jobs not yet recorded in crawl_runs)
   */
  async getUnhydratedJobs(
    dispensaryId?: number,
    startDate?: Date,
    limit: number = 100
  ): Promise<SourceJob[]> {
    let query = `
      SELECT j.*
      FROM dispensary_crawl_jobs j
      LEFT JOIN crawl_runs cr ON cr.source_job_type = 'dispensary_crawl_jobs' AND cr.source_job_id = j.id
      WHERE cr.id IS NULL
        AND j.status = 'completed'
        AND j.job_type = 'dutchie_product_crawl'
    `;
    const params: any[] = [];
    let paramIndex = 1;

    if (dispensaryId) {
      query += ` AND j.dispensary_id = $${paramIndex++}`;
      params.push(dispensaryId);
    }

    if (startDate) {
      query += ` AND j.completed_at >= $${paramIndex++}`;
      params.push(startDate);
    }

    query += ` ORDER BY j.completed_at ASC LIMIT $${paramIndex}`;
    params.push(limit);

    const result = await this.pool.query(query, params);
    return result.rows;
  }

  /**
   * Get all source jobs for backfill (within date range)
   */
  async getSourceJobsForBackfill(
    startDate?: Date,
    endDate?: Date,
    dispensaryId?: number,
    limit: number = 1000
  ): Promise<SourceJob[]> {
    let query = `
      SELECT *
      FROM dispensary_crawl_jobs
      WHERE status = 'completed'
        AND job_type = 'dutchie_product_crawl'
    `;
    const params: any[] = [];
    let paramIndex = 1;

    if (startDate) {
      query += ` AND completed_at >= $${paramIndex++}`;
      params.push(startDate);
    }

    if (endDate) {
      query += ` AND completed_at <= $${paramIndex++}`;
      params.push(endDate);
    }

    if (dispensaryId) {
      query += ` AND dispensary_id = $${paramIndex++}`;
      params.push(dispensaryId);
    }

    query += ` ORDER BY completed_at ASC LIMIT $${paramIndex}`;
    params.push(limit);

    const result = await this.pool.query(query, params);
    return result.rows;
  }

  private mapStatus(sourceStatus: string): string {
    switch (sourceStatus) {
      case 'completed':
        return 'success';
      case 'failed':
        return 'failed';
      case 'running':
        return 'running';
      default:
        return sourceStatus;
    }
  }
}
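A minimal usage sketch of CrawlRunRecorder on its own, for orientation: it assumes only what the file above defines plus a DATABASE_URL environment variable, and the dispensary id 112 is a placeholder borrowed from the CLI examples.

// Sketch (not part of the commit): record up to 50 pending crawl runs for one dispensary.
import { Pool } from 'pg';
import { CrawlRunRecorder } from './crawl-run-recorder';

async function recordPending() {
  const pool = new Pool({ connectionString: process.env.DATABASE_URL });
  const recorder = new CrawlRunRecorder({ pool });

  // Completed dutchie_product_crawl jobs not yet mirrored into crawl_runs
  const jobs = await recorder.getUnhydratedJobs(112, undefined, 50);
  const { created, skipped, crawlRunIds } = await recorder.recordCrawlRunsBatch(jobs);
  console.log(`created=${created} skipped=${skipped} mapped=${crawlRunIds.size}`);

  await pool.end();
}

recordPending();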
560
backend/src/canonical-hydration/hydration-service.ts
Normal file
@@ -0,0 +1,560 @@
/**
 * CanonicalHydrationService
 * Orchestrates the full hydration pipeline from dutchie_* to canonical tables
 */

import { Pool } from 'pg';
import { CrawlRunRecorder } from './crawl-run-recorder';
import { StoreProductNormalizer } from './store-product-normalizer';
import { SnapshotWriter } from './snapshot-writer';
import { HydrationOptions, HydrationResult, ServiceContext, SourceJob } from './types';

export class CanonicalHydrationService {
  private pool: Pool;
  private log: (message: string) => void;
  private crawlRunRecorder: CrawlRunRecorder;
  private productNormalizer: StoreProductNormalizer;
  private snapshotWriter: SnapshotWriter;

  constructor(ctx: ServiceContext) {
    this.pool = ctx.pool;
    this.log = ctx.logger || console.log;
    this.crawlRunRecorder = new CrawlRunRecorder(ctx);
    this.productNormalizer = new StoreProductNormalizer(ctx);
    this.snapshotWriter = new SnapshotWriter(ctx);
  }

  /**
   * Run the full hydration pipeline.
   * Supports both backfill (historical) and incremental (ongoing) modes.
   */
  async hydrate(options: HydrationOptions): Promise<HydrationResult> {
    const startTime = Date.now();
    const result: HydrationResult = {
      crawlRunsCreated: 0,
      crawlRunsSkipped: 0,
      productsUpserted: 0,
      snapshotsWritten: 0,
      errors: [],
      durationMs: 0,
    };

    this.log(`Starting hydration in ${options.mode} mode`);

    try {
      if (options.mode === 'backfill') {
        await this.runBackfill(options, result);
      } else {
        await this.runIncremental(options, result);
      }
    } catch (err: any) {
      result.errors.push(`Fatal error: ${err.message}`);
      this.log(`Hydration failed: ${err.message}`);
    }

    result.durationMs = Date.now() - startTime;
    this.log(`Hydration completed in ${result.durationMs}ms: ${JSON.stringify({
      crawlRunsCreated: result.crawlRunsCreated,
      crawlRunsSkipped: result.crawlRunsSkipped,
      productsUpserted: result.productsUpserted,
      snapshotsWritten: result.snapshotsWritten,
      errors: result.errors.length,
    })}`);

    return result;
  }

  /**
   * Backfill mode: Process historical data from source tables
   */
  private async runBackfill(options: HydrationOptions, result: HydrationResult): Promise<void> {
    // Get source jobs to process
    const sourceJobs = await this.crawlRunRecorder.getSourceJobsForBackfill(
      options.startDate,
      options.endDate,
      options.dispensaryId,
      1000 // Max jobs to process
    );

    this.log(`Found ${sourceJobs.length} source jobs to backfill`);

    // Group jobs by dispensary for efficient processing
    const jobsByDispensary = this.groupJobsByDispensary(sourceJobs);

    for (const [dispensaryId, jobs] of jobsByDispensary) {
      this.log(`Processing dispensary ${dispensaryId} (${jobs.length} jobs)`);

      try {
        // Step 1: Upsert products for this dispensary
        if (!options.dryRun) {
          const productResult = await this.productNormalizer.upsertProductsForDispensary(dispensaryId);
          result.productsUpserted += productResult.upserted;
          if (productResult.errors.length > 0) {
            result.errors.push(...productResult.errors.map(e => `Dispensary ${dispensaryId}: ${e}`));
          }
        }

        // Get store_product_id map for snapshot writing
        const storeProductIdMap = await this.productNormalizer.getStoreProductIdMap(dispensaryId);

        // Step 2: Record crawl runs and write snapshots for each job
        for (const job of jobs) {
          try {
            await this.processJob(job, storeProductIdMap, result, options.dryRun);
          } catch (err: any) {
            result.errors.push(`Job ${job.id}: ${err.message}`);
          }
        }
      } catch (err: any) {
        result.errors.push(`Dispensary ${dispensaryId}: ${err.message}`);
      }
    }
  }

  /**
   * Incremental mode: Process only unhydrated jobs
   */
  private async runIncremental(options: HydrationOptions, result: HydrationResult): Promise<void> {
    const limit = options.batchSize || 100;

    // Get unhydrated jobs
    const unhydratedJobs = await this.crawlRunRecorder.getUnhydratedJobs(
      options.dispensaryId,
      options.startDate,
      limit
    );

    this.log(`Found ${unhydratedJobs.length} unhydrated jobs`);

    // Group by dispensary
    const jobsByDispensary = this.groupJobsByDispensary(unhydratedJobs);

    for (const [dispensaryId, jobs] of jobsByDispensary) {
      this.log(`Processing dispensary ${dispensaryId} (${jobs.length} jobs)`);

      try {
        // Step 1: Upsert products
        if (!options.dryRun) {
          const productResult = await this.productNormalizer.upsertProductsForDispensary(dispensaryId);
          result.productsUpserted += productResult.upserted;
          if (productResult.errors.length > 0) {
            result.errors.push(...productResult.errors.map(e => `Dispensary ${dispensaryId}: ${e}`));
          }
        }

        // Get store_product_id map
        const storeProductIdMap = await this.productNormalizer.getStoreProductIdMap(dispensaryId);

        // Step 2: Process each job
        for (const job of jobs) {
          try {
            await this.processJob(job, storeProductIdMap, result, options.dryRun);
          } catch (err: any) {
            result.errors.push(`Job ${job.id}: ${err.message}`);
          }
        }
      } catch (err: any) {
        result.errors.push(`Dispensary ${dispensaryId}: ${err.message}`);
      }
    }
  }

  /**
   * Process a single job: record crawl run and write snapshots
   */
  private async processJob(
    job: SourceJob,
    storeProductIdMap: Map<string, number>,
    result: HydrationResult,
    dryRun?: boolean
  ): Promise<void> {
    // Step 1: Record the crawl run
    let crawlRunId: number | null = null;

    if (!dryRun) {
      crawlRunId = await this.crawlRunRecorder.recordCrawlRun(job);
      if (crawlRunId) {
        result.crawlRunsCreated++;
      } else {
        result.crawlRunsSkipped++;
        return; // Skip snapshot writing if crawl run wasn't created
      }
    } else {
      // In dry run, check if it would be created
      const existingId = await this.crawlRunRecorder.getCrawlRunIdBySourceJob(
        'dispensary_crawl_jobs',
        job.id
      );
      if (existingId) {
        result.crawlRunsSkipped++;
        return;
      }
      result.crawlRunsCreated++;
      return; // Skip snapshot writing in dry run
    }

    // Step 2: Write snapshots for this crawl run
    if (crawlRunId && job.completed_at) {
      const snapshotResult = await this.snapshotWriter.writeSnapshotsForCrawlRun(
        crawlRunId,
        job.dispensary_id,
        storeProductIdMap,
        job.completed_at
      );

      result.snapshotsWritten += snapshotResult.written;
      if (snapshotResult.errors.length > 0) {
        result.errors.push(...snapshotResult.errors);
      }

      // Update crawl_run with snapshots_written count
      await this.crawlRunRecorder.updateSnapshotsWritten(crawlRunId, snapshotResult.written);
    }
  }

  /**
   * Hydrate a single dispensary (convenience method)
   */
  async hydrateDispensary(
    dispensaryId: number,
    mode: 'backfill' | 'incremental' = 'incremental'
  ): Promise<HydrationResult> {
    return this.hydrate({
      mode,
      dispensaryId,
    });
  }

  /**
   * Get hydration status for a dispensary
   */
  async getHydrationStatus(dispensaryId: number): Promise<{
    sourceJobs: number;
    hydratedJobs: number;
    unhydratedJobs: number;
    sourceProducts: number;
    storeProducts: number;
    sourceSnapshots: number;
    storeSnapshots: number;
  }> {
    const [sourceJobs, hydratedJobs, sourceProducts, storeProducts, sourceSnapshots, storeSnapshots] =
      await Promise.all([
        this.pool.query(
          `SELECT COUNT(*) FROM dispensary_crawl_jobs
           WHERE dispensary_id = $1 AND status = 'completed' AND job_type = 'dutchie_product_crawl'`,
          [dispensaryId]
        ),
        this.pool.query(
          `SELECT COUNT(*) FROM crawl_runs
           WHERE dispensary_id = $1 AND source_job_type = 'dispensary_crawl_jobs'`,
          [dispensaryId]
        ),
        this.pool.query(
          `SELECT COUNT(*) FROM dutchie_products WHERE dispensary_id = $1`,
          [dispensaryId]
        ),
        this.pool.query(
          `SELECT COUNT(*) FROM store_products WHERE dispensary_id = $1 AND provider = 'dutchie'`,
          [dispensaryId]
        ),
        this.pool.query(
          `SELECT COUNT(*) FROM dutchie_product_snapshots WHERE dispensary_id = $1`,
          [dispensaryId]
        ),
        this.pool.query(
          `SELECT COUNT(*) FROM store_product_snapshots WHERE dispensary_id = $1`,
          [dispensaryId]
        ),
      ]);

    const sourceJobCount = parseInt(sourceJobs.rows[0].count);
    const hydratedJobCount = parseInt(hydratedJobs.rows[0].count);

    return {
      sourceJobs: sourceJobCount,
      hydratedJobs: hydratedJobCount,
      unhydratedJobs: sourceJobCount - hydratedJobCount,
      sourceProducts: parseInt(sourceProducts.rows[0].count),
      storeProducts: parseInt(storeProducts.rows[0].count),
      sourceSnapshots: parseInt(sourceSnapshots.rows[0].count),
      storeSnapshots: parseInt(storeSnapshots.rows[0].count),
    };
  }

  /**
   * Get overall hydration status
   */
  async getOverallStatus(): Promise<{
    totalSourceJobs: number;
    totalHydratedJobs: number;
    totalSourceProducts: number;
    totalStoreProducts: number;
    totalSourceSnapshots: number;
    totalStoreSnapshots: number;
    dispensariesWithData: number;
  }> {
    const [sourceJobs, hydratedJobs, sourceProducts, storeProducts, sourceSnapshots, storeSnapshots, dispensaries] =
      await Promise.all([
        this.pool.query(
          `SELECT COUNT(*) FROM dispensary_crawl_jobs
           WHERE status = 'completed' AND job_type = 'dutchie_product_crawl'`
        ),
        this.pool.query(
          `SELECT COUNT(*) FROM crawl_runs WHERE source_job_type = 'dispensary_crawl_jobs'`
        ),
        this.pool.query(`SELECT COUNT(*) FROM dutchie_products`),
        this.pool.query(`SELECT COUNT(*) FROM store_products WHERE provider = 'dutchie'`),
        this.pool.query(`SELECT COUNT(*) FROM dutchie_product_snapshots`),
        this.pool.query(`SELECT COUNT(*) FROM store_product_snapshots`),
        this.pool.query(
          `SELECT COUNT(DISTINCT dispensary_id) FROM dutchie_products`
        ),
      ]);

    return {
      totalSourceJobs: parseInt(sourceJobs.rows[0].count),
      totalHydratedJobs: parseInt(hydratedJobs.rows[0].count),
      totalSourceProducts: parseInt(sourceProducts.rows[0].count),
      totalStoreProducts: parseInt(storeProducts.rows[0].count),
      totalSourceSnapshots: parseInt(sourceSnapshots.rows[0].count),
      totalStoreSnapshots: parseInt(storeSnapshots.rows[0].count),
      dispensariesWithData: parseInt(dispensaries.rows[0].count),
    };
  }

  /**
   * Group jobs by dispensary ID
   */
  private groupJobsByDispensary(jobs: SourceJob[]): Map<number, SourceJob[]> {
    const map = new Map<number, SourceJob[]>();
    for (const job of jobs) {
      const list = map.get(job.dispensary_id) || [];
      list.push(job);
      map.set(job.dispensary_id, list);
    }
    return map;
  }

  /**
   * Products-only hydration mode.
   * Used when there are no historical job records - creates synthetic crawl runs
   * from current product data.
   */
  async hydrateProductsOnly(options: {
    dispensaryId?: number;
    dryRun?: boolean;
  } = {}): Promise<HydrationResult> {
    const startTime = Date.now();
    const result: HydrationResult = {
      crawlRunsCreated: 0,
      crawlRunsSkipped: 0,
      productsUpserted: 0,
      snapshotsWritten: 0,
      errors: [],
      durationMs: 0,
    };

    this.log('Starting products-only hydration mode');

    try {
      // Get all dispensaries with products
      let dispensaryIds: number[];
      if (options.dispensaryId) {
        dispensaryIds = [options.dispensaryId];
      } else {
        const dispResult = await this.pool.query(
          'SELECT DISTINCT dispensary_id FROM dutchie_products ORDER BY dispensary_id'
        );
        dispensaryIds = dispResult.rows.map(r => r.dispensary_id);
      }

      this.log(`Processing ${dispensaryIds.length} dispensaries`);

      for (const dispensaryId of dispensaryIds) {
        try {
          await this.hydrateDispensaryProductsOnly(dispensaryId, result, options.dryRun);
        } catch (err: any) {
          result.errors.push(`Dispensary ${dispensaryId}: ${err.message}`);
        }
      }
    } catch (err: any) {
      result.errors.push(`Fatal error: ${err.message}`);
    }

    result.durationMs = Date.now() - startTime;
    this.log(`Products-only hydration completed in ${result.durationMs}ms: ${JSON.stringify({
      crawlRunsCreated: result.crawlRunsCreated,
      productsUpserted: result.productsUpserted,
      snapshotsWritten: result.snapshotsWritten,
      errors: result.errors.length,
    })}`);

    return result;
  }

  /**
   * Hydrate a single dispensary in products-only mode
   */
  private async hydrateDispensaryProductsOnly(
    dispensaryId: number,
    result: HydrationResult,
    dryRun?: boolean
  ): Promise<void> {
    // Get product count and timestamps for this dispensary
    const statsResult = await this.pool.query(
      `SELECT COUNT(*) as cnt, MIN(created_at) as min_date, MAX(updated_at) as max_date
       FROM dutchie_products WHERE dispensary_id = $1`,
      [dispensaryId]
    );
    const stats = statsResult.rows[0];
    const productCount = parseInt(stats.cnt);

    if (productCount === 0) {
      this.log(`Dispensary ${dispensaryId}: No products, skipping`);
      return;
    }

    this.log(`Dispensary ${dispensaryId}: ${productCount} products`);

    // Step 1: Create synthetic crawl run
    let crawlRunId: number | null = null;
    const now = new Date();

    if (!dryRun) {
      // Check if we already have a synthetic run for this dispensary
      const existingRun = await this.pool.query(
        `SELECT id FROM crawl_runs
         WHERE dispensary_id = $1
           AND source_job_type = 'products_only_hydration'
         LIMIT 1`,
        [dispensaryId]
      );

      if (existingRun.rows.length > 0) {
        crawlRunId = existingRun.rows[0].id;
        this.log(`Dispensary ${dispensaryId}: Using existing synthetic crawl run ${crawlRunId}`);
        result.crawlRunsSkipped++;
      } else {
        // Create new synthetic crawl run
        const insertResult = await this.pool.query(
          `INSERT INTO crawl_runs (
            dispensary_id, provider, started_at, finished_at, duration_ms,
            status, products_found, trigger_type, metadata,
            source_job_type, source_job_id
          ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)
          RETURNING id`,
          [
            dispensaryId,
            'dutchie',
            stats.min_date || now,
            stats.max_date || now,
            0,
            'success',
            productCount,
            'hydration',
            JSON.stringify({ mode: 'products_only', hydratedAt: now.toISOString() }),
            'products_only_hydration',
            dispensaryId, // Use dispensary_id as synthetic job_id
          ]
        );
        crawlRunId = insertResult.rows[0].id;
        result.crawlRunsCreated++;
        this.log(`Dispensary ${dispensaryId}: Created synthetic crawl run ${crawlRunId}`);
      }

      // Step 2: Upsert products
      const productResult = await this.productNormalizer.upsertProductsForDispensary(dispensaryId);
      result.productsUpserted += productResult.upserted;
      if (productResult.errors.length > 0) {
        result.errors.push(...productResult.errors.map(e => `Dispensary ${dispensaryId}: ${e}`));
      }

      // Step 3: Create initial snapshots from current product state
      // (crawlRunId is non-null here: it was set in both branches above)
      const snapshotsWritten = await this.createInitialSnapshots(dispensaryId, crawlRunId!);
      result.snapshotsWritten += snapshotsWritten;

      // Update crawl run with snapshot count
      await this.pool.query(
        'UPDATE crawl_runs SET snapshots_written = $1 WHERE id = $2',
        [snapshotsWritten, crawlRunId]
      );
    } else {
      // Dry run - just count what would be done
      result.crawlRunsCreated++;
      result.productsUpserted += productCount;
      result.snapshotsWritten += productCount;
    }
  }

  /**
   * Create initial snapshots from current product state
   */
  private async createInitialSnapshots(
    dispensaryId: number,
    crawlRunId: number
  ): Promise<number> {
    // Get all store products for this dispensary
    const products = await this.pool.query(
      `SELECT sp.id, sp.price_rec, sp.price_med, sp.is_on_special, sp.is_in_stock,
              sp.stock_quantity, sp.thc_percent, sp.cbd_percent
       FROM store_products sp
       WHERE sp.dispensary_id = $1 AND sp.provider = 'dutchie'`,
      [dispensaryId]
    );

    if (products.rows.length === 0) return 0;

    const now = new Date();
    const batchSize = 100;
    let totalInserted = 0;

    // Process in batches
    for (let i = 0; i < products.rows.length; i += batchSize) {
      const batch = products.rows.slice(i, i + batchSize);
      const values: any[] = [];
      const placeholders: string[] = [];
      let paramIndex = 1;

      for (const product of batch) {
        values.push(
          dispensaryId,
          product.id,
          crawlRunId,
          now,
          product.price_rec,
          product.price_med,
          product.is_on_special || false,
          product.is_in_stock || false,
          product.stock_quantity,
          product.thc_percent,
          product.cbd_percent,
          JSON.stringify({ source: 'initial_hydration' })
        );

        const rowPlaceholders = [];
        for (let j = 0; j < 12; j++) {
          rowPlaceholders.push(`$${paramIndex++}`);
        }
        placeholders.push(`(${rowPlaceholders.join(', ')}, NOW())`);
      }

      const query = `
        INSERT INTO store_product_snapshots (
          dispensary_id, store_product_id, crawl_run_id, captured_at,
          price_rec, price_med, is_on_special, is_in_stock, stock_quantity,
          thc_percent, cbd_percent, raw_data, created_at
        ) VALUES ${placeholders.join(', ')}
        ON CONFLICT (store_product_id, crawl_run_id)
        WHERE store_product_id IS NOT NULL AND crawl_run_id IS NOT NULL
        DO NOTHING
      `;

      const result = await this.pool.query(query, values);
      totalInserted += result.rowCount || 0;
    }

    return totalInserted;
  }
}
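A sketch of calling the service directly for a bounded, no-write backfill; the dispensary id and date window are placeholders, and error handling is elided.

// Sketch (not part of the commit): dry-run backfill over a window, then a status check.
import { Pool } from 'pg';
import { CanonicalHydrationService } from './hydration-service';

async function previewBackfill() {
  const pool = new Pool({ connectionString: process.env.DATABASE_URL });
  const service = new CanonicalHydrationService({ pool });

  const result = await service.hydrate({
    mode: 'backfill',
    dispensaryId: 112,                  // placeholder
    startDate: new Date('2024-01-01'),  // placeholder window
    endDate: new Date('2024-02-01'),
    dryRun: true,                       // count what would happen, write nothing
  });
  console.log(result);
  console.log(await service.getHydrationStatus(112));

  await pool.end();
}

previewBackfill();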
13
backend/src/canonical-hydration/index.ts
Normal file
@@ -0,0 +1,13 @@
/**
 * Canonical Hydration Module
 * Phase 2: Hydration Pipeline from dutchie_* to store_products/store_product_snapshots/crawl_runs
 */

// Types
export * from './types';

// Services
export { CrawlRunRecorder } from './crawl-run-recorder';
export { StoreProductNormalizer } from './store-product-normalizer';
export { SnapshotWriter } from './snapshot-writer';
export { CanonicalHydrationService } from './hydration-service';
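Because everything is re-exported here, consumers can import from the module root rather than from individual files; a sketch (the relative path depends on the caller's location):

// Sketch: importing through the barrel.
import { CanonicalHydrationService, HydrationOptions } from '../canonical-hydration';

const options: HydrationOptions = { mode: 'incremental', batchSize: 100 };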
303
backend/src/canonical-hydration/snapshot-writer.ts
Normal file
@@ -0,0 +1,303 @@
/**
 * SnapshotWriter
 * Inserts store_product_snapshots from dutchie_product_snapshots source table
 */

import { Pool } from 'pg';
import { SourceSnapshot, StoreProductSnapshot, ServiceContext } from './types';

export class SnapshotWriter {
  private pool: Pool;
  private log: (message: string) => void;
  private batchSize: number;

  constructor(ctx: ServiceContext, batchSize: number = 100) {
    this.pool = ctx.pool;
    this.log = ctx.logger || console.log;
    this.batchSize = batchSize;
  }

  /**
   * Write snapshots for a crawl run.
   * Reads from dutchie_product_snapshots and inserts to store_product_snapshots.
   */
  async writeSnapshotsForCrawlRun(
    crawlRunId: number,
    dispensaryId: number,
    storeProductIdMap: Map<string, number>,
    crawledAt: Date
  ): Promise<{ written: number; skipped: number; errors: string[] }> {
    const errors: string[] = [];
    let written = 0;
    let skipped = 0;

    // Get source snapshots for this dispensary at this crawl time
    const sourceSnapshots = await this.getSourceSnapshots(dispensaryId, crawledAt);
    this.log(`Found ${sourceSnapshots.length} source snapshots for dispensary ${dispensaryId} at ${crawledAt.toISOString()}`);

    // Process in batches
    for (let i = 0; i < sourceSnapshots.length; i += this.batchSize) {
      const batch = sourceSnapshots.slice(i, i + this.batchSize);
      try {
        const { batchWritten, batchSkipped } = await this.writeBatch(
          batch,
          crawlRunId,
          storeProductIdMap
        );
        written += batchWritten;
        skipped += batchSkipped;
      } catch (err: any) {
        errors.push(`Batch ${i / this.batchSize}: ${err.message}`);
      }
    }

    return { written, skipped, errors };
  }

  /**
   * Write a single snapshot
   */
  async writeSnapshot(
    source: SourceSnapshot,
    crawlRunId: number,
    storeProductId: number
  ): Promise<number | null> {
    const normalized = this.normalizeSnapshot(source, crawlRunId, storeProductId);

    const result = await this.pool.query(
      `INSERT INTO store_product_snapshots (
        dispensary_id, store_product_id, crawl_run_id, captured_at,
        price_rec, price_med, is_on_special, is_in_stock, stock_quantity,
        thc_percent, cbd_percent, raw_data, created_at
      ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, NOW())
      ON CONFLICT (store_product_id, crawl_run_id)
      WHERE store_product_id IS NOT NULL AND crawl_run_id IS NOT NULL
      DO UPDATE SET
        price_rec = EXCLUDED.price_rec,
        price_med = EXCLUDED.price_med,
        is_on_special = EXCLUDED.is_on_special,
        is_in_stock = EXCLUDED.is_in_stock,
        stock_quantity = EXCLUDED.stock_quantity,
        thc_percent = EXCLUDED.thc_percent,
        cbd_percent = EXCLUDED.cbd_percent,
        raw_data = EXCLUDED.raw_data
      RETURNING id`,
      [
        normalized.dispensary_id,
        normalized.store_product_id,
        normalized.crawl_run_id,
        normalized.captured_at,
        normalized.price_rec,
        normalized.price_med,
        normalized.is_on_special,
        normalized.is_in_stock,
        normalized.stock_quantity,
        normalized.thc_percent,
        normalized.cbd_percent,
        JSON.stringify(normalized.raw_data),
      ]
    );

    return result.rows[0]?.id || null;
  }

  /**
   * Write a batch of snapshots
   */
  async writeBatch(
    sourceSnapshots: SourceSnapshot[],
    crawlRunId: number,
    storeProductIdMap: Map<string, number>
  ): Promise<{ batchWritten: number; batchSkipped: number }> {
    if (sourceSnapshots.length === 0) return { batchWritten: 0, batchSkipped: 0 };

    const values: any[] = [];
    const placeholders: string[] = [];
    let paramIndex = 1;
    let skipped = 0;

    for (const source of sourceSnapshots) {
      // Look up store_product_id
      const storeProductId = storeProductIdMap.get(source.external_product_id);
      if (!storeProductId) {
        skipped++;
        continue;
      }

      const normalized = this.normalizeSnapshot(source, crawlRunId, storeProductId);

      values.push(
        normalized.dispensary_id,
        normalized.store_product_id,
        normalized.crawl_run_id,
        normalized.captured_at,
        normalized.price_rec,
        normalized.price_med,
        normalized.is_on_special,
        normalized.is_in_stock,
        normalized.stock_quantity,
        normalized.thc_percent,
        normalized.cbd_percent,
        JSON.stringify(normalized.raw_data)
      );

      const rowPlaceholders = [];
      for (let j = 0; j < 12; j++) {
        rowPlaceholders.push(`$${paramIndex++}`);
      }
      placeholders.push(`(${rowPlaceholders.join(', ')}, NOW())`);
    }

    if (placeholders.length === 0) {
      return { batchWritten: 0, batchSkipped: skipped };
    }

    const query = `
      INSERT INTO store_product_snapshots (
        dispensary_id, store_product_id, crawl_run_id, captured_at,
        price_rec, price_med, is_on_special, is_in_stock, stock_quantity,
        thc_percent, cbd_percent, raw_data, created_at
      ) VALUES ${placeholders.join(', ')}
      ON CONFLICT (store_product_id, crawl_run_id)
      WHERE store_product_id IS NOT NULL AND crawl_run_id IS NOT NULL
      DO UPDATE SET
        price_rec = EXCLUDED.price_rec,
        price_med = EXCLUDED.price_med,
        is_on_special = EXCLUDED.is_on_special,
        is_in_stock = EXCLUDED.is_in_stock,
        stock_quantity = EXCLUDED.stock_quantity,
        thc_percent = EXCLUDED.thc_percent,
        cbd_percent = EXCLUDED.cbd_percent,
        raw_data = EXCLUDED.raw_data
    `;

    const result = await this.pool.query(query, values);
    return { batchWritten: result.rowCount || 0, batchSkipped: skipped };
  }

  /**
   * Get source snapshots from dutchie_product_snapshots for a specific crawl time.
   * Groups snapshots by crawled_at time (within a 5-minute window).
   */
  async getSourceSnapshots(
    dispensaryId: number,
    crawledAt: Date
  ): Promise<SourceSnapshot[]> {
    // Find snapshots within 5 minutes of the target time
    const windowMinutes = 5;
    const result = await this.pool.query(
      `SELECT * FROM dutchie_product_snapshots
       WHERE dispensary_id = $1
         AND crawled_at >= $2 - INTERVAL '${windowMinutes} minutes'
         AND crawled_at <= $2 + INTERVAL '${windowMinutes} minutes'
       ORDER BY crawled_at ASC`,
      [dispensaryId, crawledAt]
    );
    return result.rows;
  }

  /**
   * Get distinct crawl times from dutchie_product_snapshots for a dispensary.
   * Used for backfill to identify each crawl run.
   */
  async getDistinctCrawlTimes(
    dispensaryId: number,
    startDate?: Date,
    endDate?: Date
  ): Promise<Date[]> {
    let query = `
      SELECT DISTINCT date_trunc('minute', crawled_at) as crawl_time
      FROM dutchie_product_snapshots
      WHERE dispensary_id = $1
    `;
    const params: any[] = [dispensaryId];
    let paramIndex = 2;

    if (startDate) {
      query += ` AND crawled_at >= $${paramIndex++}`;
      params.push(startDate);
    }

    if (endDate) {
      query += ` AND crawled_at <= $${paramIndex++}`;
      params.push(endDate);
    }

    query += ' ORDER BY crawl_time ASC';

    const result = await this.pool.query(query, params);
    return result.rows.map(row => new Date(row.crawl_time));
  }

  /**
   * Check if snapshots already exist for a crawl run
   */
  async snapshotsExistForCrawlRun(crawlRunId: number): Promise<boolean> {
    const result = await this.pool.query(
      'SELECT 1 FROM store_product_snapshots WHERE crawl_run_id = $1 LIMIT 1',
      [crawlRunId]
    );
    return result.rows.length > 0;
  }

  /**
   * Normalize a source snapshot to store_product_snapshot format
   */
  private normalizeSnapshot(
    source: SourceSnapshot,
    crawlRunId: number,
    storeProductId: number
  ): StoreProductSnapshot {
    // Convert cents to dollars
    const priceRec = source.rec_min_price_cents !== null
      ? source.rec_min_price_cents / 100
      : null;
    const priceMed = source.med_min_price_cents !== null
      ? source.med_min_price_cents / 100
      : null;

    // Determine stock status
    const isInStock = this.isSnapshotInStock(source.stock_status, source.total_quantity_available);

    return {
      dispensary_id: source.dispensary_id,
      store_product_id: storeProductId,
      crawl_run_id: crawlRunId,
      captured_at: source.crawled_at,
      price_rec: priceRec,
      price_med: priceMed,
      is_on_special: false, // Source doesn't have special flag
      is_in_stock: isInStock,
      stock_quantity: source.total_quantity_available,
      thc_percent: null, // Not in snapshot, would need to join with product
      cbd_percent: null, // Not in snapshot, would need to join with product
      raw_data: {
        source_id: source.id,
        status: source.status,
        rec_min_price_cents: source.rec_min_price_cents,
        rec_max_price_cents: source.rec_max_price_cents,
        med_min_price_cents: source.med_min_price_cents,
        med_max_price_cents: source.med_max_price_cents,
      },
    };
  }

  /**
   * Determine if snapshot is in stock
   */
  private isSnapshotInStock(stockStatus: string | null, quantity: number | null): boolean {
    if (quantity !== null && quantity > 0) return true;

    if (stockStatus) {
      const status = stockStatus.toLowerCase();
      if (status === 'in_stock' || status === 'instock' || status === 'available') {
        return true;
      }
      if (status === 'out_of_stock' || status === 'outofstock' || status === 'unavailable') {
        return false;
      }
    }

    return false;
  }
}
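A sketch of exercising SnapshotWriter by itself. It reuses a single pre-created crawl_runs id purely for illustration; in the real pipeline (hydration-service.ts above) each job gets its own crawl run.

// Sketch (not part of the commit): replay source snapshots for every distinct crawl time.
import { Pool } from 'pg';
import { SnapshotWriter } from './snapshot-writer';
import { StoreProductNormalizer } from './store-product-normalizer';

async function replay(dispensaryId: number, crawlRunId: number) {
  const pool = new Pool({ connectionString: process.env.DATABASE_URL });
  const ctx = { pool };
  const writer = new SnapshotWriter(ctx);
  const idMap = await new StoreProductNormalizer(ctx).getStoreProductIdMap(dispensaryId);

  for (const crawlTime of await writer.getDistinctCrawlTimes(dispensaryId)) {
    // Pulls dutchie_product_snapshots within the 5-minute window around crawlTime
    const { written, skipped, errors } = await writer.writeSnapshotsForCrawlRun(
      crawlRunId, dispensaryId, idMap, crawlTime
    );
    console.log(`${crawlTime.toISOString()}: written=${written} skipped=${skipped} errors=${errors.length}`);
  }
  await pool.end();
}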
322
backend/src/canonical-hydration/store-product-normalizer.ts
Normal file
@@ -0,0 +1,322 @@
/**
 * StoreProductNormalizer
 * Upserts store_products from dutchie_products source table
 */

import { Pool } from 'pg';
import { SourceProduct, StoreProduct, ServiceContext } from './types';

export class StoreProductNormalizer {
  private pool: Pool;
  private log: (message: string) => void;
  private batchSize: number;

  constructor(ctx: ServiceContext, batchSize: number = 100) {
    this.pool = ctx.pool;
    this.log = ctx.logger || console.log;
    this.batchSize = batchSize;
  }

  /**
   * Upsert products for a specific dispensary.
   * Reads from dutchie_products and upserts to store_products.
   */
  async upsertProductsForDispensary(dispensaryId: number): Promise<{ upserted: number; errors: string[] }> {
    const errors: string[] = [];
    let upserted = 0;

    // Get all products for this dispensary from source
    const sourceProducts = await this.getSourceProducts(dispensaryId);
    this.log(`Found ${sourceProducts.length} source products for dispensary ${dispensaryId}`);

    // Process in batches to avoid memory issues
    for (let i = 0; i < sourceProducts.length; i += this.batchSize) {
      const batch = sourceProducts.slice(i, i + this.batchSize);
      try {
        const batchUpserted = await this.upsertBatch(batch);
        upserted += batchUpserted;
      } catch (err: any) {
        errors.push(`Batch ${i / this.batchSize}: ${err.message}`);
      }
    }

    return { upserted, errors };
  }

  /**
   * Upsert a single product
   */
  async upsertProduct(source: SourceProduct): Promise<number | null> {
    const normalized = this.normalizeProduct(source);

    const result = await this.pool.query(
      `INSERT INTO store_products (
        dispensary_id, brand_id, provider, provider_product_id,
        name_raw, brand_name_raw, category_raw,
        price_rec, price_med, is_on_special, is_in_stock, stock_quantity,
        thc_percent, cbd_percent, image_url,
        first_seen_at, last_seen_at, created_at, updated_at
      ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, NOW(), NOW())
      ON CONFLICT (dispensary_id, provider, provider_product_id)
      DO UPDATE SET
        name_raw = EXCLUDED.name_raw,
        brand_name_raw = EXCLUDED.brand_name_raw,
        category_raw = EXCLUDED.category_raw,
        price_rec = EXCLUDED.price_rec,
        price_med = EXCLUDED.price_med,
        is_on_special = EXCLUDED.is_on_special,
        is_in_stock = EXCLUDED.is_in_stock,
        stock_quantity = EXCLUDED.stock_quantity,
        thc_percent = EXCLUDED.thc_percent,
        cbd_percent = EXCLUDED.cbd_percent,
        image_url = COALESCE(EXCLUDED.image_url, store_products.image_url),
        last_seen_at = EXCLUDED.last_seen_at,
        updated_at = NOW()
      RETURNING id`,
      [
        normalized.dispensary_id,
        normalized.brand_id,
        normalized.provider,
        normalized.provider_product_id,
        normalized.name_raw,
        normalized.brand_name_raw,
        normalized.category_raw,
        normalized.price_rec,
        normalized.price_med,
        normalized.is_on_special,
        normalized.is_in_stock,
        normalized.stock_quantity,
        normalized.thc_percent,
        normalized.cbd_percent,
        normalized.image_url,
        normalized.first_seen_at,
        normalized.last_seen_at,
      ]
    );

    return result.rows[0]?.id || null;
  }

  /**
   * Upsert a batch of products
   */
  async upsertBatch(sourceProducts: SourceProduct[]): Promise<number> {
    if (sourceProducts.length === 0) return 0;

    // Build multi-row INSERT with ON CONFLICT
    const values: any[] = [];
    const placeholders: string[] = [];
    let paramIndex = 1;

    for (const source of sourceProducts) {
      const normalized = this.normalizeProduct(source);
      values.push(
        normalized.dispensary_id,
        normalized.brand_id,
        normalized.provider,
        normalized.provider_product_id,
        normalized.name_raw,
        normalized.brand_name_raw,
        normalized.category_raw,
        normalized.price_rec,
        normalized.price_med,
        normalized.is_on_special,
        normalized.is_in_stock,
        normalized.stock_quantity,
        normalized.thc_percent,
        normalized.cbd_percent,
        normalized.image_url,
        normalized.first_seen_at,
        normalized.last_seen_at
      );

      const rowPlaceholders = [];
      for (let j = 0; j < 17; j++) {
        rowPlaceholders.push(`$${paramIndex++}`);
      }
      placeholders.push(`(${rowPlaceholders.join(', ')}, NOW(), NOW())`);
    }

    const query = `
      INSERT INTO store_products (
        dispensary_id, brand_id, provider, provider_product_id,
        name_raw, brand_name_raw, category_raw,
        price_rec, price_med, is_on_special, is_in_stock, stock_quantity,
        thc_percent, cbd_percent, image_url,
        first_seen_at, last_seen_at, created_at, updated_at
      ) VALUES ${placeholders.join(', ')}
      ON CONFLICT (dispensary_id, provider, provider_product_id)
      DO UPDATE SET
        name_raw = EXCLUDED.name_raw,
        brand_name_raw = EXCLUDED.brand_name_raw,
        category_raw = EXCLUDED.category_raw,
        price_rec = EXCLUDED.price_rec,
        price_med = EXCLUDED.price_med,
        is_on_special = EXCLUDED.is_on_special,
        is_in_stock = EXCLUDED.is_in_stock,
        stock_quantity = EXCLUDED.stock_quantity,
        thc_percent = EXCLUDED.thc_percent,
        cbd_percent = EXCLUDED.cbd_percent,
        image_url = COALESCE(EXCLUDED.image_url, store_products.image_url),
        last_seen_at = EXCLUDED.last_seen_at,
        updated_at = NOW()
    `;

    const result = await this.pool.query(query, values);
    return result.rowCount || 0;
  }

  /**
   * Get store_product ID by canonical key
   */
  async getStoreProductId(
    dispensaryId: number,
    provider: string,
    providerProductId: string
  ): Promise<number | null> {
    const result = await this.pool.query(
      'SELECT id FROM store_products WHERE dispensary_id = $1 AND provider = $2 AND provider_product_id = $3',
      [dispensaryId, provider, providerProductId]
    );
    return result.rows[0]?.id || null;
  }

  /**
   * Get all store_product IDs for a dispensary (for snapshot writing)
   */
  async getStoreProductIdMap(dispensaryId: number): Promise<Map<string, number>> {
    const result = await this.pool.query(
      'SELECT id, provider_product_id FROM store_products WHERE dispensary_id = $1',
      [dispensaryId]
    );

    const map = new Map<string, number>();
    for (const row of result.rows) {
      map.set(row.provider_product_id, row.id);
    }
    return map;
  }

  /**
   * Get source products from dutchie_products
   */
  private async getSourceProducts(dispensaryId: number): Promise<SourceProduct[]> {
    const result = await this.pool.query(
      `SELECT * FROM dutchie_products WHERE dispensary_id = $1`,
      [dispensaryId]
    );
    return result.rows;
  }

  /**
   * Normalize a source product to store_product format
   */
  private normalizeProduct(source: SourceProduct): StoreProduct {
    // Extract price from JSONB if present
    const priceRec = this.extractPrice(source.price_rec);
    const priceMed = this.extractPrice(source.price_med);

    // Parse THC/CBD percentages
    const thcPercent = this.parsePercentage(source.thc);
    const cbdPercent = this.parsePercentage(source.cbd);

    // Determine stock status
    const isInStock = this.isProductInStock(source.stock_status, source.total_quantity_available);

    return {
      dispensary_id: source.dispensary_id,
      brand_id: null, // Source has UUID strings, target expects integer - set to null for now
      provider: source.platform || 'dutchie',
      provider_product_id: source.external_product_id,
      name_raw: source.name,
      brand_name_raw: source.brand_name,
      category_raw: source.type || source.subcategory,
      price_rec: priceRec,
      price_med: priceMed,
      is_on_special: false, // Dutchie doesn't have a direct special flag, would need to check specials table
      is_in_stock: isInStock,
      stock_quantity: source.total_quantity_available,
      thc_percent: thcPercent,
      cbd_percent: cbdPercent,
      image_url: source.primary_image_url,
      first_seen_at: source.created_at,
      last_seen_at: source.updated_at,
    };
  }

  /**
   * Extract price from JSONB price field.
   * Handles formats like: {min: 10, max: 20}, {value: 15}, or just a number.
   */
  private extractPrice(priceData: any): number | null {
    if (priceData === null || priceData === undefined) return null;

    // If it's already a number
    if (typeof priceData === 'number') return priceData;

    // If it's a string that looks like a number
    if (typeof priceData === 'string') {
      const parsed = parseFloat(priceData);
      return isNaN(parsed) ? null : parsed;
    }

    // If it's an object with price data
    if (typeof priceData === 'object') {
      // Try common price formats
      if (priceData.min !== undefined && priceData.min !== null) {
        return typeof priceData.min === 'number' ? priceData.min : parseFloat(priceData.min);
      }
      if (priceData.value !== undefined && priceData.value !== null) {
        return typeof priceData.value === 'number' ? priceData.value : parseFloat(priceData.value);
      }
      if (priceData.price !== undefined && priceData.price !== null) {
        return typeof priceData.price === 'number' ? priceData.price : parseFloat(priceData.price);
      }
      // Check for array of variants
      if (Array.isArray(priceData) && priceData.length > 0) {
        const firstVariant = priceData[0];
        if (firstVariant.price !== undefined) {
          return typeof firstVariant.price === 'number' ? firstVariant.price : parseFloat(firstVariant.price);
        }
      }
    }

    return null;
  }

  /**
   * Parse percentage string to number.
   * Handles formats like: "25.5%", "25.5", "25.5 %", etc.
   */
  private parsePercentage(value: string | null | undefined): number | null {
    if (value === null || value === undefined) return null;

    // Remove percentage sign and whitespace
    const cleaned = value.toString().replace(/%/g, '').trim();

    const parsed = parseFloat(cleaned);
    return isNaN(parsed) ? null : parsed;
  }

  /**
   * Determine if product is in stock based on status and quantity
   */
  private isProductInStock(stockStatus: string | null, quantity: number | null): boolean {
    // Check quantity first
    if (quantity !== null && quantity > 0) return true;

    // Check status string
    if (stockStatus) {
      const status = stockStatus.toLowerCase();
      if (status === 'in_stock' || status === 'instock' || status === 'available') {
        return true;
      }
      if (status === 'out_of_stock' || status === 'outofstock' || status === 'unavailable') {
        return false;
      }
    }

    // Default to false if unknown
    return false;
  }
}
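The normalizer is usually driven by the service, but it can be run standalone; a sketch, assuming only DATABASE_URL and the exports above:

// Sketch (not part of the commit): upsert one dispensary's products, then build the id map.
import { Pool } from 'pg';
import { StoreProductNormalizer } from './store-product-normalizer';

async function normalizeOne(dispensaryId: number) {
  const pool = new Pool({ connectionString: process.env.DATABASE_URL });
  const normalizer = new StoreProductNormalizer({ pool }, 200); // optional larger batch size
  const { upserted, errors } = await normalizer.upsertProductsForDispensary(dispensaryId);
  const idMap = await normalizer.getStoreProductIdMap(dispensaryId);
  console.log(`upserted=${upserted} errors=${errors.length} mapped=${idMap.size}`);
  await pool.end();
}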
150
backend/src/canonical-hydration/types.ts
Normal file
@@ -0,0 +1,150 @@
/**
 * Canonical Hydration Types
 * Phase 2: Hydration Pipeline from dutchie_* to store_products/store_product_snapshots/crawl_runs
 */

import { Pool } from 'pg';

// Source job types for hydration
export type SourceJobType = 'dispensary_crawl_jobs' | 'crawl_jobs' | 'job_run_logs';

// Source job record (from dispensary_crawl_jobs)
export interface SourceJob {
  id: number;
  dispensary_id: number;
  job_type: string;
  status: string;
  started_at: Date | null;
  completed_at: Date | null;
  duration_ms: number | null;
  products_found: number | null;
  products_new: number | null;
  products_updated: number | null;
  error_message: string | null;
}

// Source product record (from dutchie_products)
export interface SourceProduct {
  id: number;
  dispensary_id: number;
  platform: string;
  external_product_id: string;
  name: string;
  brand_name: string | null;
  brand_id: number | null;
  type: string | null;
  subcategory: string | null;
  strain_type: string | null;
  thc: string | null;
  cbd: string | null;
  price_rec: any; // JSONB
  price_med: any; // JSONB
  stock_status: string | null;
  total_quantity_available: number | null;
  primary_image_url: string | null;
  created_at: Date;
  updated_at: Date;
}

// Source snapshot record (from dutchie_product_snapshots)
export interface SourceSnapshot {
  id: number;
  dutchie_product_id: number;
  dispensary_id: number;
  external_product_id: string;
  status: string | null;
  rec_min_price_cents: number | null;
  rec_max_price_cents: number | null;
  med_min_price_cents: number | null;
  med_max_price_cents: number | null;
  stock_status: string | null;
  total_quantity_available: number | null;
  crawled_at: Date;
  created_at: Date;
}

// Crawl run record for canonical table
export interface CrawlRun {
  id?: number;
  dispensary_id: number;
  provider: string;
  started_at: Date;
  finished_at: Date | null;
  duration_ms: number | null;
  status: string;
  error_message: string | null;
  products_found: number | null;
  products_new: number | null;
  products_updated: number | null;
  snapshots_written: number | null;
  worker_id: string | null;
  trigger_type: string | null;
  metadata: any;
  source_job_type: SourceJobType;
  source_job_id: number;
}

// Store product record for canonical table
export interface StoreProduct {
  id?: number;
  dispensary_id: number;
  brand_id: number | null;
  provider: string;
  provider_product_id: string;
  name_raw: string;
  brand_name_raw: string | null;
  category_raw: string | null;
  price_rec: number | null;
  price_med: number | null;
  is_on_special: boolean;
  is_in_stock: boolean;
  stock_quantity: number | null;
  thc_percent: number | null;
  cbd_percent: number | null;
  image_url: string | null;
  first_seen_at: Date;
  last_seen_at: Date;
}

// Store product snapshot record for canonical table
export interface StoreProductSnapshot {
  id?: number;
  dispensary_id: number;
  store_product_id: number;
  crawl_run_id: number;
  captured_at: Date;
  price_rec: number | null;
  price_med: number | null;
  is_on_special: boolean;
  is_in_stock: boolean;
  stock_quantity: number | null;
  thc_percent: number | null;
  cbd_percent: number | null;
  raw_data: any;
}

// Hydration options
export interface HydrationOptions {
  mode: 'backfill' | 'incremental';
  dispensaryId?: number;
  startDate?: Date;
  endDate?: Date;
  batchSize?: number;
  dryRun?: boolean;
}

// Hydration result
export interface HydrationResult {
  crawlRunsCreated: number;
  crawlRunsSkipped: number;
  productsUpserted: number;
  snapshotsWritten: number;
  errors: string[];
  durationMs: number;
}

// Service context
export interface ServiceContext {
  pool: Pool;
  logger?: (message: string) => void;
}
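For orientation, the sketch below shows how the types above are meant to compose in a hydration run. The `hydrate` entry point is an assumption (declared, not implemented here); only the exported types come from this commit, and the option values are illustrative.

import { Pool } from 'pg';
import { HydrationOptions, HydrationResult, ServiceContext } from './types';

// Assumed Phase 2 pipeline entry point; not part of this file.
declare function hydrate(ctx: ServiceContext, options: HydrationOptions): Promise<HydrationResult>;

async function runDryBackfill(pool: Pool): Promise<void> {
  const ctx: ServiceContext = { pool, logger: (m) => console.log(`[hydrate] ${m}`) };
  const options: HydrationOptions = {
    mode: 'backfill',
    batchSize: 500, // illustrative value
    dryRun: true,   // count work without writing canonical rows
  };
  const result: HydrationResult = await hydrate(ctx, options);
  console.log(`runs=${result.crawlRunsCreated} snapshots=${result.snapshotsWritten} in ${result.durationMs}ms`);
}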
657
backend/src/crawlers/base/base-dutchie.ts
Normal file
@@ -0,0 +1,657 @@
/**
 * Base Dutchie Crawler Template
 *
 * This is the base template for all Dutchie store crawlers.
 * Per-store crawlers extend this by overriding specific methods.
 *
 * Exports:
 * - crawlProducts(dispensary, options) - Main crawl entry point
 * - detectStructure(page) - Detect page structure for sandbox mode
 * - extractProducts(document) - Extract product data
 * - extractImages(document) - Extract product images
 * - extractStock(document) - Extract stock status
 * - extractPagination(document) - Extract pagination info
 */

import {
  crawlDispensaryProducts as baseCrawlDispensaryProducts,
  CrawlResult,
} from '../../dutchie-az/services/product-crawler';
import { Dispensary, CrawlerProfileOptions } from '../../dutchie-az/types';

// Re-export CrawlResult for convenience
export { CrawlResult };

// ============================================================
// TYPES
// ============================================================

/**
 * Options passed to the per-store crawler
 */
export interface StoreCrawlOptions {
  pricingType?: 'rec' | 'med';
  useBothModes?: boolean;
  downloadImages?: boolean;
  trackStock?: boolean;
  timeoutMs?: number;
  config?: Record<string, any>;
}

/**
 * Progress callback for reporting crawl progress
 */
export interface CrawlProgressCallback {
  phase: 'fetching' | 'processing' | 'saving' | 'images' | 'complete';
  current: number;
  total: number;
  message?: string;
}

/**
 * Structure detection result for sandbox mode
 */
export interface StructureDetectionResult {
  success: boolean;
  menuType: 'dutchie' | 'treez' | 'jane' | 'unknown';
  iframeUrl?: string;
  graphqlEndpoint?: string;
  dispensaryId?: string;
  selectors: {
    productContainer?: string;
    productName?: string;
    productPrice?: string;
    productImage?: string;
    productCategory?: string;
    pagination?: string;
    loadMore?: string;
  };
  pagination: {
    type: 'scroll' | 'click' | 'graphql' | 'none';
    hasMore?: boolean;
    pageSize?: number;
  };
  errors: string[];
  metadata: Record<string, any>;
}

/**
 * Product extraction result
 */
export interface ExtractedProduct {
  externalId: string;
  name: string;
  brand?: string;
  category?: string;
  subcategory?: string;
  price?: number;
  priceRec?: number;
  priceMed?: number;
  weight?: string;
  thcContent?: string;
  cbdContent?: string;
  description?: string;
  imageUrl?: string;
  stockStatus?: 'in_stock' | 'out_of_stock' | 'low_stock' | 'unknown';
  quantity?: number;
  raw?: Record<string, any>;
}

/**
 * Image extraction result
 */
export interface ExtractedImage {
  productId: string;
  imageUrl: string;
  isPrimary: boolean;
  position: number;
}

/**
 * Stock extraction result
 */
export interface ExtractedStock {
  productId: string;
  status: 'in_stock' | 'out_of_stock' | 'low_stock' | 'unknown';
  quantity?: number;
  lastChecked: Date;
}

/**
 * Pagination extraction result
 */
export interface ExtractedPagination {
  hasNextPage: boolean;
  currentPage?: number;
  totalPages?: number;
  totalProducts?: number;
  nextCursor?: string;
  loadMoreSelector?: string;
}

/**
 * Hook points that per-store crawlers can override
 */
export interface DutchieCrawlerHooks {
  /**
   * Called before fetching products.
   * Can be used to set up custom headers, cookies, etc.
   */
  beforeFetch?: (dispensary: Dispensary) => Promise<void>;

  /**
   * Called after fetching products, before processing.
   * Can be used to filter or transform raw products.
   */
  afterFetch?: (products: any[], dispensary: Dispensary) => Promise<any[]>;

  /**
   * Called after all processing is complete.
   * Can be used for cleanup or post-processing.
   */
  afterComplete?: (result: CrawlResult, dispensary: Dispensary) => Promise<void>;

  /**
   * Custom selector resolver for iframe detection
   */
  resolveIframe?: (page: any) => Promise<string | null>;

  /**
   * Custom product container selector
   */
  getProductContainerSelector?: () => string;

  /**
   * Custom product extraction from container element
   */
  extractProductFromElement?: (element: any) => Promise<ExtractedProduct | null>;
}

/**
 * Selectors configuration for per-store overrides
 */
export interface DutchieSelectors {
  iframe?: string;
  productContainer?: string;
  productName?: string;
  productPrice?: string;
  productPriceRec?: string;
  productPriceMed?: string;
  productImage?: string;
  productCategory?: string;
  productBrand?: string;
  productWeight?: string;
  productThc?: string;
  productCbd?: string;
  productDescription?: string;
  productStock?: string;
  loadMore?: string;
  pagination?: string;
}

// ============================================================
// DEFAULT SELECTORS
// ============================================================

export const DEFAULT_DUTCHIE_SELECTORS: DutchieSelectors = {
  iframe: 'iframe[src*="dutchie.com"]',
  productContainer: '[data-testid="product-card"], .product-card, [class*="ProductCard"]',
  productName: '[data-testid="product-title"], .product-title, [class*="ProductTitle"]',
  productPrice: '[data-testid="product-price"], .product-price, [class*="ProductPrice"]',
  productImage: 'img[src*="dutchie"], img[src*="product"], .product-image img',
  productCategory: '[data-testid="category-name"], .category-name',
  productBrand: '[data-testid="brand-name"], .brand-name, [class*="BrandName"]',
  loadMore: 'button[data-testid="load-more"], .load-more-button',
  pagination: '.pagination, [class*="Pagination"]',
};

// ============================================================
// BASE CRAWLER CLASS
// ============================================================

/**
 * BaseDutchieCrawler - Base class for all Dutchie store crawlers
 *
 * Per-store crawlers extend this class and override methods as needed.
 * The default implementation delegates to the existing shared Dutchie logic.
 */
export class BaseDutchieCrawler {
  protected dispensary: Dispensary;
  protected options: StoreCrawlOptions;
  protected hooks: DutchieCrawlerHooks;
  protected selectors: DutchieSelectors;

  constructor(
    dispensary: Dispensary,
    options: StoreCrawlOptions = {},
    hooks: DutchieCrawlerHooks = {},
    selectors: DutchieSelectors = {}
  ) {
    this.dispensary = dispensary;
    this.options = {
      pricingType: 'rec',
      useBothModes: true,
      downloadImages: true,
      trackStock: true,
      timeoutMs: 30000,
      ...options,
    };
    this.hooks = hooks;
    this.selectors = { ...DEFAULT_DUTCHIE_SELECTORS, ...selectors };
  }

  /**
   * Main entry point - crawl products for this dispensary.
   * Override this in per-store crawlers to customize behavior.
   */
  async crawlProducts(): Promise<CrawlResult> {
    // Call beforeFetch hook if defined
    if (this.hooks.beforeFetch) {
      await this.hooks.beforeFetch(this.dispensary);
    }

    // Use the existing shared Dutchie crawl logic
    const result = await baseCrawlDispensaryProducts(
      this.dispensary,
      this.options.pricingType || 'rec',
      {
        useBothModes: this.options.useBothModes,
        downloadImages: this.options.downloadImages,
      }
    );

    // Call afterComplete hook if defined
    if (this.hooks.afterComplete) {
      await this.hooks.afterComplete(result, this.dispensary);
    }

    return result;
  }

  /**
   * Detect page structure for sandbox discovery mode.
   * Override in per-store crawlers if needed.
   *
   * @param page - Puppeteer page object or HTML string
   * @returns Structure detection result
   */
  async detectStructure(page: any): Promise<StructureDetectionResult> {
    const result: StructureDetectionResult = {
      success: false,
      menuType: 'unknown',
      selectors: {},
      pagination: { type: 'none' },
      errors: [],
      metadata: {},
    };

    try {
      // Default implementation: check for Dutchie iframe
      if (typeof page === 'string') {
        // HTML string mode
        if (page.includes('dutchie.com')) {
          result.menuType = 'dutchie';
          result.success = true;
        }
      } else if (page && typeof page.evaluate === 'function') {
        // Puppeteer page mode
        const detection = await page.evaluate((selectorConfig: DutchieSelectors) => {
          const iframe = document.querySelector(selectorConfig.iframe || '') as HTMLIFrameElement;
          const iframeUrl = iframe?.src || null;

          // Check for product containers
          const containers = document.querySelectorAll(selectorConfig.productContainer || '');

          return {
            hasIframe: !!iframe,
            iframeUrl,
            productCount: containers.length,
            isDutchie: !!iframeUrl?.includes('dutchie.com'),
          };
        }, this.selectors);

        if (detection.isDutchie) {
          result.menuType = 'dutchie';
          result.iframeUrl = detection.iframeUrl;
          result.success = true;
        }

        result.metadata = detection;
      }

      // Set default selectors for Dutchie
      if (result.menuType === 'dutchie') {
        result.selectors = {
          productContainer: this.selectors.productContainer,
          productName: this.selectors.productName,
          productPrice: this.selectors.productPrice,
          productImage: this.selectors.productImage,
          productCategory: this.selectors.productCategory,
        };
        result.pagination = { type: 'graphql' };
      }
    } catch (error: any) {
      result.errors.push(`Detection error: ${error.message}`);
    }

    return result;
  }

  /**
   * Extract products from page/document.
   * Override in per-store crawlers for custom extraction.
   *
   * @param document - DOM document, Puppeteer page, or raw products array
   * @returns Array of extracted products
   */
  async extractProducts(document: any): Promise<ExtractedProduct[]> {
    // Default implementation: assume document is already an array of products
    // from the GraphQL response
    if (Array.isArray(document)) {
      return document.map((product) => this.mapRawProduct(product));
    }

    // If document is a Puppeteer page, extract from DOM
    if (document && typeof document.evaluate === 'function') {
      return this.extractProductsFromPage(document);
    }

    return [];
  }

  /**
   * Extract products from Puppeteer page.
   * Override for custom DOM extraction.
   */
  protected async extractProductsFromPage(page: any): Promise<ExtractedProduct[]> {
    const products = await page.evaluate((selectors: DutchieSelectors) => {
      const containers = document.querySelectorAll(selectors.productContainer || '');
      return Array.from(containers).map((container) => {
        const nameEl = container.querySelector(selectors.productName || '');
        const priceEl = container.querySelector(selectors.productPrice || '');
        const imageEl = container.querySelector(selectors.productImage || '') as HTMLImageElement;
        const brandEl = container.querySelector(selectors.productBrand || '');

        return {
          name: nameEl?.textContent?.trim() || '',
          price: priceEl?.textContent?.trim() || '',
          imageUrl: imageEl?.src || '',
          brand: brandEl?.textContent?.trim() || '',
        };
      });
    }, this.selectors);

    return products.map((p: any, i: number) => ({
      externalId: `dom-product-${i}`,
      name: p.name,
      brand: p.brand,
      price: this.parsePrice(p.price),
      imageUrl: p.imageUrl,
      stockStatus: 'unknown' as const,
    }));
  }

  /**
   * Map raw product from GraphQL to ExtractedProduct.
   * Override for custom mapping.
   */
  protected mapRawProduct(raw: any): ExtractedProduct {
    return {
      externalId: raw.id || raw._id || raw.externalId,
      name: raw.name || raw.Name,
      brand: raw.brand?.name || raw.brandName || raw.brand,
      category: raw.type || raw.category || raw.Category,
      subcategory: raw.subcategory || raw.Subcategory,
      price: raw.recPrice || raw.price || raw.Price,
      priceRec: raw.recPrice || raw.Prices?.rec,
      priceMed: raw.medPrice || raw.Prices?.med,
      weight: raw.weight || raw.Weight,
      thcContent: raw.potencyThc?.formatted || raw.THCContent?.formatted,
      cbdContent: raw.potencyCbd?.formatted || raw.CBDContent?.formatted,
      description: raw.description || raw.Description,
      imageUrl: raw.image || raw.Image,
      stockStatus: this.mapStockStatus(raw),
      quantity: raw.quantity || raw.Quantity,
      raw,
    };
  }

  /**
   * Map raw stock status to standardized value
   */
  protected mapStockStatus(raw: any): 'in_stock' | 'out_of_stock' | 'low_stock' | 'unknown' {
    const status = raw.Status || raw.status || raw.stockStatus;
    if (status === 'Active' || status === 'active' || status === 'in_stock') {
      return 'in_stock';
    }
    if (status === 'Inactive' || status === 'inactive' || status === 'out_of_stock') {
      return 'out_of_stock';
    }
    if (status === 'low_stock') {
      return 'low_stock';
    }
    return 'unknown';
  }

  /**
   * Parse price string to number
   */
  protected parsePrice(priceStr: string): number | undefined {
    if (!priceStr) return undefined;
    const cleaned = priceStr.replace(/[^0-9.]/g, '');
    const num = parseFloat(cleaned);
    return isNaN(num) ? undefined : num;
  }

  /**
   * Extract images from document.
   * Override for custom image extraction.
   *
   * @param document - DOM document, Puppeteer page, or products array
   * @returns Array of extracted images
   */
  async extractImages(document: any): Promise<ExtractedImage[]> {
    if (Array.isArray(document)) {
      return document
        .filter((p) => p.image || p.Image || p.imageUrl)
        .map((p, i) => ({
          productId: p.id || p._id || `product-${i}`,
          imageUrl: p.image || p.Image || p.imageUrl,
          isPrimary: true,
          position: 0,
        }));
    }

    // Puppeteer page extraction
    if (document && typeof document.evaluate === 'function') {
      return this.extractImagesFromPage(document);
    }

    return [];
  }

  /**
   * Extract images from Puppeteer page
   */
  protected async extractImagesFromPage(page: any): Promise<ExtractedImage[]> {
    const images = await page.evaluate((selector: string) => {
      const imgs = document.querySelectorAll(selector);
      return Array.from(imgs).map((img, i) => ({
        src: (img as HTMLImageElement).src,
        position: i,
      }));
    }, this.selectors.productImage || 'img');

    return images.map((img: any, i: number) => ({
      productId: `dom-product-${i}`,
      imageUrl: img.src,
      isPrimary: i === 0,
      position: img.position,
    }));
  }

  /**
   * Extract stock information from document.
   * Override for custom stock extraction.
   *
   * @param document - DOM document, Puppeteer page, or products array
   * @returns Array of extracted stock statuses
   */
  async extractStock(document: any): Promise<ExtractedStock[]> {
    if (Array.isArray(document)) {
      return document.map((p) => ({
        productId: p.id || p._id || p.externalId,
        status: this.mapStockStatus(p),
        quantity: p.quantity || p.Quantity,
        lastChecked: new Date(),
      }));
    }

    return [];
  }

  /**
   * Extract pagination information from document.
   * Override for custom pagination handling.
   *
   * @param document - DOM document, Puppeteer page, or GraphQL response
   * @returns Pagination info
   */
  async extractPagination(document: any): Promise<ExtractedPagination> {
    // Default: check for page info in GraphQL response
    if (document && document.pageInfo) {
      return {
        hasNextPage: document.pageInfo.hasNextPage || false,
        currentPage: document.pageInfo.currentPage,
        totalPages: document.pageInfo.totalPages,
        totalProducts: document.pageInfo.totalCount || document.totalCount,
        nextCursor: document.pageInfo.endCursor,
      };
    }

    // Default: no pagination
    return {
      hasNextPage: false,
    };
  }

  /**
   * Get the cName (Dutchie slug) for this dispensary.
   * Override to customize cName extraction.
   */
  getCName(): string {
    if (this.dispensary.menuUrl) {
      try {
        const url = new URL(this.dispensary.menuUrl);
        const segments = url.pathname.split('/').filter(Boolean);
        if (segments.length >= 2) {
          return segments[segments.length - 1];
        }
      } catch {
        // Fall through to default
      }
    }
    return this.dispensary.slug || '';
  }

  /**
   * Get custom headers for API requests.
   * Override for store-specific headers.
   */
  getCustomHeaders(): Record<string, string> {
    const cName = this.getCName();
    return {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      Origin: 'https://dutchie.com',
      Referer: `https://dutchie.com/embedded-menu/${cName}`,
    };
  }
}

// ============================================================
// FACTORY FUNCTION
// ============================================================

/**
 * Create a base Dutchie crawler instance.
 * This is the default export used when no per-store override exists.
 */
export function createCrawler(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {},
  hooks: DutchieCrawlerHooks = {},
  selectors: DutchieSelectors = {}
): BaseDutchieCrawler {
  return new BaseDutchieCrawler(dispensary, options, hooks, selectors);
}

// ============================================================
// STANDALONE FUNCTIONS (required exports for orchestrator)
// ============================================================

/**
 * Crawl products using the base Dutchie logic.
 * Per-store files can call this or override it completely.
 */
export async function crawlProducts(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {}
): Promise<CrawlResult> {
  const crawler = createCrawler(dispensary, options);
  return crawler.crawlProducts();
}

/**
 * Detect structure using the base Dutchie logic
 */
export async function detectStructure(
  page: any,
  dispensary?: Dispensary
): Promise<StructureDetectionResult> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.detectStructure(page);
}

/**
 * Extract products using the base Dutchie logic
 */
export async function extractProducts(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedProduct[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractProducts(document);
}

/**
 * Extract images using the base Dutchie logic
 */
export async function extractImages(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedImage[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractImages(document);
}

/**
 * Extract stock using the base Dutchie logic
 */
export async function extractStock(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedStock[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractStock(document);
}

/**
 * Extract pagination using the base Dutchie logic
 */
export async function extractPagination(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedPagination> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractPagination(document);
}
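As a sketch of the override pattern this template enables (the store name, log lines, and selector value below are illustrative, not from this commit), a per-store file could wire in hooks and selector overrides like so:

import { createCrawler, DutchieCrawlerHooks, CrawlResult } from './base-dutchie';
import { Dispensary } from '../../dutchie-az/types';

// Hypothetical store: log around the shared crawl without reimplementing it.
const hooks: DutchieCrawlerHooks = {
  beforeFetch: async (d) => console.log(`[example-store] starting crawl for ${d.name}`),
  afterComplete: async (result: CrawlResult, d) => console.log(`[example-store] finished ${d.name}`),
};

export async function crawlExampleStore(dispensary: Dispensary): Promise<CrawlResult> {
  // Narrow one selector for a theme with non-standard product cards (illustrative).
  const crawler = createCrawler(dispensary, { pricingType: 'rec' }, hooks, {
    productContainer: '[data-testid="product-card"]',
  });
  return crawler.crawlProducts();
}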
330
backend/src/crawlers/base/base-jane.ts
Normal file
@@ -0,0 +1,330 @@
/**
 * Base Jane Crawler Template (PLACEHOLDER)
 *
 * This is the base template for all Jane (iheartjane) store crawlers.
 * Per-store crawlers extend this by overriding specific methods.
 *
 * TODO: Implement Jane-specific crawling logic (Algolia-based)
 */

import { Dispensary } from '../../dutchie-az/types';
import {
  StoreCrawlOptions,
  CrawlResult,
  StructureDetectionResult,
  ExtractedProduct,
  ExtractedImage,
  ExtractedStock,
  ExtractedPagination,
} from './base-dutchie';

// Re-export types
export {
  StoreCrawlOptions,
  CrawlResult,
  StructureDetectionResult,
  ExtractedProduct,
  ExtractedImage,
  ExtractedStock,
  ExtractedPagination,
};

// ============================================================
// JANE-SPECIFIC TYPES
// ============================================================

export interface JaneConfig {
  algoliaAppId?: string;
  algoliaApiKey?: string;
  algoliaIndex?: string;
  storeId?: string;
}

export interface JaneSelectors {
  productContainer?: string;
  productName?: string;
  productPrice?: string;
  productImage?: string;
  productCategory?: string;
  productBrand?: string;
  pagination?: string;
  loadMore?: string;
}

export const DEFAULT_JANE_SELECTORS: JaneSelectors = {
  productContainer: '[data-testid="product-card"], .product-card',
  productName: '[data-testid="product-name"], .product-name',
  productPrice: '[data-testid="product-price"], .product-price',
  productImage: '.product-image img, [data-testid="product-image"] img',
  productCategory: '.product-category',
  productBrand: '.product-brand, [data-testid="brand-name"]',
  loadMore: '[data-testid="load-more"], .load-more-btn',
};

// ============================================================
// BASE JANE CRAWLER CLASS
// ============================================================

export class BaseJaneCrawler {
  protected dispensary: Dispensary;
  protected options: StoreCrawlOptions;
  protected selectors: JaneSelectors;
  protected janeConfig: JaneConfig;

  constructor(
    dispensary: Dispensary,
    options: StoreCrawlOptions = {},
    selectors: JaneSelectors = {},
    janeConfig: JaneConfig = {}
  ) {
    this.dispensary = dispensary;
    this.options = {
      pricingType: 'rec',
      useBothModes: false,
      downloadImages: true,
      trackStock: true,
      timeoutMs: 30000,
      ...options,
    };
    this.selectors = { ...DEFAULT_JANE_SELECTORS, ...selectors };
    this.janeConfig = janeConfig;
  }

  /**
   * Main entry point - crawl products for this dispensary.
   * TODO: Implement Jane/Algolia-specific crawling.
   */
  async crawlProducts(): Promise<CrawlResult> {
    const startTime = Date.now();
    console.warn(`[BaseJaneCrawler] Jane crawling not yet implemented for ${this.dispensary.name}`);
    return {
      success: false,
      dispensaryId: this.dispensary.id || 0,
      productsFound: 0,
      productsFetched: 0,
      productsUpserted: 0,
      snapshotsCreated: 0,
      imagesDownloaded: 0,
      errorMessage: 'Jane crawler not yet implemented',
      durationMs: Date.now() - startTime,
    };
  }

  /**
   * Detect page structure for sandbox discovery mode.
   * Jane uses Algolia, so we look for Algolia config.
   */
  async detectStructure(page: any): Promise<StructureDetectionResult> {
    const result: StructureDetectionResult = {
      success: false,
      menuType: 'unknown',
      selectors: {},
      pagination: { type: 'none' },
      errors: [],
      metadata: {},
    };

    try {
      if (page && typeof page.evaluate === 'function') {
        // Look for Jane/Algolia indicators
        const detection = await page.evaluate(() => {
          // Check for iheartjane in page
          const hasJane = document.documentElement.innerHTML.includes('iheartjane') ||
            document.documentElement.innerHTML.includes('jane-menu');

          // Look for Algolia config
          const scripts = Array.from(document.querySelectorAll('script'));
          let algoliaConfig: any = null;

          for (const script of scripts) {
            const content = script.textContent || '';
            if (content.includes('algolia') || content.includes('ALGOLIA')) {
              // Try to extract config
              const appIdMatch = content.match(/applicationId['":\s]+['"]([^'"]+)['"]/);
              const apiKeyMatch = content.match(/apiKey['":\s]+['"]([^'"]+)['"]/);
              if (appIdMatch && apiKeyMatch) {
                algoliaConfig = {
                  appId: appIdMatch[1],
                  apiKey: apiKeyMatch[1],
                };
              }
            }
          }

          return {
            hasJane,
            algoliaConfig,
          };
        });

        if (detection.hasJane) {
          result.menuType = 'jane';
          result.success = true;
          result.metadata = detection;

          if (detection.algoliaConfig) {
            result.metadata.algoliaAppId = detection.algoliaConfig.appId;
            result.metadata.algoliaApiKey = detection.algoliaConfig.apiKey;
          }
        }
      }
    } catch (error: any) {
      result.errors.push(`Detection error: ${error.message}`);
    }

    return result;
  }

  /**
   * Extract products from Algolia response or page
   */
  async extractProducts(document: any): Promise<ExtractedProduct[]> {
    // If document is Algolia hits array
    if (Array.isArray(document)) {
      return document.map((hit) => this.mapAlgoliaHit(hit));
    }

    console.warn('[BaseJaneCrawler] extractProducts not yet fully implemented');
    return [];
  }

  /**
   * Map Algolia hit to ExtractedProduct
   */
  protected mapAlgoliaHit(hit: any): ExtractedProduct {
    return {
      externalId: hit.objectID || hit.id || hit.product_id,
      name: hit.name || hit.product_name,
      brand: hit.brand || hit.brand_name,
      category: hit.category || hit.kind,
      subcategory: hit.subcategory,
      price: hit.price || hit.bucket_price,
      priceRec: hit.prices?.rec || hit.price_rec,
      priceMed: hit.prices?.med || hit.price_med,
      weight: hit.weight || hit.amount,
      thcContent: hit.percent_thc ? `${hit.percent_thc}%` : undefined,
      cbdContent: hit.percent_cbd ? `${hit.percent_cbd}%` : undefined,
      description: hit.description,
      imageUrl: hit.image_url || hit.product_image_url,
      stockStatus: hit.available ? 'in_stock' : 'out_of_stock',
      quantity: hit.quantity_available,
      raw: hit,
    };
  }

  /**
   * Extract images from document
   */
  async extractImages(document: any): Promise<ExtractedImage[]> {
    if (Array.isArray(document)) {
      return document
        .filter((hit) => hit.image_url || hit.product_image_url)
        .map((hit, i) => ({
          productId: hit.objectID || hit.id || `jane-product-${i}`,
          imageUrl: hit.image_url || hit.product_image_url,
          isPrimary: true,
          position: 0,
        }));
    }

    return [];
  }

  /**
   * Extract stock information from document
   */
  async extractStock(document: any): Promise<ExtractedStock[]> {
    if (Array.isArray(document)) {
      return document.map((hit) => ({
        productId: hit.objectID || hit.id,
        status: hit.available ? 'in_stock' as const : 'out_of_stock' as const,
        quantity: hit.quantity_available,
        lastChecked: new Date(),
      }));
    }

    return [];
  }

  /**
   * Extract pagination information.
   * Algolia uses cursor-based pagination.
   */
  async extractPagination(document: any): Promise<ExtractedPagination> {
    if (document && typeof document === 'object' && !Array.isArray(document)) {
      return {
        hasNextPage: document.page < document.nbPages - 1,
        currentPage: document.page,
        totalPages: document.nbPages,
        totalProducts: document.nbHits,
      };
    }

    return { hasNextPage: false };
  }
}

// ============================================================
// FACTORY FUNCTION
// ============================================================

export function createCrawler(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {},
  selectors: JaneSelectors = {},
  janeConfig: JaneConfig = {}
): BaseJaneCrawler {
  return new BaseJaneCrawler(dispensary, options, selectors, janeConfig);
}

// ============================================================
// STANDALONE FUNCTIONS
// ============================================================

export async function crawlProducts(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {}
): Promise<CrawlResult> {
  const crawler = createCrawler(dispensary, options);
  return crawler.crawlProducts();
}

export async function detectStructure(
  page: any,
  dispensary?: Dispensary
): Promise<StructureDetectionResult> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.detectStructure(page);
}

export async function extractProducts(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedProduct[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractProducts(document);
}

export async function extractImages(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedImage[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractImages(document);
}

export async function extractStock(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedStock[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractStock(document);
}

export async function extractPagination(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedPagination> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractPagination(document);
}
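For reference, a minimal Algolia-style hit carrying the fields mapAlgoliaHit() reads can be fed straight through extractProducts(); the field values below are illustrative only.

import { BaseJaneCrawler } from './base-jane';
import { Dispensary } from '../../dutchie-az/types';

// Only fields consumed by mapAlgoliaHit(); values are made up.
const sampleHits = [
  {
    objectID: 'jane-123',
    name: 'Example Gummies 100mg',
    brand_name: 'Example Brand',
    kind: 'edible',
    bucket_price: 18,
    percent_thc: 10,
    available: true,
    quantity_available: 12,
    image_url: 'https://example.com/gummies.jpg',
  },
];

async function demo(dispensary: Dispensary): Promise<void> {
  const crawler = new BaseJaneCrawler(dispensary);
  const products = await crawler.extractProducts(sampleHits);
  console.log(products[0].stockStatus); // 'in_stock'
}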
212
backend/src/crawlers/base/base-treez.ts
Normal file
@@ -0,0 +1,212 @@
/**
 * Base Treez Crawler Template (PLACEHOLDER)
 *
 * This is the base template for all Treez store crawlers.
 * Per-store crawlers extend this by overriding specific methods.
 *
 * TODO: Implement Treez-specific crawling logic
 */

import { Dispensary } from '../../dutchie-az/types';
import {
  StoreCrawlOptions,
  CrawlResult,
  StructureDetectionResult,
  ExtractedProduct,
  ExtractedImage,
  ExtractedStock,
  ExtractedPagination,
} from './base-dutchie';

// Re-export types
export {
  StoreCrawlOptions,
  CrawlResult,
  StructureDetectionResult,
  ExtractedProduct,
  ExtractedImage,
  ExtractedStock,
  ExtractedPagination,
};

// ============================================================
// TREEZ-SPECIFIC TYPES
// ============================================================

export interface TreezSelectors {
  productContainer?: string;
  productName?: string;
  productPrice?: string;
  productImage?: string;
  productCategory?: string;
  productBrand?: string;
  addToCart?: string;
  pagination?: string;
}

export const DEFAULT_TREEZ_SELECTORS: TreezSelectors = {
  productContainer: '.product-tile, [class*="ProductCard"]',
  productName: '.product-name, [class*="ProductName"]',
  productPrice: '.product-price, [class*="ProductPrice"]',
  productImage: '.product-image img',
  productCategory: '.product-category',
  productBrand: '.product-brand',
  addToCart: '.add-to-cart-btn',
  pagination: '.pagination',
};

// ============================================================
// BASE TREEZ CRAWLER CLASS
// ============================================================

export class BaseTreezCrawler {
  protected dispensary: Dispensary;
  protected options: StoreCrawlOptions;
  protected selectors: TreezSelectors;

  constructor(
    dispensary: Dispensary,
    options: StoreCrawlOptions = {},
    selectors: TreezSelectors = {}
  ) {
    this.dispensary = dispensary;
    this.options = {
      pricingType: 'rec',
      useBothModes: false,
      downloadImages: true,
      trackStock: true,
      timeoutMs: 30000,
      ...options,
    };
    this.selectors = { ...DEFAULT_TREEZ_SELECTORS, ...selectors };
  }

  /**
   * Main entry point - crawl products for this dispensary.
   * TODO: Implement Treez-specific crawling.
   */
  async crawlProducts(): Promise<CrawlResult> {
    const startTime = Date.now();
    console.warn(`[BaseTreezCrawler] Treez crawling not yet implemented for ${this.dispensary.name}`);
    return {
      success: false,
      dispensaryId: this.dispensary.id || 0,
      productsFound: 0,
      productsFetched: 0,
      productsUpserted: 0,
      snapshotsCreated: 0,
      imagesDownloaded: 0,
      errorMessage: 'Treez crawler not yet implemented',
      durationMs: Date.now() - startTime,
    };
  }

  /**
   * Detect page structure for sandbox discovery mode
   */
  async detectStructure(page: any): Promise<StructureDetectionResult> {
    return {
      success: false,
      menuType: 'unknown',
      selectors: {},
      pagination: { type: 'none' },
      errors: ['Treez structure detection not yet implemented'],
      metadata: {},
    };
  }

  /**
   * Extract products from page/document
   */
  async extractProducts(document: any): Promise<ExtractedProduct[]> {
    console.warn('[BaseTreezCrawler] extractProducts not yet implemented');
    return [];
  }

  /**
   * Extract images from document
   */
  async extractImages(document: any): Promise<ExtractedImage[]> {
    console.warn('[BaseTreezCrawler] extractImages not yet implemented');
    return [];
  }

  /**
   * Extract stock information from document
   */
  async extractStock(document: any): Promise<ExtractedStock[]> {
    console.warn('[BaseTreezCrawler] extractStock not yet implemented');
    return [];
  }

  /**
   * Extract pagination information from document
   */
  async extractPagination(document: any): Promise<ExtractedPagination> {
    return { hasNextPage: false };
  }
}

// ============================================================
// FACTORY FUNCTION
// ============================================================

export function createCrawler(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {},
  selectors: TreezSelectors = {}
): BaseTreezCrawler {
  return new BaseTreezCrawler(dispensary, options, selectors);
}

// ============================================================
// STANDALONE FUNCTIONS
// ============================================================

export async function crawlProducts(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {}
): Promise<CrawlResult> {
  const crawler = createCrawler(dispensary, options);
  return crawler.crawlProducts();
}

export async function detectStructure(
  page: any,
  dispensary?: Dispensary
): Promise<StructureDetectionResult> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.detectStructure(page);
}

export async function extractProducts(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedProduct[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractProducts(document);
}

export async function extractImages(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedImage[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractImages(document);
}

export async function extractStock(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedStock[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractStock(document);
}

export async function extractPagination(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedPagination> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractPagination(document);
}
27
backend/src/crawlers/base/index.ts
Normal file
@@ -0,0 +1,27 @@
/**
 * Base Crawler Templates Index
 *
 * Exports all base crawler templates for easy importing.
 */

// Dutchie base (primary implementation)
export * from './base-dutchie';

// Treez base (placeholder)
export * as Treez from './base-treez';

// Jane base (placeholder)
export * as Jane from './base-jane';

// Re-export common types from dutchie for convenience
export type {
  StoreCrawlOptions,
  CrawlResult,
  StructureDetectionResult,
  ExtractedProduct,
  ExtractedImage,
  ExtractedStock,
  ExtractedPagination,
  DutchieCrawlerHooks,
  DutchieSelectors,
} from './base-dutchie';
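A consumer of this index might dispatch on menu platform as sketched below; the import paths and the dispatch function are illustrative, only the exports themselves come from the file above.

import { crawlProducts, Jane, Treez } from './base';
import type { CrawlResult, StoreCrawlOptions } from './base';
import { Dispensary } from '../dutchie-az/types';

// Route to the matching base implementation; Dutchie is the default.
async function crawlByPlatform(
  platform: string,
  dispensary: Dispensary,
  options: StoreCrawlOptions = {}
): Promise<CrawlResult> {
  if (platform === 'treez') return Treez.crawlProducts(dispensary, options);
  if (platform === 'jane') return Jane.crawlProducts(dispensary, options);
  return crawlProducts(dispensary, options);
}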
9
backend/src/crawlers/dutchie/base-dutchie.ts
Normal file
@@ -0,0 +1,9 @@
/**
 * Base Dutchie Crawler Template (Re-export for backward compatibility)
 *
 * DEPRECATED: Import from '../base/base-dutchie' instead.
 * This file re-exports everything from the new location for existing code.
 */

// Re-export everything from the new base location
export * from '../base/base-dutchie';
118
backend/src/crawlers/dutchie/stores/trulieve-scottsdale.ts
Normal file
@@ -0,0 +1,118 @@
/**
 * Trulieve Scottsdale - Per-Store Dutchie Crawler
 *
 * Store ID: 101
 * Profile Key: trulieve-scottsdale
 * Platform Dispensary ID: 5eaf489fa8a61801212577cc
 *
 * Phase 1: Identity implementation - no overrides, just uses base Dutchie logic.
 * Future: Add store-specific selectors, timing, or custom logic as needed.
 */

import {
  BaseDutchieCrawler,
  StoreCrawlOptions,
  CrawlResult,
  DutchieSelectors,
  crawlProducts as baseCrawlProducts,
} from '../../base/base-dutchie';
import { Dispensary } from '../../../dutchie-az/types';

// Re-export CrawlResult for the orchestrator
export { CrawlResult };

// ============================================================
// STORE CONFIGURATION
// ============================================================

/**
 * Store-specific configuration.
 * These can be used to customize crawler behavior for this store.
 */
export const STORE_CONFIG = {
  storeId: 101,
  profileKey: 'trulieve-scottsdale',
  name: 'Trulieve of Scottsdale Dispensary',
  platformDispensaryId: '5eaf489fa8a61801212577cc',

  // Store-specific overrides (none for Phase 1)
  customOptions: {
    // Example future overrides:
    // pricingType: 'rec',
    // useBothModes: true,
    // customHeaders: {},
    // maxRetries: 3,
  },
};

// ============================================================
// STORE CRAWLER CLASS
// ============================================================

/**
 * TrulieveScottsdaleCrawler - Per-store crawler for Trulieve Scottsdale
 *
 * Phase 1: Identity implementation - extends BaseDutchieCrawler with no overrides.
 * Future phases can override methods like:
 * - getCName() for custom slug handling
 * - crawlProducts() for completely custom logic
 * - Add hooks for pre/post processing
 */
export class TrulieveScottsdaleCrawler extends BaseDutchieCrawler {
  constructor(dispensary: Dispensary, options: StoreCrawlOptions = {}) {
    // Merge store-specific options with provided options
    const mergedOptions: StoreCrawlOptions = {
      ...STORE_CONFIG.customOptions,
      ...options,
    };

    super(dispensary, mergedOptions);
  }

  // Phase 1: No overrides - use base implementation
  // Future phases can add overrides here:
  //
  // async crawlProducts(): Promise<CrawlResult> {
  //   // Custom pre-processing
  //   // ...
  //   const result = await super.crawlProducts();
  //   // Custom post-processing
  //   // ...
  //   return result;
  // }
}

// ============================================================
// EXPORTED CRAWL FUNCTION
// ============================================================

/**
 * Main entry point for the orchestrator
 *
 * The orchestrator calls: mod.crawlProducts(dispensary, options)
 * This function creates a TrulieveScottsdaleCrawler and runs it.
 */
export async function crawlProducts(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {}
): Promise<CrawlResult> {
  console.log(`[TrulieveScottsdale] Using per-store crawler for ${dispensary.name}`);

  const crawler = new TrulieveScottsdaleCrawler(dispensary, options);
  return crawler.crawlProducts();
}

// ============================================================
// FACTORY FUNCTION (alternative API)
// ============================================================

/**
 * Create a crawler instance without running it.
 * Useful for testing or when you need to configure before running.
 */
export function createCrawler(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {}
): TrulieveScottsdaleCrawler {
  return new TrulieveScottsdaleCrawler(dispensary, options);
}
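To make the orchestrator contract above concrete, a minimal dispatch sketch follows. The module path pattern and fallback logic are assumptions; only mod.crawlProducts(dispensary, options) is stated by this file.

import { Dispensary } from '../../../dutchie-az/types';
import type { CrawlResult, StoreCrawlOptions } from '../../base/base-dutchie';

// Hypothetical orchestrator helper: prefer a per-store module, else the base crawler.
async function crawlWithProfile(
  profileKey: string,
  dispensary: Dispensary,
  options: StoreCrawlOptions = {}
): Promise<CrawlResult> {
  let mod: { crawlProducts: (d: Dispensary, o: StoreCrawlOptions) => Promise<CrawlResult> };
  try {
    // e.g. profileKey = 'trulieve-scottsdale' resolves to this file (path illustrative)
    mod = await import(`./stores/${profileKey}`);
  } catch {
    // No per-store override registered: fall back to the shared base implementation
    mod = await import('../../base/base-dutchie');
  }
  return mod.crawlProducts(dispensary, options);
}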
@@ -1,4 +1,4 @@
-import { pool } from './migrate';
+import { pool } from './pool';
 
 async function addJobsTable() {
   const client = await pool.connect();
@@ -1,18 +1,58 @@
+/**
+ * Database Migration Script (CLI-ONLY)
+ *
+ * This file is for running migrations via CLI only:
+ *   npx tsx src/db/migrate.ts
+ *
+ * DO NOT import this file from runtime code.
+ * Runtime code should import from src/db/pool.ts instead.
+ */
+
 import { Pool } from 'pg';
 import dotenv from 'dotenv';
 
-// Consolidated DB connection:
-// - Prefer CRAWLSY_DATABASE_URL (e.g., crawlsy_local, crawlsy_prod)
-// - Then DATABASE_URL (default)
-const DATABASE_URL =
-  process.env.CRAWLSY_DATABASE_URL ||
-  process.env.DATABASE_URL ||
-  'postgresql://dutchie:dutchie_local_pass@localhost:54320/crawlsy_local';
+// Load .env BEFORE any env var access
 dotenv.config();
 
+/**
+ * Get the database connection string from environment variables.
+ * Strict validation - will throw if required vars are missing.
+ */
+function getConnectionString(): string {
+  // Priority 1: Full connection URL
+  if (process.env.CANNAIQ_DB_URL) {
+    return process.env.CANNAIQ_DB_URL;
+  }
+
+  // Priority 2: Build from individual env vars (all required)
+  const required = ['CANNAIQ_DB_HOST', 'CANNAIQ_DB_PORT', 'CANNAIQ_DB_NAME', 'CANNAIQ_DB_USER', 'CANNAIQ_DB_PASS'];
+  const missing = required.filter((key) => !process.env[key]);
+
+  if (missing.length > 0) {
+    throw new Error(
+      `[Migrate] Missing required environment variables: ${missing.join(', ')}\n` +
+      `Either set CANNAIQ_DB_URL or all of: CANNAIQ_DB_HOST, CANNAIQ_DB_PORT, CANNAIQ_DB_NAME, CANNAIQ_DB_USER, CANNAIQ_DB_PASS`
+    );
+  }
+
+  const host = process.env.CANNAIQ_DB_HOST!;
+  const port = process.env.CANNAIQ_DB_PORT!;
+  const name = process.env.CANNAIQ_DB_NAME!;
+  const user = process.env.CANNAIQ_DB_USER!;
+  const pass = process.env.CANNAIQ_DB_PASS!;
+
+  return `postgresql://${user}:${pass}@${host}:${port}/${name}`;
+}
+
+/**
+ * Run all database migrations
+ */
+async function runMigrations() {
+  // Create pool only when migrations are actually run
   const pool = new Pool({
-    connectionString: DATABASE_URL,
+    connectionString: getConnectionString(),
   });
 
-export async function runMigrations() {
   const client = await pool.connect();
 
   try {
@@ -340,12 +380,12 @@ export async function runMigrations() {
       throw error;
     } finally {
       client.release();
+      await pool.end();
     }
   }
 
-export { pool };
-
-// Run migrations if this file is executed directly
+// Only run when executed directly (CLI mode)
+// DO NOT export pool - runtime code must use src/db/pool.ts
 if (require.main === module) {
   runMigrations()
     .then(() => process.exit(0))
94
backend/src/db/pool.ts
Normal file
@@ -0,0 +1,94 @@
/**
 * Runtime Database Pool
 *
 * This is the canonical database pool for all runtime services.
 * Import pool from here, NOT from migrate.ts.
 *
 * migrate.ts is for CLI migrations only and must NOT be imported at runtime.
 */

import dotenv from 'dotenv';
import { Pool } from 'pg';

// Load .env before any env var access
dotenv.config();

/**
 * Get the database connection string from environment variables.
 * Supports both CANNAIQ_DB_URL and individual CANNAIQ_DB_* vars.
 */
function getConnectionString(): string {
  // Priority 1: Full connection URL
  if (process.env.CANNAIQ_DB_URL) {
    return process.env.CANNAIQ_DB_URL;
  }

  // Priority 2: Build from individual env vars
  const host = process.env.CANNAIQ_DB_HOST;
  const port = process.env.CANNAIQ_DB_PORT;
  const name = process.env.CANNAIQ_DB_NAME;
  const user = process.env.CANNAIQ_DB_USER;
  const pass = process.env.CANNAIQ_DB_PASS;

  // Check if all individual vars are present
  if (host && port && name && user && pass) {
    return `postgresql://${user}:${pass}@${host}:${port}/${name}`;
  }

  // Fallback: Try DATABASE_URL for legacy compatibility
  if (process.env.DATABASE_URL) {
    return process.env.DATABASE_URL;
  }

  // Report what's missing
  const required = ['CANNAIQ_DB_HOST', 'CANNAIQ_DB_PORT', 'CANNAIQ_DB_NAME', 'CANNAIQ_DB_USER', 'CANNAIQ_DB_PASS'];
  const missing = required.filter((key) => !process.env[key]);

  throw new Error(
    `[DB Pool] Missing database configuration.\n` +
    `Set CANNAIQ_DB_URL, or all of: ${missing.join(', ')}`
  );
}

// Lazy-initialized pool singleton
let _pool: Pool | null = null;

/**
 * Get the database pool (lazy singleton)
 */
export function getPool(): Pool {
  if (!_pool) {
    _pool = new Pool({
      connectionString: getConnectionString(),
      max: 10,
      idleTimeoutMillis: 30000,
      connectionTimeoutMillis: 5000,
    });

    _pool.on('error', (err) => {
      console.error('[DB Pool] Unexpected error on idle client:', err);
    });
  }
  return _pool;
}

/**
 * The database pool for runtime use.
 * This is a getter that lazily initializes on first access.
 */
export const pool = {
  query: (...args: Parameters<Pool['query']>) => getPool().query(...args),
  connect: () => getPool().connect(),
  end: () => getPool().end(),
  on: (event: 'error' | 'connect' | 'acquire' | 'remove' | 'release', listener: (...args: any[]) => void) => getPool().on(event as any, listener),
};

/**
 * Close the pool connection
 */
export async function closePool(): Promise<void> {
  if (_pool) {
    await _pool.end();
    _pool = null;
  }
}
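A quick usage sketch for the lazy pool above; the query and table name are illustrative.

import { pool, closePool } from './pool';

// The first query lazily creates the underlying pg.Pool via getPool().
async function countStoreProducts(): Promise<number> {
  const { rows } = await pool.query('SELECT COUNT(*)::int AS n FROM store_products');
  return rows[0].n;
}

// Release all connections on graceful shutdown.
process.on('SIGTERM', () => {
  closePool().catch((err) => console.error('[shutdown] closePool failed:', err));
});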
@@ -1,4 +1,4 @@
-import { pool } from './migrate';
+import { pool } from './pool';
 import * as fs from 'fs';
 import * as path from 'path';
@@ -1,4 +1,4 @@
-import { pool } from './migrate';
+import { pool } from './pool';
 import bcrypt from 'bcrypt';
 
 export async function seedDatabase() {
@@ -1,4 +1,4 @@
-import { pool } from './migrate';
+import { pool } from './pool';
 
 async function updateCategoriesHierarchy() {
   const client = await pool.connect();
474
backend/src/discovery/city-discovery.ts
Normal file
@@ -0,0 +1,474 @@
/**
 * Dutchie City Discovery Service
 *
 * Discovers cities from the Dutchie cities page.
 * Each city can contain multiple dispensary locations.
 *
 * Source: https://dutchie.com/cities
 *
 * This module ONLY handles city discovery and upserts to dutchie_discovery_cities.
 * It does NOT create any dispensary records.
 */

import { Pool } from 'pg';
import axios from 'axios';
import * as cheerio from 'cheerio';
import {
  DiscoveryCity,
  DiscoveryCityRow,
  DutchieCityResponse,
  CityDiscoveryResult,
  mapCityRowToCity,
} from './types';

const CITIES_PAGE_URL = 'https://dutchie.com/cities';
const PLATFORM = 'dutchie';

// ============================================================
// CITY PAGE SCRAPING
// ============================================================

/**
 * Fetch and parse the Dutchie cities page.
 * Returns a list of cities with their slugs and states.
 */
export async function fetchCitiesFromPage(): Promise<DutchieCityResponse[]> {
  console.log(`[CityDiscovery] Fetching cities from ${CITIES_PAGE_URL}...`);

  const response = await axios.get(CITIES_PAGE_URL, {
    headers: {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
      'Accept-Language': 'en-US,en;q=0.9',
    },
    timeout: 30000,
  });

  const $ = cheerio.load(response.data);
  const cities: DutchieCityResponse[] = [];

  // Look for city links in various possible structures
  // Structure 1: Links in /dispensaries/{state}/{city} format
  $('a[href*="/dispensaries/"]').each((_, element) => {
    const href = $(element).attr('href') || '';
    const text = $(element).text().trim();

    // Match /dispensaries/{state}/{city} pattern
    const match = href.match(/\/dispensaries\/([a-z]{2,3})\/([a-z0-9-]+)/i);
    if (match) {
      const [, stateCode, citySlug] = match;
      cities.push({
        slug: citySlug,
        name: text || citySlug.replace(/-/g, ' '),
        stateCode: stateCode.toUpperCase(),
        countryCode: stateCode.length === 2 ? 'US' : 'CA', // 2-letter = US state, 3+ = Canadian province
      });
    }
  });

  // Structure 2: Links in /city/{slug} format
  $('a[href*="/city/"]').each((_, element) => {
    const href = $(element).attr('href') || '';
    const text = $(element).text().trim();

    const match = href.match(/\/city\/([a-z0-9-]+)/i);
    if (match) {
      const [, citySlug] = match;
      cities.push({
        slug: citySlug,
        name: text || citySlug.replace(/-/g, ' '),
      });
    }
  });

  // Dedupe by slug
  const uniqueCities = new Map<string, DutchieCityResponse>();
  for (const city of cities) {
    const key = `${city.countryCode || 'unknown'}-${city.stateCode || 'unknown'}-${city.slug}`;
    if (!uniqueCities.has(key)) {
      uniqueCities.set(key, city);
    }
  }

  const result = Array.from(uniqueCities.values());
  console.log(`[CityDiscovery] Found ${result.length} unique cities`);

  return result;
}

/**
 * Alternative: Fetch cities from Dutchie's internal API/GraphQL.
 * discoverCities tries this first and falls back to HTML scraping
 * if no endpoint responds.
 */
export async function fetchCitiesFromApi(): Promise<DutchieCityResponse[]> {
  console.log('[CityDiscovery] Attempting to fetch cities from API...');

  // Try to find the cities endpoint - this is exploratory.
  // Dutchie may expose cities via their public API.

  // Common patterns to try:
  const possibleEndpoints = [
    'https://dutchie.com/api/cities',
    'https://dutchie.com/api-3/cities',
    'https://api.dutchie.com/v1/cities',
  ];

  for (const endpoint of possibleEndpoints) {
    try {
      const response = await axios.get(endpoint, {
        headers: {
          'Accept': 'application/json',
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        },
        timeout: 10000,
        validateStatus: () => true,
      });

      if (response.status === 200 && Array.isArray(response.data)) {
        console.log(`[CityDiscovery] Found cities at ${endpoint}`);
        return response.data.map((city: any) => ({
          slug: city.slug || city.city_slug,
          name: city.name || city.city_name,
          stateCode: city.stateCode || city.state_code || city.state,
          countryCode: city.countryCode || city.country_code || city.country || 'US',
        }));
      }
    } catch {
      // Continue to next endpoint
    }
  }

  console.log('[CityDiscovery] No API endpoint found, falling back to page scraping');
  return [];
}

// ============================================================
// DATABASE OPERATIONS
// ============================================================

/**
 * Upsert a city into dutchie_discovery_cities.
 * Returns the city ID.
 */
export async function upsertCity(
  pool: Pool,
  city: DutchieCityResponse
): Promise<{ id: number; isNew: boolean }> {
  const result = await pool.query(
    `INSERT INTO dutchie_discovery_cities (
       platform,
       city_name,
       city_slug,
       state_code,
       country_code,
       updated_at
     ) VALUES ($1, $2, $3, $4, $5, NOW())
     ON CONFLICT (platform, country_code, state_code, city_slug)
     DO UPDATE SET
       city_name = EXCLUDED.city_name,
       updated_at = NOW()
     RETURNING id, (xmax = 0) as is_new`,
    [
      PLATFORM,
      city.name,
      city.slug,
      city.stateCode || null,
      city.countryCode || 'US',
    ]
  );

  return {
    id: result.rows[0].id,
    isNew: result.rows[0].is_new,
  };
}

/**
 * Mark a city as crawled and update location count.
 */
export async function markCityCrawled(
  pool: Pool,
  cityId: number,
  locationCount: number
): Promise<void> {
  await pool.query(
    `UPDATE dutchie_discovery_cities
     SET last_crawled_at = NOW(),
         location_count = $2,
         updated_at = NOW()
     WHERE id = $1`,
    [cityId, locationCount]
  );
}

/**
 * Get all cities that need to be crawled.
 */
export async function getCitiesToCrawl(
  pool: Pool,
  options: {
    stateCode?: string;
    countryCode?: string;
    limit?: number;
    onlyStale?: boolean;
    staleDays?: number;
  } = {}
): Promise<DiscoveryCity[]> {
  const {
    stateCode,
    countryCode,
    limit = 100,
    onlyStale = false,
    staleDays = 7,
  } = options;

  let query = `
    SELECT *
    FROM dutchie_discovery_cities
    WHERE crawl_enabled = TRUE
  `;
  const params: any[] = [];
  let paramIdx = 1;

  if (stateCode) {
    query += ` AND state_code = $${paramIdx}`;
    params.push(stateCode);
    paramIdx++;
  }

  if (countryCode) {
    query += ` AND country_code = $${paramIdx}`;
    params.push(countryCode);
    paramIdx++;
  }

  if (onlyStale) {
    query += ` AND (last_crawled_at IS NULL OR last_crawled_at < NOW() - INTERVAL '${staleDays} days')`;
  }

  query += ` ORDER BY last_crawled_at ASC NULLS FIRST LIMIT $${paramIdx}`;
  params.push(limit);

  const result = await pool.query<DiscoveryCityRow>(query, params);
  return result.rows.map(mapCityRowToCity);
}

/**
 * Get a city by ID.
 */
export async function getCityById(
  pool: Pool,
  id: number
): Promise<DiscoveryCity | null> {
  const result = await pool.query<DiscoveryCityRow>(
    `SELECT * FROM dutchie_discovery_cities WHERE id = $1`,
    [id]
  );

  if (result.rows.length === 0) {
    return null;
  }

  return mapCityRowToCity(result.rows[0]);
}

/**
 * Get a city by slug.
 */
export async function getCityBySlug(
  pool: Pool,
  slug: string,
  stateCode?: string,
  countryCode: string = 'US'
): Promise<DiscoveryCity | null> {
  let query = `
    SELECT * FROM dutchie_discovery_cities
    WHERE platform = $1 AND city_slug = $2 AND country_code = $3
  `;
  const params: any[] = [PLATFORM, slug, countryCode];

  if (stateCode) {
    query += ` AND state_code = $4`;
    params.push(stateCode);
  }

  const result = await pool.query<DiscoveryCityRow>(query, params);

  if (result.rows.length === 0) {
    return null;
  }

  return mapCityRowToCity(result.rows[0]);
}

// ============================================================
// MAIN DISCOVERY FUNCTION
// ============================================================

/**
 * Run the full city discovery process.
 * Fetches cities from Dutchie and upserts them into the database.
 */
export async function discoverCities(
  pool: Pool,
  options: {
    dryRun?: boolean;
    verbose?: boolean;
  } = {}
): Promise<CityDiscoveryResult> {
  const startTime = Date.now();
  const { dryRun = false, verbose = false } = options;
  const errors: string[] = [];

  console.log('[CityDiscovery] Starting city discovery...');
  console.log(`[CityDiscovery] Mode: ${dryRun ? 'DRY RUN' : 'LIVE'}`);

  // Try API first, fall back to page scraping
  let cities = await fetchCitiesFromApi();
  if (cities.length === 0) {
    cities = await fetchCitiesFromPage();
  }

  if (cities.length === 0) {
    console.log('[CityDiscovery] No cities found');
    return {
      citiesFound: 0,
      citiesUpserted: 0,
      citiesSkipped: 0,
      errors: ['No cities found from page or API'],
      durationMs: Date.now() - startTime,
    };
  }

  let upserted = 0;
  let skipped = 0;

  for (const city of cities) {
    try {
      if (dryRun) {
        if (verbose) {
          console.log(`[CityDiscovery][DryRun] Would upsert: ${city.name} (${city.stateCode}, ${city.countryCode})`);
        }
        upserted++;
        continue;
      }

      const result = await upsertCity(pool, city);
      upserted++;

      if (verbose) {
        const action = result.isNew ? 'Created' : 'Updated';
        console.log(`[CityDiscovery] ${action}: ${city.name} (${city.stateCode}, ${city.countryCode}) -> ID ${result.id}`);
      }
    } catch (error: any) {
      errors.push(`City ${city.slug}: ${error.message}`);
      skipped++;
    }
  }

  const durationMs = Date.now() - startTime;

  console.log(`[CityDiscovery] Complete: ${upserted} upserted, ${skipped} skipped, ${errors.length} errors in ${durationMs}ms`);

  return {
    citiesFound: cities.length,
    citiesUpserted: upserted,
    citiesSkipped: skipped,
    errors,
    durationMs,
  };
}

// ============================================================
// MANUAL CITY SEEDING
// ============================================================

/**
 * Seed known cities manually.
 * Use this when the cities page doesn't expose all cities.
 */
export async function seedKnownCities(
  pool: Pool,
  cities: Array<{
    name: string;
    slug: string;
    stateCode: string;
    countryCode?: string;
  }>
): Promise<{ created: number; updated: number }> {
  let created = 0;
  let updated = 0;

  for (const city of cities) {
    const result = await upsertCity(pool, {
      name: city.name,
      slug: city.slug,
      stateCode: city.stateCode,
      countryCode: city.countryCode || 'US',
    });

    if (result.isNew) {
      created++;
    } else {
      updated++;
    }
  }

  return { created, updated };
}

/**
 * Pre-defined Arizona cities for seeding.
 */
export const ARIZONA_CITIES = [
  { name: 'Phoenix', slug: 'phoenix', stateCode: 'AZ' },
  { name: 'Tucson', slug: 'tucson', stateCode: 'AZ' },
  { name: 'Mesa', slug: 'mesa', stateCode: 'AZ' },
  { name: 'Chandler', slug: 'chandler', stateCode: 'AZ' },
  { name: 'Scottsdale', slug: 'scottsdale', stateCode: 'AZ' },
  { name: 'Glendale', slug: 'glendale', stateCode: 'AZ' },
  { name: 'Gilbert', slug: 'gilbert', stateCode: 'AZ' },
  { name: 'Tempe', slug: 'tempe', stateCode: 'AZ' },
  { name: 'Peoria', slug: 'peoria', stateCode: 'AZ' },
  { name: 'Surprise', slug: 'surprise', stateCode: 'AZ' },
  { name: 'Yuma', slug: 'yuma', stateCode: 'AZ' },
  { name: 'Avondale', slug: 'avondale', stateCode: 'AZ' },
  { name: 'Flagstaff', slug: 'flagstaff', stateCode: 'AZ' },
  { name: 'Goodyear', slug: 'goodyear', stateCode: 'AZ' },
  { name: 'Lake Havasu City', slug: 'lake-havasu-city', stateCode: 'AZ' },
  { name: 'Buckeye', slug: 'buckeye', stateCode: 'AZ' },
  { name: 'Casa Grande', slug: 'casa-grande', stateCode: 'AZ' },
  { name: 'Sierra Vista', slug: 'sierra-vista', stateCode: 'AZ' },
  { name: 'Maricopa', slug: 'maricopa', stateCode: 'AZ' },
  { name: 'Oro Valley', slug: 'oro-valley', stateCode: 'AZ' },
  { name: 'Prescott', slug: 'prescott', stateCode: 'AZ' },
  { name: 'Bullhead City', slug: 'bullhead-city', stateCode: 'AZ' },
  { name: 'Prescott Valley', slug: 'prescott-valley', stateCode: 'AZ' },
  { name: 'Apache Junction', slug: 'apache-junction', stateCode: 'AZ' },
  { name: 'Marana', slug: 'marana', stateCode: 'AZ' },
  { name: 'El Mirage', slug: 'el-mirage', stateCode: 'AZ' },
  { name: 'Kingman', slug: 'kingman', stateCode: 'AZ' },
  { name: 'Queen Creek', slug: 'queen-creek', stateCode: 'AZ' },
  { name: 'San Luis', slug: 'san-luis', stateCode: 'AZ' },
  { name: 'Sahuarita', slug: 'sahuarita', stateCode: 'AZ' },
  { name: 'Fountain Hills', slug: 'fountain-hills', stateCode: 'AZ' },
  { name: 'Nogales', slug: 'nogales', stateCode: 'AZ' },
  { name: 'Douglas', slug: 'douglas', stateCode: 'AZ' },
  { name: 'Eloy', slug: 'eloy', stateCode: 'AZ' },
  { name: 'Somerton', slug: 'somerton', stateCode: 'AZ' },
  { name: 'Paradise Valley', slug: 'paradise-valley', stateCode: 'AZ' },
  { name: 'Coolidge', slug: 'coolidge', stateCode: 'AZ' },
  { name: 'Cottonwood', slug: 'cottonwood', stateCode: 'AZ' },
  { name: 'Camp Verde', slug: 'camp-verde', stateCode: 'AZ' },
  { name: 'Show Low', slug: 'show-low', stateCode: 'AZ' },
  { name: 'Payson', slug: 'payson', stateCode: 'AZ' },
  { name: 'Sedona', slug: 'sedona', stateCode: 'AZ' },
  { name: 'Winslow', slug: 'winslow', stateCode: 'AZ' },
  { name: 'Globe', slug: 'globe', stateCode: 'AZ' },
  { name: 'Safford', slug: 'safford', stateCode: 'AZ' },
  { name: 'Bisbee', slug: 'bisbee', stateCode: 'AZ' },
  { name: 'Wickenburg', slug: 'wickenburg', stateCode: 'AZ' },
  { name: 'Page', slug: 'page', stateCode: 'AZ' },
  { name: 'Holbrook', slug: 'holbrook', stateCode: 'AZ' },
  { name: 'Willcox', slug: 'willcox', stateCode: 'AZ' },
];
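The `(xmax = 0) as is_new` clause in upsertCity is a common Postgres idiom: a freshly inserted row has system column xmax = 0, while a row rewritten by ON CONFLICT ... DO UPDATE carries a nonzero xmax, so a single RETURNING distinguishes creates from updates. A short sketch of how the pieces above compose, assuming a pg Pool configured from the environment (the script itself is illustrative, not part of this diff):

import { Pool } from 'pg';
import { seedKnownCities, discoverCities, ARIZONA_CITIES } from './city-discovery';

const pool = new Pool({ connectionString: process.env.DATABASE_URL });

async function main() {
  // Seed the known Arizona list, then refresh from the live cities page/API.
  const seeded = await seedKnownCities(pool, ARIZONA_CITIES);
  console.log(`Seeded ${seeded.created} new, ${seeded.updated} existing cities`);

  const result = await discoverCities(pool, { dryRun: false, verbose: true });
  console.log(`Upserted ${result.citiesUpserted} of ${result.citiesFound} cities`);
}

main().finally(() => pool.end());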
327
backend/src/discovery/discovery-crawler.ts
Normal file
@@ -0,0 +1,327 @@
/**
 * Dutchie Discovery Crawler
 *
 * Main orchestrator for the Dutchie store discovery pipeline.
 *
 * Flow:
 * 1. Discover cities from Dutchie (or use seeded cities)
 * 2. For each city, discover store locations
 * 3. Upsert all data to discovery tables
 * 4. Admin verifies locations manually
 * 5. Verified locations are promoted to canonical dispensaries
 *
 * This module does NOT create canonical dispensaries automatically.
 */

import { Pool } from 'pg';
import {
  FullDiscoveryResult,
  LocationDiscoveryResult,
  DiscoveryCity,
} from './types';
import {
  discoverCities,
  getCitiesToCrawl,
  getCityBySlug,
  seedKnownCities,
  ARIZONA_CITIES,
} from './city-discovery';
import {
  discoverLocationsForCity,
} from './location-discovery';

// ============================================================
// FULL DISCOVERY
// ============================================================

export interface DiscoveryCrawlerOptions {
  dryRun?: boolean;
  verbose?: boolean;
  stateCode?: string;
  countryCode?: string;
  cityLimit?: number;
  skipCityDiscovery?: boolean;
  onlyStale?: boolean;
  staleDays?: number;
}

/**
 * Run the full discovery pipeline:
 * 1. Discover/refresh cities
 * 2. For each city, discover locations
 */
export async function runFullDiscovery(
  pool: Pool,
  options: DiscoveryCrawlerOptions = {}
): Promise<FullDiscoveryResult> {
  const startTime = Date.now();
  const {
    dryRun = false,
    verbose = false,
    stateCode,
    countryCode = 'US',
    cityLimit = 50,
    skipCityDiscovery = false,
    onlyStale = true,
    staleDays = 7,
  } = options;

  console.log('='.repeat(60));
  console.log('DUTCHIE DISCOVERY CRAWLER');
  console.log('='.repeat(60));
  console.log(`Mode: ${dryRun ? 'DRY RUN' : 'LIVE'}`);
  if (stateCode) console.log(`State: ${stateCode}`);
  console.log(`Country: ${countryCode}`);
  console.log(`City limit: ${cityLimit}`);
  console.log('');

  // Step 1: Discover/refresh cities
  let cityResult = {
    citiesFound: 0,
    citiesUpserted: 0,
    citiesSkipped: 0,
    errors: [] as string[],
    durationMs: 0,
  };

  if (!skipCityDiscovery) {
    console.log('[Discovery] Step 1: Discovering cities...');
    cityResult = await discoverCities(pool, { dryRun, verbose });
  } else {
    console.log('[Discovery] Step 1: Skipping city discovery (using existing cities)');
  }

  // Step 2: Get cities to crawl
  console.log('[Discovery] Step 2: Getting cities to crawl...');
  const cities = await getCitiesToCrawl(pool, {
    stateCode,
    countryCode,
    limit: cityLimit,
    onlyStale,
    staleDays,
  });

  console.log(`[Discovery] Found ${cities.length} cities to crawl`);

  // Step 3: Discover locations for each city
  console.log('[Discovery] Step 3: Discovering locations...');
  const locationResults: LocationDiscoveryResult[] = [];
  let totalLocationsFound = 0;
  let totalLocationsUpserted = 0;

  for (let i = 0; i < cities.length; i++) {
    const city = cities[i];
    console.log(`\n[Discovery] City ${i + 1}/${cities.length}: ${city.cityName}, ${city.stateCode}`);

    try {
      const result = await discoverLocationsForCity(pool, city, { dryRun, verbose });
      locationResults.push(result);
      totalLocationsFound += result.locationsFound;
      totalLocationsUpserted += result.locationsUpserted;

      // Rate limiting between cities
      if (i < cities.length - 1) {
        await new Promise((r) => setTimeout(r, 2000));
      }
    } catch (error: any) {
      console.error(`[Discovery] Error crawling ${city.cityName}: ${error.message}`);
      locationResults.push({
        cityId: city.id,
        citySlug: city.citySlug,
        locationsFound: 0,
        locationsUpserted: 0,
        locationsNew: 0,
        locationsUpdated: 0,
        errors: [error.message],
        durationMs: 0,
      });
    }
  }

  const durationMs = Date.now() - startTime;

  // Summary
  console.log('\n' + '='.repeat(60));
  console.log('DISCOVERY COMPLETE');
  console.log('='.repeat(60));
  console.log(`Duration: ${(durationMs / 1000).toFixed(1)}s`);
  console.log('');
  console.log('Cities:');
  console.log(`  Discovered: ${cityResult.citiesFound}`);
  console.log(`  Upserted: ${cityResult.citiesUpserted}`);
  console.log(`  Crawled: ${cities.length}`);
  console.log('');
  console.log('Locations:');
  console.log(`  Found: ${totalLocationsFound}`);
  console.log(`  Upserted: ${totalLocationsUpserted}`);
  console.log('');

  const totalErrors = cityResult.errors.length +
    locationResults.reduce((sum, r) => sum + r.errors.length, 0);
  if (totalErrors > 0) {
    console.log(`Errors: ${totalErrors}`);
  }

  return {
    cities: cityResult,
    locations: locationResults,
    totalLocationsFound,
    totalLocationsUpserted,
    durationMs,
  };
}

// ============================================================
// SINGLE CITY DISCOVERY
// ============================================================

/**
 * Discover locations for a single city by slug.
 */
export async function discoverCity(
  pool: Pool,
  citySlug: string,
  options: {
    stateCode?: string;
    countryCode?: string;
    dryRun?: boolean;
    verbose?: boolean;
  } = {}
): Promise<LocationDiscoveryResult | null> {
  const { stateCode, countryCode = 'US', dryRun = false, verbose = false } = options;

  // Find the city
  let city = await getCityBySlug(pool, citySlug, stateCode, countryCode);

  if (!city) {
    // Try to create it if we have enough info
    if (stateCode) {
      console.log(`[Discovery] City ${citySlug} not found, creating...`);
      await seedKnownCities(pool, [{
        name: citySlug.replace(/-/g, ' ').replace(/\b\w/g, c => c.toUpperCase()),
        slug: citySlug,
        stateCode,
        countryCode,
      }]);
      city = await getCityBySlug(pool, citySlug, stateCode, countryCode);
    }

    if (!city) {
      console.log(`[Discovery] City ${citySlug} not found and could not be created`);
      return null;
    }
  }

  return await discoverLocationsForCity(pool, city, { dryRun, verbose });
}

// ============================================================
// STATE-WIDE DISCOVERY
// ============================================================

/**
 * Seed and discover all cities for a state.
 */
export async function discoverState(
  pool: Pool,
  stateCode: string,
  options: {
    dryRun?: boolean;
    verbose?: boolean;
    cityLimit?: number;
  } = {}
): Promise<FullDiscoveryResult> {
  const { dryRun = false, verbose = false, cityLimit = 100 } = options;

  console.log(`[Discovery] Discovering state: ${stateCode}`);

  // Seed known cities for this state
  if (stateCode === 'AZ') {
    console.log('[Discovery] Seeding Arizona cities...');
    const seeded = await seedKnownCities(pool, ARIZONA_CITIES);
    console.log(`[Discovery] Seeded ${seeded.created} new cities, ${seeded.updated} updated`);
  }

  // Run full discovery for this state
  return await runFullDiscovery(pool, {
    dryRun,
    verbose,
    stateCode,
    countryCode: 'US',
    cityLimit,
    skipCityDiscovery: true, // Use seeded cities
    onlyStale: false, // Crawl all
  });
}

// ============================================================
// STATISTICS
// ============================================================

export interface DiscoveryStats {
  cities: {
    total: number;
    crawledLast24h: number;
    neverCrawled: number;
  };
  locations: {
    total: number;
    discovered: number;
    verified: number;
    rejected: number;
    merged: number;
    byState: Array<{ stateCode: string; count: number }>;
  };
}

/**
 * Get discovery statistics.
 */
export async function getDiscoveryStats(pool: Pool): Promise<DiscoveryStats> {
  const [citiesTotal, citiesRecent, citiesNever] = await Promise.all([
    pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_cities'),
    pool.query(`SELECT COUNT(*) as cnt FROM dutchie_discovery_cities WHERE last_crawled_at > NOW() - INTERVAL '24 hours'`),
    pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_cities WHERE last_crawled_at IS NULL'),
  ]);

  const [locsTotal, locsByStatus, locsByState] = await Promise.all([
    pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_locations WHERE active = TRUE'),
    pool.query(`
      SELECT status, COUNT(*) as cnt
      FROM dutchie_discovery_locations
      WHERE active = TRUE
      GROUP BY status
    `),
    pool.query(`
      SELECT state_code, COUNT(*) as cnt
      FROM dutchie_discovery_locations
      WHERE active = TRUE AND state_code IS NOT NULL
      GROUP BY state_code
      ORDER BY cnt DESC
    `),
  ]);

  const statusCounts = locsByStatus.rows.reduce((acc, row) => {
    acc[row.status] = parseInt(row.cnt, 10);
    return acc;
  }, {} as Record<string, number>);

  return {
    cities: {
      total: parseInt(citiesTotal.rows[0].cnt, 10),
      crawledLast24h: parseInt(citiesRecent.rows[0].cnt, 10),
      neverCrawled: parseInt(citiesNever.rows[0].cnt, 10),
    },
    locations: {
      total: parseInt(locsTotal.rows[0].cnt, 10),
      discovered: statusCounts.discovered || 0,
      verified: statusCounts.verified || 0,
      rejected: statusCounts.rejected || 0,
      merged: statusCounts.merged || 0,
      byState: locsByState.rows.map(row => ({
        stateCode: row.state_code,
        count: parseInt(row.cnt, 10),
      })),
    },
  };
}
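A sketch of driving the orchestrator from a one-off script (the pool setup and the choice of state are illustrative assumptions; the diff itself only defines the functions):

import { Pool } from 'pg';
import { discoverState, getDiscoveryStats } from './discovery-crawler';

const pool = new Pool({ connectionString: process.env.DATABASE_URL });

async function main() {
  // Dry-run Arizona first to gauge volume without writing anything...
  await discoverState(pool, 'AZ', { dryRun: true, verbose: true });

  // ...then check the counters that the admin dashboard reads.
  const stats = await getDiscoveryStats(pool);
  console.log(`Cities never crawled: ${stats.cities.neverCrawled}`);
  console.log(`Locations awaiting review: ${stats.locations.discovered}`);
}

main().finally(() => pool.end());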
37
backend/src/discovery/index.ts
Normal file
@@ -0,0 +1,37 @@
/**
 * Dutchie Discovery Module
 *
 * Exports all discovery-related functionality for use in the main application.
 */

// Types
export * from './types';

// City Discovery
export {
  discoverCities,
  getCitiesToCrawl,
  getCityBySlug,
  seedKnownCities,
  ARIZONA_CITIES,
} from './city-discovery';

// Location Discovery
export {
  discoverLocationsForCity,
  fetchLocationsForCity,
  upsertLocation,
} from './location-discovery';

// Discovery Crawler (Orchestrator)
export {
  runFullDiscovery,
  discoverCity,
  discoverState,
  getDiscoveryStats,
  DiscoveryCrawlerOptions,
  DiscoveryStats,
} from './discovery-crawler';

// Routes
export { createDiscoveryRoutes } from './routes';
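With the barrel file in place, consumers import from the module root instead of individual files. A sketch of mounting the routes in an Express app (the app bootstrap shown is an assumption; the diff only defines the router factory):

import express from 'express';
import { Pool } from 'pg';
import { createDiscoveryRoutes } from './discovery';

const app = express();
app.use(express.json());

const pool = new Pool({ connectionString: process.env.DATABASE_URL });

// Every endpoint defined in routes.ts lands under /api/discovery/*.
app.use('/api/discovery', createDiscoveryRoutes(pool));

app.listen(3000);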
686
backend/src/discovery/location-discovery.ts
Normal file
@@ -0,0 +1,686 @@
/**
 * Dutchie Location Discovery Service
 *
 * Discovers store locations from Dutchie city pages.
 * Each city can contain multiple dispensary locations.
 *
 * This module:
 * 1. Fetches location listings for a given city
 * 2. Upserts locations into dutchie_discovery_locations
 * 3. Does NOT create any canonical dispensary records
 *
 * Locations remain in "discovered" status until manually verified.
 */

import { Pool } from 'pg';
import axios from 'axios';
import puppeteer from 'puppeteer-extra';
import type { Browser, Page, Protocol } from 'puppeteer';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import {
  DiscoveryCity,
  DiscoveryLocation,
  DiscoveryLocationRow,
  DutchieLocationResponse,
  LocationDiscoveryResult,
  DiscoveryStatus,
  mapLocationRowToLocation,
} from './types';

puppeteer.use(StealthPlugin());

const PLATFORM = 'dutchie';

// ============================================================
// GRAPHQL / API FETCHING
// ============================================================

interface SessionCredentials {
  cookies: string;
  userAgent: string;
  browser: Browser;
  page: Page;
}

/**
 * Create a browser session for fetching location data.
 */
async function createSession(citySlug: string): Promise<SessionCredentials> {
  const browser = await puppeteer.launch({
    headless: 'new',
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-blink-features=AutomationControlled',
    ],
  });

  const page = await browser.newPage();
  const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

  await page.setUserAgent(userAgent);
  await page.setViewport({ width: 1920, height: 1080 });
  await page.evaluateOnNewDocument(() => {
    Object.defineProperty(navigator, 'webdriver', { get: () => false });
    (window as any).chrome = { runtime: {} };
  });

  // Navigate to a dispensaries page to get cookies
  const url = `https://dutchie.com/dispensaries/az/${citySlug}`;
  console.log(`[LocationDiscovery] Loading ${url} to establish session...`);

  try {
    await page.goto(url, {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });
    await new Promise((r) => setTimeout(r, 2000));
  } catch (error: any) {
    console.warn(`[LocationDiscovery] Navigation warning: ${error.message}`);
  }

  const cookies = await page.cookies();
  const cookieString = cookies.map((c: Protocol.Network.Cookie) => `${c.name}=${c.value}`).join('; ');

  return { cookies: cookieString, userAgent, browser, page };
}

async function closeSession(session: SessionCredentials): Promise<void> {
  await session.browser.close();
}

/**
 * Fetch locations for a city using Dutchie's internal search API.
 */
export async function fetchLocationsForCity(
  city: DiscoveryCity,
  options: {
    session?: SessionCredentials;
    verbose?: boolean;
  } = {}
): Promise<DutchieLocationResponse[]> {
  const { verbose = false } = options;
  let session = options.session;
  let shouldCloseSession = false;

  if (!session) {
    session = await createSession(city.citySlug);
    shouldCloseSession = true;
  }

  try {
    console.log(`[LocationDiscovery] Fetching locations for ${city.cityName}, ${city.stateCode}...`);

    // Try multiple approaches to get location data

    // Approach 1: Extract from page __NEXT_DATA__ or similar
    const locations = await extractLocationsFromPage(session.page, verbose);
    if (locations.length > 0) {
      console.log(`[LocationDiscovery] Found ${locations.length} locations from page data`);
      return locations;
    }

    // Approach 2: Try the geo-based GraphQL query
    const geoLocations = await fetchLocationsViaGraphQL(session, city, verbose);
    if (geoLocations.length > 0) {
      console.log(`[LocationDiscovery] Found ${geoLocations.length} locations from GraphQL`);
      return geoLocations;
    }

    // Approach 3: Scrape visible location cards
    const scrapedLocations = await scrapeLocationCards(session.page, verbose);
    if (scrapedLocations.length > 0) {
      console.log(`[LocationDiscovery] Found ${scrapedLocations.length} locations from scraping`);
      return scrapedLocations;
    }

    console.log(`[LocationDiscovery] No locations found for ${city.cityName}`);
    return [];
  } finally {
    if (shouldCloseSession) {
      await closeSession(session);
    }
  }
}

/**
 * Extract locations from the page's embedded data (__NEXT_DATA__, window.*, etc.)
 */
async function extractLocationsFromPage(
  page: Page,
  verbose: boolean
): Promise<DutchieLocationResponse[]> {
  try {
    const data = await page.evaluate(() => {
      // Try __NEXT_DATA__
      const nextDataEl = document.querySelector('#__NEXT_DATA__');
      if (nextDataEl?.textContent) {
        try {
          const nextData = JSON.parse(nextDataEl.textContent);
          // Look for dispensaries in various paths
          const dispensaries =
            nextData?.props?.pageProps?.dispensaries ||
            nextData?.props?.pageProps?.initialDispensaries ||
            nextData?.props?.pageProps?.data?.dispensaries ||
            [];
          if (Array.isArray(dispensaries) && dispensaries.length > 0) {
            return { source: '__NEXT_DATA__', dispensaries };
          }
        } catch {
          // Ignore parse errors
        }
      }

      // Try window variables
      const win = window as any;
      if (win.__APOLLO_STATE__) {
        // Extract from Apollo cache
        const entries = Object.entries(win.__APOLLO_STATE__).filter(
          ([key]) => key.startsWith('Dispensary:')
        );
        if (entries.length > 0) {
          return { source: 'APOLLO_STATE', dispensaries: entries.map(([, v]) => v) };
        }
      }

      return { source: 'none', dispensaries: [] };
    });

    if (verbose) {
      console.log(`[LocationDiscovery] Page data source: ${data.source}, count: ${data.dispensaries.length}`);
    }

    return data.dispensaries.map((d: any) => normalizeLocationResponse(d));
  } catch (error: any) {
    if (verbose) {
      console.log(`[LocationDiscovery] Could not extract from page data: ${error.message}`);
    }
    return [];
  }
}

/**
 * Fetch locations via GraphQL geo-based query.
 */
async function fetchLocationsViaGraphQL(
  session: SessionCredentials,
  city: DiscoveryCity,
  verbose: boolean
): Promise<DutchieLocationResponse[]> {
  // Use a known center point for the city or default to a central US location
  const CITY_COORDS: Record<string, { lat: number; lng: number }> = {
    'phoenix': { lat: 33.4484, lng: -112.074 },
    'tucson': { lat: 32.2226, lng: -110.9747 },
    'scottsdale': { lat: 33.4942, lng: -111.9261 },
    'mesa': { lat: 33.4152, lng: -111.8315 },
    'tempe': { lat: 33.4255, lng: -111.94 },
    'flagstaff': { lat: 35.1983, lng: -111.6513 },
    // Add more as needed
  };

  const coords = CITY_COORDS[city.citySlug] || { lat: 33.4484, lng: -112.074 };

  const variables = {
    dispensariesFilter: {
      latitude: coords.lat,
      longitude: coords.lng,
      distance: 50, // miles
      state: city.stateCode,
      city: city.cityName,
    },
  };

  const hash = '0a5bfa6ca1d64ae47bcccb7c8077c87147cbc4e6982c17ceec97a2a4948b311b';

  try {
    const response = await axios.post(
      'https://dutchie.com/api-3/graphql',
      {
        operationName: 'ConsumerDispensaries',
        variables,
        extensions: {
          persistedQuery: { version: 1, sha256Hash: hash },
        },
      },
      {
        headers: {
          'content-type': 'application/json',
          'origin': 'https://dutchie.com',
          'referer': `https://dutchie.com/dispensaries/${city.stateCode?.toLowerCase()}/${city.citySlug}`,
          'user-agent': session.userAgent,
          'cookie': session.cookies,
        },
        timeout: 30000,
        validateStatus: () => true,
      }
    );

    if (response.status !== 200) {
      if (verbose) {
        console.log(`[LocationDiscovery] GraphQL returned ${response.status}`);
      }
      return [];
    }

    const dispensaries = response.data?.data?.consumerDispensaries || [];
    return dispensaries.map((d: any) => normalizeLocationResponse(d));
  } catch (error: any) {
    if (verbose) {
      console.log(`[LocationDiscovery] GraphQL error: ${error.message}`);
    }
    return [];
  }
}

/**
 * Scrape location cards from the visible page.
 */
async function scrapeLocationCards(
  page: Page,
  verbose: boolean
): Promise<DutchieLocationResponse[]> {
  try {
    const locations = await page.evaluate(() => {
      const cards: any[] = [];

      // Look for common dispensary card patterns
      const selectors = [
        '[data-testid="dispensary-card"]',
        '.dispensary-card',
        'a[href*="/dispensary/"]',
        '[class*="DispensaryCard"]',
      ];

      for (const selector of selectors) {
        const elements = document.querySelectorAll(selector);
        if (elements.length > 0) {
          elements.forEach((el) => {
            const link = el.querySelector('a')?.href || (el as HTMLAnchorElement).href || '';
            const name = el.querySelector('h2, h3, [class*="name"]')?.textContent?.trim() || '';
            const address = el.querySelector('[class*="address"], address')?.textContent?.trim() || '';

            // Extract slug from URL
            const slugMatch = link.match(/\/dispensary\/([^/?]+)/);
            const slug = slugMatch ? slugMatch[1] : '';

            if (slug && name) {
              cards.push({
                slug,
                name,
                address,
                menuUrl: link,
              });
            }
          });
          break; // Stop after first successful selector
        }
      }

      return cards;
    });

    return locations.map((d: any) => ({
      id: '',
      name: d.name,
      slug: d.slug,
      address: d.address,
      menuUrl: d.menuUrl,
    }));
  } catch (error: any) {
    if (verbose) {
      console.log(`[LocationDiscovery] Scraping error: ${error.message}`);
    }
    return [];
  }
}

/**
 * Normalize a raw location response to a consistent format.
 */
function normalizeLocationResponse(raw: any): DutchieLocationResponse {
  const slug = raw.slug || raw.cName || raw.urlSlug || '';
  const id = raw.id || raw._id || raw.dispensaryId || '';

  return {
    // Preserve the raw data first so the normalized fields below take
    // precedence (spreading raw last would clobber them).
    ...raw,
    id,
    name: raw.name || raw.dispensaryName || '',
    slug,
    address: raw.address || raw.fullAddress || '',
    address1: raw.address1 || raw.addressLine1 || raw.streetAddress || '',
    address2: raw.address2 || raw.addressLine2 || '',
    city: raw.city || '',
    state: raw.state || raw.stateCode || '',
    zip: raw.zip || raw.zipCode || raw.postalCode || '',
    country: raw.country || raw.countryCode || 'US',
    latitude: raw.latitude || raw.lat || raw.location?.latitude,
    longitude: raw.longitude || raw.lng || raw.location?.longitude,
    timezone: raw.timezone || raw.tz || '',
    menuUrl: raw.menuUrl || (slug ? `https://dutchie.com/dispensary/${slug}` : ''),
    retailType: raw.retailType || raw.type || '',
    offerPickup: raw.offerPickup ?? raw.storeSettings?.offerPickup ?? true,
    offerDelivery: raw.offerDelivery ?? raw.storeSettings?.offerDelivery ?? false,
    isRecreational: raw.isRecreational ?? raw.retailType?.includes('Recreational') ?? true,
    isMedical: raw.isMedical ?? raw.retailType?.includes('Medical') ?? true,
  };
}

// ============================================================
// DATABASE OPERATIONS
// ============================================================

/**
 * Upsert a location into dutchie_discovery_locations.
 */
export async function upsertLocation(
  pool: Pool,
  location: DutchieLocationResponse,
  cityId: number | null
): Promise<{ id: number; isNew: boolean }> {
  const platformLocationId = location.id || location.slug;
  const menuUrl = location.menuUrl || `https://dutchie.com/dispensary/${location.slug}`;

  const result = await pool.query(
    `INSERT INTO dutchie_discovery_locations (
       platform,
       platform_location_id,
       platform_slug,
       platform_menu_url,
       name,
       raw_address,
       address_line1,
       address_line2,
       city,
       state_code,
       postal_code,
       country_code,
       latitude,
       longitude,
       timezone,
       discovery_city_id,
       metadata,
       offers_delivery,
       offers_pickup,
       is_recreational,
       is_medical,
       last_seen_at,
       updated_at
     ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, NOW(), NOW())
     ON CONFLICT (platform, platform_location_id)
     DO UPDATE SET
       name = EXCLUDED.name,
       platform_menu_url = EXCLUDED.platform_menu_url,
       raw_address = COALESCE(EXCLUDED.raw_address, dutchie_discovery_locations.raw_address),
       address_line1 = COALESCE(EXCLUDED.address_line1, dutchie_discovery_locations.address_line1),
       city = COALESCE(EXCLUDED.city, dutchie_discovery_locations.city),
       state_code = COALESCE(EXCLUDED.state_code, dutchie_discovery_locations.state_code),
       postal_code = COALESCE(EXCLUDED.postal_code, dutchie_discovery_locations.postal_code),
       latitude = COALESCE(EXCLUDED.latitude, dutchie_discovery_locations.latitude),
       longitude = COALESCE(EXCLUDED.longitude, dutchie_discovery_locations.longitude),
       timezone = COALESCE(EXCLUDED.timezone, dutchie_discovery_locations.timezone),
       metadata = EXCLUDED.metadata,
       offers_delivery = COALESCE(EXCLUDED.offers_delivery, dutchie_discovery_locations.offers_delivery),
       offers_pickup = COALESCE(EXCLUDED.offers_pickup, dutchie_discovery_locations.offers_pickup),
       is_recreational = COALESCE(EXCLUDED.is_recreational, dutchie_discovery_locations.is_recreational),
       is_medical = COALESCE(EXCLUDED.is_medical, dutchie_discovery_locations.is_medical),
       last_seen_at = NOW(),
       updated_at = NOW()
     RETURNING id, (xmax = 0) as is_new`,
    [
      PLATFORM,
      platformLocationId,
      location.slug,
      menuUrl,
      location.name,
      location.address || null,
      location.address1 || null,
      location.address2 || null,
      location.city || null,
      location.state || null,
      location.zip || null,
      location.country || 'US',
      location.latitude || null,
      location.longitude || null,
      location.timezone || null,
      cityId,
      JSON.stringify(location),
      location.offerDelivery ?? null,
      location.offerPickup ?? null,
      location.isRecreational ?? null,
      location.isMedical ?? null,
    ]
  );

  return {
    id: result.rows[0].id,
    isNew: result.rows[0].is_new,
  };
}

/**
 * Get locations by status.
 */
export async function getLocationsByStatus(
  pool: Pool,
  status: DiscoveryStatus,
  options: {
    stateCode?: string;
    countryCode?: string;
    limit?: number;
    offset?: number;
  } = {}
): Promise<DiscoveryLocation[]> {
  const { stateCode, countryCode, limit = 100, offset = 0 } = options;

  let query = `
    SELECT * FROM dutchie_discovery_locations
    WHERE status = $1 AND active = TRUE
  `;
  const params: any[] = [status];
  let paramIdx = 2;

  if (stateCode) {
    query += ` AND state_code = $${paramIdx}`;
    params.push(stateCode);
    paramIdx++;
  }

  if (countryCode) {
    query += ` AND country_code = $${paramIdx}`;
    params.push(countryCode);
    paramIdx++;
  }

  query += ` ORDER BY first_seen_at DESC LIMIT $${paramIdx} OFFSET $${paramIdx + 1}`;
  params.push(limit, offset);

  const result = await pool.query<DiscoveryLocationRow>(query, params);
  return result.rows.map(mapLocationRowToLocation);
}

/**
 * Get a location by ID.
 */
export async function getLocationById(
  pool: Pool,
  id: number
): Promise<DiscoveryLocation | null> {
  const result = await pool.query<DiscoveryLocationRow>(
    `SELECT * FROM dutchie_discovery_locations WHERE id = $1`,
    [id]
  );

  if (result.rows.length === 0) {
    return null;
  }

  return mapLocationRowToLocation(result.rows[0]);
}

/**
 * Update location status.
 */
export async function updateLocationStatus(
  pool: Pool,
  locationId: number,
  status: DiscoveryStatus,
  options: {
    dispensaryId?: number;
    verifiedBy?: string;
    notes?: string;
  } = {}
): Promise<void> {
  const { dispensaryId, verifiedBy, notes } = options;

  await pool.query(
    `UPDATE dutchie_discovery_locations
     SET status = $2,
         dispensary_id = COALESCE($3, dispensary_id),
         verified_at = CASE WHEN $2 IN ('verified', 'merged') THEN NOW() ELSE verified_at END,
         verified_by = COALESCE($4, verified_by),
         notes = COALESCE($5, notes),
         updated_at = NOW()
     WHERE id = $1`,
    [locationId, status, dispensaryId || null, verifiedBy || null, notes || null]
  );
}

/**
 * Search locations by name or address.
 */
export async function searchLocations(
  pool: Pool,
  query: string,
  options: {
    status?: DiscoveryStatus;
    stateCode?: string;
    limit?: number;
  } = {}
): Promise<DiscoveryLocation[]> {
  const { status, stateCode, limit = 50 } = options;
  const searchPattern = `%${query}%`;

  let sql = `
    SELECT * FROM dutchie_discovery_locations
    WHERE active = TRUE
      AND (name ILIKE $1 OR city ILIKE $1 OR raw_address ILIKE $1 OR platform_slug ILIKE $1)
  `;
  const params: any[] = [searchPattern];
  let paramIdx = 2;

  if (status) {
    sql += ` AND status = $${paramIdx}`;
    params.push(status);
    paramIdx++;
  }

  if (stateCode) {
    sql += ` AND state_code = $${paramIdx}`;
    params.push(stateCode);
    paramIdx++;
  }

  sql += ` ORDER BY name LIMIT $${paramIdx}`;
  params.push(limit);

  const result = await pool.query<DiscoveryLocationRow>(sql, params);
  return result.rows.map(mapLocationRowToLocation);
}

// ============================================================
// MAIN DISCOVERY FUNCTION
// ============================================================

/**
 * Discover locations for a specific city.
 */
export async function discoverLocationsForCity(
  pool: Pool,
  city: DiscoveryCity,
  options: {
    dryRun?: boolean;
    verbose?: boolean;
  } = {}
): Promise<LocationDiscoveryResult> {
  const startTime = Date.now();
  const { dryRun = false, verbose = false } = options;
  const errors: string[] = [];

  console.log(`[LocationDiscovery] Discovering locations for ${city.cityName}, ${city.stateCode}...`);
  console.log(`[LocationDiscovery] Mode: ${dryRun ? 'DRY RUN' : 'LIVE'}`);

  const locations = await fetchLocationsForCity(city, { verbose });

  if (locations.length === 0) {
    console.log(`[LocationDiscovery] No locations found for ${city.cityName}`);
    return {
      cityId: city.id,
      citySlug: city.citySlug,
      locationsFound: 0,
      locationsUpserted: 0,
      locationsNew: 0,
      locationsUpdated: 0,
      errors: [],
      durationMs: Date.now() - startTime,
    };
  }

  let newCount = 0;
  let updatedCount = 0;

  for (const location of locations) {
    try {
      if (dryRun) {
        if (verbose) {
          console.log(`[LocationDiscovery][DryRun] Would upsert: ${location.name} (${location.slug})`);
        }
        newCount++;
        continue;
      }

      const result = await upsertLocation(pool, location, city.id);

      if (result.isNew) {
        newCount++;
      } else {
        updatedCount++;
      }

      if (verbose) {
        const action = result.isNew ? 'Created' : 'Updated';
        console.log(`[LocationDiscovery] ${action}: ${location.name} -> ID ${result.id}`);
      }
    } catch (error: any) {
      errors.push(`Location ${location.slug}: ${error.message}`);
    }
  }

  // Update city crawl status
  if (!dryRun) {
    await pool.query(
      `UPDATE dutchie_discovery_cities
       SET last_crawled_at = NOW(),
           location_count = $2,
           updated_at = NOW()
       WHERE id = $1`,
      [city.id, locations.length]
    );
  }

  const durationMs = Date.now() - startTime;

  console.log(`[LocationDiscovery] Complete for ${city.cityName}: ${newCount} new, ${updatedCount} updated, ${errors.length} errors in ${durationMs}ms`);

  return {
    cityId: city.id,
    citySlug: city.citySlug,
    locationsFound: locations.length,
    locationsUpserted: newCount + updatedCount,
    locationsNew: newCount,
    locationsUpdated: updatedCount,
    errors,
    durationMs,
  };
}
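fetchLocationsForCity tries its three strategies in decreasing order of fidelity: embedded __NEXT_DATA__/Apollo state, then the persisted GraphQL query, then raw DOM scraping. A sketch of the manual-review loop the status helpers support, assuming a configured pool (the filter rule is purely illustrative, and the loc.id/loc.name field names assume the camelCase mapping in ./types):

import { Pool } from 'pg';
import { getLocationsByStatus, updateLocationStatus } from './location-discovery';

const pool = new Pool({ connectionString: process.env.DATABASE_URL });

async function rejectObviousTestEntries() {
  // Pull a batch of unverified Arizona locations...
  const pending = await getLocationsByStatus(pool, 'discovered', {
    stateCode: 'AZ',
    limit: 50,
  });

  // ...and reject anything that looks like a demo/test store.
  for (const loc of pending) {
    if (/\b(test|demo)\b/i.test(loc.name)) {
      await updateLocationStatus(pool, loc.id, 'rejected', {
        verifiedBy: 'cleanup-script',
        notes: 'Auto-rejected: test/demo name',
      });
    }
  }
}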
840
backend/src/discovery/routes.ts
Normal file
@@ -0,0 +1,840 @@
|
||||
/**
|
||||
* Dutchie Discovery API Routes
|
||||
*
|
||||
* Express routes for the Dutchie store discovery pipeline.
|
||||
* Provides endpoints for discovering, listing, and verifying locations.
|
||||
*/
|
||||
|
||||
import { Router, Request, Response } from 'express';
|
||||
import { Pool } from 'pg';
|
||||
import {
|
||||
runFullDiscovery,
|
||||
discoverCity,
|
||||
discoverState,
|
||||
getDiscoveryStats,
|
||||
} from './discovery-crawler';
|
||||
import {
|
||||
discoverCities,
|
||||
getCitiesToCrawl,
|
||||
getCityBySlug,
|
||||
seedKnownCities,
|
||||
ARIZONA_CITIES,
|
||||
} from './city-discovery';
|
||||
import {
|
||||
DiscoveryLocation,
|
||||
DiscoveryCity,
|
||||
DiscoveryStatus,
|
||||
mapLocationRowToLocation,
|
||||
mapCityRowToCity,
|
||||
} from './types';
|
||||
|
||||
export function createDiscoveryRoutes(pool: Pool): Router {
|
||||
const router = Router();
|
||||
|
||||
// ============================================================
|
||||
// DISCOVERY LOCATIONS
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* GET /api/discovery/locations
|
||||
* List discovered locations with filtering
|
||||
*/
|
||||
router.get('/locations', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const {
|
||||
status,
|
||||
stateCode,
|
||||
countryCode,
|
||||
city,
|
||||
platform = 'dutchie',
|
||||
search,
|
||||
hasDispensary,
|
||||
limit = '50',
|
||||
offset = '0',
|
||||
} = req.query;
|
||||
|
||||
let whereClause = 'WHERE platform = $1 AND active = TRUE';
|
||||
const params: any[] = [platform];
|
||||
let paramIndex = 2;
|
||||
|
||||
if (status) {
|
||||
whereClause += ` AND status = $${paramIndex}`;
|
||||
params.push(status);
|
||||
paramIndex++;
|
||||
}
|
||||
|
||||
if (stateCode) {
|
||||
whereClause += ` AND state_code = $${paramIndex}`;
|
||||
params.push(stateCode);
|
||||
paramIndex++;
|
||||
}
|
||||
|
||||
if (countryCode) {
|
||||
whereClause += ` AND country_code = $${paramIndex}`;
|
||||
params.push(countryCode);
|
||||
paramIndex++;
|
||||
}
|
||||
|
||||
if (city) {
|
||||
whereClause += ` AND city ILIKE $${paramIndex}`;
|
||||
params.push(`%${city}%`);
|
||||
paramIndex++;
|
||||
}
|
||||
|
||||
if (search) {
|
||||
whereClause += ` AND (name ILIKE $${paramIndex} OR platform_slug ILIKE $${paramIndex})`;
|
||||
params.push(`%${search}%`);
|
||||
paramIndex++;
|
||||
}
|
||||
|
||||
if (hasDispensary === 'true') {
|
||||
whereClause += ' AND dispensary_id IS NOT NULL';
|
||||
} else if (hasDispensary === 'false') {
|
||||
whereClause += ' AND dispensary_id IS NULL';
|
||||
}
|
||||
|
||||
params.push(parseInt(limit as string, 10), parseInt(offset as string, 10));
|
||||
|
||||
const { rows } = await pool.query(
|
||||
`
|
||||
SELECT
|
||||
dl.*,
|
||||
d.name as dispensary_name,
|
||||
dc.city_name as discovery_city_name
|
||||
FROM dutchie_discovery_locations dl
|
||||
LEFT JOIN dispensaries d ON dl.dispensary_id = d.id
|
||||
LEFT JOIN dutchie_discovery_cities dc ON dl.discovery_city_id = dc.id
|
||||
${whereClause}
|
||||
ORDER BY dl.first_seen_at DESC
|
||||
LIMIT $${paramIndex} OFFSET $${paramIndex + 1}
|
||||
`,
|
||||
params
|
||||
);
|
||||
|
||||
const { rows: countRows } = await pool.query(
|
||||
`SELECT COUNT(*) as total FROM dutchie_discovery_locations dl ${whereClause}`,
|
||||
params.slice(0, -2)
|
||||
);
|
||||
|
||||
const locations = rows.map((row: any) => ({
|
||||
...mapLocationRowToLocation(row),
|
||||
dispensaryName: row.dispensary_name,
|
||||
discoveryCityName: row.discovery_city_name,
|
||||
}));
|
||||
|
||||
res.json({
|
||||
locations,
|
||||
total: parseInt(countRows[0]?.total || '0', 10),
|
||||
limit: parseInt(limit as string, 10),
|
||||
offset: parseInt(offset as string, 10),
|
||||
});
|
||||
} catch (error: any) {
|
||||
res.status(500).json({ error: error.message });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* GET /api/discovery/locations/:id
|
||||
* Get a single discovery location
|
||||
*/
|
||||
router.get('/locations/:id', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const { id } = req.params;
|
||||
|
||||
const { rows } = await pool.query(
|
||||
`
|
||||
SELECT
|
||||
dl.*,
|
||||
d.name as dispensary_name,
|
||||
d.menu_url as dispensary_menu_url,
|
||||
dc.city_name as discovery_city_name
|
||||
FROM dutchie_discovery_locations dl
|
||||
LEFT JOIN dispensaries d ON dl.dispensary_id = d.id
|
||||
LEFT JOIN dutchie_discovery_cities dc ON dl.discovery_city_id = dc.id
|
||||
WHERE dl.id = $1
|
||||
`,
|
||||
[parseInt(id, 10)]
|
||||
);
|
||||
|
||||
if (rows.length === 0) {
|
||||
return res.status(404).json({ error: 'Location not found' });
|
||||
}
|
||||
|
||||
res.json({
|
||||
...mapLocationRowToLocation(rows[0]),
|
||||
dispensaryName: rows[0].dispensary_name,
|
||||
dispensaryMenuUrl: rows[0].dispensary_menu_url,
|
||||
discoveryCityName: rows[0].discovery_city_name,
|
||||
});
|
||||
} catch (error: any) {
|
||||
res.status(500).json({ error: error.message });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* GET /api/discovery/locations/pending
|
||||
* Get locations awaiting verification
|
||||
*/
|
||||
router.get('/locations/pending', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const { stateCode, countryCode, limit = '100' } = req.query;
|
||||
|
||||
let whereClause = `WHERE status = 'discovered' AND active = TRUE`;
|
||||
const params: any[] = [];
|
||||
let paramIndex = 1;
|
||||
|
||||
if (stateCode) {
|
||||
whereClause += ` AND state_code = $${paramIndex}`;
|
||||
params.push(stateCode);
|
||||
paramIndex++;
|
||||
}
|
||||
|
||||
if (countryCode) {
|
||||
whereClause += ` AND country_code = $${paramIndex}`;
|
||||
params.push(countryCode);
|
||||
paramIndex++;
|
||||
}
|
||||
|
||||
params.push(parseInt(limit as string, 10));
|
||||
|
||||
const { rows } = await pool.query(
|
||||
`
|
||||
SELECT * FROM dutchie_discovery_locations
|
||||
${whereClause}
|
||||
ORDER BY state_code, city, name
|
||||
LIMIT $${paramIndex}
|
||||
`,
|
||||
params
|
||||
);
|
||||
|
||||
res.json({
|
||||
locations: rows.map(mapLocationRowToLocation),
|
||||
total: rows.length,
|
||||
});
|
||||
} catch (error: any) {
|
||||
res.status(500).json({ error: error.message });
|
||||
}
|
||||
});

// ============================================================
// DISCOVERY CITIES
// ============================================================

/**
 * GET /api/discovery/cities
 * List discovery cities
 */
router.get('/cities', async (req: Request, res: Response) => {
  try {
    const {
      stateCode,
      countryCode,
      crawlEnabled,
      platform = 'dutchie',
      limit = '100',
      offset = '0',
    } = req.query;

    let whereClause = 'WHERE platform = $1';
    const params: any[] = [platform];
    let paramIndex = 2;

    if (stateCode) {
      whereClause += ` AND state_code = $${paramIndex}`;
      params.push(stateCode);
      paramIndex++;
    }

    if (countryCode) {
      whereClause += ` AND country_code = $${paramIndex}`;
      params.push(countryCode);
      paramIndex++;
    }

    if (crawlEnabled === 'true') {
      whereClause += ' AND crawl_enabled = TRUE';
    } else if (crawlEnabled === 'false') {
      whereClause += ' AND crawl_enabled = FALSE';
    }

    params.push(parseInt(limit as string, 10), parseInt(offset as string, 10));

    const { rows } = await pool.query(
      `
      SELECT
        dc.*,
        (SELECT COUNT(*) FROM dutchie_discovery_locations dl WHERE dl.discovery_city_id = dc.id) as actual_location_count
      FROM dutchie_discovery_cities dc
      ${whereClause}
      ORDER BY dc.country_code, dc.state_code, dc.city_name
      LIMIT $${paramIndex} OFFSET $${paramIndex + 1}
      `,
      params
    );

    const { rows: countRows } = await pool.query(
      `SELECT COUNT(*) as total FROM dutchie_discovery_cities dc ${whereClause}`,
      params.slice(0, -2)
    );

    const cities = rows.map((row: any) => ({
      ...mapCityRowToCity(row),
      actualLocationCount: parseInt(row.actual_location_count || '0', 10),
    }));

    res.json({
      cities,
      total: parseInt(countRows[0]?.total || '0', 10),
      limit: parseInt(limit as string, 10),
      offset: parseInt(offset as string, 10),
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});

// ============================================================
// STATISTICS
// ============================================================

/**
 * GET /api/discovery/stats
 * Get discovery statistics
 */
router.get('/stats', async (_req: Request, res: Response) => {
  try {
    const stats = await getDiscoveryStats(pool);
    res.json(stats);
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});

// ============================================================
// VERIFICATION ACTIONS
// ============================================================

/**
 * POST /api/discovery/locations/:id/verify
 * Verify a discovered location and create a new canonical dispensary
 */
router.post('/locations/:id/verify', async (req: Request, res: Response) => {
  try {
    const { id } = req.params;
    const { verifiedBy = 'admin' } = req.body;

    // Get the discovery location
    const { rows: locRows } = await pool.query(
      `SELECT * FROM dutchie_discovery_locations WHERE id = $1`,
      [parseInt(id, 10)]
    );

    if (locRows.length === 0) {
      return res.status(404).json({ error: 'Location not found' });
    }

    const location = locRows[0];

    if (location.status !== 'discovered') {
      return res.status(400).json({
        error: `Location already has status: ${location.status}`,
      });
    }

    // Create the canonical dispensary
    const { rows: dispRows } = await pool.query(
      `
      INSERT INTO dispensaries (
        name,
        slug,
        address,
        city,
        state,
        zip,
        latitude,
        longitude,
        timezone,
        menu_type,
        menu_url,
        platform_dispensary_id,
        active,
        created_at,
        updated_at
      ) VALUES (
        $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, TRUE, NOW(), NOW()
      )
      RETURNING id
      `,
      [
        location.name,
        location.platform_slug,
        location.address_line1,
        location.city,
        location.state_code,
        location.postal_code,
        location.latitude,
        location.longitude,
        location.timezone,
        location.platform,
        location.platform_menu_url,
        location.platform_location_id,
      ]
    );

    const dispensaryId = dispRows[0].id;

    // Update the discovery location
    await pool.query(
      `
      UPDATE dutchie_discovery_locations
      SET status = 'verified',
          dispensary_id = $1,
          verified_at = NOW(),
          verified_by = $2,
          updated_at = NOW()
      WHERE id = $3
      `,
      [dispensaryId, verifiedBy, id]
    );

    res.json({
      success: true,
      action: 'created',
      discoveryId: parseInt(id, 10),
      dispensaryId,
      message: `Created new dispensary (ID: ${dispensaryId})`,
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});
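
// Example (reviewer sketch): driving the verify action from a client. The
// endpoint path comes from the route above; the fetch wrapper and locationId
// variable are illustrative.
//
//   await fetch(`/api/discovery/locations/${locationId}/verify`, {
//     method: 'POST',
//     headers: { 'Content-Type': 'application/json' },
//     body: JSON.stringify({ verifiedBy: 'admin' }),
//   });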

/**
 * POST /api/discovery/locations/:id/link
 * Link a discovered location to an existing dispensary
 */
router.post('/locations/:id/link', async (req: Request, res: Response) => {
  try {
    const { id } = req.params;
    const { dispensaryId, verifiedBy = 'admin' } = req.body;

    if (!dispensaryId) {
      return res.status(400).json({ error: 'dispensaryId is required' });
    }

    // Verify dispensary exists
    const { rows: dispRows } = await pool.query(
      `SELECT id, name FROM dispensaries WHERE id = $1`,
      [dispensaryId]
    );

    if (dispRows.length === 0) {
      return res.status(404).json({ error: 'Dispensary not found' });
    }

    // Get the discovery location
    const { rows: locRows } = await pool.query(
      `SELECT * FROM dutchie_discovery_locations WHERE id = $1`,
      [parseInt(id, 10)]
    );

    if (locRows.length === 0) {
      return res.status(404).json({ error: 'Location not found' });
    }

    const location = locRows[0];

    if (location.status !== 'discovered') {
      return res.status(400).json({
        error: `Location already has status: ${location.status}`,
      });
    }

    // Update dispensary with platform info if missing
    await pool.query(
      `
      UPDATE dispensaries
      SET platform_dispensary_id = COALESCE(platform_dispensary_id, $1),
          menu_url = COALESCE(menu_url, $2),
          menu_type = COALESCE(menu_type, $3),
          updated_at = NOW()
      WHERE id = $4
      `,
      [
        location.platform_location_id,
        location.platform_menu_url,
        location.platform,
        dispensaryId,
      ]
    );

    // Update the discovery location
    await pool.query(
      `
      UPDATE dutchie_discovery_locations
      SET status = 'merged',
          dispensary_id = $1,
          verified_at = NOW(),
          verified_by = $2,
          updated_at = NOW()
      WHERE id = $3
      `,
      [dispensaryId, verifiedBy, id]
    );

    res.json({
      success: true,
      action: 'linked',
      discoveryId: parseInt(id, 10),
      dispensaryId,
      dispensaryName: dispRows[0].name,
      message: `Linked to existing dispensary: ${dispRows[0].name}`,
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});

/**
 * POST /api/discovery/locations/:id/reject
 * Reject a discovered location
 */
router.post('/locations/:id/reject', async (req: Request, res: Response) => {
  try {
    const { id } = req.params;
    const { reason, verifiedBy = 'admin' } = req.body;

    const { rows } = await pool.query(
      `SELECT status FROM dutchie_discovery_locations WHERE id = $1`,
      [parseInt(id, 10)]
    );

    if (rows.length === 0) {
      return res.status(404).json({ error: 'Location not found' });
    }

    if (rows[0].status !== 'discovered') {
      return res.status(400).json({
        error: `Location already has status: ${rows[0].status}`,
      });
    }

    await pool.query(
      `
      UPDATE dutchie_discovery_locations
      SET status = 'rejected',
          verified_at = NOW(),
          verified_by = $1,
          notes = $2,
          updated_at = NOW()
      WHERE id = $3
      `,
      [verifiedBy, reason || 'Rejected by admin', id]
    );

    res.json({
      success: true,
      action: 'rejected',
      discoveryId: parseInt(id, 10),
      message: 'Location rejected',
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});

/**
 * POST /api/discovery/locations/:id/unreject
 * Restore a rejected location back to discovered status
 */
router.post('/locations/:id/unreject', async (req: Request, res: Response) => {
  try {
    const { id } = req.params;

    const { rows } = await pool.query(
      `SELECT status FROM dutchie_discovery_locations WHERE id = $1`,
      [parseInt(id, 10)]
    );

    if (rows.length === 0) {
      return res.status(404).json({ error: 'Location not found' });
    }

    if (rows[0].status !== 'rejected') {
      return res.status(400).json({
        error: `Location is not rejected. Current status: ${rows[0].status}`,
      });
    }

    await pool.query(
      `
      UPDATE dutchie_discovery_locations
      SET status = 'discovered',
          verified_at = NULL,
          verified_by = NULL,
          updated_at = NOW()
      WHERE id = $1
      `,
      [id]
    );

    res.json({
      success: true,
      action: 'unrejected',
      discoveryId: parseInt(id, 10),
      message: 'Location restored to discovered status',
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});

// ============================================================
// DISCOVERY ADMIN ACTIONS
// ============================================================

/**
 * POST /api/discovery/admin/discover-state
 * Run discovery for an entire state
 */
router.post('/admin/discover-state', async (req: Request, res: Response) => {
  try {
    const { stateCode, dryRun = false, cityLimit = 100 } = req.body;

    if (!stateCode) {
      return res.status(400).json({ error: 'stateCode is required' });
    }

    console.log(`[Discovery API] Starting state discovery for ${stateCode}`);
    const result = await discoverState(pool, stateCode, {
      dryRun,
      cityLimit,
      verbose: true,
    });

    res.json({
      success: true,
      stateCode,
      result,
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});

/**
 * POST /api/discovery/admin/discover-city
 * Run discovery for a single city
 */
router.post('/admin/discover-city', async (req: Request, res: Response) => {
  try {
    const { citySlug, stateCode, countryCode = 'US', dryRun = false } = req.body;

    if (!citySlug) {
      return res.status(400).json({ error: 'citySlug is required' });
    }

    console.log(`[Discovery API] Starting city discovery for ${citySlug}`);
    const result = await discoverCity(pool, citySlug, {
      stateCode,
      countryCode,
      dryRun,
      verbose: true,
    });

    if (!result) {
      return res.status(404).json({ error: `City not found: ${citySlug}` });
    }

    res.json({
      success: true,
      citySlug,
      result,
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});

/**
 * POST /api/discovery/admin/run-full
 * Run full discovery pipeline
 */
router.post('/admin/run-full', async (req: Request, res: Response) => {
  try {
    const {
      stateCode,
      countryCode = 'US',
      cityLimit = 50,
      skipCityDiscovery = false,
      onlyStale = true,
      staleDays = 7,
      dryRun = false,
    } = req.body;

    console.log(`[Discovery API] Starting full discovery`);
    const result = await runFullDiscovery(pool, {
      stateCode,
      countryCode,
      cityLimit,
      skipCityDiscovery,
      onlyStale,
      staleDays,
      dryRun,
      verbose: true,
    });

    res.json({
      success: true,
      result,
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});
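
// Example (reviewer sketch): a dry-run full discovery for one state, using the
// request body fields destructured above. Values are illustrative.
//
//   await fetch('/api/discovery/admin/run-full', {
//     method: 'POST',
//     headers: { 'Content-Type': 'application/json' },
//     body: JSON.stringify({ stateCode: 'AZ', cityLimit: 10, dryRun: true }),
//   });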

/**
 * POST /api/discovery/admin/seed-cities
 * Seed known cities for a state
 */
router.post('/admin/seed-cities', async (req: Request, res: Response) => {
  try {
    const { stateCode } = req.body;

    if (!stateCode) {
      return res.status(400).json({ error: 'stateCode is required' });
    }

    let cities: any[] = [];
    if (stateCode === 'AZ') {
      cities = ARIZONA_CITIES;
    } else {
      return res.status(400).json({
        error: `No predefined cities for state: ${stateCode}. Add cities to city-discovery.ts`,
      });
    }

    const result = await seedKnownCities(pool, cities);

    res.json({
      success: true,
      stateCode,
      ...result,
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});

/**
 * GET /api/discovery/admin/match-candidates/:id
 * Find potential dispensary matches for a discovery location
 */
router.get('/admin/match-candidates/:id', async (req: Request, res: Response) => {
  try {
    const { id } = req.params;

    // Get the discovery location
    const { rows: locRows } = await pool.query(
      `SELECT * FROM dutchie_discovery_locations WHERE id = $1`,
      [parseInt(id, 10)]
    );

    if (locRows.length === 0) {
      return res.status(404).json({ error: 'Location not found' });
    }

    const location = locRows[0];

    // Find potential matches by name similarity and location
    const { rows: candidates } = await pool.query(
      `
      SELECT
        d.id,
        d.name,
        d.city,
        d.state,
        d.address,
        d.menu_type,
        d.platform_dispensary_id,
        d.menu_url,
        d.latitude,
        d.longitude,
        CASE
          WHEN d.name ILIKE $1 THEN 'exact_name'
          WHEN d.name ILIKE $2 THEN 'partial_name'
          WHEN d.city ILIKE $3 AND d.state = $4 THEN 'same_city'
          ELSE 'location_match'
        END as match_type,
        -- Distance in miles if coordinates available
        CASE
          WHEN d.latitude IS NOT NULL AND d.longitude IS NOT NULL
            AND $5::float IS NOT NULL AND $6::float IS NOT NULL
          THEN (3959 * acos(
            cos(radians($5::float)) * cos(radians(d.latitude)) *
            cos(radians(d.longitude) - radians($6::float)) +
            sin(radians($5::float)) * sin(radians(d.latitude))
          ))
          ELSE NULL
        END as distance_miles
      FROM dispensaries d
      WHERE d.state = $4
        AND (
          d.name ILIKE $1
          OR d.name ILIKE $2
          OR d.city ILIKE $3
          OR (
            d.latitude IS NOT NULL
            AND d.longitude IS NOT NULL
            AND $5::float IS NOT NULL
            AND $6::float IS NOT NULL
            AND (3959 * acos(
              cos(radians($5::float)) * cos(radians(d.latitude)) *
              cos(radians(d.longitude) - radians($6::float)) +
              sin(radians($5::float)) * sin(radians(d.latitude))
            )) < 5
          )
        )
      ORDER BY
        CASE
          WHEN d.name ILIKE $1 THEN 1
          WHEN d.name ILIKE $2 THEN 2
          ELSE 3
        END,
        distance_miles NULLS LAST
      LIMIT 10
      `,
      [
        location.name,
        `%${location.name.split(' ')[0]}%`,
        location.city,
        location.state_code,
        location.latitude,
        location.longitude,
      ]
    );

    res.json({
      location: mapLocationRowToLocation(location),
      candidates: candidates.map((c: any) => ({
        id: c.id,
        name: c.name,
        city: c.city,
        state: c.state,
        address: c.address,
        menuType: c.menu_type,
        platformDispensaryId: c.platform_dispensary_id,
        menuUrl: c.menu_url,
        matchType: c.match_type,
        distanceMiles: c.distance_miles ? Math.round(c.distance_miles * 10) / 10 : null,
      })),
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});
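
// NOTE (reviewer sketch): the distance expression above is the spherical law of
// cosines, d = R * acos(sin(lat1)sin(lat2) + cos(lat1)cos(lat2)cos(lon1 - lon2)),
// with R = 3959 miles (Earth's mean radius); the `< 5` predicate keeps
// candidates within roughly five miles of the discovered location.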

  return router;
}

export default createDiscoveryRoutes;
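
// Example (reviewer sketch): wiring the factory into an Express app. The import
// paths and the getPool() call are assumptions based on the db module in this PR.
//
//   import express from 'express';
//   import createDiscoveryRoutes from './routes/discovery';
//   import { getPool } from './db/pool';
//
//   const app = express();
//   app.use(express.json());
//   app.use('/api/discovery', createDiscoveryRoutes(getPool()));
//   app.listen(3000);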

269
backend/src/discovery/types.ts
Normal file
@@ -0,0 +1,269 @@
/**
 * Dutchie Discovery Types
 *
 * Type definitions for the Dutchie store discovery pipeline.
 */

// ============================================================
// DISCOVERY CITY
// ============================================================

export interface DiscoveryCity {
  id: number;
  platform: string;
  cityName: string;
  citySlug: string;
  stateCode: string | null;
  countryCode: string;
  lastCrawledAt: Date | null;
  crawlEnabled: boolean;
  locationCount: number | null;
  notes: string | null;
  metadata: Record<string, any> | null;
  createdAt: Date;
  updatedAt: Date;
}

export interface DiscoveryCityRow {
  id: number;
  platform: string;
  city_name: string;
  city_slug: string;
  state_code: string | null;
  country_code: string;
  last_crawled_at: Date | null;
  crawl_enabled: boolean;
  location_count: number | null;
  notes: string | null;
  metadata: Record<string, any> | null;
  created_at: Date;
  updated_at: Date;
}

// ============================================================
// DISCOVERY LOCATION
// ============================================================

export type DiscoveryStatus = 'discovered' | 'verified' | 'rejected' | 'merged';

export interface DiscoveryLocation {
  id: number;
  platform: string;
  platformLocationId: string;
  platformSlug: string;
  platformMenuUrl: string;
  name: string;
  rawAddress: string | null;
  addressLine1: string | null;
  addressLine2: string | null;
  city: string | null;
  stateCode: string | null;
  postalCode: string | null;
  countryCode: string | null;
  latitude: number | null;
  longitude: number | null;
  timezone: string | null;
  status: DiscoveryStatus;
  dispensaryId: number | null;
  discoveryCityId: number | null;
  metadata: Record<string, any> | null;
  notes: string | null;
  offersDelivery: boolean | null;
  offersPickup: boolean | null;
  isRecreational: boolean | null;
  isMedical: boolean | null;
  firstSeenAt: Date;
  lastSeenAt: Date;
  lastCheckedAt: Date | null;
  verifiedAt: Date | null;
  verifiedBy: string | null;
  active: boolean;
  createdAt: Date;
  updatedAt: Date;
}

export interface DiscoveryLocationRow {
  id: number;
  platform: string;
  platform_location_id: string;
  platform_slug: string;
  platform_menu_url: string;
  name: string;
  raw_address: string | null;
  address_line1: string | null;
  address_line2: string | null;
  city: string | null;
  state_code: string | null;
  postal_code: string | null;
  country_code: string | null;
  latitude: number | null;
  longitude: number | null;
  timezone: string | null;
  status: DiscoveryStatus;
  dispensary_id: number | null;
  discovery_city_id: number | null;
  metadata: Record<string, any> | null;
  notes: string | null;
  offers_delivery: boolean | null;
  offers_pickup: boolean | null;
  is_recreational: boolean | null;
  is_medical: boolean | null;
  first_seen_at: Date;
  last_seen_at: Date;
  last_checked_at: Date | null;
  verified_at: Date | null;
  verified_by: string | null;
  active: boolean;
  created_at: Date;
  updated_at: Date;
}

// ============================================================
// RAW API RESPONSES
// ============================================================

export interface DutchieCityResponse {
  slug: string;
  name: string;
  state?: string;
  stateCode?: string;
  country?: string;
  countryCode?: string;
}

export interface DutchieLocationResponse {
  id: string;
  name: string;
  slug: string;
  address?: string;
  address1?: string;
  address2?: string;
  city?: string;
  state?: string;
  zip?: string;
  zipCode?: string;
  country?: string;
  latitude?: number;
  longitude?: number;
  timezone?: string;
  menuUrl?: string;
  retailType?: string;
  offerPickup?: boolean;
  offerDelivery?: boolean;
  isRecreational?: boolean;
  isMedical?: boolean;
  // Raw response preserved
  [key: string]: any;
}

// ============================================================
// DISCOVERY RESULTS
// ============================================================

export interface CityDiscoveryResult {
  citiesFound: number;
  citiesUpserted: number;
  citiesSkipped: number;
  errors: string[];
  durationMs: number;
}

export interface LocationDiscoveryResult {
  cityId: number;
  citySlug: string;
  locationsFound: number;
  locationsUpserted: number;
  locationsNew: number;
  locationsUpdated: number;
  errors: string[];
  durationMs: number;
}

export interface FullDiscoveryResult {
  cities: CityDiscoveryResult;
  locations: LocationDiscoveryResult[];
  totalLocationsFound: number;
  totalLocationsUpserted: number;
  durationMs: number;
}

// ============================================================
// VERIFICATION
// ============================================================

export interface VerificationResult {
  success: boolean;
  discoveryId: number;
  dispensaryId: number | null;
  action: 'created' | 'linked' | 'rejected';
  error?: string;
}

export interface PromotionResult {
  success: boolean;
  discoveryId: number;
  dispensaryId: number;
  crawlProfileId?: number;
  scheduleId?: number;
  error?: string;
}

// ============================================================
// MAPPER FUNCTIONS
// ============================================================

export function mapCityRowToCity(row: DiscoveryCityRow): DiscoveryCity {
  return {
    id: row.id,
    platform: row.platform,
    cityName: row.city_name,
    citySlug: row.city_slug,
    stateCode: row.state_code,
    countryCode: row.country_code,
    lastCrawledAt: row.last_crawled_at,
    crawlEnabled: row.crawl_enabled,
    locationCount: row.location_count,
    notes: row.notes,
    metadata: row.metadata,
    createdAt: row.created_at,
    updatedAt: row.updated_at,
  };
}

export function mapLocationRowToLocation(row: DiscoveryLocationRow): DiscoveryLocation {
  return {
    id: row.id,
    platform: row.platform,
    platformLocationId: row.platform_location_id,
    platformSlug: row.platform_slug,
    platformMenuUrl: row.platform_menu_url,
    name: row.name,
    rawAddress: row.raw_address,
    addressLine1: row.address_line1,
    addressLine2: row.address_line2,
    city: row.city,
    stateCode: row.state_code,
    postalCode: row.postal_code,
    countryCode: row.country_code,
    latitude: row.latitude,
    longitude: row.longitude,
    timezone: row.timezone,
    status: row.status,
    dispensaryId: row.dispensary_id,
    discoveryCityId: row.discovery_city_id,
    metadata: row.metadata,
    notes: row.notes,
    offersDelivery: row.offers_delivery,
    offersPickup: row.offers_pickup,
    isRecreational: row.is_recreational,
    isMedical: row.is_medical,
    firstSeenAt: row.first_seen_at,
    lastSeenAt: row.last_seen_at,
    lastCheckedAt: row.last_checked_at,
    verifiedAt: row.verified_at,
    verifiedBy: row.verified_by,
    active: row.active,
    createdAt: row.created_at,
    updatedAt: row.updated_at,
  };
}
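
// Example (reviewer sketch): the mappers translate snake_case rows into the
// camelCase API shape. A typical call site might look like:
//
//   const { rows } = await pool.query('SELECT * FROM dutchie_discovery_locations LIMIT 1');
//   const location: DiscoveryLocation = mapLocationRowToLocation(rows[0]);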

@@ -1,50 +1,99 @@
/**
 * Dutchie AZ Database Connection
 * CannaiQ Database Connection
 *
 * Isolated database connection for Dutchie Arizona data.
 * Uses a separate database/schema to prevent cross-contamination with main app data.
 * All database access for the CannaiQ platform goes through this module.
 *
 * SINGLE DATABASE ARCHITECTURE:
 * - All services (auth, orchestrator, crawlers, admin) use this ONE database
 * - States are modeled via states table + state_id on dispensaries (not separate DBs)
 *
 * CONFIGURATION (in priority order):
 * 1. CANNAIQ_DB_URL - Full connection string (preferred)
 * 2. Individual vars: CANNAIQ_DB_HOST, CANNAIQ_DB_PORT, CANNAIQ_DB_NAME, CANNAIQ_DB_USER, CANNAIQ_DB_PASS
 * 3. DATABASE_URL - Legacy fallback for K8s compatibility
 *
 * IMPORTANT:
 * - Do NOT create separate pools elsewhere
 * - All services should import from this module
 */

import { Pool, PoolClient } from 'pg';

// Consolidated DB naming:
// - Prefer CRAWLSY_DATABASE_URL (e.g., crawlsy_local, crawlsy_prod)
// - Then DUTCHIE_AZ_DATABASE_URL (legacy)
// - Finally DATABASE_URL (legacy main DB)
const DUTCHIE_AZ_DATABASE_URL =
  process.env.CRAWLSY_DATABASE_URL ||
  process.env.DUTCHIE_AZ_DATABASE_URL ||
  process.env.DATABASE_URL ||
  'postgresql://dutchie:dutchie_local_pass@localhost:54320/crawlsy_local';
/**
 * Get the database connection string from environment variables.
 * Supports multiple configuration methods with fallback for legacy compatibility.
 */
function getConnectionString(): string {
  // Priority 1: Full CANNAIQ connection URL
  if (process.env.CANNAIQ_DB_URL) {
    return process.env.CANNAIQ_DB_URL;
  }

  // Priority 2: Build from individual CANNAIQ env vars
  const host = process.env.CANNAIQ_DB_HOST;
  const port = process.env.CANNAIQ_DB_PORT;
  const name = process.env.CANNAIQ_DB_NAME;
  const user = process.env.CANNAIQ_DB_USER;
  const pass = process.env.CANNAIQ_DB_PASS;

  if (host && port && name && user && pass) {
    return `postgresql://${user}:${pass}@${host}:${port}/${name}`;
  }

  // Priority 3: Fallback to DATABASE_URL for legacy/K8s compatibility
  if (process.env.DATABASE_URL) {
    return process.env.DATABASE_URL;
  }

  // Report what's missing
  const required = ['CANNAIQ_DB_HOST', 'CANNAIQ_DB_PORT', 'CANNAIQ_DB_NAME', 'CANNAIQ_DB_USER', 'CANNAIQ_DB_PASS'];
  const missing = required.filter((key) => !process.env[key]);

  throw new Error(
    `[CannaiQ DB] Missing database configuration.\n` +
    `Set CANNAIQ_DB_URL, DATABASE_URL, or all of: ${missing.join(', ')}`
  );
}

let pool: Pool | null = null;

/**
 * Get the Dutchie AZ database pool (singleton)
 * Get the CannaiQ database pool (singleton)
 *
 * This is the canonical pool for all CannaiQ services.
 * Do NOT create separate pools elsewhere.
 */
export function getDutchieAZPool(): Pool {
export function getPool(): Pool {
  if (!pool) {
    pool = new Pool({
      connectionString: DUTCHIE_AZ_DATABASE_URL,
      connectionString: getConnectionString(),
      max: 10,
      idleTimeoutMillis: 30000,
      connectionTimeoutMillis: 5000,
    });

    pool.on('error', (err) => {
      console.error('[DutchieAZ DB] Unexpected error on idle client:', err);
      console.error('[CannaiQ DB] Unexpected error on idle client:', err);
    });

    console.log('[DutchieAZ DB] Pool initialized');
    console.log('[CannaiQ DB] Pool initialized');
  }
  return pool;
}

/**
 * Execute a query on the Dutchie AZ database
 * @deprecated Use getPool() instead
 */
export function getDutchieAZPool(): Pool {
  console.warn('[CannaiQ DB] getDutchieAZPool() is deprecated. Use getPool() instead.');
  return getPool();
}

/**
 * Execute a query on the CannaiQ database
 */
export async function query<T = any>(text: string, params?: any[]): Promise<{ rows: T[]; rowCount: number }> {
  const p = getDutchieAZPool();
  const p = getPool();
  const result = await p.query(text, params);
  return { rows: result.rows as T[], rowCount: result.rowCount || 0 };
}
@@ -53,7 +102,7 @@ export async function query<T = any>(text: string, params?: any[]): Promise<{ ro
 * Get a client from the pool for transaction use
 */
export async function getClient(): Promise<PoolClient> {
  const p = getDutchieAZPool();
  const p = getPool();
  return p.connect();
}
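
// Example (reviewer sketch): typical transaction usage of getClient(); the
// statements are placeholders.
//
//   const client = await getClient();
//   try {
//     await client.query('BEGIN');
//     await client.query('UPDATE dispensaries SET active = FALSE WHERE id = $1', [id]);
//     await client.query('COMMIT');
//   } catch (err) {
//     await client.query('ROLLBACK');
//     throw err;
//   } finally {
//     client.release();
//   }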

@@ -64,7 +113,7 @@ export async function closePool(): Promise<void> {
  if (pool) {
    await pool.end();
    pool = null;
    console.log('[DutchieAZ DB] Pool closed');
    console.log('[CannaiQ DB] Pool closed');
  }
}

@@ -76,7 +125,7 @@ export async function healthCheck(): Promise<boolean> {
    const result = await query('SELECT 1 as ok');
    return result.rows.length > 0 && result.rows[0].ok === 1;
  } catch (error) {
    console.error('[DutchieAZ DB] Health check failed:', error);
    console.error('[CannaiQ DB] Health check failed:', error);
    return false;
  }
}
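
// Example (reviewer sketch): a minimal local environment matching the priority
// order documented at the top of this module. Values are illustrative only.
//
//   CANNAIQ_DB_URL=postgresql://cannaiq:cannaiq@localhost:5432/cannaiq
//   # or, equivalently, the individual parts:
//   # CANNAIQ_DB_HOST=localhost CANNAIQ_DB_PORT=5432 CANNAIQ_DB_NAME=cannaiq
//   # CANNAIQ_DB_USER=cannaiq CANNAIQ_DB_PASS=cannaiq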
137
backend/src/dutchie-az/db/dispensary-columns.ts
Normal file
@@ -0,0 +1,137 @@
/**
 * Dispensary Column Definitions
 *
 * Centralized column list for dispensaries table queries.
 * Handles optional columns that may not exist in all environments.
 *
 * USAGE:
 *   import { DISPENSARY_COLUMNS, DISPENSARY_COLUMNS_WITH_FAILED } from '../db/dispensary-columns';
 *   const result = await query(`SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE ...`);
 */

/**
 * Core dispensary columns that always exist.
 * These are guaranteed to be present in all environments.
 */
const CORE_COLUMNS = `
  id, name, slug, city, state, zip, address, latitude, longitude,
  menu_type, menu_url, platform_dispensary_id, website,
  created_at, updated_at
`;

/**
 * Optional columns with NULL fallback.
 *
 * provider_detection_data: added in migration 044
 * active_crawler_profile_id: added in migration 041
 *
 * Selecting the real column only works once the migration has run:
 * - If the column exists: the query returns the actual value
 * - If the column doesn't exist: the query fails (so the migration should be run)
 *
 * For pre-migration compatibility, we select NULL::jsonb, which always works.
 * After migration 044 is applied, this can be changed to the real column.
 */

// TEMPORARY: Use NULL fallback until migration 044 is applied
// After running 044, change this to: provider_detection_data
const PROVIDER_DETECTION_COLUMN = `NULL::jsonb AS provider_detection_data`;

// After migration 044 is applied, uncomment this line and remove the above:
// const PROVIDER_DETECTION_COLUMN = `provider_detection_data`;

/**
 * Standard dispensary columns for most queries.
 * Includes provider_detection_data with NULL fallback for pre-migration compatibility.
 */
export const DISPENSARY_COLUMNS = `${CORE_COLUMNS.trim()},
  ${PROVIDER_DETECTION_COLUMN}`;

/**
 * Dispensary columns including active_crawler_profile_id.
 * Used by routes that need profile information.
 */
export const DISPENSARY_COLUMNS_WITH_PROFILE = `${CORE_COLUMNS.trim()},
  ${PROVIDER_DETECTION_COLUMN},
  active_crawler_profile_id`;

/**
 * Dispensary columns including failed_at.
 * Used by the worker for compatibility checks.
 */
export const DISPENSARY_COLUMNS_WITH_FAILED = `${CORE_COLUMNS.trim()},
  ${PROVIDER_DETECTION_COLUMN},
  failed_at`;

/**
 * NOTE: After migration 044 is applied, update PROVIDER_DETECTION_COLUMN above
 * to use the real column instead of the NULL fallback.
 *
 * To verify migration status:
 *   SELECT column_name FROM information_schema.columns
 *   WHERE table_name = 'dispensaries' AND column_name = 'provider_detection_data';
 */

// Cache for column existence check
let _providerDetectionColumnExists: boolean | null = null;

/**
 * Check if the provider_detection_data column exists in the dispensaries table.
 * The result is cached after the first check.
 */
export async function hasProviderDetectionColumn(pool: { query: (sql: string) => Promise<{ rows: any[] }> }): Promise<boolean> {
  if (_providerDetectionColumnExists !== null) {
    return _providerDetectionColumnExists;
  }

  try {
    const result = await pool.query(`
      SELECT 1 FROM information_schema.columns
      WHERE table_name = 'dispensaries' AND column_name = 'provider_detection_data'
    `);
    _providerDetectionColumnExists = result.rows.length > 0;
  } catch {
    _providerDetectionColumnExists = false;
  }

  return _providerDetectionColumnExists;
}

/**
 * Safely update the provider_detection_data column.
 * If the column doesn't exist, logs a warning but doesn't crash.
 *
 * @param pool - Database pool with query method
 * @param dispensaryId - ID of dispensary to update
 * @param data - JSONB data to merge into provider_detection_data
 * @returns true if the update succeeded, false if the column doesn't exist
 */
export async function safeUpdateProviderDetectionData(
  pool: { query: (sql: string, params?: any[]) => Promise<any> },
  dispensaryId: number,
  data: Record<string, any>
): Promise<boolean> {
  const hasColumn = await hasProviderDetectionColumn(pool);

  if (!hasColumn) {
    console.warn(`[DispensaryColumns] provider_detection_data column not found. Run migration 044 to add it.`);
    return false;
  }

  try {
    await pool.query(
      `UPDATE dispensaries
       SET provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) || $1::jsonb,
           updated_at = NOW()
       WHERE id = $2`,
      [JSON.stringify(data), dispensaryId]
    );
    return true;
  } catch (error: any) {
    if (error.message?.includes('provider_detection_data')) {
      console.warn(`[DispensaryColumns] Failed to update provider_detection_data: ${error.message}`);
      return false;
    }
    throw error;
  }
}
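
// Example (reviewer sketch): recording a detection result without assuming
// migration 044 has run. The payload shape is illustrative.
//
//   const ok = await safeUpdateProviderDetectionData(pool, dispensaryId, {
//     provider: 'dutchie',
//     detectedAt: new Date().toISOString(),
//   });
//   if (!ok) console.warn('detection data not persisted (column missing)');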

403
backend/src/dutchie-az/discovery/DtCityDiscoveryService.ts
Normal file
@@ -0,0 +1,403 @@
/**
 * DtCityDiscoveryService
 *
 * Core service for Dutchie city discovery.
 * Contains shared logic used by multiple entrypoints.
 *
 * Responsibilities:
 * - Browser/API-based city fetching
 * - Manual city seeding
 * - City upsert operations
 */

import { Pool } from 'pg';
import axios from 'axios';
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';

puppeteer.use(StealthPlugin());

// ============================================================
// TYPES
// ============================================================

export interface DutchieCity {
  name: string;
  slug: string;
  stateCode: string | null;
  countryCode: string;
  url?: string;
}

export interface CityDiscoveryResult {
  citiesFound: number;
  citiesInserted: number;
  citiesUpdated: number;
  errors: string[];
  durationMs: number;
}

export interface ManualSeedResult {
  city: DutchieCity;
  id: number;
  wasInserted: boolean;
}

// ============================================================
// US STATE CODE MAPPING
// ============================================================

export const US_STATE_MAP: Record<string, string> = {
  'alabama': 'AL', 'alaska': 'AK', 'arizona': 'AZ', 'arkansas': 'AR',
  'california': 'CA', 'colorado': 'CO', 'connecticut': 'CT', 'delaware': 'DE',
  'florida': 'FL', 'georgia': 'GA', 'hawaii': 'HI', 'idaho': 'ID',
  'illinois': 'IL', 'indiana': 'IN', 'iowa': 'IA', 'kansas': 'KS',
  'kentucky': 'KY', 'louisiana': 'LA', 'maine': 'ME', 'maryland': 'MD',
  'massachusetts': 'MA', 'michigan': 'MI', 'minnesota': 'MN', 'mississippi': 'MS',
  'missouri': 'MO', 'montana': 'MT', 'nebraska': 'NE', 'nevada': 'NV',
  'new-hampshire': 'NH', 'new-jersey': 'NJ', 'new-mexico': 'NM', 'new-york': 'NY',
  'north-carolina': 'NC', 'north-dakota': 'ND', 'ohio': 'OH', 'oklahoma': 'OK',
  'oregon': 'OR', 'pennsylvania': 'PA', 'rhode-island': 'RI', 'south-carolina': 'SC',
  'south-dakota': 'SD', 'tennessee': 'TN', 'texas': 'TX', 'utah': 'UT',
  'vermont': 'VT', 'virginia': 'VA', 'washington': 'WA', 'west-virginia': 'WV',
  'wisconsin': 'WI', 'wyoming': 'WY', 'district-of-columbia': 'DC',
};

// Canadian province mapping
export const CA_PROVINCE_MAP: Record<string, string> = {
  'alberta': 'AB', 'british-columbia': 'BC', 'manitoba': 'MB',
  'new-brunswick': 'NB', 'newfoundland-and-labrador': 'NL',
  'northwest-territories': 'NT', 'nova-scotia': 'NS', 'nunavut': 'NU',
  'ontario': 'ON', 'prince-edward-island': 'PE', 'quebec': 'QC',
  'saskatchewan': 'SK', 'yukon': 'YT',
};

// ============================================================
// CITY FETCHING (AUTO DISCOVERY)
// ============================================================

/**
 * Fetch cities from Dutchie's /cities page using Puppeteer.
 */
export async function fetchCitiesFromBrowser(): Promise<DutchieCity[]> {
  console.log('[DtCityDiscoveryService] Launching browser to fetch cities...');

  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
  });

  try {
    const page = await browser.newPage();
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    );

    console.log('[DtCityDiscoveryService] Navigating to https://dutchie.com/cities...');
    await page.goto('https://dutchie.com/cities', {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });

    await new Promise((r) => setTimeout(r, 3000));

    const cities = await page.evaluate(() => {
      const cityLinks: Array<{
        name: string;
        slug: string;
        url: string;
        stateSlug: string | null;
      }> = [];

      const links = document.querySelectorAll('a[href*="/city/"]');
      links.forEach((link) => {
        const href = (link as HTMLAnchorElement).href;
        const text = (link as HTMLElement).innerText?.trim();

        const match = href.match(/\/city\/([^/]+)\/([^/?]+)/);
        if (match && text) {
          cityLinks.push({
            name: text,
            slug: match[2],
            url: href,
            stateSlug: match[1],
          });
        }
      });

      return cityLinks;
    });

    console.log(`[DtCityDiscoveryService] Extracted ${cities.length} city links from page`);

    return cities.map((city) => {
      let countryCode = 'US';
      let stateCode: string | null = null;

      if (city.stateSlug) {
        if (US_STATE_MAP[city.stateSlug]) {
          stateCode = US_STATE_MAP[city.stateSlug];
          countryCode = 'US';
        } else if (CA_PROVINCE_MAP[city.stateSlug]) {
          stateCode = CA_PROVINCE_MAP[city.stateSlug];
          countryCode = 'CA';
        } else if (city.stateSlug.length === 2) {
          stateCode = city.stateSlug.toUpperCase();
          if (Object.values(CA_PROVINCE_MAP).includes(stateCode)) {
            countryCode = 'CA';
          }
        }
      }

      return {
        name: city.name,
        slug: city.slug,
        stateCode,
        countryCode,
        url: city.url,
      };
    });
  } finally {
    await browser.close();
  }
}
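
// NOTE (reviewer sketch): StealthPlugin patches common headless-detection
// signals, which together with the explicit user agent above reduces the chance
// of the /cities page serving a bot wall. The anchor selector and URL pattern
// are best-effort and may need updating if Dutchie changes its markup.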

/**
 * Fetch cities via API endpoints (fallback).
 */
export async function fetchCitiesFromAPI(): Promise<DutchieCity[]> {
  console.log('[DtCityDiscoveryService] Attempting API-based city discovery...');

  const apiEndpoints = [
    'https://dutchie.com/api/cities',
    'https://api.dutchie.com/v1/cities',
  ];

  for (const endpoint of apiEndpoints) {
    try {
      const response = await axios.get(endpoint, {
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0',
          Accept: 'application/json',
        },
        timeout: 15000,
      });

      if (response.data && Array.isArray(response.data)) {
        console.log(`[DtCityDiscoveryService] API returned ${response.data.length} cities`);
        return response.data.map((c: any) => ({
          name: c.name || c.city,
          slug: c.slug || c.citySlug,
          stateCode: c.stateCode || c.state,
          countryCode: c.countryCode || c.country || 'US',
        }));
      }
    } catch (error: any) {
      console.log(`[DtCityDiscoveryService] API ${endpoint} failed: ${error.message}`);
    }
  }

  return [];
}

// ============================================================
// DATABASE OPERATIONS
// ============================================================

/**
 * Upsert a city into dutchie_discovery_cities
 */
export async function upsertCity(
  pool: Pool,
  city: DutchieCity
): Promise<{ id: number; inserted: boolean; updated: boolean }> {
  const result = await pool.query(
    `
    INSERT INTO dutchie_discovery_cities (
      platform,
      city_name,
      city_slug,
      state_code,
      country_code,
      crawl_enabled,
      created_at,
      updated_at
    ) VALUES (
      'dutchie',
      $1,
      $2,
      $3,
      $4,
      TRUE,
      NOW(),
      NOW()
    )
    ON CONFLICT (platform, country_code, state_code, city_slug)
    DO UPDATE SET
      city_name = EXCLUDED.city_name,
      crawl_enabled = TRUE,
      updated_at = NOW()
    RETURNING id, (xmax = 0) AS inserted
    `,
    [city.name, city.slug, city.stateCode, city.countryCode]
  );

  const inserted = result.rows[0]?.inserted === true;
  return {
    id: result.rows[0]?.id,
    inserted,
    updated: !inserted,
  };
}
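
// NOTE (reviewer sketch): `(xmax = 0) AS inserted` distinguishes insert from
// update in an upsert: xmax is 0 on a freshly inserted row version and non-zero
// when ON CONFLICT ... DO UPDATE rewrote an existing row. This leans on a
// PostgreSQL implementation detail, but it is a widely used idiom.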

// ============================================================
// MAIN SERVICE CLASS
// ============================================================

export class DtCityDiscoveryService {
  constructor(private pool: Pool) {}

  /**
   * Run auto-discovery (browser + API fallback)
   */
  async runAutoDiscovery(): Promise<CityDiscoveryResult> {
    const startTime = Date.now();
    const errors: string[] = [];
    let citiesFound = 0;
    let citiesInserted = 0;
    let citiesUpdated = 0;

    console.log('[DtCityDiscoveryService] Starting auto city discovery...');

    try {
      let cities = await fetchCitiesFromBrowser();

      if (cities.length === 0) {
        console.log('[DtCityDiscoveryService] Browser returned 0 cities, trying API...');
        cities = await fetchCitiesFromAPI();
      }

      citiesFound = cities.length;
      console.log(`[DtCityDiscoveryService] Found ${citiesFound} cities`);

      for (const city of cities) {
        try {
          const result = await upsertCity(this.pool, city);
          if (result.inserted) citiesInserted++;
          else if (result.updated) citiesUpdated++;
        } catch (error: any) {
          const msg = `Failed to upsert city ${city.slug}: ${error.message}`;
          console.error(`[DtCityDiscoveryService] ${msg}`);
          errors.push(msg);
        }
      }
    } catch (error: any) {
      const msg = `Auto discovery failed: ${error.message}`;
      console.error(`[DtCityDiscoveryService] ${msg}`);
      errors.push(msg);
    }

    const durationMs = Date.now() - startTime;

    return {
      citiesFound,
      citiesInserted,
      citiesUpdated,
      errors,
      durationMs,
    };
  }

  /**
   * Seed a single city manually
   */
  async seedCity(city: DutchieCity): Promise<ManualSeedResult> {
    console.log(`[DtCityDiscoveryService] Seeding city: ${city.name} (${city.slug}), ${city.stateCode}, ${city.countryCode}`);

    const result = await upsertCity(this.pool, city);

    return {
      city,
      id: result.id,
      wasInserted: result.inserted,
    };
  }

  /**
   * Seed multiple cities from a list
   */
  async seedCities(cities: DutchieCity[]): Promise<{
    results: ManualSeedResult[];
    errors: string[];
  }> {
    const results: ManualSeedResult[] = [];
    const errors: string[] = [];

    for (const city of cities) {
      try {
        const result = await this.seedCity(city);
        results.push(result);
      } catch (error: any) {
        errors.push(`${city.slug}: ${error.message}`);
      }
    }

    return { results, errors };
  }

  /**
   * Get statistics about discovered cities
   */
  async getStats(): Promise<{
    total: number;
    byCountry: Array<{ countryCode: string; count: number }>;
    byState: Array<{ stateCode: string; countryCode: string; count: number }>;
    crawlEnabled: number;
    neverCrawled: number;
  }> {
    const [totalRes, byCountryRes, byStateRes, enabledRes, neverRes] = await Promise.all([
      this.pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_cities WHERE platform = \'dutchie\''),
      this.pool.query(`
        SELECT country_code, COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE platform = 'dutchie'
        GROUP BY country_code
        ORDER BY cnt DESC
      `),
      this.pool.query(`
        SELECT state_code, country_code, COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE platform = 'dutchie' AND state_code IS NOT NULL
        GROUP BY state_code, country_code
        ORDER BY cnt DESC
      `),
      this.pool.query(`
        SELECT COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE platform = 'dutchie' AND crawl_enabled = TRUE
      `),
      this.pool.query(`
        SELECT COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE platform = 'dutchie' AND last_crawled_at IS NULL
      `),
    ]);

    return {
      total: parseInt(totalRes.rows[0]?.cnt || '0', 10),
      byCountry: byCountryRes.rows.map((r) => ({
        countryCode: r.country_code,
        count: parseInt(r.cnt, 10),
      })),
      byState: byStateRes.rows.map((r) => ({
        stateCode: r.state_code,
        countryCode: r.country_code,
        count: parseInt(r.cnt, 10),
      })),
      crawlEnabled: parseInt(enabledRes.rows[0]?.cnt || '0', 10),
      neverCrawled: parseInt(neverRes.rows[0]?.cnt || '0', 10),
    };
  }
}

export default DtCityDiscoveryService;
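
// Example (reviewer sketch): running auto-discovery from a script or admin
// endpoint. getPool() is assumed from the db module in this PR.
//
//   const service = new DtCityDiscoveryService(getPool());
//   const result = await service.runAutoDiscovery();
//   console.log(`${result.citiesFound} found, ${result.citiesInserted} new, ${result.citiesUpdated} updated`);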

1249
backend/src/dutchie-az/discovery/DtLocationDiscoveryService.ts
Normal file
File diff suppressed because it is too large
390
backend/src/dutchie-az/discovery/DutchieCityDiscovery.ts
Normal file
@@ -0,0 +1,390 @@
/**
|
||||
* DutchieCityDiscovery
|
||||
*
|
||||
* Discovers cities from Dutchie's /cities page and upserts to dutchie_discovery_cities.
|
||||
*
|
||||
* Responsibilities:
|
||||
* - Fetch all cities available on Dutchie
|
||||
* - For each city derive: city_name, city_slug, state_code, country_code
|
||||
* - Upsert into dutchie_discovery_cities
|
||||
*/
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import axios from 'axios';
|
||||
import puppeteer from 'puppeteer-extra';
|
||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||
import type { Browser, Page } from 'puppeteer';
|
||||
|
||||
puppeteer.use(StealthPlugin());
|
||||
|
||||
// ============================================================
|
||||
// TYPES
|
||||
// ============================================================
|
||||
|
||||
export interface DutchieCity {
|
||||
name: string;
|
||||
slug: string;
|
||||
stateCode: string | null;
|
||||
countryCode: string;
|
||||
url?: string;
|
||||
}
|
||||
|
||||
export interface CityDiscoveryResult {
|
||||
citiesFound: number;
|
||||
citiesInserted: number;
|
||||
citiesUpdated: number;
|
||||
errors: string[];
|
||||
durationMs: number;
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// US STATE CODE MAPPING
|
||||
// ============================================================
|
||||
|
||||
const US_STATE_MAP: Record<string, string> = {
|
||||
'alabama': 'AL', 'alaska': 'AK', 'arizona': 'AZ', 'arkansas': 'AR',
|
||||
'california': 'CA', 'colorado': 'CO', 'connecticut': 'CT', 'delaware': 'DE',
|
||||
'florida': 'FL', 'georgia': 'GA', 'hawaii': 'HI', 'idaho': 'ID',
|
||||
'illinois': 'IL', 'indiana': 'IN', 'iowa': 'IA', 'kansas': 'KS',
|
||||
'kentucky': 'KY', 'louisiana': 'LA', 'maine': 'ME', 'maryland': 'MD',
|
||||
'massachusetts': 'MA', 'michigan': 'MI', 'minnesota': 'MN', 'mississippi': 'MS',
|
||||
'missouri': 'MO', 'montana': 'MT', 'nebraska': 'NE', 'nevada': 'NV',
|
||||
'new-hampshire': 'NH', 'new-jersey': 'NJ', 'new-mexico': 'NM', 'new-york': 'NY',
|
||||
'north-carolina': 'NC', 'north-dakota': 'ND', 'ohio': 'OH', 'oklahoma': 'OK',
|
||||
'oregon': 'OR', 'pennsylvania': 'PA', 'rhode-island': 'RI', 'south-carolina': 'SC',
|
||||
'south-dakota': 'SD', 'tennessee': 'TN', 'texas': 'TX', 'utah': 'UT',
|
||||
'vermont': 'VT', 'virginia': 'VA', 'washington': 'WA', 'west-virginia': 'WV',
|
||||
'wisconsin': 'WI', 'wyoming': 'WY', 'district-of-columbia': 'DC',
|
||||
};
|
||||
|
||||
// Canadian province mapping
|
||||
const CA_PROVINCE_MAP: Record<string, string> = {
|
||||
'alberta': 'AB', 'british-columbia': 'BC', 'manitoba': 'MB',
|
||||
'new-brunswick': 'NB', 'newfoundland-and-labrador': 'NL',
|
||||
'northwest-territories': 'NT', 'nova-scotia': 'NS', 'nunavut': 'NU',
|
||||
'ontario': 'ON', 'prince-edward-island': 'PE', 'quebec': 'QC',
|
||||
'saskatchewan': 'SK', 'yukon': 'YT',
|
||||
};
|
||||
|
||||
// ============================================================
|
||||
// CITY FETCHING
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* Fetch cities from Dutchie's /cities page using Puppeteer to extract data.
|
||||
*/
|
||||
async function fetchCitiesFromDutchie(): Promise<DutchieCity[]> {
|
||||
console.log('[DutchieCityDiscovery] Launching browser to fetch cities...');
|
||||
|
||||
const browser = await puppeteer.launch({
|
||||
headless: 'new',
|
||||
args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
|
||||
});
|
||||
|
||||
try {
|
||||
const page = await browser.newPage();
|
||||
await page.setUserAgent(
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
|
||||
);
|
||||
|
||||
// Navigate to cities page
|
||||
console.log('[DutchieCityDiscovery] Navigating to https://dutchie.com/cities...');
|
||||
await page.goto('https://dutchie.com/cities', {
|
||||
waitUntil: 'networkidle2',
|
||||
timeout: 60000,
|
||||
});
|
||||
|
||||
// Wait for content to load
|
||||
await new Promise((r) => setTimeout(r, 3000));
|
||||
|
||||
// Extract city links from the page
|
||||
const cities = await page.evaluate(() => {
|
||||
const cityLinks: Array<{
|
||||
name: string;
|
||||
slug: string;
|
||||
url: string;
|
||||
stateSlug: string | null;
|
||||
}> = [];
|
||||
|
||||
// Find all city links - they typically follow pattern /city/{state}/{city}
|
||||
const links = document.querySelectorAll('a[href*="/city/"]');
|
||||
links.forEach((link) => {
|
||||
const href = (link as HTMLAnchorElement).href;
|
||||
const text = (link as HTMLElement).innerText?.trim();
|
||||
|
||||
// Parse URL: https://dutchie.com/city/{state}/{city}
|
||||
const match = href.match(/\/city\/([^/]+)\/([^/?]+)/);
|
||||
if (match && text) {
|
||||
cityLinks.push({
|
||||
name: text,
|
||||
slug: match[2],
|
||||
url: href,
|
||||
stateSlug: match[1],
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
return cityLinks;
|
||||
});
|
||||
|
||||
console.log(`[DutchieCityDiscovery] Extracted ${cities.length} city links from page`);
|
||||
|
||||
// Convert to DutchieCity format
|
||||
const result: DutchieCity[] = [];
|
||||
|
||||
for (const city of cities) {
|
||||
// Determine country and state code
|
||||
let countryCode = 'US';
|
||||
let stateCode: string | null = null;
|
||||
|
||||
if (city.stateSlug) {
|
||||
// Check if it's a US state
|
||||
if (US_STATE_MAP[city.stateSlug]) {
|
||||
stateCode = US_STATE_MAP[city.stateSlug];
|
||||
countryCode = 'US';
|
||||
}
|
||||
// Check if it's a Canadian province
|
||||
else if (CA_PROVINCE_MAP[city.stateSlug]) {
|
||||
stateCode = CA_PROVINCE_MAP[city.stateSlug];
|
||||
countryCode = 'CA';
|
||||
}
|
||||
// Check if it's already a 2-letter code
|
||||
else if (city.stateSlug.length === 2) {
|
||||
stateCode = city.stateSlug.toUpperCase();
|
||||
// Determine country based on state code
|
||||
if (Object.values(CA_PROVINCE_MAP).includes(stateCode)) {
|
||||
countryCode = 'CA';
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
result.push({
|
||||
name: city.name,
|
||||
slug: city.slug,
|
||||
stateCode,
|
||||
countryCode,
|
||||
url: city.url,
|
||||
});
|
||||
}
|
||||
|
||||
return result;
|
||||
} finally {
|
||||
await browser.close();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Alternative: Fetch cities by making API/GraphQL requests.
|
||||
* Falls back to this if scraping fails.
|
||||
*/
|
||||
async function fetchCitiesFromAPI(): Promise<DutchieCity[]> {
|
||||
console.log('[DutchieCityDiscovery] Attempting API-based city discovery...');
|
||||
|
||||
// Dutchie may have an API endpoint for cities
|
||||
// Try common patterns
|
||||
const apiEndpoints = [
|
||||
'https://dutchie.com/api/cities',
|
||||
'https://api.dutchie.com/v1/cities',
|
||||
];
|
||||
|
||||
for (const endpoint of apiEndpoints) {
|
||||
try {
|
||||
const response = await axios.get(endpoint, {
|
||||
headers: {
|
||||
'User-Agent':
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0',
|
||||
Accept: 'application/json',
|
||||
},
|
||||
timeout: 15000,
|
||||
});
|
||||
|
||||
if (response.data && Array.isArray(response.data)) {
|
||||
console.log(`[DutchieCityDiscovery] API returned ${response.data.length} cities`);
|
||||
return response.data.map((c: any) => ({
|
||||
name: c.name || c.city,
|
||||
slug: c.slug || c.citySlug,
|
||||
stateCode: c.stateCode || c.state,
|
||||
countryCode: c.countryCode || c.country || 'US',
|
||||
}));
|
||||
}
|
||||
} catch (error: any) {
|
||||
console.log(`[DutchieCityDiscovery] API ${endpoint} failed: ${error.message}`);
|
||||
}
|
||||
}
|
||||
|
||||
return [];
|
||||
}

// ============================================================
// DATABASE OPERATIONS
// ============================================================

/**
 * Upsert a city into dutchie_discovery_cities
 */
async function upsertCity(
  pool: Pool,
  city: DutchieCity
): Promise<{ inserted: boolean; updated: boolean }> {
  const result = await pool.query(
    `
    INSERT INTO dutchie_discovery_cities (
      platform,
      city_name,
      city_slug,
      state_code,
      country_code,
      last_crawled_at,
      updated_at
    ) VALUES (
      'dutchie',
      $1,
      $2,
      $3,
      $4,
      NOW(),
      NOW()
    )
    ON CONFLICT (platform, country_code, state_code, city_slug)
    DO UPDATE SET
      city_name = EXCLUDED.city_name,
      last_crawled_at = NOW(),
      updated_at = NOW()
    RETURNING (xmax = 0) AS inserted
    `,
    [city.name, city.slug, city.stateCode, city.countryCode]
  );

  const inserted = result.rows[0]?.inserted === true;
  return { inserted, updated: !inserted };
}
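
// Note (added): `RETURNING (xmax = 0) AS inserted` uses PostgreSQL's xmax
// system column, which is 0 on a freshly inserted row version, to tell the
// INSERT path apart from the ON CONFLICT UPDATE path. A minimal sketch of
// consuming the flag:
//
//   const { inserted } = await upsertCity(pool, city);
//   console.log(inserted ? 'new city' : 'existing city refreshed');
//
// Caveat: the conflict target includes state_code, and unique indexes treat
// NULLs as distinct, so cities with a NULL state_code never conflict and can
// be re-inserted on every run.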

// ============================================================
// MAIN DISCOVERY CLASS
// ============================================================

export class DutchieCityDiscovery {
  private pool: Pool;

  constructor(pool: Pool) {
    this.pool = pool;
  }

  /**
   * Run the city discovery process
   */
  async run(): Promise<CityDiscoveryResult> {
    const startTime = Date.now();
    const errors: string[] = [];
    let citiesFound = 0;
    let citiesInserted = 0;
    let citiesUpdated = 0;

    console.log('[DutchieCityDiscovery] Starting city discovery...');

    try {
      // Try scraping first, fall back to API
      let cities = await fetchCitiesFromDutchie();

      if (cities.length === 0) {
        console.log('[DutchieCityDiscovery] Scraping returned 0 cities, trying API...');
        cities = await fetchCitiesFromAPI();
      }

      citiesFound = cities.length;
      console.log(`[DutchieCityDiscovery] Found ${citiesFound} cities`);

      // Upsert each city
      for (const city of cities) {
        try {
          const result = await upsertCity(this.pool, city);
          if (result.inserted) {
            citiesInserted++;
          } else if (result.updated) {
            citiesUpdated++;
          }
        } catch (error: any) {
          const msg = `Failed to upsert city ${city.slug}: ${error.message}`;
          console.error(`[DutchieCityDiscovery] ${msg}`);
          errors.push(msg);
        }
      }
    } catch (error: any) {
      const msg = `City discovery failed: ${error.message}`;
      console.error(`[DutchieCityDiscovery] ${msg}`);
      errors.push(msg);
    }

    const durationMs = Date.now() - startTime;

    console.log('[DutchieCityDiscovery] Discovery complete:');
    console.log(`  Cities found: ${citiesFound}`);
    console.log(`  Inserted: ${citiesInserted}`);
    console.log(`  Updated: ${citiesUpdated}`);
    console.log(`  Errors: ${errors.length}`);
    console.log(`  Duration: ${(durationMs / 1000).toFixed(1)}s`);

    return {
      citiesFound,
      citiesInserted,
      citiesUpdated,
      errors,
      durationMs,
    };
  }

  /**
   * Get statistics about discovered cities
   */
  async getStats(): Promise<{
    total: number;
    byCountry: Array<{ countryCode: string; count: number }>;
    byState: Array<{ stateCode: string; countryCode: string; count: number }>;
    crawlEnabled: number;
    neverCrawled: number;
  }> {
    const [totalRes, byCountryRes, byStateRes, enabledRes, neverRes] = await Promise.all([
      this.pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_cities'),
      this.pool.query(`
        SELECT country_code, COUNT(*) as cnt
        FROM dutchie_discovery_cities
        GROUP BY country_code
        ORDER BY cnt DESC
      `),
      this.pool.query(`
        SELECT state_code, country_code, COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE state_code IS NOT NULL
        GROUP BY state_code, country_code
        ORDER BY cnt DESC
      `),
      this.pool.query(`
        SELECT COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE crawl_enabled = TRUE
      `),
      this.pool.query(`
        SELECT COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE last_crawled_at IS NULL
      `),
    ]);

    return {
      total: parseInt(totalRes.rows[0]?.cnt || '0', 10),
      byCountry: byCountryRes.rows.map((r) => ({
        countryCode: r.country_code,
        count: parseInt(r.cnt, 10),
      })),
      byState: byStateRes.rows.map((r) => ({
        stateCode: r.state_code,
        countryCode: r.country_code,
        count: parseInt(r.cnt, 10),
      })),
      crawlEnabled: parseInt(enabledRes.rows[0]?.cnt || '0', 10),
      neverCrawled: parseInt(neverRes.rows[0]?.cnt || '0', 10),
    };
  }
}

export default DutchieCityDiscovery;
639
backend/src/dutchie-az/discovery/DutchieLocationDiscovery.ts
Normal file
@@ -0,0 +1,639 @@
/**
 * DutchieLocationDiscovery
 *
 * Discovers store locations for each city from Dutchie and upserts to dutchie_discovery_locations.
 *
 * Responsibilities:
 * - Given a dutchie_discovery_cities row, call Dutchie's location/search endpoint
 * - For each store: extract platform_location_id, platform_slug, platform_menu_url, name, address, coords
 * - Upsert into dutchie_discovery_locations
 * - DO NOT overwrite status if already verified/merged/rejected
 * - DO NOT overwrite dispensary_id if already set
 */

import { Pool } from 'pg';
import axios from 'axios';
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';

puppeteer.use(StealthPlugin());

// ============================================================
// TYPES
// ============================================================

export interface DiscoveryCity {
  id: number;
  platform: string;
  cityName: string;
  citySlug: string;
  stateCode: string | null;
  countryCode: string;
  crawlEnabled: boolean;
}

export interface DutchieLocation {
  platformLocationId: string;
  platformSlug: string;
  platformMenuUrl: string;
  name: string;
  rawAddress: string | null;
  addressLine1: string | null;
  addressLine2: string | null;
  city: string | null;
  stateCode: string | null;
  postalCode: string | null;
  countryCode: string | null;
  latitude: number | null;
  longitude: number | null;
  timezone: string | null;
  offersDelivery: boolean | null;
  offersPickup: boolean | null;
  isRecreational: boolean | null;
  isMedical: boolean | null;
  metadata: Record<string, any>;
}

export interface LocationDiscoveryResult {
  cityId: number;
  citySlug: string;
  locationsFound: number;
  locationsInserted: number;
  locationsUpdated: number;
  locationsSkipped: number;
  errors: string[];
  durationMs: number;
}

// ============================================================
// LOCATION FETCHING
// ============================================================

/**
 * Fetch locations for a city using Puppeteer to scrape the city page
 */
async function fetchLocationsForCity(city: DiscoveryCity): Promise<DutchieLocation[]> {
  console.log(`[DutchieLocationDiscovery] Fetching locations for ${city.cityName}, ${city.stateCode}...`);

  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
  });

  try {
    const page = await browser.newPage();
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    );

    // Navigate to city page - use /us/dispensaries/{city_slug} pattern
    const cityUrl = `https://dutchie.com/us/dispensaries/${city.citySlug}`;
    console.log(`[DutchieLocationDiscovery] Navigating to ${cityUrl}...`);

    await page.goto(cityUrl, {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });

    // Wait for content
    await new Promise((r) => setTimeout(r, 3000));

    // Try to extract __NEXT_DATA__ which often contains store data
    const nextData = await page.evaluate(() => {
      const script = document.querySelector('script#__NEXT_DATA__');
      if (script) {
        try {
          return JSON.parse(script.textContent || '{}');
        } catch {
          return null;
        }
      }
      return null;
    });

    let locations: DutchieLocation[] = [];

    if (nextData?.props?.pageProps?.dispensaries) {
      // Extract from Next.js data
      const dispensaries = nextData.props.pageProps.dispensaries;
      console.log(`[DutchieLocationDiscovery] Found ${dispensaries.length} dispensaries in __NEXT_DATA__`);

      locations = dispensaries.map((d: any) => parseDispensaryData(d, city));
    } else {
      // Fall back to DOM scraping
      console.log('[DutchieLocationDiscovery] No __NEXT_DATA__, trying DOM scraping...');

      const scrapedData = await page.evaluate(() => {
        const stores: Array<{
          name: string;
          href: string;
          address: string | null;
        }> = [];

        // Look for dispensary cards/links
        const cards = document.querySelectorAll('[data-testid="dispensary-card"], .dispensary-card, a[href*="/dispensary/"]');
        cards.forEach((card) => {
          const link = card.querySelector('a[href*="/dispensary/"]') || (card as HTMLAnchorElement);
          const href = (link as HTMLAnchorElement).href || '';
          const name =
            card.querySelector('[data-testid="dispensary-name"]')?.textContent ||
            card.querySelector('h2, h3, .name')?.textContent ||
            link.textContent ||
            '';
          const address = card.querySelector('[data-testid="dispensary-address"], .address')?.textContent || null;

          if (href && name) {
            stores.push({
              name: name.trim(),
              href,
              address: address?.trim() || null,
            });
          }
        });

        return stores;
      });

      console.log(`[DutchieLocationDiscovery] DOM scraping found ${scrapedData.length} stores`);

      locations = scrapedData.map((s) => {
        // Parse slug from URL
        const match = s.href.match(/\/dispensary\/([^/?]+)/);
        const slug = match ? match[1] : s.name.toLowerCase().replace(/\s+/g, '-');

        return {
          platformLocationId: slug, // Will be resolved later
          platformSlug: slug,
          platformMenuUrl: `https://dutchie.com/dispensary/${slug}`,
          name: s.name,
          rawAddress: s.address,
          addressLine1: null,
          addressLine2: null,
          city: city.cityName,
          stateCode: city.stateCode,
          postalCode: null,
          countryCode: city.countryCode,
          latitude: null,
          longitude: null,
          timezone: null,
          offersDelivery: null,
          offersPickup: null,
          isRecreational: null,
          isMedical: null,
          metadata: { source: 'dom_scrape', originalUrl: s.href },
        };
      });
    }

    return locations;
  } finally {
    await browser.close();
  }
}
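
// Example (added, hypothetical URL): the DOM fallback derives slugs from
// hrefs, e.g.
//   'https://dutchie.com/dispensary/green-leaf-phx?menuType=rec'
//     .match(/\/dispensary\/([^/?]+)/)?.[1]  // => 'green-leaf-phx'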

/**
 * Parse dispensary data from Dutchie's API/JSON response
 */
function parseDispensaryData(d: any, city: DiscoveryCity): DutchieLocation {
  const id = d.id || d._id || d.dispensaryId || '';
  const slug = d.slug || d.cName || d.name?.toLowerCase().replace(/\s+/g, '-') || '';

  // Build menu URL
  let menuUrl = `https://dutchie.com/dispensary/${slug}`;
  if (d.menuUrl) {
    menuUrl = d.menuUrl;
  } else if (d.embeddedMenuUrl) {
    menuUrl = d.embeddedMenuUrl;
  }

  // Parse address
  const address = d.address || d.location?.address || {};
  const rawAddress = [
    address.line1 || address.street1 || d.address1,
    address.line2 || address.street2 || d.address2,
    [
      address.city || d.city,
      address.state || address.stateCode || d.state,
      address.zip || address.zipCode || address.postalCode || d.zip,
    ]
      .filter(Boolean)
      .join(' '),
  ]
    .filter(Boolean)
    .join(', ');

  return {
    platformLocationId: id,
    platformSlug: slug,
    platformMenuUrl: menuUrl,
    name: d.name || d.dispensaryName || '',
    rawAddress: rawAddress || null,
    addressLine1: address.line1 || address.street1 || d.address1 || null,
    addressLine2: address.line2 || address.street2 || d.address2 || null,
    city: address.city || d.city || city.cityName,
    stateCode: address.state || address.stateCode || d.state || city.stateCode,
    postalCode: address.zip || address.zipCode || address.postalCode || d.zip || null,
    countryCode: address.country || address.countryCode || d.country || city.countryCode,
    latitude: d.latitude ?? d.location?.latitude ?? d.location?.lat ?? null,
    longitude: d.longitude ?? d.location?.longitude ?? d.location?.lng ?? null,
    timezone: d.timezone || d.timeZone || null,
    offersDelivery: d.offerDelivery ?? d.offersDelivery ?? d.delivery ?? null,
    offersPickup: d.offerPickup ?? d.offersPickup ?? d.pickup ?? null,
    isRecreational: d.isRecreational ?? d.recreational ?? (d.retailType === 'recreational' || d.retailType === 'both'),
    isMedical: d.isMedical ?? d.medical ?? (d.retailType === 'medical' || d.retailType === 'both'),
    metadata: {
      source: 'next_data',
      retailType: d.retailType,
      brand: d.brand,
      logo: d.logo || d.logoUrl,
      raw: d,
    },
  };
}
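
// Illustrative only (added, hypothetical payload): a __NEXT_DATA__ entry like
//   { _id: 'abc123', cName: 'green-leaf-phx', name: 'Green Leaf',
//     location: { lat: 33.45, lng: -112.07 }, retailType: 'both' }
// would map to platformLocationId 'abc123', slug 'green-leaf-phx', the default
// menu URL https://dutchie.com/dispensary/green-leaf-phx, coords from
// location.lat/lng, and isRecreational/isMedical both true via retailType.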

/**
 * Alternative: Use GraphQL to discover locations
 */
async function fetchLocationsViaGraphQL(city: DiscoveryCity): Promise<DutchieLocation[]> {
  console.log(`[DutchieLocationDiscovery] Trying GraphQL for ${city.cityName}...`);

  // Try geo-based search
  // This would require knowing the city's coordinates
  // For now, return empty and rely on page scraping
  return [];
}

// ============================================================
// DATABASE OPERATIONS
// ============================================================

/**
 * Upsert a location into dutchie_discovery_locations
 * Does NOT overwrite status if already verified/merged/rejected
 * Does NOT overwrite dispensary_id if already set
 */
async function upsertLocation(
  pool: Pool,
  location: DutchieLocation,
  cityId: number
): Promise<{ inserted: boolean; updated: boolean; skipped: boolean }> {
  // First check if this location exists and has a protected status
  const existing = await pool.query(
    `
    SELECT id, status, dispensary_id
    FROM dutchie_discovery_locations
    WHERE platform = 'dutchie' AND platform_location_id = $1
    `,
    [location.platformLocationId]
  );

  if (existing.rows.length > 0) {
    const row = existing.rows[0];
    const protectedStatuses = ['verified', 'merged', 'rejected'];

    if (protectedStatuses.includes(row.status)) {
      // Only update last_seen_at for protected statuses
      await pool.query(
        `
        UPDATE dutchie_discovery_locations
        SET last_seen_at = NOW(), updated_at = NOW()
        WHERE id = $1
        `,
        [row.id]
      );
      return { inserted: false, updated: false, skipped: true };
    }

    // Update existing discovered location (but preserve dispensary_id if set)
    await pool.query(
      `
      UPDATE dutchie_discovery_locations
      SET
        platform_slug = $2,
        platform_menu_url = $3,
        name = $4,
        raw_address = COALESCE($5, raw_address),
        address_line1 = COALESCE($6, address_line1),
        address_line2 = COALESCE($7, address_line2),
        city = COALESCE($8, city),
        state_code = COALESCE($9, state_code),
        postal_code = COALESCE($10, postal_code),
        country_code = COALESCE($11, country_code),
        latitude = COALESCE($12, latitude),
        longitude = COALESCE($13, longitude),
        timezone = COALESCE($14, timezone),
        offers_delivery = COALESCE($15, offers_delivery),
        offers_pickup = COALESCE($16, offers_pickup),
        is_recreational = COALESCE($17, is_recreational),
        is_medical = COALESCE($18, is_medical),
        metadata = COALESCE($19, metadata),
        discovery_city_id = $20,
        last_seen_at = NOW(),
        updated_at = NOW()
      WHERE id = $1
      `,
      [
        row.id,
        location.platformSlug,
        location.platformMenuUrl,
        location.name,
        location.rawAddress,
        location.addressLine1,
        location.addressLine2,
        location.city,
        location.stateCode,
        location.postalCode,
        location.countryCode,
        location.latitude,
        location.longitude,
        location.timezone,
        location.offersDelivery,
        location.offersPickup,
        location.isRecreational,
        location.isMedical,
        JSON.stringify(location.metadata),
        cityId,
      ]
    );
    return { inserted: false, updated: true, skipped: false };
  }

  // Insert new location
  await pool.query(
    `
    INSERT INTO dutchie_discovery_locations (
      platform,
      platform_location_id,
      platform_slug,
      platform_menu_url,
      name,
      raw_address,
      address_line1,
      address_line2,
      city,
      state_code,
      postal_code,
      country_code,
      latitude,
      longitude,
      timezone,
      status,
      offers_delivery,
      offers_pickup,
      is_recreational,
      is_medical,
      metadata,
      discovery_city_id,
      first_seen_at,
      last_seen_at,
      active,
      created_at,
      updated_at
    ) VALUES (
      'dutchie',
      $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14,
      'discovered',
      $15, $16, $17, $18, $19, $20,
      NOW(), NOW(), TRUE, NOW(), NOW()
    )
    `,
    [
      location.platformLocationId,
      location.platformSlug,
      location.platformMenuUrl,
      location.name,
      location.rawAddress,
      location.addressLine1,
      location.addressLine2,
      location.city,
      location.stateCode,
      location.postalCode,
      location.countryCode,
      location.latitude,
      location.longitude,
      location.timezone,
      location.offersDelivery,
      location.offersPickup,
      location.isRecreational,
      location.isMedical,
      JSON.stringify(location.metadata),
      cityId,
    ]
  );

  return { inserted: true, updated: false, skipped: false };
}
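
// Note (added): the UPDATE branch above wraps almost every column in
// COALESCE($n, column), so a re-crawl only fills in fields the new payload
// actually provides; it can refresh a value but never null one out:
//   COALESCE(NULL, 'old')  -- keeps 'old'
//   COALESCE('new', 'old') -- takes 'new'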

// ============================================================
// MAIN DISCOVERY CLASS
// ============================================================

export class DutchieLocationDiscovery {
  private pool: Pool;

  constructor(pool: Pool) {
    this.pool = pool;
  }

  /**
   * Get a city by slug
   */
  async getCityBySlug(citySlug: string): Promise<DiscoveryCity | null> {
    const { rows } = await this.pool.query(
      `
      SELECT id, platform, city_name, city_slug, state_code, country_code, crawl_enabled
      FROM dutchie_discovery_cities
      WHERE platform = 'dutchie' AND city_slug = $1
      LIMIT 1
      `,
      [citySlug]
    );

    if (rows.length === 0) return null;

    const r = rows[0];
    return {
      id: r.id,
      platform: r.platform,
      cityName: r.city_name,
      citySlug: r.city_slug,
      stateCode: r.state_code,
      countryCode: r.country_code,
      crawlEnabled: r.crawl_enabled,
    };
  }

  /**
   * Get all crawl-enabled cities
   */
  async getEnabledCities(limit?: number): Promise<DiscoveryCity[]> {
    const { rows } = await this.pool.query(
      `
      SELECT id, platform, city_name, city_slug, state_code, country_code, crawl_enabled
      FROM dutchie_discovery_cities
      WHERE platform = 'dutchie' AND crawl_enabled = TRUE
      ORDER BY last_crawled_at ASC NULLS FIRST, city_name ASC
      ${limit ? `LIMIT ${limit}` : ''}
      `
    );

    return rows.map((r) => ({
      id: r.id,
      platform: r.platform,
      cityName: r.city_name,
      citySlug: r.city_slug,
      stateCode: r.state_code,
      countryCode: r.country_code,
      crawlEnabled: r.crawl_enabled,
    }));
  }
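
  // Note (added): `limit` is interpolated into the SQL text rather than bound
  // as a parameter. That is safe here only because it arrives as a typed
  // number; if this method is ever fed raw request input, bind it instead
  // (e.g. `LIMIT $1`).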

  /**
   * Discover locations for a single city
   */
  async discoverForCity(city: DiscoveryCity): Promise<LocationDiscoveryResult> {
    const startTime = Date.now();
    const errors: string[] = [];
    let locationsFound = 0;
    let locationsInserted = 0;
    let locationsUpdated = 0;
    let locationsSkipped = 0;

    console.log(`[DutchieLocationDiscovery] Discovering locations for ${city.cityName}, ${city.stateCode}...`);

    try {
      // Fetch locations
      let locations = await fetchLocationsForCity(city);

      // If scraping fails, try GraphQL
      if (locations.length === 0) {
        locations = await fetchLocationsViaGraphQL(city);
      }

      locationsFound = locations.length;
      console.log(`[DutchieLocationDiscovery] Found ${locationsFound} locations`);

      // Upsert each location
      for (const location of locations) {
        try {
          const result = await upsertLocation(this.pool, location, city.id);
          if (result.inserted) locationsInserted++;
          else if (result.updated) locationsUpdated++;
          else if (result.skipped) locationsSkipped++;
        } catch (error: any) {
          const msg = `Failed to upsert location ${location.platformSlug}: ${error.message}`;
          console.error(`[DutchieLocationDiscovery] ${msg}`);
          errors.push(msg);
        }
      }

      // Update city's last_crawled_at and location_count
      await this.pool.query(
        `
        UPDATE dutchie_discovery_cities
        SET last_crawled_at = NOW(),
            location_count = $1,
            updated_at = NOW()
        WHERE id = $2
        `,
        [locationsFound, city.id]
      );
    } catch (error: any) {
      const msg = `Location discovery failed for ${city.citySlug}: ${error.message}`;
      console.error(`[DutchieLocationDiscovery] ${msg}`);
      errors.push(msg);
    }

    const durationMs = Date.now() - startTime;

    console.log(`[DutchieLocationDiscovery] City ${city.citySlug} complete:`);
    console.log(`  Locations found: ${locationsFound}`);
    console.log(`  Inserted: ${locationsInserted}`);
    console.log(`  Updated: ${locationsUpdated}`);
    console.log(`  Skipped (protected): ${locationsSkipped}`);
    console.log(`  Errors: ${errors.length}`);
    console.log(`  Duration: ${(durationMs / 1000).toFixed(1)}s`);

    return {
      cityId: city.id,
      citySlug: city.citySlug,
      locationsFound,
      locationsInserted,
      locationsUpdated,
      locationsSkipped,
      errors,
      durationMs,
    };
  }

  /**
   * Discover locations for all enabled cities
   */
  async discoverAllEnabled(options: {
    limit?: number;
    delayMs?: number;
  } = {}): Promise<{
    totalCities: number;
    totalLocationsFound: number;
    totalInserted: number;
    totalUpdated: number;
    totalSkipped: number;
    errors: string[];
    durationMs: number;
  }> {
    const { limit, delayMs = 2000 } = options;
    const startTime = Date.now();
    let totalLocationsFound = 0;
    let totalInserted = 0;
    let totalUpdated = 0;
    let totalSkipped = 0;
    const allErrors: string[] = [];

    const cities = await this.getEnabledCities(limit);
    console.log(`[DutchieLocationDiscovery] Discovering locations for ${cities.length} cities...`);

    for (let i = 0; i < cities.length; i++) {
      const city = cities[i];
      console.log(`\n[DutchieLocationDiscovery] City ${i + 1}/${cities.length}: ${city.cityName}, ${city.stateCode}`);

      try {
        const result = await this.discoverForCity(city);
        totalLocationsFound += result.locationsFound;
        totalInserted += result.locationsInserted;
        totalUpdated += result.locationsUpdated;
        totalSkipped += result.locationsSkipped;
        allErrors.push(...result.errors);
      } catch (error: any) {
        allErrors.push(`City ${city.citySlug} failed: ${error.message}`);
      }

      // Delay between cities
      if (i < cities.length - 1 && delayMs > 0) {
        await new Promise((r) => setTimeout(r, delayMs));
      }
    }

    const durationMs = Date.now() - startTime;

    console.log('\n[DutchieLocationDiscovery] All cities complete:');
    console.log(`  Total cities: ${cities.length}`);
    console.log(`  Total locations found: ${totalLocationsFound}`);
    console.log(`  Total inserted: ${totalInserted}`);
    console.log(`  Total updated: ${totalUpdated}`);
    console.log(`  Total skipped: ${totalSkipped}`);
    console.log(`  Total errors: ${allErrors.length}`);
    console.log(`  Duration: ${(durationMs / 1000).toFixed(1)}s`);

    return {
      totalCities: cities.length,
      totalLocationsFound,
      totalInserted,
      totalUpdated,
      totalSkipped,
      errors: allErrors,
      durationMs,
    };
  }
}

export default DutchieLocationDiscovery;
73
backend/src/dutchie-az/discovery/discovery-dt-cities-auto.ts
Normal file
@@ -0,0 +1,73 @@
#!/usr/bin/env npx tsx
/**
 * Discovery Entrypoint: Dutchie Cities (Auto)
 *
 * Attempts browser/API-based /cities discovery.
 * Even if currently blocked (403), this runner preserves the auto-discovery path.
 *
 * Usage:
 *   npm run discovery:dt:cities:auto
 *   DATABASE_URL="..." npx tsx src/dutchie-az/discovery/discovery-dt-cities-auto.ts
 */

import { Pool } from 'pg';
import { DtCityDiscoveryService } from './DtCityDiscoveryService';

const DB_URL = process.env.DATABASE_URL || process.env.CANNAIQ_DB_URL ||
  'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';
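
// Note (added): the runners echo DB_URL with the password masked, e.g.
//   'postgresql://dutchie:secret@localhost:54320/db'.replace(/:[^:@]+@/, ':****@')
//   // => 'postgresql://dutchie:****@localhost:54320/db'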

async function main() {
  console.log('╔══════════════════════════════════════════════════╗');
  console.log('║   Dutchie City Discovery (AUTO)                  ║');
  console.log('║   Browser + API fallback                         ║');
  console.log('╚══════════════════════════════════════════════════╝');
  console.log(`\nDatabase: ${DB_URL.replace(/:[^:@]+@/, ':****@')}`);

  const pool = new Pool({ connectionString: DB_URL });

  try {
    const { rows } = await pool.query('SELECT NOW() as time');
    console.log(`Connected at: ${rows[0].time}\n`);

    const service = new DtCityDiscoveryService(pool);
    const result = await service.runAutoDiscovery();

    console.log('\n' + '═'.repeat(50));
    console.log('SUMMARY');
    console.log('═'.repeat(50));
    console.log(`Cities found: ${result.citiesFound}`);
    console.log(`Cities inserted: ${result.citiesInserted}`);
    console.log(`Cities updated: ${result.citiesUpdated}`);
    console.log(`Errors: ${result.errors.length}`);
    console.log(`Duration: ${(result.durationMs / 1000).toFixed(1)}s`);

    if (result.errors.length > 0) {
      console.log('\nErrors:');
      result.errors.forEach((e, i) => console.log(`  ${i + 1}. ${e}`));
    }

    const stats = await service.getStats();
    console.log('\nCurrent Database Stats:');
    console.log(`  Total cities: ${stats.total}`);
    console.log(`  Crawl enabled: ${stats.crawlEnabled}`);
    console.log(`  Never crawled: ${stats.neverCrawled}`);

    if (result.citiesFound === 0) {
      console.log('\n⚠️  No cities found via auto-discovery.');
      console.log('   This may be due to Dutchie blocking scraping/API access.');
      console.log('   Use manual seeding instead:');
      console.log('   npm run discovery:dt:cities:manual -- --city-slug=ny-hudson --city-name=Hudson --state-code=NY');
      process.exit(1);
    }

    console.log('\n✅ Auto city discovery completed');
    process.exit(0);
  } catch (error: any) {
    console.error('\n❌ Auto city discovery failed:', error.message);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();
@@ -0,0 +1,137 @@
#!/usr/bin/env npx tsx
/**
 * Discovery Entrypoint: Dutchie Cities (Manual Seed)
 *
 * Manually seeds cities into dutchie_discovery_cities via CLI args.
 * Use this when auto-discovery is blocked (403).
 *
 * Usage:
 *   npm run discovery:dt:cities:manual -- --city-slug=ny-hudson --city-name=Hudson --state-code=NY
 *   npm run discovery:dt:cities:manual -- --city-slug=ma-boston --city-name=Boston --state-code=MA --country-code=US
 *
 * Options:
 *   --city-slug      Required. URL slug (e.g., "ny-hudson")
 *   --city-name      Required. Display name (e.g., "Hudson")
 *   --state-code     Required. State/province code (e.g., "NY", "CA", "ON")
 *   --country-code   Optional. Country code (default: "US")
 *
 * After seeding, run location discovery:
 *   npm run discovery:dt:locations
 */

import { Pool } from 'pg';
import { DtCityDiscoveryService, DutchieCity } from './DtCityDiscoveryService';

const DB_URL = process.env.DATABASE_URL || process.env.CANNAIQ_DB_URL ||
  'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';

interface Args {
  citySlug?: string;
  cityName?: string;
  stateCode?: string;
  countryCode: string;
}

function parseArgs(): Args {
  const args: Args = { countryCode: 'US' };

  for (const arg of process.argv.slice(2)) {
    const citySlugMatch = arg.match(/--city-slug=(.+)/);
    if (citySlugMatch) args.citySlug = citySlugMatch[1];

    const cityNameMatch = arg.match(/--city-name=(.+)/);
    if (cityNameMatch) args.cityName = cityNameMatch[1];

    const stateCodeMatch = arg.match(/--state-code=(.+)/);
    if (stateCodeMatch) args.stateCode = stateCodeMatch[1].toUpperCase();

    const countryCodeMatch = arg.match(/--country-code=(.+)/);
    if (countryCodeMatch) args.countryCode = countryCodeMatch[1].toUpperCase();
  }

  return args;
}
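
// Example (added): `--city-slug=ny-hudson --city-name=Hudson --state-code=ny`
// parses to { citySlug: 'ny-hudson', cityName: 'Hudson', stateCode: 'NY',
// countryCode: 'US' }; state/country codes are uppercased and the country
// defaults to 'US'.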

function printUsage() {
  console.log(`
Usage:
  npm run discovery:dt:cities:manual -- --city-slug=<slug> --city-name=<name> --state-code=<state>

Required arguments:
  --city-slug     URL slug for the city (e.g., "ny-hudson", "ma-boston")
  --city-name     Display name (e.g., "Hudson", "Boston")
  --state-code    State/province code (e.g., "NY", "CA", "ON")

Optional arguments:
  --country-code  Country code (default: "US")

Examples:
  npm run discovery:dt:cities:manual -- --city-slug=ny-hudson --city-name=Hudson --state-code=NY
  npm run discovery:dt:cities:manual -- --city-slug=ca-los-angeles --city-name="Los Angeles" --state-code=CA
  npm run discovery:dt:cities:manual -- --city-slug=on-toronto --city-name=Toronto --state-code=ON --country-code=CA

After seeding, run location discovery:
  npm run discovery:dt:locations
`);
}

async function main() {
  const args = parseArgs();

  console.log('╔══════════════════════════════════════════════════╗');
  console.log('║   Dutchie City Discovery (MANUAL SEED)           ║');
  console.log('╚══════════════════════════════════════════════════╝');

  if (!args.citySlug || !args.cityName || !args.stateCode) {
    console.error('\n❌ Error: Missing required arguments\n');
    printUsage();
    process.exit(1);
  }

  console.log(`\nCity Slug: ${args.citySlug}`);
  console.log(`City Name: ${args.cityName}`);
  console.log(`State Code: ${args.stateCode}`);
  console.log(`Country Code: ${args.countryCode}`);
  console.log(`Database: ${DB_URL.replace(/:[^:@]+@/, ':****@')}`);

  const pool = new Pool({ connectionString: DB_URL });

  try {
    const { rows } = await pool.query('SELECT NOW() as time');
    console.log(`\nConnected at: ${rows[0].time}`);

    const service = new DtCityDiscoveryService(pool);

    const city: DutchieCity = {
      slug: args.citySlug,
      name: args.cityName,
      stateCode: args.stateCode,
      countryCode: args.countryCode,
    };

    const result = await service.seedCity(city);

    const action = result.wasInserted ? 'INSERTED' : 'UPDATED';
    console.log(`\n✅ City ${action}:`);
    console.log(`   ID: ${result.id}`);
    console.log(`   City Slug: ${result.city.slug}`);
    console.log(`   City Name: ${result.city.name}`);
    console.log(`   State Code: ${result.city.stateCode}`);
    console.log(`   Country Code: ${result.city.countryCode}`);

    const stats = await service.getStats();
    console.log(`\nTotal Dutchie cities: ${stats.total} (${stats.crawlEnabled} enabled)`);

    console.log('\n📍 Next step: Run location discovery');
    console.log('   npm run discovery:dt:locations');

    process.exit(0);
  } catch (error: any) {
    console.error('\n❌ Failed to seed city:', error.message);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();
73
backend/src/dutchie-az/discovery/discovery-dt-cities.ts
Normal file
@@ -0,0 +1,73 @@
#!/usr/bin/env npx tsx
/**
 * Discovery Runner: Dutchie Cities
 *
 * Discovers cities from Dutchie's /cities page and upserts to dutchie_discovery_cities.
 *
 * Usage:
 *   npm run discovery:platforms:dt:cities
 *   DATABASE_URL="..." npx tsx src/dutchie-az/discovery/discovery-dt-cities.ts
 */

import { Pool } from 'pg';
import { DutchieCityDiscovery } from './DutchieCityDiscovery';

const DB_URL = process.env.DATABASE_URL || process.env.CANNAIQ_DB_URL ||
  'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';

async function main() {
  console.log('╔══════════════════════════════════════════════════╗');
  console.log('║   Dutchie City Discovery Runner                  ║');
  console.log('╚══════════════════════════════════════════════════╝');
  console.log(`\nDatabase: ${DB_URL.replace(/:[^:@]+@/, ':****@')}`);

  const pool = new Pool({ connectionString: DB_URL });

  try {
    // Test DB connection
    const { rows } = await pool.query('SELECT NOW() as time');
    console.log(`Connected at: ${rows[0].time}\n`);

    // Run city discovery
    const discovery = new DutchieCityDiscovery(pool);
    const result = await discovery.run();

    // Print summary
    console.log('\n' + '═'.repeat(50));
    console.log('SUMMARY');
    console.log('═'.repeat(50));
    console.log(`Cities found: ${result.citiesFound}`);
    console.log(`Cities inserted: ${result.citiesInserted}`);
    console.log(`Cities updated: ${result.citiesUpdated}`);
    console.log(`Errors: ${result.errors.length}`);
    console.log(`Duration: ${(result.durationMs / 1000).toFixed(1)}s`);

    if (result.errors.length > 0) {
      console.log('\nErrors:');
      result.errors.forEach((e, i) => console.log(`  ${i + 1}. ${e}`));
    }

    // Get final stats
    const stats = await discovery.getStats();
    console.log('\nCurrent Database Stats:');
    console.log(`  Total cities: ${stats.total}`);
    console.log(`  Crawl enabled: ${stats.crawlEnabled}`);
    console.log(`  Never crawled: ${stats.neverCrawled}`);
    console.log(`  By country: ${stats.byCountry.map(c => `${c.countryCode}=${c.count}`).join(', ')}`);

    if (result.errors.length > 0) {
      console.log('\n⚠️  Completed with errors');
      process.exit(1);
    }

    console.log('\n✅ City discovery completed successfully');
    process.exit(0);
  } catch (error: any) {
    console.error('\n❌ City discovery failed:', error.message);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();
@@ -0,0 +1,113 @@
#!/usr/bin/env npx tsx
/**
 * Discovery Entrypoint: Dutchie Locations (From Cities)
 *
 * Reads from dutchie_discovery_cities (crawl_enabled = true)
 * and discovers store locations for each city.
 *
 * Geo coordinates are captured when available from Dutchie's payloads.
 *
 * Usage:
 *   npm run discovery:dt:locations
 *   npm run discovery:dt:locations -- --limit=10
 *   npm run discovery:dt:locations -- --delay=3000
 *   DATABASE_URL="..." npx tsx src/dutchie-az/discovery/discovery-dt-locations-from-cities.ts
 *
 * Options:
 *   --limit=N   Only process N cities (default: all)
 *   --delay=N   Delay between cities in ms (default: 2000)
 */

import { Pool } from 'pg';
import { DtLocationDiscoveryService } from './DtLocationDiscoveryService';

const DB_URL = process.env.DATABASE_URL || process.env.CANNAIQ_DB_URL ||
  'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';

function parseArgs(): { limit?: number; delay?: number } {
  const args: { limit?: number; delay?: number } = {};

  for (const arg of process.argv.slice(2)) {
    const limitMatch = arg.match(/--limit=(\d+)/);
    if (limitMatch) args.limit = parseInt(limitMatch[1], 10);

    const delayMatch = arg.match(/--delay=(\d+)/);
    if (delayMatch) args.delay = parseInt(delayMatch[1], 10);
  }

  return args;
}

async function main() {
  const args = parseArgs();

  console.log('╔══════════════════════════════════════════════════╗');
  console.log('║   Dutchie Location Discovery (From Cities)       ║');
  console.log('║   Reads crawl_enabled cities, discovers stores   ║');
  console.log('╚══════════════════════════════════════════════════╝');
  console.log(`\nDatabase: ${DB_URL.replace(/:[^:@]+@/, ':****@')}`);
  if (args.limit) console.log(`City limit: ${args.limit}`);
  if (args.delay) console.log(`Delay: ${args.delay}ms`);

  const pool = new Pool({ connectionString: DB_URL });

  try {
    const { rows } = await pool.query('SELECT NOW() as time');
    console.log(`Connected at: ${rows[0].time}\n`);

    const service = new DtLocationDiscoveryService(pool);
    const result = await service.discoverAllEnabled({
      limit: args.limit,
      delayMs: args.delay ?? 2000,
    });

    console.log('\n' + '═'.repeat(50));
    console.log('SUMMARY');
    console.log('═'.repeat(50));
    console.log(`Cities processed: ${result.totalCities}`);
    console.log(`Locations found: ${result.totalLocationsFound}`);
    console.log(`Locations inserted: ${result.totalInserted}`);
    console.log(`Locations updated: ${result.totalUpdated}`);
    console.log(`Locations skipped: ${result.totalSkipped} (protected status)`);
    console.log(`Errors: ${result.errors.length}`);
    console.log(`Duration: ${(result.durationMs / 1000).toFixed(1)}s`);

    if (result.errors.length > 0) {
      console.log('\nErrors (first 10):');
      result.errors.slice(0, 10).forEach((e, i) => console.log(`  ${i + 1}. ${e}`));
      if (result.errors.length > 10) {
        console.log(`  ... and ${result.errors.length - 10} more`);
      }
    }

    // Get location stats including coordinates
    const stats = await service.getStats();
    console.log('\nCurrent Database Stats:');
    console.log(`  Total locations: ${stats.total}`);
    console.log(`  With coordinates: ${stats.withCoordinates}`);
    console.log(`  By status:`);
    stats.byStatus.forEach(s => console.log(`    ${s.status}: ${s.count}`));

    if (result.totalCities === 0) {
      console.log('\n⚠️  No crawl-enabled cities found.');
      console.log('   Seed cities first:');
      console.log('   npm run discovery:dt:cities:manual -- --city-slug=ny-hudson --city-name=Hudson --state-code=NY');
      process.exit(1);
    }

    if (result.errors.length > 0) {
      console.log('\n⚠️  Completed with errors');
      process.exit(1);
    }

    console.log('\n✅ Location discovery completed successfully');
    process.exit(0);
  } catch (error: any) {
    console.error('\n❌ Location discovery failed:', error.message);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();
117
backend/src/dutchie-az/discovery/discovery-dt-locations.ts
Normal file
@@ -0,0 +1,117 @@
#!/usr/bin/env npx tsx
/**
 * Discovery Runner: Dutchie Locations
 *
 * Discovers store locations for all crawl-enabled cities and upserts to dutchie_discovery_locations.
 *
 * Usage:
 *   npm run discovery:platforms:dt:locations
 *   npm run discovery:platforms:dt:locations -- --limit=10
 *   DATABASE_URL="..." npx tsx src/dutchie-az/discovery/discovery-dt-locations.ts
 *
 * Options (via args):
 *   --limit=N   Only process N cities (default: all)
 *   --delay=N   Delay between cities in ms (default: 2000)
 */

import { Pool } from 'pg';
import { DutchieLocationDiscovery } from './DutchieLocationDiscovery';

const DB_URL = process.env.DATABASE_URL || process.env.CANNAIQ_DB_URL ||
  'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';

// Parse CLI args
function parseArgs(): { limit?: number; delay?: number } {
  const args: { limit?: number; delay?: number } = {};

  for (const arg of process.argv.slice(2)) {
    const limitMatch = arg.match(/--limit=(\d+)/);
    if (limitMatch) args.limit = parseInt(limitMatch[1], 10);

    const delayMatch = arg.match(/--delay=(\d+)/);
    if (delayMatch) args.delay = parseInt(delayMatch[1], 10);
  }

  return args;
}

async function main() {
  const args = parseArgs();

  console.log('╔══════════════════════════════════════════════════╗');
  console.log('║   Dutchie Location Discovery Runner              ║');
  console.log('╚══════════════════════════════════════════════════╝');
  console.log(`\nDatabase: ${DB_URL.replace(/:[^:@]+@/, ':****@')}`);
  if (args.limit) console.log(`City limit: ${args.limit}`);
  if (args.delay) console.log(`Delay: ${args.delay}ms`);

  const pool = new Pool({ connectionString: DB_URL });

  try {
    // Test DB connection
    const { rows } = await pool.query('SELECT NOW() as time');
    console.log(`Connected at: ${rows[0].time}\n`);

    // Run location discovery
    const discovery = new DutchieLocationDiscovery(pool);
    const result = await discovery.discoverAllEnabled({
      limit: args.limit,
      delayMs: args.delay ?? 2000,
    });

    // Print summary
    console.log('\n' + '═'.repeat(50));
    console.log('SUMMARY');
    console.log('═'.repeat(50));
    console.log(`Cities processed: ${result.totalCities}`);
    console.log(`Locations found: ${result.totalLocationsFound}`);
    console.log(`Locations inserted: ${result.totalInserted}`);
    console.log(`Locations updated: ${result.totalUpdated}`);
    console.log(`Locations skipped: ${result.totalSkipped} (protected status)`);
    console.log(`Errors: ${result.errors.length}`);
    console.log(`Duration: ${(result.durationMs / 1000).toFixed(1)}s`);

    if (result.errors.length > 0) {
      console.log('\nErrors (first 10):');
      result.errors.slice(0, 10).forEach((e, i) => console.log(`  ${i + 1}. ${e}`));
      if (result.errors.length > 10) {
        console.log(`  ... and ${result.errors.length - 10} more`);
      }
    }

    // Get DB counts
    const { rows: countRows } = await pool.query(`
      SELECT
        COUNT(*) as total,
        COUNT(*) FILTER (WHERE status = 'discovered') as discovered,
        COUNT(*) FILTER (WHERE status = 'verified') as verified,
        COUNT(*) FILTER (WHERE status = 'merged') as merged,
        COUNT(*) FILTER (WHERE status = 'rejected') as rejected
      FROM dutchie_discovery_locations
      WHERE platform = 'dutchie' AND active = TRUE
    `);
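
    // Note (added): COUNT(*) FILTER (WHERE ...) is PostgreSQL's per-aggregate
    // filter clause, so the query above computes the total plus one count per
    // status in a single scan instead of five separate COUNT queries.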

    const counts = countRows[0];
    console.log('\nCurrent Database Stats:');
    console.log(`  Total locations: ${counts.total}`);
    console.log(`  Status discovered: ${counts.discovered}`);
    console.log(`  Status verified: ${counts.verified}`);
    console.log(`  Status merged: ${counts.merged}`);
    console.log(`  Status rejected: ${counts.rejected}`);

    if (result.errors.length > 0) {
      console.log('\n⚠️  Completed with errors');
      process.exit(1);
    }

    console.log('\n✅ Location discovery completed successfully');
    process.exit(0);
  } catch (error: any) {
    console.error('\n❌ Location discovery failed:', error.message);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();
10
backend/src/dutchie-az/discovery/index.ts
Normal file
@@ -0,0 +1,10 @@
/**
 * Dutchie Discovery Module
 *
 * Store discovery pipeline for the Dutchie platform.
 */

export { DutchieCityDiscovery } from './DutchieCityDiscovery';
export { DutchieLocationDiscovery } from './DutchieLocationDiscovery';
export { createDutchieDiscoveryRoutes } from './routes';
export { promoteDiscoveryLocation } from './promoteDiscoveryLocation';
248
backend/src/dutchie-az/discovery/promoteDiscoveryLocation.ts
Normal file
@@ -0,0 +1,248 @@
/**
 * Promote Discovery Location to Crawlable Dispensary
 *
 * When a discovery location is verified or merged:
 * 1. Ensure a crawl profile exists for the dispensary
 * 2. Seed/update crawl schedule
 * 3. Create initial crawl job
 */

import { Pool } from 'pg';

export interface PromotionResult {
  success: boolean;
  discoveryId: number;
  dispensaryId: number;
  crawlProfileId?: number;
  scheduleUpdated?: boolean;
  crawlJobCreated?: boolean;
  error?: string;
}

/**
 * Promote a verified/merged discovery location to a crawlable dispensary.
 *
 * This function:
 * 1. Verifies the discovery location is verified/merged and has a dispensary_id
 * 2. Ensures the dispensary has platform info (menu_type, platform_dispensary_id)
 * 3. Creates/updates a crawler profile if the profile table exists
 * 4. Queues an initial crawl job
 */
export async function promoteDiscoveryLocation(
  pool: Pool,
  discoveryLocationId: number
): Promise<PromotionResult> {
  console.log(`[Promote] Starting promotion for discovery location ${discoveryLocationId}...`);

  // Get the discovery location
  const { rows: locRows } = await pool.query(
    `
    SELECT
      dl.*,
      d.id as disp_id,
      d.name as disp_name,
      d.menu_type as disp_menu_type,
      d.platform_dispensary_id as disp_platform_id
    FROM dutchie_discovery_locations dl
    JOIN dispensaries d ON dl.dispensary_id = d.id
    WHERE dl.id = $1
    `,
    [discoveryLocationId]
  );

  if (locRows.length === 0) {
    return {
      success: false,
      discoveryId: discoveryLocationId,
      dispensaryId: 0,
      error: 'Discovery location not found or not linked to a dispensary',
    };
  }

  const location = locRows[0];

  // Verify status
  if (!['verified', 'merged'].includes(location.status)) {
    return {
      success: false,
      discoveryId: discoveryLocationId,
      dispensaryId: location.dispensary_id || 0,
      error: `Cannot promote: location status is '${location.status}', must be 'verified' or 'merged'`,
    };
  }

  const dispensaryId = location.dispensary_id;
  console.log(`[Promote] Location ${discoveryLocationId} -> Dispensary ${dispensaryId} (${location.disp_name})`);

  // Ensure dispensary has platform info
  if (!location.disp_platform_id) {
    console.log(`[Promote] Updating dispensary with platform info...`);
    await pool.query(
      `
      UPDATE dispensaries
      SET platform_dispensary_id = COALESCE(platform_dispensary_id, $1),
          menu_url = COALESCE(menu_url, $2),
          menu_type = COALESCE(menu_type, 'dutchie'),
          updated_at = NOW()
      WHERE id = $3
      `,
      [location.platform_location_id, location.platform_menu_url, dispensaryId]
    );
  }

  let crawlProfileId: number | undefined;
  let scheduleUpdated = false;
  let crawlJobCreated = false;
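
  // Note (added): scheduleUpdated corresponds to step 2 in the header comment
  // ("seed/update crawl schedule"), but nothing below sets it yet, so the
  // returned value is currently always false.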

  // Check if dispensary_crawler_profiles table exists
  const { rows: tableCheck } = await pool.query(`
    SELECT EXISTS (
      SELECT FROM information_schema.tables
      WHERE table_name = 'dispensary_crawler_profiles'
    ) as exists
  `);

  if (tableCheck[0]?.exists) {
    // Create or get crawler profile
    console.log(`[Promote] Checking crawler profile...`);

    const { rows: profileRows } = await pool.query(
      `
      SELECT id FROM dispensary_crawler_profiles
      WHERE dispensary_id = $1 AND platform = 'dutchie'
      `,
      [dispensaryId]
    );

    if (profileRows.length > 0) {
      crawlProfileId = profileRows[0].id;
      console.log(`[Promote] Using existing profile ${crawlProfileId}`);
    } else {
      // Create new profile
      const profileKey = `dutchie-${location.platform_slug}`;
      const { rows: newProfile } = await pool.query(
        `
        INSERT INTO dispensary_crawler_profiles (
          dispensary_id,
          profile_key,
          profile_name,
          platform,
          config,
          status,
          enabled,
          created_at,
          updated_at
        ) VALUES (
          $1, $2, $3, 'dutchie', $4, 'sandbox', TRUE, NOW(), NOW()
        )
        ON CONFLICT (dispensary_id, platform) DO UPDATE SET
          enabled = TRUE,
          updated_at = NOW()
        RETURNING id
        `,
        [
          dispensaryId,
          profileKey,
          `${location.name} (Dutchie)`,
          JSON.stringify({
            platformDispensaryId: location.platform_location_id,
            platformSlug: location.platform_slug,
            menuUrl: location.platform_menu_url,
            pricingType: 'rec',
            useBothModes: true,
          }),
        ]
      );

      crawlProfileId = newProfile[0]?.id;
      console.log(`[Promote] Created new profile ${crawlProfileId}`);
    }

    // Link profile to dispensary if not already linked
    await pool.query(
      `
      UPDATE dispensaries
      SET active_crawler_profile_id = COALESCE(active_crawler_profile_id, $1),
          updated_at = NOW()
      WHERE id = $2
      `,
      [crawlProfileId, dispensaryId]
    );
  }

  // Check if crawl_jobs table exists and create initial job
  const { rows: jobsTableCheck } = await pool.query(`
    SELECT EXISTS (
      SELECT FROM information_schema.tables
      WHERE table_name = 'crawl_jobs'
    ) as exists
  `);

  if (jobsTableCheck[0]?.exists) {
    // Check if there's already a pending job
    const { rows: existingJobs } = await pool.query(
      `
      SELECT id FROM crawl_jobs
      WHERE dispensary_id = $1 AND status IN ('pending', 'running')
      LIMIT 1
      `,
      [dispensaryId]
    );

    if (existingJobs.length === 0) {
      // Create initial crawl job
      console.log(`[Promote] Creating initial crawl job...`);
      await pool.query(
        `
        INSERT INTO crawl_jobs (
          dispensary_id,
          job_type,
          status,
          priority,
          config,
          created_at,
          updated_at
        ) VALUES (
          $1, 'dutchie_product_crawl', 'pending', 1, $2, NOW(), NOW()
        )
        `,
        [
          dispensaryId,
          JSON.stringify({
            source: 'discovery_promotion',
            discoveryLocationId,
            pricingType: 'rec',
            useBothModes: true,
          }),
        ]
      );
      crawlJobCreated = true;
    } else {
      console.log(`[Promote] Crawl job already exists for dispensary`);
    }
  }

  // Update discovery location notes
  await pool.query(
    `
    UPDATE dutchie_discovery_locations
    SET notes = COALESCE(notes || E'\n', '') || $1,
        updated_at = NOW()
    WHERE id = $2
    `,
    [`Promoted to crawlable at ${new Date().toISOString()}`, discoveryLocationId]
  );

  console.log(`[Promote] Promotion complete for discovery location ${discoveryLocationId}`);

  return {
    success: true,
    discoveryId: discoveryLocationId,
    dispensaryId,
    crawlProfileId,
    scheduleUpdated,
    crawlJobCreated,
  };
}

export default promoteDiscoveryLocation;
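
// Usage sketch (added, hedged): promoting a reviewed location, e.g. from an
// admin verify/merge handler. Assumes the location row already has status
// 'verified' or 'merged' and a linked dispensary_id.
//
//   const result = await promoteDiscoveryLocation(pool, discoveryLocationId);
//   if (!result.success) throw new Error(result.error);
//   console.log(`profile=${result.crawlProfileId} jobCreated=${result.crawlJobCreated}`);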
973
backend/src/dutchie-az/discovery/routes.ts
Normal file
@@ -0,0 +1,973 @@
/**
 * Platform Discovery API Routes (DT = Dutchie)
 *
 * Routes for the platform-specific store discovery pipeline.
 * Mount at /api/discovery/platforms/dt
 *
 * Platform Slug Mapping (for trademark-safe URLs):
 *   dt = Dutchie
 *   jn = Jane (future)
 *   wm = Weedmaps (future)
 *   lf = Leafly (future)
 *   tz = Treez (future)
 *
 * Note: The actual platform value stored in the DB remains 'dutchie'.
 * Only the URL paths use neutral slugs.
 */

import { Router, Request, Response } from 'express';
import { Pool } from 'pg';
import { DutchieCityDiscovery } from './DutchieCityDiscovery';
import { DutchieLocationDiscovery } from './DutchieLocationDiscovery';
import { DiscoveryGeoService } from '../../services/DiscoveryGeoService';
import { GeoValidationService } from '../../services/GeoValidationService';

export function createDutchieDiscoveryRoutes(pool: Pool): Router {
  const router = Router();

  // ============================================================
  // LOCATIONS
  // ============================================================

  /**
   * GET /api/discovery/platforms/dt/locations
   *
   * List discovered locations with filtering.
   *
   * Query params:
   * - status: 'discovered' | 'verified' | 'rejected' | 'merged'
   * - state_code: e.g., 'AZ', 'CA'
   * - country_code: 'US' | 'CA'
   * - unlinked_only: 'true' to show only locations without dispensary_id
   * - search: search by name
   * - limit: number (default 50)
   * - offset: number (default 0)
   */
  router.get('/locations', async (req: Request, res: Response) => {
    try {
      const {
        status,
        state_code,
        country_code,
        unlinked_only,
        search,
        limit = '50',
        offset = '0',
      } = req.query;

      // Qualify columns with the `dl` alias: the main query joins dispensaries,
      // which also has a `name` column, so unqualified references would be
      // ambiguous when the search filter is used.
      let whereClause = "WHERE dl.platform = 'dutchie' AND dl.active = TRUE";
      const params: any[] = [];
      let paramIndex = 1;

      if (status) {
        whereClause += ` AND dl.status = $${paramIndex}`;
        params.push(status);
        paramIndex++;
      }

      if (state_code) {
        whereClause += ` AND dl.state_code = $${paramIndex}`;
        params.push(state_code);
        paramIndex++;
      }

      if (country_code) {
        whereClause += ` AND dl.country_code = $${paramIndex}`;
        params.push(country_code);
        paramIndex++;
      }

      if (unlinked_only === 'true') {
        whereClause += ' AND dl.dispensary_id IS NULL';
      }

      if (search) {
        whereClause += ` AND (dl.name ILIKE $${paramIndex} OR dl.platform_slug ILIKE $${paramIndex})`;
        params.push(`%${search}%`);
        paramIndex++;
      }

      const limitVal = parseInt(limit as string, 10);
      const offsetVal = parseInt(offset as string, 10);
      params.push(limitVal, offsetVal);

      const { rows } = await pool.query(
        `
        SELECT
          dl.id,
          dl.platform,
          dl.platform_location_id,
          dl.platform_slug,
          dl.platform_menu_url,
          dl.name,
          dl.raw_address,
          dl.address_line1,
          dl.city,
          dl.state_code,
          dl.postal_code,
          dl.country_code,
          dl.latitude,
          dl.longitude,
          dl.status,
          dl.dispensary_id,
          dl.offers_delivery,
          dl.offers_pickup,
          dl.is_recreational,
          dl.is_medical,
          dl.first_seen_at,
          dl.last_seen_at,
          dl.verified_at,
          dl.verified_by,
          dl.notes,
          d.name as dispensary_name
        FROM dutchie_discovery_locations dl
        LEFT JOIN dispensaries d ON dl.dispensary_id = d.id
        ${whereClause}
        ORDER BY dl.first_seen_at DESC
        LIMIT $${paramIndex} OFFSET $${paramIndex + 1}
        `,
        params
      );

      // Get total count (drop the trailing limit/offset params)
      const countParams = params.slice(0, -2);
      const { rows: countRows } = await pool.query(
        `SELECT COUNT(*) as total FROM dutchie_discovery_locations dl ${whereClause}`,
        countParams
      );

      res.json({
        success: true,
        locations: rows.map((r) => ({
          id: r.id,
          platform: r.platform,
          platformLocationId: r.platform_location_id,
          platformSlug: r.platform_slug,
          platformMenuUrl: r.platform_menu_url,
          name: r.name,
          rawAddress: r.raw_address,
          addressLine1: r.address_line1,
          city: r.city,
          stateCode: r.state_code,
          postalCode: r.postal_code,
          countryCode: r.country_code,
          latitude: r.latitude,
          longitude: r.longitude,
          status: r.status,
          dispensaryId: r.dispensary_id,
          dispensaryName: r.dispensary_name,
          offersDelivery: r.offers_delivery,
          offersPickup: r.offers_pickup,
          isRecreational: r.is_recreational,
          isMedical: r.is_medical,
          firstSeenAt: r.first_seen_at,
          lastSeenAt: r.last_seen_at,
          verifiedAt: r.verified_at,
          verifiedBy: r.verified_by,
          notes: r.notes,
        })),
        total: parseInt(countRows[0]?.total || '0', 10),
        limit: limitVal,
        offset: offsetVal,
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error fetching locations:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });
  /**
   * GET /api/discovery/platforms/dt/locations/:id
   *
   * Get a single location by ID.
   */
  router.get('/locations/:id', async (req: Request, res: Response) => {
    try {
      const { id } = req.params;

      const { rows } = await pool.query(
        `
        SELECT
          dl.*,
          d.name as dispensary_name,
          d.menu_url as dispensary_menu_url
        FROM dutchie_discovery_locations dl
        LEFT JOIN dispensaries d ON dl.dispensary_id = d.id
        WHERE dl.id = $1
        `,
        [parseInt(id, 10)]
      );

      if (rows.length === 0) {
        return res.status(404).json({ success: false, error: 'Location not found' });
      }

      const r = rows[0];
      res.json({
        success: true,
        location: {
          id: r.id,
          platform: r.platform,
          platformLocationId: r.platform_location_id,
          platformSlug: r.platform_slug,
          platformMenuUrl: r.platform_menu_url,
          name: r.name,
          rawAddress: r.raw_address,
          addressLine1: r.address_line1,
          addressLine2: r.address_line2,
          city: r.city,
          stateCode: r.state_code,
          postalCode: r.postal_code,
          countryCode: r.country_code,
          latitude: r.latitude,
          longitude: r.longitude,
          timezone: r.timezone,
          status: r.status,
          dispensaryId: r.dispensary_id,
          dispensaryName: r.dispensary_name,
          dispensaryMenuUrl: r.dispensary_menu_url,
          offersDelivery: r.offers_delivery,
          offersPickup: r.offers_pickup,
          isRecreational: r.is_recreational,
          isMedical: r.is_medical,
          firstSeenAt: r.first_seen_at,
          lastSeenAt: r.last_seen_at,
          verifiedAt: r.verified_at,
          verifiedBy: r.verified_by,
          notes: r.notes,
          metadata: r.metadata,
        },
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error fetching location:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  // ============================================================
  // VERIFICATION ACTIONS
  // ============================================================

  /**
   * POST /api/discovery/platforms/dt/locations/:id/verify-create
   *
   * Verify a discovered location and create a new canonical dispensary.
   */
  router.post('/locations/:id/verify-create', async (req: Request, res: Response) => {
    const client = await pool.connect();
    try {
      const { id } = req.params;
      const { verifiedBy = 'admin' } = req.body;

      await client.query('BEGIN');

      // Get the discovery location
      const { rows: locRows } = await client.query(
        `SELECT * FROM dutchie_discovery_locations WHERE id = $1 FOR UPDATE`,
        [parseInt(id, 10)]
      );

      if (locRows.length === 0) {
        await client.query('ROLLBACK');
        return res.status(404).json({ success: false, error: 'Location not found' });
      }

      const location = locRows[0];

      if (location.status !== 'discovered') {
        await client.query('ROLLBACK');
        return res.status(400).json({
          success: false,
          error: `Cannot verify: location status is '${location.status}'`,
        });
      }

      // Look up state_id if we have a state_code
      let stateId: number | null = null;
      if (location.state_code) {
        const { rows: stateRows } = await client.query(
          `SELECT id FROM states WHERE code = $1`,
          [location.state_code]
        );
        if (stateRows.length > 0) {
          stateId = stateRows[0].id;
        }
      }

      // Create the canonical dispensary
      const { rows: dispRows } = await client.query(
        `
        INSERT INTO dispensaries (
          name,
          slug,
          address,
          city,
          state,
          zip,
          latitude,
          longitude,
          timezone,
          menu_type,
          menu_url,
          platform_dispensary_id,
          state_id,
          active,
          created_at,
          updated_at
        ) VALUES (
          $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, TRUE, NOW(), NOW()
        )
        RETURNING id
        `,
        [
          location.name,
          location.platform_slug,
          location.address_line1,
          location.city,
          location.state_code,
          location.postal_code,
          location.latitude,
          location.longitude,
          location.timezone,
          'dutchie',
          location.platform_menu_url,
          location.platform_location_id,
          stateId,
        ]
      );

      const dispensaryId = dispRows[0].id;

      // Update the discovery location
      await client.query(
        `
        UPDATE dutchie_discovery_locations
        SET status = 'verified',
            dispensary_id = $1,
            verified_at = NOW(),
            verified_by = $2,
            updated_at = NOW()
        WHERE id = $3
        `,
        [dispensaryId, verifiedBy, id]
      );

      await client.query('COMMIT');

      res.json({
        success: true,
        action: 'created',
        discoveryId: parseInt(id, 10),
        dispensaryId,
        message: `Created new dispensary (ID: ${dispensaryId})`,
      });
    } catch (error: any) {
      await client.query('ROLLBACK');
      console.error('[Discovery Routes] Error in verify-create:', error);
      res.status(500).json({ success: false, error: error.message });
    } finally {
      client.release();
    }
  });
  /**
   * POST /api/discovery/platforms/dt/locations/:id/verify-link
   *
   * Link a discovered location to an existing dispensary.
   *
   * Body:
   *   - dispensaryId: number (required)
   *   - verifiedBy: string (optional)
   */
  router.post('/locations/:id/verify-link', async (req: Request, res: Response) => {
    const client = await pool.connect();
    try {
      const { id } = req.params;
      const { dispensaryId, verifiedBy = 'admin' } = req.body;

      if (!dispensaryId) {
        return res.status(400).json({ success: false, error: 'dispensaryId is required' });
      }

      await client.query('BEGIN');

      // Verify dispensary exists
      const { rows: dispRows } = await client.query(
        `SELECT id, name FROM dispensaries WHERE id = $1`,
        [dispensaryId]
      );

      if (dispRows.length === 0) {
        await client.query('ROLLBACK');
        return res.status(404).json({ success: false, error: 'Dispensary not found' });
      }

      // Get the discovery location
      const { rows: locRows } = await client.query(
        `SELECT * FROM dutchie_discovery_locations WHERE id = $1 FOR UPDATE`,
        [parseInt(id, 10)]
      );

      if (locRows.length === 0) {
        await client.query('ROLLBACK');
        return res.status(404).json({ success: false, error: 'Location not found' });
      }

      const location = locRows[0];

      if (location.status !== 'discovered') {
        await client.query('ROLLBACK');
        return res.status(400).json({
          success: false,
          error: `Cannot link: location status is '${location.status}'`,
        });
      }

      // Update dispensary with platform info if missing
      await client.query(
        `
        UPDATE dispensaries
        SET platform_dispensary_id = COALESCE(platform_dispensary_id, $1),
            menu_url = COALESCE(menu_url, $2),
            menu_type = COALESCE(menu_type, 'dutchie'),
            updated_at = NOW()
        WHERE id = $3
        `,
        [location.platform_location_id, location.platform_menu_url, dispensaryId]
      );

      // Update the discovery location
      await client.query(
        `
        UPDATE dutchie_discovery_locations
        SET status = 'merged',
            dispensary_id = $1,
            verified_at = NOW(),
            verified_by = $2,
            updated_at = NOW()
        WHERE id = $3
        `,
        [dispensaryId, verifiedBy, id]
      );

      await client.query('COMMIT');

      res.json({
        success: true,
        action: 'linked',
        discoveryId: parseInt(id, 10),
        dispensaryId,
        dispensaryName: dispRows[0].name,
        message: `Linked to existing dispensary: ${dispRows[0].name}`,
      });
    } catch (error: any) {
      await client.query('ROLLBACK');
      console.error('[Discovery Routes] Error in verify-link:', error);
      res.status(500).json({ success: false, error: error.message });
    } finally {
      client.release();
    }
  });

  /**
   * POST /api/discovery/platforms/dt/locations/:id/reject
   *
   * Reject a discovered location.
   *
   * Body:
   *   - reason: string (optional)
   *   - verifiedBy: string (optional)
   */
  router.post('/locations/:id/reject', async (req: Request, res: Response) => {
    try {
      const { id } = req.params;
      const { reason, verifiedBy = 'admin' } = req.body;

      // Get current status
      const { rows } = await pool.query(
        `SELECT status FROM dutchie_discovery_locations WHERE id = $1`,
        [parseInt(id, 10)]
      );

      if (rows.length === 0) {
        return res.status(404).json({ success: false, error: 'Location not found' });
      }

      if (rows[0].status !== 'discovered') {
        return res.status(400).json({
          success: false,
          error: `Cannot reject: location status is '${rows[0].status}'`,
        });
      }

      await pool.query(
        `
        UPDATE dutchie_discovery_locations
        SET status = 'rejected',
            verified_at = NOW(),
            verified_by = $1,
            notes = COALESCE($2, notes),
            updated_at = NOW()
        WHERE id = $3
        `,
        [verifiedBy, reason, id]
      );

      res.json({
        success: true,
        action: 'rejected',
        discoveryId: parseInt(id, 10),
        message: 'Location rejected',
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error in reject:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  /**
   * POST /api/discovery/platforms/dt/locations/:id/unreject
   *
   * Restore a rejected location to discovered status.
   */
  router.post('/locations/:id/unreject', async (req: Request, res: Response) => {
    try {
      const { id } = req.params;

      // Get current status
      const { rows } = await pool.query(
        `SELECT status FROM dutchie_discovery_locations WHERE id = $1`,
        [parseInt(id, 10)]
      );

      if (rows.length === 0) {
        return res.status(404).json({ success: false, error: 'Location not found' });
      }

      if (rows[0].status !== 'rejected') {
        return res.status(400).json({
          success: false,
          error: `Cannot unreject: location status is '${rows[0].status}'`,
        });
      }

      await pool.query(
        `
        UPDATE dutchie_discovery_locations
        SET status = 'discovered',
            verified_at = NULL,
            verified_by = NULL,
            updated_at = NOW()
        WHERE id = $1
        `,
        [id]
      );

      res.json({
        success: true,
        action: 'unrejected',
        discoveryId: parseInt(id, 10),
        message: 'Location restored to discovered status',
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error in unreject:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });
  // ============================================================
  // SUMMARY / REPORTING
  // ============================================================

  /**
   * GET /api/discovery/platforms/dt/summary
   *
   * Get discovery summary statistics.
   */
  router.get('/summary', async (_req: Request, res: Response) => {
    try {
      // Total counts by status
      const { rows: statusRows } = await pool.query(`
        SELECT status, COUNT(*) as cnt
        FROM dutchie_discovery_locations
        WHERE platform = 'dutchie' AND active = TRUE
        GROUP BY status
      `);

      const statusCounts: Record<string, number> = {};
      let totalLocations = 0;
      for (const row of statusRows) {
        statusCounts[row.status] = parseInt(row.cnt, 10);
        totalLocations += parseInt(row.cnt, 10);
      }

      // By state
      const { rows: stateRows } = await pool.query(`
        SELECT
          state_code,
          COUNT(*) as total,
          COUNT(*) FILTER (WHERE status = 'verified') as verified,
          COUNT(*) FILTER (WHERE dispensary_id IS NULL AND status = 'discovered') as unlinked
        FROM dutchie_discovery_locations
        WHERE platform = 'dutchie' AND active = TRUE AND state_code IS NOT NULL
        GROUP BY state_code
        ORDER BY total DESC
      `);

      res.json({
        success: true,
        summary: {
          total_locations: totalLocations,
          discovered: statusCounts['discovered'] || 0,
          verified: statusCounts['verified'] || 0,
          merged: statusCounts['merged'] || 0,
          rejected: statusCounts['rejected'] || 0,
        },
        by_state: stateRows.map((r) => ({
          state_code: r.state_code,
          total: parseInt(r.total, 10),
          verified: parseInt(r.verified, 10),
          unlinked: parseInt(r.unlinked, 10),
        })),
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error in summary:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  // ============================================================
  // CITIES
  // ============================================================

  /**
   * GET /api/discovery/platforms/dt/cities
   *
   * List discovery cities.
   */
  router.get('/cities', async (req: Request, res: Response) => {
    try {
      const { state_code, country_code, crawl_enabled, limit = '100', offset = '0' } = req.query;

      let whereClause = "WHERE platform = 'dutchie'";
      const params: any[] = [];
      let paramIndex = 1;

      if (state_code) {
        whereClause += ` AND state_code = $${paramIndex}`;
        params.push(state_code);
        paramIndex++;
      }

      if (country_code) {
        whereClause += ` AND country_code = $${paramIndex}`;
        params.push(country_code);
        paramIndex++;
      }

      if (crawl_enabled === 'true') {
        whereClause += ' AND crawl_enabled = TRUE';
      } else if (crawl_enabled === 'false') {
        whereClause += ' AND crawl_enabled = FALSE';
      }

      params.push(parseInt(limit as string, 10), parseInt(offset as string, 10));

      const { rows } = await pool.query(
        `
        SELECT
          id,
          platform,
          city_name,
          city_slug,
          state_code,
          country_code,
          last_crawled_at,
          crawl_enabled,
          location_count
        FROM dutchie_discovery_cities
        ${whereClause}
        ORDER BY country_code, state_code, city_name
        LIMIT $${paramIndex} OFFSET $${paramIndex + 1}
        `,
        params
      );

      const { rows: countRows } = await pool.query(
        `SELECT COUNT(*) as total FROM dutchie_discovery_cities ${whereClause}`,
        params.slice(0, -2)
      );

      res.json({
        success: true,
        cities: rows.map((r) => ({
          id: r.id,
          platform: r.platform,
          cityName: r.city_name,
          citySlug: r.city_slug,
          stateCode: r.state_code,
          countryCode: r.country_code,
          lastCrawledAt: r.last_crawled_at,
          crawlEnabled: r.crawl_enabled,
          locationCount: r.location_count,
        })),
        total: parseInt(countRows[0]?.total || '0', 10),
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error fetching cities:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });
  // ============================================================
  // MATCH CANDIDATES
  // ============================================================

  /**
   * GET /api/discovery/platforms/dt/locations/:id/match-candidates
   *
   * Find potential dispensary matches for a discovery location.
   */
  router.get('/locations/:id/match-candidates', async (req: Request, res: Response) => {
    try {
      const { id } = req.params;

      // Get the discovery location
      const { rows: locRows } = await pool.query(
        `SELECT * FROM dutchie_discovery_locations WHERE id = $1`,
        [parseInt(id, 10)]
      );

      if (locRows.length === 0) {
        return res.status(404).json({ success: false, error: 'Location not found' });
      }

      const location = locRows[0];

      // Find potential matches
      const { rows: candidates } = await pool.query(
        `
        SELECT
          d.id,
          d.name,
          d.city,
          d.state,
          d.address,
          d.menu_type,
          d.platform_dispensary_id,
          d.menu_url,
          d.latitude,
          d.longitude,
          CASE
            WHEN d.name ILIKE $1 THEN 'exact_name'
            WHEN d.name ILIKE $2 THEN 'partial_name'
            WHEN d.city ILIKE $3 AND d.state = $4 THEN 'same_city'
            ELSE 'location_match'
          END as match_type,
          CASE
            WHEN d.latitude IS NOT NULL AND d.longitude IS NOT NULL
              AND $5::float IS NOT NULL AND $6::float IS NOT NULL
            THEN (3959 * acos(
              LEAST(1.0, GREATEST(-1.0,
                cos(radians($5::float)) * cos(radians(d.latitude)) *
                cos(radians(d.longitude) - radians($6::float)) +
                sin(radians($5::float)) * sin(radians(d.latitude))
              ))
            ))
            ELSE NULL
          END as distance_miles
        FROM dispensaries d
        WHERE d.state = $4
          AND (
            d.name ILIKE $1
            OR d.name ILIKE $2
            OR d.city ILIKE $3
            OR (
              d.latitude IS NOT NULL
              AND d.longitude IS NOT NULL
              AND $5::float IS NOT NULL
              AND $6::float IS NOT NULL
            )
          )
        ORDER BY
          CASE
            WHEN d.name ILIKE $1 THEN 1
            WHEN d.name ILIKE $2 THEN 2
            ELSE 3
          END,
          distance_miles NULLS LAST
        LIMIT 10
        `,
        [
          location.name,
          `%${location.name.split(' ')[0]}%`,
          location.city,
          location.state_code,
          location.latitude,
          location.longitude,
        ]
      );

      res.json({
        success: true,
        location: {
          id: location.id,
          name: location.name,
          city: location.city,
          stateCode: location.state_code,
        },
        candidates: candidates.map((c) => ({
          id: c.id,
          name: c.name,
          city: c.city,
          state: c.state,
          address: c.address,
          menuType: c.menu_type,
          platformDispensaryId: c.platform_dispensary_id,
          menuUrl: c.menu_url,
          matchType: c.match_type,
          distanceMiles: c.distance_miles ? Math.round(c.distance_miles * 10) / 10 : null,
        })),
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error fetching match candidates:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  // ============================================================
  // GEO / NEARBY (Admin/Debug Only)
  // ============================================================

  /**
   * GET /api/discovery/platforms/dt/nearby
   *
   * Find discovery locations near a given coordinate.
   * This is an internal/debug endpoint for admin use.
   *
   * Query params:
   *   - lat: number (required)
   *   - lon: number (required)
   *   - radiusKm: number (optional, default 50)
   *   - limit: number (optional, default 20)
   *   - status: string (optional, filter by status)
   */
  router.get('/nearby', async (req: Request, res: Response) => {
    try {
      const { lat, lon, radiusKm = '50', limit = '20', status } = req.query;

      // Validate required params
      if (!lat || !lon) {
        return res.status(400).json({
          success: false,
          error: 'lat and lon are required query parameters',
        });
      }

      const latNum = parseFloat(lat as string);
      const lonNum = parseFloat(lon as string);
      const radiusNum = parseFloat(radiusKm as string);
      const limitNum = parseInt(limit as string, 10);

      if (isNaN(latNum) || isNaN(lonNum)) {
        return res.status(400).json({
          success: false,
          error: 'lat and lon must be valid numbers',
        });
      }

      const geoService = new DiscoveryGeoService(pool);

      const locations = await geoService.findNearbyDiscoveryLocations(latNum, lonNum, {
        radiusKm: radiusNum,
        limit: limitNum,
        platform: 'dutchie',
        status: status as string | undefined,
      });

      res.json({
        success: true,
        center: { lat: latNum, lon: lonNum },
        radiusKm: radiusNum,
        count: locations.length,
        locations,
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error in nearby:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  /**
   * GET /api/discovery/platforms/dt/geo-stats
   *
   * Get coordinate coverage statistics for discovery locations.
   * This is an internal/debug endpoint for admin use.
   */
  router.get('/geo-stats', async (_req: Request, res: Response) => {
    try {
      const geoService = new DiscoveryGeoService(pool);
      const stats = await geoService.getCoordinateCoverageStats();

      res.json({
        success: true,
        stats,
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error in geo-stats:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  /**
   * GET /api/discovery/platforms/dt/locations/:id/validate-geo
   *
   * Validate the geographic data for a discovery location.
   * This is an internal/debug endpoint for admin use.
   */
  router.get('/locations/:id/validate-geo', async (req: Request, res: Response) => {
    try {
      const { id } = req.params;

      // Get the location
      const { rows } = await pool.query(
        `SELECT latitude, longitude, state_code, country_code, name
         FROM dutchie_discovery_locations WHERE id = $1`,
        [parseInt(id, 10)]
      );

      if (rows.length === 0) {
        return res.status(404).json({ success: false, error: 'Location not found' });
      }

      const location = rows[0];
      const geoValidation = new GeoValidationService();
      const result = geoValidation.validateLocationState({
        latitude: location.latitude,
        longitude: location.longitude,
        state_code: location.state_code,
        country_code: location.country_code,
      });

      res.json({
        success: true,
        location: {
          id: parseInt(id, 10),
          name: location.name,
          latitude: location.latitude,
          longitude: location.longitude,
          stateCode: location.state_code,
          countryCode: location.country_code,
        },
        validation: result,
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error in validate-geo:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  return router;
}

export default createDutchieDiscoveryRoutes;
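Usage note: a minimal sketch of mounting this router at the path named in its header comment. The app bootstrap shown here is illustrative and not part of this change; the factory signature is taken from the file above.

import express from 'express';
import { Pool } from 'pg';
import createDutchieDiscoveryRoutes from './dutchie-az/discovery/routes';

const app = express();
app.use(express.json());

const pool = new Pool({ connectionString: process.env.DATABASE_URL });
app.use('/api/discovery/platforms/dt', createDutchieDiscoveryRoutes(pool));

// Example query, using the filters documented on GET /locations:
//   GET /api/discovery/platforms/dt/locations?state_code=AZ&unlinked_only=true&limit=25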
682
backend/src/dutchie-az/routes/analytics.ts
Normal file
@@ -0,0 +1,682 @@
/**
 * Analytics API Routes
 *
 * Provides REST API endpoints for all analytics services.
 * All routes are prefixed with /api/analytics
 *
 * Phase 3: Analytics Dashboards
 */

import { Router, Request, Response } from 'express';
import { Pool } from 'pg';
import {
  AnalyticsCache,
  PriceTrendService,
  PenetrationService,
  CategoryAnalyticsService,
  StoreChangeService,
  BrandOpportunityService,
} from '../services/analytics';

export function createAnalyticsRouter(pool: Pool): Router {
  const router = Router();

  // Initialize services
  const cache = new AnalyticsCache(pool, { defaultTtlMinutes: 15 });
  const priceService = new PriceTrendService(pool, cache);
  const penetrationService = new PenetrationService(pool, cache);
  const categoryService = new CategoryAnalyticsService(pool, cache);
  const storeService = new StoreChangeService(pool, cache);
  const brandOpportunityService = new BrandOpportunityService(pool, cache);

  // ============================================================
  // PRICE ANALYTICS
  // ============================================================

  /**
   * GET /api/analytics/price/product/:id
   * Get price trend for a specific product
   */
  router.get('/price/product/:id', async (req: Request, res: Response) => {
    try {
      const productId = parseInt(req.params.id);
      const storeId = req.query.storeId ? parseInt(req.query.storeId as string) : undefined;
      const days = req.query.days ? parseInt(req.query.days as string) : 30;

      const result = await priceService.getProductPriceTrend(productId, storeId, days);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Price product error:', error);
      res.status(500).json({ error: 'Failed to fetch product price trend' });
    }
  });

  /**
   * GET /api/analytics/price/brand/:name
   * Get price trend for a brand
   */
  router.get('/price/brand/:name', async (req: Request, res: Response) => {
    try {
      const brandName = decodeURIComponent(req.params.name);
      const filters = {
        storeId: req.query.storeId ? parseInt(req.query.storeId as string) : undefined,
        category: req.query.category as string | undefined,
        state: req.query.state as string | undefined,
        days: req.query.days ? parseInt(req.query.days as string) : 30,
      };

      const result = await priceService.getBrandPriceTrend(brandName, filters);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Price brand error:', error);
      res.status(500).json({ error: 'Failed to fetch brand price trend' });
    }
  });

  /**
   * GET /api/analytics/price/category/:name
   * Get price trend for a category
   */
  router.get('/price/category/:name', async (req: Request, res: Response) => {
    try {
      const category = decodeURIComponent(req.params.name);
      const filters = {
        storeId: req.query.storeId ? parseInt(req.query.storeId as string) : undefined,
        brandName: req.query.brand as string | undefined,
        state: req.query.state as string | undefined,
        days: req.query.days ? parseInt(req.query.days as string) : 30,
      };

      const result = await priceService.getCategoryPriceTrend(category, filters);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Price category error:', error);
      res.status(500).json({ error: 'Failed to fetch category price trend' });
    }
  });

  /**
   * GET /api/analytics/price/summary
   * Get price summary statistics
   */
  router.get('/price/summary', async (req: Request, res: Response) => {
    try {
      const filters = {
        storeId: req.query.storeId ? parseInt(req.query.storeId as string) : undefined,
        brandName: req.query.brand as string | undefined,
        category: req.query.category as string | undefined,
        state: req.query.state as string | undefined,
      };

      const result = await priceService.getPriceSummary(filters);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Price summary error:', error);
      res.status(500).json({ error: 'Failed to fetch price summary' });
    }
  });

  /**
   * GET /api/analytics/price/compression/:category
   * Get price compression analysis for a category
   */
  router.get('/price/compression/:category', async (req: Request, res: Response) => {
    try {
      const category = decodeURIComponent(req.params.category);
      const state = req.query.state as string | undefined;

      const result = await priceService.detectPriceCompression(category, state);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Price compression error:', error);
      res.status(500).json({ error: 'Failed to analyze price compression' });
    }
  });

  /**
   * GET /api/analytics/price/global
   * Get global price statistics
   */
  router.get('/price/global', async (_req: Request, res: Response) => {
    try {
      const result = await priceService.getGlobalPriceStats();
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Global price error:', error);
      res.status(500).json({ error: 'Failed to fetch global price stats' });
    }
  });
  // ============================================================
  // PENETRATION ANALYTICS
  // ============================================================

  /**
   * GET /api/analytics/penetration/brand/:name
   * Get penetration data for a brand
   */
  router.get('/penetration/brand/:name', async (req: Request, res: Response) => {
    try {
      const brandName = decodeURIComponent(req.params.name);
      const filters = {
        state: req.query.state as string | undefined,
        category: req.query.category as string | undefined,
      };

      const result = await penetrationService.getBrandPenetration(brandName, filters);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Brand penetration error:', error);
      res.status(500).json({ error: 'Failed to fetch brand penetration' });
    }
  });

  /**
   * GET /api/analytics/penetration/top
   * Get top brands by penetration
   */
  router.get('/penetration/top', async (req: Request, res: Response) => {
    try {
      const limit = req.query.limit ? parseInt(req.query.limit as string) : 20;
      const filters = {
        state: req.query.state as string | undefined,
        category: req.query.category as string | undefined,
        minStores: req.query.minStores ? parseInt(req.query.minStores as string) : 2,
        minSkus: req.query.minSkus ? parseInt(req.query.minSkus as string) : 5,
      };

      const result = await penetrationService.getTopBrandsByPenetration(limit, filters);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Top penetration error:', error);
      res.status(500).json({ error: 'Failed to fetch top brands' });
    }
  });

  /**
   * GET /api/analytics/penetration/trend/:brand
   * Get penetration trend for a brand
   */
  router.get('/penetration/trend/:brand', async (req: Request, res: Response) => {
    try {
      const brandName = decodeURIComponent(req.params.brand);
      const days = req.query.days ? parseInt(req.query.days as string) : 30;

      const result = await penetrationService.getPenetrationTrend(brandName, days);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Penetration trend error:', error);
      res.status(500).json({ error: 'Failed to fetch penetration trend' });
    }
  });

  /**
   * GET /api/analytics/penetration/shelf-share/:brand
   * Get shelf share by category for a brand
   */
  router.get('/penetration/shelf-share/:brand', async (req: Request, res: Response) => {
    try {
      const brandName = decodeURIComponent(req.params.brand);
      const result = await penetrationService.getShelfShareByCategory(brandName);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Shelf share error:', error);
      res.status(500).json({ error: 'Failed to fetch shelf share' });
    }
  });

  /**
   * GET /api/analytics/penetration/by-state/:brand
   * Get brand presence by state
   */
  router.get('/penetration/by-state/:brand', async (req: Request, res: Response) => {
    try {
      const brandName = decodeURIComponent(req.params.brand);
      const result = await penetrationService.getBrandPresenceByState(brandName);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Brand by state error:', error);
      res.status(500).json({ error: 'Failed to fetch brand presence by state' });
    }
  });

  /**
   * GET /api/analytics/penetration/stores/:brand
   * Get stores carrying a brand
   */
  router.get('/penetration/stores/:brand', async (req: Request, res: Response) => {
    try {
      const brandName = decodeURIComponent(req.params.brand);
      const result = await penetrationService.getStoresCarryingBrand(brandName);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Stores carrying brand error:', error);
      res.status(500).json({ error: 'Failed to fetch stores' });
    }
  });

  /**
   * GET /api/analytics/penetration/heatmap
   * Get penetration heatmap data
   */
  router.get('/penetration/heatmap', async (req: Request, res: Response) => {
    try {
      const brandName = req.query.brand as string | undefined;
      const result = await penetrationService.getPenetrationHeatmap(brandName);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Heatmap error:', error);
      res.status(500).json({ error: 'Failed to fetch heatmap data' });
    }
  });

  // ============================================================
  // CATEGORY ANALYTICS
  // ============================================================

  /**
   * GET /api/analytics/category/summary
   * Get category summary
   */
  router.get('/category/summary', async (req: Request, res: Response) => {
    try {
      const category = req.query.category as string | undefined;
      const filters = {
        state: req.query.state as string | undefined,
        storeId: req.query.storeId ? parseInt(req.query.storeId as string) : undefined,
      };

      const result = await categoryService.getCategorySummary(category, filters);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Category summary error:', error);
      res.status(500).json({ error: 'Failed to fetch category summary' });
    }
  });

  /**
   * GET /api/analytics/category/growth
   * Get category growth data
   */
  router.get('/category/growth', async (req: Request, res: Response) => {
    try {
      const days = req.query.days ? parseInt(req.query.days as string) : 7;
      const filters = {
        state: req.query.state as string | undefined,
        storeId: req.query.storeId ? parseInt(req.query.storeId as string) : undefined,
        minSkus: req.query.minSkus ? parseInt(req.query.minSkus as string) : 10,
      };

      const result = await categoryService.getCategoryGrowth(days, filters);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Category growth error:', error);
      res.status(500).json({ error: 'Failed to fetch category growth' });
    }
  });

  /**
   * GET /api/analytics/category/trend/:category
   * Get category growth trend over time
   */
  router.get('/category/trend/:category', async (req: Request, res: Response) => {
    try {
      const category = decodeURIComponent(req.params.category);
      const days = req.query.days ? parseInt(req.query.days as string) : 90;

      const result = await categoryService.getCategoryGrowthTrend(category, days);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Category trend error:', error);
      res.status(500).json({ error: 'Failed to fetch category trend' });
    }
  });

  /**
   * GET /api/analytics/category/heatmap
   * Get category heatmap data
   */
  router.get('/category/heatmap', async (req: Request, res: Response) => {
    try {
      const metric = (req.query.metric as 'skus' | 'growth' | 'price') || 'skus';
      const periods = req.query.periods ? parseInt(req.query.periods as string) : 12;

      const result = await categoryService.getCategoryHeatmap(metric, periods);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Category heatmap error:', error);
      res.status(500).json({ error: 'Failed to fetch heatmap' });
    }
  });

  /**
   * GET /api/analytics/category/top-movers
   * Get top growing and declining categories
   */
  router.get('/category/top-movers', async (req: Request, res: Response) => {
    try {
      const limit = req.query.limit ? parseInt(req.query.limit as string) : 5;
      const days = req.query.days ? parseInt(req.query.days as string) : 30;

      const result = await categoryService.getTopMovers(limit, days);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Top movers error:', error);
      res.status(500).json({ error: 'Failed to fetch top movers' });
    }
  });

  /**
   * GET /api/analytics/category/:category/subcategories
   * Get subcategory breakdown
   */
  router.get('/category/:category/subcategories', async (req: Request, res: Response) => {
    try {
      const category = decodeURIComponent(req.params.category);
      const result = await categoryService.getSubcategoryBreakdown(category);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Subcategory error:', error);
      res.status(500).json({ error: 'Failed to fetch subcategories' });
    }
  });
  // ============================================================
  // STORE CHANGE TRACKING
  // ============================================================

  /**
   * GET /api/analytics/store/:id/summary
   * Get change summary for a store
   */
  router.get('/store/:id/summary', async (req: Request, res: Response) => {
    try {
      const storeId = parseInt(req.params.id);
      const result = await storeService.getStoreChangeSummary(storeId);

      if (!result) {
        return res.status(404).json({ error: 'Store not found' });
      }

      res.json(result);
    } catch (error) {
      console.error('[Analytics] Store summary error:', error);
      res.status(500).json({ error: 'Failed to fetch store summary' });
    }
  });

  /**
   * GET /api/analytics/store/:id/events
   * Get recent change events for a store
   */
  router.get('/store/:id/events', async (req: Request, res: Response) => {
    try {
      const storeId = parseInt(req.params.id);
      const filters = {
        eventType: req.query.type as string | undefined,
        days: req.query.days ? parseInt(req.query.days as string) : 30,
        limit: req.query.limit ? parseInt(req.query.limit as string) : 100,
      };

      const result = await storeService.getStoreChangeEvents(storeId, filters);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Store events error:', error);
      res.status(500).json({ error: 'Failed to fetch store events' });
    }
  });

  /**
   * GET /api/analytics/store/:id/brands/new
   * Get new brands added to a store
   */
  router.get('/store/:id/brands/new', async (req: Request, res: Response) => {
    try {
      const storeId = parseInt(req.params.id);
      const days = req.query.days ? parseInt(req.query.days as string) : 30;

      const result = await storeService.getNewBrands(storeId, days);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] New brands error:', error);
      res.status(500).json({ error: 'Failed to fetch new brands' });
    }
  });

  /**
   * GET /api/analytics/store/:id/brands/lost
   * Get brands lost from a store
   */
  router.get('/store/:id/brands/lost', async (req: Request, res: Response) => {
    try {
      const storeId = parseInt(req.params.id);
      const days = req.query.days ? parseInt(req.query.days as string) : 30;

      const result = await storeService.getLostBrands(storeId, days);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Lost brands error:', error);
      res.status(500).json({ error: 'Failed to fetch lost brands' });
    }
  });

  /**
   * GET /api/analytics/store/:id/products/changes
   * Get product changes for a store
   */
  router.get('/store/:id/products/changes', async (req: Request, res: Response) => {
    try {
      const storeId = parseInt(req.params.id);
      const changeType = req.query.type as 'added' | 'discontinued' | 'price_drop' | 'price_increase' | 'restocked' | 'out_of_stock' | undefined;
      const days = req.query.days ? parseInt(req.query.days as string) : 7;

      const result = await storeService.getProductChanges(storeId, changeType, days);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Product changes error:', error);
      res.status(500).json({ error: 'Failed to fetch product changes' });
    }
  });

  /**
   * GET /api/analytics/store/leaderboard/:category
   * Get category leaderboard across stores
   */
  router.get('/store/leaderboard/:category', async (req: Request, res: Response) => {
    try {
      const category = decodeURIComponent(req.params.category);
      const limit = req.query.limit ? parseInt(req.query.limit as string) : 20;

      const result = await storeService.getCategoryLeaderboard(category, limit);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Leaderboard error:', error);
      res.status(500).json({ error: 'Failed to fetch leaderboard' });
    }
  });

  /**
   * GET /api/analytics/store/most-active
   * Get most active stores (by changes)
   */
  router.get('/store/most-active', async (req: Request, res: Response) => {
    try {
      const days = req.query.days ? parseInt(req.query.days as string) : 7;
      const limit = req.query.limit ? parseInt(req.query.limit as string) : 10;

      const result = await storeService.getMostActiveStores(days, limit);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Most active error:', error);
      res.status(500).json({ error: 'Failed to fetch active stores' });
    }
  });

  /**
   * GET /api/analytics/store/compare
   * Compare two stores
   */
  router.get('/store/compare', async (req: Request, res: Response) => {
    try {
      const store1 = parseInt(req.query.store1 as string);
      const store2 = parseInt(req.query.store2 as string);

      if (!store1 || !store2) {
        return res.status(400).json({ error: 'Both store1 and store2 are required' });
      }

      const result = await storeService.compareStores(store1, store2);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Compare stores error:', error);
      res.status(500).json({ error: 'Failed to compare stores' });
    }
  });

  // ============================================================
  // BRAND OPPORTUNITY / RISK
  // ============================================================

  /**
   * GET /api/analytics/brand/:name/opportunity
   * Get full opportunity analysis for a brand
   */
  router.get('/brand/:name/opportunity', async (req: Request, res: Response) => {
    try {
      const brandName = decodeURIComponent(req.params.name);
      const result = await brandOpportunityService.getBrandOpportunity(brandName);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Brand opportunity error:', error);
      res.status(500).json({ error: 'Failed to fetch brand opportunity' });
    }
  });

  /**
   * GET /api/analytics/brand/:name/position
   * Get market position summary for a brand
   */
  router.get('/brand/:name/position', async (req: Request, res: Response) => {
    try {
      const brandName = decodeURIComponent(req.params.name);
      const result = await brandOpportunityService.getMarketPositionSummary(brandName);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Brand position error:', error);
      res.status(500).json({ error: 'Failed to fetch brand position' });
    }
  });

  // ============================================================
  // ALERTS
  // ============================================================

  /**
   * GET /api/analytics/alerts
   * Get analytics alerts
   */
  router.get('/alerts', async (req: Request, res: Response) => {
    try {
      const filters = {
        brandName: req.query.brand as string | undefined,
        storeId: req.query.storeId ? parseInt(req.query.storeId as string) : undefined,
        alertType: req.query.type as string | undefined,
        unreadOnly: req.query.unreadOnly === 'true',
        limit: req.query.limit ? parseInt(req.query.limit as string) : 50,
      };

      const result = await brandOpportunityService.getAlerts(filters);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Alerts error:', error);
      res.status(500).json({ error: 'Failed to fetch alerts' });
    }
  });

  /**
   * POST /api/analytics/alerts/mark-read
   * Mark alerts as read
   */
  router.post('/alerts/mark-read', async (req: Request, res: Response) => {
    try {
      const { alertIds } = req.body;

      if (!Array.isArray(alertIds)) {
        return res.status(400).json({ error: 'alertIds must be an array' });
      }

      await brandOpportunityService.markAlertsRead(alertIds);
      res.json({ success: true });
    } catch (error) {
      console.error('[Analytics] Mark read error:', error);
      res.status(500).json({ error: 'Failed to mark alerts as read' });
    }
  });

  // ============================================================
  // CACHE MANAGEMENT
  // ============================================================

  /**
   * GET /api/analytics/cache/stats
   * Get cache statistics
   */
  router.get('/cache/stats', async (_req: Request, res: Response) => {
    try {
      const stats = await cache.getStats();
      res.json(stats);
    } catch (error) {
      console.error('[Analytics] Cache stats error:', error);
      res.status(500).json({ error: 'Failed to get cache stats' });
    }
  });

  /**
   * POST /api/analytics/cache/clear
   * Clear cache (admin only)
   */
  router.post('/cache/clear', async (req: Request, res: Response) => {
    try {
      const pattern = req.query.pattern as string | undefined;

      if (pattern) {
        const cleared = await cache.invalidatePattern(pattern);
        res.json({ success: true, clearedCount: cleared });
      } else {
        await cache.cleanExpired();
        res.json({ success: true, message: 'Expired entries cleaned' });
      }
    } catch (error) {
      console.error('[Analytics] Cache clear error:', error);
      res.status(500).json({ error: 'Failed to clear cache' });
    }
  });

  // ============================================================
  // SNAPSHOT CAPTURE (for cron/scheduled jobs)
  // ============================================================

  /**
   * POST /api/analytics/snapshots/capture
   * Capture daily snapshots (run by scheduler)
   */
  router.post('/snapshots/capture', async (_req: Request, res: Response) => {
    try {
      const [brandResult, categoryResult] = await Promise.all([
        pool.query('SELECT capture_brand_snapshots() as count'),
        pool.query('SELECT capture_category_snapshots() as count'),
      ]);

      res.json({
        success: true,
        brandSnapshots: parseInt(brandResult.rows[0]?.count || '0'),
        categorySnapshots: parseInt(categoryResult.rows[0]?.count || '0'),
      });
    } catch (error) {
      console.error('[Analytics] Snapshot capture error:', error);
      res.status(500).json({ error: 'Failed to capture snapshots' });
    }
  });

  return router;
}
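Usage note: the analytics router follows the same factory pattern, so a sketch of mounting it at its documented prefix looks like this (bootstrap file and import path assumed; the brand name in the example URL is illustrative):

import express from 'express';
import { Pool } from 'pg';
import { createAnalyticsRouter } from './dutchie-az/routes/analytics';

const app = express();
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
app.use('/api/analytics', createAnalyticsRouter(pool));

// e.g. GET  /api/analytics/price/brand/Acme%20Farms?state=AZ&days=60
// e.g. POST /api/analytics/snapshots/capture   (intended for the scheduler)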
@@ -21,12 +21,8 @@ import {
|
||||
} from '../services/discovery';
|
||||
import { crawlDispensaryProducts } from '../services/product-crawler';
|
||||
|
||||
// Explicit column list for dispensaries table (avoids SELECT * issues with schema differences)
|
||||
const DISPENSARY_COLUMNS = `
|
||||
id, name, dba_name, slug, city, state, zip, address, latitude, longitude,
|
||||
menu_type, menu_url, platform_dispensary_id, website,
|
||||
provider_detection_data, created_at, updated_at
|
||||
`;
|
||||
// Use shared dispensary columns (handles optional columns like provider_detection_data)
|
||||
import { DISPENSARY_COLUMNS_WITH_PROFILE as DISPENSARY_COLUMNS } from '../db/dispensary-columns';
|
||||
import {
|
||||
startScheduler,
|
||||
stopScheduler,
|
||||
@@ -43,6 +39,7 @@ import {
|
||||
getRunLogs,
|
||||
} from '../services/scheduler';
|
||||
import { StockStatus } from '../types';
|
||||
import { getProviderDisplayName } from '../../utils/provider-display';
|
||||
|
||||
const router = Router();
|
||||
|
||||
@@ -113,9 +110,17 @@ router.get('/stores', async (req: Request, res: Response) => {
|
||||
|
||||
const { rows, rowCount } = await query(
|
||||
`
|
||||
SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
|
||||
SELECT ${DISPENSARY_COLUMNS},
|
||||
(SELECT COUNT(*) FROM dutchie_products WHERE dispensary_id = dispensaries.id) as product_count,
|
||||
dcp.status as crawler_status,
|
||||
dcp.profile_key as crawler_profile_key,
|
||||
dcp.next_retry_at,
|
||||
dcp.sandbox_attempt_count
|
||||
FROM dispensaries
|
||||
LEFT JOIN dispensary_crawler_profiles dcp
|
||||
ON dcp.dispensary_id = dispensaries.id AND dcp.enabled = true
|
||||
${whereClause}
|
||||
ORDER BY name
|
||||
ORDER BY dispensaries.name
|
||||
LIMIT $${paramIndex} OFFSET $${paramIndex + 1}
|
||||
`,
|
||||
params
|
||||
@@ -127,8 +132,15 @@ router.get('/stores', async (req: Request, res: Response) => {
|
||||
params.slice(0, -2)
|
||||
);
|
||||
|
||||
// Transform stores to include provider_display
|
||||
const transformedStores = rows.map((store: any) => ({
|
||||
...store,
|
||||
provider_raw: store.menu_type,
|
||||
provider_display: getProviderDisplayName(store.menu_type),
|
||||
}));
|
||||
|
||||
res.json({
|
||||
stores: rows,
|
||||
stores: transformedStores,
|
||||
total: parseInt(countRows[0]?.total || '0', 10),
|
||||
limit: parseInt(limit as string, 10),
|
||||
offset: parseInt(offset as string, 10),
|
||||
@@ -780,7 +792,7 @@ router.get('/products/:id/availability', async (req: Request, res: Response) =>
|
||||
)
|
||||
SELECT
|
||||
d.id as dispensary_id,
|
||||
COALESCE(d.dba_name, d.name) as dispensary_name,
|
||||
d.name as dispensary_name,
|
||||
d.city,
|
||||
d.state,
|
||||
d.address,
|
||||
@@ -1042,8 +1054,12 @@ router.post('/admin/scheduler/trigger', async (_req: Request, res: Response) =>
});

/**
- * POST /api/dutchie-az/admin/crawl/:id
+ * POST /api/az/admin/crawl/:id
 * Crawl a single dispensary with job tracking
+ *
+ * @deprecated Use POST /api/admin/crawl/:dispensaryId instead.
+ * This route is kept for backward compatibility only.
+ * The canonical crawl endpoint is now /api/admin/crawl/:dispensaryId
 */
router.post('/admin/crawl/:id', async (req: Request, res: Response) => {
  try {
@@ -1075,7 +1091,6 @@ router.get('/admin/dutchie-stores', async (_req: Request, res: Response) => {
      SELECT
        d.id,
        d.name,
-        d.dba_name,
        d.city,
        d.state,
        d.menu_type,
@@ -1113,7 +1128,7 @@ router.get('/admin/dutchie-stores', async (_req: Request, res: Response) => {
      failed: failed.length,
      stores: rows.map((r: any) => ({
        id: r.id,
-        name: r.dba_name || r.name,
+        name: r.name,
        city: r.city,
        state: r.state,
        menuType: r.menu_type,
@@ -1688,6 +1703,7 @@ import {
router.get('/monitor/active-jobs', async (_req: Request, res: Response) => {
  try {
    // Get running jobs from job_run_logs (scheduled jobs like "enqueue all")
+    // Includes worker_name and run_role for named workforce display
    const { rows: runningScheduledJobs } = await query<any>(`
      SELECT
        jrl.id,
@@ -1699,7 +1715,11 @@ router.get('/monitor/active-jobs', async (_req: Request, res: Response) => {
        jrl.items_succeeded,
        jrl.items_failed,
        jrl.metadata,
+        jrl.worker_name,
+        jrl.run_role,
        js.description as job_description,
+        js.worker_name as schedule_worker_name,
+        js.worker_role as schedule_worker_role,
        EXTRACT(EPOCH FROM (NOW() - jrl.started_at)) as duration_seconds
      FROM job_run_logs jrl
      LEFT JOIN job_schedules js ON jrl.schedule_id = js.id
@@ -1708,7 +1728,7 @@ router.get('/monitor/active-jobs', async (_req: Request, res: Response) => {
    `);

    // Get running crawl jobs (individual store crawls with worker info)
-    // Note: Use COALESCE for optional columns that may not exist in older schemas
+    // Includes enqueued_by_worker for tracking which named worker enqueued the job
    const { rows: runningCrawlJobs } = await query<any>(`
      SELECT
        cj.id,
@@ -1722,6 +1742,7 @@ router.get('/monitor/active-jobs', async (_req: Request, res: Response) => {
        cj.claimed_by as worker_id,
        cj.worker_hostname,
        cj.claimed_at,
+        cj.enqueued_by_worker,
        cj.products_found,
        cj.products_upserted,
        cj.snapshots_created,
@@ -1792,14 +1813,18 @@ router.get('/monitor/recent-jobs', async (req: Request, res: Response) => {
        jrl.items_succeeded,
        jrl.items_failed,
        jrl.metadata,
-        js.description as job_description
+        jrl.worker_name,
+        jrl.run_role,
+        js.description as job_description,
+        js.worker_name as schedule_worker_name,
+        js.worker_role as schedule_worker_role
      FROM job_run_logs jrl
      LEFT JOIN job_schedules js ON jrl.schedule_id = js.id
      ORDER BY jrl.created_at DESC
      LIMIT $1
    `, [limitNum]);

-    // Recent crawl jobs
+    // Recent crawl jobs (includes enqueued_by_worker for named workforce tracking)
    const { rows: recentCrawlJobs } = await query<any>(`
      SELECT
        cj.id,
@@ -1814,6 +1839,7 @@ router.get('/monitor/recent-jobs', async (req: Request, res: Response) => {
        cj.products_found,
        cj.snapshots_created,
        cj.metadata,
+        cj.enqueued_by_worker,
        EXTRACT(EPOCH FROM (COALESCE(cj.completed_at, NOW()) - cj.started_at)) * 1000 as duration_ms
      FROM dispensary_crawl_jobs cj
      LEFT JOIN dispensaries d ON cj.dispensary_id = d.id
@@ -1912,12 +1938,14 @@ router.get('/monitor/summary', async (_req: Request, res: Response) => {
        (SELECT MAX(completed_at) FROM job_run_logs WHERE status = 'success') as last_job_completed
    `);

-    // Get next scheduled runs
+    // Get next scheduled runs (with worker names)
    const { rows: nextRuns } = await query<any>(`
      SELECT
        id,
        job_name,
        description,
+        worker_name,
+        worker_role,
        enabled,
        next_run_at,
        last_status,
@@ -2034,6 +2062,189 @@ router.post('/admin/detection/trigger', async (_req: Request, res: Response) =>
  }
});

// ============================================================
// CRAWLER RELIABILITY / HEALTH ENDPOINTS (Phase 1)
// ============================================================

/**
 * GET /api/dutchie-az/admin/crawler/health
 * Get overall crawler health metrics
 */
router.get('/admin/crawler/health', async (_req: Request, res: Response) => {
  try {
    const { rows } = await query<any>(`SELECT * FROM v_crawl_health`);
    res.json(rows[0] || {
      active_crawlers: 0,
      degraded_crawlers: 0,
      paused_crawlers: 0,
      failed_crawlers: 0,
      due_now: 0,
      stores_with_failures: 0,
      avg_consecutive_failures: 0,
      successful_last_24h: 0,
    });
  } catch (error: any) {
    // View might not exist yet
    res.json({
      active_crawlers: 0,
      degraded_crawlers: 0,
      paused_crawlers: 0,
      failed_crawlers: 0,
      due_now: 0,
      error: 'View not available - run migration 046',
    });
  }
});
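For reference, a minimal consumer sketch of the health endpoint above. BASE_URL and the /api/dutchie-az mount point are assumptions taken from the doc comments, not confirmed elsewhere in this diff:

// Sketch only; the response type mirrors the fallback object returned above.
const BASE_URL = process.env.ADMIN_API_URL ?? 'http://localhost:3000'; // hypothetical

interface CrawlHealth {
  active_crawlers: number;
  degraded_crawlers: number;
  paused_crawlers: number;
  failed_crawlers: number;
  due_now: number;
  error?: string; // present when the v_crawl_health view is missing
}

async function fetchCrawlHealth(): Promise<CrawlHealth> {
  const res = await fetch(`${BASE_URL}/api/dutchie-az/admin/crawler/health`);
  if (!res.ok) throw new Error(`health check failed: ${res.status}`);
  return (await res.json()) as CrawlHealth;
}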

/**
 * GET /api/dutchie-az/admin/crawler/error-summary
 * Get error summary by code over last 7 days
 */
router.get('/admin/crawler/error-summary', async (_req: Request, res: Response) => {
  try {
    const { rows } = await query<any>(`SELECT * FROM v_crawl_error_summary`);
    res.json({ errors: rows });
  } catch (error: any) {
    res.json({ errors: [], error: 'View not available - run migration 046' });
  }
});

/**
 * GET /api/dutchie-az/admin/crawler/status
 * Get detailed status for all crawlers
 */
router.get('/admin/crawler/status', async (req: Request, res: Response) => {
  try {
    const { status, limit = '100', offset = '0' } = req.query;

    let whereClause = '';
    const params: any[] = [];
    let paramIndex = 1;

    if (status) {
      whereClause = `WHERE crawl_status = $${paramIndex}`;
      params.push(status);
      paramIndex++;
    }

    params.push(parseInt(limit as string, 10), parseInt(offset as string, 10));

    const { rows } = await query<any>(
      `SELECT * FROM v_crawler_status
       ${whereClause}
       ORDER BY consecutive_failures DESC, name ASC
       LIMIT $${paramIndex} OFFSET $${paramIndex + 1}`,
      params
    );

    const { rows: countRows } = await query<any>(
      `SELECT COUNT(*) as total FROM v_crawler_status ${whereClause}`,
      params.slice(0, -2)
    );

    res.json({
      stores: rows,
      total: parseInt(countRows[0]?.total || '0', 10),
      limit: parseInt(limit as string, 10),
      offset: parseInt(offset as string, 10),
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});

/**
 * GET /api/dutchie-az/admin/crawler/attempts
 * Get recent crawl attempts (for debugging)
 */
router.get('/admin/crawler/attempts', async (req: Request, res: Response) => {
  try {
    const { dispensaryId, errorCode, limit = '50', offset = '0' } = req.query;

    let whereClause = 'WHERE 1=1';
    const params: any[] = [];
    let paramIndex = 1;

    if (dispensaryId) {
      whereClause += ` AND ca.dispensary_id = $${paramIndex}`;
      params.push(parseInt(dispensaryId as string, 10));
      paramIndex++;
    }

    if (errorCode) {
      whereClause += ` AND ca.error_code = $${paramIndex}`;
      params.push(errorCode);
      paramIndex++;
    }

    params.push(parseInt(limit as string, 10), parseInt(offset as string, 10));

    const { rows } = await query<any>(
      `SELECT
        ca.*,
        d.name as dispensary_name,
        d.city
      FROM crawl_attempts ca
      LEFT JOIN dispensaries d ON ca.dispensary_id = d.id
      ${whereClause}
      ORDER BY ca.started_at DESC
      LIMIT $${paramIndex} OFFSET $${paramIndex + 1}`,
      params
    );

    res.json({ attempts: rows });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});

/**
 * POST /api/dutchie-az/admin/dispensaries/:id/pause
 * Pause crawling for a dispensary
 */
router.post('/admin/dispensaries/:id/pause', async (req: Request, res: Response) => {
  try {
    const { id } = req.params;

    await query(`
      UPDATE dispensaries
      SET crawl_status = 'paused',
          next_crawl_at = NULL,
          updated_at = NOW()
      WHERE id = $1
    `, [id]);

    res.json({ success: true, message: `Crawling paused for dispensary ${id}` });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});

/**
 * POST /api/dutchie-az/admin/dispensaries/:id/resume
 * Resume crawling for a paused/degraded dispensary
 */
router.post('/admin/dispensaries/:id/resume', async (req: Request, res: Response) => {
  try {
    const { id } = req.params;

    // Reset to active and schedule next crawl
    await query(`
      UPDATE dispensaries
      SET crawl_status = 'active',
          consecutive_failures = 0,
          backoff_multiplier = 1.0,
          next_crawl_at = NOW() + INTERVAL '5 minutes',
          updated_at = NOW()
      WHERE id = $1
    `, [id]);

    res.json({ success: true, message: `Crawling resumed for dispensary ${id}` });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});
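A hedged admin-client sketch of how the pause/resume pair above is meant to be driven (helper name and mount path are hypothetical):

// Hypothetical helper; assumes the router above is mounted under /api/dutchie-az.
async function setCrawling(id: number, action: 'pause' | 'resume'): Promise<void> {
  const res = await fetch(
    `${BASE_URL}/api/dutchie-az/admin/dispensaries/${id}/${action}`,
    { method: 'POST' }
  );
  if (!res.ok) throw new Error(`${action} failed: ${res.status}`);
}

// Pause a misbehaving store, then resume it once fixed; per the UPDATE above,
// resume also clears consecutive_failures and backoff_multiplier.
async function demoPauseResume(): Promise<void> {
  await setCrawling(42, 'pause');
  await setCrawling(42, 'resume');
}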

// ============================================================
// FAILED DISPENSARIES ROUTES
// ============================================================
@@ -2183,4 +2394,251 @@ router.get('/admin/dispensaries/health-summary', async (_req: Request, res: Resp
  }
});

// ============================================================
// ORCHESTRATOR TRACE ROUTES
// ============================================================

import {
  getLatestTrace,
  getTraceById,
  getTracesForDispensary,
  getTraceByRunId,
} from '../../services/orchestrator-trace';

/**
 * GET /api/dutchie-az/admin/dispensaries/:id/crawl-trace/latest
 * Get the latest orchestrator trace for a dispensary
 */
router.get('/admin/dispensaries/:id/crawl-trace/latest', async (req: Request, res: Response) => {
  try {
    const { id } = req.params;
    const trace = await getLatestTrace(parseInt(id, 10));

    if (!trace) {
      return res.status(404).json({ error: 'No trace found for this dispensary' });
    }

    res.json(trace);
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});

/**
 * GET /api/dutchie-az/admin/dispensaries/:id/crawl-traces
 * Get paginated list of orchestrator traces for a dispensary
 */
router.get('/admin/dispensaries/:id/crawl-traces', async (req: Request, res: Response) => {
  try {
    const { id } = req.params;
    const { limit = '20', offset = '0' } = req.query;

    const result = await getTracesForDispensary(
      parseInt(id, 10),
      parseInt(limit as string, 10),
      parseInt(offset as string, 10)
    );

    res.json(result);
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});

/**
 * GET /api/dutchie-az/admin/crawl-traces/:traceId
 * Get a specific orchestrator trace by ID
 */
router.get('/admin/crawl-traces/:traceId', async (req: Request, res: Response) => {
  try {
    const { traceId } = req.params;
    const trace = await getTraceById(parseInt(traceId, 10));

    if (!trace) {
      return res.status(404).json({ error: 'Trace not found' });
    }

    res.json(trace);
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});

/**
 * GET /api/dutchie-az/admin/crawl-traces/run/:runId
 * Get a specific orchestrator trace by run ID
 */
router.get('/admin/crawl-traces/run/:runId', async (req: Request, res: Response) => {
  try {
    const { runId } = req.params;
    const trace = await getTraceByRunId(runId);

    if (!trace) {
      return res.status(404).json({ error: 'Trace not found for this run ID' });
    }

    res.json(trace);
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});
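The four trace routes compose naturally; a lookup sketch (function name hypothetical, paths taken from the doc comments above):

// Sketch: resolve a crawl's trace by run ID, falling back to the latest trace
// for the dispensary when no run ID is known.
async function findTrace(dispensaryId: number, runId?: string): Promise<unknown | null> {
  const url = runId
    ? `${BASE_URL}/api/dutchie-az/admin/crawl-traces/run/${runId}`
    : `${BASE_URL}/api/dutchie-az/admin/dispensaries/${dispensaryId}/crawl-trace/latest`;
  const res = await fetch(url);
  if (res.status === 404) return null; // no trace recorded yet
  if (!res.ok) throw new Error(`trace lookup failed: ${res.status}`);
  return res.json();
}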

// ============================================================
// SCRAPER OVERVIEW DASHBOARD ENDPOINTS
// ============================================================

/**
 * GET /api/dutchie-az/scraper/overview
 * Comprehensive scraper overview for the new dashboard
 */
router.get('/scraper/overview', async (_req: Request, res: Response) => {
  try {
    // 1. Core KPI metrics
    const { rows: kpiRows } = await query<any>(`
      SELECT
        -- Total products
        (SELECT COUNT(*) FROM dutchie_products) AS total_products,
        (SELECT COUNT(*) FROM dutchie_products WHERE stock_status = 'in_stock') AS in_stock_products,
        -- Total dispensaries
        (SELECT COUNT(*) FROM dispensaries WHERE menu_type = 'dutchie' AND state = 'AZ') AS total_dispensaries,
        (SELECT COUNT(*) FROM dispensaries WHERE menu_type = 'dutchie' AND state = 'AZ' AND platform_dispensary_id IS NOT NULL) AS crawlable_dispensaries,
        -- Visibility stats (24h)
        (SELECT COUNT(*) FROM dutchie_products WHERE visibility_lost = true AND visibility_lost_at > NOW() - INTERVAL '24 hours') AS visibility_lost_24h,
        (SELECT COUNT(*) FROM dutchie_products WHERE visibility_restored_at > NOW() - INTERVAL '24 hours') AS visibility_restored_24h,
        (SELECT COUNT(*) FROM dutchie_products WHERE visibility_lost = true) AS total_visibility_lost,
        -- Job stats (24h)
        (SELECT COUNT(*) FROM job_run_logs WHERE status IN ('error', 'partial') AND created_at > NOW() - INTERVAL '24 hours') AS errors_24h,
        (SELECT COUNT(*) FROM job_run_logs WHERE status = 'success' AND created_at > NOW() - INTERVAL '24 hours') AS successful_jobs_24h,
        -- Active workers
        (SELECT COUNT(*) FROM job_schedules WHERE enabled = true) AS active_workers
    `);

    // 2. Get active worker names
    const { rows: workerRows } = await query<any>(`
      SELECT worker_name, worker_role, enabled, last_status, last_run_at, next_run_at
      FROM job_schedules
      WHERE enabled = true
      ORDER BY next_run_at ASC NULLS LAST
    `);

    // 3. Scrape activity by hour (last 24h)
    const { rows: activityRows } = await query<any>(`
      SELECT
        date_trunc('hour', started_at) AS hour,
        COUNT(*) FILTER (WHERE status = 'success') AS successful,
        COUNT(*) FILTER (WHERE status IN ('error', 'partial')) AS failed,
        COUNT(*) AS total
      FROM job_run_logs
      WHERE started_at > NOW() - INTERVAL '24 hours'
      GROUP BY date_trunc('hour', started_at)
      ORDER BY hour ASC
    `);

    // 4. Product growth / coverage (last 7 days)
    const { rows: growthRows } = await query<any>(`
      SELECT
        date_trunc('day', created_at) AS day,
        COUNT(*) AS new_products
      FROM dutchie_products
      WHERE created_at > NOW() - INTERVAL '7 days'
      GROUP BY date_trunc('day', created_at)
      ORDER BY day ASC
    `);

    // 5. Recent worker runs (last 20)
    const { rows: recentRuns } = await query<any>(`
      SELECT
        jrl.id,
        jrl.job_name,
        jrl.status,
        jrl.started_at,
        jrl.completed_at,
        jrl.items_processed,
        jrl.items_succeeded,
        jrl.items_failed,
        jrl.metadata,
        js.worker_name,
        js.worker_role
      FROM job_run_logs jrl
      LEFT JOIN job_schedules js ON jrl.schedule_id = js.id
      ORDER BY jrl.started_at DESC
      LIMIT 20
    `);

    // 6. Recent visibility changes by store
    const { rows: visibilityChanges } = await query<any>(`
      SELECT
        d.id AS dispensary_id,
        d.name AS dispensary_name,
        d.state,
        COUNT(dp.id) FILTER (WHERE dp.visibility_lost = true AND dp.visibility_lost_at > NOW() - INTERVAL '24 hours') AS lost_24h,
        COUNT(dp.id) FILTER (WHERE dp.visibility_restored_at > NOW() - INTERVAL '24 hours') AS restored_24h,
        MAX(dp.visibility_lost_at) AS latest_loss,
        MAX(dp.visibility_restored_at) AS latest_restore
      FROM dispensaries d
      LEFT JOIN dutchie_products dp ON d.id = dp.dispensary_id
      WHERE d.menu_type = 'dutchie'
      GROUP BY d.id, d.name, d.state
      HAVING COUNT(dp.id) FILTER (WHERE dp.visibility_lost = true AND dp.visibility_lost_at > NOW() - INTERVAL '24 hours') > 0
         OR COUNT(dp.id) FILTER (WHERE dp.visibility_restored_at > NOW() - INTERVAL '24 hours') > 0
      ORDER BY lost_24h DESC, restored_24h DESC
      LIMIT 15
    `);

    const kpi = kpiRows[0] || {};

    res.json({
      kpi: {
        totalProducts: parseInt(kpi.total_products || '0'),
        inStockProducts: parseInt(kpi.in_stock_products || '0'),
        totalDispensaries: parseInt(kpi.total_dispensaries || '0'),
        crawlableDispensaries: parseInt(kpi.crawlable_dispensaries || '0'),
        visibilityLost24h: parseInt(kpi.visibility_lost_24h || '0'),
        visibilityRestored24h: parseInt(kpi.visibility_restored_24h || '0'),
        totalVisibilityLost: parseInt(kpi.total_visibility_lost || '0'),
        errors24h: parseInt(kpi.errors_24h || '0'),
        successfulJobs24h: parseInt(kpi.successful_jobs_24h || '0'),
        activeWorkers: parseInt(kpi.active_workers || '0'),
      },
      workers: workerRows,
      activityByHour: activityRows.map((row: any) => ({
        hour: row.hour,
        successful: parseInt(row.successful || '0'),
        failed: parseInt(row.failed || '0'),
        total: parseInt(row.total || '0'),
      })),
      productGrowth: growthRows.map((row: any) => ({
        day: row.day,
        newProducts: parseInt(row.new_products || '0'),
      })),
      recentRuns: recentRuns.map((row: any) => ({
        id: row.id,
        jobName: row.job_name,
        status: row.status,
        startedAt: row.started_at,
        completedAt: row.completed_at,
        itemsProcessed: row.items_processed,
        itemsSucceeded: row.items_succeeded,
        itemsFailed: row.items_failed,
        workerName: row.worker_name,
        workerRole: row.worker_role,
        visibilityLost: row.metadata?.visibilityLostCount || 0,
        visibilityRestored: row.metadata?.visibilityRestoredCount || 0,
      })),
      visibilityChanges: visibilityChanges.map((row: any) => ({
        dispensaryId: row.dispensary_id,
        dispensaryName: row.dispensary_name,
        state: row.state,
        lost24h: parseInt(row.lost_24h || '0'),
        restored24h: parseInt(row.restored_24h || '0'),
        latestLoss: row.latest_loss,
        latestRestore: row.latest_restore,
      })),
    });
  } catch (error: any) {
    console.error('Error fetching scraper overview:', error);
    res.status(500).json({ error: error.message });
  }
});

export default router;

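A dashboard client consumes this with a single GET; a minimal sketch, with the response typed loosely to match the res.json payload above (BASE_URL hypothetical):

// Partial type: only the fields a chart widget might need.
interface ScraperOverview {
  kpi: { totalProducts: number; errors24h: number; activeWorkers: number };
  activityByHour: Array<{ hour: string; successful: number; failed: number; total: number }>;
}

async function fetchOverview(): Promise<ScraperOverview> {
  const res = await fetch(`${BASE_URL}/api/dutchie-az/scraper/overview`);
  if (!res.ok) throw new Error(`overview failed: ${res.status}`);
  return (await res.json()) as ScraperOverview;
}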
486
backend/src/dutchie-az/scripts/stress-test.ts
Normal file
@@ -0,0 +1,486 @@
#!/usr/bin/env npx tsx
/**
 * Crawler Reliability Stress Test
 *
 * Simulates various failure scenarios to test:
 * - Retry logic with exponential backoff
 * - Error taxonomy classification
 * - Self-healing (proxy/UA rotation)
 * - Status transitions (active -> degraded -> failed)
 * - Minimum crawl gap enforcement
 *
 * Phase 1: Crawler Reliability & Stabilization
 *
 * Usage:
 *   DATABASE_URL="postgresql://..." npx tsx src/dutchie-az/scripts/stress-test.ts [test-name]
 *
 * Available tests:
 *   retry    - Test retry manager with various error types
 *   backoff  - Test exponential backoff calculation
 *   status   - Test status transitions
 *   gap      - Test minimum crawl gap enforcement
 *   rotation - Test proxy/UA rotation
 *   all      - Run all tests
 */

import {
  CrawlErrorCode,
  classifyError,
  isRetryable,
  shouldRotateProxy,
  shouldRotateUserAgent,
  getBackoffMultiplier,
  getErrorMetadata,
} from '../services/error-taxonomy';

import {
  RetryManager,
  withRetry,
  calculateNextCrawlDelay,
  calculateNextCrawlAt,
  determineCrawlStatus,
  shouldAttemptRecovery,
  sleep,
} from '../services/retry-manager';

import {
  UserAgentRotator,
  USER_AGENTS,
} from '../services/proxy-rotator';

import {
  validateStoreConfig,
  isCrawlable,
  DEFAULT_CONFIG,
  RawStoreConfig,
} from '../services/store-validator';

// ============================================================
// TEST UTILITIES
// ============================================================

let testsPassed = 0;
let testsFailed = 0;

function assert(condition: boolean, message: string): void {
  if (condition) {
    console.log(` ✓ ${message}`);
    testsPassed++;
  } else {
    console.log(` ✗ ${message}`);
    testsFailed++;
  }
}

function section(name: string): void {
  console.log(`\n${'='.repeat(60)}`);
  console.log(`TEST: ${name}`);
  console.log('='.repeat(60));
}

// ============================================================
// TEST: Error Classification
// ============================================================

function testErrorClassification(): void {
  section('Error Classification');

  // HTTP status codes
  assert(classifyError(null, 429) === CrawlErrorCode.RATE_LIMITED, '429 -> RATE_LIMITED');
  assert(classifyError(null, 407) === CrawlErrorCode.BLOCKED_PROXY, '407 -> BLOCKED_PROXY');
  assert(classifyError(null, 401) === CrawlErrorCode.AUTH_FAILED, '401 -> AUTH_FAILED');
  assert(classifyError(null, 403) === CrawlErrorCode.AUTH_FAILED, '403 -> AUTH_FAILED');
  assert(classifyError(null, 503) === CrawlErrorCode.SERVICE_UNAVAILABLE, '503 -> SERVICE_UNAVAILABLE');
  assert(classifyError(null, 500) === CrawlErrorCode.SERVER_ERROR, '500 -> SERVER_ERROR');

  // Error messages
  assert(classifyError('rate limit exceeded') === CrawlErrorCode.RATE_LIMITED, 'rate limit message -> RATE_LIMITED');
  assert(classifyError('request timed out') === CrawlErrorCode.TIMEOUT, 'timeout message -> TIMEOUT');
  assert(classifyError('proxy blocked') === CrawlErrorCode.BLOCKED_PROXY, 'proxy blocked -> BLOCKED_PROXY');
  assert(classifyError('ECONNREFUSED') === CrawlErrorCode.NETWORK_ERROR, 'ECONNREFUSED -> NETWORK_ERROR');
  assert(classifyError('ENOTFOUND') === CrawlErrorCode.DNS_ERROR, 'ENOTFOUND -> DNS_ERROR');
  assert(classifyError('selector not found') === CrawlErrorCode.HTML_CHANGED, 'selector error -> HTML_CHANGED');
  assert(classifyError('JSON parse error') === CrawlErrorCode.PARSE_ERROR, 'parse error -> PARSE_ERROR');
  assert(classifyError('0 products found') === CrawlErrorCode.NO_PRODUCTS, 'no products -> NO_PRODUCTS');

  // Retryability
  assert(isRetryable(CrawlErrorCode.RATE_LIMITED) === true, 'RATE_LIMITED is retryable');
  assert(isRetryable(CrawlErrorCode.TIMEOUT) === true, 'TIMEOUT is retryable');
  assert(isRetryable(CrawlErrorCode.HTML_CHANGED) === false, 'HTML_CHANGED is NOT retryable');
  assert(isRetryable(CrawlErrorCode.INVALID_CONFIG) === false, 'INVALID_CONFIG is NOT retryable');

  // Rotation decisions
  assert(shouldRotateProxy(CrawlErrorCode.BLOCKED_PROXY) === true, 'BLOCKED_PROXY -> rotate proxy');
  assert(shouldRotateProxy(CrawlErrorCode.RATE_LIMITED) === true, 'RATE_LIMITED -> rotate proxy');
  assert(shouldRotateUserAgent(CrawlErrorCode.AUTH_FAILED) === true, 'AUTH_FAILED -> rotate UA');
}

// ============================================================
// TEST: Retry Manager
// ============================================================

function testRetryManager(): void {
  section('Retry Manager');

  const manager = new RetryManager({ maxRetries: 3, baseBackoffMs: 100 });

  // Initial state
  assert(manager.shouldAttempt() === true, 'Should attempt initially');
  assert(manager.getAttemptNumber() === 1, 'Attempt number starts at 1');

  // First attempt
  manager.recordAttempt();
  assert(manager.getAttemptNumber() === 2, 'Attempt number increments');

  // Evaluate retryable error
  const decision1 = manager.evaluateError(new Error('rate limit exceeded'), 429);
  assert(decision1.shouldRetry === true, 'Should retry on rate limit');
  assert(decision1.errorCode === CrawlErrorCode.RATE_LIMITED, 'Error code is RATE_LIMITED');
  assert(decision1.rotateProxy === true, 'Should rotate proxy');
  assert(decision1.backoffMs > 0, 'Backoff is positive');

  // More attempts
  manager.recordAttempt();
  manager.recordAttempt();

  // Now at max retries
  const decision2 = manager.evaluateError(new Error('timeout'), 504);
  assert(decision2.shouldRetry === true, 'Should still retry (at limit but not exceeded)');

  manager.recordAttempt();
  const decision3 = manager.evaluateError(new Error('timeout'));
  assert(decision3.shouldRetry === false, 'Should NOT retry after max');
  assert(decision3.reason.includes('exhausted'), 'Reason mentions exhausted');

  // Reset
  manager.reset();
  assert(manager.shouldAttempt() === true, 'Should attempt after reset');
  assert(manager.getAttemptNumber() === 1, 'Attempt number resets');

  // Non-retryable error
  const manager2 = new RetryManager({ maxRetries: 3 });
  manager2.recordAttempt();
  const nonRetryable = manager2.evaluateError(new Error('HTML structure changed'));
  assert(nonRetryable.shouldRetry === false, 'Non-retryable error stops immediately');
  assert(nonRetryable.errorCode === CrawlErrorCode.HTML_CHANGED, 'Error code is HTML_CHANGED');
}
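The tests above pin down the RetryManager contract; a hedged sketch of how it might drive a single store crawl in production (crawlOnce is hypothetical, only the calls exercised above are assumed):

// Sketch, not the shipped orchestrator: wires shouldAttempt/recordAttempt/
// evaluateError around a hypothetical single-attempt crawl.
async function crawlWithRetries(storeId: number): Promise<boolean> {
  const manager = new RetryManager({ maxRetries: 3, baseBackoffMs: 1000 });

  while (manager.shouldAttempt()) {
    manager.recordAttempt();
    try {
      await crawlOnce(storeId); // hypothetical single-attempt crawl
      return true;
    } catch (error: any) {
      const decision = manager.evaluateError(error, error.statusCode);
      if (!decision.shouldRetry) return false;
      if (decision.rotateProxy) {
        // rotate proxy here before the next attempt
      }
      await sleep(decision.backoffMs);
    }
  }
  return false;
}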

// ============================================================
// TEST: Exponential Backoff
// ============================================================

function testExponentialBackoff(): void {
  section('Exponential Backoff');

  // Calculate next crawl delay
  const delay0 = calculateNextCrawlDelay(0, 240); // No failures
  const delay1 = calculateNextCrawlDelay(1, 240); // 1 failure
  const delay2 = calculateNextCrawlDelay(2, 240); // 2 failures
  const delay3 = calculateNextCrawlDelay(3, 240); // 3 failures
  const delay5 = calculateNextCrawlDelay(5, 240); // 5 failures (should cap)

  console.log(` Delay with 0 failures: ${delay0} minutes`);
  console.log(` Delay with 1 failure: ${delay1} minutes`);
  console.log(` Delay with 2 failures: ${delay2} minutes`);
  console.log(` Delay with 3 failures: ${delay3} minutes`);
  console.log(` Delay with 5 failures: ${delay5} minutes`);

  assert(delay1 > delay0, 'Delay increases with failures');
  assert(delay2 > delay1, 'Delay keeps increasing');
  assert(delay3 > delay2, 'More delay with more failures');
  // With jitter, exact values vary but ratio should be close to 2x
  assert(delay5 <= 240 * 4 * 1.2, 'Delay is capped at max multiplier');

  // Next crawl time calculation
  const now = new Date();
  const nextAt = calculateNextCrawlAt(2, 240);
  assert(nextAt > now, 'Next crawl is in future');
  assert(nextAt.getTime() - now.getTime() > 240 * 60 * 1000, 'Includes backoff');
}
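The assertions imply a base interval of 240 minutes, a multiplier capped at 4x, and jitter of at most about 20%. One plausible shape consistent with those constraints, as an assumption rather than the confirmed internals of calculateNextCrawlDelay:

// Hypothetical reconstruction: growth per failure with a 4x cap and +/-20%
// jitter. The real curve may grow more gently (the delay3 > delay2 assertion
// suggests it is not pure doubling once the cap is near).
function sketchDelay(failures: number, baseMinutes: number): number {
  const multiplier = Math.min(Math.pow(2, failures), 4); // cap at 4x
  const jitter = 1 + (Math.random() * 0.4 - 0.2);        // +/-20%
  return Math.round(baseMinutes * multiplier * jitter);
}
// Under these assumptions: 0 failures -> ~240 min, 1 -> ~480, 2+ -> capped near 960.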

// ============================================================
// TEST: Status Transitions
// ============================================================

function testStatusTransitions(): void {
  section('Status Transitions');

  // Active status
  assert(determineCrawlStatus(0) === 'active', '0 failures -> active');
  assert(determineCrawlStatus(1) === 'active', '1 failure -> active');
  assert(determineCrawlStatus(2) === 'active', '2 failures -> active');

  // Degraded status
  assert(determineCrawlStatus(3) === 'degraded', '3 failures -> degraded');
  assert(determineCrawlStatus(5) === 'degraded', '5 failures -> degraded');
  assert(determineCrawlStatus(9) === 'degraded', '9 failures -> degraded');

  // Failed status
  assert(determineCrawlStatus(10) === 'failed', '10 failures -> failed');
  assert(determineCrawlStatus(15) === 'failed', '15 failures -> failed');

  // Custom thresholds
  const customStatus = determineCrawlStatus(5, { degraded: 5, failed: 8 });
  assert(customStatus === 'degraded', 'Custom threshold: 5 -> degraded');

  // Recovery check
  const recentFailure = new Date(Date.now() - 1 * 60 * 60 * 1000); // 1 hour ago
  const oldFailure = new Date(Date.now() - 48 * 60 * 60 * 1000); // 48 hours ago

  assert(shouldAttemptRecovery(recentFailure, 1) === false, 'No recovery for recent failure');
  assert(shouldAttemptRecovery(oldFailure, 1) === true, 'Recovery allowed for old failure');
  assert(shouldAttemptRecovery(null, 0) === true, 'Recovery allowed if no previous failure');
}

// ============================================================
// TEST: Store Validation
// ============================================================

function testStoreValidation(): void {
  section('Store Validation');

  // Valid config
  const validConfig: RawStoreConfig = {
    id: 1,
    name: 'Test Store',
    platformDispensaryId: '123abc',
    menuType: 'dutchie',
  };
  const validResult = validateStoreConfig(validConfig);
  assert(validResult.isValid === true, 'Valid config passes');
  assert(validResult.config !== null, 'Valid config returns config');
  assert(validResult.config?.slug === 'test-store', 'Slug is generated');

  // Missing required fields
  const missingId: RawStoreConfig = {
    id: 0,
    name: 'Test',
    platformDispensaryId: '123',
    menuType: 'dutchie',
  };
  const missingIdResult = validateStoreConfig(missingId);
  assert(missingIdResult.isValid === false, 'Missing ID fails');

  // Missing platform ID
  const missingPlatform: RawStoreConfig = {
    id: 1,
    name: 'Test',
    menuType: 'dutchie',
  };
  const missingPlatformResult = validateStoreConfig(missingPlatform);
  assert(missingPlatformResult.isValid === false, 'Missing platform ID fails');

  // Unknown menu type
  const unknownMenu: RawStoreConfig = {
    id: 1,
    name: 'Test',
    platformDispensaryId: '123',
    menuType: 'unknown',
  };
  const unknownMenuResult = validateStoreConfig(unknownMenu);
  assert(unknownMenuResult.isValid === false, 'Unknown menu type fails');

  // Crawlable check
  assert(isCrawlable(validConfig) === true, 'Valid config is crawlable');
  assert(isCrawlable(missingPlatform) === false, 'Missing platform not crawlable');
  assert(isCrawlable({ ...validConfig, crawlStatus: 'failed' }) === false, 'Failed status not crawlable');
  assert(isCrawlable({ ...validConfig, crawlStatus: 'paused' }) === false, 'Paused status not crawlable');
}

// ============================================================
// TEST: User Agent Rotation
// ============================================================

function testUserAgentRotation(): void {
  section('User Agent Rotation');

  const rotator = new UserAgentRotator();

  const first = rotator.getCurrent();
  const second = rotator.getNext();
  const third = rotator.getNext();

  assert(first !== second, 'User agents rotate');
  assert(second !== third, 'User agents keep rotating');
  assert(USER_AGENTS.includes(first), 'Returns valid UA');
  assert(USER_AGENTS.includes(second), 'Returns valid UA');

  // Random UA
  const random = rotator.getRandom();
  assert(USER_AGENTS.includes(random), 'Random returns valid UA');

  // Count
  assert(rotator.getCount() === USER_AGENTS.length, 'Reports correct count');
}

// ============================================================
// TEST: WithRetry Helper
// ============================================================

async function testWithRetryHelper(): Promise<void> {
  section('WithRetry Helper');

  // Successful on first try
  let attempts = 0;
  const successResult = await withRetry(async () => {
    attempts++;
    return 'success';
  }, { maxRetries: 3 });
  assert(attempts === 1, 'Succeeds on first try');
  assert(successResult.result === 'success', 'Returns result');

  // Fails then succeeds
  let failThenSucceedAttempts = 0;
  const failThenSuccessResult = await withRetry(async () => {
    failThenSucceedAttempts++;
    if (failThenSucceedAttempts < 3) {
      throw new Error('temporary error');
    }
    return 'finally succeeded';
  }, { maxRetries: 5, baseBackoffMs: 10 });
  assert(failThenSucceedAttempts === 3, 'Retries until success');
  assert(failThenSuccessResult.result === 'finally succeeded', 'Returns final result');
  assert(failThenSuccessResult.summary.attemptsMade === 3, 'Summary tracks attempts');

  // Exhausts retries
  let alwaysFailAttempts = 0;
  try {
    await withRetry(async () => {
      alwaysFailAttempts++;
      throw new Error('always fails');
    }, { maxRetries: 2, baseBackoffMs: 10 });
    assert(false, 'Should have thrown');
  } catch (error: any) {
    assert(alwaysFailAttempts === 3, 'Attempts all retries'); // 1 initial + 2 retries
    assert(error.name === 'RetryExhaustedError', 'Throws RetryExhaustedError');
  }

  // Non-retryable error stops immediately
  let nonRetryableAttempts = 0;
  try {
    await withRetry(async () => {
      nonRetryableAttempts++;
      const err = new Error('HTML structure changed - selector not found');
      throw err;
    }, { maxRetries: 3, baseBackoffMs: 10 });
    assert(false, 'Should have thrown');
  } catch {
    assert(nonRetryableAttempts === 1, 'Non-retryable stops immediately');
  }
}

// ============================================================
// TEST: Minimum Crawl Gap
// ============================================================

function testMinimumCrawlGap(): void {
  section('Minimum Crawl Gap');

  // Default config
  assert(DEFAULT_CONFIG.minCrawlGapMinutes === 2, 'Default gap is 2 minutes');
  assert(DEFAULT_CONFIG.crawlFrequencyMinutes === 240, 'Default frequency is 4 hours');

  // Gap calculation
  const gapMs = DEFAULT_CONFIG.minCrawlGapMinutes * 60 * 1000;
  assert(gapMs === 120000, 'Gap is 2 minutes in ms');

  console.log(' Note: Gap enforcement is tested at DB level (trigger) and application level');
}

// ============================================================
// TEST: Error Metadata
// ============================================================

function testErrorMetadata(): void {
  section('Error Metadata');

  // RATE_LIMITED
  const rateLimited = getErrorMetadata(CrawlErrorCode.RATE_LIMITED);
  assert(rateLimited.retryable === true, 'RATE_LIMITED is retryable');
  assert(rateLimited.rotateProxy === true, 'RATE_LIMITED rotates proxy');
  assert(rateLimited.backoffMultiplier === 2.0, 'RATE_LIMITED has 2x backoff');
  assert(rateLimited.severity === 'medium', 'RATE_LIMITED is medium severity');

  // HTML_CHANGED
  const htmlChanged = getErrorMetadata(CrawlErrorCode.HTML_CHANGED);
  assert(htmlChanged.retryable === false, 'HTML_CHANGED is NOT retryable');
  assert(htmlChanged.severity === 'high', 'HTML_CHANGED is high severity');

  // INVALID_CONFIG
  const invalidConfig = getErrorMetadata(CrawlErrorCode.INVALID_CONFIG);
  assert(invalidConfig.retryable === false, 'INVALID_CONFIG is NOT retryable');
  assert(invalidConfig.severity === 'critical', 'INVALID_CONFIG is critical');
}

// ============================================================
// MAIN
// ============================================================

async function runTests(testName?: string): Promise<void> {
  console.log('\n');
  console.log('╔══════════════════════════════════════════════════════════╗');
  console.log('║ CRAWLER RELIABILITY STRESS TEST - PHASE 1 ║');
  console.log('╚══════════════════════════════════════════════════════════╝');

  const allTests = !testName || testName === 'all';

  if (allTests || testName === 'error' || testName === 'classification') {
    testErrorClassification();
  }

  if (allTests || testName === 'retry') {
    testRetryManager();
  }

  if (allTests || testName === 'backoff') {
    testExponentialBackoff();
  }

  if (allTests || testName === 'status') {
    testStatusTransitions();
  }

  if (allTests || testName === 'validation' || testName === 'store') {
    testStoreValidation();
  }

  if (allTests || testName === 'rotation' || testName === 'ua') {
    testUserAgentRotation();
  }

  if (allTests || testName === 'withRetry' || testName === 'helper') {
    await testWithRetryHelper();
  }

  if (allTests || testName === 'gap') {
    testMinimumCrawlGap();
  }

  if (allTests || testName === 'metadata') {
    testErrorMetadata();
  }

  // Summary
  console.log('\n');
  console.log('═'.repeat(60));
  console.log('SUMMARY');
  console.log('═'.repeat(60));
  console.log(` Passed: ${testsPassed}`);
  console.log(` Failed: ${testsFailed}`);
  console.log(` Total: ${testsPassed + testsFailed}`);

  if (testsFailed > 0) {
    console.log('\n❌ SOME TESTS FAILED\n');
    process.exit(1);
  } else {
    console.log('\n✅ ALL TESTS PASSED\n');
    process.exit(0);
  }
}

// Run tests
const testName = process.argv[2];
runTests(testName).catch((error) => {
  console.error('Fatal error:', error);
  process.exit(1);
});
659
backend/src/dutchie-az/services/analytics/brand-opportunity.ts
Normal file
@@ -0,0 +1,659 @@
/**
 * Brand Opportunity / Risk Analytics Service
 *
 * Provides brand-level opportunity and risk analysis including:
 * - Under/overpriced vs market
 * - Missing SKU opportunities
 * - Stores with declining/growing shelf share
 * - Competitor intrusion alerts
 *
 * Phase 3: Analytics Dashboards
 */

import { Pool } from 'pg';
import { AnalyticsCache, cacheKey } from './cache';

export interface BrandOpportunity {
  brandName: string;
  underpricedVsMarket: PricePosition[];
  overpricedVsMarket: PricePosition[];
  missingSkuOpportunities: MissingSkuOpportunity[];
  storesWithDecliningShelfShare: StoreShelfShareChange[];
  storesWithGrowingShelfShare: StoreShelfShareChange[];
  competitorIntrusionAlerts: CompetitorAlert[];
  overallScore: number; // 0-100, higher = more opportunity
  riskScore: number; // 0-100, higher = more risk
}

export interface PricePosition {
  category: string;
  brandAvgPrice: number;
  marketAvgPrice: number;
  priceDifferencePercent: number;
  skuCount: number;
  suggestion: string;
}

export interface MissingSkuOpportunity {
  category: string;
  subcategory: string | null;
  marketSkuCount: number;
  brandSkuCount: number;
  gapPercent: number;
  topCompetitors: string[];
  opportunityScore: number; // 0-100
}

export interface StoreShelfShareChange {
  storeId: number;
  storeName: string;
  city: string;
  state: string;
  currentShelfShare: number;
  previousShelfShare: number;
  changePercent: number;
  currentSkus: number;
  competitors: string[];
}

export interface CompetitorAlert {
  competitorBrand: string;
  storeId: number;
  storeName: string;
  alertType: 'new_entry' | 'expanding' | 'price_undercut';
  details: string;
  severity: 'low' | 'medium' | 'high';
  date: string;
}

export interface MarketPositionSummary {
  brandName: string;
  marketSharePercent: number;
  avgPriceVsMarket: number; // -X% to +X%
  categoryStrengths: Array<{ category: string; shelfSharePercent: number }>;
  categoryWeaknesses: Array<{ category: string; shelfSharePercent: number; marketLeader: string }>;
  growthTrend: 'growing' | 'stable' | 'declining';
  competitorThreats: string[];
}

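A hedged wiring sketch for the service defined below (the zero-arg AnalyticsCache construction and the brand name are assumptions this diff does not confirm):

// Sketch of constructing and querying the service; run under tsx/ESM.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const service = new BrandOpportunityService(pool, new AnalyticsCache());

async function demo(): Promise<void> {
  const report = await service.getBrandOpportunity('Example Brand'); // illustrative name
  console.log(report.overallScore, report.riskScore);
}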
export class BrandOpportunityService {
  private pool: Pool;
  private cache: AnalyticsCache;

  constructor(pool: Pool, cache: AnalyticsCache) {
    this.pool = pool;
    this.cache = cache;
  }

  /**
   * Get full opportunity analysis for a brand
   */
  async getBrandOpportunity(brandName: string): Promise<BrandOpportunity> {
    const key = cacheKey('brand_opportunity', { brandName });

    return (await this.cache.getOrCompute(key, async () => {
      const [
        underpriced,
        overpriced,
        missingSkus,
        decliningStores,
        growingStores,
        alerts,
      ] = await Promise.all([
        this.getUnderpricedPositions(brandName),
        this.getOverpricedPositions(brandName),
        this.getMissingSkuOpportunities(brandName),
        this.getStoresWithDecliningShare(brandName),
        this.getStoresWithGrowingShare(brandName),
        this.getCompetitorAlerts(brandName),
      ]);

      // Calculate opportunity score (higher = more opportunity)
      const opportunityFactors = [
        missingSkus.length > 0 ? 20 : 0,
        underpriced.length > 0 ? 15 : 0,
        growingStores.length > 5 ? 20 : growingStores.length * 3,
        missingSkus.reduce((sum, m) => sum + m.opportunityScore, 0) / Math.max(1, missingSkus.length) * 0.3,
      ];
      const opportunityScore = Math.min(100, opportunityFactors.reduce((a, b) => a + b, 0));

      // Calculate risk score (higher = more risk)
      const riskFactors = [
        decliningStores.length > 5 ? 30 : decliningStores.length * 5,
        alerts.filter(a => a.severity === 'high').length * 15,
        alerts.filter(a => a.severity === 'medium').length * 8,
        overpriced.length > 3 ? 15 : overpriced.length * 3,
      ];
      const riskScore = Math.min(100, riskFactors.reduce((a, b) => a + b, 0));

      return {
        brandName,
        underpricedVsMarket: underpriced,
        overpricedVsMarket: overpriced,
        missingSkuOpportunities: missingSkus,
        storesWithDecliningShelfShare: decliningStores,
        storesWithGrowingShelfShare: growingStores,
        competitorIntrusionAlerts: alerts,
        overallScore: Math.round(opportunityScore),
        riskScore: Math.round(riskScore),
      };
    }, 30)).data;
  }
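  // To make the scoring arithmetic above concrete, a worked example with
  // illustrative inputs: 4 missing-SKU gaps (avg opportunityScore 50),
  // 2 underpriced categories, 3 growing stores, 7 declining stores,
  // 1 high + 2 medium alerts, 1 overpriced category.
  //   opportunity = 20 (gaps exist) + 15 (underpriced) + 3*3 (growing) + 50*0.3 = 59
  //   risk        = 30 (>5 declining) + 1*15 + 2*8 + 1*3 = 64
  // Both totals are then clamped to [0, 100] and rounded.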

  /**
   * Get categories where brand is underpriced vs market
   */
  async getUnderpricedPositions(brandName: string): Promise<PricePosition[]> {
    const result = await this.pool.query(`
      WITH brand_prices AS (
        SELECT
          type as category,
          AVG(extract_min_price(latest_raw_payload)) as brand_avg,
          COUNT(*) as sku_count
        FROM dutchie_products
        WHERE brand_name = $1 AND type IS NOT NULL
        GROUP BY type
        HAVING COUNT(*) >= 3
      ),
      market_prices AS (
        SELECT
          type as category,
          AVG(extract_min_price(latest_raw_payload)) as market_avg
        FROM dutchie_products
        WHERE type IS NOT NULL AND brand_name != $1
        GROUP BY type
      )
      SELECT
        bp.category,
        bp.brand_avg,
        mp.market_avg,
        bp.sku_count,
        ((bp.brand_avg - mp.market_avg) / NULLIF(mp.market_avg, 0)) * 100 as diff_pct
      FROM brand_prices bp
      JOIN market_prices mp ON bp.category = mp.category
      WHERE bp.brand_avg < mp.market_avg * 0.9 -- 10% or more below market
        AND bp.brand_avg IS NOT NULL
        AND mp.market_avg IS NOT NULL
      ORDER BY diff_pct
    `, [brandName]);

    return result.rows.map(row => ({
      category: row.category,
      brandAvgPrice: Math.round(parseFloat(row.brand_avg) * 100) / 100,
      marketAvgPrice: Math.round(parseFloat(row.market_avg) * 100) / 100,
      priceDifferencePercent: Math.round(parseFloat(row.diff_pct) * 10) / 10,
      skuCount: parseInt(row.sku_count) || 0,
      suggestion: `Consider price increase - ${Math.abs(Math.round(parseFloat(row.diff_pct)))}% below market average`,
    }));
  }

  /**
   * Get categories where brand is overpriced vs market
   */
  async getOverpricedPositions(brandName: string): Promise<PricePosition[]> {
    const result = await this.pool.query(`
      WITH brand_prices AS (
        SELECT
          type as category,
          AVG(extract_min_price(latest_raw_payload)) as brand_avg,
          COUNT(*) as sku_count
        FROM dutchie_products
        WHERE brand_name = $1 AND type IS NOT NULL
        GROUP BY type
        HAVING COUNT(*) >= 3
      ),
      market_prices AS (
        SELECT
          type as category,
          AVG(extract_min_price(latest_raw_payload)) as market_avg
        FROM dutchie_products
        WHERE type IS NOT NULL AND brand_name != $1
        GROUP BY type
      )
      SELECT
        bp.category,
        bp.brand_avg,
        mp.market_avg,
        bp.sku_count,
        ((bp.brand_avg - mp.market_avg) / NULLIF(mp.market_avg, 0)) * 100 as diff_pct
      FROM brand_prices bp
      JOIN market_prices mp ON bp.category = mp.category
      WHERE bp.brand_avg > mp.market_avg * 1.15 -- 15% or more above market
        AND bp.brand_avg IS NOT NULL
        AND mp.market_avg IS NOT NULL
      ORDER BY diff_pct DESC
    `, [brandName]);

    return result.rows.map(row => ({
      category: row.category,
      brandAvgPrice: Math.round(parseFloat(row.brand_avg) * 100) / 100,
      marketAvgPrice: Math.round(parseFloat(row.market_avg) * 100) / 100,
      priceDifferencePercent: Math.round(parseFloat(row.diff_pct) * 10) / 10,
      skuCount: parseInt(row.sku_count) || 0,
      suggestion: `Price sensitivity risk - ${Math.round(parseFloat(row.diff_pct))}% above market average`,
    }));
  }

  /**
   * Get missing SKU opportunities (category gaps)
   */
  async getMissingSkuOpportunities(brandName: string): Promise<MissingSkuOpportunity[]> {
    const result = await this.pool.query(`
      WITH market_categories AS (
        SELECT
          type as category,
          subcategory,
          COUNT(*) as market_skus,
          ARRAY_AGG(DISTINCT brand_name ORDER BY brand_name) FILTER (WHERE brand_name IS NOT NULL) as top_brands
        FROM dutchie_products
        WHERE type IS NOT NULL
        GROUP BY type, subcategory
        HAVING COUNT(*) >= 20
      ),
      brand_presence AS (
        SELECT
          type as category,
          subcategory,
          COUNT(*) as brand_skus
        FROM dutchie_products
        WHERE brand_name = $1 AND type IS NOT NULL
        GROUP BY type, subcategory
      )
      SELECT
        mc.category,
        mc.subcategory,
        mc.market_skus,
        COALESCE(bp.brand_skus, 0) as brand_skus,
        mc.top_brands[1:5] as competitors
      FROM market_categories mc
      LEFT JOIN brand_presence bp ON mc.category = bp.category
        AND (mc.subcategory = bp.subcategory OR (mc.subcategory IS NULL AND bp.subcategory IS NULL))
      WHERE COALESCE(bp.brand_skus, 0) < mc.market_skus * 0.05 -- Brand has <5% of market presence
      ORDER BY mc.market_skus DESC
      LIMIT 10
    `, [brandName]);

    return result.rows.map(row => {
      const marketSkus = parseInt(row.market_skus) || 0;
      const brandSkus = parseInt(row.brand_skus) || 0;
      const gapPercent = marketSkus > 0 ? ((marketSkus - brandSkus) / marketSkus) * 100 : 100;
      const opportunityScore = Math.min(100, Math.round((marketSkus / 100) * (gapPercent / 100) * 100));

      return {
        category: row.category,
        subcategory: row.subcategory,
        marketSkuCount: marketSkus,
        brandSkuCount: brandSkus,
        gapPercent: Math.round(gapPercent),
        topCompetitors: (row.competitors || []).filter((c: string) => c !== brandName).slice(0, 5),
        opportunityScore,
      };
    });
  }

  /**
   * Get stores where brand's shelf share is declining
   */
  async getStoresWithDecliningShare(brandName: string): Promise<StoreShelfShareChange[]> {
    // Use brand_snapshots for historical comparison
    const result = await this.pool.query(`
      WITH current_share AS (
        SELECT
          dp.dispensary_id as store_id,
          d.name as store_name,
          d.city,
          d.state,
          COUNT(*) FILTER (WHERE dp.brand_name = $1) as brand_skus,
          COUNT(*) as total_skus,
          ARRAY_AGG(DISTINCT dp.brand_name) FILTER (WHERE dp.brand_name != $1 AND dp.brand_name IS NOT NULL) as competitors
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        GROUP BY dp.dispensary_id, d.name, d.city, d.state
        HAVING COUNT(*) FILTER (WHERE dp.brand_name = $1) > 0
      )
      SELECT
        cs.store_id,
        cs.store_name,
        cs.city,
        cs.state,
        cs.brand_skus as current_skus,
        cs.total_skus,
        ROUND((cs.brand_skus::NUMERIC / cs.total_skus) * 100, 2) as current_share,
        cs.competitors[1:5] as top_competitors
      FROM current_share cs
      WHERE cs.brand_skus < 10 -- Low presence
      ORDER BY cs.brand_skus
      LIMIT 10
    `, [brandName]);

    return result.rows.map(row => ({
      storeId: row.store_id,
      storeName: row.store_name,
      city: row.city,
      state: row.state,
      currentShelfShare: parseFloat(row.current_share) || 0,
      previousShelfShare: parseFloat(row.current_share) || 0, // Would need historical data
      changePercent: 0,
      currentSkus: parseInt(row.current_skus) || 0,
      competitors: row.top_competitors || [],
    }));
  }

  /**
   * Get stores where brand's shelf share is growing
   */
  async getStoresWithGrowingShare(brandName: string): Promise<StoreShelfShareChange[]> {
    const result = await this.pool.query(`
      WITH store_share AS (
        SELECT
          dp.dispensary_id as store_id,
          d.name as store_name,
          d.city,
          d.state,
          COUNT(*) FILTER (WHERE dp.brand_name = $1) as brand_skus,
          COUNT(*) as total_skus,
          ARRAY_AGG(DISTINCT dp.brand_name) FILTER (WHERE dp.brand_name != $1 AND dp.brand_name IS NOT NULL) as competitors
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        GROUP BY dp.dispensary_id, d.name, d.city, d.state
        HAVING COUNT(*) FILTER (WHERE dp.brand_name = $1) > 0
      )
      SELECT
        ss.store_id,
        ss.store_name,
        ss.city,
        ss.state,
        ss.brand_skus as current_skus,
        ss.total_skus,
        ROUND((ss.brand_skus::NUMERIC / ss.total_skus) * 100, 2) as current_share,
        ss.competitors[1:5] as top_competitors
      FROM store_share ss
      ORDER BY current_share DESC
      LIMIT 10
    `, [brandName]);

    return result.rows.map(row => ({
      storeId: row.store_id,
      storeName: row.store_name,
      city: row.city,
      state: row.state,
      currentShelfShare: parseFloat(row.current_share) || 0,
      previousShelfShare: parseFloat(row.current_share) || 0,
      changePercent: 0,
      currentSkus: parseInt(row.current_skus) || 0,
      competitors: row.top_competitors || [],
    }));
  }

  /**
   * Get competitor intrusion alerts
   */
  async getCompetitorAlerts(brandName: string): Promise<CompetitorAlert[]> {
    // Check for competitor entries in stores where this brand has presence
    const result = await this.pool.query(`
      WITH brand_stores AS (
        SELECT DISTINCT dispensary_id
        FROM dutchie_products
        WHERE brand_name = $1
      ),
      competitor_presence AS (
        SELECT
          dp.brand_name as competitor,
          dp.dispensary_id as store_id,
          d.name as store_name,
          COUNT(*) as sku_count,
          MAX(dp.created_at) as latest_add
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        WHERE dp.dispensary_id IN (SELECT dispensary_id FROM brand_stores)
          AND dp.brand_name != $1
          AND dp.brand_name IS NOT NULL
          AND dp.created_at >= NOW() - INTERVAL '30 days'
        GROUP BY dp.brand_name, dp.dispensary_id, d.name
        HAVING COUNT(*) >= 5
      )
      SELECT
        competitor,
        store_id,
        store_name,
        sku_count,
        latest_add
      FROM competitor_presence
      ORDER BY sku_count DESC
      LIMIT 10
    `, [brandName]);

    return result.rows.map(row => {
      const skuCount = parseInt(row.sku_count) || 0;
      let severity: 'low' | 'medium' | 'high' = 'low';
      if (skuCount >= 20) severity = 'high';
      else if (skuCount >= 10) severity = 'medium';

      return {
        competitorBrand: row.competitor,
        storeId: row.store_id,
        storeName: row.store_name,
        alertType: 'expanding' as const,
        details: `${row.competitor} has ${skuCount} SKUs in ${row.store_name}`,
        severity,
        date: new Date(row.latest_add).toISOString().split('T')[0],
      };
    });
  }

  /**
   * Get market position summary for a brand
   */
  async getMarketPositionSummary(brandName: string): Promise<MarketPositionSummary> {
    const key = cacheKey('market_position', { brandName });

    return (await this.cache.getOrCompute(key, async () => {
      const [shareResult, priceResult, categoryResult, threatResult] = await Promise.all([
        // Market share
        this.pool.query(`
          SELECT
            (SELECT COUNT(*) FROM dutchie_products WHERE brand_name = $1) as brand_count,
            (SELECT COUNT(*) FROM dutchie_products) as total_count
        `, [brandName]),

        // Price vs market
        this.pool.query(`
          SELECT
            (SELECT AVG(extract_min_price(latest_raw_payload)) FROM dutchie_products WHERE brand_name = $1) as brand_avg,
            (SELECT AVG(extract_min_price(latest_raw_payload)) FROM dutchie_products WHERE brand_name != $1) as market_avg
        `, [brandName]),

        // Category strengths/weaknesses
        this.pool.query(`
          WITH brand_by_cat AS (
            SELECT type as category, COUNT(*) as brand_count
            FROM dutchie_products
            WHERE brand_name = $1 AND type IS NOT NULL
            GROUP BY type
          ),
          market_by_cat AS (
            SELECT type as category, COUNT(*) as total_count
            FROM dutchie_products WHERE type IS NOT NULL
            GROUP BY type
          ),
          leaders AS (
            SELECT type as category, brand_name, COUNT(*) as cnt,
              RANK() OVER (PARTITION BY type ORDER BY COUNT(*) DESC) as rnk
            FROM dutchie_products WHERE type IS NOT NULL AND brand_name IS NOT NULL
            GROUP BY type, brand_name
          )
          SELECT
            mc.category,
            COALESCE(bc.brand_count, 0) as brand_count,
            mc.total_count,
            ROUND((COALESCE(bc.brand_count, 0)::NUMERIC / mc.total_count) * 100, 2) as share_pct,
            (SELECT brand_name FROM leaders WHERE category = mc.category AND rnk = 1) as leader
          FROM market_by_cat mc
          LEFT JOIN brand_by_cat bc ON mc.category = bc.category
          ORDER BY share_pct DESC
        `, [brandName]),

        // Top competitors
        this.pool.query(`
          SELECT brand_name, COUNT(*) as cnt
          FROM dutchie_products
          WHERE brand_name IS NOT NULL AND brand_name != $1
          GROUP BY brand_name
          ORDER BY cnt DESC
          LIMIT 5
        `, [brandName]),
      ]);

      const brandCount = parseInt(shareResult.rows[0]?.brand_count) || 0;
      const totalCount = parseInt(shareResult.rows[0]?.total_count) || 1;
      const marketSharePercent = Math.round((brandCount / totalCount) * 1000) / 10;

      const brandAvg = parseFloat(priceResult.rows[0]?.brand_avg) || 0;
      const marketAvg = parseFloat(priceResult.rows[0]?.market_avg) || 1;
      const avgPriceVsMarket = Math.round(((brandAvg - marketAvg) / marketAvg) * 1000) / 10;

      const categories = categoryResult.rows;
      const strengths = categories
        .filter(c => parseFloat(c.share_pct) > 5)
        .map(c => ({ category: c.category, shelfSharePercent: parseFloat(c.share_pct) }));

      const weaknesses = categories
        .filter(c => parseFloat(c.share_pct) < 2 && c.leader !== brandName)
        .map(c => ({
          category: c.category,
          shelfSharePercent: parseFloat(c.share_pct),
          marketLeader: c.leader || 'Unknown',
        }));

      return {
        brandName,
        marketSharePercent,
        avgPriceVsMarket,
        categoryStrengths: strengths.slice(0, 5),
        categoryWeaknesses: weaknesses.slice(0, 5),
        growthTrend: 'stable' as const, // Would need historical data
        competitorThreats: threatResult.rows.map(r => r.brand_name),
      };
    }, 30)).data;
  }

  /**
   * Create an analytics alert
   */
  async createAlert(alert: {
    alertType: string;
    severity: 'info' | 'warning' | 'critical';
    title: string;
    description?: string;
    storeId?: number;
    brandName?: string;
    productId?: number;
    category?: string;
    metadata?: Record<string, unknown>;
  }): Promise<void> {
    await this.pool.query(`
      INSERT INTO analytics_alerts
        (alert_type, severity, title, description, store_id, brand_name, product_id, category, metadata)
      VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
    `, [
      alert.alertType,
      alert.severity,
      alert.title,
      alert.description || null,
      alert.storeId || null,
|
||||
alert.brandName || null,
|
||||
alert.productId || null,
|
||||
alert.category || null,
|
||||
alert.metadata ? JSON.stringify(alert.metadata) : null,
|
||||
]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get recent alerts
|
||||
*/
|
||||
async getAlerts(filters: {
|
||||
brandName?: string;
|
||||
storeId?: number;
|
||||
alertType?: string;
|
||||
unreadOnly?: boolean;
|
||||
limit?: number;
|
||||
} = {}): Promise<Array<{
|
||||
id: number;
|
||||
alertType: string;
|
||||
severity: string;
|
||||
title: string;
|
||||
description: string | null;
|
||||
storeName: string | null;
|
||||
brandName: string | null;
|
||||
createdAt: string;
|
||||
isRead: boolean;
|
||||
}>> {
|
||||
const { brandName, storeId, alertType, unreadOnly = false, limit = 50 } = filters;
|
||||
const params: (string | number | boolean)[] = [limit];
|
||||
const conditions: string[] = [];
|
||||
let paramIndex = 2;
|
||||
|
||||
if (brandName) {
|
||||
conditions.push(`a.brand_name = $${paramIndex++}`);
|
||||
params.push(brandName);
|
||||
}
|
||||
if (storeId) {
|
||||
conditions.push(`a.store_id = $${paramIndex++}`);
|
||||
params.push(storeId);
|
||||
}
|
||||
if (alertType) {
|
||||
conditions.push(`a.alert_type = $${paramIndex++}`);
|
||||
params.push(alertType);
|
||||
}
|
||||
if (unreadOnly) {
|
||||
conditions.push('a.is_read = false');
|
||||
}
|
||||
|
||||
const whereClause = conditions.length > 0
|
||||
? 'WHERE ' + conditions.join(' AND ')
|
||||
: '';
|
||||
|
||||
const result = await this.pool.query(`
|
||||
SELECT
|
||||
a.id,
|
||||
a.alert_type,
|
||||
a.severity,
|
||||
a.title,
|
||||
a.description,
|
||||
d.name as store_name,
|
||||
a.brand_name,
|
||||
a.created_at,
|
||||
a.is_read
|
||||
FROM analytics_alerts a
|
||||
LEFT JOIN dispensaries d ON a.store_id = d.id
|
||||
${whereClause}
|
||||
ORDER BY a.created_at DESC
|
||||
LIMIT $1
|
||||
`, params);
|
||||
|
||||
return result.rows.map(row => ({
|
||||
id: row.id,
|
||||
alertType: row.alert_type,
|
||||
severity: row.severity,
|
||||
title: row.title,
|
||||
description: row.description,
|
||||
storeName: row.store_name,
|
||||
brandName: row.brand_name,
|
||||
createdAt: row.created_at.toISOString(),
|
||||
isRead: row.is_read,
|
||||
}));
|
||||
}
|
||||
|
||||
/**
|
||||
* Mark alerts as read
|
||||
*/
|
||||
async markAlertsRead(alertIds: number[]): Promise<void> {
|
||||
if (alertIds.length === 0) return;
|
||||
|
||||
await this.pool.query(`
|
||||
UPDATE analytics_alerts
|
||||
SET is_read = true
|
||||
WHERE id = ANY($1)
|
||||
`, [alertIds]);
|
||||
}
|
||||
}
|
||||
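A minimal sketch of how the alert methods above could be chained: scan a brand's competitor alerts and persist the severe ones so they appear in the alerts feed. The `(pool, cache)` constructor shape is assumed to match the other services in this module, and the `competitor_intrusion` alert_type value is illustrative, not taken from the source.

```ts
import { Pool } from 'pg';
import { AnalyticsCache } from './cache';
import { BrandOpportunityService } from './brand-opportunity';

// Hypothetical wiring: persist high-severity competitor intrusions as alerts.
async function persistHighSeverityAlerts(pool: Pool, brandName: string): Promise<void> {
  const service = new BrandOpportunityService(pool, new AnalyticsCache(pool));
  const alerts = await service.getCompetitorAlerts(brandName);

  for (const alert of alerts.filter(a => a.severity === 'high')) {
    await service.createAlert({
      alertType: 'competitor_intrusion', // illustrative alert_type value
      severity: 'warning',
      title: `${alert.competitorBrand} expanding in ${alert.storeName}`,
      description: alert.details,
      brandName,
      metadata: { date: alert.date },
    });
  }
}
```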
227
backend/src/dutchie-az/services/analytics/cache.ts
Normal file
@@ -0,0 +1,227 @@
/**
 * Analytics Cache Service
 *
 * Provides caching layer for expensive analytics queries.
 * Uses PostgreSQL for persistence with configurable TTLs.
 *
 * Phase 3: Analytics Dashboards
 */

import { Pool } from 'pg';

export interface CacheEntry<T = unknown> {
  key: string;
  data: T;
  computedAt: Date;
  expiresAt: Date;
  queryTimeMs?: number;
}

export interface CacheConfig {
  defaultTtlMinutes: number;
}

const DEFAULT_CONFIG: CacheConfig = {
  defaultTtlMinutes: 15,
};

export class AnalyticsCache {
  private pool: Pool;
  private config: CacheConfig;
  private memoryCache: Map<string, CacheEntry> = new Map();

  constructor(pool: Pool, config: Partial<CacheConfig> = {}) {
    this.pool = pool;
    this.config = { ...DEFAULT_CONFIG, ...config };
  }

  /**
   * Get cached data or compute and cache it
   */
  async getOrCompute<T>(
    key: string,
    computeFn: () => Promise<T>,
    ttlMinutes?: number
  ): Promise<{ data: T; fromCache: boolean; queryTimeMs: number }> {
    const ttl = ttlMinutes ?? this.config.defaultTtlMinutes;

    // Check memory cache first
    const memEntry = this.memoryCache.get(key);
    if (memEntry && new Date() < memEntry.expiresAt) {
      return { data: memEntry.data as T, fromCache: true, queryTimeMs: memEntry.queryTimeMs || 0 };
    }

    // Check database cache
    const dbEntry = await this.getFromDb<T>(key);
    if (dbEntry && new Date() < dbEntry.expiresAt) {
      this.memoryCache.set(key, dbEntry);
      return { data: dbEntry.data, fromCache: true, queryTimeMs: dbEntry.queryTimeMs || 0 };
    }

    // Compute fresh data
    const startTime = Date.now();
    const data = await computeFn();
    const queryTimeMs = Date.now() - startTime;

    // Cache result
    const entry: CacheEntry<T> = {
      key,
      data,
      computedAt: new Date(),
      expiresAt: new Date(Date.now() + ttl * 60 * 1000),
      queryTimeMs,
    };

    await this.saveToDb(entry);
    this.memoryCache.set(key, entry);

    return { data, fromCache: false, queryTimeMs };
  }

  /**
   * Get from database cache
   */
  private async getFromDb<T>(key: string): Promise<CacheEntry<T> | null> {
    try {
      const result = await this.pool.query(`
        SELECT cache_data, computed_at, expires_at, query_time_ms
        FROM analytics_cache
        WHERE cache_key = $1
          AND expires_at > NOW()
      `, [key]);

      if (result.rows.length === 0) return null;

      const row = result.rows[0];
      return {
        key,
        data: row.cache_data as T,
        computedAt: row.computed_at,
        expiresAt: row.expires_at,
        queryTimeMs: row.query_time_ms,
      };
    } catch (error) {
      console.warn(`[AnalyticsCache] Failed to get from DB: ${error}`);
      return null;
    }
  }

  /**
   * Save to database cache
   */
  private async saveToDb<T>(entry: CacheEntry<T>): Promise<void> {
    try {
      await this.pool.query(`
        INSERT INTO analytics_cache (cache_key, cache_data, computed_at, expires_at, query_time_ms)
        VALUES ($1, $2, $3, $4, $5)
        ON CONFLICT (cache_key)
        DO UPDATE SET
          cache_data = EXCLUDED.cache_data,
          computed_at = EXCLUDED.computed_at,
          expires_at = EXCLUDED.expires_at,
          query_time_ms = EXCLUDED.query_time_ms
      `, [entry.key, JSON.stringify(entry.data), entry.computedAt, entry.expiresAt, entry.queryTimeMs]);
    } catch (error) {
      console.warn(`[AnalyticsCache] Failed to save to DB: ${error}`);
    }
  }

  /**
   * Invalidate a cache entry
   */
  async invalidate(key: string): Promise<void> {
    this.memoryCache.delete(key);
    try {
      await this.pool.query('DELETE FROM analytics_cache WHERE cache_key = $1', [key]);
    } catch (error) {
      console.warn(`[AnalyticsCache] Failed to invalidate: ${error}`);
    }
  }

  /**
   * Invalidate all entries matching a pattern
   */
  async invalidatePattern(pattern: string): Promise<number> {
    // Clear memory cache
    for (const key of this.memoryCache.keys()) {
      if (key.includes(pattern)) {
        this.memoryCache.delete(key);
      }
    }

    try {
      const result = await this.pool.query(
        'DELETE FROM analytics_cache WHERE cache_key LIKE $1',
        [`%${pattern}%`]
      );
      return result.rowCount || 0;
    } catch (error) {
      console.warn(`[AnalyticsCache] Failed to invalidate pattern: ${error}`);
      return 0;
    }
  }

  /**
   * Clean expired entries
   */
  async cleanExpired(): Promise<number> {
    // Clean memory cache
    const now = new Date();
    for (const [key, entry] of this.memoryCache.entries()) {
      if (now >= entry.expiresAt) {
        this.memoryCache.delete(key);
      }
    }

    try {
      const result = await this.pool.query('DELETE FROM analytics_cache WHERE expires_at < NOW()');
      return result.rowCount || 0;
    } catch (error) {
      console.warn(`[AnalyticsCache] Failed to clean expired: ${error}`);
      return 0;
    }
  }

  /**
   * Get cache statistics
   */
  async getStats(): Promise<{
    memoryCacheSize: number;
    dbCacheSize: number;
    expiredCount: number;
  }> {
    try {
      const result = await this.pool.query(`
        SELECT
          COUNT(*) FILTER (WHERE expires_at > NOW()) as active,
          COUNT(*) FILTER (WHERE expires_at <= NOW()) as expired
        FROM analytics_cache
      `);

      return {
        memoryCacheSize: this.memoryCache.size,
        dbCacheSize: parseInt(result.rows[0]?.active || '0'),
        expiredCount: parseInt(result.rows[0]?.expired || '0'),
      };
    } catch (error) {
      return {
        memoryCacheSize: this.memoryCache.size,
        dbCacheSize: 0,
        expiredCount: 0,
      };
    }
  }
}

/**
 * Generate cache key with parameters
 */
export function cacheKey(prefix: string, params: Record<string, unknown> = {}): string {
  const sortedParams = Object.keys(params)
    .sort()
    .filter(k => params[k] !== undefined && params[k] !== null)
    .map(k => `${k}=${params[k]}`)
    .join('&');

  return sortedParams ? `${prefix}:${sortedParams}` : prefix;
}
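A usage sketch of the two-tier cache above: the first call computes and stores the result, later calls within the TTL are served from the in-process map or the analytics_cache table. Keys come out as `prefix:key=value&...` (e.g. `store_count:state=AZ`); the `store_count` prefix and query here are illustrative.

```ts
import { Pool } from 'pg';
import { AnalyticsCache, cacheKey } from './cache';

const pool = new Pool(); // connection settings come from PG* env vars

async function countStores(state: string): Promise<number> {
  const cache = new AnalyticsCache(pool);
  const { data, fromCache, queryTimeMs } = await cache.getOrCompute(
    cacheKey('store_count', { state }), // illustrative prefix
    async () => {
      const res = await pool.query(
        'SELECT COUNT(*) AS total FROM dispensaries WHERE state = $1',
        [state]
      );
      return parseInt(res.rows[0].total, 10);
    },
    10 // TTL in minutes, overriding the 15-minute default
  );
  console.log(`stores=${data} fromCache=${fromCache} (${queryTimeMs}ms)`);
  return data;
}
```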
530
backend/src/dutchie-az/services/analytics/category-analytics.ts
Normal file
@@ -0,0 +1,530 @@
/**
 * Category Growth Analytics Service
 *
 * Provides category-level analytics including:
 * - SKU count growth
 * - Price growth trends
 * - New product additions
 * - Category shrinkage
 * - Seasonality patterns
 *
 * Phase 3: Analytics Dashboards
 */

import { Pool } from 'pg';
import { AnalyticsCache, cacheKey } from './cache';

export interface CategoryGrowth {
  category: string;
  currentSkuCount: number;
  previousSkuCount: number;
  skuGrowthPercent: number;
  currentBrandCount: number;
  previousBrandCount: number;
  brandGrowthPercent: number;
  currentAvgPrice: number | null;
  previousAvgPrice: number | null;
  priceChangePercent: number | null;
  newProducts: number;
  discontinuedProducts: number;
  trend: 'growing' | 'declining' | 'stable';
}

export interface CategorySummary {
  category: string;
  totalSkus: number;
  brandCount: number;
  storeCount: number;
  avgPrice: number | null;
  minPrice: number | null;
  maxPrice: number | null;
  inStockSkus: number;
  outOfStockSkus: number;
  stockHealthPercent: number;
}

export interface CategoryGrowthTrend {
  category: string;
  dataPoints: Array<{
    date: string;
    skuCount: number;
    brandCount: number;
    avgPrice: number | null;
    storeCount: number;
  }>;
  growth7d: number | null;
  growth30d: number | null;
  growth90d: number | null;
}

export interface CategoryHeatmapData {
  categories: string[];
  periods: string[];
  data: Array<{
    category: string;
    period: string;
    value: number; // SKU count, growth %, or price
    changeFromPrevious: number | null;
  }>;
}

export interface SeasonalityPattern {
  category: string;
  monthlyPattern: Array<{
    month: number;
    monthName: string;
    avgSkuCount: number;
    avgPrice: number | null;
    seasonalityIndex: number; // 100 = average, >100 = above, <100 = below
  }>;
  peakMonth: number;
  troughMonth: number;
}

export interface CategoryFilters {
  state?: string;
  storeId?: number;
  minSkus?: number;
}

export class CategoryAnalyticsService {
  private pool: Pool;
  private cache: AnalyticsCache;

  constructor(pool: Pool, cache: AnalyticsCache) {
    this.pool = pool;
    this.cache = cache;
  }

  /**
   * Get current category summary
   */
  async getCategorySummary(
    category?: string,
    filters: CategoryFilters = {}
  ): Promise<CategorySummary[]> {
    const { state, storeId } = filters;
    const key = cacheKey('category_summary', { category, state, storeId });

    return (await this.cache.getOrCompute(key, async () => {
      const params: (string | number)[] = [];
      const conditions: string[] = [];
      let paramIndex = 1;

      if (category) {
        conditions.push(`dp.type = $${paramIndex++}`);
        params.push(category);
      }
      if (state) {
        conditions.push(`d.state = $${paramIndex++}`);
        params.push(state);
      }
      if (storeId) {
        conditions.push(`dp.dispensary_id = $${paramIndex++}`);
        params.push(storeId);
      }

      const whereClause = conditions.length > 0
        ? 'WHERE dp.type IS NOT NULL AND ' + conditions.join(' AND ')
        : 'WHERE dp.type IS NOT NULL';

      const result = await this.pool.query(`
        SELECT
          dp.type as category,
          COUNT(*) as total_skus,
          COUNT(DISTINCT dp.brand_name) as brand_count,
          COUNT(DISTINCT dp.dispensary_id) as store_count,
          AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
          MIN(extract_min_price(dp.latest_raw_payload)) as min_price,
          MAX(extract_max_price(dp.latest_raw_payload)) as max_price,
          SUM(CASE WHEN dp.stock_status = 'in_stock' THEN 1 ELSE 0 END) as in_stock,
          SUM(CASE WHEN dp.stock_status != 'in_stock' OR dp.stock_status IS NULL THEN 1 ELSE 0 END) as out_of_stock
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        ${whereClause}
        GROUP BY dp.type
        ORDER BY total_skus DESC
      `, params);

      return result.rows.map(row => {
        const totalSkus = parseInt(row.total_skus) || 0;
        const inStock = parseInt(row.in_stock) || 0;

        return {
          category: row.category,
          totalSkus,
          brandCount: parseInt(row.brand_count) || 0,
          storeCount: parseInt(row.store_count) || 0,
          avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
          minPrice: row.min_price ? Math.round(parseFloat(row.min_price) * 100) / 100 : null,
          maxPrice: row.max_price ? Math.round(parseFloat(row.max_price) * 100) / 100 : null,
          inStockSkus: inStock,
          outOfStockSkus: parseInt(row.out_of_stock) || 0,
          stockHealthPercent: totalSkus > 0
            ? Math.round((inStock / totalSkus) * 100)
            : 0,
        };
      });
    }, 15)).data;
  }

  /**
   * Get category growth (comparing periods)
   */
  async getCategoryGrowth(
    days: number = 7,
    filters: CategoryFilters = {}
  ): Promise<CategoryGrowth[]> {
    const { state, storeId, minSkus = 10 } = filters;
    const key = cacheKey('category_growth', { days, state, storeId, minSkus });

    return (await this.cache.getOrCompute(key, async () => {
      // Use category_snapshots for historical comparison
      const result = await this.pool.query(`
        WITH current_data AS (
          SELECT
            category,
            total_skus,
            brand_count,
            avg_price,
            store_count
          FROM category_snapshots
          WHERE snapshot_date = (SELECT MAX(snapshot_date) FROM category_snapshots)
        ),
        previous_data AS (
          SELECT
            category,
            total_skus,
            brand_count,
            avg_price,
            store_count
          FROM category_snapshots
          WHERE snapshot_date = (
            SELECT MAX(snapshot_date)
            FROM category_snapshots
            WHERE snapshot_date < (SELECT MAX(snapshot_date) FROM category_snapshots) - ($1 || ' days')::INTERVAL
          )
        )
        SELECT
          c.category,
          c.total_skus as current_skus,
          COALESCE(p.total_skus, c.total_skus) as previous_skus,
          c.brand_count as current_brands,
          COALESCE(p.brand_count, c.brand_count) as previous_brands,
          c.avg_price as current_price,
          p.avg_price as previous_price
        FROM current_data c
        LEFT JOIN previous_data p ON c.category = p.category
        WHERE c.total_skus >= $2
        ORDER BY c.total_skus DESC
      `, [days, minSkus]);

      // If no snapshots exist, use current data
      if (result.rows.length === 0) {
        const fallbackResult = await this.pool.query(`
          SELECT
            type as category,
            COUNT(*) as total_skus,
            COUNT(DISTINCT brand_name) as brand_count,
            AVG(extract_min_price(latest_raw_payload)) as avg_price
          FROM dutchie_products
          WHERE type IS NOT NULL
          GROUP BY type
          HAVING COUNT(*) >= $1
          ORDER BY total_skus DESC
        `, [minSkus]);

        return fallbackResult.rows.map(row => ({
          category: row.category,
          currentSkuCount: parseInt(row.total_skus) || 0,
          previousSkuCount: parseInt(row.total_skus) || 0,
          skuGrowthPercent: 0,
          currentBrandCount: parseInt(row.brand_count) || 0,
          previousBrandCount: parseInt(row.brand_count) || 0,
          brandGrowthPercent: 0,
          currentAvgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
          previousAvgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
          priceChangePercent: null,
          newProducts: 0,
          discontinuedProducts: 0,
          trend: 'stable' as const,
        }));
      }

      return result.rows.map(row => {
        const currentSkus = parseInt(row.current_skus) || 0;
        const previousSkus = parseInt(row.previous_skus) || currentSkus;
        const currentBrands = parseInt(row.current_brands) || 0;
        const previousBrands = parseInt(row.previous_brands) || currentBrands;
        const currentPrice = row.current_price ? parseFloat(row.current_price) : null;
        const previousPrice = row.previous_price ? parseFloat(row.previous_price) : null;

        const skuGrowth = previousSkus > 0
          ? ((currentSkus - previousSkus) / previousSkus) * 100
          : 0;
        const brandGrowth = previousBrands > 0
          ? ((currentBrands - previousBrands) / previousBrands) * 100
          : 0;
        const priceChange = previousPrice && currentPrice
          ? ((currentPrice - previousPrice) / previousPrice) * 100
          : null;

        let trend: 'growing' | 'declining' | 'stable' = 'stable';
        if (skuGrowth > 5) trend = 'growing';
        else if (skuGrowth < -5) trend = 'declining';

        return {
          category: row.category,
          currentSkuCount: currentSkus,
          previousSkuCount: previousSkus,
          skuGrowthPercent: Math.round(skuGrowth * 10) / 10,
          currentBrandCount: currentBrands,
          previousBrandCount: previousBrands,
          brandGrowthPercent: Math.round(brandGrowth * 10) / 10,
          currentAvgPrice: currentPrice ? Math.round(currentPrice * 100) / 100 : null,
          previousAvgPrice: previousPrice ? Math.round(previousPrice * 100) / 100 : null,
          priceChangePercent: priceChange !== null ? Math.round(priceChange * 10) / 10 : null,
          newProducts: Math.max(0, currentSkus - previousSkus),
          discontinuedProducts: Math.max(0, previousSkus - currentSkus),
          trend,
        };
      });
    }, 15)).data;
  }

  /**
   * Get category growth trend over time
   */
  async getCategoryGrowthTrend(
    category: string,
    days: number = 90
  ): Promise<CategoryGrowthTrend> {
    const key = cacheKey('category_growth_trend', { category, days });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          snapshot_date as date,
          total_skus as sku_count,
          brand_count,
          avg_price,
          store_count
        FROM category_snapshots
        WHERE category = $1
          AND snapshot_date >= CURRENT_DATE - ($2 || ' days')::INTERVAL
        ORDER BY snapshot_date
      `, [category, days]);

      const dataPoints = result.rows.map(row => ({
        date: row.date.toISOString().split('T')[0],
        skuCount: parseInt(row.sku_count) || 0,
        brandCount: parseInt(row.brand_count) || 0,
        avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
        storeCount: parseInt(row.store_count) || 0,
      }));

      // Calculate growth rates
      const calculateGrowth = (daysBack: number): number | null => {
        if (dataPoints.length < 2) return null;
        const targetDate = new Date();
        targetDate.setDate(targetDate.getDate() - daysBack);
        const targetDateStr = targetDate.toISOString().split('T')[0];

        const recent = dataPoints[dataPoints.length - 1];
        const older = dataPoints.find(d => d.date <= targetDateStr) || dataPoints[0];

        if (older.skuCount === 0) return null;
        return Math.round(((recent.skuCount - older.skuCount) / older.skuCount) * 1000) / 10;
      };

      return {
        category,
        dataPoints,
        growth7d: calculateGrowth(7),
        growth30d: calculateGrowth(30),
        growth90d: calculateGrowth(90),
      };
    }, 15)).data;
  }

  /**
   * Get category heatmap data
   */
  async getCategoryHeatmap(
    metric: 'skus' | 'growth' | 'price' = 'skus',
    periods: number = 12 // weeks
  ): Promise<CategoryHeatmapData> {
    const key = cacheKey('category_heatmap', { metric, periods });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          category,
          snapshot_date,
          total_skus,
          avg_price
        FROM category_snapshots
        WHERE snapshot_date >= CURRENT_DATE - ($1 * 7 || ' days')::INTERVAL
        ORDER BY category, snapshot_date
      `, [periods]);

      // Get unique categories and generate weekly periods
      const categoriesSet = new Set<string>();
      const periodsSet = new Set<string>();

      result.rows.forEach(row => {
        categoriesSet.add(row.category);
        // Group by week
        const date = new Date(row.snapshot_date);
        const weekStart = new Date(date);
        weekStart.setDate(date.getDate() - date.getDay());
        periodsSet.add(weekStart.toISOString().split('T')[0]);
      });

      const categories = Array.from(categoriesSet).sort();
      const periodsList = Array.from(periodsSet).sort();

      // Aggregate data by category and week
      const dataMap = new Map<string, Map<string, { skus: number; price: number | null }>>();

      result.rows.forEach(row => {
        const date = new Date(row.snapshot_date);
        const weekStart = new Date(date);
        weekStart.setDate(date.getDate() - date.getDay());
        const period = weekStart.toISOString().split('T')[0];

        if (!dataMap.has(row.category)) {
          dataMap.set(row.category, new Map());
        }
        const categoryData = dataMap.get(row.category)!;

        if (!categoryData.has(period)) {
          categoryData.set(period, { skus: 0, price: null });
        }
        const existing = categoryData.get(period)!;
        existing.skus = Math.max(existing.skus, parseInt(row.total_skus) || 0);
        if (row.avg_price) {
          existing.price = parseFloat(row.avg_price);
        }
      });

      // Build heatmap data
      const data: CategoryHeatmapData['data'] = [];

      categories.forEach(category => {
        let previousValue: number | null = null;

        periodsList.forEach(period => {
          const categoryData = dataMap.get(category)?.get(period);
          let value = 0;

          if (categoryData) {
            switch (metric) {
              case 'skus':
                value = categoryData.skus;
                break;
              case 'price':
                value = categoryData.price || 0;
                break;
              case 'growth':
                value = previousValue !== null && previousValue > 0
                  ? ((categoryData.skus - previousValue) / previousValue) * 100
                  : 0;
                break;
            }
          }

          const changeFromPrevious = previousValue !== null && previousValue > 0
            ? ((value - previousValue) / previousValue) * 100
            : null;

          data.push({
            category,
            period,
            value: Math.round(value * 100) / 100,
            changeFromPrevious: changeFromPrevious !== null
              ? Math.round(changeFromPrevious * 10) / 10
              : null,
          });

          if (metric !== 'growth') {
            previousValue = value;
          } else if (categoryData) {
            previousValue = categoryData.skus;
          }
        });
      });

      return {
        categories,
        periods: periodsList,
        data,
      };
    }, 30)).data;
  }

  /**
   * Get top growing/declining categories
   */
  async getTopMovers(
    limit: number = 5,
    days: number = 30
  ): Promise<{
    growing: CategoryGrowth[];
    declining: CategoryGrowth[];
  }> {
    const key = cacheKey('top_movers', { limit, days });

    return (await this.cache.getOrCompute(key, async () => {
      const allGrowth = await this.getCategoryGrowth(days);

      const sorted = [...allGrowth].sort((a, b) => b.skuGrowthPercent - a.skuGrowthPercent);

      return {
        growing: sorted.filter(c => c.skuGrowthPercent > 0).slice(0, limit),
        declining: sorted.filter(c => c.skuGrowthPercent < 0).slice(-limit).reverse(),
      };
    }, 15)).data;
  }

  /**
   * Get category subcategory breakdown
   */
  async getSubcategoryBreakdown(category: string): Promise<Array<{
    subcategory: string;
    skuCount: number;
    brandCount: number;
    avgPrice: number | null;
    percentOfCategory: number;
  }>> {
    const key = cacheKey('subcategory_breakdown', { category });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        WITH category_total AS (
          SELECT COUNT(*) as total FROM dutchie_products WHERE type = $1
        )
        SELECT
          COALESCE(dp.subcategory, 'Other') as subcategory,
          COUNT(*) as sku_count,
          COUNT(DISTINCT dp.brand_name) as brand_count,
          AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
          ct.total as category_total
        FROM dutchie_products dp, category_total ct
        WHERE dp.type = $1
        GROUP BY dp.subcategory, ct.total
        ORDER BY sku_count DESC
      `, [category]);

      return result.rows.map(row => ({
        subcategory: row.subcategory,
        skuCount: parseInt(row.sku_count) || 0,
        brandCount: parseInt(row.brand_count) || 0,
        avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
        percentOfCategory: parseInt(row.category_total) > 0
          ? Math.round((parseInt(row.sku_count) / parseInt(row.category_total)) * 1000) / 10
          : 0,
      }));
    }, 15)).data;
  }
}
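The SeasonalityPattern interface above defines a seasonalityIndex (100 = average month) but its computation falls outside this excerpt. A minimal sketch of one consistent reading, index = (month average / overall average) * 100, assuming a 12-entry input of per-month SKU averages; the input shape is an assumption, not taken from the service:

```ts
// Hypothetical helper: 100 means an average month, >100 above, <100 below.
function seasonalityIndexes(monthlyAvgSkus: number[]): number[] {
  const overall =
    monthlyAvgSkus.reduce((sum, v) => sum + v, 0) / monthlyAvgSkus.length;
  return monthlyAvgSkus.map(v =>
    overall > 0 ? Math.round((v / overall) * 100) : 100
  );
}

// Example: a July average of 120 SKUs against a 100-SKU overall average
// yields an index of 120, marking July as a peak candidate.
```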
57
backend/src/dutchie-az/services/analytics/index.ts
Normal file
@@ -0,0 +1,57 @@
/**
 * Analytics Module Index
 *
 * Exports all analytics services for CannaiQ dashboards.
 *
 * Phase 3: Analytics Dashboards
 */

export { AnalyticsCache, cacheKey, type CacheEntry, type CacheConfig } from './cache';

export {
  PriceTrendService,
  type PricePoint,
  type PriceTrend,
  type PriceSummary,
  type PriceCompressionResult,
  type PriceFilters,
} from './price-trends';

export {
  PenetrationService,
  type BrandPenetration,
  type PenetrationTrend,
  type ShelfShare,
  type BrandPresenceByState,
  type PenetrationFilters,
} from './penetration';

export {
  CategoryAnalyticsService,
  type CategoryGrowth,
  type CategorySummary,
  type CategoryGrowthTrend,
  type CategoryHeatmapData,
  type SeasonalityPattern,
  type CategoryFilters,
} from './category-analytics';

export {
  StoreChangeService,
  type StoreChangeSummary,
  type StoreChangeEvent,
  type BrandChange,
  type ProductChange,
  type CategoryLeaderboard,
  type StoreFilters,
} from './store-changes';

export {
  BrandOpportunityService,
  type BrandOpportunity,
  type PricePosition,
  type MissingSkuOpportunity,
  type StoreShelfShareChange,
  type CompetitorAlert,
  type MarketPositionSummary,
} from './brand-opportunity';
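A sketch of how a caller might compose these exports: one shared pool and one AnalyticsCache instance handed to every service, so they all use the same analytics_cache table and in-process memory map. The './analytics' import path is illustrative and depends on where the caller lives.

```ts
import { Pool } from 'pg';
import {
  AnalyticsCache,
  PriceTrendService,
  PenetrationService,
  CategoryAnalyticsService,
} from './analytics'; // hypothetical path to this module's index

// Shared infrastructure: services share one pool and one cache.
const pool = new Pool();
const cache = new AnalyticsCache(pool, { defaultTtlMinutes: 15 });

export const priceTrends = new PriceTrendService(pool, cache);
export const penetration = new PenetrationService(pool, cache);
export const categories = new CategoryAnalyticsService(pool, cache);
```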
556
backend/src/dutchie-az/services/analytics/penetration.ts
Normal file
@@ -0,0 +1,556 @@
/**
 * Brand Penetration Analytics Service
 *
 * Provides analytics for brand market penetration including:
 * - Stores carrying brand
 * - SKU counts per brand
 * - Percentage of stores carrying
 * - Shelf share calculations
 * - Penetration trends and momentum
 *
 * Phase 3: Analytics Dashboards
 */

import { Pool } from 'pg';
import { AnalyticsCache, cacheKey } from './cache';

export interface BrandPenetration {
  brandName: string;
  brandId: string | null;
  totalStores: number;
  storesCarrying: number;
  penetrationPercent: number;
  totalSkus: number;
  avgSkusPerStore: number;
  shelfSharePercent: number;
  categories: string[];
  avgPrice: number | null;
  inStockSkus: number;
}

export interface PenetrationTrend {
  brandName: string;
  dataPoints: Array<{
    date: string;
    storeCount: number;
    skuCount: number;
    penetrationPercent: number;
  }>;
  momentumScore: number; // -100 to +100
  riskScore: number; // 0 to 100, higher = more risk
  trend: 'growing' | 'declining' | 'stable';
}

export interface ShelfShare {
  brandName: string;
  category: string;
  skuCount: number;
  categoryTotalSkus: number;
  shelfSharePercent: number;
  rank: number;
}

export interface BrandPresenceByState {
  state: string;
  storeCount: number;
  skuCount: number;
  avgPrice: number | null;
}

export interface PenetrationFilters {
  state?: string;
  category?: string;
  minStores?: number;
  minSkus?: number;
}

export class PenetrationService {
  private pool: Pool;
  private cache: AnalyticsCache;

  constructor(pool: Pool, cache: AnalyticsCache) {
    this.pool = pool;
    this.cache = cache;
  }

  /**
   * Get penetration data for a specific brand
   */
  async getBrandPenetration(
    brandName: string,
    filters: PenetrationFilters = {}
  ): Promise<BrandPenetration> {
    const { state, category } = filters;
    const key = cacheKey('brand_penetration', { brandName, state, category });

    return (await this.cache.getOrCompute(key, async () => {
      // Build where clauses
      const conditions: string[] = [];
      const params: (string | number)[] = [brandName];
      let paramIndex = 2;

      if (state) {
        conditions.push(`d.state = $${paramIndex++}`);
        params.push(state);
      }
      if (category) {
        conditions.push(`dp.type = $${paramIndex++}`);
        params.push(category);
      }

      const stateCondition = state ? `AND d.state = $${params.indexOf(state) + 1}` : '';
      const categoryCondition = category ? `AND dp.type = $${params.indexOf(category) + 1}` : '';

      const result = await this.pool.query(`
        WITH total_stores AS (
          SELECT COUNT(DISTINCT id) as total
          FROM dispensaries
          WHERE 1=1 ${state ? `AND state = $2` : ''}
        ),
        brand_data AS (
          SELECT
            dp.brand_name,
            dp.brand_id,
            COUNT(DISTINCT dp.dispensary_id) as stores_carrying,
            COUNT(*) as total_skus,
            AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
            SUM(CASE WHEN dp.stock_status = 'in_stock' THEN 1 ELSE 0 END) as in_stock,
            ARRAY_AGG(DISTINCT dp.type) FILTER (WHERE dp.type IS NOT NULL) as categories
          FROM dutchie_products dp
          JOIN dispensaries d ON dp.dispensary_id = d.id
          WHERE dp.brand_name = $1
            ${stateCondition}
            ${categoryCondition}
          GROUP BY dp.brand_name, dp.brand_id
        ),
        total_skus AS (
          SELECT COUNT(*) as total
          FROM dutchie_products dp
          JOIN dispensaries d ON dp.dispensary_id = d.id
          WHERE 1=1 ${stateCondition} ${categoryCondition}
        )
        SELECT
          bd.brand_name,
          bd.brand_id,
          ts.total as total_stores,
          bd.stores_carrying,
          bd.total_skus,
          bd.avg_price,
          bd.in_stock,
          bd.categories,
          tsk.total as market_total_skus
        FROM brand_data bd, total_stores ts, total_skus tsk
      `, params);

      if (result.rows.length === 0) {
        return {
          brandName,
          brandId: null,
          totalStores: 0,
          storesCarrying: 0,
          penetrationPercent: 0,
          totalSkus: 0,
          avgSkusPerStore: 0,
          shelfSharePercent: 0,
          categories: [],
          avgPrice: null,
          inStockSkus: 0,
        };
      }

      const row = result.rows[0];
      const totalStores = parseInt(row.total_stores) || 1;
      const storesCarrying = parseInt(row.stores_carrying) || 0;
      const totalSkus = parseInt(row.total_skus) || 0;
      const marketTotalSkus = parseInt(row.market_total_skus) || 1;

      return {
        brandName: row.brand_name,
        brandId: row.brand_id,
        totalStores,
        storesCarrying,
        penetrationPercent: Math.round((storesCarrying / totalStores) * 1000) / 10,
        totalSkus,
        avgSkusPerStore: storesCarrying > 0
          ? Math.round((totalSkus / storesCarrying) * 10) / 10
          : 0,
        shelfSharePercent: Math.round((totalSkus / marketTotalSkus) * 1000) / 10,
        categories: row.categories || [],
        avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
        inStockSkus: parseInt(row.in_stock) || 0,
      };
    }, 15)).data;
  }

  /**
   * Get top brands by penetration
   */
  async getTopBrandsByPenetration(
    limit: number = 20,
    filters: PenetrationFilters = {}
  ): Promise<BrandPenetration[]> {
    const { state, category, minStores = 2, minSkus = 5 } = filters;
    const key = cacheKey('top_brands_penetration', { limit, state, category, minStores, minSkus });

    return (await this.cache.getOrCompute(key, async () => {
      const params: (string | number)[] = [limit, minStores, minSkus];
      let paramIndex = 4;

      let stateCondition = '';
      let categoryCondition = '';

      if (state) {
        stateCondition = `AND d.state = $${paramIndex++}`;
        params.push(state);
      }
      if (category) {
        categoryCondition = `AND dp.type = $${paramIndex++}`;
        params.push(category);
      }

      const result = await this.pool.query(`
        WITH total_stores AS (
          SELECT COUNT(DISTINCT id) as total
          FROM dispensaries
          WHERE 1=1 ${state ? `AND state = $${params.indexOf(state) + 1}` : ''}
        ),
        total_skus AS (
          SELECT COUNT(*) as total
          FROM dutchie_products dp
          JOIN dispensaries d ON dp.dispensary_id = d.id
          WHERE 1=1 ${stateCondition} ${categoryCondition}
        ),
        brand_data AS (
          SELECT
            dp.brand_name,
            dp.brand_id,
            COUNT(DISTINCT dp.dispensary_id) as stores_carrying,
            COUNT(*) as total_skus,
            AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
            SUM(CASE WHEN dp.stock_status = 'in_stock' THEN 1 ELSE 0 END) as in_stock,
            ARRAY_AGG(DISTINCT dp.type) FILTER (WHERE dp.type IS NOT NULL) as categories
          FROM dutchie_products dp
          JOIN dispensaries d ON dp.dispensary_id = d.id
          WHERE dp.brand_name IS NOT NULL
            ${stateCondition}
            ${categoryCondition}
          GROUP BY dp.brand_name, dp.brand_id
          HAVING COUNT(DISTINCT dp.dispensary_id) >= $2
            AND COUNT(*) >= $3
        )
        SELECT
          bd.*,
          ts.total as total_stores,
          tsk.total as market_total_skus
        FROM brand_data bd, total_stores ts, total_skus tsk
        ORDER BY bd.stores_carrying DESC, bd.total_skus DESC
        LIMIT $1
      `, params);

      return result.rows.map(row => {
        const totalStores = parseInt(row.total_stores) || 1;
        const storesCarrying = parseInt(row.stores_carrying) || 0;
        const totalSkus = parseInt(row.total_skus) || 0;
        const marketTotalSkus = parseInt(row.market_total_skus) || 1;

        return {
          brandName: row.brand_name,
          brandId: row.brand_id,
          totalStores,
          storesCarrying,
          penetrationPercent: Math.round((storesCarrying / totalStores) * 1000) / 10,
          totalSkus,
          avgSkusPerStore: storesCarrying > 0
            ? Math.round((totalSkus / storesCarrying) * 10) / 10
            : 0,
          shelfSharePercent: Math.round((totalSkus / marketTotalSkus) * 1000) / 10,
          categories: row.categories || [],
          avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
          inStockSkus: parseInt(row.in_stock) || 0,
        };
      });
    }, 15)).data;
  }

  /**
   * Get penetration trend for a brand (requires historical snapshots)
   */
  async getPenetrationTrend(
    brandName: string,
    days: number = 30
  ): Promise<PenetrationTrend> {
    const key = cacheKey('penetration_trend', { brandName, days });

    return (await this.cache.getOrCompute(key, async () => {
      // Use brand_snapshots table for historical data
      const result = await this.pool.query(`
        SELECT
          snapshot_date as date,
          store_count,
          total_skus
        FROM brand_snapshots
        WHERE brand_name = $1
          AND snapshot_date >= CURRENT_DATE - ($2 || ' days')::INTERVAL
        ORDER BY snapshot_date
      `, [brandName, days]);

      // Get total stores for penetration calculation
      const totalResult = await this.pool.query(
        'SELECT COUNT(*) as total FROM dispensaries'
      );
      const totalStores = parseInt(totalResult.rows[0]?.total) || 1;

      const dataPoints = result.rows.map(row => ({
        date: row.date.toISOString().split('T')[0],
        storeCount: parseInt(row.store_count) || 0,
        skuCount: parseInt(row.total_skus) || 0,
        penetrationPercent: Math.round((parseInt(row.store_count) / totalStores) * 1000) / 10,
      }));

      // Calculate momentum and risk scores
      let momentumScore = 0;
      let riskScore = 0;
      let trend: 'growing' | 'declining' | 'stable' = 'stable';

      if (dataPoints.length >= 2) {
        const first = dataPoints[0];
        const last = dataPoints[dataPoints.length - 1];

        // Momentum: change in store count
        const storeChange = last.storeCount - first.storeCount;
        const storeChangePercent = first.storeCount > 0
          ? (storeChange / first.storeCount) * 100
          : 0;

        // Momentum score: -100 to +100
        momentumScore = Math.max(-100, Math.min(100, storeChangePercent * 10));

        // Risk score: higher if losing stores
        if (storeChange < 0) {
          riskScore = Math.min(100, Math.abs(storeChangePercent) * 5);
        }

        // Determine trend
        if (storeChangePercent > 5) trend = 'growing';
        else if (storeChangePercent < -5) trend = 'declining';
      }

      return {
        brandName,
        dataPoints,
        momentumScore: Math.round(momentumScore),
        riskScore: Math.round(riskScore),
        trend,
      };
    }, 15)).data;
  }

  /**
   * Get shelf share by category for a brand
   */
  async getShelfShareByCategory(brandName: string): Promise<ShelfShare[]> {
    const key = cacheKey('shelf_share_category', { brandName });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        WITH category_totals AS (
          SELECT
            type as category,
            COUNT(*) as total_skus
          FROM dutchie_products
          WHERE type IS NOT NULL
          GROUP BY type
        ),
        brand_by_category AS (
          SELECT
            type as category,
            COUNT(*) as sku_count
          FROM dutchie_products
          WHERE brand_name = $1
            AND type IS NOT NULL
          GROUP BY type
        ),
        ranked AS (
          SELECT
            ct.category,
            COALESCE(bc.sku_count, 0) as sku_count,
            ct.total_skus,
            RANK() OVER (PARTITION BY ct.category ORDER BY bc.sku_count DESC NULLS LAST) as rank
          FROM category_totals ct
          LEFT JOIN brand_by_category bc ON ct.category = bc.category
        )
        SELECT
          r.category,
          r.sku_count,
          r.total_skus as category_total_skus,
          ROUND((r.sku_count::NUMERIC / r.total_skus) * 100, 2) as shelf_share_pct,
          (SELECT COUNT(*) + 1 FROM (
            SELECT brand_name, COUNT(*) as cnt
            FROM dutchie_products
            WHERE type = r.category AND brand_name IS NOT NULL
            GROUP BY brand_name
            HAVING COUNT(*) > r.sku_count
          ) t) as rank
        FROM ranked r
        WHERE r.sku_count > 0
        ORDER BY shelf_share_pct DESC
      `, [brandName]);

      return result.rows.map(row => ({
        brandName,
        category: row.category,
        skuCount: parseInt(row.sku_count) || 0,
        categoryTotalSkus: parseInt(row.category_total_skus) || 0,
        shelfSharePercent: parseFloat(row.shelf_share_pct) || 0,
        rank: parseInt(row.rank) || 0,
      }));
    }, 15)).data;
  }

  /**
   * Get brand presence by state/region
   */
  async getBrandPresenceByState(brandName: string): Promise<BrandPresenceByState[]> {
    const key = cacheKey('brand_presence_state', { brandName });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          d.state,
          COUNT(DISTINCT dp.dispensary_id) as store_count,
          COUNT(*) as sku_count,
          AVG(extract_min_price(dp.latest_raw_payload)) as avg_price
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        WHERE dp.brand_name = $1
        GROUP BY d.state
        ORDER BY store_count DESC
      `, [brandName]);

      return result.rows.map(row => ({
        state: row.state,
        storeCount: parseInt(row.store_count) || 0,
        skuCount: parseInt(row.sku_count) || 0,
        avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
      }));
    }, 15)).data;
  }

  /**
   * Get stores carrying a brand
   */
  async getStoresCarryingBrand(brandName: string): Promise<Array<{
    storeId: number;
    storeName: string;
    city: string;
    state: string;
    skuCount: number;
    avgPrice: number | null;
    categories: string[];
  }>> {
    const key = cacheKey('stores_carrying_brand', { brandName });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          d.id as store_id,
          d.name as store_name,
          d.city,
          d.state,
          COUNT(*) as sku_count,
          AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
          ARRAY_AGG(DISTINCT dp.type) FILTER (WHERE dp.type IS NOT NULL) as categories
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        WHERE dp.brand_name = $1
        GROUP BY d.id, d.name, d.city, d.state
        ORDER BY sku_count DESC
      `, [brandName]);

      return result.rows.map(row => ({
        storeId: row.store_id,
        storeName: row.store_name,
        city: row.city,
        state: row.state,
        skuCount: parseInt(row.sku_count) || 0,
        avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
        categories: row.categories || [],
      }));
    }, 15)).data;
  }

  /**
   * Get penetration heatmap data (state-based)
   */
  async getPenetrationHeatmap(
    brandName?: string
  ): Promise<Array<{
    state: string;
    totalStores: number;
    storesWithBrand: number;
    penetrationPercent: number;
    totalSkus: number;
  }>> {
    const key = cacheKey('penetration_heatmap', { brandName });

    return (await this.cache.getOrCompute(key, async () => {
      if (brandName) {
        const result = await this.pool.query(`
          WITH state_totals AS (
            SELECT state, COUNT(*) as total_stores
            FROM dispensaries
            GROUP BY state
          ),
          brand_by_state AS (
            SELECT
              d.state,
              COUNT(DISTINCT dp.dispensary_id) as stores_with_brand,
              COUNT(*) as total_skus
            FROM dutchie_products dp
            JOIN dispensaries d ON dp.dispensary_id = d.id
            WHERE dp.brand_name = $1
            GROUP BY d.state
          )
          SELECT
            st.state,
            st.total_stores,
            COALESCE(bs.stores_with_brand, 0) as stores_with_brand,
            ROUND(COALESCE(bs.stores_with_brand, 0)::NUMERIC / st.total_stores * 100, 1) as penetration_pct,
            COALESCE(bs.total_skus, 0) as total_skus
          FROM state_totals st
          LEFT JOIN brand_by_state bs ON st.state = bs.state
          ORDER BY penetration_pct DESC
        `, [brandName]);

        return result.rows.map(row => ({
          state: row.state,
          totalStores: parseInt(row.total_stores) || 0,
          storesWithBrand: parseInt(row.stores_with_brand) || 0,
          penetrationPercent: parseFloat(row.penetration_pct) || 0,
          totalSkus: parseInt(row.total_skus) || 0,
        }));
      } else {
        // Overall market data by state
        const result = await this.pool.query(`
          SELECT
            d.state,
            COUNT(DISTINCT d.id) as total_stores,
            COUNT(DISTINCT dp.brand_name) as brand_count,
            COUNT(*) as total_skus
          FROM dispensaries d
          LEFT JOIN dutchie_products dp ON d.id = dp.dispensary_id
          GROUP BY d.state
          ORDER BY total_stores DESC
        `);

        return result.rows.map(row => ({
          state: row.state,
          totalStores: parseInt(row.total_stores) || 0,
          storesWithBrand: parseInt(row.brand_count) || 0, // Using brand count here
          penetrationPercent: 100, // Full penetration for overall view
          totalSkus: parseInt(row.total_skus) || 0,
        }));
      }
    }, 30)).data;
  }
}
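A worked example of the momentum/risk heuristics in getPenetrationTrend: a brand going from 20 to 24 stores over the window has storeChangePercent = +20, momentum = min(100, 20 * 10) = 100, risk = 0 (no stores lost), and trend = 'growing' since +20 > +5. A minimal caller sketch, assuming the import path:

```ts
import { Pool } from 'pg';
import { AnalyticsCache, PenetrationService } from './index'; // path assumed

// Hypothetical monitor: warn on brands that are shedding stores.
async function flagDecliningBrand(pool: Pool, brand: string): Promise<void> {
  const service = new PenetrationService(pool, new AnalyticsCache(pool));
  const trend = await service.getPenetrationTrend(brand, 30);
  if (trend.trend === 'declining' && trend.riskScore >= 50) {
    console.warn(`${brand}: store count falling (risk ${trend.riskScore}/100)`);
  }
}
```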
534
backend/src/dutchie-az/services/analytics/price-trends.ts
Normal file
@@ -0,0 +1,534 @@
|
||||
/**
|
||||
* Price Trend Analytics Service
|
||||
*
|
||||
* Provides time-series price analytics including:
|
||||
* - Price over time for products
|
||||
* - Average MSRP/Wholesale by period
|
||||
* - Price volatility scoring
|
||||
* - Price compression detection
|
||||
*
|
||||
* Phase 3: Analytics Dashboards
|
||||
*/
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { AnalyticsCache, cacheKey } from './cache';
|
||||
|
||||
export interface PricePoint {
|
||||
date: string;
|
||||
minPrice: number | null;
|
||||
maxPrice: number | null;
|
||||
avgPrice: number | null;
|
||||
wholesalePrice: number | null;
|
||||
sampleSize: number;
|
||||
}
|
||||
|
||||
export interface PriceTrend {
|
||||
productId?: number;
|
||||
storeId?: number;
|
||||
brandName?: string;
|
||||
category?: string;
|
||||
dataPoints: PricePoint[];
|
||||
summary: {
|
||||
currentAvg: number | null;
|
||||
previousAvg: number | null;
|
||||
changePercent: number | null;
|
||||
trend: 'up' | 'down' | 'stable';
|
||||
volatilityScore: number | null;
|
||||
};
|
||||
}
|
||||
|
||||
export interface PriceSummary {
|
||||
avg7d: number | null;
|
||||
avg30d: number | null;
|
||||
avg90d: number | null;
|
||||
wholesaleAvg7d: number | null;
|
||||
wholesaleAvg30d: number | null;
|
||||
wholesaleAvg90d: number | null;
|
||||
minPrice: number | null;
|
||||
maxPrice: number | null;
|
||||
priceRange: number | null;
|
||||
volatilityScore: number | null;
|
||||
}
|
||||
|
||||
export interface PriceCompressionResult {
|
||||
category: string;
|
||||
brands: Array<{
|
||||
brandName: string;
|
||||
avgPrice: number;
|
||||
priceDistance: number; // distance from category mean
|
||||
}>;
|
||||
compressionScore: number; // 0-100, higher = more compressed
|
||||
standardDeviation: number;
|
||||
}
|
||||
|
||||
export interface PriceFilters {
|
||||
storeId?: number;
|
||||
brandName?: string;
|
||||
category?: string;
|
||||
state?: string;
|
||||
days?: number;
|
||||
}
|
||||
|
||||
export class PriceTrendService {
|
||||
private pool: Pool;
|
||||
private cache: AnalyticsCache;
|
||||
|
||||
constructor(pool: Pool, cache: AnalyticsCache) {
|
||||
this.pool = pool;
|
||||
this.cache = cache;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get price trend for a specific product
|
||||
*/
|
||||
async getProductPriceTrend(
|
||||
productId: number,
|
||||
storeId?: number,
|
||||
days: number = 30
|
||||
): Promise<PriceTrend> {
|
||||
const key = cacheKey('price_trend_product', { productId, storeId, days });
|
||||
|
||||
return (await this.cache.getOrCompute(key, async () => {
|
||||
// Try to get from snapshots first
|
||||
const snapshotResult = await this.pool.query(`
|
||||
SELECT
|
||||
DATE(crawled_at) as date,
|
||||
MIN(rec_min_price_cents) / 100.0 as min_price,
|
||||
MAX(rec_max_price_cents) / 100.0 as max_price,
|
||||
AVG(rec_min_price_cents) / 100.0 as avg_price,
|
||||
AVG(wholesale_min_price_cents) / 100.0 as wholesale_price,
|
||||
COUNT(*) as sample_size
|
||||
FROM dutchie_product_snapshots
|
||||
WHERE dutchie_product_id = $1
|
||||
AND crawled_at >= NOW() - ($2 || ' days')::INTERVAL
|
||||
${storeId ? 'AND dispensary_id = $3' : ''}
|
||||
GROUP BY DATE(crawled_at)
|
||||
ORDER BY date
|
||||
`, storeId ? [productId, days, storeId] : [productId, days]);
|
||||
|
||||
let dataPoints: PricePoint[] = snapshotResult.rows.map(row => ({
|
||||
date: row.date.toISOString().split('T')[0],
|
||||
minPrice: parseFloat(row.min_price) || null,
|
||||
maxPrice: parseFloat(row.max_price) || null,
|
||||
avgPrice: parseFloat(row.avg_price) || null,
|
||||
wholesalePrice: parseFloat(row.wholesale_price) || null,
|
||||
sampleSize: parseInt(row.sample_size),
|
||||
}));
|
||||
|
||||
// If no snapshots, get current price from product
|
||||
if (dataPoints.length === 0) {
|
||||
const productResult = await this.pool.query(`
|
||||
SELECT
|
||||
extract_min_price(latest_raw_payload) as min_price,
|
||||
extract_max_price(latest_raw_payload) as max_price,
|
||||
extract_wholesale_price(latest_raw_payload) as wholesale_price
|
||||
FROM dutchie_products
|
||||
WHERE id = $1
|
||||
`, [productId]);
|
||||
|
||||
if (productResult.rows.length > 0) {
|
||||
const row = productResult.rows[0];
|
||||
dataPoints = [{
|
||||
date: new Date().toISOString().split('T')[0],
|
||||
minPrice: parseFloat(row.min_price) || null,
|
||||
maxPrice: parseFloat(row.max_price) || null,
|
||||
avgPrice: parseFloat(row.min_price) || null,
|
||||
wholesalePrice: parseFloat(row.wholesale_price) || null,
|
||||
sampleSize: 1,
|
||||
}];
|
||||
}
|
||||
}
|
||||
|
||||
const summary = this.calculatePriceSummary(dataPoints);
|
||||
|
||||
return {
|
||||
productId,
|
||||
storeId,
|
||||
dataPoints,
|
||||
summary,
|
||||
};
|
||||
}, 15)).data;
|
||||
}

  /**
   * Get price trends by brand
   */
  async getBrandPriceTrend(
    brandName: string,
    filters: PriceFilters = {}
  ): Promise<PriceTrend> {
    const { storeId, category, state, days = 30 } = filters;
    const key = cacheKey('price_trend_brand', { brandName, storeId, category, state, days });

    return (await this.cache.getOrCompute(key, async () => {
      // Use current product data aggregated by date
      const result = await this.pool.query(`
        SELECT
          DATE(dp.updated_at) as date,
          MIN(extract_min_price(dp.latest_raw_payload)) as min_price,
          MAX(extract_max_price(dp.latest_raw_payload)) as max_price,
          AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
          AVG(extract_wholesale_price(dp.latest_raw_payload)) as wholesale_price,
          COUNT(*) as sample_size
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        WHERE dp.brand_name = $1
          AND dp.updated_at >= NOW() - ($2 || ' days')::INTERVAL
          ${storeId ? 'AND dp.dispensary_id = $3' : ''}
          ${category ? `AND dp.type = $${storeId ? 4 : 3}` : ''}
          ${state ? `AND d.state = $${storeId ? (category ? 5 : 4) : (category ? 4 : 3)}` : ''}
        GROUP BY DATE(dp.updated_at)
        ORDER BY date
      `, this.buildParams([brandName, days], { storeId, category, state }));

      const dataPoints: PricePoint[] = result.rows.map(row => ({
        date: row.date.toISOString().split('T')[0],
        minPrice: parseFloat(row.min_price) || null,
        maxPrice: parseFloat(row.max_price) || null,
        avgPrice: parseFloat(row.avg_price) || null,
        wholesalePrice: parseFloat(row.wholesale_price) || null,
        sampleSize: parseInt(row.sample_size),
      }));

      return {
        brandName,
        storeId,
        category,
        dataPoints,
        summary: this.calculatePriceSummary(dataPoints),
      };
    }, 15)).data;
  }

  /**
   * Get price trends by category
   */
  async getCategoryPriceTrend(
    category: string,
    filters: PriceFilters = {}
  ): Promise<PriceTrend> {
    const { storeId, brandName, state, days = 30 } = filters;
    const key = cacheKey('price_trend_category', { category, storeId, brandName, state, days });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          DATE(dp.updated_at) as date,
          MIN(extract_min_price(dp.latest_raw_payload)) as min_price,
          MAX(extract_max_price(dp.latest_raw_payload)) as max_price,
          AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
          AVG(extract_wholesale_price(dp.latest_raw_payload)) as wholesale_price,
          COUNT(*) as sample_size
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        WHERE dp.type = $1
          AND dp.updated_at >= NOW() - ($2 || ' days')::INTERVAL
          ${storeId ? 'AND dp.dispensary_id = $3' : ''}
          ${brandName ? `AND dp.brand_name = $${storeId ? 4 : 3}` : ''}
          ${state ? `AND d.state = $${storeId ? (brandName ? 5 : 4) : (brandName ? 4 : 3)}` : ''}
        GROUP BY DATE(dp.updated_at)
        ORDER BY date
      `, this.buildParams([category, days], { storeId, brandName, state }));

      const dataPoints: PricePoint[] = result.rows.map(row => ({
        date: row.date.toISOString().split('T')[0],
        minPrice: parseFloat(row.min_price) || null,
        maxPrice: parseFloat(row.max_price) || null,
        avgPrice: parseFloat(row.avg_price) || null,
        wholesalePrice: parseFloat(row.wholesale_price) || null,
        sampleSize: parseInt(row.sample_size),
      }));

      return {
        category,
        storeId,
        brandName,
        dataPoints,
        summary: this.calculatePriceSummary(dataPoints),
      };
    }, 15)).data;
  }

  /**
   * Get price summary statistics
   */
  async getPriceSummary(filters: PriceFilters = {}): Promise<PriceSummary> {
    const { storeId, brandName, category, state } = filters;
    const key = cacheKey('price_summary', filters as Record<string, unknown>);

    return (await this.cache.getOrCompute(key, async () => {
      const whereConditions: string[] = [];
      const params: (string | number)[] = [];
      let paramIndex = 1;

      if (storeId) {
        whereConditions.push(`dp.dispensary_id = $${paramIndex++}`);
        params.push(storeId);
      }
      if (brandName) {
        whereConditions.push(`dp.brand_name = $${paramIndex++}`);
        params.push(brandName);
      }
      if (category) {
        whereConditions.push(`dp.type = $${paramIndex++}`);
        params.push(category);
      }
      if (state) {
        whereConditions.push(`d.state = $${paramIndex++}`);
        params.push(state);
      }

      const whereClause = whereConditions.length > 0
        ? 'WHERE ' + whereConditions.join(' AND ')
        : '';

      const result = await this.pool.query(`
        WITH prices AS (
          SELECT
            extract_min_price(dp.latest_raw_payload) as min_price,
            extract_max_price(dp.latest_raw_payload) as max_price,
            extract_wholesale_price(dp.latest_raw_payload) as wholesale_price
          FROM dutchie_products dp
          JOIN dispensaries d ON dp.dispensary_id = d.id
          ${whereClause}
        )
        SELECT
          AVG(min_price) as avg_price,
          AVG(wholesale_price) as avg_wholesale,
          MIN(min_price) as min_price,
          MAX(max_price) as max_price,
          STDDEV(min_price) as std_dev
        FROM prices
        WHERE min_price IS NOT NULL
      `, params);

      const row = result.rows[0];
      const avgPrice = parseFloat(row.avg_price) || null;
      const stdDev = parseFloat(row.std_dev) || null;
      const volatility = avgPrice && stdDev ? (stdDev / avgPrice) * 100 : null;

      return {
        avg7d: avgPrice, // Using current data as proxy
        avg30d: avgPrice,
        avg90d: avgPrice,
        wholesaleAvg7d: parseFloat(row.avg_wholesale) || null,
        wholesaleAvg30d: parseFloat(row.avg_wholesale) || null,
        wholesaleAvg90d: parseFloat(row.avg_wholesale) || null,
        minPrice: parseFloat(row.min_price) || null,
        maxPrice: parseFloat(row.max_price) || null,
        priceRange: row.max_price && row.min_price
          ? parseFloat(row.max_price) - parseFloat(row.min_price)
          : null,
        volatilityScore: volatility ? Math.round(volatility * 10) / 10 : null,
      };
    }, 30)).data;
  }

  /**
   * Detect price compression in a category
   */
  async detectPriceCompression(
    category: string,
    state?: string
  ): Promise<PriceCompressionResult> {
    const key = cacheKey('price_compression', { category, state });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        WITH brand_prices AS (
          SELECT
            dp.brand_name,
            AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
            COUNT(*) as sku_count
          FROM dutchie_products dp
          JOIN dispensaries d ON dp.dispensary_id = d.id
          WHERE dp.type = $1
            AND dp.brand_name IS NOT NULL
            ${state ? 'AND d.state = $2' : ''}
          GROUP BY dp.brand_name
          HAVING COUNT(*) >= 3
        ),
        stats AS (
          SELECT
            AVG(avg_price) as category_avg,
            STDDEV(avg_price) as std_dev
          FROM brand_prices
          WHERE avg_price IS NOT NULL
        )
        SELECT
          bp.brand_name,
          bp.avg_price,
          ABS(bp.avg_price - s.category_avg) as price_distance,
          s.category_avg,
          s.std_dev
        FROM brand_prices bp, stats s
        WHERE bp.avg_price IS NOT NULL
        ORDER BY bp.avg_price
      `, state ? [category, state] : [category]);

      if (result.rows.length === 0) {
        return {
          category,
          brands: [],
          compressionScore: 0,
          standardDeviation: 0,
        };
      }

      const categoryAvg = parseFloat(result.rows[0].category_avg) || 0;
      const stdDev = parseFloat(result.rows[0].std_dev) || 0;

      // Compression score: lower std dev relative to mean = more compression
      // Scale to 0-100 where 100 = very compressed
      const cv = categoryAvg > 0 ? (stdDev / categoryAvg) * 100 : 0;
      const compressionScore = Math.max(0, Math.min(100, 100 - cv));

      const brands = result.rows.map(row => ({
        brandName: row.brand_name,
        avgPrice: parseFloat(row.avg_price) || 0,
        priceDistance: parseFloat(row.price_distance) || 0,
      }));

      return {
        category,
        brands,
        compressionScore: Math.round(compressionScore),
        standardDeviation: Math.round(stdDev * 100) / 100,
      };
    }, 30)).data;
  }

  /**
   * Get global price statistics
   */
  async getGlobalPriceStats(): Promise<{
    totalProductsWithPrice: number;
    avgPrice: number | null;
    medianPrice: number | null;
    priceByCategory: Array<{ category: string; avgPrice: number; count: number }>;
    priceByState: Array<{ state: string; avgPrice: number; count: number }>;
  }> {
    const key = 'global_price_stats';

    return (await this.cache.getOrCompute(key, async () => {
      const [countResult, categoryResult, stateResult] = await Promise.all([
        this.pool.query(`
          SELECT
            COUNT(*) FILTER (WHERE extract_min_price(latest_raw_payload) IS NOT NULL) as with_price,
            AVG(extract_min_price(latest_raw_payload)) as avg_price,
            PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY extract_min_price(latest_raw_payload)) as median
          FROM dutchie_products
        `),
        this.pool.query(`
          SELECT
            type as category,
            AVG(extract_min_price(latest_raw_payload)) as avg_price,
            COUNT(*) as count
          FROM dutchie_products
          WHERE type IS NOT NULL
            AND extract_min_price(latest_raw_payload) IS NOT NULL
          GROUP BY type
          ORDER BY avg_price DESC
        `),
        this.pool.query(`
          SELECT
            d.state,
            AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
            COUNT(*) as count
          FROM dutchie_products dp
          JOIN dispensaries d ON dp.dispensary_id = d.id
          WHERE extract_min_price(dp.latest_raw_payload) IS NOT NULL
          GROUP BY d.state
          ORDER BY avg_price DESC
        `),
      ]);

      return {
        totalProductsWithPrice: parseInt(countResult.rows[0]?.with_price || '0'),
        avgPrice: parseFloat(countResult.rows[0]?.avg_price) || null,
        medianPrice: parseFloat(countResult.rows[0]?.median) || null,
        priceByCategory: categoryResult.rows.map(r => ({
          category: r.category,
          avgPrice: parseFloat(r.avg_price) || 0,
          count: parseInt(r.count),
        })),
        priceByState: stateResult.rows.map(r => ({
          state: r.state,
          avgPrice: parseFloat(r.avg_price) || 0,
          count: parseInt(r.count),
        })),
      };
    }, 30)).data;
  }

  // ============================================================
  // HELPER METHODS
  // ============================================================

  private calculatePriceSummary(dataPoints: PricePoint[]): PriceTrend['summary'] {
    if (dataPoints.length === 0) {
      return {
        currentAvg: null,
        previousAvg: null,
        changePercent: null,
        trend: 'stable',
        volatilityScore: null,
      };
    }

    const prices = dataPoints
      .map(d => d.avgPrice)
      .filter((p): p is number => p !== null);

    if (prices.length === 0) {
      return {
        currentAvg: null,
        previousAvg: null,
        changePercent: null,
        trend: 'stable',
        volatilityScore: null,
      };
    }

    const currentAvg = prices[prices.length - 1];
    const midpoint = Math.floor(prices.length / 2);
    const previousAvg = prices.length > 1 ? prices[midpoint] : currentAvg;

    const changePercent = previousAvg > 0
      ? ((currentAvg - previousAvg) / previousAvg) * 100
      : null;

    // Calculate volatility (coefficient of variation)
    const mean = prices.reduce((a, b) => a + b, 0) / prices.length;
    const variance = prices.reduce((sum, p) => sum + Math.pow(p - mean, 2), 0) / prices.length;
    const stdDev = Math.sqrt(variance);
    const volatilityScore = mean > 0 ? (stdDev / mean) * 100 : null;

    let trend: 'up' | 'down' | 'stable' = 'stable';
    if (changePercent !== null) {
      if (changePercent > 5) trend = 'up';
      else if (changePercent < -5) trend = 'down';
    }

    return {
      currentAvg: Math.round(currentAvg * 100) / 100,
      previousAvg: Math.round(previousAvg * 100) / 100,
      changePercent: changePercent !== null ? Math.round(changePercent * 10) / 10 : null,
      trend,
      volatilityScore: volatilityScore !== null ? Math.round(volatilityScore * 10) / 10 : null,
    };
  }

  /**
   * Append defined optional params to the base params.
   * NOTE: Object.values iterates in insertion order, so callers must pass
   * optionalParams in the same order as the SQL placeholders they generate.
   */
  private buildParams(
    baseParams: (string | number)[],
    optionalParams: Record<string, string | number | undefined>
  ): (string | number)[] {
    const params = [...baseParams];
    for (const value of Object.values(optionalParams)) {
      if (value !== undefined) {
        params.push(value);
      }
    }
    return params;
  }
}
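
// Usage sketch (illustrative only; the pool/cache wiring and the
// AnalyticsCache constructor arguments are assumptions, not part of this file):
//
//   import { Pool } from 'pg';
//   import { AnalyticsCache } from './cache';
//   import { PriceTrendService } from './price-trends';
//
//   const pool = new Pool({ connectionString: process.env.CANNAIQ_DB_URL });
//   const trends = new PriceTrendService(pool, new AnalyticsCache(pool));
//   const trend = await trends.getProductPriceTrend(42, undefined, 30);
//   console.log(trend.summary.trend, trend.summary.changePercent);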

587
backend/src/dutchie-az/services/analytics/store-changes.ts
Normal file
@@ -0,0 +1,587 @@
/**
 * Store Change Tracking Service
 *
 * Tracks changes at the store level including:
 * - New/lost brands
 * - New/discontinued products
 * - Stock status transitions
 * - Price changes
 * - Category movement leaderboards
 *
 * Phase 3: Analytics Dashboards
 */

import { Pool } from 'pg';
import { AnalyticsCache, cacheKey } from './cache';

export interface StoreChangeSummary {
  storeId: number;
  storeName: string;
  city: string;
  state: string;
  brandsAdded7d: number;
  brandsAdded30d: number;
  brandsLost7d: number;
  brandsLost30d: number;
  productsAdded7d: number;
  productsAdded30d: number;
  productsDiscontinued7d: number;
  productsDiscontinued30d: number;
  priceDrops7d: number;
  priceIncreases7d: number;
  restocks7d: number;
  stockOuts7d: number;
}

export interface StoreChangeEvent {
  id: number;
  storeId: number;
  storeName: string;
  eventType: string;
  eventDate: string;
  brandName: string | null;
  productName: string | null;
  category: string | null;
  oldValue: string | null;
  newValue: string | null;
  metadata: Record<string, unknown> | null;
}

export interface BrandChange {
  brandName: string;
  changeType: 'added' | 'removed';
  date: string;
  skuCount: number;
  categories: string[];
}

export interface ProductChange {
  productId: number;
  productName: string;
  brandName: string | null;
  category: string | null;
  changeType: 'added' | 'discontinued' | 'price_drop' | 'price_increase' | 'restocked' | 'out_of_stock';
  date: string;
  oldValue?: string;
  newValue?: string;
}

export interface CategoryLeaderboard {
  category: string;
  storeId: number;
  storeName: string;
  skuCount: number;
  brandCount: number;
  avgPrice: number | null;
  changePercent7d: number;
  rank: number;
}

export interface StoreFilters {
  storeId?: number;
  state?: string;
  days?: number;
  eventType?: string;
}

export class StoreChangeService {
  private pool: Pool;
  private cache: AnalyticsCache;

  constructor(pool: Pool, cache: AnalyticsCache) {
    this.pool = pool;
    this.cache = cache;
  }

  /**
   * Get change summary for a store
   */
  async getStoreChangeSummary(
    storeId: number
  ): Promise<StoreChangeSummary | null> {
    const key = cacheKey('store_change_summary', { storeId });

    return (await this.cache.getOrCompute(key, async () => {
      // Get store info
      const storeResult = await this.pool.query(`
        SELECT id, name, city, state FROM dispensaries WHERE id = $1
      `, [storeId]);

      if (storeResult.rows.length === 0) return null;
      const store = storeResult.rows[0];

      // Get change events counts
      const eventsResult = await this.pool.query(`
        SELECT
          event_type,
          COUNT(*) FILTER (WHERE event_date >= CURRENT_DATE - INTERVAL '7 days') as count_7d,
          COUNT(*) FILTER (WHERE event_date >= CURRENT_DATE - INTERVAL '30 days') as count_30d
        FROM store_change_events
        WHERE store_id = $1
        GROUP BY event_type
      `, [storeId]);

      const counts: Record<string, { count_7d: number; count_30d: number }> = {};
      eventsResult.rows.forEach(row => {
        counts[row.event_type] = {
          count_7d: parseInt(row.count_7d) || 0,
          count_30d: parseInt(row.count_30d) || 0,
        };
      });

      return {
        storeId: store.id,
        storeName: store.name,
        city: store.city,
        state: store.state,
        brandsAdded7d: counts['brand_added']?.count_7d || 0,
        brandsAdded30d: counts['brand_added']?.count_30d || 0,
        brandsLost7d: counts['brand_removed']?.count_7d || 0,
        brandsLost30d: counts['brand_removed']?.count_30d || 0,
        productsAdded7d: counts['product_added']?.count_7d || 0,
        productsAdded30d: counts['product_added']?.count_30d || 0,
        productsDiscontinued7d: counts['product_removed']?.count_7d || 0,
        productsDiscontinued30d: counts['product_removed']?.count_30d || 0,
        priceDrops7d: counts['price_drop']?.count_7d || 0,
        priceIncreases7d: counts['price_increase']?.count_7d || 0,
        restocks7d: counts['restocked']?.count_7d || 0,
        stockOuts7d: counts['out_of_stock']?.count_7d || 0,
      };
    }, 15)).data;
  }

  /**
   * Get recent change events for a store
   */
  async getStoreChangeEvents(
    storeId: number,
    filters: { eventType?: string; days?: number; limit?: number } = {}
  ): Promise<StoreChangeEvent[]> {
    const { eventType, days = 30, limit = 100 } = filters;
    const key = cacheKey('store_change_events', { storeId, eventType, days, limit });

    return (await this.cache.getOrCompute(key, async () => {
      const params: (string | number)[] = [storeId, days, limit];
      let eventTypeCondition = '';

      if (eventType) {
        eventTypeCondition = 'AND event_type = $4';
        params.push(eventType);
      }

      const result = await this.pool.query(`
        SELECT
          sce.id,
          sce.store_id,
          d.name as store_name,
          sce.event_type,
          sce.event_date,
          sce.brand_name,
          sce.product_name,
          sce.category,
          sce.old_value,
          sce.new_value,
          sce.metadata
        FROM store_change_events sce
        JOIN dispensaries d ON sce.store_id = d.id
        WHERE sce.store_id = $1
          AND sce.event_date >= CURRENT_DATE - ($2 || ' days')::INTERVAL
          ${eventTypeCondition}
        ORDER BY sce.event_date DESC, sce.id DESC
        LIMIT $3
      `, params);

      return result.rows.map(row => ({
        id: row.id,
        storeId: row.store_id,
        storeName: row.store_name,
        eventType: row.event_type,
        eventDate: row.event_date.toISOString().split('T')[0],
        brandName: row.brand_name,
        productName: row.product_name,
        category: row.category,
        oldValue: row.old_value,
        newValue: row.new_value,
        metadata: row.metadata,
      }));
    }, 5)).data;
  }

  /**
   * Get new brands added to a store
   */
  async getNewBrands(
    storeId: number,
    days: number = 30
  ): Promise<BrandChange[]> {
    const key = cacheKey('new_brands', { storeId, days });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          brand_name,
          event_date,
          metadata
        FROM store_change_events
        WHERE store_id = $1
          AND event_type = 'brand_added'
          AND event_date >= CURRENT_DATE - ($2 || ' days')::INTERVAL
        ORDER BY event_date DESC
      `, [storeId, days]);

      return result.rows.map(row => ({
        brandName: row.brand_name,
        changeType: 'added' as const,
        date: row.event_date.toISOString().split('T')[0],
        skuCount: row.metadata?.sku_count || 0,
        categories: row.metadata?.categories || [],
      }));
    }, 15)).data;
  }

  /**
   * Get brands lost from a store
   */
  async getLostBrands(
    storeId: number,
    days: number = 30
  ): Promise<BrandChange[]> {
    const key = cacheKey('lost_brands', { storeId, days });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          brand_name,
          event_date,
          metadata
        FROM store_change_events
        WHERE store_id = $1
          AND event_type = 'brand_removed'
          AND event_date >= CURRENT_DATE - ($2 || ' days')::INTERVAL
        ORDER BY event_date DESC
      `, [storeId, days]);

      return result.rows.map(row => ({
        brandName: row.brand_name,
        changeType: 'removed' as const,
        date: row.event_date.toISOString().split('T')[0],
        skuCount: row.metadata?.sku_count || 0,
        categories: row.metadata?.categories || [],
      }));
    }, 15)).data;
  }

  /**
   * Get product changes for a store
   */
  async getProductChanges(
    storeId: number,
    changeType?: 'added' | 'discontinued' | 'price_drop' | 'price_increase' | 'restocked' | 'out_of_stock',
    days: number = 7
  ): Promise<ProductChange[]> {
    const key = cacheKey('product_changes', { storeId, changeType, days });

    return (await this.cache.getOrCompute(key, async () => {
      const eventTypeMap: Record<string, string> = {
        'added': 'product_added',
        'discontinued': 'product_removed',
        'price_drop': 'price_drop',
        'price_increase': 'price_increase',
        'restocked': 'restocked',
        'out_of_stock': 'out_of_stock',
      };

      const params: (string | number)[] = [storeId, days];
      let eventCondition = '';

      if (changeType) {
        eventCondition = 'AND event_type = $3';
        params.push(eventTypeMap[changeType]);
      }

      const result = await this.pool.query(`
        SELECT
          product_id,
          product_name,
          brand_name,
          category,
          event_type,
          event_date,
          old_value,
          new_value
        FROM store_change_events
        WHERE store_id = $1
          AND event_date >= CURRENT_DATE - ($2 || ' days')::INTERVAL
          AND product_id IS NOT NULL
          ${eventCondition}
        ORDER BY event_date DESC
        LIMIT 100
      `, params);

      const reverseMap: Record<string, ProductChange['changeType']> = {
        'product_added': 'added',
        'product_removed': 'discontinued',
        'price_drop': 'price_drop',
        'price_increase': 'price_increase',
        'restocked': 'restocked',
        'out_of_stock': 'out_of_stock',
      };

      return result.rows.map(row => ({
        productId: row.product_id,
        productName: row.product_name,
        brandName: row.brand_name,
        category: row.category,
        changeType: reverseMap[row.event_type] || 'added',
        date: row.event_date.toISOString().split('T')[0],
        oldValue: row.old_value,
        newValue: row.new_value,
      }));
    }, 5)).data;
  }

  /**
   * Get category leaderboard across stores
   */
  async getCategoryLeaderboard(
    category: string,
    limit: number = 20
  ): Promise<CategoryLeaderboard[]> {
    const key = cacheKey('category_leaderboard', { category, limit });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        WITH store_category_stats AS (
          SELECT
            dp.dispensary_id as store_id,
            d.name as store_name,
            COUNT(*) as sku_count,
            COUNT(DISTINCT dp.brand_name) as brand_count,
            AVG(extract_min_price(dp.latest_raw_payload)) as avg_price
          FROM dutchie_products dp
          JOIN dispensaries d ON dp.dispensary_id = d.id
          WHERE dp.type = $1
          GROUP BY dp.dispensary_id, d.name
        )
        SELECT
          scs.*,
          RANK() OVER (ORDER BY scs.sku_count DESC) as rank
        FROM store_category_stats scs
        ORDER BY scs.sku_count DESC
        LIMIT $2
      `, [category, limit]);

      return result.rows.map(row => ({
        category,
        storeId: row.store_id,
        storeName: row.store_name,
        skuCount: parseInt(row.sku_count) || 0,
        brandCount: parseInt(row.brand_count) || 0,
        avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
        changePercent7d: 0, // Would need historical data
        rank: parseInt(row.rank) || 0,
      }));
    }, 15)).data;
  }

  /**
   * Get stores with most activity (changes)
   */
  async getMostActiveStores(
    days: number = 7,
    limit: number = 10
  ): Promise<Array<{
    storeId: number;
    storeName: string;
    city: string;
    state: string;
    totalChanges: number;
    brandsChanged: number;
    productsChanged: number;
    priceChanges: number;
    stockChanges: number;
  }>> {
    const key = cacheKey('most_active_stores', { days, limit });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          d.id as store_id,
          d.name as store_name,
          d.city,
          d.state,
          COUNT(*) as total_changes,
          COUNT(*) FILTER (WHERE sce.event_type IN ('brand_added', 'brand_removed')) as brands_changed,
          COUNT(*) FILTER (WHERE sce.event_type IN ('product_added', 'product_removed')) as products_changed,
          COUNT(*) FILTER (WHERE sce.event_type IN ('price_drop', 'price_increase')) as price_changes,
          COUNT(*) FILTER (WHERE sce.event_type IN ('restocked', 'out_of_stock')) as stock_changes
        FROM store_change_events sce
        JOIN dispensaries d ON sce.store_id = d.id
        WHERE sce.event_date >= CURRENT_DATE - ($1 || ' days')::INTERVAL
        GROUP BY d.id, d.name, d.city, d.state
        ORDER BY total_changes DESC
        LIMIT $2
      `, [days, limit]);

      return result.rows.map(row => ({
        storeId: row.store_id,
        storeName: row.store_name,
        city: row.city,
        state: row.state,
        totalChanges: parseInt(row.total_changes) || 0,
        brandsChanged: parseInt(row.brands_changed) || 0,
        productsChanged: parseInt(row.products_changed) || 0,
        priceChanges: parseInt(row.price_changes) || 0,
        stockChanges: parseInt(row.stock_changes) || 0,
      }));
    }, 15)).data;
  }

  /**
   * Compare two stores
   */
  async compareStores(
    storeId1: number,
    storeId2: number
  ): Promise<{
    store1: { id: number; name: string; brands: string[]; categories: string[]; skuCount: number };
    store2: { id: number; name: string; brands: string[]; categories: string[]; skuCount: number };
    sharedBrands: string[];
    uniqueToStore1: string[];
    uniqueToStore2: string[];
    categoryComparison: Array<{
      category: string;
      store1Skus: number;
      store2Skus: number;
      difference: number;
    }>;
  }> {
    const key = cacheKey('compare_stores', { storeId1, storeId2 });

    return (await this.cache.getOrCompute(key, async () => {
      const [store1Data, store2Data] = await Promise.all([
        this.pool.query(`
          SELECT
            d.id, d.name,
            ARRAY_AGG(DISTINCT dp.brand_name) FILTER (WHERE dp.brand_name IS NOT NULL) as brands,
            ARRAY_AGG(DISTINCT dp.type) FILTER (WHERE dp.type IS NOT NULL) as categories,
            COUNT(*) as sku_count
          FROM dispensaries d
          LEFT JOIN dutchie_products dp ON d.id = dp.dispensary_id
          WHERE d.id = $1
          GROUP BY d.id, d.name
        `, [storeId1]),
        this.pool.query(`
          SELECT
            d.id, d.name,
            ARRAY_AGG(DISTINCT dp.brand_name) FILTER (WHERE dp.brand_name IS NOT NULL) as brands,
            ARRAY_AGG(DISTINCT dp.type) FILTER (WHERE dp.type IS NOT NULL) as categories,
            COUNT(*) as sku_count
          FROM dispensaries d
          LEFT JOIN dutchie_products dp ON d.id = dp.dispensary_id
          WHERE d.id = $1
          GROUP BY d.id, d.name
        `, [storeId2]),
      ]);

      const s1 = store1Data.rows[0];
      const s2 = store2Data.rows[0];

      const brands1Array: string[] = (s1?.brands || []).filter((b: string | null): b is string => b !== null);
      const brands2Array: string[] = (s2?.brands || []).filter((b: string | null): b is string => b !== null);
      const brands1 = new Set(brands1Array);
      const brands2 = new Set(brands2Array);

      const sharedBrands: string[] = brands1Array.filter(b => brands2.has(b));
      const uniqueToStore1: string[] = brands1Array.filter(b => !brands2.has(b));
      const uniqueToStore2: string[] = brands2Array.filter(b => !brands1.has(b));

      // Category comparison
      const categoryResult = await this.pool.query(`
        WITH store1_cats AS (
          SELECT type as category, COUNT(*) as sku_count
          FROM dutchie_products WHERE dispensary_id = $1 AND type IS NOT NULL
          GROUP BY type
        ),
        store2_cats AS (
          SELECT type as category, COUNT(*) as sku_count
          FROM dutchie_products WHERE dispensary_id = $2 AND type IS NOT NULL
          GROUP BY type
        ),
        all_cats AS (
          SELECT category FROM store1_cats
          UNION
          SELECT category FROM store2_cats
        )
        SELECT
          ac.category,
          COALESCE(s1.sku_count, 0) as store1_skus,
          COALESCE(s2.sku_count, 0) as store2_skus
        FROM all_cats ac
        LEFT JOIN store1_cats s1 ON ac.category = s1.category
        LEFT JOIN store2_cats s2 ON ac.category = s2.category
        ORDER BY (COALESCE(s1.sku_count, 0) + COALESCE(s2.sku_count, 0)) DESC
      `, [storeId1, storeId2]);

      return {
        store1: {
          id: s1?.id || storeId1,
          name: s1?.name || 'Unknown',
          brands: s1?.brands || [],
          categories: s1?.categories || [],
          skuCount: parseInt(s1?.sku_count) || 0,
        },
        store2: {
          id: s2?.id || storeId2,
          name: s2?.name || 'Unknown',
          brands: s2?.brands || [],
          categories: s2?.categories || [],
          skuCount: parseInt(s2?.sku_count) || 0,
        },
        sharedBrands,
        uniqueToStore1,
        uniqueToStore2,
        categoryComparison: categoryResult.rows.map(row => ({
          category: row.category,
          store1Skus: parseInt(row.store1_skus) || 0,
          store2Skus: parseInt(row.store2_skus) || 0,
          difference: (parseInt(row.store1_skus) || 0) - (parseInt(row.store2_skus) || 0),
        })),
      };
    }, 15)).data;
  }

  /**
   * Record a change event (used by crawler/worker)
   */
  async recordChangeEvent(event: {
    storeId: number;
    eventType: string;
    brandName?: string;
    productId?: number;
    productName?: string;
    category?: string;
    oldValue?: string;
    newValue?: string;
    metadata?: Record<string, unknown>;
  }): Promise<void> {
    await this.pool.query(`
      INSERT INTO store_change_events
        (store_id, event_type, event_date, brand_name, product_id, product_name, category, old_value, new_value, metadata)
      VALUES ($1, $2, CURRENT_DATE, $3, $4, $5, $6, $7, $8, $9)
    `, [
      event.storeId,
      event.eventType,
      event.brandName || null,
      event.productId || null,
      event.productName || null,
      event.category || null,
      event.oldValue || null,
      event.newValue || null,
      event.metadata ? JSON.stringify(event.metadata) : null,
    ]);

    // Invalidate cache
    await this.cache.invalidatePattern(`store_change_summary:storeId=${event.storeId}`);
  }
}
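
// Usage sketch (illustrative only; store/product ids and the surrounding
// worker code are assumptions): the write path a crawler would use, followed
// by the read path the dashboard uses.
//
//   const changes = new StoreChangeService(pool, cache);
//   await changes.recordChangeEvent({
//     storeId: 12,
//     eventType: 'price_drop',
//     productId: 9001,
//     productName: 'Example 1g Cartridge', // hypothetical product
//     oldValue: '45.00',
//     newValue: '39.00',
//   });
//   const summary = await changes.getStoreChangeSummary(12);
//   console.log(summary?.priceDrops7d);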

@@ -1,20 +1,27 @@
 /**
- * AZDHS Import Service
+ * LEGACY SERVICE - AZDHS Import
  *
+ * DEPRECATED: This service creates its own database pool.
+ * Future implementations should use the canonical CannaiQ connection.
+ *
  * Imports Arizona dispensaries from the main database's dispensaries table
  * (which was populated from AZDHS data) into the isolated Dutchie AZ database.
  *
  * This establishes the canonical list of AZ dispensaries to match against Dutchie.
+ *
+ * DO NOT:
+ * - Run this in automated jobs
+ * - Use DATABASE_URL directly
  */

 import { Pool } from 'pg';
 import { query as dutchieQuery } from '../db/connection';
 import { Dispensary } from '../types';

-// Main database connection (source of AZDHS data)
-const MAIN_DATABASE_URL =
-  process.env.DATABASE_URL ||
-  'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';
+// Single database connection (cannaiq in cannaiq-postgres container)
+// Use CANNAIQ_DB_* env vars or defaults
+const MAIN_DB_CONNECTION = process.env.CANNAIQ_DB_URL ||
+  `postgresql://${process.env.CANNAIQ_DB_USER || 'dutchie'}:${process.env.CANNAIQ_DB_PASS || 'dutchie_local_pass'}@${process.env.CANNAIQ_DB_HOST || 'localhost'}:${process.env.CANNAIQ_DB_PORT || '54320'}/${process.env.CANNAIQ_DB_NAME || 'cannaiq'}`;

 /**
  * AZDHS dispensary record from the main database
@@ -57,8 +64,9 @@ interface ImportResult {
  * Create a temporary connection to the main database
  */
 function getMainDBPool(): Pool {
+  console.warn('[AZDHS Import] LEGACY: Using separate pool. Should use canonical CannaiQ connection.');
   return new Pool({
-    connectionString: MAIN_DATABASE_URL,
+    connectionString: MAIN_DB_CONNECTION,
     max: 5,
     idleTimeoutMillis: 30000,
     connectionTimeoutMillis: 5000,

@@ -344,15 +344,12 @@ export async function resolvePlatformDispensaryIds(): Promise<{ resolved: number
   return { resolved, failed, skipped, notCrawlable };
 }

+// Use shared dispensary columns (handles optional columns like provider_detection_data)
+import { DISPENSARY_COLUMNS } from '../db/dispensary-columns';
+
 /**
  * Get all dispensaries
  */
-// Explicit column list for dispensaries table (avoids SELECT * issues with schema differences)
-const DISPENSARY_COLUMNS = `
-  id, name, slug, city, state, zip, address, latitude, longitude,
-  menu_type, menu_url, platform_dispensary_id, website,
-  provider_detection_data, created_at, updated_at
-`;
-
 export async function getAllDispensaries(): Promise<Dispensary[]> {
   const { rows } = await query(
@@ -386,7 +383,7 @@ export function mapDbRowToDispensary(row: any): Dispensary {
     id: row.id,
     platform: row.platform || 'dutchie', // keep platform as-is, default to 'dutchie'
     name: row.name,
-    dbaName: row.dbaName || row.dba_name,
+    dbaName: row.dbaName || row.dba_name || undefined, // dba_name column is optional
     slug: row.slug,
     city: row.city,
     state: row.state,
@@ -421,7 +418,6 @@ export async function getDispensaryById(id: number): Promise<Dispensary | null>
   SELECT
     id,
     name,
-    dba_name AS "dbaName",
     slug,
     city,
     state,

491
backend/src/dutchie-az/services/error-taxonomy.ts
Normal file
@@ -0,0 +1,491 @@
/**
 * Error Taxonomy Module
 *
 * Standardized error codes and classification for crawler reliability.
 * All crawl results must use these codes for consistent error handling.
 *
 * Phase 1: Crawler Reliability & Stabilization
 */

// ============================================================
// ERROR CODES
// ============================================================

/**
 * Standardized error codes for all crawl operations.
 * These codes are stored in the database for analytics and debugging.
 */
export const CrawlErrorCode = {
  // Success states
  SUCCESS: 'SUCCESS',

  // Rate limiting
  RATE_LIMITED: 'RATE_LIMITED', // 429 responses

  // Proxy issues
  BLOCKED_PROXY: 'BLOCKED_PROXY', // 407 or proxy-related blocks
  PROXY_TIMEOUT: 'PROXY_TIMEOUT', // Proxy connection timeout

  // Content issues
  HTML_CHANGED: 'HTML_CHANGED', // Page structure changed
  NO_PRODUCTS: 'NO_PRODUCTS', // Empty response (valid but no data)
  PARSE_ERROR: 'PARSE_ERROR', // Failed to parse response

  // Network issues
  TIMEOUT: 'TIMEOUT', // Request timeout
  NETWORK_ERROR: 'NETWORK_ERROR', // Connection failed
  DNS_ERROR: 'DNS_ERROR', // DNS resolution failed

  // Authentication
  AUTH_FAILED: 'AUTH_FAILED', // Authentication/session issues

  // Server errors
  SERVER_ERROR: 'SERVER_ERROR', // 5xx responses
  SERVICE_UNAVAILABLE: 'SERVICE_UNAVAILABLE', // 503

  // Configuration issues
  INVALID_CONFIG: 'INVALID_CONFIG', // Bad store configuration
  MISSING_PLATFORM_ID: 'MISSING_PLATFORM_ID', // No platform_dispensary_id

  // Unknown
  UNKNOWN_ERROR: 'UNKNOWN_ERROR', // Catch-all for unclassified errors
} as const;

export type CrawlErrorCodeType = typeof CrawlErrorCode[keyof typeof CrawlErrorCode];

// ============================================================
// ERROR CLASSIFICATION
// ============================================================

/**
 * Error metadata for each error code
 */
interface ErrorMetadata {
  code: CrawlErrorCodeType;
  retryable: boolean;
  rotateProxy: boolean;
  rotateUserAgent: boolean;
  backoffMultiplier: number;
  severity: 'low' | 'medium' | 'high' | 'critical';
  description: string;
}

/**
 * Metadata for each error code - defines retry behavior
 */
export const ERROR_METADATA: Record<CrawlErrorCodeType, ErrorMetadata> = {
  [CrawlErrorCode.SUCCESS]: {
    code: CrawlErrorCode.SUCCESS,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 0,
    severity: 'low',
    description: 'Crawl completed successfully',
  },

  [CrawlErrorCode.RATE_LIMITED]: {
    code: CrawlErrorCode.RATE_LIMITED,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: true,
    backoffMultiplier: 2.0,
    severity: 'medium',
    description: 'Rate limited by target (429)',
  },

  [CrawlErrorCode.BLOCKED_PROXY]: {
    code: CrawlErrorCode.BLOCKED_PROXY,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: true,
    backoffMultiplier: 1.5,
    severity: 'medium',
    description: 'Proxy blocked or rejected (407)',
  },

  [CrawlErrorCode.PROXY_TIMEOUT]: {
    code: CrawlErrorCode.PROXY_TIMEOUT,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'low',
    description: 'Proxy connection timed out',
  },

  [CrawlErrorCode.HTML_CHANGED]: {
    code: CrawlErrorCode.HTML_CHANGED,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'high',
    description: 'Page structure changed - needs selector update',
  },

  [CrawlErrorCode.NO_PRODUCTS]: {
    code: CrawlErrorCode.NO_PRODUCTS,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'low',
    description: 'No products returned (may be temporary)',
  },

  [CrawlErrorCode.PARSE_ERROR]: {
    code: CrawlErrorCode.PARSE_ERROR,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'medium',
    description: 'Failed to parse response data',
  },

  [CrawlErrorCode.TIMEOUT]: {
    code: CrawlErrorCode.TIMEOUT,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1.5,
    severity: 'medium',
    description: 'Request timed out',
  },

  [CrawlErrorCode.NETWORK_ERROR]: {
    code: CrawlErrorCode.NETWORK_ERROR,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'medium',
    description: 'Network connection failed',
  },

  [CrawlErrorCode.DNS_ERROR]: {
    code: CrawlErrorCode.DNS_ERROR,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'medium',
    description: 'DNS resolution failed',
  },

  [CrawlErrorCode.AUTH_FAILED]: {
    code: CrawlErrorCode.AUTH_FAILED,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: true,
    backoffMultiplier: 2.0,
    severity: 'high',
    description: 'Authentication or session failed',
  },

  [CrawlErrorCode.SERVER_ERROR]: {
    code: CrawlErrorCode.SERVER_ERROR,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.5,
    severity: 'medium',
    description: 'Server error (5xx)',
  },

  [CrawlErrorCode.SERVICE_UNAVAILABLE]: {
    code: CrawlErrorCode.SERVICE_UNAVAILABLE,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 2.0,
    severity: 'high',
    description: 'Service temporarily unavailable (503)',
  },

  [CrawlErrorCode.INVALID_CONFIG]: {
    code: CrawlErrorCode.INVALID_CONFIG,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 0,
    severity: 'critical',
    description: 'Invalid store configuration',
  },

  [CrawlErrorCode.MISSING_PLATFORM_ID]: {
    code: CrawlErrorCode.MISSING_PLATFORM_ID,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 0,
    severity: 'critical',
    description: 'Missing platform_dispensary_id',
  },

  [CrawlErrorCode.UNKNOWN_ERROR]: {
    code: CrawlErrorCode.UNKNOWN_ERROR,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'high',
    description: 'Unknown/unclassified error',
  },
};
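
// Sketch of how a caller might act on this metadata (the retry loop itself is
// not part of this module; `run`, `maxAttempts`, and `baseDelayMs` are
// illustrative assumptions):
//
//   async function retryWithTaxonomy(run: () => Promise<void>, maxAttempts = 3, baseDelayMs = 1000) {
//     for (let attempt = 1; attempt <= maxAttempts; attempt++) {
//       try {
//         return await run();
//       } catch (err) {
//         const code = classifyError(err as Error);
//         const meta = ERROR_METADATA[code];
//         if (!meta.retryable || attempt === maxAttempts) throw err;
//         // e.g. RATE_LIMITED backs off 1s * 2.0, then 1s * 4.0, ...
//         const delayMs = baseDelayMs * Math.pow(meta.backoffMultiplier, attempt);
//         await new Promise(resolve => setTimeout(resolve, delayMs));
//       }
//     }
//   }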

// ============================================================
// ERROR CLASSIFICATION FUNCTIONS
// ============================================================

/**
 * Classify an error into a standardized error code.
 *
 * @param error - The error to classify (Error object, string, or HTTP status)
 * @param httpStatus - Optional HTTP status code
 * @returns Standardized error code
 */
export function classifyError(
  error: Error | string | null,
  httpStatus?: number
): CrawlErrorCodeType {
  // Check HTTP status first
  if (httpStatus) {
    if (httpStatus === 429) return CrawlErrorCode.RATE_LIMITED;
    if (httpStatus === 407) return CrawlErrorCode.BLOCKED_PROXY;
    if (httpStatus === 401 || httpStatus === 403) return CrawlErrorCode.AUTH_FAILED;
    if (httpStatus === 503) return CrawlErrorCode.SERVICE_UNAVAILABLE;
    if (httpStatus >= 500) return CrawlErrorCode.SERVER_ERROR;
  }

  if (!error) return CrawlErrorCode.UNKNOWN_ERROR;

  const message = typeof error === 'string' ? error.toLowerCase() : error.message.toLowerCase();

  // Rate limiting patterns
  if (message.includes('rate limit') || message.includes('too many requests') || message.includes('429')) {
    return CrawlErrorCode.RATE_LIMITED;
  }

  // Proxy patterns
  if (message.includes('proxy') && (message.includes('block') || message.includes('reject') || message.includes('407'))) {
    return CrawlErrorCode.BLOCKED_PROXY;
  }

  // Timeout patterns
  if (message.includes('timeout') || message.includes('timed out') || message.includes('etimedout')) {
    if (message.includes('proxy')) {
      return CrawlErrorCode.PROXY_TIMEOUT;
    }
    return CrawlErrorCode.TIMEOUT;
  }

  // Network patterns
  if (message.includes('econnrefused') || message.includes('econnreset') || message.includes('network')) {
    return CrawlErrorCode.NETWORK_ERROR;
  }

  // DNS patterns
  if (message.includes('enotfound') || message.includes('dns') || message.includes('getaddrinfo')) {
    return CrawlErrorCode.DNS_ERROR;
  }

  // Auth patterns
  if (message.includes('auth') || message.includes('unauthorized') || message.includes('forbidden') || message.includes('401') || message.includes('403')) {
    return CrawlErrorCode.AUTH_FAILED;
  }

  // HTML change patterns
  if (message.includes('selector') || message.includes('element not found') || message.includes('structure changed')) {
    return CrawlErrorCode.HTML_CHANGED;
  }

  // Parse patterns
  if (message.includes('parse') || message.includes('json') || message.includes('syntax')) {
    return CrawlErrorCode.PARSE_ERROR;
  }

  // No products patterns
  if (message.includes('no products') || message.includes('empty') || message.includes('0 products')) {
    return CrawlErrorCode.NO_PRODUCTS;
  }

  // Server error patterns
  if (message.includes('500') || message.includes('502') || message.includes('503') || message.includes('504')) {
    return CrawlErrorCode.SERVER_ERROR;
  }

  // Config patterns
  if (message.includes('config') || message.includes('invalid') || message.includes('missing')) {
    if (message.includes('platform') || message.includes('dispensary_id')) {
      return CrawlErrorCode.MISSING_PLATFORM_ID;
    }
    return CrawlErrorCode.INVALID_CONFIG;
  }

  return CrawlErrorCode.UNKNOWN_ERROR;
}
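
// For example: classifyError(new Error('connect ETIMEDOUT')) returns TIMEOUT,
// classifyError(null, 429) returns RATE_LIMITED, and an unmatched message
// such as classifyError(new Error('boom')) falls through to UNKNOWN_ERROR.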

/**
 * Get metadata for an error code
 */
export function getErrorMetadata(code: CrawlErrorCodeType): ErrorMetadata {
  return ERROR_METADATA[code] || ERROR_METADATA[CrawlErrorCode.UNKNOWN_ERROR];
}

/**
 * Check if an error is retryable
 */
export function isRetryable(code: CrawlErrorCodeType): boolean {
  return getErrorMetadata(code).retryable;
}

/**
 * Check if proxy should be rotated for this error
 */
export function shouldRotateProxy(code: CrawlErrorCodeType): boolean {
  return getErrorMetadata(code).rotateProxy;
}

/**
 * Check if user agent should be rotated for this error
 */
export function shouldRotateUserAgent(code: CrawlErrorCodeType): boolean {
  return getErrorMetadata(code).rotateUserAgent;
}

/**
 * Get backoff multiplier for this error
 */
export function getBackoffMultiplier(code: CrawlErrorCodeType): number {
  return getErrorMetadata(code).backoffMultiplier;
}

// ============================================================
// CRAWL RESULT TYPE
// ============================================================

/**
 * Standardized crawl result with error taxonomy
 */
export interface CrawlResult {
  success: boolean;
  dispensaryId: number;

  // Error info
  errorCode: CrawlErrorCodeType;
  errorMessage?: string;
  httpStatus?: number;

  // Timing
  startedAt: Date;
  finishedAt: Date;
  durationMs: number;

  // Context
  attemptNumber: number;
  proxyUsed?: string;
  userAgentUsed?: string;

  // Metrics (on success)
  productsFound?: number;
  productsUpserted?: number;
  snapshotsCreated?: number;
  imagesDownloaded?: number;

  // Metadata
  metadata?: Record<string, any>;
}

/**
 * Create a success result
 */
export function createSuccessResult(
  dispensaryId: number,
  startedAt: Date,
  metrics: {
    productsFound: number;
    productsUpserted: number;
    snapshotsCreated: number;
    imagesDownloaded?: number;
  },
  context?: {
    attemptNumber?: number;
    proxyUsed?: string;
    userAgentUsed?: string;
  }
): CrawlResult {
  const finishedAt = new Date();
  return {
    success: true,
    dispensaryId,
    errorCode: CrawlErrorCode.SUCCESS,
    startedAt,
    finishedAt,
    durationMs: finishedAt.getTime() - startedAt.getTime(),
    attemptNumber: context?.attemptNumber || 1,
    proxyUsed: context?.proxyUsed,
    userAgentUsed: context?.userAgentUsed,
    ...metrics,
  };
}

/**
 * Create a failure result
 */
export function createFailureResult(
  dispensaryId: number,
  startedAt: Date,
  error: Error | string,
  httpStatus?: number,
  context?: {
    attemptNumber?: number;
    proxyUsed?: string;
    userAgentUsed?: string;
  }
): CrawlResult {
  const finishedAt = new Date();
  const errorCode = classifyError(error, httpStatus);
  const errorMessage = typeof error === 'string' ? error : error.message;

  return {
    success: false,
    dispensaryId,
    errorCode,
    errorMessage,
    httpStatus,
    startedAt,
    finishedAt,
    durationMs: finishedAt.getTime() - startedAt.getTime(),
    attemptNumber: context?.attemptNumber || 1,
    proxyUsed: context?.proxyUsed,
    userAgentUsed: context?.userAgentUsed,
  };
}

// ============================================================
// LOGGING HELPERS
// ============================================================

/**
 * Format error code for logging
 */
export function formatErrorForLog(result: CrawlResult): string {
  const metadata = getErrorMetadata(result.errorCode);
  const retryInfo = metadata.retryable ? '(retryable)' : '(non-retryable)';
  const proxyInfo = result.proxyUsed ? ` via ${result.proxyUsed}` : '';

  if (result.success) {
    return `[${result.errorCode}] Crawl successful: ${result.productsFound} products${proxyInfo}`;
  }

  return `[${result.errorCode}] ${result.errorMessage}${proxyInfo} ${retryInfo}`;
}

/**
 * Get user-friendly error description
 */
export function getErrorDescription(code: CrawlErrorCodeType): string {
  return getErrorMetadata(code).description;
}
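
// End-to-end sketch (illustrative; `crawlStoreMenu` and its return shape are
// hypothetical, not defined in this module):
//
//   const startedAt = new Date();
//   try {
//     const metrics = await crawlStoreMenu(dispensaryId);
//     console.log(formatErrorForLog(createSuccessResult(dispensaryId, startedAt, metrics)));
//   } catch (err) {
//     const result = createFailureResult(dispensaryId, startedAt, err as Error);
//     console.log(formatErrorForLog(result));
//     if (isRetryable(result.errorCode)) {
//       // re-enqueue, scaling the delay by getBackoffMultiplier(result.errorCode)
//     }
//   }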

@@ -8,6 +8,10 @@
 import { query, getClient } from '../db/connection';
 import { v4 as uuidv4 } from 'uuid';
 import * as os from 'os';
+import { DEFAULT_CONFIG } from './store-validator';

+// Minimum gap between crawls for the same dispensary (in minutes)
+const MIN_CRAWL_GAP_MINUTES = DEFAULT_CONFIG.minCrawlGapMinutes; // 2 minutes

 // ============================================================
 // TYPES
@@ -97,11 +101,30 @@ export function getWorkerHostname(): string {
 // JOB ENQUEUEING
 // ============================================================

+export interface EnqueueResult {
+  jobId: number | null;
+  skipped: boolean;
+  reason?: 'already_queued' | 'too_soon' | 'error';
+  message?: string;
+}
+
 /**
  * Enqueue a new job for processing
  * Returns null if a pending/running job already exists for this dispensary
+ * or if a job was completed/failed within the minimum gap period
  */
 export async function enqueueJob(options: EnqueueJobOptions): Promise<number | null> {
+  const result = await enqueueJobWithReason(options);
+  return result.jobId;
+}
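
// Sketch (illustrative; the 'store_crawl' job type is an assumption): callers
// that need to surface why nothing was queued should use the reason-bearing
// variant directly:
//
//   const res = await enqueueJobWithReason({ jobType: 'store_crawl', dispensaryId: 42 });
//   if (res.skipped) console.log(`skipped (${res.reason}): ${res.message}`);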

+/**
+ * Enqueue a new job with detailed result info
+ * Enforces:
+ * 1. No duplicate pending/running jobs for same dispensary
+ * 2. Minimum 2-minute gap between crawls for same dispensary
+ */
+export async function enqueueJobWithReason(options: EnqueueJobOptions): Promise<EnqueueResult> {
   const {
     jobType,
     dispensaryId,
@@ -121,10 +144,43 @@ export async function enqueueJob(options: EnqueueJobOptions): Promise<number | n

   if (existing.length > 0) {
     console.log(`[JobQueue] Skipping enqueue - job already exists for dispensary ${dispensaryId}`);
-    return null;
+    return {
+      jobId: null,
+      skipped: true,
+      reason: 'already_queued',
+      message: `Job already pending/running for dispensary ${dispensaryId}`,
+    };
   }

+  // Check minimum gap since last job (2 minutes)
+  const { rows: recent } = await query<any>(
+    `SELECT id, created_at, status
+     FROM dispensary_crawl_jobs
+     WHERE dispensary_id = $1
+     ORDER BY created_at DESC
+     LIMIT 1`,
+    [dispensaryId]
+  );

+  if (recent.length > 0) {
+    const lastJobTime = new Date(recent[0].created_at);
+    const minGapMs = MIN_CRAWL_GAP_MINUTES * 60 * 1000;
+    const timeSinceLastJob = Date.now() - lastJobTime.getTime();

+    if (timeSinceLastJob < minGapMs) {
+      const waitSeconds = Math.ceil((minGapMs - timeSinceLastJob) / 1000);
+      console.log(`[JobQueue] Skipping enqueue - minimum ${MIN_CRAWL_GAP_MINUTES}min gap not met for dispensary ${dispensaryId}. Wait ${waitSeconds}s`);
+      return {
+        jobId: null,
+        skipped: true,
+        reason: 'too_soon',
+        message: `Minimum ${MIN_CRAWL_GAP_MINUTES}-minute gap required. Try again in ${waitSeconds} seconds.`,
+      };
+    }
+  }
 }

 try {
   const { rows } = await query<any>(
     `INSERT INTO dispensary_crawl_jobs (job_type, dispensary_id, status, priority, max_retries, metadata, created_at)
      VALUES ($1, $2, 'pending', $3, $4, $5, NOW())
@@ -134,18 +190,41 @@ export async function enqueueJob(options: EnqueueJobOptions): Promise<number | n

   const jobId = rows[0].id;
   console.log(`[JobQueue] Enqueued job ${jobId} (type=${jobType}, dispensary=${dispensaryId})`);
-  return jobId;
+  return { jobId, skipped: false };
+} catch (error: any) {
+  // Handle database trigger rejection for minimum gap
+  if (error.message?.includes('Minimum') && error.message?.includes('gap')) {
+    console.log(`[JobQueue] DB rejected - minimum gap not met for dispensary ${dispensaryId}`);
+    return {
+      jobId: null,
+      skipped: true,
+      reason: 'too_soon',
+      message: error.message,
+    };
+  }
+  throw error;
+}
 }

+export interface BulkEnqueueResult {
+  enqueued: number;
+  skipped: number;
+  skippedReasons: {
+    alreadyQueued: number;
+    tooSoon: number;
+  };
+}
+
 /**
  * Bulk enqueue jobs for multiple dispensaries
  * Skips dispensaries that already have pending/running jobs
+ * or have jobs within the minimum gap period
  */
 export async function bulkEnqueueJobs(
   jobType: string,
   dispensaryIds: number[],
   options: { priority?: number; metadata?: Record<string, any> } = {}
-): Promise<{ enqueued: number; skipped: number }> {
+): Promise<BulkEnqueueResult> {
   const { priority = 0, metadata } = options;

   // Get dispensaries that already have pending/running jobs
@@ -156,11 +235,31 @@ export async function bulkEnqueueJobs(
   );
   const existingSet = new Set(existing.map((r: any) => r.dispensary_id));

-  // Filter out dispensaries with existing jobs
-  const toEnqueue = dispensaryIds.filter(id => !existingSet.has(id));
+  // Get dispensaries that have recent jobs within minimum gap
+  const { rows: recent } = await query<any>(
+    `SELECT DISTINCT dispensary_id FROM dispensary_crawl_jobs
+     WHERE dispensary_id = ANY($1)
+       AND created_at > NOW() - ($2 || ' minutes')::INTERVAL
+       AND dispensary_id NOT IN (
+         SELECT dispensary_id FROM dispensary_crawl_jobs
+         WHERE dispensary_id = ANY($1) AND status IN ('pending', 'running')
+       )`,
+    [dispensaryIds, MIN_CRAWL_GAP_MINUTES]
+  );
+  const recentSet = new Set(recent.map((r: any) => r.dispensary_id));
+
+  // Filter out dispensaries with existing or recent jobs
+  const toEnqueue = dispensaryIds.filter(id => !existingSet.has(id) && !recentSet.has(id));

   if (toEnqueue.length === 0) {
-    return { enqueued: 0, skipped: dispensaryIds.length };
+    return {
+      enqueued: 0,
+      skipped: dispensaryIds.length,
+      skippedReasons: {
+        alreadyQueued: existingSet.size,
+        tooSoon: recentSet.size,
+      },
+    };
   }

   // Bulk insert - each row needs 4 params: job_type, dispensary_id, priority, metadata
@@ -181,8 +280,15 @@ export async function bulkEnqueueJobs(
     params
   );

-  console.log(`[JobQueue] Bulk enqueued ${toEnqueue.length} jobs, skipped ${existingSet.size}`);
-  return { enqueued: toEnqueue.length, skipped: existingSet.size };
+  console.log(`[JobQueue] Bulk enqueued ${toEnqueue.length} jobs, skipped ${existingSet.size} (queued) + ${recentSet.size} (recent)`);
+  return {
+    enqueued: toEnqueue.length,
+    skipped: existingSet.size + recentSet.size,
+    skippedReasons: {
+      alreadyQueued: existingSet.size,
+      tooSoon: recentSet.size,
+    },
+  };
 }
|
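A quick sketch of how a dashboard caller might surface the new skippedReasons breakdown (the IDs and priority are invented; the call shape follows the signature above):

const res = await bulkEnqueueJobs('product_crawl', [1, 2, 3], { priority: 5 });
console.log(
  `${res.enqueued} enqueued, ${res.skipped} skipped ` +
  `(${res.skippedReasons.alreadyQueued} already queued, ${res.skippedReasons.tooSoon} too soon)`
);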
// ============================================================
@@ -311,22 +417,48 @@ export async function heartbeat(jobId: number): Promise<void> {

/**
 * Mark job as completed
 *
 * Stores visibility tracking stats (visibilityLostCount, visibilityRestoredCount)
 * in the metadata JSONB column for dashboard analytics.
 */
export async function completeJob(
  jobId: number,
- result: { productsFound?: number; productsUpserted?: number; snapshotsCreated?: number }
+ result: {
+   productsFound?: number;
+   productsUpserted?: number;
+   snapshotsCreated?: number;
+   visibilityLostCount?: number;
+   visibilityRestoredCount?: number;
+ }
): Promise<void> {
  // Build metadata with visibility stats if provided
  const metadata: Record<string, any> = {};
  if (result.visibilityLostCount !== undefined) {
    metadata.visibilityLostCount = result.visibilityLostCount;
  }
  if (result.visibilityRestoredCount !== undefined) {
    metadata.visibilityRestoredCount = result.visibilityRestoredCount;
  }
  if (result.snapshotsCreated !== undefined) {
    metadata.snapshotsCreated = result.snapshotsCreated;
  }

  await query(
    `UPDATE dispensary_crawl_jobs
     SET
       status = 'completed',
       completed_at = NOW(),
       products_found = COALESCE($2, products_found),
       products_upserted = COALESCE($3, products_upserted),
-      snapshots_created = COALESCE($4, snapshots_created),
+      products_updated = COALESCE($3, products_updated),
+      metadata = COALESCE(metadata, '{}'::jsonb) || $4::jsonb,
       updated_at = NOW()
     WHERE id = $1`,
-   [jobId, result.productsFound, result.productsUpserted, result.snapshotsCreated]
+   [
+     jobId,
+     result.productsFound,
+     result.productsUpserted,
+     JSON.stringify(metadata),
+   ]
  );
  console.log(`[JobQueue] Job ${jobId} completed`);
}
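The `||` in the SQL merges the new keys into whatever is already stored rather than replacing the whole column. A hypothetical call (all numbers invented) to make the flow concrete:

await completeJob(101, {
  productsFound: 250,
  productsUpserted: 248,
  snapshotsCreated: 250,
  visibilityLostCount: 3,
  visibilityRestoredCount: 1,
});
// $4 is '{"visibilityLostCount":3,"visibilityRestoredCount":1,"snapshotsCreated":250}',
// so COALESCE(metadata, '{}'::jsonb) || $4::jsonb keeps any existing metadata keys
// (say, one written at enqueue time) and overlays only these three.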
@@ -16,12 +16,8 @@ import { extractCNameFromMenuUrl, extractFromMenuUrl, mapDbRowToDispensary } fro
import { resolveDispensaryId } from './graphql-client';
import { Dispensary, JobStatus } from '../types';

-// Explicit column list for dispensaries table (avoids SELECT * issues with schema differences)
-const DISPENSARY_COLUMNS = `
-  id, name, slug, city, state, zip, address, latitude, longitude,
-  menu_type, menu_url, platform_dispensary_id, website,
-  provider_detection_data, created_at, updated_at
-`;
+// Use shared dispensary columns (handles optional columns like provider_detection_data)
+import { DISPENSARY_COLUMNS } from '../db/dispensary-columns';
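The shared module itself is not included in this diff; a plausible sketch of it, inferred directly from the inlined constant it replaces (treat the file path and contents as assumptions):

// backend/src/dutchie-az/db/dispensary-columns.ts (assumed, not shown in this PR)
// Central column list so every service selects the same optional-safe set.
export const DISPENSARY_COLUMNS = `
  id, name, slug, city, state, zip, address, latitude, longitude,
  menu_type, menu_url, platform_dispensary_id, website,
  provider_detection_data, created_at, updated_at
`;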
// ============================================================
// TYPES
@@ -647,6 +643,9 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
    `
    UPDATE dispensaries SET
      menu_type = 'dutchie',
      last_id_resolution_at = NOW(),
      id_resolution_attempts = COALESCE(id_resolution_attempts, 0) + 1,
      id_resolution_error = $1,
      provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
        jsonb_build_object(
          'detected_provider', 'dutchie'::text,
@@ -660,7 +659,7 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
    `,
    [result.error, dispensaryId]
  );
- console.log(`[MenuDetection] ${dispensary.name}: ${result.error}`);
+ console.log(`[Henry - Entry Point Finder] ${dispensary.name}: ${result.error}`);
  return result;
}

@@ -675,6 +674,9 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
    UPDATE dispensaries SET
      menu_type = 'dutchie',
      platform_dispensary_id = $1,
      last_id_resolution_at = NOW(),
      id_resolution_attempts = COALESCE(id_resolution_attempts, 0) + 1,
      id_resolution_error = NULL,
      provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
        jsonb_build_object(
          'detected_provider', 'dutchie'::text,
@@ -691,7 +693,7 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
    `,
    [platformId, dispensaryId]
  );
- console.log(`[MenuDetection] ${dispensary.name}: Platform ID extracted directly from URL = ${platformId}`);
+ console.log(`[Henry - Entry Point Finder] ${dispensary.name}: Platform ID extracted directly from URL = ${platformId}`);
  return result;
}

@@ -714,6 +716,9 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
    UPDATE dispensaries SET
      menu_type = 'dutchie',
      platform_dispensary_id = $1,
      last_id_resolution_at = NOW(),
      id_resolution_attempts = COALESCE(id_resolution_attempts, 0) + 1,
      id_resolution_error = NULL,
      provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
        jsonb_build_object(
          'detected_provider', 'dutchie'::text,
@@ -730,10 +735,10 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
    `,
    [platformId, cName, dispensaryId]
  );
- console.log(`[MenuDetection] ${dispensary.name}: Resolved platform ID = ${platformId}`);
+ console.log(`[Henry - Entry Point Finder] ${dispensary.name}: Resolved platform ID = ${platformId}`);
} else {
  // cName resolution failed - try crawling website as fallback
- console.log(`[MenuDetection] ${dispensary.name}: cName "${cName}" not found on Dutchie, trying website crawl fallback...`);
+ console.log(`[Henry - Entry Point Finder] ${dispensary.name}: cName "${cName}" not found on Dutchie, trying website crawl fallback...`);

  if (website && website.trim() !== '') {
    const fallbackCrawl = await crawlWebsiteForMenuLinks(website);
@@ -796,6 +801,9 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
    UPDATE dispensaries SET
      menu_type = 'dutchie',
      platform_dispensary_id = NULL,
      last_id_resolution_at = NOW(),
      id_resolution_attempts = COALESCE(id_resolution_attempts, 0) + 1,
      id_resolution_error = $2,
      provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
        jsonb_build_object(
          'detected_provider', 'dutchie'::text,
@@ -812,7 +820,7 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
    `,
    [cName, result.error, dispensaryId]
  );
- console.log(`[MenuDetection] ${dispensary.name}: ${result.error}`);
+ console.log(`[Henry - Entry Point Finder] ${dispensary.name}: ${result.error}`);
}
} catch (error: any) {
  result.error = `Resolution failed: ${error.message}`;
@@ -820,6 +828,9 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
    `
    UPDATE dispensaries SET
      menu_type = 'dutchie',
      last_id_resolution_at = NOW(),
      id_resolution_attempts = COALESCE(id_resolution_attempts, 0) + 1,
      id_resolution_error = $2,
      provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
        jsonb_build_object(
          'detected_provider', 'dutchie'::text,
@@ -835,7 +846,7 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
    `,
    [cName, result.error, dispensaryId]
  );
- console.error(`[MenuDetection] ${dispensary.name}: ${result.error}`);
+ console.error(`[Henry - Entry Point Finder] ${dispensary.name}: ${result.error}`);
}

return result;
@@ -844,6 +855,11 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
/**
 * Run bulk detection on all dispensaries with unknown/missing menu_type or platform_dispensary_id
 * Also includes dispensaries with no menu_url but with a website (for website crawl discovery)
 *
 * Enhanced for Henry (Entry Point Finder) to also process:
 * - Stores with slug changes that need re-resolution
 * - Recently added stores from Alice's discovery
 * - Stores that failed resolution and need retry
 */
export async function runBulkDetection(options: {
  state?: string;
@@ -851,6 +867,9 @@ export async function runBulkDetection(options: {
  onlyMissingPlatformId?: boolean;
  includeWebsiteCrawl?: boolean; // Include dispensaries with website but no menu_url
  includeDutchieMissingPlatformId?: boolean; // include menu_type='dutchie' with null platform_id
  includeSlugChanges?: boolean; // Include stores where Alice detected slug changes
  includeRecentlyAdded?: boolean; // Include stores recently added by Alice
  scope?: { states?: string[]; storeIds?: number[] }; // Scope filtering for sharding
  limit?: number;
} = {}): Promise<BulkDetectionResult> {
  const {
@@ -859,14 +878,23 @@ export async function runBulkDetection(options: {
    onlyMissingPlatformId = false,
    includeWebsiteCrawl = true,
    includeDutchieMissingPlatformId = true,
    includeSlugChanges = true,
    includeRecentlyAdded = true,
    scope,
    limit,
  } = options;

- console.log('[MenuDetection] Starting bulk detection...');
+ const scopeDesc = scope?.states?.length
+   ? ` (states: ${scope.states.join(', ')})`
+   : scope?.storeIds?.length
+     ? ` (${scope.storeIds.length} specific stores)`
+     : state ? ` (state: ${state})` : '';
+
+ console.log(`[Henry - Entry Point Finder] Starting bulk detection${scopeDesc}...`);

  // Build query to find dispensaries needing detection
  // Includes: dispensaries with menu_url OR (no menu_url but has website and not already marked not_crawlable)
- // Optionally includes dutchie stores missing platform ID
+ // Optionally includes dutchie stores missing platform ID, slug changes, and recently added stores
  let whereClause = `WHERE (
    menu_url IS NOT NULL
    ${includeWebsiteCrawl ? `OR (
@@ -882,7 +910,14 @@ export async function runBulkDetection(options: {
  const params: any[] = [];
  let paramIndex = 1;

- if (state) {
+ // Apply scope filtering (takes precedence over single state filter)
+ if (scope?.storeIds?.length) {
+   whereClause += ` AND id = ANY($${paramIndex++})`;
+   params.push(scope.storeIds);
+ } else if (scope?.states?.length) {
+   whereClause += ` AND state = ANY($${paramIndex++})`;
+   params.push(scope.states);
+ } else if (state) {
    whereClause += ` AND state = $${paramIndex++}`;
    params.push(state);
  }
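A sketch of how the sharding scope might be exercised, with invented state codes and store IDs; per the branch order above, storeIds takes precedence over states, which takes precedence over the legacy single-state filter:

await runBulkDetection({ scope: { states: ['AZ', 'NM'] }, limit: 500 }); // shard by state
await runBulkDetection({ scope: { storeIds: [101, 102, 103] } });        // targeted re-resolution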
@@ -962,6 +997,19 @@ export async function runBulkDetection(options: {

/**
 * Execute the menu detection job (called by scheduler)
 *
 * Worker: Henry (Entry Point Finder)
 * Uses METHOD 1 (reactEnv extraction) as primary method per user requirements.
 *
 * Scope filtering:
 * - config.scope.states: Array of state codes to limit detection (e.g., ["AZ", "CA"])
 * - config.scope.storeIds: Array of specific store IDs to process
 *
 * Processes:
 * - Stores with unknown/missing menu_type
 * - Stores with missing platform_dispensary_id
 * - Stores with slug changes that need re-resolution (from Alice)
 * - Recently added stores (discovered by Alice)
 */
export async function executeMenuDetectionJob(config: Record<string, any> = {}): Promise<{
  status: JobStatus;
@@ -972,19 +1020,31 @@ export async function executeMenuDetectionJob(config: Record<string, any> = {}):
  metadata?: any;
}> {
  const state = config.state || 'AZ';
  const scope = config.scope as { states?: string[]; storeIds?: number[] } | undefined;
  const onlyUnknown = config.onlyUnknown !== false;
  // Default to true - always try to resolve platform IDs for dutchie stores
  const onlyMissingPlatformId = config.onlyMissingPlatformId !== false;
  const includeDutchieMissingPlatformId = config.includeDutchieMissingPlatformId !== false;
  const includeSlugChanges = config.includeSlugChanges !== false;
  const includeRecentlyAdded = config.includeRecentlyAdded !== false;

- console.log(`[MenuDetection] Executing scheduled job for state=${state}...`);
+ const scopeDesc = scope?.states?.length
+   ? ` (states: ${scope.states.join(', ')})`
+   : scope?.storeIds?.length
+     ? ` (${scope.storeIds.length} specific stores)`
+     : ` (state: ${state})`;
+
+ console.log(`[Henry - Entry Point Finder] Executing scheduled job${scopeDesc}...`);

  try {
    const result = await runBulkDetection({
-     state,
+     state: scope ? undefined : state, // Use scope if provided, otherwise fall back to state
      scope,
      onlyUnknown,
      onlyMissingPlatformId,
      includeDutchieMissingPlatformId,
      includeSlugChanges,
      includeRecentlyAdded,
    });

    const status: JobStatus =
@@ -998,9 +1058,11 @@ export async function executeMenuDetectionJob(config: Record<string, any> = {}):
      itemsFailed: result.totalFailed,
      errorMessage: result.errors.length > 0 ? result.errors.slice(0, 5).join('; ') : undefined,
      metadata: {
-       state,
+       scope: scope || { states: [state] },
        onlyUnknown,
        onlyMissingPlatformId,
        includeSlugChanges,
        includeRecentlyAdded,
        providerCounts: countByProvider(result.results),
      },
    };
@@ -1011,6 +1073,7 @@ export async function executeMenuDetectionJob(config: Record<string, any> = {}):
      itemsSucceeded: 0,
      itemsFailed: 0,
      errorMessage: error.message,
      metadata: { scope: scope || { states: [state] } },
    };
  }
}
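For context, hypothetical scheduler payloads exercising the config reads above (state codes and store IDs invented):

await executeMenuDetectionJob({ scope: { states: ['AZ'] } });          // shard by state
await executeMenuDetectionJob({ scope: { storeIds: [5512, 5980] } }); // re-run two stores
await executeMenuDetectionJob({});                                    // legacy path: defaults to state 'AZ'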
@@ -24,12 +24,8 @@ import {
} from '../types';
import { downloadProductImage, imageExists } from '../../utils/image-storage';

-// Explicit column list for dispensaries table (avoids SELECT * issues with schema differences)
-const DISPENSARY_COLUMNS = `
-  id, name, slug, city, state, zip, address, latitude, longitude,
-  menu_type, menu_url, platform_dispensary_id, website,
-  provider_detection_data, created_at, updated_at
-`;
+// Use shared dispensary columns (handles optional columns like provider_detection_data)
+import { DISPENSARY_COLUMNS } from '../db/dispensary-columns';

// ============================================================
// BATCH PROCESSING CONFIGURATION
@@ -648,10 +644,15 @@ async function updateDispensaryCrawlStats(
}

/**
- * Mark products as missing from feed
+ * Mark products as missing from feed (visibility-loss detection)
 * Creates a snapshot with isPresentInFeed=false and stockStatus='missing_from_feed'
 * for products that were NOT in the UNION of Mode A and Mode B product lists
 *
+ * Bella (Product Sync) visibility tracking:
+ * - Sets visibility_lost=TRUE and visibility_lost_at=NOW() for disappearing products
+ * - Records visibility event in snapshot metadata JSONB
+ * - NEVER deletes products, just marks them as visibility-lost
+ *
 * IMPORTANT: Uses UNION of both modes to avoid false positives
 * If the union is empty (possible outage), we skip marking to avoid data corruption
 */
@@ -660,25 +661,28 @@ async function markMissingProducts(
  platformDispensaryId: string,
  modeAProductIds: Set<string>,
  modeBProductIds: Set<string>,
- pricingType: 'rec' | 'med'
-): Promise<number> {
+ pricingType: 'rec' | 'med',
+ workerName: string = 'Bella'
+): Promise<{ markedMissing: number; newlyLost: number }> {
  // Build UNION of Mode A + Mode B product IDs
  const unionProductIds = new Set<string>([...Array.from(modeAProductIds), ...Array.from(modeBProductIds)]);

  // OUTAGE DETECTION: If union is empty, something went wrong - don't mark anything as missing
  if (unionProductIds.size === 0) {
-   console.warn('[ProductCrawler] OUTAGE DETECTED: Both Mode A and Mode B returned 0 products. Skipping missing product marking.');
-   return 0;
+   console.warn(`[${workerName} - Product Sync] OUTAGE DETECTED: Both Mode A and Mode B returned 0 products. Skipping visibility-loss marking.`);
+   return { markedMissing: 0, newlyLost: 0 };
  }

  // Get all existing products for this dispensary that were not in the UNION
  // Also check if they were already marked as visibility_lost to track new losses
  const { rows: missingProducts } = await query<{
    id: number;
    external_product_id: string;
    name: string;
    visibility_lost: boolean;
  }>(
    `
-   SELECT id, external_product_id, name
+   SELECT id, external_product_id, name, COALESCE(visibility_lost, FALSE) as visibility_lost
    FROM dutchie_products
    WHERE dispensary_id = $1
      AND external_product_id NOT IN (SELECT unnest($2::text[]))
@@ -687,21 +691,27 @@ async function markMissingProducts(
  );

  if (missingProducts.length === 0) {
-   return 0;
+   return { markedMissing: 0, newlyLost: 0 };
  }

- console.log(`[ProductCrawler] Marking ${missingProducts.length} products as missing from feed (union of ${modeAProductIds.size} Mode A + ${modeBProductIds.size} Mode B = ${unionProductIds.size} unique)...`);
+ // Separate newly lost products from already-lost products
+ const newlyLostProducts = missingProducts.filter(p => !p.visibility_lost);
+ const alreadyLostProducts = missingProducts.filter(p => p.visibility_lost);
+
+ console.log(`[${workerName} - Product Sync] Visibility check: ${missingProducts.length} products missing (${newlyLostProducts.length} newly lost, ${alreadyLostProducts.length} already lost)`);

  const crawledAt = new Date();

- // Build all missing snapshots first (per CLAUDE.md Rule #15 - batch writes)
- const missingSnapshots: Partial<DutchieProductSnapshot>[] = missingProducts.map(product => ({
+ // Build all missing snapshots with visibility_events metadata
+ const missingSnapshots: Partial<DutchieProductSnapshot>[] = missingProducts.map(product => {
+   const isNewlyLost = !product.visibility_lost;
+   return {
      dutchieProductId: product.id,
      dispensaryId,
      platformDispensaryId,
      externalProductId: product.external_product_id,
      pricingType,
-     crawlMode: 'mode_a' as CrawlMode, // Use mode_a for missing snapshots (convention)
+     crawlMode: 'mode_a' as CrawlMode,
      status: undefined,
      featured: false,
      special: false,
@@ -709,37 +719,113 @@ async function markMissingProducts(
      recOnly: false,
      isPresentInFeed: false,
      stockStatus: 'missing_from_feed' as StockStatus,
-     totalQuantityAvailable: undefined, // null = unknown, not 0
+     totalQuantityAvailable: undefined,
      manualInventory: false,
      isBelowThreshold: false,
      isBelowKioskThreshold: false,
      options: [],
-     rawPayload: { _missingFromFeed: true, lastKnownName: product.name },
+     rawPayload: {
+       _missingFromFeed: true,
+       lastKnownName: product.name,
+       visibility_events: isNewlyLost ? [{
+         event_type: 'visibility_lost',
+         timestamp: crawledAt.toISOString(),
+         worker_name: workerName,
+       }] : [],
+     },
      crawledAt,
- }));
+   };
+ });

  // Batch insert missing snapshots
  const snapshotsInserted = await batchInsertSnapshots(missingSnapshots);

- // Batch update product stock status in chunks
+ // Batch update product visibility status in chunks
  const productIds = missingProducts.map(p => p.id);
  const productChunks = chunkArray(productIds, BATCH_CHUNK_SIZE);

- console.log(`[ProductCrawler] Updating ${productIds.length} product statuses in ${productChunks.length} chunks...`);
+ console.log(`[${workerName} - Product Sync] Updating ${productIds.length} product visibility in ${productChunks.length} chunks...`);

  for (const chunk of productChunks) {
    // Update all products: set stock_status to missing
    // Only set visibility_lost_at for NEWLY lost products (not already lost)
    await query(
      `
      UPDATE dutchie_products
-     SET stock_status = 'missing_from_feed', total_quantity_available = NULL, updated_at = NOW()
+     SET
+       stock_status = 'missing_from_feed',
+       total_quantity_available = NULL,
+       visibility_lost = TRUE,
+       visibility_lost_at = CASE
+         WHEN visibility_lost IS NULL OR visibility_lost = FALSE THEN NOW()
+         ELSE visibility_lost_at -- Keep existing timestamp for already-lost products
+       END,
+       updated_at = NOW()
      WHERE id = ANY($1::int[])
      `,
      [chunk]
    );
  }

- console.log(`[ProductCrawler] Marked ${snapshotsInserted} products as missing from feed`);
- return snapshotsInserted;
+ console.log(`[${workerName} - Product Sync] Marked ${snapshotsInserted} products as missing, ${newlyLostProducts.length} newly visibility-lost`);
+ return { markedMissing: snapshotsInserted, newlyLost: newlyLostProducts.length };
}
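With these columns in place, a dashboard can list products that went dark recently. A sketch; the table and flag columns are exactly the ones written above, while the 24-hour window is an invented choice:

const { rows: recentlyLost } = await query<any>(
  `SELECT id, name, visibility_lost_at
     FROM dutchie_products
    WHERE dispensary_id = $1
      AND visibility_lost = TRUE
      AND visibility_lost_at > NOW() - INTERVAL '24 hours'
    ORDER BY visibility_lost_at DESC`,
  [dispensaryId]
);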
/**
 * Restore visibility for products that reappeared in the feed
 * Called when products that were previously visibility_lost=TRUE are now found in the feed
 *
 * Bella (Product Sync) visibility tracking:
 * - Sets visibility_lost=FALSE and visibility_restored_at=NOW()
 * - Logs the restoration event
 */
async function restoreVisibilityForProducts(
  dispensaryId: number,
  productIds: Set<string>,
  workerName: string = 'Bella'
): Promise<number> {
  if (productIds.size === 0) {
    return 0;
  }

  // Find products that were visibility_lost and are now in the feed
  const { rows: restoredProducts } = await query<{ id: number; external_product_id: string }>(
    `
    SELECT id, external_product_id
    FROM dutchie_products
    WHERE dispensary_id = $1
      AND visibility_lost = TRUE
      AND external_product_id = ANY($2::text[])
    `,
    [dispensaryId, Array.from(productIds)]
  );

  if (restoredProducts.length === 0) {
    return 0;
  }

  console.log(`[${workerName} - Product Sync] Restoring visibility for ${restoredProducts.length} products that reappeared`);

  // Batch update restored products
  const restoredIds = restoredProducts.map(p => p.id);
  const chunks = chunkArray(restoredIds, BATCH_CHUNK_SIZE);

  for (const chunk of chunks) {
    await query(
      `
      UPDATE dutchie_products
      SET
        visibility_lost = FALSE,
        visibility_restored_at = NOW(),
        updated_at = NOW()
      WHERE id = ANY($1::int[])
      `,
      [chunk]
    );
  }

  console.log(`[${workerName} - Product Sync] Restored visibility for ${restoredProducts.length} products`);
  return restoredProducts.length;
}

// ============================================================
@@ -756,9 +842,12 @@ export interface CrawlResult {
  modeAProducts?: number;
  modeBProducts?: number;
  missingProductsMarked?: number;
  visibilityLostCount?: number; // Products newly marked as visibility_lost
  visibilityRestoredCount?: number; // Products restored from visibility_lost
  imagesDownloaded?: number;
  imageErrors?: number;
  errorMessage?: string;
  httpStatus?: number; // HTTP status code for error classification
  durationMs: number;
}
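These two counters flow back into completeJob (see the job-queue hunk earlier), which persists them in the job's metadata. A rough sketch of the hand-off; the runner-side names and the crawl call's exact argument list are assumptions, as they are not shown in this diff:

// Hypothetical job-runner wiring.
const crawl: CrawlResult = await crawlDispensaryProducts(dispensary, pricingType);
await completeJob(jobId, {
  snapshotsCreated: crawl.snapshotsCreated, // assumed among CrawlResult's earlier fields
  visibilityLostCount: crawl.visibilityLostCount,
  visibilityRestoredCount: crawl.visibilityRestoredCount,
});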
@@ -1005,21 +1094,38 @@ export async function crawlDispensaryProducts(
  }
}

// Build union of all product IDs found in both modes
const allFoundProductIds = new Set<string>([
  ...Array.from(modeAProductIds),
  ...Array.from(modeBProductIds),
]);

// VISIBILITY RESTORATION: Check if any previously-lost products have reappeared
const visibilityRestored = await restoreVisibilityForProducts(
  dispensary.id,
  allFoundProductIds,
  'Bella'
);

// Mark products as missing using UNION of Mode A + Mode B
// The function handles outage detection (empty union = skip marking)
-missingMarked = await markMissingProducts(
+// Now also tracks newly lost products vs already-lost products
+const missingResult = await markMissingProducts(
  dispensary.id,
  dispensary.platformDispensaryId,
  modeAProductIds,
  modeBProductIds,
- pricingType
+ pricingType,
+ 'Bella'
);
missingMarked = missingResult.markedMissing;
const newlyLostCount = missingResult.newlyLost;
totalSnapshots += missingMarked;

// Update dispensary stats
await updateDispensaryCrawlStats(dispensary.id, totalUpserted);

-console.log(`[ProductCrawler] Completed: ${totalUpserted} products, ${totalSnapshots} snapshots, ${missingMarked} marked missing, ${totalImagesDownloaded} images downloaded`);
+console.log(`[Bella - Product Sync] Completed: ${totalUpserted} products, ${totalSnapshots} snapshots, ${missingMarked} missing, ${newlyLostCount} newly lost, ${visibilityRestored} restored, ${totalImagesDownloaded} images`);

const totalProductsFound = modeAProducts + modeBProducts;
return {
@@ -1032,6 +1138,8 @@ export async function crawlDispensaryProducts(
  modeAProducts,
  modeBProducts,
  missingProductsMarked: missingMarked,
  visibilityLostCount: newlyLostCount,
  visibilityRestoredCount: visibilityRestored,
  imagesDownloaded: totalImagesDownloaded,
  imageErrors: totalImageErrors,
  durationMs: Date.now() - startTime,
455 backend/src/dutchie-az/services/proxy-rotator.ts Normal file
@@ -0,0 +1,455 @@
/**
 * Proxy & User Agent Rotator
 *
 * Manages rotation of proxies and user agents to avoid blocks.
 * Integrates with error taxonomy for intelligent rotation decisions.
 *
 * Phase 1: Crawler Reliability & Stabilization
 */

import { Pool } from 'pg';

// ============================================================
// USER AGENT CONFIGURATION
// ============================================================

/**
 * Modern browser user agents (Chrome, Firefox, Safari, Edge on various platforms)
 * Updated: 2024
 */
export const USER_AGENTS = [
  // Chrome on Windows
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',

  // Chrome on macOS
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',

  // Firefox on Windows
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0',

  // Firefox on macOS
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0',

  // Safari on macOS
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',

  // Edge on Windows
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',

  // Chrome on Linux
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
];

// ============================================================
// PROXY TYPES
// ============================================================

export interface Proxy {
  id: number;
  host: string;
  port: number;
  username?: string;
  password?: string;
  protocol: 'http' | 'https' | 'socks5';
  isActive: boolean;
  lastUsedAt: Date | null;
  failureCount: number;
  successCount: number;
  avgResponseTimeMs: number | null;
}

export interface ProxyStats {
  totalProxies: number;
  activeProxies: number;
  blockedProxies: number;
  avgSuccessRate: number;
}

// ============================================================
// PROXY ROTATOR CLASS
// ============================================================

export class ProxyRotator {
  private pool: Pool | null = null;
  private proxies: Proxy[] = [];
  private currentIndex: number = 0;
  private lastRotation: Date = new Date();

  constructor(pool?: Pool) {
    this.pool = pool || null;
  }

  /**
   * Initialize with database pool
   */
  setPool(pool: Pool): void {
    this.pool = pool;
  }

  /**
   * Load proxies from database
   */
  async loadProxies(): Promise<void> {
    if (!this.pool) {
      console.warn('[ProxyRotator] No database pool configured');
      return;
    }

    try {
      const result = await this.pool.query<Proxy>(`
        SELECT
          id,
          host,
          port,
          username,
          password,
          protocol,
          is_active as "isActive",
          last_used_at as "lastUsedAt",
          failure_count as "failureCount",
          success_count as "successCount",
          avg_response_time_ms as "avgResponseTimeMs"
        FROM proxies
        WHERE is_active = true
        ORDER BY failure_count ASC, last_used_at ASC NULLS FIRST
      `);

      this.proxies = result.rows;
      console.log(`[ProxyRotator] Loaded ${this.proxies.length} active proxies`);
    } catch (error) {
      // Table might not exist - that's okay
      console.warn(`[ProxyRotator] Could not load proxies: ${error}`);
      this.proxies = [];
    }
  }

  /**
   * Get next proxy in rotation
   */
  getNext(): Proxy | null {
    if (this.proxies.length === 0) return null;

    // Round-robin rotation
    this.currentIndex = (this.currentIndex + 1) % this.proxies.length;
    this.lastRotation = new Date();

    return this.proxies[this.currentIndex];
  }

  /**
   * Get current proxy without rotating
   */
  getCurrent(): Proxy | null {
    if (this.proxies.length === 0) return null;
    return this.proxies[this.currentIndex];
  }

  /**
   * Get proxy by ID
   */
  getById(id: number): Proxy | null {
    return this.proxies.find(p => p.id === id) || null;
  }

  /**
   * Rotate to a specific proxy
   */
  setProxy(id: number): boolean {
    const index = this.proxies.findIndex(p => p.id === id);
    if (index === -1) return false;

    this.currentIndex = index;
    this.lastRotation = new Date();
    return true;
  }

  /**
   * Mark proxy as failed (temporarily remove from rotation)
   */
  async markFailed(proxyId: number, error?: string): Promise<void> {
    // Update in-memory
    const proxy = this.proxies.find(p => p.id === proxyId);
    if (proxy) {
      proxy.failureCount++;

      // Deactivate if too many failures
      if (proxy.failureCount >= 5) {
        proxy.isActive = false;
        this.proxies = this.proxies.filter(p => p.id !== proxyId);
        console.log(`[ProxyRotator] Proxy ${proxyId} deactivated after ${proxy.failureCount} failures`);
      }
    }

    // Update database
    if (this.pool) {
      try {
        await this.pool.query(`
          UPDATE proxies
          SET
            failure_count = failure_count + 1,
            last_failure_at = NOW(),
            last_error = $2,
            is_active = CASE WHEN failure_count >= 4 THEN false ELSE is_active END
          WHERE id = $1
        `, [proxyId, error || null]);
      } catch (err) {
        console.error(`[ProxyRotator] Failed to update proxy ${proxyId}:`, err);
      }
    }
  }

  /**
   * Mark proxy as successful
   */
  async markSuccess(proxyId: number, responseTimeMs?: number): Promise<void> {
    // Update in-memory
    const proxy = this.proxies.find(p => p.id === proxyId);
    if (proxy) {
      proxy.successCount++;
      proxy.lastUsedAt = new Date();
      if (responseTimeMs !== undefined) {
        // Rolling average
        proxy.avgResponseTimeMs = proxy.avgResponseTimeMs
          ? (proxy.avgResponseTimeMs * 0.8) + (responseTimeMs * 0.2)
          : responseTimeMs;
      }
    }

    // Update database
    if (this.pool) {
      try {
        await this.pool.query(`
          UPDATE proxies
          SET
            success_count = success_count + 1,
            last_used_at = NOW(),
            avg_response_time_ms = CASE
              WHEN avg_response_time_ms IS NULL THEN $2
              ELSE (avg_response_time_ms * 0.8) + ($2 * 0.2)
            END
          WHERE id = $1
        `, [proxyId, responseTimeMs || null]);
      } catch (err) {
        console.error(`[ProxyRotator] Failed to update proxy ${proxyId}:`, err);
      }
    }
  }

  /**
   * Get proxy URL for HTTP client
   */
  getProxyUrl(proxy: Proxy): string {
    const auth = proxy.username && proxy.password
      ? `${proxy.username}:${proxy.password}@`
      : '';
    return `${proxy.protocol}://${auth}${proxy.host}:${proxy.port}`;
  }

  /**
   * Get stats about proxy pool
   */
  getStats(): ProxyStats {
    const totalProxies = this.proxies.length;
    const activeProxies = this.proxies.filter(p => p.isActive).length;
    const blockedProxies = this.proxies.filter(p => p.failureCount >= 5).length;

    const successRates = this.proxies
      .filter(p => p.successCount + p.failureCount > 0)
      .map(p => p.successCount / (p.successCount + p.failureCount));

    const avgSuccessRate = successRates.length > 0
      ? successRates.reduce((a, b) => a + b, 0) / successRates.length
      : 0;

    return {
      totalProxies,
      activeProxies,
      blockedProxies,
      avgSuccessRate,
    };
  }

  /**
   * Check if proxy pool has available proxies
   */
  hasAvailableProxies(): boolean {
    return this.proxies.length > 0;
  }
}
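A minimal usage sketch for the class above; the pool wiring and the timing value are invented, and only methods defined in this file are used:

import { Pool } from 'pg';

const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const rotator = new ProxyRotator(pool);
await rotator.loadProxies();

const proxy = rotator.getNext();
if (proxy) {
  const url = rotator.getProxyUrl(proxy); // e.g. http://user:pass@10.0.0.5:8080
  try {
    // ...issue the crawl request through `url` with your HTTP client of choice...
    await rotator.markSuccess(proxy.id, 420 /* response time ms, invented */);
  } catch (err) {
    await rotator.markFailed(proxy.id, String(err));
  }
}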
// ============================================================
// USER AGENT ROTATOR CLASS
// ============================================================

export class UserAgentRotator {
  private userAgents: string[];
  private currentIndex: number = 0;
  private lastRotation: Date = new Date();

  constructor(userAgents: string[] = USER_AGENTS) {
    this.userAgents = userAgents;
    // Start at random index to avoid patterns
    this.currentIndex = Math.floor(Math.random() * userAgents.length);
  }

  /**
   * Get next user agent in rotation
   */
  getNext(): string {
    this.currentIndex = (this.currentIndex + 1) % this.userAgents.length;
    this.lastRotation = new Date();
    return this.userAgents[this.currentIndex];
  }

  /**
   * Get current user agent without rotating
   */
  getCurrent(): string {
    return this.userAgents[this.currentIndex];
  }

  /**
   * Get a random user agent
   */
  getRandom(): string {
    const index = Math.floor(Math.random() * this.userAgents.length);
    return this.userAgents[index];
  }

  /**
   * Get total available user agents
   */
  getCount(): number {
    return this.userAgents.length;
  }
}

// ============================================================
// COMBINED ROTATOR (for convenience)
// ============================================================

export class CrawlRotator {
  public proxy: ProxyRotator;
  public userAgent: UserAgentRotator;

  constructor(pool?: Pool) {
    this.proxy = new ProxyRotator(pool);
    this.userAgent = new UserAgentRotator();
  }

  /**
   * Initialize rotator (load proxies from DB)
   */
  async initialize(): Promise<void> {
    await this.proxy.loadProxies();
  }

  /**
   * Rotate proxy only
   */
  rotateProxy(): Proxy | null {
    return this.proxy.getNext();
  }

  /**
   * Rotate user agent only
   */
  rotateUserAgent(): string {
    return this.userAgent.getNext();
  }

  /**
   * Rotate both proxy and user agent
   */
  rotateBoth(): { proxy: Proxy | null; userAgent: string } {
    return {
      proxy: this.proxy.getNext(),
      userAgent: this.userAgent.getNext(),
    };
  }

  /**
   * Get current proxy and user agent without rotating
   */
  getCurrent(): { proxy: Proxy | null; userAgent: string } {
    return {
      proxy: this.proxy.getCurrent(),
      userAgent: this.userAgent.getCurrent(),
    };
  }

  /**
   * Record success for current proxy
   */
  async recordSuccess(responseTimeMs?: number): Promise<void> {
    const current = this.proxy.getCurrent();
    if (current) {
      await this.proxy.markSuccess(current.id, responseTimeMs);
    }
  }

  /**
   * Record failure for current proxy
   */
  async recordFailure(error?: string): Promise<void> {
    const current = this.proxy.getCurrent();
    if (current) {
      await this.proxy.markFailed(current.id, error);
    }
  }
}
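A sketch of rotating identity between attempts; `pool` is as in the ProxyRotator example above, and the failure string is invented:

const crawler = new CrawlRotator(pool);
await crawler.initialize();
const { proxy, userAgent } = crawler.rotateBoth();
// ...attempt the request with proxy + userAgent...
await crawler.recordSuccess(350); // or crawler.recordFailure('HTTP 403') on a block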
// ============================================================
// DATABASE OPERATIONS
// ============================================================

/**
 * Update dispensary's current proxy and user agent
 */
export async function updateDispensaryRotation(
  pool: Pool,
  dispensaryId: number,
  proxyId: number | null,
  userAgent: string | null
): Promise<void> {
  await pool.query(`
    UPDATE dispensaries
    SET
      current_proxy_id = $2,
      current_user_agent = $3
    WHERE id = $1
  `, [dispensaryId, proxyId, userAgent]);
}

/**
 * Get dispensary's current proxy and user agent
 */
export async function getDispensaryRotation(
  pool: Pool,
  dispensaryId: number
): Promise<{ proxyId: number | null; userAgent: string | null }> {
  const result = await pool.query(`
    SELECT current_proxy_id as "proxyId", current_user_agent as "userAgent"
    FROM dispensaries
    WHERE id = $1
  `, [dispensaryId]);

  if (result.rows.length === 0) {
    return { proxyId: null, userAgent: null };
  }

  return result.rows[0];
}

// ============================================================
// SINGLETON INSTANCES
// ============================================================

export const proxyRotator = new ProxyRotator();
export const userAgentRotator = new UserAgentRotator();
export const crawlRotator = new CrawlRotator();
435 backend/src/dutchie-az/services/retry-manager.ts Normal file
@@ -0,0 +1,435 @@
/**
 * Unified Retry Manager
 *
 * Handles retry logic with exponential backoff, jitter, and
 * intelligent error-based decisions (rotate proxy, rotate UA, etc.)
 *
 * Phase 1: Crawler Reliability & Stabilization
 */

import {
  CrawlErrorCodeType,
  CrawlErrorCode,
  classifyError,
  getErrorMetadata,
  isRetryable,
  shouldRotateProxy,
  shouldRotateUserAgent,
  getBackoffMultiplier,
} from './error-taxonomy';
import { DEFAULT_CONFIG } from './store-validator';

// ============================================================
// RETRY CONFIGURATION
// ============================================================

export interface RetryConfig {
  maxRetries: number;
  baseBackoffMs: number;
  maxBackoffMs: number;
  backoffMultiplier: number;
  jitterFactor: number; // 0.0 - 1.0 (percentage of backoff to randomize)
}

export const DEFAULT_RETRY_CONFIG: RetryConfig = {
  maxRetries: DEFAULT_CONFIG.maxRetries,
  baseBackoffMs: DEFAULT_CONFIG.baseBackoffMs,
  maxBackoffMs: DEFAULT_CONFIG.maxBackoffMs,
  backoffMultiplier: DEFAULT_CONFIG.backoffMultiplier,
  jitterFactor: 0.25, // +/- 25% jitter
};
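To make the knobs concrete, a worked trace assuming baseBackoffMs = 1000, backoffMultiplier = 2, and maxBackoffMs = 30000 (the real defaults come from DEFAULT_CONFIG in store-validator, which is not shown in this diff):

// attempt 1: 1000 * 2^0 = 1000 ms, jittered to 750-1250 ms (+/- 25%)
// attempt 2: 1000 * 2^1 = 2000 ms, jittered to 1500-2500 ms
// attempt 3: 1000 * 2^2 = 4000 ms, jittered to 3000-5000 ms
// ...capped at maxBackoffMs, then scaled by the per-error multiplier
// from getBackoffMultiplier(errorCode).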
// ============================================================
// RETRY CONTEXT
// ============================================================

/**
 * Context for tracking retry state across attempts
 */
export interface RetryContext {
  attemptNumber: number;
  maxAttempts: number;
  lastErrorCode: CrawlErrorCodeType | null;
  lastHttpStatus: number | null;
  totalBackoffMs: number;
  proxyRotated: boolean;
  userAgentRotated: boolean;
  startedAt: Date;
}

/**
 * Decision about what to do after an error
 */
export interface RetryDecision {
  shouldRetry: boolean;
  reason: string;
  backoffMs: number;
  rotateProxy: boolean;
  rotateUserAgent: boolean;
  errorCode: CrawlErrorCodeType;
  attemptNumber: number;
}

// ============================================================
// RETRY MANAGER CLASS
// ============================================================

export class RetryManager {
  private config: RetryConfig;
  private context: RetryContext;

  constructor(config: Partial<RetryConfig> = {}) {
    this.config = { ...DEFAULT_RETRY_CONFIG, ...config };
    this.context = this.createInitialContext();
  }

  /**
   * Create initial retry context
   */
  private createInitialContext(): RetryContext {
    return {
      attemptNumber: 0,
      maxAttempts: this.config.maxRetries + 1, // +1 for initial attempt
      lastErrorCode: null,
      lastHttpStatus: null,
      totalBackoffMs: 0,
      proxyRotated: false,
      userAgentRotated: false,
      startedAt: new Date(),
    };
  }

  /**
   * Reset retry state for a new operation
   */
  reset(): void {
    this.context = this.createInitialContext();
  }

  /**
   * Get current attempt number (1-based)
   */
  getAttemptNumber(): number {
    return this.context.attemptNumber + 1;
  }

  /**
   * Check if we should attempt (call before each attempt)
   */
  shouldAttempt(): boolean {
    return this.context.attemptNumber < this.context.maxAttempts;
  }

  /**
   * Record an attempt (call at start of each attempt)
   */
  recordAttempt(): void {
    this.context.attemptNumber++;
  }

  /**
   * Evaluate an error and decide what to do
   */
  evaluateError(
    error: Error | string | null,
    httpStatus?: number
  ): RetryDecision {
    const errorCode = classifyError(error, httpStatus);
    const metadata = getErrorMetadata(errorCode);
    const attemptNumber = this.context.attemptNumber;

    // Update context
    this.context.lastErrorCode = errorCode;
    this.context.lastHttpStatus = httpStatus || null;

    // Check if error is retryable
    if (!isRetryable(errorCode)) {
      return {
        shouldRetry: false,
        reason: `Error ${errorCode} is not retryable: ${metadata.description}`,
        backoffMs: 0,
        rotateProxy: false,
        rotateUserAgent: false,
        errorCode,
        attemptNumber,
      };
    }

    // Check if we've exhausted retries
    if (!this.shouldAttempt()) {
      return {
        shouldRetry: false,
        reason: `Max retries (${this.config.maxRetries}) exhausted`,
        backoffMs: 0,
        rotateProxy: false,
        rotateUserAgent: false,
        errorCode,
        attemptNumber,
      };
    }

    // Calculate backoff with exponential increase and jitter
    const baseBackoff = this.calculateBackoff(attemptNumber, errorCode);
    const backoffWithJitter = this.addJitter(baseBackoff);

    // Track total backoff
    this.context.totalBackoffMs += backoffWithJitter;

    // Determine rotation needs
    const rotateProxy = shouldRotateProxy(errorCode);
    const rotateUserAgent = shouldRotateUserAgent(errorCode);

    if (rotateProxy) this.context.proxyRotated = true;
    if (rotateUserAgent) this.context.userAgentRotated = true;

    const rotationInfo = [];
    if (rotateProxy) rotationInfo.push('rotate proxy');
    if (rotateUserAgent) rotationInfo.push('rotate UA');
    const rotationStr = rotationInfo.length > 0 ? ` (${rotationInfo.join(', ')})` : '';

    return {
      shouldRetry: true,
      reason: `Retrying after ${errorCode}${rotationStr}, backoff ${backoffWithJitter}ms`,
      backoffMs: backoffWithJitter,
      rotateProxy,
      rotateUserAgent,
      errorCode,
      attemptNumber,
    };
  }

  /**
   * Calculate exponential backoff for an attempt
   */
  private calculateBackoff(attemptNumber: number, errorCode: CrawlErrorCodeType): number {
    // Base exponential: baseBackoff * multiplier^(attempt-1)
    const exponential = this.config.baseBackoffMs *
      Math.pow(this.config.backoffMultiplier, attemptNumber - 1);

    // Apply error-specific multiplier
    const errorMultiplier = getBackoffMultiplier(errorCode);
    const adjusted = exponential * errorMultiplier;

    // Cap at max backoff
    return Math.min(adjusted, this.config.maxBackoffMs);
  }

  /**
   * Add jitter to backoff to prevent thundering herd
   */
  private addJitter(backoffMs: number): number {
    const jitterRange = backoffMs * this.config.jitterFactor;
    // Random between -jitterRange and +jitterRange
    const jitter = (Math.random() * 2 - 1) * jitterRange;
    return Math.max(0, Math.round(backoffMs + jitter));
  }

  /**
   * Get retry context summary
   */
  getSummary(): RetryContextSummary {
    const elapsedMs = Date.now() - this.context.startedAt.getTime();
    return {
      attemptsMade: this.context.attemptNumber,
      maxAttempts: this.context.maxAttempts,
      lastErrorCode: this.context.lastErrorCode,
      lastHttpStatus: this.context.lastHttpStatus,
      totalBackoffMs: this.context.totalBackoffMs,
      totalElapsedMs: elapsedMs,
      proxyWasRotated: this.context.proxyRotated,
      userAgentWasRotated: this.context.userAgentRotated,
    };
  }
}
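A sketch of driving RetryManager by hand; the withRetry helper below wraps exactly this pattern, and fetchMenuPage is an invented stand-in for the real request:

const mgr = new RetryManager({ maxRetries: 2 }); // 3 attempts total
while (mgr.shouldAttempt()) {
  mgr.recordAttempt();
  try {
    await fetchMenuPage(); // hypothetical request
    break;
  } catch (err) {
    const decision = mgr.evaluateError(err as Error, (err as any)?.status);
    if (!decision.shouldRetry) throw err;
    await sleep(decision.backoffMs); // sleep() is exported further down this file
  }
}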
export interface RetryContextSummary {
  attemptsMade: number;
  maxAttempts: number;
  lastErrorCode: CrawlErrorCodeType | null;
  lastHttpStatus: number | null;
  totalBackoffMs: number;
  totalElapsedMs: number;
  proxyWasRotated: boolean;
  userAgentWasRotated: boolean;
}

// ============================================================
// CONVENIENCE FUNCTIONS
// ============================================================

/**
 * Sleep for specified milliseconds
 */
export function sleep(ms: number): Promise<void> {
  return new Promise(resolve => setTimeout(resolve, ms));
}

/**
 * Execute a function with automatic retry logic
 */
export async function withRetry<T>(
  fn: (attemptNumber: number) => Promise<T>,
  config: Partial<RetryConfig> = {},
  callbacks?: {
    onRetry?: (decision: RetryDecision) => void | Promise<void>;
    onRotateProxy?: () => void | Promise<void>;
    onRotateUserAgent?: () => void | Promise<void>;
  }
): Promise<{ result: T; summary: RetryContextSummary }> {
  const manager = new RetryManager(config);

  while (manager.shouldAttempt()) {
    manager.recordAttempt();
    const attemptNumber = manager.getAttemptNumber();

    try {
      const result = await fn(attemptNumber);
      return { result, summary: manager.getSummary() };
    } catch (error) {
      const err = error instanceof Error ? error : new Error(String(error));
      const httpStatus = (error as any)?.status || (error as any)?.statusCode;

      const decision = manager.evaluateError(err, httpStatus);

      if (!decision.shouldRetry) {
        // Re-throw with enhanced context
        const enhancedError = new RetryExhaustedError(
          `${err.message} (${decision.reason})`,
          err,
          manager.getSummary()
        );
        throw enhancedError;
      }

      // Notify callbacks
      if (callbacks?.onRetry) {
        await callbacks.onRetry(decision);
      }
      if (decision.rotateProxy && callbacks?.onRotateProxy) {
        await callbacks.onRotateProxy();
      }
      if (decision.rotateUserAgent && callbacks?.onRotateUserAgent) {
        await callbacks.onRotateUserAgent();
      }

      // Log retry decision
      console.log(
        `[RetryManager] Attempt ${attemptNumber} failed: ${decision.errorCode}. ` +
        `${decision.reason}. Waiting ${decision.backoffMs}ms before retry.`
      );

      // Wait before retry
      await sleep(decision.backoffMs);
    }
  }

  // Should not reach here, but handle edge case
  throw new RetryExhaustedError(
    'Max retries exhausted',
    null,
    manager.getSummary()
  );
}
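A sketch combining withRetry with the CrawlRotator from proxy-rotator.ts; `crawlRotator` is the singleton exported there, while fetchStoreMenu and storeUrl are invented:

const { result, summary } = await withRetry(
  async (attempt) => fetchStoreMenu(storeUrl, crawlRotator.getCurrent()),
  { maxRetries: 3 },
  {
    onRotateProxy: async () => { crawlRotator.rotateProxy(); },
    onRotateUserAgent: async () => { crawlRotator.rotateUserAgent(); },
  }
);
console.log(`Succeeded after ${summary.attemptsMade} attempt(s), ${summary.totalBackoffMs}ms spent backing off`);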
// ============================================================
// CUSTOM ERROR CLASS
// ============================================================

export class RetryExhaustedError extends Error {
  public readonly originalError: Error | null;
  public readonly summary: RetryContextSummary;
  public readonly errorCode: CrawlErrorCodeType;

  constructor(
    message: string,
    originalError: Error | null,
    summary: RetryContextSummary
  ) {
    super(message);
    this.name = 'RetryExhaustedError';
    this.originalError = originalError;
    this.summary = summary;
    this.errorCode = summary.lastErrorCode || CrawlErrorCode.UNKNOWN_ERROR;
  }
}

// ============================================================
// BACKOFF CALCULATOR (for external use)
// ============================================================

/**
 * Calculate next crawl time based on consecutive failures
 */
export function calculateNextCrawlDelay(
  consecutiveFailures: number,
  baseFrequencyMinutes: number,
  maxBackoffMultiplier: number = 4.0
): number {
  // Each failure doubles the delay, up to max multiplier
  const multiplier = Math.min(
    Math.pow(2, consecutiveFailures),
    maxBackoffMultiplier
  );

  const delayMinutes = baseFrequencyMinutes * multiplier;

  // Add jitter (0-10% of delay)
  const jitterMinutes = delayMinutes * Math.random() * 0.1;

  return Math.round(delayMinutes + jitterMinutes);
}

/**
 * Calculate next crawl timestamp
 */
export function calculateNextCrawlAt(
  consecutiveFailures: number,
  baseFrequencyMinutes: number
): Date {
  const delayMinutes = calculateNextCrawlDelay(consecutiveFailures, baseFrequencyMinutes);
  return new Date(Date.now() + delayMinutes * 60 * 1000);
}
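A worked example of the cap and jitter, with an invented base frequency of 60 minutes:

// failures=0 -> min(2^0, 4) = 1x -> ~60-66 min (plus 0-10% jitter)
// failures=1 -> min(2^1, 4) = 2x -> ~120-132 min
// failures=2 -> min(2^2, 4) = 4x -> ~240-264 min
// failures=3 -> min(2^3, 4) = 4x (capped) -> ~240-264 min
const nextAt = calculateNextCrawlAt(3, 60);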
// ============================================================
// STATUS DETERMINATION
// ============================================================

/**
 * Determine crawl status based on failure count
 */
export function determineCrawlStatus(
  consecutiveFailures: number,
  thresholds: { degraded: number; failed: number } = { degraded: 3, failed: 10 }
): 'active' | 'degraded' | 'failed' {
  if (consecutiveFailures >= thresholds.failed) {
    return 'failed';
  }
  if (consecutiveFailures >= thresholds.degraded) {
    return 'degraded';
  }
  return 'active';
}

/**
 * Determine if store should be auto-recovered
 * (Called periodically to check if failed stores can be retried)
 */
export function shouldAttemptRecovery(
  lastFailureAt: Date | null,
  consecutiveFailures: number,
  recoveryIntervalHours: number = 24
): boolean {
  if (!lastFailureAt) return true;

  // Wait longer for more failures
  const waitHours = recoveryIntervalHours * Math.min(consecutiveFailures, 5);
  const recoveryTime = new Date(lastFailureAt.getTime() + waitHours * 60 * 60 * 1000);

  return new Date() >= recoveryTime;
}

// ============================================================
// SINGLETON INSTANCE
// ============================================================

export const retryManager = new RetryManager();