feat: Add v2 architecture with multi-state support and orchestrator services
Major additions:
- Multi-state expansion: states table, StateSelector, NationalDashboard, StateHeatmap, CrossStateCompare
- Orchestrator services: trace service, error taxonomy, retry manager, proxy rotator
- Discovery system: dutchie discovery service, geo validation, city seeding scripts
- Analytics infrastructure: analytics v2 routes, brand/pricing/stores intelligence pages
- Local development: setup-local.sh starts all 5 services (postgres, backend, cannaiq, findadispo, findagram)
- Migrations 037-056: crawler profiles, states, analytics indexes, worker metadata

Frontend pages added:
- Discovery, ChainsDashboard, IntelligenceBrands, IntelligencePricing, IntelligenceStores
- StateHeatmap, CrossStateCompare, SyncInfoPanel

Components added:
- StateSelector, OrchestratorTraceModal, WorkflowStepper

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

.gitignore (new file, vendored, 48 lines)
@@ -0,0 +1,48 @@
# Dependencies
node_modules/

# Build outputs (compiled JS, not source)
backend/dist/
cannaiq/dist/
findadispo/build/
findagram/build/
frontend/dist/

# Environment files (local secrets)
.env
.env.local
.env.*.local
backend/.env
backend/.env.local

# Database dumps and backups (large files)
*.dump
*.sql.backup
backup_*.sql

# IDE
.idea/
.vscode/
*.swp
*.swo

# OS
.DS_Store
Thumbs.db

# Logs
*.log
npm-debug.log*

# Local storage (runtime data, not source)
backend/storage/

# Vite cache
**/node_modules/.vite/

# Test coverage
coverage/

# Temporary files
*.tmp
*.temp

backend/.env.example (new file, 50 lines)
@@ -0,0 +1,50 @@
# CannaiQ Backend Environment Configuration
# Copy this file to .env and fill in the values

# Server
PORT=3010
NODE_ENV=development

# =============================================================================
# CANNAIQ DATABASE (dutchie_menus) - PRIMARY DATABASE
# =============================================================================
# This is where ALL schema migrations run and where canonical tables live.
# All CANNAIQ_DB_* variables are REQUIRED - no defaults.
# The application will fail to start if any are missing.

CANNAIQ_DB_HOST=localhost
CANNAIQ_DB_PORT=54320
CANNAIQ_DB_NAME=dutchie_menus  # MUST be dutchie_menus - NOT dutchie_legacy
CANNAIQ_DB_USER=dutchie
CANNAIQ_DB_PASS=

# Alternative: Use a full connection URL instead of individual vars
# If set, this takes priority over individual vars above
# CANNAIQ_DB_URL=postgresql://user:pass@host:port/dutchie_menus

# =============================================================================
# LEGACY DATABASE (dutchie_legacy) - READ-ONLY FOR ETL
# =============================================================================
# Used ONLY by ETL scripts to read historical data.
# NEVER run migrations against this database.
# These are only needed when running 042_legacy_import.ts

LEGACY_DB_HOST=localhost
LEGACY_DB_PORT=54320
LEGACY_DB_NAME=dutchie_legacy  # READ-ONLY - never migrated
LEGACY_DB_USER=dutchie
LEGACY_DB_PASS=

# Alternative: Use a full connection URL instead of individual vars
# LEGACY_DB_URL=postgresql://user:pass@host:port/dutchie_legacy

# =============================================================================
# LOCAL STORAGE
# =============================================================================
# Local image storage path (no MinIO)
LOCAL_IMAGES_PATH=./public/images

# =============================================================================
# AUTHENTICATION
# =============================================================================
JWT_SECRET=your-secret-key-change-in-production
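
The comments above promise fail-fast behavior when any CANNAIQ_DB_* variable is missing. A minimal sketch of such a guard (illustrative only; the backend's actual startup validation may differ):

```typescript
// Hypothetical startup guard: exit immediately when a required
// CANNAIQ_DB_* variable is missing, matching the contract above.
const required = [
  'CANNAIQ_DB_HOST',
  'CANNAIQ_DB_PORT',
  'CANNAIQ_DB_NAME',
  'CANNAIQ_DB_USER',
  'CANNAIQ_DB_PASS',
];

const missing = required.filter((name) => process.env[name] === undefined);
if (missing.length > 0) {
  console.error(`Missing required env vars: ${missing.join(', ')}`);
  process.exit(1);
}
```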

backend/docker-compose.local.yml (new file, 30 lines)
@@ -0,0 +1,30 @@
# CannaiQ Local Development Environment
# Run: docker-compose -f docker-compose.local.yml up -d
#
# Services:
# - cannaiq-postgres: PostgreSQL at localhost:54320
#
# Note: Backend and frontend run outside Docker for faster dev iteration

version: '3.8'

services:
  cannaiq-postgres:
    image: postgres:15-alpine
    container_name: cannaiq-postgres
    environment:
      POSTGRES_USER: cannaiq
      POSTGRES_PASSWORD: cannaiq_local_pass
      POSTGRES_DB: cannaiq
    ports:
      - "54320:5432"
    volumes:
      - cannaiq-postgres-data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U cannaiq"]
      interval: 10s
      timeout: 5s
      retries: 5

volumes:
  cannaiq-postgres-data:

backend/docs/ANALYTICS_RUNBOOK.md (new file, 712 lines)
@@ -0,0 +1,712 @@
# CannaiQ Analytics Runbook

Phase 3: Analytics Engine - Complete Implementation Guide

## Overview

The CannaiQ Analytics Engine provides real-time insights into cannabis market data across price trends, brand penetration, category performance, store changes, and competitive positioning.

## Architecture

```
┌─────────────────────────────────────────────────────────────────┐
│                            API Layer                             │
│                       /api/az/analytics/*                        │
└─────────────────────────────────────────────────────────────────┘
                                 │
                                 ▼
┌─────────────────────────────────────────────────────────────────┐
│                        Analytics Services                        │
│  ┌──────────────┐ ┌──────────────┐ ┌──────────────────────┐     │
│  │ PriceTrend   │ │ Penetration  │ │ CategoryAnalytics    │     │
│  │ Service      │ │ Service      │ │ Service              │     │
│  └──────────────┘ └──────────────┘ └──────────────────────┘     │
│  ┌──────────────┐ ┌──────────────────┐ ┌──────────────────┐     │
│  │ StoreChange  │ │ BrandOpportunity │ │ AnalyticsCache   │     │
│  │ Service      │ │ Service          │ │ (15-min TTL)     │     │
│  └──────────────┘ └──────────────────┘ └──────────────────┘     │
└─────────────────────────────────────────────────────────────────┘
                                 │
                                 ▼
┌─────────────────────────────────────────────────────────────────┐
│                         Canonical Tables                         │
│  store_products │ store_product_snapshots │ brands │ categories  │
│  dispensaries │ brand_snapshots │ category_snapshots             │
└─────────────────────────────────────────────────────────────────┘
```

## Services

### 1. PriceTrendService

Provides time-series price analytics.

**Key Methods:**

| Method | Description |
|--------|-------------|
| `getProductPriceTrend(productId, storeId?, days)` | Price history for a product |
| `getBrandPriceTrend(brandName, filters)` | Average prices for a brand |
| `getCategoryPriceTrend(category, filters)` | Category-level price trends |
| `getPriceSummary(filters)` | 7d/30d/90d price averages |
| `detectPriceCompression(category, state?)` | Price war detection |
| `getGlobalPriceStats()` | Market-wide pricing overview |

**Filters:**

```typescript
interface PriceFilters {
  storeId?: number;
  brandName?: string;
  category?: string;
  state?: string;
  days?: number; // default: 30
}
```

**Price Compression Detection:**
- Calculates standard deviation of prices within category
- Returns compression score 0-100 (higher = more compressed)
- Identifies brands converging toward mean price
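
A minimal sketch of one way to derive such a score from the coefficient of variation (the function name, the 0.5 CV ceiling, and the scaling are assumptions, not the service's actual formula):

```typescript
// Hypothetical sketch: map price dispersion within a category to a 0-100
// compression score. Lower relative spread => higher compression.
function compressionScore(prices: number[]): number {
  if (prices.length < 2) return 0;
  const mean = prices.reduce((a, b) => a + b, 0) / prices.length;
  if (mean === 0) return 0;
  const variance =
    prices.reduce((a, p) => a + (p - mean) ** 2, 0) / (prices.length - 1);
  const cv = Math.sqrt(variance) / mean; // coefficient of variation
  // Assume a CV of 0.5 or more means "no compression"; clamp into [0, 100].
  return Math.max(0, Math.min(100, Math.round((1 - cv / 0.5) * 100)));
}
```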

---

### 2. PenetrationService

Tracks brand market presence across stores and states.

**Key Methods:**

| Method | Description |
|--------|-------------|
| `getBrandPenetration(brandName, filters)` | Store count, SKU count, coverage |
| `getTopBrandsByPenetration(limit, filters)` | Leaderboard of dominant brands |
| `getPenetrationTrend(brandName, days)` | Historical penetration growth |
| `getShelfShareByCategory(brandName)` | % of shelf per category |
| `getBrandPresenceByState(brandName)` | Multi-state presence map |
| `getStoresCarryingBrand(brandName)` | List of stores carrying brand |
| `getPenetrationHeatmap(brandName?)` | Geographic distribution |

**Penetration Calculation:**

```
Penetration % = (Stores with Brand / Total Stores in Market) × 100
```
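
As a sketch of that formula as a query (illustrative; it reuses the canonical table names from the architecture diagram but is not the service's exact SQL):

```typescript
import { Pool } from 'pg';

// Illustrative query: count stores carrying the brand vs. all stores,
// assuming store_products joins to brands and dispensaries as named above.
async function brandPenetrationPercent(
  pool: Pool,
  brandName: string
): Promise<number> {
  const { rows } = await pool.query(
    `SELECT
       COUNT(DISTINCT sp.dispensary_id)::float AS stores_with_brand,
       (SELECT COUNT(*) FROM dispensaries)::float AS total_stores
     FROM store_products sp
     JOIN brands b ON b.id = sp.brand_id
     WHERE b.name = $1`,
    [brandName]
  );
  const { stores_with_brand, total_stores } = rows[0];
  return total_stores > 0 ? (stores_with_brand / total_stores) * 100 : 0;
}
```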

---

### 3. CategoryAnalyticsService

Analyzes category performance and trends.

**Key Methods:**

| Method | Description |
|--------|-------------|
| `getCategorySummary(category?, filters)` | SKU count, avg price, stores |
| `getCategoryGrowth(days, filters)` | 7d/30d/90d growth rates |
| `getCategoryGrowthTrend(category, days)` | Time-series category growth |
| `getCategoryHeatmap(metric, periods)` | Visual heatmap data |
| `getTopMovers(limit, days)` | Fastest growing/declining categories |
| `getSubcategoryBreakdown(category)` | Drill-down into subcategories |

**Time Windows:**
- 7 days: Short-term volatility
- 30 days: Monthly trends
- 90 days: Seasonal patterns

---

### 4. StoreChangeService

Tracks product adds/drops, brand changes, and price movements per store.

**Key Methods:**

| Method | Description |
|--------|-------------|
| `getStoreChangeSummary(storeId)` | Overview of recent changes |
| `getStoreChangeEvents(storeId, filters)` | Event log (add, drop, price, OOS) |
| `getNewBrands(storeId, days)` | Brands added to store |
| `getLostBrands(storeId, days)` | Brands dropped from store |
| `getProductChanges(storeId, type, days)` | Filtered product changes |
| `getCategoryLeaderboard(category, limit)` | Top stores for category |
| `getMostActiveStores(days, limit)` | Stores with most changes |
| `compareStores(store1, store2)` | Side-by-side store comparison |

**Event Types:**
- `added` - New product appeared
- `discontinued` - Product removed
- `price_drop` - Price decreased
- `price_increase` - Price increased
- `restocked` - OOS → In Stock
- `out_of_stock` - In Stock → OOS
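
A type-level sketch of these events as a consumer might model them (the fields beyond the event type are assumptions, not the service's actual shape):

```typescript
// Hypothetical event shape mirroring the event types listed above.
type StoreChangeEventType =
  | 'added'
  | 'discontinued'
  | 'price_drop'
  | 'price_increase'
  | 'restocked'
  | 'out_of_stock';

interface StoreChangeEvent {
  storeProductId: number;
  eventType: StoreChangeEventType;
  eventDate: string;        // ISO timestamp
  oldValue: string | null;  // e.g. previous price for price_* events
  newValue: string | null;
}
```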

---

### 5. BrandOpportunityService

Competitive intelligence and opportunity identification.

**Key Methods:**

| Method | Description |
|--------|-------------|
| `getBrandOpportunity(brandName)` | Full opportunity analysis |
| `getMarketPositionSummary(brandName)` | Market position vs competitors |
| `getAlerts(filters)` | Analytics-generated alerts |
| `markAlertsRead(alertIds)` | Mark alerts as read |

**Opportunity Analysis Includes:**
- White space stores (potential targets)
- Competitive threats (brands gaining share)
- Pricing opportunities (underpriced vs market)
- Missing SKU recommendations

---

### 6. AnalyticsCache

In-memory caching with database fallback.

**Configuration:**

```typescript
const cache = new AnalyticsCache(pool, {
  defaultTtlMinutes: 15,
});
```

**Usage Pattern:**

```typescript
const data = await cache.getOrCompute(cacheKey, async () => {
  // Expensive query here
  return result;
});
```

**Cache Management:**
- `GET /api/az/analytics/cache/stats` - View cache stats
- `POST /api/az/analytics/cache/clear?pattern=price*` - Clear by pattern
- Auto-cleanup of expired entries every 5 minutes
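
For intuition, the in-memory half of `getOrCompute` can be as small as the sketch below (the real AnalyticsCache also has the database fallback noted above, which this omits):

```typescript
// Minimal TTL cache sketch; the production AnalyticsCache adds a DB fallback.
class TtlCache {
  private store = new Map<string, { value: unknown; expiresAt: number }>();

  constructor(private defaultTtlMinutes = 15) {}

  async getOrCompute<T>(key: string, compute: () => Promise<T>): Promise<T> {
    const hit = this.store.get(key);
    if (hit && hit.expiresAt > Date.now()) return hit.value as T;
    const value = await compute();
    this.store.set(key, {
      value,
      expiresAt: Date.now() + this.defaultTtlMinutes * 60_000,
    });
    return value;
  }
}
```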

---

## API Endpoints Reference

### Price Endpoints

```bash
# Product price trend (last 30 days)
GET /api/az/analytics/price/product/12345?days=30

# Brand price trend with filters
GET /api/az/analytics/price/brand/Cookies?storeId=101&category=Flower&days=90

# Category median price
GET /api/az/analytics/price/category/Vaporizers?state=AZ

# Price summary (7d/30d/90d)
GET /api/az/analytics/price/summary?brand=Stiiizy&state=AZ

# Detect price wars
GET /api/az/analytics/price/compression/Flower?state=AZ

# Global stats
GET /api/az/analytics/price/global
```

### Penetration Endpoints

```bash
# Brand penetration
GET /api/az/analytics/penetration/brand/Cookies

# Top brands leaderboard
GET /api/az/analytics/penetration/top?limit=20&state=AZ&category=Flower

# Penetration trend
GET /api/az/analytics/penetration/trend/Cookies?days=90

# Shelf share by category
GET /api/az/analytics/penetration/shelf-share/Cookies

# Multi-state presence
GET /api/az/analytics/penetration/by-state/Cookies

# Stores carrying brand
GET /api/az/analytics/penetration/stores/Cookies

# Heatmap data
GET /api/az/analytics/penetration/heatmap?brand=Cookies
```

### Category Endpoints

```bash
# Category summary
GET /api/az/analytics/category/summary?category=Flower&state=AZ

# Category growth (7d/30d/90d)
GET /api/az/analytics/category/growth?days=30&state=AZ

# Category trend
GET /api/az/analytics/category/trend/Concentrates?days=90

# Heatmap
GET /api/az/analytics/category/heatmap?metric=growth&periods=12

# Top movers (growing/declining)
GET /api/az/analytics/category/top-movers?limit=5&days=30

# Subcategory breakdown
GET /api/az/analytics/category/Edibles/subcategories
```

### Store Endpoints

```bash
# Store change summary
GET /api/az/analytics/store/101/summary

# Event log
GET /api/az/analytics/store/101/events?type=price_drop&days=7&limit=50

# New brands
GET /api/az/analytics/store/101/brands/new?days=30

# Lost brands
GET /api/az/analytics/store/101/brands/lost?days=30

# Product changes by type
GET /api/az/analytics/store/101/products/changes?type=added&days=7

# Category leaderboard
GET /api/az/analytics/store/leaderboard/Flower?limit=20

# Most active stores
GET /api/az/analytics/store/most-active?days=7&limit=10

# Compare two stores
GET /api/az/analytics/store/compare?store1=101&store2=102
```

### Brand Opportunity Endpoints

```bash
# Full opportunity analysis
GET /api/az/analytics/brand/Cookies/opportunity

# Market position summary
GET /api/az/analytics/brand/Cookies/position

# Get alerts
GET /api/az/analytics/alerts?brand=Cookies&type=competitive&unreadOnly=true

# Mark alerts read
POST /api/az/analytics/alerts/mark-read
Body: { "alertIds": [1, 2, 3] }
```

### Maintenance Endpoints

```bash
# Capture daily snapshots (run by scheduler)
POST /api/az/analytics/snapshots/capture

# Cache statistics
GET /api/az/analytics/cache/stats

# Clear cache (admin)
POST /api/az/analytics/cache/clear?pattern=price*
```

---

## Incremental Computation

Analytics are designed for real-time queries without full recomputation:

### Snapshot Strategy

1. **Raw Data**: `store_products` (current state)
2. **Historical**: `store_product_snapshots` (time-series)
3. **Aggregated**: `brand_snapshots`, `category_snapshots` (daily rollups)

### Window Calculations

```sql
-- 7-day window
WHERE crawled_at >= NOW() - INTERVAL '7 days'

-- 30-day window
WHERE crawled_at >= NOW() - INTERVAL '30 days'

-- 90-day window
WHERE crawled_at >= NOW() - INTERVAL '90 days'
```

### Materialized Views (Optional)

For heavy queries, create materialized views:

```sql
CREATE MATERIALIZED VIEW mv_brand_daily_metrics AS
SELECT
  DATE(sps.captured_at) as date,
  sp.brand_id,
  COUNT(DISTINCT sp.dispensary_id) as store_count,
  COUNT(*) as sku_count,
  AVG(sp.price_rec) as avg_price
FROM store_product_snapshots sps
JOIN store_products sp ON sps.store_product_id = sp.id
WHERE sps.captured_at >= NOW() - INTERVAL '90 days'
GROUP BY DATE(sps.captured_at), sp.brand_id;

-- Refresh daily
REFRESH MATERIALIZED VIEW CONCURRENTLY mv_brand_daily_metrics;
```

---

## Scheduled Jobs

### Daily Snapshot Capture

Trigger via cron or scheduler:

```bash
curl -X POST http://localhost:3010/api/az/analytics/snapshots/capture
```

This calls:
- `capture_brand_snapshots()` - Captures brand metrics
- `capture_category_snapshots()` - Captures category metrics

### Cache Cleanup

Automatic cleanup runs every 5 minutes via an in-memory timer.

For manual cleanup:

```bash
curl -X POST http://localhost:3010/api/az/analytics/cache/clear
```

---

## Extending Analytics (Future Phases)

### Phase 6: Intelligence Engine
- Automated alert generation
- Recommendation engine
- Price prediction

### Phase 7: Orders Integration
- Sales velocity analytics
- Reorder predictions
- Inventory turnover

### Phase 8: Advanced ML
- Demand forecasting
- Price elasticity modeling
- Customer segmentation

---

## Troubleshooting

### Common Issues

**1. Slow queries**
- Check cache stats: `GET /api/az/analytics/cache/stats`
- Increase cache TTL if data doesn't need real-time freshness
- Add indexes on frequently filtered columns

**2. Empty results**
- Verify data exists in source tables
- Check filter parameters (brand names are case-sensitive)
- Verify state codes are valid

**3. Stale data**
- Run snapshot capture: `POST /api/az/analytics/snapshots/capture`
- Clear cache: `POST /api/az/analytics/cache/clear`

### Debugging

Enable query logging:

```typescript
// In service constructor
this.debug = process.env.ANALYTICS_DEBUG === 'true';
```

---

## Data Contracts

### Price Trend Response

```typescript
interface PriceTrend {
  productId?: number;
  storeId?: number;
  brandName?: string;
  category?: string;
  dataPoints: Array<{
    date: string;
    minPrice: number | null;
    maxPrice: number | null;
    avgPrice: number | null;
    wholesalePrice: number | null;
    sampleSize: number;
  }>;
  summary: {
    currentAvg: number | null;
    previousAvg: number | null;
    changePercent: number | null;
    trend: 'up' | 'down' | 'stable';
    volatilityScore: number | null;
  };
}
```

### Brand Penetration Response

```typescript
interface BrandPenetration {
  brandName: string;
  totalStores: number;
  storesWithBrand: number;
  penetrationPercent: number;
  skuCount: number;
  avgPrice: number | null;
  priceRange: { min: number; max: number } | null;
  topCategories: Array<{ category: string; count: number }>;
  stateBreakdown?: Array<{ state: string; storeCount: number }>;
}
```

### Category Growth Response

```typescript
interface CategoryGrowth {
  category: string;
  currentCount: number;
  previousCount: number;
  growthPercent: number;
  growthTrend: 'up' | 'down' | 'stable';
  avgPrice: number | null;
  priceChange: number | null;
  topBrands: Array<{ brandName: string; count: number }>;
}
```

---

## Files Reference

| File | Purpose |
|------|---------|
| `src/dutchie-az/services/analytics/price-trends.ts` | Price analytics |
| `src/dutchie-az/services/analytics/penetration.ts` | Brand penetration |
| `src/dutchie-az/services/analytics/category-analytics.ts` | Category metrics |
| `src/dutchie-az/services/analytics/store-changes.ts` | Store event tracking |
| `src/dutchie-az/services/analytics/brand-opportunity.ts` | Competitive intel |
| `src/dutchie-az/services/analytics/cache.ts` | Caching layer |
| `src/dutchie-az/services/analytics/index.ts` | Module exports |
| `src/dutchie-az/routes/analytics.ts` | API routes (680 LOC) |
| `src/multi-state/state-query-service.ts` | Cross-state analytics |

---

## Analytics V2: Rec/Med State Segmentation

Phase 3 enhancement: analytics extended with recreational vs medical-only state segmentation.

### V2 API Endpoints

All V2 endpoints are prefixed with `/api/analytics/v2`.

#### V2 Price Analytics

```bash
# Price trends for a specific product
GET /api/analytics/v2/price/product/12345?window=30d

# Price by category and state (with rec/med segmentation)
GET /api/analytics/v2/price/category/Flower?state=AZ

# Price by brand and state
GET /api/analytics/v2/price/brand/Cookies?state=AZ

# Most volatile products
GET /api/analytics/v2/price/volatile?window=30d&limit=50&state=AZ

# Rec vs Med price comparison by category
GET /api/analytics/v2/price/rec-vs-med?category=Flower
```

#### V2 Brand Penetration

```bash
# Brand penetration metrics with state breakdown
GET /api/analytics/v2/brand/Cookies/penetration?window=30d

# Brand market position within categories
GET /api/analytics/v2/brand/Cookies/market-position?category=Flower&state=AZ

# Brand presence in rec vs med-only states
GET /api/analytics/v2/brand/Cookies/rec-vs-med

# Top brands by penetration
GET /api/analytics/v2/brand/top?limit=25&state=AZ

# Brands expanding or contracting
GET /api/analytics/v2/brand/expansion-contraction?window=30d&limit=25
```

#### V2 Category Analytics

```bash
# Category growth metrics
GET /api/analytics/v2/category/Flower/growth?window=30d

# Category growth trend over time
GET /api/analytics/v2/category/Flower/trend?window=30d

# Top brands in category
GET /api/analytics/v2/category/Flower/top-brands?limit=25&state=AZ

# All categories with metrics
GET /api/analytics/v2/category/all?state=AZ&limit=50

# Rec vs Med category comparison
GET /api/analytics/v2/category/rec-vs-med?category=Flower

# Fastest growing categories
GET /api/analytics/v2/category/fastest-growing?window=30d&limit=25
```

#### V2 Store Analytics

```bash
# Store change summary
GET /api/analytics/v2/store/101/summary?window=30d

# Product change events
GET /api/analytics/v2/store/101/events?window=7d&limit=100

# Store inventory composition
GET /api/analytics/v2/store/101/inventory

# Store price positioning vs market
GET /api/analytics/v2/store/101/price-position

# Most active stores by changes
GET /api/analytics/v2/store/most-active?window=7d&limit=25&state=AZ
```

#### V2 State Analytics

```bash
# State market summary
GET /api/analytics/v2/state/AZ/summary

# All states with coverage metrics
GET /api/analytics/v2/state/all

# Legal state breakdown (rec, med-only, no program)
GET /api/analytics/v2/state/legal-breakdown

# Rec vs Med pricing by category
GET /api/analytics/v2/state/rec-vs-med-pricing?category=Flower

# States with coverage gaps
GET /api/analytics/v2/state/coverage-gaps

# Cross-state pricing comparison
GET /api/analytics/v2/state/price-comparison
```

### V2 Services Architecture

```
src/services/analytics/
├── index.ts                      # Exports all V2 services
├── types.ts                      # Shared type definitions
├── PriceAnalyticsService.ts      # Price trends and volatility
├── BrandPenetrationService.ts    # Brand market presence
├── CategoryAnalyticsService.ts   # Category growth analysis
├── StoreAnalyticsService.ts      # Store change tracking
└── StateAnalyticsService.ts      # State-level analytics

src/routes/analytics-v2.ts        # V2 API route handlers
```

### Key V2 Features

1. **Rec/Med State Segmentation**: All analytics can be filtered and compared by legal status
2. **State Coverage Gaps**: Identify legal states with missing or stale data
3. **Cross-State Pricing**: Compare prices across recreational and medical-only markets
4. **Brand Footprint Analysis**: Track brand presence in rec vs med states
5. **Category Comparison**: Compare category performance by legal status

### V2 Migration Path

1. Run migration 052 for state cannabis flags:
```bash
psql "$DATABASE_URL" -f migrations/052_add_state_cannabis_flags.sql
```

2. Run migration 053 for analytics indexes:
```bash
psql "$DATABASE_URL" -f migrations/053_analytics_indexes.sql
```

3. Restart the backend to pick up the new routes.

### V2 Response Examples

**Rec vs Med Price Comparison:**
```json
{
  "category": "Flower",
  "recreational": {
    "state_count": 15,
    "product_count": 12500,
    "avg_price": 35.50,
    "median_price": 32.00
  },
  "medical_only": {
    "state_count": 8,
    "product_count": 5200,
    "avg_price": 42.00,
    "median_price": 40.00
  },
  "price_diff_percent": -15.48
}
```

**Legal State Breakdown:**
```json
{
  "recreational_states": {
    "count": 24,
    "dispensary_count": 850,
    "product_count": 125000,
    "states": [
      { "code": "CA", "name": "California", "dispensary_count": 250 },
      { "code": "CO", "name": "Colorado", "dispensary_count": 150 }
    ]
  },
  "medical_only_states": {
    "count": 18,
    "dispensary_count": 320,
    "product_count": 45000,
    "states": [
      { "code": "FL", "name": "Florida", "dispensary_count": 120 }
    ]
  },
  "no_program_states": {
    "count": 9,
    "states": [
      { "code": "ID", "name": "Idaho" }
    ]
  }
}
```

---

*Phase 3 Analytics Engine - Fully Implemented*
*V2 Rec/Med State Analytics - Added December 2024*

backend/docs/ANALYTICS_V2_EXAMPLES.md (new file, 594 lines)
@@ -0,0 +1,594 @@
# Analytics V2 API Examples

## Overview

All endpoints are prefixed with `/api/analytics/v2`.

### Filtering Options

**Time Windows:**
- `?window=7d` - Last 7 days
- `?window=30d` - Last 30 days (default)
- `?window=90d` - Last 90 days

**Legal Type Filtering:**
- `?legalType=recreational` - Recreational states only
- `?legalType=medical_only` - Medical-only states (not recreational)
- `?legalType=no_program` - States with no cannabis program

---

## 1. Price Analytics

### GET /price/product/:id

Get price trends for a specific store product.

**Request:**
```bash
GET /api/analytics/v2/price/product/12345?window=30d
```

**Response:**
```json
{
  "store_product_id": 12345,
  "product_name": "Blue Dream 3.5g",
  "brand_name": "Cookies",
  "category": "Flower",
  "dispensary_id": 101,
  "dispensary_name": "Green Leaf Dispensary",
  "state_code": "AZ",
  "data_points": [
    {
      "date": "2024-11-06",
      "price_rec": 45.00,
      "price_med": 40.00,
      "price_rec_special": null,
      "price_med_special": null,
      "is_on_special": false
    },
    {
      "date": "2024-11-07",
      "price_rec": 42.00,
      "price_med": 38.00,
      "price_rec_special": null,
      "price_med_special": null,
      "is_on_special": false
    }
  ],
  "summary": {
    "current_price": 42.00,
    "min_price": 40.00,
    "max_price": 48.00,
    "avg_price": 43.50,
    "price_change_count": 3,
    "volatility_percent": 8.2
  }
}
```

### GET /price/rec-vs-med

Get recreational vs medical-only price comparison by category.

**Request:**
```bash
GET /api/analytics/v2/price/rec-vs-med?category=Flower
```

**Response:**
```json
[
  {
    "category": "Flower",
    "rec_avg": 38.50,
    "rec_median": 35.00,
    "med_avg": 42.00,
    "med_median": 40.00
  },
  {
    "category": "Concentrates",
    "rec_avg": 45.00,
    "rec_median": 42.00,
    "med_avg": 48.00,
    "med_median": 45.00
  }
]
```

---

## 2. Brand Analytics

### GET /brand/:name/penetration

Get brand penetration metrics with state breakdown.

**Request:**
```bash
GET /api/analytics/v2/brand/Cookies/penetration?window=30d
```

**Response:**
```json
{
  "brand_name": "Cookies",
  "total_dispensaries": 125,
  "total_skus": 450,
  "avg_skus_per_dispensary": 3.6,
  "states_present": ["AZ", "CA", "CO", "NV", "MI"],
  "state_breakdown": [
    {
      "state_code": "CA",
      "state_name": "California",
      "legal_type": "recreational",
      "dispensary_count": 45,
      "sku_count": 180,
      "avg_skus_per_dispensary": 4.0,
      "market_share_percent": 12.5
    },
    {
      "state_code": "AZ",
      "state_name": "Arizona",
      "legal_type": "recreational",
      "dispensary_count": 32,
      "sku_count": 128,
      "avg_skus_per_dispensary": 4.0,
      "market_share_percent": 15.2
    }
  ],
  "penetration_trend": [
    {
      "date": "2024-11-01",
      "dispensary_count": 120,
      "new_dispensaries": 0,
      "dropped_dispensaries": 0
    },
    {
      "date": "2024-11-08",
      "dispensary_count": 123,
      "new_dispensaries": 3,
      "dropped_dispensaries": 0
    },
    {
      "date": "2024-11-15",
      "dispensary_count": 125,
      "new_dispensaries": 2,
      "dropped_dispensaries": 0
    }
  ]
}
```

### GET /brand/:name/rec-vs-med

Get brand presence in recreational vs medical-only states.

**Request:**
```bash
GET /api/analytics/v2/brand/Cookies/rec-vs-med
```

**Response:**
```json
{
  "brand_name": "Cookies",
  "rec_states_count": 4,
  "rec_states": ["AZ", "CA", "CO", "NV"],
  "rec_dispensary_count": 110,
  "rec_avg_skus": 3.8,
  "med_only_states_count": 2,
  "med_only_states": ["FL", "OH"],
  "med_only_dispensary_count": 15,
  "med_only_avg_skus": 2.5
}
```

---

## 3. Category Analytics

### GET /category/:name/growth

Get category growth metrics with state breakdown.

**Request:**
```bash
GET /api/analytics/v2/category/Flower/growth?window=30d
```

**Response:**
```json
{
  "category": "Flower",
  "current_sku_count": 5200,
  "current_dispensary_count": 320,
  "avg_price": 38.50,
  "growth_data": [
    {
      "date": "2024-11-01",
      "sku_count": 4800,
      "dispensary_count": 310,
      "avg_price": 39.00
    },
    {
      "date": "2024-11-15",
      "sku_count": 5000,
      "dispensary_count": 315,
      "avg_price": 38.75
    },
    {
      "date": "2024-12-01",
      "sku_count": 5200,
      "dispensary_count": 320,
      "avg_price": 38.50
    }
  ],
  "state_breakdown": [
    {
      "state_code": "CA",
      "state_name": "California",
      "legal_type": "recreational",
      "sku_count": 2100,
      "dispensary_count": 145,
      "avg_price": 36.00
    },
    {
      "state_code": "AZ",
      "state_name": "Arizona",
      "legal_type": "recreational",
      "sku_count": 950,
      "dispensary_count": 85,
      "avg_price": 40.00
    }
  ]
}
```

### GET /category/rec-vs-med

Get category comparison between recreational and medical-only states.

**Request:**
```bash
GET /api/analytics/v2/category/rec-vs-med
```

**Response:**
```json
[
  {
    "category": "Flower",
    "recreational": {
      "state_count": 15,
      "dispensary_count": 650,
      "sku_count": 12500,
      "avg_price": 35.50,
      "median_price": 32.00
    },
    "medical_only": {
      "state_count": 8,
      "dispensary_count": 220,
      "sku_count": 4200,
      "avg_price": 42.00,
      "median_price": 40.00
    },
    "price_diff_percent": -15.48
  },
  {
    "category": "Concentrates",
    "recreational": {
      "state_count": 15,
      "dispensary_count": 600,
      "sku_count": 8500,
      "avg_price": 42.00,
      "median_price": 40.00
    },
    "medical_only": {
      "state_count": 8,
      "dispensary_count": 200,
      "sku_count": 3100,
      "avg_price": 48.00,
      "median_price": 45.00
    },
    "price_diff_percent": -12.50
  }
]
```
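
The `price_diff_percent` values in the sample payloads are consistent with the recreational average expressed relative to the medical average, so a negative number means recreational is cheaper:

```typescript
// price_diff_percent as implied by the sample payloads above:
// ((rec_avg - med_avg) / med_avg) * 100
// e.g. ((35.50 - 42.00) / 42.00) * 100 ≈ -15.48
function priceDiffPercent(recAvg: number, medAvg: number): number {
  return medAvg !== 0 ? ((recAvg - medAvg) / medAvg) * 100 : 0;
}
```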

---

## 4. Store Analytics

### GET /store/:id/summary

Get change summary for a store over a time window.

**Request:**
```bash
GET /api/analytics/v2/store/101/summary?window=30d
```

**Response:**
```json
{
  "dispensary_id": 101,
  "dispensary_name": "Green Leaf Dispensary",
  "state_code": "AZ",
  "window": "30d",
  "products_added": 45,
  "products_dropped": 12,
  "brands_added": ["Alien Labs", "Connected"],
  "brands_dropped": ["House Brand"],
  "price_changes": 156,
  "avg_price_change_percent": 3.2,
  "stock_in_events": 89,
  "stock_out_events": 34,
  "current_product_count": 512,
  "current_in_stock_count": 478
}
```

### GET /store/:id/events

Get recent product change events for a store.

**Request:**
```bash
GET /api/analytics/v2/store/101/events?window=7d&limit=50
```

**Response:**
```json
[
  {
    "store_product_id": 12345,
    "product_name": "Blue Dream 3.5g",
    "brand_name": "Cookies",
    "category": "Flower",
    "event_type": "price_change",
    "event_date": "2024-12-05T14:30:00.000Z",
    "old_value": "45.00",
    "new_value": "42.00"
  },
  {
    "store_product_id": 12346,
    "product_name": "OG Kush 1g",
    "brand_name": "Alien Labs",
    "category": "Flower",
    "event_type": "added",
    "event_date": "2024-12-04T10:00:00.000Z",
    "old_value": null,
    "new_value": null
  },
  {
    "store_product_id": 12300,
    "product_name": "Sour Diesel Cart",
    "brand_name": "Select",
    "category": "Vaporizers",
    "event_type": "stock_out",
    "event_date": "2024-12-03T16:45:00.000Z",
    "old_value": "true",
    "new_value": "false"
  }
]
```

---

## 5. State Analytics

### GET /state/:code/summary

Get market summary for a specific state with rec/med breakdown.

**Request:**
```bash
GET /api/analytics/v2/state/AZ/summary
```

**Response:**
```json
{
  "state_code": "AZ",
  "state_name": "Arizona",
  "legal_status": {
    "recreational_legal": true,
    "rec_year": 2020,
    "medical_legal": true,
    "med_year": 2010
  },
  "coverage": {
    "dispensary_count": 145,
    "product_count": 18500,
    "brand_count": 320,
    "category_count": 12,
    "snapshot_count": 2450000,
    "last_crawl_at": "2024-12-06T02:30:00.000Z"
  },
  "pricing": {
    "avg_price": 42.50,
    "median_price": 38.00,
    "min_price": 5.00,
    "max_price": 250.00
  },
  "top_categories": [
    { "category": "Flower", "count": 5200 },
    { "category": "Concentrates", "count": 3800 },
    { "category": "Vaporizers", "count": 2950 },
    { "category": "Edibles", "count": 2400 },
    { "category": "Pre-Rolls", "count": 1850 }
  ],
  "top_brands": [
    { "brand": "Cookies", "count": 450 },
    { "brand": "Alien Labs", "count": 380 },
    { "brand": "Connected", "count": 320 },
    { "brand": "Stiiizy", "count": 290 },
    { "brand": "Raw Garden", "count": 275 }
  ]
}
```

### GET /state/legal-breakdown

Get breakdown by legal status (recreational, medical-only, no program).

**Request:**
```bash
GET /api/analytics/v2/state/legal-breakdown
```

**Response:**
```json
{
  "recreational_states": {
    "count": 24,
    "dispensary_count": 850,
    "product_count": 125000,
    "snapshot_count": 15000000,
    "states": [
      { "code": "CA", "name": "California", "dispensary_count": 250 },
      { "code": "CO", "name": "Colorado", "dispensary_count": 150 },
      { "code": "AZ", "name": "Arizona", "dispensary_count": 145 },
      { "code": "MI", "name": "Michigan", "dispensary_count": 120 }
    ]
  },
  "medical_only_states": {
    "count": 18,
    "dispensary_count": 320,
    "product_count": 45000,
    "snapshot_count": 5000000,
    "states": [
      { "code": "FL", "name": "Florida", "dispensary_count": 120 },
      { "code": "OH", "name": "Ohio", "dispensary_count": 85 },
      { "code": "PA", "name": "Pennsylvania", "dispensary_count": 75 }
    ]
  },
  "no_program_states": {
    "count": 9,
    "states": [
      { "code": "ID", "name": "Idaho" },
      { "code": "WY", "name": "Wyoming" },
      { "code": "KS", "name": "Kansas" }
    ]
  }
}
```

### GET /state/recreational

Get the list of recreational state codes.

**Request:**
```bash
GET /api/analytics/v2/state/recreational
```

**Response:**
```json
{
  "legal_type": "recreational",
  "states": ["AK", "AZ", "CA", "CO", "CT", "DE", "IL", "MA", "MD", "ME", "MI", "MN", "MO", "MT", "NJ", "NM", "NV", "NY", "OH", "OR", "RI", "VA", "VT", "WA"],
  "count": 24
}
```

### GET /state/medical-only

Get the list of medical-only state codes (not recreational).

**Request:**
```bash
GET /api/analytics/v2/state/medical-only
```

**Response:**
```json
{
  "legal_type": "medical_only",
  "states": ["AR", "FL", "HI", "LA", "MS", "ND", "NH", "OK", "PA", "SD", "UT", "WV"],
  "count": 12
}
```

### GET /state/rec-vs-med-pricing

Get rec vs med price comparison by category.

**Request:**
```bash
GET /api/analytics/v2/state/rec-vs-med-pricing?category=Flower
```

**Response:**
```json
[
  {
    "category": "Flower",
    "recreational": {
      "state_count": 15,
      "product_count": 12500,
      "avg_price": 35.50,
      "median_price": 32.00
    },
    "medical_only": {
      "state_count": 8,
      "product_count": 5200,
      "avg_price": 42.00,
      "median_price": 40.00
    },
    "price_diff_percent": -15.48
  }
]
```

---

## How These Endpoints Support Portals

### Brand Portal Use Cases

1. **Track brand penetration**: Use `/brand/:name/penetration` to see how many stores carry the brand
2. **Compare rec vs med markets**: Use `/brand/:name/rec-vs-med` to understand footprint by legal status
3. **Identify expansion opportunities**: Use `/state/coverage-gaps` to find underserved markets
4. **Monitor pricing**: Use `/price/brand/:brand` to track pricing by state

### Buyer Portal Use Cases

1. **Compare stores**: Use `/store/:id/summary` to see activity levels
2. **Track price changes**: Use `/store/:id/events` to monitor competitor pricing
3. **Analyze categories**: Use `/category/:name/growth` to identify trending products
4. **State-level insights**: Use `/state/:code/summary` for market overview

---

## Time Window Filtering

All time-based endpoints support the `window` query parameter:

| Value | Description |
|-------|-------------|
| `7d` | Last 7 days |
| `30d` | Last 30 days (default) |
| `90d` | Last 90 days |

The window affects:
- `store_product_snapshots.captured_at` for historical data
- `store_products.first_seen_at` / `last_seen_at` for product lifecycle
- `crawl_runs.started_at` for crawl-based metrics
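
A small sketch of how a handler might translate the `window` parameter into a SQL interval (the helper name and fallback behavior are assumptions, not the actual route code):

```typescript
// Hypothetical helper: map a ?window= value to a Postgres interval string.
// Unknown values fall back to the documented 30-day default.
function windowToInterval(window?: string): string {
  switch (window) {
    case '7d':
      return '7 days';
    case '90d':
      return '90 days';
    case '30d':
    default:
      return '30 days';
  }
}

// Usage against one of the columns above, e.g.:
//   WHERE captured_at >= NOW() - $1::interval
// with windowToInterval(req.query.window) bound as $1.
```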

---

## Rec/Med Segmentation

All state-level endpoints automatically segment by:

- **Recreational**: `states.recreational_legal = TRUE`
- **Medical-only**: `states.medical_legal = TRUE AND states.recreational_legal = FALSE`
- **No program**: Both flags are FALSE or NULL

This segmentation appears in:
- `legal_type` field in responses
- State breakdown arrays
- Price comparison endpoints
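
Expressed as SQL, an illustrative sketch of the rules above (not the actual service query) is a single CASE expression:

```typescript
// Illustrative CASE expression implementing the segmentation rules above;
// NULL flags fall through to 'no_program'.
const legalTypeSql = `
  CASE
    WHEN s.recreational_legal THEN 'recreational'
    WHEN s.medical_legal AND NOT COALESCE(s.recreational_legal, FALSE) THEN 'medical_only'
    ELSE 'no_program'
  END AS legal_type
`;
```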

backend/migrations/037_dispensary_crawler_profiles.sql (new file, 90 lines)
@@ -0,0 +1,90 @@
-- Migration 037: Add per-store crawler profiles for Dutchie dispensaries
-- This enables per-store crawler configuration without changing shared logic
-- Phase 1: Schema only - no automatic behavior changes

-- Create the crawler profiles table
CREATE TABLE IF NOT EXISTS dispensary_crawler_profiles (
  id SERIAL PRIMARY KEY,
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,

  -- Human readable name for this profile
  profile_name VARCHAR(255) NOT NULL,

  -- High-level type, e.g. 'dutchie', 'treez', 'jane'
  crawler_type VARCHAR(50) NOT NULL,

  -- Optional key for mapping to a per-store crawler module later,
  -- e.g. 'curaleaf-dispensary-gilbert'
  profile_key VARCHAR(255),

  -- Generic configuration bucket; will hold selectors, URLs, flags, etc.
  config JSONB NOT NULL DEFAULT '{}'::jsonb,

  -- Execution hints (safe defaults; can be overridden in config if needed)
  timeout_ms INTEGER DEFAULT 30000,
  download_images BOOLEAN DEFAULT TRUE,
  track_stock BOOLEAN DEFAULT TRUE,

  version INTEGER DEFAULT 1,
  enabled BOOLEAN DEFAULT TRUE,

  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

-- Unique index on dispensary_id + profile_name
CREATE UNIQUE INDEX IF NOT EXISTS dispensary_crawler_profiles_unique_name
  ON dispensary_crawler_profiles (dispensary_id, profile_name);

-- Index for finding enabled profiles by type
CREATE INDEX IF NOT EXISTS idx_crawler_profiles_type_enabled
  ON dispensary_crawler_profiles (crawler_type, enabled);

-- Index for dispensary lookup
CREATE INDEX IF NOT EXISTS idx_crawler_profiles_dispensary
  ON dispensary_crawler_profiles (dispensary_id);

-- Add FK from dispensaries to active profile
DO $$
BEGIN
  IF NOT EXISTS (SELECT 1 FROM information_schema.columns
                 WHERE table_name = 'dispensaries'
                 AND column_name = 'active_crawler_profile_id') THEN
    ALTER TABLE dispensaries
      ADD COLUMN active_crawler_profile_id INTEGER NULL
      REFERENCES dispensary_crawler_profiles(id) ON DELETE SET NULL;
  END IF;
END $$;

-- Create index on the FK for faster joins
CREATE INDEX IF NOT EXISTS idx_dispensaries_active_profile
  ON dispensaries (active_crawler_profile_id)
  WHERE active_crawler_profile_id IS NOT NULL;

-- Create or replace trigger function for updated_at
CREATE OR REPLACE FUNCTION set_updated_at_timestamp()
RETURNS TRIGGER AS $$
BEGIN
  NEW.updated_at = NOW();
  RETURN NEW;
END;
$$ LANGUAGE plpgsql;

-- Add trigger to keep updated_at fresh (drop first if exists to avoid duplicates)
DROP TRIGGER IF EXISTS dispensary_crawler_profiles_set_timestamp ON dispensary_crawler_profiles;
CREATE TRIGGER dispensary_crawler_profiles_set_timestamp
  BEFORE UPDATE ON dispensary_crawler_profiles
  FOR EACH ROW EXECUTE PROCEDURE set_updated_at_timestamp();

-- Add comments for documentation
COMMENT ON TABLE dispensary_crawler_profiles IS 'Per-store crawler configuration profiles. Each dispensary can have multiple profiles but only one active at a time.';
COMMENT ON COLUMN dispensary_crawler_profiles.profile_name IS 'Human readable name for the profile, e.g. "Curaleaf Gilbert - Dutchie v1"';
COMMENT ON COLUMN dispensary_crawler_profiles.crawler_type IS 'The crawler implementation type: dutchie, treez, jane, sandbox, custom';
COMMENT ON COLUMN dispensary_crawler_profiles.profile_key IS 'Optional identifier for per-store crawler module mapping';
COMMENT ON COLUMN dispensary_crawler_profiles.config IS 'JSONB configuration for the crawler. Schema depends on crawler_type.';
COMMENT ON COLUMN dispensary_crawler_profiles.timeout_ms IS 'Request timeout in milliseconds (default 30000)';
COMMENT ON COLUMN dispensary_crawler_profiles.download_images IS 'Whether to download product images locally';
COMMENT ON COLUMN dispensary_crawler_profiles.track_stock IS 'Whether to track inventory/stock levels';
COMMENT ON COLUMN dispensary_crawler_profiles.version IS 'Profile version number for A/B testing or upgrades';
COMMENT ON COLUMN dispensary_crawler_profiles.enabled IS 'Whether this profile can be used (soft delete)';
COMMENT ON COLUMN dispensaries.active_crawler_profile_id IS 'FK to the currently active crawler profile for this dispensary';

backend/migrations/038_profile_status_field.sql (new file, 84 lines)
@@ -0,0 +1,84 @@
|
|||||||
|
-- Migration: Add status field to dispensary_crawler_profiles
|
||||||
|
-- This adds a proper status column for crawler state machine
|
||||||
|
-- Status values: 'production', 'sandbox', 'needs_manual', 'disabled'
|
||||||
|
|
||||||
|
-- Add status column with default 'production' for existing profiles
|
||||||
|
ALTER TABLE dispensary_crawler_profiles
|
||||||
|
ADD COLUMN IF NOT EXISTS status VARCHAR(50) DEFAULT 'production';
|
||||||
|
|
||||||
|
-- Add next_retry_at column for sandbox retry scheduling
|
||||||
|
ALTER TABLE dispensary_crawler_profiles
|
||||||
|
ADD COLUMN IF NOT EXISTS next_retry_at TIMESTAMPTZ;
|
||||||
|
|
||||||
|
-- Add sandbox_attempt_count for quick lookup
|
||||||
|
ALTER TABLE dispensary_crawler_profiles
|
||||||
|
ADD COLUMN IF NOT EXISTS sandbox_attempt_count INTEGER DEFAULT 0;
|
||||||
|
|
||||||
|
-- Add last_sandbox_at for tracking
|
||||||
|
ALTER TABLE dispensary_crawler_profiles
|
||||||
|
ADD COLUMN IF NOT EXISTS last_sandbox_at TIMESTAMPTZ;
|
||||||
|
|
||||||
|
-- Create index for finding profiles by status
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_crawler_profiles_status
|
||||||
|
ON dispensary_crawler_profiles(status) WHERE enabled = true;
|
||||||
|
|
||||||
|
-- Create index for finding profiles needing retry
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_crawler_profiles_next_retry
|
||||||
|
ON dispensary_crawler_profiles(next_retry_at) WHERE enabled = true AND status = 'sandbox';
|
||||||
|
|
||||||
|
-- Add comment explaining status values
|
||||||
|
COMMENT ON COLUMN dispensary_crawler_profiles.status IS
|
||||||
|
'Crawler status: production (ready for regular crawls), sandbox (discovery mode), needs_manual (max retries exceeded), disabled (turned off)';
|
||||||
|
|
||||||
|
-- Update existing profiles to have status based on config if present
|
||||||
|
UPDATE dispensary_crawler_profiles
|
||||||
|
SET status = COALESCE(config->>'status', 'production')
|
||||||
|
WHERE status IS NULL OR status = '';
|
||||||
|
|
||||||
|
-- Backfill sandbox_attempt_count from config
|
||||||
|
UPDATE dispensary_crawler_profiles
|
||||||
|
SET sandbox_attempt_count = COALESCE(
|
||||||
|
jsonb_array_length(config->'sandboxAttempts'),
|
||||||
|
0
|
||||||
|
)
|
||||||
|
WHERE config->'sandboxAttempts' IS NOT NULL;
|
||||||
|
|
||||||
|
-- Backfill next_retry_at from config
|
||||||
|
UPDATE dispensary_crawler_profiles
|
||||||
|
SET next_retry_at = (config->>'nextRetryAt')::timestamptz
|
||||||
|
WHERE config->>'nextRetryAt' IS NOT NULL;
|
||||||
|
|
||||||
|
-- Create view for crawler profile summary
|
||||||
|
CREATE OR REPLACE VIEW v_crawler_profile_summary AS
|
||||||
|
SELECT
|
||||||
|
dcp.id,
|
||||||
|
dcp.dispensary_id,
|
||||||
|
d.name AS dispensary_name,
|
||||||
|
d.city,
|
||||||
|
d.menu_type,
|
||||||
|
dcp.profile_name,
|
||||||
|
dcp.profile_key,
|
||||||
|
dcp.crawler_type,
|
||||||
|
dcp.status,
|
||||||
|
dcp.enabled,
|
||||||
|
dcp.sandbox_attempt_count,
|
||||||
|
dcp.next_retry_at,
|
||||||
|
dcp.last_sandbox_at,
|
||||||
|
dcp.created_at,
|
||||||
|
dcp.updated_at,
|
||||||
|
CASE
|
||||||
|
WHEN dcp.profile_key IS NOT NULL THEN 'per-store'
|
||||||
|
ELSE 'legacy'
|
||||||
|
END AS crawler_mode,
|
||||||
|
CASE
|
||||||
|
WHEN dcp.status = 'production' THEN 'Ready'
|
||||||
|
WHEN dcp.status = 'sandbox' AND dcp.next_retry_at <= NOW() THEN 'Retry Due'
|
||||||
|
WHEN dcp.status = 'sandbox' THEN 'Waiting'
|
||||||
|
WHEN dcp.status = 'needs_manual' THEN 'Needs Manual'
|
||||||
|
WHEN dcp.status = 'disabled' THEN 'Disabled'
|
||||||
|
ELSE 'Unknown'
|
||||||
|
END AS status_display
|
||||||
|
FROM dispensary_crawler_profiles dcp
|
||||||
|
JOIN dispensaries d ON d.id = dcp.dispensary_id
|
||||||
|
WHERE dcp.enabled = true
|
||||||
|
ORDER BY dcp.status, dcp.updated_at DESC;
|
||||||
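A sketch of the lookup the partial retry index above is designed to serve, assuming a scheduler that polls for sandbox profiles whose retry time has arrived:

SELECT id, dispensary_id, profile_name, next_retry_at
FROM dispensary_crawler_profiles
WHERE enabled = true
  AND status = 'sandbox'
  AND next_retry_at <= NOW()
ORDER BY next_retry_at;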
73
backend/migrations/039_crawl_orchestration_traces.sql
Normal file
@@ -0,0 +1,73 @@
-- Migration: Create crawl_orchestration_traces table
-- Purpose: Store detailed step-by-step traces for every crawl orchestration run
-- This enables full visibility into per-store crawler behavior

CREATE TABLE IF NOT EXISTS crawl_orchestration_traces (
  id SERIAL PRIMARY KEY,
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
  run_id VARCHAR(255), -- UUID or job ID for this crawl run
  profile_id INTEGER REFERENCES dispensary_crawler_profiles(id) ON DELETE SET NULL,
  profile_key VARCHAR(255), -- e.g. "trulieve-scottsdale"
  crawler_module VARCHAR(255), -- Full path to .ts file loaded
  state_at_start VARCHAR(50), -- sandbox, production, legacy, disabled
  state_at_end VARCHAR(50), -- sandbox, production, needs_manual, etc.

  -- The trace: ordered array of step objects
  trace JSONB NOT NULL DEFAULT '[]'::jsonb,

  -- Summary metrics for quick querying
  total_steps INTEGER DEFAULT 0,
  duration_ms INTEGER,
  success BOOLEAN,
  error_message TEXT,
  products_found INTEGER,

  -- Timestamps
  started_at TIMESTAMPTZ DEFAULT NOW(),
  completed_at TIMESTAMPTZ,
  created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Index for quick lookup by dispensary
CREATE INDEX IF NOT EXISTS idx_traces_dispensary_id
  ON crawl_orchestration_traces(dispensary_id);

-- Index for finding latest trace per dispensary
CREATE INDEX IF NOT EXISTS idx_traces_dispensary_created
  ON crawl_orchestration_traces(dispensary_id, created_at DESC);

-- Index for finding traces by run_id
CREATE INDEX IF NOT EXISTS idx_traces_run_id
  ON crawl_orchestration_traces(run_id) WHERE run_id IS NOT NULL;

-- Index for finding traces by profile
CREATE INDEX IF NOT EXISTS idx_traces_profile_id
  ON crawl_orchestration_traces(profile_id) WHERE profile_id IS NOT NULL;

-- Comment explaining trace structure
COMMENT ON COLUMN crawl_orchestration_traces.trace IS
'Ordered array of step objects. Each step has:
{
  "step": 1,
  "action": "load_profile",
  "description": "Loading crawler profile for dispensary",
  "timestamp": 1701234567890,
  "duration_ms": 45,
  "input": { ... },
  "output": { ... },
  "what": "Description of what happened",
  "why": "Reason this step was taken",
  "where": "Code location / module",
  "how": "Method or approach used",
  "when": "ISO timestamp"
}';

-- View for easy access to latest traces
CREATE OR REPLACE VIEW v_latest_crawl_traces AS
SELECT DISTINCT ON (dispensary_id)
  cot.*,
  d.name AS dispensary_name,
  d.city AS dispensary_city
FROM crawl_orchestration_traces cot
JOIN dispensaries d ON d.id = cot.dispensary_id
ORDER BY dispensary_id, cot.created_at DESC;
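An illustrative insert (ids and values hypothetical) showing a minimal trace row that conforms to the step shape documented above:

INSERT INTO crawl_orchestration_traces
  (dispensary_id, run_id, profile_id, state_at_start, state_at_end,
   trace, total_steps, duration_ms, success, products_found)
VALUES
  (7, 'a1b2c3', 42, 'sandbox', 'production',
   '[{"step": 1, "action": "load_profile", "duration_ms": 45}]'::jsonb,
   1, 45, true, 120);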
73
backend/migrations/040_dispensary_dba_name.sql
Normal file
@@ -0,0 +1,73 @@
-- Migration 040: Add dba_name column to dispensaries table
-- DBA (Doing Business As) name - the name the dispensary operates under,
-- which may differ from the legal entity name
-- This migration is idempotent - safe to run multiple times

-- Add dba_name column
DO $$
BEGIN
  IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensaries' AND column_name = 'dba_name') THEN
    ALTER TABLE dispensaries ADD COLUMN dba_name TEXT DEFAULT NULL;
  END IF;
END $$;

-- Add company_name column (legal entity name)
DO $$
BEGIN
  IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensaries' AND column_name = 'company_name') THEN
    ALTER TABLE dispensaries ADD COLUMN company_name TEXT DEFAULT NULL;
  END IF;
END $$;

-- Add azdhs_id for Arizona Department of Health Services license number
DO $$
BEGIN
  IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensaries' AND column_name = 'azdhs_id') THEN
    ALTER TABLE dispensaries ADD COLUMN azdhs_id INTEGER DEFAULT NULL;
  END IF;
END $$;

-- Add phone column
DO $$
BEGIN
  IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensaries' AND column_name = 'phone') THEN
    ALTER TABLE dispensaries ADD COLUMN phone TEXT DEFAULT NULL;
  END IF;
END $$;

-- Add email column
DO $$
BEGIN
  IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensaries' AND column_name = 'email') THEN
    ALTER TABLE dispensaries ADD COLUMN email TEXT DEFAULT NULL;
  END IF;
END $$;

-- Add google_rating column
DO $$
BEGIN
  IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensaries' AND column_name = 'google_rating') THEN
    ALTER TABLE dispensaries ADD COLUMN google_rating NUMERIC(2,1) DEFAULT NULL;
  END IF;
END $$;

-- Add google_review_count column
DO $$
BEGIN
  IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name = 'dispensaries' AND column_name = 'google_review_count') THEN
    ALTER TABLE dispensaries ADD COLUMN google_review_count INTEGER DEFAULT NULL;
  END IF;
END $$;

-- Add comments for documentation
COMMENT ON COLUMN dispensaries.dba_name IS 'DBA (Doing Business As) name - the public-facing name the dispensary operates under';
COMMENT ON COLUMN dispensaries.company_name IS 'Legal entity/company name that owns the dispensary';
COMMENT ON COLUMN dispensaries.azdhs_id IS 'Arizona Department of Health Services license number';
COMMENT ON COLUMN dispensaries.phone IS 'Contact phone number';
COMMENT ON COLUMN dispensaries.email IS 'Contact email address';
COMMENT ON COLUMN dispensaries.google_rating IS 'Google Maps rating (1.0 to 5.0)';
COMMENT ON COLUMN dispensaries.google_review_count IS 'Number of Google reviews';

-- Create index for searching by dba_name
CREATE INDEX IF NOT EXISTS idx_dispensaries_dba_name ON dispensaries (dba_name);
CREATE INDEX IF NOT EXISTS idx_dispensaries_azdhs_id ON dispensaries (azdhs_id);
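A quick sketch of the lookups these indexes support (the license number is hypothetical):

SELECT id, name, dba_name, company_name
FROM dispensaries
WHERE azdhs_id = 123;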
376
backend/migrations/041_cannaiq_canonical_schema.sql
Normal file
@@ -0,0 +1,376 @@
-- Migration 041: CannaiQ Canonical Schema
--
-- This migration adds the canonical CannaiQ schema tables and columns.
-- ALL CHANGES ARE ADDITIVE - NO DROPS, NO DELETES, NO TRUNCATES.
--
-- Run with: psql $CANNAIQ_DB_URL -f migrations/041_cannaiq_canonical_schema.sql
--
-- Tables created:
-- - states (new)
-- - chains (new)
-- - brands (new)
-- - store_products (new - normalized view of current menu)
-- - store_product_snapshots (new - historical crawl data)
-- - crawl_runs (new - replaces/supplements dispensary_crawl_jobs)
--
-- Tables modified:
-- - dispensaries (add state_id, chain_id FKs)
-- - dispensary_crawler_profiles (add status, allow_autopromote, validated_at)
-- - crawl_orchestration_traces (add run_id FK)
--

-- =====================================================
-- 1) STATES TABLE
-- =====================================================
CREATE TABLE IF NOT EXISTS states (
  id SERIAL PRIMARY KEY,
  code VARCHAR(2) NOT NULL UNIQUE,
  name VARCHAR(100) NOT NULL,
  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

-- Insert known states
INSERT INTO states (code, name) VALUES
  ('AZ', 'Arizona'),
  ('CA', 'California'),
  ('CO', 'Colorado'),
  ('FL', 'Florida'),
  ('IL', 'Illinois'),
  ('MA', 'Massachusetts'),
  ('MD', 'Maryland'),
  ('MI', 'Michigan'),
  ('MO', 'Missouri'),
  ('NV', 'Nevada'),
  ('NJ', 'New Jersey'),
  ('NY', 'New York'),
  ('OH', 'Ohio'),
  ('OK', 'Oklahoma'),
  ('OR', 'Oregon'),
  ('PA', 'Pennsylvania'),
  ('WA', 'Washington')
ON CONFLICT (code) DO NOTHING;

COMMENT ON TABLE states IS 'US states where CannaiQ operates. Single source of truth for state codes.';

-- =====================================================
-- 2) CHAINS TABLE (retail groups)
-- =====================================================
CREATE TABLE IF NOT EXISTS chains (
  id SERIAL PRIMARY KEY,
  name VARCHAR(255) NOT NULL,
  slug VARCHAR(255) NOT NULL UNIQUE,
  website_url TEXT,
  logo_url TEXT,
  description TEXT,
  is_active BOOLEAN DEFAULT TRUE,
  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_chains_slug ON chains(slug);
CREATE INDEX IF NOT EXISTS idx_chains_active ON chains(is_active) WHERE is_active = TRUE;

COMMENT ON TABLE chains IS 'Retail chains/groups that own multiple dispensary locations (e.g., Curaleaf, Trulieve).';

-- =====================================================
-- 3) BRANDS TABLE (canonical brand catalog)
-- =====================================================
CREATE TABLE IF NOT EXISTS brands (
  id SERIAL PRIMARY KEY,
  name VARCHAR(255) NOT NULL,
  slug VARCHAR(255) NOT NULL UNIQUE,
  external_id VARCHAR(100), -- Provider-specific brand ID
  website_url TEXT,
  instagram_handle VARCHAR(100),
  logo_url TEXT,
  description TEXT,
  is_portfolio_brand BOOLEAN DEFAULT FALSE, -- TRUE if brand we represent
  is_active BOOLEAN DEFAULT TRUE,
  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_brands_slug ON brands(slug);
CREATE INDEX IF NOT EXISTS idx_brands_external_id ON brands(external_id) WHERE external_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_brands_portfolio ON brands(is_portfolio_brand) WHERE is_portfolio_brand = TRUE;

COMMENT ON TABLE brands IS 'Canonical brand catalog. Brands may appear across multiple dispensaries.';
COMMENT ON COLUMN brands.is_portfolio_brand IS 'TRUE if this is a brand we represent/manage (vs third-party brand)';

-- =====================================================
-- 4) ADD state_id AND chain_id TO dispensaries
-- =====================================================
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS state_id INTEGER REFERENCES states(id);
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS chain_id INTEGER REFERENCES chains(id);

-- NOTE: state_id backfill is done by ETL script (042_legacy_import.ts), not this migration.

CREATE INDEX IF NOT EXISTS idx_dispensaries_state_id ON dispensaries(state_id);
CREATE INDEX IF NOT EXISTS idx_dispensaries_chain_id ON dispensaries(chain_id) WHERE chain_id IS NOT NULL;

COMMENT ON COLUMN dispensaries.state_id IS 'FK to states table. Canonical state reference.';
COMMENT ON COLUMN dispensaries.chain_id IS 'FK to chains table. NULL if independent dispensary.';

-- =====================================================
-- 5) STORE_PRODUCTS TABLE (current menu state)
-- =====================================================
-- This is the normalized "what is currently on the menu" table.
-- It supplements dutchie_products with a provider-agnostic structure.

CREATE TABLE IF NOT EXISTS store_products (
  id SERIAL PRIMARY KEY,
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
  product_id INTEGER REFERENCES products(id) ON DELETE SET NULL, -- Link to canonical product
  brand_id INTEGER REFERENCES brands(id) ON DELETE SET NULL, -- Link to canonical brand

  -- Provider-specific identifiers
  provider VARCHAR(50) NOT NULL DEFAULT 'dutchie', -- dutchie, treez, jane, etc.
  provider_product_id VARCHAR(100), -- Platform-specific product ID
  provider_brand_id VARCHAR(100), -- Platform-specific brand ID

  -- Raw data from platform (not normalized)
  name_raw VARCHAR(500) NOT NULL,
  brand_name_raw VARCHAR(255),
  category_raw VARCHAR(100),
  subcategory_raw VARCHAR(100),

  -- Pricing
  price_rec NUMERIC(10,2),
  price_med NUMERIC(10,2),
  price_rec_special NUMERIC(10,2),
  price_med_special NUMERIC(10,2),
  is_on_special BOOLEAN DEFAULT FALSE,
  special_name TEXT,
  discount_percent NUMERIC(5,2),

  -- Inventory
  is_in_stock BOOLEAN DEFAULT TRUE,
  stock_quantity INTEGER,
  stock_status VARCHAR(50) DEFAULT 'in_stock',

  -- Potency
  thc_percent NUMERIC(5,2),
  cbd_percent NUMERIC(5,2),

  -- Images
  image_url TEXT,
  local_image_path TEXT,

  -- Timestamps
  first_seen_at TIMESTAMPTZ DEFAULT NOW(),
  last_seen_at TIMESTAMPTZ DEFAULT NOW(),
  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW(),

  UNIQUE(dispensary_id, provider, provider_product_id)
);

CREATE INDEX IF NOT EXISTS idx_store_products_dispensary ON store_products(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_store_products_product ON store_products(product_id) WHERE product_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_store_products_brand ON store_products(brand_id) WHERE brand_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_store_products_provider ON store_products(provider);
CREATE INDEX IF NOT EXISTS idx_store_products_in_stock ON store_products(dispensary_id, is_in_stock);
CREATE INDEX IF NOT EXISTS idx_store_products_special ON store_products(dispensary_id, is_on_special) WHERE is_on_special = TRUE;
CREATE INDEX IF NOT EXISTS idx_store_products_last_seen ON store_products(last_seen_at DESC);

COMMENT ON TABLE store_products IS 'Current state of products on each dispensary menu. Provider-agnostic.';
COMMENT ON COLUMN store_products.product_id IS 'FK to canonical products table. NULL if not yet mapped.';
COMMENT ON COLUMN store_products.brand_id IS 'FK to canonical brands table. NULL if not yet mapped.';

-- =====================================================
-- 6) STORE_PRODUCT_SNAPSHOTS TABLE (historical data)
-- =====================================================
-- This is the critical time-series table for analytics.
-- One row per product per crawl.

CREATE TABLE IF NOT EXISTS store_product_snapshots (
  id SERIAL PRIMARY KEY,
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
  store_product_id INTEGER REFERENCES store_products(id) ON DELETE SET NULL,
  product_id INTEGER REFERENCES products(id) ON DELETE SET NULL,

  -- Provider info
  provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',
  provider_product_id VARCHAR(100),

  -- Link to crawl run
  crawl_run_id INTEGER, -- FK added after crawl_runs table created

  -- Capture timestamp
  captured_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

  -- Raw data from platform
  name_raw VARCHAR(500),
  brand_name_raw VARCHAR(255),
  category_raw VARCHAR(100),
  subcategory_raw VARCHAR(100),

  -- Pricing at time of capture
  price_rec NUMERIC(10,2),
  price_med NUMERIC(10,2),
  price_rec_special NUMERIC(10,2),
  price_med_special NUMERIC(10,2),
  is_on_special BOOLEAN DEFAULT FALSE,
  discount_percent NUMERIC(5,2),

  -- Inventory at time of capture
  is_in_stock BOOLEAN DEFAULT TRUE,
  stock_quantity INTEGER,
  stock_status VARCHAR(50) DEFAULT 'in_stock',

  -- Potency at time of capture
  thc_percent NUMERIC(5,2),
  cbd_percent NUMERIC(5,2),

  -- Image URL at time of capture
  image_url TEXT,

  -- Full raw response for debugging
  raw_data JSONB,

  created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_snapshots_dispensary_captured ON store_product_snapshots(dispensary_id, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_product_captured ON store_product_snapshots(product_id, captured_at DESC) WHERE product_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_store_product ON store_product_snapshots(store_product_id) WHERE store_product_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_crawl_run ON store_product_snapshots(crawl_run_id) WHERE crawl_run_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_captured_at ON store_product_snapshots(captured_at DESC);

COMMENT ON TABLE store_product_snapshots IS 'Historical crawl data. One row per product per crawl. NEVER DELETE.';
COMMENT ON COLUMN store_product_snapshots.captured_at IS 'When this snapshot was captured (crawl time).';

-- =====================================================
-- 7) CRAWL_RUNS TABLE (job execution records)
-- =====================================================
CREATE TABLE IF NOT EXISTS crawl_runs (
  id SERIAL PRIMARY KEY,
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,

  -- Provider
  provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',

  -- Execution times
  started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  finished_at TIMESTAMPTZ,
  duration_ms INTEGER,

  -- Status
  status VARCHAR(20) NOT NULL DEFAULT 'running', -- running, success, failed, partial
  error_message TEXT,

  -- Results
  products_found INTEGER DEFAULT 0,
  products_new INTEGER DEFAULT 0,
  products_updated INTEGER DEFAULT 0,
  snapshots_written INTEGER DEFAULT 0,

  -- Metadata
  worker_id VARCHAR(100),
  trigger_type VARCHAR(50) DEFAULT 'scheduled', -- scheduled, manual, api
  metadata JSONB DEFAULT '{}',

  created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary ON crawl_runs(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_status ON crawl_runs(status);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_started ON crawl_runs(started_at DESC);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary_started ON crawl_runs(dispensary_id, started_at DESC);

COMMENT ON TABLE crawl_runs IS 'Each crawl execution. Links to snapshots and traces.';

-- Add FK from store_product_snapshots to crawl_runs
DO $$
BEGIN
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.table_constraints
    WHERE constraint_name = 'store_product_snapshots_crawl_run_id_fkey'
  ) THEN
    ALTER TABLE store_product_snapshots
      ADD CONSTRAINT store_product_snapshots_crawl_run_id_fkey
      FOREIGN KEY (crawl_run_id) REFERENCES crawl_runs(id) ON DELETE SET NULL;
  END IF;
END $$;

-- =====================================================
-- 8) UPDATE crawl_orchestration_traces
-- =====================================================
-- Add run_id FK if not exists
ALTER TABLE crawl_orchestration_traces
  ADD COLUMN IF NOT EXISTS crawl_run_id INTEGER REFERENCES crawl_runs(id) ON DELETE SET NULL;

CREATE INDEX IF NOT EXISTS idx_traces_crawl_run
  ON crawl_orchestration_traces(crawl_run_id)
  WHERE crawl_run_id IS NOT NULL;

-- =====================================================
-- 9) UPDATE dispensary_crawler_profiles
-- =====================================================
-- Add missing columns from canonical schema
ALTER TABLE dispensary_crawler_profiles
  ADD COLUMN IF NOT EXISTS status VARCHAR(50) DEFAULT 'sandbox';

ALTER TABLE dispensary_crawler_profiles
  ADD COLUMN IF NOT EXISTS allow_autopromote BOOLEAN DEFAULT FALSE;

ALTER TABLE dispensary_crawler_profiles
  ADD COLUMN IF NOT EXISTS validated_at TIMESTAMPTZ;

CREATE INDEX IF NOT EXISTS idx_profiles_status
  ON dispensary_crawler_profiles(status);

COMMENT ON COLUMN dispensary_crawler_profiles.status IS 'Profile status: sandbox, production, needs_manual, disabled';
COMMENT ON COLUMN dispensary_crawler_profiles.allow_autopromote IS 'Whether this profile can be auto-promoted from sandbox to production';
COMMENT ON COLUMN dispensary_crawler_profiles.validated_at IS 'When this profile was last validated as working';

-- =====================================================
-- 10) VIEWS FOR BACKWARD COMPATIBILITY
-- =====================================================

-- View to get latest snapshot per store product
CREATE OR REPLACE VIEW v_latest_store_snapshots AS
SELECT DISTINCT ON (dispensary_id, provider_product_id)
  sps.*
FROM store_product_snapshots sps
ORDER BY dispensary_id, provider_product_id, captured_at DESC;

-- View to get crawl run summary per dispensary
CREATE OR REPLACE VIEW v_dispensary_crawl_summary AS
SELECT
  d.id AS dispensary_id,
  d.name AS dispensary_name,
  d.city,
  d.state,
  COUNT(DISTINCT sp.id) AS current_product_count,
  COUNT(DISTINCT sp.id) FILTER (WHERE sp.is_in_stock) AS in_stock_count,
  COUNT(DISTINCT sp.id) FILTER (WHERE sp.is_on_special) AS on_special_count,
  MAX(cr.finished_at) AS last_crawl_at,
  (SELECT status FROM crawl_runs WHERE dispensary_id = d.id ORDER BY started_at DESC LIMIT 1) AS last_crawl_status
FROM dispensaries d
LEFT JOIN store_products sp ON sp.dispensary_id = d.id
LEFT JOIN crawl_runs cr ON cr.dispensary_id = d.id
GROUP BY d.id, d.name, d.city, d.state;

-- =====================================================
-- 11) COMMENTS
-- =====================================================
COMMENT ON TABLE states IS 'Canonical list of US states. Use state_id FK in dispensaries.';
COMMENT ON TABLE chains IS 'Retail chains (multi-location operators).';
COMMENT ON TABLE brands IS 'Canonical brand catalog across all providers.';
COMMENT ON TABLE store_products IS 'Current menu state per dispensary. Provider-agnostic.';
COMMENT ON TABLE store_product_snapshots IS 'Historical price/stock data. One row per product per crawl.';
COMMENT ON TABLE crawl_runs IS 'Crawl execution records. Links snapshots to runs.';

-- =====================================================
-- MIGRATION COMPLETE
-- =====================================================
--
-- Next steps (manual - not in this migration):
-- 1. Populate chains table from known retail groups
-- 2. Populate brands table from existing dutchie_products.brand_name
-- 3. Migrate data from dutchie_products → store_products
-- 4. Migrate data from dutchie_product_snapshots → store_product_snapshots
-- 5. Link dispensaries.chain_id to chains where applicable
--
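Not part of the migration, but a sketch of the hydration upsert that the UNIQUE(dispensary_id, provider, provider_product_id) constraint on store_products enables (all values are placeholders):

INSERT INTO store_products
  (dispensary_id, provider, provider_product_id, name_raw, price_rec)
VALUES
  (7, 'dutchie', 'abc123', 'Example Flower 3.5g', 35.00)
ON CONFLICT (dispensary_id, provider, provider_product_id)
DO UPDATE SET
  price_rec    = EXCLUDED.price_rec,
  last_seen_at = NOW(),
  updated_at   = NOW();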
50
backend/migrations/043_add_states_table.sql
Normal file
@@ -0,0 +1,50 @@
-- Migration 043: Add States Table
--
-- Creates the states table if it does not exist.
-- Safe to run multiple times (idempotent).
--
-- Run with:
--   CANNAIQ_DB_URL="postgresql://..." psql $CANNAIQ_DB_URL -f migrations/043_add_states_table.sql

-- =====================================================
-- 1) CREATE STATES TABLE
-- =====================================================
CREATE TABLE IF NOT EXISTS states (
  id SERIAL PRIMARY KEY,
  code TEXT NOT NULL UNIQUE,
  name TEXT NOT NULL,
  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

-- =====================================================
-- 2) INSERT CORE US STATES
-- =====================================================
INSERT INTO states (code, name) VALUES
  ('AZ', 'Arizona'),
  ('CA', 'California'),
  ('CO', 'Colorado'),
  ('FL', 'Florida'),
  ('IL', 'Illinois'),
  ('MA', 'Massachusetts'),
  ('MD', 'Maryland'),
  ('MI', 'Michigan'),
  ('MO', 'Missouri'),
  ('NV', 'Nevada'),
  ('NJ', 'New Jersey'),
  ('NY', 'New York'),
  ('OH', 'Ohio'),
  ('OK', 'Oklahoma'),
  ('OR', 'Oregon'),
  ('PA', 'Pennsylvania'),
  ('WA', 'Washington')
ON CONFLICT (code) DO NOTHING;

-- =====================================================
-- 3) ADD INDEX
-- =====================================================
CREATE INDEX IF NOT EXISTS idx_states_code ON states(code);

-- =====================================================
-- DONE
-- =====================================================
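For reference, a sketch of the state_id backfill the ETL script performs against this table, assuming dispensaries.state holds the two-letter code:

UPDATE dispensaries d
SET state_id = s.id
FROM states s
WHERE d.state = s.code
  AND d.state_id IS NULL;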
45
backend/migrations/044_add_provider_detection_data.sql
Normal file
@@ -0,0 +1,45 @@
-- Migration 044: Add provider_detection_data column to dispensaries
--
-- This column stores detection metadata for menu provider discovery.
-- Used by menu-detection.ts and discovery.ts to track:
-- - Detected provider type
-- - Resolution attempts
-- - Error messages
-- - not_crawlable flag
--
-- Run with: psql $CANNAIQ_DB_URL -f migrations/044_add_provider_detection_data.sql
--
-- ALL CHANGES ARE ADDITIVE - NO DROPS, NO DELETES, NO TRUNCATES.

-- Add provider_detection_data to dispensaries table
DO $$
BEGIN
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensaries' AND column_name = 'provider_detection_data'
  ) THEN
    ALTER TABLE dispensaries
      ADD COLUMN provider_detection_data JSONB DEFAULT NULL;

    RAISE NOTICE 'Added provider_detection_data column to dispensaries table';
  ELSE
    RAISE NOTICE 'provider_detection_data column already exists on dispensaries table';
  END IF;
END;
$$ LANGUAGE plpgsql;

-- Add index for querying by not_crawlable flag
CREATE INDEX IF NOT EXISTS idx_dispensaries_provider_detection_not_crawlable
  ON dispensaries ((provider_detection_data->>'not_crawlable'))
  WHERE provider_detection_data IS NOT NULL;

-- Add index for querying by detected provider
CREATE INDEX IF NOT EXISTS idx_dispensaries_provider_detection_provider
  ON dispensaries ((provider_detection_data->>'detected_provider'))
  WHERE provider_detection_data IS NOT NULL;

COMMENT ON COLUMN dispensaries.provider_detection_data IS 'JSONB metadata from menu provider detection. Keys: detected_provider, resolution_error, not_crawlable, detection_timestamp';

-- =====================================================
-- MIGRATION COMPLETE
-- =====================================================
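A sketch of the discovery-side query these expression indexes serve (the JSONB keys match the column comment above):

SELECT id, name
FROM dispensaries
WHERE provider_detection_data IS NOT NULL
  AND provider_detection_data->>'detected_provider' = 'dutchie'
  AND (provider_detection_data->>'not_crawlable') IS DISTINCT FROM 'true';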
27
backend/migrations/045_add_image_columns.sql
Normal file
@@ -0,0 +1,27 @@
-- Migration 045: Add thumbnail_url columns to canonical tables
--
-- NOTE: image_url already exists in both tables from migration 041.
-- This migration adds thumbnail_url for cached thumbnail images.

DO $$
BEGIN
  -- Add thumbnail_url to store_products if not exists
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'store_products' AND column_name = 'thumbnail_url'
  ) THEN
    ALTER TABLE store_products ADD COLUMN thumbnail_url TEXT NULL;
  END IF;

  -- Add thumbnail_url to store_product_snapshots if not exists
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'store_product_snapshots' AND column_name = 'thumbnail_url'
  ) THEN
    ALTER TABLE store_product_snapshots ADD COLUMN thumbnail_url TEXT NULL;
  END IF;
END;
$$ LANGUAGE plpgsql;

COMMENT ON COLUMN store_products.thumbnail_url IS 'URL to cached thumbnail image';
COMMENT ON COLUMN store_product_snapshots.thumbnail_url IS 'URL to cached thumbnail image at time of snapshot';
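A minimal sketch of how a thumbnail-caching worker might select its backlog from these columns (the worker itself is not part of this migration):

SELECT id, image_url
FROM store_products
WHERE image_url IS NOT NULL
  AND thumbnail_url IS NULL;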
351
backend/migrations/046_crawler_reliability.sql
Normal file
@@ -0,0 +1,351 @@
-- Migration 046: Crawler Reliability & Stabilization
-- Phase 1: Add fields for error taxonomy, retry management, and self-healing

-- ============================================================
-- PART 1: Error Taxonomy - Standardized error codes
-- ============================================================

-- Create enum for standardized error codes
DO $$
BEGIN
  IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'crawl_error_code') THEN
    CREATE TYPE crawl_error_code AS ENUM (
      'SUCCESS',
      'RATE_LIMITED',
      'BLOCKED_PROXY',
      'HTML_CHANGED',
      'TIMEOUT',
      'AUTH_FAILED',
      'NETWORK_ERROR',
      'PARSE_ERROR',
      'NO_PRODUCTS',
      'UNKNOWN_ERROR'
    );
  END IF;
END;
$$ LANGUAGE plpgsql;

-- ============================================================
-- PART 2: Dispensary Crawl Configuration
-- ============================================================

-- Add crawl config columns to dispensaries
DO $$
BEGIN
  -- Crawl frequency (minutes between crawls)
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensaries' AND column_name = 'crawl_frequency_minutes'
  ) THEN
    ALTER TABLE dispensaries ADD COLUMN crawl_frequency_minutes INTEGER DEFAULT 240;
  END IF;

  -- Max retries per crawl
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensaries' AND column_name = 'max_retries'
  ) THEN
    ALTER TABLE dispensaries ADD COLUMN max_retries INTEGER DEFAULT 3;
  END IF;

  -- Current proxy ID
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensaries' AND column_name = 'current_proxy_id'
  ) THEN
    ALTER TABLE dispensaries ADD COLUMN current_proxy_id INTEGER NULL;
  END IF;

  -- Current user agent
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensaries' AND column_name = 'current_user_agent'
  ) THEN
    ALTER TABLE dispensaries ADD COLUMN current_user_agent TEXT NULL;
  END IF;

  -- Next scheduled run
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensaries' AND column_name = 'next_crawl_at'
  ) THEN
    ALTER TABLE dispensaries ADD COLUMN next_crawl_at TIMESTAMPTZ NULL;
  END IF;

  -- Last successful crawl
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensaries' AND column_name = 'last_success_at'
  ) THEN
    ALTER TABLE dispensaries ADD COLUMN last_success_at TIMESTAMPTZ NULL;
  END IF;

  -- Last error code (using text for flexibility, validated in app)
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensaries' AND column_name = 'last_error_code'
  ) THEN
    ALTER TABLE dispensaries ADD COLUMN last_error_code TEXT NULL;
  END IF;

  -- Crawl status: active, degraded, paused, failed
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensaries' AND column_name = 'crawl_status'
  ) THEN
    ALTER TABLE dispensaries ADD COLUMN crawl_status TEXT DEFAULT 'active';
  END IF;

  -- Backoff multiplier (increases with failures)
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensaries' AND column_name = 'backoff_multiplier'
  ) THEN
    ALTER TABLE dispensaries ADD COLUMN backoff_multiplier NUMERIC(4,2) DEFAULT 1.0;
  END IF;

  -- Total attempt count (lifetime)
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensaries' AND column_name = 'total_attempts'
  ) THEN
    ALTER TABLE dispensaries ADD COLUMN total_attempts INTEGER DEFAULT 0;
  END IF;

  -- Total success count (lifetime)
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensaries' AND column_name = 'total_successes'
  ) THEN
    ALTER TABLE dispensaries ADD COLUMN total_successes INTEGER DEFAULT 0;
  END IF;
END;
$$ LANGUAGE plpgsql;

-- ============================================================
-- PART 3: Enhanced Job Tracking
-- ============================================================

-- Add columns to dispensary_crawl_jobs
DO $$
BEGIN
  -- Error code
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'error_code'
  ) THEN
    ALTER TABLE dispensary_crawl_jobs ADD COLUMN error_code TEXT NULL;
  END IF;

  -- Proxy used for this job
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'proxy_used'
  ) THEN
    ALTER TABLE dispensary_crawl_jobs ADD COLUMN proxy_used TEXT NULL;
  END IF;

  -- User agent used for this job
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'user_agent_used'
  ) THEN
    ALTER TABLE dispensary_crawl_jobs ADD COLUMN user_agent_used TEXT NULL;
  END IF;

  -- Attempt number for this job
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'attempt_number'
  ) THEN
    ALTER TABLE dispensary_crawl_jobs ADD COLUMN attempt_number INTEGER DEFAULT 1;
  END IF;

  -- Backoff delay applied (ms)
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'backoff_delay_ms'
  ) THEN
    ALTER TABLE dispensary_crawl_jobs ADD COLUMN backoff_delay_ms INTEGER DEFAULT 0;
  END IF;

  -- HTTP status code received
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'http_status'
  ) THEN
    ALTER TABLE dispensary_crawl_jobs ADD COLUMN http_status INTEGER NULL;
  END IF;

  -- Response time (ms)
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'response_time_ms'
  ) THEN
    ALTER TABLE dispensary_crawl_jobs ADD COLUMN response_time_ms INTEGER NULL;
  END IF;
END;
$$ LANGUAGE plpgsql;

-- ============================================================
-- PART 4: Crawl History Table (for detailed tracking)
-- ============================================================

CREATE TABLE IF NOT EXISTS crawl_attempts (
  id SERIAL PRIMARY KEY,
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id),
  job_id INTEGER REFERENCES dispensary_crawl_jobs(id),

  -- Timing
  started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  finished_at TIMESTAMPTZ,
  duration_ms INTEGER,

  -- Result
  error_code TEXT NOT NULL DEFAULT 'UNKNOWN_ERROR',
  error_message TEXT,
  http_status INTEGER,

  -- Context
  attempt_number INTEGER NOT NULL DEFAULT 1,
  proxy_used TEXT,
  user_agent_used TEXT,

  -- Metrics
  products_found INTEGER DEFAULT 0,
  products_upserted INTEGER DEFAULT 0,
  snapshots_created INTEGER DEFAULT 0,

  -- Metadata
  metadata JSONB,
  created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Index for quick lookups
CREATE INDEX IF NOT EXISTS idx_crawl_attempts_dispensary_id ON crawl_attempts(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_crawl_attempts_error_code ON crawl_attempts(error_code);
CREATE INDEX IF NOT EXISTS idx_crawl_attempts_started_at ON crawl_attempts(started_at DESC);

-- ============================================================
-- PART 5: Views for Monitoring
-- ============================================================

-- Drop existing view if exists
DROP VIEW IF EXISTS v_crawler_status;

-- Crawler status view with all reliability fields
CREATE VIEW v_crawler_status AS
SELECT
  d.id,
  d.name,
  d.slug,
  d.menu_type,
  d.platform_dispensary_id,
  d.crawl_status,
  d.consecutive_failures,
  d.last_crawl_at,
  d.last_success_at,
  d.last_failure_at,
  d.last_error_code,
  d.next_crawl_at,
  d.crawl_frequency_minutes,
  d.max_retries,
  d.current_proxy_id,
  d.current_user_agent,
  d.backoff_multiplier,
  d.total_attempts,
  d.total_successes,
  d.product_count,
  CASE
    WHEN d.total_attempts > 0
      THEN ROUND(d.total_successes::NUMERIC / d.total_attempts * 100, 1)
    ELSE 0
  END AS success_rate,
  CASE
    WHEN d.crawl_status = 'failed' THEN 'FAILED'
    WHEN d.crawl_status = 'paused' THEN 'PAUSED'
    WHEN d.crawl_status = 'degraded' THEN 'DEGRADED'
    WHEN d.menu_type IS NULL OR d.menu_type = 'unknown' THEN 'NEEDS_DETECTION'
    WHEN d.platform_dispensary_id IS NULL THEN 'NEEDS_PLATFORM_ID'
    WHEN d.next_crawl_at IS NULL THEN 'NOT_SCHEDULED'
    WHEN d.next_crawl_at <= NOW() THEN 'DUE'
    ELSE 'SCHEDULED'
  END AS schedule_status,
  d.failed_at,
  d.failure_notes
FROM dispensaries d
WHERE d.state = 'AZ';

-- Drop existing view if exists
DROP VIEW IF EXISTS v_crawl_error_summary;

-- Error summary view
CREATE VIEW v_crawl_error_summary AS
SELECT
  error_code,
  COUNT(*) as total_occurrences,
  COUNT(DISTINCT dispensary_id) as affected_stores,
  MAX(started_at) as last_occurrence,
  AVG(duration_ms)::INTEGER as avg_duration_ms
FROM crawl_attempts
WHERE started_at > NOW() - INTERVAL '7 days'
GROUP BY error_code
ORDER BY total_occurrences DESC;

-- Drop existing view if exists
DROP VIEW IF EXISTS v_crawl_health;

-- Overall crawl health view
CREATE VIEW v_crawl_health AS
SELECT
  COUNT(*) FILTER (WHERE crawl_status = 'active') as active_crawlers,
  COUNT(*) FILTER (WHERE crawl_status = 'degraded') as degraded_crawlers,
  COUNT(*) FILTER (WHERE crawl_status = 'paused') as paused_crawlers,
  COUNT(*) FILTER (WHERE crawl_status = 'failed') as failed_crawlers,
  COUNT(*) FILTER (WHERE next_crawl_at <= NOW()) as due_now,
  COUNT(*) FILTER (WHERE consecutive_failures > 0) as stores_with_failures,
  AVG(consecutive_failures)::NUMERIC(4,2) as avg_consecutive_failures,
  COUNT(*) FILTER (WHERE last_success_at > NOW() - INTERVAL '24 hours') as successful_last_24h
FROM dispensaries
WHERE state = 'AZ' AND menu_type = 'dutchie';

-- ============================================================
-- PART 6: Constraint for minimum crawl gap
-- ============================================================

-- Function to check minimum crawl gap (2 minutes)
CREATE OR REPLACE FUNCTION check_minimum_crawl_gap()
RETURNS TRIGGER AS $$
BEGIN
  -- Only check for new pending jobs
  IF NEW.status = 'pending' AND NEW.dispensary_id IS NOT NULL THEN
    -- Check if there's a recent job for same dispensary
    IF EXISTS (
      SELECT 1 FROM dispensary_crawl_jobs
      WHERE dispensary_id = NEW.dispensary_id
        AND id != NEW.id
        AND status IN ('pending', 'running')
        AND created_at > NOW() - INTERVAL '2 minutes'
    ) THEN
      RAISE EXCEPTION 'Minimum 2-minute gap required between crawls for same dispensary';
    END IF;
  END IF;
  RETURN NEW;
END;
$$ LANGUAGE plpgsql;

-- Create trigger (drop first if exists)
DROP TRIGGER IF EXISTS enforce_minimum_crawl_gap ON dispensary_crawl_jobs;
CREATE TRIGGER enforce_minimum_crawl_gap
  BEFORE INSERT ON dispensary_crawl_jobs
  FOR EACH ROW
  EXECUTE FUNCTION check_minimum_crawl_gap();

-- ============================================================
-- PART 7: Comments
-- ============================================================

COMMENT ON TABLE crawl_attempts IS 'Detailed history of every crawl attempt for analytics and debugging';
COMMENT ON VIEW v_crawler_status IS 'Current status of all crawlers with reliability metrics';
COMMENT ON VIEW v_crawl_error_summary IS 'Summary of errors by type over last 7 days';
COMMENT ON VIEW v_crawl_health IS 'Overall health metrics for the crawling system';
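One plausible way a retry manager could apply the backoff fields above after a failure; the doubling factor and 16x cap are assumptions, not part of the migration. Both SET expressions read the pre-update row, so the capped multiplier is recomputed in each:

UPDATE dispensaries
SET backoff_multiplier = LEAST(backoff_multiplier * 2, 16.0),
    next_crawl_at = NOW()
      + (crawl_frequency_minutes * LEAST(backoff_multiplier * 2, 16.0))
      * INTERVAL '1 minute'
WHERE id = 7;  -- hypothetical dispensary id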
130
backend/migrations/046_raw_payloads_table.sql
Normal file
@@ -0,0 +1,130 @@
-- Migration 046: Raw Payloads Table
--
-- Immutable event stream for raw crawler responses.
-- NEVER delete or overwrite historical payloads.
--
-- Run with:
--   DATABASE_URL="postgresql://..." psql $DATABASE_URL -f migrations/046_raw_payloads_table.sql

-- =====================================================
-- 1) RAW_PAYLOADS TABLE
-- =====================================================
CREATE TABLE IF NOT EXISTS raw_payloads (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),

  -- Store reference
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,

  -- Crawl run reference (nullable for backfilled data)
  crawl_run_id INTEGER REFERENCES crawl_runs(id) ON DELETE SET NULL,

  -- Platform identification
  platform VARCHAR(50) NOT NULL DEFAULT 'dutchie',

  -- Versioning for schema evolution
  payload_version INTEGER NOT NULL DEFAULT 1,

  -- The raw JSON response from the crawler (immutable)
  raw_json JSONB NOT NULL,

  -- Metadata
  product_count INTEGER, -- Number of products in payload
  pricing_type VARCHAR(20), -- 'rec', 'med', or 'both'
  crawl_mode VARCHAR(20), -- 'mode_a', 'mode_b', 'dual'

  -- Timestamps
  fetched_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

  -- Hydration status
  processed BOOLEAN NOT NULL DEFAULT FALSE,
  normalized_at TIMESTAMPTZ,
  hydration_error TEXT,
  hydration_attempts INTEGER DEFAULT 0,

  -- Audit
  created_at TIMESTAMPTZ DEFAULT NOW()
);

-- =====================================================
-- 2) INDEXES FOR EFFICIENT QUERYING
-- =====================================================

-- Primary lookup: unprocessed payloads in FIFO order
CREATE INDEX IF NOT EXISTS idx_raw_payloads_unprocessed
  ON raw_payloads(fetched_at ASC)
  WHERE processed = FALSE;

-- Store-based lookups
CREATE INDEX IF NOT EXISTS idx_raw_payloads_dispensary
  ON raw_payloads(dispensary_id, fetched_at DESC);

-- Platform filtering
CREATE INDEX IF NOT EXISTS idx_raw_payloads_platform
  ON raw_payloads(platform);

-- Crawl run linkage
CREATE INDEX IF NOT EXISTS idx_raw_payloads_crawl_run
  ON raw_payloads(crawl_run_id)
  WHERE crawl_run_id IS NOT NULL;

-- Error tracking
CREATE INDEX IF NOT EXISTS idx_raw_payloads_errors
  ON raw_payloads(hydration_attempts, processed)
  WHERE hydration_error IS NOT NULL;

-- =====================================================
-- 3) HYDRATION LOCKS TABLE (distributed locking)
-- =====================================================
CREATE TABLE IF NOT EXISTS hydration_locks (
  id SERIAL PRIMARY KEY,
  lock_name VARCHAR(100) NOT NULL UNIQUE,
  worker_id VARCHAR(100) NOT NULL,
  acquired_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  expires_at TIMESTAMPTZ NOT NULL,
  heartbeat_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_hydration_locks_expires
  ON hydration_locks(expires_at);

-- =====================================================
-- 4) HYDRATION_RUNS TABLE (audit trail)
-- =====================================================
CREATE TABLE IF NOT EXISTS hydration_runs (
  id SERIAL PRIMARY KEY,
  worker_id VARCHAR(100) NOT NULL,
  started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  finished_at TIMESTAMPTZ,
  status VARCHAR(20) NOT NULL DEFAULT 'running', -- running, completed, failed

  -- Metrics
  payloads_processed INTEGER DEFAULT 0,
  products_upserted INTEGER DEFAULT 0,
  snapshots_created INTEGER DEFAULT 0,
  brands_created INTEGER DEFAULT 0,
  errors_count INTEGER DEFAULT 0,

  -- Error details
  error_message TEXT,

  created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_hydration_runs_status
  ON hydration_runs(status, started_at DESC);

-- =====================================================
-- 5) COMMENTS
-- =====================================================
COMMENT ON TABLE raw_payloads IS 'Immutable event stream of raw crawler responses. NEVER DELETE.';
COMMENT ON COLUMN raw_payloads.raw_json IS 'Complete raw JSON from GraphQL/API response. Immutable.';
COMMENT ON COLUMN raw_payloads.payload_version IS 'Schema version for normalization compatibility.';
COMMENT ON COLUMN raw_payloads.processed IS 'TRUE when payload has been hydrated to canonical tables.';
COMMENT ON COLUMN raw_payloads.normalized_at IS 'When the payload was successfully hydrated.';

COMMENT ON TABLE hydration_locks IS 'Distributed locks for hydration workers to prevent double-processing.';
COMMENT ON TABLE hydration_runs IS 'Audit trail of hydration job executions.';

-- =====================================================
-- MIGRATION COMPLETE
-- =====================================================
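A sketch of the FIFO pick the partial idx_raw_payloads_unprocessed index is built for; in practice a worker would pair this with a hydration_locks entry (or FOR UPDATE SKIP LOCKED) before processing:

SELECT id, dispensary_id, platform, fetched_at
FROM raw_payloads
WHERE processed = FALSE
ORDER BY fetched_at ASC
LIMIT 100;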
473
backend/migrations/047_analytics_infrastructure.sql
Normal file
473
backend/migrations/047_analytics_infrastructure.sql
Normal file
@@ -0,0 +1,473 @@
|
|||||||
|
-- Migration 047: Analytics Infrastructure
|
||||||
|
-- Phase 3: Analytics Dashboards for CannaiQ
|
||||||
|
-- Creates views, functions, and tables for price trends, brand penetration, category growth, etc.
|
||||||
|
|
||||||
|
-- ============================================================
|
||||||
|
-- ANALYTICS CACHE TABLE (for expensive query results)
|
||||||
|
-- ============================================================
|
||||||
|
CREATE TABLE IF NOT EXISTS analytics_cache (
|
||||||
|
id SERIAL PRIMARY KEY,
|
||||||
|
cache_key VARCHAR(255) NOT NULL UNIQUE,
|
||||||
|
cache_data JSONB NOT NULL,
|
||||||
|
computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
|
expires_at TIMESTAMPTZ NOT NULL,
|
||||||
|
query_time_ms INTEGER,
|
||||||
|
created_at TIMESTAMPTZ DEFAULT NOW()
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_analytics_cache_key ON analytics_cache(cache_key);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_analytics_cache_expires ON analytics_cache(expires_at);

-- ============================================================
-- PRICE EXTRACTION HELPER FUNCTIONS
-- Extract pricing from JSONB latest_raw_payload
-- ============================================================
CREATE OR REPLACE FUNCTION extract_min_price(payload JSONB)
RETURNS NUMERIC AS $$
DECLARE
  prices JSONB;
  min_val NUMERIC;
BEGIN
  -- Try recPrices first (retail prices)
  prices := payload->'recPrices';
  IF prices IS NOT NULL AND jsonb_array_length(prices) > 0 THEN
    SELECT MIN(value::NUMERIC) INTO min_val FROM jsonb_array_elements_text(prices) AS value WHERE value ~ '^[0-9.]+$';
    IF min_val IS NOT NULL THEN RETURN min_val; END IF;
  END IF;

  -- Try Prices array
  prices := payload->'Prices';
  IF prices IS NOT NULL AND jsonb_array_length(prices) > 0 THEN
    SELECT MIN(value::NUMERIC) INTO min_val FROM jsonb_array_elements_text(prices) AS value WHERE value ~ '^[0-9.]+$';
    IF min_val IS NOT NULL THEN RETURN min_val; END IF;
  END IF;

  RETURN NULL;
END;
$$ LANGUAGE plpgsql IMMUTABLE;

CREATE OR REPLACE FUNCTION extract_max_price(payload JSONB)
RETURNS NUMERIC AS $$
DECLARE
  prices JSONB;
  max_val NUMERIC;
BEGIN
  prices := payload->'recPrices';
  IF prices IS NOT NULL AND jsonb_array_length(prices) > 0 THEN
    SELECT MAX(value::NUMERIC) INTO max_val FROM jsonb_array_elements_text(prices) AS value WHERE value ~ '^[0-9.]+$';
    IF max_val IS NOT NULL THEN RETURN max_val; END IF;
  END IF;

  prices := payload->'Prices';
  IF prices IS NOT NULL AND jsonb_array_length(prices) > 0 THEN
    SELECT MAX(value::NUMERIC) INTO max_val FROM jsonb_array_elements_text(prices) AS value WHERE value ~ '^[0-9.]+$';
    IF max_val IS NOT NULL THEN RETURN max_val; END IF;
  END IF;

  RETURN NULL;
END;
$$ LANGUAGE plpgsql IMMUTABLE;

CREATE OR REPLACE FUNCTION extract_wholesale_price(payload JSONB)
RETURNS NUMERIC AS $$
DECLARE
  prices JSONB;
  min_val NUMERIC;
BEGIN
  prices := payload->'wholesalePrices';
  IF prices IS NOT NULL AND jsonb_array_length(prices) > 0 THEN
    SELECT MIN(value::NUMERIC) INTO min_val FROM jsonb_array_elements_text(prices) AS value WHERE value ~ '^[0-9.]+$';
    RETURN min_val;
  END IF;
  RETURN NULL;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
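
-- Example (illustrative only): calling the extractors directly on a
-- Dutchie-style payload fragment.
-- SELECT extract_min_price('{"recPrices": ["25.00", "45.00"]}'::jsonb);  -- 25.00
-- SELECT extract_max_price('{"recPrices": ["25.00", "45.00"]}'::jsonb);  -- 45.00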

-- ============================================================
-- VIEW: v_product_pricing
-- Flattened view of products with extracted pricing
-- ============================================================
CREATE OR REPLACE VIEW v_product_pricing AS
SELECT
  dp.id,
  dp.dispensary_id,
  dp.name,
  dp.brand_name,
  dp.brand_id,
  dp.type as category,
  dp.subcategory,
  dp.strain_type,
  dp.stock_status,
  dp.status,
  d.name as store_name,
  d.city,
  d.state,
  extract_min_price(dp.latest_raw_payload) as min_price,
  extract_max_price(dp.latest_raw_payload) as max_price,
  extract_wholesale_price(dp.latest_raw_payload) as wholesale_price,
  dp.thc,
  dp.cbd,
  dp.updated_at,
  dp.created_at
FROM dutchie_products dp
JOIN dispensaries d ON dp.dispensary_id = d.id;
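
-- Example (illustrative only): cheapest in-stock flower per store via the
-- view. The category and state literals are assumptions about feed values.
-- SELECT store_name, MIN(min_price) AS cheapest_flower
--   FROM v_product_pricing
--  WHERE category = 'Flower' AND stock_status = 'in_stock' AND state = 'AZ'
--  GROUP BY store_name
--  ORDER BY cheapest_flower;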

-- ============================================================
-- VIEW: v_brand_store_presence
-- Which brands are in which stores
-- ============================================================
CREATE OR REPLACE VIEW v_brand_store_presence AS
SELECT
  dp.brand_name,
  dp.brand_id,
  dp.dispensary_id,
  d.name as store_name,
  d.city,
  d.state,
  dp.type as category,
  COUNT(*) as sku_count,
  AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
  SUM(CASE WHEN dp.stock_status = 'in_stock' THEN 1 ELSE 0 END) as in_stock_count,
  MAX(dp.updated_at) as last_updated
FROM dutchie_products dp
JOIN dispensaries d ON dp.dispensary_id = d.id
WHERE dp.brand_name IS NOT NULL
GROUP BY dp.brand_name, dp.brand_id, dp.dispensary_id, d.name, d.city, d.state, dp.type;

-- ============================================================
-- VIEW: v_category_store_summary
-- Category breakdown per store
-- ============================================================
CREATE OR REPLACE VIEW v_category_store_summary AS
SELECT
  dp.dispensary_id,
  d.name as store_name,
  d.city,
  d.state,
  dp.type as category,
  COUNT(*) as sku_count,
  COUNT(DISTINCT dp.brand_name) as brand_count,
  AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
  MIN(extract_min_price(dp.latest_raw_payload)) as min_price,
  MAX(extract_max_price(dp.latest_raw_payload)) as max_price,
  SUM(CASE WHEN dp.stock_status = 'in_stock' THEN 1 ELSE 0 END) as in_stock_count
FROM dutchie_products dp
JOIN dispensaries d ON dp.dispensary_id = d.id
WHERE dp.type IS NOT NULL
GROUP BY dp.dispensary_id, d.name, d.city, d.state, dp.type;

-- ============================================================
-- VIEW: v_brand_summary
-- Global brand statistics
-- ============================================================
CREATE OR REPLACE VIEW v_brand_summary AS
SELECT
  dp.brand_name,
  dp.brand_id,
  COUNT(*) as total_skus,
  COUNT(DISTINCT dp.dispensary_id) as store_count,
  COUNT(DISTINCT dp.type) as category_count,
  AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
  MIN(extract_min_price(dp.latest_raw_payload)) as min_price,
  MAX(extract_max_price(dp.latest_raw_payload)) as max_price,
  SUM(CASE WHEN dp.stock_status = 'in_stock' THEN 1 ELSE 0 END) as in_stock_skus,
  ARRAY_AGG(DISTINCT dp.type) FILTER (WHERE dp.type IS NOT NULL) as categories,
  MAX(dp.updated_at) as last_updated
FROM dutchie_products dp
WHERE dp.brand_name IS NOT NULL
GROUP BY dp.brand_name, dp.brand_id
ORDER BY total_skus DESC;

-- ============================================================
-- VIEW: v_category_summary
-- Global category statistics
-- ============================================================
CREATE OR REPLACE VIEW v_category_summary AS
SELECT
  dp.type as category,
  COUNT(*) as total_skus,
  COUNT(DISTINCT dp.brand_name) as brand_count,
  COUNT(DISTINCT dp.dispensary_id) as store_count,
  AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
  MIN(extract_min_price(dp.latest_raw_payload)) as min_price,
  MAX(extract_max_price(dp.latest_raw_payload)) as max_price,
  SUM(CASE WHEN dp.stock_status = 'in_stock' THEN 1 ELSE 0 END) as in_stock_skus
FROM dutchie_products dp
WHERE dp.type IS NOT NULL
GROUP BY dp.type
ORDER BY total_skus DESC;

-- ============================================================
-- VIEW: v_store_summary
-- Store-level statistics
-- ============================================================
CREATE OR REPLACE VIEW v_store_summary AS
SELECT
  d.id as store_id,
  d.name as store_name,
  d.city,
  d.state,
  d.chain_id,
  COUNT(dp.id) as total_skus,
  COUNT(DISTINCT dp.brand_name) as brand_count,
  COUNT(DISTINCT dp.type) as category_count,
  AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
  SUM(CASE WHEN dp.stock_status = 'in_stock' THEN 1 ELSE 0 END) as in_stock_skus,
  d.last_crawl_at,
  d.product_count
FROM dispensaries d
LEFT JOIN dutchie_products dp ON d.id = dp.dispensary_id
GROUP BY d.id, d.name, d.city, d.state, d.chain_id, d.last_crawl_at, d.product_count;

-- ============================================================
-- TABLE: brand_snapshots (for historical brand tracking)
-- ============================================================
CREATE TABLE IF NOT EXISTS brand_snapshots (
  id SERIAL PRIMARY KEY,
  brand_name VARCHAR(255) NOT NULL,
  brand_id VARCHAR(255),
  snapshot_date DATE NOT NULL,
  store_count INTEGER NOT NULL DEFAULT 0,
  total_skus INTEGER NOT NULL DEFAULT 0,
  avg_price NUMERIC(10,2),
  in_stock_skus INTEGER NOT NULL DEFAULT 0,
  categories TEXT[],
  created_at TIMESTAMPTZ DEFAULT NOW(),
  UNIQUE(brand_name, snapshot_date)
);

CREATE INDEX IF NOT EXISTS idx_brand_snapshots_brand ON brand_snapshots(brand_name);
CREATE INDEX IF NOT EXISTS idx_brand_snapshots_date ON brand_snapshots(snapshot_date);

-- ============================================================
-- TABLE: category_snapshots (for historical category tracking)
-- ============================================================
CREATE TABLE IF NOT EXISTS category_snapshots (
  id SERIAL PRIMARY KEY,
  category VARCHAR(255) NOT NULL,
  snapshot_date DATE NOT NULL,
  store_count INTEGER NOT NULL DEFAULT 0,
  brand_count INTEGER NOT NULL DEFAULT 0,
  total_skus INTEGER NOT NULL DEFAULT 0,
  avg_price NUMERIC(10,2),
  in_stock_skus INTEGER NOT NULL DEFAULT 0,
  created_at TIMESTAMPTZ DEFAULT NOW(),
  UNIQUE(category, snapshot_date)
);

CREATE INDEX IF NOT EXISTS idx_category_snapshots_cat ON category_snapshots(category);
CREATE INDEX IF NOT EXISTS idx_category_snapshots_date ON category_snapshots(snapshot_date);

-- ============================================================
-- TABLE: store_change_events (for tracking store changes)
-- ============================================================
CREATE TABLE IF NOT EXISTS store_change_events (
  id SERIAL PRIMARY KEY,
  store_id INTEGER NOT NULL REFERENCES dispensaries(id),
  event_type VARCHAR(50) NOT NULL, -- brand_added, brand_removed, product_added, product_removed, price_change, stock_change
  event_date DATE NOT NULL,
  brand_name VARCHAR(255),
  product_id INTEGER,
  product_name VARCHAR(500),
  category VARCHAR(255),
  old_value TEXT,
  new_value TEXT,
  metadata JSONB,
  created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_store_events_store ON store_change_events(store_id);
CREATE INDEX IF NOT EXISTS idx_store_events_type ON store_change_events(event_type);
CREATE INDEX IF NOT EXISTS idx_store_events_date ON store_change_events(event_date);
CREATE INDEX IF NOT EXISTS idx_store_events_brand ON store_change_events(brand_name);

-- ============================================================
-- TABLE: analytics_alerts
-- ============================================================
CREATE TABLE IF NOT EXISTS analytics_alerts (
  id SERIAL PRIMARY KEY,
  alert_type VARCHAR(50) NOT NULL, -- price_warning, brand_dropped, competitive_intrusion, restock_event
  severity VARCHAR(20) NOT NULL DEFAULT 'info', -- info, warning, critical
  title VARCHAR(255) NOT NULL,
  description TEXT,
  store_id INTEGER REFERENCES dispensaries(id),
  brand_name VARCHAR(255),
  product_id INTEGER,
  category VARCHAR(255),
  metadata JSONB,
  is_read BOOLEAN DEFAULT FALSE,
  created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_analytics_alerts_type ON analytics_alerts(alert_type);
CREATE INDEX IF NOT EXISTS idx_analytics_alerts_read ON analytics_alerts(is_read);
CREATE INDEX IF NOT EXISTS idx_analytics_alerts_created ON analytics_alerts(created_at DESC);

-- ============================================================
-- FUNCTION: Capture daily brand snapshots
-- ============================================================
CREATE OR REPLACE FUNCTION capture_brand_snapshots()
RETURNS INTEGER AS $$
DECLARE
  inserted_count INTEGER;
BEGIN
  INSERT INTO brand_snapshots (brand_name, brand_id, snapshot_date, store_count, total_skus, avg_price, in_stock_skus, categories)
  SELECT
    brand_name,
    brand_id,
    CURRENT_DATE,
    COUNT(DISTINCT dispensary_id),
    COUNT(*),
    AVG(extract_min_price(latest_raw_payload)),
    SUM(CASE WHEN stock_status = 'in_stock' THEN 1 ELSE 0 END),
    ARRAY_AGG(DISTINCT type) FILTER (WHERE type IS NOT NULL)
  FROM dutchie_products
  WHERE brand_name IS NOT NULL
  GROUP BY brand_name, brand_id
  ON CONFLICT (brand_name, snapshot_date)
  DO UPDATE SET
    store_count = EXCLUDED.store_count,
    total_skus = EXCLUDED.total_skus,
    avg_price = EXCLUDED.avg_price,
    in_stock_skus = EXCLUDED.in_stock_skus,
    categories = EXCLUDED.categories;

  GET DIAGNOSTICS inserted_count = ROW_COUNT;
  RETURN inserted_count;
END;
$$ LANGUAGE plpgsql;

-- ============================================================
-- FUNCTION: Capture daily category snapshots
-- ============================================================
CREATE OR REPLACE FUNCTION capture_category_snapshots()
RETURNS INTEGER AS $$
DECLARE
  inserted_count INTEGER;
BEGIN
  INSERT INTO category_snapshots (category, snapshot_date, store_count, brand_count, total_skus, avg_price, in_stock_skus)
  SELECT
    type,
    CURRENT_DATE,
    COUNT(DISTINCT dispensary_id),
    COUNT(DISTINCT brand_name),
    COUNT(*),
    AVG(extract_min_price(latest_raw_payload)),
    SUM(CASE WHEN stock_status = 'in_stock' THEN 1 ELSE 0 END)
  FROM dutchie_products
  WHERE type IS NOT NULL
  GROUP BY type
  ON CONFLICT (category, snapshot_date)
  DO UPDATE SET
    store_count = EXCLUDED.store_count,
    brand_count = EXCLUDED.brand_count,
    total_skus = EXCLUDED.total_skus,
    avg_price = EXCLUDED.avg_price,
    in_stock_skus = EXCLUDED.in_stock_skus;

  GET DIAGNOSTICS inserted_count = ROW_COUNT;
  RETURN inserted_count;
END;
$$ LANGUAGE plpgsql;
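
-- Example (illustrative only): scheduling the daily captures. This assumes
-- the pg_cron extension is installed; otherwise call these functions from an
-- external scheduler.
-- SELECT cron.schedule('brand-snapshots', '5 0 * * *', 'SELECT capture_brand_snapshots()');
-- SELECT cron.schedule('category-snapshots', '10 0 * * *', 'SELECT capture_category_snapshots()');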

-- ============================================================
-- FUNCTION: Calculate price volatility for a product
-- ============================================================
CREATE OR REPLACE FUNCTION calculate_price_volatility(
  p_product_id INTEGER,
  p_days INTEGER DEFAULT 30
)
RETURNS NUMERIC AS $$
DECLARE
  std_dev NUMERIC;
  avg_price NUMERIC;
BEGIN
  -- Using dutchie_product_snapshots if available
  SELECT
    STDDEV(rec_min_price_cents / 100.0),
    AVG(rec_min_price_cents / 100.0)
  INTO std_dev, avg_price
  FROM dutchie_product_snapshots
  WHERE dutchie_product_id = p_product_id
    AND crawled_at >= NOW() - (p_days || ' days')::INTERVAL
    AND rec_min_price_cents IS NOT NULL;

  IF avg_price IS NULL OR avg_price = 0 THEN
    RETURN NULL;
  END IF;

  -- Return coefficient of variation (CV)
  RETURN ROUND((std_dev / avg_price) * 100, 2);
END;
$$ LANGUAGE plpgsql;
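
-- Example (illustrative only): 60-day volatility for one product; the
-- product id is hypothetical.
-- SELECT calculate_price_volatility(12345, 60);  -- CV as a percentage, or NULL if no price history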

-- ============================================================
-- FUNCTION: Get brand penetration stats
-- ============================================================
CREATE OR REPLACE FUNCTION get_brand_penetration(
  p_brand_name VARCHAR,
  p_state VARCHAR DEFAULT NULL
)
RETURNS TABLE (
  total_stores BIGINT,
  stores_carrying BIGINT,
  penetration_pct NUMERIC,
  total_skus BIGINT,
  avg_skus_per_store NUMERIC,
  shelf_share_pct NUMERIC
) AS $$
BEGIN
  RETURN QUERY
  WITH store_counts AS (
    SELECT
      COUNT(DISTINCT d.id) as total,
      COUNT(DISTINCT CASE WHEN dp.brand_name = p_brand_name THEN dp.dispensary_id END) as carrying
    FROM dispensaries d
    LEFT JOIN dutchie_products dp ON d.id = dp.dispensary_id
    WHERE (p_state IS NULL OR d.state = p_state)
  ),
  sku_counts AS (
    SELECT
      COUNT(*) as brand_skus,
      COUNT(DISTINCT dispensary_id) as stores_with_brand
    FROM dutchie_products
    WHERE brand_name = p_brand_name
  ),
  total_skus AS (
    SELECT COUNT(*) as total FROM dutchie_products dp
    JOIN dispensaries d ON dp.dispensary_id = d.id
    WHERE (p_state IS NULL OR d.state = p_state)
  )
  SELECT
    sc.total,
    sc.carrying,
    ROUND((sc.carrying::NUMERIC / NULLIF(sc.total, 0)) * 100, 2),
    skc.brand_skus,
    ROUND(skc.brand_skus::NUMERIC / NULLIF(skc.stores_with_brand, 0), 2),
    ROUND((skc.brand_skus::NUMERIC / NULLIF(ts.total, 0)) * 100, 2)
  FROM store_counts sc, sku_counts skc, total_skus ts;
END;
$$ LANGUAGE plpgsql;
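
-- Example (illustrative only): penetration stats for a hypothetical brand,
-- nationally and restricted to Arizona.
-- SELECT * FROM get_brand_penetration('Example Brand');
-- SELECT * FROM get_brand_penetration('Example Brand', 'AZ');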

-- ============================================================
-- Initial snapshot capture (run manually if needed)
-- ============================================================
-- Note: Run these after migration to capture initial snapshots:
-- SELECT capture_brand_snapshots();
-- SELECT capture_category_snapshots();

-- ============================================================
-- Grant permissions
-- ============================================================
-- Views are accessible to all roles by default

COMMENT ON VIEW v_product_pricing IS 'Flattened product view with extracted pricing from JSONB';
COMMENT ON VIEW v_brand_store_presence IS 'Brand presence across stores with SKU counts';
COMMENT ON VIEW v_brand_summary IS 'Global brand statistics';
COMMENT ON VIEW v_category_summary IS 'Global category statistics';
COMMENT ON VIEW v_store_summary IS 'Store-level statistics';
COMMENT ON TABLE analytics_cache IS 'Cache for expensive analytics queries';
COMMENT ON TABLE brand_snapshots IS 'Historical daily snapshots of brand metrics';
COMMENT ON TABLE category_snapshots IS 'Historical daily snapshots of category metrics';
COMMENT ON TABLE store_change_events IS 'Log of brand/product changes at stores';
COMMENT ON TABLE analytics_alerts IS 'Analytics-generated alerts and notifications';
598
backend/migrations/048_production_sync_monitoring.sql
Normal file
@@ -0,0 +1,598 @@
-- Migration 048: Production Sync + Monitoring Infrastructure
-- Phase 5: Full Production Sync + Monitoring
--
-- Creates:
-- 1. Sync orchestrator tables
-- 2. Dead-letter queue (DLQ)
-- 3. System metrics tracking
-- 4. Integrity check results
-- 5. Auto-fix audit log

-- ============================================================
-- SYNC ORCHESTRATOR TABLES
-- ============================================================

-- Orchestrator state and control
CREATE TABLE IF NOT EXISTS sync_orchestrator_state (
  id INTEGER PRIMARY KEY DEFAULT 1 CHECK (id = 1), -- Singleton row
  status VARCHAR(20) NOT NULL DEFAULT 'SLEEPING', -- RUNNING, SLEEPING, LOCKED, PAUSED
  current_worker_id VARCHAR(100),
  last_heartbeat_at TIMESTAMPTZ,
  last_run_started_at TIMESTAMPTZ,
  last_run_completed_at TIMESTAMPTZ,
  last_run_duration_ms INTEGER,
  last_run_payloads_processed INTEGER DEFAULT 0,
  last_run_errors INTEGER DEFAULT 0,
  consecutive_failures INTEGER DEFAULT 0,
  is_paused BOOLEAN DEFAULT FALSE,
  pause_reason TEXT,
  config JSONB DEFAULT '{
    "batchSize": 50,
    "pollIntervalMs": 5000,
    "maxRetries": 3,
    "lockTimeoutMs": 300000,
    "enableAnalyticsPrecompute": true,
    "enableIntegrityChecks": true
  }'::jsonb,
  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

-- Insert singleton row if not exists
INSERT INTO sync_orchestrator_state (id) VALUES (1) ON CONFLICT (id) DO NOTHING;
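
-- Example (illustrative sketch only): how a worker might claim the singleton
-- row and heartbeat. The worker id and staleness window are assumptions; the
-- real claim logic lives in the backend orchestrator service.
-- UPDATE sync_orchestrator_state
--    SET status = 'RUNNING', current_worker_id = 'worker-1',
--        last_heartbeat_at = NOW(), updated_at = NOW()
--  WHERE id = 1
--    AND (status = 'SLEEPING' OR last_heartbeat_at < NOW() - INTERVAL '5 minutes');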

-- Sync run history
CREATE TABLE IF NOT EXISTS sync_runs (
  id SERIAL PRIMARY KEY,
  run_id UUID DEFAULT gen_random_uuid() UNIQUE NOT NULL,
  worker_id VARCHAR(100) NOT NULL,
  status VARCHAR(20) NOT NULL DEFAULT 'running', -- running, completed, failed, cancelled
  started_at TIMESTAMPTZ DEFAULT NOW(),
  finished_at TIMESTAMPTZ,
  duration_ms INTEGER,

  -- Metrics
  payloads_queued INTEGER DEFAULT 0,
  payloads_processed INTEGER DEFAULT 0,
  payloads_skipped INTEGER DEFAULT 0,
  payloads_failed INTEGER DEFAULT 0,
  payloads_dlq INTEGER DEFAULT 0,

  products_upserted INTEGER DEFAULT 0,
  products_inserted INTEGER DEFAULT 0,
  products_updated INTEGER DEFAULT 0,
  products_discontinued INTEGER DEFAULT 0,

  snapshots_created INTEGER DEFAULT 0,

  -- Error tracking
  errors JSONB DEFAULT '[]'::jsonb,
  error_summary TEXT,

  -- Diff stats (before/after)
  diff_stats JSONB DEFAULT '{}'::jsonb,

  -- Analytics precompute triggered
  analytics_updated BOOLEAN DEFAULT FALSE,
  analytics_duration_ms INTEGER,

  created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_sync_runs_status ON sync_runs(status);
CREATE INDEX IF NOT EXISTS idx_sync_runs_started_at ON sync_runs(started_at DESC);
CREATE INDEX IF NOT EXISTS idx_sync_runs_run_id ON sync_runs(run_id);

-- ============================================================
-- DEAD-LETTER QUEUE (DLQ)
-- ============================================================

-- DLQ for failed payloads
CREATE TABLE IF NOT EXISTS raw_payloads_dlq (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  original_payload_id UUID NOT NULL,
  dispensary_id INTEGER REFERENCES dispensaries(id),
  state_code VARCHAR(2),
  platform VARCHAR(50) DEFAULT 'dutchie',

  -- Original payload data (preserved)
  raw_json JSONB NOT NULL,
  product_count INTEGER,
  pricing_type VARCHAR(10),
  crawl_mode VARCHAR(20),

  -- DLQ metadata
  moved_to_dlq_at TIMESTAMPTZ DEFAULT NOW(),
  failure_count INTEGER DEFAULT 0,

  -- Error history (array of error objects)
  error_history JSONB DEFAULT '[]'::jsonb,
  last_error_type VARCHAR(50),
  last_error_message TEXT,
  last_error_at TIMESTAMPTZ,

  -- Retry tracking
  retry_count INTEGER DEFAULT 0,
  last_retry_at TIMESTAMPTZ,
  next_retry_at TIMESTAMPTZ,

  -- Resolution
  status VARCHAR(20) DEFAULT 'pending', -- pending, retrying, resolved, abandoned
  resolved_at TIMESTAMPTZ,
  resolved_by VARCHAR(100),
  resolution_notes TEXT,

  created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_dlq_status ON raw_payloads_dlq(status);
CREATE INDEX IF NOT EXISTS idx_dlq_dispensary ON raw_payloads_dlq(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_dlq_error_type ON raw_payloads_dlq(last_error_type);
CREATE INDEX IF NOT EXISTS idx_dlq_moved_at ON raw_payloads_dlq(moved_to_dlq_at DESC);

-- ============================================================
-- SYSTEM METRICS
-- ============================================================

-- System metrics time series
CREATE TABLE IF NOT EXISTS system_metrics (
  id SERIAL PRIMARY KEY,
  metric_name VARCHAR(100) NOT NULL,
  metric_value NUMERIC NOT NULL,
  labels JSONB DEFAULT '{}',
  recorded_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_metrics_name_time ON system_metrics(metric_name, recorded_at DESC);
CREATE INDEX IF NOT EXISTS idx_metrics_recorded_at ON system_metrics(recorded_at DESC);

-- Metrics snapshot (current state, updated continuously)
CREATE TABLE IF NOT EXISTS system_metrics_current (
  metric_name VARCHAR(100) PRIMARY KEY,
  metric_value NUMERIC NOT NULL,
  labels JSONB DEFAULT '{}',
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

-- Error buckets for classification
CREATE TABLE IF NOT EXISTS error_buckets (
  id SERIAL PRIMARY KEY,
  error_type VARCHAR(50) NOT NULL,
  error_message TEXT,
  source_table VARCHAR(50),
  source_id TEXT,
  dispensary_id INTEGER,
  state_code VARCHAR(2),
  context JSONB DEFAULT '{}',
  occurred_at TIMESTAMPTZ DEFAULT NOW(),
  acknowledged BOOLEAN DEFAULT FALSE,
  acknowledged_at TIMESTAMPTZ,
  acknowledged_by VARCHAR(100)
);

CREATE INDEX IF NOT EXISTS idx_error_buckets_type ON error_buckets(error_type);
CREATE INDEX IF NOT EXISTS idx_error_buckets_occurred ON error_buckets(occurred_at DESC);
CREATE INDEX IF NOT EXISTS idx_error_buckets_unacked ON error_buckets(acknowledged) WHERE acknowledged = FALSE;

-- ============================================================
-- INTEGRITY CHECK RESULTS
-- ============================================================

CREATE TABLE IF NOT EXISTS integrity_check_runs (
  id SERIAL PRIMARY KEY,
  run_id UUID DEFAULT gen_random_uuid() UNIQUE NOT NULL,
  check_type VARCHAR(50) NOT NULL, -- daily, on_demand, scheduled
  triggered_by VARCHAR(100),
  started_at TIMESTAMPTZ DEFAULT NOW(),
  finished_at TIMESTAMPTZ,
  status VARCHAR(20) DEFAULT 'running', -- running, completed, failed

  -- Results summary
  total_checks INTEGER DEFAULT 0,
  passed_checks INTEGER DEFAULT 0,
  failed_checks INTEGER DEFAULT 0,
  warning_checks INTEGER DEFAULT 0,

  -- Detailed results
  results JSONB DEFAULT '[]'::jsonb,

  created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_integrity_runs_status ON integrity_check_runs(status);
CREATE INDEX IF NOT EXISTS idx_integrity_runs_started ON integrity_check_runs(started_at DESC);

-- Individual integrity check results
CREATE TABLE IF NOT EXISTS integrity_check_results (
  id SERIAL PRIMARY KEY,
  run_id UUID REFERENCES integrity_check_runs(run_id) ON DELETE CASCADE,
  check_name VARCHAR(100) NOT NULL,
  check_category VARCHAR(50) NOT NULL,
  status VARCHAR(20) NOT NULL, -- passed, failed, warning, skipped

  -- Check details
  expected_value TEXT,
  actual_value TEXT,
  difference TEXT,
  affected_count INTEGER DEFAULT 0,

  -- Context
  details JSONB DEFAULT '{}',
  affected_ids JSONB DEFAULT '[]'::jsonb,

  -- Remediation
  can_auto_fix BOOLEAN DEFAULT FALSE,
  fix_routine VARCHAR(100),

  checked_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_integrity_results_run ON integrity_check_results(run_id);
CREATE INDEX IF NOT EXISTS idx_integrity_results_status ON integrity_check_results(status);

-- ============================================================
-- AUTO-FIX AUDIT LOG
-- ============================================================

CREATE TABLE IF NOT EXISTS auto_fix_runs (
  id SERIAL PRIMARY KEY,
  run_id UUID DEFAULT gen_random_uuid() UNIQUE NOT NULL,
  routine_name VARCHAR(100) NOT NULL,
  triggered_by VARCHAR(100) NOT NULL,
  trigger_type VARCHAR(20) NOT NULL, -- manual, auto, scheduled

  started_at TIMESTAMPTZ DEFAULT NOW(),
  finished_at TIMESTAMPTZ,
  status VARCHAR(20) DEFAULT 'running', -- running, completed, failed, rolled_back

  -- What was changed
  rows_affected INTEGER DEFAULT 0,
  changes JSONB DEFAULT '[]'::jsonb,

  -- Dry run support
  is_dry_run BOOLEAN DEFAULT FALSE,
  dry_run_preview JSONB,

  -- Error handling
  error_message TEXT,

  created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_fix_runs_routine ON auto_fix_runs(routine_name);
CREATE INDEX IF NOT EXISTS idx_fix_runs_started ON auto_fix_runs(started_at DESC);

-- ============================================================
-- ALERTS TABLE
-- ============================================================

CREATE TABLE IF NOT EXISTS system_alerts (
  id SERIAL PRIMARY KEY,
  alert_type VARCHAR(50) NOT NULL,
  severity VARCHAR(20) NOT NULL, -- info, warning, error, critical
  title VARCHAR(255) NOT NULL,
  message TEXT,
  source VARCHAR(100),

  -- Context
  context JSONB DEFAULT '{}',

  -- State
  status VARCHAR(20) DEFAULT 'active', -- active, acknowledged, resolved, muted
  acknowledged_at TIMESTAMPTZ,
  acknowledged_by VARCHAR(100),
  resolved_at TIMESTAMPTZ,
  resolved_by VARCHAR(100),

  -- Deduplication
  fingerprint VARCHAR(64), -- Hash for dedup
  occurrence_count INTEGER DEFAULT 1,
  first_occurred_at TIMESTAMPTZ DEFAULT NOW(),
  last_occurred_at TIMESTAMPTZ DEFAULT NOW(),

  created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_alerts_status ON system_alerts(status);
CREATE INDEX IF NOT EXISTS idx_alerts_severity ON system_alerts(severity);
CREATE INDEX IF NOT EXISTS idx_alerts_type ON system_alerts(alert_type);
CREATE INDEX IF NOT EXISTS idx_alerts_fingerprint ON system_alerts(fingerprint);
CREATE INDEX IF NOT EXISTS idx_alerts_active ON system_alerts(status, created_at DESC) WHERE status = 'active';

-- ============================================================
-- HELPER VIEWS
-- ============================================================

-- Current sync status view
CREATE OR REPLACE VIEW v_sync_status AS
SELECT
  sos.status as orchestrator_status,
  sos.current_worker_id,
  sos.last_heartbeat_at,
  sos.is_paused,
  sos.pause_reason,
  sos.consecutive_failures,
  sos.last_run_started_at,
  sos.last_run_completed_at,
  sos.last_run_duration_ms,
  sos.last_run_payloads_processed,
  sos.last_run_errors,
  sos.config,
  (SELECT COUNT(*) FROM raw_payloads WHERE processed = FALSE) as unprocessed_payloads,
  (SELECT COUNT(*) FROM raw_payloads_dlq WHERE status = 'pending') as dlq_pending,
  (SELECT COUNT(*) FROM system_alerts WHERE status = 'active') as active_alerts,
  (
    SELECT json_build_object(
      'total', COUNT(*),
      'completed', COUNT(*) FILTER (WHERE status = 'completed'),
      'failed', COUNT(*) FILTER (WHERE status = 'failed')
    )
    FROM sync_runs
    WHERE started_at >= NOW() - INTERVAL '24 hours'
  ) as runs_24h
FROM sync_orchestrator_state sos
WHERE sos.id = 1;

-- DLQ summary view
CREATE OR REPLACE VIEW v_dlq_summary AS
SELECT
  status,
  last_error_type,
  COUNT(*) as count,
  MIN(moved_to_dlq_at) as oldest,
  MAX(moved_to_dlq_at) as newest
FROM raw_payloads_dlq
GROUP BY status, last_error_type
ORDER BY count DESC;

-- Error bucket summary (last 24h)
CREATE OR REPLACE VIEW v_error_summary AS
SELECT
  error_type,
  COUNT(*) as count,
  COUNT(*) FILTER (WHERE acknowledged = FALSE) as unacknowledged,
  MIN(occurred_at) as first_occurred,
  MAX(occurred_at) as last_occurred
FROM error_buckets
WHERE occurred_at >= NOW() - INTERVAL '24 hours'
GROUP BY error_type
ORDER BY count DESC;

-- Metrics summary view
CREATE OR REPLACE VIEW v_metrics_summary AS
SELECT
  metric_name,
  metric_value,
  labels,
  updated_at,
  NOW() - updated_at as age
FROM system_metrics_current
ORDER BY metric_name;

-- ============================================================
-- HELPER FUNCTIONS
-- ============================================================

-- Record a metric
CREATE OR REPLACE FUNCTION record_metric(
  p_name VARCHAR(100),
  p_value NUMERIC,
  p_labels JSONB DEFAULT '{}'
) RETURNS VOID AS $$
BEGIN
  -- Insert into time series
  INSERT INTO system_metrics (metric_name, metric_value, labels)
  VALUES (p_name, p_value, p_labels);

  -- Upsert current value
  INSERT INTO system_metrics_current (metric_name, metric_value, labels, updated_at)
  VALUES (p_name, p_value, p_labels, NOW())
  ON CONFLICT (metric_name) DO UPDATE SET
    metric_value = EXCLUDED.metric_value,
    labels = EXCLUDED.labels,
    updated_at = NOW();
END;
$$ LANGUAGE plpgsql;
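
-- Example (illustrative only): record the unprocessed-payload backlog with a
-- label; the label value is an assumption.
-- SELECT record_metric('payloads_unprocessed',
--                      (SELECT COUNT(*) FROM raw_payloads WHERE processed = FALSE),
--                      '{"source": "manual"}'::jsonb);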

-- Record an error
CREATE OR REPLACE FUNCTION record_error(
  p_type VARCHAR(50),
  p_message TEXT,
  p_source_table VARCHAR(50) DEFAULT NULL,
  p_source_id TEXT DEFAULT NULL,
  p_dispensary_id INTEGER DEFAULT NULL,
  p_context JSONB DEFAULT '{}'
) RETURNS INTEGER AS $$
DECLARE
  v_id INTEGER;
BEGIN
  INSERT INTO error_buckets (
    error_type, error_message, source_table, source_id,
    dispensary_id, context
  )
  VALUES (
    p_type, p_message, p_source_table, p_source_id,
    p_dispensary_id, p_context
  )
  RETURNING id INTO v_id;

  -- Update error count metric
  PERFORM record_metric(
    'error_count_' || p_type,
    COALESCE((SELECT metric_value FROM system_metrics_current WHERE metric_name = 'error_count_' || p_type), 0) + 1
  );

  RETURN v_id;
END;
$$ LANGUAGE plpgsql;
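
-- Example (illustrative only): classifying a hydration failure. The error
-- type, ids, and context are hypothetical values.
-- SELECT record_error('HYDRATION_PARSE', 'Missing recPrices array',
--                     'raw_payloads', 'a1b2c3', 42, '{"field": "recPrices"}'::jsonb);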

-- Create or update alert (with deduplication)
CREATE OR REPLACE FUNCTION upsert_alert(
  p_type VARCHAR(50),
  p_severity VARCHAR(20),
  p_title VARCHAR(255),
  p_message TEXT DEFAULT NULL,
  p_source VARCHAR(100) DEFAULT NULL,
  p_context JSONB DEFAULT '{}'
) RETURNS INTEGER AS $$
DECLARE
  v_fingerprint VARCHAR(64);
  v_id INTEGER;
BEGIN
  -- Generate fingerprint for dedup
  v_fingerprint := md5(p_type || p_title || COALESCE(p_source, ''));

  -- Try to find existing active alert
  SELECT id INTO v_id
  FROM system_alerts
  WHERE fingerprint = v_fingerprint AND status = 'active';

  IF v_id IS NOT NULL THEN
    -- Update existing alert
    UPDATE system_alerts
    SET occurrence_count = occurrence_count + 1,
        last_occurred_at = NOW(),
        context = p_context
    WHERE id = v_id;
  ELSE
    -- Create new alert
    INSERT INTO system_alerts (
      alert_type, severity, title, message, source, context, fingerprint
    )
    VALUES (
      p_type, p_severity, p_title, p_message, p_source, p_context, v_fingerprint
    )
    RETURNING id INTO v_id;
  END IF;

  RETURN v_id;
END;
$$ LANGUAGE plpgsql;
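
-- Example (illustrative only): repeated calls with the same type/title/source
-- bump occurrence_count on the open alert instead of creating a duplicate.
-- The alert values below are hypothetical.
-- SELECT upsert_alert('SYNC_STALLED', 'warning', 'Orchestrator heartbeat stale',
--                     'No heartbeat for 10 minutes', 'orchestrator');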

-- Move payload to DLQ
CREATE OR REPLACE FUNCTION move_to_dlq(
  p_payload_id UUID,
  p_error_type VARCHAR(50),
  p_error_message TEXT
) RETURNS UUID AS $$
DECLARE
  v_dlq_id UUID;
  v_payload RECORD;
BEGIN
  -- Get the original payload
  SELECT * INTO v_payload
  FROM raw_payloads
  WHERE id = p_payload_id;

  IF v_payload IS NULL THEN
    RAISE EXCEPTION 'Payload not found: %', p_payload_id;
  END IF;

  -- Insert into DLQ
  INSERT INTO raw_payloads_dlq (
    original_payload_id, dispensary_id, state_code, platform,
    raw_json, product_count, pricing_type, crawl_mode,
    failure_count, last_error_type, last_error_message, last_error_at,
    error_history
  )
  VALUES (
    p_payload_id, v_payload.dispensary_id,
    (SELECT state FROM dispensaries WHERE id = v_payload.dispensary_id),
    v_payload.platform,
    v_payload.raw_json, v_payload.product_count, v_payload.pricing_type, v_payload.crawl_mode,
    v_payload.hydration_attempts,
    p_error_type, p_error_message, NOW(),
    COALESCE(v_payload.hydration_error::jsonb, '[]'::jsonb) || jsonb_build_object(
      'type', p_error_type,
      'message', p_error_message,
      'at', NOW()
    )
  )
  RETURNING id INTO v_dlq_id;

  -- Mark original as processed (moved to DLQ)
  UPDATE raw_payloads
  SET processed = TRUE,
      hydration_error = 'Moved to DLQ: ' || p_error_message
  WHERE id = p_payload_id;

  -- Record metric
  PERFORM record_metric('payloads_dlq_total',
    COALESCE((SELECT metric_value FROM system_metrics_current WHERE metric_name = 'payloads_dlq_total'), 0) + 1
  );

  -- Create alert for DLQ
  PERFORM upsert_alert(
    'DLQ_ARRIVAL',
    'warning',
    'Payload moved to Dead-Letter Queue',
    p_error_message,
    'hydration',
    jsonb_build_object('payload_id', p_payload_id, 'dlq_id', v_dlq_id, 'error_type', p_error_type)
  );

  RETURN v_dlq_id;
END;
$$ LANGUAGE plpgsql;
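
-- Example (illustrative only): parking a failing payload. The UUID and error
-- type are hypothetical; the function raises if the payload does not exist.
-- SELECT move_to_dlq('00000000-0000-0000-0000-000000000000'::uuid,
--                    'HYDRATION_TIMEOUT', 'Hydration exceeded 300s lock timeout');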

-- Cleanup old metrics (keep 7 days of time series)
CREATE OR REPLACE FUNCTION cleanup_old_metrics() RETURNS INTEGER AS $$
DECLARE
  v_deleted INTEGER;
BEGIN
  DELETE FROM system_metrics
  WHERE recorded_at < NOW() - INTERVAL '7 days';

  GET DIAGNOSTICS v_deleted = ROW_COUNT;
  RETURN v_deleted;
END;
$$ LANGUAGE plpgsql;
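
-- Example (illustrative only): run the cleanup nightly. Assumes pg_cron;
-- otherwise invoke from an external scheduler.
-- SELECT cron.schedule('metrics-cleanup', '0 3 * * *', 'SELECT cleanup_old_metrics()');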

-- ============================================================
-- ENSURE RAW_PAYLOADS HAS REQUIRED COLUMNS
-- ============================================================

-- Add state column to raw_payloads if not exists
DO $$
BEGIN
  IF NOT EXISTS (
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'raw_payloads' AND column_name = 'state_code'
  ) THEN
    ALTER TABLE raw_payloads ADD COLUMN state_code VARCHAR(2);
  END IF;
END $$;

-- ============================================================
-- INITIAL METRICS
-- ============================================================

-- Initialize core metrics
INSERT INTO system_metrics_current (metric_name, metric_value, labels)
VALUES
  ('payloads_unprocessed', 0, '{}'),
  ('payloads_processed_today', 0, '{}'),
  ('hydration_errors', 0, '{}'),
  ('hydration_success_rate', 100, '{}'),
  ('canonical_rows_inserted', 0, '{}'),
  ('canonical_rows_updated', 0, '{}'),
  ('canonical_rows_discontinued', 0, '{}'),
  ('snapshot_volume', 0, '{}'),
  ('ingestion_latency_avg_ms', 0, '{}'),
  ('payloads_dlq_total', 0, '{}')
ON CONFLICT (metric_name) DO NOTHING;

-- ============================================================
-- COMMENTS
-- ============================================================

COMMENT ON TABLE sync_orchestrator_state IS 'Singleton table tracking orchestrator status and config';
COMMENT ON TABLE sync_runs IS 'History of sync runs with metrics';
COMMENT ON TABLE raw_payloads_dlq IS 'Dead-letter queue for failed payloads';
COMMENT ON TABLE system_metrics IS 'Time-series metrics storage';
COMMENT ON TABLE system_metrics_current IS 'Current metric values (fast lookup)';
COMMENT ON TABLE error_buckets IS 'Classified errors for monitoring';
COMMENT ON TABLE integrity_check_runs IS 'Integrity check execution history';
COMMENT ON TABLE integrity_check_results IS 'Individual check results';
COMMENT ON TABLE auto_fix_runs IS 'Audit log for auto-fix routines';
COMMENT ON TABLE system_alerts IS 'System alerts with deduplication';
750
backend/migrations/050_cannaiq_canonical_v2.sql
Normal file
@@ -0,0 +1,750 @@
-- ============================================================================
-- Migration 050: CannaiQ Canonical Schema v2
-- ============================================================================
--
-- Purpose: Add canonical tables for multi-state analytics, pricing engine,
-- promotions, intelligence, and brand/buyer portals.
--
-- RULES:
-- - STRICTLY ADDITIVE (no DROP, DELETE, TRUNCATE, or ALTER column type)
-- - All new tables use IF NOT EXISTS
-- - All new columns use ADD COLUMN IF NOT EXISTS
-- - All indexes use IF NOT EXISTS
-- - Compatible with existing dutchie_products, dispensaries, etc.
--
-- Run with:
-- psql $CANNAIQ_DB_URL -f migrations/050_cannaiq_canonical_v2.sql
--
-- ============================================================================


-- ============================================================================
-- SECTION 1: STATES TABLE
-- ============================================================================
-- Reference table for US states. May already exist from migrations 041/043;
-- this block is idempotent.

CREATE TABLE IF NOT EXISTS states (
  id SERIAL PRIMARY KEY,
  code VARCHAR(2) NOT NULL UNIQUE,
  name VARCHAR(100) NOT NULL,
  timezone VARCHAR(50) DEFAULT 'America/Phoenix',
  is_active BOOLEAN DEFAULT TRUE,
  crawl_enabled BOOLEAN DEFAULT TRUE,
  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

-- Insert states if not present
INSERT INTO states (code, name, timezone) VALUES
  ('AZ', 'Arizona', 'America/Phoenix'),
  ('CA', 'California', 'America/Los_Angeles'),
  ('CO', 'Colorado', 'America/Denver'),
  ('FL', 'Florida', 'America/New_York'),
  ('IL', 'Illinois', 'America/Chicago'),
  ('MA', 'Massachusetts', 'America/New_York'),
  ('MD', 'Maryland', 'America/New_York'),
  ('MI', 'Michigan', 'America/Detroit'),
  ('MO', 'Missouri', 'America/Chicago'),
  ('NV', 'Nevada', 'America/Los_Angeles'),
  ('NJ', 'New Jersey', 'America/New_York'),
  ('NY', 'New York', 'America/New_York'),
  ('OH', 'Ohio', 'America/New_York'),
  ('OK', 'Oklahoma', 'America/Chicago'),
  ('OR', 'Oregon', 'America/Los_Angeles'),
  ('PA', 'Pennsylvania', 'America/New_York'),
  ('WA', 'Washington', 'America/Los_Angeles')
ON CONFLICT (code) DO UPDATE SET
  timezone = EXCLUDED.timezone,
  updated_at = NOW();

CREATE INDEX IF NOT EXISTS idx_states_code ON states(code);
CREATE INDEX IF NOT EXISTS idx_states_active ON states(is_active) WHERE is_active = TRUE;

COMMENT ON TABLE states IS 'US states where CannaiQ operates. Single source of truth for state configuration.';


-- ============================================================================
-- SECTION 2: CHAINS TABLE (Retail Groups)
-- ============================================================================
-- Chains are multi-location operators like Curaleaf, Trulieve, Harvest, etc.

CREATE TABLE IF NOT EXISTS chains (
  id SERIAL PRIMARY KEY,
  name VARCHAR(255) NOT NULL,
  slug VARCHAR(255) NOT NULL UNIQUE,

  -- Branding
  website_url TEXT,
  logo_url TEXT,
  description TEXT,

  -- Business info
  headquarters_city VARCHAR(100),
  headquarters_state_id INTEGER REFERENCES states(id),
  founded_year INTEGER,

  -- Status
  is_active BOOLEAN DEFAULT TRUE,
  is_public BOOLEAN DEFAULT FALSE, -- Publicly traded?
  stock_ticker VARCHAR(10),

  -- Metadata
  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_chains_slug ON chains(slug);
CREATE INDEX IF NOT EXISTS idx_chains_active ON chains(is_active) WHERE is_active = TRUE;

COMMENT ON TABLE chains IS 'Retail chains/groups that own multiple dispensary locations.';


-- ============================================================================
-- SECTION 3: CANONICAL BRANDS TABLE
-- ============================================================================
-- This is the master brand catalog across all providers and states.
-- Distinct from the per-store `brands` table which tracks store-level brand presence.

CREATE TABLE IF NOT EXISTS canonical_brands (
  id SERIAL PRIMARY KEY,
  name VARCHAR(255) NOT NULL,
  slug VARCHAR(255) NOT NULL UNIQUE,

  -- External IDs from various platforms
  dutchie_brand_id VARCHAR(100),
  jane_brand_id VARCHAR(100),
  treez_brand_id VARCHAR(100),
  weedmaps_brand_id VARCHAR(100),

  -- Branding
  logo_url TEXT,
  local_logo_path TEXT, -- Local storage path
  website_url TEXT,
  instagram_handle VARCHAR(100),
  description TEXT,

  -- Classification
  is_portfolio_brand BOOLEAN DEFAULT FALSE, -- TRUE if brand we represent
  is_house_brand BOOLEAN DEFAULT FALSE, -- TRUE if dispensary house brand
  parent_company VARCHAR(255), -- Parent company name if subsidiary

  -- State presence
  states_available TEXT[], -- Array of state codes where brand is present

  -- Status
  is_active BOOLEAN DEFAULT TRUE,
  is_verified BOOLEAN DEFAULT FALSE, -- Manually verified brand info
  verified_at TIMESTAMPTZ,

  -- Metadata
  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_canonical_brands_slug ON canonical_brands(slug);
CREATE INDEX IF NOT EXISTS idx_canonical_brands_dutchie ON canonical_brands(dutchie_brand_id) WHERE dutchie_brand_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_canonical_brands_portfolio ON canonical_brands(is_portfolio_brand) WHERE is_portfolio_brand = TRUE;
CREATE INDEX IF NOT EXISTS idx_canonical_brands_states ON canonical_brands USING GIN(states_available);

COMMENT ON TABLE canonical_brands IS 'Canonical brand catalog across all providers. Master brand reference.';
COMMENT ON COLUMN canonical_brands.is_portfolio_brand IS 'TRUE if this is a brand CannaiQ represents/manages.';
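
-- Example (illustrative only): find active brands available in Arizona using
-- the GIN-indexed states_available array.
-- SELECT name, slug FROM canonical_brands
--  WHERE is_active AND states_available @> ARRAY['AZ'];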


-- ============================================================================
-- SECTION 4: CRAWL_RUNS TABLE
-- ============================================================================
-- One record per crawl execution. Links to snapshots.

CREATE TABLE IF NOT EXISTS crawl_runs (
  id SERIAL PRIMARY KEY,
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
  state_id INTEGER REFERENCES states(id),

  -- Provider info
  provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',

  -- Timing
  started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  finished_at TIMESTAMPTZ,
  duration_ms INTEGER,

  -- Status
  status VARCHAR(20) NOT NULL DEFAULT 'running', -- running, success, failed, partial
  error_code VARCHAR(50),
  error_message TEXT,
  http_status INTEGER,

  -- Results
  products_found INTEGER DEFAULT 0,
  products_new INTEGER DEFAULT 0,
  products_updated INTEGER DEFAULT 0,
  products_missing INTEGER DEFAULT 0, -- Products gone from feed
  snapshots_written INTEGER DEFAULT 0,

  -- Infrastructure
  worker_id VARCHAR(100),
  worker_hostname VARCHAR(100),
  proxy_used TEXT,
  trigger_type VARCHAR(50) DEFAULT 'scheduled', -- scheduled, manual, api

  -- Metadata
  metadata JSONB DEFAULT '{}',

  created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary ON crawl_runs(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_state ON crawl_runs(state_id) WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_crawl_runs_status ON crawl_runs(status);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_started ON crawl_runs(started_at DESC);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary_started ON crawl_runs(dispensary_id, started_at DESC);

COMMENT ON TABLE crawl_runs IS 'Each crawl execution. Links to snapshots and traces.';


-- ============================================================================
-- SECTION 5: STORE_PRODUCTS TABLE (Current Menu State)
-- ============================================================================
-- Canonical representation of what's currently on the menu.
-- Provider-agnostic structure for analytics.

CREATE TABLE IF NOT EXISTS store_products (
  id SERIAL PRIMARY KEY,
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
  state_id INTEGER REFERENCES states(id),

  -- Links to canonical entities
  canonical_brand_id INTEGER REFERENCES canonical_brands(id) ON DELETE SET NULL,
  category_id INTEGER REFERENCES categories(id) ON DELETE SET NULL,

  -- Provider-specific identifiers
  provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',
  provider_product_id VARCHAR(100) NOT NULL, -- Platform product ID
  provider_brand_id VARCHAR(100), -- Platform brand ID
  enterprise_product_id VARCHAR(100), -- Cross-store product ID

  -- Raw data from platform (not normalized)
  name VARCHAR(500) NOT NULL,
  brand_name VARCHAR(255),
  category VARCHAR(100),
  subcategory VARCHAR(100),
  strain_type VARCHAR(50),
  description TEXT,

  -- Pricing (current)
  price_rec NUMERIC(10,2),
  price_med NUMERIC(10,2),
  price_rec_special NUMERIC(10,2),
  price_med_special NUMERIC(10,2),
  is_on_special BOOLEAN DEFAULT FALSE,
  special_name TEXT,
  discount_percent NUMERIC(5,2),
  price_unit VARCHAR(20) DEFAULT 'each', -- gram, ounce, each, mg

  -- Inventory
  is_in_stock BOOLEAN DEFAULT TRUE,
  stock_quantity INTEGER,
  stock_status VARCHAR(50) DEFAULT 'in_stock', -- in_stock, out_of_stock, low_stock, missing_from_feed

  -- Potency
  thc_percent NUMERIC(5,2),
  cbd_percent NUMERIC(5,2),
  thc_mg NUMERIC(10,2),
  cbd_mg NUMERIC(10,2),

  -- Weight/Size
  weight_value NUMERIC(10,2),
  weight_unit VARCHAR(20), -- g, oz, mg

  -- Images
  image_url TEXT,
  local_image_path TEXT,
  thumbnail_url TEXT,

  -- Flags
  is_featured BOOLEAN DEFAULT FALSE,
  medical_only BOOLEAN DEFAULT FALSE,
  rec_only BOOLEAN DEFAULT FALSE,

  -- Menu position (for tracking prominence)
  menu_position INTEGER,

  -- Timestamps
  first_seen_at TIMESTAMPTZ DEFAULT NOW(),
  last_seen_at TIMESTAMPTZ DEFAULT NOW(),
  last_price_change_at TIMESTAMPTZ,
  last_stock_change_at TIMESTAMPTZ,

  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW(),

  UNIQUE(dispensary_id, provider, provider_product_id)
);

CREATE INDEX IF NOT EXISTS idx_store_products_dispensary ON store_products(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_store_products_state ON store_products(state_id) WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_store_products_brand ON store_products(canonical_brand_id) WHERE canonical_brand_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_store_products_category ON store_products(category) WHERE category IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_store_products_in_stock ON store_products(dispensary_id, is_in_stock);
CREATE INDEX IF NOT EXISTS idx_store_products_special ON store_products(dispensary_id, is_on_special) WHERE is_on_special = TRUE;
CREATE INDEX IF NOT EXISTS idx_store_products_last_seen ON store_products(last_seen_at DESC);
CREATE INDEX IF NOT EXISTS idx_store_products_provider ON store_products(provider);
CREATE INDEX IF NOT EXISTS idx_store_products_enterprise ON store_products(enterprise_product_id) WHERE enterprise_product_id IS NOT NULL;

COMMENT ON TABLE store_products IS 'Current state of products on each dispensary menu. Provider-agnostic.';
|
||||||
|
|
||||||
|
|
||||||
|
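-- Usage sketch (commented out; values are hypothetical). Crawlers are expected
-- to upsert on the (dispensary_id, provider, provider_product_id) unique key
-- so re-crawls update rows rather than duplicate them:
--
--   INSERT INTO store_products (dispensary_id, provider, provider_product_id, name, price_rec, is_in_stock)
--   VALUES (42, 'dutchie', 'abc123', 'Example Flower 3.5g', 35.00, TRUE)
--   ON CONFLICT (dispensary_id, provider, provider_product_id)
--   DO UPDATE SET
--       name = EXCLUDED.name,
--       price_rec = EXCLUDED.price_rec,
--       is_in_stock = EXCLUDED.is_in_stock,
--       last_seen_at = NOW(),
--       updated_at = NOW();
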
-- ============================================================================
-- SECTION 6: STORE_PRODUCT_SNAPSHOTS TABLE (Historical Data)
-- ============================================================================
-- Time-series data for analytics. One row per product per crawl.
-- CRITICAL: NEVER DELETE from this table.

CREATE TABLE IF NOT EXISTS store_product_snapshots (
    id SERIAL PRIMARY KEY,
    dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
    store_product_id INTEGER REFERENCES store_products(id) ON DELETE SET NULL,
    state_id INTEGER REFERENCES states(id),

    -- Provider info
    provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',
    provider_product_id VARCHAR(100),

    -- Link to crawl run
    crawl_run_id INTEGER REFERENCES crawl_runs(id) ON DELETE SET NULL,

    -- Capture timestamp
    captured_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    -- Raw data from platform
    name VARCHAR(500),
    brand_name VARCHAR(255),
    category VARCHAR(100),
    subcategory VARCHAR(100),

    -- Pricing at time of capture
    price_rec NUMERIC(10,2),
    price_med NUMERIC(10,2),
    price_rec_special NUMERIC(10,2),
    price_med_special NUMERIC(10,2),
    is_on_special BOOLEAN DEFAULT FALSE,
    discount_percent NUMERIC(5,2),

    -- Inventory at time of capture
    is_in_stock BOOLEAN DEFAULT TRUE,
    stock_quantity INTEGER,
    stock_status VARCHAR(50) DEFAULT 'in_stock',
    is_present_in_feed BOOLEAN DEFAULT TRUE,  -- FALSE = missing from feed

    -- Potency at time of capture
    thc_percent NUMERIC(5,2),
    cbd_percent NUMERIC(5,2),

    -- Menu position (for tracking prominence changes)
    menu_position INTEGER,

    -- Image URL at time of capture
    image_url TEXT,

    -- Full raw response for debugging
    raw_data JSONB,

    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Partitioning-ready indexes (for future table partitioning by month)
CREATE INDEX IF NOT EXISTS idx_snapshots_dispensary_captured ON store_product_snapshots(dispensary_id, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_state_captured ON store_product_snapshots(state_id, captured_at DESC) WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_product_captured ON store_product_snapshots(store_product_id, captured_at DESC) WHERE store_product_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_crawl_run ON store_product_snapshots(crawl_run_id) WHERE crawl_run_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_captured_at ON store_product_snapshots(captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_brand ON store_product_snapshots(brand_name) WHERE brand_name IS NOT NULL;

COMMENT ON TABLE store_product_snapshots IS 'Historical crawl data. One row per product per crawl. NEVER DELETE.';

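-- Usage sketch (commented out; the id is hypothetical). Price history for one
-- product comes straight off the (store_product_id, captured_at) index:
--
--   SELECT captured_at, price_rec, price_rec_special, is_in_stock
--   FROM store_product_snapshots
--   WHERE store_product_id = 9876
--   ORDER BY captured_at DESC
--   LIMIT 30;
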
-- ============================================================================
-- SECTION 7: ADD state_id AND chain_id TO DISPENSARIES
-- ============================================================================
-- Link dispensaries to states and chains tables.

ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS state_id INTEGER REFERENCES states(id);
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS chain_id INTEGER REFERENCES chains(id);

CREATE INDEX IF NOT EXISTS idx_dispensaries_state_id ON dispensaries(state_id);
CREATE INDEX IF NOT EXISTS idx_dispensaries_chain_id ON dispensaries(chain_id) WHERE chain_id IS NOT NULL;

-- Backfill state_id from existing state column
UPDATE dispensaries d
SET state_id = s.id
FROM states s
WHERE d.state = s.code
  AND d.state_id IS NULL;

COMMENT ON COLUMN dispensaries.state_id IS 'FK to states table. Canonical state reference.';
COMMENT ON COLUMN dispensaries.chain_id IS 'FK to chains table. NULL if independent dispensary.';

-- ============================================================================
-- SECTION 8: BRAND PENETRATION TABLE
-- ============================================================================
-- Pre-computed brand presence across stores for analytics dashboards.

CREATE TABLE IF NOT EXISTS brand_penetration (
    id SERIAL PRIMARY KEY,
    canonical_brand_id INTEGER NOT NULL REFERENCES canonical_brands(id) ON DELETE CASCADE,
    state_id INTEGER NOT NULL REFERENCES states(id) ON DELETE CASCADE,

    -- Metrics
    stores_carrying INTEGER DEFAULT 0,
    stores_total INTEGER DEFAULT 0,
    penetration_pct NUMERIC(5,2) DEFAULT 0,

    -- Product breakdown
    products_count INTEGER DEFAULT 0,
    products_in_stock INTEGER DEFAULT 0,
    products_on_special INTEGER DEFAULT 0,

    -- Pricing
    avg_price NUMERIC(10,2),
    min_price NUMERIC(10,2),
    max_price NUMERIC(10,2),

    -- Time range
    calculated_at TIMESTAMPTZ DEFAULT NOW(),
    period_start TIMESTAMPTZ,
    period_end TIMESTAMPTZ,

    UNIQUE(canonical_brand_id, state_id, calculated_at)
);

CREATE INDEX IF NOT EXISTS idx_brand_penetration_brand ON brand_penetration(canonical_brand_id);
CREATE INDEX IF NOT EXISTS idx_brand_penetration_state ON brand_penetration(state_id);
CREATE INDEX IF NOT EXISTS idx_brand_penetration_calculated ON brand_penetration(calculated_at DESC);

COMMENT ON TABLE brand_penetration IS 'Pre-computed brand penetration metrics by state.';

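-- Population sketch (commented out; this migration only creates the table).
-- A rollup job might compute one row per brand/state along these lines:
--
--   INSERT INTO brand_penetration (canonical_brand_id, state_id, stores_carrying, stores_total, penetration_pct, products_count)
--   SELECT sp.canonical_brand_id,
--          sp.state_id,
--          COUNT(DISTINCT sp.dispensary_id),
--          (SELECT COUNT(*) FROM dispensaries d WHERE d.state_id = sp.state_id),
--          ROUND(100.0 * COUNT(DISTINCT sp.dispensary_id)
--                / NULLIF((SELECT COUNT(*) FROM dispensaries d WHERE d.state_id = sp.state_id), 0), 2),
--          COUNT(sp.id)
--   FROM store_products sp
--   WHERE sp.canonical_brand_id IS NOT NULL AND sp.state_id IS NOT NULL
--   GROUP BY sp.canonical_brand_id, sp.state_id;
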
-- ============================================================================
-- SECTION 9: PRICE_ALERTS TABLE
-- ============================================================================
-- Track significant price changes for intelligence/alerts.

CREATE TABLE IF NOT EXISTS price_alerts (
    id SERIAL PRIMARY KEY,
    store_product_id INTEGER REFERENCES store_products(id) ON DELETE CASCADE,
    dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
    state_id INTEGER REFERENCES states(id),

    -- What changed
    alert_type VARCHAR(50) NOT NULL,  -- price_drop, price_increase, new_special, special_ended

    -- Values
    old_price NUMERIC(10,2),
    new_price NUMERIC(10,2),
    change_amount NUMERIC(10,2),
    change_percent NUMERIC(5,2),

    -- Context
    product_name VARCHAR(500),
    brand_name VARCHAR(255),
    category VARCHAR(100),

    -- Status
    is_processed BOOLEAN DEFAULT FALSE,
    processed_at TIMESTAMPTZ,

    created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_price_alerts_dispensary ON price_alerts(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_price_alerts_state ON price_alerts(state_id) WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_price_alerts_type ON price_alerts(alert_type);
CREATE INDEX IF NOT EXISTS idx_price_alerts_unprocessed ON price_alerts(is_processed) WHERE is_processed = FALSE;
CREATE INDEX IF NOT EXISTS idx_price_alerts_created ON price_alerts(created_at DESC);

COMMENT ON TABLE price_alerts IS 'Significant price changes for intelligence/alerting.';

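-- Detection sketch (commented out; the new price 29.00 and the product id are
-- hypothetical). The upsert path can emit an alert whenever the rec price moves:
--
--   INSERT INTO price_alerts (store_product_id, dispensary_id, state_id, alert_type,
--                             old_price, new_price, change_amount, change_percent,
--                             product_name, brand_name, category)
--   SELECT sp.id, sp.dispensary_id, sp.state_id,
--          CASE WHEN 29.00 < sp.price_rec THEN 'price_drop' ELSE 'price_increase' END,
--          sp.price_rec, 29.00, 29.00 - sp.price_rec,
--          ROUND(100.0 * (29.00 - sp.price_rec) / NULLIF(sp.price_rec, 0), 2),
--          sp.name, sp.brand_name, sp.category
--   FROM store_products sp
--   WHERE sp.id = 9876 AND sp.price_rec IS DISTINCT FROM 29.00;
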
-- ============================================================================
-- SECTION 10: RAW_PAYLOADS TABLE
-- ============================================================================
-- Store raw API responses for replay/debugging. Separate from snapshots.

CREATE TABLE IF NOT EXISTS raw_payloads (
    id SERIAL PRIMARY KEY,
    dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
    crawl_run_id INTEGER REFERENCES crawl_runs(id) ON DELETE SET NULL,

    -- Payload info
    provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',
    payload_type VARCHAR(50) NOT NULL DEFAULT 'products',  -- products, brands, specials

    -- The raw data
    payload JSONB NOT NULL,
    payload_size_bytes INTEGER,

    -- Deduplication
    payload_hash VARCHAR(64),  -- SHA256 for deduplication

    -- Processing status
    is_processed BOOLEAN DEFAULT FALSE,
    processed_at TIMESTAMPTZ,

    captured_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_raw_payloads_dispensary ON raw_payloads(dispensary_id, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_raw_payloads_crawl_run ON raw_payloads(crawl_run_id) WHERE crawl_run_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_raw_payloads_unprocessed ON raw_payloads(is_processed) WHERE is_processed = FALSE;
CREATE INDEX IF NOT EXISTS idx_raw_payloads_hash ON raw_payloads(payload_hash) WHERE payload_hash IS NOT NULL;

COMMENT ON TABLE raw_payloads IS 'Raw API responses for replay/debugging. Enables re-hydration.';

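-- Dedup sketch (commented out; assumes the pgcrypto extension is available for
-- digest(), which this migration does not install, and uses hypothetical ids):
--
--   INSERT INTO raw_payloads (dispensary_id, crawl_run_id, payload, payload_size_bytes, payload_hash)
--   SELECT 42, 1001, p.j, octet_length(p.j::text),
--          encode(digest(p.j::text, 'sha256'), 'hex')
--   FROM (SELECT '{"products": []}'::jsonb AS j) p
--   WHERE NOT EXISTS (
--       SELECT 1 FROM raw_payloads
--       WHERE payload_hash = encode(digest(p.j::text, 'sha256'), 'hex')
--   );
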
-- ============================================================================
-- SECTION 11: ANALYTICS CACHE TABLES
-- ============================================================================
-- Pre-computed analytics for dashboard performance.

-- Daily store metrics
CREATE TABLE IF NOT EXISTS analytics_store_daily (
    id SERIAL PRIMARY KEY,
    dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
    state_id INTEGER REFERENCES states(id),
    date DATE NOT NULL,

    -- Product counts
    total_products INTEGER DEFAULT 0,
    in_stock_products INTEGER DEFAULT 0,
    out_of_stock_products INTEGER DEFAULT 0,
    on_special_products INTEGER DEFAULT 0,

    -- Brand/category diversity
    unique_brands INTEGER DEFAULT 0,
    unique_categories INTEGER DEFAULT 0,

    -- Pricing
    avg_price NUMERIC(10,2),
    median_price NUMERIC(10,2),

    -- Crawl health
    crawl_count INTEGER DEFAULT 0,
    successful_crawls INTEGER DEFAULT 0,

    created_at TIMESTAMPTZ DEFAULT NOW(),

    UNIQUE(dispensary_id, date)
);

CREATE INDEX IF NOT EXISTS idx_analytics_store_daily_dispensary ON analytics_store_daily(dispensary_id, date DESC);
CREATE INDEX IF NOT EXISTS idx_analytics_store_daily_state ON analytics_store_daily(state_id, date DESC) WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_analytics_store_daily_date ON analytics_store_daily(date DESC);

-- Daily brand metrics
CREATE TABLE IF NOT EXISTS analytics_brand_daily (
    id SERIAL PRIMARY KEY,
    canonical_brand_id INTEGER NOT NULL REFERENCES canonical_brands(id) ON DELETE CASCADE,
    state_id INTEGER REFERENCES states(id),
    date DATE NOT NULL,

    -- Presence
    stores_carrying INTEGER DEFAULT 0,
    products_count INTEGER DEFAULT 0,

    -- Stock
    in_stock_count INTEGER DEFAULT 0,
    out_of_stock_count INTEGER DEFAULT 0,

    -- Pricing
    avg_price NUMERIC(10,2),
    min_price NUMERIC(10,2),
    max_price NUMERIC(10,2),
    on_special_count INTEGER DEFAULT 0,

    created_at TIMESTAMPTZ DEFAULT NOW(),

    UNIQUE(canonical_brand_id, state_id, date)
);

CREATE INDEX IF NOT EXISTS idx_analytics_brand_daily_brand ON analytics_brand_daily(canonical_brand_id, date DESC);
CREATE INDEX IF NOT EXISTS idx_analytics_brand_daily_state ON analytics_brand_daily(state_id, date DESC) WHERE state_id IS NOT NULL;

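-- Rollup sketch (commented out; a scheduled job, not this migration, would run
-- it). One row per store per day, idempotent via the (dispensary_id, date) key:
--
--   INSERT INTO analytics_store_daily (dispensary_id, state_id, date, total_products,
--                                      in_stock_products, on_special_products,
--                                      unique_brands, unique_categories, avg_price)
--   SELECT sp.dispensary_id, sp.state_id, CURRENT_DATE,
--          COUNT(*),
--          COUNT(*) FILTER (WHERE sp.is_in_stock),
--          COUNT(*) FILTER (WHERE sp.is_on_special),
--          COUNT(DISTINCT sp.brand_name),
--          COUNT(DISTINCT sp.category),
--          AVG(sp.price_rec)
--   FROM store_products sp
--   GROUP BY sp.dispensary_id, sp.state_id
--   ON CONFLICT (dispensary_id, date) DO UPDATE SET
--       total_products = EXCLUDED.total_products,
--       in_stock_products = EXCLUDED.in_stock_products,
--       on_special_products = EXCLUDED.on_special_products,
--       unique_brands = EXCLUDED.unique_brands,
--       unique_categories = EXCLUDED.unique_categories,
--       avg_price = EXCLUDED.avg_price;
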
-- ============================================================================
-- SECTION 12: VIEWS FOR COMPATIBILITY
-- ============================================================================

-- View: Latest snapshot per store product
CREATE OR REPLACE VIEW v_latest_store_snapshots AS
SELECT DISTINCT ON (dispensary_id, provider_product_id)
    sps.*
FROM store_product_snapshots sps
ORDER BY dispensary_id, provider_product_id, captured_at DESC;

-- View: Crawl run summary per dispensary
CREATE OR REPLACE VIEW v_dispensary_crawl_summary AS
SELECT
    d.id AS dispensary_id,
    COALESCE(d.dba_name, d.name) AS dispensary_name,
    d.city,
    d.state,
    d.state_id,
    s.name AS state_name,
    COUNT(DISTINCT sp.id) AS current_product_count,
    COUNT(DISTINCT sp.id) FILTER (WHERE sp.is_in_stock) AS in_stock_count,
    COUNT(DISTINCT sp.id) FILTER (WHERE sp.is_on_special) AS on_special_count,
    MAX(cr.finished_at) AS last_crawl_at,
    (SELECT status FROM crawl_runs WHERE dispensary_id = d.id ORDER BY started_at DESC LIMIT 1) AS last_crawl_status
FROM dispensaries d
LEFT JOIN states s ON s.id = d.state_id
LEFT JOIN store_products sp ON sp.dispensary_id = d.id
LEFT JOIN crawl_runs cr ON cr.dispensary_id = d.id
GROUP BY d.id, d.dba_name, d.name, d.city, d.state, d.state_id, s.name;

-- View: Brand presence across stores
CREATE OR REPLACE VIEW v_brand_store_presence AS
SELECT
    cb.id AS brand_id,
    cb.name AS brand_name,
    cb.slug AS brand_slug,
    s.id AS state_id,
    s.code AS state_code,
    COUNT(DISTINCT sp.dispensary_id) AS store_count,
    COUNT(sp.id) AS product_count,
    COUNT(sp.id) FILTER (WHERE sp.is_in_stock) AS in_stock_count,
    AVG(sp.price_rec) AS avg_price,
    MIN(sp.price_rec) AS min_price,
    MAX(sp.price_rec) AS max_price
FROM canonical_brands cb
JOIN store_products sp ON sp.canonical_brand_id = cb.id
LEFT JOIN states s ON s.id = sp.state_id
GROUP BY cb.id, cb.name, cb.slug, s.id, s.code;

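-- Usage sketch (commented out). These views feed dashboards, e.g. flagging
-- stores whose most recent crawl failed:
--
--   SELECT dispensary_id, dispensary_name, last_crawl_at, last_crawl_status
--   FROM v_dispensary_crawl_summary
--   WHERE last_crawl_status = 'failed'
--   ORDER BY last_crawl_at DESC NULLS LAST;
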
-- ============================================================================
-- SECTION 13: ADD FK FROM store_product_snapshots TO crawl_runs
-- ============================================================================

DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM information_schema.table_constraints
        WHERE constraint_name = 'store_product_snapshots_crawl_run_id_fkey'
    ) THEN
        ALTER TABLE store_product_snapshots
        ADD CONSTRAINT store_product_snapshots_crawl_run_id_fkey
        FOREIGN KEY (crawl_run_id) REFERENCES crawl_runs(id) ON DELETE SET NULL;
    END IF;
END $$;

-- ============================================================================
-- SECTION 14: ADD crawl_run_id TO crawl_orchestration_traces
-- ============================================================================

ALTER TABLE crawl_orchestration_traces
ADD COLUMN IF NOT EXISTS crawl_run_id INTEGER REFERENCES crawl_runs(id) ON DELETE SET NULL;

CREATE INDEX IF NOT EXISTS idx_traces_crawl_run
ON crawl_orchestration_traces(crawl_run_id)
WHERE crawl_run_id IS NOT NULL;

-- ============================================================================
-- SECTION 15: UPDATE dispensary_crawler_profiles
-- ============================================================================
-- Add status columns for profile lifecycle.

ALTER TABLE dispensary_crawler_profiles
ADD COLUMN IF NOT EXISTS status VARCHAR(50) DEFAULT 'sandbox';

ALTER TABLE dispensary_crawler_profiles
ADD COLUMN IF NOT EXISTS allow_autopromote BOOLEAN DEFAULT FALSE;

ALTER TABLE dispensary_crawler_profiles
ADD COLUMN IF NOT EXISTS validated_at TIMESTAMPTZ;

CREATE INDEX IF NOT EXISTS idx_profiles_status
ON dispensary_crawler_profiles(status);

COMMENT ON COLUMN dispensary_crawler_profiles.status IS 'Profile status: sandbox, production, needs_manual, disabled';

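-- Lifecycle sketch (commented out; the profile id is hypothetical). Promotion
-- out of the sandbox is a plain status flip once validation passes:
--
--   UPDATE dispensary_crawler_profiles
--   SET status = 'production', validated_at = NOW()
--   WHERE id = 77 AND status = 'sandbox' AND allow_autopromote = TRUE;
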
-- ============================================================================
-- SECTION 16: UPDATE dispensary_crawl_jobs WITH ADDITIONAL COLUMNS
-- ============================================================================
-- Add columns needed for enhanced job tracking.

ALTER TABLE dispensary_crawl_jobs
ADD COLUMN IF NOT EXISTS worker_id VARCHAR(100);

ALTER TABLE dispensary_crawl_jobs
ADD COLUMN IF NOT EXISTS worker_hostname VARCHAR(100);

ALTER TABLE dispensary_crawl_jobs
ADD COLUMN IF NOT EXISTS claimed_by VARCHAR(100);

ALTER TABLE dispensary_crawl_jobs
ADD COLUMN IF NOT EXISTS claimed_at TIMESTAMPTZ;

ALTER TABLE dispensary_crawl_jobs
ADD COLUMN IF NOT EXISTS locked_until TIMESTAMPTZ;

ALTER TABLE dispensary_crawl_jobs
ADD COLUMN IF NOT EXISTS last_heartbeat_at TIMESTAMPTZ;

ALTER TABLE dispensary_crawl_jobs
ADD COLUMN IF NOT EXISTS max_retries INTEGER DEFAULT 3;

ALTER TABLE dispensary_crawl_jobs
ADD COLUMN IF NOT EXISTS products_upserted INTEGER DEFAULT 0;

ALTER TABLE dispensary_crawl_jobs
ADD COLUMN IF NOT EXISTS snapshots_created INTEGER DEFAULT 0;

ALTER TABLE dispensary_crawl_jobs
ADD COLUMN IF NOT EXISTS current_page INTEGER DEFAULT 0;

ALTER TABLE dispensary_crawl_jobs
ADD COLUMN IF NOT EXISTS total_pages INTEGER;

CREATE INDEX IF NOT EXISTS idx_crawl_jobs_status_pending ON dispensary_crawl_jobs(status) WHERE status = 'pending';
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_claimed_by ON dispensary_crawl_jobs(claimed_by) WHERE claimed_by IS NOT NULL;

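-- Claim sketch (commented out; the worker id and lock window are hypothetical).
-- With the columns above, a worker can claim a pending job atomically using
-- FOR UPDATE SKIP LOCKED, so concurrent workers never grab the same row:
--
--   UPDATE dispensary_crawl_jobs j
--   SET status = 'running',
--       claimed_by = 'worker-1',
--       claimed_at = NOW(),
--       locked_until = NOW() + INTERVAL '15 minutes',
--       last_heartbeat_at = NOW()
--   WHERE j.id = (
--       SELECT id FROM dispensary_crawl_jobs
--       WHERE status = 'pending'
--       ORDER BY id
--       LIMIT 1
--       FOR UPDATE SKIP LOCKED
--   )
--   RETURNING j.id;
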
-- ============================================================================
-- SECTION 17: QUEUE MONITORING VIEWS
-- ============================================================================

CREATE OR REPLACE VIEW v_queue_stats AS
SELECT
    (SELECT COUNT(*) FROM dispensary_crawl_jobs WHERE status = 'pending') AS pending_jobs,
    (SELECT COUNT(*) FROM dispensary_crawl_jobs WHERE status = 'running') AS running_jobs,
    (SELECT COUNT(*) FROM dispensary_crawl_jobs WHERE status = 'completed' AND completed_at > NOW() - INTERVAL '1 hour') AS completed_1h,
    (SELECT COUNT(*) FROM dispensary_crawl_jobs WHERE status = 'failed' AND completed_at > NOW() - INTERVAL '1 hour') AS failed_1h,
    (SELECT COUNT(DISTINCT worker_id) FROM dispensary_crawl_jobs WHERE status = 'running' AND worker_id IS NOT NULL) AS active_workers,
    (SELECT AVG(EXTRACT(EPOCH FROM (completed_at - started_at))) FROM dispensary_crawl_jobs WHERE status = 'completed' AND completed_at > NOW() - INTERVAL '1 hour') AS avg_duration_seconds;

CREATE OR REPLACE VIEW v_active_workers AS
SELECT
    worker_id,
    worker_hostname,
    COUNT(*) AS current_jobs,
    SUM(products_found) AS total_products_found,
    SUM(products_upserted) AS total_products_upserted,
    SUM(snapshots_created) AS total_snapshots,
    MIN(claimed_at) AS first_claimed_at,
    MAX(last_heartbeat_at) AS last_heartbeat
FROM dispensary_crawl_jobs
WHERE status = 'running' AND worker_id IS NOT NULL
GROUP BY worker_id, worker_hostname;

-- ============================================================================
-- DONE
-- ============================================================================

SELECT 'Migration 050 completed successfully. Canonical schema v2 is ready.' AS status;

642
backend/migrations/051_cannaiq_canonical_safe_bootstrap.sql
Normal file
@@ -0,0 +1,642 @@
-- ============================================================================
-- Migration 051: CannaiQ Canonical Schema - Safe Bootstrap
-- ============================================================================
--
-- Purpose: Create the canonical CannaiQ schema tables from scratch.
-- This migration is FULLY IDEMPOTENT and safe to run multiple times.
--
-- SAFETY RULES FOLLOWED:
-- 1. ALL tables use CREATE TABLE IF NOT EXISTS
-- 2. ALL columns use ALTER TABLE ADD COLUMN IF NOT EXISTS
-- 3. ALL indexes use CREATE INDEX IF NOT EXISTS
-- 4. NO DROP, DELETE, TRUNCATE, or destructive operations
-- 5. NO assumptions about existing data or column existence
-- 6. NO dependencies on migrations 041, 043, or 050
-- 7. Compatible with the dutchie_menus database as it exists today
-- 8. Safe handling of a pre-existing states table with missing columns
--
-- Tables Created:
--   - states (US state reference table)
--   - chains (retail chain/group table)
--   - crawl_runs (crawl execution records)
--   - store_products (current menu state)
--   - store_product_snapshots (historical price/stock data)
--
-- Columns Added:
--   - dispensaries.state_id (FK to states)
--   - dispensaries.chain_id (FK to chains)
--
-- Run with:
--   psql "postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus" \
--     -f migrations/051_cannaiq_canonical_safe_bootstrap.sql
--
-- ============================================================================


-- ============================================================================
-- SECTION 1: STATES TABLE
-- ============================================================================
-- Reference table for US states where CannaiQ operates.
-- This section handles the case where the table exists but is missing columns.

-- First, create the table if it doesn't exist (minimal definition)
CREATE TABLE IF NOT EXISTS states (
    id SERIAL PRIMARY KEY,
    code VARCHAR(2) NOT NULL,
    name VARCHAR(100) NOT NULL,
    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW()
);

-- Now safely add any missing columns (each is independent, won't fail if it exists)
ALTER TABLE states ADD COLUMN IF NOT EXISTS timezone TEXT;
ALTER TABLE states ADD COLUMN IF NOT EXISTS is_active BOOLEAN DEFAULT TRUE;
ALTER TABLE states ADD COLUMN IF NOT EXISTS crawl_enabled BOOLEAN DEFAULT TRUE;

-- Add unique constraint on code if not exists
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'states_code_key' AND conrelid = 'states'::regclass
    ) THEN
        -- Check if there's already a unique constraint with a different name
        IF NOT EXISTS (
            SELECT 1 FROM pg_indexes
            WHERE tablename = 'states' AND indexdef LIKE '%UNIQUE%code%'
        ) THEN
            ALTER TABLE states ADD CONSTRAINT states_code_key UNIQUE (code);
        END IF;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;  -- Constraint already exists
    WHEN OTHERS THEN
        NULL;  -- Handle any other errors gracefully
END $$;

-- Set default timezone values for existing rows that have NULL
UPDATE states SET timezone = 'America/Phoenix' WHERE timezone IS NULL AND code = 'AZ';
UPDATE states SET timezone = 'America/Los_Angeles' WHERE timezone IS NULL AND code IN ('CA', 'NV', 'OR', 'WA');
UPDATE states SET timezone = 'America/Denver' WHERE timezone IS NULL AND code = 'CO';
UPDATE states SET timezone = 'America/New_York' WHERE timezone IS NULL AND code IN ('FL', 'MA', 'MD', 'NJ', 'NY', 'OH', 'PA');
UPDATE states SET timezone = 'America/Chicago' WHERE timezone IS NULL AND code IN ('IL', 'MO', 'OK');
UPDATE states SET timezone = 'America/Detroit' WHERE timezone IS NULL AND code = 'MI';

-- Set default is_active for existing rows
UPDATE states SET is_active = TRUE WHERE is_active IS NULL;
UPDATE states SET crawl_enabled = TRUE WHERE crawl_enabled IS NULL;

-- Insert known states (idempotent - ON CONFLICT DO UPDATE to fill missing values)
INSERT INTO states (code, name, timezone, is_active, crawl_enabled) VALUES
    ('AZ', 'Arizona', 'America/Phoenix', TRUE, TRUE),
    ('CA', 'California', 'America/Los_Angeles', TRUE, TRUE),
    ('CO', 'Colorado', 'America/Denver', TRUE, TRUE),
    ('FL', 'Florida', 'America/New_York', TRUE, TRUE),
    ('IL', 'Illinois', 'America/Chicago', TRUE, TRUE),
    ('MA', 'Massachusetts', 'America/New_York', TRUE, TRUE),
    ('MD', 'Maryland', 'America/New_York', TRUE, TRUE),
    ('MI', 'Michigan', 'America/Detroit', TRUE, TRUE),
    ('MO', 'Missouri', 'America/Chicago', TRUE, TRUE),
    ('NV', 'Nevada', 'America/Los_Angeles', TRUE, TRUE),
    ('NJ', 'New Jersey', 'America/New_York', TRUE, TRUE),
    ('NY', 'New York', 'America/New_York', TRUE, TRUE),
    ('OH', 'Ohio', 'America/New_York', TRUE, TRUE),
    ('OK', 'Oklahoma', 'America/Chicago', TRUE, TRUE),
    ('OR', 'Oregon', 'America/Los_Angeles', TRUE, TRUE),
    ('PA', 'Pennsylvania', 'America/New_York', TRUE, TRUE),
    ('WA', 'Washington', 'America/Los_Angeles', TRUE, TRUE)
ON CONFLICT (code) DO UPDATE SET
    timezone = COALESCE(states.timezone, EXCLUDED.timezone),
    is_active = COALESCE(states.is_active, EXCLUDED.is_active),
    crawl_enabled = COALESCE(states.crawl_enabled, EXCLUDED.crawl_enabled),
    updated_at = NOW();

CREATE INDEX IF NOT EXISTS idx_states_code ON states(code);
CREATE INDEX IF NOT EXISTS idx_states_active ON states(is_active) WHERE is_active = TRUE;

COMMENT ON TABLE states IS 'US states where CannaiQ operates. Single source of truth for state configuration.';

-- ============================================================================
-- SECTION 2: CHAINS TABLE
-- ============================================================================
-- Retail chains/groups that own multiple dispensary locations.
-- Examples: Curaleaf, Trulieve, Harvest, Columbia Care

CREATE TABLE IF NOT EXISTS chains (
    id SERIAL PRIMARY KEY,
    name VARCHAR(255) NOT NULL,
    slug VARCHAR(255) NOT NULL,
    website_url TEXT,
    logo_url TEXT,
    description TEXT,
    headquarters_city VARCHAR(100),
    headquarters_state_id INTEGER,
    founded_year INTEGER,
    is_active BOOLEAN DEFAULT TRUE,
    is_public BOOLEAN DEFAULT FALSE,
    stock_ticker VARCHAR(10),
    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW()
);

-- Add unique constraint on slug if not exists
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'chains_slug_key' AND conrelid = 'chains'::regclass
    ) THEN
        ALTER TABLE chains ADD CONSTRAINT chains_slug_key UNIQUE (slug);
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;
    WHEN OTHERS THEN
        NULL;
END $$;

-- Add FK to states if not exists
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'chains_headquarters_state_id_fkey'
    ) THEN
        ALTER TABLE chains
        ADD CONSTRAINT chains_headquarters_state_id_fkey
        FOREIGN KEY (headquarters_state_id) REFERENCES states(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;
    WHEN OTHERS THEN
        NULL;
END $$;

CREATE INDEX IF NOT EXISTS idx_chains_slug ON chains(slug);
CREATE INDEX IF NOT EXISTS idx_chains_active ON chains(is_active) WHERE is_active = TRUE;

COMMENT ON TABLE chains IS 'Retail chains/groups that own multiple dispensary locations.';

-- ============================================================================
-- SECTION 3: ADD state_id AND chain_id TO DISPENSARIES
-- ============================================================================
-- Link existing dispensaries table to states and chains.

ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS state_id INTEGER;
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS chain_id INTEGER;

-- Add FK constraints if not exist
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'dispensaries_state_id_fkey'
    ) THEN
        ALTER TABLE dispensaries
        ADD CONSTRAINT dispensaries_state_id_fkey
        FOREIGN KEY (state_id) REFERENCES states(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;
    WHEN OTHERS THEN
        NULL;
END $$;

DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'dispensaries_chain_id_fkey'
    ) THEN
        ALTER TABLE dispensaries
        ADD CONSTRAINT dispensaries_chain_id_fkey
        FOREIGN KEY (chain_id) REFERENCES chains(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;
    WHEN OTHERS THEN
        NULL;
END $$;

CREATE INDEX IF NOT EXISTS idx_dispensaries_state_id ON dispensaries(state_id);
CREATE INDEX IF NOT EXISTS idx_dispensaries_chain_id ON dispensaries(chain_id) WHERE chain_id IS NOT NULL;

-- Backfill state_id from existing state column (safe - only updates NULL values)
UPDATE dispensaries d
SET state_id = s.id
FROM states s
WHERE d.state = s.code
  AND d.state_id IS NULL;

COMMENT ON COLUMN dispensaries.state_id IS 'FK to states table. Canonical state reference.';
COMMENT ON COLUMN dispensaries.chain_id IS 'FK to chains table. NULL if independent dispensary.';

-- ============================================================================
-- SECTION 4: CRAWL_RUNS TABLE
-- ============================================================================
-- One record per crawl execution. Links to snapshots.

CREATE TABLE IF NOT EXISTS crawl_runs (
    id SERIAL PRIMARY KEY,
    dispensary_id INTEGER NOT NULL,
    state_id INTEGER,

    -- Provider info
    provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',

    -- Timing
    started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    finished_at TIMESTAMPTZ,
    duration_ms INTEGER,

    -- Status
    status VARCHAR(20) NOT NULL DEFAULT 'running',
    error_code VARCHAR(50),
    error_message TEXT,
    http_status INTEGER,

    -- Results
    products_found INTEGER DEFAULT 0,
    products_new INTEGER DEFAULT 0,
    products_updated INTEGER DEFAULT 0,
    products_missing INTEGER DEFAULT 0,
    snapshots_written INTEGER DEFAULT 0,

    -- Infrastructure
    worker_id VARCHAR(100),
    worker_hostname VARCHAR(100),
    proxy_used TEXT,
    trigger_type VARCHAR(50) DEFAULT 'scheduled',

    -- Metadata
    metadata JSONB DEFAULT '{}',

    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Add FK constraints if not exist
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'crawl_runs_dispensary_id_fkey'
    ) THEN
        ALTER TABLE crawl_runs
        ADD CONSTRAINT crawl_runs_dispensary_id_fkey
        FOREIGN KEY (dispensary_id) REFERENCES dispensaries(id) ON DELETE CASCADE;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;
    WHEN OTHERS THEN
        NULL;
END $$;

DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'crawl_runs_state_id_fkey'
    ) THEN
        ALTER TABLE crawl_runs
        ADD CONSTRAINT crawl_runs_state_id_fkey
        FOREIGN KEY (state_id) REFERENCES states(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;
    WHEN OTHERS THEN
        NULL;
END $$;

CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary ON crawl_runs(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_state ON crawl_runs(state_id) WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_crawl_runs_status ON crawl_runs(status);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_started ON crawl_runs(started_at DESC);
CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary_started ON crawl_runs(dispensary_id, started_at DESC);

COMMENT ON TABLE crawl_runs IS 'Each crawl execution. Links to snapshots and traces.';

-- ============================================================================
-- SECTION 5: STORE_PRODUCTS TABLE
-- ============================================================================
-- Current state of products on each dispensary menu.
-- Provider-agnostic structure for analytics.

CREATE TABLE IF NOT EXISTS store_products (
    id SERIAL PRIMARY KEY,
    dispensary_id INTEGER NOT NULL,
    state_id INTEGER,

    -- Provider-specific identifiers
    provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',
    provider_product_id VARCHAR(100) NOT NULL,
    provider_brand_id VARCHAR(100),
    enterprise_product_id VARCHAR(100),

    -- Raw data from platform (not normalized)
    name VARCHAR(500) NOT NULL,
    brand_name VARCHAR(255),
    category VARCHAR(100),
    subcategory VARCHAR(100),
    strain_type VARCHAR(50),
    description TEXT,

    -- Pricing (current)
    price_rec NUMERIC(10,2),
    price_med NUMERIC(10,2),
    price_rec_special NUMERIC(10,2),
    price_med_special NUMERIC(10,2),
    is_on_special BOOLEAN DEFAULT FALSE,
    special_name TEXT,
    discount_percent NUMERIC(5,2),
    price_unit VARCHAR(20) DEFAULT 'each',

    -- Inventory
    is_in_stock BOOLEAN DEFAULT TRUE,
    stock_quantity INTEGER,
    stock_status VARCHAR(50) DEFAULT 'in_stock',

    -- Potency
    thc_percent NUMERIC(5,2),
    cbd_percent NUMERIC(5,2),
    thc_mg NUMERIC(10,2),
    cbd_mg NUMERIC(10,2),

    -- Weight/Size
    weight_value NUMERIC(10,2),
    weight_unit VARCHAR(20),

    -- Images
    image_url TEXT,
    local_image_path TEXT,
    thumbnail_url TEXT,

    -- Flags
    is_featured BOOLEAN DEFAULT FALSE,
    medical_only BOOLEAN DEFAULT FALSE,
    rec_only BOOLEAN DEFAULT FALSE,

    -- Menu position (for tracking prominence)
    menu_position INTEGER,

    -- Timestamps
    first_seen_at TIMESTAMPTZ DEFAULT NOW(),
    last_seen_at TIMESTAMPTZ DEFAULT NOW(),
    last_price_change_at TIMESTAMPTZ,
    last_stock_change_at TIMESTAMPTZ,

    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW()
);

-- Add unique constraint if not exists
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'store_products_dispensary_provider_product_key'
    ) THEN
        ALTER TABLE store_products
        ADD CONSTRAINT store_products_dispensary_provider_product_key
        UNIQUE (dispensary_id, provider, provider_product_id);
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;
    WHEN OTHERS THEN
        NULL;
END $$;

-- Add FK constraints if not exist
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'store_products_dispensary_id_fkey'
    ) THEN
        ALTER TABLE store_products
        ADD CONSTRAINT store_products_dispensary_id_fkey
        FOREIGN KEY (dispensary_id) REFERENCES dispensaries(id) ON DELETE CASCADE;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;
    WHEN OTHERS THEN
        NULL;
END $$;

DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'store_products_state_id_fkey'
    ) THEN
        ALTER TABLE store_products
        ADD CONSTRAINT store_products_state_id_fkey
        FOREIGN KEY (state_id) REFERENCES states(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;
    WHEN OTHERS THEN
        NULL;
END $$;

CREATE INDEX IF NOT EXISTS idx_store_products_dispensary ON store_products(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_store_products_state ON store_products(state_id) WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_store_products_category ON store_products(category) WHERE category IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_store_products_brand_name ON store_products(brand_name) WHERE brand_name IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_store_products_in_stock ON store_products(dispensary_id, is_in_stock);
CREATE INDEX IF NOT EXISTS idx_store_products_special ON store_products(dispensary_id, is_on_special) WHERE is_on_special = TRUE;
CREATE INDEX IF NOT EXISTS idx_store_products_last_seen ON store_products(last_seen_at DESC);
CREATE INDEX IF NOT EXISTS idx_store_products_provider ON store_products(provider);
CREATE INDEX IF NOT EXISTS idx_store_products_enterprise ON store_products(enterprise_product_id) WHERE enterprise_product_id IS NOT NULL;

COMMENT ON TABLE store_products IS 'Current state of products on each dispensary menu. Provider-agnostic.';

-- ============================================================================
-- SECTION 6: STORE_PRODUCT_SNAPSHOTS TABLE
-- ============================================================================
-- Historical price/stock data. One row per product per crawl.
-- CRITICAL: NEVER DELETE from this table.

CREATE TABLE IF NOT EXISTS store_product_snapshots (
    id SERIAL PRIMARY KEY,
    dispensary_id INTEGER NOT NULL,
    store_product_id INTEGER,
    state_id INTEGER,

    -- Provider info
    provider VARCHAR(50) NOT NULL DEFAULT 'dutchie',
    provider_product_id VARCHAR(100),

    -- Link to crawl run
    crawl_run_id INTEGER,

    -- Capture timestamp
    captured_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    -- Raw data from platform
    name VARCHAR(500),
    brand_name VARCHAR(255),
    category VARCHAR(100),
    subcategory VARCHAR(100),

    -- Pricing at time of capture
    price_rec NUMERIC(10,2),
    price_med NUMERIC(10,2),
    price_rec_special NUMERIC(10,2),
    price_med_special NUMERIC(10,2),
    is_on_special BOOLEAN DEFAULT FALSE,
    discount_percent NUMERIC(5,2),

    -- Inventory at time of capture
    is_in_stock BOOLEAN DEFAULT TRUE,
    stock_quantity INTEGER,
    stock_status VARCHAR(50) DEFAULT 'in_stock',
    is_present_in_feed BOOLEAN DEFAULT TRUE,

    -- Potency at time of capture
    thc_percent NUMERIC(5,2),
    cbd_percent NUMERIC(5,2),

    -- Menu position (for tracking prominence changes)
    menu_position INTEGER,

    -- Image URL at time of capture
    image_url TEXT,

    -- Full raw response for debugging
    raw_data JSONB,

    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Add FK constraints if not exist
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'store_product_snapshots_dispensary_id_fkey'
    ) THEN
        ALTER TABLE store_product_snapshots
        ADD CONSTRAINT store_product_snapshots_dispensary_id_fkey
        FOREIGN KEY (dispensary_id) REFERENCES dispensaries(id) ON DELETE CASCADE;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;
    WHEN OTHERS THEN
        NULL;
END $$;

DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'store_product_snapshots_store_product_id_fkey'
    ) THEN
        ALTER TABLE store_product_snapshots
        ADD CONSTRAINT store_product_snapshots_store_product_id_fkey
        FOREIGN KEY (store_product_id) REFERENCES store_products(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;
    WHEN OTHERS THEN
        NULL;
END $$;

DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'store_product_snapshots_state_id_fkey'
    ) THEN
        ALTER TABLE store_product_snapshots
        ADD CONSTRAINT store_product_snapshots_state_id_fkey
        FOREIGN KEY (state_id) REFERENCES states(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;
    WHEN OTHERS THEN
        NULL;
END $$;

DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'store_product_snapshots_crawl_run_id_fkey'
    ) THEN
        ALTER TABLE store_product_snapshots
        ADD CONSTRAINT store_product_snapshots_crawl_run_id_fkey
        FOREIGN KEY (crawl_run_id) REFERENCES crawl_runs(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN
        NULL;
    WHEN OTHERS THEN
        NULL;
END $$;

-- Indexes optimized for analytics queries
CREATE INDEX IF NOT EXISTS idx_snapshots_dispensary_captured ON store_product_snapshots(dispensary_id, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_state_captured ON store_product_snapshots(state_id, captured_at DESC) WHERE state_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_product_captured ON store_product_snapshots(store_product_id, captured_at DESC) WHERE store_product_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_crawl_run ON store_product_snapshots(crawl_run_id) WHERE crawl_run_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_captured_at ON store_product_snapshots(captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_snapshots_brand ON store_product_snapshots(brand_name) WHERE brand_name IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_snapshots_provider_product ON store_product_snapshots(provider_product_id) WHERE provider_product_id IS NOT NULL;

COMMENT ON TABLE store_product_snapshots IS 'Historical crawl data. One row per product per crawl. NEVER DELETE.';

-- ============================================================================
-- SECTION 7: VIEWS FOR BACKWARD COMPATIBILITY
-- ============================================================================

-- View: Latest snapshot per store product
CREATE OR REPLACE VIEW v_latest_store_snapshots AS
SELECT DISTINCT ON (dispensary_id, provider_product_id)
    sps.*
FROM store_product_snapshots sps
ORDER BY dispensary_id, provider_product_id, captured_at DESC;

-- View: Crawl run summary per dispensary
CREATE OR REPLACE VIEW v_dispensary_crawl_summary AS
SELECT
    d.id AS dispensary_id,
    COALESCE(d.dba_name, d.name) AS dispensary_name,
    d.city,
    d.state,
    d.state_id,
    s.name AS state_name,
    COUNT(DISTINCT sp.id) AS current_product_count,
    COUNT(DISTINCT sp.id) FILTER (WHERE sp.is_in_stock) AS in_stock_count,
    COUNT(DISTINCT sp.id) FILTER (WHERE sp.is_on_special) AS on_special_count,
    MAX(cr.finished_at) AS last_crawl_at,
    (SELECT status FROM crawl_runs WHERE dispensary_id = d.id ORDER BY started_at DESC LIMIT 1) AS last_crawl_status
FROM dispensaries d
LEFT JOIN states s ON s.id = d.state_id
LEFT JOIN store_products sp ON sp.dispensary_id = d.id
LEFT JOIN crawl_runs cr ON cr.dispensary_id = d.id
GROUP BY d.id, d.dba_name, d.name, d.city, d.state, d.state_id, s.name;

-- ============================================================================
-- MIGRATION 051 COMPLETE
-- ============================================================================

SELECT 'Migration 051 completed successfully. Canonical schema is ready.' AS status;

98
backend/migrations/051_create_mv_state_metrics.sql
Normal file
@@ -0,0 +1,98 @@
-- Migration 051: Create materialized view for state metrics
|
||||||
|
-- Used by Analytics V2 state endpoints for fast aggregated queries
|
||||||
|
-- Canonical tables: states, dispensaries, store_products, store_product_snapshots, brands
|
||||||
|
|
||||||
|
-- Drop existing view if it exists (for clean recreation)
|
||||||
|
DROP MATERIALIZED VIEW IF EXISTS mv_state_metrics;
|
||||||
|
|
||||||
|
-- Create materialized view with comprehensive state metrics
|
||||||
|
-- Schema verified via information_schema on 2025-12-06
|
||||||
|
-- Real columns used:
|
||||||
|
-- states: id, code, name, recreational_legal, medical_legal, rec_year, med_year
|
||||||
|
-- dispensaries: id, state_id (NO is_active column)
|
||||||
|
-- store_products: id, dispensary_id, brand_id, category_raw, price_rec, price_med, is_in_stock
|
||||||
|
-- store_product_snapshots: id, store_product_id, captured_at
|
||||||
|
-- brands: id (joined via sp.brand_id)
|
||||||
|
|
||||||
|
CREATE MATERIALIZED VIEW mv_state_metrics AS
|
||||||
|
SELECT
|
||||||
|
s.id AS state_id,
|
||||||
|
s.code AS state,
|
||||||
|
s.name AS state_name,
|
||||||
|
COALESCE(s.recreational_legal, FALSE) AS recreational_legal,
|
||||||
|
COALESCE(s.medical_legal, FALSE) AS medical_legal,
|
||||||
|
s.rec_year,
|
||||||
|
s.med_year,
|
||||||
|
|
    -- Dispensary metrics
    COUNT(DISTINCT d.id) AS dispensary_count,

    -- Product metrics
    COUNT(DISTINCT sp.id) AS total_products,
    COUNT(DISTINCT sp.id) FILTER (WHERE sp.is_in_stock = TRUE) AS in_stock_products,
    COUNT(DISTINCT sp.id) FILTER (WHERE sp.is_in_stock = FALSE) AS out_of_stock_products,

    -- Brand metrics (using brand_id FK, not brand_name)
    COUNT(DISTINCT sp.brand_id) FILTER (WHERE sp.brand_id IS NOT NULL) AS unique_brands,

    -- Category metrics (using category_raw, not category)
    COUNT(DISTINCT sp.category_raw) FILTER (WHERE sp.category_raw IS NOT NULL) AS unique_categories,

    -- Pricing metrics (recreational)
    AVG(sp.price_rec) FILTER (WHERE sp.price_rec IS NOT NULL AND sp.is_in_stock = TRUE) AS avg_price_rec,
    PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY sp.price_rec)
        FILTER (WHERE sp.price_rec IS NOT NULL AND sp.is_in_stock = TRUE) AS median_price_rec,
    MIN(sp.price_rec) FILTER (WHERE sp.price_rec IS NOT NULL AND sp.is_in_stock = TRUE) AS min_price_rec,
    MAX(sp.price_rec) FILTER (WHERE sp.price_rec IS NOT NULL AND sp.is_in_stock = TRUE) AS max_price_rec,

    -- Pricing metrics (medical)
    AVG(sp.price_med) FILTER (WHERE sp.price_med IS NOT NULL AND sp.is_in_stock = TRUE) AS avg_price_med,
    PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY sp.price_med)
        FILTER (WHERE sp.price_med IS NOT NULL AND sp.is_in_stock = TRUE) AS median_price_med,

    -- Snapshot/crawl metrics
    COUNT(sps.id) AS total_snapshots,
    MAX(sps.captured_at) AS last_crawl_at,
    MIN(sps.captured_at) AS first_crawl_at,

    -- Data freshness
    CASE
        WHEN MAX(sps.captured_at) > NOW() - INTERVAL '24 hours' THEN 'fresh'
        WHEN MAX(sps.captured_at) > NOW() - INTERVAL '7 days' THEN 'recent'
        WHEN MAX(sps.captured_at) IS NOT NULL THEN 'stale'
        ELSE 'no_data'
    END AS data_freshness,

    -- Metadata
    NOW() AS refreshed_at

FROM states s
LEFT JOIN dispensaries d ON d.state_id = s.id
LEFT JOIN store_products sp ON sp.dispensary_id = d.id
LEFT JOIN store_product_snapshots sps ON sps.store_product_id = sp.id
GROUP BY s.id, s.code, s.name, s.recreational_legal, s.medical_legal, s.rec_year, s.med_year;

-- Create unique index on state code for fast lookups
CREATE UNIQUE INDEX IF NOT EXISTS mv_state_metrics_state_idx
    ON mv_state_metrics (state);

-- Create index on state_id for joins
CREATE INDEX IF NOT EXISTS mv_state_metrics_state_id_idx
    ON mv_state_metrics (state_id);

-- Create index for legal status filtering
CREATE INDEX IF NOT EXISTS mv_state_metrics_legal_idx
    ON mv_state_metrics (recreational_legal, medical_legal);

-- Create index for data freshness queries
CREATE INDEX IF NOT EXISTS mv_state_metrics_freshness_idx
    ON mv_state_metrics (data_freshness);

-- Comment on the view
COMMENT ON MATERIALIZED VIEW mv_state_metrics IS
    'Aggregated state-level metrics for Analytics V2 endpoints. Refresh periodically with: REFRESH MATERIALIZED VIEW CONCURRENTLY mv_state_metrics;';

-- Record migration
INSERT INTO schema_migrations (version, name, applied_at)
VALUES ('051', 'create_mv_state_metrics', NOW())
ON CONFLICT (version) DO NOTHING;
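
-- Example usage (illustrative, not part of the migration above): a periodic
-- job could run the refresh documented in the view comment, then read a
-- state's headline metrics. CONCURRENTLY is valid here because of the unique
-- index on (state); 'AZ' is just a sample value.
REFRESH MATERIALIZED VIEW CONCURRENTLY mv_state_metrics;

SELECT state, dispensary_count, in_stock_products, avg_price_rec, data_freshness
FROM mv_state_metrics
WHERE state = 'AZ';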
96 backend/migrations/052_add_provider_data_columns.sql Normal file
@@ -0,0 +1,96 @@
-- Migration 052: Add provider_data JSONB and frequently-queried columns
--
-- Adds hybrid storage for legacy data:
--   1. provider_data JSONB on both tables for all extra fields
--   2. Specific columns for frequently-queried fields

-- ============================================================================
-- store_products: Add provider_data and queryable columns
-- ============================================================================

-- JSONB for all extra provider-specific data
ALTER TABLE store_products
    ADD COLUMN IF NOT EXISTS provider_data JSONB;

-- Frequently-queried columns
ALTER TABLE store_products
    ADD COLUMN IF NOT EXISTS strain_type TEXT;

ALTER TABLE store_products
    ADD COLUMN IF NOT EXISTS medical_only BOOLEAN DEFAULT FALSE;

ALTER TABLE store_products
    ADD COLUMN IF NOT EXISTS rec_only BOOLEAN DEFAULT FALSE;

ALTER TABLE store_products
    ADD COLUMN IF NOT EXISTS brand_logo_url TEXT;

ALTER TABLE store_products
    ADD COLUMN IF NOT EXISTS platform_dispensary_id TEXT;

-- Index for strain_type queries
CREATE INDEX IF NOT EXISTS idx_store_products_strain_type
    ON store_products(strain_type)
    WHERE strain_type IS NOT NULL;

-- Index for medical/rec filtering
CREATE INDEX IF NOT EXISTS idx_store_products_medical_rec
    ON store_products(medical_only, rec_only);

-- GIN index for provider_data JSONB queries
CREATE INDEX IF NOT EXISTS idx_store_products_provider_data
    ON store_products USING GIN (provider_data);

-- ============================================================================
-- store_product_snapshots: Add provider_data and queryable columns
-- ============================================================================

-- JSONB for all extra provider-specific data
ALTER TABLE store_product_snapshots
    ADD COLUMN IF NOT EXISTS provider_data JSONB;

-- Frequently-queried columns
ALTER TABLE store_product_snapshots
    ADD COLUMN IF NOT EXISTS featured BOOLEAN DEFAULT FALSE;

ALTER TABLE store_product_snapshots
    ADD COLUMN IF NOT EXISTS is_below_threshold BOOLEAN DEFAULT FALSE;

ALTER TABLE store_product_snapshots
    ADD COLUMN IF NOT EXISTS is_below_kiosk_threshold BOOLEAN DEFAULT FALSE;

-- Index for featured products
CREATE INDEX IF NOT EXISTS idx_snapshots_featured
    ON store_product_snapshots(dispensary_id, featured)
    WHERE featured = TRUE;

-- Index for low stock alerts
CREATE INDEX IF NOT EXISTS idx_snapshots_below_threshold
    ON store_product_snapshots(dispensary_id, is_below_threshold)
    WHERE is_below_threshold = TRUE;

-- GIN index for provider_data JSONB queries
CREATE INDEX IF NOT EXISTS idx_snapshots_provider_data
    ON store_product_snapshots USING GIN (provider_data);

-- ============================================================================
-- Comments for documentation
-- ============================================================================

COMMENT ON COLUMN store_products.provider_data IS
    'JSONB blob containing all provider-specific fields not in canonical columns (effects, terpenes, cannabinoids_v2, etc.)';

COMMENT ON COLUMN store_products.strain_type IS
    'Cannabis strain type: Indica, Sativa, Hybrid, Indica-Hybrid, Sativa-Hybrid';

COMMENT ON COLUMN store_products.platform_dispensary_id IS
    'Provider platform dispensary ID (e.g., Dutchie MongoDB ObjectId)';

COMMENT ON COLUMN store_product_snapshots.provider_data IS
    'JSONB blob containing all provider-specific snapshot fields (options, kiosk data, etc.)';

COMMENT ON COLUMN store_product_snapshots.featured IS
    'Whether product was featured/highlighted at capture time';

COMMENT ON COLUMN store_product_snapshots.is_below_threshold IS
    'Whether product was below inventory threshold at capture time';
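
-- Example query (illustrative, not part of the migration above): the GIN
-- index on provider_data serves containment lookups. The 'effects' key is an
-- assumed provider field, and the name column on store_products is assumed
-- to exist; both are sketch inputs, not guarantees.
SELECT id, name, strain_type
FROM store_products
WHERE provider_data @> '{"effects": ["Relaxed"]}'
  AND strain_type = 'Indica';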
127 backend/migrations/052_add_state_cannabis_flags.sql Normal file
@@ -0,0 +1,127 @@
-- ============================================================================
-- Migration 052: Add Cannabis Legalization Flags to States
-- ============================================================================
--
-- Purpose: Add recreational/medical cannabis legalization status and years
--          to the existing states table, then seed all 50 states + DC.
--
-- SAFETY RULES:
--   - Uses ADD COLUMN IF NOT EXISTS (idempotent)
--   - Uses INSERT ... ON CONFLICT (code) DO UPDATE (idempotent)
--   - NO DROP, DELETE, TRUNCATE, or destructive operations
--   - Safe to run multiple times
--
-- Run with:
--   psql "$DATABASE_URL" -f migrations/052_add_state_cannabis_flags.sql
--
-- ============================================================================


-- ============================================================================
-- SECTION 1: Add cannabis legalization columns
-- ============================================================================

ALTER TABLE states ADD COLUMN IF NOT EXISTS recreational_legal BOOLEAN;
ALTER TABLE states ADD COLUMN IF NOT EXISTS rec_year INTEGER;
ALTER TABLE states ADD COLUMN IF NOT EXISTS medical_legal BOOLEAN;
ALTER TABLE states ADD COLUMN IF NOT EXISTS med_year INTEGER;

COMMENT ON COLUMN states.recreational_legal IS 'Whether recreational cannabis is legal in this state';
COMMENT ON COLUMN states.rec_year IS 'Year recreational cannabis was legalized (NULL if not legal)';
COMMENT ON COLUMN states.medical_legal IS 'Whether medical cannabis is legal in this state';
COMMENT ON COLUMN states.med_year IS 'Year medical cannabis was legalized (NULL if not legal)';


-- ============================================================================
-- SECTION 2: Seed all 50 states + DC with cannabis legalization data
-- ============================================================================
-- Data sourced from state legalization records as of 2024.
-- States grouped by program type, ordered by legalization year within each
-- group, then alphabetically.

INSERT INTO states (code, name, timezone, recreational_legal, rec_year, medical_legal, med_year)
VALUES
    -- Recreational + Medical States (ordered by rec year)
    ('WA', 'Washington', 'America/Los_Angeles', TRUE, 2012, TRUE, 1998),
    ('CO', 'Colorado', 'America/Denver', TRUE, 2012, TRUE, 2000),
    ('AK', 'Alaska', 'America/Anchorage', TRUE, 2014, TRUE, 1998),
    ('OR', 'Oregon', 'America/Los_Angeles', TRUE, 2014, TRUE, 1998),
    ('DC', 'District of Columbia', 'America/New_York', TRUE, 2015, TRUE, 2011),
    ('CA', 'California', 'America/Los_Angeles', TRUE, 2016, TRUE, 1996),
    ('NV', 'Nevada', 'America/Los_Angeles', TRUE, 2016, TRUE, 1998),
    ('ME', 'Maine', 'America/New_York', TRUE, 2016, TRUE, 1999),
    ('MA', 'Massachusetts', 'America/New_York', TRUE, 2016, TRUE, 2012),
    ('MI', 'Michigan', 'America/Detroit', TRUE, 2018, TRUE, 2008),
    ('IL', 'Illinois', 'America/Chicago', TRUE, 2019, TRUE, 2013),
    ('AZ', 'Arizona', 'America/Phoenix', TRUE, 2020, TRUE, 2010),
    ('MT', 'Montana', 'America/Denver', TRUE, 2020, TRUE, 2004),
    ('NJ', 'New Jersey', 'America/New_York', TRUE, 2020, TRUE, 2010),
    ('VT', 'Vermont', 'America/New_York', TRUE, 2020, TRUE, 2004),
    ('CT', 'Connecticut', 'America/New_York', TRUE, 2021, TRUE, 2012),
    ('NM', 'New Mexico', 'America/Denver', TRUE, 2021, TRUE, 2007),
    ('NY', 'New York', 'America/New_York', TRUE, 2021, TRUE, 2014),
    ('VA', 'Virginia', 'America/New_York', TRUE, 2021, TRUE, 2020),
    ('MD', 'Maryland', 'America/New_York', TRUE, 2022, TRUE, 2013),
    ('MO', 'Missouri', 'America/Chicago', TRUE, 2022, TRUE, 2018),
    ('RI', 'Rhode Island', 'America/New_York', TRUE, 2022, TRUE, 2006),
    ('DE', 'Delaware', 'America/New_York', TRUE, 2023, TRUE, 2011),
    ('MN', 'Minnesota', 'America/Chicago', TRUE, 2023, TRUE, 2014),
    ('OH', 'Ohio', 'America/New_York', TRUE, 2023, TRUE, 2016),

    -- Medical Only States (no recreational)
    ('HI', 'Hawaii', 'Pacific/Honolulu', FALSE, NULL, TRUE, 2000),
    ('NH', 'New Hampshire', 'America/New_York', FALSE, NULL, TRUE, 2013),
    ('GA', 'Georgia', 'America/New_York', FALSE, NULL, TRUE, 2015),
    ('LA', 'Louisiana', 'America/Chicago', FALSE, NULL, TRUE, 2015),
    ('TX', 'Texas', 'America/Chicago', FALSE, NULL, TRUE, 2015),
    ('AR', 'Arkansas', 'America/Chicago', FALSE, NULL, TRUE, 2016),
    ('FL', 'Florida', 'America/New_York', FALSE, NULL, TRUE, 2016),
    ('ND', 'North Dakota', 'America/Chicago', FALSE, NULL, TRUE, 2016),
    ('PA', 'Pennsylvania', 'America/New_York', FALSE, NULL, TRUE, 2016),
    ('IA', 'Iowa', 'America/Chicago', FALSE, NULL, TRUE, 2017),
    ('WV', 'West Virginia', 'America/New_York', FALSE, NULL, TRUE, 2017),
    ('OK', 'Oklahoma', 'America/Chicago', FALSE, NULL, TRUE, 2018),
    ('UT', 'Utah', 'America/Denver', FALSE, NULL, TRUE, 2018),
    ('SD', 'South Dakota', 'America/Chicago', FALSE, NULL, TRUE, 2020),
    ('AL', 'Alabama', 'America/Chicago', FALSE, NULL, TRUE, 2021),
    ('MS', 'Mississippi', 'America/Chicago', FALSE, NULL, TRUE, 2022),
    ('KY', 'Kentucky', 'America/New_York', FALSE, NULL, TRUE, 2023),
    ('NE', 'Nebraska', 'America/Chicago', FALSE, NULL, TRUE, 2024),

    -- No Cannabis Programs (neither rec nor medical)
    ('ID', 'Idaho', 'America/Boise', FALSE, NULL, FALSE, NULL),
    ('IN', 'Indiana', 'America/Indiana/Indianapolis', FALSE, NULL, FALSE, NULL),
    ('KS', 'Kansas', 'America/Chicago', FALSE, NULL, FALSE, NULL),
    ('NC', 'North Carolina', 'America/New_York', FALSE, NULL, FALSE, NULL),
    ('SC', 'South Carolina', 'America/New_York', FALSE, NULL, FALSE, NULL),
    ('TN', 'Tennessee', 'America/Chicago', FALSE, NULL, FALSE, NULL),
    ('WI', 'Wisconsin', 'America/Chicago', FALSE, NULL, FALSE, NULL),
    ('WY', 'Wyoming', 'America/Denver', FALSE, NULL, FALSE, NULL)

ON CONFLICT (code) DO UPDATE SET
    name = EXCLUDED.name,
    timezone = COALESCE(states.timezone, EXCLUDED.timezone),
    recreational_legal = EXCLUDED.recreational_legal,
    rec_year = EXCLUDED.rec_year,
    medical_legal = EXCLUDED.medical_legal,
    med_year = EXCLUDED.med_year,
    updated_at = NOW();


-- ============================================================================
-- SECTION 3: Add indexes for common queries
-- ============================================================================

CREATE INDEX IF NOT EXISTS idx_states_recreational ON states(recreational_legal) WHERE recreational_legal = TRUE;
CREATE INDEX IF NOT EXISTS idx_states_medical ON states(medical_legal) WHERE medical_legal = TRUE;


-- ============================================================================
-- SECTION 4: Verification query (informational only)
-- ============================================================================

SELECT
    'Migration 052 completed successfully.' AS status,
    (SELECT COUNT(*) FROM states WHERE recreational_legal = TRUE) AS rec_states,
    (SELECT COUNT(*) FROM states WHERE medical_legal = TRUE AND recreational_legal = FALSE) AS med_only_states,
    (SELECT COUNT(*) FROM states WHERE medical_legal = FALSE OR medical_legal IS NULL) AS no_program_states,
    (SELECT COUNT(*) FROM states) AS total_states;
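
-- Example query (illustrative, not part of the migration above): list
-- recreational states newest-first; the partial index idx_states_recreational
-- covers the WHERE clause.
SELECT code, name, rec_year, med_year
FROM states
WHERE recreational_legal = TRUE
ORDER BY rec_year DESC, code;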
249 backend/migrations/052_hydration_schema_alignment.sql Normal file
@@ -0,0 +1,249 @@
-- ============================================================================
-- Migration 052: Hydration Schema Alignment
-- ============================================================================
--
-- Purpose: Add columns to canonical tables needed for hydration from
--          dutchie_products and dutchie_product_snapshots.
--
-- This migration ensures store_products and store_product_snapshots can
-- receive all data from the legacy dutchie_* tables.
--
-- SAFETY RULES:
--   - ALL columns use ADD COLUMN IF NOT EXISTS
--   - NO DROP, DELETE, TRUNCATE, or destructive operations
--   - Fully idempotent - safe to run multiple times
--
-- Run with:
--   psql "postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus" \
--     -f migrations/052_hydration_schema_alignment.sql
--
-- ============================================================================


-- ============================================================================
-- SECTION 1: store_products - Additional columns from dutchie_products
-- ============================================================================

-- Brand ID from Dutchie GraphQL (brandId field)
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS provider_brand_id VARCHAR(100);

-- Legacy dutchie_products.id for cross-reference during migration
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS legacy_dutchie_product_id INTEGER;

-- THC/CBD content as text (from dutchie_products.thc_content/cbd_content)
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS thc_content_text VARCHAR(50);
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS cbd_content_text VARCHAR(50);

-- Full cannabinoid data
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS cannabinoids JSONB;

-- Effects array
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS effects TEXT[];

-- Type (Flower, Edible, etc.) - maps to category in legacy.
-- category VARCHAR(100) already exists, but type may differ.
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS product_type VARCHAR(100);

-- Additional images array
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS additional_images TEXT[];

-- Local image paths (from 032 migration)
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS local_image_url TEXT;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS local_image_thumb_url TEXT;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS local_image_medium_url TEXT;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS original_image_url TEXT;

-- Status from Dutchie (Active/Inactive)
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS platform_status VARCHAR(20);

-- Threshold flags
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS is_below_threshold BOOLEAN DEFAULT FALSE;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS is_below_kiosk_threshold BOOLEAN DEFAULT FALSE;

-- cName / slug from Dutchie
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS c_name VARCHAR(255);

-- Coming soon flag
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS is_coming_soon BOOLEAN DEFAULT FALSE;

-- Provider column already exists; ensure we have provider_dispensary_id
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS provider_dispensary_id VARCHAR(100);

-- Enterprise product ID (cross-store product linking)
-- already exists from migration 051.

-- Total quantity available (from POSMetaData.children)
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS total_quantity_available INTEGER;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS total_kiosk_quantity_available INTEGER;

-- Weight
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS weight VARCHAR(50);

-- Options array (size/weight options)
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS options TEXT[];

-- Measurements
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS measurements JSONB;

-- Raw data from last crawl
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS raw_data JSONB;

-- Source timestamps from Dutchie
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS source_created_at TIMESTAMPTZ;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS source_updated_at TIMESTAMPTZ;


-- ============================================================================
-- SECTION 2: store_product_snapshots - Additional columns for hydration
-- ============================================================================

-- Legacy dutchie_product_snapshots.id for cross-reference
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS legacy_snapshot_id INTEGER;

-- Legacy dutchie_product_id reference
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS legacy_dutchie_product_id INTEGER;

-- Options JSONB from dutchie_product_snapshots
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS options JSONB;

-- Provider dispensary ID
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS provider_dispensary_id VARCHAR(100);

-- Inventory details
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS total_quantity_available INTEGER;
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS total_kiosk_quantity_available INTEGER;

-- Platform status at time of snapshot
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS platform_status VARCHAR(20);

-- Threshold flags at time of snapshot
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS is_below_threshold BOOLEAN DEFAULT FALSE;
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS is_below_kiosk_threshold BOOLEAN DEFAULT FALSE;

-- Special data
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS special_data JSONB;
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS special_name TEXT;

-- Pricing mode (rec/med)
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS pricing_type VARCHAR(10);

-- Crawl mode (mode_a/mode_b)
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS crawl_mode VARCHAR(20);


-- ============================================================================
-- SECTION 3: crawl_runs - Additional columns for hydration
-- ============================================================================

-- Legacy job ID references
ALTER TABLE crawl_runs ADD COLUMN IF NOT EXISTS legacy_dispensary_crawl_job_id INTEGER;
ALTER TABLE crawl_runs ADD COLUMN IF NOT EXISTS legacy_job_run_log_id INTEGER;

-- Schedule reference
ALTER TABLE crawl_runs ADD COLUMN IF NOT EXISTS schedule_id INTEGER;

-- Job type
ALTER TABLE crawl_runs ADD COLUMN IF NOT EXISTS job_type VARCHAR(50);

-- Brands found count
ALTER TABLE crawl_runs ADD COLUMN IF NOT EXISTS brands_found INTEGER DEFAULT 0;

-- Retry count
ALTER TABLE crawl_runs ADD COLUMN IF NOT EXISTS retry_count INTEGER DEFAULT 0;


-- ============================================================================
-- SECTION 4: INDEXES for hydration queries
-- ============================================================================

-- Indexes on legacy IDs for migration lookups
CREATE INDEX IF NOT EXISTS idx_store_products_legacy_id
    ON store_products(legacy_dutchie_product_id)
    WHERE legacy_dutchie_product_id IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_snapshots_legacy_id
    ON store_product_snapshots(legacy_snapshot_id)
    WHERE legacy_snapshot_id IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_snapshots_legacy_product_id
    ON store_product_snapshots(legacy_dutchie_product_id)
    WHERE legacy_dutchie_product_id IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_crawl_runs_legacy_job_id
    ON crawl_runs(legacy_dispensary_crawl_job_id)
    WHERE legacy_dispensary_crawl_job_id IS NOT NULL;

-- Index on provider_product_id for upserts
CREATE INDEX IF NOT EXISTS idx_store_products_provider_id
    ON store_products(provider_product_id);

-- Composite index for canonical key lookup
CREATE INDEX IF NOT EXISTS idx_store_products_canonical_key
    ON store_products(dispensary_id, provider, provider_product_id);


-- ============================================================================
-- SECTION 5: Unique constraint for idempotent hydration
-- ============================================================================

-- Ensure unique snapshots per product per crawl.
-- This prevents duplicate snapshots during re-runs.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'store_product_snapshots_unique_per_crawl'
    ) THEN
        -- Can't add a unique constraint on nullable columns directly,
        -- so we use a partial unique index instead
        CREATE UNIQUE INDEX IF NOT EXISTS idx_snapshots_unique_per_crawl
            ON store_product_snapshots(store_product_id, crawl_run_id)
            WHERE store_product_id IS NOT NULL AND crawl_run_id IS NOT NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN NULL;
    WHEN OTHERS THEN NULL;
END $$;
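
-- Example (illustrative sketch, not part of this migration): hydration can
-- lean on the partial unique index above for idempotent re-runs. The ids and
-- column list are hypothetical; note that ON CONFLICT must repeat the index
-- predicate so PostgreSQL can infer the partial index.
INSERT INTO store_product_snapshots (store_product_id, crawl_run_id, captured_at)
VALUES (123, 456, NOW())
ON CONFLICT (store_product_id, crawl_run_id)
    WHERE store_product_id IS NOT NULL AND crawl_run_id IS NOT NULL
    DO NOTHING;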

-- ============================================================================
-- SECTION 6: View for hydration status monitoring
-- ============================================================================

CREATE OR REPLACE VIEW v_hydration_status AS
SELECT
    'dutchie_products' AS source_table,
    (SELECT COUNT(*) FROM dutchie_products) AS source_count,
    (SELECT COUNT(*) FROM store_products WHERE legacy_dutchie_product_id IS NOT NULL) AS hydrated_count,
    ROUND(
        100.0 * (SELECT COUNT(*) FROM store_products WHERE legacy_dutchie_product_id IS NOT NULL) /
        NULLIF((SELECT COUNT(*) FROM dutchie_products), 0),
        2
    ) AS hydration_pct
UNION ALL
SELECT
    'dutchie_product_snapshots' AS source_table,
    (SELECT COUNT(*) FROM dutchie_product_snapshots) AS source_count,
    (SELECT COUNT(*) FROM store_product_snapshots WHERE legacy_snapshot_id IS NOT NULL) AS hydrated_count,
    ROUND(
        100.0 * (SELECT COUNT(*) FROM store_product_snapshots WHERE legacy_snapshot_id IS NOT NULL) /
        NULLIF((SELECT COUNT(*) FROM dutchie_product_snapshots), 0),
        2
    ) AS hydration_pct
UNION ALL
SELECT
    'dispensary_crawl_jobs' AS source_table,
    (SELECT COUNT(*) FROM dispensary_crawl_jobs WHERE status = 'completed') AS source_count,
    (SELECT COUNT(*) FROM crawl_runs WHERE legacy_dispensary_crawl_job_id IS NOT NULL) AS hydrated_count,
    ROUND(
        100.0 * (SELECT COUNT(*) FROM crawl_runs WHERE legacy_dispensary_crawl_job_id IS NOT NULL) /
        NULLIF((SELECT COUNT(*) FROM dispensary_crawl_jobs WHERE status = 'completed'), 0),
        2
    ) AS hydration_pct;


-- ============================================================================
-- DONE
-- ============================================================================

SELECT 'Migration 052 completed successfully. Hydration schema aligned.' AS status;
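
-- Example usage (illustrative): the view gives a one-row-per-source progress
-- readout while hydration backfills run.
SELECT source_table, source_count, hydrated_count, hydration_pct
FROM v_hydration_status
ORDER BY source_table;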
157 backend/migrations/053_analytics_indexes.sql Normal file
@@ -0,0 +1,157 @@
-- ============================================================================
-- Migration 053: Analytics Engine Indexes
-- ============================================================================
--
-- Purpose: Add indexes optimized for analytics queries on canonical tables.
--          These indexes support price trends, brand penetration, category
--          growth, and state-level analytics.
--
-- SAFETY RULES:
--   - Uses CREATE INDEX IF NOT EXISTS (idempotent)
--   - Uses ADD COLUMN IF NOT EXISTS for helper columns
--   - NO DROP, DELETE, TRUNCATE, or destructive operations
--   - Safe to run multiple times
--
-- Run with:
--   psql "$DATABASE_URL" -f migrations/053_analytics_indexes.sql
--
-- ============================================================================


-- ============================================================================
-- SECTION 1: Helper columns for analytics (if missing)
-- ============================================================================

-- Ensure store_products has brand_id for faster brand analytics joins
-- (brand_name exists, but a normalized brand_id helps)
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS brand_id INTEGER;

-- Ensure snapshots have category for time-series category analytics
ALTER TABLE store_product_snapshots ADD COLUMN IF NOT EXISTS category VARCHAR(100);


-- ============================================================================
-- SECTION 2: Price Analytics Indexes
-- ============================================================================

-- Price trends by store_product over time
CREATE INDEX IF NOT EXISTS idx_snapshots_product_price_time
    ON store_product_snapshots(store_product_id, captured_at DESC, price_rec, price_med)
    WHERE store_product_id IS NOT NULL;

-- Price by category over time (for category price trends)
CREATE INDEX IF NOT EXISTS idx_snapshots_category_price_time
    ON store_product_snapshots(category, captured_at DESC, price_rec)
    WHERE category IS NOT NULL;

-- Price change detection (for volatility analysis)
CREATE INDEX IF NOT EXISTS idx_products_price_change
    ON store_products(last_price_change_at DESC)
    WHERE last_price_change_at IS NOT NULL;


-- ============================================================================
-- SECTION 3: Brand Penetration Indexes
-- ============================================================================

-- Brand by dispensary (for penetration counts)
CREATE INDEX IF NOT EXISTS idx_products_brand_dispensary
    ON store_products(brand_name, dispensary_id)
    WHERE brand_name IS NOT NULL;

-- Brand by state (for state-level brand analytics)
CREATE INDEX IF NOT EXISTS idx_products_brand_state
    ON store_products(brand_name, state_id)
    WHERE brand_name IS NOT NULL AND state_id IS NOT NULL;

-- Brand first/last seen (for penetration trends)
CREATE INDEX IF NOT EXISTS idx_products_brand_first_seen
    ON store_products(brand_name, first_seen_at)
    WHERE brand_name IS NOT NULL;


-- ============================================================================
-- SECTION 4: Category Analytics Indexes
-- ============================================================================

-- Category by state (for state-level category analytics)
CREATE INDEX IF NOT EXISTS idx_products_category_state
    ON store_products(category, state_id)
    WHERE category IS NOT NULL;

-- Category by dispensary
CREATE INDEX IF NOT EXISTS idx_products_category_dispensary
    ON store_products(category, dispensary_id)
    WHERE category IS NOT NULL;

-- Category first seen (for growth tracking)
CREATE INDEX IF NOT EXISTS idx_products_category_first_seen
    ON store_products(category, first_seen_at)
    WHERE category IS NOT NULL;


-- ============================================================================
-- SECTION 5: Store Analytics Indexes
-- ============================================================================

-- Products added/removed by dispensary
CREATE INDEX IF NOT EXISTS idx_products_dispensary_first_seen
    ON store_products(dispensary_id, first_seen_at DESC);

CREATE INDEX IF NOT EXISTS idx_products_dispensary_last_seen
    ON store_products(dispensary_id, last_seen_at DESC);

-- Stock status changes
CREATE INDEX IF NOT EXISTS idx_products_stock_change
    ON store_products(dispensary_id, last_stock_change_at DESC)
    WHERE last_stock_change_at IS NOT NULL;


-- ============================================================================
-- SECTION 6: State Analytics Indexes
-- ============================================================================

-- Dispensary count by state
CREATE INDEX IF NOT EXISTS idx_dispensaries_state_active
    ON dispensaries(state_id)
    WHERE state_id IS NOT NULL;

-- Products by state
CREATE INDEX IF NOT EXISTS idx_products_state_active
    ON store_products(state_id, is_in_stock)
    WHERE state_id IS NOT NULL;

-- Snapshots by state for time-series
CREATE INDEX IF NOT EXISTS idx_snapshots_state_time
    ON store_product_snapshots(state_id, captured_at DESC)
    WHERE state_id IS NOT NULL;


-- ============================================================================
-- SECTION 7: Composite indexes for common analytics queries
-- ============================================================================

-- Brand + Category + State (for market share calculations)
CREATE INDEX IF NOT EXISTS idx_products_brand_category_state
    ON store_products(brand_name, category, state_id)
    WHERE brand_name IS NOT NULL AND category IS NOT NULL;

-- Dispensary + Category + Brand (for store-level brand analysis)
CREATE INDEX IF NOT EXISTS idx_products_disp_cat_brand
    ON store_products(dispensary_id, category, brand_name)
    WHERE category IS NOT NULL;

-- Special pricing by category (for promo analysis)
CREATE INDEX IF NOT EXISTS idx_products_special_category
    ON store_products(category, is_on_special)
    WHERE is_on_special = TRUE;


-- ============================================================================
-- SECTION 8: Verification
-- ============================================================================

SELECT
    'Migration 053 completed successfully.' AS status,
    (SELECT COUNT(*) FROM pg_indexes WHERE indexname LIKE 'idx_products_%') AS product_indexes,
    (SELECT COUNT(*) FROM pg_indexes WHERE indexname LIKE 'idx_snapshots_%') AS snapshot_indexes;
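
-- Example analytics query (illustrative, not part of the migration above):
-- brand share of SKUs for one category in one state, served by
-- idx_products_brand_category_state. The category and state_id values are
-- sample inputs.
SELECT brand_name, COUNT(*) AS sku_count
FROM store_products
WHERE category = 'Flower'
  AND state_id = 3
  AND brand_name IS NOT NULL
GROUP BY brand_name
ORDER BY sku_count DESC
LIMIT 20;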
346 backend/migrations/053_dutchie_discovery_schema.sql Normal file
@@ -0,0 +1,346 @@
-- ============================================================================
-- Migration 053: Dutchie Discovery Schema
-- ============================================================================
--
-- Purpose: Create tables for the Dutchie store discovery workflow.
--          Stores are discovered and held in staging tables until verified,
--          then promoted to the canonical dispensaries table.
--
-- Tables Created:
--   - dutchie_discovery_cities:    City pages from Dutchie
--   - dutchie_discovery_locations: Individual store locations
--
-- SAFETY RULES:
--   - ALL tables use CREATE TABLE IF NOT EXISTS
--   - NO DROP, DELETE, TRUNCATE, or destructive operations
--   - Does NOT touch the canonical dispensaries table
--   - Fully idempotent - safe to run multiple times
--
-- Run with:
--   psql "postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus" \
--     -f migrations/053_dutchie_discovery_schema.sql
--
-- ============================================================================


-- ============================================================================
-- SECTION 1: DUTCHIE_DISCOVERY_CITIES
-- ============================================================================
-- Stores Dutchie city pages for systematic crawling.
-- Each city can contain multiple dispensary locations.

CREATE TABLE IF NOT EXISTS dutchie_discovery_cities (
    id BIGSERIAL PRIMARY KEY,

    -- Platform identification (future-proof for other platforms)
    platform TEXT NOT NULL DEFAULT 'dutchie',

    -- City identification
    city_name TEXT NOT NULL,
    city_slug TEXT NOT NULL,
    state_code TEXT,                          -- 'AZ', 'CA', 'ON', etc.
    country_code TEXT NOT NULL DEFAULT 'US',

    -- Crawl management
    last_crawled_at TIMESTAMPTZ,
    crawl_enabled BOOLEAN NOT NULL DEFAULT TRUE,
    location_count INTEGER,                   -- Number of locations found in this city

    -- Metadata
    notes TEXT,
    metadata JSONB,

    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Add the unique constraint if it does not exist
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'dutchie_discovery_cities_unique'
    ) THEN
        ALTER TABLE dutchie_discovery_cities
            ADD CONSTRAINT dutchie_discovery_cities_unique
            UNIQUE (platform, country_code, state_code, city_slug);
    END IF;
EXCEPTION
    WHEN duplicate_object THEN NULL;
    WHEN OTHERS THEN NULL;
END $$;

-- Indexes
CREATE INDEX IF NOT EXISTS idx_discovery_cities_platform
    ON dutchie_discovery_cities(platform);

CREATE INDEX IF NOT EXISTS idx_discovery_cities_state
    ON dutchie_discovery_cities(country_code, state_code);

CREATE INDEX IF NOT EXISTS idx_discovery_cities_crawl_enabled
    ON dutchie_discovery_cities(crawl_enabled)
    WHERE crawl_enabled = TRUE;

CREATE INDEX IF NOT EXISTS idx_discovery_cities_last_crawled
    ON dutchie_discovery_cities(last_crawled_at);

COMMENT ON TABLE dutchie_discovery_cities IS 'City pages from Dutchie for systematic store discovery.';
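
-- Example (illustrative sketch, not part of this migration): how a discovery
-- run might register a city against the unique constraint above. The city
-- values are hypothetical; note the constraint only deduplicates rows with a
-- non-NULL state_code.
INSERT INTO dutchie_discovery_cities (platform, city_name, city_slug, state_code, country_code)
VALUES ('dutchie', 'Phoenix', 'phoenix', 'AZ', 'US')
ON CONFLICT ON CONSTRAINT dutchie_discovery_cities_unique
DO UPDATE SET last_crawled_at = NOW(), updated_at = NOW();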

-- ============================================================================
-- SECTION 2: DUTCHIE_DISCOVERY_LOCATIONS
-- ============================================================================
-- Individual store locations discovered from Dutchie.
-- These are NOT promoted to canonical dispensaries until verified.

CREATE TABLE IF NOT EXISTS dutchie_discovery_locations (
    id BIGSERIAL PRIMARY KEY,

    -- Platform identification
    platform TEXT NOT NULL DEFAULT 'dutchie',
    platform_location_id TEXT NOT NULL,       -- Dutchie's internal Location ID
    platform_slug TEXT NOT NULL,              -- URL slug for the store
    platform_menu_url TEXT NOT NULL,          -- Full menu URL

    -- Store name
    name TEXT NOT NULL,

    -- Address components
    raw_address TEXT,
    address_line1 TEXT,
    address_line2 TEXT,
    city TEXT,
    state_code TEXT,                          -- 'AZ', 'CA', 'ON', etc.
    postal_code TEXT,
    country_code TEXT,                        -- 'US' or 'CA'

    -- Coordinates
    latitude DOUBLE PRECISION,
    longitude DOUBLE PRECISION,
    timezone TEXT,

    -- Discovery status
    status TEXT NOT NULL DEFAULT 'discovered',
    -- discovered: Just found, not yet verified
    -- verified:   Verified and promoted to canonical dispensaries
    -- rejected:   Manually rejected (e.g., duplicate, test store)
    -- merged:     Linked to existing canonical dispensary

    -- Link to canonical dispensaries (only after verification)
    dispensary_id INTEGER,

    -- Reference to discovery city
    discovery_city_id BIGINT,

    -- Raw data from Dutchie
    metadata JSONB,
    notes TEXT,

    -- Store capabilities (from Dutchie)
    offers_delivery BOOLEAN,
    offers_pickup BOOLEAN,
    is_recreational BOOLEAN,
    is_medical BOOLEAN,

    -- Tracking
    first_seen_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    last_seen_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    last_checked_at TIMESTAMPTZ,
    verified_at TIMESTAMPTZ,
    verified_by TEXT,                         -- User who verified

    active BOOLEAN NOT NULL DEFAULT TRUE,

    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Add the unique constraints if they do not exist
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'dutchie_discovery_locations_platform_id_unique'
    ) THEN
        ALTER TABLE dutchie_discovery_locations
            ADD CONSTRAINT dutchie_discovery_locations_platform_id_unique
            UNIQUE (platform, platform_location_id);
    END IF;
EXCEPTION
    WHEN duplicate_object THEN NULL;
    WHEN OTHERS THEN NULL;
END $$;

DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'dutchie_discovery_locations_slug_unique'
    ) THEN
        ALTER TABLE dutchie_discovery_locations
            ADD CONSTRAINT dutchie_discovery_locations_slug_unique
            UNIQUE (platform, platform_slug, country_code, state_code, city);
    END IF;
EXCEPTION
    WHEN duplicate_object THEN NULL;
    WHEN OTHERS THEN NULL;
END $$;

-- Add FK to dispensaries if not exists (allows NULL)
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'dutchie_discovery_locations_dispensary_fk'
    ) THEN
        ALTER TABLE dutchie_discovery_locations
            ADD CONSTRAINT dutchie_discovery_locations_dispensary_fk
            FOREIGN KEY (dispensary_id) REFERENCES dispensaries(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN NULL;
    WHEN OTHERS THEN NULL;
END $$;

-- Add FK to discovery cities if not exists
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'dutchie_discovery_locations_city_fk'
    ) THEN
        ALTER TABLE dutchie_discovery_locations
            ADD CONSTRAINT dutchie_discovery_locations_city_fk
            FOREIGN KEY (discovery_city_id) REFERENCES dutchie_discovery_cities(id) ON DELETE SET NULL;
    END IF;
EXCEPTION
    WHEN duplicate_object THEN NULL;
    WHEN OTHERS THEN NULL;
END $$;

-- Indexes
CREATE INDEX IF NOT EXISTS idx_discovery_locations_platform
    ON dutchie_discovery_locations(platform);

CREATE INDEX IF NOT EXISTS idx_discovery_locations_status
    ON dutchie_discovery_locations(status);

CREATE INDEX IF NOT EXISTS idx_discovery_locations_state
    ON dutchie_discovery_locations(country_code, state_code);

CREATE INDEX IF NOT EXISTS idx_discovery_locations_city
    ON dutchie_discovery_locations(city, state_code);

CREATE INDEX IF NOT EXISTS idx_discovery_locations_dispensary
    ON dutchie_discovery_locations(dispensary_id)
    WHERE dispensary_id IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_discovery_locations_discovered
    ON dutchie_discovery_locations(status, first_seen_at DESC)
    WHERE status = 'discovered';

CREATE INDEX IF NOT EXISTS idx_discovery_locations_active
    ON dutchie_discovery_locations(active)
    WHERE active = TRUE;

CREATE INDEX IF NOT EXISTS idx_discovery_locations_coords
    ON dutchie_discovery_locations(latitude, longitude)
    WHERE latitude IS NOT NULL AND longitude IS NOT NULL;

COMMENT ON TABLE dutchie_discovery_locations IS 'Discovered store locations from Dutchie. Held in staging until verified.';


-- ============================================================================
-- SECTION 3: ADD CANADIAN PROVINCES TO STATES TABLE
-- ============================================================================
-- Support for Canadian provinces (Ontario, BC, Alberta, etc.)

INSERT INTO states (code, name, timezone, is_active, crawl_enabled) VALUES
    ('AB', 'Alberta', 'America/Edmonton', TRUE, TRUE),
    ('BC', 'British Columbia', 'America/Vancouver', TRUE, TRUE),
    ('MB', 'Manitoba', 'America/Winnipeg', TRUE, TRUE),
    ('NB', 'New Brunswick', 'America/Moncton', TRUE, TRUE),
    ('NL', 'Newfoundland and Labrador', 'America/St_Johns', TRUE, TRUE),
    ('NS', 'Nova Scotia', 'America/Halifax', TRUE, TRUE),
    ('NT', 'Northwest Territories', 'America/Yellowknife', TRUE, TRUE),
    ('NU', 'Nunavut', 'America/Iqaluit', TRUE, TRUE),
    ('ON', 'Ontario', 'America/Toronto', TRUE, TRUE),
    ('PE', 'Prince Edward Island', 'America/Halifax', TRUE, TRUE),
    ('QC', 'Quebec', 'America/Montreal', TRUE, TRUE),
    ('SK', 'Saskatchewan', 'America/Regina', TRUE, TRUE),
    ('YT', 'Yukon', 'America/Whitehorse', TRUE, TRUE)
ON CONFLICT (code) DO UPDATE SET
    name = EXCLUDED.name,
    timezone = COALESCE(states.timezone, EXCLUDED.timezone),
    updated_at = NOW();


-- ============================================================================
-- SECTION 4: VIEWS FOR DISCOVERY MONITORING
-- ============================================================================

-- View: Discovery status summary
CREATE OR REPLACE VIEW v_discovery_status AS
SELECT
    platform,
    country_code,
    state_code,
    status,
    COUNT(*) AS location_count,
    COUNT(*) FILTER (WHERE dispensary_id IS NOT NULL) AS linked_count,
    MIN(first_seen_at) AS earliest_discovery,
    MAX(last_seen_at) AS latest_activity
FROM dutchie_discovery_locations
GROUP BY platform, country_code, state_code, status
ORDER BY country_code, state_code, status;

-- View: Unverified discoveries awaiting action
CREATE OR REPLACE VIEW v_discovery_pending AS
SELECT
    dl.id,
    dl.platform,
    dl.name,
    dl.city,
    dl.state_code,
    dl.country_code,
    dl.platform_menu_url,
    dl.first_seen_at,
    dl.last_seen_at,
    dl.offers_delivery,
    dl.offers_pickup,
    dl.is_recreational,
    dl.is_medical,
    dc.city_name AS discovery_city_name
FROM dutchie_discovery_locations dl
LEFT JOIN dutchie_discovery_cities dc ON dc.id = dl.discovery_city_id
WHERE dl.status = 'discovered'
  AND dl.active = TRUE
ORDER BY dl.state_code, dl.city, dl.name;

-- View: City crawl status
CREATE OR REPLACE VIEW v_discovery_cities_status AS
SELECT
    dc.id,
    dc.platform,
    dc.city_name,
    dc.state_code,
    dc.country_code,
    dc.crawl_enabled,
    dc.last_crawled_at,
    dc.location_count,
    COUNT(dl.id) AS actual_locations,
    COUNT(dl.id) FILTER (WHERE dl.status = 'discovered') AS pending_count,
    COUNT(dl.id) FILTER (WHERE dl.status = 'verified') AS verified_count,
    COUNT(dl.id) FILTER (WHERE dl.status = 'rejected') AS rejected_count
FROM dutchie_discovery_cities dc
LEFT JOIN dutchie_discovery_locations dl ON dl.discovery_city_id = dc.id
GROUP BY dc.id, dc.platform, dc.city_name, dc.state_code, dc.country_code,
         dc.crawl_enabled, dc.last_crawled_at, dc.location_count
ORDER BY dc.country_code, dc.state_code, dc.city_name;


-- ============================================================================
-- DONE
-- ============================================================================

SELECT 'Migration 053 completed successfully. Discovery schema created.' AS status;
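
-- Example (illustrative sketch, not part of this migration): promoting a
-- discovered location once an operator links it to a canonical dispensary.
-- The ids and reviewer identity are hypothetical.
UPDATE dutchie_discovery_locations
SET status = 'verified',
    dispensary_id = 42,
    verified_at = NOW(),
    verified_by = 'ops@example.com',
    updated_at = NOW()
WHERE id = 1001
  AND status = 'discovered';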
49 backend/migrations/054_worker_metadata.sql Normal file
@@ -0,0 +1,49 @@
-- Migration 054: Worker Metadata for Named Workforce
-- Adds worker_name and worker_role to job tables for displaying friendly worker identities

-- Add worker metadata columns to job_schedules
ALTER TABLE job_schedules
    ADD COLUMN IF NOT EXISTS worker_name VARCHAR(50),
    ADD COLUMN IF NOT EXISTS worker_role VARCHAR(100);

COMMENT ON COLUMN job_schedules.worker_name IS 'Friendly name for the worker (e.g., Alice, Henry, Bella, Oscar)';
COMMENT ON COLUMN job_schedules.worker_role IS 'Description of worker role (e.g., Store Discovery Worker, GraphQL Product Sync)';

-- Add worker metadata columns to job_run_logs
ALTER TABLE job_run_logs
    ADD COLUMN IF NOT EXISTS worker_name VARCHAR(50),
    ADD COLUMN IF NOT EXISTS run_role VARCHAR(100);

COMMENT ON COLUMN job_run_logs.worker_name IS 'Name of the worker that executed this run (copied from schedule)';
COMMENT ON COLUMN job_run_logs.run_role IS 'Role description for this specific run';

-- Add worker_name to dispensary_crawl_jobs (for tracking which named worker enqueued it)
ALTER TABLE dispensary_crawl_jobs
    ADD COLUMN IF NOT EXISTS enqueued_by_worker VARCHAR(50);

COMMENT ON COLUMN dispensary_crawl_jobs.enqueued_by_worker IS 'Name of the worker that enqueued this job';

-- Update existing schedules with worker names
UPDATE job_schedules SET
    worker_name = 'Bella',
    worker_role = 'GraphQL Product Sync'
WHERE job_name = 'dutchie_az_product_crawl' AND worker_name IS NULL;

UPDATE job_schedules SET
    worker_name = 'Henry',
    worker_role = 'Entry Point Finder'
WHERE job_name = 'dutchie_az_menu_detection' AND worker_name IS NULL;

UPDATE job_schedules SET
    worker_name = 'Alice',
    worker_role = 'Store Discovery'
WHERE job_name = 'dutchie_store_discovery' AND worker_name IS NULL;

UPDATE job_schedules SET
    worker_name = 'Oscar',
    worker_role = 'Analytics Refresh'
WHERE job_name = 'analytics_refresh' AND worker_name IS NULL;

-- Create indexes for worker name lookups
CREATE INDEX IF NOT EXISTS idx_job_run_logs_worker_name ON job_run_logs(worker_name);
CREATE INDEX IF NOT EXISTS idx_dispensary_crawl_jobs_enqueued_by ON dispensary_crawl_jobs(enqueued_by_worker);
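
-- Example query (illustrative, not part of the migration above): run counts
-- per named worker, served by idx_job_run_logs_worker_name.
SELECT worker_name, run_role, COUNT(*) AS runs
FROM job_run_logs
WHERE worker_name IS NOT NULL
GROUP BY worker_name, run_role
ORDER BY runs DESC;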
123 backend/migrations/055_workforce_enhancements.sql Normal file
@@ -0,0 +1,123 @@
-- Migration 055: Workforce System Enhancements
-- Adds visibility tracking, slug change tracking, and scope support for workers

-- ============================================================
-- 1. VISIBILITY TRACKING FOR BELLA (Product Sync)
-- ============================================================

-- Add visibility tracking to dutchie_products
ALTER TABLE dutchie_products
    ADD COLUMN IF NOT EXISTS visibility_lost BOOLEAN DEFAULT FALSE,
    ADD COLUMN IF NOT EXISTS visibility_lost_at TIMESTAMPTZ,
    ADD COLUMN IF NOT EXISTS visibility_restored_at TIMESTAMPTZ;

COMMENT ON COLUMN dutchie_products.visibility_lost IS 'True if product disappeared from GraphQL results';
COMMENT ON COLUMN dutchie_products.visibility_lost_at IS 'When product was last marked as visibility lost';
COMMENT ON COLUMN dutchie_products.visibility_restored_at IS 'When product reappeared after being lost';

-- Index for visibility queries
CREATE INDEX IF NOT EXISTS idx_dutchie_products_visibility_lost
    ON dutchie_products(dispensary_id, visibility_lost)
    WHERE visibility_lost = TRUE;

-- ============================================================
-- 2. SLUG CHANGE TRACKING FOR ALICE (Store Discovery)
-- ============================================================

-- Add slug change and retirement tracking to discovery locations
ALTER TABLE dutchie_discovery_locations
    ADD COLUMN IF NOT EXISTS slug_changed_at TIMESTAMPTZ,
    ADD COLUMN IF NOT EXISTS previous_slug VARCHAR(255),
    ADD COLUMN IF NOT EXISTS retired_at TIMESTAMPTZ,
    ADD COLUMN IF NOT EXISTS retirement_reason VARCHAR(100);

COMMENT ON COLUMN dutchie_discovery_locations.slug_changed_at IS 'When the platform slug was last changed';
COMMENT ON COLUMN dutchie_discovery_locations.previous_slug IS 'Previous slug before the last change';
COMMENT ON COLUMN dutchie_discovery_locations.retired_at IS 'When store was marked as retired/removed';
COMMENT ON COLUMN dutchie_discovery_locations.retirement_reason IS 'Reason for retirement (removed_from_source, closed, etc.)';

-- Index for finding retired stores
CREATE INDEX IF NOT EXISTS idx_dutchie_discovery_locations_retired
    ON dutchie_discovery_locations(retired_at)
    WHERE retired_at IS NOT NULL;

-- ============================================================
-- 3. ID RESOLUTION TRACKING FOR HENRY (Entry Point Finder)
-- ============================================================

-- Add resolution tracking to dispensaries
ALTER TABLE dispensaries
    ADD COLUMN IF NOT EXISTS last_id_resolution_at TIMESTAMPTZ,
    ADD COLUMN IF NOT EXISTS id_resolution_attempts INT DEFAULT 0,
    ADD COLUMN IF NOT EXISTS id_resolution_error TEXT;

COMMENT ON COLUMN dispensaries.last_id_resolution_at IS 'When platform_dispensary_id was last resolved/attempted';
COMMENT ON COLUMN dispensaries.id_resolution_attempts IS 'Number of resolution attempts';
COMMENT ON COLUMN dispensaries.id_resolution_error IS 'Last error message from resolution attempt';

-- Index for finding stores needing resolution
CREATE INDEX IF NOT EXISTS idx_dispensaries_needs_resolution
    ON dispensaries(state, menu_type)
    WHERE platform_dispensary_id IS NULL AND menu_type = 'dutchie';
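
-- Example query (illustrative, not part of this migration): Henry's
-- resolution backlog, matching the partial index
-- idx_dispensaries_needs_resolution added above.
SELECT id, state, id_resolution_attempts, id_resolution_error
FROM dispensaries
WHERE platform_dispensary_id IS NULL
  AND menu_type = 'dutchie'
ORDER BY id_resolution_attempts, last_id_resolution_at NULLS FIRST;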
|
|
||||||
|
-- ============================================================
|
||||||
|
-- 4. ENHANCED CITIES TABLE FOR ALICE
|
||||||
|
-- ============================================================
|
||||||
|
|
||||||
|
-- Add tracking columns to cities table
|
||||||
|
ALTER TABLE dutchie_discovery_cities
|
||||||
|
ADD COLUMN IF NOT EXISTS state_name VARCHAR(100),
|
||||||
|
ADD COLUMN IF NOT EXISTS discovered_at TIMESTAMPTZ DEFAULT NOW(),
|
||||||
|
ADD COLUMN IF NOT EXISTS last_verified_at TIMESTAMPTZ,
|
||||||
|
ADD COLUMN IF NOT EXISTS store_count_reported INT,
|
||||||
|
ADD COLUMN IF NOT EXISTS store_count_actual INT;
|
||||||
|
|
||||||
|
COMMENT ON COLUMN dutchie_discovery_cities.state_name IS 'Full state name from source';
|
||||||
|
COMMENT ON COLUMN dutchie_discovery_cities.discovered_at IS 'When city was first discovered';
|
||||||
|
COMMENT ON COLUMN dutchie_discovery_cities.last_verified_at IS 'When city was last verified to exist';
|
||||||
|
COMMENT ON COLUMN dutchie_discovery_cities.store_count_reported IS 'Store count reported by source';
|
||||||
|
COMMENT ON COLUMN dutchie_discovery_cities.store_count_actual IS 'Actual store count from discovery';
|
||||||
|
|
||||||
|
-- ============================================================
|
||||||
|
-- 5. UPDATE WORKER ROLES (Standardize naming)
|
||||||
|
-- ============================================================
|
||||||
|
|
||||||
|
-- Update existing workers to use standardized role names
|
||||||
|
UPDATE job_schedules SET worker_role = 'store_discovery'
|
||||||
|
WHERE worker_name = 'Alice' AND worker_role = 'Store Discovery';
|
||||||
|
|
||||||
|
UPDATE job_schedules SET worker_role = 'entry_point_finder'
|
||||||
|
WHERE worker_name = 'Henry' AND worker_role = 'Entry Point Finder';
|
||||||
|
|
||||||
|
UPDATE job_schedules SET worker_role = 'product_sync'
|
||||||
|
WHERE worker_name = 'Bella' AND worker_role = 'GraphQL Product Sync';
|
||||||
|
|
||||||
|
UPDATE job_schedules SET worker_role = 'analytics_refresh'
|
||||||
|
WHERE worker_name = 'Oscar' AND worker_role = 'Analytics Refresh';
|
||||||
|
|
||||||
|
-- ============================================================
|
||||||
|
-- 6. VISIBILITY EVENTS IN SNAPSHOTS (JSONB approach)
|
||||||
|
-- ============================================================
|
||||||
|
|
||||||
|
-- Add visibility_events array to product snapshots metadata
|
||||||
|
-- This will store: [{event_type, timestamp, worker_name}]
|
||||||
|
-- No schema change needed - we use existing metadata JSONB column
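
-- Illustrative sketch only (commented out, not executed by this migration):
-- one way a worker could append a visibility event to a snapshot's metadata
-- JSONB. The target table and the snapshot id (123) are assumptions here.
--
-- UPDATE dutchie_product_snapshots
-- SET metadata = jsonb_set(
--       COALESCE(metadata, '{}'::jsonb),
--       '{visibility_events}',
--       COALESCE(metadata->'visibility_events', '[]'::jsonb)
--         || jsonb_build_object(
--              'event_type', 'visibility_lost',
--              'timestamp', NOW(),
--              'worker_name', 'Bella')
--     )
-- WHERE id = 123;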

-- ============================================================
-- 7. INDEXES FOR WORKER QUERIES
-- ============================================================

-- Index for finding recently added stores (for Henry)
CREATE INDEX IF NOT EXISTS idx_dutchie_discovery_locations_created
  ON dutchie_discovery_locations(created_at DESC)
  WHERE active = TRUE;

-- Index for scope-based queries (by state)
CREATE INDEX IF NOT EXISTS idx_dispensaries_state_menu
  ON dispensaries(state, menu_type)
  WHERE menu_type IS NOT NULL;

-- Record migration
INSERT INTO schema_migrations (version, name, applied_at)
VALUES (55, '055_workforce_enhancements', NOW())
ON CONFLICT (version) DO NOTHING;
110
backend/migrations/056_fix_worker_and_run_logs.sql
Normal file
@@ -0,0 +1,110 @@
-- Migration 056: Fix Worker Metadata and Job Run Logs
--
-- This migration safely ensures all expected schema exists for:
-- 1. job_schedules - worker_name, worker_role columns
-- 2. job_run_logs - entire table creation if missing
--
-- Uses IF NOT EXISTS / ADD COLUMN IF NOT EXISTS for idempotency.
-- Safe to run on databases that already have some or all of these changes.

-- ============================================================
-- 1. ADD MISSING COLUMNS TO job_schedules
-- ============================================================

ALTER TABLE job_schedules
  ADD COLUMN IF NOT EXISTS worker_name VARCHAR(50),
  ADD COLUMN IF NOT EXISTS worker_role VARCHAR(100);

COMMENT ON COLUMN job_schedules.worker_name IS 'Friendly name for the worker (e.g., Alice, Henry, Bella, Oscar)';
COMMENT ON COLUMN job_schedules.worker_role IS 'Description of worker role (e.g., store_discovery, product_sync)';

-- ============================================================
-- 2. CREATE job_run_logs TABLE IF NOT EXISTS
-- ============================================================

CREATE TABLE IF NOT EXISTS job_run_logs (
  id SERIAL PRIMARY KEY,
  schedule_id INTEGER NOT NULL REFERENCES job_schedules(id) ON DELETE CASCADE,
  job_name VARCHAR(100) NOT NULL,
  status VARCHAR(20) NOT NULL, -- 'pending', 'running', 'success', 'error', 'partial'
  started_at TIMESTAMPTZ,
  completed_at TIMESTAMPTZ,
  duration_ms INTEGER,
  error_message TEXT,

  -- Results summary
  items_processed INTEGER DEFAULT 0,
  items_succeeded INTEGER DEFAULT 0,
  items_failed INTEGER DEFAULT 0,

  -- Worker metadata (from scheduler.ts createRunLog function)
  worker_name VARCHAR(50),
  run_role VARCHAR(100),

  -- Additional run details
  metadata JSONB,

  created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Create indexes if they don't exist
CREATE INDEX IF NOT EXISTS idx_job_run_logs_schedule ON job_run_logs(schedule_id);
CREATE INDEX IF NOT EXISTS idx_job_run_logs_job_name ON job_run_logs(job_name);
CREATE INDEX IF NOT EXISTS idx_job_run_logs_status ON job_run_logs(status);
CREATE INDEX IF NOT EXISTS idx_job_run_logs_created ON job_run_logs(created_at);
CREATE INDEX IF NOT EXISTS idx_job_run_logs_worker_name ON job_run_logs(worker_name);

-- ============================================================
-- 3. ADD enqueued_by_worker TO dispensary_crawl_jobs IF EXISTS
-- ============================================================

DO $$
BEGIN
  -- Only add column if dispensary_crawl_jobs table exists
  IF EXISTS (SELECT FROM information_schema.tables WHERE table_name = 'dispensary_crawl_jobs') THEN
    ALTER TABLE dispensary_crawl_jobs
      ADD COLUMN IF NOT EXISTS enqueued_by_worker VARCHAR(50);

    COMMENT ON COLUMN dispensary_crawl_jobs.enqueued_by_worker IS 'Name of the worker that enqueued this job';

    CREATE INDEX IF NOT EXISTS idx_dispensary_crawl_jobs_enqueued_by
      ON dispensary_crawl_jobs(enqueued_by_worker);
  END IF;
END $$;

-- ============================================================
-- 4. SEED DEFAULT WORKER NAMES FOR EXISTING SCHEDULES
-- ============================================================

UPDATE job_schedules SET
  worker_name = 'Bella',
  worker_role = 'product_sync'
WHERE job_name = 'dutchie_az_product_crawl' AND worker_name IS NULL;

UPDATE job_schedules SET
  worker_name = 'Henry',
  worker_role = 'entry_point_finder'
WHERE job_name = 'dutchie_az_menu_detection' AND worker_name IS NULL;

UPDATE job_schedules SET
  worker_name = 'Alice',
  worker_role = 'store_discovery'
WHERE job_name = 'dutchie_store_discovery' AND worker_name IS NULL;

UPDATE job_schedules SET
  worker_name = 'Oscar',
  worker_role = 'analytics_refresh'
WHERE job_name = 'analytics_refresh' AND worker_name IS NULL;

-- ============================================================
-- 5. RECORD MIGRATION (if schema_migrations table exists)
-- ============================================================

DO $$
BEGIN
  IF EXISTS (SELECT FROM information_schema.tables WHERE table_name = 'schema_migrations') THEN
    INSERT INTO schema_migrations (version, name, applied_at)
    VALUES (56, '056_fix_worker_and_run_logs', NOW())
    ON CONFLICT (version) DO NOTHING;
  END IF;
END $$;
@@ -10,11 +10,18 @@
     "migrate": "tsx src/db/migrate.ts",
     "seed": "tsx src/db/seed.ts",
     "migrate:az": "tsx src/dutchie-az/db/migrate.ts",
-    "health:az": "tsx -e \"import { healthCheck } from './src/dutchie-az/db/connection'; (async()=>{ const ok=await healthCheck(); console.log(ok?'AZ DB healthy':'AZ DB NOT reachable'); process.exit(ok?0:1); })();\""
+    "health:az": "tsx -e \"import { healthCheck } from './src/dutchie-az/db/connection'; (async()=>{ const ok=await healthCheck(); console.log(ok?'AZ DB healthy':'AZ DB NOT reachable'); process.exit(ok?0:1); })();\"",
+    "system:smoke-test": "tsx src/scripts/system-smoke-test.ts",
+    "discovery:dt:cities:auto": "tsx src/dutchie-az/discovery/discovery-dt-cities-auto.ts",
+    "discovery:dt:cities:manual": "tsx src/dutchie-az/discovery/discovery-dt-cities-manual-seed.ts",
+    "discovery:dt:locations": "tsx src/dutchie-az/discovery/discovery-dt-locations-from-cities.ts",
+    "backfill:legacy:canonical": "tsx src/scripts/backfill-legacy-to-canonical.ts",
+    "seed:dt:cities:bulk": "tsx src/scripts/seed-dt-cities-bulk.ts"
   },
   "dependencies": {
     "axios": "^1.6.2",
     "bcrypt": "^5.1.1",
+    "cheerio": "^1.1.2",
     "cors": "^2.8.5",
     "dotenv": "^16.3.1",
     "express": "^4.18.2",
224
backend/setup-local.sh
Executable file
@@ -0,0 +1,224 @@
#!/bin/bash
# CannaiQ Local Development Setup (Idempotent)
#
# This script starts the complete local development environment:
#   - PostgreSQL (cannaiq-postgres) on port 54320
#   - Backend API on port 3010
#   - CannaiQ Admin UI on port 8080
#   - FindADispo Consumer UI on port 3001
#   - Findagram Consumer UI on port 3002
#
# Usage: ./setup-local.sh
#
# URLs:
#   Admin:      http://localhost:8080/admin
#   FindADispo: http://localhost:3001
#   Findagram:  http://localhost:3002
#   Backend:    http://localhost:3010
#
# Idempotent: Safe to run multiple times. Already-running services are left alone.

set -e

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

echo -e "${BLUE}================================${NC}"
echo -e "${BLUE}  CannaiQ Local Dev Setup${NC}"
echo -e "${BLUE}================================${NC}"
echo ""

# Check for required tools
command -v docker >/dev/null 2>&1 || { echo -e "${RED}Error: docker is required but not installed.${NC}" >&2; exit 1; }
command -v npm >/dev/null 2>&1 || { echo -e "${RED}Error: npm is required but not installed.${NC}" >&2; exit 1; }

# Get the script directory
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
ROOT_DIR="$SCRIPT_DIR/.."
cd "$SCRIPT_DIR"

# Step 1: PostgreSQL
PG_RUNNING=$(docker ps --filter "name=cannaiq-postgres" --filter "status=running" -q)
if [ -n "$PG_RUNNING" ]; then
  echo -e "${GREEN}[1/6] PostgreSQL already running (cannaiq-postgres)${NC}"
else
  echo -e "${YELLOW}[1/6] Starting PostgreSQL (cannaiq-postgres)...${NC}"
  docker compose -f docker-compose.local.yml up -d cannaiq-postgres

  # Wait for PostgreSQL to be ready
  echo -e "${YELLOW}  Waiting for PostgreSQL to be ready...${NC}"
  until docker exec cannaiq-postgres pg_isready -U cannaiq >/dev/null 2>&1; do
    sleep 1
  done
  echo -e "${GREEN}  PostgreSQL ready on port 54320${NC}"
fi

# Step 2: Create storage directories (always safe to run)
mkdir -p storage/images/products
mkdir -p storage/images/brands
mkdir -p public/images

# Step 3: Backend
if lsof -i:3010 >/dev/null 2>&1; then
  echo -e "${GREEN}[2/6] Backend already running on port 3010${NC}"
else
  echo -e "${YELLOW}[2/6] Starting Backend API...${NC}"

  # Install dependencies if needed
  if [ ! -d "node_modules" ]; then
    echo -e "${YELLOW}  Installing backend dependencies...${NC}"
    npm install
  fi

  # Set environment for local mode
  export STORAGE_DRIVER=local
  export STORAGE_BASE_PATH=./storage
  export PORT=3010

  # Start backend in background
  npm run dev > /tmp/cannaiq-backend.log 2>&1 &
  BACKEND_PID=$!
  echo $BACKEND_PID > /tmp/cannaiq-backend.pid
  echo -e "${GREEN}  Backend starting (PID: $BACKEND_PID)${NC}"

  # Wait briefly for backend to start
  sleep 3
fi

# Step 4: CannaiQ Admin UI
if lsof -i:8080 >/dev/null 2>&1; then
  echo -e "${GREEN}[3/6] CannaiQ Admin already running on port 8080${NC}"
else
  echo -e "${YELLOW}[3/6] Starting CannaiQ Admin UI...${NC}"

  cd "$ROOT_DIR/cannaiq"

  # Install dependencies if needed
  if [ ! -d "node_modules" ]; then
    echo -e "${YELLOW}  Installing cannaiq dependencies...${NC}"
    npm install
  fi

  # Start frontend in background
  npm run dev:admin > /tmp/cannaiq-frontend.log 2>&1 &
  FRONTEND_PID=$!
  echo $FRONTEND_PID > /tmp/cannaiq-frontend.pid
  echo -e "${GREEN}  CannaiQ Admin starting (PID: $FRONTEND_PID)${NC}"

  cd "$SCRIPT_DIR"
fi

# Step 5: FindADispo Consumer UI
if lsof -i:3001 >/dev/null 2>&1; then
  echo -e "${GREEN}[4/6] FindADispo already running on port 3001${NC}"
else
  echo -e "${YELLOW}[4/6] Starting FindADispo Consumer UI...${NC}"

  cd "$ROOT_DIR/findadispo/frontend"

  # Install dependencies if needed
  if [ ! -d "node_modules" ]; then
    echo -e "${YELLOW}  Installing findadispo dependencies...${NC}"
    npm install
  fi

  # Start in background on port 3001
  PORT=3001 npm run dev > /tmp/findadispo-frontend.log 2>&1 &
  FINDADISPO_PID=$!
  echo $FINDADISPO_PID > /tmp/findadispo-frontend.pid
  echo -e "${GREEN}  FindADispo starting (PID: $FINDADISPO_PID)${NC}"

  cd "$SCRIPT_DIR"
fi

# Step 6: Findagram Consumer UI
if lsof -i:3002 >/dev/null 2>&1; then
  echo -e "${GREEN}[5/6] Findagram already running on port 3002${NC}"
else
  echo -e "${YELLOW}[5/6] Starting Findagram Consumer UI...${NC}"

  cd "$ROOT_DIR/findagram/frontend"

  # Install dependencies if needed
  if [ ! -d "node_modules" ]; then
    echo -e "${YELLOW}  Installing findagram dependencies...${NC}"
    npm install
  fi

  # Start in background on port 3002
  PORT=3002 npm run dev > /tmp/findagram-frontend.log 2>&1 &
  FINDAGRAM_PID=$!
  echo $FINDAGRAM_PID > /tmp/findagram-frontend.pid
  echo -e "${GREEN}  Findagram starting (PID: $FINDAGRAM_PID)${NC}"

  cd "$SCRIPT_DIR"
fi

# Step 7: Health checks for newly started services
echo ""
echo -e "${YELLOW}[6/6] Checking service health...${NC}"

# Check backend if it was just started
if ! lsof -i:3010 >/dev/null 2>&1; then
  for i in {1..15}; do
    if curl -s http://localhost:3010/health > /dev/null 2>&1; then
      break
    fi
    sleep 1
  done
fi

if curl -s http://localhost:3010/health > /dev/null 2>&1; then
  echo -e "${GREEN}  Backend API: OK (port 3010)${NC}"
else
  echo -e "${YELLOW}  Backend API: Starting (check: tail -f /tmp/cannaiq-backend.log)${NC}"
fi

# Check CannaiQ Admin
if curl -s http://localhost:8080 > /dev/null 2>&1; then
  echo -e "${GREEN}  CannaiQ Admin: OK (port 8080)${NC}"
else
  echo -e "${YELLOW}  CannaiQ Admin: Starting (check: tail -f /tmp/cannaiq-frontend.log)${NC}"
fi

# Check FindADispo
sleep 2
if curl -s http://localhost:3001 > /dev/null 2>&1; then
  echo -e "${GREEN}  FindADispo: OK (port 3001)${NC}"
else
  echo -e "${YELLOW}  FindADispo: Starting (check: tail -f /tmp/findadispo-frontend.log)${NC}"
fi

# Check Findagram
if curl -s http://localhost:3002 > /dev/null 2>&1; then
  echo -e "${GREEN}  Findagram: OK (port 3002)${NC}"
else
  echo -e "${YELLOW}  Findagram: Starting (check: tail -f /tmp/findagram-frontend.log)${NC}"
fi

# Print final status
echo ""
echo -e "${BLUE}================================${NC}"
echo -e "${GREEN}  Local Environment Ready${NC}"
echo -e "${BLUE}================================${NC}"
echo ""
echo -e "  ${BLUE}Services:${NC}"
echo -e "    Postgres:    localhost:54320"
echo -e "    Backend API: http://localhost:3010"
echo ""
echo -e "  ${BLUE}Frontends:${NC}"
echo -e "    CannaiQ Admin: http://localhost:8080/admin"
echo -e "    FindADispo:    http://localhost:3001"
echo -e "    Findagram:     http://localhost:3002"
echo ""
echo -e "${YELLOW}To stop services:${NC} ./stop-local.sh"
echo -e "${YELLOW}View logs:${NC}"
echo "  Backend:    tail -f /tmp/cannaiq-backend.log"
echo "  CannaiQ:    tail -f /tmp/cannaiq-frontend.log"
echo "  FindADispo: tail -f /tmp/findadispo-frontend.log"
echo "  Findagram:  tail -f /tmp/findagram-frontend.log"
echo ""
@@ -1,7 +1,7 @@
 import { Request, Response, NextFunction } from 'express';
 import jwt from 'jsonwebtoken';
 import bcrypt from 'bcrypt';
-import { pool } from '../db/migrate';
+import { pool } from '../db/pool';
 
 const JWT_SECRET = process.env.JWT_SECRET || 'change_this_in_production';
 
204
backend/src/canonical-hydration/RUNBOOK.md
Normal file
@@ -0,0 +1,204 @@
# Canonical Hydration Pipeline - Runbook

## Overview

The Canonical Hydration Pipeline transforms data from the `dutchie_*` source tables into the provider-agnostic canonical tables (`store_products`, `store_product_snapshots`, `crawl_runs`). This enables:

- Unified analytics across multiple data providers
- Historical price/inventory tracking
- Provider-agnostic API endpoints

## Architecture

```
Source Tables (read-only):
  dutchie_products          → StoreProductNormalizer → store_products
  dutchie_product_snapshots → SnapshotWriter         → store_product_snapshots
  dispensary_crawl_jobs     → CrawlRunRecorder       → crawl_runs

Orchestration:
  CanonicalHydrationService coordinates all transformations
```

## Table Mappings

### dutchie_products → store_products

| Source Column | Target Column | Notes |
|---------------|---------------|-------|
| dispensary_id | dispensary_id | Direct mapping |
| external_product_id | provider_product_id | Canonical key |
| platform | provider | 'dutchie' |
| name | name_raw | Raw product name |
| brand_name | brand_name_raw | Raw brand name |
| type/subcategory | category_raw | Category info |
| price_rec (JSONB) | price_rec (DECIMAL) | Extracted from JSONB |
| price_med (JSONB) | price_med (DECIMAL) | Extracted from JSONB |
| thc | thc_percent | Parsed percentage |
| cbd | cbd_percent | Parsed percentage |
| stock_status | is_in_stock | Boolean conversion |
| total_quantity_available | stock_quantity | Direct mapping |
| primary_image_url | image_url | Direct mapping |
| created_at | first_seen_at | First seen timestamp |
| updated_at | last_seen_at | Last seen timestamp |

### Canonical Keys

- **store_products**: `(dispensary_id, provider, provider_product_id)`
- **store_product_snapshots**: `(store_product_id, crawl_run_id)`
- **crawl_runs**: `(source_job_type, source_job_id)`
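
These keys are enforced with unique indexes. A rough sketch of the DDL (index names and exact form are assumptions; migration 050, described below, defines the real DDL for the first two, and the partial `WHERE` clause matches the `ON CONFLICT` target used by the recorder):

```sql
-- At most one crawl run per source job (partial unique index)
CREATE UNIQUE INDEX IF NOT EXISTS uq_crawl_runs_source_job
  ON crawl_runs(source_job_type, source_job_id)
  WHERE source_job_id IS NOT NULL;

-- At most one snapshot per product per crawl run
CREATE UNIQUE INDEX IF NOT EXISTS uq_store_product_snapshots_product_run
  ON store_product_snapshots(store_product_id, crawl_run_id);

-- At most one canonical product per provider listing
CREATE UNIQUE INDEX IF NOT EXISTS uq_store_products_provider_key
  ON store_products(dispensary_id, provider, provider_product_id);
```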

## CLI Commands

### Check Hydration Status

```bash
# Overall status
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/backfill.ts --status

# Single dispensary status
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/backfill.ts --status --dispensary-id 112
```

### Products-Only Hydration

Use when source data has products but no historical snapshots/job records.

```bash
# Dry run (see what would be done)
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/products-only.ts --dry-run

# Hydrate single dispensary
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/products-only.ts --dispensary-id 112

# Hydrate all dispensaries
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/products-only.ts
```

### Backfill Hydration

Use when source data has historical job records in `dispensary_crawl_jobs`.

```bash
# Dry run
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/backfill.ts --dry-run

# Backfill with date range
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/backfill.ts --start-date 2024-01-01 --end-date 2024-12-31

# Backfill single dispensary
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/backfill.ts --dispensary-id 112
```

### Incremental Hydration

Use for ongoing hydration of new data.

```bash
# Single run
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/incremental.ts

# Continuous loop (runs every 60 seconds)
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/incremental.ts --loop

# Continuous loop with custom interval
DATABASE_URL="..." npx tsx src/canonical-hydration/cli/incremental.ts --loop --interval 300
```

## Migration

Apply the schema migration before first use:

```bash
# Apply migration 050
DATABASE_URL="..." psql -f src/migrations/050_canonical_hydration_schema.sql
```

This migration adds:

- `source_job_type` and `source_job_id` columns to `crawl_runs`
- Unique index on `crawl_runs (source_job_type, source_job_id)`
- Unique index on `store_product_snapshots (store_product_id, crawl_run_id)`
- Performance indexes for hydration queries

## Idempotency

All hydration operations are idempotent:

- **crawl_runs**: ON CONFLICT updates existing records
- **store_products**: ON CONFLICT updates mutable fields
- **store_product_snapshots**: ON CONFLICT DO NOTHING

Re-running hydration is safe and will not create duplicates.
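
For example, the crawl run recorder's upsert follows this shape (trimmed to a few columns; `crawl-run-recorder.ts` has the full statement):

```sql
INSERT INTO crawl_runs (dispensary_id, provider, status, source_job_type, source_job_id)
VALUES ($1, $2, $3, $4, $5)
ON CONFLICT (source_job_type, source_job_id) WHERE source_job_id IS NOT NULL
DO UPDATE SET status = EXCLUDED.status
RETURNING id;
```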

## Monitoring

### Check Canonical Data

```sql
-- Count canonical records
SELECT
  (SELECT COUNT(*) FROM crawl_runs WHERE provider = 'dutchie') as crawl_runs,
  (SELECT COUNT(*) FROM store_products WHERE provider = 'dutchie') as products,
  (SELECT COUNT(*) FROM store_product_snapshots) as snapshots;

-- Products by dispensary
SELECT dispensary_id, COUNT(*) as products
FROM store_products
WHERE provider = 'dutchie'
GROUP BY dispensary_id
ORDER BY products DESC;

-- Recent crawl runs
SELECT id, dispensary_id, started_at, products_found, snapshots_written
FROM crawl_runs
ORDER BY started_at DESC
LIMIT 10;
```

### Verify Hydration Completeness

```sql
-- Compare source vs canonical product counts
SELECT
  dp.dispensary_id,
  COUNT(DISTINCT dp.id) as source_products,
  COUNT(DISTINCT sp.id) as canonical_products
FROM dutchie_products dp
LEFT JOIN store_products sp
  ON sp.dispensary_id = dp.dispensary_id
  AND sp.provider = 'dutchie'
  AND sp.provider_product_id = dp.external_product_id
GROUP BY dp.dispensary_id
ORDER BY dp.dispensary_id;
```

## Troubleshooting

### "invalid input syntax for type integer"

This usually means a type mismatch between source and target columns. The most common case is `brand_id`: the source stores UUID strings while the target expects integers. The normalizer sets `brand_id = null` to handle this.

### "could not determine data type of parameter $1"

This indicates a batch insert issue with parameter indexing. Ensure each batch builds its own parameter list, with placeholder numbering restarting at $1.
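
A minimal sketch of the correct pattern (hypothetical helper, not the actual normalizer code): build the parameter array and the `$n` placeholders together, per batch.

```typescript
// Hypothetical helper: placeholder numbering restarts at $1 for every batch.
function buildBatchInsert(rows: Array<[number, string]>): { sql: string; params: unknown[] } {
  const params: unknown[] = [];
  const tuples = rows.map(([dispensaryId, nameRaw]) => {
    params.push(dispensaryId, nameRaw);
    // params.length is now the 1-based index of the last value pushed
    return `($${params.length - 1}, $${params.length})`;
  });
  return {
    sql: `INSERT INTO store_products (dispensary_id, name_raw) VALUES ${tuples.join(', ')}`,
    params,
  };
}
```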

### Empty Snapshots

If `snapshotsWritten` is 0 but products were upserted:

1. Check whether snapshots already exist for the crawl run (ON CONFLICT DO NOTHING skips them); a quick check is sketched below
2. Verify store_products exist with the correct dispensary_id and provider
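
An illustrative check for the first case (substitute a real crawl run id for the hypothetical 123):

```sql
SELECT COUNT(*) FROM store_product_snapshots WHERE crawl_run_id = 123;
```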

## Performance

Typical performance metrics:

- ~1000 products/second for upsert
- ~2000 snapshots/second for insert
- 39 dispensaries with 37K products: ~17 seconds

For large backfills, use `--batch-size` to control memory usage.

## Known Limitations

1. **brand_id not mapped**: Source brand_id is a UUID; the target expects an integer. Currently set to null.
2. **No historical snapshots**: If the source has no `dutchie_product_snapshots`, use products-only mode, which creates initial snapshots from current product state.
3. **Source jobs empty**: If `dispensary_crawl_jobs` is empty, use products-only mode.
170
backend/src/canonical-hydration/cli/backfill.ts
Normal file
@@ -0,0 +1,170 @@
#!/usr/bin/env npx tsx
/**
 * Backfill CLI - Historical data hydration
 *
 * Usage:
 *   npx tsx src/canonical-hydration/cli/backfill.ts [options]
 *
 * Options:
 *   --dispensary-id <id>   Hydrate only a specific dispensary
 *   --start-date <date>    Start date for backfill (ISO format)
 *   --end-date <date>      End date for backfill (ISO format)
 *   --batch-size <n>       Number of jobs to process per batch (default: 50)
 *   --dry-run              Show what would be done without making changes
 *   --status               Show hydration status and exit
 *
 * Examples:
 *   npx tsx src/canonical-hydration/cli/backfill.ts --status
 *   npx tsx src/canonical-hydration/cli/backfill.ts --dispensary-id 112
 *   npx tsx src/canonical-hydration/cli/backfill.ts --start-date 2024-01-01 --end-date 2024-12-31
 *   npx tsx src/canonical-hydration/cli/backfill.ts --dry-run
 */

import { Pool } from 'pg';
import { CanonicalHydrationService } from '../hydration-service';
import { HydrationOptions } from '../types';

async function main() {
  const args = process.argv.slice(2);

  // Parse command line arguments
  const options: HydrationOptions = {
    mode: 'backfill',
  };
  let showStatus = false;

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    switch (arg) {
      case '--dispensary-id':
        options.dispensaryId = parseInt(args[++i]);
        break;
      case '--start-date':
        options.startDate = new Date(args[++i]);
        break;
      case '--end-date':
        options.endDate = new Date(args[++i]);
        break;
      case '--batch-size':
        options.batchSize = parseInt(args[++i]);
        break;
      case '--dry-run':
        options.dryRun = true;
        break;
      case '--status':
        showStatus = true;
        break;
      case '--help':
        console.log(`
Backfill CLI - Historical data hydration

Usage:
  npx tsx src/canonical-hydration/cli/backfill.ts [options]

Options:
  --dispensary-id <id>   Hydrate only a specific dispensary
  --start-date <date>    Start date for backfill (ISO format)
  --end-date <date>      End date for backfill (ISO format)
  --batch-size <n>       Number of jobs to process per batch (default: 50)
  --dry-run              Show what would be done without making changes
  --status               Show hydration status and exit

Examples:
  npx tsx src/canonical-hydration/cli/backfill.ts --status
  npx tsx src/canonical-hydration/cli/backfill.ts --dispensary-id 112
  npx tsx src/canonical-hydration/cli/backfill.ts --start-date 2024-01-01 --end-date 2024-12-31
  npx tsx src/canonical-hydration/cli/backfill.ts --dry-run
`);
        process.exit(0);
    }
  }

  // Connect to database
  const pool = new Pool({
    connectionString: process.env.DATABASE_URL,
  });

  const service = new CanonicalHydrationService({
    pool,
    logger: (msg) => console.log(`[${new Date().toISOString()}] ${msg}`),
  });

  try {
    if (showStatus) {
      // Show status and exit
      if (options.dispensaryId) {
        const status = await service.getHydrationStatus(options.dispensaryId);
        console.log(`\nHydration Status for Dispensary ${options.dispensaryId}:`);
        console.log('═'.repeat(50));
        console.log(`  Source Jobs (completed): ${status.sourceJobs}`);
        console.log(`  Hydrated Jobs: ${status.hydratedJobs}`);
        console.log(`  Unhydrated Jobs: ${status.unhydratedJobs}`);
        console.log('');
        console.log(`  Source Products: ${status.sourceProducts}`);
        console.log(`  Store Products: ${status.storeProducts}`);
        console.log('');
        console.log(`  Source Snapshots: ${status.sourceSnapshots}`);
        console.log(`  Store Snapshots: ${status.storeSnapshots}`);
      } else {
        const status = await service.getOverallStatus();
        console.log('\nOverall Hydration Status:');
        console.log('═'.repeat(50));
        console.log(`  Dispensaries with Data: ${status.dispensariesWithData}`);
        console.log('');
        console.log(`  Source Jobs (completed): ${status.totalSourceJobs}`);
        console.log(`  Hydrated Jobs: ${status.totalHydratedJobs}`);
        console.log(`  Unhydrated Jobs: ${status.totalSourceJobs - status.totalHydratedJobs}`);
        console.log('');
        console.log(`  Source Products: ${status.totalSourceProducts}`);
        console.log(`  Store Products: ${status.totalStoreProducts}`);
        console.log('');
        console.log(`  Source Snapshots: ${status.totalSourceSnapshots}`);
        console.log(`  Store Snapshots: ${status.totalStoreSnapshots}`);
      }
      process.exit(0);
    }

    // Run backfill
    console.log('\n' + '═'.repeat(60));
    console.log('  CANONICAL HYDRATION - BACKFILL MODE');
    console.log('═'.repeat(60));
    console.log(`  Dispensary ID: ${options.dispensaryId || 'ALL'}`);
    console.log(`  Start Date: ${options.startDate?.toISOString() || 'N/A'}`);
    console.log(`  End Date: ${options.endDate?.toISOString() || 'N/A'}`);
    console.log(`  Batch Size: ${options.batchSize || 50}`);
    console.log(`  Dry Run: ${options.dryRun ? 'YES' : 'NO'}`);
    console.log('═'.repeat(60) + '\n');

    const result = await service.hydrate(options);

    console.log('\n' + '═'.repeat(60));
    console.log('  HYDRATION COMPLETE');
    console.log('═'.repeat(60));
    console.log(`  Crawl Runs Created: ${result.crawlRunsCreated}`);
    console.log(`  Crawl Runs Skipped: ${result.crawlRunsSkipped}`);
    console.log(`  Products Upserted: ${result.productsUpserted}`);
    console.log(`  Snapshots Written: ${result.snapshotsWritten}`);
    console.log(`  Duration: ${result.durationMs}ms`);
    console.log(`  Errors: ${result.errors.length}`);

    if (result.errors.length > 0) {
      console.log('\nErrors:');
      for (const error of result.errors.slice(0, 10)) {
        console.log(`  - ${error}`);
      }
      if (result.errors.length > 10) {
        console.log(`  ... and ${result.errors.length - 10} more`);
      }
    }
    console.log('═'.repeat(60) + '\n');

    process.exit(result.errors.length > 0 ? 1 : 0);
  } catch (error: any) {
    console.error('Fatal error:', error.message);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();
142
backend/src/canonical-hydration/cli/incremental.ts
Normal file
@@ -0,0 +1,142 @@
#!/usr/bin/env npx tsx
/**
 * Incremental CLI - Ongoing data hydration
 *
 * Usage:
 *   npx tsx src/canonical-hydration/cli/incremental.ts [options]
 *
 * Options:
 *   --dispensary-id <id>   Hydrate only a specific dispensary
 *   --batch-size <n>       Number of jobs to process per batch (default: 100)
 *   --loop                 Run continuously in a loop
 *   --interval <seconds>   Interval between loops (default: 60)
 *   --dry-run              Show what would be done without making changes
 *
 * Examples:
 *   npx tsx src/canonical-hydration/cli/incremental.ts
 *   npx tsx src/canonical-hydration/cli/incremental.ts --dispensary-id 112
 *   npx tsx src/canonical-hydration/cli/incremental.ts --loop --interval 300
 *   npx tsx src/canonical-hydration/cli/incremental.ts --dry-run
 */

import { Pool } from 'pg';
import { CanonicalHydrationService } from '../hydration-service';
import { HydrationOptions } from '../types';

async function main() {
  const args = process.argv.slice(2);

  // Parse command line arguments
  const options: HydrationOptions = {
    mode: 'incremental',
  };
  let loop = false;
  let intervalSeconds = 60;

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    switch (arg) {
      case '--dispensary-id':
        options.dispensaryId = parseInt(args[++i]);
        break;
      case '--batch-size':
        options.batchSize = parseInt(args[++i]);
        break;
      case '--loop':
        loop = true;
        break;
      case '--interval':
        intervalSeconds = parseInt(args[++i]);
        break;
      case '--dry-run':
        options.dryRun = true;
        break;
      case '--help':
        console.log(`
Incremental CLI - Ongoing data hydration

Usage:
  npx tsx src/canonical-hydration/cli/incremental.ts [options]

Options:
  --dispensary-id <id>   Hydrate only a specific dispensary
  --batch-size <n>       Number of jobs to process per batch (default: 100)
  --loop                 Run continuously in a loop
  --interval <seconds>   Interval between loops (default: 60)
  --dry-run              Show what would be done without making changes

Examples:
  npx tsx src/canonical-hydration/cli/incremental.ts
  npx tsx src/canonical-hydration/cli/incremental.ts --dispensary-id 112
  npx tsx src/canonical-hydration/cli/incremental.ts --loop --interval 300
  npx tsx src/canonical-hydration/cli/incremental.ts --dry-run
`);
        process.exit(0);
    }
  }

  // Connect to database
  const pool = new Pool({
    connectionString: process.env.DATABASE_URL,
  });

  const service = new CanonicalHydrationService({
    pool,
    logger: (msg) => console.log(`[${new Date().toISOString()}] ${msg}`),
  });

  const log = (msg: string) => console.log(`[${new Date().toISOString()}] ${msg}`);

  // Graceful shutdown
  let running = true;
  process.on('SIGINT', () => {
    log('Received SIGINT, shutting down...');
    running = false;
  });
  process.on('SIGTERM', () => {
    log('Received SIGTERM, shutting down...');
    running = false;
  });

  try {
    console.log('\n' + '═'.repeat(60));
    console.log('  CANONICAL HYDRATION - INCREMENTAL MODE');
    console.log('═'.repeat(60));
    console.log(`  Dispensary ID: ${options.dispensaryId || 'ALL'}`);
    console.log(`  Batch Size: ${options.batchSize || 100}`);
    console.log(`  Loop Mode: ${loop ? 'YES' : 'NO'}`);
    if (loop) {
      console.log(`  Interval: ${intervalSeconds}s`);
    }
    console.log(`  Dry Run: ${options.dryRun ? 'YES' : 'NO'}`);
    console.log('═'.repeat(60) + '\n');

    do {
      const result = await service.hydrate(options);

      log(`Hydration complete: ${result.crawlRunsCreated} runs, ${result.productsUpserted} products, ${result.snapshotsWritten} snapshots (${result.durationMs}ms)`);

      if (result.errors.length > 0) {
        log(`Errors: ${result.errors.length}`);
        for (const error of result.errors.slice(0, 5)) {
          log(`  - ${error}`);
        }
      }

      if (loop && running) {
        log(`Sleeping for ${intervalSeconds}s...`);
        await new Promise(resolve => setTimeout(resolve, intervalSeconds * 1000));
      }
    } while (loop && running);

    log('Incremental hydration completed');
    process.exit(0);
  } catch (error: any) {
    console.error('Fatal error:', error.message);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();
113
backend/src/canonical-hydration/cli/products-only.ts
Normal file
@@ -0,0 +1,113 @@
#!/usr/bin/env npx tsx
/**
 * Products-Only Hydration CLI
 *
 * Used when there are no historical job records - creates synthetic crawl runs
 * from current product data.
 *
 * Usage:
 *   npx tsx src/canonical-hydration/cli/products-only.ts [options]
 *
 * Options:
 *   --dispensary-id <id>   Hydrate only a specific dispensary
 *   --dry-run              Show what would be done without making changes
 *
 * Examples:
 *   npx tsx src/canonical-hydration/cli/products-only.ts
 *   npx tsx src/canonical-hydration/cli/products-only.ts --dispensary-id 112
 *   npx tsx src/canonical-hydration/cli/products-only.ts --dry-run
 */

import { Pool } from 'pg';
import { CanonicalHydrationService } from '../hydration-service';

async function main() {
  const args = process.argv.slice(2);

  // Parse command line arguments
  let dispensaryId: number | undefined;
  let dryRun = false;

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    switch (arg) {
      case '--dispensary-id':
        dispensaryId = parseInt(args[++i]);
        break;
      case '--dry-run':
        dryRun = true;
        break;
      case '--help':
        console.log(`
Products-Only Hydration CLI

Used when there are no historical job records - creates synthetic crawl runs
from current product data.

Usage:
  npx tsx src/canonical-hydration/cli/products-only.ts [options]

Options:
  --dispensary-id <id>   Hydrate only a specific dispensary
  --dry-run              Show what would be done without making changes

Examples:
  npx tsx src/canonical-hydration/cli/products-only.ts
  npx tsx src/canonical-hydration/cli/products-only.ts --dispensary-id 112
  npx tsx src/canonical-hydration/cli/products-only.ts --dry-run
`);
        process.exit(0);
    }
  }

  // Connect to database
  const pool = new Pool({
    connectionString: process.env.DATABASE_URL,
  });

  const service = new CanonicalHydrationService({
    pool,
    logger: (msg) => console.log(`[${new Date().toISOString()}] ${msg}`),
  });

  try {
    console.log('\n' + '═'.repeat(60));
    console.log('  CANONICAL HYDRATION - PRODUCTS-ONLY MODE');
    console.log('═'.repeat(60));
    console.log(`  Dispensary ID: ${dispensaryId || 'ALL'}`);
    console.log(`  Dry Run: ${dryRun ? 'YES' : 'NO'}`);
    console.log('═'.repeat(60) + '\n');

    const result = await service.hydrateProductsOnly({ dispensaryId, dryRun });

    console.log('\n' + '═'.repeat(60));
    console.log('  HYDRATION COMPLETE');
    console.log('═'.repeat(60));
    console.log(`  Crawl Runs Created: ${result.crawlRunsCreated}`);
    console.log(`  Crawl Runs Skipped: ${result.crawlRunsSkipped}`);
    console.log(`  Products Upserted: ${result.productsUpserted}`);
    console.log(`  Snapshots Written: ${result.snapshotsWritten}`);
    console.log(`  Duration: ${result.durationMs}ms`);
    console.log(`  Errors: ${result.errors.length}`);

    if (result.errors.length > 0) {
      console.log('\nErrors:');
      for (const error of result.errors.slice(0, 10)) {
        console.log(`  - ${error}`);
      }
      if (result.errors.length > 10) {
        console.log(`  ... and ${result.errors.length - 10} more`);
      }
    }
    console.log('═'.repeat(60) + '\n');

    process.exit(result.errors.length > 0 ? 1 : 0);
  } catch (error: any) {
    console.error('Fatal error:', error.message);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();
226
backend/src/canonical-hydration/crawl-run-recorder.ts
Normal file
@@ -0,0 +1,226 @@
/**
 * CrawlRunRecorder
 * Records crawl runs from source job tables (dispensary_crawl_jobs) to canonical crawl_runs table
 */

import { Pool, PoolClient } from 'pg';
import { SourceJob, CrawlRun, ServiceContext, SourceJobType } from './types';

export class CrawlRunRecorder {
  private pool: Pool;
  private log: (message: string) => void;

  constructor(ctx: ServiceContext) {
    this.pool = ctx.pool;
    this.log = ctx.logger || console.log;
  }

  /**
   * Record a single crawl run from a source job
   * Uses ON CONFLICT to ensure idempotency
   */
  async recordCrawlRun(
    sourceJob: SourceJob,
    sourceJobType: SourceJobType = 'dispensary_crawl_jobs'
  ): Promise<number | null> {
    // Skip jobs that aren't completed successfully
    if (sourceJob.status !== 'completed') {
      return null;
    }

    const crawlRun: Partial<CrawlRun> = {
      dispensary_id: sourceJob.dispensary_id,
      provider: 'dutchie', // Source is always dutchie for now
      started_at: sourceJob.started_at || new Date(),
      finished_at: sourceJob.completed_at,
      duration_ms: sourceJob.duration_ms,
      status: this.mapStatus(sourceJob.status),
      error_message: sourceJob.error_message,
      products_found: sourceJob.products_found,
      products_new: sourceJob.products_new,
      products_updated: sourceJob.products_updated,
      snapshots_written: null, // Will be updated after snapshot insertion
      worker_id: null,
      trigger_type: sourceJob.job_type === 'dutchie_product_crawl' ? 'scheduled' : 'manual',
      metadata: { sourceJobType, originalJobType: sourceJob.job_type },
      source_job_type: sourceJobType,
      source_job_id: sourceJob.id,
    };

    const result = await this.pool.query(
      `INSERT INTO crawl_runs (
        dispensary_id, provider, started_at, finished_at, duration_ms,
        status, error_message, products_found, products_new, products_updated,
        snapshots_written, worker_id, trigger_type, metadata,
        source_job_type, source_job_id
      ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16)
      ON CONFLICT (source_job_type, source_job_id) WHERE source_job_id IS NOT NULL
      DO UPDATE SET
        finished_at = EXCLUDED.finished_at,
        duration_ms = EXCLUDED.duration_ms,
        status = EXCLUDED.status,
        error_message = EXCLUDED.error_message,
        products_found = EXCLUDED.products_found,
        products_new = EXCLUDED.products_new,
        products_updated = EXCLUDED.products_updated
      RETURNING id`,
      [
        crawlRun.dispensary_id,
        crawlRun.provider,
        crawlRun.started_at,
        crawlRun.finished_at,
        crawlRun.duration_ms,
        crawlRun.status,
        crawlRun.error_message,
        crawlRun.products_found,
        crawlRun.products_new,
        crawlRun.products_updated,
        crawlRun.snapshots_written,
        crawlRun.worker_id,
        crawlRun.trigger_type,
        JSON.stringify(crawlRun.metadata),
        crawlRun.source_job_type,
        crawlRun.source_job_id,
      ]
    );

    return result.rows[0]?.id || null;
  }

  /**
   * Record multiple crawl runs in a batch
   */
  async recordCrawlRunsBatch(
    sourceJobs: SourceJob[],
    sourceJobType: SourceJobType = 'dispensary_crawl_jobs'
  ): Promise<{ created: number; skipped: number; crawlRunIds: Map<number, number> }> {
    let created = 0;
    let skipped = 0;
    const crawlRunIds = new Map<number, number>(); // sourceJobId -> crawlRunId

    for (const job of sourceJobs) {
      const crawlRunId = await this.recordCrawlRun(job, sourceJobType);
      if (crawlRunId) {
        created++;
        crawlRunIds.set(job.id, crawlRunId);
      } else {
        skipped++;
      }
    }

    return { created, skipped, crawlRunIds };
  }

  /**
   * Update snapshots_written count for a crawl run
   */
  async updateSnapshotsWritten(crawlRunId: number, snapshotsWritten: number): Promise<void> {
    await this.pool.query(
      'UPDATE crawl_runs SET snapshots_written = $1 WHERE id = $2',
      [snapshotsWritten, crawlRunId]
    );
  }

  /**
   * Get crawl run ID by source job
   */
  async getCrawlRunIdBySourceJob(
    sourceJobType: SourceJobType,
    sourceJobId: number
  ): Promise<number | null> {
    const result = await this.pool.query(
      'SELECT id FROM crawl_runs WHERE source_job_type = $1 AND source_job_id = $2',
      [sourceJobType, sourceJobId]
    );
    return result.rows[0]?.id || null;
  }

  /**
   * Get unhydrated source jobs (jobs not yet recorded in crawl_runs)
   */
  async getUnhydratedJobs(
    dispensaryId?: number,
    startDate?: Date,
    limit: number = 100
  ): Promise<SourceJob[]> {
    let query = `
      SELECT j.*
      FROM dispensary_crawl_jobs j
      LEFT JOIN crawl_runs cr ON cr.source_job_type = 'dispensary_crawl_jobs' AND cr.source_job_id = j.id
      WHERE cr.id IS NULL
        AND j.status = 'completed'
        AND j.job_type = 'dutchie_product_crawl'
    `;
    const params: any[] = [];
    let paramIndex = 1;

    if (dispensaryId) {
      query += ` AND j.dispensary_id = $${paramIndex++}`;
      params.push(dispensaryId);
    }

    if (startDate) {
      query += ` AND j.completed_at >= $${paramIndex++}`;
      params.push(startDate);
    }

    query += ` ORDER BY j.completed_at ASC LIMIT $${paramIndex}`;
    params.push(limit);

    const result = await this.pool.query(query, params);
    return result.rows;
  }

  /**
   * Get all source jobs for backfill (within date range)
   */
  async getSourceJobsForBackfill(
    startDate?: Date,
    endDate?: Date,
    dispensaryId?: number,
    limit: number = 1000
  ): Promise<SourceJob[]> {
    let query = `
      SELECT *
      FROM dispensary_crawl_jobs
      WHERE status = 'completed'
        AND job_type = 'dutchie_product_crawl'
    `;
    const params: any[] = [];
    let paramIndex = 1;

    if (startDate) {
      query += ` AND completed_at >= $${paramIndex++}`;
      params.push(startDate);
    }

    if (endDate) {
      query += ` AND completed_at <= $${paramIndex++}`;
      params.push(endDate);
    }

    if (dispensaryId) {
      query += ` AND dispensary_id = $${paramIndex++}`;
      params.push(dispensaryId);
    }

    query += ` ORDER BY completed_at ASC LIMIT $${paramIndex}`;
    params.push(limit);

    const result = await this.pool.query(query, params);
    return result.rows;
  }

  private mapStatus(sourceStatus: string): string {
    switch (sourceStatus) {
      case 'completed':
        return 'success';
      case 'failed':
        return 'failed';
      case 'running':
        return 'running';
      default:
        return sourceStatus;
    }
  }
}
560
backend/src/canonical-hydration/hydration-service.ts
Normal file
@@ -0,0 +1,560 @@
/**
 * CanonicalHydrationService
 * Orchestrates the full hydration pipeline from dutchie_* to canonical tables
 */

import { Pool } from 'pg';
import { CrawlRunRecorder } from './crawl-run-recorder';
import { StoreProductNormalizer } from './store-product-normalizer';
import { SnapshotWriter } from './snapshot-writer';
import { HydrationOptions, HydrationResult, ServiceContext, SourceJob } from './types';

export class CanonicalHydrationService {
  private pool: Pool;
  private log: (message: string) => void;
  private crawlRunRecorder: CrawlRunRecorder;
  private productNormalizer: StoreProductNormalizer;
  private snapshotWriter: SnapshotWriter;

  constructor(ctx: ServiceContext) {
    this.pool = ctx.pool;
    this.log = ctx.logger || console.log;
    this.crawlRunRecorder = new CrawlRunRecorder(ctx);
    this.productNormalizer = new StoreProductNormalizer(ctx);
    this.snapshotWriter = new SnapshotWriter(ctx);
  }

  /**
   * Run the full hydration pipeline
   * Supports both backfill (historical) and incremental (ongoing) modes
   */
  async hydrate(options: HydrationOptions): Promise<HydrationResult> {
    const startTime = Date.now();
    const result: HydrationResult = {
      crawlRunsCreated: 0,
      crawlRunsSkipped: 0,
      productsUpserted: 0,
      snapshotsWritten: 0,
      errors: [],
      durationMs: 0,
    };

    this.log(`Starting hydration in ${options.mode} mode`);

    try {
      if (options.mode === 'backfill') {
        await this.runBackfill(options, result);
      } else {
        await this.runIncremental(options, result);
      }
    } catch (err: any) {
      result.errors.push(`Fatal error: ${err.message}`);
      this.log(`Hydration failed: ${err.message}`);
    }

    result.durationMs = Date.now() - startTime;
    this.log(`Hydration completed in ${result.durationMs}ms: ${JSON.stringify({
      crawlRunsCreated: result.crawlRunsCreated,
      crawlRunsSkipped: result.crawlRunsSkipped,
      productsUpserted: result.productsUpserted,
      snapshotsWritten: result.snapshotsWritten,
      errors: result.errors.length,
    })}`);

    return result;
  }

  /**
   * Backfill mode: Process historical data from source tables
   */
  private async runBackfill(options: HydrationOptions, result: HydrationResult): Promise<void> {
    const batchSize = options.batchSize || 50;

    // Get source jobs to process
    const sourceJobs = await this.crawlRunRecorder.getSourceJobsForBackfill(
      options.startDate,
      options.endDate,
      options.dispensaryId,
      1000 // Max jobs to process
    );

    this.log(`Found ${sourceJobs.length} source jobs to backfill`);

    // Group jobs by dispensary for efficient processing
    const jobsByDispensary = this.groupJobsByDispensary(sourceJobs);

    for (const [dispensaryId, jobs] of jobsByDispensary) {
      this.log(`Processing dispensary ${dispensaryId} (${jobs.length} jobs)`);

      try {
        // Step 1: Upsert products for this dispensary
        if (!options.dryRun) {
          const productResult = await this.productNormalizer.upsertProductsForDispensary(dispensaryId);
          result.productsUpserted += productResult.upserted;
          if (productResult.errors.length > 0) {
            result.errors.push(...productResult.errors.map(e => `Dispensary ${dispensaryId}: ${e}`));
          }
        }

        // Get store_product_id map for snapshot writing
        const storeProductIdMap = await this.productNormalizer.getStoreProductIdMap(dispensaryId);

        // Step 2: Record crawl runs and write snapshots for each job
        for (const job of jobs) {
          try {
            await this.processJob(job, storeProductIdMap, result, options.dryRun);
          } catch (err: any) {
            result.errors.push(`Job ${job.id}: ${err.message}`);
          }
        }
      } catch (err: any) {
        result.errors.push(`Dispensary ${dispensaryId}: ${err.message}`);
      }
    }
  }

  /**
   * Incremental mode: Process only unhydrated jobs
   */
  private async runIncremental(options: HydrationOptions, result: HydrationResult): Promise<void> {
    const limit = options.batchSize || 100;

    // Get unhydrated jobs
    const unhydratedJobs = await this.crawlRunRecorder.getUnhydratedJobs(
      options.dispensaryId,
      options.startDate,
      limit
    );

    this.log(`Found ${unhydratedJobs.length} unhydrated jobs`);

    // Group by dispensary
    const jobsByDispensary = this.groupJobsByDispensary(unhydratedJobs);

    for (const [dispensaryId, jobs] of jobsByDispensary) {
      this.log(`Processing dispensary ${dispensaryId} (${jobs.length} jobs)`);

      try {
        // Step 1: Upsert products
        if (!options.dryRun) {
          const productResult = await this.productNormalizer.upsertProductsForDispensary(dispensaryId);
          result.productsUpserted += productResult.upserted;
          if (productResult.errors.length > 0) {
            result.errors.push(...productResult.errors.map(e => `Dispensary ${dispensaryId}: ${e}`));
          }
        }

        // Get store_product_id map
        const storeProductIdMap = await this.productNormalizer.getStoreProductIdMap(dispensaryId);

        // Step 2: Process each job
        for (const job of jobs) {
          try {
            await this.processJob(job, storeProductIdMap, result, options.dryRun);
          } catch (err: any) {
            result.errors.push(`Job ${job.id}: ${err.message}`);
          }
        }
      } catch (err: any) {
        result.errors.push(`Dispensary ${dispensaryId}: ${err.message}`);
      }
    }
  }

  /**
   * Process a single job: record crawl run and write snapshots
   */
  private async processJob(
    job: SourceJob,
    storeProductIdMap: Map<string, number>,
    result: HydrationResult,
    dryRun?: boolean
  ): Promise<void> {
    // Step 1: Record the crawl run
    let crawlRunId: number | null = null;

    if (!dryRun) {
      crawlRunId = await this.crawlRunRecorder.recordCrawlRun(job);
      if (crawlRunId) {
        result.crawlRunsCreated++;
      } else {
        result.crawlRunsSkipped++;
        return; // Skip snapshot writing if crawl run wasn't created
      }
    } else {
      // In dry run, check if it would be created
      const existingId = await this.crawlRunRecorder.getCrawlRunIdBySourceJob(
        'dispensary_crawl_jobs',
        job.id
      );
      if (existingId) {
        result.crawlRunsSkipped++;
        return;
      }
      result.crawlRunsCreated++;
      return; // Skip snapshot writing in dry run
    }

    // Step 2: Write snapshots for this crawl run
    if (crawlRunId && job.completed_at) {
      const snapshotResult = await this.snapshotWriter.writeSnapshotsForCrawlRun(
        crawlRunId,
        job.dispensary_id,
        storeProductIdMap,
        job.completed_at
      );

      result.snapshotsWritten += snapshotResult.written;
      if (snapshotResult.errors.length > 0) {
        result.errors.push(...snapshotResult.errors);
      }

      // Update crawl_run with snapshots_written count
      await this.crawlRunRecorder.updateSnapshotsWritten(crawlRunId, snapshotResult.written);
    }
  }

  /**
   * Hydrate a single dispensary (convenience method)
   */
  async hydrateDispensary(
    dispensaryId: number,
    mode: 'backfill' | 'incremental' = 'incremental'
  ): Promise<HydrationResult> {
    return this.hydrate({
      mode,
      dispensaryId,
    });
  }

  /**
   * Get hydration status for a dispensary
   */
  async getHydrationStatus(dispensaryId: number): Promise<{
    sourceJobs: number;
    hydratedJobs: number;
    unhydratedJobs: number;
    sourceProducts: number;
    storeProducts: number;
    sourceSnapshots: number;
    storeSnapshots: number;
  }> {
    const [sourceJobs, hydratedJobs, sourceProducts, storeProducts, sourceSnapshots, storeSnapshots] =
      await Promise.all([
        this.pool.query(
          `SELECT COUNT(*) FROM dispensary_crawl_jobs
           WHERE dispensary_id = $1 AND status = 'completed' AND job_type = 'dutchie_product_crawl'`,
          [dispensaryId]
        ),
        this.pool.query(
          `SELECT COUNT(*) FROM crawl_runs
           WHERE dispensary_id = $1 AND source_job_type = 'dispensary_crawl_jobs'`,
          [dispensaryId]
        ),
        this.pool.query(
          `SELECT COUNT(*) FROM dutchie_products WHERE dispensary_id = $1`,
          [dispensaryId]
        ),
        this.pool.query(
          `SELECT COUNT(*) FROM store_products WHERE dispensary_id = $1 AND provider = 'dutchie'`,
          [dispensaryId]
        ),
        this.pool.query(
          `SELECT COUNT(*) FROM dutchie_product_snapshots WHERE dispensary_id = $1`,
          [dispensaryId]
        ),
        this.pool.query(
          `SELECT COUNT(*) FROM store_product_snapshots WHERE dispensary_id = $1`,
          [dispensaryId]
        ),
      ]);

    const sourceJobCount = parseInt(sourceJobs.rows[0].count);
    const hydratedJobCount = parseInt(hydratedJobs.rows[0].count);

    return {
      sourceJobs: sourceJobCount,
      hydratedJobs: hydratedJobCount,
      unhydratedJobs: sourceJobCount - hydratedJobCount,
      sourceProducts: parseInt(sourceProducts.rows[0].count),
      storeProducts: parseInt(storeProducts.rows[0].count),
      sourceSnapshots: parseInt(sourceSnapshots.rows[0].count),
      storeSnapshots: parseInt(storeSnapshots.rows[0].count),
    };
  }

  /**
   * Get overall hydration status
   */
  async getOverallStatus(): Promise<{
    totalSourceJobs: number;
    totalHydratedJobs: number;
    totalSourceProducts: number;
    totalStoreProducts: number;
    totalSourceSnapshots: number;
    totalStoreSnapshots: number;
    dispensariesWithData: number;
  }> {
    const [sourceJobs, hydratedJobs, sourceProducts, storeProducts, sourceSnapshots, storeSnapshots, dispensaries] =
      await Promise.all([
        this.pool.query(
          `SELECT COUNT(*) FROM dispensary_crawl_jobs
           WHERE status = 'completed' AND job_type = 'dutchie_product_crawl'`
        ),
        this.pool.query(
          `SELECT COUNT(*) FROM crawl_runs WHERE source_job_type = 'dispensary_crawl_jobs'`
        ),
        this.pool.query(`SELECT COUNT(*) FROM dutchie_products`),
        this.pool.query(`SELECT COUNT(*) FROM store_products WHERE provider = 'dutchie'`),
        this.pool.query(`SELECT COUNT(*) FROM dutchie_product_snapshots`),
        this.pool.query(`SELECT COUNT(*) FROM store_product_snapshots`),
        this.pool.query(
          `SELECT COUNT(DISTINCT dispensary_id) FROM dutchie_products`
        ),
      ]);

    return {
      totalSourceJobs: parseInt(sourceJobs.rows[0].count),
      totalHydratedJobs: parseInt(hydratedJobs.rows[0].count),
      totalSourceProducts: parseInt(sourceProducts.rows[0].count),
      totalStoreProducts: parseInt(storeProducts.rows[0].count),
      totalSourceSnapshots: parseInt(sourceSnapshots.rows[0].count),
      totalStoreSnapshots: parseInt(storeSnapshots.rows[0].count),
      dispensariesWithData: parseInt(dispensaries.rows[0].count),
    };
  }

  /**
   * Group jobs by dispensary ID
   */
  private groupJobsByDispensary(jobs: SourceJob[]): Map<number, SourceJob[]> {
    const map = new Map<number, SourceJob[]>();
    for (const job of jobs) {
      const list = map.get(job.dispensary_id) || [];
      list.push(job);
      map.set(job.dispensary_id, list);
    }
    return map;
  }

  /**
   * Products-only hydration mode
   * Used when there are no historical job records - creates synthetic crawl runs
   * from current product data
   */
  async hydrateProductsOnly(options: {
    dispensaryId?: number;
    dryRun?: boolean;
  } = {}): Promise<HydrationResult> {
    const startTime = Date.now();
    const result: HydrationResult = {
      crawlRunsCreated: 0,
      crawlRunsSkipped: 0,
      productsUpserted: 0,
      snapshotsWritten: 0,
      errors: [],
      durationMs: 0,
    };

    this.log('Starting products-only hydration mode');

    try {
      // Get all dispensaries with products
      let dispensaryIds: number[];
      if (options.dispensaryId) {
        dispensaryIds = [options.dispensaryId];
      } else {
        const dispResult = await this.pool.query(
          'SELECT DISTINCT dispensary_id FROM dutchie_products ORDER BY dispensary_id'
        );
        dispensaryIds = dispResult.rows.map(r => r.dispensary_id);
      }

      this.log(`Processing ${dispensaryIds.length} dispensaries`);

      for (const dispensaryId of dispensaryIds) {
        try {
          await this.hydrateDispensaryProductsOnly(dispensaryId, result, options.dryRun);
        } catch (err: any) {
          result.errors.push(`Dispensary ${dispensaryId}: ${err.message}`);
        }
      }
    } catch (err: any) {
      result.errors.push(`Fatal error: ${err.message}`);
    }

    result.durationMs = Date.now() - startTime;
    this.log(`Products-only hydration completed in ${result.durationMs}ms: ${JSON.stringify({
      crawlRunsCreated: result.crawlRunsCreated,
      productsUpserted: result.productsUpserted,
      snapshotsWritten: result.snapshotsWritten,
      errors: result.errors.length,
    })}`);

    return result;
  }

  /**
   * Hydrate a single dispensary in products-only mode
   */
  private async hydrateDispensaryProductsOnly(
    dispensaryId: number,
    result: HydrationResult,
    dryRun?: boolean
  ): Promise<void> {
    // Get product count and timestamps for this dispensary
    const statsResult = await this.pool.query(
      `SELECT COUNT(*) as cnt, MIN(created_at) as min_date, MAX(updated_at) as max_date
       FROM dutchie_products WHERE dispensary_id = $1`,
      [dispensaryId]
    );
    const stats = statsResult.rows[0];
    const productCount = parseInt(stats.cnt);

    if (productCount === 0) {
      this.log(`Dispensary ${dispensaryId}: No products, skipping`);
      return;
    }

    this.log(`Dispensary ${dispensaryId}: ${productCount} products`);

    // Step 1: Create synthetic crawl run
    let crawlRunId: number | null = null;
    const now = new Date();

    if (!dryRun) {
      // Check if we already have a synthetic run for this dispensary
      const existingRun = await this.pool.query(
        `SELECT id FROM crawl_runs
         WHERE dispensary_id = $1
           AND source_job_type = 'products_only_hydration'
         LIMIT 1`,
        [dispensaryId]
      );

      if (existingRun.rows.length > 0) {
        crawlRunId = existingRun.rows[0].id;
        this.log(`Dispensary ${dispensaryId}: Using existing synthetic crawl run ${crawlRunId}`);
        result.crawlRunsSkipped++;
      } else {
        // Create new synthetic crawl run
        const insertResult = await this.pool.query(
          `INSERT INTO crawl_runs (
            dispensary_id, provider, started_at, finished_at, duration_ms,
            status, products_found, trigger_type, metadata,
            source_job_type, source_job_id
          ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)
          RETURNING id`,
          [
            dispensaryId,
            'dutchie',
            stats.min_date || now,
            stats.max_date || now,
            0,
            'success',
            productCount,
            'hydration',
            JSON.stringify({ mode: 'products_only', hydratedAt: now.toISOString() }),
            'products_only_hydration',
            dispensaryId, // Use dispensary_id as synthetic job_id
          ]
        );
        crawlRunId = insertResult.rows[0].id;
        result.crawlRunsCreated++;
        this.log(`Dispensary ${dispensaryId}: Created synthetic crawl run ${crawlRunId}`);
      }

      // Step 2: Upsert products
      const productResult = await this.productNormalizer.upsertProductsForDispensary(dispensaryId);
      result.productsUpserted += productResult.upserted;
      if (productResult.errors.length > 0) {
        result.errors.push(...productResult.errors.map(e => `Dispensary ${dispensaryId}: ${e}`));
      }

      // Step 3: Create initial snapshots from current product state
      const snapshotsWritten = await this.createInitialSnapshots(dispensaryId, crawlRunId);
      result.snapshotsWritten += snapshotsWritten;

      // Update crawl run with snapshot count
      await this.pool.query(
        'UPDATE crawl_runs SET snapshots_written = $1 WHERE id = $2',
        [snapshotsWritten, crawlRunId]
      );
    } else {
      // Dry run - just count what would be done
      result.crawlRunsCreated++;
      result.productsUpserted += productCount;
      result.snapshotsWritten += productCount;
    }
  }

  /**
   * Create initial snapshots from current product state
   */
  private async createInitialSnapshots(
    dispensaryId: number,
    crawlRunId: number
  ): Promise<number> {
    // Get all store products for this dispensary
    const products = await this.pool.query(
      `SELECT sp.id, sp.price_rec, sp.price_med, sp.is_on_special, sp.is_in_stock,
              sp.stock_quantity, sp.thc_percent, sp.cbd_percent
       FROM store_products sp
       WHERE sp.dispensary_id = $1 AND sp.provider = 'dutchie'`,
      [dispensaryId]
    );

    if (products.rows.length === 0) return 0;

    const now = new Date();
    const batchSize = 100;
    let totalInserted = 0;

    // Process in batches
    for (let i = 0; i < products.rows.length; i += batchSize) {
      const batch = products.rows.slice(i, i + batchSize);
      const values: any[] = [];
      const placeholders: string[] = [];
      let paramIndex = 1;

      for (const product of batch) {
        values.push(
          dispensaryId,
          product.id,
          crawlRunId,
          now,
          product.price_rec,
          product.price_med,
          product.is_on_special || false,
          product.is_in_stock || false,
          product.stock_quantity,
          product.thc_percent,
          product.cbd_percent,
          JSON.stringify({ source: 'initial_hydration' })
        );

        const rowPlaceholders = [];
        for (let j = 0; j < 12; j++) {
          rowPlaceholders.push(`$${paramIndex++}`);
        }
        placeholders.push(`(${rowPlaceholders.join(', ')}, NOW())`);
      }

      const query = `
        INSERT INTO store_product_snapshots (
          dispensary_id, store_product_id, crawl_run_id, captured_at,
          price_rec, price_med, is_on_special, is_in_stock, stock_quantity,
          thc_percent, cbd_percent, raw_data, created_at
        ) VALUES ${placeholders.join(', ')}
        ON CONFLICT (store_product_id, crawl_run_id)
        WHERE store_product_id IS NOT NULL AND crawl_run_id IS NOT NULL
        DO NOTHING
      `;

      const result = await this.pool.query(query, values);
      totalInserted += result.rowCount || 0;
    }

    return totalInserted;
  }
}
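A minimal usage sketch for the service, assuming a configured pg Pool (the wrapper function and dispensary ID are hypothetical; the service API matches the class above):

import { Pool } from 'pg';
import { CanonicalHydrationService } from './canonical-hydration';

async function runHydrationOnce(): Promise<void> {
  const pool = new Pool({ connectionString: process.env.CANNAIQ_DB_URL });
  const service = new CanonicalHydrationService({ pool, logger: console.log });

  // Preview a historical backfill for one dispensary first (hypothetical ID)...
  const preview = await service.hydrate({ mode: 'backfill', dispensaryId: 42, dryRun: true });
  console.log(preview);

  // ...then hydrate whatever is still unprocessed.
  await service.hydrate({ mode: 'incremental', batchSize: 100 });
  await pool.end();
}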
13
backend/src/canonical-hydration/index.ts
Normal file
@@ -0,0 +1,13 @@
/**
 * Canonical Hydration Module
 * Phase 2: Hydration Pipeline from dutchie_* to store_products/store_product_snapshots/crawl_runs
 */

// Types
export * from './types';

// Services
export { CrawlRunRecorder } from './crawl-run-recorder';
export { StoreProductNormalizer } from './store-product-normalizer';
export { SnapshotWriter } from './snapshot-writer';
export { CanonicalHydrationService } from './hydration-service';
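With the barrel file in place, consumers can pull services and types from a single path, e.g.:

// One import path for the whole module:
import {
  CanonicalHydrationService,
  SnapshotWriter,
  HydrationOptions,
} from './canonical-hydration';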
303
backend/src/canonical-hydration/snapshot-writer.ts
Normal file
@@ -0,0 +1,303 @@
/**
 * SnapshotWriter
 * Inserts store_product_snapshots from dutchie_product_snapshots source table
 */

import { Pool } from 'pg';
import { SourceSnapshot, StoreProductSnapshot, ServiceContext } from './types';

export class SnapshotWriter {
  private pool: Pool;
  private log: (message: string) => void;
  private batchSize: number;

  constructor(ctx: ServiceContext, batchSize: number = 100) {
    this.pool = ctx.pool;
    this.log = ctx.logger || console.log;
    this.batchSize = batchSize;
  }

  /**
   * Write snapshots for a crawl run
   * Reads from dutchie_product_snapshots and inserts to store_product_snapshots
   */
  async writeSnapshotsForCrawlRun(
    crawlRunId: number,
    dispensaryId: number,
    storeProductIdMap: Map<string, number>,
    crawledAt: Date
  ): Promise<{ written: number; skipped: number; errors: string[] }> {
    const errors: string[] = [];
    let written = 0;
    let skipped = 0;

    // Get source snapshots for this dispensary at this crawl time
    const sourceSnapshots = await this.getSourceSnapshots(dispensaryId, crawledAt);
    this.log(`Found ${sourceSnapshots.length} source snapshots for dispensary ${dispensaryId} at ${crawledAt.toISOString()}`);

    // Process in batches
    for (let i = 0; i < sourceSnapshots.length; i += this.batchSize) {
      const batch = sourceSnapshots.slice(i, i + this.batchSize);
      try {
        const { batchWritten, batchSkipped } = await this.writeBatch(
          batch,
          crawlRunId,
          storeProductIdMap
        );
        written += batchWritten;
        skipped += batchSkipped;
      } catch (err: any) {
        errors.push(`Batch ${i / this.batchSize}: ${err.message}`);
      }
    }

    return { written, skipped, errors };
  }

  /**
   * Write a single snapshot
   */
  async writeSnapshot(
    source: SourceSnapshot,
    crawlRunId: number,
    storeProductId: number
  ): Promise<number | null> {
    const normalized = this.normalizeSnapshot(source, crawlRunId, storeProductId);

    const result = await this.pool.query(
      `INSERT INTO store_product_snapshots (
        dispensary_id, store_product_id, crawl_run_id, captured_at,
        price_rec, price_med, is_on_special, is_in_stock, stock_quantity,
        thc_percent, cbd_percent, raw_data, created_at
      ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, NOW())
      ON CONFLICT (store_product_id, crawl_run_id)
      WHERE store_product_id IS NOT NULL AND crawl_run_id IS NOT NULL
      DO UPDATE SET
        price_rec = EXCLUDED.price_rec,
        price_med = EXCLUDED.price_med,
        is_on_special = EXCLUDED.is_on_special,
        is_in_stock = EXCLUDED.is_in_stock,
        stock_quantity = EXCLUDED.stock_quantity,
        thc_percent = EXCLUDED.thc_percent,
        cbd_percent = EXCLUDED.cbd_percent,
        raw_data = EXCLUDED.raw_data
      RETURNING id`,
      [
        normalized.dispensary_id,
        normalized.store_product_id,
        normalized.crawl_run_id,
        normalized.captured_at,
        normalized.price_rec,
        normalized.price_med,
        normalized.is_on_special,
        normalized.is_in_stock,
        normalized.stock_quantity,
        normalized.thc_percent,
        normalized.cbd_percent,
        JSON.stringify(normalized.raw_data),
      ]
    );

    return result.rows[0]?.id || null;
  }

  /**
   * Write a batch of snapshots
   */
  async writeBatch(
    sourceSnapshots: SourceSnapshot[],
    crawlRunId: number,
    storeProductIdMap: Map<string, number>
  ): Promise<{ batchWritten: number; batchSkipped: number }> {
    if (sourceSnapshots.length === 0) return { batchWritten: 0, batchSkipped: 0 };

    const values: any[] = [];
    const placeholders: string[] = [];
    let paramIndex = 1;
    let skipped = 0;

    for (const source of sourceSnapshots) {
      // Look up store_product_id
      const storeProductId = storeProductIdMap.get(source.external_product_id);
      if (!storeProductId) {
        skipped++;
        continue;
      }

      const normalized = this.normalizeSnapshot(source, crawlRunId, storeProductId);

      values.push(
        normalized.dispensary_id,
        normalized.store_product_id,
        normalized.crawl_run_id,
        normalized.captured_at,
        normalized.price_rec,
        normalized.price_med,
        normalized.is_on_special,
        normalized.is_in_stock,
        normalized.stock_quantity,
        normalized.thc_percent,
        normalized.cbd_percent,
        JSON.stringify(normalized.raw_data)
      );

      const rowPlaceholders = [];
      for (let j = 0; j < 12; j++) {
        rowPlaceholders.push(`$${paramIndex++}`);
      }
      placeholders.push(`(${rowPlaceholders.join(', ')}, NOW())`);
    }

    if (placeholders.length === 0) {
      return { batchWritten: 0, batchSkipped: skipped };
    }

    const query = `
      INSERT INTO store_product_snapshots (
        dispensary_id, store_product_id, crawl_run_id, captured_at,
        price_rec, price_med, is_on_special, is_in_stock, stock_quantity,
        thc_percent, cbd_percent, raw_data, created_at
      ) VALUES ${placeholders.join(', ')}
      ON CONFLICT (store_product_id, crawl_run_id)
      WHERE store_product_id IS NOT NULL AND crawl_run_id IS NOT NULL
      DO UPDATE SET
        price_rec = EXCLUDED.price_rec,
        price_med = EXCLUDED.price_med,
        is_on_special = EXCLUDED.is_on_special,
        is_in_stock = EXCLUDED.is_in_stock,
        stock_quantity = EXCLUDED.stock_quantity,
        thc_percent = EXCLUDED.thc_percent,
        cbd_percent = EXCLUDED.cbd_percent,
        raw_data = EXCLUDED.raw_data
    `;

    const result = await this.pool.query(query, values);
    return { batchWritten: result.rowCount || 0, batchSkipped: skipped };
  }

  /**
   * Get source snapshots from dutchie_product_snapshots for a specific crawl time
   * Groups snapshots by crawled_at time (within a 5-minute window)
   */
  async getSourceSnapshots(
    dispensaryId: number,
    crawledAt: Date
  ): Promise<SourceSnapshot[]> {
    // Find snapshots within 5 minutes of the target time
    const windowMinutes = 5;
    const result = await this.pool.query(
      `SELECT * FROM dutchie_product_snapshots
       WHERE dispensary_id = $1
         AND crawled_at >= $2 - INTERVAL '${windowMinutes} minutes'
         AND crawled_at <= $2 + INTERVAL '${windowMinutes} minutes'
       ORDER BY crawled_at ASC`,
      [dispensaryId, crawledAt]
    );
    return result.rows;
  }

  /**
   * Get distinct crawl times from dutchie_product_snapshots for a dispensary
   * Used for backfill to identify each crawl run
   */
  async getDistinctCrawlTimes(
    dispensaryId: number,
    startDate?: Date,
    endDate?: Date
  ): Promise<Date[]> {
    let query = `
      SELECT DISTINCT date_trunc('minute', crawled_at) as crawl_time
      FROM dutchie_product_snapshots
      WHERE dispensary_id = $1
    `;
    const params: any[] = [dispensaryId];
    let paramIndex = 2;

    if (startDate) {
      query += ` AND crawled_at >= $${paramIndex++}`;
      params.push(startDate);
    }

    if (endDate) {
      query += ` AND crawled_at <= $${paramIndex++}`;
      params.push(endDate);
    }

    query += ' ORDER BY crawl_time ASC';

    const result = await this.pool.query(query, params);
    return result.rows.map(row => new Date(row.crawl_time));
  }

  /**
   * Check if snapshots already exist for a crawl run
   */
  async snapshotsExistForCrawlRun(crawlRunId: number): Promise<boolean> {
    const result = await this.pool.query(
      'SELECT 1 FROM store_product_snapshots WHERE crawl_run_id = $1 LIMIT 1',
      [crawlRunId]
    );
    return result.rows.length > 0;
  }

  /**
   * Normalize a source snapshot to store_product_snapshot format
   */
  private normalizeSnapshot(
    source: SourceSnapshot,
    crawlRunId: number,
    storeProductId: number
  ): StoreProductSnapshot {
    // Convert cents to dollars
    const priceRec = source.rec_min_price_cents !== null
      ? source.rec_min_price_cents / 100
      : null;
    const priceMed = source.med_min_price_cents !== null
      ? source.med_min_price_cents / 100
      : null;

    // Determine stock status
    const isInStock = this.isSnapshotInStock(source.stock_status, source.total_quantity_available);

    return {
      dispensary_id: source.dispensary_id,
      store_product_id: storeProductId,
      crawl_run_id: crawlRunId,
      captured_at: source.crawled_at,
      price_rec: priceRec,
      price_med: priceMed,
      is_on_special: false, // Source doesn't have special flag
      is_in_stock: isInStock,
      stock_quantity: source.total_quantity_available,
      thc_percent: null, // Not in snapshot, would need to join with product
      cbd_percent: null, // Not in snapshot, would need to join with product
      raw_data: {
        source_id: source.id,
        status: source.status,
        rec_min_price_cents: source.rec_min_price_cents,
        rec_max_price_cents: source.rec_max_price_cents,
        med_min_price_cents: source.med_min_price_cents,
        med_max_price_cents: source.med_max_price_cents,
      },
    };
  }

  /**
   * Determine if snapshot is in stock
   */
  private isSnapshotInStock(stockStatus: string | null, quantity: number | null): boolean {
    if (quantity !== null && quantity > 0) return true;

    if (stockStatus) {
      const status = stockStatus.toLowerCase();
      if (status === 'in_stock' || status === 'instock' || status === 'available') {
        return true;
      }
      if (status === 'out_of_stock' || status === 'outofstock' || status === 'unavailable') {
        return false;
      }
    }

    return false;
  }
}
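One schema note: `ON CONFLICT (store_product_id, crawl_run_id) WHERE ...` only resolves against a matching partial unique index on store_product_snapshots. That migration is not shown in this file; the index the upserts presuppose would look roughly like this (index name hypothetical):

// Presupposed by the ON CONFLICT targets above (index name hypothetical;
// the actual migration is not part of this file).
const snapshotConflictIndexSql = `
  CREATE UNIQUE INDEX IF NOT EXISTS store_product_snapshots_product_run_uniq
  ON store_product_snapshots (store_product_id, crawl_run_id)
  WHERE store_product_id IS NOT NULL AND crawl_run_id IS NOT NULL
`;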
322
backend/src/canonical-hydration/store-product-normalizer.ts
Normal file
@@ -0,0 +1,322 @@
/**
 * StoreProductNormalizer
 * Upserts store_products from dutchie_products source table
 */

import { Pool } from 'pg';
import { SourceProduct, StoreProduct, ServiceContext } from './types';

export class StoreProductNormalizer {
  private pool: Pool;
  private log: (message: string) => void;
  private batchSize: number;

  constructor(ctx: ServiceContext, batchSize: number = 100) {
    this.pool = ctx.pool;
    this.log = ctx.logger || console.log;
    this.batchSize = batchSize;
  }

  /**
   * Upsert products for a specific dispensary
   * Reads from dutchie_products and upserts to store_products
   */
  async upsertProductsForDispensary(dispensaryId: number): Promise<{ upserted: number; errors: string[] }> {
    const errors: string[] = [];
    let upserted = 0;

    // Get all products for this dispensary from source
    const sourceProducts = await this.getSourceProducts(dispensaryId);
    this.log(`Found ${sourceProducts.length} source products for dispensary ${dispensaryId}`);

    // Process in batches to avoid memory issues
    for (let i = 0; i < sourceProducts.length; i += this.batchSize) {
      const batch = sourceProducts.slice(i, i + this.batchSize);
      try {
        const batchUpserted = await this.upsertBatch(batch);
        upserted += batchUpserted;
      } catch (err: any) {
        errors.push(`Batch ${i / this.batchSize}: ${err.message}`);
      }
    }

    return { upserted, errors };
  }

  /**
   * Upsert a single product
   */
  async upsertProduct(source: SourceProduct): Promise<number | null> {
    const normalized = this.normalizeProduct(source);

    const result = await this.pool.query(
      `INSERT INTO store_products (
        dispensary_id, brand_id, provider, provider_product_id,
        name_raw, brand_name_raw, category_raw,
        price_rec, price_med, is_on_special, is_in_stock, stock_quantity,
        thc_percent, cbd_percent, image_url,
        first_seen_at, last_seen_at, created_at, updated_at
      ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, NOW(), NOW())
      ON CONFLICT (dispensary_id, provider, provider_product_id)
      DO UPDATE SET
        name_raw = EXCLUDED.name_raw,
        brand_name_raw = EXCLUDED.brand_name_raw,
        category_raw = EXCLUDED.category_raw,
        price_rec = EXCLUDED.price_rec,
        price_med = EXCLUDED.price_med,
        is_on_special = EXCLUDED.is_on_special,
        is_in_stock = EXCLUDED.is_in_stock,
        stock_quantity = EXCLUDED.stock_quantity,
        thc_percent = EXCLUDED.thc_percent,
        cbd_percent = EXCLUDED.cbd_percent,
        image_url = COALESCE(EXCLUDED.image_url, store_products.image_url),
        last_seen_at = EXCLUDED.last_seen_at,
        updated_at = NOW()
      RETURNING id`,
      [
        normalized.dispensary_id,
        normalized.brand_id,
        normalized.provider,
        normalized.provider_product_id,
        normalized.name_raw,
        normalized.brand_name_raw,
        normalized.category_raw,
        normalized.price_rec,
        normalized.price_med,
        normalized.is_on_special,
        normalized.is_in_stock,
        normalized.stock_quantity,
        normalized.thc_percent,
        normalized.cbd_percent,
        normalized.image_url,
        normalized.first_seen_at,
        normalized.last_seen_at,
      ]
    );

    return result.rows[0]?.id || null;
  }

  /**
   * Upsert a batch of products
   */
  async upsertBatch(sourceProducts: SourceProduct[]): Promise<number> {
    if (sourceProducts.length === 0) return 0;

    // Build multi-row INSERT with ON CONFLICT
    const values: any[] = [];
    const placeholders: string[] = [];
    let paramIndex = 1;

    for (const source of sourceProducts) {
      const normalized = this.normalizeProduct(source);
      values.push(
        normalized.dispensary_id,
        normalized.brand_id,
        normalized.provider,
        normalized.provider_product_id,
        normalized.name_raw,
        normalized.brand_name_raw,
        normalized.category_raw,
        normalized.price_rec,
        normalized.price_med,
        normalized.is_on_special,
        normalized.is_in_stock,
        normalized.stock_quantity,
        normalized.thc_percent,
        normalized.cbd_percent,
        normalized.image_url,
        normalized.first_seen_at,
        normalized.last_seen_at
      );

      const rowPlaceholders = [];
      for (let j = 0; j < 17; j++) {
        rowPlaceholders.push(`$${paramIndex++}`);
      }
      placeholders.push(`(${rowPlaceholders.join(', ')}, NOW(), NOW())`);
    }

    const query = `
      INSERT INTO store_products (
        dispensary_id, brand_id, provider, provider_product_id,
        name_raw, brand_name_raw, category_raw,
        price_rec, price_med, is_on_special, is_in_stock, stock_quantity,
        thc_percent, cbd_percent, image_url,
        first_seen_at, last_seen_at, created_at, updated_at
      ) VALUES ${placeholders.join(', ')}
      ON CONFLICT (dispensary_id, provider, provider_product_id)
      DO UPDATE SET
        name_raw = EXCLUDED.name_raw,
        brand_name_raw = EXCLUDED.brand_name_raw,
        category_raw = EXCLUDED.category_raw,
        price_rec = EXCLUDED.price_rec,
        price_med = EXCLUDED.price_med,
        is_on_special = EXCLUDED.is_on_special,
        is_in_stock = EXCLUDED.is_in_stock,
        stock_quantity = EXCLUDED.stock_quantity,
        thc_percent = EXCLUDED.thc_percent,
        cbd_percent = EXCLUDED.cbd_percent,
        image_url = COALESCE(EXCLUDED.image_url, store_products.image_url),
        last_seen_at = EXCLUDED.last_seen_at,
        updated_at = NOW()
    `;

    const result = await this.pool.query(query, values);
    return result.rowCount || 0;
  }

  /**
   * Get store_product ID by canonical key
   */
  async getStoreProductId(
    dispensaryId: number,
    provider: string,
    providerProductId: string
  ): Promise<number | null> {
    const result = await this.pool.query(
      'SELECT id FROM store_products WHERE dispensary_id = $1 AND provider = $2 AND provider_product_id = $3',
      [dispensaryId, provider, providerProductId]
    );
    return result.rows[0]?.id || null;
  }

  /**
   * Get all store_product IDs for a dispensary (for snapshot writing)
   */
  async getStoreProductIdMap(dispensaryId: number): Promise<Map<string, number>> {
    const result = await this.pool.query(
      'SELECT id, provider_product_id FROM store_products WHERE dispensary_id = $1',
      [dispensaryId]
    );

    const map = new Map<string, number>();
    for (const row of result.rows) {
      map.set(row.provider_product_id, row.id);
    }
    return map;
  }

  /**
   * Get source products from dutchie_products
   */
  private async getSourceProducts(dispensaryId: number): Promise<SourceProduct[]> {
    const result = await this.pool.query(
      `SELECT * FROM dutchie_products WHERE dispensary_id = $1`,
      [dispensaryId]
    );
    return result.rows;
  }

  /**
   * Normalize a source product to store_product format
   */
  private normalizeProduct(source: SourceProduct): StoreProduct {
    // Extract price from JSONB if present
    const priceRec = this.extractPrice(source.price_rec);
    const priceMed = this.extractPrice(source.price_med);

    // Parse THC/CBD percentages
    const thcPercent = this.parsePercentage(source.thc);
    const cbdPercent = this.parsePercentage(source.cbd);

    // Determine stock status
    const isInStock = this.isProductInStock(source.stock_status, source.total_quantity_available);

    return {
      dispensary_id: source.dispensary_id,
      brand_id: null, // Source has UUID strings, target expects integer - set to null for now
      provider: source.platform || 'dutchie',
      provider_product_id: source.external_product_id,
      name_raw: source.name,
      brand_name_raw: source.brand_name,
      category_raw: source.type || source.subcategory,
      price_rec: priceRec,
      price_med: priceMed,
      is_on_special: false, // Dutchie doesn't have a direct special flag, would need to check specials table
      is_in_stock: isInStock,
      stock_quantity: source.total_quantity_available,
      thc_percent: thcPercent,
      cbd_percent: cbdPercent,
      image_url: source.primary_image_url,
      first_seen_at: source.created_at,
      last_seen_at: source.updated_at,
    };
  }

  /**
   * Extract price from JSONB price field
   * Handles formats like: {min: 10, max: 20}, {value: 15}, or just a number
   */
  private extractPrice(priceData: any): number | null {
    if (priceData === null || priceData === undefined) return null;

    // If it's already a number
    if (typeof priceData === 'number') return priceData;

    // If it's a string that looks like a number
    if (typeof priceData === 'string') {
      const parsed = parseFloat(priceData);
      return isNaN(parsed) ? null : parsed;
    }

    // If it's an object with price data
    if (typeof priceData === 'object') {
      // Try common price formats
      if (priceData.min !== undefined && priceData.min !== null) {
        return typeof priceData.min === 'number' ? priceData.min : parseFloat(priceData.min);
      }
      if (priceData.value !== undefined && priceData.value !== null) {
        return typeof priceData.value === 'number' ? priceData.value : parseFloat(priceData.value);
      }
      if (priceData.price !== undefined && priceData.price !== null) {
        return typeof priceData.price === 'number' ? priceData.price : parseFloat(priceData.price);
      }
      // Check for array of variants
      if (Array.isArray(priceData) && priceData.length > 0) {
        const firstVariant = priceData[0];
        if (firstVariant.price !== undefined) {
          return typeof firstVariant.price === 'number' ? firstVariant.price : parseFloat(firstVariant.price);
        }
      }
    }

    return null;
  }

  /**
   * Parse percentage string to number
   * Handles formats like: "25.5%", "25.5", "25.5 %", etc.
   */
  private parsePercentage(value: string | null | undefined): number | null {
    if (value === null || value === undefined) return null;

    // Remove percentage sign and whitespace
    const cleaned = value.toString().replace(/%/g, '').trim();

    const parsed = parseFloat(cleaned);
    return isNaN(parsed) ? null : parsed;
  }

  /**
   * Determine if product is in stock based on status and quantity
   */
  private isProductInStock(stockStatus: string | null, quantity: number | null): boolean {
    // Check quantity first
    if (quantity !== null && quantity > 0) return true;

    // Check status string
    if (stockStatus) {
      const status = stockStatus.toLowerCase();
      if (status === 'in_stock' || status === 'instock' || status === 'available') {
        return true;
      }
      if (status === 'out_of_stock' || status === 'outofstock' || status === 'unavailable') {
        return false;
      }
    }

    // Default to false if unknown
    return false;
  }
}
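For reference, how extractPrice resolves the shapes it checks, in order (inputs illustrative, not from this commit):

// extractPrice(25)                              -> 25
// extractPrice('19.99')                         -> 19.99
// extractPrice({ min: 10, max: 20 })            -> 10   (min takes priority)
// extractPrice({ value: 15 })                   -> 15
// extractPrice({ price: 8.5 })                  -> 8.5
// extractPrice([{ price: 12 }, { price: 30 }])  -> 12   (first variant)
// extractPrice('n/a')                           -> null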
150
backend/src/canonical-hydration/types.ts
Normal file
@@ -0,0 +1,150 @@
/**
 * Canonical Hydration Types
 * Phase 2: Hydration Pipeline from dutchie_* to store_products/store_product_snapshots/crawl_runs
 */

import { Pool } from 'pg';

// Source job types for hydration
export type SourceJobType = 'dispensary_crawl_jobs' | 'crawl_jobs' | 'job_run_logs';

// Source job record (from dispensary_crawl_jobs)
export interface SourceJob {
  id: number;
  dispensary_id: number;
  job_type: string;
  status: string;
  started_at: Date | null;
  completed_at: Date | null;
  duration_ms: number | null;
  products_found: number | null;
  products_new: number | null;
  products_updated: number | null;
  error_message: string | null;
}

// Source product record (from dutchie_products)
export interface SourceProduct {
  id: number;
  dispensary_id: number;
  platform: string;
  external_product_id: string;
  name: string;
  brand_name: string | null;
  brand_id: number | null;
  type: string | null;
  subcategory: string | null;
  strain_type: string | null;
  thc: string | null;
  cbd: string | null;
  price_rec: any; // JSONB
  price_med: any; // JSONB
  stock_status: string | null;
  total_quantity_available: number | null;
  primary_image_url: string | null;
  created_at: Date;
  updated_at: Date;
}

// Source snapshot record (from dutchie_product_snapshots)
export interface SourceSnapshot {
  id: number;
  dutchie_product_id: number;
  dispensary_id: number;
  external_product_id: string;
  status: string | null;
  rec_min_price_cents: number | null;
  rec_max_price_cents: number | null;
  med_min_price_cents: number | null;
  med_max_price_cents: number | null;
  stock_status: string | null;
  total_quantity_available: number | null;
  crawled_at: Date;
  created_at: Date;
}

// Crawl run record for canonical table
export interface CrawlRun {
  id?: number;
  dispensary_id: number;
  provider: string;
  started_at: Date;
  finished_at: Date | null;
  duration_ms: number | null;
  status: string;
  error_message: string | null;
  products_found: number | null;
  products_new: number | null;
  products_updated: number | null;
  snapshots_written: number | null;
  worker_id: string | null;
  trigger_type: string | null;
  metadata: any;
  source_job_type: SourceJobType;
  source_job_id: number;
}

// Store product record for canonical table
export interface StoreProduct {
  id?: number;
  dispensary_id: number;
  brand_id: number | null;
  provider: string;
  provider_product_id: string;
  name_raw: string;
  brand_name_raw: string | null;
  category_raw: string | null;
  price_rec: number | null;
  price_med: number | null;
  is_on_special: boolean;
  is_in_stock: boolean;
  stock_quantity: number | null;
  thc_percent: number | null;
  cbd_percent: number | null;
  image_url: string | null;
  first_seen_at: Date;
  last_seen_at: Date;
}

// Store product snapshot record for canonical table
export interface StoreProductSnapshot {
  id?: number;
  dispensary_id: number;
  store_product_id: number;
  crawl_run_id: number;
  captured_at: Date;
  price_rec: number | null;
  price_med: number | null;
  is_on_special: boolean;
  is_in_stock: boolean;
  stock_quantity: number | null;
  thc_percent: number | null;
  cbd_percent: number | null;
  raw_data: any;
}

// Hydration options
export interface HydrationOptions {
  mode: 'backfill' | 'incremental';
  dispensaryId?: number;
  startDate?: Date;
  endDate?: Date;
  batchSize?: number;
  dryRun?: boolean;
}

// Hydration result
export interface HydrationResult {
  crawlRunsCreated: number;
  crawlRunsSkipped: number;
  productsUpserted: number;
  snapshotsWritten: number;
  errors: string[];
  durationMs: number;
}

// Service context
export interface ServiceContext {
  pool: Pool;
  logger?: (message: string) => void;
}
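Taken together, a typical request for the pipeline is a HydrationOptions literal like the following (values illustrative):

// A bounded, preview-only backfill request (values illustrative):
const options: HydrationOptions = {
  mode: 'backfill',
  dispensaryId: 42,
  startDate: new Date('2025-01-01'),
  endDate: new Date('2025-02-01'),
  batchSize: 50,
  dryRun: true,
};
// The service answers with a HydrationResult: counts of crawl runs
// created/skipped, products upserted, snapshots written, plus errors
// and total duration.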
657
backend/src/crawlers/base/base-dutchie.ts
Normal file
@@ -0,0 +1,657 @@
/**
 * Base Dutchie Crawler Template
 *
 * This is the base template for all Dutchie store crawlers.
 * Per-store crawlers extend this by overriding specific methods.
 *
 * Exports:
 * - crawlProducts(dispensary, options) - Main crawl entry point
 * - detectStructure(page) - Detect page structure for sandbox mode
 * - extractProducts(document) - Extract product data
 * - extractImages(document) - Extract product images
 * - extractStock(document) - Extract stock status
 * - extractPagination(document) - Extract pagination info
 */

import {
  crawlDispensaryProducts as baseCrawlDispensaryProducts,
  CrawlResult,
} from '../../dutchie-az/services/product-crawler';
import { Dispensary, CrawlerProfileOptions } from '../../dutchie-az/types';

// Re-export CrawlResult for convenience
export { CrawlResult };

// ============================================================
// TYPES
// ============================================================

/**
 * Options passed to the per-store crawler
 */
export interface StoreCrawlOptions {
  pricingType?: 'rec' | 'med';
  useBothModes?: boolean;
  downloadImages?: boolean;
  trackStock?: boolean;
  timeoutMs?: number;
  config?: Record<string, any>;
}

/**
 * Progress callback for reporting crawl progress
 */
export interface CrawlProgressCallback {
  phase: 'fetching' | 'processing' | 'saving' | 'images' | 'complete';
  current: number;
  total: number;
  message?: string;
}

/**
 * Structure detection result for sandbox mode
 */
export interface StructureDetectionResult {
  success: boolean;
  menuType: 'dutchie' | 'treez' | 'jane' | 'unknown';
  iframeUrl?: string;
  graphqlEndpoint?: string;
  dispensaryId?: string;
  selectors: {
    productContainer?: string;
    productName?: string;
    productPrice?: string;
    productImage?: string;
    productCategory?: string;
    pagination?: string;
    loadMore?: string;
  };
  pagination: {
    type: 'scroll' | 'click' | 'graphql' | 'none';
    hasMore?: boolean;
    pageSize?: number;
  };
  errors: string[];
  metadata: Record<string, any>;
}

/**
 * Product extraction result
 */
export interface ExtractedProduct {
  externalId: string;
  name: string;
  brand?: string;
  category?: string;
  subcategory?: string;
  price?: number;
  priceRec?: number;
  priceMed?: number;
  weight?: string;
  thcContent?: string;
  cbdContent?: string;
  description?: string;
  imageUrl?: string;
  stockStatus?: 'in_stock' | 'out_of_stock' | 'low_stock' | 'unknown';
  quantity?: number;
  raw?: Record<string, any>;
}

/**
 * Image extraction result
 */
export interface ExtractedImage {
  productId: string;
  imageUrl: string;
  isPrimary: boolean;
  position: number;
}

/**
 * Stock extraction result
 */
export interface ExtractedStock {
  productId: string;
  status: 'in_stock' | 'out_of_stock' | 'low_stock' | 'unknown';
  quantity?: number;
  lastChecked: Date;
}

/**
 * Pagination extraction result
 */
export interface ExtractedPagination {
  hasNextPage: boolean;
  currentPage?: number;
  totalPages?: number;
  totalProducts?: number;
  nextCursor?: string;
  loadMoreSelector?: string;
}

/**
 * Hook points that per-store crawlers can override
 */
export interface DutchieCrawlerHooks {
  /**
   * Called before fetching products
   * Can be used to set up custom headers, cookies, etc.
   */
  beforeFetch?: (dispensary: Dispensary) => Promise<void>;

  /**
   * Called after fetching products, before processing
   * Can be used to filter or transform raw products
   */
  afterFetch?: (products: any[], dispensary: Dispensary) => Promise<any[]>;

  /**
   * Called after all processing is complete
   * Can be used for cleanup or post-processing
   */
  afterComplete?: (result: CrawlResult, dispensary: Dispensary) => Promise<void>;

  /**
   * Custom selector resolver for iframe detection
   */
  resolveIframe?: (page: any) => Promise<string | null>;

  /**
   * Custom product container selector
   */
  getProductContainerSelector?: () => string;

  /**
   * Custom product extraction from container element
   */
  extractProductFromElement?: (element: any) => Promise<ExtractedProduct | null>;
}

/**
 * Selectors configuration for per-store overrides
 */
export interface DutchieSelectors {
  iframe?: string;
  productContainer?: string;
  productName?: string;
  productPrice?: string;
  productPriceRec?: string;
  productPriceMed?: string;
  productImage?: string;
  productCategory?: string;
  productBrand?: string;
  productWeight?: string;
  productThc?: string;
  productCbd?: string;
  productDescription?: string;
  productStock?: string;
  loadMore?: string;
  pagination?: string;
}

// ============================================================
// DEFAULT SELECTORS
// ============================================================

export const DEFAULT_DUTCHIE_SELECTORS: DutchieSelectors = {
  iframe: 'iframe[src*="dutchie.com"]',
  productContainer: '[data-testid="product-card"], .product-card, [class*="ProductCard"]',
  productName: '[data-testid="product-title"], .product-title, [class*="ProductTitle"]',
  productPrice: '[data-testid="product-price"], .product-price, [class*="ProductPrice"]',
  productImage: 'img[src*="dutchie"], img[src*="product"], .product-image img',
  productCategory: '[data-testid="category-name"], .category-name',
  productBrand: '[data-testid="brand-name"], .brand-name, [class*="BrandName"]',
  loadMore: 'button[data-testid="load-more"], .load-more-button',
  pagination: '.pagination, [class*="Pagination"]',
};
// ============================================================
|
||||||
|
// BASE CRAWLER CLASS
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
|
||||||
|
* BaseDutchieCrawler - Base class for all Dutchie store crawlers
|
||||||
|
*
|
||||||
|
* Per-store crawlers extend this class and override methods as needed.
|
||||||
|
* The default implementation delegates to the existing shared Dutchie logic.
|
||||||
|
*/
|
||||||
|
export class BaseDutchieCrawler {
|
||||||
|
protected dispensary: Dispensary;
|
||||||
|
protected options: StoreCrawlOptions;
|
||||||
|
protected hooks: DutchieCrawlerHooks;
|
||||||
|
protected selectors: DutchieSelectors;
|
||||||
|
|
||||||
|
constructor(
|
||||||
|
dispensary: Dispensary,
|
||||||
|
options: StoreCrawlOptions = {},
|
||||||
|
hooks: DutchieCrawlerHooks = {},
|
||||||
|
selectors: DutchieSelectors = {}
|
||||||
|
) {
|
||||||
|
this.dispensary = dispensary;
|
||||||
|
this.options = {
|
||||||
|
pricingType: 'rec',
|
||||||
|
useBothModes: true,
|
||||||
|
downloadImages: true,
|
||||||
|
trackStock: true,
|
||||||
|
timeoutMs: 30000,
|
||||||
|
...options,
|
||||||
|
};
|
||||||
|
this.hooks = hooks;
|
||||||
|
this.selectors = { ...DEFAULT_DUTCHIE_SELECTORS, ...selectors };
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Main entry point - crawl products for this dispensary
|
||||||
|
* Override this in per-store crawlers to customize behavior
|
||||||
|
*/
|
||||||
|
async crawlProducts(): Promise<CrawlResult> {
|
||||||
|
// Call beforeFetch hook if defined
|
||||||
|
if (this.hooks.beforeFetch) {
|
||||||
|
await this.hooks.beforeFetch(this.dispensary);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use the existing shared Dutchie crawl logic
|
||||||
|
const result = await baseCrawlDispensaryProducts(
|
||||||
|
this.dispensary,
|
||||||
|
this.options.pricingType || 'rec',
|
||||||
|
{
|
||||||
|
useBothModes: this.options.useBothModes,
|
||||||
|
downloadImages: this.options.downloadImages,
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
// Call afterComplete hook if defined
|
||||||
|
if (this.hooks.afterComplete) {
|
||||||
|
await this.hooks.afterComplete(result, this.dispensary);
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Detect page structure for sandbox discovery mode
|
||||||
|
* Override in per-store crawlers if needed
|
||||||
|
*
|
||||||
|
* @param page - Puppeteer page object or HTML string
|
||||||
|
* @returns Structure detection result
|
||||||
|
*/
|
||||||
|
async detectStructure(page: any): Promise<StructureDetectionResult> {
|
||||||
|
const result: StructureDetectionResult = {
|
||||||
|
success: false,
|
||||||
|
menuType: 'unknown',
|
||||||
|
selectors: {},
|
||||||
|
pagination: { type: 'none' },
|
||||||
|
errors: [],
|
||||||
|
metadata: {},
|
||||||
|
};
|
||||||
|
|
||||||
|
try {
|
||||||
|
// Default implementation: check for Dutchie iframe
|
||||||
|
if (typeof page === 'string') {
|
||||||
|
// HTML string mode
|
||||||
|
if (page.includes('dutchie.com')) {
|
||||||
|
result.menuType = 'dutchie';
|
||||||
|
result.success = true;
|
||||||
|
}
|
||||||
|
} else if (page && typeof page.evaluate === 'function') {
|
||||||
|
// Puppeteer page mode
|
||||||
|
const detection = await page.evaluate((selectorConfig: DutchieSelectors) => {
|
||||||
|
const iframe = document.querySelector(selectorConfig.iframe || '') as HTMLIFrameElement;
|
||||||
|
const iframeUrl = iframe?.src || null;
|
||||||
|
|
||||||
|
// Check for product containers
|
||||||
|
const containers = document.querySelectorAll(selectorConfig.productContainer || '');
|
||||||
|
|
||||||
|
return {
|
||||||
|
hasIframe: !!iframe,
|
||||||
|
iframeUrl,
|
||||||
|
productCount: containers.length,
|
||||||
|
isDutchie: !!iframeUrl?.includes('dutchie.com'),
|
||||||
|
};
|
||||||
|
}, this.selectors);
|
||||||
|
|
||||||
|
if (detection.isDutchie) {
|
||||||
|
result.menuType = 'dutchie';
|
||||||
|
result.iframeUrl = detection.iframeUrl;
|
||||||
|
result.success = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
result.metadata = detection;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set default selectors for Dutchie
|
||||||
|
if (result.menuType === 'dutchie') {
|
||||||
|
result.selectors = {
|
||||||
|
productContainer: this.selectors.productContainer,
|
||||||
|
productName: this.selectors.productName,
|
||||||
|
productPrice: this.selectors.productPrice,
|
||||||
|
productImage: this.selectors.productImage,
|
||||||
|
productCategory: this.selectors.productCategory,
|
||||||
|
};
|
||||||
|
result.pagination = { type: 'graphql' };
|
||||||
|
}
|
||||||
|
} catch (error: any) {
|
||||||
|
result.errors.push(`Detection error: ${error.message}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract products from page/document
|
||||||
|
* Override in per-store crawlers for custom extraction
|
||||||
|
*
|
||||||
|
* @param document - DOM document, Puppeteer page, or raw products array
|
||||||
|
* @returns Array of extracted products
|
||||||
|
*/
|
||||||
|
async extractProducts(document: any): Promise<ExtractedProduct[]> {
|
||||||
|
// Default implementation: assume document is already an array of products
|
||||||
|
// from the GraphQL response
|
||||||
|
if (Array.isArray(document)) {
|
||||||
|
return document.map((product) => this.mapRawProduct(product));
|
||||||
|
}
|
||||||
|
|
||||||
|
// If document is a Puppeteer page, extract from DOM
|
||||||
|
if (document && typeof document.evaluate === 'function') {
|
||||||
|
return this.extractProductsFromPage(document);
|
||||||
|
}
|
||||||
|
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract products from Puppeteer page
|
||||||
|
* Override for custom DOM extraction
|
||||||
|
*/
|
||||||
|
protected async extractProductsFromPage(page: any): Promise<ExtractedProduct[]> {
|
||||||
|
const products = await page.evaluate((selectors: DutchieSelectors) => {
|
||||||
|
const containers = document.querySelectorAll(selectors.productContainer || '');
|
||||||
|
return Array.from(containers).map((container) => {
|
||||||
|
const nameEl = container.querySelector(selectors.productName || '');
|
||||||
|
const priceEl = container.querySelector(selectors.productPrice || '');
|
||||||
|
const imageEl = container.querySelector(selectors.productImage || '') as HTMLImageElement;
|
||||||
|
const brandEl = container.querySelector(selectors.productBrand || '');
|
||||||
|
|
||||||
|
return {
|
||||||
|
name: nameEl?.textContent?.trim() || '',
|
||||||
|
price: priceEl?.textContent?.trim() || '',
|
||||||
|
imageUrl: imageEl?.src || '',
|
||||||
|
brand: brandEl?.textContent?.trim() || '',
|
||||||
|
};
|
||||||
|
});
|
||||||
|
}, this.selectors);
|
||||||
|
|
||||||
|
return products.map((p: any, i: number) => ({
|
||||||
|
externalId: `dom-product-${i}`,
|
||||||
|
name: p.name,
|
||||||
|
brand: p.brand,
|
||||||
|
price: this.parsePrice(p.price),
|
||||||
|
imageUrl: p.imageUrl,
|
||||||
|
stockStatus: 'unknown' as const,
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Map raw product from GraphQL to ExtractedProduct
|
||||||
|
* Override for custom mapping
|
||||||
|
*/
|
||||||
|
protected mapRawProduct(raw: any): ExtractedProduct {
|
||||||
|
return {
|
||||||
|
externalId: raw.id || raw._id || raw.externalId,
|
||||||
|
name: raw.name || raw.Name,
|
||||||
|
brand: raw.brand?.name || raw.brandName || raw.brand,
|
||||||
|
category: raw.type || raw.category || raw.Category,
|
||||||
|
subcategory: raw.subcategory || raw.Subcategory,
|
||||||
|
price: raw.recPrice || raw.price || raw.Price,
|
||||||
|
priceRec: raw.recPrice || raw.Prices?.rec,
|
||||||
|
priceMed: raw.medPrice || raw.Prices?.med,
|
||||||
|
weight: raw.weight || raw.Weight,
|
||||||
|
thcContent: raw.potencyThc?.formatted || raw.THCContent?.formatted,
|
||||||
|
cbdContent: raw.potencyCbd?.formatted || raw.CBDContent?.formatted,
|
||||||
|
description: raw.description || raw.Description,
|
||||||
|
imageUrl: raw.image || raw.Image,
|
||||||
|
stockStatus: this.mapStockStatus(raw),
|
||||||
|
quantity: raw.quantity || raw.Quantity,
|
||||||
|
raw,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Map raw stock status to standardized value
|
||||||
|
*/
|
||||||
|
protected mapStockStatus(raw: any): 'in_stock' | 'out_of_stock' | 'low_stock' | 'unknown' {
|
||||||
|
const status = raw.Status || raw.status || raw.stockStatus;
|
||||||
|
if (status === 'Active' || status === 'active' || status === 'in_stock') {
|
||||||
|
return 'in_stock';
|
||||||
|
}
|
||||||
|
if (status === 'Inactive' || status === 'inactive' || status === 'out_of_stock') {
|
||||||
|
return 'out_of_stock';
|
||||||
|
}
|
||||||
|
if (status === 'low_stock') {
|
||||||
|
return 'low_stock';
|
||||||
|
}
|
||||||
|
return 'unknown';
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parse price string to number
|
||||||
|
*/
|
||||||
|
protected parsePrice(priceStr: string): number | undefined {
|
||||||
|
if (!priceStr) return undefined;
|
||||||
|
const cleaned = priceStr.replace(/[^0-9.]/g, '');
|
||||||
|
const num = parseFloat(cleaned);
|
||||||
|
return isNaN(num) ? undefined : num;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract images from document
|
||||||
|
* Override for custom image extraction
|
||||||
|
*
|
||||||
|
* @param document - DOM document, Puppeteer page, or products array
|
||||||
|
* @returns Array of extracted images
|
||||||
|
*/
|
||||||
|
async extractImages(document: any): Promise<ExtractedImage[]> {
|
||||||
|
if (Array.isArray(document)) {
|
||||||
|
return document
|
||||||
|
.filter((p) => p.image || p.Image || p.imageUrl)
|
||||||
|
.map((p, i) => ({
|
||||||
|
productId: p.id || p._id || `product-${i}`,
|
||||||
|
imageUrl: p.image || p.Image || p.imageUrl,
|
||||||
|
isPrimary: true,
|
||||||
|
position: 0,
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Puppeteer page extraction
|
||||||
|
if (document && typeof document.evaluate === 'function') {
|
||||||
|
return this.extractImagesFromPage(document);
|
||||||
|
}
|
||||||
|
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract images from Puppeteer page
|
||||||
|
*/
|
||||||
|
protected async extractImagesFromPage(page: any): Promise<ExtractedImage[]> {
|
||||||
|
const images = await page.evaluate((selector: string) => {
|
||||||
|
const imgs = document.querySelectorAll(selector);
|
||||||
|
return Array.from(imgs).map((img, i) => ({
|
||||||
|
src: (img as HTMLImageElement).src,
|
||||||
|
position: i,
|
||||||
|
}));
|
||||||
|
}, this.selectors.productImage || 'img');
|
||||||
|
|
||||||
|
return images.map((img: any, i: number) => ({
|
||||||
|
productId: `dom-product-${i}`,
|
||||||
|
imageUrl: img.src,
|
||||||
|
isPrimary: i === 0,
|
||||||
|
position: img.position,
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract stock information from document
|
||||||
|
* Override for custom stock extraction
|
||||||
|
*
|
||||||
|
* @param document - DOM document, Puppeteer page, or products array
|
||||||
|
* @returns Array of extracted stock statuses
|
||||||
|
*/
|
||||||
|
async extractStock(document: any): Promise<ExtractedStock[]> {
|
||||||
|
if (Array.isArray(document)) {
|
||||||
|
return document.map((p) => ({
|
||||||
|
productId: p.id || p._id || p.externalId,
|
||||||
|
status: this.mapStockStatus(p),
|
||||||
|
quantity: p.quantity || p.Quantity,
|
||||||
|
lastChecked: new Date(),
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract pagination information from document
|
||||||
|
* Override for custom pagination handling
|
||||||
|
*
|
||||||
|
* @param document - DOM document, Puppeteer page, or GraphQL response
|
||||||
|
* @returns Pagination info
|
||||||
|
*/
|
||||||
|
async extractPagination(document: any): Promise<ExtractedPagination> {
|
||||||
|
// Default: check for page info in GraphQL response
|
||||||
|
if (document && document.pageInfo) {
|
||||||
|
return {
|
||||||
|
hasNextPage: document.pageInfo.hasNextPage || false,
|
||||||
|
currentPage: document.pageInfo.currentPage,
|
||||||
|
totalPages: document.pageInfo.totalPages,
|
||||||
|
totalProducts: document.pageInfo.totalCount || document.totalCount,
|
||||||
|
nextCursor: document.pageInfo.endCursor,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Default: no pagination
|
||||||
|
return {
|
||||||
|
hasNextPage: false,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the cName (Dutchie slug) for this dispensary
|
||||||
|
* Override to customize cName extraction
|
||||||
|
*/
|
||||||
|
getCName(): string {
|
||||||
|
if (this.dispensary.menuUrl) {
|
||||||
|
try {
|
||||||
|
const url = new URL(this.dispensary.menuUrl);
|
||||||
|
const segments = url.pathname.split('/').filter(Boolean);
|
||||||
|
if (segments.length >= 2) {
|
||||||
|
return segments[segments.length - 1];
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// Fall through to default
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return this.dispensary.slug || '';
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get custom headers for API requests
|
||||||
|
* Override for store-specific headers
|
||||||
|
*/
|
||||||
|
getCustomHeaders(): Record<string, string> {
|
||||||
|
const cName = this.getCName();
|
||||||
|
return {
|
||||||
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||||
|
Origin: 'https://dutchie.com',
|
||||||
|
Referer: `https://dutchie.com/embedded-menu/${cName}`,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// FACTORY FUNCTION
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a base Dutchie crawler instance
|
||||||
|
* This is the default export used when no per-store override exists
|
||||||
|
*/
|
||||||
|
export function createCrawler(
|
||||||
|
dispensary: Dispensary,
|
||||||
|
options: StoreCrawlOptions = {},
|
||||||
|
hooks: DutchieCrawlerHooks = {},
|
||||||
|
selectors: DutchieSelectors = {}
|
||||||
|
): BaseDutchieCrawler {
|
||||||
|
return new BaseDutchieCrawler(dispensary, options, hooks, selectors);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// STANDALONE FUNCTIONS (required exports for orchestrator)
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Crawl products using the base Dutchie logic
|
||||||
|
* Per-store files can call this or override it completely
|
||||||
|
*/
|
||||||
|
export async function crawlProducts(
|
||||||
|
dispensary: Dispensary,
|
||||||
|
options: StoreCrawlOptions = {}
|
||||||
|
): Promise<CrawlResult> {
|
||||||
|
const crawler = createCrawler(dispensary, options);
|
||||||
|
return crawler.crawlProducts();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Detect structure using the base Dutchie logic
|
||||||
|
*/
|
||||||
|
export async function detectStructure(
|
||||||
|
page: any,
|
||||||
|
dispensary?: Dispensary
|
||||||
|
): Promise<StructureDetectionResult> {
|
||||||
|
const crawler = createCrawler(dispensary || ({} as Dispensary));
|
||||||
|
return crawler.detectStructure(page);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract products using the base Dutchie logic
|
||||||
|
*/
|
||||||
|
export async function extractProducts(
|
||||||
|
document: any,
|
||||||
|
dispensary?: Dispensary
|
||||||
|
): Promise<ExtractedProduct[]> {
|
||||||
|
const crawler = createCrawler(dispensary || ({} as Dispensary));
|
||||||
|
return crawler.extractProducts(document);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract images using the base Dutchie logic
|
||||||
|
*/
|
||||||
|
export async function extractImages(
|
||||||
|
document: any,
|
||||||
|
dispensary?: Dispensary
|
||||||
|
): Promise<ExtractedImage[]> {
|
||||||
|
const crawler = createCrawler(dispensary || ({} as Dispensary));
|
||||||
|
return crawler.extractImages(document);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract stock using the base Dutchie logic
|
||||||
|
*/
|
||||||
|
export async function extractStock(
|
||||||
|
document: any,
|
||||||
|
dispensary?: Dispensary
|
||||||
|
): Promise<ExtractedStock[]> {
|
||||||
|
const crawler = createCrawler(dispensary || ({} as Dispensary));
|
||||||
|
return crawler.extractStock(document);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract pagination using the base Dutchie logic
|
||||||
|
*/
|
||||||
|
export async function extractPagination(
|
||||||
|
document: any,
|
||||||
|
dispensary?: Dispensary
|
||||||
|
): Promise<ExtractedPagination> {
|
||||||
|
const crawler = createCrawler(dispensary || ({} as Dispensary));
|
||||||
|
return crawler.extractPagination(document);
|
||||||
|
}
|
||||||
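
For illustration, a minimal sketch (not part of the commit) of how a per-store module might wire hooks and selector overrides through createCrawler(); the import paths and the `dispensary` row are assumptions about the caller:

import { createCrawler, DutchieCrawlerHooks } from './base/base-dutchie';
import { Dispensary } from '../dutchie-az/types';

async function runWithHooks(dispensary: Dispensary) {
  const hooks: DutchieCrawlerHooks = {
    // Runs before the shared fetch logic starts
    beforeFetch: async (d) => console.log(`[hooks] starting ${d.name}`),
    // Runs after the crawl finishes, e.g. for metrics
    afterComplete: async (result) =>
      console.log(`[hooks] upserted ${result.productsUpserted} products`),
  };

  const crawler = createCrawler(
    dispensary,
    { pricingType: 'rec', downloadImages: false },
    hooks,
    // Per-store selector override, merged over DEFAULT_DUTCHIE_SELECTORS
    { productContainer: '[data-testid="product-card"]' }
  );
  return crawler.crawlProducts();
}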
330
backend/src/crawlers/base/base-jane.ts
Normal file
@@ -0,0 +1,330 @@
/**
 * Base Jane Crawler Template (PLACEHOLDER)
 *
 * This is the base template for all Jane (iheartjane) store crawlers.
 * Per-store crawlers extend this by overriding specific methods.
 *
 * TODO: Implement Jane-specific crawling logic (Algolia-based)
 */

import { Dispensary } from '../../dutchie-az/types';
import {
  StoreCrawlOptions,
  CrawlResult,
  StructureDetectionResult,
  ExtractedProduct,
  ExtractedImage,
  ExtractedStock,
  ExtractedPagination,
} from './base-dutchie';

// Re-export types
export {
  StoreCrawlOptions,
  CrawlResult,
  StructureDetectionResult,
  ExtractedProduct,
  ExtractedImage,
  ExtractedStock,
  ExtractedPagination,
};

// ============================================================
// JANE-SPECIFIC TYPES
// ============================================================

export interface JaneConfig {
  algoliaAppId?: string;
  algoliaApiKey?: string;
  algoliaIndex?: string;
  storeId?: string;
}

export interface JaneSelectors {
  productContainer?: string;
  productName?: string;
  productPrice?: string;
  productImage?: string;
  productCategory?: string;
  productBrand?: string;
  pagination?: string;
  loadMore?: string;
}

export const DEFAULT_JANE_SELECTORS: JaneSelectors = {
  productContainer: '[data-testid="product-card"], .product-card',
  productName: '[data-testid="product-name"], .product-name',
  productPrice: '[data-testid="product-price"], .product-price',
  productImage: '.product-image img, [data-testid="product-image"] img',
  productCategory: '.product-category',
  productBrand: '.product-brand, [data-testid="brand-name"]',
  loadMore: '[data-testid="load-more"], .load-more-btn',
};

// ============================================================
// BASE JANE CRAWLER CLASS
// ============================================================

export class BaseJaneCrawler {
  protected dispensary: Dispensary;
  protected options: StoreCrawlOptions;
  protected selectors: JaneSelectors;
  protected janeConfig: JaneConfig;

  constructor(
    dispensary: Dispensary,
    options: StoreCrawlOptions = {},
    selectors: JaneSelectors = {},
    janeConfig: JaneConfig = {}
  ) {
    this.dispensary = dispensary;
    this.options = {
      pricingType: 'rec',
      useBothModes: false,
      downloadImages: true,
      trackStock: true,
      timeoutMs: 30000,
      ...options,
    };
    this.selectors = { ...DEFAULT_JANE_SELECTORS, ...selectors };
    this.janeConfig = janeConfig;
  }

  /**
   * Main entry point - crawl products for this dispensary
   * TODO: Implement Jane/Algolia-specific crawling
   */
  async crawlProducts(): Promise<CrawlResult> {
    const startTime = Date.now();
    console.warn(`[BaseJaneCrawler] Jane crawling not yet implemented for ${this.dispensary.name}`);
    return {
      success: false,
      dispensaryId: this.dispensary.id || 0,
      productsFound: 0,
      productsFetched: 0,
      productsUpserted: 0,
      snapshotsCreated: 0,
      imagesDownloaded: 0,
      errorMessage: 'Jane crawler not yet implemented',
      durationMs: Date.now() - startTime,
    };
  }

  /**
   * Detect page structure for sandbox discovery mode
   * Jane uses Algolia, so we look for Algolia config
   */
  async detectStructure(page: any): Promise<StructureDetectionResult> {
    const result: StructureDetectionResult = {
      success: false,
      menuType: 'unknown',
      selectors: {},
      pagination: { type: 'none' },
      errors: [],
      metadata: {},
    };

    try {
      if (page && typeof page.evaluate === 'function') {
        // Look for Jane/Algolia indicators
        const detection = await page.evaluate(() => {
          // Check for iheartjane in page
          const hasJane = document.documentElement.innerHTML.includes('iheartjane') ||
            document.documentElement.innerHTML.includes('jane-menu');

          // Look for Algolia config
          const scripts = Array.from(document.querySelectorAll('script'));
          let algoliaConfig: any = null;

          for (const script of scripts) {
            const content = script.textContent || '';
            if (content.includes('algolia') || content.includes('ALGOLIA')) {
              // Try to extract config
              const appIdMatch = content.match(/applicationId['":\s]+['"]([^'"]+)['"]/);
              const apiKeyMatch = content.match(/apiKey['":\s]+['"]([^'"]+)['"]/);
              if (appIdMatch && apiKeyMatch) {
                algoliaConfig = {
                  appId: appIdMatch[1],
                  apiKey: apiKeyMatch[1],
                };
              }
            }
          }

          return {
            hasJane,
            algoliaConfig,
          };
        });

        if (detection.hasJane) {
          result.menuType = 'jane';
          result.success = true;
          result.metadata = detection;

          if (detection.algoliaConfig) {
            result.metadata.algoliaAppId = detection.algoliaConfig.appId;
            result.metadata.algoliaApiKey = detection.algoliaConfig.apiKey;
          }
        }
      }
    } catch (error: any) {
      result.errors.push(`Detection error: ${error.message}`);
    }

    return result;
  }

  /**
   * Extract products from Algolia response or page
   */
  async extractProducts(document: any): Promise<ExtractedProduct[]> {
    // If document is Algolia hits array
    if (Array.isArray(document)) {
      return document.map((hit) => this.mapAlgoliaHit(hit));
    }

    console.warn('[BaseJaneCrawler] extractProducts not yet fully implemented');
    return [];
  }

  /**
   * Map Algolia hit to ExtractedProduct
   */
  protected mapAlgoliaHit(hit: any): ExtractedProduct {
    return {
      externalId: hit.objectID || hit.id || hit.product_id,
      name: hit.name || hit.product_name,
      brand: hit.brand || hit.brand_name,
      category: hit.category || hit.kind,
      subcategory: hit.subcategory,
      price: hit.price || hit.bucket_price,
      priceRec: hit.prices?.rec || hit.price_rec,
      priceMed: hit.prices?.med || hit.price_med,
      weight: hit.weight || hit.amount,
      thcContent: hit.percent_thc ? `${hit.percent_thc}%` : undefined,
      cbdContent: hit.percent_cbd ? `${hit.percent_cbd}%` : undefined,
      description: hit.description,
      imageUrl: hit.image_url || hit.product_image_url,
      stockStatus: hit.available ? 'in_stock' : 'out_of_stock',
      quantity: hit.quantity_available,
      raw: hit,
    };
  }

  /**
   * Extract images from document
   */
  async extractImages(document: any): Promise<ExtractedImage[]> {
    if (Array.isArray(document)) {
      return document
        .filter((hit) => hit.image_url || hit.product_image_url)
        .map((hit, i) => ({
          productId: hit.objectID || hit.id || `jane-product-${i}`,
          imageUrl: hit.image_url || hit.product_image_url,
          isPrimary: true,
          position: 0,
        }));
    }

    return [];
  }

  /**
   * Extract stock information from document
   */
  async extractStock(document: any): Promise<ExtractedStock[]> {
    if (Array.isArray(document)) {
      return document.map((hit) => ({
        productId: hit.objectID || hit.id,
        status: hit.available ? 'in_stock' as const : 'out_of_stock' as const,
        quantity: hit.quantity_available,
        lastChecked: new Date(),
      }));
    }

    return [];
  }

  /**
   * Extract pagination information
   * Algolia search responses are page-based (page / nbPages)
   */
  async extractPagination(document: any): Promise<ExtractedPagination> {
    if (document && typeof document === 'object' && !Array.isArray(document)) {
      return {
        hasNextPage: document.page < document.nbPages - 1,
        currentPage: document.page,
        totalPages: document.nbPages,
        totalProducts: document.nbHits,
      };
    }

    return { hasNextPage: false };
  }
}

// ============================================================
// FACTORY FUNCTION
// ============================================================

export function createCrawler(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {},
  selectors: JaneSelectors = {},
  janeConfig: JaneConfig = {}
): BaseJaneCrawler {
  return new BaseJaneCrawler(dispensary, options, selectors, janeConfig);
}

// ============================================================
// STANDALONE FUNCTIONS
// ============================================================

export async function crawlProducts(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {}
): Promise<CrawlResult> {
  const crawler = createCrawler(dispensary, options);
  return crawler.crawlProducts();
}

export async function detectStructure(
  page: any,
  dispensary?: Dispensary
): Promise<StructureDetectionResult> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.detectStructure(page);
}

export async function extractProducts(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedProduct[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractProducts(document);
}

export async function extractImages(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedImage[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractImages(document);
}

export async function extractStock(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedStock[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractStock(document);
}

export async function extractPagination(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedPagination> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractPagination(document);
}
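
For illustration, a sketch (not part of the commit) of how hits could be fetched once detectStructure() has recovered an Algolia app ID and API key, using Algolia's standard REST query endpoint. The index name and store_id facet are hypothetical placeholders, not confirmed Jane schema:

import axios from 'axios';

async function fetchAlgoliaHits(appId: string, apiKey: string, storeId: string): Promise<any[]> {
  // Algolia REST search: POST https://{appId}-dsn.algolia.net/1/indexes/{index}/query
  const index = 'menu-products-production'; // hypothetical index name
  const url = `https://${appId}-dsn.algolia.net/1/indexes/${index}/query`;

  const response = await axios.post(
    url,
    { params: `filters=store_id:${storeId}&hitsPerPage=100&page=0` },
    {
      headers: {
        'X-Algolia-Application-Id': appId,
        'X-Algolia-API-Key': apiKey,
      },
      timeout: 10000,
    }
  );

  // response.data.hits is the array shape extractProducts() / extractStock() accept
  return response.data.hits;
}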
212
backend/src/crawlers/base/base-treez.ts
Normal file
@@ -0,0 +1,212 @@
/**
 * Base Treez Crawler Template (PLACEHOLDER)
 *
 * This is the base template for all Treez store crawlers.
 * Per-store crawlers extend this by overriding specific methods.
 *
 * TODO: Implement Treez-specific crawling logic
 */

import { Dispensary } from '../../dutchie-az/types';
import {
  StoreCrawlOptions,
  CrawlResult,
  StructureDetectionResult,
  ExtractedProduct,
  ExtractedImage,
  ExtractedStock,
  ExtractedPagination,
} from './base-dutchie';

// Re-export types
export {
  StoreCrawlOptions,
  CrawlResult,
  StructureDetectionResult,
  ExtractedProduct,
  ExtractedImage,
  ExtractedStock,
  ExtractedPagination,
};

// ============================================================
// TREEZ-SPECIFIC TYPES
// ============================================================

export interface TreezSelectors {
  productContainer?: string;
  productName?: string;
  productPrice?: string;
  productImage?: string;
  productCategory?: string;
  productBrand?: string;
  addToCart?: string;
  pagination?: string;
}

export const DEFAULT_TREEZ_SELECTORS: TreezSelectors = {
  productContainer: '.product-tile, [class*="ProductCard"]',
  productName: '.product-name, [class*="ProductName"]',
  productPrice: '.product-price, [class*="ProductPrice"]',
  productImage: '.product-image img',
  productCategory: '.product-category',
  productBrand: '.product-brand',
  addToCart: '.add-to-cart-btn',
  pagination: '.pagination',
};

// ============================================================
// BASE TREEZ CRAWLER CLASS
// ============================================================

export class BaseTreezCrawler {
  protected dispensary: Dispensary;
  protected options: StoreCrawlOptions;
  protected selectors: TreezSelectors;

  constructor(
    dispensary: Dispensary,
    options: StoreCrawlOptions = {},
    selectors: TreezSelectors = {}
  ) {
    this.dispensary = dispensary;
    this.options = {
      pricingType: 'rec',
      useBothModes: false,
      downloadImages: true,
      trackStock: true,
      timeoutMs: 30000,
      ...options,
    };
    this.selectors = { ...DEFAULT_TREEZ_SELECTORS, ...selectors };
  }

  /**
   * Main entry point - crawl products for this dispensary
   * TODO: Implement Treez-specific crawling
   */
  async crawlProducts(): Promise<CrawlResult> {
    const startTime = Date.now();
    console.warn(`[BaseTreezCrawler] Treez crawling not yet implemented for ${this.dispensary.name}`);
    return {
      success: false,
      dispensaryId: this.dispensary.id || 0,
      productsFound: 0,
      productsFetched: 0,
      productsUpserted: 0,
      snapshotsCreated: 0,
      imagesDownloaded: 0,
      errorMessage: 'Treez crawler not yet implemented',
      durationMs: Date.now() - startTime,
    };
  }

  /**
   * Detect page structure for sandbox discovery mode
   */
  async detectStructure(page: any): Promise<StructureDetectionResult> {
    return {
      success: false,
      menuType: 'unknown',
      selectors: {},
      pagination: { type: 'none' },
      errors: ['Treez structure detection not yet implemented'],
      metadata: {},
    };
  }

  /**
   * Extract products from page/document
   */
  async extractProducts(document: any): Promise<ExtractedProduct[]> {
    console.warn('[BaseTreezCrawler] extractProducts not yet implemented');
    return [];
  }

  /**
   * Extract images from document
   */
  async extractImages(document: any): Promise<ExtractedImage[]> {
    console.warn('[BaseTreezCrawler] extractImages not yet implemented');
    return [];
  }

  /**
   * Extract stock information from document
   */
  async extractStock(document: any): Promise<ExtractedStock[]> {
    console.warn('[BaseTreezCrawler] extractStock not yet implemented');
    return [];
  }

  /**
   * Extract pagination information from document
   */
  async extractPagination(document: any): Promise<ExtractedPagination> {
    return { hasNextPage: false };
  }
}

// ============================================================
// FACTORY FUNCTION
// ============================================================

export function createCrawler(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {},
  selectors: TreezSelectors = {}
): BaseTreezCrawler {
  return new BaseTreezCrawler(dispensary, options, selectors);
}

// ============================================================
// STANDALONE FUNCTIONS
// ============================================================

export async function crawlProducts(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {}
): Promise<CrawlResult> {
  const crawler = createCrawler(dispensary, options);
  return crawler.crawlProducts();
}

export async function detectStructure(
  page: any,
  dispensary?: Dispensary
): Promise<StructureDetectionResult> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.detectStructure(page);
}

export async function extractProducts(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedProduct[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractProducts(document);
}

export async function extractImages(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedImage[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractImages(document);
}

export async function extractStock(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedStock[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractStock(document);
}

export async function extractPagination(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedPagination> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractPagination(document);
}
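
For illustration, a sketch (not part of the commit) of the override pattern a future per-store Treez crawler could use once real logic lands; the DOM shape is assumed, not verified against Treez menus:

import { BaseTreezCrawler, ExtractedProduct } from './base-treez';

class ExampleTreezStoreCrawler extends BaseTreezCrawler {
  // Replace the placeholder with a simple DOM walk over the default selectors
  async extractProducts(document: any): Promise<ExtractedProduct[]> {
    const containers = document.querySelectorAll(this.selectors.productContainer || '');
    return Array.from(containers).map((el: any, i: number) => ({
      externalId: `treez-dom-${i}`,
      name: el.querySelector(this.selectors.productName || '')?.textContent?.trim() || '',
      stockStatus: 'unknown' as const,
    }));
  }
}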
27
backend/src/crawlers/base/index.ts
Normal file
@@ -0,0 +1,27 @@
/**
 * Base Crawler Templates Index
 *
 * Exports all base crawler templates for easy importing.
 */

// Dutchie base (primary implementation)
export * from './base-dutchie';

// Treez base (placeholder)
export * as Treez from './base-treez';

// Jane base (placeholder)
export * as Jane from './base-jane';

// Re-export common types from dutchie for convenience
export type {
  StoreCrawlOptions,
  CrawlResult,
  StructureDetectionResult,
  ExtractedProduct,
  ExtractedImage,
  ExtractedStock,
  ExtractedPagination,
  DutchieCrawlerHooks,
  DutchieSelectors,
} from './base-dutchie';
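
For illustration (not part of the commit), the intended consumption of this barrel, with Dutchie exports at the top level and the placeholders namespaced; the relative path is an assumption about the caller's location:

import { crawlProducts, Jane, Treez } from '../crawlers/base';

// Dutchie is the primary surface:   await crawlProducts(dispensary);
// Placeholders stay namespaced:     await Jane.crawlProducts(dispensary);
//                                   await Treez.crawlProducts(dispensary);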
9
backend/src/crawlers/dutchie/base-dutchie.ts
Normal file
@@ -0,0 +1,9 @@
/**
 * Base Dutchie Crawler Template (Re-export for backward compatibility)
 *
 * DEPRECATED: Import from '../base/base-dutchie' instead.
 * This file re-exports everything from the new location for existing code.
 */

// Re-export everything from the new base location
export * from '../base/base-dutchie';
118
backend/src/crawlers/dutchie/stores/trulieve-scottsdale.ts
Normal file
@@ -0,0 +1,118 @@
/**
 * Trulieve Scottsdale - Per-Store Dutchie Crawler
 *
 * Store ID: 101
 * Profile Key: trulieve-scottsdale
 * Platform Dispensary ID: 5eaf489fa8a61801212577cc
 *
 * Phase 1: Identity implementation - no overrides, just uses base Dutchie logic.
 * Future: Add store-specific selectors, timing, or custom logic as needed.
 */

import {
  BaseDutchieCrawler,
  StoreCrawlOptions,
  CrawlResult,
  DutchieSelectors,
  crawlProducts as baseCrawlProducts,
} from '../../base/base-dutchie';
import { Dispensary } from '../../../dutchie-az/types';

// Re-export CrawlResult for the orchestrator
export { CrawlResult };

// ============================================================
// STORE CONFIGURATION
// ============================================================

/**
 * Store-specific configuration
 * These can be used to customize crawler behavior for this store
 */
export const STORE_CONFIG = {
  storeId: 101,
  profileKey: 'trulieve-scottsdale',
  name: 'Trulieve of Scottsdale Dispensary',
  platformDispensaryId: '5eaf489fa8a61801212577cc',

  // Store-specific overrides (none for Phase 1)
  customOptions: {
    // Example future overrides:
    // pricingType: 'rec',
    // useBothModes: true,
    // customHeaders: {},
    // maxRetries: 3,
  },
};

// ============================================================
// STORE CRAWLER CLASS
// ============================================================

/**
 * TrulieveScottsdaleCrawler - Per-store crawler for Trulieve Scottsdale
 *
 * Phase 1: Identity implementation - extends BaseDutchieCrawler with no overrides.
 * Future phases can override methods like:
 * - getCName() for custom slug handling
 * - crawlProducts() for completely custom logic
 * - Add hooks for pre/post processing
 */
export class TrulieveScottsdaleCrawler extends BaseDutchieCrawler {
  constructor(dispensary: Dispensary, options: StoreCrawlOptions = {}) {
    // Merge store-specific options with provided options
    const mergedOptions: StoreCrawlOptions = {
      ...STORE_CONFIG.customOptions,
      ...options,
    };

    super(dispensary, mergedOptions);
  }

  // Phase 1: No overrides - use base implementation
  // Future phases can add overrides here:
  //
  // async crawlProducts(): Promise<CrawlResult> {
  //   // Custom pre-processing
  //   // ...
  //   const result = await super.crawlProducts();
  //   // Custom post-processing
  //   // ...
  //   return result;
  // }
}

// ============================================================
// EXPORTED CRAWL FUNCTION
// ============================================================

/**
 * Main entry point for the orchestrator
 *
 * The orchestrator calls: mod.crawlProducts(dispensary, options)
 * This function creates a TrulieveScottsdaleCrawler and runs it.
 */
export async function crawlProducts(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {}
): Promise<CrawlResult> {
  console.log(`[TrulieveScottsdale] Using per-store crawler for ${dispensary.name}`);

  const crawler = new TrulieveScottsdaleCrawler(dispensary, options);
  return crawler.crawlProducts();
}

// ============================================================
// FACTORY FUNCTION (alternative API)
// ============================================================

/**
 * Create a crawler instance without running it
 * Useful for testing or when you need to configure before running
 */
export function createCrawler(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {}
): TrulieveScottsdaleCrawler {
  return new TrulieveScottsdaleCrawler(dispensary, options);
}
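
For illustration, a sketch (not part of the commit) of the dynamic-import pattern the doc comment above describes, where the orchestrator resolves a per-store module by profile key and calls its crawlProducts export; the path template and error handling are assumptions:

import { Dispensary } from '../../../dutchie-az/types';
import { CrawlResult, StoreCrawlOptions } from '../../base/base-dutchie';

async function runStoreCrawler(
  profileKey: string, // e.g. 'trulieve-scottsdale'
  dispensary: Dispensary,
  options: StoreCrawlOptions = {}
): Promise<CrawlResult> {
  const mod = await import(`./stores/${profileKey}`);
  if (typeof mod.crawlProducts !== 'function') {
    throw new Error(`Per-store module ${profileKey} has no crawlProducts export`);
  }
  return mod.crawlProducts(dispensary, options);
}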
@@ -1,4 +1,4 @@
-import { pool } from './migrate';
+import { pool } from './pool';
 
 async function addJobsTable() {
   const client = await pool.connect();
|
|||||||
|
/**
|
||||||
|
* Database Migration Script (CLI-ONLY)
|
||||||
|
*
|
||||||
|
* This file is for running migrations via CLI only:
|
||||||
|
* npx tsx src/db/migrate.ts
|
||||||
|
*
|
||||||
|
* DO NOT import this file from runtime code.
|
||||||
|
* Runtime code should import from src/db/pool.ts instead.
|
||||||
|
*/
|
||||||
|
|
||||||
import { Pool } from 'pg';
|
import { Pool } from 'pg';
|
||||||
|
import dotenv from 'dotenv';
|
||||||
|
|
||||||
// Consolidated DB connection:
|
// Load .env BEFORE any env var access
|
||||||
// - Prefer CRAWLSY_DATABASE_URL (e.g., crawlsy_local, crawlsy_prod)
|
dotenv.config();
|
||||||
// - Then DATABASE_URL (default)
|
|
||||||
const DATABASE_URL =
|
|
||||||
process.env.CRAWLSY_DATABASE_URL ||
|
|
||||||
process.env.DATABASE_URL ||
|
|
||||||
'postgresql://dutchie:dutchie_local_pass@localhost:54320/crawlsy_local';
|
|
||||||
|
|
||||||
const pool = new Pool({
|
/**
|
||||||
connectionString: DATABASE_URL,
|
* Get the database connection string from environment variables.
|
||||||
});
|
* Strict validation - will throw if required vars are missing.
|
||||||
|
*/
|
||||||
|
function getConnectionString(): string {
|
||||||
|
// Priority 1: Full connection URL
|
||||||
|
if (process.env.CANNAIQ_DB_URL) {
|
||||||
|
return process.env.CANNAIQ_DB_URL;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Priority 2: Build from individual env vars (all required)
|
||||||
|
const required = ['CANNAIQ_DB_HOST', 'CANNAIQ_DB_PORT', 'CANNAIQ_DB_NAME', 'CANNAIQ_DB_USER', 'CANNAIQ_DB_PASS'];
|
||||||
|
const missing = required.filter((key) => !process.env[key]);
|
||||||
|
|
||||||
|
if (missing.length > 0) {
|
||||||
|
throw new Error(
|
||||||
|
`[Migrate] Missing required environment variables: ${missing.join(', ')}\n` +
|
||||||
|
`Either set CANNAIQ_DB_URL or all of: CANNAIQ_DB_HOST, CANNAIQ_DB_PORT, CANNAIQ_DB_NAME, CANNAIQ_DB_USER, CANNAIQ_DB_PASS`
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
const host = process.env.CANNAIQ_DB_HOST!;
|
||||||
|
const port = process.env.CANNAIQ_DB_PORT!;
|
||||||
|
const name = process.env.CANNAIQ_DB_NAME!;
|
||||||
|
const user = process.env.CANNAIQ_DB_USER!;
|
||||||
|
const pass = process.env.CANNAIQ_DB_PASS!;
|
||||||
|
|
||||||
|
return `postgresql://${user}:${pass}@${host}:${port}/${name}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Run all database migrations
|
||||||
|
*/
|
||||||
|
async function runMigrations() {
|
||||||
|
// Create pool only when migrations are actually run
|
||||||
|
const pool = new Pool({
|
||||||
|
connectionString: getConnectionString(),
|
||||||
|
});
|
||||||
|
|
||||||
export async function runMigrations() {
|
|
||||||
const client = await pool.connect();
|
const client = await pool.connect();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@@ -340,12 +380,12 @@ export async function runMigrations() {
|
|||||||
throw error;
|
throw error;
|
||||||
} finally {
|
} finally {
|
||||||
client.release();
|
client.release();
|
||||||
|
await pool.end();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export { pool };
|
// Only run when executed directly (CLI mode)
|
||||||
|
// DO NOT export pool - runtime code must use src/db/pool.ts
|
||||||
// Run migrations if this file is executed directly
|
|
||||||
if (require.main === module) {
|
if (require.main === module) {
|
||||||
runMigrations()
|
runMigrations()
|
||||||
.then(() => process.exit(0))
|
.then(() => process.exit(0))
|
||||||
|
|||||||
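
The hunks in this commit repoint runtime imports at src/db/pool.ts, which does not appear in this section. A plausible minimal sketch of that module, assuming it mirrors the CANNAIQ_DB_* strategy used by migrate.ts above (the exact contents are not shown in the diff):

import { Pool } from 'pg';
import dotenv from 'dotenv';

dotenv.config();

// Shared long-lived pool for runtime code. migrate.ts deliberately builds
// its own short-lived pool instead of importing this one.
export const pool = new Pool({
  connectionString:
    process.env.CANNAIQ_DB_URL ||
    `postgresql://${process.env.CANNAIQ_DB_USER}:${process.env.CANNAIQ_DB_PASS}` +
      `@${process.env.CANNAIQ_DB_HOST}:${process.env.CANNAIQ_DB_PORT}/${process.env.CANNAIQ_DB_NAME}`,
});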
@@ -1,4 +1,4 @@
-import { pool } from './migrate';
+import { pool } from './pool';
 import * as fs from 'fs';
 import * as path from 'path';
 
@@ -1,4 +1,4 @@
-import { pool } from './migrate';
+import { pool } from './pool';
 import bcrypt from 'bcrypt';
 
 export async function seedDatabase() {
@@ -1,4 +1,4 @@
-import { pool } from './migrate';
+import { pool } from './pool';
 
 async function updateCategoriesHierarchy() {
   const client = await pool.connect();
474
backend/src/discovery/city-discovery.ts
Normal file
@@ -0,0 +1,474 @@
|
|||||||
|
/**
|
||||||
|
* Dutchie City Discovery Service
|
||||||
|
*
|
||||||
|
* Discovers cities from the Dutchie cities page.
|
||||||
|
* Each city can contain multiple dispensary locations.
|
||||||
|
*
|
||||||
|
* Source: https://dutchie.com/cities
|
||||||
|
*
|
||||||
|
* This module ONLY handles city discovery and upserts to dutchie_discovery_cities.
|
||||||
|
* It does NOT create any dispensary records.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { Pool } from 'pg';
|
||||||
|
import axios from 'axios';
|
||||||
|
import * as cheerio from 'cheerio';
|
||||||
|
import {
|
||||||
|
DiscoveryCity,
|
||||||
|
DiscoveryCityRow,
|
||||||
|
DutchieCityResponse,
|
||||||
|
CityDiscoveryResult,
|
||||||
|
mapCityRowToCity,
|
||||||
|
} from './types';
|
||||||
|
|
||||||
|
const CITIES_PAGE_URL = 'https://dutchie.com/cities';
|
||||||
|
const PLATFORM = 'dutchie';
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// CITY PAGE SCRAPING
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fetch and parse the Dutchie cities page.
|
||||||
|
* Returns a list of cities with their slugs and states.
|
||||||
|
*/
|
||||||
|
export async function fetchCitiesFromPage(): Promise<DutchieCityResponse[]> {
|
||||||
|
console.log(`[CityDiscovery] Fetching cities from ${CITIES_PAGE_URL}...`);
|
||||||
|
|
||||||
|
const response = await axios.get(CITIES_PAGE_URL, {
|
||||||
|
headers: {
|
||||||
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||||
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
||||||
|
'Accept-Language': 'en-US,en;q=0.9',
|
||||||
|
},
|
||||||
|
timeout: 30000,
|
||||||
|
});
|
||||||
|
|
||||||
|
const $ = cheerio.load(response.data);
|
||||||
|
const cities: DutchieCityResponse[] = [];
|
||||||
|
|
||||||
|
// Look for city links in various possible structures
|
||||||
|
// Structure 1: Links in /dispensaries/{state}/{city} format
|
||||||
|
$('a[href*="/dispensaries/"]').each((_, element) => {
|
||||||
|
const href = $(element).attr('href') || '';
|
||||||
|
const text = $(element).text().trim();
|
||||||
|
|
||||||
|
// Match /dispensaries/{state}/{city} pattern
|
||||||
|
const match = href.match(/\/dispensaries\/([a-z]{2,3})\/([a-z0-9-]+)/i);
|
||||||
|
if (match) {
|
||||||
|
const [, stateCode, citySlug] = match;
|
||||||
|
cities.push({
|
||||||
|
slug: citySlug,
|
||||||
|
name: text || citySlug.replace(/-/g, ' '),
|
||||||
|
stateCode: stateCode.toUpperCase(),
|
||||||
|
countryCode: stateCode.length === 2 ? 'US' : 'CA', // 2-letter = US state, 3+ = Canadian province
|
||||||
|
});
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Structure 2: Links in /city/{slug} format
|
||||||
|
$('a[href*="/city/"]').each((_, element) => {
|
||||||
|
const href = $(element).attr('href') || '';
|
||||||
|
const text = $(element).text().trim();
|
||||||
|
|
||||||
|
const match = href.match(/\/city\/([a-z0-9-]+)/i);
|
||||||
|
if (match) {
|
||||||
|
const [, citySlug] = match;
|
||||||
|
cities.push({
|
||||||
|
slug: citySlug,
|
||||||
|
name: text || citySlug.replace(/-/g, ' '),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Dedupe by slug
|
||||||
|
const uniqueCities = new Map<string, DutchieCityResponse>();
|
||||||
|
for (const city of cities) {
|
||||||
|
const key = `${city.countryCode || 'unknown'}-${city.stateCode || 'unknown'}-${city.slug}`;
|
||||||
|
if (!uniqueCities.has(key)) {
|
||||||
|
uniqueCities.set(key, city);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const result = Array.from(uniqueCities.values());
|
||||||
|
console.log(`[CityDiscovery] Found ${result.length} unique cities`);
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Alternative: Fetch cities from Dutchie's internal API/GraphQL
|
||||||
|
* This is a fallback if the HTML scraping doesn't work.
|
||||||
|
*/
|
||||||
|
export async function fetchCitiesFromApi(): Promise<DutchieCityResponse[]> {
|
||||||
|
console.log('[CityDiscovery] Attempting to fetch cities from API...');
|
||||||
|
|
||||||
|
// Try to find the cities endpoint - this is exploratory
|
||||||
|
// Dutchie may expose cities via their public API
|
||||||
|
|
||||||
|
// Common patterns to try:
|
||||||
|
const possibleEndpoints = [
|
||||||
|
'https://dutchie.com/api/cities',
|
||||||
|
'https://dutchie.com/api-3/cities',
|
||||||
|
'https://api.dutchie.com/v1/cities',
|
||||||
|
];
|
||||||
|
|
||||||
|
for (const endpoint of possibleEndpoints) {
|
||||||
|
try {
|
||||||
|
const response = await axios.get(endpoint, {
|
||||||
|
headers: {
|
||||||
|
'Accept': 'application/json',
|
||||||
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
|
||||||
|
},
|
||||||
|
timeout: 10000,
|
||||||
|
validateStatus: () => true,
|
||||||
|
});
|
||||||
|
|
||||||
|
if (response.status === 200 && Array.isArray(response.data)) {
|
||||||
|
console.log(`[CityDiscovery] Found cities at ${endpoint}`);
|
||||||
|
return response.data.map((city: any) => ({
|
||||||
|
slug: city.slug || city.city_slug,
|
||||||
|
name: city.name || city.city_name,
|
||||||
|
stateCode: city.stateCode || city.state_code || city.state,
|
||||||
|
countryCode: city.countryCode || city.country_code || city.country || 'US',
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// Continue to next endpoint
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log('[CityDiscovery] No API endpoint found, falling back to page scraping');
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// DATABASE OPERATIONS
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Upsert a city into dutchie_discovery_cities.
|
||||||
|
* Returns the city ID.
|
||||||
|
*/
|
||||||
|
export async function upsertCity(
|
||||||
|
pool: Pool,
|
||||||
|
city: DutchieCityResponse
|
||||||
|
): Promise<{ id: number; isNew: boolean }> {
|
||||||
|
const result = await pool.query(
|
||||||
|
`INSERT INTO dutchie_discovery_cities (
|
||||||
|
platform,
|
||||||
|
city_name,
|
||||||
|
city_slug,
|
||||||
|
state_code,
|
||||||
|
country_code,
|
||||||
|
updated_at
|
||||||
|
) VALUES ($1, $2, $3, $4, $5, NOW())
|
||||||
|
ON CONFLICT (platform, country_code, state_code, city_slug)
|
||||||
|
DO UPDATE SET
|
||||||
|
city_name = EXCLUDED.city_name,
|
||||||
|
updated_at = NOW()
|
||||||
|
RETURNING id, (xmax = 0) as is_new`,
|
||||||
|
[
|
||||||
|
PLATFORM,
|
||||||
|
city.name,
|
||||||
|
city.slug,
|
||||||
|
city.stateCode || null,
|
||||||
|
city.countryCode || 'US',
|
||||||
|
]
|
||||||
|
);
|
||||||
|
|
||||||
|
return {
|
||||||
|
id: result.rows[0].id,
|
||||||
|
isNew: result.rows[0].is_new,
|
||||||
|
};
|
||||||
|
}

/**
 * Mark a city as crawled and update location count.
 */
export async function markCityCrawled(
  pool: Pool,
  cityId: number,
  locationCount: number
): Promise<void> {
  await pool.query(
    `UPDATE dutchie_discovery_cities
     SET last_crawled_at = NOW(),
         location_count = $2,
         updated_at = NOW()
     WHERE id = $1`,
    [cityId, locationCount]
  );
}

/**
 * Get all cities that need to be crawled.
 */
export async function getCitiesToCrawl(
  pool: Pool,
  options: {
    stateCode?: string;
    countryCode?: string;
    limit?: number;
    onlyStale?: boolean;
    staleDays?: number;
  } = {}
): Promise<DiscoveryCity[]> {
  const {
    stateCode,
    countryCode,
    limit = 100,
    onlyStale = false,
    staleDays = 7,
  } = options;

  let query = `
    SELECT *
    FROM dutchie_discovery_cities
    WHERE crawl_enabled = TRUE
  `;
  const params: any[] = [];
  let paramIdx = 1;

  if (stateCode) {
    query += ` AND state_code = $${paramIdx}`;
    params.push(stateCode);
    paramIdx++;
  }

  if (countryCode) {
    query += ` AND country_code = $${paramIdx}`;
    params.push(countryCode);
    paramIdx++;
  }

  if (onlyStale) {
    // staleDays is interpolated directly into the SQL; this is safe only because it is
    // a typed number supplied by code, never raw user input.
    query += ` AND (last_crawled_at IS NULL OR last_crawled_at < NOW() - INTERVAL '${staleDays} days')`;
  }

  query += ` ORDER BY last_crawled_at ASC NULLS FIRST LIMIT $${paramIdx}`;
  params.push(limit);

  const result = await pool.query<DiscoveryCityRow>(query, params);
  return result.rows.map(mapCityRowToCity);
}
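
// Usage sketch (hypothetical wiring; `pool` is assumed to be an already-configured
// pg Pool and the option values are illustrative):
//
//   const due = await getCitiesToCrawl(pool, {
//     stateCode: 'AZ',
//     onlyStale: true, // skip cities crawled within the last `staleDays`
//     staleDays: 7,
//     limit: 25,
//   });
//   console.log(`${due.length} cities due for a crawl`);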

/**
 * Get a city by ID.
 */
export async function getCityById(
  pool: Pool,
  id: number
): Promise<DiscoveryCity | null> {
  const result = await pool.query<DiscoveryCityRow>(
    `SELECT * FROM dutchie_discovery_cities WHERE id = $1`,
    [id]
  );

  if (result.rows.length === 0) {
    return null;
  }

  return mapCityRowToCity(result.rows[0]);
}

/**
 * Get a city by slug.
 */
export async function getCityBySlug(
  pool: Pool,
  slug: string,
  stateCode?: string,
  countryCode: string = 'US'
): Promise<DiscoveryCity | null> {
  let query = `
    SELECT * FROM dutchie_discovery_cities
    WHERE platform = $1 AND city_slug = $2 AND country_code = $3
  `;
  const params: any[] = [PLATFORM, slug, countryCode];

  if (stateCode) {
    query += ` AND state_code = $4`;
    params.push(stateCode);
  }

  const result = await pool.query<DiscoveryCityRow>(query, params);

  if (result.rows.length === 0) {
    return null;
  }

  return mapCityRowToCity(result.rows[0]);
}

// ============================================================
// MAIN DISCOVERY FUNCTION
// ============================================================

/**
 * Run the full city discovery process.
 * Fetches cities from Dutchie and upserts them into the database.
 */
export async function discoverCities(
  pool: Pool,
  options: {
    dryRun?: boolean;
    verbose?: boolean;
  } = {}
): Promise<CityDiscoveryResult> {
  const startTime = Date.now();
  const { dryRun = false, verbose = false } = options;
  const errors: string[] = [];

  console.log('[CityDiscovery] Starting city discovery...');
  console.log(`[CityDiscovery] Mode: ${dryRun ? 'DRY RUN' : 'LIVE'}`);

  // Try API first, fall back to page scraping
  let cities = await fetchCitiesFromApi();
  if (cities.length === 0) {
    cities = await fetchCitiesFromPage();
  }

  if (cities.length === 0) {
    console.log('[CityDiscovery] No cities found');
    return {
      citiesFound: 0,
      citiesUpserted: 0,
      citiesSkipped: 0,
      errors: ['No cities found from page or API'],
      durationMs: Date.now() - startTime,
    };
  }

  let upserted = 0;
  let skipped = 0;

  for (const city of cities) {
    try {
      if (dryRun) {
        if (verbose) {
          console.log(`[CityDiscovery][DryRun] Would upsert: ${city.name} (${city.stateCode}, ${city.countryCode})`);
        }
        upserted++;
        continue;
      }

      const result = await upsertCity(pool, city);
      upserted++;

      if (verbose) {
        const action = result.isNew ? 'Created' : 'Updated';
        console.log(`[CityDiscovery] ${action}: ${city.name} (${city.stateCode}, ${city.countryCode}) -> ID ${result.id}`);
      }
    } catch (error: any) {
      errors.push(`City ${city.slug}: ${error.message}`);
      skipped++;
    }
  }

  const durationMs = Date.now() - startTime;

  console.log(`[CityDiscovery] Complete: ${upserted} upserted, ${skipped} skipped, ${errors.length} errors in ${durationMs}ms`);

  return {
    citiesFound: cities.length,
    citiesUpserted: upserted,
    citiesSkipped: skipped,
    errors,
    durationMs,
  };
}
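
// Dry-run sketch: a safe way to preview what discovery would write before going live.
// (Assumes `pool` points at the CannaiQ database; options are illustrative.)
//
//   const preview = await discoverCities(pool, { dryRun: true, verbose: true });
//   console.log(`Would upsert ${preview.citiesUpserted} of ${preview.citiesFound} cities`);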

// ============================================================
// MANUAL CITY SEEDING
// ============================================================

/**
 * Seed known cities manually.
 * Use this when the cities page doesn't expose all cities.
 */
export async function seedKnownCities(
  pool: Pool,
  cities: Array<{
    name: string;
    slug: string;
    stateCode: string;
    countryCode?: string;
  }>
): Promise<{ created: number; updated: number }> {
  let created = 0;
  let updated = 0;

  for (const city of cities) {
    const result = await upsertCity(pool, {
      name: city.name,
      slug: city.slug,
      stateCode: city.stateCode,
      countryCode: city.countryCode || 'US',
    });

    if (result.isNew) {
      created++;
    } else {
      updated++;
    }
  }

  return { created, updated };
}

/**
 * Pre-defined Arizona cities for seeding.
 */
export const ARIZONA_CITIES = [
  { name: 'Phoenix', slug: 'phoenix', stateCode: 'AZ' },
  { name: 'Tucson', slug: 'tucson', stateCode: 'AZ' },
  { name: 'Mesa', slug: 'mesa', stateCode: 'AZ' },
  { name: 'Chandler', slug: 'chandler', stateCode: 'AZ' },
  { name: 'Scottsdale', slug: 'scottsdale', stateCode: 'AZ' },
  { name: 'Glendale', slug: 'glendale', stateCode: 'AZ' },
  { name: 'Gilbert', slug: 'gilbert', stateCode: 'AZ' },
  { name: 'Tempe', slug: 'tempe', stateCode: 'AZ' },
  { name: 'Peoria', slug: 'peoria', stateCode: 'AZ' },
  { name: 'Surprise', slug: 'surprise', stateCode: 'AZ' },
  { name: 'Yuma', slug: 'yuma', stateCode: 'AZ' },
  { name: 'Avondale', slug: 'avondale', stateCode: 'AZ' },
  { name: 'Flagstaff', slug: 'flagstaff', stateCode: 'AZ' },
  { name: 'Goodyear', slug: 'goodyear', stateCode: 'AZ' },
  { name: 'Lake Havasu City', slug: 'lake-havasu-city', stateCode: 'AZ' },
  { name: 'Buckeye', slug: 'buckeye', stateCode: 'AZ' },
  { name: 'Casa Grande', slug: 'casa-grande', stateCode: 'AZ' },
  { name: 'Sierra Vista', slug: 'sierra-vista', stateCode: 'AZ' },
  { name: 'Maricopa', slug: 'maricopa', stateCode: 'AZ' },
  { name: 'Oro Valley', slug: 'oro-valley', stateCode: 'AZ' },
  { name: 'Prescott', slug: 'prescott', stateCode: 'AZ' },
  { name: 'Bullhead City', slug: 'bullhead-city', stateCode: 'AZ' },
  { name: 'Prescott Valley', slug: 'prescott-valley', stateCode: 'AZ' },
  { name: 'Apache Junction', slug: 'apache-junction', stateCode: 'AZ' },
  { name: 'Marana', slug: 'marana', stateCode: 'AZ' },
  { name: 'El Mirage', slug: 'el-mirage', stateCode: 'AZ' },
  { name: 'Kingman', slug: 'kingman', stateCode: 'AZ' },
  { name: 'Queen Creek', slug: 'queen-creek', stateCode: 'AZ' },
  { name: 'San Luis', slug: 'san-luis', stateCode: 'AZ' },
  { name: 'Sahuarita', slug: 'sahuarita', stateCode: 'AZ' },
  { name: 'Fountain Hills', slug: 'fountain-hills', stateCode: 'AZ' },
  { name: 'Nogales', slug: 'nogales', stateCode: 'AZ' },
  { name: 'Douglas', slug: 'douglas', stateCode: 'AZ' },
  { name: 'Eloy', slug: 'eloy', stateCode: 'AZ' },
  { name: 'Somerton', slug: 'somerton', stateCode: 'AZ' },
  { name: 'Paradise Valley', slug: 'paradise-valley', stateCode: 'AZ' },
  { name: 'Coolidge', slug: 'coolidge', stateCode: 'AZ' },
  { name: 'Cottonwood', slug: 'cottonwood', stateCode: 'AZ' },
  { name: 'Camp Verde', slug: 'camp-verde', stateCode: 'AZ' },
  { name: 'Show Low', slug: 'show-low', stateCode: 'AZ' },
  { name: 'Payson', slug: 'payson', stateCode: 'AZ' },
  { name: 'Sedona', slug: 'sedona', stateCode: 'AZ' },
  { name: 'Winslow', slug: 'winslow', stateCode: 'AZ' },
  { name: 'Globe', slug: 'globe', stateCode: 'AZ' },
  { name: 'Safford', slug: 'safford', stateCode: 'AZ' },
  { name: 'Bisbee', slug: 'bisbee', stateCode: 'AZ' },
  { name: 'Wickenburg', slug: 'wickenburg', stateCode: 'AZ' },
  { name: 'Page', slug: 'page', stateCode: 'AZ' },
  { name: 'Holbrook', slug: 'holbrook', stateCode: 'AZ' },
  { name: 'Willcox', slug: 'willcox', stateCode: 'AZ' },
];
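
// Seeding sketch: this list can be pushed in ahead of any live crawl so the crawler
// has work even when the Dutchie cities page exposes nothing. Hypothetical wiring:
//
//   const { created, updated } = await seedKnownCities(pool, ARIZONA_CITIES);
//   console.log(`AZ seed: ${created} created, ${updated} refreshed`);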

327
backend/src/discovery/discovery-crawler.ts
Normal file
@@ -0,0 +1,327 @@

/**
 * Dutchie Discovery Crawler
 *
 * Main orchestrator for the Dutchie store discovery pipeline.
 *
 * Flow:
 * 1. Discover cities from Dutchie (or use seeded cities)
 * 2. For each city, discover store locations
 * 3. Upsert all data to discovery tables
 * 4. Admin verifies locations manually
 * 5. Verified locations are promoted to canonical dispensaries
 *
 * This module does NOT create canonical dispensaries automatically.
 */

import { Pool } from 'pg';
import {
  FullDiscoveryResult,
  LocationDiscoveryResult,
  DiscoveryCity,
} from './types';
import {
  discoverCities,
  getCitiesToCrawl,
  getCityBySlug,
  seedKnownCities,
  ARIZONA_CITIES,
} from './city-discovery';
import {
  discoverLocationsForCity,
} from './location-discovery';

// ============================================================
// FULL DISCOVERY
// ============================================================

export interface DiscoveryCrawlerOptions {
  dryRun?: boolean;
  verbose?: boolean;
  stateCode?: string;
  countryCode?: string;
  cityLimit?: number;
  skipCityDiscovery?: boolean;
  onlyStale?: boolean;
  staleDays?: number;
}

/**
 * Run the full discovery pipeline:
 * 1. Discover/refresh cities
 * 2. For each city, discover locations
 */
export async function runFullDiscovery(
  pool: Pool,
  options: DiscoveryCrawlerOptions = {}
): Promise<FullDiscoveryResult> {
  const startTime = Date.now();
  const {
    dryRun = false,
    verbose = false,
    stateCode,
    countryCode = 'US',
    cityLimit = 50,
    skipCityDiscovery = false,
    onlyStale = true,
    staleDays = 7,
  } = options;

  console.log('='.repeat(60));
  console.log('DUTCHIE DISCOVERY CRAWLER');
  console.log('='.repeat(60));
  console.log(`Mode: ${dryRun ? 'DRY RUN' : 'LIVE'}`);
  if (stateCode) console.log(`State: ${stateCode}`);
  console.log(`Country: ${countryCode}`);
  console.log(`City limit: ${cityLimit}`);
  console.log('');

  // Step 1: Discover/refresh cities
  let cityResult = {
    citiesFound: 0,
    citiesUpserted: 0,
    citiesSkipped: 0,
    errors: [] as string[],
    durationMs: 0,
  };

  if (!skipCityDiscovery) {
    console.log('[Discovery] Step 1: Discovering cities...');
    cityResult = await discoverCities(pool, { dryRun, verbose });
  } else {
    console.log('[Discovery] Step 1: Skipping city discovery (using existing cities)');
  }

  // Step 2: Get cities to crawl
  console.log('[Discovery] Step 2: Getting cities to crawl...');
  const cities = await getCitiesToCrawl(pool, {
    stateCode,
    countryCode,
    limit: cityLimit,
    onlyStale,
    staleDays,
  });

  console.log(`[Discovery] Found ${cities.length} cities to crawl`);

  // Step 3: Discover locations for each city
  console.log('[Discovery] Step 3: Discovering locations...');
  const locationResults: LocationDiscoveryResult[] = [];
  let totalLocationsFound = 0;
  let totalLocationsUpserted = 0;

  for (let i = 0; i < cities.length; i++) {
    const city = cities[i];
    console.log(`\n[Discovery] City ${i + 1}/${cities.length}: ${city.cityName}, ${city.stateCode}`);

    try {
      const result = await discoverLocationsForCity(pool, city, { dryRun, verbose });
      locationResults.push(result);
      totalLocationsFound += result.locationsFound;
      totalLocationsUpserted += result.locationsUpserted;

      // Rate limiting between cities
      if (i < cities.length - 1) {
        await new Promise((r) => setTimeout(r, 2000));
      }
    } catch (error: any) {
      console.error(`[Discovery] Error crawling ${city.cityName}: ${error.message}`);
      locationResults.push({
        cityId: city.id,
        citySlug: city.citySlug,
        locationsFound: 0,
        locationsUpserted: 0,
        locationsNew: 0,
        locationsUpdated: 0,
        errors: [error.message],
        durationMs: 0,
      });
    }
  }

  const durationMs = Date.now() - startTime;

  // Summary
  console.log('\n' + '='.repeat(60));
  console.log('DISCOVERY COMPLETE');
  console.log('='.repeat(60));
  console.log(`Duration: ${(durationMs / 1000).toFixed(1)}s`);
  console.log('');
  console.log('Cities:');
  console.log(`  Discovered: ${cityResult.citiesFound}`);
  console.log(`  Upserted: ${cityResult.citiesUpserted}`);
  console.log(`  Crawled: ${cities.length}`);
  console.log('');
  console.log('Locations:');
  console.log(`  Found: ${totalLocationsFound}`);
  console.log(`  Upserted: ${totalLocationsUpserted}`);
  console.log('');

  const totalErrors = cityResult.errors.length +
    locationResults.reduce((sum, r) => sum + r.errors.length, 0);
  if (totalErrors > 0) {
    console.log(`Errors: ${totalErrors}`);
  }

  return {
    cities: cityResult,
    locations: locationResults,
    totalLocationsFound,
    totalLocationsUpserted,
    durationMs,
  };
}
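
// End-to-end sketch (illustrative options; `pool` is an already-configured pg Pool):
//
//   const run = await runFullDiscovery(pool, {
//     stateCode: 'AZ',
//     cityLimit: 10,
//     onlyStale: true, // re-crawl only cities stale for `staleDays` or never crawled
//     verbose: true,
//   });
//   console.log(`Found ${run.totalLocationsFound} locations in ${run.durationMs}ms`);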

// ============================================================
// SINGLE CITY DISCOVERY
// ============================================================

/**
 * Discover locations for a single city by slug.
 */
export async function discoverCity(
  pool: Pool,
  citySlug: string,
  options: {
    stateCode?: string;
    countryCode?: string;
    dryRun?: boolean;
    verbose?: boolean;
  } = {}
): Promise<LocationDiscoveryResult | null> {
  const { stateCode, countryCode = 'US', dryRun = false, verbose = false } = options;

  // Find the city
  let city = await getCityBySlug(pool, citySlug, stateCode, countryCode);

  if (!city) {
    // Try to create it if we have enough info
    if (stateCode) {
      console.log(`[Discovery] City ${citySlug} not found, creating...`);
      await seedKnownCities(pool, [{
        name: citySlug.replace(/-/g, ' ').replace(/\b\w/g, c => c.toUpperCase()),
        slug: citySlug,
        stateCode,
        countryCode,
      }]);
      city = await getCityBySlug(pool, citySlug, stateCode, countryCode);
    }

    if (!city) {
      console.log(`[Discovery] City ${citySlug} not found and could not be created`);
      return null;
    }
  }

  return await discoverLocationsForCity(pool, city, { dryRun, verbose });
}

// ============================================================
// STATE-WIDE DISCOVERY
// ============================================================

/**
 * Seed and discover all cities for a state.
 */
export async function discoverState(
  pool: Pool,
  stateCode: string,
  options: {
    dryRun?: boolean;
    verbose?: boolean;
    cityLimit?: number;
  } = {}
): Promise<FullDiscoveryResult> {
  const { dryRun = false, verbose = false, cityLimit = 100 } = options;

  console.log(`[Discovery] Discovering state: ${stateCode}`);

  // Seed known cities for this state
  if (stateCode === 'AZ') {
    console.log('[Discovery] Seeding Arizona cities...');
    const seeded = await seedKnownCities(pool, ARIZONA_CITIES);
    console.log(`[Discovery] Seeded ${seeded.created} new cities, ${seeded.updated} updated`);
  }

  // Run full discovery for this state
  return await runFullDiscovery(pool, {
    dryRun,
    verbose,
    stateCode,
    countryCode: 'US',
    cityLimit,
    skipCityDiscovery: true, // Use seeded cities
    onlyStale: false, // Crawl all
  });
}
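
// Note: only 'AZ' currently has a hard-coded seed list; any other stateCode relies on
// cities already present in dutchie_discovery_cities. Minimal invocation sketch:
//
//   await discoverState(pool, 'AZ', { dryRun: true, verbose: true });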

// ============================================================
// STATISTICS
// ============================================================

export interface DiscoveryStats {
  cities: {
    total: number;
    crawledLast24h: number;
    neverCrawled: number;
  };
  locations: {
    total: number;
    discovered: number;
    verified: number;
    rejected: number;
    merged: number;
    byState: Array<{ stateCode: string; count: number }>;
  };
}

/**
 * Get discovery statistics.
 */
export async function getDiscoveryStats(pool: Pool): Promise<DiscoveryStats> {
  const [citiesTotal, citiesRecent, citiesNever] = await Promise.all([
    pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_cities'),
    pool.query(`SELECT COUNT(*) as cnt FROM dutchie_discovery_cities WHERE last_crawled_at > NOW() - INTERVAL '24 hours'`),
    pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_cities WHERE last_crawled_at IS NULL'),
  ]);

  const [locsTotal, locsByStatus, locsByState] = await Promise.all([
    pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_locations WHERE active = TRUE'),
    pool.query(`
      SELECT status, COUNT(*) as cnt
      FROM dutchie_discovery_locations
      WHERE active = TRUE
      GROUP BY status
    `),
    pool.query(`
      SELECT state_code, COUNT(*) as cnt
      FROM dutchie_discovery_locations
      WHERE active = TRUE AND state_code IS NOT NULL
      GROUP BY state_code
      ORDER BY cnt DESC
    `),
  ]);

  const statusCounts = locsByStatus.rows.reduce((acc, row) => {
    acc[row.status] = parseInt(row.cnt, 10);
    return acc;
  }, {} as Record<string, number>);

  return {
    cities: {
      total: parseInt(citiesTotal.rows[0].cnt, 10),
      crawledLast24h: parseInt(citiesRecent.rows[0].cnt, 10),
      neverCrawled: parseInt(citiesNever.rows[0].cnt, 10),
    },
    locations: {
      total: parseInt(locsTotal.rows[0].cnt, 10),
      discovered: statusCounts.discovered || 0,
      verified: statusCounts.verified || 0,
      rejected: statusCounts.rejected || 0,
      merged: statusCounts.merged || 0,
      byState: locsByState.rows.map(row => ({
        stateCode: row.state_code,
        count: parseInt(row.cnt, 10),
      })),
    },
  };
}
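
// Read-only sketch: the stats query is cheap enough to poll from an admin dashboard
// or a cron health check (hypothetical caller):
//
//   const stats = await getDiscoveryStats(pool);
//   console.log(`verified ${stats.locations.verified}/${stats.locations.total} locations`);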

37
backend/src/discovery/index.ts
Normal file
@@ -0,0 +1,37 @@

/**
 * Dutchie Discovery Module
 *
 * Exports all discovery-related functionality for use in the main application.
 */

// Types
export * from './types';

// City Discovery
export {
  discoverCities,
  getCitiesToCrawl,
  getCityBySlug,
  seedKnownCities,
  ARIZONA_CITIES,
} from './city-discovery';

// Location Discovery
export {
  discoverLocationsForCity,
  fetchLocationsForCity,
  upsertLocation,
} from './location-discovery';

// Discovery Crawler (Orchestrator)
export {
  runFullDiscovery,
  discoverCity,
  discoverState,
  getDiscoveryStats,
  DiscoveryCrawlerOptions,
  DiscoveryStats,
} from './discovery-crawler';

// Routes
export { createDiscoveryRoutes } from './routes';
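
// Consumers are expected to import from this barrel rather than from the individual
// modules, e.g. (hypothetical caller):
//
//   import { runFullDiscovery, getDiscoveryStats } from './discovery';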

686
backend/src/discovery/location-discovery.ts
Normal file
@@ -0,0 +1,686 @@

/**
 * Dutchie Location Discovery Service
 *
 * Discovers store locations from Dutchie city pages.
 * Each city can contain multiple dispensary locations.
 *
 * This module:
 * 1. Fetches location listings for a given city
 * 2. Upserts locations into dutchie_discovery_locations
 * 3. Does NOT create any canonical dispensary records
 *
 * Locations remain in "discovered" status until manually verified.
 */

import { Pool } from 'pg';
import axios from 'axios';
import puppeteer from 'puppeteer-extra';
import type { Browser, Page, Protocol } from 'puppeteer';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import {
  DiscoveryCity,
  DiscoveryLocation,
  DiscoveryLocationRow,
  DutchieLocationResponse,
  LocationDiscoveryResult,
  DiscoveryStatus,
  mapLocationRowToLocation,
} from './types';

puppeteer.use(StealthPlugin());

const PLATFORM = 'dutchie';

// ============================================================
// GRAPHQL / API FETCHING
// ============================================================

interface SessionCredentials {
  cookies: string;
  userAgent: string;
  browser: Browser;
  page: Page;
}

/**
 * Create a browser session for fetching location data.
 */
async function createSession(citySlug: string): Promise<SessionCredentials> {
  const browser = await puppeteer.launch({
    headless: 'new',
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-blink-features=AutomationControlled',
    ],
  });

  const page = await browser.newPage();
  const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

  await page.setUserAgent(userAgent);
  await page.setViewport({ width: 1920, height: 1080 });
  await page.evaluateOnNewDocument(() => {
    Object.defineProperty(navigator, 'webdriver', { get: () => false });
    (window as any).chrome = { runtime: {} };
  });

  // Navigate to a dispensaries page to get cookies.
  // NOTE: the state path segment is hard-coded to 'az', so the page this session lands
  // on only matches Arizona city slugs; for other states the page-based extraction
  // approaches will typically find nothing and the GraphQL fallback does the work.
  const url = `https://dutchie.com/dispensaries/az/${citySlug}`;
  console.log(`[LocationDiscovery] Loading ${url} to establish session...`);

  try {
    await page.goto(url, {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });
    await new Promise((r) => setTimeout(r, 2000));
  } catch (error: any) {
    console.warn(`[LocationDiscovery] Navigation warning: ${error.message}`);
  }

  const cookies = await page.cookies();
  const cookieString = cookies.map((c: Protocol.Network.Cookie) => `${c.name}=${c.value}`).join('; ');

  return { cookies: cookieString, userAgent, browser, page };
}

async function closeSession(session: SessionCredentials): Promise<void> {
  await session.browser.close();
}

/**
 * Fetch locations for a city using Dutchie's internal search API.
 */
export async function fetchLocationsForCity(
  city: DiscoveryCity,
  options: {
    session?: SessionCredentials;
    verbose?: boolean;
  } = {}
): Promise<DutchieLocationResponse[]> {
  const { verbose = false } = options;
  let session = options.session;
  let shouldCloseSession = false;

  if (!session) {
    session = await createSession(city.citySlug);
    shouldCloseSession = true;
  }

  try {
    console.log(`[LocationDiscovery] Fetching locations for ${city.cityName}, ${city.stateCode}...`);

    // Try multiple approaches to get location data

    // Approach 1: Extract from page __NEXT_DATA__ or similar
    const locations = await extractLocationsFromPage(session.page, verbose);
    if (locations.length > 0) {
      console.log(`[LocationDiscovery] Found ${locations.length} locations from page data`);
      return locations;
    }

    // Approach 2: Try the geo-based GraphQL query
    const geoLocations = await fetchLocationsViaGraphQL(session, city, verbose);
    if (geoLocations.length > 0) {
      console.log(`[LocationDiscovery] Found ${geoLocations.length} locations from GraphQL`);
      return geoLocations;
    }

    // Approach 3: Scrape visible location cards
    const scrapedLocations = await scrapeLocationCards(session.page, verbose);
    if (scrapedLocations.length > 0) {
      console.log(`[LocationDiscovery] Found ${scrapedLocations.length} locations from scraping`);
      return scrapedLocations;
    }

    console.log(`[LocationDiscovery] No locations found for ${city.cityName}`);
    return [];
  } finally {
    if (shouldCloseSession) {
      await closeSession(session);
    }
  }
}
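
// Session-reuse sketch: when crawling many cities back-to-back, passing one session in
// avoids launching a fresh headless browser per city. createSession/closeSession are
// module-private, so this pattern only applies to callers inside this file:
//
//   const session = await createSession(cities[0].citySlug);
//   try {
//     for (const c of cities) {
//       await fetchLocationsForCity(c, { session, verbose: true });
//     }
//   } finally {
//     await closeSession(session);
//   }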

/**
 * Extract locations from page's embedded data (__NEXT_DATA__, window.*, etc.)
 */
async function extractLocationsFromPage(
  page: Page,
  verbose: boolean
): Promise<DutchieLocationResponse[]> {
  try {
    const data = await page.evaluate(() => {
      // Try __NEXT_DATA__
      const nextDataEl = document.querySelector('#__NEXT_DATA__');
      if (nextDataEl?.textContent) {
        try {
          const nextData = JSON.parse(nextDataEl.textContent);
          // Look for dispensaries in various paths
          const dispensaries =
            nextData?.props?.pageProps?.dispensaries ||
            nextData?.props?.pageProps?.initialDispensaries ||
            nextData?.props?.pageProps?.data?.dispensaries ||
            [];
          if (Array.isArray(dispensaries) && dispensaries.length > 0) {
            return { source: '__NEXT_DATA__', dispensaries };
          }
        } catch {
          // Ignore parse errors
        }
      }

      // Try window variables
      const win = window as any;
      if (win.__APOLLO_STATE__) {
        // Extract from Apollo cache
        const entries = Object.entries(win.__APOLLO_STATE__).filter(
          ([key]) => key.startsWith('Dispensary:')
        );
        if (entries.length > 0) {
          return { source: 'APOLLO_STATE', dispensaries: entries.map(([, v]) => v) };
        }
      }

      return { source: 'none', dispensaries: [] };
    });

    if (verbose) {
      console.log(`[LocationDiscovery] Page data source: ${data.source}, count: ${data.dispensaries.length}`);
    }

    return data.dispensaries.map((d: any) => normalizeLocationResponse(d));
  } catch (error: any) {
    if (verbose) {
      console.log(`[LocationDiscovery] Could not extract from page data: ${error.message}`);
    }
    return [];
  }
}

/**
 * Fetch locations via GraphQL geo-based query.
 */
async function fetchLocationsViaGraphQL(
  session: SessionCredentials,
  city: DiscoveryCity,
  verbose: boolean
): Promise<DutchieLocationResponse[]> {
  // Use a known center point for the city, defaulting to Phoenix coordinates
  const CITY_COORDS: Record<string, { lat: number; lng: number }> = {
    'phoenix': { lat: 33.4484, lng: -112.074 },
    'tucson': { lat: 32.2226, lng: -110.9747 },
    'scottsdale': { lat: 33.4942, lng: -111.9261 },
    'mesa': { lat: 33.4152, lng: -111.8315 },
    'tempe': { lat: 33.4255, lng: -111.94 },
    'flagstaff': { lat: 35.1983, lng: -111.6513 },
    // Add more as needed
  };

  const coords = CITY_COORDS[city.citySlug] || { lat: 33.4484, lng: -112.074 };

  const variables = {
    dispensariesFilter: {
      latitude: coords.lat,
      longitude: coords.lng,
      distance: 50, // miles
      state: city.stateCode,
      city: city.cityName,
    },
  };

  const hash = '0a5bfa6ca1d64ae47bcccb7c8077c87147cbc4e6982c17ceec97a2a4948b311b';

  try {
    const response = await axios.post(
      'https://dutchie.com/api-3/graphql',
      {
        operationName: 'ConsumerDispensaries',
        variables,
        extensions: {
          persistedQuery: { version: 1, sha256Hash: hash },
        },
      },
      {
        headers: {
          'content-type': 'application/json',
          'origin': 'https://dutchie.com',
          'referer': `https://dutchie.com/dispensaries/${city.stateCode?.toLowerCase()}/${city.citySlug}`,
          'user-agent': session.userAgent,
          'cookie': session.cookies,
        },
        timeout: 30000,
        validateStatus: () => true,
      }
    );

    if (response.status !== 200) {
      if (verbose) {
        console.log(`[LocationDiscovery] GraphQL returned ${response.status}`);
      }
      return [];
    }

    const dispensaries = response.data?.data?.consumerDispensaries || [];
    return dispensaries.map((d: any) => normalizeLocationResponse(d));
  } catch (error: any) {
    if (verbose) {
      console.log(`[LocationDiscovery] GraphQL error: ${error.message}`);
    }
    return [];
  }
}

/**
 * Scrape location cards from the visible page.
 */
async function scrapeLocationCards(
  page: Page,
  verbose: boolean
): Promise<DutchieLocationResponse[]> {
  try {
    const locations = await page.evaluate(() => {
      const cards: any[] = [];

      // Look for common dispensary card patterns
      const selectors = [
        '[data-testid="dispensary-card"]',
        '.dispensary-card',
        'a[href*="/dispensary/"]',
        '[class*="DispensaryCard"]',
      ];

      for (const selector of selectors) {
        const elements = document.querySelectorAll(selector);
        if (elements.length > 0) {
          elements.forEach((el) => {
            const link = el.querySelector('a')?.href || (el as HTMLAnchorElement).href || '';
            const name = el.querySelector('h2, h3, [class*="name"]')?.textContent?.trim() || '';
            const address = el.querySelector('[class*="address"], address')?.textContent?.trim() || '';

            // Extract slug from URL
            const slugMatch = link.match(/\/dispensary\/([^/?]+)/);
            const slug = slugMatch ? slugMatch[1] : '';

            if (slug && name) {
              cards.push({
                slug,
                name,
                address,
                menuUrl: link,
              });
            }
          });
          break; // Stop after first successful selector
        }
      }

      return cards;
    });

    return locations.map((d: any) => ({
      id: '',
      name: d.name,
      slug: d.slug,
      address: d.address,
      menuUrl: d.menuUrl,
    }));
  } catch (error: any) {
    if (verbose) {
      console.log(`[LocationDiscovery] Scraping error: ${error.message}`);
    }
    return [];
  }
}

/**
 * Normalize a raw location response to a consistent format.
 */
function normalizeLocationResponse(raw: any): DutchieLocationResponse {
  const slug = raw.slug || raw.cName || raw.urlSlug || '';
  const id = raw.id || raw._id || raw.dispensaryId || '';

  return {
    // Preserve raw data first, so the normalized fields below win on key collisions
    // (a trailing spread would clobber the fallbacks with undefined raw values)
    ...raw,
    id,
    name: raw.name || raw.dispensaryName || '',
    slug,
    address: raw.address || raw.fullAddress || '',
    address1: raw.address1 || raw.addressLine1 || raw.streetAddress || '',
    address2: raw.address2 || raw.addressLine2 || '',
    city: raw.city || '',
    state: raw.state || raw.stateCode || '',
    zip: raw.zip || raw.zipCode || raw.postalCode || '',
    country: raw.country || raw.countryCode || 'US',
    latitude: raw.latitude || raw.lat || raw.location?.latitude,
    longitude: raw.longitude || raw.lng || raw.location?.longitude,
    timezone: raw.timezone || raw.tz || '',
    menuUrl: raw.menuUrl || (slug ? `https://dutchie.com/dispensary/${slug}` : ''),
    retailType: raw.retailType || raw.type || '',
    offerPickup: raw.offerPickup ?? raw.storeSettings?.offerPickup ?? true,
    offerDelivery: raw.offerDelivery ?? raw.storeSettings?.offerDelivery ?? false,
    isRecreational: raw.isRecreational ?? raw.retailType?.includes('Recreational') ?? true,
    isMedical: raw.isMedical ?? raw.retailType?.includes('Medical') ?? true,
  };
}
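
// Shape sketch: the normalizer tolerates several raw payload variants. For example, a
// hypothetical Apollo-cache entry { _id: 'abc', dispensaryName: 'Example Dispensary',
// cName: 'example-dispensary' } normalizes to id 'abc', name 'Example Dispensary',
// slug 'example-dispensary', and a menuUrl derived from that slug.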

// ============================================================
// DATABASE OPERATIONS
// ============================================================

/**
 * Upsert a location into dutchie_discovery_locations.
 */
export async function upsertLocation(
  pool: Pool,
  location: DutchieLocationResponse,
  cityId: number | null
): Promise<{ id: number; isNew: boolean }> {
  const platformLocationId = location.id || location.slug;
  const menuUrl = location.menuUrl || `https://dutchie.com/dispensary/${location.slug}`;

  const result = await pool.query(
    `INSERT INTO dutchie_discovery_locations (
       platform,
       platform_location_id,
       platform_slug,
       platform_menu_url,
       name,
       raw_address,
       address_line1,
       address_line2,
       city,
       state_code,
       postal_code,
       country_code,
       latitude,
       longitude,
       timezone,
       discovery_city_id,
       metadata,
       offers_delivery,
       offers_pickup,
       is_recreational,
       is_medical,
       last_seen_at,
       updated_at
     ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, NOW(), NOW())
     ON CONFLICT (platform, platform_location_id)
     DO UPDATE SET
       name = EXCLUDED.name,
       platform_menu_url = EXCLUDED.platform_menu_url,
       raw_address = COALESCE(EXCLUDED.raw_address, dutchie_discovery_locations.raw_address),
       address_line1 = COALESCE(EXCLUDED.address_line1, dutchie_discovery_locations.address_line1),
       city = COALESCE(EXCLUDED.city, dutchie_discovery_locations.city),
       state_code = COALESCE(EXCLUDED.state_code, dutchie_discovery_locations.state_code),
       postal_code = COALESCE(EXCLUDED.postal_code, dutchie_discovery_locations.postal_code),
       latitude = COALESCE(EXCLUDED.latitude, dutchie_discovery_locations.latitude),
       longitude = COALESCE(EXCLUDED.longitude, dutchie_discovery_locations.longitude),
       timezone = COALESCE(EXCLUDED.timezone, dutchie_discovery_locations.timezone),
       metadata = EXCLUDED.metadata,
       offers_delivery = COALESCE(EXCLUDED.offers_delivery, dutchie_discovery_locations.offers_delivery),
       offers_pickup = COALESCE(EXCLUDED.offers_pickup, dutchie_discovery_locations.offers_pickup),
       is_recreational = COALESCE(EXCLUDED.is_recreational, dutchie_discovery_locations.is_recreational),
       is_medical = COALESCE(EXCLUDED.is_medical, dutchie_discovery_locations.is_medical),
       last_seen_at = NOW(),
       updated_at = NOW()
     RETURNING id, (xmax = 0) as is_new`,
    [
      PLATFORM,
      platformLocationId,
      location.slug,
      menuUrl,
      location.name,
      location.address || null,
      location.address1 || null,
      location.address2 || null,
      location.city || null,
      location.state || null,
      location.zip || null,
      location.country || 'US',
      location.latitude || null,
      location.longitude || null,
      location.timezone || null,
      cityId,
      JSON.stringify(location),
      location.offerDelivery ?? null,
      location.offerPickup ?? null,
      location.isRecreational ?? null,
      location.isMedical ?? null,
    ]
  );

  return {
    id: result.rows[0].id,
    isNew: result.rows[0].is_new,
  };
}
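
// Note on the conflict key: it is (platform, platform_location_id), and
// platformLocationId falls back to the slug when Dutchie returns no id, so the same
// store scraped once with an id and once without can yield two rows. The COALESCEs in
// the UPDATE branch deliberately keep previously-known values when a re-scrape returns
// sparser data.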

/**
 * Get locations by status.
 */
export async function getLocationsByStatus(
  pool: Pool,
  status: DiscoveryStatus,
  options: {
    stateCode?: string;
    countryCode?: string;
    limit?: number;
    offset?: number;
  } = {}
): Promise<DiscoveryLocation[]> {
  const { stateCode, countryCode, limit = 100, offset = 0 } = options;

  let query = `
    SELECT * FROM dutchie_discovery_locations
    WHERE status = $1 AND active = TRUE
  `;
  const params: any[] = [status];
  let paramIdx = 2;

  if (stateCode) {
    query += ` AND state_code = $${paramIdx}`;
    params.push(stateCode);
    paramIdx++;
  }

  if (countryCode) {
    query += ` AND country_code = $${paramIdx}`;
    params.push(countryCode);
    paramIdx++;
  }

  query += ` ORDER BY first_seen_at DESC LIMIT $${paramIdx} OFFSET $${paramIdx + 1}`;
  params.push(limit, offset);

  const result = await pool.query<DiscoveryLocationRow>(query, params);
  return result.rows.map(mapLocationRowToLocation);
}

/**
 * Get a location by ID.
 */
export async function getLocationById(
  pool: Pool,
  id: number
): Promise<DiscoveryLocation | null> {
  const result = await pool.query<DiscoveryLocationRow>(
    `SELECT * FROM dutchie_discovery_locations WHERE id = $1`,
    [id]
  );

  if (result.rows.length === 0) {
    return null;
  }

  return mapLocationRowToLocation(result.rows[0]);
}

/**
 * Update location status.
 */
export async function updateLocationStatus(
  pool: Pool,
  locationId: number,
  status: DiscoveryStatus,
  options: {
    dispensaryId?: number;
    verifiedBy?: string;
    notes?: string;
  } = {}
): Promise<void> {
  const { dispensaryId, verifiedBy, notes } = options;

  await pool.query(
    `UPDATE dutchie_discovery_locations
     SET status = $2,
         dispensary_id = COALESCE($3, dispensary_id),
         verified_at = CASE WHEN $2 IN ('verified', 'merged') THEN NOW() ELSE verified_at END,
         verified_by = COALESCE($4, verified_by),
         notes = COALESCE($5, notes),
         updated_at = NOW()
     WHERE id = $1`,
    [locationId, status, dispensaryId || null, verifiedBy || null, notes || null]
  );
}

/**
 * Search locations by name or address.
 */
export async function searchLocations(
  pool: Pool,
  query: string,
  options: {
    status?: DiscoveryStatus;
    stateCode?: string;
    limit?: number;
  } = {}
): Promise<DiscoveryLocation[]> {
  const { status, stateCode, limit = 50 } = options;
  const searchPattern = `%${query}%`;

  let sql = `
    SELECT * FROM dutchie_discovery_locations
    WHERE active = TRUE
      AND (name ILIKE $1 OR city ILIKE $1 OR raw_address ILIKE $1 OR platform_slug ILIKE $1)
  `;
  const params: any[] = [searchPattern];
  let paramIdx = 2;

  if (status) {
    sql += ` AND status = $${paramIdx}`;
    params.push(status);
    paramIdx++;
  }

  if (stateCode) {
    sql += ` AND state_code = $${paramIdx}`;
    params.push(stateCode);
    paramIdx++;
  }

  sql += ` ORDER BY name LIMIT $${paramIdx}`;
  params.push(limit);

  const result = await pool.query<DiscoveryLocationRow>(sql, params);
  return result.rows.map(mapLocationRowToLocation);
}

// ============================================================
// MAIN DISCOVERY FUNCTION
// ============================================================

/**
 * Discover locations for a specific city.
 */
export async function discoverLocationsForCity(
  pool: Pool,
  city: DiscoveryCity,
  options: {
    dryRun?: boolean;
    verbose?: boolean;
  } = {}
): Promise<LocationDiscoveryResult> {
  const startTime = Date.now();
  const { dryRun = false, verbose = false } = options;
  const errors: string[] = [];

  console.log(`[LocationDiscovery] Discovering locations for ${city.cityName}, ${city.stateCode}...`);
  console.log(`[LocationDiscovery] Mode: ${dryRun ? 'DRY RUN' : 'LIVE'}`);

  const locations = await fetchLocationsForCity(city, { verbose });

  if (locations.length === 0) {
    console.log(`[LocationDiscovery] No locations found for ${city.cityName}`);
    return {
      cityId: city.id,
      citySlug: city.citySlug,
      locationsFound: 0,
      locationsUpserted: 0,
      locationsNew: 0,
      locationsUpdated: 0,
      errors: [],
      durationMs: Date.now() - startTime,
    };
  }

  let newCount = 0;
  let updatedCount = 0;

  for (const location of locations) {
    try {
      if (dryRun) {
        if (verbose) {
          console.log(`[LocationDiscovery][DryRun] Would upsert: ${location.name} (${location.slug})`);
        }
        newCount++;
        continue;
      }

      const result = await upsertLocation(pool, location, city.id);

      if (result.isNew) {
        newCount++;
      } else {
        updatedCount++;
      }

      if (verbose) {
        const action = result.isNew ? 'Created' : 'Updated';
        console.log(`[LocationDiscovery] ${action}: ${location.name} -> ID ${result.id}`);
      }
    } catch (error: any) {
      errors.push(`Location ${location.slug}: ${error.message}`);
    }
  }

  // Update city crawl status
  if (!dryRun) {
    await pool.query(
      `UPDATE dutchie_discovery_cities
       SET last_crawled_at = NOW(),
           location_count = $2,
           updated_at = NOW()
       WHERE id = $1`,
      [city.id, locations.length]
    );
  }

  const durationMs = Date.now() - startTime;

  console.log(`[LocationDiscovery] Complete for ${city.cityName}: ${newCount} new, ${updatedCount} updated, ${errors.length} errors in ${durationMs}ms`);

  return {
    cityId: city.id,
    citySlug: city.citySlug,
    locationsFound: locations.length,
    locationsUpserted: newCount + updatedCount,
    locationsNew: newCount,
    locationsUpdated: updatedCount,
    errors,
    durationMs,
  };
}
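
// Per-city sketch (assumes the city row already exists, e.g. via seedKnownCities, and
// that the caller imports getCityBySlug from './city-discovery'):
//
//   const phoenix = await getCityBySlug(pool, 'phoenix', 'AZ');
//   if (phoenix) {
//     const r = await discoverLocationsForCity(pool, phoenix, { verbose: true });
//     console.log(`${r.locationsNew} new / ${r.locationsUpdated} updated`);
//   }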

840
backend/src/discovery/routes.ts
Normal file
@@ -0,0 +1,840 @@

/**
 * Dutchie Discovery API Routes
 *
 * Express routes for the Dutchie store discovery pipeline.
 * Provides endpoints for discovering, listing, and verifying locations.
 */
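
// Mounting sketch (hypothetical app wiring; the actual mount point may differ, but the
// route comments below assume an '/api/discovery' prefix):
//
//   import express from 'express';
//   import { Pool } from 'pg';
//   import { createDiscoveryRoutes } from './discovery/routes';
//
//   const app = express();
//   const pool = new Pool(); // configured from the CANNAIQ_DB_* environment variables
//   app.use('/api/discovery', createDiscoveryRoutes(pool));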

import { Router, Request, Response } from 'express';
import { Pool } from 'pg';
import {
  runFullDiscovery,
  discoverCity,
  discoverState,
  getDiscoveryStats,
} from './discovery-crawler';
import {
  discoverCities,
  getCitiesToCrawl,
  getCityBySlug,
  seedKnownCities,
  ARIZONA_CITIES,
} from './city-discovery';
import {
  DiscoveryLocation,
  DiscoveryCity,
  DiscoveryStatus,
  mapLocationRowToLocation,
  mapCityRowToCity,
} from './types';

export function createDiscoveryRoutes(pool: Pool): Router {
  const router = Router();

  // ============================================================
  // DISCOVERY LOCATIONS
  // ============================================================

  /**
   * GET /api/discovery/locations
   * List discovered locations with filtering
   */
  router.get('/locations', async (req: Request, res: Response) => {
    try {
      const {
        status,
        stateCode,
        countryCode,
        city,
        platform = 'dutchie',
        search,
        hasDispensary,
        limit = '50',
        offset = '0',
      } = req.query;

      let whereClause = 'WHERE platform = $1 AND active = TRUE';
      const params: any[] = [platform];
      let paramIndex = 2;

      if (status) {
        whereClause += ` AND status = $${paramIndex}`;
        params.push(status);
        paramIndex++;
      }

      if (stateCode) {
        whereClause += ` AND state_code = $${paramIndex}`;
        params.push(stateCode);
        paramIndex++;
      }

      if (countryCode) {
        whereClause += ` AND country_code = $${paramIndex}`;
        params.push(countryCode);
        paramIndex++;
      }

      if (city) {
        whereClause += ` AND city ILIKE $${paramIndex}`;
        params.push(`%${city}%`);
        paramIndex++;
      }

      if (search) {
        whereClause += ` AND (name ILIKE $${paramIndex} OR platform_slug ILIKE $${paramIndex})`;
        params.push(`%${search}%`);
        paramIndex++;
      }

      if (hasDispensary === 'true') {
        whereClause += ' AND dispensary_id IS NOT NULL';
      } else if (hasDispensary === 'false') {
        whereClause += ' AND dispensary_id IS NULL';
      }

      params.push(parseInt(limit as string, 10), parseInt(offset as string, 10));

      const { rows } = await pool.query(
        `
        SELECT
          dl.*,
          d.name as dispensary_name,
          dc.city_name as discovery_city_name
        FROM dutchie_discovery_locations dl
        LEFT JOIN dispensaries d ON dl.dispensary_id = d.id
        LEFT JOIN dutchie_discovery_cities dc ON dl.discovery_city_id = dc.id
        ${whereClause}
        ORDER BY dl.first_seen_at DESC
        LIMIT $${paramIndex} OFFSET $${paramIndex + 1}
        `,
        params
      );

      const { rows: countRows } = await pool.query(
        `SELECT COUNT(*) as total FROM dutchie_discovery_locations dl ${whereClause}`,
        params.slice(0, -2)
      );

      const locations = rows.map((row: any) => ({
        ...mapLocationRowToLocation(row),
        dispensaryName: row.dispensary_name,
        discoveryCityName: row.discovery_city_name,
      }));

      res.json({
        locations,
        total: parseInt(countRows[0]?.total || '0', 10),
        limit: parseInt(limit as string, 10),
        offset: parseInt(offset as string, 10),
      });
    } catch (error: any) {
      res.status(500).json({ error: error.message });
    }
  });

  /**
   * GET /api/discovery/locations/pending
   * Get locations awaiting verification
   *
   * NOTE: registered before '/locations/:id'; Express matches routes in
   * registration order, so 'pending' would otherwise be swallowed by ':id'.
   */
  router.get('/locations/pending', async (req: Request, res: Response) => {
    try {
      const { stateCode, countryCode, limit = '100' } = req.query;

      let whereClause = `WHERE status = 'discovered' AND active = TRUE`;
      const params: any[] = [];
      let paramIndex = 1;

      if (stateCode) {
        whereClause += ` AND state_code = $${paramIndex}`;
        params.push(stateCode);
        paramIndex++;
      }

      if (countryCode) {
        whereClause += ` AND country_code = $${paramIndex}`;
        params.push(countryCode);
        paramIndex++;
      }

      params.push(parseInt(limit as string, 10));

      const { rows } = await pool.query(
        `
        SELECT * FROM dutchie_discovery_locations
        ${whereClause}
        ORDER BY state_code, city, name
        LIMIT $${paramIndex}
        `,
        params
      );

      res.json({
        locations: rows.map(mapLocationRowToLocation),
        total: rows.length,
      });
    } catch (error: any) {
      res.status(500).json({ error: error.message });
    }
  });

  /**
   * GET /api/discovery/locations/:id
   * Get a single discovery location
   */
  router.get('/locations/:id', async (req: Request, res: Response) => {
    try {
      const { id } = req.params;

      const { rows } = await pool.query(
        `
        SELECT
          dl.*,
          d.name as dispensary_name,
          d.menu_url as dispensary_menu_url,
          dc.city_name as discovery_city_name
        FROM dutchie_discovery_locations dl
        LEFT JOIN dispensaries d ON dl.dispensary_id = d.id
        LEFT JOIN dutchie_discovery_cities dc ON dl.discovery_city_id = dc.id
        WHERE dl.id = $1
        `,
        [parseInt(id, 10)]
      );

      if (rows.length === 0) {
        return res.status(404).json({ error: 'Location not found' });
      }

      res.json({
        ...mapLocationRowToLocation(rows[0]),
        dispensaryName: rows[0].dispensary_name,
        dispensaryMenuUrl: rows[0].dispensary_menu_url,
        discoveryCityName: rows[0].discovery_city_name,
      });
    } catch (error: any) {
      res.status(500).json({ error: error.message });
    }
  });
|
||||||
|
|
||||||
|
// ============================================================
// DISCOVERY CITIES
// ============================================================

/**
 * GET /api/discovery/cities
 * List discovery cities
 */
router.get('/cities', async (req: Request, res: Response) => {
  try {
    const {
      stateCode,
      countryCode,
      crawlEnabled,
      platform = 'dutchie',
      limit = '100',
      offset = '0',
    } = req.query;

    let whereClause = 'WHERE platform = $1';
    const params: any[] = [platform];
    let paramIndex = 2;

    if (stateCode) {
      whereClause += ` AND state_code = $${paramIndex}`;
      params.push(stateCode);
      paramIndex++;
    }

    if (countryCode) {
      whereClause += ` AND country_code = $${paramIndex}`;
      params.push(countryCode);
      paramIndex++;
    }

    if (crawlEnabled === 'true') {
      whereClause += ' AND crawl_enabled = TRUE';
    } else if (crawlEnabled === 'false') {
      whereClause += ' AND crawl_enabled = FALSE';
    }

    params.push(parseInt(limit as string, 10), parseInt(offset as string, 10));

    const { rows } = await pool.query(
      `
      SELECT
        dc.*,
        (SELECT COUNT(*) FROM dutchie_discovery_locations dl WHERE dl.discovery_city_id = dc.id) as actual_location_count
      FROM dutchie_discovery_cities dc
      ${whereClause}
      ORDER BY dc.country_code, dc.state_code, dc.city_name
      LIMIT $${paramIndex} OFFSET $${paramIndex + 1}
      `,
      params
    );

    const { rows: countRows } = await pool.query(
      `SELECT COUNT(*) as total FROM dutchie_discovery_cities dc ${whereClause}`,
      params.slice(0, -2)
    );

    const cities = rows.map((row: any) => ({
      ...mapCityRowToCity(row),
      actualLocationCount: parseInt(row.actual_location_count || '0', 10),
    }));

    res.json({
      cities,
      total: parseInt(countRows[0]?.total || '0', 10),
      limit: parseInt(limit as string, 10),
      offset: parseInt(offset as string, 10),
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});

// ============================================================
// STATISTICS
// ============================================================

/**
 * GET /api/discovery/stats
 * Get discovery statistics
 */
router.get('/stats', async (_req: Request, res: Response) => {
  try {
    const stats = await getDiscoveryStats(pool);
    res.json(stats);
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});

// ============================================================
// VERIFICATION ACTIONS
// ============================================================

/**
 * POST /api/discovery/locations/:id/verify
 * Verify a discovered location and create a new canonical dispensary
 */
router.post('/locations/:id/verify', async (req: Request, res: Response) => {
  try {
    const { id } = req.params;
    const { verifiedBy = 'admin' } = req.body;

    // Get the discovery location
    const { rows: locRows } = await pool.query(
      `SELECT * FROM dutchie_discovery_locations WHERE id = $1`,
      [parseInt(id, 10)]
    );

    if (locRows.length === 0) {
      return res.status(404).json({ error: 'Location not found' });
    }

    const location = locRows[0];

    if (location.status !== 'discovered') {
      return res.status(400).json({
        error: `Location already has status: ${location.status}`,
      });
    }

    // Create the canonical dispensary
    const { rows: dispRows } = await pool.query(
      `
      INSERT INTO dispensaries (
        name,
        slug,
        address,
        city,
        state,
        zip,
        latitude,
        longitude,
        timezone,
        menu_type,
        menu_url,
        platform_dispensary_id,
        active,
        created_at,
        updated_at
      ) VALUES (
        $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, TRUE, NOW(), NOW()
      )
      RETURNING id
      `,
      [
        location.name,
        location.platform_slug,
        location.address_line1,
        location.city,
        location.state_code,
        location.postal_code,
        location.latitude,
        location.longitude,
        location.timezone,
        location.platform,
        location.platform_menu_url,
        location.platform_location_id,
      ]
    );

    const dispensaryId = dispRows[0].id;

    // Update the discovery location
    await pool.query(
      `
      UPDATE dutchie_discovery_locations
      SET status = 'verified',
          dispensary_id = $1,
          verified_at = NOW(),
          verified_by = $2,
          updated_at = NOW()
      WHERE id = $3
      `,
      [dispensaryId, verifiedBy, id]
    );

    res.json({
      success: true,
      action: 'created',
      discoveryId: parseInt(id, 10),
      dispensaryId,
      message: `Created new dispensary (ID: ${dispensaryId})`,
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});

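For reference, a minimal client-side sketch of driving the verify endpoint above. The localhost:3010 base URL matches the backend's example env; the helper name is hypothetical and not part of this commit.

// Hypothetical helper: promote discovery location `id` to a canonical dispensary.
async function verifyDiscoveryLocation(id: number): Promise<void> {
  const res = await fetch(`http://localhost:3010/api/discovery/locations/${id}/verify`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ verifiedBy: 'admin' }),
  });
  if (!res.ok) throw new Error(`verify failed: ${res.status}`);
  const body = await res.json();
  console.log(body.message); // e.g. "Created new dispensary (ID: 42)"
}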
/**
 * POST /api/discovery/locations/:id/link
 * Link a discovered location to an existing dispensary
 */
router.post('/locations/:id/link', async (req: Request, res: Response) => {
  try {
    const { id } = req.params;
    const { dispensaryId, verifiedBy = 'admin' } = req.body;

    if (!dispensaryId) {
      return res.status(400).json({ error: 'dispensaryId is required' });
    }

    // Verify dispensary exists
    const { rows: dispRows } = await pool.query(
      `SELECT id, name FROM dispensaries WHERE id = $1`,
      [dispensaryId]
    );

    if (dispRows.length === 0) {
      return res.status(404).json({ error: 'Dispensary not found' });
    }

    // Get the discovery location
    const { rows: locRows } = await pool.query(
      `SELECT * FROM dutchie_discovery_locations WHERE id = $1`,
      [parseInt(id, 10)]
    );

    if (locRows.length === 0) {
      return res.status(404).json({ error: 'Location not found' });
    }

    const location = locRows[0];

    if (location.status !== 'discovered') {
      return res.status(400).json({
        error: `Location already has status: ${location.status}`,
      });
    }

    // Update dispensary with platform info if missing
    await pool.query(
      `
      UPDATE dispensaries
      SET platform_dispensary_id = COALESCE(platform_dispensary_id, $1),
          menu_url = COALESCE(menu_url, $2),
          menu_type = COALESCE(menu_type, $3),
          updated_at = NOW()
      WHERE id = $4
      `,
      [
        location.platform_location_id,
        location.platform_menu_url,
        location.platform,
        dispensaryId,
      ]
    );

    // Update the discovery location
    await pool.query(
      `
      UPDATE dutchie_discovery_locations
      SET status = 'merged',
          dispensary_id = $1,
          verified_at = NOW(),
          verified_by = $2,
          updated_at = NOW()
      WHERE id = $3
      `,
      [dispensaryId, verifiedBy, id]
    );

    res.json({
      success: true,
      action: 'linked',
      discoveryId: parseInt(id, 10),
      dispensaryId,
      dispensaryName: dispRows[0].name,
      message: `Linked to existing dispensary: ${dispRows[0].name}`,
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});

/**
 * POST /api/discovery/locations/:id/reject
 * Reject a discovered location
 */
router.post('/locations/:id/reject', async (req: Request, res: Response) => {
  try {
    const { id } = req.params;
    const { reason, verifiedBy = 'admin' } = req.body;

    const { rows } = await pool.query(
      `SELECT status FROM dutchie_discovery_locations WHERE id = $1`,
      [parseInt(id, 10)]
    );

    if (rows.length === 0) {
      return res.status(404).json({ error: 'Location not found' });
    }

    if (rows[0].status !== 'discovered') {
      return res.status(400).json({
        error: `Location already has status: ${rows[0].status}`,
      });
    }

    await pool.query(
      `
      UPDATE dutchie_discovery_locations
      SET status = 'rejected',
          verified_at = NOW(),
          verified_by = $1,
          notes = $2,
          updated_at = NOW()
      WHERE id = $3
      `,
      [verifiedBy, reason || 'Rejected by admin', id]
    );

    res.json({
      success: true,
      action: 'rejected',
      discoveryId: parseInt(id, 10),
      message: 'Location rejected',
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});

/**
 * POST /api/discovery/locations/:id/unreject
 * Restore a rejected location back to discovered status
 */
router.post('/locations/:id/unreject', async (req: Request, res: Response) => {
  try {
    const { id } = req.params;

    const { rows } = await pool.query(
      `SELECT status FROM dutchie_discovery_locations WHERE id = $1`,
      [parseInt(id, 10)]
    );

    if (rows.length === 0) {
      return res.status(404).json({ error: 'Location not found' });
    }

    if (rows[0].status !== 'rejected') {
      return res.status(400).json({
        error: `Location is not rejected. Current status: ${rows[0].status}`,
      });
    }

    await pool.query(
      `
      UPDATE dutchie_discovery_locations
      SET status = 'discovered',
          verified_at = NULL,
          verified_by = NULL,
          updated_at = NOW()
      WHERE id = $1
      `,
      [id]
    );

    res.json({
      success: true,
      action: 'unrejected',
      discoveryId: parseInt(id, 10),
      message: 'Location restored to discovered status',
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});

// ============================================================
// DISCOVERY ADMIN ACTIONS
// ============================================================

/**
 * POST /api/discovery/admin/discover-state
 * Run discovery for an entire state
 */
router.post('/admin/discover-state', async (req: Request, res: Response) => {
  try {
    const { stateCode, dryRun = false, cityLimit = 100 } = req.body;

    if (!stateCode) {
      return res.status(400).json({ error: 'stateCode is required' });
    }

    console.log(`[Discovery API] Starting state discovery for ${stateCode}`);
    const result = await discoverState(pool, stateCode, {
      dryRun,
      cityLimit,
      verbose: true,
    });

    res.json({
      success: true,
      stateCode,
      result,
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});

/**
 * POST /api/discovery/admin/discover-city
 * Run discovery for a single city
 */
router.post('/admin/discover-city', async (req: Request, res: Response) => {
  try {
    const { citySlug, stateCode, countryCode = 'US', dryRun = false } = req.body;

    if (!citySlug) {
      return res.status(400).json({ error: 'citySlug is required' });
    }

    console.log(`[Discovery API] Starting city discovery for ${citySlug}`);
    const result = await discoverCity(pool, citySlug, {
      stateCode,
      countryCode,
      dryRun,
      verbose: true,
    });

    if (!result) {
      return res.status(404).json({ error: `City not found: ${citySlug}` });
    }

    res.json({
      success: true,
      citySlug,
      result,
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});

/**
 * POST /api/discovery/admin/run-full
 * Run full discovery pipeline
 */
router.post('/admin/run-full', async (req: Request, res: Response) => {
  try {
    const {
      stateCode,
      countryCode = 'US',
      cityLimit = 50,
      skipCityDiscovery = false,
      onlyStale = true,
      staleDays = 7,
      dryRun = false,
    } = req.body;

    console.log(`[Discovery API] Starting full discovery`);
    const result = await runFullDiscovery(pool, {
      stateCode,
      countryCode,
      cityLimit,
      skipCityDiscovery,
      onlyStale,
      staleDays,
      dryRun,
      verbose: true,
    });

    res.json({
      success: true,
      result,
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});

/**
 * POST /api/discovery/admin/seed-cities
 * Seed known cities for a state
 */
router.post('/admin/seed-cities', async (req: Request, res: Response) => {
  try {
    const { stateCode } = req.body;

    if (!stateCode) {
      return res.status(400).json({ error: 'stateCode is required' });
    }

    let cities: any[] = [];
    if (stateCode === 'AZ') {
      cities = ARIZONA_CITIES;
    } else {
      return res.status(400).json({
        error: `No predefined cities for state: ${stateCode}. Add cities to city-discovery.ts`,
      });
    }

    const result = await seedKnownCities(pool, cities);

    res.json({
      success: true,
      stateCode,
      ...result,
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});

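A minimal sketch of kicking off a state-wide run against the admin endpoint above. The base URL is an assumption; starting with dryRun is a suggested workflow, not something this commit enforces.

// Hypothetical helper: dry-run discovery for one state and inspect the result.
async function runStateDiscovery(stateCode: string): Promise<void> {
  const res = await fetch('http://localhost:3010/api/discovery/admin/discover-state', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ stateCode, dryRun: true, cityLimit: 10 }),
  });
  if (!res.ok) throw new Error(`discover-state failed: ${res.status}`);
  const body = await res.json();
  console.log(body.result); // shape is whatever discoverState() returns
}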
/**
 * GET /api/discovery/admin/match-candidates/:id
 * Find potential dispensary matches for a discovery location
 */
router.get('/admin/match-candidates/:id', async (req: Request, res: Response) => {
  try {
    const { id } = req.params;

    // Get the discovery location
    const { rows: locRows } = await pool.query(
      `SELECT * FROM dutchie_discovery_locations WHERE id = $1`,
      [parseInt(id, 10)]
    );

    if (locRows.length === 0) {
      return res.status(404).json({ error: 'Location not found' });
    }

    const location = locRows[0];

    // Find potential matches by name similarity and location
    const { rows: candidates } = await pool.query(
      `
      SELECT
        d.id,
        d.name,
        d.city,
        d.state,
        d.address,
        d.menu_type,
        d.platform_dispensary_id,
        d.menu_url,
        d.latitude,
        d.longitude,
        CASE
          WHEN d.name ILIKE $1 THEN 'exact_name'
          WHEN d.name ILIKE $2 THEN 'partial_name'
          WHEN d.city ILIKE $3 AND d.state = $4 THEN 'same_city'
          ELSE 'location_match'
        END as match_type,
        -- Distance in miles if coordinates available
        CASE
          WHEN d.latitude IS NOT NULL AND d.longitude IS NOT NULL
            AND $5::float IS NOT NULL AND $6::float IS NOT NULL
          THEN (3959 * acos(
            cos(radians($5::float)) * cos(radians(d.latitude)) *
            cos(radians(d.longitude) - radians($6::float)) +
            sin(radians($5::float)) * sin(radians(d.latitude))
          ))
          ELSE NULL
        END as distance_miles
      FROM dispensaries d
      WHERE d.state = $4
        AND (
          d.name ILIKE $1
          OR d.name ILIKE $2
          OR d.city ILIKE $3
          OR (
            d.latitude IS NOT NULL
            AND d.longitude IS NOT NULL
            AND $5::float IS NOT NULL
            AND $6::float IS NOT NULL
            AND (3959 * acos(
              cos(radians($5::float)) * cos(radians(d.latitude)) *
              cos(radians(d.longitude) - radians($6::float)) +
              sin(radians($5::float)) * sin(radians(d.latitude))
            )) < 5
          )
        )
      ORDER BY
        CASE
          WHEN d.name ILIKE $1 THEN 1
          WHEN d.name ILIKE $2 THEN 2
          ELSE 3
        END,
        distance_miles NULLS LAST
      LIMIT 10
      `,
      [
        location.name,
        `%${location.name.split(' ')[0]}%`,
        location.city,
        location.state_code,
        location.latitude,
        location.longitude,
      ]
    );

    res.json({
      location: mapLocationRowToLocation(location),
      candidates: candidates.map((c: any) => ({
        id: c.id,
        name: c.name,
        city: c.city,
        state: c.state,
        address: c.address,
        menuType: c.menu_type,
        platformDispensaryId: c.platform_dispensary_id,
        menuUrl: c.menu_url,
        matchType: c.match_type,
        distanceMiles: c.distance_miles ? Math.round(c.distance_miles * 10) / 10 : null,
      })),
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});

return router;
}

export default createDiscoveryRoutes;
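A sketch of how this factory might be mounted. The app wiring, import path, and the assumption that the factory receives the pg pool are not shown in this commit; port 3010 matches the backend's example env.

// Hypothetical wiring: mount the discovery routes on an Express app.
import express from 'express';
import { Pool } from 'pg';
import createDiscoveryRoutes from './routes'; // path is an assumption

const app = express();
app.use(express.json());
const pool = new Pool(); // reads PG* env vars; swap in the CannaiQ pool as appropriate
app.use('/api/discovery', createDiscoveryRoutes(pool));
app.listen(3010);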
269
backend/src/discovery/types.ts
Normal file
@@ -0,0 +1,269 @@
/**
 * Dutchie Discovery Types
 *
 * Type definitions for the Dutchie store discovery pipeline.
 */

// ============================================================
// DISCOVERY CITY
// ============================================================

export interface DiscoveryCity {
  id: number;
  platform: string;
  cityName: string;
  citySlug: string;
  stateCode: string | null;
  countryCode: string;
  lastCrawledAt: Date | null;
  crawlEnabled: boolean;
  locationCount: number | null;
  notes: string | null;
  metadata: Record<string, any> | null;
  createdAt: Date;
  updatedAt: Date;
}

export interface DiscoveryCityRow {
  id: number;
  platform: string;
  city_name: string;
  city_slug: string;
  state_code: string | null;
  country_code: string;
  last_crawled_at: Date | null;
  crawl_enabled: boolean;
  location_count: number | null;
  notes: string | null;
  metadata: Record<string, any> | null;
  created_at: Date;
  updated_at: Date;
}

// ============================================================
// DISCOVERY LOCATION
// ============================================================

export type DiscoveryStatus = 'discovered' | 'verified' | 'rejected' | 'merged';

export interface DiscoveryLocation {
  id: number;
  platform: string;
  platformLocationId: string;
  platformSlug: string;
  platformMenuUrl: string;
  name: string;
  rawAddress: string | null;
  addressLine1: string | null;
  addressLine2: string | null;
  city: string | null;
  stateCode: string | null;
  postalCode: string | null;
  countryCode: string | null;
  latitude: number | null;
  longitude: number | null;
  timezone: string | null;
  status: DiscoveryStatus;
  dispensaryId: number | null;
  discoveryCityId: number | null;
  metadata: Record<string, any> | null;
  notes: string | null;
  offersDelivery: boolean | null;
  offersPickup: boolean | null;
  isRecreational: boolean | null;
  isMedical: boolean | null;
  firstSeenAt: Date;
  lastSeenAt: Date;
  lastCheckedAt: Date | null;
  verifiedAt: Date | null;
  verifiedBy: string | null;
  active: boolean;
  createdAt: Date;
  updatedAt: Date;
}

export interface DiscoveryLocationRow {
  id: number;
  platform: string;
  platform_location_id: string;
  platform_slug: string;
  platform_menu_url: string;
  name: string;
  raw_address: string | null;
  address_line1: string | null;
  address_line2: string | null;
  city: string | null;
  state_code: string | null;
  postal_code: string | null;
  country_code: string | null;
  latitude: number | null;
  longitude: number | null;
  timezone: string | null;
  status: DiscoveryStatus;
  dispensary_id: number | null;
  discovery_city_id: number | null;
  metadata: Record<string, any> | null;
  notes: string | null;
  offers_delivery: boolean | null;
  offers_pickup: boolean | null;
  is_recreational: boolean | null;
  is_medical: boolean | null;
  first_seen_at: Date;
  last_seen_at: Date;
  last_checked_at: Date | null;
  verified_at: Date | null;
  verified_by: string | null;
  active: boolean;
  created_at: Date;
  updated_at: Date;
}

// ============================================================
// RAW API RESPONSES
// ============================================================

export interface DutchieCityResponse {
  slug: string;
  name: string;
  state?: string;
  stateCode?: string;
  country?: string;
  countryCode?: string;
}

export interface DutchieLocationResponse {
  id: string;
  name: string;
  slug: string;
  address?: string;
  address1?: string;
  address2?: string;
  city?: string;
  state?: string;
  zip?: string;
  zipCode?: string;
  country?: string;
  latitude?: number;
  longitude?: number;
  timezone?: string;
  menuUrl?: string;
  retailType?: string;
  offerPickup?: boolean;
  offerDelivery?: boolean;
  isRecreational?: boolean;
  isMedical?: boolean;
  // Raw response preserved
  [key: string]: any;
}

// ============================================================
// DISCOVERY RESULTS
// ============================================================

export interface CityDiscoveryResult {
  citiesFound: number;
  citiesUpserted: number;
  citiesSkipped: number;
  errors: string[];
  durationMs: number;
}

export interface LocationDiscoveryResult {
  cityId: number;
  citySlug: string;
  locationsFound: number;
  locationsUpserted: number;
  locationsNew: number;
  locationsUpdated: number;
  errors: string[];
  durationMs: number;
}

export interface FullDiscoveryResult {
  cities: CityDiscoveryResult;
  locations: LocationDiscoveryResult[];
  totalLocationsFound: number;
  totalLocationsUpserted: number;
  durationMs: number;
}

// ============================================================
// VERIFICATION
// ============================================================

export interface VerificationResult {
  success: boolean;
  discoveryId: number;
  dispensaryId: number | null;
  action: 'created' | 'linked' | 'rejected';
  error?: string;
}

export interface PromotionResult {
  success: boolean;
  discoveryId: number;
  dispensaryId: number;
  crawlProfileId?: number;
  scheduleId?: number;
  error?: string;
}

// ============================================================
// MAPPER FUNCTIONS
// ============================================================

export function mapCityRowToCity(row: DiscoveryCityRow): DiscoveryCity {
  return {
    id: row.id,
    platform: row.platform,
    cityName: row.city_name,
    citySlug: row.city_slug,
    stateCode: row.state_code,
    countryCode: row.country_code,
    lastCrawledAt: row.last_crawled_at,
    crawlEnabled: row.crawl_enabled,
    locationCount: row.location_count,
    notes: row.notes,
    metadata: row.metadata,
    createdAt: row.created_at,
    updatedAt: row.updated_at,
  };
}

export function mapLocationRowToLocation(row: DiscoveryLocationRow): DiscoveryLocation {
  return {
    id: row.id,
    platform: row.platform,
    platformLocationId: row.platform_location_id,
    platformSlug: row.platform_slug,
    platformMenuUrl: row.platform_menu_url,
    name: row.name,
    rawAddress: row.raw_address,
    addressLine1: row.address_line1,
    addressLine2: row.address_line2,
    city: row.city,
    stateCode: row.state_code,
    postalCode: row.postal_code,
    countryCode: row.country_code,
    latitude: row.latitude,
    longitude: row.longitude,
    timezone: row.timezone,
    status: row.status,
    dispensaryId: row.dispensary_id,
    discoveryCityId: row.discovery_city_id,
    metadata: row.metadata,
    notes: row.notes,
    offersDelivery: row.offers_delivery,
    offersPickup: row.offers_pickup,
    isRecreational: row.is_recreational,
    isMedical: row.is_medical,
    firstSeenAt: row.first_seen_at,
    lastSeenAt: row.last_seen_at,
    lastCheckedAt: row.last_checked_at,
    verifiedAt: row.verified_at,
    verifiedBy: row.verified_by,
    active: row.active,
    createdAt: row.created_at,
    updatedAt: row.updated_at,
  };
}
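A minimal sketch of the row-to-domain mapping these helpers exist for; the query and wiring are illustrative, not part of this file.

import { Pool } from 'pg';
import { DiscoveryLocation, mapLocationRowToLocation } from './types';

// Illustrative: fetch raw snake_case rows and map them to camelCase objects.
async function loadPendingLocations(pool: Pool): Promise<DiscoveryLocation[]> {
  const { rows } = await pool.query(
    `SELECT * FROM dutchie_discovery_locations WHERE status = 'discovered' LIMIT 10`
  );
  return rows.map(mapLocationRowToLocation);
}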
@@ -1,50 +1,99 @@
 /**
- * Dutchie AZ Database Connection
+ * CannaiQ Database Connection
  *
- * Isolated database connection for Dutchie Arizona data.
- * Uses a separate database/schema to prevent cross-contamination with main app data.
+ * All database access for the CannaiQ platform goes through this module.
+ *
+ * SINGLE DATABASE ARCHITECTURE:
+ * - All services (auth, orchestrator, crawlers, admin) use this ONE database
+ * - States are modeled via states table + state_id on dispensaries (not separate DBs)
+ *
+ * CONFIGURATION (in priority order):
+ * 1. CANNAIQ_DB_URL - Full connection string (preferred)
+ * 2. Individual vars: CANNAIQ_DB_HOST, CANNAIQ_DB_PORT, CANNAIQ_DB_NAME, CANNAIQ_DB_USER, CANNAIQ_DB_PASS
+ * 3. DATABASE_URL - Legacy fallback for K8s compatibility
+ *
+ * IMPORTANT:
+ * - Do NOT create separate pools elsewhere
+ * - All services should import from this module
  */
 
 import { Pool, PoolClient } from 'pg';
 
-// Consolidated DB naming:
-// - Prefer CRAWLSY_DATABASE_URL (e.g., crawlsy_local, crawlsy_prod)
-// - Then DUTCHIE_AZ_DATABASE_URL (legacy)
-// - Finally DATABASE_URL (legacy main DB)
-const DUTCHIE_AZ_DATABASE_URL =
-  process.env.CRAWLSY_DATABASE_URL ||
-  process.env.DUTCHIE_AZ_DATABASE_URL ||
-  process.env.DATABASE_URL ||
-  'postgresql://dutchie:dutchie_local_pass@localhost:54320/crawlsy_local';
+/**
+ * Get the database connection string from environment variables.
+ * Supports multiple configuration methods with fallback for legacy compatibility.
+ */
+function getConnectionString(): string {
+  // Priority 1: Full CANNAIQ connection URL
+  if (process.env.CANNAIQ_DB_URL) {
+    return process.env.CANNAIQ_DB_URL;
+  }
+
+  // Priority 2: Build from individual CANNAIQ env vars
+  const host = process.env.CANNAIQ_DB_HOST;
+  const port = process.env.CANNAIQ_DB_PORT;
+  const name = process.env.CANNAIQ_DB_NAME;
+  const user = process.env.CANNAIQ_DB_USER;
+  const pass = process.env.CANNAIQ_DB_PASS;
+
+  if (host && port && name && user && pass) {
+    return `postgresql://${user}:${pass}@${host}:${port}/${name}`;
+  }
+
+  // Priority 3: Fallback to DATABASE_URL for legacy/K8s compatibility
+  if (process.env.DATABASE_URL) {
+    return process.env.DATABASE_URL;
+  }
+
+  // Report what's missing
+  const required = ['CANNAIQ_DB_HOST', 'CANNAIQ_DB_PORT', 'CANNAIQ_DB_NAME', 'CANNAIQ_DB_USER', 'CANNAIQ_DB_PASS'];
+  const missing = required.filter((key) => !process.env[key]);
+
+  throw new Error(
+    `[CannaiQ DB] Missing database configuration.\n` +
+    `Set CANNAIQ_DB_URL, DATABASE_URL, or all of: ${missing.join(', ')}`
+  );
+}
 
 let pool: Pool | null = null;
 
 /**
- * Get the Dutchie AZ database pool (singleton)
+ * Get the CannaiQ database pool (singleton)
+ *
+ * This is the canonical pool for all CannaiQ services.
+ * Do NOT create separate pools elsewhere.
  */
-export function getDutchieAZPool(): Pool {
+export function getPool(): Pool {
   if (!pool) {
     pool = new Pool({
-      connectionString: DUTCHIE_AZ_DATABASE_URL,
+      connectionString: getConnectionString(),
       max: 10,
       idleTimeoutMillis: 30000,
       connectionTimeoutMillis: 5000,
     });
 
     pool.on('error', (err) => {
-      console.error('[DutchieAZ DB] Unexpected error on idle client:', err);
+      console.error('[CannaiQ DB] Unexpected error on idle client:', err);
     });
 
-    console.log('[DutchieAZ DB] Pool initialized');
+    console.log('[CannaiQ DB] Pool initialized');
   }
   return pool;
 }
 
 /**
- * Execute a query on the Dutchie AZ database
+ * @deprecated Use getPool() instead
+ */
+export function getDutchieAZPool(): Pool {
+  console.warn('[CannaiQ DB] getDutchieAZPool() is deprecated. Use getPool() instead.');
+  return getPool();
+}
+
+/**
+ * Execute a query on the CannaiQ database
  */
 export async function query<T = any>(text: string, params?: any[]): Promise<{ rows: T[]; rowCount: number }> {
-  const p = getDutchieAZPool();
+  const p = getPool();
   const result = await p.query(text, params);
   return { rows: result.rows as T[], rowCount: result.rowCount || 0 };
 }
@@ -53,7 +102,7 @@ export async function query<T = any>(text: string, params?: any[]): Promise<{ ro
  * Get a client from the pool for transaction use
  */
 export async function getClient(): Promise<PoolClient> {
-  const p = getDutchieAZPool();
+  const p = getPool();
   return p.connect();
 }
 
@@ -64,7 +113,7 @@ export async function closePool(): Promise<void> {
   if (pool) {
     await pool.end();
     pool = null;
-    console.log('[DutchieAZ DB] Pool closed');
+    console.log('[CannaiQ DB] Pool closed');
   }
 }
 
@@ -76,7 +125,7 @@ export async function healthCheck(): Promise<boolean> {
     const result = await query('SELECT 1 as ok');
     return result.rows.length > 0 && result.rows[0].ok === 1;
   } catch (error) {
-    console.error('[DutchieAZ DB] Health check failed:', error);
+    console.error('[CannaiQ DB] Health check failed:', error);
     return false;
   }
 }
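A minimal sketch of the intended call pattern for the rewritten module; the import path is an assumption, and it presumes CANNAIQ_DB_URL or the CANNAIQ_DB_* vars are set per the header comment.

import { query, healthCheck, closePool } from './index'; // path is an assumption

// Illustrative smoke test: verify connectivity, run one query, release the pool.
async function smokeTest(): Promise<void> {
  if (!(await healthCheck())) throw new Error('CannaiQ DB unreachable');
  const { rows } = await query<{ total: string }>('SELECT COUNT(*) AS total FROM dispensaries');
  console.log(`dispensaries: ${rows[0].total}`);
  await closePool();
}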
137
backend/src/dutchie-az/db/dispensary-columns.ts
Normal file
@@ -0,0 +1,137 @@
/**
 * Dispensary Column Definitions
 *
 * Centralized column list for dispensaries table queries.
 * Handles optional columns that may not exist in all environments.
 *
 * USAGE:
 *   import { DISPENSARY_COLUMNS, DISPENSARY_COLUMNS_WITH_FAILED } from '../db/dispensary-columns';
 *   const result = await query(`SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE ...`);
 */

/**
 * Core dispensary columns that always exist.
 * These are guaranteed to be present in all environments.
 */
const CORE_COLUMNS = `
  id, name, slug, city, state, zip, address, latitude, longitude,
  menu_type, menu_url, platform_dispensary_id, website,
  created_at, updated_at
`;

/**
 * Optional columns with NULL fallback.
 *
 * provider_detection_data: Added in migration 044
 * active_crawler_profile_id: Added in migration 041
 *
 * Selecting a literal NULL::jsonb keeps queries working whether or not the
 * column exists:
 * - Before migration 044: referencing the real column would make the query fail,
 *   but NULL::jsonb always parses
 * - After migration 044: switch to the real column to return actual values
 */

// TEMPORARY: Use NULL fallback until migration 044 is applied
// After running 044, change this to: provider_detection_data
const PROVIDER_DETECTION_COLUMN = `NULL::jsonb AS provider_detection_data`;

// After migration 044 is applied, uncomment this line and remove the above:
// const PROVIDER_DETECTION_COLUMN = `provider_detection_data`;

/**
 * Standard dispensary columns for most queries.
 * Includes provider_detection_data with NULL fallback for pre-migration compatibility.
 */
export const DISPENSARY_COLUMNS = `${CORE_COLUMNS.trim()},
  ${PROVIDER_DETECTION_COLUMN}`;

/**
 * Dispensary columns including active_crawler_profile_id.
 * Used by routes that need profile information.
 */
export const DISPENSARY_COLUMNS_WITH_PROFILE = `${CORE_COLUMNS.trim()},
  ${PROVIDER_DETECTION_COLUMN},
  active_crawler_profile_id`;

/**
 * Dispensary columns including failed_at.
 * Used by worker for compatibility checks.
 */
export const DISPENSARY_COLUMNS_WITH_FAILED = `${CORE_COLUMNS.trim()},
  ${PROVIDER_DETECTION_COLUMN},
  failed_at`;

/**
 * NOTE: After migration 044 is applied, update PROVIDER_DETECTION_COLUMN above
 * to use the real column instead of NULL fallback.
 *
 * To verify migration status:
 *   SELECT column_name FROM information_schema.columns
 *   WHERE table_name = 'dispensaries' AND column_name = 'provider_detection_data';
 */

// Cache for column existence check
let _providerDetectionColumnExists: boolean | null = null;

/**
 * Check if provider_detection_data column exists in dispensaries table.
 * Result is cached after first check.
 */
export async function hasProviderDetectionColumn(pool: { query: (sql: string) => Promise<{ rows: any[] }> }): Promise<boolean> {
  if (_providerDetectionColumnExists !== null) {
    return _providerDetectionColumnExists;
  }

  try {
    const result = await pool.query(`
      SELECT 1 FROM information_schema.columns
      WHERE table_name = 'dispensaries' AND column_name = 'provider_detection_data'
    `);
    _providerDetectionColumnExists = result.rows.length > 0;
  } catch {
    _providerDetectionColumnExists = false;
  }

  return _providerDetectionColumnExists;
}

/**
 * Safely update provider_detection_data column.
 * If column doesn't exist, logs a warning but doesn't crash.
 *
 * @param pool - Database pool with query method
 * @param dispensaryId - ID of dispensary to update
 * @param data - JSONB data to merge into provider_detection_data
 * @returns true if update succeeded, false if column doesn't exist
 */
export async function safeUpdateProviderDetectionData(
  pool: { query: (sql: string, params?: any[]) => Promise<any> },
  dispensaryId: number,
  data: Record<string, any>
): Promise<boolean> {
  const hasColumn = await hasProviderDetectionColumn(pool);

  if (!hasColumn) {
    console.warn(`[DispensaryColumns] provider_detection_data column not found. Run migration 044 to add it.`);
    return false;
  }

  try {
    await pool.query(
      `UPDATE dispensaries
       SET provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) || $1::jsonb,
           updated_at = NOW()
       WHERE id = $2`,
      [JSON.stringify(data), dispensaryId]
    );
    return true;
  } catch (error: any) {
    if (error.message?.includes('provider_detection_data')) {
      console.warn(`[DispensaryColumns] Failed to update provider_detection_data: ${error.message}`);
      return false;
    }
    throw error;
  }
}
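A sketch of guarded usage of the helper above; the getPool import path is an assumption, and the detection payload is illustrative.

import { getPool } from './index'; // path is an assumption
import { safeUpdateProviderDetectionData } from './dispensary-columns';

// Illustrative: merge detection metadata without crashing pre-migration-044 environments.
async function recordDetection(dispensaryId: number): Promise<void> {
  const ok = await safeUpdateProviderDetectionData(getPool(), dispensaryId, {
    provider: 'dutchie',
    detectedAt: new Date().toISOString(),
  });
  if (!ok) console.warn('provider_detection_data not persisted; run migration 044 first');
}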
403
backend/src/dutchie-az/discovery/DtCityDiscoveryService.ts
Normal file
@@ -0,0 +1,403 @@
|
|||||||
|
/**
|
||||||
|
* DtCityDiscoveryService
|
||||||
|
*
|
||||||
|
* Core service for Dutchie city discovery.
|
||||||
|
* Contains shared logic used by multiple entrypoints.
|
||||||
|
*
|
||||||
|
* Responsibilities:
|
||||||
|
* - Browser/API-based city fetching
|
||||||
|
* - Manual city seeding
|
||||||
|
* - City upsert operations
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { Pool } from 'pg';
|
||||||
|
import axios from 'axios';
|
||||||
|
import puppeteer from 'puppeteer-extra';
|
||||||
|
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||||
|
|
||||||
|
puppeteer.use(StealthPlugin());
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// TYPES
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
export interface DutchieCity {
|
||||||
|
name: string;
|
||||||
|
slug: string;
|
||||||
|
stateCode: string | null;
|
||||||
|
countryCode: string;
|
||||||
|
url?: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface CityDiscoveryResult {
|
||||||
|
citiesFound: number;
|
||||||
|
citiesInserted: number;
|
||||||
|
citiesUpdated: number;
|
||||||
|
errors: string[];
|
||||||
|
durationMs: number;
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface ManualSeedResult {
|
||||||
|
city: DutchieCity;
|
||||||
|
id: number;
|
||||||
|
wasInserted: boolean;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// US STATE CODE MAPPING
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
export const US_STATE_MAP: Record<string, string> = {
|
||||||
|
'alabama': 'AL', 'alaska': 'AK', 'arizona': 'AZ', 'arkansas': 'AR',
|
||||||
|
'california': 'CA', 'colorado': 'CO', 'connecticut': 'CT', 'delaware': 'DE',
|
||||||
|
'florida': 'FL', 'georgia': 'GA', 'hawaii': 'HI', 'idaho': 'ID',
|
||||||
|
'illinois': 'IL', 'indiana': 'IN', 'iowa': 'IA', 'kansas': 'KS',
|
||||||
|
'kentucky': 'KY', 'louisiana': 'LA', 'maine': 'ME', 'maryland': 'MD',
|
||||||
|
'massachusetts': 'MA', 'michigan': 'MI', 'minnesota': 'MN', 'mississippi': 'MS',
|
||||||
|
'missouri': 'MO', 'montana': 'MT', 'nebraska': 'NE', 'nevada': 'NV',
|
||||||
|
'new-hampshire': 'NH', 'new-jersey': 'NJ', 'new-mexico': 'NM', 'new-york': 'NY',
|
||||||
|
'north-carolina': 'NC', 'north-dakota': 'ND', 'ohio': 'OH', 'oklahoma': 'OK',
|
||||||
|
'oregon': 'OR', 'pennsylvania': 'PA', 'rhode-island': 'RI', 'south-carolina': 'SC',
|
||||||
|
'south-dakota': 'SD', 'tennessee': 'TN', 'texas': 'TX', 'utah': 'UT',
|
||||||
|
'vermont': 'VT', 'virginia': 'VA', 'washington': 'WA', 'west-virginia': 'WV',
|
||||||
|
'wisconsin': 'WI', 'wyoming': 'WY', 'district-of-columbia': 'DC',
|
||||||
|
};
|
||||||
|
|
||||||
|
// Canadian province mapping
|
||||||
|
export const CA_PROVINCE_MAP: Record<string, string> = {
|
||||||
|
'alberta': 'AB', 'british-columbia': 'BC', 'manitoba': 'MB',
|
||||||
|
'new-brunswick': 'NB', 'newfoundland-and-labrador': 'NL',
|
||||||
|
'northwest-territories': 'NT', 'nova-scotia': 'NS', 'nunavut': 'NU',
|
||||||
|
'ontario': 'ON', 'prince-edward-island': 'PE', 'quebec': 'QC',
|
||||||
|
'saskatchewan': 'SK', 'yukon': 'YT',
|
||||||
|
};
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// CITY FETCHING (AUTO DISCOVERY)
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fetch cities from Dutchie's /cities page using Puppeteer.
|
||||||
|
*/
|
||||||
|
export async function fetchCitiesFromBrowser(): Promise<DutchieCity[]> {
|
||||||
|
console.log('[DtCityDiscoveryService] Launching browser to fetch cities...');
|
||||||
|
|
||||||
|
const browser = await puppeteer.launch({
|
||||||
|
headless: 'new',
|
||||||
|
args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
|
||||||
|
});
|
||||||
|
|
||||||
|
try {
|
||||||
|
const page = await browser.newPage();
|
||||||
|
await page.setUserAgent(
|
||||||
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
|
||||||
|
);
|
||||||
|
|
||||||
|
console.log('[DtCityDiscoveryService] Navigating to https://dutchie.com/cities...');
|
||||||
|
await page.goto('https://dutchie.com/cities', {
|
||||||
|
waitUntil: 'networkidle2',
|
||||||
|
timeout: 60000,
|
||||||
|
});
|
||||||
|
|
||||||
|
await new Promise((r) => setTimeout(r, 3000));
|
||||||
|
|
||||||
|
const cities = await page.evaluate(() => {
|
||||||
|
const cityLinks: Array<{
|
||||||
|
name: string;
|
||||||
|
slug: string;
|
||||||
|
url: string;
|
||||||
|
stateSlug: string | null;
|
||||||
|
}> = [];
|
||||||
|
|
||||||
|
const links = document.querySelectorAll('a[href*="/city/"]');
|
||||||
|
links.forEach((link) => {
|
||||||
|
const href = (link as HTMLAnchorElement).href;
|
||||||
|
const text = (link as HTMLElement).innerText?.trim();
|
||||||
|
|
||||||
|
const match = href.match(/\/city\/([^/]+)\/([^/?]+)/);
|
||||||
|
if (match && text) {
|
||||||
|
cityLinks.push({
|
||||||
|
name: text,
|
||||||
|
slug: match[2],
|
||||||
|
url: href,
|
||||||
|
stateSlug: match[1],
|
||||||
|
});
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
return cityLinks;
|
||||||
|
});
|
||||||
|
|
||||||
|
console.log(`[DtCityDiscoveryService] Extracted ${cities.length} city links from page`);
|
||||||
|
|
||||||
|
return cities.map((city) => {
|
||||||
|
let countryCode = 'US';
|
||||||
|
let stateCode: string | null = null;
|
||||||
|
|
||||||
|
if (city.stateSlug) {
|
||||||
|
if (US_STATE_MAP[city.stateSlug]) {
|
||||||
|
stateCode = US_STATE_MAP[city.stateSlug];
|
||||||
|
countryCode = 'US';
|
||||||
|
} else if (CA_PROVINCE_MAP[city.stateSlug]) {
|
||||||
|
stateCode = CA_PROVINCE_MAP[city.stateSlug];
|
||||||
|
countryCode = 'CA';
|
||||||
|
} else if (city.stateSlug.length === 2) {
|
||||||
|
stateCode = city.stateSlug.toUpperCase();
|
||||||
|
if (Object.values(CA_PROVINCE_MAP).includes(stateCode)) {
|
||||||
|
countryCode = 'CA';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
name: city.name,
|
||||||
|
slug: city.slug,
|
||||||
|
stateCode,
|
||||||
|
countryCode,
|
||||||
|
url: city.url,
|
||||||
|
};
|
||||||
|
});
|
||||||
|
} finally {
|
||||||
|
await browser.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fetch cities via API endpoints (fallback).
|
||||||
|
*/
|
||||||
|
export async function fetchCitiesFromAPI(): Promise<DutchieCity[]> {
|
||||||
|
console.log('[DtCityDiscoveryService] Attempting API-based city discovery...');
|
||||||
|
|
||||||
|
const apiEndpoints = [
|
||||||
|
'https://dutchie.com/api/cities',
|
||||||
|
'https://api.dutchie.com/v1/cities',
|
||||||
|
];
|
||||||
|
|
||||||
|
for (const endpoint of apiEndpoints) {
|
||||||
|
try {
|
||||||
|
const response = await axios.get(endpoint, {
|
||||||
|
headers: {
|
||||||
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0',
|
||||||
|
Accept: 'application/json',
|
||||||
|
},
|
||||||
|
timeout: 15000,
|
||||||
|
});
|
||||||
|
|
||||||
|
if (response.data && Array.isArray(response.data)) {
|
||||||
|
console.log(`[DtCityDiscoveryService] API returned ${response.data.length} cities`);
|
||||||
|
return response.data.map((c: any) => ({
|
||||||
|
name: c.name || c.city,
|
||||||
|
slug: c.slug || c.citySlug,
|
||||||
|
stateCode: c.stateCode || c.state,
|
||||||
|
countryCode: c.countryCode || c.country || 'US',
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
} catch (error: any) {
|
||||||
|
console.log(`[DtCityDiscoveryService] API ${endpoint} failed: ${error.message}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// DATABASE OPERATIONS
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Upsert a city into dutchie_discovery_cities
|
||||||
|
*/
|
||||||
|
export async function upsertCity(
|
||||||
|
pool: Pool,
|
||||||
|
city: DutchieCity
|
||||||
|
): Promise<{ id: number; inserted: boolean; updated: boolean }> {
|
||||||
|
const result = await pool.query(
|
||||||
|
`
|
||||||
|
INSERT INTO dutchie_discovery_cities (
|
||||||
|
platform,
|
||||||
|
city_name,
|
||||||
|
city_slug,
|
||||||
|
state_code,
|
||||||
|
country_code,
|
||||||
|
crawl_enabled,
|
||||||
|
created_at,
|
||||||
|
updated_at
|
||||||
|
) VALUES (
|
||||||
|
'dutchie',
|
||||||
|
$1,
|
||||||
|
$2,
|
||||||
|
$3,
|
||||||
|
$4,
|
||||||
|
TRUE,
|
||||||
|
NOW(),
|
||||||
|
NOW()
|
||||||
|
)
|
||||||
|
ON CONFLICT (platform, country_code, state_code, city_slug)
|
||||||
|
DO UPDATE SET
|
||||||
|
city_name = EXCLUDED.city_name,
|
||||||
|
crawl_enabled = TRUE,
|
||||||
|
updated_at = NOW()
|
||||||
|
RETURNING id, (xmax = 0) AS inserted
|
||||||
|
`,
|
||||||
|
[city.name, city.slug, city.stateCode, city.countryCode]
|
||||||
|
);
|
||||||
|
|
||||||
|
const inserted = result.rows[0]?.inserted === true;
|
||||||
|
return {
|
||||||
|
id: result.rows[0]?.id,
|
||||||
|
inserted,
|
||||||
|
updated: !inserted,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// MAIN SERVICE CLASS
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
export class DtCityDiscoveryService {
|
||||||
|
constructor(private pool: Pool) {}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Run auto-discovery (browser + API fallback)
|
||||||
|
*/
|
||||||
|
async runAutoDiscovery(): Promise<CityDiscoveryResult> {
|
||||||
|
const startTime = Date.now();
|
||||||
|
const errors: string[] = [];
|
||||||
|
let citiesFound = 0;
|
||||||
|
let citiesInserted = 0;
|
||||||
|
let citiesUpdated = 0;
|
||||||
|
|
||||||
|
console.log('[DtCityDiscoveryService] Starting auto city discovery...');
|
||||||
|
|
||||||
|
try {
|
||||||
|
let cities = await fetchCitiesFromBrowser();
|
||||||
|
|
||||||
|
if (cities.length === 0) {
|
||||||
|
console.log('[DtCityDiscoveryService] Browser returned 0 cities, trying API...');
|
||||||
|
cities = await fetchCitiesFromAPI();
|
||||||
|
}
|
||||||
|
|
||||||
|
citiesFound = cities.length;
|
||||||
|
console.log(`[DtCityDiscoveryService] Found ${citiesFound} cities`);
|
||||||
|
|
||||||
|
      for (const city of cities) {
        try {
          const result = await upsertCity(this.pool, city);
          if (result.inserted) citiesInserted++;
          else if (result.updated) citiesUpdated++;
        } catch (error: any) {
          const msg = `Failed to upsert city ${city.slug}: ${error.message}`;
          console.error(`[DtCityDiscoveryService] ${msg}`);
          errors.push(msg);
        }
      }
    } catch (error: any) {
      const msg = `Auto discovery failed: ${error.message}`;
      console.error(`[DtCityDiscoveryService] ${msg}`);
      errors.push(msg);
    }

    const durationMs = Date.now() - startTime;

    return {
      citiesFound,
      citiesInserted,
      citiesUpdated,
      errors,
      durationMs,
    };
  }

  /**
   * Seed a single city manually
   */
  async seedCity(city: DutchieCity): Promise<ManualSeedResult> {
    console.log(`[DtCityDiscoveryService] Seeding city: ${city.name} (${city.slug}), ${city.stateCode}, ${city.countryCode}`);

    const result = await upsertCity(this.pool, city);

    return {
      city,
      id: result.id,
      wasInserted: result.inserted,
    };
  }

  /**
   * Seed multiple cities from a list
   */
  async seedCities(cities: DutchieCity[]): Promise<{
    results: ManualSeedResult[];
    errors: string[];
  }> {
    const results: ManualSeedResult[] = [];
    const errors: string[] = [];

    for (const city of cities) {
      try {
        const result = await this.seedCity(city);
        results.push(result);
      } catch (error: any) {
        errors.push(`${city.slug}: ${error.message}`);
      }
    }

    return { results, errors };
  }

  /**
   * Get statistics about discovered cities
   */
  async getStats(): Promise<{
    total: number;
    byCountry: Array<{ countryCode: string; count: number }>;
    byState: Array<{ stateCode: string; countryCode: string; count: number }>;
    crawlEnabled: number;
    neverCrawled: number;
  }> {
    const [totalRes, byCountryRes, byStateRes, enabledRes, neverRes] = await Promise.all([
      this.pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_cities WHERE platform = \'dutchie\''),
      this.pool.query(`
        SELECT country_code, COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE platform = 'dutchie'
        GROUP BY country_code
        ORDER BY cnt DESC
      `),
      this.pool.query(`
        SELECT state_code, country_code, COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE platform = 'dutchie' AND state_code IS NOT NULL
        GROUP BY state_code, country_code
        ORDER BY cnt DESC
      `),
      this.pool.query(`
        SELECT COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE platform = 'dutchie' AND crawl_enabled = TRUE
      `),
      this.pool.query(`
        SELECT COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE platform = 'dutchie' AND last_crawled_at IS NULL
      `),
    ]);

    return {
      total: parseInt(totalRes.rows[0]?.cnt || '0', 10),
      byCountry: byCountryRes.rows.map((r) => ({
        countryCode: r.country_code,
        count: parseInt(r.cnt, 10),
      })),
      byState: byStateRes.rows.map((r) => ({
        stateCode: r.state_code,
        countryCode: r.country_code,
        count: parseInt(r.cnt, 10),
      })),
      crawlEnabled: parseInt(enabledRes.rows[0]?.cnt || '0', 10),
      neverCrawled: parseInt(neverRes.rows[0]?.cnt || '0', 10),
    };
  }
}

export default DtCityDiscoveryService;
1249
backend/src/dutchie-az/discovery/DtLocationDiscoveryService.ts
Normal file
File diff suppressed because it is too large
390
backend/src/dutchie-az/discovery/DutchieCityDiscovery.ts
Normal file
@@ -0,0 +1,390 @@
/**
 * DutchieCityDiscovery
 *
 * Discovers cities from Dutchie's /cities page and upserts to dutchie_discovery_cities.
 *
 * Responsibilities:
 * - Fetch all cities available on Dutchie
 * - For each city derive: city_name, city_slug, state_code, country_code
 * - Upsert into dutchie_discovery_cities
 */

import { Pool } from 'pg';
import axios from 'axios';
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import type { Browser, Page } from 'puppeteer';

puppeteer.use(StealthPlugin());

// ============================================================
// TYPES
// ============================================================

export interface DutchieCity {
  name: string;
  slug: string;
  stateCode: string | null;
  countryCode: string;
  url?: string;
}

export interface CityDiscoveryResult {
  citiesFound: number;
  citiesInserted: number;
  citiesUpdated: number;
  errors: string[];
  durationMs: number;
}

// ============================================================
// US STATE CODE MAPPING
// ============================================================

const US_STATE_MAP: Record<string, string> = {
  'alabama': 'AL', 'alaska': 'AK', 'arizona': 'AZ', 'arkansas': 'AR',
  'california': 'CA', 'colorado': 'CO', 'connecticut': 'CT', 'delaware': 'DE',
  'florida': 'FL', 'georgia': 'GA', 'hawaii': 'HI', 'idaho': 'ID',
  'illinois': 'IL', 'indiana': 'IN', 'iowa': 'IA', 'kansas': 'KS',
  'kentucky': 'KY', 'louisiana': 'LA', 'maine': 'ME', 'maryland': 'MD',
  'massachusetts': 'MA', 'michigan': 'MI', 'minnesota': 'MN', 'mississippi': 'MS',
  'missouri': 'MO', 'montana': 'MT', 'nebraska': 'NE', 'nevada': 'NV',
  'new-hampshire': 'NH', 'new-jersey': 'NJ', 'new-mexico': 'NM', 'new-york': 'NY',
  'north-carolina': 'NC', 'north-dakota': 'ND', 'ohio': 'OH', 'oklahoma': 'OK',
  'oregon': 'OR', 'pennsylvania': 'PA', 'rhode-island': 'RI', 'south-carolina': 'SC',
  'south-dakota': 'SD', 'tennessee': 'TN', 'texas': 'TX', 'utah': 'UT',
  'vermont': 'VT', 'virginia': 'VA', 'washington': 'WA', 'west-virginia': 'WV',
  'wisconsin': 'WI', 'wyoming': 'WY', 'district-of-columbia': 'DC',
};

// Canadian province mapping
const CA_PROVINCE_MAP: Record<string, string> = {
  'alberta': 'AB', 'british-columbia': 'BC', 'manitoba': 'MB',
  'new-brunswick': 'NB', 'newfoundland-and-labrador': 'NL',
  'northwest-territories': 'NT', 'nova-scotia': 'NS', 'nunavut': 'NU',
  'ontario': 'ON', 'prince-edward-island': 'PE', 'quebec': 'QC',
  'saskatchewan': 'SK', 'yukon': 'YT',
};
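
// Resolution examples (see fetchCitiesFromDutchie below): a slug like
// "arizona" maps to AZ/US and "ontario" to ON/CA, while a bare two-letter
// slug such as "az" or "on" falls through to the uppercase fallback, with
// the country inferred from whether the code is a Canadian province.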

// ============================================================
// CITY FETCHING
// ============================================================

/**
 * Fetch cities from Dutchie's /cities page using Puppeteer to extract data.
 */
async function fetchCitiesFromDutchie(): Promise<DutchieCity[]> {
  console.log('[DutchieCityDiscovery] Launching browser to fetch cities...');

  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
  });

  try {
    const page = await browser.newPage();
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    );

    // Navigate to cities page
    console.log('[DutchieCityDiscovery] Navigating to https://dutchie.com/cities...');
    await page.goto('https://dutchie.com/cities', {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });

    // Wait for content to load
    await new Promise((r) => setTimeout(r, 3000));

    // Extract city links from the page
    const cities = await page.evaluate(() => {
      const cityLinks: Array<{
        name: string;
        slug: string;
        url: string;
        stateSlug: string | null;
      }> = [];

      // Find all city links - they typically follow pattern /city/{state}/{city}
      const links = document.querySelectorAll('a[href*="/city/"]');
      links.forEach((link) => {
        const href = (link as HTMLAnchorElement).href;
        const text = (link as HTMLElement).innerText?.trim();

        // Parse URL: https://dutchie.com/city/{state}/{city}
        const match = href.match(/\/city\/([^/]+)\/([^/?]+)/);
        if (match && text) {
          cityLinks.push({
            name: text,
            slug: match[2],
            url: href,
            stateSlug: match[1],
          });
        }
      });

      return cityLinks;
    });

    console.log(`[DutchieCityDiscovery] Extracted ${cities.length} city links from page`);

    // Convert to DutchieCity format
    const result: DutchieCity[] = [];

    for (const city of cities) {
      // Determine country and state code
      let countryCode = 'US';
      let stateCode: string | null = null;

      if (city.stateSlug) {
        // Check if it's a US state
        if (US_STATE_MAP[city.stateSlug]) {
          stateCode = US_STATE_MAP[city.stateSlug];
          countryCode = 'US';
        }
        // Check if it's a Canadian province
        else if (CA_PROVINCE_MAP[city.stateSlug]) {
          stateCode = CA_PROVINCE_MAP[city.stateSlug];
          countryCode = 'CA';
        }
        // Check if it's already a 2-letter code
        else if (city.stateSlug.length === 2) {
          stateCode = city.stateSlug.toUpperCase();
          // Determine country based on state code
          if (Object.values(CA_PROVINCE_MAP).includes(stateCode)) {
            countryCode = 'CA';
          }
        }
      }

      result.push({
        name: city.name,
        slug: city.slug,
        stateCode,
        countryCode,
        url: city.url,
      });
    }

    return result;
  } finally {
    await browser.close();
  }
}

/**
 * Alternative: Fetch cities by making API/GraphQL requests.
 * Falls back to this if scraping fails.
 */
async function fetchCitiesFromAPI(): Promise<DutchieCity[]> {
  console.log('[DutchieCityDiscovery] Attempting API-based city discovery...');

  // Dutchie may have an API endpoint for cities
  // Try common patterns
  const apiEndpoints = [
    'https://dutchie.com/api/cities',
    'https://api.dutchie.com/v1/cities',
  ];

  for (const endpoint of apiEndpoints) {
    try {
      const response = await axios.get(endpoint, {
        headers: {
          'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0',
          Accept: 'application/json',
        },
        timeout: 15000,
      });

      if (response.data && Array.isArray(response.data)) {
        console.log(`[DutchieCityDiscovery] API returned ${response.data.length} cities`);
        return response.data.map((c: any) => ({
          name: c.name || c.city,
          slug: c.slug || c.citySlug,
          stateCode: c.stateCode || c.state,
          countryCode: c.countryCode || c.country || 'US',
        }));
      }
    } catch (error: any) {
      console.log(`[DutchieCityDiscovery] API ${endpoint} failed: ${error.message}`);
    }
  }

  return [];
}

// ============================================================
// DATABASE OPERATIONS
// ============================================================

/**
 * Upsert a city into dutchie_discovery_cities
 */
async function upsertCity(
  pool: Pool,
  city: DutchieCity
): Promise<{ inserted: boolean; updated: boolean }> {
  const result = await pool.query(
    `
    INSERT INTO dutchie_discovery_cities (
      platform,
      city_name,
      city_slug,
      state_code,
      country_code,
      last_crawled_at,
      updated_at
    ) VALUES (
      'dutchie',
      $1,
      $2,
      $3,
      $4,
      NOW(),
      NOW()
    )
    ON CONFLICT (platform, country_code, state_code, city_slug)
    DO UPDATE SET
      city_name = EXCLUDED.city_name,
      last_crawled_at = NOW(),
      updated_at = NOW()
    RETURNING (xmax = 0) AS inserted
    `,
    [city.name, city.slug, city.stateCode, city.countryCode]
  );

  const inserted = result.rows[0]?.inserted === true;
  return { inserted, updated: !inserted };
}
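
// Note on `RETURNING (xmax = 0) AS inserted`: xmax is a Postgres system
// column holding the id of the deleting/locking transaction. A row freshly
// created by this INSERT has xmax = 0, while a row rewritten by the
// ON CONFLICT branch has a nonzero xmax, so the expression distinguishes
// insert from update in a single round trip. This leans on Postgres
// internals rather than documented SQL semantics, so treat it as a
// pragmatic idiom rather than a guaranteed contract.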

// ============================================================
// MAIN DISCOVERY FUNCTION
// ============================================================

export class DutchieCityDiscovery {
  private pool: Pool;

  constructor(pool: Pool) {
    this.pool = pool;
  }

  /**
   * Run the city discovery process
   */
  async run(): Promise<CityDiscoveryResult> {
    const startTime = Date.now();
    const errors: string[] = [];
    let citiesFound = 0;
    let citiesInserted = 0;
    let citiesUpdated = 0;

    console.log('[DutchieCityDiscovery] Starting city discovery...');

    try {
      // Try scraping first, fall back to API
      let cities = await fetchCitiesFromDutchie();

      if (cities.length === 0) {
        console.log('[DutchieCityDiscovery] Scraping returned 0 cities, trying API...');
        cities = await fetchCitiesFromAPI();
      }

      citiesFound = cities.length;
      console.log(`[DutchieCityDiscovery] Found ${citiesFound} cities`);

      // Upsert each city
      for (const city of cities) {
        try {
          const result = await upsertCity(this.pool, city);
          if (result.inserted) {
            citiesInserted++;
          } else if (result.updated) {
            citiesUpdated++;
          }
        } catch (error: any) {
          const msg = `Failed to upsert city ${city.slug}: ${error.message}`;
          console.error(`[DutchieCityDiscovery] ${msg}`);
          errors.push(msg);
        }
      }
    } catch (error: any) {
      const msg = `City discovery failed: ${error.message}`;
      console.error(`[DutchieCityDiscovery] ${msg}`);
      errors.push(msg);
    }

    const durationMs = Date.now() - startTime;

    console.log('[DutchieCityDiscovery] Discovery complete:');
    console.log(`  Cities found: ${citiesFound}`);
    console.log(`  Inserted: ${citiesInserted}`);
    console.log(`  Updated: ${citiesUpdated}`);
    console.log(`  Errors: ${errors.length}`);
    console.log(`  Duration: ${(durationMs / 1000).toFixed(1)}s`);

    return {
      citiesFound,
      citiesInserted,
      citiesUpdated,
      errors,
      durationMs,
    };
  }

  /**
   * Get statistics about discovered cities
   */
  async getStats(): Promise<{
    total: number;
    byCountry: Array<{ countryCode: string; count: number }>;
    byState: Array<{ stateCode: string; countryCode: string; count: number }>;
    crawlEnabled: number;
    neverCrawled: number;
  }> {
    const [totalRes, byCountryRes, byStateRes, enabledRes, neverRes] = await Promise.all([
      this.pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_cities'),
      this.pool.query(`
        SELECT country_code, COUNT(*) as cnt
        FROM dutchie_discovery_cities
        GROUP BY country_code
        ORDER BY cnt DESC
      `),
      this.pool.query(`
        SELECT state_code, country_code, COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE state_code IS NOT NULL
        GROUP BY state_code, country_code
        ORDER BY cnt DESC
      `),
      this.pool.query(`
        SELECT COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE crawl_enabled = TRUE
      `),
      this.pool.query(`
        SELECT COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE last_crawled_at IS NULL
      `),
    ]);

    return {
      total: parseInt(totalRes.rows[0]?.cnt || '0', 10),
      byCountry: byCountryRes.rows.map((r) => ({
        countryCode: r.country_code,
        count: parseInt(r.cnt, 10),
      })),
      byState: byStateRes.rows.map((r) => ({
        stateCode: r.state_code,
        countryCode: r.country_code,
        count: parseInt(r.cnt, 10),
      })),
      crawlEnabled: parseInt(enabledRes.rows[0]?.cnt || '0', 10),
      neverCrawled: parseInt(neverRes.rows[0]?.cnt || '0', 10),
    };
  }
}

export default DutchieCityDiscovery;
639
backend/src/dutchie-az/discovery/DutchieLocationDiscovery.ts
Normal file
@@ -0,0 +1,639 @@
/**
 * DutchieLocationDiscovery
 *
 * Discovers store locations for each city from Dutchie and upserts to dutchie_discovery_locations.
 *
 * Responsibilities:
 * - Given a dutchie_discovery_cities row, call Dutchie's location/search endpoint
 * - For each store: extract platform_location_id, platform_slug, platform_menu_url, name, address, coords
 * - Upsert into dutchie_discovery_locations
 * - DO NOT overwrite status if already verified/merged/rejected
 * - DO NOT overwrite dispensary_id if already set
 */

import { Pool } from 'pg';
import axios from 'axios';
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';

puppeteer.use(StealthPlugin());

// ============================================================
// TYPES
// ============================================================

export interface DiscoveryCity {
  id: number;
  platform: string;
  cityName: string;
  citySlug: string;
  stateCode: string | null;
  countryCode: string;
  crawlEnabled: boolean;
}

export interface DutchieLocation {
  platformLocationId: string;
  platformSlug: string;
  platformMenuUrl: string;
  name: string;
  rawAddress: string | null;
  addressLine1: string | null;
  addressLine2: string | null;
  city: string | null;
  stateCode: string | null;
  postalCode: string | null;
  countryCode: string | null;
  latitude: number | null;
  longitude: number | null;
  timezone: string | null;
  offersDelivery: boolean | null;
  offersPickup: boolean | null;
  isRecreational: boolean | null;
  isMedical: boolean | null;
  metadata: Record<string, any>;
}

export interface LocationDiscoveryResult {
  cityId: number;
  citySlug: string;
  locationsFound: number;
  locationsInserted: number;
  locationsUpdated: number;
  locationsSkipped: number;
  errors: string[];
  durationMs: number;
}

// ============================================================
// LOCATION FETCHING
// ============================================================

/**
 * Fetch locations for a city using Puppeteer to scrape the city page
 */
async function fetchLocationsForCity(city: DiscoveryCity): Promise<DutchieLocation[]> {
  console.log(`[DutchieLocationDiscovery] Fetching locations for ${city.cityName}, ${city.stateCode}...`);

  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
  });

  try {
    const page = await browser.newPage();
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    );

    // Navigate to city page - use /us/dispensaries/{city_slug} pattern
    const cityUrl = `https://dutchie.com/us/dispensaries/${city.citySlug}`;
    console.log(`[DutchieLocationDiscovery] Navigating to ${cityUrl}...`);

    await page.goto(cityUrl, {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });

    // Wait for content
    await new Promise((r) => setTimeout(r, 3000));

    // Try to extract __NEXT_DATA__ which often contains store data
    const nextData = await page.evaluate(() => {
      const script = document.querySelector('script#__NEXT_DATA__');
      if (script) {
        try {
          return JSON.parse(script.textContent || '{}');
        } catch {
          return null;
        }
      }
      return null;
    });
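
    // __NEXT_DATA__ is the JSON payload Next.js embeds in every
    // server-rendered page (<script id="__NEXT_DATA__" type="application/json">).
    // Reading it yields structured dispensary records directly and is far
    // less brittle than the DOM-selector fallback below.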

    let locations: DutchieLocation[] = [];

    if (nextData?.props?.pageProps?.dispensaries) {
      // Extract from Next.js data
      const dispensaries = nextData.props.pageProps.dispensaries;
      console.log(`[DutchieLocationDiscovery] Found ${dispensaries.length} dispensaries in __NEXT_DATA__`);

      locations = dispensaries.map((d: any) => parseDispensaryData(d, city));
    } else {
      // Fall back to DOM scraping
      console.log('[DutchieLocationDiscovery] No __NEXT_DATA__, trying DOM scraping...');

      const scrapedData = await page.evaluate(() => {
        const stores: Array<{
          name: string;
          href: string;
          address: string | null;
        }> = [];

        // Look for dispensary cards/links
        const cards = document.querySelectorAll('[data-testid="dispensary-card"], .dispensary-card, a[href*="/dispensary/"]');
        cards.forEach((card) => {
          const link = card.querySelector('a[href*="/dispensary/"]') || (card as HTMLAnchorElement);
          const href = (link as HTMLAnchorElement).href || '';
          const name =
            card.querySelector('[data-testid="dispensary-name"]')?.textContent ||
            card.querySelector('h2, h3, .name')?.textContent ||
            link.textContent ||
            '';
          const address = card.querySelector('[data-testid="dispensary-address"], .address')?.textContent || null;

          if (href && name) {
            stores.push({
              name: name.trim(),
              href,
              address: address?.trim() || null,
            });
          }
        });

        return stores;
      });

      console.log(`[DutchieLocationDiscovery] DOM scraping found ${scrapedData.length} stores`);

      locations = scrapedData.map((s) => {
        // Parse slug from URL
        const match = s.href.match(/\/dispensary\/([^/?]+)/);
        const slug = match ? match[1] : s.name.toLowerCase().replace(/\s+/g, '-');

        return {
          platformLocationId: slug, // Will be resolved later
          platformSlug: slug,
          platformMenuUrl: `https://dutchie.com/dispensary/${slug}`,
          name: s.name,
          rawAddress: s.address,
          addressLine1: null,
          addressLine2: null,
          city: city.cityName,
          stateCode: city.stateCode,
          postalCode: null,
          countryCode: city.countryCode,
          latitude: null,
          longitude: null,
          timezone: null,
          offersDelivery: null,
          offersPickup: null,
          isRecreational: null,
          isMedical: null,
          metadata: { source: 'dom_scrape', originalUrl: s.href },
        };
      });
    }

    return locations;
  } finally {
    await browser.close();
  }
}
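
// Note: rows produced by the DOM fallback use the slug as a provisional
// platformLocationId (marked "Will be resolved later" above), so downstream
// consumers should expect those ids to be re-keyed once a real platform id
// becomes available.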

/**
 * Parse dispensary data from Dutchie's API/JSON response
 */
function parseDispensaryData(d: any, city: DiscoveryCity): DutchieLocation {
  const id = d.id || d._id || d.dispensaryId || '';
  const slug = d.slug || d.cName || d.name?.toLowerCase().replace(/\s+/g, '-') || '';

  // Build menu URL
  let menuUrl = `https://dutchie.com/dispensary/${slug}`;
  if (d.menuUrl) {
    menuUrl = d.menuUrl;
  } else if (d.embeddedMenuUrl) {
    menuUrl = d.embeddedMenuUrl;
  }

  // Parse address
  const address = d.address || d.location?.address || {};
  const rawAddress = [
    address.line1 || address.street1 || d.address1,
    address.line2 || address.street2 || d.address2,
    [
      address.city || d.city,
      address.state || address.stateCode || d.state,
      address.zip || address.zipCode || address.postalCode || d.zip,
    ]
      .filter(Boolean)
      .join(' '),
  ]
    .filter(Boolean)
    .join(', ');

  return {
    platformLocationId: id,
    platformSlug: slug,
    platformMenuUrl: menuUrl,
    name: d.name || d.dispensaryName || '',
    rawAddress: rawAddress || null,
    addressLine1: address.line1 || address.street1 || d.address1 || null,
    addressLine2: address.line2 || address.street2 || d.address2 || null,
    city: address.city || d.city || city.cityName,
    stateCode: address.state || address.stateCode || d.state || city.stateCode,
    postalCode: address.zip || address.zipCode || address.postalCode || d.zip || null,
    countryCode: address.country || address.countryCode || d.country || city.countryCode,
    latitude: d.latitude ?? d.location?.latitude ?? d.location?.lat ?? null,
    longitude: d.longitude ?? d.location?.longitude ?? d.location?.lng ?? null,
    timezone: d.timezone || d.timeZone || null,
    offersDelivery: d.offerDelivery ?? d.offersDelivery ?? d.delivery ?? null,
    offersPickup: d.offerPickup ?? d.offersPickup ?? d.pickup ?? null,
    isRecreational: d.isRecreational ?? d.recreational ?? (d.retailType === 'recreational' || d.retailType === 'both'),
    isMedical: d.isMedical ?? d.medical ?? (d.retailType === 'medical' || d.retailType === 'both'),
    metadata: {
      source: 'next_data',
      retailType: d.retailType,
      brand: d.brand,
      logo: d.logo || d.logoUrl,
      raw: d,
    },
  };
}
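
// Example of the fallback-chain assembly above: a payload with
// address1 = "123 Main St", city = "Phoenix", state = "AZ", zip = "85001"
// yields rawAddress = "123 Main St, Phoenix AZ 85001". Note the mixed
// operators: the ?? chains preserve explicit false/0 values (coords, flags),
// while the || chains deliberately treat '' as missing (strings).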

/**
 * Alternative: Use GraphQL to discover locations
 */
async function fetchLocationsViaGraphQL(city: DiscoveryCity): Promise<DutchieLocation[]> {
  console.log(`[DutchieLocationDiscovery] Trying GraphQL for ${city.cityName}...`);

  // Try geo-based search
  // This would require knowing the city's coordinates
  // For now, return empty and rely on page scraping
  return [];
}

// ============================================================
// DATABASE OPERATIONS
// ============================================================

/**
 * Upsert a location into dutchie_discovery_locations
 * Does NOT overwrite status if already verified/merged/rejected
 * Does NOT overwrite dispensary_id if already set
 */
async function upsertLocation(
  pool: Pool,
  location: DutchieLocation,
  cityId: number
): Promise<{ inserted: boolean; updated: boolean; skipped: boolean }> {
  // First check if this location exists and has a protected status
  const existing = await pool.query(
    `
    SELECT id, status, dispensary_id
    FROM dutchie_discovery_locations
    WHERE platform = 'dutchie' AND platform_location_id = $1
    `,
    [location.platformLocationId]
  );

  if (existing.rows.length > 0) {
    const row = existing.rows[0];
    const protectedStatuses = ['verified', 'merged', 'rejected'];

    if (protectedStatuses.includes(row.status)) {
      // Only update last_seen_at for protected statuses
      await pool.query(
        `
        UPDATE dutchie_discovery_locations
        SET last_seen_at = NOW(), updated_at = NOW()
        WHERE id = $1
        `,
        [row.id]
      );
      return { inserted: false, updated: false, skipped: true };
    }

    // Update existing discovered location (but preserve dispensary_id if set)
    await pool.query(
      `
      UPDATE dutchie_discovery_locations
      SET
        platform_slug = $2,
        platform_menu_url = $3,
        name = $4,
        raw_address = COALESCE($5, raw_address),
        address_line1 = COALESCE($6, address_line1),
        address_line2 = COALESCE($7, address_line2),
        city = COALESCE($8, city),
        state_code = COALESCE($9, state_code),
        postal_code = COALESCE($10, postal_code),
        country_code = COALESCE($11, country_code),
        latitude = COALESCE($12, latitude),
        longitude = COALESCE($13, longitude),
        timezone = COALESCE($14, timezone),
        offers_delivery = COALESCE($15, offers_delivery),
        offers_pickup = COALESCE($16, offers_pickup),
        is_recreational = COALESCE($17, is_recreational),
        is_medical = COALESCE($18, is_medical),
        metadata = COALESCE($19, metadata),
        discovery_city_id = $20,
        last_seen_at = NOW(),
        updated_at = NOW()
      WHERE id = $1
      `,
      [
        row.id,
        location.platformSlug,
        location.platformMenuUrl,
        location.name,
        location.rawAddress,
        location.addressLine1,
        location.addressLine2,
        location.city,
        location.stateCode,
        location.postalCode,
        location.countryCode,
        location.latitude,
        location.longitude,
        location.timezone,
        location.offersDelivery,
        location.offersPickup,
        location.isRecreational,
        location.isMedical,
        JSON.stringify(location.metadata),
        cityId,
      ]
    );
    return { inserted: false, updated: true, skipped: false };
  }

  // Insert new location
  await pool.query(
    `
    INSERT INTO dutchie_discovery_locations (
      platform,
      platform_location_id,
      platform_slug,
      platform_menu_url,
      name,
      raw_address,
      address_line1,
      address_line2,
      city,
      state_code,
      postal_code,
      country_code,
      latitude,
      longitude,
      timezone,
      status,
      offers_delivery,
      offers_pickup,
      is_recreational,
      is_medical,
      metadata,
      discovery_city_id,
      first_seen_at,
      last_seen_at,
      active,
      created_at,
      updated_at
    ) VALUES (
      'dutchie',
      $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14,
      'discovered',
      $15, $16, $17, $18, $19, $20,
      NOW(), NOW(), TRUE, NOW(), NOW()
    )
    `,
    [
      location.platformLocationId,
      location.platformSlug,
      location.platformMenuUrl,
      location.name,
      location.rawAddress,
      location.addressLine1,
      location.addressLine2,
      location.city,
      location.stateCode,
      location.postalCode,
      location.countryCode,
      location.latitude,
      location.longitude,
      location.timezone,
      location.offersDelivery,
      location.offersPickup,
      location.isRecreational,
      location.isMedical,
      JSON.stringify(location.metadata),
      cityId,
    ]
  );

  return { inserted: true, updated: false, skipped: false };
}
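
// Design note: this is a read-then-write sequence rather than a single
// ON CONFLICT upsert because three outcomes must diverge (skip protected
// rows, COALESCE-merge discovered rows, insert new rows). The sequence is
// not atomic, which is acceptable for a single sequential discovery run;
// concurrent runners would want this wrapped in a transaction.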

// ============================================================
// MAIN DISCOVERY CLASS
// ============================================================

export class DutchieLocationDiscovery {
  private pool: Pool;

  constructor(pool: Pool) {
    this.pool = pool;
  }

  /**
   * Get a city by slug
   */
  async getCityBySlug(citySlug: string): Promise<DiscoveryCity | null> {
    const { rows } = await this.pool.query(
      `
      SELECT id, platform, city_name, city_slug, state_code, country_code, crawl_enabled
      FROM dutchie_discovery_cities
      WHERE platform = 'dutchie' AND city_slug = $1
      LIMIT 1
      `,
      [citySlug]
    );

    if (rows.length === 0) return null;

    const r = rows[0];
    return {
      id: r.id,
      platform: r.platform,
      cityName: r.city_name,
      citySlug: r.city_slug,
      stateCode: r.state_code,
      countryCode: r.country_code,
      crawlEnabled: r.crawl_enabled,
    };
  }

  /**
   * Get all crawl-enabled cities
   */
  async getEnabledCities(limit?: number): Promise<DiscoveryCity[]> {
    const { rows } = await this.pool.query(
      `
      SELECT id, platform, city_name, city_slug, state_code, country_code, crawl_enabled
      FROM dutchie_discovery_cities
      WHERE platform = 'dutchie' AND crawl_enabled = TRUE
      ORDER BY last_crawled_at ASC NULLS FIRST, city_name ASC
      ${limit ? `LIMIT ${limit}` : ''}
      `
    );

    return rows.map((r) => ({
      id: r.id,
      platform: r.platform,
      cityName: r.city_name,
      citySlug: r.city_slug,
      stateCode: r.state_code,
      countryCode: r.country_code,
      crawlEnabled: r.crawl_enabled,
    }));
  }
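
  // `ORDER BY last_crawled_at ASC NULLS FIRST` pulls never-crawled cities
  // off the queue first, then the most stale ones, so repeated runs
  // round-robin through the enabled set. Interpolating `limit` into the SQL
  // is safe only because it is typed as a number upstream; a parameterized
  // `LIMIT $1` would be the stricter choice.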

  /**
   * Discover locations for a single city
   */
  async discoverForCity(city: DiscoveryCity): Promise<LocationDiscoveryResult> {
    const startTime = Date.now();
    const errors: string[] = [];
    let locationsFound = 0;
    let locationsInserted = 0;
    let locationsUpdated = 0;
    let locationsSkipped = 0;

    console.log(`[DutchieLocationDiscovery] Discovering locations for ${city.cityName}, ${city.stateCode}...`);

    try {
      // Fetch locations
      let locations = await fetchLocationsForCity(city);

      // If scraping fails, try GraphQL
      if (locations.length === 0) {
        locations = await fetchLocationsViaGraphQL(city);
      }

      locationsFound = locations.length;
      console.log(`[DutchieLocationDiscovery] Found ${locationsFound} locations`);

      // Upsert each location
      for (const location of locations) {
        try {
          const result = await upsertLocation(this.pool, location, city.id);
          if (result.inserted) locationsInserted++;
          else if (result.updated) locationsUpdated++;
          else if (result.skipped) locationsSkipped++;
        } catch (error: any) {
          const msg = `Failed to upsert location ${location.platformSlug}: ${error.message}`;
          console.error(`[DutchieLocationDiscovery] ${msg}`);
          errors.push(msg);
        }
      }

      // Update city's last_crawled_at and location_count
      await this.pool.query(
        `
        UPDATE dutchie_discovery_cities
        SET last_crawled_at = NOW(),
            location_count = $1,
            updated_at = NOW()
        WHERE id = $2
        `,
        [locationsFound, city.id]
      );
    } catch (error: any) {
      const msg = `Location discovery failed for ${city.citySlug}: ${error.message}`;
      console.error(`[DutchieLocationDiscovery] ${msg}`);
      errors.push(msg);
    }

    const durationMs = Date.now() - startTime;

    console.log(`[DutchieLocationDiscovery] City ${city.citySlug} complete:`);
    console.log(`  Locations found: ${locationsFound}`);
    console.log(`  Inserted: ${locationsInserted}`);
    console.log(`  Updated: ${locationsUpdated}`);
    console.log(`  Skipped (protected): ${locationsSkipped}`);
    console.log(`  Errors: ${errors.length}`);
    console.log(`  Duration: ${(durationMs / 1000).toFixed(1)}s`);

    return {
      cityId: city.id,
      citySlug: city.citySlug,
      locationsFound,
      locationsInserted,
      locationsUpdated,
      locationsSkipped,
      errors,
      durationMs,
    };
  }

  /**
   * Discover locations for all enabled cities
   */
  async discoverAllEnabled(options: {
    limit?: number;
    delayMs?: number;
  } = {}): Promise<{
    totalCities: number;
    totalLocationsFound: number;
    totalInserted: number;
    totalUpdated: number;
    totalSkipped: number;
    errors: string[];
    durationMs: number;
  }> {
    const { limit, delayMs = 2000 } = options;
    const startTime = Date.now();
    let totalLocationsFound = 0;
    let totalInserted = 0;
    let totalUpdated = 0;
    let totalSkipped = 0;
    const allErrors: string[] = [];

    const cities = await this.getEnabledCities(limit);
    console.log(`[DutchieLocationDiscovery] Discovering locations for ${cities.length} cities...`);

    for (let i = 0; i < cities.length; i++) {
      const city = cities[i];
      console.log(`\n[DutchieLocationDiscovery] City ${i + 1}/${cities.length}: ${city.cityName}, ${city.stateCode}`);

      try {
        const result = await this.discoverForCity(city);
        totalLocationsFound += result.locationsFound;
        totalInserted += result.locationsInserted;
        totalUpdated += result.locationsUpdated;
        totalSkipped += result.locationsSkipped;
        allErrors.push(...result.errors);
      } catch (error: any) {
        allErrors.push(`City ${city.citySlug} failed: ${error.message}`);
      }

      // Delay between cities
      if (i < cities.length - 1 && delayMs > 0) {
        await new Promise((r) => setTimeout(r, delayMs));
      }
    }

    const durationMs = Date.now() - startTime;

    console.log('\n[DutchieLocationDiscovery] All cities complete:');
    console.log(`  Total cities: ${cities.length}`);
    console.log(`  Total locations found: ${totalLocationsFound}`);
    console.log(`  Total inserted: ${totalInserted}`);
    console.log(`  Total updated: ${totalUpdated}`);
    console.log(`  Total skipped: ${totalSkipped}`);
    console.log(`  Total errors: ${allErrors.length}`);
    console.log(`  Duration: ${(durationMs / 1000).toFixed(1)}s`);

    return {
      totalCities: cities.length,
      totalLocationsFound,
      totalInserted,
      totalUpdated,
      totalSkipped,
      errors: allErrors,
      durationMs,
    };
  }
}

export default DutchieLocationDiscovery;
73
backend/src/dutchie-az/discovery/discovery-dt-cities-auto.ts
Normal file
@@ -0,0 +1,73 @@
#!/usr/bin/env npx tsx
/**
 * Discovery Entrypoint: Dutchie Cities (Auto)
 *
 * Attempts browser/API-based /cities discovery.
 * Even if currently blocked (403), this runner preserves the auto-discovery path.
 *
 * Usage:
 *   npm run discovery:dt:cities:auto
 *   DATABASE_URL="..." npx tsx src/dutchie-az/discovery/discovery-dt-cities-auto.ts
 */

import { Pool } from 'pg';
import { DtCityDiscoveryService } from './DtCityDiscoveryService';

const DB_URL = process.env.DATABASE_URL || process.env.CANNAIQ_DB_URL ||
  'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';

async function main() {
  console.log('╔══════════════════════════════════════════════════╗');
  console.log('║   Dutchie City Discovery (AUTO)                  ║');
  console.log('║   Browser + API fallback                         ║');
  console.log('╚══════════════════════════════════════════════════╝');
  console.log(`\nDatabase: ${DB_URL.replace(/:[^:@]+@/, ':****@')}`);

  const pool = new Pool({ connectionString: DB_URL });

  try {
    const { rows } = await pool.query('SELECT NOW() as time');
    console.log(`Connected at: ${rows[0].time}\n`);

    const service = new DtCityDiscoveryService(pool);
    const result = await service.runAutoDiscovery();

    console.log('\n' + '═'.repeat(50));
    console.log('SUMMARY');
    console.log('═'.repeat(50));
    console.log(`Cities found:    ${result.citiesFound}`);
    console.log(`Cities inserted: ${result.citiesInserted}`);
    console.log(`Cities updated:  ${result.citiesUpdated}`);
    console.log(`Errors:          ${result.errors.length}`);
    console.log(`Duration:        ${(result.durationMs / 1000).toFixed(1)}s`);

    if (result.errors.length > 0) {
      console.log('\nErrors:');
      result.errors.forEach((e, i) => console.log(`  ${i + 1}. ${e}`));
    }

    const stats = await service.getStats();
    console.log('\nCurrent Database Stats:');
    console.log(`  Total cities:  ${stats.total}`);
    console.log(`  Crawl enabled: ${stats.crawlEnabled}`);
    console.log(`  Never crawled: ${stats.neverCrawled}`);

    if (result.citiesFound === 0) {
      console.log('\n⚠️  No cities found via auto-discovery.');
      console.log('   This may be due to Dutchie blocking scraping/API access.');
      console.log('   Use manual seeding instead:');
      console.log('   npm run discovery:dt:cities:manual -- --city-slug=ny-hudson --city-name=Hudson --state-code=NY');
      process.exit(1);
    }

    console.log('\n✅ Auto city discovery completed');
    process.exit(0);
  } catch (error: any) {
    console.error('\n❌ Auto city discovery failed:', error.message);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();
@@ -0,0 +1,137 @@
#!/usr/bin/env npx tsx
/**
 * Discovery Entrypoint: Dutchie Cities (Manual Seed)
 *
 * Manually seeds cities into dutchie_discovery_cities via CLI args.
 * Use this when auto-discovery is blocked (403).
 *
 * Usage:
 *   npm run discovery:dt:cities:manual -- --city-slug=ny-hudson --city-name=Hudson --state-code=NY
 *   npm run discovery:dt:cities:manual -- --city-slug=ma-boston --city-name=Boston --state-code=MA --country-code=US
 *
 * Options:
 *   --city-slug     Required. URL slug (e.g., "ny-hudson")
 *   --city-name     Required. Display name (e.g., "Hudson")
 *   --state-code    Required. State/province code (e.g., "NY", "CA", "ON")
 *   --country-code  Optional. Country code (default: "US")
 *
 * After seeding, run location discovery:
 *   npm run discovery:dt:locations
 */

import { Pool } from 'pg';
import { DtCityDiscoveryService, DutchieCity } from './DtCityDiscoveryService';

const DB_URL = process.env.DATABASE_URL || process.env.CANNAIQ_DB_URL ||
  'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';

interface Args {
  citySlug?: string;
  cityName?: string;
  stateCode?: string;
  countryCode: string;
}

function parseArgs(): Args {
  const args: Args = { countryCode: 'US' };

  for (const arg of process.argv.slice(2)) {
    const citySlugMatch = arg.match(/--city-slug=(.+)/);
    if (citySlugMatch) args.citySlug = citySlugMatch[1];

    const cityNameMatch = arg.match(/--city-name=(.+)/);
    if (cityNameMatch) args.cityName = cityNameMatch[1];

    const stateCodeMatch = arg.match(/--state-code=(.+)/);
    if (stateCodeMatch) args.stateCode = stateCodeMatch[1].toUpperCase();

    const countryCodeMatch = arg.match(/--country-code=(.+)/);
    if (countryCodeMatch) args.countryCode = countryCodeMatch[1].toUpperCase();
  }

  return args;
}
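
// Note: flags must use the --key=value form; a space-separated value
// (e.g. `--city-slug ny-hudson`) arrives as two argv entries and will not
// match these regexes.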

function printUsage() {
  console.log(`
Usage:
  npm run discovery:dt:cities:manual -- --city-slug=<slug> --city-name=<name> --state-code=<state>

Required arguments:
  --city-slug     URL slug for the city (e.g., "ny-hudson", "ma-boston")
  --city-name     Display name (e.g., "Hudson", "Boston")
  --state-code    State/province code (e.g., "NY", "CA", "ON")

Optional arguments:
  --country-code  Country code (default: "US")

Examples:
  npm run discovery:dt:cities:manual -- --city-slug=ny-hudson --city-name=Hudson --state-code=NY
  npm run discovery:dt:cities:manual -- --city-slug=ca-los-angeles --city-name="Los Angeles" --state-code=CA
  npm run discovery:dt:cities:manual -- --city-slug=on-toronto --city-name=Toronto --state-code=ON --country-code=CA

After seeding, run location discovery:
  npm run discovery:dt:locations
`);
}

async function main() {
  const args = parseArgs();

  console.log('╔══════════════════════════════════════════════════╗');
  console.log('║   Dutchie City Discovery (MANUAL SEED)           ║');
  console.log('╚══════════════════════════════════════════════════╝');

  if (!args.citySlug || !args.cityName || !args.stateCode) {
    console.error('\n❌ Error: Missing required arguments\n');
    printUsage();
    process.exit(1);
  }

  console.log(`\nCity Slug:    ${args.citySlug}`);
  console.log(`City Name:    ${args.cityName}`);
  console.log(`State Code:   ${args.stateCode}`);
  console.log(`Country Code: ${args.countryCode}`);
  console.log(`Database:     ${DB_URL.replace(/:[^:@]+@/, ':****@')}`);

  const pool = new Pool({ connectionString: DB_URL });

  try {
    const { rows } = await pool.query('SELECT NOW() as time');
    console.log(`\nConnected at: ${rows[0].time}`);

    const service = new DtCityDiscoveryService(pool);

    const city: DutchieCity = {
      slug: args.citySlug,
      name: args.cityName,
      stateCode: args.stateCode,
      countryCode: args.countryCode,
    };

    const result = await service.seedCity(city);

    const action = result.wasInserted ? 'INSERTED' : 'UPDATED';
    console.log(`\n✅ City ${action}:`);
    console.log(`   ID:           ${result.id}`);
    console.log(`   City Slug:    ${result.city.slug}`);
    console.log(`   City Name:    ${result.city.name}`);
    console.log(`   State Code:   ${result.city.stateCode}`);
    console.log(`   Country Code: ${result.city.countryCode}`);

    const stats = await service.getStats();
    console.log(`\nTotal Dutchie cities: ${stats.total} (${stats.crawlEnabled} enabled)`);

    console.log('\n📍 Next step: Run location discovery');
    console.log('   npm run discovery:dt:locations');

    process.exit(0);
  } catch (error: any) {
    console.error('\n❌ Failed to seed city:', error.message);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();
73
backend/src/dutchie-az/discovery/discovery-dt-cities.ts
Normal file
@@ -0,0 +1,73 @@
#!/usr/bin/env npx tsx
/**
 * Discovery Runner: Dutchie Cities
 *
 * Discovers cities from Dutchie's /cities page and upserts to dutchie_discovery_cities.
 *
 * Usage:
 *   npm run discovery:platforms:dt:cities
 *   DATABASE_URL="..." npx tsx src/dutchie-az/discovery/discovery-dt-cities.ts
 */

import { Pool } from 'pg';
import { DutchieCityDiscovery } from './DutchieCityDiscovery';

const DB_URL = process.env.DATABASE_URL || process.env.CANNAIQ_DB_URL ||
  'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';

async function main() {
  console.log('╔══════════════════════════════════════════════════╗');
  console.log('║   Dutchie City Discovery Runner                  ║');
  console.log('╚══════════════════════════════════════════════════╝');
  console.log(`\nDatabase: ${DB_URL.replace(/:[^:@]+@/, ':****@')}`);

  const pool = new Pool({ connectionString: DB_URL });

  try {
    // Test DB connection
    const { rows } = await pool.query('SELECT NOW() as time');
    console.log(`Connected at: ${rows[0].time}\n`);

    // Run city discovery
    const discovery = new DutchieCityDiscovery(pool);
    const result = await discovery.run();

    // Print summary
    console.log('\n' + '═'.repeat(50));
    console.log('SUMMARY');
    console.log('═'.repeat(50));
    console.log(`Cities found:    ${result.citiesFound}`);
    console.log(`Cities inserted: ${result.citiesInserted}`);
    console.log(`Cities updated:  ${result.citiesUpdated}`);
    console.log(`Errors:          ${result.errors.length}`);
    console.log(`Duration:        ${(result.durationMs / 1000).toFixed(1)}s`);

    if (result.errors.length > 0) {
      console.log('\nErrors:');
      result.errors.forEach((e, i) => console.log(`  ${i + 1}. ${e}`));
    }

    // Get final stats
    const stats = await discovery.getStats();
    console.log('\nCurrent Database Stats:');
    console.log(`  Total cities:  ${stats.total}`);
    console.log(`  Crawl enabled: ${stats.crawlEnabled}`);
    console.log(`  Never crawled: ${stats.neverCrawled}`);
    console.log(`  By country:    ${stats.byCountry.map(c => `${c.countryCode}=${c.count}`).join(', ')}`);

    if (result.errors.length > 0) {
      console.log('\n⚠️  Completed with errors');
      process.exit(1);
    }

    console.log('\n✅ City discovery completed successfully');
    process.exit(0);
  } catch (error: any) {
    console.error('\n❌ City discovery failed:', error.message);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();
@@ -0,0 +1,113 @@
#!/usr/bin/env npx tsx
/**
 * Discovery Entrypoint: Dutchie Locations (From Cities)
 *
 * Reads from dutchie_discovery_cities (crawl_enabled = true)
 * and discovers store locations for each city.
 *
 * Geo coordinates are captured when available from Dutchie's payloads.
 *
 * Usage:
 *   npm run discovery:dt:locations
 *   npm run discovery:dt:locations -- --limit=10
 *   npm run discovery:dt:locations -- --delay=3000
 *   DATABASE_URL="..." npx tsx src/dutchie-az/discovery/discovery-dt-locations-from-cities.ts
 *
 * Options:
 *   --limit=N   Only process N cities (default: all)
 *   --delay=N   Delay between cities in ms (default: 2000)
 */

import { Pool } from 'pg';
import { DtLocationDiscoveryService } from './DtLocationDiscoveryService';

const DB_URL = process.env.DATABASE_URL || process.env.CANNAIQ_DB_URL ||
  'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';

function parseArgs(): { limit?: number; delay?: number } {
  const args: { limit?: number; delay?: number } = {};

  for (const arg of process.argv.slice(2)) {
    const limitMatch = arg.match(/--limit=(\d+)/);
    if (limitMatch) args.limit = parseInt(limitMatch[1], 10);

    const delayMatch = arg.match(/--delay=(\d+)/);
    if (delayMatch) args.delay = parseInt(delayMatch[1], 10);
  }

  return args;
}

async function main() {
  const args = parseArgs();

  console.log('╔══════════════════════════════════════════════════╗');
  console.log('║   Dutchie Location Discovery (From Cities)       ║');
  console.log('║   Reads crawl_enabled cities, discovers stores   ║');
  console.log('╚══════════════════════════════════════════════════╝');
  console.log(`\nDatabase: ${DB_URL.replace(/:[^:@]+@/, ':****@')}`);
  if (args.limit) console.log(`City limit: ${args.limit}`);
  if (args.delay) console.log(`Delay: ${args.delay}ms`);

  const pool = new Pool({ connectionString: DB_URL });

  try {
    const { rows } = await pool.query('SELECT NOW() as time');
    console.log(`Connected at: ${rows[0].time}\n`);

    const service = new DtLocationDiscoveryService(pool);
    const result = await service.discoverAllEnabled({
      limit: args.limit,
      delayMs: args.delay ?? 2000,
    });

    console.log('\n' + '═'.repeat(50));
    console.log('SUMMARY');
    console.log('═'.repeat(50));
    console.log(`Cities processed:   ${result.totalCities}`);
    console.log(`Locations found:    ${result.totalLocationsFound}`);
    console.log(`Locations inserted: ${result.totalInserted}`);
    console.log(`Locations updated:  ${result.totalUpdated}`);
    console.log(`Locations skipped:  ${result.totalSkipped} (protected status)`);
    console.log(`Errors:             ${result.errors.length}`);
    console.log(`Duration:           ${(result.durationMs / 1000).toFixed(1)}s`);

    if (result.errors.length > 0) {
      console.log('\nErrors (first 10):');
      result.errors.slice(0, 10).forEach((e, i) => console.log(`  ${i + 1}. ${e}`));
      if (result.errors.length > 10) {
        console.log(`  ... and ${result.errors.length - 10} more`);
      }
    }

    // Get location stats including coordinates
    const stats = await service.getStats();
    console.log('\nCurrent Database Stats:');
    console.log(`  Total locations:  ${stats.total}`);
    console.log(`  With coordinates: ${stats.withCoordinates}`);
    console.log(`  By status:`);
    stats.byStatus.forEach(s => console.log(`    ${s.status}: ${s.count}`));

    if (result.totalCities === 0) {
      console.log('\n⚠️  No crawl-enabled cities found.');
      console.log('   Seed cities first:');
      console.log('   npm run discovery:dt:cities:manual -- --city-slug=ny-hudson --city-name=Hudson --state-code=NY');
      process.exit(1);
    }

    if (result.errors.length > 0) {
      console.log('\n⚠️  Completed with errors');
      process.exit(1);
    }

    console.log('\n✅ Location discovery completed successfully');
    process.exit(0);
  } catch (error: any) {
    console.error('\n❌ Location discovery failed:', error.message);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();
|
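The runner above exits with code 1 and points at the manual seeding script when no crawl-enabled cities exist. For illustration, a minimal TypeScript seeding sketch; the column list mirrors the SELECT in the /cities route later in this commit, but the ON CONFLICT target and the country default are assumptions not confirmed by this diff:

import { Pool } from 'pg';

// Hypothetical one-off seeder for dutchie_discovery_cities.
async function seedCity(pool: Pool, citySlug: string, cityName: string, stateCode: string) {
  await pool.query(
    `
    INSERT INTO dutchie_discovery_cities
      (platform, city_name, city_slug, state_code, country_code, crawl_enabled)
    VALUES ('dutchie', $1, $2, $3, 'US', TRUE)
    ON CONFLICT (platform, city_slug) DO UPDATE SET crawl_enabled = TRUE  -- conflict target assumed
    `,
    [cityName, citySlug, stateCode]
  );
}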
117
backend/src/dutchie-az/discovery/discovery-dt-locations.ts
Normal file
@@ -0,0 +1,117 @@
#!/usr/bin/env npx tsx
/**
 * Discovery Runner: Dutchie Locations
 *
 * Discovers store locations for all crawl-enabled cities and upserts to dutchie_discovery_locations.
 *
 * Usage:
 *   npm run discovery:platforms:dt:locations
 *   npm run discovery:platforms:dt:locations -- --limit=10
 *   DATABASE_URL="..." npx tsx src/dutchie-az/discovery/discovery-dt-locations.ts
 *
 * Options (via args):
 *   --limit=N   Only process N cities (default: all)
 *   --delay=N   Delay between cities in ms (default: 2000)
 */

import { Pool } from 'pg';
import { DutchieLocationDiscovery } from './DutchieLocationDiscovery';

const DB_URL = process.env.DATABASE_URL || process.env.CANNAIQ_DB_URL ||
  'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';

// Parse CLI args
function parseArgs(): { limit?: number; delay?: number } {
  const args: { limit?: number; delay?: number } = {};

  for (const arg of process.argv.slice(2)) {
    const limitMatch = arg.match(/--limit=(\d+)/);
    if (limitMatch) args.limit = parseInt(limitMatch[1], 10);

    const delayMatch = arg.match(/--delay=(\d+)/);
    if (delayMatch) args.delay = parseInt(delayMatch[1], 10);
  }

  return args;
}

async function main() {
  const args = parseArgs();

  console.log('╔══════════════════════════════════════════════════╗');
  console.log('║ Dutchie Location Discovery Runner ║');
  console.log('╚══════════════════════════════════════════════════╝');
  console.log(`\nDatabase: ${DB_URL.replace(/:[^:@]+@/, ':****@')}`);
  if (args.limit) console.log(`City limit: ${args.limit}`);
  if (args.delay) console.log(`Delay: ${args.delay}ms`);

  const pool = new Pool({ connectionString: DB_URL });

  try {
    // Test DB connection
    const { rows } = await pool.query('SELECT NOW() as time');
    console.log(`Connected at: ${rows[0].time}\n`);

    // Run location discovery
    const discovery = new DutchieLocationDiscovery(pool);
    const result = await discovery.discoverAllEnabled({
      limit: args.limit,
      delayMs: args.delay ?? 2000,
    });

    // Print summary
    console.log('\n' + '═'.repeat(50));
    console.log('SUMMARY');
    console.log('═'.repeat(50));
    console.log(`Cities processed: ${result.totalCities}`);
    console.log(`Locations found: ${result.totalLocationsFound}`);
    console.log(`Locations inserted: ${result.totalInserted}`);
    console.log(`Locations updated: ${result.totalUpdated}`);
    console.log(`Locations skipped: ${result.totalSkipped} (protected status)`);
    console.log(`Errors: ${result.errors.length}`);
    console.log(`Duration: ${(result.durationMs / 1000).toFixed(1)}s`);

    if (result.errors.length > 0) {
      console.log('\nErrors (first 10):');
      result.errors.slice(0, 10).forEach((e, i) => console.log(`  ${i + 1}. ${e}`));
      if (result.errors.length > 10) {
        console.log(`  ... and ${result.errors.length - 10} more`);
      }
    }

    // Get DB counts
    const { rows: countRows } = await pool.query(`
      SELECT
        COUNT(*) as total,
        COUNT(*) FILTER (WHERE status = 'discovered') as discovered,
        COUNT(*) FILTER (WHERE status = 'verified') as verified,
        COUNT(*) FILTER (WHERE status = 'merged') as merged,
        COUNT(*) FILTER (WHERE status = 'rejected') as rejected
      FROM dutchie_discovery_locations
      WHERE platform = 'dutchie' AND active = TRUE
    `);

    const counts = countRows[0];
    console.log('\nCurrent Database Stats:');
    console.log(`  Total locations: ${counts.total}`);
    console.log(`  Status discovered: ${counts.discovered}`);
    console.log(`  Status verified: ${counts.verified}`);
    console.log(`  Status merged: ${counts.merged}`);
    console.log(`  Status rejected: ${counts.rejected}`);

    if (result.errors.length > 0) {
      console.log('\n⚠️ Completed with errors');
      process.exit(1);
    }

    console.log('\n✅ Location discovery completed successfully');
    process.exit(0);
  } catch (error: any) {
    console.error('\n❌ Location discovery failed:', error.message);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();
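Both runners use the process exit code as their only machine-readable contract: 0 on success, 1 on errors or when no crawl-enabled cities exist. A hedged sketch of a CI wrapper built on that contract alone (the wrapper itself is hypothetical, not part of this commit):

import { spawnSync } from 'node:child_process';

// Run the locations runner and gate on its documented exit-code semantics.
const run = spawnSync(
  'npx',
  ['tsx', 'src/dutchie-az/discovery/discovery-dt-locations.ts', '--limit=10'],
  { stdio: 'inherit' }
);
if (run.status !== 0) {
  console.error(`Discovery failed with exit code ${run.status}`);
  process.exit(run.status ?? 1);
}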
10
backend/src/dutchie-az/discovery/index.ts
Normal file
@@ -0,0 +1,10 @@
/**
 * Dutchie Discovery Module
 *
 * Store discovery pipeline for Dutchie platform.
 */

export { DutchieCityDiscovery } from './DutchieCityDiscovery';
export { DutchieLocationDiscovery } from './DutchieLocationDiscovery';
export { createDutchieDiscoveryRoutes } from './routes';
export { promoteDiscoveryLocation } from './promoteDiscoveryLocation';
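The barrel file lets consumers pull the whole pipeline from one path; for example (the consumer's relative path is illustrative):

import {
  DutchieCityDiscovery,
  DutchieLocationDiscovery,
  createDutchieDiscoveryRoutes,
  promoteDiscoveryLocation,
} from './dutchie-az/discovery'; // path as seen from a hypothetical src/ consumer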
248
backend/src/dutchie-az/discovery/promoteDiscoveryLocation.ts
Normal file
@@ -0,0 +1,248 @@
/**
 * Promote Discovery Location to Crawlable Dispensary
 *
 * When a discovery location is verified or merged:
 * 1. Ensure a crawl profile exists for the dispensary
 * 2. Seed/update crawl schedule
 * 3. Create initial crawl job
 */

import { Pool } from 'pg';

export interface PromotionResult {
  success: boolean;
  discoveryId: number;
  dispensaryId: number;
  crawlProfileId?: number;
  scheduleUpdated?: boolean;
  crawlJobCreated?: boolean;
  error?: string;
}

/**
 * Promote a verified/merged discovery location to a crawlable dispensary.
 *
 * This function:
 * 1. Verifies the discovery location is verified/merged and has a dispensary_id
 * 2. Ensures the dispensary has platform info (menu_type, platform_dispensary_id)
 * 3. Creates/updates a crawler profile if the profile table exists
 * 4. Queues an initial crawl job
 */
export async function promoteDiscoveryLocation(
  pool: Pool,
  discoveryLocationId: number
): Promise<PromotionResult> {
  console.log(`[Promote] Starting promotion for discovery location ${discoveryLocationId}...`);

  // Get the discovery location
  const { rows: locRows } = await pool.query(
    `
    SELECT
      dl.*,
      d.id as disp_id,
      d.name as disp_name,
      d.menu_type as disp_menu_type,
      d.platform_dispensary_id as disp_platform_id
    FROM dutchie_discovery_locations dl
    JOIN dispensaries d ON dl.dispensary_id = d.id
    WHERE dl.id = $1
    `,
    [discoveryLocationId]
  );

  if (locRows.length === 0) {
    return {
      success: false,
      discoveryId: discoveryLocationId,
      dispensaryId: 0,
      error: 'Discovery location not found or not linked to a dispensary',
    };
  }

  const location = locRows[0];

  // Verify status
  if (!['verified', 'merged'].includes(location.status)) {
    return {
      success: false,
      discoveryId: discoveryLocationId,
      dispensaryId: location.dispensary_id || 0,
      error: `Cannot promote: location status is '${location.status}', must be 'verified' or 'merged'`,
    };
  }

  const dispensaryId = location.dispensary_id;
  console.log(`[Promote] Location ${discoveryLocationId} -> Dispensary ${dispensaryId} (${location.disp_name})`);

  // Ensure dispensary has platform info
  if (!location.disp_platform_id) {
    console.log(`[Promote] Updating dispensary with platform info...`);
    await pool.query(
      `
      UPDATE dispensaries
      SET platform_dispensary_id = COALESCE(platform_dispensary_id, $1),
          menu_url = COALESCE(menu_url, $2),
          menu_type = COALESCE(menu_type, 'dutchie'),
          updated_at = NOW()
      WHERE id = $3
      `,
      [location.platform_location_id, location.platform_menu_url, dispensaryId]
    );
  }

  let crawlProfileId: number | undefined;
  let scheduleUpdated = false;
  let crawlJobCreated = false;

  // Check if dispensary_crawler_profiles table exists
  const { rows: tableCheck } = await pool.query(`
    SELECT EXISTS (
      SELECT FROM information_schema.tables
      WHERE table_name = 'dispensary_crawler_profiles'
    ) as exists
  `);

  if (tableCheck[0]?.exists) {
    // Create or get crawler profile
    console.log(`[Promote] Checking crawler profile...`);

    const { rows: profileRows } = await pool.query(
      `
      SELECT id FROM dispensary_crawler_profiles
      WHERE dispensary_id = $1 AND platform = 'dutchie'
      `,
      [dispensaryId]
    );

    if (profileRows.length > 0) {
      crawlProfileId = profileRows[0].id;
      console.log(`[Promote] Using existing profile ${crawlProfileId}`);
    } else {
      // Create new profile
      const profileKey = `dutchie-${location.platform_slug}`;
      const { rows: newProfile } = await pool.query(
        `
        INSERT INTO dispensary_crawler_profiles (
          dispensary_id,
          profile_key,
          profile_name,
          platform,
          config,
          status,
          enabled,
          created_at,
          updated_at
        ) VALUES (
          $1, $2, $3, 'dutchie', $4, 'sandbox', TRUE, NOW(), NOW()
        )
        ON CONFLICT (dispensary_id, platform) DO UPDATE SET
          enabled = TRUE,
          updated_at = NOW()
        RETURNING id
        `,
        [
          dispensaryId,
          profileKey,
          `${location.name} (Dutchie)`,
          JSON.stringify({
            platformDispensaryId: location.platform_location_id,
            platformSlug: location.platform_slug,
            menuUrl: location.platform_menu_url,
            pricingType: 'rec',
            useBothModes: true,
          }),
        ]
      );

      crawlProfileId = newProfile[0]?.id;
      console.log(`[Promote] Created new profile ${crawlProfileId}`);
    }

    // Link profile to dispensary if not already linked
    await pool.query(
      `
      UPDATE dispensaries
      SET active_crawler_profile_id = COALESCE(active_crawler_profile_id, $1),
          updated_at = NOW()
      WHERE id = $2
      `,
      [crawlProfileId, dispensaryId]
    );
  }

  // Check if crawl_jobs table exists and create initial job
  const { rows: jobsTableCheck } = await pool.query(`
    SELECT EXISTS (
      SELECT FROM information_schema.tables
      WHERE table_name = 'crawl_jobs'
    ) as exists
  `);

  if (jobsTableCheck[0]?.exists) {
    // Check if there's already a pending job
    const { rows: existingJobs } = await pool.query(
      `
      SELECT id FROM crawl_jobs
      WHERE dispensary_id = $1 AND status IN ('pending', 'running')
      LIMIT 1
      `,
      [dispensaryId]
    );

    if (existingJobs.length === 0) {
      // Create initial crawl job
      console.log(`[Promote] Creating initial crawl job...`);
      await pool.query(
        `
        INSERT INTO crawl_jobs (
          dispensary_id,
          job_type,
          status,
          priority,
          config,
          created_at,
          updated_at
        ) VALUES (
          $1, 'dutchie_product_crawl', 'pending', 1, $2, NOW(), NOW()
        )
        `,
        [
          dispensaryId,
          JSON.stringify({
            source: 'discovery_promotion',
            discoveryLocationId,
            pricingType: 'rec',
            useBothModes: true,
          }),
        ]
      );
      crawlJobCreated = true;
    } else {
      console.log(`[Promote] Crawl job already exists for dispensary`);
    }
  }

  // Update discovery location notes
  await pool.query(
    `
    UPDATE dutchie_discovery_locations
    SET notes = COALESCE(notes || E'\n', '') || $1,
        updated_at = NOW()
    WHERE id = $2
    `,
    [`Promoted to crawlable at ${new Date().toISOString()}`, discoveryLocationId]
  );

  console.log(`[Promote] Promotion complete for discovery location ${discoveryLocationId}`);

  return {
    success: true,
    discoveryId: discoveryLocationId,
    dispensaryId,
    crawlProfileId,
    scheduleUpdated,
    crawlJobCreated,
  };
}

export default promoteDiscoveryLocation;
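For callers, the whole promotion flow reduces to one awaited call returning a PromotionResult. A minimal sketch, assuming a configured pool; the discovery ID 123 is a placeholder:

import { Pool } from 'pg';
import { promoteDiscoveryLocation } from './promoteDiscoveryLocation';

async function promoteExample() {
  const pool = new Pool({ connectionString: process.env.CANNAIQ_DB_URL });
  const result = await promoteDiscoveryLocation(pool, 123); // placeholder ID
  if (!result.success) {
    console.error(`Promotion failed: ${result.error}`);
  } else {
    console.log(
      `Dispensary ${result.dispensaryId}, profile ${result.crawlProfileId}, job created: ${result.crawlJobCreated}`
    );
  }
  await pool.end();
}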
973
backend/src/dutchie-az/discovery/routes.ts
Normal file
@@ -0,0 +1,973 @@
/**
 * Platform Discovery API Routes (DT = Dutchie)
 *
 * Routes for the platform-specific store discovery pipeline.
 * Mount at /api/discovery/platforms/dt
 *
 * Platform Slug Mapping (for trademark-safe URLs):
 *   dt = Dutchie
 *   jn = Jane (future)
 *   wm = Weedmaps (future)
 *   lf = Leafly (future)
 *   tz = Treez (future)
 *
 * Note: The actual platform value stored in the DB remains 'dutchie'.
 * Only the URL paths use neutral slugs.
 */
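The slug mapping above is documentation only in this file. If it were ever centralized, a lookup table would be the natural shape; a hypothetical sketch, nothing below exists in this commit:

// Hypothetical helper: maps neutral URL slugs to the platform values
// stored in the DB ('dutchie' is the only live platform here).
const PLATFORM_SLUGS: Record<string, string> = {
  dt: 'dutchie',
  jn: 'jane',     // future
  wm: 'weedmaps', // future
  lf: 'leafly',   // future
  tz: 'treez',    // future
};

export function platformForSlug(slug: string): string | undefined {
  return PLATFORM_SLUGS[slug];
}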
import { Router, Request, Response } from 'express';
import { Pool } from 'pg';
import { DutchieCityDiscovery } from './DutchieCityDiscovery';
import { DutchieLocationDiscovery } from './DutchieLocationDiscovery';
import { DiscoveryGeoService } from '../../services/DiscoveryGeoService';
import { GeoValidationService } from '../../services/GeoValidationService';

export function createDutchieDiscoveryRoutes(pool: Pool): Router {
  const router = Router();

  // ============================================================
  // LOCATIONS
  // ============================================================

  /**
   * GET /api/discovery/platforms/dt/locations
   *
   * List discovered locations with filtering.
   *
   * Query params:
   * - status: 'discovered' | 'verified' | 'rejected' | 'merged'
   * - state_code: e.g., 'AZ', 'CA'
   * - country_code: 'US' | 'CA'
   * - unlinked_only: 'true' to show only locations without dispensary_id
   * - search: search by name
   * - limit: number (default 50)
   * - offset: number (default 0)
   */
  router.get('/locations', async (req: Request, res: Response) => {
    try {
      const {
        status,
        state_code,
        country_code,
        unlinked_only,
        search,
        limit = '50',
        offset = '0',
      } = req.query;

      let whereClause = "WHERE platform = 'dutchie' AND active = TRUE";
      const params: any[] = [];
      let paramIndex = 1;

      if (status) {
        whereClause += ` AND status = $${paramIndex}`;
        params.push(status);
        paramIndex++;
      }

      if (state_code) {
        whereClause += ` AND state_code = $${paramIndex}`;
        params.push(state_code);
        paramIndex++;
      }

      if (country_code) {
        whereClause += ` AND country_code = $${paramIndex}`;
        params.push(country_code);
        paramIndex++;
      }

      if (unlinked_only === 'true') {
        whereClause += ' AND dispensary_id IS NULL';
      }

      if (search) {
        whereClause += ` AND (name ILIKE $${paramIndex} OR platform_slug ILIKE $${paramIndex})`;
        params.push(`%${search}%`);
        paramIndex++;
      }

      const limitVal = parseInt(limit as string, 10);
      const offsetVal = parseInt(offset as string, 10);
      params.push(limitVal, offsetVal);

      const { rows } = await pool.query(
        `
        SELECT
          dl.id,
          dl.platform,
          dl.platform_location_id,
          dl.platform_slug,
          dl.platform_menu_url,
          dl.name,
          dl.raw_address,
          dl.address_line1,
          dl.city,
          dl.state_code,
          dl.postal_code,
          dl.country_code,
          dl.latitude,
          dl.longitude,
          dl.status,
          dl.dispensary_id,
          dl.offers_delivery,
          dl.offers_pickup,
          dl.is_recreational,
          dl.is_medical,
          dl.first_seen_at,
          dl.last_seen_at,
          dl.verified_at,
          dl.verified_by,
          dl.notes,
          d.name as dispensary_name
        FROM dutchie_discovery_locations dl
        LEFT JOIN dispensaries d ON dl.dispensary_id = d.id
        ${whereClause}
        ORDER BY dl.first_seen_at DESC
        LIMIT $${paramIndex} OFFSET $${paramIndex + 1}
        `,
        params
      );

      // Get total count
      const countParams = params.slice(0, -2);
      const { rows: countRows } = await pool.query(
        `SELECT COUNT(*) as total FROM dutchie_discovery_locations dl ${whereClause}`,
        countParams
      );

      res.json({
        success: true,
        locations: rows.map((r) => ({
          id: r.id,
          platform: r.platform,
          platformLocationId: r.platform_location_id,
          platformSlug: r.platform_slug,
          platformMenuUrl: r.platform_menu_url,
          name: r.name,
          rawAddress: r.raw_address,
          addressLine1: r.address_line1,
          city: r.city,
          stateCode: r.state_code,
          postalCode: r.postal_code,
          countryCode: r.country_code,
          latitude: r.latitude,
          longitude: r.longitude,
          status: r.status,
          dispensaryId: r.dispensary_id,
          dispensaryName: r.dispensary_name,
          offersDelivery: r.offers_delivery,
          offersPickup: r.offers_pickup,
          isRecreational: r.is_recreational,
          isMedical: r.is_medical,
          firstSeenAt: r.first_seen_at,
          lastSeenAt: r.last_seen_at,
          verifiedAt: r.verified_at,
          verifiedBy: r.verified_by,
          notes: r.notes,
        })),
        total: parseInt(countRows[0]?.total || '0', 10),
        limit: limitVal,
        offset: offsetVal,
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error fetching locations:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });
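A quick client-side sanity check of the listing filters documented above; the base URL assumes the local backend from .env.example (PORT=3010):

async function listDiscovered() {
  // Combine the documented query params: status, state_code, unlinked_only, limit.
  const res = await fetch(
    'http://localhost:3010/api/discovery/platforms/dt/locations?status=discovered&state_code=AZ&unlinked_only=true&limit=25'
  );
  const body = await res.json();
  console.log(body.total, body.locations.length);
}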
  /**
   * GET /api/discovery/platforms/dt/locations/:id
   *
   * Get a single location by ID.
   */
  router.get('/locations/:id', async (req: Request, res: Response) => {
    try {
      const { id } = req.params;

      const { rows } = await pool.query(
        `
        SELECT
          dl.*,
          d.name as dispensary_name,
          d.menu_url as dispensary_menu_url
        FROM dutchie_discovery_locations dl
        LEFT JOIN dispensaries d ON dl.dispensary_id = d.id
        WHERE dl.id = $1
        `,
        [parseInt(id, 10)]
      );

      if (rows.length === 0) {
        return res.status(404).json({ success: false, error: 'Location not found' });
      }

      const r = rows[0];
      res.json({
        success: true,
        location: {
          id: r.id,
          platform: r.platform,
          platformLocationId: r.platform_location_id,
          platformSlug: r.platform_slug,
          platformMenuUrl: r.platform_menu_url,
          name: r.name,
          rawAddress: r.raw_address,
          addressLine1: r.address_line1,
          addressLine2: r.address_line2,
          city: r.city,
          stateCode: r.state_code,
          postalCode: r.postal_code,
          countryCode: r.country_code,
          latitude: r.latitude,
          longitude: r.longitude,
          timezone: r.timezone,
          status: r.status,
          dispensaryId: r.dispensary_id,
          dispensaryName: r.dispensary_name,
          dispensaryMenuUrl: r.dispensary_menu_url,
          offersDelivery: r.offers_delivery,
          offersPickup: r.offers_pickup,
          isRecreational: r.is_recreational,
          isMedical: r.is_medical,
          firstSeenAt: r.first_seen_at,
          lastSeenAt: r.last_seen_at,
          verifiedAt: r.verified_at,
          verifiedBy: r.verified_by,
          notes: r.notes,
          metadata: r.metadata,
        },
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error fetching location:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  // ============================================================
  // VERIFICATION ACTIONS
  // ============================================================

  /**
   * POST /api/discovery/platforms/dt/locations/:id/verify-create
   *
   * Verify a discovered location and create a new canonical dispensary.
   */
  router.post('/locations/:id/verify-create', async (req: Request, res: Response) => {
    const client = await pool.connect();
    try {
      const { id } = req.params;
      const { verifiedBy = 'admin' } = req.body;

      await client.query('BEGIN');

      // Get the discovery location
      const { rows: locRows } = await client.query(
        `SELECT * FROM dutchie_discovery_locations WHERE id = $1 FOR UPDATE`,
        [parseInt(id, 10)]
      );

      if (locRows.length === 0) {
        await client.query('ROLLBACK');
        return res.status(404).json({ success: false, error: 'Location not found' });
      }

      const location = locRows[0];

      if (location.status !== 'discovered') {
        await client.query('ROLLBACK');
        return res.status(400).json({
          success: false,
          error: `Cannot verify: location status is '${location.status}'`,
        });
      }

      // Look up state_id if we have a state_code
      let stateId: number | null = null;
      if (location.state_code) {
        const { rows: stateRows } = await client.query(
          `SELECT id FROM states WHERE code = $1`,
          [location.state_code]
        );
        if (stateRows.length > 0) {
          stateId = stateRows[0].id;
        }
      }

      // Create the canonical dispensary
      const { rows: dispRows } = await client.query(
        `
        INSERT INTO dispensaries (
          name,
          slug,
          address,
          city,
          state,
          zip,
          latitude,
          longitude,
          timezone,
          menu_type,
          menu_url,
          platform_dispensary_id,
          state_id,
          active,
          created_at,
          updated_at
        ) VALUES (
          $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, TRUE, NOW(), NOW()
        )
        RETURNING id
        `,
        [
          location.name,
          location.platform_slug,
          location.address_line1,
          location.city,
          location.state_code,
          location.postal_code,
          location.latitude,
          location.longitude,
          location.timezone,
          'dutchie',
          location.platform_menu_url,
          location.platform_location_id,
          stateId,
        ]
      );

      const dispensaryId = dispRows[0].id;

      // Update the discovery location
      await client.query(
        `
        UPDATE dutchie_discovery_locations
        SET status = 'verified',
            dispensary_id = $1,
            verified_at = NOW(),
            verified_by = $2,
            updated_at = NOW()
        WHERE id = $3
        `,
        [dispensaryId, verifiedBy, id]
      );

      await client.query('COMMIT');

      res.json({
        success: true,
        action: 'created',
        discoveryId: parseInt(id, 10),
        dispensaryId,
        message: `Created new dispensary (ID: ${dispensaryId})`,
      });
    } catch (error: any) {
      await client.query('ROLLBACK');
      console.error('[Discovery Routes] Error in verify-create:', error);
      res.status(500).json({ success: false, error: error.message });
    } finally {
      client.release();
    }
  });

  /**
   * POST /api/discovery/platforms/dt/locations/:id/verify-link
   *
   * Link a discovered location to an existing dispensary.
   *
   * Body:
   * - dispensaryId: number (required)
   * - verifiedBy: string (optional)
   */
  router.post('/locations/:id/verify-link', async (req: Request, res: Response) => {
    const client = await pool.connect();
    try {
      const { id } = req.params;
      const { dispensaryId, verifiedBy = 'admin' } = req.body;

      if (!dispensaryId) {
        return res.status(400).json({ success: false, error: 'dispensaryId is required' });
      }

      await client.query('BEGIN');

      // Verify dispensary exists
      const { rows: dispRows } = await client.query(
        `SELECT id, name FROM dispensaries WHERE id = $1`,
        [dispensaryId]
      );

      if (dispRows.length === 0) {
        await client.query('ROLLBACK');
        return res.status(404).json({ success: false, error: 'Dispensary not found' });
      }

      // Get the discovery location
      const { rows: locRows } = await client.query(
        `SELECT * FROM dutchie_discovery_locations WHERE id = $1 FOR UPDATE`,
        [parseInt(id, 10)]
      );

      if (locRows.length === 0) {
        await client.query('ROLLBACK');
        return res.status(404).json({ success: false, error: 'Location not found' });
      }

      const location = locRows[0];

      if (location.status !== 'discovered') {
        await client.query('ROLLBACK');
        return res.status(400).json({
          success: false,
          error: `Cannot link: location status is '${location.status}'`,
        });
      }

      // Update dispensary with platform info if missing
      await client.query(
        `
        UPDATE dispensaries
        SET platform_dispensary_id = COALESCE(platform_dispensary_id, $1),
            menu_url = COALESCE(menu_url, $2),
            menu_type = COALESCE(menu_type, 'dutchie'),
            updated_at = NOW()
        WHERE id = $3
        `,
        [location.platform_location_id, location.platform_menu_url, dispensaryId]
      );

      // Update the discovery location
      await client.query(
        `
        UPDATE dutchie_discovery_locations
        SET status = 'merged',
            dispensary_id = $1,
            verified_at = NOW(),
            verified_by = $2,
            updated_at = NOW()
        WHERE id = $3
        `,
        [dispensaryId, verifiedBy, id]
      );

      await client.query('COMMIT');

      res.json({
        success: true,
        action: 'linked',
        discoveryId: parseInt(id, 10),
        dispensaryId,
        dispensaryName: dispRows[0].name,
        message: `Linked to existing dispensary: ${dispRows[0].name}`,
      });
    } catch (error: any) {
      await client.query('ROLLBACK');
      console.error('[Discovery Routes] Error in verify-link:', error);
      res.status(500).json({ success: false, error: error.message });
    } finally {
      client.release();
    }
  });
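The two verification actions above are plain JSON POSTs; a hedged client sketch (localhost base URL and the IDs 42 and 7 are placeholders):

async function verifyExamples() {
  // Create a new canonical dispensary from a discovery location.
  await fetch('http://localhost:3010/api/discovery/platforms/dt/locations/42/verify-create', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ verifiedBy: 'admin' }),
  });

  // Or link the location to an existing dispensary instead.
  await fetch('http://localhost:3010/api/discovery/platforms/dt/locations/42/verify-link', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ dispensaryId: 7, verifiedBy: 'admin' }),
  });
}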
  /**
   * POST /api/discovery/platforms/dt/locations/:id/reject
   *
   * Reject a discovered location.
   *
   * Body:
   * - reason: string (optional)
   * - verifiedBy: string (optional)
   */
  router.post('/locations/:id/reject', async (req: Request, res: Response) => {
    try {
      const { id } = req.params;
      const { reason, verifiedBy = 'admin' } = req.body;

      // Get current status
      const { rows } = await pool.query(
        `SELECT status FROM dutchie_discovery_locations WHERE id = $1`,
        [parseInt(id, 10)]
      );

      if (rows.length === 0) {
        return res.status(404).json({ success: false, error: 'Location not found' });
      }

      if (rows[0].status !== 'discovered') {
        return res.status(400).json({
          success: false,
          error: `Cannot reject: location status is '${rows[0].status}'`,
        });
      }

      await pool.query(
        `
        UPDATE dutchie_discovery_locations
        SET status = 'rejected',
            verified_at = NOW(),
            verified_by = $1,
            notes = COALESCE($2, notes),
            updated_at = NOW()
        WHERE id = $3
        `,
        [verifiedBy, reason, id]
      );

      res.json({
        success: true,
        action: 'rejected',
        discoveryId: parseInt(id, 10),
        message: 'Location rejected',
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error in reject:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  /**
   * POST /api/discovery/platforms/dt/locations/:id/unreject
   *
   * Restore a rejected location to discovered status.
   */
  router.post('/locations/:id/unreject', async (req: Request, res: Response) => {
    try {
      const { id } = req.params;

      // Get current status
      const { rows } = await pool.query(
        `SELECT status FROM dutchie_discovery_locations WHERE id = $1`,
        [parseInt(id, 10)]
      );

      if (rows.length === 0) {
        return res.status(404).json({ success: false, error: 'Location not found' });
      }

      if (rows[0].status !== 'rejected') {
        return res.status(400).json({
          success: false,
          error: `Cannot unreject: location status is '${rows[0].status}'`,
        });
      }

      await pool.query(
        `
        UPDATE dutchie_discovery_locations
        SET status = 'discovered',
            verified_at = NULL,
            verified_by = NULL,
            updated_at = NOW()
        WHERE id = $1
        `,
        [id]
      );

      res.json({
        success: true,
        action: 'unrejected',
        discoveryId: parseInt(id, 10),
        message: 'Location restored to discovered status',
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error in unreject:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  // ============================================================
  // SUMMARY / REPORTING
  // ============================================================

  /**
   * GET /api/discovery/platforms/dt/summary
   *
   * Get discovery summary statistics.
   */
  router.get('/summary', async (_req: Request, res: Response) => {
    try {
      // Total counts by status
      const { rows: statusRows } = await pool.query(`
        SELECT status, COUNT(*) as cnt
        FROM dutchie_discovery_locations
        WHERE platform = 'dutchie' AND active = TRUE
        GROUP BY status
      `);

      const statusCounts: Record<string, number> = {};
      let totalLocations = 0;
      for (const row of statusRows) {
        statusCounts[row.status] = parseInt(row.cnt, 10);
        totalLocations += parseInt(row.cnt, 10);
      }

      // By state
      const { rows: stateRows } = await pool.query(`
        SELECT
          state_code,
          COUNT(*) as total,
          COUNT(*) FILTER (WHERE status = 'verified') as verified,
          COUNT(*) FILTER (WHERE dispensary_id IS NULL AND status = 'discovered') as unlinked
        FROM dutchie_discovery_locations
        WHERE platform = 'dutchie' AND active = TRUE AND state_code IS NOT NULL
        GROUP BY state_code
        ORDER BY total DESC
      `);

      res.json({
        success: true,
        summary: {
          total_locations: totalLocations,
          discovered: statusCounts['discovered'] || 0,
          verified: statusCounts['verified'] || 0,
          merged: statusCounts['merged'] || 0,
          rejected: statusCounts['rejected'] || 0,
        },
        by_state: stateRows.map((r) => ({
          state_code: r.state_code,
          total: parseInt(r.total, 10),
          verified: parseInt(r.verified, 10),
          unlinked: parseInt(r.unlinked, 10),
        })),
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error in summary:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  // ============================================================
  // CITIES
  // ============================================================

  /**
   * GET /api/discovery/platforms/dt/cities
   *
   * List discovery cities.
   */
  router.get('/cities', async (req: Request, res: Response) => {
    try {
      const { state_code, country_code, crawl_enabled, limit = '100', offset = '0' } = req.query;

      let whereClause = "WHERE platform = 'dutchie'";
      const params: any[] = [];
      let paramIndex = 1;

      if (state_code) {
        whereClause += ` AND state_code = $${paramIndex}`;
        params.push(state_code);
        paramIndex++;
      }

      if (country_code) {
        whereClause += ` AND country_code = $${paramIndex}`;
        params.push(country_code);
        paramIndex++;
      }

      if (crawl_enabled === 'true') {
        whereClause += ' AND crawl_enabled = TRUE';
      } else if (crawl_enabled === 'false') {
        whereClause += ' AND crawl_enabled = FALSE';
      }

      params.push(parseInt(limit as string, 10), parseInt(offset as string, 10));

      const { rows } = await pool.query(
        `
        SELECT
          id,
          platform,
          city_name,
          city_slug,
          state_code,
          country_code,
          last_crawled_at,
          crawl_enabled,
          location_count
        FROM dutchie_discovery_cities
        ${whereClause}
        ORDER BY country_code, state_code, city_name
        LIMIT $${paramIndex} OFFSET $${paramIndex + 1}
        `,
        params
      );

      const { rows: countRows } = await pool.query(
        `SELECT COUNT(*) as total FROM dutchie_discovery_cities ${whereClause}`,
        params.slice(0, -2)
      );

      res.json({
        success: true,
        cities: rows.map((r) => ({
          id: r.id,
          platform: r.platform,
          cityName: r.city_name,
          citySlug: r.city_slug,
          stateCode: r.state_code,
          countryCode: r.country_code,
          lastCrawledAt: r.last_crawled_at,
          crawlEnabled: r.crawl_enabled,
          locationCount: r.location_count,
        })),
        total: parseInt(countRows[0]?.total || '0', 10),
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error fetching cities:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  // ============================================================
  // MATCH CANDIDATES
  // ============================================================

  /**
   * GET /api/discovery/platforms/dt/locations/:id/match-candidates
   *
   * Find potential dispensary matches for a discovery location.
   */
  router.get('/locations/:id/match-candidates', async (req: Request, res: Response) => {
    try {
      const { id } = req.params;

      // Get the discovery location
      const { rows: locRows } = await pool.query(
        `SELECT * FROM dutchie_discovery_locations WHERE id = $1`,
        [parseInt(id, 10)]
      );

      if (locRows.length === 0) {
        return res.status(404).json({ success: false, error: 'Location not found' });
      }

      const location = locRows[0];

      // Find potential matches
      const { rows: candidates } = await pool.query(
        `
        SELECT
          d.id,
          d.name,
          d.city,
          d.state,
          d.address,
          d.menu_type,
          d.platform_dispensary_id,
          d.menu_url,
          d.latitude,
          d.longitude,
          CASE
            WHEN d.name ILIKE $1 THEN 'exact_name'
            WHEN d.name ILIKE $2 THEN 'partial_name'
            WHEN d.city ILIKE $3 AND d.state = $4 THEN 'same_city'
            ELSE 'location_match'
          END as match_type,
          CASE
            WHEN d.latitude IS NOT NULL AND d.longitude IS NOT NULL
              AND $5::float IS NOT NULL AND $6::float IS NOT NULL
            THEN (3959 * acos(
              LEAST(1.0, GREATEST(-1.0,
                cos(radians($5::float)) * cos(radians(d.latitude)) *
                cos(radians(d.longitude) - radians($6::float)) +
                sin(radians($5::float)) * sin(radians(d.latitude))
              ))
            ))
            ELSE NULL
          END as distance_miles
        FROM dispensaries d
        WHERE d.state = $4
          AND (
            d.name ILIKE $1
            OR d.name ILIKE $2
            OR d.city ILIKE $3
            OR (
              d.latitude IS NOT NULL
              AND d.longitude IS NOT NULL
              AND $5::float IS NOT NULL
              AND $6::float IS NOT NULL
            )
          )
        ORDER BY
          CASE
            WHEN d.name ILIKE $1 THEN 1
            WHEN d.name ILIKE $2 THEN 2
            ELSE 3
          END,
          distance_miles NULLS LAST
        LIMIT 10
        `,
        [
          location.name,
          `%${location.name.split(' ')[0]}%`,
          location.city,
          location.state_code,
          location.latitude,
          location.longitude,
        ]
      );

      res.json({
        success: true,
        location: {
          id: location.id,
          name: location.name,
          city: location.city,
          stateCode: location.state_code,
        },
        candidates: candidates.map((c) => ({
          id: c.id,
          name: c.name,
          city: c.city,
          state: c.state,
          address: c.address,
          menuType: c.menu_type,
          platformDispensaryId: c.platform_dispensary_id,
          menuUrl: c.menu_url,
          matchType: c.match_type,
          distanceMiles: c.distance_miles ? Math.round(c.distance_miles * 10) / 10 : null,
        })),
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error fetching match candidates:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });
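The distance_miles expression above is the spherical-law-of-cosines great-circle distance, with Earth's radius in miles (3959) and the acos argument clamped to [-1, 1] to guard against floating-point drift. The same computation in TypeScript, for reference:

// Mirrors the SQL expression in the match-candidates query.
function distanceMiles(lat1: number, lon1: number, lat2: number, lon2: number): number {
  const rad = (deg: number) => (deg * Math.PI) / 180;
  const cosArg =
    Math.cos(rad(lat1)) * Math.cos(rad(lat2)) * Math.cos(rad(lon2) - rad(lon1)) +
    Math.sin(rad(lat1)) * Math.sin(rad(lat2));
  return 3959 * Math.acos(Math.min(1, Math.max(-1, cosArg)));
}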
  // ============================================================
  // GEO / NEARBY (Admin/Debug Only)
  // ============================================================

  /**
   * GET /api/discovery/platforms/dt/nearby
   *
   * Find discovery locations near a given coordinate.
   * This is an internal/debug endpoint for admin use.
   *
   * Query params:
   * - lat: number (required)
   * - lon: number (required)
   * - radiusKm: number (optional, default 50)
   * - limit: number (optional, default 20)
   * - status: string (optional, filter by status)
   */
  router.get('/nearby', async (req: Request, res: Response) => {
    try {
      const { lat, lon, radiusKm = '50', limit = '20', status } = req.query;

      // Validate required params
      if (!lat || !lon) {
        return res.status(400).json({
          success: false,
          error: 'lat and lon are required query parameters',
        });
      }

      const latNum = parseFloat(lat as string);
      const lonNum = parseFloat(lon as string);
      const radiusNum = parseFloat(radiusKm as string);
      const limitNum = parseInt(limit as string, 10);

      if (isNaN(latNum) || isNaN(lonNum)) {
        return res.status(400).json({
          success: false,
          error: 'lat and lon must be valid numbers',
        });
      }

      const geoService = new DiscoveryGeoService(pool);

      const locations = await geoService.findNearbyDiscoveryLocations(latNum, lonNum, {
        radiusKm: radiusNum,
        limit: limitNum,
        platform: 'dutchie',
        status: status as string | undefined,
      });

      res.json({
        success: true,
        center: { lat: latNum, lon: lonNum },
        radiusKm: radiusNum,
        count: locations.length,
        locations,
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error in nearby:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  /**
   * GET /api/discovery/platforms/dt/geo-stats
   *
   * Get coordinate coverage statistics for discovery locations.
   * This is an internal/debug endpoint for admin use.
   */
  router.get('/geo-stats', async (_req: Request, res: Response) => {
    try {
      const geoService = new DiscoveryGeoService(pool);
      const stats = await geoService.getCoordinateCoverageStats();

      res.json({
        success: true,
        stats,
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error in geo-stats:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  /**
   * GET /api/discovery/platforms/dt/locations/:id/validate-geo
   *
   * Validate the geographic data for a discovery location.
   * This is an internal/debug endpoint for admin use.
   */
  router.get('/locations/:id/validate-geo', async (req: Request, res: Response) => {
    try {
      const { id } = req.params;

      // Get the location
      const { rows } = await pool.query(
        `SELECT latitude, longitude, state_code, country_code, name
         FROM dutchie_discovery_locations WHERE id = $1`,
        [parseInt(id, 10)]
      );

      if (rows.length === 0) {
        return res.status(404).json({ success: false, error: 'Location not found' });
      }

      const location = rows[0];
      const geoValidation = new GeoValidationService();
      const result = geoValidation.validateLocationState({
        latitude: location.latitude,
        longitude: location.longitude,
        state_code: location.state_code,
        country_code: location.country_code,
      });

      res.json({
        success: true,
        location: {
          id: parseInt(id, 10),
          name: location.name,
          latitude: location.latitude,
          longitude: location.longitude,
          stateCode: location.state_code,
          countryCode: location.country_code,
        },
        validation: result,
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error in validate-geo:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  return router;
}

export default createDutchieDiscoveryRoutes;
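The factory is meant to be mounted at the prefix named in the file's doc comment; a minimal wiring sketch (the host app shown here is assumed, not part of this diff):

import express from 'express';
import { Pool } from 'pg';
import { createDutchieDiscoveryRoutes } from './dutchie-az/discovery';

const app = express();
app.use(express.json());

const pool = new Pool({ connectionString: process.env.CANNAIQ_DB_URL });
// Mount path comes from the routes.ts doc comment above.
app.use('/api/discovery/platforms/dt', createDutchieDiscoveryRoutes(pool));

app.listen(3010); // PORT default from backend/.env.example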
682
backend/src/dutchie-az/routes/analytics.ts
Normal file
@@ -0,0 +1,682 @@
/**
 * Analytics API Routes
 *
 * Provides REST API endpoints for all analytics services.
 * All routes are prefixed with /api/analytics
 *
 * Phase 3: Analytics Dashboards
 */

import { Router, Request, Response } from 'express';
import { Pool } from 'pg';
import {
  AnalyticsCache,
  PriceTrendService,
  PenetrationService,
  CategoryAnalyticsService,
  StoreChangeService,
  BrandOpportunityService,
} from '../services/analytics';

export function createAnalyticsRouter(pool: Pool): Router {
  const router = Router();

  // Initialize services
  const cache = new AnalyticsCache(pool, { defaultTtlMinutes: 15 });
  const priceService = new PriceTrendService(pool, cache);
  const penetrationService = new PenetrationService(pool, cache);
  const categoryService = new CategoryAnalyticsService(pool, cache);
  const storeService = new StoreChangeService(pool, cache);
  const brandOpportunityService = new BrandOpportunityService(pool, cache);

  // ============================================================
  // PRICE ANALYTICS
  // ============================================================

  /**
   * GET /api/analytics/price/product/:id
   * Get price trend for a specific product
   */
  router.get('/price/product/:id', async (req: Request, res: Response) => {
    try {
      const productId = parseInt(req.params.id);
      const storeId = req.query.storeId ? parseInt(req.query.storeId as string) : undefined;
      const days = req.query.days ? parseInt(req.query.days as string) : 30;

      const result = await priceService.getProductPriceTrend(productId, storeId, days);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Price product error:', error);
      res.status(500).json({ error: 'Failed to fetch product price trend' });
    }
  });

  /**
   * GET /api/analytics/price/brand/:name
   * Get price trend for a brand
   */
  router.get('/price/brand/:name', async (req: Request, res: Response) => {
    try {
      const brandName = decodeURIComponent(req.params.name);
      const filters = {
        storeId: req.query.storeId ? parseInt(req.query.storeId as string) : undefined,
        category: req.query.category as string | undefined,
        state: req.query.state as string | undefined,
        days: req.query.days ? parseInt(req.query.days as string) : 30,
      };

      const result = await priceService.getBrandPriceTrend(brandName, filters);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Price brand error:', error);
      res.status(500).json({ error: 'Failed to fetch brand price trend' });
    }
  });

  /**
   * GET /api/analytics/price/category/:name
   * Get price trend for a category
   */
  router.get('/price/category/:name', async (req: Request, res: Response) => {
    try {
      const category = decodeURIComponent(req.params.name);
      const filters = {
        storeId: req.query.storeId ? parseInt(req.query.storeId as string) : undefined,
        brandName: req.query.brand as string | undefined,
        state: req.query.state as string | undefined,
        days: req.query.days ? parseInt(req.query.days as string) : 30,
      };

      const result = await priceService.getCategoryPriceTrend(category, filters);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Price category error:', error);
      res.status(500).json({ error: 'Failed to fetch category price trend' });
    }
  });

  /**
   * GET /api/analytics/price/summary
   * Get price summary statistics
   */
  router.get('/price/summary', async (req: Request, res: Response) => {
    try {
      const filters = {
        storeId: req.query.storeId ? parseInt(req.query.storeId as string) : undefined,
        brandName: req.query.brand as string | undefined,
        category: req.query.category as string | undefined,
        state: req.query.state as string | undefined,
      };

      const result = await priceService.getPriceSummary(filters);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Price summary error:', error);
      res.status(500).json({ error: 'Failed to fetch price summary' });
    }
  });

  /**
   * GET /api/analytics/price/compression/:category
   * Get price compression analysis for a category
   */
  router.get('/price/compression/:category', async (req: Request, res: Response) => {
    try {
      const category = decodeURIComponent(req.params.category);
      const state = req.query.state as string | undefined;

      const result = await priceService.detectPriceCompression(category, state);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Price compression error:', error);
      res.status(500).json({ error: 'Failed to analyze price compression' });
    }
  });

  /**
   * GET /api/analytics/price/global
   * Get global price statistics
   */
  router.get('/price/global', async (_req: Request, res: Response) => {
    try {
      const result = await priceService.getGlobalPriceStats();
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Global price error:', error);
      res.status(500).json({ error: 'Failed to fetch global price stats' });
    }
  });

  // ============================================================
  // PENETRATION ANALYTICS
  // ============================================================

  /**
   * GET /api/analytics/penetration/brand/:name
   * Get penetration data for a brand
   */
  router.get('/penetration/brand/:name', async (req: Request, res: Response) => {
    try {
      const brandName = decodeURIComponent(req.params.name);
      const filters = {
        state: req.query.state as string | undefined,
        category: req.query.category as string | undefined,
      };

      const result = await penetrationService.getBrandPenetration(brandName, filters);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Brand penetration error:', error);
      res.status(500).json({ error: 'Failed to fetch brand penetration' });
    }
  });

  /**
   * GET /api/analytics/penetration/top
   * Get top brands by penetration
   */
  router.get('/penetration/top', async (req: Request, res: Response) => {
    try {
      const limit = req.query.limit ? parseInt(req.query.limit as string) : 20;
      const filters = {
        state: req.query.state as string | undefined,
        category: req.query.category as string | undefined,
        minStores: req.query.minStores ? parseInt(req.query.minStores as string) : 2,
        minSkus: req.query.minSkus ? parseInt(req.query.minSkus as string) : 5,
      };

      const result = await penetrationService.getTopBrandsByPenetration(limit, filters);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Top penetration error:', error);
      res.status(500).json({ error: 'Failed to fetch top brands' });
    }
  });

  /**
   * GET /api/analytics/penetration/trend/:brand
   * Get penetration trend for a brand
   */
  router.get('/penetration/trend/:brand', async (req: Request, res: Response) => {
    try {
      const brandName = decodeURIComponent(req.params.brand);
      const days = req.query.days ? parseInt(req.query.days as string) : 30;

      const result = await penetrationService.getPenetrationTrend(brandName, days);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Penetration trend error:', error);
      res.status(500).json({ error: 'Failed to fetch penetration trend' });
    }
  });

  /**
   * GET /api/analytics/penetration/shelf-share/:brand
   * Get shelf share by category for a brand
   */
  router.get('/penetration/shelf-share/:brand', async (req: Request, res: Response) => {
    try {
      const brandName = decodeURIComponent(req.params.brand);
      const result = await penetrationService.getShelfShareByCategory(brandName);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Shelf share error:', error);
      res.status(500).json({ error: 'Failed to fetch shelf share' });
    }
  });

  /**
   * GET /api/analytics/penetration/by-state/:brand
   * Get brand presence by state
   */
  router.get('/penetration/by-state/:brand', async (req: Request, res: Response) => {
    try {
      const brandName = decodeURIComponent(req.params.brand);
      const result = await penetrationService.getBrandPresenceByState(brandName);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Brand by state error:', error);
      res.status(500).json({ error: 'Failed to fetch brand presence by state' });
    }
  });

  /**
   * GET /api/analytics/penetration/stores/:brand
   * Get stores carrying a brand
   */
  router.get('/penetration/stores/:brand', async (req: Request, res: Response) => {
    try {
      const brandName = decodeURIComponent(req.params.brand);
      const result = await penetrationService.getStoresCarryingBrand(brandName);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Stores carrying brand error:', error);
      res.status(500).json({ error: 'Failed to fetch stores' });
    }
  });

  /**
   * GET /api/analytics/penetration/heatmap
   * Get penetration heatmap data
   */
  router.get('/penetration/heatmap', async (req: Request, res: Response) => {
    try {
      const brandName = req.query.brand as string | undefined;
      const result = await penetrationService.getPenetrationHeatmap(brandName);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Heatmap error:', error);
      res.status(500).json({ error: 'Failed to fetch heatmap data' });
    }
  });
|
||||||
|
// ============================================================
|
||||||
|
// CATEGORY ANALYTICS
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
|
||||||
|
* GET /api/analytics/category/summary
|
||||||
|
* Get category summary
|
||||||
|
*/
|
||||||
|
router.get('/category/summary', async (req: Request, res: Response) => {
|
||||||
|
try {
|
||||||
|
const category = req.query.category as string | undefined;
|
||||||
|
const filters = {
|
||||||
|
state: req.query.state as string | undefined,
|
||||||
|
storeId: req.query.storeId ? parseInt(req.query.storeId as string) : undefined,
|
||||||
|
};
|
||||||
|
|
||||||
|
const result = await categoryService.getCategorySummary(category, filters);
|
||||||
|
res.json(result);
|
||||||
|
} catch (error) {
|
||||||
|
console.error('[Analytics] Category summary error:', error);
|
||||||
|
res.status(500).json({ error: 'Failed to fetch category summary' });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
/**
|
||||||
|
* GET /api/analytics/category/growth
|
||||||
|
* Get category growth data
|
||||||
|
*/
|
||||||
|
router.get('/category/growth', async (req: Request, res: Response) => {
|
||||||
|
try {
|
||||||
|
const days = req.query.days ? parseInt(req.query.days as string) : 7;
|
||||||
|
const filters = {
|
||||||
|
state: req.query.state as string | undefined,
|
||||||
|
storeId: req.query.storeId ? parseInt(req.query.storeId as string) : undefined,
|
||||||
|
minSkus: req.query.minSkus ? parseInt(req.query.minSkus as string) : 10,
|
||||||
|
};
|
||||||
|
|
||||||
|
const result = await categoryService.getCategoryGrowth(days, filters);
|
||||||
|
res.json(result);
|
||||||
|
} catch (error) {
|
||||||
|
console.error('[Analytics] Category growth error:', error);
|
||||||
|
res.status(500).json({ error: 'Failed to fetch category growth' });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
/**
|
||||||
|
* GET /api/analytics/category/trend/:category
|
||||||
|
* Get category growth trend over time
|
||||||
|
*/
|
||||||
|
router.get('/category/trend/:category', async (req: Request, res: Response) => {
|
||||||
|
try {
|
||||||
|
const category = decodeURIComponent(req.params.category);
|
||||||
|
const days = req.query.days ? parseInt(req.query.days as string) : 90;
|
||||||
|
|
||||||
|
const result = await categoryService.getCategoryGrowthTrend(category, days);
|
||||||
|
res.json(result);
|
||||||
|
} catch (error) {
|
||||||
|
console.error('[Analytics] Category trend error:', error);
|
||||||
|
res.status(500).json({ error: 'Failed to fetch category trend' });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
/**
|
||||||
|
* GET /api/analytics/category/heatmap
|
||||||
|
* Get category heatmap data
|
||||||
|
*/
|
||||||
|
router.get('/category/heatmap', async (req: Request, res: Response) => {
|
||||||
|
try {
|
||||||
|
const metric = (req.query.metric as 'skus' | 'growth' | 'price') || 'skus';
|
||||||
|
const periods = req.query.periods ? parseInt(req.query.periods as string) : 12;
|
||||||
|
|
||||||
|
const result = await categoryService.getCategoryHeatmap(metric, periods);
|
||||||
|
res.json(result);
|
||||||
|
} catch (error) {
|
||||||
|
console.error('[Analytics] Category heatmap error:', error);
|
||||||
|
res.status(500).json({ error: 'Failed to fetch heatmap' });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
/**
|
||||||
|
* GET /api/analytics/category/top-movers
|
||||||
|
* Get top growing and declining categories
|
||||||
|
*/
|
||||||
|
router.get('/category/top-movers', async (req: Request, res: Response) => {
|
||||||
|
try {
|
||||||
|
const limit = req.query.limit ? parseInt(req.query.limit as string) : 5;
|
||||||
|
const days = req.query.days ? parseInt(req.query.days as string) : 30;
|
||||||
|
|
||||||
|
const result = await categoryService.getTopMovers(limit, days);
|
||||||
|
res.json(result);
|
||||||
|
} catch (error) {
|
||||||
|
console.error('[Analytics] Top movers error:', error);
|
||||||
|
res.status(500).json({ error: 'Failed to fetch top movers' });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
/**
|
||||||
|
* GET /api/analytics/category/:category/subcategories
|
||||||
|
* Get subcategory breakdown
|
||||||
|
*/
|
||||||
|
router.get('/category/:category/subcategories', async (req: Request, res: Response) => {
|
||||||
|
try {
|
||||||
|
const category = decodeURIComponent(req.params.category);
|
||||||
|
const result = await categoryService.getSubcategoryBreakdown(category);
|
||||||
|
res.json(result);
|
||||||
|
} catch (error) {
|
||||||
|
console.error('[Analytics] Subcategory error:', error);
|
||||||
|
res.status(500).json({ error: 'Failed to fetch subcategories' });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// STORE CHANGE TRACKING
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
|
||||||
|
* GET /api/analytics/store/:id/summary
|
||||||
|
* Get change summary for a store
|
||||||
|
*/
|
||||||
|
router.get('/store/:id/summary', async (req: Request, res: Response) => {
|
||||||
|
try {
|
||||||
|
const storeId = parseInt(req.params.id);
|
||||||
|
const result = await storeService.getStoreChangeSummary(storeId);
|
||||||
|
|
||||||
|
if (!result) {
|
||||||
|
return res.status(404).json({ error: 'Store not found' });
|
||||||
|
}
|
||||||
|
|
||||||
|
res.json(result);
|
||||||
|
} catch (error) {
|
||||||
|
console.error('[Analytics] Store summary error:', error);
|
||||||
|
res.status(500).json({ error: 'Failed to fetch store summary' });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
/**
|
||||||
|
* GET /api/analytics/store/:id/events
|
||||||
|
* Get recent change events for a store
|
||||||
|
*/
|
||||||
|
router.get('/store/:id/events', async (req: Request, res: Response) => {
|
||||||
|
try {
|
||||||
|
const storeId = parseInt(req.params.id);
|
||||||
|
const filters = {
|
||||||
|
eventType: req.query.type as string | undefined,
|
||||||
|
days: req.query.days ? parseInt(req.query.days as string) : 30,
|
||||||
|
limit: req.query.limit ? parseInt(req.query.limit as string) : 100,
|
||||||
|
};
|
||||||
|
|
||||||
|
const result = await storeService.getStoreChangeEvents(storeId, filters);
|
||||||
|
res.json(result);
|
||||||
|
} catch (error) {
|
||||||
|
console.error('[Analytics] Store events error:', error);
|
||||||
|
res.status(500).json({ error: 'Failed to fetch store events' });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
/**
|
||||||
|
* GET /api/analytics/store/:id/brands/new
|
||||||
|
* Get new brands added to a store
|
||||||
|
*/
|
||||||
|
router.get('/store/:id/brands/new', async (req: Request, res: Response) => {
|
||||||
|
try {
|
||||||
|
const storeId = parseInt(req.params.id);
|
||||||
|
const days = req.query.days ? parseInt(req.query.days as string) : 30;
|
||||||
|
|
||||||
|
const result = await storeService.getNewBrands(storeId, days);
|
||||||
|
res.json(result);
|
||||||
|
} catch (error) {
|
||||||
|
console.error('[Analytics] New brands error:', error);
|
||||||
|
res.status(500).json({ error: 'Failed to fetch new brands' });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
/**
|
||||||
|
* GET /api/analytics/store/:id/brands/lost
|
||||||
|
* Get brands lost from a store
|
||||||
|
*/
|
||||||
|
router.get('/store/:id/brands/lost', async (req: Request, res: Response) => {
|
||||||
|
try {
|
||||||
|
const storeId = parseInt(req.params.id);
|
||||||
|
const days = req.query.days ? parseInt(req.query.days as string) : 30;
|
||||||
|
|
||||||
|
const result = await storeService.getLostBrands(storeId, days);
|
||||||
|
res.json(result);
|
||||||
|
} catch (error) {
|
||||||
|
console.error('[Analytics] Lost brands error:', error);
|
||||||
|
res.status(500).json({ error: 'Failed to fetch lost brands' });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
/**
|
||||||
|
* GET /api/analytics/store/:id/products/changes
|
||||||
|
* Get product changes for a store
|
||||||
|
*/
|
||||||
|
router.get('/store/:id/products/changes', async (req: Request, res: Response) => {
|
||||||
|
try {
|
||||||
|
const storeId = parseInt(req.params.id);
|
||||||
|
const changeType = req.query.type as 'added' | 'discontinued' | 'price_drop' | 'price_increase' | 'restocked' | 'out_of_stock' | undefined;
|
||||||
|
const days = req.query.days ? parseInt(req.query.days as string) : 7;
|
||||||
|
|
||||||
|
const result = await storeService.getProductChanges(storeId, changeType, days);
|
||||||
|
res.json(result);
|
||||||
|
} catch (error) {
|
||||||
|
console.error('[Analytics] Product changes error:', error);
|
||||||
|
res.status(500).json({ error: 'Failed to fetch product changes' });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
/**
|
||||||
|
* GET /api/analytics/store/leaderboard/:category
|
||||||
|
* Get category leaderboard across stores
|
||||||
|
*/
|
||||||
|
router.get('/store/leaderboard/:category', async (req: Request, res: Response) => {
|
||||||
|
try {
|
||||||
|
const category = decodeURIComponent(req.params.category);
|
||||||
|
const limit = req.query.limit ? parseInt(req.query.limit as string) : 20;
|
||||||
|
|
||||||
|
const result = await storeService.getCategoryLeaderboard(category, limit);
|
||||||
|
res.json(result);
|
||||||
|
} catch (error) {
|
||||||
|
console.error('[Analytics] Leaderboard error:', error);
|
||||||
|
res.status(500).json({ error: 'Failed to fetch leaderboard' });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
/**
|
||||||
|
* GET /api/analytics/store/most-active
|
||||||
|
* Get most active stores (by changes)
|
||||||
|
*/
|
||||||
|
router.get('/store/most-active', async (req: Request, res: Response) => {
|
||||||
|
try {
|
||||||
|
const days = req.query.days ? parseInt(req.query.days as string) : 7;
|
||||||
|
const limit = req.query.limit ? parseInt(req.query.limit as string) : 10;
|
||||||
|
|
||||||
|
const result = await storeService.getMostActiveStores(days, limit);
|
||||||
|
res.json(result);
|
||||||
|
} catch (error) {
|
||||||
|
console.error('[Analytics] Most active error:', error);
|
||||||
|
res.status(500).json({ error: 'Failed to fetch active stores' });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
/**
|
||||||
|
* GET /api/analytics/store/compare
|
||||||
|
* Compare two stores
|
||||||
|
*/
|
||||||
|
router.get('/store/compare', async (req: Request, res: Response) => {
|
||||||
|
try {
|
||||||
|
const store1 = parseInt(req.query.store1 as string);
|
||||||
|
const store2 = parseInt(req.query.store2 as string);
|
||||||
|
|
||||||
|
if (!store1 || !store2) {
|
||||||
|
return res.status(400).json({ error: 'Both store1 and store2 are required' });
|
||||||
|
}
|
||||||
|
|
||||||
|
const result = await storeService.compareStores(store1, store2);
|
||||||
|
res.json(result);
|
||||||
|
} catch (error) {
|
||||||
|
console.error('[Analytics] Compare stores error:', error);
|
||||||
|
res.status(500).json({ error: 'Failed to compare stores' });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// BRAND OPPORTUNITY / RISK
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
|
||||||
|
* GET /api/analytics/brand/:name/opportunity
|
||||||
|
* Get full opportunity analysis for a brand
|
||||||
|
*/
|
||||||
|
router.get('/brand/:name/opportunity', async (req: Request, res: Response) => {
|
||||||
|
try {
|
||||||
|
const brandName = decodeURIComponent(req.params.name);
|
||||||
|
const result = await brandOpportunityService.getBrandOpportunity(brandName);
|
||||||
|
res.json(result);
|
||||||
|
} catch (error) {
|
||||||
|
console.error('[Analytics] Brand opportunity error:', error);
|
||||||
|
res.status(500).json({ error: 'Failed to fetch brand opportunity' });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
/**
|
||||||
|
* GET /api/analytics/brand/:name/position
|
||||||
|
* Get market position summary for a brand
|
||||||
|
*/
|
||||||
|
router.get('/brand/:name/position', async (req: Request, res: Response) => {
|
||||||
|
try {
|
||||||
|
const brandName = decodeURIComponent(req.params.name);
|
||||||
|
const result = await brandOpportunityService.getMarketPositionSummary(brandName);
|
||||||
|
res.json(result);
|
||||||
|
} catch (error) {
|
||||||
|
console.error('[Analytics] Brand position error:', error);
|
||||||
|
res.status(500).json({ error: 'Failed to fetch brand position' });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// ALERTS
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
|
||||||
|
* GET /api/analytics/alerts
|
||||||
|
* Get analytics alerts
|
||||||
|
*/
|
||||||
|
router.get('/alerts', async (req: Request, res: Response) => {
|
||||||
|
try {
|
||||||
|
const filters = {
|
||||||
|
brandName: req.query.brand as string | undefined,
|
||||||
|
storeId: req.query.storeId ? parseInt(req.query.storeId as string) : undefined,
|
||||||
|
alertType: req.query.type as string | undefined,
|
||||||
|
unreadOnly: req.query.unreadOnly === 'true',
|
||||||
|
limit: req.query.limit ? parseInt(req.query.limit as string) : 50,
|
||||||
|
};
|
||||||
|
|
||||||
|
const result = await brandOpportunityService.getAlerts(filters);
|
||||||
|
res.json(result);
|
||||||
|
} catch (error) {
|
||||||
|
console.error('[Analytics] Alerts error:', error);
|
||||||
|
res.status(500).json({ error: 'Failed to fetch alerts' });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
/**
|
||||||
|
* POST /api/analytics/alerts/mark-read
|
||||||
|
* Mark alerts as read
|
||||||
|
*/
|
||||||
|
router.post('/alerts/mark-read', async (req: Request, res: Response) => {
|
||||||
|
try {
|
||||||
|
const { alertIds } = req.body;
|
||||||
|
|
||||||
|
if (!Array.isArray(alertIds)) {
|
||||||
|
return res.status(400).json({ error: 'alertIds must be an array' });
|
||||||
|
}
|
||||||
|
|
||||||
|
await brandOpportunityService.markAlertsRead(alertIds);
|
||||||
|
res.json({ success: true });
|
||||||
|
} catch (error) {
|
||||||
|
console.error('[Analytics] Mark read error:', error);
|
||||||
|
res.status(500).json({ error: 'Failed to mark alerts as read' });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// CACHE MANAGEMENT
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
|
||||||
|
* GET /api/analytics/cache/stats
|
||||||
|
* Get cache statistics
|
||||||
|
*/
|
||||||
|
router.get('/cache/stats', async (_req: Request, res: Response) => {
|
||||||
|
try {
|
||||||
|
const stats = await cache.getStats();
|
||||||
|
res.json(stats);
|
||||||
|
} catch (error) {
|
||||||
|
console.error('[Analytics] Cache stats error:', error);
|
||||||
|
res.status(500).json({ error: 'Failed to get cache stats' });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
/**
|
||||||
|
* POST /api/analytics/cache/clear
|
||||||
|
* Clear cache (admin only)
|
||||||
|
*/
|
||||||
|
router.post('/cache/clear', async (req: Request, res: Response) => {
|
||||||
|
try {
|
||||||
|
const pattern = req.query.pattern as string | undefined;
|
||||||
|
|
||||||
|
if (pattern) {
|
||||||
|
const cleared = await cache.invalidatePattern(pattern);
|
||||||
|
res.json({ success: true, clearedCount: cleared });
|
||||||
|
} else {
|
||||||
|
await cache.cleanExpired();
|
||||||
|
res.json({ success: true, message: 'Expired entries cleaned' });
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
console.error('[Analytics] Cache clear error:', error);
|
||||||
|
res.status(500).json({ error: 'Failed to clear cache' });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// SNAPSHOT CAPTURE (for cron/scheduled jobs)
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
|
||||||
|
* POST /api/analytics/snapshots/capture
|
||||||
|
* Capture daily snapshots (run by scheduler)
|
||||||
|
*/
|
||||||
|
router.post('/snapshots/capture', async (_req: Request, res: Response) => {
|
||||||
|
try {
|
||||||
|
const [brandResult, categoryResult] = await Promise.all([
|
||||||
|
pool.query('SELECT capture_brand_snapshots() as count'),
|
||||||
|
pool.query('SELECT capture_category_snapshots() as count'),
|
||||||
|
]);
|
||||||
|
|
||||||
|
res.json({
|
||||||
|
success: true,
|
||||||
|
brandSnapshots: parseInt(brandResult.rows[0]?.count || '0'),
|
||||||
|
categorySnapshots: parseInt(categoryResult.rows[0]?.count || '0'),
|
||||||
|
});
|
||||||
|
} catch (error) {
|
||||||
|
console.error('[Analytics] Snapshot capture error:', error);
|
||||||
|
res.status(500).json({ error: 'Failed to capture snapshots' });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
return router;
|
||||||
|
}
|
||||||
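The file above ends by returning the configured router from a factory function. A minimal mounting sketch, assuming the factory is exported as createAnalyticsRouter and served on port 3010 (export name, mount path, and port are assumptions; none appear in this diff):

// Illustrative sketch only — names outside this commit are placeholders.
import express from 'express';
import { createAnalyticsRouter } from './routes/analytics-v2';

const app = express();
app.use('/api/analytics', createAnalyticsRouter());
app.listen(3010);
// Example request: GET /api/analytics/penetration/top?state=AZ&minStores=2&limit=10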
@@ -21,12 +21,8 @@ import {
 } from '../services/discovery';
 import { crawlDispensaryProducts } from '../services/product-crawler';
 
-// Explicit column list for dispensaries table (avoids SELECT * issues with schema differences)
-const DISPENSARY_COLUMNS = `
-  id, name, dba_name, slug, city, state, zip, address, latitude, longitude,
-  menu_type, menu_url, platform_dispensary_id, website,
-  provider_detection_data, created_at, updated_at
-`;
+// Use shared dispensary columns (handles optional columns like provider_detection_data)
+import { DISPENSARY_COLUMNS_WITH_PROFILE as DISPENSARY_COLUMNS } from '../db/dispensary-columns';
 import {
   startScheduler,
   stopScheduler,
@@ -43,6 +39,7 @@ import {
   getRunLogs,
 } from '../services/scheduler';
 import { StockStatus } from '../types';
+import { getProviderDisplayName } from '../../utils/provider-display';
 
 const router = Router();
 
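For orientation, a plausible sketch of what ../db/dispensary-columns exports, inferred from the inline list it replaces (the module's real contents are not part of this diff — treat every line as an assumption):

// Hypothetical reconstruction, not the actual module source.
export const DISPENSARY_COLUMNS_WITH_PROFILE = `
  dispensaries.id, dispensaries.name, dispensaries.dba_name, dispensaries.slug,
  dispensaries.city, dispensaries.state, dispensaries.zip, dispensaries.address,
  dispensaries.latitude, dispensaries.longitude, dispensaries.menu_type,
  dispensaries.menu_url, dispensaries.platform_dispensary_id, dispensaries.website,
  dispensaries.provider_detection_data, dispensaries.created_at, dispensaries.updated_at
`;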
@@ -113,9 +110,17 @@ router.get('/stores', async (req: Request, res: Response) => {
 
   const { rows, rowCount } = await query(
     `
-      SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
+      SELECT ${DISPENSARY_COLUMNS},
+        (SELECT COUNT(*) FROM dutchie_products WHERE dispensary_id = dispensaries.id) as product_count,
+        dcp.status as crawler_status,
+        dcp.profile_key as crawler_profile_key,
+        dcp.next_retry_at,
+        dcp.sandbox_attempt_count
+      FROM dispensaries
+      LEFT JOIN dispensary_crawler_profiles dcp
+        ON dcp.dispensary_id = dispensaries.id AND dcp.enabled = true
       ${whereClause}
-      ORDER BY name
+      ORDER BY dispensaries.name
       LIMIT $${paramIndex} OFFSET $${paramIndex + 1}
     `,
     params
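For reference, the columns the rewritten query adds give each /stores row roughly this shape (names come from the SELECT list above; the TypeScript types are assumptions):

interface StoreRowWithCrawlerProfile {
  id: number;
  name: string;
  product_count: number;          // correlated subquery on dutchie_products
  crawler_status: string | null;  // null when no enabled dispensary_crawler_profiles row matches
  crawler_profile_key: string | null;
  next_retry_at: string | null;
  sandbox_attempt_count: number | null;
}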
@@ -127,8 +132,15 @@ router.get('/stores', async (req: Request, res: Response) => {
     params.slice(0, -2)
   );
 
+  // Transform stores to include provider_display
+  const transformedStores = rows.map((store: any) => ({
+    ...store,
+    provider_raw: store.menu_type,
+    provider_display: getProviderDisplayName(store.menu_type),
+  }));
+
   res.json({
-    stores: rows,
+    stores: transformedStores,
     total: parseInt(countRows[0]?.total || '0', 10),
     limit: parseInt(limit as string, 10),
     offset: parseInt(offset as string, 10),
@@ -780,7 +792,7 @@ router.get('/products/:id/availability', async (req: Request, res: Response) =>
     )
     SELECT
       d.id as dispensary_id,
-      COALESCE(d.dba_name, d.name) as dispensary_name,
+      d.name as dispensary_name,
      d.city,
      d.state,
      d.address,
@@ -1042,8 +1054,12 @@ router.post('/admin/scheduler/trigger', async (_req: Request, res: Response) =>
 });
 
 /**
- * POST /api/dutchie-az/admin/crawl/:id
+ * POST /api/az/admin/crawl/:id
  * Crawl a single dispensary with job tracking
+ *
+ * @deprecated Use POST /api/admin/crawl/:dispensaryId instead.
+ * This route is kept for backward compatibility only.
+ * The canonical crawl endpoint is now /api/admin/crawl/:dispensaryId
  */
 router.post('/admin/crawl/:id', async (req: Request, res: Response) => {
   try {
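The @deprecated note implies callers only swap the path prefix; a hedged sketch (base URL and dispensary id are placeholders):

// Old route, still served for backward compatibility:
await fetch('http://localhost:3010/api/az/admin/crawl/42', { method: 'POST' });
// Canonical replacement named in the @deprecated note:
await fetch('http://localhost:3010/api/admin/crawl/42', { method: 'POST' });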
@@ -1075,7 +1091,6 @@ router.get('/admin/dutchie-stores', async (_req: Request, res: Response) => {
       SELECT
         d.id,
         d.name,
-        d.dba_name,
         d.city,
         d.state,
         d.menu_type,
@@ -1113,7 +1128,7 @@ router.get('/admin/dutchie-stores', async (_req: Request, res: Response) => {
       failed: failed.length,
       stores: rows.map((r: any) => ({
         id: r.id,
-        name: r.dba_name || r.name,
+        name: r.name,
         city: r.city,
         state: r.state,
         menuType: r.menu_type,
@@ -1688,6 +1703,7 @@ import {
 router.get('/monitor/active-jobs', async (_req: Request, res: Response) => {
   try {
     // Get running jobs from job_run_logs (scheduled jobs like "enqueue all")
+    // Includes worker_name and run_role for named workforce display
     const { rows: runningScheduledJobs } = await query<any>(`
       SELECT
         jrl.id,
@@ -1699,7 +1715,11 @@ router.get('/monitor/active-jobs', async (_req: Request, res: Response) => {
         jrl.items_succeeded,
         jrl.items_failed,
         jrl.metadata,
+        jrl.worker_name,
+        jrl.run_role,
         js.description as job_description,
+        js.worker_name as schedule_worker_name,
+        js.worker_role as schedule_worker_role,
         EXTRACT(EPOCH FROM (NOW() - jrl.started_at)) as duration_seconds
       FROM job_run_logs jrl
       LEFT JOIN job_schedules js ON jrl.schedule_id = js.id
@@ -1708,7 +1728,7 @@ router.get('/monitor/active-jobs', async (_req: Request, res: Response) => {
     `);
 
     // Get running crawl jobs (individual store crawls with worker info)
-    // Note: Use COALESCE for optional columns that may not exist in older schemas
+    // Includes enqueued_by_worker for tracking which named worker enqueued the job
     const { rows: runningCrawlJobs } = await query<any>(`
       SELECT
         cj.id,
@@ -1722,6 +1742,7 @@ router.get('/monitor/active-jobs', async (_req: Request, res: Response) => {
         cj.claimed_by as worker_id,
         cj.worker_hostname,
         cj.claimed_at,
+        cj.enqueued_by_worker,
         cj.products_found,
         cj.products_upserted,
         cj.snapshots_created,
@@ -1792,14 +1813,18 @@ router.get('/monitor/recent-jobs', async (req: Request, res: Response) => {
         jrl.items_succeeded,
         jrl.items_failed,
         jrl.metadata,
-        js.description as job_description
+        jrl.worker_name,
+        jrl.run_role,
+        js.description as job_description,
+        js.worker_name as schedule_worker_name,
+        js.worker_role as schedule_worker_role
       FROM job_run_logs jrl
       LEFT JOIN job_schedules js ON jrl.schedule_id = js.id
       ORDER BY jrl.created_at DESC
       LIMIT $1
     `, [limitNum]);
 
-    // Recent crawl jobs
+    // Recent crawl jobs (includes enqueued_by_worker for named workforce tracking)
     const { rows: recentCrawlJobs } = await query<any>(`
       SELECT
         cj.id,
@@ -1814,6 +1839,7 @@ router.get('/monitor/recent-jobs', async (req: Request, res: Response) => {
         cj.products_found,
         cj.snapshots_created,
         cj.metadata,
+        cj.enqueued_by_worker,
         EXTRACT(EPOCH FROM (COALESCE(cj.completed_at, NOW()) - cj.started_at)) * 1000 as duration_ms
       FROM dispensary_crawl_jobs cj
       LEFT JOIN dispensaries d ON cj.dispensary_id = d.id
@@ -1912,12 +1938,14 @@ router.get('/monitor/summary', async (_req: Request, res: Response) => {
       (SELECT MAX(completed_at) FROM job_run_logs WHERE status = 'success') as last_job_completed
     `);
 
-    // Get next scheduled runs
+    // Get next scheduled runs (with worker names)
     const { rows: nextRuns } = await query<any>(`
       SELECT
         id,
         job_name,
         description,
+        worker_name,
+        worker_role,
         enabled,
         next_run_at,
         last_status,
@@ -2034,6 +2062,189 @@ router.post('/admin/detection/trigger', async (_req: Request, res: Response) =>
   }
 });
 
+// ============================================================
+// CRAWLER RELIABILITY / HEALTH ENDPOINTS (Phase 1)
+// ============================================================
+
+/**
+ * GET /api/dutchie-az/admin/crawler/health
+ * Get overall crawler health metrics
+ */
+router.get('/admin/crawler/health', async (_req: Request, res: Response) => {
+  try {
+    const { rows } = await query<any>(`SELECT * FROM v_crawl_health`);
+    res.json(rows[0] || {
+      active_crawlers: 0,
+      degraded_crawlers: 0,
+      paused_crawlers: 0,
+      failed_crawlers: 0,
+      due_now: 0,
+      stores_with_failures: 0,
+      avg_consecutive_failures: 0,
+      successful_last_24h: 0,
+    });
+  } catch (error: any) {
+    // View might not exist yet
+    res.json({
+      active_crawlers: 0,
+      degraded_crawlers: 0,
+      paused_crawlers: 0,
+      failed_crawlers: 0,
+      due_now: 0,
+      error: 'View not available - run migration 046',
+    });
+  }
+});
+
+/**
+ * GET /api/dutchie-az/admin/crawler/error-summary
+ * Get error summary by code over last 7 days
+ */
+router.get('/admin/crawler/error-summary', async (_req: Request, res: Response) => {
+  try {
+    const { rows } = await query<any>(`SELECT * FROM v_crawl_error_summary`);
+    res.json({ errors: rows });
+  } catch (error: any) {
+    res.json({ errors: [], error: 'View not available - run migration 046' });
+  }
+});
+
+/**
+ * GET /api/dutchie-az/admin/crawler/status
+ * Get detailed status for all crawlers
+ */
+router.get('/admin/crawler/status', async (req: Request, res: Response) => {
+  try {
+    const { status, limit = '100', offset = '0' } = req.query;
+
+    let whereClause = '';
+    const params: any[] = [];
+    let paramIndex = 1;
+
+    if (status) {
+      whereClause = `WHERE crawl_status = $${paramIndex}`;
+      params.push(status);
+      paramIndex++;
+    }
+
+    params.push(parseInt(limit as string, 10), parseInt(offset as string, 10));
+
+    const { rows } = await query<any>(
+      `SELECT * FROM v_crawler_status
+       ${whereClause}
+       ORDER BY consecutive_failures DESC, name ASC
+       LIMIT $${paramIndex} OFFSET $${paramIndex + 1}`,
+      params
+    );
+
+    const { rows: countRows } = await query<any>(
+      `SELECT COUNT(*) as total FROM v_crawler_status ${whereClause}`,
+      params.slice(0, -2)
+    );
+
+    res.json({
+      stores: rows,
+      total: parseInt(countRows[0]?.total || '0', 10),
+      limit: parseInt(limit as string, 10),
+      offset: parseInt(offset as string, 10),
+    });
+  } catch (error: any) {
+    res.status(500).json({ error: error.message });
+  }
+});
+
+/**
+ * GET /api/dutchie-az/admin/crawler/attempts
+ * Get recent crawl attempts (for debugging)
+ */
+router.get('/admin/crawler/attempts', async (req: Request, res: Response) => {
+  try {
+    const { dispensaryId, errorCode, limit = '50', offset = '0' } = req.query;
+
+    let whereClause = 'WHERE 1=1';
+    const params: any[] = [];
+    let paramIndex = 1;
+
+    if (dispensaryId) {
+      whereClause += ` AND ca.dispensary_id = $${paramIndex}`;
+      params.push(parseInt(dispensaryId as string, 10));
+      paramIndex++;
+    }
+
+    if (errorCode) {
+      whereClause += ` AND ca.error_code = $${paramIndex}`;
+      params.push(errorCode);
+      paramIndex++;
+    }
+
+    params.push(parseInt(limit as string, 10), parseInt(offset as string, 10));
+
+    const { rows } = await query<any>(
+      `SELECT
+        ca.*,
+        d.name as dispensary_name,
+        d.city
+      FROM crawl_attempts ca
+      LEFT JOIN dispensaries d ON ca.dispensary_id = d.id
+      ${whereClause}
+      ORDER BY ca.started_at DESC
+      LIMIT $${paramIndex} OFFSET $${paramIndex + 1}`,
+      params
+    );
+
+    res.json({ attempts: rows });
+  } catch (error: any) {
+    res.status(500).json({ error: error.message });
+  }
+});
+
+/**
+ * POST /api/dutchie-az/admin/dispensaries/:id/pause
+ * Pause crawling for a dispensary
+ */
+router.post('/admin/dispensaries/:id/pause', async (req: Request, res: Response) => {
+  try {
+    const { id } = req.params;
+
+    await query(`
+      UPDATE dispensaries
+      SET crawl_status = 'paused',
+          next_crawl_at = NULL,
+          updated_at = NOW()
+      WHERE id = $1
+    `, [id]);
+
+    res.json({ success: true, message: `Crawling paused for dispensary ${id}` });
+  } catch (error: any) {
+    res.status(500).json({ error: error.message });
+  }
+});
+
+/**
+ * POST /api/dutchie-az/admin/dispensaries/:id/resume
+ * Resume crawling for a paused/degraded dispensary
+ */
+router.post('/admin/dispensaries/:id/resume', async (req: Request, res: Response) => {
+  try {
+    const { id } = req.params;
+
+    // Reset to active and schedule next crawl
+    await query(`
+      UPDATE dispensaries
+      SET crawl_status = 'active',
+          consecutive_failures = 0,
+          backoff_multiplier = 1.0,
+          next_crawl_at = NOW() + INTERVAL '5 minutes',
+          updated_at = NOW()
+      WHERE id = $1
+    `, [id]);
+
+    res.json({ success: true, message: `Crawling resumed for dispensary ${id}` });
+  } catch (error: any) {
+    res.status(500).json({ error: error.message });
+  }
+});
+
 // ============================================================
 // FAILED DISPENSARIES ROUTES
 // ============================================================
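The pause/resume round trip implied by the two UPDATE statements above, as a sketch (base URL and dispensary id are placeholders):

const base = 'http://localhost:3010/api/dutchie-az';
// Pause: crawl_status -> 'paused', next_crawl_at -> NULL (store drops out of the due queue)
await fetch(`${base}/admin/dispensaries/42/pause`, { method: 'POST' });
// Resume: status 'active', failure count and backoff reset, next crawl ~5 minutes out
await fetch(`${base}/admin/dispensaries/42/resume`, { method: 'POST' });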
@@ -2183,4 +2394,251 @@ router.get('/admin/dispensaries/health-summary', async (_req: Request, res: Resp
   }
 });
 
+// ============================================================
+// ORCHESTRATOR TRACE ROUTES
+// ============================================================
+
+import {
+  getLatestTrace,
+  getTraceById,
+  getTracesForDispensary,
+  getTraceByRunId,
+} from '../../services/orchestrator-trace';
+
+/**
+ * GET /api/dutchie-az/admin/dispensaries/:id/crawl-trace/latest
+ * Get the latest orchestrator trace for a dispensary
+ */
+router.get('/admin/dispensaries/:id/crawl-trace/latest', async (req: Request, res: Response) => {
+  try {
+    const { id } = req.params;
+    const trace = await getLatestTrace(parseInt(id, 10));
+
+    if (!trace) {
+      return res.status(404).json({ error: 'No trace found for this dispensary' });
+    }
+
+    res.json(trace);
+  } catch (error: any) {
+    res.status(500).json({ error: error.message });
+  }
+});
+
+/**
+ * GET /api/dutchie-az/admin/dispensaries/:id/crawl-traces
+ * Get paginated list of orchestrator traces for a dispensary
+ */
+router.get('/admin/dispensaries/:id/crawl-traces', async (req: Request, res: Response) => {
+  try {
+    const { id } = req.params;
+    const { limit = '20', offset = '0' } = req.query;
+
+    const result = await getTracesForDispensary(
+      parseInt(id, 10),
+      parseInt(limit as string, 10),
+      parseInt(offset as string, 10)
+    );
+
+    res.json(result);
+  } catch (error: any) {
+    res.status(500).json({ error: error.message });
+  }
+});
+
+/**
+ * GET /api/dutchie-az/admin/crawl-traces/:traceId
+ * Get a specific orchestrator trace by ID
+ */
+router.get('/admin/crawl-traces/:traceId', async (req: Request, res: Response) => {
+  try {
+    const { traceId } = req.params;
+    const trace = await getTraceById(parseInt(traceId, 10));
+
+    if (!trace) {
+      return res.status(404).json({ error: 'Trace not found' });
+    }
+
+    res.json(trace);
+  } catch (error: any) {
+    res.status(500).json({ error: error.message });
+  }
+});
+
+/**
+ * GET /api/dutchie-az/admin/crawl-traces/run/:runId
+ * Get a specific orchestrator trace by run ID
+ */
+router.get('/admin/crawl-traces/run/:runId', async (req: Request, res: Response) => {
+  try {
+    const { runId } = req.params;
+    const trace = await getTraceByRunId(runId);
+
+    if (!trace) {
+      return res.status(404).json({ error: 'Trace not found for this run ID' });
+    }
+
+    res.json(trace);
+  } catch (error: any) {
+    res.status(500).json({ error: error.message });
+  }
+});
+
+// ============================================================
+// SCRAPER OVERVIEW DASHBOARD ENDPOINTS
+// ============================================================
+
+/**
+ * GET /api/dutchie-az/scraper/overview
+ * Comprehensive scraper overview for the new dashboard
+ */
+router.get('/scraper/overview', async (_req: Request, res: Response) => {
+  try {
+    // 1. Core KPI metrics
+    const { rows: kpiRows } = await query<any>(`
+      SELECT
+        -- Total products
+        (SELECT COUNT(*) FROM dutchie_products) AS total_products,
+        (SELECT COUNT(*) FROM dutchie_products WHERE stock_status = 'in_stock') AS in_stock_products,
+        -- Total dispensaries
+        (SELECT COUNT(*) FROM dispensaries WHERE menu_type = 'dutchie' AND state = 'AZ') AS total_dispensaries,
+        (SELECT COUNT(*) FROM dispensaries WHERE menu_type = 'dutchie' AND state = 'AZ' AND platform_dispensary_id IS NOT NULL) AS crawlable_dispensaries,
+        -- Visibility stats (24h)
+        (SELECT COUNT(*) FROM dutchie_products WHERE visibility_lost = true AND visibility_lost_at > NOW() - INTERVAL '24 hours') AS visibility_lost_24h,
+        (SELECT COUNT(*) FROM dutchie_products WHERE visibility_restored_at > NOW() - INTERVAL '24 hours') AS visibility_restored_24h,
+        (SELECT COUNT(*) FROM dutchie_products WHERE visibility_lost = true) AS total_visibility_lost,
+        -- Job stats (24h)
+        (SELECT COUNT(*) FROM job_run_logs WHERE status IN ('error', 'partial') AND created_at > NOW() - INTERVAL '24 hours') AS errors_24h,
+        (SELECT COUNT(*) FROM job_run_logs WHERE status = 'success' AND created_at > NOW() - INTERVAL '24 hours') AS successful_jobs_24h,
+        -- Active workers
+        (SELECT COUNT(*) FROM job_schedules WHERE enabled = true) AS active_workers
+    `);
+
+    // 2. Get active worker names
+    const { rows: workerRows } = await query<any>(`
+      SELECT worker_name, worker_role, enabled, last_status, last_run_at, next_run_at
+      FROM job_schedules
+      WHERE enabled = true
+      ORDER BY next_run_at ASC NULLS LAST
+    `);
+
+    // 3. Scrape activity by hour (last 24h)
+    const { rows: activityRows } = await query<any>(`
+      SELECT
+        date_trunc('hour', started_at) AS hour,
+        COUNT(*) FILTER (WHERE status = 'success') AS successful,
+        COUNT(*) FILTER (WHERE status IN ('error', 'partial')) AS failed,
+        COUNT(*) AS total
+      FROM job_run_logs
+      WHERE started_at > NOW() - INTERVAL '24 hours'
+      GROUP BY date_trunc('hour', started_at)
+      ORDER BY hour ASC
+    `);
+
+    // 4. Product growth / coverage (last 7 days)
+    const { rows: growthRows } = await query<any>(`
+      SELECT
+        date_trunc('day', created_at) AS day,
+        COUNT(*) AS new_products
+      FROM dutchie_products
+      WHERE created_at > NOW() - INTERVAL '7 days'
+      GROUP BY date_trunc('day', created_at)
+      ORDER BY day ASC
+    `);
+
+    // 5. Recent worker runs (last 20)
+    const { rows: recentRuns } = await query<any>(`
+      SELECT
+        jrl.id,
+        jrl.job_name,
+        jrl.status,
+        jrl.started_at,
+        jrl.completed_at,
+        jrl.items_processed,
+        jrl.items_succeeded,
+        jrl.items_failed,
+        jrl.metadata,
+        js.worker_name,
+        js.worker_role
+      FROM job_run_logs jrl
+      LEFT JOIN job_schedules js ON jrl.schedule_id = js.id
+      ORDER BY jrl.started_at DESC
+      LIMIT 20
+    `);
+
+    // 6. Recent visibility changes by store
+    const { rows: visibilityChanges } = await query<any>(`
+      SELECT
+        d.id AS dispensary_id,
+        d.name AS dispensary_name,
+        d.state,
+        COUNT(dp.id) FILTER (WHERE dp.visibility_lost = true AND dp.visibility_lost_at > NOW() - INTERVAL '24 hours') AS lost_24h,
+        COUNT(dp.id) FILTER (WHERE dp.visibility_restored_at > NOW() - INTERVAL '24 hours') AS restored_24h,
+        MAX(dp.visibility_lost_at) AS latest_loss,
+        MAX(dp.visibility_restored_at) AS latest_restore
+      FROM dispensaries d
+      LEFT JOIN dutchie_products dp ON d.id = dp.dispensary_id
+      WHERE d.menu_type = 'dutchie'
+      GROUP BY d.id, d.name, d.state
+      HAVING COUNT(dp.id) FILTER (WHERE dp.visibility_lost = true AND dp.visibility_lost_at > NOW() - INTERVAL '24 hours') > 0
+         OR COUNT(dp.id) FILTER (WHERE dp.visibility_restored_at > NOW() - INTERVAL '24 hours') > 0
+      ORDER BY lost_24h DESC, restored_24h DESC
+      LIMIT 15
+    `);
+
+    const kpi = kpiRows[0] || {};
+
+    res.json({
+      kpi: {
+        totalProducts: parseInt(kpi.total_products || '0'),
+        inStockProducts: parseInt(kpi.in_stock_products || '0'),
+        totalDispensaries: parseInt(kpi.total_dispensaries || '0'),
+        crawlableDispensaries: parseInt(kpi.crawlable_dispensaries || '0'),
+        visibilityLost24h: parseInt(kpi.visibility_lost_24h || '0'),
+        visibilityRestored24h: parseInt(kpi.visibility_restored_24h || '0'),
+        totalVisibilityLost: parseInt(kpi.total_visibility_lost || '0'),
+        errors24h: parseInt(kpi.errors_24h || '0'),
+        successfulJobs24h: parseInt(kpi.successful_jobs_24h || '0'),
+        activeWorkers: parseInt(kpi.active_workers || '0'),
+      },
+      workers: workerRows,
+      activityByHour: activityRows.map((row: any) => ({
+        hour: row.hour,
+        successful: parseInt(row.successful || '0'),
+        failed: parseInt(row.failed || '0'),
+        total: parseInt(row.total || '0'),
+      })),
+      productGrowth: growthRows.map((row: any) => ({
+        day: row.day,
+        newProducts: parseInt(row.new_products || '0'),
+      })),
+      recentRuns: recentRuns.map((row: any) => ({
+        id: row.id,
+        jobName: row.job_name,
+        status: row.status,
+        startedAt: row.started_at,
+        completedAt: row.completed_at,
+        itemsProcessed: row.items_processed,
+        itemsSucceeded: row.items_succeeded,
+        itemsFailed: row.items_failed,
+        workerName: row.worker_name,
+        workerRole: row.worker_role,
+        visibilityLost: row.metadata?.visibilityLostCount || 0,
+        visibilityRestored: row.metadata?.visibilityRestoredCount || 0,
+      })),
+      visibilityChanges: visibilityChanges.map((row: any) => ({
+        dispensaryId: row.dispensary_id,
+        dispensaryName: row.dispensary_name,
+        state: row.state,
+        lost24h: parseInt(row.lost_24h || '0'),
+        restored24h: parseInt(row.restored_24h || '0'),
+        latestLoss: row.latest_loss,
+        latestRestore: row.latest_restore,
+      })),
+    });
+  } catch (error: any) {
+    console.error('Error fetching scraper overview:', error);
+    res.status(500).json({ error: error.message });
+  }
+});
+
 export default router;
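A dashboard client would consume the overview payload shaped by the res.json(...) call above; a sketch with field names taken from that call (the base URL is a placeholder):

const res = await fetch('http://localhost:3010/api/dutchie-az/scraper/overview');
const overview = await res.json();
const { totalProducts, inStockProducts, errors24h, activeWorkers } = overview.kpi;
console.log(`${inStockProducts}/${totalProducts} in stock, ${errors24h} job errors (24h), ${activeWorkers} workers`);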
486
backend/src/dutchie-az/scripts/stress-test.ts
Normal file
@@ -0,0 +1,486 @@
#!/usr/bin/env npx tsx
/**
 * Crawler Reliability Stress Test
 *
 * Simulates various failure scenarios to test:
 * - Retry logic with exponential backoff
 * - Error taxonomy classification
 * - Self-healing (proxy/UA rotation)
 * - Status transitions (active -> degraded -> failed)
 * - Minimum crawl gap enforcement
 *
 * Phase 1: Crawler Reliability & Stabilization
 *
 * Usage:
 *   DATABASE_URL="postgresql://..." npx tsx src/dutchie-az/scripts/stress-test.ts [test-name]
 *
 * Available tests:
 *   retry    - Test retry manager with various error types
 *   backoff  - Test exponential backoff calculation
 *   status   - Test status transitions
 *   gap      - Test minimum crawl gap enforcement
 *   rotation - Test proxy/UA rotation
 *   all      - Run all tests
 */

import {
  CrawlErrorCode,
  classifyError,
  isRetryable,
  shouldRotateProxy,
  shouldRotateUserAgent,
  getBackoffMultiplier,
  getErrorMetadata,
} from '../services/error-taxonomy';

import {
  RetryManager,
  withRetry,
  calculateNextCrawlDelay,
  calculateNextCrawlAt,
  determineCrawlStatus,
  shouldAttemptRecovery,
  sleep,
} from '../services/retry-manager';

import {
  UserAgentRotator,
  USER_AGENTS,
} from '../services/proxy-rotator';

import {
  validateStoreConfig,
  isCrawlable,
  DEFAULT_CONFIG,
  RawStoreConfig,
} from '../services/store-validator';

// ============================================================
// TEST UTILITIES
// ============================================================

let testsPassed = 0;
let testsFailed = 0;

function assert(condition: boolean, message: string): void {
  if (condition) {
    console.log(`  ✓ ${message}`);
    testsPassed++;
  } else {
    console.log(`  ✗ ${message}`);
    testsFailed++;
  }
}

function section(name: string): void {
  console.log(`\n${'='.repeat(60)}`);
  console.log(`TEST: ${name}`);
  console.log('='.repeat(60));
}

// ============================================================
// TEST: Error Classification
// ============================================================

function testErrorClassification(): void {
  section('Error Classification');

  // HTTP status codes
  assert(classifyError(null, 429) === CrawlErrorCode.RATE_LIMITED, '429 -> RATE_LIMITED');
  assert(classifyError(null, 407) === CrawlErrorCode.BLOCKED_PROXY, '407 -> BLOCKED_PROXY');
  assert(classifyError(null, 401) === CrawlErrorCode.AUTH_FAILED, '401 -> AUTH_FAILED');
  assert(classifyError(null, 403) === CrawlErrorCode.AUTH_FAILED, '403 -> AUTH_FAILED');
  assert(classifyError(null, 503) === CrawlErrorCode.SERVICE_UNAVAILABLE, '503 -> SERVICE_UNAVAILABLE');
  assert(classifyError(null, 500) === CrawlErrorCode.SERVER_ERROR, '500 -> SERVER_ERROR');

  // Error messages
  assert(classifyError('rate limit exceeded') === CrawlErrorCode.RATE_LIMITED, 'rate limit message -> RATE_LIMITED');
  assert(classifyError('request timed out') === CrawlErrorCode.TIMEOUT, 'timeout message -> TIMEOUT');
  assert(classifyError('proxy blocked') === CrawlErrorCode.BLOCKED_PROXY, 'proxy blocked -> BLOCKED_PROXY');
  assert(classifyError('ECONNREFUSED') === CrawlErrorCode.NETWORK_ERROR, 'ECONNREFUSED -> NETWORK_ERROR');
  assert(classifyError('ENOTFOUND') === CrawlErrorCode.DNS_ERROR, 'ENOTFOUND -> DNS_ERROR');
  assert(classifyError('selector not found') === CrawlErrorCode.HTML_CHANGED, 'selector error -> HTML_CHANGED');
  assert(classifyError('JSON parse error') === CrawlErrorCode.PARSE_ERROR, 'parse error -> PARSE_ERROR');
  assert(classifyError('0 products found') === CrawlErrorCode.NO_PRODUCTS, 'no products -> NO_PRODUCTS');

  // Retryability
  assert(isRetryable(CrawlErrorCode.RATE_LIMITED) === true, 'RATE_LIMITED is retryable');
  assert(isRetryable(CrawlErrorCode.TIMEOUT) === true, 'TIMEOUT is retryable');
  assert(isRetryable(CrawlErrorCode.HTML_CHANGED) === false, 'HTML_CHANGED is NOT retryable');
  assert(isRetryable(CrawlErrorCode.INVALID_CONFIG) === false, 'INVALID_CONFIG is NOT retryable');

  // Rotation decisions
  assert(shouldRotateProxy(CrawlErrorCode.BLOCKED_PROXY) === true, 'BLOCKED_PROXY -> rotate proxy');
  assert(shouldRotateProxy(CrawlErrorCode.RATE_LIMITED) === true, 'RATE_LIMITED -> rotate proxy');
  assert(shouldRotateUserAgent(CrawlErrorCode.AUTH_FAILED) === true, 'AUTH_FAILED -> rotate UA');
}

// ============================================================
// TEST: Retry Manager
// ============================================================

function testRetryManager(): void {
  section('Retry Manager');

  const manager = new RetryManager({ maxRetries: 3, baseBackoffMs: 100 });

  // Initial state
  assert(manager.shouldAttempt() === true, 'Should attempt initially');
  assert(manager.getAttemptNumber() === 1, 'Attempt number starts at 1');

  // First attempt
  manager.recordAttempt();
  assert(manager.getAttemptNumber() === 2, 'Attempt number increments');

  // Evaluate retryable error
  const decision1 = manager.evaluateError(new Error('rate limit exceeded'), 429);
  assert(decision1.shouldRetry === true, 'Should retry on rate limit');
  assert(decision1.errorCode === CrawlErrorCode.RATE_LIMITED, 'Error code is RATE_LIMITED');
  assert(decision1.rotateProxy === true, 'Should rotate proxy');
  assert(decision1.backoffMs > 0, 'Backoff is positive');

  // More attempts
  manager.recordAttempt();
  manager.recordAttempt();

  // Now at max retries
  const decision2 = manager.evaluateError(new Error('timeout'), 504);
  assert(decision2.shouldRetry === true, 'Should still retry (at limit but not exceeded)');

  manager.recordAttempt();
  const decision3 = manager.evaluateError(new Error('timeout'));
  assert(decision3.shouldRetry === false, 'Should NOT retry after max');
  assert(decision3.reason.includes('exhausted'), 'Reason mentions exhausted');

  // Reset
  manager.reset();
  assert(manager.shouldAttempt() === true, 'Should attempt after reset');
  assert(manager.getAttemptNumber() === 1, 'Attempt number resets');

  // Non-retryable error
  const manager2 = new RetryManager({ maxRetries: 3 });
  manager2.recordAttempt();
  const nonRetryable = manager2.evaluateError(new Error('HTML structure changed'));
  assert(nonRetryable.shouldRetry === false, 'Non-retryable error stops immediately');
  assert(nonRetryable.errorCode === CrawlErrorCode.HTML_CHANGED, 'Error code is HTML_CHANGED');
}
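// ------------------------------------------------------------
// Illustrative only (not executed by the tests): how the methods
// exercised above compose into a crawl loop. crawlOnce() is a
// hypothetical stand-in for a real crawl call.
// ------------------------------------------------------------
// async function crawlWithRetries(crawlOnce: () => Promise<void>): Promise<void> {
//   const mgr = new RetryManager({ maxRetries: 3, baseBackoffMs: 100 });
//   while (mgr.shouldAttempt()) {
//     mgr.recordAttempt();
//     try {
//       return await crawlOnce();
//     } catch (err) {
//       const decision = mgr.evaluateError(err as Error);
//       if (!decision.shouldRetry) throw err; // exhausted, or non-retryable (e.g. HTML_CHANGED)
//       await sleep(decision.backoffMs);      // honor the suggested backoff
//     }
//   }
// }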
|
|
||||||
|
// ============================================================
|
||||||
|
// TEST: Exponential Backoff
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
function testExponentialBackoff(): void {
|
||||||
|
section('Exponential Backoff');
|
||||||
|
|
||||||
|
// Calculate next crawl delay
|
||||||
|
const delay0 = calculateNextCrawlDelay(0, 240); // No failures
|
||||||
|
const delay1 = calculateNextCrawlDelay(1, 240); // 1 failure
|
||||||
|
const delay2 = calculateNextCrawlDelay(2, 240); // 2 failures
|
||||||
|
const delay3 = calculateNextCrawlDelay(3, 240); // 3 failures
|
||||||
|
const delay5 = calculateNextCrawlDelay(5, 240); // 5 failures (should cap)
|
||||||
|
|
||||||
|
console.log(` Delay with 0 failures: ${delay0} minutes`);
|
||||||
|
console.log(` Delay with 1 failure: ${delay1} minutes`);
|
||||||
|
console.log(` Delay with 2 failures: ${delay2} minutes`);
|
||||||
|
console.log(` Delay with 3 failures: ${delay3} minutes`);
|
||||||
|
console.log(` Delay with 5 failures: ${delay5} minutes`);
|
||||||
|
|
||||||
|
assert(delay1 > delay0, 'Delay increases with failures');
|
||||||
|
assert(delay2 > delay1, 'Delay keeps increasing');
|
||||||
|
assert(delay3 > delay2, 'More delay with more failures');
|
||||||
|
// With jitter, exact values vary but ratio should be close to 2x
|
||||||
|
assert(delay5 <= 240 * 4 * 1.2, 'Delay is capped at max multiplier');
|
||||||
|
|
||||||
|
// Next crawl time calculation
|
||||||
|
const now = new Date();
|
||||||
|
const nextAt = calculateNextCrawlAt(2, 240);
|
||||||
|
assert(nextAt > now, 'Next crawl is in future');
|
||||||
|
assert(nextAt.getTime() - now.getTime() > 240 * 60 * 1000, 'Includes backoff');
|
||||||
|
}
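
// The assertions above pin the backoff contract without showing the
// implementation. A minimal sketch of a rule that satisfies them (the real
// calculateNextCrawlDelay ships with the orchestrator; the exact jitter
// window here is an assumption read off the cap assertion):
function sketchNextCrawlDelay(consecutiveFailures: number, baseMinutes: number): number {
  const multiplier = Math.min(2 ** consecutiveFailures, 4); // roughly doubles per failure, capped at 4x
  const jitter = 0.8 + Math.random() * 0.4;                 // +/-20%, so base * 4 * 1.2 bounds the max
  return Math.round(baseMinutes * multiplier * jitter);
}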

// ============================================================
// TEST: Status Transitions
// ============================================================

function testStatusTransitions(): void {
  section('Status Transitions');

  // Active status
  assert(determineCrawlStatus(0) === 'active', '0 failures -> active');
  assert(determineCrawlStatus(1) === 'active', '1 failure -> active');
  assert(determineCrawlStatus(2) === 'active', '2 failures -> active');

  // Degraded status
  assert(determineCrawlStatus(3) === 'degraded', '3 failures -> degraded');
  assert(determineCrawlStatus(5) === 'degraded', '5 failures -> degraded');
  assert(determineCrawlStatus(9) === 'degraded', '9 failures -> degraded');

  // Failed status
  assert(determineCrawlStatus(10) === 'failed', '10 failures -> failed');
  assert(determineCrawlStatus(15) === 'failed', '15 failures -> failed');

  // Custom thresholds
  const customStatus = determineCrawlStatus(5, { degraded: 5, failed: 8 });
  assert(customStatus === 'degraded', 'Custom threshold: 5 -> degraded');

  // Recovery check
  const recentFailure = new Date(Date.now() - 1 * 60 * 60 * 1000); // 1 hour ago
  const oldFailure = new Date(Date.now() - 48 * 60 * 60 * 1000); // 48 hours ago

  assert(shouldAttemptRecovery(recentFailure, 1) === false, 'No recovery for recent failure');
  assert(shouldAttemptRecovery(oldFailure, 1) === true, 'Recovery allowed for old failure');
  assert(shouldAttemptRecovery(null, 0) === true, 'Recovery allowed if no previous failure');
}
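
// For reference, a status function consistent with every assertion above -
// a sketch, not the shipped determineCrawlStatus (the default thresholds of
// 3 and 10 are read off the test cases):
type SketchCrawlStatus = 'active' | 'degraded' | 'failed';
function sketchDetermineCrawlStatus(
  failures: number,
  thresholds: { degraded: number; failed: number } = { degraded: 3, failed: 10 }
): SketchCrawlStatus {
  if (failures >= thresholds.failed) return 'failed';     // 10+ consecutive failures
  if (failures >= thresholds.degraded) return 'degraded'; // 3-9
  return 'active';                                        // 0-2
}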

// ============================================================
// TEST: Store Validation
// ============================================================

function testStoreValidation(): void {
  section('Store Validation');

  // Valid config
  const validConfig: RawStoreConfig = {
    id: 1,
    name: 'Test Store',
    platformDispensaryId: '123abc',
    menuType: 'dutchie',
  };
  const validResult = validateStoreConfig(validConfig);
  assert(validResult.isValid === true, 'Valid config passes');
  assert(validResult.config !== null, 'Valid config returns config');
  assert(validResult.config?.slug === 'test-store', 'Slug is generated');

  // Missing required fields
  const missingId: RawStoreConfig = {
    id: 0,
    name: 'Test',
    platformDispensaryId: '123',
    menuType: 'dutchie',
  };
  const missingIdResult = validateStoreConfig(missingId);
  assert(missingIdResult.isValid === false, 'Missing ID fails');

  // Missing platform ID
  const missingPlatform: RawStoreConfig = {
    id: 1,
    name: 'Test',
    menuType: 'dutchie',
  };
  const missingPlatformResult = validateStoreConfig(missingPlatform);
  assert(missingPlatformResult.isValid === false, 'Missing platform ID fails');

  // Unknown menu type
  const unknownMenu: RawStoreConfig = {
    id: 1,
    name: 'Test',
    platformDispensaryId: '123',
    menuType: 'unknown',
  };
  const unknownMenuResult = validateStoreConfig(unknownMenu);
  assert(unknownMenuResult.isValid === false, 'Unknown menu type fails');

  // Crawlable check
  assert(isCrawlable(validConfig) === true, 'Valid config is crawlable');
  assert(isCrawlable(missingPlatform) === false, 'Missing platform not crawlable');
  assert(isCrawlable({ ...validConfig, crawlStatus: 'failed' }) === false, 'Failed status not crawlable');
  assert(isCrawlable({ ...validConfig, crawlStatus: 'paused' }) === false, 'Paused status not crawlable');
}

// ============================================================
// TEST: User Agent Rotation
// ============================================================

function testUserAgentRotation(): void {
  section('User Agent Rotation');

  const rotator = new UserAgentRotator();

  const first = rotator.getCurrent();
  const second = rotator.getNext();
  const third = rotator.getNext();

  assert(first !== second, 'User agents rotate');
  assert(second !== third, 'User agents keep rotating');
  assert(USER_AGENTS.includes(first), 'Returns valid UA');
  assert(USER_AGENTS.includes(second), 'Returns valid UA');

  // Random UA
  const random = rotator.getRandom();
  assert(USER_AGENTS.includes(random), 'Random returns valid UA');

  // Count
  assert(rotator.getCount() === USER_AGENTS.length, 'Reports correct count');
}

// ============================================================
// TEST: WithRetry Helper
// ============================================================

async function testWithRetryHelper(): Promise<void> {
  section('WithRetry Helper');

  // Succeeds on first try
  let attempts = 0;
  const successResult = await withRetry(async () => {
    attempts++;
    return 'success';
  }, { maxRetries: 3 });
  assert(attempts === 1, 'Succeeds on first try');
  assert(successResult.result === 'success', 'Returns result');

  // Fails then succeeds
  let failThenSucceedAttempts = 0;
  const failThenSuccessResult = await withRetry(async () => {
    failThenSucceedAttempts++;
    if (failThenSucceedAttempts < 3) {
      throw new Error('temporary error');
    }
    return 'finally succeeded';
  }, { maxRetries: 5, baseBackoffMs: 10 });
  assert(failThenSucceedAttempts === 3, 'Retries until success');
  assert(failThenSuccessResult.result === 'finally succeeded', 'Returns final result');
  assert(failThenSuccessResult.summary.attemptsMade === 3, 'Summary tracks attempts');

  // Exhausts retries
  let alwaysFailAttempts = 0;
  try {
    await withRetry(async () => {
      alwaysFailAttempts++;
      throw new Error('always fails');
    }, { maxRetries: 2, baseBackoffMs: 10 });
    assert(false, 'Should have thrown');
  } catch (error: any) {
    assert(alwaysFailAttempts === 3, 'Attempts all retries'); // 1 initial + 2 retries
    assert(error.name === 'RetryExhaustedError', 'Throws RetryExhaustedError');
  }

  // Non-retryable error stops immediately
  let nonRetryableAttempts = 0;
  try {
    await withRetry(async () => {
      nonRetryableAttempts++;
      const err = new Error('HTML structure changed - selector not found');
      throw err;
    }, { maxRetries: 3, baseBackoffMs: 10 });
    assert(false, 'Should have thrown');
  } catch {
    assert(nonRetryableAttempts === 1, 'Non-retryable stops immediately');
  }
}

// ============================================================
// TEST: Minimum Crawl Gap
// ============================================================

function testMinimumCrawlGap(): void {
  section('Minimum Crawl Gap');

  // Default config
  assert(DEFAULT_CONFIG.minCrawlGapMinutes === 2, 'Default gap is 2 minutes');
  assert(DEFAULT_CONFIG.crawlFrequencyMinutes === 240, 'Default frequency is 4 hours');

  // Gap calculation
  const gapMs = DEFAULT_CONFIG.minCrawlGapMinutes * 60 * 1000;
  assert(gapMs === 120000, 'Gap is 2 minutes in ms');

  console.log('  Note: Gap enforcement is tested at DB level (trigger) and application level');
}

// ============================================================
// TEST: Error Metadata
// ============================================================

function testErrorMetadata(): void {
  section('Error Metadata');

  // RATE_LIMITED
  const rateLimited = getErrorMetadata(CrawlErrorCode.RATE_LIMITED);
  assert(rateLimited.retryable === true, 'RATE_LIMITED is retryable');
  assert(rateLimited.rotateProxy === true, 'RATE_LIMITED rotates proxy');
  assert(rateLimited.backoffMultiplier === 2.0, 'RATE_LIMITED has 2x backoff');
  assert(rateLimited.severity === 'medium', 'RATE_LIMITED is medium severity');

  // HTML_CHANGED
  const htmlChanged = getErrorMetadata(CrawlErrorCode.HTML_CHANGED);
  assert(htmlChanged.retryable === false, 'HTML_CHANGED is NOT retryable');
  assert(htmlChanged.severity === 'high', 'HTML_CHANGED is high severity');

  // INVALID_CONFIG
  const invalidConfig = getErrorMetadata(CrawlErrorCode.INVALID_CONFIG);
  assert(invalidConfig.retryable === false, 'INVALID_CONFIG is NOT retryable');
  assert(invalidConfig.severity === 'critical', 'INVALID_CONFIG is critical');
}

// ============================================================
// MAIN
// ============================================================

async function runTests(testName?: string): Promise<void> {
  console.log('\n');
  console.log('╔══════════════════════════════════════════════════════════╗');
  console.log('║        CRAWLER RELIABILITY STRESS TEST - PHASE 1         ║');
  console.log('╚══════════════════════════════════════════════════════════╝');

  const allTests = !testName || testName === 'all';

  if (allTests || testName === 'error' || testName === 'classification') {
    testErrorClassification();
  }

  if (allTests || testName === 'retry') {
    testRetryManager();
  }

  if (allTests || testName === 'backoff') {
    testExponentialBackoff();
  }

  if (allTests || testName === 'status') {
    testStatusTransitions();
  }

  if (allTests || testName === 'validation' || testName === 'store') {
    testStoreValidation();
  }

  if (allTests || testName === 'rotation' || testName === 'ua') {
    testUserAgentRotation();
  }

  if (allTests || testName === 'withRetry' || testName === 'helper') {
    await testWithRetryHelper();
  }

  if (allTests || testName === 'gap') {
    testMinimumCrawlGap();
  }

  if (allTests || testName === 'metadata') {
    testErrorMetadata();
  }

  // Summary
  console.log('\n');
  console.log('═'.repeat(60));
  console.log('SUMMARY');
  console.log('═'.repeat(60));
  console.log(`  Passed: ${testsPassed}`);
  console.log(`  Failed: ${testsFailed}`);
  console.log(`  Total:  ${testsPassed + testsFailed}`);

  if (testsFailed > 0) {
    console.log('\n❌ SOME TESTS FAILED\n');
    process.exit(1);
  } else {
    console.log('\n✅ ALL TESTS PASSED\n');
    process.exit(0);
  }
}

// Run tests
const testName = process.argv[2];
runTests(testName).catch((error) => {
  console.error('Fatal error:', error);
  process.exit(1);
});
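
// For reference, the tallying helpers the suites above depend on are defined
// near the top of this file. A minimal sketch of their contract, kept as a
// comment to avoid redeclaring them (names and counters are taken from usage;
// the real definitions may differ):
//
//   let testsPassed = 0;
//   let testsFailed = 0;
//
//   function section(title: string): void {
//     console.log(`\n--- ${title} ---`);
//   }
//
//   function assert(condition: boolean, label: string): void {
//     if (condition) { testsPassed++; console.log(`  PASS ${label}`); }
//     else { testsFailed++; console.log(`  FAIL ${label}`); }
//   }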
659
backend/src/dutchie-az/services/analytics/brand-opportunity.ts
Normal file

@@ -0,0 +1,659 @@
/**
 * Brand Opportunity / Risk Analytics Service
 *
 * Provides brand-level opportunity and risk analysis including:
 * - Under/overpriced vs market
 * - Missing SKU opportunities
 * - Stores with declining/growing shelf share
 * - Competitor intrusion alerts
 *
 * Phase 3: Analytics Dashboards
 */

import { Pool } from 'pg';
import { AnalyticsCache, cacheKey } from './cache';

export interface BrandOpportunity {
  brandName: string;
  underpricedVsMarket: PricePosition[];
  overpricedVsMarket: PricePosition[];
  missingSkuOpportunities: MissingSkuOpportunity[];
  storesWithDecliningShelfShare: StoreShelfShareChange[];
  storesWithGrowingShelfShare: StoreShelfShareChange[];
  competitorIntrusionAlerts: CompetitorAlert[];
  overallScore: number; // 0-100, higher = more opportunity
  riskScore: number; // 0-100, higher = more risk
}

export interface PricePosition {
  category: string;
  brandAvgPrice: number;
  marketAvgPrice: number;
  priceDifferencePercent: number;
  skuCount: number;
  suggestion: string;
}

export interface MissingSkuOpportunity {
  category: string;
  subcategory: string | null;
  marketSkuCount: number;
  brandSkuCount: number;
  gapPercent: number;
  topCompetitors: string[];
  opportunityScore: number; // 0-100
}

export interface StoreShelfShareChange {
  storeId: number;
  storeName: string;
  city: string;
  state: string;
  currentShelfShare: number;
  previousShelfShare: number;
  changePercent: number;
  currentSkus: number;
  competitors: string[];
}

export interface CompetitorAlert {
  competitorBrand: string;
  storeId: number;
  storeName: string;
  alertType: 'new_entry' | 'expanding' | 'price_undercut';
  details: string;
  severity: 'low' | 'medium' | 'high';
  date: string;
}

export interface MarketPositionSummary {
  brandName: string;
  marketSharePercent: number;
  avgPriceVsMarket: number; // -X% to +X%
  categoryStrengths: Array<{ category: string; shelfSharePercent: number }>;
  categoryWeaknesses: Array<{ category: string; shelfSharePercent: number; marketLeader: string }>;
  growthTrend: 'growing' | 'stable' | 'declining';
  competitorThreats: string[];
}

export class BrandOpportunityService {
  private pool: Pool;
  private cache: AnalyticsCache;

  constructor(pool: Pool, cache: AnalyticsCache) {
    this.pool = pool;
    this.cache = cache;
  }

  /**
   * Get full opportunity analysis for a brand
   */
  async getBrandOpportunity(brandName: string): Promise<BrandOpportunity> {
    const key = cacheKey('brand_opportunity', { brandName });

    return (await this.cache.getOrCompute(key, async () => {
      const [
        underpriced,
        overpriced,
        missingSkus,
        decliningStores,
        growingStores,
        alerts,
      ] = await Promise.all([
        this.getUnderpricedPositions(brandName),
        this.getOverpricedPositions(brandName),
        this.getMissingSkuOpportunities(brandName),
        this.getStoresWithDecliningShare(brandName),
        this.getStoresWithGrowingShare(brandName),
        this.getCompetitorAlerts(brandName),
      ]);

      // Calculate opportunity score (higher = more opportunity)
      const opportunityFactors = [
        missingSkus.length > 0 ? 20 : 0,
        underpriced.length > 0 ? 15 : 0,
        growingStores.length > 5 ? 20 : growingStores.length * 3,
        missingSkus.reduce((sum, m) => sum + m.opportunityScore, 0) / Math.max(1, missingSkus.length) * 0.3,
      ];
      const opportunityScore = Math.min(100, opportunityFactors.reduce((a, b) => a + b, 0));

      // Calculate risk score (higher = more risk)
      const riskFactors = [
        decliningStores.length > 5 ? 30 : decliningStores.length * 5,
        alerts.filter(a => a.severity === 'high').length * 15,
        alerts.filter(a => a.severity === 'medium').length * 8,
        overpriced.length > 3 ? 15 : overpriced.length * 3,
      ];
      const riskScore = Math.min(100, riskFactors.reduce((a, b) => a + b, 0));

      return {
        brandName,
        underpricedVsMarket: underpriced,
        overpricedVsMarket: overpriced,
        missingSkuOpportunities: missingSkus,
        storesWithDecliningShelfShare: decliningStores,
        storesWithGrowingShelfShare: growingStores,
        competitorIntrusionAlerts: alerts,
        overallScore: Math.round(opportunityScore),
        riskScore: Math.round(riskScore),
      };
    }, 30)).data;
  }
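
  // Worked example of the scoring above (illustrative numbers): a brand with
  // 4 missing-SKU gaps averaging opportunityScore 60, 2 underpriced categories,
  // and 3 stores with growing share scores
  //   opportunity = 20 + 15 + (3 * 3) + (60 * 0.3) = 62
  // while 2 declining stores plus one high-severity alert and no overpricing give
  //   risk = (2 * 5) + (1 * 15) = 25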

  /**
   * Get categories where brand is underpriced vs market
   */
  async getUnderpricedPositions(brandName: string): Promise<PricePosition[]> {
    const result = await this.pool.query(`
      WITH brand_prices AS (
        SELECT
          type as category,
          AVG(extract_min_price(latest_raw_payload)) as brand_avg,
          COUNT(*) as sku_count
        FROM dutchie_products
        WHERE brand_name = $1 AND type IS NOT NULL
        GROUP BY type
        HAVING COUNT(*) >= 3
      ),
      market_prices AS (
        SELECT
          type as category,
          AVG(extract_min_price(latest_raw_payload)) as market_avg
        FROM dutchie_products
        WHERE type IS NOT NULL AND brand_name != $1
        GROUP BY type
      )
      SELECT
        bp.category,
        bp.brand_avg,
        mp.market_avg,
        bp.sku_count,
        ((bp.brand_avg - mp.market_avg) / NULLIF(mp.market_avg, 0)) * 100 as diff_pct
      FROM brand_prices bp
      JOIN market_prices mp ON bp.category = mp.category
      WHERE bp.brand_avg < mp.market_avg * 0.9 -- 10% or more below market
        AND bp.brand_avg IS NOT NULL
        AND mp.market_avg IS NOT NULL
      ORDER BY diff_pct
    `, [brandName]);

    return result.rows.map(row => ({
      category: row.category,
      brandAvgPrice: Math.round(parseFloat(row.brand_avg) * 100) / 100,
      marketAvgPrice: Math.round(parseFloat(row.market_avg) * 100) / 100,
      priceDifferencePercent: Math.round(parseFloat(row.diff_pct) * 10) / 10,
      skuCount: parseInt(row.sku_count) || 0,
      suggestion: `Consider price increase - ${Math.abs(Math.round(parseFloat(row.diff_pct)))}% below market average`,
    }));
  }

  /**
   * Get categories where brand is overpriced vs market
   */
  async getOverpricedPositions(brandName: string): Promise<PricePosition[]> {
    const result = await this.pool.query(`
      WITH brand_prices AS (
        SELECT
          type as category,
          AVG(extract_min_price(latest_raw_payload)) as brand_avg,
          COUNT(*) as sku_count
        FROM dutchie_products
        WHERE brand_name = $1 AND type IS NOT NULL
        GROUP BY type
        HAVING COUNT(*) >= 3
      ),
      market_prices AS (
        SELECT
          type as category,
          AVG(extract_min_price(latest_raw_payload)) as market_avg
        FROM dutchie_products
        WHERE type IS NOT NULL AND brand_name != $1
        GROUP BY type
      )
      SELECT
        bp.category,
        bp.brand_avg,
        mp.market_avg,
        bp.sku_count,
        ((bp.brand_avg - mp.market_avg) / NULLIF(mp.market_avg, 0)) * 100 as diff_pct
      FROM brand_prices bp
      JOIN market_prices mp ON bp.category = mp.category
      WHERE bp.brand_avg > mp.market_avg * 1.15 -- 15% or more above market
        AND bp.brand_avg IS NOT NULL
        AND mp.market_avg IS NOT NULL
      ORDER BY diff_pct DESC
    `, [brandName]);

    return result.rows.map(row => ({
      category: row.category,
      brandAvgPrice: Math.round(parseFloat(row.brand_avg) * 100) / 100,
      marketAvgPrice: Math.round(parseFloat(row.market_avg) * 100) / 100,
      priceDifferencePercent: Math.round(parseFloat(row.diff_pct) * 10) / 10,
      skuCount: parseInt(row.sku_count) || 0,
      suggestion: `Price sensitivity risk - ${Math.round(parseFloat(row.diff_pct))}% above market average`,
    }));
  }

  /**
   * Get missing SKU opportunities (category gaps)
   */
  async getMissingSkuOpportunities(brandName: string): Promise<MissingSkuOpportunity[]> {
    const result = await this.pool.query(`
      WITH market_categories AS (
        SELECT
          type as category,
          subcategory,
          COUNT(*) as market_skus,
          ARRAY_AGG(DISTINCT brand_name ORDER BY brand_name) FILTER (WHERE brand_name IS NOT NULL) as top_brands
        FROM dutchie_products
        WHERE type IS NOT NULL
        GROUP BY type, subcategory
        HAVING COUNT(*) >= 20
      ),
      brand_presence AS (
        SELECT
          type as category,
          subcategory,
          COUNT(*) as brand_skus
        FROM dutchie_products
        WHERE brand_name = $1 AND type IS NOT NULL
        GROUP BY type, subcategory
      )
      SELECT
        mc.category,
        mc.subcategory,
        mc.market_skus,
        COALESCE(bp.brand_skus, 0) as brand_skus,
        mc.top_brands[1:5] as competitors
      FROM market_categories mc
      LEFT JOIN brand_presence bp ON mc.category = bp.category
        AND (mc.subcategory = bp.subcategory OR (mc.subcategory IS NULL AND bp.subcategory IS NULL))
      WHERE COALESCE(bp.brand_skus, 0) < mc.market_skus * 0.05 -- Brand has <5% of market presence
      ORDER BY mc.market_skus DESC
      LIMIT 10
    `, [brandName]);

    return result.rows.map(row => {
      const marketSkus = parseInt(row.market_skus) || 0;
      const brandSkus = parseInt(row.brand_skus) || 0;
      const gapPercent = marketSkus > 0 ? ((marketSkus - brandSkus) / marketSkus) * 100 : 100;
      const opportunityScore = Math.min(100, Math.round((marketSkus / 100) * (gapPercent / 100) * 100));

      return {
        category: row.category,
        subcategory: row.subcategory,
        marketSkuCount: marketSkus,
        brandSkuCount: brandSkus,
        gapPercent: Math.round(gapPercent),
        topCompetitors: (row.competitors || []).filter((c: string) => c !== brandName).slice(0, 5),
        opportunityScore,
      };
    });
  }

  /**
   * Get stores where brand's shelf share is declining
   */
  async getStoresWithDecliningShare(brandName: string): Promise<StoreShelfShareChange[]> {
    // brand_snapshots would give a true historical comparison; until that
    // history accumulates, low current presence stands in for decline
    const result = await this.pool.query(`
      WITH current_share AS (
        SELECT
          dp.dispensary_id as store_id,
          d.name as store_name,
          d.city,
          d.state,
          COUNT(*) FILTER (WHERE dp.brand_name = $1) as brand_skus,
          COUNT(*) as total_skus,
          ARRAY_AGG(DISTINCT dp.brand_name) FILTER (WHERE dp.brand_name != $1 AND dp.brand_name IS NOT NULL) as competitors
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        GROUP BY dp.dispensary_id, d.name, d.city, d.state
        HAVING COUNT(*) FILTER (WHERE dp.brand_name = $1) > 0
      )
      SELECT
        cs.store_id,
        cs.store_name,
        cs.city,
        cs.state,
        cs.brand_skus as current_skus,
        cs.total_skus,
        ROUND((cs.brand_skus::NUMERIC / cs.total_skus) * 100, 2) as current_share,
        cs.competitors[1:5] as top_competitors
      FROM current_share cs
      WHERE cs.brand_skus < 10 -- Low presence
      ORDER BY cs.brand_skus
      LIMIT 10
    `, [brandName]);

    return result.rows.map(row => ({
      storeId: row.store_id,
      storeName: row.store_name,
      city: row.city,
      state: row.state,
      currentShelfShare: parseFloat(row.current_share) || 0,
      previousShelfShare: parseFloat(row.current_share) || 0, // Would need historical data
      changePercent: 0,
      currentSkus: parseInt(row.current_skus) || 0,
      competitors: row.top_competitors || [],
    }));
  }

  /**
   * Get stores where brand's shelf share is growing
   */
  async getStoresWithGrowingShare(brandName: string): Promise<StoreShelfShareChange[]> {
    const result = await this.pool.query(`
      WITH store_share AS (
        SELECT
          dp.dispensary_id as store_id,
          d.name as store_name,
          d.city,
          d.state,
          COUNT(*) FILTER (WHERE dp.brand_name = $1) as brand_skus,
          COUNT(*) as total_skus,
          ARRAY_AGG(DISTINCT dp.brand_name) FILTER (WHERE dp.brand_name != $1 AND dp.brand_name IS NOT NULL) as competitors
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        GROUP BY dp.dispensary_id, d.name, d.city, d.state
        HAVING COUNT(*) FILTER (WHERE dp.brand_name = $1) > 0
      )
      SELECT
        ss.store_id,
        ss.store_name,
        ss.city,
        ss.state,
        ss.brand_skus as current_skus,
        ss.total_skus,
        ROUND((ss.brand_skus::NUMERIC / ss.total_skus) * 100, 2) as current_share,
        ss.competitors[1:5] as top_competitors
      FROM store_share ss
      ORDER BY current_share DESC
      LIMIT 10
    `, [brandName]);

    return result.rows.map(row => ({
      storeId: row.store_id,
      storeName: row.store_name,
      city: row.city,
      state: row.state,
      currentShelfShare: parseFloat(row.current_share) || 0,
      previousShelfShare: parseFloat(row.current_share) || 0,
      changePercent: 0,
      currentSkus: parseInt(row.current_skus) || 0,
      competitors: row.top_competitors || [],
    }));
  }

  /**
   * Get competitor intrusion alerts
   */
  async getCompetitorAlerts(brandName: string): Promise<CompetitorAlert[]> {
    // Check for competitor entries in stores where this brand has presence
    const result = await this.pool.query(`
      WITH brand_stores AS (
        SELECT DISTINCT dispensary_id
        FROM dutchie_products
        WHERE brand_name = $1
      ),
      competitor_presence AS (
        SELECT
          dp.brand_name as competitor,
          dp.dispensary_id as store_id,
          d.name as store_name,
          COUNT(*) as sku_count,
          MAX(dp.created_at) as latest_add
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        WHERE dp.dispensary_id IN (SELECT dispensary_id FROM brand_stores)
          AND dp.brand_name != $1
          AND dp.brand_name IS NOT NULL
          AND dp.created_at >= NOW() - INTERVAL '30 days'
        GROUP BY dp.brand_name, dp.dispensary_id, d.name
        HAVING COUNT(*) >= 5
      )
      SELECT
        competitor,
        store_id,
        store_name,
        sku_count,
        latest_add
      FROM competitor_presence
      ORDER BY sku_count DESC
      LIMIT 10
    `, [brandName]);

    return result.rows.map(row => {
      const skuCount = parseInt(row.sku_count) || 0;
      let severity: 'low' | 'medium' | 'high' = 'low';
      if (skuCount >= 20) severity = 'high';
      else if (skuCount >= 10) severity = 'medium';

      return {
        competitorBrand: row.competitor,
        storeId: row.store_id,
        storeName: row.store_name,
        alertType: 'expanding' as const,
        details: `${row.competitor} has ${skuCount} SKUs in ${row.store_name}`,
        severity,
        date: new Date(row.latest_add).toISOString().split('T')[0],
      };
    });
  }

  /**
   * Get market position summary for a brand
   */
  async getMarketPositionSummary(brandName: string): Promise<MarketPositionSummary> {
    const key = cacheKey('market_position', { brandName });

    return (await this.cache.getOrCompute(key, async () => {
      const [shareResult, priceResult, categoryResult, threatResult] = await Promise.all([
        // Market share
        this.pool.query(`
          SELECT
            (SELECT COUNT(*) FROM dutchie_products WHERE brand_name = $1) as brand_count,
            (SELECT COUNT(*) FROM dutchie_products) as total_count
        `, [brandName]),

        // Price vs market
        this.pool.query(`
          SELECT
            (SELECT AVG(extract_min_price(latest_raw_payload)) FROM dutchie_products WHERE brand_name = $1) as brand_avg,
            (SELECT AVG(extract_min_price(latest_raw_payload)) FROM dutchie_products WHERE brand_name != $1) as market_avg
        `, [brandName]),

        // Category strengths/weaknesses
        this.pool.query(`
          WITH brand_by_cat AS (
            SELECT type as category, COUNT(*) as brand_count
            FROM dutchie_products
            WHERE brand_name = $1 AND type IS NOT NULL
            GROUP BY type
          ),
          market_by_cat AS (
            SELECT type as category, COUNT(*) as total_count
            FROM dutchie_products WHERE type IS NOT NULL
            GROUP BY type
          ),
          leaders AS (
            SELECT type as category, brand_name, COUNT(*) as cnt,
              RANK() OVER (PARTITION BY type ORDER BY COUNT(*) DESC) as rnk
            FROM dutchie_products WHERE type IS NOT NULL AND brand_name IS NOT NULL
            GROUP BY type, brand_name
          )
          SELECT
            mc.category,
            COALESCE(bc.brand_count, 0) as brand_count,
            mc.total_count,
            ROUND((COALESCE(bc.brand_count, 0)::NUMERIC / mc.total_count) * 100, 2) as share_pct,
            (SELECT brand_name FROM leaders WHERE category = mc.category AND rnk = 1 LIMIT 1) as leader -- LIMIT 1 guards against rank ties
          FROM market_by_cat mc
          LEFT JOIN brand_by_cat bc ON mc.category = bc.category
          ORDER BY share_pct DESC
        `, [brandName]),

        // Top competitors
        this.pool.query(`
          SELECT brand_name, COUNT(*) as cnt
          FROM dutchie_products
          WHERE brand_name IS NOT NULL AND brand_name != $1
          GROUP BY brand_name
          ORDER BY cnt DESC
          LIMIT 5
        `, [brandName]),
      ]);

      const brandCount = parseInt(shareResult.rows[0]?.brand_count) || 0;
      const totalCount = parseInt(shareResult.rows[0]?.total_count) || 1;
      const marketSharePercent = Math.round((brandCount / totalCount) * 1000) / 10;

      const brandAvg = parseFloat(priceResult.rows[0]?.brand_avg) || 0;
      const marketAvg = parseFloat(priceResult.rows[0]?.market_avg) || 1;
      const avgPriceVsMarket = Math.round(((brandAvg - marketAvg) / marketAvg) * 1000) / 10;

      const categories = categoryResult.rows;
      const strengths = categories
        .filter(c => parseFloat(c.share_pct) > 5)
        .map(c => ({ category: c.category, shelfSharePercent: parseFloat(c.share_pct) }));

      const weaknesses = categories
        .filter(c => parseFloat(c.share_pct) < 2 && c.leader !== brandName)
        .map(c => ({
          category: c.category,
          shelfSharePercent: parseFloat(c.share_pct),
          marketLeader: c.leader || 'Unknown',
        }));

      return {
        brandName,
        marketSharePercent,
        avgPriceVsMarket,
        categoryStrengths: strengths.slice(0, 5),
        categoryWeaknesses: weaknesses.slice(0, 5),
        growthTrend: 'stable' as const, // Would need historical data
        competitorThreats: threatResult.rows.map(r => r.brand_name),
      };
    }, 30)).data;
  }

  /**
   * Create an analytics alert
   */
  async createAlert(alert: {
    alertType: string;
    severity: 'info' | 'warning' | 'critical';
    title: string;
    description?: string;
    storeId?: number;
    brandName?: string;
    productId?: number;
    category?: string;
    metadata?: Record<string, unknown>;
  }): Promise<void> {
    await this.pool.query(`
      INSERT INTO analytics_alerts
        (alert_type, severity, title, description, store_id, brand_name, product_id, category, metadata)
      VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
    `, [
      alert.alertType,
      alert.severity,
      alert.title,
      alert.description || null,
      alert.storeId || null,
      alert.brandName || null,
      alert.productId || null,
      alert.category || null,
      alert.metadata ? JSON.stringify(alert.metadata) : null,
    ]);
  }

  /**
   * Get recent alerts
   */
  async getAlerts(filters: {
    brandName?: string;
    storeId?: number;
    alertType?: string;
    unreadOnly?: boolean;
    limit?: number;
  } = {}): Promise<Array<{
    id: number;
    alertType: string;
    severity: string;
    title: string;
    description: string | null;
    storeName: string | null;
    brandName: string | null;
    createdAt: string;
    isRead: boolean;
  }>> {
    const { brandName, storeId, alertType, unreadOnly = false, limit = 50 } = filters;
    const params: (string | number | boolean)[] = [limit];
    const conditions: string[] = [];
    let paramIndex = 2;

    if (brandName) {
      conditions.push(`a.brand_name = $${paramIndex++}`);
      params.push(brandName);
    }
    if (storeId) {
      conditions.push(`a.store_id = $${paramIndex++}`);
      params.push(storeId);
    }
    if (alertType) {
      conditions.push(`a.alert_type = $${paramIndex++}`);
      params.push(alertType);
    }
    if (unreadOnly) {
      conditions.push('a.is_read = false');
    }

    const whereClause = conditions.length > 0
      ? 'WHERE ' + conditions.join(' AND ')
      : '';

    const result = await this.pool.query(`
      SELECT
        a.id,
        a.alert_type,
        a.severity,
        a.title,
        a.description,
        d.name as store_name,
        a.brand_name,
        a.created_at,
        a.is_read
      FROM analytics_alerts a
      LEFT JOIN dispensaries d ON a.store_id = d.id
      ${whereClause}
      ORDER BY a.created_at DESC
      LIMIT $1
    `, params);

    return result.rows.map(row => ({
      id: row.id,
      alertType: row.alert_type,
      severity: row.severity,
      title: row.title,
      description: row.description,
      storeName: row.store_name,
      brandName: row.brand_name,
      createdAt: row.created_at.toISOString(),
      isRead: row.is_read,
    }));
  }

  /**
   * Mark alerts as read
   */
  async markAlertsRead(alertIds: number[]): Promise<void> {
    if (alertIds.length === 0) return;

    await this.pool.query(`
      UPDATE analytics_alerts
      SET is_read = true
      WHERE id = ANY($1)
    `, [alertIds]);
  }
}
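
// Usage sketch (illustrative only; reuses this file's imports and assumes the
// dutchie_products / dispensaries schema queried above):
async function exampleBrandReport(): Promise<void> {
  const pool = new Pool(); // node-postgres falls back to the standard PG* env vars
  const service = new BrandOpportunityService(pool, new AnalyticsCache(pool));
  const report = await service.getBrandOpportunity('Example Brand');
  console.log(`opportunity=${report.overallScore} risk=${report.riskScore}`);
  for (const gap of report.missingSkuOpportunities) {
    console.log(`  gap: ${gap.category} (${gap.gapPercent}% behind market)`);
  }
  await pool.end();
}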
227
backend/src/dutchie-az/services/analytics/cache.ts
Normal file

@@ -0,0 +1,227 @@
/**
 * Analytics Cache Service
 *
 * Provides caching layer for expensive analytics queries.
 * Uses PostgreSQL for persistence with configurable TTLs.
 *
 * Phase 3: Analytics Dashboards
 */

import { Pool } from 'pg';

export interface CacheEntry<T = unknown> {
  key: string;
  data: T;
  computedAt: Date;
  expiresAt: Date;
  queryTimeMs?: number;
}

export interface CacheConfig {
  defaultTtlMinutes: number;
}

const DEFAULT_CONFIG: CacheConfig = {
  defaultTtlMinutes: 15,
};

export class AnalyticsCache {
  private pool: Pool;
  private config: CacheConfig;
  private memoryCache: Map<string, CacheEntry> = new Map();

  constructor(pool: Pool, config: Partial<CacheConfig> = {}) {
    this.pool = pool;
    this.config = { ...DEFAULT_CONFIG, ...config };
  }

  /**
   * Get cached data or compute and cache it
   */
  async getOrCompute<T>(
    key: string,
    computeFn: () => Promise<T>,
    ttlMinutes?: number
  ): Promise<{ data: T; fromCache: boolean; queryTimeMs: number }> {
    const ttl = ttlMinutes ?? this.config.defaultTtlMinutes;

    // Check memory cache first
    const memEntry = this.memoryCache.get(key);
    if (memEntry && new Date() < memEntry.expiresAt) {
      return { data: memEntry.data as T, fromCache: true, queryTimeMs: memEntry.queryTimeMs || 0 };
    }

    // Check database cache
    const dbEntry = await this.getFromDb<T>(key);
    if (dbEntry && new Date() < dbEntry.expiresAt) {
      this.memoryCache.set(key, dbEntry);
      return { data: dbEntry.data, fromCache: true, queryTimeMs: dbEntry.queryTimeMs || 0 };
    }

    // Compute fresh data
    const startTime = Date.now();
    const data = await computeFn();
    const queryTimeMs = Date.now() - startTime;

    // Cache result
    const entry: CacheEntry<T> = {
      key,
      data,
      computedAt: new Date(),
      expiresAt: new Date(Date.now() + ttl * 60 * 1000),
      queryTimeMs,
    };

    await this.saveToDb(entry);
    this.memoryCache.set(key, entry);

    return { data, fromCache: false, queryTimeMs };
  }

  /**
   * Get from database cache
   */
  private async getFromDb<T>(key: string): Promise<CacheEntry<T> | null> {
    try {
      const result = await this.pool.query(`
        SELECT cache_data, computed_at, expires_at, query_time_ms
        FROM analytics_cache
        WHERE cache_key = $1
          AND expires_at > NOW()
      `, [key]);

      if (result.rows.length === 0) return null;

      const row = result.rows[0];
      return {
        key,
        data: row.cache_data as T,
        computedAt: row.computed_at,
        expiresAt: row.expires_at,
        queryTimeMs: row.query_time_ms,
      };
    } catch (error) {
      console.warn(`[AnalyticsCache] Failed to get from DB: ${error}`);
      return null;
    }
  }

  /**
   * Save to database cache
   */
  private async saveToDb<T>(entry: CacheEntry<T>): Promise<void> {
    try {
      await this.pool.query(`
        INSERT INTO analytics_cache (cache_key, cache_data, computed_at, expires_at, query_time_ms)
        VALUES ($1, $2, $3, $4, $5)
        ON CONFLICT (cache_key)
        DO UPDATE SET
          cache_data = EXCLUDED.cache_data,
          computed_at = EXCLUDED.computed_at,
          expires_at = EXCLUDED.expires_at,
          query_time_ms = EXCLUDED.query_time_ms
      `, [entry.key, JSON.stringify(entry.data), entry.computedAt, entry.expiresAt, entry.queryTimeMs]);
    } catch (error) {
      console.warn(`[AnalyticsCache] Failed to save to DB: ${error}`);
    }
  }

  /**
   * Invalidate a cache entry
   */
  async invalidate(key: string): Promise<void> {
    this.memoryCache.delete(key);
    try {
      await this.pool.query('DELETE FROM analytics_cache WHERE cache_key = $1', [key]);
    } catch (error) {
      console.warn(`[AnalyticsCache] Failed to invalidate: ${error}`);
    }
  }

  /**
   * Invalidate all entries matching a pattern
   */
  async invalidatePattern(pattern: string): Promise<number> {
    // Clear memory cache
    for (const key of this.memoryCache.keys()) {
      if (key.includes(pattern)) {
        this.memoryCache.delete(key);
      }
    }

    try {
      const result = await this.pool.query(
        'DELETE FROM analytics_cache WHERE cache_key LIKE $1',
        [`%${pattern}%`]
      );
      return result.rowCount || 0;
    } catch (error) {
      console.warn(`[AnalyticsCache] Failed to invalidate pattern: ${error}`);
      return 0;
    }
  }

  /**
   * Clean expired entries
   */
  async cleanExpired(): Promise<number> {
    // Clean memory cache
    const now = new Date();
    for (const [key, entry] of this.memoryCache.entries()) {
      if (now >= entry.expiresAt) {
        this.memoryCache.delete(key);
      }
    }

    try {
      const result = await this.pool.query('DELETE FROM analytics_cache WHERE expires_at < NOW()');
      return result.rowCount || 0;
    } catch (error) {
      console.warn(`[AnalyticsCache] Failed to clean expired: ${error}`);
      return 0;
    }
  }

  /**
   * Get cache statistics
   */
  async getStats(): Promise<{
    memoryCacheSize: number;
    dbCacheSize: number;
    expiredCount: number;
  }> {
    try {
      const result = await this.pool.query(`
        SELECT
          COUNT(*) FILTER (WHERE expires_at > NOW()) as active,
          COUNT(*) FILTER (WHERE expires_at <= NOW()) as expired
        FROM analytics_cache
      `);

      return {
        memoryCacheSize: this.memoryCache.size,
        dbCacheSize: parseInt(result.rows[0]?.active || '0'),
        expiredCount: parseInt(result.rows[0]?.expired || '0'),
      };
    } catch {
      return {
        memoryCacheSize: this.memoryCache.size,
        dbCacheSize: 0,
        expiredCount: 0,
      };
    }
  }
}

/**
 * Generate cache key with parameters
 */
export function cacheKey(prefix: string, params: Record<string, unknown> = {}): string {
  const sortedParams = Object.keys(params)
    .sort()
    .filter(k => params[k] !== undefined && params[k] !== null)
    .map(k => `${k}=${params[k]}`)
    .join('&');

  return sortedParams ? `${prefix}:${sortedParams}` : prefix;
}
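
// Usage sketch: wrap an expensive query with a 30-minute TTL. The key encodes
// its parameters, so 'AZ' and 'CA' results are cached independently. (Query,
// key prefix, and parameters are illustrative.)
async function exampleTopBrands(pool: Pool, state: string): Promise<unknown> {
  const cache = new AnalyticsCache(pool);
  const key = cacheKey('top_brands', { state, limit: 10 });
  const { data, fromCache, queryTimeMs } = await cache.getOrCompute(key, async () => {
    const res = await pool.query(
      `SELECT dp.brand_name, COUNT(*) AS cnt
       FROM dutchie_products dp
       JOIN dispensaries d ON dp.dispensary_id = d.id
       WHERE d.state = $1 AND dp.brand_name IS NOT NULL
       GROUP BY dp.brand_name
       ORDER BY cnt DESC
       LIMIT 10`,
      [state]
    );
    return res.rows;
  }, 30);
  console.log(fromCache ? 'cache hit' : `computed in ${queryTimeMs}ms`);
  return data;
}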
530
backend/src/dutchie-az/services/analytics/category-analytics.ts
Normal file

@@ -0,0 +1,530 @@
/**
 * Category Growth Analytics Service
 *
 * Provides category-level analytics including:
 * - SKU count growth
 * - Price growth trends
 * - New product additions
 * - Category shrinkage
 * - Seasonality patterns
 *
 * Phase 3: Analytics Dashboards
 */

import { Pool } from 'pg';
import { AnalyticsCache, cacheKey } from './cache';

export interface CategoryGrowth {
  category: string;
  currentSkuCount: number;
  previousSkuCount: number;
  skuGrowthPercent: number;
  currentBrandCount: number;
  previousBrandCount: number;
  brandGrowthPercent: number;
  currentAvgPrice: number | null;
  previousAvgPrice: number | null;
  priceChangePercent: number | null;
  newProducts: number;
  discontinuedProducts: number;
  trend: 'growing' | 'declining' | 'stable';
}

export interface CategorySummary {
  category: string;
  totalSkus: number;
  brandCount: number;
  storeCount: number;
  avgPrice: number | null;
  minPrice: number | null;
  maxPrice: number | null;
  inStockSkus: number;
  outOfStockSkus: number;
  stockHealthPercent: number;
}

export interface CategoryGrowthTrend {
  category: string;
  dataPoints: Array<{
    date: string;
    skuCount: number;
    brandCount: number;
    avgPrice: number | null;
    storeCount: number;
  }>;
  growth7d: number | null;
  growth30d: number | null;
  growth90d: number | null;
}

export interface CategoryHeatmapData {
  categories: string[];
  periods: string[];
  data: Array<{
    category: string;
    period: string;
    value: number; // SKU count, growth %, or price
    changeFromPrevious: number | null;
  }>;
}

export interface SeasonalityPattern {
  category: string;
  monthlyPattern: Array<{
    month: number;
    monthName: string;
    avgSkuCount: number;
    avgPrice: number | null;
    seasonalityIndex: number; // 100 = average, >100 = above, <100 = below
  }>;
  peakMonth: number;
  troughMonth: number;
}
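
// The seasonalityIndex above is normalized so that 100 means a typical month.
// A sketch of the arithmetic (an assumption consistent with the comment; the
// service itself may compute this in SQL):
function sketchSeasonalityIndex(monthAvgSkus: number, overallMonthlyAvgSkus: number): number {
  return Math.round((monthAvgSkus / overallMonthlyAvgSkus) * 100);
}
// e.g. sketchSeasonalityIndex(1380, 1200) === 115 -> ~15% above an average month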

export interface CategoryFilters {
  state?: string;
  storeId?: number;
  minSkus?: number;
}

export class CategoryAnalyticsService {
  private pool: Pool;
  private cache: AnalyticsCache;

  constructor(pool: Pool, cache: AnalyticsCache) {
    this.pool = pool;
    this.cache = cache;
  }

  /**
   * Get current category summary
   */
  async getCategorySummary(
    category?: string,
    filters: CategoryFilters = {}
  ): Promise<CategorySummary[]> {
    const { state, storeId } = filters;
    const key = cacheKey('category_summary', { category, state, storeId });

    return (await this.cache.getOrCompute(key, async () => {
      const params: (string | number)[] = [];
      const conditions: string[] = [];
      let paramIndex = 1;

      if (category) {
        conditions.push(`dp.type = $${paramIndex++}`);
        params.push(category);
      }
      if (state) {
        conditions.push(`d.state = $${paramIndex++}`);
        params.push(state);
      }
      if (storeId) {
        conditions.push(`dp.dispensary_id = $${paramIndex++}`);
        params.push(storeId);
      }

      const whereClause = conditions.length > 0
        ? 'WHERE dp.type IS NOT NULL AND ' + conditions.join(' AND ')
        : 'WHERE dp.type IS NOT NULL';

      const result = await this.pool.query(`
        SELECT
          dp.type as category,
          COUNT(*) as total_skus,
          COUNT(DISTINCT dp.brand_name) as brand_count,
          COUNT(DISTINCT dp.dispensary_id) as store_count,
          AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
          MIN(extract_min_price(dp.latest_raw_payload)) as min_price,
          MAX(extract_max_price(dp.latest_raw_payload)) as max_price,
          SUM(CASE WHEN dp.stock_status = 'in_stock' THEN 1 ELSE 0 END) as in_stock,
          SUM(CASE WHEN dp.stock_status != 'in_stock' OR dp.stock_status IS NULL THEN 1 ELSE 0 END) as out_of_stock
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        ${whereClause}
        GROUP BY dp.type
        ORDER BY total_skus DESC
      `, params);

      return result.rows.map(row => {
        const totalSkus = parseInt(row.total_skus) || 0;
        const inStock = parseInt(row.in_stock) || 0;

        return {
          category: row.category,
          totalSkus,
          brandCount: parseInt(row.brand_count) || 0,
          storeCount: parseInt(row.store_count) || 0,
          avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
          minPrice: row.min_price ? Math.round(parseFloat(row.min_price) * 100) / 100 : null,
          maxPrice: row.max_price ? Math.round(parseFloat(row.max_price) * 100) / 100 : null,
          inStockSkus: inStock,
          outOfStockSkus: parseInt(row.out_of_stock) || 0,
          stockHealthPercent: totalSkus > 0
            ? Math.round((inStock / totalSkus) * 100)
            : 0,
        };
      });
    }, 15)).data;
  }

  /**
   * Get category growth (comparing periods)
   */
  async getCategoryGrowth(
    days: number = 7,
    filters: CategoryFilters = {}
  ): Promise<CategoryGrowth[]> {
    const { state, storeId, minSkus = 10 } = filters;
    const key = cacheKey('category_growth', { days, state, storeId, minSkus });

    return (await this.cache.getOrCompute(key, async () => {
      // Use category_snapshots for historical comparison
      const result = await this.pool.query(`
        WITH current_data AS (
          SELECT
            category,
            total_skus,
            brand_count,
            avg_price,
            store_count
          FROM category_snapshots
          WHERE snapshot_date = (SELECT MAX(snapshot_date) FROM category_snapshots)
        ),
        previous_data AS (
          SELECT
            category,
            total_skus,
            brand_count,
            avg_price,
            store_count
          FROM category_snapshots
          WHERE snapshot_date = (
            SELECT MAX(snapshot_date)
            FROM category_snapshots
            WHERE snapshot_date < (SELECT MAX(snapshot_date) FROM category_snapshots) - ($1 || ' days')::INTERVAL
          )
        )
        SELECT
          c.category,
          c.total_skus as current_skus,
          COALESCE(p.total_skus, c.total_skus) as previous_skus,
          c.brand_count as current_brands,
          COALESCE(p.brand_count, c.brand_count) as previous_brands,
          c.avg_price as current_price,
          p.avg_price as previous_price
        FROM current_data c
        LEFT JOIN previous_data p ON c.category = p.category
        WHERE c.total_skus >= $2
        ORDER BY c.total_skus DESC
      `, [days, minSkus]);

      // If no snapshots exist, use current data
      if (result.rows.length === 0) {
        const fallbackResult = await this.pool.query(`
          SELECT
            type as category,
            COUNT(*) as total_skus,
            COUNT(DISTINCT brand_name) as brand_count,
            AVG(extract_min_price(latest_raw_payload)) as avg_price
          FROM dutchie_products
          WHERE type IS NOT NULL
          GROUP BY type
          HAVING COUNT(*) >= $1
          ORDER BY total_skus DESC
        `, [minSkus]);

        return fallbackResult.rows.map(row => ({
          category: row.category,
          currentSkuCount: parseInt(row.total_skus) || 0,
          previousSkuCount: parseInt(row.total_skus) || 0,
          skuGrowthPercent: 0,
          currentBrandCount: parseInt(row.brand_count) || 0,
          previousBrandCount: parseInt(row.brand_count) || 0,
          brandGrowthPercent: 0,
          currentAvgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
          previousAvgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
          priceChangePercent: null,
          newProducts: 0,
          discontinuedProducts: 0,
          trend: 'stable' as const,
        }));
      }

      return result.rows.map(row => {
        const currentSkus = parseInt(row.current_skus) || 0;
        const previousSkus = parseInt(row.previous_skus) || currentSkus;
        const currentBrands = parseInt(row.current_brands) || 0;
        const previousBrands = parseInt(row.previous_brands) || currentBrands;
        const currentPrice = row.current_price ? parseFloat(row.current_price) : null;
        const previousPrice = row.previous_price ? parseFloat(row.previous_price) : null;

        const skuGrowth = previousSkus > 0
          ? ((currentSkus - previousSkus) / previousSkus) * 100
          : 0;
        const brandGrowth = previousBrands > 0
          ? ((currentBrands - previousBrands) / previousBrands) * 100
          : 0;
        const priceChange = previousPrice && currentPrice
          ? ((currentPrice - previousPrice) / previousPrice) * 100
          : null;

        let trend: 'growing' | 'declining' | 'stable' = 'stable';
        if (skuGrowth > 5) trend = 'growing';
        else if (skuGrowth < -5) trend = 'declining';

        return {
          category: row.category,
          currentSkuCount: currentSkus,
          previousSkuCount: previousSkus,
          skuGrowthPercent: Math.round(skuGrowth * 10) / 10,
          currentBrandCount: currentBrands,
          previousBrandCount: previousBrands,
          brandGrowthPercent: Math.round(brandGrowth * 10) / 10,
          currentAvgPrice: currentPrice ? Math.round(currentPrice * 100) / 100 : null,
          previousAvgPrice: previousPrice ? Math.round(previousPrice * 100) / 100 : null,
          priceChangePercent: priceChange !== null ? Math.round(priceChange * 10) / 10 : null,
          newProducts: Math.max(0, currentSkus - previousSkus),
          discontinuedProducts: Math.max(0, previousSkus - currentSkus),
          trend,
        };
      });
    }, 15)).data;
  }

  /**
   * Get category growth trend over time
   */
  async getCategoryGrowthTrend(
    category: string,
    days: number = 90
  ): Promise<CategoryGrowthTrend> {
    const key = cacheKey('category_growth_trend', { category, days });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          snapshot_date as date,
          total_skus as sku_count,
          brand_count,
          avg_price,
          store_count
        FROM category_snapshots
        WHERE category = $1
          AND snapshot_date >= CURRENT_DATE - ($2 || ' days')::INTERVAL
        ORDER BY snapshot_date
      `, [category, days]);

      const dataPoints = result.rows.map(row => ({
        date: row.date.toISOString().split('T')[0],
        skuCount: parseInt(row.sku_count) || 0,
        brandCount: parseInt(row.brand_count) || 0,
        avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
        storeCount: parseInt(row.store_count) || 0,
      }));

      // Calculate growth rates
      const calculateGrowth = (daysBack: number): number | null => {
        if (dataPoints.length < 2) return null;
        const targetDate = new Date();
        targetDate.setDate(targetDate.getDate() - daysBack);
        const targetDateStr = targetDate.toISOString().split('T')[0];

        const recent = dataPoints[dataPoints.length - 1];
        // dataPoints is sorted ascending, so take the first snapshot on or
        // after the target date; fall back to the oldest point if none match.
        // (A "<=" predicate here would always return the oldest snapshot and
        // make the 7d/30d/90d windows identical.)
        const older = dataPoints.find(d => d.date >= targetDateStr) || dataPoints[0];

        if (older.skuCount === 0) return null;
        return Math.round(((recent.skuCount - older.skuCount) / older.skuCount) * 1000) / 10;
      };

      return {
        category,
        dataPoints,
        growth7d: calculateGrowth(7),
        growth30d: calculateGrowth(30),
        growth90d: calculateGrowth(90),
      };
    }, 15)).data;
  }
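
The `* 1000) / 10` step above is just one-decimal rounding. A worked pass through `calculateGrowth` with hypothetical SKU counts:

// recent.skuCount = 462, older.skuCount = 431
// (462 - 431) / 431 = 0.0719..., * 1000 = 71.9..., rounded = 72, / 10 = 7.2
const growth7d = Math.round(((462 - 431) / 431) * 1000) / 10; // 7.2 (percent)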

  /**
   * Get category heatmap data
   */
  async getCategoryHeatmap(
    metric: 'skus' | 'growth' | 'price' = 'skus',
    periods: number = 12 // weeks
  ): Promise<CategoryHeatmapData> {
    const key = cacheKey('category_heatmap', { metric, periods });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          category,
          snapshot_date,
          total_skus,
          avg_price
        FROM category_snapshots
        WHERE snapshot_date >= CURRENT_DATE - ($1 * 7 || ' days')::INTERVAL
        ORDER BY category, snapshot_date
      `, [periods]);

      // Get unique categories and generate weekly periods
      const categoriesSet = new Set<string>();
      const periodsSet = new Set<string>();

      result.rows.forEach(row => {
        categoriesSet.add(row.category);
        // Group by week
        const date = new Date(row.snapshot_date);
        const weekStart = new Date(date);
        weekStart.setDate(date.getDate() - date.getDay());
        periodsSet.add(weekStart.toISOString().split('T')[0]);
      });

      const categories = Array.from(categoriesSet).sort();
      const periodsList = Array.from(periodsSet).sort();

      // Aggregate data by category and week
      const dataMap = new Map<string, Map<string, { skus: number; price: number | null }>>();

      result.rows.forEach(row => {
        const date = new Date(row.snapshot_date);
        const weekStart = new Date(date);
        weekStart.setDate(date.getDate() - date.getDay());
        const period = weekStart.toISOString().split('T')[0];

        if (!dataMap.has(row.category)) {
          dataMap.set(row.category, new Map());
        }
        const categoryData = dataMap.get(row.category)!;

        if (!categoryData.has(period)) {
          categoryData.set(period, { skus: 0, price: null });
        }
        const existing = categoryData.get(period)!;
        existing.skus = Math.max(existing.skus, parseInt(row.total_skus) || 0);
        if (row.avg_price) {
          existing.price = parseFloat(row.avg_price);
        }
      });

      // Build heatmap data
      const data: CategoryHeatmapData['data'] = [];

      categories.forEach(category => {
        let previousValue: number | null = null;

        periodsList.forEach(period => {
          const categoryData = dataMap.get(category)?.get(period);
          let value = 0;

          if (categoryData) {
            switch (metric) {
              case 'skus':
                value = categoryData.skus;
                break;
              case 'price':
                value = categoryData.price || 0;
                break;
              case 'growth':
                value = previousValue !== null && previousValue > 0
                  ? ((categoryData.skus - previousValue) / previousValue) * 100
                  : 0;
                break;
            }
          }

          const changeFromPrevious = previousValue !== null && previousValue > 0
            ? ((value - previousValue) / previousValue) * 100
            : null;

          data.push({
            category,
            period,
            value: Math.round(value * 100) / 100,
            changeFromPrevious: changeFromPrevious !== null
              ? Math.round(changeFromPrevious * 10) / 10
              : null,
          });

          if (metric !== 'growth') {
            previousValue = value;
          } else if (categoryData) {
            previousValue = categoryData.skus;
          }
        });
      });

      return {
        categories,
        periods: periodsList,
        data,
      };
    }, 30)).data;
  }
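
The weekly bucketing relies on `getDay()` returning 0 for Sunday, so subtracting it rewinds any date to the start of its week. A standalone sketch (dates hypothetical; note that `new Date(string)` parsing and `getDay()` are local-timezone sensitive, so in non-UTC environments the bucket boundary can shift by a day):

function weekBucket(snapshotDate: string): string {
  const date = new Date(snapshotDate);
  const weekStart = new Date(date);
  weekStart.setDate(date.getDate() - date.getDay()); // back up to Sunday
  return weekStart.toISOString().split('T')[0];
}

weekBucket('2024-05-15'); // a Wednesday -> '2024-05-12' (the preceding Sunday, in UTC)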

  /**
   * Get top growing/declining categories
   */
  async getTopMovers(
    limit: number = 5,
    days: number = 30
  ): Promise<{
    growing: CategoryGrowth[];
    declining: CategoryGrowth[];
  }> {
    const key = cacheKey('top_movers', { limit, days });

    return (await this.cache.getOrCompute(key, async () => {
      const allGrowth = await this.getCategoryGrowth(days);

      const sorted = [...allGrowth].sort((a, b) => b.skuGrowthPercent - a.skuGrowthPercent);

      return {
        growing: sorted.filter(c => c.skuGrowthPercent > 0).slice(0, limit),
        declining: sorted.filter(c => c.skuGrowthPercent < 0).slice(-limit).reverse(),
      };
    }, 15)).data;
  }

  /**
   * Get category subcategory breakdown
   */
  async getSubcategoryBreakdown(category: string): Promise<Array<{
    subcategory: string;
    skuCount: number;
    brandCount: number;
    avgPrice: number | null;
    percentOfCategory: number;
  }>> {
    const key = cacheKey('subcategory_breakdown', { category });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        WITH category_total AS (
          SELECT COUNT(*) as total FROM dutchie_products WHERE type = $1
        )
        SELECT
          COALESCE(dp.subcategory, 'Other') as subcategory,
          COUNT(*) as sku_count,
          COUNT(DISTINCT dp.brand_name) as brand_count,
          AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
          ct.total as category_total
        FROM dutchie_products dp, category_total ct
        WHERE dp.type = $1
        GROUP BY dp.subcategory, ct.total
        ORDER BY sku_count DESC
      `, [category]);

      return result.rows.map(row => ({
        subcategory: row.subcategory,
        skuCount: parseInt(row.sku_count) || 0,
        brandCount: parseInt(row.brand_count) || 0,
        avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
        percentOfCategory: parseInt(row.category_total) > 0
          ? Math.round((parseInt(row.sku_count) / parseInt(row.category_total)) * 1000) / 10
          : 0,
      }));
    }, 15)).data;
  }
}
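
A minimal wiring sketch for the service above. It assumes CategoryAnalyticsService takes (pool, cache) like the sibling services later in this commit, and that AnalyticsCache has a no-argument constructor; the real signatures live in ./cache and in the top of this file, which are not shown in this hunk:

import { Pool } from 'pg';
import { AnalyticsCache } from './cache';
import { CategoryAnalyticsService } from './category-analytics';

async function demo(): Promise<void> {
  const pool = new Pool({ connectionString: process.env.CANNAIQ_DB_URL });
  const cache = new AnalyticsCache(); // assumed no-arg constructor
  const categories = new CategoryAnalyticsService(pool, cache);

  const movers = await categories.getTopMovers(5, 30);
  movers.growing.forEach(c => console.log(`${c.category}: +${c.skuGrowthPercent}%`));
}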

57
backend/src/dutchie-az/services/analytics/index.ts
Normal file
@@ -0,0 +1,57 @@
/**
 * Analytics Module Index
 *
 * Exports all analytics services for CannaiQ dashboards.
 *
 * Phase 3: Analytics Dashboards
 */

export { AnalyticsCache, cacheKey, type CacheEntry, type CacheConfig } from './cache';

export {
  PriceTrendService,
  type PricePoint,
  type PriceTrend,
  type PriceSummary,
  type PriceCompressionResult,
  type PriceFilters,
} from './price-trends';

export {
  PenetrationService,
  type BrandPenetration,
  type PenetrationTrend,
  type ShelfShare,
  type BrandPresenceByState,
  type PenetrationFilters,
} from './penetration';

export {
  CategoryAnalyticsService,
  type CategoryGrowth,
  type CategorySummary,
  type CategoryGrowthTrend,
  type CategoryHeatmapData,
  type SeasonalityPattern,
  type CategoryFilters,
} from './category-analytics';

export {
  StoreChangeService,
  type StoreChangeSummary,
  type StoreChangeEvent,
  type BrandChange,
  type ProductChange,
  type CategoryLeaderboard,
  type StoreFilters,
} from './store-changes';

export {
  BrandOpportunityService,
  type BrandOpportunity,
  type PricePosition,
  type MissingSkuOpportunity,
  type StoreShelfShareChange,
  type CompetitorAlert,
  type MarketPositionSummary,
} from './brand-opportunity';
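
With the barrel in place, dashboard routes can import services and their result types from a single path instead of six separate modules, e.g.:

import {
  PenetrationService,
  CategoryAnalyticsService,
  type BrandPenetration,
  type CategoryGrowth,
} from './analytics';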

556
backend/src/dutchie-az/services/analytics/penetration.ts
Normal file
@@ -0,0 +1,556 @@
/**
 * Brand Penetration Analytics Service
 *
 * Provides analytics for brand market penetration including:
 * - Stores carrying brand
 * - SKU counts per brand
 * - Percentage of stores carrying
 * - Shelf share calculations
 * - Penetration trends and momentum
 *
 * Phase 3: Analytics Dashboards
 */

import { Pool } from 'pg';
import { AnalyticsCache, cacheKey } from './cache';

export interface BrandPenetration {
  brandName: string;
  brandId: string | null;
  totalStores: number;
  storesCarrying: number;
  penetrationPercent: number;
  totalSkus: number;
  avgSkusPerStore: number;
  shelfSharePercent: number;
  categories: string[];
  avgPrice: number | null;
  inStockSkus: number;
}

export interface PenetrationTrend {
  brandName: string;
  dataPoints: Array<{
    date: string;
    storeCount: number;
    skuCount: number;
    penetrationPercent: number;
  }>;
  momentumScore: number; // -100 to +100
  riskScore: number; // 0 to 100, higher = more risk
  trend: 'growing' | 'declining' | 'stable';
}

export interface ShelfShare {
  brandName: string;
  category: string;
  skuCount: number;
  categoryTotalSkus: number;
  shelfSharePercent: number;
  rank: number;
}

export interface BrandPresenceByState {
  state: string;
  storeCount: number;
  skuCount: number;
  avgPrice: number | null;
}

export interface PenetrationFilters {
  state?: string;
  category?: string;
  minStores?: number;
  minSkus?: number;
}

export class PenetrationService {
  private pool: Pool;
  private cache: AnalyticsCache;

  constructor(pool: Pool, cache: AnalyticsCache) {
    this.pool = pool;
    this.cache = cache;
  }

  /**
   * Get penetration data for a specific brand
   */
  async getBrandPenetration(
    brandName: string,
    filters: PenetrationFilters = {}
  ): Promise<BrandPenetration> {
    const { state, category } = filters;
    const key = cacheKey('brand_penetration', { brandName, state, category });

    return (await this.cache.getOrCompute(key, async () => {
      // Build where clauses
      const conditions: string[] = [];
      const params: (string | number)[] = [brandName];
      let paramIndex = 2;

      if (state) {
        conditions.push(`d.state = $${paramIndex++}`);
        params.push(state);
      }
      if (category) {
        conditions.push(`dp.type = $${paramIndex++}`);
        params.push(category);
      }

      const stateCondition = state ? `AND d.state = $${params.indexOf(state) + 1}` : '';
      const categoryCondition = category ? `AND dp.type = $${params.indexOf(category) + 1}` : '';

      const result = await this.pool.query(`
        WITH total_stores AS (
          SELECT COUNT(DISTINCT id) as total
          FROM dispensaries
          WHERE 1=1 ${state ? `AND state = $2` : ''}
        ),
        brand_data AS (
          SELECT
            dp.brand_name,
            dp.brand_id,
            COUNT(DISTINCT dp.dispensary_id) as stores_carrying,
            COUNT(*) as total_skus,
            AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
            SUM(CASE WHEN dp.stock_status = 'in_stock' THEN 1 ELSE 0 END) as in_stock,
            ARRAY_AGG(DISTINCT dp.type) FILTER (WHERE dp.type IS NOT NULL) as categories
          FROM dutchie_products dp
          JOIN dispensaries d ON dp.dispensary_id = d.id
          WHERE dp.brand_name = $1
            ${stateCondition}
            ${categoryCondition}
          GROUP BY dp.brand_name, dp.brand_id
        ),
        total_skus AS (
          SELECT COUNT(*) as total
          FROM dutchie_products dp
          JOIN dispensaries d ON dp.dispensary_id = d.id
          WHERE 1=1 ${stateCondition} ${categoryCondition}
        )
        SELECT
          bd.brand_name,
          bd.brand_id,
          ts.total as total_stores,
          bd.stores_carrying,
          bd.total_skus,
          bd.avg_price,
          bd.in_stock,
          bd.categories,
          tsk.total as market_total_skus
        FROM brand_data bd, total_stores ts, total_skus tsk
      `, params);

      if (result.rows.length === 0) {
        return {
          brandName,
          brandId: null,
          totalStores: 0,
          storesCarrying: 0,
          penetrationPercent: 0,
          totalSkus: 0,
          avgSkusPerStore: 0,
          shelfSharePercent: 0,
          categories: [],
          avgPrice: null,
          inStockSkus: 0,
        };
      }

      const row = result.rows[0];
      const totalStores = parseInt(row.total_stores) || 1;
      const storesCarrying = parseInt(row.stores_carrying) || 0;
      const totalSkus = parseInt(row.total_skus) || 0;
      const marketTotalSkus = parseInt(row.market_total_skus) || 1;

      return {
        brandName: row.brand_name,
        brandId: row.brand_id,
        totalStores,
        storesCarrying,
        penetrationPercent: Math.round((storesCarrying / totalStores) * 1000) / 10,
        totalSkus,
        avgSkusPerStore: storesCarrying > 0
          ? Math.round((totalSkus / storesCarrying) * 10) / 10
          : 0,
        shelfSharePercent: Math.round((totalSkus / marketTotalSkus) * 1000) / 10,
        categories: row.categories || [],
        avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
        inStockSkus: parseInt(row.in_stock) || 0,
      };
    }, 15)).data;
  }
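
The two ratios above use the same one-decimal rounding; a worked example with hypothetical counts:

const penetrationPercent = Math.round((37 / 150) * 1000) / 10;   // 37 of 150 stores -> 24.7
const shelfSharePercent = Math.round((412 / 58210) * 1000) / 10; // 412 of 58,210 SKUs -> 0.7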

  /**
   * Get top brands by penetration
   */
  async getTopBrandsByPenetration(
    limit: number = 20,
    filters: PenetrationFilters = {}
  ): Promise<BrandPenetration[]> {
    const { state, category, minStores = 2, minSkus = 5 } = filters;
    const key = cacheKey('top_brands_penetration', { limit, state, category, minStores, minSkus });

    return (await this.cache.getOrCompute(key, async () => {
      const params: (string | number)[] = [limit, minStores, minSkus];
      let paramIndex = 4;

      let stateCondition = '';
      let categoryCondition = '';

      if (state) {
        stateCondition = `AND d.state = $${paramIndex++}`;
        params.push(state);
      }
      if (category) {
        categoryCondition = `AND dp.type = $${paramIndex++}`;
        params.push(category);
      }

      const result = await this.pool.query(`
        WITH total_stores AS (
          SELECT COUNT(DISTINCT id) as total
          FROM dispensaries
          WHERE 1=1 ${state ? `AND state = $${params.indexOf(state) + 1}` : ''}
        ),
        total_skus AS (
          SELECT COUNT(*) as total
          FROM dutchie_products dp
          JOIN dispensaries d ON dp.dispensary_id = d.id
          WHERE 1=1 ${stateCondition} ${categoryCondition}
        ),
        brand_data AS (
          SELECT
            dp.brand_name,
            dp.brand_id,
            COUNT(DISTINCT dp.dispensary_id) as stores_carrying,
            COUNT(*) as total_skus,
            AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
            SUM(CASE WHEN dp.stock_status = 'in_stock' THEN 1 ELSE 0 END) as in_stock,
            ARRAY_AGG(DISTINCT dp.type) FILTER (WHERE dp.type IS NOT NULL) as categories
          FROM dutchie_products dp
          JOIN dispensaries d ON dp.dispensary_id = d.id
          WHERE dp.brand_name IS NOT NULL
            ${stateCondition}
            ${categoryCondition}
          GROUP BY dp.brand_name, dp.brand_id
          HAVING COUNT(DISTINCT dp.dispensary_id) >= $2
            AND COUNT(*) >= $3
        )
        SELECT
          bd.*,
          ts.total as total_stores,
          tsk.total as market_total_skus
        FROM brand_data bd, total_stores ts, total_skus tsk
        ORDER BY bd.stores_carrying DESC, bd.total_skus DESC
        LIMIT $1
      `, params);

      return result.rows.map(row => {
        const totalStores = parseInt(row.total_stores) || 1;
        const storesCarrying = parseInt(row.stores_carrying) || 0;
        const totalSkus = parseInt(row.total_skus) || 0;
        const marketTotalSkus = parseInt(row.market_total_skus) || 1;

        return {
          brandName: row.brand_name,
          brandId: row.brand_id,
          totalStores,
          storesCarrying,
          penetrationPercent: Math.round((storesCarrying / totalStores) * 1000) / 10,
          totalSkus,
          avgSkusPerStore: storesCarrying > 0
            ? Math.round((totalSkus / storesCarrying) * 10) / 10
            : 0,
          shelfSharePercent: Math.round((totalSkus / marketTotalSkus) * 1000) / 10,
          categories: row.categories || [],
          avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
          inStockSkus: parseInt(row.in_stock) || 0,
        };
      });
    }, 15)).data;
  }

  /**
   * Get penetration trend for a brand (requires historical snapshots)
   */
  async getPenetrationTrend(
    brandName: string,
    days: number = 30
  ): Promise<PenetrationTrend> {
    const key = cacheKey('penetration_trend', { brandName, days });

    return (await this.cache.getOrCompute(key, async () => {
      // Use brand_snapshots table for historical data
      const result = await this.pool.query(`
        SELECT
          snapshot_date as date,
          store_count,
          total_skus
        FROM brand_snapshots
        WHERE brand_name = $1
          AND snapshot_date >= CURRENT_DATE - ($2 || ' days')::INTERVAL
        ORDER BY snapshot_date
      `, [brandName, days]);

      // Get total stores for penetration calculation
      const totalResult = await this.pool.query(
        'SELECT COUNT(*) as total FROM dispensaries'
      );
      const totalStores = parseInt(totalResult.rows[0]?.total) || 1;

      const dataPoints = result.rows.map(row => ({
        date: row.date.toISOString().split('T')[0],
        storeCount: parseInt(row.store_count) || 0,
        skuCount: parseInt(row.total_skus) || 0,
        penetrationPercent: Math.round((parseInt(row.store_count) / totalStores) * 1000) / 10,
      }));

      // Calculate momentum and risk scores
      let momentumScore = 0;
      let riskScore = 0;
      let trend: 'growing' | 'declining' | 'stable' = 'stable';

      if (dataPoints.length >= 2) {
        const first = dataPoints[0];
        const last = dataPoints[dataPoints.length - 1];

        // Momentum: change in store count
        const storeChange = last.storeCount - first.storeCount;
        const storeChangePercent = first.storeCount > 0
          ? (storeChange / first.storeCount) * 100
          : 0;

        // Momentum score: -100 to +100
        momentumScore = Math.max(-100, Math.min(100, storeChangePercent * 10));

        // Risk score: higher if losing stores
        if (storeChange < 0) {
          riskScore = Math.min(100, Math.abs(storeChangePercent) * 5);
        }

        // Determine trend
        if (storeChangePercent > 5) trend = 'growing';
        else if (storeChangePercent < -5) trend = 'declining';
      }

      return {
        brandName,
        dataPoints,
        momentumScore: Math.round(momentumScore),
        riskScore: Math.round(riskScore),
        trend,
      };
    }, 15)).data;
  }
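
A worked pass through the momentum/risk scoring, with hypothetical first and last snapshots:

const first = { storeCount: 40 };
const last = { storeCount: 34 };
const storeChange = last.storeCount - first.storeCount;            // -6
const storeChangePercent = (storeChange / first.storeCount) * 100; // -15
const momentumScore = Math.max(-100, Math.min(100, storeChangePercent * 10)); // -150 clamps to -100
const riskScore = storeChange < 0
  ? Math.min(100, Math.abs(storeChangePercent) * 5)                // 75
  : 0;
// storeChangePercent < -5, so trend = 'declining'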

  /**
   * Get shelf share by category for a brand
   */
  async getShelfShareByCategory(brandName: string): Promise<ShelfShare[]> {
    const key = cacheKey('shelf_share_category', { brandName });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        WITH category_totals AS (
          SELECT
            type as category,
            COUNT(*) as total_skus
          FROM dutchie_products
          WHERE type IS NOT NULL
          GROUP BY type
        ),
        brand_by_category AS (
          SELECT
            type as category,
            COUNT(*) as sku_count
          FROM dutchie_products
          WHERE brand_name = $1
            AND type IS NOT NULL
          GROUP BY type
        ),
        ranked AS (
          SELECT
            ct.category,
            COALESCE(bc.sku_count, 0) as sku_count,
            ct.total_skus,
            RANK() OVER (PARTITION BY ct.category ORDER BY bc.sku_count DESC NULLS LAST) as rank
          FROM category_totals ct
          LEFT JOIN brand_by_category bc ON ct.category = bc.category
        )
        SELECT
          r.category,
          r.sku_count,
          r.total_skus as category_total_skus,
          ROUND((r.sku_count::NUMERIC / r.total_skus) * 100, 2) as shelf_share_pct,
          (SELECT COUNT(*) + 1 FROM (
            SELECT brand_name, COUNT(*) as cnt
            FROM dutchie_products
            WHERE type = r.category AND brand_name IS NOT NULL
            GROUP BY brand_name
            HAVING COUNT(*) > r.sku_count
          ) t) as rank
        FROM ranked r
        WHERE r.sku_count > 0
        ORDER BY shelf_share_pct DESC
      `, [brandName]);

      return result.rows.map(row => ({
        brandName,
        category: row.category,
        skuCount: parseInt(row.sku_count) || 0,
        categoryTotalSkus: parseInt(row.category_total_skus) || 0,
        shelfSharePercent: parseFloat(row.shelf_share_pct) || 0,
        rank: parseInt(row.rank) || 0,
      }));
    }, 15)).data;
  }

  /**
   * Get brand presence by state/region
   */
  async getBrandPresenceByState(brandName: string): Promise<BrandPresenceByState[]> {
    const key = cacheKey('brand_presence_state', { brandName });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          d.state,
          COUNT(DISTINCT dp.dispensary_id) as store_count,
          COUNT(*) as sku_count,
          AVG(extract_min_price(dp.latest_raw_payload)) as avg_price
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        WHERE dp.brand_name = $1
        GROUP BY d.state
        ORDER BY store_count DESC
      `, [brandName]);

      return result.rows.map(row => ({
        state: row.state,
        storeCount: parseInt(row.store_count) || 0,
        skuCount: parseInt(row.sku_count) || 0,
        avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
      }));
    }, 15)).data;
  }

  /**
   * Get stores carrying a brand
   */
  async getStoresCarryingBrand(brandName: string): Promise<Array<{
    storeId: number;
    storeName: string;
    city: string;
    state: string;
    skuCount: number;
    avgPrice: number | null;
    categories: string[];
  }>> {
    const key = cacheKey('stores_carrying_brand', { brandName });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          d.id as store_id,
          d.name as store_name,
          d.city,
          d.state,
          COUNT(*) as sku_count,
          AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
          ARRAY_AGG(DISTINCT dp.type) FILTER (WHERE dp.type IS NOT NULL) as categories
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        WHERE dp.brand_name = $1
        GROUP BY d.id, d.name, d.city, d.state
        ORDER BY sku_count DESC
      `, [brandName]);

      return result.rows.map(row => ({
        storeId: row.store_id,
        storeName: row.store_name,
        city: row.city,
        state: row.state,
        skuCount: parseInt(row.sku_count) || 0,
        avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
        categories: row.categories || [],
      }));
    }, 15)).data;
  }

  /**
   * Get penetration heatmap data (state-based)
   */
  async getPenetrationHeatmap(
    brandName?: string
  ): Promise<Array<{
    state: string;
    totalStores: number;
    storesWithBrand: number;
    penetrationPercent: number;
    totalSkus: number;
  }>> {
    const key = cacheKey('penetration_heatmap', { brandName });

    return (await this.cache.getOrCompute(key, async () => {
      if (brandName) {
        const result = await this.pool.query(`
          WITH state_totals AS (
            SELECT state, COUNT(*) as total_stores
            FROM dispensaries
            GROUP BY state
          ),
          brand_by_state AS (
            SELECT
              d.state,
              COUNT(DISTINCT dp.dispensary_id) as stores_with_brand,
              COUNT(*) as total_skus
            FROM dutchie_products dp
            JOIN dispensaries d ON dp.dispensary_id = d.id
            WHERE dp.brand_name = $1
            GROUP BY d.state
          )
          SELECT
            st.state,
            st.total_stores,
            COALESCE(bs.stores_with_brand, 0) as stores_with_brand,
            ROUND(COALESCE(bs.stores_with_brand, 0)::NUMERIC / st.total_stores * 100, 1) as penetration_pct,
            COALESCE(bs.total_skus, 0) as total_skus
          FROM state_totals st
          LEFT JOIN brand_by_state bs ON st.state = bs.state
          ORDER BY penetration_pct DESC
        `, [brandName]);

        return result.rows.map(row => ({
          state: row.state,
          totalStores: parseInt(row.total_stores) || 0,
          storesWithBrand: parseInt(row.stores_with_brand) || 0,
          penetrationPercent: parseFloat(row.penetration_pct) || 0,
          totalSkus: parseInt(row.total_skus) || 0,
        }));
      } else {
        // Overall market data by state
        const result = await this.pool.query(`
          SELECT
            d.state,
            COUNT(DISTINCT d.id) as total_stores,
            COUNT(DISTINCT dp.brand_name) as brand_count,
            COUNT(*) as total_skus
          FROM dispensaries d
          LEFT JOIN dutchie_products dp ON d.id = dp.dispensary_id
          GROUP BY d.state
          ORDER BY total_stores DESC
        `);

        return result.rows.map(row => ({
          state: row.state,
          totalStores: parseInt(row.total_stores) || 0,
          storesWithBrand: parseInt(row.brand_count) || 0, // Using brand count here
          penetrationPercent: 100, // Full penetration for overall view
          totalSkus: parseInt(row.total_skus) || 0,
        }));
      }
    }, 30)).data;
  }
}
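
A minimal usage sketch for PenetrationService, assuming the same (pool, cache) wiring as the CategoryAnalyticsService example earlier; the brand name and state are hypothetical:

// inside an async function:
const penetration = new PenetrationService(pool, cache);
const snapshot = await penetration.getBrandPenetration('Example Brand', { state: 'AZ' });
console.log(`${snapshot.storesCarrying}/${snapshot.totalStores} stores (${snapshot.penetrationPercent}%)`);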

534
backend/src/dutchie-az/services/analytics/price-trends.ts
Normal file
@@ -0,0 +1,534 @@
/**
 * Price Trend Analytics Service
 *
 * Provides time-series price analytics including:
 * - Price over time for products
 * - Average MSRP/Wholesale by period
 * - Price volatility scoring
 * - Price compression detection
 *
 * Phase 3: Analytics Dashboards
 */

import { Pool } from 'pg';
import { AnalyticsCache, cacheKey } from './cache';

export interface PricePoint {
  date: string;
  minPrice: number | null;
  maxPrice: number | null;
  avgPrice: number | null;
  wholesalePrice: number | null;
  sampleSize: number;
}

export interface PriceTrend {
  productId?: number;
  storeId?: number;
  brandName?: string;
  category?: string;
  dataPoints: PricePoint[];
  summary: {
    currentAvg: number | null;
    previousAvg: number | null;
    changePercent: number | null;
    trend: 'up' | 'down' | 'stable';
    volatilityScore: number | null;
  };
}

export interface PriceSummary {
  avg7d: number | null;
  avg30d: number | null;
  avg90d: number | null;
  wholesaleAvg7d: number | null;
  wholesaleAvg30d: number | null;
  wholesaleAvg90d: number | null;
  minPrice: number | null;
  maxPrice: number | null;
  priceRange: number | null;
  volatilityScore: number | null;
}

export interface PriceCompressionResult {
  category: string;
  brands: Array<{
    brandName: string;
    avgPrice: number;
    priceDistance: number; // distance from category mean
  }>;
  compressionScore: number; // 0-100, higher = more compressed
  standardDeviation: number;
}

export interface PriceFilters {
  storeId?: number;
  brandName?: string;
  category?: string;
  state?: string;
  days?: number;
}

export class PriceTrendService {
  private pool: Pool;
  private cache: AnalyticsCache;

  constructor(pool: Pool, cache: AnalyticsCache) {
    this.pool = pool;
    this.cache = cache;
  }

  /**
   * Get price trend for a specific product
   */
  async getProductPriceTrend(
    productId: number,
    storeId?: number,
    days: number = 30
  ): Promise<PriceTrend> {
    const key = cacheKey('price_trend_product', { productId, storeId, days });

    return (await this.cache.getOrCompute(key, async () => {
      // Try to get from snapshots first
      const snapshotResult = await this.pool.query(`
        SELECT
          DATE(crawled_at) as date,
          MIN(rec_min_price_cents) / 100.0 as min_price,
          MAX(rec_max_price_cents) / 100.0 as max_price,
          AVG(rec_min_price_cents) / 100.0 as avg_price,
          AVG(wholesale_min_price_cents) / 100.0 as wholesale_price,
          COUNT(*) as sample_size
        FROM dutchie_product_snapshots
        WHERE dutchie_product_id = $1
          AND crawled_at >= NOW() - ($2 || ' days')::INTERVAL
          ${storeId ? 'AND dispensary_id = $3' : ''}
        GROUP BY DATE(crawled_at)
        ORDER BY date
      `, storeId ? [productId, days, storeId] : [productId, days]);

      let dataPoints: PricePoint[] = snapshotResult.rows.map(row => ({
        date: row.date.toISOString().split('T')[0],
        minPrice: parseFloat(row.min_price) || null,
        maxPrice: parseFloat(row.max_price) || null,
        avgPrice: parseFloat(row.avg_price) || null,
        wholesalePrice: parseFloat(row.wholesale_price) || null,
        sampleSize: parseInt(row.sample_size),
      }));

      // If no snapshots, get current price from product
      if (dataPoints.length === 0) {
        const productResult = await this.pool.query(`
          SELECT
            extract_min_price(latest_raw_payload) as min_price,
            extract_max_price(latest_raw_payload) as max_price,
            extract_wholesale_price(latest_raw_payload) as wholesale_price
          FROM dutchie_products
          WHERE id = $1
        `, [productId]);

        if (productResult.rows.length > 0) {
          const row = productResult.rows[0];
          dataPoints = [{
            date: new Date().toISOString().split('T')[0],
            minPrice: parseFloat(row.min_price) || null,
            maxPrice: parseFloat(row.max_price) || null,
            avgPrice: parseFloat(row.min_price) || null,
            wholesalePrice: parseFloat(row.wholesale_price) || null,
            sampleSize: 1,
          }];
        }
      }

      const summary = this.calculatePriceSummary(dataPoints);

      return {
        productId,
        storeId,
        dataPoints,
        summary,
      };
    }, 15)).data;
  }

  /**
   * Get price trends by brand
   */
  async getBrandPriceTrend(
    brandName: string,
    filters: PriceFilters = {}
  ): Promise<PriceTrend> {
    const { storeId, category, state, days = 30 } = filters;
    const key = cacheKey('price_trend_brand', { brandName, storeId, category, state, days });

    return (await this.cache.getOrCompute(key, async () => {
      // Use current product data aggregated by date
      const result = await this.pool.query(`
        SELECT
          DATE(dp.updated_at) as date,
          MIN(extract_min_price(dp.latest_raw_payload)) as min_price,
          MAX(extract_max_price(dp.latest_raw_payload)) as max_price,
          AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
          AVG(extract_wholesale_price(dp.latest_raw_payload)) as wholesale_price,
          COUNT(*) as sample_size
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        WHERE dp.brand_name = $1
          AND dp.updated_at >= NOW() - ($2 || ' days')::INTERVAL
          ${storeId ? 'AND dp.dispensary_id = $3' : ''}
          ${category ? `AND dp.type = $${storeId ? 4 : 3}` : ''}
          ${state ? `AND d.state = $${storeId ? (category ? 5 : 4) : (category ? 4 : 3)}` : ''}
        GROUP BY DATE(dp.updated_at)
        ORDER BY date
      `, this.buildParams([brandName, days], { storeId, category, state }));

      const dataPoints: PricePoint[] = result.rows.map(row => ({
        date: row.date.toISOString().split('T')[0],
        minPrice: parseFloat(row.min_price) || null,
        maxPrice: parseFloat(row.max_price) || null,
        avgPrice: parseFloat(row.avg_price) || null,
        wholesalePrice: parseFloat(row.wholesale_price) || null,
        sampleSize: parseInt(row.sample_size),
      }));

      return {
        brandName,
        storeId,
        category,
        dataPoints,
        summary: this.calculatePriceSummary(dataPoints),
      };
    }, 15)).data;
  }

  /**
   * Get price trends by category
   */
  async getCategoryPriceTrend(
    category: string,
    filters: PriceFilters = {}
  ): Promise<PriceTrend> {
    const { storeId, brandName, state, days = 30 } = filters;
    const key = cacheKey('price_trend_category', { category, storeId, brandName, state, days });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          DATE(dp.updated_at) as date,
          MIN(extract_min_price(dp.latest_raw_payload)) as min_price,
          MAX(extract_max_price(dp.latest_raw_payload)) as max_price,
          AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
          AVG(extract_wholesale_price(dp.latest_raw_payload)) as wholesale_price,
          COUNT(*) as sample_size
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        WHERE dp.type = $1
          AND dp.updated_at >= NOW() - ($2 || ' days')::INTERVAL
          ${storeId ? 'AND dp.dispensary_id = $3' : ''}
          ${brandName ? `AND dp.brand_name = $${storeId ? 4 : 3}` : ''}
          ${state ? `AND d.state = $${storeId ? (brandName ? 5 : 4) : (brandName ? 4 : 3)}` : ''}
        GROUP BY DATE(dp.updated_at)
        ORDER BY date
      `, this.buildParams([category, days], { storeId, brandName, state }));

      const dataPoints: PricePoint[] = result.rows.map(row => ({
        date: row.date.toISOString().split('T')[0],
        minPrice: parseFloat(row.min_price) || null,
        maxPrice: parseFloat(row.max_price) || null,
        avgPrice: parseFloat(row.avg_price) || null,
        wholesalePrice: parseFloat(row.wholesale_price) || null,
        sampleSize: parseInt(row.sample_size),
      }));

      return {
        category,
        storeId,
        brandName,
        dataPoints,
        summary: this.calculatePriceSummary(dataPoints),
      };
    }, 15)).data;
  }

  /**
   * Get price summary statistics
   */
  async getPriceSummary(filters: PriceFilters = {}): Promise<PriceSummary> {
    const { storeId, brandName, category, state } = filters;
    const key = cacheKey('price_summary', filters as Record<string, unknown>);

    return (await this.cache.getOrCompute(key, async () => {
      const whereConditions: string[] = [];
      const params: (string | number)[] = [];
      let paramIndex = 1;

      if (storeId) {
        whereConditions.push(`dp.dispensary_id = $${paramIndex++}`);
        params.push(storeId);
      }
      if (brandName) {
        whereConditions.push(`dp.brand_name = $${paramIndex++}`);
        params.push(brandName);
      }
      if (category) {
        whereConditions.push(`dp.type = $${paramIndex++}`);
        params.push(category);
      }
      if (state) {
        whereConditions.push(`d.state = $${paramIndex++}`);
        params.push(state);
      }

      const whereClause = whereConditions.length > 0
        ? 'WHERE ' + whereConditions.join(' AND ')
        : '';

      const result = await this.pool.query(`
        WITH prices AS (
          SELECT
            extract_min_price(dp.latest_raw_payload) as min_price,
            extract_max_price(dp.latest_raw_payload) as max_price,
            extract_wholesale_price(dp.latest_raw_payload) as wholesale_price
          FROM dutchie_products dp
          JOIN dispensaries d ON dp.dispensary_id = d.id
          ${whereClause}
        )
        SELECT
          AVG(min_price) as avg_price,
          AVG(wholesale_price) as avg_wholesale,
          MIN(min_price) as min_price,
          MAX(max_price) as max_price,
          STDDEV(min_price) as std_dev
        FROM prices
        WHERE min_price IS NOT NULL
      `, params);

      const row = result.rows[0];
      const avgPrice = parseFloat(row.avg_price) || null;
      const stdDev = parseFloat(row.std_dev) || null;
      const volatility = avgPrice && stdDev ? (stdDev / avgPrice) * 100 : null;

      return {
        avg7d: avgPrice, // Using current data as proxy
        avg30d: avgPrice,
        avg90d: avgPrice,
        wholesaleAvg7d: parseFloat(row.avg_wholesale) || null,
        wholesaleAvg30d: parseFloat(row.avg_wholesale) || null,
        wholesaleAvg90d: parseFloat(row.avg_wholesale) || null,
        minPrice: parseFloat(row.min_price) || null,
        maxPrice: parseFloat(row.max_price) || null,
        priceRange: row.max_price && row.min_price
          ? parseFloat(row.max_price) - parseFloat(row.min_price)
          : null,
        volatilityScore: volatility ? Math.round(volatility * 10) / 10 : null,
      };
    }, 30)).data;
  }

  /**
   * Detect price compression in a category
   */
  async detectPriceCompression(
    category: string,
    state?: string
  ): Promise<PriceCompressionResult> {
    const key = cacheKey('price_compression', { category, state });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        WITH brand_prices AS (
          SELECT
            dp.brand_name,
            AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
            COUNT(*) as sku_count
          FROM dutchie_products dp
          JOIN dispensaries d ON dp.dispensary_id = d.id
          WHERE dp.type = $1
            AND dp.brand_name IS NOT NULL
            ${state ? 'AND d.state = $2' : ''}
          GROUP BY dp.brand_name
          HAVING COUNT(*) >= 3
        ),
        stats AS (
          SELECT
            AVG(avg_price) as category_avg,
            STDDEV(avg_price) as std_dev
          FROM brand_prices
          WHERE avg_price IS NOT NULL
        )
        SELECT
          bp.brand_name,
          bp.avg_price,
          ABS(bp.avg_price - s.category_avg) as price_distance,
          s.category_avg,
          s.std_dev
        FROM brand_prices bp, stats s
        WHERE bp.avg_price IS NOT NULL
        ORDER BY bp.avg_price
      `, state ? [category, state] : [category]);

      if (result.rows.length === 0) {
        return {
          category,
          brands: [],
          compressionScore: 0,
          standardDeviation: 0,
        };
      }

      const categoryAvg = parseFloat(result.rows[0].category_avg) || 0;
      const stdDev = parseFloat(result.rows[0].std_dev) || 0;

      // Compression score: lower std dev relative to mean = more compression
      // Scale to 0-100 where 100 = very compressed
      const cv = categoryAvg > 0 ? (stdDev / categoryAvg) * 100 : 0;
      const compressionScore = Math.max(0, Math.min(100, 100 - cv));

      const brands = result.rows.map(row => ({
        brandName: row.brand_name,
        avgPrice: parseFloat(row.avg_price) || 0,
        priceDistance: parseFloat(row.price_distance) || 0,
      }));

      return {
        category,
        brands,
        compressionScore: Math.round(compressionScore),
        standardDeviation: Math.round(stdDev * 100) / 100,
      };
    }, 30)).data;
  }
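
The compression score simply inverts the coefficient of variation. With hypothetical category stats:

const categoryAvg = 32.0; // dollars
const stdDev = 4.8;
const cv = (stdDev / categoryAvg) * 100;                       // 15 (% variation across brands)
const compressionScore = Math.max(0, Math.min(100, 100 - cv)); // 85 -> tightly clustered prices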

  /**
   * Get global price statistics
   */
  async getGlobalPriceStats(): Promise<{
    totalProductsWithPrice: number;
    avgPrice: number | null;
    medianPrice: number | null;
    priceByCategory: Array<{ category: string; avgPrice: number; count: number }>;
    priceByState: Array<{ state: string; avgPrice: number; count: number }>;
  }> {
    const key = 'global_price_stats';

    return (await this.cache.getOrCompute(key, async () => {
      const [countResult, categoryResult, stateResult] = await Promise.all([
        this.pool.query(`
          SELECT
            COUNT(*) FILTER (WHERE extract_min_price(latest_raw_payload) IS NOT NULL) as with_price,
            AVG(extract_min_price(latest_raw_payload)) as avg_price,
            PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY extract_min_price(latest_raw_payload)) as median
          FROM dutchie_products
        `),
        this.pool.query(`
          SELECT
            type as category,
            AVG(extract_min_price(latest_raw_payload)) as avg_price,
            COUNT(*) as count
          FROM dutchie_products
          WHERE type IS NOT NULL
            AND extract_min_price(latest_raw_payload) IS NOT NULL
          GROUP BY type
          ORDER BY avg_price DESC
        `),
        this.pool.query(`
          SELECT
            d.state,
            AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
            COUNT(*) as count
          FROM dutchie_products dp
          JOIN dispensaries d ON dp.dispensary_id = d.id
          WHERE extract_min_price(dp.latest_raw_payload) IS NOT NULL
          GROUP BY d.state
          ORDER BY avg_price DESC
        `),
      ]);

      return {
        totalProductsWithPrice: parseInt(countResult.rows[0]?.with_price || '0'),
        avgPrice: parseFloat(countResult.rows[0]?.avg_price) || null,
        medianPrice: parseFloat(countResult.rows[0]?.median) || null,
        priceByCategory: categoryResult.rows.map(r => ({
          category: r.category,
          avgPrice: parseFloat(r.avg_price) || 0,
          count: parseInt(r.count),
        })),
        priceByState: stateResult.rows.map(r => ({
          state: r.state,
          avgPrice: parseFloat(r.avg_price) || 0,
          count: parseInt(r.count),
        })),
      };
    }, 30)).data;
  }

  // ============================================================
  // HELPER METHODS
  // ============================================================

  private calculatePriceSummary(dataPoints: PricePoint[]): PriceTrend['summary'] {
    if (dataPoints.length === 0) {
      return {
        currentAvg: null,
        previousAvg: null,
        changePercent: null,
        trend: 'stable',
        volatilityScore: null,
      };
    }

    const prices = dataPoints
      .map(d => d.avgPrice)
      .filter((p): p is number => p !== null);

    if (prices.length === 0) {
      return {
        currentAvg: null,
        previousAvg: null,
        changePercent: null,
        trend: 'stable',
        volatilityScore: null,
      };
    }

    const currentAvg = prices[prices.length - 1];
    const midpoint = Math.floor(prices.length / 2);
    const previousAvg = prices.length > 1 ? prices[midpoint] : currentAvg;

    const changePercent = previousAvg > 0
      ? ((currentAvg - previousAvg) / previousAvg) * 100
      : null;

    // Calculate volatility (coefficient of variation)
    const mean = prices.reduce((a, b) => a + b, 0) / prices.length;
    const variance = prices.reduce((sum, p) => sum + Math.pow(p - mean, 2), 0) / prices.length;
    const stdDev = Math.sqrt(variance);
    const volatilityScore = mean > 0 ? (stdDev / mean) * 100 : null;

    let trend: 'up' | 'down' | 'stable' = 'stable';
    if (changePercent !== null) {
      if (changePercent > 5) trend = 'up';
      else if (changePercent < -5) trend = 'down';
    }

    return {
      currentAvg: Math.round(currentAvg * 100) / 100,
      previousAvg: Math.round(previousAvg * 100) / 100,
      changePercent: changePercent !== null ? Math.round(changePercent * 10) / 10 : null,
      trend,
      volatilityScore: volatilityScore !== null ? Math.round(volatilityScore * 10) / 10 : null,
    };
  }
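
A worked example of the summary math with three hypothetical daily averages:

const prices = [30, 32, 34];
const currentAvg = prices[prices.length - 1];              // 34 (latest point)
const previousAvg = prices[Math.floor(prices.length / 2)]; // 32 (midpoint of the window)
const changePercent = ((34 - 32) / 32) * 100;              // 6.25 -> rounds to 6.3, trend 'up'
const mean = (30 + 32 + 34) / 3;                           // 32
const stdDev = Math.sqrt(((30 - 32) ** 2 + (32 - 32) ** 2 + (34 - 32) ** 2) / 3); // ~1.63
const volatilityScore = (stdDev / mean) * 100;             // ~5.1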

  private buildParams(
    baseParams: (string | number)[],
    optionalParams: Record<string, string | number | undefined>
  ): (string | number)[] {
    const params = [...baseParams];
    for (const value of Object.values(optionalParams)) {
      if (value !== undefined) {
        params.push(value);
      }
    }
    return params;
  }
}
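
buildParams depends on the insertion order of the options object matching the caller's $3/$4/$5 placeholders, which is why getBrandPriceTrend and getCategoryPriceTrend spell out their conditional fragments by hand. A sketch of that contract (values hypothetical; buildParams is the private helper above):

buildParams(['Example Brand', 30], { storeId: 42, category: undefined, state: 'AZ' });
// -> ['Example Brand', 30, 42, 'AZ']; category is skipped, so state binds to $4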

587
backend/src/dutchie-az/services/analytics/store-changes.ts
Normal file
@@ -0,0 +1,587 @@
/**
 * Store Change Tracking Service
 *
 * Tracks changes at the store level including:
 * - New/lost brands
 * - New/discontinued products
 * - Stock status transitions
 * - Price changes
 * - Category movement leaderboards
 *
 * Phase 3: Analytics Dashboards
 */

import { Pool } from 'pg';
import { AnalyticsCache, cacheKey } from './cache';

export interface StoreChangeSummary {
  storeId: number;
  storeName: string;
  city: string;
  state: string;
  brandsAdded7d: number;
  brandsAdded30d: number;
  brandsLost7d: number;
  brandsLost30d: number;
  productsAdded7d: number;
  productsAdded30d: number;
  productsDiscontinued7d: number;
  productsDiscontinued30d: number;
  priceDrops7d: number;
  priceIncreases7d: number;
  restocks7d: number;
  stockOuts7d: number;
}

export interface StoreChangeEvent {
  id: number;
  storeId: number;
  storeName: string;
  eventType: string;
  eventDate: string;
  brandName: string | null;
  productName: string | null;
  category: string | null;
  oldValue: string | null;
  newValue: string | null;
  metadata: Record<string, unknown> | null;
}

export interface BrandChange {
  brandName: string;
  changeType: 'added' | 'removed';
  date: string;
  skuCount: number;
  categories: string[];
}

export interface ProductChange {
  productId: number;
  productName: string;
  brandName: string | null;
  category: string | null;
  changeType: 'added' | 'discontinued' | 'price_drop' | 'price_increase' | 'restocked' | 'out_of_stock';
  date: string;
  oldValue?: string;
  newValue?: string;
}

export interface CategoryLeaderboard {
  category: string;
  storeId: number;
  storeName: string;
  skuCount: number;
  brandCount: number;
  avgPrice: number | null;
  changePercent7d: number;
  rank: number;
}

export interface StoreFilters {
  storeId?: number;
  state?: string;
  days?: number;
  eventType?: string;
}

export class StoreChangeService {
  private pool: Pool;
  private cache: AnalyticsCache;

  constructor(pool: Pool, cache: AnalyticsCache) {
    this.pool = pool;
    this.cache = cache;
  }

  /**
   * Get change summary for a store
   */
  async getStoreChangeSummary(
    storeId: number
  ): Promise<StoreChangeSummary | null> {
    const key = cacheKey('store_change_summary', { storeId });

    return (await this.cache.getOrCompute(key, async () => {
      // Get store info
      const storeResult = await this.pool.query(`
        SELECT id, name, city, state FROM dispensaries WHERE id = $1
      `, [storeId]);

      if (storeResult.rows.length === 0) return null;
      const store = storeResult.rows[0];

      // Get change events counts
      const eventsResult = await this.pool.query(`
        SELECT
          event_type,
          COUNT(*) FILTER (WHERE event_date >= CURRENT_DATE - INTERVAL '7 days') as count_7d,
          COUNT(*) FILTER (WHERE event_date >= CURRENT_DATE - INTERVAL '30 days') as count_30d
        FROM store_change_events
        WHERE store_id = $1
        GROUP BY event_type
      `, [storeId]);

      const counts: Record<string, { count_7d: number; count_30d: number }> = {};
      eventsResult.rows.forEach(row => {
        counts[row.event_type] = {
          count_7d: parseInt(row.count_7d) || 0,
          count_30d: parseInt(row.count_30d) || 0,
        };
      });

      return {
        storeId: store.id,
        storeName: store.name,
        city: store.city,
        state: store.state,
        brandsAdded7d: counts['brand_added']?.count_7d || 0,
        brandsAdded30d: counts['brand_added']?.count_30d || 0,
        brandsLost7d: counts['brand_removed']?.count_7d || 0,
        brandsLost30d: counts['brand_removed']?.count_30d || 0,
        productsAdded7d: counts['product_added']?.count_7d || 0,
        productsAdded30d: counts['product_added']?.count_30d || 0,
        productsDiscontinued7d: counts['product_removed']?.count_7d || 0,
        productsDiscontinued30d: counts['product_removed']?.count_30d || 0,
        priceDrops7d: counts['price_drop']?.count_7d || 0,
        priceIncreases7d: counts['price_increase']?.count_7d || 0,
        restocks7d: counts['restocked']?.count_7d || 0,
        stockOuts7d: counts['out_of_stock']?.count_7d || 0,
      };
    }, 15)).data;
  }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get recent change events for a store
|
||||||
|
*/
|
||||||
|
async getStoreChangeEvents(
|
||||||
|
storeId: number,
|
||||||
|
filters: { eventType?: string; days?: number; limit?: number } = {}
|
||||||
|
): Promise<StoreChangeEvent[]> {
|
||||||
|
const { eventType, days = 30, limit = 100 } = filters;
|
||||||
|
const key = cacheKey('store_change_events', { storeId, eventType, days, limit });
|
||||||
|
|
||||||
|
return (await this.cache.getOrCompute(key, async () => {
|
||||||
|
const params: (string | number)[] = [storeId, days, limit];
|
||||||
|
let eventTypeCondition = '';
|
||||||
|
|
||||||
|
if (eventType) {
|
||||||
|
eventTypeCondition = 'AND event_type = $4';
|
||||||
|
params.push(eventType);
|
||||||
|
}
|
||||||
|
|
||||||
|
const result = await this.pool.query(`
|
||||||
|
SELECT
|
||||||
|
sce.id,
|
||||||
|
sce.store_id,
|
||||||
|
d.name as store_name,
|
||||||
|
sce.event_type,
|
||||||
|
sce.event_date,
|
||||||
|
sce.brand_name,
|
||||||
|
sce.product_name,
|
||||||
|
sce.category,
|
||||||
|
sce.old_value,
|
||||||
|
sce.new_value,
|
||||||
|
sce.metadata
|
||||||
|
FROM store_change_events sce
|
||||||
|
JOIN dispensaries d ON sce.store_id = d.id
|
||||||
|
WHERE sce.store_id = $1
|
||||||
|
AND sce.event_date >= CURRENT_DATE - ($2 || ' days')::INTERVAL
|
||||||
|
${eventTypeCondition}
|
||||||
|
ORDER BY sce.event_date DESC, sce.id DESC
|
||||||
|
LIMIT $3
|
||||||
|
`, params);
|
||||||
|
|
||||||
|
return result.rows.map(row => ({
|
||||||
|
id: row.id,
|
||||||
|
storeId: row.store_id,
|
||||||
|
storeName: row.store_name,
|
||||||
|
eventType: row.event_type,
|
||||||
|
eventDate: row.event_date.toISOString().split('T')[0],
|
||||||
|
brandName: row.brand_name,
|
||||||
|
productName: row.product_name,
|
||||||
|
category: row.category,
|
||||||
|
oldValue: row.old_value,
|
||||||
|
newValue: row.new_value,
|
||||||
|
metadata: row.metadata,
|
||||||
|
}));
|
||||||
|
}, 5)).data;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get new brands added to a store
|
||||||
|
*/
|
||||||
|
async getNewBrands(
|
||||||
|
storeId: number,
|
||||||
|
days: number = 30
|
||||||
|
): Promise<BrandChange[]> {
|
||||||
|
const key = cacheKey('new_brands', { storeId, days });
|
||||||
|
|
||||||
|
return (await this.cache.getOrCompute(key, async () => {
|
||||||
|
const result = await this.pool.query(`
|
||||||
|
SELECT
|
||||||
|
brand_name,
|
||||||
|
event_date,
|
||||||
|
metadata
|
||||||
|
FROM store_change_events
|
||||||
|
WHERE store_id = $1
|
||||||
|
AND event_type = 'brand_added'
|
||||||
|
AND event_date >= CURRENT_DATE - ($2 || ' days')::INTERVAL
|
||||||
|
ORDER BY event_date DESC
|
||||||
|
`, [storeId, days]);
|
||||||
|
|
||||||
|
return result.rows.map(row => ({
|
||||||
|
brandName: row.brand_name,
|
||||||
|
changeType: 'added' as const,
|
||||||
|
date: row.event_date.toISOString().split('T')[0],
|
||||||
|
skuCount: row.metadata?.sku_count || 0,
|
||||||
|
categories: row.metadata?.categories || [],
|
||||||
|
}));
|
||||||
|
}, 15)).data;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get brands lost from a store
|
||||||
|
*/
|
||||||
|
async getLostBrands(
|
||||||
|
storeId: number,
|
||||||
|
days: number = 30
|
||||||
|
): Promise<BrandChange[]> {
|
||||||
|
const key = cacheKey('lost_brands', { storeId, days });
|
||||||
|
|
||||||
|
return (await this.cache.getOrCompute(key, async () => {
|
||||||
|
const result = await this.pool.query(`
|
||||||
|
SELECT
|
||||||
|
brand_name,
|
||||||
|
event_date,
|
||||||
|
metadata
|
||||||
|
FROM store_change_events
|
||||||
|
WHERE store_id = $1
|
||||||
|
AND event_type = 'brand_removed'
|
||||||
|
AND event_date >= CURRENT_DATE - ($2 || ' days')::INTERVAL
|
||||||
|
ORDER BY event_date DESC
|
||||||
|
`, [storeId, days]);
|
||||||
|
|
||||||
|
return result.rows.map(row => ({
|
||||||
|
brandName: row.brand_name,
|
||||||
|
changeType: 'removed' as const,
|
||||||
|
date: row.event_date.toISOString().split('T')[0],
|
||||||
|
skuCount: row.metadata?.sku_count || 0,
|
||||||
|
categories: row.metadata?.categories || [],
|
||||||
|
}));
|
||||||
|
}, 15)).data;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get product changes for a store
|
||||||
|
*/
|
||||||
|
async getProductChanges(
|
||||||
|
storeId: number,
|
||||||
|
changeType?: 'added' | 'discontinued' | 'price_drop' | 'price_increase' | 'restocked' | 'out_of_stock',
|
||||||
|
days: number = 7
|
||||||
|
): Promise<ProductChange[]> {
|
||||||
|
const key = cacheKey('product_changes', { storeId, changeType, days });
|
||||||
|
|
||||||
|
return (await this.cache.getOrCompute(key, async () => {
|
||||||
|
const eventTypeMap: Record<string, string> = {
|
||||||
|
'added': 'product_added',
|
||||||
|
'discontinued': 'product_removed',
|
||||||
|
'price_drop': 'price_drop',
|
||||||
|
'price_increase': 'price_increase',
|
||||||
|
'restocked': 'restocked',
|
||||||
|
'out_of_stock': 'out_of_stock',
|
||||||
|
};
|
||||||
|
|
||||||
|
const params: (string | number)[] = [storeId, days];
|
||||||
|
let eventCondition = '';
|
||||||
|
|
||||||
|
if (changeType) {
|
||||||
|
eventCondition = 'AND event_type = $3';
|
||||||
|
params.push(eventTypeMap[changeType]);
|
||||||
|
}
|
||||||
|
|
||||||
|
const result = await this.pool.query(`
|
||||||
|
SELECT
|
||||||
|
product_id,
|
||||||
|
product_name,
|
||||||
|
brand_name,
|
||||||
|
category,
|
||||||
|
event_type,
|
||||||
|
event_date,
|
||||||
|
old_value,
|
||||||
|
new_value
|
||||||
|
FROM store_change_events
|
||||||
|
WHERE store_id = $1
|
||||||
|
AND event_date >= CURRENT_DATE - ($2 || ' days')::INTERVAL
|
||||||
|
AND product_id IS NOT NULL
|
||||||
|
${eventCondition}
|
||||||
|
ORDER BY event_date DESC
|
||||||
|
LIMIT 100
|
||||||
|
`, params);
|
||||||
|
|
||||||
|
const reverseMap: Record<string, ProductChange['changeType']> = {
|
||||||
|
'product_added': 'added',
|
||||||
|
'product_removed': 'discontinued',
|
||||||
|
'price_drop': 'price_drop',
|
||||||
|
'price_increase': 'price_increase',
|
||||||
|
'restocked': 'restocked',
|
||||||
|
'out_of_stock': 'out_of_stock',
|
||||||
|
};
|
||||||
|
|
||||||
|
return result.rows.map(row => ({
|
||||||
|
productId: row.product_id,
|
||||||
|
productName: row.product_name,
|
||||||
|
brandName: row.brand_name,
|
||||||
|
category: row.category,
|
||||||
|
changeType: reverseMap[row.event_type] || 'added',
|
||||||
|
date: row.event_date.toISOString().split('T')[0],
|
||||||
|
oldValue: row.old_value,
|
||||||
|
newValue: row.new_value,
|
||||||
|
}));
|
||||||
|
}, 5)).data;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get category leaderboard across stores
|
||||||
|
*/
|
||||||
|
async getCategoryLeaderboard(
|
||||||
|
category: string,
|
||||||
|
limit: number = 20
|
||||||
|
): Promise<CategoryLeaderboard[]> {
|
||||||
|
const key = cacheKey('category_leaderboard', { category, limit });
|
||||||
|
|
||||||
|
return (await this.cache.getOrCompute(key, async () => {
|
||||||
|
const result = await this.pool.query(`
|
||||||
|
WITH store_category_stats AS (
|
||||||
|
SELECT
|
||||||
|
dp.dispensary_id as store_id,
|
||||||
|
d.name as store_name,
|
||||||
|
COUNT(*) as sku_count,
|
||||||
|
COUNT(DISTINCT dp.brand_name) as brand_count,
|
||||||
|
AVG(extract_min_price(dp.latest_raw_payload)) as avg_price
|
||||||
|
FROM dutchie_products dp
|
||||||
|
JOIN dispensaries d ON dp.dispensary_id = d.id
|
||||||
|
WHERE dp.type = $1
|
||||||
|
GROUP BY dp.dispensary_id, d.name
|
||||||
|
)
|
||||||
|
SELECT
|
||||||
|
scs.*,
|
||||||
|
RANK() OVER (ORDER BY scs.sku_count DESC) as rank
|
||||||
|
FROM store_category_stats scs
|
||||||
|
ORDER BY scs.sku_count DESC
|
||||||
|
LIMIT $2
|
||||||
|
`, [category, limit]);
|
||||||
|
|
||||||
|
return result.rows.map(row => ({
|
||||||
|
category,
|
||||||
|
storeId: row.store_id,
|
||||||
|
storeName: row.store_name,
|
||||||
|
skuCount: parseInt(row.sku_count) || 0,
|
||||||
|
brandCount: parseInt(row.brand_count) || 0,
|
||||||
|
avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
|
||||||
|
changePercent7d: 0, // Would need historical data
|
||||||
|
rank: parseInt(row.rank) || 0,
|
||||||
|
}));
|
||||||
|
}, 15)).data;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get stores with most activity (changes)
|
||||||
|
*/
|
||||||
|
async getMostActiveStores(
|
||||||
|
days: number = 7,
|
||||||
|
limit: number = 10
|
||||||
|
): Promise<Array<{
|
||||||
|
storeId: number;
|
||||||
|
storeName: string;
|
||||||
|
city: string;
|
||||||
|
state: string;
|
||||||
|
totalChanges: number;
|
||||||
|
brandsChanged: number;
|
||||||
|
productsChanged: number;
|
||||||
|
priceChanges: number;
|
||||||
|
stockChanges: number;
|
||||||
|
}>> {
|
||||||
|
const key = cacheKey('most_active_stores', { days, limit });
|
||||||
|
|
||||||
|
return (await this.cache.getOrCompute(key, async () => {
|
||||||
|
const result = await this.pool.query(`
|
||||||
|
SELECT
|
||||||
|
d.id as store_id,
|
||||||
|
d.name as store_name,
|
||||||
|
d.city,
|
||||||
|
d.state,
|
||||||
|
COUNT(*) as total_changes,
|
||||||
|
COUNT(*) FILTER (WHERE sce.event_type IN ('brand_added', 'brand_removed')) as brands_changed,
|
||||||
|
COUNT(*) FILTER (WHERE sce.event_type IN ('product_added', 'product_removed')) as products_changed,
|
||||||
|
COUNT(*) FILTER (WHERE sce.event_type IN ('price_drop', 'price_increase')) as price_changes,
|
||||||
|
COUNT(*) FILTER (WHERE sce.event_type IN ('restocked', 'out_of_stock')) as stock_changes
|
||||||
|
FROM store_change_events sce
|
||||||
|
JOIN dispensaries d ON sce.store_id = d.id
|
||||||
|
WHERE sce.event_date >= CURRENT_DATE - ($1 || ' days')::INTERVAL
|
||||||
|
GROUP BY d.id, d.name, d.city, d.state
|
||||||
|
ORDER BY total_changes DESC
|
||||||
|
LIMIT $2
|
||||||
|
`, [days, limit]);
|
||||||
|
|
||||||
|
return result.rows.map(row => ({
|
||||||
|
storeId: row.store_id,
|
||||||
|
storeName: row.store_name,
|
||||||
|
city: row.city,
|
||||||
|
state: row.state,
|
||||||
|
totalChanges: parseInt(row.total_changes) || 0,
|
||||||
|
brandsChanged: parseInt(row.brands_changed) || 0,
|
||||||
|
productsChanged: parseInt(row.products_changed) || 0,
|
||||||
|
priceChanges: parseInt(row.price_changes) || 0,
|
||||||
|
stockChanges: parseInt(row.stock_changes) || 0,
|
||||||
|
}));
|
||||||
|
}, 15)).data;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compare two stores
|
||||||
|
*/
|
||||||
|
async compareStores(
|
||||||
|
storeId1: number,
|
||||||
|
storeId2: number
|
||||||
|
): Promise<{
|
||||||
|
store1: { id: number; name: string; brands: string[]; categories: string[]; skuCount: number };
|
||||||
|
store2: { id: number; name: string; brands: string[]; categories: string[]; skuCount: number };
|
||||||
|
sharedBrands: string[];
|
||||||
|
uniqueToStore1: string[];
|
||||||
|
uniqueToStore2: string[];
|
||||||
|
categoryComparison: Array<{
|
||||||
|
category: string;
|
||||||
|
store1Skus: number;
|
||||||
|
store2Skus: number;
|
||||||
|
difference: number;
|
||||||
|
}>;
|
||||||
|
}> {
|
||||||
|
const key = cacheKey('compare_stores', { storeId1, storeId2 });
|
||||||
|
|
||||||
|
return (await this.cache.getOrCompute(key, async () => {
|
||||||
|
const [store1Data, store2Data] = await Promise.all([
|
||||||
|
this.pool.query(`
|
||||||
|
SELECT
|
||||||
|
d.id, d.name,
|
||||||
|
ARRAY_AGG(DISTINCT dp.brand_name) FILTER (WHERE dp.brand_name IS NOT NULL) as brands,
|
||||||
|
ARRAY_AGG(DISTINCT dp.type) FILTER (WHERE dp.type IS NOT NULL) as categories,
|
||||||
|
COUNT(*) as sku_count
|
||||||
|
FROM dispensaries d
|
||||||
|
LEFT JOIN dutchie_products dp ON d.id = dp.dispensary_id
|
||||||
|
WHERE d.id = $1
|
||||||
|
GROUP BY d.id, d.name
|
||||||
|
`, [storeId1]),
|
||||||
|
this.pool.query(`
|
||||||
|
SELECT
|
||||||
|
d.id, d.name,
|
||||||
|
ARRAY_AGG(DISTINCT dp.brand_name) FILTER (WHERE dp.brand_name IS NOT NULL) as brands,
|
||||||
|
ARRAY_AGG(DISTINCT dp.type) FILTER (WHERE dp.type IS NOT NULL) as categories,
|
||||||
|
COUNT(*) as sku_count
|
||||||
|
FROM dispensaries d
|
||||||
|
LEFT JOIN dutchie_products dp ON d.id = dp.dispensary_id
|
||||||
|
WHERE d.id = $1
|
||||||
|
GROUP BY d.id, d.name
|
||||||
|
`, [storeId2]),
|
||||||
|
]);
|
||||||
|
|
||||||
|
const s1 = store1Data.rows[0];
|
||||||
|
const s2 = store2Data.rows[0];
|
||||||
|
|
||||||
|
const brands1Array: string[] = (s1?.brands || []).filter((b: string | null): b is string => b !== null);
|
||||||
|
const brands2Array: string[] = (s2?.brands || []).filter((b: string | null): b is string => b !== null);
|
||||||
|
const brands1 = new Set(brands1Array);
|
||||||
|
const brands2 = new Set(brands2Array);
|
||||||
|
|
||||||
|
const sharedBrands: string[] = brands1Array.filter(b => brands2.has(b));
|
||||||
|
const uniqueToStore1: string[] = brands1Array.filter(b => !brands2.has(b));
|
||||||
|
const uniqueToStore2: string[] = brands2Array.filter(b => !brands1.has(b));
|
||||||
|
|
||||||
|
// Category comparison
|
||||||
|
const categoryResult = await this.pool.query(`
|
||||||
|
WITH store1_cats AS (
|
||||||
|
SELECT type as category, COUNT(*) as sku_count
|
||||||
|
FROM dutchie_products WHERE dispensary_id = $1 AND type IS NOT NULL
|
||||||
|
GROUP BY type
|
||||||
|
),
|
||||||
|
store2_cats AS (
|
||||||
|
SELECT type as category, COUNT(*) as sku_count
|
||||||
|
FROM dutchie_products WHERE dispensary_id = $2 AND type IS NOT NULL
|
||||||
|
GROUP BY type
|
||||||
|
),
|
||||||
|
all_cats AS (
|
||||||
|
SELECT category FROM store1_cats
|
||||||
|
UNION
|
||||||
|
SELECT category FROM store2_cats
|
||||||
|
)
|
||||||
|
SELECT
|
||||||
|
ac.category,
|
||||||
|
COALESCE(s1.sku_count, 0) as store1_skus,
|
||||||
|
COALESCE(s2.sku_count, 0) as store2_skus
|
||||||
|
FROM all_cats ac
|
||||||
|
LEFT JOIN store1_cats s1 ON ac.category = s1.category
|
||||||
|
LEFT JOIN store2_cats s2 ON ac.category = s2.category
|
||||||
|
ORDER BY (COALESCE(s1.sku_count, 0) + COALESCE(s2.sku_count, 0)) DESC
|
||||||
|
`, [storeId1, storeId2]);
|
||||||
|
|
||||||
|
return {
|
||||||
|
store1: {
|
||||||
|
id: s1?.id || storeId1,
|
||||||
|
name: s1?.name || 'Unknown',
|
||||||
|
brands: s1?.brands || [],
|
||||||
|
categories: s1?.categories || [],
|
||||||
|
skuCount: parseInt(s1?.sku_count) || 0,
|
||||||
|
},
|
||||||
|
store2: {
|
||||||
|
id: s2?.id || storeId2,
|
||||||
|
name: s2?.name || 'Unknown',
|
||||||
|
brands: s2?.brands || [],
|
||||||
|
categories: s2?.categories || [],
|
||||||
|
skuCount: parseInt(s2?.sku_count) || 0,
|
||||||
|
},
|
||||||
|
sharedBrands,
|
||||||
|
uniqueToStore1,
|
||||||
|
uniqueToStore2,
|
||||||
|
categoryComparison: categoryResult.rows.map(row => ({
|
||||||
|
category: row.category,
|
||||||
|
store1Skus: parseInt(row.store1_skus) || 0,
|
||||||
|
store2Skus: parseInt(row.store2_skus) || 0,
|
||||||
|
difference: (parseInt(row.store1_skus) || 0) - (parseInt(row.store2_skus) || 0),
|
||||||
|
})),
|
||||||
|
};
|
||||||
|
}, 15)).data;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Record a change event (used by crawler/worker)
|
||||||
|
*/
|
||||||
|
async recordChangeEvent(event: {
|
||||||
|
storeId: number;
|
||||||
|
eventType: string;
|
||||||
|
brandName?: string;
|
||||||
|
productId?: number;
|
||||||
|
productName?: string;
|
||||||
|
category?: string;
|
||||||
|
oldValue?: string;
|
||||||
|
newValue?: string;
|
||||||
|
metadata?: Record<string, unknown>;
|
||||||
|
}): Promise<void> {
|
||||||
|
await this.pool.query(`
|
||||||
|
INSERT INTO store_change_events
|
||||||
|
(store_id, event_type, event_date, brand_name, product_id, product_name, category, old_value, new_value, metadata)
|
||||||
|
VALUES ($1, $2, CURRENT_DATE, $3, $4, $5, $6, $7, $8, $9)
|
||||||
|
`, [
|
||||||
|
event.storeId,
|
||||||
|
event.eventType,
|
||||||
|
event.brandName || null,
|
||||||
|
event.productId || null,
|
||||||
|
event.productName || null,
|
||||||
|
event.category || null,
|
||||||
|
event.oldValue || null,
|
||||||
|
event.newValue || null,
|
||||||
|
event.metadata ? JSON.stringify(event.metadata) : null,
|
||||||
|
]);
|
||||||
|
|
||||||
|
// Invalidate cache
|
||||||
|
await this.cache.invalidatePattern(`store_change_summary:storeId=${event.storeId}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
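Note: a minimal usage sketch for the service above. The pool config and the `AnalyticsCache` construction are assumptions for illustration; the cache's real constructor signature is not shown in this diff.

```ts
import { Pool } from 'pg';
import { AnalyticsCache } from './cache';
import { StoreChangeService } from './store-changes';

// Hypothetical wiring - connection string and cache construction are
// assumptions, not part of this diff.
const pool = new Pool({ connectionString: process.env.CANNAIQ_DB_URL });
const service = new StoreChangeService(pool, new AnalyticsCache());

async function printSummary(storeId: number): Promise<void> {
  const summary = await service.getStoreChangeSummary(storeId);
  if (!summary) {
    console.log(`No store found for id=${storeId}`);
    return;
  }
  console.log(`${summary.storeName} (${summary.city}, ${summary.state}):`);
  console.log(`  brands +${summary.brandsAdded7d}/-${summary.brandsLost7d} in 7d`);
  console.log(`  price drops: ${summary.priceDrops7d}, stock-outs: ${summary.stockOuts7d}`);
}
```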
@@ -1,20 +1,27 @@
 /**
- * AZDHS Import Service
+ * LEGACY SERVICE - AZDHS Import
+ *
+ * DEPRECATED: This service creates its own database pool.
+ * Future implementations should use the canonical CannaiQ connection.
  *
  * Imports Arizona dispensaries from the main database's dispensaries table
  * (which was populated from AZDHS data) into the isolated Dutchie AZ database.
  *
  * This establishes the canonical list of AZ dispensaries to match against Dutchie.
+ *
+ * DO NOT:
+ * - Run this in automated jobs
+ * - Use DATABASE_URL directly
  */
 
 import { Pool } from 'pg';
 import { query as dutchieQuery } from '../db/connection';
 import { Dispensary } from '../types';
 
-// Main database connection (source of AZDHS data)
-const MAIN_DATABASE_URL =
-  process.env.DATABASE_URL ||
-  'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';
+// Single database connection (cannaiq in cannaiq-postgres container)
+// Use CANNAIQ_DB_* env vars or defaults
+const MAIN_DB_CONNECTION = process.env.CANNAIQ_DB_URL ||
+  `postgresql://${process.env.CANNAIQ_DB_USER || 'dutchie'}:${process.env.CANNAIQ_DB_PASS || 'dutchie_local_pass'}@${process.env.CANNAIQ_DB_HOST || 'localhost'}:${process.env.CANNAIQ_DB_PORT || '54320'}/${process.env.CANNAIQ_DB_NAME || 'cannaiq'}`;
 
 /**
  * AZDHS dispensary record from the main database
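Note on the connection fallback above: `CANNAIQ_DB_URL` takes priority when set; otherwise the URL is assembled from the individual `CANNAIQ_DB_*` vars with local-dev defaults. The same precedence rule as a standalone helper, names illustrative only:

```ts
// Sketch of the precedence rule; resolveDbUrl is a hypothetical helper,
// not part of this diff.
function resolveDbUrl(env: NodeJS.ProcessEnv): string {
  if (env.CANNAIQ_DB_URL) return env.CANNAIQ_DB_URL; // full URL wins
  const user = env.CANNAIQ_DB_USER || 'dutchie';
  const pass = env.CANNAIQ_DB_PASS || 'dutchie_local_pass';
  const host = env.CANNAIQ_DB_HOST || 'localhost';
  const port = env.CANNAIQ_DB_PORT || '54320';
  const name = env.CANNAIQ_DB_NAME || 'cannaiq';
  return `postgresql://${user}:${pass}@${host}:${port}/${name}`;
}
```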
@@ -57,8 +64,9 @@ interface ImportResult {
  * Create a temporary connection to the main database
  */
 function getMainDBPool(): Pool {
+  console.warn('[AZDHS Import] LEGACY: Using separate pool. Should use canonical CannaiQ connection.');
   return new Pool({
-    connectionString: MAIN_DATABASE_URL,
+    connectionString: MAIN_DB_CONNECTION,
     max: 5,
     idleTimeoutMillis: 30000,
     connectionTimeoutMillis: 5000,
@@ -344,15 +344,12 @@ export async function resolvePlatformDispensaryIds(): Promise<{ resolved: number
   return { resolved, failed, skipped, notCrawlable };
 }
 
+// Use shared dispensary columns (handles optional columns like provider_detection_data)
+import { DISPENSARY_COLUMNS } from '../db/dispensary-columns';
+
 /**
  * Get all dispensaries
  */
-// Explicit column list for dispensaries table (avoids SELECT * issues with schema differences)
-const DISPENSARY_COLUMNS = `
-  id, name, slug, city, state, zip, address, latitude, longitude,
-  menu_type, menu_url, platform_dispensary_id, website,
-  provider_detection_data, created_at, updated_at
-`;
-
 export async function getAllDispensaries(): Promise<Dispensary[]> {
   const { rows } = await query(

@@ -386,7 +383,7 @@ export function mapDbRowToDispensary(row: any): Dispensary {
     id: row.id,
     platform: row.platform || 'dutchie', // keep platform as-is, default to 'dutchie'
     name: row.name,
-    dbaName: row.dbaName || row.dba_name,
+    dbaName: row.dbaName || row.dba_name || undefined, // dba_name column is optional
     slug: row.slug,
     city: row.city,
     state: row.state,

@@ -421,7 +418,6 @@ export async function getDispensaryById(id: number): Promise<Dispensary | null>
       SELECT
         id,
         name,
-        dba_name AS "dbaName",
         slug,
         city,
         state,
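Note: the shared `../db/dispensary-columns` module is not shown in this section. Assuming it mirrors the inline constant removed above, it would look roughly like:

```ts
// backend/src/dutchie-az/db/dispensary-columns.ts (assumed shape, mirroring
// the removed inline constant; the real module may differ)
export const DISPENSARY_COLUMNS = `
  id, name, slug, city, state, zip, address, latitude, longitude,
  menu_type, menu_url, platform_dispensary_id, website,
  provider_detection_data, created_at, updated_at
`;
```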
491
backend/src/dutchie-az/services/error-taxonomy.ts
Normal file
@@ -0,0 +1,491 @@
/**
 * Error Taxonomy Module
 *
 * Standardized error codes and classification for crawler reliability.
 * All crawl results must use these codes for consistent error handling.
 *
 * Phase 1: Crawler Reliability & Stabilization
 */

// ============================================================
// ERROR CODES
// ============================================================

/**
 * Standardized error codes for all crawl operations.
 * These codes are stored in the database for analytics and debugging.
 */
export const CrawlErrorCode = {
  // Success states
  SUCCESS: 'SUCCESS',

  // Rate limiting
  RATE_LIMITED: 'RATE_LIMITED', // 429 responses

  // Proxy issues
  BLOCKED_PROXY: 'BLOCKED_PROXY', // 407 or proxy-related blocks
  PROXY_TIMEOUT: 'PROXY_TIMEOUT', // Proxy connection timeout

  // Content issues
  HTML_CHANGED: 'HTML_CHANGED', // Page structure changed
  NO_PRODUCTS: 'NO_PRODUCTS', // Empty response (valid but no data)
  PARSE_ERROR: 'PARSE_ERROR', // Failed to parse response

  // Network issues
  TIMEOUT: 'TIMEOUT', // Request timeout
  NETWORK_ERROR: 'NETWORK_ERROR', // Connection failed
  DNS_ERROR: 'DNS_ERROR', // DNS resolution failed

  // Authentication
  AUTH_FAILED: 'AUTH_FAILED', // Authentication/session issues

  // Server errors
  SERVER_ERROR: 'SERVER_ERROR', // 5xx responses
  SERVICE_UNAVAILABLE: 'SERVICE_UNAVAILABLE', // 503

  // Configuration issues
  INVALID_CONFIG: 'INVALID_CONFIG', // Bad store configuration
  MISSING_PLATFORM_ID: 'MISSING_PLATFORM_ID', // No platform_dispensary_id

  // Unknown
  UNKNOWN_ERROR: 'UNKNOWN_ERROR', // Catch-all for unclassified errors
} as const;

export type CrawlErrorCodeType = typeof CrawlErrorCode[keyof typeof CrawlErrorCode];

// ============================================================
// ERROR CLASSIFICATION
// ============================================================

/**
 * Error metadata for each error code
 */
interface ErrorMetadata {
  code: CrawlErrorCodeType;
  retryable: boolean;
  rotateProxy: boolean;
  rotateUserAgent: boolean;
  backoffMultiplier: number;
  severity: 'low' | 'medium' | 'high' | 'critical';
  description: string;
}

/**
 * Metadata for each error code - defines retry behavior
 */
export const ERROR_METADATA: Record<CrawlErrorCodeType, ErrorMetadata> = {
  [CrawlErrorCode.SUCCESS]: {
    code: CrawlErrorCode.SUCCESS,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 0,
    severity: 'low',
    description: 'Crawl completed successfully',
  },

  [CrawlErrorCode.RATE_LIMITED]: {
    code: CrawlErrorCode.RATE_LIMITED,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: true,
    backoffMultiplier: 2.0,
    severity: 'medium',
    description: 'Rate limited by target (429)',
  },

  [CrawlErrorCode.BLOCKED_PROXY]: {
    code: CrawlErrorCode.BLOCKED_PROXY,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: true,
    backoffMultiplier: 1.5,
    severity: 'medium',
    description: 'Proxy blocked or rejected (407)',
  },

  [CrawlErrorCode.PROXY_TIMEOUT]: {
    code: CrawlErrorCode.PROXY_TIMEOUT,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'low',
    description: 'Proxy connection timed out',
  },

  [CrawlErrorCode.HTML_CHANGED]: {
    code: CrawlErrorCode.HTML_CHANGED,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'high',
    description: 'Page structure changed - needs selector update',
  },

  [CrawlErrorCode.NO_PRODUCTS]: {
    code: CrawlErrorCode.NO_PRODUCTS,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'low',
    description: 'No products returned (may be temporary)',
  },

  [CrawlErrorCode.PARSE_ERROR]: {
    code: CrawlErrorCode.PARSE_ERROR,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'medium',
    description: 'Failed to parse response data',
  },

  [CrawlErrorCode.TIMEOUT]: {
    code: CrawlErrorCode.TIMEOUT,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1.5,
    severity: 'medium',
    description: 'Request timed out',
  },

  [CrawlErrorCode.NETWORK_ERROR]: {
    code: CrawlErrorCode.NETWORK_ERROR,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'medium',
    description: 'Network connection failed',
  },

  [CrawlErrorCode.DNS_ERROR]: {
    code: CrawlErrorCode.DNS_ERROR,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'medium',
    description: 'DNS resolution failed',
  },

  [CrawlErrorCode.AUTH_FAILED]: {
    code: CrawlErrorCode.AUTH_FAILED,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: true,
    backoffMultiplier: 2.0,
    severity: 'high',
    description: 'Authentication or session failed',
  },

  [CrawlErrorCode.SERVER_ERROR]: {
    code: CrawlErrorCode.SERVER_ERROR,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.5,
    severity: 'medium',
    description: 'Server error (5xx)',
  },

  [CrawlErrorCode.SERVICE_UNAVAILABLE]: {
    code: CrawlErrorCode.SERVICE_UNAVAILABLE,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 2.0,
    severity: 'high',
    description: 'Service temporarily unavailable (503)',
  },

  [CrawlErrorCode.INVALID_CONFIG]: {
    code: CrawlErrorCode.INVALID_CONFIG,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 0,
    severity: 'critical',
    description: 'Invalid store configuration',
  },

  [CrawlErrorCode.MISSING_PLATFORM_ID]: {
    code: CrawlErrorCode.MISSING_PLATFORM_ID,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 0,
    severity: 'critical',
    description: 'Missing platform_dispensary_id',
  },

  [CrawlErrorCode.UNKNOWN_ERROR]: {
    code: CrawlErrorCode.UNKNOWN_ERROR,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'high',
    description: 'Unknown/unclassified error',
  },
};

// ============================================================
// ERROR CLASSIFICATION FUNCTIONS
// ============================================================

/**
 * Classify an error into a standardized error code.
 *
 * @param error - The error to classify (Error object, string, or HTTP status)
 * @param httpStatus - Optional HTTP status code
 * @returns Standardized error code
 */
export function classifyError(
  error: Error | string | null,
  httpStatus?: number
): CrawlErrorCodeType {
  // Check HTTP status first
  if (httpStatus) {
    if (httpStatus === 429) return CrawlErrorCode.RATE_LIMITED;
    if (httpStatus === 407) return CrawlErrorCode.BLOCKED_PROXY;
    if (httpStatus === 401 || httpStatus === 403) return CrawlErrorCode.AUTH_FAILED;
    if (httpStatus === 503) return CrawlErrorCode.SERVICE_UNAVAILABLE;
    if (httpStatus >= 500) return CrawlErrorCode.SERVER_ERROR;
  }

  if (!error) return CrawlErrorCode.UNKNOWN_ERROR;

  const message = typeof error === 'string' ? error.toLowerCase() : error.message.toLowerCase();

  // Rate limiting patterns
  if (message.includes('rate limit') || message.includes('too many requests') || message.includes('429')) {
    return CrawlErrorCode.RATE_LIMITED;
  }

  // Proxy patterns
  if (message.includes('proxy') && (message.includes('block') || message.includes('reject') || message.includes('407'))) {
    return CrawlErrorCode.BLOCKED_PROXY;
  }

  // Timeout patterns
  if (message.includes('timeout') || message.includes('timed out') || message.includes('etimedout')) {
    if (message.includes('proxy')) {
      return CrawlErrorCode.PROXY_TIMEOUT;
    }
    return CrawlErrorCode.TIMEOUT;
  }

  // Network patterns
  if (message.includes('econnrefused') || message.includes('econnreset') || message.includes('network')) {
    return CrawlErrorCode.NETWORK_ERROR;
  }

  // DNS patterns
  if (message.includes('enotfound') || message.includes('dns') || message.includes('getaddrinfo')) {
    return CrawlErrorCode.DNS_ERROR;
  }

  // Auth patterns
  if (message.includes('auth') || message.includes('unauthorized') || message.includes('forbidden') || message.includes('401') || message.includes('403')) {
    return CrawlErrorCode.AUTH_FAILED;
  }

  // HTML change patterns
  if (message.includes('selector') || message.includes('element not found') || message.includes('structure changed')) {
    return CrawlErrorCode.HTML_CHANGED;
  }

  // Parse patterns
  if (message.includes('parse') || message.includes('json') || message.includes('syntax')) {
    return CrawlErrorCode.PARSE_ERROR;
  }

  // No products patterns
  if (message.includes('no products') || message.includes('empty') || message.includes('0 products')) {
    return CrawlErrorCode.NO_PRODUCTS;
  }

  // Server error patterns
  if (message.includes('500') || message.includes('502') || message.includes('503') || message.includes('504')) {
    return CrawlErrorCode.SERVER_ERROR;
  }

  // Config patterns
  if (message.includes('config') || message.includes('invalid') || message.includes('missing')) {
    if (message.includes('platform') || message.includes('dispensary_id')) {
      return CrawlErrorCode.MISSING_PLATFORM_ID;
    }
    return CrawlErrorCode.INVALID_CONFIG;
  }

  return CrawlErrorCode.UNKNOWN_ERROR;
}

/**
 * Get metadata for an error code
 */
export function getErrorMetadata(code: CrawlErrorCodeType): ErrorMetadata {
  return ERROR_METADATA[code] || ERROR_METADATA[CrawlErrorCode.UNKNOWN_ERROR];
}

/**
 * Check if an error is retryable
 */
export function isRetryable(code: CrawlErrorCodeType): boolean {
  return getErrorMetadata(code).retryable;
}

/**
 * Check if proxy should be rotated for this error
 */
export function shouldRotateProxy(code: CrawlErrorCodeType): boolean {
  return getErrorMetadata(code).rotateProxy;
}

/**
 * Check if user agent should be rotated for this error
 */
export function shouldRotateUserAgent(code: CrawlErrorCodeType): boolean {
  return getErrorMetadata(code).rotateUserAgent;
}

/**
 * Get backoff multiplier for this error
 */
export function getBackoffMultiplier(code: CrawlErrorCodeType): number {
  return getErrorMetadata(code).backoffMultiplier;
}

// ============================================================
// CRAWL RESULT TYPE
// ============================================================

/**
 * Standardized crawl result with error taxonomy
 */
export interface CrawlResult {
  success: boolean;
  dispensaryId: number;

  // Error info
  errorCode: CrawlErrorCodeType;
  errorMessage?: string;
  httpStatus?: number;

  // Timing
  startedAt: Date;
  finishedAt: Date;
  durationMs: number;

  // Context
  attemptNumber: number;
  proxyUsed?: string;
  userAgentUsed?: string;

  // Metrics (on success)
  productsFound?: number;
  productsUpserted?: number;
  snapshotsCreated?: number;
  imagesDownloaded?: number;

  // Metadata
  metadata?: Record<string, any>;
}

/**
 * Create a success result
 */
export function createSuccessResult(
  dispensaryId: number,
  startedAt: Date,
  metrics: {
    productsFound: number;
    productsUpserted: number;
    snapshotsCreated: number;
    imagesDownloaded?: number;
  },
  context?: {
    attemptNumber?: number;
    proxyUsed?: string;
    userAgentUsed?: string;
  }
): CrawlResult {
  const finishedAt = new Date();
  return {
    success: true,
    dispensaryId,
    errorCode: CrawlErrorCode.SUCCESS,
    startedAt,
    finishedAt,
    durationMs: finishedAt.getTime() - startedAt.getTime(),
    attemptNumber: context?.attemptNumber || 1,
    proxyUsed: context?.proxyUsed,
    userAgentUsed: context?.userAgentUsed,
    ...metrics,
  };
}

/**
 * Create a failure result
 */
export function createFailureResult(
  dispensaryId: number,
  startedAt: Date,
  error: Error | string,
  httpStatus?: number,
  context?: {
    attemptNumber?: number;
    proxyUsed?: string;
    userAgentUsed?: string;
  }
): CrawlResult {
  const finishedAt = new Date();
  const errorCode = classifyError(error, httpStatus);
  const errorMessage = typeof error === 'string' ? error : error.message;

  return {
    success: false,
    dispensaryId,
    errorCode,
    errorMessage,
    httpStatus,
    startedAt,
    finishedAt,
    durationMs: finishedAt.getTime() - startedAt.getTime(),
    attemptNumber: context?.attemptNumber || 1,
    proxyUsed: context?.proxyUsed,
    userAgentUsed: context?.userAgentUsed,
  };
}

// ============================================================
// LOGGING HELPERS
// ============================================================

/**
 * Format error code for logging
 */
export function formatErrorForLog(result: CrawlResult): string {
  const metadata = getErrorMetadata(result.errorCode);
  const retryInfo = metadata.retryable ? '(retryable)' : '(non-retryable)';
  const proxyInfo = result.proxyUsed ? ` via ${result.proxyUsed}` : '';

  if (result.success) {
    return `[${result.errorCode}] Crawl successful: ${result.productsFound} products${proxyInfo}`;
  }

  return `[${result.errorCode}] ${result.errorMessage}${proxyInfo} ${retryInfo}`;
}

/**
 * Get user-friendly error description
 */
export function getErrorDescription(code: CrawlErrorCodeType): string {
  return getErrorMetadata(code).description;
}
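Note: a sketch of how the taxonomy above is meant to drive retry decisions. `fetchMenu` and `rotateProxy` are hypothetical stand-ins for the real crawler plumbing, not functions from this diff:

```ts
import {
  classifyError,
  isRetryable,
  getBackoffMultiplier,
  shouldRotateProxy,
} from './error-taxonomy';

// Placeholders for the real crawler plumbing (assumptions for this sketch).
declare function fetchMenu(dispensaryId: number): Promise<void>;
declare function rotateProxy(): void;

// Classify each failure, stop on non-retryable codes, rotate the proxy when
// the metadata says to, and scale the delay by the code's backoff multiplier.
async function crawlWithRetry(dispensaryId: number, maxAttempts = 3): Promise<void> {
  let delayMs = 1000;
  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      await fetchMenu(dispensaryId);
      return;
    } catch (err: any) {
      const code = classifyError(err, err?.status);
      if (!isRetryable(code) || attempt === maxAttempts) throw err;
      if (shouldRotateProxy(code)) rotateProxy();
      delayMs *= getBackoffMultiplier(code) || 1;
      await new Promise(resolve => setTimeout(resolve, delayMs));
    }
  }
}
```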
@@ -16,12 +16,8 @@ import { extractCNameFromMenuUrl, extractFromMenuUrl, mapDbRowToDispensary } fro
|
|||||||
import { resolveDispensaryId } from './graphql-client';
|
import { resolveDispensaryId } from './graphql-client';
|
||||||
import { Dispensary, JobStatus } from '../types';
|
import { Dispensary, JobStatus } from '../types';
|
||||||
|
|
||||||
// Explicit column list for dispensaries table (avoids SELECT * issues with schema differences)
|
// Use shared dispensary columns (handles optional columns like provider_detection_data)
|
||||||
const DISPENSARY_COLUMNS = `
|
import { DISPENSARY_COLUMNS } from '../db/dispensary-columns';
|
||||||
id, name, slug, city, state, zip, address, latitude, longitude,
|
|
||||||
menu_type, menu_url, platform_dispensary_id, website,
|
|
||||||
provider_detection_data, created_at, updated_at
|
|
||||||
`;
|
|
||||||
|
|
||||||
// ============================================================
|
// ============================================================
|
||||||
// TYPES
|
// TYPES
|
||||||
@@ -647,6 +643,9 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
|
|||||||
`
|
`
|
||||||
UPDATE dispensaries SET
|
UPDATE dispensaries SET
|
||||||
menu_type = 'dutchie',
|
menu_type = 'dutchie',
|
||||||
|
last_id_resolution_at = NOW(),
|
||||||
|
id_resolution_attempts = COALESCE(id_resolution_attempts, 0) + 1,
|
||||||
|
id_resolution_error = $1,
|
||||||
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
|
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
|
||||||
jsonb_build_object(
|
jsonb_build_object(
|
||||||
'detected_provider', 'dutchie'::text,
|
'detected_provider', 'dutchie'::text,
|
||||||
@@ -660,7 +659,7 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
|
|||||||
`,
|
`,
|
||||||
[result.error, dispensaryId]
|
[result.error, dispensaryId]
|
||||||
);
|
);
|
||||||
console.log(`[MenuDetection] ${dispensary.name}: ${result.error}`);
|
console.log(`[Henry - Entry Point Finder] ${dispensary.name}: ${result.error}`);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -675,6 +674,9 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
|
|||||||
UPDATE dispensaries SET
|
UPDATE dispensaries SET
|
||||||
menu_type = 'dutchie',
|
menu_type = 'dutchie',
|
||||||
platform_dispensary_id = $1,
|
platform_dispensary_id = $1,
|
||||||
|
last_id_resolution_at = NOW(),
|
||||||
|
id_resolution_attempts = COALESCE(id_resolution_attempts, 0) + 1,
|
||||||
|
id_resolution_error = NULL,
|
||||||
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
|
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
|
||||||
jsonb_build_object(
|
jsonb_build_object(
|
||||||
'detected_provider', 'dutchie'::text,
|
'detected_provider', 'dutchie'::text,
|
||||||
@@ -691,7 +693,7 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
|
|||||||
`,
|
`,
|
||||||
[platformId, dispensaryId]
|
[platformId, dispensaryId]
|
||||||
);
|
);
|
||||||
console.log(`[MenuDetection] ${dispensary.name}: Platform ID extracted directly from URL = ${platformId}`);
|
console.log(`[Henry - Entry Point Finder] ${dispensary.name}: Platform ID extracted directly from URL = ${platformId}`);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -714,6 +716,9 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
|
|||||||
UPDATE dispensaries SET
|
UPDATE dispensaries SET
|
||||||
menu_type = 'dutchie',
|
menu_type = 'dutchie',
|
||||||
platform_dispensary_id = $1,
|
platform_dispensary_id = $1,
|
||||||
|
last_id_resolution_at = NOW(),
|
||||||
|
id_resolution_attempts = COALESCE(id_resolution_attempts, 0) + 1,
|
||||||
|
id_resolution_error = NULL,
|
||||||
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
|
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
|
||||||
jsonb_build_object(
|
jsonb_build_object(
|
||||||
'detected_provider', 'dutchie'::text,
|
'detected_provider', 'dutchie'::text,
|
||||||
@@ -730,10 +735,10 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
|
|||||||
`,
|
`,
|
||||||
[platformId, cName, dispensaryId]
|
[platformId, cName, dispensaryId]
|
||||||
);
|
);
|
||||||
console.log(`[MenuDetection] ${dispensary.name}: Resolved platform ID = ${platformId}`);
|
console.log(`[Henry - Entry Point Finder] ${dispensary.name}: Resolved platform ID = ${platformId}`);
|
||||||
} else {
|
} else {
|
||||||
// cName resolution failed - try crawling website as fallback
|
// cName resolution failed - try crawling website as fallback
|
||||||
console.log(`[MenuDetection] ${dispensary.name}: cName "${cName}" not found on Dutchie, trying website crawl fallback...`);
|
console.log(`[Henry - Entry Point Finder] ${dispensary.name}: cName "${cName}" not found on Dutchie, trying website crawl fallback...`);
|
||||||
|
|
||||||
if (website && website.trim() !== '') {
|
if (website && website.trim() !== '') {
|
||||||
const fallbackCrawl = await crawlWebsiteForMenuLinks(website);
|
const fallbackCrawl = await crawlWebsiteForMenuLinks(website);
|
||||||
@@ -796,6 +801,9 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
|
|||||||
UPDATE dispensaries SET
|
UPDATE dispensaries SET
|
||||||
menu_type = 'dutchie',
|
menu_type = 'dutchie',
|
||||||
platform_dispensary_id = NULL,
|
platform_dispensary_id = NULL,
|
||||||
|
last_id_resolution_at = NOW(),
|
||||||
|
id_resolution_attempts = COALESCE(id_resolution_attempts, 0) + 1,
|
||||||
|
id_resolution_error = $2,
|
||||||
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
|
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
|
||||||
jsonb_build_object(
|
jsonb_build_object(
|
||||||
'detected_provider', 'dutchie'::text,
|
'detected_provider', 'dutchie'::text,
|
||||||
@@ -812,7 +820,7 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
|
|||||||
`,
|
`,
|
||||||
[cName, result.error, dispensaryId]
|
[cName, result.error, dispensaryId]
|
||||||
);
|
);
|
||||||
console.log(`[MenuDetection] ${dispensary.name}: ${result.error}`);
|
console.log(`[Henry - Entry Point Finder] ${dispensary.name}: ${result.error}`);
|
||||||
}
|
}
|
||||||
} catch (error: any) {
|
} catch (error: any) {
|
||||||
result.error = `Resolution failed: ${error.message}`;
|
result.error = `Resolution failed: ${error.message}`;
|
||||||
@@ -820,6 +828,9 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
|
|||||||
`
|
`
|
||||||
UPDATE dispensaries SET
|
UPDATE dispensaries SET
|
||||||
menu_type = 'dutchie',
|
menu_type = 'dutchie',
|
||||||
|
last_id_resolution_at = NOW(),
|
||||||
|
id_resolution_attempts = COALESCE(id_resolution_attempts, 0) + 1,
|
||||||
|
id_resolution_error = $2,
|
||||||
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
|
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
|
||||||
jsonb_build_object(
|
jsonb_build_object(
|
||||||
'detected_provider', 'dutchie'::text,
|
'detected_provider', 'dutchie'::text,
|
||||||
@@ -835,7 +846,7 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
|
|||||||
`,
|
`,
|
||||||
[cName, result.error, dispensaryId]
|
[cName, result.error, dispensaryId]
|
||||||
);
|
);
|
||||||
console.error(`[MenuDetection] ${dispensary.name}: ${result.error}`);
|
console.error(`[Henry - Entry Point Finder] ${dispensary.name}: ${result.error}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
@@ -844,6 +855,11 @@ export async function detectAndResolveDispensary(dispensaryId: number): Promise<
|
|||||||
/**
|
/**
|
||||||
* Run bulk detection on all dispensaries with unknown/missing menu_type or platform_dispensary_id
|
* Run bulk detection on all dispensaries with unknown/missing menu_type or platform_dispensary_id
|
||||||
* Also includes dispensaries with no menu_url but with a website (for website crawl discovery)
|
* Also includes dispensaries with no menu_url but with a website (for website crawl discovery)
|
||||||
|
*
|
||||||
|
* Enhanced for Henry (Entry Point Finder) to also process:
|
||||||
|
* - Stores with slug changes that need re-resolution
|
||||||
|
* - Recently added stores from Alice's discovery
|
||||||
|
* - Stores that failed resolution and need retry
|
||||||
*/
|
*/
|
||||||
export async function runBulkDetection(options: {
|
export async function runBulkDetection(options: {
|
||||||
state?: string;
|
state?: string;
|
||||||
@@ -851,6 +867,9 @@ export async function runBulkDetection(options: {
|
|||||||
onlyMissingPlatformId?: boolean;
|
onlyMissingPlatformId?: boolean;
|
||||||
includeWebsiteCrawl?: boolean; // Include dispensaries with website but no menu_url
|
includeWebsiteCrawl?: boolean; // Include dispensaries with website but no menu_url
|
||||||
includeDutchieMissingPlatformId?: boolean; // include menu_type='dutchie' with null platform_id
|
includeDutchieMissingPlatformId?: boolean; // include menu_type='dutchie' with null platform_id
|
||||||
|
includeSlugChanges?: boolean; // Include stores where Alice detected slug changes
|
||||||
|
includeRecentlyAdded?: boolean; // Include stores recently added by Alice
|
||||||
|
scope?: { states?: string[]; storeIds?: number[] }; // Scope filtering for sharding
|
||||||
limit?: number;
|
limit?: number;
|
||||||
} = {}): Promise<BulkDetectionResult> {
|
} = {}): Promise<BulkDetectionResult> {
|
||||||
const {
|
const {
|
||||||
@@ -859,14 +878,23 @@ export async function runBulkDetection(options: {
|
|||||||
onlyMissingPlatformId = false,
|
onlyMissingPlatformId = false,
|
||||||
includeWebsiteCrawl = true,
|
includeWebsiteCrawl = true,
|
||||||
includeDutchieMissingPlatformId = true,
|
includeDutchieMissingPlatformId = true,
|
||||||
|
includeSlugChanges = true,
|
||||||
|
includeRecentlyAdded = true,
|
||||||
|
scope,
|
||||||
limit,
|
limit,
|
||||||
} = options;
|
} = options;
|
||||||
|
|
||||||
console.log('[MenuDetection] Starting bulk detection...');
|
const scopeDesc = scope?.states?.length
|
||||||
|
? ` (states: ${scope.states.join(', ')})`
|
||||||
|
: scope?.storeIds?.length
|
||||||
|
? ` (${scope.storeIds.length} specific stores)`
|
||||||
|
: state ? ` (state: ${state})` : '';
|
||||||
|
|
||||||
|
console.log(`[Henry - Entry Point Finder] Starting bulk detection${scopeDesc}...`);
|
||||||
|
|
||||||
// Build query to find dispensaries needing detection
|
// Build query to find dispensaries needing detection
|
||||||
// Includes: dispensaries with menu_url OR (no menu_url but has website and not already marked not_crawlable)
|
// Includes: dispensaries with menu_url OR (no menu_url but has website and not already marked not_crawlable)
|
||||||
// Optionally includes dutchie stores missing platform ID
|
// Optionally includes dutchie stores missing platform ID, slug changes, and recently added stores
|
||||||
let whereClause = `WHERE (
|
let whereClause = `WHERE (
|
||||||
menu_url IS NOT NULL
|
menu_url IS NOT NULL
|
||||||
${includeWebsiteCrawl ? `OR (
|
${includeWebsiteCrawl ? `OR (
|
||||||
@@ -882,7 +910,14 @@ export async function runBulkDetection(options: {
|
|||||||
const params: any[] = [];
|
const params: any[] = [];
|
||||||
let paramIndex = 1;
|
let paramIndex = 1;
|
||||||
|
|
||||||
if (state) {
|
// Apply scope filtering (takes precedence over single state filter)
|
||||||
|
if (scope?.storeIds?.length) {
|
||||||
|
whereClause += ` AND id = ANY($${paramIndex++})`;
|
||||||
|
params.push(scope.storeIds);
|
||||||
|
} else if (scope?.states?.length) {
|
||||||
|
whereClause += ` AND state = ANY($${paramIndex++})`;
|
||||||
|
params.push(scope.states);
|
||||||
|
} else if (state) {
|
||||||
       whereClause += ` AND state = $${paramIndex++}`;
       params.push(state);
     }
@@ -962,6 +997,19 @@ export async function runBulkDetection(options: {
 /**
  * Execute the menu detection job (called by scheduler)
+ *
+ * Worker: Henry (Entry Point Finder)
+ * Uses METHOD 1 (reactEnv extraction) as primary method per user requirements.
+ *
+ * Scope filtering:
+ * - config.scope.states: Array of state codes to limit detection (e.g., ["AZ", "CA"])
+ * - config.scope.storeIds: Array of specific store IDs to process
+ *
+ * Processes:
+ * - Stores with unknown/missing menu_type
+ * - Stores with missing platform_dispensary_id
+ * - Stores with slug changes that need re-resolution (from Alice)
+ * - Recently added stores (discovered by Alice)
  */
 export async function executeMenuDetectionJob(config: Record<string, any> = {}): Promise<{
   status: JobStatus;
@@ -972,19 +1020,31 @@ export async function executeMenuDetectionJob(config: Record<string, any> = {}):
   metadata?: any;
 }> {
   const state = config.state || 'AZ';
+  const scope = config.scope as { states?: string[]; storeIds?: number[] } | undefined;
   const onlyUnknown = config.onlyUnknown !== false;
   // Default to true - always try to resolve platform IDs for dutchie stores
   const onlyMissingPlatformId = config.onlyMissingPlatformId !== false;
   const includeDutchieMissingPlatformId = config.includeDutchieMissingPlatformId !== false;
+  const includeSlugChanges = config.includeSlugChanges !== false;
+  const includeRecentlyAdded = config.includeRecentlyAdded !== false;

-  console.log(`[MenuDetection] Executing scheduled job for state=${state}...`);
+  const scopeDesc = scope?.states?.length
+    ? ` (states: ${scope.states.join(', ')})`
+    : scope?.storeIds?.length
+      ? ` (${scope.storeIds.length} specific stores)`
+      : ` (state: ${state})`;
+
+  console.log(`[Henry - Entry Point Finder] Executing scheduled job${scopeDesc}...`);
+
   try {
     const result = await runBulkDetection({
-      state,
+      state: scope ? undefined : state, // Use scope if provided, otherwise fall back to state
+      scope,
       onlyUnknown,
       onlyMissingPlatformId,
       includeDutchieMissingPlatformId,
+      includeSlugChanges,
+      includeRecentlyAdded,
     });

     const status: JobStatus =
@@ -998,9 +1058,11 @@ export async function executeMenuDetectionJob(config: Record<string, any> = {}):
       itemsFailed: result.totalFailed,
       errorMessage: result.errors.length > 0 ? result.errors.slice(0, 5).join('; ') : undefined,
       metadata: {
-        state,
+        scope: scope || { states: [state] },
         onlyUnknown,
         onlyMissingPlatformId,
+        includeSlugChanges,
+        includeRecentlyAdded,
         providerCounts: countByProvider(result.results),
       },
     };
@@ -1011,6 +1073,7 @@ export async function executeMenuDetectionJob(config: Record<string, any> = {}):
       itemsSucceeded: 0,
       itemsFailed: 0,
       errorMessage: error.message,
+      metadata: { scope: scope || { states: [state] } },
     };
   }
 }
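For reference, a minimal sketch of a scope-limited invocation of the job above (the wrapper function and state codes are illustrative; the config keys and result shape come from this diff):

import { executeMenuDetectionJob } from './menu-detection';

// Hypothetical helper: run Henry against two specific states.
async function runHenryForSouthwest(): Promise<void> {
  const result = await executeMenuDetectionJob({
    scope: { states: ['AZ', 'NM'] }, // limit detection to these states
    onlyUnknown: true,               // skip stores whose menu_type is already known
  });
  console.log(`Henry finished: ${result.status}`, result.metadata?.providerCounts);
}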
backend/src/dutchie-az/services/proxy-rotator.ts (new file, 455 lines)
@@ -0,0 +1,455 @@
/**
 * Proxy & User Agent Rotator
 *
 * Manages rotation of proxies and user agents to avoid blocks.
 * Integrates with error taxonomy for intelligent rotation decisions.
 *
 * Phase 1: Crawler Reliability & Stabilization
 */

import { Pool } from 'pg';

// ============================================================
// USER AGENT CONFIGURATION
// ============================================================

/**
 * Modern browser user agents (Chrome, Firefox, Safari, Edge on various platforms)
 * Updated: 2024
 */
export const USER_AGENTS = [
  // Chrome on Windows
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',

  // Chrome on macOS
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',

  // Firefox on Windows
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0',

  // Firefox on macOS
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0',

  // Safari on macOS
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',

  // Edge on Windows
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',

  // Chrome on Linux
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
];

// ============================================================
// PROXY TYPES
// ============================================================

export interface Proxy {
  id: number;
  host: string;
  port: number;
  username?: string;
  password?: string;
  protocol: 'http' | 'https' | 'socks5';
  isActive: boolean;
  lastUsedAt: Date | null;
  failureCount: number;
  successCount: number;
  avgResponseTimeMs: number | null;
}

export interface ProxyStats {
  totalProxies: number;
  activeProxies: number;
  blockedProxies: number;
  avgSuccessRate: number;
}

// ============================================================
// PROXY ROTATOR CLASS
// ============================================================

export class ProxyRotator {
  private pool: Pool | null = null;
  private proxies: Proxy[] = [];
  private currentIndex: number = 0;
  private lastRotation: Date = new Date();

  constructor(pool?: Pool) {
    this.pool = pool || null;
  }

  /**
   * Initialize with database pool
   */
  setPool(pool: Pool): void {
    this.pool = pool;
  }

  /**
   * Load proxies from database
   */
  async loadProxies(): Promise<void> {
    if (!this.pool) {
      console.warn('[ProxyRotator] No database pool configured');
      return;
    }

    try {
      const result = await this.pool.query<Proxy>(`
        SELECT
          id,
          host,
          port,
          username,
          password,
          protocol,
          is_active as "isActive",
          last_used_at as "lastUsedAt",
          failure_count as "failureCount",
          success_count as "successCount",
          avg_response_time_ms as "avgResponseTimeMs"
        FROM proxies
        WHERE is_active = true
        ORDER BY failure_count ASC, last_used_at ASC NULLS FIRST
      `);

      this.proxies = result.rows;
      console.log(`[ProxyRotator] Loaded ${this.proxies.length} active proxies`);
    } catch (error) {
      // Table might not exist - that's okay
      console.warn(`[ProxyRotator] Could not load proxies: ${error}`);
      this.proxies = [];
    }
  }

  /**
   * Get next proxy in rotation
   */
  getNext(): Proxy | null {
    if (this.proxies.length === 0) return null;

    // Round-robin rotation
    this.currentIndex = (this.currentIndex + 1) % this.proxies.length;
    this.lastRotation = new Date();

    return this.proxies[this.currentIndex];
  }

  /**
   * Get current proxy without rotating
   */
  getCurrent(): Proxy | null {
    if (this.proxies.length === 0) return null;
    return this.proxies[this.currentIndex];
  }

  /**
   * Get proxy by ID
   */
  getById(id: number): Proxy | null {
    return this.proxies.find(p => p.id === id) || null;
  }

  /**
   * Rotate to a specific proxy
   */
  setProxy(id: number): boolean {
    const index = this.proxies.findIndex(p => p.id === id);
    if (index === -1) return false;

    this.currentIndex = index;
    this.lastRotation = new Date();
    return true;
  }

  /**
   * Mark proxy as failed (temporarily remove from rotation)
   */
  async markFailed(proxyId: number, error?: string): Promise<void> {
    // Update in-memory
    const proxy = this.proxies.find(p => p.id === proxyId);
    if (proxy) {
      proxy.failureCount++;

      // Deactivate if too many failures
      if (proxy.failureCount >= 5) {
        proxy.isActive = false;
        this.proxies = this.proxies.filter(p => p.id !== proxyId);
        console.log(`[ProxyRotator] Proxy ${proxyId} deactivated after ${proxy.failureCount} failures`);
      }
    }

    // Update database
    if (this.pool) {
      try {
        await this.pool.query(`
          UPDATE proxies
          SET
            failure_count = failure_count + 1,
            last_failure_at = NOW(),
            last_error = $2,
            is_active = CASE WHEN failure_count >= 4 THEN false ELSE is_active END
          WHERE id = $1
        `, [proxyId, error || null]);
      } catch (err) {
        console.error(`[ProxyRotator] Failed to update proxy ${proxyId}:`, err);
      }
    }
  }

  /**
   * Mark proxy as successful
   */
  async markSuccess(proxyId: number, responseTimeMs?: number): Promise<void> {
    // Update in-memory
    const proxy = this.proxies.find(p => p.id === proxyId);
    if (proxy) {
      proxy.successCount++;
      proxy.lastUsedAt = new Date();
      if (responseTimeMs !== undefined) {
        // Rolling average
        proxy.avgResponseTimeMs = proxy.avgResponseTimeMs
          ? (proxy.avgResponseTimeMs * 0.8) + (responseTimeMs * 0.2)
          : responseTimeMs;
      }
    }

    // Update database
    if (this.pool) {
      try {
        await this.pool.query(`
          UPDATE proxies
          SET
            success_count = success_count + 1,
            last_used_at = NOW(),
            avg_response_time_ms = CASE
              WHEN avg_response_time_ms IS NULL THEN $2
              ELSE (avg_response_time_ms * 0.8) + ($2 * 0.2)
            END
          WHERE id = $1
        `, [proxyId, responseTimeMs || null]);
      } catch (err) {
        console.error(`[ProxyRotator] Failed to update proxy ${proxyId}:`, err);
      }
    }
  }

  /**
   * Get proxy URL for HTTP client
   */
  getProxyUrl(proxy: Proxy): string {
    const auth = proxy.username && proxy.password
      ? `${proxy.username}:${proxy.password}@`
      : '';
    return `${proxy.protocol}://${auth}${proxy.host}:${proxy.port}`;
  }

  /**
   * Get stats about proxy pool
   */
  getStats(): ProxyStats {
    const totalProxies = this.proxies.length;
    const activeProxies = this.proxies.filter(p => p.isActive).length;
    const blockedProxies = this.proxies.filter(p => p.failureCount >= 5).length;

    const successRates = this.proxies
      .filter(p => p.successCount + p.failureCount > 0)
      .map(p => p.successCount / (p.successCount + p.failureCount));

    const avgSuccessRate = successRates.length > 0
      ? successRates.reduce((a, b) => a + b, 0) / successRates.length
      : 0;

    return {
      totalProxies,
      activeProxies,
      blockedProxies,
      avgSuccessRate,
    };
  }

  /**
   * Check if proxy pool has available proxies
   */
  hasAvailableProxies(): boolean {
    return this.proxies.length > 0;
  }
}

// ============================================================
// USER AGENT ROTATOR CLASS
// ============================================================

export class UserAgentRotator {
  private userAgents: string[];
  private currentIndex: number = 0;
  private lastRotation: Date = new Date();

  constructor(userAgents: string[] = USER_AGENTS) {
    this.userAgents = userAgents;
    // Start at random index to avoid patterns
    this.currentIndex = Math.floor(Math.random() * userAgents.length);
  }

  /**
   * Get next user agent in rotation
   */
  getNext(): string {
    this.currentIndex = (this.currentIndex + 1) % this.userAgents.length;
    this.lastRotation = new Date();
    return this.userAgents[this.currentIndex];
  }

  /**
   * Get current user agent without rotating
   */
  getCurrent(): string {
    return this.userAgents[this.currentIndex];
  }

  /**
   * Get a random user agent
   */
  getRandom(): string {
    const index = Math.floor(Math.random() * this.userAgents.length);
    return this.userAgents[index];
  }

  /**
   * Get total available user agents
   */
  getCount(): number {
    return this.userAgents.length;
  }
}

// ============================================================
// COMBINED ROTATOR (for convenience)
// ============================================================

export class CrawlRotator {
  public proxy: ProxyRotator;
  public userAgent: UserAgentRotator;

  constructor(pool?: Pool) {
    this.proxy = new ProxyRotator(pool);
    this.userAgent = new UserAgentRotator();
  }

  /**
   * Initialize rotator (load proxies from DB)
   */
  async initialize(): Promise<void> {
    await this.proxy.loadProxies();
  }

  /**
   * Rotate proxy only
   */
  rotateProxy(): Proxy | null {
    return this.proxy.getNext();
  }

  /**
   * Rotate user agent only
   */
  rotateUserAgent(): string {
    return this.userAgent.getNext();
  }

  /**
   * Rotate both proxy and user agent
   */
  rotateBoth(): { proxy: Proxy | null; userAgent: string } {
    return {
      proxy: this.proxy.getNext(),
      userAgent: this.userAgent.getNext(),
    };
  }

  /**
   * Get current proxy and user agent without rotating
   */
  getCurrent(): { proxy: Proxy | null; userAgent: string } {
    return {
      proxy: this.proxy.getCurrent(),
      userAgent: this.userAgent.getCurrent(),
    };
  }

  /**
   * Record success for current proxy
   */
  async recordSuccess(responseTimeMs?: number): Promise<void> {
    const current = this.proxy.getCurrent();
    if (current) {
      await this.proxy.markSuccess(current.id, responseTimeMs);
    }
  }

  /**
   * Record failure for current proxy
   */
  async recordFailure(error?: string): Promise<void> {
    const current = this.proxy.getCurrent();
    if (current) {
      await this.proxy.markFailed(current.id, error);
    }
  }
}

// ============================================================
// DATABASE OPERATIONS
// ============================================================

/**
 * Update dispensary's current proxy and user agent
 */
export async function updateDispensaryRotation(
  pool: Pool,
  dispensaryId: number,
  proxyId: number | null,
  userAgent: string | null
): Promise<void> {
  await pool.query(`
    UPDATE dispensaries
    SET
      current_proxy_id = $2,
      current_user_agent = $3
    WHERE id = $1
  `, [dispensaryId, proxyId, userAgent]);
}

/**
 * Get dispensary's current proxy and user agent
 */
export async function getDispensaryRotation(
  pool: Pool,
  dispensaryId: number
): Promise<{ proxyId: number | null; userAgent: string | null }> {
  const result = await pool.query(`
    SELECT current_proxy_id as "proxyId", current_user_agent as "userAgent"
    FROM dispensaries
    WHERE id = $1
  `, [dispensaryId]);

  if (result.rows.length === 0) {
    return { proxyId: null, userAgent: null };
  }

  return result.rows[0];
}

// ============================================================
// SINGLETON INSTANCES
// ============================================================

export const proxyRotator = new ProxyRotator();
export const userAgentRotator = new UserAgentRotator();
export const crawlRotator = new CrawlRotator();
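A minimal usage sketch for the rotator (assumes Node 18+ for the global fetch; the URL handling is illustrative, while the rotator API is the one defined above):

import { Pool } from 'pg';
import { CrawlRotator } from './proxy-rotator';

async function fetchWithRotation(url: string, pool: Pool): Promise<string> {
  const rotator = new CrawlRotator(pool);
  await rotator.initialize(); // loads active proxies from the proxies table

  const { proxy, userAgent } = rotator.rotateBoth();
  if (proxy) {
    // A real crawler would route the request through an HTTP agent built
    // from rotator.proxy.getProxyUrl(proxy); omitted here for brevity.
    console.log(`[example] would use proxy ${proxy.host}:${proxy.port}`);
  }

  const started = Date.now();
  try {
    const res = await fetch(url, { headers: { 'User-Agent': userAgent } });
    await rotator.recordSuccess(Date.now() - started);
    return await res.text();
  } catch (err) {
    await rotator.recordFailure(String(err));
    throw err;
  }
}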
435
backend/src/dutchie-az/services/retry-manager.ts
Normal file
435
backend/src/dutchie-az/services/retry-manager.ts
Normal file
@@ -0,0 +1,435 @@
/**
 * Unified Retry Manager
 *
 * Handles retry logic with exponential backoff, jitter, and
 * intelligent error-based decisions (rotate proxy, rotate UA, etc.)
 *
 * Phase 1: Crawler Reliability & Stabilization
 */

import {
  CrawlErrorCodeType,
  CrawlErrorCode,
  classifyError,
  getErrorMetadata,
  isRetryable,
  shouldRotateProxy,
  shouldRotateUserAgent,
  getBackoffMultiplier,
} from './error-taxonomy';
import { DEFAULT_CONFIG } from './store-validator';

// ============================================================
// RETRY CONFIGURATION
// ============================================================

export interface RetryConfig {
  maxRetries: number;
  baseBackoffMs: number;
  maxBackoffMs: number;
  backoffMultiplier: number;
  jitterFactor: number; // 0.0 - 1.0 (percentage of backoff to randomize)
}

export const DEFAULT_RETRY_CONFIG: RetryConfig = {
  maxRetries: DEFAULT_CONFIG.maxRetries,
  baseBackoffMs: DEFAULT_CONFIG.baseBackoffMs,
  maxBackoffMs: DEFAULT_CONFIG.maxBackoffMs,
  backoffMultiplier: DEFAULT_CONFIG.backoffMultiplier,
  jitterFactor: 0.25, // +/- 25% jitter
};

// ============================================================
// RETRY CONTEXT
// ============================================================

/**
 * Context for tracking retry state across attempts
 */
export interface RetryContext {
  attemptNumber: number;
  maxAttempts: number;
  lastErrorCode: CrawlErrorCodeType | null;
  lastHttpStatus: number | null;
  totalBackoffMs: number;
  proxyRotated: boolean;
  userAgentRotated: boolean;
  startedAt: Date;
}

/**
 * Decision about what to do after an error
 */
export interface RetryDecision {
  shouldRetry: boolean;
  reason: string;
  backoffMs: number;
  rotateProxy: boolean;
  rotateUserAgent: boolean;
  errorCode: CrawlErrorCodeType;
  attemptNumber: number;
}

// ============================================================
// RETRY MANAGER CLASS
// ============================================================

export class RetryManager {
  private config: RetryConfig;
  private context: RetryContext;

  constructor(config: Partial<RetryConfig> = {}) {
    this.config = { ...DEFAULT_RETRY_CONFIG, ...config };
    this.context = this.createInitialContext();
  }

  /**
   * Create initial retry context
   */
  private createInitialContext(): RetryContext {
    return {
      attemptNumber: 0,
      maxAttempts: this.config.maxRetries + 1, // +1 for initial attempt
      lastErrorCode: null,
      lastHttpStatus: null,
      totalBackoffMs: 0,
      proxyRotated: false,
      userAgentRotated: false,
      startedAt: new Date(),
    };
  }

  /**
   * Reset retry state for a new operation
   */
  reset(): void {
    this.context = this.createInitialContext();
  }

  /**
   * Get current attempt number (1-based)
   */
  getAttemptNumber(): number {
    return this.context.attemptNumber + 1;
  }

  /**
   * Check if we should attempt (call before each attempt)
   */
  shouldAttempt(): boolean {
    return this.context.attemptNumber < this.context.maxAttempts;
  }

  /**
   * Record an attempt (call at start of each attempt)
   */
  recordAttempt(): void {
    this.context.attemptNumber++;
  }

  /**
   * Evaluate an error and decide what to do
   */
  evaluateError(
    error: Error | string | null,
    httpStatus?: number
  ): RetryDecision {
    const errorCode = classifyError(error, httpStatus);
    const metadata = getErrorMetadata(errorCode);
    const attemptNumber = this.context.attemptNumber;

    // Update context
    this.context.lastErrorCode = errorCode;
    this.context.lastHttpStatus = httpStatus || null;

    // Check if error is retryable
    if (!isRetryable(errorCode)) {
      return {
        shouldRetry: false,
        reason: `Error ${errorCode} is not retryable: ${metadata.description}`,
        backoffMs: 0,
        rotateProxy: false,
        rotateUserAgent: false,
        errorCode,
        attemptNumber,
      };
    }

    // Check if we've exhausted retries
    if (!this.shouldAttempt()) {
      return {
        shouldRetry: false,
        reason: `Max retries (${this.config.maxRetries}) exhausted`,
        backoffMs: 0,
        rotateProxy: false,
        rotateUserAgent: false,
        errorCode,
        attemptNumber,
      };
    }

    // Calculate backoff with exponential increase and jitter
    const baseBackoff = this.calculateBackoff(attemptNumber, errorCode);
    const backoffWithJitter = this.addJitter(baseBackoff);

    // Track total backoff
    this.context.totalBackoffMs += backoffWithJitter;

    // Determine rotation needs
    const rotateProxy = shouldRotateProxy(errorCode);
    const rotateUserAgent = shouldRotateUserAgent(errorCode);

    if (rotateProxy) this.context.proxyRotated = true;
    if (rotateUserAgent) this.context.userAgentRotated = true;

    const rotationInfo = [];
    if (rotateProxy) rotationInfo.push('rotate proxy');
    if (rotateUserAgent) rotationInfo.push('rotate UA');
    const rotationStr = rotationInfo.length > 0 ? ` (${rotationInfo.join(', ')})` : '';

    return {
      shouldRetry: true,
      reason: `Retrying after ${errorCode}${rotationStr}, backoff ${backoffWithJitter}ms`,
      backoffMs: backoffWithJitter,
      rotateProxy,
      rotateUserAgent,
      errorCode,
      attemptNumber,
    };
  }

  /**
   * Calculate exponential backoff for an attempt
   */
  private calculateBackoff(attemptNumber: number, errorCode: CrawlErrorCodeType): number {
    // Base exponential: baseBackoff * multiplier^(attempt-1)
    const exponential = this.config.baseBackoffMs *
      Math.pow(this.config.backoffMultiplier, attemptNumber - 1);

    // Apply error-specific multiplier
    const errorMultiplier = getBackoffMultiplier(errorCode);
    const adjusted = exponential * errorMultiplier;

    // Cap at max backoff
    return Math.min(adjusted, this.config.maxBackoffMs);
  }

  /**
   * Add jitter to backoff to prevent thundering herd
   */
  private addJitter(backoffMs: number): number {
    const jitterRange = backoffMs * this.config.jitterFactor;
    // Random between -jitterRange and +jitterRange
    const jitter = (Math.random() * 2 - 1) * jitterRange;
    return Math.max(0, Math.round(backoffMs + jitter));
  }

  /**
   * Get retry context summary
   */
  getSummary(): RetryContextSummary {
    const elapsedMs = Date.now() - this.context.startedAt.getTime();
    return {
      attemptsMade: this.context.attemptNumber,
      maxAttempts: this.context.maxAttempts,
      lastErrorCode: this.context.lastErrorCode,
      lastHttpStatus: this.context.lastHttpStatus,
      totalBackoffMs: this.context.totalBackoffMs,
      totalElapsedMs: elapsedMs,
      proxyWasRotated: this.context.proxyRotated,
      userAgentWasRotated: this.context.userAgentRotated,
    };
  }
}

export interface RetryContextSummary {
  attemptsMade: number;
  maxAttempts: number;
  lastErrorCode: CrawlErrorCodeType | null;
  lastHttpStatus: number | null;
  totalBackoffMs: number;
  totalElapsedMs: number;
  proxyWasRotated: boolean;
  userAgentWasRotated: boolean;
}

// ============================================================
// CONVENIENCE FUNCTIONS
// ============================================================

/**
 * Sleep for specified milliseconds
 */
export function sleep(ms: number): Promise<void> {
  return new Promise(resolve => setTimeout(resolve, ms));
}

/**
 * Execute a function with automatic retry logic
 */
export async function withRetry<T>(
  fn: (attemptNumber: number) => Promise<T>,
  config: Partial<RetryConfig> = {},
  callbacks?: {
    onRetry?: (decision: RetryDecision) => void | Promise<void>;
    onRotateProxy?: () => void | Promise<void>;
    onRotateUserAgent?: () => void | Promise<void>;
  }
): Promise<{ result: T; summary: RetryContextSummary }> {
  const manager = new RetryManager(config);

  while (manager.shouldAttempt()) {
    manager.recordAttempt();
    const attemptNumber = manager.getAttemptNumber();

    try {
      const result = await fn(attemptNumber);
      return { result, summary: manager.getSummary() };
    } catch (error) {
      const err = error instanceof Error ? error : new Error(String(error));
      const httpStatus = (error as any)?.status || (error as any)?.statusCode;

      const decision = manager.evaluateError(err, httpStatus);

      if (!decision.shouldRetry) {
        // Re-throw with enhanced context
        const enhancedError = new RetryExhaustedError(
          `${err.message} (${decision.reason})`,
          err,
          manager.getSummary()
        );
        throw enhancedError;
      }

      // Notify callbacks
      if (callbacks?.onRetry) {
        await callbacks.onRetry(decision);
      }
      if (decision.rotateProxy && callbacks?.onRotateProxy) {
        await callbacks.onRotateProxy();
      }
      if (decision.rotateUserAgent && callbacks?.onRotateUserAgent) {
        await callbacks.onRotateUserAgent();
      }

      // Log retry decision
      console.log(
        `[RetryManager] Attempt ${attemptNumber} failed: ${decision.errorCode}. ` +
        `${decision.reason}. Waiting ${decision.backoffMs}ms before retry.`
      );

      // Wait before retry
      await sleep(decision.backoffMs);
    }
  }

  // Should not reach here, but handle edge case
  throw new RetryExhaustedError(
    'Max retries exhausted',
    null,
    manager.getSummary()
  );
}

// ============================================================
// CUSTOM ERROR CLASS
// ============================================================

export class RetryExhaustedError extends Error {
  public readonly originalError: Error | null;
  public readonly summary: RetryContextSummary;
  public readonly errorCode: CrawlErrorCodeType;

  constructor(
    message: string,
    originalError: Error | null,
    summary: RetryContextSummary
  ) {
    super(message);
    this.name = 'RetryExhaustedError';
    this.originalError = originalError;
    this.summary = summary;
    this.errorCode = summary.lastErrorCode || CrawlErrorCode.UNKNOWN_ERROR;
  }
}

// ============================================================
// BACKOFF CALCULATOR (for external use)
// ============================================================

/**
 * Calculate next crawl time based on consecutive failures
 */
export function calculateNextCrawlDelay(
  consecutiveFailures: number,
  baseFrequencyMinutes: number,
  maxBackoffMultiplier: number = 4.0
): number {
  // Each failure doubles the delay, up to max multiplier
  const multiplier = Math.min(
    Math.pow(2, consecutiveFailures),
    maxBackoffMultiplier
  );

  const delayMinutes = baseFrequencyMinutes * multiplier;

  // Add jitter (0-10% of delay)
  const jitterMinutes = delayMinutes * Math.random() * 0.1;

  return Math.round(delayMinutes + jitterMinutes);
}

/**
 * Calculate next crawl timestamp
 */
export function calculateNextCrawlAt(
  consecutiveFailures: number,
  baseFrequencyMinutes: number
): Date {
  const delayMinutes = calculateNextCrawlDelay(consecutiveFailures, baseFrequencyMinutes);
  return new Date(Date.now() + delayMinutes * 60 * 1000);
}

// ============================================================
// STATUS DETERMINATION
// ============================================================

/**
 * Determine crawl status based on failure count
 */
export function determineCrawlStatus(
  consecutiveFailures: number,
  thresholds: { degraded: number; failed: number } = { degraded: 3, failed: 10 }
): 'active' | 'degraded' | 'failed' {
  if (consecutiveFailures >= thresholds.failed) {
    return 'failed';
  }
  if (consecutiveFailures >= thresholds.degraded) {
    return 'degraded';
  }
  return 'active';
}

/**
 * Determine if store should be auto-recovered
 * (Called periodically to check if failed stores can be retried)
 */
export function shouldAttemptRecovery(
  lastFailureAt: Date | null,
  consecutiveFailures: number,
  recoveryIntervalHours: number = 24
): boolean {
  if (!lastFailureAt) return true;

  // Wait longer for more failures
  const waitHours = recoveryIntervalHours * Math.min(consecutiveFailures, 5);
  const recoveryTime = new Date(lastFailureAt.getTime() + waitHours * 60 * 60 * 1000);

  return new Date() >= recoveryTime;
}

// ============================================================
// SINGLETON INSTANCE
// ============================================================

export const retryManager = new RetryManager();
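A usage sketch tying the two services together (crawlStore is a hypothetical stand-in for a real crawl call; withRetry, its config keys, and the rotator come from the files above):

import { withRetry } from './retry-manager';
import { crawlRotator } from './proxy-rotator';

declare function crawlStore(storeId: number, attempt: number): Promise<unknown>;

async function crawlWithRetries(storeId: number): Promise<void> {
  const { result, summary } = await withRetry(
    (attempt) => crawlStore(storeId, attempt),
    { maxRetries: 3, baseBackoffMs: 1000 },
    {
      onRotateProxy: async () => { crawlRotator.rotateProxy(); },
      onRotateUserAgent: async () => { crawlRotator.rotateUserAgent(); },
    }
  );
  console.log(`Crawled store ${storeId} after ${summary.attemptsMade} attempt(s)`, result);
}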
@@ -11,12 +11,14 @@
  * Example: 4-hour base with ±30min jitter = runs anywhere from 3h30m to 4h30m apart
  */

-import { query, getClient } from '../db/connection';
+import { query, getClient, getPool } from '../db/connection';
 import { crawlDispensaryProducts, CrawlResult } from './product-crawler';
 import { mapDbRowToDispensary } from './discovery';
 import { executeMenuDetectionJob } from './menu-detection';
 import { bulkEnqueueJobs, enqueueJob, getQueueStats } from './job-queue';
 import { JobSchedule, JobStatus, Dispensary } from '../types';
+import { DtLocationDiscoveryService } from '../discovery/DtLocationDiscoveryService';
+import { StateQueryService } from '../../multi-state/state-query-service';

 // Scheduler poll interval (how often we check for due jobs)
 const SCHEDULER_POLL_INTERVAL_MS = 60 * 1000; // 1 minute
@@ -65,6 +67,7 @@ export async function getAllSchedules(): Promise<JobSchedule[]> {
     SELECT
       id, job_name, description, enabled,
       base_interval_minutes, jitter_minutes,
+      worker_name, worker_role,
       last_run_at, last_status, last_error_message, last_duration_ms,
       next_run_at, job_config, created_at, updated_at
     FROM job_schedules
@@ -78,6 +81,8 @@ export async function getAllSchedules(): Promise<JobSchedule[]> {
     enabled: row.enabled,
     baseIntervalMinutes: row.base_interval_minutes,
     jitterMinutes: row.jitter_minutes,
+    workerName: row.worker_name,
+    workerRole: row.worker_role,
     lastRunAt: row.last_run_at,
     lastStatus: row.last_status,
     lastErrorMessage: row.last_error_message,
@@ -108,6 +113,8 @@ export async function getScheduleById(id: number): Promise<JobSchedule | null> {
     enabled: row.enabled,
     baseIntervalMinutes: row.base_interval_minutes,
     jitterMinutes: row.jitter_minutes,
+    workerName: row.worker_name,
+    workerRole: row.worker_role,
     lastRunAt: row.last_run_at,
     lastStatus: row.last_status,
     lastErrorMessage: row.last_error_message,
@@ -128,6 +135,8 @@ export async function createSchedule(schedule: {
   enabled?: boolean;
   baseIntervalMinutes: number;
   jitterMinutes: number;
+  workerName?: string;
+  workerRole?: string;
   jobConfig?: Record<string, any>;
   startImmediately?: boolean;
 }): Promise<JobSchedule> {
@@ -141,8 +150,9 @@ export async function createSchedule(schedule: {
     INSERT INTO job_schedules (
       job_name, description, enabled,
       base_interval_minutes, jitter_minutes,
+      worker_name, worker_role,
       next_run_at, job_config
-    ) VALUES ($1, $2, $3, $4, $5, $6, $7)
+    ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
     RETURNING *
     `,
     [
@@ -151,13 +161,16 @@ export async function createSchedule(schedule: {
       schedule.enabled ?? true,
       schedule.baseIntervalMinutes,
       schedule.jitterMinutes,
+      schedule.workerName || null,
+      schedule.workerRole || null,
       nextRunAt,
       schedule.jobConfig ? JSON.stringify(schedule.jobConfig) : null,
     ]
   );

   const row = rows[0];
-  console.log(`[Scheduler] Created schedule "${schedule.jobName}" - next run at ${nextRunAt.toISOString()}`);
+  const workerInfo = schedule.workerName ? ` (Worker: ${schedule.workerName})` : '';
+  console.log(`[Scheduler] Created schedule "${schedule.jobName}"${workerInfo} - next run at ${nextRunAt.toISOString()}`);

   return {
     id: row.id,
@@ -166,6 +179,8 @@ export async function createSchedule(schedule: {
     enabled: row.enabled,
     baseIntervalMinutes: row.base_interval_minutes,
     jitterMinutes: row.jitter_minutes,
+    workerName: row.worker_name,
+    workerRole: row.worker_role,
     lastRunAt: row.last_run_at,
     lastStatus: row.last_status,
     lastErrorMessage: row.last_error_message,
@@ -304,20 +319,22 @@ async function updateScheduleAfterRun(
 }

 /**
- * Create a job run log entry
+ * Create a job run log entry with worker metadata propagated from schedule
  */
 async function createRunLog(
   scheduleId: number,
   jobName: string,
-  status: 'pending' | 'running'
+  status: 'pending' | 'running',
+  workerName?: string,
+  workerRole?: string
 ): Promise<number> {
   const { rows } = await query<{ id: number }>(
     `
-    INSERT INTO job_run_logs (schedule_id, job_name, status, started_at)
-    VALUES ($1, $2, $3, NOW())
+    INSERT INTO job_run_logs (schedule_id, job_name, status, worker_name, run_role, started_at)
+    VALUES ($1, $2, $3, $4, $5, NOW())
     RETURNING id
     `,
-    [scheduleId, jobName, status]
+    [scheduleId, jobName, status, workerName || null, workerRole || null]
   );
   return rows[0].id;
 }
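The worker_name/worker_role columns read and written above are created by this commit's migrations, which are not shown in this excerpt; a sketch of the assumed schema change (column names taken from the queries above, everything else illustrative):

import { query } from '../db/connection';

// Assumed shapes only - the real migration files (037-056) define these.
export async function up(): Promise<void> {
  await query(`
    ALTER TABLE job_schedules
      ADD COLUMN IF NOT EXISTS worker_name TEXT,
      ADD COLUMN IF NOT EXISTS worker_role TEXT
  `);
  await query(`
    ALTER TABLE job_run_logs
      ADD COLUMN IF NOT EXISTS worker_name TEXT,
      ADD COLUMN IF NOT EXISTS run_role TEXT
  `);
}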
@@ -434,22 +451,31 @@ async function executeJob(schedule: JobSchedule): Promise<{
       return executeDiscovery(config);
     case 'dutchie_az_menu_detection':
       return executeMenuDetectionJob(config);
+    case 'dutchie_store_discovery':
+      return executeStoreDiscovery(config);
+    case 'analytics_refresh':
+      return executeAnalyticsRefresh(config);
     default:
       throw new Error(`Unknown job type: ${schedule.jobName}`);
   }
 }

 /**
- * Execute the AZ Dutchie product crawl job
+ * Execute the AZ Dutchie product crawl job (Worker: Bella)
  *
  * NEW BEHAVIOR: Instead of running crawls directly, this now ENQUEUES jobs
  * into the crawl_jobs queue. Workers (running as separate replicas) will
  * pick up and process these jobs.
  *
+ * Scope filtering:
+ * - config.scope.states: Array of state codes to limit crawl (e.g., ["AZ", "CA"])
+ * - config.scope.storeIds: Array of specific store IDs to crawl
+ *
  * This allows:
  * - Multiple workers to process jobs in parallel
  * - No double-crawls (DB-level locking per dispensary)
  * - Better scalability (add more worker replicas)
+ * - Sharding by state or store for parallel execution
  * - Live monitoring of individual job progress
  */
 async function executeProductCrawl(config: Record<string, any>): Promise<{
@@ -462,18 +488,45 @@ async function executeProductCrawl(config: Record<string, any>): Promise<{
 }> {
   const pricingType = config.pricingType || 'rec';
   const useBothModes = config.useBothModes !== false;
+  const scope = config.scope as { states?: string[]; storeIds?: number[] } | undefined;

-  // Get all "ready" dispensaries (menu_type='dutchie' AND platform_dispensary_id IS NOT NULL AND not failed)
-  // Note: Menu detection is handled separately by the dutchie_az_menu_detection schedule
+  const scopeDesc = scope?.states?.length
+    ? ` (states: ${scope.states.join(', ')})`
+    : scope?.storeIds?.length
+      ? ` (${scope.storeIds.length} specific stores)`
+      : ' (all AZ stores)';
+
+  console.log(`[Bella - Product Sync] Starting product crawl job${scopeDesc}...`);
+
+  // Build query based on scope
+  let whereClause = `
+    WHERE menu_type = 'dutchie'
+      AND platform_dispensary_id IS NOT NULL
+      AND failed_at IS NULL
+  `;
+  const params: any[] = [];
+  let paramIndex = 1;
+
+  // Apply scope filtering
+  if (scope?.storeIds?.length) {
+    whereClause += ` AND id = ANY($${paramIndex++})`;
+    params.push(scope.storeIds);
+  } else if (scope?.states?.length) {
+    whereClause += ` AND state = ANY($${paramIndex++})`;
+    params.push(scope.states);
+  } else {
+    // Default to AZ if no scope specified
+    whereClause += ` AND state = 'AZ'`;
+  }
+
+  // Get all "ready" dispensaries matching scope
   const { rows: rawRows } = await query(
     `
     SELECT id FROM dispensaries
-    WHERE state = 'AZ'
-      AND menu_type = 'dutchie'
-      AND platform_dispensary_id IS NOT NULL
-      AND failed_at IS NULL
+    ${whereClause}
     ORDER BY last_crawl_at ASC NULLS FIRST
-    `
+    `,
+    params
   );
   const dispensaryIds = rawRows.map((r: any) => r.id);
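To make the builder concrete, here is the SQL it produces for a states scope (assembled by hand for illustration, not captured from a live run):

// scope = { states: ['AZ', 'CA'] }
const exampleSql = `
  SELECT id FROM dispensaries
  WHERE menu_type = 'dutchie'
    AND platform_dispensary_id IS NOT NULL
    AND failed_at IS NULL
    AND state = ANY($1)
  ORDER BY last_crawl_at ASC NULLS FIRST
`;
const exampleParams = [['AZ', 'CA']]; // $1 binds to the whole states array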
@@ -483,11 +536,14 @@ async function executeProductCrawl(config: Record<string, any>): Promise<{
       itemsProcessed: 0,
       itemsSucceeded: 0,
       itemsFailed: 0,
-      metadata: { message: 'No ready dispensaries to crawl. Run menu detection to discover more.' },
+      metadata: {
+        message: 'No ready dispensaries to crawl. Run menu detection to discover more.',
+        scope: scope || 'all',
+      },
     };
   }

-  console.log(`[Scheduler] Enqueueing crawl jobs for ${dispensaryIds.length} dispensaries...`);
+  console.log(`[Bella - Product Sync] Enqueueing crawl jobs for ${dispensaryIds.length} dispensaries...`);

   // Bulk enqueue jobs (skips dispensaries that already have pending/running jobs)
   const { enqueued, skipped } = await bulkEnqueueJobs(
@@ -499,7 +555,7 @@ async function executeProductCrawl(config: Record<string, any>): Promise<{
   );

-  console.log(`[Scheduler] Enqueued ${enqueued} jobs, skipped ${skipped} (already queued)`);
+  console.log(`[Bella - Product Sync] Enqueued ${enqueued} jobs, skipped ${skipped} (already queued)`);

   // Get current queue stats
   const queueStats = await getQueueStats();
@@ -515,6 +571,7 @@ async function executeProductCrawl(config: Record<string, any>): Promise<{
       queueStats,
       pricingType,
       useBothModes,
+      scope: scope || 'all',
       message: `Enqueued ${enqueued} jobs. Workers will process them. Check /scraper-monitor for progress.`,
     },
   };
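Because the scope lives in job_config, sharding Bella across states is a matter of creating one schedule per shard; a sketch under assumptions (the import path, description, and shard split are illustrative; createSchedule and the config keys are defined in this file):

import { createSchedule } from './scheduler'; // path illustrative

async function createBellaWestShard(): Promise<void> {
  await createSchedule({
    jobName: 'dutchie_az_product_crawl',
    description: 'Bella shard: west',
    baseIntervalMinutes: 240,
    jitterMinutes: 30,
    workerName: 'Bella',
    workerRole: 'GraphQL Product Sync',
    jobConfig: { scope: { states: ['AZ', 'CA'] } },
  });
}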
@@ -541,6 +598,181 @@ async function executeDiscovery(_config: Record<string, any>): Promise<{
   };
 }

+/**
+ * Execute the Store Discovery job (Worker: Alice)
+ *
+ * Full discovery workflow:
+ * 1. Fetch master cities page from https://dutchie.com/cities
+ * 2. Upsert discovered states/cities into dutchie_discovery_cities
+ * 3. Crawl each city page to discover all stores
+ * 4. Detect new stores, slug changes, and removed stores
+ * 5. Mark retired stores (never delete)
+ *
+ * Scope filtering:
+ * - config.scope.states: Array of state codes to limit discovery (e.g., ["AZ", "CA"])
+ * - config.scope.storeIds: Array of specific store IDs to process
+ */
+async function executeStoreDiscovery(config: Record<string, any>): Promise<{
+  status: JobStatus;
+  itemsProcessed: number;
+  itemsSucceeded: number;
+  itemsFailed: number;
+  errorMessage?: string;
+  metadata?: any;
+}> {
+  const delayMs = config.delayMs || 2000; // Delay between cities
+  const scope = config.scope as { states?: string[]; storeIds?: number[] } | undefined;
+
+  const scopeDesc = scope?.states?.length
+    ? ` (states: ${scope.states.join(', ')})`
+    : scope?.storeIds?.length
+      ? ` (${scope.storeIds.length} specific stores)`
+      : ' (all states)';
+
+  console.log(`[Alice - Store Discovery] Starting store discovery job${scopeDesc}...`);
+
+  try {
+    const pool = getPool();
+    const discoveryService = new DtLocationDiscoveryService(pool);
+
+    // Get stats before
+    const statsBefore = await discoveryService.getStats();
+    console.log(`[Alice - Store Discovery] Current stats: ${statsBefore.total} total locations, ${statsBefore.withCoordinates} with coordinates`);
+
+    // Run full discovery with change detection
+    const result = await discoveryService.runFullDiscoveryWithChangeDetection({
+      scope,
+      delayMs,
+    });
+
+    console.log(`[Alice - Store Discovery] Completed: ${result.statesDiscovered} states, ${result.citiesDiscovered} cities`);
+    console.log(`[Alice - Store Discovery] Stores found: ${result.totalLocationsFound} total`);
+    console.log(`[Alice - Store Discovery] Changes: +${result.newStoreCount} new, ~${result.updatedStoreCount} updated, =${result.slugChangedCount} slug changes, -${result.removedStoreCount} retired`);
+
+    const totalChanges = result.newStoreCount + result.updatedStoreCount + result.slugChangedCount;
+
+    return {
+      status: result.errors.length > 0 ? 'partial' : 'success',
+      itemsProcessed: result.totalLocationsFound,
+      itemsSucceeded: totalChanges,
+      itemsFailed: result.errors.length,
+      errorMessage: result.errors.length > 0 ? result.errors.slice(0, 5).join('; ') : undefined,
+      metadata: {
+        statesDiscovered: result.statesDiscovered,
+        citiesDiscovered: result.citiesDiscovered,
+        totalLocationsFound: result.totalLocationsFound,
+        newStoreCount: result.newStoreCount,
+        updatedStoreCount: result.updatedStoreCount,
+        slugChangedCount: result.slugChangedCount,
+        removedStoreCount: result.removedStoreCount,
+        durationMs: result.durationMs,
+        errorCount: result.errors.length,
+        scope: scope || 'all',
+        statsBefore: {
+          total: statsBefore.total,
+          withCoordinates: statsBefore.withCoordinates,
+        },
+      },
+    };
+  } catch (error: any) {
+    console.error('[Alice - Store Discovery] Job failed:', error.message);
+    return {
+      status: 'error',
+      itemsProcessed: 0,
+      itemsSucceeded: 0,
+      itemsFailed: 1,
+      errorMessage: error.message,
+      metadata: { error: error.message, scope: scope || 'all' },
+    };
+  }
+}
+
+/**
+ * Execute the Analytics Refresh job (Worker: Oscar)
+ *
+ * Refreshes materialized views and analytics data.
+ * Uses StateQueryService to refresh mv_state_metrics and other views.
+ */
+async function executeAnalyticsRefresh(config: Record<string, any>): Promise<{
+  status: JobStatus;
+  itemsProcessed: number;
+  itemsSucceeded: number;
+  itemsFailed: number;
+  errorMessage?: string;
+  metadata?: any;
+}> {
+  console.log('[Oscar - Analytics Refresh] Starting analytics refresh job...');
+
+  const startTime = Date.now();
+  const refreshedViews: string[] = [];
+  const errors: string[] = [];
+
+  try {
+    const pool = getPool();
+    const stateService = new StateQueryService(pool);
+
+    // Refresh state metrics materialized view
+    console.log('[Oscar - Analytics Refresh] Refreshing mv_state_metrics...');
+    try {
+      await stateService.refreshMetrics();
+      refreshedViews.push('mv_state_metrics');
+      console.log('[Oscar - Analytics Refresh] mv_state_metrics refreshed successfully');
+    } catch (error: any) {
+      console.error('[Oscar - Analytics Refresh] Failed to refresh mv_state_metrics:', error.message);
+      errors.push(`mv_state_metrics: ${error.message}`);
+    }
+
+    // Refresh other analytics views if configured
+    if (config.refreshBrandViews !== false) {
+      console.log('[Oscar - Analytics Refresh] Refreshing brand analytics views...');
+      try {
+        // Check if v_brand_state_presence exists and refresh if needed
+        await pool.query(`
+          SELECT 1 FROM pg_matviews WHERE matviewname = 'v_brand_state_presence' LIMIT 1
+        `).then(async (result) => {
+          if (result.rows.length > 0) {
+            await pool.query('REFRESH MATERIALIZED VIEW CONCURRENTLY v_brand_state_presence');
+            refreshedViews.push('v_brand_state_presence');
+            console.log('[Oscar - Analytics Refresh] v_brand_state_presence refreshed');
+          }
+        }).catch(() => {
+          // View doesn't exist, skip
+        });
+      } catch (error: any) {
+        errors.push(`v_brand_state_presence: ${error.message}`);
+      }
+    }
+
+    const durationMs = Date.now() - startTime;
+
+    console.log(`[Oscar - Analytics Refresh] Completed: ${refreshedViews.length} views refreshed in ${Math.round(durationMs / 1000)}s`);
+
+    return {
+      status: errors.length > 0 ? (refreshedViews.length > 0 ? 'partial' : 'error') : 'success',
+      itemsProcessed: refreshedViews.length + errors.length,
+      itemsSucceeded: refreshedViews.length,
+      itemsFailed: errors.length,
+      errorMessage: errors.length > 0 ? errors.join('; ') : undefined,
+      metadata: {
+        refreshedViews,
+        errorCount: errors.length,
+        errors: errors.length > 0 ? errors : undefined,
+        durationMs,
+      },
+    };
+  } catch (error: any) {
+    console.error('[Oscar - Analytics Refresh] Job failed:', error.message);
+    return {
+      status: 'error',
+      itemsProcessed: 0,
+      itemsSucceeded: 0,
+      itemsFailed: 1,
+      errorMessage: error.message,
+      metadata: { error: error.message },
+    };
+  }
+}

 // ============================================================
 // SCHEDULER RUNNER
 // ============================================================
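One operational note on executeAnalyticsRefresh above: PostgreSQL only accepts REFRESH MATERIALIZED VIEW CONCURRENTLY when the view has a unique index, so any view Oscar refreshes that way needs one. A sketch (index and column names are illustrative, not taken from the migrations):

import { Pool } from 'pg';

async function ensureConcurrentlyRefreshable(pool: Pool): Promise<void> {
  // Without a unique index, REFRESH ... CONCURRENTLY raises an error.
  await pool.query(`
    CREATE UNIQUE INDEX IF NOT EXISTS v_brand_state_presence_uniq
      ON v_brand_state_presence (brand_id, state)
  `);
}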
@@ -596,14 +828,21 @@ async function checkAndRunDueJobs(): Promise<void> {
  */
 async function runScheduledJob(schedule: JobSchedule): Promise<void> {
   const startTime = Date.now();
+  const workerInfo = schedule.workerName ? ` [Worker: ${schedule.workerName}]` : '';

-  console.log(`[Scheduler] Starting job "${schedule.jobName}"...`);
+  console.log(`[Scheduler]${workerInfo} Starting job "${schedule.jobName}"...`);

   // Mark as running
   await markScheduleRunning(schedule.id);

-  // Create run log entry
-  const runLogId = await createRunLog(schedule.id, schedule.jobName, 'running');
+  // Create run log entry with worker metadata propagated from schedule
+  const runLogId = await createRunLog(
+    schedule.id,
+    schedule.jobName,
+    'running',
+    schedule.workerName,
+    schedule.workerRole
+  );

   try {
     // Execute the job
@@ -735,11 +974,17 @@ export async function triggerScheduleNow(scheduleId: number): Promise<{
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Initialize default schedules if they don't exist
|
* Initialize default schedules if they don't exist
|
||||||
|
*
|
||||||
|
* Named Workers:
|
||||||
|
* - Bella: GraphQL Product Sync (crawls products from Dutchie) - 4hr
|
||||||
|
* - Henry: Entry Point Finder (detects menu providers and resolves platform IDs) - 24hr
|
||||||
|
* - Alice: Store Discovery (discovers new locations from city pages) - 24hr
|
||||||
|
* - Oscar: Analytics Refresh (refreshes materialized views) - 1hr
|
||||||
*/
|
*/
|
||||||
export async function initializeDefaultSchedules(): Promise<void> {
|
export async function initializeDefaultSchedules(): Promise<void> {
|
||||||
const schedules = await getAllSchedules();
|
const schedules = await getAllSchedules();
|
||||||
|
|
||||||
// Check if product crawl schedule exists
|
// Check if product crawl schedule exists (Worker: Bella)
|
||||||
const productCrawlExists = schedules.some(s => s.jobName === 'dutchie_az_product_crawl');
|
const productCrawlExists = schedules.some(s => s.jobName === 'dutchie_az_product_crawl');
|
||||||
if (!productCrawlExists) {
|
if (!productCrawlExists) {
|
||||||
await createSchedule({
|
await createSchedule({
|
||||||
@@ -748,13 +993,15 @@ export async function initializeDefaultSchedules(): Promise<void> {
|
|||||||
enabled: true,
|
enabled: true,
|
||||||
baseIntervalMinutes: 240, // 4 hours
|
baseIntervalMinutes: 240, // 4 hours
|
||||||
jitterMinutes: 30, // ±30 minutes
|
jitterMinutes: 30, // ±30 minutes
|
||||||
|
workerName: 'Bella',
|
||||||
|
workerRole: 'GraphQL Product Sync',
|
||||||
jobConfig: { pricingType: 'rec', useBothModes: true },
|
jobConfig: { pricingType: 'rec', useBothModes: true },
|
||||||
startImmediately: false,
|
startImmediately: false,
|
||||||
});
|
});
|
||||||
console.log('[Scheduler] Created default product crawl schedule');
|
console.log('[Scheduler] Created default product crawl schedule (Worker: Bella)');
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check if menu detection schedule exists
|
// Check if menu detection schedule exists (Worker: Henry)
|
||||||
const menuDetectionExists = schedules.some(s => s.jobName === 'dutchie_az_menu_detection');
|
const menuDetectionExists = schedules.some(s => s.jobName === 'dutchie_az_menu_detection');
|
||||||
if (!menuDetectionExists) {
|
if (!menuDetectionExists) {
|
||||||
await createSchedule({
|
await createSchedule({
|
||||||
@@ -763,10 +1010,46 @@ export async function initializeDefaultSchedules(): Promise<void> {
|
|||||||
enabled: true,
|
enabled: true,
|
||||||
baseIntervalMinutes: 1440, // 24 hours
|
baseIntervalMinutes: 1440, // 24 hours
|
||||||
jitterMinutes: 60, // ±1 hour
|
jitterMinutes: 60, // ±1 hour
|
||||||
|
workerName: 'Henry',
|
||||||
|
workerRole: 'Entry Point Finder',
|
||||||
jobConfig: { state: 'AZ', onlyUnknown: true },
|
jobConfig: { state: 'AZ', onlyUnknown: true },
|
||||||
startImmediately: false,
|
startImmediately: false,
|
||||||
});
|
});
|
||||||
console.log('[Scheduler] Created default menu detection schedule');
|
console.log('[Scheduler] Created default menu detection schedule (Worker: Henry)');
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if store discovery schedule exists (Worker: Alice)
|
||||||
|
const storeDiscoveryExists = schedules.some(s => s.jobName === 'dutchie_store_discovery');
|
||||||
|
if (!storeDiscoveryExists) {
|
||||||
|
await createSchedule({
|
||||||
|
jobName: 'dutchie_store_discovery',
|
||||||
|
description: 'Discover new Dutchie dispensary locations from city pages',
|
||||||
|
enabled: true,
|
||||||
|
baseIntervalMinutes: 1440, // 24 hours
|
||||||
|
jitterMinutes: 120, // ±2 hours
|
||||||
|
workerName: 'Alice',
|
||||||
|
workerRole: 'Store Discovery',
|
||||||
|
jobConfig: { delayMs: 2000 },
|
||||||
|
startImmediately: false,
|
||||||
|
});
|
||||||
|
console.log('[Scheduler] Created default store discovery schedule (Worker: Alice)');
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if analytics refresh schedule exists (Worker: Oscar)
|
||||||
|
const analyticsRefreshExists = schedules.some(s => s.jobName === 'analytics_refresh');
|
||||||
|
if (!analyticsRefreshExists) {
|
||||||
|
await createSchedule({
|
||||||
|
jobName: 'analytics_refresh',
|
||||||
|
description: 'Refresh analytics materialized views (mv_state_metrics, etc.)',
|
||||||
|
enabled: true,
|
||||||
|
baseIntervalMinutes: 60, // 1 hour
|
||||||
|
jitterMinutes: 10, // ±10 minutes
|
||||||
|
workerName: 'Oscar',
|
||||||
|
workerRole: 'Analytics Refresh',
|
||||||
|
jobConfig: { refreshBrandViews: true },
|
||||||
|
startImmediately: false,
|
||||||
|
});
|
||||||
|
console.log('[Scheduler] Created default analytics refresh schedule (Worker: Oscar)');
|
||||||
}
|
}
|
||||||
}
|
}
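The baseIntervalMinutes/jitterMinutes pairs above (e.g., Bella at 240 ± 30) describe jittered scheduling: each run is offset by a random amount so workers don't fire at fixed, predictable times. A minimal sketch of one way to compute the next run time; `computeNextRunAt` is a hypothetical helper, and the diff does not show the scheduler's actual math:

```ts
// Hypothetical helper illustrating "base interval ± jitter"; not the committed code.
function computeNextRunAt(baseIntervalMinutes: number, jitterMinutes: number): Date {
  // Uniform random offset in [-jitterMinutes, +jitterMinutes]
  const jitter = (Math.random() * 2 - 1) * jitterMinutes;
  const delayMs = (baseIntervalMinutes + jitter) * 60_000;
  return new Date(Date.now() + delayMs);
}

computeNextRunAt(240, 30); // Bella: some time 3.5-4.5 hours from now
```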

backend/src/dutchie-az/services/store-validator.ts (new file, 465 lines)
@@ -0,0 +1,465 @@
/**
 * Store Configuration Validator
 *
 * Validates and sanitizes store configurations before crawling.
 * Applies defaults for missing values and logs warnings.
 *
 * Phase 1: Crawler Reliability & Stabilization
 */

import { CrawlErrorCode, CrawlErrorCodeType } from './error-taxonomy';

// ============================================================
// DEFAULT CONFIGURATION
// ============================================================

/**
 * Default crawl configuration values
 */
export const DEFAULT_CONFIG = {
  // Scheduling
  crawlFrequencyMinutes: 240, // 4 hours
  minCrawlGapMinutes: 2, // Minimum 2 minutes between crawls

  // Retries
  maxRetries: 3,
  baseBackoffMs: 1000, // 1 second
  maxBackoffMs: 60000, // 1 minute
  backoffMultiplier: 2.0, // Exponential backoff

  // Timeouts
  requestTimeoutMs: 30000, // 30 seconds
  pageLoadTimeoutMs: 60000, // 60 seconds

  // Limits
  maxProductsPerPage: 100,
  maxPages: 50,

  // Proxy
  proxyRotationEnabled: true,
  proxyRotationOnFailure: true,

  // User Agent
  userAgentRotationEnabled: true,
  userAgentRotationOnFailure: true,
} as const;

// ============================================================
// STORE CONFIG INTERFACE
// ============================================================

/**
 * Raw store configuration from database
 */
export interface RawStoreConfig {
  id: number;
  name: string;
  slug?: string;
  platform?: string;
  menuType?: string;
  platformDispensaryId?: string;
  menuUrl?: string;
  website?: string;

  // Crawl config
  crawlFrequencyMinutes?: number;
  maxRetries?: number;
  currentProxyId?: number;
  currentUserAgent?: string;

  // Status
  crawlStatus?: string;
  consecutiveFailures?: number;
  backoffMultiplier?: number;
  lastCrawlAt?: Date;
  lastSuccessAt?: Date;
  lastFailureAt?: Date;
  lastErrorCode?: string;
  nextCrawlAt?: Date;
}

/**
 * Validated and sanitized store configuration
 */
export interface ValidatedStoreConfig {
  id: number;
  name: string;
  slug: string;
  platform: string;
  menuType: string;
  platformDispensaryId: string;
  menuUrl: string;

  // Crawl config (with defaults applied)
  crawlFrequencyMinutes: number;
  maxRetries: number;
  currentProxyId: number | null;
  currentUserAgent: string | null;

  // Status
  crawlStatus: 'active' | 'degraded' | 'paused' | 'failed';
  consecutiveFailures: number;
  backoffMultiplier: number;
  lastCrawlAt: Date | null;
  lastSuccessAt: Date | null;
  lastFailureAt: Date | null;
  lastErrorCode: CrawlErrorCodeType | null;
  nextCrawlAt: Date | null;

  // Validation metadata
  isValid: boolean;
  validationErrors: ValidationError[];
  validationWarnings: ValidationWarning[];
}

// ============================================================
// VALIDATION TYPES
// ============================================================

export interface ValidationError {
  field: string;
  message: string;
  code: CrawlErrorCodeType;
}

export interface ValidationWarning {
  field: string;
  message: string;
  appliedDefault?: any;
}

export interface ValidationResult {
  isValid: boolean;
  config: ValidatedStoreConfig | null;
  errors: ValidationError[];
  warnings: ValidationWarning[];
}

// ============================================================
// VALIDATOR CLASS
// ============================================================

export class StoreValidator {
  private errors: ValidationError[] = [];
  private warnings: ValidationWarning[] = [];

  /**
   * Validate and sanitize a store configuration
   */
  validate(raw: RawStoreConfig): ValidationResult {
    this.errors = [];
    this.warnings = [];

    // Required field validation
    this.validateRequired(raw);

    // If critical errors, return early
    if (this.errors.length > 0) {
      return {
        isValid: false,
        config: null,
        errors: this.errors,
        warnings: this.warnings,
      };
    }

    // Build validated config with defaults
    const config = this.buildValidatedConfig(raw);

    return {
      isValid: this.errors.length === 0,
      config,
      errors: this.errors,
      warnings: this.warnings,
    };
  }

  /**
   * Validate required fields
   */
  private validateRequired(raw: RawStoreConfig): void {
    if (!raw.id) {
      this.addError('id', 'Store ID is required', CrawlErrorCode.INVALID_CONFIG);
    }

    if (!raw.name) {
      this.addError('name', 'Store name is required', CrawlErrorCode.INVALID_CONFIG);
    }

    if (!raw.platformDispensaryId) {
      this.addError(
        'platformDispensaryId',
        'Platform dispensary ID is required for crawling',
        CrawlErrorCode.MISSING_PLATFORM_ID
      );
    }

    if (!raw.menuType || raw.menuType === 'unknown') {
      this.addError(
        'menuType',
        'Menu type must be detected before crawling',
        CrawlErrorCode.INVALID_CONFIG
      );
    }
  }

  /**
   * Build validated config with defaults applied
   */
  private buildValidatedConfig(raw: RawStoreConfig): ValidatedStoreConfig {
    // Slug
    const slug = raw.slug || this.generateSlug(raw.name);
    if (!raw.slug) {
      this.addWarning('slug', 'Slug was missing, generated from name', slug);
    }

    // Platform
    const platform = raw.platform || 'dutchie';
    if (!raw.platform) {
      this.addWarning('platform', 'Platform was missing, defaulting to dutchie', platform);
    }

    // Menu URL
    const menuUrl = raw.menuUrl || this.generateMenuUrl(raw.platformDispensaryId!, platform);
    if (!raw.menuUrl) {
      this.addWarning('menuUrl', 'Menu URL was missing, generated from platform ID', menuUrl);
    }

    // Crawl frequency
    const crawlFrequencyMinutes = this.validateNumeric(
      raw.crawlFrequencyMinutes,
      'crawlFrequencyMinutes',
      DEFAULT_CONFIG.crawlFrequencyMinutes,
      60, // min: 1 hour
      1440 // max: 24 hours
    );

    // Max retries
    const maxRetries = this.validateNumeric(
      raw.maxRetries,
      'maxRetries',
      DEFAULT_CONFIG.maxRetries,
      1, // min
      10 // max
    );

    // Backoff multiplier
    const backoffMultiplier = this.validateNumeric(
      raw.backoffMultiplier,
      'backoffMultiplier',
      1.0,
      1.0, // min
      10.0 // max
    );

    // Crawl status
    const crawlStatus = this.validateCrawlStatus(raw.crawlStatus);

    // Consecutive failures
    const consecutiveFailures = Math.max(0, raw.consecutiveFailures || 0);

    // Last error code
    const lastErrorCode = this.validateErrorCode(raw.lastErrorCode);

    return {
      id: raw.id,
      name: raw.name,
      slug,
      platform,
      menuType: raw.menuType!,
      platformDispensaryId: raw.platformDispensaryId!,
      menuUrl,

      crawlFrequencyMinutes,
      maxRetries,
      currentProxyId: raw.currentProxyId || null,
      currentUserAgent: raw.currentUserAgent || null,

      crawlStatus,
      consecutiveFailures,
      backoffMultiplier,
      lastCrawlAt: raw.lastCrawlAt || null,
      lastSuccessAt: raw.lastSuccessAt || null,
      lastFailureAt: raw.lastFailureAt || null,
      lastErrorCode,
      nextCrawlAt: raw.nextCrawlAt || null,

      isValid: true,
      validationErrors: [],
      validationWarnings: this.warnings,
    };
  }

  /**
   * Validate numeric value with bounds
   */
  private validateNumeric(
    value: number | undefined,
    field: string,
    defaultValue: number,
    min: number,
    max: number
  ): number {
    if (value === undefined || value === null) {
      this.addWarning(field, `Missing, defaulting to ${defaultValue}`, defaultValue);
      return defaultValue;
    }

    if (value < min) {
      this.addWarning(field, `Value ${value} below minimum ${min}, using minimum`, min);
      return min;
    }

    if (value > max) {
      this.addWarning(field, `Value ${value} above maximum ${max}, using maximum`, max);
      return max;
    }

    return value;
  }

  /**
   * Validate crawl status
   */
  private validateCrawlStatus(status?: string): 'active' | 'degraded' | 'paused' | 'failed' {
    const validStatuses = ['active', 'degraded', 'paused', 'failed'];
    if (!status || !validStatuses.includes(status)) {
      if (status) {
        this.addWarning('crawlStatus', `Invalid status "${status}", defaulting to active`, 'active');
      }
      return 'active';
    }
    return status as 'active' | 'degraded' | 'paused' | 'failed';
  }

  /**
   * Validate error code
   */
  private validateErrorCode(code?: string): CrawlErrorCodeType | null {
    if (!code) return null;
    const validCodes = Object.values(CrawlErrorCode);
    if (!validCodes.includes(code as CrawlErrorCodeType)) {
      this.addWarning('lastErrorCode', `Invalid error code "${code}"`, null);
      return CrawlErrorCode.UNKNOWN_ERROR;
    }
    return code as CrawlErrorCodeType;
  }

  /**
   * Generate slug from name
   */
  private generateSlug(name: string): string {
    return name
      .toLowerCase()
      .replace(/[^a-z0-9]+/g, '-')
      .replace(/^-+|-+$/g, '')
      .substring(0, 100);
  }

  /**
   * Generate menu URL from platform ID
   */
  private generateMenuUrl(platformId: string, platform: string): string {
    if (platform === 'dutchie') {
      return `https://dutchie.com/embedded-menu/${platformId}`;
    }
    return `https://${platform}.com/menu/${platformId}`;
  }

  /**
   * Add validation error
   */
  private addError(field: string, message: string, code: CrawlErrorCodeType): void {
    this.errors.push({ field, message, code });
    console.warn(`[StoreValidator] ERROR ${field}: ${message}`);
  }

  /**
   * Add validation warning
   */
  private addWarning(field: string, message: string, appliedDefault?: any): void {
    this.warnings.push({ field, message, appliedDefault });
    // Log at debug level - warnings are expected for incomplete configs
    console.debug(`[StoreValidator] WARNING ${field}: ${message}`);
  }
}

// ============================================================
// CONVENIENCE FUNCTIONS
// ============================================================

/**
 * Validate a single store config
 */
export function validateStoreConfig(raw: RawStoreConfig): ValidationResult {
  const validator = new StoreValidator();
  return validator.validate(raw);
}

/**
 * Validate multiple store configs
 */
export function validateStoreConfigs(raws: RawStoreConfig[]): {
  valid: ValidatedStoreConfig[];
  invalid: { raw: RawStoreConfig; errors: ValidationError[] }[];
  warnings: { storeId: number; warnings: ValidationWarning[] }[];
} {
  const valid: ValidatedStoreConfig[] = [];
  const invalid: { raw: RawStoreConfig; errors: ValidationError[] }[] = [];
  const warnings: { storeId: number; warnings: ValidationWarning[] }[] = [];

  for (const raw of raws) {
    const result = validateStoreConfig(raw);

    if (result.isValid && result.config) {
      valid.push(result.config);
      if (result.warnings.length > 0) {
        warnings.push({ storeId: raw.id, warnings: result.warnings });
      }
    } else {
      invalid.push({ raw, errors: result.errors });
    }
  }

  return { valid, invalid, warnings };
}

/**
 * Quick check if a store is crawlable
 */
export function isCrawlable(raw: RawStoreConfig): boolean {
  return !!(
    raw.id &&
    raw.name &&
    raw.platformDispensaryId &&
    raw.menuType &&
    raw.menuType !== 'unknown' &&
    raw.crawlStatus !== 'failed' &&
    raw.crawlStatus !== 'paused'
  );
}

/**
 * Get reason why store is not crawlable
 */
export function getNotCrawlableReason(raw: RawStoreConfig): string | null {
  if (!raw.platformDispensaryId) {
    return 'Missing platform_dispensary_id';
  }
  if (!raw.menuType || raw.menuType === 'unknown') {
    return 'Menu type not detected';
  }
  if (raw.crawlStatus === 'failed') {
    return 'Store is marked as failed';
  }
  if (raw.crawlStatus === 'paused') {
    return 'Crawling is paused';
  }
  return null;
}

// ============================================================
// SINGLETON INSTANCE
// ============================================================

export const storeValidator = new StoreValidator();
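To show the intended call pattern for the convenience functions, a short usage sketch; `rows` stands in for a hypothetical database query result shaped like `RawStoreConfig[]`, and everything else is exported by the module above:

```ts
import { validateStoreConfigs, getNotCrawlableReason, RawStoreConfig } from './store-validator';

function planCrawl(rows: RawStoreConfig[]) {
  const { valid, invalid, warnings } = validateStoreConfigs(rows);

  console.log(`${valid.length} crawlable, ${invalid.length} rejected, ${warnings.length} stores with warnings`);
  for (const { raw, errors } of invalid) {
    console.log(`store ${raw.id}: ${getNotCrawlableReason(raw) ?? errors[0]?.message}`);
  }
  return valid; // configs with defaults applied, ready for the crawler queue
}
```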
@@ -564,6 +564,10 @@ export interface JobSchedule {
   baseIntervalMinutes: number; // e.g., 240 (4 hours)
   jitterMinutes: number; // e.g., 30 (±30 minutes)
+
+  // Worker identity
+  workerName?: string; // e.g., "Alice", "Henry", "Bella", "Oscar"
+  workerRole?: string; // e.g., "Store Discovery Worker", "GraphQL Product Sync"
 
   // Last run tracking
   lastRunAt?: Date;
   lastStatus?: JobStatus;

@@ -593,6 +597,10 @@ export interface JobRunLog {
   durationMs?: number;
   errorMessage?: string;
+
+  // Worker identity (propagated from schedule)
+  workerName?: string; // e.g., "Alice", "Henry", "Bella", "Oscar"
+  runRole?: string; // e.g., "Store Discovery Worker"
 
   // Results summary
   itemsProcessed?: number;
   itemsSucceeded?: number;

@@ -672,3 +680,72 @@ export interface BrandSummary {
   productCount: number;
   dispensaryCount: number;
 }
+
+// ============================================================
+// CRAWLER PROFILE TYPES
+// ============================================================
+
+/**
+ * DispensaryCrawlerProfile - per-store crawler configuration
+ *
+ * Allows each dispensary to have customized crawler settings without
+ * affecting shared crawler logic. A dispensary can have multiple profiles
+ * but only one is active at a time (via dispensaries.active_crawler_profile_id).
+ */
+export interface DispensaryCrawlerProfile {
+  id: number;
+  dispensaryId: number;
+  profileName: string;
+  crawlerType: string; // 'dutchie', 'treez', 'jane', 'sandbox', 'custom'
+  profileKey: string | null; // Optional key for per-store module mapping
+  config: Record<string, any>; // Crawler-specific configuration
+  timeoutMs: number | null;
+  downloadImages: boolean;
+  trackStock: boolean;
+  version: number;
+  enabled: boolean;
+  createdAt: Date;
+  updatedAt: Date;
+}
+
+/**
+ * DispensaryCrawlerProfileCreate - input type for creating a new profile
+ */
+export interface DispensaryCrawlerProfileCreate {
+  dispensaryId: number;
+  profileName: string;
+  crawlerType: string;
+  profileKey?: string | null;
+  config?: Record<string, any>;
+  timeoutMs?: number | null;
+  downloadImages?: boolean;
+  trackStock?: boolean;
+  version?: number;
+  enabled?: boolean;
+}
+
+/**
+ * DispensaryCrawlerProfileUpdate - input type for updating an existing profile
+ */
+export interface DispensaryCrawlerProfileUpdate {
+  profileName?: string;
+  crawlerType?: string;
+  profileKey?: string | null;
+  config?: Record<string, any>;
+  timeoutMs?: number | null;
+  downloadImages?: boolean;
+  trackStock?: boolean;
+  version?: number;
+  enabled?: boolean;
+}
+
+/**
+ * CrawlerProfileOptions - runtime options derived from a profile
+ * Used when invoking the actual crawler
+ */
+export interface CrawlerProfileOptions {
+  timeoutMs: number;
+  downloadImages: boolean;
+  trackStock: boolean;
+  config: Record<string, any>;
+}
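A sketch of how CrawlerProfileOptions might be derived from a stored profile at crawl time. The 60-second fallback timeout is an assumption for illustration; the types above leave timeoutMs nullable without naming a default:

```ts
function toRuntimeOptions(profile: DispensaryCrawlerProfile): CrawlerProfileOptions {
  return {
    timeoutMs: profile.timeoutMs ?? 60_000, // assumed fallback, not specified by the types
    downloadImages: profile.downloadImages,
    trackStock: profile.trackStock,
    config: profile.config,
  };
}
```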

backend/src/hydration/__tests__/hydration.test.ts (new file, 250 lines)
@@ -0,0 +1,250 @@
/**
 * Hydration Pipeline Unit Tests
 */

import { HydrationWorker } from '../worker';
import { HydrationLockManager, LOCK_NAMES } from '../locking';
import { RawPayload, HydrationOptions } from '../types';

// Mock the pool
const mockQuery = jest.fn();
const mockConnect = jest.fn();
const mockPool = {
  query: mockQuery,
  connect: mockConnect,
} as any;

describe('HydrationLockManager', () => {
  beforeEach(() => {
    jest.clearAllMocks();
    mockQuery.mockResolvedValue({ rows: [] });
  });

  describe('acquireLock', () => {
    it('should acquire lock when not held', async () => {
      mockQuery
        .mockResolvedValueOnce({ rows: [] }) // DELETE expired
        .mockResolvedValueOnce({ rows: [{ id: 1 }] }); // INSERT

      const manager = new HydrationLockManager(mockPool, 'test-worker');
      const acquired = await manager.acquireLock('test-lock');

      expect(acquired).toBe(true);
    });

    it('should return false when lock held by another worker', async () => {
      mockQuery
        .mockResolvedValueOnce({ rows: [] }) // DELETE expired
        .mockResolvedValueOnce({ rows: [] }) // INSERT failed
        .mockResolvedValueOnce({ rows: [{ worker_id: 'other-worker' }] }); // SELECT

      const manager = new HydrationLockManager(mockPool, 'test-worker');
      const acquired = await manager.acquireLock('test-lock');

      expect(acquired).toBe(false);
    });

    it('should return true when lock held by same worker', async () => {
      mockQuery
        .mockResolvedValueOnce({ rows: [] }) // DELETE expired
        .mockResolvedValueOnce({ rows: [] }) // INSERT failed
        .mockResolvedValueOnce({ rows: [{ worker_id: 'test-worker' }] }) // SELECT
        .mockResolvedValueOnce({ rows: [] }); // UPDATE refresh

      const manager = new HydrationLockManager(mockPool, 'test-worker');
      const acquired = await manager.acquireLock('test-lock');

      expect(acquired).toBe(true);
    });
  });

  describe('releaseLock', () => {
    it('should release lock owned by worker', async () => {
      const manager = new HydrationLockManager(mockPool, 'test-worker');
      await manager.releaseLock('test-lock');

      expect(mockQuery).toHaveBeenCalledWith(
        expect.stringContaining('DELETE FROM hydration_locks'),
        ['test-lock', 'test-worker']
      );
    });
  });
});

describe('HydrationWorker', () => {
  beforeEach(() => {
    jest.clearAllMocks();
  });

  describe('processPayload', () => {
    it('should process valid payload in dry-run mode', async () => {
      const mockPayload: RawPayload = {
        id: 'test-uuid',
        dispensary_id: 123,
        crawl_run_id: 1,
        platform: 'dutchie',
        payload_version: 1,
        raw_json: {
          products: [
            { _id: 'p1', Name: 'Product 1', Status: 'Active' },
          ],
        },
        product_count: 1,
        pricing_type: 'rec',
        crawl_mode: 'dual',
        fetched_at: new Date(),
        processed: false,
        normalized_at: null,
        hydration_error: null,
        hydration_attempts: 0,
        created_at: new Date(),
      };

      const worker = new HydrationWorker(mockPool, { dryRun: true });
      const result = await worker.processPayload(mockPayload);

      expect(result.success).toBe(true);
      expect(result.payloadId).toBe('test-uuid');
      expect(result.dispensaryId).toBe(123);
      // In dry-run, DB should not be updated
      expect(mockQuery).not.toHaveBeenCalled();
    });

    it('should handle missing normalizer', async () => {
      const mockPayload: RawPayload = {
        id: 'test-uuid',
        dispensary_id: 123,
        crawl_run_id: null,
        platform: 'unknown-platform',
        payload_version: 1,
        raw_json: { products: [] },
        product_count: 0,
        pricing_type: null,
        crawl_mode: null,
        fetched_at: new Date(),
        processed: false,
        normalized_at: null,
        hydration_error: null,
        hydration_attempts: 0,
        created_at: new Date(),
      };

      mockQuery.mockResolvedValueOnce({ rows: [] }); // markPayloadFailed

      const worker = new HydrationWorker(mockPool, { dryRun: false });
      const result = await worker.processPayload(mockPayload);

      expect(result.success).toBe(false);
      expect(result.errors).toContain('No normalizer found for platform: unknown-platform');
    });

    it('should handle empty products', async () => {
      const mockPayload: RawPayload = {
        id: 'test-uuid',
        dispensary_id: 123,
        crawl_run_id: null,
        platform: 'dutchie',
        payload_version: 1,
        raw_json: { products: [] },
        product_count: 0,
        pricing_type: null,
        crawl_mode: null,
        fetched_at: new Date(),
        processed: false,
        normalized_at: null,
        hydration_error: null,
        hydration_attempts: 0,
        created_at: new Date(),
      };

      const worker = new HydrationWorker(mockPool, { dryRun: true });
      const result = await worker.processPayload(mockPayload);

      // Should succeed but with 0 products
      expect(result.success).toBe(true);
      expect(result.productsUpserted).toBe(0);
    });
  });

  describe('dry-run mode', () => {
    it('should not modify database in dry-run mode', async () => {
      const mockPayload: RawPayload = {
        id: 'test-uuid',
        dispensary_id: 123,
        crawl_run_id: null,
        platform: 'dutchie',
        payload_version: 1,
        raw_json: {
          products: [
            {
              _id: 'p1',
              Name: 'Product 1',
              Status: 'Active',
              brandName: 'Test Brand',
              type: 'Flower',
              recPrices: [50],
            },
          ],
        },
        product_count: 1,
        pricing_type: 'rec',
        crawl_mode: 'dual',
        fetched_at: new Date(),
        processed: false,
        normalized_at: null,
        hydration_error: null,
        hydration_attempts: 0,
        created_at: new Date(),
      };

      const consoleSpy = jest.spyOn(console, 'log').mockImplementation();

      const worker = new HydrationWorker(mockPool, { dryRun: true });
      await worker.processPayload(mockPayload);

      // Verify dry-run log messages
      expect(consoleSpy).toHaveBeenCalledWith(
        expect.stringContaining('[DryRun]')
      );

      // Verify no database writes
      expect(mockQuery).not.toHaveBeenCalled();

      consoleSpy.mockRestore();
    });
  });
});

describe('Discontinued products handling', () => {
  it('should identify missing products correctly', () => {
    const currentProducts = new Set(['p1', 'p2', 'p3']);
    const previousProducts = ['p1', 'p2', 'p4', 'p5'];

    const discontinued = previousProducts.filter((id) => !currentProducts.has(id));

    expect(discontinued).toEqual(['p4', 'p5']);
  });
});

describe('OOS transition handling', () => {
  it('should detect OOS from Active to Inactive', () => {
    const previousStatus = 'Active';
    const currentStatus = 'Inactive';

    const wasActive = previousStatus === 'Active';
    const nowInactive = currentStatus === 'Inactive';
    const transitionedToOOS = wasActive && nowInactive;

    expect(transitionedToOOS).toBe(true);
  });

  it('should not flag OOS when already inactive', () => {
    const previousStatus = 'Inactive';
    const currentStatus = 'Inactive';

    const wasActive = previousStatus === 'Active';
    const transitionedToOOS = wasActive && currentStatus === 'Inactive';

    expect(transitionedToOOS).toBe(false);
  });
});
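The last two describe blocks above test the discontinued/OOS logic inline rather than through exported helpers. Extracted as standalone functions, the same checks might look like this (hypothetical names, not part of the committed module):

```ts
// Products present in the previous crawl but absent from the current one
function findDiscontinued(current: Set<string>, previous: string[]): string[] {
  return previous.filter((id) => !current.has(id));
}

// Out-of-stock transition: only an Active -> Inactive change counts
function transitionedToOOS(previousStatus: string, currentStatus: string): boolean {
  return previousStatus === 'Active' && currentStatus === 'Inactive';
}
```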

backend/src/hydration/__tests__/normalizer.test.ts (new file, 311 lines)
@@ -0,0 +1,311 @@
/**
 * Normalizer Unit Tests
 */

import { DutchieNormalizer } from '../normalizers/dutchie';
import { RawPayload } from '../types';

describe('DutchieNormalizer', () => {
  const normalizer = new DutchieNormalizer();

  describe('extractProducts', () => {
    it('should extract products from GraphQL response format', () => {
      const rawJson = {
        data: {
          filteredProducts: {
            products: [
              { _id: '1', Name: 'Product 1' },
              { _id: '2', Name: 'Product 2' },
            ],
          },
        },
      };

      const products = normalizer.extractProducts(rawJson);
      expect(products).toHaveLength(2);
      expect(products[0]._id).toBe('1');
    });

    it('should extract products from direct array', () => {
      const products = normalizer.extractProducts([
        { _id: '1', Name: 'Product 1' },
      ]);
      expect(products).toHaveLength(1);
    });

    it('should extract products from merged mode format', () => {
      const rawJson = {
        merged: [
          { _id: '1', Name: 'Product 1' },
        ],
        products_a: [{ _id: '2' }],
        products_b: [{ _id: '3' }],
      };

      const products = normalizer.extractProducts(rawJson);
      expect(products).toHaveLength(1);
      expect(products[0]._id).toBe('1');
    });

    it('should merge products from mode A and B when no merged array', () => {
      const rawJson = {
        products_a: [
          { _id: '1', Name: 'Product 1' },
        ],
        products_b: [
          { _id: '2', Name: 'Product 2' },
          { _id: '1', Name: 'Product 1 from B' }, // Duplicate
        ],
      };

      const products = normalizer.extractProducts(rawJson);
      expect(products).toHaveLength(2);
      // Mode A takes precedence for duplicates
      expect(products.find((p: any) => p._id === '1').Name).toBe('Product 1');
    });

    it('should return empty array for invalid payload', () => {
      expect(normalizer.extractProducts(null)).toEqual([]);
      expect(normalizer.extractProducts({})).toEqual([]);
      expect(normalizer.extractProducts({ data: {} })).toEqual([]);
    });
  });

  describe('validatePayload', () => {
    it('should validate valid payload', () => {
      const rawJson = {
        products: [{ _id: '1', Name: 'Product' }],
      };

      const result = normalizer.validatePayload(rawJson);
      expect(result.valid).toBe(true);
      expect(result.errors).toHaveLength(0);
    });

    it('should reject empty payload', () => {
      const result = normalizer.validatePayload({});
      expect(result.valid).toBe(false);
      expect(result.errors).toContain('No products found in payload');
    });

    it('should capture GraphQL errors', () => {
      const rawJson = {
        products: [{ _id: '1' }],
        errors: [{ message: 'Rate limit exceeded' }],
      };

      const result = normalizer.validatePayload(rawJson);
      expect(result.errors.length).toBeGreaterThan(0);
      expect(result.errors.some((e) => e.includes('Rate limit'))).toBe(true);
    });
  });

  describe('normalize', () => {
    const mockPayload: RawPayload = {
      id: 'test-uuid',
      dispensary_id: 123,
      crawl_run_id: 1,
      platform: 'dutchie',
      payload_version: 1,
      raw_json: {
        products: [
          {
            _id: 'prod-1',
            Name: 'Blue Dream',
            brandName: 'Top Shelf',
            brandId: 'brand-1',
            type: 'Flower',
            subcategory: 'Hybrid',
            strainType: 'Hybrid',
            THC: 25.5,
            CBD: 0.5,
            Status: 'Active',
            Image: 'https://example.com/image.jpg',
            recPrices: [35, 60, 100],
            medicalPrices: [30, 55, 90],
            POSMetaData: {
              children: [
                { option: '1g', recPrice: 35, quantityAvailable: 10 },
                { option: '3.5g', recPrice: 60, quantityAvailable: 5 },
              ],
            },
          },
        ],
      },
      product_count: 1,
      pricing_type: 'rec',
      crawl_mode: 'dual',
      fetched_at: new Date(),
      processed: false,
      normalized_at: null,
      hydration_error: null,
      hydration_attempts: 0,
      created_at: new Date(),
    };

    it('should normalize products correctly', () => {
      const result = normalizer.normalize(mockPayload);

      expect(result.products).toHaveLength(1);
      expect(result.productCount).toBe(1);

      const product = result.products[0];
      expect(product.externalProductId).toBe('prod-1');
      expect(product.name).toBe('Blue Dream');
      expect(product.brandName).toBe('Top Shelf');
      expect(product.category).toBe('Flower');
      expect(product.thcPercent).toBe(25.5);
      expect(product.isActive).toBe(true);
    });

    it('should normalize pricing correctly', () => {
      const result = normalizer.normalize(mockPayload);

      const pricing = result.pricing.get('prod-1');
      expect(pricing).toBeDefined();
      expect(pricing!.priceRecMin).toBe(3500); // cents
      expect(pricing!.priceRecMax).toBe(10000);
      expect(pricing!.priceMedMin).toBe(3000);
    });

    it('should normalize availability correctly', () => {
      const result = normalizer.normalize(mockPayload);

      const availability = result.availability.get('prod-1');
      expect(availability).toBeDefined();
      expect(availability!.inStock).toBe(true);
      expect(availability!.stockStatus).toBe('in_stock');
      expect(availability!.quantity).toBe(15); // 10 + 5
    });

    it('should extract brands', () => {
      const result = normalizer.normalize(mockPayload);

      expect(result.brands).toHaveLength(1);
      expect(result.brands[0].name).toBe('Top Shelf');
      expect(result.brands[0].slug).toBe('top-shelf');
    });

    it('should extract categories', () => {
      const result = normalizer.normalize(mockPayload);

      expect(result.categories).toHaveLength(1);
      expect(result.categories[0].name).toBe('Flower');
      expect(result.categories[0].slug).toBe('flower');
    });

    it('should handle products without required fields', () => {
      const badPayload: RawPayload = {
        ...mockPayload,
        raw_json: {
          products: [
            { _id: 'no-name' }, // Missing Name
            { Name: 'No ID' }, // Missing _id
            { _id: 'valid', Name: 'Valid Product' },
          ],
        },
      };

      const result = normalizer.normalize(badPayload);
      // Only the valid product should be included
      expect(result.products).toHaveLength(1);
      expect(result.products[0].name).toBe('Valid Product');
    });

    it('should mark inactive products correctly', () => {
      const inactivePayload: RawPayload = {
        ...mockPayload,
        raw_json: {
          products: [
            {
              _id: 'inactive-1',
              Name: 'Inactive Product',
              Status: 'Inactive',
            },
          ],
        },
      };

      const result = normalizer.normalize(inactivePayload);
      const availability = result.availability.get('inactive-1');

      expect(availability).toBeDefined();
      expect(availability!.inStock).toBe(false);
      expect(availability!.stockStatus).toBe('out_of_stock');
    });
  });
});

describe('Normalizer edge cases', () => {
  const normalizer = new DutchieNormalizer();

  it('should handle null/undefined values gracefully', () => {
    const payload: RawPayload = {
      id: 'test',
      dispensary_id: 1,
      crawl_run_id: null,
      platform: 'dutchie',
      payload_version: 1,
      raw_json: {
        products: [
          {
            _id: 'prod-1',
            Name: 'Test',
            brandName: null,
            THC: undefined,
            POSMetaData: null,
          },
        ],
      },
      product_count: 1,
      pricing_type: null,
      crawl_mode: null,
      fetched_at: new Date(),
      processed: false,
      normalized_at: null,
      hydration_error: null,
      hydration_attempts: 0,
      created_at: new Date(),
    };

    const result = normalizer.normalize(payload);
    expect(result.products).toHaveLength(1);
    expect(result.products[0].brandName).toBeNull();
  });

  it('should handle special price scenarios', () => {
    const payload: RawPayload = {
      id: 'test',
      dispensary_id: 1,
      crawl_run_id: null,
      platform: 'dutchie',
      payload_version: 1,
      raw_json: {
        products: [
          {
            _id: 'special-prod',
            Name: 'Special Product',
            recPrices: [50],
            recSpecialPrices: [40],
          },
        ],
      },
      product_count: 1,
      pricing_type: null,
      crawl_mode: null,
      fetched_at: new Date(),
      processed: false,
      normalized_at: null,
      hydration_error: null,
      hydration_attempts: 0,
      created_at: new Date(),
    };

    const result = normalizer.normalize(payload);
    const pricing = result.pricing.get('special-prod');

    expect(pricing!.isOnSpecial).toBe(true);
    expect(pricing!.priceRecSpecial).toBe(4000);
    expect(pricing!.discountPercent).toBe(20);
  });
});
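The pricing assertions above (3500 cents from $35, a 20% discount from $50 down to $40) imply conversions along these lines. These are hypothetical helpers for illustration, not the normalizer's actual internals:

```ts
function dollarsToCents(dollars: number): number {
  return Math.round(dollars * 100);
}

function discountPercent(regularCents: number, specialCents: number): number {
  return Math.round(((regularCents - specialCents) / regularCents) * 100);
}

dollarsToCents(35);          // 3500
discountPercent(5000, 4000); // 20
```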

backend/src/hydration/backfill.ts (new file, 431 lines)
@@ -0,0 +1,431 @@
/**
 * Backfill Script
 *
 * Imports historical payloads from existing data sources:
 * - dutchie_products.latest_raw_payload
 * - dutchie_product_snapshots.raw_data
 * - Any cached files on disk
 */

import { Pool } from 'pg';
import * as fs from 'fs';
import * as path from 'path';
import { storeRawPayload } from './payload-store';
import { HydrationLockManager, LOCK_NAMES } from './locking';

const BATCH_SIZE = 100;

export interface BackfillOptions {
  dryRun?: boolean;
  source: 'dutchie_products' | 'snapshots' | 'cache_files' | 'all';
  dispensaryId?: number;
  limit?: number;
  cachePath?: string;
}

export interface BackfillResult {
  source: string;
  payloadsCreated: number;
  skipped: number;
  errors: string[];
  durationMs: number;
}

// ============================================================
// BACKFILL FROM DUTCHIE_PRODUCTS
// ============================================================

/**
 * Backfill from dutchie_products.latest_raw_payload
 * This captures the most recent raw data for each product
 */
export async function backfillFromDutchieProducts(
  pool: Pool,
  options: BackfillOptions
): Promise<BackfillResult> {
  const startTime = Date.now();
  const errors: string[] = [];
  let payloadsCreated = 0;
  let skipped = 0;

  console.log('[Backfill] Starting backfill from dutchie_products...');

  // Get distinct dispensaries with raw payloads
  let query = `
    SELECT DISTINCT dispensary_id
    FROM dutchie_products
    WHERE latest_raw_payload IS NOT NULL
  `;
  const params: any[] = [];

  if (options.dispensaryId) {
    query += ` AND dispensary_id = $1`;
    params.push(options.dispensaryId);
  }

  const dispensaries = await pool.query(query, params);

  console.log(`[Backfill] Found ${dispensaries.rows.length} dispensaries with raw payloads`);

  for (const row of dispensaries.rows) {
    const dispensaryId = row.dispensary_id;

    try {
      // Check if we already have a payload for this dispensary
      const existing = await pool.query(
        `SELECT 1 FROM raw_payloads
         WHERE dispensary_id = $1 AND platform = 'dutchie'
         LIMIT 1`,
        [dispensaryId]
      );

      if (existing.rows.length > 0) {
        skipped++;
        continue;
      }

      // Aggregate all products for this dispensary into one payload
      const products = await pool.query(
        `SELECT
           external_product_id,
           latest_raw_payload,
           updated_at
         FROM dutchie_products
         WHERE dispensary_id = $1
           AND latest_raw_payload IS NOT NULL
         ORDER BY updated_at DESC
         LIMIT $2`,
        [dispensaryId, options.limit || 10000]
      );

      if (products.rows.length === 0) {
        skipped++;
        continue;
      }

      // Create aggregated payload
      const aggregatedPayload = {
        products: products.rows.map((p: any) => p.latest_raw_payload),
        backfilled: true,
        backfill_source: 'dutchie_products',
        backfill_date: new Date().toISOString(),
      };

      // Get the latest update time
      const latestUpdate = products.rows[0]?.updated_at || new Date();

      if (options.dryRun) {
        console.log(
          `[Backfill][DryRun] Would create payload for dispensary ${dispensaryId} ` +
          `with ${products.rows.length} products`
        );
        payloadsCreated++;
      } else {
        await storeRawPayload(pool, {
          dispensaryId,
          platform: 'dutchie',
          payloadVersion: 1,
          rawJson: aggregatedPayload,
          productCount: products.rows.length,
          pricingType: 'rec',
          crawlMode: 'backfill',
          fetchedAt: latestUpdate,
        });
        payloadsCreated++;
        console.log(
          `[Backfill] Created payload for dispensary ${dispensaryId} ` +
          `with ${products.rows.length} products`
        );
      }
    } catch (error: any) {
      errors.push(`Dispensary ${dispensaryId}: ${error.message}`);
    }
  }

  return {
    source: 'dutchie_products',
    payloadsCreated,
    skipped,
    errors,
    durationMs: Date.now() - startTime,
  };
}
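A minimal invocation sketch for a dry run against a single dispensary. The Pool construction and the dispensary ID are assumptions; connection details would come from the environment:

```ts
import { Pool } from 'pg';
import { backfillFromDutchieProducts } from './backfill';

async function main() {
  const pool = new Pool(); // reads PG* environment variables by default
  const result = await backfillFromDutchieProducts(pool, {
    source: 'dutchie_products',
    dispensaryId: 123, // hypothetical ID
    dryRun: true,      // log what would be created, write nothing
  });
  console.log(result);
  await pool.end();
}

main().catch(console.error);
```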
|
||||||
|
// ============================================================
|
||||||
|
// BACKFILL FROM SNAPSHOTS
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Backfill from dutchie_product_snapshots.raw_data
|
||||||
|
* Creates payloads from historical snapshot data
|
||||||
|
*/
|
||||||
|
export async function backfillFromSnapshots(
|
||||||
|
pool: Pool,
|
||||||
|
options: BackfillOptions
|
||||||
|
): Promise<BackfillResult> {
|
||||||
|
const startTime = Date.now();
|
||||||
|
const errors: string[] = [];
|
||||||
|
let payloadsCreated = 0;
|
||||||
|
let skipped = 0;
|
||||||
|
|
||||||
|
console.log('[Backfill] Starting backfill from snapshots...');
|
||||||
|
|
||||||
|
// Get distinct crawl timestamps per dispensary
|
||||||
|
let query = `
|
||||||
|
SELECT DISTINCT
|
||||||
|
dispensary_id,
|
||||||
|
DATE_TRUNC('hour', captured_at) as crawl_hour,
|
||||||
|
COUNT(*) as product_count
|
||||||
|
FROM dutchie_product_snapshots
|
||||||
|
WHERE raw_data IS NOT NULL
|
||||||
|
`;
|
||||||
|
const params: any[] = [];
|
||||||
|
let paramIndex = 1;
|
||||||
|
|
||||||
|
if (options.dispensaryId) {
|
||||||
|
query += ` AND dispensary_id = $${paramIndex}`;
|
||||||
|
params.push(options.dispensaryId);
|
||||||
|
paramIndex++;
|
||||||
|
}
|
||||||
|
|
||||||
|
query += ` GROUP BY dispensary_id, DATE_TRUNC('hour', captured_at)
|
||||||
|
ORDER BY crawl_hour DESC`;
|
||||||
|
|
||||||
|
if (options.limit) {
|
||||||
|
query += ` LIMIT $${paramIndex}`;
|
||||||
|
params.push(options.limit);
|
||||||
|
}
|
||||||
|
|
||||||
|
const crawlHours = await pool.query(query, params);
|
||||||
|
|
||||||
|
console.log(`[Backfill] Found ${crawlHours.rows.length} distinct crawl hours`);
|
||||||
|
|
||||||
|
for (const row of crawlHours.rows) {
|
||||||
|
const { dispensary_id, crawl_hour, product_count } = row;
|
||||||
|
|
||||||
|
try {
|
||||||
|
// Check if we already have this payload
|
||||||
|
const existing = await pool.query(
|
||||||
|
`SELECT 1 FROM raw_payloads
|
||||||
|
WHERE dispensary_id = $1
|
||||||
|
AND platform = 'dutchie'
|
||||||
|
AND fetched_at >= $2
|
||||||
|
AND fetched_at < $2 + INTERVAL '1 hour'
|
||||||
|
LIMIT 1`,
|
||||||
|
[dispensary_id, crawl_hour]
|
||||||
|
);
|
||||||
|
|
||||||
|
if (existing.rows.length > 0) {
|
||||||
|
skipped++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get all snapshots for this hour
|
||||||
|
const snapshots = await pool.query(
|
||||||
|
`SELECT raw_data
|
||||||
|
FROM dutchie_product_snapshots
|
||||||
|
WHERE dispensary_id = $1
|
||||||
|
AND captured_at >= $2
|
||||||
|
AND captured_at < $2 + INTERVAL '1 hour'
|
||||||
|
AND raw_data IS NOT NULL`,
|
||||||
|
[dispensary_id, crawl_hour]
|
||||||
|
);
|
||||||
|
|
||||||
|
      if (snapshots.rows.length === 0) {
        skipped++;
        continue;
      }

      const aggregatedPayload = {
        products: snapshots.rows.map((s: any) => s.raw_data),
        backfilled: true,
        backfill_source: 'snapshots',
        backfill_date: new Date().toISOString(),
        original_crawl_hour: crawl_hour,
      };

      if (options.dryRun) {
        console.log(
          `[Backfill][DryRun] Would create payload for dispensary ${dispensary_id} ` +
          `at ${crawl_hour} with ${snapshots.rows.length} products`
        );
        payloadsCreated++;
      } else {
        await storeRawPayload(pool, {
          dispensaryId: dispensary_id,
          platform: 'dutchie',
          payloadVersion: 1,
          rawJson: aggregatedPayload,
          productCount: snapshots.rows.length,
          pricingType: 'rec',
          crawlMode: 'backfill',
          fetchedAt: crawl_hour,
        });
        payloadsCreated++;
      }
    } catch (error: any) {
      errors.push(`Dispensary ${dispensary_id} at ${crawl_hour}: ${error.message}`);
    }
  }

  return {
    source: 'snapshots',
    payloadsCreated,
    skipped,
    errors,
    durationMs: Date.now() - startTime,
  };
}

// ============================================================
// BACKFILL FROM CACHE FILES
// ============================================================

/**
 * Backfill from cached JSON files on disk
 */
export async function backfillFromCacheFiles(
  pool: Pool,
  options: BackfillOptions
): Promise<BackfillResult> {
  const startTime = Date.now();
  const errors: string[] = [];
  let payloadsCreated = 0;
  let skipped = 0;

  const cachePath = options.cachePath || './cache/payloads';

  console.log(`[Backfill] Starting backfill from cache files at ${cachePath}...`);

  if (!fs.existsSync(cachePath)) {
    console.log('[Backfill] Cache directory does not exist');
    return {
      source: 'cache_files',
      payloadsCreated: 0,
      skipped: 0,
      errors: ['Cache directory does not exist'],
      durationMs: Date.now() - startTime,
    };
  }

  // Expected structure: cache/payloads/<dispensary_id>/<timestamp>.json
  const dispensaryDirs = fs.readdirSync(cachePath);

  for (const dispensaryDir of dispensaryDirs) {
    const dispensaryPath = path.join(cachePath, dispensaryDir);
    if (!fs.statSync(dispensaryPath).isDirectory()) continue;

    const dispensaryId = parseInt(dispensaryDir, 10);
    if (isNaN(dispensaryId)) continue;

    if (options.dispensaryId && options.dispensaryId !== dispensaryId) {
      continue;
    }

    const files = fs.readdirSync(dispensaryPath)
      .filter((f) => f.endsWith('.json'))
      .sort()
      .reverse();

    let processed = 0;
    for (const file of files) {
      if (options.limit && processed >= options.limit) break;

      const filePath = path.join(dispensaryPath, file);
      try {
        const content = fs.readFileSync(filePath, 'utf-8');
        const payload = JSON.parse(content);

        // Extract timestamp from filename (e.g., 2024-01-15T10-30-00.json).
        // Only the time portion uses '-' in place of ':', so restore colons
        // there and leave the date part intact ("...T10-30-00" → "...T10:30:00").
        // (Replacing every '-' would corrupt the date part and always fall
        // back to new Date() below.)
        const base = file.replace('.json', '');
        const [datePart, timePart] = base.split('T');
        const fetchedAt = new Date(
          timePart ? `${datePart}T${timePart.replace(/-/g, ':')}` : datePart
        );

        if (options.dryRun) {
          console.log(
            `[Backfill][DryRun] Would import ${file} for dispensary ${dispensaryId}`
          );
          payloadsCreated++;
        } else {
          await storeRawPayload(pool, {
            dispensaryId,
            platform: 'dutchie',
            payloadVersion: 1,
            rawJson: {
              ...payload,
              backfilled: true,
              backfill_source: 'cache_files',
              backfill_file: file,
            },
            productCount: payload.products?.length || 0,
            pricingType: 'rec',
            crawlMode: 'backfill',
            fetchedAt: isNaN(fetchedAt.getTime()) ? new Date() : fetchedAt,
          });
          payloadsCreated++;
        }

        processed++;
      } catch (error: any) {
        errors.push(`File ${filePath}: ${error.message}`);
        skipped++;
      }
    }
  }

  return {
    source: 'cache_files',
    payloadsCreated,
    skipped,
    errors,
    durationMs: Date.now() - startTime,
  };
}

// ============================================================
// MAIN BACKFILL FUNCTION
// ============================================================

/**
 * Run full backfill
 */
export async function runBackfill(
  pool: Pool,
  options: BackfillOptions
): Promise<BackfillResult[]> {
  const lockManager = new HydrationLockManager(pool);
  const results: BackfillResult[] = [];

  // Acquire lock
  const lockAcquired = await lockManager.acquireLock(LOCK_NAMES.BACKFILL, 60 * 60 * 1000);
  if (!lockAcquired) {
    console.log('[Backfill] Could not acquire lock, another backfill may be running');
    return [];
  }

  try {
    console.log('[Backfill] Starting backfill process...');

    if (options.source === 'all' || options.source === 'dutchie_products') {
      const result = await backfillFromDutchieProducts(pool, options);
      results.push(result);
      console.log(`[Backfill] dutchie_products: ${result.payloadsCreated} created, ${result.skipped} skipped`);
    }

    if (options.source === 'all' || options.source === 'snapshots') {
      const result = await backfillFromSnapshots(pool, options);
      results.push(result);
      console.log(`[Backfill] snapshots: ${result.payloadsCreated} created, ${result.skipped} skipped`);
    }

    if (options.source === 'all' || options.source === 'cache_files') {
      const result = await backfillFromCacheFiles(pool, options);
      results.push(result);
      console.log(`[Backfill] cache_files: ${result.payloadsCreated} created, ${result.skipped} skipped`);
    }

    console.log('[Backfill] Backfill complete');
    return results;
  } finally {
    await lockManager.releaseLock(LOCK_NAMES.BACKFILL);
  }
}
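
Example invocation of the backfill entry point above (a sketch, not part of the commit; it assumes a pg Pool configured via CANNAIQ_DB_URL, and the option values shown are arbitrary examples):

import { Pool } from 'pg';
import { runBackfill } from './backfill';

async function main() {
  const pool = new Pool({ connectionString: process.env.CANNAIQ_DB_URL });
  // dryRun logs what would be created without writing any payloads
  const results = await runBackfill(pool, { source: 'all', dryRun: true, limit: 10 });
  for (const r of results) {
    console.log(`${r.source}: ${r.payloadsCreated} created, ${r.skipped} skipped in ${r.durationMs}ms`);
  }
  await pool.end();
}

main().catch(console.error);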
backend/src/hydration/canonical-upsert.ts (new file, 435 lines)
@@ -0,0 +1,435 @@
/**
 * Canonical Upsert Functions
 *
 * Upserts normalized data into canonical tables:
 * - store_products
 * - store_product_snapshots
 * - brands
 * - categories (future)
 */

import { Pool, PoolClient } from 'pg';
import {
  NormalizedProduct,
  NormalizedPricing,
  NormalizedAvailability,
  NormalizedBrand,
  NormalizationResult,
} from './types';

const BATCH_SIZE = 100;

// ============================================================
// PRODUCT UPSERTS
// ============================================================

export interface UpsertProductsResult {
  upserted: number;
  new: number;
  updated: number;
}

/**
 * Upsert products to store_products table
 * Returns counts of new vs updated products
 */
export async function upsertStoreProducts(
  pool: Pool,
  products: NormalizedProduct[],
  pricing: Map<string, NormalizedPricing>,
  availability: Map<string, NormalizedAvailability>,
  options: { dryRun?: boolean } = {}
): Promise<UpsertProductsResult> {
  if (products.length === 0) {
    return { upserted: 0, new: 0, updated: 0 };
  }

  const { dryRun = false } = options;
  let newCount = 0;
  let updatedCount = 0;

  // Process in batches
  for (let i = 0; i < products.length; i += BATCH_SIZE) {
    const batch = products.slice(i, i + BATCH_SIZE);

    if (dryRun) {
      console.log(`[DryRun] Would upsert ${batch.length} products`);
      continue;
    }

    const client = await pool.connect();
    try {
      await client.query('BEGIN');

      for (const product of batch) {
        const productPricing = pricing.get(product.externalProductId);
        const productAvailability = availability.get(product.externalProductId);

        const result = await client.query(
          `INSERT INTO store_products (
            dispensary_id, provider, provider_product_id, provider_brand_id,
            name, brand_name, category, subcategory,
            price_rec, price_med, price_rec_special, price_med_special,
            is_on_special, discount_percent,
            is_in_stock, stock_status,
            thc_percent, cbd_percent,
            image_url,
            first_seen_at, last_seen_at, updated_at
          ) VALUES (
            $1, $2, $3, $4,
            $5, $6, $7, $8,
            $9, $10, $11, $12,
            $13, $14,
            $15, $16,
            $17, $18,
            $19,
            NOW(), NOW(), NOW()
          )
          ON CONFLICT (dispensary_id, provider, provider_product_id)
          DO UPDATE SET
            name = EXCLUDED.name,
            brand_name = EXCLUDED.brand_name,
            category = EXCLUDED.category,
            subcategory = EXCLUDED.subcategory,
            price_rec = EXCLUDED.price_rec,
            price_med = EXCLUDED.price_med,
            price_rec_special = EXCLUDED.price_rec_special,
            price_med_special = EXCLUDED.price_med_special,
            is_on_special = EXCLUDED.is_on_special,
            discount_percent = EXCLUDED.discount_percent,
            is_in_stock = EXCLUDED.is_in_stock,
            stock_status = EXCLUDED.stock_status,
            thc_percent = EXCLUDED.thc_percent,
            cbd_percent = EXCLUDED.cbd_percent,
            image_url = EXCLUDED.image_url,
            last_seen_at = NOW(),
            updated_at = NOW()
          RETURNING (xmax = 0) as is_new`, // xmax = 0 means the row was freshly inserted rather than updated
          [
            product.dispensaryId,
            product.platform,
            product.externalProductId,
            product.brandId,
            product.name,
            product.brandName,
            product.category,
            product.subcategory,
            productPricing?.priceRec ? productPricing.priceRec / 100 : null,
            productPricing?.priceMed ? productPricing.priceMed / 100 : null,
            productPricing?.priceRecSpecial ? productPricing.priceRecSpecial / 100 : null,
            productPricing?.priceMedSpecial ? productPricing.priceMedSpecial / 100 : null,
            productPricing?.isOnSpecial || false,
            productPricing?.discountPercent,
            productAvailability?.inStock ?? true,
            productAvailability?.stockStatus || 'unknown',
            product.thcPercent,
            product.cbdPercent,
            product.primaryImageUrl,
          ]
        );

        if (result.rows[0]?.is_new) {
          newCount++;
        } else {
          updatedCount++;
        }
      }

      await client.query('COMMIT');
    } catch (error) {
      await client.query('ROLLBACK');
      throw error;
    } finally {
      client.release();
    }
  }

  return {
    upserted: newCount + updatedCount,
    new: newCount,
    updated: updatedCount,
  };
}

// ============================================================
// SNAPSHOT CREATION
// ============================================================

export interface CreateSnapshotsResult {
  created: number;
}

/**
 * Create snapshots for all products in a crawl
 */
export async function createStoreProductSnapshots(
  pool: Pool,
  dispensaryId: number,
  products: NormalizedProduct[],
  pricing: Map<string, NormalizedPricing>,
  availability: Map<string, NormalizedAvailability>,
  crawlRunId: number | null,
  options: { dryRun?: boolean } = {}
): Promise<CreateSnapshotsResult> {
  if (products.length === 0) {
    return { created: 0 };
  }

  const { dryRun = false } = options;

  if (dryRun) {
    console.log(`[DryRun] Would create ${products.length} snapshots`);
    return { created: products.length };
  }

  let created = 0;

  // Process in batches
  for (let i = 0; i < products.length; i += BATCH_SIZE) {
    const batch = products.slice(i, i + BATCH_SIZE);

    const values: any[][] = [];
    for (const product of batch) {
      const productPricing = pricing.get(product.externalProductId);
      const productAvailability = availability.get(product.externalProductId);

      values.push([
        dispensaryId,
        product.platform,
        product.externalProductId,
        crawlRunId,
        new Date(), // captured_at
        product.name,
        product.brandName,
        product.category,
        product.subcategory,
        productPricing?.priceRec ? productPricing.priceRec / 100 : null,
        productPricing?.priceMed ? productPricing.priceMed / 100 : null,
        productPricing?.priceRecSpecial ? productPricing.priceRecSpecial / 100 : null,
        productPricing?.priceMedSpecial ? productPricing.priceMedSpecial / 100 : null,
        productPricing?.isOnSpecial || false,
        productPricing?.discountPercent,
        productAvailability?.inStock ?? true,
        productAvailability?.quantity,
        productAvailability?.stockStatus || 'unknown',
        product.thcPercent,
        product.cbdPercent,
        product.primaryImageUrl,
        JSON.stringify(product.rawProduct),
      ]);
    }

    // Build bulk insert query
    const placeholders = values.map((_, idx) => {
      const offset = idx * 22;
      return `(${Array.from({ length: 22 }, (_, j) => `$${offset + j + 1}`).join(', ')})`;
    }).join(', ');

    await pool.query(
      `INSERT INTO store_product_snapshots (
        dispensary_id, provider, provider_product_id, crawl_run_id,
        captured_at,
        name, brand_name, category, subcategory,
        price_rec, price_med, price_rec_special, price_med_special,
        is_on_special, discount_percent,
        is_in_stock, stock_quantity, stock_status,
        thc_percent, cbd_percent,
        image_url, raw_data
      ) VALUES ${placeholders}`,
      values.flat()
    );

    created += batch.length;
  }

  return { created };
}

// ============================================================
// DISCONTINUED PRODUCTS
// ============================================================

/**
 * Mark products as discontinued if they weren't in the current crawl
 */
export async function markDiscontinuedProducts(
  pool: Pool,
  dispensaryId: number,
  currentProductIds: Set<string>,
  platform: string,
  crawlRunId: number | null,
  options: { dryRun?: boolean } = {}
): Promise<number> {
  const { dryRun = false } = options;

  // Get all products for this dispensary/platform
  const result = await pool.query(
    `SELECT provider_product_id FROM store_products
     WHERE dispensary_id = $1 AND provider = $2 AND is_in_stock = TRUE`,
    [dispensaryId, platform]
  );

  const existingIds = result.rows.map((r: any) => r.provider_product_id);
  const discontinuedIds = existingIds.filter((id: string) => !currentProductIds.has(id));

  if (discontinuedIds.length === 0) {
    return 0;
  }

  if (dryRun) {
    console.log(`[DryRun] Would mark ${discontinuedIds.length} products as discontinued`);
    return discontinuedIds.length;
  }

  // Update store_products to mark as out of stock
  await pool.query(
    `UPDATE store_products
     SET is_in_stock = FALSE,
         stock_status = 'discontinued',
         updated_at = NOW()
     WHERE dispensary_id = $1
       AND provider = $2
       AND provider_product_id = ANY($3)`,
    [dispensaryId, platform, discontinuedIds]
  );

  // Create snapshots for discontinued products
  for (const productId of discontinuedIds) {
    await pool.query(
      `INSERT INTO store_product_snapshots (
        dispensary_id, provider, provider_product_id, crawl_run_id,
        captured_at, is_in_stock, stock_status
      )
      SELECT
        dispensary_id, provider, provider_product_id, $4,
        NOW(), FALSE, 'discontinued'
      FROM store_products
      WHERE dispensary_id = $1 AND provider = $2 AND provider_product_id = $3`,
      [dispensaryId, platform, productId, crawlRunId]
    );
  }

  return discontinuedIds.length;
}

// ============================================================
// BRAND UPSERTS
// ============================================================

export interface UpsertBrandsResult {
  upserted: number;
  new: number;
}

/**
 * Upsert brands to brands table
 */
export async function upsertBrands(
  pool: Pool,
  brands: NormalizedBrand[],
  options: { dryRun?: boolean; skipIfExists?: boolean } = {}
): Promise<UpsertBrandsResult> {
  if (brands.length === 0) {
    return { upserted: 0, new: 0 };
  }

  const { dryRun = false, skipIfExists = true } = options;

  if (dryRun) {
    console.log(`[DryRun] Would upsert ${brands.length} brands`);
    return { upserted: brands.length, new: 0 };
  }

  let newCount = 0;

  for (const brand of brands) {
    const result = await pool.query(
      `INSERT INTO brands (name, slug, external_id, logo_url, created_at, updated_at)
       VALUES ($1, $2, $3, $4, NOW(), NOW())
       ON CONFLICT (slug) DO ${skipIfExists ? 'NOTHING' : 'UPDATE SET logo_url = COALESCE(EXCLUDED.logo_url, brands.logo_url), updated_at = NOW()'}
       RETURNING (xmax = 0) as is_new`,
      [brand.name, brand.slug, brand.externalBrandId, brand.logoUrl]
    );

    if (result.rows[0]?.is_new) {
      newCount++;
    }
  }

  return {
    upserted: brands.length,
    new: newCount,
  };
}

// ============================================================
// FULL HYDRATION
// ============================================================

export interface HydratePayloadResult {
  productsUpserted: number;
  productsNew: number;
  productsUpdated: number;
  productsDiscontinued: number;
  snapshotsCreated: number;
  brandsCreated: number;
}

/**
 * Hydrate a complete normalization result into canonical tables
 */
export async function hydrateToCanonical(
  pool: Pool,
  dispensaryId: number,
  normResult: NormalizationResult,
  crawlRunId: number | null,
  options: { dryRun?: boolean } = {}
): Promise<HydratePayloadResult> {
  const { dryRun = false } = options;

  // 1. Upsert brands
  const brandResult = await upsertBrands(pool, normResult.brands, { dryRun });

  // 2. Upsert products
  const productResult = await upsertStoreProducts(
    pool,
    normResult.products,
    normResult.pricing,
    normResult.availability,
    { dryRun }
  );

  // 3. Create snapshots
  const snapshotResult = await createStoreProductSnapshots(
    pool,
    dispensaryId,
    normResult.products,
    normResult.pricing,
    normResult.availability,
    crawlRunId,
    { dryRun }
  );

  // 4. Mark discontinued products
  const currentProductIds = new Set(
    normResult.products.map((p) => p.externalProductId)
  );
  const platform = normResult.products[0]?.platform || 'dutchie';
  const discontinuedCount = await markDiscontinuedProducts(
    pool,
    dispensaryId,
    currentProductIds,
    platform,
    crawlRunId,
    { dryRun }
  );

  return {
    productsUpserted: productResult.upserted,
    productsNew: productResult.new,
    productsUpdated: productResult.updated,
    productsDiscontinued: discontinuedCount,
    snapshotsCreated: snapshotResult.created,
    brandsCreated: brandResult.new,
  };
}
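
How the pieces above compose, as a usage sketch (illustrative only, not part of the commit; `normResult` is assumed to come from one of the platform normalizers referenced in the module docs):

import { Pool } from 'pg';
import { NormalizationResult } from './types';
import { hydrateToCanonical } from './canonical-upsert';

async function hydrateOne(pool: Pool, dispensaryId: number, normResult: NormalizationResult) {
  // crawlRunId may be null when hydrating outside a tracked crawl run
  const summary = await hydrateToCanonical(pool, dispensaryId, normResult, null, { dryRun: false });
  console.log(
    `${summary.productsNew} new, ${summary.productsUpdated} updated, ` +
    `${summary.productsDiscontinued} discontinued, ${summary.snapshotsCreated} snapshots`
  );
}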
backend/src/hydration/incremental-sync.ts (new file, 680 lines)
@@ -0,0 +1,680 @@
/**
 * Incremental Sync
 *
 * Hooks into the crawler to automatically write to canonical tables
 * after each crawl completes. This ensures store_products and
 * store_product_snapshots stay in sync with new data.
 *
 * Two modes:
 * 1. Inline - Called directly from crawler after saving to legacy tables
 * 2. Async - Called from a background worker that processes recent crawls
 *
 * Usage:
 *   // Inline mode (in crawler)
 *   import { syncCrawlToCanonical } from './hydration/incremental-sync';
 *   await syncCrawlToCanonical(pool, crawlResult);
 *
 *   // Async mode (background worker)
 *   import { syncRecentCrawls } from './hydration/incremental-sync';
 *   await syncRecentCrawls(pool, { since: '1 hour' });
 */

import { Pool } from 'pg';

const BATCH_SIZE = 100;

// ============================================================
// TYPES
// ============================================================

export interface CrawlResult {
  dispensaryId: number;
  stateId?: number;
  platformDispensaryId?: string;
  crawlJobId?: number; // legacy dispensary_crawl_jobs.id
  startedAt: Date;
  finishedAt?: Date;
  status: 'success' | 'failed' | 'running';
  errorMessage?: string;
  productsFound: number;
  productsCreated: number;
  productsUpdated: number;
  productsMissing?: number;
  brandsFound?: number;
}

export interface SyncOptions {
  dryRun?: boolean;
  verbose?: boolean;
  skipSnapshots?: boolean;
}

export interface SyncResult {
  crawlRunId: number | null;
  productsUpserted: number;
  productsNew: number;
  productsUpdated: number;
  snapshotsCreated: number;
  durationMs: number;
  errors: string[];
}

// ============================================================
// CREATE OR GET CRAWL RUN
// ============================================================

/**
 * Create a crawl_run record for a completed crawl.
 * Returns existing if already synced (idempotent).
 */
export async function getOrCreateCrawlRun(
  pool: Pool,
  crawlResult: CrawlResult,
  options: SyncOptions = {}
): Promise<number | null> {
  const { dryRun = false, verbose = false } = options;

  // Check if already exists (by legacy job ID)
  if (crawlResult.crawlJobId) {
    const existing = await pool.query(
      `SELECT id FROM crawl_runs WHERE legacy_dispensary_crawl_job_id = $1`,
      [crawlResult.crawlJobId]
    );

    if (existing.rows.length > 0) {
      if (verbose) {
        console.log(`[IncrSync] Found existing crawl_run ${existing.rows[0].id} for job ${crawlResult.crawlJobId}`);
      }
      return existing.rows[0].id;
    }
  }

  if (dryRun) {
    console.log(`[IncrSync][DryRun] Would create crawl_run for dispensary ${crawlResult.dispensaryId}`);
    return null;
  }

  const durationMs = crawlResult.finishedAt && crawlResult.startedAt
    ? crawlResult.finishedAt.getTime() - crawlResult.startedAt.getTime()
    : null;

  const result = await pool.query(
    `INSERT INTO crawl_runs (
      dispensary_id, state_id, provider,
      legacy_dispensary_crawl_job_id,
      started_at, finished_at, duration_ms,
      status, error_message,
      products_found, products_new, products_updated, products_missing,
      brands_found, trigger_type, created_at
    ) VALUES (
      $1, $2, 'dutchie',
      $3,
      $4, $5, $6,
      $7, $8,
      $9, $10, $11, $12,
      $13, 'scheduled', NOW()
    )
    RETURNING id`,
    [
      crawlResult.dispensaryId,
      crawlResult.stateId,
      crawlResult.crawlJobId,
      crawlResult.startedAt,
      crawlResult.finishedAt,
      durationMs,
      crawlResult.status,
      crawlResult.errorMessage,
      crawlResult.productsFound,
      crawlResult.productsCreated,
      crawlResult.productsUpdated,
      crawlResult.productsMissing || 0,
      crawlResult.brandsFound || 0,
    ]
  );

  if (verbose) {
    console.log(`[IncrSync] Created crawl_run ${result.rows[0].id}`);
  }

  return result.rows[0].id;
}

// ============================================================
// SYNC PRODUCTS TO CANONICAL
// ============================================================

/**
 * Sync dutchie_products to store_products for a single dispensary.
 * Called after a crawl completes.
 */
export async function syncProductsToCanonical(
  pool: Pool,
  dispensaryId: number,
  stateId: number | null,
  crawlRunId: number | null,
  options: SyncOptions = {}
): Promise<{ upserted: number; new: number; updated: number; errors: string[] }> {
  const { dryRun = false, verbose = false } = options;
  const errors: string[] = [];
  let newCount = 0;
  let updatedCount = 0;

  // Get all products for this dispensary
  const { rows: products } = await pool.query(
    `SELECT
      dp.id,
      dp.external_product_id,
      dp.name,
      dp.brand_name,
      dp.brand_id,
      dp.category,
      dp.subcategory,
      dp.type,
      dp.strain_type,
      dp.description,
      dp.effects,
      dp.cannabinoids_v2,
      dp.thc,
      dp.thc_content,
      dp.cbd,
      dp.cbd_content,
      dp.primary_image_url,
      dp.local_image_url,
      dp.local_image_thumb_url,
      dp.local_image_medium_url,
      dp.original_image_url,
      dp.additional_images,
      dp.stock_status,
      dp.c_name,
      dp.enterprise_product_id,
      dp.weight,
      dp.options,
      dp.measurements,
      dp.status,
      dp.featured,
      dp.special,
      dp.medical_only,
      dp.rec_only,
      dp.is_below_threshold,
      dp.is_below_kiosk_threshold,
      dp.total_quantity_available,
      dp.total_kiosk_quantity_available,
      dp.first_seen_at,
      dp.last_seen_at,
      dp.updated_at,
      d.platform_dispensary_id
    FROM dutchie_products dp
    LEFT JOIN dispensaries d ON d.id = dp.dispensary_id
    WHERE dp.dispensary_id = $1`,
    [dispensaryId]
  );

  if (verbose) {
    console.log(`[IncrSync] Found ${products.length} products for dispensary ${dispensaryId}`);
  }

  // Process in batches
  for (let i = 0; i < products.length; i += BATCH_SIZE) {
    const batch = products.slice(i, i + BATCH_SIZE);

    for (const p of batch) {
      try {
        const thcPercent = parseFloat(p.thc) || parseFloat(p.thc_content) || null;
        const cbdPercent = parseFloat(p.cbd) || parseFloat(p.cbd_content) || null;
        const stockStatus = p.stock_status || 'unknown';
        const isInStock = stockStatus === 'in_stock' || stockStatus === 'unknown';

        if (dryRun) {
          if (verbose) {
            console.log(`[IncrSync][DryRun] Would upsert product ${p.external_product_id}`);
          }
          newCount++;
          continue;
        }

        const result = await pool.query(
          `INSERT INTO store_products (
            dispensary_id, state_id, provider, provider_product_id,
            provider_brand_id, provider_dispensary_id, enterprise_product_id,
            legacy_dutchie_product_id,
            name, brand_name, category, subcategory, product_type, strain_type,
            description, effects, cannabinoids,
            thc_percent, cbd_percent, thc_content_text, cbd_content_text,
            is_in_stock, stock_status, stock_quantity,
            total_quantity_available, total_kiosk_quantity_available,
            image_url, local_image_url, local_image_thumb_url, local_image_medium_url,
            original_image_url, additional_images,
            is_on_special, is_featured, medical_only, rec_only,
            is_below_threshold, is_below_kiosk_threshold,
            platform_status, c_name, weight, options, measurements,
            first_seen_at, last_seen_at, updated_at
          ) VALUES (
            $1, $2, 'dutchie', $3,
            $4, $5, $6,
            $7,
            $8, $9, $10, $11, $12, $13,
            $14, $15, $16,
            $17, $18, $19, $20,
            $21, $22, $23,
            $24, $25,
            $26, $27, $28, $29,
            $30, $31,
            $32, $33, $34, $35,
            $36, $37,
            $38, $39, $40, $41, $42,
            $43, $44, NOW()
          )
          ON CONFLICT (dispensary_id, provider, provider_product_id)
          DO UPDATE SET
            legacy_dutchie_product_id = EXCLUDED.legacy_dutchie_product_id,
            name = EXCLUDED.name,
            brand_name = EXCLUDED.brand_name,
            category = EXCLUDED.category,
            subcategory = EXCLUDED.subcategory,
            is_in_stock = EXCLUDED.is_in_stock,
            stock_status = EXCLUDED.stock_status,
            thc_percent = EXCLUDED.thc_percent,
            cbd_percent = EXCLUDED.cbd_percent,
            image_url = EXCLUDED.image_url,
            local_image_url = EXCLUDED.local_image_url,
            is_on_special = EXCLUDED.is_on_special,
            platform_status = EXCLUDED.platform_status,
            last_seen_at = NOW(),
            updated_at = NOW()
          RETURNING (xmax = 0) as is_new`,
          [
            dispensaryId,
            stateId,
            p.external_product_id,
            p.brand_id,
            p.platform_dispensary_id,
            p.enterprise_product_id,
            p.id,
            p.name,
            p.brand_name,
            p.category || p.type,
            p.subcategory,
            p.type,
            p.strain_type,
            p.description,
            p.effects,
            p.cannabinoids_v2,
            thcPercent,
            cbdPercent,
            p.thc_content,
            p.cbd_content,
            isInStock,
            stockStatus,
            p.total_quantity_available,
            p.total_quantity_available,
            p.total_kiosk_quantity_available,
            p.primary_image_url,
            p.local_image_url,
            p.local_image_thumb_url,
            p.local_image_medium_url,
            p.original_image_url,
            p.additional_images,
            p.special || false,
            p.featured || false,
            p.medical_only || false,
            p.rec_only || false,
            p.is_below_threshold || false,
            p.is_below_kiosk_threshold || false,
            p.status,
            p.c_name,
            p.weight,
            p.options,
            p.measurements,
            p.first_seen_at || p.updated_at,
            p.last_seen_at || p.updated_at,
          ]
        );

        if (result.rows[0]?.is_new) {
          newCount++;
        } else {
          updatedCount++;
        }
      } catch (error: any) {
        errors.push(`Product ${p.id}: ${error.message}`);
      }
    }
  }

  return {
    upserted: newCount + updatedCount,
    new: newCount,
    updated: updatedCount,
    errors,
  };
}

// ============================================================
// SYNC SNAPSHOTS TO CANONICAL
// ============================================================

/**
 * Sync dutchie_product_snapshots to store_product_snapshots for recent crawls.
 */
export async function syncSnapshotsToCanonical(
  pool: Pool,
  dispensaryId: number,
  stateId: number | null,
  crawlRunId: number | null,
  since: Date,
  options: SyncOptions = {}
): Promise<{ created: number; errors: string[] }> {
  const { dryRun = false, verbose = false } = options;
  const errors: string[] = [];
  let created = 0;

  // Get recent snapshots that haven't been synced yet
  const { rows: snapshots } = await pool.query(
    `SELECT
      dps.id,
      dps.dutchie_product_id,
      dps.dispensary_id,
      dps.options,
      dps.raw_product_data,
      dps.crawled_at,
      dps.created_at,
      dp.external_product_id,
      dp.name,
      dp.brand_name,
      dp.category,
      dp.subcategory,
      sp.id as store_product_id,
      d.platform_dispensary_id
    FROM dutchie_product_snapshots dps
    JOIN dutchie_products dp ON dp.id = dps.dutchie_product_id
    LEFT JOIN store_products sp ON sp.dispensary_id = dps.dispensary_id
      AND sp.provider_product_id = dp.external_product_id
      AND sp.provider = 'dutchie'
    LEFT JOIN dispensaries d ON d.id = dps.dispensary_id
    LEFT JOIN store_product_snapshots sps ON sps.legacy_snapshot_id = dps.id
    WHERE dps.dispensary_id = $1
      AND dps.crawled_at >= $2
      AND sps.id IS NULL
    ORDER BY dps.id`,
    [dispensaryId, since]
  );

  if (verbose) {
    console.log(`[IncrSync] Found ${snapshots.length} new snapshots since ${since.toISOString()}`);
  }

  if (snapshots.length === 0) {
    return { created: 0, errors: [] };
  }

  for (const s of snapshots) {
    try {
      // Extract pricing from raw_product_data
      let priceRec: number | null = null;
      let priceMed: number | null = null;
      let priceRecSpecial: number | null = null;
      let isOnSpecial = false;
      let isInStock = true;
      let thcPercent: number | null = null;
      let cbdPercent: number | null = null;
      let stockStatus = 'unknown';
      let platformStatus: string | null = null;

      if (s.raw_product_data) {
        const raw = typeof s.raw_product_data === 'string'
          ? JSON.parse(s.raw_product_data)
          : s.raw_product_data;

        priceRec = raw.recPrices?.[0] || raw.Prices?.[0] || null;
        priceMed = raw.medicalPrices?.[0] || null;
        priceRecSpecial = raw.recSpecialPrices?.[0] || null;
        isOnSpecial = raw.special === true || (priceRecSpecial !== null);
        thcPercent = raw.THCContent?.range?.[0] || raw.THC || null;
        cbdPercent = raw.CBDContent?.range?.[0] || raw.CBD || null;
        platformStatus = raw.Status || null;
        isInStock = platformStatus === 'Active';
        stockStatus = isInStock ? 'in_stock' : 'out_of_stock';
      }

      if (dryRun) {
        if (verbose) {
          console.log(`[IncrSync][DryRun] Would create snapshot for legacy ${s.id}`);
        }
        created++;
        continue;
      }

      await pool.query(
        `INSERT INTO store_product_snapshots (
          dispensary_id, store_product_id, state_id,
          provider, provider_product_id, provider_dispensary_id,
          crawl_run_id,
          legacy_snapshot_id, legacy_dutchie_product_id,
          captured_at,
          name, brand_name, category, subcategory,
          price_rec, price_med, price_rec_special,
          is_on_special, is_in_stock, stock_status,
          thc_percent, cbd_percent,
          platform_status, options, raw_data,
          created_at
        ) VALUES (
          $1, $2, $3,
          'dutchie', $4, $5,
          $6,
          $7, $8,
          $9,
          $10, $11, $12, $13,
          $14, $15, $16,
          $17, $18, $19,
          $20, $21,
          $22, $23, $24,
          NOW()
        )`,
        [
          s.dispensary_id,
          s.store_product_id,
          stateId,
          s.external_product_id,
          s.platform_dispensary_id,
          crawlRunId,
          s.id,
          s.dutchie_product_id,
          s.crawled_at,
          s.name,
          s.brand_name,
          s.category,
          s.subcategory,
          priceRec,
          priceMed,
          priceRecSpecial,
          isOnSpecial,
          isInStock,
          stockStatus,
          thcPercent,
          cbdPercent,
          platformStatus,
          s.options,
          s.raw_product_data,
        ]
      );

      created++;
    } catch (error: any) {
      errors.push(`Snapshot ${s.id}: ${error.message}`);
    }
  }

  return { created, errors };
}

// ============================================================
// MAIN SYNC FUNCTION
// ============================================================

/**
 * Sync a single crawl result to canonical tables.
 * Call this from the crawler after each crawl completes.
 */
export async function syncCrawlToCanonical(
  pool: Pool,
  crawlResult: CrawlResult,
  options: SyncOptions = {}
): Promise<SyncResult> {
  const startTime = Date.now();
  const errors: string[] = [];
  const { verbose = false, skipSnapshots = false } = options;

  if (verbose) {
    console.log(`[IncrSync] Starting sync for dispensary ${crawlResult.dispensaryId}`);
  }

  // 1. Create crawl_run record
  const crawlRunId = await getOrCreateCrawlRun(pool, crawlResult, options);

  // 2. Sync products
  const productResult = await syncProductsToCanonical(
    pool,
    crawlResult.dispensaryId,
    crawlResult.stateId || null,
    crawlRunId,
    options
  );
  errors.push(...productResult.errors);

  // 3. Sync snapshots (if not skipped)
  let snapshotsCreated = 0;
  if (!skipSnapshots) {
    const since = new Date(crawlResult.startedAt.getTime() - 60 * 1000); // 1 min before
    const snapshotResult = await syncSnapshotsToCanonical(
      pool,
      crawlResult.dispensaryId,
      crawlResult.stateId || null,
      crawlRunId,
      since,
      options
    );
    snapshotsCreated = snapshotResult.created;
    errors.push(...snapshotResult.errors);
  }

  const durationMs = Date.now() - startTime;

  if (verbose) {
    console.log(`[IncrSync] Completed in ${durationMs}ms: ${productResult.upserted} products, ${snapshotsCreated} snapshots`);
  }

  return {
    crawlRunId,
    productsUpserted: productResult.upserted,
    productsNew: productResult.new,
    productsUpdated: productResult.updated,
    snapshotsCreated,
    durationMs,
    errors,
  };
}

// ============================================================
// BATCH SYNC FOR RECENT CRAWLS
// ============================================================

export interface RecentSyncOptions extends SyncOptions {
  since?: string; // e.g., '1 hour', '30 minutes', '1 day'
  dispensaryId?: number;
  limit?: number;
}

/**
 * Sync recent crawls that haven't been synced yet.
 * Run this as a background job to catch any missed syncs.
 */
export async function syncRecentCrawls(
  pool: Pool,
  options: RecentSyncOptions = {}
): Promise<{ synced: number; errors: string[] }> {
  const {
    since = '1 hour',
    dispensaryId,
    limit = 100,
    verbose = false,
    dryRun = false,
  } = options;

  const errors: string[] = [];
  let synced = 0;

  // Find recent completed crawl jobs that don't have a crawl_run
  let query = `
    SELECT
      dcj.id as crawl_job_id,
      dcj.dispensary_id,
      dcj.status,
      dcj.started_at,
      dcj.completed_at,
      dcj.products_found,
      dcj.products_created,
      dcj.products_updated,
      dcj.brands_found,
      dcj.error_message,
      d.state_id
    FROM dispensary_crawl_jobs dcj
    LEFT JOIN dispensaries d ON d.id = dcj.dispensary_id
    LEFT JOIN crawl_runs cr ON cr.legacy_dispensary_crawl_job_id = dcj.id
    WHERE dcj.status IN ('completed', 'failed')
      -- NOTE: 'since' is interpolated into the SQL rather than parameterized;
      -- it must come from trusted internal config (e.g. '1 hour'), never user input
      AND dcj.started_at > NOW() - INTERVAL '${since}'
      AND cr.id IS NULL
  `;

  const params: any[] = [];
  let paramIdx = 1;

  if (dispensaryId) {
    query += ` AND dcj.dispensary_id = $${paramIdx}`;
    params.push(dispensaryId);
    paramIdx++;
  }

  query += ` ORDER BY dcj.started_at DESC LIMIT $${paramIdx}`;
  params.push(limit);

  const { rows: unsynced } = await pool.query(query, params);

  if (verbose) {
    console.log(`[IncrSync] Found ${unsynced.length} unsynced crawls from last ${since}`);
  }

  for (const job of unsynced) {
    try {
      const crawlResult: CrawlResult = {
        dispensaryId: job.dispensary_id,
        stateId: job.state_id,
        crawlJobId: job.crawl_job_id,
        startedAt: new Date(job.started_at),
        finishedAt: job.completed_at ? new Date(job.completed_at) : undefined,
        status: job.status === 'completed' ? 'success' : 'failed',
        errorMessage: job.error_message,
        productsFound: job.products_found || 0,
        productsCreated: job.products_created || 0,
        productsUpdated: job.products_updated || 0,
        brandsFound: job.brands_found || 0,
      };

      await syncCrawlToCanonical(pool, crawlResult, { dryRun, verbose });
      synced++;
    } catch (error: any) {
      errors.push(`Job ${job.crawl_job_id}: ${error.message}`);
    }
  }

  return { synced, errors };
}

// ============================================================
// EXPORTS
// ============================================================

// CrawlResult, SyncOptions, and SyncResult are already exported at their
// declarations above; repeating them in an `export { ... }` list here would
// conflict with those declarations (TS2484), so no re-export block is needed.
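
A background-worker sketch for the async mode described in this file's header comment (illustrative only, not part of the commit; the 10-minute sweep interval is an arbitrary example value):

import { Pool } from 'pg';
import { syncRecentCrawls } from './incremental-sync';

export function startIncrementalSyncWorker(pool: Pool): NodeJS.Timeout {
  // Periodically sweep for completed crawl jobs that never got a crawl_run
  return setInterval(() => {
    syncRecentCrawls(pool, { since: '1 hour', limit: 100, verbose: true })
      .then(({ synced, errors }) => {
        console.log(`[IncrSync] swept ${synced} crawls, ${errors.length} errors`);
      })
      .catch((err) => console.error('[IncrSync] sweep failed:', err));
  }, 10 * 60 * 1000);
}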
backend/src/hydration/index.ts (new file, 96 lines)
@@ -0,0 +1,96 @@
/**
 * Hydration Module
 *
 * Central export for the raw payload → canonical hydration pipeline.
 *
 * Components:
 * - Payload Store: Store and retrieve raw payloads
 * - Normalizers: Platform-specific JSON → canonical format converters
 * - Canonical Upsert: Write normalized data to canonical tables
 * - Worker: Process payloads in batches with locking
 * - Backfill: Import historical data
 * - Producer: Hook for crawlers to store payloads
 */

// Types
export * from './types';

// Payload storage
export {
  storeRawPayload,
  getUnprocessedPayloads,
  markPayloadProcessed,
  markPayloadFailed,
  getPayloadById,
  getPayloadsForDispensary,
  getPayloadStats,
} from './payload-store';

// Normalizers
export {
  getNormalizer,
  getRegisteredPlatforms,
  isPlatformSupported,
  DutchieNormalizer,
  INormalizer,
  BaseNormalizer,
} from './normalizers';

// Canonical upserts
export {
  upsertStoreProducts,
  createStoreProductSnapshots,
  markDiscontinuedProducts,
  upsertBrands,
  hydrateToCanonical,
} from './canonical-upsert';

// Locking
export {
  HydrationLockManager,
  LOCK_NAMES,
} from './locking';

// Worker
export {
  HydrationWorker,
  runHydrationBatch,
  processPayloadById,
  reprocessFailedPayloads,
} from './worker';

// Backfill
export {
  runBackfill,
  backfillFromDutchieProducts,
  backfillFromSnapshots,
  backfillFromCacheFiles,
  BackfillOptions,
  BackfillResult,
} from './backfill';

// Producer
export {
  producePayload,
  createProducer,
  onCrawlComplete,
  ProducerOptions,
} from './producer';

// Legacy Backfill
export {
  runLegacyBackfill,
} from './legacy-backfill';

// Incremental Sync
export {
  syncCrawlToCanonical,
  syncRecentCrawls,
  syncProductsToCanonical,
  syncSnapshotsToCanonical,
  getOrCreateCrawlRun,
  CrawlResult,
  SyncOptions,
  SyncResult,
  RecentSyncOptions,
} from './incremental-sync';
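
Example of consuming the barrel above from a crawler (a sketch, not part of the commit; the field values mirror the storeRawPayload call sites in backfill.ts earlier in this diff, and `products` is a hypothetical crawled array):

import { Pool } from 'pg';
import { storeRawPayload } from './hydration';

async function storeCrawl(pool: Pool, dispensaryId: number, products: unknown[]) {
  // Persist the raw crawl payload; a hydration worker normalizes it later
  await storeRawPayload(pool, {
    dispensaryId,
    platform: 'dutchie',
    payloadVersion: 1,
    rawJson: { products },
    productCount: products.length,
    pricingType: 'rec',
    crawlMode: 'backfill', // only mode value visible in this diff; live crawls may use another
    fetchedAt: new Date(),
  });
}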
backend/src/hydration/legacy-backfill.ts (new file, 851 lines)
@@ -0,0 +1,851 @@
/**
 * Legacy Backfill Script
 *
 * Directly hydrates canonical tables from legacy dutchie_* tables.
 * This bypasses the payload-store and normalizer pipeline for efficiency.
 *
 * Source Tables (READ-ONLY):
 * - dutchie_products → store_products
 * - dutchie_product_snapshots → store_product_snapshots
 * - dispensary_crawl_jobs → crawl_runs
 *
 * This script is:
 * - IDEMPOTENT: Can be run multiple times safely
 * - BATCH-ORIENTED: Processes in chunks to avoid OOM
 * - RESUMABLE: Can start from a specific ID if interrupted
 *
 * Usage:
 *   npx tsx src/hydration/legacy-backfill.ts
 *   npx tsx src/hydration/legacy-backfill.ts --dispensary-id 123
 *   npx tsx src/hydration/legacy-backfill.ts --dry-run
 *   npx tsx src/hydration/legacy-backfill.ts --start-from 5000
 */

import { Pool } from 'pg';
import dotenv from 'dotenv';

dotenv.config();

// ============================================================
// CONFIGURATION
// ============================================================

const BATCH_SIZE = 100;

interface LegacyBackfillOptions {
  dryRun: boolean;
  dispensaryId?: number;
  startFromProductId?: number;
  startFromSnapshotId?: number;
  startFromJobId?: number;
  verbose: boolean;
}

interface LegacyBackfillStats {
  productsProcessed: number;
  productsInserted: number;
  productsUpdated: number;
  productsSkipped: number;
  productErrors: number;

  snapshotsProcessed: number;
  snapshotsInserted: number;
  snapshotsSkipped: number;
  snapshotErrors: number;

  crawlRunsProcessed: number;
  crawlRunsInserted: number;
  crawlRunsSkipped: number;
  crawlRunErrors: number;

  startedAt: Date;
  completedAt?: Date;
  durationMs?: number;
}

// ============================================================
// DATABASE CONNECTION
// ============================================================

function getConnectionString(): string {
  if (process.env.CANNAIQ_DB_URL) {
    return process.env.CANNAIQ_DB_URL;
  }

  const host = process.env.CANNAIQ_DB_HOST;
  const port = process.env.CANNAIQ_DB_PORT;
  const name = process.env.CANNAIQ_DB_NAME;
  const user = process.env.CANNAIQ_DB_USER;
  const pass = process.env.CANNAIQ_DB_PASS;

  if (host && port && name && user && pass) {
    return `postgresql://${user}:${pass}@${host}:${port}/${name}`;
  }

  throw new Error('Missing CANNAIQ_DB_* environment variables');
}

// ============================================================
// STEP 1: HYDRATE CRAWL RUNS FROM dispensary_crawl_jobs
// ============================================================

async function hydrateCrawlRuns(
  pool: Pool,
  options: LegacyBackfillOptions,
  stats: LegacyBackfillStats
): Promise<Map<number, number>> {
  console.log('\n=== STEP 1: Hydrate crawl_runs from dispensary_crawl_jobs ===');

  // Map from legacy job ID to canonical crawl_run ID
  const jobToCrawlRunMap = new Map<number, number>();

  // Build query
  let query = `
    SELECT
      dcj.id,
      dcj.dispensary_id,
      dcj.schedule_id,
      dcj.status,
      dcj.job_type,
      dcj.started_at,
      dcj.completed_at,
      dcj.products_found,
      dcj.products_created,
      dcj.products_updated,
      dcj.brands_found,
      dcj.error_message,
      dcj.retry_count,
      dcj.created_at,
      d.state_id
    FROM dispensary_crawl_jobs dcj
    LEFT JOIN dispensaries d ON d.id = dcj.dispensary_id
    WHERE dcj.status IN ('completed', 'failed')
      AND dcj.started_at IS NOT NULL
  `;

  const params: any[] = [];
  let paramIndex = 1;

  if (options.dispensaryId) {
    query += ` AND dcj.dispensary_id = $${paramIndex}`;
    params.push(options.dispensaryId);
    paramIndex++;
  }

  if (options.startFromJobId) {
    query += ` AND dcj.id >= $${paramIndex}`;
    params.push(options.startFromJobId);
    paramIndex++;
  }

  query += ` ORDER BY dcj.id`;

  const { rows: jobs } = await pool.query(query, params);
  console.log(` Found ${jobs.length} crawl jobs to hydrate`);

  for (const job of jobs) {
    stats.crawlRunsProcessed++;

    try {
      // Check if already hydrated
      const existing = await pool.query(
        `SELECT id FROM crawl_runs WHERE legacy_dispensary_crawl_job_id = $1`,
        [job.id]
      );

      if (existing.rows.length > 0) {
        jobToCrawlRunMap.set(job.id, existing.rows[0].id);
        stats.crawlRunsSkipped++;
        continue;
      }

      if (options.dryRun) {
        if (options.verbose) {
          console.log(` [DryRun] Would insert crawl_run for job ${job.id}`);
        }
        stats.crawlRunsInserted++;
        continue;
      }

      // Calculate duration
      const durationMs = job.completed_at && job.started_at
        ? new Date(job.completed_at).getTime() - new Date(job.started_at).getTime()
        : null;

      // Map status
      const status = job.status === 'completed' ? 'success' : 'failed';

      // Insert crawl_run
      const result = await pool.query(
        `INSERT INTO crawl_runs (
          dispensary_id, state_id, provider,
          legacy_dispensary_crawl_job_id, schedule_id, job_type,
          started_at, finished_at, duration_ms,
          status, error_message,
          products_found, products_new, products_updated, products_missing,
          snapshots_written, brands_found,
          trigger_type, retry_count, created_at
        ) VALUES (
          $1, $2, 'dutchie',
          $3, $4, $5,
          $6, $7, $8,
          $9, $10,
          $11, $12, $13, 0,
          0, $14,
          'scheduled', $15, $16
        )
        RETURNING id`,
        [
          job.dispensary_id,
          job.state_id,
          job.id,
          job.schedule_id,
          job.job_type || 'full',
          job.started_at,
          job.completed_at,
          durationMs,
          status,
          job.error_message,
          job.products_found || 0,
          job.products_created || 0,
          job.products_updated || 0,
          job.brands_found || 0,
          job.retry_count || 0,
          job.created_at,
        ]
      );

      jobToCrawlRunMap.set(job.id, result.rows[0].id);
      stats.crawlRunsInserted++;

      if (options.verbose && stats.crawlRunsInserted % 100 === 0) {
        console.log(` Inserted ${stats.crawlRunsInserted} crawl runs...`);
      }
    } catch (error: any) {
      stats.crawlRunErrors++;
      console.error(` Error hydrating job ${job.id}: ${error.message}`);
    }
  }

  console.log(` Crawl runs: ${stats.crawlRunsInserted} inserted, ${stats.crawlRunsSkipped} skipped, ${stats.crawlRunErrors} errors`);
  return jobToCrawlRunMap;
}

// ============================================================
// STEP 2: HYDRATE STORE_PRODUCTS FROM dutchie_products
// ============================================================

async function hydrateStoreProducts(
  pool: Pool,
  options: LegacyBackfillOptions,
  stats: LegacyBackfillStats
): Promise<Map<number, number>> {
  console.log('\n=== STEP 2: Hydrate store_products from dutchie_products ===');

  // Map from legacy dutchie_product.id to canonical store_product.id
  const productIdMap = new Map<number, number>();

  // Get total count
  let countQuery = `SELECT COUNT(*) as cnt FROM dutchie_products`;
  const countParams: any[] = [];

  if (options.dispensaryId) {
    countQuery += ` WHERE dispensary_id = $1`;
    countParams.push(options.dispensaryId);
  }

  const { rows: countRows } = await pool.query(countQuery, countParams);
  const totalCount = parseInt(countRows[0].cnt, 10);
  console.log(` Total dutchie_products: ${totalCount}`);

  let offset = 0; // the original ternary (`startFromProductId ? 0 : 0`) always yielded 0; start-from filtering happens in the WHERE clause below
let processed = 0;
|
||||||
|
|
||||||
|
while (processed < totalCount) {
|
||||||
|
// Fetch batch
|
||||||
|
let query = `
|
||||||
|
SELECT
|
||||||
|
dp.id,
|
||||||
|
dp.dispensary_id,
|
||||||
|
dp.external_product_id,
|
||||||
|
dp.name,
|
||||||
|
dp.brand_name,
|
||||||
|
dp.brand_id,
|
||||||
|
dp.brand_logo_url,
|
||||||
|
dp.category,
|
||||||
|
dp.subcategory,
|
||||||
|
dp.strain_type,
|
||||||
|
dp.description,
|
||||||
|
dp.effects,
|
||||||
|
dp.thc,
|
||||||
|
dp.thc_content,
|
||||||
|
dp.cbd,
|
||||||
|
dp.cbd_content,
|
||||||
|
dp.cannabinoids_v2,
|
||||||
|
dp.primary_image_url,
|
||||||
|
dp.additional_images,
|
||||||
|
dp.local_image_url,
|
||||||
|
dp.local_image_thumb_url,
|
||||||
|
dp.local_image_medium_url,
|
||||||
|
dp.original_image_url,
|
||||||
|
dp.stock_status,
|
||||||
|
dp.type,
|
||||||
|
dp.c_name,
|
||||||
|
dp.enterprise_product_id,
|
||||||
|
dp.weight,
|
||||||
|
dp.options,
|
||||||
|
dp.measurements,
|
||||||
|
dp.status,
|
||||||
|
dp.featured,
|
||||||
|
dp.special,
|
||||||
|
dp.medical_only,
|
||||||
|
dp.rec_only,
|
||||||
|
dp.is_below_threshold,
|
||||||
|
dp.is_below_kiosk_threshold,
|
||||||
|
dp.total_quantity_available,
|
||||||
|
dp.total_kiosk_quantity_available,
|
||||||
|
dp.first_seen_at,
|
||||||
|
dp.last_seen_at,
|
||||||
|
dp.created_at,
|
||||||
|
dp.updated_at,
|
||||||
|
d.state_id,
|
||||||
|
d.platform_dispensary_id
|
||||||
|
FROM dutchie_products dp
|
||||||
|
LEFT JOIN dispensaries d ON d.id = dp.dispensary_id
|
||||||
|
`;
|
||||||
|
|
||||||
|
const params: any[] = [];
|
||||||
|
let paramIndex = 1;
|
||||||
|
|
||||||
|
if (options.dispensaryId) {
|
||||||
|
query += ` WHERE dp.dispensary_id = $${paramIndex}`;
|
||||||
|
params.push(options.dispensaryId);
|
||||||
|
paramIndex++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (options.startFromProductId && processed === 0) {
|
||||||
|
query += options.dispensaryId ? ` AND` : ` WHERE`;
|
||||||
|
query += ` dp.id >= $${paramIndex}`;
|
||||||
|
params.push(options.startFromProductId);
|
||||||
|
paramIndex++;
|
||||||
|
}
|
||||||
|
|
||||||
|
query += ` ORDER BY dp.id LIMIT $${paramIndex} OFFSET $${paramIndex + 1}`;
|
||||||
|
params.push(BATCH_SIZE, offset);
|
||||||
|
|
||||||
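    // NOTE: LIMIT/OFFSET pagination rescans all skipped rows on every batch,
    // so each batch gets slower as the offset grows. A keyset variant (a
    // sketch, assuming dp.id is a monotonically increasing primary key) would
    // be: WHERE dp.id > $lastSeenId ORDER BY dp.id LIMIT $batchSize.
    // The offset-based scan is kept here as written.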
    const { rows: products } = await pool.query(query, params);

    if (products.length === 0) break;

    for (const p of products) {
      stats.productsProcessed++;

      try {
        // Check if already hydrated by legacy ID
        const existingByLegacy = await pool.query(
          `SELECT id FROM store_products WHERE legacy_dutchie_product_id = $1`,
          [p.id]
        );

        if (existingByLegacy.rows.length > 0) {
          productIdMap.set(p.id, existingByLegacy.rows[0].id);
          stats.productsSkipped++;
          continue;
        }

        // Parse THC/CBD percent from text
        const thcPercent = parseFloat(p.thc) || parseFloat(p.thc_content) || null;
        const cbdPercent = parseFloat(p.cbd) || parseFloat(p.cbd_content) || null;
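        // parseFloat of a missing value yields NaN, which is falsy, so each
        // candidate above falls through to the next and finally to null.
        // Caveat: a literal 0 reading is also falsy and falls through the
        // same way.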

        // Determine stock status
        const stockStatus = p.stock_status || 'unknown';
        const isInStock = stockStatus === 'in_stock' || stockStatus === 'unknown';

        if (options.dryRun) {
          if (options.verbose) {
            console.log(`  [DryRun] Would upsert store_product for legacy ID ${p.id}`);
          }
          stats.productsInserted++;
          continue;
        }

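        // The upsert below reports insert-vs-update via PostgreSQL's xmax
        // system column: (xmax = 0) is true for a freshly inserted row and
        // false when ON CONFLICT ... DO UPDATE rewrote an existing one, which
        // is what feeds the is_new flag used for the stats counters.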
        // Upsert store_product
        const result = await pool.query(
          `INSERT INTO store_products (
            dispensary_id, state_id, provider, provider_product_id,
            provider_brand_id, provider_dispensary_id, enterprise_product_id,
            legacy_dutchie_product_id,
            name, brand_name, category, subcategory, product_type, strain_type,
            description, effects, cannabinoids,
            thc_percent, cbd_percent, thc_content_text, cbd_content_text,
            is_in_stock, stock_status, stock_quantity,
            total_quantity_available, total_kiosk_quantity_available,
            image_url, local_image_url, local_image_thumb_url, local_image_medium_url,
            original_image_url, additional_images,
            is_on_special, is_featured, medical_only, rec_only,
            is_below_threshold, is_below_kiosk_threshold,
            platform_status, c_name, weight, options, measurements,
            first_seen_at, last_seen_at, created_at, updated_at
          ) VALUES (
            $1, $2, 'dutchie', $3,
            $4, $5, $6,
            $7,
            $8, $9, $10, $11, $12, $13,
            $14, $15, $16,
            $17, $18, $19, $20,
            $21, $22, $23,
            $24, $25,
            $26, $27, $28, $29,
            $30, $31,
            $32, $33, $34, $35,
            $36, $37,
            $38, $39, $40, $41, $42,
            $43, $44, $45, $46
          )
          ON CONFLICT (dispensary_id, provider, provider_product_id)
          DO UPDATE SET
            legacy_dutchie_product_id = EXCLUDED.legacy_dutchie_product_id,
            name = EXCLUDED.name,
            brand_name = EXCLUDED.brand_name,
            category = EXCLUDED.category,
            subcategory = EXCLUDED.subcategory,
            is_in_stock = EXCLUDED.is_in_stock,
            stock_status = EXCLUDED.stock_status,
            last_seen_at = EXCLUDED.last_seen_at,
            updated_at = NOW()
          RETURNING id, (xmax = 0) as is_new`,
          [
            p.dispensary_id,
            p.state_id,
            p.external_product_id,
            p.brand_id,
            p.platform_dispensary_id,
            p.enterprise_product_id,
            p.id, // legacy_dutchie_product_id
            p.name,
            p.brand_name,
            p.category || p.type,
            p.subcategory,
            p.type,
            p.strain_type,
            p.description,
            p.effects,
            p.cannabinoids_v2,
            thcPercent,
            cbdPercent,
            p.thc_content,
            p.cbd_content,
            isInStock,
            stockStatus,
            p.total_quantity_available,
            p.total_quantity_available,
            p.total_kiosk_quantity_available,
            p.primary_image_url,
            p.local_image_url,
            p.local_image_thumb_url,
            p.local_image_medium_url,
            p.original_image_url,
            p.additional_images,
            p.special || false,
            p.featured || false,
            p.medical_only || false,
            p.rec_only || false,
            p.is_below_threshold || false,
            p.is_below_kiosk_threshold || false,
            p.status,
            p.c_name,
            p.weight,
            p.options,
            p.measurements,
            p.first_seen_at || p.created_at,
            p.last_seen_at || p.updated_at,
            p.created_at,
            p.updated_at,
          ]
        );

        productIdMap.set(p.id, result.rows[0].id);

        if (result.rows[0].is_new) {
          stats.productsInserted++;
        } else {
          stats.productsUpdated++;
        }
      } catch (error: any) {
        stats.productErrors++;
        if (options.verbose) {
          console.error(`  Error hydrating product ${p.id}: ${error.message}`);
        }
      }
    }

    offset += BATCH_SIZE;
    processed += products.length;
    console.log(`  Processed ${processed}/${totalCount} products...`);
  }

  console.log(`  Products: ${stats.productsInserted} inserted, ${stats.productsUpdated} updated, ${stats.productsSkipped} skipped, ${stats.productErrors} errors`);
  return productIdMap;
}

// ============================================================
// STEP 3: HYDRATE STORE_PRODUCT_SNAPSHOTS FROM dutchie_product_snapshots
// ============================================================

async function hydrateSnapshots(
  pool: Pool,
  options: LegacyBackfillOptions,
  stats: LegacyBackfillStats,
  productIdMap: Map<number, number>,
  _jobToCrawlRunMap: Map<number, number>
): Promise<void> {
  console.log('\n=== STEP 3: Hydrate store_product_snapshots from dutchie_product_snapshots ===');

  // Get total count
  let countQuery = `SELECT COUNT(*) as cnt FROM dutchie_product_snapshots`;
  const countParams: any[] = [];

  if (options.dispensaryId) {
    countQuery += ` WHERE dispensary_id = $1`;
    countParams.push(options.dispensaryId);
  }

  const { rows: countRows } = await pool.query(countQuery, countParams);
  const totalCount = parseInt(countRows[0].cnt, 10);
  console.log(`  Total dutchie_product_snapshots: ${totalCount}`);

  if (totalCount === 0) {
    console.log('  No snapshots to hydrate');
    return;
  }

  let offset = 0;
  let processed = 0;

  while (processed < totalCount) {
    // Fetch batch with product info
    let query = `
      SELECT
        dps.id,
        dps.dutchie_product_id,
        dps.dispensary_id,
        dps.options,
        dps.raw_product_data,
        dps.crawled_at,
        dps.created_at,
        dp.external_product_id,
        dp.name,
        dp.brand_name,
        dp.category,
        dp.subcategory,
        d.state_id,
        d.platform_dispensary_id
      FROM dutchie_product_snapshots dps
      JOIN dutchie_products dp ON dp.id = dps.dutchie_product_id
      LEFT JOIN dispensaries d ON d.id = dps.dispensary_id
    `;

    const params: any[] = [];
    let paramIndex = 1;

    if (options.dispensaryId) {
      query += ` WHERE dps.dispensary_id = $${paramIndex}`;
      params.push(options.dispensaryId);
      paramIndex++;
    }

    if (options.startFromSnapshotId) {
      // Apply on every batch, for the same OFFSET reason as the product loop.
      query += options.dispensaryId ? ` AND` : ` WHERE`;
      query += ` dps.id >= $${paramIndex}`;
      params.push(options.startFromSnapshotId);
      paramIndex++;
    }

    query += ` ORDER BY dps.id LIMIT $${paramIndex} OFFSET $${paramIndex + 1}`;
    params.push(BATCH_SIZE, offset);

    const { rows: snapshots } = await pool.query(query, params);

    if (snapshots.length === 0) break;

    for (const s of snapshots) {
      stats.snapshotsProcessed++;

      try {
        // Check if already hydrated
        const existing = await pool.query(
          `SELECT 1 FROM store_product_snapshots WHERE legacy_snapshot_id = $1`,
          [s.id]
        );

        if (existing.rows.length > 0) {
          stats.snapshotsSkipped++;
          continue;
        }

        // Get canonical store_product_id
        const storeProductId = productIdMap.get(s.dutchie_product_id);
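        // If the product was never hydrated, productIdMap has no entry and
        // storeProductId is undefined; node-postgres serializes undefined as
        // NULL, so the snapshot row is still written, just without a
        // store_product_id link back to the canonical product.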

        // Extract pricing from raw_product_data if available
        let priceRec: number | null = null;
        let priceMed: number | null = null;
        let priceRecSpecial: number | null = null;
        let isOnSpecial = false;
        let isInStock = true;
        let thcPercent: number | null = null;
        let cbdPercent: number | null = null;
        let stockStatus = 'unknown';
        let platformStatus: string | null = null;

        if (s.raw_product_data) {
          const raw = typeof s.raw_product_data === 'string'
            ? JSON.parse(s.raw_product_data)
            : s.raw_product_data;

          priceRec = raw.recPrices?.[0] || raw.Prices?.[0] || null;
          priceMed = raw.medicalPrices?.[0] || null;
          priceRecSpecial = raw.recSpecialPrices?.[0] || null;
          isOnSpecial = raw.special === true || (priceRecSpecial !== null);
          thcPercent = raw.THCContent?.range?.[0] || raw.THC || null;
          cbdPercent = raw.CBDContent?.range?.[0] || raw.CBD || null;
          platformStatus = raw.Status || null;
          isInStock = platformStatus === 'Active';
          stockStatus = isInStock ? 'in_stock' : 'out_of_stock';
        }

        if (options.dryRun) {
          if (options.verbose) {
            console.log(`  [DryRun] Would insert snapshot for legacy ID ${s.id}`);
          }
          stats.snapshotsInserted++;
          continue;
        }

        // Insert snapshot
        await pool.query(
          `INSERT INTO store_product_snapshots (
            dispensary_id, store_product_id, state_id,
            provider, provider_product_id, provider_dispensary_id,
            legacy_snapshot_id, legacy_dutchie_product_id,
            captured_at,
            name, brand_name, category, subcategory,
            price_rec, price_med, price_rec_special,
            is_on_special, is_in_stock, stock_status,
            thc_percent, cbd_percent,
            platform_status, options, raw_data,
            created_at
          ) VALUES (
            $1, $2, $3,
            'dutchie', $4, $5,
            $6, $7,
            $8,
            $9, $10, $11, $12,
            $13, $14, $15,
            $16, $17, $18,
            $19, $20,
            $21, $22, $23,
            $24
          )`,
          [
            s.dispensary_id,
            storeProductId,
            s.state_id,
            s.external_product_id,
            s.platform_dispensary_id,
            s.id, // legacy_snapshot_id
            s.dutchie_product_id,
            s.crawled_at,
            s.name,
            s.brand_name,
            s.category,
            s.subcategory,
            priceRec,
            priceMed,
            priceRecSpecial,
            isOnSpecial,
            isInStock,
            stockStatus,
            thcPercent,
            cbdPercent,
            platformStatus,
            s.options,
            s.raw_product_data,
            s.created_at,
          ]
        );

        stats.snapshotsInserted++;
      } catch (error: any) {
        stats.snapshotErrors++;
        if (options.verbose) {
          console.error(`  Error hydrating snapshot ${s.id}: ${error.message}`);
        }
      }
    }

    offset += BATCH_SIZE;
    processed += snapshots.length;

    if (processed % 1000 === 0) {
      console.log(`  Processed ${processed}/${totalCount} snapshots...`);
    }
  }

  console.log(`  Snapshots: ${stats.snapshotsInserted} inserted, ${stats.snapshotsSkipped} skipped, ${stats.snapshotErrors} errors`);
}

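// Re-runs of steps 2 and 3 are idempotent at the row level: each keys off a
// legacy_* id (legacy_dutchie_product_id, legacy_snapshot_id), so rows that
// were already hydrated are re-mapped or counted as skipped, not duplicated.
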
// ============================================================
// MAIN BACKFILL FUNCTION
// ============================================================

export async function runLegacyBackfill(
  pool: Pool,
  options: LegacyBackfillOptions
): Promise<LegacyBackfillStats> {
  const stats: LegacyBackfillStats = {
    productsProcessed: 0,
    productsInserted: 0,
    productsUpdated: 0,
    productsSkipped: 0,
    productErrors: 0,
    snapshotsProcessed: 0,
    snapshotsInserted: 0,
    snapshotsSkipped: 0,
    snapshotErrors: 0,
    crawlRunsProcessed: 0,
    crawlRunsInserted: 0,
    crawlRunsSkipped: 0,
    crawlRunErrors: 0,
    startedAt: new Date(),
  };

  console.log('============================================================');
  console.log('Legacy → Canonical Hydration Backfill');
  console.log('============================================================');
  console.log(`Mode: ${options.dryRun ? 'DRY RUN' : 'LIVE'}`);
  if (options.dispensaryId) {
    console.log(`Dispensary: ${options.dispensaryId}`);
  }
  console.log(`Batch size: ${BATCH_SIZE}`);
  console.log('');

  try {
    // Step 1: Hydrate crawl_runs
    const jobToCrawlRunMap = await hydrateCrawlRuns(pool, options, stats);

    // Step 2: Hydrate store_products
    const productIdMap = await hydrateStoreProducts(pool, options, stats);

    // Step 3: Hydrate store_product_snapshots
    await hydrateSnapshots(pool, options, stats, productIdMap, jobToCrawlRunMap);

    stats.completedAt = new Date();
    stats.durationMs = stats.completedAt.getTime() - stats.startedAt.getTime();

    console.log('\n============================================================');
    console.log('SUMMARY');
    console.log('============================================================');
    console.log(`Duration: ${(stats.durationMs / 1000).toFixed(1)}s`);
    console.log('');
    console.log('Crawl Runs:');
    console.log(`  Processed: ${stats.crawlRunsProcessed}`);
    console.log(`  Inserted: ${stats.crawlRunsInserted}`);
    console.log(`  Skipped: ${stats.crawlRunsSkipped}`);
    console.log(`  Errors: ${stats.crawlRunErrors}`);
    console.log('');
    console.log('Products:');
    console.log(`  Processed: ${stats.productsProcessed}`);
    console.log(`  Inserted: ${stats.productsInserted}`);
    console.log(`  Updated: ${stats.productsUpdated}`);
    console.log(`  Skipped: ${stats.productsSkipped}`);
    console.log(`  Errors: ${stats.productErrors}`);
    console.log('');
    console.log('Snapshots:');
    console.log(`  Processed: ${stats.snapshotsProcessed}`);
    console.log(`  Inserted: ${stats.snapshotsInserted}`);
    console.log(`  Skipped: ${stats.snapshotsSkipped}`);
    console.log(`  Errors: ${stats.snapshotErrors}`);
    console.log('');

    return stats;
  } catch (error: any) {
    console.error('\nFATAL ERROR:', error.message);
    throw error;
  }
}

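// Programmatic usage (a sketch; assumes CANNAIQ_DB_URL points at the
// dutchie_menus database, as in backend/.env.example):
//
//   import { Pool } from 'pg';
//   const pool = new Pool({ connectionString: process.env.CANNAIQ_DB_URL });
//   const stats = await runLegacyBackfill(pool, { dryRun: true, verbose: false });
//   console.log(`dry run would insert ${stats.productsInserted} products`);
//   await pool.end();
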
// ============================================================
// CLI ENTRYPOINT
// ============================================================

async function main() {
  const args = process.argv.slice(2);

  const options: LegacyBackfillOptions = {
    dryRun: args.includes('--dry-run'),
    verbose: args.includes('--verbose') || args.includes('-v'),
  };

  // Parse --dispensary-id
  const dispIdx = args.indexOf('--dispensary-id');
  if (dispIdx !== -1 && args[dispIdx + 1]) {
    options.dispensaryId = parseInt(args[dispIdx + 1], 10);
  }

  // Parse --start-from
  const startIdx = args.indexOf('--start-from');
  if (startIdx !== -1 && args[startIdx + 1]) {
    options.startFromProductId = parseInt(args[startIdx + 1], 10);
  }

  // Show help
  if (args.includes('--help') || args.includes('-h')) {
    console.log(`
Legacy Backfill Script - Hydrates canonical tables from dutchie_* tables

Usage:
  npx tsx src/hydration/legacy-backfill.ts [options]

Options:
  --dry-run            Print what would be done without modifying the database
  --dispensary-id N    Only process a specific dispensary
  --start-from N       Resume from a specific product ID
  --verbose, -v        Print detailed progress for each record
  --help, -h           Show this help message

Examples:
  # Full backfill
  npx tsx src/hydration/legacy-backfill.ts

  # Dry run for one dispensary
  npx tsx src/hydration/legacy-backfill.ts --dry-run --dispensary-id 123

  # Resume from product ID 5000
  npx tsx src/hydration/legacy-backfill.ts --start-from 5000
`);
    process.exit(0);
  }

  const pool = new Pool({
    connectionString: getConnectionString(),
    max: 5,
  });

  try {
    // Verify connection
    await pool.query('SELECT 1');
    console.log('Database connection: OK');

    await runLegacyBackfill(pool, options);
  } catch (error: any) {
    console.error('Error:', error.message);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

// Run if called directly
if (require.main === module) {
  main();
}
194
backend/src/hydration/locking.ts
Normal file
@@ -0,0 +1,194 @@
/**
 * Distributed Locking for Hydration Workers
 *
 * Prevents multiple workers from processing the same payloads.
 * Uses a PostgreSQL lock table (hydration_locks) with timeout-based expiry:
 * each lock is a row whose uniqueness enforces mutual exclusion and whose
 * expires_at bounds how long a crashed worker can keep holding it.
 */

import { Pool } from 'pg';
import { v4 as uuidv4 } from 'uuid';

const DEFAULT_LOCK_TIMEOUT_MS = 5 * 60 * 1000; // 5 minutes
const HEARTBEAT_INTERVAL_MS = 30 * 1000; // 30 seconds

// ============================================================
// LOCK MANAGER
// ============================================================

export class HydrationLockManager {
  private pool: Pool;
  private workerId: string;
  private heartbeatInterval: NodeJS.Timeout | null = null;
  private activeLocks: Set<string> = new Set();

  constructor(pool: Pool, workerId?: string) {
    this.pool = pool;
    this.workerId = workerId || `worker-${uuidv4().slice(0, 8)}`;
  }

  /**
   * Acquire a named lock
   * Returns true if lock was acquired, false if already held by another worker
   */
  async acquireLock(
    lockName: string,
    timeoutMs: number = DEFAULT_LOCK_TIMEOUT_MS
  ): Promise<boolean> {
    const expiresAt = new Date(Date.now() + timeoutMs);

    try {
      // First, clean up expired locks
      await this.pool.query(
        `DELETE FROM hydration_locks WHERE expires_at < NOW()`
      );

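      // The single INSERT below is what makes acquisition atomic: ON CONFLICT
      // (lock_name) DO NOTHING relies on lock_name being unique in
      // hydration_locks, so when two workers race, exactly one INSERT returns
      // a row and the other falls through to the ownership check.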
      // Try to insert the lock
      const result = await this.pool.query(
        `INSERT INTO hydration_locks (lock_name, worker_id, acquired_at, expires_at, heartbeat_at)
         VALUES ($1, $2, NOW(), $3, NOW())
         ON CONFLICT (lock_name) DO NOTHING
         RETURNING id`,
        [lockName, this.workerId, expiresAt]
      );

      if (result.rows.length > 0) {
        this.activeLocks.add(lockName);
        this.startHeartbeat();
        console.log(`[HydrationLock] Acquired lock: ${lockName} (worker: ${this.workerId})`);
        return true;
      }

      // Check if we already own the lock
      const existing = await this.pool.query(
        `SELECT worker_id FROM hydration_locks WHERE lock_name = $1`,
        [lockName]
      );

      if (existing.rows.length > 0 && existing.rows[0].worker_id === this.workerId) {
        // Refresh our own lock
        await this.refreshLock(lockName, timeoutMs);
        return true;
      }

      console.log(`[HydrationLock] Lock ${lockName} already held by ${existing.rows[0]?.worker_id}`);
      return false;
    } catch (error: any) {
      console.error(`[HydrationLock] Error acquiring lock ${lockName}:`, error.message);
      return false;
    }
  }

  /**
   * Release a named lock
   */
  async releaseLock(lockName: string): Promise<void> {
    try {
      await this.pool.query(
        `DELETE FROM hydration_locks WHERE lock_name = $1 AND worker_id = $2`,
        [lockName, this.workerId]
      );
      this.activeLocks.delete(lockName);
      console.log(`[HydrationLock] Released lock: ${lockName}`);

      if (this.activeLocks.size === 0) {
        this.stopHeartbeat();
      }
    } catch (error: any) {
      console.error(`[HydrationLock] Error releasing lock ${lockName}:`, error.message);
    }
  }

  /**
   * Refresh lock expiry
   */
  async refreshLock(lockName: string, timeoutMs: number = DEFAULT_LOCK_TIMEOUT_MS): Promise<void> {
    const expiresAt = new Date(Date.now() + timeoutMs);
    await this.pool.query(
      `UPDATE hydration_locks
       SET expires_at = $1, heartbeat_at = NOW()
       WHERE lock_name = $2 AND worker_id = $3`,
      [expiresAt, lockName, this.workerId]
    );
  }

  /**
   * Release all locks held by this worker
   */
  async releaseAllLocks(): Promise<void> {
    this.stopHeartbeat();
    await this.pool.query(
      `DELETE FROM hydration_locks WHERE worker_id = $1`,
      [this.workerId]
    );
    this.activeLocks.clear();
    console.log(`[HydrationLock] Released all locks for worker: ${this.workerId}`);
  }

  /**
   * Check if a lock is held (by any worker)
   */
  async isLockHeld(lockName: string): Promise<boolean> {
    const result = await this.pool.query(
      `SELECT 1 FROM hydration_locks
       WHERE lock_name = $1 AND expires_at > NOW()`,
      [lockName]
    );
    return result.rows.length > 0;
  }

  /**
   * Get current lock holder
   */
  async getLockHolder(lockName: string): Promise<string | null> {
    const result = await this.pool.query(
      `SELECT worker_id FROM hydration_locks
       WHERE lock_name = $1 AND expires_at > NOW()`,
      [lockName]
    );
    return result.rows[0]?.worker_id || null;
  }

  /**
   * Start heartbeat to keep locks alive
   */
  private startHeartbeat(): void {
    if (this.heartbeatInterval) return;

    this.heartbeatInterval = setInterval(async () => {
      for (const lockName of this.activeLocks) {
        try {
          await this.refreshLock(lockName);
        } catch (error: any) {
          console.error(`[HydrationLock] Heartbeat failed for ${lockName}:`, error.message);
        }
      }
    }, HEARTBEAT_INTERVAL_MS);
  }

  /**
   * Stop heartbeat
   */
  private stopHeartbeat(): void {
    if (this.heartbeatInterval) {
      clearInterval(this.heartbeatInterval);
      this.heartbeatInterval = null;
    }
  }

  /**
   * Get worker ID
   */
  getWorkerId(): string {
    return this.workerId;
  }
}

// ============================================================
// SINGLETON LOCK NAMES
// ============================================================

export const LOCK_NAMES = {
  HYDRATION_BATCH: 'hydration:batch',
  HYDRATION_CATCHUP: 'hydration:catchup',
  BACKFILL: 'hydration:backfill',
} as const;
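
// Typical usage (a sketch; the pool and runBackfill work function are
// assumptions, but the lock API calls match the class above):
//
//   const locks = new HydrationLockManager(pool);
//   if (await locks.acquireLock(LOCK_NAMES.BACKFILL)) {
//     try {
//       await runBackfill(); // hypothetical work function
//     } finally {
//       await locks.releaseLock(LOCK_NAMES.BACKFILL);
//     }
//   }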