# Compare commits

**94 Commits** — `feature/wo…` → `feat/task-…`

Author and date columns were empty in the capture; only the SHA1 values survive:

| SHA1 |
|------|
| `a2fa21f65c` |
| `4949b22457` |
| `1fb0eb94c2` |
| `9aefb554bc` |
| `1fa9ea496c` |
| `31756a2233` |
| `166583621b` |
| `ca952c4674` |
| `4054778b6c` |
| `56a5f00015` |
| `a96d50c481` |
| `4806212f46` |
| `2486f3c6b2` |
| `f25bebf6ee` |
| `22dad6d0fc` |
| `03eab66d35` |
| `97b1ab23d8` |
| `9fff0ba430` |
| `7d3e91b2e6` |
| `74957a9ec5` |
| `2d035c46cf` |
| `53445fe72a` |
| `37cc8956c5` |
| `197c82f921` |
| `2c52493a9c` |
| `2ee2ba6b8c` |
| `bafcf1694a` |
| `95792aab15` |
| `38ae2c3a3e` |
| `249d3c1b7f` |
| `9647f94f89` |
| `afc288d2cf` |
| `df01ce6aad` |
| `aea93bc96b` |
| `4e84f30f8b` |
| `b20a0a4fa5` |
| `6eb1babc86` |
| `9a9c2f76a2` |
| `56cc171287` |
| `0295637ed6` |
| `9c6dd37316` |
| `524d13209a` |
| `9199db3927` |
| `a0652c7c73` |
| `89c262ee20` |
| `7f9cf559cf` |
| `bbe039c868` |
| `4e5c09a2a5` |
| `7f65598332` |
| `75315ed91e` |
| `7fe7d17b43` |
| `7e517b5801` |
| `38ba9021d1` |
| `ddebad48d3` |
| `1cebf2e296` |
| `1d6e67d837` |
| `cfb4b6e4ce` |
| `f418c403d6` |
| `be4221af46` |
| `ca07606b05` |
| `baf1bf2eb7` |
| `4ef3a8d72b` |
| `09dd756eff` |
| `ec8ef6210c` |
| `a9b7a4d7a9` |
| `5119d5ccf9` |
| `91efd1d03d` |
| `aa776226b0` |
| `e9435150e9` |
| `d399b966e6` |
| `f5f0e25384` |
| `04de33e5f7` |
| `37dfea25e1` |
| `e2166bc25f` |
| `b5e8f039bf` |
| `346e6d1cd8` |
| `be434d25e3` |
| `ecc201e9d4` |
| `67bfdf47a5` |
| `3fa22a6ba1` |
| `9f898f68db` |
| `f78b05360a` |
| `2f483b3084` |
| `9711d594db` |
| `39aebfcb82` |
| `5415cac2f3` |
| `70d2364a6f` |
| `b1ab45f662` |
| `20300edbb8` |
| `b7cfec0770` |
| `948a732dd5` |
| `bf4ceaf09e` |
| `fda688b11a` |
| `414b97b3c0` |
```diff
@@ -2,37 +2,77 @@ when:
   - event: [push, pull_request]
 
 steps:
-  # Build checks
+  # ===========================================
+  # PR VALIDATION: Parallel type checks (PRs only)
+  # ===========================================
   typecheck-backend:
-    image: node:20
+    image: code.cannabrands.app/creationshop/node:20
     commands:
       - cd backend
-      - npm ci
-      - npx tsc --noEmit || true
+      - npm ci --prefer-offline
+      - npx tsc --noEmit
+    depends_on: []
+    when:
+      event: pull_request
 
-  build-cannaiq:
-    image: node:20
+  typecheck-cannaiq:
+    image: code.cannabrands.app/creationshop/node:20
     commands:
       - cd cannaiq
-      - npm ci
+      - npm ci --prefer-offline
       - npx tsc --noEmit
-      - npm run build
+    depends_on: []
+    when:
+      event: pull_request
 
-  build-findadispo:
-    image: node:20
+  typecheck-findadispo:
+    image: code.cannabrands.app/creationshop/node:20
     commands:
       - cd findadispo/frontend
-      - npm ci
-      - npm run build
+      - npm ci --prefer-offline
+      - npx tsc --noEmit 2>/dev/null || true
+    depends_on: []
+    when:
+      event: pull_request
 
-  build-findagram:
-    image: node:20
+  typecheck-findagram:
+    image: code.cannabrands.app/creationshop/node:20
     commands:
       - cd findagram/frontend
-      - npm ci
-      - npm run build
+      - npm ci --prefer-offline
+      - npx tsc --noEmit 2>/dev/null || true
+    depends_on: []
+    when:
+      event: pull_request
 
-  # Docker builds - only on master
+  # ===========================================
+  # AUTO-MERGE: Merge PR after all checks pass
+  # ===========================================
+  auto-merge:
+    image: alpine:latest
+    environment:
+      GITEA_TOKEN:
+        from_secret: gitea_token
+    commands:
+      - apk add --no-cache curl
+      - |
+        echo "Merging PR #${CI_COMMIT_PULL_REQUEST}..."
+        curl -s -X POST \
+          -H "Authorization: token $GITEA_TOKEN" \
+          -H "Content-Type: application/json" \
+          -d '{"Do":"merge"}' \
+          "https://code.cannabrands.app/api/v1/repos/Creationshop/dispensary-scraper/pulls/${CI_COMMIT_PULL_REQUEST}/merge"
+    depends_on:
+      - typecheck-backend
+      - typecheck-cannaiq
+      - typecheck-findadispo
+      - typecheck-findagram
+    when:
+      event: pull_request
+
+  # ===========================================
+  # MASTER DEPLOY: Parallel Docker builds
+  # ===========================================
   docker-backend:
     image: woodpeckerci/plugin-docker-buildx
     settings:
@@ -49,6 +89,12 @@ steps:
         from_secret: registry_password
       platforms: linux/amd64
       provenance: false
+      build_args:
+        APP_BUILD_VERSION: ${CI_COMMIT_SHA:0:8}
+        APP_GIT_SHA: ${CI_COMMIT_SHA}
+        APP_BUILD_TIME: ${CI_PIPELINE_CREATED}
+        CONTAINER_IMAGE_TAG: ${CI_COMMIT_SHA:0:8}
+    depends_on: []
     when:
       branch: master
       event: push
@@ -69,6 +115,7 @@ steps:
         from_secret: registry_password
       platforms: linux/amd64
       provenance: false
+    depends_on: []
     when:
       branch: master
       event: push
@@ -89,6 +136,7 @@ steps:
         from_secret: registry_password
       platforms: linux/amd64
       provenance: false
+    depends_on: []
     when:
       branch: master
       event: push
@@ -109,32 +157,35 @@ steps:
         from_secret: registry_password
       platforms: linux/amd64
       provenance: false
+    depends_on: []
     when:
       branch: master
       event: push
 
-  # Deploy to Kubernetes
+  # ===========================================
+  # STAGE 3: Deploy (after Docker builds)
+  # ===========================================
   deploy:
     image: bitnami/kubectl:latest
     environment:
       KUBECONFIG_CONTENT:
         from_secret: kubeconfig_data
     commands:
       - echo "Deploying to Kubernetes..."
       - mkdir -p ~/.kube
       - echo "$KUBECONFIG_CONTENT" | tr -d '[:space:]' | base64 -d > ~/.kube/config
       - chmod 600 ~/.kube/config
       - kubectl set image deployment/scraper scraper=code.cannabrands.app/creationshop/dispensary-scraper:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
-      - kubectl set image deployment/scraper-worker scraper-worker=code.cannabrands.app/creationshop/dispensary-scraper:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
+      - kubectl set image deployment/scraper-worker worker=code.cannabrands.app/creationshop/dispensary-scraper:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
       - kubectl set image deployment/cannaiq-frontend cannaiq-frontend=code.cannabrands.app/creationshop/cannaiq-frontend:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
       - kubectl set image deployment/findadispo-frontend findadispo-frontend=code.cannabrands.app/creationshop/findadispo-frontend:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
       - kubectl set image deployment/findagram-frontend findagram-frontend=code.cannabrands.app/creationshop/findagram-frontend:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
       - kubectl rollout status deployment/scraper -n dispensary-scraper --timeout=300s
       - kubectl rollout status deployment/scraper-worker -n dispensary-scraper --timeout=300s
       - kubectl rollout status deployment/cannaiq-frontend -n dispensary-scraper --timeout=120s
       - kubectl rollout status deployment/findadispo-frontend -n dispensary-scraper --timeout=120s
       - kubectl rollout status deployment/findagram-frontend -n dispensary-scraper --timeout=120s
       - echo "All deployments complete!"
     depends_on:
       - docker-backend
       - docker-cannaiq
       - docker-findadispo
       - docker-findagram
     when:
       branch: master
       event: push
```
**CLAUDE.md** (309 lines changed)
````diff
@@ -119,7 +119,42 @@ npx tsx src/db/migrate.ts
 - Importing it at runtime causes startup crashes if env vars aren't perfect
 - `pool.ts` uses lazy initialization - only validates when first query is made
 
-### 6. LOCAL DEVELOPMENT BY DEFAULT
+### 6. ALL API ROUTES REQUIRE AUTHENTICATION — NO EXCEPTIONS
+
+**Every API router MUST apply `authMiddleware` at the router level.**
+
+```typescript
+import { authMiddleware } from '../auth/middleware';
+
+const router = Router();
+router.use(authMiddleware); // REQUIRED - first line after router creation
+```
+
+**Authentication flow (see `src/auth/middleware.ts`):**
+1. Check Bearer token (JWT or API token) → grant access if valid
+2. Check trusted origins (cannaiq.co, findadispo.com, localhost, etc.) → grant access
+3. Check trusted IPs (127.0.0.1, ::1, internal pod IPs) → grant access
+4. **Return 401 Unauthorized** if none of the above
+
+**NEVER create API routes without auth middleware:**
+- No "public" endpoints that bypass authentication
+- No "read-only" exceptions
+- No "analytics-only" exceptions
+- If an endpoint exists under `/api/*`, it MUST be protected
+
+**When creating new route files:**
+1. Import `authMiddleware` from `../auth/middleware`
+2. Add `router.use(authMiddleware)` immediately after creating the router
+3. Document security requirements in file header comments
+
+**Trusted origins (defined in middleware):**
+- `https://cannaiq.co`
+- `https://findadispo.com`
+- `https://findagram.co`
+- `*.cannabrands.app` domains
+- `localhost:*` for development
+
+### 7. LOCAL DEVELOPMENT BY DEFAULT
 
 **Quick Start:**
 ```bash
````
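The new-route checklist in the added §6 amounts to very little code in practice. A minimal sketch of a route module following those steps — the file name, the `/example` handler, and its query are illustrative placeholders, not actual files in this repo:

```typescript
// src/routes/example-stores.ts
// SECURITY: every endpoint below requires auth (see src/auth/middleware.ts).
import { Router } from 'express';
import { authMiddleware } from '../auth/middleware';
import { pool } from '../db/pool';

const router = Router();
router.use(authMiddleware); // REQUIRED - first line after router creation

// GET /api/stores/example - hypothetical read endpoint; still behind auth,
// since "read-only" exceptions are explicitly forbidden above.
router.get('/example', async (_req, res) => {
  const { rows } = await pool.query('SELECT id, name FROM dispensaries LIMIT 10');
  res.json(rows);
});

export default router;
```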
```diff
@@ -193,6 +228,45 @@ CannaiQ has **TWO databases** with distinct purposes:
 | `dutchie_menus` | **Canonical CannaiQ database** - All schema, migrations, and application data | READ/WRITE |
 | `dutchie_legacy` | **Legacy read-only archive** - Historical data from old system | READ-ONLY |
 
+### Store vs Dispensary Terminology
+
+**"Store" and "Dispensary" are SYNONYMS in CannaiQ.**
+
+| Term | Usage | DB Table |
+|------|-------|----------|
+| Store | API routes (`/api/stores`) | `dispensaries` |
+| Dispensary | DB table, internal code | `dispensaries` |
+
+- `/api/stores` and `/api/dispensaries` both query the `dispensaries` table
+- There is NO `stores` table in use - it's a legacy empty table
+- Use these terms interchangeably in code and documentation
+
+### Canonical vs Legacy Tables
+
+**CANONICAL TABLES (USE THESE):**
+
+| Table | Purpose | Row Count |
+|-------|---------|-----------|
+| `dispensaries` | Store/dispensary records | ~188+ rows |
+| `store_products` | Product catalog | ~37,000+ rows |
+| `store_product_snapshots` | Price/stock history | ~millions |
+
+**LEGACY TABLES (EMPTY - DO NOT USE):**
+
+| Table | Status | Action |
+|-------|--------|--------|
+| `stores` | EMPTY (0 rows) | Use `dispensaries` instead |
+| `products` | EMPTY (0 rows) | Use `store_products` instead |
+| `dutchie_products` | LEGACY (0 rows) | Use `store_products` instead |
+| `dutchie_product_snapshots` | LEGACY (0 rows) | Use `store_product_snapshots` instead |
+| `categories` | EMPTY (0 rows) | Categories stored in product records |
+
+**Code must NEVER:**
+- Query the `stores` table (use `dispensaries`)
+- Query the `products` table (use `store_products`)
+- Query the `dutchie_products` table (use `store_products`)
+- Query the `categories` table (categories are in product records)
+
 **CRITICAL RULES:**
 - **Migrations ONLY run on `dutchie_menus`** - NEVER on `dutchie_legacy`
 - **Application code connects ONLY to `dutchie_menus`**
```
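To make the canonical-table rule concrete: a product lookup joins `store_products` to `dispensaries`, never `products` or `stores`. A minimal sketch, assuming the usual `pg` pool export from `src/db/pool.ts` (the function itself is illustrative):

```typescript
import { pool } from '../db/pool';

// Fetch in-stock products for one store from the CANONICAL tables.
export async function inStockProducts(dispensaryId: number) {
  const { rows } = await pool.query(
    `SELECT sp.id, sp.name_raw, sp.brand_name_raw, sp.price_rec
       FROM store_products sp
       JOIN dispensaries d ON d.id = sp.dispensary_id
      WHERE sp.dispensary_id = $1
        AND sp.is_in_stock = true`,
    [dispensaryId]
  );
  return rows;
}
```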
```diff
@@ -305,23 +379,23 @@ npx tsx src/scripts/etl/042_legacy_import.ts
 - SCHEMA ONLY - no data inserts from legacy tables
 
 **ETL Script 042** (`backend/src/scripts/etl/042_legacy_import.ts`):
-- Copies data from `dutchie_products` → `store_products`
-- Copies data from `dutchie_product_snapshots` → `store_product_snapshots`
+- Copies data from legacy `dutchie_legacy.dutchie_products` → `store_products`
+- Copies data from legacy `dutchie_legacy.dutchie_product_snapshots` → `store_product_snapshots`
 - Extracts brands from product data into `brands` table
 - Links dispensaries to chains and states
 - INSERT-ONLY and IDEMPOTENT (uses ON CONFLICT DO NOTHING)
 - Run manually: `cd backend && npx tsx src/scripts/etl/042_legacy_import.ts`
 
 **Tables touched by ETL:**
-| Source Table | Target Table |
-|--------------|--------------|
+| Source Table (dutchie_legacy) | Target Table (dutchie_menus) |
+|-------------------------------|------------------------------|
 | `dutchie_products` | `store_products` |
 | `dutchie_product_snapshots` | `store_product_snapshots` |
 | (brand names extracted) | `brands` |
 | (state codes mapped) | `dispensaries.state_id` |
 | (chain names matched) | `dispensaries.chain_id` |
 
-**Legacy tables remain intact** - `dutchie_products` and `dutchie_product_snapshots` are not modified.
+**Note:** The legacy `dutchie_products` and `dutchie_product_snapshots` tables in `dutchie_legacy` are read-only sources. All new crawl data goes directly to `store_products` and `store_product_snapshots`.
 
 **Migration 045** (`backend/migrations/045_add_image_columns.sql`):
 - Adds `thumbnail_url` to `store_products` and `store_product_snapshots`
```
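The INSERT-ONLY/IDEMPOTENT guarantee in script 042 reduces to `ON CONFLICT DO NOTHING` against the target's unique key. A minimal sketch of the pattern — the column list and conflict target are assumptions for illustration, not copied from the real script:

```typescript
import { pool } from '../db/pool';

// Idempotent copy: re-running never duplicates rows and never mutates existing ones.
// Assumes (dispensary_id, provider, provider_product_id) is unique on store_products.
async function copyLegacyProduct(row: {
  dispensaryId: number;
  providerProductId: string;
  name: string;
}) {
  await pool.query(
    `INSERT INTO store_products (dispensary_id, provider, provider_product_id, name_raw)
     VALUES ($1, 'dutchie', $2, $3)
     ON CONFLICT (dispensary_id, provider, provider_product_id) DO NOTHING`,
    [row.dispensaryId, row.providerProductId, row.name]
  );
}
```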
````diff
@@ -413,6 +487,7 @@ const result = await pool.query(`
 16. **Running `lsof -ti:PORT | xargs kill`** or similar process-killing commands
 17. **Using hardcoded database names** in code or comments
 18. **Creating or connecting to a second database**
+19. **Creating API routes without authMiddleware** (all `/api/*` routes MUST be protected)
 
 ---
 
@@ -421,15 +496,66 @@ const result = await pool.query(`
 ### Local Storage Structure
 
 ```
-/storage/products/{brand}/{state}/{product_id}/
+/storage/images/products/{state}/{store}/{brand}/{product}/
+  image-{hash}.webp
+  image-{hash}-medium.webp
+  image-{hash}-thumb.webp
 
-/storage/brands/{brand}/
+/storage/images/brands/{brand}/
+  logo-{hash}.webp
 ```
 
+### Image Proxy API (On-Demand Resizing)
+
+Images are stored at full resolution and resized on-demand via the `/img` endpoint.
+
+**Endpoint:** `GET /img/<path>?<params>`
+
+**Parameters:**
+| Param | Description | Example |
+|-------|-------------|---------|
+| `w` | Width in pixels (max 4000) | `?w=200` |
+| `h` | Height in pixels (max 4000) | `?h=200` |
+| `q` | Quality 1-100 (default 80) | `?q=70` |
+| `fit` | Resize mode: cover, contain, fill, inside, outside | `?fit=cover` |
+| `blur` | Blur sigma 0.3-1000 | `?blur=5` |
+| `gray` | Grayscale (1 = enabled) | `?gray=1` |
+| `format` | Output: webp, jpeg, png, avif (default webp) | `?format=jpeg` |
+
+**Examples:**
+```bash
+# Thumbnail (50px)
+GET /img/products/az/store/brand/product/image-abc123.webp?w=50
+
+# Card image (200px, cover fit)
+GET /img/products/az/store/brand/product/image-abc123.webp?w=200&h=200&fit=cover
+
+# JPEG at 70% quality
+GET /img/products/az/store/brand/product/image-abc123.webp?w=400&format=jpeg&q=70
+
+# Grayscale blur
+GET /img/products/az/store/brand/product/image-abc123.webp?w=200&gray=1&blur=3
+```
+
+**Frontend Usage:**
+```typescript
+import { getImageUrl, ImageSizes } from '../lib/images';
+
+// Returns /img/products/.../image.webp?w=50 for local images
+// Returns original URL for remote images (CDN, etc.)
+const thumbUrl = getImageUrl(product.image_url, ImageSizes.thumb);
+const cardUrl = getImageUrl(product.image_url, ImageSizes.medium);
+const detailUrl = getImageUrl(product.image_url, ImageSizes.detail);
+```
+
+**Size Presets:**
+| Preset | Width | Use Case |
+|--------|-------|----------|
+| `thumb` | 50px | Table thumbnails |
+| `small` | 100px | Small cards |
+| `medium` | 200px | Grid cards |
+| `large` | 400px | Large cards |
+| `detail` | 600px | Product detail |
+| `full` | - | No resize |
+
 ### Storage Adapter
 
 ```typescript
````
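The `/img` endpoint added above is easy to picture as an Express handler around `sharp`. A minimal sketch under stated assumptions — this is not the repo's actual `backend/src/routes/image-proxy.ts`; caching, the `format`/`avif` options, and full validation are omitted, and the storage root is assumed from the structure above:

```typescript
import express from 'express';
import path from 'node:path';
import sharp from 'sharp';

const app = express();
const STORAGE_ROOT = '/storage/images'; // assumed root; see Local Storage Structure

app.get('/img/*', async (req, res) => {
  // Resolve the requested file under the storage root (guard against traversal).
  const rel = (req.params as Record<string, string>)[0];
  const file = path.join(STORAGE_ROOT, path.normalize(rel));
  if (!file.startsWith(STORAGE_ROOT)) return res.sendStatus(400);

  // Clamp dimensions to the documented 4000px ceiling; default quality 80.
  const w = Math.min(Number(req.query.w) || 0, 4000) || undefined;
  const h = Math.min(Number(req.query.h) || 0, 4000) || undefined;
  const q = Number(req.query.q) || 80;

  try {
    let img = sharp(file);
    if (w || h) img = img.resize(w, h, { fit: (req.query.fit as any) || 'inside' });
    if (req.query.gray === '1') img = img.grayscale();
    if (req.query.blur) img = img.blur(Number(req.query.blur));
    res.type('image/webp').send(await img.webp({ quality: q }).toBuffer());
  } catch {
    res.sendStatus(404);
  }
});
```

Storing one full-resolution original and resizing on request keeps the on-disk layout simple at the cost of CPU per cache miss, which is why the preset widths above are worth fronting with an HTTP cache.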
````diff
@@ -442,8 +568,9 @@ import { saveImage, getImageUrl } from '../utils/storage-adapter';
 
 | File | Purpose |
 |------|---------|
 | `backend/src/utils/local-storage.ts` | Local filesystem adapter |
 | `backend/src/utils/storage-adapter.ts` | Unified storage abstraction |
 | `backend/src/utils/image-storage.ts` | Image download and storage |
+| `backend/src/routes/image-proxy.ts` | On-demand image resizing endpoint |
 | `cannaiq/src/lib/images.ts` | Frontend image URL helper |
 | `docker-compose.local.yml` | Local stack without MinIO |
 | `start-local.sh` | Convenience startup script |
 
@@ -451,12 +578,78 @@ import { saveImage, getImageUrl } from '../utils/storage-adapter';
 
 ## UI ANONYMIZATION RULES
 
-- No vendor names in forward-facing URLs: use `/api/az/...`, `/az`, `/az-schedule`
+- No vendor names in forward-facing URLs
 - No "dutchie", "treez", "jane", "weedmaps", "leafly" visible in consumer UIs
 - Internal admin tools may show provider names for debugging
 
 ---
 
+## DUTCHIE DISCOVERY PIPELINE (Added 2025-01)
+
+### Overview
+Automated discovery of Dutchie-powered dispensaries across all US states.
+
+### Flow
+```
+1. getAllCitiesByState GraphQL  → Get all cities for a state
+2. ConsumerDispensaries GraphQL → Get stores for each city
+3. Upsert to dutchie_discovery_locations (keyed by platform_location_id)
+4. AUTO-VALIDATE: Check required fields
+5. AUTO-PROMOTE: Create/update dispensaries with crawl_enabled=true
+6. Log all actions to dutchie_promotion_log
+```
+
+### Tables
+| Table | Purpose |
+|-------|---------|
+| `dutchie_discovery_cities` | Cities known to have dispensaries |
+| `dutchie_discovery_locations` | Raw discovered store data |
+| `dispensaries` | Canonical stores (promoted from discovery) |
+| `dutchie_promotion_log` | Audit trail for validation/promotion |
+
+### Files
+| File | Purpose |
+|------|---------|
+| `src/discovery/discovery-crawler.ts` | Main orchestrator |
+| `src/discovery/location-discovery.ts` | GraphQL fetching |
+| `src/discovery/promotion.ts` | Validation & promotion logic |
+| `src/scripts/run-discovery.ts` | CLI interface |
+| `migrations/067_promotion_log.sql` | Audit log table |
+
+### GraphQL Hashes (in `src/platforms/dutchie/client.ts`)
+| Query | Hash |
+|-------|------|
+| `GetAllCitiesByState` | `ae547a0466ace5a48f91e55bf6699eacd87e3a42841560f0c0eabed5a0a920e6` |
+| `ConsumerDispensaries` | `0a5bfa6ca1d64ae47bcccb7c8077c87147cbc4e6982c17ceec97a2a4948b311b` |
+
+### Usage
+```bash
+# Discover all stores in a state
+npx tsx src/scripts/run-discovery.ts discover:state AZ
+npx tsx src/scripts/run-discovery.ts discover:state CA
+
+# Check stats
+npx tsx src/scripts/run-discovery.ts stats
+```
+
+### Validation Rules
+A discovery location must have:
+- `platform_location_id` (MongoDB ObjectId, 24 hex chars)
+- `name`
+- `city`
+- `state_code`
+- `platform_menu_url`
+
+Invalid records are marked `status='rejected'` with errors logged.
+
+### Key Design Decisions
+- `platform_location_id` MUST be MongoDB ObjectId (not slug)
+- Old geo-based discovery stored slugs → deleted as garbage data
+- Rate limit: 2 seconds between city requests to avoid API throttling
+- Promotion is idempotent via `ON CONFLICT (platform_dispensary_id)`
+
+---
+
 ## FUTURE TODO / PENDING FEATURES
 
 - [ ] Orchestrator observability dashboard
````
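The idempotent-promotion decision above implies a single upsert keyed on `platform_dispensary_id`. A minimal sketch of what the core of `src/discovery/promotion.ts` plausibly looks like — the column list and the `loc` shape are assumptions for illustration:

```typescript
import { pool } from '../db/pool';

// Promote a validated discovery location into the canonical dispensaries table.
// Re-running is safe: ON CONFLICT turns the insert into an update.
export async function promoteLocation(loc: {
  platformLocationId: string;
  name: string;
  city: string;
  stateCode: string;
  menuUrl: string;
}) {
  await pool.query(
    `INSERT INTO dispensaries
       (platform_dispensary_id, name, city, state, menu_url, menu_type, crawl_enabled)
     VALUES ($1, $2, $3, $4, $5, 'dutchie', true)
     ON CONFLICT (platform_dispensary_id)
     DO UPDATE SET name = EXCLUDED.name, menu_url = EXCLUDED.menu_url`,
    [loc.platformLocationId, loc.name, loc.city, loc.stateCode, loc.menuUrl]
  );
}
```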
```diff
@@ -601,29 +794,45 @@ export default defineConfig({
 
 - **DB**: Use the single CannaiQ database via `CANNAIQ_DB_*` env vars. No hardcoded names.
 - **Images**: No MinIO. Save to local /images/products/<disp>/<prod>-<hash>.webp (and brands); preserve original URL; serve via backend static.
-- **Dutchie GraphQL**: Endpoint https://dutchie.com/api-3/graphql. Variables must use productsFilter.dispensaryId (platform_dispensary_id). Mode A: Status="Active". Mode B: Status=null/activeOnly:false.
+- **Dutchie GraphQL**: Endpoint https://dutchie.com/api-3/graphql. Variables must use productsFilter.dispensaryId (platform_dispensary_id). **CRITICAL: Use `Status: 'Active'`, NOT `null`** (null returns 0 products).
 - **cName/slug**: Derive cName from each store's menu_url (/embedded-menu/<cName> or /dispensary/<slug>). No hardcoded defaults.
 - **Dual-mode always**: useBothModes:true to get pricing (Mode A) + full coverage (Mode B).
 - **Batch DB writes**: Chunk products/snapshots/missing (100–200) to avoid OOM.
 - **OOS/missing**: Include inactive/OOS in Mode B. Union A+B, dedupe by external_product_id+dispensary_id.
-- **API/Frontend**: Use /api/az/... endpoints (stores/products/brands/categories/summary/dashboard).
+- **API/Frontend**: Use `/api/stores`, `/api/products`, `/api/workers`, `/api/pipeline` endpoints.
 - **Scheduling**: Crawl only menu_type='dutchie' AND platform_dispensary_id IS NOT NULL. 4-hour crawl with jitter.
-- **Monitor**: /scraper-monitor (and /az-schedule) should show active/recent jobs from job_run_logs/crawl_jobs.
 - **THC/CBD values**: Clamp to ≤100 - some products report milligrams as percentages.
 - **Column names**: Use `name_raw`, `brand_name_raw`, `category_raw`, `subcategory_raw` (NOT `name`, `brand_name`, etc.)
+- **Monitor**: `/api/workers` shows active/recent jobs from job queue.
 - **No slug guessing**: Never use defaults. Always derive per store from menu_url and resolve platform IDs per location.
 
 **📖 Full Documentation: See `docs/DUTCHIE_CRAWL_WORKFLOW.md` for complete pipeline documentation.**
 
 ---
 
 ### Detailed Rules
 
-1) **Dispensary vs Store**
-   - Dutchie pipeline uses `dispensaries` (not legacy `stores`). For dutchie crawls, always work with dispensary ID.
+1) **Dispensary = Store (SAME THING)**
+   - "Dispensary" and "store" are synonyms in CannaiQ. Use interchangeably.
+   - **API endpoint**: `/api/stores` (NOT `/api/dispensaries`)
+   - **DB table**: `dispensaries`
+   - When you need to create/query stores via API, use `/api/stores`
    - Use the record's `menu_url` and `platform_dispensary_id`.
 
-2) **Menu detection and platform IDs**
+2) **API Authentication**
+   - **Trusted Origins (no auth needed)**:
+     - IPs: `127.0.0.1`, `::1`, `::ffff:127.0.0.1`
+     - Origins: `https://cannaiq.co`, `https://findadispo.com`, `https://findagram.co`
+     - Also: `http://localhost:3010`, `http://localhost:8080`, `http://localhost:5173`
+   - Requests from trusted IPs/origins get automatic admin access (`role: 'internal'`)
+   - **Remote (non-trusted)**: Use Bearer token (JWT or API token). NO username/password auth.
+   - Never try to login with username/password via API - use tokens only.
+   - See `src/auth/middleware.ts` for `TRUSTED_ORIGINS` and `TRUSTED_IPS` lists.
+
+3) **Menu detection and platform IDs**
    - Set `menu_type` from `menu_url` detection; resolve `platform_dispensary_id` for `menu_type='dutchie'`.
    - Admin should have "refresh detection" and "resolve ID" actions; schedule/crawl only when `menu_type='dutchie'` AND `platform_dispensary_id` is set.
 
-3) **Queries and mapping**
+4) **Queries and mapping**
    - The DB returns snake_case; code expects camelCase. Always alias/map:
     - `platform_dispensary_id AS "platformDispensaryId"`
    - Map via `mapDbRowToDispensary` when loading dispensaries (scheduler, crawler, admin crawl).
```
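Rule 4's snake_case → camelCase mapping is mechanical. A minimal sketch of what a `mapDbRowToDispensary` helper can look like — the real one lives in the backend; the field list here is abbreviated and assumed:

```typescript
// Map a raw snake_case row from the dispensaries table into the camelCase
// shape the scheduler/crawler code expects. Abbreviated field list.
interface DispensaryRow {
  id: number;
  menu_url: string | null;
  menu_type: string | null;
  platform_dispensary_id: string | null;
}

export function mapDbRowToDispensary(row: DispensaryRow) {
  return {
    id: row.id,
    menuUrl: row.menu_url,
    menuType: row.menu_type,
    platformDispensaryId: row.platform_dispensary_id,
  };
}
```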
````diff
@@ -640,7 +849,7 @@ export default defineConfig({
    - Use dutchie GraphQL pipeline only for `menu_type='dutchie'`.
 
 6) **Frontend**
-   - Forward-facing URLs: `/api/az`, `/az`, `/az-schedule`; no vendor names.
+   - Forward-facing URLs should not contain vendor names.
    - `/scraper-schedule`: add filters/search, keep as master view for all schedules; reflect platform ID/menu_type status and controls.
 
 7) **No slug guessing**
@@ -689,24 +898,27 @@ export default defineConfig({
 
 16) **API Route Semantics**
 
-   **Route Groups:**
-   - `/api/admin/...` = Admin/operator actions (crawl triggers, health checks)
-   - `/api/az/...` = Arizona data slice (stores, products, metrics)
+   **Route Groups (as registered in `src/index.ts`):**
+   - `/api/stores` = Store/dispensary CRUD and listing
+   - `/api/products` = Product listing and details
+   - `/api/workers` = Job queue monitoring (replaces legacy `/api/dutchie-az/...`)
+   - `/api/pipeline` = Crawl pipeline triggers
+   - `/api/admin/orchestrator` = Orchestrator admin actions
+   - `/api/discovery` = Platform discovery (Dutchie, etc.)
+   - `/api/v1/...` = Public API for external consumers (WordPress, etc.)
 
-   **Crawl Trigger (CANONICAL):**
-   ```
-   POST /api/admin/crawl/:dispensaryId
-   ```
+   **Crawl Trigger:**
+   Check `/api/pipeline` or `/api/admin/orchestrator` routes for crawl triggers.
+   The legacy `POST /api/admin/crawl/:dispensaryId` does NOT exist.
 
 17) **Monitoring and logging**
-   - /scraper-monitor (and /az-schedule) should show active/recent jobs from job_run_logs/crawl_jobs
+   - `/api/workers` shows active/recent jobs from job queue
    - Auto-refresh every 30 seconds
    - System Logs page should show real log data, not just startup messages
 
 18) **Dashboard Architecture**
    - **Frontend**: Rebuild the frontend with `VITE_API_URL` pointing to the correct backend and redeploy.
-   - **Backend**: `/api/dashboard/stats` MUST use the canonical DB pool. Use the correct tables: `dutchie_products`, `dispensaries`, and views like `v_dashboard_stats`, `v_latest_snapshots`.
+   - **Backend**: `/api/dashboard/stats` MUST use the canonical DB pool. Use the correct tables: `store_products`, `dispensaries`, and views like `v_dashboard_stats`, `v_latest_snapshots`.
 
 19) **Deployment (Gitea + Kubernetes)**
    - **Registry**: Gitea at `code.cannabrands.app/creationshop/dispensary-scraper`
@@ -732,8 +944,8 @@ export default defineConfig({
    - **Job schedules** (managed in `job_schedules` table):
     - `dutchie_az_menu_detection`: Runs daily with 60-min jitter
     - `dutchie_az_product_crawl`: Runs every 4 hours with 30-min jitter
-   - **Trigger schedules**: `curl -X POST /api/az/admin/schedules/{id}/trigger`
-   - **Check schedule status**: `curl /api/az/admin/schedules`
+   - **Monitor jobs**: `GET /api/workers`
+   - **Trigger crawls**: Check `/api/pipeline` routes
 
 21) **Frontend Architecture - AVOID OVER-ENGINEERING**
 
@@ -1072,3 +1284,32 @@ Every analytics v2 endpoint must:
 ---
 
 # END Analytics V2 spec extension
+
+---
+
+## WordPress Plugin Versioning
+
+The WordPress plugin version is tracked in `wordpress-plugin/VERSION`.
+
+**Current version:** Check `wordpress-plugin/VERSION` for the latest version.
+
+**Versioning rules:**
+- **Minor bumps (x.x.N)**: Bug fixes, small improvements - default for most changes
+- **Middle bumps (x.N.0)**: New features, significant improvements
+- **Major bumps (N.0.0)**: Breaking changes, major rewrites - only when user explicitly requests
+
+**When making WP plugin changes:**
+1. Read `wordpress-plugin/VERSION` to get current version
+2. Bump the version number (minor by default)
+3. Update all of:
+   - `wordpress-plugin/VERSION`
+   - Plugin header `Version:` in `cannaiq-menus.php` and/or `crawlsy-menus.php`
+   - The `define('..._VERSION', '...')` constant in each plugin file
+
+**Plugin files:**
+| File | Brand | API URL |
+|------|-------|---------|
+| `cannaiq-menus.php` | CannaIQ | `https://cannaiq.co/api/v1` |
+| `crawlsy-menus.php` | Crawlsy (legacy) | `https://cannaiq.co/api/v1` |
+
+Both plugins use the same API endpoint. The Crawlsy version exists for backward compatibility with existing installations.
````
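The three-location version bump is easy to script. A hypothetical Node helper (not part of the repo) that applies a minor bump, shown only to make the checklist concrete — the file names match the docs above, but the regexes are illustrative assumptions about the plugin headers:

```typescript
import { readFileSync, writeFileSync } from 'node:fs';

// Minor-bump wordpress-plugin/VERSION and sync the plugin header + constant.
const versionFile = 'wordpress-plugin/VERSION';
const [major, mid, minor] = readFileSync(versionFile, 'utf8').trim().split('.').map(Number);
const next = `${major}.${mid}.${minor + 1}`;

writeFileSync(versionFile, next + '\n');
for (const php of ['wordpress-plugin/cannaiq-menus.php', 'wordpress-plugin/crawlsy-menus.php']) {
  const src = readFileSync(php, 'utf8')
    // "Version: x.y.z" line in the plugin header comment
    .replace(/^(\s*\*?\s*Version:\s*).*$/m, `$1${next}`)
    // define('..._VERSION', 'x.y.z') constant
    .replace(/(_VERSION',\s*')[^']+('\))/, `$1${next}$2`);
  writeFileSync(php, src);
}
console.log(`Bumped to ${next}`);
```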
**backend/.env** (40 lines changed)
```diff
@@ -1,30 +1,52 @@
 # CannaiQ Backend Environment Configuration
 # Copy this file to .env and fill in the values
 
 # Server
 PORT=3010
 NODE_ENV=development
 
 # =============================================================================
-# CannaiQ Database (dutchie_menus) - PRIMARY DATABASE
+# CANNAIQ DATABASE (dutchie_menus) - PRIMARY DATABASE
 # =============================================================================
-# This is where all schema migrations run and where canonical tables live.
-# All CANNAIQ_DB_* variables are REQUIRED - connection will fail if missing.
+# This is where ALL schema migrations run and where canonical tables live.
+# All CANNAIQ_DB_* variables are REQUIRED - no defaults.
+# The application will fail to start if any are missing.
 
 CANNAIQ_DB_HOST=localhost
 CANNAIQ_DB_PORT=54320
-CANNAIQ_DB_NAME=dutchie_menus
+CANNAIQ_DB_NAME=dutchie_menus  # MUST be dutchie_menus - NOT dutchie_legacy
 CANNAIQ_DB_USER=dutchie
 CANNAIQ_DB_PASS=dutchie_local_pass
 
+# Alternative: Use a full connection URL instead of individual vars
+# If set, this takes priority over individual vars above
+# CANNAIQ_DB_URL=postgresql://user:pass@host:port/dutchie_menus
+
 # =============================================================================
-# Legacy Database (dutchie_legacy) - READ-ONLY SOURCE
+# LEGACY DATABASE (dutchie_legacy) - READ-ONLY FOR ETL
 # =============================================================================
 # Used ONLY by ETL scripts to read historical data.
 # NEVER run migrations against this database.
+# These are only needed when running 042_legacy_import.ts
 
 LEGACY_DB_HOST=localhost
 LEGACY_DB_PORT=54320
-LEGACY_DB_NAME=dutchie_legacy
+LEGACY_DB_NAME=dutchie_legacy  # READ-ONLY - never migrated
 LEGACY_DB_USER=dutchie
-LEGACY_DB_PASS=dutchie_local_pass
+LEGACY_DB_PASS=
 
-# Local image storage (no MinIO per CLAUDE.md)
+# Alternative: Use a full connection URL instead of individual vars
+# LEGACY_DB_URL=postgresql://user:pass@host:port/dutchie_legacy
+
+# =============================================================================
+# LOCAL STORAGE
+# =============================================================================
+# Local image storage path (no MinIO)
 LOCAL_IMAGES_PATH=./public/images
 
-# JWT
+# =============================================================================
+# AUTHENTICATION
+# =============================================================================
 JWT_SECRET=your-secret-key-change-in-production
+ANTHROPIC_API_KEY=sk-ant-api03-…
+OPENAI_API_KEY=sk-proj-…
```
**backend/.gitignore** (vendored, new file, 3 lines)
```diff
@@ -0,0 +1,3 @@
+
+# IP2Location database (downloaded separately)
+data/ip2location/
```
```diff
@@ -1,17 +1,17 @@
 # Build stage
+# Image: code.cannabrands.app/creationshop/dispensary-scraper
-FROM node:20-slim AS builder
+FROM code.cannabrands.app/creationshop/node:20-slim AS builder
 
 WORKDIR /app
 
 COPY package*.json ./
-RUN npm ci
+RUN npm install
 
 COPY . .
 RUN npm run build
 
 # Production stage
-FROM node:20-slim
+FROM code.cannabrands.app/creationshop/node:20-slim
 
 # Build arguments for version info
 ARG APP_BUILD_VERSION=dev
@@ -43,10 +43,13 @@ ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium
 WORKDIR /app
 
 COPY package*.json ./
-RUN npm ci --omit=dev
+RUN npm install --omit=dev
 
 COPY --from=builder /app/dist ./dist
 
+# Copy migrations for auto-migrate on startup
+COPY migrations ./migrations
+
 # Create local images directory for when MinIO is not configured
 RUN mkdir -p /app/public/images/products
```
**backend/docs/BRAND_INTELLIGENCE_API.md** (new file, 394 lines)
# Brand Intelligence API

## Endpoint

```
GET /api/analytics/v2/brand/:name/intelligence
```

## Query Parameters

| Param | Type | Default | Description |
|-------|------|---------|-------------|
| `window` | `7d\|30d\|90d` | `30d` | Time window for trend calculations |
| `state` | string | - | Filter by state code (e.g., `AZ`) |
| `category` | string | - | Filter by category (e.g., `Flower`) |

## Response Payload Schema

```typescript
interface BrandIntelligenceResult {
  brand_name: string;
  window: '7d' | '30d' | '90d';
  generated_at: string; // ISO timestamp when data was computed

  performance_snapshot: PerformanceSnapshot;
  alerts: Alerts;
  sku_performance: SkuPerformance[];
  retail_footprint: RetailFootprint;
  competitive_landscape: CompetitiveLandscape;
  inventory_health: InventoryHealth;
  promo_performance: PromoPerformance;
}
```

---

## Section 1: Performance Snapshot

Summary cards with key brand metrics.

```typescript
interface PerformanceSnapshot {
  active_skus: number;              // Total products in catalog
  total_revenue_30d: number | null; // Estimated from qty × price
  total_stores: number;             // Active retail partners
  new_stores_30d: number;           // New distribution in window
  market_share: number | null;      // % of category SKUs
  avg_wholesale_price: number | null;
  price_position: 'premium' | 'value' | 'competitive';
}
```

**UI Label Mapping:**
| Field | User-Facing Label | Helper Text |
|-------|-------------------|-------------|
| `active_skus` | Active Products | X total in catalog |
| `total_revenue_30d` | Monthly Revenue | Estimated from sales |
| `total_stores` | Retail Distribution | Active retail partners |
| `new_stores_30d` | New Opportunities | X new in last 30 days |
| `market_share` | Category Position | % of category |
| `avg_wholesale_price` | Avg Wholesale | Per unit |
| `price_position` | Pricing Tier | Premium/Value/Market Rate |

---

## Section 2: Alerts

Issues requiring attention.

```typescript
interface Alerts {
  lost_stores_30d_count: number;
  lost_skus_30d_count: number;
  competitor_takeover_count: number;
  avg_oos_duration_days: number | null;
  avg_reorder_lag_days: number | null;
  items: AlertItem[];
}

interface AlertItem {
  type: 'lost_store' | 'delisted_sku' | 'shelf_loss' | 'extended_oos';
  severity: 'critical' | 'warning';
  store_name?: string;
  product_name?: string;
  competitor_brand?: string;
  days_since?: number;
  state_code?: string;
}
```

**UI Label Mapping:**
| Field | User-Facing Label |
|-------|-------------------|
| `lost_stores_30d_count` | Accounts at Risk |
| `lost_skus_30d_count` | Delisted SKUs |
| `competitor_takeover_count` | Shelf Losses |
| `avg_oos_duration_days` | Avg Stockout Length |
| `avg_reorder_lag_days` | Avg Restock Time |
| `severity: critical` | Urgent |
| `severity: warning` | Watch |

---

## Section 3: SKU Performance (Product Velocity)

How fast each SKU sells.

```typescript
interface SkuPerformance {
  store_product_id: number;
  product_name: string;
  category: string | null;
  daily_velocity: number; // Units/day estimate
  velocity_status: 'hot' | 'steady' | 'slow' | 'stale';
  retail_price: number | null;
  on_sale: boolean;
  stores_carrying: number;
  stock_status: 'in_stock' | 'low_stock' | 'out_of_stock';
}
```

**UI Label Mapping:**
| Field | User-Facing Label |
|-------|-------------------|
| `daily_velocity` | Daily Rate |
| `velocity_status` | Momentum |
| `velocity_status: hot` | Hot |
| `velocity_status: steady` | Steady |
| `velocity_status: slow` | Slow |
| `velocity_status: stale` | Stale |
| `retail_price` | Retail Price |
| `on_sale` | Promo (badge) |

**Velocity Thresholds:**
- `hot`: >= 5 units/day
- `steady`: >= 1 unit/day
- `slow`: >= 0.1 units/day
- `stale`: < 0.1 units/day
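Those thresholds translate directly into a tiny classifier. A sketch of how a consumer of this API (or the backend itself) might derive `velocity_status` from `daily_velocity`, assuming exactly the cutoffs listed above:

```typescript
type VelocityStatus = 'hot' | 'steady' | 'slow' | 'stale';

// Mirrors the documented thresholds: >=5 hot, >=1 steady, >=0.1 slow, else stale.
function classifyVelocity(dailyVelocity: number): VelocityStatus {
  if (dailyVelocity >= 5) return 'hot';
  if (dailyVelocity >= 1) return 'steady';
  if (dailyVelocity >= 0.1) return 'slow';
  return 'stale';
}
```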
---

## Section 4: Retail Footprint

Store placement and coverage.

```typescript
interface RetailFootprint {
  total_stores: number;
  in_stock_count: number;
  out_of_stock_count: number;
  penetration_by_region: RegionPenetration[];
  whitespace_stores: WhitespaceStore[];
}

interface RegionPenetration {
  state_code: string;
  store_count: number;
  percent_reached: number; // % of state's dispensaries
  in_stock: number;
  out_of_stock: number;
}

interface WhitespaceStore {
  store_id: number;
  store_name: string;
  state_code: string;
  city: string | null;
  category_fit: number; // How many competing brands they carry
  competitor_brands: string[];
}
```

**UI Label Mapping:**
| Field | User-Facing Label |
|-------|-------------------|
| `penetration_by_region` | Market Coverage by Region |
| `percent_reached` | X% reached |
| `in_stock` | X stocked |
| `out_of_stock` | X out |
| `whitespace_stores` | Expansion Opportunities |
| `category_fit` | X fit |

---

## Section 5: Competitive Landscape

Market positioning vs competitors.

```typescript
interface CompetitiveLandscape {
  brand_price_position: 'premium' | 'value' | 'competitive';
  market_share_trend: MarketSharePoint[];
  competitors: Competitor[];
  head_to_head_skus: HeadToHead[];
}

interface MarketSharePoint {
  date: string;
  share_percent: number;
}

interface Competitor {
  brand_name: string;
  store_overlap_percent: number;
  price_position: 'premium' | 'value' | 'competitive';
  avg_price: number | null;
  sku_count: number;
}

interface HeadToHead {
  product_name: string;
  brand_price: number;
  competitor_brand: string;
  competitor_price: number;
  price_diff_percent: number;
}
```

**UI Label Mapping:**
| Field | User-Facing Label |
|-------|-------------------|
| `price_position: premium` | Premium Tier |
| `price_position: value` | Value Leader |
| `price_position: competitive` | Market Rate |
| `market_share_trend` | Share of Shelf Trend |
| `head_to_head_skus` | Price Comparison |
| `store_overlap_percent` | X% store overlap |

---

## Section 6: Inventory Health

Stock projections and risk levels.

```typescript
interface InventoryHealth {
  critical_count: number;    // <7 days stock
  warning_count: number;     // 7-14 days stock
  healthy_count: number;     // 14-90 days stock
  overstocked_count: number; // >90 days stock
  skus: InventorySku[];
  overstock_alert: OverstockItem[];
}

interface InventorySku {
  store_product_id: number;
  product_name: string;
  store_name: string;
  days_of_stock: number | null;
  risk_level: 'critical' | 'elevated' | 'moderate' | 'healthy';
  current_quantity: number | null;
  daily_sell_rate: number | null;
}

interface OverstockItem {
  product_name: string;
  store_name: string;
  excess_units: number;
  days_of_stock: number;
}
```

**UI Label Mapping:**
| Field | User-Facing Label |
|-------|-------------------|
| `risk_level: critical` | Reorder Now |
| `risk_level: elevated` | Low Stock |
| `risk_level: moderate` | Monitor |
| `risk_level: healthy` | Healthy |
| `critical_count` | Urgent (<7 days) |
| `warning_count` | Low (7-14 days) |
| `overstocked_count` | Excess (>90 days) |
| `days_of_stock` | X days remaining |
| `overstock_alert` | Overstock Alert |
| `excess_units` | X excess units |

---

## Section 7: Promotion Effectiveness

How promotions impact sales.

```typescript
interface PromoPerformance {
  avg_baseline_velocity: number | null;
  avg_promo_velocity: number | null;
  avg_velocity_lift: number | null;    // % increase during promo
  avg_efficiency_score: number | null; // ROI proxy
  promotions: Promotion[];
}

interface Promotion {
  product_name: string;
  store_name: string;
  status: 'active' | 'scheduled' | 'ended';
  start_date: string;
  end_date: string | null;
  regular_price: number;
  promo_price: number;
  discount_percent: number;
  baseline_velocity: number | null;
  promo_velocity: number | null;
  velocity_lift: number | null;
  efficiency_score: number | null;
}
```

**UI Label Mapping:**
| Field | User-Facing Label |
|-------|-------------------|
| `avg_baseline_velocity` | Normal Rate |
| `avg_promo_velocity` | During Promos |
| `avg_velocity_lift` | Avg Sales Lift |
| `avg_efficiency_score` | ROI Score |
| `velocity_lift` | Sales Lift |
| `efficiency_score` | ROI Score |
| `status: active` | Live |
| `status: scheduled` | Scheduled |
| `status: ended` | Ended |

---

## Example Queries

### Get full payload
```javascript
const response = await fetch('/api/analytics/v2/brand/Wyld/intelligence?window=30d');
const data = await response.json();
```

### Extract summary cards (flattened)
```javascript
const { performance_snapshot: ps, alerts } = data;

const summaryCards = {
  activeProducts: ps.active_skus,
  monthlyRevenue: ps.total_revenue_30d,
  retailDistribution: ps.total_stores,
  newOpportunities: ps.new_stores_30d,
  categoryPosition: ps.market_share,
  avgWholesale: ps.avg_wholesale_price,
  pricingTier: ps.price_position,
  accountsAtRisk: alerts.lost_stores_30d_count,
  delistedSkus: alerts.lost_skus_30d_count,
  shelfLosses: alerts.competitor_takeover_count,
};
```

### Get top 10 fastest selling SKUs
```javascript
const topSkus = data.sku_performance
  .filter(sku => sku.velocity_status === 'hot' || sku.velocity_status === 'steady')
  .sort((a, b) => b.daily_velocity - a.daily_velocity)
  .slice(0, 10);
```

### Get critical inventory alerts only
```javascript
const criticalInventory = data.inventory_health.skus
  .filter(sku => sku.risk_level === 'critical');
```

### Get states with <50% penetration
```javascript
const underPenetrated = data.retail_footprint.penetration_by_region
  .filter(region => region.percent_reached < 50)
  .sort((a, b) => a.percent_reached - b.percent_reached);
```

### Get active promotions with positive lift
```javascript
const effectivePromos = data.promo_performance.promotions
  .filter(p => p.status === 'active' && p.velocity_lift > 0)
  .sort((a, b) => b.velocity_lift - a.velocity_lift);
```

### Build chart data for market share trend
```javascript
const chartData = data.competitive_landscape.market_share_trend.map(point => ({
  x: new Date(point.date),
  y: point.share_percent,
}));
```

---

## Notes for Frontend Implementation

1. **All fields are snake_case** - transform to camelCase if needed
2. **Null values are possible** - handle gracefully in UI
3. **Arrays may be empty** - show appropriate empty states
4. **Timestamps are ISO format** - parse with `new Date()`
5. **Percentages are already computed** - no need to multiply by 100
6. **The `window` parameter affects trend calculations** - 7d/30d/90d
**backend/docs/CRAWL_PIPELINE.md** (new file, 539 lines)
# Crawl Pipeline Documentation

## Overview

The crawl pipeline fetches product data from Dutchie dispensary menus and stores it in the canonical database. This document covers the complete flow from task scheduling to data storage.

---

## Pipeline Stages

```
┌──────────────────────┐
│  store_discovery     │  Find new dispensaries
└─────────┬────────────┘
          │
          ▼
┌──────────────────────┐
│ entry_point_discovery│  Resolve slug → platform_dispensary_id
└─────────┬────────────┘
          │
          ▼
┌──────────────────────┐
│  product_discovery   │  Initial product crawl
└─────────┬────────────┘
          │
          ▼
┌──────────────────────┐
│  product_resync      │  Recurring crawl (every 4 hours)
└──────────────────────┘
```

---

## Stage Details

### 1. Store Discovery
**Purpose:** Find new dispensaries to crawl

**Handler:** `src/tasks/handlers/store-discovery.ts`

**Flow:**
1. Query Dutchie `ConsumerDispensaries` GraphQL for cities/states
2. Extract dispensary info (name, address, menu_url)
3. Insert into `dutchie_discovery_locations`
4. Queue `entry_point_discovery` for each new location

---

### 2. Entry Point Discovery
**Purpose:** Resolve the menu URL slug to a platform_dispensary_id (MongoDB ObjectId)

**Handler:** `src/tasks/handlers/entry-point-discovery.ts`

**Flow:**
1. Load dispensary from database
2. Extract slug from `menu_url`:
   - `/embedded-menu/<slug>` or `/dispensary/<slug>`
3. Start stealth session (fingerprint + proxy)
4. Query `resolveDispensaryIdWithDetails(slug)` via GraphQL
5. Update dispensary with `platform_dispensary_id`
6. Queue `product_discovery` task

**Example:**
```
menu_url:                https://dutchie.com/embedded-menu/deeply-rooted
slug:                    deeply-rooted
platform_dispensary_id:  6405ef617056e8014d79101b
```
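Slug extraction from `menu_url` (step 2 above) is worth pinning down as code. A minimal sketch — the regex is an assumption covering the two documented URL forms; the real handler may differ:

```typescript
// Pull the store slug out of a Dutchie menu URL.
// Handles both /embedded-menu/<slug> and /dispensary/<slug> forms.
function extractSlug(menuUrl: string): string | null {
  const match = menuUrl.match(/\/(?:embedded-menu|dispensary)\/([^/?#]+)/);
  return match ? match[1] : null;
}

// extractSlug('https://dutchie.com/embedded-menu/deeply-rooted') === 'deeply-rooted'
```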
---

### 3. Product Discovery
**Purpose:** Initial crawl of a new dispensary

**Handler:** `src/tasks/handlers/product-discovery.ts`

Same as product_resync but for first-time crawls.

---

### 4. Product Resync
**Purpose:** Recurring crawl to capture price/stock changes

**Handler:** `src/tasks/handlers/product-resync.ts`

**Flow:**

#### Step 1: Load Dispensary Info
```sql
SELECT id, name, platform_dispensary_id, menu_url, state
FROM dispensaries
WHERE id = $1 AND crawl_enabled = true
```

#### Step 2: Start Stealth Session
- Generate random browser fingerprint
- Set locale/timezone matching state
- Optional proxy rotation

#### Step 3: Fetch Products via GraphQL
**Endpoint:** `https://dutchie.com/api-3/graphql`

**Variables:**
```javascript
{
  includeEnterpriseSpecials: false,
  productsFilter: {
    dispensaryId: "<platform_dispensary_id>",
    pricingType: "rec",
    Status: "All",
    types: [],
    useCache: false,
    isDefaultSort: true,
    sortBy: "popularSortIdx",
    sortDirection: 1,
    bypassOnlineThresholds: true,
    isKioskMenu: false,
    removeProductsBelowOptionThresholds: false
  },
  page: 0,
  perPage: 100
}
```

**Key Notes:**
- `Status: "All"` returns all products (`"Active"` returns the same count)
- `Status: null` returns 0 products (broken)
- `pricingType: "rec"` returns BOTH rec and med prices
- Paginate until `products.length < perPage` or `allProducts.length >= totalCount` (see the sketch below)
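The stop condition in the last bullet looks like this in code. A sketch of the pagination loop, assuming a `fetchProductsPage(variables)` helper that returns `{ products, totalCount }` from the GraphQL response — the helper name and shape are illustrative, not the repo's actual API:

```typescript
// Stand-in for the real GraphQL call (filteredProducts query).
declare function fetchProductsPage(
  vars: Record<string, unknown>
): Promise<{ products: unknown[]; totalCount: number }>;

// Page through filteredProducts until the feed is exhausted.
async function fetchAllProducts(platformDispensaryId: string) {
  const perPage = 100;
  const allProducts: unknown[] = [];

  for (let page = 0; ; page++) {
    const { products, totalCount } = await fetchProductsPage({
      productsFilter: { dispensaryId: platformDispensaryId, pricingType: 'rec', Status: 'All' },
      page,
      perPage,
    });
    allProducts.push(...products);
    // Short page or full coverage means we are done.
    if (products.length < perPage || allProducts.length >= totalCount) break;
  }
  return allProducts;
}
```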
|
||||
#### Step 4: Normalize Data
|
||||
Transform raw Dutchie payload to canonical format via `DutchieNormalizer`.
|
||||
|
||||
#### Step 5: Upsert Products
|
||||
Insert/update `store_products` table with normalized data.
|
||||
|
||||
#### Step 6: Create Snapshots
|
||||
Insert point-in-time record to `store_product_snapshots`.
|
||||
|
||||
#### Step 7: Track Missing Products (OOS Detection)
|
||||
```sql
|
||||
-- Reset consecutive_misses for products IN the feed
|
||||
UPDATE store_products
|
||||
SET consecutive_misses = 0, last_seen_at = NOW()
|
||||
WHERE dispensary_id = $1
|
||||
AND provider = 'dutchie'
|
||||
AND provider_product_id = ANY($2)
|
||||
|
||||
-- Increment for products NOT in feed
|
||||
UPDATE store_products
|
||||
SET consecutive_misses = consecutive_misses + 1
|
||||
WHERE dispensary_id = $1
|
||||
AND provider = 'dutchie'
|
||||
AND provider_product_id NOT IN (...)
|
||||
AND consecutive_misses < 3
|
||||
|
||||
-- Mark OOS at 3 consecutive misses
|
||||
UPDATE store_products
|
||||
SET stock_status = 'oos', is_in_stock = false
|
||||
WHERE dispensary_id = $1
|
||||
AND consecutive_misses >= 3
|
||||
AND stock_status != 'oos'
|
||||
```
|
||||
|
||||
#### Step 8: Download Images
|
||||
For new products, download and store images locally.
|
||||
|
||||
#### Step 9: Update Dispensary
|
||||
```sql
|
||||
UPDATE dispensaries SET last_crawl_at = NOW() WHERE id = $1
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## GraphQL Payload Structure
|
||||
|
||||
### Product Fields (from filteredProducts.products[])
|
||||
|
||||
| Field | Type | Description |
|
||||
|-------|------|-------------|
|
||||
| `_id` / `id` | string | MongoDB ObjectId (24 hex chars) |
|
||||
| `Name` | string | Product display name |
|
||||
| `brandName` | string | Brand name |
|
||||
| `brand.name` | string | Brand name (nested) |
|
||||
| `brand.description` | string | Brand description |
|
||||
| `type` | string | Category (Flower, Edible, Concentrate, etc.) |
|
||||
| `subcategory` | string | Subcategory |
|
||||
| `strainType` | string | Hybrid, Indica, Sativa, N/A |
|
||||
| `Status` | string | Always "Active" in feed |
|
||||
| `Image` | string | Primary image URL |
|
||||
| `images[]` | array | All product images |
|
||||
|
||||
### Pricing Fields
|
||||
|
||||
| Field | Type | Description |
|
||||
|-------|------|-------------|
|
||||
| `Prices[]` | number[] | Rec prices per option |
|
||||
| `recPrices[]` | number[] | Rec prices |
|
||||
| `medicalPrices[]` | number[] | Medical prices |
|
||||
| `recSpecialPrices[]` | number[] | Rec sale prices |
|
||||
| `medicalSpecialPrices[]` | number[] | Medical sale prices |
|
||||
| `Options[]` | string[] | Size options ("1/8oz", "1g", etc.) |
|
||||
| `rawOptions[]` | string[] | Raw weight options ("3.5g") |
|
||||
|
||||
### Inventory Fields (POSMetaData.children[])
|
||||
|
||||
| Field | Type | Description |
|
||||
|-------|------|-------------|
|
||||
| `quantity` | number | Total inventory count |
|
||||
| `quantityAvailable` | number | Available for online orders |
|
||||
| `kioskQuantityAvailable` | number | Available for kiosk orders |
|
||||
| `option` | string | Which size option this is for |
|
||||
|
||||
### Potency Fields
|
||||
|
||||
| Field | Type | Description |
|
||||
|-------|------|-------------|
|
||||
| `THCContent.range[]` | number[] | THC percentage |
|
||||
| `CBDContent.range[]` | number[] | CBD percentage |
|
||||
| `cannabinoidsV2[]` | array | Detailed cannabinoid breakdown |
|
||||
|
||||
### Specials (specialData.bogoSpecials[])
|
||||
|
||||
| Field | Type | Description |
|
||||
|-------|------|-------------|
|
||||
| `specialName` | string | Deal name |
|
||||
| `specialType` | string | "bogo", "sale", etc. |
|
||||
| `itemsForAPrice.value` | string | Bundle price |
|
||||
| `bogoRewards[].totalQuantity.quantity` | number | Required quantity |
|
||||
|
||||
---
|
||||
|
||||
## OOS Detection Logic
|
||||
|
||||
Products disappear from the Dutchie feed when they go out of stock. We track this via `consecutive_misses`:
|
||||
|
||||
| Scenario | Action |
|
||||
|----------|--------|
|
||||
| Product in feed | `consecutive_misses = 0` |
|
||||
| Product missing 1st time | `consecutive_misses = 1` |
|
||||
| Product missing 2nd time | `consecutive_misses = 2` |
|
||||
| Product missing 3rd time | `consecutive_misses = 3`, mark `stock_status = 'oos'` |
|
||||
| Product returns to feed | `consecutive_misses = 0`, update stock_status |
|
||||
|
||||
**Why 3 misses?**
|
||||
- Protects against false positives from crawl failures
|
||||
- Single bad crawl doesn't trigger mass OOS alerts
|
||||
- Balances detection speed vs accuracy
|
||||
|
||||
---
|
||||
|
||||
## Database Tables
|
||||
|
||||
### store_products
|
||||
Current state of each product:
|
||||
- `provider_product_id` - Dutchie's MongoDB ObjectId
|
||||
- `name_raw`, `brand_name_raw` - Raw values from feed
|
||||
- `price_rec`, `price_med` - Current prices
|
||||
- `is_in_stock`, `stock_status` - Availability
|
||||
- `consecutive_misses` - OOS detection counter
|
||||
- `last_seen_at` - Last time product was in feed
|
||||
|
||||
### store_product_snapshots
|
||||
Point-in-time records for historical analysis:
|
||||
- One row per product per crawl
|
||||
- Captures price, stock, potency at that moment
|
||||
- Used for price history, analytics
|
||||
|
||||
### dispensaries
|
||||
Store metadata:
|
||||
- `platform_dispensary_id` - MongoDB ObjectId for GraphQL
|
||||
- `menu_url` - Source URL
|
||||
- `last_crawl_at` - Last successful crawl
|
||||
- `crawl_enabled` - Whether to crawl
|
||||
|
||||
---

## Worker Roles

Workers pull tasks from the `worker_tasks` queue based on their assigned role.

| Role | Name | Description | Handler |
|------|------|-------------|---------|
| `product_resync` | Product Resync | Re-crawl dispensary products for price/stock changes | `handleProductResync` |
| `product_discovery` | Product Discovery | Initial product discovery for new dispensaries | `handleProductDiscovery` |
| `store_discovery` | Store Discovery | Discover new dispensary locations | `handleStoreDiscovery` |
| `entry_point_discovery` | Entry Point Discovery | Resolve platform IDs from menu URLs | `handleEntryPointDiscovery` |
| `analytics_refresh` | Analytics Refresh | Refresh materialized views and analytics | `handleAnalyticsRefresh` |

**API Endpoint:** `GET /api/worker-registry/roles`

---

## Scheduling

Crawls are scheduled via the `worker_tasks` table:

| Role | Frequency | Description |
|------|-----------|-------------|
| `product_resync` | Every 4 hours | Regular product refresh |
| `product_discovery` | On-demand | First crawl for new stores |
| `entry_point_discovery` | On-demand | New store setup |
| `store_discovery` | Daily | Find new stores |
| `analytics_refresh` | Daily | Refresh analytics materialized views |

---

## Priority & On-Demand Tasks

Tasks are claimed by workers in order of **priority DESC, created_at ASC**.

### Priority Levels

| Priority | Use Case | Example |
|----------|----------|---------|
| 0 | Scheduled/batch tasks | Daily product_resync generation |
| 10 | On-demand/chained tasks | entry_point → product_discovery |
| Higher | Urgent/manual triggers | Admin-triggered immediate crawl |

### Task Chaining

When a task completes, the system automatically creates follow-up tasks:

```
store_discovery (completed)
  └─► entry_point_discovery (priority: 10) for each new store

entry_point_discovery (completed, success)
  └─► product_discovery (priority: 10) for that store

product_discovery (completed)
  └─► [no chain] Store enters regular resync schedule
```

### On-Demand Task Creation

Use the task service to create high-priority tasks:

```typescript
// Create an immediate product resync for a store
await taskService.createTask({
  role: 'product_resync',
  dispensary_id: 123,
  platform: 'dutchie',
  priority: 20, // Higher than batch tasks
});

// Convenience methods with default high priority (10)
await taskService.createEntryPointTask(dispensaryId, 'dutchie');
await taskService.createProductDiscoveryTask(dispensaryId, 'dutchie');
await taskService.createStoreDiscoveryTask('dutchie', 'AZ');
```

### Claim Function

The `claim_task()` SQL function atomically claims tasks (see the sketch below):
- Respects priority ordering (higher = first)
- Uses `FOR UPDATE SKIP LOCKED` for concurrency
- Prevents multiple active tasks per store

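The function itself is defined in the migrations; the following is only a hedged TypeScript sketch of the same atomic claim from the worker side, with column names taken from the `worker_tasks` schema and everything else illustrative:

```typescript
import { Pool } from 'pg';

const pool = new Pool();

// Sketch: claim the highest-priority ready task for a role, skipping rows
// other workers hold locks on. The real claim_task() also enforces the
// one-active-task-per-store rule via a partial unique index.
export async function claimTask(role: string, workerId: string) {
  const { rows } = await pool.query(
    `UPDATE worker_tasks
        SET status = 'claimed', worker_id = $2, claimed_at = NOW()
      WHERE id = (
        SELECT id FROM worker_tasks
         WHERE role = $1 AND status = 'pending'
           AND (scheduled_for IS NULL OR scheduled_for <= NOW())
         ORDER BY priority DESC, created_at ASC
         FOR UPDATE SKIP LOCKED
         LIMIT 1
      )
      RETURNING *`,
    [role, workerId]
  );
  return rows[0] ?? null; // null when the queue is empty
}
```
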
---

## Image Storage

Images are downloaded from Dutchie's AWS S3 and stored locally with on-demand resizing.

### Storage Path
```
/storage/images/products/<state>/<store>/<brand>/<product_id>/image-<hash>.webp
/storage/images/brands/<brand>/logo-<hash>.webp
```

**Example:**
```
/storage/images/products/az/az-deeply-rooted/bud-bros/6913e3cd444eac3935e928b9/image-ae38b1f9.webp
```

### Image Proxy API
Served via `/img/*` with on-demand resizing using **sharp**:

```
GET /img/products/az/az-deeply-rooted/bud-bros/6913e3cd444eac3935e928b9/image-ae38b1f9.webp?w=200
```

| Param | Description |
|-------|-------------|
| `w` | Width in pixels (max 4000) |
| `h` | Height in pixels (max 4000) |
| `q` | Quality 1-100 (default 80) |
| `fit` | cover, contain, fill, inside, outside |
| `blur` | Blur sigma (0.3-1000) |
| `gray` | Grayscale (1 = enabled) |
| `format` | webp, jpeg, png, avif (default webp) |

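A minimal sketch of how such a proxy might apply these params with sharp; the real route is `src/routes/image-proxy.ts`, so treat the parsing, clamping, and (omitted) path validation here as illustrative:

```typescript
import express from 'express';
import sharp from 'sharp';
import path from 'path';

const app = express();
const ROOT = '/storage/images'; // documented storage root

app.get('/img/*', async (req, res) => {
  try {
    // NOTE: sketch omits path-traversal validation on the wildcard segment.
    const file = path.join(ROOT, (req.params as Record<string, string>)[0]);
    const w = Math.min(Number(req.query.w) || 0, 4000) || undefined;
    const h = Math.min(Number(req.query.h) || 0, 4000) || undefined;
    const q = Math.min(Math.max(Number(req.query.q) || 80, 1), 100);
    const fmt = String(req.query.format || 'webp') as 'webp' | 'jpeg' | 'png' | 'avif';

    let img = sharp(file);
    if (w || h) img = img.resize({ width: w, height: h, fit: (req.query.fit as never) || 'cover' });
    if (req.query.blur) img = img.blur(Number(req.query.blur)); // sigma 0.3-1000
    if (req.query.gray === '1') img = img.grayscale();

    res.type(`image/${fmt}`).send(await img.toFormat(fmt, { quality: q }).toBuffer());
  } catch {
    res.sendStatus(404);
  }
});
```
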
### Key Files
| File | Purpose |
|------|---------|
| `src/utils/image-storage.ts` | Download & save images to local filesystem |
| `src/routes/image-proxy.ts` | On-demand resize/transform at `/img/*` |

### Download Rules

| Scenario | Image Action |
|----------|--------------|
| **New product (first crawl)** | Download if `primaryImageUrl` exists |
| **Existing product (refresh)** | Download only if `local_image_path` is NULL (backfill) |
| **Product already has local image** | Skip download entirely |

**Logic** (sketched below):
- Images are downloaded **once** and never re-downloaded on subsequent crawls
- `skipIfExists: true` - a filesystem check prevents re-download even if queued
- First crawl: all products get images
- Refresh crawl: only new products or products missing local images

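A hedged sketch of that decision; the documented column names are real, but these helper names are illustrative stand-ins for what lives in `src/utils/image-storage.ts`:

```typescript
import fs from 'fs';

// Sketch of the download-once rule from the table above.
export function shouldDownloadImage(primaryImageUrl: string | null,
                                    localImagePath: string | null): boolean {
  if (!primaryImageUrl) return false; // nothing to fetch
  return localImagePath === null;     // only new products or backfills
}

// skipIfExists guard applied at download time: even a queued download is
// dropped if the target file already landed on disk.
export function skipIfExists(targetPath: string): boolean {
  return fs.existsSync(targetPath);
}
```
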
### Storage Rules
- **NO MinIO** - local filesystem only (`STORAGE_DRIVER=local`)
- Store full resolution; resize on demand via the `/img` proxy
- Convert to webp for consistency using **sharp**
- Preserve the original Dutchie URL as a fallback in the `image_url` column
- Local path stored in the `local_image_path` column

---

## Stealth & Anti-Detection

**PROXIES ARE REQUIRED** - Workers will fail to start if no active proxies are available in the database. All HTTP requests to Dutchie go through a proxy.

Workers automatically initialize anti-detection systems on startup.

### Components

| Component | Purpose | Source |
|-----------|---------|--------|
| **CrawlRotator** | Coordinates proxy + UA rotation | `src/services/crawl-rotator.ts` |
| **ProxyRotator** | Round-robin proxy selection, health tracking | `src/services/crawl-rotator.ts` |
| **UserAgentRotator** | Cycles through realistic browser fingerprints | `src/services/crawl-rotator.ts` |
| **Dutchie Client** | Curl-based HTTP with auto-retry on 403 | `src/platforms/dutchie/client.ts` |

### Initialization Flow

```
Worker Start
  │
  ├─► initializeStealth()
  │     │
  │     ├─► CrawlRotator.initialize()
  │     │     └─► Load proxies from `proxies` table
  │     │
  │     └─► setCrawlRotator(rotator)
  │           └─► Wire to Dutchie client
  │
  └─► Process tasks...
```

### Stealth Session (per task)

Each crawl task starts a stealth session:

```typescript
// In product-refresh.ts, entry-point-discovery.ts
const session = startSession(dispensary.state || 'AZ', 'America/Phoenix');
```

This creates a new identity with:
- **Random fingerprint:** Chrome/Firefox/Safari/Edge on Win/Mac/Linux
- **Accept-Language:** Matches timezone (e.g., `America/Phoenix` → `en-US,en;q=0.9`)
- **sec-ch-ua headers:** Proper Client Hints for the browser profile

### On 403 Block

When Dutchie returns 403, the client automatically (sketched below):

1. Records a failure on the current proxy (increments `failure_count`)
2. Deactivates the proxy once it reaches 5+ failures
3. Rotates to the next healthy proxy
4. Rotates the fingerprint
5. Retries the request

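A hedged sketch of that retry loop; the `RotatorLike` method names are illustrative stand-ins, not the actual CrawlRotator surface:

```typescript
interface RotatorLike {
  currentProxy(): { failure_count: number };
  currentHeaders(): Record<string, string>;
  markFailure(proxy: unknown): void;   // step 1
  deactivate(proxy: unknown): void;    // step 2
  nextProxy(): void;                   // step 3
  nextFingerprint(): void;             // step 4
}

// Sketch only: proxy wiring of the actual request is omitted.
export async function fetchWithRotation(url: string, rotator: RotatorLike,
                                        maxRetries = 3): Promise<Response> {
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    const proxy = rotator.currentProxy();
    const res = await fetch(url, { headers: rotator.currentHeaders() });
    if (res.status !== 403) return res;

    rotator.markFailure(proxy);                      // bump failure_count
    if (proxy.failure_count >= 5) rotator.deactivate(proxy);
    rotator.nextProxy();
    rotator.nextFingerprint();
  }                                                  // loop = step 5 (retry)
  throw new Error('Blocked after retries');
}
```
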
### Proxy Table Schema

```sql
CREATE TABLE proxies (
  id SERIAL PRIMARY KEY,
  host VARCHAR(255) NOT NULL,
  port INTEGER NOT NULL,
  username VARCHAR(100),
  password VARCHAR(100),
  protocol VARCHAR(10) DEFAULT 'http', -- http, https, socks5
  is_active BOOLEAN DEFAULT true,
  last_used_at TIMESTAMPTZ,
  failure_count INTEGER DEFAULT 0,
  success_count INTEGER DEFAULT 0,
  avg_response_time_ms INTEGER,
  last_failure_at TIMESTAMPTZ,
  last_error TEXT
);
```

### Configuration

Proxies are mandatory. There is no environment variable to disable them. Workers will refuse to start without active proxies in the database.

### User-Agent Generation

See `workflow-12102025.md` for the full specification.

**Summary:**
- Uses the `intoli/user-agents` library (daily-updated market-share data)
- Device distribution: Mobile 62%, Desktop 36%, Tablet 2%
- Browser whitelist: Chrome, Safari, Edge, Firefox only
- UA sticks until the IP rotates (403 or manual rotation)
- Failure = alert admin + stop crawl (no fallback)

Each fingerprint includes proper `sec-ch-ua`, `sec-ch-ua-platform`, and `sec-ch-ua-mobile` headers.
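
A minimal sketch of UA selection under those rules using the `user-agents` npm package; the retry cap and whitelist regex are assumptions:

```typescript
import UserAgent from 'user-agents';

// Whitelist from the summary above; the regex itself is an assumption.
const ALLOWED = /(Chrome|CriOS|Safari|Edg|Firefox|FxiOS)\//;

export function pickUserAgent(): string {
  // user-agents samples from daily-updated market-share data, which yields
  // roughly the documented mobile/desktop/tablet split without extra config.
  for (let i = 0; i < 50; i++) {
    const candidate = new UserAgent().toString();
    if (ALLOWED.test(candidate)) return candidate;
  }
  // No fallback by design: alert the admin and stop the crawl.
  throw new Error('Unable to generate an acceptable user agent');
}
```
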
---

## Error Handling

- **GraphQL errors:** Logged, task marked failed, retried later
- **Normalization errors:** Logged as warnings; crawl continues with valid products
- **Image download errors:** Non-fatal; logged, crawl continues
- **Database errors:** Task fails and will be retried
- **403 blocks:** Auto-rotate proxy + fingerprint, then retry (up to 3 retries)

---

## Files

| File | Purpose |
|------|---------|
| `src/tasks/handlers/product-resync.ts` | Main crawl handler |
| `src/tasks/handlers/entry-point-discovery.ts` | Slug → ID resolution |
| `src/platforms/dutchie/index.ts` | GraphQL client, session management |
| `src/hydration/normalizers/dutchie.ts` | Payload normalization |
| `src/hydration/canonical-upsert.ts` | Database upsert logic |
| `src/utils/image-storage.ts` | Image download and local storage |
| `src/routes/image-proxy.ts` | On-demand image resizing |
| `migrations/075_consecutive_misses.sql` | OOS tracking column |

backend/docs/TASK_WORKFLOW_2024-12-10.md (new file, 584 lines)
@@ -0,0 +1,584 @@

# Task Workflow Documentation

**Date: 2024-12-10**

This document describes the complete task/job processing architecture after the 2024-12-10 rewrite.

---

## Complete Architecture

```
┌─────────────────────────────────────────────────────────────────────────────────┐
│                               KUBERNETES CLUSTER                                  │
├─────────────────────────────────────────────────────────────────────────────────┤
│                                                                                   │
│   ┌─────────────────────────────────────────────────────────────────────────┐   │
│   │                         API SERVER POD (scraper)                         │   │
│   │                                                                          │   │
│   │   ┌──────────────────┐      ┌────────────────────────────────────────┐  │   │
│   │   │   Express API    │      │            TaskScheduler               │  │   │
│   │   │                  │      │   (src/services/task-scheduler.ts)     │  │   │
│   │   │  /api/job-queue  │      │                                        │  │   │
│   │   │  /api/tasks      │      │   • Polls every 60s                    │  │   │
│   │   │  /api/schedules  │      │   • Checks task_schedules table        │  │   │
│   │   └────────┬─────────┘      │   • SELECT FOR UPDATE SKIP LOCKED      │  │   │
│   │            │                │   • Generates tasks when due           │  │   │
│   │            │                └──────────────────┬─────────────────────┘  │   │
│   │            │                                   │                        │   │
│   └────────────┼───────────────────────────────────┼────────────────────────┘   │
│                │                                   │                             │
│                │         ┌─────────────────────────┘                             │
│                │         │                                                       │
│                ▼         ▼                                                       │
│   ┌─────────────────────────────────────────────────────────────────────────┐   │
│   │                          POSTGRESQL DATABASE                             │   │
│   │                                                                          │   │
│   │   ┌─────────────────────┐          ┌─────────────────────┐              │   │
│   │   │   task_schedules    │          │    worker_tasks     │              │   │
│   │   │                     │          │                     │              │   │
│   │   │ • product_refresh   │─────────►│ • pending tasks     │              │   │
│   │   │ • store_discovery   │  create  │ • claimed tasks     │              │   │
│   │   │ • analytics_refresh │  tasks   │ • running tasks     │              │   │
│   │   │                     │          │ • completed tasks   │              │   │
│   │   │ next_run_at         │          │                     │              │   │
│   │   │ last_run_at         │          │ role, dispensary_id │              │   │
│   │   │ interval_hours      │          │ priority, status    │              │   │
│   │   └─────────────────────┘          └──────────┬──────────┘              │   │
│   │                                               │                         │   │
│   └───────────────────────────────────────────────┼─────────────────────────┘   │
│                                                   │                              │
│                          ┌────────────────────────┘                              │
│                          │  Workers poll for tasks                               │
│                          │  (SELECT FOR UPDATE SKIP LOCKED)                      │
│                          ▼                                                       │
│   ┌─────────────────────────────────────────────────────────────────────────┐   │
│   │                WORKER PODS (StatefulSet: scraper-worker)                 │   │
│   │                                                                          │   │
│   │   ┌─────────────┐  ┌─────────────┐  ┌─────────────┐  ┌─────────────┐    │   │
│   │   │  Worker 0   │  │  Worker 1   │  │  Worker 2   │  │  Worker N   │    │   │
│   │   │             │  │             │  │             │  │             │    │   │
│   │   │ task-worker │  │ task-worker │  │ task-worker │  │ task-worker │    │   │
│   │   │ .ts         │  │ .ts         │  │ .ts         │  │ .ts         │    │   │
│   │   └─────────────┘  └─────────────┘  └─────────────┘  └─────────────┘    │   │
│   │                                                                          │   │
│   └──────────────────────────────────────────────────────────────────────────┘   │
│                                                                                   │
└───────────────────────────────────────────────────────────────────────────────────┘
```

---

## Startup Sequence

```
┌─────────────────────────────────────────────────────────────────────────────┐
│                             API SERVER STARTUP                               │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  1. Express app initializes                                                 │
│         │                                                                   │
│         ▼                                                                   │
│  2. runAutoMigrations()                                                     │
│     • Runs pending migrations (including 079_task_schedules.sql)            │
│         │                                                                   │
│         ▼                                                                   │
│  3. initializeMinio() / initializeImageStorage()                            │
│         │                                                                   │
│         ▼                                                                   │
│  4. cleanupOrphanedJobs()                                                   │
│         │                                                                   │
│         ▼                                                                   │
│  5. taskScheduler.start()  ◄─── NEW (per TASK_WORKFLOW_2024-12-10.md)       │
│         │                                                                   │
│         ├── Recover stale tasks (workers that died)                         │
│         ├── Ensure default schedules exist in task_schedules                │
│         ├── Check and run any due schedules immediately                     │
│         └── Start 60-second poll interval                                   │
│         │                                                                   │
│         ▼                                                                   │
│  6. app.listen(PORT)                                                        │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘

┌─────────────────────────────────────────────────────────────────────────────┐
│                             WORKER POD STARTUP                               │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  1. K8s starts pod from StatefulSet                                         │
│         │                                                                   │
│         ▼                                                                   │
│  2. TaskWorker.constructor()                                                │
│     • Create DB pool                                                        │
│     • Create CrawlRotator                                                   │
│         │                                                                   │
│         ▼                                                                   │
│  3. initializeStealth()                                                     │
│     • Load proxies from DB (REQUIRED - fails if none)                       │
│     • Wire rotator to Dutchie client                                        │
│         │                                                                   │
│         ▼                                                                   │
│  4. register() with API                                                     │
│     • Optional - continues if fails                                         │
│         │                                                                   │
│         ▼                                                                   │
│  5. startRegistryHeartbeat() every 30s                                      │
│         │                                                                   │
│         ▼                                                                   │
│  6. processNextTask() loop                                                  │
│         │                                                                   │
│         ├── Poll for pending task (FOR UPDATE SKIP LOCKED)                  │
│         ├── Claim task atomically                                           │
│         ├── Execute handler (product_refresh, store_discovery, etc.)        │
│         ├── Mark complete/failed                                            │
│         ├── Chain next task if applicable                                   │
│         └── Loop                                                            │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘
```

---

## Schedule Flow

```
┌─────────────────────────────────────────────────────────────────────────────┐
│                     SCHEDULER POLL (every 60 seconds)                        │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  BEGIN TRANSACTION                                                          │
│      │                                                                      │
│      ▼                                                                      │
│  SELECT * FROM task_schedules                                               │
│  WHERE enabled = true AND next_run_at <= NOW()                              │
│  FOR UPDATE SKIP LOCKED  ◄─── Prevents duplicate execution across replicas  │
│      │                                                                      │
│      ▼                                                                      │
│  For each due schedule:                                                     │
│      │                                                                      │
│      ├── product_refresh_all                                                │
│      │     └─► Query dispensaries needing crawl                             │
│      │           └─► Create product_refresh tasks in worker_tasks           │
│      │                                                                      │
│      ├── store_discovery_dutchie                                            │
│      │     └─► Create single store_discovery task                           │
│      │                                                                      │
│      └── analytics_refresh                                                  │
│            └─► Create single analytics_refresh task                         │
│      │                                                                      │
│      ▼                                                                      │
│  UPDATE task_schedules SET                                                  │
│    last_run_at = NOW(),                                                     │
│    next_run_at = NOW() + interval_hours                                     │
│      │                                                                      │
│      ▼                                                                      │
│  COMMIT                                                                     │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘
```

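A compact TypeScript sketch of one poll tick, assuming a `pg` pool and the `task_schedules` columns shown later in this document; per-role task generation is elided:

```typescript
import { Pool } from 'pg';

const pool = new Pool();

// Sketch of one scheduler tick. The due-schedule query runs under
// FOR UPDATE SKIP LOCKED so concurrent API replicas never double-fire.
export async function schedulerTick(): Promise<void> {
  const client = await pool.connect();
  try {
    await client.query('BEGIN');
    const { rows: due } = await client.query(
      `SELECT * FROM task_schedules
        WHERE enabled = true AND next_run_at <= NOW()
        FOR UPDATE SKIP LOCKED`
    );
    for (const schedule of due) {
      // ... create worker_tasks rows for this schedule's role (elided) ...
      await client.query(
        `UPDATE task_schedules
            SET last_run_at = NOW(),
                next_run_at = NOW() + make_interval(hours => interval_hours)
          WHERE id = $1`,
        [schedule.id]
      );
    }
    await client.query('COMMIT');
  } catch (err) {
    await client.query('ROLLBACK');
    throw err;
  } finally {
    client.release();
  }
}
```
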
---

## Task Lifecycle

```
┌──────────┐
│ SCHEDULE │
│   DUE    │
└────┬─────┘
     │
     ▼
┌──────────────┐    claim    ┌──────────────┐    start    ┌──────────────┐
│   PENDING    │────────────►│   CLAIMED    │────────────►│   RUNNING    │
└──────────────┘             └──────────────┘             └──────┬───────┘
        ▲                                                        │
        │                                  ┌──────────────┬──────┴───────┐
        │ retry                            │              │              │
        │ (if retries < max)               ▼              ▼              ▼
        │                            ┌──────────┐  ┌──────────┐  ┌──────────┐
        └────────────────────────────│  FAILED  │  │ COMPLETED│  │  STALE   │
                                     └──────────┘  └──────────┘  └────┬─────┘
                                                                      │
                                                        recover_stale_tasks()
                                                                      │
                                                                      ▼
                                                                ┌──────────┐
                                                                │ PENDING  │
                                                                └──────────┘
```

---

## Database Tables

### task_schedules (NEW - migration 079)

Stores schedule definitions. Survives restarts.

```sql
CREATE TABLE task_schedules (
  id SERIAL PRIMARY KEY,
  name VARCHAR(100) NOT NULL UNIQUE,
  role VARCHAR(50) NOT NULL,        -- product_refresh, store_discovery, etc.
  enabled BOOLEAN DEFAULT TRUE,
  interval_hours INTEGER NOT NULL,  -- How often to run
  priority INTEGER DEFAULT 0,       -- Task priority when created
  state_code VARCHAR(2),            -- Optional filter
  last_run_at TIMESTAMPTZ,          -- When it last ran
  next_run_at TIMESTAMPTZ,          -- When it's due next
  last_task_count INTEGER,          -- Tasks created last run
  last_error TEXT                   -- Error message if failed
);
```

### worker_tasks (migration 074)

The task queue. Workers pull from here.

```sql
CREATE TABLE worker_tasks (
  id SERIAL PRIMARY KEY,
  role task_role NOT NULL,          -- What type of work
  dispensary_id INTEGER,            -- Which store (if applicable)
  platform VARCHAR(50),             -- Which platform
  status task_status DEFAULT 'pending',
  priority INTEGER DEFAULT 0,       -- Higher = process first
  scheduled_for TIMESTAMP,          -- Don't process before this time
  worker_id VARCHAR(100),           -- Which worker claimed it
  claimed_at TIMESTAMP,
  started_at TIMESTAMP,
  completed_at TIMESTAMP,
  last_heartbeat_at TIMESTAMP,      -- For stale detection
  result JSONB,
  error_message TEXT,
  retry_count INTEGER DEFAULT 0,
  max_retries INTEGER DEFAULT 3
);
```

---

## Default Schedules

| Name | Role | Interval | Priority | Description |
|------|------|----------|----------|-------------|
| `payload_fetch_all` | payload_fetch | 4 hours | 0 | Fetch payloads from Dutchie API (chains to product_refresh) |
| `store_discovery_dutchie` | store_discovery | 24 hours | 5 | Find new Dutchie stores |
| `analytics_refresh` | analytics_refresh | 6 hours | 0 | Refresh MVs |

---

## Task Roles

| Role | Description | Creates Tasks For |
|------|-------------|-------------------|
| `payload_fetch` | **NEW** - Fetch from Dutchie API, save to disk | Each dispensary needing crawl |
| `product_refresh` | **CHANGED** - Read local payload, normalize, upsert to DB | Chained from payload_fetch |
| `store_discovery` | Find new dispensaries, returns newStoreIds[] | Single task per platform |
| `entry_point_discovery` | **DEPRECATED** - Resolve platform IDs | No longer used |
| `product_discovery` | Initial product fetch for new stores | Chained from store_discovery |
| `analytics_refresh` | Refresh MVs | Single global task |

### Payload/Refresh Separation (2024-12-10)

The crawl workflow is now split into two phases (a sketch of the save step follows the benefits list):

```
payload_fetch (scheduled every 4h)
  └─► Hit Dutchie GraphQL API
        └─► Save raw JSON to /storage/payloads/{year}/{month}/{day}/store_{id}_{ts}.json.gz
              └─► Record metadata in raw_crawl_payloads table
                    └─► Queue product_refresh task with payload_id

product_refresh (chained from payload_fetch)
  └─► Load payload from filesystem (NOT from API)
        └─► Normalize via DutchieNormalizer
              └─► Upsert to store_products
                    └─► Create snapshots
                          └─► Track missing products
                                └─► Download images
```

**Benefits:**
- **Retry-friendly**: If normalize fails, re-run product_refresh without re-crawling
- **Replay-able**: Run product_refresh against any historical payload
- **Faster refreshes**: Local file read vs network call
- **Historical diffs**: Compare payloads to see what changed between crawls
- **Less API pressure**: Only payload_fetch hits Dutchie

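A hedged sketch of the save step, using Node's built-in `zlib`/`fs` and the documented `/storage/payloads/` layout; the `raw_crawl_payloads` column names are assumptions:

```typescript
import { gzipSync } from 'zlib';
import { mkdirSync, writeFileSync } from 'fs';
import path from 'path';
import { Pool } from 'pg';

const pool = new Pool();

// Sketch of the payload_fetch "save to disk" step. Directory layout follows
// /storage/payloads/{year}/{month}/{day}/store_{id}_{ts}.json.gz as above.
export async function savePayload(dispensaryId: number, payload: unknown): Promise<number> {
  const now = new Date();
  const dir = path.join(
    '/storage/payloads',
    String(now.getUTCFullYear()),
    String(now.getUTCMonth() + 1).padStart(2, '0'),
    String(now.getUTCDate()).padStart(2, '0')
  );
  mkdirSync(dir, { recursive: true });

  const file = path.join(dir, `store_${dispensaryId}_${now.getTime()}.json.gz`);
  writeFileSync(file, gzipSync(JSON.stringify(payload)));

  // Column names here are illustrative; the real table is raw_crawl_payloads.
  const { rows } = await pool.query(
    `INSERT INTO raw_crawl_payloads (dispensary_id, file_path, fetched_at)
     VALUES ($1, $2, NOW()) RETURNING id`,
    [dispensaryId, file]
  );
  return rows[0].id; // payload_id handed to the chained product_refresh task
}
```
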
---

## Task Chaining

Tasks automatically queue follow-up tasks upon successful completion. This creates two main flows:

### Discovery Flow (New Stores)

When `store_discovery` finds new dispensaries, they automatically get their initial product data:

```
store_discovery
  └─► Discovers new locations via Dutchie GraphQL
        └─► Auto-promotes valid locations to dispensaries table
              └─► Collects newDispensaryIds[] from promotions
                    └─► Returns { newStoreIds: [...] } in result

chainNextTask() detects newStoreIds
  └─► Creates product_discovery task for each new store

product_discovery
  └─► Calls handlePayloadFetch() internally
        └─► payload_fetch hits Dutchie API
              └─► Saves raw JSON to /storage/payloads/
                    └─► Queues product_refresh task with payload_id

product_refresh
  └─► Loads payload from filesystem
        └─► Normalizes and upserts to store_products
              └─► Creates snapshots, downloads images
```

**Complete Discovery Chain:**
```
store_discovery → product_discovery → payload_fetch → product_refresh
                      (internal call)     (queues next)
```

### Scheduled Flow (Existing Stores)

For existing stores, the `payload_fetch_all` schedule runs every 4 hours:

```
TaskScheduler (every 60s)
  └─► Checks task_schedules for due schedules
        └─► payload_fetch_all is due
              └─► Generates payload_fetch task for each dispensary

payload_fetch
  └─► Hits Dutchie GraphQL API
        └─► Saves raw JSON to /storage/payloads/
              └─► Queues product_refresh task with payload_id

product_refresh
  └─► Loads payload from filesystem (NOT API)
        └─► Normalizes via DutchieNormalizer
              └─► Upserts to store_products
                    └─► Creates snapshots
```

**Complete Scheduled Chain:**
```
payload_fetch → product_refresh
   (queues)       (reads local)
```

### Chaining Implementation

Task chaining is handled in three places (a sketch of the external branch follows this list):

1. **Internal chaining (handler calls handler):**
   - `product_discovery` calls `handlePayloadFetch()` directly

2. **External chaining (chainNextTask() in task-service.ts):**
   - Called after task completion
   - `store_discovery` → queues `product_discovery` for each newStoreId

3. **Queue-based chaining (taskService.createTask):**
   - `payload_fetch` queues `product_refresh` with `payload: { payload_id }`

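A hedged sketch of the external branch; `chainNextTask` is real per the text above, but its body here is an illustration:

```typescript
interface TaskServiceLike {
  createTask(t: { role: string; dispensary_id: number;
                  platform: string; priority: number }): Promise<void>;
}

// Sketch of chainNextTask() in src/tasks/task-service.ts. Only the
// store_discovery branch is shown; the `result` shape follows the
// { newStoreIds: [...] } contract documented above.
export async function chainNextTask(
  task: { role: string; result: { newStoreIds?: number[] } | null },
  taskService: TaskServiceLike
): Promise<void> {
  if (task.role === 'store_discovery' && Array.isArray(task.result?.newStoreIds)) {
    for (const dispensaryId of task.result!.newStoreIds!) {
      // High priority (10) so new stores jump ahead of batch work.
      await taskService.createTask({
        role: 'product_discovery',
        dispensary_id: dispensaryId,
        platform: 'dutchie',
        priority: 10,
      });
    }
  }
  // payload_fetch → product_refresh chaining happens inside the handler,
  // which queues the follow-up with payload: { payload_id } (see above).
}
```
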
---

## Payload API Endpoints

Raw crawl payloads can be accessed via the Payloads API:

| Endpoint | Method | Description |
|----------|--------|-------------|
| `GET /api/payloads` | GET | List payload metadata (paginated) |
| `GET /api/payloads/:id` | GET | Get payload metadata by ID |
| `GET /api/payloads/:id/data` | GET | Get full payload JSON (decompressed) |
| `GET /api/payloads/store/:dispensaryId` | GET | List payloads for a store |
| `GET /api/payloads/store/:dispensaryId/latest` | GET | Get latest payload for a store |
| `GET /api/payloads/store/:dispensaryId/diff` | GET | Diff two payloads for changes |

### Payload Diff Response

The diff endpoint returns:
```json
{
  "success": true,
  "from": { "id": 123, "fetchedAt": "...", "productCount": 100 },
  "to": { "id": 456, "fetchedAt": "...", "productCount": 105 },
  "diff": {
    "added": 10,
    "removed": 5,
    "priceChanges": 8,
    "stockChanges": 12
  },
  "details": {
    "added": [...],
    "removed": [...],
    "priceChanges": [...],
    "stockChanges": [...]
  }
}
```

---

## API Endpoints

### Schedules (NEW)

| Endpoint | Method | Description |
|----------|--------|-------------|
| `GET /api/schedules` | GET | List all schedules |
| `PUT /api/schedules/:id` | PUT | Update schedule |
| `POST /api/schedules/:id/trigger` | POST | Run schedule immediately |

### Task Creation (rewired 2024-12-10)

| Endpoint | Method | Description |
|----------|--------|-------------|
| `POST /api/job-queue/enqueue` | POST | Create single task |
| `POST /api/job-queue/enqueue-batch` | POST | Create batch tasks |
| `POST /api/job-queue/enqueue-state` | POST | Create tasks for a state |
| `POST /api/tasks` | POST | Direct task creation |

### Task Management

| Endpoint | Method | Description |
|----------|--------|-------------|
| `GET /api/tasks` | GET | List tasks |
| `GET /api/tasks/:id` | GET | Get single task |
| `GET /api/tasks/counts` | GET | Task counts by status |
| `POST /api/tasks/recover-stale` | POST | Recover stale tasks |

---

## Key Files

| File | Purpose |
|------|---------|
| `src/services/task-scheduler.ts` | **NEW** - DB-driven scheduler |
| `src/tasks/task-worker.ts` | Worker that processes tasks |
| `src/tasks/task-service.ts` | Task CRUD operations |
| `src/tasks/handlers/payload-fetch.ts` | **NEW** - Fetches from API, saves to disk |
| `src/tasks/handlers/product-refresh.ts` | **CHANGED** - Reads from disk, processes to DB |
| `src/utils/payload-storage.ts` | **NEW** - Payload save/load utilities |
| `src/routes/tasks.ts` | Task API endpoints |
| `src/routes/job-queue.ts` | Job Queue UI endpoints (rewired) |
| `migrations/079_task_schedules.sql` | Schedule table |
| `migrations/080_raw_crawl_payloads.sql` | Payload metadata table |
| `migrations/081_payload_fetch_columns.sql` | payload, last_fetch_at columns |
| `migrations/074_worker_task_queue.sql` | Task queue table |

---

## Legacy Code (DEPRECATED)

| File | Status | Replacement |
|------|--------|-------------|
| `src/services/scheduler.ts` | DEPRECATED | `task-scheduler.ts` |
| `dispensary_crawl_jobs` table | ORPHANED | `worker_tasks` |
| `job_schedules` table | LEGACY | `task_schedules` |

---

## Dashboard Integration

Both pages remain wired to the dashboard:

| Page | Data Source | Actions |
|------|-------------|---------|
| **Job Queue** | `worker_tasks`, `task_schedules` | Create tasks, view schedules |
| **Task Queue** | `worker_tasks` | View tasks, recover stale |

---

## Multi-Replica Safety

The scheduler uses `SELECT FOR UPDATE SKIP LOCKED` to ensure:

1. **Only one replica** executes a schedule at a time
2. **No duplicate tasks** are created
3. **Survives pod restarts** - state lives in the DB, not in memory
4. **Self-healing** - recovers stale tasks on startup

```sql
-- This query is atomic across all API server replicas
SELECT * FROM task_schedules
WHERE enabled = true AND next_run_at <= NOW()
FOR UPDATE SKIP LOCKED
```

---

## Worker Scaling (K8s)

Workers run as a StatefulSet in Kubernetes. You can scale from the admin UI or the CLI.

### From Admin UI

The Workers page (`/admin/workers`) provides:
- Current replica count display
- Scale up/down buttons
- Target replica input

### API Endpoints

| Endpoint | Method | Description |
|----------|--------|-------------|
| `GET /api/workers/k8s/replicas` | GET | Get current/desired replica counts |
| `POST /api/workers/k8s/scale` | POST | Scale to N replicas (body: `{ replicas: N }`) |

### From CLI

```bash
# View current replicas
kubectl get statefulset scraper-worker -n dispensary-scraper

# Scale to 10 workers
kubectl scale statefulset scraper-worker -n dispensary-scraper --replicas=10

# Scale down to 3 workers
kubectl scale statefulset scraper-worker -n dispensary-scraper --replicas=3
```

### Configuration

Environment variables for the API server:

| Variable | Default | Description |
|----------|---------|-------------|
| `K8S_NAMESPACE` | `dispensary-scraper` | Kubernetes namespace |
| `K8S_WORKER_STATEFULSET` | `scraper-worker` | StatefulSet name |

### RBAC Requirements

The API server pod needs these K8s permissions:

```yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: worker-scaler
  namespace: dispensary-scraper
rules:
  - apiGroups: ["apps"]
    resources: ["statefulsets"]
    verbs: ["get", "patch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: scraper-worker-scaler
  namespace: dispensary-scraper
subjects:
  - kind: ServiceAccount
    name: default
    namespace: dispensary-scraper
roleRef:
  kind: Role
  name: worker-scaler
  apiGroup: rbac.authorization.k8s.io
```

backend/docs/WORKER_TASK_ARCHITECTURE.md (new file, 400 lines)
@@ -0,0 +1,400 @@

# Worker Task Architecture

This document describes the unified task-based worker system that replaces the legacy fragmented job systems.

## Overview

The task worker architecture provides a single, unified system for managing all background work in CannaiQ:

- **Store discovery** - Find new dispensaries on platforms
- **Entry point discovery** - Resolve platform IDs from menu URLs
- **Product discovery** - Initial product fetch for new stores
- **Product resync** - Regular price/stock updates for existing stores
- **Analytics refresh** - Refresh materialized views and analytics

## Architecture

### Database Tables

**`worker_tasks`** - Central task queue
```sql
CREATE TABLE worker_tasks (
  id SERIAL PRIMARY KEY,
  role task_role NOT NULL,          -- What type of work
  dispensary_id INTEGER,            -- Which store (if applicable)
  platform VARCHAR(50),             -- Which platform (dutchie, etc.)
  status task_status DEFAULT 'pending',
  priority INTEGER DEFAULT 0,       -- Higher = process first
  scheduled_for TIMESTAMP,          -- Don't process before this time
  worker_id VARCHAR(100),           -- Which worker claimed it
  claimed_at TIMESTAMP,
  started_at TIMESTAMP,
  completed_at TIMESTAMP,
  last_heartbeat_at TIMESTAMP,      -- For stale detection
  result JSONB,                     -- Output from handler
  error_message TEXT,
  retry_count INTEGER DEFAULT 0,
  max_retries INTEGER DEFAULT 3,
  created_at TIMESTAMP DEFAULT NOW(),
  updated_at TIMESTAMP DEFAULT NOW()
);
```

**Key indexes:**
- `idx_worker_tasks_pending_priority` - For efficient task claiming
- `idx_worker_tasks_active_dispensary` - Prevents concurrent tasks per store (partial unique index)

### Task Roles

| Role | Purpose | Per-Store | Scheduled |
|------|---------|-----------|-----------|
| `store_discovery` | Find new stores on a platform | No | Daily |
| `entry_point_discovery` | Resolve platform IDs | Yes | On-demand |
| `product_discovery` | Initial product fetch | Yes | After entry_point |
| `product_resync` | Price/stock updates | Yes | Every 4 hours |
| `analytics_refresh` | Refresh MVs | No | Daily |

### Task Lifecycle

```
pending → claimed → running → completed
                        ↓
                     failed
```

1. **pending** - Task is waiting to be picked up
2. **claimed** - Worker has claimed it (atomic via SELECT FOR UPDATE SKIP LOCKED)
3. **running** - Worker is actively processing
4. **completed** - Task finished successfully
5. **failed** - Task encountered an error
6. **stale** - Task lost its worker (recovered automatically)

## Files

### Core Files

| File | Purpose |
|------|---------|
| `src/tasks/task-service.ts` | TaskService - CRUD, claiming, capacity metrics |
| `src/tasks/task-worker.ts` | TaskWorker - Main worker loop |
| `src/tasks/index.ts` | Module exports |
| `src/routes/tasks.ts` | API endpoints |
| `migrations/074_worker_task_queue.sql` | Database schema |

### Task Handlers

| File | Role |
|------|------|
| `src/tasks/handlers/store-discovery.ts` | `store_discovery` |
| `src/tasks/handlers/entry-point-discovery.ts` | `entry_point_discovery` |
| `src/tasks/handlers/product-discovery.ts` | `product_discovery` |
| `src/tasks/handlers/product-resync.ts` | `product_resync` |
| `src/tasks/handlers/analytics-refresh.ts` | `analytics_refresh` |

## Running Workers

### Environment Variables

| Variable | Default | Description |
|----------|---------|-------------|
| `WORKER_ROLE` | (required) | Which task role to process |
| `WORKER_ID` | auto-generated | Custom worker identifier |
| `POLL_INTERVAL_MS` | 5000 | How often to check for tasks |
| `HEARTBEAT_INTERVAL_MS` | 30000 | How often to update the heartbeat |

### Starting a Worker

```bash
# Start a product resync worker
WORKER_ROLE=product_resync npx tsx src/tasks/task-worker.ts

# Start with a custom ID
WORKER_ROLE=product_resync WORKER_ID=resync-1 npx tsx src/tasks/task-worker.ts

# Start multiple workers for different roles
WORKER_ROLE=store_discovery npx tsx src/tasks/task-worker.ts &
WORKER_ROLE=product_resync npx tsx src/tasks/task-worker.ts &
```

### Kubernetes Deployment

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: task-worker-resync
spec:
  replicas: 3
  template:
    spec:
      containers:
        - name: worker
          image: code.cannabrands.app/creationshop/dispensary-scraper:latest
          command: ["npx", "tsx", "src/tasks/task-worker.ts"]
          env:
            - name: WORKER_ROLE
              value: "product_resync"
```

## API Endpoints

### Task Management

| Endpoint | Method | Description |
|----------|--------|-------------|
| `/api/tasks` | GET | List tasks with filters |
| `/api/tasks` | POST | Create a new task |
| `/api/tasks/:id` | GET | Get task by ID |
| `/api/tasks/counts` | GET | Get counts by status |
| `/api/tasks/capacity` | GET | Get capacity metrics |
| `/api/tasks/capacity/:role` | GET | Get role-specific capacity |
| `/api/tasks/recover-stale` | POST | Recover tasks from dead workers |

### Task Generation

| Endpoint | Method | Description |
|----------|--------|-------------|
| `/api/tasks/generate/resync` | POST | Generate daily resync tasks |
| `/api/tasks/generate/discovery` | POST | Create store discovery task |

### Migration (from legacy systems)

| Endpoint | Method | Description |
|----------|--------|-------------|
| `/api/tasks/migration/status` | GET | Compare old vs new systems |
| `/api/tasks/migration/disable-old-schedules` | POST | Disable job_schedules |
| `/api/tasks/migration/cancel-pending-crawl-jobs` | POST | Cancel old crawl jobs |
| `/api/tasks/migration/create-resync-tasks` | POST | Create tasks for all stores |
| `/api/tasks/migration/full-migrate` | POST | One-click migration |

### Role-Specific Endpoints

| Endpoint | Method | Description |
|----------|--------|-------------|
| `/api/tasks/role/:role/last-completion` | GET | Last completion time |
| `/api/tasks/role/:role/recent` | GET | Recent completions |
| `/api/tasks/store/:id/active` | GET | Check if store has an active task |

## Capacity Planning

The `v_worker_capacity` view provides real-time metrics:

```sql
SELECT * FROM v_worker_capacity;
```

Returns:
- `pending_tasks` - Tasks waiting to be claimed
- `ready_tasks` - Tasks ready now (scheduled_for is null or past)
- `claimed_tasks` - Tasks claimed but not started
- `running_tasks` - Tasks actively processing
- `completed_last_hour` - Recent completions
- `failed_last_hour` - Recent failures
- `active_workers` - Workers with recent heartbeats
- `avg_duration_sec` - Average task duration
- `tasks_per_worker_hour` - Throughput estimate
- `estimated_hours_to_drain` - Time to clear the queue

### Scaling Recommendations

```javascript
// API: GET /api/tasks/capacity/:role
{
  "role": "product_resync",
  "pending_tasks": 500,
  "active_workers": 3,
  "workers_needed": {
    "for_1_hour": 10,
    "for_4_hours": 3,
    "for_8_hours": 2
  }
}
```

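The `workers_needed` figures fall out of simple arithmetic on the view's throughput estimate; a sketch, with the ceiling rounding assumed:

```typescript
// Sketch: derive workers_needed from v_worker_capacity metrics.
// tasksPerWorkerHour comes from the view; ceil() rounding is an assumption.
export function workersNeeded(pendingTasks: number,
                              tasksPerWorkerHour: number,
                              targetHours: number): number {
  if (tasksPerWorkerHour <= 0) return 0; // no throughput data yet
  return Math.ceil(pendingTasks / (tasksPerWorkerHour * targetHours));
}

// Matches the example response above at ~50 tasks/worker/hour:
// 500/(50*1) = 10, 500/(50*4) = 2.5 → 3, 500/(50*8) = 1.25 → 2.
```
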
## Task Chaining

Tasks can automatically create follow-up tasks:

```
store_discovery → entry_point_discovery → product_discovery
                                               ↓
                              (store has platform_dispensary_id)
                                               ↓
                                      Daily resync tasks
```

The `chainNextTask()` method handles this automatically.

## Stale Task Recovery

Tasks are considered stale if `last_heartbeat_at` is older than the threshold (default 10 minutes).

```sql
SELECT recover_stale_tasks(10);  -- 10 minute threshold
```

Or via the API:
```bash
curl -X POST /api/tasks/recover-stale \
  -H 'Content-Type: application/json' \
  -d '{"threshold_minutes": 10}'
```

## Migration from Legacy Systems

### Legacy Systems Replaced

1. **job_schedules + job_run_logs** - Scheduled job definitions
2. **dispensary_crawl_jobs** - Per-dispensary crawl queue
3. **SyncOrchestrator + HydrationWorker** - Raw payload processing

### Migration Steps

**Option 1: One-Click Migration**
```bash
curl -X POST /api/tasks/migration/full-migrate
```

This will:
1. Disable all job_schedules
2. Cancel pending dispensary_crawl_jobs
3. Generate resync tasks for all stores
4. Create discovery and analytics tasks

**Option 2: Manual Migration**
```bash
# 1. Check current status
curl /api/tasks/migration/status

# 2. Disable old schedules
curl -X POST /api/tasks/migration/disable-old-schedules

# 3. Cancel pending crawl jobs
curl -X POST /api/tasks/migration/cancel-pending-crawl-jobs

# 4. Create resync tasks
curl -X POST /api/tasks/migration/create-resync-tasks \
  -H 'Content-Type: application/json' \
  -d '{"state_code": "AZ"}'

# 5. Generate daily resync schedule
curl -X POST /api/tasks/generate/resync \
  -H 'Content-Type: application/json' \
  -d '{"batches_per_day": 6}'
```

## Per-Store Locking

The system prevents concurrent tasks for the same store using a partial unique index:

```sql
CREATE UNIQUE INDEX idx_worker_tasks_active_dispensary
ON worker_tasks (dispensary_id)
WHERE dispensary_id IS NOT NULL
  AND status IN ('claimed', 'running');
```

This ensures only one task can be active per store at any time.

## Task Priority

Tasks are claimed in priority order (higher first), then by creation time:

```sql
ORDER BY priority DESC, created_at ASC
```

Default priorities:
- `store_discovery`: 0
- `entry_point_discovery`: 10 (high - new stores)
- `product_discovery`: 10 (high - new stores)
- `product_resync`: 0
- `analytics_refresh`: 0

## Scheduled Tasks

Tasks can be scheduled for future execution:

```javascript
await taskService.createTask({
  role: 'product_resync',
  dispensary_id: 123,
  scheduled_for: new Date('2025-01-10T06:00:00Z'),
});
```

The `generate_resync_tasks()` function creates staggered tasks throughout the day:

```sql
SELECT generate_resync_tasks(6, '2025-01-10');  -- 6 batches = every 4 hours
```

## Dashboard Integration

The admin dashboard shows task queue status in the main overview:

```
Task Queue Summary
------------------
Pending:      45
Running:       3
Completed: 1,234
Failed:       12
```

Full task management is available at `/admin/tasks`.

## Error Handling

Failed tasks include the error message in `error_message` and can be retried:

```sql
-- View failed tasks
SELECT id, role, dispensary_id, error_message, retry_count
FROM worker_tasks
WHERE status = 'failed'
ORDER BY completed_at DESC
LIMIT 20;

-- Retry failed tasks
UPDATE worker_tasks
SET status = 'pending', retry_count = retry_count + 1
WHERE status = 'failed' AND retry_count < max_retries;
```

## Monitoring

### Logs

Workers log to stdout:
```
[TaskWorker] Starting worker worker-product_resync-a1b2c3d4 for role: product_resync
[TaskWorker] Claimed task 123 (product_resync) for dispensary 456
[TaskWorker] Task 123 completed successfully
```

### Health Check

Check if workers are active:
```sql
SELECT worker_id, role, COUNT(*), MAX(last_heartbeat_at)
FROM worker_tasks
WHERE last_heartbeat_at > NOW() - INTERVAL '5 minutes'
GROUP BY worker_id, role;
```

### Metrics

```sql
-- Tasks by status
SELECT status, COUNT(*) FROM worker_tasks GROUP BY status;

-- Tasks by role
SELECT role, status, COUNT(*) FROM worker_tasks GROUP BY role, status;

-- Average duration by role
SELECT role, AVG(EXTRACT(EPOCH FROM (completed_at - started_at))) AS avg_seconds
FROM worker_tasks
WHERE status = 'completed' AND completed_at > NOW() - INTERVAL '24 hours'
GROUP BY role;
```

backend/k8s/cronjob-ip2location.yaml (new file, 69 lines)
@@ -0,0 +1,69 @@
apiVersion: batch/v1
kind: CronJob
metadata:
  name: ip2location-update
  namespace: default
spec:
  # Run on the 1st of every month at 3am UTC
  schedule: "0 3 1 * *"
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 3
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      template:
        spec:
          containers:
            - name: ip2location-updater
              image: curlimages/curl:latest
              command:
                - /bin/sh
                - -c
                - |
                  set -e
                  echo "Downloading IP2Location LITE DB5..."

                  # Download to temp
                  cd /tmp
                  curl -L -o ip2location.zip "https://www.ip2location.com/download/?token=${IP2LOCATION_TOKEN}&file=DB5LITEBIN"

                  # Extract
                  unzip -o ip2location.zip

                  # Find and copy the BIN file
                  BIN_FILE=$(ls *.BIN 2>/dev/null | head -1)
                  if [ -z "$BIN_FILE" ]; then
                    echo "ERROR: No BIN file found"
                    exit 1
                  fi

                  # Copy to shared volume
                  cp "$BIN_FILE" /data/IP2LOCATION-LITE-DB5.BIN

                  echo "Done! Database updated: /data/IP2LOCATION-LITE-DB5.BIN"
              env:
                - name: IP2LOCATION_TOKEN
                  valueFrom:
                    secretKeyRef:
                      name: dutchie-backend-secret
                      key: IP2LOCATION_TOKEN
              volumeMounts:
                - name: ip2location-data
                  mountPath: /data
          restartPolicy: OnFailure
          volumes:
            - name: ip2location-data
              persistentVolumeClaim:
                claimName: ip2location-pvc
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: ip2location-pvc
  namespace: default
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 100Mi

@@ -26,6 +26,12 @@ spec:
            name: dutchie-backend-config
        - secretRef:
            name: dutchie-backend-secret
+       env:
+         - name: IP2LOCATION_DB_PATH
+           value: /data/ip2location/IP2LOCATION-LITE-DB5.BIN
+       volumeMounts:
+         - name: ip2location-data
+           mountPath: /data/ip2location
        resources:
          requests:
            memory: "256Mi"
@@ -45,3 +51,7 @@ spec:
            port: 3010
          initialDelaySeconds: 5
          periodSeconds: 5
+     volumes:
+       - name: ip2location-data
+         persistentVolumeClaim:
+           claimName: ip2location-pvc

@@ -1,18 +1,18 @@
-- Add location columns to proxies table
ALTER TABLE proxies
-ADD COLUMN city VARCHAR(100),
-ADD COLUMN state VARCHAR(100),
-ADD COLUMN country VARCHAR(100),
-ADD COLUMN country_code VARCHAR(2),
-ADD COLUMN location_updated_at TIMESTAMP;
+ADD COLUMN IF NOT EXISTS city VARCHAR(100),
+ADD COLUMN IF NOT EXISTS state VARCHAR(100),
+ADD COLUMN IF NOT EXISTS country VARCHAR(100),
+ADD COLUMN IF NOT EXISTS country_code VARCHAR(2),
+ADD COLUMN IF NOT EXISTS location_updated_at TIMESTAMP;

-- Add index for location-based queries
-CREATE INDEX idx_proxies_location ON proxies(country_code, state, city);
+CREATE INDEX IF NOT EXISTS idx_proxies_location ON proxies(country_code, state, city);

-- Add the same to failed_proxies table
ALTER TABLE failed_proxies
-ADD COLUMN city VARCHAR(100),
-ADD COLUMN state VARCHAR(100),
-ADD COLUMN country VARCHAR(100),
-ADD COLUMN country_code VARCHAR(2),
-ADD COLUMN location_updated_at TIMESTAMP;
+ADD COLUMN IF NOT EXISTS city VARCHAR(100),
+ADD COLUMN IF NOT EXISTS state VARCHAR(100),
+ADD COLUMN IF NOT EXISTS country VARCHAR(100),
+ADD COLUMN IF NOT EXISTS country_code VARCHAR(2),
+ADD COLUMN IF NOT EXISTS location_updated_at TIMESTAMP;

@@ -1,6 +1,6 @@
-- Create dispensaries table as single source of truth
-- This consolidates azdhs_list (official data) + stores (menu data) into one table
-CREATE TABLE dispensaries (
+CREATE TABLE IF NOT EXISTS dispensaries (
  -- Primary key
  id SERIAL PRIMARY KEY,

@@ -43,11 +43,11 @@ CREATE TABLE dispensaries (
);

-- Create indexes for common queries
-CREATE INDEX idx_dispensaries_city ON dispensaries(city);
-CREATE INDEX idx_dispensaries_state ON dispensaries(state);
-CREATE INDEX idx_dispensaries_slug ON dispensaries(slug);
-CREATE INDEX idx_dispensaries_azdhs_id ON dispensaries(azdhs_id);
-CREATE INDEX idx_dispensaries_menu_status ON dispensaries(menu_scrape_status);
+CREATE INDEX IF NOT EXISTS idx_dispensaries_city ON dispensaries(city);
+CREATE INDEX IF NOT EXISTS idx_dispensaries_state ON dispensaries(state);
+CREATE INDEX IF NOT EXISTS idx_dispensaries_slug ON dispensaries(slug);
+CREATE INDEX IF NOT EXISTS idx_dispensaries_azdhs_id ON dispensaries(azdhs_id);
+CREATE INDEX IF NOT EXISTS idx_dispensaries_menu_status ON dispensaries(menu_scrape_status);

-- Create index for location-based queries
-CREATE INDEX idx_dispensaries_location ON dispensaries(latitude, longitude) WHERE latitude IS NOT NULL AND longitude IS NOT NULL;
+CREATE INDEX IF NOT EXISTS idx_dispensaries_location ON dispensaries(latitude, longitude) WHERE latitude IS NOT NULL AND longitude IS NOT NULL;

@@ -1,6 +1,6 @@
-- Create dispensary_changes table for change approval workflow
-- This protects against accidental data destruction by requiring manual review
-CREATE TABLE dispensary_changes (
+CREATE TABLE IF NOT EXISTS dispensary_changes (
  id SERIAL PRIMARY KEY,
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,

@@ -26,10 +26,10 @@ CREATE TABLE dispensary_changes (
);

-- Create indexes for common queries
-CREATE INDEX idx_dispensary_changes_status ON dispensary_changes(status);
-CREATE INDEX idx_dispensary_changes_dispensary_status ON dispensary_changes(dispensary_id, status);
-CREATE INDEX idx_dispensary_changes_created_at ON dispensary_changes(created_at DESC);
-CREATE INDEX idx_dispensary_changes_requires_recrawl ON dispensary_changes(requires_recrawl) WHERE requires_recrawl = TRUE;
+CREATE INDEX IF NOT EXISTS idx_dispensary_changes_status ON dispensary_changes(status);
+CREATE INDEX IF NOT EXISTS idx_dispensary_changes_dispensary_status ON dispensary_changes(dispensary_id, status);
+CREATE INDEX IF NOT EXISTS idx_dispensary_changes_created_at ON dispensary_changes(created_at DESC);
+CREATE INDEX IF NOT EXISTS idx_dispensary_changes_requires_recrawl ON dispensary_changes(requires_recrawl) WHERE requires_recrawl = TRUE;

-- Create function to automatically set requires_recrawl for website/menu_url changes
CREATE OR REPLACE FUNCTION set_requires_recrawl()
@@ -42,7 +42,8 @@ BEGIN
END;
$$ LANGUAGE plpgsql;

--- Create trigger to call the function
+-- Create trigger to call the function (drop first to make idempotent)
+DROP TRIGGER IF EXISTS trigger_set_requires_recrawl ON dispensary_changes;
CREATE TRIGGER trigger_set_requires_recrawl
  BEFORE INSERT ON dispensary_changes
  FOR EACH ROW

@@ -1,6 +1,7 @@
-- Populate dispensaries table from azdhs_list
-- This migrates all 182 AZDHS records with their enriched Google Maps data
-- For multi-location dispensaries with duplicate slugs, append city name to make unique
+-- IDEMPOTENT: Uses ON CONFLICT DO NOTHING to skip already-imported records

WITH ranked_dispensaries AS (
  SELECT
@@ -78,9 +79,10 @@ SELECT
  created_at,
  updated_at
FROM ranked_dispensaries
-ORDER BY id;
+ORDER BY id
+ON CONFLICT (azdhs_id) DO NOTHING;

--- Verify the migration
+-- Verify the migration (idempotent - just logs, doesn't fail)
DO $$
DECLARE
  source_count INTEGER;
@@ -89,9 +91,11 @@ BEGIN
  SELECT COUNT(*) INTO source_count FROM azdhs_list;
  SELECT COUNT(*) INTO dest_count FROM dispensaries;

-  RAISE NOTICE 'Migration complete: % records from azdhs_list → % records in dispensaries', source_count, dest_count;
+  RAISE NOTICE 'Migration status: % records in azdhs_list, % records in dispensaries', source_count, dest_count;

-  IF source_count != dest_count THEN
-    RAISE EXCEPTION 'Record count mismatch! Expected %, got %', source_count, dest_count;
+  IF dest_count >= source_count THEN
+    RAISE NOTICE 'OK: dispensaries table has expected records';
+  ELSE
+    RAISE WARNING 'dispensaries has fewer records than azdhs_list (% vs %)', dest_count, source_count;
+  END IF;
END $$;

@@ -3,15 +3,15 @@

-- Add dispensary_id to products table
ALTER TABLE products
-ADD COLUMN dispensary_id INTEGER REFERENCES dispensaries(id) ON DELETE CASCADE;
+ADD COLUMN IF NOT EXISTS dispensary_id INTEGER REFERENCES dispensaries(id) ON DELETE CASCADE;

-- Add dispensary_id to categories table
ALTER TABLE categories
-ADD COLUMN dispensary_id INTEGER REFERENCES dispensaries(id) ON DELETE CASCADE;
+ADD COLUMN IF NOT EXISTS dispensary_id INTEGER REFERENCES dispensaries(id) ON DELETE CASCADE;

-- Create indexes for the new foreign keys
-CREATE INDEX idx_products_dispensary_id ON products(dispensary_id);
-CREATE INDEX idx_categories_dispensary_id ON categories(dispensary_id);
+CREATE INDEX IF NOT EXISTS idx_products_dispensary_id ON products(dispensary_id);
+CREATE INDEX IF NOT EXISTS idx_categories_dispensary_id ON categories(dispensary_id);

-- NOTE: We'll populate these FKs and migrate data from stores in a separate data migration
-- For now, new scrapers should use dispensary_id, but old store_id still works

backend/migrations/051_worker_definitions.sql (new file, 119 lines)
@@ -0,0 +1,119 @@
|
||||
-- Migration 051: Worker Definitions
-- Creates a dedicated workers table for named workers with roles and assignments

-- Workers table - defines named workers with roles
CREATE TABLE IF NOT EXISTS workers (
    id SERIAL PRIMARY KEY,
    name VARCHAR(100) NOT NULL UNIQUE,
    role VARCHAR(100) NOT NULL,
    description TEXT,
    enabled BOOLEAN DEFAULT TRUE,

    -- Schedule configuration (for dedicated crawl workers)
    schedule_type VARCHAR(50) DEFAULT 'interval', -- 'interval', 'cron', 'manual'
    interval_minutes INTEGER DEFAULT 240,
    cron_expression VARCHAR(100), -- e.g., '0 */4 * * *'
    jitter_minutes INTEGER DEFAULT 30,

    -- Assignment scope
    assignment_type VARCHAR(50) DEFAULT 'all', -- 'all', 'state', 'dispensary', 'chain'
    assigned_state_codes TEXT[], -- e.g., ['AZ', 'CA']
    assigned_dispensary_ids INTEGER[],
    assigned_chain_ids INTEGER[],

    -- Job configuration
    job_type VARCHAR(50) NOT NULL DEFAULT 'dutchie_product_crawl',
    job_config JSONB DEFAULT '{}',
    priority INTEGER DEFAULT 0,
    max_concurrent INTEGER DEFAULT 1,

    -- Status tracking
    status VARCHAR(50) DEFAULT 'idle', -- 'idle', 'running', 'paused', 'error'
    last_run_at TIMESTAMPTZ,
    last_status VARCHAR(50),
    last_error TEXT,
    last_duration_ms INTEGER,
    next_run_at TIMESTAMPTZ,
    current_job_id INTEGER,

    -- Metrics
    total_runs INTEGER DEFAULT 0,
    successful_runs INTEGER DEFAULT 0,
    failed_runs INTEGER DEFAULT 0,
    avg_duration_ms INTEGER,

    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW()
);

-- Worker run history
CREATE TABLE IF NOT EXISTS worker_runs (
    id SERIAL PRIMARY KEY,
    worker_id INTEGER NOT NULL REFERENCES workers(id) ON DELETE CASCADE,
    started_at TIMESTAMPTZ DEFAULT NOW(),
    completed_at TIMESTAMPTZ,
    status VARCHAR(50) DEFAULT 'running', -- 'running', 'success', 'error', 'cancelled'
    duration_ms INTEGER,

    -- What was processed
    jobs_created INTEGER DEFAULT 0,
    jobs_completed INTEGER DEFAULT 0,
    jobs_failed INTEGER DEFAULT 0,
    dispensaries_crawled INTEGER DEFAULT 0,
    products_found INTEGER DEFAULT 0,

    error_message TEXT,
    metadata JSONB DEFAULT '{}',

    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Indexes for efficient lookups
CREATE INDEX IF NOT EXISTS idx_workers_enabled ON workers(enabled) WHERE enabled = TRUE;
CREATE INDEX IF NOT EXISTS idx_workers_next_run ON workers(next_run_at) WHERE enabled = TRUE;
CREATE INDEX IF NOT EXISTS idx_workers_status ON workers(status);
CREATE INDEX IF NOT EXISTS idx_worker_runs_worker_id ON worker_runs(worker_id);
CREATE INDEX IF NOT EXISTS idx_worker_runs_started_at ON worker_runs(started_at DESC);

-- Add assigned_worker_id to dispensary_crawl_jobs if not exists
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM information_schema.columns
        WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'assigned_worker_id'
    ) THEN
        ALTER TABLE dispensary_crawl_jobs ADD COLUMN assigned_worker_id INTEGER REFERENCES workers(id);
    END IF;
END $$;

-- Migrate existing job_schedules workers to the new workers table
INSERT INTO workers (name, role, description, enabled, interval_minutes, jitter_minutes, job_type, job_config, last_run_at, last_status, last_error, last_duration_ms, next_run_at)
SELECT
    worker_name,
    worker_role,
    description,
    enabled,
    base_interval_minutes,
    jitter_minutes,
    job_name,
    job_config,
    last_run_at,
    last_status,
    last_error_message,
    last_duration_ms,
    next_run_at
FROM job_schedules
WHERE worker_name IS NOT NULL
ON CONFLICT (name) DO UPDATE SET
    updated_at = NOW();

-- Available worker roles (reference)
COMMENT ON TABLE workers IS 'Named workers with specific roles and assignments. Roles include:
- product_sync: Crawls products from dispensary menus
- store_discovery: Discovers new dispensary locations
- entry_point_finder: Detects menu providers and resolves platform IDs
- analytics_refresh: Refreshes materialized views and analytics
- price_monitor: Monitors price changes and triggers alerts
- inventory_sync: Syncs inventory levels
- image_processor: Downloads and processes product images
- data_validator: Validates data integrity';
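
-- Example (illustrative sketch, not part of this migration): a scheduler
-- could pick up due workers through the partial index on next_run_at,
-- assuming next_run_at is maintained after each run:
--   SELECT id, name, role, job_type
--   FROM workers
--   WHERE enabled = TRUE AND next_run_at <= NOW()
--   ORDER BY priority DESC, next_run_at ASC;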
49  backend/migrations/052_seo_settings.sql  Normal file
@@ -0,0 +1,49 @@
-- Migration 052: SEO Settings Table
-- Key/value store for SEO Orchestrator configuration

CREATE TABLE IF NOT EXISTS seo_settings (
    id SERIAL PRIMARY KEY,
    key TEXT UNIQUE NOT NULL,
    value JSONB NOT NULL,
    created_at TIMESTAMP DEFAULT NOW(),
    updated_at TIMESTAMP DEFAULT NOW()
);

-- Create index on key for fast lookups
CREATE INDEX IF NOT EXISTS idx_seo_settings_key ON seo_settings(key);

-- Seed with default settings
INSERT INTO seo_settings (key, value) VALUES
    -- Section 1: Global Content Generation Settings
    ('primary_prompt_template', '"You are a cannabis industry content expert. Generate SEO-optimized content for {{page_type}} pages about {{subject}}. Focus on: {{focus_areas}}. Maintain a {{tone}} tone and keep content {{length}}."'),
    ('regeneration_prompt_template', '"Regenerate the following SEO content with fresh perspectives. Original topic: {{subject}}. Improve upon: {{improvement_areas}}. Maintain compliance with cannabis industry standards."'),
    ('default_content_length', '"medium"'),
    ('tone_voice', '"informational"'),

    -- Section 2: Automatic Refresh Rules
    ('auto_refresh_interval', '"weekly"'),
    ('trigger_pct_product_change', 'true'),
    ('trigger_pct_brand_change', 'true'),
    ('trigger_new_stores', 'true'),
    ('trigger_market_shift', 'false'),
    ('webhook_url', '""'),
    ('notify_on_trigger', 'false'),

    -- Section 3: Page-Level Defaults
    ('default_title_template', '"{{state_name}} Dispensaries | Find Cannabis Near You | CannaiQ"'),
    ('default_meta_description_template', '"Discover the best dispensaries in {{state_name}}. Browse {{dispensary_count}}+ licensed retailers, compare prices, and find cannabis products near you."'),
    ('default_slug_template', '"dispensaries-{{state_code_lower}}"'),
    ('default_og_image_template', '"/images/seo/og-{{state_code_lower}}.jpg"'),
    ('enable_ai_images', 'false'),

    -- Section 4: Crawl / Dataset Configuration
    ('primary_data_provider', '"cannaiq"'),
    ('fallback_data_provider', '"dutchie"'),
    ('min_data_freshness_hours', '24'),
    ('stale_data_behavior', '"allow_with_warning"')
ON CONFLICT (key) DO NOTHING;

-- Record migration
INSERT INTO schema_migrations (version, name, applied_at)
VALUES ('052', 'seo_settings', NOW())
ON CONFLICT (version) DO NOTHING;
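
-- Example (illustrative only): reading and updating a single setting. The key
-- exists in the seed above; the replacement value is hypothetical.
--   SELECT value FROM seo_settings WHERE key = 'tone_voice';
--   INSERT INTO seo_settings (key, value) VALUES ('tone_voice', '"conversational"')
--   ON CONFLICT (key) DO UPDATE SET value = EXCLUDED.value, updated_at = NOW();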
@@ -0,0 +1,42 @@
-- Migration 057: Add crawl_enabled and dutchie_verified fields to dispensaries
--
-- Purpose:
-- 1. Add crawl_enabled to control which dispensaries get crawled
-- 2. Add dutchie_verified to track Dutchie source-of-truth verification
-- 3. Default existing records to crawl_enabled = TRUE to preserve behavior
--
-- After this migration, run the harmonization script to:
-- - Match dispensaries to Dutchie discoveries
-- - Update platform_dispensary_id from Dutchie
-- - Set dutchie_verified = TRUE for matches
-- - Set crawl_enabled = FALSE for unverified records

-- Add crawl_enabled column (defaults to true to not break existing crawls)
ALTER TABLE dispensaries
ADD COLUMN IF NOT EXISTS crawl_enabled BOOLEAN DEFAULT TRUE;

-- Add dutchie_verified column to track if record is verified against Dutchie
ALTER TABLE dispensaries
ADD COLUMN IF NOT EXISTS dutchie_verified BOOLEAN DEFAULT FALSE;

-- Add dutchie_verified_at timestamp
ALTER TABLE dispensaries
ADD COLUMN IF NOT EXISTS dutchie_verified_at TIMESTAMP WITH TIME ZONE;

-- Add dutchie_discovery_id to link back to the discovery record
ALTER TABLE dispensaries
ADD COLUMN IF NOT EXISTS dutchie_discovery_id BIGINT REFERENCES dutchie_discovery_locations(id);

-- Create index for crawl queries (only crawl enabled dispensaries)
CREATE INDEX IF NOT EXISTS idx_dispensaries_crawl_enabled
ON dispensaries(crawl_enabled, state)
WHERE crawl_enabled = TRUE;

-- Create index for dutchie verification status
CREATE INDEX IF NOT EXISTS idx_dispensaries_dutchie_verified
ON dispensaries(dutchie_verified, state);

COMMENT ON COLUMN dispensaries.crawl_enabled IS 'Whether this dispensary should be included in crawl jobs. Set to FALSE for unverified or problematic records.';
COMMENT ON COLUMN dispensaries.dutchie_verified IS 'Whether this dispensary has been verified against Dutchie source of truth (matched by slug or manually linked).';
COMMENT ON COLUMN dispensaries.dutchie_verified_at IS 'Timestamp when Dutchie verification was completed.';
COMMENT ON COLUMN dispensaries.dutchie_discovery_id IS 'Link to the dutchie_discovery_locations record this was matched/verified against.';
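
-- Example (illustrative sketch): a crawl scheduler would select candidates
-- through the partial index above; the state filter is a hypothetical value.
--   SELECT id, name
--   FROM dispensaries
--   WHERE crawl_enabled = TRUE AND state = 'AZ';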
56  backend/migrations/065_slug_verification_tracking.sql  Normal file
@@ -0,0 +1,56 @@
-- Migration 065: Slug verification and data source tracking
-- Adds columns to track when slug/menu data was verified and from what source

-- Add slug verification columns to dispensaries
ALTER TABLE dispensaries
ADD COLUMN IF NOT EXISTS slug_source VARCHAR(50),
ADD COLUMN IF NOT EXISTS slug_verified_at TIMESTAMPTZ,
ADD COLUMN IF NOT EXISTS slug_status VARCHAR(20) DEFAULT 'unverified',
ADD COLUMN IF NOT EXISTS menu_url_source VARCHAR(50),
ADD COLUMN IF NOT EXISTS menu_url_verified_at TIMESTAMPTZ,
ADD COLUMN IF NOT EXISTS platform_id_source VARCHAR(50),
ADD COLUMN IF NOT EXISTS platform_id_verified_at TIMESTAMPTZ,
ADD COLUMN IF NOT EXISTS country VARCHAR(2) DEFAULT 'US';

-- Add index for finding unverified stores
CREATE INDEX IF NOT EXISTS idx_dispensaries_slug_status
ON dispensaries(slug_status)
WHERE slug_status != 'verified';

-- Add index for country
CREATE INDEX IF NOT EXISTS idx_dispensaries_country
ON dispensaries(country);

-- Comment on columns
COMMENT ON COLUMN dispensaries.slug_source IS 'Source of slug data: dutchie_api, manual, azdhs, discovery, etc.';
COMMENT ON COLUMN dispensaries.slug_verified_at IS 'When the slug was last verified against the source';
COMMENT ON COLUMN dispensaries.slug_status IS 'Status: unverified, verified, invalid, changed';
COMMENT ON COLUMN dispensaries.menu_url_source IS 'Source of menu_url: dutchie_api, website_scrape, manual, etc.';
COMMENT ON COLUMN dispensaries.menu_url_verified_at IS 'When the menu_url was last verified';
COMMENT ON COLUMN dispensaries.platform_id_source IS 'Source of platform_dispensary_id: dutchie_api, graphql_resolution, etc.';
COMMENT ON COLUMN dispensaries.platform_id_verified_at IS 'When the platform_dispensary_id was last verified';
COMMENT ON COLUMN dispensaries.country IS 'ISO 2-letter country code: US, CA, etc.';

-- Update Green Pharms Mesa with verified Dutchie data
UPDATE dispensaries
SET
    slug = 'green-pharms-mesa',
    menu_url = 'https://dutchie.com/embedded-menu/green-pharms-mesa',
    menu_type = 'dutchie',
    platform_dispensary_id = '68dc47a2af90f2e653f8df30',
    slug_source = 'dutchie_api',
    slug_verified_at = NOW(),
    slug_status = 'verified',
    menu_url_source = 'dutchie_api',
    menu_url_verified_at = NOW(),
    platform_id_source = 'dutchie_api',
    platform_id_verified_at = NOW(),
    updated_at = NOW()
WHERE id = 232;

-- Mark all other AZ dispensaries as needing verification
UPDATE dispensaries
SET slug_status = 'unverified'
WHERE state = 'AZ'
  AND id != 232
  AND (slug_status IS NULL OR slug_status = 'unverified');
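
-- Example (illustrative only): find stores still needing slug verification,
-- served by the partial index idx_dispensaries_slug_status:
--   SELECT id, name, slug, slug_status
--   FROM dispensaries
--   WHERE slug_status != 'verified';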
140  backend/migrations/066_dutchie_field_alignment.sql  Normal file
@@ -0,0 +1,140 @@
-- Migration 066: Align dispensaries and discovery_locations tables with Dutchie field names
-- Uses snake_case convention (Postgres standard) mapped from Dutchie's camelCase
--
-- Changes:
-- 1. dispensaries: rename address→address1, zip→zipcode, remove company_name
-- 2. dispensaries: add missing Dutchie fields
-- 3. dutchie_discovery_locations: add missing Dutchie fields

-- ============================================================================
-- DISPENSARIES TABLE
-- ============================================================================

-- Rename address to address1 (matches Dutchie's address1)
ALTER TABLE dispensaries RENAME COLUMN address TO address1;

-- Rename zip to zipcode (matches Dutchie's zip, but we use zipcode for clarity)
ALTER TABLE dispensaries RENAME COLUMN zip TO zipcode;

-- Drop company_name (redundant with name)
ALTER TABLE dispensaries DROP COLUMN IF EXISTS company_name;

-- Add address2
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS address2 VARCHAR(255);

-- Add country
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS country VARCHAR(100) DEFAULT 'United States';

-- Add timezone
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS timezone VARCHAR(50);

-- Add email
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS email VARCHAR(255);

-- Add description
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS description TEXT;

-- Add logo_image (Dutchie: logoImage)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS logo_image TEXT;

-- Add banner_image (Dutchie: bannerImage)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS banner_image TEXT;

-- Add offer_pickup (Dutchie: offerPickup)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS offer_pickup BOOLEAN DEFAULT TRUE;

-- Add offer_delivery (Dutchie: offerDelivery)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS offer_delivery BOOLEAN DEFAULT FALSE;

-- Add offer_curbside_pickup (Dutchie: offerCurbsidePickup)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS offer_curbside_pickup BOOLEAN DEFAULT FALSE;

-- Add is_medical (Dutchie: isMedical)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS is_medical BOOLEAN DEFAULT FALSE;

-- Add is_recreational (Dutchie: isRecreational)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS is_recreational BOOLEAN DEFAULT FALSE;

-- Add chain_slug (Dutchie: chain)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS chain_slug VARCHAR(255);

-- Add enterprise_id (Dutchie: retailer.enterpriseId)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS enterprise_id VARCHAR(100);

-- Add status (Dutchie: status - open/closed)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS status VARCHAR(50);

-- Add c_name (Dutchie: cName - the URL slug used in embedded menus)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS c_name VARCHAR(255);

-- ============================================================================
-- DUTCHIE_DISCOVERY_LOCATIONS TABLE
-- ============================================================================

-- Add phone
ALTER TABLE dutchie_discovery_locations ADD COLUMN IF NOT EXISTS phone VARCHAR(50);

-- Add website (Dutchie: embedBackUrl)
ALTER TABLE dutchie_discovery_locations ADD COLUMN IF NOT EXISTS website TEXT;

-- Add email
ALTER TABLE dutchie_discovery_locations ADD COLUMN IF NOT EXISTS email VARCHAR(255);

-- Add description
ALTER TABLE dutchie_discovery_locations ADD COLUMN IF NOT EXISTS description TEXT;

-- Add logo_image
ALTER TABLE dutchie_discovery_locations ADD COLUMN IF NOT EXISTS logo_image TEXT;

-- Add banner_image
ALTER TABLE dutchie_discovery_locations ADD COLUMN IF NOT EXISTS banner_image TEXT;

-- Add chain_slug
ALTER TABLE dutchie_discovery_locations ADD COLUMN IF NOT EXISTS chain_slug VARCHAR(255);

-- Add enterprise_id
ALTER TABLE dutchie_discovery_locations ADD COLUMN IF NOT EXISTS enterprise_id VARCHAR(100);

-- Add c_name
ALTER TABLE dutchie_discovery_locations ADD COLUMN IF NOT EXISTS c_name VARCHAR(255);

-- Add country
ALTER TABLE dutchie_discovery_locations ADD COLUMN IF NOT EXISTS country VARCHAR(100) DEFAULT 'United States';

-- Add store status
ALTER TABLE dutchie_discovery_locations ADD COLUMN IF NOT EXISTS store_status VARCHAR(50);

-- ============================================================================
-- INDEXES
-- ============================================================================

-- Index for chain lookups
CREATE INDEX IF NOT EXISTS idx_dispensaries_chain_slug ON dispensaries(chain_slug) WHERE chain_slug IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_discovery_locations_chain_slug ON dutchie_discovery_locations(chain_slug) WHERE chain_slug IS NOT NULL;

-- Index for enterprise lookups (for multi-location chains)
CREATE INDEX IF NOT EXISTS idx_dispensaries_enterprise_id ON dispensaries(enterprise_id) WHERE enterprise_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_discovery_locations_enterprise_id ON dutchie_discovery_locations(enterprise_id) WHERE enterprise_id IS NOT NULL;

-- Index for c_name lookups
CREATE INDEX IF NOT EXISTS idx_dispensaries_c_name ON dispensaries(c_name) WHERE c_name IS NOT NULL;

-- ============================================================================
-- COMMENTS
-- ============================================================================

COMMENT ON COLUMN dispensaries.address1 IS 'Street address line 1 (Dutchie: address1)';
COMMENT ON COLUMN dispensaries.address2 IS 'Street address line 2 (Dutchie: address2)';
COMMENT ON COLUMN dispensaries.zipcode IS 'ZIP/postal code (Dutchie: zip)';
COMMENT ON COLUMN dispensaries.c_name IS 'Dutchie URL slug for embedded menus (Dutchie: cName)';
COMMENT ON COLUMN dispensaries.chain_slug IS 'Chain identifier slug (Dutchie: chain)';
COMMENT ON COLUMN dispensaries.enterprise_id IS 'Parent enterprise UUID (Dutchie: retailer.enterpriseId)';
COMMENT ON COLUMN dispensaries.logo_image IS 'Logo image URL (Dutchie: logoImage)';
COMMENT ON COLUMN dispensaries.banner_image IS 'Banner image URL (Dutchie: bannerImage)';
COMMENT ON COLUMN dispensaries.offer_pickup IS 'Offers in-store pickup (Dutchie: offerPickup)';
COMMENT ON COLUMN dispensaries.offer_delivery IS 'Offers delivery (Dutchie: offerDelivery)';
COMMENT ON COLUMN dispensaries.offer_curbside_pickup IS 'Offers curbside pickup (Dutchie: offerCurbsidePickup)';
COMMENT ON COLUMN dispensaries.is_medical IS 'Licensed for medical sales (Dutchie: isMedical)';
COMMENT ON COLUMN dispensaries.is_recreational IS 'Licensed for recreational sales (Dutchie: isRecreational)';

SELECT 'Migration 066 completed: Dutchie field alignment' as status;
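
-- Example (illustrative sketch): group sibling locations of a chain via the
-- new chain_slug field and its partial index; 'harvest' is a hypothetical slug.
--   SELECT name, city, state
--   FROM dispensaries
--   WHERE chain_slug = 'harvest'
--   ORDER BY state, city;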
24  backend/migrations/067_promotion_log.sql  Normal file
@@ -0,0 +1,24 @@
-- Promotion log table for tracking discovery → dispensary promotions
-- Tracks validation and promotion actions for audit/review

CREATE TABLE IF NOT EXISTS dutchie_promotion_log (
    id SERIAL PRIMARY KEY,
    discovery_id INTEGER REFERENCES dutchie_discovery_locations(id) ON DELETE SET NULL,
    dispensary_id INTEGER REFERENCES dispensaries(id) ON DELETE SET NULL,
    action VARCHAR(50) NOT NULL, -- 'validated', 'rejected', 'promoted_create', 'promoted_update', 'skipped'
    state_code VARCHAR(10),
    store_name VARCHAR(255),
    validation_errors TEXT[], -- Array of error messages if rejected
    field_changes JSONB, -- Before/after snapshot of changed fields
    triggered_by VARCHAR(100) DEFAULT 'auto', -- 'auto', 'manual', 'api'
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);

-- Indexes for efficient querying
CREATE INDEX IF NOT EXISTS idx_promotion_log_discovery_id ON dutchie_promotion_log(discovery_id);
CREATE INDEX IF NOT EXISTS idx_promotion_log_dispensary_id ON dutchie_promotion_log(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_promotion_log_action ON dutchie_promotion_log(action);
CREATE INDEX IF NOT EXISTS idx_promotion_log_state_code ON dutchie_promotion_log(state_code);
CREATE INDEX IF NOT EXISTS idx_promotion_log_created_at ON dutchie_promotion_log(created_at DESC);

COMMENT ON TABLE dutchie_promotion_log IS 'Audit log for discovery location validation and promotion to dispensaries';
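
-- Example (illustrative only, all values hypothetical): the audit row written
-- when a discovery record is promoted into a newly created dispensary:
--   INSERT INTO dutchie_promotion_log
--       (discovery_id, dispensary_id, action, state_code, store_name, triggered_by)
--   VALUES (101, 555, 'promoted_create', 'AZ', 'Example Dispensary', 'auto');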
95  backend/migrations/068_crawler_status_alerts.sql  Normal file
@@ -0,0 +1,95 @@
-- Migration 068: Crawler Status Alerts
-- Creates status_alerts table for dashboard notifications and status change logging

-- ============================================================
-- STATUS ALERTS TABLE
-- ============================================================

CREATE TABLE IF NOT EXISTS crawler_status_alerts (
    id SERIAL PRIMARY KEY,

    -- References
    dispensary_id INTEGER REFERENCES dispensaries(id),
    profile_id INTEGER REFERENCES dispensary_crawler_profiles(id),

    -- Alert info
    alert_type VARCHAR(50) NOT NULL, -- 'status_change', 'crawl_error', 'validation_failed', 'promoted', 'demoted'
    severity VARCHAR(20) DEFAULT 'info', -- 'info', 'warning', 'error', 'critical'

    -- Status transition
    previous_status VARCHAR(50),
    new_status VARCHAR(50),

    -- Context
    message TEXT,
    error_details JSONB,
    metadata JSONB, -- Additional context (product counts, error codes, etc.)

    -- Tracking
    acknowledged BOOLEAN DEFAULT FALSE,
    acknowledged_at TIMESTAMP WITH TIME ZONE,
    acknowledged_by VARCHAR(100),

    -- Timestamps
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);

-- Indexes for common queries
CREATE INDEX IF NOT EXISTS idx_crawler_status_alerts_dispensary ON crawler_status_alerts(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_crawler_status_alerts_type ON crawler_status_alerts(alert_type);
CREATE INDEX IF NOT EXISTS idx_crawler_status_alerts_severity ON crawler_status_alerts(severity);
CREATE INDEX IF NOT EXISTS idx_crawler_status_alerts_unack ON crawler_status_alerts(acknowledged) WHERE acknowledged = FALSE;
CREATE INDEX IF NOT EXISTS idx_crawler_status_alerts_created ON crawler_status_alerts(created_at DESC);

-- ============================================================
-- STATUS DEFINITIONS (for reference/validation)
-- ============================================================

COMMENT ON TABLE crawler_status_alerts IS 'Crawler status change notifications for dashboard alerting';
COMMENT ON COLUMN crawler_status_alerts.alert_type IS 'Type: status_change, crawl_error, validation_failed, promoted, demoted';
COMMENT ON COLUMN crawler_status_alerts.severity IS 'Severity: info, warning, error, critical';
COMMENT ON COLUMN crawler_status_alerts.previous_status IS 'Previous crawler status before change';
COMMENT ON COLUMN crawler_status_alerts.new_status IS 'New crawler status after change';

-- ============================================================
-- STATUS TRACKING ON PROFILES
-- ============================================================

-- Add columns for status tracking if not exists
DO $$
BEGIN
    -- Consecutive success count for auto-promotion
    IF NOT EXISTS (SELECT 1 FROM information_schema.columns
                   WHERE table_name = 'dispensary_crawler_profiles' AND column_name = 'consecutive_successes') THEN
        ALTER TABLE dispensary_crawler_profiles ADD COLUMN consecutive_successes INTEGER DEFAULT 0;
    END IF;

    -- Consecutive failure count for auto-demotion
    IF NOT EXISTS (SELECT 1 FROM information_schema.columns
                   WHERE table_name = 'dispensary_crawler_profiles' AND column_name = 'consecutive_failures') THEN
        ALTER TABLE dispensary_crawler_profiles ADD COLUMN consecutive_failures INTEGER DEFAULT 0;
    END IF;

    -- Last status change timestamp
    IF NOT EXISTS (SELECT 1 FROM information_schema.columns
                   WHERE table_name = 'dispensary_crawler_profiles' AND column_name = 'status_changed_at') THEN
        ALTER TABLE dispensary_crawler_profiles ADD COLUMN status_changed_at TIMESTAMP WITH TIME ZONE;
    END IF;

    -- Status change reason
    IF NOT EXISTS (SELECT 1 FROM information_schema.columns
                   WHERE table_name = 'dispensary_crawler_profiles' AND column_name = 'status_reason') THEN
        ALTER TABLE dispensary_crawler_profiles ADD COLUMN status_reason TEXT;
    END IF;
END $$;

-- ============================================================
-- VALID STATUS VALUES
-- ============================================================
-- Status values for dispensary_crawler_profiles.status:
-- 'sandbox'      - Newly created, being validated
-- 'production'   - Healthy, actively crawled
-- 'needs_manual' - Requires human intervention
-- 'failing'      - Multiple consecutive failures
-- 'disabled'     - Manually disabled
-- 'legacy'       - No profile, uses default method (virtual status)
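
-- Example (illustrative sketch): a dashboard would typically page through
-- unacknowledged alerts via the partial index idx_crawler_status_alerts_unack:
--   SELECT id, alert_type, severity, message, created_at
--   FROM crawler_status_alerts
--   WHERE acknowledged = FALSE
--   ORDER BY created_at DESC
--   LIMIT 50;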
163  backend/migrations/069_six_stage_status.sql  Normal file
@@ -0,0 +1,163 @@
-- Migration 069: Seven-Stage Status System
--
-- Implements explicit 7-stage pipeline for store lifecycle:
-- 1. discovered  - Found via Dutchie API, raw data
-- 2. validated   - Passed field checks, ready for promotion
-- 3. promoted    - In dispensaries table, has crawler profile
-- 4. sandbox     - First crawl attempted, testing
-- 5. hydrating   - Products are being loaded/updated
-- 6. production  - Healthy, scheduled crawls via Horizon
-- 7. failing     - Crawl errors, needs attention

-- ============================================================
-- STAGE ENUM TYPE
-- ============================================================

DO $$
BEGIN
    -- Create enum if not exists
    IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'store_stage') THEN
        CREATE TYPE store_stage AS ENUM (
            'discovered',
            'validated',
            'promoted',
            'sandbox',
            'hydrating',
            'production',
            'failing'
        );
    END IF;
END $$;

-- ============================================================
-- UPDATE DISCOVERY LOCATIONS TABLE
-- ============================================================

-- Add stage column to discovery locations (replaces status)
DO $$
BEGIN
    IF NOT EXISTS (SELECT 1 FROM information_schema.columns
                   WHERE table_name = 'dutchie_discovery_locations' AND column_name = 'stage') THEN
        ALTER TABLE dutchie_discovery_locations ADD COLUMN stage VARCHAR(20) DEFAULT 'discovered';
    END IF;
END $$;

-- Migrate existing status values to stage
UPDATE dutchie_discovery_locations
SET stage = CASE
    WHEN status = 'discovered' THEN 'discovered'
    WHEN status = 'verified' THEN 'validated'
    WHEN status = 'rejected' THEN 'failing'
    WHEN status = 'merged' THEN 'validated'
    ELSE 'discovered'
END
WHERE stage IS NULL OR stage = '';

-- ============================================================
-- UPDATE CRAWLER PROFILES TABLE
-- ============================================================

-- Ensure status column exists and update to new values
UPDATE dispensary_crawler_profiles
SET status = CASE
    WHEN status = 'sandbox' THEN 'sandbox'
    WHEN status = 'production' THEN 'production'
    WHEN status = 'needs_manual' THEN 'failing'
    WHEN status = 'failing' THEN 'failing'
    WHEN status = 'disabled' THEN 'failing'
    WHEN status IS NULL THEN 'promoted'
    ELSE 'promoted'
END;

-- ============================================================
-- ADD STAGE TRACKING TO DISPENSARIES
-- ============================================================

DO $$
BEGIN
    -- Add stage column to dispensaries for quick filtering
    IF NOT EXISTS (SELECT 1 FROM information_schema.columns
                   WHERE table_name = 'dispensaries' AND column_name = 'stage') THEN
        ALTER TABLE dispensaries ADD COLUMN stage VARCHAR(20) DEFAULT 'promoted';
    END IF;

    -- Add stage_changed_at for tracking
    IF NOT EXISTS (SELECT 1 FROM information_schema.columns
                   WHERE table_name = 'dispensaries' AND column_name = 'stage_changed_at') THEN
        ALTER TABLE dispensaries ADD COLUMN stage_changed_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP;
    END IF;

    -- Add first_crawl_at to track sandbox → production transition
    IF NOT EXISTS (SELECT 1 FROM information_schema.columns
                   WHERE table_name = 'dispensaries' AND column_name = 'first_crawl_at') THEN
        ALTER TABLE dispensaries ADD COLUMN first_crawl_at TIMESTAMP WITH TIME ZONE;
    END IF;

    -- Add last_successful_crawl_at
    IF NOT EXISTS (SELECT 1 FROM information_schema.columns
                   WHERE table_name = 'dispensaries' AND column_name = 'last_successful_crawl_at') THEN
        ALTER TABLE dispensaries ADD COLUMN last_successful_crawl_at TIMESTAMP WITH TIME ZONE;
    END IF;
END $$;

-- Set initial stage for existing dispensaries based on their crawler profile status
UPDATE dispensaries d
SET stage = COALESCE(
    (SELECT dcp.status FROM dispensary_crawler_profiles dcp
     WHERE dcp.dispensary_id = d.id AND dcp.enabled = true
     ORDER BY dcp.updated_at DESC LIMIT 1),
    'promoted'
)
WHERE d.stage IS NULL OR d.stage = '';

-- ============================================================
-- INDEXES FOR STAGE-BASED QUERIES
-- ============================================================

CREATE INDEX IF NOT EXISTS idx_dispensaries_stage ON dispensaries(stage);
CREATE INDEX IF NOT EXISTS idx_dispensaries_stage_state ON dispensaries(stage, state);
CREATE INDEX IF NOT EXISTS idx_discovery_locations_stage ON dutchie_discovery_locations(stage);
CREATE INDEX IF NOT EXISTS idx_crawler_profiles_status ON dispensary_crawler_profiles(status);

-- ============================================================
-- STAGE TRANSITION LOG
-- ============================================================

CREATE TABLE IF NOT EXISTS stage_transitions (
    id SERIAL PRIMARY KEY,

    -- What changed
    entity_type VARCHAR(20) NOT NULL, -- 'discovery_location' or 'dispensary'
    entity_id INTEGER NOT NULL,

    -- Stage change
    from_stage VARCHAR(20),
    to_stage VARCHAR(20) NOT NULL,

    -- Context
    trigger_type VARCHAR(50) NOT NULL, -- 'api', 'scheduler', 'manual', 'auto'
    trigger_endpoint VARCHAR(200),

    -- Outcome
    success BOOLEAN DEFAULT TRUE,
    error_message TEXT,
    metadata JSONB,

    -- Timing
    duration_ms INTEGER,
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);

CREATE INDEX IF NOT EXISTS idx_stage_transitions_entity ON stage_transitions(entity_type, entity_id);
CREATE INDEX IF NOT EXISTS idx_stage_transitions_to_stage ON stage_transitions(to_stage);
CREATE INDEX IF NOT EXISTS idx_stage_transitions_created ON stage_transitions(created_at DESC);

-- ============================================================
-- COMMENTS
-- ============================================================

COMMENT ON TABLE stage_transitions IS 'Audit log for all stage transitions in the pipeline';
COMMENT ON COLUMN dispensaries.stage IS 'Current pipeline stage: discovered, validated, promoted, sandbox, hydrating, production, failing';
COMMENT ON COLUMN dispensaries.stage_changed_at IS 'When the stage was last changed';
COMMENT ON COLUMN dispensaries.first_crawl_at IS 'When the first crawl was attempted (sandbox stage)';
COMMENT ON COLUMN dispensaries.last_successful_crawl_at IS 'When the last successful crawl completed';
239  backend/migrations/070_product_variants.sql  Normal file
@@ -0,0 +1,239 @@
-- ============================================================================
-- Migration 070: Product Variants Tables
-- ============================================================================
--
-- Purpose: Store variant-level pricing and inventory as first-class entities
-- to enable time-series analytics, price comparisons, and sale tracking.
--
-- Enables queries like:
-- - Price history for a specific variant (1g Blue Dream over time)
-- - Sale frequency analysis (how often is this on special?)
-- - Cross-store price comparison (who has cheapest 1g flower?)
-- - Current specials across all stores
--
-- RULES:
-- - STRICTLY ADDITIVE (no DROP, DELETE, TRUNCATE)
-- - All new tables use IF NOT EXISTS
-- - All indexes use IF NOT EXISTS
--
-- ============================================================================

-- ============================================================================
-- SECTION 1: PRODUCT_VARIANTS TABLE (Current State)
-- ============================================================================
-- One row per product+option combination. Tracks current pricing/inventory.

CREATE TABLE IF NOT EXISTS product_variants (
    id SERIAL PRIMARY KEY,
    store_product_id INTEGER NOT NULL REFERENCES store_products(id) ON DELETE CASCADE,
    dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,

    -- Variant identity (from Dutchie POSMetaData.children)
    option VARCHAR(100) NOT NULL, -- "1g", "3.5g", "1/8oz", "100mg"
    canonical_sku VARCHAR(100), -- Dutchie canonicalSKU
    canonical_id VARCHAR(100), -- Dutchie canonicalID
    canonical_name VARCHAR(500), -- Dutchie canonicalName

    -- Current pricing (in dollars, not cents)
    price_rec NUMERIC(10,2),
    price_med NUMERIC(10,2),
    price_rec_special NUMERIC(10,2),
    price_med_special NUMERIC(10,2),

    -- Current inventory
    quantity INTEGER,
    quantity_available INTEGER,
    in_stock BOOLEAN DEFAULT TRUE,

    -- Special/sale status
    is_on_special BOOLEAN DEFAULT FALSE,

    -- Weight/size parsing (for analytics)
    weight_value NUMERIC(10,2), -- 1, 3.5, 28, etc.
    weight_unit VARCHAR(20), -- g, oz, mg, ml, etc.

    -- Timestamps
    first_seen_at TIMESTAMPTZ DEFAULT NOW(),
    last_seen_at TIMESTAMPTZ DEFAULT NOW(),
    last_price_change_at TIMESTAMPTZ,
    last_stock_change_at TIMESTAMPTZ,

    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW(),

    UNIQUE(store_product_id, option)
);

-- Indexes for common queries
CREATE INDEX IF NOT EXISTS idx_variants_store_product ON product_variants(store_product_id);
CREATE INDEX IF NOT EXISTS idx_variants_dispensary ON product_variants(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_variants_option ON product_variants(option);
CREATE INDEX IF NOT EXISTS idx_variants_in_stock ON product_variants(dispensary_id, in_stock) WHERE in_stock = TRUE;
CREATE INDEX IF NOT EXISTS idx_variants_on_special ON product_variants(dispensary_id, is_on_special) WHERE is_on_special = TRUE;
CREATE INDEX IF NOT EXISTS idx_variants_canonical_sku ON product_variants(canonical_sku) WHERE canonical_sku IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_variants_price_rec ON product_variants(price_rec) WHERE price_rec IS NOT NULL;

COMMENT ON TABLE product_variants IS 'Current state of each product variant (weight/size option). One row per product+option.';
COMMENT ON COLUMN product_variants.option IS 'Weight/size option string from Dutchie (e.g., "1g", "3.5g", "1/8oz")';
COMMENT ON COLUMN product_variants.canonical_sku IS 'Dutchie POS SKU for cross-store matching';

-- ============================================================================
-- SECTION 2: PRODUCT_VARIANT_SNAPSHOTS TABLE (Historical Data)
-- ============================================================================
-- Time-series data for variant pricing. One row per variant per crawl.
-- CRITICAL: NEVER DELETE from this table.

CREATE TABLE IF NOT EXISTS product_variant_snapshots (
    id SERIAL PRIMARY KEY,
    product_variant_id INTEGER NOT NULL REFERENCES product_variants(id) ON DELETE CASCADE,
    store_product_id INTEGER REFERENCES store_products(id) ON DELETE SET NULL,
    dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
    crawl_run_id INTEGER REFERENCES crawl_runs(id) ON DELETE SET NULL,

    -- Variant identity (denormalized for query performance)
    option VARCHAR(100) NOT NULL,

    -- Pricing at time of capture
    price_rec NUMERIC(10,2),
    price_med NUMERIC(10,2),
    price_rec_special NUMERIC(10,2),
    price_med_special NUMERIC(10,2),

    -- Inventory at time of capture
    quantity INTEGER,
    in_stock BOOLEAN DEFAULT TRUE,

    -- Special status at time of capture
    is_on_special BOOLEAN DEFAULT FALSE,

    -- Feed presence (FALSE = variant missing from crawl)
    is_present_in_feed BOOLEAN DEFAULT TRUE,

    -- Capture timestamp
    captured_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Indexes for time-series queries
CREATE INDEX IF NOT EXISTS idx_variant_snapshots_variant ON product_variant_snapshots(product_variant_id, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_variant_snapshots_dispensary ON product_variant_snapshots(dispensary_id, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_variant_snapshots_crawl ON product_variant_snapshots(crawl_run_id) WHERE crawl_run_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_variant_snapshots_captured ON product_variant_snapshots(captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_variant_snapshots_special ON product_variant_snapshots(is_on_special, captured_at DESC) WHERE is_on_special = TRUE;
CREATE INDEX IF NOT EXISTS idx_variant_snapshots_option ON product_variant_snapshots(option, captured_at DESC);

COMMENT ON TABLE product_variant_snapshots IS 'Historical variant pricing/inventory. One row per variant per crawl. NEVER DELETE.';

-- ============================================================================
-- SECTION 3: USEFUL VIEWS
-- ============================================================================

-- View: Current specials across all stores
CREATE OR REPLACE VIEW v_current_specials AS
SELECT
    pv.id as variant_id,
    sp.id as product_id,
    sp.name_raw as product_name,
    sp.brand_name_raw as brand_name,
    sp.category_raw as category,
    d.id as dispensary_id,
    d.name as dispensary_name,
    d.city,
    d.state,
    pv.option,
    pv.price_rec,
    pv.price_rec_special,
    ROUND(((pv.price_rec - pv.price_rec_special) / NULLIF(pv.price_rec, 0)) * 100, 1) as discount_percent,
    pv.quantity,
    pv.in_stock,
    pv.last_seen_at
FROM product_variants pv
JOIN store_products sp ON sp.id = pv.store_product_id
JOIN dispensaries d ON d.id = pv.dispensary_id
WHERE pv.is_on_special = TRUE
  AND pv.in_stock = TRUE
  AND pv.price_rec_special IS NOT NULL
  AND pv.price_rec_special < pv.price_rec;

COMMENT ON VIEW v_current_specials IS 'All products currently on special across all stores';
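
-- Example (illustrative sketch): the view makes "best discounts in a state" a
-- one-liner; the state filter is a hypothetical value.
--   SELECT product_name, dispensary_name, option, price_rec, price_rec_special, discount_percent
--   FROM v_current_specials
--   WHERE state = 'AZ'
--   ORDER BY discount_percent DESC
--   LIMIT 20;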

-- View: Price comparison for a product across stores
CREATE OR REPLACE VIEW v_price_comparison AS
SELECT
    sp.name_raw as product_name,
    sp.brand_name_raw as brand_name,
    sp.category_raw as category,
    pv.option,
    d.id as dispensary_id,
    d.name as dispensary_name,
    d.city,
    pv.price_rec,
    pv.price_rec_special,
    pv.is_on_special,
    pv.in_stock,
    pv.quantity,
    RANK() OVER (PARTITION BY sp.name_raw, pv.option ORDER BY COALESCE(pv.price_rec_special, pv.price_rec) ASC) as price_rank
FROM product_variants pv
JOIN store_products sp ON sp.id = pv.store_product_id
JOIN dispensaries d ON d.id = pv.dispensary_id
WHERE pv.in_stock = TRUE
  AND (pv.price_rec IS NOT NULL OR pv.price_rec_special IS NOT NULL);

COMMENT ON VIEW v_price_comparison IS 'Compare prices for same product across stores, ranked by price';

-- View: Latest snapshot per variant
CREATE OR REPLACE VIEW v_latest_variant_snapshots AS
SELECT DISTINCT ON (product_variant_id)
    pvs.*
FROM product_variant_snapshots pvs
ORDER BY product_variant_id, captured_at DESC;

-- ============================================================================
-- SECTION 4: HELPER FUNCTION FOR SALE FREQUENCY
-- ============================================================================

-- Function to calculate sale frequency for a variant
CREATE OR REPLACE FUNCTION get_variant_sale_stats(p_variant_id INTEGER, p_days INTEGER DEFAULT 30)
RETURNS TABLE (
    total_snapshots BIGINT,
    times_on_special BIGINT,
    special_frequency_pct NUMERIC,
    avg_discount_pct NUMERIC,
    min_price NUMERIC,
    max_price NUMERIC,
    avg_price NUMERIC
) AS $$
BEGIN
    RETURN QUERY
    SELECT
        COUNT(*)::BIGINT as total_snapshots,
        COUNT(*) FILTER (WHERE is_on_special)::BIGINT as times_on_special,
        ROUND((COUNT(*) FILTER (WHERE is_on_special)::NUMERIC / NULLIF(COUNT(*), 0)) * 100, 1) as special_frequency_pct,
        ROUND(AVG(
            CASE WHEN is_on_special AND price_rec_special IS NOT NULL AND price_rec IS NOT NULL
                 THEN ((price_rec - price_rec_special) / NULLIF(price_rec, 0)) * 100
            END
        ), 1) as avg_discount_pct,
        MIN(COALESCE(price_rec_special, price_rec)) as min_price,
        MAX(price_rec) as max_price,
        ROUND(AVG(COALESCE(price_rec_special, price_rec)), 2) as avg_price
    FROM product_variant_snapshots
    WHERE product_variant_id = p_variant_id
      AND captured_at >= NOW() - (p_days || ' days')::INTERVAL;
END;
$$ LANGUAGE plpgsql;

COMMENT ON FUNCTION get_variant_sale_stats IS 'Get sale frequency and price stats for a variant over N days';
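
-- Example usage (illustrative; the variant id is hypothetical):
--   SELECT * FROM get_variant_sale_stats(42, 30);
-- Returns one row of snapshot counts, special frequency, and price stats
-- for variant 42 over the last 30 days.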

-- ============================================================================
-- DONE
-- ============================================================================

SELECT 'Migration 070 completed. Product variants tables ready for time-series analytics.' AS status;
53  backend/migrations/071_harmonize_store_products.sql  Normal file
@@ -0,0 +1,53 @@
-- Migration 071: Harmonize store_products with dutchie_products
-- Adds missing columns to store_products to consolidate on a single canonical table

-- Product details
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS description TEXT;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS weight VARCHAR(50);
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS weights JSONB;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS measurements JSONB;

-- Cannabinoid/terpene data
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS effects JSONB;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS terpenes JSONB;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS cannabinoids_v2 JSONB;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS thc_content NUMERIC(10,4);
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS cbd_content NUMERIC(10,4);

-- Images
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS images JSONB;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS primary_image_url TEXT;

-- Inventory
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS total_quantity_available INTEGER DEFAULT 0;

-- Status/flags
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS status VARCHAR(50);
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS featured BOOLEAN DEFAULT FALSE;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS coming_soon BOOLEAN DEFAULT FALSE;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS visibility_lost BOOLEAN DEFAULT FALSE;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS visibility_lost_at TIMESTAMP WITH TIME ZONE;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS visibility_restored_at TIMESTAMP WITH TIME ZONE;

-- Threshold flags (Dutchie-specific)
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS is_below_threshold BOOLEAN DEFAULT FALSE;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS is_below_kiosk_threshold BOOLEAN DEFAULT FALSE;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS options_below_threshold BOOLEAN DEFAULT FALSE;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS options_below_kiosk_threshold BOOLEAN DEFAULT FALSE;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS certificate_of_analysis_enabled BOOLEAN DEFAULT FALSE;

-- Platform metadata
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS external_product_id VARCHAR(100);
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS c_name VARCHAR(500);
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS past_c_names TEXT[];
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS latest_raw_payload JSONB;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS created_at_platform TIMESTAMP WITH TIME ZONE;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS updated_at_platform TIMESTAMP WITH TIME ZONE;

-- Indexes for common queries
CREATE INDEX IF NOT EXISTS idx_store_products_external_id ON store_products(external_product_id);
CREATE INDEX IF NOT EXISTS idx_store_products_visibility_lost ON store_products(visibility_lost) WHERE visibility_lost = TRUE;
CREATE INDEX IF NOT EXISTS idx_store_products_status ON store_products(status);

-- Add comment
COMMENT ON TABLE store_products IS 'Canonical product table - consolidated from dutchie_products';
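
-- Example (illustrative sketch): the partial index above keeps this cheap;
-- find products that dropped out of the feed:
--   SELECT id, name_raw, visibility_lost_at
--   FROM store_products
--   WHERE visibility_lost = TRUE;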
74  backend/migrations/072_product_views.sql  Normal file
@@ -0,0 +1,74 @@
-- Migration 072: Create compatibility views for store_products and store_product_snapshots
-- These views provide backward-compatible column names for API routes

-- v_products view - aliases store_products columns to match legacy dutchie_products naming
CREATE OR REPLACE VIEW v_products AS
SELECT
    id,
    dispensary_id,
    provider_product_id as external_product_id,
    provider_product_id as dutchie_id,
    name_raw as name,
    brand_name_raw as brand_name,
    category_raw as type,
    subcategory_raw as subcategory,
    strain_type,
    thc_percent as thc,
    cbd_percent as cbd,
    stock_status,
    is_in_stock,
    stock_quantity,
    image_url,
    primary_image_url,
    images,
    effects,
    description,
    is_on_special,
    featured,
    medical_only,
    rec_only,
    external_product_id as external_id,
    provider,
    created_at,
    updated_at
FROM store_products;

-- v_product_snapshots view - aliases store_product_snapshots columns to match legacy naming
CREATE OR REPLACE VIEW v_product_snapshots AS
SELECT
    id,
    store_product_id,
    dispensary_id,
    provider,
    provider_product_id,
    crawl_run_id,
    captured_at as crawled_at,
    name_raw,
    brand_name_raw,
    category_raw,
    subcategory_raw,
    -- Convert price_rec (dollars) to rec_min_price_cents (cents)
    CASE WHEN price_rec IS NOT NULL THEN (price_rec * 100)::integer END as rec_min_price_cents,
    CASE WHEN price_rec IS NOT NULL THEN (price_rec * 100)::integer END as rec_max_price_cents,
    CASE WHEN price_rec_special IS NOT NULL THEN (price_rec_special * 100)::integer END as rec_min_special_price_cents,
    CASE WHEN price_med IS NOT NULL THEN (price_med * 100)::integer END as med_min_price_cents,
    CASE WHEN price_med IS NOT NULL THEN (price_med * 100)::integer END as med_max_price_cents,
    CASE WHEN price_med_special IS NOT NULL THEN (price_med_special * 100)::integer END as med_min_special_price_cents,
    is_on_special as special,
    discount_percent,
    is_in_stock,
    stock_quantity,
    stock_status,
    stock_quantity as total_quantity_available,
    thc_percent,
    cbd_percent,
    image_url,
    raw_data as options,
    created_at
FROM store_product_snapshots;

-- Add indexes for the views' underlying tables
CREATE INDEX IF NOT EXISTS idx_store_products_dispensary ON store_products(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_store_products_stock ON store_products(stock_status);
CREATE INDEX IF NOT EXISTS idx_store_snapshots_product ON store_product_snapshots(store_product_id);
CREATE INDEX IF NOT EXISTS idx_store_snapshots_captured ON store_product_snapshots(captured_at DESC);
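
-- Example (illustrative only): legacy API code can keep querying the old
-- column names through the view; the dispensary id is a hypothetical value.
--   SELECT name, brand_name, type, thc, is_in_stock
--   FROM v_products
--   WHERE dispensary_id = 232;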
12  backend/migrations/073_proxy_timezone.sql  Normal file
@@ -0,0 +1,12 @@
-- Add timezone column to proxies table for geo-consistent fingerprinting
-- This allows matching Accept-Language and other headers to proxy location

ALTER TABLE proxies
ADD COLUMN IF NOT EXISTS timezone VARCHAR(50);

-- Add timezone to failed_proxies as well
ALTER TABLE failed_proxies
ADD COLUMN IF NOT EXISTS timezone VARCHAR(50);

-- Comment explaining usage
COMMENT ON COLUMN proxies.timezone IS 'IANA timezone (e.g., America/Phoenix) for geo-consistent fingerprinting';
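
-- Example (illustrative sketch, assuming proxies has an id column): a crawler
-- picking a proxy could read its timezone to build geo-consistent headers:
--   SELECT id, timezone
--   FROM proxies
--   WHERE timezone IS NOT NULL
--   LIMIT 1;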
322  backend/migrations/074_worker_task_queue.sql  Normal file
@@ -0,0 +1,322 @@
-- Migration 074: Worker Task Queue System
-- Implements role-based task queue with per-store locking and capacity tracking

-- Task queue table
CREATE TABLE IF NOT EXISTS worker_tasks (
    id SERIAL PRIMARY KEY,

    -- Task identification
    role VARCHAR(50) NOT NULL, -- store_discovery, entry_point_discovery, product_discovery, product_resync, analytics_refresh
    dispensary_id INTEGER REFERENCES dispensaries(id) ON DELETE CASCADE,
    platform VARCHAR(20), -- dutchie, jane, treez, etc.

    -- Task state
    status VARCHAR(20) NOT NULL DEFAULT 'pending',
    priority INTEGER DEFAULT 0, -- Higher = more urgent

    -- Scheduling
    scheduled_for TIMESTAMPTZ, -- For batch scheduling (e.g., every 4 hours)

    -- Ownership
    worker_id VARCHAR(100), -- Pod name or worker ID
    claimed_at TIMESTAMPTZ,
    started_at TIMESTAMPTZ,
    completed_at TIMESTAMPTZ,
    last_heartbeat_at TIMESTAMPTZ,

    -- Results
    result JSONB, -- Task output data
    error_message TEXT,
    retry_count INTEGER DEFAULT 0,
    max_retries INTEGER DEFAULT 3,

    -- Metadata
    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW(),

    -- Constraints
    CONSTRAINT valid_status CHECK (status IN ('pending', 'claimed', 'running', 'completed', 'failed', 'stale'))
);

-- Indexes for efficient task claiming
CREATE INDEX IF NOT EXISTS idx_worker_tasks_pending
ON worker_tasks(role, priority DESC, created_at ASC)
WHERE status = 'pending';

CREATE INDEX IF NOT EXISTS idx_worker_tasks_claimed
ON worker_tasks(worker_id, claimed_at)
WHERE status = 'claimed';

CREATE INDEX IF NOT EXISTS idx_worker_tasks_running
ON worker_tasks(worker_id, last_heartbeat_at)
WHERE status = 'running';

CREATE INDEX IF NOT EXISTS idx_worker_tasks_dispensary
ON worker_tasks(dispensary_id)
WHERE dispensary_id IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_worker_tasks_scheduled
ON worker_tasks(scheduled_for)
WHERE status = 'pending' AND scheduled_for IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_worker_tasks_history
ON worker_tasks(role, completed_at DESC)
WHERE status IN ('completed', 'failed');

-- Partial unique index to prevent duplicate active tasks per store
-- Only one task can be claimed/running for a given dispensary at a time
CREATE UNIQUE INDEX IF NOT EXISTS idx_worker_tasks_unique_active_store
ON worker_tasks(dispensary_id)
WHERE status IN ('claimed', 'running') AND dispensary_id IS NOT NULL;

-- Worker registration table (tracks active workers)
CREATE TABLE IF NOT EXISTS worker_registry (
    id SERIAL PRIMARY KEY,
    worker_id VARCHAR(100) UNIQUE NOT NULL,
    role VARCHAR(50) NOT NULL,
    pod_name VARCHAR(100),
    hostname VARCHAR(100),
    started_at TIMESTAMPTZ DEFAULT NOW(),
    last_heartbeat_at TIMESTAMPTZ DEFAULT NOW(),
    tasks_completed INTEGER DEFAULT 0,
    tasks_failed INTEGER DEFAULT 0,
    status VARCHAR(20) DEFAULT 'active',

    CONSTRAINT valid_worker_status CHECK (status IN ('active', 'idle', 'offline'))
);

CREATE INDEX IF NOT EXISTS idx_worker_registry_role
ON worker_registry(role, status);

CREATE INDEX IF NOT EXISTS idx_worker_registry_heartbeat
ON worker_registry(last_heartbeat_at)
WHERE status = 'active';

-- Task completion tracking (summarized history)
CREATE TABLE IF NOT EXISTS task_completion_log (
    id SERIAL PRIMARY KEY,
    role VARCHAR(50) NOT NULL,
    date DATE NOT NULL DEFAULT CURRENT_DATE,
    hour INTEGER NOT NULL DEFAULT EXTRACT(HOUR FROM NOW()),

    tasks_created INTEGER DEFAULT 0,
    tasks_completed INTEGER DEFAULT 0,
    tasks_failed INTEGER DEFAULT 0,

    avg_duration_sec NUMERIC(10,2),
    min_duration_sec NUMERIC(10,2),
    max_duration_sec NUMERIC(10,2),

    updated_at TIMESTAMPTZ DEFAULT NOW(),

    UNIQUE(role, date, hour)
);

-- Capacity planning view
CREATE OR REPLACE VIEW v_worker_capacity AS
SELECT
    role,
    COUNT(*) FILTER (WHERE status = 'pending') as pending_tasks,
    COUNT(*) FILTER (WHERE status = 'pending' AND (scheduled_for IS NULL OR scheduled_for <= NOW())) as ready_tasks,
    COUNT(*) FILTER (WHERE status = 'claimed') as claimed_tasks,
    COUNT(*) FILTER (WHERE status = 'running') as running_tasks,
    COUNT(*) FILTER (WHERE status = 'completed' AND completed_at > NOW() - INTERVAL '1 hour') as completed_last_hour,
    COUNT(*) FILTER (WHERE status = 'failed' AND completed_at > NOW() - INTERVAL '1 hour') as failed_last_hour,
    COUNT(DISTINCT worker_id) FILTER (WHERE status IN ('claimed', 'running')) as active_workers,
    AVG(EXTRACT(EPOCH FROM (completed_at - started_at)))
        FILTER (WHERE status = 'completed' AND completed_at > NOW() - INTERVAL '1 hour') as avg_duration_sec,
    -- Capacity planning metrics
    CASE
        WHEN COUNT(*) FILTER (WHERE status = 'completed' AND completed_at > NOW() - INTERVAL '1 hour') > 0
        THEN 3600.0 / NULLIF(AVG(EXTRACT(EPOCH FROM (completed_at - started_at)))
            FILTER (WHERE status = 'completed' AND completed_at > NOW() - INTERVAL '1 hour'), 0)
        ELSE NULL
    END as tasks_per_worker_hour,
    -- Estimated time to drain queue
    CASE
        WHEN COUNT(DISTINCT worker_id) FILTER (WHERE status IN ('claimed', 'running')) > 0
         AND COUNT(*) FILTER (WHERE status = 'completed' AND completed_at > NOW() - INTERVAL '1 hour') > 0
        THEN COUNT(*) FILTER (WHERE status = 'pending') / NULLIF(
            COUNT(DISTINCT worker_id) FILTER (WHERE status IN ('claimed', 'running')) *
            (3600.0 / NULLIF(AVG(EXTRACT(EPOCH FROM (completed_at - started_at)))
                FILTER (WHERE status = 'completed' AND completed_at > NOW() - INTERVAL '1 hour'), 0)),
            0
        )
        ELSE NULL
    END as estimated_hours_to_drain
FROM worker_tasks
GROUP BY role;

-- Task history view (for UI)
CREATE OR REPLACE VIEW v_task_history AS
SELECT
    t.id,
    t.role,
    t.dispensary_id,
    d.name as dispensary_name,
    t.platform,
    t.status,
    t.priority,
    t.worker_id,
    t.scheduled_for,
    t.claimed_at,
    t.started_at,
    t.completed_at,
    t.error_message,
    t.retry_count,
    t.created_at,
    EXTRACT(EPOCH FROM (t.completed_at - t.started_at)) as duration_sec
FROM worker_tasks t
LEFT JOIN dispensaries d ON d.id = t.dispensary_id
ORDER BY t.created_at DESC;

-- Function to claim a task atomically
CREATE OR REPLACE FUNCTION claim_task(
    p_role VARCHAR(50),
    p_worker_id VARCHAR(100)
) RETURNS worker_tasks AS $$
DECLARE
    claimed_task worker_tasks;
BEGIN
    UPDATE worker_tasks
    SET
        status = 'claimed',
        worker_id = p_worker_id,
        claimed_at = NOW(),
        updated_at = NOW()
    WHERE id = (
        SELECT id FROM worker_tasks
        WHERE role = p_role
          AND status = 'pending'
          AND (scheduled_for IS NULL OR scheduled_for <= NOW())
          -- Exclude stores that already have an active task
          AND (dispensary_id IS NULL OR dispensary_id NOT IN (
              SELECT dispensary_id FROM worker_tasks
              WHERE status IN ('claimed', 'running')
                AND dispensary_id IS NOT NULL
          ))
        ORDER BY priority DESC, created_at ASC
        LIMIT 1
        FOR UPDATE SKIP LOCKED
    )
    RETURNING * INTO claimed_task;

    RETURN claimed_task;
END;
$$ LANGUAGE plpgsql;
|
||||
|
||||
-- Function to mark stale tasks (workers that died)
|
||||
CREATE OR REPLACE FUNCTION recover_stale_tasks(
|
||||
stale_threshold_minutes INTEGER DEFAULT 10
|
||||
) RETURNS INTEGER AS $$
|
||||
DECLARE
|
||||
recovered_count INTEGER;
|
||||
BEGIN
|
||||
WITH stale AS (
|
||||
UPDATE worker_tasks
|
||||
SET
|
||||
status = 'pending',
|
||||
worker_id = NULL,
|
||||
claimed_at = NULL,
|
||||
started_at = NULL,
|
||||
retry_count = retry_count + 1,
|
||||
updated_at = NOW()
|
||||
WHERE status IN ('claimed', 'running')
|
||||
AND last_heartbeat_at < NOW() - (stale_threshold_minutes || ' minutes')::INTERVAL
|
||||
AND retry_count < max_retries
|
||||
RETURNING id
|
||||
)
|
||||
SELECT COUNT(*) INTO recovered_count FROM stale;
|
||||
|
||||
-- Mark tasks that exceeded retries as failed
|
||||
UPDATE worker_tasks
|
||||
SET
|
||||
status = 'failed',
|
||||
error_message = 'Exceeded max retries after worker failures',
|
||||
completed_at = NOW(),
|
||||
updated_at = NOW()
|
||||
WHERE status IN ('claimed', 'running')
|
||||
AND last_heartbeat_at < NOW() - (stale_threshold_minutes || ' minutes')::INTERVAL
|
||||
AND retry_count >= max_retries;
|
||||
|
||||
RETURN recovered_count;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
|
||||
-- Function to generate daily resync tasks
CREATE OR REPLACE FUNCTION generate_resync_tasks(
  p_batches_per_day INTEGER DEFAULT 6, -- Every 4 hours
  p_date DATE DEFAULT CURRENT_DATE
) RETURNS INTEGER AS $$
DECLARE
  store_count INTEGER;
  stores_per_batch INTEGER;
  batch_num INTEGER;
  scheduled_time TIMESTAMPTZ;
  batch_count INTEGER;
  created_count INTEGER := 0;
BEGIN
  -- Count active stores that need resync
  SELECT COUNT(*) INTO store_count
  FROM dispensaries
  WHERE crawl_enabled = true
    AND menu_type = 'dutchie'
    AND platform_dispensary_id IS NOT NULL;

  IF store_count = 0 THEN
    RETURN 0;
  END IF;

  stores_per_batch := CEIL(store_count::NUMERIC / p_batches_per_day);

  FOR batch_num IN 0..(p_batches_per_day - 1) LOOP
    scheduled_time := p_date + (batch_num * 4 || ' hours')::INTERVAL;

    INSERT INTO worker_tasks (role, dispensary_id, platform, scheduled_for, priority)
    SELECT
      'product_resync',
      d.id,
      'dutchie',
      scheduled_time,
      0
    FROM (
      SELECT id, ROW_NUMBER() OVER (ORDER BY id) as rn
      FROM dispensaries
      WHERE crawl_enabled = true
        AND menu_type = 'dutchie'
        AND platform_dispensary_id IS NOT NULL
    ) d
    WHERE d.rn > (batch_num * stores_per_batch)
      AND d.rn <= ((batch_num + 1) * stores_per_batch)
    ON CONFLICT DO NOTHING;

    -- GET DIAGNOSTICS only accepts a plain assignment, so read the
    -- row count into a temp variable and accumulate separately
    GET DIAGNOSTICS batch_count = ROW_COUNT;
    created_count := created_count + batch_count;
  END LOOP;

  RETURN created_count;
END;
$$ LANGUAGE plpgsql;

-- Trigger to keep updated_at current
CREATE OR REPLACE FUNCTION update_worker_tasks_timestamp()
RETURNS TRIGGER AS $$
BEGIN
  NEW.updated_at = NOW();
  RETURN NEW;
END;
$$ LANGUAGE plpgsql;

DROP TRIGGER IF EXISTS worker_tasks_updated_at ON worker_tasks;
CREATE TRIGGER worker_tasks_updated_at
  BEFORE UPDATE ON worker_tasks
  FOR EACH ROW
  EXECUTE FUNCTION update_worker_tasks_timestamp();

-- Comments
COMMENT ON TABLE worker_tasks IS 'Central task queue for all worker roles';
COMMENT ON TABLE worker_registry IS 'Registry of active workers and their stats';
COMMENT ON TABLE task_completion_log IS 'Hourly aggregated task completion metrics';
COMMENT ON VIEW v_worker_capacity IS 'Real-time capacity planning metrics per role';
COMMENT ON VIEW v_task_history IS 'Task history with dispensary details for UI';
COMMENT ON FUNCTION claim_task IS 'Atomically claim a task for a worker, respecting per-store locking';
COMMENT ON FUNCTION recover_stale_tasks IS 'Release tasks from dead workers back to pending';
COMMENT ON FUNCTION generate_resync_tasks IS 'Generate daily product resync tasks in batches';
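
Taken together, a worker loop claims with claim_task and a periodic reaper calls recover_stale_tasks. A minimal usage sketch (the role and worker id are illustrative):

SELECT * FROM claim_task('product_resync', 'worker-abc123');
-- Returns one claimed row, or all-NULL columns when nothing is ready.
SELECT recover_stale_tasks(10);
-- Requeues tasks whose worker has missed heartbeats for 10+ minutes.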

backend/migrations/075_consecutive_misses.sql (new file, 13 lines)
@@ -0,0 +1,13 @@
-- Migration 075: Add consecutive_misses column to store_products
-- Used to track how many consecutive crawls a product has been missing from the feed
-- After 3 consecutive misses, product is marked as OOS

ALTER TABLE store_products
  ADD COLUMN IF NOT EXISTS consecutive_misses INTEGER NOT NULL DEFAULT 0;

-- Index for finding products that need OOS check
CREATE INDEX IF NOT EXISTS idx_store_products_consecutive_misses
  ON store_products (dispensary_id, consecutive_misses)
  WHERE consecutive_misses > 0;

COMMENT ON COLUMN store_products.consecutive_misses IS 'Number of consecutive crawls where product was not in feed. Reset to 0 when seen. At 3, mark OOS.';
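
The bookkeeping this column implies might look like the following sketch, where $1 is the store id and $2 the array of product ids seen in the feed (the real update lives in the refresh handler, so treat these statements as illustrative):

-- Products absent from this crawl accumulate a miss:
UPDATE store_products
SET consecutive_misses = consecutive_misses + 1
WHERE dispensary_id = $1 AND NOT (id = ANY($2::int[]));

-- Products seen in the feed reset to zero:
UPDATE store_products
SET consecutive_misses = 0
WHERE dispensary_id = $1 AND id = ANY($2::int[]);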

backend/migrations/076_visitor_analytics.sql (new file, 71 lines)
@@ -0,0 +1,71 @@
-- Visitor location analytics for Findagram
-- Tracks visitor locations to understand popular areas

CREATE TABLE IF NOT EXISTS visitor_locations (
  id SERIAL PRIMARY KEY,

  -- Location data (from IP lookup)
  ip_hash VARCHAR(64),          -- Hashed IP for privacy (SHA256)
  city VARCHAR(100),
  state VARCHAR(100),
  state_code VARCHAR(10),
  country VARCHAR(100),
  country_code VARCHAR(10),
  latitude DECIMAL(10, 7),
  longitude DECIMAL(10, 7),

  -- Visit metadata
  domain VARCHAR(50) NOT NULL,  -- 'findagram.co', 'findadispo.com', etc.
  page_path VARCHAR(255),       -- '/products', '/dispensaries/123', etc.
  referrer VARCHAR(500),
  user_agent VARCHAR(500),

  -- Session tracking
  session_id VARCHAR(64),       -- For grouping page views in a session

  -- Timestamps
  created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Indexes for analytics queries
CREATE INDEX IF NOT EXISTS idx_visitor_locations_domain ON visitor_locations(domain);
CREATE INDEX IF NOT EXISTS idx_visitor_locations_city_state ON visitor_locations(city, state_code);
CREATE INDEX IF NOT EXISTS idx_visitor_locations_created_at ON visitor_locations(created_at);
CREATE INDEX IF NOT EXISTS idx_visitor_locations_session ON visitor_locations(session_id);

-- Aggregated daily stats (materialized for performance)
CREATE TABLE IF NOT EXISTS visitor_location_stats (
  id SERIAL PRIMARY KEY,
  date DATE NOT NULL,
  domain VARCHAR(50) NOT NULL,
  city VARCHAR(100),
  state VARCHAR(100),
  state_code VARCHAR(10),
  country_code VARCHAR(10),

  -- Metrics
  visit_count INTEGER DEFAULT 0,
  unique_sessions INTEGER DEFAULT 0,

  UNIQUE(date, domain, city, state_code, country_code)
);

CREATE INDEX IF NOT EXISTS idx_visitor_stats_date ON visitor_location_stats(date);
CREATE INDEX IF NOT EXISTS idx_visitor_stats_domain ON visitor_location_stats(domain);
CREATE INDEX IF NOT EXISTS idx_visitor_stats_state ON visitor_location_stats(state_code);

-- View for easy querying of top locations
CREATE OR REPLACE VIEW v_top_visitor_locations AS
SELECT
  domain,
  city,
  state,
  state_code,
  country_code,
  COUNT(*) as total_visits,
  COUNT(DISTINCT session_id) as unique_sessions,
  MAX(created_at) as last_visit
FROM visitor_locations
WHERE created_at > NOW() - INTERVAL '30 days'
GROUP BY domain, city, state, state_code, country_code
ORDER BY total_visits DESC;
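
A daily rollup keyed on the stats table's unique constraint would keep it current; a sketch of that job (not part of the migration — and note that rows with a NULL city never conflict, since NULLs compare unequal in a unique constraint):

INSERT INTO visitor_location_stats
  (date, domain, city, state, state_code, country_code, visit_count, unique_sessions)
SELECT created_at::date, domain, city, state, state_code, country_code,
       COUNT(*), COUNT(DISTINCT session_id)
FROM visitor_locations
WHERE created_at >= CURRENT_DATE
GROUP BY 1, 2, 3, 4, 5, 6
ON CONFLICT (date, domain, city, state_code, country_code)
DO UPDATE SET visit_count = EXCLUDED.visit_count,
              unique_sessions = EXCLUDED.unique_sessions;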

backend/migrations/076_worker_registry.sql (new file, 141 lines)
@@ -0,0 +1,141 @@
-- Migration 076: Worker Registry for Dynamic Workers
-- Workers register on startup, receive a friendly name, and report heartbeats

-- Name pool for workers (expandable, no hardcoding)
CREATE TABLE IF NOT EXISTS worker_name_pool (
  id SERIAL PRIMARY KEY,
  name VARCHAR(50) UNIQUE NOT NULL,
  in_use BOOLEAN DEFAULT FALSE,
  assigned_to VARCHAR(100), -- worker_id
  assigned_at TIMESTAMPTZ,
  created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Seed with initial names (can add more via API)
INSERT INTO worker_name_pool (name) VALUES
  ('Alice'), ('Bella'), ('Clara'), ('Diana'), ('Elena'),
  ('Fiona'), ('Grace'), ('Hazel'), ('Iris'), ('Julia'),
  ('Katie'), ('Luna'), ('Mia'), ('Nora'), ('Olive'),
  ('Pearl'), ('Quinn'), ('Rosa'), ('Sara'), ('Tara'),
  ('Uma'), ('Vera'), ('Wendy'), ('Xena'), ('Yuki'), ('Zara'),
  ('Amber'), ('Blake'), ('Coral'), ('Dawn'), ('Echo'),
  ('Fleur'), ('Gem'), ('Haven'), ('Ivy'), ('Jade'),
  ('Kira'), ('Lotus'), ('Maple'), ('Nova'), ('Onyx'),
  ('Pixel'), ('Quest'), ('Raven'), ('Sage'), ('Terra'),
  ('Unity'), ('Violet'), ('Willow'), ('Xylo'), ('Yara'), ('Zen')
ON CONFLICT (name) DO NOTHING;

-- Worker registry - tracks active workers
CREATE TABLE IF NOT EXISTS worker_registry (
  id SERIAL PRIMARY KEY,
  worker_id VARCHAR(100) UNIQUE NOT NULL,  -- e.g., "pod-abc123" or uuid
  friendly_name VARCHAR(50),               -- assigned from pool
  role VARCHAR(50) NOT NULL,               -- task role
  pod_name VARCHAR(100),                   -- k8s pod name
  hostname VARCHAR(100),                   -- machine hostname
  ip_address VARCHAR(50),                  -- worker IP
  status VARCHAR(20) DEFAULT 'starting',   -- starting, active, idle, offline, terminated
  started_at TIMESTAMPTZ DEFAULT NOW(),
  last_heartbeat_at TIMESTAMPTZ DEFAULT NOW(),
  last_task_at TIMESTAMPTZ,
  tasks_completed INTEGER DEFAULT 0,
  tasks_failed INTEGER DEFAULT 0,
  current_task_id INTEGER,
  metadata JSONB DEFAULT '{}',
  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

-- Indexes for worker registry
CREATE INDEX IF NOT EXISTS idx_worker_registry_status ON worker_registry(status);
CREATE INDEX IF NOT EXISTS idx_worker_registry_role ON worker_registry(role);
CREATE INDEX IF NOT EXISTS idx_worker_registry_heartbeat ON worker_registry(last_heartbeat_at);

-- Function to assign a name to a new worker
CREATE OR REPLACE FUNCTION assign_worker_name(p_worker_id VARCHAR(100))
RETURNS VARCHAR(50) AS $$
DECLARE
  v_name VARCHAR(50);
BEGIN
  -- Try to get an unused name
  UPDATE worker_name_pool
  SET in_use = TRUE, assigned_to = p_worker_id, assigned_at = NOW()
  WHERE id = (
    SELECT id FROM worker_name_pool
    WHERE in_use = FALSE
    ORDER BY RANDOM()
    LIMIT 1
    FOR UPDATE SKIP LOCKED
  )
  RETURNING name INTO v_name;

  -- If no names available, generate one
  IF v_name IS NULL THEN
    v_name := 'Worker-' || SUBSTRING(p_worker_id FROM 1 FOR 8);
  END IF;

  RETURN v_name;
END;
$$ LANGUAGE plpgsql;

-- Function to release a worker's name back to the pool
CREATE OR REPLACE FUNCTION release_worker_name(p_worker_id VARCHAR(100))
RETURNS VOID AS $$
BEGIN
  UPDATE worker_name_pool
  SET in_use = FALSE, assigned_to = NULL, assigned_at = NULL
  WHERE assigned_to = p_worker_id;
END;
$$ LANGUAGE plpgsql;

-- Function to mark stale workers as offline
CREATE OR REPLACE FUNCTION mark_stale_workers(stale_threshold_minutes INTEGER DEFAULT 5)
RETURNS INTEGER AS $$
DECLARE
  v_count INTEGER;
BEGIN
  UPDATE worker_registry
  SET status = 'offline', updated_at = NOW()
  WHERE status IN ('active', 'idle', 'starting')
    AND last_heartbeat_at < NOW() - (stale_threshold_minutes || ' minutes')::INTERVAL;
  -- UPDATE ... RETURNING cannot aggregate, so read the row count from diagnostics
  GET DIAGNOSTICS v_count = ROW_COUNT;

  -- Release names from offline workers
  PERFORM release_worker_name(worker_id)
  FROM worker_registry
  WHERE status = 'offline'
    AND last_heartbeat_at < NOW() - INTERVAL '30 minutes';

  RETURN COALESCE(v_count, 0);
END;
$$ LANGUAGE plpgsql;

-- View for dashboard
CREATE OR REPLACE VIEW v_active_workers AS
SELECT
  wr.id,
  wr.worker_id,
  wr.friendly_name,
  wr.role,
  wr.status,
  wr.pod_name,
  wr.hostname,
  wr.started_at,
  wr.last_heartbeat_at,
  wr.last_task_at,
  wr.tasks_completed,
  wr.tasks_failed,
  wr.current_task_id,
  EXTRACT(EPOCH FROM (NOW() - wr.last_heartbeat_at)) as seconds_since_heartbeat,
  CASE
    WHEN wr.status = 'offline' THEN 'offline'
    WHEN wr.last_heartbeat_at < NOW() - INTERVAL '2 minutes' THEN 'stale'
    WHEN wr.current_task_id IS NOT NULL THEN 'busy'
    ELSE 'ready'
  END as health_status
FROM worker_registry wr
WHERE wr.status != 'terminated'
ORDER BY wr.status = 'active' DESC, wr.last_heartbeat_at DESC;

COMMENT ON TABLE worker_registry IS 'Tracks all workers that have registered with the system';
COMMENT ON TABLE worker_name_pool IS 'Pool of friendly names for workers - expandable via API';
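
On startup a worker registers itself and draws a name from the pool; a minimal sketch of that flow (the worker id and role are illustrative):

INSERT INTO worker_registry (worker_id, friendly_name, role, status)
VALUES ('pod-abc123', assign_worker_name('pod-abc123'), 'product_refresh', 'active');

-- Heartbeat on an interval:
UPDATE worker_registry SET last_heartbeat_at = NOW() WHERE worker_id = 'pod-abc123';

-- Periodic reaper, matching the 5-minute default:
SELECT mark_stale_workers(5);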

backend/migrations/077_click_events_location.sql (new file, 35 lines)
@@ -0,0 +1,35 @@
-- Migration: Add visitor location and dispensary name to click events
-- Captures where visitors are clicking from and which dispensary

-- Add visitor location columns
ALTER TABLE product_click_events
  ADD COLUMN IF NOT EXISTS visitor_city VARCHAR(100);

ALTER TABLE product_click_events
  ADD COLUMN IF NOT EXISTS visitor_state VARCHAR(10);

ALTER TABLE product_click_events
  ADD COLUMN IF NOT EXISTS visitor_lat DECIMAL(10, 7);

ALTER TABLE product_click_events
  ADD COLUMN IF NOT EXISTS visitor_lng DECIMAL(10, 7);

-- Add dispensary name for easier reporting
ALTER TABLE product_click_events
  ADD COLUMN IF NOT EXISTS dispensary_name VARCHAR(255);

-- Create index for location-based analytics
CREATE INDEX IF NOT EXISTS idx_product_click_events_visitor_state
  ON product_click_events(visitor_state)
  WHERE visitor_state IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_product_click_events_visitor_city
  ON product_click_events(visitor_city)
  WHERE visitor_city IS NOT NULL;

-- Add comments
COMMENT ON COLUMN product_click_events.visitor_city IS 'City where the visitor is located (from IP geolocation)';
COMMENT ON COLUMN product_click_events.visitor_state IS 'State where the visitor is located (from IP geolocation)';
COMMENT ON COLUMN product_click_events.visitor_lat IS 'Visitor latitude (from IP geolocation)';
COMMENT ON COLUMN product_click_events.visitor_lng IS 'Visitor longitude (from IP geolocation)';
COMMENT ON COLUMN product_click_events.dispensary_name IS 'Name of the dispensary (denormalized for easier reporting)';

backend/migrations/078_proxy_consecutive_403.sql (new file, 8 lines)
@@ -0,0 +1,8 @@
-- Migration 078: Add consecutive_403_count to proxies table
-- Per workflow-12102025.md: Track consecutive 403s per proxy
-- After 3 consecutive 403s with different fingerprints → disable proxy

ALTER TABLE proxies ADD COLUMN IF NOT EXISTS consecutive_403_count INTEGER DEFAULT 0;

-- Add comment explaining the column
COMMENT ON COLUMN proxies.consecutive_403_count IS 'Tracks consecutive 403 blocks. Reset to 0 on success. Proxy disabled at 3.';
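
The crawler-side bookkeeping might look like this sketch (the is_active column name is an assumption about the proxies table; only consecutive_403_count comes from this migration):

-- On a 403 response:
UPDATE proxies
SET consecutive_403_count = consecutive_403_count + 1,
    is_active = (consecutive_403_count + 1 < 3)  -- assumed column; disable at 3
WHERE id = $1;

-- On a successful request:
UPDATE proxies SET consecutive_403_count = 0 WHERE id = $1;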

backend/migrations/079_task_schedules.sql (new file, 49 lines)
@@ -0,0 +1,49 @@
-- Migration 079: Task Schedules for Database-Driven Scheduler
-- Per TASK_WORKFLOW_2024-12-10.md: Replaces node-cron with DB-driven scheduling
--
-- 2024-12-10: Created for reliable, multi-replica-safe task scheduling

-- task_schedules: Stores schedule definitions and state
CREATE TABLE IF NOT EXISTS task_schedules (
  id SERIAL PRIMARY KEY,
  name VARCHAR(100) NOT NULL UNIQUE,
  role VARCHAR(50) NOT NULL, -- TaskRole: product_refresh, store_discovery, etc.
  description TEXT,

  -- Schedule configuration
  enabled BOOLEAN DEFAULT TRUE,
  interval_hours INTEGER NOT NULL DEFAULT 4,
  priority INTEGER DEFAULT 0,

  -- Optional scope filters
  state_code VARCHAR(2), -- NULL = all states
  platform VARCHAR(50),  -- NULL = all platforms

  -- Execution state (updated by scheduler)
  last_run_at TIMESTAMPTZ,
  next_run_at TIMESTAMPTZ,
  last_task_count INTEGER DEFAULT 0,
  last_error TEXT,

  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

-- Indexes for scheduler queries
CREATE INDEX IF NOT EXISTS idx_task_schedules_enabled ON task_schedules(enabled) WHERE enabled = TRUE;
CREATE INDEX IF NOT EXISTS idx_task_schedules_next_run ON task_schedules(next_run_at) WHERE enabled = TRUE;

-- Insert default schedules
INSERT INTO task_schedules (name, role, interval_hours, priority, description, next_run_at)
VALUES
  ('product_refresh_all', 'product_refresh', 4, 0, 'Generate product refresh tasks for all crawl-enabled stores every 4 hours', NOW()),
  ('store_discovery_dutchie', 'store_discovery', 24, 5, 'Discover new Dutchie stores daily', NOW()),
  ('analytics_refresh', 'analytics_refresh', 6, 0, 'Refresh analytics materialized views every 6 hours', NOW())
ON CONFLICT (name) DO NOTHING;

-- Comment for documentation
COMMENT ON TABLE task_schedules IS 'Database-driven task scheduler configuration. Per TASK_WORKFLOW_2024-12-10.md:
- Schedules persist in DB (survive restarts)
- Uses SELECT FOR UPDATE SKIP LOCKED for multi-replica safety
- Scheduler polls every 60s and executes due schedules
- Creates tasks in worker_tasks for task-worker.ts to process';
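
The table comment implies a claim-then-reschedule loop; a sketch of what each 60-second tick might run, assuming a surrounding transaction (the exact scheduler code lives in the backend, not in this migration):

BEGIN;
SELECT id, role, interval_hours
FROM task_schedules
WHERE enabled = TRUE AND next_run_at <= NOW()
FOR UPDATE SKIP LOCKED;
-- ...for each claimed row: insert the due tasks into worker_tasks, then:
UPDATE task_schedules
SET last_run_at = NOW(),
    next_run_at = NOW() + (interval_hours || ' hours')::INTERVAL,
    updated_at = NOW()
WHERE id = $1;
COMMIT;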

backend/migrations/080_raw_crawl_payloads.sql (new file, 58 lines)
@@ -0,0 +1,58 @@
-- Migration 080: Raw Crawl Payloads Metadata Table
-- Per TASK_WORKFLOW_2024-12-10.md: Store full GraphQL payloads for historical analysis
--
-- Design Pattern: Metadata/Payload Separation
-- - Metadata (this table): Small, indexed, queryable
-- - Payload (filesystem): Gzipped JSON at storage_path
--
-- Benefits:
-- - Compare any two crawls to see what changed
-- - Replay/re-normalize historical data if logic changes
-- - Debug issues by seeing exactly what the API returned
-- - DB stays small, backups stay fast
--
-- Storage location: /storage/payloads/{year}/{month}/{day}/store_{id}_{timestamp}.json.gz
-- Compression: ~90% reduction (1.5MB -> 150KB per crawl)

CREATE TABLE IF NOT EXISTS raw_crawl_payloads (
  id SERIAL PRIMARY KEY,

  -- Links to crawl tracking
  crawl_run_id INTEGER REFERENCES crawl_runs(id) ON DELETE SET NULL,
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,

  -- File location (gzipped JSON)
  storage_path TEXT NOT NULL,

  -- Metadata for quick queries without loading file
  product_count INTEGER NOT NULL DEFAULT 0,
  size_bytes INTEGER,     -- Compressed size
  size_bytes_raw INTEGER, -- Uncompressed size

  -- Timestamps
  fetched_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

  -- Optional: checksum for integrity verification
  checksum_sha256 VARCHAR(64)
);

-- Indexes for common queries
CREATE INDEX IF NOT EXISTS idx_raw_crawl_payloads_dispensary
  ON raw_crawl_payloads(dispensary_id);

CREATE INDEX IF NOT EXISTS idx_raw_crawl_payloads_dispensary_fetched
  ON raw_crawl_payloads(dispensary_id, fetched_at DESC);

CREATE INDEX IF NOT EXISTS idx_raw_crawl_payloads_fetched
  ON raw_crawl_payloads(fetched_at DESC);

CREATE INDEX IF NOT EXISTS idx_raw_crawl_payloads_crawl_run
  ON raw_crawl_payloads(crawl_run_id)
  WHERE crawl_run_id IS NOT NULL;

-- Comments
COMMENT ON TABLE raw_crawl_payloads IS 'Metadata for raw GraphQL payloads stored on filesystem. Per TASK_WORKFLOW_2024-12-10.md: Full payloads enable historical diffs and replay.';
COMMENT ON COLUMN raw_crawl_payloads.storage_path IS 'Path to gzipped JSON file, e.g. /storage/payloads/2024/12/10/store_123_1702234567.json.gz';
COMMENT ON COLUMN raw_crawl_payloads.size_bytes IS 'Compressed file size in bytes';
COMMENT ON COLUMN raw_crawl_payloads.size_bytes_raw IS 'Uncompressed payload size in bytes';
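
For the "compare any two crawls" use case, the metadata alone is enough to locate the payload pair to diff offline; a sketch (the store id is illustrative):

SELECT id, storage_path, product_count, fetched_at
FROM raw_crawl_payloads
WHERE dispensary_id = 123
ORDER BY fetched_at DESC
LIMIT 2;
-- The two storage_path values feed whatever tool unzips and diffs the JSON.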

backend/migrations/081_payload_fetch_columns.sql (new file, 37 lines)
@@ -0,0 +1,37 @@
-- Migration 081: Payload Fetch Columns
-- Per TASK_WORKFLOW_2024-12-10.md: Separates API fetch from data processing
--
-- New architecture:
-- - payload_fetch: Hits Dutchie API, saves raw payload to disk
-- - product_refresh: Reads local payload, normalizes, upserts to DB
--
-- This migration adds:
-- 1. payload column to worker_tasks (for task chaining data)
-- 2. processed_at column to raw_crawl_payloads (track when payload was processed)
-- 3. last_fetch_at column to dispensaries (track when last payload was fetched)

-- Add payload column to worker_tasks for task chaining
-- Used by payload_fetch to pass payload_id to product_refresh
ALTER TABLE worker_tasks
  ADD COLUMN IF NOT EXISTS payload JSONB DEFAULT NULL;

COMMENT ON COLUMN worker_tasks.payload IS 'Per TASK_WORKFLOW_2024-12-10.md: Task chaining data (e.g., payload_id from payload_fetch to product_refresh)';

-- Add processed_at to raw_crawl_payloads
-- Tracks when the payload was processed by product_refresh
ALTER TABLE raw_crawl_payloads
  ADD COLUMN IF NOT EXISTS processed_at TIMESTAMPTZ DEFAULT NULL;

COMMENT ON COLUMN raw_crawl_payloads.processed_at IS 'When this payload was processed by product_refresh handler';

-- Index for finding unprocessed payloads
CREATE INDEX IF NOT EXISTS idx_raw_crawl_payloads_unprocessed
  ON raw_crawl_payloads(dispensary_id, fetched_at DESC)
  WHERE processed_at IS NULL;

-- Add last_fetch_at to dispensaries
-- Tracks when the last payload was fetched (separate from last_crawl_at, which is when processing completed)
ALTER TABLE dispensaries
  ADD COLUMN IF NOT EXISTS last_fetch_at TIMESTAMPTZ DEFAULT NULL;

COMMENT ON COLUMN dispensaries.last_fetch_at IS 'Per TASK_WORKFLOW_2024-12-10.md: When last payload was fetched from API (separate from last_crawl_at which is when processing completed)';
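
The partial index supports the product_refresh half of the chain; a sketch of the lookup and the completion mark (store id illustrative):

SELECT id, storage_path
FROM raw_crawl_payloads
WHERE dispensary_id = 123 AND processed_at IS NULL
ORDER BY fetched_at DESC
LIMIT 1;

-- After a successful normalize-and-upsert:
UPDATE raw_crawl_payloads SET processed_at = NOW() WHERE id = $1;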

backend/node_modules/.package-lock.json (generated, vendored; 19 lines changed)
@@ -1026,6 +1026,17 @@
        "url": "https://github.com/sponsors/fb55"
      }
    },
    "node_modules/csv-parser": {
      "version": "3.2.0",
      "resolved": "https://registry.npmjs.org/csv-parser/-/csv-parser-3.2.0.tgz",
      "integrity": "sha512-fgKbp+AJbn1h2dcAHKIdKNSSjfp43BZZykXsCjzALjKy80VXQNHPFJ6T9Afwdzoj24aMkq8GwDS7KGcDPpejrA==",
      "bin": {
        "csv-parser": "bin/csv-parser"
      },
      "engines": {
        "node": ">= 10"
      }
    },
    "node_modules/data-uri-to-buffer": {
      "version": "6.0.2",
      "resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz",
@@ -2235,6 +2246,14 @@
        "node": ">= 12"
      }
    },
    "node_modules/ip2location-nodejs": {
      "version": "9.7.0",
      "resolved": "https://registry.npmjs.org/ip2location-nodejs/-/ip2location-nodejs-9.7.0.tgz",
      "integrity": "sha512-eQ4T5TXm1cx0+pQcRycPiuaiRuoDEMd9O89Be7Ugk555qi9UY9enXSznkkqr3kQRyUaXx7zj5dORC5LGTPOttA==",
      "dependencies": {
        "csv-parser": "^3.0.0"
      }
    },
    "node_modules/ipaddr.js": {
      "version": "2.2.0",
      "resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-2.2.0.tgz",

backend/package-lock.json (generated; 310 lines changed)
@@ -1,13 +1,14 @@
{
  "name": "dutchie-menus-backend",
  "version": "1.5.1",
  "version": "1.6.0",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {
    "": {
      "name": "dutchie-menus-backend",
      "version": "1.5.1",
      "version": "1.6.0",
      "dependencies": {
        "@kubernetes/client-node": "^1.4.0",
        "@types/bcryptjs": "^3.0.0",
        "axios": "^1.6.2",
        "bcrypt": "^5.1.1",
@@ -21,6 +22,7 @@
        "helmet": "^7.1.0",
        "https-proxy-agent": "^7.0.2",
        "ioredis": "^5.8.2",
        "ip2location-nodejs": "^9.7.0",
        "ipaddr.js": "^2.2.0",
        "jsonwebtoken": "^9.0.2",
        "minio": "^7.1.3",
@@ -33,6 +35,7 @@
        "puppeteer-extra-plugin-stealth": "^2.11.2",
        "sharp": "^0.32.0",
        "socks-proxy-agent": "^8.0.2",
        "user-agents": "^1.1.669",
        "uuid": "^9.0.1",
        "zod": "^3.22.4"
      },
@@ -491,6 +494,97 @@
      "resolved": "https://registry.npmjs.org/@ioredis/commands/-/commands-1.4.0.tgz",
      "integrity": "sha512-aFT2yemJJo+TZCmieA7qnYGQooOS7QfNmYrzGtsYd3g9j5iDP8AimYYAesf79ohjbLG12XxC4nG5DyEnC88AsQ=="
    },
    "node_modules/@jsep-plugin/assignment": {
      "version": "1.3.0",
      "resolved": "https://registry.npmjs.org/@jsep-plugin/assignment/-/assignment-1.3.0.tgz",
      "integrity": "sha512-VVgV+CXrhbMI3aSusQyclHkenWSAm95WaiKrMxRFam3JSUiIaQjoMIw2sEs/OX4XifnqeQUN4DYbJjlA8EfktQ==",
      "engines": {
        "node": ">= 10.16.0"
      },
      "peerDependencies": {
        "jsep": "^0.4.0||^1.0.0"
      }
    },
    "node_modules/@jsep-plugin/regex": {
      "version": "1.0.4",
      "resolved": "https://registry.npmjs.org/@jsep-plugin/regex/-/regex-1.0.4.tgz",
      "integrity": "sha512-q7qL4Mgjs1vByCaTnDFcBnV9HS7GVPJX5vyVoCgZHNSC9rjwIlmbXG5sUuorR5ndfHAIlJ8pVStxvjXHbNvtUg==",
      "engines": {
        "node": ">= 10.16.0"
      },
      "peerDependencies": {
        "jsep": "^0.4.0||^1.0.0"
      }
    },
    "node_modules/@kubernetes/client-node": {
      "version": "1.4.0",
      "resolved": "https://registry.npmjs.org/@kubernetes/client-node/-/client-node-1.4.0.tgz",
      "integrity": "sha512-Zge3YvF7DJi264dU1b3wb/GmzR99JhUpqTvp+VGHfwZT+g7EOOYNScDJNZwXy9cszyIGPIs0VHr+kk8e95qqrA==",
      "dependencies": {
        "@types/js-yaml": "^4.0.1",
        "@types/node": "^24.0.0",
        "@types/node-fetch": "^2.6.13",
        "@types/stream-buffers": "^3.0.3",
        "form-data": "^4.0.0",
        "hpagent": "^1.2.0",
        "isomorphic-ws": "^5.0.0",
        "js-yaml": "^4.1.0",
        "jsonpath-plus": "^10.3.0",
        "node-fetch": "^2.7.0",
        "openid-client": "^6.1.3",
        "rfc4648": "^1.3.0",
        "socks-proxy-agent": "^8.0.4",
        "stream-buffers": "^3.0.2",
        "tar-fs": "^3.0.9",
        "ws": "^8.18.2"
      }
    },
    "node_modules/@kubernetes/client-node/node_modules/@types/node": {
      "version": "24.10.3",
      "resolved": "https://registry.npmjs.org/@types/node/-/node-24.10.3.tgz",
      "integrity": "sha512-gqkrWUsS8hcm0r44yn7/xZeV1ERva/nLgrLxFRUGb7aoNMIJfZJ3AC261zDQuOAKC7MiXai1WCpYc48jAHoShQ==",
      "dependencies": {
        "undici-types": "~7.16.0"
      }
    },
    "node_modules/@kubernetes/client-node/node_modules/tar-fs": {
      "version": "3.1.1",
      "resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-3.1.1.tgz",
      "integrity": "sha512-LZA0oaPOc2fVo82Txf3gw+AkEd38szODlptMYejQUhndHMLQ9M059uXR+AfS7DNo0NpINvSqDsvyaCrBVkptWg==",
      "dependencies": {
        "pump": "^3.0.0",
        "tar-stream": "^3.1.5"
      },
      "optionalDependencies": {
        "bare-fs": "^4.0.1",
        "bare-path": "^3.0.0"
      }
    },
    "node_modules/@kubernetes/client-node/node_modules/undici-types": {
      "version": "7.16.0",
      "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz",
      "integrity": "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw=="
    },
    "node_modules/@kubernetes/client-node/node_modules/ws": {
      "version": "8.18.3",
      "resolved": "https://registry.npmjs.org/ws/-/ws-8.18.3.tgz",
      "integrity": "sha512-PEIGCY5tSlUt50cqyMXfCzX+oOPqN0vuGqWzbcJ2xvnkzkq46oOpz7dQaTDBdfICb4N14+GARUDw2XV2N4tvzg==",
      "engines": {
        "node": ">=10.0.0"
      },
      "peerDependencies": {
        "bufferutil": "^4.0.1",
        "utf-8-validate": ">=5.0.2"
      },
      "peerDependenciesMeta": {
        "bufferutil": {
          "optional": true
        },
        "utf-8-validate": {
          "optional": true
        }
      }
    },
    "node_modules/@mapbox/node-pre-gyp": {
      "version": "1.0.11",
      "resolved": "https://registry.npmjs.org/@mapbox/node-pre-gyp/-/node-pre-gyp-1.0.11.tgz",
@@ -756,6 +850,11 @@
      "integrity": "sha512-r8Tayk8HJnX0FztbZN7oVqGccWgw98T/0neJphO91KkmOzug1KkofZURD4UaD5uH8AqcFLfdPErnBod0u71/qg==",
      "dev": true
    },
    "node_modules/@types/js-yaml": {
      "version": "4.0.9",
      "resolved": "https://registry.npmjs.org/@types/js-yaml/-/js-yaml-4.0.9.tgz",
      "integrity": "sha512-k4MGaQl5TGo/iipqb2UDG2UwjXziSWkh0uysQelTlJpX1qGlpUZYm8PnO4DxG1qBomtJUdYJ6qR6xdIah10JLg=="
    },
    "node_modules/@types/jsonwebtoken": {
      "version": "9.0.10",
      "resolved": "https://registry.npmjs.org/@types/jsonwebtoken/-/jsonwebtoken-9.0.10.tgz",
@@ -781,7 +880,6 @@
      "version": "20.19.25",
      "resolved": "https://registry.npmjs.org/@types/node/-/node-20.19.25.tgz",
      "integrity": "sha512-ZsJzA5thDQMSQO788d7IocwwQbI8B5OPzmqNvpf3NY/+MHDAS759Wo0gd2WQeXYt5AAAQjzcrTVC6SKCuYgoCQ==",
      "devOptional": true,
      "dependencies": {
        "undici-types": "~6.21.0"
      }
@@ -792,6 +890,15 @@
      "integrity": "sha512-0ikrnug3/IyneSHqCBeslAhlK2aBfYek1fGo4bP4QnZPmiqSGRK+Oy7ZMisLWkesffJvQ1cqAcBnJC+8+nxIAg==",
      "dev": true
    },
    "node_modules/@types/node-fetch": {
      "version": "2.6.13",
      "resolved": "https://registry.npmjs.org/@types/node-fetch/-/node-fetch-2.6.13.tgz",
      "integrity": "sha512-QGpRVpzSaUs30JBSGPjOg4Uveu384erbHBoT1zeONvyCfwQxIkUshLAOqN/k9EjGviPRmWTTe6aH2qySWKTVSw==",
      "dependencies": {
        "@types/node": "*",
        "form-data": "^4.0.4"
      }
    },
    "node_modules/@types/pg": {
      "version": "8.15.6",
      "resolved": "https://registry.npmjs.org/@types/pg/-/pg-8.15.6.tgz",
@@ -845,6 +952,14 @@
        "@types/node": "*"
      }
    },
    "node_modules/@types/stream-buffers": {
      "version": "3.0.8",
      "resolved": "https://registry.npmjs.org/@types/stream-buffers/-/stream-buffers-3.0.8.tgz",
      "integrity": "sha512-J+7VaHKNvlNPJPEJXX/fKa9DZtR/xPMwuIbe+yNOwp1YB+ApUOBv2aUpEoBJEi8nJgbgs1x8e73ttg0r1rSUdw==",
      "dependencies": {
        "@types/node": "*"
      }
    },
    "node_modules/@types/uuid": {
      "version": "9.0.8",
      "resolved": "https://registry.npmjs.org/@types/uuid/-/uuid-9.0.8.tgz",
@@ -1025,6 +1140,78 @@
        }
      }
    },
    "node_modules/bare-fs": {
      "version": "4.5.2",
      "resolved": "https://registry.npmjs.org/bare-fs/-/bare-fs-4.5.2.tgz",
      "integrity": "sha512-veTnRzkb6aPHOvSKIOy60KzURfBdUflr5VReI+NSaPL6xf+XLdONQgZgpYvUuZLVQ8dCqxpBAudaOM1+KpAUxw==",
      "optional": true,
      "dependencies": {
        "bare-events": "^2.5.4",
        "bare-path": "^3.0.0",
        "bare-stream": "^2.6.4",
        "bare-url": "^2.2.2",
        "fast-fifo": "^1.3.2"
      },
      "engines": {
        "bare": ">=1.16.0"
      },
      "peerDependencies": {
        "bare-buffer": "*"
      },
      "peerDependenciesMeta": {
        "bare-buffer": {
          "optional": true
        }
      }
    },
    "node_modules/bare-os": {
      "version": "3.6.2",
      "resolved": "https://registry.npmjs.org/bare-os/-/bare-os-3.6.2.tgz",
      "integrity": "sha512-T+V1+1srU2qYNBmJCXZkUY5vQ0B4FSlL3QDROnKQYOqeiQR8UbjNHlPa+TIbM4cuidiN9GaTaOZgSEgsvPbh5A==",
      "optional": true,
      "engines": {
        "bare": ">=1.14.0"
      }
    },
    "node_modules/bare-path": {
      "version": "3.0.0",
      "resolved": "https://registry.npmjs.org/bare-path/-/bare-path-3.0.0.tgz",
      "integrity": "sha512-tyfW2cQcB5NN8Saijrhqn0Zh7AnFNsnczRcuWODH0eYAXBsJ5gVxAUuNr7tsHSC6IZ77cA0SitzT+s47kot8Mw==",
      "optional": true,
      "dependencies": {
        "bare-os": "^3.0.1"
      }
    },
    "node_modules/bare-stream": {
      "version": "2.7.0",
      "resolved": "https://registry.npmjs.org/bare-stream/-/bare-stream-2.7.0.tgz",
      "integrity": "sha512-oyXQNicV1y8nc2aKffH+BUHFRXmx6VrPzlnaEvMhram0nPBrKcEdcyBg5r08D0i8VxngHFAiVyn1QKXpSG0B8A==",
      "optional": true,
      "dependencies": {
        "streamx": "^2.21.0"
      },
      "peerDependencies": {
        "bare-buffer": "*",
        "bare-events": "*"
      },
      "peerDependenciesMeta": {
        "bare-buffer": {
          "optional": true
        },
        "bare-events": {
          "optional": true
        }
      }
    },
    "node_modules/bare-url": {
      "version": "2.3.2",
      "resolved": "https://registry.npmjs.org/bare-url/-/bare-url-2.3.2.tgz",
      "integrity": "sha512-ZMq4gd9ngV5aTMa5p9+UfY0b3skwhHELaDkhEHetMdX0LRkW9kzaym4oo/Eh+Ghm0CCDuMTsRIGM/ytUc1ZYmw==",
      "optional": true,
      "dependencies": {
        "bare-path": "^3.0.0"
      }
    },
    "node_modules/base64-js": {
      "version": "1.5.1",
      "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz",
@@ -1531,6 +1718,17 @@
        "url": "https://github.com/sponsors/fb55"
      }
    },
    "node_modules/csv-parser": {
      "version": "3.2.0",
      "resolved": "https://registry.npmjs.org/csv-parser/-/csv-parser-3.2.0.tgz",
      "integrity": "sha512-fgKbp+AJbn1h2dcAHKIdKNSSjfp43BZZykXsCjzALjKy80VXQNHPFJ6T9Afwdzoj24aMkq8GwDS7KGcDPpejrA==",
      "bin": {
        "csv-parser": "bin/csv-parser"
      },
      "engines": {
        "node": ">= 10"
      }
    },
    "node_modules/data-uri-to-buffer": {
      "version": "6.0.2",
      "resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz",
@@ -2527,6 +2725,14 @@
        "node": ">=16.0.0"
      }
    },
    "node_modules/hpagent": {
      "version": "1.2.0",
      "resolved": "https://registry.npmjs.org/hpagent/-/hpagent-1.2.0.tgz",
      "integrity": "sha512-A91dYTeIB6NoXG+PxTQpCCDDnfHsW9kc06Lvpu1TEe9gnd6ZFeiBoRO9JvzEv6xK7EX97/dUE8g/vBMTqTS3CA==",
      "engines": {
        "node": ">=14"
      }
    },
    "node_modules/htmlparser2": {
      "version": "10.0.0",
      "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-10.0.0.tgz",
@@ -2754,6 +2960,14 @@
        "node": ">= 12"
      }
    },
    "node_modules/ip2location-nodejs": {
      "version": "9.7.0",
      "resolved": "https://registry.npmjs.org/ip2location-nodejs/-/ip2location-nodejs-9.7.0.tgz",
      "integrity": "sha512-eQ4T5TXm1cx0+pQcRycPiuaiRuoDEMd9O89Be7Ugk555qi9UY9enXSznkkqr3kQRyUaXx7zj5dORC5LGTPOttA==",
      "dependencies": {
        "csv-parser": "^3.0.0"
      }
    },
    "node_modules/ipaddr.js": {
      "version": "2.2.0",
      "resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-2.2.0.tgz",
@@ -2882,6 +3096,22 @@
        "node": ">=0.10.0"
      }
    },
    "node_modules/isomorphic-ws": {
      "version": "5.0.0",
      "resolved": "https://registry.npmjs.org/isomorphic-ws/-/isomorphic-ws-5.0.0.tgz",
      "integrity": "sha512-muId7Zzn9ywDsyXgTIafTry2sV3nySZeUDe6YedVd1Hvuuep5AsIlqK+XefWpYTyJG5e503F2xIuT2lcU6rCSw==",
      "peerDependencies": {
        "ws": "*"
      }
    },
    "node_modules/jose": {
      "version": "6.1.3",
      "resolved": "https://registry.npmjs.org/jose/-/jose-6.1.3.tgz",
      "integrity": "sha512-0TpaTfihd4QMNwrz/ob2Bp7X04yuxJkjRGi4aKmOqwhov54i6u79oCv7T+C7lo70MKH6BesI3vscD1yb/yzKXQ==",
      "funding": {
        "url": "https://github.com/sponsors/panva"
      }
    },
    "node_modules/js-tokens": {
      "version": "4.0.0",
      "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz",
@@ -2898,6 +3128,14 @@
        "js-yaml": "bin/js-yaml.js"
      }
    },
    "node_modules/jsep": {
      "version": "1.4.0",
      "resolved": "https://registry.npmjs.org/jsep/-/jsep-1.4.0.tgz",
      "integrity": "sha512-B7qPcEVE3NVkmSJbaYxvv4cHkVW7DQsZz13pUMrfS8z8Q/BuShN+gcTXrUlPiGqM2/t/EEaI030bpxMqY8gMlw==",
      "engines": {
        "node": ">= 10.16.0"
      }
    },
    "node_modules/json-parse-even-better-errors": {
      "version": "2.3.1",
      "resolved": "https://registry.npmjs.org/json-parse-even-better-errors/-/json-parse-even-better-errors-2.3.1.tgz",
@@ -2919,6 +3157,23 @@
        "graceful-fs": "^4.1.6"
      }
    },
    "node_modules/jsonpath-plus": {
      "version": "10.3.0",
      "resolved": "https://registry.npmjs.org/jsonpath-plus/-/jsonpath-plus-10.3.0.tgz",
      "integrity": "sha512-8TNmfeTCk2Le33A3vRRwtuworG/L5RrgMvdjhKZxvyShO+mBu2fP50OWUjRLNtvw344DdDarFh9buFAZs5ujeA==",
      "dependencies": {
        "@jsep-plugin/assignment": "^1.3.0",
        "@jsep-plugin/regex": "^1.0.4",
        "jsep": "^1.4.0"
      },
      "bin": {
        "jsonpath": "bin/jsonpath-cli.js",
        "jsonpath-plus": "bin/jsonpath-cli.js"
      },
      "engines": {
        "node": ">=18.0.0"
      }
    },
    "node_modules/jsonwebtoken": {
      "version": "9.0.2",
      "resolved": "https://registry.npmjs.org/jsonwebtoken/-/jsonwebtoken-9.0.2.tgz",
@@ -2993,6 +3248,11 @@
      "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz",
      "integrity": "sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg=="
    },
    "node_modules/lodash.clonedeep": {
      "version": "4.5.0",
      "resolved": "https://registry.npmjs.org/lodash.clonedeep/-/lodash.clonedeep-4.5.0.tgz",
      "integrity": "sha512-H5ZhCF25riFd9uB5UCkVKo61m3S/xZk1x4wA6yp/L3RFP6Z/eHH1ymQcGLo7J3GMPfm0V/7m1tryHuGVxpqEBQ=="
    },
    "node_modules/lodash.defaults": {
      "version": "4.2.0",
      "resolved": "https://registry.npmjs.org/lodash.defaults/-/lodash.defaults-4.2.0.tgz",
@@ -3442,6 +3702,14 @@
        "url": "https://github.com/fb55/nth-check?sponsor=1"
      }
    },
    "node_modules/oauth4webapi": {
      "version": "3.8.3",
      "resolved": "https://registry.npmjs.org/oauth4webapi/-/oauth4webapi-3.8.3.tgz",
      "integrity": "sha512-pQ5BsX3QRTgnt5HxgHwgunIRaDXBdkT23tf8dfzmtTIL2LTpdmxgbpbBm0VgFWAIDlezQvQCTgnVIUmHupXHxw==",
      "funding": {
        "url": "https://github.com/sponsors/panva"
      }
    },
    "node_modules/object-assign": {
      "version": "4.1.1",
      "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz",
@@ -3480,6 +3748,18 @@
        "wrappy": "1"
      }
    },
    "node_modules/openid-client": {
      "version": "6.8.1",
      "resolved": "https://registry.npmjs.org/openid-client/-/openid-client-6.8.1.tgz",
      "integrity": "sha512-VoYT6enBo6Vj2j3Q5Ec0AezS+9YGzQo1f5Xc42lreMGlfP4ljiXPKVDvCADh+XHCV/bqPu/wWSiCVXbJKvrODw==",
      "dependencies": {
        "jose": "^6.1.0",
        "oauth4webapi": "^3.8.2"
      },
      "funding": {
        "url": "https://github.com/sponsors/panva"
      }
    },
    "node_modules/pac-proxy-agent": {
      "version": "7.2.0",
      "resolved": "https://registry.npmjs.org/pac-proxy-agent/-/pac-proxy-agent-7.2.0.tgz",
@@ -4396,6 +4676,11 @@
        "url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1"
      }
    },
    "node_modules/rfc4648": {
      "version": "1.5.4",
      "resolved": "https://registry.npmjs.org/rfc4648/-/rfc4648-1.5.4.tgz",
      "integrity": "sha512-rRg/6Lb+IGfJqO05HZkN50UtY7K/JhxJag1kP23+zyMfrvoB0B7RWv06MbOzoc79RgCdNTiUaNsTT1AJZ7Z+cg=="
    },
    "node_modules/rimraf": {
      "version": "3.0.2",
      "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-3.0.2.tgz",
@@ -4826,6 +5111,14 @@
        "node": ">= 0.8"
      }
    },
    "node_modules/stream-buffers": {
      "version": "3.0.3",
      "resolved": "https://registry.npmjs.org/stream-buffers/-/stream-buffers-3.0.3.tgz",
      "integrity": "sha512-pqMqwQCso0PBJt2PQmDO0cFj0lyqmiwOMiMSkVtRokl7e+ZTRYgDHKnuZNbqjiJXgsg4nuqtD/zxuo9KqTp0Yw==",
      "engines": {
        "node": ">= 0.10.0"
      }
    },
    "node_modules/streamx": {
      "version": "2.23.0",
      "resolved": "https://registry.npmjs.org/streamx/-/streamx-2.23.0.tgz",
@@ -5045,8 +5338,7 @@
    "node_modules/undici-types": {
      "version": "6.21.0",
      "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz",
      "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==",
      "devOptional": true
      "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ=="
    },
    "node_modules/universalify": {
      "version": "2.0.1",
@@ -5069,6 +5361,14 @@
      "resolved": "https://registry.npmjs.org/urlpattern-polyfill/-/urlpattern-polyfill-10.0.0.tgz",
      "integrity": "sha512-H/A06tKD7sS1O1X2SshBVeA5FLycRpjqiBeqGKmBwBDBy28EnRjORxTNe269KSSr5un5qyWi1iL61wLxpd+ZOg=="
    },
    "node_modules/user-agents": {
      "version": "1.1.669",
      "resolved": "https://registry.npmjs.org/user-agents/-/user-agents-1.1.669.tgz",
      "integrity": "sha512-pbIzG+AOqCaIpySKJ4IAm1l0VyE4jMnK4y1thV8lm8PYxI+7X5uWcppOK7zY79TCKKTAnJH3/4gaVIZHsjrmJA==",
      "dependencies": {
        "lodash.clonedeep": "^4.5.0"
      }
    },
    "node_modules/util": {
      "version": "0.12.5",
      "resolved": "https://registry.npmjs.org/util/-/util-0.12.5.tgz",

backend/package.json
@@ -1,6 +1,6 @@
{
  "name": "dutchie-menus-backend",
  "version": "1.5.1",
  "version": "1.6.0",
  "description": "Backend API for Dutchie Menus scraper and management",
  "main": "dist/index.js",
  "scripts": {
@@ -22,6 +22,7 @@
    "seed:dt:cities:bulk": "tsx src/scripts/seed-dt-cities-bulk.ts"
  },
  "dependencies": {
    "@kubernetes/client-node": "^1.4.0",
    "@types/bcryptjs": "^3.0.0",
    "axios": "^1.6.2",
    "bcrypt": "^5.1.1",
@@ -35,6 +36,7 @@
    "helmet": "^7.1.0",
    "https-proxy-agent": "^7.0.2",
    "ioredis": "^5.8.2",
    "ip2location-nodejs": "^9.7.0",
    "ipaddr.js": "^2.2.0",
    "jsonwebtoken": "^9.0.2",
    "minio": "^7.1.3",
@@ -47,6 +49,7 @@
    "puppeteer-extra-plugin-stealth": "^2.11.2",
    "sharp": "^0.32.0",
    "socks-proxy-agent": "^8.0.2",
    "user-agents": "^1.1.669",
    "uuid": "^9.0.1",
    "zod": "^3.22.4"
  },

backend/public/downloads/cannaiq-menus-1.5.3.zip (new binary file; not shown)
backend/public/downloads/cannaiq-menus-1.5.4.zip (new binary file; not shown)
backend/public/downloads/cannaiq-menus-1.6.0.zip (new binary file; not shown)
backend/public/downloads/cannaiq-menus-latest.zip (new symbolic link)
@@ -0,0 +1 @@
cannaiq-menus-1.6.0.zip
backend/scripts/download-ip2location.sh (new executable file, 65 lines)
@@ -0,0 +1,65 @@
#!/bin/bash
# Download IP2Location LITE DB3 (City-level) database
# Free for commercial use with attribution
# https://lite.ip2location.com/database/db3-ip-country-region-city

set -e

DATA_DIR="${1:-./data/ip2location}"
DB_FILE="IP2LOCATION-LITE-DB3.BIN"

mkdir -p "$DATA_DIR"
cd "$DATA_DIR"

echo "Downloading IP2Location LITE DB3 database..."

# IP2Location LITE DB3 - includes city, region, country, lat/lng
# You need to register at https://lite.ip2location.com/ to get a download token
# Then set IP2LOCATION_TOKEN environment variable

if [ -z "$IP2LOCATION_TOKEN" ]; then
  echo ""
  echo "ERROR: IP2LOCATION_TOKEN not set"
  echo ""
  echo "To download the database:"
  echo "1. Register free at https://lite.ip2location.com/"
  echo "2. Get your download token from the dashboard"
  echo "3. Run: IP2LOCATION_TOKEN=your_token ./scripts/download-ip2location.sh"
  echo ""
  exit 1
fi

# Download DB3.LITE (IPv4 + City)
DOWNLOAD_URL="https://www.ip2location.com/download/?token=${IP2LOCATION_TOKEN}&file=DB3LITEBIN"

echo "Downloading from IP2Location..."
curl -L -o ip2location.zip "$DOWNLOAD_URL"

echo "Extracting..."
unzip -o ip2location.zip

# Rename to standard name
if [ -f "IP2LOCATION-LITE-DB3.BIN" ]; then
  echo "Database ready: $DATA_DIR/IP2LOCATION-LITE-DB3.BIN"
elif [ -f "IP-COUNTRY-REGION-CITY.BIN" ]; then
  mv "IP-COUNTRY-REGION-CITY.BIN" "$DB_FILE"
  echo "Database ready: $DATA_DIR/$DB_FILE"
else
  # Find whatever BIN file was extracted
  BIN_FILE=$(ls *.BIN 2>/dev/null | head -1)
  if [ -n "$BIN_FILE" ]; then
    mv "$BIN_FILE" "$DB_FILE"
    echo "Database ready: $DATA_DIR/$DB_FILE"
  else
    echo "ERROR: No BIN file found in archive"
    ls -la
    exit 1
  fi
fi

# Cleanup
rm -f ip2location.zip *.txt LICENSE* README*

echo ""
echo "Done! Database saved to: $DATA_DIR/$DB_FILE"
echo "Update monthly by re-running this script."

@@ -1,3 +1,14 @@
|
||||
/**
|
||||
* CannaiQ Authentication Middleware
|
||||
*
|
||||
* AUTH METHODS (in order of priority):
|
||||
* 1. IP-based: Localhost/trusted IPs get 'internal' role (full access, no token needed)
|
||||
* 2. Token-based: Bearer token (JWT or API token)
|
||||
*
|
||||
* NO username/password auth in API. Use tokens only.
|
||||
*
|
||||
* Localhost bypass: curl from 127.0.0.1 gets automatic admin access.
|
||||
*/
|
||||
import { Request, Response, NextFunction } from 'express';
|
||||
import jwt from 'jsonwebtoken';
|
||||
import bcrypt from 'bcrypt';
|
||||
@@ -5,6 +16,87 @@ import { pool } from '../db/pool';
|
||||
|
||||
const JWT_SECRET = process.env.JWT_SECRET || 'change_this_in_production';
|
||||
|
||||
// Trusted origins that bypass auth for internal/same-origin requests
|
||||
const TRUSTED_ORIGINS = [
|
||||
'https://cannaiq.co',
|
||||
'https://www.cannaiq.co',
|
||||
'https://findadispo.com',
|
||||
'https://www.findadispo.com',
|
||||
'https://findagram.co',
|
||||
'https://www.findagram.co',
|
||||
'http://localhost:3010',
|
||||
'http://localhost:8080',
|
||||
'http://localhost:5173',
|
||||
];
|
||||
|
||||
// Pattern-based trusted origins (wildcards)
|
||||
const TRUSTED_ORIGIN_PATTERNS = [
|
||||
/^https:\/\/.*\.cannabrands\.app$/, // *.cannabrands.app
|
||||
/^https:\/\/.*\.cannaiq\.co$/, // *.cannaiq.co
|
||||
];
|
||||
|
||||
// Trusted IPs for internal pod-to-pod communication
|
||||
const TRUSTED_IPS = [
|
||||
'127.0.0.1',
|
||||
'::1',
|
||||
'::ffff:127.0.0.1',
|
||||
];
|
||||
|
||||
/**
|
||||
* Check if request is from a trusted origin/IP
|
||||
*/
|
||||
function isTrustedRequest(req: Request): boolean {
|
||||
// Check origin header
|
||||
const origin = req.headers.origin;
|
||||
if (origin) {
|
||||
if (TRUSTED_ORIGINS.includes(origin)) {
|
||||
return true;
|
||||
}
|
||||
// Check pattern-based origins (wildcards like *.cannabrands.app)
|
||||
for (const pattern of TRUSTED_ORIGIN_PATTERNS) {
|
||||
if (pattern.test(origin)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check referer header (for same-origin requests without CORS)
|
||||
const referer = req.headers.referer;
|
||||
if (referer) {
|
||||
for (const trusted of TRUSTED_ORIGINS) {
|
||||
if (referer.startsWith(trusted)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
// Check pattern-based referers
|
||||
try {
|
||||
const refererUrl = new URL(referer);
|
||||
const refererOrigin = refererUrl.origin;
|
||||
for (const pattern of TRUSTED_ORIGIN_PATTERNS) {
|
||||
if (pattern.test(refererOrigin)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
} catch {
|
||||
// Invalid referer URL, skip
|
||||
}
|
||||
}
|
||||
|
||||
// Check IP for internal requests (pod-to-pod, localhost)
|
||||
const clientIp = req.ip || req.socket.remoteAddress || '';
|
||||
if (TRUSTED_IPS.includes(clientIp)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Check for Kubernetes internal header (set by ingress/service mesh)
const internalHeader = req.headers['x-internal-request'];
if (internalHeader === process.env.INTERNAL_REQUEST_SECRET) {
  return true;
}

return false;
}

export interface AuthUser {
  id: number;
  email: string;

@@ -63,84 +155,81 @@ export async function authenticateUser(email: string, password: string): Promise
export async function authMiddleware(req: AuthRequest, res: Response, next: NextFunction) {
  const authHeader = req.headers.authorization;

  if (!authHeader || !authHeader.startsWith('Bearer ')) {
    return res.status(401).json({ error: 'No token provided' });
  // If a Bearer token is provided, always try to use it first (logged-in user)
  if (authHeader && authHeader.startsWith('Bearer ')) {
    const token = authHeader.substring(7);

    // Try JWT first
    const jwtUser = verifyToken(token);

    if (jwtUser) {
      req.user = jwtUser;
      return next();
    }

    // If JWT fails, try API token
    try {
      const result = await pool.query(`
        SELECT id, name, rate_limit, active, expires_at, allowed_endpoints
        FROM api_tokens
        WHERE token = $1
      `, [token]);

      if (result.rows.length > 0) {
        const apiToken = result.rows[0];
        if (!apiToken.active) {
          return res.status(401).json({ error: 'API token is inactive' });
        }
        if (apiToken.expires_at && new Date(apiToken.expires_at) < new Date()) {
          return res.status(401).json({ error: 'API token has expired' });
        }
        req.user = {
          id: 0,
          email: `api:${apiToken.name}`,
          role: 'api_token'
        };
        req.apiToken = apiToken;
        return next();
      }
    } catch (err) {
      console.error('API token lookup error:', err);
    }

    // Token provided but invalid
    return res.status(401).json({ error: 'Invalid token' });
  }

  const token = authHeader.substring(7);

  // Try JWT first
  const jwtUser = verifyToken(token);

  if (jwtUser) {
    req.user = jwtUser;
  // No token provided - check trusted origins for API access (WordPress, etc.)
  if (isTrustedRequest(req)) {
    req.user = {
      id: 0,
      email: 'internal@system',
      role: 'internal'
    };
    return next();
  }

  // If JWT fails, try API token
  try {
    const result = await pool.query(`
      SELECT id, name, rate_limit, active, expires_at, allowed_endpoints
      FROM api_tokens
      WHERE token = $1
    `, [token]);

    if (result.rows.length === 0) {
      return res.status(401).json({ error: 'Invalid token' });
    }

    const apiToken = result.rows[0];

    // Check if token is active
    if (!apiToken.active) {
      return res.status(401).json({ error: 'Token is disabled' });
    }

    // Check if token is expired
    if (apiToken.expires_at && new Date(apiToken.expires_at) < new Date()) {
      return res.status(401).json({ error: 'Token has expired' });
    }

    // Check allowed endpoints
    if (apiToken.allowed_endpoints && apiToken.allowed_endpoints.length > 0) {
      const isAllowed = apiToken.allowed_endpoints.some((pattern: string) => {
        // Simple wildcard matching
        const regex = new RegExp('^' + pattern.replace('*', '.*') + '$');
        return regex.test(req.path);
      });

      if (!isAllowed) {
        return res.status(403).json({ error: 'Endpoint not allowed for this token' });
      }
    }

    // Set API token on request for tracking
    req.apiToken = {
      id: apiToken.id,
      name: apiToken.name,
      rate_limit: apiToken.rate_limit
    };

    // Set a generic user for compatibility with existing code
    req.user = {
      id: apiToken.id,
      email: `api-token-${apiToken.id}@system`,
      role: 'api'
    };

    next();
  } catch (error) {
    console.error('Error verifying API token:', error);
    return res.status(500).json({ error: 'Authentication failed' });
  }
  return res.status(401).json({ error: 'No token provided' });
}


/**
 * Require specific role(s) to access endpoint.
 *
 * NOTE: 'internal' role (localhost/trusted IPs) bypasses all role checks.
 * This allows local development and internal services full access.
 */
export function requireRole(...roles: string[]) {
  return (req: AuthRequest, res: Response, next: NextFunction) => {
    if (!req.user) {
      return res.status(401).json({ error: 'Not authenticated' });
    }

    // Internal role (localhost) bypasses role checks
    if (req.user.role === 'internal') {
      return next();
    }

    if (!roles.includes(req.user.role)) {
      return res.status(403).json({ error: 'Insufficient permissions' });
    }
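For context, a minimal sketch of how these middlewares compose on a route, assuming authMiddleware and requireRole are exported from the backend's auth module (the import path below is hypothetical):

// Hypothetical wiring example - adjust the import path to the real auth module.
import express from 'express';
import { authMiddleware, requireRole } from './middleware/auth';

const app = express();

// Any valid JWT, API token, or trusted internal request passes authMiddleware;
// requireRole then gates on role, with 'internal' bypassing the check entirely.
app.get('/admin/stats', authMiddleware, requireRole('admin', 'superadmin'), (req, res) => {
  res.json({ ok: true });
});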
@@ -472,7 +472,8 @@ export class CanonicalHydrationService {
    }

    // Step 3: Create initial snapshots from current product state
    const snapshotsWritten = await this.createInitialSnapshots(dispensaryId, crawlRunId);
    // crawlRunId is guaranteed to be set at this point (either from existing run or insert)
    const snapshotsWritten = await this.createInitialSnapshots(dispensaryId, crawlRunId!);
    result.snapshotsWritten += snapshotsWritten;

    // Update crawl run with snapshot count
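The only functional change here is the trailing non-null assertion: crawlRunId is typed as possibly undefined, and the `!` tells TypeScript to trust the invariant stated in the new comment instead of requiring a runtime check.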
@@ -1,6 +1,7 @@
#!/usr/bin/env node
/**
 * CLI Entrypoint for CannaIQ Backend
 * @module cli
 *
 * Usage:
 *   npx tsx src/cli.ts          # Start API server

@@ -50,18 +51,14 @@ async function main() {
    showHelp();
  }

  if (args.includes('--worker')) {
    console.log('[CLI] Starting worker process...');
    const { startWorker } = await import('./dutchie-az/services/worker');
    await startWorker();
  } else {
    // Default: start API server
    console.log('[CLI] Starting API server...');
    await import('./index');
  }
  // Default: start API server
  console.log('[CLI] Starting API server...');
  await import('./index');
}

main().catch((error) => {
  console.error('[CLI] Fatal error:', error);
  process.exit(1);
});

export {};
@@ -1,657 +0,0 @@
/**
 * Base Dutchie Crawler Template
 *
 * This is the base template for all Dutchie store crawlers.
 * Per-store crawlers extend this by overriding specific methods.
 *
 * Exports:
 * - crawlProducts(dispensary, options) - Main crawl entry point
 * - detectStructure(page) - Detect page structure for sandbox mode
 * - extractProducts(document) - Extract product data
 * - extractImages(document) - Extract product images
 * - extractStock(document) - Extract stock status
 * - extractPagination(document) - Extract pagination info
 */

import {
  crawlDispensaryProducts as baseCrawlDispensaryProducts,
  CrawlResult,
} from '../../dutchie-az/services/product-crawler';
import { Dispensary, CrawlerProfileOptions } from '../../dutchie-az/types';

// Re-export CrawlResult for convenience
export { CrawlResult };

// ============================================================
// TYPES
// ============================================================

/**
 * Options passed to the per-store crawler
 */
export interface StoreCrawlOptions {
  pricingType?: 'rec' | 'med';
  useBothModes?: boolean;
  downloadImages?: boolean;
  trackStock?: boolean;
  timeoutMs?: number;
  config?: Record<string, any>;
}

/**
 * Progress callback for reporting crawl progress
 */
export interface CrawlProgressCallback {
  phase: 'fetching' | 'processing' | 'saving' | 'images' | 'complete';
  current: number;
  total: number;
  message?: string;
}

/**
 * Structure detection result for sandbox mode
 */
export interface StructureDetectionResult {
  success: boolean;
  menuType: 'dutchie' | 'treez' | 'jane' | 'unknown';
  iframeUrl?: string;
  graphqlEndpoint?: string;
  dispensaryId?: string;
  selectors: {
    productContainer?: string;
    productName?: string;
    productPrice?: string;
    productImage?: string;
    productCategory?: string;
    pagination?: string;
    loadMore?: string;
  };
  pagination: {
    type: 'scroll' | 'click' | 'graphql' | 'none';
    hasMore?: boolean;
    pageSize?: number;
  };
  errors: string[];
  metadata: Record<string, any>;
}

/**
 * Product extraction result
 */
export interface ExtractedProduct {
  externalId: string;
  name: string;
  brand?: string;
  category?: string;
  subcategory?: string;
  price?: number;
  priceRec?: number;
  priceMed?: number;
  weight?: string;
  thcContent?: string;
  cbdContent?: string;
  description?: string;
  imageUrl?: string;
  stockStatus?: 'in_stock' | 'out_of_stock' | 'low_stock' | 'unknown';
  quantity?: number;
  raw?: Record<string, any>;
}

/**
 * Image extraction result
 */
export interface ExtractedImage {
  productId: string;
  imageUrl: string;
  isPrimary: boolean;
  position: number;
}

/**
 * Stock extraction result
 */
export interface ExtractedStock {
  productId: string;
  status: 'in_stock' | 'out_of_stock' | 'low_stock' | 'unknown';
  quantity?: number;
  lastChecked: Date;
}

/**
 * Pagination extraction result
 */
export interface ExtractedPagination {
  hasNextPage: boolean;
  currentPage?: number;
  totalPages?: number;
  totalProducts?: number;
  nextCursor?: string;
  loadMoreSelector?: string;
}

/**
 * Hook points that per-store crawlers can override
 */
export interface DutchieCrawlerHooks {
  /**
   * Called before fetching products
   * Can be used to set up custom headers, cookies, etc.
   */
  beforeFetch?: (dispensary: Dispensary) => Promise<void>;

  /**
   * Called after fetching products, before processing
   * Can be used to filter or transform raw products
   */
  afterFetch?: (products: any[], dispensary: Dispensary) => Promise<any[]>;

  /**
   * Called after all processing is complete
   * Can be used for cleanup or post-processing
   */
  afterComplete?: (result: CrawlResult, dispensary: Dispensary) => Promise<void>;

  /**
   * Custom selector resolver for iframe detection
   */
  resolveIframe?: (page: any) => Promise<string | null>;

  /**
   * Custom product container selector
   */
  getProductContainerSelector?: () => string;

  /**
   * Custom product extraction from container element
   */
  extractProductFromElement?: (element: any) => Promise<ExtractedProduct | null>;
}

/**
 * Selectors configuration for per-store overrides
 */
export interface DutchieSelectors {
  iframe?: string;
  productContainer?: string;
  productName?: string;
  productPrice?: string;
  productPriceRec?: string;
  productPriceMed?: string;
  productImage?: string;
  productCategory?: string;
  productBrand?: string;
  productWeight?: string;
  productThc?: string;
  productCbd?: string;
  productDescription?: string;
  productStock?: string;
  loadMore?: string;
  pagination?: string;
}

// ============================================================
// DEFAULT SELECTORS
// ============================================================

export const DEFAULT_DUTCHIE_SELECTORS: DutchieSelectors = {
  iframe: 'iframe[src*="dutchie.com"]',
  productContainer: '[data-testid="product-card"], .product-card, [class*="ProductCard"]',
  productName: '[data-testid="product-title"], .product-title, [class*="ProductTitle"]',
  productPrice: '[data-testid="product-price"], .product-price, [class*="ProductPrice"]',
  productImage: 'img[src*="dutchie"], img[src*="product"], .product-image img',
  productCategory: '[data-testid="category-name"], .category-name',
  productBrand: '[data-testid="brand-name"], .brand-name, [class*="BrandName"]',
  loadMore: 'button[data-testid="load-more"], .load-more-button',
  pagination: '.pagination, [class*="Pagination"]',
};

// ============================================================
// BASE CRAWLER CLASS
// ============================================================

/**
 * BaseDutchieCrawler - Base class for all Dutchie store crawlers
 *
 * Per-store crawlers extend this class and override methods as needed.
 * The default implementation delegates to the existing shared Dutchie logic.
 */
export class BaseDutchieCrawler {
  protected dispensary: Dispensary;
  protected options: StoreCrawlOptions;
  protected hooks: DutchieCrawlerHooks;
  protected selectors: DutchieSelectors;

  constructor(
    dispensary: Dispensary,
    options: StoreCrawlOptions = {},
    hooks: DutchieCrawlerHooks = {},
    selectors: DutchieSelectors = {}
  ) {
    this.dispensary = dispensary;
    this.options = {
      pricingType: 'rec',
      useBothModes: true,
      downloadImages: true,
      trackStock: true,
      timeoutMs: 30000,
      ...options,
    };
    this.hooks = hooks;
    this.selectors = { ...DEFAULT_DUTCHIE_SELECTORS, ...selectors };
  }

  /**
   * Main entry point - crawl products for this dispensary
   * Override this in per-store crawlers to customize behavior
   */
  async crawlProducts(): Promise<CrawlResult> {
    // Call beforeFetch hook if defined
    if (this.hooks.beforeFetch) {
      await this.hooks.beforeFetch(this.dispensary);
    }

    // Use the existing shared Dutchie crawl logic
    const result = await baseCrawlDispensaryProducts(
      this.dispensary,
      this.options.pricingType || 'rec',
      {
        useBothModes: this.options.useBothModes,
        downloadImages: this.options.downloadImages,
      }
    );

    // Call afterComplete hook if defined
    if (this.hooks.afterComplete) {
      await this.hooks.afterComplete(result, this.dispensary);
    }

    return result;
  }

  /**
   * Detect page structure for sandbox discovery mode
   * Override in per-store crawlers if needed
   *
   * @param page - Puppeteer page object or HTML string
   * @returns Structure detection result
   */
  async detectStructure(page: any): Promise<StructureDetectionResult> {
    const result: StructureDetectionResult = {
      success: false,
      menuType: 'unknown',
      selectors: {},
      pagination: { type: 'none' },
      errors: [],
      metadata: {},
    };

    try {
      // Default implementation: check for Dutchie iframe
      if (typeof page === 'string') {
        // HTML string mode
        if (page.includes('dutchie.com')) {
          result.menuType = 'dutchie';
          result.success = true;
        }
      } else if (page && typeof page.evaluate === 'function') {
        // Puppeteer page mode
        const detection = await page.evaluate((selectorConfig: DutchieSelectors) => {
          const iframe = document.querySelector(selectorConfig.iframe || '') as HTMLIFrameElement;
          const iframeUrl = iframe?.src || null;

          // Check for product containers
          const containers = document.querySelectorAll(selectorConfig.productContainer || '');

          return {
            hasIframe: !!iframe,
            iframeUrl,
            productCount: containers.length,
            isDutchie: !!iframeUrl?.includes('dutchie.com'),
          };
        }, this.selectors);

        if (detection.isDutchie) {
          result.menuType = 'dutchie';
          result.iframeUrl = detection.iframeUrl;
          result.success = true;
        }

        result.metadata = detection;
      }

      // Set default selectors for Dutchie
      if (result.menuType === 'dutchie') {
        result.selectors = {
          productContainer: this.selectors.productContainer,
          productName: this.selectors.productName,
          productPrice: this.selectors.productPrice,
          productImage: this.selectors.productImage,
          productCategory: this.selectors.productCategory,
        };
        result.pagination = { type: 'graphql' };
      }
    } catch (error: any) {
      result.errors.push(`Detection error: ${error.message}`);
    }

    return result;
  }

  /**
   * Extract products from page/document
   * Override in per-store crawlers for custom extraction
   *
   * @param document - DOM document, Puppeteer page, or raw products array
   * @returns Array of extracted products
   */
  async extractProducts(document: any): Promise<ExtractedProduct[]> {
    // Default implementation: assume document is already an array of products
    // from the GraphQL response
    if (Array.isArray(document)) {
      return document.map((product) => this.mapRawProduct(product));
    }

    // If document is a Puppeteer page, extract from DOM
    if (document && typeof document.evaluate === 'function') {
      return this.extractProductsFromPage(document);
    }

    return [];
  }

  /**
   * Extract products from Puppeteer page
   * Override for custom DOM extraction
   */
  protected async extractProductsFromPage(page: any): Promise<ExtractedProduct[]> {
    const products = await page.evaluate((selectors: DutchieSelectors) => {
      const containers = document.querySelectorAll(selectors.productContainer || '');
      return Array.from(containers).map((container) => {
        const nameEl = container.querySelector(selectors.productName || '');
        const priceEl = container.querySelector(selectors.productPrice || '');
        const imageEl = container.querySelector(selectors.productImage || '') as HTMLImageElement;
        const brandEl = container.querySelector(selectors.productBrand || '');

        return {
          name: nameEl?.textContent?.trim() || '',
          price: priceEl?.textContent?.trim() || '',
          imageUrl: imageEl?.src || '',
          brand: brandEl?.textContent?.trim() || '',
        };
      });
    }, this.selectors);

    return products.map((p: any, i: number) => ({
      externalId: `dom-product-${i}`,
      name: p.name,
      brand: p.brand,
      price: this.parsePrice(p.price),
      imageUrl: p.imageUrl,
      stockStatus: 'unknown' as const,
    }));
  }

  /**
   * Map raw product from GraphQL to ExtractedProduct
   * Override for custom mapping
   */
  protected mapRawProduct(raw: any): ExtractedProduct {
    return {
      externalId: raw.id || raw._id || raw.externalId,
      name: raw.name || raw.Name,
      brand: raw.brand?.name || raw.brandName || raw.brand,
      category: raw.type || raw.category || raw.Category,
      subcategory: raw.subcategory || raw.Subcategory,
      price: raw.recPrice || raw.price || raw.Price,
      priceRec: raw.recPrice || raw.Prices?.rec,
      priceMed: raw.medPrice || raw.Prices?.med,
      weight: raw.weight || raw.Weight,
      thcContent: raw.potencyThc?.formatted || raw.THCContent?.formatted,
      cbdContent: raw.potencyCbd?.formatted || raw.CBDContent?.formatted,
      description: raw.description || raw.Description,
      imageUrl: raw.image || raw.Image,
      stockStatus: this.mapStockStatus(raw),
      quantity: raw.quantity || raw.Quantity,
      raw,
    };
  }

  /**
   * Map raw stock status to standardized value
   */
  protected mapStockStatus(raw: any): 'in_stock' | 'out_of_stock' | 'low_stock' | 'unknown' {
    const status = raw.Status || raw.status || raw.stockStatus;
    if (status === 'Active' || status === 'active' || status === 'in_stock') {
      return 'in_stock';
    }
    if (status === 'Inactive' || status === 'inactive' || status === 'out_of_stock') {
      return 'out_of_stock';
    }
    if (status === 'low_stock') {
      return 'low_stock';
    }
    return 'unknown';
  }

  /**
   * Parse price string to number
   */
  protected parsePrice(priceStr: string): number | undefined {
    if (!priceStr) return undefined;
    const cleaned = priceStr.replace(/[^0-9.]/g, '');
    const num = parseFloat(cleaned);
    return isNaN(num) ? undefined : num;
  }

  /**
   * Extract images from document
   * Override for custom image extraction
   *
   * @param document - DOM document, Puppeteer page, or products array
   * @returns Array of extracted images
   */
  async extractImages(document: any): Promise<ExtractedImage[]> {
    if (Array.isArray(document)) {
      return document
        .filter((p) => p.image || p.Image || p.imageUrl)
        .map((p, i) => ({
          productId: p.id || p._id || `product-${i}`,
          imageUrl: p.image || p.Image || p.imageUrl,
          isPrimary: true,
          position: 0,
        }));
    }

    // Puppeteer page extraction
    if (document && typeof document.evaluate === 'function') {
      return this.extractImagesFromPage(document);
    }

    return [];
  }

  /**
   * Extract images from Puppeteer page
   */
  protected async extractImagesFromPage(page: any): Promise<ExtractedImage[]> {
    const images = await page.evaluate((selector: string) => {
      const imgs = document.querySelectorAll(selector);
      return Array.from(imgs).map((img, i) => ({
        src: (img as HTMLImageElement).src,
        position: i,
      }));
    }, this.selectors.productImage || 'img');

    return images.map((img: any, i: number) => ({
      productId: `dom-product-${i}`,
      imageUrl: img.src,
      isPrimary: i === 0,
      position: img.position,
    }));
  }

  /**
   * Extract stock information from document
   * Override for custom stock extraction
   *
   * @param document - DOM document, Puppeteer page, or products array
   * @returns Array of extracted stock statuses
   */
  async extractStock(document: any): Promise<ExtractedStock[]> {
    if (Array.isArray(document)) {
      return document.map((p) => ({
        productId: p.id || p._id || p.externalId,
        status: this.mapStockStatus(p),
        quantity: p.quantity || p.Quantity,
        lastChecked: new Date(),
      }));
    }

    return [];
  }

  /**
   * Extract pagination information from document
   * Override for custom pagination handling
   *
   * @param document - DOM document, Puppeteer page, or GraphQL response
   * @returns Pagination info
   */
  async extractPagination(document: any): Promise<ExtractedPagination> {
    // Default: check for page info in GraphQL response
    if (document && document.pageInfo) {
      return {
        hasNextPage: document.pageInfo.hasNextPage || false,
        currentPage: document.pageInfo.currentPage,
        totalPages: document.pageInfo.totalPages,
        totalProducts: document.pageInfo.totalCount || document.totalCount,
        nextCursor: document.pageInfo.endCursor,
      };
    }

    // Default: no pagination
    return {
      hasNextPage: false,
    };
  }

  /**
   * Get the cName (Dutchie slug) for this dispensary
   * Override to customize cName extraction
   */
  getCName(): string {
    if (this.dispensary.menuUrl) {
      try {
        const url = new URL(this.dispensary.menuUrl);
        const segments = url.pathname.split('/').filter(Boolean);
        if (segments.length >= 2) {
          return segments[segments.length - 1];
        }
      } catch {
        // Fall through to default
      }
    }
    return this.dispensary.slug || '';
  }

  /**
   * Get custom headers for API requests
   * Override for store-specific headers
   */
  getCustomHeaders(): Record<string, string> {
    const cName = this.getCName();
    return {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      Origin: 'https://dutchie.com',
      Referer: `https://dutchie.com/embedded-menu/${cName}`,
    };
  }
}

// ============================================================
// FACTORY FUNCTION
// ============================================================

/**
 * Create a base Dutchie crawler instance
 * This is the default export used when no per-store override exists
 */
export function createCrawler(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {},
  hooks: DutchieCrawlerHooks = {},
  selectors: DutchieSelectors = {}
): BaseDutchieCrawler {
  return new BaseDutchieCrawler(dispensary, options, hooks, selectors);
}

// ============================================================
// STANDALONE FUNCTIONS (required exports for orchestrator)
// ============================================================

/**
 * Crawl products using the base Dutchie logic
 * Per-store files can call this or override it completely
 */
export async function crawlProducts(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {}
): Promise<CrawlResult> {
  const crawler = createCrawler(dispensary, options);
  return crawler.crawlProducts();
}

/**
 * Detect structure using the base Dutchie logic
 */
export async function detectStructure(
  page: any,
  dispensary?: Dispensary
): Promise<StructureDetectionResult> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.detectStructure(page);
}

/**
 * Extract products using the base Dutchie logic
 */
export async function extractProducts(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedProduct[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractProducts(document);
}

/**
 * Extract images using the base Dutchie logic
 */
export async function extractImages(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedImage[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractImages(document);
}

/**
 * Extract stock using the base Dutchie logic
 */
export async function extractStock(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedStock[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractStock(document);
}

/**
 * Extract pagination using the base Dutchie logic
 */
export async function extractPagination(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedPagination> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractPagination(document);
}
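As a sketch of how this removed template's hook points were meant to be consumed (the store name and filter predicate below are hypothetical, not code from the PR):

// Hypothetical per-store usage: filter out accessories after fetch
// and log a summary when the crawl completes.
import { BaseDutchieCrawler, DutchieCrawlerHooks } from './base-dutchie';

const hooks: DutchieCrawlerHooks = {
  afterFetch: async (products) => products.filter((p) => p.type !== 'Accessories'),
  afterComplete: async (result, dispensary) => {
    console.log(`[ExampleStore] ${dispensary.name}: crawl finished, success=${result.success}`);
  },
};

// const crawler = new BaseDutchieCrawler(dispensary, { pricingType: 'med' }, hooks);
// const result = await crawler.crawlProducts();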
@@ -1,330 +0,0 @@
/**
 * Base Jane Crawler Template (PLACEHOLDER)
 *
 * This is the base template for all Jane (iheartjane) store crawlers.
 * Per-store crawlers extend this by overriding specific methods.
 *
 * TODO: Implement Jane-specific crawling logic (Algolia-based)
 */

import { Dispensary } from '../../dutchie-az/types';
import {
  StoreCrawlOptions,
  CrawlResult,
  StructureDetectionResult,
  ExtractedProduct,
  ExtractedImage,
  ExtractedStock,
  ExtractedPagination,
} from './base-dutchie';

// Re-export types
export {
  StoreCrawlOptions,
  CrawlResult,
  StructureDetectionResult,
  ExtractedProduct,
  ExtractedImage,
  ExtractedStock,
  ExtractedPagination,
};

// ============================================================
// JANE-SPECIFIC TYPES
// ============================================================

export interface JaneConfig {
  algoliaAppId?: string;
  algoliaApiKey?: string;
  algoliaIndex?: string;
  storeId?: string;
}

export interface JaneSelectors {
  productContainer?: string;
  productName?: string;
  productPrice?: string;
  productImage?: string;
  productCategory?: string;
  productBrand?: string;
  pagination?: string;
  loadMore?: string;
}

export const DEFAULT_JANE_SELECTORS: JaneSelectors = {
  productContainer: '[data-testid="product-card"], .product-card',
  productName: '[data-testid="product-name"], .product-name',
  productPrice: '[data-testid="product-price"], .product-price',
  productImage: '.product-image img, [data-testid="product-image"] img',
  productCategory: '.product-category',
  productBrand: '.product-brand, [data-testid="brand-name"]',
  loadMore: '[data-testid="load-more"], .load-more-btn',
};

// ============================================================
// BASE JANE CRAWLER CLASS
// ============================================================

export class BaseJaneCrawler {
  protected dispensary: Dispensary;
  protected options: StoreCrawlOptions;
  protected selectors: JaneSelectors;
  protected janeConfig: JaneConfig;

  constructor(
    dispensary: Dispensary,
    options: StoreCrawlOptions = {},
    selectors: JaneSelectors = {},
    janeConfig: JaneConfig = {}
  ) {
    this.dispensary = dispensary;
    this.options = {
      pricingType: 'rec',
      useBothModes: false,
      downloadImages: true,
      trackStock: true,
      timeoutMs: 30000,
      ...options,
    };
    this.selectors = { ...DEFAULT_JANE_SELECTORS, ...selectors };
    this.janeConfig = janeConfig;
  }

  /**
   * Main entry point - crawl products for this dispensary
   * TODO: Implement Jane/Algolia-specific crawling
   */
  async crawlProducts(): Promise<CrawlResult> {
    const startTime = Date.now();
    console.warn(`[BaseJaneCrawler] Jane crawling not yet implemented for ${this.dispensary.name}`);
    return {
      success: false,
      dispensaryId: this.dispensary.id || 0,
      productsFound: 0,
      productsFetched: 0,
      productsUpserted: 0,
      snapshotsCreated: 0,
      imagesDownloaded: 0,
      errorMessage: 'Jane crawler not yet implemented',
      durationMs: Date.now() - startTime,
    };
  }

  /**
   * Detect page structure for sandbox discovery mode
   * Jane uses Algolia, so we look for Algolia config
   */
  async detectStructure(page: any): Promise<StructureDetectionResult> {
    const result: StructureDetectionResult = {
      success: false,
      menuType: 'unknown',
      selectors: {},
      pagination: { type: 'none' },
      errors: [],
      metadata: {},
    };

    try {
      if (page && typeof page.evaluate === 'function') {
        // Look for Jane/Algolia indicators
        const detection = await page.evaluate(() => {
          // Check for iheartjane in page
          const hasJane = document.documentElement.innerHTML.includes('iheartjane') ||
            document.documentElement.innerHTML.includes('jane-menu');

          // Look for Algolia config
          const scripts = Array.from(document.querySelectorAll('script'));
          let algoliaConfig: any = null;

          for (const script of scripts) {
            const content = script.textContent || '';
            if (content.includes('algolia') || content.includes('ALGOLIA')) {
              // Try to extract config
              const appIdMatch = content.match(/applicationId['":\s]+['"]([^'"]+)['"]/);
              const apiKeyMatch = content.match(/apiKey['":\s]+['"]([^'"]+)['"]/);
              if (appIdMatch && apiKeyMatch) {
                algoliaConfig = {
                  appId: appIdMatch[1],
                  apiKey: apiKeyMatch[1],
                };
              }
            }
          }

          return {
            hasJane,
            algoliaConfig,
          };
        });

        if (detection.hasJane) {
          result.menuType = 'jane';
          result.success = true;
          result.metadata = detection;

          if (detection.algoliaConfig) {
            result.metadata.algoliaAppId = detection.algoliaConfig.appId;
            result.metadata.algoliaApiKey = detection.algoliaConfig.apiKey;
          }
        }
      }
    } catch (error: any) {
      result.errors.push(`Detection error: ${error.message}`);
    }

    return result;
  }

  /**
   * Extract products from Algolia response or page
   */
  async extractProducts(document: any): Promise<ExtractedProduct[]> {
    // If document is Algolia hits array
    if (Array.isArray(document)) {
      return document.map((hit) => this.mapAlgoliaHit(hit));
    }

    console.warn('[BaseJaneCrawler] extractProducts not yet fully implemented');
    return [];
  }

  /**
   * Map Algolia hit to ExtractedProduct
   */
  protected mapAlgoliaHit(hit: any): ExtractedProduct {
    return {
      externalId: hit.objectID || hit.id || hit.product_id,
      name: hit.name || hit.product_name,
      brand: hit.brand || hit.brand_name,
      category: hit.category || hit.kind,
      subcategory: hit.subcategory,
      price: hit.price || hit.bucket_price,
      priceRec: hit.prices?.rec || hit.price_rec,
      priceMed: hit.prices?.med || hit.price_med,
      weight: hit.weight || hit.amount,
      thcContent: hit.percent_thc ? `${hit.percent_thc}%` : undefined,
      cbdContent: hit.percent_cbd ? `${hit.percent_cbd}%` : undefined,
      description: hit.description,
      imageUrl: hit.image_url || hit.product_image_url,
      stockStatus: hit.available ? 'in_stock' : 'out_of_stock',
      quantity: hit.quantity_available,
      raw: hit,
    };
  }

  /**
   * Extract images from document
   */
  async extractImages(document: any): Promise<ExtractedImage[]> {
    if (Array.isArray(document)) {
      return document
        .filter((hit) => hit.image_url || hit.product_image_url)
        .map((hit, i) => ({
          productId: hit.objectID || hit.id || `jane-product-${i}`,
          imageUrl: hit.image_url || hit.product_image_url,
          isPrimary: true,
          position: 0,
        }));
    }

    return [];
  }

  /**
   * Extract stock information from document
   */
  async extractStock(document: any): Promise<ExtractedStock[]> {
    if (Array.isArray(document)) {
      return document.map((hit) => ({
        productId: hit.objectID || hit.id,
        status: hit.available ? 'in_stock' as const : 'out_of_stock' as const,
        quantity: hit.quantity_available,
        lastChecked: new Date(),
      }));
    }

    return [];
  }

  /**
   * Extract pagination information
   * Algolia responses here are page-based (page / nbPages)
   */
  async extractPagination(document: any): Promise<ExtractedPagination> {
    if (document && typeof document === 'object' && !Array.isArray(document)) {
      return {
        hasNextPage: document.page < document.nbPages - 1,
        currentPage: document.page,
        totalPages: document.nbPages,
        totalProducts: document.nbHits,
      };
    }

    return { hasNextPage: false };
  }
}

// ============================================================
// FACTORY FUNCTION
// ============================================================

export function createCrawler(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {},
  selectors: JaneSelectors = {},
  janeConfig: JaneConfig = {}
): BaseJaneCrawler {
  return new BaseJaneCrawler(dispensary, options, selectors, janeConfig);
}

// ============================================================
// STANDALONE FUNCTIONS
// ============================================================

export async function crawlProducts(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {}
): Promise<CrawlResult> {
  const crawler = createCrawler(dispensary, options);
  return crawler.crawlProducts();
}

export async function detectStructure(
  page: any,
  dispensary?: Dispensary
): Promise<StructureDetectionResult> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.detectStructure(page);
}

export async function extractProducts(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedProduct[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractProducts(document);
}

export async function extractImages(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedImage[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractImages(document);
}

export async function extractStock(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedStock[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractStock(document);
}

export async function extractPagination(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedPagination> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractPagination(document);
}
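Since Jane menus are Algolia-backed, a future crawlProducts implementation would presumably query Algolia's public REST search endpoint with the credentials that detectStructure discovers. A minimal sketch, assuming the appId/apiKey/index from JaneConfig (the index name and query shape are assumptions, not code from the PR):

// Hypothetical sketch of the Algolia query a Jane implementation might issue.
// Endpoint shape follows Algolia's documented REST search API.
async function queryJaneAlgolia(config: JaneConfig, page = 0): Promise<any> {
  const url = `https://${config.algoliaAppId}-dsn.algolia.net/1/indexes/${config.algoliaIndex}/query`;
  const res = await fetch(url, {
    method: 'POST',
    headers: {
      'X-Algolia-Application-Id': config.algoliaAppId || '',
      'X-Algolia-API-Key': config.algoliaApiKey || '',
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({ query: '', page, hitsPerPage: 100 }),
  });
  return res.json(); // hits feed mapAlgoliaHit; page/nbPages feed extractPagination
}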
@@ -1,212 +0,0 @@
/**
 * Base Treez Crawler Template (PLACEHOLDER)
 *
 * This is the base template for all Treez store crawlers.
 * Per-store crawlers extend this by overriding specific methods.
 *
 * TODO: Implement Treez-specific crawling logic
 */

import { Dispensary } from '../../dutchie-az/types';
import {
  StoreCrawlOptions,
  CrawlResult,
  StructureDetectionResult,
  ExtractedProduct,
  ExtractedImage,
  ExtractedStock,
  ExtractedPagination,
} from './base-dutchie';

// Re-export types
export {
  StoreCrawlOptions,
  CrawlResult,
  StructureDetectionResult,
  ExtractedProduct,
  ExtractedImage,
  ExtractedStock,
  ExtractedPagination,
};

// ============================================================
// TREEZ-SPECIFIC TYPES
// ============================================================

export interface TreezSelectors {
  productContainer?: string;
  productName?: string;
  productPrice?: string;
  productImage?: string;
  productCategory?: string;
  productBrand?: string;
  addToCart?: string;
  pagination?: string;
}

export const DEFAULT_TREEZ_SELECTORS: TreezSelectors = {
  productContainer: '.product-tile, [class*="ProductCard"]',
  productName: '.product-name, [class*="ProductName"]',
  productPrice: '.product-price, [class*="ProductPrice"]',
  productImage: '.product-image img',
  productCategory: '.product-category',
  productBrand: '.product-brand',
  addToCart: '.add-to-cart-btn',
  pagination: '.pagination',
};

// ============================================================
// BASE TREEZ CRAWLER CLASS
// ============================================================

export class BaseTreezCrawler {
  protected dispensary: Dispensary;
  protected options: StoreCrawlOptions;
  protected selectors: TreezSelectors;

  constructor(
    dispensary: Dispensary,
    options: StoreCrawlOptions = {},
    selectors: TreezSelectors = {}
  ) {
    this.dispensary = dispensary;
    this.options = {
      pricingType: 'rec',
      useBothModes: false,
      downloadImages: true,
      trackStock: true,
      timeoutMs: 30000,
      ...options,
    };
    this.selectors = { ...DEFAULT_TREEZ_SELECTORS, ...selectors };
  }

  /**
   * Main entry point - crawl products for this dispensary
   * TODO: Implement Treez-specific crawling
   */
  async crawlProducts(): Promise<CrawlResult> {
    const startTime = Date.now();
    console.warn(`[BaseTreezCrawler] Treez crawling not yet implemented for ${this.dispensary.name}`);
    return {
      success: false,
      dispensaryId: this.dispensary.id || 0,
      productsFound: 0,
      productsFetched: 0,
      productsUpserted: 0,
      snapshotsCreated: 0,
      imagesDownloaded: 0,
      errorMessage: 'Treez crawler not yet implemented',
      durationMs: Date.now() - startTime,
    };
  }

  /**
   * Detect page structure for sandbox discovery mode
   */
  async detectStructure(page: any): Promise<StructureDetectionResult> {
    return {
      success: false,
      menuType: 'unknown',
      selectors: {},
      pagination: { type: 'none' },
      errors: ['Treez structure detection not yet implemented'],
      metadata: {},
    };
  }

  /**
   * Extract products from page/document
   */
  async extractProducts(document: any): Promise<ExtractedProduct[]> {
    console.warn('[BaseTreezCrawler] extractProducts not yet implemented');
    return [];
  }

  /**
   * Extract images from document
   */
  async extractImages(document: any): Promise<ExtractedImage[]> {
    console.warn('[BaseTreezCrawler] extractImages not yet implemented');
    return [];
  }

  /**
   * Extract stock information from document
   */
  async extractStock(document: any): Promise<ExtractedStock[]> {
    console.warn('[BaseTreezCrawler] extractStock not yet implemented');
    return [];
  }

  /**
   * Extract pagination information from document
   */
  async extractPagination(document: any): Promise<ExtractedPagination> {
    return { hasNextPage: false };
  }
}

// ============================================================
// FACTORY FUNCTION
// ============================================================

export function createCrawler(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {},
  selectors: TreezSelectors = {}
): BaseTreezCrawler {
  return new BaseTreezCrawler(dispensary, options, selectors);
}

// ============================================================
// STANDALONE FUNCTIONS
// ============================================================

export async function crawlProducts(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {}
): Promise<CrawlResult> {
  const crawler = createCrawler(dispensary, options);
  return crawler.crawlProducts();
}

export async function detectStructure(
  page: any,
  dispensary?: Dispensary
): Promise<StructureDetectionResult> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.detectStructure(page);
}

export async function extractProducts(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedProduct[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractProducts(document);
}

export async function extractImages(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedImage[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractImages(document);
}

export async function extractStock(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedStock[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractStock(document);
}

export async function extractPagination(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedPagination> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractPagination(document);
}
@@ -1,27 +0,0 @@
/**
 * Base Crawler Templates Index
 *
 * Exports all base crawler templates for easy importing.
 */

// Dutchie base (primary implementation)
export * from './base-dutchie';

// Treez base (placeholder)
export * as Treez from './base-treez';

// Jane base (placeholder)
export * as Jane from './base-jane';

// Re-export common types from dutchie for convenience
export type {
  StoreCrawlOptions,
  CrawlResult,
  StructureDetectionResult,
  ExtractedProduct,
  ExtractedImage,
  ExtractedStock,
  ExtractedPagination,
  DutchieCrawlerHooks,
  DutchieSelectors,
} from './base-dutchie';
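For reference, consumers of this index would have imported the Dutchie base directly and the placeholders through their namespaces; a usage sketch (import path assumed relative to the caller):

// Usage sketch for the index re-exports above.
import { BaseDutchieCrawler, Treez, Jane } from './base';

// const treezCrawler = Treez.createCrawler(dispensary);
// const janeCrawler = Jane.createCrawler(dispensary);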
@@ -1,9 +0,0 @@
/**
 * Base Dutchie Crawler Template (Re-export for backward compatibility)
 *
 * DEPRECATED: Import from '../base/base-dutchie' instead.
 * This file re-exports everything from the new location for existing code.
 */

// Re-export everything from the new base location
export * from '../base/base-dutchie';
@@ -1,118 +0,0 @@
/**
 * Trulieve Scottsdale - Per-Store Dutchie Crawler
 *
 * Store ID: 101
 * Profile Key: trulieve-scottsdale
 * Platform Dispensary ID: 5eaf489fa8a61801212577cc
 *
 * Phase 1: Identity implementation - no overrides, just uses base Dutchie logic.
 * Future: Add store-specific selectors, timing, or custom logic as needed.
 */

import {
  BaseDutchieCrawler,
  StoreCrawlOptions,
  CrawlResult,
  DutchieSelectors,
  crawlProducts as baseCrawlProducts,
} from '../../base/base-dutchie';
import { Dispensary } from '../../../dutchie-az/types';

// Re-export CrawlResult for the orchestrator
export { CrawlResult };

// ============================================================
// STORE CONFIGURATION
// ============================================================

/**
 * Store-specific configuration
 * These can be used to customize crawler behavior for this store
 */
export const STORE_CONFIG = {
  storeId: 101,
  profileKey: 'trulieve-scottsdale',
  name: 'Trulieve of Scottsdale Dispensary',
  platformDispensaryId: '5eaf489fa8a61801212577cc',

  // Store-specific overrides (none for Phase 1)
  customOptions: {
    // Example future overrides:
    // pricingType: 'rec',
    // useBothModes: true,
    // customHeaders: {},
    // maxRetries: 3,
  },
};

// ============================================================
// STORE CRAWLER CLASS
// ============================================================

/**
 * TrulieveScottsdaleCrawler - Per-store crawler for Trulieve Scottsdale
 *
 * Phase 1: Identity implementation - extends BaseDutchieCrawler with no overrides.
 * Future phases can override methods like:
 * - getCName() for custom slug handling
 * - crawlProducts() for completely custom logic
 * - Add hooks for pre/post processing
 */
export class TrulieveScottsdaleCrawler extends BaseDutchieCrawler {
  constructor(dispensary: Dispensary, options: StoreCrawlOptions = {}) {
    // Merge store-specific options with provided options
    const mergedOptions: StoreCrawlOptions = {
      ...STORE_CONFIG.customOptions,
      ...options,
    };

    super(dispensary, mergedOptions);
  }

  // Phase 1: No overrides - use base implementation
  // Future phases can add overrides here:
  //
  // async crawlProducts(): Promise<CrawlResult> {
  //   // Custom pre-processing
  //   // ...
  //   const result = await super.crawlProducts();
  //   // Custom post-processing
  //   // ...
  //   return result;
  // }
}

// ============================================================
// EXPORTED CRAWL FUNCTION
// ============================================================

/**
 * Main entry point for the orchestrator
 *
 * The orchestrator calls: mod.crawlProducts(dispensary, options)
 * This function creates a TrulieveScottsdaleCrawler and runs it.
 */
export async function crawlProducts(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {}
): Promise<CrawlResult> {
  console.log(`[TrulieveScottsdale] Using per-store crawler for ${dispensary.name}`);

  const crawler = new TrulieveScottsdaleCrawler(dispensary, options);
  return crawler.crawlProducts();
}

// ============================================================
// FACTORY FUNCTION (alternative API)
// ============================================================

/**
 * Create a crawler instance without running it
 * Useful for testing or when you need to configure before running
 */
export function createCrawler(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {}
): TrulieveScottsdaleCrawler {
  return new TrulieveScottsdaleCrawler(dispensary, options);
}
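The doc comment above pins the orchestrator contract to mod.crawlProducts(dispensary, options). A sketch of the dynamic dispatch that contract implies (the profile-key-to-path mapping here is hypothetical):

// Hypothetical orchestrator-side dispatch based on the documented contract:
// each per-store module must export crawlProducts(dispensary, options).
async function runStoreCrawl(profileKey: string, dispensary: Dispensary): Promise<CrawlResult> {
  const mod = await import(`./stores/dutchie/${profileKey}`); // path assumed
  return mod.crawlProducts(dispensary, {});
}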
backend/src/db/auto-migrate.ts (new file, 141 lines)
@@ -0,0 +1,141 @@
/**
 * Auto-Migration System
 *
 * Runs SQL migration files from the migrations/ folder automatically on server startup.
 * Uses a schema_migrations table to track which migrations have been applied.
 *
 * Safe to run multiple times - only applies new migrations.
 */

import { Pool } from 'pg';
import fs from 'fs';
import path from 'path';

const MIGRATIONS_DIR = path.join(__dirname, '../../migrations');

/**
 * Ensure schema_migrations table exists
 */
async function ensureMigrationsTable(pool: Pool): Promise<void> {
  await pool.query(`
    CREATE TABLE IF NOT EXISTS schema_migrations (
      id SERIAL PRIMARY KEY,
      name VARCHAR(255) UNIQUE NOT NULL,
      applied_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
    )
  `);
}

/**
 * Get list of already-applied migrations
 */
async function getAppliedMigrations(pool: Pool): Promise<Set<string>> {
  const result = await pool.query('SELECT name FROM schema_migrations');
  return new Set(result.rows.map(row => row.name));
}

/**
 * Get list of migration files from disk
 */
function getMigrationFiles(): string[] {
  if (!fs.existsSync(MIGRATIONS_DIR)) {
    console.log('[AutoMigrate] No migrations directory found');
    return [];
  }

  return fs.readdirSync(MIGRATIONS_DIR)
    .filter(f => f.endsWith('.sql'))
    .sort(); // Sort alphabetically (001_, 002_, etc.)
}

/**
 * Run a single migration file
 */
async function runMigration(pool: Pool, filename: string): Promise<void> {
  const filepath = path.join(MIGRATIONS_DIR, filename);
  const sql = fs.readFileSync(filepath, 'utf8');

  const client = await pool.connect();
  try {
    await client.query('BEGIN');

    // Run the migration SQL
    await client.query(sql);

    // Record that this migration was applied
    await client.query(
      'INSERT INTO schema_migrations (name) VALUES ($1) ON CONFLICT (name) DO NOTHING',
      [filename]
    );

    await client.query('COMMIT');
    console.log(`[AutoMigrate] ✓ Applied: ${filename}`);
  } catch (error: any) {
    await client.query('ROLLBACK');
    console.error(`[AutoMigrate] ✗ Failed: ${filename}`);
    throw error;
  } finally {
    client.release();
  }
}

/**
 * Run all pending migrations
 *
 * @param pool - Database connection pool
 * @returns Number of migrations applied
 */
export async function runAutoMigrations(pool: Pool): Promise<number> {
  console.log('[AutoMigrate] Checking for pending migrations...');

  try {
    // Ensure migrations table exists
    await ensureMigrationsTable(pool);

    // Get applied and available migrations
    const applied = await getAppliedMigrations(pool);
    const available = getMigrationFiles();

    // Find pending migrations
    const pending = available.filter(f => !applied.has(f));

    if (pending.length === 0) {
      console.log('[AutoMigrate] No pending migrations');
      return 0;
    }

    console.log(`[AutoMigrate] Found ${pending.length} pending migrations`);

    // Run each pending migration in order
    for (const filename of pending) {
      await runMigration(pool, filename);
    }

    console.log(`[AutoMigrate] Successfully applied ${pending.length} migrations`);
    return pending.length;

  } catch (error: any) {
    console.error('[AutoMigrate] Migration failed:', error.message);
    // Don't crash the server - log and continue
    // The specific failing migration will have been rolled back
    return -1;
  }
}

/**
 * Check migration status without running anything
 */
export async function checkMigrationStatus(pool: Pool): Promise<{
  applied: string[];
  pending: string[];
}> {
  await ensureMigrationsTable(pool);

  const applied = await getAppliedMigrations(pool);
  const available = getMigrationFiles();

  return {
    applied: available.filter(f => applied.has(f)),
    pending: available.filter(f => !applied.has(f)),
  };
}
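A sketch of how this module would be wired into server startup (the bootstrap function and import path are assumptions, not code from the PR):

// Hypothetical startup wiring: run pending migrations before accepting traffic.
import { Pool } from 'pg';
import { runAutoMigrations } from './db/auto-migrate';

async function bootstrap(pool: Pool) {
  // 0+ = migrations applied; -1 = failure, logged but non-fatal by design
  const applied = await runAutoMigrations(pool);
  console.log(`[Startup] Auto-migrate result: ${applied}`);
  // ... start the HTTP server here
}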
@@ -372,6 +372,51 @@ async function runMigrations() {
      ON CONFLICT (key) DO NOTHING;
    `);

    // SEO Pages table
    await client.query(`
      CREATE TABLE IF NOT EXISTS seo_pages (
        id SERIAL PRIMARY KEY,
        type VARCHAR(50) NOT NULL,
        slug VARCHAR(255) NOT NULL UNIQUE,
        page_key VARCHAR(255) NOT NULL,
        primary_keyword VARCHAR(255),
        status VARCHAR(50) DEFAULT 'pending_generation',
        data_source VARCHAR(100),
        meta_title VARCHAR(255),
        meta_description TEXT,
        last_generated_at TIMESTAMPTZ,
        last_reviewed_at TIMESTAMPTZ,
        created_at TIMESTAMPTZ DEFAULT NOW(),
        updated_at TIMESTAMPTZ DEFAULT NOW()
      );
      CREATE INDEX IF NOT EXISTS idx_seo_pages_type ON seo_pages(type);
      CREATE INDEX IF NOT EXISTS idx_seo_pages_status ON seo_pages(status);
      CREATE INDEX IF NOT EXISTS idx_seo_pages_slug ON seo_pages(slug);
    `);

    // SEO Page Contents table
    await client.query(`
      CREATE TABLE IF NOT EXISTS seo_page_contents (
        id SERIAL PRIMARY KEY,
        page_id INTEGER NOT NULL REFERENCES seo_pages(id) ON DELETE CASCADE,
        version INTEGER DEFAULT 1,
        blocks JSONB NOT NULL DEFAULT '[]',
        meta JSONB NOT NULL DEFAULT '{}',
        meta_title VARCHAR(255),
        meta_description TEXT,
        h1 VARCHAR(255),
        canonical_url TEXT,
        og_title VARCHAR(255),
        og_description TEXT,
        og_image_url TEXT,
        generated_by VARCHAR(50) DEFAULT 'claude',
        created_at TIMESTAMPTZ DEFAULT NOW(),
        updated_at TIMESTAMPTZ DEFAULT NOW(),
        UNIQUE(page_id, version)
      );
      CREATE INDEX IF NOT EXISTS idx_seo_page_contents_page ON seo_page_contents(page_id);
    `);

    await client.query('COMMIT');
    console.log('✅ Migrations completed successfully');
  } catch (error) {
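Since seo_page_contents is versioned per page via UNIQUE(page_id, version), reading the newest revision for a slug could look like this (a sketch against the tables above, not code from the PR; the slug value is made up):

// Hypothetical read of the newest content revision for a given slug.
const { rows } = await pool.query(
  `SELECT c.*
     FROM seo_pages p
     JOIN seo_page_contents c ON c.page_id = p.id
    WHERE p.slug = $1
    ORDER BY c.version DESC
    LIMIT 1`,
  ['dispensaries-phoenix-az']
);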
@@ -77,7 +77,9 @@ export function getPool(): Pool {
 * This is a getter that lazily initializes on first access.
 */
export const pool = {
  query: (...args: Parameters<Pool['query']>) => getPool().query(...args),
  query: (queryTextOrConfig: string | import('pg').QueryConfig, values?: any[]): Promise<import('pg').QueryResult<any>> => {
    return getPool().query(queryTextOrConfig as any, values);
  },
  connect: () => getPool().connect(),
  end: () => getPool().end(),
  on: (event: 'error' | 'connect' | 'acquire' | 'remove' | 'release', listener: (...args: any[]) => void) => getPool().on(event as any, listener),
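TypeScript's Parameters<> utility only captures the last overload of pg's heavily overloaded query method, which is presumably why the spread-based signature was replaced with an explicit (text-or-config, values) shape. Callers keep the usual forms:

// Both call shapes the wrapper now types explicitly (token value is made up):
const byText = await pool.query('SELECT 1');
const byParams = await pool.query('SELECT * FROM api_tokens WHERE token = $1', ['abc123']);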
backend/src/db/run-migrations.ts (new file, 200 lines)
@@ -0,0 +1,200 @@
#!/usr/bin/env npx tsx
/**
 * Database Migration Runner
 *
 * Runs SQL migrations from backend/migrations/*.sql in order.
 * Tracks applied migrations in schema_migrations table.
 *
 * Usage:
 *   npx tsx src/db/run-migrations.ts
 *
 * Environment:
 *   DATABASE_URL or CANNAIQ_DB_* variables
 */

import { Pool } from 'pg';
import * as fs from 'fs/promises';
import * as path from 'path';
import dotenv from 'dotenv';

dotenv.config();

function getConnectionString(): string {
  if (process.env.DATABASE_URL) {
    return process.env.DATABASE_URL;
  }
  if (process.env.CANNAIQ_DB_URL) {
    return process.env.CANNAIQ_DB_URL;
  }

  const host = process.env.CANNAIQ_DB_HOST || 'localhost';
  const port = process.env.CANNAIQ_DB_PORT || '54320';
  const name = process.env.CANNAIQ_DB_NAME || 'dutchie_menus';
  const user = process.env.CANNAIQ_DB_USER || 'dutchie';
  const pass = process.env.CANNAIQ_DB_PASS || 'dutchie_local_pass';

  return `postgresql://${user}:${pass}@${host}:${port}/${name}`;
}
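Resolution order: DATABASE_URL wins, then CANNAIQ_DB_URL, then the individual CANNAIQ_DB_* parts with the local-dev defaults shown above, so with nothing set the runner connects to postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus.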
interface MigrationFile {
  filename: string;
  number: number;
  path: string;
}

async function getMigrationFiles(migrationsDir: string): Promise<MigrationFile[]> {
  const files = await fs.readdir(migrationsDir);

  const migrations: MigrationFile[] = files
    .filter(f => f.endsWith('.sql'))
    .map(filename => {
      // Extract number from filename like "005_api_tokens.sql" or "073_proxy_timezone.sql"
      const match = filename.match(/^(\d+)_/);
      if (!match) return null;

      return {
        filename,
        number: parseInt(match[1], 10),
        path: path.join(migrationsDir, filename),
      };
    })
    .filter((m): m is MigrationFile => m !== null)
    .sort((a, b) => a.number - b.number);

  return migrations;
}

async function ensureMigrationsTable(pool: Pool): Promise<void> {
  // Migrate to filename-based tracking (handles duplicate version numbers)
  // Check if old version-based PK exists
  const pkCheck = await pool.query(`
    SELECT constraint_name FROM information_schema.table_constraints
    WHERE table_name = 'schema_migrations' AND constraint_type = 'PRIMARY KEY'
  `);

  if (pkCheck.rows.length === 0) {
    // Table doesn't exist, create with filename as PK
    await pool.query(`
      CREATE TABLE IF NOT EXISTS schema_migrations (
        filename VARCHAR(255) NOT NULL PRIMARY KEY,
        version VARCHAR(10),
        name VARCHAR(255),
        applied_at TIMESTAMPTZ DEFAULT NOW()
      )
    `);
  } else {
    // Table exists - add filename column if missing
    await pool.query(`
      ALTER TABLE schema_migrations ADD COLUMN IF NOT EXISTS filename VARCHAR(255)
    `);
    // Populate filename from version+name for existing rows
    await pool.query(`
      UPDATE schema_migrations SET filename = version || '_' || name || '.sql'
      WHERE filename IS NULL
    `);
  }
}

async function getAppliedMigrations(pool: Pool): Promise<Set<string>> {
  // Try filename first, fall back to version_name combo
  const result = await pool.query(`
    SELECT COALESCE(filename, version || '_' || name || '.sql') as filename
    FROM schema_migrations
  `);
  return new Set(result.rows.map(r => r.filename));
}

async function applyMigration(pool: Pool, migration: MigrationFile): Promise<void> {
  const sql = await fs.readFile(migration.path, 'utf-8');

  // Extract version and name from filename like "005_api_tokens.sql"
  const version = String(migration.number).padStart(3, '0');
  const name = migration.filename.replace(/^\d+_/, '').replace(/\.sql$/, '');

  const client = await pool.connect();
  try {
    await client.query('BEGIN');

    // Run the migration SQL
    await client.query(sql);

    // Record that it was applied - use INSERT with ON CONFLICT for safety
    await client.query(`
      INSERT INTO schema_migrations (filename, version, name)
      VALUES ($1, $2, $3)
      ON CONFLICT DO NOTHING
    `, [migration.filename, version, name]);

    await client.query('COMMIT');
  } catch (error) {
    await client.query('ROLLBACK');
    throw error;
  } finally {
    client.release();
  }
}

async function main() {
  const pool = new Pool({ connectionString: getConnectionString() });

  // Migrations directory relative to this file
|
||||
const migrationsDir = path.resolve(__dirname, '../../migrations');
|
||||
|
||||
console.log('╔════════════════════════════════════════════════════════════╗');
|
||||
console.log('║ DATABASE MIGRATION RUNNER ║');
|
||||
console.log('╚════════════════════════════════════════════════════════════╝');
|
||||
console.log(`Migrations dir: ${migrationsDir}`);
|
||||
console.log('');
|
||||
|
||||
try {
|
||||
// Ensure tracking table exists
|
||||
await ensureMigrationsTable(pool);
|
||||
|
||||
// Get all migration files
|
||||
const allMigrations = await getMigrationFiles(migrationsDir);
|
||||
console.log(`Found ${allMigrations.length} migration files`);
|
||||
|
||||
// Get already-applied migrations
|
||||
const applied = await getAppliedMigrations(pool);
|
||||
console.log(`Already applied: ${applied.size} migrations`);
|
||||
console.log('');
|
||||
|
||||
// Find pending migrations (compare by filename)
|
||||
const pending = allMigrations.filter(m => !applied.has(m.filename));
|
||||
|
||||
if (pending.length === 0) {
|
||||
      console.log('✅ No pending migrations. Database is up to date.');
      return; // pool.end() runs in the finally block below; calling it twice would throw
    }

    console.log(`Pending migrations: ${pending.length}`);
    console.log('─'.repeat(60));

    // Apply each pending migration
    for (const migration of pending) {
      process.stdout.write(`  ${migration.filename}... `);
      try {
        await applyMigration(pool, migration);
        console.log('✅');
      } catch (error: any) {
        console.log('❌');
        console.error(`\nError applying ${migration.filename}:`);
        console.error(error.message);
        process.exit(1);
      }
    }

    console.log('');
    console.log('═'.repeat(60));
    console.log(`✅ Applied ${pending.length} migrations successfully`);

  } catch (error: any) {
    console.error('Migration runner failed:', error.message);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();
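For context: the runner orders migrations purely by the numeric filename prefix and ignores everything else. A standalone check of that convention (sketch mirroring the regex used above):

// Only NNN_*.sql files count, sorted by numeric prefix.
const names = ['010_seo_pages.sql', 'notes.txt', '005_api_tokens.sql', '001_init.sql'];
const ordered = names
  .map(f => ({ f, m: f.match(/^(\d+)_/) }))
  .filter((x): x is { f: string; m: RegExpMatchArray } => x.m !== null)
  .sort((a, b) => parseInt(a.m[1], 10) - parseInt(b.m[1], 10))
  .map(x => x.f);
console.log(ordered); // ['001_init.sql', '005_api_tokens.sql', '010_seo_pages.sql']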
@@ -3,14 +3,23 @@
 *
 * Main orchestrator for the Dutchie store discovery pipeline.
 *
 * Flow:
 * 1. Discover cities from Dutchie (or use seeded cities)
 * 2. For each city, discover store locations
 * 3. Upsert all data to discovery tables
 * 4. Admin verifies locations manually
 * 5. Verified locations are promoted to canonical dispensaries
 * AUTOMATED FLOW (as of 2025-01):
 * 1. Fetch cities dynamically from Dutchie GraphQL (getAllCitiesByState)
 * 2. For each city, discover store locations via ConsumerDispensaries query
 * 3. Upsert locations to dutchie_discovery_locations (keyed by platform_location_id)
 * 4. AUTO-VALIDATE: Check required fields (name, city, state, platform_menu_url, platform_location_id)
 * 5. AUTO-PROMOTE: Valid locations are upserted to dispensaries table with crawl_enabled=true
 * 6. All actions logged to dutchie_promotion_log for audit
 *
 * This module does NOT create canonical dispensaries automatically.
 * Tables involved:
 * - dutchie_discovery_cities: Known cities for each state
 * - dutchie_discovery_locations: Raw discovered store data
 * - dispensaries: Canonical store records (promoted from discovery)
 * - dutchie_promotion_log: Audit trail for validation/promotion
 *
 * Usage:
 *   npx tsx src/scripts/run-discovery.ts discover:state AZ
 *   npx tsx src/scripts/run-discovery.ts discover:state CA
 */

import { Pool } from 'pg';

@@ -24,11 +33,12 @@ import {
  getCitiesToCrawl,
  getCityBySlug,
  seedKnownCities,
  ARIZONA_CITIES,
} from './city-discovery';
import {
  discoverLocationsForCity,
  getCitiesForState,
} from './location-discovery';
import { promoteDiscoveredLocations } from './promotion';

// ============================================================
// FULL DISCOVERY
@@ -162,12 +172,161 @@ export async function runFullDiscovery(
    console.log(`Errors: ${totalErrors}`);
  }

  // Per TASK_WORKFLOW_2024-12-10.md: Track new dispensary IDs for task chaining
  let newDispensaryIds: number[] = [];

  // Step 4: Auto-validate and promote discovered locations
  if (!dryRun && totalLocationsUpserted > 0) {
    console.log('\n[Discovery] Step 4: Auto-promoting discovered locations...');
    const promotionResult = await promoteDiscoveredLocations(stateCode, false);
    console.log(`[Discovery] Promotion complete:`);
    console.log(`  Created: ${promotionResult.created} new dispensaries`);
    console.log(`  Updated: ${promotionResult.updated} existing dispensaries`);
    console.log(`  Rejected: ${promotionResult.rejected} (validation failed)`);

    // Per TASK_WORKFLOW_2024-12-10.md: Capture new IDs for task chaining
    newDispensaryIds = promotionResult.newDispensaryIds;
    if (newDispensaryIds.length > 0) {
      console.log(`  New store IDs for crawl: [${newDispensaryIds.join(', ')}]`);
    }

    if (promotionResult.rejectedRecords.length > 0) {
      console.log(`  Rejection reasons:`);
      promotionResult.rejectedRecords.slice(0, 5).forEach(r => {
        console.log(`    - ${r.name}: ${r.errors.join(', ')}`);
      });
      if (promotionResult.rejectedRecords.length > 5) {
        console.log(`    ... and ${promotionResult.rejectedRecords.length - 5} more`);
      }
    }
  }

  // Step 5: Detect dropped stores (in DB but not in discovery results)
  if (!dryRun) {
    console.log('\n[Discovery] Step 5: Detecting dropped stores...');
    const droppedResult = await detectDroppedStores(pool, stateCode);
    if (droppedResult.droppedCount > 0) {
      console.log(`[Discovery] Found ${droppedResult.droppedCount} dropped stores:`);
      droppedResult.droppedStores.slice(0, 10).forEach(s => {
        console.log(`  - ${s.name} (${s.city}, ${s.state}) - last seen: ${s.lastSeenAt}`);
      });
      if (droppedResult.droppedCount > 10) {
        console.log(`  ... and ${droppedResult.droppedCount - 10} more`);
      }
    } else {
      console.log(`[Discovery] No dropped stores detected`);
    }
  }

  return {
    cities: cityResult,
    locations: locationResults,
    totalLocationsFound,
    totalLocationsUpserted,
    durationMs,
    // Per TASK_WORKFLOW_2024-12-10.md: Return new IDs for task chaining
    newDispensaryIds,
  };
}

// ============================================================
// DROPPED STORE DETECTION
// ============================================================

export interface DroppedStoreResult {
  droppedCount: number;
  droppedStores: Array<{
    id: number;
    name: string;
    city: string;
    state: string;
    platformDispensaryId: string;
    lastSeenAt: string;
  }>;
}

/**
 * Detect stores that exist in dispensaries but were not found in discovery.
 * Marks them as status='dropped' for manual review.
 *
 * A store is considered "dropped" if:
 * 1. It has a platform_dispensary_id (was verified via Dutchie)
 * 2. It was NOT seen in the latest discovery crawl (last_seen_at in discovery < 24h ago)
 * 3. It's currently marked as 'open' status
 */
export async function detectDroppedStores(
  pool: Pool,
  stateCode?: string
): Promise<DroppedStoreResult> {
  // Find dispensaries that:
  // 1. Have platform_dispensary_id (verified Dutchie stores)
  // 2. Are currently 'open' status
  // 3. Have a linked discovery record that wasn't seen in the last discovery run
  //    (last_seen_at in dutchie_discovery_locations is older than 24 hours)
  const params: any[] = [];
  let stateFilter = '';

  if (stateCode) {
    stateFilter = ` AND d.state = $1`;
    params.push(stateCode);
  }

  const query = `
    WITH recently_seen AS (
      SELECT DISTINCT platform_location_id
      FROM dutchie_discovery_locations
      WHERE last_seen_at > NOW() - INTERVAL '24 hours'
        AND active = true
    )
    SELECT
      d.id,
      d.name,
      d.city,
      d.state,
      d.platform_dispensary_id,
      d.updated_at as last_seen_at
    FROM dispensaries d
    WHERE d.platform_dispensary_id IS NOT NULL
      AND d.platform = 'dutchie'
      AND (d.status = 'open' OR d.status IS NULL)
      AND d.crawl_enabled = true
      AND d.platform_dispensary_id NOT IN (SELECT platform_location_id FROM recently_seen)
      ${stateFilter}
    ORDER BY d.name
  `;

  const result = await pool.query(query, params);
  const droppedStores = result.rows;

  // Mark these stores as 'dropped' status
  if (droppedStores.length > 0) {
    const ids = droppedStores.map(s => s.id);
    await pool.query(`
      UPDATE dispensaries
      SET status = 'dropped', updated_at = NOW()
      WHERE id = ANY($1::int[])
    `, [ids]);

    // Log to promotion log for audit
    for (const store of droppedStores) {
      await pool.query(`
        INSERT INTO dutchie_promotion_log
          (dispensary_id, action, state_code, store_name, triggered_by)
        VALUES ($1, 'dropped', $2, $3, 'discovery_detection')
      `, [store.id, store.state, store.name]);
    }
  }

  return {
    droppedCount: droppedStores.length,
    droppedStores: droppedStores.map(s => ({
      id: s.id,
      name: s.name,
      city: s.city,
      state: s.state,
      platformDispensaryId: s.platform_dispensary_id,
      lastSeenAt: s.last_seen_at,
    })),
  };
}
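For context: nothing in this hunk moves a store out of 'dropped' again; if a store reappears in discovery, reinstatement would be a manual or follow-up step, roughly (hypothetical, not in this changeset):

// Hypothetical reinstatement for a store that reappears in discovery
async function reinstateStore(pool: Pool, dispensaryId: number): Promise<void> {
  await pool.query(
    `UPDATE dispensaries
     SET status = 'open', updated_at = NOW()
     WHERE id = $1 AND status = 'dropped'`,
    [dispensaryId]
  );
}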
@@ -235,11 +394,19 @@ export async function discoverState(

  console.log(`[Discovery] Discovering state: ${stateCode}`);

  // Seed known cities for this state
  if (stateCode === 'AZ') {
    console.log('[Discovery] Seeding Arizona cities...');
    const seeded = await seedKnownCities(pool, ARIZONA_CITIES);
    console.log(`[Discovery] Seeded ${seeded.created} new cities, ${seeded.updated} updated`);
  // Dynamically fetch and seed cities for this state
  console.log(`[Discovery] Fetching cities for ${stateCode} from Dutchie...`);
  const cityNames = await getCitiesForState(stateCode);
  if (cityNames.length > 0) {
    const cities = cityNames.map(name => ({
      name,
      slug: name.toLowerCase().replace(/\s+/g, '-').replace(/[^a-z0-9-]/g, ''),
      stateCode,
    }));
    const seeded = await seedKnownCities(pool, cities);
    console.log(`[Discovery] Seeded ${seeded.created} new cities, ${seeded.updated} updated for ${stateCode}`);
  } else {
    console.log(`[Discovery] No cities found for ${stateCode}`);
  }

  // Run full discovery for this state

@@ -13,7 +13,6 @@ export {
  getCitiesToCrawl,
  getCityBySlug,
  seedKnownCities,
  ARIZONA_CITIES,
} from './city-discovery';

// Location Discovery

@@ -33,5 +32,17 @@ export {
  DiscoveryStats,
} from './discovery-crawler';

// Promotion
export {
  validateForPromotion,
  validateDiscoveredLocations,
  promoteDiscoveredLocations,
  promoteSingleLocation,
  ValidationResult,
  ValidationSummary,
  PromotionResult,
  PromotionSummary,
} from './promotion';

// Routes
export { createDiscoveryRoutes } from './routes';
@@ -26,13 +26,346 @@ import {
  mapLocationRowToLocation,
} from './types';
import { DiscoveryCity } from './types';
import {
  executeGraphQL,
  fetchPage,
  extractNextData,
  GRAPHQL_HASHES,
  setProxy,
} from '../platforms/dutchie/client';
import { getStateProxy, getRandomProxy } from '../utils/proxyManager';

puppeteer.use(StealthPlugin());

// ============================================================
// PROXY INITIALIZATION
// ============================================================
// Call initDiscoveryProxy() before any discovery operations to
// set up a proxy when the USE_PROXY=true environment variable is set.
// This is opt-in and does NOT break existing behavior.
// ============================================================

let proxyInitialized = false;

/**
 * Initialize the proxy for discovery operations.
 * Only runs if USE_PROXY=true is set in the environment.
 * Safe to call multiple times - only initializes once.
 *
 * @param stateCode - Optional state code for state-specific proxy (e.g., 'AZ', 'CA')
 * @returns true if proxy was set, false if skipped or failed
 */
export async function initDiscoveryProxy(stateCode?: string): Promise<boolean> {
  // Skip if already initialized
  if (proxyInitialized) {
    return true;
  }

  // Skip if USE_PROXY is not enabled
  if (process.env.USE_PROXY !== 'true') {
    console.log('[LocationDiscovery] Proxy disabled (USE_PROXY != true)');
    return false;
  }

  try {
    // Get proxy - prefer state-specific if state code provided
    const proxyConfig = stateCode
      ? await getStateProxy(stateCode)
      : await getRandomProxy();

    if (!proxyConfig) {
      console.warn('[LocationDiscovery] No proxy available, proceeding without proxy');
      return false;
    }

    // Build proxy URL with auth if needed
    let proxyUrl = proxyConfig.server;
    if (proxyConfig.username && proxyConfig.password) {
      const url = new URL(proxyConfig.server);
      url.username = proxyConfig.username;
      url.password = proxyConfig.password;
      proxyUrl = url.toString();
    }

    // Set proxy on the Dutchie client
    setProxy(proxyUrl);
    proxyInitialized = true;

    console.log(`[LocationDiscovery] Proxy initialized for ${stateCode || 'general'} discovery`);
    return true;
  } catch (error: any) {
    console.error(`[LocationDiscovery] Failed to initialize proxy: ${error.message}`);
    return false;
  }
}

/**
 * Reset proxy initialization flag (for testing or re-initialization)
 */
export function resetProxyInit(): void {
  proxyInitialized = false;
  setProxy(null);
}
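For context: a minimal opt-in use of the proxy hook above (sketch; assumes proxyManager is configured with at least one proxy):

async function exampleProxiedDiscovery(): Promise<void> {
  process.env.USE_PROXY = 'true';                 // opt in explicitly
  const proxied = await initDiscoveryProxy('AZ'); // prefers an AZ-specific proxy
  console.log(proxied ? 'requests will go through the proxy' : 'running direct');
}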
const PLATFORM = 'dutchie';

// ============================================================
// GRAPHQL / API FETCHING
// CITY-BASED DISCOVERY (CANONICAL SOURCE OF TRUTH)
// ============================================================
// GraphQL with city+state filter is the SOURCE OF TRUTH for database data.
//
// Method:
//   1. Get city list from statesWithDispensaries (in __NEXT_DATA__)
//   2. Query stores per city using city + state GraphQL filter
//   3. This gives us complete, accurate dispensary data
//
// Geo-coordinate queries (nearLat/nearLng) are ONLY for showing search
// results to users (e.g., "stores within 20 miles of me").
// They are NOT a source of truth for establishing database records.
// ============================================================

/**
 * State with dispensary cities from Dutchie's statesWithDispensaries data
 */
export interface StateWithCities {
  name: string;      // State code (e.g., "CA", "AZ")
  country: string;   // Country code (e.g., "US")
  cities: string[];  // Array of city names
}

/**
 * Fetch all states with their cities via direct GraphQL query
 *
 * Uses the getAllCitiesByState persisted query which returns all states
 * and cities where Dutchie has dispensaries.
 */
export async function fetchStatesWithDispensaries(
  options: { verbose?: boolean } = {}
): Promise<StateWithCities[]> {
  const { verbose = false } = options;

  // Initialize proxy if USE_PROXY=true
  await initDiscoveryProxy();

  console.log('[LocationDiscovery] Fetching statesWithDispensaries via GraphQL...');

  try {
    // Use direct GraphQL query - much cleaner than scraping __NEXT_DATA__
    const result = await executeGraphQL(
      'getAllCitiesByState',
      {}, // No variables needed
      GRAPHQL_HASHES.GetAllCitiesByState,
      { maxRetries: 3, retryOn403: true }
    );

    const statesData = result?.data?.statesWithDispensaries;
    if (!Array.isArray(statesData)) {
      console.error('[LocationDiscovery] statesWithDispensaries not found in response');
      return [];
    }

    // Map to our StateWithCities format
    const states: StateWithCities[] = [];
    for (const state of statesData) {
      if (state && state.name) {
        // Filter out null cities
        const cities = Array.isArray(state.cities)
          ? state.cities.filter((c: string | null) => c !== null)
          : [];

        states.push({
          name: state.name,
          country: state.country || 'US',
          cities,
        });
      }
    }

    if (verbose) {
      console.log(`[LocationDiscovery] Found ${states.length} states`);
      for (const state of states) {
        console.log(`  ${state.name}: ${state.cities.length} cities`);
      }
    }

    console.log(`[LocationDiscovery] Loaded ${states.length} states with cities`);
    return states;
  } catch (error: any) {
    console.error(`[LocationDiscovery] Failed to fetch states: ${error.message}`);
    return [];
  }
}

/**
 * Get cities for a specific state
 */
export async function getCitiesForState(
  stateCode: string,
  options: { verbose?: boolean } = {}
): Promise<string[]> {
  const states = await fetchStatesWithDispensaries(options);
  const state = states.find(s => s.name.toUpperCase() === stateCode.toUpperCase());

  if (!state) {
    console.warn(`[LocationDiscovery] No cities found for state: ${stateCode}`);
    return [];
  }

  console.log(`[LocationDiscovery] Found ${state.cities.length} cities for ${stateCode}`);
  return state.cities;
}

/**
 * Fetch dispensaries for a specific city+state using GraphQL
 *
 * This is the CORRECT method for establishing database data:
 * Uses city + state filter, NOT geo-coordinates.
 */
export async function fetchDispensariesByCityState(
  city: string,
  stateCode: string,
  options: { verbose?: boolean; perPage?: number; maxPages?: number } = {}
): Promise<DutchieLocationResponse[]> {
  const { verbose = false, perPage = 200, maxPages = 10 } = options;

  // Initialize proxy if USE_PROXY=true (state-specific proxy preferred)
  await initDiscoveryProxy(stateCode);

  console.log(`[LocationDiscovery] Fetching dispensaries for ${city}, ${stateCode}...`);

  const allDispensaries: any[] = [];
  let page = 0;
  let hasMore = true;

  while (hasMore && page < maxPages) {
    const variables = {
      dispensaryFilter: {
        activeOnly: true,
        city: city,
        state: stateCode,
      },
      page,
      perPage,
    };

    try {
      const result = await executeGraphQL(
        'ConsumerDispensaries',
        variables,
        GRAPHQL_HASHES.ConsumerDispensaries,
        { cName: `${city.toLowerCase().replace(/\s+/g, '-')}-${stateCode.toLowerCase()}`, maxRetries: 2, retryOn403: true }
      );

      const dispensaries = result?.data?.filteredDispensaries || [];

      if (verbose) {
        console.log(`[LocationDiscovery] Page ${page}: ${dispensaries.length} dispensaries`);
      }

      if (dispensaries.length === 0) {
        hasMore = false;
      } else {
        // Filter to ensure we only get dispensaries in the correct state
        const stateFiltered = dispensaries.filter((d: any) =>
          d.location?.state?.toUpperCase() === stateCode.toUpperCase()
        );
        allDispensaries.push(...stateFiltered);

        if (dispensaries.length < perPage) {
          hasMore = false;
        } else {
          page++;
        }
      }
    } catch (error: any) {
      console.error(`[LocationDiscovery] Error fetching page ${page}: ${error.message}`);
      hasMore = false;
    }
  }

  // Dedupe by ID
  const uniqueMap = new Map<string, any>();
  for (const d of allDispensaries) {
    const id = d.id || d._id;
    if (id && !uniqueMap.has(id)) {
      uniqueMap.set(id, d);
    }
  }

  const unique = Array.from(uniqueMap.values());
  console.log(`[LocationDiscovery] Found ${unique.length} unique dispensaries in ${city}, ${stateCode}`);

  return unique.map(d => normalizeLocationResponse(d));
}

/**
 * Fetch ALL dispensaries for a state by querying each city
 *
 * This is the canonical method for establishing state data:
 * 1. Get city list from statesWithDispensaries
 * 2. Query each city using city+state filter
 * 3. Dedupe and return all dispensaries
 */
export async function fetchAllDispensariesForState(
  stateCode: string,
  options: { verbose?: boolean; progressCallback?: (city: string, count: number, total: number) => void } = {}
): Promise<{ dispensaries: DutchieLocationResponse[]; citiesQueried: number; citiesWithResults: number }> {
  const { verbose = false, progressCallback } = options;

  console.log(`[LocationDiscovery] Fetching all dispensaries for ${stateCode}...`);

  // Step 1: Get city list
  const cities = await getCitiesForState(stateCode, { verbose });
  if (cities.length === 0) {
    console.warn(`[LocationDiscovery] No cities found for ${stateCode}`);
    return { dispensaries: [], citiesQueried: 0, citiesWithResults: 0 };
  }

  console.log(`[LocationDiscovery] Will query ${cities.length} cities for ${stateCode}`);

  // Step 2: Query each city
  const allDispensaries = new Map<string, DutchieLocationResponse>();
  let citiesWithResults = 0;

  for (let i = 0; i < cities.length; i++) {
    const city = cities[i];

    if (progressCallback) {
      progressCallback(city, i + 1, cities.length);
    }

    try {
      const dispensaries = await fetchDispensariesByCityState(city, stateCode, { verbose });

      if (dispensaries.length > 0) {
        citiesWithResults++;
        for (const d of dispensaries) {
          const id = d.id || d.slug;
          if (id && !allDispensaries.has(id)) {
            allDispensaries.set(id, d);
          }
        }
      }

      // Small delay between cities to avoid rate limiting
      await new Promise(r => setTimeout(r, 300));
    } catch (error: any) {
      console.error(`[LocationDiscovery] Error querying ${city}: ${error.message}`);
    }
  }

  const result = Array.from(allDispensaries.values());
  console.log(`[LocationDiscovery] Total: ${result.length} unique dispensaries across ${citiesWithResults}/${cities.length} cities`);

  return {
    dispensaries: result,
    citiesQueried: cities.length,
    citiesWithResults,
  };
}

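For context: typical call shape for the state-wide fetch above (sketch):

async function exampleStatePull(): Promise<void> {
  const { dispensaries, citiesQueried, citiesWithResults } =
    await fetchAllDispensariesForState('AZ', {
      progressCallback: (city, i, total) => console.log(`[${i}/${total}] ${city}`),
    });
  console.log(`${dispensaries.length} stores from ${citiesWithResults}/${citiesQueried} cities`);
}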
// ============================================================
// GRAPHQL / API FETCHING (LEGACY - PUPPETEER-BASED)
// ============================================================

interface SessionCredentials {
@@ -91,57 +424,77 @@ async function closeSession(session: SessionCredentials): Promise<void> {
}

/**
 * Fetch locations for a city using Dutchie's internal search API.
 * Fetch locations for a city.
 *
 * PRIMARY METHOD: Uses city+state GraphQL filter (source of truth)
 * FALLBACK: Legacy Puppeteer-based methods for edge cases
 */
export async function fetchLocationsForCity(
  city: DiscoveryCity,
  options: {
    session?: SessionCredentials;
    verbose?: boolean;
    useLegacyMethods?: boolean;
  } = {}
): Promise<DutchieLocationResponse[]> {
  const { verbose = false } = options;
  let session = options.session;
  let shouldCloseSession = false;
  const { verbose = false, useLegacyMethods = false } = options;

  if (!session) {
    session = await createSession(city.citySlug);
    shouldCloseSession = true;
  }
  console.log(`[LocationDiscovery] Fetching locations for ${city.cityName}, ${city.stateCode}...`);

  try {
    console.log(`[LocationDiscovery] Fetching locations for ${city.cityName}, ${city.stateCode}...`);

    // Try multiple approaches to get location data

    // Approach 1: Extract from page __NEXT_DATA__ or similar
    const locations = await extractLocationsFromPage(session.page, verbose);
    if (locations.length > 0) {
      console.log(`[LocationDiscovery] Found ${locations.length} locations from page data`);
      return locations;
    }

    // Approach 2: Try the geo-based GraphQL query
    const geoLocations = await fetchLocationsViaGraphQL(session, city, verbose);
    if (geoLocations.length > 0) {
      console.log(`[LocationDiscovery] Found ${geoLocations.length} locations from GraphQL`);
      return geoLocations;
    }

    // Approach 3: Scrape visible location cards
    const scrapedLocations = await scrapeLocationCards(session.page, verbose);
    if (scrapedLocations.length > 0) {
      console.log(`[LocationDiscovery] Found ${scrapedLocations.length} locations from scraping`);
      return scrapedLocations;
    }

    console.log(`[LocationDiscovery] No locations found for ${city.cityName}`);
    return [];
  } finally {
    if (shouldCloseSession) {
      await closeSession(session);
  // PRIMARY METHOD: City+State GraphQL query (SOURCE OF TRUTH)
  if (city.cityName && city.stateCode) {
    try {
      const locations = await fetchDispensariesByCityState(city.cityName, city.stateCode, { verbose });
      if (locations.length > 0) {
        console.log(`[LocationDiscovery] Found ${locations.length} locations via GraphQL city+state`);
        return locations;
      }
    } catch (error: any) {
      console.warn(`[LocationDiscovery] GraphQL city+state failed: ${error.message}`);
    }
  }

  // FALLBACK: Legacy Puppeteer-based methods (only if explicitly enabled)
  if (useLegacyMethods) {
    let session = options.session;
    let shouldCloseSession = false;

    if (!session) {
      session = await createSession(city.citySlug);
      shouldCloseSession = true;
    }

    try {
      // Legacy Approach 1: Extract from page __NEXT_DATA__
      const locations = await extractLocationsFromPage(session.page, verbose);
      if (locations.length > 0) {
        console.log(`[LocationDiscovery] Found ${locations.length} locations from page data (legacy)`);
        return locations;
      }

      // Legacy Approach 2: Try the geo-based GraphQL query
      // NOTE: Geo queries are for SEARCH RESULTS only, not source of truth
      const geoLocations = await fetchLocationsViaGraphQL(session, city, verbose);
      if (geoLocations.length > 0) {
        console.log(`[LocationDiscovery] Found ${geoLocations.length} locations from geo GraphQL (legacy)`);
        return geoLocations;
      }

      // Legacy Approach 3: Scrape visible location cards
      const scrapedLocations = await scrapeLocationCards(session.page, verbose);
      if (scrapedLocations.length > 0) {
        console.log(`[LocationDiscovery] Found ${scrapedLocations.length} locations from scraping (legacy)`);
        return scrapedLocations;
      }
    } finally {
      if (shouldCloseSession) {
        await closeSession(session);
      }
    }
  }

  console.log(`[LocationDiscovery] No locations found for ${city.cityName}`);
  return [];
}

/**
@@ -202,33 +555,52 @@ async function extractLocationsFromPage(

/**
 * Fetch locations via GraphQL geo-based query.
 *
 * Uses ConsumerDispensaries with geo filtering:
 * - dispensaryFilter.nearLat/nearLng for center point
 * - dispensaryFilter.distance for radius in miles
 * - Response at data.filteredDispensaries
 */
async function fetchLocationsViaGraphQL(
  session: SessionCredentials,
  city: DiscoveryCity,
  verbose: boolean
): Promise<DutchieLocationResponse[]> {
  // Use a known center point for the city or default to a central US location
  const CITY_COORDS: Record<string, { lat: number; lng: number }> = {
    'phoenix': { lat: 33.4484, lng: -112.074 },
    'tucson': { lat: 32.2226, lng: -110.9747 },
    'scottsdale': { lat: 33.4942, lng: -111.9261 },
    'mesa': { lat: 33.4152, lng: -111.8315 },
    'tempe': { lat: 33.4255, lng: -111.94 },
    'flagstaff': { lat: 35.1983, lng: -111.6513 },
    // Add more as needed
  // City center coordinates with appropriate radius
  const CITY_COORDS: Record<string, { lat: number; lng: number; radius: number }> = {
    'phoenix': { lat: 33.4484, lng: -112.074, radius: 50 },
    'tucson': { lat: 32.2226, lng: -110.9747, radius: 50 },
    'scottsdale': { lat: 33.4942, lng: -111.9261, radius: 30 },
    'mesa': { lat: 33.4152, lng: -111.8315, radius: 30 },
    'tempe': { lat: 33.4255, lng: -111.94, radius: 30 },
    'flagstaff': { lat: 35.1983, lng: -111.6513, radius: 50 },
  };

  const coords = CITY_COORDS[city.citySlug] || { lat: 33.4484, lng: -112.074 };
  // State-wide coordinates for full coverage
  const STATE_COORDS: Record<string, { lat: number; lng: number; radius: number }> = {
    'AZ': { lat: 33.4484, lng: -112.074, radius: 200 },
    'CA': { lat: 36.7783, lng: -119.4179, radius: 400 },
    'CO': { lat: 39.5501, lng: -105.7821, radius: 200 },
    'FL': { lat: 27.6648, lng: -81.5158, radius: 400 },
    'MI': { lat: 44.3148, lng: -85.6024, radius: 250 },
    'NV': { lat: 36.1699, lng: -115.1398, radius: 200 },
  };

  // Try city-specific coords first, then state-wide, then default
  const coords = CITY_COORDS[city.citySlug]
    || (city.stateCode && STATE_COORDS[city.stateCode])
    || { lat: 33.4484, lng: -112.074, radius: 200 };

  // Correct GraphQL variables for ConsumerDispensaries
  const variables = {
    dispensariesFilter: {
      latitude: coords.lat,
      longitude: coords.lng,
      distance: 50, // miles
      state: city.stateCode,
      city: city.cityName,
    dispensaryFilter: {
      activeOnly: true,
      nearLat: coords.lat,
      nearLng: coords.lng,
      distance: coords.radius,
    },
    page: 0,
    perPage: 200,
  };

  const hash = '0a5bfa6ca1d64ae47bcccb7c8077c87147cbc4e6982c17ceec97a2a4948b311b';

@@ -263,8 +635,19 @@
    return [];
  }

  const dispensaries = response.data?.data?.consumerDispensaries || [];
  return dispensaries.map((d: any) => normalizeLocationResponse(d));
  // Response is at data.filteredDispensaries
  const dispensaries = response.data?.data?.filteredDispensaries || [];

  // Filter to specific state if needed (radius may include neighboring states)
  const filtered = city.stateCode
    ? dispensaries.filter((d: any) => d.location?.state === city.stateCode)
    : dispensaries;

  if (verbose) {
    console.log(`[LocationDiscovery] GraphQL returned ${dispensaries.length} total, ${filtered.length} in ${city.stateCode || 'all states'}`);
  }

  return filtered.map((d: any) => normalizeLocationResponse(d));
  } catch (error: any) {
    if (verbose) {
      console.log(`[LocationDiscovery] GraphQL error: ${error.message}`);

@@ -337,31 +720,57 @@ async function scrapeLocationCards(

/**
 * Normalize a raw location response to a consistent format.
 * Maps Dutchie camelCase fields to our snake_case equivalents.
 */
function normalizeLocationResponse(raw: any): DutchieLocationResponse {
  const slug = raw.slug || raw.cName || raw.urlSlug || '';
  const id = raw.id || raw._id || raw.dispensaryId || '';

  // Extract location data - GraphQL response nests address info in .location
  const loc = raw.location || {};

  // Extract coordinates from geometry.coordinates [longitude, latitude]
  const coords = loc.geometry?.coordinates || [];
  const longitude = coords[0] || raw.longitude || raw.lng || loc.longitude || loc.lng;
  const latitude = coords[1] || raw.latitude || raw.lat || loc.latitude || loc.lat;

  return {
    id,
    name: raw.name || raw.dispensaryName || '',
    slug,
    address: raw.address || raw.fullAddress || '',
    address1: raw.address1 || raw.addressLine1 || raw.streetAddress || '',
    address2: raw.address2 || raw.addressLine2 || '',
    city: raw.city || '',
    state: raw.state || raw.stateCode || '',
    zip: raw.zip || raw.zipCode || raw.postalCode || '',
    country: raw.country || raw.countryCode || 'US',
    latitude: raw.latitude || raw.lat || raw.location?.latitude,
    longitude: raw.longitude || raw.lng || raw.location?.longitude,
    cName: raw.cName || raw.slug || '',
    address: raw.address || raw.fullAddress || loc.ln1 || '',
    address1: raw.address1 || raw.addressLine1 || raw.streetAddress || loc.ln1 || '',
    address2: raw.address2 || raw.addressLine2 || loc.ln2 || '',
    city: raw.city || loc.city || '',
    state: raw.state || raw.stateCode || loc.state || '',
    zip: raw.zip || raw.zipCode || raw.postalCode || loc.zipcode || loc.zip || '',
    country: raw.country || raw.countryCode || loc.country || 'United States',
    latitude,
    longitude,
    timezone: raw.timezone || raw.tz || '',
    menuUrl: raw.menuUrl || (slug ? `https://dutchie.com/dispensary/${slug}` : ''),
    retailType: raw.retailType || raw.type || '',
    // Service offerings
    offerPickup: raw.offerPickup ?? raw.storeSettings?.offerPickup ?? true,
    offerDelivery: raw.offerDelivery ?? raw.storeSettings?.offerDelivery ?? false,
    isRecreational: raw.isRecreational ?? raw.retailType?.includes('Recreational') ?? true,
    isMedical: raw.isMedical ?? raw.retailType?.includes('Medical') ?? true,
    offerCurbsidePickup: raw.offerCurbsidePickup ?? false,
    // License types
    isRecreational: raw.isRecreational ?? raw.recDispensary ?? raw.retailType?.includes('Recreational') ?? true,
    isMedical: raw.isMedical ?? raw.medicalDispensary ?? raw.retailType?.includes('Medical') ?? true,
    // Contact info
    phone: raw.phone || '',
    email: raw.email || '',
    website: raw.embedBackUrl || '',
    // Branding
    description: raw.description || '',
    logoImage: raw.logoImage || '',
    bannerImage: raw.bannerImage || '',
    // Chain/enterprise info
    chainSlug: raw.chain || '',
    enterpriseId: raw.retailer?.enterpriseId || '',
    // Status
    status: raw.status || '',
    // Preserve raw data (note: spread last, so keys present in raw override the normalized values above)
    ...raw,
  };
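For context: Dutchie's location.geometry.coordinates follows the GeoJSON [longitude, latitude] order, which is why normalizeLocationResponse above reads coords[0] as longitude and coords[1] as latitude:

// GeoJSON-style pair as returned under location.geometry.coordinates
const geometry = { coordinates: [-112.074, 33.4484] }; // Phoenix
const [lng, lat] = geometry.coordinates;
console.log({ lat, lng }); // { lat: 33.4484, lng: -112.074 }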
@@ -373,13 +782,20 @@ function normalizeLocationResponse(raw: any): DutchieLocationResponse {

/**
 * Upsert a location into dutchie_discovery_locations.
 * REQUIRES a valid platform ID (MongoDB ObjectId) - will skip records without one.
 */
export async function upsertLocation(
  pool: Pool,
  location: DutchieLocationResponse,
  cityId: number | null
): Promise<{ id: number; isNew: boolean }> {
  const platformLocationId = location.id || location.slug;
): Promise<{ id: number; isNew: boolean } | null> {
  // REQUIRE actual platform ID - NO fallback to slug
  const platformLocationId = location.id;
  if (!platformLocationId) {
    console.warn(`[LocationDiscovery] Skipping location without platform ID: ${location.name} (${location.slug})`);
    return null;
  }

  const menuUrl = location.menuUrl || `https://dutchie.com/dispensary/${location.slug}`;

  const result = await pool.query(

@@ -405,15 +821,27 @@ export async function upsertLocation(
      offers_pickup,
      is_recreational,
      is_medical,
      phone,
      website,
      email,
      description,
      logo_image,
      banner_image,
      chain_slug,
      enterprise_id,
      c_name,
      country,
      store_status,
      last_seen_at,
      updated_at
    ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, NOW(), NOW())
    ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31, $32, NOW(), NOW())
    ON CONFLICT (platform, platform_location_id)
    DO UPDATE SET
      name = EXCLUDED.name,
      platform_menu_url = EXCLUDED.platform_menu_url,
      raw_address = COALESCE(EXCLUDED.raw_address, dutchie_discovery_locations.raw_address),
      address_line1 = COALESCE(EXCLUDED.address_line1, dutchie_discovery_locations.address_line1),
      address_line2 = COALESCE(EXCLUDED.address_line2, dutchie_discovery_locations.address_line2),
      city = COALESCE(EXCLUDED.city, dutchie_discovery_locations.city),
      state_code = COALESCE(EXCLUDED.state_code, dutchie_discovery_locations.state_code),
      postal_code = COALESCE(EXCLUDED.postal_code, dutchie_discovery_locations.postal_code),

@@ -425,6 +853,17 @@ export async function upsertLocation(
      offers_pickup = COALESCE(EXCLUDED.offers_pickup, dutchie_discovery_locations.offers_pickup),
      is_recreational = COALESCE(EXCLUDED.is_recreational, dutchie_discovery_locations.is_recreational),
      is_medical = COALESCE(EXCLUDED.is_medical, dutchie_discovery_locations.is_medical),
      phone = COALESCE(EXCLUDED.phone, dutchie_discovery_locations.phone),
      website = COALESCE(EXCLUDED.website, dutchie_discovery_locations.website),
      email = COALESCE(EXCLUDED.email, dutchie_discovery_locations.email),
      description = COALESCE(EXCLUDED.description, dutchie_discovery_locations.description),
      logo_image = COALESCE(EXCLUDED.logo_image, dutchie_discovery_locations.logo_image),
      banner_image = COALESCE(EXCLUDED.banner_image, dutchie_discovery_locations.banner_image),
      chain_slug = COALESCE(EXCLUDED.chain_slug, dutchie_discovery_locations.chain_slug),
      enterprise_id = COALESCE(EXCLUDED.enterprise_id, dutchie_discovery_locations.enterprise_id),
      c_name = COALESCE(EXCLUDED.c_name, dutchie_discovery_locations.c_name),
      country = COALESCE(EXCLUDED.country, dutchie_discovery_locations.country),
      store_status = COALESCE(EXCLUDED.store_status, dutchie_discovery_locations.store_status),
      last_seen_at = NOW(),
      updated_at = NOW()
    RETURNING id, (xmax = 0) as is_new`,
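For context: the (xmax = 0) projection above is a common Postgres idiom. A row created by the INSERT still has xmax = 0, while a row rewritten by DO UPDATE carries a nonzero xmax, so is_new reports whether the upsert inserted or updated. A standalone illustration (hypothetical kv table):

async function upsertKV(pool: Pool, k: string, v: string): Promise<boolean> {
  const res = await pool.query(
    `INSERT INTO kv (k, v) VALUES ($1, $2)
     ON CONFLICT (k) DO UPDATE SET v = EXCLUDED.v
     RETURNING (xmax = 0) AS is_new`,
    [k, v]
  );
  return res.rows[0].is_new; // true only when the row was newly inserted
}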
@@ -440,7 +879,7 @@ export async function upsertLocation(
      location.city || null,
      location.state || null,
      location.zip || null,
      location.country || 'US',
      location.country || 'United States',
      location.latitude || null,
      location.longitude || null,
      location.timezone || null,

@@ -450,6 +889,17 @@ export async function upsertLocation(
      location.offerPickup ?? null,
      location.isRecreational ?? null,
      location.isMedical ?? null,
      location.phone || null,
      location.website || null,
      location.email || null,
      location.description || null,
      location.logoImage || null,
      location.bannerImage || null,
      location.chainSlug || null,
      location.enterpriseId || null,
      location.cName || null,
      location.country || 'United States',
      location.status || null,
    ]
  );

@@ -642,6 +1092,12 @@ export async function discoverLocationsForCity(

      const result = await upsertLocation(pool, location, city.id);

      // Skip locations without valid platform ID
      if (!result) {
        errors.push(`Location ${location.slug}: No valid platform ID - skipped`);
        continue;
      }

      if (result.isNew) {
        newCount++;
      } else {
587
backend/src/discovery/promotion.ts
Normal file
@@ -0,0 +1,587 @@
/**
 * Discovery Promotion Service
 *
 * Handles the promotion of discovery locations to dispensaries:
 * 1. Discovery → Raw data in dutchie_discovery_locations (status='discovered')
 * 2. Validation → Check required fields, reject incomplete records
 * 3. Promotion → Idempotent upsert to dispensaries, link back via dispensary_id
 */

import { pool } from '../db/pool';
import { DiscoveryLocationRow, DiscoveryStatus } from './types';

// ============================================================
// VALIDATION
// ============================================================

export interface ValidationResult {
  valid: boolean;
  errors: string[];
}

export interface ValidationSummary {
  totalChecked: number;
  validCount: number;
  invalidCount: number;
  invalidRecords: Array<{
    id: number;
    name: string;
    errors: string[];
  }>;
}

/**
 * Validate a single discovery location has all required fields for promotion
 */
export function validateForPromotion(loc: DiscoveryLocationRow): ValidationResult {
  const errors: string[] = [];

  // Required fields
  if (!loc.platform_location_id) {
    errors.push('Missing platform_location_id');
  }
  if (!loc.name || loc.name.trim() === '') {
    errors.push('Missing name');
  }
  if (!loc.city || loc.city.trim() === '') {
    errors.push('Missing city');
  }
  if (!loc.state_code || loc.state_code.trim() === '') {
    errors.push('Missing state_code');
  }
  if (!loc.platform_menu_url) {
    errors.push('Missing platform_menu_url');
  }

  return {
    valid: errors.length === 0,
    errors,
  };
}

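For context: the validator's output shape for an incomplete record (field values hypothetical):

// A discovered row missing city and menu URL comes back as:
// { valid: false, errors: ['Missing city', 'Missing platform_menu_url'] }
const sample = {
  platform_location_id: 'abc123',
  name: 'Example Store',
  city: '',
  state_code: 'AZ',
  platform_menu_url: '',
};
console.log(validateForPromotion(sample as any));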
/**
 * Validate all discovered locations and return summary
 */
export async function validateDiscoveredLocations(
  stateCode?: string
): Promise<ValidationSummary> {
  let query = `
    SELECT * FROM dutchie_discovery_locations
    WHERE status = 'discovered'
  `;
  const params: string[] = [];

  if (stateCode) {
    query += ` AND state_code = $1`;
    params.push(stateCode);
  }

  const result = await pool.query(query, params);
  const locations = result.rows as DiscoveryLocationRow[];

  const invalidRecords: ValidationSummary['invalidRecords'] = [];
  let validCount = 0;

  for (const loc of locations) {
    const validation = validateForPromotion(loc);
    if (validation.valid) {
      validCount++;
    } else {
      invalidRecords.push({
        id: loc.id,
        name: loc.name,
        errors: validation.errors,
      });
    }
  }

  return {
    totalChecked: locations.length,
    validCount,
    invalidCount: invalidRecords.length,
    invalidRecords,
  };
}

// ============================================================
// PROMOTION
// ============================================================

export interface PromotionResult {
  discoveryId: number;
  dispensaryId: number;
  action: 'created' | 'updated' | 'skipped';
  name: string;
}

export interface PromotionSummary {
  totalProcessed: number;
  created: number;
  updated: number;
  skipped: number;
  rejected: number;
  results: PromotionResult[];
  rejectedRecords: Array<{
    id: number;
    name: string;
    errors: string[];
  }>;
  durationMs: number;
  // Per TASK_WORKFLOW_2024-12-10.md: Track new dispensary IDs for task chaining
  newDispensaryIds: number[];
}

/**
 * Generate a URL-safe slug from name and city
 */
function generateSlug(name: string, city: string, state: string): string {
  const base = `${name}-${city}-${state}`
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-|-$/g, '')
    .substring(0, 100);
  return base;
}

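For context: the slug convention above in action:

console.log(generateSlug("Nature's Medicines", 'Phoenix', 'AZ'));
// -> 'nature-s-medicines-phoenix-az'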
/**
|
||||
* Log a promotion action to dutchie_promotion_log
|
||||
*/
|
||||
async function logPromotionAction(
|
||||
action: string,
|
||||
discoveryId: number | null,
|
||||
dispensaryId: number | null,
|
||||
stateCode: string | null,
|
||||
storeName: string | null,
|
||||
validationErrors: string[] | null = null,
|
||||
fieldChanges: Record<string, any> | null = null,
|
||||
triggeredBy: string = 'auto'
|
||||
): Promise<void> {
|
||||
await pool.query(`
|
||||
INSERT INTO dutchie_promotion_log
|
||||
(discovery_id, dispensary_id, action, state_code, store_name, validation_errors, field_changes, triggered_by)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
|
||||
`, [
|
||||
discoveryId,
|
||||
dispensaryId,
|
||||
action,
|
||||
stateCode,
|
||||
storeName,
|
||||
validationErrors,
|
||||
fieldChanges ? JSON.stringify(fieldChanges) : null,
|
||||
triggeredBy,
|
||||
]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a status alert for the dashboard
|
||||
*/
|
||||
export async function createStatusAlert(
|
||||
dispensaryId: number,
|
||||
profileId: number | null,
|
||||
alertType: string,
|
||||
severity: 'info' | 'warning' | 'error' | 'critical',
|
||||
message: string,
|
||||
previousStatus?: string | null,
|
||||
newStatus?: string | null,
|
||||
metadata?: Record<string, any>
|
||||
): Promise<number> {
|
||||
const result = await pool.query(`
|
||||
INSERT INTO crawler_status_alerts
|
||||
(dispensary_id, profile_id, alert_type, severity, message, previous_status, new_status, metadata)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
|
||||
RETURNING id
|
||||
`, [
|
||||
dispensaryId,
|
||||
profileId,
|
||||
alertType,
|
||||
severity,
|
||||
message,
|
||||
previousStatus || null,
|
||||
newStatus || null,
|
||||
metadata ? JSON.stringify(metadata) : null,
|
||||
]);
|
||||
return result.rows[0].id;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create or update crawler profile for a dispensary with initial sandbox status
|
||||
*/
|
||||
async function ensureCrawlerProfile(
|
||||
dispensaryId: number,
|
||||
dispensaryName: string,
|
||||
platformDispensaryId: string
|
||||
): Promise<{ profileId: number; created: boolean }> {
|
||||
// Check if profile already exists
|
||||
const existingResult = await pool.query(`
|
||||
SELECT id FROM dispensary_crawler_profiles
|
||||
WHERE dispensary_id = $1 AND enabled = true
|
||||
LIMIT 1
|
||||
`, [dispensaryId]);
|
||||
|
||||
if (existingResult.rows.length > 0) {
|
||||
return { profileId: existingResult.rows[0].id, created: false };
|
||||
}
|
||||
|
||||
// Create new profile with sandbox status
|
||||
const profileKey = dispensaryName
|
||||
.toLowerCase()
|
||||
.replace(/[^a-z0-9]+/g, '-')
|
||||
.replace(/^-|-$/g, '')
|
||||
.substring(0, 50);
|
||||
|
||||
const insertResult = await pool.query(`
|
||||
INSERT INTO dispensary_crawler_profiles (
|
||||
dispensary_id,
|
||||
profile_name,
|
||||
profile_key,
|
||||
crawler_type,
|
||||
status,
|
||||
status_reason,
|
||||
status_changed_at,
|
||||
config,
|
||||
enabled,
|
||||
consecutive_successes,
|
||||
consecutive_failures,
|
||||
created_at,
|
||||
updated_at
|
||||
) VALUES (
|
||||
$1, $2, $3, 'dutchie', 'sandbox', 'Newly promoted from discovery', CURRENT_TIMESTAMP,
|
||||
$4::jsonb, true, 0, 0, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP
|
||||
)
|
||||
RETURNING id
|
||||
`, [
|
||||
dispensaryId,
|
||||
dispensaryName,
|
||||
profileKey,
|
||||
JSON.stringify({
|
||||
platformDispensaryId,
|
||||
useBothModes: true,
|
||||
downloadImages: true,
|
||||
trackStock: true,
|
||||
}),
|
||||
]);
|
||||
|
||||
const profileId = insertResult.rows[0].id;
|
||||
|
||||
// Create status alert for new sandbox store
|
||||
await createStatusAlert(
|
||||
dispensaryId,
|
||||
profileId,
|
||||
'promoted',
|
||||
'info',
|
||||
`${dispensaryName} promoted to sandbox - awaiting first successful crawl`,
|
||||
null,
|
||||
'sandbox',
|
||||
{ source: 'discovery_promotion', platformDispensaryId }
|
||||
);
|
||||
|
||||
return { profileId, created: true };
|
||||
}
|
||||
|
||||
/**
|
||||
* Promote a single discovery location to dispensaries table
|
||||
* Idempotent: uses ON CONFLICT on platform_dispensary_id
|
||||
*/
|
||||
async function promoteLocation(
|
||||
loc: DiscoveryLocationRow
|
||||
): Promise<PromotionResult> {
|
||||
const slug = loc.platform_slug || generateSlug(loc.name, loc.city || '', loc.state_code || '');
|
||||
|
||||
// Upsert into dispensaries
|
||||
// ON CONFLICT by platform_dispensary_id ensures idempotency
|
||||
const upsertResult = await pool.query(`
|
||||
INSERT INTO dispensaries (
|
||||
platform,
|
||||
name,
|
||||
slug,
|
||||
city,
|
||||
state,
|
||||
address1,
|
||||
address2,
|
||||
zipcode,
|
||||
postal_code,
|
||||
phone,
|
||||
website,
|
||||
email,
|
||||
latitude,
|
||||
longitude,
|
||||
timezone,
|
||||
platform_dispensary_id,
|
||||
menu_url,
|
||||
menu_type,
|
||||
description,
|
||||
logo_image,
|
||||
banner_image,
|
||||
offer_pickup,
|
||||
offer_delivery,
|
||||
is_medical,
|
||||
is_recreational,
|
||||
chain_slug,
|
||||
enterprise_id,
|
||||
c_name,
|
||||
country,
|
||||
status,
|
||||
crawl_enabled,
|
||||
dutchie_verified,
|
||||
dutchie_verified_at,
|
||||
dutchie_discovery_id,
|
||||
created_at,
|
||||
updated_at
|
||||
) VALUES (
|
||||
$1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
|
||||
$11, $12, $13, $14, $15, $16, $17, $18, $19, $20,
|
||||
$21, $22, $23, $24, $25, $26, $27, $28, $29, $30,
|
||||
$31, $32, $33, $34, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP
|
||||
)
|
||||
ON CONFLICT (platform_dispensary_id) WHERE platform_dispensary_id IS NOT NULL
|
||||
DO UPDATE SET
|
||||
name = EXCLUDED.name,
|
||||
city = EXCLUDED.city,
|
||||
state = EXCLUDED.state,
|
||||
address1 = EXCLUDED.address1,
|
||||
address2 = EXCLUDED.address2,
|
||||
zipcode = EXCLUDED.zipcode,
|
||||
postal_code = EXCLUDED.postal_code,
|
||||
phone = EXCLUDED.phone,
|
||||
website = EXCLUDED.website,
|
||||
email = EXCLUDED.email,
|
||||
latitude = EXCLUDED.latitude,
|
||||
longitude = EXCLUDED.longitude,
|
||||
timezone = EXCLUDED.timezone,
|
||||
menu_url = EXCLUDED.menu_url,
|
||||
description = EXCLUDED.description,
|
||||
logo_image = EXCLUDED.logo_image,
|
||||
banner_image = EXCLUDED.banner_image,
|
||||
offer_pickup = EXCLUDED.offer_pickup,
|
||||
offer_delivery = EXCLUDED.offer_delivery,
|
||||
is_medical = EXCLUDED.is_medical,
|
||||
is_recreational = EXCLUDED.is_recreational,
|
||||
chain_slug = EXCLUDED.chain_slug,
|
||||
enterprise_id = EXCLUDED.enterprise_id,
|
||||
c_name = EXCLUDED.c_name,
|
||||
country = EXCLUDED.country,
|
||||
status = EXCLUDED.status,
|
||||
dutchie_discovery_id = EXCLUDED.dutchie_discovery_id,
|
||||
updated_at = CURRENT_TIMESTAMP
|
||||
RETURNING id, (xmax = 0) AS inserted
|
||||
  `, [
    loc.platform || 'dutchie',        // $1  platform
    loc.name,                         // $2  name
    slug,                             // $3  slug
    loc.city,                         // $4  city
    loc.state_code,                   // $5  state
    loc.address_line1,                // $6  address1
    loc.address_line2,                // $7  address2
    loc.postal_code,                  // $8  zipcode
    loc.postal_code,                  // $9  postal_code
    loc.phone,                        // $10 phone
    loc.website,                      // $11 website
    loc.email,                        // $12 email
    loc.latitude,                     // $13 latitude
    loc.longitude,                    // $14 longitude
    loc.timezone,                     // $15 timezone
    loc.platform_location_id,         // $16 platform_dispensary_id
    loc.platform_menu_url,            // $17 menu_url
    'dutchie',                        // $18 menu_type
    loc.description,                  // $19 description
    loc.logo_image,                   // $20 logo_image
    loc.banner_image,                 // $21 banner_image
    loc.offers_pickup ?? true,        // $22 offer_pickup
    loc.offers_delivery ?? false,     // $23 offer_delivery
    loc.is_medical ?? false,          // $24 is_medical
    loc.is_recreational ?? true,      // $25 is_recreational
    loc.chain_slug,                   // $26 chain_slug
    loc.enterprise_id,                // $27 enterprise_id
    loc.c_name,                       // $28 c_name
    loc.country || 'United States',   // $29 country
    loc.store_status || 'open',       // $30 status
    true,                             // $31 crawl_enabled
    true,                             // $32 dutchie_verified
    new Date(),                       // $33 dutchie_verified_at
    loc.id,                           // $34 dutchie_discovery_id
  ]);

  const dispensaryId = upsertResult.rows[0].id;
  const wasInserted = upsertResult.rows[0].inserted;

  // Link discovery location back to dispensary and update status
  await pool.query(`
    UPDATE dutchie_discovery_locations
    SET
      dispensary_id = $1,
      status = 'verified',
      verified_at = CURRENT_TIMESTAMP,
      verified_by = 'auto-promotion'
    WHERE id = $2
  `, [dispensaryId, loc.id]);

  // Create crawler profile with sandbox status for new dispensaries
  if (wasInserted && loc.platform_location_id) {
    await ensureCrawlerProfile(dispensaryId, loc.name, loc.platform_location_id);
  }

  const action = wasInserted ? 'promoted_create' : 'promoted_update';

  // Log the promotion
  await logPromotionAction(
    action,
    loc.id,
    dispensaryId,
    loc.state_code,
    loc.name,
    null,
    { slug, city: loc.city, platform_location_id: loc.platform_location_id }
  );

  return {
    discoveryId: loc.id,
    dispensaryId,
    action: wasInserted ? 'created' : 'updated',
    name: loc.name,
  };
}

/**
 * Promote all valid discovered locations to dispensaries
 *
 * @param stateCode Optional filter by state (e.g., 'CA', 'AZ')
 * @param dryRun If true, only validate without making changes
 */
export async function promoteDiscoveredLocations(
  stateCode?: string,
  dryRun = false
): Promise<PromotionSummary> {
  const startTime = Date.now();

  let query = `
    SELECT * FROM dutchie_discovery_locations
    WHERE status = 'discovered'
  `;
  const params: string[] = [];

  if (stateCode) {
    query += ` AND state_code = $1`;
    params.push(stateCode);
  }

  query += ` ORDER BY id`;

  const result = await pool.query(query, params);
  const locations = result.rows as DiscoveryLocationRow[];

  const results: PromotionResult[] = [];
  const rejectedRecords: PromotionSummary['rejectedRecords'] = [];
  // Per TASK_WORKFLOW_2024-12-10.md: Track new dispensary IDs for task chaining
  const newDispensaryIds: number[] = [];
  let created = 0;
  let updated = 0;
  let skipped = 0;
  let rejected = 0;

  for (const loc of locations) {
    // Step 2: Validation
    const validation = validateForPromotion(loc);

    if (!validation.valid) {
      rejected++;
      rejectedRecords.push({
        id: loc.id,
        name: loc.name,
        errors: validation.errors,
      });

      // Mark as rejected if not dry run
      if (!dryRun) {
        await pool.query(`
          UPDATE dutchie_discovery_locations
          SET status = 'rejected', notes = $1
          WHERE id = $2
        `, [validation.errors.join('; '), loc.id]);

        // Log the rejection
        await logPromotionAction(
          'rejected',
          loc.id,
          null,
          loc.state_code,
          loc.name,
          validation.errors
        );
      }
      continue;
    }

    // Step 3: Promotion (skip if dry run)
    if (dryRun) {
      skipped++;
      results.push({
        discoveryId: loc.id,
        dispensaryId: 0,
        action: 'skipped',
        name: loc.name,
      });
      continue;
    }

    try {
      const promotionResult = await promoteLocation(loc);
      results.push(promotionResult);

      if (promotionResult.action === 'created') {
        created++;
        // Per TASK_WORKFLOW_2024-12-10.md: Track new IDs for task chaining
        newDispensaryIds.push(promotionResult.dispensaryId);
      } else {
        updated++;
      }
    } catch (error: any) {
      console.error(`Failed to promote location ${loc.id} (${loc.name}):`, error.message);
      rejected++;
      rejectedRecords.push({
        id: loc.id,
        name: loc.name,
        errors: [`Promotion error: ${error.message}`],
      });
    }
  }

  return {
    totalProcessed: locations.length,
    created,
    updated,
    skipped,
    rejected,
    results,
    rejectedRecords,
    durationMs: Date.now() - startTime,
    // Per TASK_WORKFLOW_2024-12-10.md: Return new IDs for task chaining
    newDispensaryIds,
  };
}

/**
 * Promote a single discovery location by ID
 */
export async function promoteSingleLocation(
  discoveryId: number
): Promise<PromotionResult> {
  const result = await pool.query(
    `SELECT * FROM dutchie_discovery_locations WHERE id = $1`,
    [discoveryId]
  );

  if (result.rows.length === 0) {
    throw new Error(`Discovery location ${discoveryId} not found`);
  }

  const loc = result.rows[0] as DiscoveryLocationRow;

  // Validate
  const validation = validateForPromotion(loc);
  if (!validation.valid) {
    throw new Error(`Validation failed: ${validation.errors.join(', ')}`);
  }

  // Promote
  return promoteLocation(loc);
}

@@ -18,8 +18,8 @@ import {
  getCitiesToCrawl,
  getCityBySlug,
  seedKnownCities,
  ARIZONA_CITIES,
} from './city-discovery';
import { getCitiesForState } from './location-discovery';
import {
  DiscoveryLocation,
  DiscoveryCity,
@@ -27,6 +27,11 @@ import {
  mapLocationRowToLocation,
  mapCityRowToCity,
} from './types';
import {
  validateDiscoveredLocations,
  promoteDiscoveredLocations,
  promoteSingleLocation,
} from './promotion';

export function createDiscoveryRoutes(pool: Pool): Router {
  const router = Router();
@@ -53,44 +58,44 @@ export function createDiscoveryRoutes(pool: Pool): Router {
      offset = '0',
    } = req.query;

    let whereClause = 'WHERE platform = $1 AND active = TRUE';
    let whereClause = 'WHERE dl.platform = $1 AND dl.active = TRUE';
    const params: any[] = [platform];
    let paramIndex = 2;

    if (status) {
      whereClause += ` AND status = $${paramIndex}`;
      whereClause += ` AND dl.status = $${paramIndex}`;
      params.push(status);
      paramIndex++;
    }

    if (stateCode) {
      whereClause += ` AND state_code = $${paramIndex}`;
      whereClause += ` AND dl.state_code = $${paramIndex}`;
      params.push(stateCode);
      paramIndex++;
    }

    if (countryCode) {
      whereClause += ` AND country_code = $${paramIndex}`;
      whereClause += ` AND dl.country_code = $${paramIndex}`;
      params.push(countryCode);
      paramIndex++;
    }

    if (city) {
      whereClause += ` AND city ILIKE $${paramIndex}`;
      whereClause += ` AND dl.city ILIKE $${paramIndex}`;
      params.push(`%${city}%`);
      paramIndex++;
    }

    if (search) {
      whereClause += ` AND (name ILIKE $${paramIndex} OR platform_slug ILIKE $${paramIndex})`;
      whereClause += ` AND (dl.name ILIKE $${paramIndex} OR dl.platform_slug ILIKE $${paramIndex})`;
      params.push(`%${search}%`);
      paramIndex++;
    }

    if (hasDispensary === 'true') {
      whereClause += ' AND dispensary_id IS NOT NULL';
      whereClause += ' AND dl.dispensary_id IS NOT NULL';
    } else if (hasDispensary === 'false') {
      whereClause += ' AND dispensary_id IS NULL';
      whereClause += ' AND dl.dispensary_id IS NULL';
    }

    params.push(parseInt(limit as string, 10), parseInt(offset as string, 10));
@@ -705,15 +710,22 @@ export function createDiscoveryRoutes(pool: Pool): Router {
      return res.status(400).json({ error: 'stateCode is required' });
    }

    let cities: any[] = [];
    if (stateCode === 'AZ') {
      cities = ARIZONA_CITIES;
    } else {
      // Dynamically fetch cities from Dutchie for any state
      const cityNames = await getCitiesForState(stateCode as string);

      if (cityNames.length === 0) {
        return res.status(400).json({
          error: `No predefined cities for state: ${stateCode}. Add cities to city-discovery.ts`,
          error: `No cities found for state: ${stateCode}`,
        });
      }

      // Convert to seed format
      cities = cityNames.map(name => ({
        name,
        slug: name.toLowerCase().replace(/\s+/g, '-').replace(/[^a-z0-9-]/g, ''),
        stateCode: stateCode as string,
      }));

    const result = await seedKnownCities(pool, cities);

    res.json({
@@ -834,6 +846,136 @@ export function createDiscoveryRoutes(pool: Pool): Router {
    }
  });

  // ============================================================
  // PROMOTION ENDPOINTS
  // ============================================================

  /**
   * GET /api/discovery/admin/validate
   * Validate discovered locations before promotion
   */
  router.get('/admin/validate', async (req: Request, res: Response) => {
    try {
      const { stateCode } = req.query;
      const summary = await validateDiscoveredLocations(stateCode as string | undefined);

      res.json({
        success: true,
        ...summary,
      });
    } catch (error: any) {
      res.status(500).json({ error: error.message });
    }
  });

  /**
   * POST /api/discovery/admin/promote
   * Promote all valid discovered locations to dispensaries (idempotent)
   *
   * Body params:
   *   - stateCode: Filter by state (e.g., 'CA', 'AZ')
   *   - dryRun: If true, only validate without making changes
   */
  router.post('/admin/promote', async (req: Request, res: Response) => {
    try {
      const { stateCode, dryRun = false } = req.body;

      console.log(`[Discovery API] Starting promotion for ${stateCode || 'all states'} (dryRun=${dryRun})`);
      const summary = await promoteDiscoveredLocations(stateCode, dryRun);

      res.json({
        success: true,
        ...summary,
      });
    } catch (error: any) {
      res.status(500).json({ error: error.message });
    }
  });

  /**
   * POST /api/discovery/admin/promote/:id
   * Promote a single discovery location by ID
   */
  router.post('/admin/promote/:id', async (req: Request, res: Response) => {
    try {
      const { id } = req.params;

      console.log(`[Discovery API] Promoting single location ${id}`);
      const result = await promoteSingleLocation(parseInt(id, 10));

      res.json({
        success: true,
        ...result,
      });
    } catch (error: any) {
      res.status(500).json({ error: error.message });
    }
  });

  // ============================================================
  // PROMOTION LOG
  // ============================================================

  /**
   * GET /api/discovery/promotion-log
   * Get promotion audit log
   */
  router.get('/promotion-log', async (req: Request, res: Response) => {
    try {
      const { state, dispensary_id, limit = '100' } = req.query;

      let whereClause = 'WHERE 1=1';
      const params: any[] = [];
      let paramIndex = 1;

      if (state) {
        whereClause += ` AND pl.state_code = $${paramIndex}`;
        params.push(state);
        paramIndex++;
      }

      if (dispensary_id) {
        whereClause += ` AND pl.dispensary_id = $${paramIndex}`;
        params.push(parseInt(dispensary_id as string, 10));
        paramIndex++;
      }

      params.push(parseInt(limit as string, 10));

      const { rows } = await pool.query(`
        SELECT
          pl.*,
          dl.name as discovery_name,
          d.name as dispensary_name
        FROM dutchie_promotion_log pl
        LEFT JOIN dutchie_discovery_locations dl ON pl.discovery_id = dl.id
        LEFT JOIN dispensaries d ON pl.dispensary_id = d.id
        ${whereClause}
        ORDER BY pl.created_at DESC
        LIMIT $${paramIndex}
      `, params);

      res.json({
        logs: rows.map((r: any) => ({
          id: r.id,
          discoveryId: r.discovery_id,
          dispensaryId: r.dispensary_id,
          action: r.action,
          stateCode: r.state_code,
          storeName: r.store_name,
          validationErrors: r.validation_errors,
          fieldChanges: r.field_changes,
          triggeredBy: r.triggered_by,
          createdAt: r.created_at,
          discoveryName: r.discovery_name,
          dispensaryName: r.dispensary_name,
        })),
      });
    } catch (error: any) {
      res.status(500).json({ error: error.message });
    }
  });

  return router;
}

@@ -60,6 +60,7 @@ export interface DiscoveryLocation {
  stateCode: string | null;
  postalCode: string | null;
  countryCode: string | null;
  country: string | null;
  latitude: number | null;
  longitude: number | null;
  timezone: string | null;
@@ -72,6 +73,18 @@ export interface DiscoveryLocation {
  offersPickup: boolean | null;
  isRecreational: boolean | null;
  isMedical: boolean | null;
  // New Dutchie fields
  phone: string | null;
  website: string | null;
  email: string | null;
  description: string | null;
  logoImage: string | null;
  bannerImage: string | null;
  chainSlug: string | null;
  enterpriseId: string | null;
  cName: string | null;
  storeStatus: string | null;
  // Timestamps
  firstSeenAt: Date;
  lastSeenAt: Date;
  lastCheckedAt: Date | null;
@@ -96,6 +109,7 @@ export interface DiscoveryLocationRow {
  state_code: string | null;
  postal_code: string | null;
  country_code: string | null;
  country: string | null;
  latitude: number | null;
  longitude: number | null;
  timezone: string | null;
@@ -108,6 +122,18 @@ export interface DiscoveryLocationRow {
  offers_pickup: boolean | null;
  is_recreational: boolean | null;
  is_medical: boolean | null;
  // New Dutchie fields (snake_case for DB row)
  phone: string | null;
  website: string | null;
  email: string | null;
  description: string | null;
  logo_image: string | null;
  banner_image: string | null;
  chain_slug: string | null;
  enterprise_id: string | null;
  c_name: string | null;
  store_status: string | null;
  // Timestamps
  first_seen_at: Date;
  last_seen_at: Date;
  last_checked_at: Date | null;
@@ -185,6 +211,8 @@ export interface FullDiscoveryResult {
  totalLocationsFound: number;
  totalLocationsUpserted: number;
  durationMs: number;
  // Per TASK_WORKFLOW_2024-12-10.md: Track new dispensary IDs for task chaining
  newDispensaryIds?: number[];
}

// ============================================================
@@ -245,6 +273,7 @@ export function mapLocationRowToLocation(row: DiscoveryLocationRow): DiscoveryLo
    stateCode: row.state_code,
    postalCode: row.postal_code,
    countryCode: row.country_code,
    country: row.country,
    latitude: row.latitude,
    longitude: row.longitude,
    timezone: row.timezone,
@@ -257,6 +286,18 @@ export function mapLocationRowToLocation(row: DiscoveryLocationRow): DiscoveryLo
    offersPickup: row.offers_pickup,
    isRecreational: row.is_recreational,
    isMedical: row.is_medical,
    // New Dutchie fields
    phone: row.phone,
    website: row.website,
    email: row.email,
    description: row.description,
    logoImage: row.logo_image,
    bannerImage: row.banner_image,
    chainSlug: row.chain_slug,
    enterpriseId: row.enterprise_id,
    cName: row.c_name,
    storeStatus: row.store_status,
    // Timestamps
    firstSeenAt: row.first_seen_at,
    lastSeenAt: row.last_seen_at,
    lastCheckedAt: row.last_checked_at,

@@ -1,199 +0,0 @@
# Dutchie AZ Pipeline

## Overview

The Dutchie AZ pipeline is the **only** authorized way to crawl Dutchie dispensary menus. It uses Dutchie's GraphQL API directly (no DOM scraping) and writes to an isolated database with a proper snapshot model.

## Key Principles

1. **GraphQL Only** - All Dutchie data is fetched via their FilteredProducts GraphQL API
2. **Isolated Database** - Data lives in `dutchie_az_*` tables, NOT the legacy `products` table
3. **Append-Only Snapshots** - Every crawl creates snapshots, never overwrites historical data
4. **Stock Status Tracking** - Derived from `POSMetaData.children` inventory data
5. **Missing Product Detection** - Products not in feed are marked with `isPresentInFeed=false` (see the sketch below)
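
A minimal sketch of what principle 5 could look like in practice, assuming the `dutchie_products` / `dutchie_product_snapshots` tables described below and a `pg` pool; the function name and the choice of `'out_of_stock'` for missing items are assumptions, not the pipeline's confirmed behavior:

```typescript
import { Pool } from 'pg';

// Hypothetical: after a crawl, record a snapshot with is_present_in_feed = false
// for every known product that did NOT appear in the feed.
async function markMissingProducts(
  pool: Pool,
  dispensaryId: number,
  seenExternalIds: string[],
  crawledAt: Date
): Promise<number> {
  const result = await pool.query(
    `INSERT INTO dutchie_product_snapshots
       (dutchie_product_id, dispensary_id, platform_dispensary_id,
        external_product_id, is_present_in_feed, stock_status, raw_payload, crawled_at)
     SELECT p.id, p.dispensary_id, p.platform_dispensary_id,
            p.external_product_id, false, 'out_of_stock', '{}'::jsonb, $2
     FROM dutchie_products p
     WHERE p.dispensary_id = $1
       AND p.external_product_id <> ALL($3::varchar[])`,
    [dispensaryId, crawledAt, seenExternalIds]
  );
  return result.rowCount ?? 0;
}
```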

## Directory Structure

```
src/dutchie-az/
├── db/
│   ├── connection.ts        # Database connection pool
│   └── schema.ts            # Table definitions and migrations
├── routes/
│   └── index.ts             # REST API endpoints
├── services/
│   ├── graphql-client.ts    # Direct GraphQL fetch (Mode A + Mode B)
│   ├── product-crawler.ts   # Main crawler orchestration
│   └── scheduler.ts         # Jittered scheduling with wandering intervals
└── types/
    └── index.ts             # TypeScript interfaces
```

## Data Model

### Tables

- **dispensaries** - Arizona Dutchie stores with `platform_dispensary_id`
- **dutchie_products** - Canonical product identity (one row per product per store)
- **dutchie_product_snapshots** - Historical state per crawl (append-only)
- **job_schedules** - Scheduler configuration with jitter support
- **job_run_logs** - Execution history

### Stock Status

The `stock_status` field is derived from `POSMetaData.children`:

```typescript
function deriveStockStatus(children?: POSChild[]): StockStatus {
  if (!children || children.length === 0) return 'unknown';
  const totalAvailable = children.reduce((sum, c) =>
    sum + (c.quantityAvailable || 0), 0);
  return totalAvailable > 0 ? 'in_stock' : 'out_of_stock';
}
```

### Two-Mode Crawling

**Mode A (UI Parity):**
- `Status: null` - Returns what the UI shows
- Best for "current inventory" snapshot

**Mode B (Max Coverage):**
- `Status: 'Active'` - Returns all active products
- Catches items with `isBelowThreshold: true`

Both modes are merged to get maximum product coverage.
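
A minimal sketch of such a merge, assuming each mode returns raw product nodes keyed by an `id` field; the node shape and the rule that Mode A wins on conflicts are assumptions:

```typescript
interface ProductNode {
  id: string;
  [key: string]: unknown;
}

// Union of both modes, de-duplicated by product id.
// Mode B is inserted first, then Mode A overwrites, so the merged
// record for any shared product matches what the UI shows.
function mergeModes(modeA: ProductNode[], modeB: ProductNode[]): ProductNode[] {
  const byId = new Map<string, ProductNode>();
  for (const node of modeB) byId.set(node.id, node);
  for (const node of modeA) byId.set(node.id, node);
  return [...byId.values()];
}
```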

## API Endpoints

All endpoints are mounted at `/api/dutchie-az/`:

```
GET  /api/dutchie-az/dispensaries             - List all dispensaries
GET  /api/dutchie-az/dispensaries/:id         - Get dispensary details
GET  /api/dutchie-az/products                 - List products (with filters)
GET  /api/dutchie-az/products/:id             - Get product with snapshots
GET  /api/dutchie-az/products/:id/snapshots   - Get product snapshot history
POST /api/dutchie-az/crawl/:dispensaryId      - Trigger manual crawl
GET  /api/dutchie-az/schedule                 - Get scheduler status
POST /api/dutchie-az/schedule/run             - Manually run scheduled jobs
GET  /api/dutchie-az/stats                    - Dashboard statistics
```

## Scheduler

The scheduler uses **jitter** to avoid detection patterns:

```typescript
// Each job has independent "wandering" timing
interface JobSchedule {
  base_interval_minutes: number;  // e.g., 240 (4 hours)
  jitter_minutes: number;         // e.g., 30 (±30 min)
  next_run_at: Date;              // Calculated with jitter after each run
}
```

Jobs run when `next_run_at <= NOW()`. After completion, the next run is calculated:

```
next_run_at = NOW() + base_interval + random(-jitter, +jitter)
```

This prevents crawls from clustering at predictable times.
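
A minimal sketch of that calculation (the uniform draw via `Math.random()` is an assumption; any symmetric distribution works):

```typescript
function computeNextRun(
  baseIntervalMinutes: number,
  jitterMinutes: number,
  now: Date = new Date()
): Date {
  // Uniform offset in [-jitter, +jitter] minutes
  const jitter = (Math.random() * 2 - 1) * jitterMinutes;
  return new Date(now.getTime() + (baseIntervalMinutes + jitter) * 60_000);
}

// With the defaults above (240 ± 30), the next run lands
// somewhere between 3.5 and 4.5 hours from now.
```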

## Manual Testing

### Run a single dispensary crawl:

```bash
DATABASE_URL="..." npx tsx -e "
const { crawlDispensaryProducts } = require('./src/dutchie-az/services/product-crawler');
const { query } = require('./src/dutchie-az/db/connection');

async function test() {
  const { rows } = await query('SELECT * FROM dispensaries LIMIT 1');
  if (!rows[0]) return console.log('No dispensaries found');

  const result = await crawlDispensaryProducts(rows[0], 'rec', { useBothModes: true });
  console.log(JSON.stringify(result, null, 2));
}
test();
"
```

### Check stock status distribution:

```sql
SELECT stock_status, COUNT(*)
FROM dutchie_products
GROUP BY stock_status;
```

### View recent snapshots:

```sql
SELECT
  p.name,
  s.stock_status,
  s.is_present_in_feed,
  s.crawled_at
FROM dutchie_product_snapshots s
JOIN dutchie_products p ON p.id = s.dutchie_product_id
ORDER BY s.crawled_at DESC
LIMIT 20;
```

## Deprecated Code

The following files are **DEPRECATED** and will throw errors if called:

- `src/scrapers/dutchie-graphql.ts` - Wrote to legacy `products` table
- `src/scrapers/dutchie-graphql-direct.ts` - Wrote to legacy `products` table
- `src/scrapers/templates/dutchie.ts` - HTML/DOM scraper (unreliable)
- `src/scraper-v2/engine.ts` DutchieSpider - DOM-based extraction

If `store-crawl-orchestrator.ts` detects `provider='dutchie'` with `mode='production'`, it now routes to this dutchie-az pipeline automatically.

## Integration with Legacy System

The `store-crawl-orchestrator.ts` bridges the legacy stores system with dutchie-az (see the sketch after these steps):

1. When a store has `product_provider='dutchie'` and `product_crawler_mode='production'`
2. The orchestrator looks up the corresponding dispensary in `dutchie_az.dispensaries`
3. It calls `crawlDispensaryProducts()` from the dutchie-az pipeline
4. Results are logged but data stays in the dutchie_az tables
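
A minimal sketch of that bridge; the lookup query, the store shape, and the import paths are assumptions based on this README, not the orchestrator's actual code:

```typescript
import { crawlDispensaryProducts } from './src/dutchie-az/services/product-crawler';
import { query } from './src/dutchie-az/db/connection';

async function crawlViaDutchieAz(store: { id: number; platform_dispensary_id: string }) {
  // Steps 1-2: resolve the legacy store to a dutchie-az dispensary
  const { rows } = await query(
    'SELECT * FROM dispensaries WHERE platform_dispensary_id = $1',
    [store.platform_dispensary_id]
  );
  if (!rows[0]) throw new Error('Dispensary not found in dutchie-az database');

  // Steps 3-4: crawl; results stay in the dutchie_az tables
  return crawlDispensaryProducts(rows[0], 'rec', { useBothModes: true });
}
```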

To use the dutchie-az pipeline independently:
- Navigate to `/dutchie-az-schedule` in the UI
- Use the REST API endpoints directly
- Run the scheduler service

## Environment Variables

```bash
# Database connection for dutchie-az (same DB, separate tables)
DATABASE_URL=postgresql://user:pass@host:port/database
```

## Troubleshooting

### "Dispensary not found in dutchie-az database"

The dispensary must exist in `dutchie_az.dispensaries` before crawling. Either:
1. Run discovery to populate dispensaries
2. Manually insert the dispensary with `platform_dispensary_id`

### GraphQL returns empty products

1. Check `platform_dispensary_id` is correct (the internal Dutchie ID, not the slug)
2. Verify the dispensary is online and has menu data
3. Try both `rec` and `med` pricing types

### Snapshots show `stock_status='unknown'`

The product likely has no `POSMetaData.children` array. This happens for:
- Products without inventory tracking
- Manually managed inventory

---

Last updated: December 2025
@@ -1,129 +0,0 @@
/**
 * Dutchie Configuration
 *
 * Centralized configuration for Dutchie GraphQL API interaction.
 * Update hashes here when Dutchie changes their persisted query system.
 */

export const dutchieConfig = {
  // ============================================================
  // GRAPHQL ENDPOINT
  // ============================================================

  /** GraphQL endpoint - must be the api-3 graphql endpoint (NOT api-gw.dutchie.com, which no longer exists) */
  graphqlEndpoint: 'https://dutchie.com/api-3/graphql',

  // ============================================================
  // GRAPHQL PERSISTED QUERY HASHES
  // ============================================================
  //
  // These hashes identify specific GraphQL operations.
  // If Dutchie changes their schema, you may need to capture
  // new hashes from live browser traffic (Network tab → graphql requests).

  /** FilteredProducts - main product listing query */
  filteredProductsHash: 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0',

  /** GetAddressBasedDispensaryData - resolve slug to internal ID */
  getDispensaryDataHash: '13461f73abf7268770dfd05fe7e10c523084b2bb916a929c08efe3d87531977b',

  /**
   * ConsumerDispensaries - geo-based discovery
   * NOTE: This is a placeholder guess. If discovery fails, either:
   * 1. Capture the real hash from live traffic
   * 2. Rely on known AZDHS slugs instead (set useDiscovery: false)
   */
  consumerDispensariesHash: '0a5bfa6ca1d64ae47bcccb7c8077c87147cbc4e6982c17ceec97a2a4948b311b',

  // ============================================================
  // BEHAVIOR FLAGS
  // ============================================================

  /** Enable geo-based discovery (false = use known AZDHS slugs only) */
  useDiscovery: true,

  /** Prefer GET requests (true) or POST (false). GET is the default. */
  preferGet: true,

  /**
   * Enable POST fallback when a GET fails with 405 or is blocked.
   * If true, failed GETs are retried as POSTs.
   */
  enablePostFallback: true,

  // ============================================================
  // PAGINATION & RETRY
  // ============================================================

  /** Products per page for pagination */
  perPage: 100,

  /** Maximum pages to fetch (safety limit) */
  maxPages: 200,

  /** Number of retries for failed page fetches */
  maxRetries: 1,

  /** Delay between pages in ms */
  pageDelayMs: 500,

  /** Delay between modes in ms */
  modeDelayMs: 2000,

  // ============================================================
  // HTTP HEADERS
  // ============================================================

  /** Default headers to mimic browser requests */
  defaultHeaders: {
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'en-US,en;q=0.9',
    'apollographql-client-name': 'Marketplace (production)',
  } as Record<string, string>,

  /** User agent string */
  userAgent:
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',

  // ============================================================
  // BROWSER LAUNCH OPTIONS
  // ============================================================

  browserArgs: [
    '--no-sandbox',
    '--disable-setuid-sandbox',
    '--disable-dev-shm-usage',
    '--disable-blink-features=AutomationControlled',
  ],

  /** Navigation timeout in ms */
  navigationTimeout: 60000,

  /** Initial page load delay in ms */
  pageLoadDelay: 2000,
};

/**
 * Get GraphQL hashes object for backward compatibility
 */
export const GRAPHQL_HASHES = {
  FilteredProducts: dutchieConfig.filteredProductsHash,
  GetAddressBasedDispensaryData: dutchieConfig.getDispensaryDataHash,
  ConsumerDispensaries: dutchieConfig.consumerDispensariesHash,
};

/**
 * Arizona geo centerpoints for discovery scans
 */
export const ARIZONA_CENTERPOINTS = [
  { name: 'Phoenix', lat: 33.4484, lng: -112.074 },
  { name: 'Tucson', lat: 32.2226, lng: -110.9747 },
  { name: 'Flagstaff', lat: 35.1983, lng: -111.6513 },
  { name: 'Mesa', lat: 33.4152, lng: -111.8315 },
  { name: 'Scottsdale', lat: 33.4942, lng: -111.9261 },
  { name: 'Tempe', lat: 33.4255, lng: -111.94 },
  { name: 'Yuma', lat: 32.6927, lng: -114.6277 },
  { name: 'Prescott', lat: 34.54, lng: -112.4685 },
  { name: 'Lake Havasu', lat: 34.4839, lng: -114.3224 },
  { name: 'Sierra Vista', lat: 31.5455, lng: -110.2773 },
];
@@ -1,131 +0,0 @@
/**
 * CannaiQ Database Connection
 *
 * All database access for the CannaiQ platform goes through this module.
 *
 * SINGLE DATABASE ARCHITECTURE:
 * - All services (auth, orchestrator, crawlers, admin) use this ONE database
 * - States are modeled via states table + state_id on dispensaries (not separate DBs)
 *
 * CONFIGURATION (in priority order):
 * 1. CANNAIQ_DB_URL - Full connection string (preferred)
 * 2. Individual vars: CANNAIQ_DB_HOST, CANNAIQ_DB_PORT, CANNAIQ_DB_NAME, CANNAIQ_DB_USER, CANNAIQ_DB_PASS
 * 3. DATABASE_URL - Legacy fallback for K8s compatibility
 *
 * IMPORTANT:
 * - Do NOT create separate pools elsewhere
 * - All services should import from this module
 */

import { Pool, PoolClient } from 'pg';

/**
 * Get the database connection string from environment variables.
 * Supports multiple configuration methods with fallback for legacy compatibility.
 */
function getConnectionString(): string {
  // Priority 1: Full CANNAIQ connection URL
  if (process.env.CANNAIQ_DB_URL) {
    return process.env.CANNAIQ_DB_URL;
  }

  // Priority 2: Build from individual CANNAIQ env vars
  const host = process.env.CANNAIQ_DB_HOST;
  const port = process.env.CANNAIQ_DB_PORT;
  const name = process.env.CANNAIQ_DB_NAME;
  const user = process.env.CANNAIQ_DB_USER;
  const pass = process.env.CANNAIQ_DB_PASS;

  if (host && port && name && user && pass) {
    return `postgresql://${user}:${pass}@${host}:${port}/${name}`;
  }

  // Priority 3: Fallback to DATABASE_URL for legacy/K8s compatibility
  if (process.env.DATABASE_URL) {
    return process.env.DATABASE_URL;
  }

  // Report what's missing
  const required = ['CANNAIQ_DB_HOST', 'CANNAIQ_DB_PORT', 'CANNAIQ_DB_NAME', 'CANNAIQ_DB_USER', 'CANNAIQ_DB_PASS'];
  const missing = required.filter((key) => !process.env[key]);

  throw new Error(
    `[CannaiQ DB] Missing database configuration.\n` +
    `Set CANNAIQ_DB_URL, DATABASE_URL, or all of: ${missing.join(', ')}`
  );
}

let pool: Pool | null = null;

/**
 * Get the CannaiQ database pool (singleton)
 *
 * This is the canonical pool for all CannaiQ services.
 * Do NOT create separate pools elsewhere.
 */
export function getPool(): Pool {
  if (!pool) {
    pool = new Pool({
      connectionString: getConnectionString(),
      max: 10,
      idleTimeoutMillis: 30000,
      connectionTimeoutMillis: 5000,
    });

    pool.on('error', (err) => {
      console.error('[CannaiQ DB] Unexpected error on idle client:', err);
    });

    console.log('[CannaiQ DB] Pool initialized');
  }
  return pool;
}

/**
 * @deprecated Use getPool() instead
 */
export function getDutchieAZPool(): Pool {
  console.warn('[CannaiQ DB] getDutchieAZPool() is deprecated. Use getPool() instead.');
  return getPool();
}

/**
 * Execute a query on the CannaiQ database
 */
export async function query<T = any>(text: string, params?: any[]): Promise<{ rows: T[]; rowCount: number }> {
  const p = getPool();
  const result = await p.query(text, params);
  return { rows: result.rows as T[], rowCount: result.rowCount || 0 };
}

/**
 * Get a client from the pool for transaction use
 */
export async function getClient(): Promise<PoolClient> {
  const p = getPool();
  return p.connect();
}

/**
 * Close the pool connection
 */
export async function closePool(): Promise<void> {
  if (pool) {
    await pool.end();
    pool = null;
    console.log('[CannaiQ DB] Pool closed');
  }
}

/**
 * Check if the database is accessible
 */
export async function healthCheck(): Promise<boolean> {
  try {
    const result = await query('SELECT 1 as ok');
    return result.rows.length > 0 && result.rows[0].ok === 1;
  } catch (error) {
    console.error('[CannaiQ DB] Health check failed:', error);
    return false;
  }
}
@@ -1,137 +0,0 @@
/**
 * Dispensary Column Definitions
 *
 * Centralized column list for dispensaries table queries.
 * Handles optional columns that may not exist in all environments.
 *
 * USAGE:
 *   import { DISPENSARY_COLUMNS, DISPENSARY_COLUMNS_WITH_FAILED } from '../db/dispensary-columns';
 *   const result = await query(`SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE ...`);
 */

/**
 * Core dispensary columns that always exist.
 * These are guaranteed to be present in all environments.
 */
const CORE_COLUMNS = `
  id, name, slug, city, state, zip, address, latitude, longitude,
  menu_type, menu_url, platform_dispensary_id, website,
  created_at, updated_at
`;

/**
 * Optional columns with NULL fallback.
 *
 * provider_detection_data: Added in migration 044
 * active_crawler_profile_id: Added in migration 041
 *
 * Selecting such a column directly only works after its migration has run:
 * - If the column exists: the query returns the actual value
 * - If the column doesn't exist: the query fails
 *
 * For pre-migration compatibility, we select NULL::jsonb which always works.
 * After migration 044 is applied, this can be changed to the real column.
 */

// TEMPORARY: Use NULL fallback until migration 044 is applied
// After running 044, change this to: provider_detection_data
const PROVIDER_DETECTION_COLUMN = `NULL::jsonb AS provider_detection_data`;

// After migration 044 is applied, uncomment this line and remove the above:
// const PROVIDER_DETECTION_COLUMN = `provider_detection_data`;

/**
 * Standard dispensary columns for most queries.
 * Includes provider_detection_data with NULL fallback for pre-migration compatibility.
 */
export const DISPENSARY_COLUMNS = `${CORE_COLUMNS.trim()},
  ${PROVIDER_DETECTION_COLUMN}`;

/**
 * Dispensary columns including active_crawler_profile_id.
 * Used by routes that need profile information.
 */
export const DISPENSARY_COLUMNS_WITH_PROFILE = `${CORE_COLUMNS.trim()},
  ${PROVIDER_DETECTION_COLUMN},
  active_crawler_profile_id`;

/**
 * Dispensary columns including failed_at.
 * Used by worker for compatibility checks.
 */
export const DISPENSARY_COLUMNS_WITH_FAILED = `${CORE_COLUMNS.trim()},
  ${PROVIDER_DETECTION_COLUMN},
  failed_at`;

/**
 * NOTE: After migration 044 is applied, update PROVIDER_DETECTION_COLUMN above
 * to use the real column instead of NULL fallback.
 *
 * To verify migration status:
 *   SELECT column_name FROM information_schema.columns
 *   WHERE table_name = 'dispensaries' AND column_name = 'provider_detection_data';
 */

// Cache for column existence check
let _providerDetectionColumnExists: boolean | null = null;

/**
 * Check if provider_detection_data column exists in dispensaries table.
 * Result is cached after first check.
 */
export async function hasProviderDetectionColumn(pool: { query: (sql: string) => Promise<{ rows: any[] }> }): Promise<boolean> {
  if (_providerDetectionColumnExists !== null) {
    return _providerDetectionColumnExists;
  }

  try {
    const result = await pool.query(`
      SELECT 1 FROM information_schema.columns
      WHERE table_name = 'dispensaries' AND column_name = 'provider_detection_data'
    `);
    _providerDetectionColumnExists = result.rows.length > 0;
  } catch {
    _providerDetectionColumnExists = false;
  }

  return _providerDetectionColumnExists;
}

/**
 * Safely update provider_detection_data column.
 * If column doesn't exist, logs a warning but doesn't crash.
 *
 * @param pool - Database pool with query method
 * @param dispensaryId - ID of dispensary to update
 * @param data - JSONB data to merge into provider_detection_data
 * @returns true if update succeeded, false if column doesn't exist
 */
export async function safeUpdateProviderDetectionData(
  pool: { query: (sql: string, params?: any[]) => Promise<any> },
  dispensaryId: number,
  data: Record<string, any>
): Promise<boolean> {
  const hasColumn = await hasProviderDetectionColumn(pool);

  if (!hasColumn) {
    console.warn(`[DispensaryColumns] provider_detection_data column not found. Run migration 044 to add it.`);
    return false;
  }

  try {
    await pool.query(
      `UPDATE dispensaries
       SET provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) || $1::jsonb,
           updated_at = NOW()
       WHERE id = $2`,
      [JSON.stringify(data), dispensaryId]
    );
    return true;
  } catch (error: any) {
    if (error.message?.includes('provider_detection_data')) {
      console.warn(`[DispensaryColumns] Failed to update provider_detection_data: ${error.message}`);
      return false;
    }
    throw error;
  }
}
@@ -1,29 +0,0 @@
/**
 * Dutchie AZ Schema Bootstrap
 *
 * Run this to create/update the dutchie_az tables (dutchie_products, dutchie_product_snapshots, etc.)
 * in the AZ pipeline database. This is separate from the legacy schema.
 *
 * Usage:
 *   TS_NODE_TRANSPILE_ONLY=1 npx ts-node src/dutchie-az/db/migrate.ts
 * or (after build)
 *   node dist/dutchie-az/db/migrate.js
 */

import { createSchema } from './schema';
import { closePool } from './connection';

async function main() {
  try {
    console.log('[DutchieAZ] Running schema migration...');
    await createSchema();
    console.log('[DutchieAZ] Schema migration complete.');
  } catch (err: any) {
    console.error('[DutchieAZ] Schema migration failed:', err.message);
    process.exitCode = 1;
  } finally {
    await closePool();
  }
}

main();
@@ -1,408 +0,0 @@
/**
 * Dutchie AZ Database Schema
 *
 * Creates all tables for the isolated Dutchie Arizona data pipeline.
 * Run this to initialize the dutchie_az database.
 */

import { query, getClient } from './connection';

/**
 * SQL statements to create all tables
 */
const SCHEMA_SQL = `
  -- ============================================================
  -- DISPENSARIES TABLE
  -- Stores discovered Dutchie dispensaries in Arizona
  -- ============================================================
  CREATE TABLE IF NOT EXISTS dispensaries (
    id SERIAL PRIMARY KEY,
    platform VARCHAR(20) NOT NULL DEFAULT 'dutchie',
    name VARCHAR(255) NOT NULL,
    slug VARCHAR(255) NOT NULL,
    city VARCHAR(100) NOT NULL,
    state VARCHAR(10) NOT NULL DEFAULT 'AZ',
    postal_code VARCHAR(20),
    address TEXT,
    latitude DECIMAL(10, 7),
    longitude DECIMAL(10, 7),
    platform_dispensary_id VARCHAR(100),
    is_delivery BOOLEAN DEFAULT false,
    is_pickup BOOLEAN DEFAULT true,
    raw_metadata JSONB,
    last_crawled_at TIMESTAMPTZ,
    product_count INTEGER DEFAULT 0,
    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW(),

    CONSTRAINT uk_dispensaries_platform_slug UNIQUE (platform, slug, city, state)
  );

  CREATE INDEX IF NOT EXISTS idx_dispensaries_platform ON dispensaries(platform);
  CREATE INDEX IF NOT EXISTS idx_dispensaries_platform_id ON dispensaries(platform_dispensary_id);
  CREATE INDEX IF NOT EXISTS idx_dispensaries_state ON dispensaries(state);
  CREATE INDEX IF NOT EXISTS idx_dispensaries_city ON dispensaries(city);

  -- ============================================================
  -- DUTCHIE_PRODUCTS TABLE
  -- Canonical product identity per store
  -- ============================================================
  CREATE TABLE IF NOT EXISTS dutchie_products (
    id SERIAL PRIMARY KEY,
    dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
    platform VARCHAR(20) NOT NULL DEFAULT 'dutchie',

    external_product_id VARCHAR(100) NOT NULL,
    platform_dispensary_id VARCHAR(100) NOT NULL,
    c_name VARCHAR(500),
    name VARCHAR(500) NOT NULL,

    -- Brand
    brand_name VARCHAR(255),
    brand_id VARCHAR(100),
    brand_logo_url TEXT,

    -- Classification
    type VARCHAR(100),
    subcategory VARCHAR(100),
    strain_type VARCHAR(50),
    provider VARCHAR(100),

    -- Potency
    thc DECIMAL(10, 4),
    thc_content DECIMAL(10, 4),
    cbd DECIMAL(10, 4),
    cbd_content DECIMAL(10, 4),
    cannabinoids_v2 JSONB,
    effects JSONB,

    -- Status / flags
    status VARCHAR(50),
    medical_only BOOLEAN DEFAULT false,
    rec_only BOOLEAN DEFAULT false,
    featured BOOLEAN DEFAULT false,
    coming_soon BOOLEAN DEFAULT false,
    certificate_of_analysis_enabled BOOLEAN DEFAULT false,

    is_below_threshold BOOLEAN DEFAULT false,
    is_below_kiosk_threshold BOOLEAN DEFAULT false,
    options_below_threshold BOOLEAN DEFAULT false,
    options_below_kiosk_threshold BOOLEAN DEFAULT false,

    -- Derived stock status: 'in_stock', 'out_of_stock', 'unknown'
    stock_status VARCHAR(20) DEFAULT 'unknown',
    total_quantity_available INTEGER DEFAULT 0,

    -- Images
    primary_image_url TEXT,
    images JSONB,

    -- Misc
    measurements JSONB,
    weight VARCHAR(50),
    past_c_names TEXT[],

    created_at_dutchie TIMESTAMPTZ,
    updated_at_dutchie TIMESTAMPTZ,

    latest_raw_payload JSONB,

    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW(),

    CONSTRAINT uk_dutchie_products UNIQUE (dispensary_id, external_product_id)
  );

  CREATE INDEX IF NOT EXISTS idx_dutchie_products_dispensary ON dutchie_products(dispensary_id);
  CREATE INDEX IF NOT EXISTS idx_dutchie_products_external_id ON dutchie_products(external_product_id);
  CREATE INDEX IF NOT EXISTS idx_dutchie_products_platform_disp ON dutchie_products(platform_dispensary_id);
  CREATE INDEX IF NOT EXISTS idx_dutchie_products_brand ON dutchie_products(brand_name);
  CREATE INDEX IF NOT EXISTS idx_dutchie_products_type ON dutchie_products(type);
  CREATE INDEX IF NOT EXISTS idx_dutchie_products_subcategory ON dutchie_products(subcategory);
  CREATE INDEX IF NOT EXISTS idx_dutchie_products_status ON dutchie_products(status);
  CREATE INDEX IF NOT EXISTS idx_dutchie_products_strain ON dutchie_products(strain_type);
  CREATE INDEX IF NOT EXISTS idx_dutchie_products_stock_status ON dutchie_products(stock_status);

  -- ============================================================
  -- DUTCHIE_PRODUCT_SNAPSHOTS TABLE
  -- Historical state per crawl, includes options[]
  -- ============================================================
  CREATE TABLE IF NOT EXISTS dutchie_product_snapshots (
    id SERIAL PRIMARY KEY,
    dutchie_product_id INTEGER NOT NULL REFERENCES dutchie_products(id) ON DELETE CASCADE,
    dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
    platform_dispensary_id VARCHAR(100) NOT NULL,
    external_product_id VARCHAR(100) NOT NULL,
    pricing_type VARCHAR(20) DEFAULT 'unknown',
    crawl_mode VARCHAR(20) DEFAULT 'mode_a', -- 'mode_a' (UI parity) or 'mode_b' (max coverage)

    status VARCHAR(50),
    featured BOOLEAN DEFAULT false,
    special BOOLEAN DEFAULT false,
    medical_only BOOLEAN DEFAULT false,
    rec_only BOOLEAN DEFAULT false,

    -- Flag indicating if product was present in feed (false = missing_from_feed snapshot)
    is_present_in_feed BOOLEAN DEFAULT true,

    -- Derived stock status
    stock_status VARCHAR(20) DEFAULT 'unknown',

    -- Price summary (in cents)
    rec_min_price_cents INTEGER,
    rec_max_price_cents INTEGER,
    rec_min_special_price_cents INTEGER,
    med_min_price_cents INTEGER,
    med_max_price_cents INTEGER,
    med_min_special_price_cents INTEGER,
    wholesale_min_price_cents INTEGER,

    -- Inventory summary
    total_quantity_available INTEGER,
    total_kiosk_quantity_available INTEGER,
    manual_inventory BOOLEAN DEFAULT false,
    is_below_threshold BOOLEAN DEFAULT false,
    is_below_kiosk_threshold BOOLEAN DEFAULT false,

    -- Option-level data (from POSMetaData.children)
    options JSONB,

    -- Full raw product node
    raw_payload JSONB NOT NULL,

    crawled_at TIMESTAMPTZ NOT NULL,
    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW()
  );

  CREATE INDEX IF NOT EXISTS idx_snapshots_product ON dutchie_product_snapshots(dutchie_product_id);
  CREATE INDEX IF NOT EXISTS idx_snapshots_dispensary ON dutchie_product_snapshots(dispensary_id);
  CREATE INDEX IF NOT EXISTS idx_snapshots_crawled_at ON dutchie_product_snapshots(crawled_at);
  CREATE INDEX IF NOT EXISTS idx_snapshots_platform_disp ON dutchie_product_snapshots(platform_dispensary_id);
  CREATE INDEX IF NOT EXISTS idx_snapshots_external_id ON dutchie_product_snapshots(external_product_id);
  CREATE INDEX IF NOT EXISTS idx_snapshots_special ON dutchie_product_snapshots(special) WHERE special = true;
  CREATE INDEX IF NOT EXISTS idx_snapshots_stock_status ON dutchie_product_snapshots(stock_status);
  CREATE INDEX IF NOT EXISTS idx_snapshots_crawl_mode ON dutchie_product_snapshots(crawl_mode);

  -- ============================================================
  -- CRAWL_JOBS TABLE
  -- Tracks crawl execution status
  -- ============================================================
  CREATE TABLE IF NOT EXISTS crawl_jobs (
    id SERIAL PRIMARY KEY,
    job_type VARCHAR(50) NOT NULL,
    dispensary_id INTEGER REFERENCES dispensaries(id) ON DELETE SET NULL,
    status VARCHAR(20) NOT NULL DEFAULT 'pending',
    started_at TIMESTAMPTZ,
    completed_at TIMESTAMPTZ,
    error_message TEXT,
    products_found INTEGER,
    snapshots_created INTEGER,
    metadata JSONB,
    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW()
  );

  CREATE INDEX IF NOT EXISTS idx_crawl_jobs_type ON crawl_jobs(job_type);
  CREATE INDEX IF NOT EXISTS idx_crawl_jobs_status ON crawl_jobs(status);
  CREATE INDEX IF NOT EXISTS idx_crawl_jobs_dispensary ON crawl_jobs(dispensary_id);
  CREATE INDEX IF NOT EXISTS idx_crawl_jobs_created ON crawl_jobs(created_at);

  -- ============================================================
  -- JOB_SCHEDULES TABLE
  -- Stores schedule configuration for recurring jobs with jitter support
  -- Each job has independent timing that "wanders" over time
  -- ============================================================
  CREATE TABLE IF NOT EXISTS job_schedules (
    id SERIAL PRIMARY KEY,
    job_name VARCHAR(100) NOT NULL UNIQUE,
    description TEXT,
    enabled BOOLEAN DEFAULT true,

    -- Timing configuration (jitter makes times "wander")
    base_interval_minutes INTEGER NOT NULL DEFAULT 240, -- e.g., 4 hours
    jitter_minutes INTEGER NOT NULL DEFAULT 30,         -- e.g., ±30 min

    -- Last run tracking
    last_run_at TIMESTAMPTZ,
    last_status VARCHAR(20), -- 'success', 'error', 'partial', 'running'
    last_error_message TEXT,
    last_duration_ms INTEGER,

    -- Next run (calculated with jitter after each run)
    next_run_at TIMESTAMPTZ,

    -- Additional config
    job_config JSONB, -- e.g., { pricingType: 'rec', useBothModes: true }

    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW()
  );

  CREATE INDEX IF NOT EXISTS idx_job_schedules_enabled ON job_schedules(enabled);
  CREATE INDEX IF NOT EXISTS idx_job_schedules_next_run ON job_schedules(next_run_at);

  -- ============================================================
  -- JOB_RUN_LOGS TABLE
  -- Stores history of job runs for monitoring
  -- ============================================================
  CREATE TABLE IF NOT EXISTS job_run_logs (
    id SERIAL PRIMARY KEY,
    schedule_id INTEGER NOT NULL REFERENCES job_schedules(id) ON DELETE CASCADE,
    job_name VARCHAR(100) NOT NULL,
    status VARCHAR(20) NOT NULL, -- 'pending', 'running', 'success', 'error', 'partial'
    started_at TIMESTAMPTZ,
    completed_at TIMESTAMPTZ,
    duration_ms INTEGER,
    error_message TEXT,

    -- Results summary
    items_processed INTEGER,
    items_succeeded INTEGER,
    items_failed INTEGER,

    metadata JSONB, -- Additional run details

    created_at TIMESTAMPTZ DEFAULT NOW()
  );

  CREATE INDEX IF NOT EXISTS idx_job_run_logs_schedule ON job_run_logs(schedule_id);
  CREATE INDEX IF NOT EXISTS idx_job_run_logs_job_name ON job_run_logs(job_name);
  CREATE INDEX IF NOT EXISTS idx_job_run_logs_status ON job_run_logs(status);
  CREATE INDEX IF NOT EXISTS idx_job_run_logs_created ON job_run_logs(created_at);

  -- ============================================================
  -- VIEWS FOR EASY QUERYING
  -- ============================================================

  -- Categories derived from products
  CREATE OR REPLACE VIEW v_categories AS
  SELECT
    type,
    subcategory,
    COUNT(DISTINCT id) as product_count,
    COUNT(DISTINCT dispensary_id) as dispensary_count,
    AVG(thc) as avg_thc,
    MIN(thc) as min_thc,
    MAX(thc) as max_thc
  FROM dutchie_products
  WHERE type IS NOT NULL
  GROUP BY type, subcategory
  ORDER BY type, subcategory;

  -- Brands derived from products
  CREATE OR REPLACE VIEW v_brands AS
  SELECT
    brand_name,
    brand_id,
    MAX(brand_logo_url) as brand_logo_url,
    COUNT(DISTINCT id) as product_count,
    COUNT(DISTINCT dispensary_id) as dispensary_count,
    ARRAY_AGG(DISTINCT type) FILTER (WHERE type IS NOT NULL) as product_types
  FROM dutchie_products
  WHERE brand_name IS NOT NULL
  GROUP BY brand_name, brand_id
  ORDER BY product_count DESC;

  -- Latest snapshot per product (most recent crawl data)
  CREATE OR REPLACE VIEW v_latest_snapshots AS
  SELECT DISTINCT ON (dutchie_product_id)
    s.*
  FROM dutchie_product_snapshots s
  ORDER BY dutchie_product_id, crawled_at DESC;

  -- Dashboard stats
  CREATE OR REPLACE VIEW v_dashboard_stats AS
  SELECT
    (SELECT COUNT(*) FROM dispensaries WHERE state = 'AZ') as dispensary_count,
    (SELECT COUNT(*) FROM dutchie_products) as product_count,
    (SELECT COUNT(*) FROM dutchie_product_snapshots WHERE crawled_at > NOW() - INTERVAL '24 hours') as snapshots_24h,
    (SELECT MAX(crawled_at) FROM dutchie_product_snapshots) as last_crawl_time,
    (SELECT COUNT(*) FROM crawl_jobs WHERE status = 'failed' AND created_at > NOW() - INTERVAL '24 hours') as failed_jobs_24h,
    (SELECT COUNT(DISTINCT brand_name) FROM dutchie_products WHERE brand_name IS NOT NULL) as brand_count,
    (SELECT COUNT(DISTINCT (type, subcategory)) FROM dutchie_products WHERE type IS NOT NULL) as category_count;
`;

/**
 * Run the schema migration
 */
export async function createSchema(): Promise<void> {
  console.log('[DutchieAZ Schema] Creating database schema...');

  const client = await getClient();

  try {
    await client.query('BEGIN');

    // Split into individual statements and execute.
    // Strip comment-only lines first; otherwise a fragment that begins
    // with a "-- ..." banner would be filtered out entirely.
    const statements = SCHEMA_SQL
      .split(';')
      .map(s => s
        .split('\n')
        .filter(line => !line.trim().startsWith('--'))
        .join('\n')
        .trim())
      .filter(s => s.length > 0);
|
||||
for (const statement of statements) {
|
||||
if (statement.trim()) {
|
||||
await client.query(statement + ';');
|
||||
}
|
||||
}
|
||||
|
||||
await client.query('COMMIT');
|
||||
console.log('[DutchieAZ Schema] Schema created successfully');
|
||||
} catch (error) {
|
||||
await client.query('ROLLBACK');
|
||||
console.error('[DutchieAZ Schema] Failed to create schema:', error);
|
||||
throw error;
|
||||
} finally {
|
||||
client.release();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Drop all tables (for development/testing)
|
||||
*/
|
||||
export async function dropSchema(): Promise<void> {
|
||||
console.log('[DutchieAZ Schema] Dropping all tables...');
|
||||
|
||||
await query(`
|
||||
DROP VIEW IF EXISTS v_dashboard_stats CASCADE;
|
||||
DROP VIEW IF EXISTS v_latest_snapshots CASCADE;
|
||||
DROP VIEW IF EXISTS v_brands CASCADE;
|
||||
DROP VIEW IF EXISTS v_categories CASCADE;
|
||||
DROP TABLE IF EXISTS crawl_schedule CASCADE;
|
||||
DROP TABLE IF EXISTS crawl_jobs CASCADE;
|
||||
DROP TABLE IF EXISTS dutchie_product_snapshots CASCADE;
|
||||
DROP TABLE IF EXISTS dutchie_products CASCADE;
|
||||
DROP TABLE IF EXISTS dispensaries CASCADE;
|
||||
`);
|
||||
|
||||
console.log('[DutchieAZ Schema] All tables dropped');
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if schema exists
|
||||
*/
|
||||
export async function schemaExists(): Promise<boolean> {
|
||||
try {
|
||||
const result = await query(`
|
||||
SELECT EXISTS (
|
||||
SELECT FROM information_schema.tables
|
||||
WHERE table_name = 'dispensaries'
|
||||
) as exists
|
||||
`);
|
||||
return result.rows[0]?.exists === true;
|
||||
} catch (error) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize schema if it doesn't exist
|
||||
*/
|
||||
export async function ensureSchema(): Promise<void> {
|
||||
const exists = await schemaExists();
|
||||
if (!exists) {
|
||||
await createSchema();
|
||||
} else {
|
||||
console.log('[DutchieAZ Schema] Schema already exists');
|
||||
}
|
||||
}
|
||||
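Since ensureSchema() is idempotent, a startup path only needs to call it once before doing any work. A minimal wiring sketch, assuming the helpers live in a local './schema' module (the import path and worker framing are assumptions, not shown in this diff):

// Hypothetical startup wiring for ensureSchema(); './schema' is an assumed path.
import { ensureSchema } from './schema';

async function start(): Promise<void> {
  await ensureSchema(); // creates tables/views on first boot, no-ops afterwards
  console.log('[Worker] Schema ready');
}

start().catch((err) => {
  console.error('[Worker] Startup failed:', err);
  process.exit(1);
});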
@@ -1,403 +0,0 @@
/**
 * DtCityDiscoveryService
 *
 * Core service for Dutchie city discovery.
 * Contains shared logic used by multiple entrypoints.
 *
 * Responsibilities:
 * - Browser/API-based city fetching
 * - Manual city seeding
 * - City upsert operations
 */

import { Pool } from 'pg';
import axios from 'axios';
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';

puppeteer.use(StealthPlugin());

// ============================================================
// TYPES
// ============================================================

export interface DutchieCity {
  name: string;
  slug: string;
  stateCode: string | null;
  countryCode: string;
  url?: string;
}

export interface CityDiscoveryResult {
  citiesFound: number;
  citiesInserted: number;
  citiesUpdated: number;
  errors: string[];
  durationMs: number;
}

export interface ManualSeedResult {
  city: DutchieCity;
  id: number;
  wasInserted: boolean;
}

// ============================================================
// US STATE CODE MAPPING
// ============================================================

export const US_STATE_MAP: Record<string, string> = {
  'alabama': 'AL', 'alaska': 'AK', 'arizona': 'AZ', 'arkansas': 'AR',
  'california': 'CA', 'colorado': 'CO', 'connecticut': 'CT', 'delaware': 'DE',
  'florida': 'FL', 'georgia': 'GA', 'hawaii': 'HI', 'idaho': 'ID',
  'illinois': 'IL', 'indiana': 'IN', 'iowa': 'IA', 'kansas': 'KS',
  'kentucky': 'KY', 'louisiana': 'LA', 'maine': 'ME', 'maryland': 'MD',
  'massachusetts': 'MA', 'michigan': 'MI', 'minnesota': 'MN', 'mississippi': 'MS',
  'missouri': 'MO', 'montana': 'MT', 'nebraska': 'NE', 'nevada': 'NV',
  'new-hampshire': 'NH', 'new-jersey': 'NJ', 'new-mexico': 'NM', 'new-york': 'NY',
  'north-carolina': 'NC', 'north-dakota': 'ND', 'ohio': 'OH', 'oklahoma': 'OK',
  'oregon': 'OR', 'pennsylvania': 'PA', 'rhode-island': 'RI', 'south-carolina': 'SC',
  'south-dakota': 'SD', 'tennessee': 'TN', 'texas': 'TX', 'utah': 'UT',
  'vermont': 'VT', 'virginia': 'VA', 'washington': 'WA', 'west-virginia': 'WV',
  'wisconsin': 'WI', 'wyoming': 'WY', 'district-of-columbia': 'DC',
};

// Canadian province mapping
export const CA_PROVINCE_MAP: Record<string, string> = {
  'alberta': 'AB', 'british-columbia': 'BC', 'manitoba': 'MB',
  'new-brunswick': 'NB', 'newfoundland-and-labrador': 'NL',
  'northwest-territories': 'NT', 'nova-scotia': 'NS', 'nunavut': 'NU',
  'ontario': 'ON', 'prince-edward-island': 'PE', 'quebec': 'QC',
  'saskatchewan': 'SK', 'yukon': 'YT',
};

// ============================================================
// CITY FETCHING (AUTO DISCOVERY)
// ============================================================

/**
 * Fetch cities from Dutchie's /cities page using Puppeteer.
 */
export async function fetchCitiesFromBrowser(): Promise<DutchieCity[]> {
  console.log('[DtCityDiscoveryService] Launching browser to fetch cities...');

  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
  });

  try {
    const page = await browser.newPage();
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    );

    console.log('[DtCityDiscoveryService] Navigating to https://dutchie.com/cities...');
    await page.goto('https://dutchie.com/cities', {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });

    await new Promise((r) => setTimeout(r, 3000));

    const cities = await page.evaluate(() => {
      const cityLinks: Array<{
        name: string;
        slug: string;
        url: string;
        stateSlug: string | null;
      }> = [];

      const links = document.querySelectorAll('a[href*="/city/"]');
      links.forEach((link) => {
        const href = (link as HTMLAnchorElement).href;
        const text = (link as HTMLElement).innerText?.trim();

        const match = href.match(/\/city\/([^/]+)\/([^/?]+)/);
        if (match && text) {
          cityLinks.push({
            name: text,
            slug: match[2],
            url: href,
            stateSlug: match[1],
          });
        }
      });

      return cityLinks;
    });

    console.log(`[DtCityDiscoveryService] Extracted ${cities.length} city links from page`);

    return cities.map((city) => {
      let countryCode = 'US';
      let stateCode: string | null = null;

      if (city.stateSlug) {
        if (US_STATE_MAP[city.stateSlug]) {
          stateCode = US_STATE_MAP[city.stateSlug];
          countryCode = 'US';
        } else if (CA_PROVINCE_MAP[city.stateSlug]) {
          stateCode = CA_PROVINCE_MAP[city.stateSlug];
          countryCode = 'CA';
        } else if (city.stateSlug.length === 2) {
          stateCode = city.stateSlug.toUpperCase();
          if (Object.values(CA_PROVINCE_MAP).includes(stateCode)) {
            countryCode = 'CA';
          }
        }
      }

      return {
        name: city.name,
        slug: city.slug,
        stateCode,
        countryCode,
        url: city.url,
      };
    });
  } finally {
    await browser.close();
  }
}

/**
 * Fetch cities via API endpoints (fallback).
 */
export async function fetchCitiesFromAPI(): Promise<DutchieCity[]> {
  console.log('[DtCityDiscoveryService] Attempting API-based city discovery...');

  const apiEndpoints = [
    'https://dutchie.com/api/cities',
    'https://api.dutchie.com/v1/cities',
  ];

  for (const endpoint of apiEndpoints) {
    try {
      const response = await axios.get(endpoint, {
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0',
          Accept: 'application/json',
        },
        timeout: 15000,
      });

      if (response.data && Array.isArray(response.data)) {
        console.log(`[DtCityDiscoveryService] API returned ${response.data.length} cities`);
        return response.data.map((c: any) => ({
          name: c.name || c.city,
          slug: c.slug || c.citySlug,
          stateCode: c.stateCode || c.state,
          countryCode: c.countryCode || c.country || 'US',
        }));
      }
    } catch (error: any) {
      console.log(`[DtCityDiscoveryService] API ${endpoint} failed: ${error.message}`);
    }
  }

  return [];
}

// ============================================================
// DATABASE OPERATIONS
// ============================================================

/**
 * Upsert a city into dutchie_discovery_cities
 */
export async function upsertCity(
  pool: Pool,
  city: DutchieCity
): Promise<{ id: number; inserted: boolean; updated: boolean }> {
  const result = await pool.query(
    `
    INSERT INTO dutchie_discovery_cities (
      platform,
      city_name,
      city_slug,
      state_code,
      country_code,
      crawl_enabled,
      created_at,
      updated_at
    ) VALUES (
      'dutchie',
      $1,
      $2,
      $3,
      $4,
      TRUE,
      NOW(),
      NOW()
    )
    ON CONFLICT (platform, country_code, state_code, city_slug)
    DO UPDATE SET
      city_name = EXCLUDED.city_name,
      crawl_enabled = TRUE,
      updated_at = NOW()
    RETURNING id, (xmax = 0) AS inserted
    `,
    [city.name, city.slug, city.stateCode, city.countryCode]
  );

  const inserted = result.rows[0]?.inserted === true;
  return {
    id: result.rows[0]?.id,
    inserted,
    updated: !inserted,
  };
}

// ============================================================
// MAIN SERVICE CLASS
// ============================================================

export class DtCityDiscoveryService {
  constructor(private pool: Pool) {}

  /**
   * Run auto-discovery (browser + API fallback)
   */
  async runAutoDiscovery(): Promise<CityDiscoveryResult> {
    const startTime = Date.now();
    const errors: string[] = [];
    let citiesFound = 0;
    let citiesInserted = 0;
    let citiesUpdated = 0;

    console.log('[DtCityDiscoveryService] Starting auto city discovery...');

    try {
      let cities = await fetchCitiesFromBrowser();

      if (cities.length === 0) {
        console.log('[DtCityDiscoveryService] Browser returned 0 cities, trying API...');
        cities = await fetchCitiesFromAPI();
      }

      citiesFound = cities.length;
      console.log(`[DtCityDiscoveryService] Found ${citiesFound} cities`);

      for (const city of cities) {
        try {
          const result = await upsertCity(this.pool, city);
          if (result.inserted) citiesInserted++;
          else if (result.updated) citiesUpdated++;
        } catch (error: any) {
          const msg = `Failed to upsert city ${city.slug}: ${error.message}`;
          console.error(`[DtCityDiscoveryService] ${msg}`);
          errors.push(msg);
        }
      }
    } catch (error: any) {
      const msg = `Auto discovery failed: ${error.message}`;
      console.error(`[DtCityDiscoveryService] ${msg}`);
      errors.push(msg);
    }

    const durationMs = Date.now() - startTime;

    return {
      citiesFound,
      citiesInserted,
      citiesUpdated,
      errors,
      durationMs,
    };
  }

  /**
   * Seed a single city manually
   */
  async seedCity(city: DutchieCity): Promise<ManualSeedResult> {
    console.log(`[DtCityDiscoveryService] Seeding city: ${city.name} (${city.slug}), ${city.stateCode}, ${city.countryCode}`);

    const result = await upsertCity(this.pool, city);

    return {
      city,
      id: result.id,
      wasInserted: result.inserted,
    };
  }

  /**
   * Seed multiple cities from a list
   */
  async seedCities(cities: DutchieCity[]): Promise<{
    results: ManualSeedResult[];
    errors: string[];
  }> {
    const results: ManualSeedResult[] = [];
    const errors: string[] = [];

    for (const city of cities) {
      try {
        const result = await this.seedCity(city);
        results.push(result);
      } catch (error: any) {
        errors.push(`${city.slug}: ${error.message}`);
      }
    }

    return { results, errors };
  }

  /**
   * Get statistics about discovered cities
   */
  async getStats(): Promise<{
    total: number;
    byCountry: Array<{ countryCode: string; count: number }>;
    byState: Array<{ stateCode: string; countryCode: string; count: number }>;
    crawlEnabled: number;
    neverCrawled: number;
  }> {
    const [totalRes, byCountryRes, byStateRes, enabledRes, neverRes] = await Promise.all([
      this.pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_cities WHERE platform = \'dutchie\''),
      this.pool.query(`
        SELECT country_code, COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE platform = 'dutchie'
        GROUP BY country_code
        ORDER BY cnt DESC
      `),
      this.pool.query(`
        SELECT state_code, country_code, COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE platform = 'dutchie' AND state_code IS NOT NULL
        GROUP BY state_code, country_code
        ORDER BY cnt DESC
      `),
      this.pool.query(`
        SELECT COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE platform = 'dutchie' AND crawl_enabled = TRUE
      `),
      this.pool.query(`
        SELECT COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE platform = 'dutchie' AND last_crawled_at IS NULL
      `),
    ]);

    return {
      total: parseInt(totalRes.rows[0]?.cnt || '0', 10),
      byCountry: byCountryRes.rows.map((r) => ({
        countryCode: r.country_code,
        count: parseInt(r.cnt, 10),
      })),
      byState: byStateRes.rows.map((r) => ({
        stateCode: r.state_code,
        countryCode: r.country_code,
        count: parseInt(r.cnt, 10),
      })),
      crawlEnabled: parseInt(enabledRes.rows[0]?.cnt || '0', 10),
      neverCrawled: parseInt(neverRes.rows[0]?.cnt || '0', 10),
    };
  }
}

export default DtCityDiscoveryService;
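A minimal usage sketch for the service above, assuming only a reachable Postgres via DATABASE_URL; this mirrors what the entrypoint runners further down in this diff do:

// Minimal sketch: run auto-discovery once and print the counters.
import { Pool } from 'pg';
import { DtCityDiscoveryService } from './DtCityDiscoveryService';

async function demo(): Promise<void> {
  const pool = new Pool({ connectionString: process.env.DATABASE_URL });
  try {
    const service = new DtCityDiscoveryService(pool);
    const result = await service.runAutoDiscovery();
    console.log(`found=${result.citiesFound} inserted=${result.citiesInserted} updated=${result.citiesUpdated}`);
  } finally {
    await pool.end();
  }
}

demo();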
File diff suppressed because it is too large
@@ -1,390 +0,0 @@
/**
 * DutchieCityDiscovery
 *
 * Discovers cities from Dutchie's /cities page and upserts to dutchie_discovery_cities.
 *
 * Responsibilities:
 * - Fetch all cities available on Dutchie
 * - For each city derive: city_name, city_slug, state_code, country_code
 * - Upsert into dutchie_discovery_cities
 */

import { Pool } from 'pg';
import axios from 'axios';
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import type { Browser, Page } from 'puppeteer';

puppeteer.use(StealthPlugin());

// ============================================================
// TYPES
// ============================================================

export interface DutchieCity {
  name: string;
  slug: string;
  stateCode: string | null;
  countryCode: string;
  url?: string;
}

export interface CityDiscoveryResult {
  citiesFound: number;
  citiesInserted: number;
  citiesUpdated: number;
  errors: string[];
  durationMs: number;
}

// ============================================================
// US STATE CODE MAPPING
// ============================================================

const US_STATE_MAP: Record<string, string> = {
  'alabama': 'AL', 'alaska': 'AK', 'arizona': 'AZ', 'arkansas': 'AR',
  'california': 'CA', 'colorado': 'CO', 'connecticut': 'CT', 'delaware': 'DE',
  'florida': 'FL', 'georgia': 'GA', 'hawaii': 'HI', 'idaho': 'ID',
  'illinois': 'IL', 'indiana': 'IN', 'iowa': 'IA', 'kansas': 'KS',
  'kentucky': 'KY', 'louisiana': 'LA', 'maine': 'ME', 'maryland': 'MD',
  'massachusetts': 'MA', 'michigan': 'MI', 'minnesota': 'MN', 'mississippi': 'MS',
  'missouri': 'MO', 'montana': 'MT', 'nebraska': 'NE', 'nevada': 'NV',
  'new-hampshire': 'NH', 'new-jersey': 'NJ', 'new-mexico': 'NM', 'new-york': 'NY',
  'north-carolina': 'NC', 'north-dakota': 'ND', 'ohio': 'OH', 'oklahoma': 'OK',
  'oregon': 'OR', 'pennsylvania': 'PA', 'rhode-island': 'RI', 'south-carolina': 'SC',
  'south-dakota': 'SD', 'tennessee': 'TN', 'texas': 'TX', 'utah': 'UT',
  'vermont': 'VT', 'virginia': 'VA', 'washington': 'WA', 'west-virginia': 'WV',
  'wisconsin': 'WI', 'wyoming': 'WY', 'district-of-columbia': 'DC',
};

// Canadian province mapping
const CA_PROVINCE_MAP: Record<string, string> = {
  'alberta': 'AB', 'british-columbia': 'BC', 'manitoba': 'MB',
  'new-brunswick': 'NB', 'newfoundland-and-labrador': 'NL',
  'northwest-territories': 'NT', 'nova-scotia': 'NS', 'nunavut': 'NU',
  'ontario': 'ON', 'prince-edward-island': 'PE', 'quebec': 'QC',
  'saskatchewan': 'SK', 'yukon': 'YT',
};

// ============================================================
// CITY FETCHING
// ============================================================

/**
 * Fetch cities from Dutchie's /cities page using Puppeteer to extract data.
 */
async function fetchCitiesFromDutchie(): Promise<DutchieCity[]> {
  console.log('[DutchieCityDiscovery] Launching browser to fetch cities...');

  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
  });

  try {
    const page = await browser.newPage();
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    );

    // Navigate to cities page
    console.log('[DutchieCityDiscovery] Navigating to https://dutchie.com/cities...');
    await page.goto('https://dutchie.com/cities', {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });

    // Wait for content to load
    await new Promise((r) => setTimeout(r, 3000));

    // Extract city links from the page
    const cities = await page.evaluate(() => {
      const cityLinks: Array<{
        name: string;
        slug: string;
        url: string;
        stateSlug: string | null;
      }> = [];

      // Find all city links - they typically follow pattern /city/{state}/{city}
      const links = document.querySelectorAll('a[href*="/city/"]');
      links.forEach((link) => {
        const href = (link as HTMLAnchorElement).href;
        const text = (link as HTMLElement).innerText?.trim();

        // Parse URL: https://dutchie.com/city/{state}/{city}
        const match = href.match(/\/city\/([^/]+)\/([^/?]+)/);
        if (match && text) {
          cityLinks.push({
            name: text,
            slug: match[2],
            url: href,
            stateSlug: match[1],
          });
        }
      });

      return cityLinks;
    });

    console.log(`[DutchieCityDiscovery] Extracted ${cities.length} city links from page`);

    // Convert to DutchieCity format
    const result: DutchieCity[] = [];

    for (const city of cities) {
      // Determine country and state code
      let countryCode = 'US';
      let stateCode: string | null = null;

      if (city.stateSlug) {
        // Check if it's a US state
        if (US_STATE_MAP[city.stateSlug]) {
          stateCode = US_STATE_MAP[city.stateSlug];
          countryCode = 'US';
        }
        // Check if it's a Canadian province
        else if (CA_PROVINCE_MAP[city.stateSlug]) {
          stateCode = CA_PROVINCE_MAP[city.stateSlug];
          countryCode = 'CA';
        }
        // Check if it's already a 2-letter code
        else if (city.stateSlug.length === 2) {
          stateCode = city.stateSlug.toUpperCase();
          // Determine country based on state code
          if (Object.values(CA_PROVINCE_MAP).includes(stateCode)) {
            countryCode = 'CA';
          }
        }
      }

      result.push({
        name: city.name,
        slug: city.slug,
        stateCode,
        countryCode,
        url: city.url,
      });
    }

    return result;
  } finally {
    await browser.close();
  }
}

/**
 * Alternative: Fetch cities by making API/GraphQL requests.
 * Falls back to this if scraping fails.
 */
async function fetchCitiesFromAPI(): Promise<DutchieCity[]> {
  console.log('[DutchieCityDiscovery] Attempting API-based city discovery...');

  // Dutchie may have an API endpoint for cities
  // Try common patterns
  const apiEndpoints = [
    'https://dutchie.com/api/cities',
    'https://api.dutchie.com/v1/cities',
  ];

  for (const endpoint of apiEndpoints) {
    try {
      const response = await axios.get(endpoint, {
        headers: {
          'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0',
          Accept: 'application/json',
        },
        timeout: 15000,
      });

      if (response.data && Array.isArray(response.data)) {
        console.log(`[DutchieCityDiscovery] API returned ${response.data.length} cities`);
        return response.data.map((c: any) => ({
          name: c.name || c.city,
          slug: c.slug || c.citySlug,
          stateCode: c.stateCode || c.state,
          countryCode: c.countryCode || c.country || 'US',
        }));
      }
    } catch (error: any) {
      console.log(`[DutchieCityDiscovery] API ${endpoint} failed: ${error.message}`);
    }
  }

  return [];
}

// ============================================================
// DATABASE OPERATIONS
// ============================================================

/**
 * Upsert a city into dutchie_discovery_cities
 */
async function upsertCity(
  pool: Pool,
  city: DutchieCity
): Promise<{ inserted: boolean; updated: boolean }> {
  const result = await pool.query(
    `
    INSERT INTO dutchie_discovery_cities (
      platform,
      city_name,
      city_slug,
      state_code,
      country_code,
      last_crawled_at,
      updated_at
    ) VALUES (
      'dutchie',
      $1,
      $2,
      $3,
      $4,
      NOW(),
      NOW()
    )
    ON CONFLICT (platform, country_code, state_code, city_slug)
    DO UPDATE SET
      city_name = EXCLUDED.city_name,
      last_crawled_at = NOW(),
      updated_at = NOW()
    RETURNING (xmax = 0) AS inserted
    `,
    [city.name, city.slug, city.stateCode, city.countryCode]
  );

  const inserted = result.rows[0]?.inserted === true;
  return { inserted, updated: !inserted };
}

// ============================================================
// MAIN DISCOVERY FUNCTION
// ============================================================

export class DutchieCityDiscovery {
  private pool: Pool;

  constructor(pool: Pool) {
    this.pool = pool;
  }

  /**
   * Run the city discovery process
   */
  async run(): Promise<CityDiscoveryResult> {
    const startTime = Date.now();
    const errors: string[] = [];
    let citiesFound = 0;
    let citiesInserted = 0;
    let citiesUpdated = 0;

    console.log('[DutchieCityDiscovery] Starting city discovery...');

    try {
      // Try scraping first, fall back to API
      let cities = await fetchCitiesFromDutchie();

      if (cities.length === 0) {
        console.log('[DutchieCityDiscovery] Scraping returned 0 cities, trying API...');
        cities = await fetchCitiesFromAPI();
      }

      citiesFound = cities.length;
      console.log(`[DutchieCityDiscovery] Found ${citiesFound} cities`);

      // Upsert each city
      for (const city of cities) {
        try {
          const result = await upsertCity(this.pool, city);
          if (result.inserted) {
            citiesInserted++;
          } else if (result.updated) {
            citiesUpdated++;
          }
        } catch (error: any) {
          const msg = `Failed to upsert city ${city.slug}: ${error.message}`;
          console.error(`[DutchieCityDiscovery] ${msg}`);
          errors.push(msg);
        }
      }
    } catch (error: any) {
      const msg = `City discovery failed: ${error.message}`;
      console.error(`[DutchieCityDiscovery] ${msg}`);
      errors.push(msg);
    }

    const durationMs = Date.now() - startTime;

    console.log('[DutchieCityDiscovery] Discovery complete:');
    console.log(`  Cities found: ${citiesFound}`);
    console.log(`  Inserted: ${citiesInserted}`);
    console.log(`  Updated: ${citiesUpdated}`);
    console.log(`  Errors: ${errors.length}`);
    console.log(`  Duration: ${(durationMs / 1000).toFixed(1)}s`);

    return {
      citiesFound,
      citiesInserted,
      citiesUpdated,
      errors,
      durationMs,
    };
  }

  /**
   * Get statistics about discovered cities
   */
  async getStats(): Promise<{
    total: number;
    byCountry: Array<{ countryCode: string; count: number }>;
    byState: Array<{ stateCode: string; countryCode: string; count: number }>;
    crawlEnabled: number;
    neverCrawled: number;
  }> {
    const [totalRes, byCountryRes, byStateRes, enabledRes, neverRes] = await Promise.all([
      this.pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_cities'),
      this.pool.query(`
        SELECT country_code, COUNT(*) as cnt
        FROM dutchie_discovery_cities
        GROUP BY country_code
        ORDER BY cnt DESC
      `),
      this.pool.query(`
        SELECT state_code, country_code, COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE state_code IS NOT NULL
        GROUP BY state_code, country_code
        ORDER BY cnt DESC
      `),
      this.pool.query(`
        SELECT COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE crawl_enabled = TRUE
      `),
      this.pool.query(`
        SELECT COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE last_crawled_at IS NULL
      `),
    ]);

    return {
      total: parseInt(totalRes.rows[0]?.cnt || '0', 10),
      byCountry: byCountryRes.rows.map((r) => ({
        countryCode: r.country_code,
        count: parseInt(r.cnt, 10),
      })),
      byState: byStateRes.rows.map((r) => ({
        stateCode: r.state_code,
        countryCode: r.country_code,
        count: parseInt(r.cnt, 10),
      })),
      crawlEnabled: parseInt(enabledRes.rows[0]?.cnt || '0', 10),
      neverCrawled: parseInt(neverRes.rows[0]?.cnt || '0', 10),
    };
  }
}

export default DutchieCityDiscovery;
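Both versions of upsertCity distinguish insert from update with RETURNING (xmax = 0) AS inserted: PostgreSQL leaves xmax at 0 on a freshly inserted row version and sets it non-zero when ON CONFLICT ... DO UPDATE rewrites the row. A standalone sketch of the pattern (demo_slugs is a hypothetical table, and xmax is a storage implementation detail rather than documented API, so treat this as a pragmatic trick):

// Sketch of the (xmax = 0) insert-vs-update probe; demo_slugs is hypothetical.
import { Pool } from 'pg';

async function upsertSlug(pool: Pool, slug: string): Promise<'inserted' | 'updated'> {
  const { rows } = await pool.query(
    `INSERT INTO demo_slugs (slug) VALUES ($1)
     ON CONFLICT (slug) DO UPDATE SET slug = EXCLUDED.slug
     RETURNING (xmax = 0) AS inserted`,
    [slug]
  );
  return rows[0].inserted ? 'inserted' : 'updated';
}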
@@ -1,639 +0,0 @@
/**
 * DutchieLocationDiscovery
 *
 * Discovers store locations for each city from Dutchie and upserts to dutchie_discovery_locations.
 *
 * Responsibilities:
 * - Given a dutchie_discovery_cities row, call Dutchie's location/search endpoint
 * - For each store: extract platform_location_id, platform_slug, platform_menu_url, name, address, coords
 * - Upsert into dutchie_discovery_locations
 * - DO NOT overwrite status if already verified/merged/rejected
 * - DO NOT overwrite dispensary_id if already set
 */

import { Pool } from 'pg';
import axios from 'axios';
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';

puppeteer.use(StealthPlugin());

// ============================================================
// TYPES
// ============================================================

export interface DiscoveryCity {
  id: number;
  platform: string;
  cityName: string;
  citySlug: string;
  stateCode: string | null;
  countryCode: string;
  crawlEnabled: boolean;
}

export interface DutchieLocation {
  platformLocationId: string;
  platformSlug: string;
  platformMenuUrl: string;
  name: string;
  rawAddress: string | null;
  addressLine1: string | null;
  addressLine2: string | null;
  city: string | null;
  stateCode: string | null;
  postalCode: string | null;
  countryCode: string | null;
  latitude: number | null;
  longitude: number | null;
  timezone: string | null;
  offersDelivery: boolean | null;
  offersPickup: boolean | null;
  isRecreational: boolean | null;
  isMedical: boolean | null;
  metadata: Record<string, any>;
}

export interface LocationDiscoveryResult {
  cityId: number;
  citySlug: string;
  locationsFound: number;
  locationsInserted: number;
  locationsUpdated: number;
  locationsSkipped: number;
  errors: string[];
  durationMs: number;
}

// ============================================================
// LOCATION FETCHING
// ============================================================

/**
 * Fetch locations for a city using Puppeteer to scrape the city page
 */
async function fetchLocationsForCity(city: DiscoveryCity): Promise<DutchieLocation[]> {
  console.log(`[DutchieLocationDiscovery] Fetching locations for ${city.cityName}, ${city.stateCode}...`);

  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
  });

  try {
    const page = await browser.newPage();
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    );

    // Navigate to city page - use /us/dispensaries/{city_slug} pattern
    const cityUrl = `https://dutchie.com/us/dispensaries/${city.citySlug}`;
    console.log(`[DutchieLocationDiscovery] Navigating to ${cityUrl}...`);

    await page.goto(cityUrl, {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });

    // Wait for content
    await new Promise((r) => setTimeout(r, 3000));

    // Try to extract __NEXT_DATA__ which often contains store data
    const nextData = await page.evaluate(() => {
      const script = document.querySelector('script#__NEXT_DATA__');
      if (script) {
        try {
          return JSON.parse(script.textContent || '{}');
        } catch {
          return null;
        }
      }
      return null;
    });

    let locations: DutchieLocation[] = [];

    if (nextData?.props?.pageProps?.dispensaries) {
      // Extract from Next.js data
      const dispensaries = nextData.props.pageProps.dispensaries;
      console.log(`[DutchieLocationDiscovery] Found ${dispensaries.length} dispensaries in __NEXT_DATA__`);

      locations = dispensaries.map((d: any) => parseDispensaryData(d, city));
    } else {
      // Fall back to DOM scraping
      console.log('[DutchieLocationDiscovery] No __NEXT_DATA__, trying DOM scraping...');

      const scrapedData = await page.evaluate(() => {
        const stores: Array<{
          name: string;
          href: string;
          address: string | null;
        }> = [];

        // Look for dispensary cards/links
        const cards = document.querySelectorAll('[data-testid="dispensary-card"], .dispensary-card, a[href*="/dispensary/"]');
        cards.forEach((card) => {
          const link = card.querySelector('a[href*="/dispensary/"]') || (card as HTMLAnchorElement);
          const href = (link as HTMLAnchorElement).href || '';
          const name =
            card.querySelector('[data-testid="dispensary-name"]')?.textContent ||
            card.querySelector('h2, h3, .name')?.textContent ||
            link.textContent ||
            '';
          const address = card.querySelector('[data-testid="dispensary-address"], .address')?.textContent || null;

          if (href && name) {
            stores.push({
              name: name.trim(),
              href,
              address: address?.trim() || null,
            });
          }
        });

        return stores;
      });

      console.log(`[DutchieLocationDiscovery] DOM scraping found ${scrapedData.length} stores`);

      locations = scrapedData.map((s) => {
        // Parse slug from URL
        const match = s.href.match(/\/dispensary\/([^/?]+)/);
        const slug = match ? match[1] : s.name.toLowerCase().replace(/\s+/g, '-');

        return {
          platformLocationId: slug, // Will be resolved later
          platformSlug: slug,
          platformMenuUrl: `https://dutchie.com/dispensary/${slug}`,
          name: s.name,
          rawAddress: s.address,
          addressLine1: null,
          addressLine2: null,
          city: city.cityName,
          stateCode: city.stateCode,
          postalCode: null,
          countryCode: city.countryCode,
          latitude: null,
          longitude: null,
          timezone: null,
          offersDelivery: null,
          offersPickup: null,
          isRecreational: null,
          isMedical: null,
          metadata: { source: 'dom_scrape', originalUrl: s.href },
        };
      });
    }

    return locations;
  } finally {
    await browser.close();
  }
}

/**
 * Parse dispensary data from Dutchie's API/JSON response
 */
function parseDispensaryData(d: any, city: DiscoveryCity): DutchieLocation {
  const id = d.id || d._id || d.dispensaryId || '';
  const slug = d.slug || d.cName || d.name?.toLowerCase().replace(/\s+/g, '-') || '';

  // Build menu URL
  let menuUrl = `https://dutchie.com/dispensary/${slug}`;
  if (d.menuUrl) {
    menuUrl = d.menuUrl;
  } else if (d.embeddedMenuUrl) {
    menuUrl = d.embeddedMenuUrl;
  }

  // Parse address
  const address = d.address || d.location?.address || {};
  const rawAddress = [
    address.line1 || address.street1 || d.address1,
    address.line2 || address.street2 || d.address2,
    [
      address.city || d.city,
      address.state || address.stateCode || d.state,
      address.zip || address.zipCode || address.postalCode || d.zip,
    ]
      .filter(Boolean)
      .join(' '),
  ]
    .filter(Boolean)
    .join(', ');

  return {
    platformLocationId: id,
    platformSlug: slug,
    platformMenuUrl: menuUrl,
    name: d.name || d.dispensaryName || '',
    rawAddress: rawAddress || null,
    addressLine1: address.line1 || address.street1 || d.address1 || null,
    addressLine2: address.line2 || address.street2 || d.address2 || null,
    city: address.city || d.city || city.cityName,
    stateCode: address.state || address.stateCode || d.state || city.stateCode,
    postalCode: address.zip || address.zipCode || address.postalCode || d.zip || null,
    countryCode: address.country || address.countryCode || d.country || city.countryCode,
    latitude: d.latitude ?? d.location?.latitude ?? d.location?.lat ?? null,
    longitude: d.longitude ?? d.location?.longitude ?? d.location?.lng ?? null,
    timezone: d.timezone || d.timeZone || null,
    offersDelivery: d.offerDelivery ?? d.offersDelivery ?? d.delivery ?? null,
    offersPickup: d.offerPickup ?? d.offersPickup ?? d.pickup ?? null,
    isRecreational: d.isRecreational ?? d.recreational ?? (d.retailType === 'recreational' || d.retailType === 'both'),
    isMedical: d.isMedical ?? d.medical ?? (d.retailType === 'medical' || d.retailType === 'both'),
    metadata: {
      source: 'next_data',
      retailType: d.retailType,
      brand: d.brand,
      logo: d.logo || d.logoUrl,
      raw: d,
    },
  };
}

/**
 * Alternative: Use GraphQL to discover locations
 */
async function fetchLocationsViaGraphQL(city: DiscoveryCity): Promise<DutchieLocation[]> {
  console.log(`[DutchieLocationDiscovery] Trying GraphQL for ${city.cityName}...`);

  // Try geo-based search
  // This would require knowing the city's coordinates
  // For now, return empty and rely on page scraping
  return [];
}

// ============================================================
// DATABASE OPERATIONS
// ============================================================

/**
 * Upsert a location into dutchie_discovery_locations
 * Does NOT overwrite status if already verified/merged/rejected
 * Does NOT overwrite dispensary_id if already set
 */
async function upsertLocation(
  pool: Pool,
  location: DutchieLocation,
  cityId: number
): Promise<{ inserted: boolean; updated: boolean; skipped: boolean }> {
  // First check if this location exists and has a protected status
  const existing = await pool.query(
    `
    SELECT id, status, dispensary_id
    FROM dutchie_discovery_locations
    WHERE platform = 'dutchie' AND platform_location_id = $1
    `,
    [location.platformLocationId]
  );

  if (existing.rows.length > 0) {
    const row = existing.rows[0];
    const protectedStatuses = ['verified', 'merged', 'rejected'];

    if (protectedStatuses.includes(row.status)) {
      // Only update last_seen_at for protected statuses
      await pool.query(
        `
        UPDATE dutchie_discovery_locations
        SET last_seen_at = NOW(), updated_at = NOW()
        WHERE id = $1
        `,
        [row.id]
      );
      return { inserted: false, updated: false, skipped: true };
    }

    // Update existing discovered location (but preserve dispensary_id if set)
    await pool.query(
      `
      UPDATE dutchie_discovery_locations
      SET
        platform_slug = $2,
        platform_menu_url = $3,
        name = $4,
        raw_address = COALESCE($5, raw_address),
        address_line1 = COALESCE($6, address_line1),
        address_line2 = COALESCE($7, address_line2),
        city = COALESCE($8, city),
        state_code = COALESCE($9, state_code),
        postal_code = COALESCE($10, postal_code),
        country_code = COALESCE($11, country_code),
        latitude = COALESCE($12, latitude),
        longitude = COALESCE($13, longitude),
        timezone = COALESCE($14, timezone),
        offers_delivery = COALESCE($15, offers_delivery),
        offers_pickup = COALESCE($16, offers_pickup),
        is_recreational = COALESCE($17, is_recreational),
        is_medical = COALESCE($18, is_medical),
        metadata = COALESCE($19, metadata),
        discovery_city_id = $20,
        last_seen_at = NOW(),
        updated_at = NOW()
      WHERE id = $1
      `,
      [
        row.id,
        location.platformSlug,
        location.platformMenuUrl,
        location.name,
        location.rawAddress,
        location.addressLine1,
        location.addressLine2,
        location.city,
        location.stateCode,
        location.postalCode,
        location.countryCode,
        location.latitude,
        location.longitude,
        location.timezone,
        location.offersDelivery,
        location.offersPickup,
        location.isRecreational,
        location.isMedical,
        JSON.stringify(location.metadata),
        cityId,
      ]
    );
    return { inserted: false, updated: true, skipped: false };
  }

  // Insert new location
  await pool.query(
    `
    INSERT INTO dutchie_discovery_locations (
      platform,
      platform_location_id,
      platform_slug,
      platform_menu_url,
      name,
      raw_address,
      address_line1,
      address_line2,
      city,
      state_code,
      postal_code,
      country_code,
      latitude,
      longitude,
      timezone,
      status,
      offers_delivery,
      offers_pickup,
      is_recreational,
      is_medical,
      metadata,
      discovery_city_id,
      first_seen_at,
      last_seen_at,
      active,
      created_at,
      updated_at
    ) VALUES (
      'dutchie',
      $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14,
      'discovered',
      $15, $16, $17, $18, $19, $20,
      NOW(), NOW(), TRUE, NOW(), NOW()
    )
    `,
    [
      location.platformLocationId,
      location.platformSlug,
      location.platformMenuUrl,
      location.name,
      location.rawAddress,
      location.addressLine1,
      location.addressLine2,
      location.city,
      location.stateCode,
      location.postalCode,
      location.countryCode,
      location.latitude,
      location.longitude,
      location.timezone,
      location.offersDelivery,
      location.offersPickup,
      location.isRecreational,
      location.isMedical,
      JSON.stringify(location.metadata),
      cityId,
    ]
  );

  return { inserted: true, updated: false, skipped: false };
}

// ============================================================
// MAIN DISCOVERY CLASS
// ============================================================

export class DutchieLocationDiscovery {
  private pool: Pool;

  constructor(pool: Pool) {
    this.pool = pool;
  }

  /**
   * Get a city by slug
   */
  async getCityBySlug(citySlug: string): Promise<DiscoveryCity | null> {
    const { rows } = await this.pool.query(
      `
      SELECT id, platform, city_name, city_slug, state_code, country_code, crawl_enabled
      FROM dutchie_discovery_cities
      WHERE platform = 'dutchie' AND city_slug = $1
      LIMIT 1
      `,
      [citySlug]
    );

    if (rows.length === 0) return null;

    const r = rows[0];
    return {
      id: r.id,
      platform: r.platform,
      cityName: r.city_name,
      citySlug: r.city_slug,
      stateCode: r.state_code,
      countryCode: r.country_code,
      crawlEnabled: r.crawl_enabled,
    };
  }

  /**
   * Get all crawl-enabled cities
   */
  async getEnabledCities(limit?: number): Promise<DiscoveryCity[]> {
    const { rows } = await this.pool.query(
      `
      SELECT id, platform, city_name, city_slug, state_code, country_code, crawl_enabled
      FROM dutchie_discovery_cities
      WHERE platform = 'dutchie' AND crawl_enabled = TRUE
      ORDER BY last_crawled_at ASC NULLS FIRST, city_name ASC
      ${limit ? `LIMIT ${limit}` : ''}
      `
    );

    return rows.map((r) => ({
      id: r.id,
      platform: r.platform,
      cityName: r.city_name,
      citySlug: r.city_slug,
      stateCode: r.state_code,
      countryCode: r.country_code,
      crawlEnabled: r.crawl_enabled,
    }));
  }

  /**
   * Discover locations for a single city
   */
  async discoverForCity(city: DiscoveryCity): Promise<LocationDiscoveryResult> {
    const startTime = Date.now();
    const errors: string[] = [];
    let locationsFound = 0;
    let locationsInserted = 0;
    let locationsUpdated = 0;
    let locationsSkipped = 0;

    console.log(`[DutchieLocationDiscovery] Discovering locations for ${city.cityName}, ${city.stateCode}...`);

    try {
      // Fetch locations
      let locations = await fetchLocationsForCity(city);

      // If scraping fails, try GraphQL
      if (locations.length === 0) {
        locations = await fetchLocationsViaGraphQL(city);
      }

      locationsFound = locations.length;
      console.log(`[DutchieLocationDiscovery] Found ${locationsFound} locations`);

      // Upsert each location
      for (const location of locations) {
        try {
          const result = await upsertLocation(this.pool, location, city.id);
          if (result.inserted) locationsInserted++;
          else if (result.updated) locationsUpdated++;
          else if (result.skipped) locationsSkipped++;
        } catch (error: any) {
          const msg = `Failed to upsert location ${location.platformSlug}: ${error.message}`;
          console.error(`[DutchieLocationDiscovery] ${msg}`);
          errors.push(msg);
        }
      }

      // Update city's last_crawled_at and location_count
      await this.pool.query(
        `
        UPDATE dutchie_discovery_cities
        SET last_crawled_at = NOW(),
            location_count = $1,
            updated_at = NOW()
        WHERE id = $2
        `,
        [locationsFound, city.id]
      );
    } catch (error: any) {
      const msg = `Location discovery failed for ${city.citySlug}: ${error.message}`;
      console.error(`[DutchieLocationDiscovery] ${msg}`);
      errors.push(msg);
    }

    const durationMs = Date.now() - startTime;

    console.log(`[DutchieLocationDiscovery] City ${city.citySlug} complete:`);
    console.log(`  Locations found: ${locationsFound}`);
    console.log(`  Inserted: ${locationsInserted}`);
    console.log(`  Updated: ${locationsUpdated}`);
    console.log(`  Skipped (protected): ${locationsSkipped}`);
    console.log(`  Errors: ${errors.length}`);
    console.log(`  Duration: ${(durationMs / 1000).toFixed(1)}s`);

    return {
      cityId: city.id,
      citySlug: city.citySlug,
      locationsFound,
      locationsInserted,
      locationsUpdated,
      locationsSkipped,
      errors,
      durationMs,
    };
  }

  /**
   * Discover locations for all enabled cities
   */
  async discoverAllEnabled(options: {
    limit?: number;
    delayMs?: number;
  } = {}): Promise<{
    totalCities: number;
    totalLocationsFound: number;
    totalInserted: number;
    totalUpdated: number;
    totalSkipped: number;
    errors: string[];
    durationMs: number;
  }> {
    const { limit, delayMs = 2000 } = options;
    const startTime = Date.now();
    let totalLocationsFound = 0;
    let totalInserted = 0;
    let totalUpdated = 0;
    let totalSkipped = 0;
    const allErrors: string[] = [];

    const cities = await this.getEnabledCities(limit);
    console.log(`[DutchieLocationDiscovery] Discovering locations for ${cities.length} cities...`);

    for (let i = 0; i < cities.length; i++) {
      const city = cities[i];
      console.log(`\n[DutchieLocationDiscovery] City ${i + 1}/${cities.length}: ${city.cityName}, ${city.stateCode}`);

      try {
        const result = await this.discoverForCity(city);
        totalLocationsFound += result.locationsFound;
        totalInserted += result.locationsInserted;
        totalUpdated += result.locationsUpdated;
        totalSkipped += result.locationsSkipped;
        allErrors.push(...result.errors);
      } catch (error: any) {
        allErrors.push(`City ${city.citySlug} failed: ${error.message}`);
      }

      // Delay between cities
      if (i < cities.length - 1 && delayMs > 0) {
        await new Promise((r) => setTimeout(r, delayMs));
      }
    }

    const durationMs = Date.now() - startTime;

    console.log('\n[DutchieLocationDiscovery] All cities complete:');
    console.log(`  Total cities: ${cities.length}`);
    console.log(`  Total locations found: ${totalLocationsFound}`);
    console.log(`  Total inserted: ${totalInserted}`);
    console.log(`  Total updated: ${totalUpdated}`);
    console.log(`  Total skipped: ${totalSkipped}`);
    console.log(`  Total errors: ${allErrors.length}`);
    console.log(`  Duration: ${(durationMs / 1000).toFixed(1)}s`);

    return {
      totalCities: cities.length,
      totalLocationsFound,
      totalInserted,
      totalUpdated,
      totalSkipped,
      errors: allErrors,
      durationMs,
    };
  }
}

export default DutchieLocationDiscovery;
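upsertLocation leans on COALESCE($n, column) so that a field the crawler failed to capture (NULL) never clobbers a value stored by an earlier crawl. A stripped-down sketch of that pattern (table and column names here are illustrative, not from this repo):

// "Fill, don't clobber": a NULL argument leaves the stored value untouched.
import { Pool } from 'pg';

async function patchCoords(pool: Pool, id: number, lat: number | null, lng: number | null): Promise<void> {
  await pool.query(
    `UPDATE demo_locations
        SET latitude   = COALESCE($2, latitude),
            longitude  = COALESCE($3, longitude),
            updated_at = NOW()
      WHERE id = $1`,
    [id, lat, lng]
  );
}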
@@ -1,73 +0,0 @@
#!/usr/bin/env npx tsx
/**
 * Discovery Entrypoint: Dutchie Cities (Auto)
 *
 * Attempts browser/API-based /cities discovery.
 * Even if currently blocked (403), this runner preserves the auto-discovery path.
 *
 * Usage:
 *   npm run discovery:dt:cities:auto
 *   DATABASE_URL="..." npx tsx src/dutchie-az/discovery/discovery-dt-cities-auto.ts
 */

import { Pool } from 'pg';
import { DtCityDiscoveryService } from './DtCityDiscoveryService';

const DB_URL = process.env.DATABASE_URL || process.env.CANNAIQ_DB_URL ||
  'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';

async function main() {
  console.log('╔══════════════════════════════════════════════════╗');
  console.log('║ Dutchie City Discovery (AUTO) ║');
  console.log('║ Browser + API fallback ║');
  console.log('╚══════════════════════════════════════════════════╝');
  console.log(`\nDatabase: ${DB_URL.replace(/:[^:@]+@/, ':****@')}`);

  const pool = new Pool({ connectionString: DB_URL });

  try {
    const { rows } = await pool.query('SELECT NOW() as time');
    console.log(`Connected at: ${rows[0].time}\n`);

    const service = new DtCityDiscoveryService(pool);
    const result = await service.runAutoDiscovery();

    console.log('\n' + '═'.repeat(50));
    console.log('SUMMARY');
    console.log('═'.repeat(50));
    console.log(`Cities found: ${result.citiesFound}`);
    console.log(`Cities inserted: ${result.citiesInserted}`);
    console.log(`Cities updated: ${result.citiesUpdated}`);
    console.log(`Errors: ${result.errors.length}`);
    console.log(`Duration: ${(result.durationMs / 1000).toFixed(1)}s`);

    if (result.errors.length > 0) {
      console.log('\nErrors:');
      result.errors.forEach((e, i) => console.log(`  ${i + 1}. ${e}`));
    }

    const stats = await service.getStats();
    console.log('\nCurrent Database Stats:');
    console.log(`  Total cities: ${stats.total}`);
    console.log(`  Crawl enabled: ${stats.crawlEnabled}`);
    console.log(`  Never crawled: ${stats.neverCrawled}`);

    if (result.citiesFound === 0) {
      console.log('\n⚠️ No cities found via auto-discovery.');
      console.log('   This may be due to Dutchie blocking scraping/API access.');
      console.log('   Use manual seeding instead:');
      console.log('   npm run discovery:dt:cities:manual -- --city-slug=ny-hudson --city-name=Hudson --state-code=NY');
      process.exit(1);
    }

    console.log('\n✅ Auto city discovery completed');
    process.exit(0);
  } catch (error: any) {
    console.error('\n❌ Auto city discovery failed:', error.message);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();
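Because this runner exits non-zero both on failure and when zero cities are found, a scheduler can key off the exit code to alert or fall back to manual seeding. A hypothetical wrapper (not part of this repo):

// Hypothetical scheduler wrapper keyed off the runner's exit-code contract.
import { spawnSync } from 'child_process';

const run = spawnSync(
  'npx',
  ['tsx', 'src/dutchie-az/discovery/discovery-dt-cities-auto.ts'],
  { stdio: 'inherit', env: process.env }
);

if (run.status !== 0) {
  console.error('Auto discovery blocked or empty; fall back to discovery:dt:cities:manual.');
}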
@@ -1,137 +0,0 @@
#!/usr/bin/env npx tsx
/**
 * Discovery Entrypoint: Dutchie Cities (Manual Seed)
 *
 * Manually seeds cities into dutchie_discovery_cities via CLI args.
 * Use this when auto-discovery is blocked (403).
 *
 * Usage:
 *   npm run discovery:dt:cities:manual -- --city-slug=ny-hudson --city-name=Hudson --state-code=NY
 *   npm run discovery:dt:cities:manual -- --city-slug=ma-boston --city-name=Boston --state-code=MA --country-code=US
 *
 * Options:
 *   --city-slug     Required. URL slug (e.g., "ny-hudson")
 *   --city-name     Required. Display name (e.g., "Hudson")
 *   --state-code    Required. State/province code (e.g., "NY", "CA", "ON")
 *   --country-code  Optional. Country code (default: "US")
 *
 * After seeding, run location discovery:
 *   npm run discovery:dt:locations
 */

import { Pool } from 'pg';
import { DtCityDiscoveryService, DutchieCity } from './DtCityDiscoveryService';

const DB_URL = process.env.DATABASE_URL || process.env.CANNAIQ_DB_URL ||
  'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';

interface Args {
  citySlug?: string;
  cityName?: string;
  stateCode?: string;
  countryCode: string;
}

function parseArgs(): Args {
  const args: Args = { countryCode: 'US' };

  for (const arg of process.argv.slice(2)) {
    const citySlugMatch = arg.match(/--city-slug=(.+)/);
    if (citySlugMatch) args.citySlug = citySlugMatch[1];

    const cityNameMatch = arg.match(/--city-name=(.+)/);
    if (cityNameMatch) args.cityName = cityNameMatch[1];

    const stateCodeMatch = arg.match(/--state-code=(.+)/);
    if (stateCodeMatch) args.stateCode = stateCodeMatch[1].toUpperCase();

    const countryCodeMatch = arg.match(/--country-code=(.+)/);
    if (countryCodeMatch) args.countryCode = countryCodeMatch[1].toUpperCase();
  }

  return args;
}

function printUsage() {
  console.log(`
Usage:
  npm run discovery:dt:cities:manual -- --city-slug=<slug> --city-name=<name> --state-code=<state>

Required arguments:
  --city-slug     URL slug for the city (e.g., "ny-hudson", "ma-boston")
  --city-name     Display name (e.g., "Hudson", "Boston")
  --state-code    State/province code (e.g., "NY", "CA", "ON")

Optional arguments:
  --country-code  Country code (default: "US")

Examples:
  npm run discovery:dt:cities:manual -- --city-slug=ny-hudson --city-name=Hudson --state-code=NY
  npm run discovery:dt:cities:manual -- --city-slug=ca-los-angeles --city-name="Los Angeles" --state-code=CA
  npm run discovery:dt:cities:manual -- --city-slug=on-toronto --city-name=Toronto --state-code=ON --country-code=CA

After seeding, run location discovery:
  npm run discovery:dt:locations
`);
}

async function main() {
  const args = parseArgs();

  console.log('╔══════════════════════════════════════════════════╗');
  console.log('║ Dutchie City Discovery (MANUAL SEED) ║');
  console.log('╚══════════════════════════════════════════════════╝');

  if (!args.citySlug || !args.cityName || !args.stateCode) {
    console.error('\n❌ Error: Missing required arguments\n');
    printUsage();
    process.exit(1);
  }

  console.log(`\nCity Slug: ${args.citySlug}`);
  console.log(`City Name: ${args.cityName}`);
  console.log(`State Code: ${args.stateCode}`);
  console.log(`Country Code: ${args.countryCode}`);
  console.log(`Database: ${DB_URL.replace(/:[^:@]+@/, ':****@')}`);

  const pool = new Pool({ connectionString: DB_URL });

  try {
    const { rows } = await pool.query('SELECT NOW() as time');
    console.log(`\nConnected at: ${rows[0].time}`);

    const service = new DtCityDiscoveryService(pool);

    const city: DutchieCity = {
      slug: args.citySlug,
      name: args.cityName,
      stateCode: args.stateCode,
      countryCode: args.countryCode,
    };

    const result = await service.seedCity(city);

    const action = result.wasInserted ? 'INSERTED' : 'UPDATED';
    console.log(`\n✅ City ${action}:`);
    console.log(`  ID: ${result.id}`);
    console.log(`  City Slug: ${result.city.slug}`);
    console.log(`  City Name: ${result.city.name}`);
    console.log(`  State Code: ${result.city.stateCode}`);
    console.log(`  Country Code: ${result.city.countryCode}`);

    const stats = await service.getStats();
    console.log(`\nTotal Dutchie cities: ${stats.total} (${stats.crawlEnabled} enabled)`);

    console.log('\n📍 Next step: Run location discovery');
    console.log('  npm run discovery:dt:locations');

    process.exit(0);
  } catch (error: any) {
    console.error('\n❌ Failed to seed city:', error.message);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();
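The manual runner seeds one city per invocation; for bulk seeding, the service's seedCities method could be driven from a file, roughly like the sketch below (cities.json and its record shape are assumptions, not part of this repo):

// Hypothetical bulk seeder built on seedCities(); cities.json is an assumed input
// of { name, slug, stateCode, countryCode } records.
import { readFileSync } from 'fs';
import { Pool } from 'pg';
import { DtCityDiscoveryService, DutchieCity } from './DtCityDiscoveryService';

async function main(): Promise<void> {
  const cities: DutchieCity[] = JSON.parse(readFileSync('cities.json', 'utf8'));
  const pool = new Pool({ connectionString: process.env.DATABASE_URL });
  try {
    const service = new DtCityDiscoveryService(pool);
    const { results, errors } = await service.seedCities(cities);
    console.log(`Seeded ${results.length} cities, ${errors.length} errors`);
  } finally {
    await pool.end();
  }
}

main();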
@@ -1,73 +0,0 @@
#!/usr/bin/env npx tsx
/**
 * Discovery Runner: Dutchie Cities
 *
 * Discovers cities from Dutchie's /cities page and upserts to dutchie_discovery_cities.
 *
 * Usage:
 *   npm run discovery:platforms:dt:cities
 *   DATABASE_URL="..." npx tsx src/dutchie-az/discovery/discovery-dt-cities.ts
 */

import { Pool } from 'pg';
import { DutchieCityDiscovery } from './DutchieCityDiscovery';

const DB_URL = process.env.DATABASE_URL || process.env.CANNAIQ_DB_URL ||
  'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';

async function main() {
  console.log('╔══════════════════════════════════════════════════╗');
  console.log('║ Dutchie City Discovery Runner ║');
  console.log('╚══════════════════════════════════════════════════╝');
  console.log(`\nDatabase: ${DB_URL.replace(/:[^:@]+@/, ':****@')}`);

  const pool = new Pool({ connectionString: DB_URL });

  try {
    // Test DB connection
    const { rows } = await pool.query('SELECT NOW() as time');
    console.log(`Connected at: ${rows[0].time}\n`);

    // Run city discovery
    const discovery = new DutchieCityDiscovery(pool);
    const result = await discovery.run();

    // Print summary
    console.log('\n' + '═'.repeat(50));
    console.log('SUMMARY');
    console.log('═'.repeat(50));
    console.log(`Cities found: ${result.citiesFound}`);
    console.log(`Cities inserted: ${result.citiesInserted}`);
    console.log(`Cities updated: ${result.citiesUpdated}`);
    console.log(`Errors: ${result.errors.length}`);
    console.log(`Duration: ${(result.durationMs / 1000).toFixed(1)}s`);

    if (result.errors.length > 0) {
      console.log('\nErrors:');
      result.errors.forEach((e, i) => console.log(`  ${i + 1}. ${e}`));
    }

    // Get final stats
    const stats = await discovery.getStats();
    console.log('\nCurrent Database Stats:');
    console.log(`  Total cities: ${stats.total}`);
    console.log(`  Crawl enabled: ${stats.crawlEnabled}`);
    console.log(`  Never crawled: ${stats.neverCrawled}`);
    console.log(`  By country: ${stats.byCountry.map(c => `${c.countryCode}=${c.count}`).join(', ')}`);

    if (result.errors.length > 0) {
      console.log('\n⚠️ Completed with errors');
      process.exit(1);
    }

    console.log('\n✅ City discovery completed successfully');
    process.exit(0);
  } catch (error: any) {
    console.error('\n❌ City discovery failed:', error.message);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();
@@ -1,113 +0,0 @@
#!/usr/bin/env npx tsx
/**
 * Discovery Entrypoint: Dutchie Locations (From Cities)
 *
 * Reads from dutchie_discovery_cities (crawl_enabled = true)
 * and discovers store locations for each city.
 *
 * Geo coordinates are captured when available from Dutchie's payloads.
 *
 * Usage:
 *   npm run discovery:dt:locations
 *   npm run discovery:dt:locations -- --limit=10
 *   npm run discovery:dt:locations -- --delay=3000
 *   DATABASE_URL="..." npx tsx src/dutchie-az/discovery/discovery-dt-locations-from-cities.ts
 *
 * Options:
 *   --limit=N   Only process N cities (default: all)
 *   --delay=N   Delay between cities in ms (default: 2000)
 */

import { Pool } from 'pg';
import { DtLocationDiscoveryService } from './DtLocationDiscoveryService';

const DB_URL = process.env.DATABASE_URL || process.env.CANNAIQ_DB_URL ||
  'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';

function parseArgs(): { limit?: number; delay?: number } {
  const args: { limit?: number; delay?: number } = {};

  for (const arg of process.argv.slice(2)) {
    const limitMatch = arg.match(/--limit=(\d+)/);
    if (limitMatch) args.limit = parseInt(limitMatch[1], 10);

    const delayMatch = arg.match(/--delay=(\d+)/);
    if (delayMatch) args.delay = parseInt(delayMatch[1], 10);
  }

  return args;
}

async function main() {
  const args = parseArgs();

  console.log('╔══════════════════════════════════════════════════╗');
  console.log('║ Dutchie Location Discovery (From Cities) ║');
  console.log('║ Reads crawl_enabled cities, discovers stores ║');
  console.log('╚══════════════════════════════════════════════════╝');
  console.log(`\nDatabase: ${DB_URL.replace(/:[^:@]+@/, ':****@')}`);
  if (args.limit) console.log(`City limit: ${args.limit}`);
  if (args.delay) console.log(`Delay: ${args.delay}ms`);

  const pool = new Pool({ connectionString: DB_URL });

  try {
    const { rows } = await pool.query('SELECT NOW() as time');
    console.log(`Connected at: ${rows[0].time}\n`);

    const service = new DtLocationDiscoveryService(pool);
    const result = await service.discoverAllEnabled({
      limit: args.limit,
      delayMs: args.delay ?? 2000,
    });

    console.log('\n' + '═'.repeat(50));
    console.log('SUMMARY');
    console.log('═'.repeat(50));
    console.log(`Cities processed: ${result.totalCities}`);
    console.log(`Locations found: ${result.totalLocationsFound}`);
    console.log(`Locations inserted: ${result.totalInserted}`);
    console.log(`Locations updated: ${result.totalUpdated}`);
    console.log(`Locations skipped: ${result.totalSkipped} (protected status)`);
    console.log(`Errors: ${result.errors.length}`);
    console.log(`Duration: ${(result.durationMs / 1000).toFixed(1)}s`);

    if (result.errors.length > 0) {
      console.log('\nErrors (first 10):');
      result.errors.slice(0, 10).forEach((e, i) => console.log(`  ${i + 1}. ${e}`));
      if (result.errors.length > 10) {
        console.log(`  ... and ${result.errors.length - 10} more`);
      }
    }

    // Get location stats including coordinates
    const stats = await service.getStats();
    console.log('\nCurrent Database Stats:');
    console.log(`  Total locations: ${stats.total}`);
    console.log(`  With coordinates: ${stats.withCoordinates}`);
    console.log(`  By status:`);
    stats.byStatus.forEach(s => console.log(`    ${s.status}: ${s.count}`));

    if (result.totalCities === 0) {
      console.log('\n⚠️ No crawl-enabled cities found.');
      console.log('  Seed cities first:');
      console.log('  npm run discovery:dt:cities:manual -- --city-slug=ny-hudson --city-name=Hudson --state-code=NY');
      process.exit(1);
    }

    if (result.errors.length > 0) {
      console.log('\n⚠️ Completed with errors');
      process.exit(1);
    }

    console.log('\n✅ Location discovery completed successfully');
    process.exit(0);
  } catch (error: any) {
    console.error('\n❌ Location discovery failed:', error.message);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();
@@ -1,117 +0,0 @@
#!/usr/bin/env npx tsx
/**
 * Discovery Runner: Dutchie Locations
 *
 * Discovers store locations for all crawl-enabled cities and upserts to dutchie_discovery_locations.
 *
 * Usage:
 *   npm run discovery:platforms:dt:locations
 *   npm run discovery:platforms:dt:locations -- --limit=10
 *   DATABASE_URL="..." npx tsx src/dutchie-az/discovery/discovery-dt-locations.ts
 *
 * Options (via args):
 *   --limit=N   Only process N cities (default: all)
 *   --delay=N   Delay between cities in ms (default: 2000)
 */

import { Pool } from 'pg';
import { DutchieLocationDiscovery } from './DutchieLocationDiscovery';

const DB_URL = process.env.DATABASE_URL || process.env.CANNAIQ_DB_URL ||
  'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';

// Parse CLI args
function parseArgs(): { limit?: number; delay?: number } {
  const args: { limit?: number; delay?: number } = {};

  for (const arg of process.argv.slice(2)) {
    const limitMatch = arg.match(/--limit=(\d+)/);
    if (limitMatch) args.limit = parseInt(limitMatch[1], 10);

    const delayMatch = arg.match(/--delay=(\d+)/);
    if (delayMatch) args.delay = parseInt(delayMatch[1], 10);
  }

  return args;
}

async function main() {
  const args = parseArgs();

  console.log('╔══════════════════════════════════════════════════╗');
  console.log('║ Dutchie Location Discovery Runner ║');
  console.log('╚══════════════════════════════════════════════════╝');
  console.log(`\nDatabase: ${DB_URL.replace(/:[^:@]+@/, ':****@')}`);
  if (args.limit) console.log(`City limit: ${args.limit}`);
  if (args.delay) console.log(`Delay: ${args.delay}ms`);

  const pool = new Pool({ connectionString: DB_URL });

  try {
    // Test DB connection
    const { rows } = await pool.query('SELECT NOW() as time');
    console.log(`Connected at: ${rows[0].time}\n`);

    // Run location discovery
    const discovery = new DutchieLocationDiscovery(pool);
    const result = await discovery.discoverAllEnabled({
      limit: args.limit,
      delayMs: args.delay ?? 2000,
    });

    // Print summary
    console.log('\n' + '═'.repeat(50));
    console.log('SUMMARY');
    console.log('═'.repeat(50));
    console.log(`Cities processed: ${result.totalCities}`);
    console.log(`Locations found: ${result.totalLocationsFound}`);
    console.log(`Locations inserted: ${result.totalInserted}`);
    console.log(`Locations updated: ${result.totalUpdated}`);
    console.log(`Locations skipped: ${result.totalSkipped} (protected status)`);
    console.log(`Errors: ${result.errors.length}`);
    console.log(`Duration: ${(result.durationMs / 1000).toFixed(1)}s`);

    if (result.errors.length > 0) {
      console.log('\nErrors (first 10):');
      result.errors.slice(0, 10).forEach((e, i) => console.log(`  ${i + 1}. ${e}`));
      if (result.errors.length > 10) {
        console.log(`  ... and ${result.errors.length - 10} more`);
      }
    }

    // Get DB counts
    const { rows: countRows } = await pool.query(`
      SELECT
        COUNT(*) as total,
        COUNT(*) FILTER (WHERE status = 'discovered') as discovered,
        COUNT(*) FILTER (WHERE status = 'verified') as verified,
        COUNT(*) FILTER (WHERE status = 'merged') as merged,
        COUNT(*) FILTER (WHERE status = 'rejected') as rejected
      FROM dutchie_discovery_locations
      WHERE platform = 'dutchie' AND active = TRUE
    `);

    const counts = countRows[0];
    console.log('\nCurrent Database Stats:');
    console.log(`  Total locations: ${counts.total}`);
    console.log(`  Status discovered: ${counts.discovered}`);
    console.log(`  Status verified: ${counts.verified}`);
    console.log(`  Status merged: ${counts.merged}`);
    console.log(`  Status rejected: ${counts.rejected}`);

    if (result.errors.length > 0) {
      console.log('\n⚠️ Completed with errors');
      process.exit(1);
    }

    console.log('\n✅ Location discovery completed successfully');
    process.exit(0);
  } catch (error: any) {
    console.error('\n❌ Location discovery failed:', error.message);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();
@@ -1,10 +0,0 @@
/**
 * Dutchie Discovery Module
 *
 * Store discovery pipeline for Dutchie platform.
 */

export { DutchieCityDiscovery } from './DutchieCityDiscovery';
export { DutchieLocationDiscovery } from './DutchieLocationDiscovery';
export { createDutchieDiscoveryRoutes } from './routes';
export { promoteDiscoveryLocation } from './promoteDiscoveryLocation';
@@ -1,248 +0,0 @@
/**
 * Promote Discovery Location to Crawlable Dispensary
 *
 * When a discovery location is verified or merged:
 * 1. Ensure a crawl profile exists for the dispensary
 * 2. Seed/update crawl schedule
 * 3. Create initial crawl job
 */

import { Pool } from 'pg';

export interface PromotionResult {
  success: boolean;
  discoveryId: number;
  dispensaryId: number;
  crawlProfileId?: number;
  scheduleUpdated?: boolean;
  crawlJobCreated?: boolean;
  error?: string;
}

/**
 * Promote a verified/merged discovery location to a crawlable dispensary.
 *
 * This function:
 * 1. Verifies the discovery location is verified/merged and has a dispensary_id
 * 2. Ensures the dispensary has platform info (menu_type, platform_dispensary_id)
 * 3. Creates/updates a crawler profile if the profile table exists
 * 4. Queues an initial crawl job
 */
export async function promoteDiscoveryLocation(
  pool: Pool,
  discoveryLocationId: number
): Promise<PromotionResult> {
  console.log(`[Promote] Starting promotion for discovery location ${discoveryLocationId}...`);

  // Get the discovery location
  const { rows: locRows } = await pool.query(
    `
    SELECT
      dl.*,
      d.id as disp_id,
      d.name as disp_name,
      d.menu_type as disp_menu_type,
      d.platform_dispensary_id as disp_platform_id
    FROM dutchie_discovery_locations dl
    JOIN dispensaries d ON dl.dispensary_id = d.id
    WHERE dl.id = $1
    `,
    [discoveryLocationId]
  );

  if (locRows.length === 0) {
    return {
      success: false,
      discoveryId: discoveryLocationId,
      dispensaryId: 0,
      error: 'Discovery location not found or not linked to a dispensary',
    };
  }

  const location = locRows[0];

  // Verify status
  if (!['verified', 'merged'].includes(location.status)) {
    return {
      success: false,
      discoveryId: discoveryLocationId,
      dispensaryId: location.dispensary_id || 0,
      error: `Cannot promote: location status is '${location.status}', must be 'verified' or 'merged'`,
    };
  }

  const dispensaryId = location.dispensary_id;
  console.log(`[Promote] Location ${discoveryLocationId} -> Dispensary ${dispensaryId} (${location.disp_name})`);

  // Ensure dispensary has platform info
  if (!location.disp_platform_id) {
    console.log(`[Promote] Updating dispensary with platform info...`);
    await pool.query(
      `
      UPDATE dispensaries
      SET platform_dispensary_id = COALESCE(platform_dispensary_id, $1),
          menu_url = COALESCE(menu_url, $2),
          menu_type = COALESCE(menu_type, 'dutchie'),
          updated_at = NOW()
      WHERE id = $3
      `,
      [location.platform_location_id, location.platform_menu_url, dispensaryId]
    );
  }

  let crawlProfileId: number | undefined;
  let scheduleUpdated = false;
  let crawlJobCreated = false;

  // Check if dispensary_crawler_profiles table exists
  const { rows: tableCheck } = await pool.query(`
    SELECT EXISTS (
      SELECT FROM information_schema.tables
      WHERE table_name = 'dispensary_crawler_profiles'
    ) as exists
  `);

  if (tableCheck[0]?.exists) {
    // Create or get crawler profile
    console.log(`[Promote] Checking crawler profile...`);

    const { rows: profileRows } = await pool.query(
      `
      SELECT id FROM dispensary_crawler_profiles
      WHERE dispensary_id = $1 AND platform = 'dutchie'
      `,
      [dispensaryId]
    );

    if (profileRows.length > 0) {
      crawlProfileId = profileRows[0].id;
      console.log(`[Promote] Using existing profile ${crawlProfileId}`);
    } else {
      // Create new profile
      const profileKey = `dutchie-${location.platform_slug}`;
      const { rows: newProfile } = await pool.query(
        `
        INSERT INTO dispensary_crawler_profiles (
          dispensary_id,
          profile_key,
          profile_name,
          platform,
          config,
          status,
          enabled,
          created_at,
          updated_at
        ) VALUES (
          $1, $2, $3, 'dutchie', $4, 'sandbox', TRUE, NOW(), NOW()
        )
        ON CONFLICT (dispensary_id, platform) DO UPDATE SET
          enabled = TRUE,
          updated_at = NOW()
        RETURNING id
        `,
        [
          dispensaryId,
          profileKey,
          `${location.name} (Dutchie)`,
          JSON.stringify({
            platformDispensaryId: location.platform_location_id,
            platformSlug: location.platform_slug,
            menuUrl: location.platform_menu_url,
            pricingType: 'rec',
            useBothModes: true,
          }),
        ]
      );

      crawlProfileId = newProfile[0]?.id;
      console.log(`[Promote] Created new profile ${crawlProfileId}`);
    }

    // Link profile to dispensary if not already linked
    await pool.query(
      `
      UPDATE dispensaries
      SET active_crawler_profile_id = COALESCE(active_crawler_profile_id, $1),
          updated_at = NOW()
      WHERE id = $2
      `,
      [crawlProfileId, dispensaryId]
    );
  }

  // Check if crawl_jobs table exists and create initial job
  const { rows: jobsTableCheck } = await pool.query(`
    SELECT EXISTS (
      SELECT FROM information_schema.tables
      WHERE table_name = 'crawl_jobs'
    ) as exists
  `);

  if (jobsTableCheck[0]?.exists) {
    // Check if there's already a pending job
    const { rows: existingJobs } = await pool.query(
      `
      SELECT id FROM crawl_jobs
      WHERE dispensary_id = $1 AND status IN ('pending', 'running')
      LIMIT 1
      `,
      [dispensaryId]
    );

    if (existingJobs.length === 0) {
      // Create initial crawl job
      console.log(`[Promote] Creating initial crawl job...`);
      await pool.query(
        `
        INSERT INTO crawl_jobs (
          dispensary_id,
          job_type,
          status,
          priority,
          config,
          created_at,
          updated_at
        ) VALUES (
          $1, 'dutchie_product_crawl', 'pending', 1, $2, NOW(), NOW()
        )
        `,
        [
          dispensaryId,
          JSON.stringify({
            source: 'discovery_promotion',
            discoveryLocationId,
            pricingType: 'rec',
            useBothModes: true,
          }),
        ]
      );
      crawlJobCreated = true;
    } else {
      console.log(`[Promote] Crawl job already exists for dispensary`);
    }
  }

  // Update discovery location notes
  await pool.query(
    `
    UPDATE dutchie_discovery_locations
    SET notes = COALESCE(notes || E'\n', '') || $1,
        updated_at = NOW()
    WHERE id = $2
    `,
    [`Promoted to crawlable at ${new Date().toISOString()}`, discoveryLocationId]
  );

  console.log(`[Promote] Promotion complete for discovery location ${discoveryLocationId}`);

  return {
    success: true,
    discoveryId: discoveryLocationId,
    dispensaryId,
    crawlProfileId,
    scheduleUpdated,
    crawlJobCreated,
  };
}

export default promoteDiscoveryLocation;
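A minimal sketch of driving the promotion above from calling code, assuming a pg Pool and a discovery location that is already 'verified' or 'merged' (the import path is illustrative):

import { Pool } from 'pg';
import { promoteDiscoveryLocation } from './promoteDiscoveryLocation';

async function promote(pool: Pool, discoveryLocationId: number): Promise<void> {
  const result = await promoteDiscoveryLocation(pool, discoveryLocationId);
  if (!result.success) {
    // e.g. wrong status, or the location is not linked to a dispensary
    throw new Error(result.error);
  }
  console.log(
    `Dispensary ${result.dispensaryId}: profile=${result.crawlProfileId}, jobCreated=${result.crawlJobCreated}`
  );
}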
@@ -1,973 +0,0 @@
/**
 * Platform Discovery API Routes (DT = Dutchie)
 *
 * Routes for the platform-specific store discovery pipeline.
 * Mount at /api/discovery/platforms/dt
 *
 * Platform Slug Mapping (for trademark-safe URLs):
 *   dt = Dutchie
 *   jn = Jane (future)
 *   wm = Weedmaps (future)
 *   lf = Leafly (future)
 *   tz = Treez (future)
 *
 * Note: The actual platform value stored in the DB remains 'dutchie'.
 * Only the URL paths use neutral slugs.
 */

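// Illustrative only (not part of the original file): per the header above, this
// router is meant to be mounted at /api/discovery/platforms/dt. A minimal
// Express wiring sketch, assuming a shared pg Pool (import paths hypothetical):
//
//   import express from 'express';
//   import { Pool } from 'pg';
//   import { createDutchieDiscoveryRoutes } from './dutchie-az/discovery/routes';
//
//   const app = express();
//   app.use(express.json());
//   const pool = new Pool({ connectionString: process.env.DATABASE_URL });
//   // The URL uses the neutral 'dt' slug; the DB still stores platform = 'dutchie'
//   app.use('/api/discovery/platforms/dt', createDutchieDiscoveryRoutes(pool));
//   app.listen(3000);
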
import { Router, Request, Response } from 'express';
import { Pool } from 'pg';
import { DutchieCityDiscovery } from './DutchieCityDiscovery';
import { DutchieLocationDiscovery } from './DutchieLocationDiscovery';
import { DiscoveryGeoService } from '../../services/DiscoveryGeoService';
import { GeoValidationService } from '../../services/GeoValidationService';

export function createDutchieDiscoveryRoutes(pool: Pool): Router {
  const router = Router();

  // ============================================================
  // LOCATIONS
  // ============================================================

  /**
   * GET /api/discovery/platforms/dt/locations
   *
   * List discovered locations with filtering.
   *
   * Query params:
   *   - status: 'discovered' | 'verified' | 'rejected' | 'merged'
   *   - state_code: e.g., 'AZ', 'CA'
   *   - country_code: 'US' | 'CA'
   *   - unlinked_only: 'true' to show only locations without dispensary_id
   *   - search: search by name
   *   - limit: number (default 50)
   *   - offset: number (default 0)
   */
  router.get('/locations', async (req: Request, res: Response) => {
    try {
      const {
        status,
        state_code,
        country_code,
        unlinked_only,
        search,
        limit = '50',
        offset = '0',
      } = req.query;

      let whereClause = "WHERE platform = 'dutchie' AND active = TRUE";
      const params: any[] = [];
      let paramIndex = 1;

      if (status) {
        whereClause += ` AND status = $${paramIndex}`;
        params.push(status);
        paramIndex++;
      }

      if (state_code) {
        whereClause += ` AND state_code = $${paramIndex}`;
        params.push(state_code);
        paramIndex++;
      }

      if (country_code) {
        whereClause += ` AND country_code = $${paramIndex}`;
        params.push(country_code);
        paramIndex++;
      }

      if (unlinked_only === 'true') {
        whereClause += ' AND dispensary_id IS NULL';
      }

      if (search) {
        whereClause += ` AND (name ILIKE $${paramIndex} OR platform_slug ILIKE $${paramIndex})`;
        params.push(`%${search}%`);
        paramIndex++;
      }

      const limitVal = parseInt(limit as string, 10);
      const offsetVal = parseInt(offset as string, 10);
      params.push(limitVal, offsetVal);

      const { rows } = await pool.query(
        `
        SELECT
          dl.id,
          dl.platform,
          dl.platform_location_id,
          dl.platform_slug,
          dl.platform_menu_url,
          dl.name,
          dl.raw_address,
          dl.address_line1,
          dl.city,
          dl.state_code,
          dl.postal_code,
          dl.country_code,
          dl.latitude,
          dl.longitude,
          dl.status,
          dl.dispensary_id,
          dl.offers_delivery,
          dl.offers_pickup,
          dl.is_recreational,
          dl.is_medical,
          dl.first_seen_at,
          dl.last_seen_at,
          dl.verified_at,
          dl.verified_by,
          dl.notes,
          d.name as dispensary_name
        FROM dutchie_discovery_locations dl
        LEFT JOIN dispensaries d ON dl.dispensary_id = d.id
        ${whereClause}
        ORDER BY dl.first_seen_at DESC
        LIMIT $${paramIndex} OFFSET $${paramIndex + 1}
        `,
        params
      );

      // Get total count
      const countParams = params.slice(0, -2);
      const { rows: countRows } = await pool.query(
        `SELECT COUNT(*) as total FROM dutchie_discovery_locations dl ${whereClause}`,
        countParams
      );

      res.json({
        success: true,
        locations: rows.map((r) => ({
          id: r.id,
          platform: r.platform,
          platformLocationId: r.platform_location_id,
          platformSlug: r.platform_slug,
          platformMenuUrl: r.platform_menu_url,
          name: r.name,
          rawAddress: r.raw_address,
          addressLine1: r.address_line1,
          city: r.city,
          stateCode: r.state_code,
          postalCode: r.postal_code,
          countryCode: r.country_code,
          latitude: r.latitude,
          longitude: r.longitude,
          status: r.status,
          dispensaryId: r.dispensary_id,
          dispensaryName: r.dispensary_name,
          offersDelivery: r.offers_delivery,
          offersPickup: r.offers_pickup,
          isRecreational: r.is_recreational,
          isMedical: r.is_medical,
          firstSeenAt: r.first_seen_at,
          lastSeenAt: r.last_seen_at,
          verifiedAt: r.verified_at,
          verifiedBy: r.verified_by,
          notes: r.notes,
        })),
        total: parseInt(countRows[0]?.total || '0', 10),
        limit: limitVal,
        offset: offsetVal,
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error fetching locations:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  /**
   * GET /api/discovery/platforms/dt/locations/:id
   *
   * Get a single location by ID.
   */
  router.get('/locations/:id', async (req: Request, res: Response) => {
    try {
      const { id } = req.params;

      const { rows } = await pool.query(
        `
        SELECT
          dl.*,
          d.name as dispensary_name,
          d.menu_url as dispensary_menu_url
        FROM dutchie_discovery_locations dl
        LEFT JOIN dispensaries d ON dl.dispensary_id = d.id
        WHERE dl.id = $1
        `,
        [parseInt(id, 10)]
      );

      if (rows.length === 0) {
        return res.status(404).json({ success: false, error: 'Location not found' });
      }

      const r = rows[0];
      res.json({
        success: true,
        location: {
          id: r.id,
          platform: r.platform,
          platformLocationId: r.platform_location_id,
          platformSlug: r.platform_slug,
          platformMenuUrl: r.platform_menu_url,
          name: r.name,
          rawAddress: r.raw_address,
          addressLine1: r.address_line1,
          addressLine2: r.address_line2,
          city: r.city,
          stateCode: r.state_code,
          postalCode: r.postal_code,
          countryCode: r.country_code,
          latitude: r.latitude,
          longitude: r.longitude,
          timezone: r.timezone,
          status: r.status,
          dispensaryId: r.dispensary_id,
          dispensaryName: r.dispensary_name,
          dispensaryMenuUrl: r.dispensary_menu_url,
          offersDelivery: r.offers_delivery,
          offersPickup: r.offers_pickup,
          isRecreational: r.is_recreational,
          isMedical: r.is_medical,
          firstSeenAt: r.first_seen_at,
          lastSeenAt: r.last_seen_at,
          verifiedAt: r.verified_at,
          verifiedBy: r.verified_by,
          notes: r.notes,
          metadata: r.metadata,
        },
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error fetching location:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  // ============================================================
  // VERIFICATION ACTIONS
  // ============================================================

  /**
   * POST /api/discovery/platforms/dt/locations/:id/verify-create
   *
   * Verify a discovered location and create a new canonical dispensary.
   */
  router.post('/locations/:id/verify-create', async (req: Request, res: Response) => {
    const client = await pool.connect();
    try {
      const { id } = req.params;
      const { verifiedBy = 'admin' } = req.body;

      await client.query('BEGIN');

      // Get the discovery location
      const { rows: locRows } = await client.query(
        `SELECT * FROM dutchie_discovery_locations WHERE id = $1 FOR UPDATE`,
        [parseInt(id, 10)]
      );

      if (locRows.length === 0) {
        await client.query('ROLLBACK');
        return res.status(404).json({ success: false, error: 'Location not found' });
      }

      const location = locRows[0];

      if (location.status !== 'discovered') {
        await client.query('ROLLBACK');
        return res.status(400).json({
          success: false,
          error: `Cannot verify: location status is '${location.status}'`,
        });
      }

      // Look up state_id if we have a state_code
      let stateId: number | null = null;
      if (location.state_code) {
        const { rows: stateRows } = await client.query(
          `SELECT id FROM states WHERE code = $1`,
          [location.state_code]
        );
        if (stateRows.length > 0) {
          stateId = stateRows[0].id;
        }
      }

      // Create the canonical dispensary
      const { rows: dispRows } = await client.query(
        `
        INSERT INTO dispensaries (
          name,
          slug,
          address,
          city,
          state,
          zip,
          latitude,
          longitude,
          timezone,
          menu_type,
          menu_url,
          platform_dispensary_id,
          state_id,
          active,
          created_at,
          updated_at
        ) VALUES (
          $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, TRUE, NOW(), NOW()
        )
        RETURNING id
        `,
        [
          location.name,
          location.platform_slug,
          location.address_line1,
          location.city,
          location.state_code,
          location.postal_code,
          location.latitude,
          location.longitude,
          location.timezone,
          'dutchie',
          location.platform_menu_url,
          location.platform_location_id,
          stateId,
        ]
      );

      const dispensaryId = dispRows[0].id;

      // Update the discovery location
      await client.query(
        `
        UPDATE dutchie_discovery_locations
        SET status = 'verified',
            dispensary_id = $1,
            verified_at = NOW(),
            verified_by = $2,
            updated_at = NOW()
        WHERE id = $3
        `,
        [dispensaryId, verifiedBy, id]
      );

      await client.query('COMMIT');

      res.json({
        success: true,
        action: 'created',
        discoveryId: parseInt(id, 10),
        dispensaryId,
        message: `Created new dispensary (ID: ${dispensaryId})`,
      });
    } catch (error: any) {
      await client.query('ROLLBACK');
      console.error('[Discovery Routes] Error in verify-create:', error);
      res.status(500).json({ success: false, error: error.message });
    } finally {
      client.release();
    }
  });

  /**
   * POST /api/discovery/platforms/dt/locations/:id/verify-link
   *
   * Link a discovered location to an existing dispensary.
   *
   * Body:
   *   - dispensaryId: number (required)
   *   - verifiedBy: string (optional)
   */
  router.post('/locations/:id/verify-link', async (req: Request, res: Response) => {
    const client = await pool.connect();
    try {
      const { id } = req.params;
      const { dispensaryId, verifiedBy = 'admin' } = req.body;

      if (!dispensaryId) {
        return res.status(400).json({ success: false, error: 'dispensaryId is required' });
      }

      await client.query('BEGIN');

      // Verify dispensary exists
      const { rows: dispRows } = await client.query(
        `SELECT id, name FROM dispensaries WHERE id = $1`,
        [dispensaryId]
      );

      if (dispRows.length === 0) {
        await client.query('ROLLBACK');
        return res.status(404).json({ success: false, error: 'Dispensary not found' });
      }

      // Get the discovery location
      const { rows: locRows } = await client.query(
        `SELECT * FROM dutchie_discovery_locations WHERE id = $1 FOR UPDATE`,
        [parseInt(id, 10)]
      );

      if (locRows.length === 0) {
        await client.query('ROLLBACK');
        return res.status(404).json({ success: false, error: 'Location not found' });
      }

      const location = locRows[0];

      if (location.status !== 'discovered') {
        await client.query('ROLLBACK');
        return res.status(400).json({
          success: false,
          error: `Cannot link: location status is '${location.status}'`,
        });
      }

      // Update dispensary with platform info if missing
      await client.query(
        `
        UPDATE dispensaries
        SET platform_dispensary_id = COALESCE(platform_dispensary_id, $1),
            menu_url = COALESCE(menu_url, $2),
            menu_type = COALESCE(menu_type, 'dutchie'),
            updated_at = NOW()
        WHERE id = $3
        `,
        [location.platform_location_id, location.platform_menu_url, dispensaryId]
      );

      // Update the discovery location
      await client.query(
        `
        UPDATE dutchie_discovery_locations
        SET status = 'merged',
            dispensary_id = $1,
            verified_at = NOW(),
            verified_by = $2,
            updated_at = NOW()
        WHERE id = $3
        `,
        [dispensaryId, verifiedBy, id]
      );

      await client.query('COMMIT');

      res.json({
        success: true,
        action: 'linked',
        discoveryId: parseInt(id, 10),
        dispensaryId,
        dispensaryName: dispRows[0].name,
        message: `Linked to existing dispensary: ${dispRows[0].name}`,
      });
    } catch (error: any) {
      await client.query('ROLLBACK');
      console.error('[Discovery Routes] Error in verify-link:', error);
      res.status(500).json({ success: false, error: error.message });
    } finally {
      client.release();
    }
  });

  /**
   * POST /api/discovery/platforms/dt/locations/:id/reject
   *
   * Reject a discovered location.
   *
   * Body:
   *   - reason: string (optional)
   *   - verifiedBy: string (optional)
   */
  router.post('/locations/:id/reject', async (req: Request, res: Response) => {
    try {
      const { id } = req.params;
      const { reason, verifiedBy = 'admin' } = req.body;

      // Get current status
      const { rows } = await pool.query(
        `SELECT status FROM dutchie_discovery_locations WHERE id = $1`,
        [parseInt(id, 10)]
      );

      if (rows.length === 0) {
        return res.status(404).json({ success: false, error: 'Location not found' });
      }

      if (rows[0].status !== 'discovered') {
        return res.status(400).json({
          success: false,
          error: `Cannot reject: location status is '${rows[0].status}'`,
        });
      }

      await pool.query(
        `
        UPDATE dutchie_discovery_locations
        SET status = 'rejected',
            verified_at = NOW(),
            verified_by = $1,
            notes = COALESCE($2, notes),
            updated_at = NOW()
        WHERE id = $3
        `,
        [verifiedBy, reason, id]
      );

      res.json({
        success: true,
        action: 'rejected',
        discoveryId: parseInt(id, 10),
        message: 'Location rejected',
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error in reject:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  /**
   * POST /api/discovery/platforms/dt/locations/:id/unreject
   *
   * Restore a rejected location to discovered status.
   */
  router.post('/locations/:id/unreject', async (req: Request, res: Response) => {
    try {
      const { id } = req.params;

      // Get current status
      const { rows } = await pool.query(
        `SELECT status FROM dutchie_discovery_locations WHERE id = $1`,
        [parseInt(id, 10)]
      );

      if (rows.length === 0) {
        return res.status(404).json({ success: false, error: 'Location not found' });
      }

      if (rows[0].status !== 'rejected') {
        return res.status(400).json({
          success: false,
          error: `Cannot unreject: location status is '${rows[0].status}'`,
        });
      }

      await pool.query(
        `
        UPDATE dutchie_discovery_locations
        SET status = 'discovered',
            verified_at = NULL,
            verified_by = NULL,
            updated_at = NOW()
        WHERE id = $1
        `,
        [id]
      );

      res.json({
        success: true,
        action: 'unrejected',
        discoveryId: parseInt(id, 10),
        message: 'Location restored to discovered status',
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error in unreject:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  // ============================================================
  // SUMMARY / REPORTING
  // ============================================================

  /**
   * GET /api/discovery/platforms/dt/summary
   *
   * Get discovery summary statistics.
   */
  router.get('/summary', async (_req: Request, res: Response) => {
    try {
      // Total counts by status
      const { rows: statusRows } = await pool.query(`
        SELECT status, COUNT(*) as cnt
        FROM dutchie_discovery_locations
        WHERE platform = 'dutchie' AND active = TRUE
        GROUP BY status
      `);

      const statusCounts: Record<string, number> = {};
      let totalLocations = 0;
      for (const row of statusRows) {
        statusCounts[row.status] = parseInt(row.cnt, 10);
        totalLocations += parseInt(row.cnt, 10);
      }

      // By state
      const { rows: stateRows } = await pool.query(`
        SELECT
          state_code,
          COUNT(*) as total,
          COUNT(*) FILTER (WHERE status = 'verified') as verified,
          COUNT(*) FILTER (WHERE dispensary_id IS NULL AND status = 'discovered') as unlinked
        FROM dutchie_discovery_locations
        WHERE platform = 'dutchie' AND active = TRUE AND state_code IS NOT NULL
        GROUP BY state_code
        ORDER BY total DESC
      `);

      res.json({
        success: true,
        summary: {
          total_locations: totalLocations,
          discovered: statusCounts['discovered'] || 0,
          verified: statusCounts['verified'] || 0,
          merged: statusCounts['merged'] || 0,
          rejected: statusCounts['rejected'] || 0,
        },
        by_state: stateRows.map((r) => ({
          state_code: r.state_code,
          total: parseInt(r.total, 10),
          verified: parseInt(r.verified, 10),
          unlinked: parseInt(r.unlinked, 10),
        })),
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error in summary:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  // ============================================================
  // CITIES
  // ============================================================

  /**
   * GET /api/discovery/platforms/dt/cities
   *
   * List discovery cities.
   */
  router.get('/cities', async (req: Request, res: Response) => {
    try {
      const { state_code, country_code, crawl_enabled, limit = '100', offset = '0' } = req.query;

      let whereClause = "WHERE platform = 'dutchie'";
      const params: any[] = [];
      let paramIndex = 1;

      if (state_code) {
        whereClause += ` AND state_code = $${paramIndex}`;
        params.push(state_code);
        paramIndex++;
      }

      if (country_code) {
        whereClause += ` AND country_code = $${paramIndex}`;
        params.push(country_code);
        paramIndex++;
      }

      if (crawl_enabled === 'true') {
        whereClause += ' AND crawl_enabled = TRUE';
      } else if (crawl_enabled === 'false') {
        whereClause += ' AND crawl_enabled = FALSE';
      }

      params.push(parseInt(limit as string, 10), parseInt(offset as string, 10));

      const { rows } = await pool.query(
        `
        SELECT
          id,
          platform,
          city_name,
          city_slug,
          state_code,
          country_code,
          last_crawled_at,
          crawl_enabled,
          location_count
        FROM dutchie_discovery_cities
        ${whereClause}
        ORDER BY country_code, state_code, city_name
        LIMIT $${paramIndex} OFFSET $${paramIndex + 1}
        `,
        params
      );

      const { rows: countRows } = await pool.query(
        `SELECT COUNT(*) as total FROM dutchie_discovery_cities ${whereClause}`,
        params.slice(0, -2)
      );

      res.json({
        success: true,
        cities: rows.map((r) => ({
          id: r.id,
          platform: r.platform,
          cityName: r.city_name,
          citySlug: r.city_slug,
          stateCode: r.state_code,
          countryCode: r.country_code,
          lastCrawledAt: r.last_crawled_at,
          crawlEnabled: r.crawl_enabled,
          locationCount: r.location_count,
        })),
        total: parseInt(countRows[0]?.total || '0', 10),
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error fetching cities:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  // ============================================================
  // MATCH CANDIDATES
  // ============================================================

  /**
   * GET /api/discovery/platforms/dt/locations/:id/match-candidates
   *
   * Find potential dispensary matches for a discovery location.
   */
  router.get('/locations/:id/match-candidates', async (req: Request, res: Response) => {
    try {
      const { id } = req.params;

      // Get the discovery location
      const { rows: locRows } = await pool.query(
        `SELECT * FROM dutchie_discovery_locations WHERE id = $1`,
        [parseInt(id, 10)]
      );

      if (locRows.length === 0) {
        return res.status(404).json({ success: false, error: 'Location not found' });
      }

      const location = locRows[0];

      // Find potential matches
      const { rows: candidates } = await pool.query(
        `
        SELECT
          d.id,
          d.name,
          d.city,
          d.state,
          d.address,
          d.menu_type,
          d.platform_dispensary_id,
          d.menu_url,
          d.latitude,
          d.longitude,
          CASE
            WHEN d.name ILIKE $1 THEN 'exact_name'
            WHEN d.name ILIKE $2 THEN 'partial_name'
            WHEN d.city ILIKE $3 AND d.state = $4 THEN 'same_city'
            ELSE 'location_match'
          END as match_type,
          CASE
            WHEN d.latitude IS NOT NULL AND d.longitude IS NOT NULL
              AND $5::float IS NOT NULL AND $6::float IS NOT NULL
            THEN (3959 * acos(
              LEAST(1.0, GREATEST(-1.0,
                cos(radians($5::float)) * cos(radians(d.latitude)) *
                cos(radians(d.longitude) - radians($6::float)) +
                sin(radians($5::float)) * sin(radians(d.latitude))
              ))
            ))
            ELSE NULL
          END as distance_miles
        FROM dispensaries d
        WHERE d.state = $4
          AND (
            d.name ILIKE $1
            OR d.name ILIKE $2
            OR d.city ILIKE $3
            OR (
              d.latitude IS NOT NULL
              AND d.longitude IS NOT NULL
              AND $5::float IS NOT NULL
              AND $6::float IS NOT NULL
            )
          )
        ORDER BY
          CASE
            WHEN d.name ILIKE $1 THEN 1
            WHEN d.name ILIKE $2 THEN 2
            ELSE 3
          END,
          distance_miles NULLS LAST
        LIMIT 10
        `,
        [
          location.name,
          `%${location.name.split(' ')[0]}%`,
          location.city,
          location.state_code,
          location.latitude,
          location.longitude,
        ]
      );

      res.json({
        success: true,
        location: {
          id: location.id,
          name: location.name,
          city: location.city,
          stateCode: location.state_code,
        },
        candidates: candidates.map((c) => ({
          id: c.id,
          name: c.name,
          city: c.city,
          state: c.state,
          address: c.address,
          menuType: c.menu_type,
          platformDispensaryId: c.platform_dispensary_id,
          menuUrl: c.menu_url,
          matchType: c.match_type,
          distanceMiles: c.distance_miles ? Math.round(c.distance_miles * 10) / 10 : null,
        })),
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error fetching match candidates:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });
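  // Illustrative note (not part of the original file): the distance_miles
  // expression in the query above is the spherical law of cosines, with the
  // acos argument clamped to [-1, 1] against floating-point drift and 3959
  // used as the Earth's mean radius in miles. The same computation in
  // TypeScript, for reference:
  function distanceMilesSketch(lat1: number, lon1: number, lat2: number, lon2: number): number {
    const rad = (d: number) => (d * Math.PI) / 180;
    // Mirrors LEAST(1.0, GREATEST(-1.0, ...)) in the SQL
    const cosArg = Math.min(1, Math.max(-1,
      Math.cos(rad(lat1)) * Math.cos(rad(lat2)) * Math.cos(rad(lon2) - rad(lon1)) +
      Math.sin(rad(lat1)) * Math.sin(rad(lat2))
    ));
    return 3959 * Math.acos(cosArg);
  }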

  // ============================================================
  // GEO / NEARBY (Admin/Debug Only)
  // ============================================================

  /**
   * GET /api/discovery/platforms/dt/nearby
   *
   * Find discovery locations near a given coordinate.
   * This is an internal/debug endpoint for admin use.
   *
   * Query params:
   *   - lat: number (required)
   *   - lon: number (required)
   *   - radiusKm: number (optional, default 50)
   *   - limit: number (optional, default 20)
   *   - status: string (optional, filter by status)
   */
  router.get('/nearby', async (req: Request, res: Response) => {
    try {
      const { lat, lon, radiusKm = '50', limit = '20', status } = req.query;

      // Validate required params
      if (!lat || !lon) {
        return res.status(400).json({
          success: false,
          error: 'lat and lon are required query parameters',
        });
      }

      const latNum = parseFloat(lat as string);
      const lonNum = parseFloat(lon as string);
      const radiusNum = parseFloat(radiusKm as string);
      const limitNum = parseInt(limit as string, 10);

      if (isNaN(latNum) || isNaN(lonNum)) {
        return res.status(400).json({
          success: false,
          error: 'lat and lon must be valid numbers',
        });
      }

      const geoService = new DiscoveryGeoService(pool);

      const locations = await geoService.findNearbyDiscoveryLocations(latNum, lonNum, {
        radiusKm: radiusNum,
        limit: limitNum,
        platform: 'dutchie',
        status: status as string | undefined,
      });

      res.json({
        success: true,
        center: { lat: latNum, lon: lonNum },
        radiusKm: radiusNum,
        count: locations.length,
        locations,
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error in nearby:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  /**
   * GET /api/discovery/platforms/dt/geo-stats
   *
   * Get coordinate coverage statistics for discovery locations.
   * This is an internal/debug endpoint for admin use.
   */
  router.get('/geo-stats', async (_req: Request, res: Response) => {
    try {
      const geoService = new DiscoveryGeoService(pool);
      const stats = await geoService.getCoordinateCoverageStats();

      res.json({
        success: true,
        stats,
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error in geo-stats:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  /**
   * GET /api/discovery/platforms/dt/locations/:id/validate-geo
   *
   * Validate the geographic data for a discovery location.
   * This is an internal/debug endpoint for admin use.
   */
  router.get('/locations/:id/validate-geo', async (req: Request, res: Response) => {
    try {
      const { id } = req.params;

      // Get the location
      const { rows } = await pool.query(
        `SELECT latitude, longitude, state_code, country_code, name
         FROM dutchie_discovery_locations WHERE id = $1`,
        [parseInt(id, 10)]
      );

      if (rows.length === 0) {
        return res.status(404).json({ success: false, error: 'Location not found' });
      }

      const location = rows[0];
      const geoValidation = new GeoValidationService();
      const result = geoValidation.validateLocationState({
        latitude: location.latitude,
        longitude: location.longitude,
        state_code: location.state_code,
        country_code: location.country_code,
      });

      res.json({
        success: true,
        location: {
          id: parseInt(id, 10),
          name: location.name,
          latitude: location.latitude,
          longitude: location.longitude,
          stateCode: location.state_code,
          countryCode: location.country_code,
        },
        validation: result,
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error in validate-geo:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  return router;
}

export default createDutchieDiscoveryRoutes;
@@ -1,92 +0,0 @@
/**
 * Dutchie AZ Data Pipeline
 *
 * Isolated data pipeline for crawling and storing Dutchie Arizona dispensary data.
 * This module is completely separate from the main application database.
 *
 * Features:
 * - Two-mode crawling (Mode A: UI parity, Mode B: MAX COVERAGE)
 * - Derived stockStatus field (in_stock, out_of_stock, unknown)
 * - Full raw payload storage for 100% data preservation
 * - AZDHS dispensary list as canonical source
 */

// Types
export * from './types';

// Database
export {
  getDutchieAZPool,
  query,
  getClient,
  closePool,
  healthCheck,
} from './db/connection';

export {
  createSchema,
  dropSchema,
  schemaExists,
  ensureSchema,
} from './db/schema';

// Services - GraphQL Client
export {
  GRAPHQL_HASHES,
  ARIZONA_CENTERPOINTS,
  resolveDispensaryId,
  fetchAllProducts,
  fetchAllProductsBothModes,
  discoverArizonaDispensaries,
  // Alias for backward compatibility
  discoverArizonaDispensaries as discoverDispensaries,
} from './services/graphql-client';

// Services - Discovery
export {
  importFromExistingDispensaries,
  discoverDispensaries as discoverAndSaveDispensaries,
  resolvePlatformDispensaryIds,
  getAllDispensaries,
  getDispensaryById,
  getDispensariesWithPlatformIds,
} from './services/discovery';

// Services - Product Crawler
export {
  normalizeProduct,
  normalizeSnapshot,
  crawlDispensaryProducts,
  crawlAllArizonaDispensaries,
} from './services/product-crawler';

export type { CrawlResult } from './services/product-crawler';

// Services - Scheduler
export {
  startScheduler,
  stopScheduler,
  triggerImmediateCrawl,
  getSchedulerStatus,
  crawlSingleDispensary,
  // Schedule config CRUD
  getAllSchedules,
  getScheduleById,
  createSchedule,
  updateSchedule,
  deleteSchedule,
  triggerScheduleNow,
  initializeDefaultSchedules,
  // Run logs
  getRunLogs,
} from './services/scheduler';

// Services - AZDHS Import
export {
  importAZDHSDispensaries,
  importFromJSON,
  getImportStats,
} from './services/azdhs-import';

// Routes
export { default as dutchieAZRouter } from './routes';
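A minimal consumption sketch for the barrel module above; the module path and the zero-argument call signatures are assumptions for illustration, while the export names are exactly those listed:

import { ensureSchema, healthCheck, crawlAllArizonaDispensaries, closePool } from './dutchie-az';

async function runPipelineOnce(): Promise<void> {
  await ensureSchema();                 // create the isolated schema if missing (signature assumed)
  if (!(await healthCheck())) {
    throw new Error('Dutchie AZ database unreachable');
  }
  await crawlAllArizonaDispensaries();  // two-mode crawl described in the header (signature assumed)
  await closePool();
}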
@@ -1,682 +0,0 @@
/**
 * Analytics API Routes
 *
 * Provides REST API endpoints for all analytics services.
 * All routes are prefixed with /api/analytics
 *
 * Phase 3: Analytics Dashboards
 */

import { Router, Request, Response } from 'express';
import { Pool } from 'pg';
import {
  AnalyticsCache,
  PriceTrendService,
  PenetrationService,
  CategoryAnalyticsService,
  StoreChangeService,
  BrandOpportunityService,
} from '../services/analytics';

export function createAnalyticsRouter(pool: Pool): Router {
  const router = Router();

  // Initialize services
  const cache = new AnalyticsCache(pool, { defaultTtlMinutes: 15 });
  const priceService = new PriceTrendService(pool, cache);
  const penetrationService = new PenetrationService(pool, cache);
  const categoryService = new CategoryAnalyticsService(pool, cache);
  const storeService = new StoreChangeService(pool, cache);
  const brandOpportunityService = new BrandOpportunityService(pool, cache);
||||
// ============================================================
|
||||
// PRICE ANALYTICS
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* GET /api/analytics/price/product/:id
|
||||
* Get price trend for a specific product
|
||||
*/
|
||||
router.get('/price/product/:id', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const productId = parseInt(req.params.id);
|
||||
const storeId = req.query.storeId ? parseInt(req.query.storeId as string) : undefined;
|
||||
const days = req.query.days ? parseInt(req.query.days as string) : 30;
|
||||
|
||||
const result = await priceService.getProductPriceTrend(productId, storeId, days);
|
||||
res.json(result);
|
||||
} catch (error) {
|
||||
console.error('[Analytics] Price product error:', error);
|
||||
res.status(500).json({ error: 'Failed to fetch product price trend' });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* GET /api/analytics/price/brand/:name
|
||||
 * Get price trend for a brand
 */
router.get('/price/brand/:name', async (req: Request, res: Response) => {
  try {
    const brandName = decodeURIComponent(req.params.name);
    const filters = {
      storeId: req.query.storeId ? parseInt(req.query.storeId as string) : undefined,
      category: req.query.category as string | undefined,
      state: req.query.state as string | undefined,
      days: req.query.days ? parseInt(req.query.days as string) : 30,
    };

    const result = await priceService.getBrandPriceTrend(brandName, filters);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Price brand error:', error);
    res.status(500).json({ error: 'Failed to fetch brand price trend' });
  }
});

/**
 * GET /api/analytics/price/category/:name
 * Get price trend for a category
 */
router.get('/price/category/:name', async (req: Request, res: Response) => {
  try {
    const category = decodeURIComponent(req.params.name);
    const filters = {
      storeId: req.query.storeId ? parseInt(req.query.storeId as string) : undefined,
      brandName: req.query.brand as string | undefined,
      state: req.query.state as string | undefined,
      days: req.query.days ? parseInt(req.query.days as string) : 30,
    };

    const result = await priceService.getCategoryPriceTrend(category, filters);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Price category error:', error);
    res.status(500).json({ error: 'Failed to fetch category price trend' });
  }
});

/**
 * GET /api/analytics/price/summary
 * Get price summary statistics
 */
router.get('/price/summary', async (req: Request, res: Response) => {
  try {
    const filters = {
      storeId: req.query.storeId ? parseInt(req.query.storeId as string) : undefined,
      brandName: req.query.brand as string | undefined,
      category: req.query.category as string | undefined,
      state: req.query.state as string | undefined,
    };

    const result = await priceService.getPriceSummary(filters);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Price summary error:', error);
    res.status(500).json({ error: 'Failed to fetch price summary' });
  }
});

/**
 * GET /api/analytics/price/compression/:category
 * Get price compression analysis for a category
 */
router.get('/price/compression/:category', async (req: Request, res: Response) => {
  try {
    const category = decodeURIComponent(req.params.category);
    const state = req.query.state as string | undefined;

    const result = await priceService.detectPriceCompression(category, state);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Price compression error:', error);
    res.status(500).json({ error: 'Failed to analyze price compression' });
  }
});

/**
 * GET /api/analytics/price/global
 * Get global price statistics
 */
router.get('/price/global', async (_req: Request, res: Response) => {
  try {
    const result = await priceService.getGlobalPriceStats();
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Global price error:', error);
    res.status(500).json({ error: 'Failed to fetch global price stats' });
  }
});
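
// --- Example (illustrative only, not used by the router) -------------------
// A minimal client sketch of how the price endpoints above are queried.
// BASE_URL and the brand string are placeholders; the query params map 1:1
// to the filters the handlers parse.
async function examplePriceTrendCall(): Promise<unknown> {
  const BASE_URL = 'http://localhost:3000'; // placeholder host
  const params = new URLSearchParams({ days: '30', state: 'AZ' });
  const res = await fetch(
    `${BASE_URL}/api/analytics/price/brand/${encodeURIComponent('Some Brand')}?${params}`
  );
  if (!res.ok) throw new Error(`Request failed: ${res.status}`);
  return res.json();
}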

// ============================================================
// PENETRATION ANALYTICS
// ============================================================

/**
 * GET /api/analytics/penetration/brand/:name
 * Get penetration data for a brand
 */
router.get('/penetration/brand/:name', async (req: Request, res: Response) => {
  try {
    const brandName = decodeURIComponent(req.params.name);
    const filters = {
      state: req.query.state as string | undefined,
      category: req.query.category as string | undefined,
    };

    const result = await penetrationService.getBrandPenetration(brandName, filters);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Brand penetration error:', error);
    res.status(500).json({ error: 'Failed to fetch brand penetration' });
  }
});

/**
 * GET /api/analytics/penetration/top
 * Get top brands by penetration
 */
router.get('/penetration/top', async (req: Request, res: Response) => {
  try {
    const limit = req.query.limit ? parseInt(req.query.limit as string) : 20;
    const filters = {
      state: req.query.state as string | undefined,
      category: req.query.category as string | undefined,
      minStores: req.query.minStores ? parseInt(req.query.minStores as string) : 2,
      minSkus: req.query.minSkus ? parseInt(req.query.minSkus as string) : 5,
    };

    const result = await penetrationService.getTopBrandsByPenetration(limit, filters);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Top penetration error:', error);
    res.status(500).json({ error: 'Failed to fetch top brands' });
  }
});

/**
 * GET /api/analytics/penetration/trend/:brand
 * Get penetration trend for a brand
 */
router.get('/penetration/trend/:brand', async (req: Request, res: Response) => {
  try {
    const brandName = decodeURIComponent(req.params.brand);
    const days = req.query.days ? parseInt(req.query.days as string) : 30;

    const result = await penetrationService.getPenetrationTrend(brandName, days);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Penetration trend error:', error);
    res.status(500).json({ error: 'Failed to fetch penetration trend' });
  }
});

/**
 * GET /api/analytics/penetration/shelf-share/:brand
 * Get shelf share by category for a brand
 */
router.get('/penetration/shelf-share/:brand', async (req: Request, res: Response) => {
  try {
    const brandName = decodeURIComponent(req.params.brand);
    const result = await penetrationService.getShelfShareByCategory(brandName);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Shelf share error:', error);
    res.status(500).json({ error: 'Failed to fetch shelf share' });
  }
});

/**
 * GET /api/analytics/penetration/by-state/:brand
 * Get brand presence by state
 */
router.get('/penetration/by-state/:brand', async (req: Request, res: Response) => {
  try {
    const brandName = decodeURIComponent(req.params.brand);
    const result = await penetrationService.getBrandPresenceByState(brandName);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Brand by state error:', error);
    res.status(500).json({ error: 'Failed to fetch brand presence by state' });
  }
});

/**
 * GET /api/analytics/penetration/stores/:brand
 * Get stores carrying a brand
 */
router.get('/penetration/stores/:brand', async (req: Request, res: Response) => {
  try {
    const brandName = decodeURIComponent(req.params.brand);
    const result = await penetrationService.getStoresCarryingBrand(brandName);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Stores carrying brand error:', error);
    res.status(500).json({ error: 'Failed to fetch stores' });
  }
});

/**
 * GET /api/analytics/penetration/heatmap
 * Get penetration heatmap data
 */
router.get('/penetration/heatmap', async (req: Request, res: Response) => {
  try {
    const brandName = req.query.brand as string | undefined;
    const result = await penetrationService.getPenetrationHeatmap(brandName);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Heatmap error:', error);
    res.status(500).json({ error: 'Failed to fetch heatmap data' });
  }
});
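
// --- Example (illustrative only) --------------------------------------------
// Omitted query params on /penetration/top fall back to the handler's
// defaults above: limit=20, minStores=2, minSkus=5.
async function exampleTopPenetrationCall(): Promise<unknown> {
  const res = await fetch('/api/analytics/penetration/top?state=AZ&limit=10');
  return res.json();
}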

// ============================================================
// CATEGORY ANALYTICS
// ============================================================

/**
 * GET /api/analytics/category/summary
 * Get category summary
 */
router.get('/category/summary', async (req: Request, res: Response) => {
  try {
    const category = req.query.category as string | undefined;
    const filters = {
      state: req.query.state as string | undefined,
      storeId: req.query.storeId ? parseInt(req.query.storeId as string) : undefined,
    };

    const result = await categoryService.getCategorySummary(category, filters);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Category summary error:', error);
    res.status(500).json({ error: 'Failed to fetch category summary' });
  }
});

/**
 * GET /api/analytics/category/growth
 * Get category growth data
 */
router.get('/category/growth', async (req: Request, res: Response) => {
  try {
    const days = req.query.days ? parseInt(req.query.days as string) : 7;
    const filters = {
      state: req.query.state as string | undefined,
      storeId: req.query.storeId ? parseInt(req.query.storeId as string) : undefined,
      minSkus: req.query.minSkus ? parseInt(req.query.minSkus as string) : 10,
    };

    const result = await categoryService.getCategoryGrowth(days, filters);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Category growth error:', error);
    res.status(500).json({ error: 'Failed to fetch category growth' });
  }
});

/**
 * GET /api/analytics/category/trend/:category
 * Get category growth trend over time
 */
router.get('/category/trend/:category', async (req: Request, res: Response) => {
  try {
    const category = decodeURIComponent(req.params.category);
    const days = req.query.days ? parseInt(req.query.days as string) : 90;

    const result = await categoryService.getCategoryGrowthTrend(category, days);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Category trend error:', error);
    res.status(500).json({ error: 'Failed to fetch category trend' });
  }
});

/**
 * GET /api/analytics/category/heatmap
 * Get category heatmap data
 */
router.get('/category/heatmap', async (req: Request, res: Response) => {
  try {
    const metric = (req.query.metric as 'skus' | 'growth' | 'price') || 'skus';
    const periods = req.query.periods ? parseInt(req.query.periods as string) : 12;

    const result = await categoryService.getCategoryHeatmap(metric, periods);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Category heatmap error:', error);
    res.status(500).json({ error: 'Failed to fetch heatmap' });
  }
});

/**
 * GET /api/analytics/category/top-movers
 * Get top growing and declining categories
 */
router.get('/category/top-movers', async (req: Request, res: Response) => {
  try {
    const limit = req.query.limit ? parseInt(req.query.limit as string) : 5;
    const days = req.query.days ? parseInt(req.query.days as string) : 30;

    const result = await categoryService.getTopMovers(limit, days);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Top movers error:', error);
    res.status(500).json({ error: 'Failed to fetch top movers' });
  }
});

/**
 * GET /api/analytics/category/:category/subcategories
 * Get subcategory breakdown
 */
router.get('/category/:category/subcategories', async (req: Request, res: Response) => {
  try {
    const category = decodeURIComponent(req.params.category);
    const result = await categoryService.getSubcategoryBreakdown(category);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Subcategory error:', error);
    res.status(500).json({ error: 'Failed to fetch subcategories' });
  }
});
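
// --- Note (illustrative only) -------------------------------------------
// The `as 'skus' | 'growth' | 'price'` cast in the heatmap handler is
// compile-time only; an unexpected ?metric value would reach the service
// unchanged. A minimal runtime guard could look like this sketch:
const EXAMPLE_ALLOWED_METRICS = ['skus', 'growth', 'price'] as const;
type ExampleHeatmapMetric = (typeof EXAMPLE_ALLOWED_METRICS)[number];

function exampleParseMetric(raw: unknown): ExampleHeatmapMetric {
  return EXAMPLE_ALLOWED_METRICS.includes(raw as ExampleHeatmapMetric)
    ? (raw as ExampleHeatmapMetric)
    : 'skus';
}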

// ============================================================
// STORE CHANGE TRACKING
// ============================================================

/**
 * GET /api/analytics/store/:id/summary
 * Get change summary for a store
 */
router.get('/store/:id/summary', async (req: Request, res: Response) => {
  try {
    const storeId = parseInt(req.params.id);
    const result = await storeService.getStoreChangeSummary(storeId);

    if (!result) {
      return res.status(404).json({ error: 'Store not found' });
    }

    res.json(result);
  } catch (error) {
    console.error('[Analytics] Store summary error:', error);
    res.status(500).json({ error: 'Failed to fetch store summary' });
  }
});

/**
 * GET /api/analytics/store/:id/events
 * Get recent change events for a store
 */
router.get('/store/:id/events', async (req: Request, res: Response) => {
  try {
    const storeId = parseInt(req.params.id);
    const filters = {
      eventType: req.query.type as string | undefined,
      days: req.query.days ? parseInt(req.query.days as string) : 30,
      limit: req.query.limit ? parseInt(req.query.limit as string) : 100,
    };

    const result = await storeService.getStoreChangeEvents(storeId, filters);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Store events error:', error);
    res.status(500).json({ error: 'Failed to fetch store events' });
  }
});

/**
 * GET /api/analytics/store/:id/brands/new
 * Get new brands added to a store
 */
router.get('/store/:id/brands/new', async (req: Request, res: Response) => {
  try {
    const storeId = parseInt(req.params.id);
    const days = req.query.days ? parseInt(req.query.days as string) : 30;

    const result = await storeService.getNewBrands(storeId, days);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] New brands error:', error);
    res.status(500).json({ error: 'Failed to fetch new brands' });
  }
});

/**
 * GET /api/analytics/store/:id/brands/lost
 * Get brands lost from a store
 */
router.get('/store/:id/brands/lost', async (req: Request, res: Response) => {
  try {
    const storeId = parseInt(req.params.id);
    const days = req.query.days ? parseInt(req.query.days as string) : 30;

    const result = await storeService.getLostBrands(storeId, days);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Lost brands error:', error);
    res.status(500).json({ error: 'Failed to fetch lost brands' });
  }
});

/**
 * GET /api/analytics/store/:id/products/changes
 * Get product changes for a store
 */
router.get('/store/:id/products/changes', async (req: Request, res: Response) => {
  try {
    const storeId = parseInt(req.params.id);
    const changeType = req.query.type as 'added' | 'discontinued' | 'price_drop' | 'price_increase' | 'restocked' | 'out_of_stock' | undefined;
    const days = req.query.days ? parseInt(req.query.days as string) : 7;

    const result = await storeService.getProductChanges(storeId, changeType, days);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Product changes error:', error);
    res.status(500).json({ error: 'Failed to fetch product changes' });
  }
});

/**
 * GET /api/analytics/store/leaderboard/:category
 * Get category leaderboard across stores
 */
router.get('/store/leaderboard/:category', async (req: Request, res: Response) => {
  try {
    const category = decodeURIComponent(req.params.category);
    const limit = req.query.limit ? parseInt(req.query.limit as string) : 20;

    const result = await storeService.getCategoryLeaderboard(category, limit);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Leaderboard error:', error);
    res.status(500).json({ error: 'Failed to fetch leaderboard' });
  }
});

/**
 * GET /api/analytics/store/most-active
 * Get most active stores (by changes)
 */
router.get('/store/most-active', async (req: Request, res: Response) => {
  try {
    const days = req.query.days ? parseInt(req.query.days as string) : 7;
    const limit = req.query.limit ? parseInt(req.query.limit as string) : 10;

    const result = await storeService.getMostActiveStores(days, limit);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Most active error:', error);
    res.status(500).json({ error: 'Failed to fetch active stores' });
  }
});

/**
 * GET /api/analytics/store/compare
 * Compare two stores
 */
router.get('/store/compare', async (req: Request, res: Response) => {
  try {
    const store1 = parseInt(req.query.store1 as string);
    const store2 = parseInt(req.query.store2 as string);

    if (!store1 || !store2) {
      return res.status(400).json({ error: 'Both store1 and store2 are required' });
    }

    const result = await storeService.compareStores(store1, store2);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Compare stores error:', error);
    res.status(500).json({ error: 'Failed to compare stores' });
  }
});
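
// Behavior of the truthiness guard above (illustrative):
//   GET /api/analytics/store/compare                      -> 400 (both missing; parseInt -> NaN)
//   GET /api/analytics/store/compare?store1=1&store2=abc  -> 400 (NaN is falsy)
//   GET /api/analytics/store/compare?store1=1&store2=2    -> 200 with comparison
// Note it also rejects a literal store id of 0, since 0 is falsy.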

// ============================================================
// BRAND OPPORTUNITY / RISK
// ============================================================

/**
 * GET /api/analytics/brand/:name/opportunity
 * Get full opportunity analysis for a brand
 */
router.get('/brand/:name/opportunity', async (req: Request, res: Response) => {
  try {
    const brandName = decodeURIComponent(req.params.name);
    const result = await brandOpportunityService.getBrandOpportunity(brandName);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Brand opportunity error:', error);
    res.status(500).json({ error: 'Failed to fetch brand opportunity' });
  }
});

/**
 * GET /api/analytics/brand/:name/position
 * Get market position summary for a brand
 */
router.get('/brand/:name/position', async (req: Request, res: Response) => {
  try {
    const brandName = decodeURIComponent(req.params.name);
    const result = await brandOpportunityService.getMarketPositionSummary(brandName);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Brand position error:', error);
    res.status(500).json({ error: 'Failed to fetch brand position' });
  }
});

// ============================================================
// ALERTS
// ============================================================

/**
 * GET /api/analytics/alerts
 * Get analytics alerts
 */
router.get('/alerts', async (req: Request, res: Response) => {
  try {
    const filters = {
      brandName: req.query.brand as string | undefined,
      storeId: req.query.storeId ? parseInt(req.query.storeId as string) : undefined,
      alertType: req.query.type as string | undefined,
      unreadOnly: req.query.unreadOnly === 'true',
      limit: req.query.limit ? parseInt(req.query.limit as string) : 50,
    };

    const result = await brandOpportunityService.getAlerts(filters);
    res.json(result);
  } catch (error) {
    console.error('[Analytics] Alerts error:', error);
    res.status(500).json({ error: 'Failed to fetch alerts' });
  }
});

/**
 * POST /api/analytics/alerts/mark-read
 * Mark alerts as read
 */
router.post('/alerts/mark-read', async (req: Request, res: Response) => {
  try {
    const { alertIds } = req.body;

    if (!Array.isArray(alertIds)) {
      return res.status(400).json({ error: 'alertIds must be an array' });
    }

    await brandOpportunityService.markAlertsRead(alertIds);
    res.json({ success: true });
  } catch (error) {
    console.error('[Analytics] Mark read error:', error);
    res.status(500).json({ error: 'Failed to mark alerts as read' });
  }
});
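
// --- Example (illustrative only) --------------------------------------------
// Client call for the mark-read route; the handler only checks that
// alertIds is an array, so ids are passed through to the service as-is.
async function exampleMarkAlertsRead(): Promise<void> {
  await fetch('/api/analytics/alerts/mark-read', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ alertIds: [101, 102, 103] }),
  });
}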

// ============================================================
// CACHE MANAGEMENT
// ============================================================

/**
 * GET /api/analytics/cache/stats
 * Get cache statistics
 */
router.get('/cache/stats', async (_req: Request, res: Response) => {
  try {
    const stats = await cache.getStats();
    res.json(stats);
  } catch (error) {
    console.error('[Analytics] Cache stats error:', error);
    res.status(500).json({ error: 'Failed to get cache stats' });
  }
});

/**
 * POST /api/analytics/cache/clear
 * Clear cache (admin only)
 */
router.post('/cache/clear', async (req: Request, res: Response) => {
  try {
    const pattern = req.query.pattern as string | undefined;

    if (pattern) {
      const cleared = await cache.invalidatePattern(pattern);
      res.json({ success: true, clearedCount: cleared });
    } else {
      await cache.cleanExpired();
      res.json({ success: true, message: 'Expired entries cleaned' });
    }
  } catch (error) {
    console.error('[Analytics] Cache clear error:', error);
    res.status(500).json({ error: 'Failed to clear cache' });
  }
});

// ============================================================
// SNAPSHOT CAPTURE (for cron/scheduled jobs)
// ============================================================

/**
 * POST /api/analytics/snapshots/capture
 * Capture daily snapshots (run by scheduler)
 */
router.post('/snapshots/capture', async (_req: Request, res: Response) => {
  try {
    const [brandResult, categoryResult] = await Promise.all([
      pool.query('SELECT capture_brand_snapshots() as count'),
      pool.query('SELECT capture_category_snapshots() as count'),
    ]);

    res.json({
      success: true,
      brandSnapshots: parseInt(brandResult.rows[0]?.count || '0'),
      categorySnapshots: parseInt(categoryResult.rows[0]?.count || '0'),
    });
  } catch (error) {
    console.error('[Analytics] Snapshot capture error:', error);
    res.status(500).json({ error: 'Failed to capture snapshots' });
  }
});
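
// --- Example (illustrative only) --------------------------------------------
// The snapshot route is meant to be hit by a scheduler. A minimal in-process
// sketch (placeholder URL; a real deployment would more likely use cron or a
// CI scheduler):
function exampleScheduleSnapshotCapture(): void {
  const DAY_MS = 24 * 60 * 60 * 1000;
  setInterval(() => {
    fetch('http://localhost:3000/api/analytics/snapshots/capture', { method: 'POST' })
      .then((r) => r.json())
      .then((j) => console.log('[Snapshots]', j))
      .catch((e) => console.error('[Snapshots] capture failed:', e));
  }, DAY_MS);
}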

  return router;
}
File diff suppressed because it is too large
@@ -1,486 +0,0 @@
#!/usr/bin/env npx tsx
/**
 * Crawler Reliability Stress Test
 *
 * Simulates various failure scenarios to test:
 * - Retry logic with exponential backoff
 * - Error taxonomy classification
 * - Self-healing (proxy/UA rotation)
 * - Status transitions (active -> degraded -> failed)
 * - Minimum crawl gap enforcement
 *
 * Phase 1: Crawler Reliability & Stabilization
 *
 * Usage:
 *   DATABASE_URL="postgresql://..." npx tsx src/dutchie-az/scripts/stress-test.ts [test-name]
 *
 * Available tests:
 *   retry    - Test retry manager with various error types
 *   backoff  - Test exponential backoff calculation
 *   status   - Test status transitions
 *   gap      - Test minimum crawl gap enforcement
 *   rotation - Test proxy/UA rotation
 *   all      - Run all tests
 */

import {
  CrawlErrorCode,
  classifyError,
  isRetryable,
  shouldRotateProxy,
  shouldRotateUserAgent,
  getBackoffMultiplier,
  getErrorMetadata,
} from '../services/error-taxonomy';

import {
  RetryManager,
  withRetry,
  calculateNextCrawlDelay,
  calculateNextCrawlAt,
  determineCrawlStatus,
  shouldAttemptRecovery,
  sleep,
} from '../services/retry-manager';

import {
  UserAgentRotator,
  USER_AGENTS,
} from '../services/proxy-rotator';

import {
  validateStoreConfig,
  isCrawlable,
  DEFAULT_CONFIG,
  RawStoreConfig,
} from '../services/store-validator';

// ============================================================
// TEST UTILITIES
// ============================================================

let testsPassed = 0;
let testsFailed = 0;

function assert(condition: boolean, message: string): void {
  if (condition) {
    console.log(`  ✓ ${message}`);
    testsPassed++;
  } else {
    console.log(`  ✗ ${message}`);
    testsFailed++;
  }
}

function section(name: string): void {
  console.log(`\n${'='.repeat(60)}`);
  console.log(`TEST: ${name}`);
  console.log('='.repeat(60));
}

// ============================================================
// TEST: Error Classification
// ============================================================

function testErrorClassification(): void {
  section('Error Classification');

  // HTTP status codes
  assert(classifyError(null, 429) === CrawlErrorCode.RATE_LIMITED, '429 -> RATE_LIMITED');
  assert(classifyError(null, 407) === CrawlErrorCode.BLOCKED_PROXY, '407 -> BLOCKED_PROXY');
  assert(classifyError(null, 401) === CrawlErrorCode.AUTH_FAILED, '401 -> AUTH_FAILED');
  assert(classifyError(null, 403) === CrawlErrorCode.AUTH_FAILED, '403 -> AUTH_FAILED');
  assert(classifyError(null, 503) === CrawlErrorCode.SERVICE_UNAVAILABLE, '503 -> SERVICE_UNAVAILABLE');
  assert(classifyError(null, 500) === CrawlErrorCode.SERVER_ERROR, '500 -> SERVER_ERROR');

  // Error messages
  assert(classifyError('rate limit exceeded') === CrawlErrorCode.RATE_LIMITED, 'rate limit message -> RATE_LIMITED');
  assert(classifyError('request timed out') === CrawlErrorCode.TIMEOUT, 'timeout message -> TIMEOUT');
  assert(classifyError('proxy blocked') === CrawlErrorCode.BLOCKED_PROXY, 'proxy blocked -> BLOCKED_PROXY');
  assert(classifyError('ECONNREFUSED') === CrawlErrorCode.NETWORK_ERROR, 'ECONNREFUSED -> NETWORK_ERROR');
  assert(classifyError('ENOTFOUND') === CrawlErrorCode.DNS_ERROR, 'ENOTFOUND -> DNS_ERROR');
  assert(classifyError('selector not found') === CrawlErrorCode.HTML_CHANGED, 'selector error -> HTML_CHANGED');
  assert(classifyError('JSON parse error') === CrawlErrorCode.PARSE_ERROR, 'parse error -> PARSE_ERROR');
  assert(classifyError('0 products found') === CrawlErrorCode.NO_PRODUCTS, 'no products -> NO_PRODUCTS');

  // Retryability
  assert(isRetryable(CrawlErrorCode.RATE_LIMITED) === true, 'RATE_LIMITED is retryable');
  assert(isRetryable(CrawlErrorCode.TIMEOUT) === true, 'TIMEOUT is retryable');
  assert(isRetryable(CrawlErrorCode.HTML_CHANGED) === false, 'HTML_CHANGED is NOT retryable');
  assert(isRetryable(CrawlErrorCode.INVALID_CONFIG) === false, 'INVALID_CONFIG is NOT retryable');

  // Rotation decisions
  assert(shouldRotateProxy(CrawlErrorCode.BLOCKED_PROXY) === true, 'BLOCKED_PROXY -> rotate proxy');
  assert(shouldRotateProxy(CrawlErrorCode.RATE_LIMITED) === true, 'RATE_LIMITED -> rotate proxy');
  assert(shouldRotateUserAgent(CrawlErrorCode.AUTH_FAILED) === true, 'AUTH_FAILED -> rotate UA');
}

// ============================================================
// TEST: Retry Manager
// ============================================================

function testRetryManager(): void {
  section('Retry Manager');

  const manager = new RetryManager({ maxRetries: 3, baseBackoffMs: 100 });

  // Initial state
  assert(manager.shouldAttempt() === true, 'Should attempt initially');
  assert(manager.getAttemptNumber() === 1, 'Attempt number starts at 1');

  // First attempt
  manager.recordAttempt();
  assert(manager.getAttemptNumber() === 2, 'Attempt number increments');

  // Evaluate retryable error
  const decision1 = manager.evaluateError(new Error('rate limit exceeded'), 429);
  assert(decision1.shouldRetry === true, 'Should retry on rate limit');
  assert(decision1.errorCode === CrawlErrorCode.RATE_LIMITED, 'Error code is RATE_LIMITED');
  assert(decision1.rotateProxy === true, 'Should rotate proxy');
  assert(decision1.backoffMs > 0, 'Backoff is positive');

  // More attempts
  manager.recordAttempt();
  manager.recordAttempt();

  // Now at max retries
  const decision2 = manager.evaluateError(new Error('timeout'), 504);
  assert(decision2.shouldRetry === true, 'Should still retry (at limit but not exceeded)');

  manager.recordAttempt();
  const decision3 = manager.evaluateError(new Error('timeout'));
  assert(decision3.shouldRetry === false, 'Should NOT retry after max');
  assert(decision3.reason.includes('exhausted'), 'Reason mentions exhausted');

  // Reset
  manager.reset();
  assert(manager.shouldAttempt() === true, 'Should attempt after reset');
  assert(manager.getAttemptNumber() === 1, 'Attempt number resets');

  // Non-retryable error
  const manager2 = new RetryManager({ maxRetries: 3 });
  manager2.recordAttempt();
  const nonRetryable = manager2.evaluateError(new Error('HTML structure changed'));
  assert(nonRetryable.shouldRetry === false, 'Non-retryable error stops immediately');
  assert(nonRetryable.errorCode === CrawlErrorCode.HTML_CHANGED, 'Error code is HTML_CHANGED');
}

// ============================================================
// TEST: Exponential Backoff
// ============================================================

function testExponentialBackoff(): void {
  section('Exponential Backoff');

  // Calculate next crawl delay
  const delay0 = calculateNextCrawlDelay(0, 240); // No failures
  const delay1 = calculateNextCrawlDelay(1, 240); // 1 failure
  const delay2 = calculateNextCrawlDelay(2, 240); // 2 failures
  const delay3 = calculateNextCrawlDelay(3, 240); // 3 failures
  const delay5 = calculateNextCrawlDelay(5, 240); // 5 failures (should cap)

  console.log(`  Delay with 0 failures: ${delay0} minutes`);
  console.log(`  Delay with 1 failure:  ${delay1} minutes`);
  console.log(`  Delay with 2 failures: ${delay2} minutes`);
  console.log(`  Delay with 3 failures: ${delay3} minutes`);
  console.log(`  Delay with 5 failures: ${delay5} minutes`);

  assert(delay1 > delay0, 'Delay increases with failures');
  assert(delay2 > delay1, 'Delay keeps increasing');
  assert(delay3 > delay2, 'More delay with more failures');
  // With jitter, exact values vary but the ratio should be close to 2x
  assert(delay5 <= 240 * 4 * 1.2, 'Delay is capped at max multiplier');

  // Next crawl time calculation
  const now = new Date();
  const nextAt = calculateNextCrawlAt(2, 240);
  assert(nextAt > now, 'Next crawl is in future');
  assert(nextAt.getTime() - now.getTime() > 240 * 60 * 1000, 'Includes backoff');
}
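
// The assertions above pin down the shape of calculateNextCrawlDelay without
// showing its body. A formula consistent with them (an inferred sketch, NOT
// the actual retry-manager implementation) would be:
//
//   delayMinutes = baseMinutes * min(2^failures, 4) * jitter,  jitter in [0.8, 1.2]
//
function sketchNextCrawlDelay(failures: number, baseMinutes: number): number {
  const multiplier = Math.min(Math.pow(2, failures), 4); // capped exponential
  const jitter = 0.8 + Math.random() * 0.4; // +/-20% spread
  return baseMinutes * multiplier * jitter;
}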

// ============================================================
// TEST: Status Transitions
// ============================================================

function testStatusTransitions(): void {
  section('Status Transitions');

  // Active status
  assert(determineCrawlStatus(0) === 'active', '0 failures -> active');
  assert(determineCrawlStatus(1) === 'active', '1 failure -> active');
  assert(determineCrawlStatus(2) === 'active', '2 failures -> active');

  // Degraded status
  assert(determineCrawlStatus(3) === 'degraded', '3 failures -> degraded');
  assert(determineCrawlStatus(5) === 'degraded', '5 failures -> degraded');
  assert(determineCrawlStatus(9) === 'degraded', '9 failures -> degraded');

  // Failed status
  assert(determineCrawlStatus(10) === 'failed', '10 failures -> failed');
  assert(determineCrawlStatus(15) === 'failed', '15 failures -> failed');

  // Custom thresholds
  const customStatus = determineCrawlStatus(5, { degraded: 5, failed: 8 });
  assert(customStatus === 'degraded', 'Custom threshold: 5 -> degraded');

  // Recovery check
  const recentFailure = new Date(Date.now() - 1 * 60 * 60 * 1000); // 1 hour ago
  const oldFailure = new Date(Date.now() - 48 * 60 * 60 * 1000); // 48 hours ago

  assert(shouldAttemptRecovery(recentFailure, 1) === false, 'No recovery for recent failure');
  assert(shouldAttemptRecovery(oldFailure, 1) === true, 'Recovery allowed for old failure');
  assert(shouldAttemptRecovery(null, 0) === true, 'Recovery allowed if no previous failure');
}
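
// The expected mapping above implies default thresholds of degraded >= 3 and
// failed >= 10 consecutive failures. A sketch consistent with these cases
// (names assumed; the real logic lives in retry-manager):
type SketchCrawlStatus = 'active' | 'degraded' | 'failed';

function sketchCrawlStatus(
  failures: number,
  thresholds: { degraded: number; failed: number } = { degraded: 3, failed: 10 }
): SketchCrawlStatus {
  if (failures >= thresholds.failed) return 'failed';
  if (failures >= thresholds.degraded) return 'degraded';
  return 'active';
}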

// ============================================================
// TEST: Store Validation
// ============================================================

function testStoreValidation(): void {
  section('Store Validation');

  // Valid config
  const validConfig: RawStoreConfig = {
    id: 1,
    name: 'Test Store',
    platformDispensaryId: '123abc',
    menuType: 'dutchie',
  };
  const validResult = validateStoreConfig(validConfig);
  assert(validResult.isValid === true, 'Valid config passes');
  assert(validResult.config !== null, 'Valid config returns config');
  assert(validResult.config?.slug === 'test-store', 'Slug is generated');

  // Missing required fields
  const missingId: RawStoreConfig = {
    id: 0,
    name: 'Test',
    platformDispensaryId: '123',
    menuType: 'dutchie',
  };
  const missingIdResult = validateStoreConfig(missingId);
  assert(missingIdResult.isValid === false, 'Missing ID fails');

  // Missing platform ID
  const missingPlatform: RawStoreConfig = {
    id: 1,
    name: 'Test',
    menuType: 'dutchie',
  };
  const missingPlatformResult = validateStoreConfig(missingPlatform);
  assert(missingPlatformResult.isValid === false, 'Missing platform ID fails');

  // Unknown menu type
  const unknownMenu: RawStoreConfig = {
    id: 1,
    name: 'Test',
    platformDispensaryId: '123',
    menuType: 'unknown',
  };
  const unknownMenuResult = validateStoreConfig(unknownMenu);
  assert(unknownMenuResult.isValid === false, 'Unknown menu type fails');

  // Crawlable check
  assert(isCrawlable(validConfig) === true, 'Valid config is crawlable');
  assert(isCrawlable(missingPlatform) === false, 'Missing platform not crawlable');
  assert(isCrawlable({ ...validConfig, crawlStatus: 'failed' }) === false, 'Failed status not crawlable');
  assert(isCrawlable({ ...validConfig, crawlStatus: 'paused' }) === false, 'Paused status not crawlable');
}

// ============================================================
// TEST: User Agent Rotation
// ============================================================

function testUserAgentRotation(): void {
  section('User Agent Rotation');

  const rotator = new UserAgentRotator();

  const first = rotator.getCurrent();
  const second = rotator.getNext();
  const third = rotator.getNext();

  assert(first !== second, 'User agents rotate');
  assert(second !== third, 'User agents keep rotating');
  assert(USER_AGENTS.includes(first), 'Returns valid UA');
  assert(USER_AGENTS.includes(second), 'Returns valid UA');

  // Random UA
  const random = rotator.getRandom();
  assert(USER_AGENTS.includes(random), 'Random returns valid UA');

  // Count
  assert(rotator.getCount() === USER_AGENTS.length, 'Reports correct count');
}

// ============================================================
// TEST: WithRetry Helper
// ============================================================

async function testWithRetryHelper(): Promise<void> {
  section('WithRetry Helper');

  // Successful on first try
  let attempts = 0;
  const successResult = await withRetry(async () => {
    attempts++;
    return 'success';
  }, { maxRetries: 3 });
  assert(attempts === 1, 'Succeeds on first try');
  assert(successResult.result === 'success', 'Returns result');

  // Fails then succeeds
  let failThenSucceedAttempts = 0;
  const failThenSuccessResult = await withRetry(async () => {
    failThenSucceedAttempts++;
    if (failThenSucceedAttempts < 3) {
      throw new Error('temporary error');
    }
    return 'finally succeeded';
  }, { maxRetries: 5, baseBackoffMs: 10 });
  assert(failThenSucceedAttempts === 3, 'Retries until success');
  assert(failThenSuccessResult.result === 'finally succeeded', 'Returns final result');
  assert(failThenSuccessResult.summary.attemptsMade === 3, 'Summary tracks attempts');

  // Exhausts retries
  let alwaysFailAttempts = 0;
  try {
    await withRetry(async () => {
      alwaysFailAttempts++;
      throw new Error('always fails');
    }, { maxRetries: 2, baseBackoffMs: 10 });
    assert(false, 'Should have thrown');
  } catch (error: any) {
    assert(alwaysFailAttempts === 3, 'Attempts all retries'); // 1 initial + 2 retries
    assert(error.name === 'RetryExhaustedError', 'Throws RetryExhaustedError');
  }

  // Non-retryable error stops immediately
  let nonRetryableAttempts = 0;
  try {
    await withRetry(async () => {
      nonRetryableAttempts++;
      const err = new Error('HTML structure changed - selector not found');
      throw err;
    }, { maxRetries: 3, baseBackoffMs: 10 });
    assert(false, 'Should have thrown');
  } catch {
    assert(nonRetryableAttempts === 1, 'Non-retryable stops immediately');
  }
}

// ============================================================
// TEST: Minimum Crawl Gap
// ============================================================

function testMinimumCrawlGap(): void {
  section('Minimum Crawl Gap');

  // Default config
  assert(DEFAULT_CONFIG.minCrawlGapMinutes === 2, 'Default gap is 2 minutes');
  assert(DEFAULT_CONFIG.crawlFrequencyMinutes === 240, 'Default frequency is 4 hours');

  // Gap calculation
  const gapMs = DEFAULT_CONFIG.minCrawlGapMinutes * 60 * 1000;
  assert(gapMs === 120000, 'Gap is 2 minutes in ms');

  console.log('  Note: Gap enforcement is tested at DB level (trigger) and application level');
}

// ============================================================
// TEST: Error Metadata
// ============================================================

function testErrorMetadata(): void {
  section('Error Metadata');

  // RATE_LIMITED
  const rateLimited = getErrorMetadata(CrawlErrorCode.RATE_LIMITED);
  assert(rateLimited.retryable === true, 'RATE_LIMITED is retryable');
  assert(rateLimited.rotateProxy === true, 'RATE_LIMITED rotates proxy');
  assert(rateLimited.backoffMultiplier === 2.0, 'RATE_LIMITED has 2x backoff');
  assert(rateLimited.severity === 'medium', 'RATE_LIMITED is medium severity');

  // HTML_CHANGED
  const htmlChanged = getErrorMetadata(CrawlErrorCode.HTML_CHANGED);
  assert(htmlChanged.retryable === false, 'HTML_CHANGED is NOT retryable');
  assert(htmlChanged.severity === 'high', 'HTML_CHANGED is high severity');

  // INVALID_CONFIG
  const invalidConfig = getErrorMetadata(CrawlErrorCode.INVALID_CONFIG);
  assert(invalidConfig.retryable === false, 'INVALID_CONFIG is NOT retryable');
  assert(invalidConfig.severity === 'critical', 'INVALID_CONFIG is critical');
}

// ============================================================
// MAIN
// ============================================================

async function runTests(testName?: string): Promise<void> {
  console.log('\n');
  console.log('╔══════════════════════════════════════════════════════════╗');
  console.log('║        CRAWLER RELIABILITY STRESS TEST - PHASE 1         ║');
  console.log('╚══════════════════════════════════════════════════════════╝');

  const allTests = !testName || testName === 'all';

  if (allTests || testName === 'error' || testName === 'classification') {
    testErrorClassification();
  }

  if (allTests || testName === 'retry') {
    testRetryManager();
  }

  if (allTests || testName === 'backoff') {
    testExponentialBackoff();
  }

  if (allTests || testName === 'status') {
    testStatusTransitions();
  }

  if (allTests || testName === 'validation' || testName === 'store') {
    testStoreValidation();
  }

  if (allTests || testName === 'rotation' || testName === 'ua') {
    testUserAgentRotation();
  }

  if (allTests || testName === 'withRetry' || testName === 'helper') {
    await testWithRetryHelper();
  }

  if (allTests || testName === 'gap') {
    testMinimumCrawlGap();
  }

  if (allTests || testName === 'metadata') {
    testErrorMetadata();
  }

  // Summary
  console.log('\n');
  console.log('═'.repeat(60));
  console.log('SUMMARY');
  console.log('═'.repeat(60));
  console.log(`  Passed: ${testsPassed}`);
  console.log(`  Failed: ${testsFailed}`);
  console.log(`  Total:  ${testsPassed + testsFailed}`);

  if (testsFailed > 0) {
    console.log('\n❌ SOME TESTS FAILED\n');
    process.exit(1);
  } else {
    console.log('\n✅ ALL TESTS PASSED\n');
    process.exit(0);
  }
}

// Run tests
const testName = process.argv[2];
runTests(testName).catch((error) => {
  console.error('Fatal error:', error);
  process.exit(1);
});
@@ -1,659 +0,0 @@
/**
 * Brand Opportunity / Risk Analytics Service
 *
 * Provides brand-level opportunity and risk analysis including:
 * - Under/overpriced vs market
 * - Missing SKU opportunities
 * - Stores with declining/growing shelf share
 * - Competitor intrusion alerts
 *
 * Phase 3: Analytics Dashboards
 */

import { Pool } from 'pg';
import { AnalyticsCache, cacheKey } from './cache';

export interface BrandOpportunity {
  brandName: string;
  underpricedVsMarket: PricePosition[];
  overpricedVsMarket: PricePosition[];
  missingSkuOpportunities: MissingSkuOpportunity[];
  storesWithDecliningShelfShare: StoreShelfShareChange[];
  storesWithGrowingShelfShare: StoreShelfShareChange[];
  competitorIntrusionAlerts: CompetitorAlert[];
  overallScore: number; // 0-100, higher = more opportunity
  riskScore: number; // 0-100, higher = more risk
}

export interface PricePosition {
  category: string;
  brandAvgPrice: number;
  marketAvgPrice: number;
  priceDifferencePercent: number;
  skuCount: number;
  suggestion: string;
}

export interface MissingSkuOpportunity {
  category: string;
  subcategory: string | null;
  marketSkuCount: number;
  brandSkuCount: number;
  gapPercent: number;
  topCompetitors: string[];
  opportunityScore: number; // 0-100
}

export interface StoreShelfShareChange {
  storeId: number;
  storeName: string;
  city: string;
  state: string;
  currentShelfShare: number;
  previousShelfShare: number;
  changePercent: number;
  currentSkus: number;
  competitors: string[];
}

export interface CompetitorAlert {
  competitorBrand: string;
  storeId: number;
  storeName: string;
  alertType: 'new_entry' | 'expanding' | 'price_undercut';
  details: string;
  severity: 'low' | 'medium' | 'high';
  date: string;
}

export interface MarketPositionSummary {
  brandName: string;
  marketSharePercent: number;
  avgPriceVsMarket: number; // -X% to +X%
  categoryStrengths: Array<{ category: string; shelfSharePercent: number }>;
  categoryWeaknesses: Array<{ category: string; shelfSharePercent: number; marketLeader: string }>;
  growthTrend: 'growing' | 'stable' | 'declining';
  competitorThreats: string[];
}

export class BrandOpportunityService {
  private pool: Pool;
  private cache: AnalyticsCache;

  constructor(pool: Pool, cache: AnalyticsCache) {
    this.pool = pool;
    this.cache = cache;
  }

  /**
   * Get full opportunity analysis for a brand
   */
  async getBrandOpportunity(brandName: string): Promise<BrandOpportunity> {
    const key = cacheKey('brand_opportunity', { brandName });

    return (await this.cache.getOrCompute(key, async () => {
      const [
        underpriced,
        overpriced,
        missingSkus,
        decliningStores,
        growingStores,
        alerts,
      ] = await Promise.all([
        this.getUnderpricedPositions(brandName),
        this.getOverpricedPositions(brandName),
        this.getMissingSkuOpportunities(brandName),
        this.getStoresWithDecliningShare(brandName),
        this.getStoresWithGrowingShare(brandName),
        this.getCompetitorAlerts(brandName),
      ]);

      // Calculate opportunity score (higher = more opportunity)
      const opportunityFactors = [
        missingSkus.length > 0 ? 20 : 0,
        underpriced.length > 0 ? 15 : 0,
        growingStores.length > 5 ? 20 : growingStores.length * 3,
        missingSkus.reduce((sum, m) => sum + m.opportunityScore, 0) / Math.max(1, missingSkus.length) * 0.3,
      ];
      const opportunityScore = Math.min(100, opportunityFactors.reduce((a, b) => a + b, 0));

      // Calculate risk score (higher = more risk)
      const riskFactors = [
        decliningStores.length > 5 ? 30 : decliningStores.length * 5,
        alerts.filter(a => a.severity === 'high').length * 15,
        alerts.filter(a => a.severity === 'medium').length * 8,
        overpriced.length > 3 ? 15 : overpriced.length * 3,
      ];
      const riskScore = Math.min(100, riskFactors.reduce((a, b) => a + b, 0));

      return {
        brandName,
        underpricedVsMarket: underpriced,
        overpricedVsMarket: overpriced,
        missingSkuOpportunities: missingSkus,
        storesWithDecliningShelfShare: decliningStores,
        storesWithGrowingShelfShare: growingStores,
        competitorIntrusionAlerts: alerts,
        overallScore: Math.round(opportunityScore),
        riskScore: Math.round(riskScore),
      };
    }, 30)).data;
  }
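
  // Worked example of the scoring above (hypothetical inputs): with 2
  // missing-SKU gaps averaging opportunityScore 60, 1 underpriced category,
  // and 3 stores with growing share:
  //   opportunityFactors = [20, 15, 3 * 3, 60 * 0.3] = [20, 15, 9, 18]
  //   opportunityScore   = min(100, 62) = 62
  // And with 2 declining stores, 1 high + 2 medium alerts, 0 overpriced:
  //   riskFactors = [2 * 5, 1 * 15, 2 * 8, 0] = [10, 15, 16, 0]
  //   riskScore   = min(100, 41) = 41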

  /**
   * Get categories where brand is underpriced vs market
   */
  async getUnderpricedPositions(brandName: string): Promise<PricePosition[]> {
    const result = await this.pool.query(`
      WITH brand_prices AS (
        SELECT
          type as category,
          AVG(extract_min_price(latest_raw_payload)) as brand_avg,
          COUNT(*) as sku_count
        FROM dutchie_products
        WHERE brand_name = $1 AND type IS NOT NULL
        GROUP BY type
        HAVING COUNT(*) >= 3
      ),
      market_prices AS (
        SELECT
          type as category,
          AVG(extract_min_price(latest_raw_payload)) as market_avg
        FROM dutchie_products
        WHERE type IS NOT NULL AND brand_name != $1
        GROUP BY type
      )
      SELECT
        bp.category,
        bp.brand_avg,
        mp.market_avg,
        bp.sku_count,
        ((bp.brand_avg - mp.market_avg) / NULLIF(mp.market_avg, 0)) * 100 as diff_pct
      FROM brand_prices bp
      JOIN market_prices mp ON bp.category = mp.category
      WHERE bp.brand_avg < mp.market_avg * 0.9 -- 10% or more below market
        AND bp.brand_avg IS NOT NULL
        AND mp.market_avg IS NOT NULL
      ORDER BY diff_pct
    `, [brandName]);

    return result.rows.map(row => ({
      category: row.category,
      brandAvgPrice: Math.round(parseFloat(row.brand_avg) * 100) / 100,
      marketAvgPrice: Math.round(parseFloat(row.market_avg) * 100) / 100,
      priceDifferencePercent: Math.round(parseFloat(row.diff_pct) * 10) / 10,
      skuCount: parseInt(row.sku_count) || 0,
      suggestion: `Consider price increase - ${Math.abs(Math.round(parseFloat(row.diff_pct)))}% below market average`,
    }));
  }

  /**
   * Get categories where brand is overpriced vs market
   */
  async getOverpricedPositions(brandName: string): Promise<PricePosition[]> {
    const result = await this.pool.query(`
      WITH brand_prices AS (
        SELECT
          type as category,
          AVG(extract_min_price(latest_raw_payload)) as brand_avg,
          COUNT(*) as sku_count
        FROM dutchie_products
        WHERE brand_name = $1 AND type IS NOT NULL
        GROUP BY type
        HAVING COUNT(*) >= 3
      ),
      market_prices AS (
        SELECT
          type as category,
          AVG(extract_min_price(latest_raw_payload)) as market_avg
        FROM dutchie_products
        WHERE type IS NOT NULL AND brand_name != $1
        GROUP BY type
      )
      SELECT
        bp.category,
        bp.brand_avg,
        mp.market_avg,
        bp.sku_count,
        ((bp.brand_avg - mp.market_avg) / NULLIF(mp.market_avg, 0)) * 100 as diff_pct
      FROM brand_prices bp
      JOIN market_prices mp ON bp.category = mp.category
      WHERE bp.brand_avg > mp.market_avg * 1.15 -- 15% or more above market
        AND bp.brand_avg IS NOT NULL
        AND mp.market_avg IS NOT NULL
      ORDER BY diff_pct DESC
    `, [brandName]);

    return result.rows.map(row => ({
      category: row.category,
      brandAvgPrice: Math.round(parseFloat(row.brand_avg) * 100) / 100,
      marketAvgPrice: Math.round(parseFloat(row.market_avg) * 100) / 100,
      priceDifferencePercent: Math.round(parseFloat(row.diff_pct) * 10) / 10,
      skuCount: parseInt(row.sku_count) || 0,
      suggestion: `Price sensitivity risk - ${Math.round(parseFloat(row.diff_pct))}% above market average`,
    }));
  }

  /**
   * Get missing SKU opportunities (category gaps)
   */
  async getMissingSkuOpportunities(brandName: string): Promise<MissingSkuOpportunity[]> {
    const result = await this.pool.query(`
      WITH market_categories AS (
        SELECT
          type as category,
          subcategory,
          COUNT(*) as market_skus,
          ARRAY_AGG(DISTINCT brand_name ORDER BY brand_name) FILTER (WHERE brand_name IS NOT NULL) as top_brands
        FROM dutchie_products
        WHERE type IS NOT NULL
        GROUP BY type, subcategory
        HAVING COUNT(*) >= 20
      ),
      brand_presence AS (
        SELECT
          type as category,
          subcategory,
          COUNT(*) as brand_skus
        FROM dutchie_products
        WHERE brand_name = $1 AND type IS NOT NULL
        GROUP BY type, subcategory
      )
      SELECT
        mc.category,
        mc.subcategory,
        mc.market_skus,
        COALESCE(bp.brand_skus, 0) as brand_skus,
        mc.top_brands[1:5] as competitors
      FROM market_categories mc
      LEFT JOIN brand_presence bp ON mc.category = bp.category
        AND (mc.subcategory = bp.subcategory OR (mc.subcategory IS NULL AND bp.subcategory IS NULL))
      WHERE COALESCE(bp.brand_skus, 0) < mc.market_skus * 0.05 -- Brand has <5% of market presence
      ORDER BY mc.market_skus DESC
      LIMIT 10
    `, [brandName]);

    return result.rows.map(row => {
      const marketSkus = parseInt(row.market_skus) || 0;
      const brandSkus = parseInt(row.brand_skus) || 0;
      const gapPercent = marketSkus > 0 ? ((marketSkus - brandSkus) / marketSkus) * 100 : 100;
      const opportunityScore = Math.min(100, Math.round((marketSkus / 100) * (gapPercent / 100) * 100));

      return {
        category: row.category,
        subcategory: row.subcategory,
        marketSkuCount: marketSkus,
        brandSkuCount: brandSkus,
        gapPercent: Math.round(gapPercent),
        topCompetitors: (row.competitors || []).filter((c: string) => c !== brandName).slice(0, 5),
        opportunityScore,
      };
    });
  }
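
  // Worked example of the gap math above (hypothetical numbers): with
  // marketSkus = 250 and brandSkus = 5,
  //   gapPercent       = ((250 - 5) / 250) * 100 = 98
  //   opportunityScore = min(100, round((250 / 100) * 0.98 * 100)) = min(100, 245) = 100
  // so large, nearly untouched categories saturate the score at 100.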

  /**
   * Get stores where brand's shelf share is declining
   */
  async getStoresWithDecliningShare(brandName: string): Promise<StoreShelfShareChange[]> {
    // NOTE: a true decline comparison needs brand_snapshots history; until
    // that is wired in, "declining" is proxied as low current presence
    // (brand_skus < 10) and previousShelfShare mirrors the current value.
    const result = await this.pool.query(`
      WITH current_share AS (
        SELECT
          dp.dispensary_id as store_id,
          d.name as store_name,
          d.city,
          d.state,
          COUNT(*) FILTER (WHERE dp.brand_name = $1) as brand_skus,
          COUNT(*) as total_skus,
          ARRAY_AGG(DISTINCT dp.brand_name) FILTER (WHERE dp.brand_name != $1 AND dp.brand_name IS NOT NULL) as competitors
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        GROUP BY dp.dispensary_id, d.name, d.city, d.state
        HAVING COUNT(*) FILTER (WHERE dp.brand_name = $1) > 0
      )
      SELECT
        cs.store_id,
        cs.store_name,
        cs.city,
        cs.state,
        cs.brand_skus as current_skus,
        cs.total_skus,
        ROUND((cs.brand_skus::NUMERIC / cs.total_skus) * 100, 2) as current_share,
        cs.competitors[1:5] as top_competitors
      FROM current_share cs
      WHERE cs.brand_skus < 10 -- Low presence
      ORDER BY cs.brand_skus
      LIMIT 10
    `, [brandName]);

    return result.rows.map(row => ({
      storeId: row.store_id,
      storeName: row.store_name,
      city: row.city,
      state: row.state,
      currentShelfShare: parseFloat(row.current_share) || 0,
      previousShelfShare: parseFloat(row.current_share) || 0, // Would need historical data
      changePercent: 0,
      currentSkus: parseInt(row.current_skus) || 0,
      competitors: row.top_competitors || [],
    }));
  }

  /**
   * Get stores where brand's shelf share is growing
   */
  async getStoresWithGrowingShare(brandName: string): Promise<StoreShelfShareChange[]> {
    const result = await this.pool.query(`
      WITH store_share AS (
        SELECT
          dp.dispensary_id as store_id,
          d.name as store_name,
          d.city,
          d.state,
          COUNT(*) FILTER (WHERE dp.brand_name = $1) as brand_skus,
          COUNT(*) as total_skus,
          ARRAY_AGG(DISTINCT dp.brand_name) FILTER (WHERE dp.brand_name != $1 AND dp.brand_name IS NOT NULL) as competitors
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        GROUP BY dp.dispensary_id, d.name, d.city, d.state
        HAVING COUNT(*) FILTER (WHERE dp.brand_name = $1) > 0
      )
      SELECT
        ss.store_id,
        ss.store_name,
        ss.city,
        ss.state,
        ss.brand_skus as current_skus,
        ss.total_skus,
        ROUND((ss.brand_skus::NUMERIC / ss.total_skus) * 100, 2) as current_share,
        ss.competitors[1:5] as top_competitors
      FROM store_share ss
      ORDER BY current_share DESC
      LIMIT 10
    `, [brandName]);

    return result.rows.map(row => ({
      storeId: row.store_id,
      storeName: row.store_name,
      city: row.city,
      state: row.state,
      currentShelfShare: parseFloat(row.current_share) || 0,
      previousShelfShare: parseFloat(row.current_share) || 0,
      changePercent: 0,
      currentSkus: parseInt(row.current_skus) || 0,
      competitors: row.top_competitors || [],
    }));
  }

  /**
   * Get competitor intrusion alerts
   */
  async getCompetitorAlerts(brandName: string): Promise<CompetitorAlert[]> {
    // Check for competitor entries in stores where this brand has presence
    const result = await this.pool.query(`
      WITH brand_stores AS (
        SELECT DISTINCT dispensary_id
        FROM dutchie_products
        WHERE brand_name = $1
      ),
      competitor_presence AS (
        SELECT
          dp.brand_name as competitor,
          dp.dispensary_id as store_id,
          d.name as store_name,
          COUNT(*) as sku_count,
          MAX(dp.created_at) as latest_add
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        WHERE dp.dispensary_id IN (SELECT dispensary_id FROM brand_stores)
          AND dp.brand_name != $1
          AND dp.brand_name IS NOT NULL
          AND dp.created_at >= NOW() - INTERVAL '30 days'
        GROUP BY dp.brand_name, dp.dispensary_id, d.name
        HAVING COUNT(*) >= 5
      )
      SELECT
        competitor,
        store_id,
        store_name,
        sku_count,
        latest_add
      FROM competitor_presence
      ORDER BY sku_count DESC
      LIMIT 10
    `, [brandName]);

    return result.rows.map(row => {
      const skuCount = parseInt(row.sku_count) || 0;
      let severity: 'low' | 'medium' | 'high' = 'low';
      if (skuCount >= 20) severity = 'high';
      else if (skuCount >= 10) severity = 'medium';

      return {
        competitorBrand: row.competitor,
        storeId: row.store_id,
        storeName: row.store_name,
        alertType: 'expanding' as const,
        details: `${row.competitor} has ${skuCount} SKUs in ${row.store_name}`,
        severity,
        date: new Date(row.latest_add).toISOString().split('T')[0],
      };
    });
  }

  /**
   * Get market position summary for a brand
   */
  async getMarketPositionSummary(brandName: string): Promise<MarketPositionSummary> {
    const key = cacheKey('market_position', { brandName });

    return (await this.cache.getOrCompute(key, async () => {
      const [shareResult, priceResult, categoryResult, threatResult] = await Promise.all([
        // Market share
        this.pool.query(`
          SELECT
            (SELECT COUNT(*) FROM dutchie_products WHERE brand_name = $1) as brand_count,
            (SELECT COUNT(*) FROM dutchie_products) as total_count
        `, [brandName]),

        // Price vs market
        this.pool.query(`
          SELECT
            (SELECT AVG(extract_min_price(latest_raw_payload)) FROM dutchie_products WHERE brand_name = $1) as brand_avg,
            (SELECT AVG(extract_min_price(latest_raw_payload)) FROM dutchie_products WHERE brand_name != $1) as market_avg
        `, [brandName]),

        // Category strengths/weaknesses
        this.pool.query(`
          WITH brand_by_cat AS (
            SELECT type as category, COUNT(*) as brand_count
            FROM dutchie_products
            WHERE brand_name = $1 AND type IS NOT NULL
            GROUP BY type
          ),
          market_by_cat AS (
            SELECT type as category, COUNT(*) as total_count
            FROM dutchie_products WHERE type IS NOT NULL
            GROUP BY type
          ),
          leaders AS (
            SELECT type as category, brand_name, COUNT(*) as cnt,
                   RANK() OVER (PARTITION BY type ORDER BY COUNT(*) DESC) as rnk
            FROM dutchie_products WHERE type IS NOT NULL AND brand_name IS NOT NULL
            GROUP BY type, brand_name
          )
          SELECT
            mc.category,
            COALESCE(bc.brand_count, 0) as brand_count,
            mc.total_count,
            ROUND((COALESCE(bc.brand_count, 0)::NUMERIC / mc.total_count) * 100, 2) as share_pct,
            (SELECT brand_name FROM leaders WHERE category = mc.category AND rnk = 1) as leader
          FROM market_by_cat mc
          LEFT JOIN brand_by_cat bc ON mc.category = bc.category
          ORDER BY share_pct DESC
        `, [brandName]),

        // Top competitors
        this.pool.query(`
          SELECT brand_name, COUNT(*) as cnt
          FROM dutchie_products
          WHERE brand_name IS NOT NULL AND brand_name != $1
          GROUP BY brand_name
          ORDER BY cnt DESC
          LIMIT 5
        `, [brandName]),
      ]);

      const brandCount = parseInt(shareResult.rows[0]?.brand_count) || 0;
      const totalCount = parseInt(shareResult.rows[0]?.total_count) || 1;
      const marketSharePercent = Math.round((brandCount / totalCount) * 1000) / 10;

      const brandAvg = parseFloat(priceResult.rows[0]?.brand_avg) || 0;
      const marketAvg = parseFloat(priceResult.rows[0]?.market_avg) || 1;
      const avgPriceVsMarket = Math.round(((brandAvg - marketAvg) / marketAvg) * 1000) / 10;

      const categories = categoryResult.rows;
      const strengths = categories
        .filter(c => parseFloat(c.share_pct) > 5)
        .map(c => ({ category: c.category, shelfSharePercent: parseFloat(c.share_pct) }));

      const weaknesses = categories
        .filter(c => parseFloat(c.share_pct) < 2 && c.leader !== brandName)
        .map(c => ({
          category: c.category,
          shelfSharePercent: parseFloat(c.share_pct),
          marketLeader: c.leader || 'Unknown',
        }));

      return {
        brandName,
        marketSharePercent,
        avgPriceVsMarket,
        categoryStrengths: strengths.slice(0, 5),
        categoryWeaknesses: weaknesses.slice(0, 5),
        growthTrend: 'stable' as const, // Would need historical data
        competitorThreats: threatResult.rows.map(r => r.brand_name),
      };
    }, 30)).data;
  }
|
||||
|
||||
/**
|
||||
* Create an analytics alert
|
||||
*/
|
||||
async createAlert(alert: {
|
||||
alertType: string;
|
||||
severity: 'info' | 'warning' | 'critical';
|
||||
title: string;
|
||||
description?: string;
|
||||
storeId?: number;
|
||||
brandName?: string;
|
||||
productId?: number;
|
||||
category?: string;
|
||||
metadata?: Record<string, unknown>;
|
||||
}): Promise<void> {
|
||||
await this.pool.query(`
|
||||
INSERT INTO analytics_alerts
|
||||
(alert_type, severity, title, description, store_id, brand_name, product_id, category, metadata)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
|
||||
`, [
|
||||
alert.alertType,
|
||||
alert.severity,
|
||||
alert.title,
|
||||
alert.description || null,
|
||||
alert.storeId || null,
|
||||
alert.brandName || null,
|
||||
alert.productId || null,
|
||||
alert.category || null,
|
||||
alert.metadata ? JSON.stringify(alert.metadata) : null,
|
||||
]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get recent alerts
|
||||
*/
|
||||
async getAlerts(filters: {
|
||||
brandName?: string;
|
||||
storeId?: number;
|
||||
alertType?: string;
|
||||
unreadOnly?: boolean;
|
||||
limit?: number;
|
||||
} = {}): Promise<Array<{
|
||||
id: number;
|
||||
alertType: string;
|
||||
severity: string;
|
||||
title: string;
|
||||
description: string | null;
|
||||
storeName: string | null;
|
||||
brandName: string | null;
|
||||
createdAt: string;
|
||||
isRead: boolean;
|
||||
}>> {
|
||||
const { brandName, storeId, alertType, unreadOnly = false, limit = 50 } = filters;
|
||||
const params: (string | number | boolean)[] = [limit];
|
||||
const conditions: string[] = [];
|
||||
let paramIndex = 2;
|
||||
|
||||
if (brandName) {
|
||||
conditions.push(`a.brand_name = $${paramIndex++}`);
|
||||
params.push(brandName);
|
||||
}
|
||||
if (storeId) {
|
||||
conditions.push(`a.store_id = $${paramIndex++}`);
|
||||
params.push(storeId);
|
||||
}
|
||||
if (alertType) {
|
||||
conditions.push(`a.alert_type = $${paramIndex++}`);
|
||||
params.push(alertType);
|
||||
}
|
||||
if (unreadOnly) {
|
||||
conditions.push('a.is_read = false');
|
||||
}
|
||||
|
||||
const whereClause = conditions.length > 0
|
||||
? 'WHERE ' + conditions.join(' AND ')
|
||||
: '';
|
||||
|
||||
const result = await this.pool.query(`
|
||||
SELECT
|
||||
a.id,
|
||||
a.alert_type,
|
||||
a.severity,
|
||||
a.title,
|
||||
a.description,
|
||||
d.name as store_name,
|
||||
a.brand_name,
|
||||
a.created_at,
|
||||
a.is_read
|
||||
FROM analytics_alerts a
|
||||
LEFT JOIN dispensaries d ON a.store_id = d.id
|
||||
${whereClause}
|
||||
ORDER BY a.created_at DESC
|
||||
LIMIT $1
|
||||
`, params);
|
||||
|
||||
return result.rows.map(row => ({
|
||||
id: row.id,
|
||||
alertType: row.alert_type,
|
||||
severity: row.severity,
|
||||
title: row.title,
|
||||
description: row.description,
|
||||
storeName: row.store_name,
|
||||
brandName: row.brand_name,
|
||||
createdAt: row.created_at.toISOString(),
|
||||
isRead: row.is_read,
|
||||
}));
|
||||
}
|
||||
|
||||
/**
|
||||
* Mark alerts as read
|
||||
*/
|
||||
async markAlertsRead(alertIds: number[]): Promise<void> {
|
||||
if (alertIds.length === 0) return;
|
||||
|
||||
await this.pool.query(`
|
||||
UPDATE analytics_alerts
|
||||
SET is_read = true
|
||||
WHERE id = ANY($1)
|
||||
`, [alertIds]);
|
||||
}
|
||||
}
|
||||
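// A minimal usage sketch of the alert round-trip above, assuming this class
// is the BrandOpportunityService exported from the module index and that it
// is constructed with a pg Pool and an AnalyticsCache like the other services
// in this module ('Acme' is a hypothetical brand):
//
//   const svc = new BrandOpportunityService(pool, cache);
//   await svc.createAlert({
//     alertType: 'competitor_intrusion',
//     severity: 'warning',
//     title: 'Competitor expanding in carried stores',
//     brandName: 'Acme',
//   });
//   const unread = await svc.getAlerts({ brandName: 'Acme', unreadOnly: true });
//   await svc.markAlertsRead(unread.map(a => a.id));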
@@ -1,227 +0,0 @@
/**
 * Analytics Cache Service
 *
 * Provides caching layer for expensive analytics queries.
 * Uses PostgreSQL for persistence with configurable TTLs.
 *
 * Phase 3: Analytics Dashboards
 */

import { Pool } from 'pg';

export interface CacheEntry<T = unknown> {
  key: string;
  data: T;
  computedAt: Date;
  expiresAt: Date;
  queryTimeMs?: number;
}

export interface CacheConfig {
  defaultTtlMinutes: number;
}

const DEFAULT_CONFIG: CacheConfig = {
  defaultTtlMinutes: 15,
};

export class AnalyticsCache {
  private pool: Pool;
  private config: CacheConfig;
  private memoryCache: Map<string, CacheEntry> = new Map();

  constructor(pool: Pool, config: Partial<CacheConfig> = {}) {
    this.pool = pool;
    this.config = { ...DEFAULT_CONFIG, ...config };
  }

  /**
   * Get cached data or compute and cache it
   */
  async getOrCompute<T>(
    key: string,
    computeFn: () => Promise<T>,
    ttlMinutes?: number
  ): Promise<{ data: T; fromCache: boolean; queryTimeMs: number }> {
    const ttl = ttlMinutes ?? this.config.defaultTtlMinutes;

    // Check memory cache first
    const memEntry = this.memoryCache.get(key);
    if (memEntry && new Date() < memEntry.expiresAt) {
      return { data: memEntry.data as T, fromCache: true, queryTimeMs: memEntry.queryTimeMs || 0 };
    }

    // Check database cache
    const dbEntry = await this.getFromDb<T>(key);
    if (dbEntry && new Date() < dbEntry.expiresAt) {
      this.memoryCache.set(key, dbEntry);
      return { data: dbEntry.data, fromCache: true, queryTimeMs: dbEntry.queryTimeMs || 0 };
    }

    // Compute fresh data
    const startTime = Date.now();
    const data = await computeFn();
    const queryTimeMs = Date.now() - startTime;

    // Cache result
    const entry: CacheEntry<T> = {
      key,
      data,
      computedAt: new Date(),
      expiresAt: new Date(Date.now() + ttl * 60 * 1000),
      queryTimeMs,
    };

    await this.saveToDb(entry);
    this.memoryCache.set(key, entry);

    return { data, fromCache: false, queryTimeMs };
  }

  /**
   * Get from database cache
   */
  private async getFromDb<T>(key: string): Promise<CacheEntry<T> | null> {
    try {
      const result = await this.pool.query(`
        SELECT cache_data, computed_at, expires_at, query_time_ms
        FROM analytics_cache
        WHERE cache_key = $1
          AND expires_at > NOW()
      `, [key]);

      if (result.rows.length === 0) return null;

      const row = result.rows[0];
      return {
        key,
        data: row.cache_data as T,
        computedAt: row.computed_at,
        expiresAt: row.expires_at,
        queryTimeMs: row.query_time_ms,
      };
    } catch (error) {
      console.warn(`[AnalyticsCache] Failed to get from DB: ${error}`);
      return null;
    }
  }

  /**
   * Save to database cache
   */
  private async saveToDb<T>(entry: CacheEntry<T>): Promise<void> {
    try {
      await this.pool.query(`
        INSERT INTO analytics_cache (cache_key, cache_data, computed_at, expires_at, query_time_ms)
        VALUES ($1, $2, $3, $4, $5)
        ON CONFLICT (cache_key)
        DO UPDATE SET
          cache_data = EXCLUDED.cache_data,
          computed_at = EXCLUDED.computed_at,
          expires_at = EXCLUDED.expires_at,
          query_time_ms = EXCLUDED.query_time_ms
      `, [entry.key, JSON.stringify(entry.data), entry.computedAt, entry.expiresAt, entry.queryTimeMs]);
    } catch (error) {
      console.warn(`[AnalyticsCache] Failed to save to DB: ${error}`);
    }
  }

  /**
   * Invalidate a cache entry
   */
  async invalidate(key: string): Promise<void> {
    this.memoryCache.delete(key);
    try {
      await this.pool.query('DELETE FROM analytics_cache WHERE cache_key = $1', [key]);
    } catch (error) {
      console.warn(`[AnalyticsCache] Failed to invalidate: ${error}`);
    }
  }

  /**
   * Invalidate all entries matching a pattern
   */
  async invalidatePattern(pattern: string): Promise<number> {
    // Clear memory cache
    for (const key of this.memoryCache.keys()) {
      if (key.includes(pattern)) {
        this.memoryCache.delete(key);
      }
    }

    try {
      const result = await this.pool.query(
        'DELETE FROM analytics_cache WHERE cache_key LIKE $1',
        [`%${pattern}%`]
      );
      return result.rowCount || 0;
    } catch (error) {
      console.warn(`[AnalyticsCache] Failed to invalidate pattern: ${error}`);
      return 0;
    }
  }

  /**
   * Clean expired entries
   */
  async cleanExpired(): Promise<number> {
    // Clean memory cache
    const now = new Date();
    for (const [key, entry] of this.memoryCache.entries()) {
      if (now >= entry.expiresAt) {
        this.memoryCache.delete(key);
      }
    }

    try {
      const result = await this.pool.query('DELETE FROM analytics_cache WHERE expires_at < NOW()');
      return result.rowCount || 0;
    } catch (error) {
      console.warn(`[AnalyticsCache] Failed to clean expired: ${error}`);
      return 0;
    }
  }

  /**
   * Get cache statistics
   */
  async getStats(): Promise<{
    memoryCacheSize: number;
    dbCacheSize: number;
    expiredCount: number;
  }> {
    try {
      const result = await this.pool.query(`
        SELECT
          COUNT(*) FILTER (WHERE expires_at > NOW()) as active,
          COUNT(*) FILTER (WHERE expires_at <= NOW()) as expired
        FROM analytics_cache
      `);

      return {
        memoryCacheSize: this.memoryCache.size,
        dbCacheSize: parseInt(result.rows[0]?.active || '0'),
        expiredCount: parseInt(result.rows[0]?.expired || '0'),
      };
    } catch (error) {
      return {
        memoryCacheSize: this.memoryCache.size,
        dbCacheSize: 0,
        expiredCount: 0,
      };
    }
  }
}

/**
 * Generate cache key with parameters
 */
export function cacheKey(prefix: string, params: Record<string, unknown> = {}): string {
  const sortedParams = Object.keys(params)
    .sort()
    .filter(k => params[k] !== undefined && params[k] !== null)
    .map(k => `${k}=${params[k]}`)
    .join('&');

  return sortedParams ? `${prefix}:${sortedParams}` : prefix;
}
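// A minimal usage sketch: getOrCompute checks memory, then the analytics_cache
// table, then computes. Keys are deterministic; cacheKey('store_count',
// { state: 'AZ' }) yields 'store_count:state=AZ', and parameter order never
// matters because keys are sorted. The 'store_count' prefix is hypothetical.
export async function cachedStoreCount(pool: Pool, state: string): Promise<number> {
  // In real use a single AnalyticsCache instance should be shared so the
  // in-memory layer survives between calls; a fresh one only hits the DB tier.
  const cache = new AnalyticsCache(pool, { defaultTtlMinutes: 5 });
  const { data, fromCache } = await cache.getOrCompute(
    cacheKey('store_count', { state }),
    async () => {
      const res = await pool.query('SELECT COUNT(*) as total FROM dispensaries WHERE state = $1', [state]);
      return parseInt(res.rows[0].total, 10);
    },
    10 // override the 5-minute default with a 10-minute TTL
  );
  console.log(fromCache ? 'cache hit' : 'computed fresh');
  return data;
}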
@@ -1,530 +0,0 @@
/**
 * Category Growth Analytics Service
 *
 * Provides category-level analytics including:
 * - SKU count growth
 * - Price growth trends
 * - New product additions
 * - Category shrinkage
 * - Seasonality patterns
 *
 * Phase 3: Analytics Dashboards
 */

import { Pool } from 'pg';
import { AnalyticsCache, cacheKey } from './cache';

export interface CategoryGrowth {
  category: string;
  currentSkuCount: number;
  previousSkuCount: number;
  skuGrowthPercent: number;
  currentBrandCount: number;
  previousBrandCount: number;
  brandGrowthPercent: number;
  currentAvgPrice: number | null;
  previousAvgPrice: number | null;
  priceChangePercent: number | null;
  newProducts: number;
  discontinuedProducts: number;
  trend: 'growing' | 'declining' | 'stable';
}

export interface CategorySummary {
  category: string;
  totalSkus: number;
  brandCount: number;
  storeCount: number;
  avgPrice: number | null;
  minPrice: number | null;
  maxPrice: number | null;
  inStockSkus: number;
  outOfStockSkus: number;
  stockHealthPercent: number;
}

export interface CategoryGrowthTrend {
  category: string;
  dataPoints: Array<{
    date: string;
    skuCount: number;
    brandCount: number;
    avgPrice: number | null;
    storeCount: number;
  }>;
  growth7d: number | null;
  growth30d: number | null;
  growth90d: number | null;
}

export interface CategoryHeatmapData {
  categories: string[];
  periods: string[];
  data: Array<{
    category: string;
    period: string;
    value: number; // SKU count, growth %, or price
    changeFromPrevious: number | null;
  }>;
}

export interface SeasonalityPattern {
  category: string;
  monthlyPattern: Array<{
    month: number;
    monthName: string;
    avgSkuCount: number;
    avgPrice: number | null;
    seasonalityIndex: number; // 100 = average, >100 = above, <100 = below
  }>;
  peakMonth: number;
  troughMonth: number;
}

export interface CategoryFilters {
  state?: string;
  storeId?: number;
  minSkus?: number;
}

export class CategoryAnalyticsService {
  private pool: Pool;
  private cache: AnalyticsCache;

  constructor(pool: Pool, cache: AnalyticsCache) {
    this.pool = pool;
    this.cache = cache;
  }

  /**
   * Get current category summary
   */
  async getCategorySummary(
    category?: string,
    filters: CategoryFilters = {}
  ): Promise<CategorySummary[]> {
    const { state, storeId } = filters;
    const key = cacheKey('category_summary', { category, state, storeId });

    return (await this.cache.getOrCompute(key, async () => {
      const params: (string | number)[] = [];
      const conditions: string[] = [];
      let paramIndex = 1;

      if (category) {
        conditions.push(`dp.type = $${paramIndex++}`);
        params.push(category);
      }
      if (state) {
        conditions.push(`d.state = $${paramIndex++}`);
        params.push(state);
      }
      if (storeId) {
        conditions.push(`dp.dispensary_id = $${paramIndex++}`);
        params.push(storeId);
      }

      const whereClause = conditions.length > 0
        ? 'WHERE dp.type IS NOT NULL AND ' + conditions.join(' AND ')
        : 'WHERE dp.type IS NOT NULL';

      const result = await this.pool.query(`
        SELECT
          dp.type as category,
          COUNT(*) as total_skus,
          COUNT(DISTINCT dp.brand_name) as brand_count,
          COUNT(DISTINCT dp.dispensary_id) as store_count,
          AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
          MIN(extract_min_price(dp.latest_raw_payload)) as min_price,
          MAX(extract_max_price(dp.latest_raw_payload)) as max_price,
          SUM(CASE WHEN dp.stock_status = 'in_stock' THEN 1 ELSE 0 END) as in_stock,
          SUM(CASE WHEN dp.stock_status != 'in_stock' OR dp.stock_status IS NULL THEN 1 ELSE 0 END) as out_of_stock
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        ${whereClause}
        GROUP BY dp.type
        ORDER BY total_skus DESC
      `, params);

      return result.rows.map(row => {
        const totalSkus = parseInt(row.total_skus) || 0;
        const inStock = parseInt(row.in_stock) || 0;

        return {
          category: row.category,
          totalSkus,
          brandCount: parseInt(row.brand_count) || 0,
          storeCount: parseInt(row.store_count) || 0,
          avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
          minPrice: row.min_price ? Math.round(parseFloat(row.min_price) * 100) / 100 : null,
          maxPrice: row.max_price ? Math.round(parseFloat(row.max_price) * 100) / 100 : null,
          inStockSkus: inStock,
          outOfStockSkus: parseInt(row.out_of_stock) || 0,
          stockHealthPercent: totalSkus > 0
            ? Math.round((inStock / totalSkus) * 100)
            : 0,
        };
      });
    }, 15)).data;
  }

  /**
   * Get category growth (comparing periods)
   */
  async getCategoryGrowth(
    days: number = 7,
    filters: CategoryFilters = {}
  ): Promise<CategoryGrowth[]> {
    const { state, storeId, minSkus = 10 } = filters;
    const key = cacheKey('category_growth', { days, state, storeId, minSkus });

    return (await this.cache.getOrCompute(key, async () => {
      // Use category_snapshots for historical comparison
      const result = await this.pool.query(`
        WITH current_data AS (
          SELECT
            category,
            total_skus,
            brand_count,
            avg_price,
            store_count
          FROM category_snapshots
          WHERE snapshot_date = (SELECT MAX(snapshot_date) FROM category_snapshots)
        ),
        previous_data AS (
          SELECT
            category,
            total_skus,
            brand_count,
            avg_price,
            store_count
          FROM category_snapshots
          WHERE snapshot_date = (
            SELECT MAX(snapshot_date)
            FROM category_snapshots
            WHERE snapshot_date < (SELECT MAX(snapshot_date) FROM category_snapshots) - ($1 || ' days')::INTERVAL
          )
        )
        SELECT
          c.category,
          c.total_skus as current_skus,
          COALESCE(p.total_skus, c.total_skus) as previous_skus,
          c.brand_count as current_brands,
          COALESCE(p.brand_count, c.brand_count) as previous_brands,
          c.avg_price as current_price,
          p.avg_price as previous_price
        FROM current_data c
        LEFT JOIN previous_data p ON c.category = p.category
        WHERE c.total_skus >= $2
        ORDER BY c.total_skus DESC
      `, [days, minSkus]);

      // If no snapshots exist, use current data
      if (result.rows.length === 0) {
        const fallbackResult = await this.pool.query(`
          SELECT
            type as category,
            COUNT(*) as total_skus,
            COUNT(DISTINCT brand_name) as brand_count,
            AVG(extract_min_price(latest_raw_payload)) as avg_price
          FROM dutchie_products
          WHERE type IS NOT NULL
          GROUP BY type
          HAVING COUNT(*) >= $1
          ORDER BY total_skus DESC
        `, [minSkus]);

        return fallbackResult.rows.map(row => ({
          category: row.category,
          currentSkuCount: parseInt(row.total_skus) || 0,
          previousSkuCount: parseInt(row.total_skus) || 0,
          skuGrowthPercent: 0,
          currentBrandCount: parseInt(row.brand_count) || 0,
          previousBrandCount: parseInt(row.brand_count) || 0,
          brandGrowthPercent: 0,
          currentAvgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
          previousAvgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
          priceChangePercent: null,
          newProducts: 0,
          discontinuedProducts: 0,
          trend: 'stable' as const,
        }));
      }

      return result.rows.map(row => {
        const currentSkus = parseInt(row.current_skus) || 0;
        const previousSkus = parseInt(row.previous_skus) || currentSkus;
        const currentBrands = parseInt(row.current_brands) || 0;
        const previousBrands = parseInt(row.previous_brands) || currentBrands;
        const currentPrice = row.current_price ? parseFloat(row.current_price) : null;
        const previousPrice = row.previous_price ? parseFloat(row.previous_price) : null;

        const skuGrowth = previousSkus > 0
          ? ((currentSkus - previousSkus) / previousSkus) * 100
          : 0;
        const brandGrowth = previousBrands > 0
          ? ((currentBrands - previousBrands) / previousBrands) * 100
          : 0;
        const priceChange = previousPrice && currentPrice
          ? ((currentPrice - previousPrice) / previousPrice) * 100
          : null;

        let trend: 'growing' | 'declining' | 'stable' = 'stable';
        if (skuGrowth > 5) trend = 'growing';
        else if (skuGrowth < -5) trend = 'declining';

        return {
          category: row.category,
          currentSkuCount: currentSkus,
          previousSkuCount: previousSkus,
          skuGrowthPercent: Math.round(skuGrowth * 10) / 10,
          currentBrandCount: currentBrands,
          previousBrandCount: previousBrands,
          brandGrowthPercent: Math.round(brandGrowth * 10) / 10,
          currentAvgPrice: currentPrice ? Math.round(currentPrice * 100) / 100 : null,
          previousAvgPrice: previousPrice ? Math.round(previousPrice * 100) / 100 : null,
          priceChangePercent: priceChange !== null ? Math.round(priceChange * 10) / 10 : null,
          newProducts: Math.max(0, currentSkus - previousSkus),
          discontinuedProducts: Math.max(0, previousSkus - currentSkus),
          trend,
        };
      });
    }, 15)).data;
  }

  /**
   * Get category growth trend over time
   */
  async getCategoryGrowthTrend(
    category: string,
    days: number = 90
  ): Promise<CategoryGrowthTrend> {
    const key = cacheKey('category_growth_trend', { category, days });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          snapshot_date as date,
          total_skus as sku_count,
          brand_count,
          avg_price,
          store_count
        FROM category_snapshots
        WHERE category = $1
          AND snapshot_date >= CURRENT_DATE - ($2 || ' days')::INTERVAL
        ORDER BY snapshot_date
      `, [category, days]);

      const dataPoints = result.rows.map(row => ({
        date: row.date.toISOString().split('T')[0],
        skuCount: parseInt(row.sku_count) || 0,
        brandCount: parseInt(row.brand_count) || 0,
        avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
        storeCount: parseInt(row.store_count) || 0,
      }));

      // Calculate growth rates
      const calculateGrowth = (daysBack: number): number | null => {
        if (dataPoints.length < 2) return null;
        const targetDate = new Date();
        targetDate.setDate(targetDate.getDate() - daysBack);
        const targetDateStr = targetDate.toISOString().split('T')[0];

        const recent = dataPoints[dataPoints.length - 1];
        // Baseline: the first snapshot on or after the target date, so each
        // window compares against the point closest to daysBack ago
        const older = dataPoints.find(d => d.date >= targetDateStr) || dataPoints[0];

        if (older.skuCount === 0) return null;
        return Math.round(((recent.skuCount - older.skuCount) / older.skuCount) * 1000) / 10;
      };

      return {
        category,
        dataPoints,
        growth7d: calculateGrowth(7),
        growth30d: calculateGrowth(30),
        growth90d: calculateGrowth(90),
      };
    }, 15)).data;
  }

  /**
   * Get category heatmap data
   */
  async getCategoryHeatmap(
    metric: 'skus' | 'growth' | 'price' = 'skus',
    periods: number = 12 // weeks
  ): Promise<CategoryHeatmapData> {
    const key = cacheKey('category_heatmap', { metric, periods });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          category,
          snapshot_date,
          total_skus,
          avg_price
        FROM category_snapshots
        WHERE snapshot_date >= CURRENT_DATE - ($1 * 7 || ' days')::INTERVAL
        ORDER BY category, snapshot_date
      `, [periods]);

      // Get unique categories and generate weekly periods
      const categoriesSet = new Set<string>();
      const periodsSet = new Set<string>();

      result.rows.forEach(row => {
        categoriesSet.add(row.category);
        // Group by week
        const date = new Date(row.snapshot_date);
        const weekStart = new Date(date);
        weekStart.setDate(date.getDate() - date.getDay());
        periodsSet.add(weekStart.toISOString().split('T')[0]);
      });

      const categories = Array.from(categoriesSet).sort();
      const periodsList = Array.from(periodsSet).sort();

      // Aggregate data by category and week
      const dataMap = new Map<string, Map<string, { skus: number; price: number | null }>>();

      result.rows.forEach(row => {
        const date = new Date(row.snapshot_date);
        const weekStart = new Date(date);
        weekStart.setDate(date.getDate() - date.getDay());
        const period = weekStart.toISOString().split('T')[0];

        if (!dataMap.has(row.category)) {
          dataMap.set(row.category, new Map());
        }
        const categoryData = dataMap.get(row.category)!;

        if (!categoryData.has(period)) {
          categoryData.set(period, { skus: 0, price: null });
        }
        const existing = categoryData.get(period)!;
        existing.skus = Math.max(existing.skus, parseInt(row.total_skus) || 0);
        if (row.avg_price) {
          existing.price = parseFloat(row.avg_price);
        }
      });

      // Build heatmap data
      const data: CategoryHeatmapData['data'] = [];

      categories.forEach(category => {
        let previousValue: number | null = null;

        periodsList.forEach(period => {
          const categoryData = dataMap.get(category)?.get(period);
          let value = 0;

          if (categoryData) {
            switch (metric) {
              case 'skus':
                value = categoryData.skus;
                break;
              case 'price':
                value = categoryData.price || 0;
                break;
              case 'growth':
                value = previousValue !== null && previousValue > 0
                  ? ((categoryData.skus - previousValue) / previousValue) * 100
                  : 0;
                break;
            }
          }

          const changeFromPrevious = previousValue !== null && previousValue > 0
            ? ((value - previousValue) / previousValue) * 100
            : null;

          data.push({
            category,
            period,
            value: Math.round(value * 100) / 100,
            changeFromPrevious: changeFromPrevious !== null
              ? Math.round(changeFromPrevious * 10) / 10
              : null,
          });

          if (metric !== 'growth') {
            previousValue = value;
          } else if (categoryData) {
            previousValue = categoryData.skus;
          }
        });
      });

      return {
        categories,
        periods: periodsList,
        data,
      };
    }, 30)).data;
  }

  /**
   * Get top growing/declining categories
   */
  async getTopMovers(
    limit: number = 5,
    days: number = 30
  ): Promise<{
    growing: CategoryGrowth[];
    declining: CategoryGrowth[];
  }> {
    const key = cacheKey('top_movers', { limit, days });

    return (await this.cache.getOrCompute(key, async () => {
      const allGrowth = await this.getCategoryGrowth(days);

      const sorted = [...allGrowth].sort((a, b) => b.skuGrowthPercent - a.skuGrowthPercent);

      return {
        growing: sorted.filter(c => c.skuGrowthPercent > 0).slice(0, limit),
        declining: sorted.filter(c => c.skuGrowthPercent < 0).slice(-limit).reverse(),
      };
    }, 15)).data;
  }

  /**
   * Get category subcategory breakdown
   */
  async getSubcategoryBreakdown(category: string): Promise<Array<{
    subcategory: string;
    skuCount: number;
    brandCount: number;
    avgPrice: number | null;
    percentOfCategory: number;
  }>> {
    const key = cacheKey('subcategory_breakdown', { category });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        WITH category_total AS (
          SELECT COUNT(*) as total FROM dutchie_products WHERE type = $1
        )
        SELECT
          COALESCE(dp.subcategory, 'Other') as subcategory,
          COUNT(*) as sku_count,
          COUNT(DISTINCT dp.brand_name) as brand_count,
          AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
          ct.total as category_total
        FROM dutchie_products dp, category_total ct
        WHERE dp.type = $1
        GROUP BY dp.subcategory, ct.total
        ORDER BY sku_count DESC
      `, [category]);

      return result.rows.map(row => ({
        subcategory: row.subcategory,
        skuCount: parseInt(row.sku_count) || 0,
        brandCount: parseInt(row.brand_count) || 0,
        avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
        percentOfCategory: parseInt(row.category_total) > 0
          ? Math.round((parseInt(row.sku_count) / parseInt(row.category_total)) * 1000) / 10
          : 0,
      }));
    }, 15)).data;
  }
}
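// A minimal usage sketch, assuming a shared pool and cache are wired up
// elsewhere. getTopMovers delegates to getCategoryGrowth, so its results come
// from the same cached growth data for the given window.
export async function logTopMovers(pool: Pool, cache: AnalyticsCache): Promise<void> {
  const service = new CategoryAnalyticsService(pool, cache);
  const { growing, declining } = await service.getTopMovers(5, 30);
  for (const c of growing) {
    console.log(`${c.category}: +${c.skuGrowthPercent}% SKUs (${c.previousSkuCount} -> ${c.currentSkuCount})`);
  }
  for (const c of declining) {
    console.log(`${c.category}: ${c.skuGrowthPercent}% SKUs`);
  }
}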
@@ -1,57 +0,0 @@
/**
 * Analytics Module Index
 *
 * Exports all analytics services for CannaiQ dashboards.
 *
 * Phase 3: Analytics Dashboards
 */

export { AnalyticsCache, cacheKey, type CacheEntry, type CacheConfig } from './cache';

export {
  PriceTrendService,
  type PricePoint,
  type PriceTrend,
  type PriceSummary,
  type PriceCompressionResult,
  type PriceFilters,
} from './price-trends';

export {
  PenetrationService,
  type BrandPenetration,
  type PenetrationTrend,
  type ShelfShare,
  type BrandPresenceByState,
  type PenetrationFilters,
} from './penetration';

export {
  CategoryAnalyticsService,
  type CategoryGrowth,
  type CategorySummary,
  type CategoryGrowthTrend,
  type CategoryHeatmapData,
  type SeasonalityPattern,
  type CategoryFilters,
} from './category-analytics';

export {
  StoreChangeService,
  type StoreChangeSummary,
  type StoreChangeEvent,
  type BrandChange,
  type ProductChange,
  type CategoryLeaderboard,
  type StoreFilters,
} from './store-changes';

export {
  BrandOpportunityService,
  type BrandOpportunity,
  type PricePosition,
  type MissingSkuOpportunity,
  type StoreShelfShareChange,
  type CompetitorAlert,
  type MarketPositionSummary,
} from './brand-opportunity';
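// A minimal consumer-side wiring sketch, assuming this index is imported as
// './analytics' (hypothetical path). One Pool and one AnalyticsCache are
// shared so every service reads and writes the same cache:
//
//   import { Pool } from 'pg';
//   import {
//     AnalyticsCache,
//     PriceTrendService,
//     PenetrationService,
//     CategoryAnalyticsService,
//   } from './analytics';
//
//   const pool = new Pool({ connectionString: process.env.DATABASE_URL });
//   const cache = new AnalyticsCache(pool);
//   const priceTrends = new PriceTrendService(pool, cache);
//   const penetration = new PenetrationService(pool, cache);
//   const categoryAnalytics = new CategoryAnalyticsService(pool, cache);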
@@ -1,556 +0,0 @@
/**
 * Brand Penetration Analytics Service
 *
 * Provides analytics for brand market penetration including:
 * - Stores carrying brand
 * - SKU counts per brand
 * - Percentage of stores carrying
 * - Shelf share calculations
 * - Penetration trends and momentum
 *
 * Phase 3: Analytics Dashboards
 */

import { Pool } from 'pg';
import { AnalyticsCache, cacheKey } from './cache';

export interface BrandPenetration {
  brandName: string;
  brandId: string | null;
  totalStores: number;
  storesCarrying: number;
  penetrationPercent: number;
  totalSkus: number;
  avgSkusPerStore: number;
  shelfSharePercent: number;
  categories: string[];
  avgPrice: number | null;
  inStockSkus: number;
}

export interface PenetrationTrend {
  brandName: string;
  dataPoints: Array<{
    date: string;
    storeCount: number;
    skuCount: number;
    penetrationPercent: number;
  }>;
  momentumScore: number; // -100 to +100
  riskScore: number; // 0 to 100, higher = more risk
  trend: 'growing' | 'declining' | 'stable';
}

export interface ShelfShare {
  brandName: string;
  category: string;
  skuCount: number;
  categoryTotalSkus: number;
  shelfSharePercent: number;
  rank: number;
}

export interface BrandPresenceByState {
  state: string;
  storeCount: number;
  skuCount: number;
  avgPrice: number | null;
}

export interface PenetrationFilters {
  state?: string;
  category?: string;
  minStores?: number;
  minSkus?: number;
}

export class PenetrationService {
  private pool: Pool;
  private cache: AnalyticsCache;

  constructor(pool: Pool, cache: AnalyticsCache) {
    this.pool = pool;
    this.cache = cache;
  }

  /**
   * Get penetration data for a specific brand
   */
  async getBrandPenetration(
    brandName: string,
    filters: PenetrationFilters = {}
  ): Promise<BrandPenetration> {
    const { state, category } = filters;
    const key = cacheKey('brand_penetration', { brandName, state, category });

    return (await this.cache.getOrCompute(key, async () => {
      // Build where clauses
      const conditions: string[] = [];
      const params: (string | number)[] = [brandName];
      let paramIndex = 2;

      if (state) {
        conditions.push(`d.state = $${paramIndex++}`);
        params.push(state);
      }
      if (category) {
        conditions.push(`dp.type = $${paramIndex++}`);
        params.push(category);
      }

      const stateCondition = state ? `AND d.state = $${params.indexOf(state) + 1}` : '';
      const categoryCondition = category ? `AND dp.type = $${params.indexOf(category) + 1}` : '';

      const result = await this.pool.query(`
        WITH total_stores AS (
          SELECT COUNT(DISTINCT id) as total
          FROM dispensaries
          WHERE 1=1 ${state ? `AND state = $2` : ''}
        ),
        brand_data AS (
          SELECT
            dp.brand_name,
            dp.brand_id,
            COUNT(DISTINCT dp.dispensary_id) as stores_carrying,
            COUNT(*) as total_skus,
            AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
            SUM(CASE WHEN dp.stock_status = 'in_stock' THEN 1 ELSE 0 END) as in_stock,
            ARRAY_AGG(DISTINCT dp.type) FILTER (WHERE dp.type IS NOT NULL) as categories
          FROM dutchie_products dp
          JOIN dispensaries d ON dp.dispensary_id = d.id
          WHERE dp.brand_name = $1
            ${stateCondition}
            ${categoryCondition}
          GROUP BY dp.brand_name, dp.brand_id
        ),
        total_skus AS (
          SELECT COUNT(*) as total
          FROM dutchie_products dp
          JOIN dispensaries d ON dp.dispensary_id = d.id
          WHERE 1=1 ${stateCondition} ${categoryCondition}
        )
        SELECT
          bd.brand_name,
          bd.brand_id,
          ts.total as total_stores,
          bd.stores_carrying,
          bd.total_skus,
          bd.avg_price,
          bd.in_stock,
          bd.categories,
          tsk.total as market_total_skus
        FROM brand_data bd, total_stores ts, total_skus tsk
      `, params);

      if (result.rows.length === 0) {
        return {
          brandName,
          brandId: null,
          totalStores: 0,
          storesCarrying: 0,
          penetrationPercent: 0,
          totalSkus: 0,
          avgSkusPerStore: 0,
          shelfSharePercent: 0,
          categories: [],
          avgPrice: null,
          inStockSkus: 0,
        };
      }

      const row = result.rows[0];
      const totalStores = parseInt(row.total_stores) || 1;
      const storesCarrying = parseInt(row.stores_carrying) || 0;
      const totalSkus = parseInt(row.total_skus) || 0;
      const marketTotalSkus = parseInt(row.market_total_skus) || 1;

      return {
        brandName: row.brand_name,
        brandId: row.brand_id,
        totalStores,
        storesCarrying,
        penetrationPercent: Math.round((storesCarrying / totalStores) * 1000) / 10,
        totalSkus,
        avgSkusPerStore: storesCarrying > 0
          ? Math.round((totalSkus / storesCarrying) * 10) / 10
          : 0,
        shelfSharePercent: Math.round((totalSkus / marketTotalSkus) * 1000) / 10,
        categories: row.categories || [],
        avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
        inStockSkus: parseInt(row.in_stock) || 0,
      };
    }, 15)).data;
  }

  /**
   * Get top brands by penetration
   */
  async getTopBrandsByPenetration(
    limit: number = 20,
    filters: PenetrationFilters = {}
  ): Promise<BrandPenetration[]> {
    const { state, category, minStores = 2, minSkus = 5 } = filters;
    const key = cacheKey('top_brands_penetration', { limit, state, category, minStores, minSkus });

    return (await this.cache.getOrCompute(key, async () => {
      const params: (string | number)[] = [limit, minStores, minSkus];
      let paramIndex = 4;

      let stateCondition = '';
      let categoryCondition = '';

      if (state) {
        stateCondition = `AND d.state = $${paramIndex++}`;
        params.push(state);
      }
      if (category) {
        categoryCondition = `AND dp.type = $${paramIndex++}`;
        params.push(category);
      }

      const result = await this.pool.query(`
        WITH total_stores AS (
          SELECT COUNT(DISTINCT id) as total
          FROM dispensaries
          WHERE 1=1 ${state ? `AND state = $${params.indexOf(state) + 1}` : ''}
        ),
        total_skus AS (
          SELECT COUNT(*) as total
          FROM dutchie_products dp
          JOIN dispensaries d ON dp.dispensary_id = d.id
          WHERE 1=1 ${stateCondition} ${categoryCondition}
        ),
        brand_data AS (
          SELECT
            dp.brand_name,
            dp.brand_id,
            COUNT(DISTINCT dp.dispensary_id) as stores_carrying,
            COUNT(*) as total_skus,
            AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
            SUM(CASE WHEN dp.stock_status = 'in_stock' THEN 1 ELSE 0 END) as in_stock,
            ARRAY_AGG(DISTINCT dp.type) FILTER (WHERE dp.type IS NOT NULL) as categories
          FROM dutchie_products dp
          JOIN dispensaries d ON dp.dispensary_id = d.id
          WHERE dp.brand_name IS NOT NULL
            ${stateCondition}
            ${categoryCondition}
          GROUP BY dp.brand_name, dp.brand_id
          HAVING COUNT(DISTINCT dp.dispensary_id) >= $2
            AND COUNT(*) >= $3
        )
        SELECT
          bd.*,
          ts.total as total_stores,
          tsk.total as market_total_skus
        FROM brand_data bd, total_stores ts, total_skus tsk
        ORDER BY bd.stores_carrying DESC, bd.total_skus DESC
        LIMIT $1
      `, params);

      return result.rows.map(row => {
        const totalStores = parseInt(row.total_stores) || 1;
        const storesCarrying = parseInt(row.stores_carrying) || 0;
        const totalSkus = parseInt(row.total_skus) || 0;
        const marketTotalSkus = parseInt(row.market_total_skus) || 1;

        return {
          brandName: row.brand_name,
          brandId: row.brand_id,
          totalStores,
          storesCarrying,
          penetrationPercent: Math.round((storesCarrying / totalStores) * 1000) / 10,
          totalSkus,
          avgSkusPerStore: storesCarrying > 0
            ? Math.round((totalSkus / storesCarrying) * 10) / 10
            : 0,
          shelfSharePercent: Math.round((totalSkus / marketTotalSkus) * 1000) / 10,
          categories: row.categories || [],
          avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
          inStockSkus: parseInt(row.in_stock) || 0,
        };
      });
    }, 15)).data;
  }

  /**
   * Get penetration trend for a brand (requires historical snapshots)
   */
  async getPenetrationTrend(
    brandName: string,
    days: number = 30
  ): Promise<PenetrationTrend> {
    const key = cacheKey('penetration_trend', { brandName, days });

    return (await this.cache.getOrCompute(key, async () => {
      // Use brand_snapshots table for historical data
      const result = await this.pool.query(`
        SELECT
          snapshot_date as date,
          store_count,
          total_skus
        FROM brand_snapshots
        WHERE brand_name = $1
          AND snapshot_date >= CURRENT_DATE - ($2 || ' days')::INTERVAL
        ORDER BY snapshot_date
      `, [brandName, days]);

      // Get total stores for penetration calculation
      const totalResult = await this.pool.query(
        'SELECT COUNT(*) as total FROM dispensaries'
      );
      const totalStores = parseInt(totalResult.rows[0]?.total) || 1;

      const dataPoints = result.rows.map(row => ({
        date: row.date.toISOString().split('T')[0],
        storeCount: parseInt(row.store_count) || 0,
        skuCount: parseInt(row.total_skus) || 0,
        penetrationPercent: Math.round((parseInt(row.store_count) / totalStores) * 1000) / 10,
      }));

      // Calculate momentum and risk scores
      let momentumScore = 0;
      let riskScore = 0;
      let trend: 'growing' | 'declining' | 'stable' = 'stable';

      if (dataPoints.length >= 2) {
        const first = dataPoints[0];
        const last = dataPoints[dataPoints.length - 1];

        // Momentum: change in store count
        const storeChange = last.storeCount - first.storeCount;
        const storeChangePercent = first.storeCount > 0
          ? (storeChange / first.storeCount) * 100
          : 0;

        // Momentum score: -100 to +100
        momentumScore = Math.max(-100, Math.min(100, storeChangePercent * 10));

        // Risk score: higher if losing stores
        if (storeChange < 0) {
          riskScore = Math.min(100, Math.abs(storeChangePercent) * 5);
        }

        // Determine trend
        if (storeChangePercent > 5) trend = 'growing';
        else if (storeChangePercent < -5) trend = 'declining';
      }

      return {
        brandName,
        dataPoints,
        momentumScore: Math.round(momentumScore),
        riskScore: Math.round(riskScore),
        trend,
      };
    }, 15)).data;
  }

  /**
   * Get shelf share by category for a brand
   */
  async getShelfShareByCategory(brandName: string): Promise<ShelfShare[]> {
    const key = cacheKey('shelf_share_category', { brandName });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        WITH category_totals AS (
          SELECT
            type as category,
            COUNT(*) as total_skus
          FROM dutchie_products
          WHERE type IS NOT NULL
          GROUP BY type
        ),
        brand_by_category AS (
          SELECT
            type as category,
            COUNT(*) as sku_count
          FROM dutchie_products
          WHERE brand_name = $1
            AND type IS NOT NULL
          GROUP BY type
        ),
        ranked AS (
          SELECT
            ct.category,
            COALESCE(bc.sku_count, 0) as sku_count,
            ct.total_skus,
            RANK() OVER (PARTITION BY ct.category ORDER BY bc.sku_count DESC NULLS LAST) as rank
          FROM category_totals ct
          LEFT JOIN brand_by_category bc ON ct.category = bc.category
        )
        SELECT
          r.category,
          r.sku_count,
          r.total_skus as category_total_skus,
          ROUND((r.sku_count::NUMERIC / r.total_skus) * 100, 2) as shelf_share_pct,
          (SELECT COUNT(*) + 1 FROM (
            SELECT brand_name, COUNT(*) as cnt
            FROM dutchie_products
            WHERE type = r.category AND brand_name IS NOT NULL
            GROUP BY brand_name
            HAVING COUNT(*) > r.sku_count
          ) t) as rank
        FROM ranked r
        WHERE r.sku_count > 0
        ORDER BY shelf_share_pct DESC
      `, [brandName]);

      return result.rows.map(row => ({
        brandName,
        category: row.category,
        skuCount: parseInt(row.sku_count) || 0,
        categoryTotalSkus: parseInt(row.category_total_skus) || 0,
        shelfSharePercent: parseFloat(row.shelf_share_pct) || 0,
        rank: parseInt(row.rank) || 0,
      }));
    }, 15)).data;
  }

  /**
   * Get brand presence by state/region
   */
  async getBrandPresenceByState(brandName: string): Promise<BrandPresenceByState[]> {
    const key = cacheKey('brand_presence_state', { brandName });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          d.state,
          COUNT(DISTINCT dp.dispensary_id) as store_count,
          COUNT(*) as sku_count,
          AVG(extract_min_price(dp.latest_raw_payload)) as avg_price
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        WHERE dp.brand_name = $1
        GROUP BY d.state
        ORDER BY store_count DESC
      `, [brandName]);

      return result.rows.map(row => ({
        state: row.state,
        storeCount: parseInt(row.store_count) || 0,
        skuCount: parseInt(row.sku_count) || 0,
        avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
      }));
    }, 15)).data;
  }

  /**
   * Get stores carrying a brand
   */
  async getStoresCarryingBrand(brandName: string): Promise<Array<{
    storeId: number;
    storeName: string;
    city: string;
    state: string;
    skuCount: number;
    avgPrice: number | null;
    categories: string[];
  }>> {
    const key = cacheKey('stores_carrying_brand', { brandName });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          d.id as store_id,
          d.name as store_name,
          d.city,
          d.state,
          COUNT(*) as sku_count,
          AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
          ARRAY_AGG(DISTINCT dp.type) FILTER (WHERE dp.type IS NOT NULL) as categories
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        WHERE dp.brand_name = $1
        GROUP BY d.id, d.name, d.city, d.state
        ORDER BY sku_count DESC
      `, [brandName]);

      return result.rows.map(row => ({
        storeId: row.store_id,
        storeName: row.store_name,
        city: row.city,
        state: row.state,
        skuCount: parseInt(row.sku_count) || 0,
        avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
        categories: row.categories || [],
      }));
    }, 15)).data;
  }

  /**
   * Get penetration heatmap data (state-based)
   */
  async getPenetrationHeatmap(
    brandName?: string
  ): Promise<Array<{
    state: string;
    totalStores: number;
    storesWithBrand: number;
    penetrationPercent: number;
    totalSkus: number;
  }>> {
    const key = cacheKey('penetration_heatmap', { brandName });

    return (await this.cache.getOrCompute(key, async () => {
      if (brandName) {
        const result = await this.pool.query(`
          WITH state_totals AS (
            SELECT state, COUNT(*) as total_stores
            FROM dispensaries
            GROUP BY state
          ),
          brand_by_state AS (
            SELECT
              d.state,
              COUNT(DISTINCT dp.dispensary_id) as stores_with_brand,
              COUNT(*) as total_skus
            FROM dutchie_products dp
            JOIN dispensaries d ON dp.dispensary_id = d.id
            WHERE dp.brand_name = $1
            GROUP BY d.state
          )
          SELECT
            st.state,
            st.total_stores,
            COALESCE(bs.stores_with_brand, 0) as stores_with_brand,
            ROUND(COALESCE(bs.stores_with_brand, 0)::NUMERIC / st.total_stores * 100, 1) as penetration_pct,
            COALESCE(bs.total_skus, 0) as total_skus
          FROM state_totals st
          LEFT JOIN brand_by_state bs ON st.state = bs.state
          ORDER BY penetration_pct DESC
        `, [brandName]);

        return result.rows.map(row => ({
          state: row.state,
          totalStores: parseInt(row.total_stores) || 0,
          storesWithBrand: parseInt(row.stores_with_brand) || 0,
          penetrationPercent: parseFloat(row.penetration_pct) || 0,
          totalSkus: parseInt(row.total_skus) || 0,
        }));
      } else {
        // Overall market data by state
        const result = await this.pool.query(`
          SELECT
            d.state,
            COUNT(DISTINCT d.id) as total_stores,
            COUNT(DISTINCT dp.brand_name) as brand_count,
            COUNT(*) as total_skus
          FROM dispensaries d
          LEFT JOIN dutchie_products dp ON d.id = dp.dispensary_id
          GROUP BY d.state
          ORDER BY total_stores DESC
        `);

        return result.rows.map(row => ({
          state: row.state,
          totalStores: parseInt(row.total_stores) || 0,
          storesWithBrand: parseInt(row.brand_count) || 0, // Using brand count here
          penetrationPercent: 100, // Full penetration for overall view
          totalSkus: parseInt(row.total_skus) || 0,
        }));
      }
    }, 30)).data;
  }
}
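// A minimal usage sketch, assuming 'Acme' is a hypothetical brand. The
// momentum math in getPenetrationTrend: a brand going from 40 to 46 stores in
// the window is +15%, scaled by 10 and clamped to [-100, 100], so its
// momentumScore would be 100; a shrinking brand accrues riskScore instead.
export async function brandHealthCheck(pool: Pool, cache: AnalyticsCache): Promise<void> {
  const service = new PenetrationService(pool, cache);
  const pen = await service.getBrandPenetration('Acme', { state: 'AZ' });
  console.log(`${pen.brandName}: ${pen.storesCarrying}/${pen.totalStores} stores (${pen.penetrationPercent}%)`);
  const trend = await service.getPenetrationTrend('Acme', 30);
  console.log(`momentum=${trend.momentumScore} risk=${trend.riskScore} trend=${trend.trend}`);
}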
@@ -1,534 +0,0 @@
/**
 * Price Trend Analytics Service
 *
 * Provides time-series price analytics including:
 * - Price over time for products
 * - Average MSRP/Wholesale by period
 * - Price volatility scoring
 * - Price compression detection
 *
 * Phase 3: Analytics Dashboards
 */

import { Pool } from 'pg';
import { AnalyticsCache, cacheKey } from './cache';

export interface PricePoint {
  date: string;
  minPrice: number | null;
  maxPrice: number | null;
  avgPrice: number | null;
  wholesalePrice: number | null;
  sampleSize: number;
}

export interface PriceTrend {
  productId?: number;
  storeId?: number;
  brandName?: string;
  category?: string;
  dataPoints: PricePoint[];
  summary: {
    currentAvg: number | null;
    previousAvg: number | null;
    changePercent: number | null;
    trend: 'up' | 'down' | 'stable';
    volatilityScore: number | null;
  };
}

export interface PriceSummary {
  avg7d: number | null;
  avg30d: number | null;
  avg90d: number | null;
  wholesaleAvg7d: number | null;
  wholesaleAvg30d: number | null;
  wholesaleAvg90d: number | null;
  minPrice: number | null;
  maxPrice: number | null;
  priceRange: number | null;
  volatilityScore: number | null;
}

export interface PriceCompressionResult {
  category: string;
  brands: Array<{
    brandName: string;
    avgPrice: number;
    priceDistance: number; // distance from category mean
  }>;
  compressionScore: number; // 0-100, higher = more compressed
  standardDeviation: number;
}

export interface PriceFilters {
  storeId?: number;
  brandName?: string;
  category?: string;
  state?: string;
  days?: number;
}

export class PriceTrendService {
  private pool: Pool;
  private cache: AnalyticsCache;

  constructor(pool: Pool, cache: AnalyticsCache) {
    this.pool = pool;
    this.cache = cache;
  }

  /**
   * Get price trend for a specific product
   */
  async getProductPriceTrend(
    productId: number,
    storeId?: number,
    days: number = 30
  ): Promise<PriceTrend> {
    const key = cacheKey('price_trend_product', { productId, storeId, days });

    return (await this.cache.getOrCompute(key, async () => {
      // Try to get from snapshots first
      const snapshotResult = await this.pool.query(`
        SELECT
          DATE(crawled_at) as date,
          MIN(rec_min_price_cents) / 100.0 as min_price,
          MAX(rec_max_price_cents) / 100.0 as max_price,
          AVG(rec_min_price_cents) / 100.0 as avg_price,
          AVG(wholesale_min_price_cents) / 100.0 as wholesale_price,
          COUNT(*) as sample_size
        FROM dutchie_product_snapshots
        WHERE dutchie_product_id = $1
          AND crawled_at >= NOW() - ($2 || ' days')::INTERVAL
          ${storeId ? 'AND dispensary_id = $3' : ''}
        GROUP BY DATE(crawled_at)
        ORDER BY date
      `, storeId ? [productId, days, storeId] : [productId, days]);

      let dataPoints: PricePoint[] = snapshotResult.rows.map(row => ({
        date: row.date.toISOString().split('T')[0],
        minPrice: parseFloat(row.min_price) || null,
        maxPrice: parseFloat(row.max_price) || null,
        avgPrice: parseFloat(row.avg_price) || null,
        wholesalePrice: parseFloat(row.wholesale_price) || null,
        sampleSize: parseInt(row.sample_size),
      }));

      // If no snapshots, get current price from product
      if (dataPoints.length === 0) {
        const productResult = await this.pool.query(`
          SELECT
            extract_min_price(latest_raw_payload) as min_price,
            extract_max_price(latest_raw_payload) as max_price,
            extract_wholesale_price(latest_raw_payload) as wholesale_price
          FROM dutchie_products
          WHERE id = $1
        `, [productId]);

        if (productResult.rows.length > 0) {
          const row = productResult.rows[0];
          dataPoints = [{
            date: new Date().toISOString().split('T')[0],
            minPrice: parseFloat(row.min_price) || null,
            maxPrice: parseFloat(row.max_price) || null,
            avgPrice: parseFloat(row.min_price) || null,
            wholesalePrice: parseFloat(row.wholesale_price) || null,
            sampleSize: 1,
          }];
        }
      }

      const summary = this.calculatePriceSummary(dataPoints);

      return {
        productId,
        storeId,
        dataPoints,
        summary,
      };
    }, 15)).data;
  }
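  // Note on the fallback above: when dutchie_product_snapshots has no rows in
  // the window, the method degrades to a single present-day point built from
  // the live product row, so charts render one point instead of an empty
  // series. A hypothetical call, assuming product id 42 exists:
  //
  //   const trend = await priceTrendService.getProductPriceTrend(42, undefined, 30);
  //   // trend.dataPoints has one entry dated today; summary is computed from it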

  /**
   * Get price trends by brand
   */
  async getBrandPriceTrend(
    brandName: string,
    filters: PriceFilters = {}
  ): Promise<PriceTrend> {
    const { storeId, category, state, days = 30 } = filters;
    const key = cacheKey('price_trend_brand', { brandName, storeId, category, state, days });

    return (await this.cache.getOrCompute(key, async () => {
      // Use current product data aggregated by date
      const result = await this.pool.query(`
        SELECT
          DATE(dp.updated_at) as date,
          MIN(extract_min_price(dp.latest_raw_payload)) as min_price,
          MAX(extract_max_price(dp.latest_raw_payload)) as max_price,
          AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
          AVG(extract_wholesale_price(dp.latest_raw_payload)) as wholesale_price,
          COUNT(*) as sample_size
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        WHERE dp.brand_name = $1
          AND dp.updated_at >= NOW() - ($2 || ' days')::INTERVAL
          ${storeId ? 'AND dp.dispensary_id = $3' : ''}
          ${category ? `AND dp.type = $${storeId ? 4 : 3}` : ''}
          ${state ? `AND d.state = $${storeId ? (category ? 5 : 4) : (category ? 4 : 3)}` : ''}
        GROUP BY DATE(dp.updated_at)
        ORDER BY date
      `, this.buildParams([brandName, days], { storeId, category, state }));

      const dataPoints: PricePoint[] = result.rows.map(row => ({
        date: row.date.toISOString().split('T')[0],
        minPrice: parseFloat(row.min_price) || null,
        maxPrice: parseFloat(row.max_price) || null,
        avgPrice: parseFloat(row.avg_price) || null,
        wholesalePrice: parseFloat(row.wholesale_price) || null,
        sampleSize: parseInt(row.sample_size),
      }));

      return {
        brandName,
        storeId,
        category,
        dataPoints,
        summary: this.calculatePriceSummary(dataPoints),
      };
    }, 15)).data;
  }

  /**
   * Get price trends by category
   */
  async getCategoryPriceTrend(
    category: string,
    filters: PriceFilters = {}
  ): Promise<PriceTrend> {
    const { storeId, brandName, state, days = 30 } = filters;
    const key = cacheKey('price_trend_category', { category, storeId, brandName, state, days });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          DATE(dp.updated_at) as date,
          MIN(extract_min_price(dp.latest_raw_payload)) as min_price,
          MAX(extract_max_price(dp.latest_raw_payload)) as max_price,
          AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
          AVG(extract_wholesale_price(dp.latest_raw_payload)) as wholesale_price,
          COUNT(*) as sample_size
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        WHERE dp.type = $1
          AND dp.updated_at >= NOW() - ($2 || ' days')::INTERVAL
          ${storeId ? 'AND dp.dispensary_id = $3' : ''}
          ${brandName ? `AND dp.brand_name = $${storeId ? 4 : 3}` : ''}
          ${state ? `AND d.state = $${storeId ? (brandName ? 5 : 4) : (brandName ? 4 : 3)}` : ''}
        GROUP BY DATE(dp.updated_at)
        ORDER BY date
      `, this.buildParams([category, days], { storeId, brandName, state }));

      const dataPoints: PricePoint[] = result.rows.map(row => ({
        date: row.date.toISOString().split('T')[0],
        minPrice: parseFloat(row.min_price) || null,
        maxPrice: parseFloat(row.max_price) || null,
        avgPrice: parseFloat(row.avg_price) || null,
        wholesalePrice: parseFloat(row.wholesale_price) || null,
        sampleSize: parseInt(row.sample_size),
      }));

      return {
        category,
        storeId,
        brandName,
        dataPoints,
        summary: this.calculatePriceSummary(dataPoints),
      };
    }, 15)).data;
  }

  /**
   * Get price summary statistics
   */
  async getPriceSummary(filters: PriceFilters = {}): Promise<PriceSummary> {
    const { storeId, brandName, category, state } = filters;
    const key = cacheKey('price_summary', filters as Record<string, unknown>);

    return (await this.cache.getOrCompute(key, async () => {
      const whereConditions: string[] = [];
      const params: (string | number)[] = [];
      let paramIndex = 1;

      if (storeId) {
        whereConditions.push(`dp.dispensary_id = $${paramIndex++}`);
        params.push(storeId);
      }
      if (brandName) {
        whereConditions.push(`dp.brand_name = $${paramIndex++}`);
        params.push(brandName);
      }
      if (category) {
        whereConditions.push(`dp.type = $${paramIndex++}`);
        params.push(category);
      }
if (state) {
|
||||
whereConditions.push(`d.state = $${paramIndex++}`);
|
||||
params.push(state);
|
||||
}
|
||||
|
||||
const whereClause = whereConditions.length > 0
|
||||
? 'WHERE ' + whereConditions.join(' AND ')
|
||||
: '';
|
||||
|
||||
const result = await this.pool.query(`
|
||||
WITH prices AS (
|
||||
SELECT
|
||||
extract_min_price(dp.latest_raw_payload) as min_price,
|
||||
extract_max_price(dp.latest_raw_payload) as max_price,
|
||||
extract_wholesale_price(dp.latest_raw_payload) as wholesale_price
|
||||
FROM dutchie_products dp
|
||||
JOIN dispensaries d ON dp.dispensary_id = d.id
|
||||
${whereClause}
|
||||
)
|
||||
SELECT
|
||||
AVG(min_price) as avg_price,
|
||||
AVG(wholesale_price) as avg_wholesale,
|
||||
MIN(min_price) as min_price,
|
||||
MAX(max_price) as max_price,
|
||||
STDDEV(min_price) as std_dev
|
||||
FROM prices
|
||||
WHERE min_price IS NOT NULL
|
||||
`, params);
|
||||
|
||||
const row = result.rows[0];
|
||||
const avgPrice = parseFloat(row.avg_price) || null;
|
||||
const stdDev = parseFloat(row.std_dev) || null;
|
||||
const volatility = avgPrice && stdDev ? (stdDev / avgPrice) * 100 : null;
|
||||
|
||||
return {
|
||||
avg7d: avgPrice, // Using current data as proxy
|
||||
avg30d: avgPrice,
|
||||
avg90d: avgPrice,
|
||||
wholesaleAvg7d: parseFloat(row.avg_wholesale) || null,
|
||||
wholesaleAvg30d: parseFloat(row.avg_wholesale) || null,
|
||||
wholesaleAvg90d: parseFloat(row.avg_wholesale) || null,
|
||||
minPrice: parseFloat(row.min_price) || null,
|
||||
maxPrice: parseFloat(row.max_price) || null,
|
||||
priceRange: row.max_price && row.min_price
|
||||
? parseFloat(row.max_price) - parseFloat(row.min_price)
|
||||
: null,
|
||||
volatilityScore: volatility ? Math.round(volatility * 10) / 10 : null,
|
||||
};
|
||||
}, 30)).data;
|
||||
}
|
||||
|
||||
/**
|
||||
* Detect price compression in a category
|
||||
*/
|
||||
async detectPriceCompression(
|
||||
category: string,
|
||||
state?: string
|
||||
): Promise<PriceCompressionResult> {
|
||||
const key = cacheKey('price_compression', { category, state });
|
||||
|
||||
return (await this.cache.getOrCompute(key, async () => {
|
||||
const result = await this.pool.query(`
|
||||
WITH brand_prices AS (
|
||||
SELECT
|
||||
dp.brand_name,
|
||||
AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
|
||||
COUNT(*) as sku_count
|
||||
FROM dutchie_products dp
|
||||
JOIN dispensaries d ON dp.dispensary_id = d.id
|
||||
WHERE dp.type = $1
|
||||
AND dp.brand_name IS NOT NULL
|
||||
${state ? 'AND d.state = $2' : ''}
|
||||
GROUP BY dp.brand_name
|
||||
HAVING COUNT(*) >= 3
|
||||
),
|
||||
stats AS (
|
||||
SELECT
|
||||
AVG(avg_price) as category_avg,
|
||||
STDDEV(avg_price) as std_dev
|
||||
FROM brand_prices
|
||||
WHERE avg_price IS NOT NULL
|
||||
)
|
||||
SELECT
|
||||
bp.brand_name,
|
||||
bp.avg_price,
|
||||
ABS(bp.avg_price - s.category_avg) as price_distance,
|
||||
s.category_avg,
|
||||
s.std_dev
|
||||
FROM brand_prices bp, stats s
|
||||
WHERE bp.avg_price IS NOT NULL
|
||||
ORDER BY bp.avg_price
|
||||
`, state ? [category, state] : [category]);
|
||||
|
||||
if (result.rows.length === 0) {
|
||||
return {
|
||||
category,
|
||||
brands: [],
|
||||
compressionScore: 0,
|
||||
standardDeviation: 0,
|
||||
};
|
||||
}
|
||||
|
||||
const categoryAvg = parseFloat(result.rows[0].category_avg) || 0;
|
||||
const stdDev = parseFloat(result.rows[0].std_dev) || 0;
|
||||
|
||||
// Compression score: lower std dev relative to mean = more compression
|
||||
// Scale to 0-100 where 100 = very compressed
|
||||
const cv = categoryAvg > 0 ? (stdDev / categoryAvg) * 100 : 0;
|
||||
const compressionScore = Math.max(0, Math.min(100, 100 - cv));
|
||||
|
||||
const brands = result.rows.map(row => ({
|
||||
brandName: row.brand_name,
|
||||
avgPrice: parseFloat(row.avg_price) || 0,
|
||||
priceDistance: parseFloat(row.price_distance) || 0,
|
||||
}));
|
||||
|
||||
return {
|
||||
category,
|
||||
brands,
|
||||
compressionScore: Math.round(compressionScore),
|
||||
standardDeviation: Math.round(stdDev * 100) / 100,
|
||||
};
|
||||
}, 30)).data;
|
||||
}
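
  // Worked example (illustrative) of the compression score above: brand averages
  // of $28, $30, and $32 give a category mean of $30 and a sample standard
  // deviation of $2, so cv = (2 / 30) * 100 ≈ 6.7 and compressionScore ≈ 93
  // (tightly compressed pricing). Widely spread brand averages push cv up and
  // the score toward 0.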

  /**
   * Get global price statistics
   */
  async getGlobalPriceStats(): Promise<{
    totalProductsWithPrice: number;
    avgPrice: number | null;
    medianPrice: number | null;
    priceByCategory: Array<{ category: string; avgPrice: number; count: number }>;
    priceByState: Array<{ state: string; avgPrice: number; count: number }>;
  }> {
    const key = 'global_price_stats';

    return (await this.cache.getOrCompute(key, async () => {
      const [countResult, categoryResult, stateResult] = await Promise.all([
        this.pool.query(`
          SELECT
            COUNT(*) FILTER (WHERE extract_min_price(latest_raw_payload) IS NOT NULL) as with_price,
            AVG(extract_min_price(latest_raw_payload)) as avg_price,
            PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY extract_min_price(latest_raw_payload)) as median
          FROM dutchie_products
        `),
        this.pool.query(`
          SELECT
            type as category,
            AVG(extract_min_price(latest_raw_payload)) as avg_price,
            COUNT(*) as count
          FROM dutchie_products
          WHERE type IS NOT NULL
            AND extract_min_price(latest_raw_payload) IS NOT NULL
          GROUP BY type
          ORDER BY avg_price DESC
        `),
        this.pool.query(`
          SELECT
            d.state,
            AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
            COUNT(*) as count
          FROM dutchie_products dp
          JOIN dispensaries d ON dp.dispensary_id = d.id
          WHERE extract_min_price(dp.latest_raw_payload) IS NOT NULL
          GROUP BY d.state
          ORDER BY avg_price DESC
        `),
      ]);

      return {
        totalProductsWithPrice: parseInt(countResult.rows[0]?.with_price || '0'),
        avgPrice: parseFloat(countResult.rows[0]?.avg_price) || null,
        medianPrice: parseFloat(countResult.rows[0]?.median) || null,
        priceByCategory: categoryResult.rows.map(r => ({
          category: r.category,
          avgPrice: parseFloat(r.avg_price) || 0,
          count: parseInt(r.count),
        })),
        priceByState: stateResult.rows.map(r => ({
          state: r.state,
          avgPrice: parseFloat(r.avg_price) || 0,
          count: parseInt(r.count),
        })),
      };
    }, 30)).data;
  }

  // ============================================================
  // HELPER METHODS
  // ============================================================

  private calculatePriceSummary(dataPoints: PricePoint[]): PriceTrend['summary'] {
    if (dataPoints.length === 0) {
      return {
        currentAvg: null,
        previousAvg: null,
        changePercent: null,
        trend: 'stable',
        volatilityScore: null,
      };
    }

    const prices = dataPoints
      .map(d => d.avgPrice)
      .filter((p): p is number => p !== null);

    if (prices.length === 0) {
      return {
        currentAvg: null,
        previousAvg: null,
        changePercent: null,
        trend: 'stable',
        volatilityScore: null,
      };
    }

    const currentAvg = prices[prices.length - 1];
    const midpoint = Math.floor(prices.length / 2);
    const previousAvg = prices.length > 1 ? prices[midpoint] : currentAvg;

    const changePercent = previousAvg > 0
      ? ((currentAvg - previousAvg) / previousAvg) * 100
      : null;

    // Calculate volatility (coefficient of variation)
    const mean = prices.reduce((a, b) => a + b, 0) / prices.length;
    const variance = prices.reduce((sum, p) => sum + Math.pow(p - mean, 2), 0) / prices.length;
    const stdDev = Math.sqrt(variance);
    const volatilityScore = mean > 0 ? (stdDev / mean) * 100 : null;

    let trend: 'up' | 'down' | 'stable' = 'stable';
    if (changePercent !== null) {
      if (changePercent > 5) trend = 'up';
      else if (changePercent < -5) trend = 'down';
    }

    return {
      currentAvg: Math.round(currentAvg * 100) / 100,
      previousAvg: Math.round(previousAvg * 100) / 100,
      changePercent: changePercent !== null ? Math.round(changePercent * 10) / 10 : null,
      trend,
      volatilityScore: volatilityScore !== null ? Math.round(volatilityScore * 10) / 10 : null,
    };
  }
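
  // Worked example (illustrative) for calculatePriceSummary: daily averages
  // [40, 38, 36, 35] give currentAvg = 35 (last point) and previousAvg = 36
  // (midpoint index 2), so changePercent = (35 - 36) / 36 * 100 ≈ -2.8.
  // That sits inside the ±5% band, so trend stays 'stable'; a move past ±5%
  // would flag 'up' or 'down'.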

  private buildParams(
    baseParams: (string | number)[],
    optionalParams: Record<string, string | number | undefined>
  ): (string | number)[] {
    const params = [...baseParams];
    for (const value of Object.values(optionalParams)) {
      if (value !== undefined) {
        params.push(value);
      }
    }
    return params;
  }
}
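
// The private buildParams helper above relies on Object.values() preserving the
// insertion order of { storeId, category, state }, which must mirror the order
// the $3/$4/$5 placeholders were emitted in the SQL. A standalone sketch of the
// same pattern (names here are illustrative, not part of the service):
function buildParamsSketch(
  base: (string | number)[],
  optional: Record<string, string | number | undefined>
): (string | number)[] {
  // Appends defined optional values in key order, matching placeholder order.
  return [
    ...base,
    ...Object.values(optional).filter((v): v is string | number => v !== undefined),
  ];
}
// buildParamsSketch(['Brand X', 30], { storeId: 17, category: undefined, state: 'AZ' })
//   -> ['Brand X', 30, 17, 'AZ']  (category skipped, so state binds to $4)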
@@ -1,587 +0,0 @@
/**
 * Store Change Tracking Service
 *
 * Tracks changes at the store level including:
 * - New/lost brands
 * - New/discontinued products
 * - Stock status transitions
 * - Price changes
 * - Category movement leaderboards
 *
 * Phase 3: Analytics Dashboards
 */

import { Pool } from 'pg';
import { AnalyticsCache, cacheKey } from './cache';

export interface StoreChangeSummary {
  storeId: number;
  storeName: string;
  city: string;
  state: string;
  brandsAdded7d: number;
  brandsAdded30d: number;
  brandsLost7d: number;
  brandsLost30d: number;
  productsAdded7d: number;
  productsAdded30d: number;
  productsDiscontinued7d: number;
  productsDiscontinued30d: number;
  priceDrops7d: number;
  priceIncreases7d: number;
  restocks7d: number;
  stockOuts7d: number;
}

export interface StoreChangeEvent {
  id: number;
  storeId: number;
  storeName: string;
  eventType: string;
  eventDate: string;
  brandName: string | null;
  productName: string | null;
  category: string | null;
  oldValue: string | null;
  newValue: string | null;
  metadata: Record<string, unknown> | null;
}

export interface BrandChange {
  brandName: string;
  changeType: 'added' | 'removed';
  date: string;
  skuCount: number;
  categories: string[];
}

export interface ProductChange {
  productId: number;
  productName: string;
  brandName: string | null;
  category: string | null;
  changeType: 'added' | 'discontinued' | 'price_drop' | 'price_increase' | 'restocked' | 'out_of_stock';
  date: string;
  oldValue?: string;
  newValue?: string;
}

export interface CategoryLeaderboard {
  category: string;
  storeId: number;
  storeName: string;
  skuCount: number;
  brandCount: number;
  avgPrice: number | null;
  changePercent7d: number;
  rank: number;
}

export interface StoreFilters {
  storeId?: number;
  state?: string;
  days?: number;
  eventType?: string;
}

export class StoreChangeService {
  private pool: Pool;
  private cache: AnalyticsCache;

  constructor(pool: Pool, cache: AnalyticsCache) {
    this.pool = pool;
    this.cache = cache;
  }

  /**
   * Get change summary for a store
   */
  async getStoreChangeSummary(
    storeId: number
  ): Promise<StoreChangeSummary | null> {
    const key = cacheKey('store_change_summary', { storeId });

    return (await this.cache.getOrCompute(key, async () => {
      // Get store info
      const storeResult = await this.pool.query(`
        SELECT id, name, city, state FROM dispensaries WHERE id = $1
      `, [storeId]);

      if (storeResult.rows.length === 0) return null;
      const store = storeResult.rows[0];

      // Get change event counts
      const eventsResult = await this.pool.query(`
        SELECT
          event_type,
          COUNT(*) FILTER (WHERE event_date >= CURRENT_DATE - INTERVAL '7 days') as count_7d,
          COUNT(*) FILTER (WHERE event_date >= CURRENT_DATE - INTERVAL '30 days') as count_30d
        FROM store_change_events
        WHERE store_id = $1
        GROUP BY event_type
      `, [storeId]);

      const counts: Record<string, { count_7d: number; count_30d: number }> = {};
      eventsResult.rows.forEach(row => {
        counts[row.event_type] = {
          count_7d: parseInt(row.count_7d) || 0,
          count_30d: parseInt(row.count_30d) || 0,
        };
      });

      return {
        storeId: store.id,
        storeName: store.name,
        city: store.city,
        state: store.state,
        brandsAdded7d: counts['brand_added']?.count_7d || 0,
        brandsAdded30d: counts['brand_added']?.count_30d || 0,
        brandsLost7d: counts['brand_removed']?.count_7d || 0,
        brandsLost30d: counts['brand_removed']?.count_30d || 0,
        productsAdded7d: counts['product_added']?.count_7d || 0,
        productsAdded30d: counts['product_added']?.count_30d || 0,
        productsDiscontinued7d: counts['product_removed']?.count_7d || 0,
        productsDiscontinued30d: counts['product_removed']?.count_30d || 0,
        priceDrops7d: counts['price_drop']?.count_7d || 0,
        priceIncreases7d: counts['price_increase']?.count_7d || 0,
        restocks7d: counts['restocked']?.count_7d || 0,
        stockOuts7d: counts['out_of_stock']?.count_7d || 0,
      };
    }, 15)).data;
  }

  /**
   * Get recent change events for a store
   */
  async getStoreChangeEvents(
    storeId: number,
    filters: { eventType?: string; days?: number; limit?: number } = {}
  ): Promise<StoreChangeEvent[]> {
    const { eventType, days = 30, limit = 100 } = filters;
    const key = cacheKey('store_change_events', { storeId, eventType, days, limit });

    return (await this.cache.getOrCompute(key, async () => {
      const params: (string | number)[] = [storeId, days, limit];
      let eventTypeCondition = '';

      if (eventType) {
        eventTypeCondition = 'AND event_type = $4';
        params.push(eventType);
      }

      const result = await this.pool.query(`
        SELECT
          sce.id,
          sce.store_id,
          d.name as store_name,
          sce.event_type,
          sce.event_date,
          sce.brand_name,
          sce.product_name,
          sce.category,
          sce.old_value,
          sce.new_value,
          sce.metadata
        FROM store_change_events sce
        JOIN dispensaries d ON sce.store_id = d.id
        WHERE sce.store_id = $1
          AND sce.event_date >= CURRENT_DATE - ($2 || ' days')::INTERVAL
          ${eventTypeCondition}
        ORDER BY sce.event_date DESC, sce.id DESC
        LIMIT $3
      `, params);

      return result.rows.map(row => ({
        id: row.id,
        storeId: row.store_id,
        storeName: row.store_name,
        eventType: row.event_type,
        eventDate: row.event_date.toISOString().split('T')[0],
        brandName: row.brand_name,
        productName: row.product_name,
        category: row.category,
        oldValue: row.old_value,
        newValue: row.new_value,
        metadata: row.metadata,
      }));
    }, 5)).data;
  }

  /**
   * Get new brands added to a store
   */
  async getNewBrands(
    storeId: number,
    days: number = 30
  ): Promise<BrandChange[]> {
    const key = cacheKey('new_brands', { storeId, days });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          brand_name,
          event_date,
          metadata
        FROM store_change_events
        WHERE store_id = $1
          AND event_type = 'brand_added'
          AND event_date >= CURRENT_DATE - ($2 || ' days')::INTERVAL
        ORDER BY event_date DESC
      `, [storeId, days]);

      return result.rows.map(row => ({
        brandName: row.brand_name,
        changeType: 'added' as const,
        date: row.event_date.toISOString().split('T')[0],
        skuCount: row.metadata?.sku_count || 0,
        categories: row.metadata?.categories || [],
      }));
    }, 15)).data;
  }

  /**
   * Get brands lost from a store
   */
  async getLostBrands(
    storeId: number,
    days: number = 30
  ): Promise<BrandChange[]> {
    const key = cacheKey('lost_brands', { storeId, days });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          brand_name,
          event_date,
          metadata
        FROM store_change_events
        WHERE store_id = $1
          AND event_type = 'brand_removed'
          AND event_date >= CURRENT_DATE - ($2 || ' days')::INTERVAL
        ORDER BY event_date DESC
      `, [storeId, days]);

      return result.rows.map(row => ({
        brandName: row.brand_name,
        changeType: 'removed' as const,
        date: row.event_date.toISOString().split('T')[0],
        skuCount: row.metadata?.sku_count || 0,
        categories: row.metadata?.categories || [],
      }));
    }, 15)).data;
  }

  /**
   * Get product changes for a store
   */
  async getProductChanges(
    storeId: number,
    changeType?: 'added' | 'discontinued' | 'price_drop' | 'price_increase' | 'restocked' | 'out_of_stock',
    days: number = 7
  ): Promise<ProductChange[]> {
    const key = cacheKey('product_changes', { storeId, changeType, days });

    return (await this.cache.getOrCompute(key, async () => {
      const eventTypeMap: Record<string, string> = {
        'added': 'product_added',
        'discontinued': 'product_removed',
        'price_drop': 'price_drop',
        'price_increase': 'price_increase',
        'restocked': 'restocked',
        'out_of_stock': 'out_of_stock',
      };

      const params: (string | number)[] = [storeId, days];
      let eventCondition = '';

      if (changeType) {
        eventCondition = 'AND event_type = $3';
        params.push(eventTypeMap[changeType]);
      }

      const result = await this.pool.query(`
        SELECT
          product_id,
          product_name,
          brand_name,
          category,
          event_type,
          event_date,
          old_value,
          new_value
        FROM store_change_events
        WHERE store_id = $1
          AND event_date >= CURRENT_DATE - ($2 || ' days')::INTERVAL
          AND product_id IS NOT NULL
          ${eventCondition}
        ORDER BY event_date DESC
        LIMIT 100
      `, params);

      const reverseMap: Record<string, ProductChange['changeType']> = {
        'product_added': 'added',
        'product_removed': 'discontinued',
        'price_drop': 'price_drop',
        'price_increase': 'price_increase',
        'restocked': 'restocked',
        'out_of_stock': 'out_of_stock',
      };

      return result.rows.map(row => ({
        productId: row.product_id,
        productName: row.product_name,
        brandName: row.brand_name,
        category: row.category,
        changeType: reverseMap[row.event_type] || 'added',
        date: row.event_date.toISOString().split('T')[0],
        oldValue: row.old_value,
        newValue: row.new_value,
      }));
    }, 5)).data;
  }

  /**
   * Get category leaderboard across stores
   */
  async getCategoryLeaderboard(
    category: string,
    limit: number = 20
  ): Promise<CategoryLeaderboard[]> {
    const key = cacheKey('category_leaderboard', { category, limit });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        WITH store_category_stats AS (
          SELECT
            dp.dispensary_id as store_id,
            d.name as store_name,
            COUNT(*) as sku_count,
            COUNT(DISTINCT dp.brand_name) as brand_count,
            AVG(extract_min_price(dp.latest_raw_payload)) as avg_price
          FROM dutchie_products dp
          JOIN dispensaries d ON dp.dispensary_id = d.id
          WHERE dp.type = $1
          GROUP BY dp.dispensary_id, d.name
        )
        SELECT
          scs.*,
          RANK() OVER (ORDER BY scs.sku_count DESC) as rank
        FROM store_category_stats scs
        ORDER BY scs.sku_count DESC
        LIMIT $2
      `, [category, limit]);

      return result.rows.map(row => ({
        category,
        storeId: row.store_id,
        storeName: row.store_name,
        skuCount: parseInt(row.sku_count) || 0,
        brandCount: parseInt(row.brand_count) || 0,
        avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
        changePercent7d: 0, // Would need historical data
        rank: parseInt(row.rank) || 0,
      }));
    }, 15)).data;
  }

  /**
   * Get stores with most activity (changes)
   */
  async getMostActiveStores(
    days: number = 7,
    limit: number = 10
  ): Promise<Array<{
    storeId: number;
    storeName: string;
    city: string;
    state: string;
    totalChanges: number;
    brandsChanged: number;
    productsChanged: number;
    priceChanges: number;
    stockChanges: number;
  }>> {
    const key = cacheKey('most_active_stores', { days, limit });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          d.id as store_id,
          d.name as store_name,
          d.city,
          d.state,
          COUNT(*) as total_changes,
          COUNT(*) FILTER (WHERE sce.event_type IN ('brand_added', 'brand_removed')) as brands_changed,
          COUNT(*) FILTER (WHERE sce.event_type IN ('product_added', 'product_removed')) as products_changed,
          COUNT(*) FILTER (WHERE sce.event_type IN ('price_drop', 'price_increase')) as price_changes,
          COUNT(*) FILTER (WHERE sce.event_type IN ('restocked', 'out_of_stock')) as stock_changes
        FROM store_change_events sce
        JOIN dispensaries d ON sce.store_id = d.id
        WHERE sce.event_date >= CURRENT_DATE - ($1 || ' days')::INTERVAL
        GROUP BY d.id, d.name, d.city, d.state
        ORDER BY total_changes DESC
        LIMIT $2
      `, [days, limit]);

      return result.rows.map(row => ({
        storeId: row.store_id,
        storeName: row.store_name,
        city: row.city,
        state: row.state,
        totalChanges: parseInt(row.total_changes) || 0,
        brandsChanged: parseInt(row.brands_changed) || 0,
        productsChanged: parseInt(row.products_changed) || 0,
        priceChanges: parseInt(row.price_changes) || 0,
        stockChanges: parseInt(row.stock_changes) || 0,
      }));
    }, 15)).data;
  }

  /**
   * Compare two stores
   */
  async compareStores(
    storeId1: number,
    storeId2: number
  ): Promise<{
    store1: { id: number; name: string; brands: string[]; categories: string[]; skuCount: number };
    store2: { id: number; name: string; brands: string[]; categories: string[]; skuCount: number };
    sharedBrands: string[];
    uniqueToStore1: string[];
    uniqueToStore2: string[];
    categoryComparison: Array<{
      category: string;
      store1Skus: number;
      store2Skus: number;
      difference: number;
    }>;
  }> {
    const key = cacheKey('compare_stores', { storeId1, storeId2 });

    return (await this.cache.getOrCompute(key, async () => {
      const [store1Data, store2Data] = await Promise.all([
        this.pool.query(`
          SELECT
            d.id, d.name,
            ARRAY_AGG(DISTINCT dp.brand_name) FILTER (WHERE dp.brand_name IS NOT NULL) as brands,
            ARRAY_AGG(DISTINCT dp.type) FILTER (WHERE dp.type IS NOT NULL) as categories,
            COUNT(*) as sku_count
          FROM dispensaries d
          LEFT JOIN dutchie_products dp ON d.id = dp.dispensary_id
          WHERE d.id = $1
          GROUP BY d.id, d.name
        `, [storeId1]),
        this.pool.query(`
          SELECT
            d.id, d.name,
            ARRAY_AGG(DISTINCT dp.brand_name) FILTER (WHERE dp.brand_name IS NOT NULL) as brands,
            ARRAY_AGG(DISTINCT dp.type) FILTER (WHERE dp.type IS NOT NULL) as categories,
            COUNT(*) as sku_count
          FROM dispensaries d
          LEFT JOIN dutchie_products dp ON d.id = dp.dispensary_id
          WHERE d.id = $1
          GROUP BY d.id, d.name
        `, [storeId2]),
      ]);

      const s1 = store1Data.rows[0];
      const s2 = store2Data.rows[0];

      const brands1Array: string[] = (s1?.brands || []).filter((b: string | null): b is string => b !== null);
      const brands2Array: string[] = (s2?.brands || []).filter((b: string | null): b is string => b !== null);
      const brands1 = new Set(brands1Array);
      const brands2 = new Set(brands2Array);

      const sharedBrands: string[] = brands1Array.filter(b => brands2.has(b));
      const uniqueToStore1: string[] = brands1Array.filter(b => !brands2.has(b));
      const uniqueToStore2: string[] = brands2Array.filter(b => !brands1.has(b));

      // Category comparison
      const categoryResult = await this.pool.query(`
        WITH store1_cats AS (
          SELECT type as category, COUNT(*) as sku_count
          FROM dutchie_products WHERE dispensary_id = $1 AND type IS NOT NULL
          GROUP BY type
        ),
        store2_cats AS (
          SELECT type as category, COUNT(*) as sku_count
          FROM dutchie_products WHERE dispensary_id = $2 AND type IS NOT NULL
          GROUP BY type
        ),
        all_cats AS (
          SELECT category FROM store1_cats
          UNION
          SELECT category FROM store2_cats
        )
        SELECT
          ac.category,
          COALESCE(s1.sku_count, 0) as store1_skus,
          COALESCE(s2.sku_count, 0) as store2_skus
        FROM all_cats ac
        LEFT JOIN store1_cats s1 ON ac.category = s1.category
        LEFT JOIN store2_cats s2 ON ac.category = s2.category
        ORDER BY (COALESCE(s1.sku_count, 0) + COALESCE(s2.sku_count, 0)) DESC
      `, [storeId1, storeId2]);

      return {
        store1: {
          id: s1?.id || storeId1,
          name: s1?.name || 'Unknown',
          brands: s1?.brands || [],
          categories: s1?.categories || [],
          skuCount: parseInt(s1?.sku_count) || 0,
        },
        store2: {
          id: s2?.id || storeId2,
          name: s2?.name || 'Unknown',
          brands: s2?.brands || [],
          categories: s2?.categories || [],
          skuCount: parseInt(s2?.sku_count) || 0,
        },
        sharedBrands,
        uniqueToStore1,
        uniqueToStore2,
        categoryComparison: categoryResult.rows.map(row => ({
          category: row.category,
          store1Skus: parseInt(row.store1_skus) || 0,
          store2Skus: parseInt(row.store2_skus) || 0,
          difference: (parseInt(row.store1_skus) || 0) - (parseInt(row.store2_skus) || 0),
        })),
      };
    }, 15)).data;
  }
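
  // Worked example (illustrative): if store 1 carries brands {A, B, C} and
  // store 2 carries {B, C, D}, the set logic above yields
  //   sharedBrands   = ['B', 'C']
  //   uniqueToStore1 = ['A']
  //   uniqueToStore2 = ['D']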

  /**
   * Record a change event (used by crawler/worker)
   */
  async recordChangeEvent(event: {
    storeId: number;
    eventType: string;
    brandName?: string;
    productId?: number;
    productName?: string;
    category?: string;
    oldValue?: string;
    newValue?: string;
    metadata?: Record<string, unknown>;
  }): Promise<void> {
    await this.pool.query(`
      INSERT INTO store_change_events
        (store_id, event_type, event_date, brand_name, product_id, product_name, category, old_value, new_value, metadata)
      VALUES ($1, $2, CURRENT_DATE, $3, $4, $5, $6, $7, $8, $9)
    `, [
      event.storeId,
      event.eventType,
      event.brandName || null,
      event.productId || null,
      event.productName || null,
      event.category || null,
      event.oldValue || null,
      event.newValue || null,
      event.metadata ? JSON.stringify(event.metadata) : null,
    ]);

    // Invalidate cache
    await this.cache.invalidatePattern(`store_change_summary:storeId=${event.storeId}`);
  }
}
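
// A minimal usage sketch (hypothetical wiring; `service` construction is assumed):
// a crawler records a price drop, then the cached event feed reflects it.
export async function demoRecordPriceDrop(service: StoreChangeService): Promise<void> {
  await service.recordChangeEvent({
    storeId: 42,
    eventType: 'price_drop',
    productId: 1001,
    productName: 'Example OG 3.5g',
    category: 'flower',
    oldValue: '45.00',
    newValue: '39.00',
  });
  const drops = await service.getStoreChangeEvents(42, { eventType: 'price_drop', days: 7 });
  console.log(`store 42: ${drops.length} price drops in the last 7 days`);
}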
@@ -1,266 +0,0 @@
/**
 * LEGACY SERVICE - AZDHS Import
 *
 * DEPRECATED: This service creates its own database pool.
 * Future implementations should use the canonical CannaiQ connection.
 *
 * Imports Arizona dispensaries from the main database's dispensaries table
 * (which was populated from AZDHS data) into the isolated Dutchie AZ database.
 *
 * This establishes the canonical list of AZ dispensaries to match against Dutchie.
 *
 * DO NOT:
 * - Run this in automated jobs
 * - Use DATABASE_URL directly
 */

import { Pool } from 'pg';
import { query as dutchieQuery } from '../db/connection';
import { Dispensary } from '../types';

// Single database connection (cannaiq in cannaiq-postgres container)
// Use CANNAIQ_DB_* env vars or defaults
const MAIN_DB_CONNECTION = process.env.CANNAIQ_DB_URL ||
  `postgresql://${process.env.CANNAIQ_DB_USER || 'dutchie'}:${process.env.CANNAIQ_DB_PASS || 'dutchie_local_pass'}@${process.env.CANNAIQ_DB_HOST || 'localhost'}:${process.env.CANNAIQ_DB_PORT || '54320'}/${process.env.CANNAIQ_DB_NAME || 'cannaiq'}`;

/**
 * AZDHS dispensary record from the main database
 */
interface AZDHSDispensary {
  id: number;
  azdhs_id: number;
  name: string;
  company_name?: string;
  address?: string;
  city: string;
  state: string;
  zip?: string;
  latitude?: number;
  longitude?: number;
  dba_name?: string;
  phone?: string;
  email?: string;
  website?: string;
  google_rating?: string;
  google_review_count?: number;
  slug: string;
  menu_provider?: string;
  product_provider?: string;
  created_at: Date;
  updated_at: Date;
}

/**
 * Import result statistics
 */
interface ImportResult {
  total: number;
  imported: number;
  skipped: number;
  errors: string[];
}

/**
 * Create a temporary connection to the main database
 */
function getMainDBPool(): Pool {
  console.warn('[AZDHS Import] LEGACY: Using separate pool. Should use canonical CannaiQ connection.');
  return new Pool({
    connectionString: MAIN_DB_CONNECTION,
    max: 5,
    idleTimeoutMillis: 30000,
    connectionTimeoutMillis: 5000,
  });
}

/**
 * Fetch all AZ dispensaries from the main database
 */
async function fetchAZDHSDispensaries(): Promise<AZDHSDispensary[]> {
  const pool = getMainDBPool();

  try {
    const result = await pool.query<AZDHSDispensary>(`
      SELECT
        id, azdhs_id, name, company_name, address, city, state, zip,
        latitude, longitude, dba_name, phone, email, website,
        google_rating, google_review_count, slug,
        menu_provider, product_provider,
        created_at, updated_at
      FROM dispensaries
      WHERE state = 'AZ'
      ORDER BY id
    `);

    return result.rows;
  } finally {
    await pool.end();
  }
}

/**
 * Import a single dispensary into the Dutchie AZ database
 */
async function importDispensary(disp: AZDHSDispensary): Promise<number> {
  const result = await dutchieQuery<{ id: number }>(
    `
    INSERT INTO dispensaries (
      platform, name, slug, city, state, postal_code, address,
      latitude, longitude, is_delivery, is_pickup, raw_metadata, updated_at
    ) VALUES (
      $1, $2, $3, $4, $5, $6, $7,
      $8, $9, $10, $11, $12, NOW()
    )
    ON CONFLICT (platform, slug, city, state) DO UPDATE SET
      name = EXCLUDED.name,
      postal_code = EXCLUDED.postal_code,
      address = EXCLUDED.address,
      latitude = EXCLUDED.latitude,
      longitude = EXCLUDED.longitude,
      raw_metadata = EXCLUDED.raw_metadata,
      updated_at = NOW()
    RETURNING id
    `,
    [
      'dutchie', // Will be updated when a Dutchie match is found
      disp.dba_name || disp.name,
      disp.slug,
      disp.city,
      disp.state,
      disp.zip,
      disp.address,
      disp.latitude,
      disp.longitude,
      false, // is_delivery - unknown
      true, // is_pickup - assume true
      JSON.stringify({
        azdhs_id: disp.azdhs_id,
        main_db_id: disp.id,
        company_name: disp.company_name,
        phone: disp.phone,
        email: disp.email,
        website: disp.website,
        google_rating: disp.google_rating,
        google_review_count: disp.google_review_count,
        menu_provider: disp.menu_provider,
        product_provider: disp.product_provider,
      }),
    ]
  );

  return result.rows[0].id;
}

/**
 * Import all AZDHS dispensaries into the Dutchie AZ database
 */
export async function importAZDHSDispensaries(): Promise<ImportResult> {
  console.log('[AZDHS Import] Starting import from main database...');

  const result: ImportResult = {
    total: 0,
    imported: 0,
    skipped: 0,
    errors: [],
  };

  try {
    const dispensaries = await fetchAZDHSDispensaries();
    result.total = dispensaries.length;

    console.log(`[AZDHS Import] Found ${dispensaries.length} AZ dispensaries in main DB`);

    for (const disp of dispensaries) {
      try {
        const id = await importDispensary(disp);
        result.imported++;
        console.log(`[AZDHS Import] Imported: ${disp.name} (${disp.city}) -> id=${id}`);
      } catch (error: any) {
        if (error.message.includes('duplicate')) {
          result.skipped++;
        } else {
          result.errors.push(`${disp.name}: ${error.message}`);
        }
      }
    }
  } catch (error: any) {
    result.errors.push(`Failed to fetch from main DB: ${error.message}`);
  }

  console.log(`[AZDHS Import] Complete: ${result.imported} imported, ${result.skipped} skipped, ${result.errors.length} errors`);
  return result;
}

/**
 * Import dispensaries from JSON file (backup export)
 */
export async function importFromJSON(jsonPath: string): Promise<ImportResult> {
  console.log(`[AZDHS Import] Importing from JSON: ${jsonPath}`);

  const result: ImportResult = {
    total: 0,
    imported: 0,
    skipped: 0,
    errors: [],
  };

  try {
    const fs = await import('fs/promises');
    const data = await fs.readFile(jsonPath, 'utf-8');
    const dispensaries: AZDHSDispensary[] = JSON.parse(data);

    result.total = dispensaries.length;
    console.log(`[AZDHS Import] Found ${dispensaries.length} dispensaries in JSON file`);

    for (const disp of dispensaries) {
      try {
        await importDispensary(disp);
        result.imported++;
      } catch (error: any) {
        if (error.message.includes('duplicate')) {
          result.skipped++;
        } else {
          result.errors.push(`${disp.name}: ${error.message}`);
        }
      }
    }
  } catch (error: any) {
    result.errors.push(`Failed to read JSON file: ${error.message}`);
  }

  console.log(`[AZDHS Import] Complete: ${result.imported} imported, ${result.skipped} skipped`);
  return result;
}

/**
 * Get import statistics
 */
export async function getImportStats(): Promise<{
  totalDispensaries: number;
  withPlatformIds: number;
  withoutPlatformIds: number;
  lastImportedAt?: Date;
}> {
  const { rows } = await dutchieQuery<{
    total: string;
    with_platform_id: string;
    without_platform_id: string;
    last_updated: Date;
  }>(`
    SELECT
      COUNT(*) as total,
      COUNT(platform_dispensary_id) as with_platform_id,
      COUNT(*) - COUNT(platform_dispensary_id) as without_platform_id,
      MAX(updated_at) as last_updated
    FROM dispensaries
    WHERE state = 'AZ'
  `);

  const stats = rows[0];
  return {
    totalDispensaries: parseInt(stats.total, 10),
    withPlatformIds: parseInt(stats.with_platform_id, 10),
    withoutPlatformIds: parseInt(stats.without_platform_id, 10),
    lastImportedAt: stats.last_updated,
  };
}
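
// A minimal usage sketch (hypothetical one-off script entry point, not part of
// the original service): run the legacy import once, then report match coverage.
export async function runAZDHSImportOnce(): Promise<void> {
  const result = await importAZDHSDispensaries();
  if (result.errors.length > 0) {
    console.error('[AZDHS Import] First errors:', result.errors.slice(0, 5));
  }
  const stats = await getImportStats();
  console.log(
    `[AZDHS Import] ${stats.withPlatformIds}/${stats.totalDispensaries} AZ dispensaries have platform IDs`
  );
}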
@@ -1,481 +0,0 @@
/**
 * Directory-Based Store Matcher
 *
 * Scrapes provider directory pages (Curaleaf, Sol, etc.) to get store lists,
 * then matches them to existing dispensaries by fuzzy name/city/address matching.
 *
 * This allows us to:
 * 1. Find specific store URLs for directory-style websites
 * 2. Match stores confidently by name+city
 * 3. Mark non-Dutchie providers as not_crawlable until we build crawlers
 */

import { query } from '../db/connection';

// ============================================================
// TYPES
// ============================================================

export interface DirectoryStore {
  name: string;
  city: string;
  state: string;
  address: string | null;
  storeUrl: string;
}

export interface MatchResult {
  directoryStore: DirectoryStore;
  dispensaryId: number | null;
  dispensaryName: string | null;
  confidence: 'high' | 'medium' | 'low' | 'none';
  matchReason: string;
}

export interface DirectoryMatchReport {
  provider: string;
  totalDirectoryStores: number;
  highConfidenceMatches: number;
  mediumConfidenceMatches: number;
  lowConfidenceMatches: number;
  unmatched: number;
  results: MatchResult[];
}

// ============================================================
// NORMALIZATION FUNCTIONS
// ============================================================

/**
 * Normalize a string for comparison:
 * - Lowercase
 * - Remove common suffixes (dispensary, cannabis, etc.)
 * - Remove punctuation
 * - Collapse whitespace
 */
function normalizeForComparison(str: string): string {
  if (!str) return '';

  return str
    .toLowerCase()
    .replace(/\s+(dispensary|cannabis|marijuana|medical|recreational|shop|store|flower|wellness)(\s|$)/gi, ' ')
    .replace(/[^\w\s]/g, ' ') // Remove punctuation
    .replace(/\s+/g, ' ') // Collapse whitespace
    .trim();
}

/**
 * Normalize city name for comparison
 */
function normalizeCity(city: string): string {
  if (!city) return '';

  return city
    .toLowerCase()
    .replace(/[^\w\s]/g, '')
    .trim();
}

/**
 * Calculate similarity between two strings (0-1)
 * Uses Levenshtein distance normalized by max length
 */
function stringSimilarity(a: string, b: string): number {
  if (!a || !b) return 0;
  if (a === b) return 1;

  const longer = a.length > b.length ? a : b;
  const shorter = a.length > b.length ? b : a;

  if (longer.length === 0) return 1;

  const distance = levenshteinDistance(longer, shorter);
  return (longer.length - distance) / longer.length;
}

/**
 * Levenshtein distance between two strings
 */
function levenshteinDistance(a: string, b: string): number {
  const matrix: number[][] = [];

  for (let i = 0; i <= b.length; i++) {
    matrix[i] = [i];
  }

  for (let j = 0; j <= a.length; j++) {
    matrix[0][j] = j;
  }

  for (let i = 1; i <= b.length; i++) {
    for (let j = 1; j <= a.length; j++) {
      if (b.charAt(i - 1) === a.charAt(j - 1)) {
        matrix[i][j] = matrix[i - 1][j - 1];
      } else {
        matrix[i][j] = Math.min(
          matrix[i - 1][j - 1] + 1, // substitution
          matrix[i][j - 1] + 1,     // insertion
          matrix[i - 1][j] + 1      // deletion
        );
      }
    }
  }

  return matrix[b.length][a.length];
}
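
// Worked example (illustrative) of the normalization + similarity pipeline:
//
//   normalizeForComparison('Curaleaf Dispensary - Phoenix')  -> 'curaleaf phoenix'
//   normalizeForComparison('Curaleaf Phoenix')               -> 'curaleaf phoenix'
//   stringSimilarity('curaleaf phoenix', 'curaleaf phoenix') -> 1
//
//   levenshteinDistance('phoenix', 'pheonix') -> 2  (a transposition costs two edits)
//   stringSimilarity('phoenix', 'pheonix')    -> (7 - 2) / 7 ≈ 0.71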

/**
 * Check if string contains another (with normalization)
 */
function containsNormalized(haystack: string, needle: string): boolean {
  return normalizeForComparison(haystack).includes(normalizeForComparison(needle));
}

// ============================================================
// PROVIDER DIRECTORY SCRAPERS
// ============================================================

// Known Sol Flower locations, used as a fallback when the directory page
// cannot be parsed (shared by the zero-match and error paths below).
const SOL_FALLBACK_LOCATIONS: DirectoryStore[] = [
  { name: 'Sol Flower 32nd & Shea', city: 'Phoenix', state: 'AZ', address: '3217 E Shea Blvd Suite 1 A', storeUrl: 'https://www.livewithsol.com/locations/deer-valley/' },
  { name: 'Sol Flower Scottsdale Airpark', city: 'Scottsdale', state: 'AZ', address: '14980 N 78th Way Ste 204', storeUrl: 'https://www.livewithsol.com/locations/scottsdale-airpark/' },
  { name: 'Sol Flower Sun City', city: 'Sun City', state: 'AZ', address: '13650 N 99th Ave', storeUrl: 'https://www.livewithsol.com/locations/sun-city/' },
  { name: 'Sol Flower Tempe McClintock', city: 'Tempe', state: 'AZ', address: '1322 N McClintock Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-mcclintock/' },
  { name: 'Sol Flower Tempe University', city: 'Tempe', state: 'AZ', address: '2424 W University Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-university/' },
  { name: 'Sol Flower Foothills Tucson', city: 'Tucson', state: 'AZ', address: '6026 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/foothills-tucson/' },
  { name: 'Sol Flower South Tucson', city: 'Tucson', state: 'AZ', address: '3000 W Valencia Rd Ste 210', storeUrl: 'https://www.livewithsol.com/locations/south-tucson/' },
  { name: 'Sol Flower North Tucson', city: 'Tucson', state: 'AZ', address: '4837 N 1st Ave', storeUrl: 'https://www.livewithsol.com/locations/north-tucson/' },
  { name: 'Sol Flower Casas Adobes', city: 'Tucson', state: 'AZ', address: '6437 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/casas-adobes/' },
];

/**
 * Sol Flower (livewithsol.com) - Static HTML, easy to scrape
 */
export async function scrapeSolDirectory(): Promise<DirectoryStore[]> {
  console.log('[DirectoryMatcher] Scraping Sol Flower directory...');

  try {
    const response = await fetch('https://www.livewithsol.com/locations/', {
      headers: {
        'User-Agent':
          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        Accept: 'text/html',
      },
    });

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }

    const html = await response.text();

    // Extract store entries from HTML
    // Sol's structure: Each location has name, address in specific divs
    const stores: DirectoryStore[] = [];

    // Pattern to find location cards
    // Format: <a href="/locations/slug/">NAME</a> with address nearby
    const locationRegex =
      /<a[^>]+href="(\/locations\/[^"]+)"[^>]*>([^<]+)<\/a>[\s\S]*?(\d+[^<]+(?:Ave|St|Blvd|Dr|Rd|Way)[^<]*)/gi;

    let match;
    while ((match = locationRegex.exec(html)) !== null) {
      const [, path, name, address] = match;

      // Extract city from common Arizona cities
      let city = 'Unknown';
      const cityPatterns = [
        { pattern: /phoenix/i, city: 'Phoenix' },
        { pattern: /scottsdale/i, city: 'Scottsdale' },
        { pattern: /tempe/i, city: 'Tempe' },
        { pattern: /tucson/i, city: 'Tucson' },
        { pattern: /mesa/i, city: 'Mesa' },
        { pattern: /sun city/i, city: 'Sun City' },
        { pattern: /glendale/i, city: 'Glendale' },
      ];

      for (const { pattern, city: cityName } of cityPatterns) {
        if (pattern.test(name) || pattern.test(address)) {
          city = cityName;
          break;
        }
      }

      stores.push({
        name: name.trim(),
        city,
        state: 'AZ',
        address: address.trim(),
        storeUrl: `https://www.livewithsol.com${path}`,
      });
    }

    // If the regex didn't match anything, fall back to the known locations
    if (stores.length === 0) {
      console.log('[DirectoryMatcher] Using hardcoded Sol locations');
      return SOL_FALLBACK_LOCATIONS;
    }

    console.log(`[DirectoryMatcher] Found ${stores.length} Sol Flower locations`);
    return stores;
  } catch (error: any) {
    console.error('[DirectoryMatcher] Error scraping Sol directory:', error.message);
    // Return hardcoded fallback
    return SOL_FALLBACK_LOCATIONS;
  }
}

/**
 * Curaleaf - Has an age-gate, so we need hardcoded AZ locations.
 * In production, this would use Playwright to bypass the age-gate.
 */
export async function scrapeCuraleafDirectory(): Promise<DirectoryStore[]> {
  console.log('[DirectoryMatcher] Using hardcoded Curaleaf AZ locations (age-gate blocks simple fetch)...');

  // Hardcoded Arizona Curaleaf locations from public knowledge
  // These would be scraped via Playwright in production
  return [
    { name: 'Curaleaf Phoenix Camelback', city: 'Phoenix', state: 'AZ', address: '4811 E Camelback Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-phoenix-camelback' },
    { name: 'Curaleaf Phoenix Midtown', city: 'Phoenix', state: 'AZ', address: '1928 E Highland Ave', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-phoenix-midtown' },
    { name: 'Curaleaf Glendale East', city: 'Glendale', state: 'AZ', address: '5150 W Glendale Ave', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-glendale-east' },
    { name: 'Curaleaf Glendale West', city: 'Glendale', state: 'AZ', address: '6501 W Glendale Ave', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-glendale-west' },
    { name: 'Curaleaf Gilbert', city: 'Gilbert', state: 'AZ', address: '1736 E Williams Field Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-gilbert' },
    { name: 'Curaleaf Mesa', city: 'Mesa', state: 'AZ', address: '1540 S Power Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-mesa' },
    { name: 'Curaleaf Tempe', city: 'Tempe', state: 'AZ', address: '1815 E Broadway Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-tempe' },
    { name: 'Curaleaf Scottsdale', city: 'Scottsdale', state: 'AZ', address: '8904 E Indian Bend Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-scottsdale' },
    { name: 'Curaleaf Tucson Prince', city: 'Tucson', state: 'AZ', address: '3955 W Prince Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-tucson-prince' },
    { name: 'Curaleaf Tucson Midvale', city: 'Tucson', state: 'AZ', address: '2936 N Midvale Park Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-tucson-midvale' },
    { name: 'Curaleaf Sedona', city: 'Sedona', state: 'AZ', address: '525 AZ-179', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-sedona' },
    { name: 'Curaleaf Youngtown', city: 'Youngtown', state: 'AZ', address: '11125 W Grand Ave', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-youngtown' },
  ];
}

// ============================================================
// MATCHING LOGIC
// ============================================================

interface Dispensary {
  id: number;
  name: string;
  city: string | null;
  state: string | null;
  address: string | null;
  menu_type: string | null;
  menu_url: string | null;
  website: string | null;
}

/**
 * Match a directory store to an existing dispensary
 */
function matchStoreToDispensary(store: DirectoryStore, dispensaries: Dispensary[]): MatchResult {
  const normalizedStoreName = normalizeForComparison(store.name);
  const normalizedStoreCity = normalizeCity(store.city);

  let bestMatch: Dispensary | null = null;
  let bestScore = 0;
  let matchReason = '';

  for (const disp of dispensaries) {
    const normalizedDispName = normalizeForComparison(disp.name);
    const normalizedDispCity = normalizeCity(disp.city || '');

    let score = 0;
    const reasons: string[] = [];

    // 1. Name similarity (max 50 points)
    const nameSimilarity = stringSimilarity(normalizedStoreName, normalizedDispName);
    score += nameSimilarity * 50;
    if (nameSimilarity > 0.8) reasons.push(`name_match(${(nameSimilarity * 100).toFixed(0)}%)`);

    // 2. City match (25 points for exact, 15 for partial)
    if (normalizedStoreCity && normalizedDispCity) {
      if (normalizedStoreCity === normalizedDispCity) {
        score += 25;
        reasons.push('city_exact');
      } else if (
        normalizedStoreCity.includes(normalizedDispCity) ||
        normalizedDispCity.includes(normalizedStoreCity)
      ) {
        score += 15;
        reasons.push('city_partial');
      }
    }

    // 3. Address contains street name (15 points)
    if (store.address && disp.address) {
      const storeStreet = store.address.toLowerCase().split(/\s+/).slice(1, 4).join(' ');
      const dispStreet = disp.address.toLowerCase().split(/\s+/).slice(1, 4).join(' ');
      if (storeStreet && dispStreet && stringSimilarity(storeStreet, dispStreet) > 0.7) {
        score += 15;
        reasons.push('address_match');
      }
    }

    // 4. Brand name in dispensary name (10 points)
    const brandName = store.name.split(' ')[0].toLowerCase(); // e.g., "Curaleaf", "Sol"
    if (disp.name.toLowerCase().includes(brandName)) {
      score += 10;
      reasons.push('brand_match');
    }

    if (score > bestScore) {
      bestScore = score;
      bestMatch = disp;
      matchReason = reasons.join(', ');
    }
  }

  // Determine confidence level
  let confidence: 'high' | 'medium' | 'low' | 'none';
  if (bestScore >= 70) {
    confidence = 'high';
  } else if (bestScore >= 50) {
    confidence = 'medium';
  } else if (bestScore >= 30) {
    confidence = 'low';
  } else {
    confidence = 'none';
  }

  return {
    directoryStore: store,
    dispensaryId: bestMatch?.id || null,
    dispensaryName: bestMatch?.name || null,
    confidence,
    matchReason: matchReason || 'no_match',
  };
}
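
// Worked example (illustrative; the dispensary row is hypothetical): directory
// store "Sol Flower Tempe University" vs a dispensary named
// "Sol Flower - Tempe University" in Tempe with the same street address:
//   name similarity = 1.0 (both normalize to 'sol tempe university') -> +50
//   city exact ('tempe')                                             -> +25
//   address street match                                             -> +15
//   brand 'sol' in dispensary name                                   -> +10
//   total = 100 -> confidence 'high' (threshold >= 70)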

// ============================================================
// MAIN FUNCTIONS
// ============================================================

/**
 * Run directory matching for a provider and update the database.
 * Only applies high-confidence matches automatically.
 */
export async function matchDirectoryToDispensaries(
  provider: 'curaleaf' | 'sol',
  dryRun: boolean = true
): Promise<DirectoryMatchReport> {
  console.log(`[DirectoryMatcher] Running ${provider} directory matching (dryRun=${dryRun})...`);

  // Get directory stores
  let directoryStores: DirectoryStore[];
  if (provider === 'curaleaf') {
    directoryStores = await scrapeCuraleafDirectory();
  } else if (provider === 'sol') {
    directoryStores = await scrapeSolDirectory();
  } else {
    throw new Error(`Unknown provider: ${provider}`);
  }

  // Get all AZ dispensaries from database
  const { rows: dispensaries } = await query<Dispensary>(
    `SELECT id, name, city, state, address, menu_type, menu_url, website
     FROM dispensaries
     WHERE state = 'AZ'`
  );

  console.log(`[DirectoryMatcher] Matching ${directoryStores.length} directory stores against ${dispensaries.length} dispensaries`);

  // Match each directory store
  const results: MatchResult[] = [];
  for (const store of directoryStores) {
    const match = matchStoreToDispensary(store, dispensaries);
    results.push(match);

    // Only apply high-confidence matches if not dry run
    if (!dryRun && match.confidence === 'high' && match.dispensaryId) {
      await applyDirectoryMatch(match.dispensaryId, provider, store);
    }
  }

  // Count results
  const report: DirectoryMatchReport = {
    provider,
    totalDirectoryStores: directoryStores.length,
    highConfidenceMatches: results.filter((r) => r.confidence === 'high').length,
    mediumConfidenceMatches: results.filter((r) => r.confidence === 'medium').length,
    lowConfidenceMatches: results.filter((r) => r.confidence === 'low').length,
    unmatched: results.filter((r) => r.confidence === 'none').length,
    results,
  };

  console.log(`[DirectoryMatcher] ${provider} matching complete:`);
  console.log(`  - High confidence: ${report.highConfidenceMatches}`);
  console.log(`  - Medium confidence: ${report.mediumConfidenceMatches}`);
  console.log(`  - Low confidence: ${report.lowConfidenceMatches}`);
  console.log(`  - Unmatched: ${report.unmatched}`);

  return report;
}

/**
 * Apply a directory match to a dispensary
 */
async function applyDirectoryMatch(
  dispensaryId: number,
  provider: string,
  store: DirectoryStore
): Promise<void> {
  console.log(`[DirectoryMatcher] Applying match: dispensary ${dispensaryId} -> ${store.storeUrl}`);

  await query(
    `
    UPDATE dispensaries SET
      menu_type = $1,
      menu_url = $2,
      platform_dispensary_id = NULL,
      provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
        jsonb_build_object(
          'detected_provider', $1::text,
          'detection_method', 'directory_match'::text,
          'detected_at', NOW(),
          'directory_store_name', $3::text,
          'directory_store_url', $2::text,
          'directory_store_city', $4::text,
          'directory_store_address', $5::text,
          'not_crawlable', true,
          'not_crawlable_reason', $6::text
        ),
      updated_at = NOW()
    WHERE id = $7
    `,
    [
      provider,
      store.storeUrl,
      store.name,
      store.city,
      store.address,
      `${provider} proprietary menu - no crawler available`,
      dispensaryId,
    ]
  );
}

/**
 * Preview matches without applying them
 */
export async function previewDirectoryMatches(
  provider: 'curaleaf' | 'sol'
): Promise<DirectoryMatchReport> {
  return matchDirectoryToDispensaries(provider, true);
}

/**
 * Apply high-confidence matches
 */
export async function applyHighConfidenceMatches(
  provider: 'curaleaf' | 'sol'
): Promise<DirectoryMatchReport> {
  return matchDirectoryToDispensaries(provider, false);
}
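
// Usage sketch (not part of the original module; assumes a caller with DB and
// network access, e.g. an ad-hoc admin script):
//
//   const preview = await previewDirectoryMatches('curaleaf'); // dry run, no writes
//   console.log(`${preview.highConfidenceMatches} of ${preview.totalDirectoryStores} would be applied`);
//   // After reviewing preview.results:
//   await applyHighConfidenceMatches('curaleaf'); // persists high-confidence matches only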
@@ -1,592 +0,0 @@
/**
 * Dutchie AZ Discovery Service
 *
 * Discovers and manages dispensaries from Dutchie for Arizona.
 */

import { query, getClient } from '../db/connection';
import { discoverArizonaDispensaries, resolveDispensaryId, resolveDispensaryIdWithDetails, ResolveDispensaryResult } from './graphql-client';
import { Dispensary } from '../types';

/**
 * Upsert a dispensary record
 */
async function upsertDispensary(dispensary: Partial<Dispensary>): Promise<number> {
  const result = await query<{ id: number }>(
    `
    INSERT INTO dispensaries (
      platform, name, slug, city, state, postal_code, address,
      latitude, longitude, platform_dispensary_id,
      is_delivery, is_pickup, raw_metadata, updated_at
    ) VALUES (
      $1, $2, $3, $4, $5, $6, $7,
      $8, $9, $10,
      $11, $12, $13, NOW()
    )
    ON CONFLICT (platform, slug, city, state) DO UPDATE SET
      name = EXCLUDED.name,
      postal_code = EXCLUDED.postal_code,
      address = EXCLUDED.address,
      latitude = EXCLUDED.latitude,
      longitude = EXCLUDED.longitude,
      platform_dispensary_id = COALESCE(EXCLUDED.platform_dispensary_id, dispensaries.platform_dispensary_id),
      is_delivery = EXCLUDED.is_delivery,
      is_pickup = EXCLUDED.is_pickup,
      raw_metadata = EXCLUDED.raw_metadata,
      updated_at = NOW()
    RETURNING id
    `,
    [
      dispensary.platform || 'dutchie',
      dispensary.name,
      dispensary.slug,
      dispensary.city,
      dispensary.state || 'AZ',
      dispensary.postalCode,
      dispensary.address,
      dispensary.latitude,
      dispensary.longitude,
      dispensary.platformDispensaryId,
      dispensary.isDelivery ?? false,
      dispensary.isPickup ?? true, // ?? instead of ||: `isPickup || true` could never be false
      dispensary.rawMetadata ? JSON.stringify(dispensary.rawMetadata) : null,
    ]
  );

  return result.rows[0].id;
}

/**
 * Normalize a raw discovery result to Dispensary
 */
function normalizeDispensary(raw: any): Partial<Dispensary> {
  return {
    platform: 'dutchie',
    name: raw.name || raw.Name || '',
    slug: raw.slug || raw.cName || raw.id || '',
    city: raw.city || raw.address?.city || '',
    state: 'AZ',
    postalCode: raw.postalCode || raw.address?.postalCode || raw.address?.zip,
    address: raw.streetAddress || raw.address?.streetAddress,
    latitude: raw.latitude || raw.location?.lat,
    longitude: raw.longitude || raw.location?.lng,
    platformDispensaryId: raw.dispensaryId || raw.id || null,
    isDelivery: raw.isDelivery || raw.delivery || false,
    isPickup: raw.isPickup ?? raw.pickup ?? true, // ?? so an explicit false is preserved
    rawMetadata: raw,
  };
}

/**
 * Import dispensaries from the existing dispensaries table (from AZDHS data)
 * This creates records in the dutchie_az database for AZ dispensaries
 */
export async function importFromExistingDispensaries(): Promise<{ imported: number }> {
  console.log('[Discovery] Importing from existing dispensaries table...');

  // This is a workaround - we'll use the dispensaries we already know about
  // and try to resolve their Dutchie IDs
  const knownDispensaries = [
    { name: 'Deeply Rooted', slug: 'AZ-Deeply-Rooted', city: 'Phoenix', state: 'AZ' },
    { name: 'Curaleaf Gilbert', slug: 'curaleaf-gilbert', city: 'Gilbert', state: 'AZ' },
    { name: 'Zen Leaf Prescott', slug: 'AZ-zen-leaf-prescott', city: 'Prescott', state: 'AZ' },
    // Add more known Dutchie stores here
  ];

  let imported = 0;

  for (const disp of knownDispensaries) {
    try {
      const id = await upsertDispensary({
        platform: 'dutchie',
        name: disp.name,
        slug: disp.slug,
        city: disp.city,
        state: disp.state,
      });
      imported++;
      console.log(`[Discovery] Imported: ${disp.name} (id=${id})`);
    } catch (error: any) {
      console.error(`[Discovery] Failed to import ${disp.name}:`, error.message);
    }
  }

  return { imported };
}

/**
 * Discover all Arizona Dutchie dispensaries via GraphQL
 */
export async function discoverDispensaries(): Promise<{ discovered: number; errors: string[] }> {
  console.log('[Discovery] Starting Arizona dispensary discovery...');
  const errors: string[] = [];
  let discovered = 0;

  try {
    const rawDispensaries = await discoverArizonaDispensaries();
    console.log(`[Discovery] Found ${rawDispensaries.length} dispensaries from GraphQL`);

    for (const raw of rawDispensaries) {
      try {
        const normalized = normalizeDispensary(raw);
        if (normalized.name && normalized.slug && normalized.city) {
          await upsertDispensary(normalized);
          discovered++;
        }
      } catch (error: any) {
        errors.push(`${raw.name || raw.slug}: ${error.message}`);
      }
    }
  } catch (error: any) {
    errors.push(`Discovery failed: ${error.message}`);
  }

  console.log(`[Discovery] Completed: ${discovered} dispensaries, ${errors.length} errors`);
  return { discovered, errors };
}

/**
 * Check if a string looks like a MongoDB ObjectId (24 hex chars)
 */
export function isObjectId(value: string): boolean {
  return /^[a-f0-9]{24}$/i.test(value);
}

/**
 * Extract cName (slug) or platform_dispensary_id from a Dutchie menu_url
 *
 * Supported formats:
 * - https://dutchie.com/embedded-menu/<cName>        -> { type: 'cName', value: '<cName>' }
 * - https://dutchie.com/dispensary/<cName>           -> { type: 'cName', value: '<cName>' }
 * - https://dutchie.com/api/v2/embedded-menu/<id>.js -> { type: 'platformId', value: '<id>' }
 *
 * For backward compatibility, extractCNameFromMenuUrl still returns just the string value.
 */
export interface MenuUrlExtraction {
  type: 'cName' | 'platformId';
  value: string;
}

export function extractFromMenuUrl(menuUrl: string | null | undefined): MenuUrlExtraction | null {
  if (!menuUrl) return null;

  try {
    const url = new URL(menuUrl);
    const pathname = url.pathname;

    // Match /api/v2/embedded-menu/<id>.js - this contains the platform_dispensary_id directly
    const apiMatch = pathname.match(/^\/api\/v2\/embedded-menu\/([a-f0-9]{24})\.js$/i);
    if (apiMatch) {
      return { type: 'platformId', value: apiMatch[1] };
    }

    // Match /embedded-menu/<cName> or /dispensary/<cName>
    const embeddedMatch = pathname.match(/^\/embedded-menu\/([^/?]+)/);
    if (embeddedMatch) {
      const value = embeddedMatch[1];
      // Check if it's actually an ObjectId (some URLs use the ID directly)
      if (isObjectId(value)) {
        return { type: 'platformId', value };
      }
      return { type: 'cName', value };
    }

    const dispensaryMatch = pathname.match(/^\/dispensary\/([^/?]+)/);
    if (dispensaryMatch) {
      const value = dispensaryMatch[1];
      if (isObjectId(value)) {
        return { type: 'platformId', value };
      }
      return { type: 'cName', value };
    }

    return null;
  } catch {
    return null;
  }
}

/**
 * Extract cName (slug) from a Dutchie menu_url
 * Backward compatible - use extractFromMenuUrl for full info
 */
export function extractCNameFromMenuUrl(menuUrl: string | null | undefined): string | null {
  const extraction = extractFromMenuUrl(menuUrl);
  return extraction?.value || null;
}
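
// Illustrative inputs/outputs for the extraction rules above (URLs are examples
// built from slugs and the sample ObjectId that appear elsewhere in this repo):
//
//   extractFromMenuUrl('https://dutchie.com/embedded-menu/AZ-Deeply-Rooted')
//     // -> { type: 'cName', value: 'AZ-Deeply-Rooted' }
//   extractFromMenuUrl('https://dutchie.com/api/v2/embedded-menu/6405ef617056e8014d79101b.js')
//     // -> { type: 'platformId', value: '6405ef617056e8014d79101b' }
//   extractCNameFromMenuUrl('https://dutchie.com/dispensary/curaleaf-gilbert')
//     // -> 'curaleaf-gilbert'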

/**
 * Resolve platform dispensary IDs for all dispensaries that don't have one
 * CRITICAL: Uses cName extracted from menu_url, NOT the slug column!
 *
 * Uses the new resolveDispensaryIdWithDetails which:
 * 1. Extracts dispensaryId from window.reactEnv in the embedded menu page (preferred)
 * 2. Falls back to GraphQL if reactEnv extraction fails
 * 3. Returns HTTP status so we can mark 403/404 stores as not_crawlable
 */
export async function resolvePlatformDispensaryIds(): Promise<{ resolved: number; failed: number; skipped: number; notCrawlable: number }> {
  console.log('[Discovery] Resolving platform dispensary IDs...');

  const { rows: dispensaries } = await query<any>(
    `
    SELECT id, name, slug, menu_url, menu_type, platform_dispensary_id, crawl_status
    FROM dispensaries
    WHERE menu_type = 'dutchie'
      AND platform_dispensary_id IS NULL
      AND menu_url IS NOT NULL
      AND (crawl_status IS NULL OR crawl_status != 'not_crawlable')
    ORDER BY id
    `
  );

  let resolved = 0;
  let failed = 0;
  let skipped = 0;
  let notCrawlable = 0;

  for (const dispensary of dispensaries) {
    try {
      // Extract cName from menu_url - this is the CORRECT way to get the Dutchie slug
      const cName = extractCNameFromMenuUrl(dispensary.menu_url);

      if (!cName) {
        console.log(`[Discovery] Skipping ${dispensary.name}: Could not extract cName from menu_url: ${dispensary.menu_url}`);
        skipped++;
        continue;
      }

      console.log(`[Discovery] Resolving ID for: ${dispensary.name} (cName=${cName}, menu_url=${dispensary.menu_url})`);

      // Use the new detailed resolver that extracts from reactEnv first
      const result = await resolveDispensaryIdWithDetails(cName);

      if (result.dispensaryId) {
        // SUCCESS: Store resolved
        await query(
          `
          UPDATE dispensaries
          SET platform_dispensary_id = $1,
              platform_dispensary_id_resolved_at = NOW(),
              crawl_status = 'ready',
              crawl_status_reason = $2,
              crawl_status_updated_at = NOW(),
              last_tested_menu_url = $3,
              last_http_status = $4,
              updated_at = NOW()
          WHERE id = $5
          `,
          [
            result.dispensaryId,
            `Resolved from ${result.source || 'page'}`,
            dispensary.menu_url,
            result.httpStatus,
            dispensary.id,
          ]
        );
        resolved++;
        console.log(`[Discovery] Resolved: ${cName} -> ${result.dispensaryId} (source: ${result.source})`);
      } else if (result.httpStatus === 403 || result.httpStatus === 404) {
        // NOT CRAWLABLE: Store removed or not accessible
        await query(
          `
          UPDATE dispensaries
          SET platform_dispensary_id = NULL,
              crawl_status = 'not_crawlable',
              crawl_status_reason = $1,
              crawl_status_updated_at = NOW(),
              last_tested_menu_url = $2,
              last_http_status = $3,
              updated_at = NOW()
          WHERE id = $4
          `,
          [
            result.error || `HTTP ${result.httpStatus}: Removed from Dutchie`,
            dispensary.menu_url,
            result.httpStatus,
            dispensary.id,
          ]
        );
        notCrawlable++;
        console.log(`[Discovery] Marked not crawlable: ${cName} (HTTP ${result.httpStatus})`);
      } else {
        // FAILED: Could not resolve but page loaded
        await query(
          `
          UPDATE dispensaries
          SET crawl_status = 'not_ready',
              crawl_status_reason = $1,
              crawl_status_updated_at = NOW(),
              last_tested_menu_url = $2,
              last_http_status = $3,
              updated_at = NOW()
          WHERE id = $4
          `,
          [
            result.error || 'Could not extract dispensaryId from page',
            dispensary.menu_url,
            result.httpStatus,
            dispensary.id,
          ]
        );
        failed++;
        console.log(`[Discovery] Could not resolve: ${cName} - ${result.error}`);
      }

      // Delay between requests
      await new Promise((r) => setTimeout(r, 2000));
    } catch (error: any) {
      failed++;
      console.error(`[Discovery] Error resolving ${dispensary.name}:`, error.message);
    }
  }

  console.log(`[Discovery] Completed: ${resolved} resolved, ${failed} failed, ${skipped} skipped, ${notCrawlable} not crawlable`);
  return { resolved, failed, skipped, notCrawlable };
}
|
||||
|
||||
// Use shared dispensary columns (handles optional columns like provider_detection_data)
|
||||
import { DISPENSARY_COLUMNS } from '../db/dispensary-columns';
|
||||
|
||||
/**
|
||||
* Get all dispensaries
|
||||
*/
|
||||
|
||||
export async function getAllDispensaries(): Promise<Dispensary[]> {
|
||||
const { rows } = await query(
|
||||
`SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE menu_type = 'dutchie' ORDER BY name`
|
||||
);
|
||||
return rows.map(mapDbRowToDispensary);
|
||||
}
|
||||
|
||||
/**
|
||||
* Map snake_case DB row to camelCase Dispensary object
|
||||
* CRITICAL: DB returns snake_case (platform_dispensary_id) but TypeScript expects camelCase (platformDispensaryId)
|
||||
* This function is exported for use in other modules that query dispensaries directly.
|
||||
*
|
||||
* NOTE: The consolidated dispensaries table column mappings:
|
||||
* - zip → postalCode
|
||||
* - menu_type → menuType (keep platform as 'dutchie')
|
||||
* - last_crawl_at → lastCrawledAt
|
||||
* - platform_dispensary_id → platformDispensaryId
|
||||
*/
|
||||
export function mapDbRowToDispensary(row: any): Dispensary {
|
||||
// Extract website from raw_metadata if available (field may not exist in all environments)
|
||||
let rawMetadata = undefined;
|
||||
if (row.raw_metadata !== undefined) {
|
||||
rawMetadata = typeof row.raw_metadata === 'string'
|
||||
? JSON.parse(row.raw_metadata)
|
||||
: row.raw_metadata;
|
||||
}
|
||||
const website = row.website || rawMetadata?.website || undefined;
|
||||
|
||||
return {
|
||||
id: row.id,
|
||||
platform: row.platform || 'dutchie', // keep platform as-is, default to 'dutchie'
|
||||
name: row.name,
|
||||
dbaName: row.dbaName || row.dba_name || undefined, // dba_name column is optional
|
||||
slug: row.slug,
|
||||
city: row.city,
|
||||
state: row.state,
|
||||
postalCode: row.postalCode || row.zip || row.postal_code,
|
||||
latitude: row.latitude ? parseFloat(row.latitude) : undefined,
|
||||
longitude: row.longitude ? parseFloat(row.longitude) : undefined,
|
||||
address: row.address,
|
||||
platformDispensaryId: row.platformDispensaryId || row.platform_dispensary_id, // CRITICAL mapping!
|
||||
isDelivery: row.is_delivery,
|
||||
isPickup: row.is_pickup,
|
||||
rawMetadata: rawMetadata,
|
||||
lastCrawledAt: row.lastCrawledAt || row.last_crawl_at, // use last_crawl_at
|
||||
productCount: row.product_count,
|
||||
createdAt: row.created_at,
|
||||
updatedAt: row.updated_at,
|
||||
menuType: row.menuType || row.menu_type,
|
||||
menuUrl: row.menuUrl || row.menu_url,
|
||||
scrapeEnabled: row.scrapeEnabled ?? row.scrape_enabled,
|
||||
providerDetectionData: row.provider_detection_data,
|
||||
platformDispensaryIdResolvedAt: row.platform_dispensary_id_resolved_at,
|
||||
website,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Get dispensary by ID
|
||||
* NOTE: Uses SQL aliases to map snake_case → camelCase directly
|
||||
*/
|
||||
export async function getDispensaryById(id: number): Promise<Dispensary | null> {
|
||||
const { rows } = await query(
|
||||
`
|
||||
SELECT
|
||||
id,
|
||||
name,
|
||||
slug,
|
||||
city,
|
||||
state,
|
||||
zip AS "postalCode",
|
||||
address,
|
||||
latitude,
|
||||
longitude,
|
||||
menu_type AS "menuType",
|
||||
menu_url AS "menuUrl",
|
||||
platform_dispensary_id AS "platformDispensaryId",
|
||||
website,
|
||||
provider_detection_data AS "providerDetectionData",
|
||||
created_at,
|
||||
updated_at
|
||||
FROM dispensaries
|
||||
WHERE id = $1
|
||||
`,
|
||||
[id]
|
||||
);
|
||||
if (!rows[0]) return null;
|
||||
return mapDbRowToDispensary(rows[0]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get dispensaries with platform IDs (ready for crawling)
|
||||
*/
|
||||
export async function getDispensariesWithPlatformIds(): Promise<Dispensary[]> {
|
||||
const { rows } = await query(
|
||||
`
|
||||
SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
|
||||
WHERE menu_type = 'dutchie' AND platform_dispensary_id IS NOT NULL
|
||||
ORDER BY name
|
||||
`
|
||||
);
|
||||
return rows.map(mapDbRowToDispensary);
|
||||
}
|
||||
|
||||
/**
|
||||
* Re-resolve a single dispensary's platform ID
|
||||
* Clears the existing ID and re-resolves from the menu_url cName
|
||||
*/
|
||||
export async function reResolveDispensaryPlatformId(dispensaryId: number): Promise<{
|
||||
success: boolean;
|
||||
platformId: string | null;
|
||||
cName: string | null;
|
||||
error?: string;
|
||||
}> {
|
||||
console.log(`[Discovery] Re-resolving platform ID for dispensary ${dispensaryId}...`);
|
||||
|
||||
const dispensary = await getDispensaryById(dispensaryId);
|
||||
if (!dispensary) {
|
||||
return { success: false, platformId: null, cName: null, error: 'Dispensary not found' };
|
||||
}
|
||||
|
||||
const cName = extractCNameFromMenuUrl(dispensary.menuUrl);
|
||||
if (!cName) {
|
||||
console.log(`[Discovery] Could not extract cName from menu_url: ${dispensary.menuUrl}`);
|
||||
return {
|
||||
success: false,
|
||||
platformId: null,
|
||||
cName: null,
|
||||
error: `Could not extract cName from menu_url: ${dispensary.menuUrl}`,
|
||||
};
|
||||
}
|
||||
|
||||
console.log(`[Discovery] Extracted cName: ${cName} from menu_url: ${dispensary.menuUrl}`);
|
||||
|
||||
try {
|
||||
const platformId = await resolveDispensaryId(cName);
|
||||
|
||||
if (platformId) {
|
||||
await query(
|
||||
`
|
||||
UPDATE dispensaries
|
||||
SET platform_dispensary_id = $1,
|
||||
platform_dispensary_id_resolved_at = NOW(),
|
||||
updated_at = NOW()
|
||||
WHERE id = $2
|
||||
`,
|
||||
[platformId, dispensaryId]
|
||||
);
|
||||
console.log(`[Discovery] Resolved: ${cName} -> ${platformId}`);
|
||||
return { success: true, platformId, cName };
|
||||
} else {
|
||||
// Clear the invalid platform ID and mark as not crawlable
|
||||
await query(
|
||||
`
|
||||
UPDATE dispensaries
|
||||
SET platform_dispensary_id = NULL,
|
||||
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
|
||||
'{"resolution_error": "cName no longer exists on Dutchie", "not_crawlable": true}'::jsonb,
|
||||
updated_at = NOW()
|
||||
WHERE id = $1
|
||||
`,
|
||||
[dispensaryId]
|
||||
);
|
||||
console.log(`[Discovery] Could not resolve: ${cName} - marked as not crawlable`);
|
||||
return {
|
||||
success: false,
|
||||
platformId: null,
|
||||
cName,
|
||||
error: `cName "${cName}" no longer exists on Dutchie`,
|
||||
};
|
||||
}
|
||||
} catch (error: any) {
|
||||
console.error(`[Discovery] Error resolving ${cName}:`, error.message);
|
||||
return { success: false, platformId: null, cName, error: error.message };
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Update menu_url for a dispensary and re-resolve platform ID
|
||||
*/
|
||||
export async function updateMenuUrlAndResolve(dispensaryId: number, newMenuUrl: string): Promise<{
|
||||
success: boolean;
|
||||
platformId: string | null;
|
||||
cName: string | null;
|
||||
error?: string;
|
||||
}> {
|
||||
console.log(`[Discovery] Updating menu_url for dispensary ${dispensaryId} to: ${newMenuUrl}`);
|
||||
|
||||
const cName = extractCNameFromMenuUrl(newMenuUrl);
|
||||
if (!cName) {
|
||||
return {
|
||||
success: false,
|
||||
platformId: null,
|
||||
cName: null,
|
||||
error: `Could not extract cName from new menu_url: ${newMenuUrl}`,
|
||||
};
|
||||
}
|
||||
|
||||
// Update the menu_url first
|
||||
await query(
|
||||
`
|
||||
UPDATE dispensaries
|
||||
SET menu_url = $1,
|
||||
menu_type = 'dutchie',
|
||||
platform_dispensary_id = NULL,
|
||||
updated_at = NOW()
|
||||
WHERE id = $2
|
||||
`,
|
||||
[newMenuUrl, dispensaryId]
|
||||
);
|
||||
|
||||
// Now resolve the platform ID with the new cName
|
||||
return await reResolveDispensaryPlatformId(dispensaryId);
|
||||
}
|
||||
|
||||
/**
|
||||
* Mark a dispensary as not crawlable (when resolution fails permanently)
|
||||
*/
|
||||
export async function markDispensaryNotCrawlable(dispensaryId: number, reason: string): Promise<void> {
|
||||
await query(
|
||||
`
|
||||
UPDATE dispensaries
|
||||
SET platform_dispensary_id = NULL,
|
||||
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
|
||||
jsonb_build_object('not_crawlable', true, 'not_crawlable_reason', $1::text, 'not_crawlable_at', NOW()::text),
|
||||
updated_at = NOW()
|
||||
WHERE id = $2
|
||||
`,
|
||||
[reason, dispensaryId]
|
||||
);
|
||||
console.log(`[Discovery] Marked dispensary ${dispensaryId} as not crawlable: ${reason}`);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the cName for a dispensary (extracted from menu_url)
|
||||
*/
|
||||
export function getDispensaryCName(dispensary: Dispensary): string | null {
|
||||
return extractCNameFromMenuUrl(dispensary.menuUrl);
|
||||
}
@@ -1,491 +0,0 @@
/**
 * Error Taxonomy Module
 *
 * Standardized error codes and classification for crawler reliability.
 * All crawl results must use these codes for consistent error handling.
 *
 * Phase 1: Crawler Reliability & Stabilization
 */

// ============================================================
// ERROR CODES
// ============================================================

/**
 * Standardized error codes for all crawl operations.
 * These codes are stored in the database for analytics and debugging.
 */
export const CrawlErrorCode = {
  // Success states
  SUCCESS: 'SUCCESS',

  // Rate limiting
  RATE_LIMITED: 'RATE_LIMITED', // 429 responses

  // Proxy issues
  BLOCKED_PROXY: 'BLOCKED_PROXY', // 407 or proxy-related blocks
  PROXY_TIMEOUT: 'PROXY_TIMEOUT', // Proxy connection timeout

  // Content issues
  HTML_CHANGED: 'HTML_CHANGED', // Page structure changed
  NO_PRODUCTS: 'NO_PRODUCTS', // Empty response (valid but no data)
  PARSE_ERROR: 'PARSE_ERROR', // Failed to parse response

  // Network issues
  TIMEOUT: 'TIMEOUT', // Request timeout
  NETWORK_ERROR: 'NETWORK_ERROR', // Connection failed
  DNS_ERROR: 'DNS_ERROR', // DNS resolution failed

  // Authentication
  AUTH_FAILED: 'AUTH_FAILED', // Authentication/session issues

  // Server errors
  SERVER_ERROR: 'SERVER_ERROR', // 5xx responses
  SERVICE_UNAVAILABLE: 'SERVICE_UNAVAILABLE', // 503

  // Configuration issues
  INVALID_CONFIG: 'INVALID_CONFIG', // Bad store configuration
  MISSING_PLATFORM_ID: 'MISSING_PLATFORM_ID', // No platform_dispensary_id

  // Unknown
  UNKNOWN_ERROR: 'UNKNOWN_ERROR', // Catch-all for unclassified errors
} as const;

export type CrawlErrorCodeType = typeof CrawlErrorCode[keyof typeof CrawlErrorCode];

// ============================================================
// ERROR CLASSIFICATION
// ============================================================

/**
 * Error metadata for each error code
 */
interface ErrorMetadata {
  code: CrawlErrorCodeType;
  retryable: boolean;
  rotateProxy: boolean;
  rotateUserAgent: boolean;
  backoffMultiplier: number;
  severity: 'low' | 'medium' | 'high' | 'critical';
  description: string;
}

/**
 * Metadata for each error code - defines retry behavior
 */
export const ERROR_METADATA: Record<CrawlErrorCodeType, ErrorMetadata> = {
  [CrawlErrorCode.SUCCESS]: {
    code: CrawlErrorCode.SUCCESS,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 0,
    severity: 'low',
    description: 'Crawl completed successfully',
  },

  [CrawlErrorCode.RATE_LIMITED]: {
    code: CrawlErrorCode.RATE_LIMITED,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: true,
    backoffMultiplier: 2.0,
    severity: 'medium',
    description: 'Rate limited by target (429)',
  },

  [CrawlErrorCode.BLOCKED_PROXY]: {
    code: CrawlErrorCode.BLOCKED_PROXY,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: true,
    backoffMultiplier: 1.5,
    severity: 'medium',
    description: 'Proxy blocked or rejected (407)',
  },

  [CrawlErrorCode.PROXY_TIMEOUT]: {
    code: CrawlErrorCode.PROXY_TIMEOUT,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'low',
    description: 'Proxy connection timed out',
  },

  [CrawlErrorCode.HTML_CHANGED]: {
    code: CrawlErrorCode.HTML_CHANGED,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'high',
    description: 'Page structure changed - needs selector update',
  },

  [CrawlErrorCode.NO_PRODUCTS]: {
    code: CrawlErrorCode.NO_PRODUCTS,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'low',
    description: 'No products returned (may be temporary)',
  },

  [CrawlErrorCode.PARSE_ERROR]: {
    code: CrawlErrorCode.PARSE_ERROR,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'medium',
    description: 'Failed to parse response data',
  },

  [CrawlErrorCode.TIMEOUT]: {
    code: CrawlErrorCode.TIMEOUT,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1.5,
    severity: 'medium',
    description: 'Request timed out',
  },

  [CrawlErrorCode.NETWORK_ERROR]: {
    code: CrawlErrorCode.NETWORK_ERROR,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'medium',
    description: 'Network connection failed',
  },

  [CrawlErrorCode.DNS_ERROR]: {
    code: CrawlErrorCode.DNS_ERROR,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'medium',
    description: 'DNS resolution failed',
  },

  [CrawlErrorCode.AUTH_FAILED]: {
    code: CrawlErrorCode.AUTH_FAILED,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: true,
    backoffMultiplier: 2.0,
    severity: 'high',
    description: 'Authentication or session failed',
  },

  [CrawlErrorCode.SERVER_ERROR]: {
    code: CrawlErrorCode.SERVER_ERROR,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.5,
    severity: 'medium',
    description: 'Server error (5xx)',
  },

  [CrawlErrorCode.SERVICE_UNAVAILABLE]: {
    code: CrawlErrorCode.SERVICE_UNAVAILABLE,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 2.0,
    severity: 'high',
    description: 'Service temporarily unavailable (503)',
  },

  [CrawlErrorCode.INVALID_CONFIG]: {
    code: CrawlErrorCode.INVALID_CONFIG,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 0,
    severity: 'critical',
    description: 'Invalid store configuration',
  },

  [CrawlErrorCode.MISSING_PLATFORM_ID]: {
    code: CrawlErrorCode.MISSING_PLATFORM_ID,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 0,
    severity: 'critical',
    description: 'Missing platform_dispensary_id',
  },

  [CrawlErrorCode.UNKNOWN_ERROR]: {
    code: CrawlErrorCode.UNKNOWN_ERROR,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'high',
    description: 'Unknown/unclassified error',
  },
};

// ============================================================
// ERROR CLASSIFICATION FUNCTIONS
// ============================================================

/**
 * Classify an error into a standardized error code.
 *
 * @param error - The error to classify (Error object, string, or HTTP status)
 * @param httpStatus - Optional HTTP status code
 * @returns Standardized error code
 */
export function classifyError(
  error: Error | string | null,
  httpStatus?: number
): CrawlErrorCodeType {
  // Check HTTP status first
  if (httpStatus) {
    if (httpStatus === 429) return CrawlErrorCode.RATE_LIMITED;
    if (httpStatus === 407) return CrawlErrorCode.BLOCKED_PROXY;
    if (httpStatus === 401 || httpStatus === 403) return CrawlErrorCode.AUTH_FAILED;
    if (httpStatus === 503) return CrawlErrorCode.SERVICE_UNAVAILABLE;
    if (httpStatus >= 500) return CrawlErrorCode.SERVER_ERROR;
  }

  if (!error) return CrawlErrorCode.UNKNOWN_ERROR;

  const message = typeof error === 'string' ? error.toLowerCase() : error.message.toLowerCase();

  // Rate limiting patterns
  if (message.includes('rate limit') || message.includes('too many requests') || message.includes('429')) {
    return CrawlErrorCode.RATE_LIMITED;
  }

  // Proxy patterns
  if (message.includes('proxy') && (message.includes('block') || message.includes('reject') || message.includes('407'))) {
    return CrawlErrorCode.BLOCKED_PROXY;
  }

  // Timeout patterns
  if (message.includes('timeout') || message.includes('timed out') || message.includes('etimedout')) {
    if (message.includes('proxy')) {
      return CrawlErrorCode.PROXY_TIMEOUT;
    }
    return CrawlErrorCode.TIMEOUT;
  }

  // Network patterns
  if (message.includes('econnrefused') || message.includes('econnreset') || message.includes('network')) {
    return CrawlErrorCode.NETWORK_ERROR;
  }

  // DNS patterns
  if (message.includes('enotfound') || message.includes('dns') || message.includes('getaddrinfo')) {
    return CrawlErrorCode.DNS_ERROR;
  }

  // Auth patterns
  if (message.includes('auth') || message.includes('unauthorized') || message.includes('forbidden') || message.includes('401') || message.includes('403')) {
    return CrawlErrorCode.AUTH_FAILED;
  }

  // HTML change patterns
  if (message.includes('selector') || message.includes('element not found') || message.includes('structure changed')) {
    return CrawlErrorCode.HTML_CHANGED;
  }

  // Parse patterns
  if (message.includes('parse') || message.includes('json') || message.includes('syntax')) {
    return CrawlErrorCode.PARSE_ERROR;
  }

  // No products patterns
  if (message.includes('no products') || message.includes('empty') || message.includes('0 products')) {
    return CrawlErrorCode.NO_PRODUCTS;
  }

  // Server error patterns
  if (message.includes('500') || message.includes('502') || message.includes('503') || message.includes('504')) {
    return CrawlErrorCode.SERVER_ERROR;
  }

  // Config patterns
  if (message.includes('config') || message.includes('invalid') || message.includes('missing')) {
    if (message.includes('platform') || message.includes('dispensary_id')) {
      return CrawlErrorCode.MISSING_PLATFORM_ID;
    }
    return CrawlErrorCode.INVALID_CONFIG;
  }

  return CrawlErrorCode.UNKNOWN_ERROR;
}
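
// Classification examples (hypothetical error messages; an explicit HTTP status
// always wins over message matching):
//
//   classifyError(new Error('connect ETIMEDOUT 1.2.3.4:443'))  // -> TIMEOUT
//   classifyError('proxy rejected request', 407)               // -> BLOCKED_PROXY
//   classifyError(new Error('Unexpected token < in JSON'))     // -> PARSE_ERROR
//   classifyError(null)                                        // -> UNKNOWN_ERROR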

/**
 * Get metadata for an error code
 */
export function getErrorMetadata(code: CrawlErrorCodeType): ErrorMetadata {
  return ERROR_METADATA[code] || ERROR_METADATA[CrawlErrorCode.UNKNOWN_ERROR];
}

/**
 * Check if an error is retryable
 */
export function isRetryable(code: CrawlErrorCodeType): boolean {
  return getErrorMetadata(code).retryable;
}

/**
 * Check if proxy should be rotated for this error
 */
export function shouldRotateProxy(code: CrawlErrorCodeType): boolean {
  return getErrorMetadata(code).rotateProxy;
}

/**
 * Check if user agent should be rotated for this error
 */
export function shouldRotateUserAgent(code: CrawlErrorCodeType): boolean {
  return getErrorMetadata(code).rotateUserAgent;
}

/**
 * Get backoff multiplier for this error
 */
export function getBackoffMultiplier(code: CrawlErrorCodeType): number {
  return getErrorMetadata(code).backoffMultiplier;
}
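
/**
 * Minimal retry sketch using the helpers above. Not part of the original
 * module: `crawlOnce`, `maxAttempts`, and `baseDelayMs` are caller-supplied
 * assumptions, and proxy/UA rotation is left as a stub.
 */
export async function crawlWithRetrySketch(
  crawlOnce: () => Promise<CrawlResult>,
  maxAttempts = 3,
  baseDelayMs = 5000
): Promise<CrawlResult> {
  let result = await crawlOnce();
  for (let attempt = 1; attempt < maxAttempts && !result.success && isRetryable(result.errorCode); attempt++) {
    if (shouldRotateProxy(result.errorCode)) {
      // rotate proxy here (stub)
    }
    if (shouldRotateUserAgent(result.errorCode)) {
      // rotate user agent here (stub)
    }
    // Scale the base delay by the per-code multiplier and the attempt number
    await new Promise((r) => setTimeout(r, baseDelayMs * getBackoffMultiplier(result.errorCode) * attempt));
    result = await crawlOnce();
  }
  return result;
}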

// ============================================================
// CRAWL RESULT TYPE
// ============================================================

/**
 * Standardized crawl result with error taxonomy
 */
export interface CrawlResult {
  success: boolean;
  dispensaryId: number;

  // Error info
  errorCode: CrawlErrorCodeType;
  errorMessage?: string;
  httpStatus?: number;

  // Timing
  startedAt: Date;
  finishedAt: Date;
  durationMs: number;

  // Context
  attemptNumber: number;
  proxyUsed?: string;
  userAgentUsed?: string;

  // Metrics (on success)
  productsFound?: number;
  productsUpserted?: number;
  snapshotsCreated?: number;
  imagesDownloaded?: number;

  // Metadata
  metadata?: Record<string, any>;
}

/**
 * Create a success result
 */
export function createSuccessResult(
  dispensaryId: number,
  startedAt: Date,
  metrics: {
    productsFound: number;
    productsUpserted: number;
    snapshotsCreated: number;
    imagesDownloaded?: number;
  },
  context?: {
    attemptNumber?: number;
    proxyUsed?: string;
    userAgentUsed?: string;
  }
): CrawlResult {
  const finishedAt = new Date();
  return {
    success: true,
    dispensaryId,
    errorCode: CrawlErrorCode.SUCCESS,
    startedAt,
    finishedAt,
    durationMs: finishedAt.getTime() - startedAt.getTime(),
    attemptNumber: context?.attemptNumber || 1,
    proxyUsed: context?.proxyUsed,
    userAgentUsed: context?.userAgentUsed,
    ...metrics,
  };
}

/**
 * Create a failure result
 */
export function createFailureResult(
  dispensaryId: number,
  startedAt: Date,
  error: Error | string,
  httpStatus?: number,
  context?: {
    attemptNumber?: number;
    proxyUsed?: string;
    userAgentUsed?: string;
  }
): CrawlResult {
  const finishedAt = new Date();
  const errorCode = classifyError(error, httpStatus);
  const errorMessage = typeof error === 'string' ? error : error.message;

  return {
    success: false,
    dispensaryId,
    errorCode,
    errorMessage,
    httpStatus,
    startedAt,
    finishedAt,
    durationMs: finishedAt.getTime() - startedAt.getTime(),
    attemptNumber: context?.attemptNumber || 1,
    proxyUsed: context?.proxyUsed,
    userAgentUsed: context?.userAgentUsed,
  };
}
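
// Construction sketch (hypothetical IDs and counts, not from the original file):
//
//   const startedAt = new Date();
//   const ok = createSuccessResult(42, startedAt,
//     { productsFound: 312, productsUpserted: 312, snapshotsCreated: 312 });
//   const bad = createFailureResult(42, startedAt, new Error('connect ETIMEDOUT'), undefined,
//     { attemptNumber: 2 });
//   // bad.errorCode === CrawlErrorCode.TIMEOUT; isRetryable(bad.errorCode) === true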

// ============================================================
// LOGGING HELPERS
// ============================================================

/**
 * Format error code for logging
 */
export function formatErrorForLog(result: CrawlResult): string {
  const metadata = getErrorMetadata(result.errorCode);
  const retryInfo = metadata.retryable ? '(retryable)' : '(non-retryable)';
  const proxyInfo = result.proxyUsed ? ` via ${result.proxyUsed}` : '';

  if (result.success) {
    return `[${result.errorCode}] Crawl successful: ${result.productsFound} products${proxyInfo}`;
  }

  return `[${result.errorCode}] ${result.errorMessage}${proxyInfo} ${retryInfo}`;
}

/**
 * Get user-friendly error description
 */
export function getErrorDescription(code: CrawlErrorCodeType): string {
  return getErrorMetadata(code).description;
}
@@ -1,712 +0,0 @@
/**
 * Dutchie GraphQL Client
 *
 * Uses Puppeteer to establish a session (get CF cookies), then makes
 * SERVER-SIDE fetch calls to api-gw.dutchie.com with those cookies.
 *
 * DUTCHIE FETCH RULES:
 * 1. Server-side only - use axios (never browser fetch with CORS)
 * 2. For dispensary lookup, use dispensaryFilter.cNameOrID; FilteredProducts
 *    takes productsFilter.dispensaryId instead (see buildFilterVariables)
 * 3. Headers must mimic Chrome: User-Agent, Origin, Referer
 * 4. If 403, extract CF cookies from Puppeteer session and include them
 * 5. Log status codes, error bodies, and product counts
 */

import axios, { AxiosError } from 'axios';
import puppeteer from 'puppeteer-extra';
import type { Browser, Page, Protocol } from 'puppeteer';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import {
  DutchieRawProduct,
  DutchiePOSChild,
  CrawlMode,
} from '../types';
import { dutchieConfig, GRAPHQL_HASHES, ARIZONA_CENTERPOINTS } from '../config/dutchie';

puppeteer.use(StealthPlugin());

// Re-export for backward compatibility
export { GRAPHQL_HASHES, ARIZONA_CENTERPOINTS };

// ============================================================
// SESSION MANAGEMENT - Get CF cookies via Puppeteer
// ============================================================

interface SessionCredentials {
  cookies: string; // Cookie header string
  userAgent: string;
  browser: Browser;
  page: Page; // Keep page reference for extracting dispensaryId
  dispensaryId?: string; // Extracted from window.reactEnv if available
  httpStatus?: number; // HTTP status code from navigation
}

/**
 * Create a session by navigating to the embedded menu page
 * and extracting CF clearance cookies for server-side requests.
 * Also extracts dispensaryId from window.reactEnv if available.
 */
async function createSession(cName: string): Promise<SessionCredentials> {
  const browser = await puppeteer.launch({
    headless: 'new',
    args: dutchieConfig.browserArgs,
  });

  const page = await browser.newPage();
  const userAgent = dutchieConfig.userAgent;

  await page.setUserAgent(userAgent);
  await page.setViewport({ width: 1920, height: 1080 });
  await page.evaluateOnNewDocument(() => {
    Object.defineProperty(navigator, 'webdriver', { get: () => false });
    (window as any).chrome = { runtime: {} };
  });

  // Navigate to the embedded menu page for this dispensary
  const embeddedMenuUrl = `https://dutchie.com/embedded-menu/${cName}`;
  console.log(`[GraphQL Client] Loading ${embeddedMenuUrl} to get CF cookies...`);

  let httpStatus: number | undefined;
  let dispensaryId: string | undefined;

  try {
    const response = await page.goto(embeddedMenuUrl, {
      waitUntil: 'networkidle2',
      timeout: dutchieConfig.navigationTimeout,
    });
    httpStatus = response?.status();
    await new Promise((r) => setTimeout(r, dutchieConfig.pageLoadDelay));

    // Try to extract dispensaryId from window.reactEnv
    try {
      dispensaryId = await page.evaluate(() => {
        return (window as any).reactEnv?.dispensaryId || null;
      });
      if (dispensaryId) {
        console.log(`[GraphQL Client] Extracted dispensaryId from reactEnv: ${dispensaryId}`);
      }
    } catch (evalError: any) {
      console.log(`[GraphQL Client] Could not extract dispensaryId from reactEnv: ${evalError.message}`);
    }
  } catch (error: any) {
    console.warn(`[GraphQL Client] Navigation warning: ${error.message}`);
    // Continue anyway - we may have gotten cookies
  }

  // Extract cookies
  const cookies = await page.cookies();
  const cookieString = cookies.map((c: Protocol.Network.Cookie) => `${c.name}=${c.value}`).join('; ');

  console.log(`[GraphQL Client] Got ${cookies.length} cookies, HTTP status: ${httpStatus}`);
  if (cookies.length > 0) {
    console.log(`[GraphQL Client] Cookie names: ${cookies.map(c => c.name).join(', ')}`);
  }

  return { cookies: cookieString, userAgent, browser, page, dispensaryId, httpStatus };
}

/**
 * Close session (browser)
 */
async function closeSession(session: SessionCredentials): Promise<void> {
  await session.browser.close();
}

// ============================================================
// SERVER-SIDE GRAPHQL FETCH USING AXIOS
// ============================================================

/**
 * Build headers that mimic a real browser request
 */
function buildHeaders(session: SessionCredentials, cName: string): Record<string, string> {
  const embeddedMenuUrl = `https://dutchie.com/embedded-menu/${cName}`;

  return {
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'en-US,en;q=0.9',
    'accept-encoding': 'gzip, deflate, br',
    'content-type': 'application/json',
    'origin': 'https://dutchie.com',
    'referer': embeddedMenuUrl,
    'user-agent': session.userAgent,
    'apollographql-client-name': 'Marketplace (production)',
    'sec-ch-ua': '"Chromium";v="120", "Google Chrome";v="120", "Not-A.Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-site',
    ...(session.cookies ? { 'cookie': session.cookies } : {}),
  };
}

/**
 * Execute GraphQL query server-side using axios
 * Uses cookies from the browser session to bypass CF
 */
async function executeGraphQL(
  session: SessionCredentials,
  operationName: string,
  variables: any,
  hash: string,
  cName: string
): Promise<any> {
  const endpoint = dutchieConfig.graphqlEndpoint;
  const headers = buildHeaders(session, cName);

  // Build request body for POST
  const body = {
    operationName,
    variables,
    extensions: {
      persistedQuery: { version: 1, sha256Hash: hash },
    },
  };

  console.log(`[GraphQL Client] POST: ${operationName} -> ${endpoint}`);
  console.log(`[GraphQL Client] Variables: ${JSON.stringify(variables).slice(0, 300)}...`);

  try {
    const response = await axios.post(endpoint, body, {
      headers,
      timeout: 30000,
      validateStatus: () => true, // Don't throw on non-2xx
    });

    // Log response details
    console.log(`[GraphQL Client] Response status: ${response.status}`);

    if (response.status !== 200) {
      const bodyPreview = typeof response.data === 'string'
        ? response.data.slice(0, 500)
        : JSON.stringify(response.data).slice(0, 500);
      console.error(`[GraphQL Client] HTTP ${response.status}: ${bodyPreview}`);
      throw new Error(`HTTP ${response.status}`);
    }

    // Check for GraphQL errors
    if (response.data?.errors && response.data.errors.length > 0) {
      console.error(`[GraphQL Client] GraphQL errors: ${JSON.stringify(response.data.errors[0])}`);
    }

    return response.data;
  } catch (error: any) {
    if (axios.isAxiosError(error)) {
      const axiosError = error as AxiosError;
      console.error(`[GraphQL Client] Axios error: ${axiosError.message}`);
      if (axiosError.response) {
        console.error(`[GraphQL Client] Response status: ${axiosError.response.status}`);
        console.error(`[GraphQL Client] Response data: ${JSON.stringify(axiosError.response.data).slice(0, 500)}`);
      }
      if (axiosError.code) {
        console.error(`[GraphQL Client] Error code: ${axiosError.code}`);
      }
    } else {
      console.error(`[GraphQL Client] Error: ${error.message}`);
    }
    throw error;
  }
}

// ============================================================
// DISPENSARY ID RESOLUTION
// ============================================================

/**
 * Resolution result with HTTP status for error handling
 */
export interface ResolveDispensaryResult {
  dispensaryId: string | null;
  httpStatus?: number;
  error?: string;
  source?: 'reactEnv' | 'graphql';
}

/**
 * Resolve a dispensary slug to its internal platform ID.
 *
 * STRATEGY:
 * 1. Navigate to embedded menu page and extract window.reactEnv.dispensaryId (preferred)
 * 2. Fall back to GraphQL GetAddressBasedDispensaryData query if reactEnv fails
 *
 * Returns the dispensaryId (platform_dispensary_id) or null if not found.
 * On 403/404 it resolves to null; use resolveDispensaryIdWithDetails to see
 * the status so the caller can mark the store as not_crawlable.
 */
export async function resolveDispensaryId(slug: string): Promise<string | null> {
  const result = await resolveDispensaryIdWithDetails(slug);
  return result.dispensaryId;
}

/**
 * Resolve a dispensary slug with full details (HTTP status, source, error).
 * Use this when you need to know WHY resolution failed.
 */
export async function resolveDispensaryIdWithDetails(slug: string): Promise<ResolveDispensaryResult> {
  console.log(`[GraphQL Client] Resolving dispensary ID for slug: ${slug}`);

  const session = await createSession(slug);

  try {
    // Check HTTP status first - if 403/404, the store is not crawlable
    if (session.httpStatus && (session.httpStatus === 403 || session.httpStatus === 404)) {
      console.log(`[GraphQL Client] Page returned HTTP ${session.httpStatus} for ${slug} - not crawlable`);
      return {
        dispensaryId: null,
        httpStatus: session.httpStatus,
        error: `HTTP ${session.httpStatus}: Store removed or not accessible`,
        source: 'reactEnv',
      };
    }

    // PREFERRED: Use dispensaryId from window.reactEnv (extracted during createSession)
    if (session.dispensaryId) {
      console.log(`[GraphQL Client] Resolved ${slug} -> ${session.dispensaryId} (from reactEnv)`);
      return {
        dispensaryId: session.dispensaryId,
        httpStatus: session.httpStatus,
        source: 'reactEnv',
      };
    }

    // FALLBACK: Try GraphQL query
    console.log(`[GraphQL Client] reactEnv.dispensaryId not found for ${slug}, trying GraphQL...`);

    const variables = {
      dispensaryFilter: {
        cNameOrID: slug,
      },
    };

    const result = await executeGraphQL(
      session,
      'GetAddressBasedDispensaryData',
      variables,
      GRAPHQL_HASHES.GetAddressBasedDispensaryData,
      slug
    );

    const dispensaryId = result?.data?.dispensaryBySlug?.id ||
      result?.data?.dispensary?.id ||
      result?.data?.getAddressBasedDispensaryData?.dispensary?.id;

    if (dispensaryId) {
      console.log(`[GraphQL Client] Resolved ${slug} -> ${dispensaryId} (from GraphQL)`);
      return {
        dispensaryId,
        httpStatus: session.httpStatus,
        source: 'graphql',
      };
    }

    console.log(`[GraphQL Client] Could not resolve ${slug}, GraphQL response:`, JSON.stringify(result).slice(0, 300));
    return {
      dispensaryId: null,
      httpStatus: session.httpStatus,
      error: 'Could not extract dispensaryId from reactEnv or GraphQL',
    };
  } finally {
    await closeSession(session);
  }
}

/**
 * Discover Arizona dispensaries via geo-based query
 */
export async function discoverArizonaDispensaries(): Promise<any[]> {
  console.log('[GraphQL Client] Discovering Arizona dispensaries...');

  // Establish a session via a known AZ store slug; the centerpoints below drive the geo scan
  const session = await createSession('AZ-Deeply-Rooted');
  const allDispensaries: any[] = [];
  const seenIds = new Set<string>();

  try {
    for (const centerpoint of ARIZONA_CENTERPOINTS) {
      console.log(`[GraphQL Client] Scanning ${centerpoint.name}...`);

      const variables = {
        dispensariesFilter: {
          latitude: centerpoint.lat,
          longitude: centerpoint.lng,
          distance: 100,
          state: 'AZ',
        },
      };

      try {
        const result = await executeGraphQL(
          session,
          'ConsumerDispensaries',
          variables,
          GRAPHQL_HASHES.ConsumerDispensaries,
          'AZ-Deeply-Rooted'
        );

        const dispensaries = result?.data?.consumerDispensaries || [];

        for (const d of dispensaries) {
          const id = d.id || d.dispensaryId;
          if (id && !seenIds.has(id)) {
            seenIds.add(id);
            allDispensaries.push(d);
          }
        }

        console.log(`[GraphQL Client] Found ${dispensaries.length} in ${centerpoint.name} (${allDispensaries.length} total unique)`);
      } catch (error: any) {
        console.warn(`[GraphQL Client] Error scanning ${centerpoint.name}: ${error.message}`);
      }

      // Delay between requests
      await new Promise((r) => setTimeout(r, 1000));
    }
  } finally {
    await closeSession(session);
  }

  console.log(`[GraphQL Client] Discovery complete: ${allDispensaries.length} dispensaries`);
  return allDispensaries;
}

// ============================================================
// PRODUCT FILTERING VARIABLES
// ============================================================

/**
 * Build filter variables for the FilteredProducts query
 *
 * CRITICAL: Uses dispensaryId directly (the MongoDB ObjectId, e.g. "6405ef617056e8014d79101b"),
 * NOT dispensaryFilter.cNameOrID!
 *
 * The actual browser request structure is:
 * {
 *   "productsFilter": {
 *     "dispensaryId": "6405ef617056e8014d79101b",
 *     "pricingType": "rec",
 *     "Status": "Active",   // Mode A only
 *     "strainTypes": [],
 *     "subcategories": [],
 *     "types": [],
 *     "useCache": true,
 *     ...
 *   },
 *   "page": 0,
 *   "perPage": 100
 * }
 *
 * Mode A = UI parity (Status: "Active")
 * Mode B = MAX COVERAGE (no Status filter)
 */
function buildFilterVariables(
  platformDispensaryId: string,
  pricingType: 'rec' | 'med',
  crawlMode: CrawlMode,
  page: number,
  perPage: number
): any {
  const isModeA = crawlMode === 'mode_a';

  // Per CLAUDE.md Rule #11: Use simple productsFilter with dispensaryId directly
  // Do NOT use dispensaryFilter.cNameOrID - that's outdated
  const productsFilter: Record<string, any> = {
    dispensaryId: platformDispensaryId,
    pricingType: pricingType,
  };

  // Mode A: Only active products (UI parity) - Status: "Active"
  // Mode B: MAX COVERAGE (OOS/inactive) - omit Status or set to null
  if (isModeA) {
    productsFilter.Status = 'Active';
  }
  // Mode B: No Status filter = returns all products including OOS/inactive

  return {
    productsFilter,
    page,
    perPage,
  };
}
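
// Example output (the ObjectId is the sample from the doc comment above; the
// 'mode_b' literal is an assumption inferred from the 'mode_a' check, since
// CrawlMode is defined in ../types):
//
//   buildFilterVariables('6405ef617056e8014d79101b', 'rec', 'mode_a', 0, 100)
//   // -> { productsFilter: { dispensaryId: '6405ef617056e8014d79101b',
//   //      pricingType: 'rec', Status: 'Active' }, page: 0, perPage: 100 }
//
//   // Mode B omits Status entirely, so OOS/inactive products are included:
//   buildFilterVariables('6405ef617056e8014d79101b', 'rec', 'mode_b', 0, 100)
//   // -> { productsFilter: { dispensaryId: '...', pricingType: 'rec' }, page: 0, perPage: 100 }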

// ============================================================
// PRODUCT FETCHING WITH PAGINATION
// ============================================================

/**
 * Fetch products for a single mode with pagination
 */
async function fetchProductsForMode(
  session: SessionCredentials,
  platformDispensaryId: string,
  cName: string,
  pricingType: 'rec' | 'med',
  crawlMode: CrawlMode
): Promise<{ products: DutchieRawProduct[]; totalCount: number; crawlMode: CrawlMode }> {
  const perPage = dutchieConfig.perPage;
  const maxPages = dutchieConfig.maxPages;
  const maxRetries = dutchieConfig.maxRetries;
  const pageDelayMs = dutchieConfig.pageDelayMs;

  const allProducts: DutchieRawProduct[] = [];
  let pageNum = 0;
  let totalCount = 0;
  let consecutiveEmptyPages = 0;

  console.log(`[GraphQL Client] Fetching products for ${cName} (platformId: ${platformDispensaryId}, ${pricingType}, ${crawlMode})...`);

  while (pageNum < maxPages) {
    const variables = buildFilterVariables(platformDispensaryId, pricingType, crawlMode, pageNum, perPage);

    let result: any = null;
    let lastError: Error | null = null;

    // Retry with linear backoff (1s, 2s, 3s, ...)
    for (let attempt = 0; attempt <= maxRetries; attempt++) {
      try {
        result = await executeGraphQL(
          session,
          'FilteredProducts',
          variables,
          GRAPHQL_HASHES.FilteredProducts,
          cName
        );
        lastError = null;
        break;
      } catch (error: any) {
        lastError = error;
        console.warn(`[GraphQL Client] Page ${pageNum} attempt ${attempt + 1} failed: ${error.message}`);
        if (attempt < maxRetries) {
          await new Promise((r) => setTimeout(r, 1000 * (attempt + 1)));
        }
      }
    }

    if (lastError) {
      console.error(`[GraphQL Client] Page ${pageNum} failed after ${maxRetries + 1} attempts`);
      break;
    }

    if (result?.errors) {
      console.error('[GraphQL Client] GraphQL errors:', JSON.stringify(result.errors));
      break;
    }

    // Log the response shape on the first page to aid debugging
    if (pageNum === 0) {
      console.log(`[GraphQL Client] Response keys: ${Object.keys(result || {}).join(', ')}`);
      if (result?.data) {
        console.log(`[GraphQL Client] data keys: ${Object.keys(result.data || {}).join(', ')}`);
      }
      if (!result?.data?.filteredProducts) {
        console.log(`[GraphQL Client] WARNING: No filteredProducts in response!`);
        console.log(`[GraphQL Client] Full response: ${JSON.stringify(result).slice(0, 1000)}`);
      }
    }

    const products = result?.data?.filteredProducts?.products || [];
    const queryInfo = result?.data?.filteredProducts?.queryInfo;

    if (queryInfo?.totalCount) {
      totalCount = queryInfo.totalCount;
    }

    console.log(
      `[GraphQL Client] Page ${pageNum}: ${products.length} products (total so far: ${allProducts.length + products.length}/${totalCount})`
    );

    if (products.length === 0) {
      consecutiveEmptyPages++;
      if (consecutiveEmptyPages >= 2) {
        console.log('[GraphQL Client] Multiple empty pages, stopping pagination');
        break;
      }
    } else {
      consecutiveEmptyPages = 0;
      allProducts.push(...products);
    }

    // An incomplete page means we have reached the last page
    if (products.length < perPage) {
      console.log(`[GraphQL Client] Incomplete page (${products.length} < ${perPage}), stopping`);
      break;
    }

    pageNum++;
    await new Promise((r) => setTimeout(r, pageDelayMs));
  }

  console.log(`[GraphQL Client] Fetched ${allProducts.length} total products (${crawlMode})`);
  return { products: allProducts, totalCount: totalCount || allProducts.length, crawlMode };
}

// ============================================================
// LEGACY SINGLE-MODE INTERFACE
// ============================================================

/**
 * Fetch all products for a dispensary (single mode)
 */
export async function fetchAllProducts(
  platformDispensaryId: string,
  pricingType: 'rec' | 'med' = 'rec',
  options: {
    perPage?: number;
    maxPages?: number;
    menuUrl?: string;
    crawlMode?: CrawlMode;
    cName?: string;
  } = {}
): Promise<{ products: DutchieRawProduct[]; totalCount: number; crawlMode: CrawlMode }> {
  const { crawlMode = 'mode_a' } = options;

  // cName is REQUIRED - there is no default fallback, to avoid using another store's session
  const cName = options.cName;
  if (!cName) {
    throw new Error('[GraphQL Client] cName is required for fetchAllProducts - cannot use another store\'s session');
  }

  const session = await createSession(cName);

  try {
    return await fetchProductsForMode(session, platformDispensaryId, cName, pricingType, crawlMode);
  } finally {
    await closeSession(session);
  }
}
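
/*
 * Usage sketch (illustrative values; 'example-dispensary' is a hypothetical cName):
 *
 *   const { products, totalCount, crawlMode } = await fetchAllProducts(
 *     '6405ef617056e8014d79101b',
 *     'rec',
 *     { cName: 'example-dispensary', crawlMode: 'mode_b' }
 *   );
 *   console.log(`${products.length}/${totalCount} products via ${crawlMode}`);
 *
 * Omitting cName throws immediately rather than silently reusing a session
 * that was primed for a different store.
 */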

// ============================================================
// MODE A+B MERGING
// ============================================================

/**
 * Merge POSMetaData.children arrays from Mode A and Mode B products
 */
function mergeProductOptions(
  modeAProduct: DutchieRawProduct,
  modeBProduct: DutchieRawProduct
): DutchiePOSChild[] {
  const modeAChildren = modeAProduct.POSMetaData?.children || [];
  const modeBChildren = modeBProduct.POSMetaData?.children || [];

  // Prefer the most stable identifier available for deduplication
  const getOptionKey = (child: DutchiePOSChild): string => {
    return child.canonicalID || child.canonicalSKU || child.canonicalPackageId || child.option || '';
  };

  const mergedMap = new Map<string, DutchiePOSChild>();

  // Mode A children win on conflicts; Mode B only fills in missing options
  for (const child of modeAChildren) {
    const key = getOptionKey(child);
    if (key) mergedMap.set(key, child);
  }

  for (const child of modeBChildren) {
    const key = getOptionKey(child);
    if (key && !mergedMap.has(key)) {
      mergedMap.set(key, child);
    }
  }

  return Array.from(mergedMap.values());
}

/**
 * Merge a Mode A product with a Mode B product
 */
function mergeProducts(
  modeAProduct: DutchieRawProduct,
  modeBProduct: DutchieRawProduct | undefined
): DutchieRawProduct {
  if (!modeBProduct) {
    return modeAProduct;
  }

  const mergedChildren = mergeProductOptions(modeAProduct, modeBProduct);

  return {
    ...modeAProduct,
    POSMetaData: {
      ...modeAProduct.POSMetaData,
      children: mergedChildren,
    },
  };
}
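
/*
 * Worked example (toy data, hypothetical SKUs): if Mode A returned options
 * { canonicalSKU: 'SKU-1' } and { canonicalSKU: 'SKU-2' }, and Mode B returned
 * { canonicalSKU: 'SKU-2' } and { canonicalSKU: 'SKU-3' } for the same product,
 * the merged children are SKU-1 and SKU-2 from Mode A plus SKU-3 from Mode B:
 * three options total, with the Mode A copy of SKU-2 kept intact.
 */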

// ============================================================
// MAIN EXPORT: TWO-MODE CRAWL
// ============================================================

/**
 * Fetch products using BOTH crawl modes over a SINGLE session.
 * Runs Mode A, then Mode B, and merges the results.
 */
export async function fetchAllProductsBothModes(
  platformDispensaryId: string,
  pricingType: 'rec' | 'med' = 'rec',
  options: {
    perPage?: number;
    maxPages?: number;
    menuUrl?: string;
    cName?: string;
  } = {}
): Promise<{
  modeA: { products: DutchieRawProduct[]; totalCount: number };
  modeB: { products: DutchieRawProduct[]; totalCount: number };
  merged: { products: DutchieRawProduct[]; totalCount: number };
}> {
  // cName is REQUIRED - there is no default fallback, to avoid using another store's session
  const cName = options.cName;
  if (!cName) {
    throw new Error('[GraphQL Client] cName is required for fetchAllProductsBothModes - cannot use another store\'s session');
  }

  console.log(`[GraphQL Client] Running two-mode crawl for ${cName} (${pricingType})...`);
  console.log(`[GraphQL Client] Platform ID: ${platformDispensaryId}, cName: ${cName}`);

  const session = await createSession(cName);

  try {
    // Mode A (UI parity)
    const modeAResult = await fetchProductsForMode(session, platformDispensaryId, cName, pricingType, 'mode_a');

    // Delay between modes
    await new Promise((r) => setTimeout(r, dutchieConfig.modeDelayMs));

    // Mode B (MAX COVERAGE)
    const modeBResult = await fetchProductsForMode(session, platformDispensaryId, cName, pricingType, 'mode_b');

    // Index Mode B products by _id for O(1) lookup during the merge
    const modeBMap = new Map<string, DutchieRawProduct>();
    for (const product of modeBResult.products) {
      modeBMap.set(product._id, product);
    }

    const productMap = new Map<string, DutchieRawProduct>();

    // Add Mode A products, merging with the Mode B copy when one exists
    for (const product of modeAResult.products) {
      const modeBProduct = modeBMap.get(product._id);
      const mergedProduct = mergeProducts(product, modeBProduct);
      productMap.set(product._id, mergedProduct);
    }

    // Add Mode B products that Mode A did not return
    for (const product of modeBResult.products) {
      if (!productMap.has(product._id)) {
        productMap.set(product._id, product);
      }
    }

    const mergedProducts = Array.from(productMap.values());

    console.log(`[GraphQL Client] Merged: ${mergedProducts.length} unique products`);
    console.log(`[GraphQL Client] Mode A: ${modeAResult.products.length}, Mode B: ${modeBResult.products.length}`);

    return {
      modeA: { products: modeAResult.products, totalCount: modeAResult.totalCount },
      modeB: { products: modeBResult.products, totalCount: modeBResult.totalCount },
      merged: { products: mergedProducts, totalCount: mergedProducts.length },
    };
  } finally {
    await closeSession(session);
  }
}
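
/*
 * Usage sketch (hypothetical cName):
 *
 *   const { modeA, modeB, merged } = await fetchAllProductsBothModes(
 *     '6405ef617056e8014d79101b',
 *     'rec',
 *     { cName: 'example-dispensary' }
 *   );
 *   // merged.products is the union keyed by _id: a product's in-stock options
 *   // come from Mode A, and Mode B contributes OOS/inactive products/options.
 */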
@@ -1,665 +0,0 @@
/**
 * Job Queue Service
 *
 * DB-backed job queue with claiming/locking for distributed workers.
 * Ensures only one worker processes a given store at a time.
 */

import { query, getClient } from '../db/connection';
import { v4 as uuidv4 } from 'uuid';
import * as os from 'os';
import { DEFAULT_CONFIG } from './store-validator';

// Minimum gap between crawls for the same dispensary (in minutes)
const MIN_CRAWL_GAP_MINUTES = DEFAULT_CONFIG.minCrawlGapMinutes; // 2 minutes

// ============================================================
// TYPES
// ============================================================

export interface QueuedJob {
  id: number;
  jobType: string;
  dispensaryId: number | null;
  status: 'pending' | 'running' | 'completed' | 'failed';
  priority: number;
  retryCount: number;
  maxRetries: number;
  claimedBy: string | null;
  claimedAt: Date | null;
  workerHostname: string | null;
  startedAt: Date | null;
  completedAt: Date | null;
  errorMessage: string | null;
  productsFound: number;
  productsUpserted: number;
  snapshotsCreated: number;
  currentPage: number;
  totalPages: number | null;
  lastHeartbeatAt: Date | null;
  metadata: Record<string, any> | null;
  createdAt: Date;
}

export interface EnqueueJobOptions {
  jobType: string;
  dispensaryId?: number;
  priority?: number;
  metadata?: Record<string, any>;
  maxRetries?: number;
}

export interface ClaimJobOptions {
  workerId: string;
  jobTypes?: string[];
  lockDurationMinutes?: number;
}

export interface JobProgress {
  productsFound?: number;
  productsUpserted?: number;
  snapshotsCreated?: number;
  currentPage?: number;
  totalPages?: number;
}

// ============================================================
// WORKER IDENTITY
// ============================================================

let _workerId: string | null = null;

/**
 * Get or create a unique worker ID for this process.
 * In Kubernetes, uses POD_NAME for clarity; otherwise generates a unique ID.
 */
export function getWorkerId(): string {
  if (!_workerId) {
    // Prefer POD_NAME in K8s (set via fieldRef)
    const podName = process.env.POD_NAME;
    if (podName) {
      _workerId = podName;
    } else {
      const hostname = os.hostname();
      const pid = process.pid;
      const uuid = uuidv4().slice(0, 8);
      _workerId = `${hostname}-${pid}-${uuid}`;
    }
  }
  return _workerId;
}

/**
 * Get the hostname for worker tracking.
 * In Kubernetes, uses POD_NAME; otherwise uses os.hostname().
 */
export function getWorkerHostname(): string {
  return process.env.POD_NAME || os.hostname();
}

// ============================================================
// JOB ENQUEUEING
// ============================================================

export interface EnqueueResult {
  jobId: number | null;
  skipped: boolean;
  reason?: 'already_queued' | 'too_soon' | 'error';
  message?: string;
}

/**
 * Enqueue a new job for processing.
 * Returns null if a pending/running job already exists for this dispensary,
 * or if a job was completed/failed within the minimum gap period.
 */
export async function enqueueJob(options: EnqueueJobOptions): Promise<number | null> {
  const result = await enqueueJobWithReason(options);
  return result.jobId;
}

/**
 * Enqueue a new job with detailed result info.
 * Enforces:
 * 1. No duplicate pending/running jobs for the same dispensary
 * 2. A minimum 2-minute gap between crawls for the same dispensary
 */
export async function enqueueJobWithReason(options: EnqueueJobOptions): Promise<EnqueueResult> {
  const {
    jobType,
    dispensaryId,
    priority = 0,
    metadata,
    maxRetries = 3,
  } = options;

  // Check if there's already a pending/running job for this dispensary
  if (dispensaryId) {
    const { rows: existing } = await query<any>(
      `SELECT id FROM dispensary_crawl_jobs
       WHERE dispensary_id = $1 AND status IN ('pending', 'running')
       LIMIT 1`,
      [dispensaryId]
    );

    if (existing.length > 0) {
      console.log(`[JobQueue] Skipping enqueue - job already exists for dispensary ${dispensaryId}`);
      return {
        jobId: null,
        skipped: true,
        reason: 'already_queued',
        message: `Job already pending/running for dispensary ${dispensaryId}`,
      };
    }

    // Check the minimum gap since the last job (2 minutes)
    const { rows: recent } = await query<any>(
      `SELECT id, created_at, status
       FROM dispensary_crawl_jobs
       WHERE dispensary_id = $1
       ORDER BY created_at DESC
       LIMIT 1`,
      [dispensaryId]
    );

    if (recent.length > 0) {
      const lastJobTime = new Date(recent[0].created_at);
      const minGapMs = MIN_CRAWL_GAP_MINUTES * 60 * 1000;
      const timeSinceLastJob = Date.now() - lastJobTime.getTime();

      if (timeSinceLastJob < minGapMs) {
        const waitSeconds = Math.ceil((minGapMs - timeSinceLastJob) / 1000);
        console.log(`[JobQueue] Skipping enqueue - minimum ${MIN_CRAWL_GAP_MINUTES}min gap not met for dispensary ${dispensaryId}. Wait ${waitSeconds}s`);
        return {
          jobId: null,
          skipped: true,
          reason: 'too_soon',
          message: `Minimum ${MIN_CRAWL_GAP_MINUTES}-minute gap required. Try again in ${waitSeconds} seconds.`,
        };
      }
    }
  }

  try {
    const { rows } = await query<any>(
      `INSERT INTO dispensary_crawl_jobs (job_type, dispensary_id, status, priority, max_retries, metadata, created_at)
       VALUES ($1, $2, 'pending', $3, $4, $5, NOW())
       RETURNING id`,
      [jobType, dispensaryId || null, priority, maxRetries, metadata ? JSON.stringify(metadata) : null]
    );

    const jobId = rows[0].id;
    console.log(`[JobQueue] Enqueued job ${jobId} (type=${jobType}, dispensary=${dispensaryId})`);
    return { jobId, skipped: false };
  } catch (error: any) {
    // Handle the database trigger that also rejects enqueues within the minimum gap
    if (error.message?.includes('Minimum') && error.message?.includes('gap')) {
      console.log(`[JobQueue] DB rejected - minimum gap not met for dispensary ${dispensaryId}`);
      return {
        jobId: null,
        skipped: true,
        reason: 'too_soon',
        message: error.message,
      };
    }
    throw error;
  }
}
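
/*
 * Usage sketch (hypothetical job type and dispensary id):
 *
 *   const res = await enqueueJobWithReason({ jobType: 'dutchie_menu', dispensaryId: 42 });
 *   if (res.skipped) {
 *     // 'already_queued' or 'too_soon'; res.message explains which and why
 *     console.log(`Skipped: ${res.reason} - ${res.message}`);
 *   } else {
 *     console.log(`Enqueued job ${res.jobId}`);
 *   }
 */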

export interface BulkEnqueueResult {
  enqueued: number;
  skipped: number;
  skippedReasons: {
    alreadyQueued: number;
    tooSoon: number;
  };
}

/**
 * Bulk enqueue jobs for multiple dispensaries.
 * Skips dispensaries that already have pending/running jobs
 * or have jobs within the minimum gap period.
 */
export async function bulkEnqueueJobs(
  jobType: string,
  dispensaryIds: number[],
  options: { priority?: number; metadata?: Record<string, any> } = {}
): Promise<BulkEnqueueResult> {
  const { priority = 0, metadata } = options;

  // Get dispensaries that already have pending/running jobs
  const { rows: existing } = await query<any>(
    `SELECT DISTINCT dispensary_id FROM dispensary_crawl_jobs
     WHERE dispensary_id = ANY($1) AND status IN ('pending', 'running')`,
    [dispensaryIds]
  );
  const existingSet = new Set(existing.map((r: any) => r.dispensary_id));

  // Get dispensaries that have recent jobs within the minimum gap
  const { rows: recent } = await query<any>(
    `SELECT DISTINCT dispensary_id FROM dispensary_crawl_jobs
     WHERE dispensary_id = ANY($1)
       AND created_at > NOW() - ($2 || ' minutes')::INTERVAL
       AND dispensary_id NOT IN (
         SELECT dispensary_id FROM dispensary_crawl_jobs
         WHERE dispensary_id = ANY($1) AND status IN ('pending', 'running')
       )`,
    [dispensaryIds, MIN_CRAWL_GAP_MINUTES]
  );
  const recentSet = new Set(recent.map((r: any) => r.dispensary_id));

  // Filter out dispensaries with existing or recent jobs
  const toEnqueue = dispensaryIds.filter(id => !existingSet.has(id) && !recentSet.has(id));

  if (toEnqueue.length === 0) {
    return {
      enqueued: 0,
      skipped: dispensaryIds.length,
      skippedReasons: {
        alreadyQueued: existingSet.size,
        tooSoon: recentSet.size,
      },
    };
  }

  // Bulk insert - each row takes 4 params: job_type, dispensary_id, priority, metadata
  const metadataJson = metadata ? JSON.stringify(metadata) : null;
  const values = toEnqueue.map((_, i) => {
    const offset = i * 4;
    return `($${offset + 1}, $${offset + 2}, 'pending', $${offset + 3}, 3, $${offset + 4}, NOW())`;
  }).join(', ');

  const params: any[] = [];
  toEnqueue.forEach(dispensaryId => {
    params.push(jobType, dispensaryId, priority, metadataJson);
  });

  await query(
    `INSERT INTO dispensary_crawl_jobs (job_type, dispensary_id, status, priority, max_retries, metadata, created_at)
     VALUES ${values}`,
    params
  );

  console.log(`[JobQueue] Bulk enqueued ${toEnqueue.length} jobs, skipped ${existingSet.size} (queued) + ${recentSet.size} (recent)`);
  return {
    enqueued: toEnqueue.length,
    skipped: existingSet.size + recentSet.size,
    skippedReasons: {
      alreadyQueued: existingSet.size,
      tooSoon: recentSet.size,
    },
  };
}
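
/*
 * Usage sketch (hypothetical ids):
 *
 *   const result = await bulkEnqueueJobs('dutchie_menu', [1, 2, 3], { priority: 5 });
 *   console.log(`enqueued=${result.enqueued}, skipped=${result.skipped}`, result.skippedReasons);
 *
 * Note that the per-row max_retries is hardcoded to 3 in the bulk insert,
 * matching the default used by enqueueJobWithReason.
 */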

// ============================================================
// JOB CLAIMING (with locking)
// ============================================================

/**
 * Claim the next available job from the queue.
 * Uses SELECT ... FOR UPDATE SKIP LOCKED to prevent double-claims.
 */
export async function claimNextJob(options: ClaimJobOptions): Promise<QueuedJob | null> {
  const { workerId, jobTypes, lockDurationMinutes = 30 } = options;
  const hostname = getWorkerHostname();

  const client = await getClient();

  try {
    await client.query('BEGIN');

    // Build job type filter
    let typeFilter = '';
    const params: any[] = [workerId, hostname, lockDurationMinutes];
    let paramIndex = 4;

    if (jobTypes && jobTypes.length > 0) {
      typeFilter = `AND job_type = ANY($${paramIndex})`;
      params.push(jobTypes);
      paramIndex++;
    }

    // Claim the next pending job using FOR UPDATE SKIP LOCKED.
    // This atomically selects and locks a row, skipping any rows already locked by other workers.
    const { rows } = await client.query(
      `UPDATE dispensary_crawl_jobs
       SET
         status = 'running',
         claimed_by = $1,
         claimed_at = NOW(),
         worker_id = $1,
         worker_hostname = $2,
         started_at = NOW(),
         locked_until = NOW() + ($3 || ' minutes')::INTERVAL,
         last_heartbeat_at = NOW(),
         updated_at = NOW()
       WHERE id = (
         SELECT id FROM dispensary_crawl_jobs
         WHERE status = 'pending'
         ${typeFilter}
         ORDER BY priority DESC, created_at ASC
         FOR UPDATE SKIP LOCKED
         LIMIT 1
       )
       RETURNING *`,
      params
    );

    await client.query('COMMIT');

    if (rows.length === 0) {
      return null;
    }

    const job = mapDbRowToJob(rows[0]);
    console.log(`[JobQueue] Worker ${workerId} claimed job ${job.id} (type=${job.jobType}, dispensary=${job.dispensaryId})`);
    return job;
  } catch (error) {
    await client.query('ROLLBACK');
    throw error;
  } finally {
    client.release();
  }
}
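
/*
 * Worker loop sketch combining claimNextJob with heartbeat/completeJob/failJob
 * (defined below). processCrawl and the 5s idle poll are illustrative, not part
 * of this module:
 *
 *   const workerId = getWorkerId();
 *   while (true) {
 *     const job = await claimNextJob({ workerId, jobTypes: ['dutchie_menu'] });
 *     if (!job) {
 *       await new Promise((r) => setTimeout(r, 5000)); // queue empty, poll again
 *       continue;
 *     }
 *     const beat = setInterval(() => heartbeat(job.id), 60_000);
 *     try {
 *       const stats = await processCrawl(job); // hypothetical job handler
 *       await completeJob(job.id, stats);
 *     } catch (err: any) {
 *       await failJob(job.id, err.message); // re-queues until max_retries
 *     } finally {
 *       clearInterval(beat);
 *     }
 *   }
 */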

// ============================================================
// JOB PROGRESS & COMPLETION
// ============================================================

/**
 * Update job progress (for live monitoring)
 */
export async function updateJobProgress(jobId: number, progress: JobProgress): Promise<void> {
  const updates: string[] = ['last_heartbeat_at = NOW()', 'updated_at = NOW()'];
  const params: any[] = [];
  let paramIndex = 1;

  if (progress.productsFound !== undefined) {
    updates.push(`products_found = $${paramIndex++}`);
    params.push(progress.productsFound);
  }
  if (progress.productsUpserted !== undefined) {
    updates.push(`products_upserted = $${paramIndex++}`);
    params.push(progress.productsUpserted);
  }
  if (progress.snapshotsCreated !== undefined) {
    updates.push(`snapshots_created = $${paramIndex++}`);
    params.push(progress.snapshotsCreated);
  }
  if (progress.currentPage !== undefined) {
    updates.push(`current_page = $${paramIndex++}`);
    params.push(progress.currentPage);
  }
  if (progress.totalPages !== undefined) {
    updates.push(`total_pages = $${paramIndex++}`);
    params.push(progress.totalPages);
  }

  params.push(jobId);

  await query(
    `UPDATE dispensary_crawl_jobs SET ${updates.join(', ')} WHERE id = $${paramIndex}`,
    params
  );
}

/**
 * Send a heartbeat to keep the job alive (prevents stale-job recovery from reclaiming it)
 */
export async function heartbeat(jobId: number): Promise<void> {
  await query(
    `UPDATE dispensary_crawl_jobs
     SET last_heartbeat_at = NOW(), locked_until = NOW() + INTERVAL '30 minutes'
     WHERE id = $1 AND status = 'running'`,
    [jobId]
  );
}

/**
 * Mark job as completed.
 *
 * Stores visibility tracking stats (visibilityLostCount, visibilityRestoredCount)
 * in the metadata JSONB column for dashboard analytics.
 */
export async function completeJob(
  jobId: number,
  result: {
    productsFound?: number;
    productsUpserted?: number;
    snapshotsCreated?: number;
    visibilityLostCount?: number;
    visibilityRestoredCount?: number;
  }
): Promise<void> {
  // Build metadata with visibility stats if provided
  const metadata: Record<string, any> = {};
  if (result.visibilityLostCount !== undefined) {
    metadata.visibilityLostCount = result.visibilityLostCount;
  }
  if (result.visibilityRestoredCount !== undefined) {
    metadata.visibilityRestoredCount = result.visibilityRestoredCount;
  }
  if (result.snapshotsCreated !== undefined) {
    metadata.snapshotsCreated = result.snapshotsCreated;
  }

  // Note: the counter column is products_upserted, matching updateJobProgress
  // and mapDbRowToJob
  await query(
    `UPDATE dispensary_crawl_jobs
     SET
       status = 'completed',
       completed_at = NOW(),
       products_found = COALESCE($2, products_found),
       products_upserted = COALESCE($3, products_upserted),
       metadata = COALESCE(metadata, '{}'::jsonb) || $4::jsonb,
       updated_at = NOW()
     WHERE id = $1`,
    [
      jobId,
      result.productsFound,
      result.productsUpserted,
      JSON.stringify(metadata),
    ]
  );
  console.log(`[JobQueue] Job ${jobId} completed`);
}

/**
 * Mark job as failed.
 * Returns true if the job was re-queued for retry, false if it failed permanently.
 */
export async function failJob(jobId: number, errorMessage: string): Promise<boolean> {
  // Check whether we should retry
  const { rows } = await query<any>(
    `SELECT retry_count, max_retries FROM dispensary_crawl_jobs WHERE id = $1`,
    [jobId]
  );

  if (rows.length === 0) return false;

  const { retry_count, max_retries } = rows[0];

  if (retry_count < max_retries) {
    // Re-queue for retry
    await query(
      `UPDATE dispensary_crawl_jobs
       SET
         status = 'pending',
         retry_count = retry_count + 1,
         claimed_by = NULL,
         claimed_at = NULL,
         worker_id = NULL,
         worker_hostname = NULL,
         started_at = NULL,
         locked_until = NULL,
         last_heartbeat_at = NULL,
         error_message = $2,
         updated_at = NOW()
       WHERE id = $1`,
      [jobId, errorMessage]
    );
    console.log(`[JobQueue] Job ${jobId} failed, re-queued for retry (${retry_count + 1}/${max_retries})`);
    return true; // Will retry
  } else {
    // Mark as permanently failed
    await query(
      `UPDATE dispensary_crawl_jobs
       SET
         status = 'failed',
         completed_at = NOW(),
         error_message = $2,
         updated_at = NOW()
       WHERE id = $1`,
      [jobId, errorMessage]
    );
    console.log(`[JobQueue] Job ${jobId} failed permanently after ${retry_count} retries`);
    return false; // No more retries
  }
}

// ============================================================
// QUEUE MONITORING
// ============================================================

/**
 * Get queue statistics
 */
export async function getQueueStats(): Promise<{
  pending: number;
  running: number;
  completed1h: number;
  failed1h: number;
  activeWorkers: number;
  avgDurationSeconds: number | null;
}> {
  const { rows } = await query<any>(`SELECT * FROM v_queue_stats`);
  const stats = rows[0] || {};

  return {
    pending: parseInt(stats.pending_jobs || '0', 10),
    running: parseInt(stats.running_jobs || '0', 10),
    completed1h: parseInt(stats.completed_1h || '0', 10),
    failed1h: parseInt(stats.failed_1h || '0', 10),
    activeWorkers: parseInt(stats.active_workers || '0', 10),
    avgDurationSeconds: stats.avg_duration_seconds ? parseFloat(stats.avg_duration_seconds) : null,
  };
}

/**
 * Get active workers
 */
export async function getActiveWorkers(): Promise<Array<{
  workerId: string;
  hostname: string | null;
  currentJobs: number;
  totalProductsFound: number;
  totalProductsUpserted: number;
  totalSnapshots: number;
  firstClaimedAt: Date;
  lastHeartbeat: Date | null;
}>> {
  const { rows } = await query<any>(`SELECT * FROM v_active_workers`);

  return rows.map((row: any) => ({
    workerId: row.worker_id,
    hostname: row.worker_hostname,
    currentJobs: parseInt(row.current_jobs || '0', 10),
    totalProductsFound: parseInt(row.total_products_found || '0', 10),
    totalProductsUpserted: parseInt(row.total_products_upserted || '0', 10),
    totalSnapshots: parseInt(row.total_snapshots || '0', 10),
    firstClaimedAt: new Date(row.first_claimed_at),
    lastHeartbeat: row.last_heartbeat ? new Date(row.last_heartbeat) : null,
  }));
}

/**
 * Get running jobs with worker info
 */
export async function getRunningJobs(): Promise<QueuedJob[]> {
  const { rows } = await query<any>(
    `SELECT cj.*, d.name as dispensary_name, d.city
     FROM dispensary_crawl_jobs cj
     LEFT JOIN dispensaries d ON cj.dispensary_id = d.id
     WHERE cj.status = 'running'
     ORDER BY cj.started_at DESC`
  );

  return rows.map(mapDbRowToJob);
}

/**
 * Recover stale jobs (claimed by workers that died without completing them)
 */
export async function recoverStaleJobs(staleMinutes: number = 15): Promise<number> {
  const { rowCount } = await query(
    `UPDATE dispensary_crawl_jobs
     SET
       status = 'pending',
       claimed_by = NULL,
       claimed_at = NULL,
       worker_id = NULL,
       worker_hostname = NULL,
       started_at = NULL,
       locked_until = NULL,
       error_message = 'Recovered from stale worker',
       retry_count = retry_count + 1,
       updated_at = NOW()
     WHERE status = 'running'
       AND last_heartbeat_at < NOW() - ($1 || ' minutes')::INTERVAL
       AND retry_count < max_retries`,
    [staleMinutes]
  );

  if (rowCount && rowCount > 0) {
    console.log(`[JobQueue] Recovered ${rowCount} stale jobs`);
  }
  return rowCount || 0;
}
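
/*
 * Maintenance sketch: recoverStaleJobs and cleanupOldJobs (below) are meant to
 * run periodically. The interval values here are illustrative:
 *
 *   setInterval(() => recoverStaleJobs(15).catch(console.error), 5 * 60_000);
 *   setInterval(() => cleanupOldJobs(7).catch(console.error), 24 * 60 * 60_000);
 *
 * A job is only recovered while retry_count < max_retries, so a crash loop
 * cannot re-queue the same job forever.
 */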

/**
 * Clean up old completed/failed jobs
 */
export async function cleanupOldJobs(olderThanDays: number = 7): Promise<number> {
  const { rowCount } = await query(
    `DELETE FROM dispensary_crawl_jobs
     WHERE status IN ('completed', 'failed')
       AND completed_at < NOW() - ($1 || ' days')::INTERVAL`,
    [olderThanDays]
  );

  if (rowCount && rowCount > 0) {
    console.log(`[JobQueue] Cleaned up ${rowCount} old jobs`);
  }
  return rowCount || 0;
}

// ============================================================
// HELPERS
// ============================================================

function mapDbRowToJob(row: any): QueuedJob {
  return {
    id: row.id,
    jobType: row.job_type,
    dispensaryId: row.dispensary_id,
    status: row.status,
    priority: row.priority || 0,
    retryCount: row.retry_count || 0,
    maxRetries: row.max_retries || 3,
    claimedBy: row.claimed_by,
    claimedAt: row.claimed_at ? new Date(row.claimed_at) : null,
    workerHostname: row.worker_hostname,
    startedAt: row.started_at ? new Date(row.started_at) : null,
    completedAt: row.completed_at ? new Date(row.completed_at) : null,
    errorMessage: row.error_message,
    productsFound: row.products_found || 0,
    productsUpserted: row.products_upserted || 0,
    snapshotsCreated: row.snapshots_created || 0,
    currentPage: row.current_page || 0,
    totalPages: row.total_pages,
    lastHeartbeatAt: row.last_heartbeat_at ? new Date(row.last_heartbeat_at) : null,
    metadata: row.metadata,
    createdAt: new Date(row.created_at),
    // Add extra fields from the join if present
    ...(row.dispensary_name && { dispensaryName: row.dispensary_name }),
    ...(row.city && { city: row.city }),
  };
}
File diff suppressed because it is too large.
Some files were not shown because too many files have changed in this diff.