Compare commits: `feature/wo…` → `feat/wordp…` (75 commits)
| SHA1 |
|------|
| `74957a9ec5` |
| `2d035c46cf` |
| `53445fe72a` |
| `37cc8956c5` |
| `197c82f921` |
| `2c52493a9c` |
| `2ee2ba6b8c` |
| `bafcf1694a` |
| `95792aab15` |
| `38ae2c3a3e` |
| `249d3c1b7f` |
| `9647f94f89` |
| `afc288d2cf` |
| `df01ce6aad` |
| `aea93bc96b` |
| `4e84f30f8b` |
| `b20a0a4fa5` |
| `6eb1babc86` |
| `9a9c2f76a2` |
| `56cc171287` |
| `0295637ed6` |
| `9c6dd37316` |
| `524d13209a` |
| `9199db3927` |
| `a0652c7c73` |
| `89c262ee20` |
| `7f9cf559cf` |
| `bbe039c868` |
| `4e5c09a2a5` |
| `7f65598332` |
| `75315ed91e` |
| `7fe7d17b43` |
| `7e517b5801` |
| `38ba9021d1` |
| `ddebad48d3` |
| `1cebf2e296` |
| `1d6e67d837` |
| `cfb4b6e4ce` |
| `f418c403d6` |
| `be4221af46` |
| `ca07606b05` |
| `baf1bf2eb7` |
| `4ef3a8d72b` |
| `09dd756eff` |
| `ec8ef6210c` |
| `a9b7a4d7a9` |
| `5119d5ccf9` |
| `91efd1d03d` |
| `aa776226b0` |
| `e9435150e9` |
| `d399b966e6` |
| `f5f0e25384` |
| `04de33e5f7` |
| `37dfea25e1` |
| `e2166bc25f` |
| `b5e8f039bf` |
| `346e6d1cd8` |
| `be434d25e3` |
| `ecc201e9d4` |
| `67bfdf47a5` |
| `3fa22a6ba1` |
| `9f898f68db` |
| `f78b05360a` |
| `2f483b3084` |
| `9711d594db` |
| `39aebfcb82` |
| `5415cac2f3` |
| `70d2364a6f` |
| `b1ab45f662` |
| `20300edbb8` |
| `b7cfec0770` |
| `948a732dd5` |
| `bf4ceaf09e` |
| `fda688b11a` |
| `414b97b3c0` |
```diff
@@ -2,37 +2,77 @@ when:
   - event: [push, pull_request]

 steps:
-  # Build checks
+  # ===========================================
+  # PR VALIDATION: Parallel type checks (PRs only)
+  # ===========================================
   typecheck-backend:
-    image: node:20
+    image: code.cannabrands.app/creationshop/node:20
     commands:
       - cd backend
-      - npm ci
-      - npx tsc --noEmit || true
+      - npm ci --prefer-offline
+      - npx tsc --noEmit
+    depends_on: []
+    when:
+      event: pull_request

-  build-cannaiq:
-    image: node:20
+  typecheck-cannaiq:
+    image: code.cannabrands.app/creationshop/node:20
     commands:
       - cd cannaiq
-      - npm ci
+      - npm ci --prefer-offline
+      - npx tsc --noEmit
       - npm run build
+    depends_on: []
+    when:
+      event: pull_request

-  build-findadispo:
-    image: node:20
+  typecheck-findadispo:
+    image: code.cannabrands.app/creationshop/node:20
     commands:
       - cd findadispo/frontend
-      - npm ci
-      - npm run build
+      - npm ci --prefer-offline
+      - npx tsc --noEmit 2>/dev/null || true
+    depends_on: []
+    when:
+      event: pull_request

-  build-findagram:
-    image: node:20
+  typecheck-findagram:
+    image: code.cannabrands.app/creationshop/node:20
     commands:
       - cd findagram/frontend
-      - npm ci
-      - npm run build
+      - npm ci --prefer-offline
+      - npx tsc --noEmit 2>/dev/null || true
+    depends_on: []
+    when:
+      event: pull_request

-  # Docker builds - only on master
+  # ===========================================
+  # AUTO-MERGE: Merge PR after all checks pass
+  # ===========================================
+  auto-merge:
+    image: alpine:latest
+    environment:
+      GITEA_TOKEN:
+        from_secret: gitea_token
+    commands:
+      - apk add --no-cache curl
+      - |
+        echo "Merging PR #${CI_COMMIT_PULL_REQUEST}..."
+        curl -s -X POST \
+          -H "Authorization: token $GITEA_TOKEN" \
+          -H "Content-Type: application/json" \
+          -d '{"Do":"merge"}' \
+          "https://code.cannabrands.app/api/v1/repos/Creationshop/dispensary-scraper/pulls/${CI_COMMIT_PULL_REQUEST}/merge"
+    depends_on:
+      - typecheck-backend
+      - typecheck-cannaiq
+      - typecheck-findadispo
+      - typecheck-findagram
+    when:
+      event: pull_request
+
+  # ===========================================
+  # MASTER DEPLOY: Parallel Docker builds
+  # ===========================================
   docker-backend:
     image: woodpeckerci/plugin-docker-buildx
     settings:
@@ -49,6 +89,8 @@ steps:
         from_secret: registry_password
       platforms: linux/amd64
+      provenance: false
       build_args: APP_BUILD_VERSION=${CI_COMMIT_SHA:0:8},APP_GIT_SHA=${CI_COMMIT_SHA},APP_BUILD_TIME=${CI_PIPELINE_CREATED},CONTAINER_IMAGE_TAG=${CI_COMMIT_SHA:0:8}
+    depends_on: []
     when:
       branch: master
       event: push
@@ -69,6 +111,7 @@ steps:
        from_secret: registry_password
      platforms: linux/amd64
      provenance: false
+    depends_on: []
     when:
       branch: master
       event: push
@@ -89,6 +132,7 @@ steps:
        from_secret: registry_password
      platforms: linux/amd64
      provenance: false
+    depends_on: []
     when:
       branch: master
       event: push
@@ -109,32 +153,35 @@ steps:
        from_secret: registry_password
      platforms: linux/amd64
      provenance: false
+    depends_on: []
     when:
       branch: master
       event: push

-  # Deploy to Kubernetes
+  # ===========================================
+  # STAGE 3: Deploy (after Docker builds)
+  # ===========================================
   deploy:
     image: bitnami/kubectl:latest
     environment:
       KUBECONFIG_CONTENT:
         from_secret: kubeconfig_data
     commands:
       - echo "Deploying to Kubernetes..."
       - mkdir -p ~/.kube
       - echo "$KUBECONFIG_CONTENT" | tr -d '[:space:]' | base64 -d > ~/.kube/config
       - chmod 600 ~/.kube/config
       - kubectl set image deployment/scraper scraper=code.cannabrands.app/creationshop/dispensary-scraper:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
-      - kubectl set image deployment/scraper-worker scraper-worker=code.cannabrands.app/creationshop/dispensary-scraper:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
+      - kubectl set image deployment/scraper-worker worker=code.cannabrands.app/creationshop/dispensary-scraper:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
       - kubectl set image deployment/cannaiq-frontend cannaiq-frontend=code.cannabrands.app/creationshop/cannaiq-frontend:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
       - kubectl set image deployment/findadispo-frontend findadispo-frontend=code.cannabrands.app/creationshop/findadispo-frontend:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
       - kubectl set image deployment/findagram-frontend findagram-frontend=code.cannabrands.app/creationshop/findagram-frontend:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
       - kubectl rollout status deployment/scraper -n dispensary-scraper --timeout=300s
       - kubectl rollout status deployment/scraper-worker -n dispensary-scraper --timeout=300s
       - kubectl rollout status deployment/cannaiq-frontend -n dispensary-scraper --timeout=120s
       - kubectl rollout status deployment/findadispo-frontend -n dispensary-scraper --timeout=120s
       - kubectl rollout status deployment/findagram-frontend -n dispensary-scraper --timeout=120s
       - echo "All deployments complete!"
     depends_on:
       - docker-backend
       - docker-cannaiq
       - docker-findadispo
       - docker-findagram
     when:
       branch: master
       event: push
```
**CLAUDE.md** (271 changed lines)
````diff
@@ -193,6 +193,45 @@ CannaiQ has **TWO databases** with distinct purposes:
 | `dutchie_menus` | **Canonical CannaiQ database** - All schema, migrations, and application data | READ/WRITE |
 | `dutchie_legacy` | **Legacy read-only archive** - Historical data from old system | READ-ONLY |

+### Store vs Dispensary Terminology
+
+**"Store" and "Dispensary" are SYNONYMS in CannaiQ.**
+
+| Term | Usage | DB Table |
+|------|-------|----------|
+| Store | API routes (`/api/stores`) | `dispensaries` |
+| Dispensary | DB table, internal code | `dispensaries` |
+
+- `/api/stores` and `/api/dispensaries` both query the `dispensaries` table
+- There is NO `stores` table in use - it's a legacy empty table
+- Use these terms interchangeably in code and documentation
+
+### Canonical vs Legacy Tables
+
+**CANONICAL TABLES (USE THESE):**
+
+| Table | Purpose | Row Count |
+|-------|---------|-----------|
+| `dispensaries` | Store/dispensary records | ~188+ rows |
+| `store_products` | Product catalog | ~37,000+ rows |
+| `store_product_snapshots` | Price/stock history | ~millions |
+
+**LEGACY TABLES (EMPTY - DO NOT USE):**
+
+| Table | Status | Action |
+|-------|--------|--------|
+| `stores` | EMPTY (0 rows) | Use `dispensaries` instead |
+| `products` | EMPTY (0 rows) | Use `store_products` instead |
+| `dutchie_products` | LEGACY (0 rows) | Use `store_products` instead |
+| `dutchie_product_snapshots` | LEGACY (0 rows) | Use `store_product_snapshots` instead |
+| `categories` | EMPTY (0 rows) | Categories stored in product records |
+
+**Code must NEVER:**
+- Query the `stores` table (use `dispensaries`)
+- Query the `products` table (use `store_products`)
+- Query the `dutchie_products` table (use `store_products`)
+- Query the `categories` table (categories are in product records)
+
 **CRITICAL RULES:**
 - **Migrations ONLY run on `dutchie_menus`** - NEVER on `dutchie_legacy`
 - **Application code connects ONLY to `dutchie_menus`**
@@ -305,23 +344,23 @@ npx tsx src/scripts/etl/042_legacy_import.ts
 - SCHEMA ONLY - no data inserts from legacy tables

 **ETL Script 042** (`backend/src/scripts/etl/042_legacy_import.ts`):
-- Copies data from `dutchie_products` → `store_products`
-- Copies data from `dutchie_product_snapshots` → `store_product_snapshots`
+- Copies data from legacy `dutchie_legacy.dutchie_products` → `store_products`
+- Copies data from legacy `dutchie_legacy.dutchie_product_snapshots` → `store_product_snapshots`
 - Extracts brands from product data into `brands` table
 - Links dispensaries to chains and states
 - INSERT-ONLY and IDEMPOTENT (uses ON CONFLICT DO NOTHING)
 - Run manually: `cd backend && npx tsx src/scripts/etl/042_legacy_import.ts`

 **Tables touched by ETL:**
-| Source Table | Target Table |
-|--------------|--------------|
+| Source Table (dutchie_legacy) | Target Table (dutchie_menus) |
+|-------------------------------|------------------------------|
 | `dutchie_products` | `store_products` |
 | `dutchie_product_snapshots` | `store_product_snapshots` |
 | (brand names extracted) | `brands` |
 | (state codes mapped) | `dispensaries.state_id` |
 | (chain names matched) | `dispensaries.chain_id` |

-**Legacy tables remain intact** - `dutchie_products` and `dutchie_product_snapshots` are not modified.
+**Note:** The legacy `dutchie_products` and `dutchie_product_snapshots` tables in `dutchie_legacy` are read-only sources. All new crawl data goes directly to `store_products` and `store_product_snapshots`.

 **Migration 045** (`backend/migrations/045_add_image_columns.sql`):
 - Adds `thumbnail_url` to `store_products` and `store_product_snapshots`
@@ -421,15 +460,66 @@ const result = await pool.query(`
 ### Local Storage Structure

 ```
-/storage/products/{brand}/{state}/{product_id}/
+/storage/images/products/{state}/{store}/{brand}/{product}/
   image-{hash}.webp
   image-{hash}-medium.webp
   image-{hash}-thumb.webp

-/storage/brands/{brand}/
+/storage/images/brands/{brand}/
   logo-{hash}.webp
 ```

+### Image Proxy API (On-Demand Resizing)
+
+Images are stored at full resolution and resized on-demand via the `/img` endpoint.
+
+**Endpoint:** `GET /img/<path>?<params>`
+
+**Parameters:**
+| Param | Description | Example |
+|-------|-------------|---------|
+| `w` | Width in pixels (max 4000) | `?w=200` |
+| `h` | Height in pixels (max 4000) | `?h=200` |
+| `q` | Quality 1-100 (default 80) | `?q=70` |
+| `fit` | Resize mode: cover, contain, fill, inside, outside | `?fit=cover` |
+| `blur` | Blur sigma 0.3-1000 | `?blur=5` |
+| `gray` | Grayscale (1 = enabled) | `?gray=1` |
+| `format` | Output: webp, jpeg, png, avif (default webp) | `?format=jpeg` |
+
+**Examples:**
+```bash
+# Thumbnail (50px)
+GET /img/products/az/store/brand/product/image-abc123.webp?w=50
+
+# Card image (200px, cover fit)
+GET /img/products/az/store/brand/product/image-abc123.webp?w=200&h=200&fit=cover
+
+# JPEG at 70% quality
+GET /img/products/az/store/brand/product/image-abc123.webp?w=400&format=jpeg&q=70
+
+# Grayscale blur
+GET /img/products/az/store/brand/product/image-abc123.webp?w=200&gray=1&blur=3
+```
+
+**Frontend Usage:**
+```typescript
+import { getImageUrl, ImageSizes } from '../lib/images';
+
+// Returns /img/products/.../image.webp?w=50 for local images
+// Returns original URL for remote images (CDN, etc.)
+const thumbUrl = getImageUrl(product.image_url, ImageSizes.thumb);
+const cardUrl = getImageUrl(product.image_url, ImageSizes.medium);
+const detailUrl = getImageUrl(product.image_url, ImageSizes.detail);
+```
+
+**Size Presets:**
+| Preset | Width | Use Case |
+|--------|-------|----------|
+| `thumb` | 50px | Table thumbnails |
+| `small` | 100px | Small cards |
+| `medium` | 200px | Grid cards |
+| `large` | 400px | Large cards |
+| `detail` | 600px | Product detail |
+| `full` | - | No resize |
+
 ### Storage Adapter

 ```typescript
@@ -442,8 +532,9 @@ import { saveImage, getImageUrl } from '../utils/storage-adapter';

 | File | Purpose |
 |------|---------|
 | `backend/src/utils/local-storage.ts` | Local filesystem adapter |
 | `backend/src/utils/storage-adapter.ts` | Unified storage abstraction |
 | `backend/src/utils/image-storage.ts` | Image download and storage |
+| `backend/src/routes/image-proxy.ts` | On-demand image resizing endpoint |
 | `cannaiq/src/lib/images.ts` | Frontend image URL helper |
 | `docker-compose.local.yml` | Local stack without MinIO |
 | `start-local.sh` | Convenience startup script |
@@ -451,12 +542,78 @@ import { saveImage, getImageUrl } from '../utils/storage-adapter';

 ## UI ANONYMIZATION RULES

-- No vendor names in forward-facing URLs: use `/api/az/...`, `/az`, `/az-schedule`
+- No vendor names in forward-facing URLs
 - No "dutchie", "treez", "jane", "weedmaps", "leafly" visible in consumer UIs
 - Internal admin tools may show provider names for debugging

 ---

+## DUTCHIE DISCOVERY PIPELINE (Added 2025-01)
+
+### Overview
+Automated discovery of Dutchie-powered dispensaries across all US states.
+
+### Flow
+```
+1. getAllCitiesByState GraphQL → Get all cities for a state
+2. ConsumerDispensaries GraphQL → Get stores for each city
+3. Upsert to dutchie_discovery_locations (keyed by platform_location_id)
+4. AUTO-VALIDATE: Check required fields
+5. AUTO-PROMOTE: Create/update dispensaries with crawl_enabled=true
+6. Log all actions to dutchie_promotion_log
+```
+
+### Tables
+| Table | Purpose |
+|-------|---------|
+| `dutchie_discovery_cities` | Cities known to have dispensaries |
+| `dutchie_discovery_locations` | Raw discovered store data |
+| `dispensaries` | Canonical stores (promoted from discovery) |
+| `dutchie_promotion_log` | Audit trail for validation/promotion |
+
+### Files
+| File | Purpose |
+|------|---------|
+| `src/discovery/discovery-crawler.ts` | Main orchestrator |
+| `src/discovery/location-discovery.ts` | GraphQL fetching |
+| `src/discovery/promotion.ts` | Validation & promotion logic |
+| `src/scripts/run-discovery.ts` | CLI interface |
+| `migrations/067_promotion_log.sql` | Audit log table |
+
+### GraphQL Hashes (in `src/platforms/dutchie/client.ts`)
+| Query | Hash |
+|-------|------|
+| `GetAllCitiesByState` | `ae547a0466ace5a48f91e55bf6699eacd87e3a42841560f0c0eabed5a0a920e6` |
+| `ConsumerDispensaries` | `0a5bfa6ca1d64ae47bcccb7c8077c87147cbc4e6982c17ceec97a2a4948b311b` |
+
+### Usage
+```bash
+# Discover all stores in a state
+npx tsx src/scripts/run-discovery.ts discover:state AZ
+npx tsx src/scripts/run-discovery.ts discover:state CA
+
+# Check stats
+npx tsx src/scripts/run-discovery.ts stats
+```
+
+### Validation Rules
+A discovery location must have:
+- `platform_location_id` (MongoDB ObjectId, 24 hex chars)
+- `name`
+- `city`
+- `state_code`
+- `platform_menu_url`
+
+Invalid records are marked `status='rejected'` with errors logged.
````
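A minimal TypeScript sketch of the validation rule added above (the interface and function names here are illustrative, not the repo's actual code):

```typescript
// Hypothetical validator mirroring the Validation Rules list above.
interface DiscoveryLocation {
  platform_location_id: string | null;
  name: string | null;
  city: string | null;
  state_code: string | null;
  platform_menu_url: string | null;
}

function validateLocation(loc: DiscoveryLocation): string[] {
  const errors: string[] = [];
  // platform_location_id must be a MongoDB ObjectId: exactly 24 hex chars (not a slug)
  if (!loc.platform_location_id || !/^[0-9a-f]{24}$/i.test(loc.platform_location_id)) {
    errors.push('platform_location_id must be a 24-char hex ObjectId');
  }
  for (const field of ['name', 'city', 'state_code', 'platform_menu_url'] as const) {
    if (!loc[field]) errors.push(`${field} is required`);
  }
  return errors; // empty = valid; otherwise the record is marked status='rejected'
}
```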
````diff
+### Key Design Decisions
+- `platform_location_id` MUST be MongoDB ObjectId (not slug)
+- Old geo-based discovery stored slugs → deleted as garbage data
+- Rate limit: 2 seconds between city requests to avoid API throttling
+- Promotion is idempotent via `ON CONFLICT (platform_dispensary_id)`
+
 ---

 ## FUTURE TODO / PENDING FEATURES

 - [ ] Orchestrator observability dashboard

@@ -601,29 +758,45 @@ export default defineConfig({

 - **DB**: Use the single CannaiQ database via `CANNAIQ_DB_*` env vars. No hardcoded names.
 - **Images**: No MinIO. Save to local /images/products/<disp>/<prod>-<hash>.webp (and brands); preserve original URL; serve via backend static.
-- **Dutchie GraphQL**: Endpoint https://dutchie.com/api-3/graphql. Variables must use productsFilter.dispensaryId (platform_dispensary_id). Mode A: Status="Active". Mode B: Status=null/activeOnly:false.
+- **Dutchie GraphQL**: Endpoint https://dutchie.com/api-3/graphql. Variables must use productsFilter.dispensaryId (platform_dispensary_id). **CRITICAL: Use `Status: 'Active'`, NOT `null`** (null returns 0 products).
 - **cName/slug**: Derive cName from each store's menu_url (/embedded-menu/<cName> or /dispensary/<slug>). No hardcoded defaults.
 - **Dual-mode always**: useBothModes:true to get pricing (Mode A) + full coverage (Mode B).
 - **Batch DB writes**: Chunk products/snapshots/missing (100–200) to avoid OOM.
 - **OOS/missing**: Include inactive/OOS in Mode B. Union A+B, dedupe by external_product_id+dispensary_id.
-- **API/Frontend**: Use /api/az/... endpoints (stores/products/brands/categories/summary/dashboard).
+- **API/Frontend**: Use `/api/stores`, `/api/products`, `/api/workers`, `/api/pipeline` endpoints.
 - **Scheduling**: Crawl only menu_type='dutchie' AND platform_dispensary_id IS NOT NULL. 4-hour crawl with jitter.
-- **Monitor**: /scraper-monitor (and /az-schedule) should show active/recent jobs from job_run_logs/crawl_jobs.
+- **THC/CBD values**: Clamp to ≤100 - some products report milligrams as percentages.
+- **Column names**: Use `name_raw`, `brand_name_raw`, `category_raw`, `subcategory_raw` (NOT `name`, `brand_name`, etc.)
+- **Monitor**: `/api/workers` shows active/recent jobs from job queue.
 - **No slug guessing**: Never use defaults. Always derive per store from menu_url and resolve platform IDs per location.
````
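Two of the rules above (potency clamping and chunked batch writes) are easy to get wrong, so here is an illustrative TypeScript sketch; the helper names are hypothetical and the chunk size follows the 100-200 guidance:

```typescript
// Hypothetical helpers; not the repo's actual code.

// THC/CBD values: clamp to <=100, since some feeds report milligrams as percentages.
function clampPotency(value: number | null): number | null {
  return value == null ? null : Math.min(value, 100);
}

// Batch DB writes: split large arrays into 100-200 item chunks to avoid OOM.
function chunk<T>(items: T[], size = 150): T[][] {
  const out: T[][] = [];
  for (let i = 0; i < items.length; i += size) {
    out.push(items.slice(i, i + size));
  }
  return out;
}

// Usage sketch: insert snapshots 150 rows at a time.
// for (const batch of chunk(snapshots)) await insertSnapshots(batch);
```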
````diff

 **📖 Full Documentation: See `docs/DUTCHIE_CRAWL_WORKFLOW.md` for complete pipeline documentation.**

 ---

 ### Detailed Rules

-1) **Dispensary vs Store**
-   - Dutchie pipeline uses `dispensaries` (not legacy `stores`). For dutchie crawls, always work with dispensary ID.
+1) **Dispensary = Store (SAME THING)**
+   - "Dispensary" and "store" are synonyms in CannaiQ. Use interchangeably.
+   - **API endpoint**: `/api/stores` (NOT `/api/dispensaries`)
+   - **DB table**: `dispensaries`
+   - When you need to create/query stores via API, use `/api/stores`
    - Use the record's `menu_url` and `platform_dispensary_id`.

-2) **Menu detection and platform IDs**
+2) **API Authentication**
+   - **Trusted Origins (no auth needed)**:
+     - IPs: `127.0.0.1`, `::1`, `::ffff:127.0.0.1`
+     - Origins: `https://cannaiq.co`, `https://findadispo.com`, `https://findagram.co`
+     - Also: `http://localhost:3010`, `http://localhost:8080`, `http://localhost:5173`
+   - Requests from trusted IPs/origins get automatic admin access (`role: 'internal'`)
+   - **Remote (non-trusted)**: Use Bearer token (JWT or API token). NO username/password auth.
+   - Never try to login with username/password via API - use tokens only.
+   - See `src/auth/middleware.ts` for `TRUSTED_ORIGINS` and `TRUSTED_IPS` lists.
````
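Rule 2 implies roughly the following request gate. A sketch assuming an Express-style middleware; the real lists and token verification live in `src/auth/middleware.ts`:

```typescript
import type { Request, Response, NextFunction } from 'express';

// Illustrative only; mirrors the TRUSTED_IPS / TRUSTED_ORIGINS rule above.
const TRUSTED_IPS = ['127.0.0.1', '::1', '::ffff:127.0.0.1'];
const TRUSTED_ORIGINS = [
  'https://cannaiq.co',
  'https://findadispo.com',
  'https://findagram.co',
  'http://localhost:3010',
  'http://localhost:8080',
  'http://localhost:5173',
];

function authGate(req: Request, res: Response, next: NextFunction) {
  const origin = req.headers.origin ?? '';
  if (TRUSTED_IPS.includes(req.ip ?? '') || TRUSTED_ORIGINS.includes(origin)) {
    (req as any).user = { role: 'internal' }; // automatic admin access
    return next();
  }
  // Non-trusted callers must present a Bearer token (JWT or API token);
  // there is no username/password login via the API.
  const auth = req.headers.authorization ?? '';
  if (!auth.startsWith('Bearer ')) {
    res.status(401).json({ error: 'Bearer token required' });
    return;
  }
  // ...token verification would happen here...
  next();
}
```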
````diff
+
+3) **Menu detection and platform IDs**
    - Set `menu_type` from `menu_url` detection; resolve `platform_dispensary_id` for `menu_type='dutchie'`.
    - Admin should have "refresh detection" and "resolve ID" actions; schedule/crawl only when `menu_type='dutchie'` AND `platform_dispensary_id` is set.

-3) **Queries and mapping**
+4) **Queries and mapping**
    - The DB returns snake_case; code expects camelCase. Always alias/map:
      - `platform_dispensary_id AS "platformDispensaryId"`
    - Map via `mapDbRowToDispensary` when loading dispensaries (scheduler, crawler, admin crawl).
````
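A minimal sketch of the aliasing rule in 4), assuming a `pg` Pool (the column list is abbreviated):

```typescript
import { Pool } from 'pg';

const pool = new Pool(); // connection settings come from env vars in real code

// Alias snake_case columns to camelCase at query time so nothing
// downstream has to remember to map them.
async function loadDutchieDispensaries() {
  const { rows } = await pool.query(`
    SELECT id,
           name,
           menu_url               AS "menuUrl",
           platform_dispensary_id AS "platformDispensaryId"
    FROM dispensaries
    WHERE menu_type = 'dutchie'
      AND platform_dispensary_id IS NOT NULL
  `);
  return rows;
}
```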
````diff
@@ -640,7 +813,7 @@ export default defineConfig({
    - Use dutchie GraphQL pipeline only for `menu_type='dutchie'`.

 6) **Frontend**
-   - Forward-facing URLs: `/api/az`, `/az`, `/az-schedule`; no vendor names.
+   - Forward-facing URLs should not contain vendor names.
    - `/scraper-schedule`: add filters/search, keep as master view for all schedules; reflect platform ID/menu_type status and controls.

 7) **No slug guessing**
@@ -689,24 +862,27 @@ export default defineConfig({

 16) **API Route Semantics**

-   **Route Groups:**
-   - `/api/admin/...` = Admin/operator actions (crawl triggers, health checks)
-   - `/api/az/...` = Arizona data slice (stores, products, metrics)
+   **Route Groups (as registered in `src/index.ts`):**
+   - `/api/stores` = Store/dispensary CRUD and listing
+   - `/api/products` = Product listing and details
+   - `/api/workers` = Job queue monitoring (replaces legacy `/api/dutchie-az/...`)
+   - `/api/pipeline` = Crawl pipeline triggers
+   - `/api/admin/orchestrator` = Orchestrator admin actions
+   - `/api/discovery` = Platform discovery (Dutchie, etc.)
+   - `/api/v1/...` = Public API for external consumers (WordPress, etc.)

-   **Crawl Trigger (CANONICAL):**
-   ```
-   POST /api/admin/crawl/:dispensaryId
-   ```
+   **Crawl Trigger:**
+   Check `/api/pipeline` or `/api/admin/orchestrator` routes for crawl triggers.
+   The legacy `POST /api/admin/crawl/:dispensaryId` does NOT exist.

 17) **Monitoring and logging**
-   - /scraper-monitor (and /az-schedule) should show active/recent jobs from job_run_logs/crawl_jobs
+   - `/api/workers` shows active/recent jobs from job queue
    - Auto-refresh every 30 seconds
    - System Logs page should show real log data, not just startup messages

 18) **Dashboard Architecture**
    - **Frontend**: Rebuild the frontend with `VITE_API_URL` pointing to the correct backend and redeploy.
-   - **Backend**: `/api/dashboard/stats` MUST use the canonical DB pool. Use the correct tables: `dutchie_products`, `dispensaries`, and views like `v_dashboard_stats`, `v_latest_snapshots`.
+   - **Backend**: `/api/dashboard/stats` MUST use the canonical DB pool. Use the correct tables: `store_products`, `dispensaries`, and views like `v_dashboard_stats`, `v_latest_snapshots`.

 19) **Deployment (Gitea + Kubernetes)**
    - **Registry**: Gitea at `code.cannabrands.app/creationshop/dispensary-scraper`
@@ -732,8 +908,8 @@ export default defineConfig({
    - **Job schedules** (managed in `job_schedules` table):
      - `dutchie_az_menu_detection`: Runs daily with 60-min jitter
      - `dutchie_az_product_crawl`: Runs every 4 hours with 30-min jitter
-   - **Trigger schedules**: `curl -X POST /api/az/admin/schedules/{id}/trigger`
-   - **Check schedule status**: `curl /api/az/admin/schedules`
+   - **Monitor jobs**: `GET /api/workers`
+   - **Trigger crawls**: Check `/api/pipeline` routes

 21) **Frontend Architecture - AVOID OVER-ENGINEERING**

@@ -1072,3 +1248,32 @@ Every analytics v2 endpoint must:
 ---

 # END Analytics V2 spec extension

+---
+
+## WordPress Plugin Versioning
+
+The WordPress plugin version is tracked in `wordpress-plugin/VERSION`.
+
+**Current version:** Check `wordpress-plugin/VERSION` for the latest version.
+
+**Versioning rules:**
+- **Minor bumps (x.x.N)**: Bug fixes, small improvements - default for most changes
+- **Middle bumps (x.N.0)**: New features, significant improvements
+- **Major bumps (N.0.0)**: Breaking changes, major rewrites - only when the user explicitly requests
+
+**When making WP plugin changes:**
+1. Read `wordpress-plugin/VERSION` to get the current version
+2. Bump the version number (minor by default)
+3. Update each of the following:
+   - `wordpress-plugin/VERSION`
+   - The plugin header `Version:` in `cannaiq-menus.php` and/or `crawlsy-menus.php`
+   - The `define('..._VERSION', '...')` constant in each plugin file
+
+**Plugin files:**
+| File | Brand | API URL |
+|------|-------|---------|
+| `cannaiq-menus.php` | CannaIQ | `https://cannaiq.co/api/v1` |
+| `crawlsy-menus.php` | Crawlsy (legacy) | `https://cannaiq.co/api/v1` |
+
+Both plugins use the same API endpoint. The Crawlsy version exists for backward compatibility with existing installations.
````
**backend/.env** (40 changed lines)
```diff
@@ -1,30 +1,52 @@
 # CannaiQ Backend Environment Configuration
 # Copy this file to .env and fill in the values

 # Server
 PORT=3010
 NODE_ENV=development

 # =============================================================================
-# CannaiQ Database (dutchie_menus) - PRIMARY DATABASE
+# CANNAIQ DATABASE (dutchie_menus) - PRIMARY DATABASE
 # =============================================================================
-# This is where all schema migrations run and where canonical tables live.
-# All CANNAIQ_DB_* variables are REQUIRED - connection will fail if missing.
+# This is where ALL schema migrations run and where canonical tables live.
+# All CANNAIQ_DB_* variables are REQUIRED - no defaults.
+# The application will fail to start if any are missing.

 CANNAIQ_DB_HOST=localhost
 CANNAIQ_DB_PORT=54320
-CANNAIQ_DB_NAME=dutchie_menus
+CANNAIQ_DB_NAME=dutchie_menus  # MUST be dutchie_menus - NOT dutchie_legacy
 CANNAIQ_DB_USER=dutchie
 CANNAIQ_DB_PASS=dutchie_local_pass

+# Alternative: Use a full connection URL instead of individual vars
+# If set, this takes priority over individual vars above
+# CANNAIQ_DB_URL=postgresql://user:pass@host:port/dutchie_menus
+
 # =============================================================================
-# Legacy Database (dutchie_legacy) - READ-ONLY SOURCE
+# LEGACY DATABASE (dutchie_legacy) - READ-ONLY FOR ETL
 # =============================================================================
 # Used ONLY by ETL scripts to read historical data.
 # NEVER run migrations against this database.
+# These are only needed when running 042_legacy_import.ts

 LEGACY_DB_HOST=localhost
 LEGACY_DB_PORT=54320
-LEGACY_DB_NAME=dutchie_legacy
+LEGACY_DB_NAME=dutchie_legacy  # READ-ONLY - never migrated
 LEGACY_DB_USER=dutchie
-LEGACY_DB_PASS=dutchie_local_pass
+LEGACY_DB_PASS=

-# Local image storage (no MinIO per CLAUDE.md)
+# Alternative: Use a full connection URL instead of individual vars
+# LEGACY_DB_URL=postgresql://user:pass@host:port/dutchie_legacy
+
+# =============================================================================
+# LOCAL STORAGE
+# =============================================================================
+# Local image storage path (no MinIO)
 LOCAL_IMAGES_PATH=./public/images

-# JWT
+# =============================================================================
+# AUTHENTICATION
+# =============================================================================
 JWT_SECRET=your-secret-key-change-in-production
 ANTHROPIC_API_KEY=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
 OPENAI_API_KEY=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
```
**backend/.gitignore** (vendored, new file, 3 lines)
```diff
@@ -0,0 +1,3 @@
+
+# IP2Location database (downloaded separately)
+data/ip2location/
```
```diff
@@ -1,17 +1,17 @@
 # Build stage
+# Image: code.cannabrands.app/creationshop/dispensary-scraper
-FROM node:20-slim AS builder
+FROM code.cannabrands.app/creationshop/node:20-slim AS builder

 WORKDIR /app

 COPY package*.json ./
-RUN npm ci
+RUN npm install

 COPY . .
 RUN npm run build

 # Production stage
-FROM node:20-slim
+FROM code.cannabrands.app/creationshop/node:20-slim

 # Build arguments for version info
 ARG APP_BUILD_VERSION=dev
@@ -43,10 +43,13 @@ ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium
 WORKDIR /app

 COPY package*.json ./
-RUN npm ci --omit=dev
+RUN npm install --omit=dev

 COPY --from=builder /app/dist ./dist

+# Copy migrations for auto-migrate on startup
+COPY migrations ./migrations
+
 # Create local images directory for when MinIO is not configured
 RUN mkdir -p /app/public/images/products
```
**backend/docs/CRAWL_PIPELINE.md** (new file, 538 lines)
# Crawl Pipeline Documentation

## Overview

The crawl pipeline fetches product data from Dutchie dispensary menus and stores it in the canonical database. This document covers the complete flow from task scheduling to data storage.

---

## Pipeline Stages

```
┌───────────────────────┐
│ store_discovery       │  Find new dispensaries
└──────────┬────────────┘
           │
           ▼
┌───────────────────────┐
│ entry_point_discovery │  Resolve slug → platform_dispensary_id
└──────────┬────────────┘
           │
           ▼
┌───────────────────────┐
│ product_discovery     │  Initial product crawl
└──────────┬────────────┘
           │
           ▼
┌───────────────────────┐
│ product_resync        │  Recurring crawl (every 4 hours)
└───────────────────────┘
```

---

## Stage Details

### 1. Store Discovery

**Purpose:** Find new dispensaries to crawl

**Handler:** `src/tasks/handlers/store-discovery.ts`

**Flow:**
1. Query Dutchie `ConsumerDispensaries` GraphQL for cities/states
2. Extract dispensary info (name, address, menu_url)
3. Insert into `dutchie_discovery_locations`
4. Queue `entry_point_discovery` for each new location

---

### 2. Entry Point Discovery

**Purpose:** Resolve a menu URL slug to `platform_dispensary_id` (MongoDB ObjectId)

**Handler:** `src/tasks/handlers/entry-point-discovery.ts`

**Flow:**
1. Load dispensary from database
2. Extract slug from `menu_url`: `/embedded-menu/<slug>` or `/dispensary/<slug>`
3. Start stealth session (fingerprint + proxy)
4. Query `resolveDispensaryIdWithDetails(slug)` via GraphQL
5. Update dispensary with `platform_dispensary_id`
6. Queue `product_discovery` task

**Example:**
```
menu_url: https://dutchie.com/embedded-menu/deeply-rooted
slug: deeply-rooted
platform_dispensary_id: 6405ef617056e8014d79101b
```
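A sketch of the slug extraction this stage performs; the regex and function name are illustrative, not the handler's actual code:

```typescript
// Handles both /embedded-menu/<slug> and /dispensary/<slug> forms.
function extractSlug(menuUrl: string): string | null {
  const match = menuUrl.match(/\/(?:embedded-menu|dispensary)\/([^/?#]+)/);
  return match ? match[1] : null;
}

extractSlug('https://dutchie.com/embedded-menu/deeply-rooted'); // 'deeply-rooted'
extractSlug('https://dutchie.com/dispensary/some-store');       // 'some-store'
extractSlug('https://example.com/menu');                        // null (no slug guessing)
```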
---

### 3. Product Discovery

**Purpose:** Initial crawl of a new dispensary

**Handler:** `src/tasks/handlers/product-discovery.ts`

Same as product_resync, but for first-time crawls.

---

### 4. Product Resync

**Purpose:** Recurring crawl to capture price/stock changes

**Handler:** `src/tasks/handlers/product-resync.ts`

**Flow:**

#### Step 1: Load Dispensary Info
```sql
SELECT id, name, platform_dispensary_id, menu_url, state
FROM dispensaries
WHERE id = $1 AND crawl_enabled = true
```

#### Step 2: Start Stealth Session
- Generate random browser fingerprint
- Set locale/timezone matching state
- Optional proxy rotation

#### Step 3: Fetch Products via GraphQL

**Endpoint:** `https://dutchie.com/api-3/graphql`

**Variables:**
```javascript
{
  includeEnterpriseSpecials: false,
  productsFilter: {
    dispensaryId: "<platform_dispensary_id>",
    pricingType: "rec",
    Status: "All",
    types: [],
    useCache: false,
    isDefaultSort: true,
    sortBy: "popularSortIdx",
    sortDirection: 1,
    bypassOnlineThresholds: true,
    isKioskMenu: false,
    removeProductsBelowOptionThresholds: false
  },
  page: 0,
  perPage: 100
}
```

**Key Notes:**
- `Status: "All"` returns all products (`"Active"` returns the same count)
- `Status: null` returns 0 products (broken)
- `pricingType: "rec"` returns BOTH rec and med prices
- Paginate until `products.length < perPage` or `allProducts.length >= totalCount`
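A minimal sketch of that pagination rule (`fetchPage` stands in for the real GraphQL call in the Dutchie client):

```typescript
// Keep requesting pages until a short page arrives or the reported total is reached.
async function fetchAllProducts(
  fetchPage: (page: number, perPage: number) => Promise<{ products: unknown[]; totalCount: number }>,
  perPage = 100
): Promise<unknown[]> {
  const all: unknown[] = [];
  for (let page = 0; ; page++) {
    const { products, totalCount } = await fetchPage(page, perPage);
    all.push(...products);
    if (products.length < perPage || all.length >= totalCount) break;
  }
  return all;
}
```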
#### Step 4: Normalize Data
Transform the raw Dutchie payload to canonical format via `DutchieNormalizer`.

#### Step 5: Upsert Products
Insert/update the `store_products` table with normalized data.

#### Step 6: Create Snapshots
Insert a point-in-time record to `store_product_snapshots`.

#### Step 7: Track Missing Products (OOS Detection)
```sql
-- Reset consecutive_misses for products IN the feed
UPDATE store_products
SET consecutive_misses = 0, last_seen_at = NOW()
WHERE dispensary_id = $1
  AND provider = 'dutchie'
  AND provider_product_id = ANY($2)

-- Increment for products NOT in feed
UPDATE store_products
SET consecutive_misses = consecutive_misses + 1
WHERE dispensary_id = $1
  AND provider = 'dutchie'
  AND provider_product_id NOT IN (...)
  AND consecutive_misses < 3

-- Mark OOS at 3 consecutive misses
UPDATE store_products
SET stock_status = 'oos', is_in_stock = false
WHERE dispensary_id = $1
  AND consecutive_misses >= 3
  AND stock_status != 'oos'
```

#### Step 8: Download Images
For new products, download and store images locally.

#### Step 9: Update Dispensary
```sql
UPDATE dispensaries SET last_crawl_at = NOW() WHERE id = $1
```

---

## GraphQL Payload Structure

### Product Fields (from `filteredProducts.products[]`)

| Field | Type | Description |
|-------|------|-------------|
| `_id` / `id` | string | MongoDB ObjectId (24 hex chars) |
| `Name` | string | Product display name |
| `brandName` | string | Brand name |
| `brand.name` | string | Brand name (nested) |
| `brand.description` | string | Brand description |
| `type` | string | Category (Flower, Edible, Concentrate, etc.) |
| `subcategory` | string | Subcategory |
| `strainType` | string | Hybrid, Indica, Sativa, N/A |
| `Status` | string | Always "Active" in feed |
| `Image` | string | Primary image URL |
| `images[]` | array | All product images |

### Pricing Fields

| Field | Type | Description |
|-------|------|-------------|
| `Prices[]` | number[] | Rec prices per option |
| `recPrices[]` | number[] | Rec prices |
| `medicalPrices[]` | number[] | Medical prices |
| `recSpecialPrices[]` | number[] | Rec sale prices |
| `medicalSpecialPrices[]` | number[] | Medical sale prices |
| `Options[]` | string[] | Size options ("1/8oz", "1g", etc.) |
| `rawOptions[]` | string[] | Raw weight options ("3.5g") |

### Inventory Fields (`POSMetaData.children[]`)

| Field | Type | Description |
|-------|------|-------------|
| `quantity` | number | Total inventory count |
| `quantityAvailable` | number | Available for online orders |
| `kioskQuantityAvailable` | number | Available for kiosk orders |
| `option` | string | Which size option this is for |

### Potency Fields

| Field | Type | Description |
|-------|------|-------------|
| `THCContent.range[]` | number[] | THC percentage |
| `CBDContent.range[]` | number[] | CBD percentage |
| `cannabinoidsV2[]` | array | Detailed cannabinoid breakdown |

### Specials (`specialData.bogoSpecials[]`)

| Field | Type | Description |
|-------|------|-------------|
| `specialName` | string | Deal name |
| `specialType` | string | "bogo", "sale", etc. |
| `itemsForAPrice.value` | string | Bundle price |
| `bogoRewards[].totalQuantity.quantity` | number | Required quantity |

---

## OOS Detection Logic

Products disappear from the Dutchie feed when they go out of stock. We track this via `consecutive_misses`:

| Scenario | Action |
|----------|--------|
| Product in feed | `consecutive_misses = 0` |
| Product missing 1st time | `consecutive_misses = 1` |
| Product missing 2nd time | `consecutive_misses = 2` |
| Product missing 3rd time | `consecutive_misses = 3`, mark `stock_status = 'oos'` |
| Product returns to feed | `consecutive_misses = 0`, update stock_status |

**Why 3 misses?**
- Protects against false positives from crawl failures
- A single bad crawl doesn't trigger mass OOS alerts
- Balances detection speed vs accuracy

---

## Database Tables

### store_products
Current state of each product:
- `provider_product_id` - Dutchie's MongoDB ObjectId
- `name_raw`, `brand_name_raw` - Raw values from feed
- `price_rec`, `price_med` - Current prices
- `is_in_stock`, `stock_status` - Availability
- `consecutive_misses` - OOS detection counter
- `last_seen_at` - Last time the product was in the feed

### store_product_snapshots
Point-in-time records for historical analysis:
- One row per product per crawl
- Captures price, stock, potency at that moment
- Used for price history, analytics

### dispensaries
Store metadata:
- `platform_dispensary_id` - MongoDB ObjectId for GraphQL
- `menu_url` - Source URL
- `last_crawl_at` - Last successful crawl
- `crawl_enabled` - Whether to crawl

---

## Worker Roles

Workers pull tasks from the `worker_tasks` queue based on their assigned role.

| Role | Name | Description | Handler |
|------|------|-------------|---------|
| `product_resync` | Product Resync | Re-crawl dispensary products for price/stock changes | `handleProductResync` |
| `product_discovery` | Product Discovery | Initial product discovery for new dispensaries | `handleProductDiscovery` |
| `store_discovery` | Store Discovery | Discover new dispensary locations | `handleStoreDiscovery` |
| `entry_point_discovery` | Entry Point Discovery | Resolve platform IDs from menu URLs | `handleEntryPointDiscovery` |
| `analytics_refresh` | Analytics Refresh | Refresh materialized views and analytics | `handleAnalyticsRefresh` |

**API Endpoint:** `GET /api/worker-registry/roles`

---

## Scheduling

Crawls are scheduled via the `worker_tasks` table:

| Role | Frequency | Description |
|------|-----------|-------------|
| `product_resync` | Every 4 hours | Regular product refresh |
| `product_discovery` | On-demand | First crawl for new stores |
| `entry_point_discovery` | On-demand | New store setup |
| `store_discovery` | Daily | Find new stores |
| `analytics_refresh` | Daily | Refresh analytics materialized views |

---

## Priority & On-Demand Tasks

Tasks are claimed by workers in order of **priority DESC, created_at ASC**.

### Priority Levels

| Priority | Use Case | Example |
|----------|----------|---------|
| 0 | Scheduled/batch tasks | Daily product_resync generation |
| 10 | On-demand/chained tasks | entry_point → product_discovery |
| Higher | Urgent/manual triggers | Admin-triggered immediate crawl |

### Task Chaining

When a task completes, the system automatically creates follow-up tasks:

```
store_discovery (completed)
  └─► entry_point_discovery (priority: 10) for each new store

entry_point_discovery (completed, success)
  └─► product_discovery (priority: 10) for that store

product_discovery (completed)
  └─► [no chain] Store enters regular resync schedule
```

### On-Demand Task Creation

Use the task service to create high-priority tasks:

```typescript
// Create immediate product resync for a store
await taskService.createTask({
  role: 'product_resync',
  dispensary_id: 123,
  platform: 'dutchie',
  priority: 20, // Higher than batch tasks
});

// Convenience methods with default high priority (10)
await taskService.createEntryPointTask(dispensaryId, 'dutchie');
await taskService.createProductDiscoveryTask(dispensaryId, 'dutchie');
await taskService.createStoreDiscoveryTask('dutchie', 'AZ');
```

### Claim Function

The `claim_task()` SQL function atomically claims tasks:
- Respects priority ordering (higher = first)
- Uses `FOR UPDATE SKIP LOCKED` for concurrency
- Prevents multiple active tasks per store
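The production logic lives in the `claim_task()` SQL function; this TypeScript sketch just illustrates the `FOR UPDATE SKIP LOCKED` claiming pattern it describes:

```typescript
import { Pool } from 'pg';

// Atomically claim the highest-priority ready task for a role.
// SKIP LOCKED lets concurrent workers pass over rows another worker is claiming.
async function claimNextTask(pool: Pool, role: string, workerId: string) {
  const { rows } = await pool.query(
    `UPDATE worker_tasks
     SET status = 'claimed', worker_id = $2, claimed_at = NOW()
     WHERE id = (
       SELECT id FROM worker_tasks
       WHERE role = $1
         AND status = 'pending'
         AND (scheduled_for IS NULL OR scheduled_for <= NOW())
       ORDER BY priority DESC, created_at ASC
       FOR UPDATE SKIP LOCKED
       LIMIT 1
     )
     RETURNING *`,
    [role, workerId]
  );
  return rows[0] ?? null; // null = nothing ready for this role
}
```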
---

## Image Storage

Images are downloaded from Dutchie's AWS S3 and stored locally with on-demand resizing.

### Storage Path
```
/storage/images/products/<state>/<store>/<brand>/<product_id>/image-<hash>.webp
/storage/images/brands/<brand>/logo-<hash>.webp
```

**Example:**
```
/storage/images/products/az/az-deeply-rooted/bud-bros/6913e3cd444eac3935e928b9/image-ae38b1f9.webp
```

### Image Proxy API
Served via `/img/*` with on-demand resizing using **sharp**:

```
GET /img/products/az/az-deeply-rooted/bud-bros/6913e3cd444eac3935e928b9/image-ae38b1f9.webp?w=200
```

| Param | Description |
|-------|-------------|
| `w` | Width in pixels (max 4000) |
| `h` | Height in pixels (max 4000) |
| `q` | Quality 1-100 (default 80) |
| `fit` | cover, contain, fill, inside, outside |
| `blur` | Blur sigma (0.3-1000) |
| `gray` | Grayscale (1 = enabled) |
| `format` | webp, jpeg, png, avif (default webp) |

### Key Files
| File | Purpose |
|------|---------|
| `src/utils/image-storage.ts` | Download & save images to local filesystem |
| `src/routes/image-proxy.ts` | On-demand resize/transform at `/img/*` |

### Download Rules

| Scenario | Image Action |
|----------|--------------|
| **New product (first crawl)** | Download if `primaryImageUrl` exists |
| **Existing product (refresh)** | Download only if `local_image_path` is NULL (backfill) |
| **Product already has local image** | Skip download entirely |

**Logic:**
- Images are downloaded **once** and never re-downloaded on subsequent crawls
- `skipIfExists: true` - a filesystem check prevents re-download even if queued
- First crawl: all products get images
- Refresh crawl: only new products or products missing local images

### Storage Rules
- **NO MinIO** - local filesystem only (`STORAGE_DRIVER=local`)
- Store full resolution, resize on-demand via the `/img` proxy
- Convert to webp for consistency using **sharp**
- Preserve the original Dutchie URL as a fallback in the `image_url` column
- Local path stored in the `local_image_path` column

---

## Stealth & Anti-Detection

**PROXIES ARE REQUIRED** - Workers will fail to start if no active proxies are available in the database. All HTTP requests to Dutchie go through a proxy.

Workers automatically initialize anti-detection systems on startup.

### Components

| Component | Purpose | Source |
|-----------|---------|--------|
| **CrawlRotator** | Coordinates proxy + UA rotation | `src/services/crawl-rotator.ts` |
| **ProxyRotator** | Round-robin proxy selection, health tracking | `src/services/crawl-rotator.ts` |
| **UserAgentRotator** | Cycles through realistic browser fingerprints | `src/services/crawl-rotator.ts` |
| **Dutchie Client** | Curl-based HTTP with auto-retry on 403 | `src/platforms/dutchie/client.ts` |

### Initialization Flow

```
Worker Start
  │
  ├─► initializeStealth()
  │     │
  │     ├─► CrawlRotator.initialize()
  │     │     └─► Load proxies from `proxies` table
  │     │
  │     └─► setCrawlRotator(rotator)
  │           └─► Wire to Dutchie client
  │
  └─► Process tasks...
```

### Stealth Session (per task)

Each crawl task starts a stealth session:

```typescript
// In product-refresh.ts, entry-point-discovery.ts
const session = startSession(dispensary.state || 'AZ', 'America/Phoenix');
```

This creates a new identity with:
- **Random fingerprint:** Chrome/Firefox/Safari/Edge on Win/Mac/Linux
- **Accept-Language:** Matches timezone (e.g., `America/Phoenix` → `en-US,en;q=0.9`)
- **sec-ch-ua headers:** Proper Client Hints for the browser profile

### On 403 Block

When Dutchie returns 403, the client automatically:

1. Records failure on the current proxy (increments `failure_count`)
2. If the proxy has 5+ failures, deactivates it
3. Rotates to the next healthy proxy
4. Rotates fingerprint
5. Retries the request
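An illustrative retry loop for this behavior; the method names are stand-ins for the CrawlRotator API in `src/services/crawl-rotator.ts`:

```typescript
// Rotate identity on 403 and retry, up to maxRetries additional attempts.
async function fetchWithRotation(
  doRequest: () => Promise<{ status: number; body: string }>,
  rotator: { recordFailure(): void; rotateProxy(): void; rotateFingerprint(): void },
  maxRetries = 3
) {
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    const res = await doRequest();
    if (res.status !== 403) return res;
    rotator.recordFailure();     // 5+ failures deactivates the proxy
    rotator.rotateProxy();       // move to the next healthy proxy
    rotator.rotateFingerprint(); // fresh browser identity
  }
  throw new Error('Still blocked after retries');
}
```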
### Proxy Table Schema

```sql
CREATE TABLE proxies (
  id SERIAL PRIMARY KEY,
  host VARCHAR(255) NOT NULL,
  port INTEGER NOT NULL,
  username VARCHAR(100),
  password VARCHAR(100),
  protocol VARCHAR(10) DEFAULT 'http', -- http, https, socks5
  is_active BOOLEAN DEFAULT true,
  last_used_at TIMESTAMPTZ,
  failure_count INTEGER DEFAULT 0,
  success_count INTEGER DEFAULT 0,
  avg_response_time_ms INTEGER,
  last_failure_at TIMESTAMPTZ,
  last_error TEXT
);
```

### Configuration

Proxies are mandatory. There is no environment variable to disable them. Workers will refuse to start without active proxies in the database.

### Fingerprints Available

The client includes 6 browser fingerprints:
- Chrome 131 on Windows
- Chrome 131 on macOS
- Chrome 120 on Windows
- Firefox 133 on Windows
- Safari 17.2 on macOS
- Edge 131 on Windows

Each includes proper `sec-ch-ua`, `sec-ch-ua-platform`, and `sec-ch-ua-mobile` headers.

---

## Error Handling

- **GraphQL errors:** Logged, task marked failed, retried later
- **Normalization errors:** Logged as warnings, continue with valid products
- **Image download errors:** Non-fatal, logged, continue
- **Database errors:** Task fails, will be retried
- **403 blocks:** Auto-rotate proxy + fingerprint, retry (up to 3 retries)

---

## Files

| File | Purpose |
|------|---------|
| `src/tasks/handlers/product-resync.ts` | Main crawl handler |
| `src/tasks/handlers/entry-point-discovery.ts` | Slug → ID resolution |
| `src/platforms/dutchie/index.ts` | GraphQL client, session management |
| `src/hydration/normalizers/dutchie.ts` | Payload normalization |
| `src/hydration/canonical-upsert.ts` | Database upsert logic |
| `src/utils/image-storage.ts` | Image download and local storage |
| `src/routes/image-proxy.ts` | On-demand image resizing |
| `migrations/075_consecutive_misses.sql` | OOS tracking column |
**backend/docs/WORKER_TASK_ARCHITECTURE.md** (new file, 400 lines)
@@ -0,0 +1,400 @@
|
||||
# Worker Task Architecture
|
||||
|
||||
This document describes the unified task-based worker system that replaces the legacy fragmented job systems.
|
||||
|
||||
## Overview
|
||||
|
||||
The task worker architecture provides a single, unified system for managing all background work in CannaiQ:
|
||||
|
||||
- **Store discovery** - Find new dispensaries on platforms
|
||||
- **Entry point discovery** - Resolve platform IDs from menu URLs
|
||||
- **Product discovery** - Initial product fetch for new stores
|
||||
- **Product resync** - Regular price/stock updates for existing stores
|
||||
- **Analytics refresh** - Refresh materialized views and analytics
|
||||
|
||||
## Architecture
|
||||
|
||||
### Database Tables
|
||||
|
||||
**`worker_tasks`** - Central task queue
|
||||
```sql
|
||||
CREATE TABLE worker_tasks (
|
||||
id SERIAL PRIMARY KEY,
|
||||
role task_role NOT NULL, -- What type of work
|
||||
dispensary_id INTEGER, -- Which store (if applicable)
|
||||
platform VARCHAR(50), -- Which platform (dutchie, etc.)
|
||||
status task_status DEFAULT 'pending',
|
||||
priority INTEGER DEFAULT 0, -- Higher = process first
|
||||
scheduled_for TIMESTAMP, -- Don't process before this time
|
||||
worker_id VARCHAR(100), -- Which worker claimed it
|
||||
claimed_at TIMESTAMP,
|
||||
started_at TIMESTAMP,
|
||||
completed_at TIMESTAMP,
|
||||
last_heartbeat_at TIMESTAMP, -- For stale detection
|
||||
result JSONB, -- Output from handler
|
||||
error_message TEXT,
|
||||
retry_count INTEGER DEFAULT 0,
|
||||
max_retries INTEGER DEFAULT 3,
|
||||
created_at TIMESTAMP DEFAULT NOW(),
|
||||
updated_at TIMESTAMP DEFAULT NOW()
|
||||
);
|
||||
```
|
||||
|
||||
**Key indexes:**
|
||||
- `idx_worker_tasks_pending_priority` - For efficient task claiming
|
||||
- `idx_worker_tasks_active_dispensary` - Prevents concurrent tasks per store (partial unique index)
|
||||
|
||||
### Task Roles
|
||||
|
||||
| Role | Purpose | Per-Store | Scheduled |
|
||||
|------|---------|-----------|-----------|
|
||||
| `store_discovery` | Find new stores on a platform | No | Daily |
|
||||
| `entry_point_discovery` | Resolve platform IDs | Yes | On-demand |
|
||||
| `product_discovery` | Initial product fetch | Yes | After entry_point |
|
||||
| `product_resync` | Price/stock updates | Yes | Every 4 hours |
|
||||
| `analytics_refresh` | Refresh MVs | No | Daily |
|
||||
|
||||
### Task Lifecycle
|
||||
|
||||
```
|
||||
pending → claimed → running → completed
|
||||
↓
|
||||
failed
|
||||
```
|
||||
|
||||
1. **pending** - Task is waiting to be picked up
|
||||
2. **claimed** - Worker has claimed it (atomic via SELECT FOR UPDATE SKIP LOCKED)
|
||||
3. **running** - Worker is actively processing
|
||||
4. **completed** - Task finished successfully
|
||||
5. **failed** - Task encountered an error
|
||||
6. **stale** - Task lost its worker (recovered automatically)
|
||||
|
||||
## Files
|
||||
|
||||
### Core Files
|
||||
|
||||
| File | Purpose |
|
||||
|------|---------|
|
||||
| `src/tasks/task-service.ts` | TaskService - CRUD, claiming, capacity metrics |
|
||||
| `src/tasks/task-worker.ts` | TaskWorker - Main worker loop |
|
||||
| `src/tasks/index.ts` | Module exports |
|
||||
| `src/routes/tasks.ts` | API endpoints |
|
||||
| `migrations/074_worker_task_queue.sql` | Database schema |
|
||||
|
||||
### Task Handlers
|
||||
|
||||
| File | Role |
|
||||
|------|------|
|
||||
| `src/tasks/handlers/store-discovery.ts` | `store_discovery` |
|
||||
| `src/tasks/handlers/entry-point-discovery.ts` | `entry_point_discovery` |
|
||||
| `src/tasks/handlers/product-discovery.ts` | `product_discovery` |
|
||||
| `src/tasks/handlers/product-resync.ts` | `product_resync` |
|
||||
| `src/tasks/handlers/analytics-refresh.ts` | `analytics_refresh` |
|
||||
|
||||
## Running Workers
|
||||
|
||||
### Environment Variables
|
||||
|
||||
| Variable | Default | Description |
|
||||
|----------|---------|-------------|
|
||||
| `WORKER_ROLE` | (required) | Which task role to process |
|
||||
| `WORKER_ID` | auto-generated | Custom worker identifier |
|
||||
| `POLL_INTERVAL_MS` | 5000 | How often to check for tasks |
|
||||
| `HEARTBEAT_INTERVAL_MS` | 30000 | How often to update heartbeat |
|
||||
|
||||
### Starting a Worker
|
||||
|
||||
```bash
|
||||
# Start a product resync worker
|
||||
WORKER_ROLE=product_resync npx tsx src/tasks/task-worker.ts
|
||||
|
||||
# Start with custom ID
|
||||
WORKER_ROLE=product_resync WORKER_ID=resync-1 npx tsx src/tasks/task-worker.ts
|
||||
|
||||
# Start multiple workers for different roles
|
||||
WORKER_ROLE=store_discovery npx tsx src/tasks/task-worker.ts &
|
||||
WORKER_ROLE=product_resync npx tsx src/tasks/task-worker.ts &
|
||||
```
|
||||
|
||||
### Kubernetes Deployment
|
||||
|
||||
```yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: task-worker-resync
|
||||
spec:
|
||||
replicas: 3
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: worker
|
||||
image: code.cannabrands.app/creationshop/dispensary-scraper:latest
|
||||
command: ["npx", "tsx", "src/tasks/task-worker.ts"]
|
||||
env:
|
||||
- name: WORKER_ROLE
|
||||
value: "product_resync"
|
||||
```

## API Endpoints

### Task Management

| Endpoint | Method | Description |
|----------|--------|-------------|
| `/api/tasks` | GET | List tasks with filters |
| `/api/tasks` | POST | Create a new task |
| `/api/tasks/:id` | GET | Get task by ID |
| `/api/tasks/counts` | GET | Get counts by status |
| `/api/tasks/capacity` | GET | Get capacity metrics |
| `/api/tasks/capacity/:role` | GET | Get role-specific capacity |
| `/api/tasks/recover-stale` | POST | Recover tasks from dead workers |

### Task Generation

| Endpoint | Method | Description |
|----------|--------|-------------|
| `/api/tasks/generate/resync` | POST | Generate daily resync tasks |
| `/api/tasks/generate/discovery` | POST | Create store discovery task |

### Migration (from legacy systems)

| Endpoint | Method | Description |
|----------|--------|-------------|
| `/api/tasks/migration/status` | GET | Compare old vs new systems |
| `/api/tasks/migration/disable-old-schedules` | POST | Disable job_schedules |
| `/api/tasks/migration/cancel-pending-crawl-jobs` | POST | Cancel old crawl jobs |
| `/api/tasks/migration/create-resync-tasks` | POST | Create tasks for all stores |
| `/api/tasks/migration/full-migrate` | POST | One-click migration |

### Role-Specific Endpoints

| Endpoint | Method | Description |
|----------|--------|-------------|
| `/api/tasks/role/:role/last-completion` | GET | Last completion time |
| `/api/tasks/role/:role/recent` | GET | Recent completions |
| `/api/tasks/store/:id/active` | GET | Check whether a store has an active task |

## Capacity Planning

The `v_worker_capacity` view provides real-time metrics:

```sql
SELECT * FROM v_worker_capacity;
```

Returns:

- `pending_tasks` - Tasks waiting to be claimed
- `ready_tasks` - Tasks ready now (`scheduled_for` is null or in the past)
- `claimed_tasks` - Tasks claimed but not started
- `running_tasks` - Tasks actively processing
- `completed_last_hour` - Recent completions
- `failed_last_hour` - Recent failures
- `active_workers` - Workers with recent heartbeats
- `avg_duration_sec` - Average task duration
- `tasks_per_worker_hour` - Throughput estimate
- `estimated_hours_to_drain` - Time to clear the queue
### Scaling Recommendations

```javascript
// API: GET /api/tasks/capacity/:role
{
  "role": "product_resync",
  "pending_tasks": 500,
  "active_workers": 3,
  "workers_needed": {
    "for_1_hour": 10,
    "for_4_hours": 3,
    "for_8_hours": 2
  }
}
```
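
The document doesn't spell out the formula behind `workers_needed`; a derivation consistent with the numbers above (assuming a throughput of `tasks_per_worker_hour`, here taken as 50) would be:

```typescript
// Assumed formula, not the shipped capacity logic: workers needed to drain
// `pendingTasks` within `hours`, given per-worker hourly throughput.
function workersNeeded(pendingTasks: number, tasksPerWorkerHour: number, hours: number): number {
  return Math.ceil(pendingTasks / (tasksPerWorkerHour * hours));
}

console.log(workersNeeded(500, 50, 1)); // 10 — matches "for_1_hour"
console.log(workersNeeded(500, 50, 4)); // 3  — matches "for_4_hours"
console.log(workersNeeded(500, 50, 8)); // 2  — matches "for_8_hours"
```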

## Task Chaining

Tasks can automatically create follow-up tasks:

```
store_discovery → entry_point_discovery → product_discovery
                                                 ↓
                          (store has platform_dispensary_id)
                                                 ↓
                                       Daily resync tasks
```

The `chainNextTask()` method handles this automatically; a sketch follows below.
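
A hedged sketch of what that chaining might look like. The types and helpers below are assumptions for illustration, not the actual `TaskService` API:

```typescript
// Sketch only: assumed shapes, not the shipped chainNextTask implementation.
type CompletedTask = { role: string; dispensary_id: number };
declare const taskService: {
  createTask(input: { role: string; dispensary_id: number }): Promise<void>;
};
declare function hasPlatformDispensaryId(dispensaryId: number): Promise<boolean>;

async function chainNextTask(task: CompletedTask): Promise<void> {
  if (task.role === 'store_discovery') {
    await taskService.createTask({ role: 'entry_point_discovery', dispensary_id: task.dispensary_id });
  } else if (task.role === 'entry_point_discovery') {
    // Only chain once the store has a resolved platform_dispensary_id.
    if (await hasPlatformDispensaryId(task.dispensary_id)) {
      await taskService.createTask({ role: 'product_discovery', dispensary_id: task.dispensary_id });
    }
  }
  // product_discovery graduates the store into the daily resync schedule,
  // which generate_resync_tasks() populates (see Scheduled Tasks below).
}
```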

## Stale Task Recovery

Tasks are considered stale when `last_heartbeat_at` is older than the threshold (default: 10 minutes).

```sql
SELECT recover_stale_tasks(10); -- 10-minute threshold
```

Or via the API:

```bash
curl -X POST /api/tasks/recover-stale \
  -H 'Content-Type: application/json' \
  -d '{"threshold_minutes": 10}'
```

## Migration from Legacy Systems

### Legacy Systems Replaced

1. **job_schedules + job_run_logs** - Scheduled job definitions
2. **dispensary_crawl_jobs** - Per-dispensary crawl queue
3. **SyncOrchestrator + HydrationWorker** - Raw payload processing

### Migration Steps

**Option 1: One-Click Migration**

```bash
curl -X POST /api/tasks/migration/full-migrate
```

This will:

1. Disable all job_schedules
2. Cancel pending dispensary_crawl_jobs
3. Generate resync tasks for all stores
4. Create discovery and analytics tasks

**Option 2: Manual Migration**

```bash
# 1. Check current status
curl /api/tasks/migration/status

# 2. Disable old schedules
curl -X POST /api/tasks/migration/disable-old-schedules

# 3. Cancel pending crawl jobs
curl -X POST /api/tasks/migration/cancel-pending-crawl-jobs

# 4. Create resync tasks
curl -X POST /api/tasks/migration/create-resync-tasks \
  -H 'Content-Type: application/json' \
  -d '{"state_code": "AZ"}'

# 5. Generate the daily resync schedule
curl -X POST /api/tasks/generate/resync \
  -H 'Content-Type: application/json' \
  -d '{"batches_per_day": 6}'
```

## Per-Store Locking

The system prevents concurrent tasks for the same store with a partial unique index:

```sql
CREATE UNIQUE INDEX idx_worker_tasks_active_dispensary
ON worker_tasks (dispensary_id)
WHERE dispensary_id IS NOT NULL
  AND status IN ('claimed', 'running');
```

This ensures at most one task can be active per store at any time.
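
As an illustration (the task and dispensary IDs below are hypothetical), moving a second task into `claimed` for a store that already has an active one is rejected by Postgres:

```sql
-- Hypothetical: task 101 is already 'claimed' for dispensary 456.
UPDATE worker_tasks SET status = 'claimed' WHERE id = 102; -- also dispensary 456
-- ERROR:  duplicate key value violates unique constraint
--         "idx_worker_tasks_active_dispensary"
```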

## Task Priority

Tasks are claimed in priority order (highest first), then by creation time:

```sql
ORDER BY priority DESC, created_at ASC
```

Default priorities:

- `store_discovery`: 0
- `entry_point_discovery`: 10 (high - new stores)
- `product_discovery`: 10 (high - new stores)
- `product_resync`: 0
- `analytics_refresh`: 0

## Scheduled Tasks

Tasks can be scheduled for future execution:

```javascript
await taskService.createTask({
  role: 'product_resync',
  dispensary_id: 123,
  scheduled_for: new Date('2025-01-10T06:00:00Z'),
});
```

The `generate_resync_tasks()` function creates staggered tasks throughout the day:

```sql
SELECT generate_resync_tasks(6, '2025-01-10'); -- 6 batches = every 4 hours
```
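
The staggering logic itself isn't shown here; a plausible sketch, assuming stores are split round-robin across batches (the real function presumably ships with `migrations/074_worker_task_queue.sql`):

```sql
-- Sketch of generate_resync_tasks(6, '2025-01-10') behavior (assumed, not
-- the shipped function): round-robin stores into 6 batches, 24/6 = 4h apart.
INSERT INTO worker_tasks (role, dispensary_id, scheduled_for)
SELECT
    'product_resync',
    d.id,
    DATE '2025-01-10'
        + ((ROW_NUMBER() OVER (ORDER BY d.id))::int % 6) * INTERVAL '4 hours'
FROM dispensaries d
WHERE d.crawl_enabled = TRUE;
```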

## Dashboard Integration

The admin dashboard shows task queue status in the main overview:

```
Task Queue Summary
------------------
Pending:   45
Running:   3
Completed: 1,234
Failed:    12
```

Full task management is available at `/admin/tasks`.

## Error Handling

Failed tasks record the error message in `error_message` and can be retried:

```sql
-- View failed tasks
SELECT id, role, dispensary_id, error_message, retry_count
FROM worker_tasks
WHERE status = 'failed'
ORDER BY completed_at DESC
LIMIT 20;

-- Retry failed tasks
UPDATE worker_tasks
SET status = 'pending', retry_count = retry_count + 1
WHERE status = 'failed' AND retry_count < max_retries;
```

## Monitoring

### Logs

Workers log to stdout:

```
[TaskWorker] Starting worker worker-product_resync-a1b2c3d4 for role: product_resync
[TaskWorker] Claimed task 123 (product_resync) for dispensary 456
[TaskWorker] Task 123 completed successfully
```

### Health Check

Check whether workers are active:

```sql
SELECT worker_id, role, COUNT(*), MAX(last_heartbeat_at)
FROM worker_tasks
WHERE last_heartbeat_at > NOW() - INTERVAL '5 minutes'
GROUP BY worker_id, role;
```

### Metrics

```sql
-- Tasks by status
SELECT status, COUNT(*) FROM worker_tasks GROUP BY status;

-- Tasks by role
SELECT role, status, COUNT(*) FROM worker_tasks GROUP BY role, status;

-- Average duration by role
SELECT role, AVG(EXTRACT(EPOCH FROM (completed_at - started_at))) AS avg_seconds
FROM worker_tasks
WHERE status = 'completed' AND completed_at > NOW() - INTERVAL '24 hours'
GROUP BY role;
```

backend/k8s/cronjob-ip2location.yaml (new file, 69 lines)
@@ -0,0 +1,69 @@
apiVersion: batch/v1
kind: CronJob
metadata:
  name: ip2location-update
  namespace: default
spec:
  # Run on the 1st of every month at 3am UTC
  schedule: "0 3 1 * *"
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 3
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      template:
        spec:
          containers:
            - name: ip2location-updater
              image: curlimages/curl:latest
              command:
                - /bin/sh
                - -c
                - |
                  set -e
                  echo "Downloading IP2Location LITE DB5..."

                  # Download to temp
                  cd /tmp
                  curl -L -o ip2location.zip "https://www.ip2location.com/download/?token=${IP2LOCATION_TOKEN}&file=DB5LITEBIN"

                  # Extract
                  unzip -o ip2location.zip

                  # Find and copy the BIN file
                  BIN_FILE=$(ls *.BIN 2>/dev/null | head -1)
                  if [ -z "$BIN_FILE" ]; then
                    echo "ERROR: No BIN file found"
                    exit 1
                  fi

                  # Copy to shared volume
                  cp "$BIN_FILE" /data/IP2LOCATION-LITE-DB5.BIN

                  echo "Done! Database updated: /data/IP2LOCATION-LITE-DB5.BIN"
              env:
                - name: IP2LOCATION_TOKEN
                  valueFrom:
                    secretKeyRef:
                      name: dutchie-backend-secret
                      key: IP2LOCATION_TOKEN
              volumeMounts:
                - name: ip2location-data
                  mountPath: /data
          restartPolicy: OnFailure
          volumes:
            - name: ip2location-data
              persistentVolumeClaim:
                claimName: ip2location-pvc
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: ip2location-pvc
  namespace: default
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 100Mi

@@ -26,6 +26,12 @@ spec:
             name: dutchie-backend-config
         - secretRef:
             name: dutchie-backend-secret
+        env:
+          - name: IP2LOCATION_DB_PATH
+            value: /data/ip2location/IP2LOCATION-LITE-DB5.BIN
+        volumeMounts:
+          - name: ip2location-data
+            mountPath: /data/ip2location
         resources:
           requests:
             memory: "256Mi"
@@ -45,3 +51,7 @@ spec:
             port: 3010
           initialDelaySeconds: 5
           periodSeconds: 5
+      volumes:
+        - name: ip2location-data
+          persistentVolumeClaim:
+            claimName: ip2location-pvc

@@ -1,18 +1,18 @@
 -- Add location columns to proxies table
 ALTER TABLE proxies
-ADD COLUMN city VARCHAR(100),
-ADD COLUMN state VARCHAR(100),
-ADD COLUMN country VARCHAR(100),
-ADD COLUMN country_code VARCHAR(2),
-ADD COLUMN location_updated_at TIMESTAMP;
+ADD COLUMN IF NOT EXISTS city VARCHAR(100),
+ADD COLUMN IF NOT EXISTS state VARCHAR(100),
+ADD COLUMN IF NOT EXISTS country VARCHAR(100),
+ADD COLUMN IF NOT EXISTS country_code VARCHAR(2),
+ADD COLUMN IF NOT EXISTS location_updated_at TIMESTAMP;

 -- Add index for location-based queries
-CREATE INDEX idx_proxies_location ON proxies(country_code, state, city);
+CREATE INDEX IF NOT EXISTS idx_proxies_location ON proxies(country_code, state, city);

 -- Add the same to failed_proxies table
 ALTER TABLE failed_proxies
-ADD COLUMN city VARCHAR(100),
-ADD COLUMN state VARCHAR(100),
-ADD COLUMN country VARCHAR(100),
-ADD COLUMN country_code VARCHAR(2),
-ADD COLUMN location_updated_at TIMESTAMP;
+ADD COLUMN IF NOT EXISTS city VARCHAR(100),
+ADD COLUMN IF NOT EXISTS state VARCHAR(100),
+ADD COLUMN IF NOT EXISTS country VARCHAR(100),
+ADD COLUMN IF NOT EXISTS country_code VARCHAR(2),
+ADD COLUMN IF NOT EXISTS location_updated_at TIMESTAMP;

@@ -1,6 +1,6 @@
 -- Create dispensaries table as single source of truth
 -- This consolidates azdhs_list (official data) + stores (menu data) into one table
-CREATE TABLE dispensaries (
+CREATE TABLE IF NOT EXISTS dispensaries (
     -- Primary key
     id SERIAL PRIMARY KEY,

@@ -43,11 +43,11 @@ CREATE TABLE dispensaries (
 );

 -- Create indexes for common queries
-CREATE INDEX idx_dispensaries_city ON dispensaries(city);
-CREATE INDEX idx_dispensaries_state ON dispensaries(state);
-CREATE INDEX idx_dispensaries_slug ON dispensaries(slug);
-CREATE INDEX idx_dispensaries_azdhs_id ON dispensaries(azdhs_id);
-CREATE INDEX idx_dispensaries_menu_status ON dispensaries(menu_scrape_status);
+CREATE INDEX IF NOT EXISTS idx_dispensaries_city ON dispensaries(city);
+CREATE INDEX IF NOT EXISTS idx_dispensaries_state ON dispensaries(state);
+CREATE INDEX IF NOT EXISTS idx_dispensaries_slug ON dispensaries(slug);
+CREATE INDEX IF NOT EXISTS idx_dispensaries_azdhs_id ON dispensaries(azdhs_id);
+CREATE INDEX IF NOT EXISTS idx_dispensaries_menu_status ON dispensaries(menu_scrape_status);

 -- Create index for location-based queries
-CREATE INDEX idx_dispensaries_location ON dispensaries(latitude, longitude) WHERE latitude IS NOT NULL AND longitude IS NOT NULL;
+CREATE INDEX IF NOT EXISTS idx_dispensaries_location ON dispensaries(latitude, longitude) WHERE latitude IS NOT NULL AND longitude IS NOT NULL;

@@ -1,6 +1,6 @@
 -- Create dispensary_changes table for change approval workflow
 -- This protects against accidental data destruction by requiring manual review
-CREATE TABLE dispensary_changes (
+CREATE TABLE IF NOT EXISTS dispensary_changes (
     id SERIAL PRIMARY KEY,
     dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,

@@ -26,10 +26,10 @@ CREATE TABLE dispensary_changes (
 );

 -- Create indexes for common queries
-CREATE INDEX idx_dispensary_changes_status ON dispensary_changes(status);
-CREATE INDEX idx_dispensary_changes_dispensary_status ON dispensary_changes(dispensary_id, status);
-CREATE INDEX idx_dispensary_changes_created_at ON dispensary_changes(created_at DESC);
-CREATE INDEX idx_dispensary_changes_requires_recrawl ON dispensary_changes(requires_recrawl) WHERE requires_recrawl = TRUE;
+CREATE INDEX IF NOT EXISTS idx_dispensary_changes_status ON dispensary_changes(status);
+CREATE INDEX IF NOT EXISTS idx_dispensary_changes_dispensary_status ON dispensary_changes(dispensary_id, status);
+CREATE INDEX IF NOT EXISTS idx_dispensary_changes_created_at ON dispensary_changes(created_at DESC);
+CREATE INDEX IF NOT EXISTS idx_dispensary_changes_requires_recrawl ON dispensary_changes(requires_recrawl) WHERE requires_recrawl = TRUE;

 -- Create function to automatically set requires_recrawl for website/menu_url changes
 CREATE OR REPLACE FUNCTION set_requires_recrawl()

@@ -42,7 +42,8 @@ BEGIN
 END;
 $$ LANGUAGE plpgsql;

--- Create trigger to call the function
+-- Create trigger to call the function (drop first to make idempotent)
+DROP TRIGGER IF EXISTS trigger_set_requires_recrawl ON dispensary_changes;
 CREATE TRIGGER trigger_set_requires_recrawl
 BEFORE INSERT ON dispensary_changes
 FOR EACH ROW

@@ -1,6 +1,7 @@
 -- Populate dispensaries table from azdhs_list
 -- This migrates all 182 AZDHS records with their enriched Google Maps data
 -- For multi-location dispensaries with duplicate slugs, append city name to make unique
+-- IDEMPOTENT: Uses ON CONFLICT DO NOTHING to skip already-imported records

 WITH ranked_dispensaries AS (
     SELECT

@@ -78,9 +79,10 @@
     created_at,
     updated_at
 FROM ranked_dispensaries
-ORDER BY id;
+ORDER BY id
+ON CONFLICT (azdhs_id) DO NOTHING;

--- Verify the migration
+-- Verify the migration (idempotent - just logs, doesn't fail)
 DO $$
 DECLARE
     source_count INTEGER;

@@ -89,9 +91,11 @@ BEGIN
     SELECT COUNT(*) INTO source_count FROM azdhs_list;
     SELECT COUNT(*) INTO dest_count FROM dispensaries;

-    RAISE NOTICE 'Migration complete: % records from azdhs_list → % records in dispensaries', source_count, dest_count;
+    RAISE NOTICE 'Migration status: % records in azdhs_list, % records in dispensaries', source_count, dest_count;

-    IF source_count != dest_count THEN
-        RAISE EXCEPTION 'Record count mismatch! Expected %, got %', source_count, dest_count;
+    IF dest_count >= source_count THEN
+        RAISE NOTICE 'OK: dispensaries table has expected records';
+    ELSE
+        RAISE WARNING 'dispensaries has fewer records than azdhs_list (% vs %)', dest_count, source_count;
     END IF;
 END $$;

@@ -3,15 +3,15 @@

 -- Add dispensary_id to products table
 ALTER TABLE products
-ADD COLUMN dispensary_id INTEGER REFERENCES dispensaries(id) ON DELETE CASCADE;
+ADD COLUMN IF NOT EXISTS dispensary_id INTEGER REFERENCES dispensaries(id) ON DELETE CASCADE;

 -- Add dispensary_id to categories table
 ALTER TABLE categories
-ADD COLUMN dispensary_id INTEGER REFERENCES dispensaries(id) ON DELETE CASCADE;
+ADD COLUMN IF NOT EXISTS dispensary_id INTEGER REFERENCES dispensaries(id) ON DELETE CASCADE;

 -- Create indexes for the new foreign keys
-CREATE INDEX idx_products_dispensary_id ON products(dispensary_id);
-CREATE INDEX idx_categories_dispensary_id ON categories(dispensary_id);
+CREATE INDEX IF NOT EXISTS idx_products_dispensary_id ON products(dispensary_id);
+CREATE INDEX IF NOT EXISTS idx_categories_dispensary_id ON categories(dispensary_id);

 -- NOTE: We'll populate these FKs and migrate data from stores in a separate data migration
 -- For now, new scrapers should use dispensary_id, but old store_id still works

backend/migrations/051_worker_definitions.sql (new file, 119 lines)
@@ -0,0 +1,119 @@
-- Migration 051: Worker Definitions
-- Creates a dedicated workers table for named workers with roles and assignments

-- Workers table - defines named workers with roles
CREATE TABLE IF NOT EXISTS workers (
    id SERIAL PRIMARY KEY,
    name VARCHAR(100) NOT NULL UNIQUE,
    role VARCHAR(100) NOT NULL,
    description TEXT,
    enabled BOOLEAN DEFAULT TRUE,

    -- Schedule configuration (for dedicated crawl workers)
    schedule_type VARCHAR(50) DEFAULT 'interval', -- 'interval', 'cron', 'manual'
    interval_minutes INTEGER DEFAULT 240,
    cron_expression VARCHAR(100), -- e.g., '0 */4 * * *'
    jitter_minutes INTEGER DEFAULT 30,

    -- Assignment scope
    assignment_type VARCHAR(50) DEFAULT 'all', -- 'all', 'state', 'dispensary', 'chain'
    assigned_state_codes TEXT[], -- e.g., ['AZ', 'CA']
    assigned_dispensary_ids INTEGER[],
    assigned_chain_ids INTEGER[],

    -- Job configuration
    job_type VARCHAR(50) NOT NULL DEFAULT 'dutchie_product_crawl',
    job_config JSONB DEFAULT '{}',
    priority INTEGER DEFAULT 0,
    max_concurrent INTEGER DEFAULT 1,

    -- Status tracking
    status VARCHAR(50) DEFAULT 'idle', -- 'idle', 'running', 'paused', 'error'
    last_run_at TIMESTAMPTZ,
    last_status VARCHAR(50),
    last_error TEXT,
    last_duration_ms INTEGER,
    next_run_at TIMESTAMPTZ,
    current_job_id INTEGER,

    -- Metrics
    total_runs INTEGER DEFAULT 0,
    successful_runs INTEGER DEFAULT 0,
    failed_runs INTEGER DEFAULT 0,
    avg_duration_ms INTEGER,

    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW()
);

-- Worker run history
CREATE TABLE IF NOT EXISTS worker_runs (
    id SERIAL PRIMARY KEY,
    worker_id INTEGER NOT NULL REFERENCES workers(id) ON DELETE CASCADE,
    started_at TIMESTAMPTZ DEFAULT NOW(),
    completed_at TIMESTAMPTZ,
    status VARCHAR(50) DEFAULT 'running', -- 'running', 'success', 'error', 'cancelled'
    duration_ms INTEGER,

    -- What was processed
    jobs_created INTEGER DEFAULT 0,
    jobs_completed INTEGER DEFAULT 0,
    jobs_failed INTEGER DEFAULT 0,
    dispensaries_crawled INTEGER DEFAULT 0,
    products_found INTEGER DEFAULT 0,

    error_message TEXT,
    metadata JSONB DEFAULT '{}',

    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Indexes for efficient lookups
CREATE INDEX IF NOT EXISTS idx_workers_enabled ON workers(enabled) WHERE enabled = TRUE;
CREATE INDEX IF NOT EXISTS idx_workers_next_run ON workers(next_run_at) WHERE enabled = TRUE;
CREATE INDEX IF NOT EXISTS idx_workers_status ON workers(status);
CREATE INDEX IF NOT EXISTS idx_worker_runs_worker_id ON worker_runs(worker_id);
CREATE INDEX IF NOT EXISTS idx_worker_runs_started_at ON worker_runs(started_at DESC);

-- Add worker_id to dispensary_crawl_jobs if not exists
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM information_schema.columns
        WHERE table_name = 'dispensary_crawl_jobs' AND column_name = 'assigned_worker_id'
    ) THEN
        ALTER TABLE dispensary_crawl_jobs ADD COLUMN assigned_worker_id INTEGER REFERENCES workers(id);
    END IF;
END $$;

-- Migrate existing job_schedules workers to the new workers table
INSERT INTO workers (name, role, description, enabled, interval_minutes, jitter_minutes, job_type, job_config, last_run_at, last_status, last_error, last_duration_ms, next_run_at)
SELECT
    worker_name,
    worker_role,
    description,
    enabled,
    base_interval_minutes,
    jitter_minutes,
    job_name,
    job_config,
    last_run_at,
    last_status,
    last_error_message,
    last_duration_ms,
    next_run_at
FROM job_schedules
WHERE worker_name IS NOT NULL
ON CONFLICT (name) DO UPDATE SET
    updated_at = NOW();

-- Available worker roles (reference)
COMMENT ON TABLE workers IS 'Named workers with specific roles and assignments. Roles include:
- product_sync: Crawls products from dispensary menus
- store_discovery: Discovers new dispensary locations
- entry_point_finder: Detects menu providers and resolves platform IDs
- analytics_refresh: Refreshes materialized views and analytics
- price_monitor: Monitors price changes and triggers alerts
- inventory_sync: Syncs inventory levels
- image_processor: Downloads and processes product images
- data_validator: Validates data integrity';

backend/migrations/052_seo_settings.sql (new file, 49 lines)
@@ -0,0 +1,49 @@
-- Migration 052: SEO Settings Table
-- Key/value store for SEO Orchestrator configuration

CREATE TABLE IF NOT EXISTS seo_settings (
    id SERIAL PRIMARY KEY,
    key TEXT UNIQUE NOT NULL,
    value JSONB NOT NULL,
    created_at TIMESTAMP DEFAULT NOW(),
    updated_at TIMESTAMP DEFAULT NOW()
);

-- Create index on key for fast lookups
CREATE INDEX IF NOT EXISTS idx_seo_settings_key ON seo_settings(key);

-- Seed with default settings
INSERT INTO seo_settings (key, value) VALUES
    -- Section 1: Global Content Generation Settings
    ('primary_prompt_template', '"You are a cannabis industry content expert. Generate SEO-optimized content for {{page_type}} pages about {{subject}}. Focus on: {{focus_areas}}. Maintain a {{tone}} tone and keep content {{length}}."'),
    ('regeneration_prompt_template', '"Regenerate the following SEO content with fresh perspectives. Original topic: {{subject}}. Improve upon: {{improvement_areas}}. Maintain compliance with cannabis industry standards."'),
    ('default_content_length', '"medium"'),
    ('tone_voice', '"informational"'),

    -- Section 2: Automatic Refresh Rules
    ('auto_refresh_interval', '"weekly"'),
    ('trigger_pct_product_change', 'true'),
    ('trigger_pct_brand_change', 'true'),
    ('trigger_new_stores', 'true'),
    ('trigger_market_shift', 'false'),
    ('webhook_url', '""'),
    ('notify_on_trigger', 'false'),

    -- Section 3: Page-Level Defaults
    ('default_title_template', '"{{state_name}} Dispensaries | Find Cannabis Near You | CannaiQ"'),
    ('default_meta_description_template', '"Discover the best dispensaries in {{state_name}}. Browse {{dispensary_count}}+ licensed retailers, compare prices, and find cannabis products near you."'),
    ('default_slug_template', '"dispensaries-{{state_code_lower}}"'),
    ('default_og_image_template', '"/images/seo/og-{{state_code_lower}}.jpg"'),
    ('enable_ai_images', 'false'),

    -- Section 4: Crawl / Dataset Configuration
    ('primary_data_provider', '"cannaiq"'),
    ('fallback_data_provider', '"dutchie"'),
    ('min_data_freshness_hours', '24'),
    ('stale_data_behavior', '"allow_with_warning"')
ON CONFLICT (key) DO NOTHING;

-- Record migration
INSERT INTO schema_migrations (version, name, applied_at)
VALUES ('052', 'seo_settings', NOW())
ON CONFLICT (version) DO NOTHING;

@@ -0,0 +1,42 @@
-- Migration 057: Add crawl_enabled and dutchie_verified fields to dispensaries
--
-- Purpose:
-- 1. Add crawl_enabled to control which dispensaries get crawled
-- 2. Add dutchie_verified to track Dutchie source-of-truth verification
-- 3. Default existing records to crawl_enabled = TRUE to preserve behavior
--
-- After this migration, run the harmonization script to:
--   - Match dispensaries to Dutchie discoveries
--   - Update platform_dispensary_id from Dutchie
--   - Set dutchie_verified = TRUE for matches
--   - Set crawl_enabled = FALSE for unverified records

-- Add crawl_enabled column (defaults to true to not break existing crawls)
ALTER TABLE dispensaries
ADD COLUMN IF NOT EXISTS crawl_enabled BOOLEAN DEFAULT TRUE;

-- Add dutchie_verified column to track if record is verified against Dutchie
ALTER TABLE dispensaries
ADD COLUMN IF NOT EXISTS dutchie_verified BOOLEAN DEFAULT FALSE;

-- Add dutchie_verified_at timestamp
ALTER TABLE dispensaries
ADD COLUMN IF NOT EXISTS dutchie_verified_at TIMESTAMP WITH TIME ZONE;

-- Add dutchie_discovery_id to link back to the discovery record
ALTER TABLE dispensaries
ADD COLUMN IF NOT EXISTS dutchie_discovery_id BIGINT REFERENCES dutchie_discovery_locations(id);

-- Create index for crawl queries (only crawl enabled dispensaries)
CREATE INDEX IF NOT EXISTS idx_dispensaries_crawl_enabled
    ON dispensaries(crawl_enabled, state)
    WHERE crawl_enabled = TRUE;

-- Create index for dutchie verification status
CREATE INDEX IF NOT EXISTS idx_dispensaries_dutchie_verified
    ON dispensaries(dutchie_verified, state);

COMMENT ON COLUMN dispensaries.crawl_enabled IS 'Whether this dispensary should be included in crawl jobs. Set to FALSE for unverified or problematic records.';
COMMENT ON COLUMN dispensaries.dutchie_verified IS 'Whether this dispensary has been verified against Dutchie source of truth (matched by slug or manually linked).';
COMMENT ON COLUMN dispensaries.dutchie_verified_at IS 'Timestamp when Dutchie verification was completed.';
COMMENT ON COLUMN dispensaries.dutchie_discovery_id IS 'Link to the dutchie_discovery_locations record this was matched/verified against.';

backend/migrations/065_slug_verification_tracking.sql (new file, 56 lines)
@@ -0,0 +1,56 @@
-- Migration 065: Slug verification and data source tracking
-- Adds columns to track when slug/menu data was verified and from what source

-- Add slug verification columns to dispensaries
ALTER TABLE dispensaries
ADD COLUMN IF NOT EXISTS slug_source VARCHAR(50),
ADD COLUMN IF NOT EXISTS slug_verified_at TIMESTAMPTZ,
ADD COLUMN IF NOT EXISTS slug_status VARCHAR(20) DEFAULT 'unverified',
ADD COLUMN IF NOT EXISTS menu_url_source VARCHAR(50),
ADD COLUMN IF NOT EXISTS menu_url_verified_at TIMESTAMPTZ,
ADD COLUMN IF NOT EXISTS platform_id_source VARCHAR(50),
ADD COLUMN IF NOT EXISTS platform_id_verified_at TIMESTAMPTZ,
ADD COLUMN IF NOT EXISTS country VARCHAR(2) DEFAULT 'US';

-- Add index for finding unverified stores
CREATE INDEX IF NOT EXISTS idx_dispensaries_slug_status
    ON dispensaries(slug_status)
    WHERE slug_status != 'verified';

-- Add index for country
CREATE INDEX IF NOT EXISTS idx_dispensaries_country
    ON dispensaries(country);

-- Comment on columns
COMMENT ON COLUMN dispensaries.slug_source IS 'Source of slug data: dutchie_api, manual, azdhs, discovery, etc.';
COMMENT ON COLUMN dispensaries.slug_verified_at IS 'When the slug was last verified against the source';
COMMENT ON COLUMN dispensaries.slug_status IS 'Status: unverified, verified, invalid, changed';
COMMENT ON COLUMN dispensaries.menu_url_source IS 'Source of menu_url: dutchie_api, website_scrape, manual, etc.';
COMMENT ON COLUMN dispensaries.menu_url_verified_at IS 'When the menu_url was last verified';
COMMENT ON COLUMN dispensaries.platform_id_source IS 'Source of platform_dispensary_id: dutchie_api, graphql_resolution, etc.';
COMMENT ON COLUMN dispensaries.platform_id_verified_at IS 'When the platform_dispensary_id was last verified';
COMMENT ON COLUMN dispensaries.country IS 'ISO 2-letter country code: US, CA, etc.';

-- Update Green Pharms Mesa with verified Dutchie data
UPDATE dispensaries
SET
    slug = 'green-pharms-mesa',
    menu_url = 'https://dutchie.com/embedded-menu/green-pharms-mesa',
    menu_type = 'dutchie',
    platform_dispensary_id = '68dc47a2af90f2e653f8df30',
    slug_source = 'dutchie_api',
    slug_verified_at = NOW(),
    slug_status = 'verified',
    menu_url_source = 'dutchie_api',
    menu_url_verified_at = NOW(),
    platform_id_source = 'dutchie_api',
    platform_id_verified_at = NOW(),
    updated_at = NOW()
WHERE id = 232;

-- Mark all other AZ dispensaries as needing verification
UPDATE dispensaries
SET slug_status = 'unverified'
WHERE state = 'AZ'
  AND id != 232
  AND (slug_status IS NULL OR slug_status = 'unverified');

backend/migrations/066_dutchie_field_alignment.sql (new file, 140 lines)
@@ -0,0 +1,140 @@
-- Migration 066: Align dispensaries and discovery_locations tables with Dutchie field names
-- Uses snake_case convention (Postgres standard) mapped from Dutchie's camelCase
--
-- Changes:
-- 1. dispensaries: rename address→address1, zip→zipcode, remove company_name
-- 2. dispensaries: add missing Dutchie fields
-- 3. dutchie_discovery_locations: add missing Dutchie fields

-- ============================================================================
-- DISPENSARIES TABLE
-- ============================================================================

-- Rename address to address1 (matches Dutchie's address1)
ALTER TABLE dispensaries RENAME COLUMN address TO address1;

-- Rename zip to zipcode (matches Dutchie's zip, but we use zipcode for clarity)
ALTER TABLE dispensaries RENAME COLUMN zip TO zipcode;

-- Drop company_name (redundant with name)
ALTER TABLE dispensaries DROP COLUMN IF EXISTS company_name;

-- Add address2
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS address2 VARCHAR(255);

-- Add country
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS country VARCHAR(100) DEFAULT 'United States';

-- Add timezone
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS timezone VARCHAR(50);

-- Add email
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS email VARCHAR(255);

-- Add description
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS description TEXT;

-- Add logo_image (Dutchie: logoImage)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS logo_image TEXT;

-- Add banner_image (Dutchie: bannerImage)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS banner_image TEXT;

-- Add offer_pickup (Dutchie: offerPickup)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS offer_pickup BOOLEAN DEFAULT TRUE;

-- Add offer_delivery (Dutchie: offerDelivery)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS offer_delivery BOOLEAN DEFAULT FALSE;

-- Add offer_curbside_pickup (Dutchie: offerCurbsidePickup)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS offer_curbside_pickup BOOLEAN DEFAULT FALSE;

-- Add is_medical (Dutchie: isMedical)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS is_medical BOOLEAN DEFAULT FALSE;

-- Add is_recreational (Dutchie: isRecreational)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS is_recreational BOOLEAN DEFAULT FALSE;

-- Add chain_slug (Dutchie: chain)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS chain_slug VARCHAR(255);

-- Add enterprise_id (Dutchie: retailer.enterpriseId)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS enterprise_id VARCHAR(100);

-- Add status (Dutchie: status - open/closed)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS status VARCHAR(50);

-- Add c_name (Dutchie: cName - the URL slug used in embedded menus)
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS c_name VARCHAR(255);

-- ============================================================================
-- DUTCHIE_DISCOVERY_LOCATIONS TABLE
-- ============================================================================

-- Add phone
ALTER TABLE dutchie_discovery_locations ADD COLUMN IF NOT EXISTS phone VARCHAR(50);

-- Add website (Dutchie: embedBackUrl)
ALTER TABLE dutchie_discovery_locations ADD COLUMN IF NOT EXISTS website TEXT;

-- Add email
ALTER TABLE dutchie_discovery_locations ADD COLUMN IF NOT EXISTS email VARCHAR(255);

-- Add description
ALTER TABLE dutchie_discovery_locations ADD COLUMN IF NOT EXISTS description TEXT;

-- Add logo_image
ALTER TABLE dutchie_discovery_locations ADD COLUMN IF NOT EXISTS logo_image TEXT;

-- Add banner_image
ALTER TABLE dutchie_discovery_locations ADD COLUMN IF NOT EXISTS banner_image TEXT;

-- Add chain_slug
ALTER TABLE dutchie_discovery_locations ADD COLUMN IF NOT EXISTS chain_slug VARCHAR(255);

-- Add enterprise_id
ALTER TABLE dutchie_discovery_locations ADD COLUMN IF NOT EXISTS enterprise_id VARCHAR(100);

-- Add c_name
ALTER TABLE dutchie_discovery_locations ADD COLUMN IF NOT EXISTS c_name VARCHAR(255);

-- Add country
ALTER TABLE dutchie_discovery_locations ADD COLUMN IF NOT EXISTS country VARCHAR(100) DEFAULT 'United States';

-- Add store status
ALTER TABLE dutchie_discovery_locations ADD COLUMN IF NOT EXISTS store_status VARCHAR(50);

-- ============================================================================
-- INDEXES
-- ============================================================================

-- Index for chain lookups
CREATE INDEX IF NOT EXISTS idx_dispensaries_chain_slug ON dispensaries(chain_slug) WHERE chain_slug IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_discovery_locations_chain_slug ON dutchie_discovery_locations(chain_slug) WHERE chain_slug IS NOT NULL;

-- Index for enterprise lookups (for multi-location chains)
CREATE INDEX IF NOT EXISTS idx_dispensaries_enterprise_id ON dispensaries(enterprise_id) WHERE enterprise_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_discovery_locations_enterprise_id ON dutchie_discovery_locations(enterprise_id) WHERE enterprise_id IS NOT NULL;

-- Index for c_name lookups
CREATE INDEX IF NOT EXISTS idx_dispensaries_c_name ON dispensaries(c_name) WHERE c_name IS NOT NULL;

-- ============================================================================
-- COMMENTS
-- ============================================================================

COMMENT ON COLUMN dispensaries.address1 IS 'Street address line 1 (Dutchie: address1)';
COMMENT ON COLUMN dispensaries.address2 IS 'Street address line 2 (Dutchie: address2)';
COMMENT ON COLUMN dispensaries.zipcode IS 'ZIP/postal code (Dutchie: zip)';
COMMENT ON COLUMN dispensaries.c_name IS 'Dutchie URL slug for embedded menus (Dutchie: cName)';
COMMENT ON COLUMN dispensaries.chain_slug IS 'Chain identifier slug (Dutchie: chain)';
COMMENT ON COLUMN dispensaries.enterprise_id IS 'Parent enterprise UUID (Dutchie: retailer.enterpriseId)';
COMMENT ON COLUMN dispensaries.logo_image IS 'Logo image URL (Dutchie: logoImage)';
COMMENT ON COLUMN dispensaries.banner_image IS 'Banner image URL (Dutchie: bannerImage)';
COMMENT ON COLUMN dispensaries.offer_pickup IS 'Offers in-store pickup (Dutchie: offerPickup)';
COMMENT ON COLUMN dispensaries.offer_delivery IS 'Offers delivery (Dutchie: offerDelivery)';
COMMENT ON COLUMN dispensaries.offer_curbside_pickup IS 'Offers curbside pickup (Dutchie: offerCurbsidePickup)';
COMMENT ON COLUMN dispensaries.is_medical IS 'Licensed for medical sales (Dutchie: isMedical)';
COMMENT ON COLUMN dispensaries.is_recreational IS 'Licensed for recreational sales (Dutchie: isRecreational)';

SELECT 'Migration 066 completed: Dutchie field alignment' as status;

backend/migrations/067_promotion_log.sql (new file, 24 lines)
@@ -0,0 +1,24 @@
-- Promotion log table for tracking discovery → dispensary promotions
-- Tracks validation and promotion actions for audit/review

CREATE TABLE IF NOT EXISTS dutchie_promotion_log (
    id SERIAL PRIMARY KEY,
    discovery_id INTEGER REFERENCES dutchie_discovery_locations(id) ON DELETE SET NULL,
    dispensary_id INTEGER REFERENCES dispensaries(id) ON DELETE SET NULL,
    action VARCHAR(50) NOT NULL, -- 'validated', 'rejected', 'promoted_create', 'promoted_update', 'skipped'
    state_code VARCHAR(10),
    store_name VARCHAR(255),
    validation_errors TEXT[], -- Array of error messages if rejected
    field_changes JSONB, -- Before/after snapshot of changed fields
    triggered_by VARCHAR(100) DEFAULT 'auto', -- 'auto', 'manual', 'api'
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);

-- Indexes for efficient querying
CREATE INDEX IF NOT EXISTS idx_promotion_log_discovery_id ON dutchie_promotion_log(discovery_id);
CREATE INDEX IF NOT EXISTS idx_promotion_log_dispensary_id ON dutchie_promotion_log(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_promotion_log_action ON dutchie_promotion_log(action);
CREATE INDEX IF NOT EXISTS idx_promotion_log_state_code ON dutchie_promotion_log(state_code);
CREATE INDEX IF NOT EXISTS idx_promotion_log_created_at ON dutchie_promotion_log(created_at DESC);

COMMENT ON TABLE dutchie_promotion_log IS 'Audit log for discovery location validation and promotion to dispensaries';

backend/migrations/068_crawler_status_alerts.sql (new file, 95 lines)
@@ -0,0 +1,95 @@
-- Migration 068: Crawler Status Alerts
-- Creates status_alerts table for dashboard notifications and status change logging

-- ============================================================
-- STATUS ALERTS TABLE
-- ============================================================

CREATE TABLE IF NOT EXISTS crawler_status_alerts (
    id SERIAL PRIMARY KEY,

    -- References
    dispensary_id INTEGER REFERENCES dispensaries(id),
    profile_id INTEGER REFERENCES dispensary_crawler_profiles(id),

    -- Alert info
    alert_type VARCHAR(50) NOT NULL, -- 'status_change', 'crawl_error', 'validation_failed', 'promoted', 'demoted'
    severity VARCHAR(20) DEFAULT 'info', -- 'info', 'warning', 'error', 'critical'

    -- Status transition
    previous_status VARCHAR(50),
    new_status VARCHAR(50),

    -- Context
    message TEXT,
    error_details JSONB,
    metadata JSONB, -- Additional context (product counts, error codes, etc.)

    -- Tracking
    acknowledged BOOLEAN DEFAULT FALSE,
    acknowledged_at TIMESTAMP WITH TIME ZONE,
    acknowledged_by VARCHAR(100),

    -- Timestamps
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);

-- Indexes for common queries
CREATE INDEX IF NOT EXISTS idx_crawler_status_alerts_dispensary ON crawler_status_alerts(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_crawler_status_alerts_type ON crawler_status_alerts(alert_type);
CREATE INDEX IF NOT EXISTS idx_crawler_status_alerts_severity ON crawler_status_alerts(severity);
CREATE INDEX IF NOT EXISTS idx_crawler_status_alerts_unack ON crawler_status_alerts(acknowledged) WHERE acknowledged = FALSE;
CREATE INDEX IF NOT EXISTS idx_crawler_status_alerts_created ON crawler_status_alerts(created_at DESC);

-- ============================================================
-- STATUS DEFINITIONS (for reference/validation)
-- ============================================================

COMMENT ON TABLE crawler_status_alerts IS 'Crawler status change notifications for dashboard alerting';
COMMENT ON COLUMN crawler_status_alerts.alert_type IS 'Type: status_change, crawl_error, validation_failed, promoted, demoted';
COMMENT ON COLUMN crawler_status_alerts.severity IS 'Severity: info, warning, error, critical';
COMMENT ON COLUMN crawler_status_alerts.previous_status IS 'Previous crawler status before change';
COMMENT ON COLUMN crawler_status_alerts.new_status IS 'New crawler status after change';

-- ============================================================
-- STATUS TRACKING ON PROFILES
-- ============================================================

-- Add columns for status tracking if not exists
DO $$
BEGIN
    -- Consecutive success count for auto-promotion
    IF NOT EXISTS (SELECT 1 FROM information_schema.columns
                   WHERE table_name = 'dispensary_crawler_profiles' AND column_name = 'consecutive_successes') THEN
        ALTER TABLE dispensary_crawler_profiles ADD COLUMN consecutive_successes INTEGER DEFAULT 0;
    END IF;

    -- Consecutive failure count for auto-demotion
    IF NOT EXISTS (SELECT 1 FROM information_schema.columns
                   WHERE table_name = 'dispensary_crawler_profiles' AND column_name = 'consecutive_failures') THEN
        ALTER TABLE dispensary_crawler_profiles ADD COLUMN consecutive_failures INTEGER DEFAULT 0;
    END IF;

    -- Last status change timestamp
    IF NOT EXISTS (SELECT 1 FROM information_schema.columns
                   WHERE table_name = 'dispensary_crawler_profiles' AND column_name = 'status_changed_at') THEN
        ALTER TABLE dispensary_crawler_profiles ADD COLUMN status_changed_at TIMESTAMP WITH TIME ZONE;
    END IF;

    -- Status change reason
    IF NOT EXISTS (SELECT 1 FROM information_schema.columns
                   WHERE table_name = 'dispensary_crawler_profiles' AND column_name = 'status_reason') THEN
        ALTER TABLE dispensary_crawler_profiles ADD COLUMN status_reason TEXT;
    END IF;
END $$;

-- ============================================================
-- VALID STATUS VALUES
-- ============================================================
-- Status values for dispensary_crawler_profiles.status:
--   'sandbox'      - Newly created, being validated
--   'production'   - Healthy, actively crawled
--   'needs_manual' - Requires human intervention
--   'failing'      - Multiple consecutive failures
--   'disabled'     - Manually disabled
--   'legacy'       - No profile, uses default method (virtual status)

backend/migrations/069_six_stage_status.sql (new file, 163 lines)
@@ -0,0 +1,163 @@
-- Migration 069: Seven-Stage Status System
--
-- Implements explicit 7-stage pipeline for store lifecycle:
--   1. discovered  - Found via Dutchie API, raw data
--   2. validated   - Passed field checks, ready for promotion
--   3. promoted    - In dispensaries table, has crawler profile
--   4. sandbox     - First crawl attempted, testing
--   5. hydrating   - Products are being loaded/updated
--   6. production  - Healthy, scheduled crawls via Horizon
--   7. failing     - Crawl errors, needs attention

-- ============================================================
-- STAGE ENUM TYPE
-- ============================================================

DO $$
BEGIN
    -- Create enum if not exists
    IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'store_stage') THEN
        CREATE TYPE store_stage AS ENUM (
            'discovered',
            'validated',
            'promoted',
            'sandbox',
            'hydrating',
            'production',
            'failing'
        );
    END IF;
END $$;

-- ============================================================
-- UPDATE DISCOVERY LOCATIONS TABLE
-- ============================================================

-- Add stage column to discovery locations (replaces status)
DO $$
BEGIN
    IF NOT EXISTS (SELECT 1 FROM information_schema.columns
                   WHERE table_name = 'dutchie_discovery_locations' AND column_name = 'stage') THEN
        ALTER TABLE dutchie_discovery_locations ADD COLUMN stage VARCHAR(20) DEFAULT 'discovered';
    END IF;
END $$;

-- Migrate existing status values to stage
UPDATE dutchie_discovery_locations
SET stage = CASE
    WHEN status = 'discovered' THEN 'discovered'
    WHEN status = 'verified' THEN 'validated'
    WHEN status = 'rejected' THEN 'failing'
    WHEN status = 'merged' THEN 'validated'
    ELSE 'discovered'
END
WHERE stage IS NULL OR stage = '';

-- ============================================================
-- UPDATE CRAWLER PROFILES TABLE
-- ============================================================

-- Ensure status column exists and update to new values
UPDATE dispensary_crawler_profiles
SET status = CASE
    WHEN status = 'sandbox' THEN 'sandbox'
    WHEN status = 'production' THEN 'production'
    WHEN status = 'needs_manual' THEN 'failing'
    WHEN status = 'failing' THEN 'failing'
    WHEN status = 'disabled' THEN 'failing'
    WHEN status IS NULL THEN 'promoted'
    ELSE 'promoted'
END;

-- ============================================================
-- ADD STAGE TRACKING TO DISPENSARIES
-- ============================================================

DO $$
BEGIN
    -- Add stage column to dispensaries for quick filtering
    IF NOT EXISTS (SELECT 1 FROM information_schema.columns
                   WHERE table_name = 'dispensaries' AND column_name = 'stage') THEN
        ALTER TABLE dispensaries ADD COLUMN stage VARCHAR(20) DEFAULT 'promoted';
    END IF;

    -- Add stage_changed_at for tracking
    IF NOT EXISTS (SELECT 1 FROM information_schema.columns
                   WHERE table_name = 'dispensaries' AND column_name = 'stage_changed_at') THEN
        ALTER TABLE dispensaries ADD COLUMN stage_changed_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP;
    END IF;

    -- Add first_crawl_at to track sandbox → production transition
    IF NOT EXISTS (SELECT 1 FROM information_schema.columns
                   WHERE table_name = 'dispensaries' AND column_name = 'first_crawl_at') THEN
        ALTER TABLE dispensaries ADD COLUMN first_crawl_at TIMESTAMP WITH TIME ZONE;
    END IF;

    -- Add last_successful_crawl_at
    IF NOT EXISTS (SELECT 1 FROM information_schema.columns
                   WHERE table_name = 'dispensaries' AND column_name = 'last_successful_crawl_at') THEN
        ALTER TABLE dispensaries ADD COLUMN last_successful_crawl_at TIMESTAMP WITH TIME ZONE;
    END IF;
END $$;

-- Set initial stage for existing dispensaries based on their crawler profile status
UPDATE dispensaries d
SET stage = COALESCE(
    (SELECT dcp.status FROM dispensary_crawler_profiles dcp
     WHERE dcp.dispensary_id = d.id AND dcp.enabled = true
     ORDER BY dcp.updated_at DESC LIMIT 1),
    'promoted'
)
WHERE d.stage IS NULL OR d.stage = '';

-- ============================================================
-- INDEXES FOR STAGE-BASED QUERIES
-- ============================================================

CREATE INDEX IF NOT EXISTS idx_dispensaries_stage ON dispensaries(stage);
CREATE INDEX IF NOT EXISTS idx_dispensaries_stage_state ON dispensaries(stage, state);
CREATE INDEX IF NOT EXISTS idx_discovery_locations_stage ON dutchie_discovery_locations(stage);
CREATE INDEX IF NOT EXISTS idx_crawler_profiles_status ON dispensary_crawler_profiles(status);

-- ============================================================
-- STAGE TRANSITION LOG
-- ============================================================

CREATE TABLE IF NOT EXISTS stage_transitions (
    id SERIAL PRIMARY KEY,

    -- What changed
    entity_type VARCHAR(20) NOT NULL, -- 'discovery_location' or 'dispensary'
    entity_id INTEGER NOT NULL,

    -- Stage change
    from_stage VARCHAR(20),
    to_stage VARCHAR(20) NOT NULL,

    -- Context
    trigger_type VARCHAR(50) NOT NULL, -- 'api', 'scheduler', 'manual', 'auto'
    trigger_endpoint VARCHAR(200),

    -- Outcome
    success BOOLEAN DEFAULT TRUE,
    error_message TEXT,
    metadata JSONB,

    -- Timing
    duration_ms INTEGER,
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);

CREATE INDEX IF NOT EXISTS idx_stage_transitions_entity ON stage_transitions(entity_type, entity_id);
CREATE INDEX IF NOT EXISTS idx_stage_transitions_to_stage ON stage_transitions(to_stage);
CREATE INDEX IF NOT EXISTS idx_stage_transitions_created ON stage_transitions(created_at DESC);

-- ============================================================
-- COMMENTS
-- ============================================================

COMMENT ON TABLE stage_transitions IS 'Audit log for all stage transitions in the pipeline';
COMMENT ON COLUMN dispensaries.stage IS 'Current pipeline stage: discovered, validated, promoted, sandbox, hydrating, production, failing';
COMMENT ON COLUMN dispensaries.stage_changed_at IS 'When the stage was last changed';
COMMENT ON COLUMN dispensaries.first_crawl_at IS 'When the first crawl was attempted (sandbox stage)';
COMMENT ON COLUMN dispensaries.last_successful_crawl_at IS 'When the last successful crawl completed';

backend/migrations/070_product_variants.sql (new file, 239 lines)
@@ -0,0 +1,239 @@
|
||||
-- ============================================================================
|
||||
-- Migration 070: Product Variants Tables
|
||||
-- ============================================================================
|
||||
--
|
||||
-- Purpose: Store variant-level pricing and inventory as first-class entities
|
||||
-- to enable time-series analytics, price comparisons, and sale tracking.
|
||||
--
|
||||
-- Enables queries like:
|
||||
-- - Price history for a specific variant (1g Blue Dream over time)
|
||||
-- - Sale frequency analysis (how often is this on special?)
|
||||
-- - Cross-store price comparison (who has cheapest 1g flower?)
|
||||
-- - Current specials across all stores
|
||||
--
|
||||
-- RULES:
|
||||
-- - STRICTLY ADDITIVE (no DROP, DELETE, TRUNCATE)
|
||||
-- - All new tables use IF NOT EXISTS
|
||||
-- - All indexes use IF NOT EXISTS
|
||||
--
|
||||
-- ============================================================================
|
||||
|
||||
-- ============================================================================
|
||||
-- SECTION 1: PRODUCT_VARIANTS TABLE (Current State)
|
||||
-- ============================================================================
|
||||
-- One row per product+option combination. Tracks current pricing/inventory.
|
||||
|
||||
CREATE TABLE IF NOT EXISTS product_variants (
  id SERIAL PRIMARY KEY,
  store_product_id INTEGER NOT NULL REFERENCES store_products(id) ON DELETE CASCADE,
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,

  -- Variant identity (from Dutchie POSMetaData.children)
  option VARCHAR(100) NOT NULL,   -- "1g", "3.5g", "1/8oz", "100mg"
  canonical_sku VARCHAR(100),     -- Dutchie canonicalSKU
  canonical_id VARCHAR(100),      -- Dutchie canonicalID
  canonical_name VARCHAR(500),    -- Dutchie canonicalName

  -- Current pricing (in dollars, not cents)
  price_rec NUMERIC(10,2),
  price_med NUMERIC(10,2),
  price_rec_special NUMERIC(10,2),
  price_med_special NUMERIC(10,2),

  -- Current inventory
  quantity INTEGER,
  quantity_available INTEGER,
  in_stock BOOLEAN DEFAULT TRUE,

  -- Special/sale status
  is_on_special BOOLEAN DEFAULT FALSE,

  -- Weight/size parsing (for analytics)
  weight_value NUMERIC(10,2),  -- 1, 3.5, 28, etc.
  weight_unit VARCHAR(20),     -- g, oz, mg, ml, etc.

  -- Timestamps
  first_seen_at TIMESTAMPTZ DEFAULT NOW(),
  last_seen_at TIMESTAMPTZ DEFAULT NOW(),
  last_price_change_at TIMESTAMPTZ,
  last_stock_change_at TIMESTAMPTZ,

  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW(),

  UNIQUE(store_product_id, option)
);

-- Indexes for common queries
CREATE INDEX IF NOT EXISTS idx_variants_store_product ON product_variants(store_product_id);
CREATE INDEX IF NOT EXISTS idx_variants_dispensary ON product_variants(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_variants_option ON product_variants(option);
CREATE INDEX IF NOT EXISTS idx_variants_in_stock ON product_variants(dispensary_id, in_stock) WHERE in_stock = TRUE;
CREATE INDEX IF NOT EXISTS idx_variants_on_special ON product_variants(dispensary_id, is_on_special) WHERE is_on_special = TRUE;
CREATE INDEX IF NOT EXISTS idx_variants_canonical_sku ON product_variants(canonical_sku) WHERE canonical_sku IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_variants_price_rec ON product_variants(price_rec) WHERE price_rec IS NOT NULL;

COMMENT ON TABLE product_variants IS 'Current state of each product variant (weight/size option). One row per product+option.';
COMMENT ON COLUMN product_variants.option IS 'Weight/size option string from Dutchie (e.g., "1g", "3.5g", "1/8oz")';
COMMENT ON COLUMN product_variants.canonical_sku IS 'Dutchie POS SKU for cross-store matching';
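As an aside (not part of migration 070), the UNIQUE(store_product_id, option) constraint is what makes crawler writes idempotent; a sketch of the expected upsert, with placeholder ids and values:

INSERT INTO product_variants (store_product_id, dispensary_id, option, price_rec, quantity, in_stock, is_on_special)
VALUES (42, 7, '3.5g', 25.00, 18, TRUE, FALSE)  -- placeholder values
ON CONFLICT (store_product_id, option) DO UPDATE
SET price_rec     = EXCLUDED.price_rec,
    quantity      = EXCLUDED.quantity,
    in_stock      = EXCLUDED.in_stock,
    is_on_special = EXCLUDED.is_on_special,
    last_seen_at  = NOW(),
    updated_at    = NOW();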
-- ============================================================================
-- SECTION 2: PRODUCT_VARIANT_SNAPSHOTS TABLE (Historical Data)
-- ============================================================================
-- Time-series data for variant pricing. One row per variant per crawl.
-- CRITICAL: NEVER DELETE from this table.

CREATE TABLE IF NOT EXISTS product_variant_snapshots (
  id SERIAL PRIMARY KEY,
  product_variant_id INTEGER NOT NULL REFERENCES product_variants(id) ON DELETE CASCADE,
  store_product_id INTEGER REFERENCES store_products(id) ON DELETE SET NULL,
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
  crawl_run_id INTEGER REFERENCES crawl_runs(id) ON DELETE SET NULL,

  -- Variant identity (denormalized for query performance)
  option VARCHAR(100) NOT NULL,

  -- Pricing at time of capture
  price_rec NUMERIC(10,2),
  price_med NUMERIC(10,2),
  price_rec_special NUMERIC(10,2),
  price_med_special NUMERIC(10,2),

  -- Inventory at time of capture
  quantity INTEGER,
  in_stock BOOLEAN DEFAULT TRUE,

  -- Special status at time of capture
  is_on_special BOOLEAN DEFAULT FALSE,

  -- Feed presence (FALSE = variant missing from crawl)
  is_present_in_feed BOOLEAN DEFAULT TRUE,

  -- Capture timestamp
  captured_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

  created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Indexes for time-series queries
CREATE INDEX IF NOT EXISTS idx_variant_snapshots_variant ON product_variant_snapshots(product_variant_id, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_variant_snapshots_dispensary ON product_variant_snapshots(dispensary_id, captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_variant_snapshots_crawl ON product_variant_snapshots(crawl_run_id) WHERE crawl_run_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_variant_snapshots_captured ON product_variant_snapshots(captured_at DESC);
CREATE INDEX IF NOT EXISTS idx_variant_snapshots_special ON product_variant_snapshots(is_on_special, captured_at DESC) WHERE is_on_special = TRUE;
CREATE INDEX IF NOT EXISTS idx_variant_snapshots_option ON product_variant_snapshots(option, captured_at DESC);

COMMENT ON TABLE product_variant_snapshots IS 'Historical variant pricing/inventory. One row per variant per crawl. NEVER DELETE.';
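The "price history for a specific variant" use case from the header then reduces to a range scan on idx_variant_snapshots_variant; the variant id here is a placeholder:

SELECT captured_at, price_rec, price_rec_special, is_on_special
FROM product_variant_snapshots
WHERE product_variant_id = 123
  AND captured_at >= NOW() - INTERVAL '90 days'
ORDER BY captured_at;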
-- ============================================================================
-- SECTION 3: USEFUL VIEWS
-- ============================================================================

-- View: Current specials across all stores
CREATE OR REPLACE VIEW v_current_specials AS
SELECT
  pv.id as variant_id,
  sp.id as product_id,
  sp.name_raw as product_name,
  sp.brand_name_raw as brand_name,
  sp.category_raw as category,
  d.id as dispensary_id,
  d.name as dispensary_name,
  d.city,
  d.state,
  pv.option,
  pv.price_rec,
  pv.price_rec_special,
  ROUND(((pv.price_rec - pv.price_rec_special) / NULLIF(pv.price_rec, 0)) * 100, 1) as discount_percent,
  pv.quantity,
  pv.in_stock,
  pv.last_seen_at
FROM product_variants pv
JOIN store_products sp ON sp.id = pv.store_product_id
JOIN dispensaries d ON d.id = pv.dispensary_id
WHERE pv.is_on_special = TRUE
  AND pv.in_stock = TRUE
  AND pv.price_rec_special IS NOT NULL
  AND pv.price_rec_special < pv.price_rec;

COMMENT ON VIEW v_current_specials IS 'All products currently on special across all stores';

-- View: Price comparison for a product across stores
CREATE OR REPLACE VIEW v_price_comparison AS
SELECT
  sp.name_raw as product_name,
  sp.brand_name_raw as brand_name,
  sp.category_raw as category,
  pv.option,
  d.id as dispensary_id,
  d.name as dispensary_name,
  d.city,
  pv.price_rec,
  pv.price_rec_special,
  pv.is_on_special,
  pv.in_stock,
  pv.quantity,
  RANK() OVER (PARTITION BY sp.name_raw, pv.option ORDER BY COALESCE(pv.price_rec_special, pv.price_rec) ASC) as price_rank
FROM product_variants pv
JOIN store_products sp ON sp.id = pv.store_product_id
JOIN dispensaries d ON d.id = pv.dispensary_id
WHERE pv.in_stock = TRUE
  AND (pv.price_rec IS NOT NULL OR pv.price_rec_special IS NOT NULL);

COMMENT ON VIEW v_price_comparison IS 'Compare prices for same product across stores, ranked by price';

-- View: Latest snapshot per variant
CREATE OR REPLACE VIEW v_latest_variant_snapshots AS
SELECT DISTINCT ON (product_variant_id)
  pvs.*
FROM product_variant_snapshots pvs
ORDER BY product_variant_id, captured_at DESC;
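A consumer of v_price_comparison can answer the "who has the cheapest 1g flower?" question from the header with a filter on price_rank; the category value below is illustrative and depends on what the crawler stores in category_raw:

SELECT product_name, dispensary_name, city, option,
       COALESCE(price_rec_special, price_rec) AS effective_price
FROM v_price_comparison
WHERE option = '1g'
  AND category = 'Flower'  -- illustrative value
  AND price_rank = 1;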
-- ============================================================================
-- SECTION 4: HELPER FUNCTION FOR SALE FREQUENCY
-- ============================================================================

-- Function to calculate sale frequency for a variant
CREATE OR REPLACE FUNCTION get_variant_sale_stats(p_variant_id INTEGER, p_days INTEGER DEFAULT 30)
RETURNS TABLE (
  total_snapshots BIGINT,
  times_on_special BIGINT,
  special_frequency_pct NUMERIC,
  avg_discount_pct NUMERIC,
  min_price NUMERIC,
  max_price NUMERIC,
  avg_price NUMERIC
) AS $$
BEGIN
  RETURN QUERY
  SELECT
    COUNT(*)::BIGINT as total_snapshots,
    COUNT(*) FILTER (WHERE is_on_special)::BIGINT as times_on_special,
    ROUND((COUNT(*) FILTER (WHERE is_on_special)::NUMERIC / NULLIF(COUNT(*), 0)) * 100, 1) as special_frequency_pct,
    ROUND(AVG(
      CASE WHEN is_on_special AND price_rec_special IS NOT NULL AND price_rec IS NOT NULL
        THEN ((price_rec - price_rec_special) / NULLIF(price_rec, 0)) * 100
      END
    ), 1) as avg_discount_pct,
    MIN(COALESCE(price_rec_special, price_rec)) as min_price,
    MAX(price_rec) as max_price,
    ROUND(AVG(COALESCE(price_rec_special, price_rec)), 2) as avg_price
  FROM product_variant_snapshots
  WHERE product_variant_id = p_variant_id
    AND captured_at >= NOW() - (p_days || ' days')::INTERVAL;
END;
$$ LANGUAGE plpgsql;

COMMENT ON FUNCTION get_variant_sale_stats IS 'Get sale frequency and price stats for a variant over N days';

-- ============================================================================
-- DONE
-- ============================================================================

SELECT 'Migration 070 completed. Product variants tables ready for time-series analytics.' AS status;
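Usage of the helper is a single call; the variant id and window below are placeholders:

SELECT * FROM get_variant_sale_stats(123, 90);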
53  backend/migrations/071_harmonize_store_products.sql  Normal file
@@ -0,0 +1,53 @@
-- Migration 071: Harmonize store_products with dutchie_products
-- Adds missing columns to store_products to consolidate on a single canonical table

-- Product details
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS description TEXT;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS weight VARCHAR(50);
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS weights JSONB;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS measurements JSONB;

-- Cannabinoid/terpene data
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS effects JSONB;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS terpenes JSONB;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS cannabinoids_v2 JSONB;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS thc_content NUMERIC(10,4);
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS cbd_content NUMERIC(10,4);

-- Images
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS images JSONB;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS primary_image_url TEXT;

-- Inventory
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS total_quantity_available INTEGER DEFAULT 0;

-- Status/flags
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS status VARCHAR(50);
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS featured BOOLEAN DEFAULT FALSE;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS coming_soon BOOLEAN DEFAULT FALSE;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS visibility_lost BOOLEAN DEFAULT FALSE;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS visibility_lost_at TIMESTAMP WITH TIME ZONE;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS visibility_restored_at TIMESTAMP WITH TIME ZONE;

-- Threshold flags (Dutchie-specific)
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS is_below_threshold BOOLEAN DEFAULT FALSE;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS is_below_kiosk_threshold BOOLEAN DEFAULT FALSE;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS options_below_threshold BOOLEAN DEFAULT FALSE;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS options_below_kiosk_threshold BOOLEAN DEFAULT FALSE;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS certificate_of_analysis_enabled BOOLEAN DEFAULT FALSE;

-- Platform metadata
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS external_product_id VARCHAR(100);
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS c_name VARCHAR(500);
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS past_c_names TEXT[];
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS latest_raw_payload JSONB;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS created_at_platform TIMESTAMP WITH TIME ZONE;
ALTER TABLE store_products ADD COLUMN IF NOT EXISTS updated_at_platform TIMESTAMP WITH TIME ZONE;

-- Indexes for common queries
CREATE INDEX IF NOT EXISTS idx_store_products_external_id ON store_products(external_product_id);
CREATE INDEX IF NOT EXISTS idx_store_products_visibility_lost ON store_products(visibility_lost) WHERE visibility_lost = TRUE;
CREATE INDEX IF NOT EXISTS idx_store_products_status ON store_products(status);

-- Add comment
COMMENT ON TABLE store_products IS 'Canonical product table - consolidated from dutchie_products';
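Once populated, the JSONB columns support operator-based filters directly; a sketch only, since the terpene key names depend on the crawler payload shape:

SELECT id, name_raw
FROM store_products
WHERE terpenes ? 'myrcene'  -- JSONB key-existence test; key name is hypothetical
  AND featured = TRUE;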
74  backend/migrations/072_product_views.sql  Normal file
@@ -0,0 +1,74 @@
-- Migration 072: Create compatibility views for store_products and store_product_snapshots
-- These views provide backward-compatible column names for API routes

-- v_products view - aliases store_products columns to match legacy dutchie_products naming
CREATE OR REPLACE VIEW v_products AS
SELECT
  id,
  dispensary_id,
  provider_product_id as external_product_id,
  provider_product_id as dutchie_id,
  name_raw as name,
  brand_name_raw as brand_name,
  category_raw as type,
  subcategory_raw as subcategory,
  strain_type,
  thc_percent as thc,
  cbd_percent as cbd,
  stock_status,
  is_in_stock,
  stock_quantity,
  image_url,
  primary_image_url,
  images,
  effects,
  description,
  is_on_special,
  featured,
  medical_only,
  rec_only,
  external_product_id as external_id,
  provider,
  created_at,
  updated_at
FROM store_products;

-- v_product_snapshots view - aliases store_product_snapshots columns to match legacy naming
CREATE OR REPLACE VIEW v_product_snapshots AS
SELECT
  id,
  store_product_id,
  dispensary_id,
  provider,
  provider_product_id,
  crawl_run_id,
  captured_at as crawled_at,
  name_raw,
  brand_name_raw,
  category_raw,
  subcategory_raw,
  -- Convert price_rec (dollars) to rec_min_price_cents (cents)
  CASE WHEN price_rec IS NOT NULL THEN (price_rec * 100)::integer END as rec_min_price_cents,
  CASE WHEN price_rec IS NOT NULL THEN (price_rec * 100)::integer END as rec_max_price_cents,
  CASE WHEN price_rec_special IS NOT NULL THEN (price_rec_special * 100)::integer END as rec_min_special_price_cents,
  CASE WHEN price_med IS NOT NULL THEN (price_med * 100)::integer END as med_min_price_cents,
  CASE WHEN price_med IS NOT NULL THEN (price_med * 100)::integer END as med_max_price_cents,
  CASE WHEN price_med_special IS NOT NULL THEN (price_med_special * 100)::integer END as med_min_special_price_cents,
  is_on_special as special,
  discount_percent,
  is_in_stock,
  stock_quantity,
  stock_status,
  stock_quantity as total_quantity_available,
  thc_percent,
  cbd_percent,
  image_url,
  raw_data as options,
  created_at
FROM store_product_snapshots;

-- Add indexes for the views' underlying tables
CREATE INDEX IF NOT EXISTS idx_store_products_dispensary ON store_products(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_store_products_stock ON store_products(stock_status);
CREATE INDEX IF NOT EXISTS idx_store_snapshots_product ON store_product_snapshots(store_product_id);
CREATE INDEX IF NOT EXISTS idx_store_snapshots_captured ON store_product_snapshots(captured_at DESC);
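With these views in place, legacy routes keep reading cents-denominated prices unchanged; for example (the dispensary id is a placeholder):

SELECT p.name, p.brand_name, s.rec_min_price_cents
FROM v_products p
JOIN v_product_snapshots s ON s.store_product_id = p.id
WHERE p.dispensary_id = 7
ORDER BY s.crawled_at DESC
LIMIT 20;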
12  backend/migrations/073_proxy_timezone.sql  Normal file
@@ -0,0 +1,12 @@
-- Add timezone column to proxies table for geo-consistent fingerprinting
-- This allows matching Accept-Language and other headers to proxy location

ALTER TABLE proxies
  ADD COLUMN IF NOT EXISTS timezone VARCHAR(50);

-- Add timezone to failed_proxies as well
ALTER TABLE failed_proxies
  ADD COLUMN IF NOT EXISTS timezone VARCHAR(50);

-- Comment explaining usage
COMMENT ON COLUMN proxies.timezone IS 'IANA timezone (e.g., America/Phoenix) for geo-consistent fingerprinting';
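A proxy row can then be tagged once its location is known; a trivial illustrative update (placeholder id):

UPDATE proxies SET timezone = 'America/Phoenix' WHERE id = 1;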
322  backend/migrations/074_worker_task_queue.sql  Normal file
@@ -0,0 +1,322 @@
-- Migration 074: Worker Task Queue System
-- Implements role-based task queue with per-store locking and capacity tracking

-- Task queue table
CREATE TABLE IF NOT EXISTS worker_tasks (
  id SERIAL PRIMARY KEY,

  -- Task identification
  role VARCHAR(50) NOT NULL,  -- store_discovery, entry_point_discovery, product_discovery, product_resync, analytics_refresh
  dispensary_id INTEGER REFERENCES dispensaries(id) ON DELETE CASCADE,
  platform VARCHAR(20),       -- dutchie, jane, treez, etc.

  -- Task state
  status VARCHAR(20) NOT NULL DEFAULT 'pending',
  priority INTEGER DEFAULT 0, -- Higher = more urgent

  -- Scheduling
  scheduled_for TIMESTAMPTZ,  -- For batch scheduling (e.g., every 4 hours)

  -- Ownership
  worker_id VARCHAR(100),     -- Pod name or worker ID
  claimed_at TIMESTAMPTZ,
  started_at TIMESTAMPTZ,
  completed_at TIMESTAMPTZ,
  last_heartbeat_at TIMESTAMPTZ,

  -- Results
  result JSONB,               -- Task output data
  error_message TEXT,
  retry_count INTEGER DEFAULT 0,
  max_retries INTEGER DEFAULT 3,

  -- Metadata
  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW(),

  -- Constraints
  CONSTRAINT valid_status CHECK (status IN ('pending', 'claimed', 'running', 'completed', 'failed', 'stale'))
);

-- Indexes for efficient task claiming
CREATE INDEX IF NOT EXISTS idx_worker_tasks_pending
  ON worker_tasks(role, priority DESC, created_at ASC)
  WHERE status = 'pending';

CREATE INDEX IF NOT EXISTS idx_worker_tasks_claimed
  ON worker_tasks(worker_id, claimed_at)
  WHERE status = 'claimed';

CREATE INDEX IF NOT EXISTS idx_worker_tasks_running
  ON worker_tasks(worker_id, last_heartbeat_at)
  WHERE status = 'running';

CREATE INDEX IF NOT EXISTS idx_worker_tasks_dispensary
  ON worker_tasks(dispensary_id)
  WHERE dispensary_id IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_worker_tasks_scheduled
  ON worker_tasks(scheduled_for)
  WHERE status = 'pending' AND scheduled_for IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_worker_tasks_history
  ON worker_tasks(role, completed_at DESC)
  WHERE status IN ('completed', 'failed');

-- Partial unique index to prevent duplicate active tasks per store:
-- only one task can be claimed/running for a given dispensary at a time
CREATE UNIQUE INDEX IF NOT EXISTS idx_worker_tasks_unique_active_store
  ON worker_tasks(dispensary_id)
  WHERE status IN ('claimed', 'running') AND dispensary_id IS NOT NULL;
-- Worker registration table (tracks active workers)
CREATE TABLE IF NOT EXISTS worker_registry (
  id SERIAL PRIMARY KEY,
  worker_id VARCHAR(100) UNIQUE NOT NULL,
  role VARCHAR(50) NOT NULL,
  pod_name VARCHAR(100),
  hostname VARCHAR(100),
  started_at TIMESTAMPTZ DEFAULT NOW(),
  last_heartbeat_at TIMESTAMPTZ DEFAULT NOW(),
  tasks_completed INTEGER DEFAULT 0,
  tasks_failed INTEGER DEFAULT 0,
  status VARCHAR(20) DEFAULT 'active',

  CONSTRAINT valid_worker_status CHECK (status IN ('active', 'idle', 'offline'))
);

CREATE INDEX IF NOT EXISTS idx_worker_registry_role
  ON worker_registry(role, status);

CREATE INDEX IF NOT EXISTS idx_worker_registry_heartbeat
  ON worker_registry(last_heartbeat_at)
  WHERE status = 'active';

-- Task completion tracking (summarized history)
CREATE TABLE IF NOT EXISTS task_completion_log (
  id SERIAL PRIMARY KEY,
  role VARCHAR(50) NOT NULL,
  date DATE NOT NULL DEFAULT CURRENT_DATE,
  hour INTEGER NOT NULL DEFAULT EXTRACT(HOUR FROM NOW()),

  tasks_created INTEGER DEFAULT 0,
  tasks_completed INTEGER DEFAULT 0,
  tasks_failed INTEGER DEFAULT 0,

  avg_duration_sec NUMERIC(10,2),
  min_duration_sec NUMERIC(10,2),
  max_duration_sec NUMERIC(10,2),

  updated_at TIMESTAMPTZ DEFAULT NOW(),

  UNIQUE(role, date, hour)
);
-- Capacity planning view
CREATE OR REPLACE VIEW v_worker_capacity AS
SELECT
  role,
  COUNT(*) FILTER (WHERE status = 'pending') as pending_tasks,
  COUNT(*) FILTER (WHERE status = 'pending' AND (scheduled_for IS NULL OR scheduled_for <= NOW())) as ready_tasks,
  COUNT(*) FILTER (WHERE status = 'claimed') as claimed_tasks,
  COUNT(*) FILTER (WHERE status = 'running') as running_tasks,
  COUNT(*) FILTER (WHERE status = 'completed' AND completed_at > NOW() - INTERVAL '1 hour') as completed_last_hour,
  COUNT(*) FILTER (WHERE status = 'failed' AND completed_at > NOW() - INTERVAL '1 hour') as failed_last_hour,
  COUNT(DISTINCT worker_id) FILTER (WHERE status IN ('claimed', 'running')) as active_workers,
  AVG(EXTRACT(EPOCH FROM (completed_at - started_at)))
    FILTER (WHERE status = 'completed' AND completed_at > NOW() - INTERVAL '1 hour') as avg_duration_sec,
  -- Capacity planning metrics
  CASE
    WHEN COUNT(*) FILTER (WHERE status = 'completed' AND completed_at > NOW() - INTERVAL '1 hour') > 0
    THEN 3600.0 / NULLIF(AVG(EXTRACT(EPOCH FROM (completed_at - started_at)))
      FILTER (WHERE status = 'completed' AND completed_at > NOW() - INTERVAL '1 hour'), 0)
    ELSE NULL
  END as tasks_per_worker_hour,
  -- Estimated time to drain queue
  CASE
    WHEN COUNT(DISTINCT worker_id) FILTER (WHERE status IN ('claimed', 'running')) > 0
      AND COUNT(*) FILTER (WHERE status = 'completed' AND completed_at > NOW() - INTERVAL '1 hour') > 0
    THEN COUNT(*) FILTER (WHERE status = 'pending') / NULLIF(
      COUNT(DISTINCT worker_id) FILTER (WHERE status IN ('claimed', 'running')) *
      (3600.0 / NULLIF(AVG(EXTRACT(EPOCH FROM (completed_at - started_at)))
        FILTER (WHERE status = 'completed' AND completed_at > NOW() - INTERVAL '1 hour'), 0)),
      0
    )
    ELSE NULL
  END as estimated_hours_to_drain
FROM worker_tasks
GROUP BY role;

-- Task history view (for UI)
CREATE OR REPLACE VIEW v_task_history AS
SELECT
  t.id,
  t.role,
  t.dispensary_id,
  d.name as dispensary_name,
  t.platform,
  t.status,
  t.priority,
  t.worker_id,
  t.scheduled_for,
  t.claimed_at,
  t.started_at,
  t.completed_at,
  t.error_message,
  t.retry_count,
  t.created_at,
  EXTRACT(EPOCH FROM (t.completed_at - t.started_at)) as duration_sec
FROM worker_tasks t
LEFT JOIN dispensaries d ON d.id = t.dispensary_id
ORDER BY t.created_at DESC;
-- Function to claim a task atomically
CREATE OR REPLACE FUNCTION claim_task(
  p_role VARCHAR(50),
  p_worker_id VARCHAR(100)
) RETURNS worker_tasks AS $$
DECLARE
  claimed_task worker_tasks;
BEGIN
  UPDATE worker_tasks
  SET
    status = 'claimed',
    worker_id = p_worker_id,
    claimed_at = NOW(),
    updated_at = NOW()
  WHERE id = (
    SELECT id FROM worker_tasks
    WHERE role = p_role
      AND status = 'pending'
      AND (scheduled_for IS NULL OR scheduled_for <= NOW())
      -- Exclude stores that already have an active task
      AND (dispensary_id IS NULL OR dispensary_id NOT IN (
        SELECT dispensary_id FROM worker_tasks
        WHERE status IN ('claimed', 'running')
          AND dispensary_id IS NOT NULL
      ))
    ORDER BY priority DESC, created_at ASC
    LIMIT 1
    FOR UPDATE SKIP LOCKED
  )
  RETURNING * INTO claimed_task;

  RETURN claimed_task;
END;
$$ LANGUAGE plpgsql;
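A worker's polling step is then one statement per attempt; the role and worker id here are placeholders:

SELECT * FROM claim_task('product_resync', 'pod-abc123');
-- Returns the claimed worker_tasks row, or a row of NULLs when nothing is claimable.

FOR UPDATE SKIP LOCKED is what keeps concurrent workers from blocking on, or double-claiming, the same row.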
-- Function to mark stale tasks (workers that died)
CREATE OR REPLACE FUNCTION recover_stale_tasks(
  stale_threshold_minutes INTEGER DEFAULT 10
) RETURNS INTEGER AS $$
DECLARE
  recovered_count INTEGER;
BEGIN
  WITH stale AS (
    UPDATE worker_tasks
    SET
      status = 'pending',
      worker_id = NULL,
      claimed_at = NULL,
      started_at = NULL,
      retry_count = retry_count + 1,
      updated_at = NOW()
    WHERE status IN ('claimed', 'running')
      AND last_heartbeat_at < NOW() - (stale_threshold_minutes || ' minutes')::INTERVAL
      AND retry_count < max_retries
    RETURNING id
  )
  SELECT COUNT(*) INTO recovered_count FROM stale;

  -- Mark tasks that exceeded retries as failed
  UPDATE worker_tasks
  SET
    status = 'failed',
    error_message = 'Exceeded max retries after worker failures',
    completed_at = NOW(),
    updated_at = NOW()
  WHERE status IN ('claimed', 'running')
    AND last_heartbeat_at < NOW() - (stale_threshold_minutes || ' minutes')::INTERVAL
    AND retry_count >= max_retries;

  RETURN recovered_count;
END;
$$ LANGUAGE plpgsql;
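A scheduler would call this periodically, e.g.:

SELECT recover_stale_tasks(10);  -- returns how many tasks were released back to pending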
-- Function to generate daily resync tasks
CREATE OR REPLACE FUNCTION generate_resync_tasks(
  p_batches_per_day INTEGER DEFAULT 6, -- Every 4 hours
  p_date DATE DEFAULT CURRENT_DATE
) RETURNS INTEGER AS $$
DECLARE
  store_count INTEGER;
  stores_per_batch INTEGER;
  batch_num INTEGER;
  batch_rows INTEGER;
  scheduled_time TIMESTAMPTZ;
  created_count INTEGER := 0;
BEGIN
  -- Count active stores that need resync
  SELECT COUNT(*) INTO store_count
  FROM dispensaries
  WHERE crawl_enabled = true
    AND menu_type = 'dutchie'
    AND platform_dispensary_id IS NOT NULL;

  IF store_count = 0 THEN
    RETURN 0;
  END IF;

  stores_per_batch := CEIL(store_count::NUMERIC / p_batches_per_day);

  FOR batch_num IN 0..(p_batches_per_day - 1) LOOP
    -- NOTE: the 4-hour spacing assumes the default of 6 batches per day
    scheduled_time := p_date + (batch_num * 4 || ' hours')::INTERVAL;

    INSERT INTO worker_tasks (role, dispensary_id, platform, scheduled_for, priority)
    SELECT
      'product_resync',
      d.id,
      'dutchie',
      scheduled_time,
      0
    FROM (
      SELECT id, ROW_NUMBER() OVER (ORDER BY id) as rn
      FROM dispensaries
      WHERE crawl_enabled = true
        AND menu_type = 'dutchie'
        AND platform_dispensary_id IS NOT NULL
    ) d
    WHERE d.rn > (batch_num * stores_per_batch)
      AND d.rn <= ((batch_num + 1) * stores_per_batch)
    ON CONFLICT DO NOTHING;

    -- GET DIAGNOSTICS cannot do arithmetic: capture ROW_COUNT, then accumulate
    GET DIAGNOSTICS batch_rows = ROW_COUNT;
    created_count := created_count + batch_rows;
  END LOOP;

  RETURN created_count;
END;
$$ LANGUAGE plpgsql;
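Seeding a day's schedule is likewise one call; with the defaults this spreads the enabled stores across six 4-hour batches:

SELECT generate_resync_tasks(6, CURRENT_DATE);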
-- Trigger to update timestamp
CREATE OR REPLACE FUNCTION update_worker_tasks_timestamp()
RETURNS TRIGGER AS $$
BEGIN
  NEW.updated_at = NOW();
  RETURN NEW;
END;
$$ LANGUAGE plpgsql;

DROP TRIGGER IF EXISTS worker_tasks_updated_at ON worker_tasks;
CREATE TRIGGER worker_tasks_updated_at
  BEFORE UPDATE ON worker_tasks
  FOR EACH ROW
  EXECUTE FUNCTION update_worker_tasks_timestamp();

-- Comments
COMMENT ON TABLE worker_tasks IS 'Central task queue for all worker roles';
COMMENT ON TABLE worker_registry IS 'Registry of active workers and their stats';
COMMENT ON TABLE task_completion_log IS 'Hourly aggregated task completion metrics';
COMMENT ON VIEW v_worker_capacity IS 'Real-time capacity planning metrics per role';
COMMENT ON VIEW v_task_history IS 'Task history with dispensary details for UI';
COMMENT ON FUNCTION claim_task IS 'Atomically claim a task for a worker, respecting per-store locking';
COMMENT ON FUNCTION recover_stale_tasks IS 'Release tasks from dead workers back to pending';
COMMENT ON FUNCTION generate_resync_tasks IS 'Generate daily product resync tasks in batches';
13  backend/migrations/075_consecutive_misses.sql  Normal file
@@ -0,0 +1,13 @@
-- Migration 075: Add consecutive_misses column to store_products
-- Used to track how many consecutive crawls a product has been missing from the feed
-- After 3 consecutive misses, the product is marked as OOS (out of stock)

ALTER TABLE store_products
  ADD COLUMN IF NOT EXISTS consecutive_misses INTEGER NOT NULL DEFAULT 0;

-- Index for finding products that need an OOS check
CREATE INDEX IF NOT EXISTS idx_store_products_consecutive_misses
  ON store_products (dispensary_id, consecutive_misses)
  WHERE consecutive_misses > 0;

COMMENT ON COLUMN store_products.consecutive_misses IS 'Number of consecutive crawls where product was not in feed. Reset to 0 when seen. At 3, mark OOS.';
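The crawler-side bookkeeping this supports is a reset/increment pair per crawl; a sketch where the dispensary id and seen-id list are hypothetical:

-- Products seen in this crawl: reset
UPDATE store_products SET consecutive_misses = 0
WHERE dispensary_id = 7 AND id = ANY(ARRAY[101, 102]);  -- placeholder ids

-- Products missing from this crawl: increment (mark OOS once this reaches 3)
UPDATE store_products SET consecutive_misses = consecutive_misses + 1
WHERE dispensary_id = 7 AND NOT (id = ANY(ARRAY[101, 102]));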
71  backend/migrations/076_visitor_analytics.sql  Normal file
@@ -0,0 +1,71 @@
-- Visitor location analytics for Findagram
-- Tracks visitor locations to understand popular areas

CREATE TABLE IF NOT EXISTS visitor_locations (
  id SERIAL PRIMARY KEY,

  -- Location data (from IP lookup)
  ip_hash VARCHAR(64),  -- Hashed IP for privacy (SHA256)
  city VARCHAR(100),
  state VARCHAR(100),
  state_code VARCHAR(10),
  country VARCHAR(100),
  country_code VARCHAR(10),
  latitude DECIMAL(10, 7),
  longitude DECIMAL(10, 7),

  -- Visit metadata
  domain VARCHAR(50) NOT NULL,  -- 'findagram.co', 'findadispo.com', etc.
  page_path VARCHAR(255),       -- '/products', '/dispensaries/123', etc.
  referrer VARCHAR(500),
  user_agent VARCHAR(500),

  -- Session tracking
  session_id VARCHAR(64),  -- For grouping page views in a session

  -- Timestamps
  created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Indexes for analytics queries
CREATE INDEX IF NOT EXISTS idx_visitor_locations_domain ON visitor_locations(domain);
CREATE INDEX IF NOT EXISTS idx_visitor_locations_city_state ON visitor_locations(city, state_code);
CREATE INDEX IF NOT EXISTS idx_visitor_locations_created_at ON visitor_locations(created_at);
CREATE INDEX IF NOT EXISTS idx_visitor_locations_session ON visitor_locations(session_id);

-- Aggregated daily stats (materialized for performance)
CREATE TABLE IF NOT EXISTS visitor_location_stats (
  id SERIAL PRIMARY KEY,
  date DATE NOT NULL,
  domain VARCHAR(50) NOT NULL,
  city VARCHAR(100),
  state VARCHAR(100),
  state_code VARCHAR(10),
  country_code VARCHAR(10),

  -- Metrics
  visit_count INTEGER DEFAULT 0,
  unique_sessions INTEGER DEFAULT 0,

  UNIQUE(date, domain, city, state_code, country_code)
);

CREATE INDEX IF NOT EXISTS idx_visitor_stats_date ON visitor_location_stats(date);
CREATE INDEX IF NOT EXISTS idx_visitor_stats_domain ON visitor_location_stats(domain);
CREATE INDEX IF NOT EXISTS idx_visitor_stats_state ON visitor_location_stats(state_code);

-- View for easy querying of top locations
CREATE OR REPLACE VIEW v_top_visitor_locations AS
SELECT
  domain,
  city,
  state,
  state_code,
  country_code,
  COUNT(*) as total_visits,
  COUNT(DISTINCT session_id) as unique_sessions,
  MAX(created_at) as last_visit
FROM visitor_locations
WHERE created_at > NOW() - INTERVAL '30 days'
GROUP BY domain, city, state, state_code, country_code
ORDER BY total_visits DESC;
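visitor_location_stats is presumably refreshed from the raw events; one plausible daily rollup, illustrative only and not part of the migration (note rows with NULL city will not dedupe under the unique key):

INSERT INTO visitor_location_stats (date, domain, city, state, state_code, country_code, visit_count, unique_sessions)
SELECT created_at::date, domain, city, state, state_code, country_code,
       COUNT(*), COUNT(DISTINCT session_id)
FROM visitor_locations
WHERE created_at::date = CURRENT_DATE - 1
GROUP BY 1, 2, 3, 4, 5, 6
ON CONFLICT (date, domain, city, state_code, country_code) DO UPDATE
SET visit_count = EXCLUDED.visit_count,
    unique_sessions = EXCLUDED.unique_sessions;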
141  backend/migrations/076_worker_registry.sql  Normal file
@@ -0,0 +1,141 @@
-- Migration 076: Worker Registry for Dynamic Workers
-- Workers register on startup, receive a friendly name, and report heartbeats

-- Name pool for workers (expandable, no hardcoding)
CREATE TABLE IF NOT EXISTS worker_name_pool (
  id SERIAL PRIMARY KEY,
  name VARCHAR(50) UNIQUE NOT NULL,
  in_use BOOLEAN DEFAULT FALSE,
  assigned_to VARCHAR(100),  -- worker_id
  assigned_at TIMESTAMPTZ,
  created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Seed with initial names (can add more via API)
INSERT INTO worker_name_pool (name) VALUES
  ('Alice'), ('Bella'), ('Clara'), ('Diana'), ('Elena'),
  ('Fiona'), ('Grace'), ('Hazel'), ('Iris'), ('Julia'),
  ('Katie'), ('Luna'), ('Mia'), ('Nora'), ('Olive'),
  ('Pearl'), ('Quinn'), ('Rosa'), ('Sara'), ('Tara'),
  ('Uma'), ('Vera'), ('Wendy'), ('Xena'), ('Yuki'), ('Zara'),
  ('Amber'), ('Blake'), ('Coral'), ('Dawn'), ('Echo'),
  ('Fleur'), ('Gem'), ('Haven'), ('Ivy'), ('Jade'),
  ('Kira'), ('Lotus'), ('Maple'), ('Nova'), ('Onyx'),
  ('Pixel'), ('Quest'), ('Raven'), ('Sage'), ('Terra'),
  ('Unity'), ('Violet'), ('Willow'), ('Xylo'), ('Yara'), ('Zen')
ON CONFLICT (name) DO NOTHING;

-- Worker registry - tracks active workers
CREATE TABLE IF NOT EXISTS worker_registry (
  id SERIAL PRIMARY KEY,
  worker_id VARCHAR(100) UNIQUE NOT NULL,  -- e.g., "pod-abc123" or uuid
  friendly_name VARCHAR(50),               -- assigned from pool
  role VARCHAR(50) NOT NULL,               -- task role
  pod_name VARCHAR(100),                   -- k8s pod name
  hostname VARCHAR(100),                   -- machine hostname
  ip_address VARCHAR(50),                  -- worker IP
  status VARCHAR(20) DEFAULT 'starting',   -- starting, active, idle, offline, terminated
  started_at TIMESTAMPTZ DEFAULT NOW(),
  last_heartbeat_at TIMESTAMPTZ DEFAULT NOW(),
  last_task_at TIMESTAMPTZ,
  tasks_completed INTEGER DEFAULT 0,
  tasks_failed INTEGER DEFAULT 0,
  current_task_id INTEGER,
  metadata JSONB DEFAULT '{}',
  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

-- Indexes for worker registry
CREATE INDEX IF NOT EXISTS idx_worker_registry_status ON worker_registry(status);
CREATE INDEX IF NOT EXISTS idx_worker_registry_role ON worker_registry(role);
CREATE INDEX IF NOT EXISTS idx_worker_registry_heartbeat ON worker_registry(last_heartbeat_at);

-- Function to assign a name to a new worker
CREATE OR REPLACE FUNCTION assign_worker_name(p_worker_id VARCHAR(100))
RETURNS VARCHAR(50) AS $$
DECLARE
  v_name VARCHAR(50);
BEGIN
  -- Try to get an unused name
  UPDATE worker_name_pool
  SET in_use = TRUE, assigned_to = p_worker_id, assigned_at = NOW()
  WHERE id = (
    SELECT id FROM worker_name_pool
    WHERE in_use = FALSE
    ORDER BY RANDOM()
    LIMIT 1
    FOR UPDATE SKIP LOCKED
  )
  RETURNING name INTO v_name;

  -- If no names available, generate one
  IF v_name IS NULL THEN
    v_name := 'Worker-' || SUBSTRING(p_worker_id FROM 1 FOR 8);
  END IF;

  RETURN v_name;
END;
$$ LANGUAGE plpgsql;
-- Function to release a worker's name back to the pool
CREATE OR REPLACE FUNCTION release_worker_name(p_worker_id VARCHAR(100))
RETURNS VOID AS $$
BEGIN
  UPDATE worker_name_pool
  SET in_use = FALSE, assigned_to = NULL, assigned_at = NULL
  WHERE assigned_to = p_worker_id;
END;
$$ LANGUAGE plpgsql;

-- Function to mark stale workers as offline
CREATE OR REPLACE FUNCTION mark_stale_workers(stale_threshold_minutes INTEGER DEFAULT 5)
RETURNS INTEGER AS $$
DECLARE
  v_count INTEGER;
BEGIN
  UPDATE worker_registry
  SET status = 'offline', updated_at = NOW()
  WHERE status IN ('active', 'idle', 'starting')
    AND last_heartbeat_at < NOW() - (stale_threshold_minutes || ' minutes')::INTERVAL;
  -- Aggregates are not allowed in RETURNING; count affected rows via ROW_COUNT
  GET DIAGNOSTICS v_count = ROW_COUNT;

  -- Release names from offline workers
  PERFORM release_worker_name(worker_id)
  FROM worker_registry
  WHERE status = 'offline'
    AND last_heartbeat_at < NOW() - INTERVAL '30 minutes';

  RETURN COALESCE(v_count, 0);
END;
$$ LANGUAGE plpgsql;
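Taken together, a worker's registry lifecycle is three calls (the worker id is a placeholder):

SELECT assign_worker_name('pod-abc123');   -- returns e.g. 'Luna', or 'Worker-pod-abc1' when the pool is exhausted
SELECT mark_stale_workers(5);              -- scheduler: mark anyone silent for 5+ minutes as offline
SELECT release_worker_name('pod-abc123');  -- on clean shutdown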
-- View for dashboard
CREATE OR REPLACE VIEW v_active_workers AS
SELECT
  wr.id,
  wr.worker_id,
  wr.friendly_name,
  wr.role,
  wr.status,
  wr.pod_name,
  wr.hostname,
  wr.started_at,
  wr.last_heartbeat_at,
  wr.last_task_at,
  wr.tasks_completed,
  wr.tasks_failed,
  wr.current_task_id,
  EXTRACT(EPOCH FROM (NOW() - wr.last_heartbeat_at)) as seconds_since_heartbeat,
  CASE
    WHEN wr.status = 'offline' THEN 'offline'
    WHEN wr.last_heartbeat_at < NOW() - INTERVAL '2 minutes' THEN 'stale'
    WHEN wr.current_task_id IS NOT NULL THEN 'busy'
    ELSE 'ready'
  END as health_status
FROM worker_registry wr
WHERE wr.status != 'terminated'
ORDER BY wr.status = 'active' DESC, wr.last_heartbeat_at DESC;

COMMENT ON TABLE worker_registry IS 'Tracks all workers that have registered with the system';
COMMENT ON TABLE worker_name_pool IS 'Pool of friendly names for workers - expandable via API';
35  backend/migrations/077_click_events_location.sql  Normal file
@@ -0,0 +1,35 @@
-- Migration: Add visitor location and dispensary name to click events
-- Captures where visitors are clicking from and which dispensary

-- Add visitor location columns
ALTER TABLE product_click_events
  ADD COLUMN IF NOT EXISTS visitor_city VARCHAR(100);

ALTER TABLE product_click_events
  ADD COLUMN IF NOT EXISTS visitor_state VARCHAR(10);

ALTER TABLE product_click_events
  ADD COLUMN IF NOT EXISTS visitor_lat DECIMAL(10, 7);

ALTER TABLE product_click_events
  ADD COLUMN IF NOT EXISTS visitor_lng DECIMAL(10, 7);

-- Add dispensary name for easier reporting
ALTER TABLE product_click_events
  ADD COLUMN IF NOT EXISTS dispensary_name VARCHAR(255);

-- Create index for location-based analytics
CREATE INDEX IF NOT EXISTS idx_product_click_events_visitor_state
  ON product_click_events(visitor_state)
  WHERE visitor_state IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_product_click_events_visitor_city
  ON product_click_events(visitor_city)
  WHERE visitor_city IS NOT NULL;

-- Add comments
COMMENT ON COLUMN product_click_events.visitor_city IS 'City where the visitor is located (from IP geolocation)';
COMMENT ON COLUMN product_click_events.visitor_state IS 'State where the visitor is located (from IP geolocation)';
COMMENT ON COLUMN product_click_events.visitor_lat IS 'Visitor latitude (from IP geolocation)';
COMMENT ON COLUMN product_click_events.visitor_lng IS 'Visitor longitude (from IP geolocation)';
COMMENT ON COLUMN product_click_events.dispensary_name IS 'Name of the dispensary (denormalized for easier reporting)';
19  backend/node_modules/.package-lock.json  generated  vendored
@@ -1026,6 +1026,17 @@
      "url": "https://github.com/sponsors/fb55"
    }
  },
  "node_modules/csv-parser": {
    "version": "3.2.0",
    "resolved": "https://registry.npmjs.org/csv-parser/-/csv-parser-3.2.0.tgz",
    "integrity": "sha512-fgKbp+AJbn1h2dcAHKIdKNSSjfp43BZZykXsCjzALjKy80VXQNHPFJ6T9Afwdzoj24aMkq8GwDS7KGcDPpejrA==",
    "bin": {
      "csv-parser": "bin/csv-parser"
    },
    "engines": {
      "node": ">= 10"
    }
  },
  "node_modules/data-uri-to-buffer": {
    "version": "6.0.2",
    "resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz",
@@ -2235,6 +2246,14 @@
      "node": ">= 12"
    }
  },
  "node_modules/ip2location-nodejs": {
    "version": "9.7.0",
    "resolved": "https://registry.npmjs.org/ip2location-nodejs/-/ip2location-nodejs-9.7.0.tgz",
    "integrity": "sha512-eQ4T5TXm1cx0+pQcRycPiuaiRuoDEMd9O89Be7Ugk555qi9UY9enXSznkkqr3kQRyUaXx7zj5dORC5LGTPOttA==",
    "dependencies": {
      "csv-parser": "^3.0.0"
    }
  },
  "node_modules/ipaddr.js": {
    "version": "2.2.0",
    "resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-2.2.0.tgz",
20  backend/package-lock.json  generated
@@ -21,6 +21,7 @@
    "helmet": "^7.1.0",
    "https-proxy-agent": "^7.0.2",
    "ioredis": "^5.8.2",
    "ip2location-nodejs": "^9.7.0",
    "ipaddr.js": "^2.2.0",
    "jsonwebtoken": "^9.0.2",
    "minio": "^7.1.3",
@@ -1531,6 +1532,17 @@
      "url": "https://github.com/sponsors/fb55"
    }
  },
  "node_modules/csv-parser": {
    "version": "3.2.0",
    "resolved": "https://registry.npmjs.org/csv-parser/-/csv-parser-3.2.0.tgz",
    "integrity": "sha512-fgKbp+AJbn1h2dcAHKIdKNSSjfp43BZZykXsCjzALjKy80VXQNHPFJ6T9Afwdzoj24aMkq8GwDS7KGcDPpejrA==",
    "bin": {
      "csv-parser": "bin/csv-parser"
    },
    "engines": {
      "node": ">= 10"
    }
  },
  "node_modules/data-uri-to-buffer": {
    "version": "6.0.2",
    "resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz",
@@ -2754,6 +2766,14 @@
      "node": ">= 12"
    }
  },
  "node_modules/ip2location-nodejs": {
    "version": "9.7.0",
    "resolved": "https://registry.npmjs.org/ip2location-nodejs/-/ip2location-nodejs-9.7.0.tgz",
    "integrity": "sha512-eQ4T5TXm1cx0+pQcRycPiuaiRuoDEMd9O89Be7Ugk555qi9UY9enXSznkkqr3kQRyUaXx7zj5dORC5LGTPOttA==",
    "dependencies": {
      "csv-parser": "^3.0.0"
    }
  },
  "node_modules/ipaddr.js": {
    "version": "2.2.0",
    "resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-2.2.0.tgz",
@@ -35,6 +35,7 @@
    "helmet": "^7.1.0",
    "https-proxy-agent": "^7.0.2",
    "ioredis": "^5.8.2",
    "ip2location-nodejs": "^9.7.0",
    "ipaddr.js": "^2.2.0",
    "jsonwebtoken": "^9.0.2",
    "minio": "^7.1.3",
BIN  backend/public/downloads/cannaiq-menus-1.5.3.zip  Normal file (Binary file not shown.)
BIN  backend/public/downloads/cannaiq-menus-1.5.4.zip  Normal file (Binary file not shown.)
65  backend/scripts/download-ip2location.sh  Executable file
@@ -0,0 +1,65 @@
#!/bin/bash
# Download IP2Location LITE DB3 (City-level) database
# Free for commercial use with attribution
# https://lite.ip2location.com/database/db3-ip-country-region-city

set -e

DATA_DIR="${1:-./data/ip2location}"
DB_FILE="IP2LOCATION-LITE-DB3.BIN"

mkdir -p "$DATA_DIR"
cd "$DATA_DIR"

echo "Downloading IP2Location LITE DB3 database..."

# IP2Location LITE DB3 - includes city, region, country, lat/lng
# You need to register at https://lite.ip2location.com/ to get a download token
# Then set the IP2LOCATION_TOKEN environment variable

if [ -z "$IP2LOCATION_TOKEN" ]; then
  echo ""
  echo "ERROR: IP2LOCATION_TOKEN not set"
  echo ""
  echo "To download the database:"
  echo "1. Register free at https://lite.ip2location.com/"
  echo "2. Get your download token from the dashboard"
  echo "3. Run: IP2LOCATION_TOKEN=your_token ./scripts/download-ip2location.sh"
  echo ""
  exit 1
fi

# Download DB3.LITE (IPv4 + City)
DOWNLOAD_URL="https://www.ip2location.com/download/?token=${IP2LOCATION_TOKEN}&file=DB3LITEBIN"

echo "Downloading from IP2Location..."
curl -L -o ip2location.zip "$DOWNLOAD_URL"

echo "Extracting..."
unzip -o ip2location.zip

# Rename to standard name
if [ -f "IP2LOCATION-LITE-DB3.BIN" ]; then
  echo "Database ready: $DATA_DIR/IP2LOCATION-LITE-DB3.BIN"
elif [ -f "IP-COUNTRY-REGION-CITY.BIN" ]; then
  mv "IP-COUNTRY-REGION-CITY.BIN" "$DB_FILE"
  echo "Database ready: $DATA_DIR/$DB_FILE"
else
  # Find whatever BIN file was extracted
  BIN_FILE=$(ls *.BIN 2>/dev/null | head -1)
  if [ -n "$BIN_FILE" ]; then
    mv "$BIN_FILE" "$DB_FILE"
    echo "Database ready: $DATA_DIR/$DB_FILE"
  else
    echo "ERROR: No BIN file found in archive"
    ls -la
    exit 1
  fi
fi

# Cleanup
rm -f ip2location.zip *.txt LICENSE* README*

echo ""
echo "Done! Database saved to: $DATA_DIR/$DB_FILE"
echo "Update monthly by re-running this script."
@@ -1,3 +1,14 @@
/**
 * CannaiQ Authentication Middleware
 *
 * AUTH METHODS (in order of priority):
 * 1. IP-based: Localhost/trusted IPs get 'internal' role (full access, no token needed)
 * 2. Token-based: Bearer token (JWT or API token)
 *
 * NO username/password auth in API. Use tokens only.
 *
 * Localhost bypass: curl from 127.0.0.1 gets automatic admin access.
 */
import { Request, Response, NextFunction } from 'express';
import jwt from 'jsonwebtoken';
import bcrypt from 'bcrypt';
@@ -5,6 +16,86 @@ import { pool } from '../db/pool';

const JWT_SECRET = process.env.JWT_SECRET || 'change_this_in_production';

// Trusted origins that bypass auth for internal/same-origin requests
const TRUSTED_ORIGINS = [
  'https://cannaiq.co',
  'https://www.cannaiq.co',
  'https://findadispo.com',
  'https://www.findadispo.com',
  'https://findagram.co',
  'https://www.findagram.co',
  'http://localhost:3010',
  'http://localhost:8080',
  'http://localhost:5173',
];

// Pattern-based trusted origins (wildcards)
const TRUSTED_ORIGIN_PATTERNS = [
  /^https:\/\/.*\.cannabrands\.app$/, // *.cannabrands.app
];

// Trusted IPs for internal pod-to-pod communication
const TRUSTED_IPS = [
  '127.0.0.1',
  '::1',
  '::ffff:127.0.0.1',
];

/**
 * Check if request is from a trusted origin/IP
 */
function isTrustedRequest(req: Request): boolean {
  // Check origin header
  const origin = req.headers.origin;
  if (origin) {
    if (TRUSTED_ORIGINS.includes(origin)) {
      return true;
    }
    // Check pattern-based origins (wildcards like *.cannabrands.app)
    for (const pattern of TRUSTED_ORIGIN_PATTERNS) {
      if (pattern.test(origin)) {
        return true;
      }
    }
  }

  // Check referer header (for same-origin requests without CORS)
  const referer = req.headers.referer;
  if (referer) {
    for (const trusted of TRUSTED_ORIGINS) {
      if (referer.startsWith(trusted)) {
        return true;
      }
    }
    // Check pattern-based referers
    try {
      const refererUrl = new URL(referer);
      const refererOrigin = refererUrl.origin;
      for (const pattern of TRUSTED_ORIGIN_PATTERNS) {
        if (pattern.test(refererOrigin)) {
          return true;
        }
      }
    } catch {
      // Invalid referer URL, skip
    }
  }

  // Check IP for internal requests (pod-to-pod, localhost)
  const clientIp = req.ip || req.socket.remoteAddress || '';
  if (TRUSTED_IPS.includes(clientIp)) {
    return true;
  }

  // Check for Kubernetes internal header (set by ingress/service mesh)
  const internalHeader = req.headers['x-internal-request'];
  if (internalHeader === process.env.INTERNAL_REQUEST_SECRET) {
    return true;
  }

  return false;
}

export interface AuthUser {
  id: number;
  email: string;
@@ -61,6 +152,16 @@ export async function authenticateUser(email: string, password: string): Promise
}

export async function authMiddleware(req: AuthRequest, res: Response, next: NextFunction) {
  // Allow trusted origins/IPs to bypass auth (internal services, same-origin)
  if (isTrustedRequest(req)) {
    req.user = {
      id: 0,
      email: 'internal@system',
      role: 'internal'
    };
    return next();
  }

  const authHeader = req.headers.authorization;

  if (!authHeader || !authHeader.startsWith('Bearer ')) {
@@ -135,12 +236,23 @@ export async function authMiddleware(req: AuthRequest, res: Response, next: Next
  }
}

/**
 * Require specific role(s) to access endpoint.
 *
 * NOTE: 'internal' role (localhost/trusted IPs) bypasses all role checks.
 * This allows local development and internal services full access.
 */
export function requireRole(...roles: string[]) {
  return (req: AuthRequest, res: Response, next: NextFunction) => {
    if (!req.user) {
      return res.status(401).json({ error: 'Not authenticated' });
    }

    // Internal role (localhost) bypasses role checks
    if (req.user.role === 'internal') {
      return next();
    }

    if (!roles.includes(req.user.role)) {
      return res.status(403).json({ error: 'Insufficient permissions' });
    }
@@ -472,7 +472,8 @@ export class CanonicalHydrationService {
    }

    // Step 3: Create initial snapshots from current product state
    const snapshotsWritten = await this.createInitialSnapshots(dispensaryId, crawlRunId);
    // crawlRunId is guaranteed to be set at this point (either from existing run or insert)
    const snapshotsWritten = await this.createInitialSnapshots(dispensaryId, crawlRunId!);
    result.snapshotsWritten += snapshotsWritten;

    // Update crawl run with snapshot count
@@ -1,6 +1,7 @@
#!/usr/bin/env node
/**
 * CLI Entrypoint for CannaIQ Backend
 * @module cli
 *
 * Usage:
 *   npx tsx src/cli.ts    # Start API server
@@ -50,18 +51,14 @@ async function main() {
    showHelp();
  }

  if (args.includes('--worker')) {
    console.log('[CLI] Starting worker process...');
    const { startWorker } = await import('./dutchie-az/services/worker');
    await startWorker();
  } else {
    // Default: start API server
    console.log('[CLI] Starting API server...');
    await import('./index');
  }
  // Default: start API server
  console.log('[CLI] Starting API server...');
  await import('./index');
}

main().catch((error) => {
  console.error('[CLI] Fatal error:', error);
  process.exit(1);
});

export {};
@@ -1,657 +0,0 @@
|
||||
/**
|
||||
* Base Dutchie Crawler Template
|
||||
*
|
||||
* This is the base template for all Dutchie store crawlers.
|
||||
* Per-store crawlers extend this by overriding specific methods.
|
||||
*
|
||||
* Exports:
|
||||
* - crawlProducts(dispensary, options) - Main crawl entry point
|
||||
* - detectStructure(page) - Detect page structure for sandbox mode
|
||||
* - extractProducts(document) - Extract product data
|
||||
* - extractImages(document) - Extract product images
|
||||
* - extractStock(document) - Extract stock status
|
||||
* - extractPagination(document) - Extract pagination info
|
||||
*/
|
||||
|
||||
import {
|
||||
crawlDispensaryProducts as baseCrawlDispensaryProducts,
|
||||
CrawlResult,
|
||||
} from '../../dutchie-az/services/product-crawler';
|
||||
import { Dispensary, CrawlerProfileOptions } from '../../dutchie-az/types';
|
||||
|
||||
// Re-export CrawlResult for convenience
|
||||
export { CrawlResult };
|
||||
|
||||
// ============================================================
|
||||
// TYPES
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* Options passed to the per-store crawler
|
||||
*/
|
||||
export interface StoreCrawlOptions {
|
||||
pricingType?: 'rec' | 'med';
|
||||
useBothModes?: boolean;
|
||||
downloadImages?: boolean;
|
||||
trackStock?: boolean;
|
||||
timeoutMs?: number;
|
||||
config?: Record<string, any>;
|
||||
}
|
||||
|
||||
/**
|
||||
* Progress callback for reporting crawl progress
|
||||
*/
|
||||
export interface CrawlProgressCallback {
|
||||
phase: 'fetching' | 'processing' | 'saving' | 'images' | 'complete';
|
||||
current: number;
|
||||
total: number;
|
||||
message?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Structure detection result for sandbox mode
|
||||
*/
|
||||
export interface StructureDetectionResult {
|
||||
success: boolean;
|
||||
menuType: 'dutchie' | 'treez' | 'jane' | 'unknown';
|
||||
iframeUrl?: string;
|
||||
graphqlEndpoint?: string;
|
||||
dispensaryId?: string;
|
||||
selectors: {
|
||||
productContainer?: string;
|
||||
productName?: string;
|
||||
productPrice?: string;
|
||||
productImage?: string;
|
||||
productCategory?: string;
|
||||
pagination?: string;
|
||||
loadMore?: string;
|
||||
};
|
||||
pagination: {
|
||||
type: 'scroll' | 'click' | 'graphql' | 'none';
|
||||
hasMore?: boolean;
|
||||
pageSize?: number;
|
||||
};
|
||||
errors: string[];
|
||||
metadata: Record<string, any>;
|
||||
}
|
||||
|
||||
/**
|
||||
* Product extraction result
|
||||
*/
|
||||
export interface ExtractedProduct {
|
||||
externalId: string;
|
||||
name: string;
|
||||
brand?: string;
|
||||
category?: string;
|
||||
subcategory?: string;
|
||||
price?: number;
|
||||
priceRec?: number;
|
||||
priceMed?: number;
|
||||
weight?: string;
|
||||
thcContent?: string;
|
||||
cbdContent?: string;
|
||||
description?: string;
|
||||
imageUrl?: string;
|
||||
stockStatus?: 'in_stock' | 'out_of_stock' | 'low_stock' | 'unknown';
|
||||
quantity?: number;
|
||||
raw?: Record<string, any>;
|
||||
}
|
||||
|
||||
/**
|
||||
* Image extraction result
|
||||
*/
|
||||
export interface ExtractedImage {
|
||||
productId: string;
|
||||
imageUrl: string;
|
||||
isPrimary: boolean;
|
||||
position: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Stock extraction result
|
||||
*/
|
||||
export interface ExtractedStock {
|
||||
productId: string;
|
||||
status: 'in_stock' | 'out_of_stock' | 'low_stock' | 'unknown';
|
||||
quantity?: number;
|
||||
lastChecked: Date;
|
||||
}
|
||||
|
||||
/**
|
||||
* Pagination extraction result
|
||||
*/
|
||||
export interface ExtractedPagination {
|
||||
hasNextPage: boolean;
|
||||
currentPage?: number;
|
||||
totalPages?: number;
|
||||
totalProducts?: number;
|
||||
nextCursor?: string;
|
||||
loadMoreSelector?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Hook points that per-store crawlers can override
|
||||
*/
|
||||
export interface DutchieCrawlerHooks {
|
||||
/**
|
||||
* Called before fetching products
|
||||
* Can be used to set up custom headers, cookies, etc.
|
||||
*/
|
||||
beforeFetch?: (dispensary: Dispensary) => Promise<void>;
|
||||
|
||||
/**
|
||||
* Called after fetching products, before processing
|
||||
* Can be used to filter or transform raw products
|
||||
*/
|
||||
afterFetch?: (products: any[], dispensary: Dispensary) => Promise<any[]>;
|
||||
|
||||
/**
|
||||
* Called after all processing is complete
|
||||
* Can be used for cleanup or post-processing
|
||||
*/
|
||||
afterComplete?: (result: CrawlResult, dispensary: Dispensary) => Promise<void>;
|
||||
|
||||
/**
|
||||
* Custom selector resolver for iframe detection
|
||||
*/
|
||||
resolveIframe?: (page: any) => Promise<string | null>;
|
||||
|
||||
/**
|
||||
* Custom product container selector
|
||||
*/
|
||||
getProductContainerSelector?: () => string;
|
||||
|
||||
/**
|
||||
* Custom product extraction from container element
|
||||
*/
|
||||
extractProductFromElement?: (element: any) => Promise<ExtractedProduct | null>;
|
||||
}
|
||||

/**
 * Selectors configuration for per-store overrides
 */
export interface DutchieSelectors {
  iframe?: string;
  productContainer?: string;
  productName?: string;
  productPrice?: string;
  productPriceRec?: string;
  productPriceMed?: string;
  productImage?: string;
  productCategory?: string;
  productBrand?: string;
  productWeight?: string;
  productThc?: string;
  productCbd?: string;
  productDescription?: string;
  productStock?: string;
  loadMore?: string;
  pagination?: string;
}

// ============================================================
// DEFAULT SELECTORS
// ============================================================

export const DEFAULT_DUTCHIE_SELECTORS: DutchieSelectors = {
  iframe: 'iframe[src*="dutchie.com"]',
  productContainer: '[data-testid="product-card"], .product-card, [class*="ProductCard"]',
  productName: '[data-testid="product-title"], .product-title, [class*="ProductTitle"]',
  productPrice: '[data-testid="product-price"], .product-price, [class*="ProductPrice"]',
  productImage: 'img[src*="dutchie"], img[src*="product"], .product-image img',
  productCategory: '[data-testid="category-name"], .category-name',
  productBrand: '[data-testid="brand-name"], .brand-name, [class*="BrandName"]',
  loadMore: 'button[data-testid="load-more"], .load-more-button',
  pagination: '.pagination, [class*="Pagination"]',
};
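// A minimal sketch of a per-store selector override (the '.custom-menu-card'
// class name is hypothetical). Only the keys you set replace the defaults,
// via the `{ ...DEFAULT_DUTCHIE_SELECTORS, ...selectors }` merge in the
// constructor below.
//
// const exampleStoreSelectors: DutchieSelectors = {
//   productContainer: '.custom-menu-card', // hypothetical store-specific class
// };
// // All other selectors (productName, productPrice, ...) fall through to
// // DEFAULT_DUTCHIE_SELECTORS.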

// ============================================================
// BASE CRAWLER CLASS
// ============================================================

/**
 * BaseDutchieCrawler - Base class for all Dutchie store crawlers
 *
 * Per-store crawlers extend this class and override methods as needed.
 * The default implementation delegates to the existing shared Dutchie logic.
 */
export class BaseDutchieCrawler {
  protected dispensary: Dispensary;
  protected options: StoreCrawlOptions;
  protected hooks: DutchieCrawlerHooks;
  protected selectors: DutchieSelectors;

  constructor(
    dispensary: Dispensary,
    options: StoreCrawlOptions = {},
    hooks: DutchieCrawlerHooks = {},
    selectors: DutchieSelectors = {}
  ) {
    this.dispensary = dispensary;
    this.options = {
      pricingType: 'rec',
      useBothModes: true,
      downloadImages: true,
      trackStock: true,
      timeoutMs: 30000,
      ...options,
    };
    this.hooks = hooks;
    this.selectors = { ...DEFAULT_DUTCHIE_SELECTORS, ...selectors };
  }

  /**
   * Main entry point - crawl products for this dispensary.
   * Override this in per-store crawlers to customize behavior.
   */
  async crawlProducts(): Promise<CrawlResult> {
    // Call beforeFetch hook if defined
    if (this.hooks.beforeFetch) {
      await this.hooks.beforeFetch(this.dispensary);
    }

    // Use the existing shared Dutchie crawl logic
    const result = await baseCrawlDispensaryProducts(
      this.dispensary,
      this.options.pricingType || 'rec',
      {
        useBothModes: this.options.useBothModes,
        downloadImages: this.options.downloadImages,
      }
    );

    // Call afterComplete hook if defined
    if (this.hooks.afterComplete) {
      await this.hooks.afterComplete(result, this.dispensary);
    }

    return result;
  }

  /**
   * Detect page structure for sandbox discovery mode.
   * Override in per-store crawlers if needed.
   *
   * @param page - Puppeteer page object or HTML string
   * @returns Structure detection result
   */
  async detectStructure(page: any): Promise<StructureDetectionResult> {
    const result: StructureDetectionResult = {
      success: false,
      menuType: 'unknown',
      selectors: {},
      pagination: { type: 'none' },
      errors: [],
      metadata: {},
    };

    try {
      // Default implementation: check for Dutchie iframe
      if (typeof page === 'string') {
        // HTML string mode
        if (page.includes('dutchie.com')) {
          result.menuType = 'dutchie';
          result.success = true;
        }
      } else if (page && typeof page.evaluate === 'function') {
        // Puppeteer page mode
        const detection = await page.evaluate((selectorConfig: DutchieSelectors) => {
          const iframe = document.querySelector(selectorConfig.iframe || '') as HTMLIFrameElement;
          const iframeUrl = iframe?.src || null;

          // Check for product containers
          const containers = document.querySelectorAll(selectorConfig.productContainer || '');

          return {
            hasIframe: !!iframe,
            iframeUrl,
            productCount: containers.length,
            isDutchie: !!iframeUrl?.includes('dutchie.com'),
          };
        }, this.selectors);

        if (detection.isDutchie) {
          result.menuType = 'dutchie';
          result.iframeUrl = detection.iframeUrl;
          result.success = true;
        }

        result.metadata = detection;
      }

      // Set default selectors for Dutchie
      if (result.menuType === 'dutchie') {
        result.selectors = {
          productContainer: this.selectors.productContainer,
          productName: this.selectors.productName,
          productPrice: this.selectors.productPrice,
          productImage: this.selectors.productImage,
          productCategory: this.selectors.productCategory,
        };
        result.pagination = { type: 'graphql' };
      }
    } catch (error: any) {
      result.errors.push(`Detection error: ${error.message}`);
    }

    return result;
  }

  /**
   * Extract products from page/document.
   * Override in per-store crawlers for custom extraction.
   *
   * @param document - DOM document, Puppeteer page, or raw products array
   * @returns Array of extracted products
   */
  async extractProducts(document: any): Promise<ExtractedProduct[]> {
    // Default implementation: assume document is already an array of products
    // from the GraphQL response
    if (Array.isArray(document)) {
      return document.map((product) => this.mapRawProduct(product));
    }

    // If document is a Puppeteer page, extract from DOM
    if (document && typeof document.evaluate === 'function') {
      return this.extractProductsFromPage(document);
    }

    return [];
  }

  /**
   * Extract products from Puppeteer page.
   * Override for custom DOM extraction.
   */
  protected async extractProductsFromPage(page: any): Promise<ExtractedProduct[]> {
    const products = await page.evaluate((selectors: DutchieSelectors) => {
      const containers = document.querySelectorAll(selectors.productContainer || '');
      return Array.from(containers).map((container) => {
        const nameEl = container.querySelector(selectors.productName || '');
        const priceEl = container.querySelector(selectors.productPrice || '');
        const imageEl = container.querySelector(selectors.productImage || '') as HTMLImageElement;
        const brandEl = container.querySelector(selectors.productBrand || '');

        return {
          name: nameEl?.textContent?.trim() || '',
          price: priceEl?.textContent?.trim() || '',
          imageUrl: imageEl?.src || '',
          brand: brandEl?.textContent?.trim() || '',
        };
      });
    }, this.selectors);

    return products.map((p: any, i: number) => ({
      externalId: `dom-product-${i}`,
      name: p.name,
      brand: p.brand,
      price: this.parsePrice(p.price),
      imageUrl: p.imageUrl,
      stockStatus: 'unknown' as const,
    }));
  }

  /**
   * Map raw product from GraphQL to ExtractedProduct.
   * Override for custom mapping.
   */
  protected mapRawProduct(raw: any): ExtractedProduct {
    return {
      externalId: raw.id || raw._id || raw.externalId,
      name: raw.name || raw.Name,
      brand: raw.brand?.name || raw.brandName || raw.brand,
      category: raw.type || raw.category || raw.Category,
      subcategory: raw.subcategory || raw.Subcategory,
      price: raw.recPrice || raw.price || raw.Price,
      priceRec: raw.recPrice || raw.Prices?.rec,
      priceMed: raw.medPrice || raw.Prices?.med,
      weight: raw.weight || raw.Weight,
      thcContent: raw.potencyThc?.formatted || raw.THCContent?.formatted,
      cbdContent: raw.potencyCbd?.formatted || raw.CBDContent?.formatted,
      description: raw.description || raw.Description,
      imageUrl: raw.image || raw.Image,
      stockStatus: this.mapStockStatus(raw),
      quantity: raw.quantity || raw.Quantity,
      raw,
    };
  }

  /**
   * Map raw stock status to standardized value
   */
  protected mapStockStatus(raw: any): 'in_stock' | 'out_of_stock' | 'low_stock' | 'unknown' {
    const status = raw.Status || raw.status || raw.stockStatus;
    if (status === 'Active' || status === 'active' || status === 'in_stock') {
      return 'in_stock';
    }
    if (status === 'Inactive' || status === 'inactive' || status === 'out_of_stock') {
      return 'out_of_stock';
    }
    if (status === 'low_stock') {
      return 'low_stock';
    }
    return 'unknown';
  }

  /**
   * Parse price string to number
   */
  protected parsePrice(priceStr: string): number | undefined {
    if (!priceStr) return undefined;
    const cleaned = priceStr.replace(/[^0-9.]/g, '');
    const num = parseFloat(cleaned);
    return isNaN(num) ? undefined : num;
  }
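  // Illustrative behavior of parsePrice:
  //   parsePrice('$25.00') -> 25
  //   parsePrice('')       -> undefined
  //   parsePrice('2 for $30') -> 230, since all digits are kept; callers
  //   should pass a single price string, not a deal description.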

  /**
   * Extract images from document.
   * Override for custom image extraction.
   *
   * @param document - DOM document, Puppeteer page, or products array
   * @returns Array of extracted images
   */
  async extractImages(document: any): Promise<ExtractedImage[]> {
    if (Array.isArray(document)) {
      return document
        .filter((p) => p.image || p.Image || p.imageUrl)
        .map((p, i) => ({
          productId: p.id || p._id || `product-${i}`,
          imageUrl: p.image || p.Image || p.imageUrl,
          isPrimary: true,
          position: 0,
        }));
    }

    // Puppeteer page extraction
    if (document && typeof document.evaluate === 'function') {
      return this.extractImagesFromPage(document);
    }

    return [];
  }

  /**
   * Extract images from Puppeteer page
   */
  protected async extractImagesFromPage(page: any): Promise<ExtractedImage[]> {
    const images = await page.evaluate((selector: string) => {
      const imgs = document.querySelectorAll(selector);
      return Array.from(imgs).map((img, i) => ({
        src: (img as HTMLImageElement).src,
        position: i,
      }));
    }, this.selectors.productImage || 'img');

    return images.map((img: any, i: number) => ({
      productId: `dom-product-${i}`,
      imageUrl: img.src,
      isPrimary: i === 0,
      position: img.position,
    }));
  }

  /**
   * Extract stock information from document.
   * Override for custom stock extraction.
   *
   * @param document - DOM document, Puppeteer page, or products array
   * @returns Array of extracted stock statuses
   */
  async extractStock(document: any): Promise<ExtractedStock[]> {
    if (Array.isArray(document)) {
      return document.map((p) => ({
        productId: p.id || p._id || p.externalId,
        status: this.mapStockStatus(p),
        quantity: p.quantity || p.Quantity,
        lastChecked: new Date(),
      }));
    }

    return [];
  }

  /**
   * Extract pagination information from document.
   * Override for custom pagination handling.
   *
   * @param document - DOM document, Puppeteer page, or GraphQL response
   * @returns Pagination info
   */
  async extractPagination(document: any): Promise<ExtractedPagination> {
    // Default: check for page info in GraphQL response
    if (document && document.pageInfo) {
      return {
        hasNextPage: document.pageInfo.hasNextPage || false,
        currentPage: document.pageInfo.currentPage,
        totalPages: document.pageInfo.totalPages,
        totalProducts: document.pageInfo.totalCount || document.totalCount,
        nextCursor: document.pageInfo.endCursor,
      };
    }

    // Default: no pagination
    return {
      hasNextPage: false,
    };
  }

  /**
   * Get the cName (Dutchie slug) for this dispensary.
   * Override to customize cName extraction.
   */
  getCName(): string {
    if (this.dispensary.menuUrl) {
      try {
        const url = new URL(this.dispensary.menuUrl);
        const segments = url.pathname.split('/').filter(Boolean);
        if (segments.length >= 2) {
          return segments[segments.length - 1];
        }
      } catch {
        // Fall through to default
      }
    }
    return this.dispensary.slug || '';
  }
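  // Illustrative example of getCName (the URL is hypothetical):
  //   menuUrl 'https://dutchie.com/embedded-menu/some-store-slug'
  //   -> segments ['embedded-menu', 'some-store-slug'] -> 'some-store-slug'
  //   An unparsable URL, or one with fewer than two path segments, falls
  //   back to this.dispensary.slug.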

  /**
   * Get custom headers for API requests.
   * Override for store-specific headers.
   */
  getCustomHeaders(): Record<string, string> {
    const cName = this.getCName();
    return {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      Origin: 'https://dutchie.com',
      Referer: `https://dutchie.com/embedded-menu/${cName}`,
    };
  }
}

// ============================================================
// FACTORY FUNCTION
// ============================================================

/**
 * Create a base Dutchie crawler instance.
 * This is the default export used when no per-store override exists.
 */
export function createCrawler(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {},
  hooks: DutchieCrawlerHooks = {},
  selectors: DutchieSelectors = {}
): BaseDutchieCrawler {
  return new BaseDutchieCrawler(dispensary, options, hooks, selectors);
}

// ============================================================
// STANDALONE FUNCTIONS (required exports for orchestrator)
// ============================================================

/**
 * Crawl products using the base Dutchie logic.
 * Per-store files can call this or override it completely.
 */
export async function crawlProducts(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {}
): Promise<CrawlResult> {
  const crawler = createCrawler(dispensary, options);
  return crawler.crawlProducts();
}

/**
 * Detect structure using the base Dutchie logic
 */
export async function detectStructure(
  page: any,
  dispensary?: Dispensary
): Promise<StructureDetectionResult> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.detectStructure(page);
}

/**
 * Extract products using the base Dutchie logic
 */
export async function extractProducts(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedProduct[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractProducts(document);
}

/**
 * Extract images using the base Dutchie logic
 */
export async function extractImages(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedImage[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractImages(document);
}

/**
 * Extract stock using the base Dutchie logic
 */
export async function extractStock(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedStock[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractStock(document);
}

/**
 * Extract pagination using the base Dutchie logic
 */
export async function extractPagination(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedPagination> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractPagination(document);
}
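// A short usage sketch of the standalone exports above (illustrative; how
// `dispensaryFromDb` is loaded is an assumption):
//
// import { crawlProducts } from './base-dutchie';
//
// async function runOne(dispensaryFromDb: Dispensary): Promise<void> {
//   const result = await crawlProducts(dispensaryFromDb, { pricingType: 'rec' });
//   if (!result.success) {
//     console.error(`Crawl failed: ${result.errorMessage}`);
//   }
// }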
@@ -1,330 +0,0 @@
/**
 * Base Jane Crawler Template (PLACEHOLDER)
 *
 * This is the base template for all Jane (iheartjane) store crawlers.
 * Per-store crawlers extend this by overriding specific methods.
 *
 * TODO: Implement Jane-specific crawling logic (Algolia-based)
 */

import { Dispensary } from '../../dutchie-az/types';
import {
  StoreCrawlOptions,
  CrawlResult,
  StructureDetectionResult,
  ExtractedProduct,
  ExtractedImage,
  ExtractedStock,
  ExtractedPagination,
} from './base-dutchie';

// Re-export types
export {
  StoreCrawlOptions,
  CrawlResult,
  StructureDetectionResult,
  ExtractedProduct,
  ExtractedImage,
  ExtractedStock,
  ExtractedPagination,
};

// ============================================================
// JANE-SPECIFIC TYPES
// ============================================================

export interface JaneConfig {
  algoliaAppId?: string;
  algoliaApiKey?: string;
  algoliaIndex?: string;
  storeId?: string;
}

export interface JaneSelectors {
  productContainer?: string;
  productName?: string;
  productPrice?: string;
  productImage?: string;
  productCategory?: string;
  productBrand?: string;
  pagination?: string;
  loadMore?: string;
}

export const DEFAULT_JANE_SELECTORS: JaneSelectors = {
  productContainer: '[data-testid="product-card"], .product-card',
  productName: '[data-testid="product-name"], .product-name',
  productPrice: '[data-testid="product-price"], .product-price',
  productImage: '.product-image img, [data-testid="product-image"] img',
  productCategory: '.product-category',
  productBrand: '.product-brand, [data-testid="brand-name"]',
  loadMore: '[data-testid="load-more"], .load-more-btn',
};

// ============================================================
// BASE JANE CRAWLER CLASS
// ============================================================

export class BaseJaneCrawler {
  protected dispensary: Dispensary;
  protected options: StoreCrawlOptions;
  protected selectors: JaneSelectors;
  protected janeConfig: JaneConfig;

  constructor(
    dispensary: Dispensary,
    options: StoreCrawlOptions = {},
    selectors: JaneSelectors = {},
    janeConfig: JaneConfig = {}
  ) {
    this.dispensary = dispensary;
    this.options = {
      pricingType: 'rec',
      useBothModes: false,
      downloadImages: true,
      trackStock: true,
      timeoutMs: 30000,
      ...options,
    };
    this.selectors = { ...DEFAULT_JANE_SELECTORS, ...selectors };
    this.janeConfig = janeConfig;
  }

  /**
   * Main entry point - crawl products for this dispensary.
   * TODO: Implement Jane/Algolia-specific crawling.
   */
  async crawlProducts(): Promise<CrawlResult> {
    const startTime = Date.now();
    console.warn(`[BaseJaneCrawler] Jane crawling not yet implemented for ${this.dispensary.name}`);
    return {
      success: false,
      dispensaryId: this.dispensary.id || 0,
      productsFound: 0,
      productsFetched: 0,
      productsUpserted: 0,
      snapshotsCreated: 0,
      imagesDownloaded: 0,
      errorMessage: 'Jane crawler not yet implemented',
      durationMs: Date.now() - startTime,
    };
  }

  /**
   * Detect page structure for sandbox discovery mode.
   * Jane uses Algolia, so we look for Algolia config.
   */
  async detectStructure(page: any): Promise<StructureDetectionResult> {
    const result: StructureDetectionResult = {
      success: false,
      menuType: 'unknown',
      selectors: {},
      pagination: { type: 'none' },
      errors: [],
      metadata: {},
    };

    try {
      if (page && typeof page.evaluate === 'function') {
        // Look for Jane/Algolia indicators
        const detection = await page.evaluate(() => {
          // Check for iheartjane in page
          const hasJane = document.documentElement.innerHTML.includes('iheartjane') ||
            document.documentElement.innerHTML.includes('jane-menu');

          // Look for Algolia config
          const scripts = Array.from(document.querySelectorAll('script'));
          let algoliaConfig: any = null;

          for (const script of scripts) {
            const content = script.textContent || '';
            if (content.includes('algolia') || content.includes('ALGOLIA')) {
              // Try to extract config
              const appIdMatch = content.match(/applicationId['":\s]+['"]([^'"]+)['"]/);
              const apiKeyMatch = content.match(/apiKey['":\s]+['"]([^'"]+)['"]/);
              if (appIdMatch && apiKeyMatch) {
                algoliaConfig = {
                  appId: appIdMatch[1],
                  apiKey: apiKeyMatch[1],
                };
              }
            }
          }

          return {
            hasJane,
            algoliaConfig,
          };
        });

        if (detection.hasJane) {
          result.menuType = 'jane';
          result.success = true;
          result.metadata = detection;

          if (detection.algoliaConfig) {
            result.metadata.algoliaAppId = detection.algoliaConfig.appId;
            result.metadata.algoliaApiKey = detection.algoliaConfig.apiKey;
          }
        }
      }
    } catch (error: any) {
      result.errors.push(`Detection error: ${error.message}`);
    }

    return result;
  }

  /**
   * Extract products from Algolia response or page
   */
  async extractProducts(document: any): Promise<ExtractedProduct[]> {
    // If document is Algolia hits array
    if (Array.isArray(document)) {
      return document.map((hit) => this.mapAlgoliaHit(hit));
    }

    console.warn('[BaseJaneCrawler] extractProducts not yet fully implemented');
    return [];
  }

  /**
   * Map Algolia hit to ExtractedProduct
   */
  protected mapAlgoliaHit(hit: any): ExtractedProduct {
    return {
      externalId: hit.objectID || hit.id || hit.product_id,
      name: hit.name || hit.product_name,
      brand: hit.brand || hit.brand_name,
      category: hit.category || hit.kind,
      subcategory: hit.subcategory,
      price: hit.price || hit.bucket_price,
      priceRec: hit.prices?.rec || hit.price_rec,
      priceMed: hit.prices?.med || hit.price_med,
      weight: hit.weight || hit.amount,
      thcContent: hit.percent_thc ? `${hit.percent_thc}%` : undefined,
      cbdContent: hit.percent_cbd ? `${hit.percent_cbd}%` : undefined,
      description: hit.description,
      imageUrl: hit.image_url || hit.product_image_url,
      stockStatus: hit.available ? 'in_stock' : 'out_of_stock',
      quantity: hit.quantity_available,
      raw: hit,
    };
  }

  /**
   * Extract images from document
   */
  async extractImages(document: any): Promise<ExtractedImage[]> {
    if (Array.isArray(document)) {
      return document
        .filter((hit) => hit.image_url || hit.product_image_url)
        .map((hit, i) => ({
          productId: hit.objectID || hit.id || `jane-product-${i}`,
          imageUrl: hit.image_url || hit.product_image_url,
          isPrimary: true,
          position: 0,
        }));
    }

    return [];
  }

  /**
   * Extract stock information from document
   */
  async extractStock(document: any): Promise<ExtractedStock[]> {
    if (Array.isArray(document)) {
      return document.map((hit) => ({
        productId: hit.objectID || hit.id,
        status: hit.available ? 'in_stock' as const : 'out_of_stock' as const,
        quantity: hit.quantity_available,
        lastChecked: new Date(),
      }));
    }

    return [];
  }

  /**
   * Extract pagination information.
   * Algolia list responses are page-based (page / nbPages).
   */
  async extractPagination(document: any): Promise<ExtractedPagination> {
    if (document && typeof document === 'object' && !Array.isArray(document)) {
      return {
        hasNextPage: document.page < document.nbPages - 1,
        currentPage: document.page,
        totalPages: document.nbPages,
        totalProducts: document.nbHits,
      };
    }

    return { hasNextPage: false };
  }
}

// ============================================================
// FACTORY FUNCTION
// ============================================================

export function createCrawler(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {},
  selectors: JaneSelectors = {},
  janeConfig: JaneConfig = {}
): BaseJaneCrawler {
  return new BaseJaneCrawler(dispensary, options, selectors, janeConfig);
}

// ============================================================
// STANDALONE FUNCTIONS
// ============================================================

export async function crawlProducts(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {}
): Promise<CrawlResult> {
  const crawler = createCrawler(dispensary, options);
  return crawler.crawlProducts();
}

export async function detectStructure(
  page: any,
  dispensary?: Dispensary
): Promise<StructureDetectionResult> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.detectStructure(page);
}

export async function extractProducts(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedProduct[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractProducts(document);
}

export async function extractImages(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedImage[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractImages(document);
}

export async function extractStock(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedStock[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractStock(document);
}

export async function extractPagination(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedPagination> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractPagination(document);
}
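// Sketch of the Algolia response shape that the page-based extractPagination
// above expects; the numbers are hypothetical.
//
// const exampleAlgoliaResponse = {
//   hits: [],     // product hits omitted
//   page: 0,      // zero-based current page
//   nbPages: 5,
//   nbHits: 240,
// };
// await extractPagination(exampleAlgoliaResponse)
//   -> { hasNextPage: true, currentPage: 0, totalPages: 5, totalProducts: 240 }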
@@ -1,212 +0,0 @@
/**
 * Base Treez Crawler Template (PLACEHOLDER)
 *
 * This is the base template for all Treez store crawlers.
 * Per-store crawlers extend this by overriding specific methods.
 *
 * TODO: Implement Treez-specific crawling logic
 */

import { Dispensary } from '../../dutchie-az/types';
import {
  StoreCrawlOptions,
  CrawlResult,
  StructureDetectionResult,
  ExtractedProduct,
  ExtractedImage,
  ExtractedStock,
  ExtractedPagination,
} from './base-dutchie';

// Re-export types
export {
  StoreCrawlOptions,
  CrawlResult,
  StructureDetectionResult,
  ExtractedProduct,
  ExtractedImage,
  ExtractedStock,
  ExtractedPagination,
};

// ============================================================
// TREEZ-SPECIFIC TYPES
// ============================================================

export interface TreezSelectors {
  productContainer?: string;
  productName?: string;
  productPrice?: string;
  productImage?: string;
  productCategory?: string;
  productBrand?: string;
  addToCart?: string;
  pagination?: string;
}

export const DEFAULT_TREEZ_SELECTORS: TreezSelectors = {
  productContainer: '.product-tile, [class*="ProductCard"]',
  productName: '.product-name, [class*="ProductName"]',
  productPrice: '.product-price, [class*="ProductPrice"]',
  productImage: '.product-image img',
  productCategory: '.product-category',
  productBrand: '.product-brand',
  addToCart: '.add-to-cart-btn',
  pagination: '.pagination',
};

// ============================================================
// BASE TREEZ CRAWLER CLASS
// ============================================================

export class BaseTreezCrawler {
  protected dispensary: Dispensary;
  protected options: StoreCrawlOptions;
  protected selectors: TreezSelectors;

  constructor(
    dispensary: Dispensary,
    options: StoreCrawlOptions = {},
    selectors: TreezSelectors = {}
  ) {
    this.dispensary = dispensary;
    this.options = {
      pricingType: 'rec',
      useBothModes: false,
      downloadImages: true,
      trackStock: true,
      timeoutMs: 30000,
      ...options,
    };
    this.selectors = { ...DEFAULT_TREEZ_SELECTORS, ...selectors };
  }

  /**
   * Main entry point - crawl products for this dispensary.
   * TODO: Implement Treez-specific crawling.
   */
  async crawlProducts(): Promise<CrawlResult> {
    const startTime = Date.now();
    console.warn(`[BaseTreezCrawler] Treez crawling not yet implemented for ${this.dispensary.name}`);
    return {
      success: false,
      dispensaryId: this.dispensary.id || 0,
      productsFound: 0,
      productsFetched: 0,
      productsUpserted: 0,
      snapshotsCreated: 0,
      imagesDownloaded: 0,
      errorMessage: 'Treez crawler not yet implemented',
      durationMs: Date.now() - startTime,
    };
  }

  /**
   * Detect page structure for sandbox discovery mode
   */
  async detectStructure(page: any): Promise<StructureDetectionResult> {
    return {
      success: false,
      menuType: 'unknown',
      selectors: {},
      pagination: { type: 'none' },
      errors: ['Treez structure detection not yet implemented'],
      metadata: {},
    };
  }

  /**
   * Extract products from page/document
   */
  async extractProducts(document: any): Promise<ExtractedProduct[]> {
    console.warn('[BaseTreezCrawler] extractProducts not yet implemented');
    return [];
  }

  /**
   * Extract images from document
   */
  async extractImages(document: any): Promise<ExtractedImage[]> {
    console.warn('[BaseTreezCrawler] extractImages not yet implemented');
    return [];
  }

  /**
   * Extract stock information from document
   */
  async extractStock(document: any): Promise<ExtractedStock[]> {
    console.warn('[BaseTreezCrawler] extractStock not yet implemented');
    return [];
  }

  /**
   * Extract pagination information from document
   */
  async extractPagination(document: any): Promise<ExtractedPagination> {
    return { hasNextPage: false };
  }
}

// ============================================================
// FACTORY FUNCTION
// ============================================================

export function createCrawler(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {},
  selectors: TreezSelectors = {}
): BaseTreezCrawler {
  return new BaseTreezCrawler(dispensary, options, selectors);
}

// ============================================================
// STANDALONE FUNCTIONS
// ============================================================

export async function crawlProducts(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {}
): Promise<CrawlResult> {
  const crawler = createCrawler(dispensary, options);
  return crawler.crawlProducts();
}

export async function detectStructure(
  page: any,
  dispensary?: Dispensary
): Promise<StructureDetectionResult> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.detectStructure(page);
}

export async function extractProducts(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedProduct[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractProducts(document);
}

export async function extractImages(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedImage[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractImages(document);
}

export async function extractStock(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedStock[]> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractStock(document);
}

export async function extractPagination(
  document: any,
  dispensary?: Dispensary
): Promise<ExtractedPagination> {
  const crawler = createCrawler(dispensary || ({} as Dispensary));
  return crawler.extractPagination(document);
}
@@ -1,27 +0,0 @@
/**
 * Base Crawler Templates Index
 *
 * Exports all base crawler templates for easy importing.
 */

// Dutchie base (primary implementation)
export * from './base-dutchie';

// Treez base (placeholder)
export * as Treez from './base-treez';

// Jane base (placeholder)
export * as Jane from './base-jane';

// Re-export common types from dutchie for convenience
export type {
  StoreCrawlOptions,
  CrawlResult,
  StructureDetectionResult,
  ExtractedProduct,
  ExtractedImage,
  ExtractedStock,
  ExtractedPagination,
  DutchieCrawlerHooks,
  DutchieSelectors,
} from './base-dutchie';
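// Sketch of consuming this barrel (the './base' import path assumes the
// caller sits next to this directory):
//
// import { crawlProducts, Jane, Treez } from './base';
// // Dutchie logic is the default export surface: crawlProducts(dispensary)
// // Placeholders are namespaced: Jane.crawlProducts(...), Treez.crawlProducts(...)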
@@ -1,9 +0,0 @@
/**
 * Base Dutchie Crawler Template (Re-export for backward compatibility)
 *
 * DEPRECATED: Import from '../base/base-dutchie' instead.
 * This file re-exports everything from the new location for existing code.
 */

// Re-export everything from the new base location
export * from '../base/base-dutchie';
@@ -1,118 +0,0 @@
/**
 * Trulieve Scottsdale - Per-Store Dutchie Crawler
 *
 * Store ID: 101
 * Profile Key: trulieve-scottsdale
 * Platform Dispensary ID: 5eaf489fa8a61801212577cc
 *
 * Phase 1: Identity implementation - no overrides, just uses base Dutchie logic.
 * Future: Add store-specific selectors, timing, or custom logic as needed.
 */

import {
  BaseDutchieCrawler,
  StoreCrawlOptions,
  CrawlResult,
  DutchieSelectors,
  crawlProducts as baseCrawlProducts,
} from '../../base/base-dutchie';
import { Dispensary } from '../../../dutchie-az/types';

// Re-export CrawlResult for the orchestrator
export { CrawlResult };

// ============================================================
// STORE CONFIGURATION
// ============================================================

/**
 * Store-specific configuration.
 * These can be used to customize crawler behavior for this store.
 */
export const STORE_CONFIG = {
  storeId: 101,
  profileKey: 'trulieve-scottsdale',
  name: 'Trulieve of Scottsdale Dispensary',
  platformDispensaryId: '5eaf489fa8a61801212577cc',

  // Store-specific overrides (none for Phase 1)
  customOptions: {
    // Example future overrides:
    // pricingType: 'rec',
    // useBothModes: true,
    // customHeaders: {},
    // maxRetries: 3,
  },
};

// ============================================================
// STORE CRAWLER CLASS
// ============================================================

/**
 * TrulieveScottsdaleCrawler - Per-store crawler for Trulieve Scottsdale
 *
 * Phase 1: Identity implementation - extends BaseDutchieCrawler with no overrides.
 * Future phases can override methods like:
 * - getCName() for custom slug handling
 * - crawlProducts() for completely custom logic
 * - Add hooks for pre/post processing
 */
export class TrulieveScottsdaleCrawler extends BaseDutchieCrawler {
  constructor(dispensary: Dispensary, options: StoreCrawlOptions = {}) {
    // Merge store-specific options with provided options
    const mergedOptions: StoreCrawlOptions = {
      ...STORE_CONFIG.customOptions,
      ...options,
    };

    super(dispensary, mergedOptions);
  }

  // Phase 1: No overrides - use base implementation
  // Future phases can add overrides here:
  //
  // async crawlProducts(): Promise<CrawlResult> {
  //   // Custom pre-processing
  //   // ...
  //   const result = await super.crawlProducts();
  //   // Custom post-processing
  //   // ...
  //   return result;
  // }
}

// ============================================================
// EXPORTED CRAWL FUNCTION
// ============================================================

/**
 * Main entry point for the orchestrator.
 *
 * The orchestrator calls: mod.crawlProducts(dispensary, options)
 * This function creates a TrulieveScottsdaleCrawler and runs it.
 */
export async function crawlProducts(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {}
): Promise<CrawlResult> {
  console.log(`[TrulieveScottsdale] Using per-store crawler for ${dispensary.name}`);

  const crawler = new TrulieveScottsdaleCrawler(dispensary, options);
  return crawler.crawlProducts();
}

// ============================================================
// FACTORY FUNCTION (alternative API)
// ============================================================

/**
 * Create a crawler instance without running it.
 * Useful for testing or when you need to configure before running.
 */
export function createCrawler(
  dispensary: Dispensary,
  options: StoreCrawlOptions = {}
): TrulieveScottsdaleCrawler {
  return new TrulieveScottsdaleCrawler(dispensary, options);
}
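// Sketch of how an orchestrator might resolve this per-store module by its
// profile key before calling mod.crawlProducts(dispensary, options). The
// './stores/...' path convention is an assumption, not confirmed by the diff.
//
// async function loadStoreCrawler(profileKey: string) {
//   try {
//     return await import(`./stores/${profileKey}`); // hypothetical layout
//   } catch {
//     return await import('../base/base-dutchie');   // shared fallback
//   }
// }
// const mod = await loadStoreCrawler('trulieve-scottsdale');
// const result = await mod.crawlProducts(dispensary, {});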
141 backend/src/db/auto-migrate.ts Normal file
@@ -0,0 +1,141 @@
/**
 * Auto-Migration System
 *
 * Runs SQL migration files from the migrations/ folder automatically on server startup.
 * Uses a schema_migrations table to track which migrations have been applied.
 *
 * Safe to run multiple times - only applies new migrations.
 */

import { Pool } from 'pg';
import fs from 'fs';
import path from 'path';

const MIGRATIONS_DIR = path.join(__dirname, '../../migrations');

/**
 * Ensure schema_migrations table exists
 */
async function ensureMigrationsTable(pool: Pool): Promise<void> {
  await pool.query(`
    CREATE TABLE IF NOT EXISTS schema_migrations (
      id SERIAL PRIMARY KEY,
      name VARCHAR(255) UNIQUE NOT NULL,
      applied_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
    )
  `);
}

/**
 * Get list of already-applied migrations
 */
async function getAppliedMigrations(pool: Pool): Promise<Set<string>> {
  const result = await pool.query('SELECT name FROM schema_migrations');
  return new Set(result.rows.map(row => row.name));
}

/**
 * Get list of migration files from disk
 */
function getMigrationFiles(): string[] {
  if (!fs.existsSync(MIGRATIONS_DIR)) {
    console.log('[AutoMigrate] No migrations directory found');
    return [];
  }

  return fs.readdirSync(MIGRATIONS_DIR)
    .filter(f => f.endsWith('.sql'))
    .sort(); // Sort alphabetically (001_, 002_, etc.)
}

/**
 * Run a single migration file
 */
async function runMigration(pool: Pool, filename: string): Promise<void> {
  const filepath = path.join(MIGRATIONS_DIR, filename);
  const sql = fs.readFileSync(filepath, 'utf8');

  const client = await pool.connect();
  try {
    await client.query('BEGIN');

    // Run the migration SQL
    await client.query(sql);

    // Record that this migration was applied
    await client.query(
      'INSERT INTO schema_migrations (name) VALUES ($1) ON CONFLICT (name) DO NOTHING',
      [filename]
    );

    await client.query('COMMIT');
    console.log(`[AutoMigrate] ✓ Applied: ${filename}`);
  } catch (error: any) {
    await client.query('ROLLBACK');
    console.error(`[AutoMigrate] ✗ Failed: ${filename}`);
    throw error;
  } finally {
    client.release();
  }
}

/**
 * Run all pending migrations
 *
 * @param pool - Database connection pool
 * @returns Number of migrations applied, or -1 if a migration failed
 */
export async function runAutoMigrations(pool: Pool): Promise<number> {
  console.log('[AutoMigrate] Checking for pending migrations...');

  try {
    // Ensure migrations table exists
    await ensureMigrationsTable(pool);

    // Get applied and available migrations
    const applied = await getAppliedMigrations(pool);
    const available = getMigrationFiles();

    // Find pending migrations
    const pending = available.filter(f => !applied.has(f));

    if (pending.length === 0) {
      console.log('[AutoMigrate] No pending migrations');
      return 0;
    }

    console.log(`[AutoMigrate] Found ${pending.length} pending migrations`);

    // Run each pending migration in order
    for (const filename of pending) {
      await runMigration(pool, filename);
    }

    console.log(`[AutoMigrate] Successfully applied ${pending.length} migrations`);
    return pending.length;

  } catch (error: any) {
    console.error('[AutoMigrate] Migration failed:', error.message);
    // Don't crash the server - log and continue.
    // The specific failing migration will have been rolled back.
    return -1;
  }
}

/**
 * Check migration status without running anything
 */
export async function checkMigrationStatus(pool: Pool): Promise<{
  applied: string[];
  pending: string[];
}> {
  await ensureMigrationsTable(pool);

  const applied = await getAppliedMigrations(pool);
  const available = getMigrationFiles();

  return {
    applied: available.filter(f => applied.has(f)),
    pending: available.filter(f => !applied.has(f)),
  };
}
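// Sketch of invoking this during server startup (the import path is an
// assumption):
//
// import { Pool } from 'pg';
// import { runAutoMigrations, checkMigrationStatus } from './db/auto-migrate';
//
// async function bootWithMigrations(pool: Pool): Promise<void> {
//   const applied = await runAutoMigrations(pool);
//   if (applied === -1) {
//     // Failure is logged but non-fatal by design; surface what's pending.
//     const { pending } = await checkMigrationStatus(pool);
//     console.warn(`Starting with ${pending.length} unapplied migrations`);
//   }
// }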
@@ -372,6 +372,51 @@ async function runMigrations() {
      ON CONFLICT (key) DO NOTHING;
    `);

    // SEO Pages table
    await client.query(`
      CREATE TABLE IF NOT EXISTS seo_pages (
        id SERIAL PRIMARY KEY,
        type VARCHAR(50) NOT NULL,
        slug VARCHAR(255) NOT NULL UNIQUE,
        page_key VARCHAR(255) NOT NULL,
        primary_keyword VARCHAR(255),
        status VARCHAR(50) DEFAULT 'pending_generation',
        data_source VARCHAR(100),
        meta_title VARCHAR(255),
        meta_description TEXT,
        last_generated_at TIMESTAMPTZ,
        last_reviewed_at TIMESTAMPTZ,
        created_at TIMESTAMPTZ DEFAULT NOW(),
        updated_at TIMESTAMPTZ DEFAULT NOW()
      );
      CREATE INDEX IF NOT EXISTS idx_seo_pages_type ON seo_pages(type);
      CREATE INDEX IF NOT EXISTS idx_seo_pages_status ON seo_pages(status);
      CREATE INDEX IF NOT EXISTS idx_seo_pages_slug ON seo_pages(slug);
    `);

    // SEO Page Contents table
    await client.query(`
      CREATE TABLE IF NOT EXISTS seo_page_contents (
        id SERIAL PRIMARY KEY,
        page_id INTEGER NOT NULL REFERENCES seo_pages(id) ON DELETE CASCADE,
        version INTEGER DEFAULT 1,
        blocks JSONB NOT NULL DEFAULT '[]',
        meta JSONB NOT NULL DEFAULT '{}',
        meta_title VARCHAR(255),
        meta_description TEXT,
        h1 VARCHAR(255),
        canonical_url TEXT,
        og_title VARCHAR(255),
        og_description TEXT,
        og_image_url TEXT,
        generated_by VARCHAR(50) DEFAULT 'claude',
        created_at TIMESTAMPTZ DEFAULT NOW(),
        updated_at TIMESTAMPTZ DEFAULT NOW(),
        UNIQUE(page_id, version)
      );
      CREATE INDEX IF NOT EXISTS idx_seo_page_contents_page ON seo_page_contents(page_id);
    `);

    await client.query('COMMIT');
    console.log('✅ Migrations completed successfully');
  } catch (error) {
@@ -77,7 +77,9 @@ export function getPool(): Pool {
  * This is a getter that lazily initializes on first access.
  */
 export const pool = {
-  query: (...args: Parameters<Pool['query']>) => getPool().query(...args),
+  query: (queryTextOrConfig: string | import('pg').QueryConfig, values?: any[]): Promise<import('pg').QueryResult<any>> => {
+    return getPool().query(queryTextOrConfig as any, values);
+  },
   connect: () => getPool().connect(),
   end: () => getPool().end(),
   on: (event: 'error' | 'connect' | 'acquire' | 'remove' | 'release', listener: (...args: any[]) => void) => getPool().on(event as any, listener),
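// Callers keep the familiar pg surface while the underlying Pool is created
// lazily on first use (sketch):
//
// async function countDispensaries(): Promise<number> {
//   const { rows } = await pool.query('SELECT COUNT(*)::int AS n FROM dispensaries');
//   return rows[0].n;
// }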
200 backend/src/db/run-migrations.ts Normal file
@@ -0,0 +1,200 @@
#!/usr/bin/env npx tsx
/**
 * Database Migration Runner
 *
 * Runs SQL migrations from backend/migrations/*.sql in order.
 * Tracks applied migrations in schema_migrations table.
 *
 * Usage:
 *   npx tsx src/db/run-migrations.ts
 *
 * Environment:
 *   DATABASE_URL or CANNAIQ_DB_* variables
 */

import { Pool } from 'pg';
import * as fs from 'fs/promises';
import * as path from 'path';
import dotenv from 'dotenv';

dotenv.config();

function getConnectionString(): string {
  if (process.env.DATABASE_URL) {
    return process.env.DATABASE_URL;
  }
  if (process.env.CANNAIQ_DB_URL) {
    return process.env.CANNAIQ_DB_URL;
  }

  const host = process.env.CANNAIQ_DB_HOST || 'localhost';
  const port = process.env.CANNAIQ_DB_PORT || '54320';
  const name = process.env.CANNAIQ_DB_NAME || 'dutchie_menus';
  const user = process.env.CANNAIQ_DB_USER || 'dutchie';
  const pass = process.env.CANNAIQ_DB_PASS || 'dutchie_local_pass';

  return `postgresql://${user}:${pass}@${host}:${port}/${name}`;
}

interface MigrationFile {
  filename: string;
  number: number;
  path: string;
}

async function getMigrationFiles(migrationsDir: string): Promise<MigrationFile[]> {
  const files = await fs.readdir(migrationsDir);

  const migrations: MigrationFile[] = files
    .filter(f => f.endsWith('.sql'))
    .map(filename => {
      // Extract number from filename like "005_api_tokens.sql" or "073_proxy_timezone.sql"
      const match = filename.match(/^(\d+)_/);
      if (!match) return null;

      return {
        filename,
        number: parseInt(match[1], 10),
        path: path.join(migrationsDir, filename),
      };
    })
    .filter((m): m is MigrationFile => m !== null)
    .sort((a, b) => a.number - b.number);

  return migrations;
}

async function ensureMigrationsTable(pool: Pool): Promise<void> {
  // Migrate to filename-based tracking (handles duplicate version numbers).
  // Check if old version-based PK exists.
  const pkCheck = await pool.query(`
    SELECT constraint_name FROM information_schema.table_constraints
    WHERE table_name = 'schema_migrations' AND constraint_type = 'PRIMARY KEY'
  `);

  if (pkCheck.rows.length === 0) {
    // Table doesn't exist, create with filename as PK
    await pool.query(`
      CREATE TABLE IF NOT EXISTS schema_migrations (
        filename VARCHAR(255) NOT NULL PRIMARY KEY,
        version VARCHAR(10),
        name VARCHAR(255),
        applied_at TIMESTAMPTZ DEFAULT NOW()
      )
    `);
  } else {
    // Table exists - add filename column if missing
    await pool.query(`
      ALTER TABLE schema_migrations ADD COLUMN IF NOT EXISTS filename VARCHAR(255)
    `);
    // Populate filename from version+name for existing rows
    await pool.query(`
      UPDATE schema_migrations SET filename = version || '_' || name || '.sql'
      WHERE filename IS NULL
    `);
  }
}

async function getAppliedMigrations(pool: Pool): Promise<Set<string>> {
  // Try filename first, fall back to version_name combo
  const result = await pool.query(`
    SELECT COALESCE(filename, version || '_' || name || '.sql') as filename
    FROM schema_migrations
  `);
  return new Set(result.rows.map(r => r.filename));
}

async function applyMigration(pool: Pool, migration: MigrationFile): Promise<void> {
  const sql = await fs.readFile(migration.path, 'utf-8');

  // Extract version and name from filename like "005_api_tokens.sql"
  const version = String(migration.number).padStart(3, '0');
  const name = migration.filename.replace(/^\d+_/, '').replace(/\.sql$/, '');

  const client = await pool.connect();
  try {
    await client.query('BEGIN');

    // Run the migration SQL
    await client.query(sql);

    // Record that it was applied - use INSERT with ON CONFLICT for safety
    await client.query(`
      INSERT INTO schema_migrations (filename, version, name)
      VALUES ($1, $2, $3)
      ON CONFLICT DO NOTHING
    `, [migration.filename, version, name]);

    await client.query('COMMIT');
  } catch (error) {
    await client.query('ROLLBACK');
    throw error;
  } finally {
    client.release();
  }
}

async function main() {
  const pool = new Pool({ connectionString: getConnectionString() });

  // Migrations directory relative to this file
  const migrationsDir = path.resolve(__dirname, '../../migrations');

  console.log('╔════════════════════════════════════════════════════════════╗');
  console.log('║                 DATABASE MIGRATION RUNNER                  ║');
  console.log('╚════════════════════════════════════════════════════════════╝');
  console.log(`Migrations dir: ${migrationsDir}`);
  console.log('');

  try {
    // Ensure tracking table exists
    await ensureMigrationsTable(pool);

    // Get all migration files
    const allMigrations = await getMigrationFiles(migrationsDir);
    console.log(`Found ${allMigrations.length} migration files`);

    // Get already-applied migrations
    const applied = await getAppliedMigrations(pool);
    console.log(`Already applied: ${applied.size} migrations`);
    console.log('');

    // Find pending migrations (compare by filename)
    const pending = allMigrations.filter(m => !applied.has(m.filename));

    if (pending.length === 0) {
      console.log('✅ No pending migrations. Database is up to date.');
      // The finally block below closes the pool.
      return;
    }

    console.log(`Pending migrations: ${pending.length}`);
    console.log('─'.repeat(60));

    // Apply each pending migration
    for (const migration of pending) {
      process.stdout.write(`  ${migration.filename}... `);
      try {
        await applyMigration(pool, migration);
        console.log('✅');
      } catch (error: any) {
        console.log('❌');
        console.error(`\nError applying ${migration.filename}:`);
        console.error(error.message);
        process.exit(1);
      }
    }

    console.log('');
    console.log('═'.repeat(60));
    console.log(`✅ Applied ${pending.length} migrations successfully`);

  } catch (error: any) {
    console.error('Migration runner failed:', error.message);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();
@@ -3,14 +3,23 @@
  *
  * Main orchestrator for the Dutchie store discovery pipeline.
  *
- * Flow:
- * 1. Discover cities from Dutchie (or use seeded cities)
- * 2. For each city, discover store locations
- * 3. Upsert all data to discovery tables
- * 4. Admin verifies locations manually
- * 5. Verified locations are promoted to canonical dispensaries
+ * AUTOMATED FLOW (as of 2025-01):
+ * 1. Fetch cities dynamically from Dutchie GraphQL (getAllCitiesByState)
+ * 2. For each city, discover store locations via ConsumerDispensaries query
+ * 3. Upsert locations to dutchie_discovery_locations (keyed by platform_location_id)
+ * 4. AUTO-VALIDATE: Check required fields (name, city, state, platform_menu_url, platform_location_id)
+ * 5. AUTO-PROMOTE: Valid locations are upserted to dispensaries table with crawl_enabled=true
+ * 6. All actions logged to dutchie_promotion_log for audit
  *
- * This module does NOT create canonical dispensaries automatically.
+ * Tables involved:
+ * - dutchie_discovery_cities: Known cities for each state
+ * - dutchie_discovery_locations: Raw discovered store data
+ * - dispensaries: Canonical store records (promoted from discovery)
+ * - dutchie_promotion_log: Audit trail for validation/promotion
+ *
+ * Usage:
+ *   npx tsx src/scripts/run-discovery.ts discover:state AZ
+ *   npx tsx src/scripts/run-discovery.ts discover:state CA
  */
 
 import { Pool } from 'pg';
@@ -24,11 +33,12 @@ import {
   getCitiesToCrawl,
   getCityBySlug,
   seedKnownCities,
   ARIZONA_CITIES,
 } from './city-discovery';
 import {
   discoverLocationsForCity,
   getCitiesForState,
 } from './location-discovery';
+import { promoteDiscoveredLocations } from './promotion';
 
 // ============================================================
 // FULL DISCOVERY
@@ -162,6 +172,42 @@ export async function runFullDiscovery(
    console.log(`Errors: ${totalErrors}`);
  }

  // Step 4: Auto-validate and promote discovered locations
  if (!dryRun && totalLocationsUpserted > 0) {
    console.log('\n[Discovery] Step 4: Auto-promoting discovered locations...');
    const promotionResult = await promoteDiscoveredLocations(stateCode, false);
    console.log(`[Discovery] Promotion complete:`);
    console.log(`  Created: ${promotionResult.created} new dispensaries`);
    console.log(`  Updated: ${promotionResult.updated} existing dispensaries`);
    console.log(`  Rejected: ${promotionResult.rejected} (validation failed)`);
    if (promotionResult.rejectedRecords.length > 0) {
      console.log(`  Rejection reasons:`);
      promotionResult.rejectedRecords.slice(0, 5).forEach(r => {
        console.log(`    - ${r.name}: ${r.errors.join(', ')}`);
      });
      if (promotionResult.rejectedRecords.length > 5) {
        console.log(`    ... and ${promotionResult.rejectedRecords.length - 5} more`);
      }
    }
  }

  // Step 5: Detect dropped stores (in DB but not in discovery results)
  if (!dryRun) {
    console.log('\n[Discovery] Step 5: Detecting dropped stores...');
    const droppedResult = await detectDroppedStores(pool, stateCode);
    if (droppedResult.droppedCount > 0) {
      console.log(`[Discovery] Found ${droppedResult.droppedCount} dropped stores:`);
      droppedResult.droppedStores.slice(0, 10).forEach(s => {
        console.log(`  - ${s.name} (${s.city}, ${s.state}) - last seen: ${s.lastSeenAt}`);
      });
      if (droppedResult.droppedCount > 10) {
        console.log(`  ... and ${droppedResult.droppedCount - 10} more`);
      }
    } else {
      console.log(`[Discovery] No dropped stores detected`);
    }
  }

  return {
    cities: cityResult,
    locations: locationResults,
@@ -171,6 +217,107 @@ export async function runFullDiscovery(
|
||||
};
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// DROPPED STORE DETECTION
|
||||
// ============================================================
|
||||
|
||||
export interface DroppedStoreResult {
|
||||
droppedCount: number;
|
||||
droppedStores: Array<{
|
||||
id: number;
|
||||
name: string;
|
||||
city: string;
|
||||
state: string;
|
||||
platformDispensaryId: string;
|
||||
lastSeenAt: string;
|
||||
}>;
|
||||
}
|
||||
|
||||
/**
|
||||
* Detect stores that exist in dispensaries but were not found in discovery.
|
||||
* Marks them as status='dropped' for manual review.
|
||||
*
|
||||
* A store is considered "dropped" if:
|
||||
* 1. It has a platform_dispensary_id (was verified via Dutchie)
|
||||
* 2. It was NOT seen in the latest discovery crawl (last_seen_at in discovery < 24h ago)
|
||||
* 3. It's currently marked as 'open' status
|
||||
*/
|
||||
export async function detectDroppedStores(
|
||||
pool: Pool,
|
||||
stateCode?: string
|
||||
): Promise<DroppedStoreResult> {
|
||||
// Find dispensaries that:
|
||||
// 1. Have platform_dispensary_id (verified Dutchie stores)
|
||||
// 2. Are currently 'open' status
|
||||
// 3. Have a linked discovery record that wasn't seen in the last discovery run
|
||||
// (last_seen_at in dutchie_discovery_locations is older than 24 hours)
|
||||
const params: any[] = [];
|
||||
let stateFilter = '';
|
||||
|
||||
if (stateCode) {
|
||||
stateFilter = ` AND d.state = $1`;
|
||||
params.push(stateCode);
|
||||
}
|
||||
|
||||
const query = `
|
||||
WITH recently_seen AS (
|
||||
SELECT DISTINCT platform_location_id
|
||||
FROM dutchie_discovery_locations
|
||||
WHERE last_seen_at > NOW() - INTERVAL '24 hours'
|
||||
AND active = true
|
||||
)
|
||||
SELECT
|
||||
d.id,
|
||||
d.name,
|
||||
d.city,
|
||||
d.state,
|
||||
d.platform_dispensary_id,
|
||||
d.updated_at as last_seen_at
|
||||
FROM dispensaries d
|
||||
WHERE d.platform_dispensary_id IS NOT NULL
|
||||
AND d.platform = 'dutchie'
|
||||
AND (d.status = 'open' OR d.status IS NULL)
|
||||
AND d.crawl_enabled = true
|
||||
AND d.platform_dispensary_id NOT IN (SELECT platform_location_id FROM recently_seen)
|
||||
${stateFilter}
|
||||
ORDER BY d.name
|
||||
`;
|
||||
|
||||
const result = await pool.query(query, params);
|
||||
const droppedStores = result.rows;
|
||||
|
||||
// Mark these stores as 'dropped' status
|
||||
if (droppedStores.length > 0) {
|
||||
const ids = droppedStores.map(s => s.id);
|
||||
await pool.query(`
|
||||
UPDATE dispensaries
|
||||
SET status = 'dropped', updated_at = NOW()
|
||||
WHERE id = ANY($1::int[])
|
||||
`, [ids]);
|
||||
|
||||
// Log to promotion log for audit
|
||||
for (const store of droppedStores) {
|
||||
await pool.query(`
|
||||
INSERT INTO dutchie_promotion_log
|
||||
(dispensary_id, action, state_code, store_name, triggered_by)
|
||||
VALUES ($1, 'dropped', $2, $3, 'discovery_detection')
|
||||
`, [store.id, store.state, store.name]);
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
droppedCount: droppedStores.length,
|
||||
droppedStores: droppedStores.map(s => ({
|
||||
id: s.id,
|
||||
name: s.name,
|
||||
city: s.city,
|
||||
state: s.state,
|
||||
platformDispensaryId: s.platform_dispensary_id,
|
||||
lastSeenAt: s.last_seen_at,
|
||||
})),
|
||||
};
|
||||
}
|
||||
|
||||
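// Illustrative usage (not part of this commit): running dropped-store detection
// standalone against one state. Assumes a configured pg Pool; the connection
// string below is a placeholder.
//
//   import { Pool } from 'pg';
//
//   const pool = new Pool({ connectionString: process.env.DATABASE_URL });
//   const dropped = await detectDroppedStores(pool, 'AZ');
//   console.log(`${dropped.droppedCount} stores marked as dropped`);
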
// ============================================================
// SINGLE CITY DISCOVERY
// ============================================================
@@ -235,11 +382,19 @@ export async function discoverState(

  console.log(`[Discovery] Discovering state: ${stateCode}`);

  // Seed known cities for this state
  if (stateCode === 'AZ') {
    console.log('[Discovery] Seeding Arizona cities...');
    const seeded = await seedKnownCities(pool, ARIZONA_CITIES);
    console.log(`[Discovery] Seeded ${seeded.created} new cities, ${seeded.updated} updated`);
  // Dynamically fetch and seed cities for this state
  console.log(`[Discovery] Fetching cities for ${stateCode} from Dutchie...`);
  const cityNames = await getCitiesForState(stateCode);
  if (cityNames.length > 0) {
    const cities = cityNames.map(name => ({
      name,
      slug: name.toLowerCase().replace(/\s+/g, '-').replace(/[^a-z0-9-]/g, ''),
      stateCode,
    }));
    const seeded = await seedKnownCities(pool, cities);
    console.log(`[Discovery] Seeded ${seeded.created} new cities, ${seeded.updated} updated for ${stateCode}`);
  } else {
    console.log(`[Discovery] No cities found for ${stateCode}`);
  }

  // Run full discovery for this state

@@ -13,7 +13,6 @@ export {
  getCitiesToCrawl,
  getCityBySlug,
  seedKnownCities,
  ARIZONA_CITIES,
} from './city-discovery';

// Location Discovery
@@ -33,5 +32,17 @@ export {
  DiscoveryStats,
} from './discovery-crawler';

// Promotion
export {
  validateForPromotion,
  validateDiscoveredLocations,
  promoteDiscoveredLocations,
  promoteSingleLocation,
  ValidationResult,
  ValidationSummary,
  PromotionResult,
  PromotionSummary,
} from './promotion';

// Routes
export { createDiscoveryRoutes } from './routes';

@@ -26,13 +26,346 @@ import {
  mapLocationRowToLocation,
} from './types';
import { DiscoveryCity } from './types';
import {
  executeGraphQL,
  fetchPage,
  extractNextData,
  GRAPHQL_HASHES,
  setProxy,
} from '../platforms/dutchie/client';
import { getStateProxy, getRandomProxy } from '../utils/proxyManager';

puppeteer.use(StealthPlugin());

// ============================================================
// PROXY INITIALIZATION
// ============================================================
// Call initDiscoveryProxy() before any discovery operations to
// set up a proxy when the USE_PROXY=true environment variable is set.
// This is opt-in and does NOT break existing behavior.
// ============================================================

let proxyInitialized = false;

/**
 * Initialize proxy for discovery operations.
 * Only runs if USE_PROXY=true is set in the environment.
 * Safe to call multiple times - only initializes once.
 *
 * @param stateCode - Optional state code for a state-specific proxy (e.g., 'AZ', 'CA')
 * @returns true if the proxy was set, false if skipped or failed
 */
export async function initDiscoveryProxy(stateCode?: string): Promise<boolean> {
  // Skip if already initialized
  if (proxyInitialized) {
    return true;
  }

  // Skip if USE_PROXY is not enabled
  if (process.env.USE_PROXY !== 'true') {
    console.log('[LocationDiscovery] Proxy disabled (USE_PROXY != true)');
    return false;
  }

  try {
    // Get proxy - prefer state-specific if a state code is provided
    const proxyConfig = stateCode
      ? await getStateProxy(stateCode)
      : await getRandomProxy();

    if (!proxyConfig) {
      console.warn('[LocationDiscovery] No proxy available, proceeding without proxy');
      return false;
    }

    // Build proxy URL with auth if needed
    let proxyUrl = proxyConfig.server;
    if (proxyConfig.username && proxyConfig.password) {
      const url = new URL(proxyConfig.server);
      url.username = proxyConfig.username;
      url.password = proxyConfig.password;
      proxyUrl = url.toString();
    }

    // Set proxy on the Dutchie client
    setProxy(proxyUrl);
    proxyInitialized = true;

    console.log(`[LocationDiscovery] Proxy initialized for ${stateCode || 'general'} discovery`);
    return true;
  } catch (error: any) {
    console.error(`[LocationDiscovery] Failed to initialize proxy: ${error.message}`);
    return false;
  }
}

/**
 * Reset proxy initialization flag (for testing or re-initialization)
 */
export function resetProxyInit(): void {
  proxyInitialized = false;
  setProxy(null);
}

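// Illustrative usage (not part of this commit): opting in to proxied discovery.
// Assumes USE_PROXY and proxy credentials are configured in the environment.
//
//   process.env.USE_PROXY = 'true';
//   const usingProxy = await initDiscoveryProxy('AZ'); // state-specific proxy preferred
//   if (!usingProxy) {
//     console.warn('Continuing without a proxy');
//   }
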
const PLATFORM = 'dutchie';

// ============================================================
// GRAPHQL / API FETCHING
// CITY-BASED DISCOVERY (CANONICAL SOURCE OF TRUTH)
// ============================================================
// GraphQL with city+state filter is the SOURCE OF TRUTH for database data.
//
// Method:
// 1. Get city list from statesWithDispensaries (in __NEXT_DATA__)
// 2. Query stores per city using city + state GraphQL filter
// 3. This gives us complete, accurate dispensary data
//
// Geo-coordinate queries (nearLat/nearLng) are ONLY for showing search
// results to users (e.g., "stores within 20 miles of me").
// They are NOT a source of truth for establishing database records.
// ============================================================

/**
 * State with dispensary cities from Dutchie's statesWithDispensaries data
 */
export interface StateWithCities {
  name: string;      // State code (e.g., "CA", "AZ")
  country: string;   // Country code (e.g., "US")
  cities: string[];  // Array of city names
}

/**
 * Fetch all states with their cities via direct GraphQL query
 *
 * Uses the getAllCitiesByState persisted query which returns all states
 * and cities where Dutchie has dispensaries.
 */
export async function fetchStatesWithDispensaries(
  options: { verbose?: boolean } = {}
): Promise<StateWithCities[]> {
  const { verbose = false } = options;

  // Initialize proxy if USE_PROXY=true
  await initDiscoveryProxy();

  console.log('[LocationDiscovery] Fetching statesWithDispensaries via GraphQL...');

  try {
    // Use direct GraphQL query - much cleaner than scraping __NEXT_DATA__
    const result = await executeGraphQL(
      'getAllCitiesByState',
      {}, // No variables needed
      GRAPHQL_HASHES.GetAllCitiesByState,
      { maxRetries: 3, retryOn403: true }
    );

    const statesData = result?.data?.statesWithDispensaries;
    if (!Array.isArray(statesData)) {
      console.error('[LocationDiscovery] statesWithDispensaries not found in response');
      return [];
    }

    // Map to our StateWithCities format
    const states: StateWithCities[] = [];
    for (const state of statesData) {
      if (state && state.name) {
        // Filter out null cities
        const cities = Array.isArray(state.cities)
          ? state.cities.filter((c: string | null) => c !== null)
          : [];

        states.push({
          name: state.name,
          country: state.country || 'US',
          cities,
        });
      }
    }

    if (verbose) {
      console.log(`[LocationDiscovery] Found ${states.length} states`);
      for (const state of states) {
        console.log(`  ${state.name}: ${state.cities.length} cities`);
      }
    }

    console.log(`[LocationDiscovery] Loaded ${states.length} states with cities`);
    return states;
  } catch (error: any) {
    console.error(`[LocationDiscovery] Failed to fetch states: ${error.message}`);
    return [];
  }
}

/**
 * Get cities for a specific state
 */
export async function getCitiesForState(
  stateCode: string,
  options: { verbose?: boolean } = {}
): Promise<string[]> {
  const states = await fetchStatesWithDispensaries(options);
  const state = states.find(s => s.name.toUpperCase() === stateCode.toUpperCase());

  if (!state) {
    console.warn(`[LocationDiscovery] No cities found for state: ${stateCode}`);
    return [];
  }

  console.log(`[LocationDiscovery] Found ${state.cities.length} cities for ${stateCode}`);
  return state.cities;
}

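// Illustrative usage (not part of this commit): resolving the city list for a
// state before a crawl. The sample output is an example, not a guaranteed value.
//
//   const cities = await getCitiesForState('AZ', { verbose: true });
//   // e.g. ['Phoenix', 'Tucson', 'Mesa', ...]
//   console.log(`Crawling ${cities.length} cities`);
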
/**
 * Fetch dispensaries for a specific city+state using GraphQL
 *
 * This is the CORRECT method for establishing database data:
 * Uses city + state filter, NOT geo-coordinates.
 */
export async function fetchDispensariesByCityState(
  city: string,
  stateCode: string,
  options: { verbose?: boolean; perPage?: number; maxPages?: number } = {}
): Promise<DutchieLocationResponse[]> {
  const { verbose = false, perPage = 200, maxPages = 10 } = options;

  // Initialize proxy if USE_PROXY=true (state-specific proxy preferred)
  await initDiscoveryProxy(stateCode);

  console.log(`[LocationDiscovery] Fetching dispensaries for ${city}, ${stateCode}...`);

  const allDispensaries: any[] = [];
  let page = 0;
  let hasMore = true;

  while (hasMore && page < maxPages) {
    const variables = {
      dispensaryFilter: {
        activeOnly: true,
        city: city,
        state: stateCode,
      },
      page,
      perPage,
    };

    try {
      const result = await executeGraphQL(
        'ConsumerDispensaries',
        variables,
        GRAPHQL_HASHES.ConsumerDispensaries,
        { cName: `${city.toLowerCase().replace(/\s+/g, '-')}-${stateCode.toLowerCase()}`, maxRetries: 2, retryOn403: true }
      );

      const dispensaries = result?.data?.filteredDispensaries || [];

      if (verbose) {
        console.log(`[LocationDiscovery] Page ${page}: ${dispensaries.length} dispensaries`);
      }

      if (dispensaries.length === 0) {
        hasMore = false;
      } else {
        // Filter to ensure we only get dispensaries in the correct state
        const stateFiltered = dispensaries.filter((d: any) =>
          d.location?.state?.toUpperCase() === stateCode.toUpperCase()
        );
        allDispensaries.push(...stateFiltered);

        if (dispensaries.length < perPage) {
          hasMore = false;
        } else {
          page++;
        }
      }
    } catch (error: any) {
      console.error(`[LocationDiscovery] Error fetching page ${page}: ${error.message}`);
      hasMore = false;
    }
  }

  // Dedupe by ID
  const uniqueMap = new Map<string, any>();
  for (const d of allDispensaries) {
    const id = d.id || d._id;
    if (id && !uniqueMap.has(id)) {
      uniqueMap.set(id, d);
    }
  }

  const unique = Array.from(uniqueMap.values());
  console.log(`[LocationDiscovery] Found ${unique.length} unique dispensaries in ${city}, ${stateCode}`);

  return unique.map(d => normalizeLocationResponse(d));
}

/**
 * Fetch ALL dispensaries for a state by querying each city
 *
 * This is the canonical method for establishing state data:
 * 1. Get city list from statesWithDispensaries
 * 2. Query each city using city+state filter
 * 3. Dedupe and return all dispensaries
 */
export async function fetchAllDispensariesForState(
  stateCode: string,
  options: { verbose?: boolean; progressCallback?: (city: string, count: number, total: number) => void } = {}
): Promise<{ dispensaries: DutchieLocationResponse[]; citiesQueried: number; citiesWithResults: number }> {
  const { verbose = false, progressCallback } = options;

  console.log(`[LocationDiscovery] Fetching all dispensaries for ${stateCode}...`);

  // Step 1: Get city list
  const cities = await getCitiesForState(stateCode, { verbose });
  if (cities.length === 0) {
    console.warn(`[LocationDiscovery] No cities found for ${stateCode}`);
    return { dispensaries: [], citiesQueried: 0, citiesWithResults: 0 };
  }

  console.log(`[LocationDiscovery] Will query ${cities.length} cities for ${stateCode}`);

  // Step 2: Query each city
  const allDispensaries = new Map<string, DutchieLocationResponse>();
  let citiesWithResults = 0;

  for (let i = 0; i < cities.length; i++) {
    const city = cities[i];

    if (progressCallback) {
      progressCallback(city, i + 1, cities.length);
    }

    try {
      const dispensaries = await fetchDispensariesByCityState(city, stateCode, { verbose });

      if (dispensaries.length > 0) {
        citiesWithResults++;
        for (const d of dispensaries) {
          const id = d.id || d.slug;
          if (id && !allDispensaries.has(id)) {
            allDispensaries.set(id, d);
          }
        }
      }

      // Small delay between cities to avoid rate limiting
      await new Promise(r => setTimeout(r, 300));
    } catch (error: any) {
      console.error(`[LocationDiscovery] Error querying ${city}: ${error.message}`);
    }
  }

  const result = Array.from(allDispensaries.values());
  console.log(`[LocationDiscovery] Total: ${result.length} unique dispensaries across ${citiesWithResults}/${cities.length} cities`);

  return {
    dispensaries: result,
    citiesQueried: cities.length,
    citiesWithResults,
  };
}

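// Illustrative usage (not part of this commit): a state-wide fetch with
// progress reporting. The callback signature matches the options type above.
//
//   const { dispensaries, citiesQueried, citiesWithResults } =
//     await fetchAllDispensariesForState('AZ', {
//       progressCallback: (city, n, total) =>
//         console.log(`[${n}/${total}] querying ${city}...`),
//     });
//   console.log(`${dispensaries.length} stores from ${citiesWithResults}/${citiesQueried} cities`);
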
// ============================================================
// GRAPHQL / API FETCHING (LEGACY - PUPPETEER-BASED)
// ============================================================

interface SessionCredentials {
@@ -91,57 +424,77 @@ async function closeSession(session: SessionCredentials): Promise<void> {
}

/**
 * Fetch locations for a city using Dutchie's internal search API.
 * Fetch locations for a city.
 *
 * PRIMARY METHOD: Uses city+state GraphQL filter (source of truth)
 * FALLBACK: Legacy Puppeteer-based methods for edge cases
 */
export async function fetchLocationsForCity(
  city: DiscoveryCity,
  options: {
    session?: SessionCredentials;
    verbose?: boolean;
    useLegacyMethods?: boolean;
  } = {}
): Promise<DutchieLocationResponse[]> {
  const { verbose = false } = options;
  let session = options.session;
  let shouldCloseSession = false;
  const { verbose = false, useLegacyMethods = false } = options;

  if (!session) {
    session = await createSession(city.citySlug);
    shouldCloseSession = true;
  }
  console.log(`[LocationDiscovery] Fetching locations for ${city.cityName}, ${city.stateCode}...`);

  try {
    console.log(`[LocationDiscovery] Fetching locations for ${city.cityName}, ${city.stateCode}...`);

    // Try multiple approaches to get location data

    // Approach 1: Extract from page __NEXT_DATA__ or similar
    const locations = await extractLocationsFromPage(session.page, verbose);
    if (locations.length > 0) {
      console.log(`[LocationDiscovery] Found ${locations.length} locations from page data`);
      return locations;
    }

    // Approach 2: Try the geo-based GraphQL query
    const geoLocations = await fetchLocationsViaGraphQL(session, city, verbose);
    if (geoLocations.length > 0) {
      console.log(`[LocationDiscovery] Found ${geoLocations.length} locations from GraphQL`);
      return geoLocations;
    }

    // Approach 3: Scrape visible location cards
    const scrapedLocations = await scrapeLocationCards(session.page, verbose);
    if (scrapedLocations.length > 0) {
      console.log(`[LocationDiscovery] Found ${scrapedLocations.length} locations from scraping`);
      return scrapedLocations;
    }

    console.log(`[LocationDiscovery] No locations found for ${city.cityName}`);
    return [];
  } finally {
    if (shouldCloseSession) {
      await closeSession(session);
  // PRIMARY METHOD: City+State GraphQL query (SOURCE OF TRUTH)
  if (city.cityName && city.stateCode) {
    try {
      const locations = await fetchDispensariesByCityState(city.cityName, city.stateCode, { verbose });
      if (locations.length > 0) {
        console.log(`[LocationDiscovery] Found ${locations.length} locations via GraphQL city+state`);
        return locations;
      }
    } catch (error: any) {
      console.warn(`[LocationDiscovery] GraphQL city+state failed: ${error.message}`);
    }
  }

  // FALLBACK: Legacy Puppeteer-based methods (only if explicitly enabled)
  if (useLegacyMethods) {
    let session = options.session;
    let shouldCloseSession = false;

    if (!session) {
      session = await createSession(city.citySlug);
      shouldCloseSession = true;
    }

    try {
      // Legacy Approach 1: Extract from page __NEXT_DATA__
      const locations = await extractLocationsFromPage(session.page, verbose);
      if (locations.length > 0) {
        console.log(`[LocationDiscovery] Found ${locations.length} locations from page data (legacy)`);
        return locations;
      }

      // Legacy Approach 2: Try the geo-based GraphQL query
      // NOTE: Geo queries are for SEARCH RESULTS only, not source of truth
      const geoLocations = await fetchLocationsViaGraphQL(session, city, verbose);
      if (geoLocations.length > 0) {
        console.log(`[LocationDiscovery] Found ${geoLocations.length} locations from geo GraphQL (legacy)`);
        return geoLocations;
      }

      // Legacy Approach 3: Scrape visible location cards
      const scrapedLocations = await scrapeLocationCards(session.page, verbose);
      if (scrapedLocations.length > 0) {
        console.log(`[LocationDiscovery] Found ${scrapedLocations.length} locations from scraping (legacy)`);
        return scrapedLocations;
      }
    } finally {
      if (shouldCloseSession) {
        await closeSession(session);
      }
    }
  }

  console.log(`[LocationDiscovery] No locations found for ${city.cityName}`);
  return [];
}

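// Illustrative usage (not part of this commit): the refactored function now
// defaults to the GraphQL path; Puppeteer fallbacks must be opted into.
//
//   const locations = await fetchLocationsForCity(city, {
//     verbose: true,
//     useLegacyMethods: false, // set true to allow Puppeteer fallbacks
//   });
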
/**
@@ -202,33 +555,52 @@ async function extractLocationsFromPage(

/**
 * Fetch locations via GraphQL geo-based query.
 *
 * Uses ConsumerDispensaries with geo filtering:
 * - dispensaryFilter.nearLat/nearLng for center point
 * - dispensaryFilter.distance for radius in miles
 * - Response at data.filteredDispensaries
 */
async function fetchLocationsViaGraphQL(
  session: SessionCredentials,
  city: DiscoveryCity,
  verbose: boolean
): Promise<DutchieLocationResponse[]> {
  // Use a known center point for the city or default to a central US location
  const CITY_COORDS: Record<string, { lat: number; lng: number }> = {
    'phoenix': { lat: 33.4484, lng: -112.074 },
    'tucson': { lat: 32.2226, lng: -110.9747 },
    'scottsdale': { lat: 33.4942, lng: -111.9261 },
    'mesa': { lat: 33.4152, lng: -111.8315 },
    'tempe': { lat: 33.4255, lng: -111.94 },
    'flagstaff': { lat: 35.1983, lng: -111.6513 },
    // Add more as needed
  // City center coordinates with appropriate radius
  const CITY_COORDS: Record<string, { lat: number; lng: number; radius: number }> = {
    'phoenix': { lat: 33.4484, lng: -112.074, radius: 50 },
    'tucson': { lat: 32.2226, lng: -110.9747, radius: 50 },
    'scottsdale': { lat: 33.4942, lng: -111.9261, radius: 30 },
    'mesa': { lat: 33.4152, lng: -111.8315, radius: 30 },
    'tempe': { lat: 33.4255, lng: -111.94, radius: 30 },
    'flagstaff': { lat: 35.1983, lng: -111.6513, radius: 50 },
  };

  const coords = CITY_COORDS[city.citySlug] || { lat: 33.4484, lng: -112.074 };
  // State-wide coordinates for full coverage
  const STATE_COORDS: Record<string, { lat: number; lng: number; radius: number }> = {
    'AZ': { lat: 33.4484, lng: -112.074, radius: 200 },
    'CA': { lat: 36.7783, lng: -119.4179, radius: 400 },
    'CO': { lat: 39.5501, lng: -105.7821, radius: 200 },
    'FL': { lat: 27.6648, lng: -81.5158, radius: 400 },
    'MI': { lat: 44.3148, lng: -85.6024, radius: 250 },
    'NV': { lat: 36.1699, lng: -115.1398, radius: 200 },
  };

  // Try city-specific coords first, then state-wide, then default
  const coords = CITY_COORDS[city.citySlug]
    || (city.stateCode && STATE_COORDS[city.stateCode])
    || { lat: 33.4484, lng: -112.074, radius: 200 };

  // Correct GraphQL variables for ConsumerDispensaries
  const variables = {
    dispensariesFilter: {
      latitude: coords.lat,
      longitude: coords.lng,
      distance: 50, // miles
      state: city.stateCode,
      city: city.cityName,
    dispensaryFilter: {
      activeOnly: true,
      nearLat: coords.lat,
      nearLng: coords.lng,
      distance: coords.radius,
    },
    page: 0,
    perPage: 200,
  };

  const hash = '0a5bfa6ca1d64ae47bcccb7c8077c87147cbc4e6982c17ceec97a2a4948b311b';
@@ -263,8 +635,19 @@ async function fetchLocationsViaGraphQL(
    return [];
  }

  const dispensaries = response.data?.data?.consumerDispensaries || [];
  return dispensaries.map((d: any) => normalizeLocationResponse(d));
  // Response is at data.filteredDispensaries
  const dispensaries = response.data?.data?.filteredDispensaries || [];

  // Filter to specific state if needed (radius may include neighboring states)
  const filtered = city.stateCode
    ? dispensaries.filter((d: any) => d.location?.state === city.stateCode)
    : dispensaries;

  if (verbose) {
    console.log(`[LocationDiscovery] GraphQL returned ${dispensaries.length} total, ${filtered.length} in ${city.stateCode || 'all states'}`);
  }

  return filtered.map((d: any) => normalizeLocationResponse(d));
} catch (error: any) {
  if (verbose) {
    console.log(`[LocationDiscovery] GraphQL error: ${error.message}`);
@@ -337,31 +720,57 @@ async function scrapeLocationCards(

/**
 * Normalize a raw location response to a consistent format.
 * Maps Dutchie camelCase fields to our snake_case equivalents.
 */
function normalizeLocationResponse(raw: any): DutchieLocationResponse {
  const slug = raw.slug || raw.cName || raw.urlSlug || '';
  const id = raw.id || raw._id || raw.dispensaryId || '';

  // Extract location data - GraphQL response nests address info in .location
  const loc = raw.location || {};

  // Extract coordinates from geometry.coordinates [longitude, latitude]
  const coords = loc.geometry?.coordinates || [];
  const longitude = coords[0] || raw.longitude || raw.lng || loc.longitude || loc.lng;
  const latitude = coords[1] || raw.latitude || raw.lat || loc.latitude || loc.lat;

  return {
    id,
    name: raw.name || raw.dispensaryName || '',
    slug,
    address: raw.address || raw.fullAddress || '',
    address1: raw.address1 || raw.addressLine1 || raw.streetAddress || '',
    address2: raw.address2 || raw.addressLine2 || '',
    city: raw.city || '',
    state: raw.state || raw.stateCode || '',
    zip: raw.zip || raw.zipCode || raw.postalCode || '',
    country: raw.country || raw.countryCode || 'US',
    latitude: raw.latitude || raw.lat || raw.location?.latitude,
    longitude: raw.longitude || raw.lng || raw.location?.longitude,
    cName: raw.cName || raw.slug || '',
    address: raw.address || raw.fullAddress || loc.ln1 || '',
    address1: raw.address1 || raw.addressLine1 || raw.streetAddress || loc.ln1 || '',
    address2: raw.address2 || raw.addressLine2 || loc.ln2 || '',
    city: raw.city || loc.city || '',
    state: raw.state || raw.stateCode || loc.state || '',
    zip: raw.zip || raw.zipCode || raw.postalCode || loc.zipcode || loc.zip || '',
    country: raw.country || raw.countryCode || loc.country || 'United States',
    latitude,
    longitude,
    timezone: raw.timezone || raw.tz || '',
    menuUrl: raw.menuUrl || (slug ? `https://dutchie.com/dispensary/${slug}` : ''),
    retailType: raw.retailType || raw.type || '',
    // Service offerings
    offerPickup: raw.offerPickup ?? raw.storeSettings?.offerPickup ?? true,
    offerDelivery: raw.offerDelivery ?? raw.storeSettings?.offerDelivery ?? false,
    isRecreational: raw.isRecreational ?? raw.retailType?.includes('Recreational') ?? true,
    isMedical: raw.isMedical ?? raw.retailType?.includes('Medical') ?? true,
    offerCurbsidePickup: raw.offerCurbsidePickup ?? false,
    // License types
    isRecreational: raw.isRecreational ?? raw.recDispensary ?? raw.retailType?.includes('Recreational') ?? true,
    isMedical: raw.isMedical ?? raw.medicalDispensary ?? raw.retailType?.includes('Medical') ?? true,
    // Contact info
    phone: raw.phone || '',
    email: raw.email || '',
    website: raw.embedBackUrl || '',
    // Branding
    description: raw.description || '',
    logoImage: raw.logoImage || '',
    bannerImage: raw.bannerImage || '',
    // Chain/enterprise info
    chainSlug: raw.chain || '',
    enterpriseId: raw.retailer?.enterpriseId || '',
    // Status
    status: raw.status || '',
    // Preserve any remaining raw fields (note: since this spread comes last,
    // keys present in raw override the normalized values above)
    ...raw,
  };
@@ -373,13 +782,20 @@ function normalizeLocationResponse(raw: any): DutchieLocationResponse {

/**
 * Upsert a location into dutchie_discovery_locations.
 * REQUIRES a valid platform ID (MongoDB ObjectId) - will skip records without one.
 */
export async function upsertLocation(
  pool: Pool,
  location: DutchieLocationResponse,
  cityId: number | null
): Promise<{ id: number; isNew: boolean }> {
  const platformLocationId = location.id || location.slug;
): Promise<{ id: number; isNew: boolean } | null> {
  // REQUIRE actual platform ID - NO fallback to slug
  const platformLocationId = location.id;
  if (!platformLocationId) {
    console.warn(`[LocationDiscovery] Skipping location without platform ID: ${location.name} (${location.slug})`);
    return null;
  }

  const menuUrl = location.menuUrl || `https://dutchie.com/dispensary/${location.slug}`;

  const result = await pool.query(
@@ -405,15 +821,27 @@ export async function upsertLocation(
      offers_pickup,
      is_recreational,
      is_medical,
      phone,
      website,
      email,
      description,
      logo_image,
      banner_image,
      chain_slug,
      enterprise_id,
      c_name,
      country,
      store_status,
      last_seen_at,
      updated_at
    ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, NOW(), NOW())
    ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31, $32, NOW(), NOW())
    ON CONFLICT (platform, platform_location_id)
    DO UPDATE SET
      name = EXCLUDED.name,
      platform_menu_url = EXCLUDED.platform_menu_url,
      raw_address = COALESCE(EXCLUDED.raw_address, dutchie_discovery_locations.raw_address),
      address_line1 = COALESCE(EXCLUDED.address_line1, dutchie_discovery_locations.address_line1),
      address_line2 = COALESCE(EXCLUDED.address_line2, dutchie_discovery_locations.address_line2),
      city = COALESCE(EXCLUDED.city, dutchie_discovery_locations.city),
      state_code = COALESCE(EXCLUDED.state_code, dutchie_discovery_locations.state_code),
      postal_code = COALESCE(EXCLUDED.postal_code, dutchie_discovery_locations.postal_code),
@@ -425,6 +853,17 @@ export async function upsertLocation(
      offers_pickup = COALESCE(EXCLUDED.offers_pickup, dutchie_discovery_locations.offers_pickup),
      is_recreational = COALESCE(EXCLUDED.is_recreational, dutchie_discovery_locations.is_recreational),
      is_medical = COALESCE(EXCLUDED.is_medical, dutchie_discovery_locations.is_medical),
      phone = COALESCE(EXCLUDED.phone, dutchie_discovery_locations.phone),
      website = COALESCE(EXCLUDED.website, dutchie_discovery_locations.website),
      email = COALESCE(EXCLUDED.email, dutchie_discovery_locations.email),
      description = COALESCE(EXCLUDED.description, dutchie_discovery_locations.description),
      logo_image = COALESCE(EXCLUDED.logo_image, dutchie_discovery_locations.logo_image),
      banner_image = COALESCE(EXCLUDED.banner_image, dutchie_discovery_locations.banner_image),
      chain_slug = COALESCE(EXCLUDED.chain_slug, dutchie_discovery_locations.chain_slug),
      enterprise_id = COALESCE(EXCLUDED.enterprise_id, dutchie_discovery_locations.enterprise_id),
      c_name = COALESCE(EXCLUDED.c_name, dutchie_discovery_locations.c_name),
      country = COALESCE(EXCLUDED.country, dutchie_discovery_locations.country),
      store_status = COALESCE(EXCLUDED.store_status, dutchie_discovery_locations.store_status),
      last_seen_at = NOW(),
      updated_at = NOW()
    RETURNING id, (xmax = 0) as is_new`,
@@ -440,7 +879,7 @@ export async function upsertLocation(
      location.city || null,
      location.state || null,
      location.zip || null,
      location.country || 'US',
      location.country || 'United States',
      location.latitude || null,
      location.longitude || null,
      location.timezone || null,
@@ -450,6 +889,17 @@ export async function upsertLocation(
      location.offerPickup ?? null,
      location.isRecreational ?? null,
      location.isMedical ?? null,
      location.phone || null,
      location.website || null,
      location.email || null,
      location.description || null,
      location.logoImage || null,
      location.bannerImage || null,
      location.chainSlug || null,
      location.enterpriseId || null,
      location.cName || null,
      location.country || 'United States',
      location.status || null,
    ]
  );

@@ -642,6 +1092,12 @@ export async function discoverLocationsForCity(

    const result = await upsertLocation(pool, location, city.id);

    // Skip locations without valid platform ID
    if (!result) {
      errors.push(`Location ${location.slug}: No valid platform ID - skipped`);
      continue;
    }

    if (result.isNew) {
      newCount++;
    } else {

backend/src/discovery/promotion.ts (new file, 579 lines)
@@ -0,0 +1,579 @@
/**
 * Discovery Promotion Service
 *
 * Handles the promotion of discovery locations to dispensaries:
 * 1. Discovery → Raw data in dutchie_discovery_locations (status='discovered')
 * 2. Validation → Check required fields, reject incomplete records
 * 3. Promotion → Idempotent upsert to dispensaries, link back via dispensary_id
 */

import { pool } from '../db/pool';
import { DiscoveryLocationRow, DiscoveryStatus } from './types';

// ============================================================
// VALIDATION
// ============================================================

export interface ValidationResult {
  valid: boolean;
  errors: string[];
}

export interface ValidationSummary {
  totalChecked: number;
  validCount: number;
  invalidCount: number;
  invalidRecords: Array<{
    id: number;
    name: string;
    errors: string[];
  }>;
}

/**
 * Validate that a single discovery location has all required fields for promotion
 */
export function validateForPromotion(loc: DiscoveryLocationRow): ValidationResult {
  const errors: string[] = [];

  // Required fields
  if (!loc.platform_location_id) {
    errors.push('Missing platform_location_id');
  }
  if (!loc.name || loc.name.trim() === '') {
    errors.push('Missing name');
  }
  if (!loc.city || loc.city.trim() === '') {
    errors.push('Missing city');
  }
  if (!loc.state_code || loc.state_code.trim() === '') {
    errors.push('Missing state_code');
  }
  if (!loc.platform_menu_url) {
    errors.push('Missing platform_menu_url');
  }

  return {
    valid: errors.length === 0,
    errors,
  };
}

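// Illustrative sketch (not part of this commit): what validation rejects.
// The object below is a hypothetical row; only the shape matters.
//
//   const row = {
//     id: 1, name: 'Test Store', city: '', state_code: 'AZ',
//     platform_location_id: 'abc123', platform_menu_url: null,
//   } as unknown as DiscoveryLocationRow;
//
//   validateForPromotion(row);
//   // => { valid: false, errors: ['Missing city', 'Missing platform_menu_url'] }
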
/**
 * Validate all discovered locations and return a summary
 */
export async function validateDiscoveredLocations(
  stateCode?: string
): Promise<ValidationSummary> {
  let query = `
    SELECT * FROM dutchie_discovery_locations
    WHERE status = 'discovered'
  `;
  const params: string[] = [];

  if (stateCode) {
    query += ` AND state_code = $1`;
    params.push(stateCode);
  }

  const result = await pool.query(query, params);
  const locations = result.rows as DiscoveryLocationRow[];

  const invalidRecords: ValidationSummary['invalidRecords'] = [];
  let validCount = 0;

  for (const loc of locations) {
    const validation = validateForPromotion(loc);
    if (validation.valid) {
      validCount++;
    } else {
      invalidRecords.push({
        id: loc.id,
        name: loc.name,
        errors: validation.errors,
      });
    }
  }

  return {
    totalChecked: locations.length,
    validCount,
    invalidCount: invalidRecords.length,
    invalidRecords,
  };
}

// ============================================================
// PROMOTION
// ============================================================

export interface PromotionResult {
  discoveryId: number;
  dispensaryId: number;
  action: 'created' | 'updated' | 'skipped';
  name: string;
}

export interface PromotionSummary {
  totalProcessed: number;
  created: number;
  updated: number;
  skipped: number;
  rejected: number;
  results: PromotionResult[];
  rejectedRecords: Array<{
    id: number;
    name: string;
    errors: string[];
  }>;
  durationMs: number;
}

/**
 * Generate a URL-safe slug from name, city, and state
 */
function generateSlug(name: string, city: string, state: string): string {
  const base = `${name}-${city}-${state}`
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-|-$/g, '')
    .substring(0, 100);
  return base;
}

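// Example output (illustrative): every non-alphanumeric run collapses to a
// single hyphen, then leading/trailing hyphens are trimmed.
//
//   generateSlug("Joe's Dispensary", 'Phoenix', 'AZ');
//   // => 'joe-s-dispensary-phoenix-az'
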
/**
 * Log a promotion action to dutchie_promotion_log
 */
async function logPromotionAction(
  action: string,
  discoveryId: number | null,
  dispensaryId: number | null,
  stateCode: string | null,
  storeName: string | null,
  validationErrors: string[] | null = null,
  fieldChanges: Record<string, any> | null = null,
  triggeredBy: string = 'auto'
): Promise<void> {
  await pool.query(`
    INSERT INTO dutchie_promotion_log
      (discovery_id, dispensary_id, action, state_code, store_name, validation_errors, field_changes, triggered_by)
    VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
  `, [
    discoveryId,
    dispensaryId,
    action,
    stateCode,
    storeName,
    validationErrors,
    fieldChanges ? JSON.stringify(fieldChanges) : null,
    triggeredBy,
  ]);
}

/**
 * Create a status alert for the dashboard
 */
export async function createStatusAlert(
  dispensaryId: number,
  profileId: number | null,
  alertType: string,
  severity: 'info' | 'warning' | 'error' | 'critical',
  message: string,
  previousStatus?: string | null,
  newStatus?: string | null,
  metadata?: Record<string, any>
): Promise<number> {
  const result = await pool.query(`
    INSERT INTO crawler_status_alerts
      (dispensary_id, profile_id, alert_type, severity, message, previous_status, new_status, metadata)
    VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
    RETURNING id
  `, [
    dispensaryId,
    profileId,
    alertType,
    severity,
    message,
    previousStatus || null,
    newStatus || null,
    metadata ? JSON.stringify(metadata) : null,
  ]);
  return result.rows[0].id;
}

/**
 * Create a crawler profile with initial sandbox status for a dispensary,
 * or return the existing enabled profile if one is already present
 */
async function ensureCrawlerProfile(
  dispensaryId: number,
  dispensaryName: string,
  platformDispensaryId: string
): Promise<{ profileId: number; created: boolean }> {
  // Check if profile already exists
  const existingResult = await pool.query(`
    SELECT id FROM dispensary_crawler_profiles
    WHERE dispensary_id = $1 AND enabled = true
    LIMIT 1
  `, [dispensaryId]);

  if (existingResult.rows.length > 0) {
    return { profileId: existingResult.rows[0].id, created: false };
  }

  // Create new profile with sandbox status
  const profileKey = dispensaryName
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-|-$/g, '')
    .substring(0, 50);

  const insertResult = await pool.query(`
    INSERT INTO dispensary_crawler_profiles (
      dispensary_id,
      profile_name,
      profile_key,
      crawler_type,
      status,
      status_reason,
      status_changed_at,
      config,
      enabled,
      consecutive_successes,
      consecutive_failures,
      created_at,
      updated_at
    ) VALUES (
      $1, $2, $3, 'dutchie', 'sandbox', 'Newly promoted from discovery', CURRENT_TIMESTAMP,
      $4::jsonb, true, 0, 0, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP
    )
    RETURNING id
  `, [
    dispensaryId,
    dispensaryName,
    profileKey,
    JSON.stringify({
      platformDispensaryId,
      useBothModes: true,
      downloadImages: true,
      trackStock: true,
    }),
  ]);

  const profileId = insertResult.rows[0].id;

  // Create status alert for new sandbox store
  await createStatusAlert(
    dispensaryId,
    profileId,
    'promoted',
    'info',
    `${dispensaryName} promoted to sandbox - awaiting first successful crawl`,
    null,
    'sandbox',
    { source: 'discovery_promotion', platformDispensaryId }
  );

  return { profileId, created: true };
}

/**
 * Promote a single discovery location to dispensaries table
 * Idempotent: uses ON CONFLICT on platform_dispensary_id
 */
async function promoteLocation(
  loc: DiscoveryLocationRow
): Promise<PromotionResult> {
  const slug = loc.platform_slug || generateSlug(loc.name, loc.city || '', loc.state_code || '');

  // Upsert into dispensaries
  // ON CONFLICT by platform_dispensary_id ensures idempotency
  const upsertResult = await pool.query(`
    INSERT INTO dispensaries (
      platform,
      name,
      slug,
      city,
      state,
      address1,
      address2,
      zipcode,
      postal_code,
      phone,
      website,
      email,
      latitude,
      longitude,
      timezone,
      platform_dispensary_id,
      menu_url,
      menu_type,
      description,
      logo_image,
      banner_image,
      offer_pickup,
      offer_delivery,
      is_medical,
      is_recreational,
      chain_slug,
      enterprise_id,
      c_name,
      country,
      status,
      crawl_enabled,
      dutchie_verified,
      dutchie_verified_at,
      dutchie_discovery_id,
      created_at,
      updated_at
    ) VALUES (
      $1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
      $11, $12, $13, $14, $15, $16, $17, $18, $19, $20,
      $21, $22, $23, $24, $25, $26, $27, $28, $29, $30,
      $31, $32, $33, $34, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP
    )
    ON CONFLICT (platform_dispensary_id) WHERE platform_dispensary_id IS NOT NULL
    DO UPDATE SET
      name = EXCLUDED.name,
      city = EXCLUDED.city,
      state = EXCLUDED.state,
      address1 = EXCLUDED.address1,
      address2 = EXCLUDED.address2,
      zipcode = EXCLUDED.zipcode,
      postal_code = EXCLUDED.postal_code,
      phone = EXCLUDED.phone,
      website = EXCLUDED.website,
      email = EXCLUDED.email,
      latitude = EXCLUDED.latitude,
      longitude = EXCLUDED.longitude,
      timezone = EXCLUDED.timezone,
      menu_url = EXCLUDED.menu_url,
      description = EXCLUDED.description,
      logo_image = EXCLUDED.logo_image,
      banner_image = EXCLUDED.banner_image,
      offer_pickup = EXCLUDED.offer_pickup,
      offer_delivery = EXCLUDED.offer_delivery,
      is_medical = EXCLUDED.is_medical,
      is_recreational = EXCLUDED.is_recreational,
      chain_slug = EXCLUDED.chain_slug,
      enterprise_id = EXCLUDED.enterprise_id,
      c_name = EXCLUDED.c_name,
      country = EXCLUDED.country,
      status = EXCLUDED.status,
      dutchie_discovery_id = EXCLUDED.dutchie_discovery_id,
      updated_at = CURRENT_TIMESTAMP
    RETURNING id, (xmax = 0) AS inserted
  `, [
    loc.platform || 'dutchie',        // $1 platform
    loc.name,                         // $2 name
    slug,                             // $3 slug
    loc.city,                         // $4 city
    loc.state_code,                   // $5 state
    loc.address_line1,                // $6 address1
    loc.address_line2,                // $7 address2
    loc.postal_code,                  // $8 zipcode
    loc.postal_code,                  // $9 postal_code
    loc.phone,                        // $10 phone
    loc.website,                      // $11 website
    loc.email,                        // $12 email
    loc.latitude,                     // $13 latitude
    loc.longitude,                    // $14 longitude
    loc.timezone,                     // $15 timezone
    loc.platform_location_id,         // $16 platform_dispensary_id
    loc.platform_menu_url,            // $17 menu_url
    'dutchie',                        // $18 menu_type
    loc.description,                  // $19 description
    loc.logo_image,                   // $20 logo_image
    loc.banner_image,                 // $21 banner_image
    loc.offers_pickup ?? true,        // $22 offer_pickup
    loc.offers_delivery ?? false,     // $23 offer_delivery
    loc.is_medical ?? false,          // $24 is_medical
    loc.is_recreational ?? true,      // $25 is_recreational
    loc.chain_slug,                   // $26 chain_slug
    loc.enterprise_id,                // $27 enterprise_id
    loc.c_name,                       // $28 c_name
    loc.country || 'United States',   // $29 country
    loc.store_status || 'open',       // $30 status
    true,                             // $31 crawl_enabled
    true,                             // $32 dutchie_verified
    new Date(),                       // $33 dutchie_verified_at
    loc.id,                           // $34 dutchie_discovery_id
  ]);

  const dispensaryId = upsertResult.rows[0].id;
  const wasInserted = upsertResult.rows[0].inserted;

  // Link discovery location back to dispensary and update status
  await pool.query(`
    UPDATE dutchie_discovery_locations
    SET
      dispensary_id = $1,
      status = 'verified',
      verified_at = CURRENT_TIMESTAMP,
      verified_by = 'auto-promotion'
    WHERE id = $2
  `, [dispensaryId, loc.id]);

  // Create crawler profile with sandbox status for new dispensaries
  if (wasInserted && loc.platform_location_id) {
    await ensureCrawlerProfile(dispensaryId, loc.name, loc.platform_location_id);
  }

  const action = wasInserted ? 'promoted_create' : 'promoted_update';

  // Log the promotion
  await logPromotionAction(
    action,
    loc.id,
    dispensaryId,
    loc.state_code,
    loc.name,
    null,
    { slug, city: loc.city, platform_location_id: loc.platform_location_id }
  );

  return {
    discoveryId: loc.id,
    dispensaryId,
    action: wasInserted ? 'created' : 'updated',
    name: loc.name,
  };
}

/**
 * Promote all valid discovered locations to dispensaries
 *
 * @param stateCode Optional filter by state (e.g., 'CA', 'AZ')
 * @param dryRun If true, only validate without making changes
 */
export async function promoteDiscoveredLocations(
  stateCode?: string,
  dryRun = false
): Promise<PromotionSummary> {
  const startTime = Date.now();

  let query = `
    SELECT * FROM dutchie_discovery_locations
    WHERE status = 'discovered'
  `;
  const params: string[] = [];

  if (stateCode) {
    query += ` AND state_code = $1`;
    params.push(stateCode);
  }

  query += ` ORDER BY id`;

  const result = await pool.query(query, params);
  const locations = result.rows as DiscoveryLocationRow[];

  const results: PromotionResult[] = [];
  const rejectedRecords: PromotionSummary['rejectedRecords'] = [];
  let created = 0;
  let updated = 0;
  let skipped = 0;
  let rejected = 0;

  for (const loc of locations) {
    // Step 2: Validation
    const validation = validateForPromotion(loc);

    if (!validation.valid) {
      rejected++;
      rejectedRecords.push({
        id: loc.id,
        name: loc.name,
        errors: validation.errors,
      });

      // Mark as rejected if not dry run
      if (!dryRun) {
        await pool.query(`
          UPDATE dutchie_discovery_locations
          SET status = 'rejected', notes = $1
          WHERE id = $2
        `, [validation.errors.join('; '), loc.id]);

        // Log the rejection
        await logPromotionAction(
          'rejected',
          loc.id,
          null,
          loc.state_code,
          loc.name,
          validation.errors
        );
      }
      continue;
    }

    // Step 3: Promotion (skip if dry run)
    if (dryRun) {
      skipped++;
      results.push({
        discoveryId: loc.id,
        dispensaryId: 0,
        action: 'skipped',
        name: loc.name,
      });
      continue;
    }

    try {
      const promotionResult = await promoteLocation(loc);
      results.push(promotionResult);

      if (promotionResult.action === 'created') {
        created++;
      } else {
        updated++;
      }
    } catch (error: any) {
      console.error(`Failed to promote location ${loc.id} (${loc.name}):`, error.message);
      rejected++;
      rejectedRecords.push({
        id: loc.id,
        name: loc.name,
        errors: [`Promotion error: ${error.message}`],
      });
    }
  }

  return {
    totalProcessed: locations.length,
    created,
    updated,
    skipped,
    rejected,
    results,
    rejectedRecords,
    durationMs: Date.now() - startTime,
  };
}

/**
 * Promote a single discovery location by ID
 */
export async function promoteSingleLocation(
  discoveryId: number
): Promise<PromotionResult> {
  const result = await pool.query(
    `SELECT * FROM dutchie_discovery_locations WHERE id = $1`,
    [discoveryId]
  );

  if (result.rows.length === 0) {
    throw new Error(`Discovery location ${discoveryId} not found`);
  }

  const loc = result.rows[0] as DiscoveryLocationRow;

  // Validate
  const validation = validateForPromotion(loc);
  if (!validation.valid) {
    throw new Error(`Validation failed: ${validation.errors.join(', ')}`);
  }

  // Promote
  return promoteLocation(loc);
}
@@ -18,8 +18,8 @@ import {
  getCitiesToCrawl,
  getCityBySlug,
  seedKnownCities,
  ARIZONA_CITIES,
} from './city-discovery';
import { getCitiesForState } from './location-discovery';
import {
  DiscoveryLocation,
  DiscoveryCity,
@@ -27,6 +27,11 @@ import {
  mapLocationRowToLocation,
  mapCityRowToCity,
} from './types';
import {
  validateDiscoveredLocations,
  promoteDiscoveredLocations,
  promoteSingleLocation,
} from './promotion';

export function createDiscoveryRoutes(pool: Pool): Router {
  const router = Router();
@@ -53,44 +58,44 @@ export function createDiscoveryRoutes(pool: Pool): Router {
      offset = '0',
    } = req.query;

    let whereClause = 'WHERE platform = $1 AND active = TRUE';
    let whereClause = 'WHERE dl.platform = $1 AND dl.active = TRUE';
    const params: any[] = [platform];
    let paramIndex = 2;

    if (status) {
      whereClause += ` AND status = $${paramIndex}`;
      whereClause += ` AND dl.status = $${paramIndex}`;
      params.push(status);
      paramIndex++;
    }

    if (stateCode) {
      whereClause += ` AND state_code = $${paramIndex}`;
      whereClause += ` AND dl.state_code = $${paramIndex}`;
      params.push(stateCode);
      paramIndex++;
    }

    if (countryCode) {
      whereClause += ` AND country_code = $${paramIndex}`;
      whereClause += ` AND dl.country_code = $${paramIndex}`;
      params.push(countryCode);
      paramIndex++;
    }

    if (city) {
      whereClause += ` AND city ILIKE $${paramIndex}`;
      whereClause += ` AND dl.city ILIKE $${paramIndex}`;
      params.push(`%${city}%`);
      paramIndex++;
    }

    if (search) {
      whereClause += ` AND (name ILIKE $${paramIndex} OR platform_slug ILIKE $${paramIndex})`;
      whereClause += ` AND (dl.name ILIKE $${paramIndex} OR dl.platform_slug ILIKE $${paramIndex})`;
      params.push(`%${search}%`);
      paramIndex++;
    }

    if (hasDispensary === 'true') {
      whereClause += ' AND dispensary_id IS NOT NULL';
      whereClause += ' AND dl.dispensary_id IS NOT NULL';
    } else if (hasDispensary === 'false') {
      whereClause += ' AND dispensary_id IS NULL';
      whereClause += ' AND dl.dispensary_id IS NULL';
    }

    params.push(parseInt(limit as string, 10), parseInt(offset as string, 10));
@@ -705,15 +710,22 @@ export function createDiscoveryRoutes(pool: Pool): Router {
      return res.status(400).json({ error: 'stateCode is required' });
    }

    let cities: any[] = [];
    if (stateCode === 'AZ') {
      cities = ARIZONA_CITIES;
    } else {
    // Dynamically fetch cities from Dutchie for any state
    const cityNames = await getCitiesForState(stateCode as string);

    if (cityNames.length === 0) {
      return res.status(400).json({
        error: `No predefined cities for state: ${stateCode}. Add cities to city-discovery.ts`,
        error: `No cities found for state: ${stateCode}`,
      });
    }

    // Convert to seed format
    const cities = cityNames.map(name => ({
      name,
      slug: name.toLowerCase().replace(/\s+/g, '-').replace(/[^a-z0-9-]/g, ''),
      stateCode: stateCode as string,
    }));

    const result = await seedKnownCities(pool, cities);

    res.json({
@@ -834,6 +846,136 @@ export function createDiscoveryRoutes(pool: Pool): Router {
    }
  });

  // ============================================================
  // PROMOTION ENDPOINTS
  // ============================================================

  /**
   * GET /api/discovery/admin/validate
   * Validate discovered locations before promotion
   */
  router.get('/admin/validate', async (req: Request, res: Response) => {
    try {
      const { stateCode } = req.query;
      const summary = await validateDiscoveredLocations(stateCode as string | undefined);

      res.json({
        success: true,
        ...summary,
      });
    } catch (error: any) {
      res.status(500).json({ error: error.message });
    }
  });

  /**
   * POST /api/discovery/admin/promote
   * Promote all valid discovered locations to dispensaries (idempotent)
   *
   * Body params:
   * - stateCode: Filter by state (e.g., 'CA', 'AZ')
   * - dryRun: If true, only validate without making changes
   */
  router.post('/admin/promote', async (req: Request, res: Response) => {
    try {
      const { stateCode, dryRun = false } = req.body;

      console.log(`[Discovery API] Starting promotion for ${stateCode || 'all states'} (dryRun=${dryRun})`);
      const summary = await promoteDiscoveredLocations(stateCode, dryRun);

      res.json({
        success: true,
        ...summary,
      });
    } catch (error: any) {
      res.status(500).json({ error: error.message });
    }
  });

/**
|
||||
* POST /api/discovery/admin/promote/:id
|
||||
* Promote a single discovery location by ID
|
||||
*/
|
||||
router.post('/admin/promote/:id', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const { id } = req.params;
|
||||
|
||||
console.log(`[Discovery API] Promoting single location ${id}`);
|
||||
const result = await promoteSingleLocation(parseInt(id, 10));
|
||||
|
||||
res.json({
|
||||
success: true,
|
||||
...result,
|
||||
});
|
||||
} catch (error: any) {
|
||||
res.status(500).json({ error: error.message });
|
||||
}
|
||||
});
|
||||
|
||||
// ============================================================
|
||||
// PROMOTION LOG
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* GET /api/discovery/promotion-log
|
||||
* Get promotion audit log
|
||||
*/
|
||||
router.get('/promotion-log', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const { state, dispensary_id, limit = '100' } = req.query;
|
||||
|
||||
let whereClause = 'WHERE 1=1';
|
||||
const params: any[] = [];
|
||||
let paramIndex = 1;
|
||||
|
||||
if (state) {
|
||||
whereClause += ` AND pl.state_code = $${paramIndex}`;
|
||||
params.push(state);
|
||||
paramIndex++;
|
||||
}
|
||||
|
||||
if (dispensary_id) {
|
||||
whereClause += ` AND pl.dispensary_id = $${paramIndex}`;
|
||||
params.push(parseInt(dispensary_id as string, 10));
|
||||
paramIndex++;
|
||||
}
|
||||
|
||||
params.push(parseInt(limit as string, 10));
|
||||
|
||||
const { rows } = await pool.query(`
|
||||
SELECT
|
||||
pl.*,
|
||||
dl.name as discovery_name,
|
||||
d.name as dispensary_name
|
||||
FROM dutchie_promotion_log pl
|
||||
LEFT JOIN dutchie_discovery_locations dl ON pl.discovery_id = dl.id
|
||||
LEFT JOIN dispensaries d ON pl.dispensary_id = d.id
|
||||
${whereClause}
|
||||
ORDER BY pl.created_at DESC
|
||||
LIMIT $${paramIndex}
|
||||
`, params);
|
||||
|
||||
res.json({
|
||||
logs: rows.map((r: any) => ({
|
||||
id: r.id,
|
||||
discoveryId: r.discovery_id,
|
||||
dispensaryId: r.dispensary_id,
|
||||
action: r.action,
|
||||
stateCode: r.state_code,
|
||||
storeName: r.store_name,
|
||||
validationErrors: r.validation_errors,
|
||||
fieldChanges: r.field_changes,
|
||||
triggeredBy: r.triggered_by,
|
||||
createdAt: r.created_at,
|
||||
discoveryName: r.discovery_name,
|
||||
dispensaryName: r.dispensary_name,
|
||||
})),
|
||||
});
|
||||
} catch (error: any) {
|
||||
res.status(500).json({ error: error.message });
|
||||
}
|
||||
});
|
||||
|
||||
return router;
|
||||
}
|
||||
|
||||
|
||||
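Taken together, the admin endpoints above support a review-then-apply workflow: validate, preview with a dry run, then promote for real. A hedged usage sketch follows; the base URL and the `/api/discovery` mount path are assumptions read off the route comments, and the response shapes are simply logged rather than asserted.

```typescript
async function promoteState(baseUrl: string, stateCode: string): Promise<void> {
  // 1. Validate: report which discovered locations would pass promotion checks.
  const validation = await fetch(
    `${baseUrl}/api/discovery/admin/validate?stateCode=${stateCode}`
  ).then((r) => r.json());
  console.log('validation summary:', validation);

  // 2. Dry run: promoteDiscoveredLocations executes without writing changes.
  const dryRun = await fetch(`${baseUrl}/api/discovery/admin/promote`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ stateCode, dryRun: true }),
  }).then((r) => r.json());
  console.log('dry-run summary:', dryRun);

  // 3. Real promotion (idempotent, per the endpoint docs).
  await fetch(`${baseUrl}/api/discovery/admin/promote`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ stateCode }),
  });
}
```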
@@ -60,6 +60,7 @@ export interface DiscoveryLocation {
  stateCode: string | null;
  postalCode: string | null;
  countryCode: string | null;
+ country: string | null;
  latitude: number | null;
  longitude: number | null;
  timezone: string | null;
@@ -72,6 +73,18 @@ export interface DiscoveryLocation {
  offersPickup: boolean | null;
  isRecreational: boolean | null;
  isMedical: boolean | null;
+ // New Dutchie fields
+ phone: string | null;
+ website: string | null;
+ email: string | null;
+ description: string | null;
+ logoImage: string | null;
+ bannerImage: string | null;
+ chainSlug: string | null;
+ enterpriseId: string | null;
+ cName: string | null;
+ storeStatus: string | null;
+ // Timestamps
  firstSeenAt: Date;
  lastSeenAt: Date;
  lastCheckedAt: Date | null;
@@ -96,6 +109,7 @@ export interface DiscoveryLocationRow {
  state_code: string | null;
  postal_code: string | null;
  country_code: string | null;
+ country: string | null;
  latitude: number | null;
  longitude: number | null;
  timezone: string | null;
@@ -108,6 +122,18 @@ export interface DiscoveryLocationRow {
  offers_pickup: boolean | null;
  is_recreational: boolean | null;
  is_medical: boolean | null;
+ // New Dutchie fields (snake_case for DB row)
+ phone: string | null;
+ website: string | null;
+ email: string | null;
+ description: string | null;
+ logo_image: string | null;
+ banner_image: string | null;
+ chain_slug: string | null;
+ enterprise_id: string | null;
+ c_name: string | null;
+ store_status: string | null;
+ // Timestamps
  first_seen_at: Date;
  last_seen_at: Date;
  last_checked_at: Date | null;
@@ -245,6 +271,7 @@ export function mapLocationRowToLocation(row: DiscoveryLocationRow): DiscoveryLo
  stateCode: row.state_code,
  postalCode: row.postal_code,
  countryCode: row.country_code,
+ country: row.country,
  latitude: row.latitude,
  longitude: row.longitude,
  timezone: row.timezone,
@@ -257,6 +284,18 @@ export function mapLocationRowToLocation(row: DiscoveryLocationRow): DiscoveryLo
  offersPickup: row.offers_pickup,
  isRecreational: row.is_recreational,
  isMedical: row.is_medical,
+ // New Dutchie fields
+ phone: row.phone,
+ website: row.website,
+ email: row.email,
+ description: row.description,
+ logoImage: row.logo_image,
+ bannerImage: row.banner_image,
+ chainSlug: row.chain_slug,
+ enterpriseId: row.enterprise_id,
+ cName: row.c_name,
+ storeStatus: row.store_status,
+ // Timestamps
  firstSeenAt: row.first_seen_at,
  lastSeenAt: row.last_seen_at,
  lastCheckedAt: row.last_checked_at,
@@ -1,199 +0,0 @@
# Dutchie AZ Pipeline

## Overview

The Dutchie AZ pipeline is the **only** authorized way to crawl Dutchie dispensary menus. It uses Dutchie's GraphQL API directly (no DOM scraping) and writes to an isolated database with a proper snapshot model.

## Key Principles

1. **GraphQL Only** - All Dutchie data is fetched via their FilteredProducts GraphQL API
2. **Isolated Database** - Data lives in `dutchie_az_*` tables, NOT the legacy `products` table
3. **Append-Only Snapshots** - Every crawl creates snapshots, never overwrites historical data
4. **Stock Status Tracking** - Derived from `POSMetaData.children` inventory data
5. **Missing Product Detection** - Products not in feed are marked with `isPresentInFeed=false`

## Directory Structure

```
src/dutchie-az/
├── db/
│   ├── connection.ts        # Database connection pool
│   └── schema.ts            # Table definitions and migrations
├── routes/
│   └── index.ts             # REST API endpoints
├── services/
│   ├── graphql-client.ts    # Direct GraphQL fetch (Mode A + Mode B)
│   ├── product-crawler.ts   # Main crawler orchestration
│   └── scheduler.ts         # Jittered scheduling with wandering intervals
└── types/
    └── index.ts             # TypeScript interfaces
```

## Data Model

### Tables

- **dispensaries** - Arizona Dutchie stores with `platform_dispensary_id`
- **dutchie_products** - Canonical product identity (one row per product per store)
- **dutchie_product_snapshots** - Historical state per crawl (append-only)
- **job_schedules** - Scheduler configuration with jitter support
- **job_run_logs** - Execution history

### Stock Status

The `stock_status` field is derived from `POSMetaData.children`:

```typescript
function deriveStockStatus(children?: POSChild[]): StockStatus {
  if (!children || children.length === 0) return 'unknown';
  const totalAvailable = children.reduce((sum, c) =>
    sum + (c.quantityAvailable || 0), 0);
  return totalAvailable > 0 ? 'in_stock' : 'out_of_stock';
}
```
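
For example (illustrative values, not real feed data):

```typescript
deriveStockStatus([{ quantityAvailable: 0 }, { quantityAvailable: 0 }]); // 'out_of_stock'
deriveStockStatus([{ quantityAvailable: 3 }]);                           // 'in_stock'
deriveStockStatus(undefined);                                            // 'unknown' (no POS children)
```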

### Two-Mode Crawling

Mode A (UI Parity):
- `Status: null` - Returns what the UI shows
- Best for "current inventory" snapshot

Mode B (Max Coverage):
- `Status: 'Active'` - Returns all active products
- Catches items with `isBelowThreshold: true`

Both modes are merged to get maximum product coverage.
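
A minimal sketch of such a merge, assuming products carry a unique `id` field (the crawler's actual merge logic may differ):

```typescript
// Merge Mode A and Mode B result sets, de-duplicating on product id.
// Mode A wins on conflicts since it reflects what the UI actually shows.
function mergeModes<T extends { id: string }>(modeA: T[], modeB: T[]): T[] {
  const byId = new Map<string, T>();
  for (const p of modeB) byId.set(p.id, p);
  for (const p of modeA) byId.set(p.id, p); // Mode A overwrites Mode B
  return [...byId.values()];
}
```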

## API Endpoints

All endpoints are mounted at `/api/dutchie-az/`:

```
GET  /api/dutchie-az/dispensaries            - List all dispensaries
GET  /api/dutchie-az/dispensaries/:id        - Get dispensary details
GET  /api/dutchie-az/products                - List products (with filters)
GET  /api/dutchie-az/products/:id            - Get product with snapshots
GET  /api/dutchie-az/products/:id/snapshots  - Get product snapshot history
POST /api/dutchie-az/crawl/:dispensaryId     - Trigger manual crawl
GET  /api/dutchie-az/schedule                - Get scheduler status
POST /api/dutchie-az/schedule/run            - Manually run scheduled jobs
GET  /api/dutchie-az/stats                   - Dashboard statistics
```
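
A hypothetical client call against these routes (the `BASE` host is an assumption; response bodies are just logged):

```typescript
const BASE = 'http://localhost:3000'; // wherever the backend is served

async function crawlAndReport(dispensaryId: number): Promise<void> {
  // Trigger a manual crawl for one dispensary...
  const crawl = await fetch(`${BASE}/api/dutchie-az/crawl/${dispensaryId}`, {
    method: 'POST',
  }).then((r) => r.json());
  console.log('crawl result:', crawl);

  // ...then read the dashboard statistics.
  const stats = await fetch(`${BASE}/api/dutchie-az/stats`).then((r) => r.json());
  console.log('dashboard stats:', stats);
}
```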

## Scheduler

The scheduler uses **jitter** to avoid detection patterns:

```typescript
// Each job has independent "wandering" timing
interface JobSchedule {
  base_interval_minutes: number;  // e.g., 240 (4 hours)
  jitter_minutes: number;         // e.g., 30 (±30 min)
  next_run_at: Date;              // Calculated with jitter after each run
}
```

Jobs run when `next_run_at <= NOW()`. After completion, the next run is calculated:

```
next_run_at = NOW() + base_interval + random(-jitter, +jitter)
```

This prevents crawls from clustering at predictable times.
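
A minimal sketch of that formula in code (the scheduler service may implement it differently): pick a uniform offset in `[-jitter, +jitter]` minutes and add it to the base interval.

```typescript
function computeNextRun(
  baseIntervalMinutes: number,
  jitterMinutes: number,
  now: Date = new Date()
): Date {
  const jitter = (Math.random() * 2 - 1) * jitterMinutes; // uniform in [-jitter, +jitter]
  const deltaMs = (baseIntervalMinutes + jitter) * 60_000;
  return new Date(now.getTime() + deltaMs);
}

// e.g. base 240 min ± 30 min => next run lands 3.5–4.5 hours from now
```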

## Manual Testing

### Run a single dispensary crawl:

```bash
DATABASE_URL="..." npx tsx -e "
const { crawlDispensaryProducts } = require('./src/dutchie-az/services/product-crawler');
const { query } = require('./src/dutchie-az/db/connection');

async function test() {
  const { rows } = await query('SELECT * FROM dispensaries LIMIT 1');
  if (!rows[0]) return console.log('No dispensaries found');

  const result = await crawlDispensaryProducts(rows[0], 'rec', { useBothModes: true });
  console.log(JSON.stringify(result, null, 2));
}
test();
"
```

### Check stock status distribution:

```sql
SELECT stock_status, COUNT(*)
FROM dutchie_products
GROUP BY stock_status;
```

### View recent snapshots:

```sql
SELECT
  p.name,
  s.stock_status,
  s.is_present_in_feed,
  s.crawled_at
FROM dutchie_product_snapshots s
JOIN dutchie_products p ON p.id = s.dutchie_product_id
ORDER BY s.crawled_at DESC
LIMIT 20;
```

## Deprecated Code

The following files are **DEPRECATED** and will throw errors if called:

- `src/scrapers/dutchie-graphql.ts` - Wrote to legacy `products` table
- `src/scrapers/dutchie-graphql-direct.ts` - Wrote to legacy `products` table
- `src/scrapers/templates/dutchie.ts` - HTML/DOM scraper (unreliable)
- `src/scraper-v2/engine.ts` DutchieSpider - DOM-based extraction

If `store-crawl-orchestrator.ts` detects `provider='dutchie'` with `mode='production'`, it now routes to this dutchie-az pipeline automatically.

## Integration with Legacy System

The `store-crawl-orchestrator.ts` bridges the legacy stores system with dutchie-az (a sketch of the routing check follows this list):

1. When a store has `product_provider='dutchie'` and `product_crawler_mode='production'`
2. The orchestrator looks up the corresponding dispensary in `dutchie_az.dispensaries`
3. It calls `crawlDispensaryProducts()` from the dutchie-az pipeline
4. Results are logged but data stays in the dutchie_az tables
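
A hedged sketch of that routing decision; the real `store-crawl-orchestrator.ts` has more cases, and `findDispensaryForStore` plus the `Store`/`Dispensary` shapes are hypothetical names for illustration only.

```typescript
interface Store {
  id: number;
  product_provider: string;
  product_crawler_mode: string;
}
interface Dispensary {
  id: number;
  platform_dispensary_id: string;
}

// Hypothetical helpers standing in for the real lookups/crawler imports.
declare function findDispensaryForStore(storeId: number): Promise<Dispensary | null>;
declare function crawlDispensaryProducts(
  d: Dispensary,
  pricing: 'rec' | 'med',
  opts: { useBothModes: boolean }
): Promise<unknown>;

async function routeStoreCrawl(store: Store): Promise<void> {
  if (store.product_provider === 'dutchie' && store.product_crawler_mode === 'production') {
    const dispensary = await findDispensaryForStore(store.id); // lookup in dutchie_az.dispensaries
    if (!dispensary) throw new Error('Dispensary not found in dutchie-az database');
    await crawlDispensaryProducts(dispensary, 'rec', { useBothModes: true });
    return; // data stays in dutchie_az tables; only the result is logged
  }
  // ...fall through to legacy crawler paths
}
```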

To use the dutchie-az pipeline independently:
- Navigate to `/dutchie-az-schedule` in the UI
- Use the REST API endpoints directly
- Run the scheduler service

## Environment Variables

```bash
# Database connection for dutchie-az (same DB, separate tables)
DATABASE_URL=postgresql://user:pass@host:port/database
```

## Troubleshooting

### "Dispensary not found in dutchie-az database"

The dispensary must exist in `dutchie_az.dispensaries` before crawling. Either:
1. Run discovery to populate dispensaries
2. Manually insert the dispensary with `platform_dispensary_id`

### GraphQL returns empty products

1. Check `platform_dispensary_id` is correct (the internal Dutchie ID, not the slug)
2. Verify the dispensary is online and has menu data
3. Try both `rec` and `med` pricing types

### Snapshots show `stock_status='unknown'`

The product likely has no `POSMetaData.children` array. This happens for:
- Products without inventory tracking
- Manually managed inventory

---

Last updated: December 2025
@@ -1,129 +0,0 @@
/**
 * Dutchie Configuration
 *
 * Centralized configuration for Dutchie GraphQL API interaction.
 * Update hashes here when Dutchie changes their persisted query system.
 */

export const dutchieConfig = {
  // ============================================================
  // GRAPHQL ENDPOINT
  // ============================================================

  /** GraphQL endpoint - must be the api-3 graphql endpoint (NOT api-gw.dutchie.com which no longer exists) */
  graphqlEndpoint: 'https://dutchie.com/api-3/graphql',

  // ============================================================
  // GRAPHQL PERSISTED QUERY HASHES
  // ============================================================
  //
  // These hashes identify specific GraphQL operations.
  // If Dutchie changes their schema, you may need to capture
  // new hashes from live browser traffic (Network tab → graphql requests).

  /** FilteredProducts - main product listing query */
  filteredProductsHash: 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0',

  /** GetAddressBasedDispensaryData - resolve slug to internal ID */
  getDispensaryDataHash: '13461f73abf7268770dfd05fe7e10c523084b2bb916a929c08efe3d87531977b',

  /**
   * ConsumerDispensaries - geo-based discovery
   * NOTE: This is a placeholder guess. If discovery fails, either:
   * 1. Capture the real hash from live traffic
   * 2. Rely on known AZDHS slugs instead (set useDiscovery: false)
   */
  consumerDispensariesHash: '0a5bfa6ca1d64ae47bcccb7c8077c87147cbc4e6982c17ceec97a2a4948b311b',

  // ============================================================
  // BEHAVIOR FLAGS
  // ============================================================

  /** Enable geo-based discovery (false = use known AZDHS slugs only) */
  useDiscovery: true,

  /** Prefer GET requests (true) or POST (false). GET is default. */
  preferGet: true,

  /**
   * Enable POST fallback when GET fails with 405 or blocked.
   * If true, will retry failed GETs as POSTs.
   */
  enablePostFallback: true,

  // ============================================================
  // PAGINATION & RETRY
  // ============================================================

  /** Products per page for pagination */
  perPage: 100,

  /** Maximum pages to fetch (safety limit) */
  maxPages: 200,

  /** Number of retries for failed page fetches */
  maxRetries: 1,

  /** Delay between pages in ms */
  pageDelayMs: 500,

  /** Delay between modes in ms */
  modeDelayMs: 2000,

  // ============================================================
  // HTTP HEADERS
  // ============================================================

  /** Default headers to mimic browser requests */
  defaultHeaders: {
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'en-US,en;q=0.9',
    'apollographql-client-name': 'Marketplace (production)',
  } as Record<string, string>,

  /** User agent string */
  userAgent:
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',

  // ============================================================
  // BROWSER LAUNCH OPTIONS
  // ============================================================

  browserArgs: [
    '--no-sandbox',
    '--disable-setuid-sandbox',
    '--disable-dev-shm-usage',
    '--disable-blink-features=AutomationControlled',
  ],

  /** Navigation timeout in ms */
  navigationTimeout: 60000,

  /** Initial page load delay in ms */
  pageLoadDelay: 2000,
};

/**
 * Get GraphQL hashes object for backward compatibility
 */
export const GRAPHQL_HASHES = {
  FilteredProducts: dutchieConfig.filteredProductsHash,
  GetAddressBasedDispensaryData: dutchieConfig.getDispensaryDataHash,
  ConsumerDispensaries: dutchieConfig.consumerDispensariesHash,
};
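
A hedged sketch of how a persisted-query GET against the endpoint above is typically assembled (Apollo-style persisted queries, where the sha256 hash stands in for the full query text). The module path and the variable shape are assumptions for illustration, not Dutchie's actual FilteredProducts schema.

```typescript
import { dutchieConfig, GRAPHQL_HASHES } from './dutchie-config'; // assumed module path

function buildPersistedQueryUrl(
  operationName: keyof typeof GRAPHQL_HASHES,
  variables: object
): string {
  const params = new URLSearchParams({
    operationName,
    variables: JSON.stringify(variables),
    extensions: JSON.stringify({
      persistedQuery: { version: 1, sha256Hash: GRAPHQL_HASHES[operationName] },
    }),
  });
  return `${dutchieConfig.graphqlEndpoint}?${params.toString()}`;
}

// Usage (illustrative variables only):
// const url = buildPersistedQueryUrl('FilteredProducts', { page: 0, perPage: dutchieConfig.perPage });
// const res = await fetch(url, {
//   headers: { ...dutchieConfig.defaultHeaders, 'user-agent': dutchieConfig.userAgent },
// });
```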

/**
 * Arizona geo centerpoints for discovery scans
 */
export const ARIZONA_CENTERPOINTS = [
  { name: 'Phoenix', lat: 33.4484, lng: -112.074 },
  { name: 'Tucson', lat: 32.2226, lng: -110.9747 },
  { name: 'Flagstaff', lat: 35.1983, lng: -111.6513 },
  { name: 'Mesa', lat: 33.4152, lng: -111.8315 },
  { name: 'Scottsdale', lat: 33.4942, lng: -111.9261 },
  { name: 'Tempe', lat: 33.4255, lng: -111.94 },
  { name: 'Yuma', lat: 32.6927, lng: -114.6277 },
  { name: 'Prescott', lat: 34.54, lng: -112.4685 },
  { name: 'Lake Havasu', lat: 34.4839, lng: -114.3224 },
  { name: 'Sierra Vista', lat: 31.5455, lng: -110.2773 },
];
@@ -1,131 +0,0 @@
/**
 * CannaiQ Database Connection
 *
 * All database access for the CannaiQ platform goes through this module.
 *
 * SINGLE DATABASE ARCHITECTURE:
 * - All services (auth, orchestrator, crawlers, admin) use this ONE database
 * - States are modeled via states table + state_id on dispensaries (not separate DBs)
 *
 * CONFIGURATION (in priority order):
 * 1. CANNAIQ_DB_URL - Full connection string (preferred)
 * 2. Individual vars: CANNAIQ_DB_HOST, CANNAIQ_DB_PORT, CANNAIQ_DB_NAME, CANNAIQ_DB_USER, CANNAIQ_DB_PASS
 * 3. DATABASE_URL - Legacy fallback for K8s compatibility
 *
 * IMPORTANT:
 * - Do NOT create separate pools elsewhere
 * - All services should import from this module
 */

import { Pool, PoolClient } from 'pg';

/**
 * Get the database connection string from environment variables.
 * Supports multiple configuration methods with fallback for legacy compatibility.
 */
function getConnectionString(): string {
  // Priority 1: Full CANNAIQ connection URL
  if (process.env.CANNAIQ_DB_URL) {
    return process.env.CANNAIQ_DB_URL;
  }

  // Priority 2: Build from individual CANNAIQ env vars
  const host = process.env.CANNAIQ_DB_HOST;
  const port = process.env.CANNAIQ_DB_PORT;
  const name = process.env.CANNAIQ_DB_NAME;
  const user = process.env.CANNAIQ_DB_USER;
  const pass = process.env.CANNAIQ_DB_PASS;

  if (host && port && name && user && pass) {
    return `postgresql://${user}:${pass}@${host}:${port}/${name}`;
  }

  // Priority 3: Fallback to DATABASE_URL for legacy/K8s compatibility
  if (process.env.DATABASE_URL) {
    return process.env.DATABASE_URL;
  }

  // Report what's missing
  const required = ['CANNAIQ_DB_HOST', 'CANNAIQ_DB_PORT', 'CANNAIQ_DB_NAME', 'CANNAIQ_DB_USER', 'CANNAIQ_DB_PASS'];
  const missing = required.filter((key) => !process.env[key]);

  throw new Error(
    `[CannaiQ DB] Missing database configuration.\n` +
    `Set CANNAIQ_DB_URL, DATABASE_URL, or all of: ${missing.join(', ')}`
  );
}

let pool: Pool | null = null;

/**
 * Get the CannaiQ database pool (singleton)
 *
 * This is the canonical pool for all CannaiQ services.
 * Do NOT create separate pools elsewhere.
 */
export function getPool(): Pool {
  if (!pool) {
    pool = new Pool({
      connectionString: getConnectionString(),
      max: 10,
      idleTimeoutMillis: 30000,
      connectionTimeoutMillis: 5000,
    });

    pool.on('error', (err) => {
      console.error('[CannaiQ DB] Unexpected error on idle client:', err);
    });

    console.log('[CannaiQ DB] Pool initialized');
  }
  return pool;
}

/**
 * @deprecated Use getPool() instead
 */
export function getDutchieAZPool(): Pool {
  console.warn('[CannaiQ DB] getDutchieAZPool() is deprecated. Use getPool() instead.');
  return getPool();
}

/**
 * Execute a query on the CannaiQ database
 */
export async function query<T = any>(text: string, params?: any[]): Promise<{ rows: T[]; rowCount: number }> {
  const p = getPool();
  const result = await p.query(text, params);
  return { rows: result.rows as T[], rowCount: result.rowCount || 0 };
}

/**
 * Get a client from the pool for transaction use
 */
export async function getClient(): Promise<PoolClient> {
  const p = getPool();
  return p.connect();
}

/**
 * Close the pool connection
 */
export async function closePool(): Promise<void> {
  if (pool) {
    await pool.end();
    pool = null;
    console.log('[CannaiQ DB] Pool closed');
  }
}

/**
 * Check if the database is accessible
 */
export async function healthCheck(): Promise<boolean> {
  try {
    const result = await query('SELECT 1 as ok');
    return result.rows.length > 0 && result.rows[0].ok === 1;
  } catch (error) {
    console.error('[CannaiQ DB] Health check failed:', error);
    return false;
  }
}
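
A minimal usage sketch for this module, assuming the relative import path; the table and column names are illustrative. `getClient()` hands out a dedicated client so BEGIN/COMMIT happen on one connection rather than being spread across the pool.

```typescript
import { getClient, healthCheck } from './connection'; // assumed relative path

async function renameDispensary(id: number, name: string): Promise<void> {
  if (!(await healthCheck())) throw new Error('database unreachable');

  const client = await getClient();
  try {
    await client.query('BEGIN');
    await client.query(
      'UPDATE dispensaries SET name = $1, updated_at = NOW() WHERE id = $2',
      [name, id]
    );
    await client.query('COMMIT');
  } catch (err) {
    await client.query('ROLLBACK');
    throw err;
  } finally {
    client.release(); // always return the client to the pool
  }
}
```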
@@ -1,137 +0,0 @@
/**
 * Dispensary Column Definitions
 *
 * Centralized column list for dispensaries table queries.
 * Handles optional columns that may not exist in all environments.
 *
 * USAGE:
 *   import { DISPENSARY_COLUMNS, DISPENSARY_COLUMNS_WITH_FAILED } from '../db/dispensary-columns';
 *   const result = await query(`SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE ...`);
 */

/**
 * Core dispensary columns that always exist.
 * These are guaranteed to be present in all environments.
 */
const CORE_COLUMNS = `
  id, name, slug, city, state, zip, address, latitude, longitude,
  menu_type, menu_url, platform_dispensary_id, website,
  created_at, updated_at
`;

/**
 * Optional columns with NULL fallback.
 *
 * provider_detection_data: Added in migration 044
 * active_crawler_profile_id: Added in migration 041
 *
 * Selecting the real column would make the query fail on databases where
 * migration 044 has not been applied, so for pre-migration compatibility we
 * select NULL::jsonb, which always works. After migration 044 is applied,
 * this can be changed to the real column.
 */

// TEMPORARY: Use NULL fallback until migration 044 is applied
// After running 044, change this to: provider_detection_data
const PROVIDER_DETECTION_COLUMN = `NULL::jsonb AS provider_detection_data`;

// After migration 044 is applied, uncomment this line and remove the above:
// const PROVIDER_DETECTION_COLUMN = `provider_detection_data`;

/**
 * Standard dispensary columns for most queries.
 * Includes provider_detection_data with NULL fallback for pre-migration compatibility.
 */
export const DISPENSARY_COLUMNS = `${CORE_COLUMNS.trim()},
  ${PROVIDER_DETECTION_COLUMN}`;

/**
 * Dispensary columns including active_crawler_profile_id.
 * Used by routes that need profile information.
 */
export const DISPENSARY_COLUMNS_WITH_PROFILE = `${CORE_COLUMNS.trim()},
  ${PROVIDER_DETECTION_COLUMN},
  active_crawler_profile_id`;

/**
 * Dispensary columns including failed_at.
 * Used by worker for compatibility checks.
 */
export const DISPENSARY_COLUMNS_WITH_FAILED = `${CORE_COLUMNS.trim()},
  ${PROVIDER_DETECTION_COLUMN},
  failed_at`;

/**
 * NOTE: After migration 044 is applied, update PROVIDER_DETECTION_COLUMN above
 * to use the real column instead of the NULL fallback.
 *
 * To verify migration status:
 *   SELECT column_name FROM information_schema.columns
 *   WHERE table_name = 'dispensaries' AND column_name = 'provider_detection_data';
 */

// Cache for column existence check
let _providerDetectionColumnExists: boolean | null = null;

/**
 * Check if provider_detection_data column exists in dispensaries table.
 * Result is cached after first check.
 */
export async function hasProviderDetectionColumn(pool: { query: (sql: string) => Promise<{ rows: any[] }> }): Promise<boolean> {
  if (_providerDetectionColumnExists !== null) {
    return _providerDetectionColumnExists;
  }

  try {
    const result = await pool.query(`
      SELECT 1 FROM information_schema.columns
      WHERE table_name = 'dispensaries' AND column_name = 'provider_detection_data'
    `);
    _providerDetectionColumnExists = result.rows.length > 0;
  } catch {
    _providerDetectionColumnExists = false;
  }

  return _providerDetectionColumnExists;
}

/**
 * Safely update provider_detection_data column.
 * If column doesn't exist, logs a warning but doesn't crash.
 *
 * @param pool - Database pool with query method
 * @param dispensaryId - ID of dispensary to update
 * @param data - JSONB data to merge into provider_detection_data
 * @returns true if update succeeded, false if column doesn't exist
 */
export async function safeUpdateProviderDetectionData(
  pool: { query: (sql: string, params?: any[]) => Promise<any> },
  dispensaryId: number,
  data: Record<string, any>
): Promise<boolean> {
  const hasColumn = await hasProviderDetectionColumn(pool);

  if (!hasColumn) {
    console.warn(`[DispensaryColumns] provider_detection_data column not found. Run migration 044 to add it.`);
    return false;
  }

  try {
    await pool.query(
      `UPDATE dispensaries
       SET provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) || $1::jsonb,
           updated_at = NOW()
       WHERE id = $2`,
      [JSON.stringify(data), dispensaryId]
    );
    return true;
  } catch (error: any) {
    if (error.message?.includes('provider_detection_data')) {
      console.warn(`[DispensaryColumns] Failed to update provider_detection_data: ${error.message}`);
      return false;
    }
    throw error;
  }
}
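
A hedged usage sketch combining these helpers with the shared `query()`/`getPool()` from connection.ts (import paths assumed from the USAGE comment above). The column constant expands into the SELECT list; `safeUpdateProviderDetectionData` merges JSONB without crashing on pre-migration databases. The payload fields are illustrative.

```typescript
import { query, getPool } from '../db/connection';
import { DISPENSARY_COLUMNS, safeUpdateProviderDetectionData } from '../db/dispensary-columns';

async function tagDetectedProvider(dispensaryId: number): Promise<void> {
  const { rows } = await query(
    `SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE id = $1`,
    [dispensaryId]
  );
  console.log('current row:', rows[0]);

  // Merges the payload into provider_detection_data, or returns false
  // (with a warning) if migration 044 has not been applied yet.
  const ok = await safeUpdateProviderDetectionData(getPool(), dispensaryId, {
    detected: 'dutchie',
    detectedAt: new Date().toISOString(),
  });
  if (!ok) console.warn('provider_detection_data column missing; update skipped');
}
```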
@@ -1,29 +0,0 @@
/**
 * Dutchie AZ Schema Bootstrap
 *
 * Run this to create/update the dutchie_az tables (dutchie_products, dutchie_product_snapshots, etc.)
 * in the AZ pipeline database. This is separate from the legacy schema.
 *
 * Usage:
 *   TS_NODE_TRANSPILE_ONLY=1 npx ts-node src/dutchie-az/db/migrate.ts
 *   or (after build)
 *   node dist/dutchie-az/db/migrate.js
 */

import { createSchema } from './schema';
import { closePool } from './connection';

async function main() {
  try {
    console.log('[DutchieAZ] Running schema migration...');
    await createSchema();
    console.log('[DutchieAZ] Schema migration complete.');
  } catch (err: any) {
    console.error('[DutchieAZ] Schema migration failed:', err.message);
    process.exitCode = 1;
  } finally {
    await closePool();
  }
}

main();
@@ -1,408 +0,0 @@
/**
 * Dutchie AZ Database Schema
 *
 * Creates all tables for the isolated Dutchie Arizona data pipeline.
 * Run this to initialize the dutchie_az database.
 */

import { query, getClient } from './connection';

/**
 * SQL statements to create all tables
 */
const SCHEMA_SQL = `
-- ============================================================
-- DISPENSARIES TABLE
-- Stores discovered Dutchie dispensaries in Arizona
-- ============================================================
CREATE TABLE IF NOT EXISTS dispensaries (
  id SERIAL PRIMARY KEY,
  platform VARCHAR(20) NOT NULL DEFAULT 'dutchie',
  name VARCHAR(255) NOT NULL,
  slug VARCHAR(255) NOT NULL,
  city VARCHAR(100) NOT NULL,
  state VARCHAR(10) NOT NULL DEFAULT 'AZ',
  postal_code VARCHAR(20),
  address TEXT,
  latitude DECIMAL(10, 7),
  longitude DECIMAL(10, 7),
  platform_dispensary_id VARCHAR(100),
  is_delivery BOOLEAN DEFAULT false,
  is_pickup BOOLEAN DEFAULT true,
  raw_metadata JSONB,
  last_crawled_at TIMESTAMPTZ,
  product_count INTEGER DEFAULT 0,
  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW(),

  CONSTRAINT uk_dispensaries_platform_slug UNIQUE (platform, slug, city, state)
);

CREATE INDEX IF NOT EXISTS idx_dispensaries_platform ON dispensaries(platform);
CREATE INDEX IF NOT EXISTS idx_dispensaries_platform_id ON dispensaries(platform_dispensary_id);
CREATE INDEX IF NOT EXISTS idx_dispensaries_state ON dispensaries(state);
CREATE INDEX IF NOT EXISTS idx_dispensaries_city ON dispensaries(city);

-- ============================================================
-- DUTCHIE_PRODUCTS TABLE
-- Canonical product identity per store
-- ============================================================
CREATE TABLE IF NOT EXISTS dutchie_products (
  id SERIAL PRIMARY KEY,
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
  platform VARCHAR(20) NOT NULL DEFAULT 'dutchie',

  external_product_id VARCHAR(100) NOT NULL,
  platform_dispensary_id VARCHAR(100) NOT NULL,
  c_name VARCHAR(500),
  name VARCHAR(500) NOT NULL,

  -- Brand
  brand_name VARCHAR(255),
  brand_id VARCHAR(100),
  brand_logo_url TEXT,

  -- Classification
  type VARCHAR(100),
  subcategory VARCHAR(100),
  strain_type VARCHAR(50),
  provider VARCHAR(100),

  -- Potency
  thc DECIMAL(10, 4),
  thc_content DECIMAL(10, 4),
  cbd DECIMAL(10, 4),
  cbd_content DECIMAL(10, 4),
  cannabinoids_v2 JSONB,
  effects JSONB,

  -- Status / flags
  status VARCHAR(50),
  medical_only BOOLEAN DEFAULT false,
  rec_only BOOLEAN DEFAULT false,
  featured BOOLEAN DEFAULT false,
  coming_soon BOOLEAN DEFAULT false,
  certificate_of_analysis_enabled BOOLEAN DEFAULT false,

  is_below_threshold BOOLEAN DEFAULT false,
  is_below_kiosk_threshold BOOLEAN DEFAULT false,
  options_below_threshold BOOLEAN DEFAULT false,
  options_below_kiosk_threshold BOOLEAN DEFAULT false,

  -- Derived stock status: 'in_stock', 'out_of_stock', 'unknown'
  stock_status VARCHAR(20) DEFAULT 'unknown',
  total_quantity_available INTEGER DEFAULT 0,

  -- Images
  primary_image_url TEXT,
  images JSONB,

  -- Misc
  measurements JSONB,
  weight VARCHAR(50),
  past_c_names TEXT[],

  created_at_dutchie TIMESTAMPTZ,
  updated_at_dutchie TIMESTAMPTZ,

  latest_raw_payload JSONB,

  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW(),

  CONSTRAINT uk_dutchie_products UNIQUE (dispensary_id, external_product_id)
);

CREATE INDEX IF NOT EXISTS idx_dutchie_products_dispensary ON dutchie_products(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_dutchie_products_external_id ON dutchie_products(external_product_id);
CREATE INDEX IF NOT EXISTS idx_dutchie_products_platform_disp ON dutchie_products(platform_dispensary_id);
CREATE INDEX IF NOT EXISTS idx_dutchie_products_brand ON dutchie_products(brand_name);
CREATE INDEX IF NOT EXISTS idx_dutchie_products_type ON dutchie_products(type);
CREATE INDEX IF NOT EXISTS idx_dutchie_products_subcategory ON dutchie_products(subcategory);
CREATE INDEX IF NOT EXISTS idx_dutchie_products_status ON dutchie_products(status);
CREATE INDEX IF NOT EXISTS idx_dutchie_products_strain ON dutchie_products(strain_type);
CREATE INDEX IF NOT EXISTS idx_dutchie_products_stock_status ON dutchie_products(stock_status);

-- ============================================================
-- DUTCHIE_PRODUCT_SNAPSHOTS TABLE
-- Historical state per crawl, includes options[]
-- ============================================================
CREATE TABLE IF NOT EXISTS dutchie_product_snapshots (
  id SERIAL PRIMARY KEY,
  dutchie_product_id INTEGER NOT NULL REFERENCES dutchie_products(id) ON DELETE CASCADE,
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
  platform_dispensary_id VARCHAR(100) NOT NULL,
  external_product_id VARCHAR(100) NOT NULL,
  pricing_type VARCHAR(20) DEFAULT 'unknown',
  crawl_mode VARCHAR(20) DEFAULT 'mode_a', -- 'mode_a' (UI parity) or 'mode_b' (max coverage)

  status VARCHAR(50),
  featured BOOLEAN DEFAULT false,
  special BOOLEAN DEFAULT false,
  medical_only BOOLEAN DEFAULT false,
  rec_only BOOLEAN DEFAULT false,

  -- Flag indicating if product was present in feed (false = missing_from_feed snapshot)
  is_present_in_feed BOOLEAN DEFAULT true,

  -- Derived stock status
  stock_status VARCHAR(20) DEFAULT 'unknown',

  -- Price summary (in cents)
  rec_min_price_cents INTEGER,
  rec_max_price_cents INTEGER,
  rec_min_special_price_cents INTEGER,
  med_min_price_cents INTEGER,
  med_max_price_cents INTEGER,
  med_min_special_price_cents INTEGER,
  wholesale_min_price_cents INTEGER,

  -- Inventory summary
  total_quantity_available INTEGER,
  total_kiosk_quantity_available INTEGER,
  manual_inventory BOOLEAN DEFAULT false,
  is_below_threshold BOOLEAN DEFAULT false,
  is_below_kiosk_threshold BOOLEAN DEFAULT false,

  -- Option-level data (from POSMetaData.children)
  options JSONB,

  -- Full raw product node
  raw_payload JSONB NOT NULL,

  crawled_at TIMESTAMPTZ NOT NULL,
  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_snapshots_product ON dutchie_product_snapshots(dutchie_product_id);
CREATE INDEX IF NOT EXISTS idx_snapshots_dispensary ON dutchie_product_snapshots(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_snapshots_crawled_at ON dutchie_product_snapshots(crawled_at);
CREATE INDEX IF NOT EXISTS idx_snapshots_platform_disp ON dutchie_product_snapshots(platform_dispensary_id);
CREATE INDEX IF NOT EXISTS idx_snapshots_external_id ON dutchie_product_snapshots(external_product_id);
CREATE INDEX IF NOT EXISTS idx_snapshots_special ON dutchie_product_snapshots(special) WHERE special = true;
CREATE INDEX IF NOT EXISTS idx_snapshots_stock_status ON dutchie_product_snapshots(stock_status);
CREATE INDEX IF NOT EXISTS idx_snapshots_crawl_mode ON dutchie_product_snapshots(crawl_mode);

-- ============================================================
-- CRAWL_JOBS TABLE
-- Tracks crawl execution status
-- ============================================================
CREATE TABLE IF NOT EXISTS crawl_jobs (
  id SERIAL PRIMARY KEY,
  job_type VARCHAR(50) NOT NULL,
  dispensary_id INTEGER REFERENCES dispensaries(id) ON DELETE SET NULL,
  status VARCHAR(20) NOT NULL DEFAULT 'pending',
  started_at TIMESTAMPTZ,
  completed_at TIMESTAMPTZ,
  error_message TEXT,
  products_found INTEGER,
  snapshots_created INTEGER,
  metadata JSONB,
  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_crawl_jobs_type ON crawl_jobs(job_type);
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_status ON crawl_jobs(status);
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_dispensary ON crawl_jobs(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_created ON crawl_jobs(created_at);

-- ============================================================
-- JOB_SCHEDULES TABLE
-- Stores schedule configuration for recurring jobs with jitter support
-- Each job has independent timing that "wanders" over time
-- ============================================================
CREATE TABLE IF NOT EXISTS job_schedules (
  id SERIAL PRIMARY KEY,
  job_name VARCHAR(100) NOT NULL UNIQUE,
  description TEXT,
  enabled BOOLEAN DEFAULT true,

  -- Timing configuration (jitter makes times "wander")
  base_interval_minutes INTEGER NOT NULL DEFAULT 240, -- e.g., 4 hours
  jitter_minutes INTEGER NOT NULL DEFAULT 30,         -- e.g., ±30 min

  -- Last run tracking
  last_run_at TIMESTAMPTZ,
  last_status VARCHAR(20), -- 'success', 'error', 'partial', 'running'
  last_error_message TEXT,
  last_duration_ms INTEGER,

  -- Next run (calculated with jitter after each run)
  next_run_at TIMESTAMPTZ,

  -- Additional config
  job_config JSONB, -- e.g., { pricingType: 'rec', useBothModes: true }

  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_job_schedules_enabled ON job_schedules(enabled);
CREATE INDEX IF NOT EXISTS idx_job_schedules_next_run ON job_schedules(next_run_at);

-- ============================================================
-- JOB_RUN_LOGS TABLE
-- Stores history of job runs for monitoring
-- ============================================================
CREATE TABLE IF NOT EXISTS job_run_logs (
  id SERIAL PRIMARY KEY,
  schedule_id INTEGER NOT NULL REFERENCES job_schedules(id) ON DELETE CASCADE,
  job_name VARCHAR(100) NOT NULL,
  status VARCHAR(20) NOT NULL, -- 'pending', 'running', 'success', 'error', 'partial'
  started_at TIMESTAMPTZ,
  completed_at TIMESTAMPTZ,
  duration_ms INTEGER,
  error_message TEXT,

  -- Results summary
  items_processed INTEGER,
  items_succeeded INTEGER,
  items_failed INTEGER,

  metadata JSONB, -- Additional run details

  created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_job_run_logs_schedule ON job_run_logs(schedule_id);
CREATE INDEX IF NOT EXISTS idx_job_run_logs_job_name ON job_run_logs(job_name);
CREATE INDEX IF NOT EXISTS idx_job_run_logs_status ON job_run_logs(status);
CREATE INDEX IF NOT EXISTS idx_job_run_logs_created ON job_run_logs(created_at);

-- ============================================================
-- VIEWS FOR EASY QUERYING
-- ============================================================

-- Categories derived from products
CREATE OR REPLACE VIEW v_categories AS
SELECT
  type,
  subcategory,
  COUNT(DISTINCT id) as product_count,
  COUNT(DISTINCT dispensary_id) as dispensary_count,
  AVG(thc) as avg_thc,
  MIN(thc) as min_thc,
  MAX(thc) as max_thc
FROM dutchie_products
WHERE type IS NOT NULL
GROUP BY type, subcategory
ORDER BY type, subcategory;

-- Brands derived from products
CREATE OR REPLACE VIEW v_brands AS
SELECT
  brand_name,
  brand_id,
  MAX(brand_logo_url) as brand_logo_url,
  COUNT(DISTINCT id) as product_count,
  COUNT(DISTINCT dispensary_id) as dispensary_count,
  ARRAY_AGG(DISTINCT type) FILTER (WHERE type IS NOT NULL) as product_types
FROM dutchie_products
WHERE brand_name IS NOT NULL
GROUP BY brand_name, brand_id
ORDER BY product_count DESC;

-- Latest snapshot per product (most recent crawl data)
CREATE OR REPLACE VIEW v_latest_snapshots AS
SELECT DISTINCT ON (dutchie_product_id)
  s.*
FROM dutchie_product_snapshots s
ORDER BY dutchie_product_id, crawled_at DESC;

-- Dashboard stats
CREATE OR REPLACE VIEW v_dashboard_stats AS
SELECT
  (SELECT COUNT(*) FROM dispensaries WHERE state = 'AZ') as dispensary_count,
  (SELECT COUNT(*) FROM dutchie_products) as product_count,
  (SELECT COUNT(*) FROM dutchie_product_snapshots WHERE crawled_at > NOW() - INTERVAL '24 hours') as snapshots_24h,
  (SELECT MAX(crawled_at) FROM dutchie_product_snapshots) as last_crawl_time,
  (SELECT COUNT(*) FROM crawl_jobs WHERE status = 'failed' AND created_at > NOW() - INTERVAL '24 hours') as failed_jobs_24h,
  (SELECT COUNT(DISTINCT brand_name) FROM dutchie_products WHERE brand_name IS NOT NULL) as brand_count,
  (SELECT COUNT(DISTINCT (type, subcategory)) FROM dutchie_products WHERE type IS NOT NULL) as category_count;
`;

/**
 * Run the schema migration
 */
export async function createSchema(): Promise<void> {
  console.log('[DutchieAZ Schema] Creating database schema...');

  const client = await getClient();

  try {
    await client.query('BEGIN');

    // Split into individual statements and execute
    const statements = SCHEMA_SQL
      .split(';')
      .map(s => s.trim())
      .filter(s => s.length > 0 && !s.startsWith('--'));

    for (const statement of statements) {
      if (statement.trim()) {
        await client.query(statement + ';');
      }
    }

    await client.query('COMMIT');
    console.log('[DutchieAZ Schema] Schema created successfully');
  } catch (error) {
    await client.query('ROLLBACK');
    console.error('[DutchieAZ Schema] Failed to create schema:', error);
    throw error;
  } finally {
    client.release();
  }
}

/**
 * Drop all tables (for development/testing)
 */
export async function dropSchema(): Promise<void> {
  console.log('[DutchieAZ Schema] Dropping all tables...');

  await query(`
    DROP VIEW IF EXISTS v_dashboard_stats CASCADE;
    DROP VIEW IF EXISTS v_latest_snapshots CASCADE;
    DROP VIEW IF EXISTS v_brands CASCADE;
    DROP VIEW IF EXISTS v_categories CASCADE;
    DROP TABLE IF EXISTS crawl_schedule CASCADE;
    DROP TABLE IF EXISTS crawl_jobs CASCADE;
    DROP TABLE IF EXISTS dutchie_product_snapshots CASCADE;
    DROP TABLE IF EXISTS dutchie_products CASCADE;
    DROP TABLE IF EXISTS dispensaries CASCADE;
  `);

  console.log('[DutchieAZ Schema] All tables dropped');
}

/**
 * Check if schema exists
 */
export async function schemaExists(): Promise<boolean> {
  try {
    const result = await query(`
      SELECT EXISTS (
        SELECT FROM information_schema.tables
        WHERE table_name = 'dispensaries'
      ) as exists
    `);
    return result.rows[0]?.exists === true;
  } catch (error) {
    return false;
  }
}

/**
 * Initialize schema if it doesn't exist
 */
export async function ensureSchema(): Promise<void> {
  const exists = await schemaExists();
  if (!exists) {
    await createSchema();
  } else {
    console.log('[DutchieAZ Schema] Schema already exists');
  }
}
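
A small sketch of reading the convenience views above through the shared `query()` helper from connection.ts (import path assumed). Note that pg returns bigint COUNT(*) values as strings, which the illustrative interface below reflects.

```typescript
import { query } from './connection';

interface DashboardStats {
  dispensary_count: string;   // bigint counts arrive as strings from pg
  product_count: string;
  snapshots_24h: string;
  last_crawl_time: Date | null;
  failed_jobs_24h: string;
  brand_count: string;
  category_count: string;
}

async function printDashboard(): Promise<void> {
  const stats = await query<DashboardStats>('SELECT * FROM v_dashboard_stats');
  console.log(stats.rows[0]);

  // Latest snapshot per product for one store: DISTINCT ON in the view keeps
  // only the newest crawled_at row per dutchie_product_id.
  const latest = await query(
    'SELECT * FROM v_latest_snapshots WHERE dispensary_id = $1 LIMIT 10',
    [1]
  );
  console.log(`${latest.rowCount} latest snapshots`);
}
```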
@@ -1,403 +0,0 @@
|
||||
/**
|
||||
* DtCityDiscoveryService
|
||||
*
|
||||
* Core service for Dutchie city discovery.
|
||||
* Contains shared logic used by multiple entrypoints.
|
||||
*
|
||||
* Responsibilities:
|
||||
* - Browser/API-based city fetching
|
||||
* - Manual city seeding
|
||||
* - City upsert operations
|
||||
*/
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import axios from 'axios';
|
||||
import puppeteer from 'puppeteer-extra';
|
||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||
|
||||
puppeteer.use(StealthPlugin());
|
||||
|
||||
// ============================================================
|
||||
// TYPES
|
||||
// ============================================================
|
||||
|
||||
export interface DutchieCity {
|
||||
name: string;
|
||||
slug: string;
|
||||
stateCode: string | null;
|
||||
countryCode: string;
|
||||
url?: string;
|
||||
}
|
||||
|
||||
export interface CityDiscoveryResult {
|
||||
citiesFound: number;
|
||||
citiesInserted: number;
|
||||
citiesUpdated: number;
|
||||
errors: string[];
|
||||
durationMs: number;
|
||||
}
|
||||
|
||||
export interface ManualSeedResult {
|
||||
city: DutchieCity;
|
||||
id: number;
|
||||
wasInserted: boolean;
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// US STATE CODE MAPPING
|
||||
// ============================================================
|
||||
|
||||
export const US_STATE_MAP: Record<string, string> = {
|
||||
'alabama': 'AL', 'alaska': 'AK', 'arizona': 'AZ', 'arkansas': 'AR',
|
||||
'california': 'CA', 'colorado': 'CO', 'connecticut': 'CT', 'delaware': 'DE',
|
||||
'florida': 'FL', 'georgia': 'GA', 'hawaii': 'HI', 'idaho': 'ID',
|
||||
'illinois': 'IL', 'indiana': 'IN', 'iowa': 'IA', 'kansas': 'KS',
|
||||
'kentucky': 'KY', 'louisiana': 'LA', 'maine': 'ME', 'maryland': 'MD',
|
||||
'massachusetts': 'MA', 'michigan': 'MI', 'minnesota': 'MN', 'mississippi': 'MS',
|
||||
'missouri': 'MO', 'montana': 'MT', 'nebraska': 'NE', 'nevada': 'NV',
|
||||
'new-hampshire': 'NH', 'new-jersey': 'NJ', 'new-mexico': 'NM', 'new-york': 'NY',
|
||||
'north-carolina': 'NC', 'north-dakota': 'ND', 'ohio': 'OH', 'oklahoma': 'OK',
|
||||
'oregon': 'OR', 'pennsylvania': 'PA', 'rhode-island': 'RI', 'south-carolina': 'SC',
|
||||
'south-dakota': 'SD', 'tennessee': 'TN', 'texas': 'TX', 'utah': 'UT',
|
||||
'vermont': 'VT', 'virginia': 'VA', 'washington': 'WA', 'west-virginia': 'WV',
|
||||
'wisconsin': 'WI', 'wyoming': 'WY', 'district-of-columbia': 'DC',
|
||||
};
|
||||
|
||||
// Canadian province mapping
|
||||
export const CA_PROVINCE_MAP: Record<string, string> = {
|
||||
'alberta': 'AB', 'british-columbia': 'BC', 'manitoba': 'MB',
|
||||
'new-brunswick': 'NB', 'newfoundland-and-labrador': 'NL',
|
||||
'northwest-territories': 'NT', 'nova-scotia': 'NS', 'nunavut': 'NU',
|
||||
'ontario': 'ON', 'prince-edward-island': 'PE', 'quebec': 'QC',
|
||||
'saskatchewan': 'SK', 'yukon': 'YT',
|
||||
};
|
||||
|
||||
// ============================================================
|
||||
// CITY FETCHING (AUTO DISCOVERY)
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* Fetch cities from Dutchie's /cities page using Puppeteer.
|
||||
*/
|
||||
export async function fetchCitiesFromBrowser(): Promise<DutchieCity[]> {
|
||||
console.log('[DtCityDiscoveryService] Launching browser to fetch cities...');
|
||||
|
||||
const browser = await puppeteer.launch({
|
||||
headless: 'new',
|
||||
args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
|
||||
});
|
||||
|
||||
try {
|
||||
const page = await browser.newPage();
|
||||
await page.setUserAgent(
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
|
||||
);
|
||||
|
||||
console.log('[DtCityDiscoveryService] Navigating to https://dutchie.com/cities...');
|
||||
await page.goto('https://dutchie.com/cities', {
|
||||
waitUntil: 'networkidle2',
|
||||
timeout: 60000,
|
||||
});
|
||||
|
||||
await new Promise((r) => setTimeout(r, 3000));
|
||||
|
||||
const cities = await page.evaluate(() => {
|
||||
const cityLinks: Array<{
|
||||
name: string;
|
||||
slug: string;
|
||||
url: string;
|
||||
stateSlug: string | null;
|
||||
}> = [];
|
||||
|
||||
const links = document.querySelectorAll('a[href*="/city/"]');
|
||||
links.forEach((link) => {
|
||||
const href = (link as HTMLAnchorElement).href;
|
||||
const text = (link as HTMLElement).innerText?.trim();
|
||||
|
||||
const match = href.match(/\/city\/([^/]+)\/([^/?]+)/);
|
||||
if (match && text) {
|
||||
cityLinks.push({
|
||||
name: text,
|
||||
slug: match[2],
|
||||
url: href,
|
||||
stateSlug: match[1],
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
return cityLinks;
|
||||
});
|
||||
|
||||
console.log(`[DtCityDiscoveryService] Extracted ${cities.length} city links from page`);
|
||||
|
||||
return cities.map((city) => {
|
||||
let countryCode = 'US';
|
||||
let stateCode: string | null = null;
|
||||
|
||||
if (city.stateSlug) {
|
||||
if (US_STATE_MAP[city.stateSlug]) {
|
||||
stateCode = US_STATE_MAP[city.stateSlug];
|
||||
countryCode = 'US';
|
||||
} else if (CA_PROVINCE_MAP[city.stateSlug]) {
|
||||
stateCode = CA_PROVINCE_MAP[city.stateSlug];
|
||||
countryCode = 'CA';
|
||||
} else if (city.stateSlug.length === 2) {
|
||||
stateCode = city.stateSlug.toUpperCase();
|
||||
if (Object.values(CA_PROVINCE_MAP).includes(stateCode)) {
|
||||
countryCode = 'CA';
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
name: city.name,
|
||||
slug: city.slug,
|
||||
stateCode,
|
||||
countryCode,
|
||||
url: city.url,
|
||||
};
|
||||
});
|
||||
} finally {
|
||||
await browser.close();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch cities via API endpoints (fallback).
|
||||
*/
|
||||
export async function fetchCitiesFromAPI(): Promise<DutchieCity[]> {
|
||||
console.log('[DtCityDiscoveryService] Attempting API-based city discovery...');
|
||||
|
||||
const apiEndpoints = [
|
||||
'https://dutchie.com/api/cities',
|
||||
'https://api.dutchie.com/v1/cities',
|
||||
];
|
||||
|
||||
for (const endpoint of apiEndpoints) {
|
||||
try {
|
||||
const response = await axios.get(endpoint, {
|
||||
headers: {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0',
|
||||
Accept: 'application/json',
|
||||
},
|
||||
timeout: 15000,
|
||||
});
|
||||
|
||||
if (response.data && Array.isArray(response.data)) {
|
||||
console.log(`[DtCityDiscoveryService] API returned ${response.data.length} cities`);
|
||||
return response.data.map((c: any) => ({
|
||||
name: c.name || c.city,
|
||||
slug: c.slug || c.citySlug,
|
||||
stateCode: c.stateCode || c.state,
|
||||
countryCode: c.countryCode || c.country || 'US',
|
||||
}));
|
||||
}
|
||||
} catch (error: any) {
|
||||
console.log(`[DtCityDiscoveryService] API ${endpoint} failed: ${error.message}`);
|
||||
}
|
||||
}
|
||||
|
||||
return [];
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// DATABASE OPERATIONS
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* Upsert a city into dutchie_discovery_cities
|
||||
*/
|
||||
export async function upsertCity(
|
||||
pool: Pool,
|
||||
city: DutchieCity
|
||||
): Promise<{ id: number; inserted: boolean; updated: boolean }> {
|
||||
const result = await pool.query(
|
||||
`
|
||||
INSERT INTO dutchie_discovery_cities (
|
||||
platform,
|
||||
city_name,
|
||||
city_slug,
|
||||
state_code,
|
||||
country_code,
|
||||
crawl_enabled,
|
||||
created_at,
|
||||
updated_at
|
||||
) VALUES (
|
||||
'dutchie',
|
||||
$1,
|
||||
$2,
|
||||
$3,
|
||||
$4,
|
||||
TRUE,
|
||||
NOW(),
|
||||
NOW()
|
||||
)
|
||||
ON CONFLICT (platform, country_code, state_code, city_slug)
|
||||
DO UPDATE SET
|
||||
city_name = EXCLUDED.city_name,
|
||||
crawl_enabled = TRUE,
|
||||
updated_at = NOW()
|
||||
RETURNING id, (xmax = 0) AS inserted
|
||||
`,
|
||||
[city.name, city.slug, city.stateCode, city.countryCode]
|
||||
);
|
||||
|
||||
const inserted = result.rows[0]?.inserted === true;
|
||||
return {
|
||||
id: result.rows[0]?.id,
|
||||
inserted,
|
||||
updated: !inserted,
|
||||
};
|
||||
}
|

// ============================================================
// MAIN SERVICE CLASS
// ============================================================

export class DtCityDiscoveryService {
  constructor(private pool: Pool) {}

  /**
   * Run auto-discovery (browser + API fallback)
   */
  async runAutoDiscovery(): Promise<CityDiscoveryResult> {
    const startTime = Date.now();
    const errors: string[] = [];
    let citiesFound = 0;
    let citiesInserted = 0;
    let citiesUpdated = 0;

    console.log('[DtCityDiscoveryService] Starting auto city discovery...');

    try {
      let cities = await fetchCitiesFromBrowser();

      if (cities.length === 0) {
        console.log('[DtCityDiscoveryService] Browser returned 0 cities, trying API...');
        cities = await fetchCitiesFromAPI();
      }

      citiesFound = cities.length;
      console.log(`[DtCityDiscoveryService] Found ${citiesFound} cities`);

      for (const city of cities) {
        try {
          const result = await upsertCity(this.pool, city);
          if (result.inserted) citiesInserted++;
          else if (result.updated) citiesUpdated++;
        } catch (error: any) {
          const msg = `Failed to upsert city ${city.slug}: ${error.message}`;
          console.error(`[DtCityDiscoveryService] ${msg}`);
          errors.push(msg);
        }
      }
    } catch (error: any) {
      const msg = `Auto discovery failed: ${error.message}`;
      console.error(`[DtCityDiscoveryService] ${msg}`);
      errors.push(msg);
    }

    const durationMs = Date.now() - startTime;

    return {
      citiesFound,
      citiesInserted,
      citiesUpdated,
      errors,
      durationMs,
    };
  }

  /**
   * Seed a single city manually
   */
  async seedCity(city: DutchieCity): Promise<ManualSeedResult> {
    console.log(`[DtCityDiscoveryService] Seeding city: ${city.name} (${city.slug}), ${city.stateCode}, ${city.countryCode}`);

    const result = await upsertCity(this.pool, city);

    return {
      city,
      id: result.id,
      wasInserted: result.inserted,
    };
  }

  /**
   * Seed multiple cities from a list
   */
  async seedCities(cities: DutchieCity[]): Promise<{
    results: ManualSeedResult[];
    errors: string[];
  }> {
    const results: ManualSeedResult[] = [];
    const errors: string[] = [];

    for (const city of cities) {
      try {
        const result = await this.seedCity(city);
        results.push(result);
      } catch (error: any) {
        errors.push(`${city.slug}: ${error.message}`);
      }
    }

    return { results, errors };
  }

  /**
   * Get statistics about discovered cities
   */
  async getStats(): Promise<{
    total: number;
    byCountry: Array<{ countryCode: string; count: number }>;
    byState: Array<{ stateCode: string; countryCode: string; count: number }>;
    crawlEnabled: number;
    neverCrawled: number;
  }> {
    const [totalRes, byCountryRes, byStateRes, enabledRes, neverRes] = await Promise.all([
      this.pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_cities WHERE platform = \'dutchie\''),
      this.pool.query(`
        SELECT country_code, COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE platform = 'dutchie'
        GROUP BY country_code
        ORDER BY cnt DESC
      `),
      this.pool.query(`
        SELECT state_code, country_code, COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE platform = 'dutchie' AND state_code IS NOT NULL
        GROUP BY state_code, country_code
        ORDER BY cnt DESC
      `),
      this.pool.query(`
        SELECT COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE platform = 'dutchie' AND crawl_enabled = TRUE
      `),
      this.pool.query(`
        SELECT COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE platform = 'dutchie' AND last_crawled_at IS NULL
      `),
    ]);

    return {
      total: parseInt(totalRes.rows[0]?.cnt || '0', 10),
      byCountry: byCountryRes.rows.map((r) => ({
        countryCode: r.country_code,
        count: parseInt(r.cnt, 10),
      })),
      byState: byStateRes.rows.map((r) => ({
        stateCode: r.state_code,
        countryCode: r.country_code,
        count: parseInt(r.cnt, 10),
      })),
      crawlEnabled: parseInt(enabledRes.rows[0]?.cnt || '0', 10),
      neverCrawled: parseInt(neverRes.rows[0]?.cnt || '0', 10),
    };
  }
}

export default DtCityDiscoveryService;
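// Sketch (not part of the original file): getStats() above fans five COUNT
// queries out in parallel via Promise.all and folds them into one summary,
// e.g. (illustrative values only):
//   { total: 412, byCountry: [{ countryCode: 'US', count: 390 }],
//     byState: [{ stateCode: 'CA', countryCode: 'US', count: 55 }],
//     crawlEnabled: 410, neverCrawled: 37 }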
File diff suppressed because it is too large
@@ -1,390 +0,0 @@
/**
 * DutchieCityDiscovery
 *
 * Discovers cities from Dutchie's /cities page and upserts to dutchie_discovery_cities.
 *
 * Responsibilities:
 * - Fetch all cities available on Dutchie
 * - For each city derive: city_name, city_slug, state_code, country_code
 * - Upsert into dutchie_discovery_cities
 */

import { Pool } from 'pg';
import axios from 'axios';
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import type { Browser, Page } from 'puppeteer';

puppeteer.use(StealthPlugin());

// ============================================================
// TYPES
// ============================================================

export interface DutchieCity {
  name: string;
  slug: string;
  stateCode: string | null;
  countryCode: string;
  url?: string;
}

export interface CityDiscoveryResult {
  citiesFound: number;
  citiesInserted: number;
  citiesUpdated: number;
  errors: string[];
  durationMs: number;
}

// ============================================================
// US STATE CODE MAPPING
// ============================================================

const US_STATE_MAP: Record<string, string> = {
  'alabama': 'AL', 'alaska': 'AK', 'arizona': 'AZ', 'arkansas': 'AR',
  'california': 'CA', 'colorado': 'CO', 'connecticut': 'CT', 'delaware': 'DE',
  'florida': 'FL', 'georgia': 'GA', 'hawaii': 'HI', 'idaho': 'ID',
  'illinois': 'IL', 'indiana': 'IN', 'iowa': 'IA', 'kansas': 'KS',
  'kentucky': 'KY', 'louisiana': 'LA', 'maine': 'ME', 'maryland': 'MD',
  'massachusetts': 'MA', 'michigan': 'MI', 'minnesota': 'MN', 'mississippi': 'MS',
  'missouri': 'MO', 'montana': 'MT', 'nebraska': 'NE', 'nevada': 'NV',
  'new-hampshire': 'NH', 'new-jersey': 'NJ', 'new-mexico': 'NM', 'new-york': 'NY',
  'north-carolina': 'NC', 'north-dakota': 'ND', 'ohio': 'OH', 'oklahoma': 'OK',
  'oregon': 'OR', 'pennsylvania': 'PA', 'rhode-island': 'RI', 'south-carolina': 'SC',
  'south-dakota': 'SD', 'tennessee': 'TN', 'texas': 'TX', 'utah': 'UT',
  'vermont': 'VT', 'virginia': 'VA', 'washington': 'WA', 'west-virginia': 'WV',
  'wisconsin': 'WI', 'wyoming': 'WY', 'district-of-columbia': 'DC',
};

// Canadian province mapping
const CA_PROVINCE_MAP: Record<string, string> = {
  'alberta': 'AB', 'british-columbia': 'BC', 'manitoba': 'MB',
  'new-brunswick': 'NB', 'newfoundland-and-labrador': 'NL',
  'northwest-territories': 'NT', 'nova-scotia': 'NS', 'nunavut': 'NU',
  'ontario': 'ON', 'prince-edward-island': 'PE', 'quebec': 'QC',
  'saskatchewan': 'SK', 'yukon': 'YT',
};
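// Sketch (not part of the original file): how the two maps above resolve a
// URL state slug into { stateCode, countryCode }:
//
//   function resolveRegion(stateSlug: string): { stateCode: string | null; countryCode: string } {
//     if (US_STATE_MAP[stateSlug]) return { stateCode: US_STATE_MAP[stateSlug], countryCode: 'US' };
//     if (CA_PROVINCE_MAP[stateSlug]) return { stateCode: CA_PROVINCE_MAP[stateSlug], countryCode: 'CA' };
//     return { stateCode: null, countryCode: 'US' };
//   }
//   resolveRegion('new-york')         // -> { stateCode: 'NY', countryCode: 'US' }
//   resolveRegion('british-columbia') // -> { stateCode: 'BC', countryCode: 'CA' }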

// ============================================================
// CITY FETCHING
// ============================================================

/**
 * Fetch cities from Dutchie's /cities page using Puppeteer to extract data.
 */
async function fetchCitiesFromDutchie(): Promise<DutchieCity[]> {
  console.log('[DutchieCityDiscovery] Launching browser to fetch cities...');

  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
  });

  try {
    const page = await browser.newPage();
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    );

    // Navigate to cities page
    console.log('[DutchieCityDiscovery] Navigating to https://dutchie.com/cities...');
    await page.goto('https://dutchie.com/cities', {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });

    // Wait for content to load
    await new Promise((r) => setTimeout(r, 3000));

    // Extract city links from the page
    const cities = await page.evaluate(() => {
      const cityLinks: Array<{
        name: string;
        slug: string;
        url: string;
        stateSlug: string | null;
      }> = [];

      // Find all city links - they typically follow pattern /city/{state}/{city}
      const links = document.querySelectorAll('a[href*="/city/"]');
      links.forEach((link) => {
        const href = (link as HTMLAnchorElement).href;
        const text = (link as HTMLElement).innerText?.trim();

        // Parse URL: https://dutchie.com/city/{state}/{city}
        const match = href.match(/\/city\/([^/]+)\/([^/?]+)/);
        if (match && text) {
          cityLinks.push({
            name: text,
            slug: match[2],
            url: href,
            stateSlug: match[1],
          });
        }
      });

      return cityLinks;
    });

    console.log(`[DutchieCityDiscovery] Extracted ${cities.length} city links from page`);

    // Convert to DutchieCity format
    const result: DutchieCity[] = [];

    for (const city of cities) {
      // Determine country and state code
      let countryCode = 'US';
      let stateCode: string | null = null;

      if (city.stateSlug) {
        // Check if it's a US state
        if (US_STATE_MAP[city.stateSlug]) {
          stateCode = US_STATE_MAP[city.stateSlug];
          countryCode = 'US';
        }
        // Check if it's a Canadian province
        else if (CA_PROVINCE_MAP[city.stateSlug]) {
          stateCode = CA_PROVINCE_MAP[city.stateSlug];
          countryCode = 'CA';
        }
        // Check if it's already a 2-letter code
        else if (city.stateSlug.length === 2) {
          stateCode = city.stateSlug.toUpperCase();
          // Determine country based on state code
          if (Object.values(CA_PROVINCE_MAP).includes(stateCode)) {
            countryCode = 'CA';
          }
        }
      }

      result.push({
        name: city.name,
        slug: city.slug,
        stateCode,
        countryCode,
        url: city.url,
      });
    }

    return result;
  } finally {
    await browser.close();
  }
}
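// Sketch (not part of the original file): what the /city/{state}/{city}
// regex above captures for a typical URL:
//   const m = 'https://dutchie.com/city/new-york/ny-hudson'.match(/\/city\/([^/]+)\/([^/?]+)/);
//   // m?.[1] === 'new-york' (state slug), m?.[2] === 'ny-hudson' (city slug)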

/**
 * Alternative: Fetch cities by making API/GraphQL requests.
 * Falls back to this if scraping fails.
 */
async function fetchCitiesFromAPI(): Promise<DutchieCity[]> {
  console.log('[DutchieCityDiscovery] Attempting API-based city discovery...');

  // Dutchie may have an API endpoint for cities
  // Try common patterns
  const apiEndpoints = [
    'https://dutchie.com/api/cities',
    'https://api.dutchie.com/v1/cities',
  ];

  for (const endpoint of apiEndpoints) {
    try {
      const response = await axios.get(endpoint, {
        headers: {
          'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0',
          Accept: 'application/json',
        },
        timeout: 15000,
      });

      if (response.data && Array.isArray(response.data)) {
        console.log(`[DutchieCityDiscovery] API returned ${response.data.length} cities`);
        return response.data.map((c: any) => ({
          name: c.name || c.city,
          slug: c.slug || c.citySlug,
          stateCode: c.stateCode || c.state,
          countryCode: c.countryCode || c.country || 'US',
        }));
      }
    } catch (error: any) {
      console.log(`[DutchieCityDiscovery] API ${endpoint} failed: ${error.message}`);
    }
  }

  return [];
}

// ============================================================
// DATABASE OPERATIONS
// ============================================================

/**
 * Upsert a city into dutchie_discovery_cities
 */
async function upsertCity(
  pool: Pool,
  city: DutchieCity
): Promise<{ inserted: boolean; updated: boolean }> {
  const result = await pool.query(
    `
    INSERT INTO dutchie_discovery_cities (
      platform,
      city_name,
      city_slug,
      state_code,
      country_code,
      last_crawled_at,
      updated_at
    ) VALUES (
      'dutchie',
      $1,
      $2,
      $3,
      $4,
      NOW(),
      NOW()
    )
    ON CONFLICT (platform, country_code, state_code, city_slug)
    DO UPDATE SET
      city_name = EXCLUDED.city_name,
      last_crawled_at = NOW(),
      updated_at = NOW()
    RETURNING (xmax = 0) AS inserted
    `,
    [city.name, city.slug, city.stateCode, city.countryCode]
  );

  const inserted = result.rows[0]?.inserted === true;
  return { inserted, updated: !inserted };
}

// ============================================================
// MAIN DISCOVERY FUNCTION
// ============================================================

export class DutchieCityDiscovery {
  private pool: Pool;

  constructor(pool: Pool) {
    this.pool = pool;
  }

  /**
   * Run the city discovery process
   */
  async run(): Promise<CityDiscoveryResult> {
    const startTime = Date.now();
    const errors: string[] = [];
    let citiesFound = 0;
    let citiesInserted = 0;
    let citiesUpdated = 0;

    console.log('[DutchieCityDiscovery] Starting city discovery...');

    try {
      // Try scraping first, fall back to API
      let cities = await fetchCitiesFromDutchie();

      if (cities.length === 0) {
        console.log('[DutchieCityDiscovery] Scraping returned 0 cities, trying API...');
        cities = await fetchCitiesFromAPI();
      }

      citiesFound = cities.length;
      console.log(`[DutchieCityDiscovery] Found ${citiesFound} cities`);

      // Upsert each city
      for (const city of cities) {
        try {
          const result = await upsertCity(this.pool, city);
          if (result.inserted) {
            citiesInserted++;
          } else if (result.updated) {
            citiesUpdated++;
          }
        } catch (error: any) {
          const msg = `Failed to upsert city ${city.slug}: ${error.message}`;
          console.error(`[DutchieCityDiscovery] ${msg}`);
          errors.push(msg);
        }
      }
    } catch (error: any) {
      const msg = `City discovery failed: ${error.message}`;
      console.error(`[DutchieCityDiscovery] ${msg}`);
      errors.push(msg);
    }

    const durationMs = Date.now() - startTime;

    console.log('[DutchieCityDiscovery] Discovery complete:');
    console.log(`  Cities found: ${citiesFound}`);
    console.log(`  Inserted: ${citiesInserted}`);
    console.log(`  Updated: ${citiesUpdated}`);
    console.log(`  Errors: ${errors.length}`);
    console.log(`  Duration: ${(durationMs / 1000).toFixed(1)}s`);

    return {
      citiesFound,
      citiesInserted,
      citiesUpdated,
      errors,
      durationMs,
    };
  }

  /**
   * Get statistics about discovered cities
   */
  async getStats(): Promise<{
    total: number;
    byCountry: Array<{ countryCode: string; count: number }>;
    byState: Array<{ stateCode: string; countryCode: string; count: number }>;
    crawlEnabled: number;
    neverCrawled: number;
  }> {
    const [totalRes, byCountryRes, byStateRes, enabledRes, neverRes] = await Promise.all([
      this.pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_cities'),
      this.pool.query(`
        SELECT country_code, COUNT(*) as cnt
        FROM dutchie_discovery_cities
        GROUP BY country_code
        ORDER BY cnt DESC
      `),
      this.pool.query(`
        SELECT state_code, country_code, COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE state_code IS NOT NULL
        GROUP BY state_code, country_code
        ORDER BY cnt DESC
      `),
      this.pool.query(`
        SELECT COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE crawl_enabled = TRUE
      `),
      this.pool.query(`
        SELECT COUNT(*) as cnt
        FROM dutchie_discovery_cities
        WHERE last_crawled_at IS NULL
      `),
    ]);

    return {
      total: parseInt(totalRes.rows[0]?.cnt || '0', 10),
      byCountry: byCountryRes.rows.map((r) => ({
        countryCode: r.country_code,
        count: parseInt(r.cnt, 10),
      })),
      byState: byStateRes.rows.map((r) => ({
        stateCode: r.state_code,
        countryCode: r.country_code,
        count: parseInt(r.cnt, 10),
      })),
      crawlEnabled: parseInt(enabledRes.rows[0]?.cnt || '0', 10),
      neverCrawled: parseInt(neverRes.rows[0]?.cnt || '0', 10),
    };
  }
}

export default DutchieCityDiscovery;
@@ -1,639 +0,0 @@
/**
 * DutchieLocationDiscovery
 *
 * Discovers store locations for each city from Dutchie and upserts to dutchie_discovery_locations.
 *
 * Responsibilities:
 * - Given a dutchie_discovery_cities row, call Dutchie's location/search endpoint
 * - For each store: extract platform_location_id, platform_slug, platform_menu_url, name, address, coords
 * - Upsert into dutchie_discovery_locations
 * - DO NOT overwrite status if already verified/merged/rejected
 * - DO NOT overwrite dispensary_id if already set
 */

import { Pool } from 'pg';
import axios from 'axios';
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';

puppeteer.use(StealthPlugin());

// ============================================================
// TYPES
// ============================================================

export interface DiscoveryCity {
  id: number;
  platform: string;
  cityName: string;
  citySlug: string;
  stateCode: string | null;
  countryCode: string;
  crawlEnabled: boolean;
}

export interface DutchieLocation {
  platformLocationId: string;
  platformSlug: string;
  platformMenuUrl: string;
  name: string;
  rawAddress: string | null;
  addressLine1: string | null;
  addressLine2: string | null;
  city: string | null;
  stateCode: string | null;
  postalCode: string | null;
  countryCode: string | null;
  latitude: number | null;
  longitude: number | null;
  timezone: string | null;
  offersDelivery: boolean | null;
  offersPickup: boolean | null;
  isRecreational: boolean | null;
  isMedical: boolean | null;
  metadata: Record<string, any>;
}

export interface LocationDiscoveryResult {
  cityId: number;
  citySlug: string;
  locationsFound: number;
  locationsInserted: number;
  locationsUpdated: number;
  locationsSkipped: number;
  errors: string[];
  durationMs: number;
}

// ============================================================
// LOCATION FETCHING
// ============================================================

/**
 * Fetch locations for a city using Puppeteer to scrape the city page
 */
async function fetchLocationsForCity(city: DiscoveryCity): Promise<DutchieLocation[]> {
  console.log(`[DutchieLocationDiscovery] Fetching locations for ${city.cityName}, ${city.stateCode}...`);

  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
  });

  try {
    const page = await browser.newPage();
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    );

    // Navigate to city page - use /us/dispensaries/{city_slug} pattern
    const cityUrl = `https://dutchie.com/us/dispensaries/${city.citySlug}`;
    console.log(`[DutchieLocationDiscovery] Navigating to ${cityUrl}...`);

    await page.goto(cityUrl, {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });

    // Wait for content
    await new Promise((r) => setTimeout(r, 3000));

    // Try to extract __NEXT_DATA__ which often contains store data
    const nextData = await page.evaluate(() => {
      const script = document.querySelector('script#__NEXT_DATA__');
      if (script) {
        try {
          return JSON.parse(script.textContent || '{}');
        } catch {
          return null;
        }
      }
      return null;
    });

    let locations: DutchieLocation[] = [];

    if (nextData?.props?.pageProps?.dispensaries) {
      // Extract from Next.js data
      const dispensaries = nextData.props.pageProps.dispensaries;
      console.log(`[DutchieLocationDiscovery] Found ${dispensaries.length} dispensaries in __NEXT_DATA__`);

      locations = dispensaries.map((d: any) => parseDispensaryData(d, city));
    } else {
      // Fall back to DOM scraping
      console.log('[DutchieLocationDiscovery] No __NEXT_DATA__, trying DOM scraping...');

      const scrapedData = await page.evaluate(() => {
        const stores: Array<{
          name: string;
          href: string;
          address: string | null;
        }> = [];

        // Look for dispensary cards/links
        const cards = document.querySelectorAll('[data-testid="dispensary-card"], .dispensary-card, a[href*="/dispensary/"]');
        cards.forEach((card) => {
          const link = card.querySelector('a[href*="/dispensary/"]') || (card as HTMLAnchorElement);
          const href = (link as HTMLAnchorElement).href || '';
          const name =
            card.querySelector('[data-testid="dispensary-name"]')?.textContent ||
            card.querySelector('h2, h3, .name')?.textContent ||
            link.textContent ||
            '';
          const address = card.querySelector('[data-testid="dispensary-address"], .address')?.textContent || null;

          if (href && name) {
            stores.push({
              name: name.trim(),
              href,
              address: address?.trim() || null,
            });
          }
        });

        return stores;
      });

      console.log(`[DutchieLocationDiscovery] DOM scraping found ${scrapedData.length} stores`);

      locations = scrapedData.map((s) => {
        // Parse slug from URL
        const match = s.href.match(/\/dispensary\/([^/?]+)/);
        const slug = match ? match[1] : s.name.toLowerCase().replace(/\s+/g, '-');

        return {
          platformLocationId: slug, // Will be resolved later
          platformSlug: slug,
          platformMenuUrl: `https://dutchie.com/dispensary/${slug}`,
          name: s.name,
          rawAddress: s.address,
          addressLine1: null,
          addressLine2: null,
          city: city.cityName,
          stateCode: city.stateCode,
          postalCode: null,
          countryCode: city.countryCode,
          latitude: null,
          longitude: null,
          timezone: null,
          offersDelivery: null,
          offersPickup: null,
          isRecreational: null,
          isMedical: null,
          metadata: { source: 'dom_scrape', originalUrl: s.href },
        };
      });
    }

    return locations;
  } finally {
    await browser.close();
  }
}
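// Sketch (not part of the original file): the __NEXT_DATA__ technique above
// relies on Next.js embedding page props as JSON in a <script> tag. In the
// browser context it reduces to:
//   const el = document.querySelector('script#__NEXT_DATA__');
//   const data = el ? JSON.parse(el.textContent ?? '{}') : null;
//   const dispensaries = data?.props?.pageProps?.dispensaries ?? [];
// The props.pageProps.dispensaries path is an observed shape, not a documented API.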

/**
 * Parse dispensary data from Dutchie's API/JSON response
 */
function parseDispensaryData(d: any, city: DiscoveryCity): DutchieLocation {
  const id = d.id || d._id || d.dispensaryId || '';
  const slug = d.slug || d.cName || d.name?.toLowerCase().replace(/\s+/g, '-') || '';

  // Build menu URL
  let menuUrl = `https://dutchie.com/dispensary/${slug}`;
  if (d.menuUrl) {
    menuUrl = d.menuUrl;
  } else if (d.embeddedMenuUrl) {
    menuUrl = d.embeddedMenuUrl;
  }

  // Parse address
  const address = d.address || d.location?.address || {};
  const rawAddress = [
    address.line1 || address.street1 || d.address1,
    address.line2 || address.street2 || d.address2,
    [
      address.city || d.city,
      address.state || address.stateCode || d.state,
      address.zip || address.zipCode || address.postalCode || d.zip,
    ]
      .filter(Boolean)
      .join(' '),
  ]
    .filter(Boolean)
    .join(', ');

  return {
    platformLocationId: id,
    platformSlug: slug,
    platformMenuUrl: menuUrl,
    name: d.name || d.dispensaryName || '',
    rawAddress: rawAddress || null,
    addressLine1: address.line1 || address.street1 || d.address1 || null,
    addressLine2: address.line2 || address.street2 || d.address2 || null,
    city: address.city || d.city || city.cityName,
    stateCode: address.state || address.stateCode || d.state || city.stateCode,
    postalCode: address.zip || address.zipCode || address.postalCode || d.zip || null,
    countryCode: address.country || address.countryCode || d.country || city.countryCode,
    latitude: d.latitude ?? d.location?.latitude ?? d.location?.lat ?? null,
    longitude: d.longitude ?? d.location?.longitude ?? d.location?.lng ?? null,
    timezone: d.timezone || d.timeZone || null,
    offersDelivery: d.offerDelivery ?? d.offersDelivery ?? d.delivery ?? null,
    offersPickup: d.offerPickup ?? d.offersPickup ?? d.pickup ?? null,
    isRecreational: d.isRecreational ?? d.recreational ?? (d.retailType === 'recreational' || d.retailType === 'both'),
    isMedical: d.isMedical ?? d.medical ?? (d.retailType === 'medical' || d.retailType === 'both'),
    metadata: {
      source: 'next_data',
      retailType: d.retailType,
      brand: d.brand,
      logo: d.logo || d.logoUrl,
      raw: d,
    },
  };
}
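// Sketch (not part of the original file): the fallback chains above applied
// to a hypothetical payload:
//   const sample = { id: 'abc123', name: 'Green Leaf',
//     address: { line1: '1 Main St', city: 'Hudson', state: 'NY', zip: '12534' } };
//   parseDispensaryData(sample, city).rawAddress
//   // -> '1 Main St, Hudson NY 12534'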

/**
 * Alternative: Use GraphQL to discover locations
 */
async function fetchLocationsViaGraphQL(city: DiscoveryCity): Promise<DutchieLocation[]> {
  console.log(`[DutchieLocationDiscovery] Trying GraphQL for ${city.cityName}...`);

  // Try geo-based search
  // This would require knowing the city's coordinates
  // For now, return empty and rely on page scraping
  return [];
}

// ============================================================
// DATABASE OPERATIONS
// ============================================================

/**
 * Upsert a location into dutchie_discovery_locations
 * Does NOT overwrite status if already verified/merged/rejected
 * Does NOT overwrite dispensary_id if already set
 */
async function upsertLocation(
  pool: Pool,
  location: DutchieLocation,
  cityId: number
): Promise<{ inserted: boolean; updated: boolean; skipped: boolean }> {
  // First check if this location exists and has a protected status
  const existing = await pool.query(
    `
    SELECT id, status, dispensary_id
    FROM dutchie_discovery_locations
    WHERE platform = 'dutchie' AND platform_location_id = $1
    `,
    [location.platformLocationId]
  );

  if (existing.rows.length > 0) {
    const row = existing.rows[0];
    const protectedStatuses = ['verified', 'merged', 'rejected'];

    if (protectedStatuses.includes(row.status)) {
      // Only update last_seen_at for protected statuses
      await pool.query(
        `
        UPDATE dutchie_discovery_locations
        SET last_seen_at = NOW(), updated_at = NOW()
        WHERE id = $1
        `,
        [row.id]
      );
      return { inserted: false, updated: false, skipped: true };
    }

    // Update existing discovered location (but preserve dispensary_id if set)
    await pool.query(
      `
      UPDATE dutchie_discovery_locations
      SET
        platform_slug = $2,
        platform_menu_url = $3,
        name = $4,
        raw_address = COALESCE($5, raw_address),
        address_line1 = COALESCE($6, address_line1),
        address_line2 = COALESCE($7, address_line2),
        city = COALESCE($8, city),
        state_code = COALESCE($9, state_code),
        postal_code = COALESCE($10, postal_code),
        country_code = COALESCE($11, country_code),
        latitude = COALESCE($12, latitude),
        longitude = COALESCE($13, longitude),
        timezone = COALESCE($14, timezone),
        offers_delivery = COALESCE($15, offers_delivery),
        offers_pickup = COALESCE($16, offers_pickup),
        is_recreational = COALESCE($17, is_recreational),
        is_medical = COALESCE($18, is_medical),
        metadata = COALESCE($19, metadata),
        discovery_city_id = $20,
        last_seen_at = NOW(),
        updated_at = NOW()
      WHERE id = $1
      `,
      [
        row.id,
        location.platformSlug,
        location.platformMenuUrl,
        location.name,
        location.rawAddress,
        location.addressLine1,
        location.addressLine2,
        location.city,
        location.stateCode,
        location.postalCode,
        location.countryCode,
        location.latitude,
        location.longitude,
        location.timezone,
        location.offersDelivery,
        location.offersPickup,
        location.isRecreational,
        location.isMedical,
        JSON.stringify(location.metadata),
        cityId,
      ]
    );
    return { inserted: false, updated: true, skipped: false };
  }

  // Insert new location
  await pool.query(
    `
    INSERT INTO dutchie_discovery_locations (
      platform,
      platform_location_id,
      platform_slug,
      platform_menu_url,
      name,
      raw_address,
      address_line1,
      address_line2,
      city,
      state_code,
      postal_code,
      country_code,
      latitude,
      longitude,
      timezone,
      status,
      offers_delivery,
      offers_pickup,
      is_recreational,
      is_medical,
      metadata,
      discovery_city_id,
      first_seen_at,
      last_seen_at,
      active,
      created_at,
      updated_at
    ) VALUES (
      'dutchie',
      $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14,
      'discovered',
      $15, $16, $17, $18, $19, $20,
      NOW(), NOW(), TRUE, NOW(), NOW()
    )
    `,
    [
      location.platformLocationId,
      location.platformSlug,
      location.platformMenuUrl,
      location.name,
      location.rawAddress,
      location.addressLine1,
      location.addressLine2,
      location.city,
      location.stateCode,
      location.postalCode,
      location.countryCode,
      location.latitude,
      location.longitude,
      location.timezone,
      location.offersDelivery,
      location.offersPickup,
      location.isRecreational,
      location.isMedical,
      JSON.stringify(location.metadata),
      cityId,
    ]
  );

  return { inserted: true, updated: false, skipped: false };
}
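// Sketch (not part of the original file): the protected-status rule above in
// practice -- a row already marked 'verified' only has last_seen_at refreshed:
//   await upsertLocation(pool, loc, cityId);
//   // -> { inserted: false, updated: false, skipped: true }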

// ============================================================
// MAIN DISCOVERY CLASS
// ============================================================

export class DutchieLocationDiscovery {
  private pool: Pool;

  constructor(pool: Pool) {
    this.pool = pool;
  }

  /**
   * Get a city by slug
   */
  async getCityBySlug(citySlug: string): Promise<DiscoveryCity | null> {
    const { rows } = await this.pool.query(
      `
      SELECT id, platform, city_name, city_slug, state_code, country_code, crawl_enabled
      FROM dutchie_discovery_cities
      WHERE platform = 'dutchie' AND city_slug = $1
      LIMIT 1
      `,
      [citySlug]
    );

    if (rows.length === 0) return null;

    const r = rows[0];
    return {
      id: r.id,
      platform: r.platform,
      cityName: r.city_name,
      citySlug: r.city_slug,
      stateCode: r.state_code,
      countryCode: r.country_code,
      crawlEnabled: r.crawl_enabled,
    };
  }

  /**
   * Get all crawl-enabled cities
   */
  async getEnabledCities(limit?: number): Promise<DiscoveryCity[]> {
    const { rows } = await this.pool.query(
      `
      SELECT id, platform, city_name, city_slug, state_code, country_code, crawl_enabled
      FROM dutchie_discovery_cities
      WHERE platform = 'dutchie' AND crawl_enabled = TRUE
      ORDER BY last_crawled_at ASC NULLS FIRST, city_name ASC
      ${limit ? `LIMIT ${limit}` : ''}
      `
    );

    return rows.map((r) => ({
      id: r.id,
      platform: r.platform,
      cityName: r.city_name,
      citySlug: r.city_slug,
      stateCode: r.state_code,
      countryCode: r.country_code,
      crawlEnabled: r.crawl_enabled,
    }));
  }

  /**
   * Discover locations for a single city
   */
  async discoverForCity(city: DiscoveryCity): Promise<LocationDiscoveryResult> {
    const startTime = Date.now();
    const errors: string[] = [];
    let locationsFound = 0;
    let locationsInserted = 0;
    let locationsUpdated = 0;
    let locationsSkipped = 0;

    console.log(`[DutchieLocationDiscovery] Discovering locations for ${city.cityName}, ${city.stateCode}...`);

    try {
      // Fetch locations
      let locations = await fetchLocationsForCity(city);

      // If scraping fails, try GraphQL
      if (locations.length === 0) {
        locations = await fetchLocationsViaGraphQL(city);
      }

      locationsFound = locations.length;
      console.log(`[DutchieLocationDiscovery] Found ${locationsFound} locations`);

      // Upsert each location
      for (const location of locations) {
        try {
          const result = await upsertLocation(this.pool, location, city.id);
          if (result.inserted) locationsInserted++;
          else if (result.updated) locationsUpdated++;
          else if (result.skipped) locationsSkipped++;
        } catch (error: any) {
          const msg = `Failed to upsert location ${location.platformSlug}: ${error.message}`;
          console.error(`[DutchieLocationDiscovery] ${msg}`);
          errors.push(msg);
        }
      }

      // Update city's last_crawled_at and location_count
      await this.pool.query(
        `
        UPDATE dutchie_discovery_cities
        SET last_crawled_at = NOW(),
            location_count = $1,
            updated_at = NOW()
        WHERE id = $2
        `,
        [locationsFound, city.id]
      );
    } catch (error: any) {
      const msg = `Location discovery failed for ${city.citySlug}: ${error.message}`;
      console.error(`[DutchieLocationDiscovery] ${msg}`);
      errors.push(msg);
    }

    const durationMs = Date.now() - startTime;

    console.log(`[DutchieLocationDiscovery] City ${city.citySlug} complete:`);
    console.log(`  Locations found: ${locationsFound}`);
    console.log(`  Inserted: ${locationsInserted}`);
    console.log(`  Updated: ${locationsUpdated}`);
    console.log(`  Skipped (protected): ${locationsSkipped}`);
    console.log(`  Errors: ${errors.length}`);
    console.log(`  Duration: ${(durationMs / 1000).toFixed(1)}s`);

    return {
      cityId: city.id,
      citySlug: city.citySlug,
      locationsFound,
      locationsInserted,
      locationsUpdated,
      locationsSkipped,
      errors,
      durationMs,
    };
  }

  /**
   * Discover locations for all enabled cities
   */
  async discoverAllEnabled(options: {
    limit?: number;
    delayMs?: number;
  } = {}): Promise<{
    totalCities: number;
    totalLocationsFound: number;
    totalInserted: number;
    totalUpdated: number;
    totalSkipped: number;
    errors: string[];
    durationMs: number;
  }> {
    const { limit, delayMs = 2000 } = options;
    const startTime = Date.now();
    let totalLocationsFound = 0;
    let totalInserted = 0;
    let totalUpdated = 0;
    let totalSkipped = 0;
    const allErrors: string[] = [];

    const cities = await this.getEnabledCities(limit);
    console.log(`[DutchieLocationDiscovery] Discovering locations for ${cities.length} cities...`);

    for (let i = 0; i < cities.length; i++) {
      const city = cities[i];
      console.log(`\n[DutchieLocationDiscovery] City ${i + 1}/${cities.length}: ${city.cityName}, ${city.stateCode}`);

      try {
        const result = await this.discoverForCity(city);
        totalLocationsFound += result.locationsFound;
        totalInserted += result.locationsInserted;
        totalUpdated += result.locationsUpdated;
        totalSkipped += result.locationsSkipped;
        allErrors.push(...result.errors);
      } catch (error: any) {
        allErrors.push(`City ${city.citySlug} failed: ${error.message}`);
      }

      // Delay between cities
      if (i < cities.length - 1 && delayMs > 0) {
        await new Promise((r) => setTimeout(r, delayMs));
      }
    }

    const durationMs = Date.now() - startTime;

    console.log('\n[DutchieLocationDiscovery] All cities complete:');
    console.log(`  Total cities: ${cities.length}`);
    console.log(`  Total locations found: ${totalLocationsFound}`);
    console.log(`  Total inserted: ${totalInserted}`);
    console.log(`  Total updated: ${totalUpdated}`);
    console.log(`  Total skipped: ${totalSkipped}`);
    console.log(`  Total errors: ${allErrors.length}`);
    console.log(`  Duration: ${(durationMs / 1000).toFixed(1)}s`);

    return {
      totalCities: cities.length,
      totalLocationsFound,
      totalInserted,
      totalUpdated,
      totalSkipped,
      errors: allErrors,
      durationMs,
    };
  }
}

export default DutchieLocationDiscovery;
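// Sketch (not part of the original file): typical driver code for the class
// above, pacing requests with the built-in delay between cities:
//   const svc = new DutchieLocationDiscovery(pool);
//   const summary = await svc.discoverAllEnabled({ limit: 5, delayMs: 3000 });
//   console.log(`${summary.totalLocationsFound} locations across ${summary.totalCities} cities`);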
@@ -1,73 +0,0 @@
#!/usr/bin/env npx tsx
/**
 * Discovery Entrypoint: Dutchie Cities (Auto)
 *
 * Attempts browser/API-based /cities discovery.
 * Even if currently blocked (403), this runner preserves the auto-discovery path.
 *
 * Usage:
 *   npm run discovery:dt:cities:auto
 *   DATABASE_URL="..." npx tsx src/dutchie-az/discovery/discovery-dt-cities-auto.ts
 */

import { Pool } from 'pg';
import { DtCityDiscoveryService } from './DtCityDiscoveryService';

const DB_URL = process.env.DATABASE_URL || process.env.CANNAIQ_DB_URL ||
  'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';

async function main() {
  console.log('╔══════════════════════════════════════════════════╗');
  console.log('║ Dutchie City Discovery (AUTO) ║');
  console.log('║ Browser + API fallback ║');
  console.log('╚══════════════════════════════════════════════════╝');
  console.log(`\nDatabase: ${DB_URL.replace(/:[^:@]+@/, ':****@')}`);

  const pool = new Pool({ connectionString: DB_URL });

  try {
    const { rows } = await pool.query('SELECT NOW() as time');
    console.log(`Connected at: ${rows[0].time}\n`);

    const service = new DtCityDiscoveryService(pool);
    const result = await service.runAutoDiscovery();

    console.log('\n' + '═'.repeat(50));
    console.log('SUMMARY');
    console.log('═'.repeat(50));
    console.log(`Cities found: ${result.citiesFound}`);
    console.log(`Cities inserted: ${result.citiesInserted}`);
    console.log(`Cities updated: ${result.citiesUpdated}`);
    console.log(`Errors: ${result.errors.length}`);
    console.log(`Duration: ${(result.durationMs / 1000).toFixed(1)}s`);

    if (result.errors.length > 0) {
      console.log('\nErrors:');
      result.errors.forEach((e, i) => console.log(`  ${i + 1}. ${e}`));
    }

    const stats = await service.getStats();
    console.log('\nCurrent Database Stats:');
    console.log(`  Total cities: ${stats.total}`);
    console.log(`  Crawl enabled: ${stats.crawlEnabled}`);
    console.log(`  Never crawled: ${stats.neverCrawled}`);

    if (result.citiesFound === 0) {
      console.log('\n⚠️  No cities found via auto-discovery.');
      console.log('   This may be due to Dutchie blocking scraping/API access.');
      console.log('   Use manual seeding instead:');
      console.log('   npm run discovery:dt:cities:manual -- --city-slug=ny-hudson --city-name=Hudson --state-code=NY');
      process.exit(1);
    }

    console.log('\n✅ Auto city discovery completed');
    process.exit(0);
  } catch (error: any) {
    console.error('\n❌ Auto city discovery failed:', error.message);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();
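// Sketch (not part of the original file): the credential-masking regex used
// in the runner above:
//   'postgresql://user:secret@host:5432/db'.replace(/:[^:@]+@/, ':****@')
//   // -> 'postgresql://user:****@host:5432/db'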
@@ -1,137 +0,0 @@
#!/usr/bin/env npx tsx
/**
 * Discovery Entrypoint: Dutchie Cities (Manual Seed)
 *
 * Manually seeds cities into dutchie_discovery_cities via CLI args.
 * Use this when auto-discovery is blocked (403).
 *
 * Usage:
 *   npm run discovery:dt:cities:manual -- --city-slug=ny-hudson --city-name=Hudson --state-code=NY
 *   npm run discovery:dt:cities:manual -- --city-slug=ma-boston --city-name=Boston --state-code=MA --country-code=US
 *
 * Options:
 *   --city-slug     Required. URL slug (e.g., "ny-hudson")
 *   --city-name     Required. Display name (e.g., "Hudson")
 *   --state-code    Required. State/province code (e.g., "NY", "CA", "ON")
 *   --country-code  Optional. Country code (default: "US")
 *
 * After seeding, run location discovery:
 *   npm run discovery:dt:locations
 */

import { Pool } from 'pg';
import { DtCityDiscoveryService, DutchieCity } from './DtCityDiscoveryService';

const DB_URL = process.env.DATABASE_URL || process.env.CANNAIQ_DB_URL ||
  'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';

interface Args {
  citySlug?: string;
  cityName?: string;
  stateCode?: string;
  countryCode: string;
}

function parseArgs(): Args {
  const args: Args = { countryCode: 'US' };

  for (const arg of process.argv.slice(2)) {
    const citySlugMatch = arg.match(/--city-slug=(.+)/);
    if (citySlugMatch) args.citySlug = citySlugMatch[1];

    const cityNameMatch = arg.match(/--city-name=(.+)/);
    if (cityNameMatch) args.cityName = cityNameMatch[1];

    const stateCodeMatch = arg.match(/--state-code=(.+)/);
    if (stateCodeMatch) args.stateCode = stateCodeMatch[1].toUpperCase();

    const countryCodeMatch = arg.match(/--country-code=(.+)/);
    if (countryCodeMatch) args.countryCode = countryCodeMatch[1].toUpperCase();
  }

  return args;
}
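// Sketch (not part of the original file): parseArgs() above on a sample
// invocation:
//   npm run discovery:dt:cities:manual -- --city-slug=ny-hudson --city-name=Hudson --state-code=ny
// yields:
//   { citySlug: 'ny-hudson', cityName: 'Hudson', stateCode: 'NY', countryCode: 'US' }
// (state and country codes are upper-cased; countryCode defaults to 'US').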

function printUsage() {
  console.log(`
Usage:
  npm run discovery:dt:cities:manual -- --city-slug=<slug> --city-name=<name> --state-code=<state>

Required arguments:
  --city-slug     URL slug for the city (e.g., "ny-hudson", "ma-boston")
  --city-name     Display name (e.g., "Hudson", "Boston")
  --state-code    State/province code (e.g., "NY", "CA", "ON")

Optional arguments:
  --country-code  Country code (default: "US")

Examples:
  npm run discovery:dt:cities:manual -- --city-slug=ny-hudson --city-name=Hudson --state-code=NY
  npm run discovery:dt:cities:manual -- --city-slug=ca-los-angeles --city-name="Los Angeles" --state-code=CA
  npm run discovery:dt:cities:manual -- --city-slug=on-toronto --city-name=Toronto --state-code=ON --country-code=CA

After seeding, run location discovery:
  npm run discovery:dt:locations
`);
}

async function main() {
  const args = parseArgs();

  console.log('╔══════════════════════════════════════════════════╗');
  console.log('║ Dutchie City Discovery (MANUAL SEED) ║');
  console.log('╚══════════════════════════════════════════════════╝');

  if (!args.citySlug || !args.cityName || !args.stateCode) {
    console.error('\n❌ Error: Missing required arguments\n');
    printUsage();
    process.exit(1);
  }

  console.log(`\nCity Slug: ${args.citySlug}`);
  console.log(`City Name: ${args.cityName}`);
  console.log(`State Code: ${args.stateCode}`);
  console.log(`Country Code: ${args.countryCode}`);
  console.log(`Database: ${DB_URL.replace(/:[^:@]+@/, ':****@')}`);

  const pool = new Pool({ connectionString: DB_URL });

  try {
    const { rows } = await pool.query('SELECT NOW() as time');
    console.log(`\nConnected at: ${rows[0].time}`);

    const service = new DtCityDiscoveryService(pool);

    const city: DutchieCity = {
      slug: args.citySlug,
      name: args.cityName,
      stateCode: args.stateCode,
      countryCode: args.countryCode,
    };

    const result = await service.seedCity(city);

    const action = result.wasInserted ? 'INSERTED' : 'UPDATED';
    console.log(`\n✅ City ${action}:`);
    console.log(`  ID: ${result.id}`);
    console.log(`  City Slug: ${result.city.slug}`);
    console.log(`  City Name: ${result.city.name}`);
    console.log(`  State Code: ${result.city.stateCode}`);
    console.log(`  Country Code: ${result.city.countryCode}`);

    const stats = await service.getStats();
    console.log(`\nTotal Dutchie cities: ${stats.total} (${stats.crawlEnabled} enabled)`);

    console.log('\n📍 Next step: Run location discovery');
    console.log('   npm run discovery:dt:locations');

    process.exit(0);
  } catch (error: any) {
    console.error('\n❌ Failed to seed city:', error.message);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();
@@ -1,73 +0,0 @@
#!/usr/bin/env npx tsx
/**
 * Discovery Runner: Dutchie Cities
 *
 * Discovers cities from Dutchie's /cities page and upserts to dutchie_discovery_cities.
 *
 * Usage:
 *   npm run discovery:platforms:dt:cities
 *   DATABASE_URL="..." npx tsx src/dutchie-az/discovery/discovery-dt-cities.ts
 */

import { Pool } from 'pg';
import { DutchieCityDiscovery } from './DutchieCityDiscovery';

const DB_URL = process.env.DATABASE_URL || process.env.CANNAIQ_DB_URL ||
  'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';

async function main() {
  console.log('╔══════════════════════════════════════════════════╗');
  console.log('║ Dutchie City Discovery Runner ║');
  console.log('╚══════════════════════════════════════════════════╝');
  console.log(`\nDatabase: ${DB_URL.replace(/:[^:@]+@/, ':****@')}`);

  const pool = new Pool({ connectionString: DB_URL });

  try {
    // Test DB connection
    const { rows } = await pool.query('SELECT NOW() as time');
    console.log(`Connected at: ${rows[0].time}\n`);

    // Run city discovery
    const discovery = new DutchieCityDiscovery(pool);
    const result = await discovery.run();

    // Print summary
    console.log('\n' + '═'.repeat(50));
    console.log('SUMMARY');
    console.log('═'.repeat(50));
    console.log(`Cities found: ${result.citiesFound}`);
    console.log(`Cities inserted: ${result.citiesInserted}`);
    console.log(`Cities updated: ${result.citiesUpdated}`);
    console.log(`Errors: ${result.errors.length}`);
    console.log(`Duration: ${(result.durationMs / 1000).toFixed(1)}s`);

    if (result.errors.length > 0) {
      console.log('\nErrors:');
      result.errors.forEach((e, i) => console.log(`  ${i + 1}. ${e}`));
    }

    // Get final stats
    const stats = await discovery.getStats();
    console.log('\nCurrent Database Stats:');
    console.log(`  Total cities: ${stats.total}`);
    console.log(`  Crawl enabled: ${stats.crawlEnabled}`);
    console.log(`  Never crawled: ${stats.neverCrawled}`);
    console.log(`  By country: ${stats.byCountry.map(c => `${c.countryCode}=${c.count}`).join(', ')}`);

    if (result.errors.length > 0) {
      console.log('\n⚠️  Completed with errors');
      process.exit(1);
    }

    console.log('\n✅ City discovery completed successfully');
    process.exit(0);
  } catch (error: any) {
    console.error('\n❌ City discovery failed:', error.message);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();
@@ -1,113 +0,0 @@
#!/usr/bin/env npx tsx
/**
 * Discovery Entrypoint: Dutchie Locations (From Cities)
 *
 * Reads from dutchie_discovery_cities (crawl_enabled = true)
 * and discovers store locations for each city.
 *
 * Geo coordinates are captured when available from Dutchie's payloads.
 *
 * Usage:
 *   npm run discovery:dt:locations
 *   npm run discovery:dt:locations -- --limit=10
 *   npm run discovery:dt:locations -- --delay=3000
 *   DATABASE_URL="..." npx tsx src/dutchie-az/discovery/discovery-dt-locations-from-cities.ts
 *
 * Options:
 *   --limit=N   Only process N cities (default: all)
 *   --delay=N   Delay between cities in ms (default: 2000)
 */

import { Pool } from 'pg';
import { DtLocationDiscoveryService } from './DtLocationDiscoveryService';

const DB_URL = process.env.DATABASE_URL || process.env.CANNAIQ_DB_URL ||
  'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';

function parseArgs(): { limit?: number; delay?: number } {
  const args: { limit?: number; delay?: number } = {};

  for (const arg of process.argv.slice(2)) {
    const limitMatch = arg.match(/--limit=(\d+)/);
    if (limitMatch) args.limit = parseInt(limitMatch[1], 10);

    const delayMatch = arg.match(/--delay=(\d+)/);
    if (delayMatch) args.delay = parseInt(delayMatch[1], 10);
  }

  return args;
}

async function main() {
  const args = parseArgs();

  console.log('╔══════════════════════════════════════════════════╗');
  console.log('║ Dutchie Location Discovery (From Cities) ║');
  console.log('║ Reads crawl_enabled cities, discovers stores ║');
  console.log('╚══════════════════════════════════════════════════╝');
  console.log(`\nDatabase: ${DB_URL.replace(/:[^:@]+@/, ':****@')}`);
  if (args.limit) console.log(`City limit: ${args.limit}`);
  if (args.delay) console.log(`Delay: ${args.delay}ms`);

  const pool = new Pool({ connectionString: DB_URL });

  try {
    const { rows } = await pool.query('SELECT NOW() as time');
    console.log(`Connected at: ${rows[0].time}\n`);

    const service = new DtLocationDiscoveryService(pool);
    const result = await service.discoverAllEnabled({
      limit: args.limit,
      delayMs: args.delay ?? 2000,
    });

    console.log('\n' + '═'.repeat(50));
    console.log('SUMMARY');
    console.log('═'.repeat(50));
    console.log(`Cities processed: ${result.totalCities}`);
    console.log(`Locations found: ${result.totalLocationsFound}`);
    console.log(`Locations inserted: ${result.totalInserted}`);
    console.log(`Locations updated: ${result.totalUpdated}`);
    console.log(`Locations skipped: ${result.totalSkipped} (protected status)`);
    console.log(`Errors: ${result.errors.length}`);
    console.log(`Duration: ${(result.durationMs / 1000).toFixed(1)}s`);

    if (result.errors.length > 0) {
      console.log('\nErrors (first 10):');
      result.errors.slice(0, 10).forEach((e, i) => console.log(`  ${i + 1}. ${e}`));
      if (result.errors.length > 10) {
        console.log(`  ... and ${result.errors.length - 10} more`);
      }
    }

    // Get location stats including coordinates
    const stats = await service.getStats();
    console.log('\nCurrent Database Stats:');
    console.log(`  Total locations: ${stats.total}`);
    console.log(`  With coordinates: ${stats.withCoordinates}`);
    console.log(`  By status:`);
    stats.byStatus.forEach(s => console.log(`    ${s.status}: ${s.count}`));

    if (result.totalCities === 0) {
      console.log('\n⚠️  No crawl-enabled cities found.');
      console.log('   Seed cities first:');
      console.log('   npm run discovery:dt:cities:manual -- --city-slug=ny-hudson --city-name=Hudson --state-code=NY');
      process.exit(1);
    }

    if (result.errors.length > 0) {
      console.log('\n⚠️  Completed with errors');
      process.exit(1);
    }

    console.log('\n✅ Location discovery completed successfully');
    process.exit(0);
  } catch (error: any) {
    console.error('\n❌ Location discovery failed:', error.message);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();
@@ -1,117 +0,0 @@
|
||||
#!/usr/bin/env npx tsx
|
||||
/**
|
||||
* Discovery Runner: Dutchie Locations
|
||||
*
|
||||
* Discovers store locations for all crawl-enabled cities and upserts to dutchie_discovery_locations.
|
||||
*
|
||||
* Usage:
|
||||
* npm run discovery:platforms:dt:locations
|
||||
* npm run discovery:platforms:dt:locations -- --limit=10
|
||||
* DATABASE_URL="..." npx tsx src/dutchie-az/discovery/discovery-dt-locations.ts
|
||||
*
|
||||
* Options (via args):
|
||||
* --limit=N Only process N cities (default: all)
|
||||
* --delay=N Delay between cities in ms (default: 2000)
|
||||
 */

import { Pool } from 'pg';
import { DutchieLocationDiscovery } from './DutchieLocationDiscovery';

const DB_URL = process.env.DATABASE_URL || process.env.CANNAIQ_DB_URL ||
  'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';

// Parse CLI args (--limit=N, --delay=MS)
function parseArgs(): { limit?: number; delay?: number } {
  const args: { limit?: number; delay?: number } = {};

  for (const arg of process.argv.slice(2)) {
    const limitMatch = arg.match(/--limit=(\d+)/);
    if (limitMatch) args.limit = parseInt(limitMatch[1], 10);

    const delayMatch = arg.match(/--delay=(\d+)/);
    if (delayMatch) args.delay = parseInt(delayMatch[1], 10);
  }

  return args;
}

async function main() {
  const args = parseArgs();

  console.log('╔══════════════════════════════════════════════════╗');
  console.log('║ Dutchie Location Discovery Runner ║');
  console.log('╚══════════════════════════════════════════════════╝');
  console.log(`\nDatabase: ${DB_URL.replace(/:[^:@]+@/, ':****@')}`);
  if (args.limit) console.log(`City limit: ${args.limit}`);
  if (args.delay) console.log(`Delay: ${args.delay}ms`);

  const pool = new Pool({ connectionString: DB_URL });

  try {
    // Test DB connection
    const { rows } = await pool.query('SELECT NOW() as time');
    console.log(`Connected at: ${rows[0].time}\n`);

    // Run location discovery
    const discovery = new DutchieLocationDiscovery(pool);
    const result = await discovery.discoverAllEnabled({
      limit: args.limit,
      delayMs: args.delay ?? 2000,
    });

    // Print summary
    console.log('\n' + '═'.repeat(50));
    console.log('SUMMARY');
    console.log('═'.repeat(50));
    console.log(`Cities processed: ${result.totalCities}`);
    console.log(`Locations found: ${result.totalLocationsFound}`);
    console.log(`Locations inserted: ${result.totalInserted}`);
    console.log(`Locations updated: ${result.totalUpdated}`);
    console.log(`Locations skipped: ${result.totalSkipped} (protected status)`);
    console.log(`Errors: ${result.errors.length}`);
    console.log(`Duration: ${(result.durationMs / 1000).toFixed(1)}s`);

    if (result.errors.length > 0) {
      console.log('\nErrors (first 10):');
      result.errors.slice(0, 10).forEach((e, i) => console.log(`  ${i + 1}. ${e}`));
      if (result.errors.length > 10) {
        console.log(`  ... and ${result.errors.length - 10} more`);
      }
    }

    // Get DB counts
    const { rows: countRows } = await pool.query(`
      SELECT
        COUNT(*) as total,
        COUNT(*) FILTER (WHERE status = 'discovered') as discovered,
        COUNT(*) FILTER (WHERE status = 'verified') as verified,
        COUNT(*) FILTER (WHERE status = 'merged') as merged,
        COUNT(*) FILTER (WHERE status = 'rejected') as rejected
      FROM dutchie_discovery_locations
      WHERE platform = 'dutchie' AND active = TRUE
    `);

    const counts = countRows[0];
    console.log('\nCurrent Database Stats:');
    console.log(`  Total locations: ${counts.total}`);
    console.log(`  Status discovered: ${counts.discovered}`);
    console.log(`  Status verified: ${counts.verified}`);
    console.log(`  Status merged: ${counts.merged}`);
    console.log(`  Status rejected: ${counts.rejected}`);

    if (result.errors.length > 0) {
      console.log('\n⚠️ Completed with errors');
      process.exit(1);
    }

    console.log('\n✅ Location discovery completed successfully');
    process.exit(0);
  } catch (error: any) {
    console.error('\n❌ Location discovery failed:', error.message);
    process.exit(1);
  } finally {
    await pool.end();
  }
}

main();
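The runner above is driven entirely by CLI flags; a minimal programmatic equivalent is sketched below. The connection string and the function wrapper are illustrative, not from the original file — only `DutchieLocationDiscovery`, `discoverAllEnabled`, and its `limit`/`delayMs` options come from the source.

import { Pool } from 'pg';
import { DutchieLocationDiscovery } from './DutchieLocationDiscovery';

// Roughly equivalent to invoking the runner with --limit=10 --delay=2000
async function discoverTenCities(): Promise<void> {
  // Illustrative connection string; use your own DATABASE_URL
  const pool = new Pool({ connectionString: 'postgresql://user:****@localhost:54320/dutchie_menus' });
  try {
    const discovery = new DutchieLocationDiscovery(pool);
    const result = await discovery.discoverAllEnabled({ limit: 10, delayMs: 2000 });
    console.log(`${result.totalLocationsFound} locations across ${result.totalCities} cities`);
  } finally {
    await pool.end();
  }
}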
@@ -1,10 +0,0 @@
/**
 * Dutchie Discovery Module
 *
 * Store discovery pipeline for the Dutchie platform.
 */

export { DutchieCityDiscovery } from './DutchieCityDiscovery';
export { DutchieLocationDiscovery } from './DutchieLocationDiscovery';
export { createDutchieDiscoveryRoutes } from './routes';
export { promoteDiscoveryLocation } from './promoteDiscoveryLocation';
@@ -1,248 +0,0 @@
/**
 * Promote Discovery Location to Crawlable Dispensary
 *
 * When a discovery location is verified or merged:
 *   1. Ensure a crawl profile exists for the dispensary
 *   2. Seed/update the crawl schedule
 *   3. Create an initial crawl job
 */

import { Pool } from 'pg';

export interface PromotionResult {
  success: boolean;
  discoveryId: number;
  dispensaryId: number;
  crawlProfileId?: number;
  scheduleUpdated?: boolean;
  crawlJobCreated?: boolean;
  error?: string;
}

/**
 * Promote a verified/merged discovery location to a crawlable dispensary.
 *
 * This function:
 *   1. Verifies the discovery location is verified/merged and has a dispensary_id
 *   2. Ensures the dispensary has platform info (menu_type, platform_dispensary_id)
 *   3. Creates/updates a crawler profile if the profile table exists
 *   4. Queues an initial crawl job
 */
export async function promoteDiscoveryLocation(
  pool: Pool,
  discoveryLocationId: number
): Promise<PromotionResult> {
  console.log(`[Promote] Starting promotion for discovery location ${discoveryLocationId}...`);

  // Get the discovery location
  const { rows: locRows } = await pool.query(
    `
    SELECT
      dl.*,
      d.id as disp_id,
      d.name as disp_name,
      d.menu_type as disp_menu_type,
      d.platform_dispensary_id as disp_platform_id
    FROM dutchie_discovery_locations dl
    JOIN dispensaries d ON dl.dispensary_id = d.id
    WHERE dl.id = $1
    `,
    [discoveryLocationId]
  );

  if (locRows.length === 0) {
    return {
      success: false,
      discoveryId: discoveryLocationId,
      dispensaryId: 0,
      error: 'Discovery location not found or not linked to a dispensary',
    };
  }

  const location = locRows[0];

  // Verify status
  if (!['verified', 'merged'].includes(location.status)) {
    return {
      success: false,
      discoveryId: discoveryLocationId,
      dispensaryId: location.dispensary_id || 0,
      error: `Cannot promote: location status is '${location.status}', must be 'verified' or 'merged'`,
    };
  }

  const dispensaryId = location.dispensary_id;
  console.log(`[Promote] Location ${discoveryLocationId} -> Dispensary ${dispensaryId} (${location.disp_name})`);

  // Ensure dispensary has platform info
  if (!location.disp_platform_id) {
    console.log(`[Promote] Updating dispensary with platform info...`);
    await pool.query(
      `
      UPDATE dispensaries
      SET platform_dispensary_id = COALESCE(platform_dispensary_id, $1),
          menu_url = COALESCE(menu_url, $2),
          menu_type = COALESCE(menu_type, 'dutchie'),
          updated_at = NOW()
      WHERE id = $3
      `,
      [location.platform_location_id, location.platform_menu_url, dispensaryId]
    );
  }

  let crawlProfileId: number | undefined;
  let scheduleUpdated = false;
  let crawlJobCreated = false;

  // Check if the dispensary_crawler_profiles table exists
  const { rows: tableCheck } = await pool.query(`
    SELECT EXISTS (
      SELECT FROM information_schema.tables
      WHERE table_name = 'dispensary_crawler_profiles'
    ) as exists
  `);

  if (tableCheck[0]?.exists) {
    // Create or get crawler profile
    console.log(`[Promote] Checking crawler profile...`);

    const { rows: profileRows } = await pool.query(
      `
      SELECT id FROM dispensary_crawler_profiles
      WHERE dispensary_id = $1 AND platform = 'dutchie'
      `,
      [dispensaryId]
    );

    if (profileRows.length > 0) {
      crawlProfileId = profileRows[0].id;
      console.log(`[Promote] Using existing profile ${crawlProfileId}`);
    } else {
      // Create new profile
      const profileKey = `dutchie-${location.platform_slug}`;
      const { rows: newProfile } = await pool.query(
        `
        INSERT INTO dispensary_crawler_profiles (
          dispensary_id,
          profile_key,
          profile_name,
          platform,
          config,
          status,
          enabled,
          created_at,
          updated_at
        ) VALUES (
          $1, $2, $3, 'dutchie', $4, 'sandbox', TRUE, NOW(), NOW()
        )
        ON CONFLICT (dispensary_id, platform) DO UPDATE SET
          enabled = TRUE,
          updated_at = NOW()
        RETURNING id
        `,
        [
          dispensaryId,
          profileKey,
          `${location.name} (Dutchie)`,
          JSON.stringify({
            platformDispensaryId: location.platform_location_id,
            platformSlug: location.platform_slug,
            menuUrl: location.platform_menu_url,
            pricingType: 'rec',
            useBothModes: true,
          }),
        ]
      );

      crawlProfileId = newProfile[0]?.id;
      console.log(`[Promote] Created new profile ${crawlProfileId}`);
    }

    // Link profile to dispensary if not already linked
    await pool.query(
      `
      UPDATE dispensaries
      SET active_crawler_profile_id = COALESCE(active_crawler_profile_id, $1),
          updated_at = NOW()
      WHERE id = $2
      `,
      [crawlProfileId, dispensaryId]
    );
  }

  // Check if the crawl_jobs table exists and create an initial job
  const { rows: jobsTableCheck } = await pool.query(`
    SELECT EXISTS (
      SELECT FROM information_schema.tables
      WHERE table_name = 'crawl_jobs'
    ) as exists
  `);

  if (jobsTableCheck[0]?.exists) {
    // Check if there's already a pending job
    const { rows: existingJobs } = await pool.query(
      `
      SELECT id FROM crawl_jobs
      WHERE dispensary_id = $1 AND status IN ('pending', 'running')
      LIMIT 1
      `,
      [dispensaryId]
    );

    if (existingJobs.length === 0) {
      // Create initial crawl job
      console.log(`[Promote] Creating initial crawl job...`);
      await pool.query(
        `
        INSERT INTO crawl_jobs (
          dispensary_id,
          job_type,
          status,
          priority,
          config,
          created_at,
          updated_at
        ) VALUES (
          $1, 'dutchie_product_crawl', 'pending', 1, $2, NOW(), NOW()
        )
        `,
        [
          dispensaryId,
          JSON.stringify({
            source: 'discovery_promotion',
            discoveryLocationId,
            pricingType: 'rec',
            useBothModes: true,
          }),
        ]
      );
      crawlJobCreated = true;
    } else {
      console.log(`[Promote] Crawl job already exists for dispensary`);
    }
  }

  // Update discovery location notes
  await pool.query(
    `
    UPDATE dutchie_discovery_locations
    SET notes = COALESCE(notes || E'\n', '') || $1,
        updated_at = NOW()
    WHERE id = $2
    `,
    [`Promoted to crawlable at ${new Date().toISOString()}`, discoveryLocationId]
  );

  console.log(`[Promote] Promotion complete for discovery location ${discoveryLocationId}`);

  return {
    success: true,
    discoveryId: discoveryLocationId,
    dispensaryId,
    crawlProfileId,
    scheduleUpdated,
    crawlJobCreated,
  };
}

export default promoteDiscoveryLocation;
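A minimal usage sketch for the helper above. The discovery-location ID is illustrative; `pool` is any pg Pool connected to the same database, and the import path follows the module index shown earlier.

import { Pool } from 'pg';
import { promoteDiscoveryLocation } from './promoteDiscoveryLocation';

async function promoteExample(pool: Pool): Promise<void> {
  // 123 is an illustrative dutchie_discovery_locations.id in 'verified' or 'merged' status
  const result = await promoteDiscoveryLocation(pool, 123);
  if (!result.success) {
    console.error(`Promotion failed: ${result.error}`);
    return;
  }
  console.log(
    `Dispensary ${result.dispensaryId}: profile=${result.crawlProfileId ?? 'n/a'}, ` +
      `job created=${result.crawlJobCreated ?? false}`
  );
}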
@@ -1,973 +0,0 @@
/**
 * Platform Discovery API Routes (DT = Dutchie)
 *
 * Routes for the platform-specific store discovery pipeline.
 * Mount at /api/discovery/platforms/dt
 *
 * Platform Slug Mapping (for trademark-safe URLs):
 *   dt = Dutchie
 *   jn = Jane (future)
 *   wm = Weedmaps (future)
 *   lf = Leafly (future)
 *   tz = Treez (future)
 *
 * Note: The actual platform value stored in the DB remains 'dutchie'.
 * Only the URL paths use neutral slugs.
 */

import { Router, Request, Response } from 'express';
import { Pool } from 'pg';
import { DutchieCityDiscovery } from './DutchieCityDiscovery';
import { DutchieLocationDiscovery } from './DutchieLocationDiscovery';
import { DiscoveryGeoService } from '../../services/DiscoveryGeoService';
import { GeoValidationService } from '../../services/GeoValidationService';

export function createDutchieDiscoveryRoutes(pool: Pool): Router {
  const router = Router();

  // ============================================================
  // LOCATIONS
  // ============================================================

  /**
   * GET /api/discovery/platforms/dt/locations
   *
   * List discovered locations with filtering.
   *
   * Query params:
   *   - status: 'discovered' | 'verified' | 'rejected' | 'merged'
   *   - state_code: e.g., 'AZ', 'CA'
   *   - country_code: 'US' | 'CA'
   *   - unlinked_only: 'true' to show only locations without a dispensary_id
   *   - search: search by name
   *   - limit: number (default 50)
   *   - offset: number (default 0)
   */
  router.get('/locations', async (req: Request, res: Response) => {
    try {
      const {
        status,
        state_code,
        country_code,
        unlinked_only,
        search,
        limit = '50',
        offset = '0',
      } = req.query;

      // Qualify columns with dl. — the list query joins dispensaries, which shares
      // column names (name, city, active), so unqualified references are ambiguous.
      let whereClause = "WHERE dl.platform = 'dutchie' AND dl.active = TRUE";
      const params: any[] = [];
      let paramIndex = 1;

      if (status) {
        whereClause += ` AND dl.status = $${paramIndex}`;
        params.push(status);
        paramIndex++;
      }

      if (state_code) {
        whereClause += ` AND dl.state_code = $${paramIndex}`;
        params.push(state_code);
        paramIndex++;
      }

      if (country_code) {
        whereClause += ` AND dl.country_code = $${paramIndex}`;
        params.push(country_code);
        paramIndex++;
      }

      if (unlinked_only === 'true') {
        whereClause += ' AND dl.dispensary_id IS NULL';
      }

      if (search) {
        whereClause += ` AND (dl.name ILIKE $${paramIndex} OR dl.platform_slug ILIKE $${paramIndex})`;
        params.push(`%${search}%`);
        paramIndex++;
      }

      const limitVal = parseInt(limit as string, 10);
      const offsetVal = parseInt(offset as string, 10);
      params.push(limitVal, offsetVal);

      const { rows } = await pool.query(
        `
        SELECT
          dl.id,
          dl.platform,
          dl.platform_location_id,
          dl.platform_slug,
          dl.platform_menu_url,
          dl.name,
          dl.raw_address,
          dl.address_line1,
          dl.city,
          dl.state_code,
          dl.postal_code,
          dl.country_code,
          dl.latitude,
          dl.longitude,
          dl.status,
          dl.dispensary_id,
          dl.offers_delivery,
          dl.offers_pickup,
          dl.is_recreational,
          dl.is_medical,
          dl.first_seen_at,
          dl.last_seen_at,
          dl.verified_at,
          dl.verified_by,
          dl.notes,
          d.name as dispensary_name
        FROM dutchie_discovery_locations dl
        LEFT JOIN dispensaries d ON dl.dispensary_id = d.id
        ${whereClause}
        ORDER BY dl.first_seen_at DESC
        LIMIT $${paramIndex} OFFSET $${paramIndex + 1}
        `,
        params
      );

      // Get total count
      const countParams = params.slice(0, -2);
      const { rows: countRows } = await pool.query(
        `SELECT COUNT(*) as total FROM dutchie_discovery_locations dl ${whereClause}`,
        countParams
      );

      res.json({
        success: true,
        locations: rows.map((r) => ({
          id: r.id,
          platform: r.platform,
          platformLocationId: r.platform_location_id,
          platformSlug: r.platform_slug,
          platformMenuUrl: r.platform_menu_url,
          name: r.name,
          rawAddress: r.raw_address,
          addressLine1: r.address_line1,
          city: r.city,
          stateCode: r.state_code,
          postalCode: r.postal_code,
          countryCode: r.country_code,
          latitude: r.latitude,
          longitude: r.longitude,
          status: r.status,
          dispensaryId: r.dispensary_id,
          dispensaryName: r.dispensary_name,
          offersDelivery: r.offers_delivery,
          offersPickup: r.offers_pickup,
          isRecreational: r.is_recreational,
          isMedical: r.is_medical,
          firstSeenAt: r.first_seen_at,
          lastSeenAt: r.last_seen_at,
          verifiedAt: r.verified_at,
          verifiedBy: r.verified_by,
          notes: r.notes,
        })),
        total: parseInt(countRows[0]?.total || '0', 10),
        limit: limitVal,
        offset: offsetVal,
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error fetching locations:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });
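  // Example request against the list endpoint above (illustrative host/port;
  // assumes the router is mounted at /api/discovery/platforms/dt as the file's
  // doc comment prescribes):
  //
  //   const res = await fetch(
  //     'http://localhost:3000/api/discovery/platforms/dt/locations' +
  //       '?status=discovered&state_code=AZ&unlinked_only=true&limit=25'
  //   );
  //   const { locations, total } = await res.json();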
  /**
   * GET /api/discovery/platforms/dt/locations/:id
   *
   * Get a single location by ID.
   */
  router.get('/locations/:id', async (req: Request, res: Response) => {
    try {
      const { id } = req.params;

      const { rows } = await pool.query(
        `
        SELECT
          dl.*,
          d.name as dispensary_name,
          d.menu_url as dispensary_menu_url
        FROM dutchie_discovery_locations dl
        LEFT JOIN dispensaries d ON dl.dispensary_id = d.id
        WHERE dl.id = $1
        `,
        [parseInt(id, 10)]
      );

      if (rows.length === 0) {
        return res.status(404).json({ success: false, error: 'Location not found' });
      }

      const r = rows[0];
      res.json({
        success: true,
        location: {
          id: r.id,
          platform: r.platform,
          platformLocationId: r.platform_location_id,
          platformSlug: r.platform_slug,
          platformMenuUrl: r.platform_menu_url,
          name: r.name,
          rawAddress: r.raw_address,
          addressLine1: r.address_line1,
          addressLine2: r.address_line2,
          city: r.city,
          stateCode: r.state_code,
          postalCode: r.postal_code,
          countryCode: r.country_code,
          latitude: r.latitude,
          longitude: r.longitude,
          timezone: r.timezone,
          status: r.status,
          dispensaryId: r.dispensary_id,
          dispensaryName: r.dispensary_name,
          dispensaryMenuUrl: r.dispensary_menu_url,
          offersDelivery: r.offers_delivery,
          offersPickup: r.offers_pickup,
          isRecreational: r.is_recreational,
          isMedical: r.is_medical,
          firstSeenAt: r.first_seen_at,
          lastSeenAt: r.last_seen_at,
          verifiedAt: r.verified_at,
          verifiedBy: r.verified_by,
          notes: r.notes,
          metadata: r.metadata,
        },
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error fetching location:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  // ============================================================
  // VERIFICATION ACTIONS
  // ============================================================

  /**
   * POST /api/discovery/platforms/dt/locations/:id/verify-create
   *
   * Verify a discovered location and create a new canonical dispensary.
   */
  router.post('/locations/:id/verify-create', async (req: Request, res: Response) => {
    const client = await pool.connect();
    try {
      const { id } = req.params;
      const { verifiedBy = 'admin' } = req.body;

      await client.query('BEGIN');

      // Get the discovery location
      const { rows: locRows } = await client.query(
        `SELECT * FROM dutchie_discovery_locations WHERE id = $1 FOR UPDATE`,
        [parseInt(id, 10)]
      );

      if (locRows.length === 0) {
        await client.query('ROLLBACK');
        return res.status(404).json({ success: false, error: 'Location not found' });
      }

      const location = locRows[0];

      if (location.status !== 'discovered') {
        await client.query('ROLLBACK');
        return res.status(400).json({
          success: false,
          error: `Cannot verify: location status is '${location.status}'`,
        });
      }

      // Look up state_id if we have a state_code
      let stateId: number | null = null;
      if (location.state_code) {
        const { rows: stateRows } = await client.query(
          `SELECT id FROM states WHERE code = $1`,
          [location.state_code]
        );
        if (stateRows.length > 0) {
          stateId = stateRows[0].id;
        }
      }

      // Create the canonical dispensary
      const { rows: dispRows } = await client.query(
        `
        INSERT INTO dispensaries (
          name,
          slug,
          address,
          city,
          state,
          zip,
          latitude,
          longitude,
          timezone,
          menu_type,
          menu_url,
          platform_dispensary_id,
          state_id,
          active,
          created_at,
          updated_at
        ) VALUES (
          $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, TRUE, NOW(), NOW()
        )
        RETURNING id
        `,
        [
          location.name,
          location.platform_slug,
          location.address_line1,
          location.city,
          location.state_code,
          location.postal_code,
          location.latitude,
          location.longitude,
          location.timezone,
          'dutchie',
          location.platform_menu_url,
          location.platform_location_id,
          stateId,
        ]
      );

      const dispensaryId = dispRows[0].id;

      // Update the discovery location
      await client.query(
        `
        UPDATE dutchie_discovery_locations
        SET status = 'verified',
            dispensary_id = $1,
            verified_at = NOW(),
            verified_by = $2,
            updated_at = NOW()
        WHERE id = $3
        `,
        [dispensaryId, verifiedBy, id]
      );

      await client.query('COMMIT');

      res.json({
        success: true,
        action: 'created',
        discoveryId: parseInt(id, 10),
        dispensaryId,
        message: `Created new dispensary (ID: ${dispensaryId})`,
      });
    } catch (error: any) {
      await client.query('ROLLBACK');
      console.error('[Discovery Routes] Error in verify-create:', error);
      res.status(500).json({ success: false, error: error.message });
    } finally {
      client.release();
    }
  });

  /**
   * POST /api/discovery/platforms/dt/locations/:id/verify-link
   *
   * Link a discovered location to an existing dispensary.
   *
   * Body:
   *   - dispensaryId: number (required)
   *   - verifiedBy: string (optional)
   */
  router.post('/locations/:id/verify-link', async (req: Request, res: Response) => {
    const client = await pool.connect();
    try {
      const { id } = req.params;
      const { dispensaryId, verifiedBy = 'admin' } = req.body;

      if (!dispensaryId) {
        return res.status(400).json({ success: false, error: 'dispensaryId is required' });
      }

      await client.query('BEGIN');

      // Verify dispensary exists
      const { rows: dispRows } = await client.query(
        `SELECT id, name FROM dispensaries WHERE id = $1`,
        [dispensaryId]
      );

      if (dispRows.length === 0) {
        await client.query('ROLLBACK');
        return res.status(404).json({ success: false, error: 'Dispensary not found' });
      }

      // Get the discovery location
      const { rows: locRows } = await client.query(
        `SELECT * FROM dutchie_discovery_locations WHERE id = $1 FOR UPDATE`,
        [parseInt(id, 10)]
      );

      if (locRows.length === 0) {
        await client.query('ROLLBACK');
        return res.status(404).json({ success: false, error: 'Location not found' });
      }

      const location = locRows[0];

      if (location.status !== 'discovered') {
        await client.query('ROLLBACK');
        return res.status(400).json({
          success: false,
          error: `Cannot link: location status is '${location.status}'`,
        });
      }

      // Update dispensary with platform info if missing
      await client.query(
        `
        UPDATE dispensaries
        SET platform_dispensary_id = COALESCE(platform_dispensary_id, $1),
            menu_url = COALESCE(menu_url, $2),
            menu_type = COALESCE(menu_type, 'dutchie'),
            updated_at = NOW()
        WHERE id = $3
        `,
        [location.platform_location_id, location.platform_menu_url, dispensaryId]
      );

      // Update the discovery location
      await client.query(
        `
        UPDATE dutchie_discovery_locations
        SET status = 'merged',
            dispensary_id = $1,
            verified_at = NOW(),
            verified_by = $2,
            updated_at = NOW()
        WHERE id = $3
        `,
        [dispensaryId, verifiedBy, id]
      );

      await client.query('COMMIT');

      res.json({
        success: true,
        action: 'linked',
        discoveryId: parseInt(id, 10),
        dispensaryId,
        dispensaryName: dispRows[0].name,
        message: `Linked to existing dispensary: ${dispRows[0].name}`,
      });
    } catch (error: any) {
      await client.query('ROLLBACK');
      console.error('[Discovery Routes] Error in verify-link:', error);
      res.status(500).json({ success: false, error: error.message });
    } finally {
      client.release();
    }
  });

  /**
   * POST /api/discovery/platforms/dt/locations/:id/reject
   *
   * Reject a discovered location.
   *
   * Body:
   *   - reason: string (optional)
   *   - verifiedBy: string (optional)
   */
  router.post('/locations/:id/reject', async (req: Request, res: Response) => {
    try {
      const { id } = req.params;
      const { reason, verifiedBy = 'admin' } = req.body;

      // Get current status
      const { rows } = await pool.query(
        `SELECT status FROM dutchie_discovery_locations WHERE id = $1`,
        [parseInt(id, 10)]
      );

      if (rows.length === 0) {
        return res.status(404).json({ success: false, error: 'Location not found' });
      }

      if (rows[0].status !== 'discovered') {
        return res.status(400).json({
          success: false,
          error: `Cannot reject: location status is '${rows[0].status}'`,
        });
      }

      await pool.query(
        `
        UPDATE dutchie_discovery_locations
        SET status = 'rejected',
            verified_at = NOW(),
            verified_by = $1,
            notes = COALESCE($2, notes),
            updated_at = NOW()
        WHERE id = $3
        `,
        [verifiedBy, reason, id]
      );

      res.json({
        success: true,
        action: 'rejected',
        discoveryId: parseInt(id, 10),
        message: 'Location rejected',
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error in reject:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  /**
   * POST /api/discovery/platforms/dt/locations/:id/unreject
   *
   * Restore a rejected location to discovered status.
   */
  router.post('/locations/:id/unreject', async (req: Request, res: Response) => {
    try {
      const { id } = req.params;

      // Get current status
      const { rows } = await pool.query(
        `SELECT status FROM dutchie_discovery_locations WHERE id = $1`,
        [parseInt(id, 10)]
      );

      if (rows.length === 0) {
        return res.status(404).json({ success: false, error: 'Location not found' });
      }

      if (rows[0].status !== 'rejected') {
        return res.status(400).json({
          success: false,
          error: `Cannot unreject: location status is '${rows[0].status}'`,
        });
      }

      await pool.query(
        `
        UPDATE dutchie_discovery_locations
        SET status = 'discovered',
            verified_at = NULL,
            verified_by = NULL,
            updated_at = NOW()
        WHERE id = $1
        `,
        [id]
      );

      res.json({
        success: true,
        action: 'unrejected',
        discoveryId: parseInt(id, 10),
        message: 'Location restored to discovered status',
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error in unreject:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  // ============================================================
  // SUMMARY / REPORTING
  // ============================================================

  /**
   * GET /api/discovery/platforms/dt/summary
   *
   * Get discovery summary statistics.
   */
  router.get('/summary', async (_req: Request, res: Response) => {
    try {
      // Total counts by status
      const { rows: statusRows } = await pool.query(`
        SELECT status, COUNT(*) as cnt
        FROM dutchie_discovery_locations
        WHERE platform = 'dutchie' AND active = TRUE
        GROUP BY status
      `);

      const statusCounts: Record<string, number> = {};
      let totalLocations = 0;
      for (const row of statusRows) {
        statusCounts[row.status] = parseInt(row.cnt, 10);
        totalLocations += parseInt(row.cnt, 10);
      }

      // By state
      const { rows: stateRows } = await pool.query(`
        SELECT
          state_code,
          COUNT(*) as total,
          COUNT(*) FILTER (WHERE status = 'verified') as verified,
          COUNT(*) FILTER (WHERE dispensary_id IS NULL AND status = 'discovered') as unlinked
        FROM dutchie_discovery_locations
        WHERE platform = 'dutchie' AND active = TRUE AND state_code IS NOT NULL
        GROUP BY state_code
        ORDER BY total DESC
      `);

      res.json({
        success: true,
        summary: {
          total_locations: totalLocations,
          discovered: statusCounts['discovered'] || 0,
          verified: statusCounts['verified'] || 0,
          merged: statusCounts['merged'] || 0,
          rejected: statusCounts['rejected'] || 0,
        },
        by_state: stateRows.map((r) => ({
          state_code: r.state_code,
          total: parseInt(r.total, 10),
          verified: parseInt(r.verified, 10),
          unlinked: parseInt(r.unlinked, 10),
        })),
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error in summary:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  // ============================================================
  // CITIES
  // ============================================================

  /**
   * GET /api/discovery/platforms/dt/cities
   *
   * List discovery cities.
   */
  router.get('/cities', async (req: Request, res: Response) => {
    try {
      const { state_code, country_code, crawl_enabled, limit = '100', offset = '0' } = req.query;

      let whereClause = "WHERE platform = 'dutchie'";
      const params: any[] = [];
      let paramIndex = 1;

      if (state_code) {
        whereClause += ` AND state_code = $${paramIndex}`;
        params.push(state_code);
        paramIndex++;
      }

      if (country_code) {
        whereClause += ` AND country_code = $${paramIndex}`;
        params.push(country_code);
        paramIndex++;
      }

      if (crawl_enabled === 'true') {
        whereClause += ' AND crawl_enabled = TRUE';
      } else if (crawl_enabled === 'false') {
        whereClause += ' AND crawl_enabled = FALSE';
      }

      params.push(parseInt(limit as string, 10), parseInt(offset as string, 10));

      const { rows } = await pool.query(
        `
        SELECT
          id,
          platform,
          city_name,
          city_slug,
          state_code,
          country_code,
          last_crawled_at,
          crawl_enabled,
          location_count
        FROM dutchie_discovery_cities
        ${whereClause}
        ORDER BY country_code, state_code, city_name
        LIMIT $${paramIndex} OFFSET $${paramIndex + 1}
        `,
        params
      );

      const { rows: countRows } = await pool.query(
        `SELECT COUNT(*) as total FROM dutchie_discovery_cities ${whereClause}`,
        params.slice(0, -2)
      );

      res.json({
        success: true,
        cities: rows.map((r) => ({
          id: r.id,
          platform: r.platform,
          cityName: r.city_name,
          citySlug: r.city_slug,
          stateCode: r.state_code,
          countryCode: r.country_code,
          lastCrawledAt: r.last_crawled_at,
          crawlEnabled: r.crawl_enabled,
          locationCount: r.location_count,
        })),
        total: parseInt(countRows[0]?.total || '0', 10),
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error fetching cities:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  // ============================================================
  // MATCH CANDIDATES
  // ============================================================

  /**
   * GET /api/discovery/platforms/dt/locations/:id/match-candidates
   *
   * Find potential dispensary matches for a discovery location.
   */
  router.get('/locations/:id/match-candidates', async (req: Request, res: Response) => {
    try {
      const { id } = req.params;

      // Get the discovery location
      const { rows: locRows } = await pool.query(
        `SELECT * FROM dutchie_discovery_locations WHERE id = $1`,
        [parseInt(id, 10)]
      );

      if (locRows.length === 0) {
        return res.status(404).json({ success: false, error: 'Location not found' });
      }

      const location = locRows[0];

      // Find potential matches
      const { rows: candidates } = await pool.query(
        `
        SELECT
          d.id,
          d.name,
          d.city,
          d.state,
          d.address,
          d.menu_type,
          d.platform_dispensary_id,
          d.menu_url,
          d.latitude,
          d.longitude,
          CASE
            WHEN d.name ILIKE $1 THEN 'exact_name'
            WHEN d.name ILIKE $2 THEN 'partial_name'
            WHEN d.city ILIKE $3 AND d.state = $4 THEN 'same_city'
            ELSE 'location_match'
          END as match_type,
          CASE
            WHEN d.latitude IS NOT NULL AND d.longitude IS NOT NULL
              AND $5::float IS NOT NULL AND $6::float IS NOT NULL
            THEN (3959 * acos(
              LEAST(1.0, GREATEST(-1.0,
                cos(radians($5::float)) * cos(radians(d.latitude)) *
                cos(radians(d.longitude) - radians($6::float)) +
                sin(radians($5::float)) * sin(radians(d.latitude))
              ))
            ))
            ELSE NULL
          END as distance_miles
        FROM dispensaries d
        WHERE d.state = $4
          AND (
            d.name ILIKE $1
            OR d.name ILIKE $2
            OR d.city ILIKE $3
            OR (
              d.latitude IS NOT NULL
              AND d.longitude IS NOT NULL
              AND $5::float IS NOT NULL
              AND $6::float IS NOT NULL
            )
          )
        ORDER BY
          CASE
            WHEN d.name ILIKE $1 THEN 1
            WHEN d.name ILIKE $2 THEN 2
            ELSE 3
          END,
          distance_miles NULLS LAST
        LIMIT 10
        `,
        [
          location.name,
          `%${location.name.split(' ')[0]}%`,
          location.city,
          location.state_code,
          location.latitude,
          location.longitude,
        ]
      );

      res.json({
        success: true,
        location: {
          id: location.id,
          name: location.name,
          city: location.city,
          stateCode: location.state_code,
        },
        candidates: candidates.map((c) => ({
          id: c.id,
          name: c.name,
          city: c.city,
          state: c.state,
          address: c.address,
          menuType: c.menu_type,
          platformDispensaryId: c.platform_dispensary_id,
          menuUrl: c.menu_url,
          matchType: c.match_type,
          distanceMiles: c.distance_miles ? Math.round(c.distance_miles * 10) / 10 : null,
        })),
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error fetching match candidates:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });
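  // TypeScript mirror of the SQL great-circle formula used in match-candidates
  // above (3959 ≈ Earth's radius in miles). A sketch for client-side display,
  // not part of the original module:
  function distanceMiles(lat1: number, lon1: number, lat2: number, lon2: number): number {
    const rad = (deg: number) => (deg * Math.PI) / 180;
    const cosine =
      Math.cos(rad(lat1)) * Math.cos(rad(lat2)) * Math.cos(rad(lon2) - rad(lon1)) +
      Math.sin(rad(lat1)) * Math.sin(rad(lat2));
    // Clamp like LEAST(1.0, GREATEST(-1.0, ...)) to guard acos against rounding error
    return 3959 * Math.acos(Math.min(1, Math.max(-1, cosine)));
  }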
  // ============================================================
  // GEO / NEARBY (Admin/Debug Only)
  // ============================================================

  /**
   * GET /api/discovery/platforms/dt/nearby
   *
   * Find discovery locations near a given coordinate.
   * This is an internal/debug endpoint for admin use.
   *
   * Query params:
   *   - lat: number (required)
   *   - lon: number (required)
   *   - radiusKm: number (optional, default 50)
   *   - limit: number (optional, default 20)
   *   - status: string (optional, filter by status)
   */
  router.get('/nearby', async (req: Request, res: Response) => {
    try {
      const { lat, lon, radiusKm = '50', limit = '20', status } = req.query;

      // Validate required params
      if (!lat || !lon) {
        return res.status(400).json({
          success: false,
          error: 'lat and lon are required query parameters',
        });
      }

      const latNum = parseFloat(lat as string);
      const lonNum = parseFloat(lon as string);
      const radiusNum = parseFloat(radiusKm as string);
      const limitNum = parseInt(limit as string, 10);

      if (isNaN(latNum) || isNaN(lonNum)) {
        return res.status(400).json({
          success: false,
          error: 'lat and lon must be valid numbers',
        });
      }

      const geoService = new DiscoveryGeoService(pool);

      const locations = await geoService.findNearbyDiscoveryLocations(latNum, lonNum, {
        radiusKm: radiusNum,
        limit: limitNum,
        platform: 'dutchie',
        status: status as string | undefined,
      });

      res.json({
        success: true,
        center: { lat: latNum, lon: lonNum },
        radiusKm: radiusNum,
        count: locations.length,
        locations,
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error in nearby:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  /**
   * GET /api/discovery/platforms/dt/geo-stats
   *
   * Get coordinate coverage statistics for discovery locations.
   * This is an internal/debug endpoint for admin use.
   */
  router.get('/geo-stats', async (_req: Request, res: Response) => {
    try {
      const geoService = new DiscoveryGeoService(pool);
      const stats = await geoService.getCoordinateCoverageStats();

      res.json({
        success: true,
        stats,
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error in geo-stats:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  /**
   * GET /api/discovery/platforms/dt/locations/:id/validate-geo
   *
   * Validate the geographic data for a discovery location.
   * This is an internal/debug endpoint for admin use.
   */
  router.get('/locations/:id/validate-geo', async (req: Request, res: Response) => {
    try {
      const { id } = req.params;

      // Get the location
      const { rows } = await pool.query(
        `SELECT latitude, longitude, state_code, country_code, name
         FROM dutchie_discovery_locations WHERE id = $1`,
        [parseInt(id, 10)]
      );

      if (rows.length === 0) {
        return res.status(404).json({ success: false, error: 'Location not found' });
      }

      const location = rows[0];
      const geoValidation = new GeoValidationService();
      const result = geoValidation.validateLocationState({
        latitude: location.latitude,
        longitude: location.longitude,
        state_code: location.state_code,
        country_code: location.country_code,
      });

      res.json({
        success: true,
        location: {
          id: parseInt(id, 10),
          name: location.name,
          latitude: location.latitude,
          longitude: location.longitude,
          stateCode: location.state_code,
          countryCode: location.country_code,
        },
        validation: result,
      });
    } catch (error: any) {
      console.error('[Discovery Routes] Error in validate-geo:', error);
      res.status(500).json({ success: false, error: error.message });
    }
  });

  return router;
}

export default createDutchieDiscoveryRoutes;
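A minimal mounting sketch for the router above. The Express setup, port, and import path are illustrative; the mount path follows the file's own doc comment.

import express from 'express';
import { Pool } from 'pg';
import { createDutchieDiscoveryRoutes } from './discovery/routes'; // path is an assumption

const app = express();
app.use(express.json());

// DATABASE_URL is assumed to point at the same Postgres instance the routes query
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
app.use('/api/discovery/platforms/dt', createDutchieDiscoveryRoutes(pool));

app.listen(3000, () => console.log('Discovery API listening on :3000'));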
@@ -1,92 +0,0 @@
/**
 * Dutchie AZ Data Pipeline
 *
 * Isolated data pipeline for crawling and storing Dutchie Arizona dispensary data.
 * This module is completely separate from the main application database.
 *
 * Features:
 *   - Two-mode crawling (Mode A: UI parity, Mode B: MAX COVERAGE)
 *   - Derived stockStatus field (in_stock, out_of_stock, unknown)
 *   - Full raw payload storage for 100% data preservation
 *   - AZDHS dispensary list as canonical source
 */

// Types
export * from './types';

// Database
export {
  getDutchieAZPool,
  query,
  getClient,
  closePool,
  healthCheck,
} from './db/connection';

export {
  createSchema,
  dropSchema,
  schemaExists,
  ensureSchema,
} from './db/schema';

// Services - GraphQL Client
export {
  GRAPHQL_HASHES,
  ARIZONA_CENTERPOINTS,
  resolveDispensaryId,
  fetchAllProducts,
  fetchAllProductsBothModes,
  discoverArizonaDispensaries,
  // Alias for backward compatibility
  discoverArizonaDispensaries as discoverDispensaries,
} from './services/graphql-client';

// Services - Discovery
export {
  importFromExistingDispensaries,
  discoverDispensaries as discoverAndSaveDispensaries,
  resolvePlatformDispensaryIds,
  getAllDispensaries,
  getDispensaryById,
  getDispensariesWithPlatformIds,
} from './services/discovery';

// Services - Product Crawler
export {
  normalizeProduct,
  normalizeSnapshot,
  crawlDispensaryProducts,
  crawlAllArizonaDispensaries,
} from './services/product-crawler';

export type { CrawlResult } from './services/product-crawler';

// Services - Scheduler
export {
  startScheduler,
  stopScheduler,
  triggerImmediateCrawl,
  getSchedulerStatus,
  crawlSingleDispensary,
  // Schedule config CRUD
  getAllSchedules,
  getScheduleById,
  createSchedule,
  updateSchedule,
  deleteSchedule,
  triggerScheduleNow,
  initializeDefaultSchedules,
  // Run logs
  getRunLogs,
} from './services/scheduler';

// Services - AZDHS Import
export {
  importAZDHSDispensaries,
  importFromJSON,
  getImportStats,
} from './services/azdhs-import';

// Routes
export { default as dutchieAZRouter } from './routes';
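A hedged end-to-end sketch using the names exported above. The import path and the call signatures of `ensureSchema` and `crawlAllArizonaDispensaries` are assumptions (only their names appear in the source); treat this as an outline, not the pipeline's actual entry point.

import { ensureSchema, crawlAllArizonaDispensaries, closePool } from './dutchie-az'; // path assumed

async function runAZPipeline(): Promise<void> {
  await ensureSchema();                // assumed: creates the isolated schema if missing
  await crawlAllArizonaDispensaries(); // assumed: crawls every AZ dispensary in both modes
  await closePool();                   // release the module's dedicated pool
}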
@@ -1,682 +0,0 @@
/**
 * Analytics API Routes
 *
 * Provides REST API endpoints for all analytics services.
 * All routes are prefixed with /api/analytics
 *
 * Phase 3: Analytics Dashboards
 */

import { Router, Request, Response } from 'express';
import { Pool } from 'pg';
import {
  AnalyticsCache,
  PriceTrendService,
  PenetrationService,
  CategoryAnalyticsService,
  StoreChangeService,
  BrandOpportunityService,
} from '../services/analytics';

export function createAnalyticsRouter(pool: Pool): Router {
  const router = Router();

  // Initialize services
  const cache = new AnalyticsCache(pool, { defaultTtlMinutes: 15 });
  const priceService = new PriceTrendService(pool, cache);
  const penetrationService = new PenetrationService(pool, cache);
  const categoryService = new CategoryAnalyticsService(pool, cache);
  const storeService = new StoreChangeService(pool, cache);
  const brandOpportunityService = new BrandOpportunityService(pool, cache);

  // ============================================================
  // PRICE ANALYTICS
  // ============================================================

  /**
   * GET /api/analytics/price/product/:id
   * Get price trend for a specific product
   */
  router.get('/price/product/:id', async (req: Request, res: Response) => {
    try {
      const productId = parseInt(req.params.id);
      const storeId = req.query.storeId ? parseInt(req.query.storeId as string) : undefined;
      const days = req.query.days ? parseInt(req.query.days as string) : 30;

      const result = await priceService.getProductPriceTrend(productId, storeId, days);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Price product error:', error);
      res.status(500).json({ error: 'Failed to fetch product price trend' });
    }
  });

  /**
   * GET /api/analytics/price/brand/:name
   * Get price trend for a brand
   */
  router.get('/price/brand/:name', async (req: Request, res: Response) => {
    try {
      const brandName = decodeURIComponent(req.params.name);
      const filters = {
        storeId: req.query.storeId ? parseInt(req.query.storeId as string) : undefined,
        category: req.query.category as string | undefined,
        state: req.query.state as string | undefined,
        days: req.query.days ? parseInt(req.query.days as string) : 30,
      };

      const result = await priceService.getBrandPriceTrend(brandName, filters);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Price brand error:', error);
      res.status(500).json({ error: 'Failed to fetch brand price trend' });
    }
  });

  /**
   * GET /api/analytics/price/category/:name
   * Get price trend for a category
   */
  router.get('/price/category/:name', async (req: Request, res: Response) => {
    try {
      const category = decodeURIComponent(req.params.name);
      const filters = {
        storeId: req.query.storeId ? parseInt(req.query.storeId as string) : undefined,
        brandName: req.query.brand as string | undefined,
        state: req.query.state as string | undefined,
        days: req.query.days ? parseInt(req.query.days as string) : 30,
      };

      const result = await priceService.getCategoryPriceTrend(category, filters);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Price category error:', error);
      res.status(500).json({ error: 'Failed to fetch category price trend' });
    }
  });

  /**
   * GET /api/analytics/price/summary
   * Get price summary statistics
   */
  router.get('/price/summary', async (req: Request, res: Response) => {
    try {
      const filters = {
        storeId: req.query.storeId ? parseInt(req.query.storeId as string) : undefined,
        brandName: req.query.brand as string | undefined,
        category: req.query.category as string | undefined,
        state: req.query.state as string | undefined,
      };

      const result = await priceService.getPriceSummary(filters);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Price summary error:', error);
      res.status(500).json({ error: 'Failed to fetch price summary' });
    }
  });

  /**
   * GET /api/analytics/price/compression/:category
   * Get price compression analysis for a category
   */
  router.get('/price/compression/:category', async (req: Request, res: Response) => {
    try {
      const category = decodeURIComponent(req.params.category);
      const state = req.query.state as string | undefined;

      const result = await priceService.detectPriceCompression(category, state);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Price compression error:', error);
      res.status(500).json({ error: 'Failed to analyze price compression' });
    }
  });

  /**
   * GET /api/analytics/price/global
   * Get global price statistics
   */
  router.get('/price/global', async (_req: Request, res: Response) => {
    try {
      const result = await priceService.getGlobalPriceStats();
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Global price error:', error);
      res.status(500).json({ error: 'Failed to fetch global price stats' });
    }
  });
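  // Example client call for the price endpoints above (illustrative host;
  // assumes this router is mounted at /api/analytics as the file's doc
  // comment states):
  //
  //   const res = await fetch(
  //     'http://localhost:3000/api/analytics/price/brand/' +
  //       encodeURIComponent('Some Brand') + '?state=AZ&days=30'
  //   );
  //   const trend = await res.json();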
// ============================================================
|
||||
// PENETRATION ANALYTICS
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* GET /api/analytics/penetration/brand/:name
|
||||
* Get penetration data for a brand
|
||||
*/
|
||||
router.get('/penetration/brand/:name', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const brandName = decodeURIComponent(req.params.name);
|
||||
const filters = {
|
||||
state: req.query.state as string | undefined,
|
||||
category: req.query.category as string | undefined,
|
||||
};
|
||||
|
||||
const result = await penetrationService.getBrandPenetration(brandName, filters);
|
||||
res.json(result);
|
||||
} catch (error) {
|
||||
console.error('[Analytics] Brand penetration error:', error);
|
||||
res.status(500).json({ error: 'Failed to fetch brand penetration' });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* GET /api/analytics/penetration/top
|
||||
* Get top brands by penetration
|
||||
*/
|
||||
router.get('/penetration/top', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const limit = req.query.limit ? parseInt(req.query.limit as string) : 20;
|
||||
const filters = {
|
||||
state: req.query.state as string | undefined,
|
||||
category: req.query.category as string | undefined,
|
||||
minStores: req.query.minStores ? parseInt(req.query.minStores as string) : 2,
|
||||
minSkus: req.query.minSkus ? parseInt(req.query.minSkus as string) : 5,
|
||||
};
|
||||
|
||||
const result = await penetrationService.getTopBrandsByPenetration(limit, filters);
|
||||
res.json(result);
|
||||
} catch (error) {
|
||||
console.error('[Analytics] Top penetration error:', error);
|
||||
res.status(500).json({ error: 'Failed to fetch top brands' });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* GET /api/analytics/penetration/trend/:brand
|
||||
* Get penetration trend for a brand
|
||||
*/
|
||||
router.get('/penetration/trend/:brand', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const brandName = decodeURIComponent(req.params.brand);
|
||||
const days = req.query.days ? parseInt(req.query.days as string) : 30;
|
||||
|
||||
const result = await penetrationService.getPenetrationTrend(brandName, days);
|
||||
res.json(result);
|
||||
} catch (error) {
|
||||
console.error('[Analytics] Penetration trend error:', error);
|
||||
res.status(500).json({ error: 'Failed to fetch penetration trend' });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* GET /api/analytics/penetration/shelf-share/:brand
|
||||
* Get shelf share by category for a brand
|
||||
*/
|
||||
router.get('/penetration/shelf-share/:brand', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const brandName = decodeURIComponent(req.params.brand);
|
||||
const result = await penetrationService.getShelfShareByCategory(brandName);
|
||||
res.json(result);
|
||||
} catch (error) {
|
||||
console.error('[Analytics] Shelf share error:', error);
|
||||
res.status(500).json({ error: 'Failed to fetch shelf share' });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* GET /api/analytics/penetration/by-state/:brand
|
||||
* Get brand presence by state
|
||||
*/
|
||||
router.get('/penetration/by-state/:brand', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const brandName = decodeURIComponent(req.params.brand);
|
||||
const result = await penetrationService.getBrandPresenceByState(brandName);
|
||||
res.json(result);
|
||||
} catch (error) {
|
||||
console.error('[Analytics] Brand by state error:', error);
|
||||
res.status(500).json({ error: 'Failed to fetch brand presence by state' });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* GET /api/analytics/penetration/stores/:brand
|
||||
* Get stores carrying a brand
|
||||
*/
|
||||
router.get('/penetration/stores/:brand', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const brandName = decodeURIComponent(req.params.brand);
|
||||
const result = await penetrationService.getStoresCarryingBrand(brandName);
|
||||
res.json(result);
|
||||
} catch (error) {
|
||||
console.error('[Analytics] Stores carrying brand error:', error);
|
||||
res.status(500).json({ error: 'Failed to fetch stores' });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* GET /api/analytics/penetration/heatmap
|
||||
* Get penetration heatmap data
|
||||
*/
|
||||
router.get('/penetration/heatmap', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const brandName = req.query.brand as string | undefined;
|
||||
const result = await penetrationService.getPenetrationHeatmap(brandName);
|
||||
res.json(result);
|
||||
} catch (error) {
|
||||
console.error('[Analytics] Heatmap error:', error);
|
||||
res.status(500).json({ error: 'Failed to fetch heatmap data' });
|
||||
}
|
||||
});
|
||||
|
||||
// ============================================================
|
||||
// CATEGORY ANALYTICS
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* GET /api/analytics/category/summary
|
||||
* Get category summary
|
||||
*/
|
||||
router.get('/category/summary', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const category = req.query.category as string | undefined;
|
||||
const filters = {
|
||||
state: req.query.state as string | undefined,
|
||||
storeId: req.query.storeId ? parseInt(req.query.storeId as string) : undefined,
|
||||
};
|
||||
|
||||
const result = await categoryService.getCategorySummary(category, filters);
|
||||
res.json(result);
|
||||
} catch (error) {
|
||||
console.error('[Analytics] Category summary error:', error);
|
||||
res.status(500).json({ error: 'Failed to fetch category summary' });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* GET /api/analytics/category/growth
|
||||
* Get category growth data
|
||||
*/
|
||||
router.get('/category/growth', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const days = req.query.days ? parseInt(req.query.days as string) : 7;
|
||||
const filters = {
|
||||
state: req.query.state as string | undefined,
|
||||
storeId: req.query.storeId ? parseInt(req.query.storeId as string) : undefined,
|
||||
minSkus: req.query.minSkus ? parseInt(req.query.minSkus as string) : 10,
|
||||
};
|
||||
|
||||
const result = await categoryService.getCategoryGrowth(days, filters);
|
||||
res.json(result);
|
||||
} catch (error) {
|
||||
console.error('[Analytics] Category growth error:', error);
|
||||
res.status(500).json({ error: 'Failed to fetch category growth' });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* GET /api/analytics/category/trend/:category
|
||||
* Get category growth trend over time
|
||||
*/
|
||||
router.get('/category/trend/:category', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const category = decodeURIComponent(req.params.category);
|
||||
const days = req.query.days ? parseInt(req.query.days as string) : 90;
|
||||
|
||||
const result = await categoryService.getCategoryGrowthTrend(category, days);
|
||||
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Category trend error:', error);
      res.status(500).json({ error: 'Failed to fetch category trend' });
    }
  });

  /**
   * GET /api/analytics/category/heatmap
   * Get category heatmap data
   */
  router.get('/category/heatmap', async (req: Request, res: Response) => {
    try {
      const metric = (req.query.metric as 'skus' | 'growth' | 'price') || 'skus';
      const periods = req.query.periods ? parseInt(req.query.periods as string) : 12;

      const result = await categoryService.getCategoryHeatmap(metric, periods);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Category heatmap error:', error);
      res.status(500).json({ error: 'Failed to fetch heatmap' });
    }
  });

  /**
   * GET /api/analytics/category/top-movers
   * Get top growing and declining categories
   */
  router.get('/category/top-movers', async (req: Request, res: Response) => {
    try {
      const limit = req.query.limit ? parseInt(req.query.limit as string) : 5;
      const days = req.query.days ? parseInt(req.query.days as string) : 30;

      const result = await categoryService.getTopMovers(limit, days);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Top movers error:', error);
      res.status(500).json({ error: 'Failed to fetch top movers' });
    }
  });

  /**
   * GET /api/analytics/category/:category/subcategories
   * Get subcategory breakdown
   */
  router.get('/category/:category/subcategories', async (req: Request, res: Response) => {
    try {
      const category = decodeURIComponent(req.params.category);
      const result = await categoryService.getSubcategoryBreakdown(category);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Subcategory error:', error);
      res.status(500).json({ error: 'Failed to fetch subcategories' });
    }
  });

  // ============================================================
  // STORE CHANGE TRACKING
  // ============================================================

  /**
   * GET /api/analytics/store/:id/summary
   * Get change summary for a store
   */
  router.get('/store/:id/summary', async (req: Request, res: Response) => {
    try {
      const storeId = parseInt(req.params.id);
      const result = await storeService.getStoreChangeSummary(storeId);

      if (!result) {
        return res.status(404).json({ error: 'Store not found' });
      }

      res.json(result);
    } catch (error) {
      console.error('[Analytics] Store summary error:', error);
      res.status(500).json({ error: 'Failed to fetch store summary' });
    }
  });

  /**
   * GET /api/analytics/store/:id/events
   * Get recent change events for a store
   */
  router.get('/store/:id/events', async (req: Request, res: Response) => {
    try {
      const storeId = parseInt(req.params.id);
      const filters = {
        eventType: req.query.type as string | undefined,
        days: req.query.days ? parseInt(req.query.days as string) : 30,
        limit: req.query.limit ? parseInt(req.query.limit as string) : 100,
      };

      const result = await storeService.getStoreChangeEvents(storeId, filters);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Store events error:', error);
      res.status(500).json({ error: 'Failed to fetch store events' });
    }
  });
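
  // Example request (illustrative only; the store ID and values are made up):
  //   GET /api/analytics/store/42/events?type=price_drop&days=14&limit=25
  // maps to getStoreChangeEvents(42, { eventType: 'price_drop', days: 14, limit: 25 }).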

  /**
   * GET /api/analytics/store/:id/brands/new
   * Get new brands added to a store
   */
  router.get('/store/:id/brands/new', async (req: Request, res: Response) => {
    try {
      const storeId = parseInt(req.params.id);
      const days = req.query.days ? parseInt(req.query.days as string) : 30;

      const result = await storeService.getNewBrands(storeId, days);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] New brands error:', error);
      res.status(500).json({ error: 'Failed to fetch new brands' });
    }
  });

  /**
   * GET /api/analytics/store/:id/brands/lost
   * Get brands lost from a store
   */
  router.get('/store/:id/brands/lost', async (req: Request, res: Response) => {
    try {
      const storeId = parseInt(req.params.id);
      const days = req.query.days ? parseInt(req.query.days as string) : 30;

      const result = await storeService.getLostBrands(storeId, days);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Lost brands error:', error);
      res.status(500).json({ error: 'Failed to fetch lost brands' });
    }
  });

  /**
   * GET /api/analytics/store/:id/products/changes
   * Get product changes for a store
   */
  router.get('/store/:id/products/changes', async (req: Request, res: Response) => {
    try {
      const storeId = parseInt(req.params.id);
      const changeType = req.query.type as 'added' | 'discontinued' | 'price_drop' | 'price_increase' | 'restocked' | 'out_of_stock' | undefined;
      const days = req.query.days ? parseInt(req.query.days as string) : 7;

      const result = await storeService.getProductChanges(storeId, changeType, days);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Product changes error:', error);
      res.status(500).json({ error: 'Failed to fetch product changes' });
    }
  });

  /**
   * GET /api/analytics/store/leaderboard/:category
   * Get category leaderboard across stores
   */
  router.get('/store/leaderboard/:category', async (req: Request, res: Response) => {
    try {
      const category = decodeURIComponent(req.params.category);
      const limit = req.query.limit ? parseInt(req.query.limit as string) : 20;

      const result = await storeService.getCategoryLeaderboard(category, limit);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Leaderboard error:', error);
      res.status(500).json({ error: 'Failed to fetch leaderboard' });
    }
  });

  /**
   * GET /api/analytics/store/most-active
   * Get most active stores (by changes)
   */
  router.get('/store/most-active', async (req: Request, res: Response) => {
    try {
      const days = req.query.days ? parseInt(req.query.days as string) : 7;
      const limit = req.query.limit ? parseInt(req.query.limit as string) : 10;

      const result = await storeService.getMostActiveStores(days, limit);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Most active error:', error);
      res.status(500).json({ error: 'Failed to fetch active stores' });
    }
  });

  /**
   * GET /api/analytics/store/compare
   * Compare two stores
   */
  router.get('/store/compare', async (req: Request, res: Response) => {
    try {
      const store1 = parseInt(req.query.store1 as string);
      const store2 = parseInt(req.query.store2 as string);

      if (!store1 || !store2) {
        return res.status(400).json({ error: 'Both store1 and store2 are required' });
      }

      const result = await storeService.compareStores(store1, store2);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Compare stores error:', error);
      res.status(500).json({ error: 'Failed to compare stores' });
    }
  });

  // ============================================================
  // BRAND OPPORTUNITY / RISK
  // ============================================================

  /**
   * GET /api/analytics/brand/:name/opportunity
   * Get full opportunity analysis for a brand
   */
  router.get('/brand/:name/opportunity', async (req: Request, res: Response) => {
    try {
      const brandName = decodeURIComponent(req.params.name);
      const result = await brandOpportunityService.getBrandOpportunity(brandName);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Brand opportunity error:', error);
      res.status(500).json({ error: 'Failed to fetch brand opportunity' });
    }
  });

  /**
   * GET /api/analytics/brand/:name/position
   * Get market position summary for a brand
   */
  router.get('/brand/:name/position', async (req: Request, res: Response) => {
    try {
      const brandName = decodeURIComponent(req.params.name);
      const result = await brandOpportunityService.getMarketPositionSummary(brandName);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Brand position error:', error);
      res.status(500).json({ error: 'Failed to fetch brand position' });
    }
  });

  // ============================================================
  // ALERTS
  // ============================================================

  /**
   * GET /api/analytics/alerts
   * Get analytics alerts
   */
  router.get('/alerts', async (req: Request, res: Response) => {
    try {
      const filters = {
        brandName: req.query.brand as string | undefined,
        storeId: req.query.storeId ? parseInt(req.query.storeId as string) : undefined,
        alertType: req.query.type as string | undefined,
        unreadOnly: req.query.unreadOnly === 'true',
        limit: req.query.limit ? parseInt(req.query.limit as string) : 50,
      };

      const result = await brandOpportunityService.getAlerts(filters);
      res.json(result);
    } catch (error) {
      console.error('[Analytics] Alerts error:', error);
      res.status(500).json({ error: 'Failed to fetch alerts' });
    }
  });
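
  // Example request (illustrative only; the brand name is made up):
  //   GET /api/analytics/alerts?brand=Acme&unreadOnly=true&limit=20
  // returns the 20 most recent unread alerts for the brand "Acme".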

  /**
   * POST /api/analytics/alerts/mark-read
   * Mark alerts as read
   */
  router.post('/alerts/mark-read', async (req: Request, res: Response) => {
    try {
      const { alertIds } = req.body;

      if (!Array.isArray(alertIds)) {
        return res.status(400).json({ error: 'alertIds must be an array' });
      }

      await brandOpportunityService.markAlertsRead(alertIds);
      res.json({ success: true });
    } catch (error) {
      console.error('[Analytics] Mark read error:', error);
      res.status(500).json({ error: 'Failed to mark alerts as read' });
    }
  });

  // ============================================================
  // CACHE MANAGEMENT
  // ============================================================

  /**
   * GET /api/analytics/cache/stats
   * Get cache statistics
   */
  router.get('/cache/stats', async (_req: Request, res: Response) => {
    try {
      const stats = await cache.getStats();
      res.json(stats);
    } catch (error) {
      console.error('[Analytics] Cache stats error:', error);
      res.status(500).json({ error: 'Failed to get cache stats' });
    }
  });

  /**
   * POST /api/analytics/cache/clear
   * Clear cache (admin only)
   */
  router.post('/cache/clear', async (req: Request, res: Response) => {
    try {
      const pattern = req.query.pattern as string | undefined;

      if (pattern) {
        const cleared = await cache.invalidatePattern(pattern);
        res.json({ success: true, clearedCount: cleared });
      } else {
        await cache.cleanExpired();
        res.json({ success: true, message: 'Expired entries cleaned' });
      }
    } catch (error) {
      console.error('[Analytics] Cache clear error:', error);
      res.status(500).json({ error: 'Failed to clear cache' });
    }
  });

  // ============================================================
  // SNAPSHOT CAPTURE (for cron/scheduled jobs)
  // ============================================================

  /**
   * POST /api/analytics/snapshots/capture
   * Capture daily snapshots (run by scheduler)
   */
  router.post('/snapshots/capture', async (_req: Request, res: Response) => {
    try {
      const [brandResult, categoryResult] = await Promise.all([
        pool.query('SELECT capture_brand_snapshots() as count'),
        pool.query('SELECT capture_category_snapshots() as count'),
      ]);

      res.json({
        success: true,
        brandSnapshots: parseInt(brandResult.rows[0]?.count || '0'),
        categorySnapshots: parseInt(categoryResult.rows[0]?.count || '0'),
      });
    } catch (error) {
      console.error('[Analytics] Snapshot capture error:', error);
      res.status(500).json({ error: 'Failed to capture snapshots' });
    }
  });
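
  // Illustrative scheduler wiring (hypothetical host; not part of this diff):
  //   0 2 * * * curl -s -X POST https://<host>/api/analytics/snapshots/capture
  // would capture brand and category snapshots once per day at 02:00.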

  return router;
}
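
// A minimal mounting sketch (illustration only -- the factory name, import
// path, and service wiring are assumptions; the real bootstrap lives
// elsewhere in the repo):
//
//   import express from 'express';
//   import { Pool } from 'pg';
//
//   const pool = new Pool({ connectionString: process.env.DATABASE_URL });
//   const app = express();
//   app.use(express.json());
//   app.use('/api/analytics', createAnalyticsRouter(pool)); // hypothetical factory name
//   app.listen(3000);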
File diff suppressed because it is too large
@@ -1,486 +0,0 @@
#!/usr/bin/env npx tsx
/**
 * Crawler Reliability Stress Test
 *
 * Simulates various failure scenarios to test:
 * - Retry logic with exponential backoff
 * - Error taxonomy classification
 * - Self-healing (proxy/UA rotation)
 * - Status transitions (active -> degraded -> failed)
 * - Minimum crawl gap enforcement
 *
 * Phase 1: Crawler Reliability & Stabilization
 *
 * Usage:
 *   DATABASE_URL="postgresql://..." npx tsx src/dutchie-az/scripts/stress-test.ts [test-name]
 *
 * Available tests:
 *   error      - Test error taxonomy classification
 *   retry      - Test retry manager with various error types
 *   backoff    - Test exponential backoff calculation
 *   status     - Test status transitions
 *   validation - Test store config validation
 *   rotation   - Test proxy/UA rotation
 *   withRetry  - Test the withRetry helper
 *   gap        - Test minimum crawl gap enforcement
 *   metadata   - Test error metadata
 *   all        - Run all tests
 */

import {
  CrawlErrorCode,
  classifyError,
  isRetryable,
  shouldRotateProxy,
  shouldRotateUserAgent,
  getBackoffMultiplier,
  getErrorMetadata,
} from '../services/error-taxonomy';

import {
  RetryManager,
  withRetry,
  calculateNextCrawlDelay,
  calculateNextCrawlAt,
  determineCrawlStatus,
  shouldAttemptRecovery,
  sleep,
} from '../services/retry-manager';

import {
  UserAgentRotator,
  USER_AGENTS,
} from '../services/proxy-rotator';

import {
  validateStoreConfig,
  isCrawlable,
  DEFAULT_CONFIG,
  RawStoreConfig,
} from '../services/store-validator';

// ============================================================
// TEST UTILITIES
// ============================================================

let testsPassed = 0;
let testsFailed = 0;

function assert(condition: boolean, message: string): void {
  if (condition) {
    console.log(`  ✓ ${message}`);
    testsPassed++;
  } else {
    console.log(`  ✗ ${message}`);
    testsFailed++;
  }
}

function section(name: string): void {
  console.log(`\n${'='.repeat(60)}`);
  console.log(`TEST: ${name}`);
  console.log('='.repeat(60));
}

// ============================================================
// TEST: Error Classification
// ============================================================

function testErrorClassification(): void {
  section('Error Classification');

  // HTTP status codes
  assert(classifyError(null, 429) === CrawlErrorCode.RATE_LIMITED, '429 -> RATE_LIMITED');
  assert(classifyError(null, 407) === CrawlErrorCode.BLOCKED_PROXY, '407 -> BLOCKED_PROXY');
  assert(classifyError(null, 401) === CrawlErrorCode.AUTH_FAILED, '401 -> AUTH_FAILED');
  assert(classifyError(null, 403) === CrawlErrorCode.AUTH_FAILED, '403 -> AUTH_FAILED');
  assert(classifyError(null, 503) === CrawlErrorCode.SERVICE_UNAVAILABLE, '503 -> SERVICE_UNAVAILABLE');
  assert(classifyError(null, 500) === CrawlErrorCode.SERVER_ERROR, '500 -> SERVER_ERROR');

  // Error messages
  assert(classifyError('rate limit exceeded') === CrawlErrorCode.RATE_LIMITED, 'rate limit message -> RATE_LIMITED');
  assert(classifyError('request timed out') === CrawlErrorCode.TIMEOUT, 'timeout message -> TIMEOUT');
  assert(classifyError('proxy blocked') === CrawlErrorCode.BLOCKED_PROXY, 'proxy blocked -> BLOCKED_PROXY');
  assert(classifyError('ECONNREFUSED') === CrawlErrorCode.NETWORK_ERROR, 'ECONNREFUSED -> NETWORK_ERROR');
  assert(classifyError('ENOTFOUND') === CrawlErrorCode.DNS_ERROR, 'ENOTFOUND -> DNS_ERROR');
  assert(classifyError('selector not found') === CrawlErrorCode.HTML_CHANGED, 'selector error -> HTML_CHANGED');
  assert(classifyError('JSON parse error') === CrawlErrorCode.PARSE_ERROR, 'parse error -> PARSE_ERROR');
  assert(classifyError('0 products found') === CrawlErrorCode.NO_PRODUCTS, 'no products -> NO_PRODUCTS');

  // Retryability
  assert(isRetryable(CrawlErrorCode.RATE_LIMITED) === true, 'RATE_LIMITED is retryable');
  assert(isRetryable(CrawlErrorCode.TIMEOUT) === true, 'TIMEOUT is retryable');
  assert(isRetryable(CrawlErrorCode.HTML_CHANGED) === false, 'HTML_CHANGED is NOT retryable');
  assert(isRetryable(CrawlErrorCode.INVALID_CONFIG) === false, 'INVALID_CONFIG is NOT retryable');

  // Rotation decisions
  assert(shouldRotateProxy(CrawlErrorCode.BLOCKED_PROXY) === true, 'BLOCKED_PROXY -> rotate proxy');
  assert(shouldRotateProxy(CrawlErrorCode.RATE_LIMITED) === true, 'RATE_LIMITED -> rotate proxy');
  assert(shouldRotateUserAgent(CrawlErrorCode.AUTH_FAILED) === true, 'AUTH_FAILED -> rotate UA');
}

// ============================================================
// TEST: Retry Manager
// ============================================================

function testRetryManager(): void {
  section('Retry Manager');

  const manager = new RetryManager({ maxRetries: 3, baseBackoffMs: 100 });

  // Initial state
  assert(manager.shouldAttempt() === true, 'Should attempt initially');
  assert(manager.getAttemptNumber() === 1, 'Attempt number starts at 1');

  // First attempt
  manager.recordAttempt();
  assert(manager.getAttemptNumber() === 2, 'Attempt number increments');

  // Evaluate retryable error
  const decision1 = manager.evaluateError(new Error('rate limit exceeded'), 429);
  assert(decision1.shouldRetry === true, 'Should retry on rate limit');
  assert(decision1.errorCode === CrawlErrorCode.RATE_LIMITED, 'Error code is RATE_LIMITED');
  assert(decision1.rotateProxy === true, 'Should rotate proxy');
  assert(decision1.backoffMs > 0, 'Backoff is positive');

  // More attempts
  manager.recordAttempt();
  manager.recordAttempt();

  // Now at max retries
  const decision2 = manager.evaluateError(new Error('timeout'), 504);
  assert(decision2.shouldRetry === true, 'Should still retry (at limit but not exceeded)');

  manager.recordAttempt();
  const decision3 = manager.evaluateError(new Error('timeout'));
  assert(decision3.shouldRetry === false, 'Should NOT retry after max');
  assert(decision3.reason.includes('exhausted'), 'Reason mentions exhausted');

  // Reset
  manager.reset();
  assert(manager.shouldAttempt() === true, 'Should attempt after reset');
  assert(manager.getAttemptNumber() === 1, 'Attempt number resets');

  // Non-retryable error
  const manager2 = new RetryManager({ maxRetries: 3 });
  manager2.recordAttempt();
  const nonRetryable = manager2.evaluateError(new Error('HTML structure changed'));
  assert(nonRetryable.shouldRetry === false, 'Non-retryable error stops immediately');
  assert(nonRetryable.errorCode === CrawlErrorCode.HTML_CHANGED, 'Error code is HTML_CHANGED');
}

// ============================================================
// TEST: Exponential Backoff
// ============================================================

function testExponentialBackoff(): void {
  section('Exponential Backoff');

  // Calculate next crawl delay
  const delay0 = calculateNextCrawlDelay(0, 240); // No failures
  const delay1 = calculateNextCrawlDelay(1, 240); // 1 failure
  const delay2 = calculateNextCrawlDelay(2, 240); // 2 failures
  const delay3 = calculateNextCrawlDelay(3, 240); // 3 failures
  const delay5 = calculateNextCrawlDelay(5, 240); // 5 failures (should cap)

  console.log(`  Delay with 0 failures: ${delay0} minutes`);
  console.log(`  Delay with 1 failure:  ${delay1} minutes`);
  console.log(`  Delay with 2 failures: ${delay2} minutes`);
  console.log(`  Delay with 3 failures: ${delay3} minutes`);
  console.log(`  Delay with 5 failures: ${delay5} minutes`);

  assert(delay1 > delay0, 'Delay increases with failures');
  assert(delay2 > delay1, 'Delay keeps increasing');
  assert(delay3 > delay2, 'More delay with more failures');
  // With jitter, exact values vary but ratio should be close to 2x
  assert(delay5 <= 240 * 4 * 1.2, 'Delay is capped at max multiplier');

  // Next crawl time calculation
  const now = new Date();
  const nextAt = calculateNextCrawlAt(2, 240);
  assert(nextAt > now, 'Next crawl is in future');
  assert(nextAt.getTime() - now.getTime() > 240 * 60 * 1000, 'Includes backoff');
}
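
// A minimal sketch consistent with the assertions above (illustration only;
// the real formula lives in ../services/retry-manager). Assumes the delay
// doubles per consecutive failure, capped at 4x base, with a small jitter:
function sketchNextCrawlDelay(consecutiveFailures: number, baseMinutes: number): number {
  const multiplier = Math.min(Math.pow(2, consecutiveFailures), 4); // cap matches the 4x assertion
  const jitter = 0.9 + Math.random() * 0.2; // assumed +/-10% jitter band
  return Math.round(baseMinutes * multiplier * jitter);
}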

// ============================================================
// TEST: Status Transitions
// ============================================================

function testStatusTransitions(): void {
  section('Status Transitions');

  // Active status
  assert(determineCrawlStatus(0) === 'active', '0 failures -> active');
  assert(determineCrawlStatus(1) === 'active', '1 failure -> active');
  assert(determineCrawlStatus(2) === 'active', '2 failures -> active');

  // Degraded status
  assert(determineCrawlStatus(3) === 'degraded', '3 failures -> degraded');
  assert(determineCrawlStatus(5) === 'degraded', '5 failures -> degraded');
  assert(determineCrawlStatus(9) === 'degraded', '9 failures -> degraded');

  // Failed status
  assert(determineCrawlStatus(10) === 'failed', '10 failures -> failed');
  assert(determineCrawlStatus(15) === 'failed', '15 failures -> failed');

  // Custom thresholds
  const customStatus = determineCrawlStatus(5, { degraded: 5, failed: 8 });
  assert(customStatus === 'degraded', 'Custom threshold: 5 -> degraded');

  // Recovery check
  const recentFailure = new Date(Date.now() - 1 * 60 * 60 * 1000); // 1 hour ago
  const oldFailure = new Date(Date.now() - 48 * 60 * 60 * 1000); // 48 hours ago

  assert(shouldAttemptRecovery(recentFailure, 1) === false, 'No recovery for recent failure');
  assert(shouldAttemptRecovery(oldFailure, 1) === true, 'Recovery allowed for old failure');
  assert(shouldAttemptRecovery(null, 0) === true, 'Recovery allowed if no previous failure');
}
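
// A minimal sketch consistent with the assertions above (illustration only;
// the real implementation is in ../services/retry-manager):
function sketchCrawlStatus(
  failures: number,
  thresholds = { degraded: 3, failed: 10 }
): 'active' | 'degraded' | 'failed' {
  if (failures >= thresholds.failed) return 'failed';
  if (failures >= thresholds.degraded) return 'degraded';
  return 'active';
}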

// ============================================================
// TEST: Store Validation
// ============================================================

function testStoreValidation(): void {
  section('Store Validation');

  // Valid config
  const validConfig: RawStoreConfig = {
    id: 1,
    name: 'Test Store',
    platformDispensaryId: '123abc',
    menuType: 'dutchie',
  };
  const validResult = validateStoreConfig(validConfig);
  assert(validResult.isValid === true, 'Valid config passes');
  assert(validResult.config !== null, 'Valid config returns config');
  assert(validResult.config?.slug === 'test-store', 'Slug is generated');

  // Missing required fields
  const missingId: RawStoreConfig = {
    id: 0,
    name: 'Test',
    platformDispensaryId: '123',
    menuType: 'dutchie',
  };
  const missingIdResult = validateStoreConfig(missingId);
  assert(missingIdResult.isValid === false, 'Missing ID fails');

  // Missing platform ID
  const missingPlatform: RawStoreConfig = {
    id: 1,
    name: 'Test',
    menuType: 'dutchie',
  };
  const missingPlatformResult = validateStoreConfig(missingPlatform);
  assert(missingPlatformResult.isValid === false, 'Missing platform ID fails');

  // Unknown menu type
  const unknownMenu: RawStoreConfig = {
    id: 1,
    name: 'Test',
    platformDispensaryId: '123',
    menuType: 'unknown',
  };
  const unknownMenuResult = validateStoreConfig(unknownMenu);
  assert(unknownMenuResult.isValid === false, 'Unknown menu type fails');

  // Crawlable check
  assert(isCrawlable(validConfig) === true, 'Valid config is crawlable');
  assert(isCrawlable(missingPlatform) === false, 'Missing platform not crawlable');
  assert(isCrawlable({ ...validConfig, crawlStatus: 'failed' }) === false, 'Failed status not crawlable');
  assert(isCrawlable({ ...validConfig, crawlStatus: 'paused' }) === false, 'Paused status not crawlable');
}

// ============================================================
// TEST: User Agent Rotation
// ============================================================

function testUserAgentRotation(): void {
  section('User Agent Rotation');

  const rotator = new UserAgentRotator();

  const first = rotator.getCurrent();
  const second = rotator.getNext();
  const third = rotator.getNext();

  assert(first !== second, 'User agents rotate');
  assert(second !== third, 'User agents keep rotating');
  assert(USER_AGENTS.includes(first), 'Returns valid UA');
  assert(USER_AGENTS.includes(second), 'Returns valid UA');

  // Random UA
  const random = rotator.getRandom();
  assert(USER_AGENTS.includes(random), 'Random returns valid UA');

  // Count
  assert(rotator.getCount() === USER_AGENTS.length, 'Reports correct count');
}

// ============================================================
// TEST: WithRetry Helper
// ============================================================

async function testWithRetryHelper(): Promise<void> {
  section('WithRetry Helper');

  // Successful on first try
  let attempts = 0;
  const successResult = await withRetry(async () => {
    attempts++;
    return 'success';
  }, { maxRetries: 3 });
  assert(attempts === 1, 'Succeeds on first try');
  assert(successResult.result === 'success', 'Returns result');

  // Fails then succeeds
  let failThenSucceedAttempts = 0;
  const failThenSuccessResult = await withRetry(async () => {
    failThenSucceedAttempts++;
    if (failThenSucceedAttempts < 3) {
      throw new Error('temporary error');
    }
    return 'finally succeeded';
  }, { maxRetries: 5, baseBackoffMs: 10 });
  assert(failThenSucceedAttempts === 3, 'Retries until success');
  assert(failThenSuccessResult.result === 'finally succeeded', 'Returns final result');
  assert(failThenSuccessResult.summary.attemptsMade === 3, 'Summary tracks attempts');

  // Exhausts retries
  let alwaysFailAttempts = 0;
  try {
    await withRetry(async () => {
      alwaysFailAttempts++;
      throw new Error('always fails');
    }, { maxRetries: 2, baseBackoffMs: 10 });
    assert(false, 'Should have thrown');
  } catch (error: any) {
    assert(alwaysFailAttempts === 3, 'Attempts all retries'); // 1 initial + 2 retries
    assert(error.name === 'RetryExhaustedError', 'Throws RetryExhaustedError');
  }

  // Non-retryable error stops immediately
  let nonRetryableAttempts = 0;
  try {
    await withRetry(async () => {
      nonRetryableAttempts++;
      const err = new Error('HTML structure changed - selector not found');
      throw err;
    }, { maxRetries: 3, baseBackoffMs: 10 });
    assert(false, 'Should have thrown');
  } catch {
    assert(nonRetryableAttempts === 1, 'Non-retryable stops immediately');
  }
}

// ============================================================
// TEST: Minimum Crawl Gap
// ============================================================

function testMinimumCrawlGap(): void {
  section('Minimum Crawl Gap');

  // Default config
  assert(DEFAULT_CONFIG.minCrawlGapMinutes === 2, 'Default gap is 2 minutes');
  assert(DEFAULT_CONFIG.crawlFrequencyMinutes === 240, 'Default frequency is 4 hours');

  // Gap calculation
  const gapMs = DEFAULT_CONFIG.minCrawlGapMinutes * 60 * 1000;
  assert(gapMs === 120000, 'Gap is 2 minutes in ms');

  console.log('  Note: Gap enforcement is tested at DB level (trigger) and application level');
}

// ============================================================
// TEST: Error Metadata
// ============================================================

function testErrorMetadata(): void {
  section('Error Metadata');

  // RATE_LIMITED
  const rateLimited = getErrorMetadata(CrawlErrorCode.RATE_LIMITED);
  assert(rateLimited.retryable === true, 'RATE_LIMITED is retryable');
  assert(rateLimited.rotateProxy === true, 'RATE_LIMITED rotates proxy');
  assert(rateLimited.backoffMultiplier === 2.0, 'RATE_LIMITED has 2x backoff');
  assert(rateLimited.severity === 'medium', 'RATE_LIMITED is medium severity');

  // HTML_CHANGED
  const htmlChanged = getErrorMetadata(CrawlErrorCode.HTML_CHANGED);
  assert(htmlChanged.retryable === false, 'HTML_CHANGED is NOT retryable');
  assert(htmlChanged.severity === 'high', 'HTML_CHANGED is high severity');

  // INVALID_CONFIG
  const invalidConfig = getErrorMetadata(CrawlErrorCode.INVALID_CONFIG);
  assert(invalidConfig.retryable === false, 'INVALID_CONFIG is NOT retryable');
  assert(invalidConfig.severity === 'critical', 'INVALID_CONFIG is critical');
}

// ============================================================
// MAIN
// ============================================================

async function runTests(testName?: string): Promise<void> {
  console.log('\n');
  console.log('╔══════════════════════════════════════════════════════════╗');
  console.log('║        CRAWLER RELIABILITY STRESS TEST - PHASE 1         ║');
  console.log('╚══════════════════════════════════════════════════════════╝');

  const allTests = !testName || testName === 'all';

  if (allTests || testName === 'error' || testName === 'classification') {
    testErrorClassification();
  }

  if (allTests || testName === 'retry') {
    testRetryManager();
  }

  if (allTests || testName === 'backoff') {
    testExponentialBackoff();
  }

  if (allTests || testName === 'status') {
    testStatusTransitions();
  }

  if (allTests || testName === 'validation' || testName === 'store') {
    testStoreValidation();
  }

  if (allTests || testName === 'rotation' || testName === 'ua') {
    testUserAgentRotation();
  }

  if (allTests || testName === 'withRetry' || testName === 'helper') {
    await testWithRetryHelper();
  }

  if (allTests || testName === 'gap') {
    testMinimumCrawlGap();
  }

  if (allTests || testName === 'metadata') {
    testErrorMetadata();
  }

  // Summary
  console.log('\n');
  console.log('═'.repeat(60));
  console.log('SUMMARY');
  console.log('═'.repeat(60));
  console.log(`  Passed: ${testsPassed}`);
  console.log(`  Failed: ${testsFailed}`);
  console.log(`  Total:  ${testsPassed + testsFailed}`);

  if (testsFailed > 0) {
    console.log('\n❌ SOME TESTS FAILED\n');
    process.exit(1);
  } else {
    console.log('\n✅ ALL TESTS PASSED\n');
    process.exit(0);
  }
}

// Run tests
const testName = process.argv[2];
runTests(testName).catch((error) => {
  console.error('Fatal error:', error);
  process.exit(1);
});
@@ -1,659 +0,0 @@
/**
 * Brand Opportunity / Risk Analytics Service
 *
 * Provides brand-level opportunity and risk analysis including:
 * - Under/overpriced vs market
 * - Missing SKU opportunities
 * - Stores with declining/growing shelf share
 * - Competitor intrusion alerts
 *
 * Phase 3: Analytics Dashboards
 */

import { Pool } from 'pg';
import { AnalyticsCache, cacheKey } from './cache';

export interface BrandOpportunity {
  brandName: string;
  underpricedVsMarket: PricePosition[];
  overpricedVsMarket: PricePosition[];
  missingSkuOpportunities: MissingSkuOpportunity[];
  storesWithDecliningShelfShare: StoreShelfShareChange[];
  storesWithGrowingShelfShare: StoreShelfShareChange[];
  competitorIntrusionAlerts: CompetitorAlert[];
  overallScore: number; // 0-100, higher = more opportunity
  riskScore: number; // 0-100, higher = more risk
}

export interface PricePosition {
  category: string;
  brandAvgPrice: number;
  marketAvgPrice: number;
  priceDifferencePercent: number;
  skuCount: number;
  suggestion: string;
}

export interface MissingSkuOpportunity {
  category: string;
  subcategory: string | null;
  marketSkuCount: number;
  brandSkuCount: number;
  gapPercent: number;
  topCompetitors: string[];
  opportunityScore: number; // 0-100
}

export interface StoreShelfShareChange {
  storeId: number;
  storeName: string;
  city: string;
  state: string;
  currentShelfShare: number;
  previousShelfShare: number;
  changePercent: number;
  currentSkus: number;
  competitors: string[];
}

export interface CompetitorAlert {
  competitorBrand: string;
  storeId: number;
  storeName: string;
  alertType: 'new_entry' | 'expanding' | 'price_undercut';
  details: string;
  severity: 'low' | 'medium' | 'high';
  date: string;
}

export interface MarketPositionSummary {
  brandName: string;
  marketSharePercent: number;
  avgPriceVsMarket: number; // -X% to +X%
  categoryStrengths: Array<{ category: string; shelfSharePercent: number }>;
  categoryWeaknesses: Array<{ category: string; shelfSharePercent: number; marketLeader: string }>;
  growthTrend: 'growing' | 'stable' | 'declining';
  competitorThreats: string[];
}

export class BrandOpportunityService {
  private pool: Pool;
  private cache: AnalyticsCache;

  constructor(pool: Pool, cache: AnalyticsCache) {
    this.pool = pool;
    this.cache = cache;
  }

  /**
   * Get full opportunity analysis for a brand
   */
  async getBrandOpportunity(brandName: string): Promise<BrandOpportunity> {
    const key = cacheKey('brand_opportunity', { brandName });

    return (await this.cache.getOrCompute(key, async () => {
      const [
        underpriced,
        overpriced,
        missingSkus,
        decliningStores,
        growingStores,
        alerts,
      ] = await Promise.all([
        this.getUnderpricedPositions(brandName),
        this.getOverpricedPositions(brandName),
        this.getMissingSkuOpportunities(brandName),
        this.getStoresWithDecliningShare(brandName),
        this.getStoresWithGrowingShare(brandName),
        this.getCompetitorAlerts(brandName),
      ]);

      // Calculate opportunity score (higher = more opportunity)
      const opportunityFactors = [
        missingSkus.length > 0 ? 20 : 0,
        underpriced.length > 0 ? 15 : 0,
        growingStores.length > 5 ? 20 : growingStores.length * 3,
        missingSkus.reduce((sum, m) => sum + m.opportunityScore, 0) / Math.max(1, missingSkus.length) * 0.3,
      ];
      const opportunityScore = Math.min(100, opportunityFactors.reduce((a, b) => a + b, 0));

      // Calculate risk score (higher = more risk)
      const riskFactors = [
        decliningStores.length > 5 ? 30 : decliningStores.length * 5,
        alerts.filter(a => a.severity === 'high').length * 15,
        alerts.filter(a => a.severity === 'medium').length * 8,
        overpriced.length > 3 ? 15 : overpriced.length * 3,
      ];
      const riskScore = Math.min(100, riskFactors.reduce((a, b) => a + b, 0));

      return {
        brandName,
        underpricedVsMarket: underpriced,
        overpricedVsMarket: overpriced,
        missingSkuOpportunities: missingSkus,
        storesWithDecliningShelfShare: decliningStores,
        storesWithGrowingShelfShare: growingStores,
        competitorIntrusionAlerts: alerts,
        overallScore: Math.round(opportunityScore),
        riskScore: Math.round(riskScore),
      };
    }, 30)).data;
  }
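
  // Worked example of the scoring above (hypothetical numbers): a brand with
  // 3 missing-SKU gaps (avg opportunityScore 60), 2 underpriced categories,
  // and 4 stores with growing share scores
  //   20 (gaps exist) + 15 (underpriced) + 4*3 (growing stores) + 60*0.3 (gap quality)
  //   = 20 + 15 + 12 + 18 = 65 of a possible 100.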

  /**
   * Get categories where brand is underpriced vs market
   */
  async getUnderpricedPositions(brandName: string): Promise<PricePosition[]> {
    const result = await this.pool.query(`
      WITH brand_prices AS (
        SELECT
          type as category,
          AVG(extract_min_price(latest_raw_payload)) as brand_avg,
          COUNT(*) as sku_count
        FROM dutchie_products
        WHERE brand_name = $1 AND type IS NOT NULL
        GROUP BY type
        HAVING COUNT(*) >= 3
      ),
      market_prices AS (
        SELECT
          type as category,
          AVG(extract_min_price(latest_raw_payload)) as market_avg
        FROM dutchie_products
        WHERE type IS NOT NULL AND brand_name != $1
        GROUP BY type
      )
      SELECT
        bp.category,
        bp.brand_avg,
        mp.market_avg,
        bp.sku_count,
        ((bp.brand_avg - mp.market_avg) / NULLIF(mp.market_avg, 0)) * 100 as diff_pct
      FROM brand_prices bp
      JOIN market_prices mp ON bp.category = mp.category
      WHERE bp.brand_avg < mp.market_avg * 0.9 -- 10% or more below market
        AND bp.brand_avg IS NOT NULL
        AND mp.market_avg IS NOT NULL
      ORDER BY diff_pct
    `, [brandName]);

    return result.rows.map(row => ({
      category: row.category,
      brandAvgPrice: Math.round(parseFloat(row.brand_avg) * 100) / 100,
      marketAvgPrice: Math.round(parseFloat(row.market_avg) * 100) / 100,
      priceDifferencePercent: Math.round(parseFloat(row.diff_pct) * 10) / 10,
      skuCount: parseInt(row.sku_count) || 0,
      suggestion: `Consider price increase - ${Math.abs(Math.round(parseFloat(row.diff_pct)))}% below market average`,
    }));
  }

  /**
   * Get categories where brand is overpriced vs market
   */
  async getOverpricedPositions(brandName: string): Promise<PricePosition[]> {
    const result = await this.pool.query(`
      WITH brand_prices AS (
        SELECT
          type as category,
          AVG(extract_min_price(latest_raw_payload)) as brand_avg,
          COUNT(*) as sku_count
        FROM dutchie_products
        WHERE brand_name = $1 AND type IS NOT NULL
        GROUP BY type
        HAVING COUNT(*) >= 3
      ),
      market_prices AS (
        SELECT
          type as category,
          AVG(extract_min_price(latest_raw_payload)) as market_avg
        FROM dutchie_products
        WHERE type IS NOT NULL AND brand_name != $1
        GROUP BY type
      )
      SELECT
        bp.category,
        bp.brand_avg,
        mp.market_avg,
        bp.sku_count,
        ((bp.brand_avg - mp.market_avg) / NULLIF(mp.market_avg, 0)) * 100 as diff_pct
      FROM brand_prices bp
      JOIN market_prices mp ON bp.category = mp.category
      WHERE bp.brand_avg > mp.market_avg * 1.15 -- 15% or more above market
        AND bp.brand_avg IS NOT NULL
        AND mp.market_avg IS NOT NULL
      ORDER BY diff_pct DESC
    `, [brandName]);

    return result.rows.map(row => ({
      category: row.category,
      brandAvgPrice: Math.round(parseFloat(row.brand_avg) * 100) / 100,
      marketAvgPrice: Math.round(parseFloat(row.market_avg) * 100) / 100,
      priceDifferencePercent: Math.round(parseFloat(row.diff_pct) * 10) / 10,
      skuCount: parseInt(row.sku_count) || 0,
      suggestion: `Price sensitivity risk - ${Math.round(parseFloat(row.diff_pct))}% above market average`,
    }));
  }

  /**
   * Get missing SKU opportunities (category gaps)
   */
  async getMissingSkuOpportunities(brandName: string): Promise<MissingSkuOpportunity[]> {
    const result = await this.pool.query(`
      WITH market_categories AS (
        SELECT
          type as category,
          subcategory,
          COUNT(*) as market_skus,
          ARRAY_AGG(DISTINCT brand_name ORDER BY brand_name) FILTER (WHERE brand_name IS NOT NULL) as top_brands
        FROM dutchie_products
        WHERE type IS NOT NULL
        GROUP BY type, subcategory
        HAVING COUNT(*) >= 20
      ),
      brand_presence AS (
        SELECT
          type as category,
          subcategory,
          COUNT(*) as brand_skus
        FROM dutchie_products
        WHERE brand_name = $1 AND type IS NOT NULL
        GROUP BY type, subcategory
      )
      SELECT
        mc.category,
        mc.subcategory,
        mc.market_skus,
        COALESCE(bp.brand_skus, 0) as brand_skus,
        mc.top_brands[1:5] as competitors
      FROM market_categories mc
      LEFT JOIN brand_presence bp ON mc.category = bp.category
        AND (mc.subcategory = bp.subcategory OR (mc.subcategory IS NULL AND bp.subcategory IS NULL))
      WHERE COALESCE(bp.brand_skus, 0) < mc.market_skus * 0.05 -- Brand has <5% of market presence
      ORDER BY mc.market_skus DESC
      LIMIT 10
    `, [brandName]);

    return result.rows.map(row => {
      const marketSkus = parseInt(row.market_skus) || 0;
      const brandSkus = parseInt(row.brand_skus) || 0;
      const gapPercent = marketSkus > 0 ? ((marketSkus - brandSkus) / marketSkus) * 100 : 100;
      const opportunityScore = Math.min(100, Math.round((marketSkus / 100) * (gapPercent / 100) * 100));

      return {
        category: row.category,
        subcategory: row.subcategory,
        marketSkuCount: marketSkus,
        brandSkuCount: brandSkus,
        gapPercent: Math.round(gapPercent),
        topCompetitors: (row.competitors || []).filter((c: string) => c !== brandName).slice(0, 5),
        opportunityScore,
      };
    });
  }
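
  // Worked example of opportunityScore (hypothetical numbers): a category with
  // 80 market SKUs where the brand holds 4 gives gapPercent = 95 and
  // opportunityScore = min(100, round((80 / 100) * (95 / 100) * 100)) = 76.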

  /**
   * Get stores where brand's shelf share is declining
   */
  async getStoresWithDecliningShare(brandName: string): Promise<StoreShelfShareChange[]> {
    // Use brand_snapshots for historical comparison
    const result = await this.pool.query(`
      WITH current_share AS (
        SELECT
          dp.dispensary_id as store_id,
          d.name as store_name,
          d.city,
          d.state,
          COUNT(*) FILTER (WHERE dp.brand_name = $1) as brand_skus,
          COUNT(*) as total_skus,
          ARRAY_AGG(DISTINCT dp.brand_name) FILTER (WHERE dp.brand_name != $1 AND dp.brand_name IS NOT NULL) as competitors
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        GROUP BY dp.dispensary_id, d.name, d.city, d.state
        HAVING COUNT(*) FILTER (WHERE dp.brand_name = $1) > 0
      )
      SELECT
        cs.store_id,
        cs.store_name,
        cs.city,
        cs.state,
        cs.brand_skus as current_skus,
        cs.total_skus,
        ROUND((cs.brand_skus::NUMERIC / cs.total_skus) * 100, 2) as current_share,
        cs.competitors[1:5] as top_competitors
      FROM current_share cs
      WHERE cs.brand_skus < 10 -- Low presence
      ORDER BY cs.brand_skus
      LIMIT 10
    `, [brandName]);

    return result.rows.map(row => ({
      storeId: row.store_id,
      storeName: row.store_name,
      city: row.city,
      state: row.state,
      currentShelfShare: parseFloat(row.current_share) || 0,
      previousShelfShare: parseFloat(row.current_share) || 0, // Would need historical data
      changePercent: 0,
      currentSkus: parseInt(row.current_skus) || 0,
      competitors: row.top_competitors || [],
    }));
  }

  /**
   * Get stores where brand's shelf share is growing
   */
  async getStoresWithGrowingShare(brandName: string): Promise<StoreShelfShareChange[]> {
    const result = await this.pool.query(`
      WITH store_share AS (
        SELECT
          dp.dispensary_id as store_id,
          d.name as store_name,
          d.city,
          d.state,
          COUNT(*) FILTER (WHERE dp.brand_name = $1) as brand_skus,
          COUNT(*) as total_skus,
          ARRAY_AGG(DISTINCT dp.brand_name) FILTER (WHERE dp.brand_name != $1 AND dp.brand_name IS NOT NULL) as competitors
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        GROUP BY dp.dispensary_id, d.name, d.city, d.state
        HAVING COUNT(*) FILTER (WHERE dp.brand_name = $1) > 0
      )
      SELECT
        ss.store_id,
        ss.store_name,
        ss.city,
        ss.state,
        ss.brand_skus as current_skus,
        ss.total_skus,
        ROUND((ss.brand_skus::NUMERIC / ss.total_skus) * 100, 2) as current_share,
        ss.competitors[1:5] as top_competitors
      FROM store_share ss
      ORDER BY current_share DESC
      LIMIT 10
    `, [brandName]);

    return result.rows.map(row => ({
      storeId: row.store_id,
      storeName: row.store_name,
      city: row.city,
      state: row.state,
      currentShelfShare: parseFloat(row.current_share) || 0,
      previousShelfShare: parseFloat(row.current_share) || 0,
      changePercent: 0,
      currentSkus: parseInt(row.current_skus) || 0,
      competitors: row.top_competitors || [],
    }));
  }

  /**
   * Get competitor intrusion alerts
   */
  async getCompetitorAlerts(brandName: string): Promise<CompetitorAlert[]> {
    // Check for competitor entries in stores where this brand has presence
    const result = await this.pool.query(`
      WITH brand_stores AS (
        SELECT DISTINCT dispensary_id
        FROM dutchie_products
        WHERE brand_name = $1
      ),
      competitor_presence AS (
        SELECT
          dp.brand_name as competitor,
          dp.dispensary_id as store_id,
          d.name as store_name,
          COUNT(*) as sku_count,
          MAX(dp.created_at) as latest_add
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        WHERE dp.dispensary_id IN (SELECT dispensary_id FROM brand_stores)
          AND dp.brand_name != $1
          AND dp.brand_name IS NOT NULL
          AND dp.created_at >= NOW() - INTERVAL '30 days'
        GROUP BY dp.brand_name, dp.dispensary_id, d.name
        HAVING COUNT(*) >= 5
      )
      SELECT
        competitor,
        store_id,
        store_name,
        sku_count,
        latest_add
      FROM competitor_presence
      ORDER BY sku_count DESC
      LIMIT 10
    `, [brandName]);

    return result.rows.map(row => {
      const skuCount = parseInt(row.sku_count) || 0;
      let severity: 'low' | 'medium' | 'high' = 'low';
      if (skuCount >= 20) severity = 'high';
      else if (skuCount >= 10) severity = 'medium';

      return {
        competitorBrand: row.competitor,
        storeId: row.store_id,
        storeName: row.store_name,
        alertType: 'expanding' as const,
        details: `${row.competitor} has ${skuCount} SKUs in ${row.store_name}`,
        severity,
        date: new Date(row.latest_add).toISOString().split('T')[0],
      };
    });
  }

  /**
   * Get market position summary for a brand
   */
  async getMarketPositionSummary(brandName: string): Promise<MarketPositionSummary> {
    const key = cacheKey('market_position', { brandName });

    return (await this.cache.getOrCompute(key, async () => {
      const [shareResult, priceResult, categoryResult, threatResult] = await Promise.all([
        // Market share
        this.pool.query(`
          SELECT
            (SELECT COUNT(*) FROM dutchie_products WHERE brand_name = $1) as brand_count,
            (SELECT COUNT(*) FROM dutchie_products) as total_count
        `, [brandName]),

        // Price vs market
        this.pool.query(`
          SELECT
            (SELECT AVG(extract_min_price(latest_raw_payload)) FROM dutchie_products WHERE brand_name = $1) as brand_avg,
            (SELECT AVG(extract_min_price(latest_raw_payload)) FROM dutchie_products WHERE brand_name != $1) as market_avg
        `, [brandName]),

        // Category strengths/weaknesses
        this.pool.query(`
          WITH brand_by_cat AS (
            SELECT type as category, COUNT(*) as brand_count
            FROM dutchie_products
            WHERE brand_name = $1 AND type IS NOT NULL
            GROUP BY type
          ),
          market_by_cat AS (
            SELECT type as category, COUNT(*) as total_count
            FROM dutchie_products WHERE type IS NOT NULL
            GROUP BY type
          ),
          leaders AS (
            SELECT type as category, brand_name, COUNT(*) as cnt,
                   RANK() OVER (PARTITION BY type ORDER BY COUNT(*) DESC) as rnk
            FROM dutchie_products WHERE type IS NOT NULL AND brand_name IS NOT NULL
            GROUP BY type, brand_name
          )
          SELECT
            mc.category,
            COALESCE(bc.brand_count, 0) as brand_count,
            mc.total_count,
            ROUND((COALESCE(bc.brand_count, 0)::NUMERIC / mc.total_count) * 100, 2) as share_pct,
            (SELECT brand_name FROM leaders WHERE category = mc.category AND rnk = 1) as leader
          FROM market_by_cat mc
          LEFT JOIN brand_by_cat bc ON mc.category = bc.category
          ORDER BY share_pct DESC
        `, [brandName]),

        // Top competitors
        this.pool.query(`
          SELECT brand_name, COUNT(*) as cnt
          FROM dutchie_products
          WHERE brand_name IS NOT NULL AND brand_name != $1
          GROUP BY brand_name
          ORDER BY cnt DESC
          LIMIT 5
        `, [brandName]),
      ]);

      const brandCount = parseInt(shareResult.rows[0]?.brand_count) || 0;
      const totalCount = parseInt(shareResult.rows[0]?.total_count) || 1;
      const marketSharePercent = Math.round((brandCount / totalCount) * 1000) / 10;

      const brandAvg = parseFloat(priceResult.rows[0]?.brand_avg) || 0;
      const marketAvg = parseFloat(priceResult.rows[0]?.market_avg) || 1;
      const avgPriceVsMarket = Math.round(((brandAvg - marketAvg) / marketAvg) * 1000) / 10;

      const categories = categoryResult.rows;
      const strengths = categories
        .filter(c => parseFloat(c.share_pct) > 5)
        .map(c => ({ category: c.category, shelfSharePercent: parseFloat(c.share_pct) }));

      const weaknesses = categories
        .filter(c => parseFloat(c.share_pct) < 2 && c.leader !== brandName)
        .map(c => ({
          category: c.category,
          shelfSharePercent: parseFloat(c.share_pct),
          marketLeader: c.leader || 'Unknown',
        }));

      return {
        brandName,
        marketSharePercent,
        avgPriceVsMarket,
        categoryStrengths: strengths.slice(0, 5),
        categoryWeaknesses: weaknesses.slice(0, 5),
        growthTrend: 'stable' as const, // Would need historical data
        competitorThreats: threatResult.rows.map(r => r.brand_name),
      };
    }, 30)).data;
  }

  /**
   * Create an analytics alert
   */
  async createAlert(alert: {
    alertType: string;
    severity: 'info' | 'warning' | 'critical';
    title: string;
    description?: string;
    storeId?: number;
    brandName?: string;
    productId?: number;
    category?: string;
    metadata?: Record<string, unknown>;
  }): Promise<void> {
    await this.pool.query(`
      INSERT INTO analytics_alerts
        (alert_type, severity, title, description, store_id, brand_name, product_id, category, metadata)
      VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
    `, [
      alert.alertType,
      alert.severity,
      alert.title,
      alert.description || null,
      alert.storeId || null,
      alert.brandName || null,
      alert.productId || null,
      alert.category || null,
      alert.metadata ? JSON.stringify(alert.metadata) : null,
    ]);
  }

  /**
   * Get recent alerts
   */
  async getAlerts(filters: {
    brandName?: string;
    storeId?: number;
    alertType?: string;
    unreadOnly?: boolean;
    limit?: number;
  } = {}): Promise<Array<{
    id: number;
    alertType: string;
    severity: string;
    title: string;
    description: string | null;
    storeName: string | null;
    brandName: string | null;
    createdAt: string;
    isRead: boolean;
  }>> {
    const { brandName, storeId, alertType, unreadOnly = false, limit = 50 } = filters;
    const params: (string | number | boolean)[] = [limit];
    const conditions: string[] = [];
    let paramIndex = 2;

    if (brandName) {
      conditions.push(`a.brand_name = $${paramIndex++}`);
      params.push(brandName);
    }
    if (storeId) {
      conditions.push(`a.store_id = $${paramIndex++}`);
      params.push(storeId);
    }
    if (alertType) {
      conditions.push(`a.alert_type = $${paramIndex++}`);
      params.push(alertType);
    }
    if (unreadOnly) {
      conditions.push('a.is_read = false');
    }

    const whereClause = conditions.length > 0
      ? 'WHERE ' + conditions.join(' AND ')
      : '';

    const result = await this.pool.query(`
      SELECT
        a.id,
        a.alert_type,
        a.severity,
        a.title,
        a.description,
        d.name as store_name,
        a.brand_name,
        a.created_at,
        a.is_read
      FROM analytics_alerts a
      LEFT JOIN dispensaries d ON a.store_id = d.id
      ${whereClause}
      ORDER BY a.created_at DESC
      LIMIT $1
    `, params);

    return result.rows.map(row => ({
      id: row.id,
      alertType: row.alert_type,
      severity: row.severity,
      title: row.title,
      description: row.description,
      storeName: row.store_name,
      brandName: row.brand_name,
      createdAt: row.created_at.toISOString(),
      isRead: row.is_read,
    }));
  }

  /**
   * Mark alerts as read
   */
  async markAlertsRead(alertIds: number[]): Promise<void> {
    if (alertIds.length === 0) return;

    await this.pool.query(`
      UPDATE analytics_alerts
      SET is_read = true
      WHERE id = ANY($1)
    `, [alertIds]);
  }
}
@@ -1,227 +0,0 @@
/**
 * Analytics Cache Service
 *
 * Provides caching layer for expensive analytics queries.
 * Uses PostgreSQL for persistence with configurable TTLs.
 *
 * Phase 3: Analytics Dashboards
 */

import { Pool } from 'pg';

export interface CacheEntry<T = unknown> {
  key: string;
  data: T;
  computedAt: Date;
  expiresAt: Date;
  queryTimeMs?: number;
}

export interface CacheConfig {
  defaultTtlMinutes: number;
}

const DEFAULT_CONFIG: CacheConfig = {
  defaultTtlMinutes: 15,
};

export class AnalyticsCache {
  private pool: Pool;
  private config: CacheConfig;
  private memoryCache: Map<string, CacheEntry> = new Map();

  constructor(pool: Pool, config: Partial<CacheConfig> = {}) {
    this.pool = pool;
    this.config = { ...DEFAULT_CONFIG, ...config };
  }

  /**
   * Get cached data or compute and cache it
   */
  async getOrCompute<T>(
    key: string,
    computeFn: () => Promise<T>,
    ttlMinutes?: number
  ): Promise<{ data: T; fromCache: boolean; queryTimeMs: number }> {
    const ttl = ttlMinutes ?? this.config.defaultTtlMinutes;

    // Check memory cache first
    const memEntry = this.memoryCache.get(key);
    if (memEntry && new Date() < memEntry.expiresAt) {
      return { data: memEntry.data as T, fromCache: true, queryTimeMs: memEntry.queryTimeMs || 0 };
    }

    // Check database cache
    const dbEntry = await this.getFromDb<T>(key);
    if (dbEntry && new Date() < dbEntry.expiresAt) {
      this.memoryCache.set(key, dbEntry);
      return { data: dbEntry.data, fromCache: true, queryTimeMs: dbEntry.queryTimeMs || 0 };
    }

    // Compute fresh data
    const startTime = Date.now();
    const data = await computeFn();
    const queryTimeMs = Date.now() - startTime;

    // Cache result
    const entry: CacheEntry<T> = {
      key,
      data,
      computedAt: new Date(),
      expiresAt: new Date(Date.now() + ttl * 60 * 1000),
      queryTimeMs,
    };

    await this.saveToDb(entry);
    this.memoryCache.set(key, entry);

    return { data, fromCache: false, queryTimeMs };
  }
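
  // Illustrative usage (hypothetical key and query; call from an async context):
  //
  //   const cache = new AnalyticsCache(pool);
  //   const { data, fromCache, queryTimeMs } = await cache.getOrCompute(
  //     cacheKey('brand_counts', { brand: 'Acme' }),
  //     async () => (await pool.query(
  //       'SELECT COUNT(*) FROM dutchie_products WHERE brand_name = $1', ['Acme']
  //     )).rows[0],
  //     10 // TTL in minutes; omit to use the 15-minute default
  //   );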

  /**
   * Get from database cache
   */
  private async getFromDb<T>(key: string): Promise<CacheEntry<T> | null> {
    try {
      const result = await this.pool.query(`
        SELECT cache_data, computed_at, expires_at, query_time_ms
        FROM analytics_cache
        WHERE cache_key = $1
          AND expires_at > NOW()
      `, [key]);

      if (result.rows.length === 0) return null;

      const row = result.rows[0];
      return {
        key,
        data: row.cache_data as T,
        computedAt: row.computed_at,
        expiresAt: row.expires_at,
        queryTimeMs: row.query_time_ms,
      };
    } catch (error) {
      console.warn(`[AnalyticsCache] Failed to get from DB: ${error}`);
      return null;
    }
  }

  /**
   * Save to database cache
   */
  private async saveToDb<T>(entry: CacheEntry<T>): Promise<void> {
    try {
      await this.pool.query(`
        INSERT INTO analytics_cache (cache_key, cache_data, computed_at, expires_at, query_time_ms)
        VALUES ($1, $2, $3, $4, $5)
        ON CONFLICT (cache_key)
        DO UPDATE SET
          cache_data = EXCLUDED.cache_data,
          computed_at = EXCLUDED.computed_at,
          expires_at = EXCLUDED.expires_at,
          query_time_ms = EXCLUDED.query_time_ms
      `, [entry.key, JSON.stringify(entry.data), entry.computedAt, entry.expiresAt, entry.queryTimeMs]);
    } catch (error) {
      console.warn(`[AnalyticsCache] Failed to save to DB: ${error}`);
    }
  }

  /**
   * Invalidate a cache entry
   */
  async invalidate(key: string): Promise<void> {
    this.memoryCache.delete(key);
    try {
      await this.pool.query('DELETE FROM analytics_cache WHERE cache_key = $1', [key]);
    } catch (error) {
      console.warn(`[AnalyticsCache] Failed to invalidate: ${error}`);
    }
  }

  /**
   * Invalidate all entries matching a pattern
   */
  async invalidatePattern(pattern: string): Promise<number> {
    // Clear memory cache
    for (const key of this.memoryCache.keys()) {
      if (key.includes(pattern)) {
        this.memoryCache.delete(key);
      }
    }

    try {
      const result = await this.pool.query(
        'DELETE FROM analytics_cache WHERE cache_key LIKE $1',
        [`%${pattern}%`]
      );
      return result.rowCount || 0;
    } catch (error) {
      console.warn(`[AnalyticsCache] Failed to invalidate pattern: ${error}`);
      return 0;
    }
  }

  /**
   * Clean expired entries
   */
  async cleanExpired(): Promise<number> {
    // Clean memory cache
    const now = new Date();
    for (const [key, entry] of this.memoryCache.entries()) {
      if (now >= entry.expiresAt) {
        this.memoryCache.delete(key);
      }
    }

    try {
      const result = await this.pool.query('DELETE FROM analytics_cache WHERE expires_at < NOW()');
      return result.rowCount || 0;
    } catch (error) {
      console.warn(`[AnalyticsCache] Failed to clean expired: ${error}`);
      return 0;
    }
  }

  /**
   * Get cache statistics
   */
  async getStats(): Promise<{
    memoryCacheSize: number;
    dbCacheSize: number;
    expiredCount: number;
  }> {
    try {
      const result = await this.pool.query(`
        SELECT
          COUNT(*) FILTER (WHERE expires_at > NOW()) as active,
          COUNT(*) FILTER (WHERE expires_at <= NOW()) as expired
        FROM analytics_cache
      `);

      return {
        memoryCacheSize: this.memoryCache.size,
        dbCacheSize: parseInt(result.rows[0]?.active || '0'),
        expiredCount: parseInt(result.rows[0]?.expired || '0'),
      };
    } catch (error) {
      return {
        memoryCacheSize: this.memoryCache.size,
        dbCacheSize: 0,
        expiredCount: 0,
      };
    }
  }
}

/**
 * Generate cache key with parameters
 */
export function cacheKey(prefix: string, params: Record<string, unknown> = {}): string {
  const sortedParams = Object.keys(params)
    .sort()
    .filter(k => params[k] !== undefined && params[k] !== null)
    .map(k => `${k}=${params[k]}`)
    .join('&');

  return sortedParams ? `${prefix}:${sortedParams}` : prefix;
}
|
||||
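A minimal usage sketch for the read-through cache above (the pool wiring and the queried table are illustrative, and it assumes the analytics_cache table already exists):

import { Pool } from 'pg';
import { AnalyticsCache, cacheKey } from './cache';

const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const cache = new AnalyticsCache(pool);

// cacheKey drops null/undefined params and sorts the rest, so
// { state: 'AZ', storeId: 7 } and { storeId: 7, state: 'AZ' } map to the same key.
const key = cacheKey('category_summary', { state: 'AZ', storeId: 7 });

// Memory hit -> DB hit -> compute, in that order; a fresh result is written
// back to both tiers with a 15 minute TTL.
const { data, fromCache, queryTimeMs } = await cache.getOrCompute(
  key,
  async () => (await pool.query('SELECT COUNT(*) AS n FROM dutchie_products')).rows[0],
  15
);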
@@ -1,530 +0,0 @@
/**
 * Category Growth Analytics Service
 *
 * Provides category-level analytics including:
 * - SKU count growth
 * - Price growth trends
 * - New product additions
 * - Category shrinkage
 * - Seasonality patterns
 *
 * Phase 3: Analytics Dashboards
 */

import { Pool } from 'pg';
import { AnalyticsCache, cacheKey } from './cache';

export interface CategoryGrowth {
  category: string;
  currentSkuCount: number;
  previousSkuCount: number;
  skuGrowthPercent: number;
  currentBrandCount: number;
  previousBrandCount: number;
  brandGrowthPercent: number;
  currentAvgPrice: number | null;
  previousAvgPrice: number | null;
  priceChangePercent: number | null;
  newProducts: number;
  discontinuedProducts: number;
  trend: 'growing' | 'declining' | 'stable';
}

export interface CategorySummary {
  category: string;
  totalSkus: number;
  brandCount: number;
  storeCount: number;
  avgPrice: number | null;
  minPrice: number | null;
  maxPrice: number | null;
  inStockSkus: number;
  outOfStockSkus: number;
  stockHealthPercent: number;
}

export interface CategoryGrowthTrend {
  category: string;
  dataPoints: Array<{
    date: string;
    skuCount: number;
    brandCount: number;
    avgPrice: number | null;
    storeCount: number;
  }>;
  growth7d: number | null;
  growth30d: number | null;
  growth90d: number | null;
}

export interface CategoryHeatmapData {
  categories: string[];
  periods: string[];
  data: Array<{
    category: string;
    period: string;
    value: number; // SKU count, growth %, or price
    changeFromPrevious: number | null;
  }>;
}

export interface SeasonalityPattern {
  category: string;
  monthlyPattern: Array<{
    month: number;
    monthName: string;
    avgSkuCount: number;
    avgPrice: number | null;
    seasonalityIndex: number; // 100 = average, >100 = above, <100 = below
  }>;
  peakMonth: number;
  troughMonth: number;
}

export interface CategoryFilters {
  state?: string;
  storeId?: number;
  minSkus?: number;
}

export class CategoryAnalyticsService {
  private pool: Pool;
  private cache: AnalyticsCache;

  constructor(pool: Pool, cache: AnalyticsCache) {
    this.pool = pool;
    this.cache = cache;
  }

  /**
   * Get current category summary
   */
  async getCategorySummary(
    category?: string,
    filters: CategoryFilters = {}
  ): Promise<CategorySummary[]> {
    const { state, storeId } = filters;
    const key = cacheKey('category_summary', { category, state, storeId });

    return (await this.cache.getOrCompute(key, async () => {
      const params: (string | number)[] = [];
      const conditions: string[] = [];
      let paramIndex = 1;

      if (category) {
        conditions.push(`dp.type = $${paramIndex++}`);
        params.push(category);
      }
      if (state) {
        conditions.push(`d.state = $${paramIndex++}`);
        params.push(state);
      }
      if (storeId) {
        conditions.push(`dp.dispensary_id = $${paramIndex++}`);
        params.push(storeId);
      }

      const whereClause = conditions.length > 0
        ? 'WHERE dp.type IS NOT NULL AND ' + conditions.join(' AND ')
        : 'WHERE dp.type IS NOT NULL';

      const result = await this.pool.query(`
        SELECT
          dp.type as category,
          COUNT(*) as total_skus,
          COUNT(DISTINCT dp.brand_name) as brand_count,
          COUNT(DISTINCT dp.dispensary_id) as store_count,
          AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
          MIN(extract_min_price(dp.latest_raw_payload)) as min_price,
          MAX(extract_max_price(dp.latest_raw_payload)) as max_price,
          SUM(CASE WHEN dp.stock_status = 'in_stock' THEN 1 ELSE 0 END) as in_stock,
          SUM(CASE WHEN dp.stock_status != 'in_stock' OR dp.stock_status IS NULL THEN 1 ELSE 0 END) as out_of_stock
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        ${whereClause}
        GROUP BY dp.type
        ORDER BY total_skus DESC
      `, params);

      return result.rows.map(row => {
        const totalSkus = parseInt(row.total_skus) || 0;
        const inStock = parseInt(row.in_stock) || 0;

        return {
          category: row.category,
          totalSkus,
          brandCount: parseInt(row.brand_count) || 0,
          storeCount: parseInt(row.store_count) || 0,
          avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
          minPrice: row.min_price ? Math.round(parseFloat(row.min_price) * 100) / 100 : null,
          maxPrice: row.max_price ? Math.round(parseFloat(row.max_price) * 100) / 100 : null,
          inStockSkus: inStock,
          outOfStockSkus: parseInt(row.out_of_stock) || 0,
          stockHealthPercent: totalSkus > 0
            ? Math.round((inStock / totalSkus) * 100)
            : 0,
        };
      });
    }, 15)).data;
  }

  /**
   * Get category growth (comparing periods)
   */
  async getCategoryGrowth(
    days: number = 7,
    filters: CategoryFilters = {}
  ): Promise<CategoryGrowth[]> {
    const { state, storeId, minSkus = 10 } = filters;
    const key = cacheKey('category_growth', { days, state, storeId, minSkus });

    return (await this.cache.getOrCompute(key, async () => {
      // Use category_snapshots for historical comparison
      const result = await this.pool.query(`
        WITH current_data AS (
          SELECT
            category,
            total_skus,
            brand_count,
            avg_price,
            store_count
          FROM category_snapshots
          WHERE snapshot_date = (SELECT MAX(snapshot_date) FROM category_snapshots)
        ),
        previous_data AS (
          SELECT
            category,
            total_skus,
            brand_count,
            avg_price,
            store_count
          FROM category_snapshots
          WHERE snapshot_date = (
            SELECT MAX(snapshot_date)
            FROM category_snapshots
            WHERE snapshot_date < (SELECT MAX(snapshot_date) FROM category_snapshots) - ($1 || ' days')::INTERVAL
          )
        )
        SELECT
          c.category,
          c.total_skus as current_skus,
          COALESCE(p.total_skus, c.total_skus) as previous_skus,
          c.brand_count as current_brands,
          COALESCE(p.brand_count, c.brand_count) as previous_brands,
          c.avg_price as current_price,
          p.avg_price as previous_price
        FROM current_data c
        LEFT JOIN previous_data p ON c.category = p.category
        WHERE c.total_skus >= $2
        ORDER BY c.total_skus DESC
      `, [days, minSkus]);

      // If no snapshots exist, use current data
      if (result.rows.length === 0) {
        const fallbackResult = await this.pool.query(`
          SELECT
            type as category,
            COUNT(*) as total_skus,
            COUNT(DISTINCT brand_name) as brand_count,
            AVG(extract_min_price(latest_raw_payload)) as avg_price
          FROM dutchie_products
          WHERE type IS NOT NULL
          GROUP BY type
          HAVING COUNT(*) >= $1
          ORDER BY total_skus DESC
        `, [minSkus]);

        return fallbackResult.rows.map(row => ({
          category: row.category,
          currentSkuCount: parseInt(row.total_skus) || 0,
          previousSkuCount: parseInt(row.total_skus) || 0,
          skuGrowthPercent: 0,
          currentBrandCount: parseInt(row.brand_count) || 0,
          previousBrandCount: parseInt(row.brand_count) || 0,
          brandGrowthPercent: 0,
          currentAvgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
          previousAvgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
          priceChangePercent: null,
          newProducts: 0,
          discontinuedProducts: 0,
          trend: 'stable' as const,
        }));
      }

      return result.rows.map(row => {
        const currentSkus = parseInt(row.current_skus) || 0;
        const previousSkus = parseInt(row.previous_skus) || currentSkus;
        const currentBrands = parseInt(row.current_brands) || 0;
        const previousBrands = parseInt(row.previous_brands) || currentBrands;
        const currentPrice = row.current_price ? parseFloat(row.current_price) : null;
        const previousPrice = row.previous_price ? parseFloat(row.previous_price) : null;

        const skuGrowth = previousSkus > 0
          ? ((currentSkus - previousSkus) / previousSkus) * 100
          : 0;
        const brandGrowth = previousBrands > 0
          ? ((currentBrands - previousBrands) / previousBrands) * 100
          : 0;
        const priceChange = previousPrice && currentPrice
          ? ((currentPrice - previousPrice) / previousPrice) * 100
          : null;

        let trend: 'growing' | 'declining' | 'stable' = 'stable';
        if (skuGrowth > 5) trend = 'growing';
        else if (skuGrowth < -5) trend = 'declining';

        return {
          category: row.category,
          currentSkuCount: currentSkus,
          previousSkuCount: previousSkus,
          skuGrowthPercent: Math.round(skuGrowth * 10) / 10,
          currentBrandCount: currentBrands,
          previousBrandCount: previousBrands,
          brandGrowthPercent: Math.round(brandGrowth * 10) / 10,
          currentAvgPrice: currentPrice ? Math.round(currentPrice * 100) / 100 : null,
          previousAvgPrice: previousPrice ? Math.round(previousPrice * 100) / 100 : null,
          priceChangePercent: priceChange !== null ? Math.round(priceChange * 10) / 10 : null,
          newProducts: Math.max(0, currentSkus - previousSkus),
          discontinuedProducts: Math.max(0, previousSkus - currentSkus),
          trend,
        };
      });
    }, 15)).data;
  }

  /**
   * Get category growth trend over time
   */
  async getCategoryGrowthTrend(
    category: string,
    days: number = 90
  ): Promise<CategoryGrowthTrend> {
    const key = cacheKey('category_growth_trend', { category, days });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          snapshot_date as date,
          total_skus as sku_count,
          brand_count,
          avg_price,
          store_count
        FROM category_snapshots
        WHERE category = $1
          AND snapshot_date >= CURRENT_DATE - ($2 || ' days')::INTERVAL
        ORDER BY snapshot_date
      `, [category, days]);

      const dataPoints = result.rows.map(row => ({
        date: row.date.toISOString().split('T')[0],
        skuCount: parseInt(row.sku_count) || 0,
        brandCount: parseInt(row.brand_count) || 0,
        avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
        storeCount: parseInt(row.store_count) || 0,
      }));

      // Calculate growth rates
      const calculateGrowth = (daysBack: number): number | null => {
        if (dataPoints.length < 2) return null;
        const targetDate = new Date();
        targetDate.setDate(targetDate.getDate() - daysBack);
        const targetDateStr = targetDate.toISOString().split('T')[0];

        const recent = dataPoints[dataPoints.length - 1];
        const older = [...dataPoints].reverse().find(d => d.date <= targetDateStr) || dataPoints[0];

        if (older.skuCount === 0) return null;
        return Math.round(((recent.skuCount - older.skuCount) / older.skuCount) * 1000) / 10;
      };

      return {
        category,
        dataPoints,
        growth7d: calculateGrowth(7),
        growth30d: calculateGrowth(30),
        growth90d: calculateGrowth(90),
      };
    }, 15)).data;
  }

  /**
   * Get category heatmap data
   */
  async getCategoryHeatmap(
    metric: 'skus' | 'growth' | 'price' = 'skus',
    periods: number = 12 // weeks
  ): Promise<CategoryHeatmapData> {
    const key = cacheKey('category_heatmap', { metric, periods });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          category,
          snapshot_date,
          total_skus,
          avg_price
        FROM category_snapshots
        WHERE snapshot_date >= CURRENT_DATE - ($1 * 7 || ' days')::INTERVAL
        ORDER BY category, snapshot_date
      `, [periods]);

      // Get unique categories and generate weekly periods
      const categoriesSet = new Set<string>();
      const periodsSet = new Set<string>();

      result.rows.forEach(row => {
        categoriesSet.add(row.category);
        // Group by week
        const date = new Date(row.snapshot_date);
        const weekStart = new Date(date);
        weekStart.setDate(date.getDate() - date.getDay());
        periodsSet.add(weekStart.toISOString().split('T')[0]);
      });

      const categories = Array.from(categoriesSet).sort();
      const periodsList = Array.from(periodsSet).sort();

      // Aggregate data by category and week
      const dataMap = new Map<string, Map<string, { skus: number; price: number | null }>>();

      result.rows.forEach(row => {
        const date = new Date(row.snapshot_date);
        const weekStart = new Date(date);
        weekStart.setDate(date.getDate() - date.getDay());
        const period = weekStart.toISOString().split('T')[0];

        if (!dataMap.has(row.category)) {
          dataMap.set(row.category, new Map());
        }
        const categoryData = dataMap.get(row.category)!;

        if (!categoryData.has(period)) {
          categoryData.set(period, { skus: 0, price: null });
        }
        const existing = categoryData.get(period)!;
        existing.skus = Math.max(existing.skus, parseInt(row.total_skus) || 0);
        if (row.avg_price) {
          existing.price = parseFloat(row.avg_price);
        }
      });

      // Build heatmap data
      const data: CategoryHeatmapData['data'] = [];

      categories.forEach(category => {
        let previousValue: number | null = null;

        periodsList.forEach(period => {
          const categoryData = dataMap.get(category)?.get(period);
          let value = 0;

          if (categoryData) {
            switch (metric) {
              case 'skus':
                value = categoryData.skus;
                break;
              case 'price':
                value = categoryData.price || 0;
                break;
              case 'growth':
                value = previousValue !== null && previousValue > 0
                  ? ((categoryData.skus - previousValue) / previousValue) * 100
                  : 0;
                break;
            }
          }

          const changeFromPrevious = previousValue !== null && previousValue > 0
            ? ((value - previousValue) / previousValue) * 100
            : null;

          data.push({
            category,
            period,
            value: Math.round(value * 100) / 100,
            changeFromPrevious: changeFromPrevious !== null
              ? Math.round(changeFromPrevious * 10) / 10
              : null,
          });

          if (metric !== 'growth') {
            previousValue = value;
          } else if (categoryData) {
            previousValue = categoryData.skus;
          }
        });
      });

      return {
        categories,
        periods: periodsList,
        data,
      };
    }, 30)).data;
  }

  /**
   * Get top growing/declining categories
   */
  async getTopMovers(
    limit: number = 5,
    days: number = 30
  ): Promise<{
    growing: CategoryGrowth[];
    declining: CategoryGrowth[];
  }> {
    const key = cacheKey('top_movers', { limit, days });

    return (await this.cache.getOrCompute(key, async () => {
      const allGrowth = await this.getCategoryGrowth(days);

      const sorted = [...allGrowth].sort((a, b) => b.skuGrowthPercent - a.skuGrowthPercent);

      return {
        growing: sorted.filter(c => c.skuGrowthPercent > 0).slice(0, limit),
        declining: sorted.filter(c => c.skuGrowthPercent < 0).slice(-limit).reverse(),
      };
    }, 15)).data;
  }

  /**
   * Get subcategory breakdown for a category
   */
  async getSubcategoryBreakdown(category: string): Promise<Array<{
    subcategory: string;
    skuCount: number;
    brandCount: number;
    avgPrice: number | null;
    percentOfCategory: number;
  }>> {
    const key = cacheKey('subcategory_breakdown', { category });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        WITH category_total AS (
          SELECT COUNT(*) as total FROM dutchie_products WHERE type = $1
        )
        SELECT
          COALESCE(dp.subcategory, 'Other') as subcategory,
          COUNT(*) as sku_count,
          COUNT(DISTINCT dp.brand_name) as brand_count,
          AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
          ct.total as category_total
        FROM dutchie_products dp, category_total ct
        WHERE dp.type = $1
        GROUP BY dp.subcategory, ct.total
        ORDER BY sku_count DESC
      `, [category]);

      return result.rows.map(row => ({
        subcategory: row.subcategory,
        skuCount: parseInt(row.sku_count) || 0,
        brandCount: parseInt(row.brand_count) || 0,
        avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
        percentOfCategory: parseInt(row.category_total) > 0
          ? Math.round((parseInt(row.sku_count) / parseInt(row.category_total)) * 1000) / 10
          : 0,
      }));
    }, 15)).data;
  }
}
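A short sketch of driving the service above, assuming the pool/cache wiring from the cache example (the filter values are invented):

import { Pool } from 'pg';
import { AnalyticsCache } from './cache';
import { CategoryAnalyticsService } from './category-analytics';

const pool = new Pool();
const categories = new CategoryAnalyticsService(pool, new AnalyticsCache(pool));

// 30-day growth for categories with at least 25 SKUs in one state; the service
// falls back to zero-growth rows when no category_snapshots exist yet.
const growth = await categories.getCategoryGrowth(30, { state: 'AZ', minSkus: 25 });
const shrinking = growth.filter(g => g.trend === 'declining');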
@@ -1,57 +0,0 @@
/**
 * Analytics Module Index
 *
 * Exports all analytics services for CannaiQ dashboards.
 *
 * Phase 3: Analytics Dashboards
 */

export { AnalyticsCache, cacheKey, type CacheEntry, type CacheConfig } from './cache';

export {
  PriceTrendService,
  type PricePoint,
  type PriceTrend,
  type PriceSummary,
  type PriceCompressionResult,
  type PriceFilters,
} from './price-trends';

export {
  PenetrationService,
  type BrandPenetration,
  type PenetrationTrend,
  type ShelfShare,
  type BrandPresenceByState,
  type PenetrationFilters,
} from './penetration';

export {
  CategoryAnalyticsService,
  type CategoryGrowth,
  type CategorySummary,
  type CategoryGrowthTrend,
  type CategoryHeatmapData,
  type SeasonalityPattern,
  type CategoryFilters,
} from './category-analytics';

export {
  StoreChangeService,
  type StoreChangeSummary,
  type StoreChangeEvent,
  type BrandChange,
  type ProductChange,
  type CategoryLeaderboard,
  type StoreFilters,
} from './store-changes';

export {
  BrandOpportunityService,
  type BrandOpportunity,
  type PricePosition,
  type MissingSkuOpportunity,
  type StoreShelfShareChange,
  type CompetitorAlert,
  type MarketPositionSummary,
} from './brand-opportunity';
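One way the barrel file above might be consumed, sharing a single AnalyticsCache across services (the './analytics' import path is an assumption about how the module is mounted):

import { Pool } from 'pg';
import {
  AnalyticsCache,
  PriceTrendService,
  PenetrationService,
  CategoryAnalyticsService,
} from './analytics';

const pool = new Pool();
const cache = new AnalyticsCache(pool, { defaultTtlMinutes: 15 });

// Sharing one cache means cache.invalidatePattern('category_') clears every
// category_* key for all services, in memory and in analytics_cache, at once.
export const analytics = {
  prices: new PriceTrendService(pool, cache),
  penetration: new PenetrationService(pool, cache),
  categories: new CategoryAnalyticsService(pool, cache),
};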
@@ -1,556 +0,0 @@
/**
 * Brand Penetration Analytics Service
 *
 * Provides analytics for brand market penetration including:
 * - Stores carrying brand
 * - SKU counts per brand
 * - Percentage of stores carrying
 * - Shelf share calculations
 * - Penetration trends and momentum
 *
 * Phase 3: Analytics Dashboards
 */

import { Pool } from 'pg';
import { AnalyticsCache, cacheKey } from './cache';

export interface BrandPenetration {
  brandName: string;
  brandId: string | null;
  totalStores: number;
  storesCarrying: number;
  penetrationPercent: number;
  totalSkus: number;
  avgSkusPerStore: number;
  shelfSharePercent: number;
  categories: string[];
  avgPrice: number | null;
  inStockSkus: number;
}

export interface PenetrationTrend {
  brandName: string;
  dataPoints: Array<{
    date: string;
    storeCount: number;
    skuCount: number;
    penetrationPercent: number;
  }>;
  momentumScore: number; // -100 to +100
  riskScore: number; // 0 to 100, higher = more risk
  trend: 'growing' | 'declining' | 'stable';
}

export interface ShelfShare {
  brandName: string;
  category: string;
  skuCount: number;
  categoryTotalSkus: number;
  shelfSharePercent: number;
  rank: number;
}

export interface BrandPresenceByState {
  state: string;
  storeCount: number;
  skuCount: number;
  avgPrice: number | null;
}

export interface PenetrationFilters {
  state?: string;
  category?: string;
  minStores?: number;
  minSkus?: number;
}

export class PenetrationService {
  private pool: Pool;
  private cache: AnalyticsCache;

  constructor(pool: Pool, cache: AnalyticsCache) {
    this.pool = pool;
    this.cache = cache;
  }

  /**
   * Get penetration data for a specific brand
   */
  async getBrandPenetration(
    brandName: string,
    filters: PenetrationFilters = {}
  ): Promise<BrandPenetration> {
    const { state, category } = filters;
    const key = cacheKey('brand_penetration', { brandName, state, category });

    return (await this.cache.getOrCompute(key, async () => {
      // Build optional where clauses; $1 is always brandName
      const params: (string | number)[] = [brandName];
      let paramIndex = 2;

      let stateCondition = '';
      let categoryCondition = '';

      if (state) {
        stateCondition = `AND d.state = $${paramIndex++}`;
        params.push(state);
      }
      if (category) {
        categoryCondition = `AND dp.type = $${paramIndex++}`;
        params.push(category);
      }

      const result = await this.pool.query(`
        WITH total_stores AS (
          SELECT COUNT(DISTINCT id) as total
          FROM dispensaries
          WHERE 1=1 ${state ? `AND state = $2` : ''}
        ),
        brand_data AS (
          SELECT
            dp.brand_name,
            dp.brand_id,
            COUNT(DISTINCT dp.dispensary_id) as stores_carrying,
            COUNT(*) as total_skus,
            AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
            SUM(CASE WHEN dp.stock_status = 'in_stock' THEN 1 ELSE 0 END) as in_stock,
            ARRAY_AGG(DISTINCT dp.type) FILTER (WHERE dp.type IS NOT NULL) as categories
          FROM dutchie_products dp
          JOIN dispensaries d ON dp.dispensary_id = d.id
          WHERE dp.brand_name = $1
            ${stateCondition}
            ${categoryCondition}
          GROUP BY dp.brand_name, dp.brand_id
        ),
        total_skus AS (
          SELECT COUNT(*) as total
          FROM dutchie_products dp
          JOIN dispensaries d ON dp.dispensary_id = d.id
          WHERE 1=1 ${stateCondition} ${categoryCondition}
        )
        SELECT
          bd.brand_name,
          bd.brand_id,
          ts.total as total_stores,
          bd.stores_carrying,
          bd.total_skus,
          bd.avg_price,
          bd.in_stock,
          bd.categories,
          tsk.total as market_total_skus
        FROM brand_data bd, total_stores ts, total_skus tsk
      `, params);

      if (result.rows.length === 0) {
        return {
          brandName,
          brandId: null,
          totalStores: 0,
          storesCarrying: 0,
          penetrationPercent: 0,
          totalSkus: 0,
          avgSkusPerStore: 0,
          shelfSharePercent: 0,
          categories: [],
          avgPrice: null,
          inStockSkus: 0,
        };
      }

      const row = result.rows[0];
      const totalStores = parseInt(row.total_stores) || 1;
      const storesCarrying = parseInt(row.stores_carrying) || 0;
      const totalSkus = parseInt(row.total_skus) || 0;
      const marketTotalSkus = parseInt(row.market_total_skus) || 1;

      return {
        brandName: row.brand_name,
        brandId: row.brand_id,
        totalStores,
        storesCarrying,
        penetrationPercent: Math.round((storesCarrying / totalStores) * 1000) / 10,
        totalSkus,
        avgSkusPerStore: storesCarrying > 0
          ? Math.round((totalSkus / storesCarrying) * 10) / 10
          : 0,
        shelfSharePercent: Math.round((totalSkus / marketTotalSkus) * 1000) / 10,
        categories: row.categories || [],
        avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
        inStockSkus: parseInt(row.in_stock) || 0,
      };
    }, 15)).data;
  }

  /**
   * Get top brands by penetration
   */
  async getTopBrandsByPenetration(
    limit: number = 20,
    filters: PenetrationFilters = {}
  ): Promise<BrandPenetration[]> {
    const { state, category, minStores = 2, minSkus = 5 } = filters;
    const key = cacheKey('top_brands_penetration', { limit, state, category, minStores, minSkus });

    return (await this.cache.getOrCompute(key, async () => {
      const params: (string | number)[] = [limit, minStores, minSkus];
      let paramIndex = 4;

      let stateCondition = '';
      let categoryCondition = '';

      if (state) {
        stateCondition = `AND d.state = $${paramIndex++}`;
        params.push(state);
      }
      if (category) {
        categoryCondition = `AND dp.type = $${paramIndex++}`;
        params.push(category);
      }

      const result = await this.pool.query(`
        WITH total_stores AS (
          SELECT COUNT(DISTINCT id) as total
          FROM dispensaries
          WHERE 1=1 ${state ? `AND state = $${params.indexOf(state) + 1}` : ''}
        ),
        total_skus AS (
          SELECT COUNT(*) as total
          FROM dutchie_products dp
          JOIN dispensaries d ON dp.dispensary_id = d.id
          WHERE 1=1 ${stateCondition} ${categoryCondition}
        ),
        brand_data AS (
          SELECT
            dp.brand_name,
            dp.brand_id,
            COUNT(DISTINCT dp.dispensary_id) as stores_carrying,
            COUNT(*) as total_skus,
            AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
            SUM(CASE WHEN dp.stock_status = 'in_stock' THEN 1 ELSE 0 END) as in_stock,
            ARRAY_AGG(DISTINCT dp.type) FILTER (WHERE dp.type IS NOT NULL) as categories
          FROM dutchie_products dp
          JOIN dispensaries d ON dp.dispensary_id = d.id
          WHERE dp.brand_name IS NOT NULL
            ${stateCondition}
            ${categoryCondition}
          GROUP BY dp.brand_name, dp.brand_id
          HAVING COUNT(DISTINCT dp.dispensary_id) >= $2
            AND COUNT(*) >= $3
        )
        SELECT
          bd.*,
          ts.total as total_stores,
          tsk.total as market_total_skus
        FROM brand_data bd, total_stores ts, total_skus tsk
        ORDER BY bd.stores_carrying DESC, bd.total_skus DESC
        LIMIT $1
      `, params);

      return result.rows.map(row => {
        const totalStores = parseInt(row.total_stores) || 1;
        const storesCarrying = parseInt(row.stores_carrying) || 0;
        const totalSkus = parseInt(row.total_skus) || 0;
        const marketTotalSkus = parseInt(row.market_total_skus) || 1;

        return {
          brandName: row.brand_name,
          brandId: row.brand_id,
          totalStores,
          storesCarrying,
          penetrationPercent: Math.round((storesCarrying / totalStores) * 1000) / 10,
          totalSkus,
          avgSkusPerStore: storesCarrying > 0
            ? Math.round((totalSkus / storesCarrying) * 10) / 10
            : 0,
          shelfSharePercent: Math.round((totalSkus / marketTotalSkus) * 1000) / 10,
          categories: row.categories || [],
          avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
          inStockSkus: parseInt(row.in_stock) || 0,
        };
      });
    }, 15)).data;
  }

  /**
   * Get penetration trend for a brand (requires historical snapshots)
   */
  async getPenetrationTrend(
    brandName: string,
    days: number = 30
  ): Promise<PenetrationTrend> {
    const key = cacheKey('penetration_trend', { brandName, days });

    return (await this.cache.getOrCompute(key, async () => {
      // Use brand_snapshots table for historical data
      const result = await this.pool.query(`
        SELECT
          snapshot_date as date,
          store_count,
          total_skus
        FROM brand_snapshots
        WHERE brand_name = $1
          AND snapshot_date >= CURRENT_DATE - ($2 || ' days')::INTERVAL
        ORDER BY snapshot_date
      `, [brandName, days]);

      // Get total stores for penetration calculation
      const totalResult = await this.pool.query(
        'SELECT COUNT(*) as total FROM dispensaries'
      );
      const totalStores = parseInt(totalResult.rows[0]?.total) || 1;

      const dataPoints = result.rows.map(row => ({
        date: row.date.toISOString().split('T')[0],
        storeCount: parseInt(row.store_count) || 0,
        skuCount: parseInt(row.total_skus) || 0,
        penetrationPercent: Math.round((parseInt(row.store_count) / totalStores) * 1000) / 10,
      }));

      // Calculate momentum and risk scores
      let momentumScore = 0;
      let riskScore = 0;
      let trend: 'growing' | 'declining' | 'stable' = 'stable';

      if (dataPoints.length >= 2) {
        const first = dataPoints[0];
        const last = dataPoints[dataPoints.length - 1];

        // Momentum: change in store count
        const storeChange = last.storeCount - first.storeCount;
        const storeChangePercent = first.storeCount > 0
          ? (storeChange / first.storeCount) * 100
          : 0;

        // Momentum score: -100 to +100
        momentumScore = Math.max(-100, Math.min(100, storeChangePercent * 10));

        // Risk score: higher if losing stores
        if (storeChange < 0) {
          riskScore = Math.min(100, Math.abs(storeChangePercent) * 5);
        }

        // Determine trend
        if (storeChangePercent > 5) trend = 'growing';
        else if (storeChangePercent < -5) trend = 'declining';
      }

      return {
        brandName,
        dataPoints,
        momentumScore: Math.round(momentumScore),
        riskScore: Math.round(riskScore),
        trend,
      };
    }, 15)).data;
  }

  /**
   * Get shelf share by category for a brand
   */
  async getShelfShareByCategory(brandName: string): Promise<ShelfShare[]> {
    const key = cacheKey('shelf_share_category', { brandName });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        WITH category_totals AS (
          SELECT
            type as category,
            COUNT(*) as total_skus
          FROM dutchie_products
          WHERE type IS NOT NULL
          GROUP BY type
        ),
        brand_by_category AS (
          SELECT
            type as category,
            COUNT(*) as sku_count
          FROM dutchie_products
          WHERE brand_name = $1
            AND type IS NOT NULL
          GROUP BY type
        ),
        ranked AS (
          SELECT
            ct.category,
            COALESCE(bc.sku_count, 0) as sku_count,
            ct.total_skus,
            RANK() OVER (PARTITION BY ct.category ORDER BY bc.sku_count DESC NULLS LAST) as rank
          FROM category_totals ct
          LEFT JOIN brand_by_category bc ON ct.category = bc.category
        )
        SELECT
          r.category,
          r.sku_count,
          r.total_skus as category_total_skus,
          ROUND((r.sku_count::NUMERIC / r.total_skus) * 100, 2) as shelf_share_pct,
          (SELECT COUNT(*) + 1 FROM (
            SELECT brand_name, COUNT(*) as cnt
            FROM dutchie_products
            WHERE type = r.category AND brand_name IS NOT NULL
            GROUP BY brand_name
            HAVING COUNT(*) > r.sku_count
          ) t) as rank
        FROM ranked r
        WHERE r.sku_count > 0
        ORDER BY r.shelf_share_pct DESC
      `, [brandName]);

      return result.rows.map(row => ({
        brandName,
        category: row.category,
        skuCount: parseInt(row.sku_count) || 0,
        categoryTotalSkus: parseInt(row.category_total_skus) || 0,
        shelfSharePercent: parseFloat(row.shelf_share_pct) || 0,
        rank: parseInt(row.rank) || 0,
      }));
    }, 15)).data;
  }

  /**
   * Get brand presence by state/region
   */
  async getBrandPresenceByState(brandName: string): Promise<BrandPresenceByState[]> {
    const key = cacheKey('brand_presence_state', { brandName });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          d.state,
          COUNT(DISTINCT dp.dispensary_id) as store_count,
          COUNT(*) as sku_count,
          AVG(extract_min_price(dp.latest_raw_payload)) as avg_price
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        WHERE dp.brand_name = $1
        GROUP BY d.state
        ORDER BY store_count DESC
      `, [brandName]);

      return result.rows.map(row => ({
        state: row.state,
        storeCount: parseInt(row.store_count) || 0,
        skuCount: parseInt(row.sku_count) || 0,
        avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
      }));
    }, 15)).data;
  }

  /**
   * Get stores carrying a brand
   */
  async getStoresCarryingBrand(brandName: string): Promise<Array<{
    storeId: number;
    storeName: string;
    city: string;
    state: string;
    skuCount: number;
    avgPrice: number | null;
    categories: string[];
  }>> {
    const key = cacheKey('stores_carrying_brand', { brandName });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          d.id as store_id,
          d.name as store_name,
          d.city,
          d.state,
          COUNT(*) as sku_count,
          AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
          ARRAY_AGG(DISTINCT dp.type) FILTER (WHERE dp.type IS NOT NULL) as categories
        FROM dutchie_products dp
        JOIN dispensaries d ON dp.dispensary_id = d.id
        WHERE dp.brand_name = $1
        GROUP BY d.id, d.name, d.city, d.state
        ORDER BY sku_count DESC
      `, [brandName]);

      return result.rows.map(row => ({
        storeId: row.store_id,
        storeName: row.store_name,
        city: row.city,
        state: row.state,
        skuCount: parseInt(row.sku_count) || 0,
        avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
        categories: row.categories || [],
      }));
    }, 15)).data;
  }

  /**
   * Get penetration heatmap data (state-based)
   */
  async getPenetrationHeatmap(
    brandName?: string
  ): Promise<Array<{
    state: string;
    totalStores: number;
    storesWithBrand: number;
    penetrationPercent: number;
    totalSkus: number;
  }>> {
    const key = cacheKey('penetration_heatmap', { brandName });

    return (await this.cache.getOrCompute(key, async () => {
      if (brandName) {
        const result = await this.pool.query(`
          WITH state_totals AS (
            SELECT state, COUNT(*) as total_stores
            FROM dispensaries
            GROUP BY state
          ),
          brand_by_state AS (
            SELECT
              d.state,
              COUNT(DISTINCT dp.dispensary_id) as stores_with_brand,
              COUNT(*) as total_skus
            FROM dutchie_products dp
            JOIN dispensaries d ON dp.dispensary_id = d.id
            WHERE dp.brand_name = $1
            GROUP BY d.state
          )
          SELECT
            st.state,
            st.total_stores,
            COALESCE(bs.stores_with_brand, 0) as stores_with_brand,
            ROUND(COALESCE(bs.stores_with_brand, 0)::NUMERIC / st.total_stores * 100, 1) as penetration_pct,
            COALESCE(bs.total_skus, 0) as total_skus
          FROM state_totals st
          LEFT JOIN brand_by_state bs ON st.state = bs.state
          ORDER BY penetration_pct DESC
        `, [brandName]);

        return result.rows.map(row => ({
          state: row.state,
          totalStores: parseInt(row.total_stores) || 0,
          storesWithBrand: parseInt(row.stores_with_brand) || 0,
          penetrationPercent: parseFloat(row.penetration_pct) || 0,
          totalSkus: parseInt(row.total_skus) || 0,
        }));
      } else {
        // Overall market data by state
        const result = await this.pool.query(`
          SELECT
            d.state,
            COUNT(DISTINCT d.id) as total_stores,
            COUNT(DISTINCT dp.brand_name) as brand_count,
            COUNT(*) as total_skus
          FROM dispensaries d
          LEFT JOIN dutchie_products dp ON d.id = dp.dispensary_id
          GROUP BY d.state
          ORDER BY total_stores DESC
        `);

        return result.rows.map(row => ({
          state: row.state,
          totalStores: parseInt(row.total_stores) || 0,
          storesWithBrand: parseInt(row.brand_count) || 0, // Using brand count here
          penetrationPercent: 100, // Full penetration for overall view
          totalSkus: parseInt(row.total_skus) || 0,
        }));
      }
    }, 30)).data;
  }
}
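To make the momentum/risk scoring in getPenetrationTrend concrete, a worked example with invented numbers (given a PenetrationService instance `penetration` from the wiring sketch above):

// A brand growing from 20 to 23 stores over the window:
//   storeChangePercent = (23 - 20) / 20 * 100 = 15
//   momentumScore      = clamp(15 * 10, -100, 100) = 100 (capped)
//   riskScore          = 0 (no stores lost), trend = 'growing'
// A brand dropping from 20 to 18 stores:
//   storeChangePercent = -10 -> momentumScore = -100 (capped)
//   riskScore          = min(100, 10 * 5) = 50, trend = 'declining'
const trend = await penetration.getPenetrationTrend('Example Brand', 30);
if (trend.riskScore >= 50) {
  // flagForReview is hypothetical; the service itself only computes scores.
  flagForReview(trend.brandName, trend.riskScore);
}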
@@ -1,534 +0,0 @@
|
||||
/**
|
||||
* Price Trend Analytics Service
|
||||
*
|
||||
* Provides time-series price analytics including:
|
||||
* - Price over time for products
|
||||
* - Average MSRP/Wholesale by period
|
||||
* - Price volatility scoring
|
||||
* - Price compression detection
|
||||
*
|
||||
* Phase 3: Analytics Dashboards
|
||||
*/
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { AnalyticsCache, cacheKey } from './cache';
|
||||
|
||||
export interface PricePoint {
|
||||
date: string;
|
||||
minPrice: number | null;
|
||||
maxPrice: number | null;
|
||||
avgPrice: number | null;
|
||||
wholesalePrice: number | null;
|
||||
sampleSize: number;
|
||||
}
|
||||
|
||||
export interface PriceTrend {
|
||||
productId?: number;
|
||||
storeId?: number;
|
||||
brandName?: string;
|
||||
category?: string;
|
||||
dataPoints: PricePoint[];
|
||||
summary: {
|
||||
currentAvg: number | null;
|
||||
previousAvg: number | null;
|
||||
changePercent: number | null;
|
||||
trend: 'up' | 'down' | 'stable';
|
||||
volatilityScore: number | null;
|
||||
};
|
||||
}
|
||||
|
||||
export interface PriceSummary {
|
||||
avg7d: number | null;
|
||||
avg30d: number | null;
|
||||
avg90d: number | null;
|
||||
wholesaleAvg7d: number | null;
|
||||
wholesaleAvg30d: number | null;
|
||||
wholesaleAvg90d: number | null;
|
||||
minPrice: number | null;
|
||||
maxPrice: number | null;
|
||||
priceRange: number | null;
|
||||
volatilityScore: number | null;
|
||||
}
|
||||
|
||||
export interface PriceCompressionResult {
|
||||
category: string;
|
||||
brands: Array<{
|
||||
brandName: string;
|
||||
avgPrice: number;
|
||||
priceDistance: number; // distance from category mean
|
||||
}>;
|
||||
compressionScore: number; // 0-100, higher = more compressed
|
||||
standardDeviation: number;
|
||||
}
|
||||
|
||||
export interface PriceFilters {
|
||||
storeId?: number;
|
||||
brandName?: string;
|
||||
category?: string;
|
||||
state?: string;
|
||||
days?: number;
|
||||
}
|
||||
|
||||
export class PriceTrendService {
|
||||
private pool: Pool;
|
||||
private cache: AnalyticsCache;
|
||||
|
||||
constructor(pool: Pool, cache: AnalyticsCache) {
|
||||
this.pool = pool;
|
||||
this.cache = cache;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get price trend for a specific product
|
||||
*/
|
||||
async getProductPriceTrend(
|
||||
productId: number,
|
||||
storeId?: number,
|
||||
days: number = 30
|
||||
): Promise<PriceTrend> {
|
||||
const key = cacheKey('price_trend_product', { productId, storeId, days });
|
||||
|
||||
return (await this.cache.getOrCompute(key, async () => {
|
||||
// Try to get from snapshots first
|
||||
const snapshotResult = await this.pool.query(`
|
||||
SELECT
|
||||
DATE(crawled_at) as date,
|
||||
MIN(rec_min_price_cents) / 100.0 as min_price,
|
||||
MAX(rec_max_price_cents) / 100.0 as max_price,
|
||||
AVG(rec_min_price_cents) / 100.0 as avg_price,
|
||||
AVG(wholesale_min_price_cents) / 100.0 as wholesale_price,
|
||||
COUNT(*) as sample_size
|
||||
FROM dutchie_product_snapshots
|
||||
WHERE dutchie_product_id = $1
|
||||
AND crawled_at >= NOW() - ($2 || ' days')::INTERVAL
|
||||
${storeId ? 'AND dispensary_id = $3' : ''}
|
||||
GROUP BY DATE(crawled_at)
|
||||
ORDER BY date
|
||||
`, storeId ? [productId, days, storeId] : [productId, days]);
|
||||
|
||||
let dataPoints: PricePoint[] = snapshotResult.rows.map(row => ({
|
||||
date: row.date.toISOString().split('T')[0],
|
||||
minPrice: parseFloat(row.min_price) || null,
|
||||
maxPrice: parseFloat(row.max_price) || null,
|
||||
avgPrice: parseFloat(row.avg_price) || null,
|
||||
wholesalePrice: parseFloat(row.wholesale_price) || null,
|
||||
sampleSize: parseInt(row.sample_size),
|
||||
}));
|
||||
|
||||
// If no snapshots, get current price from product
|
||||
if (dataPoints.length === 0) {
|
||||
const productResult = await this.pool.query(`
|
||||
SELECT
|
||||
extract_min_price(latest_raw_payload) as min_price,
|
||||
extract_max_price(latest_raw_payload) as max_price,
|
||||
extract_wholesale_price(latest_raw_payload) as wholesale_price
|
||||
FROM dutchie_products
|
||||
WHERE id = $1
|
||||
`, [productId]);
|
||||
|
||||
if (productResult.rows.length > 0) {
|
||||
const row = productResult.rows[0];
|
||||
dataPoints = [{
|
||||
date: new Date().toISOString().split('T')[0],
|
||||
minPrice: parseFloat(row.min_price) || null,
|
||||
maxPrice: parseFloat(row.max_price) || null,
|
||||
avgPrice: parseFloat(row.min_price) || null,
|
||||
wholesalePrice: parseFloat(row.wholesale_price) || null,
|
||||
sampleSize: 1,
|
||||
}];
|
||||
}
|
||||
}
|
||||
|
||||
const summary = this.calculatePriceSummary(dataPoints);
|
||||
|
||||
return {
|
||||
productId,
|
||||
storeId,
|
||||
dataPoints,
|
||||
summary,
|
||||
};
|
||||
}, 15)).data;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get price trends by brand
|
||||
*/
|
||||
async getBrandPriceTrend(
|
||||
brandName: string,
|
||||
filters: PriceFilters = {}
|
||||
): Promise<PriceTrend> {
|
||||
const { storeId, category, state, days = 30 } = filters;
|
||||
const key = cacheKey('price_trend_brand', { brandName, storeId, category, state, days });
|
||||
|
||||
return (await this.cache.getOrCompute(key, async () => {
|
||||
// Use current product data aggregated by date
|
||||
const result = await this.pool.query(`
|
||||
SELECT
|
||||
DATE(dp.updated_at) as date,
|
||||
MIN(extract_min_price(dp.latest_raw_payload)) as min_price,
|
||||
MAX(extract_max_price(dp.latest_raw_payload)) as max_price,
|
||||
AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
|
||||
AVG(extract_wholesale_price(dp.latest_raw_payload)) as wholesale_price,
|
||||
COUNT(*) as sample_size
|
||||
FROM dutchie_products dp
|
||||
JOIN dispensaries d ON dp.dispensary_id = d.id
|
||||
WHERE dp.brand_name = $1
|
||||
AND dp.updated_at >= NOW() - ($2 || ' days')::INTERVAL
|
||||
${storeId ? 'AND dp.dispensary_id = $3' : ''}
|
||||
${category ? `AND dp.type = $${storeId ? 4 : 3}` : ''}
|
||||
${state ? `AND d.state = $${storeId ? (category ? 5 : 4) : (category ? 4 : 3)}` : ''}
|
||||
GROUP BY DATE(dp.updated_at)
|
||||
ORDER BY date
|
||||
`, this.buildParams([brandName, days], { storeId, category, state }));
|
||||
|
||||
const dataPoints: PricePoint[] = result.rows.map(row => ({
|
||||
date: row.date.toISOString().split('T')[0],
|
||||
minPrice: parseFloat(row.min_price) || null,
|
||||
maxPrice: parseFloat(row.max_price) || null,
|
||||
avgPrice: parseFloat(row.avg_price) || null,
|
||||
wholesalePrice: parseFloat(row.wholesale_price) || null,
|
||||
sampleSize: parseInt(row.sample_size),
|
||||
}));
|
||||
|
||||
return {
|
||||
brandName,
|
||||
storeId,
|
||||
category,
|
||||
dataPoints,
|
||||
summary: this.calculatePriceSummary(dataPoints),
|
||||
};
|
||||
}, 15)).data;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get price trends by category
|
||||
*/
|
||||
async getCategoryPriceTrend(
|
||||
category: string,
|
||||
filters: PriceFilters = {}
|
||||
): Promise<PriceTrend> {
|
||||
const { storeId, brandName, state, days = 30 } = filters;
|
||||
const key = cacheKey('price_trend_category', { category, storeId, brandName, state, days });
|
||||
|
||||
return (await this.cache.getOrCompute(key, async () => {
|
||||
const result = await this.pool.query(`
|
||||
SELECT
|
||||
DATE(dp.updated_at) as date,
|
||||
MIN(extract_min_price(dp.latest_raw_payload)) as min_price,
|
||||
MAX(extract_max_price(dp.latest_raw_payload)) as max_price,
|
||||
AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
|
||||
AVG(extract_wholesale_price(dp.latest_raw_payload)) as wholesale_price,
|
||||
COUNT(*) as sample_size
|
||||
FROM dutchie_products dp
|
||||
JOIN dispensaries d ON dp.dispensary_id = d.id
|
||||
WHERE dp.type = $1
|
||||
AND dp.updated_at >= NOW() - ($2 || ' days')::INTERVAL
|
||||
${storeId ? 'AND dp.dispensary_id = $3' : ''}
|
||||
${brandName ? `AND dp.brand_name = $${storeId ? 4 : 3}` : ''}
|
||||
${state ? `AND d.state = $${storeId ? (brandName ? 5 : 4) : (brandName ? 4 : 3)}` : ''}
|
||||
GROUP BY DATE(dp.updated_at)
|
||||
ORDER BY date
|
||||
`, this.buildParams([category, days], { storeId, brandName, state }));
|
||||
|
||||
const dataPoints: PricePoint[] = result.rows.map(row => ({
|
||||
date: row.date.toISOString().split('T')[0],
|
||||
minPrice: parseFloat(row.min_price) || null,
|
||||
maxPrice: parseFloat(row.max_price) || null,
|
||||
avgPrice: parseFloat(row.avg_price) || null,
|
||||
wholesalePrice: parseFloat(row.wholesale_price) || null,
|
||||
sampleSize: parseInt(row.sample_size),
|
||||
}));
|
||||
|
||||
return {
|
||||
category,
|
||||
storeId,
|
||||
brandName,
|
||||
dataPoints,
|
||||
summary: this.calculatePriceSummary(dataPoints),
|
||||
};
|
||||
}, 15)).data;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get price summary statistics
|
||||
*/
|
||||
async getPriceSummary(filters: PriceFilters = {}): Promise<PriceSummary> {
|
||||
const { storeId, brandName, category, state } = filters;
|
||||
const key = cacheKey('price_summary', filters as Record<string, unknown>);
|
||||
|
||||
return (await this.cache.getOrCompute(key, async () => {
|
||||
const whereConditions: string[] = [];
|
||||
const params: (string | number)[] = [];
|
||||
let paramIndex = 1;
|
||||
|
||||
if (storeId) {
|
||||
whereConditions.push(`dp.dispensary_id = $${paramIndex++}`);
|
||||
params.push(storeId);
|
||||
}
|
||||
if (brandName) {
|
||||
whereConditions.push(`dp.brand_name = $${paramIndex++}`);
|
||||
params.push(brandName);
|
||||
}
|
||||
if (category) {
|
||||
whereConditions.push(`dp.type = $${paramIndex++}`);
|
||||
params.push(category);
|
||||
}
|
||||
if (state) {
|
||||
whereConditions.push(`d.state = $${paramIndex++}`);
|
||||
params.push(state);
|
||||
}
|
||||
|
||||
const whereClause = whereConditions.length > 0
|
||||
? 'WHERE ' + whereConditions.join(' AND ')
|
||||
: '';
|
||||
|
||||
const result = await this.pool.query(`
|
||||
WITH prices AS (
|
||||
SELECT
|
||||
extract_min_price(dp.latest_raw_payload) as min_price,
|
||||
extract_max_price(dp.latest_raw_payload) as max_price,
|
||||
extract_wholesale_price(dp.latest_raw_payload) as wholesale_price
|
||||
FROM dutchie_products dp
|
||||
JOIN dispensaries d ON dp.dispensary_id = d.id
|
||||
${whereClause}
|
||||
)
|
||||
SELECT
|
||||
AVG(min_price) as avg_price,
|
||||
AVG(wholesale_price) as avg_wholesale,
|
||||
MIN(min_price) as min_price,
|
||||
MAX(max_price) as max_price,
|
||||
STDDEV(min_price) as std_dev
|
||||
FROM prices
|
||||
WHERE min_price IS NOT NULL
|
||||
`, params);
|
||||
|
||||
const row = result.rows[0];
|
||||
const avgPrice = parseFloat(row.avg_price) || null;
|
||||
const stdDev = parseFloat(row.std_dev) || null;
|
||||
const volatility = avgPrice && stdDev ? (stdDev / avgPrice) * 100 : null;
|
||||
|
||||
return {
|
||||
avg7d: avgPrice, // Using current data as proxy
|
||||
avg30d: avgPrice,
|
||||
avg90d: avgPrice,
|
||||
wholesaleAvg7d: parseFloat(row.avg_wholesale) || null,
|
||||
wholesaleAvg30d: parseFloat(row.avg_wholesale) || null,
|
||||
wholesaleAvg90d: parseFloat(row.avg_wholesale) || null,
|
||||
minPrice: parseFloat(row.min_price) || null,
|
||||
maxPrice: parseFloat(row.max_price) || null,
|
||||
priceRange: row.max_price && row.min_price
|
||||
? parseFloat(row.max_price) - parseFloat(row.min_price)
|
||||
: null,
|
||||
volatilityScore: volatility ? Math.round(volatility * 10) / 10 : null,
|
||||
};
|
||||
}, 30)).data;
|
||||
}
|
||||
|
||||
/**
|
||||
* Detect price compression in a category
|
||||
*/
|
||||
async detectPriceCompression(
|
||||
category: string,
|
||||
state?: string
|
||||
): Promise<PriceCompressionResult> {
|
||||
const key = cacheKey('price_compression', { category, state });
|
||||
|
||||
return (await this.cache.getOrCompute(key, async () => {
|
||||
const result = await this.pool.query(`
|
||||
WITH brand_prices AS (
|
||||
SELECT
|
||||
dp.brand_name,
|
||||
AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
|
||||
COUNT(*) as sku_count
|
||||
FROM dutchie_products dp
|
||||
JOIN dispensaries d ON dp.dispensary_id = d.id
|
||||
WHERE dp.type = $1
|
||||
AND dp.brand_name IS NOT NULL
|
||||
${state ? 'AND d.state = $2' : ''}
|
||||
GROUP BY dp.brand_name
|
||||
HAVING COUNT(*) >= 3
|
||||
),
|
||||
stats AS (
|
||||
SELECT
|
||||
AVG(avg_price) as category_avg,
|
||||
STDDEV(avg_price) as std_dev
|
||||
FROM brand_prices
|
||||
WHERE avg_price IS NOT NULL
|
||||
)
|
||||
SELECT
|
||||
bp.brand_name,
|
||||
bp.avg_price,
|
||||
ABS(bp.avg_price - s.category_avg) as price_distance,
|
||||
s.category_avg,
|
||||
s.std_dev
|
||||
FROM brand_prices bp, stats s
|
||||
WHERE bp.avg_price IS NOT NULL
|
||||
ORDER BY bp.avg_price
|
||||
`, state ? [category, state] : [category]);
|
||||
|
||||
if (result.rows.length === 0) {
|
||||
return {
|
||||
category,
|
||||
brands: [],
|
||||
compressionScore: 0,
|
||||
standardDeviation: 0,
|
||||
};
|
||||
}
|
||||
|
||||
const categoryAvg = parseFloat(result.rows[0].category_avg) || 0;
|
||||
const stdDev = parseFloat(result.rows[0].std_dev) || 0;
|
||||
|
||||
// Compression score: lower std dev relative to mean = more compression
|
||||
// Scale to 0-100 where 100 = very compressed
|
||||
const cv = categoryAvg > 0 ? (stdDev / categoryAvg) * 100 : 0;
|
||||
const compressionScore = Math.max(0, Math.min(100, 100 - cv));
|
||||
|
||||
const brands = result.rows.map(row => ({
|
||||
brandName: row.brand_name,
|
||||
avgPrice: parseFloat(row.avg_price) || 0,
|
||||
priceDistance: parseFloat(row.price_distance) || 0,
|
||||
}));
|
||||
|
||||
return {
|
||||
category,
|
||||
brands,
|
||||
compressionScore: Math.round(compressionScore),
|
||||
standardDeviation: Math.round(stdDev * 100) / 100,
|
||||
};
|
||||
}, 30)).data;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get global price statistics
|
||||
*/
|
||||
async getGlobalPriceStats(): Promise<{
|
||||
totalProductsWithPrice: number;
|
||||
avgPrice: number | null;
|
||||
medianPrice: number | null;
|
||||
priceByCategory: Array<{ category: string; avgPrice: number; count: number }>;
|
||||
priceByState: Array<{ state: string; avgPrice: number; count: number }>;
|
||||
}> {
|
||||
const key = 'global_price_stats';
|
||||
|
||||
return (await this.cache.getOrCompute(key, async () => {
|
||||
const [countResult, categoryResult, stateResult] = await Promise.all([
|
||||
this.pool.query(`
|
||||
SELECT
|
||||
COUNT(*) FILTER (WHERE extract_min_price(latest_raw_payload) IS NOT NULL) as with_price,
|
||||
AVG(extract_min_price(latest_raw_payload)) as avg_price,
|
||||
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY extract_min_price(latest_raw_payload)) as median
|
||||
FROM dutchie_products
|
||||
`),
|
||||
this.pool.query(`
|
||||
SELECT
|
||||
type as category,
|
||||
AVG(extract_min_price(latest_raw_payload)) as avg_price,
|
||||
COUNT(*) as count
|
||||
FROM dutchie_products
|
||||
WHERE type IS NOT NULL
|
||||
AND extract_min_price(latest_raw_payload) IS NOT NULL
|
||||
GROUP BY type
|
||||
ORDER BY avg_price DESC
|
||||
`),
|
||||
this.pool.query(`
|
||||
SELECT
|
||||
d.state,
|
||||
AVG(extract_min_price(dp.latest_raw_payload)) as avg_price,
|
||||
COUNT(*) as count
|
||||
FROM dutchie_products dp
|
||||
JOIN dispensaries d ON dp.dispensary_id = d.id
|
||||
WHERE extract_min_price(dp.latest_raw_payload) IS NOT NULL
|
||||
GROUP BY d.state
|
||||
ORDER BY avg_price DESC
|
||||
`),
|
||||
]);
|
||||
|
||||
return {
|
||||
totalProductsWithPrice: parseInt(countResult.rows[0]?.with_price || '0'),
|
||||
avgPrice: parseFloat(countResult.rows[0]?.avg_price) || null,
|
||||
medianPrice: parseFloat(countResult.rows[0]?.median) || null,
|
||||
priceByCategory: categoryResult.rows.map(r => ({
|
||||
category: r.category,
|
||||
avgPrice: parseFloat(r.avg_price) || 0,
|
||||
count: parseInt(r.count),
|
||||
})),
|
||||
priceByState: stateResult.rows.map(r => ({
|
||||
state: r.state,
|
||||
avgPrice: parseFloat(r.avg_price) || 0,
|
||||
count: parseInt(r.count),
|
||||
})),
|
||||
};
|
||||
}, 30)).data;
|
||||
}

  // ============================================================
  // HELPER METHODS
  // ============================================================

  private calculatePriceSummary(dataPoints: PricePoint[]): PriceTrend['summary'] {
    if (dataPoints.length === 0) {
      return {
        currentAvg: null,
        previousAvg: null,
        changePercent: null,
        trend: 'stable',
        volatilityScore: null,
      };
    }

    const prices = dataPoints
      .map(d => d.avgPrice)
      .filter((p): p is number => p !== null);

    if (prices.length === 0) {
      return {
        currentAvg: null,
        previousAvg: null,
        changePercent: null,
        trend: 'stable',
        volatilityScore: null,
      };
    }

    // Compare the latest price to the series midpoint rather than the first
    // point, so a single old outlier does not dominate the change signal
    const currentAvg = prices[prices.length - 1];
    const midpoint = Math.floor(prices.length / 2);
    const previousAvg = prices.length > 1 ? prices[midpoint] : currentAvg;

    const changePercent = previousAvg > 0
      ? ((currentAvg - previousAvg) / previousAvg) * 100
      : null;

    // Calculate volatility (coefficient of variation)
    const mean = prices.reduce((a, b) => a + b, 0) / prices.length;
    const variance = prices.reduce((sum, p) => sum + Math.pow(p - mean, 2), 0) / prices.length;
    const stdDev = Math.sqrt(variance);
    const volatilityScore = mean > 0 ? (stdDev / mean) * 100 : null;

    // A move of more than +/-5% between midpoint and latest flips the trend;
    // anything inside that band reads as 'stable'
    let trend: 'up' | 'down' | 'stable' = 'stable';
    if (changePercent !== null) {
      if (changePercent > 5) trend = 'up';
      else if (changePercent < -5) trend = 'down';
    }

    return {
      currentAvg: Math.round(currentAvg * 100) / 100,
      previousAvg: Math.round(previousAvg * 100) / 100,
      changePercent: changePercent !== null ? Math.round(changePercent * 10) / 10 : null,
      trend,
      volatilityScore: volatilityScore !== null ? Math.round(volatilityScore * 10) / 10 : null,
    };
  }
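  // Worked example (illustrative, not in the original file): for prices
  // [10, 10.5, 11, 12], currentAvg = 12, midpoint index = 2, previousAvg = 11,
  // changePercent = (12 - 11) / 11 * 100 ~= 9.1 -> trend 'up'. The mean is
  // 10.875, stdDev ~= 0.74, so volatilityScore ~= 6.8.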

  private buildParams(
    baseParams: (string | number)[],
    optionalParams: Record<string, string | number | undefined>
  ): (string | number)[] {
    const params = [...baseParams];
    for (const value of Object.values(optionalParams)) {
      if (value !== undefined) {
        params.push(value);
      }
    }
    return params;
  }
}
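// Illustrative sketch (not in the original file): buildParams keeps positional
// $n placeholders stable. Assuming base params [storeId, days],
//   buildParams([42, 30], { state: 'AZ', brand: undefined })
// yields [42, 30, 'AZ'], so a conditionally appended "AND state = $3" clause
// lines up with the appended value while the undefined brand adds nothing.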
@@ -1,587 +0,0 @@
/**
 * Store Change Tracking Service
 *
 * Tracks changes at the store level including:
 * - New/lost brands
 * - New/discontinued products
 * - Stock status transitions
 * - Price changes
 * - Category movement leaderboards
 *
 * Phase 3: Analytics Dashboards
 */

import { Pool } from 'pg';
import { AnalyticsCache, cacheKey } from './cache';

export interface StoreChangeSummary {
  storeId: number;
  storeName: string;
  city: string;
  state: string;
  brandsAdded7d: number;
  brandsAdded30d: number;
  brandsLost7d: number;
  brandsLost30d: number;
  productsAdded7d: number;
  productsAdded30d: number;
  productsDiscontinued7d: number;
  productsDiscontinued30d: number;
  priceDrops7d: number;
  priceIncreases7d: number;
  restocks7d: number;
  stockOuts7d: number;
}

export interface StoreChangeEvent {
  id: number;
  storeId: number;
  storeName: string;
  eventType: string;
  eventDate: string;
  brandName: string | null;
  productName: string | null;
  category: string | null;
  oldValue: string | null;
  newValue: string | null;
  metadata: Record<string, unknown> | null;
}

export interface BrandChange {
  brandName: string;
  changeType: 'added' | 'removed';
  date: string;
  skuCount: number;
  categories: string[];
}

export interface ProductChange {
  productId: number;
  productName: string;
  brandName: string | null;
  category: string | null;
  changeType: 'added' | 'discontinued' | 'price_drop' | 'price_increase' | 'restocked' | 'out_of_stock';
  date: string;
  oldValue?: string;
  newValue?: string;
}

export interface CategoryLeaderboard {
  category: string;
  storeId: number;
  storeName: string;
  skuCount: number;
  brandCount: number;
  avgPrice: number | null;
  changePercent7d: number;
  rank: number;
}

export interface StoreFilters {
  storeId?: number;
  state?: string;
  days?: number;
  eventType?: string;
}

export class StoreChangeService {
  private pool: Pool;
  private cache: AnalyticsCache;

  constructor(pool: Pool, cache: AnalyticsCache) {
    this.pool = pool;
    this.cache = cache;
  }

  /**
   * Get change summary for a store
   */
  async getStoreChangeSummary(
    storeId: number
  ): Promise<StoreChangeSummary | null> {
    const key = cacheKey('store_change_summary', { storeId });

    return (await this.cache.getOrCompute(key, async () => {
      // Get store info
      const storeResult = await this.pool.query(`
        SELECT id, name, city, state FROM dispensaries WHERE id = $1
      `, [storeId]);

      if (storeResult.rows.length === 0) return null;
      const store = storeResult.rows[0];

      // Get change events counts
      const eventsResult = await this.pool.query(`
        SELECT
          event_type,
          COUNT(*) FILTER (WHERE event_date >= CURRENT_DATE - INTERVAL '7 days') as count_7d,
          COUNT(*) FILTER (WHERE event_date >= CURRENT_DATE - INTERVAL '30 days') as count_30d
        FROM store_change_events
        WHERE store_id = $1
        GROUP BY event_type
      `, [storeId]);

      const counts: Record<string, { count_7d: number; count_30d: number }> = {};
      eventsResult.rows.forEach(row => {
        counts[row.event_type] = {
          count_7d: parseInt(row.count_7d) || 0,
          count_30d: parseInt(row.count_30d) || 0,
        };
      });

      return {
        storeId: store.id,
        storeName: store.name,
        city: store.city,
        state: store.state,
        brandsAdded7d: counts['brand_added']?.count_7d || 0,
        brandsAdded30d: counts['brand_added']?.count_30d || 0,
        brandsLost7d: counts['brand_removed']?.count_7d || 0,
        brandsLost30d: counts['brand_removed']?.count_30d || 0,
        productsAdded7d: counts['product_added']?.count_7d || 0,
        productsAdded30d: counts['product_added']?.count_30d || 0,
        productsDiscontinued7d: counts['product_removed']?.count_7d || 0,
        productsDiscontinued30d: counts['product_removed']?.count_30d || 0,
        priceDrops7d: counts['price_drop']?.count_7d || 0,
        priceIncreases7d: counts['price_increase']?.count_7d || 0,
        restocks7d: counts['restocked']?.count_7d || 0,
        stockOuts7d: counts['out_of_stock']?.count_7d || 0,
      };
    }, 15)).data;
  }

  /**
   * Get recent change events for a store
   */
  async getStoreChangeEvents(
    storeId: number,
    filters: { eventType?: string; days?: number; limit?: number } = {}
  ): Promise<StoreChangeEvent[]> {
    const { eventType, days = 30, limit = 100 } = filters;
    const key = cacheKey('store_change_events', { storeId, eventType, days, limit });

    return (await this.cache.getOrCompute(key, async () => {
      const params: (string | number)[] = [storeId, days, limit];
      let eventTypeCondition = '';

      if (eventType) {
        eventTypeCondition = 'AND event_type = $4';
        params.push(eventType);
      }

      const result = await this.pool.query(`
        SELECT
          sce.id,
          sce.store_id,
          d.name as store_name,
          sce.event_type,
          sce.event_date,
          sce.brand_name,
          sce.product_name,
          sce.category,
          sce.old_value,
          sce.new_value,
          sce.metadata
        FROM store_change_events sce
        JOIN dispensaries d ON sce.store_id = d.id
        WHERE sce.store_id = $1
          AND sce.event_date >= CURRENT_DATE - ($2 || ' days')::INTERVAL
          ${eventTypeCondition}
        ORDER BY sce.event_date DESC, sce.id DESC
        LIMIT $3
      `, params);

      return result.rows.map(row => ({
        id: row.id,
        storeId: row.store_id,
        storeName: row.store_name,
        eventType: row.event_type,
        eventDate: row.event_date.toISOString().split('T')[0],
        brandName: row.brand_name,
        productName: row.product_name,
        category: row.category,
        oldValue: row.old_value,
        newValue: row.new_value,
        metadata: row.metadata,
      }));
    }, 5)).data;
  }
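  // Note (illustrative, not in the original file): `($2 || ' days')::INTERVAL`
  // concatenates the numeric parameter with ' days' and casts the result, so a
  // window length can be passed as a plain query parameter. Equivalent psql:
  //   SELECT CURRENT_DATE - ('30' || ' days')::INTERVAL;  -- 30 days ago
  // The eventType placeholder is $4 even though it appears before LIMIT $3;
  // Postgres binds placeholders by number, not by position in the text.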

  /**
   * Get new brands added to a store
   */
  async getNewBrands(
    storeId: number,
    days: number = 30
  ): Promise<BrandChange[]> {
    const key = cacheKey('new_brands', { storeId, days });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          brand_name,
          event_date,
          metadata
        FROM store_change_events
        WHERE store_id = $1
          AND event_type = 'brand_added'
          AND event_date >= CURRENT_DATE - ($2 || ' days')::INTERVAL
        ORDER BY event_date DESC
      `, [storeId, days]);

      return result.rows.map(row => ({
        brandName: row.brand_name,
        changeType: 'added' as const,
        date: row.event_date.toISOString().split('T')[0],
        skuCount: row.metadata?.sku_count || 0,
        categories: row.metadata?.categories || [],
      }));
    }, 15)).data;
  }

  /**
   * Get brands lost from a store
   */
  async getLostBrands(
    storeId: number,
    days: number = 30
  ): Promise<BrandChange[]> {
    const key = cacheKey('lost_brands', { storeId, days });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          brand_name,
          event_date,
          metadata
        FROM store_change_events
        WHERE store_id = $1
          AND event_type = 'brand_removed'
          AND event_date >= CURRENT_DATE - ($2 || ' days')::INTERVAL
        ORDER BY event_date DESC
      `, [storeId, days]);

      return result.rows.map(row => ({
        brandName: row.brand_name,
        changeType: 'removed' as const,
        date: row.event_date.toISOString().split('T')[0],
        skuCount: row.metadata?.sku_count || 0,
        categories: row.metadata?.categories || [],
      }));
    }, 15)).data;
  }

  /**
   * Get product changes for a store
   */
  async getProductChanges(
    storeId: number,
    changeType?: 'added' | 'discontinued' | 'price_drop' | 'price_increase' | 'restocked' | 'out_of_stock',
    days: number = 7
  ): Promise<ProductChange[]> {
    const key = cacheKey('product_changes', { storeId, changeType, days });

    return (await this.cache.getOrCompute(key, async () => {
      const eventTypeMap: Record<string, string> = {
        'added': 'product_added',
        'discontinued': 'product_removed',
        'price_drop': 'price_drop',
        'price_increase': 'price_increase',
        'restocked': 'restocked',
        'out_of_stock': 'out_of_stock',
      };

      const params: (string | number)[] = [storeId, days];
      let eventCondition = '';

      if (changeType) {
        eventCondition = 'AND event_type = $3';
        params.push(eventTypeMap[changeType]);
      }

      const result = await this.pool.query(`
        SELECT
          product_id,
          product_name,
          brand_name,
          category,
          event_type,
          event_date,
          old_value,
          new_value
        FROM store_change_events
        WHERE store_id = $1
          AND event_date >= CURRENT_DATE - ($2 || ' days')::INTERVAL
          AND product_id IS NOT NULL
          ${eventCondition}
        ORDER BY event_date DESC
        LIMIT 100
      `, params);

      const reverseMap: Record<string, ProductChange['changeType']> = {
        'product_added': 'added',
        'product_removed': 'discontinued',
        'price_drop': 'price_drop',
        'price_increase': 'price_increase',
        'restocked': 'restocked',
        'out_of_stock': 'out_of_stock',
      };

      return result.rows.map(row => ({
        productId: row.product_id,
        productName: row.product_name,
        brandName: row.brand_name,
        category: row.category,
        changeType: reverseMap[row.event_type] || 'added',
        date: row.event_date.toISOString().split('T')[0],
        oldValue: row.old_value,
        newValue: row.new_value,
      }));
    }, 5)).data;
  }
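  // Illustrative usage (not in the original file): the API-facing change types
  // round-trip through eventTypeMap/reverseMap, e.g.
  //   await storeChangeService.getProductChanges(42, 'discontinued', 14)
  // filters on event_type = 'product_removed' and maps each row back to
  // changeType 'discontinued' in the response.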

  /**
   * Get category leaderboard across stores
   */
  async getCategoryLeaderboard(
    category: string,
    limit: number = 20
  ): Promise<CategoryLeaderboard[]> {
    const key = cacheKey('category_leaderboard', { category, limit });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        WITH store_category_stats AS (
          SELECT
            dp.dispensary_id as store_id,
            d.name as store_name,
            COUNT(*) as sku_count,
            COUNT(DISTINCT dp.brand_name) as brand_count,
            AVG(extract_min_price(dp.latest_raw_payload)) as avg_price
          FROM dutchie_products dp
          JOIN dispensaries d ON dp.dispensary_id = d.id
          WHERE dp.type = $1
          GROUP BY dp.dispensary_id, d.name
        )
        SELECT
          scs.*,
          RANK() OVER (ORDER BY scs.sku_count DESC) as rank
        FROM store_category_stats scs
        ORDER BY scs.sku_count DESC
        LIMIT $2
      `, [category, limit]);

      return result.rows.map(row => ({
        category,
        storeId: row.store_id,
        storeName: row.store_name,
        skuCount: parseInt(row.sku_count) || 0,
        brandCount: parseInt(row.brand_count) || 0,
        avgPrice: row.avg_price ? Math.round(parseFloat(row.avg_price) * 100) / 100 : null,
        changePercent7d: 0, // Would need historical data
        rank: parseInt(row.rank) || 0,
      }));
    }, 15)).data;
  }
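  // Note (illustrative, not in the original file): RANK() gives ties the same
  // rank and skips the next one, so stores with sku_counts 120, 120, 90 rank
  // 1, 1, 3. DENSE_RANK() would yield 1, 1, 2 if gap-free ranks were preferred.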

  /**
   * Get stores with most activity (changes)
   */
  async getMostActiveStores(
    days: number = 7,
    limit: number = 10
  ): Promise<Array<{
    storeId: number;
    storeName: string;
    city: string;
    state: string;
    totalChanges: number;
    brandsChanged: number;
    productsChanged: number;
    priceChanges: number;
    stockChanges: number;
  }>> {
    const key = cacheKey('most_active_stores', { days, limit });

    return (await this.cache.getOrCompute(key, async () => {
      const result = await this.pool.query(`
        SELECT
          d.id as store_id,
          d.name as store_name,
          d.city,
          d.state,
          COUNT(*) as total_changes,
          COUNT(*) FILTER (WHERE sce.event_type IN ('brand_added', 'brand_removed')) as brands_changed,
          COUNT(*) FILTER (WHERE sce.event_type IN ('product_added', 'product_removed')) as products_changed,
          COUNT(*) FILTER (WHERE sce.event_type IN ('price_drop', 'price_increase')) as price_changes,
          COUNT(*) FILTER (WHERE sce.event_type IN ('restocked', 'out_of_stock')) as stock_changes
        FROM store_change_events sce
        JOIN dispensaries d ON sce.store_id = d.id
        WHERE sce.event_date >= CURRENT_DATE - ($1 || ' days')::INTERVAL
        GROUP BY d.id, d.name, d.city, d.state
        ORDER BY total_changes DESC
        LIMIT $2
      `, [days, limit]);

      return result.rows.map(row => ({
        storeId: row.store_id,
        storeName: row.store_name,
        city: row.city,
        state: row.state,
        totalChanges: parseInt(row.total_changes) || 0,
        brandsChanged: parseInt(row.brands_changed) || 0,
        productsChanged: parseInt(row.products_changed) || 0,
        priceChanges: parseInt(row.price_changes) || 0,
        stockChanges: parseInt(row.stock_changes) || 0,
      }));
    }, 15)).data;
  }

  /**
   * Compare two stores
   */
  async compareStores(
    storeId1: number,
    storeId2: number
  ): Promise<{
    store1: { id: number; name: string; brands: string[]; categories: string[]; skuCount: number };
    store2: { id: number; name: string; brands: string[]; categories: string[]; skuCount: number };
    sharedBrands: string[];
    uniqueToStore1: string[];
    uniqueToStore2: string[];
    categoryComparison: Array<{
      category: string;
      store1Skus: number;
      store2Skus: number;
      difference: number;
    }>;
  }> {
    const key = cacheKey('compare_stores', { storeId1, storeId2 });

    return (await this.cache.getOrCompute(key, async () => {
      const [store1Data, store2Data] = await Promise.all([
        this.pool.query(`
          SELECT
            d.id, d.name,
            ARRAY_AGG(DISTINCT dp.brand_name) FILTER (WHERE dp.brand_name IS NOT NULL) as brands,
            ARRAY_AGG(DISTINCT dp.type) FILTER (WHERE dp.type IS NOT NULL) as categories,
            COUNT(*) as sku_count
          FROM dispensaries d
          LEFT JOIN dutchie_products dp ON d.id = dp.dispensary_id
          WHERE d.id = $1
          GROUP BY d.id, d.name
        `, [storeId1]),
        this.pool.query(`
          SELECT
            d.id, d.name,
            ARRAY_AGG(DISTINCT dp.brand_name) FILTER (WHERE dp.brand_name IS NOT NULL) as brands,
            ARRAY_AGG(DISTINCT dp.type) FILTER (WHERE dp.type IS NOT NULL) as categories,
            COUNT(*) as sku_count
          FROM dispensaries d
          LEFT JOIN dutchie_products dp ON d.id = dp.dispensary_id
          WHERE d.id = $1
          GROUP BY d.id, d.name
        `, [storeId2]),
      ]);

      const s1 = store1Data.rows[0];
      const s2 = store2Data.rows[0];

      const brands1Array: string[] = (s1?.brands || []).filter((b: string | null): b is string => b !== null);
      const brands2Array: string[] = (s2?.brands || []).filter((b: string | null): b is string => b !== null);
      const brands1 = new Set(brands1Array);
      const brands2 = new Set(brands2Array);

      const sharedBrands: string[] = brands1Array.filter(b => brands2.has(b));
      const uniqueToStore1: string[] = brands1Array.filter(b => !brands2.has(b));
      const uniqueToStore2: string[] = brands2Array.filter(b => !brands1.has(b));

      // Category comparison
      const categoryResult = await this.pool.query(`
        WITH store1_cats AS (
          SELECT type as category, COUNT(*) as sku_count
          FROM dutchie_products WHERE dispensary_id = $1 AND type IS NOT NULL
          GROUP BY type
        ),
        store2_cats AS (
          SELECT type as category, COUNT(*) as sku_count
          FROM dutchie_products WHERE dispensary_id = $2 AND type IS NOT NULL
          GROUP BY type
        ),
        all_cats AS (
          SELECT category FROM store1_cats
          UNION
          SELECT category FROM store2_cats
        )
        SELECT
          ac.category,
          COALESCE(s1.sku_count, 0) as store1_skus,
          COALESCE(s2.sku_count, 0) as store2_skus
        FROM all_cats ac
        LEFT JOIN store1_cats s1 ON ac.category = s1.category
        LEFT JOIN store2_cats s2 ON ac.category = s2.category
        ORDER BY (COALESCE(s1.sku_count, 0) + COALESCE(s2.sku_count, 0)) DESC
      `, [storeId1, storeId2]);

      return {
        store1: {
          id: s1?.id || storeId1,
          name: s1?.name || 'Unknown',
          brands: s1?.brands || [],
          categories: s1?.categories || [],
          skuCount: parseInt(s1?.sku_count) || 0,
        },
        store2: {
          id: s2?.id || storeId2,
          name: s2?.name || 'Unknown',
          brands: s2?.brands || [],
          categories: s2?.categories || [],
          skuCount: parseInt(s2?.sku_count) || 0,
        },
        sharedBrands,
        uniqueToStore1,
        uniqueToStore2,
        categoryComparison: categoryResult.rows.map(row => ({
          category: row.category,
          store1Skus: parseInt(row.store1_skus) || 0,
          store2Skus: parseInt(row.store2_skus) || 0,
          difference: (parseInt(row.store1_skus) || 0) - (parseInt(row.store2_skus) || 0),
        })),
      };
    }, 15)).data;
  }
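  // Illustrative sketch (not in the original file): the brand overlap is a
  // plain set intersection/difference. With store 1 carrying {A, B, C} and
  // store 2 carrying {B, C, D}: sharedBrands = ['B', 'C'],
  // uniqueToStore1 = ['A'], uniqueToStore2 = ['D'].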

  /**
   * Record a change event (used by crawler/worker)
   */
  async recordChangeEvent(event: {
    storeId: number;
    eventType: string;
    brandName?: string;
    productId?: number;
    productName?: string;
    category?: string;
    oldValue?: string;
    newValue?: string;
    metadata?: Record<string, unknown>;
  }): Promise<void> {
    await this.pool.query(`
      INSERT INTO store_change_events
        (store_id, event_type, event_date, brand_name, product_id, product_name, category, old_value, new_value, metadata)
      VALUES ($1, $2, CURRENT_DATE, $3, $4, $5, $6, $7, $8, $9)
    `, [
      event.storeId,
      event.eventType,
      event.brandName || null,
      event.productId || null,
      event.productName || null,
      event.category || null,
      event.oldValue || null,
      event.newValue || null,
      event.metadata ? JSON.stringify(event.metadata) : null,
    ]);

    // Invalidate cache
    await this.cache.invalidatePattern(`store_change_summary:storeId=${event.storeId}`);
  }
}
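// Illustrative usage (not in the original file), assuming a configured pool
// and AnalyticsCache; the product name and values are made up:
//   const changes = new StoreChangeService(pool, cache);
//   await changes.recordChangeEvent({
//     storeId: 42,
//     eventType: 'price_drop',
//     productId: 1001,
//     productName: 'Example OG 3.5g',
//     oldValue: '45.00',
//     newValue: '39.00',
//   });
// The insert also invalidates the store's cached change summary, so the next
// getStoreChangeSummary(42) recomputes from store_change_events.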

@@ -1,266 +0,0 @@
/**
 * LEGACY SERVICE - AZDHS Import
 *
 * DEPRECATED: This service creates its own database pool.
 * Future implementations should use the canonical CannaiQ connection.
 *
 * Imports Arizona dispensaries from the main database's dispensaries table
 * (which was populated from AZDHS data) into the isolated Dutchie AZ database.
 *
 * This establishes the canonical list of AZ dispensaries to match against Dutchie.
 *
 * DO NOT:
 * - Run this in automated jobs
 * - Use DATABASE_URL directly
 */

import { Pool } from 'pg';
import { query as dutchieQuery } from '../db/connection';
import { Dispensary } from '../types';

// Single database connection (cannaiq in cannaiq-postgres container)
// Use CANNAIQ_DB_* env vars or defaults
const MAIN_DB_CONNECTION = process.env.CANNAIQ_DB_URL ||
  `postgresql://${process.env.CANNAIQ_DB_USER || 'dutchie'}:${process.env.CANNAIQ_DB_PASS || 'dutchie_local_pass'}@${process.env.CANNAIQ_DB_HOST || 'localhost'}:${process.env.CANNAIQ_DB_PORT || '54320'}/${process.env.CANNAIQ_DB_NAME || 'cannaiq'}`;

/**
 * AZDHS dispensary record from the main database
 */
interface AZDHSDispensary {
  id: number;
  azdhs_id: number;
  name: string;
  company_name?: string;
  address?: string;
  city: string;
  state: string;
  zip?: string;
  latitude?: number;
  longitude?: number;
  dba_name?: string;
  phone?: string;
  email?: string;
  website?: string;
  google_rating?: string;
  google_review_count?: number;
  slug: string;
  menu_provider?: string;
  product_provider?: string;
  created_at: Date;
  updated_at: Date;
}

/**
 * Import result statistics
 */
interface ImportResult {
  total: number;
  imported: number;
  skipped: number;
  errors: string[];
}

/**
 * Create a temporary connection to the main database
 */
function getMainDBPool(): Pool {
  console.warn('[AZDHS Import] LEGACY: Using separate pool. Should use canonical CannaiQ connection.');
  return new Pool({
    connectionString: MAIN_DB_CONNECTION,
    max: 5,
    idleTimeoutMillis: 30000,
    connectionTimeoutMillis: 5000,
  });
}

/**
 * Fetch all AZ dispensaries from the main database
 */
async function fetchAZDHSDispensaries(): Promise<AZDHSDispensary[]> {
  const pool = getMainDBPool();

  try {
    const result = await pool.query<AZDHSDispensary>(`
      SELECT
        id, azdhs_id, name, company_name, address, city, state, zip,
        latitude, longitude, dba_name, phone, email, website,
        google_rating, google_review_count, slug,
        menu_provider, product_provider,
        created_at, updated_at
      FROM dispensaries
      WHERE state = 'AZ'
      ORDER BY id
    `);

    return result.rows;
  } finally {
    await pool.end();
  }
}

/**
 * Import a single dispensary into the Dutchie AZ database
 */
async function importDispensary(disp: AZDHSDispensary): Promise<number> {
  const result = await dutchieQuery<{ id: number }>(
    `
    INSERT INTO dispensaries (
      platform, name, slug, city, state, postal_code, address,
      latitude, longitude, is_delivery, is_pickup, raw_metadata, updated_at
    ) VALUES (
      $1, $2, $3, $4, $5, $6, $7,
      $8, $9, $10, $11, $12, NOW()
    )
    ON CONFLICT (platform, slug, city, state) DO UPDATE SET
      name = EXCLUDED.name,
      postal_code = EXCLUDED.postal_code,
      address = EXCLUDED.address,
      latitude = EXCLUDED.latitude,
      longitude = EXCLUDED.longitude,
      raw_metadata = EXCLUDED.raw_metadata,
      updated_at = NOW()
    RETURNING id
    `,
    [
      'dutchie', // Will be updated when Dutchie match is found
      disp.dba_name || disp.name,
      disp.slug,
      disp.city,
      disp.state,
      disp.zip,
      disp.address,
      disp.latitude,
      disp.longitude,
      false, // is_delivery - unknown
      true, // is_pickup - assume true
      JSON.stringify({
        azdhs_id: disp.azdhs_id,
        main_db_id: disp.id,
        company_name: disp.company_name,
        phone: disp.phone,
        email: disp.email,
        website: disp.website,
        google_rating: disp.google_rating,
        google_review_count: disp.google_review_count,
        menu_provider: disp.menu_provider,
        product_provider: disp.product_provider,
      }),
    ]
  );

  return result.rows[0].id;
}

/**
 * Import all AZDHS dispensaries into the Dutchie AZ database
 */
export async function importAZDHSDispensaries(): Promise<ImportResult> {
  console.log('[AZDHS Import] Starting import from main database...');

  const result: ImportResult = {
    total: 0,
    imported: 0,
    skipped: 0,
    errors: [],
  };

  try {
    const dispensaries = await fetchAZDHSDispensaries();
    result.total = dispensaries.length;

    console.log(`[AZDHS Import] Found ${dispensaries.length} AZ dispensaries in main DB`);

    for (const disp of dispensaries) {
      try {
        const id = await importDispensary(disp);
        result.imported++;
        console.log(`[AZDHS Import] Imported: ${disp.name} (${disp.city}) -> id=${id}`);
      } catch (error: any) {
        if (error.message.includes('duplicate')) {
          result.skipped++;
        } else {
          result.errors.push(`${disp.name}: ${error.message}`);
        }
      }
    }
  } catch (error: any) {
    result.errors.push(`Failed to fetch from main DB: ${error.message}`);
  }

  console.log(`[AZDHS Import] Complete: ${result.imported} imported, ${result.skipped} skipped, ${result.errors.length} errors`);
  return result;
}

/**
 * Import dispensaries from JSON file (backup export)
 */
export async function importFromJSON(jsonPath: string): Promise<ImportResult> {
  console.log(`[AZDHS Import] Importing from JSON: ${jsonPath}`);

  const result: ImportResult = {
    total: 0,
    imported: 0,
    skipped: 0,
    errors: [],
  };

  try {
    const fs = await import('fs/promises');
    const data = await fs.readFile(jsonPath, 'utf-8');
    const dispensaries: AZDHSDispensary[] = JSON.parse(data);

    result.total = dispensaries.length;
    console.log(`[AZDHS Import] Found ${dispensaries.length} dispensaries in JSON file`);

    for (const disp of dispensaries) {
      try {
        await importDispensary(disp);
        result.imported++;
      } catch (error: any) {
        if (error.message.includes('duplicate')) {
          result.skipped++;
        } else {
          result.errors.push(`${disp.name}: ${error.message}`);
        }
      }
    }
  } catch (error: any) {
    result.errors.push(`Failed to read JSON file: ${error.message}`);
  }

  console.log(`[AZDHS Import] Complete: ${result.imported} imported, ${result.skipped} skipped, ${result.errors.length} errors`);
  return result;
}

/**
 * Get import statistics
 */
export async function getImportStats(): Promise<{
  totalDispensaries: number;
  withPlatformIds: number;
  withoutPlatformIds: number;
  lastImportedAt?: Date;
}> {
  const { rows } = await dutchieQuery<{
    total: string;
    with_platform_id: string;
    without_platform_id: string;
    last_updated: Date;
  }>(`
    SELECT
      COUNT(*) as total,
      COUNT(platform_dispensary_id) as with_platform_id,
      COUNT(*) - COUNT(platform_dispensary_id) as without_platform_id,
      MAX(updated_at) as last_updated
    FROM dispensaries
    WHERE state = 'AZ'
  `);

  const stats = rows[0];
  return {
    totalDispensaries: parseInt(stats.total, 10),
    withPlatformIds: parseInt(stats.with_platform_id, 10),
    withoutPlatformIds: parseInt(stats.without_platform_id, 10),
    lastImportedAt: stats.last_updated,
  };
}
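// Illustrative usage (not in the original file): a one-off manual run, in line
// with the "do not run in automated jobs" warning above:
//   const result = await importAZDHSDispensaries();
//   if (result.errors.length > 0) console.error(result.errors);
//   const stats = await getImportStats();
//   console.log(`${stats.withPlatformIds}/${stats.totalDispensaries} resolved`);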

@@ -1,481 +0,0 @@
/**
 * Directory-Based Store Matcher
 *
 * Scrapes provider directory pages (Curaleaf, Sol, etc.) to get store lists,
 * then matches them to existing dispensaries by fuzzy name/city/address matching.
 *
 * This allows us to:
 * 1. Find specific store URLs for directory-style websites
 * 2. Match stores confidently by name+city
 * 3. Mark non-Dutchie providers as not_crawlable until we build crawlers
 */

import { query } from '../db/connection';

// ============================================================
// TYPES
// ============================================================

export interface DirectoryStore {
  name: string;
  city: string;
  state: string;
  address: string | null;
  storeUrl: string;
}

export interface MatchResult {
  directoryStore: DirectoryStore;
  dispensaryId: number | null;
  dispensaryName: string | null;
  confidence: 'high' | 'medium' | 'low' | 'none';
  matchReason: string;
}

export interface DirectoryMatchReport {
  provider: string;
  totalDirectoryStores: number;
  highConfidenceMatches: number;
  mediumConfidenceMatches: number;
  lowConfidenceMatches: number;
  unmatched: number;
  results: MatchResult[];
}

// ============================================================
// NORMALIZATION FUNCTIONS
// ============================================================

/**
 * Normalize a string for comparison:
 * - Lowercase
 * - Remove common suffixes (dispensary, cannabis, etc.)
 * - Remove punctuation
 * - Collapse whitespace
 */
function normalizeForComparison(str: string): string {
  if (!str) return '';

  return str
    .toLowerCase()
    .replace(/\s+(dispensary|cannabis|marijuana|medical|recreational|shop|store|flower|wellness)(\s|$)/gi, ' ')
    .replace(/[^\w\s]/g, ' ') // Remove punctuation
    .replace(/\s+/g, ' ') // Collapse whitespace
    .trim();
}

/**
 * Normalize city name for comparison
 */
function normalizeCity(city: string): string {
  if (!city) return '';

  return city
    .toLowerCase()
    .replace(/[^\w\s]/g, '')
    .trim();
}

/**
 * Calculate similarity between two strings (0-1)
 * Uses Levenshtein distance normalized by max length
 */
function stringSimilarity(a: string, b: string): number {
  if (!a || !b) return 0;
  if (a === b) return 1;

  const longer = a.length > b.length ? a : b;
  const shorter = a.length > b.length ? b : a;

  if (longer.length === 0) return 1;

  const distance = levenshteinDistance(longer, shorter);
  return (longer.length - distance) / longer.length;
}

/**
 * Levenshtein distance between two strings
 */
function levenshteinDistance(a: string, b: string): number {
  const matrix: number[][] = [];

  for (let i = 0; i <= b.length; i++) {
    matrix[i] = [i];
  }

  for (let j = 0; j <= a.length; j++) {
    matrix[0][j] = j;
  }

  for (let i = 1; i <= b.length; i++) {
    for (let j = 1; j <= a.length; j++) {
      if (b.charAt(i - 1) === a.charAt(j - 1)) {
        matrix[i][j] = matrix[i - 1][j - 1];
      } else {
        matrix[i][j] = Math.min(
          matrix[i - 1][j - 1] + 1, // substitution
          matrix[i][j - 1] + 1, // insertion
          matrix[i - 1][j] + 1 // deletion
        );
      }
    }
  }

  return matrix[b.length][a.length];
}
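// Worked example (illustrative, not in the original file):
//   normalizeForComparison('Curaleaf - Gilbert, AZ') -> 'curaleaf gilbert az'
// Against 'curaleaf gilbert' the Levenshtein distance is 3 (the appended
// ' az'), over a longer length of 19, so
//   stringSimilarity(...) = (19 - 3) / 19 ~= 0.84.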

/**
 * Check if string contains another (with normalization)
 */
function containsNormalized(haystack: string, needle: string): boolean {
  return normalizeForComparison(haystack).includes(normalizeForComparison(needle));
}

// ============================================================
// PROVIDER DIRECTORY SCRAPERS
// ============================================================

/**
 * Sol Flower (livewithsol.com) - Static HTML, easy to scrape
 */
export async function scrapeSolDirectory(): Promise<DirectoryStore[]> {
  console.log('[DirectoryMatcher] Scraping Sol Flower directory...');

  try {
    const response = await fetch('https://www.livewithsol.com/locations/', {
      headers: {
        'User-Agent':
          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        Accept: 'text/html',
      },
    });

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }

    const html = await response.text();

    // Extract store entries from HTML
    // Sol's structure: Each location has name, address in specific divs
    const stores: DirectoryStore[] = [];

    // Pattern to find location cards
    // Format: <a href="/locations/slug/">NAME</a> with address nearby
    const locationRegex =
      /<a[^>]+href="(\/locations\/[^"]+)"[^>]*>([^<]+)<\/a>[\s\S]*?(\d+[^<]+(?:Ave|St|Blvd|Dr|Rd|Way)[^<]*)/gi;

    let match;
    while ((match = locationRegex.exec(html)) !== null) {
      const [, path, name, address] = match;

      // Extract city from common Arizona cities
      let city = 'Unknown';
      const cityPatterns = [
        { pattern: /phoenix/i, city: 'Phoenix' },
        { pattern: /scottsdale/i, city: 'Scottsdale' },
        { pattern: /tempe/i, city: 'Tempe' },
        { pattern: /tucson/i, city: 'Tucson' },
        { pattern: /mesa/i, city: 'Mesa' },
        { pattern: /sun city/i, city: 'Sun City' },
        { pattern: /glendale/i, city: 'Glendale' },
      ];

      for (const { pattern, city: cityName } of cityPatterns) {
        if (pattern.test(name) || pattern.test(address)) {
          city = cityName;
          break;
        }
      }

      stores.push({
        name: name.trim(),
        city,
        state: 'AZ',
        address: address.trim(),
        storeUrl: `https://www.livewithsol.com${path}`,
      });
    }

    // If regex didn't work, use known hardcoded values (fallback)
    if (stores.length === 0) {
      console.log('[DirectoryMatcher] Using hardcoded Sol locations');
      return [
        { name: 'Sol Flower 32nd & Shea', city: 'Phoenix', state: 'AZ', address: '3217 E Shea Blvd Suite 1 A', storeUrl: 'https://www.livewithsol.com/locations/deer-valley/' },
        { name: 'Sol Flower Scottsdale Airpark', city: 'Scottsdale', state: 'AZ', address: '14980 N 78th Way Ste 204', storeUrl: 'https://www.livewithsol.com/locations/scottsdale-airpark/' },
        { name: 'Sol Flower Sun City', city: 'Sun City', state: 'AZ', address: '13650 N 99th Ave', storeUrl: 'https://www.livewithsol.com/locations/sun-city/' },
        { name: 'Sol Flower Tempe McClintock', city: 'Tempe', state: 'AZ', address: '1322 N McClintock Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-mcclintock/' },
        { name: 'Sol Flower Tempe University', city: 'Tempe', state: 'AZ', address: '2424 W University Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-university/' },
        { name: 'Sol Flower Foothills Tucson', city: 'Tucson', state: 'AZ', address: '6026 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/foothills-tucson/' },
        { name: 'Sol Flower South Tucson', city: 'Tucson', state: 'AZ', address: '3000 W Valencia Rd Ste 210', storeUrl: 'https://www.livewithsol.com/locations/south-tucson/' },
        { name: 'Sol Flower North Tucson', city: 'Tucson', state: 'AZ', address: '4837 N 1st Ave', storeUrl: 'https://www.livewithsol.com/locations/north-tucson/' },
        { name: 'Sol Flower Casas Adobes', city: 'Tucson', state: 'AZ', address: '6437 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/casas-adobes/' },
      ];
    }

    console.log(`[DirectoryMatcher] Found ${stores.length} Sol Flower locations`);
    return stores;
  } catch (error: any) {
    console.error('[DirectoryMatcher] Error scraping Sol directory:', error.message);
    // Return hardcoded fallback
    return [
      { name: 'Sol Flower 32nd & Shea', city: 'Phoenix', state: 'AZ', address: '3217 E Shea Blvd Suite 1 A', storeUrl: 'https://www.livewithsol.com/locations/deer-valley/' },
      { name: 'Sol Flower Scottsdale Airpark', city: 'Scottsdale', state: 'AZ', address: '14980 N 78th Way Ste 204', storeUrl: 'https://www.livewithsol.com/locations/scottsdale-airpark/' },
      { name: 'Sol Flower Sun City', city: 'Sun City', state: 'AZ', address: '13650 N 99th Ave', storeUrl: 'https://www.livewithsol.com/locations/sun-city/' },
      { name: 'Sol Flower Tempe McClintock', city: 'Tempe', state: 'AZ', address: '1322 N McClintock Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-mcclintock/' },
      { name: 'Sol Flower Tempe University', city: 'Tempe', state: 'AZ', address: '2424 W University Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-university/' },
      { name: 'Sol Flower Foothills Tucson', city: 'Tucson', state: 'AZ', address: '6026 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/foothills-tucson/' },
      { name: 'Sol Flower South Tucson', city: 'Tucson', state: 'AZ', address: '3000 W Valencia Rd Ste 210', storeUrl: 'https://www.livewithsol.com/locations/south-tucson/' },
      { name: 'Sol Flower North Tucson', city: 'Tucson', state: 'AZ', address: '4837 N 1st Ave', storeUrl: 'https://www.livewithsol.com/locations/north-tucson/' },
      { name: 'Sol Flower Casas Adobes', city: 'Tucson', state: 'AZ', address: '6437 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/casas-adobes/' },
    ];
  }
}

/**
 * Curaleaf - Has age-gate, so we need hardcoded AZ locations
 * In production, this would use Playwright to bypass age-gate
 */
export async function scrapeCuraleafDirectory(): Promise<DirectoryStore[]> {
  console.log('[DirectoryMatcher] Using hardcoded Curaleaf AZ locations (age-gate blocks simple fetch)...');

  // Hardcoded Arizona Curaleaf locations from public knowledge
  // These would be scraped via Playwright in production
  return [
    { name: 'Curaleaf Phoenix Camelback', city: 'Phoenix', state: 'AZ', address: '4811 E Camelback Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-phoenix-camelback' },
    { name: 'Curaleaf Phoenix Midtown', city: 'Phoenix', state: 'AZ', address: '1928 E Highland Ave', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-phoenix-midtown' },
    { name: 'Curaleaf Glendale East', city: 'Glendale', state: 'AZ', address: '5150 W Glendale Ave', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-glendale-east' },
    { name: 'Curaleaf Glendale West', city: 'Glendale', state: 'AZ', address: '6501 W Glendale Ave', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-glendale-west' },
    { name: 'Curaleaf Gilbert', city: 'Gilbert', state: 'AZ', address: '1736 E Williams Field Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-gilbert' },
    { name: 'Curaleaf Mesa', city: 'Mesa', state: 'AZ', address: '1540 S Power Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-mesa' },
    { name: 'Curaleaf Tempe', city: 'Tempe', state: 'AZ', address: '1815 E Broadway Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-tempe' },
    { name: 'Curaleaf Scottsdale', city: 'Scottsdale', state: 'AZ', address: '8904 E Indian Bend Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-scottsdale' },
    { name: 'Curaleaf Tucson Prince', city: 'Tucson', state: 'AZ', address: '3955 W Prince Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-tucson-prince' },
    { name: 'Curaleaf Tucson Midvale', city: 'Tucson', state: 'AZ', address: '2936 N Midvale Park Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-tucson-midvale' },
    { name: 'Curaleaf Sedona', city: 'Sedona', state: 'AZ', address: '525 AZ-179', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-sedona' },
    { name: 'Curaleaf Youngtown', city: 'Youngtown', state: 'AZ', address: '11125 W Grand Ave', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-youngtown' },
  ];
}

// ============================================================
// MATCHING LOGIC
// ============================================================

interface Dispensary {
  id: number;
  name: string;
  city: string | null;
  state: string | null;
  address: string | null;
  menu_type: string | null;
  menu_url: string | null;
  website: string | null;
}

/**
 * Match a directory store to an existing dispensary
 */
function matchStoreToDispensary(store: DirectoryStore, dispensaries: Dispensary[]): MatchResult {
  const normalizedStoreName = normalizeForComparison(store.name);
  const normalizedStoreCity = normalizeCity(store.city);

  let bestMatch: Dispensary | null = null;
  let bestScore = 0;
  let matchReason = '';

  for (const disp of dispensaries) {
    const normalizedDispName = normalizeForComparison(disp.name);
    const normalizedDispCity = normalizeCity(disp.city || '');

    let score = 0;
    const reasons: string[] = [];

    // 1. Name similarity (max 50 points)
    const nameSimilarity = stringSimilarity(normalizedStoreName, normalizedDispName);
    score += nameSimilarity * 50;
    if (nameSimilarity > 0.8) reasons.push(`name_match(${(nameSimilarity * 100).toFixed(0)}%)`);

    // 2. City match (25 points for exact, 15 for partial)
    if (normalizedStoreCity && normalizedDispCity) {
      if (normalizedStoreCity === normalizedDispCity) {
        score += 25;
        reasons.push('city_exact');
      } else if (
        normalizedStoreCity.includes(normalizedDispCity) ||
        normalizedDispCity.includes(normalizedStoreCity)
      ) {
        score += 15;
        reasons.push('city_partial');
      }
    }

    // 3. Address contains street name (15 points)
    if (store.address && disp.address) {
      const storeStreet = store.address.toLowerCase().split(/\s+/).slice(1, 4).join(' ');
      const dispStreet = disp.address.toLowerCase().split(/\s+/).slice(1, 4).join(' ');
      if (storeStreet && dispStreet && stringSimilarity(storeStreet, dispStreet) > 0.7) {
        score += 15;
        reasons.push('address_match');
      }
    }

    // 4. Brand name in dispensary name (10 points)
    const brandName = store.name.split(' ')[0].toLowerCase(); // e.g., "Curaleaf", "Sol"
    if (disp.name.toLowerCase().includes(brandName)) {
      score += 10;
      reasons.push('brand_match');
    }

    if (score > bestScore) {
      bestScore = score;
      bestMatch = disp;
      matchReason = reasons.join(', ');
    }
  }

  // Determine confidence level
  let confidence: 'high' | 'medium' | 'low' | 'none';
  if (bestScore >= 70) {
    confidence = 'high';
  } else if (bestScore >= 50) {
    confidence = 'medium';
  } else if (bestScore >= 30) {
    confidence = 'low';
  } else {
    confidence = 'none';
  }

  return {
    directoryStore: store,
    dispensaryId: bestMatch?.id || null,
    dispensaryName: bestMatch?.name || null,
    confidence,
    matchReason: matchReason || 'no_match',
  };
}
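// Worked example (illustrative, not in the original file): a directory store
// 'Sol Flower Sun City' against dispensary 'Sol Flower Dispensary Sun City'
// in the same city with a matching street scores roughly
// 50 (name, since normalization strips 'Dispensary') + 25 (city_exact)
// + 15 (address_match) + 10 (brand_match), clearing the >= 70 'high'
// threshold; 50-69 is 'medium' and 30-49 is 'low'.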

// ============================================================
// MAIN FUNCTIONS
// ============================================================

/**
 * Run directory matching for a provider and update database
 * Only applies high-confidence matches automatically
 */
export async function matchDirectoryToDispensaries(
  provider: 'curaleaf' | 'sol',
  dryRun: boolean = true
): Promise<DirectoryMatchReport> {
  console.log(`[DirectoryMatcher] Running ${provider} directory matching (dryRun=${dryRun})...`);

  // Get directory stores
  let directoryStores: DirectoryStore[];
  if (provider === 'curaleaf') {
    directoryStores = await scrapeCuraleafDirectory();
  } else if (provider === 'sol') {
    directoryStores = await scrapeSolDirectory();
  } else {
    throw new Error(`Unknown provider: ${provider}`);
  }

  // Get all AZ dispensaries from database
  const { rows: dispensaries } = await query<Dispensary>(
    `SELECT id, name, city, state, address, menu_type, menu_url, website
     FROM dispensaries
     WHERE state = 'AZ'`
  );

  console.log(`[DirectoryMatcher] Matching ${directoryStores.length} directory stores against ${dispensaries.length} dispensaries`);

  // Match each directory store
  const results: MatchResult[] = [];
  for (const store of directoryStores) {
    const match = matchStoreToDispensary(store, dispensaries);
    results.push(match);

    // Only apply high-confidence matches if not dry run
    if (!dryRun && match.confidence === 'high' && match.dispensaryId) {
      await applyDirectoryMatch(match.dispensaryId, provider, store);
    }
  }

  // Count results
  const report: DirectoryMatchReport = {
    provider,
    totalDirectoryStores: directoryStores.length,
    highConfidenceMatches: results.filter((r) => r.confidence === 'high').length,
    mediumConfidenceMatches: results.filter((r) => r.confidence === 'medium').length,
    lowConfidenceMatches: results.filter((r) => r.confidence === 'low').length,
    unmatched: results.filter((r) => r.confidence === 'none').length,
    results,
  };

  console.log(`[DirectoryMatcher] ${provider} matching complete:`);
  console.log(`  - High confidence: ${report.highConfidenceMatches}`);
  console.log(`  - Medium confidence: ${report.mediumConfidenceMatches}`);
  console.log(`  - Low confidence: ${report.lowConfidenceMatches}`);
  console.log(`  - Unmatched: ${report.unmatched}`);

  return report;
}

/**
 * Apply a directory match to a dispensary
 */
async function applyDirectoryMatch(
  dispensaryId: number,
  provider: string,
  store: DirectoryStore
): Promise<void> {
  console.log(`[DirectoryMatcher] Applying match: dispensary ${dispensaryId} -> ${store.storeUrl}`);

  await query(
    `
    UPDATE dispensaries SET
      menu_type = $1,
      menu_url = $2,
      platform_dispensary_id = NULL,
      provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
        jsonb_build_object(
          'detected_provider', $1::text,
          'detection_method', 'directory_match'::text,
          'detected_at', NOW(),
          'directory_store_name', $3::text,
          'directory_store_url', $2::text,
          'directory_store_city', $4::text,
          'directory_store_address', $5::text,
          'not_crawlable', true,
          'not_crawlable_reason', $6::text
        ),
      updated_at = NOW()
    WHERE id = $7
    `,
    [
      provider,
      store.storeUrl,
      store.name,
      store.city,
      store.address,
      `${provider} proprietary menu - no crawler available`,
      dispensaryId,
    ]
  );
}

/**
 * Preview matches without applying them
 */
export async function previewDirectoryMatches(
  provider: 'curaleaf' | 'sol'
): Promise<DirectoryMatchReport> {
  return matchDirectoryToDispensaries(provider, true);
}

/**
 * Apply high-confidence matches
 */
export async function applyHighConfidenceMatches(
  provider: 'curaleaf' | 'sol'
): Promise<DirectoryMatchReport> {
  return matchDirectoryToDispensaries(provider, false);
}
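// Illustrative workflow (not in the original file): review before writing.
//   const preview = await previewDirectoryMatches('sol'); // dryRun = true
//   console.table(preview.results.map(r => ({
//     name: r.directoryStore.name,
//     conf: r.confidence,
//     reason: r.matchReason,
//   })));
//   if (preview.highConfidenceMatches > 0) {
//     await applyHighConfidenceMatches('sol'); // writes only 'high' matches
//   }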
|
||||
@@ -1,592 +0,0 @@
|
||||
/**
|
||||
* Dutchie AZ Discovery Service
|
||||
*
|
||||
* Discovers and manages dispensaries from Dutchie for Arizona.
|
||||
*/
|
||||
|
||||
import { query, getClient } from '../db/connection';
|
||||
import { discoverArizonaDispensaries, resolveDispensaryId, resolveDispensaryIdWithDetails, ResolveDispensaryResult } from './graphql-client';
|
||||
import { Dispensary } from '../types';
|
||||
|
||||
/**
|
||||
* Upsert a dispensary record
|
||||
*/
|
||||
async function upsertDispensary(dispensary: Partial<Dispensary>): Promise<number> {
|
||||
const result = await query<{ id: number }>(
|
||||
`
|
||||
INSERT INTO dispensaries (
|
||||
platform, name, slug, city, state, postal_code, address,
|
||||
latitude, longitude, platform_dispensary_id,
|
||||
is_delivery, is_pickup, raw_metadata, updated_at
|
||||
) VALUES (
|
||||
$1, $2, $3, $4, $5, $6, $7,
|
||||
$8, $9, $10,
|
||||
$11, $12, $13, NOW()
|
||||
)
|
||||
ON CONFLICT (platform, slug, city, state) DO UPDATE SET
|
||||
name = EXCLUDED.name,
|
||||
postal_code = EXCLUDED.postal_code,
|
||||
address = EXCLUDED.address,
|
||||
latitude = EXCLUDED.latitude,
|
||||
longitude = EXCLUDED.longitude,
|
||||
platform_dispensary_id = COALESCE(EXCLUDED.platform_dispensary_id, dispensaries.platform_dispensary_id),
|
||||
is_delivery = EXCLUDED.is_delivery,
|
||||
is_pickup = EXCLUDED.is_pickup,
|
||||
raw_metadata = EXCLUDED.raw_metadata,
|
||||
updated_at = NOW()
|
||||
RETURNING id
|
||||
`,
|
||||
[
|
||||
dispensary.platform || 'dutchie',
|
||||
dispensary.name,
|
||||
dispensary.slug,
|
||||
dispensary.city,
|
||||
dispensary.state || 'AZ',
|
||||
dispensary.postalCode,
|
||||
dispensary.address,
|
||||
dispensary.latitude,
|
||||
dispensary.longitude,
|
||||
dispensary.platformDispensaryId,
|
||||
dispensary.isDelivery || false,
|
||||
dispensary.isPickup || true,
|
||||
dispensary.rawMetadata ? JSON.stringify(dispensary.rawMetadata) : null,
|
||||
]
|
||||
);
|
||||
|
||||
return result.rows[0].id;
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalize a raw discovery result to Dispensary
|
||||
*/
|
||||
function normalizeDispensary(raw: any): Partial<Dispensary> {
|
||||
return {
|
||||
platform: 'dutchie',
|
||||
name: raw.name || raw.Name || '',
|
||||
slug: raw.slug || raw.cName || raw.id || '',
|
||||
city: raw.city || raw.address?.city || '',
|
||||
state: 'AZ',
|
||||
postalCode: raw.postalCode || raw.address?.postalCode || raw.address?.zip,
|
||||
address: raw.streetAddress || raw.address?.streetAddress,
|
||||
latitude: raw.latitude || raw.location?.lat,
|
||||
longitude: raw.longitude || raw.location?.lng,
|
||||
platformDispensaryId: raw.dispensaryId || raw.id || null,
|
||||
isDelivery: raw.isDelivery || raw.delivery || false,
|
||||
isPickup: raw.isPickup || raw.pickup || true,
|
||||
rawMetadata: raw,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Import dispensaries from the existing dispensaries table (from AZDHS data)
|
||||
* This creates records in the dutchie_az database for AZ dispensaries
|
||||
*/
|
||||
export async function importFromExistingDispensaries(): Promise<{ imported: number }> {
|
||||
console.log('[Discovery] Importing from existing dispensaries table...');
|
||||
|
||||
// This is a workaround - we'll use the dispensaries we already know about
|
||||
// and try to resolve their Dutchie IDs
|
||||
const knownDispensaries = [
|
||||
{ name: 'Deeply Rooted', slug: 'AZ-Deeply-Rooted', city: 'Phoenix', state: 'AZ' },
|
||||
{ name: 'Curaleaf Gilbert', slug: 'curaleaf-gilbert', city: 'Gilbert', state: 'AZ' },
|
||||
{ name: 'Zen Leaf Prescott', slug: 'AZ-zen-leaf-prescott', city: 'Prescott', state: 'AZ' },
|
||||
// Add more known Dutchie stores here
|
||||
];
|
||||
|
||||
let imported = 0;
|
||||
|
||||
for (const disp of knownDispensaries) {
|
||||
try {
|
||||
const id = await upsertDispensary({
|
||||
platform: 'dutchie',
|
||||
name: disp.name,
|
||||
slug: disp.slug,
|
||||
city: disp.city,
|
||||
state: disp.state,
|
||||
});
|
||||
imported++;
|
||||
console.log(`[Discovery] Imported: ${disp.name} (id=${id})`);
|
||||
} catch (error: any) {
|
||||
console.error(`[Discovery] Failed to import ${disp.name}:`, error.message);
|
||||
}
|
||||
}
|
||||
|
||||
return { imported };
|
||||
}

/**
 * Discover all Arizona Dutchie dispensaries via GraphQL
 */
export async function discoverDispensaries(): Promise<{ discovered: number; errors: string[] }> {
  console.log('[Discovery] Starting Arizona dispensary discovery...');
  const errors: string[] = [];
  let discovered = 0;

  try {
    const rawDispensaries = await discoverArizonaDispensaries();
    console.log(`[Discovery] Found ${rawDispensaries.length} dispensaries from GraphQL`);

    for (const raw of rawDispensaries) {
      try {
        const normalized = normalizeDispensary(raw);
        if (normalized.name && normalized.slug && normalized.city) {
          await upsertDispensary(normalized);
          discovered++;
        }
      } catch (error: any) {
        errors.push(`${raw.name || raw.slug}: ${error.message}`);
      }
    }
  } catch (error: any) {
    errors.push(`Discovery failed: ${error.message}`);
  }

  console.log(`[Discovery] Completed: ${discovered} dispensaries, ${errors.length} errors`);
  return { discovered, errors };
}

/**
 * Check if a string looks like a MongoDB ObjectId (24 hex chars)
 */
export function isObjectId(value: string): boolean {
  return /^[a-f0-9]{24}$/i.test(value);
}

/**
 * Extract cName (slug) or platform_dispensary_id from a Dutchie menu_url
 *
 * Supported formats:
 * - https://dutchie.com/embedded-menu/<cName>        -> returns { type: 'cName', value: '<cName>' }
 * - https://dutchie.com/dispensary/<cName>           -> returns { type: 'cName', value: '<cName>' }
 * - https://dutchie.com/api/v2/embedded-menu/<id>.js -> returns { type: 'platformId', value: '<id>' }
 *
 * For backward compatibility, extractCNameFromMenuUrl still returns just the string value.
 */
export interface MenuUrlExtraction {
  type: 'cName' | 'platformId';
  value: string;
}

export function extractFromMenuUrl(menuUrl: string | null | undefined): MenuUrlExtraction | null {
  if (!menuUrl) return null;

  try {
    const url = new URL(menuUrl);
    const pathname = url.pathname;

    // Match /api/v2/embedded-menu/<id>.js - this contains the platform_dispensary_id directly
    const apiMatch = pathname.match(/^\/api\/v2\/embedded-menu\/([a-f0-9]{24})\.js$/i);
    if (apiMatch) {
      return { type: 'platformId', value: apiMatch[1] };
    }

    // Match /embedded-menu/<cName> or /dispensary/<cName>
    const embeddedMatch = pathname.match(/^\/embedded-menu\/([^/?]+)/);
    if (embeddedMatch) {
      const value = embeddedMatch[1];
      // Check if it's actually an ObjectId (some URLs use the ID directly)
      if (isObjectId(value)) {
        return { type: 'platformId', value };
      }
      return { type: 'cName', value };
    }

    const dispensaryMatch = pathname.match(/^\/dispensary\/([^/?]+)/);
    if (dispensaryMatch) {
      const value = dispensaryMatch[1];
      if (isObjectId(value)) {
        return { type: 'platformId', value };
      }
      return { type: 'cName', value };
    }

    return null;
  } catch {
    return null;
  }
}
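
// Illustrative only (not part of the original module): how the three
// supported menu_url shapes map to extraction results. URLs reuse the
// slug and ObjectId that appear elsewhere in these modules.
//
//   extractFromMenuUrl('https://dutchie.com/embedded-menu/AZ-Deeply-Rooted');
//   // -> { type: 'cName', value: 'AZ-Deeply-Rooted' }
//
//   extractFromMenuUrl('https://dutchie.com/api/v2/embedded-menu/6405ef617056e8014d79101b.js');
//   // -> { type: 'platformId', value: '6405ef617056e8014d79101b' }
//
//   extractFromMenuUrl('not a url');
//   // -> null (the URL constructor throws; the catch swallows it)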

/**
 * Extract cName (slug) from a Dutchie menu_url
 * Backward compatible - use extractFromMenuUrl for full info
 */
export function extractCNameFromMenuUrl(menuUrl: string | null | undefined): string | null {
  const extraction = extractFromMenuUrl(menuUrl);
  return extraction?.value || null;
}

/**
 * Resolve platform dispensary IDs for all dispensaries that don't have one
 * CRITICAL: Uses cName extracted from menu_url, NOT the slug column!
 *
 * Uses the new resolveDispensaryIdWithDetails which:
 * 1. Extracts dispensaryId from window.reactEnv in the embedded menu page (preferred)
 * 2. Falls back to GraphQL if reactEnv extraction fails
 * 3. Returns HTTP status so we can mark 403/404 stores as not_crawlable
 */
export async function resolvePlatformDispensaryIds(): Promise<{ resolved: number; failed: number; skipped: number; notCrawlable: number }> {
  console.log('[Discovery] Resolving platform dispensary IDs...');

  const { rows: dispensaries } = await query<any>(
    `
    SELECT id, name, slug, menu_url, menu_type, platform_dispensary_id, crawl_status
    FROM dispensaries
    WHERE menu_type = 'dutchie'
      AND platform_dispensary_id IS NULL
      AND menu_url IS NOT NULL
      AND (crawl_status IS NULL OR crawl_status != 'not_crawlable')
    ORDER BY id
    `
  );

  let resolved = 0;
  let failed = 0;
  let skipped = 0;
  let notCrawlable = 0;

  for (const dispensary of dispensaries) {
    try {
      // Extract cName from menu_url - this is the CORRECT way to get the Dutchie slug
      const cName = extractCNameFromMenuUrl(dispensary.menu_url);

      if (!cName) {
        console.log(`[Discovery] Skipping ${dispensary.name}: Could not extract cName from menu_url: ${dispensary.menu_url}`);
        skipped++;
        continue;
      }

      console.log(`[Discovery] Resolving ID for: ${dispensary.name} (cName=${cName}, menu_url=${dispensary.menu_url})`);

      // Use the new detailed resolver that extracts from reactEnv first
      const result = await resolveDispensaryIdWithDetails(cName);

      if (result.dispensaryId) {
        // SUCCESS: Store resolved
        await query(
          `
          UPDATE dispensaries
          SET platform_dispensary_id = $1,
              platform_dispensary_id_resolved_at = NOW(),
              crawl_status = 'ready',
              crawl_status_reason = $2,
              crawl_status_updated_at = NOW(),
              last_tested_menu_url = $3,
              last_http_status = $4,
              updated_at = NOW()
          WHERE id = $5
          `,
          [
            result.dispensaryId,
            `Resolved from ${result.source || 'page'}`,
            dispensary.menu_url,
            result.httpStatus,
            dispensary.id,
          ]
        );
        resolved++;
        console.log(`[Discovery] Resolved: ${cName} -> ${result.dispensaryId} (source: ${result.source})`);
      } else if (result.httpStatus === 403 || result.httpStatus === 404) {
        // NOT CRAWLABLE: Store removed or not accessible
        await query(
          `
          UPDATE dispensaries
          SET platform_dispensary_id = NULL,
              crawl_status = 'not_crawlable',
              crawl_status_reason = $1,
              crawl_status_updated_at = NOW(),
              last_tested_menu_url = $2,
              last_http_status = $3,
              updated_at = NOW()
          WHERE id = $4
          `,
          [
            result.error || `HTTP ${result.httpStatus}: Removed from Dutchie`,
            dispensary.menu_url,
            result.httpStatus,
            dispensary.id,
          ]
        );
        notCrawlable++;
        console.log(`[Discovery] Marked not crawlable: ${cName} (HTTP ${result.httpStatus})`);
      } else {
        // FAILED: Could not resolve but page loaded
        await query(
          `
          UPDATE dispensaries
          SET crawl_status = 'not_ready',
              crawl_status_reason = $1,
              crawl_status_updated_at = NOW(),
              last_tested_menu_url = $2,
              last_http_status = $3,
              updated_at = NOW()
          WHERE id = $4
          `,
          [
            result.error || 'Could not extract dispensaryId from page',
            dispensary.menu_url,
            result.httpStatus,
            dispensary.id,
          ]
        );
        failed++;
        console.log(`[Discovery] Could not resolve: ${cName} - ${result.error}`);
      }

      // Delay between requests
      await new Promise((r) => setTimeout(r, 2000));
    } catch (error: any) {
      failed++;
      console.error(`[Discovery] Error resolving ${dispensary.name}:`, error.message);
    }
  }

  console.log(`[Discovery] Completed: ${resolved} resolved, ${failed} failed, ${skipped} skipped, ${notCrawlable} not crawlable`);
  return { resolved, failed, skipped, notCrawlable };
}

// Use shared dispensary columns (handles optional columns like provider_detection_data)
import { DISPENSARY_COLUMNS } from '../db/dispensary-columns';

/**
 * Get all dispensaries
 */
export async function getAllDispensaries(): Promise<Dispensary[]> {
  const { rows } = await query(
    `SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE menu_type = 'dutchie' ORDER BY name`
  );
  return rows.map(mapDbRowToDispensary);
}

/**
 * Map snake_case DB row to camelCase Dispensary object
 * CRITICAL: DB returns snake_case (platform_dispensary_id) but TypeScript expects camelCase (platformDispensaryId)
 * This function is exported for use in other modules that query dispensaries directly.
 *
 * NOTE: The consolidated dispensaries table column mappings:
 * - zip → postalCode
 * - menu_type → menuType (keep platform as 'dutchie')
 * - last_crawl_at → lastCrawledAt
 * - platform_dispensary_id → platformDispensaryId
 */
export function mapDbRowToDispensary(row: any): Dispensary {
  // Extract website from raw_metadata if available (field may not exist in all environments)
  let rawMetadata = undefined;
  if (row.raw_metadata !== undefined) {
    rawMetadata = typeof row.raw_metadata === 'string'
      ? JSON.parse(row.raw_metadata)
      : row.raw_metadata;
  }
  const website = row.website || rawMetadata?.website || undefined;

  return {
    id: row.id,
    platform: row.platform || 'dutchie', // keep platform as-is, default to 'dutchie'
    name: row.name,
    dbaName: row.dbaName || row.dba_name || undefined, // dba_name column is optional
    slug: row.slug,
    city: row.city,
    state: row.state,
    postalCode: row.postalCode || row.zip || row.postal_code,
    latitude: row.latitude ? parseFloat(row.latitude) : undefined,
    longitude: row.longitude ? parseFloat(row.longitude) : undefined,
    address: row.address,
    platformDispensaryId: row.platformDispensaryId || row.platform_dispensary_id, // CRITICAL mapping!
    isDelivery: row.is_delivery,
    isPickup: row.is_pickup,
    rawMetadata: rawMetadata,
    lastCrawledAt: row.lastCrawledAt || row.last_crawl_at, // use last_crawl_at
    productCount: row.product_count,
    createdAt: row.created_at,
    updatedAt: row.updated_at,
    menuType: row.menuType || row.menu_type,
    menuUrl: row.menuUrl || row.menu_url,
    scrapeEnabled: row.scrapeEnabled ?? row.scrape_enabled,
    providerDetectionData: row.provider_detection_data,
    platformDispensaryIdResolvedAt: row.platform_dispensary_id_resolved_at,
    website,
  };
}
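
// Illustrative only: a minimal round-trip through mapDbRowToDispensary
// (this row shape is hypothetical and trimmed to a few columns).
//
//   const row = {
//     id: 1,
//     name: 'Deeply Rooted',
//     platform_dispensary_id: '6405ef617056e8014d79101b',
//     menu_url: 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted',
//   };
//   const d = mapDbRowToDispensary(row);
//   d.platformDispensaryId; // '6405ef617056e8014d79101b' - snake_case mapped to camelCase
//   d.menuUrl;              // 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted'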

/**
 * Get dispensary by ID
 * NOTE: Uses SQL aliases to map snake_case → camelCase directly
 */
export async function getDispensaryById(id: number): Promise<Dispensary | null> {
  const { rows } = await query(
    `
    SELECT
      id,
      name,
      slug,
      city,
      state,
      zip AS "postalCode",
      address,
      latitude,
      longitude,
      menu_type AS "menuType",
      menu_url AS "menuUrl",
      platform_dispensary_id AS "platformDispensaryId",
      website,
      provider_detection_data AS "providerDetectionData",
      created_at,
      updated_at
    FROM dispensaries
    WHERE id = $1
    `,
    [id]
  );
  if (!rows[0]) return null;
  return mapDbRowToDispensary(rows[0]);
}

/**
 * Get dispensaries with platform IDs (ready for crawling)
 */
export async function getDispensariesWithPlatformIds(): Promise<Dispensary[]> {
  const { rows } = await query(
    `
    SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
    WHERE menu_type = 'dutchie' AND platform_dispensary_id IS NOT NULL
    ORDER BY name
    `
  );
  return rows.map(mapDbRowToDispensary);
}

/**
 * Re-resolve a single dispensary's platform ID
 * Clears the existing ID and re-resolves from the menu_url cName
 */
export async function reResolveDispensaryPlatformId(dispensaryId: number): Promise<{
  success: boolean;
  platformId: string | null;
  cName: string | null;
  error?: string;
}> {
  console.log(`[Discovery] Re-resolving platform ID for dispensary ${dispensaryId}...`);

  const dispensary = await getDispensaryById(dispensaryId);
  if (!dispensary) {
    return { success: false, platformId: null, cName: null, error: 'Dispensary not found' };
  }

  const cName = extractCNameFromMenuUrl(dispensary.menuUrl);
  if (!cName) {
    console.log(`[Discovery] Could not extract cName from menu_url: ${dispensary.menuUrl}`);
    return {
      success: false,
      platformId: null,
      cName: null,
      error: `Could not extract cName from menu_url: ${dispensary.menuUrl}`,
    };
  }

  console.log(`[Discovery] Extracted cName: ${cName} from menu_url: ${dispensary.menuUrl}`);

  try {
    const platformId = await resolveDispensaryId(cName);

    if (platformId) {
      await query(
        `
        UPDATE dispensaries
        SET platform_dispensary_id = $1,
            platform_dispensary_id_resolved_at = NOW(),
            updated_at = NOW()
        WHERE id = $2
        `,
        [platformId, dispensaryId]
      );
      console.log(`[Discovery] Resolved: ${cName} -> ${platformId}`);
      return { success: true, platformId, cName };
    } else {
      // Clear the invalid platform ID and mark as not crawlable
      await query(
        `
        UPDATE dispensaries
        SET platform_dispensary_id = NULL,
            provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
              '{"resolution_error": "cName no longer exists on Dutchie", "not_crawlable": true}'::jsonb,
            updated_at = NOW()
        WHERE id = $1
        `,
        [dispensaryId]
      );
      console.log(`[Discovery] Could not resolve: ${cName} - marked as not crawlable`);
      return {
        success: false,
        platformId: null,
        cName,
        error: `cName "${cName}" no longer exists on Dutchie`,
      };
    }
  } catch (error: any) {
    console.error(`[Discovery] Error resolving ${cName}:`, error.message);
    return { success: false, platformId: null, cName, error: error.message };
  }
}

/**
 * Update menu_url for a dispensary and re-resolve platform ID
 */
export async function updateMenuUrlAndResolve(dispensaryId: number, newMenuUrl: string): Promise<{
  success: boolean;
  platformId: string | null;
  cName: string | null;
  error?: string;
}> {
  console.log(`[Discovery] Updating menu_url for dispensary ${dispensaryId} to: ${newMenuUrl}`);

  const cName = extractCNameFromMenuUrl(newMenuUrl);
  if (!cName) {
    return {
      success: false,
      platformId: null,
      cName: null,
      error: `Could not extract cName from new menu_url: ${newMenuUrl}`,
    };
  }

  // Update the menu_url first
  await query(
    `
    UPDATE dispensaries
    SET menu_url = $1,
        menu_type = 'dutchie',
        platform_dispensary_id = NULL,
        updated_at = NOW()
    WHERE id = $2
    `,
    [newMenuUrl, dispensaryId]
  );

  // Now resolve the platform ID with the new cName
  return await reResolveDispensaryPlatformId(dispensaryId);
}

/**
 * Mark a dispensary as not crawlable (when resolution fails permanently)
 */
export async function markDispensaryNotCrawlable(dispensaryId: number, reason: string): Promise<void> {
  await query(
    `
    UPDATE dispensaries
    SET platform_dispensary_id = NULL,
        provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
          jsonb_build_object('not_crawlable', true, 'not_crawlable_reason', $1::text, 'not_crawlable_at', NOW()::text),
        updated_at = NOW()
    WHERE id = $2
    `,
    [reason, dispensaryId]
  );
  console.log(`[Discovery] Marked dispensary ${dispensaryId} as not crawlable: ${reason}`);
}

/**
 * Get the cName for a dispensary (extracted from menu_url)
 */
export function getDispensaryCName(dispensary: Dispensary): string | null {
  return extractCNameFromMenuUrl(dispensary.menuUrl);
}
@@ -1,491 +0,0 @@

/**
 * Error Taxonomy Module
 *
 * Standardized error codes and classification for crawler reliability.
 * All crawl results must use these codes for consistent error handling.
 *
 * Phase 1: Crawler Reliability & Stabilization
 */

// ============================================================
// ERROR CODES
// ============================================================

/**
 * Standardized error codes for all crawl operations.
 * These codes are stored in the database for analytics and debugging.
 */
export const CrawlErrorCode = {
  // Success states
  SUCCESS: 'SUCCESS',

  // Rate limiting
  RATE_LIMITED: 'RATE_LIMITED', // 429 responses

  // Proxy issues
  BLOCKED_PROXY: 'BLOCKED_PROXY', // 407 or proxy-related blocks
  PROXY_TIMEOUT: 'PROXY_TIMEOUT', // Proxy connection timeout

  // Content issues
  HTML_CHANGED: 'HTML_CHANGED', // Page structure changed
  NO_PRODUCTS: 'NO_PRODUCTS', // Empty response (valid but no data)
  PARSE_ERROR: 'PARSE_ERROR', // Failed to parse response

  // Network issues
  TIMEOUT: 'TIMEOUT', // Request timeout
  NETWORK_ERROR: 'NETWORK_ERROR', // Connection failed
  DNS_ERROR: 'DNS_ERROR', // DNS resolution failed

  // Authentication
  AUTH_FAILED: 'AUTH_FAILED', // Authentication/session issues

  // Server errors
  SERVER_ERROR: 'SERVER_ERROR', // 5xx responses
  SERVICE_UNAVAILABLE: 'SERVICE_UNAVAILABLE', // 503

  // Configuration issues
  INVALID_CONFIG: 'INVALID_CONFIG', // Bad store configuration
  MISSING_PLATFORM_ID: 'MISSING_PLATFORM_ID', // No platform_dispensary_id

  // Unknown
  UNKNOWN_ERROR: 'UNKNOWN_ERROR', // Catch-all for unclassified errors
} as const;

export type CrawlErrorCodeType = typeof CrawlErrorCode[keyof typeof CrawlErrorCode];

// ============================================================
// ERROR CLASSIFICATION
// ============================================================

/**
 * Error metadata for each error code
 */
interface ErrorMetadata {
  code: CrawlErrorCodeType;
  retryable: boolean;
  rotateProxy: boolean;
  rotateUserAgent: boolean;
  backoffMultiplier: number;
  severity: 'low' | 'medium' | 'high' | 'critical';
  description: string;
}

/**
 * Metadata for each error code - defines retry behavior
 */
export const ERROR_METADATA: Record<CrawlErrorCodeType, ErrorMetadata> = {
  [CrawlErrorCode.SUCCESS]: {
    code: CrawlErrorCode.SUCCESS,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 0,
    severity: 'low',
    description: 'Crawl completed successfully',
  },

  [CrawlErrorCode.RATE_LIMITED]: {
    code: CrawlErrorCode.RATE_LIMITED,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: true,
    backoffMultiplier: 2.0,
    severity: 'medium',
    description: 'Rate limited by target (429)',
  },

  [CrawlErrorCode.BLOCKED_PROXY]: {
    code: CrawlErrorCode.BLOCKED_PROXY,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: true,
    backoffMultiplier: 1.5,
    severity: 'medium',
    description: 'Proxy blocked or rejected (407)',
  },

  [CrawlErrorCode.PROXY_TIMEOUT]: {
    code: CrawlErrorCode.PROXY_TIMEOUT,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'low',
    description: 'Proxy connection timed out',
  },

  [CrawlErrorCode.HTML_CHANGED]: {
    code: CrawlErrorCode.HTML_CHANGED,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'high',
    description: 'Page structure changed - needs selector update',
  },

  [CrawlErrorCode.NO_PRODUCTS]: {
    code: CrawlErrorCode.NO_PRODUCTS,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'low',
    description: 'No products returned (may be temporary)',
  },

  [CrawlErrorCode.PARSE_ERROR]: {
    code: CrawlErrorCode.PARSE_ERROR,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'medium',
    description: 'Failed to parse response data',
  },

  [CrawlErrorCode.TIMEOUT]: {
    code: CrawlErrorCode.TIMEOUT,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1.5,
    severity: 'medium',
    description: 'Request timed out',
  },

  [CrawlErrorCode.NETWORK_ERROR]: {
    code: CrawlErrorCode.NETWORK_ERROR,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'medium',
    description: 'Network connection failed',
  },

  [CrawlErrorCode.DNS_ERROR]: {
    code: CrawlErrorCode.DNS_ERROR,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'medium',
    description: 'DNS resolution failed',
  },

  [CrawlErrorCode.AUTH_FAILED]: {
    code: CrawlErrorCode.AUTH_FAILED,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: true,
    backoffMultiplier: 2.0,
    severity: 'high',
    description: 'Authentication or session failed',
  },

  [CrawlErrorCode.SERVER_ERROR]: {
    code: CrawlErrorCode.SERVER_ERROR,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.5,
    severity: 'medium',
    description: 'Server error (5xx)',
  },

  [CrawlErrorCode.SERVICE_UNAVAILABLE]: {
    code: CrawlErrorCode.SERVICE_UNAVAILABLE,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 2.0,
    severity: 'high',
    description: 'Service temporarily unavailable (503)',
  },

  [CrawlErrorCode.INVALID_CONFIG]: {
    code: CrawlErrorCode.INVALID_CONFIG,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 0,
    severity: 'critical',
    description: 'Invalid store configuration',
  },

  [CrawlErrorCode.MISSING_PLATFORM_ID]: {
    code: CrawlErrorCode.MISSING_PLATFORM_ID,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 0,
    severity: 'critical',
    description: 'Missing platform_dispensary_id',
  },

  [CrawlErrorCode.UNKNOWN_ERROR]: {
    code: CrawlErrorCode.UNKNOWN_ERROR,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'high',
    description: 'Unknown/unclassified error',
  },
};

// ============================================================
// ERROR CLASSIFICATION FUNCTIONS
// ============================================================

/**
 * Classify an error into a standardized error code.
 *
 * @param error - The error to classify (Error object, string, or HTTP status)
 * @param httpStatus - Optional HTTP status code
 * @returns Standardized error code
 */
export function classifyError(
  error: Error | string | null,
  httpStatus?: number
): CrawlErrorCodeType {
  // Check HTTP status first
  if (httpStatus) {
    if (httpStatus === 429) return CrawlErrorCode.RATE_LIMITED;
    if (httpStatus === 407) return CrawlErrorCode.BLOCKED_PROXY;
    if (httpStatus === 401 || httpStatus === 403) return CrawlErrorCode.AUTH_FAILED;
    if (httpStatus === 503) return CrawlErrorCode.SERVICE_UNAVAILABLE;
    if (httpStatus >= 500) return CrawlErrorCode.SERVER_ERROR;
  }

  if (!error) return CrawlErrorCode.UNKNOWN_ERROR;

  const message = typeof error === 'string' ? error.toLowerCase() : error.message.toLowerCase();

  // Rate limiting patterns
  if (message.includes('rate limit') || message.includes('too many requests') || message.includes('429')) {
    return CrawlErrorCode.RATE_LIMITED;
  }

  // Proxy patterns
  if (message.includes('proxy') && (message.includes('block') || message.includes('reject') || message.includes('407'))) {
    return CrawlErrorCode.BLOCKED_PROXY;
  }

  // Timeout patterns
  if (message.includes('timeout') || message.includes('timed out') || message.includes('etimedout')) {
    if (message.includes('proxy')) {
      return CrawlErrorCode.PROXY_TIMEOUT;
    }
    return CrawlErrorCode.TIMEOUT;
  }

  // Network patterns
  if (message.includes('econnrefused') || message.includes('econnreset') || message.includes('network')) {
    return CrawlErrorCode.NETWORK_ERROR;
  }

  // DNS patterns
  if (message.includes('enotfound') || message.includes('dns') || message.includes('getaddrinfo')) {
    return CrawlErrorCode.DNS_ERROR;
  }

  // Auth patterns
  if (message.includes('auth') || message.includes('unauthorized') || message.includes('forbidden') || message.includes('401') || message.includes('403')) {
    return CrawlErrorCode.AUTH_FAILED;
  }

  // HTML change patterns
  if (message.includes('selector') || message.includes('element not found') || message.includes('structure changed')) {
    return CrawlErrorCode.HTML_CHANGED;
  }

  // Parse patterns
  if (message.includes('parse') || message.includes('json') || message.includes('syntax')) {
    return CrawlErrorCode.PARSE_ERROR;
  }

  // No products patterns
  if (message.includes('no products') || message.includes('empty') || message.includes('0 products')) {
    return CrawlErrorCode.NO_PRODUCTS;
  }

  // Server error patterns
  if (message.includes('500') || message.includes('502') || message.includes('503') || message.includes('504')) {
    return CrawlErrorCode.SERVER_ERROR;
  }

  // Config patterns
  if (message.includes('config') || message.includes('invalid') || message.includes('missing')) {
    if (message.includes('platform') || message.includes('dispensary_id')) {
      return CrawlErrorCode.MISSING_PLATFORM_ID;
    }
    return CrawlErrorCode.INVALID_CONFIG;
  }

  return CrawlErrorCode.UNKNOWN_ERROR;
}
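
// Illustrative only (hypothetical inputs): how classification falls through
// the checks above.
//
//   classifyError(null, 429);                                  // 'RATE_LIMITED' (status wins)
//   classifyError(new Error('proxy connection timed out'));    // 'PROXY_TIMEOUT'
//   classifyError('getaddrinfo ENOTFOUND api-gw.dutchie.com'); // 'DNS_ERROR'
//   classifyError(new Error('something odd'));                 // 'UNKNOWN_ERROR'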

/**
 * Get metadata for an error code
 */
export function getErrorMetadata(code: CrawlErrorCodeType): ErrorMetadata {
  return ERROR_METADATA[code] || ERROR_METADATA[CrawlErrorCode.UNKNOWN_ERROR];
}

/**
 * Check if an error is retryable
 */
export function isRetryable(code: CrawlErrorCodeType): boolean {
  return getErrorMetadata(code).retryable;
}

/**
 * Check if proxy should be rotated for this error
 */
export function shouldRotateProxy(code: CrawlErrorCodeType): boolean {
  return getErrorMetadata(code).rotateProxy;
}

/**
 * Check if user agent should be rotated for this error
 */
export function shouldRotateUserAgent(code: CrawlErrorCodeType): boolean {
  return getErrorMetadata(code).rotateUserAgent;
}

/**
 * Get backoff multiplier for this error
 */
export function getBackoffMultiplier(code: CrawlErrorCodeType): number {
  return getErrorMetadata(code).backoffMultiplier;
}
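
// Illustrative only: a minimal retry loop a caller might build on these
// helpers. attemptCrawl and BASE_DELAY_MS are hypothetical, not part of
// this module.
//
//   async function crawlWithRetry(dispensaryId: number, maxAttempts = 3) {
//     for (let attempt = 1; attempt <= maxAttempts; attempt++) {
//       const result = await attemptCrawl(dispensaryId, attempt);
//       if (result.success || !isRetryable(result.errorCode)) return result;
//       if (shouldRotateProxy(result.errorCode)) { /* pick a new proxy */ }
//       if (shouldRotateUserAgent(result.errorCode)) { /* pick a new UA */ }
//       const delay = BASE_DELAY_MS * getBackoffMultiplier(result.errorCode) * attempt;
//       await new Promise((r) => setTimeout(r, delay));
//     }
//   }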

// ============================================================
// CRAWL RESULT TYPE
// ============================================================

/**
 * Standardized crawl result with error taxonomy
 */
export interface CrawlResult {
  success: boolean;
  dispensaryId: number;

  // Error info
  errorCode: CrawlErrorCodeType;
  errorMessage?: string;
  httpStatus?: number;

  // Timing
  startedAt: Date;
  finishedAt: Date;
  durationMs: number;

  // Context
  attemptNumber: number;
  proxyUsed?: string;
  userAgentUsed?: string;

  // Metrics (on success)
  productsFound?: number;
  productsUpserted?: number;
  snapshotsCreated?: number;
  imagesDownloaded?: number;

  // Metadata
  metadata?: Record<string, any>;
}

/**
 * Create a success result
 */
export function createSuccessResult(
  dispensaryId: number,
  startedAt: Date,
  metrics: {
    productsFound: number;
    productsUpserted: number;
    snapshotsCreated: number;
    imagesDownloaded?: number;
  },
  context?: {
    attemptNumber?: number;
    proxyUsed?: string;
    userAgentUsed?: string;
  }
): CrawlResult {
  const finishedAt = new Date();
  return {
    success: true,
    dispensaryId,
    errorCode: CrawlErrorCode.SUCCESS,
    startedAt,
    finishedAt,
    durationMs: finishedAt.getTime() - startedAt.getTime(),
    attemptNumber: context?.attemptNumber || 1,
    proxyUsed: context?.proxyUsed,
    userAgentUsed: context?.userAgentUsed,
    ...metrics,
  };
}

/**
 * Create a failure result
 */
export function createFailureResult(
  dispensaryId: number,
  startedAt: Date,
  error: Error | string,
  httpStatus?: number,
  context?: {
    attemptNumber?: number;
    proxyUsed?: string;
    userAgentUsed?: string;
  }
): CrawlResult {
  const finishedAt = new Date();
  const errorCode = classifyError(error, httpStatus);
  const errorMessage = typeof error === 'string' ? error : error.message;

  return {
    success: false,
    dispensaryId,
    errorCode,
    errorMessage,
    httpStatus,
    startedAt,
    finishedAt,
    durationMs: finishedAt.getTime() - startedAt.getTime(),
    attemptNumber: context?.attemptNumber || 1,
    proxyUsed: context?.proxyUsed,
    userAgentUsed: context?.userAgentUsed,
  };
}
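
// Illustrative only (hypothetical values): building results at a crawl
// call site.
//
//   const startedAt = new Date();
//   const ok = createSuccessResult(42, startedAt, { productsFound: 120, productsUpserted: 118, snapshotsCreated: 120 });
//   const bad = createFailureResult(42, startedAt, new Error('Request timed out'), undefined, { attemptNumber: 2 });
//   bad.errorCode; // 'TIMEOUT' via classifyError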

// ============================================================
// LOGGING HELPERS
// ============================================================

/**
 * Format error code for logging
 */
export function formatErrorForLog(result: CrawlResult): string {
  const metadata = getErrorMetadata(result.errorCode);
  const retryInfo = metadata.retryable ? '(retryable)' : '(non-retryable)';
  const proxyInfo = result.proxyUsed ? ` via ${result.proxyUsed}` : '';

  if (result.success) {
    return `[${result.errorCode}] Crawl successful: ${result.productsFound} products${proxyInfo}`;
  }

  return `[${result.errorCode}] ${result.errorMessage}${proxyInfo} ${retryInfo}`;
}

/**
 * Get user-friendly error description
 */
export function getErrorDescription(code: CrawlErrorCodeType): string {
  return getErrorMetadata(code).description;
}
@@ -1,712 +0,0 @@

/**
 * Dutchie GraphQL Client
 *
 * Uses Puppeteer to establish a session (get CF cookies), then makes
 * SERVER-SIDE fetch calls to api-gw.dutchie.com with those cookies.
 *
 * DUTCHIE FETCH RULES:
 * 1. Server-side only - use axios (never browser fetch with CORS)
 * 2. Use dispensaryFilter.cNameOrID, NOT dispensaryId directly
 * 3. Headers must mimic Chrome: User-Agent, Origin, Referer
 * 4. If 403, extract CF cookies from Puppeteer session and include them
 * 5. Log status codes, error bodies, and product counts
 */

import axios, { AxiosError } from 'axios';
import puppeteer from 'puppeteer-extra';
import type { Browser, Page, Protocol } from 'puppeteer';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import {
  DutchieRawProduct,
  DutchiePOSChild,
  CrawlMode,
} from '../types';
import { dutchieConfig, GRAPHQL_HASHES, ARIZONA_CENTERPOINTS } from '../config/dutchie';

puppeteer.use(StealthPlugin());

// Re-export for backward compatibility
export { GRAPHQL_HASHES, ARIZONA_CENTERPOINTS };

// ============================================================
// SESSION MANAGEMENT - Get CF cookies via Puppeteer
// ============================================================

interface SessionCredentials {
  cookies: string; // Cookie header string
  userAgent: string;
  browser: Browser;
  page: Page; // Keep page reference for extracting dispensaryId
  dispensaryId?: string; // Extracted from window.reactEnv if available
  httpStatus?: number; // HTTP status code from navigation
}

/**
 * Create a session by navigating to the embedded menu page
 * and extracting CF clearance cookies for server-side requests.
 * Also extracts dispensaryId from window.reactEnv if available.
 */
async function createSession(cName: string): Promise<SessionCredentials> {
  const browser = await puppeteer.launch({
    headless: 'new',
    args: dutchieConfig.browserArgs,
  });

  const page = await browser.newPage();
  const userAgent = dutchieConfig.userAgent;

  await page.setUserAgent(userAgent);
  await page.setViewport({ width: 1920, height: 1080 });
  await page.evaluateOnNewDocument(() => {
    Object.defineProperty(navigator, 'webdriver', { get: () => false });
    (window as any).chrome = { runtime: {} };
  });

  // Navigate to the embedded menu page for this dispensary
  const embeddedMenuUrl = `https://dutchie.com/embedded-menu/${cName}`;
  console.log(`[GraphQL Client] Loading ${embeddedMenuUrl} to get CF cookies...`);

  let httpStatus: number | undefined;
  let dispensaryId: string | undefined;

  try {
    const response = await page.goto(embeddedMenuUrl, {
      waitUntil: 'networkidle2',
      timeout: dutchieConfig.navigationTimeout,
    });
    httpStatus = response?.status();
    await new Promise((r) => setTimeout(r, dutchieConfig.pageLoadDelay));

    // Try to extract dispensaryId from window.reactEnv
    try {
      dispensaryId = await page.evaluate(() => {
        return (window as any).reactEnv?.dispensaryId || null;
      });
      if (dispensaryId) {
        console.log(`[GraphQL Client] Extracted dispensaryId from reactEnv: ${dispensaryId}`);
      }
    } catch (evalError: any) {
      console.log(`[GraphQL Client] Could not extract dispensaryId from reactEnv: ${evalError.message}`);
    }
  } catch (error: any) {
    console.warn(`[GraphQL Client] Navigation warning: ${error.message}`);
    // Continue anyway - we may have gotten cookies
  }

  // Extract cookies
  const cookies = await page.cookies();
  const cookieString = cookies.map((c: Protocol.Network.Cookie) => `${c.name}=${c.value}`).join('; ');

  console.log(`[GraphQL Client] Got ${cookies.length} cookies, HTTP status: ${httpStatus}`);
  if (cookies.length > 0) {
    console.log(`[GraphQL Client] Cookie names: ${cookies.map(c => c.name).join(', ')}`);
  }

  return { cookies: cookieString, userAgent, browser, page, dispensaryId, httpStatus };
}

/**
 * Close session (browser)
 */
async function closeSession(session: SessionCredentials): Promise<void> {
  await session.browser.close();
}

// ============================================================
// SERVER-SIDE GRAPHQL FETCH USING AXIOS
// ============================================================

/**
 * Build headers that mimic a real browser request
 */
function buildHeaders(session: SessionCredentials, cName: string): Record<string, string> {
  const embeddedMenuUrl = `https://dutchie.com/embedded-menu/${cName}`;

  return {
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'en-US,en;q=0.9',
    'accept-encoding': 'gzip, deflate, br',
    'content-type': 'application/json',
    'origin': 'https://dutchie.com',
    'referer': embeddedMenuUrl,
    'user-agent': session.userAgent,
    'apollographql-client-name': 'Marketplace (production)',
    'sec-ch-ua': '"Chromium";v="120", "Google Chrome";v="120", "Not-A.Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-site',
    ...(session.cookies ? { 'cookie': session.cookies } : {}),
  };
}

/**
 * Execute GraphQL query server-side using axios
 * Uses cookies from the browser session to bypass CF
 */
async function executeGraphQL(
  session: SessionCredentials,
  operationName: string,
  variables: any,
  hash: string,
  cName: string
): Promise<any> {
  const endpoint = dutchieConfig.graphqlEndpoint;
  const headers = buildHeaders(session, cName);

  // Build request body for POST
  const body = {
    operationName,
    variables,
    extensions: {
      persistedQuery: { version: 1, sha256Hash: hash },
    },
  };

  console.log(`[GraphQL Client] POST: ${operationName} -> ${endpoint}`);
  console.log(`[GraphQL Client] Variables: ${JSON.stringify(variables).slice(0, 300)}...`);

  try {
    const response = await axios.post(endpoint, body, {
      headers,
      timeout: 30000,
      validateStatus: () => true, // Don't throw on non-2xx
    });

    // Log response details
    console.log(`[GraphQL Client] Response status: ${response.status}`);

    if (response.status !== 200) {
      const bodyPreview = typeof response.data === 'string'
        ? response.data.slice(0, 500)
        : JSON.stringify(response.data).slice(0, 500);
      console.error(`[GraphQL Client] HTTP ${response.status}: ${bodyPreview}`);
      throw new Error(`HTTP ${response.status}`);
    }

    // Check for GraphQL errors
    if (response.data?.errors && response.data.errors.length > 0) {
      console.error(`[GraphQL Client] GraphQL errors: ${JSON.stringify(response.data.errors[0])}`);
    }

    return response.data;
  } catch (error: any) {
    if (axios.isAxiosError(error)) {
      const axiosError = error as AxiosError;
      console.error(`[GraphQL Client] Axios error: ${axiosError.message}`);
      if (axiosError.response) {
        console.error(`[GraphQL Client] Response status: ${axiosError.response.status}`);
        console.error(`[GraphQL Client] Response data: ${JSON.stringify(axiosError.response.data).slice(0, 500)}`);
      }
      if (axiosError.code) {
        console.error(`[GraphQL Client] Error code: ${axiosError.code}`);
      }
    } else {
      console.error(`[GraphQL Client] Error: ${error.message}`);
    }
    throw error;
  }
}

// ============================================================
// DISPENSARY ID RESOLUTION
// ============================================================

/**
 * Resolution result with HTTP status for error handling
 */
export interface ResolveDispensaryResult {
  dispensaryId: string | null;
  httpStatus?: number;
  error?: string;
  source?: 'reactEnv' | 'graphql';
}

/**
 * Resolve a dispensary slug to its internal platform ID.
 *
 * STRATEGY:
 * 1. Navigate to embedded menu page and extract window.reactEnv.dispensaryId (preferred)
 * 2. Fall back to GraphQL GetAddressBasedDispensaryData query if reactEnv fails
 *
 * Returns the dispensaryId (platform_dispensary_id) or null if not found.
 * A 403/404 page is surfaced via resolveDispensaryIdWithDetails (null id plus
 * httpStatus) so the caller can mark the store as not_crawlable.
 */
export async function resolveDispensaryId(slug: string): Promise<string | null> {
  const result = await resolveDispensaryIdWithDetails(slug);
  return result.dispensaryId;
}

/**
 * Resolve a dispensary slug with full details (HTTP status, source, error).
 * Use this when you need to know WHY resolution failed.
 */
export async function resolveDispensaryIdWithDetails(slug: string): Promise<ResolveDispensaryResult> {
  console.log(`[GraphQL Client] Resolving dispensary ID for slug: ${slug}`);

  const session = await createSession(slug);

  try {
    // Check HTTP status first - if 403/404, the store is not crawlable
    if (session.httpStatus && (session.httpStatus === 403 || session.httpStatus === 404)) {
      console.log(`[GraphQL Client] Page returned HTTP ${session.httpStatus} for ${slug} - not crawlable`);
      return {
        dispensaryId: null,
        httpStatus: session.httpStatus,
        error: `HTTP ${session.httpStatus}: Store removed or not accessible`,
        source: 'reactEnv',
      };
    }

    // PREFERRED: Use dispensaryId from window.reactEnv (extracted during createSession)
    if (session.dispensaryId) {
      console.log(`[GraphQL Client] Resolved ${slug} -> ${session.dispensaryId} (from reactEnv)`);
      return {
        dispensaryId: session.dispensaryId,
        httpStatus: session.httpStatus,
        source: 'reactEnv',
      };
    }

    // FALLBACK: Try GraphQL query
    console.log(`[GraphQL Client] reactEnv.dispensaryId not found for ${slug}, trying GraphQL...`);

    const variables = {
      dispensaryFilter: {
        cNameOrID: slug,
      },
    };

    const result = await executeGraphQL(
      session,
      'GetAddressBasedDispensaryData',
      variables,
      GRAPHQL_HASHES.GetAddressBasedDispensaryData,
      slug
    );

    const dispensaryId = result?.data?.dispensaryBySlug?.id ||
      result?.data?.dispensary?.id ||
      result?.data?.getAddressBasedDispensaryData?.dispensary?.id;

    if (dispensaryId) {
      console.log(`[GraphQL Client] Resolved ${slug} -> ${dispensaryId} (from GraphQL)`);
      return {
        dispensaryId,
        httpStatus: session.httpStatus,
        source: 'graphql',
      };
    }

    console.log(`[GraphQL Client] Could not resolve ${slug}, GraphQL response:`, JSON.stringify(result).slice(0, 300));
    return {
      dispensaryId: null,
      httpStatus: session.httpStatus,
      error: 'Could not extract dispensaryId from reactEnv or GraphQL',
    };
  } finally {
    await closeSession(session);
  }
}

/**
 * Discover Arizona dispensaries via geo-based query
 */
export async function discoverArizonaDispensaries(): Promise<any[]> {
  console.log('[GraphQL Client] Discovering Arizona dispensaries...');

  // Use Phoenix as the default center
  const session = await createSession('AZ-Deeply-Rooted');
  const allDispensaries: any[] = [];
  const seenIds = new Set<string>();

  try {
    for (const centerpoint of ARIZONA_CENTERPOINTS) {
      console.log(`[GraphQL Client] Scanning ${centerpoint.name}...`);

      const variables = {
        dispensariesFilter: {
          latitude: centerpoint.lat,
          longitude: centerpoint.lng,
          distance: 100,
          state: 'AZ',
        },
      };

      try {
        const result = await executeGraphQL(
          session,
          'ConsumerDispensaries',
          variables,
          GRAPHQL_HASHES.ConsumerDispensaries,
          'AZ-Deeply-Rooted'
        );

        const dispensaries = result?.data?.consumerDispensaries || [];

        for (const d of dispensaries) {
          const id = d.id || d.dispensaryId;
          if (id && !seenIds.has(id)) {
            seenIds.add(id);
            allDispensaries.push(d);
          }
        }

        console.log(`[GraphQL Client] Found ${dispensaries.length} in ${centerpoint.name} (${allDispensaries.length} total unique)`);
      } catch (error: any) {
        console.warn(`[GraphQL Client] Error scanning ${centerpoint.name}: ${error.message}`);
      }

      // Delay between requests
      await new Promise((r) => setTimeout(r, 1000));
    }
  } finally {
    await closeSession(session);
  }

  console.log(`[GraphQL Client] Discovery complete: ${allDispensaries.length} dispensaries`);
  return allDispensaries;
}

// ============================================================
// PRODUCT FILTERING VARIABLES
// ============================================================

/**
 * Build filter variables for the FilteredProducts query
 *
 * CRITICAL: Uses dispensaryId directly (the MongoDB ObjectId, e.g. "6405ef617056e8014d79101b"),
 * NOT dispensaryFilter.cNameOrID!
 *
 * The actual browser request structure is:
 * {
 *   "productsFilter": {
 *     "dispensaryId": "6405ef617056e8014d79101b",
 *     "pricingType": "rec",
 *     "Status": "Active",   // Mode A only
 *     "strainTypes": [],
 *     "subcategories": [],
 *     "types": [],
 *     "useCache": true,
 *     ...
 *   },
 *   "page": 0,
 *   "perPage": 100
 * }
 *
 * Mode A = UI parity (Status: "Active")
 * Mode B = MAX COVERAGE (no Status filter)
 */
function buildFilterVariables(
  platformDispensaryId: string,
  pricingType: 'rec' | 'med',
  crawlMode: CrawlMode,
  page: number,
  perPage: number
): any {
  const isModeA = crawlMode === 'mode_a';

  // Per CLAUDE.md Rule #11: Use simple productsFilter with dispensaryId directly
  // Do NOT use dispensaryFilter.cNameOrID - that's outdated
  const productsFilter: Record<string, any> = {
    dispensaryId: platformDispensaryId,
    pricingType: pricingType,
  };

  // Mode A: Only active products (UI parity) - Status: "Active"
  // Mode B: MAX COVERAGE (OOS/inactive) - omit Status or set to null
  if (isModeA) {
    productsFilter.Status = 'Active';
  }
  // Mode B: No Status filter = returns all products including OOS/inactive

  return {
    productsFilter,
    page,
    perPage,
  };
}
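
// Illustrative only: what buildFilterVariables returns for each mode
// (ObjectId reused from the docblock above).
//
//   buildFilterVariables('6405ef617056e8014d79101b', 'rec', 'mode_a', 0, 100);
//   // -> { productsFilter: { dispensaryId: '6405ef617056e8014d79101b', pricingType: 'rec', Status: 'Active' },
//   //      page: 0, perPage: 100 }
//
//   buildFilterVariables('6405ef617056e8014d79101b', 'rec', 'mode_b', 0, 100);
//   // -> same shape without the Status key, so OOS/inactive products come back too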

// ============================================================
// PRODUCT FETCHING WITH PAGINATION
// ============================================================

/**
 * Fetch products for a single mode with pagination
 */
async function fetchProductsForMode(
  session: SessionCredentials,
  platformDispensaryId: string,
  cName: string,
  pricingType: 'rec' | 'med',
  crawlMode: CrawlMode
): Promise<{ products: DutchieRawProduct[]; totalCount: number; crawlMode: CrawlMode }> {
  const perPage = dutchieConfig.perPage;
  const maxPages = dutchieConfig.maxPages;
  const maxRetries = dutchieConfig.maxRetries;
  const pageDelayMs = dutchieConfig.pageDelayMs;

  const allProducts: DutchieRawProduct[] = [];
  let pageNum = 0;
  let totalCount = 0;
  let consecutiveEmptyPages = 0;

  console.log(`[GraphQL Client] Fetching products for ${cName} (platformId: ${platformDispensaryId}, ${pricingType}, ${crawlMode})...`);

  while (pageNum < maxPages) {
    const variables = buildFilterVariables(platformDispensaryId, pricingType, crawlMode, pageNum, perPage);

    let result: any = null;
    let lastError: Error | null = null;

    // Retry logic
    for (let attempt = 0; attempt <= maxRetries; attempt++) {
      try {
        result = await executeGraphQL(
          session,
          'FilteredProducts',
          variables,
          GRAPHQL_HASHES.FilteredProducts,
          cName
        );
        lastError = null;
        break;
      } catch (error: any) {
        lastError = error;
        console.warn(`[GraphQL Client] Page ${pageNum} attempt ${attempt + 1} failed: ${error.message}`);
        if (attempt < maxRetries) {
          await new Promise((r) => setTimeout(r, 1000 * (attempt + 1)));
        }
      }
    }

    if (lastError) {
      console.error(`[GraphQL Client] Page ${pageNum} failed after ${maxRetries + 1} attempts`);
      break;
    }

    if (result?.errors) {
      console.error('[GraphQL Client] GraphQL errors:', JSON.stringify(result.errors));
      break;
    }

    // Log response shape on first page
    if (pageNum === 0) {
      console.log(`[GraphQL Client] Response keys: ${Object.keys(result || {}).join(', ')}`);
      if (result?.data) {
        console.log(`[GraphQL Client] data keys: ${Object.keys(result.data || {}).join(', ')}`);
      }
      if (!result?.data?.filteredProducts) {
        console.log(`[GraphQL Client] WARNING: No filteredProducts in response!`);
        console.log(`[GraphQL Client] Full response: ${JSON.stringify(result).slice(0, 1000)}`);
      }
    }

    const products = result?.data?.filteredProducts?.products || [];
    const queryInfo = result?.data?.filteredProducts?.queryInfo;

    if (queryInfo?.totalCount) {
      totalCount = queryInfo.totalCount;
    }

    console.log(
      `[GraphQL Client] Page ${pageNum}: ${products.length} products (total so far: ${allProducts.length + products.length}/${totalCount})`
    );

    if (products.length === 0) {
      consecutiveEmptyPages++;
      if (consecutiveEmptyPages >= 2) {
        console.log('[GraphQL Client] Multiple empty pages, stopping pagination');
        break;
      }
    } else {
      consecutiveEmptyPages = 0;
      allProducts.push(...products);
    }

    // Stop if incomplete page (last page)
    if (products.length < perPage) {
      console.log(`[GraphQL Client] Incomplete page (${products.length} < ${perPage}), stopping`);
      break;
    }

    pageNum++;
    await new Promise((r) => setTimeout(r, pageDelayMs));
  }

  console.log(`[GraphQL Client] Fetched ${allProducts.length} total products (${crawlMode})`);
  return { products: allProducts, totalCount: totalCount || allProducts.length, crawlMode };
}

// ============================================================
// LEGACY SINGLE-MODE INTERFACE
// ============================================================

/**
 * Fetch all products for a dispensary (single mode)
 */
export async function fetchAllProducts(
  platformDispensaryId: string,
  pricingType: 'rec' | 'med' = 'rec',
  options: {
    perPage?: number;
    maxPages?: number;
    menuUrl?: string;
    crawlMode?: CrawlMode;
    cName?: string;
  } = {}
): Promise<{ products: DutchieRawProduct[]; totalCount: number; crawlMode: CrawlMode }> {
  const { crawlMode = 'mode_a' } = options;

  // cName is now REQUIRED - no default fallback to avoid using wrong store's session
  const cName = options.cName;
  if (!cName) {
    throw new Error('[GraphQL Client] cName is required for fetchAllProducts - cannot use another store\'s session');
  }

  const session = await createSession(cName);

  try {
    return await fetchProductsForMode(session, platformDispensaryId, cName, pricingType, crawlMode);
  } finally {
    await closeSession(session);
  }
}

// ============================================================
// MODE A+B MERGING
// ============================================================

/**
 * Merge POSMetaData.children arrays from Mode A and Mode B products
 */
function mergeProductOptions(
  modeAProduct: DutchieRawProduct,
  modeBProduct: DutchieRawProduct
): DutchiePOSChild[] {
  const modeAChildren = modeAProduct.POSMetaData?.children || [];
  const modeBChildren = modeBProduct.POSMetaData?.children || [];

  const getOptionKey = (child: DutchiePOSChild): string => {
    return child.canonicalID || child.canonicalSKU || child.canonicalPackageId || child.option || '';
  };

  const mergedMap = new Map<string, DutchiePOSChild>();

  for (const child of modeAChildren) {
    const key = getOptionKey(child);
    if (key) mergedMap.set(key, child);
  }

  for (const child of modeBChildren) {
    const key = getOptionKey(child);
    if (key && !mergedMap.has(key)) {
      mergedMap.set(key, child);
    }
  }

  return Array.from(mergedMap.values());
}

/**
 * Merge a Mode A product with a Mode B product
 */
function mergeProducts(
  modeAProduct: DutchieRawProduct,
  modeBProduct: DutchieRawProduct | undefined
): DutchieRawProduct {
  if (!modeBProduct) {
    return modeAProduct;
  }

  const mergedChildren = mergeProductOptions(modeAProduct, modeBProduct);

  return {
    ...modeAProduct,
    POSMetaData: {
      ...modeAProduct.POSMetaData,
      children: mergedChildren,
    },
  };
}
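
// Illustrative only (hypothetical children): Mode A wins on conflicting
// option keys, Mode B fills in options Mode A never saw.
//
//   const a = { _id: 'p1', POSMetaData: { children: [{ option: '1g', quantity: 5 }] } } as any;
//   const b = { _id: 'p1', POSMetaData: { children: [{ option: '1g', quantity: 0 }, { option: '3.5g', quantity: 2 }] } } as any;
//   mergeProducts(a, b).POSMetaData.children;
//   // -> [{ option: '1g', quantity: 5 }, { option: '3.5g', quantity: 2 }]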

// ============================================================
// MAIN EXPORT: TWO-MODE CRAWL
// ============================================================

/**
 * Fetch products using BOTH crawl modes with a SINGLE session
 * Runs Mode A then Mode B, merges results
 */
export async function fetchAllProductsBothModes(
  platformDispensaryId: string,
  pricingType: 'rec' | 'med' = 'rec',
  options: {
    perPage?: number;
    maxPages?: number;
    menuUrl?: string;
    cName?: string;
  } = {}
): Promise<{
  modeA: { products: DutchieRawProduct[]; totalCount: number };
  modeB: { products: DutchieRawProduct[]; totalCount: number };
  merged: { products: DutchieRawProduct[]; totalCount: number };
}> {
  // cName is now REQUIRED - no default fallback to avoid using wrong store's session
  const cName = options.cName;
  if (!cName) {
    throw new Error('[GraphQL Client] cName is required for fetchAllProductsBothModes - cannot use another store\'s session');
  }

  console.log(`[GraphQL Client] Running two-mode crawl for ${cName} (${pricingType})...`);
  console.log(`[GraphQL Client] Platform ID: ${platformDispensaryId}, cName: ${cName}`);

  const session = await createSession(cName);

  try {
    // Mode A (UI parity)
    const modeAResult = await fetchProductsForMode(session, platformDispensaryId, cName, pricingType, 'mode_a');

    // Delay between modes
    await new Promise((r) => setTimeout(r, dutchieConfig.modeDelayMs));

    // Mode B (MAX COVERAGE)
    const modeBResult = await fetchProductsForMode(session, platformDispensaryId, cName, pricingType, 'mode_b');

    // Merge results
    const modeBMap = new Map<string, DutchieRawProduct>();
    for (const product of modeBResult.products) {
      modeBMap.set(product._id, product);
    }

    const productMap = new Map<string, DutchieRawProduct>();

    // Add Mode A products, merging with Mode B if exists
    for (const product of modeAResult.products) {
      const modeBProduct = modeBMap.get(product._id);
      const mergedProduct = mergeProducts(product, modeBProduct);
      productMap.set(product._id, mergedProduct);
    }

    // Add Mode B products not in Mode A
    for (const product of modeBResult.products) {
      if (!productMap.has(product._id)) {
        productMap.set(product._id, product);
      }
    }

    const mergedProducts = Array.from(productMap.values());

    console.log(`[GraphQL Client] Merged: ${mergedProducts.length} unique products`);
    console.log(`[GraphQL Client] Mode A: ${modeAResult.products.length}, Mode B: ${modeBResult.products.length}`);

    return {
      modeA: { products: modeAResult.products, totalCount: modeAResult.totalCount },
      modeB: { products: modeBResult.products, totalCount: modeBResult.totalCount },
      merged: { products: mergedProducts, totalCount: mergedProducts.length },
    };
  } finally {
    await closeSession(session);
  }
}
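
// Illustrative only: a typical call site, reusing identifiers from this
// module's docblocks.
//
//   const { merged } = await fetchAllProductsBothModes('6405ef617056e8014d79101b', 'rec', { cName: 'AZ-Deeply-Rooted' });
//   console.log(`${merged.totalCount} unique products after the A+B merge`);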
@@ -1,665 +0,0 @@

/**
 * Job Queue Service
 *
 * DB-backed job queue with claiming/locking for distributed workers.
 * Ensures only one worker processes a given store at a time.
 */

import { query, getClient } from '../db/connection';
import { v4 as uuidv4 } from 'uuid';
import * as os from 'os';
import { DEFAULT_CONFIG } from './store-validator';

// Minimum gap between crawls for the same dispensary (in minutes)
const MIN_CRAWL_GAP_MINUTES = DEFAULT_CONFIG.minCrawlGapMinutes; // 2 minutes

// ============================================================
// TYPES
// ============================================================

export interface QueuedJob {
  id: number;
  jobType: string;
  dispensaryId: number | null;
  status: 'pending' | 'running' | 'completed' | 'failed';
  priority: number;
  retryCount: number;
  maxRetries: number;
  claimedBy: string | null;
  claimedAt: Date | null;
  workerHostname: string | null;
  startedAt: Date | null;
  completedAt: Date | null;
  errorMessage: string | null;
  productsFound: number;
  productsUpserted: number;
  snapshotsCreated: number;
  currentPage: number;
  totalPages: number | null;
  lastHeartbeatAt: Date | null;
  metadata: Record<string, any> | null;
  createdAt: Date;
}

export interface EnqueueJobOptions {
  jobType: string;
  dispensaryId?: number;
  priority?: number;
  metadata?: Record<string, any>;
  maxRetries?: number;
}

export interface ClaimJobOptions {
  workerId: string;
  jobTypes?: string[];
  lockDurationMinutes?: number;
}

export interface JobProgress {
  productsFound?: number;
  productsUpserted?: number;
  snapshotsCreated?: number;
  currentPage?: number;
  totalPages?: number;
}

// ============================================================
// WORKER IDENTITY
// ============================================================

let _workerId: string | null = null;

/**
 * Get or create a unique worker ID for this process
 * In Kubernetes, uses POD_NAME for clarity; otherwise generates a unique ID
 */
export function getWorkerId(): string {
  if (!_workerId) {
    // Prefer POD_NAME in K8s (set via fieldRef)
    const podName = process.env.POD_NAME;
    if (podName) {
      _workerId = podName;
    } else {
      const hostname = os.hostname();
      const pid = process.pid;
      const uuid = uuidv4().slice(0, 8);
      _workerId = `${hostname}-${pid}-${uuid}`;
    }
  }
  return _workerId;
}

/**
 * Get hostname for worker tracking
 * In Kubernetes, uses POD_NAME; otherwise uses os.hostname()
 */
export function getWorkerHostname(): string {
  return process.env.POD_NAME || os.hostname();
}
|
||||
|
||||
// ============================================================
|
||||
// JOB ENQUEUEING
|
||||
// ============================================================
|
||||
|
||||
export interface EnqueueResult {
|
||||
jobId: number | null;
|
||||
skipped: boolean;
|
||||
reason?: 'already_queued' | 'too_soon' | 'error';
|
||||
message?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Enqueue a new job for processing
|
||||
* Returns null if a pending/running job already exists for this dispensary
|
||||
* or if a job was completed/failed within the minimum gap period
|
||||
*/
|
||||
export async function enqueueJob(options: EnqueueJobOptions): Promise<number | null> {
|
||||
const result = await enqueueJobWithReason(options);
|
||||
return result.jobId;
|
||||
}
|
||||
|
||||
/**
|
||||
* Enqueue a new job with detailed result info
|
||||
* Enforces:
|
||||
* 1. No duplicate pending/running jobs for same dispensary
|
||||
* 2. Minimum 2-minute gap between crawls for same dispensary
|
||||
*/
|
||||
export async function enqueueJobWithReason(options: EnqueueJobOptions): Promise<EnqueueResult> {
|
||||
const {
|
||||
jobType,
|
||||
dispensaryId,
|
||||
priority = 0,
|
||||
metadata,
|
||||
maxRetries = 3,
|
||||
} = options;
|
||||
|
||||
// Check if there's already a pending/running job for this dispensary
|
||||
if (dispensaryId) {
|
||||
const { rows: existing } = await query<any>(
|
||||
`SELECT id FROM dispensary_crawl_jobs
|
||||
WHERE dispensary_id = $1 AND status IN ('pending', 'running')
|
||||
LIMIT 1`,
|
||||
[dispensaryId]
|
||||
);
|
||||
|
||||
if (existing.length > 0) {
|
||||
console.log(`[JobQueue] Skipping enqueue - job already exists for dispensary ${dispensaryId}`);
|
||||
return {
|
||||
jobId: null,
|
||||
skipped: true,
|
||||
reason: 'already_queued',
|
||||
message: `Job already pending/running for dispensary ${dispensaryId}`,
|
||||
};
|
||||
}
|
||||
|
||||
// Check minimum gap since last job (2 minutes)
|
||||
const { rows: recent } = await query<any>(
|
||||
`SELECT id, created_at, status
|
||||
FROM dispensary_crawl_jobs
|
||||
WHERE dispensary_id = $1
|
||||
ORDER BY created_at DESC
|
||||
LIMIT 1`,
|
||||
[dispensaryId]
|
||||
);
|
||||
|
||||
if (recent.length > 0) {
|
||||
const lastJobTime = new Date(recent[0].created_at);
|
||||
const minGapMs = MIN_CRAWL_GAP_MINUTES * 60 * 1000;
|
||||
const timeSinceLastJob = Date.now() - lastJobTime.getTime();
|
||||
|
||||
if (timeSinceLastJob < minGapMs) {
|
||||
const waitSeconds = Math.ceil((minGapMs - timeSinceLastJob) / 1000);
|
||||
console.log(`[JobQueue] Skipping enqueue - minimum ${MIN_CRAWL_GAP_MINUTES}min gap not met for dispensary ${dispensaryId}. Wait ${waitSeconds}s`);
|
||||
return {
|
||||
jobId: null,
|
||||
skipped: true,
|
||||
reason: 'too_soon',
|
||||
message: `Minimum ${MIN_CRAWL_GAP_MINUTES}-minute gap required. Try again in ${waitSeconds} seconds.`,
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
const { rows } = await query<any>(
|
||||
`INSERT INTO dispensary_crawl_jobs (job_type, dispensary_id, status, priority, max_retries, metadata, created_at)
|
||||
VALUES ($1, $2, 'pending', $3, $4, $5, NOW())
|
||||
RETURNING id`,
|
||||
[jobType, dispensaryId || null, priority, maxRetries, metadata ? JSON.stringify(metadata) : null]
|
||||
);
|
||||
|
||||
const jobId = rows[0].id;
|
||||
console.log(`[JobQueue] Enqueued job ${jobId} (type=${jobType}, dispensary=${dispensaryId})`);
|
||||
return { jobId, skipped: false };
|
||||
} catch (error: any) {
|
||||
// Handle database trigger rejection for minimum gap
|
||||
if (error.message?.includes('Minimum') && error.message?.includes('gap')) {
|
||||
console.log(`[JobQueue] DB rejected - minimum gap not met for dispensary ${dispensaryId}`);
|
||||
return {
|
||||
jobId: null,
|
||||
skipped: true,
|
||||
reason: 'too_soon',
|
||||
message: error.message,
|
||||
};
|
||||
}
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
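
// Usage sketch: enqueue a crawl for one dispensary and branch on the skip
// reason. The job type is the one used by the worker below; the dispensary ID
// comes from the caller.
async function exampleEnqueue(dispensaryId: number): Promise<void> {
  const result = await enqueueJobWithReason({
    jobType: 'dutchie_product_crawl',
    dispensaryId,
    priority: 5,
  });
  if (result.skipped) {
    console.log(`Skipped (${result.reason}): ${result.message}`);
  } else {
    console.log(`Enqueued job ${result.jobId}`);
  }
}
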
export interface BulkEnqueueResult {
  enqueued: number;
  skipped: number;
  skippedReasons: {
    alreadyQueued: number;
    tooSoon: number;
  };
}

/**
 * Bulk enqueue jobs for multiple dispensaries
 * Skips dispensaries that already have pending/running jobs
 * or have jobs within the minimum gap period
 */
export async function bulkEnqueueJobs(
  jobType: string,
  dispensaryIds: number[],
  options: { priority?: number; metadata?: Record<string, any> } = {}
): Promise<BulkEnqueueResult> {
  const { priority = 0, metadata } = options;

  // Get dispensaries that already have pending/running jobs
  const { rows: existing } = await query<any>(
    `SELECT DISTINCT dispensary_id FROM dispensary_crawl_jobs
     WHERE dispensary_id = ANY($1) AND status IN ('pending', 'running')`,
    [dispensaryIds]
  );
  const existingSet = new Set(existing.map((r: any) => r.dispensary_id));

  // Get dispensaries that have recent jobs within minimum gap
  const { rows: recent } = await query<any>(
    `SELECT DISTINCT dispensary_id FROM dispensary_crawl_jobs
     WHERE dispensary_id = ANY($1)
       AND created_at > NOW() - ($2 || ' minutes')::INTERVAL
       AND dispensary_id NOT IN (
         SELECT dispensary_id FROM dispensary_crawl_jobs
         WHERE dispensary_id = ANY($1) AND status IN ('pending', 'running')
       )`,
    [dispensaryIds, MIN_CRAWL_GAP_MINUTES]
  );
  const recentSet = new Set(recent.map((r: any) => r.dispensary_id));

  // Filter out dispensaries with existing or recent jobs
  const toEnqueue = dispensaryIds.filter(id => !existingSet.has(id) && !recentSet.has(id));

  if (toEnqueue.length === 0) {
    return {
      enqueued: 0,
      skipped: dispensaryIds.length,
      skippedReasons: {
        alreadyQueued: existingSet.size,
        tooSoon: recentSet.size,
      },
    };
  }

  // Bulk insert - each row needs 4 params: job_type, dispensary_id, priority, metadata
  const metadataJson = metadata ? JSON.stringify(metadata) : null;
  const values = toEnqueue.map((_, i) => {
    const offset = i * 4;
    return `($${offset + 1}, $${offset + 2}, 'pending', $${offset + 3}, 3, $${offset + 4}, NOW())`;
  }).join(', ');

  const params: any[] = [];
  toEnqueue.forEach(dispensaryId => {
    params.push(jobType, dispensaryId, priority, metadataJson);
  });

  await query(
    `INSERT INTO dispensary_crawl_jobs (job_type, dispensary_id, status, priority, max_retries, metadata, created_at)
     VALUES ${values}`,
    params
  );

  console.log(`[JobQueue] Bulk enqueued ${toEnqueue.length} jobs, skipped ${existingSet.size} (queued) + ${recentSet.size} (recent)`);
  return {
    enqueued: toEnqueue.length,
    skipped: existingSet.size + recentSet.size,
    skippedReasons: {
      alreadyQueued: existingSet.size,
      tooSoon: recentSet.size,
    },
  };
}

// ============================================================
// JOB CLAIMING (with locking)
// ============================================================

/**
 * Claim the next available job from the queue
 * Uses SELECT FOR UPDATE SKIP LOCKED to prevent double-claims
 */
export async function claimNextJob(options: ClaimJobOptions): Promise<QueuedJob | null> {
  const { workerId, jobTypes, lockDurationMinutes = 30 } = options;
  const hostname = getWorkerHostname();

  const client = await getClient();

  try {
    await client.query('BEGIN');

    // Build job type filter
    let typeFilter = '';
    const params: any[] = [workerId, hostname, lockDurationMinutes];
    let paramIndex = 4;

    if (jobTypes && jobTypes.length > 0) {
      typeFilter = `AND job_type = ANY($${paramIndex})`;
      params.push(jobTypes);
      paramIndex++;
    }

    // Claim the next pending job using FOR UPDATE SKIP LOCKED
    // This atomically selects and locks a row, skipping any already locked by other workers
    const { rows } = await client.query(
      `UPDATE dispensary_crawl_jobs
       SET
         status = 'running',
         claimed_by = $1,
         claimed_at = NOW(),
         worker_id = $1,
         worker_hostname = $2,
         started_at = NOW(),
         locked_until = NOW() + ($3 || ' minutes')::INTERVAL,
         last_heartbeat_at = NOW(),
         updated_at = NOW()
       WHERE id = (
         SELECT id FROM dispensary_crawl_jobs
         WHERE status = 'pending'
         ${typeFilter}
         ORDER BY priority DESC, created_at ASC
         FOR UPDATE SKIP LOCKED
         LIMIT 1
       )
       RETURNING *`,
      params
    );

    await client.query('COMMIT');

    if (rows.length === 0) {
      return null;
    }

    const job = mapDbRowToJob(rows[0]);
    console.log(`[JobQueue] Worker ${workerId} claimed job ${job.id} (type=${job.jobType}, dispensary=${job.dispensaryId})`);
    return job;
  } catch (error) {
    await client.query('ROLLBACK');
    throw error;
  } finally {
    client.release();
  }
}
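
// Usage sketch: a minimal worker loop built on claimNextJob. It assumes a
// hypothetical processClaimedJob(job) standing in for the real crawl work;
// the actual worker service (later in this diff) adds heartbeats and stale
// recovery on top of this pattern.
async function exampleClaimLoop(
  processClaimedJob: (job: QueuedJob) => Promise<void>
): Promise<void> {
  const workerId = getWorkerId();
  const job = await claimNextJob({ workerId, jobTypes: ['dutchie_product_crawl'] });
  if (!job) return; // nothing pending; poll again later
  try {
    await processClaimedJob(job);
    await completeJob(job.id, {});
  } catch (err: any) {
    await failJob(job.id, err.message); // re-queues until max_retries is reached
  }
}
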
// ============================================================
// JOB PROGRESS & COMPLETION
// ============================================================

/**
 * Update job progress (for live monitoring)
 */
export async function updateJobProgress(jobId: number, progress: JobProgress): Promise<void> {
  const updates: string[] = ['last_heartbeat_at = NOW()', 'updated_at = NOW()'];
  const params: any[] = [];
  let paramIndex = 1;

  if (progress.productsFound !== undefined) {
    updates.push(`products_found = $${paramIndex++}`);
    params.push(progress.productsFound);
  }
  if (progress.productsUpserted !== undefined) {
    updates.push(`products_upserted = $${paramIndex++}`);
    params.push(progress.productsUpserted);
  }
  if (progress.snapshotsCreated !== undefined) {
    updates.push(`snapshots_created = $${paramIndex++}`);
    params.push(progress.snapshotsCreated);
  }
  if (progress.currentPage !== undefined) {
    updates.push(`current_page = $${paramIndex++}`);
    params.push(progress.currentPage);
  }
  if (progress.totalPages !== undefined) {
    updates.push(`total_pages = $${paramIndex++}`);
    params.push(progress.totalPages);
  }

  params.push(jobId);

  await query(
    `UPDATE dispensary_crawl_jobs SET ${updates.join(', ')} WHERE id = $${paramIndex}`,
    params
  );
}

/**
 * Send heartbeat to keep job alive (prevents timeout)
 */
export async function heartbeat(jobId: number): Promise<void> {
  await query(
    `UPDATE dispensary_crawl_jobs
     SET last_heartbeat_at = NOW(), locked_until = NOW() + INTERVAL '30 minutes'
     WHERE id = $1 AND status = 'running'`,
    [jobId]
  );
}

/**
 * Mark job as completed
 *
 * Stores visibility tracking stats (visibilityLostCount, visibilityRestoredCount)
 * in the metadata JSONB column for dashboard analytics.
 */
export async function completeJob(
  jobId: number,
  result: {
    productsFound?: number;
    productsUpserted?: number;
    snapshotsCreated?: number;
    visibilityLostCount?: number;
    visibilityRestoredCount?: number;
  }
): Promise<void> {
  // Build metadata with visibility stats if provided
  const metadata: Record<string, any> = {};
  if (result.visibilityLostCount !== undefined) {
    metadata.visibilityLostCount = result.visibilityLostCount;
  }
  if (result.visibilityRestoredCount !== undefined) {
    metadata.visibilityRestoredCount = result.visibilityRestoredCount;
  }
  if (result.snapshotsCreated !== undefined) {
    metadata.snapshotsCreated = result.snapshotsCreated;
  }

  // NOTE: column name aligned with products_upserted used everywhere else in this file
  await query(
    `UPDATE dispensary_crawl_jobs
     SET
       status = 'completed',
       completed_at = NOW(),
       products_found = COALESCE($2, products_found),
       products_upserted = COALESCE($3, products_upserted),
       metadata = COALESCE(metadata, '{}'::jsonb) || $4::jsonb,
       updated_at = NOW()
     WHERE id = $1`,
    [
      jobId,
      result.productsFound,
      result.productsUpserted,
      JSON.stringify(metadata),
    ]
  );
  console.log(`[JobQueue] Job ${jobId} completed`);
}

/**
 * Mark job as failed
 */
export async function failJob(jobId: number, errorMessage: string): Promise<boolean> {
  // Check if we should retry
  const { rows } = await query<any>(
    `SELECT retry_count, max_retries FROM dispensary_crawl_jobs WHERE id = $1`,
    [jobId]
  );

  if (rows.length === 0) return false;

  const { retry_count, max_retries } = rows[0];

  if (retry_count < max_retries) {
    // Re-queue for retry
    await query(
      `UPDATE dispensary_crawl_jobs
       SET
         status = 'pending',
         retry_count = retry_count + 1,
         claimed_by = NULL,
         claimed_at = NULL,
         worker_id = NULL,
         worker_hostname = NULL,
         started_at = NULL,
         locked_until = NULL,
         last_heartbeat_at = NULL,
         error_message = $2,
         updated_at = NOW()
       WHERE id = $1`,
      [jobId, errorMessage]
    );
    console.log(`[JobQueue] Job ${jobId} failed, re-queued for retry (${retry_count + 1}/${max_retries})`);
    return true; // Will retry
  } else {
    // Mark as failed permanently
    await query(
      `UPDATE dispensary_crawl_jobs
       SET
         status = 'failed',
         completed_at = NOW(),
         error_message = $2,
         updated_at = NOW()
       WHERE id = $1`,
      [jobId, errorMessage]
    );
    console.log(`[JobQueue] Job ${jobId} failed permanently after ${retry_count} retries`);
    return false; // No more retries
  }
}

// ============================================================
// QUEUE MONITORING
// ============================================================

/**
 * Get queue statistics
 */
export async function getQueueStats(): Promise<{
  pending: number;
  running: number;
  completed1h: number;
  failed1h: number;
  activeWorkers: number;
  avgDurationSeconds: number | null;
}> {
  const { rows } = await query<any>(`SELECT * FROM v_queue_stats`);
  const stats = rows[0] || {};

  return {
    pending: parseInt(stats.pending_jobs || '0', 10),
    running: parseInt(stats.running_jobs || '0', 10),
    completed1h: parseInt(stats.completed_1h || '0', 10),
    failed1h: parseInt(stats.failed_1h || '0', 10),
    activeWorkers: parseInt(stats.active_workers || '0', 10),
    avgDurationSeconds: stats.avg_duration_seconds ? parseFloat(stats.avg_duration_seconds) : null,
  };
}

/**
 * Get active workers
 */
export async function getActiveWorkers(): Promise<Array<{
  workerId: string;
  hostname: string | null;
  currentJobs: number;
  totalProductsFound: number;
  totalProductsUpserted: number;
  totalSnapshots: number;
  firstClaimedAt: Date;
  lastHeartbeat: Date | null;
}>> {
  const { rows } = await query<any>(`SELECT * FROM v_active_workers`);

  return rows.map((row: any) => ({
    workerId: row.worker_id,
    hostname: row.worker_hostname,
    currentJobs: parseInt(row.current_jobs || '0', 10),
    totalProductsFound: parseInt(row.total_products_found || '0', 10),
    totalProductsUpserted: parseInt(row.total_products_upserted || '0', 10),
    totalSnapshots: parseInt(row.total_snapshots || '0', 10),
    firstClaimedAt: new Date(row.first_claimed_at),
    lastHeartbeat: row.last_heartbeat ? new Date(row.last_heartbeat) : null,
  }));
}

/**
 * Get running jobs with worker info
 */
export async function getRunningJobs(): Promise<QueuedJob[]> {
  const { rows } = await query<any>(
    `SELECT cj.*, d.name as dispensary_name, d.city
     FROM dispensary_crawl_jobs cj
     LEFT JOIN dispensaries d ON cj.dispensary_id = d.id
     WHERE cj.status = 'running'
     ORDER BY cj.started_at DESC`
  );

  return rows.map(mapDbRowToJob);
}

/**
 * Recover stale jobs (workers that died without completing)
 */
export async function recoverStaleJobs(staleMinutes: number = 15): Promise<number> {
  const { rowCount } = await query(
    `UPDATE dispensary_crawl_jobs
     SET
       status = 'pending',
       claimed_by = NULL,
       claimed_at = NULL,
       worker_id = NULL,
       worker_hostname = NULL,
       started_at = NULL,
       locked_until = NULL,
       error_message = 'Recovered from stale worker',
       retry_count = retry_count + 1,
       updated_at = NOW()
     WHERE status = 'running'
       AND last_heartbeat_at < NOW() - ($1 || ' minutes')::INTERVAL
       AND retry_count < max_retries`,
    [staleMinutes]
  );

  if (rowCount && rowCount > 0) {
    console.log(`[JobQueue] Recovered ${rowCount} stale jobs`);
  }
  return rowCount || 0;
}

/**
 * Clean up old completed/failed jobs
 */
export async function cleanupOldJobs(olderThanDays: number = 7): Promise<number> {
  const { rowCount } = await query(
    `DELETE FROM dispensary_crawl_jobs
     WHERE status IN ('completed', 'failed')
       AND completed_at < NOW() - ($1 || ' days')::INTERVAL`,
    [olderThanDays]
  );

  if (rowCount && rowCount > 0) {
    console.log(`[JobQueue] Cleaned up ${rowCount} old jobs`);
  }
  return rowCount || 0;
}
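
// Usage sketch: periodic queue maintenance. The interval is illustrative; any
// worker can run this, since both operations are idempotent.
function startQueueMaintenance(): NodeJS.Timeout {
  return setInterval(async () => {
    try {
      await recoverStaleJobs(15); // re-queue jobs whose worker stopped heartbeating
      await cleanupOldJobs(7);    // drop completed/failed jobs older than 7 days
    } catch (err) {
      console.error('[JobQueue] Maintenance error:', err);
    }
  }, 5 * 60 * 1000);
}
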
// ============================================================
// HELPERS
// ============================================================

function mapDbRowToJob(row: any): QueuedJob {
  return {
    id: row.id,
    jobType: row.job_type,
    dispensaryId: row.dispensary_id,
    status: row.status,
    priority: row.priority || 0,
    retryCount: row.retry_count || 0,
    maxRetries: row.max_retries || 3,
    claimedBy: row.claimed_by,
    claimedAt: row.claimed_at ? new Date(row.claimed_at) : null,
    workerHostname: row.worker_hostname,
    startedAt: row.started_at ? new Date(row.started_at) : null,
    completedAt: row.completed_at ? new Date(row.completed_at) : null,
    errorMessage: row.error_message,
    productsFound: row.products_found || 0,
    productsUpserted: row.products_upserted || 0,
    snapshotsCreated: row.snapshots_created || 0,
    currentPage: row.current_page || 0,
    totalPages: row.total_pages,
    lastHeartbeatAt: row.last_heartbeat_at ? new Date(row.last_heartbeat_at) : null,
    metadata: row.metadata,
    createdAt: new Date(row.created_at),
    // Add extra fields from join if present
    ...(row.dispensary_name && { dispensaryName: row.dispensary_name }),
    ...(row.city && { city: row.city }),
  };
}
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -1,435 +0,0 @@
/**
 * Unified Retry Manager
 *
 * Handles retry logic with exponential backoff, jitter, and
 * intelligent error-based decisions (rotate proxy, rotate UA, etc.)
 *
 * Phase 1: Crawler Reliability & Stabilization
 */

import {
  CrawlErrorCodeType,
  CrawlErrorCode,
  classifyError,
  getErrorMetadata,
  isRetryable,
  shouldRotateProxy,
  shouldRotateUserAgent,
  getBackoffMultiplier,
} from './error-taxonomy';
import { DEFAULT_CONFIG } from './store-validator';

// ============================================================
// RETRY CONFIGURATION
// ============================================================

export interface RetryConfig {
  maxRetries: number;
  baseBackoffMs: number;
  maxBackoffMs: number;
  backoffMultiplier: number;
  jitterFactor: number; // 0.0 - 1.0 (percentage of backoff to randomize)
}

export const DEFAULT_RETRY_CONFIG: RetryConfig = {
  maxRetries: DEFAULT_CONFIG.maxRetries,
  baseBackoffMs: DEFAULT_CONFIG.baseBackoffMs,
  maxBackoffMs: DEFAULT_CONFIG.maxBackoffMs,
  backoffMultiplier: DEFAULT_CONFIG.backoffMultiplier,
  jitterFactor: 0.25, // +/- 25% jitter
};
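
// Worked example (a sketch; the values come from DEFAULT_CONFIG in
// store-validator: base 1000ms, multiplier 2.0, cap 60000ms). Pre-jitter waits
// follow base * multiplier^(attempt-1): 1000ms, 2000ms, 4000ms, ... capped at
// 60s. jitterFactor 0.25 then randomizes each wait by +/- 25%, so the second
// retry sleeps somewhere in 1500-2500ms.
function exampleBackoffSchedule(cfg: RetryConfig = DEFAULT_RETRY_CONFIG): number[] {
  return Array.from({ length: cfg.maxRetries }, (_, i) =>
    Math.min(cfg.baseBackoffMs * Math.pow(cfg.backoffMultiplier, i), cfg.maxBackoffMs)
  );
}
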
// ============================================================
// RETRY CONTEXT
// ============================================================

/**
 * Context for tracking retry state across attempts
 */
export interface RetryContext {
  attemptNumber: number;
  maxAttempts: number;
  lastErrorCode: CrawlErrorCodeType | null;
  lastHttpStatus: number | null;
  totalBackoffMs: number;
  proxyRotated: boolean;
  userAgentRotated: boolean;
  startedAt: Date;
}

/**
 * Decision about what to do after an error
 */
export interface RetryDecision {
  shouldRetry: boolean;
  reason: string;
  backoffMs: number;
  rotateProxy: boolean;
  rotateUserAgent: boolean;
  errorCode: CrawlErrorCodeType;
  attemptNumber: number;
}

// ============================================================
// RETRY MANAGER CLASS
// ============================================================

export class RetryManager {
  private config: RetryConfig;
  private context: RetryContext;

  constructor(config: Partial<RetryConfig> = {}) {
    this.config = { ...DEFAULT_RETRY_CONFIG, ...config };
    this.context = this.createInitialContext();
  }

  /**
   * Create initial retry context
   */
  private createInitialContext(): RetryContext {
    return {
      attemptNumber: 0,
      maxAttempts: this.config.maxRetries + 1, // +1 for initial attempt
      lastErrorCode: null,
      lastHttpStatus: null,
      totalBackoffMs: 0,
      proxyRotated: false,
      userAgentRotated: false,
      startedAt: new Date(),
    };
  }

  /**
   * Reset retry state for a new operation
   */
  reset(): void {
    this.context = this.createInitialContext();
  }

  /**
   * Get the number of the next attempt (1-based); call before recordAttempt()
   */
  getAttemptNumber(): number {
    return this.context.attemptNumber + 1;
  }

  /**
   * Check if we should attempt (call before each attempt)
   */
  shouldAttempt(): boolean {
    return this.context.attemptNumber < this.context.maxAttempts;
  }

  /**
   * Record an attempt (call at start of each attempt)
   */
  recordAttempt(): void {
    this.context.attemptNumber++;
  }

  /**
   * Evaluate an error and decide what to do
   */
  evaluateError(
    error: Error | string | null,
    httpStatus?: number
  ): RetryDecision {
    const errorCode = classifyError(error, httpStatus);
    const metadata = getErrorMetadata(errorCode);
    const attemptNumber = this.context.attemptNumber;

    // Update context
    this.context.lastErrorCode = errorCode;
    this.context.lastHttpStatus = httpStatus || null;

    // Check if error is retryable
    if (!isRetryable(errorCode)) {
      return {
        shouldRetry: false,
        reason: `Error ${errorCode} is not retryable: ${metadata.description}`,
        backoffMs: 0,
        rotateProxy: false,
        rotateUserAgent: false,
        errorCode,
        attemptNumber,
      };
    }

    // Check if we've exhausted retries
    if (!this.shouldAttempt()) {
      return {
        shouldRetry: false,
        reason: `Max retries (${this.config.maxRetries}) exhausted`,
        backoffMs: 0,
        rotateProxy: false,
        rotateUserAgent: false,
        errorCode,
        attemptNumber,
      };
    }

    // Calculate backoff with exponential increase and jitter
    const baseBackoff = this.calculateBackoff(attemptNumber, errorCode);
    const backoffWithJitter = this.addJitter(baseBackoff);

    // Track total backoff
    this.context.totalBackoffMs += backoffWithJitter;

    // Determine rotation needs
    const rotateProxy = shouldRotateProxy(errorCode);
    const rotateUserAgent = shouldRotateUserAgent(errorCode);

    if (rotateProxy) this.context.proxyRotated = true;
    if (rotateUserAgent) this.context.userAgentRotated = true;

    const rotationInfo = [];
    if (rotateProxy) rotationInfo.push('rotate proxy');
    if (rotateUserAgent) rotationInfo.push('rotate UA');
    const rotationStr = rotationInfo.length > 0 ? ` (${rotationInfo.join(', ')})` : '';

    return {
      shouldRetry: true,
      reason: `Retrying after ${errorCode}${rotationStr}, backoff ${backoffWithJitter}ms`,
      backoffMs: backoffWithJitter,
      rotateProxy,
      rotateUserAgent,
      errorCode,
      attemptNumber,
    };
  }

  /**
   * Calculate exponential backoff for an attempt
   */
  private calculateBackoff(attemptNumber: number, errorCode: CrawlErrorCodeType): number {
    // Base exponential: baseBackoff * multiplier^(attempt-1)
    const exponential = this.config.baseBackoffMs *
      Math.pow(this.config.backoffMultiplier, attemptNumber - 1);

    // Apply error-specific multiplier
    const errorMultiplier = getBackoffMultiplier(errorCode);
    const adjusted = exponential * errorMultiplier;

    // Cap at max backoff
    return Math.min(adjusted, this.config.maxBackoffMs);
  }

  /**
   * Add jitter to backoff to prevent thundering herd
   */
  private addJitter(backoffMs: number): number {
    const jitterRange = backoffMs * this.config.jitterFactor;
    // Random between -jitterRange and +jitterRange
    const jitter = (Math.random() * 2 - 1) * jitterRange;
    return Math.max(0, Math.round(backoffMs + jitter));
  }

  /**
   * Get retry context summary
   */
  getSummary(): RetryContextSummary {
    const elapsedMs = Date.now() - this.context.startedAt.getTime();
    return {
      attemptsMade: this.context.attemptNumber,
      maxAttempts: this.context.maxAttempts,
      lastErrorCode: this.context.lastErrorCode,
      lastHttpStatus: this.context.lastHttpStatus,
      totalBackoffMs: this.context.totalBackoffMs,
      totalElapsedMs: elapsedMs,
      proxyWasRotated: this.context.proxyRotated,
      userAgentWasRotated: this.context.userAgentRotated,
    };
  }
}

export interface RetryContextSummary {
  attemptsMade: number;
  maxAttempts: number;
  lastErrorCode: CrawlErrorCodeType | null;
  lastHttpStatus: number | null;
  totalBackoffMs: number;
  totalElapsedMs: number;
  proxyWasRotated: boolean;
  userAgentWasRotated: boolean;
}

// ============================================================
// CONVENIENCE FUNCTIONS
// ============================================================

/**
 * Sleep for specified milliseconds
 */
export function sleep(ms: number): Promise<void> {
  return new Promise(resolve => setTimeout(resolve, ms));
}

/**
 * Execute a function with automatic retry logic
 */
export async function withRetry<T>(
  fn: (attemptNumber: number) => Promise<T>,
  config: Partial<RetryConfig> = {},
  callbacks?: {
    onRetry?: (decision: RetryDecision) => void | Promise<void>;
    onRotateProxy?: () => void | Promise<void>;
    onRotateUserAgent?: () => void | Promise<void>;
  }
): Promise<{ result: T; summary: RetryContextSummary }> {
  const manager = new RetryManager(config);

  while (manager.shouldAttempt()) {
    // Read the attempt number before recording it, so the first attempt is 1
    const attemptNumber = manager.getAttemptNumber();
    manager.recordAttempt();

    try {
      const result = await fn(attemptNumber);
      return { result, summary: manager.getSummary() };
    } catch (error) {
      const err = error instanceof Error ? error : new Error(String(error));
      const httpStatus = (error as any)?.status || (error as any)?.statusCode;

      const decision = manager.evaluateError(err, httpStatus);

      if (!decision.shouldRetry) {
        // Re-throw with enhanced context
        const enhancedError = new RetryExhaustedError(
          `${err.message} (${decision.reason})`,
          err,
          manager.getSummary()
        );
        throw enhancedError;
      }

      // Notify callbacks
      if (callbacks?.onRetry) {
        await callbacks.onRetry(decision);
      }
      if (decision.rotateProxy && callbacks?.onRotateProxy) {
        await callbacks.onRotateProxy();
      }
      if (decision.rotateUserAgent && callbacks?.onRotateUserAgent) {
        await callbacks.onRotateUserAgent();
      }

      // Log retry decision
      console.log(
        `[RetryManager] Attempt ${attemptNumber} failed: ${decision.errorCode}. ` +
        `${decision.reason}. Waiting ${decision.backoffMs}ms before retry.`
      );

      // Wait before retry
      await sleep(decision.backoffMs);
    }
  }

  // Should not reach here, but handle edge case
  throw new RetryExhaustedError(
    'Max retries exhausted',
    null,
    manager.getSummary()
  );
}
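
// Usage sketch: wrapping a flaky HTTP fetch in withRetry. Assumes a runtime
// with a global fetch (Node 18+); the error's .status field is set because
// withRetry reads .status/.statusCode when classifying the failure.
async function exampleWithRetry(url: string): Promise<string> {
  const { result, summary } = await withRetry(
    async () => {
      const res = await fetch(url);
      if (!res.ok) {
        const err: any = new Error(`HTTP ${res.status}`);
        err.status = res.status;
        throw err;
      }
      return res.text();
    },
    { maxRetries: 3 },
    {
      onRetry: (decision) => console.log(`retrying: ${decision.reason}`),
    }
  );
  console.log(`Succeeded after ${summary.attemptsMade} attempt(s)`);
  return result;
}
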
// ============================================================
// CUSTOM ERROR CLASS
// ============================================================

export class RetryExhaustedError extends Error {
  public readonly originalError: Error | null;
  public readonly summary: RetryContextSummary;
  public readonly errorCode: CrawlErrorCodeType;

  constructor(
    message: string,
    originalError: Error | null,
    summary: RetryContextSummary
  ) {
    super(message);
    this.name = 'RetryExhaustedError';
    this.originalError = originalError;
    this.summary = summary;
    this.errorCode = summary.lastErrorCode || CrawlErrorCode.UNKNOWN_ERROR;
  }
}

// ============================================================
// BACKOFF CALCULATOR (for external use)
// ============================================================

/**
 * Calculate next crawl time based on consecutive failures
 */
export function calculateNextCrawlDelay(
  consecutiveFailures: number,
  baseFrequencyMinutes: number,
  maxBackoffMultiplier: number = 4.0
): number {
  // Each failure doubles the delay, up to max multiplier
  const multiplier = Math.min(
    Math.pow(2, consecutiveFailures),
    maxBackoffMultiplier
  );

  const delayMinutes = baseFrequencyMinutes * multiplier;

  // Add jitter (0-10% of delay)
  const jitterMinutes = delayMinutes * Math.random() * 0.1;

  return Math.round(delayMinutes + jitterMinutes);
}

/**
 * Calculate next crawl timestamp
 */
export function calculateNextCrawlAt(
  consecutiveFailures: number,
  baseFrequencyMinutes: number
): Date {
  const delayMinutes = calculateNextCrawlDelay(consecutiveFailures, baseFrequencyMinutes);
  return new Date(Date.now() + delayMinutes * 60 * 1000);
}
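
// Worked example: with a 240-minute base frequency, consecutive failure counts
// of 0, 1, 2, 3 give multipliers 1, 2, 4, 4 (2^n capped at 4.0), i.e. roughly
// 240, 480, 960, 960 minutes until the next crawl, plus 0-10% random jitter.
// e.g. calculateNextCrawlDelay(2, 240) returns a value in the 960-1056 range.
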
// ============================================================
// STATUS DETERMINATION
// ============================================================

/**
 * Determine crawl status based on failure count
 */
export function determineCrawlStatus(
  consecutiveFailures: number,
  thresholds: { degraded: number; failed: number } = { degraded: 3, failed: 10 }
): 'active' | 'degraded' | 'failed' {
  if (consecutiveFailures >= thresholds.failed) {
    return 'failed';
  }
  if (consecutiveFailures >= thresholds.degraded) {
    return 'degraded';
  }
  return 'active';
}

/**
 * Determine if store should be auto-recovered
 * (Called periodically to check if failed stores can be retried)
 */
export function shouldAttemptRecovery(
  lastFailureAt: Date | null,
  consecutiveFailures: number,
  recoveryIntervalHours: number = 24
): boolean {
  if (!lastFailureAt) return true;

  // Wait longer for more failures
  const waitHours = recoveryIntervalHours * Math.min(consecutiveFailures, 5);
  const recoveryTime = new Date(lastFailureAt.getTime() + waitHours * 60 * 60 * 1000);

  return new Date() >= recoveryTime;
}
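
// Worked example with the default thresholds { degraded: 3, failed: 10 }:
// 0-2 failures -> 'active', 3-9 -> 'degraded', 10+ -> 'failed'. A failed
// store then becomes eligible for recovery after
// recoveryIntervalHours * min(failures, 5) hours since its last failure.
const exampleStatuses = [2, 3, 10].map((n) => determineCrawlStatus(n));
// exampleStatuses: ['active', 'degraded', 'failed']
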
// ============================================================
// SINGLETON INSTANCE
// ============================================================

export const retryManager = new RetryManager();
File diff suppressed because it is too large
@@ -1,465 +0,0 @@
/**
 * Store Configuration Validator
 *
 * Validates and sanitizes store configurations before crawling.
 * Applies defaults for missing values and logs warnings.
 *
 * Phase 1: Crawler Reliability & Stabilization
 */

import { CrawlErrorCode, CrawlErrorCodeType } from './error-taxonomy';

// ============================================================
// DEFAULT CONFIGURATION
// ============================================================

/**
 * Default crawl configuration values
 */
export const DEFAULT_CONFIG = {
  // Scheduling
  crawlFrequencyMinutes: 240, // 4 hours
  minCrawlGapMinutes: 2, // Minimum 2 minutes between crawls

  // Retries
  maxRetries: 3,
  baseBackoffMs: 1000, // 1 second
  maxBackoffMs: 60000, // 1 minute
  backoffMultiplier: 2.0, // Exponential backoff

  // Timeouts
  requestTimeoutMs: 30000, // 30 seconds
  pageLoadTimeoutMs: 60000, // 60 seconds

  // Limits
  maxProductsPerPage: 100,
  maxPages: 50,

  // Proxy
  proxyRotationEnabled: true,
  proxyRotationOnFailure: true,

  // User Agent
  userAgentRotationEnabled: true,
  userAgentRotationOnFailure: true,
} as const;

// ============================================================
// STORE CONFIG INTERFACE
// ============================================================

/**
 * Raw store configuration from database
 */
export interface RawStoreConfig {
  id: number;
  name: string;
  slug?: string;
  platform?: string;
  menuType?: string;
  platformDispensaryId?: string;
  menuUrl?: string;
  website?: string;

  // Crawl config
  crawlFrequencyMinutes?: number;
  maxRetries?: number;
  currentProxyId?: number;
  currentUserAgent?: string;

  // Status
  crawlStatus?: string;
  consecutiveFailures?: number;
  backoffMultiplier?: number;
  lastCrawlAt?: Date;
  lastSuccessAt?: Date;
  lastFailureAt?: Date;
  lastErrorCode?: string;
  nextCrawlAt?: Date;
}

/**
 * Validated and sanitized store configuration
 */
export interface ValidatedStoreConfig {
  id: number;
  name: string;
  slug: string;
  platform: string;
  menuType: string;
  platformDispensaryId: string;
  menuUrl: string;

  // Crawl config (with defaults applied)
  crawlFrequencyMinutes: number;
  maxRetries: number;
  currentProxyId: number | null;
  currentUserAgent: string | null;

  // Status
  crawlStatus: 'active' | 'degraded' | 'paused' | 'failed';
  consecutiveFailures: number;
  backoffMultiplier: number;
  lastCrawlAt: Date | null;
  lastSuccessAt: Date | null;
  lastFailureAt: Date | null;
  lastErrorCode: CrawlErrorCodeType | null;
  nextCrawlAt: Date | null;

  // Validation metadata
  isValid: boolean;
  validationErrors: ValidationError[];
  validationWarnings: ValidationWarning[];
}

// ============================================================
// VALIDATION TYPES
// ============================================================

export interface ValidationError {
  field: string;
  message: string;
  code: CrawlErrorCodeType;
}

export interface ValidationWarning {
  field: string;
  message: string;
  appliedDefault?: any;
}

export interface ValidationResult {
  isValid: boolean;
  config: ValidatedStoreConfig | null;
  errors: ValidationError[];
  warnings: ValidationWarning[];
}

// ============================================================
// VALIDATOR CLASS
// ============================================================

export class StoreValidator {
  private errors: ValidationError[] = [];
  private warnings: ValidationWarning[] = [];

  /**
   * Validate and sanitize a store configuration
   */
  validate(raw: RawStoreConfig): ValidationResult {
    this.errors = [];
    this.warnings = [];

    // Required field validation
    this.validateRequired(raw);

    // If critical errors, return early
    if (this.errors.length > 0) {
      return {
        isValid: false,
        config: null,
        errors: this.errors,
        warnings: this.warnings,
      };
    }

    // Build validated config with defaults
    const config = this.buildValidatedConfig(raw);

    return {
      isValid: this.errors.length === 0,
      config,
      errors: this.errors,
      warnings: this.warnings,
    };
  }

  /**
   * Validate required fields
   */
  private validateRequired(raw: RawStoreConfig): void {
    if (!raw.id) {
      this.addError('id', 'Store ID is required', CrawlErrorCode.INVALID_CONFIG);
    }

    if (!raw.name) {
      this.addError('name', 'Store name is required', CrawlErrorCode.INVALID_CONFIG);
    }

    if (!raw.platformDispensaryId) {
      this.addError(
        'platformDispensaryId',
        'Platform dispensary ID is required for crawling',
        CrawlErrorCode.MISSING_PLATFORM_ID
      );
    }

    if (!raw.menuType || raw.menuType === 'unknown') {
      this.addError(
        'menuType',
        'Menu type must be detected before crawling',
        CrawlErrorCode.INVALID_CONFIG
      );
    }
  }

  /**
   * Build validated config with defaults applied
   */
  private buildValidatedConfig(raw: RawStoreConfig): ValidatedStoreConfig {
    // Slug
    const slug = raw.slug || this.generateSlug(raw.name);
    if (!raw.slug) {
      this.addWarning('slug', 'Slug was missing, generated from name', slug);
    }

    // Platform
    const platform = raw.platform || 'dutchie';
    if (!raw.platform) {
      this.addWarning('platform', 'Platform was missing, defaulting to dutchie', platform);
    }

    // Menu URL
    const menuUrl = raw.menuUrl || this.generateMenuUrl(raw.platformDispensaryId!, platform);
    if (!raw.menuUrl) {
      this.addWarning('menuUrl', 'Menu URL was missing, generated from platform ID', menuUrl);
    }

    // Crawl frequency
    const crawlFrequencyMinutes = this.validateNumeric(
      raw.crawlFrequencyMinutes,
      'crawlFrequencyMinutes',
      DEFAULT_CONFIG.crawlFrequencyMinutes,
      60, // min: 1 hour
      1440 // max: 24 hours
    );

    // Max retries
    const maxRetries = this.validateNumeric(
      raw.maxRetries,
      'maxRetries',
      DEFAULT_CONFIG.maxRetries,
      1, // min
      10 // max
    );

    // Backoff multiplier
    const backoffMultiplier = this.validateNumeric(
      raw.backoffMultiplier,
      'backoffMultiplier',
      1.0,
      1.0, // min
      10.0 // max
    );

    // Crawl status
    const crawlStatus = this.validateCrawlStatus(raw.crawlStatus);

    // Consecutive failures
    const consecutiveFailures = Math.max(0, raw.consecutiveFailures || 0);

    // Last error code
    const lastErrorCode = this.validateErrorCode(raw.lastErrorCode);

    return {
      id: raw.id,
      name: raw.name,
      slug,
      platform,
      menuType: raw.menuType!,
      platformDispensaryId: raw.platformDispensaryId!,
      menuUrl,

      crawlFrequencyMinutes,
      maxRetries,
      currentProxyId: raw.currentProxyId || null,
      currentUserAgent: raw.currentUserAgent || null,

      crawlStatus,
      consecutiveFailures,
      backoffMultiplier,
      lastCrawlAt: raw.lastCrawlAt || null,
      lastSuccessAt: raw.lastSuccessAt || null,
      lastFailureAt: raw.lastFailureAt || null,
      lastErrorCode,
      nextCrawlAt: raw.nextCrawlAt || null,

      isValid: true,
      validationErrors: [],
      validationWarnings: this.warnings,
    };
  }

  /**
   * Validate numeric value with bounds
   */
  private validateNumeric(
    value: number | undefined,
    field: string,
    defaultValue: number,
    min: number,
    max: number
  ): number {
    if (value === undefined || value === null) {
      this.addWarning(field, `Missing, defaulting to ${defaultValue}`, defaultValue);
      return defaultValue;
    }

    if (value < min) {
      this.addWarning(field, `Value ${value} below minimum ${min}, using minimum`, min);
      return min;
    }

    if (value > max) {
      this.addWarning(field, `Value ${value} above maximum ${max}, using maximum`, max);
      return max;
    }

    return value;
  }

  /**
   * Validate crawl status
   */
  private validateCrawlStatus(status?: string): 'active' | 'degraded' | 'paused' | 'failed' {
    const validStatuses = ['active', 'degraded', 'paused', 'failed'];
    if (!status || !validStatuses.includes(status)) {
      if (status) {
        this.addWarning('crawlStatus', `Invalid status "${status}", defaulting to active`, 'active');
      }
      return 'active';
    }
    return status as 'active' | 'degraded' | 'paused' | 'failed';
  }

  /**
   * Validate error code
   */
  private validateErrorCode(code?: string): CrawlErrorCodeType | null {
    if (!code) return null;
    const validCodes = Object.values(CrawlErrorCode);
    if (!validCodes.includes(code as CrawlErrorCodeType)) {
      this.addWarning('lastErrorCode', `Invalid error code "${code}"`, null);
      return CrawlErrorCode.UNKNOWN_ERROR;
    }
    return code as CrawlErrorCodeType;
  }

  /**
   * Generate slug from name
   */
  private generateSlug(name: string): string {
    return name
      .toLowerCase()
      .replace(/[^a-z0-9]+/g, '-')
      .replace(/^-+|-+$/g, '')
      .substring(0, 100);
  }

  /**
   * Generate menu URL from platform ID
   */
  private generateMenuUrl(platformId: string, platform: string): string {
    if (platform === 'dutchie') {
      return `https://dutchie.com/embedded-menu/${platformId}`;
    }
    return `https://${platform}.com/menu/${platformId}`;
  }

  /**
   * Add validation error
   */
  private addError(field: string, message: string, code: CrawlErrorCodeType): void {
    this.errors.push({ field, message, code });
    console.warn(`[StoreValidator] ERROR ${field}: ${message}`);
  }

  /**
   * Add validation warning
   */
  private addWarning(field: string, message: string, appliedDefault?: any): void {
    this.warnings.push({ field, message, appliedDefault });
    // Log at debug level - warnings are expected for incomplete configs
    console.debug(`[StoreValidator] WARNING ${field}: ${message}`);
  }
}

// ============================================================
// CONVENIENCE FUNCTIONS
// ============================================================

/**
 * Validate a single store config
 */
export function validateStoreConfig(raw: RawStoreConfig): ValidationResult {
  const validator = new StoreValidator();
  return validator.validate(raw);
}

/**
 * Validate multiple store configs
 */
export function validateStoreConfigs(raws: RawStoreConfig[]): {
  valid: ValidatedStoreConfig[];
  invalid: { raw: RawStoreConfig; errors: ValidationError[] }[];
  warnings: { storeId: number; warnings: ValidationWarning[] }[];
} {
  const valid: ValidatedStoreConfig[] = [];
  const invalid: { raw: RawStoreConfig; errors: ValidationError[] }[] = [];
  const warnings: { storeId: number; warnings: ValidationWarning[] }[] = [];

  for (const raw of raws) {
    const result = validateStoreConfig(raw);

    if (result.isValid && result.config) {
      valid.push(result.config);
      if (result.warnings.length > 0) {
        warnings.push({ storeId: raw.id, warnings: result.warnings });
      }
    } else {
      invalid.push({ raw, errors: result.errors });
    }
  }

  return { valid, invalid, warnings };
}
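
// Usage sketch: validating rows loaded from the dispensaries table before a
// crawl run, keeping only the configs that passed.
function exampleValidateBatch(rows: RawStoreConfig[]): ValidatedStoreConfig[] {
  const { valid, invalid, warnings } = validateStoreConfigs(rows);
  for (const { raw, errors } of invalid) {
    console.warn(`Store ${raw.id} not crawlable: ${errors.map((e) => e.message).join('; ')}`);
  }
  for (const { storeId, warnings: ws } of warnings) {
    console.debug(`Store ${storeId}: ${ws.length} default(s) applied`);
  }
  return valid;
}
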
/**
 * Quick check if a store is crawlable
 */
export function isCrawlable(raw: RawStoreConfig): boolean {
  return !!(
    raw.id &&
    raw.name &&
    raw.platformDispensaryId &&
    raw.menuType &&
    raw.menuType !== 'unknown' &&
    raw.crawlStatus !== 'failed' &&
    raw.crawlStatus !== 'paused'
  );
}

/**
 * Get reason why store is not crawlable
 */
export function getNotCrawlableReason(raw: RawStoreConfig): string | null {
  if (!raw.platformDispensaryId) {
    return 'Missing platform_dispensary_id';
  }
  if (!raw.menuType || raw.menuType === 'unknown') {
    return 'Menu type not detected';
  }
  if (raw.crawlStatus === 'failed') {
    return 'Store is marked as failed';
  }
  if (raw.crawlStatus === 'paused') {
    return 'Crawling is paused';
  }
  return null;
}

// ============================================================
// SINGLETON INSTANCE
// ============================================================

export const storeValidator = new StoreValidator();
@@ -1,750 +0,0 @@
/**
 * Worker Service
 *
 * Polls the job queue and processes crawl jobs.
 * Each worker instance runs independently, claiming jobs atomically.
 *
 * Phase 1: Enhanced with self-healing logic, error taxonomy, and retry management.
 */

import {
  claimNextJob,
  completeJob,
  failJob,
  updateJobProgress,
  heartbeat,
  getWorkerId,
  getWorkerHostname,
  recoverStaleJobs,
  QueuedJob,
} from './job-queue';
import { crawlDispensaryProducts } from './product-crawler';
import { mapDbRowToDispensary } from './discovery';
import { query } from '../db/connection';

// Phase 1: Error taxonomy and retry management
import {
  CrawlErrorCode,
  CrawlErrorCodeType,
  classifyError,
  isRetryable,
  shouldRotateProxy,
  shouldRotateUserAgent,
  createSuccessResult,
  createFailureResult,
  CrawlResult,
} from './error-taxonomy';
import {
  RetryManager,
  RetryDecision,
  calculateNextCrawlAt,
  determineCrawlStatus,
  shouldAttemptRecovery,
  sleep,
} from './retry-manager';
import {
  CrawlRotator,
  userAgentRotator,
  updateDispensaryRotation,
} from './proxy-rotator';
import { DEFAULT_CONFIG, validateStoreConfig, isCrawlable } from './store-validator';

// Use shared dispensary columns (handles optional columns like provider_detection_data)
// NOTE: Using WITH_FAILED variant for worker compatibility checks
import { DISPENSARY_COLUMNS_WITH_FAILED as DISPENSARY_COLUMNS } from '../db/dispensary-columns';

// ============================================================
// WORKER CONFIG
// ============================================================

const POLL_INTERVAL_MS = 5000; // Check for jobs every 5 seconds
const HEARTBEAT_INTERVAL_MS = 60000; // Send heartbeat every 60 seconds
const STALE_CHECK_INTERVAL_MS = 300000; // Check for stale jobs every 5 minutes
const SHUTDOWN_GRACE_PERIOD_MS = 30000; // Wait 30s for job to complete on shutdown

// ============================================================
// WORKER STATE
// ============================================================

let isRunning = false;
let currentJob: QueuedJob | null = null;
let pollTimer: NodeJS.Timeout | null = null;
let heartbeatTimer: NodeJS.Timeout | null = null;
let staleCheckTimer: NodeJS.Timeout | null = null;
let shutdownPromise: Promise<void> | null = null;

// ============================================================
// WORKER LIFECYCLE
// ============================================================

/**
 * Start the worker
 */
export async function startWorker(): Promise<void> {
  if (isRunning) {
    console.log('[Worker] Already running');
    return;
  }

  const workerId = getWorkerId();
  const hostname = getWorkerHostname();

  console.log(`[Worker] Starting worker ${workerId} on ${hostname}`);
  isRunning = true;

  // Set up graceful shutdown
  setupShutdownHandlers();

  // Start polling for jobs
  pollTimer = setInterval(pollForJobs, POLL_INTERVAL_MS);

  // Start stale job recovery (only one worker should do this, but it's idempotent)
  staleCheckTimer = setInterval(async () => {
    try {
      await recoverStaleJobs(15);
    } catch (error) {
      console.error('[Worker] Error recovering stale jobs:', error);
    }
  }, STALE_CHECK_INTERVAL_MS);

  // Immediately poll for a job
  await pollForJobs();

  console.log(`[Worker] Worker ${workerId} started, polling every ${POLL_INTERVAL_MS}ms`);
}

/**
 * Stop the worker gracefully
 */
export async function stopWorker(): Promise<void> {
  if (!isRunning) return;

  console.log('[Worker] Stopping worker...');
  isRunning = false;

  // Clear timers
  if (pollTimer) {
    clearInterval(pollTimer);
    pollTimer = null;
  }
  if (heartbeatTimer) {
    clearInterval(heartbeatTimer);
    heartbeatTimer = null;
  }
  if (staleCheckTimer) {
    clearInterval(staleCheckTimer);
    staleCheckTimer = null;
  }

  // Wait for current job to complete
  if (currentJob) {
    console.log(`[Worker] Waiting for job ${currentJob.id} to complete...`);
    const startWait = Date.now();

    while (currentJob && Date.now() - startWait < SHUTDOWN_GRACE_PERIOD_MS) {
      await new Promise(r => setTimeout(r, 1000));
    }

    if (currentJob) {
      console.log(`[Worker] Job ${currentJob.id} did not complete in time, marking for retry`);
      await failJob(currentJob.id, 'Worker shutdown');
    }
  }

  console.log('[Worker] Worker stopped');
}

/**
 * Get worker status
 */
export function getWorkerStatus(): {
  isRunning: boolean;
  workerId: string;
  hostname: string;
  currentJob: QueuedJob | null;
} {
  return {
    isRunning,
    workerId: getWorkerId(),
    hostname: getWorkerHostname(),
    currentJob,
  };
}
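
// Usage sketch: a minimal entrypoint wiring the lifecycle functions together.
// The 60s status interval is illustrative; graceful shutdown itself is already
// installed by setupShutdownHandlers() inside startWorker().
async function exampleMain(): Promise<void> {
  await startWorker();
  setInterval(() => {
    const status = getWorkerStatus();
    console.log(
      `[Worker] ${status.workerId} running=${status.isRunning} ` +
      `job=${status.currentJob ? status.currentJob.id : 'none'}`
    );
  }, 60000);
}
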
// ============================================================
|
||||
// JOB PROCESSING
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* Poll for and process the next available job
|
||||
*/
|
||||
async function pollForJobs(): Promise<void> {
|
||||
if (!isRunning || currentJob) {
|
||||
return; // Already processing a job
|
||||
}
|
||||
|
||||
try {
|
||||
const workerId = getWorkerId();
|
||||
|
||||
// Try to claim a job
|
||||
const job = await claimNextJob({
|
||||
workerId,
|
||||
jobTypes: ['dutchie_product_crawl', 'menu_detection', 'menu_detection_single'],
|
||||
lockDurationMinutes: 30,
|
||||
});
|
||||
|
||||
if (!job) {
|
||||
return; // No jobs available
|
||||
}
|
||||
|
||||
currentJob = job;
|
||||
console.log(`[Worker] Processing job ${job.id} (type=${job.jobType}, dispensary=${job.dispensaryId})`);
|
||||
|
||||
// Start heartbeat for this job
|
||||
heartbeatTimer = setInterval(async () => {
|
||||
if (currentJob) {
|
||||
try {
|
||||
await heartbeat(currentJob.id);
|
||||
} catch (error) {
|
||||
console.error('[Worker] Heartbeat error:', error);
|
||||
}
|
||||
}
|
||||
}, HEARTBEAT_INTERVAL_MS);
|
||||
|
||||
// Process the job
|
||||
await processJob(job);
|
||||
|
||||
} catch (error: any) {
|
||||
console.error('[Worker] Error polling for jobs:', error);
|
||||
|
||||
if (currentJob) {
|
||||
try {
|
||||
await failJob(currentJob.id, error.message);
|
||||
} catch (failError) {
|
||||
console.error('[Worker] Error failing job:', failError);
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
// Clear heartbeat timer
|
||||
if (heartbeatTimer) {
|
||||
clearInterval(heartbeatTimer);
|
||||
heartbeatTimer = null;
|
||||
}
|
||||
currentJob = null;
|
||||
}
|
||||
}
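
// Note on concurrency (descriptive of the guard above): the `currentJob`
// check makes each worker process strictly one job at a time; horizontal
// scaling comes from running more worker instances, each claiming jobs under
// its own workerId with a 30-minute lock.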

/**
 * Process a single job
 */
async function processJob(job: QueuedJob): Promise<void> {
  try {
    switch (job.jobType) {
      case 'dutchie_product_crawl':
        await processProductCrawlJob(job);
        break;

      case 'menu_detection':
        await processMenuDetectionJob(job);
        break;

      case 'menu_detection_single':
        await processSingleDetectionJob(job);
        break;

      default:
        throw new Error(`Unknown job type: ${job.jobType}`);
    }
  } catch (error: any) {
    console.error(`[Worker] Job ${job.id} failed:`, error);
    await failJob(job.id, error.message);
  }
}

// Thresholds for crawl status transitions
const DEGRADED_THRESHOLD = 3; // Mark as degraded after 3 consecutive failures
const FAILED_THRESHOLD = 10;  // Mark as failed after 10 consecutive failures

// For backwards compatibility
const MAX_CONSECUTIVE_FAILURES = FAILED_THRESHOLD;

/**
 * Record a successful crawl - resets failure counter and restores active status
 */
async function recordCrawlSuccess(
  dispensaryId: number,
  result: CrawlResult
): Promise<void> {
  // Calculate next crawl time (use store's frequency or default)
  const { rows: storeRows } = await query<any>(
    `SELECT crawl_frequency_minutes FROM dispensaries WHERE id = $1`,
    [dispensaryId]
  );
  const frequencyMinutes = storeRows[0]?.crawl_frequency_minutes || DEFAULT_CONFIG.crawlFrequencyMinutes;
  const nextCrawlAt = calculateNextCrawlAt(0, frequencyMinutes);

  // Reset failure state and schedule next crawl
  await query(
    `UPDATE dispensaries
     SET consecutive_failures = 0,
         crawl_status = 'active',
         backoff_multiplier = 1.0,
         last_crawl_at = NOW(),
         last_success_at = NOW(),
         last_error_code = NULL,
         next_crawl_at = $2,
         total_attempts = COALESCE(total_attempts, 0) + 1,
         total_successes = COALESCE(total_successes, 0) + 1,
         updated_at = NOW()
     WHERE id = $1`,
    [dispensaryId, nextCrawlAt]
  );

  // Log to crawl_attempts table for analytics
  await logCrawlAttempt(dispensaryId, result);

  console.log(`[Worker] Dispensary ${dispensaryId} crawl success. Next crawl at ${nextCrawlAt.toISOString()}`);
}

/**
 * Record a crawl failure with self-healing logic
 * - Rotates proxy/UA based on error type
 * - Transitions through: active -> degraded -> failed
 * - Calculates backoff for next attempt
 */
async function recordCrawlFailure(
  dispensaryId: number,
  errorMessage: string,
  errorCode?: CrawlErrorCodeType,
  httpStatus?: number,
  context?: {
    proxyUsed?: string;
    userAgentUsed?: string;
    attemptNumber?: number;
  }
): Promise<{ wasFlagged: boolean; newStatus: string; shouldRotateProxy: boolean; shouldRotateUA: boolean }> {
  // Classify the error if not provided
  const code = errorCode || classifyError(errorMessage, httpStatus);

  // Get current state
  const { rows: storeRows } = await query<any>(
    `SELECT
       consecutive_failures,
       crawl_status,
       backoff_multiplier,
       crawl_frequency_minutes,
       current_proxy_id,
       current_user_agent
     FROM dispensaries WHERE id = $1`,
    [dispensaryId]
  );

  if (storeRows.length === 0) {
    return { wasFlagged: false, newStatus: 'unknown', shouldRotateProxy: false, shouldRotateUA: false };
  }

  const store = storeRows[0];
  const currentFailures = (store.consecutive_failures || 0) + 1;
  const frequencyMinutes = store.crawl_frequency_minutes || DEFAULT_CONFIG.crawlFrequencyMinutes;

  // Determine if we should rotate proxy/UA based on error type
  const rotateProxy = shouldRotateProxy(code);
  const rotateUA = shouldRotateUserAgent(code);

  // Get new proxy/UA if rotation is needed
  let newProxyId = store.current_proxy_id;
  let newUserAgent = store.current_user_agent;

  if (rotateUA) {
    newUserAgent = userAgentRotator.getNext();
    console.log(`[Worker] Rotating user agent for dispensary ${dispensaryId} after ${code}`);
  }

  // Determine new crawl status
  const newStatus = determineCrawlStatus(currentFailures, {
    degraded: DEGRADED_THRESHOLD,
    failed: FAILED_THRESHOLD,
  });

  // Calculate backoff multiplier and next crawl time
  const newBackoffMultiplier = Math.min(
    (store.backoff_multiplier || 1.0) * 1.5,
    4.0 // Max 4x backoff
  );
  const nextCrawlAt = calculateNextCrawlAt(currentFailures, frequencyMinutes);

  // Update dispensary with new failure state
  if (newStatus === 'failed') {
    // Mark as failed - won't be crawled again until manual intervention
    await query(
      `UPDATE dispensaries
       SET consecutive_failures = $2,
           crawl_status = $3,
           backoff_multiplier = $4,
           last_failure_at = NOW(),
           last_error_code = $5,
           failed_at = NOW(),
           failure_notes = $6,
           next_crawl_at = NULL,
           current_proxy_id = $7,
           current_user_agent = $8,
           total_attempts = COALESCE(total_attempts, 0) + 1,
           updated_at = NOW()
       WHERE id = $1`,
      [
        dispensaryId,
        currentFailures,
        newStatus,
        newBackoffMultiplier,
        code,
        `Auto-flagged after ${currentFailures} consecutive failures. Last error: ${errorMessage}`,
        newProxyId,
        newUserAgent,
      ]
    );
    console.log(`[Worker] Dispensary ${dispensaryId} marked as FAILED after ${currentFailures} failures (${code})`);
  } else {
    // Update failure count but keep crawling (active or degraded)
    await query(
      `UPDATE dispensaries
       SET consecutive_failures = $2,
           crawl_status = $3,
           backoff_multiplier = $4,
           last_failure_at = NOW(),
           last_error_code = $5,
           next_crawl_at = $6,
           current_proxy_id = $7,
           current_user_agent = $8,
           total_attempts = COALESCE(total_attempts, 0) + 1,
           updated_at = NOW()
       WHERE id = $1`,
      [
        dispensaryId,
        currentFailures,
        newStatus,
        newBackoffMultiplier,
        code,
        nextCrawlAt,
        newProxyId,
        newUserAgent,
      ]
    );

    if (newStatus === 'degraded') {
      console.log(`[Worker] Dispensary ${dispensaryId} marked as DEGRADED (${currentFailures}/${FAILED_THRESHOLD} failures). Next crawl: ${nextCrawlAt.toISOString()}`);
    } else {
      console.log(`[Worker] Dispensary ${dispensaryId} failure recorded (${currentFailures}/${DEGRADED_THRESHOLD}). Next crawl: ${nextCrawlAt.toISOString()}`);
    }
  }

  // Log to crawl_attempts table
  const result = createFailureResult(
    dispensaryId,
    new Date(),
    errorMessage,
    httpStatus,
    context
  );
  await logCrawlAttempt(dispensaryId, result);

  return {
    wasFlagged: newStatus === 'failed',
    newStatus,
    shouldRotateProxy: rotateProxy,
    shouldRotateUA: rotateUA,
  };
}
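
// Status-transition walkthrough (derived from the constants above): with
// DEGRADED_THRESHOLD = 3 and FAILED_THRESHOLD = 10, a store stays 'active'
// for failures 1-2, turns 'degraded' at failure 3, and is flagged 'failed'
// at failure 10 (next_crawl_at = NULL until manual intervention). The stored
// backoff_multiplier grows 1.5x per failure and is capped at 4.0; the actual
// retry spacing comes from calculateNextCrawlAt(failures, frequencyMinutes).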

/**
 * Log a crawl attempt to the crawl_attempts table for analytics
 */
async function logCrawlAttempt(
  dispensaryId: number,
  result: CrawlResult
): Promise<void> {
  try {
    await query(
      `INSERT INTO crawl_attempts (
         dispensary_id, started_at, finished_at, duration_ms,
         error_code, error_message, http_status,
         attempt_number, proxy_used, user_agent_used,
         products_found, products_upserted, snapshots_created,
         created_at
       ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, NOW())`,
      [
        dispensaryId,
        result.startedAt,
        result.finishedAt,
        result.durationMs,
        result.errorCode,
        result.errorMessage || null,
        result.httpStatus || null,
        result.attemptNumber,
        result.proxyUsed || null,
        result.userAgentUsed || null,
        result.productsFound || 0,
        result.productsUpserted || 0,
        result.snapshotsCreated || 0,
      ]
    );
  } catch (error) {
    // Don't fail the job if logging fails
    console.error(`[Worker] Failed to log crawl attempt for dispensary ${dispensaryId}:`, error);
  }
}

/**
 * Process a product crawl job for a single dispensary
 */
async function processProductCrawlJob(job: QueuedJob): Promise<void> {
  const startedAt = new Date();
  const userAgent = userAgentRotator.getCurrent();

  if (!job.dispensaryId) {
    throw new Error('Product crawl job requires dispensary_id');
  }

  // Get dispensary details
  const { rows } = await query<any>(
    `SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE id = $1`,
    [job.dispensaryId]
  );

  if (rows.length === 0) {
    throw new Error(`Dispensary ${job.dispensaryId} not found`);
  }

  const dispensary = mapDbRowToDispensary(rows[0]);
  const rawDispensary = rows[0];

  // Check if dispensary is already flagged as failed
  if (rawDispensary.failed_at) {
    console.log(`[Worker] Skipping dispensary ${job.dispensaryId} - already flagged as failed`);
    await completeJob(job.id, { productsFound: 0, productsUpserted: 0 });
    return;
  }

  // Check crawl status - skip if paused or failed
  if (rawDispensary.crawl_status === 'paused' || rawDispensary.crawl_status === 'failed') {
    console.log(`[Worker] Skipping dispensary ${job.dispensaryId} - crawl_status is ${rawDispensary.crawl_status}`);
    await completeJob(job.id, { productsFound: 0, productsUpserted: 0 });
    return;
  }

  if (!dispensary.platformDispensaryId) {
    // Record failure with error taxonomy
    const { wasFlagged } = await recordCrawlFailure(
      job.dispensaryId,
      'Missing platform_dispensary_id',
      CrawlErrorCode.MISSING_PLATFORM_ID,
      undefined,
      { userAgentUsed: userAgent, attemptNumber: job.retryCount + 1 }
    );
    if (wasFlagged) {
      await completeJob(job.id, { productsFound: 0, productsUpserted: 0 });
      return;
    }
    throw new Error(`Dispensary ${job.dispensaryId} has no platform_dispensary_id`);
  }

  // Get crawl options from job metadata
  const pricingType = job.metadata?.pricingType || 'rec';
  const useBothModes = job.metadata?.useBothModes !== false;

  try {
    // Crawl the dispensary
    const result = await crawlDispensaryProducts(dispensary, pricingType, {
      useBothModes,
      onProgress: async (progress) => {
        // Update progress for live monitoring
        await updateJobProgress(job.id, {
          productsFound: progress.productsFound,
          productsUpserted: progress.productsUpserted,
          snapshotsCreated: progress.snapshotsCreated,
          currentPage: progress.currentPage,
          totalPages: progress.totalPages,
        });
      },
    });

    if (result.success) {
      // Success! Create result and record
      const crawlResult = createSuccessResult(
        job.dispensaryId,
        startedAt,
        {
          productsFound: result.productsFetched,
          productsUpserted: result.productsUpserted,
          snapshotsCreated: result.snapshotsCreated,
        },
        {
          attemptNumber: job.retryCount + 1,
          userAgentUsed: userAgent,
        }
      );
      await recordCrawlSuccess(job.dispensaryId, crawlResult);
      await completeJob(job.id, {
        productsFound: result.productsFetched,
        productsUpserted: result.productsUpserted,
        snapshotsCreated: result.snapshotsCreated,
        // Visibility tracking stats for dashboard
        visibilityLostCount: result.visibilityLostCount || 0,
        visibilityRestoredCount: result.visibilityRestoredCount || 0,
      });
    } else {
      // Crawl returned failure - classify error and record
      const errorCode = classifyError(result.errorMessage || 'Crawl failed', result.httpStatus);
      const { wasFlagged } = await recordCrawlFailure(
        job.dispensaryId,
        result.errorMessage || 'Crawl failed',
        errorCode,
        result.httpStatus,
        { userAgentUsed: userAgent, attemptNumber: job.retryCount + 1 }
      );

      if (wasFlagged) {
        // Dispensary is now flagged - complete the job
        await completeJob(job.id, { productsFound: 0, productsUpserted: 0 });
      } else if (!isRetryable(errorCode)) {
        // Non-retryable error - complete as failed
        await completeJob(job.id, { productsFound: 0, productsUpserted: 0 });
      } else {
        // Retryable error - let job queue handle retry
        throw new Error(result.errorMessage || 'Crawl failed');
      }
    }
  } catch (error: any) {
    // Record the failure with error taxonomy
    const errorCode = classifyError(error.message);
    const { wasFlagged } = await recordCrawlFailure(
      job.dispensaryId,
      error.message,
      errorCode,
      undefined,
      { userAgentUsed: userAgent, attemptNumber: job.retryCount + 1 }
    );

    if (wasFlagged) {
      // Dispensary is now flagged - complete the job
      await completeJob(job.id, { productsFound: 0, productsUpserted: 0 });
    } else if (!isRetryable(errorCode)) {
      // Non-retryable error - complete as failed
      await completeJob(job.id, { productsFound: 0, productsUpserted: 0 });
    } else {
      throw error;
    }
  }
}
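
// Retry-policy summary (descriptive of the branches above): a failure ends
// the job in one of three ways -- (1) the dispensary was just flagged
// 'failed', so the job completes and the store leaves the rotation; (2) the
// error is non-retryable per isRetryable(), so the job completes as failed;
// (3) the error is retryable, so the handler throws and the job queue's
// retry machinery re-schedules it.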

/**
 * Process a menu detection job (bulk)
 */
async function processMenuDetectionJob(job: QueuedJob): Promise<void> {
  const { executeMenuDetectionJob } = await import('./menu-detection');

  const config = job.metadata || {};
  const result = await executeMenuDetectionJob(config);

  if (result.status === 'error') {
    throw new Error(result.errorMessage || 'Menu detection failed');
  }

  await completeJob(job.id, {
    productsFound: result.itemsProcessed,
    productsUpserted: result.itemsSucceeded,
  });
}

/**
 * Process a single dispensary menu detection job
 * This is the parallelizable version - each worker can detect one dispensary at a time
 */
async function processSingleDetectionJob(job: QueuedJob): Promise<void> {
  if (!job.dispensaryId) {
    throw new Error('Single detection job requires dispensary_id');
  }

  const { detectAndResolveDispensary } = await import('./menu-detection');

  // Get dispensary details
  const { rows } = await query<any>(
    `SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE id = $1`,
    [job.dispensaryId]
  );

  if (rows.length === 0) {
    throw new Error(`Dispensary ${job.dispensaryId} not found`);
  }

  const dispensary = rows[0];

  // Skip if already detected or failed
  if (dispensary.failed_at) {
    console.log(`[Worker] Skipping dispensary ${job.dispensaryId} - already flagged as failed`);
    await completeJob(job.id, { productsFound: 0, productsUpserted: 0 });
    return;
  }

  if (dispensary.menu_type && dispensary.menu_type !== 'unknown') {
    console.log(`[Worker] Skipping dispensary ${job.dispensaryId} - already detected as ${dispensary.menu_type}`);
    await completeJob(job.id, { productsFound: 0, productsUpserted: 1 });
    return;
  }

  console.log(`[Worker] Detecting menu for dispensary ${job.dispensaryId} (${dispensary.name})...`);

  try {
    const result = await detectAndResolveDispensary(job.dispensaryId);

    if (result.success) {
      console.log(`[Worker] Dispensary ${job.dispensaryId}: detected ${result.detectedProvider}, platformId=${result.platformDispensaryId || 'none'}`);
      await completeJob(job.id, {
        productsFound: 1,
        productsUpserted: result.platformDispensaryId ? 1 : 0,
      });
    } else {
      // Detection failed - record failure
      await recordCrawlFailure(job.dispensaryId, result.error || 'Detection failed');
      throw new Error(result.error || 'Detection failed');
    }
  } catch (error: any) {
    // Record the failure (destructure the result object; recordCrawlFailure
    // returns { wasFlagged, ... }, not a bare boolean)
    const { wasFlagged } = await recordCrawlFailure(job.dispensaryId, error.message);
    if (wasFlagged) {
      // Dispensary is now flagged - complete the job rather than fail it
      await completeJob(job.id, { productsFound: 0, productsUpserted: 0 });
    } else {
      throw error;
    }
  }
}

// ============================================================
// SHUTDOWN HANDLING
// ============================================================

function setupShutdownHandlers(): void {
  const shutdown = async (signal: string) => {
    if (shutdownPromise) return shutdownPromise;

    console.log(`\n[Worker] Received ${signal}, shutting down...`);
    shutdownPromise = stopWorker();
    await shutdownPromise;
    process.exit(0);
  };

  process.on('SIGTERM', () => shutdown('SIGTERM'));
  process.on('SIGINT', () => shutdown('SIGINT'));
}

// ============================================================
// STANDALONE WORKER ENTRY POINT
// ============================================================

if (require.main === module) {
  // Run as standalone worker
  startWorker().catch((error) => {
    console.error('[Worker] Fatal error:', error);
    process.exit(1);
  });
}
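
// Usage sketch (illustrative; the compiled filename is an assumption, not
// taken from this repo): the entry-point guard above lets this module run
// standalone, e.g.
//
//   node dist/jobs/worker.js
//
// SIGTERM/SIGINT then route through setupShutdownHandlers(), so a container
// stop waits up to SHUTDOWN_GRACE_PERIOD_MS for the in-flight job before
// exiting.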
@@ -1,751 +0,0 @@
/**
 * Dutchie AZ Data Types
 *
 * Complete TypeScript interfaces for the isolated Dutchie Arizona data pipeline.
 * These types map directly to Dutchie's GraphQL FilteredProducts response.
 */

// ============================================================
// GRAPHQL RESPONSE TYPES (from Dutchie API)
// ============================================================

/**
 * Raw Dutchie brand object from GraphQL
 */
export interface DutchieBrand {
  id: string;
  _id?: string;
  name: string;
  parentBrandId?: string;
  imageUrl?: string;
  description?: string;
  __typename?: string;
}

/**
 * Raw Dutchie image object from GraphQL
 */
export interface DutchieImage {
  url: string;
  description?: string;
  active?: boolean;
  __typename?: string;
}

/**
 * POSMetaData.children - option-level inventory/pricing
 */
export interface DutchiePOSChild {
  activeBatchTags?: any;
  canonicalBrandId?: string;
  canonicalBrandName?: string;
  canonicalCategory?: string;
  canonicalCategoryId?: string;
  canonicalEffectivePotencyMg?: number;
  canonicalID?: string;
  canonicalPackageId?: string;
  canonicalImgUrl?: string;
  canonicalLabResultUrl?: string;
  canonicalName?: string;
  canonicalSKU?: string;
  canonicalProductTags?: string[];
  canonicalStrainId?: string;
  canonicalVendorId?: string;
  kioskQuantityAvailable?: number;
  medPrice?: number;
  option?: string;
  packageQuantity?: number;
  price?: number;
  quantity?: number;
  quantityAvailable?: number;
  recEquivalent?: number;
  recPrice?: number;
  standardEquivalent?: number;
  __typename?: string;
}

/**
 * POSMetaData object from GraphQL
 */
export interface DutchiePOSMetaData {
  activeBatchTags?: any;
  canonicalBrandId?: string;
  canonicalBrandName?: string;
  canonicalCategory?: string;
  canonicalCategoryId?: string;
  canonicalID?: string;
  canonicalPackageId?: string;
  canonicalImgUrl?: string;
  canonicalLabResultUrl?: string;
  canonicalName?: string;
  canonicalProductTags?: string[];
  canonicalSKU?: string;
  canonicalStrainId?: string;
  canonicalVendorId?: string;
  children?: DutchiePOSChild[];
  integrationID?: string;
  __typename?: string;
}

/**
 * THC/CBD Content structure
 */
export interface DutchiePotencyContent {
  unit?: string;
  range?: number[];
}

/**
 * CannabinoidV2 structure
 */
export interface DutchieCannabinoidV2 {
  value: number;
  unit: string;
  cannabinoid: {
    name: string;
  };
}

/**
 * Special data structure
 */
export interface DutchieSpecialData {
  saleSpecials?: Array<{
    specialId: string;
    specialName: string;
    discount: number;
    percentDiscount: boolean;
    dollarDiscount: boolean;
    specialType: string;
  }>;
  bogoSpecials?: any;
}

/**
 * Complete raw product from Dutchie GraphQL FilteredProducts
 */
export interface DutchieRawProduct {
  _id: string;
  id?: string;
  AdditionalOptions?: any;
  duplicatedProductId?: string;
  libraryProductId?: string;
  libraryProductScore?: number;

  // Brand
  brand?: DutchieBrand;
  brandId?: string;
  brandName?: string;
  brandLogo?: string;

  // Potency
  CBD?: number;
  CBDContent?: DutchiePotencyContent;
  THC?: number;
  THCContent?: DutchiePotencyContent;
  cannabinoidsV2?: DutchieCannabinoidV2[];

  // Flags
  certificateOfAnalysisEnabled?: boolean;
  collectionCardBadge?: string;
  comingSoon?: boolean;
  featured?: boolean;
  medicalOnly?: boolean;
  recOnly?: boolean;
  nonArmsLength?: boolean;
  vapeTaxApplicable?: boolean;
  useBetterPotencyTaxes?: boolean;

  // Timestamps
  createdAt?: string;
  updatedAt?: string;

  // Dispensary
  DispensaryID: string;
  enterpriseProductId?: string;

  // Images
  Image?: string;
  images?: DutchieImage[];

  // Measurements
  measurements?: {
    netWeight?: {
      unit: string;
      values: number[];
    };
    volume?: any;
  };
  weight?: number | string;

  // Product identity
  Name: string;
  cName: string;
  pastCNames?: string[];

  // Options
  Options?: string[];
  rawOptions?: string[];
  limitsPerCustomer?: any;
  manualInventory?: boolean;

  // POS data
  POSMetaData?: DutchiePOSMetaData;

  // Pricing
  Prices?: number[];
  recPrices?: number[];
  medicalPrices?: number[];
  recSpecialPrices?: number[];
  medicalSpecialPrices?: number[];
  wholesalePrices?: number[];
  pricingTierData?: any;
  specialIdsPerOption?: any;

  // Specials
  special?: boolean;
  specialData?: DutchieSpecialData;

  // Classification
  Status?: string;
  strainType?: string;
  subcategory?: string;
  type?: string;
  provider?: string;
  effects?: Record<string, any>;

  // Threshold flags
  isBelowThreshold?: boolean;
  isBelowKioskThreshold?: boolean;
  optionsBelowThreshold?: boolean;
  optionsBelowKioskThreshold?: boolean;

  // Misc
  bottleDepositTaxCents?: number;
  __typename?: string;
}
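
// Minimal example payload (illustrative values only) satisfying
// DutchieRawProduct -- handy when exercising the derivation helpers below:
//
//   const sample: DutchieRawProduct = {
//     _id: 'abc123',
//     DispensaryID: '6405ef617056e8014d79101b',
//     Name: 'Example Flower 3.5g',
//     cName: 'example-flower-3-5g',
//     Options: ['3.5g'],
//     POSMetaData: { children: [{ option: '3.5g', quantityAvailable: 4, recPrice: 25 }] },
//   };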

// ============================================================
// DERIVED TYPES
// ============================================================

/**
 * StockStatus - derived from POSMetaData.children quantityAvailable
 * - 'in_stock': At least one option has quantityAvailable > 0
 * - 'out_of_stock': All options have quantityAvailable === 0
 * - 'unknown': No POSMetaData.children or quantityAvailable data
 * - 'missing_from_feed': Product was not present in the latest crawl feed
 */
export type StockStatus = 'in_stock' | 'out_of_stock' | 'unknown' | 'missing_from_feed';

/**
 * CrawlMode - defines how products are fetched from Dutchie
 * - 'mode_a': UI parity - Status: 'Active', threshold removal ON
 * - 'mode_b': MAX COVERAGE - No Status filter, bypass thresholds
 */
export type CrawlMode = 'mode_a' | 'mode_b';

/**
 * Per-option stock status type
 */
export type OptionStockStatus = 'in_stock' | 'out_of_stock' | 'unknown';

/**
 * Get available quantity for a single option
 * Priority: quantityAvailable > kioskQuantityAvailable > quantity
 */
export function getOptionQuantity(child: DutchiePOSChild): number | null {
  if (typeof child.quantityAvailable === 'number') return child.quantityAvailable;
  if (typeof child.kioskQuantityAvailable === 'number') return child.kioskQuantityAvailable;
  if (typeof child.quantity === 'number') return child.quantity;
  return null; // No quantity data available
}

/**
 * Derive stock status for a single option
 * Returns: 'in_stock' if qty > 0, 'out_of_stock' if qty === 0, 'unknown' if no data
 */
export function deriveOptionStockStatus(child: DutchiePOSChild): OptionStockStatus {
  const qty = getOptionQuantity(child);
  if (qty === null) return 'unknown';
  return qty > 0 ? 'in_stock' : 'out_of_stock';
}

/**
 * Derive product-level stock status from POSMetaData.children
 *
 * Logic per spec:
 * - If ANY child is "in_stock" → product is "in_stock"
 * - Else if ALL children are "out_of_stock" → product is "out_of_stock"
 * - Else → product is "unknown"
 *
 * IMPORTANT: Threshold flags (isBelowThreshold, etc.) do NOT override stock status.
 * They only indicate "low stock" - if qty > 0, status stays "in_stock".
 */
export function deriveStockStatus(product: DutchieRawProduct): StockStatus {
  const children = product.POSMetaData?.children;

  // No children data - unknown
  if (!children || children.length === 0) {
    return 'unknown';
  }

  // Get stock status for each option
  const optionStatuses = children.map(deriveOptionStockStatus);

  // If ANY option is in_stock → product is in_stock
  if (optionStatuses.some(status => status === 'in_stock')) {
    return 'in_stock';
  }

  // If ALL options are out_of_stock → product is out_of_stock
  if (optionStatuses.every(status => status === 'out_of_stock')) {
    return 'out_of_stock';
  }

  // Otherwise (mix of out_of_stock and unknown) → unknown
  return 'unknown';
}
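
// Worked example (follows directly from the rules above, reusing the `sample`
// sketched earlier):
//
//   deriveStockStatus({ ...sample, POSMetaData: { children: [
//     { quantityAvailable: 0 },   // out_of_stock
//     { quantityAvailable: 3 },   // in_stock
//   ] } });                        // => 'in_stock' (any in-stock option wins)
//
//   deriveStockStatus({ ...sample, POSMetaData: { children: [
//     { quantityAvailable: 0 },
//     { },                         // no quantity data => 'unknown'
//   ] } });                        // => 'unknown' (mixed OOS/unknown)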

/**
 * Calculate total quantity available across all options
 * Returns null if no children data (unknown inventory), 0 if children exist but all have 0 qty
 */
export function calculateTotalQuantity(product: DutchieRawProduct): number | null {
  const children = product.POSMetaData?.children;
  // No children = unknown inventory, return null (NOT 0)
  if (!children || children.length === 0) return null;

  // Check if any child has quantity data
  const hasAnyQtyData = children.some(child => getOptionQuantity(child) !== null);
  if (!hasAnyQtyData) return null; // All children lack qty data = unknown

  return children.reduce((sum, child) => {
    const qty = getOptionQuantity(child);
    return sum + (qty ?? 0);
  }, 0);
}
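
// Example of the null-vs-zero contract (illustrative):
//
//   calculateTotalQuantity({ ...sample, POSMetaData: undefined });   // => null (unknown)
//   calculateTotalQuantity({ ...sample, POSMetaData: { children: [
//     { quantityAvailable: 0 }, { quantityAvailable: 0 },
//   ] } });                                                          // => 0 (known, all OOS)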

/**
 * Calculate total kiosk quantity available across all options
 */
export function calculateTotalKioskQuantity(product: DutchieRawProduct): number | null {
  const children = product.POSMetaData?.children;
  if (!children || children.length === 0) return null;

  const hasAnyKioskQty = children.some(child => typeof child.kioskQuantityAvailable === 'number');
  if (!hasAnyKioskQty) return null;

  return children.reduce((sum, child) => sum + (child.kioskQuantityAvailable ?? 0), 0);
}

// ============================================================
// DATABASE ENTITY TYPES
// ============================================================

/**
 * Dispensary - represents a Dutchie store in Arizona
 */
export interface Dispensary {
  id: number;
  platform: 'dutchie';
  name: string;
  dbaName?: string;
  slug: string;
  city: string;
  state: string;
  postalCode?: string;
  latitude?: number;
  longitude?: number;
  address?: string;
  platformDispensaryId?: string; // Resolved internal ID (e.g., "6405ef617056e8014d79101b")
  isDelivery?: boolean;
  isPickup?: boolean;
  rawMetadata?: any; // Full discovery node
  lastCrawledAt?: Date;
  productCount?: number;
  createdAt: Date;
  updatedAt: Date;
  menuType?: string;
  menuUrl?: string;
  scrapeEnabled?: boolean;
  providerDetectionData?: any;
  platformDispensaryIdResolvedAt?: Date;
  website?: string; // The dispensary's own website (from raw_metadata or direct column)
}

/**
 * DutchieProduct - canonical product identity per store
 */
export interface DutchieProduct {
  id: number;
  dispensaryId: number;
  platform: 'dutchie';

  externalProductId: string; // from _id or id
  platformDispensaryId: string; // mirror of Dispensary.platformDispensaryId
  cName?: string; // cName / slug
  name: string; // Name

  // Brand
  brandName?: string;
  brandId?: string;
  brandLogoUrl?: string;

  // Classification
  type?: string;
  subcategory?: string;
  strainType?: string;
  provider?: string;

  // Potency
  thc?: number;
  thcContent?: number;
  cbd?: number;
  cbdContent?: number;
  cannabinoidsV2?: DutchieCannabinoidV2[];
  effects?: Record<string, any>;

  // Status / flags
  status?: string;
  medicalOnly: boolean;
  recOnly: boolean;
  featured: boolean;
  comingSoon: boolean;
  certificateOfAnalysisEnabled: boolean;

  isBelowThreshold: boolean;
  isBelowKioskThreshold: boolean;
  optionsBelowThreshold: boolean;
  optionsBelowKioskThreshold: boolean;

  // Derived stock status (from POSMetaData.children quantityAvailable)
  stockStatus: StockStatus;
  totalQuantityAvailable?: number | null; // null = unknown (no children), 0 = all OOS

  // Images
  primaryImageUrl?: string;
  images?: DutchieImage[];

  // Misc
  measurements?: any;
  weight?: string;
  pastCNames?: string[];

  createdAtDutchie?: Date;
  updatedAtDutchie?: Date;

  latestRawPayload?: any; // Full product node from last crawl

  createdAt: Date;
  updatedAt: Date;
}

/**
 * DutchieProductOptionSnapshot - child-level option data from POSMetaData.children
 */
export interface DutchieProductOptionSnapshot {
  optionId: string; // canonicalID or canonicalPackageId or canonicalSKU
  canonicalId?: string;
  canonicalPackageId?: string;
  canonicalSKU?: string;
  canonicalName?: string;

  canonicalCategory?: string;
  canonicalCategoryId?: string;
  canonicalBrandId?: string;
  canonicalBrandName?: string;
  canonicalStrainId?: string;
  canonicalVendorId?: string;

  optionLabel?: string; // from option field
  packageQuantity?: number;
  recEquivalent?: number;
  standardEquivalent?: number;

  priceCents?: number; // price * 100
  recPriceCents?: number; // recPrice * 100
  medPriceCents?: number; // medPrice * 100

  quantity?: number;
  quantityAvailable?: number;
  kioskQuantityAvailable?: number;

  activeBatchTags?: any;
  canonicalImgUrl?: string;
  canonicalLabResultUrl?: string;
  canonicalEffectivePotencyMg?: number;

  rawChildPayload?: any; // Full POSMetaData.children node
}

/**
 * DutchieProductSnapshot - per crawl, includes options[]
 */
export interface DutchieProductSnapshot {
  id: number;
  dutchieProductId: number;
  dispensaryId: number;
  platformDispensaryId: string;
  externalProductId: string;
  pricingType: 'rec' | 'med' | 'unknown';
  crawlMode: CrawlMode; // Which crawl mode captured this snapshot

  status?: string;
  featured: boolean;
  special: boolean;
  medicalOnly: boolean;
  recOnly: boolean;

  // Flag indicating if product was present in feed (false = missing_from_feed snapshot)
  isPresentInFeed: boolean;

  // Derived stock status for this snapshot
  stockStatus: StockStatus;

  // Price summary (aggregated from children, in cents)
  recMinPriceCents?: number;
  recMaxPriceCents?: number;
  recMinSpecialPriceCents?: number;
  medMinPriceCents?: number;
  medMaxPriceCents?: number;
  medMinSpecialPriceCents?: number;
  wholesaleMinPriceCents?: number;

  // Inventory summary (aggregated from POSMetaData.children)
  totalQuantityAvailable?: number | null; // null = unknown (no children), 0 = all OOS
  totalKioskQuantityAvailable?: number | null;
  manualInventory: boolean;
  isBelowThreshold: boolean;
  isBelowKioskThreshold: boolean;

  // Option-level data
  options: DutchieProductOptionSnapshot[];

  // Full raw product node at this crawl time
  rawPayload: any;

  crawledAt: Date;
  createdAt: Date;
  updatedAt: Date;
}

/**
 * CrawlJob - tracks crawl execution status
 */
export interface CrawlJob {
  id: number;
  jobType: 'discovery' | 'product_crawl' | 'resolve_ids';
  dispensaryId?: number;
  status: 'pending' | 'running' | 'completed' | 'failed';
  startedAt?: Date;
  completedAt?: Date;
  errorMessage?: string;
  productsFound?: number;
  snapshotsCreated?: number;
  metadata?: any;
  createdAt: Date;
  updatedAt: Date;
}

/**
 * JobSchedule - recurring job configuration with jitter support
 * Times "wander" around the clock due to random jitter after each run
 */
export type JobStatus = 'success' | 'error' | 'partial' | 'running' | null;

export interface JobSchedule {
  id: number;
  jobName: string;
  description?: string;
  enabled: boolean;

  // Timing configuration
  baseIntervalMinutes: number; // e.g., 240 (4 hours)
  jitterMinutes: number; // e.g., 30 (±30 minutes)

  // Worker identity
  workerName?: string; // e.g., "Alice", "Henry", "Bella", "Oscar"
  workerRole?: string; // e.g., "Store Discovery Worker", "GraphQL Product Sync"

  // Last run tracking
  lastRunAt?: Date;
  lastStatus?: JobStatus;
  lastErrorMessage?: string;
  lastDurationMs?: number;

  // Next run (calculated with jitter)
  nextRunAt?: Date;

  // Job-specific config
  jobConfig?: Record<string, any>;

  createdAt: Date;
  updatedAt: Date;
}
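
// Jitter sketch (assumed behavior inferred from the field comments above; the
// actual scheduler lives elsewhere): after each run the next one is placed at
//
//   nextRunAt = completedAt
//             + baseIntervalMinutes
//             + random(-jitterMinutes, +jitterMinutes)   // e.g. 240 ± 30 min
//
// which is why run times "wander" around the clock instead of landing on a
// fixed schedule.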

/**
 * JobRunLog - history of job executions
 */
export interface JobRunLog {
  id: number;
  scheduleId: number;
  jobName: string;
  status: 'pending' | 'running' | 'success' | 'error' | 'partial';
  startedAt?: Date;
  completedAt?: Date;
  durationMs?: number;
  errorMessage?: string;

  // Worker identity (propagated from schedule)
  workerName?: string; // e.g., "Alice", "Henry", "Bella", "Oscar"
  runRole?: string; // e.g., "Store Discovery Worker"

  // Results summary
  itemsProcessed?: number;
  itemsSucceeded?: number;
  itemsFailed?: number;

  metadata?: any;
  createdAt: Date;
}

// ============================================================
// GRAPHQL OPERATION TYPES
// ============================================================

export interface FilteredProductsVariables {
  includeEnterpriseSpecials: boolean;
  productsFilter: {
    dispensaryId: string;
    pricingType: 'rec' | 'med';
    strainTypes?: string[];
    subcategories?: string[];
    Status?: string;
    types?: string[];
    useCache?: boolean;
    isDefaultSort?: boolean;
    sortBy?: string;
    sortDirection?: number;
    bypassOnlineThresholds?: boolean;
    isKioskMenu?: boolean;
    removeProductsBelowOptionThresholds?: boolean;
  };
  page: number;
  perPage: number;
}

export interface GetAddressBasedDispensaryDataVariables {
  input: {
    dispensaryId: string; // The slug like "AZ-Deeply-Rooted"
  };
}

export interface ConsumerDispensariesVariables {
  filter: {
    lat: number;
    lng: number;
    radius: number; // in meters or km
    isDelivery?: boolean;
    searchText?: string;
  };
}

// ============================================================
// API RESPONSE TYPES
// ============================================================

export interface DashboardStats {
  dispensaryCount: number;
  productCount: number;
  snapshotCount24h: number;
  lastCrawlTime?: Date;
  failedJobCount: number;
  brandCount: number;
  categoryCount: number;
}

export interface CategorySummary {
  type: string;
  subcategory: string;
  productCount: number;
  dispensaryCount: number;
  avgPrice?: number;
}

export interface BrandSummary {
  brandName: string;
  brandId?: string;
  brandLogoUrl?: string;
  productCount: number;
  dispensaryCount: number;
}

// ============================================================
// CRAWLER PROFILE TYPES
// ============================================================

/**
 * DispensaryCrawlerProfile - per-store crawler configuration
 *
 * Allows each dispensary to have customized crawler settings without
 * affecting shared crawler logic. A dispensary can have multiple profiles
 * but only one is active at a time (via dispensaries.active_crawler_profile_id).
 */
export interface DispensaryCrawlerProfile {
  id: number;
  dispensaryId: number;
  profileName: string;
  crawlerType: string; // 'dutchie', 'treez', 'jane', 'sandbox', 'custom'
  profileKey: string | null; // Optional key for per-store module mapping
  config: Record<string, any>; // Crawler-specific configuration
  timeoutMs: number | null;
  downloadImages: boolean;
  trackStock: boolean;
  version: number;
  enabled: boolean;
  createdAt: Date;
  updatedAt: Date;
}

/**
 * DispensaryCrawlerProfileCreate - input type for creating a new profile
 */
export interface DispensaryCrawlerProfileCreate {
  dispensaryId: number;
  profileName: string;
  crawlerType: string;
  profileKey?: string | null;
  config?: Record<string, any>;
  timeoutMs?: number | null;
  downloadImages?: boolean;
  trackStock?: boolean;
  version?: number;
  enabled?: boolean;
}

/**
 * DispensaryCrawlerProfileUpdate - input type for updating an existing profile
 */
export interface DispensaryCrawlerProfileUpdate {
  profileName?: string;
  crawlerType?: string;
  profileKey?: string | null;
  config?: Record<string, any>;
  timeoutMs?: number | null;
  downloadImages?: boolean;
  trackStock?: boolean;
  version?: number;
  enabled?: boolean;
}

/**
 * CrawlerProfileOptions - runtime options derived from a profile
 * Used when invoking the actual crawler
 */
export interface CrawlerProfileOptions {
  timeoutMs: number;
  downloadImages: boolean;
  trackStock: boolean;
  config: Record<string, any>;
}
@@ -16,6 +16,12 @@ import {
   NormalizedBrand,
   NormalizationResult,
 } from './types';
+import {
+  downloadProductImage,
+  ProductImageContext,
+  isImageStorageReady,
+  LocalImageSizes,
+} from '../utils/image-storage';
 
 const BATCH_SIZE = 100;
 
@@ -23,10 +29,21 @@ const BATCH_SIZE = 100;
 // PRODUCT UPSERTS
 // ============================================================
 
+export interface NewProductInfo {
+  id: number; // store_products.id
+  externalProductId: string; // provider_product_id
+  name: string;
+  brandName: string | null;
+  primaryImageUrl: string | null;
+  hasLocalImage?: boolean; // True if local_image_path is already set
+}
+
 export interface UpsertProductsResult {
   upserted: number;
   new: number;
   updated: number;
+  newProducts: NewProductInfo[]; // Details of newly created products
+  productsNeedingImages: NewProductInfo[]; // Products (new or updated) that need image downloads
 }
 
 /**
@@ -41,12 +58,14 @@ export async function upsertStoreProducts(
   options: { dryRun?: boolean } = {}
 ): Promise<UpsertProductsResult> {
   if (products.length === 0) {
-    return { upserted: 0, new: 0, updated: 0 };
+    return { upserted: 0, new: 0, updated: 0, newProducts: [], productsNeedingImages: [] };
   }
 
   const { dryRun = false } = options;
   let newCount = 0;
   let updatedCount = 0;
+  const newProducts: NewProductInfo[] = [];
+  const productsNeedingImages: NewProductInfo[] = [];
 
   // Process in batches
   for (let i = 0; i < products.length; i += BATCH_SIZE) {
@@ -68,10 +87,10 @@
       const result = await client.query(
         `INSERT INTO store_products (
           dispensary_id, provider, provider_product_id, provider_brand_id,
           name, brand_name, category, subcategory,
           name_raw, brand_name_raw, category_raw, subcategory_raw,
           price_rec, price_med, price_rec_special, price_med_special,
           is_on_special, discount_percent,
-          is_in_stock, stock_status,
+          is_in_stock, stock_status, stock_quantity, total_quantity_available,
           thc_percent, cbd_percent,
           image_url,
           first_seen_at, last_seen_at, updated_at
@@ -80,17 +99,17 @@
           $5, $6, $7, $8,
           $9, $10, $11, $12,
           $13, $14,
-          $15, $16,
-          $17, $18,
-          $19,
+          $15, $16, $17, $17,
+          $18, $19,
+          $20,
           NOW(), NOW(), NOW()
         )
         ON CONFLICT (dispensary_id, provider, provider_product_id)
         DO UPDATE SET
           name = EXCLUDED.name,
           brand_name = EXCLUDED.brand_name,
           category = EXCLUDED.category,
           subcategory = EXCLUDED.subcategory,
           name_raw = EXCLUDED.name_raw,
           brand_name_raw = EXCLUDED.brand_name_raw,
           category_raw = EXCLUDED.category_raw,
           subcategory_raw = EXCLUDED.subcategory_raw,
           price_rec = EXCLUDED.price_rec,
           price_med = EXCLUDED.price_med,
           price_rec_special = EXCLUDED.price_rec_special,
@@ -99,12 +118,14 @@ export async function upsertStoreProducts(
           discount_percent = EXCLUDED.discount_percent,
           is_in_stock = EXCLUDED.is_in_stock,
           stock_status = EXCLUDED.stock_status,
+          stock_quantity = EXCLUDED.stock_quantity,
+          total_quantity_available = EXCLUDED.total_quantity_available,
           thc_percent = EXCLUDED.thc_percent,
           cbd_percent = EXCLUDED.cbd_percent,
           image_url = EXCLUDED.image_url,
           last_seen_at = NOW(),
           updated_at = NOW()
-        RETURNING (xmax = 0) as is_new`,
+        RETURNING id, (xmax = 0) as is_new, (local_image_path IS NOT NULL) as has_local_image`,
         [
           product.dispensaryId,
           product.platform,
@@ -122,16 +143,38 @@ export async function upsertStoreProducts(
           productPricing?.discountPercent,
           productAvailability?.inStock ?? true,
           productAvailability?.stockStatus || 'unknown',
-          product.thcPercent,
-          product.cbdPercent,
+          productAvailability?.quantity ?? null, // stock_quantity and total_quantity_available
+          // Clamp THC/CBD to valid percentage range (0-100) - some products report mg as %
+          product.thcPercent !== null && product.thcPercent <= 100 ? product.thcPercent : null,
+          product.cbdPercent !== null && product.cbdPercent <= 100 ? product.cbdPercent : null,
           product.primaryImageUrl,
         ]
       );
 
-      if (result.rows[0]?.is_new) {
+      const row = result.rows[0];
+      const productInfo: NewProductInfo = {
+        id: row.id,
+        externalProductId: product.externalProductId,
+        name: product.name,
+        brandName: product.brandName,
+        primaryImageUrl: product.primaryImageUrl,
+        hasLocalImage: row.has_local_image,
+      };
+
+      if (row.is_new) {
         newCount++;
+        // Track new products
+        newProducts.push(productInfo);
+        // New products always need images (if they have a source URL)
+        if (product.primaryImageUrl && !row.has_local_image) {
+          productsNeedingImages.push(productInfo);
+        }
       } else {
         updatedCount++;
+        // Updated products need images only if they don't have a local image yet
+        if (product.primaryImageUrl && !row.has_local_image) {
+          productsNeedingImages.push(productInfo);
+        }
       }
     }
 
@@ -148,6 +191,8 @@ export async function upsertStoreProducts(
     upserted: newCount + updatedCount,
     new: newCount,
     updated: updatedCount,
+    newProducts,
+    productsNeedingImages,
   };
 }
 
@@ -212,8 +257,9 @@ export async function createStoreProductSnapshots(
         productAvailability?.inStock ?? true,
         productAvailability?.quantity,
         productAvailability?.stockStatus || 'unknown',
-        product.thcPercent,
-        product.cbdPercent,
+        // Clamp THC/CBD to valid percentage range (0-100) - some products report mg as %
+        product.thcPercent !== null && product.thcPercent <= 100 ? product.thcPercent : null,
+        product.cbdPercent !== null && product.cbdPercent <= 100 ? product.cbdPercent : null,
         product.primaryImageUrl,
         JSON.stringify(product.rawProduct),
       ]);
@@ -229,7 +275,7 @@
       `INSERT INTO store_product_snapshots (
         dispensary_id, provider, provider_product_id, crawl_run_id,
+        captured_at,
         name, brand_name, category, subcategory,
         name_raw, brand_name_raw, category_raw, subcategory_raw,
         price_rec, price_med, price_rec_special, price_med_special,
         is_on_special, discount_percent,
         is_in_stock, stock_quantity, stock_status,
@@ -245,6 +291,202 @@ export async function createStoreProductSnapshots(
|
||||
return { created };
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// VARIANT UPSERTS
|
||||
// ============================================================
|
||||
|
||||
export interface UpsertVariantsResult {
|
||||
upserted: number;
|
||||
new: number;
|
||||
updated: number;
|
||||
snapshotsCreated: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract variant data from raw Dutchie product
|
||||
*/
|
||||
function extractVariantsFromRaw(rawProduct: any): any[] {
|
||||
const children = rawProduct?.POSMetaData?.children || [];
|
||||
return children.map((child: any) => ({
|
||||
option: child.option || child.key || '',
|
||||
canonicalSku: child.canonicalSKU || null,
|
||||
canonicalId: child.canonicalID || null,
|
||||
canonicalName: child.canonicalName || null,
|
||||
priceRec: child.recPrice || child.price || null,
|
||||
priceMed: child.medPrice || null,
|
||||
priceRecSpecial: child.recSpecialPrice || null,
|
||||
priceMedSpecial: child.medSpecialPrice || null,
|
||||
quantity: child.quantityAvailable ?? child.quantity ?? null,
|
||||
inStock: (child.quantityAvailable ?? child.quantity ?? 0) > 0,
|
||||
}));
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse weight value and unit from option string
|
||||
* e.g., "1g" -> { value: 1, unit: "g" }
|
||||
* "3.5g" -> { value: 3.5, unit: "g" }
|
||||
* "1/8oz" -> { value: 0.125, unit: "oz" }
|
||||
*/
|
||||
function parseWeight(option: string): { value: number | null; unit: string | null } {
|
||||
if (!option) return { value: null, unit: null };
|
||||
|
||||
// Handle fractions like "1/8oz"
|
||||
const fractionMatch = option.match(/^(\d+)\/(\d+)\s*(g|oz|mg|ml)?$/i);
|
||||
if (fractionMatch) {
|
||||
const value = parseInt(fractionMatch[1]) / parseInt(fractionMatch[2]);
|
||||
return { value, unit: fractionMatch[3]?.toLowerCase() || 'oz' };
|
||||
}
|
||||
|
||||
// Handle decimals like "3.5g" or "100mg"
|
||||
const decimalMatch = option.match(/^([\d.]+)\s*(g|oz|mg|ml|each)?$/i);
|
||||
if (decimalMatch) {
|
||||
return {
|
||||
value: parseFloat(decimalMatch[1]),
|
||||
unit: decimalMatch[2]?.toLowerCase() || null
|
||||
};
|
||||
}
|
||||
|
||||
return { value: null, unit: null };
|
||||
}
|
||||
|
||||
/**
|
||||
* Upsert variants for products and create variant snapshots
|
||||
*/
|
||||
export async function upsertProductVariants(
|
||||
pool: Pool,
|
||||
dispensaryId: number,
|
||||
products: NormalizedProduct[],
|
||||
crawlRunId: number | null,
|
||||
options: { dryRun?: boolean } = {}
|
||||
): Promise<UpsertVariantsResult> {
|
||||
if (products.length === 0) {
|
||||
return { upserted: 0, new: 0, updated: 0, snapshotsCreated: 0 };
|
||||
}
|
||||
|
||||
const { dryRun = false } = options;
|
||||
let newCount = 0;
|
||||
let updatedCount = 0;
|
||||
let snapshotsCreated = 0;
|
||||
|
||||
for (const product of products) {
|
||||
// Get the store_product_id for this product
|
||||
const productResult = await pool.query(
|
||||
`SELECT id FROM store_products
|
||||
WHERE dispensary_id = $1 AND provider = $2 AND provider_product_id = $3`,
|
||||
[dispensaryId, product.platform, product.externalProductId]
|
||||
);
|
||||
|
||||
if (productResult.rows.length === 0) {
|
||||
      continue; // Product not found, skip variants
    }

    const storeProductId = productResult.rows[0].id;
    const variants = extractVariantsFromRaw(product.rawProduct);

    if (variants.length === 0) {
      continue; // No variants to process
    }

    if (dryRun) {
      console.log(`[DryRun] Would upsert ${variants.length} variants for product ${product.externalProductId}`);
      continue;
    }

    for (const variant of variants) {
      const { value: weightValue, unit: weightUnit } = parseWeight(variant.option);
      const isOnSpecial = (variant.priceRecSpecial !== null && variant.priceRecSpecial < variant.priceRec) ||
        (variant.priceMedSpecial !== null && variant.priceMedSpecial < variant.priceMed);

      // Upsert variant
      const variantResult = await pool.query(
        `INSERT INTO product_variants (
          store_product_id, dispensary_id,
          option, canonical_sku, canonical_id, canonical_name,
          price_rec, price_med, price_rec_special, price_med_special,
          quantity, quantity_available, in_stock, is_on_special,
          weight_value, weight_unit,
          first_seen_at, last_seen_at, updated_at
        ) VALUES (
          $1, $2,
          $3, $4, $5, $6,
          $7, $8, $9, $10,
          $11, $11, $12, $13,
          $14, $15,
          NOW(), NOW(), NOW()
        )
        ON CONFLICT (store_product_id, option)
        DO UPDATE SET
          canonical_sku = COALESCE(EXCLUDED.canonical_sku, product_variants.canonical_sku),
          canonical_id = COALESCE(EXCLUDED.canonical_id, product_variants.canonical_id),
          canonical_name = COALESCE(EXCLUDED.canonical_name, product_variants.canonical_name),
          price_rec = EXCLUDED.price_rec,
          price_med = EXCLUDED.price_med,
          price_rec_special = EXCLUDED.price_rec_special,
          price_med_special = EXCLUDED.price_med_special,
          quantity = EXCLUDED.quantity,
          quantity_available = EXCLUDED.quantity_available,
          in_stock = EXCLUDED.in_stock,
          is_on_special = EXCLUDED.is_on_special,
          weight_value = COALESCE(EXCLUDED.weight_value, product_variants.weight_value),
          weight_unit = COALESCE(EXCLUDED.weight_unit, product_variants.weight_unit),
          last_seen_at = NOW(),
          last_price_change_at = CASE
            WHEN product_variants.price_rec IS DISTINCT FROM EXCLUDED.price_rec
              OR product_variants.price_rec_special IS DISTINCT FROM EXCLUDED.price_rec_special
            THEN NOW()
            ELSE product_variants.last_price_change_at
          END,
          last_stock_change_at = CASE
            WHEN product_variants.quantity IS DISTINCT FROM EXCLUDED.quantity
            THEN NOW()
            ELSE product_variants.last_stock_change_at
          END,
          updated_at = NOW()
        RETURNING id, (xmax = 0) as is_new`,
        [
          storeProductId, dispensaryId,
          variant.option, variant.canonicalSku, variant.canonicalId, variant.canonicalName,
          variant.priceRec, variant.priceMed, variant.priceRecSpecial, variant.priceMedSpecial,
          variant.quantity, variant.inStock, isOnSpecial,
          weightValue, weightUnit,
        ]
      );

      const variantId = variantResult.rows[0].id;
      if (variantResult.rows[0]?.is_new) {
        newCount++;
      } else {
        updatedCount++;
      }

      // Create variant snapshot
      await pool.query(
        `INSERT INTO product_variant_snapshots (
          product_variant_id, store_product_id, dispensary_id, crawl_run_id,
          option,
          price_rec, price_med, price_rec_special, price_med_special,
          quantity, in_stock, is_on_special,
          captured_at
        ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, NOW())`,
        [
          variantId, storeProductId, dispensaryId, crawlRunId,
          variant.option,
          variant.priceRec, variant.priceMed, variant.priceRecSpecial, variant.priceMedSpecial,
          variant.quantity, variant.inStock, isOnSpecial,
        ]
      );
      snapshotsCreated++;
    }
  }

  return {
    upserted: newCount + updatedCount,
    new: newCount,
    updated: updatedCount,
    snapshotsCreated,
  };
}
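`extractVariantsFromRaw` and `parseWeight` are defined earlier in this file, outside this hunk. For orientation, a minimal sketch of the weight parsing the upsert above relies on; the name `parseWeightSketch`, the regex, and the unit list are illustrative assumptions, not the committed code:

// Sketch: parse a variant option like "3.5g" or "100 mg" into value + unit.
// Returns nulls for non-weight options such as "Each".
function parseWeightSketch(option: string | null): { value: number | null; unit: string | null } {
  if (!option) return { value: null, unit: null };
  const match = option.trim().match(/^(\d+(?:\.\d+)?)\s*(mg|g|oz|ml)$/i);
  if (!match) return { value: null, unit: null };
  return { value: parseFloat(match[1]), unit: match[2].toLowerCase() };
}

// parseWeightSketch('3.5g') -> { value: 3.5, unit: 'g' }
// parseWeightSketch('Each') -> { value: null, unit: null }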

// ============================================================
// DISCONTINUED PRODUCTS
// ============================================================
@@ -366,6 +608,19 @@ export async function upsertBrands(
// FULL HYDRATION
// ============================================================

export interface ImageDownloadResult {
  downloaded: number;
  skipped: number;
  failed: number;
  bytesTotal: number;
}

export interface DispensaryContext {
  stateCode: string;
  storeSlug: string;
  hasExistingProducts?: boolean; // True if store already has products with local images
}

export interface HydratePayloadResult {
  productsUpserted: number;
  productsNew: number;
@@ -373,6 +628,157 @@ export interface HydratePayloadResult {
  productsDiscontinued: number;
  snapshotsCreated: number;
  brandsCreated: number;
  variantsUpserted: number;
  variantsNew: number;
  variantSnapshotsCreated: number;
  imagesDownloaded: number;
  imagesSkipped: number;
  imagesFailed: number;
  imagesBytesTotal: number;
}

/**
 * Helper to create slug from string
 */
function slugify(str: string): string {
  return str
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-+|-+$/g, '')
    .substring(0, 50) || 'unknown';
}
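Worked examples of the helper's edge cases (inputs invented for illustration); note the `|| 'unknown'` fallback fires when nothing alphanumeric survives:

slugify('Wyld - Raspberry Gummies 2:1'); // 'wyld-raspberry-gummies-2-1'
slugify('!!!');                          // '' is falsy, so the helper returns 'unknown'
slugify('a'.repeat(80));                 // truncated to 50 characters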

/**
 * Download images for new products and update their local paths
 */
export async function downloadProductImages(
  pool: Pool,
  newProducts: NewProductInfo[],
  dispensaryContext: DispensaryContext,
  options: { dryRun?: boolean; concurrency?: number } = {}
): Promise<ImageDownloadResult> {
  const { dryRun = false, concurrency = 5 } = options;

  // Filter products that have images to download
  const productsWithImages = newProducts.filter(p => p.primaryImageUrl);

  if (productsWithImages.length === 0) {
    return { downloaded: 0, skipped: 0, failed: 0, bytesTotal: 0 };
  }

  // Check if image storage is ready
  if (!isImageStorageReady()) {
    console.warn('[ImageDownload] Image storage not initialized, skipping downloads');
    return { downloaded: 0, skipped: productsWithImages.length, failed: 0, bytesTotal: 0 };
  }

  if (dryRun) {
    console.log(`[DryRun] Would download ${productsWithImages.length} images`);
    return { downloaded: 0, skipped: productsWithImages.length, failed: 0, bytesTotal: 0 };
  }

  let downloaded = 0;
  let skipped = 0;
  let failed = 0;
  let bytesTotal = 0;

  // Process in batches with concurrency limit
  for (let i = 0; i < productsWithImages.length; i += concurrency) {
    const batch = productsWithImages.slice(i, i + concurrency);

    const results = await Promise.allSettled(
      batch.map(async (product) => {
        const ctx: ProductImageContext = {
          stateCode: dispensaryContext.stateCode,
          storeSlug: dispensaryContext.storeSlug,
          brandSlug: slugify(product.brandName || 'unknown'),
          productId: product.externalProductId,
        };

        const result = await downloadProductImage(product.primaryImageUrl!, ctx, { skipIfExists: true });

        if (result.success) {
          // Update the database with local image path
          const imagesJson = JSON.stringify({
            full: result.urls!.full,
            medium: result.urls!.medium,
            thumb: result.urls!.thumb,
          });

          await pool.query(
            `UPDATE store_products
             SET local_image_path = $1, images = $2
             WHERE id = $3`,
            [result.urls!.full, imagesJson, product.id]
          );
        }

        return result;
      })
    );

    for (const result of results) {
      if (result.status === 'fulfilled') {
        const downloadResult = result.value;
        if (downloadResult.success) {
          if (downloadResult.skipped) {
            skipped++;
          } else {
            downloaded++;
            bytesTotal += downloadResult.bytesDownloaded || 0;
          }
        } else {
          failed++;
          console.warn(`[ImageDownload] Failed: ${downloadResult.error}`);
        }
      } else {
        failed++;
        console.error(`[ImageDownload] Error:`, result.reason);
      }
    }
  }

  console.log(`[ImageDownload] Downloaded: ${downloaded}, Skipped: ${skipped}, Failed: ${failed}, Bytes: ${bytesTotal}`);
  return { downloaded, skipped, failed, bytesTotal };
}
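The slice-based batching above waits for the slowest download in each group of `concurrency` before starting the next group. A worker-pool variant (an illustrative alternative, not part of this commit) keeps `concurrency` downloads in flight continuously:

// Sketch: run fn over items with at most `concurrency` calls in flight.
// Safe because the index handoff (next++) happens synchronously between awaits.
async function mapWithConcurrency<T, R>(
  items: T[],
  concurrency: number,
  fn: (item: T) => Promise<R>
): Promise<PromiseSettledResult<R>[]> {
  const results: PromiseSettledResult<R>[] = new Array(items.length);
  let next = 0;
  const workers = Array.from({ length: Math.min(concurrency, items.length) }, async () => {
    while (next < items.length) {
      const i = next++;
      try {
        results[i] = { status: 'fulfilled', value: await fn(items[i]) };
      } catch (reason) {
        results[i] = { status: 'rejected', reason };
      }
    }
  });
  await Promise.all(workers);
  return results;
}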

/**
 * Get dispensary context for image paths
 * Also checks if this dispensary already has products with local images
 * to skip unnecessary filesystem checks for existing stores
 */
async function getDispensaryContext(pool: Pool, dispensaryId: number): Promise<DispensaryContext | null> {
  try {
    const result = await pool.query(
      `SELECT
        d.state,
        d.slug,
        d.name,
        EXISTS(
          SELECT 1 FROM store_products sp
          WHERE sp.dispensary_id = d.id
            AND sp.local_image_path IS NOT NULL
          LIMIT 1
        ) as has_local_images
      FROM dispensaries d
      WHERE d.id = $1`,
      [dispensaryId]
    );

    if (result.rows.length === 0) {
      return null;
    }

    const row = result.rows[0];
    return {
      stateCode: row.state || 'unknown',
      storeSlug: row.slug || slugify(row.name || `store-${dispensaryId}`),
      hasExistingProducts: row.has_local_images,
    };
  } catch (error) {
    console.error('[getDispensaryContext] Error:', error);
    return null;
  }
}
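These context fields feed the on-disk image layout through ProductImageContext. The exact mapping lives in downloadProductImage, which is not shown in this diff; an assumed shape, for orientation only:

// Assumption: paths are roughly state/store/brand/product, one file per size.
// The real layout and extension are defined by downloadProductImage.
function imagePathSketch(ctx: ProductImageContext, size: 'full' | 'medium' | 'thumb'): string {
  return `/images/${ctx.stateCode}/${ctx.storeSlug}/${ctx.brandSlug}/${ctx.productId}-${size}.webp`;
}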

/**
@@ -383,9 +789,9 @@ export async function hydrateToCanonical(
  dispensaryId: number,
  normResult: NormalizationResult,
  crawlRunId: number | null,
  options: { dryRun?: boolean } = {}
  options: { dryRun?: boolean; downloadImages?: boolean } = {}
): Promise<HydratePayloadResult> {
  const { dryRun = false } = options;
  const { dryRun = false, downloadImages: shouldDownloadImages = true } = options;

  // 1. Upsert brands
  const brandResult = await upsertBrands(pool, normResult.brands, { dryRun });
@@ -399,7 +805,7 @@ export async function hydrateToCanonical(
    { dryRun }
  );

  // 3. Create snapshots
  // 3. Create product snapshots
  const snapshotResult = await createStoreProductSnapshots(
    pool,
    dispensaryId,
@@ -410,7 +816,16 @@ export async function hydrateToCanonical(
    { dryRun }
  );

  // 4. Mark discontinued products
  // 4. Upsert variants and create variant snapshots
  const variantResult = await upsertProductVariants(
    pool,
    dispensaryId,
    normResult.products,
    crawlRunId,
    { dryRun }
  );

  // 5. Mark discontinued products
  const currentProductIds = new Set(
    normResult.products.map((p) => p.externalProductId)
  );
@@ -424,6 +839,36 @@ export async function hydrateToCanonical(
    { dryRun }
  );

  // 6. Download images for products that need them
  // This includes:
  // - New products (always need images)
  // - Updated products that don't have local images yet (backfill)
  // This avoids:
  // - Filesystem checks for products that already have local images
  // - Unnecessary HTTP requests for products with existing images
  let imageResult: ImageDownloadResult = { downloaded: 0, skipped: 0, failed: 0, bytesTotal: 0 };

  if (shouldDownloadImages && productResult.productsNeedingImages.length > 0) {
    const dispensaryContext = await getDispensaryContext(pool, dispensaryId);

    if (dispensaryContext) {
      const newCount = productResult.productsNeedingImages.filter(p => !p.hasLocalImage).length;
      const backfillCount = productResult.productsNeedingImages.length - newCount;
      console.log(`[Hydration] Downloading images for ${productResult.productsNeedingImages.length} products (${newCount} new, ${backfillCount} backfill)...`);
      imageResult = await downloadProductImages(
        pool,
        productResult.productsNeedingImages,
        dispensaryContext,
        { dryRun }
      );
    } else {
      console.warn(`[Hydration] Could not get dispensary context for ID ${dispensaryId}, skipping image downloads`);
    }
  } else if (productResult.productsNeedingImages.length === 0 && productResult.upserted > 0) {
    // All products already have local images
    console.log(`[Hydration] All ${productResult.upserted} products already have local images, skipping downloads`);
  }

  return {
    productsUpserted: productResult.upserted,
    productsNew: productResult.new,
@@ -431,5 +876,12 @@ export async function hydrateToCanonical(
    productsDiscontinued: discontinuedCount,
    snapshotsCreated: snapshotResult.created,
    brandsCreated: brandResult.new,
    variantsUpserted: variantResult.upserted,
    variantsNew: variantResult.new,
    variantSnapshotsCreated: variantResult.snapshotsCreated,
    imagesDownloaded: imageResult.downloaded,
    imagesSkipped: imageResult.skipped,
    imagesFailed: imageResult.failed,
    imagesBytesTotal: imageResult.bytesTotal,
  };
}
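A hypothetical call site showing the new options surface (pool, normResult, and the IDs are assumed to come from the surrounding pipeline):

const result = await hydrateToCanonical(pool, dispensaryId, normResult, crawlRunId, {
  dryRun: true,          // log intended writes without touching the DB
  downloadImages: false, // e.g. skip image fetches in CI
});
console.log(`${result.productsNew} new products, ${result.variantsUpserted} variants upserted`);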

@@ -234,99 +234,94 @@ export async function syncProductsToCanonical(

    const result = await pool.query(
      `INSERT INTO store_products (
        dispensary_id, state_id, provider, provider_product_id,
        provider_brand_id, provider_dispensary_id, enterprise_product_id,
        legacy_dutchie_product_id,
        name, brand_name, category, subcategory, product_type, strain_type,
        description, effects, cannabinoids,
        thc_percent, cbd_percent, thc_content_text, cbd_content_text,
        is_in_stock, stock_status, stock_quantity,
        total_quantity_available, total_kiosk_quantity_available,
        image_url, local_image_url, local_image_thumb_url, local_image_medium_url,
        original_image_url, additional_images,
        is_on_special, is_featured, medical_only, rec_only,
        dispensary_id, provider, provider_product_id, provider_brand_id,
        platform_dispensary_id, external_product_id,
        name_raw, brand_name_raw, category_raw, subcategory_raw, strain_type,
        description, effects, cannabinoids_v2,
        thc_percent, cbd_percent, thc_content, cbd_content,
        is_in_stock, stock_status, stock_quantity, total_quantity_available,
        image_url, primary_image_url, images,
        is_on_special, featured, medical_only, rec_only,
        is_below_threshold, is_below_kiosk_threshold,
        platform_status, c_name, weight, options, measurements,
        first_seen_at, last_seen_at, updated_at
        status, c_name, weight, measurements,
        first_seen_at, last_seen_at, created_at, updated_at
      ) VALUES (
        $1, $2, 'dutchie', $3,
        $4, $5, $6,
        $7,
        $8, $9, $10, $11, $12, $13,
        $14, $15, $16,
        $17, $18, $19, $20,
        $21, $22, $23,
        $24, $25,
        $26, $27, $28, $29,
        $30, $31,
        $32, $33, $34, $35,
        $36, $37,
        $38, $39, $40, $41, $42,
        $43, $44, NOW()
        $1, 'dutchie', $2, $3,
        $4, $5,
        $6, $7, $8, $9, $10,
        $11, $12, $13,
        $14, $15, $16, $17,
        $18, $19, $20, $21,
        $22, $23, $24,
        $25, $26, $27, $28,
        $29, $30,
        $31, $32, $33, $34,
        $35, $36, NOW(), NOW()
      )
      ON CONFLICT (dispensary_id, provider, provider_product_id)
      DO UPDATE SET
        legacy_dutchie_product_id = EXCLUDED.legacy_dutchie_product_id,
        name = EXCLUDED.name,
        brand_name = EXCLUDED.brand_name,
        category = EXCLUDED.category,
        subcategory = EXCLUDED.subcategory,
        name_raw = EXCLUDED.name_raw,
        brand_name_raw = EXCLUDED.brand_name_raw,
        category_raw = EXCLUDED.category_raw,
        subcategory_raw = EXCLUDED.subcategory_raw,
        strain_type = EXCLUDED.strain_type,
        is_in_stock = EXCLUDED.is_in_stock,
        stock_status = EXCLUDED.stock_status,
        stock_quantity = EXCLUDED.stock_quantity,
        total_quantity_available = EXCLUDED.total_quantity_available,
        thc_percent = EXCLUDED.thc_percent,
        cbd_percent = EXCLUDED.cbd_percent,
        thc_content = EXCLUDED.thc_content,
        cbd_content = EXCLUDED.cbd_content,
        image_url = EXCLUDED.image_url,
        local_image_url = EXCLUDED.local_image_url,
        primary_image_url = EXCLUDED.primary_image_url,
        is_on_special = EXCLUDED.is_on_special,
        platform_status = EXCLUDED.platform_status,
        status = EXCLUDED.status,
        description = COALESCE(EXCLUDED.description, store_products.description),
        effects = COALESCE(EXCLUDED.effects, store_products.effects),
        cannabinoids_v2 = COALESCE(EXCLUDED.cannabinoids_v2, store_products.cannabinoids_v2),
        weight = EXCLUDED.weight,
        measurements = EXCLUDED.measurements,
        last_seen_at = NOW(),
        updated_at = NOW()
      RETURNING (xmax = 0) as is_new`,
      [
        dispensaryId,
        stateId,
        p.external_product_id,
        p.brand_id,
        p.platform_dispensary_id,
        p.enterprise_product_id,
        p.id,
        p.name,
        p.brand_name,
        p.category || p.type,
        p.subcategory,
        p.type,
        p.strain_type,
        p.description,
        p.effects,
        p.cannabinoids_v2,
        thcPercent,
        cbdPercent,
        p.thc_content,
        p.cbd_content,
        isInStock,
        stockStatus,
        p.total_quantity_available,
        p.total_quantity_available,
        p.total_kiosk_quantity_available,
        p.primary_image_url,
        p.local_image_url,
        p.local_image_thumb_url,
        p.local_image_medium_url,
        p.original_image_url,
        p.additional_images,
        p.special || false,
        p.featured || false,
        p.medical_only || false,
        p.rec_only || false,
        p.is_below_threshold || false,
        p.is_below_kiosk_threshold || false,
        p.status,
        p.c_name,
        p.weight,
        p.options,
        p.measurements,
        p.first_seen_at || p.updated_at,
        p.last_seen_at || p.updated_at,
        dispensaryId, // $1
        p.external_product_id, // $2
        p.brand_id, // $3
        p.platform_dispensary_id, // $4
        p.external_product_id, // $5 external_product_id
        p.name, // $6
        p.brand_name, // $7
        p.type || p.category, // $8 category_raw
        p.subcategory, // $9
        p.strain_type, // $10
        p.description, // $11
        p.effects, // $12
        p.cannabinoids_v2, // $13
        thcPercent, // $14
        cbdPercent, // $15
        p.thc_content, // $16
        p.cbd_content, // $17
        isInStock, // $18
        stockStatus, // $19
        p.total_quantity_available || 0, // $20 stock_quantity
        p.total_quantity_available || 0, // $21
        p.primary_image_url, // $22 image_url
        p.primary_image_url, // $23
        p.additional_images, // $24 images
        p.special || false, // $25
        p.featured || false, // $26
        p.medical_only || false, // $27
        p.rec_only || false, // $28
        p.is_below_threshold || false, // $29
        p.is_below_kiosk_threshold || false, // $30
        p.status, // $31
        p.c_name, // $32
        p.weight, // $33
        p.measurements, // $34
        p.first_seen_at || p.updated_at, // $35
        p.last_seen_at || p.updated_at, // $36
      ]
    );
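Both this query and the variant upsert earlier report insert-vs-update through Postgres's xmax system column: a freshly inserted row has xmax = 0, while a row rewritten by ON CONFLICT ... DO UPDATE carries a nonzero xmax. This is a widely used heuristic rather than documented API, so treat it as best-effort. A self-contained illustration (table and columns are hypothetical):

// Hypothetical demo of the (xmax = 0) trick used by both upserts above.
const { rows } = await pool.query(
  `INSERT INTO demo_items (sku, price) VALUES ($1, $2)
   ON CONFLICT (sku) DO UPDATE SET price = EXCLUDED.price
   RETURNING (xmax = 0) AS is_new`,
  ['sku-123', 42]
);
console.log(rows[0].is_new ? 'inserted' : 'updated');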

@@ -669,12 +664,4 @@ export async function syncRecentCrawls(
  return { synced, errors };
}

// ============================================================
// EXPORTS
// ============================================================

export {
  CrawlResult,
  SyncOptions,
  SyncResult,
};
// Types CrawlResult, SyncOptions, and SyncResult are already exported at their declarations